diff options
author | markus <> | 2003-05-11 21:36:58 +0000 |
---|---|---|
committer | markus <> | 2003-05-11 21:36:58 +0000 |
commit | 1c98a87f0daac81245653c227eb2f2508a22a965 (patch) | |
tree | 3de6d603296ec563b936da4e6a8a1e33d48f8884 /src/lib/libcrypto/bn/asm | |
parent | 31392c89d1135cf2a416f97295f6d21681b3fbc4 (diff) | |
download | openbsd-1c98a87f0daac81245653c227eb2f2508a22a965.tar.gz openbsd-1c98a87f0daac81245653c227eb2f2508a22a965.tar.bz2 openbsd-1c98a87f0daac81245653c227eb2f2508a22a965.zip |
import 0.9.7b (without idea and rc5)
Diffstat (limited to 'src/lib/libcrypto/bn/asm')
-rw-r--r-- | src/lib/libcrypto/bn/asm/ia64.S | 235 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/asm/pa-risc2.s | 36 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/asm/x86_64-gcc.c | 575 |
3 files changed, 764 insertions, 82 deletions
diff --git a/src/lib/libcrypto/bn/asm/ia64.S b/src/lib/libcrypto/bn/asm/ia64.S index ae56066310..7dfda85566 100644 --- a/src/lib/libcrypto/bn/asm/ia64.S +++ b/src/lib/libcrypto/bn/asm/ia64.S | |||
@@ -1,6 +1,6 @@ | |||
1 | .explicit | 1 | .explicit |
2 | .text | 2 | .text |
3 | .ident "ia64.S, Version 1.1" | 3 | .ident "ia64.S, Version 2.0" |
4 | .ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" | 4 | .ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" |
5 | 5 | ||
6 | // | 6 | // |
@@ -13,6 +13,35 @@ | |||
13 | // disclaimed. | 13 | // disclaimed. |
14 | // ==================================================================== | 14 | // ==================================================================== |
15 | // | 15 | // |
16 | // Version 2.x is Itanium2 re-tune. Few words about how Itanium2 is | ||
17 | // different from Itanium from this module's viewpoint. Most notably, is it | ||
18 | // "wider" than Itanium? Can you experience loop scalability as | ||
19 | // discussed in commentary sections? Not really:-( Itanium2 has 6 | ||
20 | // integer ALU ports, i.e. it's 2 ports wider, but it's not enough to | ||
21 | // spin twice as fast, as I need 8 IALU ports. Amount of floating point | ||
22 | // ports is the same, i.e. 2, while I need 4. In other words, to this | ||
23 | // module Itanium2 remains effectively as "wide" as Itanium. Yet it's | ||
24 | // essentially different in respect to this module, and a re-tune was | ||
25 | // required. Well, because some instruction latencies have changed. Most | ||
26 | // noticeably those intensively used: | ||
27 | // | ||
28 | // Itanium Itanium2 | ||
29 | // ldf8 9 6 L2 hit | ||
30 | // ld8 2 1 L1 hit | ||
31 | // getf 2 5 | ||
32 | // xma[->getf] 7[+1] 4[+0] | ||
33 | // add[->st8] 1[+1] 1[+0] | ||
34 | // | ||
35 | // What does it mean? You might ratiocinate that the original code | ||
36 | // should run just faster... Because sum of latencies is smaller... | ||
37 | // Wrong! Note that getf latency increased. This means that if a loop is | ||
38 | // scheduled for lower latency (and they are), then it will suffer from | ||
39 | // stall condition and the code will therefore turn anti-scalable, e.g. | ||
40 | // original bn_mul_words spun at 5*n or 2.5 times slower than expected | ||
41 | // on Itanium2! What to do? Reschedule loops for Itanium2? But then | ||
42 | // Itanium would exhibit anti-scalability. So I've chosen to reschedule | ||
43 | // for worst latency for every instruction aiming for best *all-round* | ||
44 | // performance. | ||
16 | 45 | ||
17 | // Q. How much faster does it get? | 46 | // Q. How much faster does it get? |
18 | // A. Here is the output from 'openssl speed rsa dsa' for vanilla | 47 | // A. Here is the output from 'openssl speed rsa dsa' for vanilla |
@@ -149,12 +178,27 @@ bn_add_words: | |||
149 | brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16 | 178 | brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16 |
150 | } | 179 | } |
151 | .body | 180 | .body |
152 | { .mib; mov r14=r32 // rp | 181 | { .mib; |
182 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
183 | addp4 r14=0,r32 // rp | ||
184 | #else | ||
185 | mov r14=r32 // rp | ||
186 | #endif | ||
153 | mov r9=pr };; | 187 | mov r9=pr };; |
154 | { .mii; mov r15=r33 // ap | 188 | { .mii; |
189 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
190 | addp4 r15=0,r33 // ap | ||
191 | #else | ||
192 | mov r15=r33 // ap | ||
193 | #endif | ||
155 | mov ar.lc=r10 | 194 | mov ar.lc=r10 |
156 | mov ar.ec=6 } | 195 | mov ar.ec=6 } |
157 | { .mib; mov r16=r34 // bp | 196 | { .mib; |
197 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
198 | addp4 r16=0,r34 // bp | ||
199 | #else | ||
200 | mov r16=r34 // bp | ||
201 | #endif | ||
158 | mov pr.rot=1<<16 };; | 202 | mov pr.rot=1<<16 };; |
159 | 203 | ||
160 | .L_bn_add_words_ctop: | 204 | .L_bn_add_words_ctop: |
@@ -174,7 +218,7 @@ bn_add_words: | |||
174 | 218 | ||
175 | { .mii; | 219 | { .mii; |
176 | (p59) add r8=1,r8 // return value | 220 | (p59) add r8=1,r8 // return value |
177 | mov pr=r9,-1 | 221 | mov pr=r9,0x1ffff |
178 | mov ar.lc=r3 } | 222 | mov ar.lc=r3 } |
179 | { .mbb; nop.b 0x0 | 223 | { .mbb; nop.b 0x0 |
180 | br.ret.sptk.many b0 };; | 224 | br.ret.sptk.many b0 };; |
@@ -202,12 +246,27 @@ bn_sub_words: | |||
202 | brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16 | 246 | brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16 |
203 | } | 247 | } |
204 | .body | 248 | .body |
205 | { .mib; mov r14=r32 // rp | 249 | { .mib; |
250 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
251 | addp4 r14=0,r32 // rp | ||
252 | #else | ||
253 | mov r14=r32 // rp | ||
254 | #endif | ||
206 | mov r9=pr };; | 255 | mov r9=pr };; |
207 | { .mii; mov r15=r33 // ap | 256 | { .mii; |
257 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
258 | addp4 r15=0,r33 // ap | ||
259 | #else | ||
260 | mov r15=r33 // ap | ||
261 | #endif | ||
208 | mov ar.lc=r10 | 262 | mov ar.lc=r10 |
209 | mov ar.ec=6 } | 263 | mov ar.ec=6 } |
210 | { .mib; mov r16=r34 // bp | 264 | { .mib; |
265 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
266 | addp4 r16=0,r34 // bp | ||
267 | #else | ||
268 | mov r16=r34 // bp | ||
269 | #endif | ||
211 | mov pr.rot=1<<16 };; | 270 | mov pr.rot=1<<16 };; |
212 | 271 | ||
213 | .L_bn_sub_words_ctop: | 272 | .L_bn_sub_words_ctop: |
@@ -227,7 +286,7 @@ bn_sub_words: | |||
227 | 286 | ||
228 | { .mii; | 287 | { .mii; |
229 | (p59) add r8=1,r8 // return value | 288 | (p59) add r8=1,r8 // return value |
230 | mov pr=r9,-1 | 289 | mov pr=r9,0x1ffff |
231 | mov ar.lc=r3 } | 290 | mov ar.lc=r3 } |
232 | { .mbb; nop.b 0x0 | 291 | { .mbb; nop.b 0x0 |
233 | br.ret.sptk.many b0 };; | 292 | br.ret.sptk.many b0 };; |
@@ -253,7 +312,7 @@ bn_mul_words: | |||
253 | #ifdef XMA_TEMPTATION | 312 | #ifdef XMA_TEMPTATION |
254 | { .mfi; alloc r2=ar.pfs,4,0,0,0 };; | 313 | { .mfi; alloc r2=ar.pfs,4,0,0,0 };; |
255 | #else | 314 | #else |
256 | { .mfi; alloc r2=ar.pfs,4,4,0,8 };; | 315 | { .mfi; alloc r2=ar.pfs,4,12,0,16 };; |
257 | #endif | 316 | #endif |
258 | { .mib; mov r8=r0 // return value | 317 | { .mib; mov r8=r0 // return value |
259 | cmp4.le p6,p0=r34,r0 | 318 | cmp4.le p6,p0=r34,r0 |
@@ -266,24 +325,30 @@ bn_mul_words: | |||
266 | 325 | ||
267 | .body | 326 | .body |
268 | { .mib; setf.sig f8=r35 // w | 327 | { .mib; setf.sig f8=r35 // w |
269 | mov pr.rot=0x400001<<16 | 328 | mov pr.rot=0x800001<<16 |
270 | // ------^----- serves as (p48) at first (p26) | 329 | // ------^----- serves as (p50) at first (p27) |
271 | brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16 | 330 | brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16 |
272 | } | 331 | } |
273 | 332 | ||
274 | #ifndef XMA_TEMPTATION | 333 | #ifndef XMA_TEMPTATION |
275 | 334 | ||
276 | { .mii; mov r14=r32 // rp | 335 | { .mii; |
277 | mov r15=r33 // ap | 336 | #if defined(_HPUX_SOURCE) && defined(_ILP32) |
337 | addp4 r14=0,r32 // rp | ||
338 | addp4 r15=0,r33 // ap | ||
339 | #else | ||
340 | mov r14=r32 // rp | ||
341 | mov r15=r33 // ap | ||
342 | #endif | ||
278 | mov ar.lc=r10 } | 343 | mov ar.lc=r10 } |
279 | { .mii; mov r39=0 // serves as r33 at first (p26) | 344 | { .mii; mov r40=0 // serves as r35 at first (p27) |
280 | mov ar.ec=12 };; | 345 | mov ar.ec=13 };; |
281 | 346 | ||
282 | // This loop spins in 2*(n+11) ticks. It's scheduled for data in L2 | 347 | // This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium |
283 | // cache (i.e. 9 ticks away) as floating point load/store instructions | 348 | // L2 cache (i.e. 9 ticks away) as floating point load/store instructions |
284 | // bypass L1 cache and L2 latency is actually best-case scenario for | 349 | // bypass L1 cache and L2 latency is actually best-case scenario for |
285 | // ldf8. The loop is not scalable and shall run in 2*(n+11) even on | 350 | // ldf8. The loop is not scalable and shall run in 2*(n+12) even on |
286 | // "wider" IA-64 implementations. It's a trade-off here. n+22 loop | 351 | // "wider" IA-64 implementations. It's a trade-off here. n+24 loop |
287 | // would give us ~5% in *overall* performance improvement on "wider" | 352 | // would give us ~5% in *overall* performance improvement on "wider" |
288 | // IA-64, but would hurt Itanium for about same because of longer | 353 | // IA-64, but would hurt Itanium for about same because of longer |
289 | // epilogue. As it's a matter of few percents in either case I've | 354 | // epilogue. As it's a matter of few percents in either case I've |
@@ -291,25 +356,25 @@ bn_mul_words: | |||
291 | // this very instruction sequence in bn_mul_add_words loop which in | 356 | // this very instruction sequence in bn_mul_add_words loop which in |
292 | // turn is scalable). | 357 | // turn is scalable). |
293 | .L_bn_mul_words_ctop: | 358 | .L_bn_mul_words_ctop: |
294 | { .mfi; (p25) getf.sig r36=f49 // low | 359 | { .mfi; (p25) getf.sig r36=f52 // low |
295 | (p21) xmpy.lu f45=f37,f8 | 360 | (p21) xmpy.lu f48=f37,f8 |
296 | (p27) cmp.ltu p52,p48=r39,r38 } | 361 | (p28) cmp.ltu p54,p50=r41,r39 } |
297 | { .mfi; (p16) ldf8 f32=[r15],8 | 362 | { .mfi; (p16) ldf8 f32=[r15],8 |
298 | (p21) xmpy.hu f38=f37,f8 | 363 | (p21) xmpy.hu f40=f37,f8 |
299 | (p0) nop.i 0x0 };; | 364 | (p0) nop.i 0x0 };; |
300 | { .mii; (p26) getf.sig r32=f43 // high | 365 | { .mii; (p25) getf.sig r32=f44 // high |
301 | .pred.rel "mutex",p48,p52 | 366 | .pred.rel "mutex",p50,p54 |
302 | (p48) add r38=r37,r33 // (p26) | 367 | (p50) add r40=r38,r35 // (p27) |
303 | (p52) add r38=r37,r33,1 } // (p26) | 368 | (p54) add r40=r38,r35,1 } // (p27) |
304 | { .mfb; (p27) st8 [r14]=r39,8 | 369 | { .mfb; (p28) st8 [r14]=r41,8 |
305 | (p0) nop.f 0x0 | 370 | (p0) nop.f 0x0 |
306 | br.ctop.sptk .L_bn_mul_words_ctop };; | 371 | br.ctop.sptk .L_bn_mul_words_ctop };; |
307 | .L_bn_mul_words_cend: | 372 | .L_bn_mul_words_cend: |
308 | 373 | ||
309 | { .mii; nop.m 0x0 | 374 | { .mii; nop.m 0x0 |
310 | .pred.rel "mutex",p49,p53 | 375 | .pred.rel "mutex",p51,p55 |
311 | (p49) add r8=r34,r0 | 376 | (p51) add r8=r36,r0 |
312 | (p53) add r8=r34,r0,1 } | 377 | (p55) add r8=r36,r0,1 } |
313 | { .mfb; nop.m 0x0 | 378 | { .mfb; nop.m 0x0 |
314 | nop.f 0x0 | 379 | nop.f 0x0 |
315 | nop.b 0x0 } | 380 | nop.b 0x0 } |
@@ -344,7 +409,7 @@ bn_mul_words: | |||
344 | #endif // XMA_TEMPTATION | 409 | #endif // XMA_TEMPTATION |
345 | 410 | ||
346 | { .mii; nop.m 0x0 | 411 | { .mii; nop.m 0x0 |
347 | mov pr=r9,-1 | 412 | mov pr=r9,0x1ffff |
348 | mov ar.lc=r3 } | 413 | mov ar.lc=r3 } |
349 | { .mfb; rum 1<<5 // clear um.mfh | 414 | { .mfb; rum 1<<5 // clear um.mfh |
350 | nop.f 0x0 | 415 | nop.f 0x0 |
@@ -376,59 +441,69 @@ bn_mul_add_words: | |||
376 | 441 | ||
377 | .body | 442 | .body |
378 | { .mib; setf.sig f8=r35 // w | 443 | { .mib; setf.sig f8=r35 // w |
379 | mov pr.rot=0x400001<<16 | 444 | mov pr.rot=0x800001<<16 |
380 | // ------^----- serves as (p48) at first (p26) | 445 | // ------^----- serves as (p50) at first (p27) |
381 | brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16 | 446 | brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16 |
382 | } | 447 | } |
383 | { .mii; mov r14=r32 // rp | 448 | { .mii; |
384 | mov r15=r33 // ap | 449 | #if defined(_HPUX_SOURCE) && defined(_ILP32) |
450 | addp4 r14=0,r32 // rp | ||
451 | addp4 r15=0,r33 // ap | ||
452 | #else | ||
453 | mov r14=r32 // rp | ||
454 | mov r15=r33 // ap | ||
455 | #endif | ||
385 | mov ar.lc=r10 } | 456 | mov ar.lc=r10 } |
386 | { .mii; mov r39=0 // serves as r33 at first (p26) | 457 | { .mii; mov r40=0 // serves as r35 at first (p27) |
387 | mov r18=r32 // rp copy | 458 | #if defined(_HPUX_SOURCE) && defined(_ILP32) |
388 | mov ar.ec=14 };; | 459 | addp4 r18=0,r32 // rp copy |
460 | #else | ||
461 | mov r18=r32 // rp copy | ||
462 | #endif | ||
463 | mov ar.ec=15 };; | ||
389 | 464 | ||
390 | // This loop spins in 3*(n+13) ticks on Itanium and should spin in | 465 | // This loop spins in 3*(n+14) ticks on Itanium and should spin in |
391 | // 2*(n+13) on "wider" IA-64 implementations (to be verified with new | 466 | // 2*(n+14) on "wider" IA-64 implementations (to be verified with new |
392 | // µ-architecture manuals as they become available). As usual it's | 467 | // µ-architecture manuals as they become available). As usual it's |
393 | // possible to compress the epilogue, down to 10 in this case, at the | 468 | // possible to compress the epilogue, down to 10 in this case, at the |
394 | // cost of scalability. Compressed (and therefore non-scalable) loop | 469 | // cost of scalability. Compressed (and therefore non-scalable) loop |
395 | // running at 3*(n+10) would buy you ~10% on Itanium but take ~35% | 470 | // running at 3*(n+11) would buy you ~10% on Itanium but take ~35% |
396 | // from "wider" IA-64 so let it be scalable! Special attention was | 471 | // from "wider" IA-64 so let it be scalable! Special attention was |
397 | // paid for having the loop body split at 64-byte boundary. ld8 is | 472 | // paid for having the loop body split at 64-byte boundary. ld8 is |
398 | // scheduled for L1 cache as the data is more than likely there. | 473 | // scheduled for L1 cache as the data is more than likely there. |
399 | // Indeed, bn_mul_words has put it there a moment ago:-) | 474 | // Indeed, bn_mul_words has put it there a moment ago:-) |
400 | .L_bn_mul_add_words_ctop: | 475 | .L_bn_mul_add_words_ctop: |
401 | { .mfi; (p25) getf.sig r36=f49 // low | 476 | { .mfi; (p25) getf.sig r36=f52 // low |
402 | (p21) xmpy.lu f45=f37,f8 | 477 | (p21) xmpy.lu f48=f37,f8 |
403 | (p27) cmp.ltu p52,p48=r39,r38 } | 478 | (p28) cmp.ltu p54,p50=r41,r39 } |
404 | { .mfi; (p16) ldf8 f32=[r15],8 | 479 | { .mfi; (p16) ldf8 f32=[r15],8 |
405 | (p21) xmpy.hu f38=f37,f8 | 480 | (p21) xmpy.hu f40=f37,f8 |
406 | (p27) add r43=r43,r39 };; | 481 | (p28) add r45=r45,r41 };; |
407 | { .mii; (p26) getf.sig r32=f43 // high | 482 | { .mii; (p25) getf.sig r32=f44 // high |
408 | .pred.rel "mutex",p48,p52 | 483 | .pred.rel "mutex",p50,p54 |
409 | (p48) add r38=r37,r33 // (p26) | 484 | (p50) add r40=r38,r35 // (p27) |
410 | (p52) add r38=r37,r33,1 } // (p26) | 485 | (p54) add r40=r38,r35,1 } // (p27) |
411 | { .mfb; (p27) cmp.ltu.unc p56,p0=r43,r39 | 486 | { .mfb; (p28) cmp.ltu.unc p60,p0=r45,r41 |
412 | (p0) nop.f 0x0 | 487 | (p0) nop.f 0x0 |
413 | (p0) nop.b 0x0 } | 488 | (p0) nop.b 0x0 } |
414 | { .mii; (p26) ld8 r42=[r18],8 | 489 | { .mii; (p27) ld8 r44=[r18],8 |
415 | (p58) cmp.eq.or p57,p0=-1,r44 | 490 | (p62) cmp.eq.or p61,p0=-1,r46 |
416 | (p58) add r44=1,r44 } | 491 | (p62) add r46=1,r46 } |
417 | { .mfb; (p29) st8 [r14]=r45,8 | 492 | { .mfb; (p30) st8 [r14]=r47,8 |
418 | (p0) nop.f 0x0 | 493 | (p0) nop.f 0x0 |
419 | br.ctop.sptk .L_bn_mul_add_words_ctop};; | 494 | br.ctop.sptk .L_bn_mul_add_words_ctop};; |
420 | .L_bn_mul_add_words_cend: | 495 | .L_bn_mul_add_words_cend: |
421 | 496 | ||
422 | { .mii; nop.m 0x0 | 497 | { .mii; nop.m 0x0 |
423 | .pred.rel "mutex",p51,p55 | 498 | .pred.rel "mutex",p53,p57 |
424 | (p51) add r8=r36,r0 | 499 | (p53) add r8=r38,r0 |
425 | (p55) add r8=r36,r0,1 } | 500 | (p57) add r8=r38,r0,1 } |
426 | { .mfb; nop.m 0x0 | 501 | { .mfb; nop.m 0x0 |
427 | nop.f 0x0 | 502 | nop.f 0x0 |
428 | nop.b 0x0 };; | 503 | nop.b 0x0 };; |
429 | { .mii; | 504 | { .mii; |
430 | (p59) add r8=1,r8 | 505 | (p63) add r8=1,r8 |
431 | mov pr=r9,-1 | 506 | mov pr=r9,0x1ffff |
432 | mov ar.lc=r3 } | 507 | mov ar.lc=r3 } |
433 | { .mfb; rum 1<<5 // clear um.mfh | 508 | { .mfb; rum 1<<5 // clear um.mfh |
434 | nop.f 0x0 | 509 | nop.f 0x0 |
@@ -461,6 +536,10 @@ bn_sqr_words: | |||
461 | mov r9=pr };; | 536 | mov r9=pr };; |
462 | 537 | ||
463 | .body | 538 | .body |
539 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
540 | { .mii; addp4 r32=0,r32 | ||
541 | addp4 r33=0,r33 };; | ||
542 | #endif | ||
464 | { .mib; | 543 | { .mib; |
465 | mov pr.rot=1<<16 | 544 | mov pr.rot=1<<16 |
466 | brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16 | 545 | brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16 |
@@ -492,7 +571,7 @@ bn_sqr_words: | |||
492 | .L_bn_sqr_words_cend: | 571 | .L_bn_sqr_words_cend: |
493 | 572 | ||
494 | { .mii; nop.m 0x0 | 573 | { .mii; nop.m 0x0 |
495 | mov pr=r9,-1 | 574 | mov pr=r9,0x1ffff |
496 | mov ar.lc=r3 } | 575 | mov ar.lc=r3 } |
497 | { .mfb; rum 1<<5 // clear um.mfh | 576 | { .mfb; rum 1<<5 // clear um.mfh |
498 | nop.f 0x0 | 577 | nop.f 0x0 |
@@ -526,7 +605,14 @@ bn_sqr_comba8: | |||
526 | .prologue | 605 | .prologue |
527 | .fframe 0 | 606 | .fframe 0 |
528 | .save ar.pfs,r2 | 607 | .save ar.pfs,r2 |
608 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
529 | { .mii; alloc r2=ar.pfs,2,1,0,0 | 609 | { .mii; alloc r2=ar.pfs,2,1,0,0 |
610 | addp4 r33=0,r33 | ||
611 | addp4 r32=0,r32 };; | ||
612 | { .mii; | ||
613 | #else | ||
614 | { .mii; alloc r2=ar.pfs,2,1,0,0 | ||
615 | #endif | ||
530 | mov r34=r33 | 616 | mov r34=r33 |
531 | add r14=8,r33 };; | 617 | add r14=8,r33 };; |
532 | .body | 618 | .body |
@@ -587,7 +673,14 @@ bn_mul_comba8: | |||
587 | .prologue | 673 | .prologue |
588 | .fframe 0 | 674 | .fframe 0 |
589 | .save ar.pfs,r2 | 675 | .save ar.pfs,r2 |
676 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
590 | { .mii; alloc r2=ar.pfs,3,0,0,0 | 677 | { .mii; alloc r2=ar.pfs,3,0,0,0 |
678 | addp4 r33=0,r33 | ||
679 | addp4 r34=0,r34 };; | ||
680 | { .mii; addp4 r32=0,r32 | ||
681 | #else | ||
682 | { .mii; alloc r2=ar.pfs,3,0,0,0 | ||
683 | #endif | ||
591 | add r14=8,r33 | 684 | add r14=8,r33 |
592 | add r17=8,r34 } | 685 | add r17=8,r34 } |
593 | .body | 686 | .body |
@@ -1138,7 +1231,14 @@ bn_sqr_comba4: | |||
1138 | .prologue | 1231 | .prologue |
1139 | .fframe 0 | 1232 | .fframe 0 |
1140 | .save ar.pfs,r2 | 1233 | .save ar.pfs,r2 |
1234 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
1235 | { .mii; alloc r2=ar.pfs,2,1,0,0 | ||
1236 | addp4 r32=0,r32 | ||
1237 | addp4 r33=0,r33 };; | ||
1238 | { .mii; | ||
1239 | #else | ||
1141 | { .mii; alloc r2=ar.pfs,2,1,0,0 | 1240 | { .mii; alloc r2=ar.pfs,2,1,0,0 |
1241 | #endif | ||
1142 | mov r34=r33 | 1242 | mov r34=r33 |
1143 | add r14=8,r33 };; | 1243 | add r14=8,r33 };; |
1144 | .body | 1244 | .body |
@@ -1164,7 +1264,14 @@ bn_mul_comba4: | |||
1164 | .prologue | 1264 | .prologue |
1165 | .fframe 0 | 1265 | .fframe 0 |
1166 | .save ar.pfs,r2 | 1266 | .save ar.pfs,r2 |
1267 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
1268 | { .mii; alloc r2=ar.pfs,3,0,0,0 | ||
1269 | addp4 r33=0,r33 | ||
1270 | addp4 r34=0,r34 };; | ||
1271 | { .mii; addp4 r32=0,r32 | ||
1272 | #else | ||
1167 | { .mii; alloc r2=ar.pfs,3,0,0,0 | 1273 | { .mii; alloc r2=ar.pfs,3,0,0,0 |
1274 | #endif | ||
1168 | add r14=8,r33 | 1275 | add r14=8,r33 |
1169 | add r17=8,r34 } | 1276 | add r17=8,r34 } |
1170 | .body | 1277 | .body |
@@ -1464,7 +1571,7 @@ bn_div_words: | |||
1464 | or r8=r8,r33 | 1571 | or r8=r8,r33 |
1465 | mov ar.pfs=r2 };; | 1572 | mov ar.pfs=r2 };; |
1466 | { .mii; shr.u r9=H,I // remainder if anybody wants it | 1573 | { .mii; shr.u r9=H,I // remainder if anybody wants it |
1467 | mov pr=r10,-1 } | 1574 | mov pr=r10,0x1ffff } |
1468 | { .mfb; br.ret.sptk.many b0 };; | 1575 | { .mfb; br.ret.sptk.many b0 };; |
1469 | 1576 | ||
1470 | // Unsigned 64 by 32 (well, by 64 for the moment) bit integer division | 1577 | // Unsigned 64 by 32 (well, by 64 for the moment) bit integer division |
diff --git a/src/lib/libcrypto/bn/asm/pa-risc2.s b/src/lib/libcrypto/bn/asm/pa-risc2.s index af9730d062..f3b16290eb 100644 --- a/src/lib/libcrypto/bn/asm/pa-risc2.s +++ b/src/lib/libcrypto/bn/asm/pa-risc2.s | |||
@@ -747,8 +747,8 @@ bn_div_words | |||
747 | .PROC | 747 | .PROC |
748 | .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN | 748 | .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN |
749 | .IMPORT BN_num_bits_word,CODE | 749 | .IMPORT BN_num_bits_word,CODE |
750 | .IMPORT __iob,DATA | 750 | ;--- not PIC .IMPORT __iob,DATA |
751 | .IMPORT fprintf,CODE | 751 | ;--- not PIC .IMPORT fprintf,CODE |
752 | .IMPORT abort,CODE | 752 | .IMPORT abort,CODE |
753 | .IMPORT $$div2U,MILLICODE | 753 | .IMPORT $$div2U,MILLICODE |
754 | .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE | 754 | .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE |
@@ -844,12 +844,12 @@ $0006001A | |||
844 | MOVIB,TR 2,%r8,$0006001C ;offset 0xa18 | 844 | MOVIB,TR 2,%r8,$0006001C ;offset 0xa18 |
845 | EXTRD,U %r3,63,32,%r7 ;offset 0xa1c | 845 | EXTRD,U %r3,63,32,%r7 ;offset 0xa1c |
846 | $D2 | 846 | $D2 |
847 | ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20 | 847 | ;--- not PIC ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20 |
848 | LDIL LR'C$7,%r21 ;offset 0xa24 | 848 | ;--- not PIC LDIL LR'C$7,%r21 ;offset 0xa24 |
849 | LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28 | 849 | ;--- not PIC LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28 |
850 | .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28; | 850 | ;--- not PIC .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28; |
851 | B,L fprintf,%r2 ;offset 0xa2c | 851 | ;--- not PIC B,L fprintf,%r2 ;offset 0xa2c |
852 | LDO RR'C$7(%r21),%r25 ;offset 0xa30 | 852 | ;--- not PIC LDO RR'C$7(%r21),%r25 ;offset 0xa30 |
853 | .CALL ; | 853 | .CALL ; |
854 | B,L abort,%r2 ;offset 0xa34 | 854 | B,L abort,%r2 ;offset 0xa34 |
855 | NOP ;offset 0xa38 | 855 | NOP ;offset 0xa38 |
@@ -1605,14 +1605,14 @@ bn_mul_comba4 | |||
1605 | .PROCEND | 1605 | .PROCEND |
1606 | 1606 | ||
1607 | 1607 | ||
1608 | .SPACE $TEXT$ | 1608 | ;--- not PIC .SPACE $TEXT$ |
1609 | .SUBSPA $CODE$ | 1609 | ;--- not PIC .SUBSPA $CODE$ |
1610 | .SPACE $PRIVATE$,SORT=16 | 1610 | ;--- not PIC .SPACE $PRIVATE$,SORT=16 |
1611 | .IMPORT $global$,DATA | 1611 | ;--- not PIC .IMPORT $global$,DATA |
1612 | .SPACE $TEXT$ | 1612 | ;--- not PIC .SPACE $TEXT$ |
1613 | .SUBSPA $CODE$ | 1613 | ;--- not PIC .SUBSPA $CODE$ |
1614 | .SUBSPA $LIT$,ACCESS=0x2c | 1614 | ;--- not PIC .SUBSPA $LIT$,ACCESS=0x2c |
1615 | C$7 | 1615 | ;--- not PIC C$7 |
1616 | .ALIGN 8 | 1616 | ;--- not PIC .ALIGN 8 |
1617 | .STRINGZ "Division would overflow (%d)\n" | 1617 | ;--- not PIC .STRINGZ "Division would overflow (%d)\n" |
1618 | .END | 1618 | .END |
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gcc.c b/src/lib/libcrypto/bn/asm/x86_64-gcc.c new file mode 100644 index 0000000000..b97b394661 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/x86_64-gcc.c | |||
@@ -0,0 +1,575 @@ | |||
1 | /* | ||
2 | * x86_64 BIGNUM accelerator version 0.1, December 2002. | ||
3 | * | ||
4 | * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | * project. | ||
6 | * | ||
7 | * Rights for redistribution and usage in source and binary forms are | ||
8 | * granted according to the OpenSSL license. Warranty of any kind is | ||
9 | * disclaimed. | ||
10 | * | ||
11 | * Q. Version 0.1? It doesn't sound like Andy, he used to assign real | ||
12 | * versions, like 1.0... | ||
13 | * A. Well, that's because this code is basically a quick-n-dirty | ||
14 | * proof-of-concept hack. As you can see it's implemented with | ||
15 | * inline assembler, which means that you're bound to GCC and that | ||
16 | * there must be room for fine-tuning. | ||
17 | * | ||
18 | * Q. Why inline assembler? | ||
19 | * A. x86_64 features its own ABI I'm not familiar with. Which is why | ||
20 | * I decided to let the compiler take care of subroutine | ||
21 | * prologue/epilogue as well as register allocation. | ||
22 | * | ||
23 | * Q. How much faster does it get? | ||
24 | * A. Unfortunately people sitting on x86_64 hardware are prohibited | ||
25 | * to disclose the performance numbers, so they (SuSE labs to be | ||
26 | * specific) wouldn't tell me. However! Very similar coding technique | ||
27 | * (reaching out for 128-bit result from 64x64-bit multiplication) | ||
28 | * results in >3 times performance improvement on MIPS and I see no | ||
29 | * reason why gain on x86_64 would be so much different:-) | ||
30 | */ | ||
31 | |||
32 | #define BN_ULONG unsigned long | ||
33 | |||
34 | /* | ||
35 | * "m"(a), "+m"(r) is the way to favor DirectPath µ-code; | ||
36 | * "g"(0) lets the compiler decide where it | ||
37 | * wants to keep the value of zero; | ||
38 | */ | ||
39 | #define mul_add(r,a,word,carry) do { \ | ||
40 | register BN_ULONG high,low; \ | ||
41 | asm ("mulq %3" \ | ||
42 | : "=a"(low),"=d"(high) \ | ||
43 | : "a"(word),"m"(a) \ | ||
44 | : "cc"); \ | ||
45 | asm ("addq %2,%0; adcq %3,%1" \ | ||
46 | : "+r"(carry),"+d"(high)\ | ||
47 | : "a"(low),"g"(0) \ | ||
48 | : "cc"); \ | ||
49 | asm ("addq %2,%0; adcq %3,%1" \ | ||
50 | : "+m"(r),"+d"(high) \ | ||
51 | : "r"(carry),"g"(0) \ | ||
52 | : "cc"); \ | ||
53 | carry=high; \ | ||
54 | } while (0) | ||
55 | |||
56 | #define mul(r,a,word,carry) do { \ | ||
57 | register BN_ULONG high,low; \ | ||
58 | asm ("mulq %3" \ | ||
59 | : "=a"(low),"=d"(high) \ | ||
60 | : "a"(word),"g"(a) \ | ||
61 | : "cc"); \ | ||
62 | asm ("addq %2,%0; adcq %3,%1" \ | ||
63 | : "+r"(carry),"+d"(high)\ | ||
64 | : "a"(low),"g"(0) \ | ||
65 | : "cc"); \ | ||
66 | (r)=carry, carry=high; \ | ||
67 | } while (0) | ||
68 | |||
69 | #define sqr(r0,r1,a) \ | ||
70 | asm ("mulq %2" \ | ||
71 | : "=a"(r0),"=d"(r1) \ | ||
72 | : "a"(a) \ | ||
73 | : "cc"); | ||
74 | |||
75 | BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | ||
76 | { | ||
77 | BN_ULONG c1=0; | ||
78 | |||
79 | if (num <= 0) return(c1); | ||
80 | |||
81 | while (num&~3) | ||
82 | { | ||
83 | mul_add(rp[0],ap[0],w,c1); | ||
84 | mul_add(rp[1],ap[1],w,c1); | ||
85 | mul_add(rp[2],ap[2],w,c1); | ||
86 | mul_add(rp[3],ap[3],w,c1); | ||
87 | ap+=4; rp+=4; num-=4; | ||
88 | } | ||
89 | if (num) | ||
90 | { | ||
91 | mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1; | ||
92 | mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1; | ||
93 | mul_add(rp[2],ap[2],w,c1); return c1; | ||
94 | } | ||
95 | |||
96 | return(c1); | ||
97 | } | ||
98 | |||
99 | BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | ||
100 | { | ||
101 | BN_ULONG c1=0; | ||
102 | |||
103 | if (num <= 0) return(c1); | ||
104 | |||
105 | while (num&~3) | ||
106 | { | ||
107 | mul(rp[0],ap[0],w,c1); | ||
108 | mul(rp[1],ap[1],w,c1); | ||
109 | mul(rp[2],ap[2],w,c1); | ||
110 | mul(rp[3],ap[3],w,c1); | ||
111 | ap+=4; rp+=4; num-=4; | ||
112 | } | ||
113 | if (num) | ||
114 | { | ||
115 | mul(rp[0],ap[0],w,c1); if (--num == 0) return c1; | ||
116 | mul(rp[1],ap[1],w,c1); if (--num == 0) return c1; | ||
117 | mul(rp[2],ap[2],w,c1); | ||
118 | } | ||
119 | return(c1); | ||
120 | } | ||
121 | |||
122 | void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n) | ||
123 | { | ||
124 | if (n <= 0) return; | ||
125 | |||
126 | while (n&~3) | ||
127 | { | ||
128 | sqr(r[0],r[1],a[0]); | ||
129 | sqr(r[2],r[3],a[1]); | ||
130 | sqr(r[4],r[5],a[2]); | ||
131 | sqr(r[6],r[7],a[3]); | ||
132 | a+=4; r+=8; n-=4; | ||
133 | } | ||
134 | if (n) | ||
135 | { | ||
136 | sqr(r[0],r[1],a[0]); if (--n == 0) return; | ||
137 | sqr(r[2],r[3],a[1]); if (--n == 0) return; | ||
138 | sqr(r[4],r[5],a[2]); | ||
139 | } | ||
140 | } | ||
141 | |||
142 | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) | ||
143 | { BN_ULONG ret,waste; | ||
144 | |||
145 | asm ("divq %3" | ||
146 | : "=a"(ret),"=d"(waste) | ||
147 | : "a"(l),"d"(h),"g"(d) | ||
148 | : "cc"); | ||
149 | |||
150 | return ret; | ||
151 | } | ||
152 | |||
153 | BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n) | ||
154 | { BN_ULONG ret,i; | ||
155 | |||
156 | if (n <= 0) return 0; | ||
157 | |||
158 | asm ( | ||
159 | " subq %2,%2 \n" | ||
160 | ".align 16 \n" | ||
161 | "1: movq (%4,%2,8),%0 \n" | ||
162 | " adcq (%5,%2,8),%0 \n" | ||
163 | " movq %0,(%3,%2,8) \n" | ||
164 | " leaq 1(%2),%2 \n" | ||
165 | " loop 1b \n" | ||
166 | " sbbq %0,%0 \n" | ||
167 | : "+a"(ret),"+c"(n),"+r"(i) | ||
168 | : "r"(rp),"r"(ap),"r"(bp) | ||
169 | : "cc" | ||
170 | ); | ||
171 | |||
172 | return ret&1; | ||
173 | } | ||
174 | |||
175 | #ifndef SIMICS | ||
176 | BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n) | ||
177 | { BN_ULONG ret,i; | ||
178 | |||
179 | if (n <= 0) return 0; | ||
180 | |||
181 | asm ( | ||
182 | " subq %2,%2 \n" | ||
183 | ".align 16 \n" | ||
184 | "1: movq (%4,%2,8),%0 \n" | ||
185 | " sbbq (%5,%2,8),%0 \n" | ||
186 | " movq %0,(%3,%2,8) \n" | ||
187 | " leaq 1(%2),%2 \n" | ||
188 | " loop 1b \n" | ||
189 | " sbbq %0,%0 \n" | ||
190 | : "+a"(ret),"+c"(n),"+r"(i) | ||
191 | : "r"(rp),"r"(ap),"r"(bp) | ||
192 | : "cc" | ||
193 | ); | ||
194 | |||
195 | return ret&1; | ||
196 | } | ||
197 | #else | ||
198 | /* Simics 1.4<7 has buggy sbbq:-( */ | ||
199 | #define BN_MASK2 0xffffffffffffffffL | ||
200 | BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) | ||
201 | { | ||
202 | BN_ULONG t1,t2; | ||
203 | int c=0; | ||
204 | |||
205 | if (n <= 0) return((BN_ULONG)0); | ||
206 | |||
207 | for (;;) | ||
208 | { | ||
209 | t1=a[0]; t2=b[0]; | ||
210 | r[0]=(t1-t2-c)&BN_MASK2; | ||
211 | if (t1 != t2) c=(t1 < t2); | ||
212 | if (--n <= 0) break; | ||
213 | |||
214 | t1=a[1]; t2=b[1]; | ||
215 | r[1]=(t1-t2-c)&BN_MASK2; | ||
216 | if (t1 != t2) c=(t1 < t2); | ||
217 | if (--n <= 0) break; | ||
218 | |||
219 | t1=a[2]; t2=b[2]; | ||
220 | r[2]=(t1-t2-c)&BN_MASK2; | ||
221 | if (t1 != t2) c=(t1 < t2); | ||
222 | if (--n <= 0) break; | ||
223 | |||
224 | t1=a[3]; t2=b[3]; | ||
225 | r[3]=(t1-t2-c)&BN_MASK2; | ||
226 | if (t1 != t2) c=(t1 < t2); | ||
227 | if (--n <= 0) break; | ||
228 | |||
229 | a+=4; | ||
230 | b+=4; | ||
231 | r+=4; | ||
232 | } | ||
233 | return(c); | ||
234 | } | ||
235 | #endif | ||
236 | |||
237 | /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ | ||
238 | /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ | ||
239 | /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ | ||
240 | /* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */ | ||
241 | |||
242 | #if 0 | ||
243 | /* original macros are kept for reference purposes */ | ||
244 | #define mul_add_c(a,b,c0,c1,c2) { \ | ||
245 | BN_ULONG ta=(a),tb=(b); \ | ||
246 | t1 = ta * tb; \ | ||
247 | t2 = BN_UMULT_HIGH(ta,tb); \ | ||
248 | c0 += t1; t2 += (c0<t1)?1:0; \ | ||
249 | c1 += t2; c2 += (c1<t2)?1:0; \ | ||
250 | } | ||
251 | |||
252 | #define mul_add_c2(a,b,c0,c1,c2) { \ | ||
253 | BN_ULONG ta=(a),tb=(b),t0; \ | ||
254 | t1 = BN_UMULT_HIGH(ta,tb); \ | ||
255 | t0 = ta * tb; \ | ||
256 | t2 = t1+t1; c2 += (t2<t1)?1:0; \ | ||
257 | t1 = t0+t0; t2 += (t1<t0)?1:0; \ | ||
258 | c0 += t1; t2 += (c0<t1)?1:0; \ | ||
259 | c1 += t2; c2 += (c1<t2)?1:0; \ | ||
260 | } | ||
261 | #else | ||
/*
 * mul_add_c(a,b,c0,c1,c2): (c2,c1,c0) += a*b, x86_64 inline assembly.
 *
 * a is loaded into %rax and multiplied by the memory operand b with mulq;
 * the 128-bit product lands in t1 (low, %rax) and t2 (high, %rdx).  The
 * product is then added into the accumulator with one addq/adcq/adcq
 * carry chain.
 *
 * Requirements on the caller:
 *   - t1 and t2 must be declared 64-bit scratch lvalues in scope; their
 *     contents after the macro are not meaningful.
 *   - b must be an lvalue (it is used with an "m" constraint).
 *   - c0, c1, c2 must be modifiable 64-bit lvalues.
 *
 * __asm__ is used rather than asm so the macro also compiles in strict
 * ISO modes (-std=c99 / -std=c11), where the plain asm keyword is disabled.
 */
#define mul_add_c(a,b,c0,c1,c2) do {			\
	__asm__ ("mulq %3"				\
		: "=a"(t1),"=d"(t2)			\
		: "a"(a),"m"(b)				\
		: "cc");				\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(t1),"r"(t2),"g"(0)		\
		: "cc");				\
	} while (0)
276 | |||
/*
 * sqr_add_c(a,i,c0,c1,c2): (c2,c1,c0) += a[i]^2, x86_64 inline assembly.
 *
 * a[i] is loaded into %rax and multiplied by itself ("mulq %rax"); the
 * 128-bit square lands in t1 (low, %rax) and t2 (high, %rdx) and is then
 * added into the accumulator with one addq/adcq/adcq carry chain.
 *
 * Requirements on the caller:
 *   - t1 and t2 must be declared 64-bit scratch lvalues in scope; their
 *     contents after the macro are not meaningful.
 *   - c0, c1, c2 must be modifiable 64-bit lvalues.
 *
 * The array argument is parenthesized so expressions such as (p+1) can be
 * passed, and __asm__ is used so the macro compiles in strict ISO modes.
 */
#define sqr_add_c(a,i,c0,c1,c2) do {			\
	__asm__ ("mulq %2"				\
		: "=a"(t1),"=d"(t2)			\
		: "a"((a)[i])				\
		: "cc");				\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(t1),"r"(t2),"g"(0)		\
		: "cc");				\
	} while (0)
291 | |||
/*
 * mul_add_c2(a,b,c0,c1,c2): (c2,c1,c0) += 2*a*b, x86_64 inline assembly.
 *
 * a is loaded into %rax and multiplied by the memory operand b with mulq;
 * the 128-bit product lands in t1 (low) and t2 (high).  The product is
 * then added into the three-word accumulator TWICE, each time with a full
 * addq/adcq/adcq carry chain.
 *
 * The product is deliberately not doubled in place first.  After doubling,
 * the middle word can be 2^64-1, and folding the carry out of c0 into it
 * with "adcq $0" wraps it to zero and silently drops a carry, producing a
 * wrong result for some inputs.  Adding the undoubled product twice keeps
 * every carry inside the three-word chain.
 *
 * Requirements on the caller:
 *   - t1 and t2 must be declared 64-bit scratch lvalues in scope; their
 *     contents after the macro are not meaningful.
 *   - b must be an lvalue (it is used with an "m" constraint).
 *   - c0, c1, c2 must be modifiable 64-bit lvalues.
 *
 * __asm__ is used rather than asm so the macro also compiles in strict
 * ISO modes (-std=c99 / -std=c11).
 */
#define mul_add_c2(a,b,c0,c1,c2) do {			\
	__asm__ ("mulq %3"				\
		: "=a"(t1),"=d"(t2)			\
		: "a"(a),"m"(b)				\
		: "cc");				\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(t1),"r"(t2),"g"(0)		\
		: "cc");				\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(t1),"r"(t2),"g"(0)		\
		: "cc");				\
	} while (0)
314 | #endif | ||
315 | |||
/*
 * sqr_add_c2(a,i,j,c0,c1,c2): (c2,c1,c0) += 2*a[i]*a[j].
 * Thin wrapper that passes the two array elements to mul_add_c2.
 */
#define sqr_add_c2(ap,i,j,lo,mid,hi) mul_add_c2((ap)[i],(ap)[j],lo,mid,hi)
318 | |||
319 | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | ||
320 | { | ||
321 | BN_ULONG bl,bh; | ||
322 | BN_ULONG t1,t2; | ||
323 | BN_ULONG c1,c2,c3; | ||
324 | |||
325 | c1=0; | ||
326 | c2=0; | ||
327 | c3=0; | ||
328 | mul_add_c(a[0],b[0],c1,c2,c3); | ||
329 | r[0]=c1; | ||
330 | c1=0; | ||
331 | mul_add_c(a[0],b[1],c2,c3,c1); | ||
332 | mul_add_c(a[1],b[0],c2,c3,c1); | ||
333 | r[1]=c2; | ||
334 | c2=0; | ||
335 | mul_add_c(a[2],b[0],c3,c1,c2); | ||
336 | mul_add_c(a[1],b[1],c3,c1,c2); | ||
337 | mul_add_c(a[0],b[2],c3,c1,c2); | ||
338 | r[2]=c3; | ||
339 | c3=0; | ||
340 | mul_add_c(a[0],b[3],c1,c2,c3); | ||
341 | mul_add_c(a[1],b[2],c1,c2,c3); | ||
342 | mul_add_c(a[2],b[1],c1,c2,c3); | ||
343 | mul_add_c(a[3],b[0],c1,c2,c3); | ||
344 | r[3]=c1; | ||
345 | c1=0; | ||
346 | mul_add_c(a[4],b[0],c2,c3,c1); | ||
347 | mul_add_c(a[3],b[1],c2,c3,c1); | ||
348 | mul_add_c(a[2],b[2],c2,c3,c1); | ||
349 | mul_add_c(a[1],b[3],c2,c3,c1); | ||
350 | mul_add_c(a[0],b[4],c2,c3,c1); | ||
351 | r[4]=c2; | ||
352 | c2=0; | ||
353 | mul_add_c(a[0],b[5],c3,c1,c2); | ||
354 | mul_add_c(a[1],b[4],c3,c1,c2); | ||
355 | mul_add_c(a[2],b[3],c3,c1,c2); | ||
356 | mul_add_c(a[3],b[2],c3,c1,c2); | ||
357 | mul_add_c(a[4],b[1],c3,c1,c2); | ||
358 | mul_add_c(a[5],b[0],c3,c1,c2); | ||
359 | r[5]=c3; | ||
360 | c3=0; | ||
361 | mul_add_c(a[6],b[0],c1,c2,c3); | ||
362 | mul_add_c(a[5],b[1],c1,c2,c3); | ||
363 | mul_add_c(a[4],b[2],c1,c2,c3); | ||
364 | mul_add_c(a[3],b[3],c1,c2,c3); | ||
365 | mul_add_c(a[2],b[4],c1,c2,c3); | ||
366 | mul_add_c(a[1],b[5],c1,c2,c3); | ||
367 | mul_add_c(a[0],b[6],c1,c2,c3); | ||
368 | r[6]=c1; | ||
369 | c1=0; | ||
370 | mul_add_c(a[0],b[7],c2,c3,c1); | ||
371 | mul_add_c(a[1],b[6],c2,c3,c1); | ||
372 | mul_add_c(a[2],b[5],c2,c3,c1); | ||
373 | mul_add_c(a[3],b[4],c2,c3,c1); | ||
374 | mul_add_c(a[4],b[3],c2,c3,c1); | ||
375 | mul_add_c(a[5],b[2],c2,c3,c1); | ||
376 | mul_add_c(a[6],b[1],c2,c3,c1); | ||
377 | mul_add_c(a[7],b[0],c2,c3,c1); | ||
378 | r[7]=c2; | ||
379 | c2=0; | ||
380 | mul_add_c(a[7],b[1],c3,c1,c2); | ||
381 | mul_add_c(a[6],b[2],c3,c1,c2); | ||
382 | mul_add_c(a[5],b[3],c3,c1,c2); | ||
383 | mul_add_c(a[4],b[4],c3,c1,c2); | ||
384 | mul_add_c(a[3],b[5],c3,c1,c2); | ||
385 | mul_add_c(a[2],b[6],c3,c1,c2); | ||
386 | mul_add_c(a[1],b[7],c3,c1,c2); | ||
387 | r[8]=c3; | ||
388 | c3=0; | ||
389 | mul_add_c(a[2],b[7],c1,c2,c3); | ||
390 | mul_add_c(a[3],b[6],c1,c2,c3); | ||
391 | mul_add_c(a[4],b[5],c1,c2,c3); | ||
392 | mul_add_c(a[5],b[4],c1,c2,c3); | ||
393 | mul_add_c(a[6],b[3],c1,c2,c3); | ||
394 | mul_add_c(a[7],b[2],c1,c2,c3); | ||
395 | r[9]=c1; | ||
396 | c1=0; | ||
397 | mul_add_c(a[7],b[3],c2,c3,c1); | ||
398 | mul_add_c(a[6],b[4],c2,c3,c1); | ||
399 | mul_add_c(a[5],b[5],c2,c3,c1); | ||
400 | mul_add_c(a[4],b[6],c2,c3,c1); | ||
401 | mul_add_c(a[3],b[7],c2,c3,c1); | ||
402 | r[10]=c2; | ||
403 | c2=0; | ||
404 | mul_add_c(a[4],b[7],c3,c1,c2); | ||
405 | mul_add_c(a[5],b[6],c3,c1,c2); | ||
406 | mul_add_c(a[6],b[5],c3,c1,c2); | ||
407 | mul_add_c(a[7],b[4],c3,c1,c2); | ||
408 | r[11]=c3; | ||
409 | c3=0; | ||
410 | mul_add_c(a[7],b[5],c1,c2,c3); | ||
411 | mul_add_c(a[6],b[6],c1,c2,c3); | ||
412 | mul_add_c(a[5],b[7],c1,c2,c3); | ||
413 | r[12]=c1; | ||
414 | c1=0; | ||
415 | mul_add_c(a[6],b[7],c2,c3,c1); | ||
416 | mul_add_c(a[7],b[6],c2,c3,c1); | ||
417 | r[13]=c2; | ||
418 | c2=0; | ||
419 | mul_add_c(a[7],b[7],c3,c1,c2); | ||
420 | r[14]=c3; | ||
421 | r[15]=c1; | ||
422 | } | ||
423 | |||
424 | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | ||
425 | { | ||
426 | BN_ULONG bl,bh; | ||
427 | BN_ULONG t1,t2; | ||
428 | BN_ULONG c1,c2,c3; | ||
429 | |||
430 | c1=0; | ||
431 | c2=0; | ||
432 | c3=0; | ||
433 | mul_add_c(a[0],b[0],c1,c2,c3); | ||
434 | r[0]=c1; | ||
435 | c1=0; | ||
436 | mul_add_c(a[0],b[1],c2,c3,c1); | ||
437 | mul_add_c(a[1],b[0],c2,c3,c1); | ||
438 | r[1]=c2; | ||
439 | c2=0; | ||
440 | mul_add_c(a[2],b[0],c3,c1,c2); | ||
441 | mul_add_c(a[1],b[1],c3,c1,c2); | ||
442 | mul_add_c(a[0],b[2],c3,c1,c2); | ||
443 | r[2]=c3; | ||
444 | c3=0; | ||
445 | mul_add_c(a[0],b[3],c1,c2,c3); | ||
446 | mul_add_c(a[1],b[2],c1,c2,c3); | ||
447 | mul_add_c(a[2],b[1],c1,c2,c3); | ||
448 | mul_add_c(a[3],b[0],c1,c2,c3); | ||
449 | r[3]=c1; | ||
450 | c1=0; | ||
451 | mul_add_c(a[3],b[1],c2,c3,c1); | ||
452 | mul_add_c(a[2],b[2],c2,c3,c1); | ||
453 | mul_add_c(a[1],b[3],c2,c3,c1); | ||
454 | r[4]=c2; | ||
455 | c2=0; | ||
456 | mul_add_c(a[2],b[3],c3,c1,c2); | ||
457 | mul_add_c(a[3],b[2],c3,c1,c2); | ||
458 | r[5]=c3; | ||
459 | c3=0; | ||
460 | mul_add_c(a[3],b[3],c1,c2,c3); | ||
461 | r[6]=c1; | ||
462 | r[7]=c2; | ||
463 | } | ||
464 | |||
465 | void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) | ||
466 | { | ||
467 | BN_ULONG bl,bh; | ||
468 | BN_ULONG t1,t2; | ||
469 | BN_ULONG c1,c2,c3; | ||
470 | |||
471 | c1=0; | ||
472 | c2=0; | ||
473 | c3=0; | ||
474 | sqr_add_c(a,0,c1,c2,c3); | ||
475 | r[0]=c1; | ||
476 | c1=0; | ||
477 | sqr_add_c2(a,1,0,c2,c3,c1); | ||
478 | r[1]=c2; | ||
479 | c2=0; | ||
480 | sqr_add_c(a,1,c3,c1,c2); | ||
481 | sqr_add_c2(a,2,0,c3,c1,c2); | ||
482 | r[2]=c3; | ||
483 | c3=0; | ||
484 | sqr_add_c2(a,3,0,c1,c2,c3); | ||
485 | sqr_add_c2(a,2,1,c1,c2,c3); | ||
486 | r[3]=c1; | ||
487 | c1=0; | ||
488 | sqr_add_c(a,2,c2,c3,c1); | ||
489 | sqr_add_c2(a,3,1,c2,c3,c1); | ||
490 | sqr_add_c2(a,4,0,c2,c3,c1); | ||
491 | r[4]=c2; | ||
492 | c2=0; | ||
493 | sqr_add_c2(a,5,0,c3,c1,c2); | ||
494 | sqr_add_c2(a,4,1,c3,c1,c2); | ||
495 | sqr_add_c2(a,3,2,c3,c1,c2); | ||
496 | r[5]=c3; | ||
497 | c3=0; | ||
498 | sqr_add_c(a,3,c1,c2,c3); | ||
499 | sqr_add_c2(a,4,2,c1,c2,c3); | ||
500 | sqr_add_c2(a,5,1,c1,c2,c3); | ||
501 | sqr_add_c2(a,6,0,c1,c2,c3); | ||
502 | r[6]=c1; | ||
503 | c1=0; | ||
504 | sqr_add_c2(a,7,0,c2,c3,c1); | ||
505 | sqr_add_c2(a,6,1,c2,c3,c1); | ||
506 | sqr_add_c2(a,5,2,c2,c3,c1); | ||
507 | sqr_add_c2(a,4,3,c2,c3,c1); | ||
508 | r[7]=c2; | ||
509 | c2=0; | ||
510 | sqr_add_c(a,4,c3,c1,c2); | ||
511 | sqr_add_c2(a,5,3,c3,c1,c2); | ||
512 | sqr_add_c2(a,6,2,c3,c1,c2); | ||
513 | sqr_add_c2(a,7,1,c3,c1,c2); | ||
514 | r[8]=c3; | ||
515 | c3=0; | ||
516 | sqr_add_c2(a,7,2,c1,c2,c3); | ||
517 | sqr_add_c2(a,6,3,c1,c2,c3); | ||
518 | sqr_add_c2(a,5,4,c1,c2,c3); | ||
519 | r[9]=c1; | ||
520 | c1=0; | ||
521 | sqr_add_c(a,5,c2,c3,c1); | ||
522 | sqr_add_c2(a,6,4,c2,c3,c1); | ||
523 | sqr_add_c2(a,7,3,c2,c3,c1); | ||
524 | r[10]=c2; | ||
525 | c2=0; | ||
526 | sqr_add_c2(a,7,4,c3,c1,c2); | ||
527 | sqr_add_c2(a,6,5,c3,c1,c2); | ||
528 | r[11]=c3; | ||
529 | c3=0; | ||
530 | sqr_add_c(a,6,c1,c2,c3); | ||
531 | sqr_add_c2(a,7,5,c1,c2,c3); | ||
532 | r[12]=c1; | ||
533 | c1=0; | ||
534 | sqr_add_c2(a,7,6,c2,c3,c1); | ||
535 | r[13]=c2; | ||
536 | c2=0; | ||
537 | sqr_add_c(a,7,c3,c1,c2); | ||
538 | r[14]=c3; | ||
539 | r[15]=c1; | ||
540 | } | ||
541 | |||
542 | void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) | ||
543 | { | ||
544 | BN_ULONG bl,bh; | ||
545 | BN_ULONG t1,t2; | ||
546 | BN_ULONG c1,c2,c3; | ||
547 | |||
548 | c1=0; | ||
549 | c2=0; | ||
550 | c3=0; | ||
551 | sqr_add_c(a,0,c1,c2,c3); | ||
552 | r[0]=c1; | ||
553 | c1=0; | ||
554 | sqr_add_c2(a,1,0,c2,c3,c1); | ||
555 | r[1]=c2; | ||
556 | c2=0; | ||
557 | sqr_add_c(a,1,c3,c1,c2); | ||
558 | sqr_add_c2(a,2,0,c3,c1,c2); | ||
559 | r[2]=c3; | ||
560 | c3=0; | ||
561 | sqr_add_c2(a,3,0,c1,c2,c3); | ||
562 | sqr_add_c2(a,2,1,c1,c2,c3); | ||
563 | r[3]=c1; | ||
564 | c1=0; | ||
565 | sqr_add_c(a,2,c2,c3,c1); | ||
566 | sqr_add_c2(a,3,1,c2,c3,c1); | ||
567 | r[4]=c2; | ||
568 | c2=0; | ||
569 | sqr_add_c2(a,3,2,c3,c1,c2); | ||
570 | r[5]=c3; | ||
571 | c3=0; | ||
572 | sqr_add_c(a,3,c1,c2,c3); | ||
573 | r[6]=c1; | ||
574 | r[7]=c2; | ||
575 | } | ||