diff options
| author | markus <> | 2003-05-11 21:36:58 +0000 |
|---|---|---|
| committer | markus <> | 2003-05-11 21:36:58 +0000 |
| commit | 1c98a87f0daac81245653c227eb2f2508a22a965 (patch) | |
| tree | 3de6d603296ec563b936da4e6a8a1e33d48f8884 /src/lib/libcrypto/bn | |
| parent | 31392c89d1135cf2a416f97295f6d21681b3fbc4 (diff) | |
| download | openbsd-1c98a87f0daac81245653c227eb2f2508a22a965.tar.gz openbsd-1c98a87f0daac81245653c227eb2f2508a22a965.tar.bz2 openbsd-1c98a87f0daac81245653c227eb2f2508a22a965.zip | |
import 0.9.7b (without idea and rc5)
Diffstat (limited to 'src/lib/libcrypto/bn')
| -rw-r--r-- | src/lib/libcrypto/bn/asm/ia64.S | 235 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/pa-risc2.s | 36 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/x86_64-gcc.c | 575 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/bn.h | 2 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/bn_div.c | 28 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/bn_lcl.h | 26 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/bn_lib.c | 4 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/bn_mul.c | 529 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/bn_prime.c | 2 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/bn_rand.c | 2 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/bn_word.c | 5 |
11 files changed, 905 insertions, 539 deletions
diff --git a/src/lib/libcrypto/bn/asm/ia64.S b/src/lib/libcrypto/bn/asm/ia64.S index ae56066310..7dfda85566 100644 --- a/src/lib/libcrypto/bn/asm/ia64.S +++ b/src/lib/libcrypto/bn/asm/ia64.S | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | .explicit | 1 | .explicit |
| 2 | .text | 2 | .text |
| 3 | .ident "ia64.S, Version 1.1" | 3 | .ident "ia64.S, Version 2.0" |
| 4 | .ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" | 4 | .ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" |
| 5 | 5 | ||
| 6 | // | 6 | // |
| @@ -13,6 +13,35 @@ | |||
| 13 | // disclaimed. | 13 | // disclaimed. |
| 14 | // ==================================================================== | 14 | // ==================================================================== |
| 15 | // | 15 | // |
| 16 | // Version 2.x is an Itanium2 re-tune. A few words about how Itanium2 is | ||
| 17 | // different from Itanium to this module viewpoint. Most notably, is it | ||
| 18 | // "wider" than Itanium? Can you experience loop scalability as | ||
| 19 | // discussed in commentary sections? Not really:-( Itanium2 has 6 | ||
| 20 | // integer ALU ports, i.e. it's 2 ports wider, but it's not enough to | ||
| 21 | // spin twice as fast, as I need 8 IALU ports. Amount of floating point | ||
| 22 | // ports is the same, i.e. 2, while I need 4. In other words, to this | ||
| 23 | // module Itanium2 remains effectively as "wide" as Itanium. Yet it's | ||
| 24 | // essentially different in respect to this module, and a re-tune was | ||
| 25 | // required. Well, because some instruction latencies have changed. Most | ||
| 26 | // noticeably those intensively used: | ||
| 27 | // | ||
| 28 | // Itanium Itanium2 | ||
| 29 | // ldf8 9 6 L2 hit | ||
| 30 | // ld8 2 1 L1 hit | ||
| 31 | // getf 2 5 | ||
| 32 | // xma[->getf] 7[+1] 4[+0] | ||
| 33 | // add[->st8] 1[+1] 1[+0] | ||
| 34 | // | ||
| 35 | // What does it mean? You might ratiocinate that the original code | ||
| 36 | // should run just faster... Because sum of latencies is smaller... | ||
| 37 | // Wrong! Note that getf latency increased. This means that if a loop is | ||
| 38 | // scheduled for lower latency (and they are), then it will suffer from | ||
| 39 | // stall condition and the code will therefore turn anti-scalable, e.g. | ||
| 40 | // original bn_mul_words spun at 5*n or 2.5 times slower than expected | ||
| 41 | // on Itanium2! What to do? Reschedule loops for Itanium2? But then | ||
| 42 | // Itanium would exhibit anti-scalability. So I've chosen to reschedule | ||
| 43 | // for worst latency for every instruction aiming for best *all-round* | ||
| 44 | // performance. | ||
| 16 | 45 | ||
| 17 | // Q. How much faster does it get? | 46 | // Q. How much faster does it get? |
| 18 | // A. Here is the output from 'openssl speed rsa dsa' for vanilla | 47 | // A. Here is the output from 'openssl speed rsa dsa' for vanilla |
| @@ -149,12 +178,27 @@ bn_add_words: | |||
| 149 | brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16 | 178 | brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16 |
| 150 | } | 179 | } |
| 151 | .body | 180 | .body |
| 152 | { .mib; mov r14=r32 // rp | 181 | { .mib; |
| 182 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 183 | addp4 r14=0,r32 // rp | ||
| 184 | #else | ||
| 185 | mov r14=r32 // rp | ||
| 186 | #endif | ||
| 153 | mov r9=pr };; | 187 | mov r9=pr };; |
| 154 | { .mii; mov r15=r33 // ap | 188 | { .mii; |
| 189 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 190 | addp4 r15=0,r33 // ap | ||
| 191 | #else | ||
| 192 | mov r15=r33 // ap | ||
| 193 | #endif | ||
| 155 | mov ar.lc=r10 | 194 | mov ar.lc=r10 |
| 156 | mov ar.ec=6 } | 195 | mov ar.ec=6 } |
| 157 | { .mib; mov r16=r34 // bp | 196 | { .mib; |
| 197 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 198 | addp4 r16=0,r34 // bp | ||
| 199 | #else | ||
| 200 | mov r16=r34 // bp | ||
| 201 | #endif | ||
| 158 | mov pr.rot=1<<16 };; | 202 | mov pr.rot=1<<16 };; |
| 159 | 203 | ||
| 160 | .L_bn_add_words_ctop: | 204 | .L_bn_add_words_ctop: |
| @@ -174,7 +218,7 @@ bn_add_words: | |||
| 174 | 218 | ||
| 175 | { .mii; | 219 | { .mii; |
| 176 | (p59) add r8=1,r8 // return value | 220 | (p59) add r8=1,r8 // return value |
| 177 | mov pr=r9,-1 | 221 | mov pr=r9,0x1ffff |
| 178 | mov ar.lc=r3 } | 222 | mov ar.lc=r3 } |
| 179 | { .mbb; nop.b 0x0 | 223 | { .mbb; nop.b 0x0 |
| 180 | br.ret.sptk.many b0 };; | 224 | br.ret.sptk.many b0 };; |
| @@ -202,12 +246,27 @@ bn_sub_words: | |||
| 202 | brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16 | 246 | brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16 |
| 203 | } | 247 | } |
| 204 | .body | 248 | .body |
| 205 | { .mib; mov r14=r32 // rp | 249 | { .mib; |
| 250 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 251 | addp4 r14=0,r32 // rp | ||
| 252 | #else | ||
| 253 | mov r14=r32 // rp | ||
| 254 | #endif | ||
| 206 | mov r9=pr };; | 255 | mov r9=pr };; |
| 207 | { .mii; mov r15=r33 // ap | 256 | { .mii; |
| 257 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 258 | addp4 r15=0,r33 // ap | ||
| 259 | #else | ||
| 260 | mov r15=r33 // ap | ||
| 261 | #endif | ||
| 208 | mov ar.lc=r10 | 262 | mov ar.lc=r10 |
| 209 | mov ar.ec=6 } | 263 | mov ar.ec=6 } |
| 210 | { .mib; mov r16=r34 // bp | 264 | { .mib; |
| 265 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 266 | addp4 r16=0,r34 // bp | ||
| 267 | #else | ||
| 268 | mov r16=r34 // bp | ||
| 269 | #endif | ||
| 211 | mov pr.rot=1<<16 };; | 270 | mov pr.rot=1<<16 };; |
| 212 | 271 | ||
| 213 | .L_bn_sub_words_ctop: | 272 | .L_bn_sub_words_ctop: |
| @@ -227,7 +286,7 @@ bn_sub_words: | |||
| 227 | 286 | ||
| 228 | { .mii; | 287 | { .mii; |
| 229 | (p59) add r8=1,r8 // return value | 288 | (p59) add r8=1,r8 // return value |
| 230 | mov pr=r9,-1 | 289 | mov pr=r9,0x1ffff |
| 231 | mov ar.lc=r3 } | 290 | mov ar.lc=r3 } |
| 232 | { .mbb; nop.b 0x0 | 291 | { .mbb; nop.b 0x0 |
| 233 | br.ret.sptk.many b0 };; | 292 | br.ret.sptk.many b0 };; |
| @@ -253,7 +312,7 @@ bn_mul_words: | |||
| 253 | #ifdef XMA_TEMPTATION | 312 | #ifdef XMA_TEMPTATION |
| 254 | { .mfi; alloc r2=ar.pfs,4,0,0,0 };; | 313 | { .mfi; alloc r2=ar.pfs,4,0,0,0 };; |
| 255 | #else | 314 | #else |
| 256 | { .mfi; alloc r2=ar.pfs,4,4,0,8 };; | 315 | { .mfi; alloc r2=ar.pfs,4,12,0,16 };; |
| 257 | #endif | 316 | #endif |
| 258 | { .mib; mov r8=r0 // return value | 317 | { .mib; mov r8=r0 // return value |
| 259 | cmp4.le p6,p0=r34,r0 | 318 | cmp4.le p6,p0=r34,r0 |
| @@ -266,24 +325,30 @@ bn_mul_words: | |||
| 266 | 325 | ||
| 267 | .body | 326 | .body |
| 268 | { .mib; setf.sig f8=r35 // w | 327 | { .mib; setf.sig f8=r35 // w |
| 269 | mov pr.rot=0x400001<<16 | 328 | mov pr.rot=0x800001<<16 |
| 270 | // ------^----- serves as (p48) at first (p26) | 329 | // ------^----- serves as (p50) at first (p27) |
| 271 | brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16 | 330 | brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16 |
| 272 | } | 331 | } |
| 273 | 332 | ||
| 274 | #ifndef XMA_TEMPTATION | 333 | #ifndef XMA_TEMPTATION |
| 275 | 334 | ||
| 276 | { .mii; mov r14=r32 // rp | 335 | { .mii; |
| 277 | mov r15=r33 // ap | 336 | #if defined(_HPUX_SOURCE) && defined(_ILP32) |
| 337 | addp4 r14=0,r32 // rp | ||
| 338 | addp4 r15=0,r33 // ap | ||
| 339 | #else | ||
| 340 | mov r14=r32 // rp | ||
| 341 | mov r15=r33 // ap | ||
| 342 | #endif | ||
| 278 | mov ar.lc=r10 } | 343 | mov ar.lc=r10 } |
| 279 | { .mii; mov r39=0 // serves as r33 at first (p26) | 344 | { .mii; mov r40=0 // serves as r35 at first (p27) |
| 280 | mov ar.ec=12 };; | 345 | mov ar.ec=13 };; |
| 281 | 346 | ||
| 282 | // This loop spins in 2*(n+11) ticks. It's scheduled for data in L2 | 347 | // This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium |
| 283 | // cache (i.e. 9 ticks away) as floating point load/store instructions | 348 | // L2 cache (i.e. 9 ticks away) as floating point load/store instructions |
| 284 | // bypass L1 cache and L2 latency is actually best-case scenario for | 349 | // bypass L1 cache and L2 latency is actually best-case scenario for |
| 285 | // ldf8. The loop is not scalable and shall run in 2*(n+11) even on | 350 | // ldf8. The loop is not scalable and shall run in 2*(n+12) even on |
| 286 | // "wider" IA-64 implementations. It's a trade-off here. n+22 loop | 351 | // "wider" IA-64 implementations. It's a trade-off here. n+24 loop |
| 287 | // would give us ~5% in *overall* performance improvement on "wider" | 352 | // would give us ~5% in *overall* performance improvement on "wider" |
| 288 | // IA-64, but would hurt Itanium for about same because of longer | 353 | // IA-64, but would hurt Itanium for about same because of longer |
| 289 | // epilogue. As it's a matter of few percents in either case I've | 354 | // epilogue. As it's a matter of few percents in either case I've |
| @@ -291,25 +356,25 @@ bn_mul_words: | |||
| 291 | // this very instruction sequence in bn_mul_add_words loop which in | 356 | // this very instruction sequence in bn_mul_add_words loop which in |
| 292 | // turn is scalable). | 357 | // turn is scalable). |
| 293 | .L_bn_mul_words_ctop: | 358 | .L_bn_mul_words_ctop: |
| 294 | { .mfi; (p25) getf.sig r36=f49 // low | 359 | { .mfi; (p25) getf.sig r36=f52 // low |
| 295 | (p21) xmpy.lu f45=f37,f8 | 360 | (p21) xmpy.lu f48=f37,f8 |
| 296 | (p27) cmp.ltu p52,p48=r39,r38 } | 361 | (p28) cmp.ltu p54,p50=r41,r39 } |
| 297 | { .mfi; (p16) ldf8 f32=[r15],8 | 362 | { .mfi; (p16) ldf8 f32=[r15],8 |
| 298 | (p21) xmpy.hu f38=f37,f8 | 363 | (p21) xmpy.hu f40=f37,f8 |
| 299 | (p0) nop.i 0x0 };; | 364 | (p0) nop.i 0x0 };; |
| 300 | { .mii; (p26) getf.sig r32=f43 // high | 365 | { .mii; (p25) getf.sig r32=f44 // high |
| 301 | .pred.rel "mutex",p48,p52 | 366 | .pred.rel "mutex",p50,p54 |
| 302 | (p48) add r38=r37,r33 // (p26) | 367 | (p50) add r40=r38,r35 // (p27) |
| 303 | (p52) add r38=r37,r33,1 } // (p26) | 368 | (p54) add r40=r38,r35,1 } // (p27) |
| 304 | { .mfb; (p27) st8 [r14]=r39,8 | 369 | { .mfb; (p28) st8 [r14]=r41,8 |
| 305 | (p0) nop.f 0x0 | 370 | (p0) nop.f 0x0 |
| 306 | br.ctop.sptk .L_bn_mul_words_ctop };; | 371 | br.ctop.sptk .L_bn_mul_words_ctop };; |
| 307 | .L_bn_mul_words_cend: | 372 | .L_bn_mul_words_cend: |
| 308 | 373 | ||
| 309 | { .mii; nop.m 0x0 | 374 | { .mii; nop.m 0x0 |
| 310 | .pred.rel "mutex",p49,p53 | 375 | .pred.rel "mutex",p51,p55 |
| 311 | (p49) add r8=r34,r0 | 376 | (p51) add r8=r36,r0 |
| 312 | (p53) add r8=r34,r0,1 } | 377 | (p55) add r8=r36,r0,1 } |
| 313 | { .mfb; nop.m 0x0 | 378 | { .mfb; nop.m 0x0 |
| 314 | nop.f 0x0 | 379 | nop.f 0x0 |
| 315 | nop.b 0x0 } | 380 | nop.b 0x0 } |
| @@ -344,7 +409,7 @@ bn_mul_words: | |||
| 344 | #endif // XMA_TEMPTATION | 409 | #endif // XMA_TEMPTATION |
| 345 | 410 | ||
| 346 | { .mii; nop.m 0x0 | 411 | { .mii; nop.m 0x0 |
| 347 | mov pr=r9,-1 | 412 | mov pr=r9,0x1ffff |
| 348 | mov ar.lc=r3 } | 413 | mov ar.lc=r3 } |
| 349 | { .mfb; rum 1<<5 // clear um.mfh | 414 | { .mfb; rum 1<<5 // clear um.mfh |
| 350 | nop.f 0x0 | 415 | nop.f 0x0 |
| @@ -376,59 +441,69 @@ bn_mul_add_words: | |||
| 376 | 441 | ||
| 377 | .body | 442 | .body |
| 378 | { .mib; setf.sig f8=r35 // w | 443 | { .mib; setf.sig f8=r35 // w |
| 379 | mov pr.rot=0x400001<<16 | 444 | mov pr.rot=0x800001<<16 |
| 380 | // ------^----- serves as (p48) at first (p26) | 445 | // ------^----- serves as (p50) at first (p27) |
| 381 | brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16 | 446 | brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16 |
| 382 | } | 447 | } |
| 383 | { .mii; mov r14=r32 // rp | 448 | { .mii; |
| 384 | mov r15=r33 // ap | 449 | #if defined(_HPUX_SOURCE) && defined(_ILP32) |
| 450 | addp4 r14=0,r32 // rp | ||
| 451 | addp4 r15=0,r33 // ap | ||
| 452 | #else | ||
| 453 | mov r14=r32 // rp | ||
| 454 | mov r15=r33 // ap | ||
| 455 | #endif | ||
| 385 | mov ar.lc=r10 } | 456 | mov ar.lc=r10 } |
| 386 | { .mii; mov r39=0 // serves as r33 at first (p26) | 457 | { .mii; mov r40=0 // serves as r35 at first (p27) |
| 387 | mov r18=r32 // rp copy | 458 | #if defined(_HPUX_SOURCE) && defined(_ILP32) |
| 388 | mov ar.ec=14 };; | 459 | addp4 r18=0,r32 // rp copy |
| 460 | #else | ||
| 461 | mov r18=r32 // rp copy | ||
| 462 | #endif | ||
| 463 | mov ar.ec=15 };; | ||
| 389 | 464 | ||
| 390 | // This loop spins in 3*(n+13) ticks on Itanium and should spin in | 465 | // This loop spins in 3*(n+14) ticks on Itanium and should spin in |
| 391 | // 2*(n+13) on "wider" IA-64 implementations (to be verified with new | 466 | // 2*(n+14) on "wider" IA-64 implementations (to be verified with new |
| 392 | // µ-architecture manuals as they become available). As usual it's | 467 | // µ-architecture manuals as they become available). As usual it's |
| 393 | // possible to compress the epilogue, down to 10 in this case, at the | 468 | // possible to compress the epilogue, down to 10 in this case, at the |
| 394 | // cost of scalability. Compressed (and therefore non-scalable) loop | 469 | // cost of scalability. Compressed (and therefore non-scalable) loop |
| 395 | // running at 3*(n+10) would buy you ~10% on Itanium but take ~35% | 470 | // running at 3*(n+11) would buy you ~10% on Itanium but take ~35% |
| 396 | // from "wider" IA-64 so let it be scalable! Special attention was | 471 | // from "wider" IA-64 so let it be scalable! Special attention was |
| 397 | // paid for having the loop body split at 64-byte boundary. ld8 is | 472 | // paid for having the loop body split at 64-byte boundary. ld8 is |
| 398 | // scheduled for L1 cache as the data is more than likely there. | 473 | // scheduled for L1 cache as the data is more than likely there. |
| 399 | // Indeed, bn_mul_words has put it there a moment ago:-) | 474 | // Indeed, bn_mul_words has put it there a moment ago:-) |
| 400 | .L_bn_mul_add_words_ctop: | 475 | .L_bn_mul_add_words_ctop: |
| 401 | { .mfi; (p25) getf.sig r36=f49 // low | 476 | { .mfi; (p25) getf.sig r36=f52 // low |
| 402 | (p21) xmpy.lu f45=f37,f8 | 477 | (p21) xmpy.lu f48=f37,f8 |
| 403 | (p27) cmp.ltu p52,p48=r39,r38 } | 478 | (p28) cmp.ltu p54,p50=r41,r39 } |
| 404 | { .mfi; (p16) ldf8 f32=[r15],8 | 479 | { .mfi; (p16) ldf8 f32=[r15],8 |
| 405 | (p21) xmpy.hu f38=f37,f8 | 480 | (p21) xmpy.hu f40=f37,f8 |
| 406 | (p27) add r43=r43,r39 };; | 481 | (p28) add r45=r45,r41 };; |
| 407 | { .mii; (p26) getf.sig r32=f43 // high | 482 | { .mii; (p25) getf.sig r32=f44 // high |
| 408 | .pred.rel "mutex",p48,p52 | 483 | .pred.rel "mutex",p50,p54 |
| 409 | (p48) add r38=r37,r33 // (p26) | 484 | (p50) add r40=r38,r35 // (p27) |
| 410 | (p52) add r38=r37,r33,1 } // (p26) | 485 | (p54) add r40=r38,r35,1 } // (p27) |
| 411 | { .mfb; (p27) cmp.ltu.unc p56,p0=r43,r39 | 486 | { .mfb; (p28) cmp.ltu.unc p60,p0=r45,r41 |
| 412 | (p0) nop.f 0x0 | 487 | (p0) nop.f 0x0 |
| 413 | (p0) nop.b 0x0 } | 488 | (p0) nop.b 0x0 } |
| 414 | { .mii; (p26) ld8 r42=[r18],8 | 489 | { .mii; (p27) ld8 r44=[r18],8 |
| 415 | (p58) cmp.eq.or p57,p0=-1,r44 | 490 | (p62) cmp.eq.or p61,p0=-1,r46 |
| 416 | (p58) add r44=1,r44 } | 491 | (p62) add r46=1,r46 } |
| 417 | { .mfb; (p29) st8 [r14]=r45,8 | 492 | { .mfb; (p30) st8 [r14]=r47,8 |
| 418 | (p0) nop.f 0x0 | 493 | (p0) nop.f 0x0 |
| 419 | br.ctop.sptk .L_bn_mul_add_words_ctop};; | 494 | br.ctop.sptk .L_bn_mul_add_words_ctop};; |
| 420 | .L_bn_mul_add_words_cend: | 495 | .L_bn_mul_add_words_cend: |
| 421 | 496 | ||
| 422 | { .mii; nop.m 0x0 | 497 | { .mii; nop.m 0x0 |
| 423 | .pred.rel "mutex",p51,p55 | 498 | .pred.rel "mutex",p53,p57 |
| 424 | (p51) add r8=r36,r0 | 499 | (p53) add r8=r38,r0 |
| 425 | (p55) add r8=r36,r0,1 } | 500 | (p57) add r8=r38,r0,1 } |
| 426 | { .mfb; nop.m 0x0 | 501 | { .mfb; nop.m 0x0 |
| 427 | nop.f 0x0 | 502 | nop.f 0x0 |
| 428 | nop.b 0x0 };; | 503 | nop.b 0x0 };; |
| 429 | { .mii; | 504 | { .mii; |
| 430 | (p59) add r8=1,r8 | 505 | (p63) add r8=1,r8 |
| 431 | mov pr=r9,-1 | 506 | mov pr=r9,0x1ffff |
| 432 | mov ar.lc=r3 } | 507 | mov ar.lc=r3 } |
| 433 | { .mfb; rum 1<<5 // clear um.mfh | 508 | { .mfb; rum 1<<5 // clear um.mfh |
| 434 | nop.f 0x0 | 509 | nop.f 0x0 |
| @@ -461,6 +536,10 @@ bn_sqr_words: | |||
| 461 | mov r9=pr };; | 536 | mov r9=pr };; |
| 462 | 537 | ||
| 463 | .body | 538 | .body |
| 539 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 540 | { .mii; addp4 r32=0,r32 | ||
| 541 | addp4 r33=0,r33 };; | ||
| 542 | #endif | ||
| 464 | { .mib; | 543 | { .mib; |
| 465 | mov pr.rot=1<<16 | 544 | mov pr.rot=1<<16 |
| 466 | brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16 | 545 | brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16 |
| @@ -492,7 +571,7 @@ bn_sqr_words: | |||
| 492 | .L_bn_sqr_words_cend: | 571 | .L_bn_sqr_words_cend: |
| 493 | 572 | ||
| 494 | { .mii; nop.m 0x0 | 573 | { .mii; nop.m 0x0 |
| 495 | mov pr=r9,-1 | 574 | mov pr=r9,0x1ffff |
| 496 | mov ar.lc=r3 } | 575 | mov ar.lc=r3 } |
| 497 | { .mfb; rum 1<<5 // clear um.mfh | 576 | { .mfb; rum 1<<5 // clear um.mfh |
| 498 | nop.f 0x0 | 577 | nop.f 0x0 |
| @@ -526,7 +605,14 @@ bn_sqr_comba8: | |||
| 526 | .prologue | 605 | .prologue |
| 527 | .fframe 0 | 606 | .fframe 0 |
| 528 | .save ar.pfs,r2 | 607 | .save ar.pfs,r2 |
| 608 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 529 | { .mii; alloc r2=ar.pfs,2,1,0,0 | 609 | { .mii; alloc r2=ar.pfs,2,1,0,0 |
| 610 | addp4 r33=0,r33 | ||
| 611 | addp4 r32=0,r32 };; | ||
| 612 | { .mii; | ||
| 613 | #else | ||
| 614 | { .mii; alloc r2=ar.pfs,2,1,0,0 | ||
| 615 | #endif | ||
| 530 | mov r34=r33 | 616 | mov r34=r33 |
| 531 | add r14=8,r33 };; | 617 | add r14=8,r33 };; |
| 532 | .body | 618 | .body |
| @@ -587,7 +673,14 @@ bn_mul_comba8: | |||
| 587 | .prologue | 673 | .prologue |
| 588 | .fframe 0 | 674 | .fframe 0 |
| 589 | .save ar.pfs,r2 | 675 | .save ar.pfs,r2 |
| 676 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 590 | { .mii; alloc r2=ar.pfs,3,0,0,0 | 677 | { .mii; alloc r2=ar.pfs,3,0,0,0 |
| 678 | addp4 r33=0,r33 | ||
| 679 | addp4 r34=0,r34 };; | ||
| 680 | { .mii; addp4 r32=0,r32 | ||
| 681 | #else | ||
| 682 | { .mii; alloc r2=ar.pfs,3,0,0,0 | ||
| 683 | #endif | ||
| 591 | add r14=8,r33 | 684 | add r14=8,r33 |
| 592 | add r17=8,r34 } | 685 | add r17=8,r34 } |
| 593 | .body | 686 | .body |
| @@ -1138,7 +1231,14 @@ bn_sqr_comba4: | |||
| 1138 | .prologue | 1231 | .prologue |
| 1139 | .fframe 0 | 1232 | .fframe 0 |
| 1140 | .save ar.pfs,r2 | 1233 | .save ar.pfs,r2 |
| 1234 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 1235 | { .mii; alloc r2=ar.pfs,2,1,0,0 | ||
| 1236 | addp4 r32=0,r32 | ||
| 1237 | addp4 r33=0,r33 };; | ||
| 1238 | { .mii; | ||
| 1239 | #else | ||
| 1141 | { .mii; alloc r2=ar.pfs,2,1,0,0 | 1240 | { .mii; alloc r2=ar.pfs,2,1,0,0 |
| 1241 | #endif | ||
| 1142 | mov r34=r33 | 1242 | mov r34=r33 |
| 1143 | add r14=8,r33 };; | 1243 | add r14=8,r33 };; |
| 1144 | .body | 1244 | .body |
| @@ -1164,7 +1264,14 @@ bn_mul_comba4: | |||
| 1164 | .prologue | 1264 | .prologue |
| 1165 | .fframe 0 | 1265 | .fframe 0 |
| 1166 | .save ar.pfs,r2 | 1266 | .save ar.pfs,r2 |
| 1267 | #if defined(_HPUX_SOURCE) && defined(_ILP32) | ||
| 1268 | { .mii; alloc r2=ar.pfs,3,0,0,0 | ||
| 1269 | addp4 r33=0,r33 | ||
| 1270 | addp4 r34=0,r34 };; | ||
| 1271 | { .mii; addp4 r32=0,r32 | ||
| 1272 | #else | ||
| 1167 | { .mii; alloc r2=ar.pfs,3,0,0,0 | 1273 | { .mii; alloc r2=ar.pfs,3,0,0,0 |
| 1274 | #endif | ||
| 1168 | add r14=8,r33 | 1275 | add r14=8,r33 |
| 1169 | add r17=8,r34 } | 1276 | add r17=8,r34 } |
| 1170 | .body | 1277 | .body |
| @@ -1464,7 +1571,7 @@ bn_div_words: | |||
| 1464 | or r8=r8,r33 | 1571 | or r8=r8,r33 |
| 1465 | mov ar.pfs=r2 };; | 1572 | mov ar.pfs=r2 };; |
| 1466 | { .mii; shr.u r9=H,I // remainder if anybody wants it | 1573 | { .mii; shr.u r9=H,I // remainder if anybody wants it |
| 1467 | mov pr=r10,-1 } | 1574 | mov pr=r10,0x1ffff } |
| 1468 | { .mfb; br.ret.sptk.many b0 };; | 1575 | { .mfb; br.ret.sptk.many b0 };; |
| 1469 | 1576 | ||
| 1470 | // Unsigned 64 by 32 (well, by 64 for the moment) bit integer division | 1577 | // Unsigned 64 by 32 (well, by 64 for the moment) bit integer division |
diff --git a/src/lib/libcrypto/bn/asm/pa-risc2.s b/src/lib/libcrypto/bn/asm/pa-risc2.s index af9730d062..f3b16290eb 100644 --- a/src/lib/libcrypto/bn/asm/pa-risc2.s +++ b/src/lib/libcrypto/bn/asm/pa-risc2.s | |||
| @@ -747,8 +747,8 @@ bn_div_words | |||
| 747 | .PROC | 747 | .PROC |
| 748 | .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN | 748 | .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN |
| 749 | .IMPORT BN_num_bits_word,CODE | 749 | .IMPORT BN_num_bits_word,CODE |
| 750 | .IMPORT __iob,DATA | 750 | ;--- not PIC .IMPORT __iob,DATA |
| 751 | .IMPORT fprintf,CODE | 751 | ;--- not PIC .IMPORT fprintf,CODE |
| 752 | .IMPORT abort,CODE | 752 | .IMPORT abort,CODE |
| 753 | .IMPORT $$div2U,MILLICODE | 753 | .IMPORT $$div2U,MILLICODE |
| 754 | .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE | 754 | .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE |
| @@ -844,12 +844,12 @@ $0006001A | |||
| 844 | MOVIB,TR 2,%r8,$0006001C ;offset 0xa18 | 844 | MOVIB,TR 2,%r8,$0006001C ;offset 0xa18 |
| 845 | EXTRD,U %r3,63,32,%r7 ;offset 0xa1c | 845 | EXTRD,U %r3,63,32,%r7 ;offset 0xa1c |
| 846 | $D2 | 846 | $D2 |
| 847 | ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20 | 847 | ;--- not PIC ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20 |
| 848 | LDIL LR'C$7,%r21 ;offset 0xa24 | 848 | ;--- not PIC LDIL LR'C$7,%r21 ;offset 0xa24 |
| 849 | LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28 | 849 | ;--- not PIC LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28 |
| 850 | .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28; | 850 | ;--- not PIC .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28; |
| 851 | B,L fprintf,%r2 ;offset 0xa2c | 851 | ;--- not PIC B,L fprintf,%r2 ;offset 0xa2c |
| 852 | LDO RR'C$7(%r21),%r25 ;offset 0xa30 | 852 | ;--- not PIC LDO RR'C$7(%r21),%r25 ;offset 0xa30 |
| 853 | .CALL ; | 853 | .CALL ; |
| 854 | B,L abort,%r2 ;offset 0xa34 | 854 | B,L abort,%r2 ;offset 0xa34 |
| 855 | NOP ;offset 0xa38 | 855 | NOP ;offset 0xa38 |
| @@ -1605,14 +1605,14 @@ bn_mul_comba4 | |||
| 1605 | .PROCEND | 1605 | .PROCEND |
| 1606 | 1606 | ||
| 1607 | 1607 | ||
| 1608 | .SPACE $TEXT$ | 1608 | ;--- not PIC .SPACE $TEXT$ |
| 1609 | .SUBSPA $CODE$ | 1609 | ;--- not PIC .SUBSPA $CODE$ |
| 1610 | .SPACE $PRIVATE$,SORT=16 | 1610 | ;--- not PIC .SPACE $PRIVATE$,SORT=16 |
| 1611 | .IMPORT $global$,DATA | 1611 | ;--- not PIC .IMPORT $global$,DATA |
| 1612 | .SPACE $TEXT$ | 1612 | ;--- not PIC .SPACE $TEXT$ |
| 1613 | .SUBSPA $CODE$ | 1613 | ;--- not PIC .SUBSPA $CODE$ |
| 1614 | .SUBSPA $LIT$,ACCESS=0x2c | 1614 | ;--- not PIC .SUBSPA $LIT$,ACCESS=0x2c |
| 1615 | C$7 | 1615 | ;--- not PIC C$7 |
| 1616 | .ALIGN 8 | 1616 | ;--- not PIC .ALIGN 8 |
| 1617 | .STRINGZ "Division would overflow (%d)\n" | 1617 | ;--- not PIC .STRINGZ "Division would overflow (%d)\n" |
| 1618 | .END | 1618 | .END |
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gcc.c b/src/lib/libcrypto/bn/asm/x86_64-gcc.c new file mode 100644 index 0000000000..b97b394661 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/x86_64-gcc.c | |||
| @@ -0,0 +1,575 @@ | |||
| 1 | /* | ||
| 2 | * x86_64 BIGNUM accelerator version 0.1, December 2002. | ||
| 3 | * | ||
| 4 | * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | * project. | ||
| 6 | * | ||
| 7 | * Rights for redistribution and usage in source and binary forms are | ||
| 8 | * granted according to the OpenSSL license. Warranty of any kind is | ||
| 9 | * disclaimed. | ||
| 10 | * | ||
| 11 | * Q. Version 0.1? It doesn't sound like Andy, he used to assign real | ||
| 12 | * versions, like 1.0... | ||
| 13 | * A. Well, that's because this code is basically a quick-n-dirty | ||
| 14 | * proof-of-concept hack. As you can see it's implemented with | ||
| 15 | * inline assembler, which means that you're bound to GCC and that | ||
| 16 | * there must be room for fine-tuning. | ||
| 17 | * | ||
| 18 | * Q. Why inline assembler? | ||
| 19 | * A. x86_64 features own ABI I'm not familiar with. Which is why | ||
| 20 | * I decided to let the compiler take care of subroutine | ||
| 21 | * prologue/epilogue as well as register allocation. | ||
| 22 | * | ||
| 23 | * Q. How much faster does it get? | ||
| 24 | * A. Unfortunately people sitting on x86_64 hardware are prohibited | ||
| 25 | * from disclosing the performance numbers, so they (SuSE labs to be | ||
| 26 | * specific) wouldn't tell me. However! Very similar coding technique | ||
| 27 | * (reaching out for 128-bit result from 64x64-bit multiplication) | ||
| 28 | * results in >3 times performance improvement on MIPS and I see no | ||
| 29 | * reason why gain on x86_64 would be so much different:-) | ||
| 30 | */ | ||
| 31 | |||
| 32 | #define BN_ULONG unsigned long | ||
| 33 | |||
| 34 | /* | ||
| 35 | * "m"(a), "+m"(r) is the way to favor DirectPath µ-code; | ||
| 36 | * "g"(0) lets the compiler decide where it | ||
| 37 | * wants to keep the value of zero; | ||
| 38 | */ | ||
| 39 | #define mul_add(r,a,word,carry) do { \ | ||
| 40 | register BN_ULONG high,low; \ | ||
| 41 | asm ("mulq %3" \ | ||
| 42 | : "=a"(low),"=d"(high) \ | ||
| 43 | : "a"(word),"m"(a) \ | ||
| 44 | : "cc"); \ | ||
| 45 | asm ("addq %2,%0; adcq %3,%1" \ | ||
| 46 | : "+r"(carry),"+d"(high)\ | ||
| 47 | : "a"(low),"g"(0) \ | ||
| 48 | : "cc"); \ | ||
| 49 | asm ("addq %2,%0; adcq %3,%1" \ | ||
| 50 | : "+m"(r),"+d"(high) \ | ||
| 51 | : "r"(carry),"g"(0) \ | ||
| 52 | : "cc"); \ | ||
| 53 | carry=high; \ | ||
| 54 | } while (0) | ||
| 55 | |||
| 56 | #define mul(r,a,word,carry) do { \ | ||
| 57 | register BN_ULONG high,low; \ | ||
| 58 | asm ("mulq %3" \ | ||
| 59 | : "=a"(low),"=d"(high) \ | ||
| 60 | : "a"(word),"g"(a) \ | ||
| 61 | : "cc"); \ | ||
| 62 | asm ("addq %2,%0; adcq %3,%1" \ | ||
| 63 | : "+r"(carry),"+d"(high)\ | ||
| 64 | : "a"(low),"g"(0) \ | ||
| 65 | : "cc"); \ | ||
| 66 | (r)=carry, carry=high; \ | ||
| 67 | } while (0) | ||
| 68 | |||
| 69 | #define sqr(r0,r1,a) \ | ||
| 70 | asm ("mulq %2" \ | ||
| 71 | : "=a"(r0),"=d"(r1) \ | ||
| 72 | : "a"(a) \ | ||
| 73 | : "cc"); | ||
| 74 | |||
| 75 | BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | ||
| 76 | { | ||
| 77 | BN_ULONG c1=0; | ||
| 78 | |||
| 79 | if (num <= 0) return(c1); | ||
| 80 | |||
| 81 | while (num&~3) | ||
| 82 | { | ||
| 83 | mul_add(rp[0],ap[0],w,c1); | ||
| 84 | mul_add(rp[1],ap[1],w,c1); | ||
| 85 | mul_add(rp[2],ap[2],w,c1); | ||
| 86 | mul_add(rp[3],ap[3],w,c1); | ||
| 87 | ap+=4; rp+=4; num-=4; | ||
| 88 | } | ||
| 89 | if (num) | ||
| 90 | { | ||
| 91 | mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1; | ||
| 92 | mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1; | ||
| 93 | mul_add(rp[2],ap[2],w,c1); return c1; | ||
| 94 | } | ||
| 95 | |||
| 96 | return(c1); | ||
| 97 | } | ||
| 98 | |||
| 99 | BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | ||
| 100 | { | ||
| 101 | BN_ULONG c1=0; | ||
| 102 | |||
| 103 | if (num <= 0) return(c1); | ||
| 104 | |||
| 105 | while (num&~3) | ||
| 106 | { | ||
| 107 | mul(rp[0],ap[0],w,c1); | ||
| 108 | mul(rp[1],ap[1],w,c1); | ||
| 109 | mul(rp[2],ap[2],w,c1); | ||
| 110 | mul(rp[3],ap[3],w,c1); | ||
| 111 | ap+=4; rp+=4; num-=4; | ||
| 112 | } | ||
| 113 | if (num) | ||
| 114 | { | ||
| 115 | mul(rp[0],ap[0],w,c1); if (--num == 0) return c1; | ||
| 116 | mul(rp[1],ap[1],w,c1); if (--num == 0) return c1; | ||
| 117 | mul(rp[2],ap[2],w,c1); | ||
| 118 | } | ||
| 119 | return(c1); | ||
| 120 | } | ||
| 121 | |||
| 122 | void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n) | ||
| 123 | { | ||
| 124 | if (n <= 0) return; | ||
| 125 | |||
| 126 | while (n&~3) | ||
| 127 | { | ||
| 128 | sqr(r[0],r[1],a[0]); | ||
| 129 | sqr(r[2],r[3],a[1]); | ||
| 130 | sqr(r[4],r[5],a[2]); | ||
| 131 | sqr(r[6],r[7],a[3]); | ||
| 132 | a+=4; r+=8; n-=4; | ||
| 133 | } | ||
| 134 | if (n) | ||
| 135 | { | ||
| 136 | sqr(r[0],r[1],a[0]); if (--n == 0) return; | ||
| 137 | sqr(r[2],r[3],a[1]); if (--n == 0) return; | ||
| 138 | sqr(r[4],r[5],a[2]); | ||
| 139 | } | ||
| 140 | } | ||
| 141 | |||
| 142 | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) | ||
| 143 | { BN_ULONG ret,waste; | ||
| 144 | |||
| 145 | asm ("divq %3" | ||
| 146 | : "=a"(ret),"=d"(waste) | ||
| 147 | : "a"(l),"d"(h),"g"(d) | ||
| 148 | : "cc"); | ||
| 149 | |||
| 150 | return ret; | ||
| 151 | } | ||
| 152 | |||
| 153 | BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n) | ||
| 154 | { BN_ULONG ret,i; | ||
| 155 | |||
| 156 | if (n <= 0) return 0; | ||
| 157 | |||
| 158 | asm ( | ||
| 159 | " subq %2,%2 \n" | ||
| 160 | ".align 16 \n" | ||
| 161 | "1: movq (%4,%2,8),%0 \n" | ||
| 162 | " adcq (%5,%2,8),%0 \n" | ||
| 163 | " movq %0,(%3,%2,8) \n" | ||
| 164 | " leaq 1(%2),%2 \n" | ||
| 165 | " loop 1b \n" | ||
| 166 | " sbbq %0,%0 \n" | ||
| 167 | : "+a"(ret),"+c"(n),"+r"(i) | ||
| 168 | : "r"(rp),"r"(ap),"r"(bp) | ||
| 169 | : "cc" | ||
| 170 | ); | ||
| 171 | |||
| 172 | return ret&1; | ||
| 173 | } | ||
| 174 | |||
| 175 | #ifndef SIMICS | ||
| 176 | BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n) | ||
| 177 | { BN_ULONG ret,i; | ||
| 178 | |||
| 179 | if (n <= 0) return 0; | ||
| 180 | |||
| 181 | asm ( | ||
| 182 | " subq %2,%2 \n" | ||
| 183 | ".align 16 \n" | ||
| 184 | "1: movq (%4,%2,8),%0 \n" | ||
| 185 | " sbbq (%5,%2,8),%0 \n" | ||
| 186 | " movq %0,(%3,%2,8) \n" | ||
| 187 | " leaq 1(%2),%2 \n" | ||
| 188 | " loop 1b \n" | ||
| 189 | " sbbq %0,%0 \n" | ||
| 190 | : "+a"(ret),"+c"(n),"+r"(i) | ||
| 191 | : "r"(rp),"r"(ap),"r"(bp) | ||
| 192 | : "cc" | ||
| 193 | ); | ||
| 194 | |||
| 195 | return ret&1; | ||
| 196 | } | ||
| 197 | #else | ||
| 198 | /* Simics 1.4<7 has buggy sbbq:-( */ | ||
| 199 | #define BN_MASK2 0xffffffffffffffffL | ||
| 200 | BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) | ||
| 201 | { | ||
| 202 | BN_ULONG t1,t2; | ||
| 203 | int c=0; | ||
| 204 | |||
| 205 | if (n <= 0) return((BN_ULONG)0); | ||
| 206 | |||
| 207 | for (;;) | ||
| 208 | { | ||
| 209 | t1=a[0]; t2=b[0]; | ||
| 210 | r[0]=(t1-t2-c)&BN_MASK2; | ||
| 211 | if (t1 != t2) c=(t1 < t2); | ||
| 212 | if (--n <= 0) break; | ||
| 213 | |||
| 214 | t1=a[1]; t2=b[1]; | ||
| 215 | r[1]=(t1-t2-c)&BN_MASK2; | ||
| 216 | if (t1 != t2) c=(t1 < t2); | ||
| 217 | if (--n <= 0) break; | ||
| 218 | |||
| 219 | t1=a[2]; t2=b[2]; | ||
| 220 | r[2]=(t1-t2-c)&BN_MASK2; | ||
| 221 | if (t1 != t2) c=(t1 < t2); | ||
| 222 | if (--n <= 0) break; | ||
| 223 | |||
| 224 | t1=a[3]; t2=b[3]; | ||
| 225 | r[3]=(t1-t2-c)&BN_MASK2; | ||
| 226 | if (t1 != t2) c=(t1 < t2); | ||
| 227 | if (--n <= 0) break; | ||
| 228 | |||
| 229 | a+=4; | ||
| 230 | b+=4; | ||
| 231 | r+=4; | ||
| 232 | } | ||
| 233 | return(c); | ||
| 234 | } | ||
| 235 | #endif | ||
| 236 | |||
| 237 | /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ | ||
| 238 | /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ | ||
| 239 | /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ | ||
| 240 | /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */ | ||
| 241 | |||
| 242 | #if 0 | ||
| 243 | /* original macros are kept for reference purposes */ | ||
| 244 | #define mul_add_c(a,b,c0,c1,c2) { \ | ||
| 245 | BN_ULONG ta=(a),tb=(b); \ | ||
| 246 | t1 = ta * tb; \ | ||
| 247 | t2 = BN_UMULT_HIGH(ta,tb); \ | ||
| 248 | c0 += t1; t2 += (c0<t1)?1:0; \ | ||
| 249 | c1 += t2; c2 += (c1<t2)?1:0; \ | ||
| 250 | } | ||
| 251 | |||
| 252 | #define mul_add_c2(a,b,c0,c1,c2) { \ | ||
| 253 | BN_ULONG ta=(a),tb=(b),t0; \ | ||
| 254 | t1 = BN_UMULT_HIGH(ta,tb); \ | ||
| 255 | t0 = ta * tb; \ | ||
| 256 | t2 = t1+t1; c2 += (t2<t1)?1:0; \ | ||
| 257 | t1 = t0+t0; t2 += (t1<t0)?1:0; \ | ||
| 258 | c0 += t1; t2 += (c0<t1)?1:0; \ | ||
| 259 | c1 += t2; c2 += (c1<t2)?1:0; \ | ||
| 260 | } | ||
| 261 | #else | ||
| 262 | #define mul_add_c(a,b,c0,c1,c2) do { \ | ||
| 263 | asm ("mulq %3" \ | ||
| 264 | : "=a"(t1),"=d"(t2) \ | ||
| 265 | : "a"(a),"m"(b) \ | ||
| 266 | : "cc"); \ | ||
| 267 | asm ("addq %2,%0; adcq %3,%1" \ | ||
| 268 | : "+r"(c0),"+d"(t2) \ | ||
| 269 | : "a"(t1),"g"(0) \ | ||
| 270 | : "cc"); \ | ||
| 271 | asm ("addq %2,%0; adcq %3,%1" \ | ||
| 272 | : "+r"(c1),"+r"(c2) \ | ||
| 273 | : "d"(t2),"g"(0) \ | ||
| 274 | : "cc"); \ | ||
| 275 | } while (0) | ||
| 276 | |||
| 277 | #define sqr_add_c(a,i,c0,c1,c2) do { \ | ||
| 278 | asm ("mulq %2" \ | ||
| 279 | : "=a"(t1),"=d"(t2) \ | ||
| 280 | : "a"(a[i]) \ | ||
| 281 | : "cc"); \ | ||
| 282 | asm ("addq %2,%0; adcq %3,%1" \ | ||
| 283 | : "+r"(c0),"+d"(t2) \ | ||
| 284 | : "a"(t1),"g"(0) \ | ||
| 285 | : "cc"); \ | ||
| 286 | asm ("addq %2,%0; adcq %3,%1" \ | ||
| 287 | : "+r"(c1),"+r"(c2) \ | ||
| 288 | : "d"(t2),"g"(0) \ | ||
| 289 | : "cc"); \ | ||
| 290 | } while (0) | ||
| 291 | |||
| 292 | #define mul_add_c2(a,b,c0,c1,c2) do { \ | ||
| 293 | asm ("mulq %3" \ | ||
| 294 | : "=a"(t1),"=d"(t2) \ | ||
| 295 | : "a"(a),"m"(b) \ | ||
| 296 | : "cc"); \ | ||
| 297 | asm ("addq %0,%0; adcq %2,%1" \ | ||
| 298 | : "+d"(t2),"+r"(c2) \ | ||
| 299 | : "g"(0) \ | ||
| 300 | : "cc"); \ | ||
| 301 | asm ("addq %0,%0; adcq %2,%1" \ | ||
| 302 | : "+a"(t1),"+d"(t2) \ | ||
| 303 | : "g"(0) \ | ||
| 304 | : "cc"); \ | ||
| 305 | asm ("addq %2,%0; adcq %3,%1" \ | ||
| 306 | : "+r"(c0),"+d"(t2) \ | ||
| 307 | : "a"(t1),"g"(0) \ | ||
| 308 | : "cc"); \ | ||
| 309 | asm ("addq %2,%0; adcq %3,%1" \ | ||
| 310 | : "+r"(c1),"+r"(c2) \ | ||
| 311 | : "d"(t2),"g"(0) \ | ||
| 312 | : "cc"); \ | ||
| 313 | } while (0) | ||
| 314 | #endif | ||
| 315 | |||
| 316 | #define sqr_add_c2(a,i,j,c0,c1,c2) \ | ||
| 317 | mul_add_c2((a)[i],(a)[j],c0,c1,c2) | ||
| 318 | |||
| 319 | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | ||
| 320 | { | ||
| 321 | BN_ULONG bl,bh; | ||
| 322 | BN_ULONG t1,t2; | ||
| 323 | BN_ULONG c1,c2,c3; | ||
| 324 | |||
| 325 | c1=0; | ||
| 326 | c2=0; | ||
| 327 | c3=0; | ||
| 328 | mul_add_c(a[0],b[0],c1,c2,c3); | ||
| 329 | r[0]=c1; | ||
| 330 | c1=0; | ||
| 331 | mul_add_c(a[0],b[1],c2,c3,c1); | ||
| 332 | mul_add_c(a[1],b[0],c2,c3,c1); | ||
| 333 | r[1]=c2; | ||
| 334 | c2=0; | ||
| 335 | mul_add_c(a[2],b[0],c3,c1,c2); | ||
| 336 | mul_add_c(a[1],b[1],c3,c1,c2); | ||
| 337 | mul_add_c(a[0],b[2],c3,c1,c2); | ||
| 338 | r[2]=c3; | ||
| 339 | c3=0; | ||
| 340 | mul_add_c(a[0],b[3],c1,c2,c3); | ||
| 341 | mul_add_c(a[1],b[2],c1,c2,c3); | ||
| 342 | mul_add_c(a[2],b[1],c1,c2,c3); | ||
| 343 | mul_add_c(a[3],b[0],c1,c2,c3); | ||
| 344 | r[3]=c1; | ||
| 345 | c1=0; | ||
| 346 | mul_add_c(a[4],b[0],c2,c3,c1); | ||
| 347 | mul_add_c(a[3],b[1],c2,c3,c1); | ||
| 348 | mul_add_c(a[2],b[2],c2,c3,c1); | ||
| 349 | mul_add_c(a[1],b[3],c2,c3,c1); | ||
| 350 | mul_add_c(a[0],b[4],c2,c3,c1); | ||
| 351 | r[4]=c2; | ||
| 352 | c2=0; | ||
| 353 | mul_add_c(a[0],b[5],c3,c1,c2); | ||
| 354 | mul_add_c(a[1],b[4],c3,c1,c2); | ||
| 355 | mul_add_c(a[2],b[3],c3,c1,c2); | ||
| 356 | mul_add_c(a[3],b[2],c3,c1,c2); | ||
| 357 | mul_add_c(a[4],b[1],c3,c1,c2); | ||
| 358 | mul_add_c(a[5],b[0],c3,c1,c2); | ||
| 359 | r[5]=c3; | ||
| 360 | c3=0; | ||
| 361 | mul_add_c(a[6],b[0],c1,c2,c3); | ||
| 362 | mul_add_c(a[5],b[1],c1,c2,c3); | ||
| 363 | mul_add_c(a[4],b[2],c1,c2,c3); | ||
| 364 | mul_add_c(a[3],b[3],c1,c2,c3); | ||
| 365 | mul_add_c(a[2],b[4],c1,c2,c3); | ||
| 366 | mul_add_c(a[1],b[5],c1,c2,c3); | ||
| 367 | mul_add_c(a[0],b[6],c1,c2,c3); | ||
| 368 | r[6]=c1; | ||
| 369 | c1=0; | ||
| 370 | mul_add_c(a[0],b[7],c2,c3,c1); | ||
| 371 | mul_add_c(a[1],b[6],c2,c3,c1); | ||
| 372 | mul_add_c(a[2],b[5],c2,c3,c1); | ||
| 373 | mul_add_c(a[3],b[4],c2,c3,c1); | ||
| 374 | mul_add_c(a[4],b[3],c2,c3,c1); | ||
| 375 | mul_add_c(a[5],b[2],c2,c3,c1); | ||
| 376 | mul_add_c(a[6],b[1],c2,c3,c1); | ||
| 377 | mul_add_c(a[7],b[0],c2,c3,c1); | ||
| 378 | r[7]=c2; | ||
| 379 | c2=0; | ||
| 380 | mul_add_c(a[7],b[1],c3,c1,c2); | ||
| 381 | mul_add_c(a[6],b[2],c3,c1,c2); | ||
| 382 | mul_add_c(a[5],b[3],c3,c1,c2); | ||
| 383 | mul_add_c(a[4],b[4],c3,c1,c2); | ||
| 384 | mul_add_c(a[3],b[5],c3,c1,c2); | ||
| 385 | mul_add_c(a[2],b[6],c3,c1,c2); | ||
| 386 | mul_add_c(a[1],b[7],c3,c1,c2); | ||
| 387 | r[8]=c3; | ||
| 388 | c3=0; | ||
| 389 | mul_add_c(a[2],b[7],c1,c2,c3); | ||
| 390 | mul_add_c(a[3],b[6],c1,c2,c3); | ||
| 391 | mul_add_c(a[4],b[5],c1,c2,c3); | ||
| 392 | mul_add_c(a[5],b[4],c1,c2,c3); | ||
| 393 | mul_add_c(a[6],b[3],c1,c2,c3); | ||
| 394 | mul_add_c(a[7],b[2],c1,c2,c3); | ||
| 395 | r[9]=c1; | ||
| 396 | c1=0; | ||
| 397 | mul_add_c(a[7],b[3],c2,c3,c1); | ||
| 398 | mul_add_c(a[6],b[4],c2,c3,c1); | ||
| 399 | mul_add_c(a[5],b[5],c2,c3,c1); | ||
| 400 | mul_add_c(a[4],b[6],c2,c3,c1); | ||
| 401 | mul_add_c(a[3],b[7],c2,c3,c1); | ||
| 402 | r[10]=c2; | ||
| 403 | c2=0; | ||
| 404 | mul_add_c(a[4],b[7],c3,c1,c2); | ||
| 405 | mul_add_c(a[5],b[6],c3,c1,c2); | ||
| 406 | mul_add_c(a[6],b[5],c3,c1,c2); | ||
| 407 | mul_add_c(a[7],b[4],c3,c1,c2); | ||
| 408 | r[11]=c3; | ||
| 409 | c3=0; | ||
| 410 | mul_add_c(a[7],b[5],c1,c2,c3); | ||
| 411 | mul_add_c(a[6],b[6],c1,c2,c3); | ||
| 412 | mul_add_c(a[5],b[7],c1,c2,c3); | ||
| 413 | r[12]=c1; | ||
| 414 | c1=0; | ||
| 415 | mul_add_c(a[6],b[7],c2,c3,c1); | ||
| 416 | mul_add_c(a[7],b[6],c2,c3,c1); | ||
| 417 | r[13]=c2; | ||
| 418 | c2=0; | ||
| 419 | mul_add_c(a[7],b[7],c3,c1,c2); | ||
| 420 | r[14]=c3; | ||
| 421 | r[15]=c1; | ||
| 422 | } | ||
| 423 | |||
| 424 | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | ||
| 425 | { | ||
| 426 | BN_ULONG bl,bh; | ||
| 427 | BN_ULONG t1,t2; | ||
| 428 | BN_ULONG c1,c2,c3; | ||
| 429 | |||
| 430 | c1=0; | ||
| 431 | c2=0; | ||
| 432 | c3=0; | ||
| 433 | mul_add_c(a[0],b[0],c1,c2,c3); | ||
| 434 | r[0]=c1; | ||
| 435 | c1=0; | ||
| 436 | mul_add_c(a[0],b[1],c2,c3,c1); | ||
| 437 | mul_add_c(a[1],b[0],c2,c3,c1); | ||
| 438 | r[1]=c2; | ||
| 439 | c2=0; | ||
| 440 | mul_add_c(a[2],b[0],c3,c1,c2); | ||
| 441 | mul_add_c(a[1],b[1],c3,c1,c2); | ||
| 442 | mul_add_c(a[0],b[2],c3,c1,c2); | ||
| 443 | r[2]=c3; | ||
| 444 | c3=0; | ||
| 445 | mul_add_c(a[0],b[3],c1,c2,c3); | ||
| 446 | mul_add_c(a[1],b[2],c1,c2,c3); | ||
| 447 | mul_add_c(a[2],b[1],c1,c2,c3); | ||
| 448 | mul_add_c(a[3],b[0],c1,c2,c3); | ||
| 449 | r[3]=c1; | ||
| 450 | c1=0; | ||
| 451 | mul_add_c(a[3],b[1],c2,c3,c1); | ||
| 452 | mul_add_c(a[2],b[2],c2,c3,c1); | ||
| 453 | mul_add_c(a[1],b[3],c2,c3,c1); | ||
| 454 | r[4]=c2; | ||
| 455 | c2=0; | ||
| 456 | mul_add_c(a[2],b[3],c3,c1,c2); | ||
| 457 | mul_add_c(a[3],b[2],c3,c1,c2); | ||
| 458 | r[5]=c3; | ||
| 459 | c3=0; | ||
| 460 | mul_add_c(a[3],b[3],c1,c2,c3); | ||
| 461 | r[6]=c1; | ||
| 462 | r[7]=c2; | ||
| 463 | } | ||
| 464 | |||
| 465 | void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) | ||
| 466 | { | ||
| 467 | BN_ULONG bl,bh; | ||
| 468 | BN_ULONG t1,t2; | ||
| 469 | BN_ULONG c1,c2,c3; | ||
| 470 | |||
| 471 | c1=0; | ||
| 472 | c2=0; | ||
| 473 | c3=0; | ||
| 474 | sqr_add_c(a,0,c1,c2,c3); | ||
| 475 | r[0]=c1; | ||
| 476 | c1=0; | ||
| 477 | sqr_add_c2(a,1,0,c2,c3,c1); | ||
| 478 | r[1]=c2; | ||
| 479 | c2=0; | ||
| 480 | sqr_add_c(a,1,c3,c1,c2); | ||
| 481 | sqr_add_c2(a,2,0,c3,c1,c2); | ||
| 482 | r[2]=c3; | ||
| 483 | c3=0; | ||
| 484 | sqr_add_c2(a,3,0,c1,c2,c3); | ||
| 485 | sqr_add_c2(a,2,1,c1,c2,c3); | ||
| 486 | r[3]=c1; | ||
| 487 | c1=0; | ||
| 488 | sqr_add_c(a,2,c2,c3,c1); | ||
| 489 | sqr_add_c2(a,3,1,c2,c3,c1); | ||
| 490 | sqr_add_c2(a,4,0,c2,c3,c1); | ||
| 491 | r[4]=c2; | ||
| 492 | c2=0; | ||
| 493 | sqr_add_c2(a,5,0,c3,c1,c2); | ||
| 494 | sqr_add_c2(a,4,1,c3,c1,c2); | ||
| 495 | sqr_add_c2(a,3,2,c3,c1,c2); | ||
| 496 | r[5]=c3; | ||
| 497 | c3=0; | ||
| 498 | sqr_add_c(a,3,c1,c2,c3); | ||
| 499 | sqr_add_c2(a,4,2,c1,c2,c3); | ||
| 500 | sqr_add_c2(a,5,1,c1,c2,c3); | ||
| 501 | sqr_add_c2(a,6,0,c1,c2,c3); | ||
| 502 | r[6]=c1; | ||
| 503 | c1=0; | ||
| 504 | sqr_add_c2(a,7,0,c2,c3,c1); | ||
| 505 | sqr_add_c2(a,6,1,c2,c3,c1); | ||
| 506 | sqr_add_c2(a,5,2,c2,c3,c1); | ||
| 507 | sqr_add_c2(a,4,3,c2,c3,c1); | ||
| 508 | r[7]=c2; | ||
| 509 | c2=0; | ||
| 510 | sqr_add_c(a,4,c3,c1,c2); | ||
| 511 | sqr_add_c2(a,5,3,c3,c1,c2); | ||
| 512 | sqr_add_c2(a,6,2,c3,c1,c2); | ||
| 513 | sqr_add_c2(a,7,1,c3,c1,c2); | ||
| 514 | r[8]=c3; | ||
| 515 | c3=0; | ||
| 516 | sqr_add_c2(a,7,2,c1,c2,c3); | ||
| 517 | sqr_add_c2(a,6,3,c1,c2,c3); | ||
| 518 | sqr_add_c2(a,5,4,c1,c2,c3); | ||
| 519 | r[9]=c1; | ||
| 520 | c1=0; | ||
| 521 | sqr_add_c(a,5,c2,c3,c1); | ||
| 522 | sqr_add_c2(a,6,4,c2,c3,c1); | ||
| 523 | sqr_add_c2(a,7,3,c2,c3,c1); | ||
| 524 | r[10]=c2; | ||
| 525 | c2=0; | ||
| 526 | sqr_add_c2(a,7,4,c3,c1,c2); | ||
| 527 | sqr_add_c2(a,6,5,c3,c1,c2); | ||
| 528 | r[11]=c3; | ||
| 529 | c3=0; | ||
| 530 | sqr_add_c(a,6,c1,c2,c3); | ||
| 531 | sqr_add_c2(a,7,5,c1,c2,c3); | ||
| 532 | r[12]=c1; | ||
| 533 | c1=0; | ||
| 534 | sqr_add_c2(a,7,6,c2,c3,c1); | ||
| 535 | r[13]=c2; | ||
| 536 | c2=0; | ||
| 537 | sqr_add_c(a,7,c3,c1,c2); | ||
| 538 | r[14]=c3; | ||
| 539 | r[15]=c1; | ||
| 540 | } | ||
| 541 | |||
| 542 | void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) | ||
| 543 | { | ||
| 544 | BN_ULONG bl,bh; | ||
| 545 | BN_ULONG t1,t2; | ||
| 546 | BN_ULONG c1,c2,c3; | ||
| 547 | |||
| 548 | c1=0; | ||
| 549 | c2=0; | ||
| 550 | c3=0; | ||
| 551 | sqr_add_c(a,0,c1,c2,c3); | ||
| 552 | r[0]=c1; | ||
| 553 | c1=0; | ||
| 554 | sqr_add_c2(a,1,0,c2,c3,c1); | ||
| 555 | r[1]=c2; | ||
| 556 | c2=0; | ||
| 557 | sqr_add_c(a,1,c3,c1,c2); | ||
| 558 | sqr_add_c2(a,2,0,c3,c1,c2); | ||
| 559 | r[2]=c3; | ||
| 560 | c3=0; | ||
| 561 | sqr_add_c2(a,3,0,c1,c2,c3); | ||
| 562 | sqr_add_c2(a,2,1,c1,c2,c3); | ||
| 563 | r[3]=c1; | ||
| 564 | c1=0; | ||
| 565 | sqr_add_c(a,2,c2,c3,c1); | ||
| 566 | sqr_add_c2(a,3,1,c2,c3,c1); | ||
| 567 | r[4]=c2; | ||
| 568 | c2=0; | ||
| 569 | sqr_add_c2(a,3,2,c3,c1,c2); | ||
| 570 | r[5]=c3; | ||
| 571 | c3=0; | ||
| 572 | sqr_add_c(a,3,c1,c2,c3); | ||
| 573 | r[6]=c1; | ||
| 574 | r[7]=c2; | ||
| 575 | } | ||
diff --git a/src/lib/libcrypto/bn/bn.h b/src/lib/libcrypto/bn/bn.h index b40682f831..3da6d8ced9 100644 --- a/src/lib/libcrypto/bn/bn.h +++ b/src/lib/libcrypto/bn/bn.h | |||
| @@ -248,6 +248,8 @@ typedef struct bn_blinding_st | |||
| 248 | BIGNUM *A; | 248 | BIGNUM *A; |
| 249 | BIGNUM *Ai; | 249 | BIGNUM *Ai; |
| 250 | BIGNUM *mod; /* just a reference */ | 250 | BIGNUM *mod; /* just a reference */ |
| 251 | unsigned long thread_id; /* added in OpenSSL 0.9.6j and 0.9.7b; | ||
| 252 | * used only by crypto/rsa/rsa_eay.c, rsa_lib.c */ | ||
| 251 | } BN_BLINDING; | 253 | } BN_BLINDING; |
| 252 | 254 | ||
| 253 | /* Used for montgomery multiplication */ | 255 | /* Used for montgomery multiplication */ |
diff --git a/src/lib/libcrypto/bn/bn_div.c b/src/lib/libcrypto/bn/bn_div.c index f9a095e3b3..580d1201bc 100644 --- a/src/lib/libcrypto/bn/bn_div.c +++ b/src/lib/libcrypto/bn/bn_div.c | |||
| @@ -150,6 +150,20 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d, | |||
| 150 | q; \ | 150 | q; \ |
| 151 | }) | 151 | }) |
| 152 | # define REMAINDER_IS_ALREADY_CALCULATED | 152 | # define REMAINDER_IS_ALREADY_CALCULATED |
| 153 | # elif defined(__x86_64) && defined(SIXTY_FOUR_BIT_LONG) | ||
| 154 | /* | ||
| 155 | * Same story here, but it's 128-bit by 64-bit division. Wow! | ||
| 156 | * <appro@fy.chalmers.se> | ||
| 157 | */ | ||
| 158 | # define bn_div_words(n0,n1,d0) \ | ||
| 159 | ({ asm volatile ( \ | ||
| 160 | "divq %4" \ | ||
| 161 | : "=a"(q), "=d"(rem) \ | ||
| 162 | : "a"(n1), "d"(n0), "g"(d0) \ | ||
| 163 | : "cc"); \ | ||
| 164 | q; \ | ||
| 165 | }) | ||
| 166 | # define REMAINDER_IS_ALREADY_CALCULATED | ||
| 153 | # endif /* __<cpu> */ | 167 | # endif /* __<cpu> */ |
| 154 | # endif /* __GNUC__ */ | 168 | # endif /* __GNUC__ */ |
| 155 | #endif /* OPENSSL_NO_ASM */ | 169 | #endif /* OPENSSL_NO_ASM */ |
| @@ -268,6 +282,11 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, | |||
| 268 | q=(BN_ULONG)(((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0); | 282 | q=(BN_ULONG)(((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0); |
| 269 | #else | 283 | #else |
| 270 | q=bn_div_words(n0,n1,d0); | 284 | q=bn_div_words(n0,n1,d0); |
| 285 | #ifdef BN_DEBUG_LEVITTE | ||
| 286 | fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\ | ||
| 287 | X) -> 0x%08X\n", | ||
| 288 | n0, n1, d0, q); | ||
| 289 | #endif | ||
| 271 | #endif | 290 | #endif |
| 272 | 291 | ||
| 273 | #ifndef REMAINDER_IS_ALREADY_CALCULATED | 292 | #ifndef REMAINDER_IS_ALREADY_CALCULATED |
| @@ -292,11 +311,18 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, | |||
| 292 | BN_ULONG t2l,t2h,ql,qh; | 311 | BN_ULONG t2l,t2h,ql,qh; |
| 293 | 312 | ||
| 294 | q=bn_div_words(n0,n1,d0); | 313 | q=bn_div_words(n0,n1,d0); |
| 314 | #ifdef BN_DEBUG_LEVITTE | ||
| 315 | fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\ | ||
| 316 | X) -> 0x%08X\n", | ||
| 317 | n0, n1, d0, q); | ||
| 318 | #endif | ||
| 295 | #ifndef REMAINDER_IS_ALREADY_CALCULATED | 319 | #ifndef REMAINDER_IS_ALREADY_CALCULATED |
| 296 | rem=(n1-q*d0)&BN_MASK2; | 320 | rem=(n1-q*d0)&BN_MASK2; |
| 297 | #endif | 321 | #endif |
| 298 | 322 | ||
| 299 | #ifdef BN_UMULT_HIGH | 323 | #if defined(BN_UMULT_LOHI) |
| 324 | BN_UMULT_LOHI(t2l,t2h,d1,q); | ||
| 325 | #elif defined(BN_UMULT_HIGH) | ||
| 300 | t2l = d1 * q; | 326 | t2l = d1 * q; |
| 301 | t2h = BN_UMULT_HIGH(d1,q); | 327 | t2h = BN_UMULT_HIGH(d1,q); |
| 302 | #else | 328 | #else |
diff --git a/src/lib/libcrypto/bn/bn_lcl.h b/src/lib/libcrypto/bn/bn_lcl.h index 8a4dba375a..5614bc6164 100644 --- a/src/lib/libcrypto/bn/bn_lcl.h +++ b/src/lib/libcrypto/bn/bn_lcl.h | |||
| @@ -230,6 +230,21 @@ struct bignum_ctx | |||
| 230 | : "r"(a), "r"(b)); \ | 230 | : "r"(a), "r"(b)); \ |
| 231 | ret; }) | 231 | ret; }) |
| 232 | # endif /* compiler */ | 232 | # endif /* compiler */ |
| 233 | # elif defined(__x86_64) && defined(SIXTY_FOUR_BIT_LONG) | ||
| 234 | # if defined(__GNUC__) | ||
| 235 | # define BN_UMULT_HIGH(a,b) ({ \ | ||
| 236 | register BN_ULONG ret,discard; \ | ||
| 237 | asm ("mulq %3" \ | ||
| 238 | : "=a"(discard),"=d"(ret) \ | ||
| 239 | : "a"(a), "g"(b) \ | ||
| 240 | : "cc"); \ | ||
| 241 | ret; }) | ||
| 242 | # define BN_UMULT_LOHI(low,high,a,b) \ | ||
| 243 | asm ("mulq %3" \ | ||
| 244 | : "=a"(low),"=d"(high) \ | ||
| 245 | : "a"(a),"g"(b) \ | ||
| 246 | : "cc"); | ||
| 247 | # endif | ||
| 233 | # endif /* cpu */ | 248 | # endif /* cpu */ |
| 234 | #endif /* OPENSSL_NO_ASM */ | 249 | #endif /* OPENSSL_NO_ASM */ |
| 235 | 250 | ||
| @@ -337,7 +352,7 @@ struct bignum_ctx | |||
| 337 | 352 | ||
| 338 | #define LBITS(a) ((a)&BN_MASK2l) | 353 | #define LBITS(a) ((a)&BN_MASK2l) |
| 339 | #define HBITS(a) (((a)>>BN_BITS4)&BN_MASK2l) | 354 | #define HBITS(a) (((a)>>BN_BITS4)&BN_MASK2l) |
| 340 | #define L2HBITS(a) ((BN_ULONG)((a)&BN_MASK2l)<<BN_BITS4) | 355 | #define L2HBITS(a) (((a)<<BN_BITS4)&BN_MASK2) |
| 341 | 356 | ||
| 342 | #define LLBITS(a) ((a)&BN_MASKl) | 357 | #define LLBITS(a) ((a)&BN_MASKl) |
| 343 | #define LHBITS(a) (((a)>>BN_BITS2)&BN_MASKl) | 358 | #define LHBITS(a) (((a)>>BN_BITS2)&BN_MASKl) |
| @@ -353,7 +368,7 @@ struct bignum_ctx | |||
| 353 | lt=(bl)*(lt); \ | 368 | lt=(bl)*(lt); \ |
| 354 | m1=(bl)*(ht); \ | 369 | m1=(bl)*(ht); \ |
| 355 | ht =(bh)*(ht); \ | 370 | ht =(bh)*(ht); \ |
| 356 | m=(m+m1)&BN_MASK2; if (m < m1) ht+=L2HBITS(1L); \ | 371 | m=(m+m1)&BN_MASK2; if (m < m1) ht+=L2HBITS((BN_ULONG)1); \ |
| 357 | ht+=HBITS(m); \ | 372 | ht+=HBITS(m); \ |
| 358 | m1=L2HBITS(m); \ | 373 | m1=L2HBITS(m); \ |
| 359 | lt=(lt+m1)&BN_MASK2; if (lt < m1) ht++; \ | 374 | lt=(lt+m1)&BN_MASK2; if (lt < m1) ht++; \ |
| @@ -418,20 +433,19 @@ void bn_sqr_comba4(BN_ULONG *r,const BN_ULONG *a); | |||
| 418 | int bn_cmp_words(const BN_ULONG *a,const BN_ULONG *b,int n); | 433 | int bn_cmp_words(const BN_ULONG *a,const BN_ULONG *b,int n); |
| 419 | int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b, | 434 | int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b, |
| 420 | int cl, int dl); | 435 | int cl, int dl); |
| 436 | #if 0 | ||
| 437 | /* bn_mul.c rollback <appro> */ | ||
| 421 | void bn_mul_recursive(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,int n2, | 438 | void bn_mul_recursive(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,int n2, |
| 422 | int dna,int dnb,BN_ULONG *t); | 439 | int dna,int dnb,BN_ULONG *t); |
| 423 | void bn_mul_part_recursive(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b, | 440 | void bn_mul_part_recursive(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b, |
| 424 | int n,int tna,int tnb,BN_ULONG *t); | 441 | int n,int tna,int tnb,BN_ULONG *t); |
| 442 | #endif | ||
| 425 | void bn_sqr_recursive(BN_ULONG *r,const BN_ULONG *a, int n2, BN_ULONG *t); | 443 | void bn_sqr_recursive(BN_ULONG *r,const BN_ULONG *a, int n2, BN_ULONG *t); |
| 426 | void bn_mul_low_normal(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b, int n); | 444 | void bn_mul_low_normal(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b, int n); |
| 427 | void bn_mul_low_recursive(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,int n2, | 445 | void bn_mul_low_recursive(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,int n2, |
| 428 | BN_ULONG *t); | 446 | BN_ULONG *t); |
| 429 | void bn_mul_high(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,BN_ULONG *l,int n2, | 447 | void bn_mul_high(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,BN_ULONG *l,int n2, |
| 430 | BN_ULONG *t); | 448 | BN_ULONG *t); |
| 431 | BN_ULONG bn_add_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, | ||
| 432 | int cl, int dl); | ||
| 433 | BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, | ||
| 434 | int cl, int dl); | ||
| 435 | 449 | ||
| 436 | #ifdef __cplusplus | 450 | #ifdef __cplusplus |
| 437 | } | 451 | } |
diff --git a/src/lib/libcrypto/bn/bn_lib.c b/src/lib/libcrypto/bn/bn_lib.c index 8abe095af2..fa0ff485ad 100644 --- a/src/lib/libcrypto/bn/bn_lib.c +++ b/src/lib/libcrypto/bn/bn_lib.c | |||
| @@ -263,12 +263,12 @@ void BN_clear_free(BIGNUM *a) | |||
| 263 | if (a == NULL) return; | 263 | if (a == NULL) return; |
| 264 | if (a->d != NULL) | 264 | if (a->d != NULL) |
| 265 | { | 265 | { |
| 266 | memset(a->d,0,a->dmax*sizeof(a->d[0])); | 266 | OPENSSL_cleanse(a->d,a->dmax*sizeof(a->d[0])); |
| 267 | if (!(BN_get_flags(a,BN_FLG_STATIC_DATA))) | 267 | if (!(BN_get_flags(a,BN_FLG_STATIC_DATA))) |
| 268 | OPENSSL_free(a->d); | 268 | OPENSSL_free(a->d); |
| 269 | } | 269 | } |
| 270 | i=BN_get_flags(a,BN_FLG_MALLOCED); | 270 | i=BN_get_flags(a,BN_FLG_MALLOCED); |
| 271 | memset(a,0,sizeof(BIGNUM)); | 271 | OPENSSL_cleanse(a,sizeof(BIGNUM)); |
| 272 | if (i) | 272 | if (i) |
| 273 | OPENSSL_free(a); | 273 | OPENSSL_free(a); |
| 274 | } | 274 | } |
diff --git a/src/lib/libcrypto/bn/bn_mul.c b/src/lib/libcrypto/bn/bn_mul.c index b03458d002..cb93ac3356 100644 --- a/src/lib/libcrypto/bn/bn_mul.c +++ b/src/lib/libcrypto/bn/bn_mul.c | |||
| @@ -56,325 +56,10 @@ | |||
| 56 | * [including the GNU Public Licence.] | 56 | * [including the GNU Public Licence.] |
| 57 | */ | 57 | */ |
| 58 | 58 | ||
| 59 | #ifndef BN_DEBUG | ||
| 60 | # undef NDEBUG /* avoid conflicting definitions */ | ||
| 61 | # define NDEBUG | ||
| 62 | #endif | ||
| 63 | |||
| 64 | #include <stdio.h> | 59 | #include <stdio.h> |
| 65 | #include <assert.h> | ||
| 66 | #include "cryptlib.h" | 60 | #include "cryptlib.h" |
| 67 | #include "bn_lcl.h" | 61 | #include "bn_lcl.h" |
| 68 | 62 | ||
| 69 | #if defined(OPENSSL_NO_ASM) || !(defined(__i386) || defined(__i386__)) || defined(__DJGPP__) /* Assembler implementation exists only for x86 */ | ||
| 70 | /* Here follows specialised variants of bn_add_words() and | ||
| 71 | bn_sub_words(). They have the property performing operations on | ||
| 72 | arrays of different sizes. The sizes of those arrays is expressed through | ||
| 73 | cl, which is the common length ( basicall, min(len(a),len(b)) ), and dl, | ||
| 74 | which is the delta between the two lengths, calculated as len(a)-len(b). | ||
| 75 | All lengths are the number of BN_ULONGs... For the operations that require | ||
| 76 | a result array as parameter, it must have the length cl+abs(dl). | ||
| 77 | These functions should probably end up in bn_asm.c as soon as there are | ||
| 78 | assembler counterparts for the systems that use assembler files. */ | ||
| 79 | |||
| 80 | BN_ULONG bn_sub_part_words(BN_ULONG *r, | ||
| 81 | const BN_ULONG *a, const BN_ULONG *b, | ||
| 82 | int cl, int dl) | ||
| 83 | { | ||
| 84 | BN_ULONG c, t; | ||
| 85 | |||
| 86 | assert(cl >= 0); | ||
| 87 | c = bn_sub_words(r, a, b, cl); | ||
| 88 | |||
| 89 | if (dl == 0) | ||
| 90 | return c; | ||
| 91 | |||
| 92 | r += cl; | ||
| 93 | a += cl; | ||
| 94 | b += cl; | ||
| 95 | |||
| 96 | if (dl < 0) | ||
| 97 | { | ||
| 98 | #ifdef BN_COUNT | ||
| 99 | fprintf(stderr, " bn_sub_part_words %d + %d (dl < 0, c = %d)\n", cl, dl, c); | ||
| 100 | #endif | ||
| 101 | for (;;) | ||
| 102 | { | ||
| 103 | t = b[0]; | ||
| 104 | r[0] = (0-t-c)&BN_MASK2; | ||
| 105 | if (t != 0) c=1; | ||
| 106 | if (++dl >= 0) break; | ||
| 107 | |||
| 108 | t = b[1]; | ||
| 109 | r[1] = (0-t-c)&BN_MASK2; | ||
| 110 | if (t != 0) c=1; | ||
| 111 | if (++dl >= 0) break; | ||
| 112 | |||
| 113 | t = b[2]; | ||
| 114 | r[2] = (0-t-c)&BN_MASK2; | ||
| 115 | if (t != 0) c=1; | ||
| 116 | if (++dl >= 0) break; | ||
| 117 | |||
| 118 | t = b[3]; | ||
| 119 | r[3] = (0-t-c)&BN_MASK2; | ||
| 120 | if (t != 0) c=1; | ||
| 121 | if (++dl >= 0) break; | ||
| 122 | |||
| 123 | b += 4; | ||
| 124 | r += 4; | ||
| 125 | } | ||
| 126 | } | ||
| 127 | else | ||
| 128 | { | ||
| 129 | int save_dl = dl; | ||
| 130 | #ifdef BN_COUNT | ||
| 131 | fprintf(stderr, " bn_sub_part_words %d + %d (dl > 0, c = %d)\n", cl, dl, c); | ||
| 132 | #endif | ||
| 133 | while(c) | ||
| 134 | { | ||
| 135 | t = a[0]; | ||
| 136 | r[0] = (t-c)&BN_MASK2; | ||
| 137 | if (t != 0) c=0; | ||
| 138 | if (--dl <= 0) break; | ||
| 139 | |||
| 140 | t = a[1]; | ||
| 141 | r[1] = (t-c)&BN_MASK2; | ||
| 142 | if (t != 0) c=0; | ||
| 143 | if (--dl <= 0) break; | ||
| 144 | |||
| 145 | t = a[2]; | ||
| 146 | r[2] = (t-c)&BN_MASK2; | ||
| 147 | if (t != 0) c=0; | ||
| 148 | if (--dl <= 0) break; | ||
| 149 | |||
| 150 | t = a[3]; | ||
| 151 | r[3] = (t-c)&BN_MASK2; | ||
| 152 | if (t != 0) c=0; | ||
| 153 | if (--dl <= 0) break; | ||
| 154 | |||
| 155 | save_dl = dl; | ||
| 156 | a += 4; | ||
| 157 | r += 4; | ||
| 158 | } | ||
| 159 | if (dl > 0) | ||
| 160 | { | ||
| 161 | #ifdef BN_COUNT | ||
| 162 | fprintf(stderr, " bn_sub_part_words %d + %d (dl > 0, c == 0)\n", cl, dl); | ||
| 163 | #endif | ||
| 164 | if (save_dl > dl) | ||
| 165 | { | ||
| 166 | switch (save_dl - dl) | ||
| 167 | { | ||
| 168 | case 1: | ||
| 169 | r[1] = a[1]; | ||
| 170 | if (--dl <= 0) break; | ||
| 171 | case 2: | ||
| 172 | r[2] = a[2]; | ||
| 173 | if (--dl <= 0) break; | ||
| 174 | case 3: | ||
| 175 | r[3] = a[3]; | ||
| 176 | if (--dl <= 0) break; | ||
| 177 | } | ||
| 178 | a += 4; | ||
| 179 | r += 4; | ||
| 180 | } | ||
| 181 | } | ||
| 182 | if (dl > 0) | ||
| 183 | { | ||
| 184 | #ifdef BN_COUNT | ||
| 185 | fprintf(stderr, " bn_sub_part_words %d + %d (dl > 0, copy)\n", cl, dl); | ||
| 186 | #endif | ||
| 187 | for(;;) | ||
| 188 | { | ||
| 189 | r[0] = a[0]; | ||
| 190 | if (--dl <= 0) break; | ||
| 191 | r[1] = a[1]; | ||
| 192 | if (--dl <= 0) break; | ||
| 193 | r[2] = a[2]; | ||
| 194 | if (--dl <= 0) break; | ||
| 195 | r[3] = a[3]; | ||
| 196 | if (--dl <= 0) break; | ||
| 197 | |||
| 198 | a += 4; | ||
| 199 | r += 4; | ||
| 200 | } | ||
| 201 | } | ||
| 202 | } | ||
| 203 | return c; | ||
| 204 | } | ||
| 205 | #endif | ||
| 206 | |||
| 207 | BN_ULONG bn_add_part_words(BN_ULONG *r, | ||
| 208 | const BN_ULONG *a, const BN_ULONG *b, | ||
| 209 | int cl, int dl) | ||
| 210 | { | ||
| 211 | BN_ULONG c, l, t; | ||
| 212 | |||
| 213 | assert(cl >= 0); | ||
| 214 | c = bn_add_words(r, a, b, cl); | ||
| 215 | |||
| 216 | if (dl == 0) | ||
| 217 | return c; | ||
| 218 | |||
| 219 | r += cl; | ||
| 220 | a += cl; | ||
| 221 | b += cl; | ||
| 222 | |||
| 223 | if (dl < 0) | ||
| 224 | { | ||
| 225 | int save_dl = dl; | ||
| 226 | #ifdef BN_COUNT | ||
| 227 | fprintf(stderr, " bn_add_part_words %d + %d (dl < 0, c = %d)\n", cl, dl, c); | ||
| 228 | #endif | ||
| 229 | while (c) | ||
| 230 | { | ||
| 231 | l=(c+b[0])&BN_MASK2; | ||
| 232 | c=(l < c); | ||
| 233 | r[0]=l; | ||
| 234 | if (++dl >= 0) break; | ||
| 235 | |||
| 236 | l=(c+b[1])&BN_MASK2; | ||
| 237 | c=(l < c); | ||
| 238 | r[1]=l; | ||
| 239 | if (++dl >= 0) break; | ||
| 240 | |||
| 241 | l=(c+b[2])&BN_MASK2; | ||
| 242 | c=(l < c); | ||
| 243 | r[2]=l; | ||
| 244 | if (++dl >= 0) break; | ||
| 245 | |||
| 246 | l=(c+b[3])&BN_MASK2; | ||
| 247 | c=(l < c); | ||
| 248 | r[3]=l; | ||
| 249 | if (++dl >= 0) break; | ||
| 250 | |||
| 251 | save_dl = dl; | ||
| 252 | b+=4; | ||
| 253 | r+=4; | ||
| 254 | } | ||
| 255 | if (dl < 0) | ||
| 256 | { | ||
| 257 | #ifdef BN_COUNT | ||
| 258 | fprintf(stderr, " bn_add_part_words %d + %d (dl < 0, c == 0)\n", cl, dl); | ||
| 259 | #endif | ||
| 260 | if (save_dl < dl) | ||
| 261 | { | ||
| 262 | switch (dl - save_dl) | ||
| 263 | { | ||
| 264 | case 1: | ||
| 265 | r[1] = b[1]; | ||
| 266 | if (++dl >= 0) break; | ||
| 267 | case 2: | ||
| 268 | r[2] = b[2]; | ||
| 269 | if (++dl >= 0) break; | ||
| 270 | case 3: | ||
| 271 | r[3] = b[3]; | ||
| 272 | if (++dl >= 0) break; | ||
| 273 | } | ||
| 274 | b += 4; | ||
| 275 | r += 4; | ||
| 276 | } | ||
| 277 | } | ||
| 278 | if (dl < 0) | ||
| 279 | { | ||
| 280 | #ifdef BN_COUNT | ||
| 281 | fprintf(stderr, " bn_add_part_words %d + %d (dl < 0, copy)\n", cl, dl); | ||
| 282 | #endif | ||
| 283 | for(;;) | ||
| 284 | { | ||
| 285 | r[0] = b[0]; | ||
| 286 | if (++dl >= 0) break; | ||
| 287 | r[1] = b[1]; | ||
| 288 | if (++dl >= 0) break; | ||
| 289 | r[2] = b[2]; | ||
| 290 | if (++dl >= 0) break; | ||
| 291 | r[3] = b[3]; | ||
| 292 | if (++dl >= 0) break; | ||
| 293 | |||
| 294 | b += 4; | ||
| 295 | r += 4; | ||
| 296 | } | ||
| 297 | } | ||
| 298 | } | ||
| 299 | else | ||
| 300 | { | ||
| 301 | int save_dl = dl; | ||
| 302 | #ifdef BN_COUNT | ||
| 303 | fprintf(stderr, " bn_add_part_words %d + %d (dl > 0)\n", cl, dl); | ||
| 304 | #endif | ||
| 305 | while (c) | ||
| 306 | { | ||
| 307 | t=(a[0]+c)&BN_MASK2; | ||
| 308 | c=(t < c); | ||
| 309 | r[0]=t; | ||
| 310 | if (--dl <= 0) break; | ||
| 311 | |||
| 312 | t=(a[1]+c)&BN_MASK2; | ||
| 313 | c=(t < c); | ||
| 314 | r[1]=t; | ||
| 315 | if (--dl <= 0) break; | ||
| 316 | |||
| 317 | t=(a[2]+c)&BN_MASK2; | ||
| 318 | c=(t < c); | ||
| 319 | r[2]=t; | ||
| 320 | if (--dl <= 0) break; | ||
| 321 | |||
| 322 | t=(a[3]+c)&BN_MASK2; | ||
| 323 | c=(t < c); | ||
| 324 | r[3]=t; | ||
| 325 | if (--dl <= 0) break; | ||
| 326 | |||
| 327 | save_dl = dl; | ||
| 328 | a+=4; | ||
| 329 | r+=4; | ||
| 330 | } | ||
| 331 | #ifdef BN_COUNT | ||
| 332 | fprintf(stderr, " bn_add_part_words %d + %d (dl > 0, c == 0)\n", cl, dl); | ||
| 333 | #endif | ||
| 334 | if (dl > 0) | ||
| 335 | { | ||
| 336 | if (save_dl > dl) | ||
| 337 | { | ||
| 338 | switch (save_dl - dl) | ||
| 339 | { | ||
| 340 | case 1: | ||
| 341 | r[1] = a[1]; | ||
| 342 | if (--dl <= 0) break; | ||
| 343 | case 2: | ||
| 344 | r[2] = a[2]; | ||
| 345 | if (--dl <= 0) break; | ||
| 346 | case 3: | ||
| 347 | r[3] = a[3]; | ||
| 348 | if (--dl <= 0) break; | ||
| 349 | } | ||
| 350 | a += 4; | ||
| 351 | r += 4; | ||
| 352 | } | ||
| 353 | } | ||
| 354 | if (dl > 0) | ||
| 355 | { | ||
| 356 | #ifdef BN_COUNT | ||
| 357 | fprintf(stderr, " bn_add_part_words %d + %d (dl > 0, copy)\n", cl, dl); | ||
| 358 | #endif | ||
| 359 | for(;;) | ||
| 360 | { | ||
| 361 | r[0] = a[0]; | ||
| 362 | if (--dl <= 0) break; | ||
| 363 | r[1] = a[1]; | ||
| 364 | if (--dl <= 0) break; | ||
| 365 | r[2] = a[2]; | ||
| 366 | if (--dl <= 0) break; | ||
| 367 | r[3] = a[3]; | ||
| 368 | if (--dl <= 0) break; | ||
| 369 | |||
| 370 | a += 4; | ||
| 371 | r += 4; | ||
| 372 | } | ||
| 373 | } | ||
| 374 | } | ||
| 375 | return c; | ||
| 376 | } | ||
| 377 | |||
| 378 | #ifdef BN_RECURSION | 63 | #ifdef BN_RECURSION |
| 379 | /* Karatsuba recursive multiplication algorithm | 64 | /* Karatsuba recursive multiplication algorithm |
| 380 | * (cf. Knuth, The Art of Computer Programming, Vol. 2) */ | 65 | * (cf. Knuth, The Art of Computer Programming, Vol. 2) */ |
| @@ -390,15 +75,14 @@ BN_ULONG bn_add_part_words(BN_ULONG *r, | |||
| 390 | * a[1]*b[1] | 75 | * a[1]*b[1] |
| 391 | */ | 76 | */ |
| 392 | void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2, | 77 | void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2, |
| 393 | int dna, int dnb, BN_ULONG *t) | 78 | BN_ULONG *t) |
| 394 | { | 79 | { |
| 395 | int n=n2/2,c1,c2; | 80 | int n=n2/2,c1,c2; |
| 396 | int tna=n+dna, tnb=n+dnb; | ||
| 397 | unsigned int neg,zero; | 81 | unsigned int neg,zero; |
| 398 | BN_ULONG ln,lo,*p; | 82 | BN_ULONG ln,lo,*p; |
| 399 | 83 | ||
| 400 | # ifdef BN_COUNT | 84 | # ifdef BN_COUNT |
| 401 | fprintf(stderr," bn_mul_recursive %d * %d\n",n2,n2); | 85 | printf(" bn_mul_recursive %d * %d\n",n2,n2); |
| 402 | # endif | 86 | # endif |
| 403 | # ifdef BN_MUL_COMBA | 87 | # ifdef BN_MUL_COMBA |
| 404 | # if 0 | 88 | # if 0 |
| @@ -408,40 +92,34 @@ void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2, | |||
| 408 | return; | 92 | return; |
| 409 | } | 93 | } |
| 410 | # endif | 94 | # endif |
| 411 | /* Only call bn_mul_comba 8 if n2 == 8 and the | 95 | if (n2 == 8) |
| 412 | * two arrays are complete [steve] | ||
| 413 | */ | ||
| 414 | if (n2 == 8 && dna == 0 && dnb == 0) | ||
| 415 | { | 96 | { |
| 416 | bn_mul_comba8(r,a,b); | 97 | bn_mul_comba8(r,a,b); |
| 417 | return; | 98 | return; |
| 418 | } | 99 | } |
| 419 | # endif /* BN_MUL_COMBA */ | 100 | # endif /* BN_MUL_COMBA */ |
| 420 | /* Else do normal multiply */ | ||
| 421 | if (n2 < BN_MUL_RECURSIVE_SIZE_NORMAL) | 101 | if (n2 < BN_MUL_RECURSIVE_SIZE_NORMAL) |
| 422 | { | 102 | { |
| 423 | bn_mul_normal(r,a,n2+dna,b,n2+dnb); | 103 | /* This should not happen */ |
| 424 | if ((dna + dnb) < 0) | 104 | bn_mul_normal(r,a,n2,b,n2); |
| 425 | memset(&r[2*n2 + dna + dnb], 0, | ||
| 426 | sizeof(BN_ULONG) * -(dna + dnb)); | ||
| 427 | return; | 105 | return; |
| 428 | } | 106 | } |
| 429 | /* r=(a[0]-a[1])*(b[1]-b[0]) */ | 107 | /* r=(a[0]-a[1])*(b[1]-b[0]) */ |
| 430 | c1=bn_cmp_part_words(a,&(a[n]),tna,n-tna); | 108 | c1=bn_cmp_words(a,&(a[n]),n); |
| 431 | c2=bn_cmp_part_words(&(b[n]),b,tnb,tnb-n); | 109 | c2=bn_cmp_words(&(b[n]),b,n); |
| 432 | zero=neg=0; | 110 | zero=neg=0; |
| 433 | switch (c1*3+c2) | 111 | switch (c1*3+c2) |
| 434 | { | 112 | { |
| 435 | case -4: | 113 | case -4: |
| 436 | bn_sub_part_words(t, &(a[n]),a, tna,tna-n); /* - */ | 114 | bn_sub_words(t, &(a[n]),a, n); /* - */ |
| 437 | bn_sub_part_words(&(t[n]),b, &(b[n]),tnb,n-tnb); /* - */ | 115 | bn_sub_words(&(t[n]),b, &(b[n]),n); /* - */ |
| 438 | break; | 116 | break; |
| 439 | case -3: | 117 | case -3: |
| 440 | zero=1; | 118 | zero=1; |
| 441 | break; | 119 | break; |
| 442 | case -2: | 120 | case -2: |
| 443 | bn_sub_part_words(t, &(a[n]),a, tna,tna-n); /* - */ | 121 | bn_sub_words(t, &(a[n]),a, n); /* - */ |
| 444 | bn_sub_part_words(&(t[n]),&(b[n]),b, tnb,tnb-n); /* + */ | 122 | bn_sub_words(&(t[n]),&(b[n]),b, n); /* + */ |
| 445 | neg=1; | 123 | neg=1; |
| 446 | break; | 124 | break; |
| 447 | case -1: | 125 | case -1: |
| @@ -450,22 +128,21 @@ void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2, | |||
| 450 | zero=1; | 128 | zero=1; |
| 451 | break; | 129 | break; |
| 452 | case 2: | 130 | case 2: |
| 453 | bn_sub_part_words(t, a, &(a[n]),tna,n-tna); /* + */ | 131 | bn_sub_words(t, a, &(a[n]),n); /* + */ |
| 454 | bn_sub_part_words(&(t[n]),b, &(b[n]),tnb,n-tnb); /* - */ | 132 | bn_sub_words(&(t[n]),b, &(b[n]),n); /* - */ |
| 455 | neg=1; | 133 | neg=1; |
| 456 | break; | 134 | break; |
| 457 | case 3: | 135 | case 3: |
| 458 | zero=1; | 136 | zero=1; |
| 459 | break; | 137 | break; |
| 460 | case 4: | 138 | case 4: |
| 461 | bn_sub_part_words(t, a, &(a[n]),tna,n-tna); | 139 | bn_sub_words(t, a, &(a[n]),n); |
| 462 | bn_sub_part_words(&(t[n]),&(b[n]),b, tnb,tnb-n); | 140 | bn_sub_words(&(t[n]),&(b[n]),b, n); |
| 463 | break; | 141 | break; |
| 464 | } | 142 | } |
| 465 | 143 | ||
| 466 | # ifdef BN_MUL_COMBA | 144 | # ifdef BN_MUL_COMBA |
| 467 | if (n == 4 && dna == 0 && dnb == 0) /* XXX: bn_mul_comba4 could take | 145 | if (n == 4) |
| 468 | extra args to do this well */ | ||
| 469 | { | 146 | { |
| 470 | if (!zero) | 147 | if (!zero) |
| 471 | bn_mul_comba4(&(t[n2]),t,&(t[n])); | 148 | bn_mul_comba4(&(t[n2]),t,&(t[n])); |
| @@ -475,9 +152,7 @@ void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2, | |||
| 475 | bn_mul_comba4(r,a,b); | 152 | bn_mul_comba4(r,a,b); |
| 476 | bn_mul_comba4(&(r[n2]),&(a[n]),&(b[n])); | 153 | bn_mul_comba4(&(r[n2]),&(a[n]),&(b[n])); |
| 477 | } | 154 | } |
| 478 | else if (n == 8 && dna == 0 && dnb == 0) /* XXX: bn_mul_comba8 could | 155 | else if (n == 8) |
| 479 | take extra args to do this | ||
| 480 | well */ | ||
| 481 | { | 156 | { |
| 482 | if (!zero) | 157 | if (!zero) |
| 483 | bn_mul_comba8(&(t[n2]),t,&(t[n])); | 158 | bn_mul_comba8(&(t[n2]),t,&(t[n])); |
| @@ -492,11 +167,11 @@ void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2, | |||
| 492 | { | 167 | { |
| 493 | p= &(t[n2*2]); | 168 | p= &(t[n2*2]); |
| 494 | if (!zero) | 169 | if (!zero) |
| 495 | bn_mul_recursive(&(t[n2]),t,&(t[n]),n,0,0,p); | 170 | bn_mul_recursive(&(t[n2]),t,&(t[n]),n,p); |
| 496 | else | 171 | else |
| 497 | memset(&(t[n2]),0,n2*sizeof(BN_ULONG)); | 172 | memset(&(t[n2]),0,n2*sizeof(BN_ULONG)); |
| 498 | bn_mul_recursive(r,a,b,n,0,0,p); | 173 | bn_mul_recursive(r,a,b,n,p); |
| 499 | bn_mul_recursive(&(r[n2]),&(a[n]),&(b[n]),n,dna,dnb,p); | 174 | bn_mul_recursive(&(r[n2]),&(a[n]),&(b[n]),n,p); |
| 500 | } | 175 | } |
| 501 | 176 | ||
| 502 | /* t[32] holds (a[0]-a[1])*(b[1]-b[0]), c1 is the sign | 177 | /* t[32] holds (a[0]-a[1])*(b[1]-b[0]), c1 is the sign |
| @@ -545,39 +220,39 @@ void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2, | |||
| 545 | 220 | ||
| 546 | /* n+tn is the word length | 221 | /* n+tn is the word length |
| 547 | * t needs to be n*4 is size, as does r */ | 222 | * t needs to be n*4 is size, as does r */ |
| 548 | void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n, | 223 | void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int tn, |
| 549 | int tna, int tnb, BN_ULONG *t) | 224 | int n, BN_ULONG *t) |
| 550 | { | 225 | { |
| 551 | int i,j,n2=n*2; | 226 | int i,j,n2=n*2; |
| 552 | unsigned int c1,c2,neg,zero; | 227 | unsigned int c1,c2,neg,zero; |
| 553 | BN_ULONG ln,lo,*p; | 228 | BN_ULONG ln,lo,*p; |
| 554 | 229 | ||
| 555 | # ifdef BN_COUNT | 230 | # ifdef BN_COUNT |
| 556 | fprintf(stderr," bn_mul_part_recursive (%d+%d) * (%d+%d)\n", | 231 | printf(" bn_mul_part_recursive %d * %d\n",tn+n,tn+n); |
| 557 | tna, n, tnb, n); | ||
| 558 | # endif | 232 | # endif |
| 559 | if (n < 8) | 233 | if (n < 8) |
| 560 | { | 234 | { |
| 561 | bn_mul_normal(r,a,n+tna,b,n+tnb); | 235 | i=tn+n; |
| 236 | bn_mul_normal(r,a,i,b,i); | ||
| 562 | return; | 237 | return; |
| 563 | } | 238 | } |
| 564 | 239 | ||
| 565 | /* r=(a[0]-a[1])*(b[1]-b[0]) */ | 240 | /* r=(a[0]-a[1])*(b[1]-b[0]) */ |
| 566 | c1=bn_cmp_part_words(a,&(a[n]),tna,n-tna); | 241 | c1=bn_cmp_words(a,&(a[n]),n); |
| 567 | c2=bn_cmp_part_words(&(b[n]),b,tnb,tnb-n); | 242 | c2=bn_cmp_words(&(b[n]),b,n); |
| 568 | zero=neg=0; | 243 | zero=neg=0; |
| 569 | switch (c1*3+c2) | 244 | switch (c1*3+c2) |
| 570 | { | 245 | { |
| 571 | case -4: | 246 | case -4: |
| 572 | bn_sub_part_words(t, &(a[n]),a, tna,tna-n); /* - */ | 247 | bn_sub_words(t, &(a[n]),a, n); /* - */ |
| 573 | bn_sub_part_words(&(t[n]),b, &(b[n]),tnb,n-tnb); /* - */ | 248 | bn_sub_words(&(t[n]),b, &(b[n]),n); /* - */ |
| 574 | break; | 249 | break; |
| 575 | case -3: | 250 | case -3: |
| 576 | zero=1; | 251 | zero=1; |
| 577 | /* break; */ | 252 | /* break; */ |
| 578 | case -2: | 253 | case -2: |
| 579 | bn_sub_part_words(t, &(a[n]),a, tna,tna-n); /* - */ | 254 | bn_sub_words(t, &(a[n]),a, n); /* - */ |
| 580 | bn_sub_part_words(&(t[n]),&(b[n]),b, tnb,tnb-n); /* + */ | 255 | bn_sub_words(&(t[n]),&(b[n]),b, n); /* + */ |
| 581 | neg=1; | 256 | neg=1; |
| 582 | break; | 257 | break; |
| 583 | case -1: | 258 | case -1: |
| @@ -586,16 +261,16 @@ void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n, | |||
| 586 | zero=1; | 261 | zero=1; |
| 587 | /* break; */ | 262 | /* break; */ |
| 588 | case 2: | 263 | case 2: |
| 589 | bn_sub_part_words(t, a, &(a[n]),tna,n-tna); /* + */ | 264 | bn_sub_words(t, a, &(a[n]),n); /* + */ |
| 590 | bn_sub_part_words(&(t[n]),b, &(b[n]),tnb,n-tnb); /* - */ | 265 | bn_sub_words(&(t[n]),b, &(b[n]),n); /* - */ |
| 591 | neg=1; | 266 | neg=1; |
| 592 | break; | 267 | break; |
| 593 | case 3: | 268 | case 3: |
| 594 | zero=1; | 269 | zero=1; |
| 595 | /* break; */ | 270 | /* break; */ |
| 596 | case 4: | 271 | case 4: |
| 597 | bn_sub_part_words(t, a, &(a[n]),tna,n-tna); | 272 | bn_sub_words(t, a, &(a[n]),n); |
| 598 | bn_sub_part_words(&(t[n]),&(b[n]),b, tnb,tnb-n); | 273 | bn_sub_words(&(t[n]),&(b[n]),b, n); |
| 599 | break; | 274 | break; |
| 600 | } | 275 | } |
| 601 | /* The zero case isn't yet implemented here. The speedup | 276 | /* The zero case isn't yet implemented here. The speedup |
| @@ -614,59 +289,54 @@ void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n, | |||
| 614 | { | 289 | { |
| 615 | bn_mul_comba8(&(t[n2]),t,&(t[n])); | 290 | bn_mul_comba8(&(t[n2]),t,&(t[n])); |
| 616 | bn_mul_comba8(r,a,b); | 291 | bn_mul_comba8(r,a,b); |
| 617 | bn_mul_normal(&(r[n2]),&(a[n]),tna,&(b[n]),tnb); | 292 | bn_mul_normal(&(r[n2]),&(a[n]),tn,&(b[n]),tn); |
| 618 | memset(&(r[n2+tna+tnb]),0,sizeof(BN_ULONG)*(n2-tna-tnb)); | 293 | memset(&(r[n2+tn*2]),0,sizeof(BN_ULONG)*(n2-tn*2)); |
| 619 | } | 294 | } |
| 620 | else | 295 | else |
| 621 | { | 296 | { |
| 622 | p= &(t[n2*2]); | 297 | p= &(t[n2*2]); |
| 623 | bn_mul_recursive(&(t[n2]),t,&(t[n]),n,0,0,p); | 298 | bn_mul_recursive(&(t[n2]),t,&(t[n]),n,p); |
| 624 | bn_mul_recursive(r,a,b,n,0,0,p); | 299 | bn_mul_recursive(r,a,b,n,p); |
| 625 | i=n/2; | 300 | i=n/2; |
| 626 | /* If there is only a bottom half to the number, | 301 | /* If there is only a bottom half to the number, |
| 627 | * just do it */ | 302 | * just do it */ |
| 628 | if (tna > tnb) | 303 | j=tn-i; |
| 629 | j = tna - i; | ||
| 630 | else | ||
| 631 | j = tnb - i; | ||
| 632 | if (j == 0) | 304 | if (j == 0) |
| 633 | { | 305 | { |
| 634 | bn_mul_recursive(&(r[n2]),&(a[n]),&(b[n]), | 306 | bn_mul_recursive(&(r[n2]),&(a[n]),&(b[n]),i,p); |
| 635 | i,tna-i,tnb-i,p); | ||
| 636 | memset(&(r[n2+i*2]),0,sizeof(BN_ULONG)*(n2-i*2)); | 307 | memset(&(r[n2+i*2]),0,sizeof(BN_ULONG)*(n2-i*2)); |
| 637 | } | 308 | } |
| 638 | else if (j > 0) /* eg, n == 16, i == 8 and tn == 11 */ | 309 | else if (j > 0) /* eg, n == 16, i == 8 and tn == 11 */ |
| 639 | { | 310 | { |
| 640 | bn_mul_part_recursive(&(r[n2]),&(a[n]),&(b[n]), | 311 | bn_mul_part_recursive(&(r[n2]),&(a[n]),&(b[n]), |
| 641 | i,tna-i,tnb-i,p); | 312 | j,i,p); |
| 642 | memset(&(r[n2+tna+tnb]),0, | 313 | memset(&(r[n2+tn*2]),0, |
| 643 | sizeof(BN_ULONG)*(n2-tna-tnb)); | 314 | sizeof(BN_ULONG)*(n2-tn*2)); |
| 644 | } | 315 | } |
| 645 | else /* (j < 0) eg, n == 16, i == 8 and tn == 5 */ | 316 | else /* (j < 0) eg, n == 16, i == 8 and tn == 5 */ |
| 646 | { | 317 | { |
| 647 | memset(&(r[n2]),0,sizeof(BN_ULONG)*n2); | 318 | memset(&(r[n2]),0,sizeof(BN_ULONG)*n2); |
| 648 | if (tna < BN_MUL_RECURSIVE_SIZE_NORMAL | 319 | if (tn < BN_MUL_RECURSIVE_SIZE_NORMAL) |
| 649 | && tnb < BN_MUL_RECURSIVE_SIZE_NORMAL) | ||
| 650 | { | 320 | { |
| 651 | bn_mul_normal(&(r[n2]),&(a[n]),tna,&(b[n]),tnb); | 321 | bn_mul_normal(&(r[n2]),&(a[n]),tn,&(b[n]),tn); |
| 652 | } | 322 | } |
| 653 | else | 323 | else |
| 654 | { | 324 | { |
| 655 | for (;;) | 325 | for (;;) |
| 656 | { | 326 | { |
| 657 | i/=2; | 327 | i/=2; |
| 658 | if (i < tna && i < tnb) | 328 | if (i < tn) |
| 659 | { | 329 | { |
| 660 | bn_mul_part_recursive(&(r[n2]), | 330 | bn_mul_part_recursive(&(r[n2]), |
| 661 | &(a[n]),&(b[n]), | 331 | &(a[n]),&(b[n]), |
| 662 | i,tna-i,tnb-i,p); | 332 | tn-i,i,p); |
| 663 | break; | 333 | break; |
| 664 | } | 334 | } |
| 665 | else if (i <= tna && i <= tnb) | 335 | else if (i == tn) |
| 666 | { | 336 | { |
| 667 | bn_mul_recursive(&(r[n2]), | 337 | bn_mul_recursive(&(r[n2]), |
| 668 | &(a[n]),&(b[n]), | 338 | &(a[n]),&(b[n]), |
| 669 | i,tna-i,tnb-i,p); | 339 | i,p); |
| 670 | break; | 340 | break; |
| 671 | } | 341 | } |
| 672 | } | 342 | } |
| @@ -727,10 +397,10 @@ void bn_mul_low_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2, | |||
| 727 | int n=n2/2; | 397 | int n=n2/2; |
| 728 | 398 | ||
| 729 | # ifdef BN_COUNT | 399 | # ifdef BN_COUNT |
| 730 | fprintf(stderr," bn_mul_low_recursive %d * %d\n",n2,n2); | 400 | printf(" bn_mul_low_recursive %d * %d\n",n2,n2); |
| 731 | # endif | 401 | # endif |
| 732 | 402 | ||
| 733 | bn_mul_recursive(r,a,b,n,0,0,&(t[0])); | 403 | bn_mul_recursive(r,a,b,n,&(t[0])); |
| 734 | if (n >= BN_MUL_LOW_RECURSIVE_SIZE_NORMAL) | 404 | if (n >= BN_MUL_LOW_RECURSIVE_SIZE_NORMAL) |
| 735 | { | 405 | { |
| 736 | bn_mul_low_recursive(&(t[0]),&(a[0]),&(b[n]),n,&(t[n2])); | 406 | bn_mul_low_recursive(&(t[0]),&(a[0]),&(b[n]),n,&(t[n2])); |
| @@ -761,7 +431,7 @@ void bn_mul_high(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, BN_ULONG *l, int n2, | |||
| 761 | BN_ULONG ll,lc,*lp,*mp; | 431 | BN_ULONG ll,lc,*lp,*mp; |
| 762 | 432 | ||
| 763 | # ifdef BN_COUNT | 433 | # ifdef BN_COUNT |
| 764 | fprintf(stderr," bn_mul_high %d * %d\n",n2,n2); | 434 | printf(" bn_mul_high %d * %d\n",n2,n2); |
| 765 | # endif | 435 | # endif |
| 766 | n=n2/2; | 436 | n=n2/2; |
| 767 | 437 | ||
| @@ -814,8 +484,8 @@ void bn_mul_high(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, BN_ULONG *l, int n2, | |||
| 814 | else | 484 | else |
| 815 | # endif | 485 | # endif |
| 816 | { | 486 | { |
| 817 | bn_mul_recursive(&(t[0]),&(r[0]),&(r[n]),n,0,0,&(t[n2])); | 487 | bn_mul_recursive(&(t[0]),&(r[0]),&(r[n]),n,&(t[n2])); |
| 818 | bn_mul_recursive(r,&(a[n]),&(b[n]),n,0,0,&(t[n2])); | 488 | bn_mul_recursive(r,&(a[n]),&(b[n]),n,&(t[n2])); |
| 819 | } | 489 | } |
| 820 | 490 | ||
| 821 | /* s0 == low(al*bl) | 491 | /* s0 == low(al*bl) |
| @@ -940,19 +610,19 @@ void bn_mul_high(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, BN_ULONG *l, int n2, | |||
| 940 | 610 | ||
| 941 | int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) | 611 | int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) |
| 942 | { | 612 | { |
| 943 | int ret=0; | ||
| 944 | int top,al,bl; | 613 | int top,al,bl; |
| 945 | BIGNUM *rr; | 614 | BIGNUM *rr; |
| 615 | int ret = 0; | ||
| 946 | #if defined(BN_MUL_COMBA) || defined(BN_RECURSION) | 616 | #if defined(BN_MUL_COMBA) || defined(BN_RECURSION) |
| 947 | int i; | 617 | int i; |
| 948 | #endif | 618 | #endif |
| 949 | #ifdef BN_RECURSION | 619 | #ifdef BN_RECURSION |
| 950 | BIGNUM *t=NULL; | 620 | BIGNUM *t; |
| 951 | int j=0,k; | 621 | int j,k; |
| 952 | #endif | 622 | #endif |
| 953 | 623 | ||
| 954 | #ifdef BN_COUNT | 624 | #ifdef BN_COUNT |
| 955 | fprintf(stderr,"BN_mul %d * %d\n",a->top,b->top); | 625 | printf("BN_mul %d * %d\n",a->top,b->top); |
| 956 | #endif | 626 | #endif |
| 957 | 627 | ||
| 958 | bn_check_top(a); | 628 | bn_check_top(a); |
| @@ -1005,55 +675,21 @@ int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) | |||
| 1005 | #ifdef BN_RECURSION | 675 | #ifdef BN_RECURSION |
| 1006 | if ((al >= BN_MULL_SIZE_NORMAL) && (bl >= BN_MULL_SIZE_NORMAL)) | 676 | if ((al >= BN_MULL_SIZE_NORMAL) && (bl >= BN_MULL_SIZE_NORMAL)) |
| 1007 | { | 677 | { |
| 1008 | if (i >= -1 && i <= 1) | 678 | if (i == 1 && !BN_get_flags(b,BN_FLG_STATIC_DATA) && bl<b->dmax) |
| 1009 | { | 679 | { |
| 1010 | int sav_j =0; | 680 | #if 0 /* tribute to const-ification, bl<b->dmax above covers for this */ |
| 1011 | /* Find out the power of two lower or equal | 681 | if (bn_wexpand(b,al) == NULL) goto err; |
| 1012 | to the longest of the two numbers */ | 682 | #endif |
| 1013 | if (i >= 0) | 683 | b->d[bl]=0; |
| 1014 | { | ||
| 1015 | j = BN_num_bits_word((BN_ULONG)al); | ||
| 1016 | } | ||
| 1017 | if (i == -1) | ||
| 1018 | { | ||
| 1019 | j = BN_num_bits_word((BN_ULONG)bl); | ||
| 1020 | } | ||
| 1021 | sav_j = j; | ||
| 1022 | j = 1<<(j-1); | ||
| 1023 | assert(j <= al || j <= bl); | ||
| 1024 | k = j+j; | ||
| 1025 | t = BN_CTX_get(ctx); | ||
| 1026 | if (al > j || bl > j) | ||
| 1027 | { | ||
| 1028 | bn_wexpand(t,k*4); | ||
| 1029 | bn_wexpand(rr,k*4); | ||
| 1030 | bn_mul_part_recursive(rr->d,a->d,b->d, | ||
| 1031 | j,al-j,bl-j,t->d); | ||
| 1032 | } | ||
| 1033 | else /* al <= j || bl <= j */ | ||
| 1034 | { | ||
| 1035 | bn_wexpand(t,k*2); | ||
| 1036 | bn_wexpand(rr,k*2); | ||
| 1037 | bn_mul_recursive(rr->d,a->d,b->d, | ||
| 1038 | j,al-j,bl-j,t->d); | ||
| 1039 | } | ||
| 1040 | rr->top=top; | ||
| 1041 | goto end; | ||
| 1042 | } | ||
| 1043 | #if 0 | ||
| 1044 | if (i == 1 && !BN_get_flags(b,BN_FLG_STATIC_DATA)) | ||
| 1045 | { | ||
| 1046 | BIGNUM *tmp_bn = (BIGNUM *)b; | ||
| 1047 | if (bn_wexpand(tmp_bn,al) == NULL) goto err; | ||
| 1048 | tmp_bn->d[bl]=0; | ||
| 1049 | bl++; | 684 | bl++; |
| 1050 | i--; | 685 | i--; |
| 1051 | } | 686 | } |
| 1052 | else if (i == -1 && !BN_get_flags(a,BN_FLG_STATIC_DATA)) | 687 | else if (i == -1 && !BN_get_flags(a,BN_FLG_STATIC_DATA) && al<a->dmax) |
| 1053 | { | 688 | { |
| 1054 | BIGNUM *tmp_bn = (BIGNUM *)a; | 689 | #if 0 /* tribute to const-ification, al<a->dmax above covers for this */ |
| 1055 | if (bn_wexpand(tmp_bn,bl) == NULL) goto err; | 690 | if (bn_wexpand(a,bl) == NULL) goto err; |
| 1056 | tmp_bn->d[al]=0; | 691 | #endif |
| 692 | a->d[al]=0; | ||
| 1057 | al++; | 693 | al++; |
| 1058 | i++; | 694 | i++; |
| 1059 | } | 695 | } |
| @@ -1070,17 +706,26 @@ int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) | |||
| 1070 | if (bn_wexpand(t,k*2) == NULL) goto err; | 706 | if (bn_wexpand(t,k*2) == NULL) goto err; |
| 1071 | if (bn_wexpand(rr,k*2) == NULL) goto err; | 707 | if (bn_wexpand(rr,k*2) == NULL) goto err; |
| 1072 | bn_mul_recursive(rr->d,a->d,b->d,al,t->d); | 708 | bn_mul_recursive(rr->d,a->d,b->d,al,t->d); |
| 709 | rr->top=top; | ||
| 710 | goto end; | ||
| 1073 | } | 711 | } |
| 712 | #if 0 /* tribute to const-ification, rsa/dsa performance is not affected */ | ||
| 1074 | else | 713 | else |
| 1075 | { | 714 | { |
| 1076 | if (bn_wexpand(t,k*4) == NULL) goto err; | 715 | if (bn_wexpand(a,k) == NULL ) goto err; |
| 1077 | if (bn_wexpand(rr,k*4) == NULL) goto err; | 716 | if (bn_wexpand(b,k) == NULL ) goto err; |
| 717 | if (bn_wexpand(t,k*4) == NULL ) goto err; | ||
| 718 | if (bn_wexpand(rr,k*4) == NULL ) goto err; | ||
| 719 | for (i=a->top; i<k; i++) | ||
| 720 | a->d[i]=0; | ||
| 721 | for (i=b->top; i<k; i++) | ||
| 722 | b->d[i]=0; | ||
| 1078 | bn_mul_part_recursive(rr->d,a->d,b->d,al-j,j,t->d); | 723 | bn_mul_part_recursive(rr->d,a->d,b->d,al-j,j,t->d); |
| 1079 | } | 724 | } |
| 1080 | rr->top=top; | 725 | rr->top=top; |
| 1081 | goto end; | 726 | goto end; |
| 1082 | } | ||
| 1083 | #endif | 727 | #endif |
| 728 | } | ||
| 1084 | } | 729 | } |
| 1085 | #endif /* BN_RECURSION */ | 730 | #endif /* BN_RECURSION */ |
| 1086 | if (bn_wexpand(rr,top) == NULL) goto err; | 731 | if (bn_wexpand(rr,top) == NULL) goto err; |
| @@ -1103,7 +748,7 @@ void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb) | |||
| 1103 | BN_ULONG *rr; | 748 | BN_ULONG *rr; |
| 1104 | 749 | ||
| 1105 | #ifdef BN_COUNT | 750 | #ifdef BN_COUNT |
| 1106 | fprintf(stderr," bn_mul_normal %d * %d\n",na,nb); | 751 | printf(" bn_mul_normal %d * %d\n",na,nb); |
| 1107 | #endif | 752 | #endif |
| 1108 | 753 | ||
| 1109 | if (na < nb) | 754 | if (na < nb) |
| @@ -1116,13 +761,7 @@ void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb) | |||
| 1116 | 761 | ||
| 1117 | } | 762 | } |
| 1118 | rr= &(r[na]); | 763 | rr= &(r[na]); |
| 1119 | if (nb <= 0) | 764 | rr[0]=bn_mul_words(r,a,na,b[0]); |
| 1120 | { | ||
| 1121 | (void)bn_mul_words(r,a,na,0); | ||
| 1122 | return; | ||
| 1123 | } | ||
| 1124 | else | ||
| 1125 | rr[0]=bn_mul_words(r,a,na,b[0]); | ||
| 1126 | 765 | ||
| 1127 | for (;;) | 766 | for (;;) |
| 1128 | { | 767 | { |
| @@ -1143,7 +782,7 @@ void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb) | |||
| 1143 | void bn_mul_low_normal(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) | 782 | void bn_mul_low_normal(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) |
| 1144 | { | 783 | { |
| 1145 | #ifdef BN_COUNT | 784 | #ifdef BN_COUNT |
| 1146 | fprintf(stderr," bn_mul_low_normal %d * %d\n",n,n); | 785 | printf(" bn_mul_low_normal %d * %d\n",n,n); |
| 1147 | #endif | 786 | #endif |
| 1148 | bn_mul_words(r,a,n,b[0]); | 787 | bn_mul_words(r,a,n,b[0]); |
| 1149 | 788 | ||
diff --git a/src/lib/libcrypto/bn/bn_prime.c b/src/lib/libcrypto/bn/bn_prime.c index 918b9237c6..e072d9255c 100644 --- a/src/lib/libcrypto/bn/bn_prime.c +++ b/src/lib/libcrypto/bn/bn_prime.c | |||
| @@ -140,6 +140,7 @@ BIGNUM *BN_generate_prime(BIGNUM *ret, int bits, int safe, | |||
| 140 | BN_CTX *ctx; | 140 | BN_CTX *ctx; |
| 141 | int checks = BN_prime_checks_for_size(bits); | 141 | int checks = BN_prime_checks_for_size(bits); |
| 142 | 142 | ||
| 143 | BN_init(&t); | ||
| 143 | ctx=BN_CTX_new(); | 144 | ctx=BN_CTX_new(); |
| 144 | if (ctx == NULL) goto err; | 145 | if (ctx == NULL) goto err; |
| 145 | if (ret == NULL) | 146 | if (ret == NULL) |
| @@ -148,7 +149,6 @@ BIGNUM *BN_generate_prime(BIGNUM *ret, int bits, int safe, | |||
| 148 | } | 149 | } |
| 149 | else | 150 | else |
| 150 | rnd=ret; | 151 | rnd=ret; |
| 151 | BN_init(&t); | ||
| 152 | loop: | 152 | loop: |
| 153 | /* make a random number and set the top and bottom bits */ | 153 | /* make a random number and set the top and bottom bits */ |
| 154 | if (add == NULL) | 154 | if (add == NULL) |
diff --git a/src/lib/libcrypto/bn/bn_rand.c b/src/lib/libcrypto/bn/bn_rand.c index 9e08ccd22e..893c9d2af9 100644 --- a/src/lib/libcrypto/bn/bn_rand.c +++ b/src/lib/libcrypto/bn/bn_rand.c | |||
| @@ -201,7 +201,7 @@ static int bnrand(int pseudorand, BIGNUM *rnd, int bits, int top, int bottom) | |||
| 201 | err: | 201 | err: |
| 202 | if (buf != NULL) | 202 | if (buf != NULL) |
| 203 | { | 203 | { |
| 204 | memset(buf,0,bytes); | 204 | OPENSSL_cleanse(buf,bytes); |
| 205 | OPENSSL_free(buf); | 205 | OPENSSL_free(buf); |
| 206 | } | 206 | } |
| 207 | return(ret); | 207 | return(ret); |
diff --git a/src/lib/libcrypto/bn/bn_word.c b/src/lib/libcrypto/bn/bn_word.c index cd59baa2c4..988e0ca7b3 100644 --- a/src/lib/libcrypto/bn/bn_word.c +++ b/src/lib/libcrypto/bn/bn_word.c | |||
| @@ -123,7 +123,10 @@ int BN_add_word(BIGNUM *a, BN_ULONG w) | |||
| 123 | i=0; | 123 | i=0; |
| 124 | for (;;) | 124 | for (;;) |
| 125 | { | 125 | { |
| 126 | l=(a->d[i]+(BN_ULONG)w)&BN_MASK2; | 126 | if (i >= a->top) |
| 127 | l=w; | ||
| 128 | else | ||
| 129 | l=(a->d[i]+(BN_ULONG)w)&BN_MASK2; | ||
| 127 | a->d[i]=l; | 130 | a->d[i]=l; |
| 128 | if (w > l) | 131 | if (w > l) |
| 129 | w=1; | 132 | w=1; |
