diff options
Diffstat (limited to '')
| -rwxr-xr-x | src/lib/libcrypto/bn/asm/x86-mont.pl | 593 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/x86.pl | 28 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/x86/add.pl | 76 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/x86/comba.pl | 277 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/x86/div.pl | 15 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/x86/f | 3 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/x86/mul.pl | 77 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/x86/mul_add.pl | 87 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/x86/sqr.pl | 60 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/x86/sub.pl | 76 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/x86_64-gcc.c | 606 | ||||
| -rwxr-xr-x | src/lib/libcrypto/bn/asm/x86_64-mont.pl | 330 |
12 files changed, 0 insertions, 2228 deletions
diff --git a/src/lib/libcrypto/bn/asm/x86-mont.pl b/src/lib/libcrypto/bn/asm/x86-mont.pl deleted file mode 100755 index e8f6b05084..0000000000 --- a/src/lib/libcrypto/bn/asm/x86-mont.pl +++ /dev/null | |||
| @@ -1,593 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # October 2005 | ||
| 11 | # | ||
| 12 | # This is a "teaser" code, as it can be improved in several ways... | ||
| 13 | # First of all non-SSE2 path should be implemented (yes, for now it | ||
| 14 | # performs Montgomery multiplication/convolution only on SSE2-capable | ||
| 15 | # CPUs such as P4, others fall down to original code). Then inner loop | ||
| 16 | # can be unrolled and modulo-scheduled to improve ILP and possibly | ||
| 17 | # moved to 128-bit XMM register bank (though it would require input | ||
| 18 | # rearrangement and/or increase bus bandwidth utilization). Dedicated | ||
| 19 | # squaring procedure should give further performance improvement... | ||
| 20 | # Yet, for being draft, the code improves rsa512 *sign* benchmark by | ||
| 21 | # 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-) | ||
| 22 | |||
| 23 | # December 2006 | ||
| 24 | # | ||
| 25 | # Modulo-scheduling SSE2 loops results in further 15-20% improvement. | ||
| 26 | # Integer-only code [being equipped with dedicated squaring procedure] | ||
| 27 | # gives ~40% on rsa512 sign benchmark... | ||
| 28 | |||
# Script setup and bn_mul_mont prologue: add this script's own directory and
# its ../../perlasm sibling to @INC, load x86asm.pl, then emit code that
# leaves with eax=0 when num<4, allocates $frame+4*(num+2) bytes of stack and
# copies the arguments into the new frame.
| 29 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 30 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 31 | require "x86asm.pl"; | ||
| 32 | |||
| 33 | &asm_init($ARGV[0],$0); | ||
| 34 | |||
| 35 | $sse2=0; | ||
| 36 | for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | ||
| 37 | |||
| 38 | &external_label("OPENSSL_ia32cap_P") if ($sse2); | ||
| 39 | |||
| 40 | &function_begin("bn_mul_mont"); | ||
| 41 | |||
| 42 | $i="edx"; | ||
| 43 | $j="ecx"; | ||
| 44 | $ap="esi"; $tp="esi"; # overlapping variables!!! | ||
| 45 | $rp="edi"; $bp="edi"; # overlapping variables!!! | ||
| 46 | $np="ebp"; | ||
| 47 | $num="ebx"; | ||
| 48 | |||
| 49 | $_num=&DWP(4*0,"esp"); # stack top layout | ||
| 50 | $_rp=&DWP(4*1,"esp"); | ||
| 51 | $_ap=&DWP(4*2,"esp"); | ||
| 52 | $_bp=&DWP(4*3,"esp"); | ||
| 53 | $_np=&DWP(4*4,"esp"); | ||
| 54 | $_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp"); | ||
| 55 | $_sp=&DWP(4*6,"esp"); | ||
| 56 | $_bpend=&DWP(4*7,"esp"); | ||
| 57 | $frame=32; # size of above frame rounded up to 16n | ||
| 58 | |||
| 59 | &xor ("eax","eax"); | ||
| 60 | &mov ("edi",&wparam(5)); # int num | ||
| 61 | &cmp ("edi",4); | ||
| 62 | &jl (&label("just_leave")); | ||
| 63 | |||
| 64 | &lea ("esi",&wparam(0)); # put aside pointer to argument block | ||
| 65 | &lea ("edx",&wparam(1)); # address of the ap argument slot (used for the 2K window below) | ||
| 66 | &mov ("ebp","esp"); # saved stack pointer! | ||
| 67 | &add ("edi",2); # extra two words on top of tp | ||
| 68 | &neg ("edi"); | ||
| 69 | &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2)) | ||
| 70 | &neg ("edi"); | ||
| 71 | |||
| 72 | # minimize cache contention by arranging 2K window between stack | ||
| 73 | # pointer and ap argument [np is also position sensitive vector, | ||
| 74 | # but it's assumed to be near ap, as it's allocated at ~same | ||
| 75 | # time]. | ||
| 76 | &mov ("eax","esp"); | ||
| 77 | &sub ("eax","edx"); | ||
| 78 | &and ("eax",2047); | ||
| 79 | &sub ("esp","eax"); # this aligns sp and ap modulo 2048 | ||
| 80 | |||
| 81 | &xor ("edx","esp"); | ||
| 82 | &and ("edx",2048); | ||
| 83 | &xor ("edx",2048); | ||
| 84 | &sub ("esp","edx"); # this splits them apart modulo 4096 | ||
| 85 | |||
| 86 | &and ("esp",-64); # align to cache line | ||
| 87 | |||
| 88 | ################################# load argument block... | ||
| 89 | &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp | ||
| 90 | &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap | ||
| 91 | &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp | ||
| 92 | &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np | ||
| 93 | &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0 | ||
| 94 | #&mov ("edi",&DWP(5*4,"esi"));# int num | ||
| 95 | |||
| 96 | &mov ("esi",&DWP(0,"esi")); # pull n0[0] | ||
| 97 | &mov ($_rp,"eax"); # ... save a copy of argument block | ||
| 98 | &mov ($_ap,"ebx"); | ||
| 99 | &mov ($_bp,"ecx"); | ||
| 100 | &mov ($_np,"edx"); | ||
| 101 | &mov ($_n0,"esi"); | ||
| 102 | &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling | ||
| 103 | #&mov ($_num,$num); # redundant as $num is not reused | ||
| 104 | &mov ($_sp,"ebp"); # saved stack pointer! | ||
| 105 | |||
# MMX/SSE2 path, generated only when -DOPENSSL_IA32_SSE2 was passed ($sse2).
# The emitted code tests bit 26 of OPENSSL_ia32cap_P and jumps to non_sse2
# when it is clear. "1st" handles bp[0]; "outer"/"inner" handle the remaining
# bp[i]. During each outer pass $num is counted down to zero by the inner
# loop and then restored from $j.
| 106 | if($sse2) { | ||
| 107 | $acc0="mm0"; # mmx register bank layout | ||
| 108 | $acc1="mm1"; | ||
| 109 | $car0="mm2"; | ||
| 110 | $car1="mm3"; | ||
| 111 | $mul0="mm4"; | ||
| 112 | $mul1="mm5"; | ||
| 113 | $temp="mm6"; | ||
| 114 | $mask="mm7"; | ||
| 115 | |||
| 116 | &picmeup("eax","OPENSSL_ia32cap_P"); | ||
| 117 | &bt (&DWP(0,"eax"),26); | ||
| 118 | &jnc (&label("non_sse2")); | ||
| 119 | |||
| 120 | &mov ("eax",-1); | ||
| 121 | &movd ($mask,"eax"); # mask 32 lower bits | ||
| 122 | |||
| 123 | &mov ($ap,$_ap); # load input pointers | ||
| 124 | &mov ($bp,$_bp); | ||
| 125 | &mov ($np,$_np); | ||
| 126 | |||
| 127 | &xor ($i,$i); # i=0 | ||
| 128 | &xor ($j,$j); # j=0 | ||
| 129 | |||
| 130 | &movd ($mul0,&DWP(0,$bp)); # bp[0] | ||
| 131 | &movd ($mul1,&DWP(0,$ap)); # ap[0] | ||
| 132 | &movd ($car1,&DWP(0,$np)); # np[0] | ||
| 133 | |||
| 134 | &pmuludq($mul1,$mul0); # ap[0]*bp[0] | ||
| 135 | &movq ($car0,$mul1); | ||
| 136 | &movq ($acc0,$mul1); # I wish movd worked for | ||
| 137 | &pand ($acc0,$mask); # inter-register transfers | ||
| 138 | |||
| 139 | &pmuludq($mul1,$_n0q); # *=n0, yielding the m1 used for np[j]*m1 below | ||
| 140 | |||
| 141 | &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0 | ||
| 142 | &paddq ($car1,$acc0); | ||
| 143 | |||
| 144 | &movd ($acc1,&DWP(4,$np)); # np[1] | ||
| 145 | &movd ($acc0,&DWP(4,$ap)); # ap[1] | ||
| 146 | |||
| 147 | &psrlq ($car0,32); | ||
| 148 | &psrlq ($car1,32); | ||
| 149 | |||
| 150 | &inc ($j); # j++ | ||
| 151 | &set_label("1st",16); | ||
| 152 | &pmuludq($acc0,$mul0); # ap[j]*bp[0] | ||
| 153 | &pmuludq($acc1,$mul1); # np[j]*m1 | ||
| 154 | &paddq ($car0,$acc0); # +=c0 | ||
| 155 | &paddq ($car1,$acc1); # +=c1 | ||
| 156 | |||
| 157 | &movq ($acc0,$car0); | ||
| 158 | &pand ($acc0,$mask); | ||
| 159 | &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] | ||
| 160 | &paddq ($car1,$acc0); # +=ap[j]*bp[0]; | ||
| 161 | &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] | ||
| 162 | &psrlq ($car0,32); | ||
| 163 | &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]= | ||
| 164 | &psrlq ($car1,32); | ||
| 165 | |||
| 166 | &lea ($j,&DWP(1,$j)); | ||
| 167 | &cmp ($j,$num); | ||
| 168 | &jl (&label("1st")); | ||
| 169 | |||
| 170 | &pmuludq($acc0,$mul0); # ap[num-1]*bp[0] | ||
| 171 | &pmuludq($acc1,$mul1); # np[num-1]*m1 | ||
| 172 | &paddq ($car0,$acc0); # +=c0 | ||
| 173 | &paddq ($car1,$acc1); # +=c1 | ||
| 174 | |||
| 175 | &movq ($acc0,$car0); | ||
| 176 | &pand ($acc0,$mask); | ||
| 177 | &paddq ($car1,$acc0); # +=ap[num-1]*bp[0]; | ||
| 178 | &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= | ||
| 179 | |||
| 180 | &psrlq ($car0,32); | ||
| 181 | &psrlq ($car1,32); | ||
| 182 | |||
| 183 | &paddq ($car1,$car0); | ||
| 184 | &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] | ||
| 185 | |||
| 186 | &inc ($i); # i++ | ||
| 187 | &set_label("outer"); | ||
| 188 | &xor ($j,$j); # j=0 | ||
| 189 | |||
| 190 | &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i] | ||
| 191 | &movd ($mul1,&DWP(0,$ap)); # ap[0] | ||
| 192 | &movd ($temp,&DWP($frame,"esp")); # tp[0] | ||
| 193 | &movd ($car1,&DWP(0,$np)); # np[0] | ||
| 194 | &pmuludq($mul1,$mul0); # ap[0]*bp[i] | ||
| 195 | |||
| 196 | &paddq ($mul1,$temp); # +=tp[0] | ||
| 197 | &movq ($acc0,$mul1); | ||
| 198 | &movq ($car0,$mul1); | ||
| 199 | &pand ($acc0,$mask); | ||
| 200 | |||
| 201 | &pmuludq($mul1,$_n0q); # *=n0, yielding the m1 used for np[j]*m1 below | ||
| 202 | |||
| 203 | &pmuludq($car1,$mul1); | ||
| 204 | &paddq ($car1,$acc0); | ||
| 205 | |||
| 206 | &movd ($temp,&DWP($frame+4,"esp")); # tp[1] | ||
| 207 | &movd ($acc1,&DWP(4,$np)); # np[1] | ||
| 208 | &movd ($acc0,&DWP(4,$ap)); # ap[1] | ||
| 209 | |||
| 210 | &psrlq ($car0,32); | ||
| 211 | &psrlq ($car1,32); | ||
| 212 | &paddq ($car0,$temp); # +=tp[1] | ||
| 213 | |||
| 214 | &inc ($j); # j++ | ||
| 215 | &dec ($num); | ||
| 216 | &set_label("inner"); | ||
| 217 | &pmuludq($acc0,$mul0); # ap[j]*bp[i] | ||
| 218 | &pmuludq($acc1,$mul1); # np[j]*m1 | ||
| 219 | &paddq ($car0,$acc0); # +=c0 | ||
| 220 | &paddq ($car1,$acc1); # +=c1 | ||
| 221 | |||
| 222 | &movq ($acc0,$car0); | ||
| 223 | &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1] | ||
| 224 | &pand ($acc0,$mask); | ||
| 225 | &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] | ||
| 226 | &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j] | ||
| 227 | &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] | ||
| 228 | &psrlq ($car0,32); | ||
| 229 | &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]= | ||
| 230 | &psrlq ($car1,32); | ||
| 231 | &paddq ($car0,$temp); # +=tp[j+1] | ||
| 232 | |||
| 233 | &dec ($num); | ||
| 234 | &lea ($j,&DWP(1,$j)); # j++ | ||
| 235 | &jnz (&label("inner")); | ||
| 236 | |||
| 237 | &mov ($num,$j); | ||
| 238 | &pmuludq($acc0,$mul0); # ap[num-1]*bp[i] | ||
| 239 | &pmuludq($acc1,$mul1); # np[num-1]*m1 | ||
| 240 | &paddq ($car0,$acc0); # +=c0 | ||
| 241 | &paddq ($car1,$acc1); # +=c1 | ||
| 242 | |||
| 243 | &movq ($acc0,$car0); | ||
| 244 | &pand ($acc0,$mask); | ||
| 245 | &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1] | ||
| 246 | &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= | ||
| 247 | &psrlq ($car0,32); | ||
| 248 | &psrlq ($car1,32); | ||
| 249 | |||
| 250 | &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num] | ||
| 251 | &paddq ($car1,$car0); | ||
| 252 | &paddq ($car1,$temp); | ||
| 253 | &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] | ||
| 254 | |||
| 255 | &lea ($i,&DWP(1,$i)); # i++ | ||
| 256 | &cmp ($i,$num); | ||
| 257 | &jle (&label("outer")); | ||
| 258 | |||
| 259 | &emms (); # done with mmx bank | ||
| 260 | &jmp (&label("common_tail")); | ||
| 261 | |||
| 262 | &set_label("non_sse2",16); | ||
| 263 | } | ||
| 264 | |||
# Integer-only path. The "if (0)" arm is disabled; it would restore esp and
# leave with eax=0. The "else" arm emits a general multiply
# (mull / 1stmadd / 2ndmadd) and a dedicated squaring routine
# (bn_sqr_mont / sqr / 3rdmadd / sqradd); the jz below picks squaring when
# ap==bp and num is even. Both end by branching to common_tail.
| 265 | if (0) { | ||
| 266 | &mov ("esp",$_sp); | ||
| 267 | &xor ("eax","eax"); # signal "not fast enough [yet]" | ||
| 268 | &jmp (&label("just_leave")); | ||
| 269 | # While the below code provides competitive performance for | ||
| 270 | # all key lengths on modern Intel cores, it's still more | ||
| 271 | # than 10% slower for 4096-bit key elsewhere:-( "Competitive" | ||
| 272 | # means compared to the original integer-only assembler. | ||
| 273 | # 512-bit RSA sign is better by ~40%, but that's about all | ||
| 274 | # one can say about all CPUs... | ||
| 275 | } else { | ||
| 276 | $inp="esi"; # integer path uses these registers differently | ||
| 277 | $word="edi"; | ||
| 278 | $carry="ebp"; | ||
| 279 | |||
| 280 | &mov ($inp,$_ap); | ||
| 281 | &lea ($carry,&DWP(1,$num)); | ||
| 282 | &mov ($word,$_bp); | ||
| 283 | &xor ($j,$j); # j=0 | ||
| 284 | &mov ("edx",$inp); | ||
| 285 | &and ($carry,1); # see if num is even | ||
| 286 | &sub ("edx",$word); # see if ap==bp | ||
| 287 | &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num] | ||
| 288 | &or ($carry,"edx"); | ||
| 289 | &mov ($word,&DWP(0,$word)); # bp[0] | ||
| 290 | &jz (&label("bn_sqr_mont")); | ||
| 291 | &mov ($_bpend,"eax"); | ||
| 292 | &mov ("eax",&DWP(0,$inp)); | ||
| 293 | &xor ("edx","edx"); | ||
| 294 | |||
| 295 | &set_label("mull",16); | ||
| 296 | &mov ($carry,"edx"); | ||
| 297 | &mul ($word); # ap[j]*bp[0] | ||
| 298 | &add ($carry,"eax"); | ||
| 299 | &lea ($j,&DWP(1,$j)); | ||
| 300 | &adc ("edx",0); | ||
| 301 | &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1] | ||
| 302 | &cmp ($j,$num); | ||
| 303 | &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= | ||
| 304 | &jl (&label("mull")); | ||
| 305 | |||
| 306 | &mov ($carry,"edx"); | ||
| 307 | &mul ($word); # ap[num-1]*bp[0] | ||
| 308 | &mov ($word,$_n0); | ||
| 309 | &add ("eax",$carry); | ||
| 310 | &mov ($inp,$_np); | ||
| 311 | &adc ("edx",0); | ||
| 312 | &imul ($word,&DWP($frame,"esp")); # n0*tp[0] | ||
| 313 | |||
| 314 | &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]= | ||
| 315 | &xor ($j,$j); | ||
| 316 | &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= | ||
| 317 | &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= | ||
| 318 | |||
| 319 | &mov ("eax",&DWP(0,$inp)); # np[0] | ||
| 320 | &mul ($word); # np[0]*m | ||
| 321 | &add ("eax",&DWP($frame,"esp")); # +=tp[0] (sum is overwritten next; only CF feeds the adc) | ||
| 322 | &mov ("eax",&DWP(4,$inp)); # np[1] | ||
| 323 | &adc ("edx",0); | ||
| 324 | &inc ($j); | ||
| 325 | |||
| 326 | &jmp (&label("2ndmadd")); | ||
| 327 | |||
| 328 | &set_label("1stmadd",16); | ||
| 329 | &mov ($carry,"edx"); | ||
| 330 | &mul ($word); # ap[j]*bp[i] | ||
| 331 | &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] | ||
| 332 | &lea ($j,&DWP(1,$j)); | ||
| 333 | &adc ("edx",0); | ||
| 334 | &add ($carry,"eax"); | ||
| 335 | &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1] | ||
| 336 | &adc ("edx",0); | ||
| 337 | &cmp ($j,$num); | ||
| 338 | &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= | ||
| 339 | &jl (&label("1stmadd")); | ||
| 340 | |||
| 341 | &mov ($carry,"edx"); | ||
| 342 | &mul ($word); # ap[num-1]*bp[i] | ||
| 343 | &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1] | ||
| 344 | &mov ($word,$_n0); | ||
| 345 | &adc ("edx",0); | ||
| 346 | &mov ($inp,$_np); | ||
| 347 | &add ($carry,"eax"); | ||
| 348 | &adc ("edx",0); | ||
| 349 | &imul ($word,&DWP($frame,"esp")); # n0*tp[0] | ||
| 350 | |||
| 351 | &xor ($j,$j); | ||
| 352 | &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] | ||
| 353 | &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]= | ||
| 354 | &adc ($j,0); | ||
| 355 | &mov ("eax",&DWP(0,$inp)); # np[0] | ||
| 356 | &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= | ||
| 357 | &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= | ||
| 358 | |||
| 359 | &mul ($word); # np[0]*m | ||
| 360 | &add ("eax",&DWP($frame,"esp")); # +=tp[0] (sum is overwritten next; only CF feeds the adc) | ||
| 361 | &mov ("eax",&DWP(4,$inp)); # np[1] | ||
| 362 | &adc ("edx",0); | ||
| 363 | &mov ($j,1); | ||
| 364 | |||
| 365 | &set_label("2ndmadd",16); | ||
| 366 | &mov ($carry,"edx"); | ||
| 367 | &mul ($word); # np[j]*m | ||
| 368 | &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] | ||
| 369 | &lea ($j,&DWP(1,$j)); | ||
| 370 | &adc ("edx",0); | ||
| 371 | &add ($carry,"eax"); | ||
| 372 | &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1] | ||
| 373 | &adc ("edx",0); | ||
| 374 | &cmp ($j,$num); | ||
| 375 | &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]= | ||
| 376 | &jl (&label("2ndmadd")); | ||
| 377 | |||
| 378 | &mov ($carry,"edx"); | ||
| 379 | &mul ($word); # np[j]*m | ||
| 380 | &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1] | ||
| 381 | &adc ("edx",0); | ||
| 382 | &add ($carry,"eax"); | ||
| 383 | &adc ("edx",0); | ||
| 384 | &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]= | ||
| 385 | |||
| 386 | &xor ("eax","eax"); | ||
| 387 | &mov ($j,$_bp); # &bp[i] | ||
| 388 | &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] | ||
| 389 | &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] | ||
| 390 | &lea ($j,&DWP(4,$j)); | ||
| 391 | &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= | ||
| 392 | &cmp ($j,$_bpend); | ||
| 393 | &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= | ||
| 394 | &je (&label("common_tail")); | ||
| 395 | |||
| 396 | &mov ($word,&DWP(0,$j)); # bp[i+1] | ||
| 397 | &mov ($inp,$_ap); | ||
| 398 | &mov ($_bp,$j); # &bp[++i] | ||
| 399 | &xor ($j,$j); | ||
| 400 | &xor ("edx","edx"); | ||
| 401 | &mov ("eax",&DWP(0,$inp)); | ||
| 402 | &jmp (&label("1stmadd")); | ||
| 403 | |||
| 404 | &set_label("bn_sqr_mont",16); | ||
| 405 | $sbit=$num; | ||
| 406 | &mov ($_num,$num); | ||
| 407 | &mov ($_bp,$j); # i=0 | ||
| 408 | |||
| 409 | &mov ("eax",$word); # ap[0] | ||
| 410 | &mul ($word); # ap[0]*ap[0] | ||
| 411 | &mov (&DWP($frame,"esp"),"eax"); # tp[0]= | ||
| 412 | &mov ($sbit,"edx"); | ||
| 413 | &shr ("edx",1); | ||
| 414 | &and ($sbit,1); | ||
| 415 | &inc ($j); | ||
| 416 | &set_label("sqr",16); | ||
| 417 | &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] | ||
| 418 | &mov ($carry,"edx"); | ||
| 419 | &mul ($word); # ap[j]*ap[0] | ||
| 420 | &add ("eax",$carry); | ||
| 421 | &lea ($j,&DWP(1,$j)); | ||
| 422 | &adc ("edx",0); | ||
| 423 | &lea ($carry,&DWP(0,$sbit,"eax",2)); | ||
| 424 | &shr ("eax",31); | ||
| 425 | &cmp ($j,$_num); | ||
| 426 | &mov ($sbit,"eax"); | ||
| 427 | &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= | ||
| 428 | &jl (&label("sqr")); | ||
| 429 | |||
| 430 | &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1] | ||
| 431 | &mov ($carry,"edx"); | ||
| 432 | &mul ($word); # ap[num-1]*ap[0] | ||
| 433 | &add ("eax",$carry); | ||
| 434 | &mov ($word,$_n0); | ||
| 435 | &adc ("edx",0); | ||
| 436 | &mov ($inp,$_np); | ||
| 437 | &lea ($carry,&DWP(0,$sbit,"eax",2)); | ||
| 438 | &imul ($word,&DWP($frame,"esp")); # n0*tp[0] | ||
| 439 | &shr ("eax",31); | ||
| 440 | &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]= | ||
| 441 | |||
| 442 | &lea ($carry,&DWP(0,"eax","edx",2)); | ||
| 443 | &mov ("eax",&DWP(0,$inp)); # np[0] | ||
| 444 | &shr ("edx",31); | ||
| 445 | &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]= | ||
| 446 | &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]= | ||
| 447 | |||
| 448 | &mul ($word); # np[0]*m | ||
| 449 | &add ("eax",&DWP($frame,"esp")); # +=tp[0] (sum is overwritten below; only CF feeds the adc) | ||
| 450 | &mov ($num,$j); | ||
| 451 | &adc ("edx",0); | ||
| 452 | &mov ("eax",&DWP(4,$inp)); # np[1] | ||
| 453 | &mov ($j,1); | ||
| 454 | |||
| 455 | &set_label("3rdmadd",16); | ||
| 456 | &mov ($carry,"edx"); | ||
| 457 | &mul ($word); # np[j]*m | ||
| 458 | &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] | ||
| 459 | &adc ("edx",0); | ||
| 460 | &add ($carry,"eax"); | ||
| 461 | &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1] | ||
| 462 | &adc ("edx",0); | ||
| 463 | &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]= | ||
| 464 | |||
| 465 | &mov ($carry,"edx"); | ||
| 466 | &mul ($word); # np[j+1]*m | ||
| 467 | &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1] | ||
| 468 | &lea ($j,&DWP(2,$j)); | ||
| 469 | &adc ("edx",0); | ||
| 470 | &add ($carry,"eax"); | ||
| 471 | &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2] | ||
| 472 | &adc ("edx",0); | ||
| 473 | &cmp ($j,$num); | ||
| 474 | &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]= | ||
| 475 | &jl (&label("3rdmadd")); | ||
| 476 | |||
| 477 | &mov ($carry,"edx"); | ||
| 478 | &mul ($word); # np[j]*m | ||
| 479 | &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1] | ||
| 480 | &adc ("edx",0); | ||
| 481 | &add ($carry,"eax"); | ||
| 482 | &adc ("edx",0); | ||
| 483 | &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]= | ||
| 484 | |||
| 485 | &mov ($j,$_bp); # i | ||
| 486 | &xor ("eax","eax"); | ||
| 487 | &mov ($inp,$_ap); | ||
| 488 | &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] | ||
| 489 | &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] | ||
| 490 | &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= | ||
| 491 | &cmp ($j,$num); | ||
| 492 | &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= | ||
| 493 | &je (&label("common_tail")); | ||
| 494 | |||
| 495 | &mov ($word,&DWP(4,$inp,$j,4)); # ap[i] | ||
| 496 | &lea ($j,&DWP(1,$j)); | ||
| 497 | &mov ("eax",$word); | ||
| 498 | &mov ($_bp,$j); # ++i | ||
| 499 | &mul ($word); # ap[i]*ap[i] | ||
| 500 | &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i] | ||
| 501 | &adc ("edx",0); | ||
| 502 | &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]= | ||
| 503 | &xor ($carry,$carry); | ||
| 504 | &cmp ($j,$num); | ||
| 505 | &lea ($j,&DWP(1,$j)); | ||
| 506 | &je (&label("sqrlast")); | ||
| 507 | |||
| 508 | &mov ($sbit,"edx"); # zaps $num | ||
| 509 | &shr ("edx",1); | ||
| 510 | &and ($sbit,1); | ||
| 511 | &set_label("sqradd",16); | ||
| 512 | &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] | ||
| 513 | &mov ($carry,"edx"); | ||
| 514 | &mul ($word); # ap[j]*ap[i] | ||
| 515 | &add ("eax",$carry); | ||
| 516 | &lea ($carry,&DWP(0,"eax","eax")); | ||
| 517 | &adc ("edx",0); | ||
| 518 | &shr ("eax",31); | ||
| 519 | &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] | ||
| 520 | &lea ($j,&DWP(1,$j)); | ||
| 521 | &adc ("eax",0); | ||
| 522 | &add ($carry,$sbit); | ||
| 523 | &adc ("eax",0); | ||
| 524 | &cmp ($j,$_num); | ||
| 525 | &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= | ||
| 526 | &mov ($sbit,"eax"); | ||
| 527 | &jle (&label("sqradd")); | ||
| 528 | |||
| 529 | &mov ($carry,"edx"); | ||
| 530 | &add ("edx","edx"); | ||
| 531 | &shr ($carry,31); | ||
| 532 | &add ("edx",$sbit); | ||
| 533 | &adc ($carry,0); | ||
| 534 | &set_label("sqrlast"); | ||
| 535 | &mov ($word,$_n0); | ||
| 536 | &mov ($inp,$_np); | ||
| 537 | &imul ($word,&DWP($frame,"esp")); # n0*tp[0] | ||
| 538 | |||
| 539 | &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num] | ||
| 540 | &mov ("eax",&DWP(0,$inp)); # np[0] | ||
| 541 | &adc ($carry,0); | ||
| 542 | &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]= | ||
| 543 | &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]= | ||
| 544 | |||
| 545 | &mul ($word); # np[0]*m | ||
| 546 | &add ("eax",&DWP($frame,"esp")); # +=tp[0] (sum is overwritten below; only CF feeds the adc) | ||
| 547 | &lea ($num,&DWP(-1,$j)); | ||
| 548 | &adc ("edx",0); | ||
| 549 | &mov ($j,1); | ||
| 550 | &mov ("eax",&DWP(4,$inp)); # np[1] | ||
| 551 | |||
| 552 | &jmp (&label("3rdmadd")); | ||
| 553 | } | ||
| 554 | |||
# Common tail of bn_mul_mont: subtract np[] from tp[] into rp[], turn the
# final borrow into an all-ones/all-zeros mask in eax, use that mask and its
# complement to pick tp or rp as the copy source, copy the result into rp[]
# while overwriting the temporary vector, then restore esp and return 1.
| 555 | &set_label("common_tail",16); | ||
| 556 | &mov ($np,$_np); # load modulus pointer | ||
| 557 | &mov ($rp,$_rp); # load result pointer | ||
| 558 | &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped] | ||
| 559 | |||
| 560 | &mov ("eax",&DWP(0,$tp)); # tp[0] | ||
| 561 | &mov ($j,$num); # j=num-1 | ||
| 562 | &xor ($i,$i); # i=0 and clear CF! | ||
| 563 | |||
| 564 | &set_label("sub",16); | ||
| 565 | &sbb ("eax",&DWP(0,$np,$i,4)); | ||
| 566 | &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i] | ||
| 567 | &dec ($j); # doesn't affect CF! | ||
| 568 | &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1] | ||
| 569 | &lea ($i,&DWP(1,$i)); # i++ | ||
| 570 | &jge (&label("sub")); | ||
| 571 | |||
| 572 | &sbb ("eax",0); # handle upmost overflow bit | ||
| 573 | &and ($tp,"eax"); | ||
| 574 | &not ("eax"); | ||
| 575 | &mov ($np,$rp); | ||
| 576 | &and ($np,"eax"); | ||
| 577 | &or ($tp,$np); # tp=carry?tp:rp | ||
| 578 | |||
| 579 | &set_label("copy",16); # copy or in-place refresh | ||
| 580 | &mov ("eax",&DWP(0,$tp,$num,4)); | ||
| 581 | &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i] | ||
| 582 | &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector | ||
| 583 | &dec ($num); | ||
| 584 | &jge (&label("copy")); | ||
| 585 | |||
| 586 | &mov ("esp",$_sp); # pull saved stack pointer | ||
| 587 | &mov ("eax",1); | ||
| 588 | &set_label("just_leave"); | ||
| 589 | &function_end("bn_mul_mont"); | ||
| 590 | |||
| 591 | &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>"); | ||
| 592 | |||
| 593 | &asm_finish(); | ||
diff --git a/src/lib/libcrypto/bn/asm/x86.pl b/src/lib/libcrypto/bn/asm/x86.pl deleted file mode 100644 index 1bc4f1bb27..0000000000 --- a/src/lib/libcrypto/bn/asm/x86.pl +++ /dev/null | |||
| @@ -1,28 +0,0 @@ | |||
| 1 | #!/usr/local/bin/perl | ||
| 2 | |||
# Driver script: loads x86asm.pl and the per-routine generators under x86/,
# then, between asm_init and asm_finish, emits the word-level bignum routines
# plus the 8- and 4-word comba multiply and square routines.
| 3 | push(@INC,"perlasm","../../perlasm"); | ||
| 4 | require "x86asm.pl"; | ||
| 5 | |||
| 6 | require("x86/mul_add.pl"); | ||
| 7 | require("x86/mul.pl"); | ||
| 8 | require("x86/sqr.pl"); | ||
| 9 | require("x86/div.pl"); | ||
| 10 | require("x86/add.pl"); | ||
| 11 | require("x86/sub.pl"); | ||
| 12 | require("x86/comba.pl"); | ||
| 13 | |||
| 14 | &asm_init($ARGV[0],$0); | ||
| 15 | |||
| 16 | &bn_mul_add_words("bn_mul_add_words"); | ||
| 17 | &bn_mul_words("bn_mul_words"); | ||
| 18 | &bn_sqr_words("bn_sqr_words"); | ||
| 19 | &bn_div_words("bn_div_words"); | ||
| 20 | &bn_add_words("bn_add_words"); | ||
| 21 | &bn_sub_words("bn_sub_words"); | ||
| 22 | &bn_mul_comba("bn_mul_comba8",8); | ||
| 23 | &bn_mul_comba("bn_mul_comba4",4); | ||
| 24 | &bn_sqr_comba("bn_sqr_comba8",8); | ||
| 25 | &bn_sqr_comba("bn_sqr_comba4",4); | ||
| 26 | |||
| 27 | &asm_finish(); | ||
| 28 | |||
diff --git a/src/lib/libcrypto/bn/asm/x86/add.pl b/src/lib/libcrypto/bn/asm/x86/add.pl deleted file mode 100644 index 0b5cf583e3..0000000000 --- a/src/lib/libcrypto/bn/asm/x86/add.pl +++ /dev/null | |||
| @@ -1,76 +0,0 @@ | |||
| 1 | #!/usr/local/bin/perl | ||
| 2 | # x86 assembler | ||
| 3 | |||
# bn_add_words($name): emits a function called $name that takes r, a, b and
# num as its four arguments and adds a[] and b[] word by word into r[],
# keeping the running carry in eax. Words are handled eight at a time in an
# unrolled loop, then up to seven remaining words one at a time; the final
# carry is left in eax.
| 4 | sub bn_add_words | ||
| 5 | { | ||
| 6 | local($name)=@_; | ||
| 7 | |||
| 8 | &function_begin($name,""); | ||
| 9 | |||
| 10 | &comment(""); | ||
| 11 | $a="esi"; | ||
| 12 | $b="edi"; | ||
| 13 | $c="eax"; | ||
| 14 | $r="ebx"; | ||
| 15 | $tmp1="ecx"; | ||
| 16 | $tmp2="edx"; | ||
| 17 | $num="ebp"; | ||
| 18 | |||
| 19 | &mov($r,&wparam(0)); # get r | ||
| 20 | &mov($a,&wparam(1)); # get a | ||
| 21 | &mov($b,&wparam(2)); # get b | ||
| 22 | &mov($num,&wparam(3)); # get num | ||
| 23 | &xor($c,$c); # clear carry | ||
| 24 | &and($num,0xfffffff8); # num rounded down to a multiple of 8 | ||
| 25 | |||
| 26 | &jz(&label("aw_finish")); | ||
| 27 | |||
| 28 | &set_label("aw_loop",0); | ||
| 29 | for ($i=0; $i<8; $i++) | ||
| 30 | { | ||
| 31 | &comment("Round $i"); | ||
| 32 | |||
| 33 | &mov($tmp1,&DWP($i*4,$a,"",0)); # *a | ||
| 34 | &mov($tmp2,&DWP($i*4,$b,"",0)); # *b | ||
| 35 | &add($tmp1,$c); | ||
| 36 | &mov($c,0); | ||
| 37 | &adc($c,$c); | ||
| 38 | &add($tmp1,$tmp2); | ||
| 39 | &adc($c,0); | ||
| 40 | &mov(&DWP($i*4,$r,"",0),$tmp1); # *r | ||
| 41 | } | ||
| 42 | |||
| 43 | &comment(""); | ||
| 44 | &add($a,32); | ||
| 45 | &add($b,32); | ||
| 46 | &add($r,32); | ||
| 47 | &sub($num,8); | ||
| 48 | &jnz(&label("aw_loop")); | ||
| 49 | |||
| 50 | &set_label("aw_finish",0); | ||
| 51 | &mov($num,&wparam(3)); # get num | ||
| 52 | &and($num,7); | ||
| 53 | &jz(&label("aw_end")); | ||
| 54 | |||
| 55 | for ($i=0; $i<7; $i++) | ||
| 56 | { | ||
| 57 | &comment("Tail Round $i"); | ||
| 58 | &mov($tmp1,&DWP($i*4,$a,"",0)); # *a | ||
| 59 | &mov($tmp2,&DWP($i*4,$b,"",0));# *b | ||
| 60 | &add($tmp1,$c); | ||
| 61 | &mov($c,0); | ||
| 62 | &adc($c,$c); | ||
| 63 | &add($tmp1,$tmp2); | ||
| 64 | &adc($c,0); | ||
| 65 | &dec($num) if ($i != 6); | ||
| 66 | &mov(&DWP($i*4,$r,"",0),$tmp1); # *r | ||
| 67 | &jz(&label("aw_end")) if ($i != 6); | ||
| 68 | } | ||
| 69 | &set_label("aw_end",0); | ||
| 70 | |||
| 71 | # &mov("eax",$c); # $c is "eax" | ||
| 72 | |||
| 73 | &function_end($name); | ||
| 74 | } | ||
| 75 | |||
| 76 | 1; | ||
diff --git a/src/lib/libcrypto/bn/asm/x86/comba.pl b/src/lib/libcrypto/bn/asm/x86/comba.pl deleted file mode 100644 index 2291253629..0000000000 --- a/src/lib/libcrypto/bn/asm/x86/comba.pl +++ /dev/null | |||
| @@ -1,277 +0,0 @@ | |||
| 1 | #!/usr/local/bin/perl | ||
| 2 | # x86 assembler | ||
| 3 | |||
# mul_add_c: emits one multiply-accumulate step. The product of eax and edx
# is added into the three-word accumulator ($c0,$c1,$c2). $pos selects the
# loads and stores interleaved with it: 0 loads a[$na] and b[$nb]; any value
# above 0 fetches r from the first argument and stores $c0 to r[$i]; 1 also
# loads the next a[$na] and b[$nb].
| 4 | sub mul_add_c | ||
| 5 | { | ||
| 6 | local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_; | ||
| 7 | |||
| 8 | # pos == -1 if eax and edx are pre-loaded, 0 to load from next | ||
| 9 | # words, and 1 if load return value | ||
| 10 | |||
| 11 | &comment("mul a[$ai]*b[$bi]"); | ||
| 12 | |||
| 13 | # "eax" and "edx" will always be pre-loaded. | ||
| 14 | # &mov("eax",&DWP($ai*4,$a,"",0)) ; | ||
| 15 | # &mov("edx",&DWP($bi*4,$b,"",0)); | ||
| 16 | |||
| 17 | &mul("edx"); | ||
| 18 | &add($c0,"eax"); | ||
| 19 | &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a | ||
| 20 | &mov("eax",&wparam(0)) if $pos > 0; # load r[] | ||
| 21 | ### | ||
| 22 | &adc($c1,"edx"); | ||
| 23 | &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # load next b | ||
| 24 | &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # load next b | ||
| 25 | ### | ||
| 26 | &adc($c2,0); | ||
| 27 | # if pos > 1 this is the last round: r[] is stored but no further operands are loaded | ||
| 28 | &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[]; | ||
| 29 | &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next a | ||
| 30 | } | ||
| 31 | |||
# sqr_add_c: emits one accumulate step for squaring. eax is multiplied by
# itself when $ai == $bi, otherwise by edx, and the product is added into
# ($c0,$c1,$c2). With $pos > 0, $c0 is stored to r[$i] through $r; with
# $pos == 0 or 1 the next a[] operands are loaded.
| 32 | sub sqr_add_c | ||
| 33 | { | ||
| 34 | local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_; | ||
| 35 | |||
| 36 | # pos == -1 if eax and edx are pre-loaded, 0 to load from next | ||
| 37 | # words, and 1 if load return value | ||
| 38 | |||
| 39 | &comment("sqr a[$ai]*a[$bi]"); | ||
| 40 | |||
| 41 | # "eax" and "edx" will always be pre-loaded. | ||
| 42 | # &mov("eax",&DWP($ai*4,$a,"",0)) ; | ||
| 43 | # &mov("edx",&DWP($bi*4,$b,"",0)); | ||
| 44 | |||
| 45 | if ($ai == $bi) | ||
| 46 | { &mul("eax");} | ||
| 47 | else | ||
| 48 | { &mul("edx");} | ||
| 49 | &add($c0,"eax"); | ||
| 50 | &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a | ||
| 51 | ### | ||
| 52 | &adc($c1,"edx"); | ||
| 53 | &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb); | ||
| 54 | ### | ||
| 55 | &adc($c2,0); | ||
| 56 | # if pos > 1 this is the last round: r[] is stored but no further operands are loaded | ||
| 57 | &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[]; | ||
| 58 | &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next a[$na] | ||
| 59 | } | ||
| 60 | |||
# sqr_add_c2: like a squaring accumulate step, but the edx:eax product is
# doubled before it is added into ($c0,$c1,$c2); the bit carried out of the
# doubling is added to $c2. With $pos > 0, $c0 is stored to r[$i] through $r.
| 61 | sub sqr_add_c2 | ||
| 62 | { | ||
| 63 | local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_; | ||
| 64 | |||
| 65 | # pos == -1 if eax and edx are pre-loaded, 0 to load from next | ||
| 66 | # words, and 1 if load return value | ||
| 67 | |||
| 68 | &comment("sqr a[$ai]*a[$bi]"); | ||
| 69 | |||
| 70 | # "eax" and "edx" will always be pre-loaded. | ||
| 71 | # &mov("eax",&DWP($ai*4,$a,"",0)) ; | ||
| 72 | # &mov("edx",&DWP($bi*4,$a,"",0)); | ||
| 73 | |||
| 74 | if ($ai == $bi) | ||
| 75 | { &mul("eax");} | ||
| 76 | else | ||
| 77 | { &mul("edx");} | ||
| 78 | &add("eax","eax"); | ||
| 79 | ### | ||
| 80 | &adc("edx","edx"); | ||
| 81 | ### | ||
| 82 | &adc($c2,0); | ||
| 83 | &add($c0,"eax"); | ||
| 84 | &adc($c1,"edx"); | ||
| 85 | &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a | ||
| 86 | &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next a[$na] | ||
| 87 | &adc($c2,0); | ||
| 88 | &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[]; | ||
| 89 | &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb); | ||
| 90 | ### | ||
| 91 | } | ||
| 92 | |||
# bn_mul_comba($name,$num): emits a fully unrolled $num x $num word multiply
# named $name. Result words 0 .. 2*$num-2 are produced one column at a time
# through mul_add_c, and the three accumulator registers are rotated after
# each column; the last accumulator word is stored after the loop.
# $v is the $pos value handed to mul_add_c: 0 inside a column, 1 on the last
# product of a column, 2 on the last product of the last column.
| 93 | sub bn_mul_comba | ||
| 94 | { | ||
| 95 | local($name,$num)=@_; | ||
| 96 | local($a,$b,$c0,$c1,$c2); | ||
| 97 | local($i,$as,$ae,$bs,$be,$ai,$bi); | ||
| 98 | local($tot,$end); | ||
| 99 | |||
| 100 | &function_begin_B($name,""); | ||
| 101 | |||
| 102 | $c0="ebx"; | ||
| 103 | $c1="ecx"; | ||
| 104 | $c2="ebp"; | ||
| 105 | $a="esi"; | ||
| 106 | $b="edi"; | ||
| 107 | |||
| 108 | $as=0; | ||
| 109 | $ae=0; | ||
| 110 | $bs=0; | ||
| 111 | $be=0; | ||
| 112 | $tot=$num+$num-1; | ||
| 113 | |||
| 114 | &push("esi"); | ||
| 115 | &mov($a,&wparam(1)); | ||
| 116 | &push("edi"); | ||
| 117 | &mov($b,&wparam(2)); | ||
| 118 | &push("ebp"); | ||
| 119 | &push("ebx"); | ||
| 120 | |||
| 121 | &xor($c0,$c0); | ||
| 122 | &mov("eax",&DWP(0,$a,"",0)); # load the first word of a | ||
| 123 | &xor($c1,$c1); | ||
| 124 | &mov("edx",&DWP(0,$b,"",0)); # load the first word of b | ||
| 125 | |||
| 126 | for ($i=0; $i<$tot; $i++) | ||
| 127 | { | ||
| 128 | $ai=$as; | ||
| 129 | $bi=$bs; | ||
| 130 | $end=$be+1; | ||
| 131 | |||
| 132 | &comment("################## Calculate word $i"); | ||
| 133 | |||
| 134 | for ($j=$bs; $j<$end; $j++) | ||
| 135 | { | ||
| 136 | &xor($c2,$c2) if ($j == $bs); | ||
| 137 | if (($j+1) == $end) | ||
| 138 | { | ||
| 139 | $v=1; | ||
| 140 | $v=2 if (($i+1) == $tot); | ||
| 141 | } | ||
| 142 | else | ||
| 143 | { $v=0; } | ||
| 144 | if (($j+1) != $end) | ||
| 145 | { | ||
| 146 | $na=($ai-1); | ||
| 147 | $nb=($bi+1); | ||
| 148 | } | ||
| 149 | else | ||
| 150 | { | ||
| 151 | $na=$as+($i < ($num-1)); | ||
| 152 | $nb=$bs+($i >= ($num-1)); | ||
| 153 | } | ||
| 154 | #printf STDERR "[$ai,$bi] -> [$na,$nb]\n"; | ||
| 155 | &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb); | ||
| 156 | if ($v) | ||
| 157 | { | ||
| 158 | &comment("saved r[$i]"); | ||
| 159 | # &mov("eax",&wparam(0)); | ||
| 160 | # &mov(&DWP($i*4,"eax","",0),$c0); | ||
| 161 | ($c0,$c1,$c2)=($c1,$c2,$c0); | ||
| 162 | } | ||
| 163 | $ai--; | ||
| 164 | $bi++; | ||
| 165 | } | ||
| 166 | $as++ if ($i < ($num-1)); | ||
| 167 | $ae++ if ($i >= ($num-1)); | ||
| 168 | |||
| 169 | $bs++ if ($i >= ($num-1)); | ||
| 170 | $be++ if ($i < ($num-1)); | ||
| 171 | } | ||
| 172 | &comment("save r[$i]"); | ||
| 173 | # &mov("eax",&wparam(0)); | ||
| 174 | &mov(&DWP($i*4,"eax","",0),$c0); | ||
| 175 | |||
| 176 | &pop("ebx"); | ||
| 177 | &pop("ebp"); | ||
| 178 | &pop("edi"); | ||
| 179 | &pop("esi"); | ||
| 180 | &ret(); | ||
| 181 | &function_end_B($name); | ||
| 182 | } | ||
| 183 | |||
| 184 | sub bn_sqr_comba | ||
| 185 | { | ||
| 186 | local($name,$num)=@_; | ||
| 187 | local($r,$a,$c0,$c1,$c2); # register names, assigned below (not taken from the argument list) | ||
| 188 | local($i,$as,$ae,$bs,$be,$ai,$bi); | ||
| 189 | local($b,$tot,$end,$half); | ||
| 190 | # Emit function $name: a fully unrolled, column-by-column squaring of a $num-word operand. | ||
| 191 | &function_begin_B($name,""); | ||
| 192 | # ($c2,$c1,$c0) is the three-word column accumulator; the roles rotate after every column. | ||
| 193 | $c0="ebx"; | ||
| 194 | $c1="ecx"; | ||
| 195 | $c2="ebp"; | ||
| 196 | $a="esi"; | ||
| 197 | $r="edi"; | ||
| 198 | |||
| 199 | &push("esi"); | ||
| 200 | &push("edi"); | ||
| 201 | &push("ebp"); | ||
| 202 | &push("ebx"); | ||
| 203 | &mov($r,&wparam(0)); | ||
| 204 | &mov($a,&wparam(1)); | ||
| 205 | &xor($c0,$c0); | ||
| 206 | &xor($c1,$c1); | ||
| 207 | &mov("eax",&DWP(0,$a,"",0)); # load the first word | ||
| 208 | # $as/$bs: indices of the first pair of a column; $be: last candidate index; $ae is counted but never read. | ||
| 209 | $as=0; | ||
| 210 | $ae=0; | ||
| 211 | $bs=0; | ||
| 212 | $be=0; | ||
| 213 | $tot=$num+$num-1; | ||
| 214 | |||
| 215 | for ($i=0; $i<$tot; $i++) | ||
| 216 | { | ||
| 217 | $ai=$as; | ||
| 218 | $bi=$bs; | ||
| 219 | $end=$be+1; | ||
| 220 | # Pairs ($ai,$bi) move towards each other; the column ends ($v != 0) once the next pair would cross. | ||
| 221 | &comment("############### Calculate word $i"); | ||
| 222 | for ($j=$bs; $j<$end; $j++) | ||
| 223 | { | ||
| 224 | &xor($c2,$c2) if ($j == $bs); | ||
| 225 | if (($ai-1) < ($bi+1)) | ||
| 226 | { | ||
| 227 | $v=1; | ||
| 228 | $v=2 if ($i+1) == $tot; | ||
| 229 | } | ||
| 230 | else | ||
| 231 | { $v=0; } | ||
| 232 | if (!$v) | ||
| 233 | { | ||
| 234 | $na=$ai-1; | ||
| 235 | $nb=$bi+1; | ||
| 236 | } | ||
| 237 | else | ||
| 238 | { | ||
| 239 | $na=$as+($i < ($num-1)); | ||
| 240 | $nb=$bs+($i >= ($num-1)); | ||
| 241 | } | ||
| 242 | if ($ai == $bi) | ||
| 243 | { | ||
| 244 | &sqr_add_c($r,$a,$ai,$bi, | ||
| 245 | $c0,$c1,$c2,$v,$i,$na,$nb); | ||
| 246 | } | ||
| 247 | else | ||
| 248 | { | ||
| 249 | &sqr_add_c2($r,$a,$ai,$bi, | ||
| 250 | $c0,$c1,$c2,$v,$i,$na,$nb); | ||
| 251 | } | ||
| 252 | if ($v) | ||
| 253 | { | ||
| 254 | &comment("saved r[$i]"); | ||
| 255 | #&mov(&DWP($i*4,$r,"",0),$c0); | ||
| 256 | ($c0,$c1,$c2)=($c1,$c2,$c0); | ||
| 257 | last; | ||
| 258 | } | ||
| 259 | $ai--; | ||
| 260 | $bi++; | ||
| 261 | } | ||
| 262 | $as++ if ($i < ($num-1)); | ||
| 263 | $ae++ if ($i >= ($num-1)); | ||
| 264 | |||
| 265 | $bs++ if ($i >= ($num-1)); | ||
| 266 | $be++ if ($i < ($num-1)); | ||
| 267 | } | ||
| 268 | &mov(&DWP($i*4,$r,"",0),$c0); | ||
| 269 | &pop("ebx"); | ||
| 270 | &pop("ebp"); | ||
| 271 | &pop("edi"); | ||
| 272 | &pop("esi"); | ||
| 273 | &ret(); | ||
| 274 | &function_end_B($name); | ||
| 275 | } | ||
| 276 | |||
| 277 | 1; | ||
diff --git a/src/lib/libcrypto/bn/asm/x86/div.pl b/src/lib/libcrypto/bn/asm/x86/div.pl deleted file mode 100644 index 0e90152caa..0000000000 --- a/src/lib/libcrypto/bn/asm/x86/div.pl +++ /dev/null | |||
| @@ -1,15 +0,0 @@ | |||
| 1 | #!/usr/local/bin/perl | ||
| 2 | # x86 assembler | ||
| 3 | |||
| 4 | sub bn_div_words | ||
| 5 | { | ||
| 6 | local($name)=@_; | ||
| 7 | # Emit function $name: one x86 div of the two-word value (1st arg : 2nd arg) by the 3rd arg. | ||
| 8 | &function_begin($name,""); | ||
| 9 | &mov("edx",&wparam(0)); # edx = 1st argument, the high word of the dividend | ||
| 10 | &mov("eax",&wparam(1)); # eax = 2nd argument, the low word of the dividend | ||
| 11 | &mov("ebx",&wparam(2)); # ebx = 3rd argument, the divisor | ||
| 12 | &div("ebx"); # eax = quotient, edx = remainder | ||
| 13 | &function_end($name); | ||
| 14 | } | ||
| 15 | 1; | ||
diff --git a/src/lib/libcrypto/bn/asm/x86/f b/src/lib/libcrypto/bn/asm/x86/f deleted file mode 100644 index 22e4112224..0000000000 --- a/src/lib/libcrypto/bn/asm/x86/f +++ /dev/null | |||
| @@ -1,3 +0,0 @@ | |||
| 1 | #!/usr/local/bin/perl | ||
| 2 | # x86 assembler | ||
| 3 | |||
diff --git a/src/lib/libcrypto/bn/asm/x86/mul.pl b/src/lib/libcrypto/bn/asm/x86/mul.pl deleted file mode 100644 index 674cb9b055..0000000000 --- a/src/lib/libcrypto/bn/asm/x86/mul.pl +++ /dev/null | |||
| @@ -1,77 +0,0 @@ | |||
| 1 | #!/usr/local/bin/perl | ||
| 2 | # x86 assembler | ||
| 3 | |||
| 4 | sub bn_mul_words | ||
| 5 | { | ||
| 6 | local($name)=@_; | ||
| 7 | |||
| 8 | &function_begin($name,""); | ||
| 9 | # Emit function $name(r,a,num,w): r[i] = low word of a[i]*w + carry; the last carry is left in eax. | ||
| 10 | &comment(""); | ||
| 11 | $Low="eax"; | ||
| 12 | $High="edx"; | ||
| 13 | $a="ebx"; | ||
| 14 | $w="ecx"; | ||
| 15 | $r="edi"; | ||
| 16 | $c="esi"; | ||
| 17 | $num="ebp"; | ||
| 18 | |||
| 19 | &xor($c,$c); # clear carry | ||
| 20 | &mov($r,&wparam(0)); # r = 1st argument | ||
| 21 | &mov($a,&wparam(1)); # a = 2nd argument | ||
| 22 | &mov($num,&wparam(2)); # num = 3rd argument | ||
| 23 | &mov($w,&wparam(3)); # w = 4th argument | ||
| 24 | |||
| 25 | &and($num,0xfffffff8); # num & ~7: words handled by the 8x unrolled loop | ||
| 26 | &jz(&label("mw_finish")); | ||
| 27 | |||
| 28 | &set_label("mw_loop",0); | ||
| 29 | for ($i=0; $i<32; $i+=4) | ||
| 30 | { | ||
| 31 | &comment("Round $i"); | ||
| 32 | |||
| 33 | &mov("eax",&DWP($i,$a,"",0)); # *a | ||
| 34 | &mul($w); # *a * w | ||
| 35 | &add("eax",$c); # L(t)+=c | ||
| 36 | # XXX | ||
| 37 | |||
| 38 | &adc("edx",0); # H(t)+=carry | ||
| 39 | &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t); | ||
| 40 | |||
| 41 | &mov($c,"edx"); # c= H(t); | ||
| 42 | } | ||
| 43 | |||
| 44 | &comment(""); | ||
| 45 | &add($a,32); | ||
| 46 | &add($r,32); | ||
| 47 | &sub($num,8); | ||
| 48 | &jz(&label("mw_finish")); | ||
| 49 | &jmp(&label("mw_loop")); | ||
| 50 | |||
| 51 | &set_label("mw_finish",0); | ||
| 52 | &mov($num,&wparam(2)); # get num | ||
| 53 | &and($num,7); | ||
| 54 | &jnz(&label("mw_finish2")); | ||
| 55 | &jmp(&label("mw_end")); | ||
| 56 | |||
| 57 | &set_label("mw_finish2",1); | ||
| 58 | for ($i=0; $i<7; $i++) | ||
| 59 | { | ||
| 60 | &comment("Tail Round $i"); | ||
| 61 | &mov("eax",&DWP($i*4,$a,"",0));# *a | ||
| 62 | &mul($w); # *a * w | ||
| 63 | &add("eax",$c); # L(t)+=c | ||
| 64 | # XXX | ||
| 65 | &adc("edx",0); # H(t)+=carry | ||
| 66 | &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t); | ||
| 67 | &mov($c,"edx"); # c= H(t); | ||
| 68 | &dec($num) if ($i != 7-1); | ||
| 69 | &jz(&label("mw_end")) if ($i != 7-1); | ||
| 70 | } | ||
| 71 | &set_label("mw_end",0); | ||
| 72 | &mov("eax",$c); | ||
| 73 | |||
| 74 | &function_end($name); | ||
| 75 | } | ||
| 76 | |||
| 77 | 1; | ||
diff --git a/src/lib/libcrypto/bn/asm/x86/mul_add.pl b/src/lib/libcrypto/bn/asm/x86/mul_add.pl deleted file mode 100644 index 61830d3a90..0000000000 --- a/src/lib/libcrypto/bn/asm/x86/mul_add.pl +++ /dev/null | |||
| @@ -1,87 +0,0 @@ | |||
| 1 | #!/usr/local/bin/perl | ||
| 2 | # x86 assembler | ||
| 3 | |||
| 4 | sub bn_mul_add_words | ||
| 5 | { | ||
| 6 | local($name)=@_; | ||
| 7 | |||
| 8 | &function_begin($name,""); | ||
| 9 | # Emit function $name(r,a,num,w): r[i] += a[i]*w with carry propagation; the last carry is left in eax. | ||
| 10 | &comment(""); | ||
| 11 | $Low="eax"; | ||
| 12 | $High="edx"; | ||
| 13 | $a="ebx"; | ||
| 14 | $w="ebp"; | ||
| 15 | $r="edi"; | ||
| 16 | $c="esi"; | ||
| 17 | |||
| 18 | &xor($c,$c); # clear carry | ||
| 19 | &mov($r,&wparam(0)); # r = 1st argument | ||
| 20 | |||
| 21 | &mov("ecx",&wparam(2)); # ecx = num (3rd argument) | ||
| 22 | &mov($a,&wparam(1)); # a = 2nd argument | ||
| 23 | |||
| 24 | &and("ecx",0xfffffff8); # num & ~7: words handled by the 8x unrolled loop | ||
| 25 | &mov($w,&wparam(3)); # w = 4th argument | ||
| 26 | |||
| 27 | &push("ecx"); # reserve a stack slot for the loop counter; the jz below still tests the flags of the and | ||
| 28 | |||
| 29 | &jz(&label("maw_finish")); | ||
| 30 | |||
| 31 | &set_label("maw_loop",0); | ||
| 32 | |||
| 33 | &mov(&swtmp(0),"ecx"); # save the remaining word count in the stack slot | ||
| 34 | |||
| 35 | for ($i=0; $i<32; $i+=4) | ||
| 36 | { | ||
| 37 | &comment("Round $i"); | ||
| 38 | |||
| 39 | &mov("eax",&DWP($i,$a,"",0)); # *a | ||
| 40 | &mul($w); # *a * w | ||
| 41 | &add("eax",$c); # L(t)+=c | ||
| 42 | &mov($c,&DWP($i,$r,"",0)); # c = *r (the old carry is already added in) | ||
| 43 | &adc("edx",0); # H(t)+=carry | ||
| 44 | &add("eax",$c); # L(t)+= *r | ||
| 45 | &adc("edx",0); # H(t)+=carry | ||
| 46 | &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t); | ||
| 47 | &mov($c,"edx"); # c= H(t); | ||
| 48 | } | ||
| 49 | |||
| 50 | &comment(""); | ||
| 51 | &mov("ecx",&swtmp(0)); # reload the remaining word count | ||
| 52 | &add($a,32); | ||
| 53 | &add($r,32); | ||
| 54 | &sub("ecx",8); | ||
| 55 | &jnz(&label("maw_loop")); | ||
| 56 | |||
| 57 | &set_label("maw_finish",0); | ||
| 58 | &mov("ecx",&wparam(2)); # get num | ||
| 59 | &and("ecx",7); | ||
| 60 | &jnz(&label("maw_finish2")); # helps branch prediction | ||
| 61 | &jmp(&label("maw_end")); | ||
| 62 | |||
| 63 | &set_label("maw_finish2",1); | ||
| 64 | for ($i=0; $i<7; $i++) | ||
| 65 | { | ||
| 66 | &comment("Tail Round $i"); | ||
| 67 | &mov("eax",&DWP($i*4,$a,"",0));# *a | ||
| 68 | &mul($w); # *a * w | ||
| 69 | &add("eax",$c); # L(t)+=c | ||
| 70 | &mov($c,&DWP($i*4,$r,"",0)); # c = *r (the old carry is already added in) | ||
| 71 | &adc("edx",0); # H(t)+=carry | ||
| 72 | &add("eax",$c); # L(t)+= *r | ||
| 73 | &adc("edx",0); # H(t)+=carry | ||
| 74 | &dec("ecx") if ($i != 7-1); | ||
| 75 | &mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t); | ||
| 76 | &mov($c,"edx"); # c= H(t); | ||
| 77 | &jz(&label("maw_end")) if ($i != 7-1); | ||
| 78 | } | ||
| 79 | &set_label("maw_end",0); | ||
| 80 | &mov("eax",$c); | ||
| 81 | |||
| 82 | &pop("ecx"); # release the temporary stack slot | ||
| 83 | |||
| 84 | &function_end($name); | ||
| 85 | } | ||
| 86 | |||
| 87 | 1; | ||
diff --git a/src/lib/libcrypto/bn/asm/x86/sqr.pl b/src/lib/libcrypto/bn/asm/x86/sqr.pl deleted file mode 100644 index 1f90993cf6..0000000000 --- a/src/lib/libcrypto/bn/asm/x86/sqr.pl +++ /dev/null | |||
| @@ -1,60 +0,0 @@ | |||
| 1 | #!/usr/local/bin/perl | ||
| 2 | # x86 assembler | ||
| 3 | |||
| 4 | sub bn_sqr_words | ||
| 5 | { | ||
| 6 | local($name)=@_; | ||
| 7 | |||
| 8 | &function_begin($name,""); | ||
| 9 | # Emit function $name(r,a,num): for each of num words, store the two-word square of a[i] at r[2i], r[2i+1]. | ||
| 10 | &comment(""); | ||
| 11 | $r="esi"; | ||
| 12 | $a="edi"; | ||
| 13 | $num="ebx"; | ||
| 14 | |||
| 15 | &mov($r,&wparam(0)); # r = 1st argument | ||
| 16 | &mov($a,&wparam(1)); # a = 2nd argument | ||
| 17 | &mov($num,&wparam(2)); # num = 3rd argument | ||
| 18 | |||
| 19 | &and($num,0xfffffff8); # num & ~7: words handled by the 8x unrolled loop | ||
| 20 | &jz(&label("sw_finish")); | ||
| 21 | |||
| 22 | &set_label("sw_loop",0); | ||
| 23 | for ($i=0; $i<32; $i+=4) | ||
| 24 | { | ||
| 25 | &comment("Round $i"); | ||
| 26 | &mov("eax",&DWP($i,$a,"",0)); # *a | ||
| 27 | # XXX | ||
| 28 | &mul("eax"); # *a * *a | ||
| 29 | &mov(&DWP($i*2,$r,"",0),"eax"); # low word of the square | ||
| 30 | &mov(&DWP($i*2+4,$r,"",0),"edx");# high word of the square | ||
| 31 | } | ||
| 32 | |||
| 33 | &comment(""); | ||
| 34 | &add($a,32); | ||
| 35 | &add($r,64); | ||
| 36 | &sub($num,8); | ||
| 37 | &jnz(&label("sw_loop")); | ||
| 38 | |||
| 39 | &set_label("sw_finish",0); | ||
| 40 | &mov($num,&wparam(2)); # get num | ||
| 41 | &and($num,7); | ||
| 42 | &jz(&label("sw_end")); | ||
| 43 | |||
| 44 | for ($i=0; $i<7; $i++) | ||
| 45 | { | ||
| 46 | &comment("Tail Round $i"); | ||
| 47 | &mov("eax",&DWP($i*4,$a,"",0)); # *a | ||
| 48 | # XXX | ||
| 49 | &mul("eax"); # *a * *a | ||
| 50 | &mov(&DWP($i*8,$r,"",0),"eax"); # low word of the square | ||
| 51 | &dec($num) if ($i != 7-1); | ||
| 52 | &mov(&DWP($i*8+4,$r,"",0),"edx"); | ||
| 53 | &jz(&label("sw_end")) if ($i != 7-1); | ||
| 54 | } | ||
| 55 | &set_label("sw_end",0); | ||
| 56 | |||
| 57 | &function_end($name); | ||
| 58 | } | ||
| 59 | |||
| 60 | 1; | ||
diff --git a/src/lib/libcrypto/bn/asm/x86/sub.pl b/src/lib/libcrypto/bn/asm/x86/sub.pl deleted file mode 100644 index 837b0e1b07..0000000000 --- a/src/lib/libcrypto/bn/asm/x86/sub.pl +++ /dev/null | |||
| @@ -1,76 +0,0 @@ | |||
| 1 | #!/usr/local/bin/perl | ||
| 2 | # x86 assembler | ||
| 3 | |||
| 4 | sub bn_sub_words | ||
| 5 | { | ||
| 6 | local($name)=@_; | ||
| 7 | |||
| 8 | &function_begin($name,""); | ||
| 9 | # Emit function $name(r,a,b,num): r[i] = a[i] - b[i] - borrow; the final borrow is left in eax. | ||
| 10 | &comment(""); | ||
| 11 | $a="esi"; | ||
| 12 | $b="edi"; | ||
| 13 | $c="eax"; | ||
| 14 | $r="ebx"; | ||
| 15 | $tmp1="ecx"; | ||
| 16 | $tmp2="edx"; | ||
| 17 | $num="ebp"; | ||
| 18 | |||
| 19 | &mov($r,&wparam(0)); # get r | ||
| 20 | &mov($a,&wparam(1)); # get a | ||
| 21 | &mov($b,&wparam(2)); # get b | ||
| 22 | &mov($num,&wparam(3)); # get num | ||
| 23 | &xor($c,$c); # clear borrow | ||
| 24 | &and($num,0xfffffff8); # num & ~7: words handled by the 8x unrolled loop | ||
| 25 | # NOTE(review): the labels use an "aw_" prefix although this is the subtract routine -- presumably carried over from the add variant. | ||
| 26 | &jz(&label("aw_finish")); | ||
| 27 | |||
| 28 | &set_label("aw_loop",0); | ||
| 29 | for ($i=0; $i<8; $i++) | ||
| 30 | { | ||
| 31 | &comment("Round $i"); | ||
| 32 | # Two-step subtract: first the incoming borrow, then *b; $c collects the borrow of either step. | ||
| 33 | &mov($tmp1,&DWP($i*4,$a,"",0)); # *a | ||
| 34 | &mov($tmp2,&DWP($i*4,$b,"",0)); # *b | ||
| 35 | &sub($tmp1,$c); | ||
| 36 | &mov($c,0); | ||
| 37 | &adc($c,$c); | ||
| 38 | &sub($tmp1,$tmp2); | ||
| 39 | &adc($c,0); | ||
| 40 | &mov(&DWP($i*4,$r,"",0),$tmp1); # *r | ||
| 41 | } | ||
| 42 | |||
| 43 | &comment(""); | ||
| 44 | &add($a,32); | ||
| 45 | &add($b,32); | ||
| 46 | &add($r,32); | ||
| 47 | &sub($num,8); | ||
| 48 | &jnz(&label("aw_loop")); | ||
| 49 | |||
| 50 | &set_label("aw_finish",0); | ||
| 51 | &mov($num,&wparam(3)); # get num | ||
| 52 | &and($num,7); | ||
| 53 | &jz(&label("aw_end")); | ||
| 54 | |||
| 55 | for ($i=0; $i<7; $i++) | ||
| 56 | { | ||
| 57 | &comment("Tail Round $i"); | ||
| 58 | &mov($tmp1,&DWP($i*4,$a,"",0)); # *a | ||
| 59 | &mov($tmp2,&DWP($i*4,$b,"",0));# *b | ||
| 60 | &sub($tmp1,$c); | ||
| 61 | &mov($c,0); | ||
| 62 | &adc($c,$c); | ||
| 63 | &sub($tmp1,$tmp2); | ||
| 64 | &adc($c,0); | ||
| 65 | &dec($num) if ($i != 6); | ||
| 66 | &mov(&DWP($i*4,$r,"",0),$tmp1); # *r | ||
| 67 | &jz(&label("aw_end")) if ($i != 6); | ||
| 68 | } | ||
| 69 | &set_label("aw_end",0); | ||
| 70 | |||
| 71 | # &mov("eax",$c); # $c is "eax" | ||
| 72 | |||
| 73 | &function_end($name); | ||
| 74 | } | ||
| 75 | |||
| 76 | 1; | ||
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gcc.c b/src/lib/libcrypto/bn/asm/x86_64-gcc.c deleted file mode 100644 index acb0b40118..0000000000 --- a/src/lib/libcrypto/bn/asm/x86_64-gcc.c +++ /dev/null | |||
| @@ -1,606 +0,0 @@ | |||
| 1 | #include "../bn_lcl.h" | ||
| 2 | #if !(defined(__GNUC__) && __GNUC__>=2) | ||
| 3 | # include "../bn_asm.c" /* kind of dirty hack for Sun Studio */ | ||
| 4 | #else | ||
| 5 | /* | ||
| 6 | * x86_64 BIGNUM accelerator version 0.1, December 2002. | ||
| 7 | * | ||
| 8 | * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 9 | * project. | ||
| 10 | * | ||
| 11 | * Rights for redistribution and usage in source and binary forms are | ||
| 12 | * granted according to the OpenSSL license. Warranty of any kind is | ||
| 13 | * disclaimed. | ||
| 14 | * | ||
| 15 | * Q. Version 0.1? It doesn't sound like Andy, he used to assign real | ||
| 16 | * versions, like 1.0... | ||
| 17 | * A. Well, that's because this code is basically a quick-n-dirty | ||
| 18 | * proof-of-concept hack. As you can see it's implemented with | ||
| 19 | * inline assembler, which means that you're bound to GCC and that | ||
| 20 | * there might be enough room for further improvement. | ||
| 21 | * | ||
| 22 | * Q. Why inline assembler? | ||
| 23 | * A. x86_64 features own ABI which I'm not familiar with. This is | ||
| 24 | * why I decided to let the compiler take care of subroutine | ||
| 25 | * prologue/epilogue as well as register allocation. For reference. | ||
| 26 | * Win64 implements different ABI for AMD64, different from Linux. | ||
| 27 | * | ||
| 28 | * Q. How much faster does it get? | ||
| 29 | * A. 'apps/openssl speed rsa dsa' output with no-asm: | ||
| 30 | * | ||
| 31 | * sign verify sign/s verify/s | ||
| 32 | * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2 | ||
| 33 | * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0 | ||
| 34 | * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8 | ||
| 35 | * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6 | ||
| 36 | * sign verify sign/s verify/s | ||
| 37 | * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3 | ||
| 38 | * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2 | ||
| 39 | * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0 | ||
| 40 | * | ||
| 41 | * 'apps/openssl speed rsa dsa' output with this module: | ||
| 42 | * | ||
| 43 | * sign verify sign/s verify/s | ||
| 44 | * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9 | ||
| 45 | * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7 | ||
| 46 | * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0 | ||
| 47 | * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8 | ||
| 48 | * sign verify sign/s verify/s | ||
| 49 | * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3 | ||
| 50 | * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4 | ||
| 51 | * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6 | ||
| 52 | * | ||
| 53 | * For the reference. IA-32 assembler implementation performs | ||
| 54 | * very much like 64-bit code compiled with no-asm on the same | ||
| 55 | * machine. | ||
| 56 | */ | ||
| 57 | |||
| 58 | #ifdef _WIN64 | ||
| 59 | #define BN_ULONG unsigned long long | ||
| 60 | #else | ||
| 61 | #define BN_ULONG unsigned long | ||
| 62 | #endif | ||
| 63 | |||
| 64 | #undef mul | ||
| 65 | #undef mul_add | ||
| 66 | #undef sqr | ||
| 67 | |||
| 68 | /* | ||
| 69 | * "m"(a), "+m"(r) is the way to favor DirectPath µ-code; | ||
| 70 | * "g"(0) lets the compiler decide where it | ||
| 71 | * wants to keep the value of zero; | ||
| 72 | */ | ||
| 73 | #define mul_add(r,a,word,carry) do { /* r += a*word + carry; carry = high word of the sum */ \ | ||
| 74 | register BN_ULONG high,low; \ | ||
| 75 | asm ("mulq %3" /* high:low = word * a */ \ | ||
| 76 | : "=a"(low),"=d"(high) \ | ||
| 77 | : "a"(word),"m"(a) \ | ||
| 78 | : "cc"); \ | ||
| 79 | asm ("addq %2,%0; adcq %3,%1" /* carry += low; high += CF */ \ | ||
| 80 | : "+r"(carry),"+d"(high)\ | ||
| 81 | : "a"(low),"g"(0) \ | ||
| 82 | : "cc"); \ | ||
| 83 | asm ("addq %2,%0; adcq %3,%1" /* r += carry; high += CF */ \ | ||
| 84 | : "+m"(r),"+d"(high) \ | ||
| 85 | : "r"(carry),"g"(0) \ | ||
| 86 | : "cc"); \ | ||
| 87 | carry=high; \ | ||
| 88 | } while (0) | ||
| 89 | |||
| 90 | #define mul(r,a,word,carry) do { /* r = low word of a*word + carry; carry = high word */ \ | ||
| 91 | register BN_ULONG high,low; \ | ||
| 92 | asm ("mulq %3" /* high:low = word * a */ \ | ||
| 93 | : "=a"(low),"=d"(high) \ | ||
| 94 | : "a"(word),"m"(a) /* "m", not "g": mulq has no immediate form */ \ | ||
| 95 | : "cc"); \ | ||
| 96 | asm ("addq %2,%0; adcq %3,%1" /* carry += low; high += CF */ \ | ||
| 97 | : "+r"(carry),"+d"(high)\ | ||
| 98 | : "a"(low),"g"(0) \ | ||
| 99 | : "cc"); \ | ||
| 100 | (r)=carry, carry=high; \ | ||
| 101 | } while (0) | ||
| 102 | |||
| 103 | #define sqr(r0,r1,a) /* r1:r0 = a*a; the macro already ends in ';' */ \ | ||
| 104 | asm ("mulq %2" \ | ||
| 105 | : "=a"(r0),"=d"(r1) \ | ||
| 106 | : "a"(a) \ | ||
| 107 | : "cc"); | ||
| 108 | |||
| 109 | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) | ||
| 110 | { | ||
| 111 | BN_ULONG c1=0; | ||
| 112 | /* rp[i] += ap[i]*w for i < num, propagating the carry in c1; returns the final carry word. */ | ||
| 113 | if (num <= 0) return(c1); | ||
| 114 | |||
| 115 | while (num&~3) /* true while at least 4 words remain */ | ||
| 116 | { | ||
| 117 | mul_add(rp[0],ap[0],w,c1); | ||
| 118 | mul_add(rp[1],ap[1],w,c1); | ||
| 119 | mul_add(rp[2],ap[2],w,c1); | ||
| 120 | mul_add(rp[3],ap[3],w,c1); | ||
| 121 | ap+=4; rp+=4; num-=4; | ||
| 122 | } | ||
| 123 | if (num) /* 1 to 3 leftover words */ | ||
| 124 | { | ||
| 125 | mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1; | ||
| 126 | mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1; | ||
| 127 | mul_add(rp[2],ap[2],w,c1); return c1; | ||
| 128 | } | ||
| 129 | |||
| 130 | return(c1); | ||
| 131 | } | ||
| 132 | |||
| 133 | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) | ||
| 134 | { | ||
| 135 | BN_ULONG c1=0; | ||
| 136 | /* rp[i] = low word of ap[i]*w + carry for i < num; returns the final carry word. */ | ||
| 137 | if (num <= 0) return(c1); | ||
| 138 | |||
| 139 | while (num&~3) /* true while at least 4 words remain */ | ||
| 140 | { | ||
| 141 | mul(rp[0],ap[0],w,c1); | ||
| 142 | mul(rp[1],ap[1],w,c1); | ||
| 143 | mul(rp[2],ap[2],w,c1); | ||
| 144 | mul(rp[3],ap[3],w,c1); | ||
| 145 | ap+=4; rp+=4; num-=4; | ||
| 146 | } | ||
| 147 | if (num) /* 1 to 3 leftover words */ | ||
| 148 | { | ||
| 149 | mul(rp[0],ap[0],w,c1); if (--num == 0) return c1; | ||
| 150 | mul(rp[1],ap[1],w,c1); if (--num == 0) return c1; | ||
| 151 | mul(rp[2],ap[2],w,c1); | ||
| 152 | } | ||
| 153 | return(c1); | ||
| 154 | } | ||
| 155 | |||
| 156 | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) | ||
| 157 | { | ||
| 158 | if (n <= 0) return; | ||
| 159 | /* For each i < n: (r[2i+1]:r[2i]) = a[i]*a[i]; r therefore receives 2*n words. */ | ||
| 160 | while (n&~3) /* true while at least 4 words remain */ | ||
| 161 | { | ||
| 162 | sqr(r[0],r[1],a[0]); | ||
| 163 | sqr(r[2],r[3],a[1]); | ||
| 164 | sqr(r[4],r[5],a[2]); | ||
| 165 | sqr(r[6],r[7],a[3]); | ||
| 166 | a+=4; r+=8; n-=4; | ||
| 167 | } | ||
| 168 | if (n) /* 1 to 3 leftover words */ | ||
| 169 | { | ||
| 170 | sqr(r[0],r[1],a[0]); if (--n == 0) return; | ||
| 171 | sqr(r[2],r[3],a[1]); if (--n == 0) return; | ||
| 172 | sqr(r[4],r[5],a[2]); | ||
| 173 | } | ||
| 174 | } | ||
| 175 | |||
| 176 | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) | ||
| 177 | { BN_ULONG ret,waste; | ||
| 178 | /* Returns the quotient of the two-word value h:l divided by d; the remainder (waste) is discarded. */ | ||
| 179 | asm ("divq %4" | ||
| 180 | : "=a"(ret),"=d"(waste) | ||
| 181 | : "a"(l),"d"(h),"rm"(d) /* "rm", not "g": divq has no immediate form */ | ||
| 182 | : "cc"); | ||
| 183 | |||
| 184 | return ret; | ||
| 185 | } | ||
| 186 | |||
| 187 | BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n) | ||
| 188 | { BN_ULONG ret=0,i=0,cnt; /* cnt: full-width loop count, because "loop" counts in all of %rcx */ | ||
| 189 | /* rp[k] = ap[k] + bp[k] + carry for k < n; returns the carry out of the top word (0 or 1). */ | ||
| 190 | if (n <= 0) return 0; | ||
| 191 | cnt=(BN_ULONG)n; /* n > 0 here, so widening cannot change the value */ | ||
| 192 | asm volatile ( | ||
| 193 | " subq %2,%2 \n" /* i = 0 and CF = 0 */ | ||
| 194 | ".p2align 4 \n" | ||
| 195 | "1: movq (%4,%2,8),%0 \n" | ||
| 196 | " adcq (%5,%2,8),%0 \n" | ||
| 197 | " movq %0,(%3,%2,8) \n" | ||
| 198 | " leaq 1(%2),%2 \n" /* lea and loop leave CF untouched */ | ||
| 199 | " loop 1b \n" | ||
| 200 | " sbbq %0,%0 \n" /* ret = CF ? all ones : 0 */ | ||
| 201 | : "=&a"(ret),"+c"(cnt),"=&r"(i) | ||
| 202 | : "r"(rp),"r"(ap),"r"(bp) | ||
| 203 | : "cc","memory" /* the loop stores to rp[] behind the compiler's back */ | ||
| 204 | ); | ||
| 205 | |||
| 206 | return ret&1; | ||
| 207 | } | ||
| 208 | |||
| 209 | #ifndef SIMICS | ||
| 210 | BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n) | ||
| 211 | { BN_ULONG ret=0,i=0,cnt; /* cnt: full-width loop count, because "loop" counts in all of %rcx */ | ||
| 212 | /* rp[k] = ap[k] - bp[k] - borrow for k < n; returns the borrow out of the top word (0 or 1). */ | ||
| 213 | if (n <= 0) return 0; | ||
| 214 | cnt=(BN_ULONG)n; /* n > 0 here, so widening cannot change the value */ | ||
| 215 | asm volatile ( | ||
| 216 | " subq %2,%2 \n" /* i = 0 and CF = 0 */ | ||
| 217 | ".p2align 4 \n" | ||
| 218 | "1: movq (%4,%2,8),%0 \n" | ||
| 219 | " sbbq (%5,%2,8),%0 \n" | ||
| 220 | " movq %0,(%3,%2,8) \n" | ||
| 221 | " leaq 1(%2),%2 \n" /* lea and loop leave CF untouched */ | ||
| 222 | " loop 1b \n" | ||
| 223 | " sbbq %0,%0 \n" /* ret = CF ? all ones : 0 */ | ||
| 224 | : "=&a"(ret),"+c"(cnt),"=&r"(i) | ||
| 225 | : "r"(rp),"r"(ap),"r"(bp) | ||
| 226 | : "cc","memory" /* the loop stores to rp[] behind the compiler's back */ | ||
| 227 | ); | ||
| 228 | |||
| 229 | return ret&1; | ||
| 230 | } | ||
| 231 | #else | ||
| 232 | /* Simics 1.4<7 has buggy sbbq:-( */ | ||
| 233 | #define BN_MASK2 0xffffffffffffffffL | ||
| 234 | BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) | ||
| 235 | { | ||
| 236 | BN_ULONG t1,t2; | ||
| 237 | int c=0; /* running borrow, 0 or 1 */ | ||
| 238 | /* Plain C variant: r[k] = a[k] - b[k] - borrow for k < n, four words per loop pass; returns the final borrow. */ | ||
| 239 | if (n <= 0) return((BN_ULONG)0); | ||
| 240 | |||
| 241 | for (;;) | ||
| 242 | { | ||
| 243 | t1=a[0]; t2=b[0]; | ||
| 244 | r[0]=(t1-t2-c)&BN_MASK2; | ||
| 245 | if (t1 != t2) c=(t1 < t2); /* when t1 == t2 the incoming borrow passes through unchanged */ | ||
| 246 | if (--n <= 0) break; | ||
| 247 | |||
| 248 | t1=a[1]; t2=b[1]; | ||
| 249 | r[1]=(t1-t2-c)&BN_MASK2; | ||
| 250 | if (t1 != t2) c=(t1 < t2); | ||
| 251 | if (--n <= 0) break; | ||
| 252 | |||
| 253 | t1=a[2]; t2=b[2]; | ||
| 254 | r[2]=(t1-t2-c)&BN_MASK2; | ||
| 255 | if (t1 != t2) c=(t1 < t2); | ||
| 256 | if (--n <= 0) break; | ||
| 257 | |||
| 258 | t1=a[3]; t2=b[3]; | ||
| 259 | r[3]=(t1-t2-c)&BN_MASK2; | ||
| 260 | if (t1 != t2) c=(t1 < t2); | ||
| 261 | if (--n <= 0) break; | ||
| 262 | |||
| 263 | a+=4; | ||
| 264 | b+=4; | ||
| 265 | r+=4; | ||
| 266 | } | ||
| 267 | return(c); | ||
| 268 | } | ||
| 269 | #endif | ||
| 270 | |||
| 271 | /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ | ||
| 272 | /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ | ||
| 273 | /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ | ||
| 274 | /* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */ | ||
| 275 | |||
| 276 | #if 0 | ||
| 277 | /* original macros are kept for reference purposes */ | ||
| 278 | #define mul_add_c(a,b,c0,c1,c2) { /* portable form; t1,t2 come from the enclosing function */ \ | ||
| 279 | BN_ULONG ta=(a),tb=(b); \ | ||
| 280 | t1 = ta * tb; \ | ||
| 281 | t2 = BN_UMULT_HIGH(ta,tb); \ | ||
| 282 | c0 += t1; t2 += (c0<t1)?1:0; \ | ||
| 283 | c1 += t2; c2 += (c1<t2)?1:0; \ | ||
| 284 | } | ||
| 285 | |||
| 286 | #define mul_add_c2(a,b,c0,c1,c2) { /* portable form of c += 2*a*b */ \ | ||
| 287 | BN_ULONG ta=(a),tb=(b),t0; \ | ||
| 288 | t1 = BN_UMULT_HIGH(ta,tb); \ | ||
| 289 | t0 = ta * tb; \ | ||
| 290 | t2 = t1+t1; c2 += (t2<t1)?1:0; \ | ||
| 291 | t1 = t0+t0; t2 += (t1<t0)?1:0; \ | ||
| 292 | c0 += t1; t2 += (c0<t1)?1:0; \ | ||
| 293 | c1 += t2; c2 += (c1<t2)?1:0; \ | ||
| 294 | } | ||
| 295 | #else | ||
| 296 | #define mul_add_c(a,b,c0,c1,c2) do { /* (c2,c1,c0) += a*b; uses the caller's t1,t2 as scratch */ \ | ||
| 297 | asm ("mulq %3" /* t2:t1 = a*b */ \ | ||
| 298 | : "=a"(t1),"=d"(t2) \ | ||
| 299 | : "a"(a),"m"(b) \ | ||
| 300 | : "cc"); \ | ||
| 301 | asm ("addq %2,%0; adcq %3,%1" /* c0 += t1; t2 += CF */ \ | ||
| 302 | : "+r"(c0),"+d"(t2) \ | ||
| 303 | : "a"(t1),"g"(0) \ | ||
| 304 | : "cc"); \ | ||
| 305 | asm ("addq %2,%0; adcq %3,%1" /* c1 += t2; c2 += CF */ \ | ||
| 306 | : "+r"(c1),"+r"(c2) \ | ||
| 307 | : "d"(t2),"g"(0) \ | ||
| 308 | : "cc"); \ | ||
| 309 | } while (0) | ||
| 310 | |||
| 311 | #define sqr_add_c(a,i,c0,c1,c2) do { /* (c2,c1,c0) += a[i]*a[i]; uses the caller's t1,t2 as scratch */ \ | ||
| 312 | asm ("mulq %2" /* %2 is the rax input itself, so t2:t1 = a[i]*a[i] */ \ | ||
| 313 | : "=a"(t1),"=d"(t2) \ | ||
| 314 | : "a"(a[i]) \ | ||
| 315 | : "cc"); \ | ||
| 316 | asm ("addq %2,%0; adcq %3,%1" /* c0 += t1; t2 += CF */ \ | ||
| 317 | : "+r"(c0),"+d"(t2) \ | ||
| 318 | : "a"(t1),"g"(0) \ | ||
| 319 | : "cc"); \ | ||
| 320 | asm ("addq %2,%0; adcq %3,%1" /* c1 += t2; c2 += CF */ \ | ||
| 321 | : "+r"(c1),"+r"(c2) \ | ||
| 322 | : "d"(t2),"g"(0) \ | ||
| 323 | : "cc"); \ | ||
| 324 | } while (0) | ||
| 325 | |||
| 326 | #define mul_add_c2(a,b,c0,c1,c2) do { /* (c2,c1,c0) += 2*a*b; uses the caller's t1,t2 as scratch */ \ | ||
| 327 | asm ("mulq %3" /* t2:t1 = a*b */ \ | ||
| 328 | : "=a"(t1),"=d"(t2) \ | ||
| 329 | : "a"(a),"m"(b) \ | ||
| 330 | : "cc"); \ | ||
| 331 | asm ("addq %0,%0; adcq %2,%1" /* t2 *= 2; the bit shifted out goes to c2 */ \ | ||
| 332 | : "+d"(t2),"+r"(c2) \ | ||
| 333 | : "g"(0) \ | ||
| 334 | : "cc"); \ | ||
| 335 | asm ("addq %0,%0; adcq %2,%1" /* t1 *= 2; the bit shifted out goes to t2 */ \ | ||
| 336 | : "+a"(t1),"+d"(t2) \ | ||
| 337 | : "g"(0) \ | ||
| 338 | : "cc"); \ | ||
| 339 | asm ("addq %2,%0; adcq %3,%1" /* c0 += t1; t2 += CF */ \ | ||
| 340 | : "+r"(c0),"+d"(t2) \ | ||
| 341 | : "a"(t1),"g"(0) \ | ||
| 342 | : "cc"); \ | ||
| 343 | asm ("addq %2,%0; adcq %3,%1" /* c1 += t2; c2 += CF */ \ | ||
| 344 | : "+r"(c1),"+r"(c2) \ | ||
| 345 | : "d"(t2),"g"(0) \ | ||
| 346 | : "cc"); \ | ||
| 347 | } while (0) | ||
| 348 | #endif | ||
| 349 | |||
| 350 | #define sqr_add_c2(a,i,j,c0,c1,c2) /* (c2,c1,c0) += 2*a[i]*a[j] */ \ | ||
| 351 | mul_add_c2((a)[i],(a)[j],c0,c1,c2) | ||
| 352 | |||
| 353 | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | ||
| 354 | { | ||
| 355 | BN_ULONG t1,t2; /* scratch used inside the mul_add_c macro */ | ||
| 356 | BN_ULONG c1,c2,c3; | ||
| 357 | /* r[0..15] = a[0..7] * b[0..7], one result word (column) at a time; c1,c2,c3 rotate as the three-word accumulator. */ | ||
| 358 | c1=0; | ||
| 359 | c2=0; | ||
| 360 | c3=0; | ||
| 361 | mul_add_c(a[0],b[0],c1,c2,c3); | ||
| 362 | r[0]=c1; | ||
| 363 | c1=0; | ||
| 364 | mul_add_c(a[0],b[1],c2,c3,c1); | ||
| 365 | mul_add_c(a[1],b[0],c2,c3,c1); | ||
| 366 | r[1]=c2; | ||
| 367 | c2=0; | ||
| 368 | mul_add_c(a[2],b[0],c3,c1,c2); | ||
| 369 | mul_add_c(a[1],b[1],c3,c1,c2); | ||
| 370 | mul_add_c(a[0],b[2],c3,c1,c2); | ||
| 371 | r[2]=c3; | ||
| 372 | c3=0; | ||
| 373 | mul_add_c(a[0],b[3],c1,c2,c3); | ||
| 374 | mul_add_c(a[1],b[2],c1,c2,c3); | ||
| 375 | mul_add_c(a[2],b[1],c1,c2,c3); | ||
| 376 | mul_add_c(a[3],b[0],c1,c2,c3); | ||
| 377 | r[3]=c1; | ||
| 378 | c1=0; | ||
| 379 | mul_add_c(a[4],b[0],c2,c3,c1); | ||
| 380 | mul_add_c(a[3],b[1],c2,c3,c1); | ||
| 381 | mul_add_c(a[2],b[2],c2,c3,c1); | ||
| 382 | mul_add_c(a[1],b[3],c2,c3,c1); | ||
| 383 | mul_add_c(a[0],b[4],c2,c3,c1); | ||
| 384 | r[4]=c2; | ||
| 385 | c2=0; | ||
| 386 | mul_add_c(a[0],b[5],c3,c1,c2); | ||
| 387 | mul_add_c(a[1],b[4],c3,c1,c2); | ||
| 388 | mul_add_c(a[2],b[3],c3,c1,c2); | ||
| 389 | mul_add_c(a[3],b[2],c3,c1,c2); | ||
| 390 | mul_add_c(a[4],b[1],c3,c1,c2); | ||
| 391 | mul_add_c(a[5],b[0],c3,c1,c2); | ||
| 392 | r[5]=c3; | ||
| 393 | c3=0; | ||
| 394 | mul_add_c(a[6],b[0],c1,c2,c3); | ||
| 395 | mul_add_c(a[5],b[1],c1,c2,c3); | ||
| 396 | mul_add_c(a[4],b[2],c1,c2,c3); | ||
| 397 | mul_add_c(a[3],b[3],c1,c2,c3); | ||
| 398 | mul_add_c(a[2],b[4],c1,c2,c3); | ||
| 399 | mul_add_c(a[1],b[5],c1,c2,c3); | ||
| 400 | mul_add_c(a[0],b[6],c1,c2,c3); | ||
| 401 | r[6]=c1; | ||
| 402 | c1=0; | ||
| 403 | mul_add_c(a[0],b[7],c2,c3,c1); | ||
| 404 | mul_add_c(a[1],b[6],c2,c3,c1); | ||
| 405 | mul_add_c(a[2],b[5],c2,c3,c1); | ||
| 406 | mul_add_c(a[3],b[4],c2,c3,c1); | ||
| 407 | mul_add_c(a[4],b[3],c2,c3,c1); | ||
| 408 | mul_add_c(a[5],b[2],c2,c3,c1); | ||
| 409 | mul_add_c(a[6],b[1],c2,c3,c1); | ||
| 410 | mul_add_c(a[7],b[0],c2,c3,c1); | ||
| 411 | r[7]=c2; | ||
| 412 | c2=0; | ||
| 413 | mul_add_c(a[7],b[1],c3,c1,c2); | ||
| 414 | mul_add_c(a[6],b[2],c3,c1,c2); | ||
| 415 | mul_add_c(a[5],b[3],c3,c1,c2); | ||
| 416 | mul_add_c(a[4],b[4],c3,c1,c2); | ||
| 417 | mul_add_c(a[3],b[5],c3,c1,c2); | ||
| 418 | mul_add_c(a[2],b[6],c3,c1,c2); | ||
| 419 | mul_add_c(a[1],b[7],c3,c1,c2); | ||
| 420 | r[8]=c3; | ||
| 421 | c3=0; | ||
| 422 | mul_add_c(a[2],b[7],c1,c2,c3); | ||
| 423 | mul_add_c(a[3],b[6],c1,c2,c3); | ||
| 424 | mul_add_c(a[4],b[5],c1,c2,c3); | ||
| 425 | mul_add_c(a[5],b[4],c1,c2,c3); | ||
| 426 | mul_add_c(a[6],b[3],c1,c2,c3); | ||
| 427 | mul_add_c(a[7],b[2],c1,c2,c3); | ||
| 428 | r[9]=c1; | ||
| 429 | c1=0; | ||
| 430 | mul_add_c(a[7],b[3],c2,c3,c1); | ||
| 431 | mul_add_c(a[6],b[4],c2,c3,c1); | ||
| 432 | mul_add_c(a[5],b[5],c2,c3,c1); | ||
| 433 | mul_add_c(a[4],b[6],c2,c3,c1); | ||
| 434 | mul_add_c(a[3],b[7],c2,c3,c1); | ||
| 435 | r[10]=c2; | ||
| 436 | c2=0; | ||
| 437 | mul_add_c(a[4],b[7],c3,c1,c2); | ||
| 438 | mul_add_c(a[5],b[6],c3,c1,c2); | ||
| 439 | mul_add_c(a[6],b[5],c3,c1,c2); | ||
| 440 | mul_add_c(a[7],b[4],c3,c1,c2); | ||
| 441 | r[11]=c3; | ||
| 442 | c3=0; | ||
| 443 | mul_add_c(a[7],b[5],c1,c2,c3); | ||
| 444 | mul_add_c(a[6],b[6],c1,c2,c3); | ||
| 445 | mul_add_c(a[5],b[7],c1,c2,c3); | ||
| 446 | r[12]=c1; | ||
| 447 | c1=0; | ||
| 448 | mul_add_c(a[6],b[7],c2,c3,c1); | ||
| 449 | mul_add_c(a[7],b[6],c2,c3,c1); | ||
| 450 | r[13]=c2; | ||
| 451 | c2=0; | ||
| 452 | mul_add_c(a[7],b[7],c3,c1,c2); | ||
| 453 | r[14]=c3; | ||
| 454 | r[15]=c1; /* what carried out of the last column is the top word */ | ||
| 455 | } | ||
| 456 | |||
| 457 | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | ||
| 458 | { | ||
| 459 | BN_ULONG t1,t2; /* scratch used inside the mul_add_c macro */ | ||
| 460 | BN_ULONG c1,c2,c3; | ||
| 461 | /* r[0..7] = a[0..3] * b[0..3], one result word (column) at a time; c1,c2,c3 rotate as the three-word accumulator. */ | ||
| 462 | c1=0; | ||
| 463 | c2=0; | ||
| 464 | c3=0; | ||
| 465 | mul_add_c(a[0],b[0],c1,c2,c3); | ||
| 466 | r[0]=c1; | ||
| 467 | c1=0; | ||
| 468 | mul_add_c(a[0],b[1],c2,c3,c1); | ||
| 469 | mul_add_c(a[1],b[0],c2,c3,c1); | ||
| 470 | r[1]=c2; | ||
| 471 | c2=0; | ||
| 472 | mul_add_c(a[2],b[0],c3,c1,c2); | ||
| 473 | mul_add_c(a[1],b[1],c3,c1,c2); | ||
| 474 | mul_add_c(a[0],b[2],c3,c1,c2); | ||
| 475 | r[2]=c3; | ||
| 476 | c3=0; | ||
| 477 | mul_add_c(a[0],b[3],c1,c2,c3); | ||
| 478 | mul_add_c(a[1],b[2],c1,c2,c3); | ||
| 479 | mul_add_c(a[2],b[1],c1,c2,c3); | ||
| 480 | mul_add_c(a[3],b[0],c1,c2,c3); | ||
| 481 | r[3]=c1; | ||
| 482 | c1=0; | ||
| 483 | mul_add_c(a[3],b[1],c2,c3,c1); | ||
| 484 | mul_add_c(a[2],b[2],c2,c3,c1); | ||
| 485 | mul_add_c(a[1],b[3],c2,c3,c1); | ||
| 486 | r[4]=c2; | ||
| 487 | c2=0; | ||
| 488 | mul_add_c(a[2],b[3],c3,c1,c2); | ||
| 489 | mul_add_c(a[3],b[2],c3,c1,c2); | ||
| 490 | r[5]=c3; | ||
| 491 | c3=0; | ||
| 492 | mul_add_c(a[3],b[3],c1,c2,c3); | ||
| 493 | r[6]=c1; | ||
| 494 | r[7]=c2; /* what carried out of the last column is the top word */ | ||
| 495 | } | ||
| 496 | |||
| 497 | void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) /* fully unrolled: reads a[0..7], writes r[0..15] */ | ||
| 498 | { | ||
| 499 | BN_ULONG t1,t2; /* not referenced directly here; presumably scratch for the sqr_add_c/sqr_add_c2 macros (defined outside this view) -- confirm */ | ||
| 500 | BN_ULONG c1,c2,c3; /* three accumulator words; their roles rotate each time a result word is stored */ | ||
| 501 | /* NOTE(review): sqr_add_c(a,i,...) presumably accumulates a[i] squared and sqr_add_c2(a,i,j,...) twice the product of a[i] and a[j] -- confirm against the macros. */ |||
| 502 | c1=0; | ||
| 503 | c2=0; | ||
| 504 | c3=0; | ||
| 505 | sqr_add_c(a,0,c1,c2,c3); /* column 0: index 0 with itself */ | ||
| 506 | r[0]=c1; | ||
| 507 | c1=0; /* the word just stored is cleared and reused as the top accumulator word of the next column */ | ||
| 508 | sqr_add_c2(a,1,0,c2,c3,c1); /* column 1: index pairs with i+j==1 */ | ||
| 509 | r[1]=c2; | ||
| 510 | c2=0; | ||
| 511 | sqr_add_c(a,1,c3,c1,c2); /* column 2: index 1 with itself, plus pairs with i+j==2 */ | ||
| 512 | sqr_add_c2(a,2,0,c3,c1,c2); | ||
| 513 | r[2]=c3; | ||
| 514 | c3=0; | ||
| 515 | sqr_add_c2(a,3,0,c1,c2,c3); /* column 3: pairs with i+j==3 */ | ||
| 516 | sqr_add_c2(a,2,1,c1,c2,c3); | ||
| 517 | r[3]=c1; | ||
| 518 | c1=0; | ||
| 519 | sqr_add_c(a,2,c2,c3,c1); /* column 4: index 2 with itself, plus pairs with i+j==4 */ | ||
| 520 | sqr_add_c2(a,3,1,c2,c3,c1); | ||
| 521 | sqr_add_c2(a,4,0,c2,c3,c1); | ||
| 522 | r[4]=c2; | ||
| 523 | c2=0; | ||
| 524 | sqr_add_c2(a,5,0,c3,c1,c2); /* column 5: pairs with i+j==5 */ | ||
| 525 | sqr_add_c2(a,4,1,c3,c1,c2); | ||
| 526 | sqr_add_c2(a,3,2,c3,c1,c2); | ||
| 527 | r[5]=c3; | ||
| 528 | c3=0; | ||
| 529 | sqr_add_c(a,3,c1,c2,c3); /* column 6: index 3 with itself, plus pairs with i+j==6 */ | ||
| 530 | sqr_add_c2(a,4,2,c1,c2,c3); | ||
| 531 | sqr_add_c2(a,5,1,c1,c2,c3); | ||
| 532 | sqr_add_c2(a,6,0,c1,c2,c3); | ||
| 533 | r[6]=c1; | ||
| 534 | c1=0; | ||
| 535 | sqr_add_c2(a,7,0,c2,c3,c1); /* column 7: pairs with i+j==7 */ | ||
| 536 | sqr_add_c2(a,6,1,c2,c3,c1); | ||
| 537 | sqr_add_c2(a,5,2,c2,c3,c1); | ||
| 538 | sqr_add_c2(a,4,3,c2,c3,c1); | ||
| 539 | r[7]=c2; | ||
| 540 | c2=0; | ||
| 541 | sqr_add_c(a,4,c3,c1,c2); /* column 8: index 4 with itself, plus pairs with i+j==8 */ | ||
| 542 | sqr_add_c2(a,5,3,c3,c1,c2); | ||
| 543 | sqr_add_c2(a,6,2,c3,c1,c2); | ||
| 544 | sqr_add_c2(a,7,1,c3,c1,c2); | ||
| 545 | r[8]=c3; | ||
| 546 | c3=0; | ||
| 547 | sqr_add_c2(a,7,2,c1,c2,c3); /* column 9: pairs with i+j==9 */ | ||
| 548 | sqr_add_c2(a,6,3,c1,c2,c3); | ||
| 549 | sqr_add_c2(a,5,4,c1,c2,c3); | ||
| 550 | r[9]=c1; | ||
| 551 | c1=0; | ||
| 552 | sqr_add_c(a,5,c2,c3,c1); /* column 10: index 5 with itself, plus pairs with i+j==10 */ | ||
| 553 | sqr_add_c2(a,6,4,c2,c3,c1); | ||
| 554 | sqr_add_c2(a,7,3,c2,c3,c1); | ||
| 555 | r[10]=c2; | ||
| 556 | c2=0; | ||
| 557 | sqr_add_c2(a,7,4,c3,c1,c2); /* column 11: pairs with i+j==11 */ | ||
| 558 | sqr_add_c2(a,6,5,c3,c1,c2); | ||
| 559 | r[11]=c3; | ||
| 560 | c3=0; | ||
| 561 | sqr_add_c(a,6,c1,c2,c3); /* column 12: index 6 with itself, plus the pair 7,5 */ | ||
| 562 | sqr_add_c2(a,7,5,c1,c2,c3); | ||
| 563 | r[12]=c1; | ||
| 564 | c1=0; | ||
| 565 | sqr_add_c2(a,7,6,c2,c3,c1); /* column 13: the pair 7,6 */ | ||
| 566 | r[13]=c2; | ||
| 567 | c2=0; | ||
| 568 | sqr_add_c(a,7,c3,c1,c2); /* column 14: index 7 with itself */ | ||
| 569 | r[14]=c3; | ||
| 570 | r[15]=c1; /* last result word is the middle accumulator word; c2 is not stored */ | ||
| 571 | } | ||
| 572 | |||
| 573 | void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) /* fully unrolled: reads a[0..3], writes r[0..7] */ | ||
| 574 | { | ||
| 575 | BN_ULONG t1,t2; /* not referenced directly here; presumably scratch for the sqr_add_c/sqr_add_c2 macros (defined outside this view) -- confirm */ | ||
| 576 | BN_ULONG c1,c2,c3; /* three accumulator words; their roles rotate each time a result word is stored */ | ||
| 577 | /* NOTE(review): sqr_add_c(a,i,...) presumably accumulates a[i] squared and sqr_add_c2(a,i,j,...) twice the product of a[i] and a[j] -- confirm against the macros. */ |||
| 578 | c1=0; | ||
| 579 | c2=0; | ||
| 580 | c3=0; | ||
| 581 | sqr_add_c(a,0,c1,c2,c3); /* column 0: index 0 with itself */ | ||
| 582 | r[0]=c1; | ||
| 583 | c1=0; /* the word just stored is cleared and reused as the top accumulator word of the next column */ | ||
| 584 | sqr_add_c2(a,1,0,c2,c3,c1); /* column 1: the pair 1,0 */ | ||
| 585 | r[1]=c2; | ||
| 586 | c2=0; | ||
| 587 | sqr_add_c(a,1,c3,c1,c2); /* column 2: index 1 with itself, plus the pair 2,0 */ | ||
| 588 | sqr_add_c2(a,2,0,c3,c1,c2); | ||
| 589 | r[2]=c3; | ||
| 590 | c3=0; | ||
| 591 | sqr_add_c2(a,3,0,c1,c2,c3); /* column 3: pairs with i+j==3 */ | ||
| 592 | sqr_add_c2(a,2,1,c1,c2,c3); | ||
| 593 | r[3]=c1; | ||
| 594 | c1=0; | ||
| 595 | sqr_add_c(a,2,c2,c3,c1); /* column 4: index 2 with itself, plus the pair 3,1 */ | ||
| 596 | sqr_add_c2(a,3,1,c2,c3,c1); | ||
| 597 | r[4]=c2; | ||
| 598 | c2=0; | ||
| 599 | sqr_add_c2(a,3,2,c3,c1,c2); /* column 5: the pair 3,2 */ | ||
| 600 | r[5]=c3; | ||
| 601 | c3=0; | ||
| 602 | sqr_add_c(a,3,c1,c2,c3); /* column 6: index 3 with itself */ | ||
| 603 | r[6]=c1; | ||
| 604 | r[7]=c2; /* last result word is the middle accumulator word; c3 is not stored */ | ||
| 605 | } | ||
| 606 | #endif | ||
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont.pl b/src/lib/libcrypto/bn/asm/x86_64-mont.pl deleted file mode 100755 index 3b7a6f243f..0000000000 --- a/src/lib/libcrypto/bn/asm/x86_64-mont.pl +++ /dev/null | |||
| @@ -1,330 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # October 2005. | ||
| 11 | # | ||
| 12 | # Montgomery multiplication routine for x86_64. While it gives a modest | ||
| 13 | # 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more | ||
| 14 | # than twice (>2x) as fast. The most common rsa1024 sign is improved by | ||
| 15 | # a respectable 50%. It remains to be seen if loop unrolling and a | ||
| 16 | # dedicated squaring routine can provide further improvement... | ||
| 17 | |||
| 18 | $flavour = shift; # 1st argument: perlasm flavour (matched against nasm/masm/mingw64 below) | ||
| 19 | $output = shift; # 2nd argument: output file name | ||
| 20 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } # a first argument containing '.' is taken as the output file instead | ||
| 21 | # Win64 flavours (or an .asm output name) get the SEH handler appended further down. | ||
| 22 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 23 | # Locate x86_64-xlate.pl either next to this script or under ../../perlasm/. | ||
| 24 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 25 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 26 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 27 | die "can't locate x86_64-xlate.pl"; | ||
| 28 | # Everything printed to STDOUT from here on is piped through the translator; die if the pipe cannot be opened. | ||
| 29 | open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!"; | ||
| 30 | |||
| 31 | # int bn_mul_mont( -- C prototype of the generated routine; each argument's incoming register is assigned on its line | ||
| 32 | $rp="%rdi"; # BN_ULONG *rp, | ||
| 33 | $ap="%rsi"; # const BN_ULONG *ap, | ||
| 34 | $bp="%rdx"; # const BN_ULONG *bp, | ||
| 35 | $np="%rcx"; # const BN_ULONG *np, | ||
| 36 | $n0="%r8"; # const BN_ULONG *n0, -- passed as a pointer; the code replaces the register with the value n0[0] | ||
| 37 | $num="%r9"; # int num); -- zero-extended to 64 bits on entry; the routine returns 1 in %rax | ||
| 38 | $lo0="%r10"; | ||
| 39 | $hi0="%r11"; | ||
| 40 | $bp="%r12"; # reassign $bp: the prologue copies %rdx (incoming bp) here because mulq overwrites %rdx | ||
| 41 | $hi1="%r13"; | ||
| 42 | $i="%r14"; | ||
| 43 | $j="%r15"; | ||
| 44 | $m0="%rbx"; | ||
| 45 | $m1="%rbp"; | ||
| 46 | |||
| 47 | $code=<<___; | ||
| 48 | .text | ||
| 49 | |||
| 50 | .globl bn_mul_mont | ||
| 51 | .type bn_mul_mont,\@function,6 | ||
| 52 | .align 16 | ||
| 53 | bn_mul_mont: | ||
| 54 | push %rbx | ||
| 55 | push %rbp | ||
| 56 | push %r12 | ||
| 57 | push %r13 | ||
| 58 | push %r14 | ||
| 59 | push %r15 | ||
| 60 | |||
| 61 | mov ${num}d,${num}d | ||
| 62 | lea 2($num),%r10 | ||
| 63 | mov %rsp,%r11 | ||
| 64 | neg %r10 | ||
| 65 | lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2)) | ||
| 66 | and \$-1024,%rsp # minimize TLB usage | ||
| 67 | |||
| 68 | mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp | ||
| 69 | .Lprologue: | ||
| 70 | mov %rdx,$bp # $bp reassigned, remember? | ||
| 71 | |||
| 72 | mov ($n0),$n0 # pull n0[0] value | ||
| 73 | |||
| 74 | xor $i,$i # i=0 | ||
| 75 | xor $j,$j # j=0 | ||
| 76 | |||
| 77 | mov ($bp),$m0 # m0=bp[0] | ||
| 78 | mov ($ap),%rax | ||
| 79 | mulq $m0 # ap[0]*bp[0] | ||
| 80 | mov %rax,$lo0 | ||
| 81 | mov %rdx,$hi0 | ||
| 82 | |||
| 83 | imulq $n0,%rax # "tp[0]"*n0 | ||
| 84 | mov %rax,$m1 | ||
| 85 | |||
| 86 | mulq ($np) # np[0]*m1 | ||
| 87 | add $lo0,%rax # discarded | ||
| 88 | adc \$0,%rdx | ||
| 89 | mov %rdx,$hi1 | ||
| 90 | |||
| 91 | lea 1($j),$j # j++ | ||
| 92 | .L1st: | ||
| 93 | mov ($ap,$j,8),%rax | ||
| 94 | mulq $m0 # ap[j]*bp[0] | ||
| 95 | add $hi0,%rax | ||
| 96 | adc \$0,%rdx | ||
| 97 | mov %rax,$lo0 | ||
| 98 | mov ($np,$j,8),%rax | ||
| 99 | mov %rdx,$hi0 | ||
| 100 | |||
| 101 | mulq $m1 # np[j]*m1 | ||
| 102 | add $hi1,%rax | ||
| 103 | lea 1($j),$j # j++ | ||
| 104 | adc \$0,%rdx | ||
| 105 | add $lo0,%rax # np[j]*m1+ap[j]*bp[0] | ||
| 106 | adc \$0,%rdx | ||
| 107 | mov %rax,-16(%rsp,$j,8) # tp[j-1] | ||
| 108 | cmp $num,$j | ||
| 109 | mov %rdx,$hi1 | ||
| 110 | jl .L1st | ||
| 111 | |||
| 112 | xor %rdx,%rdx | ||
| 113 | add $hi0,$hi1 | ||
| 114 | adc \$0,%rdx | ||
| 115 | mov $hi1,-8(%rsp,$num,8) | ||
| 116 | mov %rdx,(%rsp,$num,8) # store upmost overflow bit | ||
| 117 | |||
| 118 | lea 1($i),$i # i++ | ||
| 119 | .align 4 | ||
| 120 | .Louter: | ||
| 121 | xor $j,$j # j=0 | ||
| 122 | |||
| 123 | mov ($bp,$i,8),$m0 # m0=bp[i] | ||
| 124 | mov ($ap),%rax # ap[0] | ||
| 125 | mulq $m0 # ap[0]*bp[i] | ||
| 126 | add (%rsp),%rax # ap[0]*bp[i]+tp[0] | ||
| 127 | adc \$0,%rdx | ||
| 128 | mov %rax,$lo0 | ||
| 129 | mov %rdx,$hi0 | ||
| 130 | |||
| 131 | imulq $n0,%rax # tp[0]*n0 | ||
| 132 | mov %rax,$m1 | ||
| 133 | |||
| 134 | mulq ($np,$j,8) # np[0]*m1 | ||
| 135 | add $lo0,%rax # discarded | ||
| 136 | mov 8(%rsp),$lo0 # tp[1] | ||
| 137 | adc \$0,%rdx | ||
| 138 | mov %rdx,$hi1 | ||
| 139 | |||
| 140 | lea 1($j),$j # j++ | ||
| 141 | .align 4 | ||
| 142 | .Linner: | ||
| 143 | mov ($ap,$j,8),%rax | ||
| 144 | mulq $m0 # ap[j]*bp[i] | ||
| 145 | add $hi0,%rax | ||
| 146 | adc \$0,%rdx | ||
| 147 | add %rax,$lo0 # ap[j]*bp[i]+tp[j] | ||
| 148 | mov ($np,$j,8),%rax | ||
| 149 | adc \$0,%rdx | ||
| 150 | mov %rdx,$hi0 | ||
| 151 | |||
| 152 | mulq $m1 # np[j]*m1 | ||
| 153 | add $hi1,%rax | ||
| 154 | lea 1($j),$j # j++ | ||
| 155 | adc \$0,%rdx | ||
| 156 | add $lo0,%rax # np[j]*m1+ap[j]*bp[i]+tp[j] | ||
| 157 | adc \$0,%rdx | ||
| 158 | mov (%rsp,$j,8),$lo0 | ||
| 159 | cmp $num,$j | ||
| 160 | mov %rax,-16(%rsp,$j,8) # tp[j-1] | ||
| 161 | mov %rdx,$hi1 | ||
| 162 | jl .Linner | ||
| 163 | |||
| 164 | xor %rdx,%rdx | ||
| 165 | add $hi0,$hi1 | ||
| 166 | adc \$0,%rdx | ||
| 167 | add $lo0,$hi1 # pull upmost overflow bit | ||
| 168 | adc \$0,%rdx | ||
| 169 | mov $hi1,-8(%rsp,$num,8) | ||
| 170 | mov %rdx,(%rsp,$num,8) # store upmost overflow bit | ||
| 171 | |||
| 172 | lea 1($i),$i # i++ | ||
| 173 | cmp $num,$i | ||
| 174 | jl .Louter | ||
| 175 | |||
| 176 | lea (%rsp),$ap # borrow ap for tp | ||
| 177 | lea -1($num),$j # j=num-1 | ||
| 178 | |||
| 179 | mov ($ap),%rax # tp[0] | ||
| 180 | xor $i,$i # i=0 and clear CF! | ||
| 181 | jmp .Lsub | ||
| 182 | .align 16 | ||
| 183 | .Lsub: sbb ($np,$i,8),%rax | ||
| 184 | mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] | ||
| 185 | dec $j # doesn't affect CF! | ||
| 186 | mov 8($ap,$i,8),%rax # tp[i+1] | ||
| 187 | lea 1($i),$i # i++ | ||
| 188 | jge .Lsub | ||
| 189 | |||
| 190 | sbb \$0,%rax # handle upmost overflow bit | ||
| 191 | and %rax,$ap | ||
| 192 | not %rax | ||
| 193 | mov $rp,$np | ||
| 194 | and %rax,$np | ||
| 195 | lea -1($num),$j | ||
| 196 | or $np,$ap # ap=borrow?tp:rp | ||
| 197 | .align 16 | ||
| 198 | .Lcopy: # copy or in-place refresh | ||
| 199 | mov ($ap,$j,8),%rax | ||
| 200 | mov %rax,($rp,$j,8) # rp[i]=tp[i] | ||
| 201 | mov $i,(%rsp,$j,8) # zap temporary vector | ||
| 202 | dec $j | ||
| 203 | jge .Lcopy | ||
| 204 | |||
| 205 | mov 8(%rsp,$num,8),%rsi # restore %rsp | ||
| 206 | mov \$1,%rax | ||
| 207 | mov (%rsi),%r15 | ||
| 208 | mov 8(%rsi),%r14 | ||
| 209 | mov 16(%rsi),%r13 | ||
| 210 | mov 24(%rsi),%r12 | ||
| 211 | mov 32(%rsi),%rbp | ||
| 212 | mov 40(%rsi),%rbx | ||
| 213 | lea 48(%rsi),%rsp | ||
| 214 | .Lepilogue: | ||
| 215 | ret | ||
| 216 | .size bn_mul_mont,.-bn_mul_mont | ||
| 217 | .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 218 | .align 16 | ||
| 219 | ___ | ||
| 220 | |||
| 221 | # Win64 only: append an exception handler (se_handler) plus .pdata/.xdata entries for bn_mul_mont. Prototype: EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 222 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) -- the four arguments are named by the register variables assigned below | ||
| 223 | if ($win64) { | ||
| 224 | $rec="%rcx"; | ||
| 225 | $frame="%rdx"; | ||
| 226 | $context="%r8"; | ||
| 227 | $disp="%r9"; | ||
| 228 | |||
| 229 | $code.=<<___; | ||
| 230 | .extern __imp_RtlVirtualUnwind | ||
| 231 | .type se_handler,\@abi-omnipotent | ||
| 232 | .align 16 | ||
| 233 | se_handler: | ||
| 234 | push %rsi | ||
| 235 | push %rdi | ||
| 236 | push %rbx | ||
| 237 | push %rbp | ||
| 238 | push %r12 | ||
| 239 | push %r13 | ||
| 240 | push %r14 | ||
| 241 | push %r15 | ||
| 242 | pushfq | ||
| 243 | sub \$64,%rsp | ||
| 244 | |||
| 245 | mov 120($context),%rax # pull context->Rax | ||
| 246 | mov 248($context),%rbx # pull context->Rip | ||
| 247 | |||
| 248 | lea .Lprologue(%rip),%r10 | ||
| 249 | cmp %r10,%rbx # context->Rip<.Lprologue | ||
| 250 | jb .Lin_prologue | ||
| 251 | |||
| 252 | mov 152($context),%rax # pull context->Rsp | ||
| 253 | |||
| 254 | lea .Lepilogue(%rip),%r10 | ||
| 255 | cmp %r10,%rbx # context->Rip>=.Lepilogue | ||
| 256 | jae .Lin_prologue | ||
| 257 | |||
| 258 | mov 192($context),%r10 # pull $num | ||
| 259 | mov 8(%rax,%r10,8),%rax # pull saved stack pointer | ||
| 260 | lea 48(%rax),%rax | ||
| 261 | |||
| 262 | mov -8(%rax),%rbx | ||
| 263 | mov -16(%rax),%rbp | ||
| 264 | mov -24(%rax),%r12 | ||
| 265 | mov -32(%rax),%r13 | ||
| 266 | mov -40(%rax),%r14 | ||
| 267 | mov -48(%rax),%r15 | ||
| 268 | mov %rbx,144($context) # restore context->Rbx | ||
| 269 | mov %rbp,160($context) # restore context->Rbp | ||
| 270 | mov %r12,216($context) # restore context->R12 | ||
| 271 | mov %r13,224($context) # restore context->R13 | ||
| 272 | mov %r14,232($context) # restore context->R14 | ||
| 273 | mov %r15,240($context) # restore context->R15 | ||
| 274 | |||
| 275 | .Lin_prologue: | ||
| 276 | mov 8(%rax),%rdi | ||
| 277 | mov 16(%rax),%rsi | ||
| 278 | mov %rax,152($context) # restore context->Rsp | ||
| 279 | mov %rsi,168($context) # restore context->Rsi | ||
| 280 | mov %rdi,176($context) # restore context->Rdi | ||
| 281 | |||
| 282 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 283 | mov $context,%rsi # context | ||
| 284 | mov \$154,%ecx # sizeof(CONTEXT) | ||
| 285 | .long 0xa548f3fc # cld; rep movsq | ||
| 286 | |||
| 287 | mov $disp,%rsi | ||
| 288 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 289 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 290 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 291 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 292 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 293 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 294 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 295 | mov %r10,32(%rsp) # arg5 | ||
| 296 | mov %r11,40(%rsp) # arg6 | ||
| 297 | mov %r12,48(%rsp) # arg7 | ||
| 298 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 299 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 300 | |||
| 301 | mov \$1,%eax # ExceptionContinueSearch | ||
| 302 | add \$64,%rsp | ||
| 303 | popfq | ||
| 304 | pop %r15 | ||
| 305 | pop %r14 | ||
| 306 | pop %r13 | ||
| 307 | pop %r12 | ||
| 308 | pop %rbp | ||
| 309 | pop %rbx | ||
| 310 | pop %rdi | ||
| 311 | pop %rsi | ||
| 312 | ret | ||
| 313 | .size se_handler,.-se_handler | ||
| 314 | |||
| 315 | .section .pdata | ||
| 316 | .align 4 | ||
| 317 | .rva .LSEH_begin_bn_mul_mont | ||
| 318 | .rva .LSEH_end_bn_mul_mont | ||
| 319 | .rva .LSEH_info_bn_mul_mont | ||
| 320 | |||
| 321 | .section .xdata | ||
| 322 | .align 8 | ||
| 323 | .LSEH_info_bn_mul_mont: | ||
| 324 | .byte 9,0,0,0 | ||
| 325 | .rva se_handler | ||
| 326 | ___ | ||
| 327 | } | ||
| 328 | |||
| 329 | print $code; # emit the accumulated assembly template into the pipe opened on STDOUT | ||
| 330 | close STDOUT or die "can't close pipe to $xlate: $! (exit status $?)"; # closing a piped handle fails if the translator exited non-zero or a write was lost | ||
