summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/bn/asm/x86
diff options
context:
space:
mode:
authorcvs2svn <admin@example.com>2012-07-13 17:49:56 +0000
committercvs2svn <admin@example.com>2012-07-13 17:49:56 +0000
commitee04221ea8063435416c7e6369e6eae76843aa71 (patch)
tree821921a1dd0a5a3cece91121e121cc63c4b68128 /src/lib/libcrypto/bn/asm/x86
parentadf6731f6e1d04718aee00cb93435143046aee9a (diff)
downloadopenbsd-eric_g2k12.tar.gz
openbsd-eric_g2k12.tar.bz2
openbsd-eric_g2k12.zip
This commit was manufactured by cvs2git to create tag 'eric_g2k12'.eric_g2k12
Diffstat (limited to '')
-rwxr-xr-xsrc/lib/libcrypto/bn/asm/x86-mont.pl593
-rw-r--r--src/lib/libcrypto/bn/asm/x86.pl28
-rw-r--r--src/lib/libcrypto/bn/asm/x86/add.pl76
-rw-r--r--src/lib/libcrypto/bn/asm/x86/comba.pl277
-rw-r--r--src/lib/libcrypto/bn/asm/x86/div.pl15
-rw-r--r--src/lib/libcrypto/bn/asm/x86/f3
-rw-r--r--src/lib/libcrypto/bn/asm/x86/mul.pl77
-rw-r--r--src/lib/libcrypto/bn/asm/x86/mul_add.pl87
-rw-r--r--src/lib/libcrypto/bn/asm/x86/sqr.pl60
-rw-r--r--src/lib/libcrypto/bn/asm/x86/sub.pl76
-rw-r--r--src/lib/libcrypto/bn/asm/x86_64-gcc.c606
-rwxr-xr-xsrc/lib/libcrypto/bn/asm/x86_64-mont.pl330
12 files changed, 0 insertions, 2228 deletions
diff --git a/src/lib/libcrypto/bn/asm/x86-mont.pl b/src/lib/libcrypto/bn/asm/x86-mont.pl
deleted file mode 100755
index e8f6b05084..0000000000
--- a/src/lib/libcrypto/bn/asm/x86-mont.pl
+++ /dev/null
@@ -1,593 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# October 2005
11#
12# This is a "teaser" code, as it can be improved in several ways...
13# First of all non-SSE2 path should be implemented (yes, for now it
14# performs Montgomery multiplication/convolution only on SSE2-capable
15# CPUs such as P4, others fall down to original code). Then inner loop
16# can be unrolled and modulo-scheduled to improve ILP and possibly
17# moved to 128-bit XMM register bank (though it would require input
18# rearrangement and/or increase bus bandwidth utilization). Dedicated
19# squaring procedure should give further performance improvement...
20# Yet, for being draft, the code improves rsa512 *sign* benchmark by
21# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
22
23# December 2006
24#
25# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
26# Integer-only code [being equipped with dedicated squaring procedure]
27# gives ~40% on rsa512 sign benchmark...
28
29$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
30push(@INC,"${dir}","${dir}../../perlasm");
31require "x86asm.pl";
32
33&asm_init($ARGV[0],$0);
34
35$sse2=0;
36for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
37
38&external_label("OPENSSL_ia32cap_P") if ($sse2);
39
40&function_begin("bn_mul_mont");
41
42$i="edx";
43$j="ecx";
44$ap="esi"; $tp="esi"; # overlapping variables!!!
45$rp="edi"; $bp="edi"; # overlapping variables!!!
46$np="ebp";
47$num="ebx";
48
49$_num=&DWP(4*0,"esp"); # stack top layout
50$_rp=&DWP(4*1,"esp");
51$_ap=&DWP(4*2,"esp");
52$_bp=&DWP(4*3,"esp");
53$_np=&DWP(4*4,"esp");
54$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
55$_sp=&DWP(4*6,"esp");
56$_bpend=&DWP(4*7,"esp");
57$frame=32; # size of above frame rounded up to 16n
58
59 &xor ("eax","eax");
60 &mov ("edi",&wparam(5)); # int num
61 &cmp ("edi",4);
62 &jl (&label("just_leave"));
63
64 &lea ("esi",&wparam(0)); # put aside pointer to argument block
65 &lea ("edx",&wparam(1)); # load ap
66 &mov ("ebp","esp"); # saved stack pointer!
67 &add ("edi",2); # extra two words on top of tp
68 &neg ("edi");
69 &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
70 &neg ("edi");
71
72 # minimize cache contention by arraning 2K window between stack
73 # pointer and ap argument [np is also position sensitive vector,
74 # but it's assumed to be near ap, as it's allocated at ~same
75 # time].
76 &mov ("eax","esp");
77 &sub ("eax","edx");
78 &and ("eax",2047);
79 &sub ("esp","eax"); # this aligns sp and ap modulo 2048
80
81 &xor ("edx","esp");
82 &and ("edx",2048);
83 &xor ("edx",2048);
84 &sub ("esp","edx"); # this splits them apart modulo 4096
85
86 &and ("esp",-64); # align to cache line
87
88 ################################# load argument block...
89 &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
90 &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
91 &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
92 &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
93 &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
94 #&mov ("edi",&DWP(5*4,"esi"));# int num
95
96 &mov ("esi",&DWP(0,"esi")); # pull n0[0]
97 &mov ($_rp,"eax"); # ... save a copy of argument block
98 &mov ($_ap,"ebx");
99 &mov ($_bp,"ecx");
100 &mov ($_np,"edx");
101 &mov ($_n0,"esi");
102 &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
103 #&mov ($_num,$num); # redundant as $num is not reused
104 &mov ($_sp,"ebp"); # saved stack pointer!
105
106if($sse2) {
107$acc0="mm0"; # mmx register bank layout
108$acc1="mm1";
109$car0="mm2";
110$car1="mm3";
111$mul0="mm4";
112$mul1="mm5";
113$temp="mm6";
114$mask="mm7";
115
116 &picmeup("eax","OPENSSL_ia32cap_P");
117 &bt (&DWP(0,"eax"),26);
118 &jnc (&label("non_sse2"));
119
120 &mov ("eax",-1);
121 &movd ($mask,"eax"); # mask 32 lower bits
122
123 &mov ($ap,$_ap); # load input pointers
124 &mov ($bp,$_bp);
125 &mov ($np,$_np);
126
127 &xor ($i,$i); # i=0
128 &xor ($j,$j); # j=0
129
130 &movd ($mul0,&DWP(0,$bp)); # bp[0]
131 &movd ($mul1,&DWP(0,$ap)); # ap[0]
132 &movd ($car1,&DWP(0,$np)); # np[0]
133
134 &pmuludq($mul1,$mul0); # ap[0]*bp[0]
135 &movq ($car0,$mul1);
136 &movq ($acc0,$mul1); # I wish movd worked for
137 &pand ($acc0,$mask); # inter-register transfers
138
139 &pmuludq($mul1,$_n0q); # *=n0
140
141 &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
142 &paddq ($car1,$acc0);
143
144 &movd ($acc1,&DWP(4,$np)); # np[1]
145 &movd ($acc0,&DWP(4,$ap)); # ap[1]
146
147 &psrlq ($car0,32);
148 &psrlq ($car1,32);
149
150 &inc ($j); # j++
151&set_label("1st",16);
152 &pmuludq($acc0,$mul0); # ap[j]*bp[0]
153 &pmuludq($acc1,$mul1); # np[j]*m1
154 &paddq ($car0,$acc0); # +=c0
155 &paddq ($car1,$acc1); # +=c1
156
157 &movq ($acc0,$car0);
158 &pand ($acc0,$mask);
159 &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
160 &paddq ($car1,$acc0); # +=ap[j]*bp[0];
161 &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
162 &psrlq ($car0,32);
163 &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
164 &psrlq ($car1,32);
165
166 &lea ($j,&DWP(1,$j));
167 &cmp ($j,$num);
168 &jl (&label("1st"));
169
170 &pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
171 &pmuludq($acc1,$mul1); # np[num-1]*m1
172 &paddq ($car0,$acc0); # +=c0
173 &paddq ($car1,$acc1); # +=c1
174
175 &movq ($acc0,$car0);
176 &pand ($acc0,$mask);
177 &paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
178 &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
179
180 &psrlq ($car0,32);
181 &psrlq ($car1,32);
182
183 &paddq ($car1,$car0);
184 &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
185
186 &inc ($i); # i++
187&set_label("outer");
188 &xor ($j,$j); # j=0
189
190 &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
191 &movd ($mul1,&DWP(0,$ap)); # ap[0]
192 &movd ($temp,&DWP($frame,"esp")); # tp[0]
193 &movd ($car1,&DWP(0,$np)); # np[0]
194 &pmuludq($mul1,$mul0); # ap[0]*bp[i]
195
196 &paddq ($mul1,$temp); # +=tp[0]
197 &movq ($acc0,$mul1);
198 &movq ($car0,$mul1);
199 &pand ($acc0,$mask);
200
201 &pmuludq($mul1,$_n0q); # *=n0
202
203 &pmuludq($car1,$mul1);
204 &paddq ($car1,$acc0);
205
206 &movd ($temp,&DWP($frame+4,"esp")); # tp[1]
207 &movd ($acc1,&DWP(4,$np)); # np[1]
208 &movd ($acc0,&DWP(4,$ap)); # ap[1]
209
210 &psrlq ($car0,32);
211 &psrlq ($car1,32);
212 &paddq ($car0,$temp); # +=tp[1]
213
214 &inc ($j); # j++
215 &dec ($num);
216&set_label("inner");
217 &pmuludq($acc0,$mul0); # ap[j]*bp[i]
218 &pmuludq($acc1,$mul1); # np[j]*m1
219 &paddq ($car0,$acc0); # +=c0
220 &paddq ($car1,$acc1); # +=c1
221
222 &movq ($acc0,$car0);
223 &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
224 &pand ($acc0,$mask);
225 &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
226 &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
227 &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
228 &psrlq ($car0,32);
229 &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
230 &psrlq ($car1,32);
231 &paddq ($car0,$temp); # +=tp[j+1]
232
233 &dec ($num);
234 &lea ($j,&DWP(1,$j)); # j++
235 &jnz (&label("inner"));
236
237 &mov ($num,$j);
238 &pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
239 &pmuludq($acc1,$mul1); # np[num-1]*m1
240 &paddq ($car0,$acc0); # +=c0
241 &paddq ($car1,$acc1); # +=c1
242
243 &movq ($acc0,$car0);
244 &pand ($acc0,$mask);
245 &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
246 &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
247 &psrlq ($car0,32);
248 &psrlq ($car1,32);
249
250 &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
251 &paddq ($car1,$car0);
252 &paddq ($car1,$temp);
253 &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
254
255 &lea ($i,&DWP(1,$i)); # i++
256 &cmp ($i,$num);
257 &jle (&label("outer"));
258
259 &emms (); # done with mmx bank
260 &jmp (&label("common_tail"));
261
262&set_label("non_sse2",16);
263}
264
265if (0) {
266 &mov ("esp",$_sp);
267 &xor ("eax","eax"); # signal "not fast enough [yet]"
268 &jmp (&label("just_leave"));
269 # While the below code provides competitive performance for
270 # all key lengthes on modern Intel cores, it's still more
271 # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
272 # means compared to the original integer-only assembler.
273 # 512-bit RSA sign is better by ~40%, but that's about all
274 # one can say about all CPUs...
275} else {
276$inp="esi"; # integer path uses these registers differently
277$word="edi";
278$carry="ebp";
279
280 &mov ($inp,$_ap);
281 &lea ($carry,&DWP(1,$num));
282 &mov ($word,$_bp);
283 &xor ($j,$j); # j=0
284 &mov ("edx",$inp);
285 &and ($carry,1); # see if num is even
286 &sub ("edx",$word); # see if ap==bp
287 &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
288 &or ($carry,"edx");
289 &mov ($word,&DWP(0,$word)); # bp[0]
290 &jz (&label("bn_sqr_mont"));
291 &mov ($_bpend,"eax");
292 &mov ("eax",&DWP(0,$inp));
293 &xor ("edx","edx");
294
295&set_label("mull",16);
296 &mov ($carry,"edx");
297 &mul ($word); # ap[j]*bp[0]
298 &add ($carry,"eax");
299 &lea ($j,&DWP(1,$j));
300 &adc ("edx",0);
301 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
302 &cmp ($j,$num);
303 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
304 &jl (&label("mull"));
305
306 &mov ($carry,"edx");
307 &mul ($word); # ap[num-1]*bp[0]
308 &mov ($word,$_n0);
309 &add ("eax",$carry);
310 &mov ($inp,$_np);
311 &adc ("edx",0);
312 &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
313
314 &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
315 &xor ($j,$j);
316 &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
317 &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
318
319 &mov ("eax",&DWP(0,$inp)); # np[0]
320 &mul ($word); # np[0]*m
321 &add ("eax",&DWP($frame,"esp")); # +=tp[0]
322 &mov ("eax",&DWP(4,$inp)); # np[1]
323 &adc ("edx",0);
324 &inc ($j);
325
326 &jmp (&label("2ndmadd"));
327
328&set_label("1stmadd",16);
329 &mov ($carry,"edx");
330 &mul ($word); # ap[j]*bp[i]
331 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
332 &lea ($j,&DWP(1,$j));
333 &adc ("edx",0);
334 &add ($carry,"eax");
335 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
336 &adc ("edx",0);
337 &cmp ($j,$num);
338 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
339 &jl (&label("1stmadd"));
340
341 &mov ($carry,"edx");
342 &mul ($word); # ap[num-1]*bp[i]
343 &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
344 &mov ($word,$_n0);
345 &adc ("edx",0);
346 &mov ($inp,$_np);
347 &add ($carry,"eax");
348 &adc ("edx",0);
349 &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
350
351 &xor ($j,$j);
352 &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
353 &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
354 &adc ($j,0);
355 &mov ("eax",&DWP(0,$inp)); # np[0]
356 &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
357 &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
358
359 &mul ($word); # np[0]*m
360 &add ("eax",&DWP($frame,"esp")); # +=tp[0]
361 &mov ("eax",&DWP(4,$inp)); # np[1]
362 &adc ("edx",0);
363 &mov ($j,1);
364
365&set_label("2ndmadd",16);
366 &mov ($carry,"edx");
367 &mul ($word); # np[j]*m
368 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
369 &lea ($j,&DWP(1,$j));
370 &adc ("edx",0);
371 &add ($carry,"eax");
372 &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
373 &adc ("edx",0);
374 &cmp ($j,$num);
375 &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
376 &jl (&label("2ndmadd"));
377
378 &mov ($carry,"edx");
379 &mul ($word); # np[j]*m
380 &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
381 &adc ("edx",0);
382 &add ($carry,"eax");
383 &adc ("edx",0);
384 &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
385
386 &xor ("eax","eax");
387 &mov ($j,$_bp); # &bp[i]
388 &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
389 &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
390 &lea ($j,&DWP(4,$j));
391 &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
392 &cmp ($j,$_bpend);
393 &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
394 &je (&label("common_tail"));
395
396 &mov ($word,&DWP(0,$j)); # bp[i+1]
397 &mov ($inp,$_ap);
398 &mov ($_bp,$j); # &bp[++i]
399 &xor ($j,$j);
400 &xor ("edx","edx");
401 &mov ("eax",&DWP(0,$inp));
402 &jmp (&label("1stmadd"));
403
404&set_label("bn_sqr_mont",16);
405$sbit=$num;
406 &mov ($_num,$num);
407 &mov ($_bp,$j); # i=0
408
409 &mov ("eax",$word); # ap[0]
410 &mul ($word); # ap[0]*ap[0]
411 &mov (&DWP($frame,"esp"),"eax"); # tp[0]=
412 &mov ($sbit,"edx");
413 &shr ("edx",1);
414 &and ($sbit,1);
415 &inc ($j);
416&set_label("sqr",16);
417 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
418 &mov ($carry,"edx");
419 &mul ($word); # ap[j]*ap[0]
420 &add ("eax",$carry);
421 &lea ($j,&DWP(1,$j));
422 &adc ("edx",0);
423 &lea ($carry,&DWP(0,$sbit,"eax",2));
424 &shr ("eax",31);
425 &cmp ($j,$_num);
426 &mov ($sbit,"eax");
427 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
428 &jl (&label("sqr"));
429
430 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
431 &mov ($carry,"edx");
432 &mul ($word); # ap[num-1]*ap[0]
433 &add ("eax",$carry);
434 &mov ($word,$_n0);
435 &adc ("edx",0);
436 &mov ($inp,$_np);
437 &lea ($carry,&DWP(0,$sbit,"eax",2));
438 &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
439 &shr ("eax",31);
440 &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
441
442 &lea ($carry,&DWP(0,"eax","edx",2));
443 &mov ("eax",&DWP(0,$inp)); # np[0]
444 &shr ("edx",31);
445 &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
446 &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
447
448 &mul ($word); # np[0]*m
449 &add ("eax",&DWP($frame,"esp")); # +=tp[0]
450 &mov ($num,$j);
451 &adc ("edx",0);
452 &mov ("eax",&DWP(4,$inp)); # np[1]
453 &mov ($j,1);
454
455&set_label("3rdmadd",16);
456 &mov ($carry,"edx");
457 &mul ($word); # np[j]*m
458 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
459 &adc ("edx",0);
460 &add ($carry,"eax");
461 &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
462 &adc ("edx",0);
463 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
464
465 &mov ($carry,"edx");
466 &mul ($word); # np[j+1]*m
467 &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
468 &lea ($j,&DWP(2,$j));
469 &adc ("edx",0);
470 &add ($carry,"eax");
471 &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
472 &adc ("edx",0);
473 &cmp ($j,$num);
474 &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
475 &jl (&label("3rdmadd"));
476
477 &mov ($carry,"edx");
478 &mul ($word); # np[j]*m
479 &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
480 &adc ("edx",0);
481 &add ($carry,"eax");
482 &adc ("edx",0);
483 &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
484
485 &mov ($j,$_bp); # i
486 &xor ("eax","eax");
487 &mov ($inp,$_ap);
488 &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
489 &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
490 &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
491 &cmp ($j,$num);
492 &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
493 &je (&label("common_tail"));
494
495 &mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
496 &lea ($j,&DWP(1,$j));
497 &mov ("eax",$word);
498 &mov ($_bp,$j); # ++i
499 &mul ($word); # ap[i]*ap[i]
500 &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
501 &adc ("edx",0);
502 &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
503 &xor ($carry,$carry);
504 &cmp ($j,$num);
505 &lea ($j,&DWP(1,$j));
506 &je (&label("sqrlast"));
507
508 &mov ($sbit,"edx"); # zaps $num
509 &shr ("edx",1);
510 &and ($sbit,1);
511&set_label("sqradd",16);
512 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
513 &mov ($carry,"edx");
514 &mul ($word); # ap[j]*ap[i]
515 &add ("eax",$carry);
516 &lea ($carry,&DWP(0,"eax","eax"));
517 &adc ("edx",0);
518 &shr ("eax",31);
519 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
520 &lea ($j,&DWP(1,$j));
521 &adc ("eax",0);
522 &add ($carry,$sbit);
523 &adc ("eax",0);
524 &cmp ($j,$_num);
525 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
526 &mov ($sbit,"eax");
527 &jle (&label("sqradd"));
528
529 &mov ($carry,"edx");
530 &add ("edx","edx");
531 &shr ($carry,31);
532 &add ("edx",$sbit);
533 &adc ($carry,0);
534&set_label("sqrlast");
535 &mov ($word,$_n0);
536 &mov ($inp,$_np);
537 &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
538
539 &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
540 &mov ("eax",&DWP(0,$inp)); # np[0]
541 &adc ($carry,0);
542 &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
543 &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
544
545 &mul ($word); # np[0]*m
546 &add ("eax",&DWP($frame,"esp")); # +=tp[0]
547 &lea ($num,&DWP(-1,$j));
548 &adc ("edx",0);
549 &mov ($j,1);
550 &mov ("eax",&DWP(4,$inp)); # np[1]
551
552 &jmp (&label("3rdmadd"));
553}
554
555&set_label("common_tail",16);
556 &mov ($np,$_np); # load modulus pointer
557 &mov ($rp,$_rp); # load result pointer
558 &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
559
560 &mov ("eax",&DWP(0,$tp)); # tp[0]
561 &mov ($j,$num); # j=num-1
562 &xor ($i,$i); # i=0 and clear CF!
563
564&set_label("sub",16);
565 &sbb ("eax",&DWP(0,$np,$i,4));
566 &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
567 &dec ($j); # doesn't affect CF!
568 &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
569 &lea ($i,&DWP(1,$i)); # i++
570 &jge (&label("sub"));
571
572 &sbb ("eax",0); # handle upmost overflow bit
573 &and ($tp,"eax");
574 &not ("eax");
575 &mov ($np,$rp);
576 &and ($np,"eax");
577 &or ($tp,$np); # tp=carry?tp:rp
578
579&set_label("copy",16); # copy or in-place refresh
580 &mov ("eax",&DWP(0,$tp,$num,4));
581 &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
582 &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
583 &dec ($num);
584 &jge (&label("copy"));
585
586 &mov ("esp",$_sp); # pull saved stack pointer
587 &mov ("eax",1);
588&set_label("just_leave");
589&function_end("bn_mul_mont");
590
591&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
592
593&asm_finish();
diff --git a/src/lib/libcrypto/bn/asm/x86.pl b/src/lib/libcrypto/bn/asm/x86.pl
deleted file mode 100644
index 1bc4f1bb27..0000000000
--- a/src/lib/libcrypto/bn/asm/x86.pl
+++ /dev/null
@@ -1,28 +0,0 @@
1#!/usr/local/bin/perl
2
3push(@INC,"perlasm","../../perlasm");
4require "x86asm.pl";
5
6require("x86/mul_add.pl");
7require("x86/mul.pl");
8require("x86/sqr.pl");
9require("x86/div.pl");
10require("x86/add.pl");
11require("x86/sub.pl");
12require("x86/comba.pl");
13
14&asm_init($ARGV[0],$0);
15
16&bn_mul_add_words("bn_mul_add_words");
17&bn_mul_words("bn_mul_words");
18&bn_sqr_words("bn_sqr_words");
19&bn_div_words("bn_div_words");
20&bn_add_words("bn_add_words");
21&bn_sub_words("bn_sub_words");
22&bn_mul_comba("bn_mul_comba8",8);
23&bn_mul_comba("bn_mul_comba4",4);
24&bn_sqr_comba("bn_sqr_comba8",8);
25&bn_sqr_comba("bn_sqr_comba4",4);
26
27&asm_finish();
28
diff --git a/src/lib/libcrypto/bn/asm/x86/add.pl b/src/lib/libcrypto/bn/asm/x86/add.pl
deleted file mode 100644
index 0b5cf583e3..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/add.pl
+++ /dev/null
@@ -1,76 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub bn_add_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9
10 &comment("");
11 $a="esi";
12 $b="edi";
13 $c="eax";
14 $r="ebx";
15 $tmp1="ecx";
16 $tmp2="edx";
17 $num="ebp";
18
19 &mov($r,&wparam(0)); # get r
20 &mov($a,&wparam(1)); # get a
21 &mov($b,&wparam(2)); # get b
22 &mov($num,&wparam(3)); # get num
23 &xor($c,$c); # clear carry
24 &and($num,0xfffffff8); # num / 8
25
26 &jz(&label("aw_finish"));
27
28 &set_label("aw_loop",0);
29 for ($i=0; $i<8; $i++)
30 {
31 &comment("Round $i");
32
33 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
34 &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
35 &add($tmp1,$c);
36 &mov($c,0);
37 &adc($c,$c);
38 &add($tmp1,$tmp2);
39 &adc($c,0);
40 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
41 }
42
43 &comment("");
44 &add($a,32);
45 &add($b,32);
46 &add($r,32);
47 &sub($num,8);
48 &jnz(&label("aw_loop"));
49
50 &set_label("aw_finish",0);
51 &mov($num,&wparam(3)); # get num
52 &and($num,7);
53 &jz(&label("aw_end"));
54
55 for ($i=0; $i<7; $i++)
56 {
57 &comment("Tail Round $i");
58 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
59 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
60 &add($tmp1,$c);
61 &mov($c,0);
62 &adc($c,$c);
63 &add($tmp1,$tmp2);
64 &adc($c,0);
65 &dec($num) if ($i != 6);
66 &mov(&DWP($i*4,$r,"",0),$tmp1); # *a
67 &jz(&label("aw_end")) if ($i != 6);
68 }
69 &set_label("aw_end",0);
70
71# &mov("eax",$c); # $c is "eax"
72
73 &function_end($name);
74 }
75
761;
diff --git a/src/lib/libcrypto/bn/asm/x86/comba.pl b/src/lib/libcrypto/bn/asm/x86/comba.pl
deleted file mode 100644
index 2291253629..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/comba.pl
+++ /dev/null
@@ -1,277 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub mul_add_c
5 {
6 local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
7
8 # pos == -1 if eax and edx are pre-loaded, 0 to load from next
9 # words, and 1 if load return value
10
11 &comment("mul a[$ai]*b[$bi]");
12
13 # "eax" and "edx" will always be pre-loaded.
14 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
15 # &mov("edx",&DWP($bi*4,$b,"",0));
16
17 &mul("edx");
18 &add($c0,"eax");
19 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a
20 &mov("eax",&wparam(0)) if $pos > 0; # load r[]
21 ###
22 &adc($c1,"edx");
23 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b
24 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
25 ###
26 &adc($c2,0);
27 # is pos > 1, it means it is the last loop
28 &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
29 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
30 }
31
32sub sqr_add_c
33 {
34 local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
35
36 # pos == -1 if eax and edx are pre-loaded, 0 to load from next
37 # words, and 1 if load return value
38
39 &comment("sqr a[$ai]*a[$bi]");
40
41 # "eax" and "edx" will always be pre-loaded.
42 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
43 # &mov("edx",&DWP($bi*4,$b,"",0));
44
45 if ($ai == $bi)
46 { &mul("eax");}
47 else
48 { &mul("edx");}
49 &add($c0,"eax");
50 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
51 ###
52 &adc($c1,"edx");
53 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
54 ###
55 &adc($c2,0);
56 # is pos > 1, it means it is the last loop
57 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
58 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
59 }
60
61sub sqr_add_c2
62 {
63 local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
64
65 # pos == -1 if eax and edx are pre-loaded, 0 to load from next
66 # words, and 1 if load return value
67
68 &comment("sqr a[$ai]*a[$bi]");
69
70 # "eax" and "edx" will always be pre-loaded.
71 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
72 # &mov("edx",&DWP($bi*4,$a,"",0));
73
74 if ($ai == $bi)
75 { &mul("eax");}
76 else
77 { &mul("edx");}
78 &add("eax","eax");
79 ###
80 &adc("edx","edx");
81 ###
82 &adc($c2,0);
83 &add($c0,"eax");
84 &adc($c1,"edx");
85 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
86 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
87 &adc($c2,0);
88 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
89 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
90 ###
91 }
92
93sub bn_mul_comba
94 {
95 local($name,$num)=@_;
96 local($a,$b,$c0,$c1,$c2);
97 local($i,$as,$ae,$bs,$be,$ai,$bi);
98 local($tot,$end);
99
100 &function_begin_B($name,"");
101
102 $c0="ebx";
103 $c1="ecx";
104 $c2="ebp";
105 $a="esi";
106 $b="edi";
107
108 $as=0;
109 $ae=0;
110 $bs=0;
111 $be=0;
112 $tot=$num+$num-1;
113
114 &push("esi");
115 &mov($a,&wparam(1));
116 &push("edi");
117 &mov($b,&wparam(2));
118 &push("ebp");
119 &push("ebx");
120
121 &xor($c0,$c0);
122 &mov("eax",&DWP(0,$a,"",0)); # load the first word
123 &xor($c1,$c1);
124 &mov("edx",&DWP(0,$b,"",0)); # load the first second
125
126 for ($i=0; $i<$tot; $i++)
127 {
128 $ai=$as;
129 $bi=$bs;
130 $end=$be+1;
131
132 &comment("################## Calculate word $i");
133
134 for ($j=$bs; $j<$end; $j++)
135 {
136 &xor($c2,$c2) if ($j == $bs);
137 if (($j+1) == $end)
138 {
139 $v=1;
140 $v=2 if (($i+1) == $tot);
141 }
142 else
143 { $v=0; }
144 if (($j+1) != $end)
145 {
146 $na=($ai-1);
147 $nb=($bi+1);
148 }
149 else
150 {
151 $na=$as+($i < ($num-1));
152 $nb=$bs+($i >= ($num-1));
153 }
154#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
155 &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
156 if ($v)
157 {
158 &comment("saved r[$i]");
159 # &mov("eax",&wparam(0));
160 # &mov(&DWP($i*4,"eax","",0),$c0);
161 ($c0,$c1,$c2)=($c1,$c2,$c0);
162 }
163 $ai--;
164 $bi++;
165 }
166 $as++ if ($i < ($num-1));
167 $ae++ if ($i >= ($num-1));
168
169 $bs++ if ($i >= ($num-1));
170 $be++ if ($i < ($num-1));
171 }
172 &comment("save r[$i]");
173 # &mov("eax",&wparam(0));
174 &mov(&DWP($i*4,"eax","",0),$c0);
175
176 &pop("ebx");
177 &pop("ebp");
178 &pop("edi");
179 &pop("esi");
180 &ret();
181 &function_end_B($name);
182 }
183
184sub bn_sqr_comba
185 {
186 local($name,$num)=@_;
187 local($r,$a,$c0,$c1,$c2)=@_;
188 local($i,$as,$ae,$bs,$be,$ai,$bi);
189 local($b,$tot,$end,$half);
190
191 &function_begin_B($name,"");
192
193 $c0="ebx";
194 $c1="ecx";
195 $c2="ebp";
196 $a="esi";
197 $r="edi";
198
199 &push("esi");
200 &push("edi");
201 &push("ebp");
202 &push("ebx");
203 &mov($r,&wparam(0));
204 &mov($a,&wparam(1));
205 &xor($c0,$c0);
206 &xor($c1,$c1);
207 &mov("eax",&DWP(0,$a,"",0)); # load the first word
208
209 $as=0;
210 $ae=0;
211 $bs=0;
212 $be=0;
213 $tot=$num+$num-1;
214
215 for ($i=0; $i<$tot; $i++)
216 {
217 $ai=$as;
218 $bi=$bs;
219 $end=$be+1;
220
221 &comment("############### Calculate word $i");
222 for ($j=$bs; $j<$end; $j++)
223 {
224 &xor($c2,$c2) if ($j == $bs);
225 if (($ai-1) < ($bi+1))
226 {
227 $v=1;
228 $v=2 if ($i+1) == $tot;
229 }
230 else
231 { $v=0; }
232 if (!$v)
233 {
234 $na=$ai-1;
235 $nb=$bi+1;
236 }
237 else
238 {
239 $na=$as+($i < ($num-1));
240 $nb=$bs+($i >= ($num-1));
241 }
242 if ($ai == $bi)
243 {
244 &sqr_add_c($r,$a,$ai,$bi,
245 $c0,$c1,$c2,$v,$i,$na,$nb);
246 }
247 else
248 {
249 &sqr_add_c2($r,$a,$ai,$bi,
250 $c0,$c1,$c2,$v,$i,$na,$nb);
251 }
252 if ($v)
253 {
254 &comment("saved r[$i]");
255 #&mov(&DWP($i*4,$r,"",0),$c0);
256 ($c0,$c1,$c2)=($c1,$c2,$c0);
257 last;
258 }
259 $ai--;
260 $bi++;
261 }
262 $as++ if ($i < ($num-1));
263 $ae++ if ($i >= ($num-1));
264
265 $bs++ if ($i >= ($num-1));
266 $be++ if ($i < ($num-1));
267 }
268 &mov(&DWP($i*4,$r,"",0),$c0);
269 &pop("ebx");
270 &pop("ebp");
271 &pop("edi");
272 &pop("esi");
273 &ret();
274 &function_end_B($name);
275 }
276
2771;
diff --git a/src/lib/libcrypto/bn/asm/x86/div.pl b/src/lib/libcrypto/bn/asm/x86/div.pl
deleted file mode 100644
index 0e90152caa..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/div.pl
+++ /dev/null
@@ -1,15 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub bn_div_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9 &mov("edx",&wparam(0)); #
10 &mov("eax",&wparam(1)); #
11 &mov("ebx",&wparam(2)); #
12 &div("ebx");
13 &function_end($name);
14 }
151;
diff --git a/src/lib/libcrypto/bn/asm/x86/f b/src/lib/libcrypto/bn/asm/x86/f
deleted file mode 100644
index 22e4112224..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/f
+++ /dev/null
@@ -1,3 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assember
3
diff --git a/src/lib/libcrypto/bn/asm/x86/mul.pl b/src/lib/libcrypto/bn/asm/x86/mul.pl
deleted file mode 100644
index 674cb9b055..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/mul.pl
+++ /dev/null
@@ -1,77 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub bn_mul_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9
10 &comment("");
11 $Low="eax";
12 $High="edx";
13 $a="ebx";
14 $w="ecx";
15 $r="edi";
16 $c="esi";
17 $num="ebp";
18
19 &xor($c,$c); # clear carry
20 &mov($r,&wparam(0)); #
21 &mov($a,&wparam(1)); #
22 &mov($num,&wparam(2)); #
23 &mov($w,&wparam(3)); #
24
25 &and($num,0xfffffff8); # num / 8
26 &jz(&label("mw_finish"));
27
28 &set_label("mw_loop",0);
29 for ($i=0; $i<32; $i+=4)
30 {
31 &comment("Round $i");
32
33 &mov("eax",&DWP($i,$a,"",0)); # *a
34 &mul($w); # *a * w
35 &add("eax",$c); # L(t)+=c
36 # XXX
37
38 &adc("edx",0); # H(t)+=carry
39 &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
40
41 &mov($c,"edx"); # c= H(t);
42 }
43
44 &comment("");
45 &add($a,32);
46 &add($r,32);
47 &sub($num,8);
48 &jz(&label("mw_finish"));
49 &jmp(&label("mw_loop"));
50
51 &set_label("mw_finish",0);
52 &mov($num,&wparam(2)); # get num
53 &and($num,7);
54 &jnz(&label("mw_finish2"));
55 &jmp(&label("mw_end"));
56
57 &set_label("mw_finish2",1);
58 for ($i=0; $i<7; $i++)
59 {
60 &comment("Tail Round $i");
61 &mov("eax",&DWP($i*4,$a,"",0));# *a
62 &mul($w); # *a * w
63 &add("eax",$c); # L(t)+=c
64 # XXX
65 &adc("edx",0); # H(t)+=carry
66 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
67 &mov($c,"edx"); # c= H(t);
68 &dec($num) if ($i != 7-1);
69 &jz(&label("mw_end")) if ($i != 7-1);
70 }
71 &set_label("mw_end",0);
72 &mov("eax",$c);
73
74 &function_end($name);
75 }
76
771;
diff --git a/src/lib/libcrypto/bn/asm/x86/mul_add.pl b/src/lib/libcrypto/bn/asm/x86/mul_add.pl
deleted file mode 100644
index 61830d3a90..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/mul_add.pl
+++ /dev/null
@@ -1,87 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub bn_mul_add_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9
10 &comment("");
11 $Low="eax";
12 $High="edx";
13 $a="ebx";
14 $w="ebp";
15 $r="edi";
16 $c="esi";
17
18 &xor($c,$c); # clear carry
19 &mov($r,&wparam(0)); #
20
21 &mov("ecx",&wparam(2)); #
22 &mov($a,&wparam(1)); #
23
24 &and("ecx",0xfffffff8); # num / 8
25 &mov($w,&wparam(3)); #
26
27 &push("ecx"); # Up the stack for a tmp variable
28
29 &jz(&label("maw_finish"));
30
31 &set_label("maw_loop",0);
32
33 &mov(&swtmp(0),"ecx"); #
34
35 for ($i=0; $i<32; $i+=4)
36 {
37 &comment("Round $i");
38
39 &mov("eax",&DWP($i,$a,"",0)); # *a
40 &mul($w); # *a * w
41 &add("eax",$c); # L(t)+= *r
42 &mov($c,&DWP($i,$r,"",0)); # L(t)+= *r
43 &adc("edx",0); # H(t)+=carry
44 &add("eax",$c); # L(t)+=c
45 &adc("edx",0); # H(t)+=carry
46 &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
47 &mov($c,"edx"); # c= H(t);
48 }
49
50 &comment("");
51 &mov("ecx",&swtmp(0)); #
52 &add($a,32);
53 &add($r,32);
54 &sub("ecx",8);
55 &jnz(&label("maw_loop"));
56
57 &set_label("maw_finish",0);
58 &mov("ecx",&wparam(2)); # get num
59 &and("ecx",7);
60 &jnz(&label("maw_finish2")); # helps branch prediction
61 &jmp(&label("maw_end"));
62
63 &set_label("maw_finish2",1);
64 for ($i=0; $i<7; $i++)
65 {
66 &comment("Tail Round $i");
67 &mov("eax",&DWP($i*4,$a,"",0));# *a
68 &mul($w); # *a * w
69 &add("eax",$c); # L(t)+=c
70 &mov($c,&DWP($i*4,$r,"",0)); # L(t)+= *r
71 &adc("edx",0); # H(t)+=carry
72 &add("eax",$c);
73 &adc("edx",0); # H(t)+=carry
74 &dec("ecx") if ($i != 7-1);
75 &mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t);
76 &mov($c,"edx"); # c= H(t);
77 &jz(&label("maw_end")) if ($i != 7-1);
78 }
79 &set_label("maw_end",0);
80 &mov("eax",$c);
81
82 &pop("ecx"); # clear variable from
83
84 &function_end($name);
85 }
86
871;
diff --git a/src/lib/libcrypto/bn/asm/x86/sqr.pl b/src/lib/libcrypto/bn/asm/x86/sqr.pl
deleted file mode 100644
index 1f90993cf6..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/sqr.pl
+++ /dev/null
@@ -1,60 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub bn_sqr_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9
10 &comment("");
11 $r="esi";
12 $a="edi";
13 $num="ebx";
14
15 &mov($r,&wparam(0)); #
16 &mov($a,&wparam(1)); #
17 &mov($num,&wparam(2)); #
18
19 &and($num,0xfffffff8); # num / 8
20 &jz(&label("sw_finish"));
21
22 &set_label("sw_loop",0);
23 for ($i=0; $i<32; $i+=4)
24 {
25 &comment("Round $i");
26 &mov("eax",&DWP($i,$a,"",0)); # *a
27 # XXX
28 &mul("eax"); # *a * *a
29 &mov(&DWP($i*2,$r,"",0),"eax"); #
30 &mov(&DWP($i*2+4,$r,"",0),"edx");#
31 }
32
33 &comment("");
34 &add($a,32);
35 &add($r,64);
36 &sub($num,8);
37 &jnz(&label("sw_loop"));
38
39 &set_label("sw_finish",0);
40 &mov($num,&wparam(2)); # get num
41 &and($num,7);
42 &jz(&label("sw_end"));
43
44 for ($i=0; $i<7; $i++)
45 {
46 &comment("Tail Round $i");
47 &mov("eax",&DWP($i*4,$a,"",0)); # *a
48 # XXX
49 &mul("eax"); # *a * *a
50 &mov(&DWP($i*8,$r,"",0),"eax"); #
51 &dec($num) if ($i != 7-1);
52 &mov(&DWP($i*8+4,$r,"",0),"edx");
53 &jz(&label("sw_end")) if ($i != 7-1);
54 }
55 &set_label("sw_end",0);
56
57 &function_end($name);
58 }
59
601;
diff --git a/src/lib/libcrypto/bn/asm/x86/sub.pl b/src/lib/libcrypto/bn/asm/x86/sub.pl
deleted file mode 100644
index 837b0e1b07..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/sub.pl
+++ /dev/null
@@ -1,76 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub bn_sub_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9
10 &comment("");
11 $a="esi";
12 $b="edi";
13 $c="eax";
14 $r="ebx";
15 $tmp1="ecx";
16 $tmp2="edx";
17 $num="ebp";
18
19 &mov($r,&wparam(0)); # get r
20 &mov($a,&wparam(1)); # get a
21 &mov($b,&wparam(2)); # get b
22 &mov($num,&wparam(3)); # get num
23 &xor($c,$c); # clear carry
24 &and($num,0xfffffff8); # num / 8
25
26 &jz(&label("aw_finish"));
27
28 &set_label("aw_loop",0);
29 for ($i=0; $i<8; $i++)
30 {
31 &comment("Round $i");
32
33 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
34 &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
35 &sub($tmp1,$c);
36 &mov($c,0);
37 &adc($c,$c);
38 &sub($tmp1,$tmp2);
39 &adc($c,0);
40 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
41 }
42
43 &comment("");
44 &add($a,32);
45 &add($b,32);
46 &add($r,32);
47 &sub($num,8);
48 &jnz(&label("aw_loop"));
49
50 &set_label("aw_finish",0);
51 &mov($num,&wparam(3)); # get num
52 &and($num,7);
53 &jz(&label("aw_end"));
54
55 for ($i=0; $i<7; $i++)
56 {
57 &comment("Tail Round $i");
58 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
59 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
60 &sub($tmp1,$c);
61 &mov($c,0);
62 &adc($c,$c);
63 &sub($tmp1,$tmp2);
64 &adc($c,0);
65 &dec($num) if ($i != 6);
66 &mov(&DWP($i*4,$r,"",0),$tmp1); # *a
67 &jz(&label("aw_end")) if ($i != 6);
68 }
69 &set_label("aw_end",0);
70
71# &mov("eax",$c); # $c is "eax"
72
73 &function_end($name);
74 }
75
761;
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gcc.c b/src/lib/libcrypto/bn/asm/x86_64-gcc.c
deleted file mode 100644
index acb0b40118..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-gcc.c
+++ /dev/null
@@ -1,606 +0,0 @@
1#include "../bn_lcl.h"
2#if !(defined(__GNUC__) && __GNUC__>=2)
3# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
4#else
5/*
6 * x86_64 BIGNUM accelerator version 0.1, December 2002.
7 *
8 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9 * project.
10 *
11 * Rights for redistribution and usage in source and binary forms are
12 * granted according to the OpenSSL license. Warranty of any kind is
13 * disclaimed.
14 *
15 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
16 * versions, like 1.0...
17 * A. Well, that's because this code is basically a quick-n-dirty
18 * proof-of-concept hack. As you can see it's implemented with
19 * inline assembler, which means that you're bound to GCC and that
20 * there might be enough room for further improvement.
21 *
22 * Q. Why inline assembler?
23 * A. x86_64 features own ABI which I'm not familiar with. This is
24 * why I decided to let the compiler take care of subroutine
25 * prologue/epilogue as well as register allocation. For reference.
26 * Win64 implements different ABI for AMD64, different from Linux.
27 *
28 * Q. How much faster does it get?
29 * A. 'apps/openssl speed rsa dsa' output with no-asm:
30 *
31 * sign verify sign/s verify/s
32 * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
33 * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
34 * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
35 * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
36 * sign verify sign/s verify/s
37 * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
38 * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
39 * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
40 *
41 * 'apps/openssl speed rsa dsa' output with this module:
42 *
43 * sign verify sign/s verify/s
44 * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
45 * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
46 * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
47 * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
48 * sign verify sign/s verify/s
49 * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
50 * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
51 * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
52 *
53 * For the reference. IA-32 assembler implementation performs
54 * very much like 64-bit code compiled with no-asm on the same
55 * machine.
56 */
57
58#ifdef _WIN64
59#define BN_ULONG unsigned long long
60#else
61#define BN_ULONG unsigned long
62#endif
63
64#undef mul
65#undef mul_add
66#undef sqr
67
68/*
69 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
70 * "g"(0) let the compiler to decide where does it
71 * want to keep the value of zero;
72 */
73#define mul_add(r,a,word,carry) do { \
74 register BN_ULONG high,low; \
75 asm ("mulq %3" \
76 : "=a"(low),"=d"(high) \
77 : "a"(word),"m"(a) \
78 : "cc"); \
79 asm ("addq %2,%0; adcq %3,%1" \
80 : "+r"(carry),"+d"(high)\
81 : "a"(low),"g"(0) \
82 : "cc"); \
83 asm ("addq %2,%0; adcq %3,%1" \
84 : "+m"(r),"+d"(high) \
85 : "r"(carry),"g"(0) \
86 : "cc"); \
87 carry=high; \
88 } while (0)
89
90#define mul(r,a,word,carry) do { \
91 register BN_ULONG high,low; \
92 asm ("mulq %3" \
93 : "=a"(low),"=d"(high) \
94 : "a"(word),"g"(a) \
95 : "cc"); \
96 asm ("addq %2,%0; adcq %3,%1" \
97 : "+r"(carry),"+d"(high)\
98 : "a"(low),"g"(0) \
99 : "cc"); \
100 (r)=carry, carry=high; \
101 } while (0)
102
103#define sqr(r0,r1,a) \
104 asm ("mulq %2" \
105 : "=a"(r0),"=d"(r1) \
106 : "a"(a) \
107 : "cc");
108
109BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
110 {
111 BN_ULONG c1=0;
112
113 if (num <= 0) return(c1);
114
115 while (num&~3)
116 {
117 mul_add(rp[0],ap[0],w,c1);
118 mul_add(rp[1],ap[1],w,c1);
119 mul_add(rp[2],ap[2],w,c1);
120 mul_add(rp[3],ap[3],w,c1);
121 ap+=4; rp+=4; num-=4;
122 }
123 if (num)
124 {
125 mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
126 mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
127 mul_add(rp[2],ap[2],w,c1); return c1;
128 }
129
130 return(c1);
131 }
132
133BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
134 {
135 BN_ULONG c1=0;
136
137 if (num <= 0) return(c1);
138
139 while (num&~3)
140 {
141 mul(rp[0],ap[0],w,c1);
142 mul(rp[1],ap[1],w,c1);
143 mul(rp[2],ap[2],w,c1);
144 mul(rp[3],ap[3],w,c1);
145 ap+=4; rp+=4; num-=4;
146 }
147 if (num)
148 {
149 mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
150 mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
151 mul(rp[2],ap[2],w,c1);
152 }
153 return(c1);
154 }
155
156void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
157 {
158 if (n <= 0) return;
159
160 while (n&~3)
161 {
162 sqr(r[0],r[1],a[0]);
163 sqr(r[2],r[3],a[1]);
164 sqr(r[4],r[5],a[2]);
165 sqr(r[6],r[7],a[3]);
166 a+=4; r+=8; n-=4;
167 }
168 if (n)
169 {
170 sqr(r[0],r[1],a[0]); if (--n == 0) return;
171 sqr(r[2],r[3],a[1]); if (--n == 0) return;
172 sqr(r[4],r[5],a[2]);
173 }
174 }
175
176BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
177{ BN_ULONG ret,waste;
178
179 asm ("divq %4"
180 : "=a"(ret),"=d"(waste)
181 : "a"(l),"d"(h),"g"(d)
182 : "cc");
183
184 return ret;
185}
186
187BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
188{ BN_ULONG ret=0,i=0;
189
190 if (n <= 0) return 0;
191
192 asm (
193 " subq %2,%2 \n"
194 ".p2align 4 \n"
195 "1: movq (%4,%2,8),%0 \n"
196 " adcq (%5,%2,8),%0 \n"
197 " movq %0,(%3,%2,8) \n"
198 " leaq 1(%2),%2 \n"
199 " loop 1b \n"
200 " sbbq %0,%0 \n"
201 : "=&a"(ret),"+c"(n),"=&r"(i)
202 : "r"(rp),"r"(ap),"r"(bp)
203 : "cc"
204 );
205
206 return ret&1;
207}
208
209#ifndef SIMICS
210BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
211{ BN_ULONG ret=0,i=0;
212
213 if (n <= 0) return 0;
214
215 asm (
216 " subq %2,%2 \n"
217 ".p2align 4 \n"
218 "1: movq (%4,%2,8),%0 \n"
219 " sbbq (%5,%2,8),%0 \n"
220 " movq %0,(%3,%2,8) \n"
221 " leaq 1(%2),%2 \n"
222 " loop 1b \n"
223 " sbbq %0,%0 \n"
224 : "=&a"(ret),"+c"(n),"=&r"(i)
225 : "r"(rp),"r"(ap),"r"(bp)
226 : "cc"
227 );
228
229 return ret&1;
230}
231#else
232/* Simics 1.4<7 has buggy sbbq:-( */
233#define BN_MASK2 0xffffffffffffffffL
234BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
235 {
236 BN_ULONG t1,t2;
237 int c=0;
238
239 if (n <= 0) return((BN_ULONG)0);
240
241 for (;;)
242 {
243 t1=a[0]; t2=b[0];
244 r[0]=(t1-t2-c)&BN_MASK2;
245 if (t1 != t2) c=(t1 < t2);
246 if (--n <= 0) break;
247
248 t1=a[1]; t2=b[1];
249 r[1]=(t1-t2-c)&BN_MASK2;
250 if (t1 != t2) c=(t1 < t2);
251 if (--n <= 0) break;
252
253 t1=a[2]; t2=b[2];
254 r[2]=(t1-t2-c)&BN_MASK2;
255 if (t1 != t2) c=(t1 < t2);
256 if (--n <= 0) break;
257
258 t1=a[3]; t2=b[3];
259 r[3]=(t1-t2-c)&BN_MASK2;
260 if (t1 != t2) c=(t1 < t2);
261 if (--n <= 0) break;
262
263 a+=4;
264 b+=4;
265 r+=4;
266 }
267 return(c);
268 }
269#endif
270
271/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
272/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
273/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
274/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
275
276#if 0
277/* original macros are kept for reference purposes */
278#define mul_add_c(a,b,c0,c1,c2) { \
279 BN_ULONG ta=(a),tb=(b); \
280 t1 = ta * tb; \
281 t2 = BN_UMULT_HIGH(ta,tb); \
282 c0 += t1; t2 += (c0<t1)?1:0; \
283 c1 += t2; c2 += (c1<t2)?1:0; \
284 }
285
286#define mul_add_c2(a,b,c0,c1,c2) { \
287 BN_ULONG ta=(a),tb=(b),t0; \
288 t1 = BN_UMULT_HIGH(ta,tb); \
289 t0 = ta * tb; \
290 t2 = t1+t1; c2 += (t2<t1)?1:0; \
291 t1 = t0+t0; t2 += (t1<t0)?1:0; \
292 c0 += t1; t2 += (c0<t1)?1:0; \
293 c1 += t2; c2 += (c1<t2)?1:0; \
294 }
295#else
296#define mul_add_c(a,b,c0,c1,c2) do { \
297 asm ("mulq %3" \
298 : "=a"(t1),"=d"(t2) \
299 : "a"(a),"m"(b) \
300 : "cc"); \
301 asm ("addq %2,%0; adcq %3,%1" \
302 : "+r"(c0),"+d"(t2) \
303 : "a"(t1),"g"(0) \
304 : "cc"); \
305 asm ("addq %2,%0; adcq %3,%1" \
306 : "+r"(c1),"+r"(c2) \
307 : "d"(t2),"g"(0) \
308 : "cc"); \
309 } while (0)
310
311#define sqr_add_c(a,i,c0,c1,c2) do { \
312 asm ("mulq %2" \
313 : "=a"(t1),"=d"(t2) \
314 : "a"(a[i]) \
315 : "cc"); \
316 asm ("addq %2,%0; adcq %3,%1" \
317 : "+r"(c0),"+d"(t2) \
318 : "a"(t1),"g"(0) \
319 : "cc"); \
320 asm ("addq %2,%0; adcq %3,%1" \
321 : "+r"(c1),"+r"(c2) \
322 : "d"(t2),"g"(0) \
323 : "cc"); \
324 } while (0)
325
326#define mul_add_c2(a,b,c0,c1,c2) do { \
327 asm ("mulq %3" \
328 : "=a"(t1),"=d"(t2) \
329 : "a"(a),"m"(b) \
330 : "cc"); \
331 asm ("addq %0,%0; adcq %2,%1" \
332 : "+d"(t2),"+r"(c2) \
333 : "g"(0) \
334 : "cc"); \
335 asm ("addq %0,%0; adcq %2,%1" \
336 : "+a"(t1),"+d"(t2) \
337 : "g"(0) \
338 : "cc"); \
339 asm ("addq %2,%0; adcq %3,%1" \
340 : "+r"(c0),"+d"(t2) \
341 : "a"(t1),"g"(0) \
342 : "cc"); \
343 asm ("addq %2,%0; adcq %3,%1" \
344 : "+r"(c1),"+r"(c2) \
345 : "d"(t2),"g"(0) \
346 : "cc"); \
347 } while (0)
348#endif
349
350#define sqr_add_c2(a,i,j,c0,c1,c2) \
351 mul_add_c2((a)[i],(a)[j],c0,c1,c2)
352
353void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
354 {
355 BN_ULONG t1,t2;
356 BN_ULONG c1,c2,c3;
357
358 c1=0;
359 c2=0;
360 c3=0;
361 mul_add_c(a[0],b[0],c1,c2,c3);
362 r[0]=c1;
363 c1=0;
364 mul_add_c(a[0],b[1],c2,c3,c1);
365 mul_add_c(a[1],b[0],c2,c3,c1);
366 r[1]=c2;
367 c2=0;
368 mul_add_c(a[2],b[0],c3,c1,c2);
369 mul_add_c(a[1],b[1],c3,c1,c2);
370 mul_add_c(a[0],b[2],c3,c1,c2);
371 r[2]=c3;
372 c3=0;
373 mul_add_c(a[0],b[3],c1,c2,c3);
374 mul_add_c(a[1],b[2],c1,c2,c3);
375 mul_add_c(a[2],b[1],c1,c2,c3);
376 mul_add_c(a[3],b[0],c1,c2,c3);
377 r[3]=c1;
378 c1=0;
379 mul_add_c(a[4],b[0],c2,c3,c1);
380 mul_add_c(a[3],b[1],c2,c3,c1);
381 mul_add_c(a[2],b[2],c2,c3,c1);
382 mul_add_c(a[1],b[3],c2,c3,c1);
383 mul_add_c(a[0],b[4],c2,c3,c1);
384 r[4]=c2;
385 c2=0;
386 mul_add_c(a[0],b[5],c3,c1,c2);
387 mul_add_c(a[1],b[4],c3,c1,c2);
388 mul_add_c(a[2],b[3],c3,c1,c2);
389 mul_add_c(a[3],b[2],c3,c1,c2);
390 mul_add_c(a[4],b[1],c3,c1,c2);
391 mul_add_c(a[5],b[0],c3,c1,c2);
392 r[5]=c3;
393 c3=0;
394 mul_add_c(a[6],b[0],c1,c2,c3);
395 mul_add_c(a[5],b[1],c1,c2,c3);
396 mul_add_c(a[4],b[2],c1,c2,c3);
397 mul_add_c(a[3],b[3],c1,c2,c3);
398 mul_add_c(a[2],b[4],c1,c2,c3);
399 mul_add_c(a[1],b[5],c1,c2,c3);
400 mul_add_c(a[0],b[6],c1,c2,c3);
401 r[6]=c1;
402 c1=0;
403 mul_add_c(a[0],b[7],c2,c3,c1);
404 mul_add_c(a[1],b[6],c2,c3,c1);
405 mul_add_c(a[2],b[5],c2,c3,c1);
406 mul_add_c(a[3],b[4],c2,c3,c1);
407 mul_add_c(a[4],b[3],c2,c3,c1);
408 mul_add_c(a[5],b[2],c2,c3,c1);
409 mul_add_c(a[6],b[1],c2,c3,c1);
410 mul_add_c(a[7],b[0],c2,c3,c1);
411 r[7]=c2;
412 c2=0;
413 mul_add_c(a[7],b[1],c3,c1,c2);
414 mul_add_c(a[6],b[2],c3,c1,c2);
415 mul_add_c(a[5],b[3],c3,c1,c2);
416 mul_add_c(a[4],b[4],c3,c1,c2);
417 mul_add_c(a[3],b[5],c3,c1,c2);
418 mul_add_c(a[2],b[6],c3,c1,c2);
419 mul_add_c(a[1],b[7],c3,c1,c2);
420 r[8]=c3;
421 c3=0;
422 mul_add_c(a[2],b[7],c1,c2,c3);
423 mul_add_c(a[3],b[6],c1,c2,c3);
424 mul_add_c(a[4],b[5],c1,c2,c3);
425 mul_add_c(a[5],b[4],c1,c2,c3);
426 mul_add_c(a[6],b[3],c1,c2,c3);
427 mul_add_c(a[7],b[2],c1,c2,c3);
428 r[9]=c1;
429 c1=0;
430 mul_add_c(a[7],b[3],c2,c3,c1);
431 mul_add_c(a[6],b[4],c2,c3,c1);
432 mul_add_c(a[5],b[5],c2,c3,c1);
433 mul_add_c(a[4],b[6],c2,c3,c1);
434 mul_add_c(a[3],b[7],c2,c3,c1);
435 r[10]=c2;
436 c2=0;
437 mul_add_c(a[4],b[7],c3,c1,c2);
438 mul_add_c(a[5],b[6],c3,c1,c2);
439 mul_add_c(a[6],b[5],c3,c1,c2);
440 mul_add_c(a[7],b[4],c3,c1,c2);
441 r[11]=c3;
442 c3=0;
443 mul_add_c(a[7],b[5],c1,c2,c3);
444 mul_add_c(a[6],b[6],c1,c2,c3);
445 mul_add_c(a[5],b[7],c1,c2,c3);
446 r[12]=c1;
447 c1=0;
448 mul_add_c(a[6],b[7],c2,c3,c1);
449 mul_add_c(a[7],b[6],c2,c3,c1);
450 r[13]=c2;
451 c2=0;
452 mul_add_c(a[7],b[7],c3,c1,c2);
453 r[14]=c3;
454 r[15]=c1;
455 }
456
457void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
458 {
459 BN_ULONG t1,t2;
460 BN_ULONG c1,c2,c3;
461
462 c1=0;
463 c2=0;
464 c3=0;
465 mul_add_c(a[0],b[0],c1,c2,c3);
466 r[0]=c1;
467 c1=0;
468 mul_add_c(a[0],b[1],c2,c3,c1);
469 mul_add_c(a[1],b[0],c2,c3,c1);
470 r[1]=c2;
471 c2=0;
472 mul_add_c(a[2],b[0],c3,c1,c2);
473 mul_add_c(a[1],b[1],c3,c1,c2);
474 mul_add_c(a[0],b[2],c3,c1,c2);
475 r[2]=c3;
476 c3=0;
477 mul_add_c(a[0],b[3],c1,c2,c3);
478 mul_add_c(a[1],b[2],c1,c2,c3);
479 mul_add_c(a[2],b[1],c1,c2,c3);
480 mul_add_c(a[3],b[0],c1,c2,c3);
481 r[3]=c1;
482 c1=0;
483 mul_add_c(a[3],b[1],c2,c3,c1);
484 mul_add_c(a[2],b[2],c2,c3,c1);
485 mul_add_c(a[1],b[3],c2,c3,c1);
486 r[4]=c2;
487 c2=0;
488 mul_add_c(a[2],b[3],c3,c1,c2);
489 mul_add_c(a[3],b[2],c3,c1,c2);
490 r[5]=c3;
491 c3=0;
492 mul_add_c(a[3],b[3],c1,c2,c3);
493 r[6]=c1;
494 r[7]=c2;
495 }
496
497void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
498 {
499 BN_ULONG t1,t2;
500 BN_ULONG c1,c2,c3;
501
502 c1=0;
503 c2=0;
504 c3=0;
505 sqr_add_c(a,0,c1,c2,c3);
506 r[0]=c1;
507 c1=0;
508 sqr_add_c2(a,1,0,c2,c3,c1);
509 r[1]=c2;
510 c2=0;
511 sqr_add_c(a,1,c3,c1,c2);
512 sqr_add_c2(a,2,0,c3,c1,c2);
513 r[2]=c3;
514 c3=0;
515 sqr_add_c2(a,3,0,c1,c2,c3);
516 sqr_add_c2(a,2,1,c1,c2,c3);
517 r[3]=c1;
518 c1=0;
519 sqr_add_c(a,2,c2,c3,c1);
520 sqr_add_c2(a,3,1,c2,c3,c1);
521 sqr_add_c2(a,4,0,c2,c3,c1);
522 r[4]=c2;
523 c2=0;
524 sqr_add_c2(a,5,0,c3,c1,c2);
525 sqr_add_c2(a,4,1,c3,c1,c2);
526 sqr_add_c2(a,3,2,c3,c1,c2);
527 r[5]=c3;
528 c3=0;
529 sqr_add_c(a,3,c1,c2,c3);
530 sqr_add_c2(a,4,2,c1,c2,c3);
531 sqr_add_c2(a,5,1,c1,c2,c3);
532 sqr_add_c2(a,6,0,c1,c2,c3);
533 r[6]=c1;
534 c1=0;
535 sqr_add_c2(a,7,0,c2,c3,c1);
536 sqr_add_c2(a,6,1,c2,c3,c1);
537 sqr_add_c2(a,5,2,c2,c3,c1);
538 sqr_add_c2(a,4,3,c2,c3,c1);
539 r[7]=c2;
540 c2=0;
541 sqr_add_c(a,4,c3,c1,c2);
542 sqr_add_c2(a,5,3,c3,c1,c2);
543 sqr_add_c2(a,6,2,c3,c1,c2);
544 sqr_add_c2(a,7,1,c3,c1,c2);
545 r[8]=c3;
546 c3=0;
547 sqr_add_c2(a,7,2,c1,c2,c3);
548 sqr_add_c2(a,6,3,c1,c2,c3);
549 sqr_add_c2(a,5,4,c1,c2,c3);
550 r[9]=c1;
551 c1=0;
552 sqr_add_c(a,5,c2,c3,c1);
553 sqr_add_c2(a,6,4,c2,c3,c1);
554 sqr_add_c2(a,7,3,c2,c3,c1);
555 r[10]=c2;
556 c2=0;
557 sqr_add_c2(a,7,4,c3,c1,c2);
558 sqr_add_c2(a,6,5,c3,c1,c2);
559 r[11]=c3;
560 c3=0;
561 sqr_add_c(a,6,c1,c2,c3);
562 sqr_add_c2(a,7,5,c1,c2,c3);
563 r[12]=c1;
564 c1=0;
565 sqr_add_c2(a,7,6,c2,c3,c1);
566 r[13]=c2;
567 c2=0;
568 sqr_add_c(a,7,c3,c1,c2);
569 r[14]=c3;
570 r[15]=c1;
571 }
572
573void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
574 {
575 BN_ULONG t1,t2;
576 BN_ULONG c1,c2,c3;
577
578 c1=0;
579 c2=0;
580 c3=0;
581 sqr_add_c(a,0,c1,c2,c3);
582 r[0]=c1;
583 c1=0;
584 sqr_add_c2(a,1,0,c2,c3,c1);
585 r[1]=c2;
586 c2=0;
587 sqr_add_c(a,1,c3,c1,c2);
588 sqr_add_c2(a,2,0,c3,c1,c2);
589 r[2]=c3;
590 c3=0;
591 sqr_add_c2(a,3,0,c1,c2,c3);
592 sqr_add_c2(a,2,1,c1,c2,c3);
593 r[3]=c1;
594 c1=0;
595 sqr_add_c(a,2,c2,c3,c1);
596 sqr_add_c2(a,3,1,c2,c3,c1);
597 r[4]=c2;
598 c2=0;
599 sqr_add_c2(a,3,2,c3,c1,c2);
600 r[5]=c3;
601 c3=0;
602 sqr_add_c(a,3,c1,c2,c3);
603 r[6]=c1;
604 r[7]=c2;
605 }
606#endif
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont.pl b/src/lib/libcrypto/bn/asm/x86_64-mont.pl
deleted file mode 100755
index 3b7a6f243f..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-mont.pl
+++ /dev/null
@@ -1,330 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# October 2005.
11#
12# Montgomery multiplication routine for x86_64. While it gives modest
13# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
14# than twice, >2x, as fast. Most common rsa1024 sign is improved by
15# respectful 50%. It remains to be seen if loop unrolling and
16# dedicated squaring routine can provide further improvement...
17
18$flavour = shift;
19$output = shift;
20if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
21
22$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
23
24$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
25( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
26( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
27die "can't locate x86_64-xlate.pl";
28
29open STDOUT,"| $^X $xlate $flavour $output";
30
31# int bn_mul_mont(
32$rp="%rdi"; # BN_ULONG *rp,
33$ap="%rsi"; # const BN_ULONG *ap,
34$bp="%rdx"; # const BN_ULONG *bp,
35$np="%rcx"; # const BN_ULONG *np,
36$n0="%r8"; # const BN_ULONG *n0,
37$num="%r9"; # int num);
38$lo0="%r10";
39$hi0="%r11";
40$bp="%r12"; # reassign $bp
41$hi1="%r13";
42$i="%r14";
43$j="%r15";
44$m0="%rbx";
45$m1="%rbp";
46
47$code=<<___;
48.text
49
50.globl bn_mul_mont
51.type bn_mul_mont,\@function,6
52.align 16
53bn_mul_mont:
54 push %rbx
55 push %rbp
56 push %r12
57 push %r13
58 push %r14
59 push %r15
60
61 mov ${num}d,${num}d
62 lea 2($num),%r10
63 mov %rsp,%r11
64 neg %r10
65 lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2))
66 and \$-1024,%rsp # minimize TLB usage
67
68 mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
69.Lprologue:
70 mov %rdx,$bp # $bp reassigned, remember?
71
72 mov ($n0),$n0 # pull n0[0] value
73
74 xor $i,$i # i=0
75 xor $j,$j # j=0
76
77 mov ($bp),$m0 # m0=bp[0]
78 mov ($ap),%rax
79 mulq $m0 # ap[0]*bp[0]
80 mov %rax,$lo0
81 mov %rdx,$hi0
82
83 imulq $n0,%rax # "tp[0]"*n0
84 mov %rax,$m1
85
86 mulq ($np) # np[0]*m1
87 add $lo0,%rax # discarded
88 adc \$0,%rdx
89 mov %rdx,$hi1
90
91 lea 1($j),$j # j++
92.L1st:
93 mov ($ap,$j,8),%rax
94 mulq $m0 # ap[j]*bp[0]
95 add $hi0,%rax
96 adc \$0,%rdx
97 mov %rax,$lo0
98 mov ($np,$j,8),%rax
99 mov %rdx,$hi0
100
101 mulq $m1 # np[j]*m1
102 add $hi1,%rax
103 lea 1($j),$j # j++
104 adc \$0,%rdx
105 add $lo0,%rax # np[j]*m1+ap[j]*bp[0]
106 adc \$0,%rdx
107 mov %rax,-16(%rsp,$j,8) # tp[j-1]
108 cmp $num,$j
109 mov %rdx,$hi1
110 jl .L1st
111
112 xor %rdx,%rdx
113 add $hi0,$hi1
114 adc \$0,%rdx
115 mov $hi1,-8(%rsp,$num,8)
116 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
117
118 lea 1($i),$i # i++
119.align 4
120.Louter:
121 xor $j,$j # j=0
122
123 mov ($bp,$i,8),$m0 # m0=bp[i]
124 mov ($ap),%rax # ap[0]
125 mulq $m0 # ap[0]*bp[i]
126 add (%rsp),%rax # ap[0]*bp[i]+tp[0]
127 adc \$0,%rdx
128 mov %rax,$lo0
129 mov %rdx,$hi0
130
131 imulq $n0,%rax # tp[0]*n0
132 mov %rax,$m1
133
134 mulq ($np,$j,8) # np[0]*m1
135 add $lo0,%rax # discarded
136 mov 8(%rsp),$lo0 # tp[1]
137 adc \$0,%rdx
138 mov %rdx,$hi1
139
140 lea 1($j),$j # j++
141.align 4
142.Linner:
143 mov ($ap,$j,8),%rax
144 mulq $m0 # ap[j]*bp[i]
145 add $hi0,%rax
146 adc \$0,%rdx
147 add %rax,$lo0 # ap[j]*bp[i]+tp[j]
148 mov ($np,$j,8),%rax
149 adc \$0,%rdx
150 mov %rdx,$hi0
151
152 mulq $m1 # np[j]*m1
153 add $hi1,%rax
154 lea 1($j),$j # j++
155 adc \$0,%rdx
156 add $lo0,%rax # np[j]*m1+ap[j]*bp[i]+tp[j]
157 adc \$0,%rdx
158 mov (%rsp,$j,8),$lo0
159 cmp $num,$j
160 mov %rax,-16(%rsp,$j,8) # tp[j-1]
161 mov %rdx,$hi1
162 jl .Linner
163
164 xor %rdx,%rdx
165 add $hi0,$hi1
166 adc \$0,%rdx
167 add $lo0,$hi1 # pull upmost overflow bit
168 adc \$0,%rdx
169 mov $hi1,-8(%rsp,$num,8)
170 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
171
172 lea 1($i),$i # i++
173 cmp $num,$i
174 jl .Louter
175
176 lea (%rsp),$ap # borrow ap for tp
177 lea -1($num),$j # j=num-1
178
179 mov ($ap),%rax # tp[0]
180 xor $i,$i # i=0 and clear CF!
181 jmp .Lsub
182.align 16
183.Lsub: sbb ($np,$i,8),%rax
184 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
185 dec $j # doesn't affect CF!
186 mov 8($ap,$i,8),%rax # tp[i+1]
187 lea 1($i),$i # i++
188 jge .Lsub
189
190 sbb \$0,%rax # handle upmost overflow bit
191 and %rax,$ap
192 not %rax
193 mov $rp,$np
194 and %rax,$np
195 lea -1($num),$j
196 or $np,$ap # ap=borrow?tp:rp
197.align 16
198.Lcopy: # copy or in-place refresh
199 mov ($ap,$j,8),%rax
200 mov %rax,($rp,$j,8) # rp[i]=tp[i]
201 mov $i,(%rsp,$j,8) # zap temporary vector
202 dec $j
203 jge .Lcopy
204
205 mov 8(%rsp,$num,8),%rsi # restore %rsp
206 mov \$1,%rax
207 mov (%rsi),%r15
208 mov 8(%rsi),%r14
209 mov 16(%rsi),%r13
210 mov 24(%rsi),%r12
211 mov 32(%rsi),%rbp
212 mov 40(%rsi),%rbx
213 lea 48(%rsi),%rsp
214.Lepilogue:
215 ret
216.size bn_mul_mont,.-bn_mul_mont
217.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
218.align 16
219___
220
221# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
222# CONTEXT *context,DISPATCHER_CONTEXT *disp)
223if ($win64) {
224$rec="%rcx";
225$frame="%rdx";
226$context="%r8";
227$disp="%r9";
228
229$code.=<<___;
230.extern __imp_RtlVirtualUnwind
231.type se_handler,\@abi-omnipotent
232.align 16
233se_handler:
234 push %rsi
235 push %rdi
236 push %rbx
237 push %rbp
238 push %r12
239 push %r13
240 push %r14
241 push %r15
242 pushfq
243 sub \$64,%rsp
244
245 mov 120($context),%rax # pull context->Rax
246 mov 248($context),%rbx # pull context->Rip
247
248 lea .Lprologue(%rip),%r10
249 cmp %r10,%rbx # context->Rip<.Lprologue
250 jb .Lin_prologue
251
252 mov 152($context),%rax # pull context->Rsp
253
254 lea .Lepilogue(%rip),%r10
255 cmp %r10,%rbx # context->Rip>=.Lepilogue
256 jae .Lin_prologue
257
258 mov 192($context),%r10 # pull $num
259 mov 8(%rax,%r10,8),%rax # pull saved stack pointer
260 lea 48(%rax),%rax
261
262 mov -8(%rax),%rbx
263 mov -16(%rax),%rbp
264 mov -24(%rax),%r12
265 mov -32(%rax),%r13
266 mov -40(%rax),%r14
267 mov -48(%rax),%r15
268 mov %rbx,144($context) # restore context->Rbx
269 mov %rbp,160($context) # restore context->Rbp
270 mov %r12,216($context) # restore context->R12
271 mov %r13,224($context) # restore context->R13
272 mov %r14,232($context) # restore context->R14
273 mov %r15,240($context) # restore context->R15
274
275.Lin_prologue:
276 mov 8(%rax),%rdi
277 mov 16(%rax),%rsi
278 mov %rax,152($context) # restore context->Rsp
279 mov %rsi,168($context) # restore context->Rsi
280 mov %rdi,176($context) # restore context->Rdi
281
282 mov 40($disp),%rdi # disp->ContextRecord
283 mov $context,%rsi # context
284 mov \$154,%ecx # sizeof(CONTEXT)
285 .long 0xa548f3fc # cld; rep movsq
286
287 mov $disp,%rsi
288 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
289 mov 8(%rsi),%rdx # arg2, disp->ImageBase
290 mov 0(%rsi),%r8 # arg3, disp->ControlPc
291 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
292 mov 40(%rsi),%r10 # disp->ContextRecord
293 lea 56(%rsi),%r11 # &disp->HandlerData
294 lea 24(%rsi),%r12 # &disp->EstablisherFrame
295 mov %r10,32(%rsp) # arg5
296 mov %r11,40(%rsp) # arg6
297 mov %r12,48(%rsp) # arg7
298 mov %rcx,56(%rsp) # arg8, (NULL)
299 call *__imp_RtlVirtualUnwind(%rip)
300
301 mov \$1,%eax # ExceptionContinueSearch
302 add \$64,%rsp
303 popfq
304 pop %r15
305 pop %r14
306 pop %r13
307 pop %r12
308 pop %rbp
309 pop %rbx
310 pop %rdi
311 pop %rsi
312 ret
313.size se_handler,.-se_handler
314
315.section .pdata
316.align 4
317 .rva .LSEH_begin_bn_mul_mont
318 .rva .LSEH_end_bn_mul_mont
319 .rva .LSEH_info_bn_mul_mont
320
321.section .xdata
322.align 8
323.LSEH_info_bn_mul_mont:
324 .byte 9,0,0,0
325 .rva se_handler
326___
327}
328
329print $code;
330close STDOUT;