diff options
Diffstat (limited to 'src/lib/libcrypto/bn/asm/bn-586.pl')
-rw-r--r-- | src/lib/libcrypto/bn/asm/bn-586.pl | 203 |
1 files changed, 151 insertions, 52 deletions
diff --git a/src/lib/libcrypto/bn/asm/bn-586.pl b/src/lib/libcrypto/bn/asm/bn-586.pl index 26c2685a72..332ef3e91d 100644 --- a/src/lib/libcrypto/bn/asm/bn-586.pl +++ b/src/lib/libcrypto/bn/asm/bn-586.pl | |||
@@ -1,6 +1,7 @@ | |||
1 | #!/usr/local/bin/perl | 1 | #!/usr/local/bin/perl |
2 | 2 | ||
3 | push(@INC,"perlasm","../../perlasm"); | 3 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
4 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
4 | require "x86asm.pl"; | 5 | require "x86asm.pl"; |
5 | 6 | ||
6 | &asm_init($ARGV[0],$0); | 7 | &asm_init($ARGV[0],$0); |
@@ -24,38 +25,25 @@ sub bn_mul_add_words | |||
24 | { | 25 | { |
25 | local($name)=@_; | 26 | local($name)=@_; |
26 | 27 | ||
27 | &function_begin($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":""); | 28 | &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":""); |
28 | 29 | ||
29 | &comment(""); | 30 | $r="eax"; |
30 | $Low="eax"; | 31 | $a="edx"; |
31 | $High="edx"; | 32 | $c="ecx"; |
32 | $a="ebx"; | ||
33 | $w="ebp"; | ||
34 | $r="edi"; | ||
35 | $c="esi"; | ||
36 | |||
37 | &xor($c,$c); # clear carry | ||
38 | &mov($r,&wparam(0)); # | ||
39 | |||
40 | &mov("ecx",&wparam(2)); # | ||
41 | &mov($a,&wparam(1)); # | ||
42 | |||
43 | &and("ecx",0xfffffff8); # num / 8 | ||
44 | &mov($w,&wparam(3)); # | ||
45 | |||
46 | &push("ecx"); # Up the stack for a tmp variable | ||
47 | |||
48 | &jz(&label("maw_finish")); | ||
49 | 33 | ||
50 | if ($sse2) { | 34 | if ($sse2) { |
51 | &picmeup("eax","OPENSSL_ia32cap_P"); | 35 | &picmeup("eax","OPENSSL_ia32cap_P"); |
52 | &bt(&DWP(0,"eax"),26); | 36 | &bt(&DWP(0,"eax"),26); |
53 | &jnc(&label("maw_loop")); | 37 | &jnc(&label("maw_non_sse2")); |
54 | 38 | ||
55 | &movd("mm0",$w); # mm0 = w | 39 | &mov($r,&wparam(0)); |
40 | &mov($a,&wparam(1)); | ||
41 | &mov($c,&wparam(2)); | ||
42 | &movd("mm0",&wparam(3)); # mm0 = w | ||
56 | &pxor("mm1","mm1"); # mm1 = carry_in | 43 | &pxor("mm1","mm1"); # mm1 = carry_in |
57 | 44 | &jmp(&label("maw_sse2_entry")); | |
58 | &set_label("maw_sse2_loop",0); | 45 | |
46 | &set_label("maw_sse2_unrolled",16); | ||
59 | &movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0] | 47 | &movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0] |
60 | &paddq("mm1","mm3"); # mm1 = carry_in + r[0] | 48 | &paddq("mm1","mm3"); # mm1 = carry_in + r[0] |
61 | &movd("mm2",&DWP(0,$a,"",0)); # mm2 = a[0] | 49 | &movd("mm2",&DWP(0,$a,"",0)); # mm2 = a[0] |
@@ -112,42 +100,82 @@ sub bn_mul_add_words | |||
112 | &psrlq("mm1",32); # mm1 = carry6 | 100 | &psrlq("mm1",32); # mm1 = carry6 |
113 | &paddq("mm1","mm3"); # mm1 = carry6 + r[7] + w*a[7] | 101 | &paddq("mm1","mm3"); # mm1 = carry6 + r[7] + w*a[7] |
114 | &movd(&DWP(28,$r,"",0),"mm1"); | 102 | &movd(&DWP(28,$r,"",0),"mm1"); |
115 | &add($r,32); | 103 | &lea($r,&DWP(32,$r)); |
116 | &psrlq("mm1",32); # mm1 = carry_out | 104 | &psrlq("mm1",32); # mm1 = carry_out |
117 | 105 | ||
118 | &sub("ecx",8); | 106 | &sub($c,8); |
107 | &jz(&label("maw_sse2_exit")); | ||
108 | &set_label("maw_sse2_entry"); | ||
109 | &test($c,0xfffffff8); | ||
110 | &jnz(&label("maw_sse2_unrolled")); | ||
111 | |||
112 | &set_label("maw_sse2_loop",4); | ||
113 | &movd("mm2",&DWP(0,$a)); # mm2 = a[i] | ||
114 | &movd("mm3",&DWP(0,$r)); # mm3 = r[i] | ||
115 | &pmuludq("mm2","mm0"); # a[i] *= w | ||
116 | &lea($a,&DWP(4,$a)); | ||
117 | &paddq("mm1","mm3"); # carry += r[i] | ||
118 | &paddq("mm1","mm2"); # carry += a[i]*w | ||
119 | &movd(&DWP(0,$r),"mm1"); # r[i] = carry_low | ||
120 | &sub($c,1); | ||
121 | &psrlq("mm1",32); # carry = carry_high | ||
122 | &lea($r,&DWP(4,$r)); | ||
119 | &jnz(&label("maw_sse2_loop")); | 123 | &jnz(&label("maw_sse2_loop")); |
120 | 124 | &set_label("maw_sse2_exit"); | |
121 | &movd($c,"mm1"); # c = carry_out | 125 | &movd("eax","mm1"); # c = carry_out |
122 | &emms(); | 126 | &emms(); |
127 | &ret(); | ||
123 | 128 | ||
124 | &jmp(&label("maw_finish")); | 129 | &set_label("maw_non_sse2",16); |
125 | } | 130 | } |
126 | 131 | ||
127 | &set_label("maw_loop",0); | 132 | # function_begin prologue |
133 | &push("ebp"); | ||
134 | &push("ebx"); | ||
135 | &push("esi"); | ||
136 | &push("edi"); | ||
137 | |||
138 | &comment(""); | ||
139 | $Low="eax"; | ||
140 | $High="edx"; | ||
141 | $a="ebx"; | ||
142 | $w="ebp"; | ||
143 | $r="edi"; | ||
144 | $c="esi"; | ||
145 | |||
146 | &xor($c,$c); # clear carry | ||
147 | &mov($r,&wparam(0)); # | ||
148 | |||
149 | &mov("ecx",&wparam(2)); # | ||
150 | &mov($a,&wparam(1)); # | ||
151 | |||
152 | &and("ecx",0xfffffff8); # num / 8 | ||
153 | &mov($w,&wparam(3)); # | ||
128 | 154 | ||
129 | &mov(&swtmp(0),"ecx"); # | 155 | &push("ecx"); # Up the stack for a tmp variable |
156 | |||
157 | &jz(&label("maw_finish")); | ||
158 | |||
159 | &set_label("maw_loop",16); | ||
130 | 160 | ||
131 | for ($i=0; $i<32; $i+=4) | 161 | for ($i=0; $i<32; $i+=4) |
132 | { | 162 | { |
133 | &comment("Round $i"); | 163 | &comment("Round $i"); |
134 | 164 | ||
135 | &mov("eax",&DWP($i,$a,"",0)); # *a | 165 | &mov("eax",&DWP($i,$a)); # *a |
136 | &mul($w); # *a * w | 166 | &mul($w); # *a * w |
137 | &add("eax",$c); # L(t)+= *r | 167 | &add("eax",$c); # L(t)+= c |
138 | &mov($c,&DWP($i,$r,"",0)); # L(t)+= *r | ||
139 | &adc("edx",0); # H(t)+=carry | 168 | &adc("edx",0); # H(t)+=carry |
140 | &add("eax",$c); # L(t)+=c | 169 | &add("eax",&DWP($i,$r)); # L(t)+= *r |
141 | &adc("edx",0); # H(t)+=carry | 170 | &adc("edx",0); # H(t)+=carry |
142 | &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t); | 171 | &mov(&DWP($i,$r),"eax"); # *r= L(t); |
143 | &mov($c,"edx"); # c= H(t); | 172 | &mov($c,"edx"); # c= H(t); |
144 | } | 173 | } |
145 | 174 | ||
146 | &comment(""); | 175 | &comment(""); |
147 | &mov("ecx",&swtmp(0)); # | ||
148 | &add($a,32); | ||
149 | &add($r,32); | ||
150 | &sub("ecx",8); | 176 | &sub("ecx",8); |
177 | &lea($a,&DWP(32,$a)); | ||
178 | &lea($r,&DWP(32,$r)); | ||
151 | &jnz(&label("maw_loop")); | 179 | &jnz(&label("maw_loop")); |
152 | 180 | ||
153 | &set_label("maw_finish",0); | 181 | &set_label("maw_finish",0); |
@@ -160,16 +188,15 @@ sub bn_mul_add_words | |||
160 | for ($i=0; $i<7; $i++) | 188 | for ($i=0; $i<7; $i++) |
161 | { | 189 | { |
162 | &comment("Tail Round $i"); | 190 | &comment("Tail Round $i"); |
163 | &mov("eax",&DWP($i*4,$a,"",0));# *a | 191 | &mov("eax",&DWP($i*4,$a)); # *a |
164 | &mul($w); # *a * w | 192 | &mul($w); # *a * w |
165 | &add("eax",$c); # L(t)+=c | 193 | &add("eax",$c); # L(t)+=c |
166 | &mov($c,&DWP($i*4,$r,"",0)); # L(t)+= *r | ||
167 | &adc("edx",0); # H(t)+=carry | 194 | &adc("edx",0); # H(t)+=carry |
168 | &add("eax",$c); | 195 | &add("eax",&DWP($i*4,$r)); # L(t)+= *r |
169 | &adc("edx",0); # H(t)+=carry | 196 | &adc("edx",0); # H(t)+=carry |
170 | &dec("ecx") if ($i != 7-1); | 197 | &dec("ecx") if ($i != 7-1); |
171 | &mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t); | 198 | &mov(&DWP($i*4,$r),"eax"); # *r= L(t); |
172 | &mov($c,"edx"); # c= H(t); | 199 | &mov($c,"edx"); # c= H(t); |
173 | &jz(&label("maw_end")) if ($i != 7-1); | 200 | &jz(&label("maw_end")) if ($i != 7-1); |
174 | } | 201 | } |
175 | &set_label("maw_end",0); | 202 | &set_label("maw_end",0); |
@@ -184,7 +211,45 @@ sub bn_mul_words | |||
184 | { | 211 | { |
185 | local($name)=@_; | 212 | local($name)=@_; |
186 | 213 | ||
187 | &function_begin($name,""); | 214 | &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":""); |
215 | |||
216 | $r="eax"; | ||
217 | $a="edx"; | ||
218 | $c="ecx"; | ||
219 | |||
220 | if ($sse2) { | ||
221 | &picmeup("eax","OPENSSL_ia32cap_P"); | ||
222 | &bt(&DWP(0,"eax"),26); | ||
223 | &jnc(&label("mw_non_sse2")); | ||
224 | |||
225 | &mov($r,&wparam(0)); | ||
226 | &mov($a,&wparam(1)); | ||
227 | &mov($c,&wparam(2)); | ||
228 | &movd("mm0",&wparam(3)); # mm0 = w | ||
229 | &pxor("mm1","mm1"); # mm1 = carry = 0 | ||
230 | |||
231 | &set_label("mw_sse2_loop",16); | ||
232 | &movd("mm2",&DWP(0,$a)); # mm2 = a[i] | ||
233 | &pmuludq("mm2","mm0"); # a[i] *= w | ||
234 | &lea($a,&DWP(4,$a)); | ||
235 | &paddq("mm1","mm2"); # carry += a[i]*w | ||
236 | &movd(&DWP(0,$r),"mm1"); # r[i] = carry_low | ||
237 | &sub($c,1); | ||
238 | &psrlq("mm1",32); # carry = carry_high | ||
239 | &lea($r,&DWP(4,$r)); | ||
240 | &jnz(&label("mw_sse2_loop")); | ||
241 | |||
242 | &movd("eax","mm1"); # return carry | ||
243 | &emms(); | ||
244 | &ret(); | ||
245 | &set_label("mw_non_sse2",16); | ||
246 | } | ||
247 | |||
248 | # function_begin prologue | ||
249 | &push("ebp"); | ||
250 | &push("ebx"); | ||
251 | &push("esi"); | ||
252 | &push("edi"); | ||
188 | 253 | ||
189 | &comment(""); | 254 | &comment(""); |
190 | $Low="eax"; | 255 | $Low="eax"; |
@@ -257,7 +322,40 @@ sub bn_sqr_words | |||
257 | { | 322 | { |
258 | local($name)=@_; | 323 | local($name)=@_; |
259 | 324 | ||
260 | &function_begin($name,""); | 325 | &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":""); |
326 | |||
327 | $r="eax"; | ||
328 | $a="edx"; | ||
329 | $c="ecx"; | ||
330 | |||
331 | if ($sse2) { | ||
332 | &picmeup("eax","OPENSSL_ia32cap_P"); | ||
333 | &bt(&DWP(0,"eax"),26); | ||
334 | &jnc(&label("sqr_non_sse2")); | ||
335 | |||
336 | &mov($r,&wparam(0)); | ||
337 | &mov($a,&wparam(1)); | ||
338 | &mov($c,&wparam(2)); | ||
339 | |||
340 | &set_label("sqr_sse2_loop",16); | ||
341 | &movd("mm0",&DWP(0,$a)); # mm0 = a[i] | ||
342 | &pmuludq("mm0","mm0"); # a[i] *= a[i] | ||
343 | &lea($a,&DWP(4,$a)); # a++ | ||
344 | &movq(&QWP(0,$r),"mm0"); # r[i] = a[i]*a[i] | ||
345 | &sub($c,1); | ||
346 | &lea($r,&DWP(8,$r)); # r += 2 | ||
347 | &jnz(&label("sqr_sse2_loop")); | ||
348 | |||
349 | &emms(); | ||
350 | &ret(); | ||
351 | &set_label("sqr_non_sse2",16); | ||
352 | } | ||
353 | |||
354 | # function_begin prologue | ||
355 | &push("ebp"); | ||
356 | &push("ebx"); | ||
357 | &push("esi"); | ||
358 | &push("edi"); | ||
261 | 359 | ||
262 | &comment(""); | 360 | &comment(""); |
263 | $r="esi"; | 361 | $r="esi"; |
@@ -313,12 +411,13 @@ sub bn_div_words | |||
313 | { | 411 | { |
314 | local($name)=@_; | 412 | local($name)=@_; |
315 | 413 | ||
316 | &function_begin($name,""); | 414 | &function_begin_B($name,""); |
317 | &mov("edx",&wparam(0)); # | 415 | &mov("edx",&wparam(0)); # |
318 | &mov("eax",&wparam(1)); # | 416 | &mov("eax",&wparam(1)); # |
319 | &mov("ebx",&wparam(2)); # | 417 | &mov("ecx",&wparam(2)); # |
320 | &div("ebx"); | 418 | &div("ecx"); |
321 | &function_end($name); | 419 | &ret(); |
420 | &function_end_B($name); | ||
322 | } | 421 | } |
323 | 422 | ||
324 | sub bn_add_words | 423 | sub bn_add_words |