diff options
Diffstat (limited to 'src/lib/libcrypto/bn/asm/sparcv9a-mont.pl')
-rwxr-xr-x | src/lib/libcrypto/bn/asm/sparcv9a-mont.pl | 882 |
1 files changed, 882 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl b/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl new file mode 100755 index 0000000000..a14205f2f0 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl | |||
@@ -0,0 +1,882 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # October 2005 | ||
11 | # | ||
12 | # "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU? | ||
13 | # Because unlike integer multiplier, which simply stalls whole CPU, | ||
14 | # FPU is fully pipelined and can effectively emit 48 bit partial | ||
15 | # product every cycle. Why not blended SPARC v9? One can argue that | ||
16 | # making this module dependent on UltraSPARC VIS extension limits its | ||
17 | # binary compatibility. Well yes, it does exclude SPARC64 prior-V(!) | ||
18 | # implementations from compatibility matrix. But the rest, whole Sun | ||
19 | # UltraSPARC family and brand new Fujitsu's SPARC64 V, all support | ||
20 | # VIS extension instructions used in this module. This is considered | ||
21 | # good enough to not care about HAL SPARC64 users [if any] who have | ||
22 | # integer-only pure SPARCv9 module to "fall down" to. | ||
23 | |||
24 | # USI&II cores currently exhibit uniform 2x improvement [over pre- | ||
25 | # bn_mul_mont codebase] for all key lengths and benchmarks. On USIII | ||
26 | # performance improves few percents for shorter keys and worsens few | ||
27 | # percents for longer keys. This is because USIII integer multiplier | ||
28 | # is >3x faster than USI&II one, which is harder to match [but see | ||
29 | # TODO list below]. It should also be noted that SPARC64 V features | ||
30 | # out-of-order execution, which *might* mean that integer multiplier | ||
31 | # is pipelined, which in turn *might* be impossible to match... On | ||
32 | # additional note, SPARC64 V implements FP Multiply-Add instruction, | ||
33 | # which is perfectly usable in this context... In other words, as far | ||
34 | # as Fujitsu SPARC64 V goes, talk to the author:-) | ||
35 | |||
36 | # The implementation implies following "non-natural" limitations on | ||
37 | # input arguments: | ||
38 | # - num may not be less than 4; | ||
39 | # - num has to be even; | ||
40 | # Failure to meet either condition has no fatal effects, simply | ||
41 | # doesn't give any performance gain. | ||
42 | |||
43 | # TODO: | ||
44 | # - modulo-schedule inner loop for better performance (on in-order | ||
45 | # execution core such as UltraSPARC this shall result in further | ||
46 | # noticeable(!) improvement); | ||
47 | # - dedicated squaring procedure[?]; | ||
48 | |||
49 | ###################################################################### | ||
50 | # November 2006 | ||
51 | # | ||
52 | # Modulo-scheduled inner loops make it possible to interleave floating | ||
53 | # point and integer instructions and minimize Read-After-Write penalties. | ||
54 | # This results in a *further* 20-50% performance improvement [depending | ||
55 | # on key length, more for longer keys] on USI&II cores and 30-80% on | ||
56 | # USIII&IV. | ||
57 | |||
58 | $fname="bn_mul_mont_fpu"; # name of the global symbol emitted by the template | ||
59 | $bits=32; # default: 32-bit ABI | ||
60 | for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } # -m64 or -xarch=v9* on the command line selects the 64-bit ABI | ||
61 | |||
62 | if ($bits==64) { | ||
63 | $bias=2047; # stack bias: the template always addresses the stack as %sp+$bias | ||
64 | $frame=192; # fixed frame size; "save" reserves $frame+$locals bytes | ||
65 | } else { | ||
66 | $bias=0; # no stack bias in the 32-bit ABI | ||
67 | $frame=128; # 96 rounded up to largest known cache-line | ||
68 | } | ||
69 | $locals=64; # scratch area above the frame, addressed as %sp+$bias+$frame+0..48 | ||
70 | |||
71 | # In order to provide for 32-/64-bit ABI duality, I keep integers wider | ||
72 | # than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used | ||
73 | # exclusively for pointers, indexes and other small values... | ||
74 | # int bn_mul_mont( [emitted under the name held in $fname] | ||
75 | $rp="%i0"; # BN_ULONG *rp, | ||
76 | $ap="%i1"; # const BN_ULONG *ap, | ||
77 | $bp="%i2"; # const BN_ULONG *bp, | ||
78 | $np="%i3"; # const BN_ULONG *np, | ||
79 | $n0="%i4"; # const BN_ULONG *n0, [overridden below; the template reads the pointer through a literal %i4] | ||
80 | $num="%i5"; # int num); [halved, then scaled by 8 in the prologue, so it is a byte length afterwards] | ||
81 | |||
82 | $tp="%l0"; # t[num], temporary vector placed at %sp+$bias+$frame+$locals | ||
83 | $ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved | ||
84 | $ap_h="%l2"; # to these four vectors as double-precision FP values. | ||
85 | $np_l="%l3"; # This way a bunch of fxtods are eliminated in second | ||
86 | $np_h="%l4"; # loop and L1-cache aliasing is minimized... | ||
87 | $i="%l5"; # negative byte offset from the end of bp[]; the outer loop steps it by 8 until it reaches 0 | ||
88 | $j="%l6"; # negative byte offset from the end of ap[], np[] and the four FP vectors; stepped by 8 until 0 | ||
89 | $mask="%l7"; # 16-bit mask, 0xffff | ||
90 | |||
91 | $n0="%g4"; # reassigned(!) to "64-bit" register | ||
92 | $carry="%i4"; # %i4 reused(!) for a carry bit [kept across outer-loop passes] | ||
93 | |||
94 | # FP register naming chart | ||
95 | # (one column = 16 bits; the last letter of a name sits in the column of that product's lowest digit) | ||
96 | #     ..HILO | ||
97 | #       dcba | ||
98 | #   -------- | ||
99 | #        LOa | ||
100 | #       LOb | ||
101 | #      LOc | ||
102 | #     LOd | ||
103 | #      HIa | ||
104 | #     HIb | ||
105 | #    HIc | ||
106 | #   HId | ||
107 | #    ..a | ||
108 | #   ..b | ||
109 | $ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6"; # b[i] split into four 16-bit pieces (ldda ...%asi, then fxtod) | ||
110 | $na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14"; # the per-pass n0 product, (ap[0]*b[i]+t[0])*n0, split into four 16-bit pieces | ||
111 | $alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19"; # low/high 32-bit word of a[j]; "ld" fills the "_" register and fzeros clears its partner before fxtod | ||
112 | $nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23"; # same layout for n[j] | ||
113 | |||
114 | $dota="%f24"; $dotb="%f26"; # ahic+nhic and ahid+nhid, carried over and added to nloa/nlob of the next j step | ||
115 | |||
116 | $aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38"; # alo * b{a,b,c,d} | ||
117 | $ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46"; # ahi * b{a,b,c,d} | ||
118 | $nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54"; # nlo * n{a,b,c,d}; also the accumulators that are converted with fdtox and stored | ||
119 | $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62"; # nhi * n{a,b,c,d} | ||
120 | |||
121 | $ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load; written to %asi in the prologue | ||
122 | |||
123 | $code=<<___; | ||
124 | .section ".text",#alloc,#execinstr | ||
125 | |||
126 | .global $fname | ||
127 | .align 32 | ||
128 | $fname: | ||
129 | save %sp,-$frame-$locals,%sp | ||
130 | |||
131 | cmp $num,4 | ||
132 | bl,a,pn %icc,.Lret | ||
133 | clr %i0 | ||
134 | andcc $num,1,%g0 ! $num has to be even... | ||
135 | bnz,a,pn %icc,.Lret | ||
136 | clr %i0 ! signal "unsupported input value" | ||
137 | |||
138 | srl $num,1,$num | ||
139 | sethi %hi(0xffff),$mask | ||
140 | ld [%i4+0],$n0 ! $n0 reassigned, remember? | ||
141 | or $mask,%lo(0xffff),$mask | ||
142 | ld [%i4+4],%o0 | ||
143 | sllx %o0,32,%o0 | ||
144 | or %o0,$n0,$n0 ! $n0=n0[1].n0[0] | ||
145 | |||
146 | sll $num,3,$num ! num*=8 | ||
147 | |||
148 | add %sp,$bias,%o0 ! real top of stack | ||
149 | sll $num,2,%o1 | ||
150 | add %o1,$num,%o1 ! %o1=num*5 | ||
151 | sub %o0,%o1,%o0 | ||
152 | and %o0,-2048,%o0 ! optimize TLB utilization | ||
153 | sub %o0,$bias,%sp ! alloca(5*num*8) | ||
154 | |||
155 | rd %asi,%o7 ! save %asi | ||
156 | add %sp,$bias+$frame+$locals,$tp | ||
157 | add $tp,$num,$ap_l | ||
158 | add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends ! | ||
159 | add $ap_l,$num,$ap_h | ||
160 | add $ap_h,$num,$np_l | ||
161 | add $np_l,$num,$np_h | ||
162 | |||
163 | wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads | ||
164 | |||
165 | add $rp,$num,$rp ! readjust input pointers to point | ||
166 | add $ap,$num,$ap ! at the ends too... | ||
167 | add $bp,$num,$bp | ||
168 | add $np,$num,$np | ||
169 | |||
170 | stx %o7,[%sp+$bias+$frame+48] ! save %asi | ||
171 | |||
172 | sub %g0,$num,$i ! i=-num | ||
173 | sub %g0,$num,$j ! j=-num | ||
174 | |||
175 | add $ap,$j,%o3 | ||
176 | add $bp,$i,%o4 | ||
177 | |||
178 | ld [%o3+4],%g1 ! ap[0] | ||
179 | ld [%o3+0],%o0 | ||
180 | ld [%o4+4],%g5 ! bp[0] | ||
181 | sllx %g1,32,%g1 | ||
182 | ld [%o4+0],%o1 | ||
183 | sllx %g5,32,%g5 | ||
184 | or %g1,%o0,%o0 | ||
185 | or %g5,%o1,%o1 | ||
186 | |||
187 | add $np,$j,%o5 | ||
188 | |||
189 | mulx %o1,%o0,%o0 ! ap[0]*bp[0] | ||
190 | mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0 | ||
191 | stx %o0,[%sp+$bias+$frame+0] | ||
192 | |||
193 | ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words | ||
194 | fzeros $alo | ||
195 | ld [%o3+4],$ahi_ | ||
196 | fzeros $ahi | ||
197 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words | ||
198 | fzeros $nlo | ||
199 | ld [%o5+4],$nhi_ | ||
200 | fzeros $nhi | ||
201 | |||
202 | ! transfer b[i] to FPU as 4x16-bit values | ||
203 | ldda [%o4+2]%asi,$ba | ||
204 | fxtod $alo,$alo | ||
205 | ldda [%o4+0]%asi,$bb | ||
206 | fxtod $ahi,$ahi | ||
207 | ldda [%o4+6]%asi,$bc | ||
208 | fxtod $nlo,$nlo | ||
209 | ldda [%o4+4]%asi,$bd | ||
210 | fxtod $nhi,$nhi | ||
211 | |||
212 | ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values | ||
213 | ldda [%sp+$bias+$frame+6]%asi,$na | ||
214 | fxtod $ba,$ba | ||
215 | ldda [%sp+$bias+$frame+4]%asi,$nb | ||
216 | fxtod $bb,$bb | ||
217 | ldda [%sp+$bias+$frame+2]%asi,$nc | ||
218 | fxtod $bc,$bc | ||
219 | ldda [%sp+$bias+$frame+0]%asi,$nd | ||
220 | fxtod $bd,$bd | ||
221 | |||
222 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | ||
223 | fxtod $na,$na | ||
224 | std $ahi,[$ap_h+$j] | ||
225 | fxtod $nb,$nb | ||
226 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | ||
227 | fxtod $nc,$nc | ||
228 | std $nhi,[$np_h+$j] | ||
229 | fxtod $nd,$nd | ||
230 | |||
231 | fmuld $alo,$ba,$aloa | ||
232 | fmuld $nlo,$na,$nloa | ||
233 | fmuld $alo,$bb,$alob | ||
234 | fmuld $nlo,$nb,$nlob | ||
235 | fmuld $alo,$bc,$aloc | ||
236 | faddd $aloa,$nloa,$nloa | ||
237 | fmuld $nlo,$nc,$nloc | ||
238 | fmuld $alo,$bd,$alod | ||
239 | faddd $alob,$nlob,$nlob | ||
240 | fmuld $nlo,$nd,$nlod | ||
241 | fmuld $ahi,$ba,$ahia | ||
242 | faddd $aloc,$nloc,$nloc | ||
243 | fmuld $nhi,$na,$nhia | ||
244 | fmuld $ahi,$bb,$ahib | ||
245 | faddd $alod,$nlod,$nlod | ||
246 | fmuld $nhi,$nb,$nhib | ||
247 | fmuld $ahi,$bc,$ahic | ||
248 | faddd $ahia,$nhia,$nhia | ||
249 | fmuld $nhi,$nc,$nhic | ||
250 | fmuld $ahi,$bd,$ahid | ||
251 | faddd $ahib,$nhib,$nhib | ||
252 | fmuld $nhi,$nd,$nhid | ||
253 | |||
254 | faddd $ahic,$nhic,$dota ! $nhic | ||
255 | faddd $ahid,$nhid,$dotb ! $nhid | ||
256 | |||
257 | faddd $nloc,$nhia,$nloc | ||
258 | faddd $nlod,$nhib,$nlod | ||
259 | |||
260 | fdtox $nloa,$nloa | ||
261 | fdtox $nlob,$nlob | ||
262 | fdtox $nloc,$nloc | ||
263 | fdtox $nlod,$nlod | ||
264 | |||
265 | std $nloa,[%sp+$bias+$frame+0] | ||
266 | add $j,8,$j | ||
267 | std $nlob,[%sp+$bias+$frame+8] | ||
268 | add $ap,$j,%o4 | ||
269 | std $nloc,[%sp+$bias+$frame+16] | ||
270 | add $np,$j,%o5 | ||
271 | std $nlod,[%sp+$bias+$frame+24] | ||
272 | |||
273 | ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words | ||
274 | fzeros $alo | ||
275 | ld [%o4+4],$ahi_ | ||
276 | fzeros $ahi | ||
277 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words | ||
278 | fzeros $nlo | ||
279 | ld [%o5+4],$nhi_ | ||
280 | fzeros $nhi | ||
281 | |||
282 | fxtod $alo,$alo | ||
283 | fxtod $ahi,$ahi | ||
284 | fxtod $nlo,$nlo | ||
285 | fxtod $nhi,$nhi | ||
286 | |||
287 | ldx [%sp+$bias+$frame+0],%o0 | ||
288 | fmuld $alo,$ba,$aloa | ||
289 | ldx [%sp+$bias+$frame+8],%o1 | ||
290 | fmuld $nlo,$na,$nloa | ||
291 | ldx [%sp+$bias+$frame+16],%o2 | ||
292 | fmuld $alo,$bb,$alob | ||
293 | ldx [%sp+$bias+$frame+24],%o3 | ||
294 | fmuld $nlo,$nb,$nlob | ||
295 | |||
296 | srlx %o0,16,%o7 | ||
297 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | ||
298 | fmuld $alo,$bc,$aloc | ||
299 | add %o7,%o1,%o1 | ||
300 | std $ahi,[$ap_h+$j] | ||
301 | faddd $aloa,$nloa,$nloa | ||
302 | fmuld $nlo,$nc,$nloc | ||
303 | srlx %o1,16,%o7 | ||
304 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | ||
305 | fmuld $alo,$bd,$alod | ||
306 | add %o7,%o2,%o2 | ||
307 | std $nhi,[$np_h+$j] | ||
308 | faddd $alob,$nlob,$nlob | ||
309 | fmuld $nlo,$nd,$nlod | ||
310 | srlx %o2,16,%o7 | ||
311 | fmuld $ahi,$ba,$ahia | ||
312 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
313 | faddd $aloc,$nloc,$nloc | ||
314 | fmuld $nhi,$na,$nhia | ||
315 | !and %o0,$mask,%o0 | ||
316 | !and %o1,$mask,%o1 | ||
317 | !and %o2,$mask,%o2 | ||
318 | !sllx %o1,16,%o1 | ||
319 | !sllx %o2,32,%o2 | ||
320 | !sllx %o3,48,%o7 | ||
321 | !or %o1,%o0,%o0 | ||
322 | !or %o2,%o0,%o0 | ||
323 | !or %o7,%o0,%o0 ! 64-bit result | ||
324 | srlx %o3,16,%g1 ! 34-bit carry | ||
325 | fmuld $ahi,$bb,$ahib | ||
326 | |||
327 | faddd $alod,$nlod,$nlod | ||
328 | fmuld $nhi,$nb,$nhib | ||
329 | fmuld $ahi,$bc,$ahic | ||
330 | faddd $ahia,$nhia,$nhia | ||
331 | fmuld $nhi,$nc,$nhic | ||
332 | fmuld $ahi,$bd,$ahid | ||
333 | faddd $ahib,$nhib,$nhib | ||
334 | fmuld $nhi,$nd,$nhid | ||
335 | |||
336 | faddd $dota,$nloa,$nloa | ||
337 | faddd $dotb,$nlob,$nlob | ||
338 | faddd $ahic,$nhic,$dota ! $nhic | ||
339 | faddd $ahid,$nhid,$dotb ! $nhid | ||
340 | |||
341 | faddd $nloc,$nhia,$nloc | ||
342 | faddd $nlod,$nhib,$nlod | ||
343 | |||
344 | fdtox $nloa,$nloa | ||
345 | fdtox $nlob,$nlob | ||
346 | fdtox $nloc,$nloc | ||
347 | fdtox $nlod,$nlod | ||
348 | |||
349 | std $nloa,[%sp+$bias+$frame+0] | ||
350 | std $nlob,[%sp+$bias+$frame+8] | ||
351 | addcc $j,8,$j | ||
352 | std $nloc,[%sp+$bias+$frame+16] | ||
353 | bz,pn %icc,.L1stskip | ||
354 | std $nlod,[%sp+$bias+$frame+24] | ||
355 | |||
356 | .align 32 ! incidentally already aligned ! | ||
357 | .L1st: | ||
358 | add $ap,$j,%o4 | ||
359 | add $np,$j,%o5 | ||
360 | ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words | ||
361 | fzeros $alo | ||
362 | ld [%o4+4],$ahi_ | ||
363 | fzeros $ahi | ||
364 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words | ||
365 | fzeros $nlo | ||
366 | ld [%o5+4],$nhi_ | ||
367 | fzeros $nhi | ||
368 | |||
369 | fxtod $alo,$alo | ||
370 | fxtod $ahi,$ahi | ||
371 | fxtod $nlo,$nlo | ||
372 | fxtod $nhi,$nhi | ||
373 | |||
374 | ldx [%sp+$bias+$frame+0],%o0 | ||
375 | fmuld $alo,$ba,$aloa | ||
376 | ldx [%sp+$bias+$frame+8],%o1 | ||
377 | fmuld $nlo,$na,$nloa | ||
378 | ldx [%sp+$bias+$frame+16],%o2 | ||
379 | fmuld $alo,$bb,$alob | ||
380 | ldx [%sp+$bias+$frame+24],%o3 | ||
381 | fmuld $nlo,$nb,$nlob | ||
382 | |||
383 | srlx %o0,16,%o7 | ||
384 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | ||
385 | fmuld $alo,$bc,$aloc | ||
386 | add %o7,%o1,%o1 | ||
387 | std $ahi,[$ap_h+$j] | ||
388 | faddd $aloa,$nloa,$nloa | ||
389 | fmuld $nlo,$nc,$nloc | ||
390 | srlx %o1,16,%o7 | ||
391 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | ||
392 | fmuld $alo,$bd,$alod | ||
393 | add %o7,%o2,%o2 | ||
394 | std $nhi,[$np_h+$j] | ||
395 | faddd $alob,$nlob,$nlob | ||
396 | fmuld $nlo,$nd,$nlod | ||
397 | srlx %o2,16,%o7 | ||
398 | fmuld $ahi,$ba,$ahia | ||
399 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
400 | and %o0,$mask,%o0 | ||
401 | faddd $aloc,$nloc,$nloc | ||
402 | fmuld $nhi,$na,$nhia | ||
403 | and %o1,$mask,%o1 | ||
404 | and %o2,$mask,%o2 | ||
405 | fmuld $ahi,$bb,$ahib | ||
406 | sllx %o1,16,%o1 | ||
407 | faddd $alod,$nlod,$nlod | ||
408 | fmuld $nhi,$nb,$nhib | ||
409 | sllx %o2,32,%o2 | ||
410 | fmuld $ahi,$bc,$ahic | ||
411 | sllx %o3,48,%o7 | ||
412 | or %o1,%o0,%o0 | ||
413 | faddd $ahia,$nhia,$nhia | ||
414 | fmuld $nhi,$nc,$nhic | ||
415 | or %o2,%o0,%o0 | ||
416 | fmuld $ahi,$bd,$ahid | ||
417 | or %o7,%o0,%o0 ! 64-bit result | ||
418 | faddd $ahib,$nhib,$nhib | ||
419 | fmuld $nhi,$nd,$nhid | ||
420 | addcc %g1,%o0,%o0 | ||
421 | faddd $dota,$nloa,$nloa | ||
422 | srlx %o3,16,%g1 ! 34-bit carry | ||
423 | faddd $dotb,$nlob,$nlob | ||
424 | bcs,a %xcc,.+8 | ||
425 | add %g1,1,%g1 | ||
426 | |||
427 | stx %o0,[$tp] ! tp[j-1]= | ||
428 | |||
429 | faddd $ahic,$nhic,$dota ! $nhic | ||
430 | faddd $ahid,$nhid,$dotb ! $nhid | ||
431 | |||
432 | faddd $nloc,$nhia,$nloc | ||
433 | faddd $nlod,$nhib,$nlod | ||
434 | |||
435 | fdtox $nloa,$nloa | ||
436 | fdtox $nlob,$nlob | ||
437 | fdtox $nloc,$nloc | ||
438 | fdtox $nlod,$nlod | ||
439 | |||
440 | std $nloa,[%sp+$bias+$frame+0] | ||
441 | std $nlob,[%sp+$bias+$frame+8] | ||
442 | std $nloc,[%sp+$bias+$frame+16] | ||
443 | std $nlod,[%sp+$bias+$frame+24] | ||
444 | |||
445 | addcc $j,8,$j | ||
446 | bnz,pt %icc,.L1st | ||
447 | add $tp,8,$tp | ||
448 | |||
449 | .L1stskip: | ||
450 | fdtox $dota,$dota | ||
451 | fdtox $dotb,$dotb | ||
452 | |||
453 | ldx [%sp+$bias+$frame+0],%o0 | ||
454 | ldx [%sp+$bias+$frame+8],%o1 | ||
455 | ldx [%sp+$bias+$frame+16],%o2 | ||
456 | ldx [%sp+$bias+$frame+24],%o3 | ||
457 | |||
458 | srlx %o0,16,%o7 | ||
459 | std $dota,[%sp+$bias+$frame+32] | ||
460 | add %o7,%o1,%o1 | ||
461 | std $dotb,[%sp+$bias+$frame+40] | ||
462 | srlx %o1,16,%o7 | ||
463 | add %o7,%o2,%o2 | ||
464 | srlx %o2,16,%o7 | ||
465 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
466 | and %o0,$mask,%o0 | ||
467 | and %o1,$mask,%o1 | ||
468 | and %o2,$mask,%o2 | ||
469 | sllx %o1,16,%o1 | ||
470 | sllx %o2,32,%o2 | ||
471 | sllx %o3,48,%o7 | ||
472 | or %o1,%o0,%o0 | ||
473 | or %o2,%o0,%o0 | ||
474 | or %o7,%o0,%o0 ! 64-bit result | ||
475 | ldx [%sp+$bias+$frame+32],%o4 | ||
476 | addcc %g1,%o0,%o0 | ||
477 | ldx [%sp+$bias+$frame+40],%o5 | ||
478 | srlx %o3,16,%g1 ! 34-bit carry | ||
479 | bcs,a %xcc,.+8 | ||
480 | add %g1,1,%g1 | ||
481 | |||
482 | stx %o0,[$tp] ! tp[j-1]= | ||
483 | add $tp,8,$tp | ||
484 | |||
485 | srlx %o4,16,%o7 | ||
486 | add %o7,%o5,%o5 | ||
487 | and %o4,$mask,%o4 | ||
488 | sllx %o5,16,%o7 | ||
489 | or %o7,%o4,%o4 | ||
490 | addcc %g1,%o4,%o4 | ||
491 | srlx %o5,48,%g1 | ||
492 | bcs,a %xcc,.+8 | ||
493 | add %g1,1,%g1 | ||
494 | |||
495 | mov %g1,$carry | ||
496 | stx %o4,[$tp] ! tp[num-1]= | ||
497 | |||
498 | ba .Louter | ||
499 | add $i,8,$i | ||
500 | .align 32 | ||
501 | .Louter: | ||
502 | sub %g0,$num,$j ! j=-num | ||
503 | add %sp,$bias+$frame+$locals,$tp | ||
504 | |||
505 | add $ap,$j,%o3 | ||
506 | add $bp,$i,%o4 | ||
507 | |||
508 | ld [%o3+4],%g1 ! ap[0] | ||
509 | ld [%o3+0],%o0 | ||
510 | ld [%o4+4],%g5 ! bp[i] | ||
511 | sllx %g1,32,%g1 | ||
512 | ld [%o4+0],%o1 | ||
513 | sllx %g5,32,%g5 | ||
514 | or %g1,%o0,%o0 | ||
515 | or %g5,%o1,%o1 | ||
516 | |||
517 | ldx [$tp],%o2 ! tp[0] | ||
518 | mulx %o1,%o0,%o0 | ||
519 | addcc %o2,%o0,%o0 | ||
520 | mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0 | ||
521 | stx %o0,[%sp+$bias+$frame+0] | ||
522 | |||
523 | ! transfer b[i] to FPU as 4x16-bit values | ||
524 | ldda [%o4+2]%asi,$ba | ||
525 | ldda [%o4+0]%asi,$bb | ||
526 | ldda [%o4+6]%asi,$bc | ||
527 | ldda [%o4+4]%asi,$bd | ||
528 | |||
529 | ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values | ||
530 | ldda [%sp+$bias+$frame+6]%asi,$na | ||
531 | fxtod $ba,$ba | ||
532 | ldda [%sp+$bias+$frame+4]%asi,$nb | ||
533 | fxtod $bb,$bb | ||
534 | ldda [%sp+$bias+$frame+2]%asi,$nc | ||
535 | fxtod $bc,$bc | ||
536 | ldda [%sp+$bias+$frame+0]%asi,$nd | ||
537 | fxtod $bd,$bd | ||
538 | ldd [$ap_l+$j],$alo ! load a[j] in double format | ||
539 | fxtod $na,$na | ||
540 | ldd [$ap_h+$j],$ahi | ||
541 | fxtod $nb,$nb | ||
542 | ldd [$np_l+$j],$nlo ! load n[j] in double format | ||
543 | fxtod $nc,$nc | ||
544 | ldd [$np_h+$j],$nhi | ||
545 | fxtod $nd,$nd | ||
546 | |||
547 | fmuld $alo,$ba,$aloa | ||
548 | fmuld $nlo,$na,$nloa | ||
549 | fmuld $alo,$bb,$alob | ||
550 | fmuld $nlo,$nb,$nlob | ||
551 | fmuld $alo,$bc,$aloc | ||
552 | faddd $aloa,$nloa,$nloa | ||
553 | fmuld $nlo,$nc,$nloc | ||
554 | fmuld $alo,$bd,$alod | ||
555 | faddd $alob,$nlob,$nlob | ||
556 | fmuld $nlo,$nd,$nlod | ||
557 | fmuld $ahi,$ba,$ahia | ||
558 | faddd $aloc,$nloc,$nloc | ||
559 | fmuld $nhi,$na,$nhia | ||
560 | fmuld $ahi,$bb,$ahib | ||
561 | faddd $alod,$nlod,$nlod | ||
562 | fmuld $nhi,$nb,$nhib | ||
563 | fmuld $ahi,$bc,$ahic | ||
564 | faddd $ahia,$nhia,$nhia | ||
565 | fmuld $nhi,$nc,$nhic | ||
566 | fmuld $ahi,$bd,$ahid | ||
567 | faddd $ahib,$nhib,$nhib | ||
568 | fmuld $nhi,$nd,$nhid | ||
569 | |||
570 | faddd $ahic,$nhic,$dota ! $nhic | ||
571 | faddd $ahid,$nhid,$dotb ! $nhid | ||
572 | |||
573 | faddd $nloc,$nhia,$nloc | ||
574 | faddd $nlod,$nhib,$nlod | ||
575 | |||
576 | fdtox $nloa,$nloa | ||
577 | fdtox $nlob,$nlob | ||
578 | fdtox $nloc,$nloc | ||
579 | fdtox $nlod,$nlod | ||
580 | |||
581 | std $nloa,[%sp+$bias+$frame+0] | ||
582 | std $nlob,[%sp+$bias+$frame+8] | ||
583 | std $nloc,[%sp+$bias+$frame+16] | ||
584 | add $j,8,$j | ||
585 | std $nlod,[%sp+$bias+$frame+24] | ||
586 | |||
587 | ldd [$ap_l+$j],$alo ! load a[j] in double format | ||
588 | ldd [$ap_h+$j],$ahi | ||
589 | ldd [$np_l+$j],$nlo ! load n[j] in double format | ||
590 | ldd [$np_h+$j],$nhi | ||
591 | |||
592 | fmuld $alo,$ba,$aloa | ||
593 | fmuld $nlo,$na,$nloa | ||
594 | fmuld $alo,$bb,$alob | ||
595 | fmuld $nlo,$nb,$nlob | ||
596 | fmuld $alo,$bc,$aloc | ||
597 | ldx [%sp+$bias+$frame+0],%o0 | ||
598 | faddd $aloa,$nloa,$nloa | ||
599 | fmuld $nlo,$nc,$nloc | ||
600 | ldx [%sp+$bias+$frame+8],%o1 | ||
601 | fmuld $alo,$bd,$alod | ||
602 | ldx [%sp+$bias+$frame+16],%o2 | ||
603 | faddd $alob,$nlob,$nlob | ||
604 | fmuld $nlo,$nd,$nlod | ||
605 | ldx [%sp+$bias+$frame+24],%o3 | ||
606 | fmuld $ahi,$ba,$ahia | ||
607 | |||
608 | srlx %o0,16,%o7 | ||
609 | faddd $aloc,$nloc,$nloc | ||
610 | fmuld $nhi,$na,$nhia | ||
611 | add %o7,%o1,%o1 | ||
612 | fmuld $ahi,$bb,$ahib | ||
613 | srlx %o1,16,%o7 | ||
614 | faddd $alod,$nlod,$nlod | ||
615 | fmuld $nhi,$nb,$nhib | ||
616 | add %o7,%o2,%o2 | ||
617 | fmuld $ahi,$bc,$ahic | ||
618 | srlx %o2,16,%o7 | ||
619 | faddd $ahia,$nhia,$nhia | ||
620 | fmuld $nhi,$nc,$nhic | ||
621 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
622 | ! why? | ||
623 | and %o0,$mask,%o0 | ||
624 | fmuld $ahi,$bd,$ahid | ||
625 | and %o1,$mask,%o1 | ||
626 | and %o2,$mask,%o2 | ||
627 | faddd $ahib,$nhib,$nhib | ||
628 | fmuld $nhi,$nd,$nhid | ||
629 | sllx %o1,16,%o1 | ||
630 | faddd $dota,$nloa,$nloa | ||
631 | sllx %o2,32,%o2 | ||
632 | faddd $dotb,$nlob,$nlob | ||
633 | sllx %o3,48,%o7 | ||
634 | or %o1,%o0,%o0 | ||
635 | faddd $ahic,$nhic,$dota ! $nhic | ||
636 | or %o2,%o0,%o0 | ||
637 | faddd $ahid,$nhid,$dotb ! $nhid | ||
638 | or %o7,%o0,%o0 ! 64-bit result | ||
639 | ldx [$tp],%o7 | ||
640 | faddd $nloc,$nhia,$nloc | ||
641 | addcc %o7,%o0,%o0 | ||
642 | ! end-of-why? | ||
643 | faddd $nlod,$nhib,$nlod | ||
644 | srlx %o3,16,%g1 ! 34-bit carry | ||
645 | fdtox $nloa,$nloa | ||
646 | bcs,a %xcc,.+8 | ||
647 | add %g1,1,%g1 | ||
648 | |||
649 | fdtox $nlob,$nlob | ||
650 | fdtox $nloc,$nloc | ||
651 | fdtox $nlod,$nlod | ||
652 | |||
653 | std $nloa,[%sp+$bias+$frame+0] | ||
654 | std $nlob,[%sp+$bias+$frame+8] | ||
655 | addcc $j,8,$j | ||
656 | std $nloc,[%sp+$bias+$frame+16] | ||
657 | bz,pn %icc,.Linnerskip | ||
658 | std $nlod,[%sp+$bias+$frame+24] | ||
659 | |||
660 | ba .Linner | ||
661 | nop | ||
662 | .align 32 | ||
663 | .Linner: | ||
664 | ldd [$ap_l+$j],$alo ! load a[j] in double format | ||
665 | ldd [$ap_h+$j],$ahi | ||
666 | ldd [$np_l+$j],$nlo ! load n[j] in double format | ||
667 | ldd [$np_h+$j],$nhi | ||
668 | |||
669 | fmuld $alo,$ba,$aloa | ||
670 | fmuld $nlo,$na,$nloa | ||
671 | fmuld $alo,$bb,$alob | ||
672 | fmuld $nlo,$nb,$nlob | ||
673 | fmuld $alo,$bc,$aloc | ||
674 | ldx [%sp+$bias+$frame+0],%o0 | ||
675 | faddd $aloa,$nloa,$nloa | ||
676 | fmuld $nlo,$nc,$nloc | ||
677 | ldx [%sp+$bias+$frame+8],%o1 | ||
678 | fmuld $alo,$bd,$alod | ||
679 | ldx [%sp+$bias+$frame+16],%o2 | ||
680 | faddd $alob,$nlob,$nlob | ||
681 | fmuld $nlo,$nd,$nlod | ||
682 | ldx [%sp+$bias+$frame+24],%o3 | ||
683 | fmuld $ahi,$ba,$ahia | ||
684 | |||
685 | srlx %o0,16,%o7 | ||
686 | faddd $aloc,$nloc,$nloc | ||
687 | fmuld $nhi,$na,$nhia | ||
688 | add %o7,%o1,%o1 | ||
689 | fmuld $ahi,$bb,$ahib | ||
690 | srlx %o1,16,%o7 | ||
691 | faddd $alod,$nlod,$nlod | ||
692 | fmuld $nhi,$nb,$nhib | ||
693 | add %o7,%o2,%o2 | ||
694 | fmuld $ahi,$bc,$ahic | ||
695 | srlx %o2,16,%o7 | ||
696 | faddd $ahia,$nhia,$nhia | ||
697 | fmuld $nhi,$nc,$nhic | ||
698 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
699 | and %o0,$mask,%o0 | ||
700 | fmuld $ahi,$bd,$ahid | ||
701 | and %o1,$mask,%o1 | ||
702 | and %o2,$mask,%o2 | ||
703 | faddd $ahib,$nhib,$nhib | ||
704 | fmuld $nhi,$nd,$nhid | ||
705 | sllx %o1,16,%o1 | ||
706 | faddd $dota,$nloa,$nloa | ||
707 | sllx %o2,32,%o2 | ||
708 | faddd $dotb,$nlob,$nlob | ||
709 | sllx %o3,48,%o7 | ||
710 | or %o1,%o0,%o0 | ||
711 | faddd $ahic,$nhic,$dota ! $nhic | ||
712 | or %o2,%o0,%o0 | ||
713 | faddd $ahid,$nhid,$dotb ! $nhid | ||
714 | or %o7,%o0,%o0 ! 64-bit result | ||
715 | faddd $nloc,$nhia,$nloc | ||
716 | addcc %g1,%o0,%o0 | ||
717 | ldx [$tp+8],%o7 ! tp[j] | ||
718 | faddd $nlod,$nhib,$nlod | ||
719 | srlx %o3,16,%g1 ! 34-bit carry | ||
720 | fdtox $nloa,$nloa | ||
721 | bcs,a %xcc,.+8 | ||
722 | add %g1,1,%g1 | ||
723 | fdtox $nlob,$nlob | ||
724 | addcc %o7,%o0,%o0 | ||
725 | fdtox $nloc,$nloc | ||
726 | bcs,a %xcc,.+8 | ||
727 | add %g1,1,%g1 | ||
728 | |||
729 | stx %o0,[$tp] ! tp[j-1] | ||
730 | fdtox $nlod,$nlod | ||
731 | |||
732 | std $nloa,[%sp+$bias+$frame+0] | ||
733 | std $nlob,[%sp+$bias+$frame+8] | ||
734 | std $nloc,[%sp+$bias+$frame+16] | ||
735 | addcc $j,8,$j | ||
736 | std $nlod,[%sp+$bias+$frame+24] | ||
737 | bnz,pt %icc,.Linner | ||
738 | add $tp,8,$tp | ||
739 | |||
740 | .Linnerskip: | ||
741 | fdtox $dota,$dota | ||
742 | fdtox $dotb,$dotb | ||
743 | |||
744 | ldx [%sp+$bias+$frame+0],%o0 | ||
745 | ldx [%sp+$bias+$frame+8],%o1 | ||
746 | ldx [%sp+$bias+$frame+16],%o2 | ||
747 | ldx [%sp+$bias+$frame+24],%o3 | ||
748 | |||
749 | srlx %o0,16,%o7 | ||
750 | std $dota,[%sp+$bias+$frame+32] | ||
751 | add %o7,%o1,%o1 | ||
752 | std $dotb,[%sp+$bias+$frame+40] | ||
753 | srlx %o1,16,%o7 | ||
754 | add %o7,%o2,%o2 | ||
755 | srlx %o2,16,%o7 | ||
756 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
757 | and %o0,$mask,%o0 | ||
758 | and %o1,$mask,%o1 | ||
759 | and %o2,$mask,%o2 | ||
760 | sllx %o1,16,%o1 | ||
761 | sllx %o2,32,%o2 | ||
762 | sllx %o3,48,%o7 | ||
763 | or %o1,%o0,%o0 | ||
764 | or %o2,%o0,%o0 | ||
765 | ldx [%sp+$bias+$frame+32],%o4 | ||
766 | or %o7,%o0,%o0 ! 64-bit result | ||
767 | ldx [%sp+$bias+$frame+40],%o5 | ||
768 | addcc %g1,%o0,%o0 | ||
769 | ldx [$tp+8],%o7 ! tp[j] | ||
770 | srlx %o3,16,%g1 ! 34-bit carry | ||
771 | bcs,a %xcc,.+8 | ||
772 | add %g1,1,%g1 | ||
773 | |||
774 | addcc %o7,%o0,%o0 | ||
775 | bcs,a %xcc,.+8 | ||
776 | add %g1,1,%g1 | ||
777 | |||
778 | stx %o0,[$tp] ! tp[j-1] | ||
779 | add $tp,8,$tp | ||
780 | |||
781 | srlx %o4,16,%o7 | ||
782 | add %o7,%o5,%o5 | ||
783 | and %o4,$mask,%o4 | ||
784 | sllx %o5,16,%o7 | ||
785 | or %o7,%o4,%o4 | ||
786 | addcc %g1,%o4,%o4 | ||
787 | srlx %o5,48,%g1 | ||
788 | bcs,a %xcc,.+8 | ||
789 | add %g1,1,%g1 | ||
790 | |||
791 | addcc $carry,%o4,%o4 | ||
792 | stx %o4,[$tp] ! tp[num-1] | ||
793 | mov %g1,$carry | ||
794 | bcs,a %xcc,.+8 | ||
795 | add $carry,1,$carry | ||
796 | |||
797 | addcc $i,8,$i | ||
798 | bnz %icc,.Louter | ||
799 | nop | ||
800 | |||
801 | add $tp,8,$tp ! adjust tp to point at the end | ||
802 | orn %g0,%g0,%g4 | ||
803 | sub %g0,$num,%o7 ! n=-num | ||
804 | ba .Lsub | ||
805 | subcc %g0,%g0,%g0 ! clear %icc.c | ||
806 | |||
807 | .align 32 | ||
808 | .Lsub: | ||
809 | ldx [$tp+%o7],%o0 | ||
810 | add $np,%o7,%g1 | ||
811 | ld [%g1+0],%o2 | ||
812 | ld [%g1+4],%o3 | ||
813 | srlx %o0,32,%o1 | ||
814 | subccc %o0,%o2,%o2 | ||
815 | add $rp,%o7,%g1 | ||
816 | subccc %o1,%o3,%o3 | ||
817 | st %o2,[%g1+0] | ||
818 | add %o7,8,%o7 | ||
819 | brnz,pt %o7,.Lsub | ||
820 | st %o3,[%g1+4] | ||
821 | subc $carry,0,%g4 | ||
822 | sub %g0,$num,%o7 ! n=-num | ||
823 | ba .Lcopy | ||
824 | nop | ||
825 | |||
826 | .align 32 | ||
827 | .Lcopy: | ||
828 | ldx [$tp+%o7],%o0 | ||
829 | add $rp,%o7,%g1 | ||
830 | ld [%g1+0],%o2 | ||
831 | ld [%g1+4],%o3 | ||
832 | stx %g0,[$tp+%o7] | ||
833 | and %o0,%g4,%o0 | ||
834 | srlx %o0,32,%o1 | ||
835 | andn %o2,%g4,%o2 | ||
836 | andn %o3,%g4,%o3 | ||
837 | or %o2,%o0,%o0 | ||
838 | or %o3,%o1,%o1 | ||
839 | st %o0,[%g1+0] | ||
840 | add %o7,8,%o7 | ||
841 | brnz,pt %o7,.Lcopy | ||
842 | st %o1,[%g1+4] | ||
843 | sub %g0,$num,%o7 ! n=-num | ||
844 | |||
845 | .Lzap: | ||
846 | stx %g0,[$ap_l+%o7] | ||
847 | stx %g0,[$ap_h+%o7] | ||
848 | stx %g0,[$np_l+%o7] | ||
849 | stx %g0,[$np_h+%o7] | ||
850 | add %o7,8,%o7 | ||
851 | brnz,pt %o7,.Lzap | ||
852 | nop | ||
853 | |||
854 | ldx [%sp+$bias+$frame+48],%o7 | ||
855 | wr %g0,%o7,%asi ! restore %asi | ||
856 | |||
857 | mov 1,%i0 | ||
858 | .Lret: | ||
859 | ret | ||
860 | restore | ||
861 | .type $fname,#function | ||
862 | .size $fname,(.-$fname) | ||
863 | .asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>" | ||
864 | .align 32 | ||
865 | ___ | ||
866 | |||
867 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; # replace every `...` span in the template with the result of evaluating it as Perl | ||
868 | |||
869 | # The substitution below makes it possible to assemble without demanding | ||
870 | # VIS extensions on the command line, e.g. -xarch=v9 vs. -xarch=v9a. I | ||
871 | # dare to do this because VIS capability is now detected at run-time and | ||
872 | # this routine is not called on CPUs incapable of executing it. Each | ||
873 | # "fzeros %fN" is emitted as a raw .word: 0x81b00c20 with N or-ed in at | ||
874 | # bit 25. Do note that fzeros is not the only VIS dependency! The other one | ||
875 | # is implicit: just _a_ numerical value loaded to %asi, which the assembler can't recognize as VIS specific... | ||
876 | $code =~ s/fzeros\s+%f([0-9]+)/ | ||
877 | sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1) | ||
878 | /gem; | ||
879 | |||
880 | print $code; # emit the finished assembly on STDOUT | ||
881 | # flush; buffered write errors (full disk, closed pipe) only surface at close, so fail loudly instead of leaving truncated output | ||
882 | close STDOUT or die "error closing STDOUT: $!"; | ||