diff options
Diffstat (limited to 'src/lib/libcrypto/bn/asm/sparcv9-mont.pl')
-rw-r--r-- | src/lib/libcrypto/bn/asm/sparcv9-mont.pl | 606 |
1 files changed, 0 insertions, 606 deletions
diff --git a/src/lib/libcrypto/bn/asm/sparcv9-mont.pl b/src/lib/libcrypto/bn/asm/sparcv9-mont.pl deleted file mode 100644 index b8fb1e8a25..0000000000 --- a/src/lib/libcrypto/bn/asm/sparcv9-mont.pl +++ /dev/null | |||
@@ -1,606 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # December 2005 | ||
11 | # | ||
12 | # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons | ||
13 | # for undertaken effort are multiple. First of all, UltraSPARC is not | ||
14 | # the whole SPARCv9 universe and other VIS-free implementations deserve | ||
15 | # optimized code as much. Secondly, newly introduced UltraSPARC T1, | ||
16 | # a.k.a. Niagara, has shared FPU and concurrent FPU-intensive pathes, | ||
17 | # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with | ||
18 | # several integrated RSA/DSA accelerator circuits accessible through | ||
19 | # kernel driver [only(*)], but having decent user-land software | ||
20 | # implementation is important too. Finally, reasons like desire to | ||
21 | # experiment with dedicated squaring procedure. Yes, this module | ||
22 | # implements one, because it was easiest to draft it in SPARCv9 | ||
23 | # instructions... | ||
24 | |||
25 | # (*) Engine accessing the driver in question is on my TODO list. | ||
26 | # For reference, acceleator is estimated to give 6 to 10 times | ||
27 | # improvement on single-threaded RSA sign. It should be noted | ||
28 | # that 6-10x improvement coefficient does not actually mean | ||
29 | # something extraordinary in terms of absolute [single-threaded] | ||
30 | # performance, as SPARCv9 instruction set is by all means least | ||
31 | # suitable for high performance crypto among other 64 bit | ||
32 | # platforms. 6-10x factor simply places T1 in same performance | ||
33 | # domain as say AMD64 and IA-64. Improvement of RSA verify don't | ||
34 | # appear impressive at all, but it's the sign operation which is | ||
35 | # far more critical/interesting. | ||
36 | |||
37 | # You might notice that inner loops are modulo-scheduled:-) This has | ||
38 | # essentially negligible impact on UltraSPARC performance, it's | ||
39 | # Fujitsu SPARC64 V users who should notice and hopefully appreciate | ||
40 | # the advantage... Currently this module surpasses sparcv9a-mont.pl | ||
41 | # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a | ||
42 | # module still have hidden potential [see TODO list there], which is | ||
43 | # estimated to be larger than 20%... | ||
44 | |||
45 | # int bn_mul_mont( | ||
46 | $rp="%i0"; # BN_ULONG *rp, | ||
47 | $ap="%i1"; # const BN_ULONG *ap, | ||
48 | $bp="%i2"; # const BN_ULONG *bp, | ||
49 | $np="%i3"; # const BN_ULONG *np, | ||
50 | $n0="%i4"; # const BN_ULONG *n0, | ||
51 | $num="%i5"; # int num); | ||
52 | |||
53 | $bits=32; | ||
54 | for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } | ||
55 | if ($bits==64) { $bias=2047; $frame=192; } | ||
56 | else { $bias=0; $frame=128; } | ||
57 | |||
58 | $car0="%o0"; | ||
59 | $car1="%o1"; | ||
60 | $car2="%o2"; # 1 bit | ||
61 | $acc0="%o3"; | ||
62 | $acc1="%o4"; | ||
63 | $mask="%g1"; # 32 bits, what a waste... | ||
64 | $tmp0="%g4"; | ||
65 | $tmp1="%g5"; | ||
66 | |||
67 | $i="%l0"; | ||
68 | $j="%l1"; | ||
69 | $mul0="%l2"; | ||
70 | $mul1="%l3"; | ||
71 | $tp="%l4"; | ||
72 | $apj="%l5"; | ||
73 | $npj="%l6"; | ||
74 | $tpj="%l7"; | ||
75 | |||
76 | $fname="bn_mul_mont_int"; | ||
77 | |||
78 | $code=<<___; | ||
79 | .section ".text",#alloc,#execinstr | ||
80 | |||
81 | .global $fname | ||
82 | .align 32 | ||
83 | $fname: | ||
84 | cmp %o5,4 ! 128 bits minimum | ||
85 | bge,pt %icc,.Lenter | ||
86 | sethi %hi(0xffffffff),$mask | ||
87 | retl | ||
88 | clr %o0 | ||
89 | .align 32 | ||
90 | .Lenter: | ||
91 | save %sp,-$frame,%sp | ||
92 | sll $num,2,$num ! num*=4 | ||
93 | or $mask,%lo(0xffffffff),$mask | ||
94 | ld [$n0],$n0 | ||
95 | cmp $ap,$bp | ||
96 | and $num,$mask,$num | ||
97 | ld [$bp],$mul0 ! bp[0] | ||
98 | nop | ||
99 | |||
100 | add %sp,$bias,%o7 ! real top of stack | ||
101 | ld [$ap],$car0 ! ap[0] ! redundant in squaring context | ||
102 | sub %o7,$num,%o7 | ||
103 | ld [$ap+4],$apj ! ap[1] | ||
104 | and %o7,-1024,%o7 | ||
105 | ld [$np],$car1 ! np[0] | ||
106 | sub %o7,$bias,%sp ! alloca | ||
107 | ld [$np+4],$npj ! np[1] | ||
108 | be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont | ||
109 | mov 12,$j | ||
110 | |||
111 | mulx $car0,$mul0,$car0 ! ap[0]*bp[0] | ||
112 | mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0] | ||
113 | and $car0,$mask,$acc0 | ||
114 | add %sp,$bias+$frame,$tp | ||
115 | ld [$ap+8],$apj !prologue! | ||
116 | |||
117 | mulx $n0,$acc0,$mul1 ! "t[0]"*n0 | ||
118 | and $mul1,$mask,$mul1 | ||
119 | |||
120 | mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 | ||
121 | mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0 | ||
122 | srlx $car0,32,$car0 | ||
123 | add $acc0,$car1,$car1 | ||
124 | ld [$np+8],$npj !prologue! | ||
125 | srlx $car1,32,$car1 | ||
126 | mov $tmp0,$acc0 !prologue! | ||
127 | |||
128 | .L1st: | ||
129 | mulx $apj,$mul0,$tmp0 | ||
130 | mulx $npj,$mul1,$tmp1 | ||
131 | add $acc0,$car0,$car0 | ||
132 | ld [$ap+$j],$apj ! ap[j] | ||
133 | and $car0,$mask,$acc0 | ||
134 | add $acc1,$car1,$car1 | ||
135 | ld [$np+$j],$npj ! np[j] | ||
136 | srlx $car0,32,$car0 | ||
137 | add $acc0,$car1,$car1 | ||
138 | add $j,4,$j ! j++ | ||
139 | mov $tmp0,$acc0 | ||
140 | st $car1,[$tp] | ||
141 | cmp $j,$num | ||
142 | mov $tmp1,$acc1 | ||
143 | srlx $car1,32,$car1 | ||
144 | bl %icc,.L1st | ||
145 | add $tp,4,$tp ! tp++ | ||
146 | !.L1st | ||
147 | |||
148 | mulx $apj,$mul0,$tmp0 !epilogue! | ||
149 | mulx $npj,$mul1,$tmp1 | ||
150 | add $acc0,$car0,$car0 | ||
151 | and $car0,$mask,$acc0 | ||
152 | add $acc1,$car1,$car1 | ||
153 | srlx $car0,32,$car0 | ||
154 | add $acc0,$car1,$car1 | ||
155 | st $car1,[$tp] | ||
156 | srlx $car1,32,$car1 | ||
157 | |||
158 | add $tmp0,$car0,$car0 | ||
159 | and $car0,$mask,$acc0 | ||
160 | add $tmp1,$car1,$car1 | ||
161 | srlx $car0,32,$car0 | ||
162 | add $acc0,$car1,$car1 | ||
163 | st $car1,[$tp+4] | ||
164 | srlx $car1,32,$car1 | ||
165 | |||
166 | add $car0,$car1,$car1 | ||
167 | st $car1,[$tp+8] | ||
168 | srlx $car1,32,$car2 | ||
169 | |||
170 | mov 4,$i ! i++ | ||
171 | ld [$bp+4],$mul0 ! bp[1] | ||
172 | .Louter: | ||
173 | add %sp,$bias+$frame,$tp | ||
174 | ld [$ap],$car0 ! ap[0] | ||
175 | ld [$ap+4],$apj ! ap[1] | ||
176 | ld [$np],$car1 ! np[0] | ||
177 | ld [$np+4],$npj ! np[1] | ||
178 | ld [$tp],$tmp1 ! tp[0] | ||
179 | ld [$tp+4],$tpj ! tp[1] | ||
180 | mov 12,$j | ||
181 | |||
182 | mulx $car0,$mul0,$car0 | ||
183 | mulx $apj,$mul0,$tmp0 !prologue! | ||
184 | add $tmp1,$car0,$car0 | ||
185 | ld [$ap+8],$apj !prologue! | ||
186 | and $car0,$mask,$acc0 | ||
187 | |||
188 | mulx $n0,$acc0,$mul1 | ||
189 | and $mul1,$mask,$mul1 | ||
190 | |||
191 | mulx $car1,$mul1,$car1 | ||
192 | mulx $npj,$mul1,$acc1 !prologue! | ||
193 | srlx $car0,32,$car0 | ||
194 | add $acc0,$car1,$car1 | ||
195 | ld [$np+8],$npj !prologue! | ||
196 | srlx $car1,32,$car1 | ||
197 | mov $tmp0,$acc0 !prologue! | ||
198 | |||
199 | .Linner: | ||
200 | mulx $apj,$mul0,$tmp0 | ||
201 | mulx $npj,$mul1,$tmp1 | ||
202 | add $tpj,$car0,$car0 | ||
203 | ld [$ap+$j],$apj ! ap[j] | ||
204 | add $acc0,$car0,$car0 | ||
205 | add $acc1,$car1,$car1 | ||
206 | ld [$np+$j],$npj ! np[j] | ||
207 | and $car0,$mask,$acc0 | ||
208 | ld [$tp+8],$tpj ! tp[j] | ||
209 | srlx $car0,32,$car0 | ||
210 | add $acc0,$car1,$car1 | ||
211 | add $j,4,$j ! j++ | ||
212 | mov $tmp0,$acc0 | ||
213 | st $car1,[$tp] ! tp[j-1] | ||
214 | srlx $car1,32,$car1 | ||
215 | mov $tmp1,$acc1 | ||
216 | cmp $j,$num | ||
217 | bl %icc,.Linner | ||
218 | add $tp,4,$tp ! tp++ | ||
219 | !.Linner | ||
220 | |||
221 | mulx $apj,$mul0,$tmp0 !epilogue! | ||
222 | mulx $npj,$mul1,$tmp1 | ||
223 | add $tpj,$car0,$car0 | ||
224 | add $acc0,$car0,$car0 | ||
225 | ld [$tp+8],$tpj ! tp[j] | ||
226 | and $car0,$mask,$acc0 | ||
227 | add $acc1,$car1,$car1 | ||
228 | srlx $car0,32,$car0 | ||
229 | add $acc0,$car1,$car1 | ||
230 | st $car1,[$tp] ! tp[j-1] | ||
231 | srlx $car1,32,$car1 | ||
232 | |||
233 | add $tpj,$car0,$car0 | ||
234 | add $tmp0,$car0,$car0 | ||
235 | and $car0,$mask,$acc0 | ||
236 | add $tmp1,$car1,$car1 | ||
237 | add $acc0,$car1,$car1 | ||
238 | st $car1,[$tp+4] ! tp[j-1] | ||
239 | srlx $car0,32,$car0 | ||
240 | add $i,4,$i ! i++ | ||
241 | srlx $car1,32,$car1 | ||
242 | |||
243 | add $car0,$car1,$car1 | ||
244 | cmp $i,$num | ||
245 | add $car2,$car1,$car1 | ||
246 | st $car1,[$tp+8] | ||
247 | |||
248 | srlx $car1,32,$car2 | ||
249 | bl,a %icc,.Louter | ||
250 | ld [$bp+$i],$mul0 ! bp[i] | ||
251 | !.Louter | ||
252 | |||
253 | add $tp,12,$tp | ||
254 | |||
255 | .Ltail: | ||
256 | add $np,$num,$np | ||
257 | add $rp,$num,$rp | ||
258 | mov $tp,$ap | ||
259 | sub %g0,$num,%o7 ! k=-num | ||
260 | ba .Lsub | ||
261 | subcc %g0,%g0,%g0 ! clear %icc.c | ||
262 | .align 16 | ||
263 | .Lsub: | ||
264 | ld [$tp+%o7],%o0 | ||
265 | ld [$np+%o7],%o1 | ||
266 | subccc %o0,%o1,%o1 ! tp[j]-np[j] | ||
267 | add $rp,%o7,$i | ||
268 | add %o7,4,%o7 | ||
269 | brnz %o7,.Lsub | ||
270 | st %o1,[$i] | ||
271 | subc $car2,0,$car2 ! handle upmost overflow bit | ||
272 | and $tp,$car2,$ap | ||
273 | andn $rp,$car2,$np | ||
274 | or $ap,$np,$ap | ||
275 | sub %g0,$num,%o7 | ||
276 | |||
277 | .Lcopy: | ||
278 | ld [$ap+%o7],%o0 ! copy or in-place refresh | ||
279 | st %g0,[$tp+%o7] ! zap tp | ||
280 | st %o0,[$rp+%o7] | ||
281 | add %o7,4,%o7 | ||
282 | brnz %o7,.Lcopy | ||
283 | nop | ||
284 | mov 1,%i0 | ||
285 | ret | ||
286 | restore | ||
287 | ___ | ||
288 | |||
289 | ######## | ||
290 | ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over | ||
291 | ######## code without following dedicated squaring procedure. | ||
292 | ######## | ||
293 | $sbit="%i2"; # re-use $bp! | ||
294 | |||
295 | $code.=<<___; | ||
296 | .align 32 | ||
297 | .Lbn_sqr_mont: | ||
298 | mulx $mul0,$mul0,$car0 ! ap[0]*ap[0] | ||
299 | mulx $apj,$mul0,$tmp0 !prologue! | ||
300 | and $car0,$mask,$acc0 | ||
301 | add %sp,$bias+$frame,$tp | ||
302 | ld [$ap+8],$apj !prologue! | ||
303 | |||
304 | mulx $n0,$acc0,$mul1 ! "t[0]"*n0 | ||
305 | srlx $car0,32,$car0 | ||
306 | and $mul1,$mask,$mul1 | ||
307 | |||
308 | mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 | ||
309 | mulx $npj,$mul1,$acc1 !prologue! | ||
310 | and $car0,1,$sbit | ||
311 | ld [$np+8],$npj !prologue! | ||
312 | srlx $car0,1,$car0 | ||
313 | add $acc0,$car1,$car1 | ||
314 | srlx $car1,32,$car1 | ||
315 | mov $tmp0,$acc0 !prologue! | ||
316 | |||
317 | .Lsqr_1st: | ||
318 | mulx $apj,$mul0,$tmp0 | ||
319 | mulx $npj,$mul1,$tmp1 | ||
320 | add $acc0,$car0,$car0 ! ap[j]*a0+c0 | ||
321 | add $acc1,$car1,$car1 | ||
322 | ld [$ap+$j],$apj ! ap[j] | ||
323 | and $car0,$mask,$acc0 | ||
324 | ld [$np+$j],$npj ! np[j] | ||
325 | srlx $car0,32,$car0 | ||
326 | add $acc0,$acc0,$acc0 | ||
327 | or $sbit,$acc0,$acc0 | ||
328 | mov $tmp1,$acc1 | ||
329 | srlx $acc0,32,$sbit | ||
330 | add $j,4,$j ! j++ | ||
331 | and $acc0,$mask,$acc0 | ||
332 | cmp $j,$num | ||
333 | add $acc0,$car1,$car1 | ||
334 | st $car1,[$tp] | ||
335 | mov $tmp0,$acc0 | ||
336 | srlx $car1,32,$car1 | ||
337 | bl %icc,.Lsqr_1st | ||
338 | add $tp,4,$tp ! tp++ | ||
339 | !.Lsqr_1st | ||
340 | |||
341 | mulx $apj,$mul0,$tmp0 ! epilogue | ||
342 | mulx $npj,$mul1,$tmp1 | ||
343 | add $acc0,$car0,$car0 ! ap[j]*a0+c0 | ||
344 | add $acc1,$car1,$car1 | ||
345 | and $car0,$mask,$acc0 | ||
346 | srlx $car0,32,$car0 | ||
347 | add $acc0,$acc0,$acc0 | ||
348 | or $sbit,$acc0,$acc0 | ||
349 | srlx $acc0,32,$sbit | ||
350 | and $acc0,$mask,$acc0 | ||
351 | add $acc0,$car1,$car1 | ||
352 | st $car1,[$tp] | ||
353 | srlx $car1,32,$car1 | ||
354 | |||
355 | add $tmp0,$car0,$car0 ! ap[j]*a0+c0 | ||
356 | add $tmp1,$car1,$car1 | ||
357 | and $car0,$mask,$acc0 | ||
358 | srlx $car0,32,$car0 | ||
359 | add $acc0,$acc0,$acc0 | ||
360 | or $sbit,$acc0,$acc0 | ||
361 | srlx $acc0,32,$sbit | ||
362 | and $acc0,$mask,$acc0 | ||
363 | add $acc0,$car1,$car1 | ||
364 | st $car1,[$tp+4] | ||
365 | srlx $car1,32,$car1 | ||
366 | |||
367 | add $car0,$car0,$car0 | ||
368 | or $sbit,$car0,$car0 | ||
369 | add $car0,$car1,$car1 | ||
370 | st $car1,[$tp+8] | ||
371 | srlx $car1,32,$car2 | ||
372 | |||
373 | ld [%sp+$bias+$frame],$tmp0 ! tp[0] | ||
374 | ld [%sp+$bias+$frame+4],$tmp1 ! tp[1] | ||
375 | ld [%sp+$bias+$frame+8],$tpj ! tp[2] | ||
376 | ld [$ap+4],$mul0 ! ap[1] | ||
377 | ld [$ap+8],$apj ! ap[2] | ||
378 | ld [$np],$car1 ! np[0] | ||
379 | ld [$np+4],$npj ! np[1] | ||
380 | mulx $n0,$tmp0,$mul1 | ||
381 | |||
382 | mulx $mul0,$mul0,$car0 | ||
383 | and $mul1,$mask,$mul1 | ||
384 | |||
385 | mulx $car1,$mul1,$car1 | ||
386 | mulx $npj,$mul1,$acc1 | ||
387 | add $tmp0,$car1,$car1 | ||
388 | and $car0,$mask,$acc0 | ||
389 | ld [$np+8],$npj ! np[2] | ||
390 | srlx $car1,32,$car1 | ||
391 | add $tmp1,$car1,$car1 | ||
392 | srlx $car0,32,$car0 | ||
393 | add $acc0,$car1,$car1 | ||
394 | and $car0,1,$sbit | ||
395 | add $acc1,$car1,$car1 | ||
396 | srlx $car0,1,$car0 | ||
397 | mov 12,$j | ||
398 | st $car1,[%sp+$bias+$frame] ! tp[0]= | ||
399 | srlx $car1,32,$car1 | ||
400 | add %sp,$bias+$frame+4,$tp | ||
401 | |||
402 | .Lsqr_2nd: | ||
403 | mulx $apj,$mul0,$acc0 | ||
404 | mulx $npj,$mul1,$acc1 | ||
405 | add $acc0,$car0,$car0 | ||
406 | add $tpj,$car1,$car1 | ||
407 | ld [$ap+$j],$apj ! ap[j] | ||
408 | and $car0,$mask,$acc0 | ||
409 | ld [$np+$j],$npj ! np[j] | ||
410 | srlx $car0,32,$car0 | ||
411 | add $acc1,$car1,$car1 | ||
412 | ld [$tp+8],$tpj ! tp[j] | ||
413 | add $acc0,$acc0,$acc0 | ||
414 | add $j,4,$j ! j++ | ||
415 | or $sbit,$acc0,$acc0 | ||
416 | srlx $acc0,32,$sbit | ||
417 | and $acc0,$mask,$acc0 | ||
418 | cmp $j,$num | ||
419 | add $acc0,$car1,$car1 | ||
420 | st $car1,[$tp] ! tp[j-1] | ||
421 | srlx $car1,32,$car1 | ||
422 | bl %icc,.Lsqr_2nd | ||
423 | add $tp,4,$tp ! tp++ | ||
424 | !.Lsqr_2nd | ||
425 | |||
426 | mulx $apj,$mul0,$acc0 | ||
427 | mulx $npj,$mul1,$acc1 | ||
428 | add $acc0,$car0,$car0 | ||
429 | add $tpj,$car1,$car1 | ||
430 | and $car0,$mask,$acc0 | ||
431 | srlx $car0,32,$car0 | ||
432 | add $acc1,$car1,$car1 | ||
433 | add $acc0,$acc0,$acc0 | ||
434 | or $sbit,$acc0,$acc0 | ||
435 | srlx $acc0,32,$sbit | ||
436 | and $acc0,$mask,$acc0 | ||
437 | add $acc0,$car1,$car1 | ||
438 | st $car1,[$tp] ! tp[j-1] | ||
439 | srlx $car1,32,$car1 | ||
440 | |||
441 | add $car0,$car0,$car0 | ||
442 | or $sbit,$car0,$car0 | ||
443 | add $car0,$car1,$car1 | ||
444 | add $car2,$car1,$car1 | ||
445 | st $car1,[$tp+4] | ||
446 | srlx $car1,32,$car2 | ||
447 | |||
448 | ld [%sp+$bias+$frame],$tmp1 ! tp[0] | ||
449 | ld [%sp+$bias+$frame+4],$tpj ! tp[1] | ||
450 | ld [$ap+8],$mul0 ! ap[2] | ||
451 | ld [$np],$car1 ! np[0] | ||
452 | ld [$np+4],$npj ! np[1] | ||
453 | mulx $n0,$tmp1,$mul1 | ||
454 | and $mul1,$mask,$mul1 | ||
455 | mov 8,$i | ||
456 | |||
457 | mulx $mul0,$mul0,$car0 | ||
458 | mulx $car1,$mul1,$car1 | ||
459 | and $car0,$mask,$acc0 | ||
460 | add $tmp1,$car1,$car1 | ||
461 | srlx $car0,32,$car0 | ||
462 | add %sp,$bias+$frame,$tp | ||
463 | srlx $car1,32,$car1 | ||
464 | and $car0,1,$sbit | ||
465 | srlx $car0,1,$car0 | ||
466 | mov 4,$j | ||
467 | |||
468 | .Lsqr_outer: | ||
469 | .Lsqr_inner1: | ||
470 | mulx $npj,$mul1,$acc1 | ||
471 | add $tpj,$car1,$car1 | ||
472 | add $j,4,$j | ||
473 | ld [$tp+8],$tpj | ||
474 | cmp $j,$i | ||
475 | add $acc1,$car1,$car1 | ||
476 | ld [$np+$j],$npj | ||
477 | st $car1,[$tp] | ||
478 | srlx $car1,32,$car1 | ||
479 | bl %icc,.Lsqr_inner1 | ||
480 | add $tp,4,$tp | ||
481 | !.Lsqr_inner1 | ||
482 | |||
483 | add $j,4,$j | ||
484 | ld [$ap+$j],$apj ! ap[j] | ||
485 | mulx $npj,$mul1,$acc1 | ||
486 | add $tpj,$car1,$car1 | ||
487 | ld [$np+$j],$npj ! np[j] | ||
488 | add $acc0,$car1,$car1 | ||
489 | ld [$tp+8],$tpj ! tp[j] | ||
490 | add $acc1,$car1,$car1 | ||
491 | st $car1,[$tp] | ||
492 | srlx $car1,32,$car1 | ||
493 | |||
494 | add $j,4,$j | ||
495 | cmp $j,$num | ||
496 | be,pn %icc,.Lsqr_no_inner2 | ||
497 | add $tp,4,$tp | ||
498 | |||
499 | .Lsqr_inner2: | ||
500 | mulx $apj,$mul0,$acc0 | ||
501 | mulx $npj,$mul1,$acc1 | ||
502 | add $tpj,$car1,$car1 | ||
503 | add $acc0,$car0,$car0 | ||
504 | ld [$ap+$j],$apj ! ap[j] | ||
505 | and $car0,$mask,$acc0 | ||
506 | ld [$np+$j],$npj ! np[j] | ||
507 | srlx $car0,32,$car0 | ||
508 | add $acc0,$acc0,$acc0 | ||
509 | ld [$tp+8],$tpj ! tp[j] | ||
510 | or $sbit,$acc0,$acc0 | ||
511 | add $j,4,$j ! j++ | ||
512 | srlx $acc0,32,$sbit | ||
513 | and $acc0,$mask,$acc0 | ||
514 | cmp $j,$num | ||
515 | add $acc0,$car1,$car1 | ||
516 | add $acc1,$car1,$car1 | ||
517 | st $car1,[$tp] ! tp[j-1] | ||
518 | srlx $car1,32,$car1 | ||
519 | bl %icc,.Lsqr_inner2 | ||
520 | add $tp,4,$tp ! tp++ | ||
521 | |||
522 | .Lsqr_no_inner2: | ||
523 | mulx $apj,$mul0,$acc0 | ||
524 | mulx $npj,$mul1,$acc1 | ||
525 | add $tpj,$car1,$car1 | ||
526 | add $acc0,$car0,$car0 | ||
527 | and $car0,$mask,$acc0 | ||
528 | srlx $car0,32,$car0 | ||
529 | add $acc0,$acc0,$acc0 | ||
530 | or $sbit,$acc0,$acc0 | ||
531 | srlx $acc0,32,$sbit | ||
532 | and $acc0,$mask,$acc0 | ||
533 | add $acc0,$car1,$car1 | ||
534 | add $acc1,$car1,$car1 | ||
535 | st $car1,[$tp] ! tp[j-1] | ||
536 | srlx $car1,32,$car1 | ||
537 | |||
538 | add $car0,$car0,$car0 | ||
539 | or $sbit,$car0,$car0 | ||
540 | add $car0,$car1,$car1 | ||
541 | add $car2,$car1,$car1 | ||
542 | st $car1,[$tp+4] | ||
543 | srlx $car1,32,$car2 | ||
544 | |||
545 | add $i,4,$i ! i++ | ||
546 | ld [%sp+$bias+$frame],$tmp1 ! tp[0] | ||
547 | ld [%sp+$bias+$frame+4],$tpj ! tp[1] | ||
548 | ld [$ap+$i],$mul0 ! ap[j] | ||
549 | ld [$np],$car1 ! np[0] | ||
550 | ld [$np+4],$npj ! np[1] | ||
551 | mulx $n0,$tmp1,$mul1 | ||
552 | and $mul1,$mask,$mul1 | ||
553 | add $i,4,$tmp0 | ||
554 | |||
555 | mulx $mul0,$mul0,$car0 | ||
556 | mulx $car1,$mul1,$car1 | ||
557 | and $car0,$mask,$acc0 | ||
558 | add $tmp1,$car1,$car1 | ||
559 | srlx $car0,32,$car0 | ||
560 | add %sp,$bias+$frame,$tp | ||
561 | srlx $car1,32,$car1 | ||
562 | and $car0,1,$sbit | ||
563 | srlx $car0,1,$car0 | ||
564 | |||
565 | cmp $tmp0,$num ! i<num-1 | ||
566 | bl %icc,.Lsqr_outer | ||
567 | mov 4,$j | ||
568 | |||
569 | .Lsqr_last: | ||
570 | mulx $npj,$mul1,$acc1 | ||
571 | add $tpj,$car1,$car1 | ||
572 | add $j,4,$j | ||
573 | ld [$tp+8],$tpj | ||
574 | cmp $j,$i | ||
575 | add $acc1,$car1,$car1 | ||
576 | ld [$np+$j],$npj | ||
577 | st $car1,[$tp] | ||
578 | srlx $car1,32,$car1 | ||
579 | bl %icc,.Lsqr_last | ||
580 | add $tp,4,$tp | ||
581 | !.Lsqr_last | ||
582 | |||
583 | mulx $npj,$mul1,$acc1 | ||
584 | add $tpj,$car1,$car1 | ||
585 | add $acc0,$car1,$car1 | ||
586 | add $acc1,$car1,$car1 | ||
587 | st $car1,[$tp] | ||
588 | srlx $car1,32,$car1 | ||
589 | |||
590 | add $car0,$car0,$car0 ! recover $car0 | ||
591 | or $sbit,$car0,$car0 | ||
592 | add $car0,$car1,$car1 | ||
593 | add $car2,$car1,$car1 | ||
594 | st $car1,[$tp+4] | ||
595 | srlx $car1,32,$car2 | ||
596 | |||
597 | ba .Ltail | ||
598 | add $tp,8,$tp | ||
599 | .type $fname,#function | ||
600 | .size $fname,(.-$fname) | ||
601 | .asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" | ||
602 | .align 32 | ||
603 | ___ | ||
604 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
605 | print $code; | ||
606 | close STDOUT; | ||