Diffstat (limited to 'src/lib/libcrypto/bn/asm/sparcv8plus.S')
-rw-r--r--	src/lib/libcrypto/bn/asm/sparcv8plus.S	1535
1 file changed, 1535 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/asm/sparcv8plus.S b/src/lib/libcrypto/bn/asm/sparcv8plus.S
new file mode 100644
index 0000000000..0074dfdb75
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/sparcv8plus.S
@@ -0,0 +1,1535 @@
.ident	"sparcv8plus.s, Version 1.4"
.ident	"SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"

/*
 * ====================================================================
 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 * ====================================================================
 */

/*
 * This is my modest contribution to the OpenSSL project (see
 * http://www.openssl.org/ for more information about it) and is
 * a drop-in UltraSPARC ISA replacement for the crypto/bn/bn_asm.c
 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
 *
 * Questions-n-answers.
 *
 * Q. How to compile?
 * A. With SC4.x/SC5.x:
 *
 *	cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *
 *    and with gcc:
 *
 *	gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *
 *    or if the above fails (it does if you have gas installed):
 *
 *	gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
 *
 *    Quick-n-dirty way to fuse the module into the library.
 *    Provided that the library is already configured and built
 *    (in the 0.9.2 case with the no-asm option):
 *
 *	# cd crypto/bn
 *	# cp /some/place/bn_asm.sparc.v8plus.S .
 *	# cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *	# make
 *	# cd ../..
 *	# make; make test
 *
 *    Quick-n-dirty way to get rid of it:
 *
 *	# cd crypto/bn
 *	# touch bn_asm.c
 *	# make
 *	# cd ../..
 *	# make; make test
 *
 * Q. V8plus architecture? What kind of beast is that?
 * A. Well, it's more a programming model than an architecture...
 *    It's actually v9-compliant, i.e. *any* UltraSPARC CPU, under
 *    special conditions, namely when the kernel doesn't preserve the
 *    upper 32 bits of otherwise 64-bit registers during a context
 *    switch.
 *
 * Q. Why just UltraSPARC? What about SuperSPARC?
 * A. The original release targeted UltraSPARC only. Now a SuperSPARC
 *    version is provided alongside. Both versions share the
 *    bn_*comba[48] implementations (see the comment later in the code
 *    for an explanation). But what's so special about this UltraSPARC
 *    implementation? Why didn't I let the compiler do the job? The
 *    trouble is that most of the available compilers (well, SC5.0 is
 *    the only exception) don't attempt to take advantage of
 *    UltraSPARC's 64-bitness under 32-bit kernels even though it's
 *    perfectly possible (see the next question).
 *
 * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
 *    doesn't work?
 * A. You can't address *all* registers as 64-bit wide:-( The catch is
 *    that you may actually rely upon %o0-%o5 and %g1-%g4 being fully
 *    preserved if you're in a leaf function, i.e. one that never calls
 *    any other functions. All functions in this module are leaf and
 *    10 registers is a handful. And as a matter of fact the non-"comba"
 *    routines don't require even that much, and I could even afford
 *    not to allocate a stack frame of their own for 'em:-)
 *
 * Q. What about 64-bit kernels?
 * A. What about 'em? Just kidding:-) A pure 64-bit version is
 *    currently under evaluation and development...
 *
 * Q. What about shared libraries?
 * A. What about 'em? Kidding again:-) The code does *not* contain any
 *    position-dependent code and it's safe to include it in a
 *    shared library as is.
 *
 * Q. How much faster does it go?
 * A. Do you have a good benchmark? In either case, below is what I
 *    experience with the crypto/bn/expspeed.c test program:
 *
 *	v8plus module on U10/300MHz against bn_asm.c compiled with:
 *
 *	cc-5.0 -xarch=v8plus -xO5 -xdepend	+7-12%
 *	cc-4.2 -xarch=v8plus -xO5 -xdepend	+25-35%
 *	egcs-1.1.2 -mcpu=ultrasparc -O3		+35-45%
 *
 *	v8 module on SS10/60MHz against bn_asm.c compiled with:
 *
 *	cc-5.0 -xarch=v8 -xO5 -xdepend		+7-10%
 *	cc-4.2 -xarch=v8 -xO5 -xdepend		+10%
 *	egcs-1.1.2 -mv8 -O3			+35-45%
 *
 *    As you can see, it's damn hard to beat the new Sun C compiler,
 *    and it's GNU C users in the first place who will appreciate this
 *    assembler implementation:-)
 */

/*
 * Revision history.
 *
 * 1.0	- initial release;
 * 1.1	- new loop unrolling model(*);
 *	- some more fine tuning;
 * 1.2	- made gas friendly;
 *	- updates to documentation concerning v9;
 *	- new performance comparison matrix;
 * 1.3	- fixed problem with /usr/ccs/lib/cpp;
 * 1.4	- native V9 bn_*_comba[48] implementation (15% more efficient)
 *	  resulting in a slight overall performance kick;
 *	- some retunes;
 *	- support for GNU as added;
 *
 * (*) The originally unrolled loop looked like this:
 *	for (;;) {
 *		op(p+0); if (--n==0) break;
 *		op(p+1); if (--n==0) break;
 *		op(p+2); if (--n==0) break;
 *		op(p+3); if (--n==0) break;
 *		p+=4;
 *	}
 *     I now unroll according to the following:
 *	while (n&~3) {
 *		op(p+0); op(p+1); op(p+2); op(p+3);
 *		p+=4; n-=4;
 *	}
 *	if (n) {
 *		op(p+0); if (--n==0) return;
 *		op(p+1); if (--n==0) return;
 *		op(p+2); return;
 *	}
 */

/*
 * GNU assembler can't stand stuw:-(
 */
#define stuw st

.section	".text",#alloc,#execinstr
.file		"bn_asm.sparc.v8plus.S"

.align	32

.global bn_mul_add_words
/*
 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 */
bn_mul_add_words:
	brgz,a	%o2,.L_bn_mul_add_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0

.L_bn_mul_add_words_proceed:
	srl	%o3,%g0,%o3	! clruw %o3
	andcc	%o2,-4,%g0
	bz,pn	%icc,.L_bn_mul_add_words_tail
	clr	%o5

.L_bn_mul_add_words_loop:	! wow! 32 aligned!
	lduw	[%o0],%g1
	lduw	[%o1+4],%g3
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	nop
	add	%o4,%g2,%o4
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5

	lduw	[%o0+4],%g1
	lduw	[%o1+8],%g2
	mulx	%o3,%g3,%g3
	add	%g1,%o5,%o4
	dec	4,%o2
	add	%o4,%g3,%o4
	stuw	%o4,[%o0+4]
	srlx	%o4,32,%o5

	lduw	[%o0+8],%g1
	lduw	[%o1+12],%g3
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	inc	16,%o1
	add	%o4,%g2,%o4
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5

	lduw	[%o0+12],%g1
	mulx	%o3,%g3,%g3
	add	%g1,%o5,%o4
	inc	16,%o0
	add	%o4,%g3,%o4
	andcc	%o2,-4,%g0
	stuw	%o4,[%o0-4]
	srlx	%o4,32,%o5
	bnz,a,pt	%icc,.L_bn_mul_add_words_loop
	lduw	[%o1],%g2

	brnz,a,pn	%o2,.L_bn_mul_add_words_tail
	lduw	[%o1],%g2
.L_bn_mul_add_words_return:
	retl
	mov	%o5,%o0

.L_bn_mul_add_words_tail:
	lduw	[%o0],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	dec	%o2
	add	%o4,%g2,%o4
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_add_words_return
	stuw	%o4,[%o0]

	lduw	[%o1+4],%g2
	lduw	[%o0+4],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	dec	%o2
	add	%o4,%g2,%o4
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_add_words_return
	stuw	%o4,[%o0+4]

	lduw	[%o1+8],%g2
	lduw	[%o0+8],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	add	%o4,%g2,%o4
	stuw	%o4,[%o0+8]
	retl
	srlx	%o4,32,%o0

.type	bn_mul_add_words,#function
.size	bn_mul_add_words,(.-bn_mul_add_words)
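
/*
 * For reference only (the #if 0 block is stripped by cpp, never
 * assembled): a C sketch of what bn_mul_add_words computes, following
 * the portable crypto/bn/bn_asm.c semantics with 32-bit words and a
 * 64-bit intermediate product. The mulx/srlx pairs above implement
 * this word-by-word multiply-accumulate with carry. The helper name
 * below is hypothetical.
 */
#if 0
static unsigned int mul_add_words_ref(unsigned int *rp,
    const unsigned int *ap, int num, unsigned int w)
{
	unsigned long long c = 0;		/* running carry */

	while (num-- > 0) {
		c += (unsigned long long)w * *ap++ + *rp;
		*rp++ = (unsigned int)c;	/* low 32 bits */
		c >>= 32;			/* carry out */
	}
	return (unsigned int)c;			/* final carry */
}
#endif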

.align	32

.global bn_mul_words
/*
 * BN_ULONG bn_mul_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 */
bn_mul_words:
	brgz,a	%o2,.L_bn_mul_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0

.L_bn_mul_words_proceed:
	srl	%o3,%g0,%o3	! clruw %o3
	andcc	%o2,-4,%g0
	bz,pn	%icc,.L_bn_mul_words_tail
	clr	%o5

.L_bn_mul_words_loop:		! wow! 32 aligned!
	lduw	[%o1+4],%g3
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	nop
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5

	lduw	[%o1+8],%g2
	mulx	%o3,%g3,%g3
	add	%g3,%o5,%o4
	dec	4,%o2
	stuw	%o4,[%o0+4]
	srlx	%o4,32,%o5

	lduw	[%o1+12],%g3
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	inc	16,%o1
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5

	mulx	%o3,%g3,%g3
	add	%g3,%o5,%o4
	inc	16,%o0
	stuw	%o4,[%o0-4]
	srlx	%o4,32,%o5
	andcc	%o2,-4,%g0
	bnz,a,pt	%icc,.L_bn_mul_words_loop
	lduw	[%o1],%g2
	nop
	nop

	brnz,a,pn	%o2,.L_bn_mul_words_tail
	lduw	[%o1],%g2
.L_bn_mul_words_return:
	retl
	mov	%o5,%o0

.L_bn_mul_words_tail:
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	dec	%o2
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_words_return
	stuw	%o4,[%o0]

	lduw	[%o1+4],%g2
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	dec	%o2
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_words_return
	stuw	%o4,[%o0+4]

	lduw	[%o1+8],%g2
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	stuw	%o4,[%o0+8]
	retl
	srlx	%o4,32,%o0

.type	bn_mul_words,#function
.size	bn_mul_words,(.-bn_mul_words)
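
/*
 * Reference sketch (stripped by cpp, helper name hypothetical):
 * bn_mul_words is the same loop as bn_mul_add_words minus the
 * read-back of rp[], i.e. rp[i] receives the low word of w*ap[i]
 * plus the incoming carry.
 */
#if 0
static unsigned int mul_words_ref(unsigned int *rp,
    const unsigned int *ap, int num, unsigned int w)
{
	unsigned long long c = 0;		/* running carry */

	while (num-- > 0) {
		c += (unsigned long long)w * *ap++;
		*rp++ = (unsigned int)c;	/* low 32 bits */
		c >>= 32;			/* carry out */
	}
	return (unsigned int)c;
}
#endif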

.align	32
.global bn_sqr_words
/*
 * void bn_sqr_words(r,a,n)
 * BN_ULONG *r,*a;
 * int n;
 */
bn_sqr_words:
	brgz,a	%o2,.L_bn_sqr_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0

.L_bn_sqr_words_proceed:
	andcc	%o2,-4,%g0
	nop
	bz,pn	%icc,.L_bn_sqr_words_tail
	nop

.L_bn_sqr_words_loop:		! wow! 32 aligned!
	lduw	[%o1+4],%g3
	mulx	%g2,%g2,%o4
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5
	stuw	%o5,[%o0+4]
	nop

	lduw	[%o1+8],%g2
	mulx	%g3,%g3,%o4
	dec	4,%o2
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5
	stuw	%o5,[%o0+12]

	lduw	[%o1+12],%g3
	mulx	%g2,%g2,%o4
	srlx	%o4,32,%o5
	stuw	%o4,[%o0+16]
	inc	16,%o1
	stuw	%o5,[%o0+20]

	mulx	%g3,%g3,%o4
	inc	32,%o0
	stuw	%o4,[%o0-8]
	srlx	%o4,32,%o5
	andcc	%o2,-4,%g2
	stuw	%o5,[%o0-4]
	bnz,a,pt	%icc,.L_bn_sqr_words_loop
	lduw	[%o1],%g2
	nop

	brnz,a,pn	%o2,.L_bn_sqr_words_tail
	lduw	[%o1],%g2
.L_bn_sqr_words_return:
	retl
	clr	%o0

.L_bn_sqr_words_tail:
	mulx	%g2,%g2,%o4
	dec	%o2
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_sqr_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+4],%g2
	mulx	%g2,%g2,%o4
	dec	%o2
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_sqr_words_return
	stuw	%o5,[%o0+12]

	lduw	[%o1+8],%g2
	mulx	%g2,%g2,%o4
	srlx	%o4,32,%o5
	stuw	%o4,[%o0+16]
	stuw	%o5,[%o0+20]
	retl
	clr	%o0

.type	bn_sqr_words,#function
.size	bn_sqr_words,(.-bn_sqr_words)
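
/*
 * Reference sketch (stripped by cpp, helper name hypothetical):
 * bn_sqr_words squares each input word independently, writing the
 * 64-bit square as two 32-bit result words. There is no carry between
 * iterations, which is why the loop above needs no addcc at all.
 */
#if 0
static void sqr_words_ref(unsigned int *r, const unsigned int *a, int n)
{
	unsigned long long t;

	while (n-- > 0) {
		t = (unsigned long long)*a * *a;
		a++;
		*r++ = (unsigned int)t;		/* low word  */
		*r++ = (unsigned int)(t >> 32);	/* high word */
	}
}
#endif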

.align	32
.global bn_div_words
/*
 * BN_ULONG bn_div_words(h,l,d)
 * BN_ULONG h,l,d;
 */
bn_div_words:
	sllx	%o0,32,%o0
	or	%o0,%o1,%o0
	udivx	%o0,%o2,%o0
	retl
	srl	%o0,%g0,%o0	! clruw %o0

.type	bn_div_words,#function
.size	bn_div_words,(.-bn_div_words)
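
/*
 * Reference sketch (stripped by cpp, helper name hypothetical):
 * bn_div_words divides the double word h:l by d, which is exactly what
 * the sllx/or/udivx triplet above computes; the final srl truncates
 * the quotient to its low 32 bits, and the cast below does the same.
 */
#if 0
static unsigned int div_words_ref(unsigned int h, unsigned int l,
    unsigned int d)
{
	unsigned long long n = ((unsigned long long)h << 32) | l;

	return (unsigned int)(n / d);	/* low 32 bits of the quotient */
}
#endif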

.align	32

.global bn_add_words
/*
 * BN_ULONG bn_add_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 */
bn_add_words:
	brgz,a	%o3,.L_bn_add_words_proceed
	lduw	[%o1],%o4
	retl
	clr	%o0

.L_bn_add_words_proceed:
	andcc	%o3,-4,%g0
	bz,pn	%icc,.L_bn_add_words_tail
	addcc	%g0,0,%g0	! clear carry flag
	nop

.L_bn_add_words_loop:		! wow! 32 aligned!
	dec	4,%o3
	lduw	[%o2],%o5
	lduw	[%o1+4],%g1
	lduw	[%o2+4],%g2
	lduw	[%o1+8],%g3
	lduw	[%o2+8],%g4
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0]

	lduw	[%o1+12],%o4
	lduw	[%o2+12],%o5
	inc	16,%o1
	addccc	%g1,%g2,%g1
	stuw	%g1,[%o0+4]

	inc	16,%o2
	addccc	%g3,%g4,%g3
	stuw	%g3,[%o0+8]

	inc	16,%o0
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0-4]
	and	%o3,-4,%g1
	brnz,a,pt	%g1,.L_bn_add_words_loop
	lduw	[%o1],%o4

	brnz,a,pn	%o3,.L_bn_add_words_tail
	lduw	[%o1],%o4
.L_bn_add_words_return:
	clr	%o0
	retl
	movcs	%icc,1,%o0
	nop

.L_bn_add_words_tail:
	lduw	[%o2],%o5
	dec	%o3
	addccc	%o5,%o4,%o5
	brz,pt	%o3,.L_bn_add_words_return
	stuw	%o5,[%o0]

	lduw	[%o1+4],%o4
	lduw	[%o2+4],%o5
	dec	%o3
	addccc	%o5,%o4,%o5
	brz,pt	%o3,.L_bn_add_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+8],%o4
	lduw	[%o2+8],%o5
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0+8]
	clr	%o0
	retl
	movcs	%icc,1,%o0

.type	bn_add_words,#function
.size	bn_add_words,(.-bn_add_words)
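
/*
 * Reference sketch (stripped by cpp, helper name hypothetical):
 * bn_add_words adds two word arrays with carry propagation and returns
 * the final carry; the addccc chain above keeps that carry in the
 * condition codes instead of a register. bn_sub_words below is the
 * mirror image with borrow instead of carry.
 */
#if 0
static unsigned int add_words_ref(unsigned int *rp,
    const unsigned int *ap, const unsigned int *bp, int n)
{
	unsigned long long c = 0;

	while (n-- > 0) {
		c += (unsigned long long)*ap++ + *bp++;
		*rp++ = (unsigned int)c;	/* low 32 bits     */
		c >>= 32;			/* carry is 0 or 1 */
	}
	return (unsigned int)c;
}
#endif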

.global bn_sub_words
/*
 * BN_ULONG bn_sub_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 */
bn_sub_words:
	brgz,a	%o3,.L_bn_sub_words_proceed
	lduw	[%o1],%o4
	retl
	clr	%o0

.L_bn_sub_words_proceed:
	andcc	%o3,-4,%g0
	bz,pn	%icc,.L_bn_sub_words_tail
	addcc	%g0,0,%g0	! clear carry flag
	nop

.L_bn_sub_words_loop:		! wow! 32 aligned!
	dec	4,%o3
	lduw	[%o2],%o5
	lduw	[%o1+4],%g1
	lduw	[%o2+4],%g2
	lduw	[%o1+8],%g3
	lduw	[%o2+8],%g4
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0]

	lduw	[%o1+12],%o4
	lduw	[%o2+12],%o5
	inc	16,%o1
	subccc	%g1,%g2,%g2
	stuw	%g2,[%o0+4]

	inc	16,%o2
	subccc	%g3,%g4,%g4
	stuw	%g4,[%o0+8]

	inc	16,%o0
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0-4]
	and	%o3,-4,%g1
	brnz,a,pt	%g1,.L_bn_sub_words_loop
	lduw	[%o1],%o4

	brnz,a,pn	%o3,.L_bn_sub_words_tail
	lduw	[%o1],%o4
.L_bn_sub_words_return:
	clr	%o0
	retl
	movcs	%icc,1,%o0
	nop

.L_bn_sub_words_tail:		! wow! 32 aligned!
	lduw	[%o2],%o5
	dec	%o3
	subccc	%o4,%o5,%o5
	brz,pt	%o3,.L_bn_sub_words_return
	stuw	%o5,[%o0]

	lduw	[%o1+4],%o4
	lduw	[%o2+4],%o5
	dec	%o3
	subccc	%o4,%o5,%o5
	brz,pt	%o3,.L_bn_sub_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+8],%o4
	lduw	[%o2+8],%o5
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0+8]
	clr	%o0
	retl
	movcs	%icc,1,%o0

.type	bn_sub_words,#function
.size	bn_sub_words,(.-bn_sub_words)

/*
 * The code below relies on the fact that the upper parts of %l0-%l7
 * and %i0-%i7 are zeroed by the kernel after a context switch. In
 * previous versions this comment stated that "the trouble is that
 * it's not feasible to implement the mumbo-jumbo in less V9
 * instructions:-(" which apparently isn't true thanks to the
 * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
 * results not from the shorter code, but from the elimination of
 * multicycle non-pairable 'rd %y,%rd' instructions.
 *
 * Andy.
 */

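/*
 * Illustration (stripped by cpp, names hypothetical): the
 * 'addcc; bcs,a %xcc,.+8; add' clusters below implement the
 * mul_add_c() primitive of the portable comba code. With c_12 holding
 * the c2:c1 pair as one 64-bit value and c_3 collecting overflow into
 * c3, one cluster is roughly the sketch below; t_2 preloads the
 * constant 1<<32 so a carry out of the 64-bit add can be folded into
 * c3 with a single conditional add.
 */
#if 0
static void mul_add_c_ref(unsigned int a, unsigned int b,
    unsigned long long *c_12, unsigned long long *c_3,
    unsigned long long t_2)		/* t_2 == 1ULL << 32 */
{
	unsigned long long t_1 = (unsigned long long)a * b;

	*c_12 += t_1;
	if (*c_12 < t_1)	/* carry out of the 64-bit add */
		*c_3 += t_2;	/* i.e. ++c3 in its own word   */
}
#endif
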
#define FRAME_SIZE	-96

/*
 * Here is the register usage map for *all* the routines below.
 */
#define t_1	%o0
#define t_2	%o1
#define c_12	%o2
#define c_3	%o3

#define ap(I)	[%i1+4*I]
#define bp(I)	[%i2+4*I]
#define rp(I)	[%i0+4*I]

#define a_0	%l0
#define a_1	%l1
#define a_2	%l2
#define a_3	%l3
#define a_4	%l4
#define a_5	%l5
#define a_6	%l6
#define a_7	%l7

#define b_0	%i3
#define b_1	%i4
#define b_2	%i5
#define b_3	%o4
#define b_4	%o5
#define b_5	%o7
#define b_6	%g1
#define b_7	%g4

.align	32
.global bn_mul_comba8
/*
 * void bn_mul_comba8(r,a,b)
 * BN_ULONG *r,*a,*b;
 */
bn_mul_comba8:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	bp(0),b_0	!=
	lduw	bp(1),b_1
	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!=!r[0]=c1;

	lduw	ap(1),a_1
	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(2),a_2
	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(2),b_2	!=
	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(3),b_3
	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_2,t_1	!=!mul_add_c(a[1],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(4),a_4
	mulx	a_3,b_0,t_1	!=!mul_add_c(a[3],b[0],c1,c2,c3);!=
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_4,b_0,t_1	!mul_add_c(a[4],b[0],c2,c3,c1);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_1,t_1	!=!mul_add_c(a[3],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_2,t_1	!=!mul_add_c(a[2],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(4),b_4	!=
	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(5),b_5
	mulx	a_0,b_4,t_1	!mul_add_c(a[0],b[4],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_5,t_1	!mul_add_c(a[0],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_4,t_1	!mul_add_c(a[1],b[4],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(5),a_5
	mulx	a_4,b_1,t_1	!mul_add_c(a[4],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(6),a_6
	mulx	a_5,b_0,t_1	!=!mul_add_c(a[5],b[0],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_6,b_0,t_1	!mul_add_c(a[6],b[0],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,b_1,t_1	!=!mul_add_c(a[5],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,b_2,t_1	!=!mul_add_c(a[4],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_3,t_1	!=!mul_add_c(a[3],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_4,t_1	!=!mul_add_c(a[2],b[4],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(6),b_6	!=
	mulx	a_1,b_5,t_1	!mul_add_c(a[1],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(7),b_7
	mulx	a_0,b_6,t_1	!mul_add_c(a[0],b[6],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_7,t_1	!mul_add_c(a[0],b[7],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_6,t_1	!mul_add_c(a[1],b[6],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_2,b_5,t_1	!mul_add_c(a[2],b[5],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_4,t_1	!mul_add_c(a[3],b[4],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_3,t_1	!mul_add_c(a[4],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_2,t_1	!mul_add_c(a[5],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(7),a_7
	mulx	a_6,b_1,t_1	!=!mul_add_c(a[6],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_7,b_0,t_1	!=!mul_add_c(a[7],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(7)	!r[7]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,b_1,t_1	!=!mul_add_c(a[7],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_6,b_2,t_1	!mul_add_c(a[6],b[2],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_5,b_3,t_1	!mul_add_c(a[5],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_4,b_4,t_1	!mul_add_c(a[4],b[4],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_3,b_5,t_1	!mul_add_c(a[3],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_2,b_6,t_1	!mul_add_c(a[2],b[6],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_1,b_7,t_1	!mul_add_c(a[1],b[7],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	srlx	t_1,32,c_12
	stuw	t_1,rp(8)	!r[8]=c3;
	or	c_12,c_3,c_12

	mulx	a_2,b_7,t_1	!=!mul_add_c(a[2],b[7],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_3,b_6,t_1	!mul_add_c(a[3],b[6],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_5,t_1	!mul_add_c(a[4],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_4,t_1	!mul_add_c(a[5],b[4],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_3,t_1	!mul_add_c(a[6],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_2,t_1	!mul_add_c(a[7],b[2],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(9)	!r[9]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_3,t_1	!mul_add_c(a[7],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_4,t_1	!mul_add_c(a[6],b[4],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_5,t_1	!mul_add_c(a[5],b[5],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_6,t_1	!mul_add_c(a[4],b[6],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_7,t_1	!mul_add_c(a[3],b[7],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(10)	!r[10]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_4,b_7,t_1	!mul_add_c(a[4],b[7],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_6,t_1	!mul_add_c(a[5],b[6],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_5,t_1	!mul_add_c(a[6],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_4,t_1	!mul_add_c(a[7],b[4],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(11)	!r[11]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_5,t_1	!mul_add_c(a[7],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_6,t_1	!mul_add_c(a[6],b[6],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_7,t_1	!mul_add_c(a[5],b[7],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(12)	!r[12]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_6,b_7,t_1	!mul_add_c(a[6],b[7],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_6,t_1	!mul_add_c(a[7],b[6],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(13)	!r[13]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_7,t_1	!mul_add_c(a[7],b[7],c3,c1,c2);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(14)	!r[14]=c3;
	stuw	c_12,rp(15)	!r[15]=c1;

	ret
	restore	%g0,%g0,%o0	!=

.type	bn_mul_comba8,#function
.size	bn_mul_comba8,(.-bn_mul_comba8)

.align	32

.global bn_mul_comba4
/*
 * void bn_mul_comba4(r,a,b)
 * BN_ULONG *r,*a,*b;
 */
bn_mul_comba4:
	save	%sp,FRAME_SIZE,%sp
	lduw	ap(0),a_0
	mov	1,t_2
	lduw	bp(0),b_0
	sllx	t_2,32,t_2	!=
	lduw	bp(1),b_1
	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!=!r[0]=c1;

	lduw	ap(1),a_1
	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(2),a_2
	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(2),b_2	!=
	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(3),b_3
	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_2,t_1	!mul_add_c(a[1],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_0,t_1	!mul_add_c(a[3],b[0],c1,c2,c3);!=
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!=!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_3,b_1,t_1	!mul_add_c(a[3],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_2,t_1	!mul_add_c(a[2],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!=!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!=!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_3,b_3,t_1	!mul_add_c(a[3],b[3],c1,c2,c3);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(6)	!r[6]=c1;
	stuw	c_12,rp(7)	!r[7]=c2;

	ret
	restore	%g0,%g0,%o0

.type	bn_mul_comba4,#function
.size	bn_mul_comba4,(.-bn_mul_comba4)
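
/*
 * Reference sketch (stripped by cpp, helper name hypothetical): the
 * comba routines compute a full product column by column.
 * bn_mul_comba4 above is the fully unrolled, carry-in-registers
 * rendition of this portable double loop; the acc update at the end
 * of each column mirrors the 'srlx t_1,32,c_12; or c_12,c_3,c_12'
 * pair.
 */
#if 0
static void mul_comba4_ref(unsigned int *r, const unsigned int *a,
    const unsigned int *b)
{
	unsigned long long acc = 0, t;	/* acc is the c2:c1 pair */
	unsigned int c3 = 0;		/* overflow counter (c3) */
	int i, k;

	for (k = 0; k < 7; k++) {	/* result words r[0]..r[6] */
		for (i = 0; i < 4; i++) {
			if (k - i < 0 || k - i > 3)
				continue;
			t = (unsigned long long)a[i] * b[k - i];
			acc += t;
			if (acc < t)		/* 64-bit overflow */
				c3++;
		}
		r[k] = (unsigned int)acc;	/* emit one column */
		acc = (acc >> 32) | ((unsigned long long)c3 << 32);
		c3 = 0;
	}
	r[7] = (unsigned int)acc;		/* top word */
}
#endif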

.align	32

.global bn_sqr_comba8
/*
 * void bn_sqr_comba8(r,a)
 * BN_ULONG *r,*a;
 */
bn_sqr_comba8:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	ap(1),a_1
	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!r[0]=c1;

	lduw	ap(2),a_2
	mulx	a_0,a_1,t_1	!=!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12

	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(4),a_4
	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_4,a_0,t_1	!sqr_add_c2(a,4,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(5),a_5
	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_0,a_5,t_1	!sqr_add_c2(a,5,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_4,t_1	!sqr_add_c2(a,4,1,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(6),a_6
	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_6,a_0,t_1	!sqr_add_c2(a,6,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_1,t_1	!sqr_add_c2(a,5,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_2,t_1	!sqr_add_c2(a,4,2,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(7),a_7
	mulx	a_3,a_3,t_1	!=!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	or	c_12,c_3,c_12

	mulx	a_0,a_7,t_1	!sqr_add_c2(a,7,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_6,t_1	!sqr_add_c2(a,6,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,a_5,t_1	!sqr_add_c2(a,5,2,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_4,t_1	!sqr_add_c2(a,4,3,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(7)	!r[7]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,a_1,t_1	!sqr_add_c2(a,7,1,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_2,t_1	!sqr_add_c2(a,6,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_3,t_1	!sqr_add_c2(a,5,3,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_4,t_1	!sqr_add_c(a,4,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(8)	!r[8]=c3;
	or	c_12,c_3,c_12

	mulx	a_2,a_7,t_1	!sqr_add_c2(a,7,2,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_6,t_1	!sqr_add_c2(a,6,3,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_5,t_1	!sqr_add_c2(a,5,4,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(9)	!r[9]=c1;
	or	c_12,c_3,c_12

	mulx	a_7,a_3,t_1	!sqr_add_c2(a,7,3,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_4,t_1	!sqr_add_c2(a,6,4,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_5,t_1	!sqr_add_c(a,5,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(10)	!r[10]=c2;
	or	c_12,c_3,c_12

	mulx	a_4,a_7,t_1	!sqr_add_c2(a,7,4,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_6,t_1	!sqr_add_c2(a,6,5,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(11)	!r[11]=c3;
	or	c_12,c_3,c_12

	mulx	a_7,a_5,t_1	!sqr_add_c2(a,7,5,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_6,t_1	!sqr_add_c(a,6,c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(12)	!r[12]=c1;
	or	c_12,c_3,c_12

	mulx	a_6,a_7,t_1	!sqr_add_c2(a,7,6,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(13)	!r[13]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,a_7,t_1	!sqr_add_c(a,7,c3,c1,c2);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12
	stuw	t_1,rp(14)	!r[14]=c3;
	stuw	c_12,rp(15)	!r[15]=c1;

	ret
	restore	%g0,%g0,%o0

.type	bn_sqr_comba8,#function
.size	bn_sqr_comba8,(.-bn_sqr_comba8)

.align	32

.global bn_sqr_comba4
/*
 * void bn_sqr_comba4(r,a)
 * BN_ULONG *r,*a;
 */
bn_sqr_comba4:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	ap(1),a_1
	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!r[0]=c1;

	lduw	ap(2),a_2
	mulx	a_0,a_1,t_1	!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12

	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_3,a_3,t_1	!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	stuw	c_12,rp(7)	!r[7]=c2;

	ret
	restore	%g0,%g0,%o0

.type	bn_sqr_comba4,#function
.size	bn_sqr_comba4,(.-bn_sqr_comba4)
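
/*
 * Reference sketch (stripped by cpp, names hypothetical): the
 * sqr_add_c2() steps above add each cross product a[i]*a[j], i != j,
 * twice (hence the doubled addcc/bcs pairs), while sqr_add_c() adds
 * the square a[i]*a[i] once. In portable C one column step is roughly:
 */
#if 0
/* add t to the c2:c1 accumulator once, counting 64-bit overflow in c3 */
static void sqr_add_c_ref(unsigned long long t,
    unsigned long long *acc, unsigned int *c3)
{
	*acc += t;
	if (*acc < t)
		(*c3)++;
}

/* cross products enter each column twice: a[i]*a[j] + a[j]*a[i] */
static void sqr_add_c2_ref(unsigned long long t,
    unsigned long long *acc, unsigned int *c3)
{
	sqr_add_c_ref(t, acc, c3);
	sqr_add_c_ref(t, acc, c3);
}
#endif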

.align	32