diff --git a/src/lib/libcrypto/bn/asm/sparcv8plus.S b/src/lib/libcrypto/bn/asm/sparcv8plus.S
new file mode 100644
index 0000000000..8c56e2e7e7
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/sparcv8plus.S
@@ -0,0 +1,1547 @@
.ident	"sparcv8plus.s, Version 1.4"
.ident	"SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"

/*
 * ====================================================================
 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 * ====================================================================
 */

/*
 * This is my modest contribution to the OpenSSL project (see
 * http://www.openssl.org/ for more information about it) and is
 * a drop-in UltraSPARC ISA replacement for the crypto/bn/bn_asm.c
 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
 *
 * Questions-n-answers.
 *
 * Q. How to compile?
 * A. With SC4.x/SC5.x:
 *
 *	cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *
 *    and with gcc:
 *
 *	gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *
 *    or if the above fails (it does if you have gas installed):
 *
 *	gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
 *
 *    Quick-n-dirty way to fuse the module into the library.
 *    Provided that the library is already configured and built
 *    (in the 0.9.2 case with the no-asm option):
 *
 *	# cd crypto/bn
 *	# cp /some/place/bn_asm.sparc.v8plus.S .
 *	# cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
 *	# make
 *	# cd ../..
 *	# make; make test
 *
 *    Quick-n-dirty way to get rid of it:
 *
 *	# cd crypto/bn
 *	# touch bn_asm.c
 *	# make
 *	# cd ../..
 *	# make; make test
 *
 * Q. V8plus architecture? What kind of beast is that?
 * A. Well, it's rather a programming model than an architecture...
 *    It's actually v9-compliant, i.e. *any* UltraSPARC CPU, under
 *    special conditions, namely when the kernel doesn't preserve the
 *    upper 32 bits of otherwise 64-bit registers during a context
 *    switch.
 *
 * Q. Why just UltraSPARC? What about SuperSPARC?
 * A. The original release did target UltraSPARC only. Now a
 *    SuperSPARC version is provided alongside. Both versions share
 *    the bn_*comba[48] implementations (see the comment later in the
 *    code for an explanation). But what's so special about this
 *    UltraSPARC implementation? Why didn't I let the compiler do the
 *    job? The trouble is that most of the available compilers (well,
 *    SC5.0 is the only exception) don't attempt to take advantage of
 *    UltraSPARC's 64-bitness under 32-bit kernels even though it's
 *    perfectly possible (see the next question).
 *
 * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
 *    doesn't work?
 * A. You can't address *all* registers as 64-bit wide:-( The catch is
 *    that you actually may rely upon %o0-%o5 and %g1-%g4 being fully
 *    preserved if you're in a leaf function, i.e. one that never
 *    calls any other function. All functions in this module are leaf
 *    and 10 registers is a handful. As a matter of fact, the
 *    non-"comba" routines don't require even that much, and I could
 *    afford not to allocate a stack frame of their own for 'em:-)
 *
 * Q. What about 64-bit kernels?
 * A. What about 'em? Just kidding:-) A pure 64-bit version is
 *    currently under evaluation and development...
 *
 * Q. What about shared libraries?
 * A. What about 'em? Kidding again:-) The code does *not* contain
 *    any position dependencies and it's safe to include it in a
 *    shared library as is.
 *
 * Q. How much faster does it go?
 * A. Do you have a good benchmark? In either case below is what I
 *    experience with the crypto/bn/expspeed.c test program:
 *
 *	v8plus module on U10/300MHz against bn_asm.c compiled with:
 *
 *	cc-5.0 -xarch=v8plus -xO5 -xdepend	+7-12%
 *	cc-4.2 -xarch=v8plus -xO5 -xdepend	+25-35%
 *	egcs-1.1.2 -mcpu=ultrasparc -O3		+35-45%
 *
 *	v8 module on SS10/60MHz against bn_asm.c compiled with:
 *
 *	cc-5.0 -xarch=v8 -xO5 -xdepend		+7-10%
 *	cc-4.2 -xarch=v8 -xO5 -xdepend		+10%
 *	egcs-1.1.2 -mv8 -O3			+35-45%
 *
 *    As you can see it's damn hard to beat the new Sun C compiler,
 *    and it's first and foremost GNU C users who will appreciate
 *    this assembler implementation:-)
 */

/*
 * Revision history.
 *
 * 1.0	- initial release;
 * 1.1	- new loop unrolling model(*);
 *	- some more fine tuning;
 * 1.2	- made gas friendly;
 *	- updates to documentation concerning v9;
 *	- new performance comparison matrix;
 * 1.3	- fixed problem with /usr/ccs/lib/cpp;
 * 1.4	- native V9 bn_*_comba[48] implementation (15% more efficient)
 *	  resulting in slight overall performance kick;
 *	- some retunes;
 *	- support for GNU as added;
 *
 * (*) Originally the unrolled loop looked like this:
 *	for (;;) {
 *		op(p+0); if (--n==0) break;
 *		op(p+1); if (--n==0) break;
 *		op(p+2); if (--n==0) break;
 *		op(p+3); if (--n==0) break;
 *		p+=4;
 *	}
 *     I unroll according to the following:
 *	while (n&~3) {
 *		op(p+0); op(p+1); op(p+2); op(p+3);
 *		p+=4; n-=4;
 *	}
 *	if (n) {
 *		op(p+0); if (--n==0) return;
 *		op(p+1); if (--n==0) return;
 *		op(p+2); return;
 *	}
 */

/*
 * GNU assembler can't stand stuw:-(
 */
#define stuw st

.section	".text",#alloc,#execinstr
.file		"bn_asm.sparc.v8plus.S"

.align	32

.global bn_mul_add_words
/*
 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 */
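/*
 * For reference, a minimal C sketch of the contract implemented
 * below (32-bit BN_ULONG, word-by-word multiply-accumulate with a
 * carry word); it mirrors the bn_asm.c semantics, not the actual
 * instruction schedule:
 *
 *	BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap,
 *	    int num, BN_ULONG w)
 *	{
 *		unsigned long long acc, carry = 0;
 *
 *		while (num-- > 0) {
 *			acc = (unsigned long long)*(ap++) * w + *rp + carry;
 *			*(rp++) = (BN_ULONG)acc;
 *			carry = acc >> 32;
 *		}
 *		return (BN_ULONG)carry;
 *	}
 */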
bn_mul_add_words:
	sra	%o2,%g0,%o2	! signx %o2
	brgz,a	%o2,.L_bn_mul_add_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0
	nop
	nop
	nop

.L_bn_mul_add_words_proceed:
	srl	%o3,%g0,%o3	! clruw %o3
	andcc	%o2,-4,%g0
	bz,pn	%icc,.L_bn_mul_add_words_tail
	clr	%o5

.L_bn_mul_add_words_loop:	! wow! 32 aligned!
	lduw	[%o0],%g1
	lduw	[%o1+4],%g3
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	nop
	add	%o4,%g2,%o4
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5

	lduw	[%o0+4],%g1
	lduw	[%o1+8],%g2
	mulx	%o3,%g3,%g3
	add	%g1,%o5,%o4
	dec	4,%o2
	add	%o4,%g3,%o4
	stuw	%o4,[%o0+4]
	srlx	%o4,32,%o5

	lduw	[%o0+8],%g1
	lduw	[%o1+12],%g3
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	inc	16,%o1
	add	%o4,%g2,%o4
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5

	lduw	[%o0+12],%g1
	mulx	%o3,%g3,%g3
	add	%g1,%o5,%o4
	inc	16,%o0
	add	%o4,%g3,%o4
	andcc	%o2,-4,%g0
	stuw	%o4,[%o0-4]
	srlx	%o4,32,%o5
	bnz,a,pt	%icc,.L_bn_mul_add_words_loop
	lduw	[%o1],%g2

	brnz,a,pn	%o2,.L_bn_mul_add_words_tail
	lduw	[%o1],%g2
.L_bn_mul_add_words_return:
	retl
	mov	%o5,%o0

.L_bn_mul_add_words_tail:
	lduw	[%o0],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	dec	%o2
	add	%o4,%g2,%o4
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_add_words_return
	stuw	%o4,[%o0]

	lduw	[%o1+4],%g2
	lduw	[%o0+4],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	dec	%o2
	add	%o4,%g2,%o4
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_add_words_return
	stuw	%o4,[%o0+4]

	lduw	[%o1+8],%g2
	lduw	[%o0+8],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	add	%o4,%g2,%o4
	stuw	%o4,[%o0+8]
	retl
	srlx	%o4,32,%o0

.type	bn_mul_add_words,#function
.size	bn_mul_add_words,(.-bn_mul_add_words)

.align	32

.global bn_mul_words
/*
 * BN_ULONG bn_mul_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 */
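/*
 * Reference semantics in C, analogous to bn_mul_add_words above but
 * without the accumulate (again a sketch, assuming 32-bit BN_ULONG):
 *
 *	BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num,
 *	    BN_ULONG w)
 *	{
 *		unsigned long long acc, carry = 0;
 *
 *		while (num-- > 0) {
 *			acc = (unsigned long long)*(ap++) * w + carry;
 *			*(rp++) = (BN_ULONG)acc;
 *			carry = acc >> 32;
 *		}
 *		return (BN_ULONG)carry;
 *	}
 */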
bn_mul_words:
	sra	%o2,%g0,%o2	! signx %o2
	brgz,a	%o2,.L_bn_mul_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0
	nop
	nop
	nop

.L_bn_mul_words_proceed:
	srl	%o3,%g0,%o3	! clruw %o3
	andcc	%o2,-4,%g0
	bz,pn	%icc,.L_bn_mul_words_tail
	clr	%o5

.L_bn_mul_words_loop:		! wow! 32 aligned!
	lduw	[%o1+4],%g3
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	nop
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5

	lduw	[%o1+8],%g2
	mulx	%o3,%g3,%g3
	add	%g3,%o5,%o4
	dec	4,%o2
	stuw	%o4,[%o0+4]
	srlx	%o4,32,%o5

	lduw	[%o1+12],%g3
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	inc	16,%o1
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5

	mulx	%o3,%g3,%g3
	add	%g3,%o5,%o4
	inc	16,%o0
	stuw	%o4,[%o0-4]
	srlx	%o4,32,%o5
	andcc	%o2,-4,%g0
	bnz,a,pt	%icc,.L_bn_mul_words_loop
	lduw	[%o1],%g2
	nop
	nop

	brnz,a,pn	%o2,.L_bn_mul_words_tail
	lduw	[%o1],%g2
.L_bn_mul_words_return:
	retl
	mov	%o5,%o0

.L_bn_mul_words_tail:
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	dec	%o2
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_words_return
	stuw	%o4,[%o0]

	lduw	[%o1+4],%g2
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	dec	%o2
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_words_return
	stuw	%o4,[%o0+4]

	lduw	[%o1+8],%g2
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	stuw	%o4,[%o0+8]
	retl
	srlx	%o4,32,%o0

.type	bn_mul_words,#function
.size	bn_mul_words,(.-bn_mul_words)

.align	32
.global	bn_sqr_words
/*
 * void bn_sqr_words(r,a,n)
 * BN_ULONG *r,*a;
 * int n;
 */
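/*
 * Reference semantics in C (sketch, 32-bit BN_ULONG): each input
 * word is squared into two output words, so r must hold 2*n words:
 *
 *	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
 *	{
 *		unsigned long long acc;
 *		int i;
 *
 *		for (i = 0; i < n; i++) {
 *			acc = (unsigned long long)a[i] * a[i];
 *			r[2*i] = (BN_ULONG)acc;
 *			r[2*i+1] = (BN_ULONG)(acc >> 32);
 *		}
 *	}
 */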
bn_sqr_words:
	sra	%o2,%g0,%o2	! signx %o2
	brgz,a	%o2,.L_bn_sqr_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0
	nop
	nop
	nop

.L_bn_sqr_words_proceed:
	andcc	%o2,-4,%g0
	nop
	bz,pn	%icc,.L_bn_sqr_words_tail
	nop

.L_bn_sqr_words_loop:		! wow! 32 aligned!
	lduw	[%o1+4],%g3
	mulx	%g2,%g2,%o4
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5
	stuw	%o5,[%o0+4]
	nop

	lduw	[%o1+8],%g2
	mulx	%g3,%g3,%o4
	dec	4,%o2
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5
	stuw	%o5,[%o0+12]

	lduw	[%o1+12],%g3
	mulx	%g2,%g2,%o4
	srlx	%o4,32,%o5
	stuw	%o4,[%o0+16]
	inc	16,%o1
	stuw	%o5,[%o0+20]

	mulx	%g3,%g3,%o4
	inc	32,%o0
	stuw	%o4,[%o0-8]
	srlx	%o4,32,%o5
	andcc	%o2,-4,%g2
	stuw	%o5,[%o0-4]
	bnz,a,pt	%icc,.L_bn_sqr_words_loop
	lduw	[%o1],%g2
	nop

	brnz,a,pn	%o2,.L_bn_sqr_words_tail
	lduw	[%o1],%g2
.L_bn_sqr_words_return:
	retl
	clr	%o0

.L_bn_sqr_words_tail:
	mulx	%g2,%g2,%o4
	dec	%o2
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_sqr_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+4],%g2
	mulx	%g2,%g2,%o4
	dec	%o2
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_sqr_words_return
	stuw	%o5,[%o0+12]

	lduw	[%o1+8],%g2
	mulx	%g2,%g2,%o4
	srlx	%o4,32,%o5
	stuw	%o4,[%o0+16]
	stuw	%o5,[%o0+20]
	retl
	clr	%o0

.type	bn_sqr_words,#function
.size	bn_sqr_words,(.-bn_sqr_words)

.align	32
.global bn_div_words
/*
 * BN_ULONG bn_div_words(h,l,d)
 * BN_ULONG h,l,d;
 */
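/*
 * The whole routine is a single 64-by-32-bit division; in C terms
 * (sketch, 32-bit BN_ULONG, the caller is assumed to keep h < d so
 * the quotient fits in one word):
 *
 *	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 *	{
 *		return (BN_ULONG)((((unsigned long long)h << 32) | l) / d);
 *	}
 *
 * udivx performs exactly this concatenate-and-divide in one
 * instruction, which is why no loop is needed.
 */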
bn_div_words:
	sllx	%o0,32,%o0
	or	%o0,%o1,%o0
	udivx	%o0,%o2,%o0
	retl
	srl	%o0,%g0,%o0	! clruw %o0

.type	bn_div_words,#function
.size	bn_div_words,(.-bn_div_words)

.align	32

.global bn_add_words
/*
 * BN_ULONG bn_add_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 */
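/*
 * Reference semantics in C (sketch, 32-bit BN_ULONG): word-wise
 * addition, returning the final carry:
 *
 *	BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap,
 *	    BN_ULONG *bp, int n)
 *	{
 *		unsigned long long acc;
 *		BN_ULONG carry = 0;
 *		int i;
 *
 *		for (i = 0; i < n; i++) {
 *			acc = (unsigned long long)ap[i] + bp[i] + carry;
 *			rp[i] = (BN_ULONG)acc;
 *			carry = (BN_ULONG)(acc >> 32);
 *		}
 *		return carry;
 *	}
 *
 * The assembly keeps the running carry in the condition codes
 * (addccc) instead of a register and extracts it at the end with
 * 'movcs %icc,1,%o0'.
 */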
bn_add_words:
	sra	%o3,%g0,%o3	! signx %o3
	brgz,a	%o3,.L_bn_add_words_proceed
	lduw	[%o1],%o4
	retl
	clr	%o0

.L_bn_add_words_proceed:
	andcc	%o3,-4,%g0
	bz,pn	%icc,.L_bn_add_words_tail
	addcc	%g0,0,%g0	! clear carry flag

.L_bn_add_words_loop:		! wow! 32 aligned!
	dec	4,%o3
	lduw	[%o2],%o5
	lduw	[%o1+4],%g1
	lduw	[%o2+4],%g2
	lduw	[%o1+8],%g3
	lduw	[%o2+8],%g4
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0]

	lduw	[%o1+12],%o4
	lduw	[%o2+12],%o5
	inc	16,%o1
	addccc	%g1,%g2,%g1
	stuw	%g1,[%o0+4]

	inc	16,%o2
	addccc	%g3,%g4,%g3
	stuw	%g3,[%o0+8]

	inc	16,%o0
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0-4]
	and	%o3,-4,%g1
	brnz,a,pt	%g1,.L_bn_add_words_loop
	lduw	[%o1],%o4

	brnz,a,pn	%o3,.L_bn_add_words_tail
	lduw	[%o1],%o4
.L_bn_add_words_return:
	clr	%o0
	retl
	movcs	%icc,1,%o0
	nop

.L_bn_add_words_tail:
	lduw	[%o2],%o5
	dec	%o3
	addccc	%o5,%o4,%o5
	brz,pt	%o3,.L_bn_add_words_return
	stuw	%o5,[%o0]

	lduw	[%o1+4],%o4
	lduw	[%o2+4],%o5
	dec	%o3
	addccc	%o5,%o4,%o5
	brz,pt	%o3,.L_bn_add_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+8],%o4
	lduw	[%o2+8],%o5
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0+8]
	clr	%o0
	retl
	movcs	%icc,1,%o0

.type	bn_add_words,#function
.size	bn_add_words,(.-bn_add_words)

.global bn_sub_words
/*
 * BN_ULONG bn_sub_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 */
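/*
 * Reference semantics in C (sketch, 32-bit BN_ULONG): word-wise
 * subtraction, returning the final borrow (0 or 1):
 *
 *	BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap,
 *	    BN_ULONG *bp, int n)
 *	{
 *		unsigned long long acc;
 *		BN_ULONG borrow = 0;
 *		int i;
 *
 *		for (i = 0; i < n; i++) {
 *			acc = (unsigned long long)ap[i] - bp[i] - borrow;
 *			rp[i] = (BN_ULONG)acc;
 *			borrow = (BN_ULONG)(acc >> 32) & 1;
 *		}
 *		return borrow;
 *	}
 */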
bn_sub_words:
	sra	%o3,%g0,%o3	! signx %o3
	brgz,a	%o3,.L_bn_sub_words_proceed
	lduw	[%o1],%o4
	retl
	clr	%o0

.L_bn_sub_words_proceed:
	andcc	%o3,-4,%g0
	bz,pn	%icc,.L_bn_sub_words_tail
	addcc	%g0,0,%g0	! clear carry flag

.L_bn_sub_words_loop:		! wow! 32 aligned!
	dec	4,%o3
	lduw	[%o2],%o5
	lduw	[%o1+4],%g1
	lduw	[%o2+4],%g2
	lduw	[%o1+8],%g3
	lduw	[%o2+8],%g4
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0]

	lduw	[%o1+12],%o4
	lduw	[%o2+12],%o5
	inc	16,%o1
	subccc	%g1,%g2,%g2
	stuw	%g2,[%o0+4]

	inc	16,%o2
	subccc	%g3,%g4,%g4
	stuw	%g4,[%o0+8]

	inc	16,%o0
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0-4]
	and	%o3,-4,%g1
	brnz,a,pt	%g1,.L_bn_sub_words_loop
	lduw	[%o1],%o4

	brnz,a,pn	%o3,.L_bn_sub_words_tail
	lduw	[%o1],%o4
.L_bn_sub_words_return:
	clr	%o0
	retl
	movcs	%icc,1,%o0
	nop

.L_bn_sub_words_tail:		! wow! 32 aligned!
	lduw	[%o2],%o5
	dec	%o3
	subccc	%o4,%o5,%o5
	brz,pt	%o3,.L_bn_sub_words_return
	stuw	%o5,[%o0]

	lduw	[%o1+4],%o4
	lduw	[%o2+4],%o5
	dec	%o3
	subccc	%o4,%o5,%o5
	brz,pt	%o3,.L_bn_sub_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+8],%o4
	lduw	[%o2+8],%o5
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0+8]
	clr	%o0
	retl
	movcs	%icc,1,%o0

.type	bn_sub_words,#function
.size	bn_sub_words,(.-bn_sub_words)

/*
 * The code below depends on the fact that the upper parts of the
 * %l0-%l7 and %i0-%i7 registers are zeroed by the kernel after a
 * context switch. In previous versions this comment stated that "the
 * trouble is that it's not feasible to implement the mumbo-jumbo in
 * less V9 instructions:-(" which apparently isn't true thanks to the
 * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
 * results not from the shorter code, but from the elimination of the
 * multicycle, non-pairable 'rd %y,%rd' instructions.
 *
 * Andy.
 */
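/*
 * The pair reads as follows: t_2 is preloaded with 1<<32 on entry,
 * and every
 *
 *	addcc	c_12,t_1,c_12
 *	bcs,a	%xcc,.+8
 *	add	c_3,t_2,c_3
 *
 * sequence below is equivalent to this C fragment (the annulled
 * delay slot executes only when the 64-bit add carries out):
 *
 *	c_12 += t_1;
 *	if (c_12 < t_1)
 *		c_3 += (unsigned long long)1 << 32;
 *
 * c_3 thus collects the column overflows in its upper half, to be
 * folded back in when the column is flushed.
 */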

#define FRAME_SIZE	-96
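/*
 * -96 bytes is the minimal 32-bit SPARC stack frame: 64 bytes for
 * the register window save area plus the reserved struct-return and
 * outgoing-argument slots. The comba routines need a register window
 * of their own (they use %l and %i registers) but no local storage.
 */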

/*
 * Here is the register usage map for *all* routines below.
 */
#define t_1	%o0
#define	t_2	%o1
#define c_12	%o2
#define c_3	%o3

#define ap(I)	[%i1+4*I]
#define bp(I)	[%i2+4*I]
#define rp(I)	[%i0+4*I]

#define	a_0	%l0
#define	a_1	%l1
#define	a_2	%l2
#define	a_3	%l3
#define	a_4	%l4
#define	a_5	%l5
#define	a_6	%l6
#define	a_7	%l7

#define	b_0	%i3
#define	b_1	%i4
#define	b_2	%i5
#define	b_3	%o4
#define	b_4	%o5
#define	b_5	%o7
#define	b_6	%g1
#define	b_7	%g4

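/*
 * The !mul_add_c(a,b,c0,c1,c2) annotations below name the comba
 * primitive from the C reference code: multiply two words and add
 * the 64-bit product into a three-word column accumulator, roughly
 *
 *	t = (unsigned long long)a * b;
 *	c0 += (BN_ULONG)t;		with carry into c1
 *	c1 += (BN_ULONG)(t >> 32);	with carry into c2
 *
 * Here c1 and c0 travel together as the 64-bit c_12, the carries are
 * collected in the upper half of c_3, and a column is flushed with
 * 'srlx t_1,32,c_12; or c_12,c_3,c_12' after its low word is stored.
 */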
.align	32
.global bn_mul_comba8
/*
 * void bn_mul_comba8(r,a,b)
 * BN_ULONG *r,*a,*b;
 */
bn_mul_comba8:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	bp(0),b_0	!=
	lduw	bp(1),b_1
	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!=!r[0]=c1;

	lduw	ap(1),a_1
	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(2),a_2
	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(2),b_2	!=
	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(3),b_3
	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_2,t_1	!=!mul_add_c(a[1],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(4),a_4
	mulx	a_3,b_0,t_1	!=!mul_add_c(a[3],b[0],c1,c2,c3);!=
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_4,b_0,t_1	!mul_add_c(a[4],b[0],c2,c3,c1);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_1,t_1	!=!mul_add_c(a[3],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_2,t_1	!=!mul_add_c(a[2],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(4),b_4	!=
	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(5),b_5
	mulx	a_0,b_4,t_1	!mul_add_c(a[0],b[4],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_5,t_1	!mul_add_c(a[0],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_4,t_1	!mul_add_c(a[1],b[4],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(5),a_5
	mulx	a_4,b_1,t_1	!mul_add_c(a[4],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(6),a_6
	mulx	a_5,b_0,t_1	!=!mul_add_c(a[5],b[0],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_6,b_0,t_1	!mul_add_c(a[6],b[0],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,b_1,t_1	!=!mul_add_c(a[5],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,b_2,t_1	!=!mul_add_c(a[4],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_3,t_1	!=!mul_add_c(a[3],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_4,t_1	!=!mul_add_c(a[2],b[4],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(6),b_6	!=
	mulx	a_1,b_5,t_1	!mul_add_c(a[1],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(7),b_7
	mulx	a_0,b_6,t_1	!mul_add_c(a[0],b[6],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_7,t_1	!mul_add_c(a[0],b[7],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_6,t_1	!mul_add_c(a[1],b[6],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_2,b_5,t_1	!mul_add_c(a[2],b[5],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_4,t_1	!mul_add_c(a[3],b[4],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_3,t_1	!mul_add_c(a[4],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_2,t_1	!mul_add_c(a[5],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(7),a_7
	mulx	a_6,b_1,t_1	!=!mul_add_c(a[6],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_7,b_0,t_1	!=!mul_add_c(a[7],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(7)	!r[7]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,b_1,t_1	!=!mul_add_c(a[7],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_6,b_2,t_1	!mul_add_c(a[6],b[2],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_5,b_3,t_1	!mul_add_c(a[5],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_4,b_4,t_1	!mul_add_c(a[4],b[4],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_3,b_5,t_1	!mul_add_c(a[3],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_2,b_6,t_1	!mul_add_c(a[2],b[6],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_1,b_7,t_1	!mul_add_c(a[1],b[7],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	srlx	t_1,32,c_12
	stuw	t_1,rp(8)	!r[8]=c3;
	or	c_12,c_3,c_12

	mulx	a_2,b_7,t_1	!=!mul_add_c(a[2],b[7],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_3,b_6,t_1	!mul_add_c(a[3],b[6],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_5,t_1	!mul_add_c(a[4],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_4,t_1	!mul_add_c(a[5],b[4],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_3,t_1	!mul_add_c(a[6],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_2,t_1	!mul_add_c(a[7],b[2],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(9)	!r[9]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_3,t_1	!mul_add_c(a[7],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_4,t_1	!mul_add_c(a[6],b[4],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_5,t_1	!mul_add_c(a[5],b[5],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_6,t_1	!mul_add_c(a[4],b[6],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_7,t_1	!mul_add_c(a[3],b[7],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(10)	!r[10]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_4,b_7,t_1	!mul_add_c(a[4],b[7],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_6,t_1	!mul_add_c(a[5],b[6],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_5,t_1	!mul_add_c(a[6],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_4,t_1	!mul_add_c(a[7],b[4],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(11)	!r[11]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_5,t_1	!mul_add_c(a[7],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_6,t_1	!mul_add_c(a[6],b[6],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_7,t_1	!mul_add_c(a[5],b[7],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(12)	!r[12]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_6,b_7,t_1	!mul_add_c(a[6],b[7],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_6,t_1	!mul_add_c(a[7],b[6],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(13)	!r[13]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_7,t_1	!mul_add_c(a[7],b[7],c3,c1,c2);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(14)	!r[14]=c3;
	stuw	c_12,rp(15)	!r[15]=c1;

	ret
	restore	%g0,%g0,%o0	!=

.type	bn_mul_comba8,#function
.size	bn_mul_comba8,(.-bn_mul_comba8)

.align	32

.global bn_mul_comba4
/*
 * void bn_mul_comba4(r,a,b)
 * BN_ULONG *r,*a,*b;
 */
bn_mul_comba4:
	save	%sp,FRAME_SIZE,%sp
	lduw	ap(0),a_0
	mov	1,t_2
	lduw	bp(0),b_0
	sllx	t_2,32,t_2	!=
	lduw	bp(1),b_1
	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!=!r[0]=c1;

	lduw	ap(1),a_1
	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(2),a_2
	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(2),b_2	!=
	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(3),b_3
	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_2,t_1	!mul_add_c(a[1],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_0,t_1	!mul_add_c(a[3],b[0],c1,c2,c3);!=
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!=!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_3,b_1,t_1	!mul_add_c(a[3],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_2,t_1	!mul_add_c(a[2],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!=!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!=!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_3,b_3,t_1	!mul_add_c(a[3],b[3],c1,c2,c3);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(6)	!r[6]=c1;
	stuw	c_12,rp(7)	!r[7]=c2;

	ret
	restore	%g0,%g0,%o0

.type	bn_mul_comba4,#function
.size	bn_mul_comba4,(.-bn_mul_comba4)

.align	32

.global bn_sqr_comba8
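/*
 * void bn_sqr_comba8(r,a)
 * BN_ULONG *r,*a;
 *
 * The !sqr_add_c2(a,i,j,...) annotations denote the off-diagonal
 * squaring step: the product a[i]*a[j] enters the column twice (once
 * for each of the two symmetric partial products), which is why each
 * such block performs the addcc/bcs,a carry sequence twice on the
 * same t_1. !sqr_add_c(a,i,...) is the diagonal a[i]*a[i] term and
 * is added only once.
 */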
bn_sqr_comba8:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	ap(1),a_1
	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!r[0]=c1;

	lduw	ap(2),a_2
	mulx	a_0,a_1,t_1	!=!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12

	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(4),a_4
	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_4,a_0,t_1	!sqr_add_c2(a,4,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(5),a_5
	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_0,a_5,t_1	!sqr_add_c2(a,5,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_4,t_1	!sqr_add_c2(a,4,1,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(6),a_6
	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_6,a_0,t_1	!sqr_add_c2(a,6,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_1,t_1	!sqr_add_c2(a,5,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_2,t_1	!sqr_add_c2(a,4,2,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(7),a_7
	mulx	a_3,a_3,t_1	!=!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	or	c_12,c_3,c_12

	mulx	a_0,a_7,t_1	!sqr_add_c2(a,7,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_6,t_1	!sqr_add_c2(a,6,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,a_5,t_1	!sqr_add_c2(a,5,2,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_4,t_1	!sqr_add_c2(a,4,3,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(7)	!r[7]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,a_1,t_1	!sqr_add_c2(a,7,1,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_2,t_1	!sqr_add_c2(a,6,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_3,t_1	!sqr_add_c2(a,5,3,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_4,t_1	!sqr_add_c(a,4,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(8)	!r[8]=c3;
	or	c_12,c_3,c_12

	mulx	a_2,a_7,t_1	!sqr_add_c2(a,7,2,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_6,t_1	!sqr_add_c2(a,6,3,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_5,t_1	!sqr_add_c2(a,5,4,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(9)	!r[9]=c1;
	or	c_12,c_3,c_12

	mulx	a_7,a_3,t_1	!sqr_add_c2(a,7,3,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_4,t_1	!sqr_add_c2(a,6,4,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_5,t_1	!sqr_add_c(a,5,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(10)	!r[10]=c2;
	or	c_12,c_3,c_12

	mulx	a_4,a_7,t_1	!sqr_add_c2(a,7,4,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_6,t_1	!sqr_add_c2(a,6,5,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(11)	!r[11]=c3;
	or	c_12,c_3,c_12

	mulx	a_7,a_5,t_1	!sqr_add_c2(a,7,5,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_6,t_1	!sqr_add_c(a,6,c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(12)	!r[12]=c1;
	or	c_12,c_3,c_12

	mulx	a_6,a_7,t_1	!sqr_add_c2(a,7,6,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(13)	!r[13]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,a_7,t_1	!sqr_add_c(a,7,c3,c1,c2);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12
	stuw	t_1,rp(14)	!r[14]=c3;
	stuw	c_12,rp(15)	!r[15]=c1;

	ret
	restore	%g0,%g0,%o0

.type	bn_sqr_comba8,#function
.size	bn_sqr_comba8,(.-bn_sqr_comba8)

.align	32

.global bn_sqr_comba4
/*
 * void bn_sqr_comba4(r,a)
 * BN_ULONG *r,*a;
 */
bn_sqr_comba4:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	ap(1),a_1
	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!r[0]=c1;

	lduw	ap(2),a_2
	mulx	a_0,a_1,t_1	!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12

	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_3,a_3,t_1	!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	stuw	c_12,rp(7)	!r[7]=c2;

	ret
	restore	%g0,%g0,%o0

.type	bn_sqr_comba4,#function
.size	bn_sqr_comba4,(.-bn_sqr_comba4)

.align	32