Diffstat (limited to 'src/lib/libcrypto/bn/asm/sparcv8plus.S')
-rw-r--r--  src/lib/libcrypto/bn/asm/sparcv8plus.S  1547
1 file changed, 1547 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/asm/sparcv8plus.S b/src/lib/libcrypto/bn/asm/sparcv8plus.S
new file mode 100644
index 0000000000..8c56e2e7e7
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/sparcv8plus.S
@@ -0,0 +1,1547 @@
1.ident "sparcv8plus.s, Version 1.4"
2.ident "SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
3
4/*
5 * ====================================================================
6 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
7 * project.
8 *
9 * Rights for redistribution and usage in source and binary forms are
10 * granted according to the OpenSSL license. Warranty of any kind is
11 * disclaimed.
12 * ====================================================================
13 */
14
15/*
16 * This is my modest contribution to the OpenSSL project (see
17 * http://www.openssl.org/ for more information about it) and is
18 * a drop-in UltraSPARC ISA replacement for the crypto/bn/bn_asm.c
19 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
20 *
21 * Questions-n-answers.
22 *
23 * Q. How to compile?
24 * A. With SC4.x/SC5.x:
25 *
26 * cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
27 *
28 * and with gcc:
29 *
30 * gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
31 *
32 * or if the above fails (it does if you have gas installed):
33 *
34 * gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
35 *
36 * Quick-n-dirty way to fuse the module into the library.
37 * Provided that the library is already configured and built
38 * (in 0.9.2 case with no-asm option):
39 *
40 * # cd crypto/bn
41 * # cp /some/place/bn_asm.sparc.v8plus.S .
42 * # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
43 * # make
44 * # cd ../..
45 * # make; make test
46 *
47 * Quick-n-dirty way to get rid of it:
48 *
49 * # cd crypto/bn
50 * # touch bn_asm.c
51 * # make
52 * # cd ../..
53 * # make; make test
54 *
55 * Q. V8plus architecture? What kind of beast is that?
56 * A. Well, it's a programming model rather than an architecture...
57 * It's actually a v9-compliant CPU, i.e. *any* UltraSPARC, run under
58 * special conditions, namely when the kernel doesn't preserve the
59 * upper 32 bits of the otherwise 64-bit registers during a context switch.
60 *
61 * Q. Why just UltraSPARC? What about SuperSPARC?
62 * A. The original release targeted UltraSPARC only. Now a SuperSPARC
63 * version is provided alongside it. Both versions share the bn_*comba[48]
64 * implementations (see the comment later in the code for an explanation).
65 * But what's so special about this UltraSPARC implementation?
66 * Why didn't I let the compiler do the job? The trouble is that most
67 * of the available compilers (well, SC5.0 is the only exception) don't
68 * attempt to take advantage of UltraSPARC's 64-bitness under
69 * 32-bit kernels even though it's perfectly possible (see next
70 * question).
71 *
72 * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
73 * doesn't work?
74 * A. You can't address *all* registers as 64-bit wide:-( The catch is
75 * that you may actually rely upon %o0-%o5 and %g1-%g4 being fully
76 * preserved if you're in a leaf function, i.e. one that never calls
77 * any other function. All functions in this module are leaf and
78 * 10 registers is a handful. As a matter of fact the non-"comba"
79 * routines don't require even that much, and I could even afford
80 * not to allocate a stack frame of their own for 'em:-)
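 *
 *    In C terms, the step this register model buys looks roughly like
 *    this (a sketch, assuming a 32-bit BN_ULONG and a 64-bit
 *    unsigned long long):
 *
 *	unsigned long long t = (unsigned long long)w * a + r + carry;
 *	r = (BN_ULONG)t;		! low 32 bits, a single stuw
 *	carry = (BN_ULONG)(t >> 32);	! high 32 bits, a single srlx
 *
 *    The whole 32x32+32+32 -> 64-bit step stays in one %o/%g register,
 *    which is safe precisely because these routines are leaf.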
81 *
82 * Q. What about 64-bit kernels?
83 * A. What about 'em? Just kidding:-) A pure 64-bit version is currently
84 * under evaluation and development...
85 *
86 * Q. What about shared libraries?
87 * A. What about 'em? Kidding again:-) The code does *not* contain any
88 * position dependencies and it's safe to include it in a shared
89 * library as is.
90 *
91 * Q. How much faster does it go?
92 * A. Do you have a good benchmark? In either case, below is what I
93 * experience with the crypto/bn/expspeed.c test program:
94 *
95 * v8plus module on U10/300MHz against bn_asm.c compiled with:
96 *
97 * cc-5.0 -xarch=v8plus -xO5 -xdepend +7-12%
98 * cc-4.2 -xarch=v8plus -xO5 -xdepend +25-35%
99 * egcs-1.1.2 -mcpu=ultrasparc -O3 +35-45%
100 *
101 * v8 module on SS10/60MHz against bn_asm.c compiled with:
102 *
103 * cc-5.0 -xarch=v8 -xO5 -xdepend +7-10%
104 * cc-4.2 -xarch=v8 -xO5 -xdepend +10%
105 * egcs-1.1.2 -mv8 -O3 +35-45%
106 *
107 * As you can see it's damn hard to beat the new Sun C compiler,
108 * so it's GNU C users, first and foremost, who will appreciate this
109 * assembler implementation:-)
110 */
111
112/*
113 * Revision history.
114 *
115 * 1.0 - initial release;
116 * 1.1 - new loop unrolling model(*);
117 * - some more fine tuning;
118 * 1.2 - made gas friendly;
119 * - updates to documentation concerning v9;
120 * - new performance comparison matrix;
121 * 1.3 - fixed problem with /usr/ccs/lib/cpp;
122 * 1.4 - native V9 bn_*_comba[48] implementation (15% more efficient)
123 * resulting in slight overall performance kick;
124 * - some retunes;
125 * - support for GNU as added;
126 *
127 * (*) Originally unrolled loop looked like this:
128 * for (;;) {
129 * op(p+0); if (--n==0) break;
130 * op(p+1); if (--n==0) break;
131 * op(p+2); if (--n==0) break;
132 * op(p+3); if (--n==0) break;
133 * p+=4;
134 * }
135 * I unroll according to the following:
136 * while (n&~3) {
137 * op(p+0); op(p+1); op(p+2); op(p+3);
138 * p+=4; n-=4;
139 * }
140 * if (n) {
141 * op(p+0); if (--n==0) return;
142 * op(p+1); if (--n==0) return;
143 * op(p+2); return;
144 * }
145 */
146
147/*
148 * GNU assembler can't stand stuw:-(
149 */
150#define stuw st
151
152.section ".text",#alloc,#execinstr
153.file "bn_asm.sparc.v8plus.S"
154
155.align 32
156
157.global bn_mul_add_words
158/*
159 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
160 * BN_ULONG *rp,*ap;
161 * int num;
162 * BN_ULONG w;
163 */
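/*
 * Reference semantics, roughly, as a C sketch (assuming a 32-bit
 * BN_ULONG and a 64-bit unsigned long long):
 *
 *	BN_ULONG c = 0;
 *	while (num-- > 0) {
 *		unsigned long long t = (unsigned long long)w * *ap++ + *rp + c;
 *		*rp++ = (BN_ULONG)t;		! stuw of the low word
 *		c = (BN_ULONG)(t >> 32);	! srlx keeps the carry
 *	}
 *	return c;
 */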
164bn_mul_add_words:
165 sra %o2,%g0,%o2 ! signx %o2
166 brgz,a %o2,.L_bn_mul_add_words_proceed
167 lduw [%o1],%g2
168 retl
169 clr %o0
170 nop
171 nop
172 nop
173
174.L_bn_mul_add_words_proceed:
175 srl %o3,%g0,%o3 ! clruw %o3
176 andcc %o2,-4,%g0
177 bz,pn %icc,.L_bn_mul_add_words_tail
178 clr %o5
179
180.L_bn_mul_add_words_loop: ! wow! 32 aligned!
181 lduw [%o0],%g1
182 lduw [%o1+4],%g3
183 mulx %o3,%g2,%g2
184 add %g1,%o5,%o4
185 nop
186 add %o4,%g2,%o4
187 stuw %o4,[%o0]
188 srlx %o4,32,%o5
189
190 lduw [%o0+4],%g1
191 lduw [%o1+8],%g2
192 mulx %o3,%g3,%g3
193 add %g1,%o5,%o4
194 dec 4,%o2
195 add %o4,%g3,%o4
196 stuw %o4,[%o0+4]
197 srlx %o4,32,%o5
198
199 lduw [%o0+8],%g1
200 lduw [%o1+12],%g3
201 mulx %o3,%g2,%g2
202 add %g1,%o5,%o4
203 inc 16,%o1
204 add %o4,%g2,%o4
205 stuw %o4,[%o0+8]
206 srlx %o4,32,%o5
207
208 lduw [%o0+12],%g1
209 mulx %o3,%g3,%g3
210 add %g1,%o5,%o4
211 inc 16,%o0
212 add %o4,%g3,%o4
213 andcc %o2,-4,%g0
214 stuw %o4,[%o0-4]
215 srlx %o4,32,%o5
216 bnz,a,pt %icc,.L_bn_mul_add_words_loop
217 lduw [%o1],%g2
218
219 brnz,a,pn %o2,.L_bn_mul_add_words_tail
220 lduw [%o1],%g2
221.L_bn_mul_add_words_return:
222 retl
223 mov %o5,%o0
224
225.L_bn_mul_add_words_tail:
226 lduw [%o0],%g1
227 mulx %o3,%g2,%g2
228 add %g1,%o5,%o4
229 dec %o2
230 add %o4,%g2,%o4
231 srlx %o4,32,%o5
232 brz,pt %o2,.L_bn_mul_add_words_return
233 stuw %o4,[%o0]
234
235 lduw [%o1+4],%g2
236 lduw [%o0+4],%g1
237 mulx %o3,%g2,%g2
238 add %g1,%o5,%o4
239 dec %o2
240 add %o4,%g2,%o4
241 srlx %o4,32,%o5
242 brz,pt %o2,.L_bn_mul_add_words_return
243 stuw %o4,[%o0+4]
244
245 lduw [%o1+8],%g2
246 lduw [%o0+8],%g1
247 mulx %o3,%g2,%g2
248 add %g1,%o5,%o4
249 add %o4,%g2,%o4
250 stuw %o4,[%o0+8]
251 retl
252 srlx %o4,32,%o0
253
254.type bn_mul_add_words,#function
255.size bn_mul_add_words,(.-bn_mul_add_words)
256
257.align 32
258
259.global bn_mul_words
260/*
261 * BN_ULONG bn_mul_words(rp,ap,num,w)
262 * BN_ULONG *rp,*ap;
263 * int num;
264 * BN_ULONG w;
265 */
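/*
 * Reference semantics, roughly, as a C sketch (assuming a 32-bit
 * BN_ULONG):
 *
 *	BN_ULONG c = 0;
 *	while (num-- > 0) {
 *		unsigned long long t = (unsigned long long)w * *ap++ + c;
 *		*rp++ = (BN_ULONG)t;
 *		c = (BN_ULONG)(t >> 32);
 *	}
 *	return c;
 */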
266bn_mul_words:
267 sra %o2,%g0,%o2 ! signx %o2
268 brgz,a %o2,.L_bn_mul_words_proceeed
269 lduw [%o1],%g2
270 retl
271 clr %o0
272 nop
273 nop
274 nop
275
276.L_bn_mul_words_proceeed:
277 srl %o3,%g0,%o3 ! clruw %o3
278 andcc %o2,-4,%g0
279 bz,pn %icc,.L_bn_mul_words_tail
280 clr %o5
281
282.L_bn_mul_words_loop: ! wow! 32 aligned!
283 lduw [%o1+4],%g3
284 mulx %o3,%g2,%g2
285 add %g2,%o5,%o4
286 nop
287 stuw %o4,[%o0]
288 srlx %o4,32,%o5
289
290 lduw [%o1+8],%g2
291 mulx %o3,%g3,%g3
292 add %g3,%o5,%o4
293 dec 4,%o2
294 stuw %o4,[%o0+4]
295 srlx %o4,32,%o5
296
297 lduw [%o1+12],%g3
298 mulx %o3,%g2,%g2
299 add %g2,%o5,%o4
300 inc 16,%o1
301 stuw %o4,[%o0+8]
302 srlx %o4,32,%o5
303
304 mulx %o3,%g3,%g3
305 add %g3,%o5,%o4
306 inc 16,%o0
307 stuw %o4,[%o0-4]
308 srlx %o4,32,%o5
309 andcc %o2,-4,%g0
310 bnz,a,pt %icc,.L_bn_mul_words_loop
311 lduw [%o1],%g2
312 nop
313 nop
314
315 brnz,a,pn %o2,.L_bn_mul_words_tail
316 lduw [%o1],%g2
317.L_bn_mul_words_return:
318 retl
319 mov %o5,%o0
320
321.L_bn_mul_words_tail:
322 mulx %o3,%g2,%g2
323 add %g2,%o5,%o4
324 dec %o2
325 srlx %o4,32,%o5
326 brz,pt %o2,.L_bn_mul_words_return
327 stuw %o4,[%o0]
328
329 lduw [%o1+4],%g2
330 mulx %o3,%g2,%g2
331 add %g2,%o5,%o4
332 dec %o2
333 srlx %o4,32,%o5
334 brz,pt %o2,.L_bn_mul_words_return
335 stuw %o4,[%o0+4]
336
337 lduw [%o1+8],%g2
338 mulx %o3,%g2,%g2
339 add %g2,%o5,%o4
340 stuw %o4,[%o0+8]
341 retl
342 srlx %o4,32,%o0
343
344.type bn_mul_words,#function
345.size bn_mul_words,(.-bn_mul_words)
346
347.align 32
348.global bn_sqr_words
349/*
350 * void bn_sqr_words(r,a,n)
351 * BN_ULONG *r,*a;
352 * int n;
353 */
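/*
 * Reference semantics, roughly, as a C sketch: every input word is
 * squared into a pair of output words.
 *
 *	while (n-- > 0) {
 *		unsigned long long t = (unsigned long long)a[0] * a[0];
 *		r[0] = (BN_ULONG)t;		! low half
 *		r[1] = (BN_ULONG)(t >> 32);	! high half
 *		a += 1; r += 2;
 *	}
 */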
354bn_sqr_words:
355 sra %o2,%g0,%o2 ! signx %o2
356 brgz,a %o2,.L_bn_sqr_words_proceeed
357 lduw [%o1],%g2
358 retl
359 clr %o0
360 nop
361 nop
362 nop
363
364.L_bn_sqr_words_proceeed:
365 andcc %o2,-4,%g0
366 nop
367 bz,pn %icc,.L_bn_sqr_words_tail
368 nop
369
370.L_bn_sqr_words_loop: ! wow! 32 aligned!
371 lduw [%o1+4],%g3
372 mulx %g2,%g2,%o4
373 stuw %o4,[%o0]
374 srlx %o4,32,%o5
375 stuw %o5,[%o0+4]
376 nop
377
378 lduw [%o1+8],%g2
379 mulx %g3,%g3,%o4
380 dec 4,%o2
381 stuw %o4,[%o0+8]
382 srlx %o4,32,%o5
383 stuw %o5,[%o0+12]
384
385 lduw [%o1+12],%g3
386 mulx %g2,%g2,%o4
387 srlx %o4,32,%o5
388 stuw %o4,[%o0+16]
389 inc 16,%o1
390 stuw %o5,[%o0+20]
391
392 mulx %g3,%g3,%o4
393 inc 32,%o0
394 stuw %o4,[%o0-8]
395 srlx %o4,32,%o5
396 andcc %o2,-4,%g2
397 stuw %o5,[%o0-4]
398 bnz,a,pt %icc,.L_bn_sqr_words_loop
399 lduw [%o1],%g2
400 nop
401
402 brnz,a,pn %o2,.L_bn_sqr_words_tail
403 lduw [%o1],%g2
404.L_bn_sqr_words_return:
405 retl
406 clr %o0
407
408.L_bn_sqr_words_tail:
409 mulx %g2,%g2,%o4
410 dec %o2
411 stuw %o4,[%o0]
412 srlx %o4,32,%o5
413 brz,pt %o2,.L_bn_sqr_words_return
414 stuw %o5,[%o0+4]
415
416 lduw [%o1+4],%g2
417 mulx %g2,%g2,%o4
418 dec %o2
419 stuw %o4,[%o0+8]
420 srlx %o4,32,%o5
421 brz,pt %o2,.L_bn_sqr_words_return
422 stuw %o5,[%o0+12]
423
424 lduw [%o1+8],%g2
425 mulx %g2,%g2,%o4
426 srlx %o4,32,%o5
427 stuw %o4,[%o0+16]
428 stuw %o5,[%o0+20]
429 retl
430 clr %o0
431
432.type bn_sqr_words,#function
433.size bn_sqr_words,(.-bn_sqr_words)
434
435.align 32
436.global bn_div_words
437/*
438 * BN_ULONG bn_div_words(h,l,d)
439 * BN_ULONG h,l,d;
440 */
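/*
 * Reference semantics, roughly, as a C sketch: the double word h:l is
 * divided by d, exactly as the sllx/or/udivx sequence below spells out.
 *
 *	return (BN_ULONG)((((unsigned long long)h << 32) | l) / d);
 */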
441bn_div_words:
442 sllx %o0,32,%o0
443 or %o0,%o1,%o0
444 udivx %o0,%o2,%o0
445 retl
446 srl %o0,%g0,%o0 ! clruw %o0
447
448.type bn_div_words,#function
449.size bn_div_words,(.-bn_div_words)
450
451.align 32
452
453.global bn_add_words
454/*
455 * BN_ULONG bn_add_words(rp,ap,bp,n)
456 * BN_ULONG *rp,*ap,*bp;
457 * int n;
458 */
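/*
 * Reference semantics, roughly, as a C sketch: word-wise addition with
 * carry propagation; the final carry (0 or 1) is the return value.
 *
 *	BN_ULONG c = 0;
 *	while (n-- > 0) {
 *		unsigned long long t = (unsigned long long)*ap++ + *bp++ + c;
 *		*rp++ = (BN_ULONG)t;
 *		c = (BN_ULONG)(t >> 32);
 *	}
 *	return c;
 */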
459bn_add_words:
460 sra %o3,%g0,%o3 ! signx %o3
461 brgz,a %o3,.L_bn_add_words_proceed
462 lduw [%o1],%o4
463 retl
464 clr %o0
465
466.L_bn_add_words_proceed:
467 andcc %o3,-4,%g0
468 bz,pn %icc,.L_bn_add_words_tail
469 addcc %g0,0,%g0 ! clear carry flag
470
471.L_bn_add_words_loop: ! wow! 32 aligned!
472 dec 4,%o3
473 lduw [%o2],%o5
474 lduw [%o1+4],%g1
475 lduw [%o2+4],%g2
476 lduw [%o1+8],%g3
477 lduw [%o2+8],%g4
478 addccc %o5,%o4,%o5
479 stuw %o5,[%o0]
480
481 lduw [%o1+12],%o4
482 lduw [%o2+12],%o5
483 inc 16,%o1
484 addccc %g1,%g2,%g1
485 stuw %g1,[%o0+4]
486
487 inc 16,%o2
488 addccc %g3,%g4,%g3
489 stuw %g3,[%o0+8]
490
491 inc 16,%o0
492 addccc %o5,%o4,%o5
493 stuw %o5,[%o0-4]
494 and %o3,-4,%g1
495 brnz,a,pt %g1,.L_bn_add_words_loop
496 lduw [%o1],%o4
497
498 brnz,a,pn %o3,.L_bn_add_words_tail
499 lduw [%o1],%o4
500.L_bn_add_words_return:
501 clr %o0
502 retl
503 movcs %icc,1,%o0
504 nop
505
506.L_bn_add_words_tail:
507 lduw [%o2],%o5
508 dec %o3
509 addccc %o5,%o4,%o5
510 brz,pt %o3,.L_bn_add_words_return
511 stuw %o5,[%o0]
512
513 lduw [%o1+4],%o4
514 lduw [%o2+4],%o5
515 dec %o3
516 addccc %o5,%o4,%o5
517 brz,pt %o3,.L_bn_add_words_return
518 stuw %o5,[%o0+4]
519
520 lduw [%o1+8],%o4
521 lduw [%o2+8],%o5
522 addccc %o5,%o4,%o5
523 stuw %o5,[%o0+8]
524 clr %o0
525 retl
526 movcs %icc,1,%o0
527
528.type bn_add_words,#function
529.size bn_add_words,(.-bn_add_words)
530
531.global bn_sub_words
532/*
533 * BN_ULONG bn_sub_words(rp,ap,bp,n)
534 * BN_ULONG *rp,*ap,*bp;
535 * int n;
536 */
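/*
 * Reference semantics, roughly, as a C sketch: word-wise subtraction
 * with borrow propagation; the final borrow (0 or 1) is the return
 * value.
 *
 *	BN_ULONG b = 0;
 *	while (n-- > 0) {
 *		BN_ULONG t1 = *ap++, t2 = *bp++;
 *		*rp++ = t1 - t2 - b;
 *		if (t1 != t2)
 *			b = (t1 < t2);
 *	}
 *	return b;
 */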
537bn_sub_words:
538 sra %o3,%g0,%o3 ! signx %o3
539 brgz,a %o3,.L_bn_sub_words_proceed
540 lduw [%o1],%o4
541 retl
542 clr %o0
543
544.L_bn_sub_words_proceed:
545 andcc %o3,-4,%g0
546 bz,pn %icc,.L_bn_sub_words_tail
547 addcc %g0,0,%g0 ! clear carry flag
548
549.L_bn_sub_words_loop: ! wow! 32 aligned!
550 dec 4,%o3
551 lduw [%o2],%o5
552 lduw [%o1+4],%g1
553 lduw [%o2+4],%g2
554 lduw [%o1+8],%g3
555 lduw [%o2+8],%g4
556 subccc %o4,%o5,%o5
557 stuw %o5,[%o0]
558
559 lduw [%o1+12],%o4
560 lduw [%o2+12],%o5
561 inc 16,%o1
562 subccc %g1,%g2,%g2
563 stuw %g2,[%o0+4]
564
565 inc 16,%o2
566 subccc %g3,%g4,%g4
567 stuw %g4,[%o0+8]
568
569 inc 16,%o0
570 subccc %o4,%o5,%o5
571 stuw %o5,[%o0-4]
572 and %o3,-4,%g1
573 brnz,a,pt %g1,.L_bn_sub_words_loop
574 lduw [%o1],%o4
575
576 brnz,a,pn %o3,.L_bn_sub_words_tail
577 lduw [%o1],%o4
578.L_bn_sub_words_return:
579 clr %o0
580 retl
581 movcs %icc,1,%o0
582 nop
583
584.L_bn_sub_words_tail: ! wow! 32 aligned!
585 lduw [%o2],%o5
586 dec %o3
587 subccc %o4,%o5,%o5
588 brz,pt %o3,.L_bn_sub_words_return
589 stuw %o5,[%o0]
590
591 lduw [%o1+4],%o4
592 lduw [%o2+4],%o5
593 dec %o3
594 subccc %o4,%o5,%o5
595 brz,pt %o3,.L_bn_sub_words_return
596 stuw %o5,[%o0+4]
597
598 lduw [%o1+8],%o4
599 lduw [%o2+8],%o5
600 subccc %o4,%o5,%o5
601 stuw %o5,[%o0+8]
602 clr %o0
603 retl
604 movcs %icc,1,%o0
605
606.type bn_sub_words,#function
607.size bn_sub_words,(.-bn_sub_words)
608
609/*
610 * The code below depends on the fact that the upper parts of %l0-%l7
611 * and %i0-%i7 are zeroed by the kernel after a context switch. In
612 * previous versions this comment stated that "the trouble is that
613 * it's not feasible to implement the mumbo-jumbo in less V9
614 * instructions:-(" which apparently isn't true, thanks to the
615 * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
616 * results not from the shorter code, but from the elimination of
617 * multicycle, non-pairable 'rd %y,%rd' instructions.
618 *
619 * Andy.
620 */
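/*
 * A rough C model of that carry handling (a sketch; u64 stands for any
 * unsigned 64-bit type): c_12 is a 64-bit accumulator whose low half is
 * the result word currently being formed and whose high half is the
 * carry into the next word, while c_3 collects overflows out of c_12 in
 * units of 2^32 (that's what t_2 is preloaded with) so that they land
 * in the right place once the accumulator is shifted down.
 *
 *	t_1 = (u64)a * b;		! mulx
 *	s = c_12 + t_1;			! addcc
 *	if (s < c_12)			! bcs,a %xcc,.+8
 *		c_3 += (u64)1 << 32;	!   add c_3,t_2,c_3
 *	c_12 = s;
 *
 * and once the last term of a column has been added:
 *
 *	r[k] = (BN_ULONG)c_12;		! stuw
 *	c_12 = (c_12 >> 32) | c_3;	! srlx + or
 *	c_3 = 0;			! clr c_3 at the start of the next column
 */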
621
622#define FRAME_SIZE -96
623
624/*
625 * Here is register usage map for *all* routines below.
626 */
627#define t_1 %o0
628#define t_2 %o1
629#define c_12 %o2
630#define c_3 %o3
631
632#define ap(I) [%i1+4*I]
633#define bp(I) [%i2+4*I]
634#define rp(I) [%i0+4*I]
635
636#define a_0 %l0
637#define a_1 %l1
638#define a_2 %l2
639#define a_3 %l3
640#define a_4 %l4
641#define a_5 %l5
642#define a_6 %l6
643#define a_7 %l7
644
645#define b_0 %i3
646#define b_1 %i4
647#define b_2 %i5
648#define b_3 %o4
649#define b_4 %o5
650#define b_5 %o7
651#define b_6 %g1
652#define b_7 %g4
653
654.align 32
655.global bn_mul_comba8
656/*
657 * void bn_mul_comba8(r,a,b)
658 * BN_ULONG *r,*a,*b;
659 */
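/*
 * What the unrolled code below computes, roughly, as a C sketch (acc
 * and top are just stand-ins for the c_12 and c_3 registers): an
 * 8x8-word product accumulated column by column, r[k] collecting every
 * a[i]*b[j] with i+j == k plus the carry out of column k-1, which is
 * what the rotating c1/c2/c3 names in the mul_add_c annotations track.
 *
 *	unsigned long long acc = 0, top = 0, t;
 *	int i, k;
 *	for (k = 0; k < 15; k++) {
 *		for (i = 0; i < 8; i++) {
 *			if (k - i < 0 || k - i > 7)
 *				continue;
 *			t = (unsigned long long)a[i] * b[k - i];
 *			acc += t;
 *			if (acc < t)			! carried out of 64 bits
 *				top += (unsigned long long)1 << 32;
 *		}
 *		r[k] = (BN_ULONG)acc;			! one column done
 *		acc = (acc >> 32) | top;		! carry into next column
 *		top = 0;
 *	}
 *	r[15] = (BN_ULONG)acc;
 */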
660bn_mul_comba8:
661 save %sp,FRAME_SIZE,%sp
662 mov 1,t_2
663 lduw ap(0),a_0
664 sllx t_2,32,t_2
665 lduw bp(0),b_0 !=
666 lduw bp(1),b_1
667 mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3);
668 srlx t_1,32,c_12
669 stuw t_1,rp(0) !=!r[0]=c1;
670
671 lduw ap(1),a_1
672 mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1);
673 addcc c_12,t_1,c_12
674 clr c_3 !=
675 bcs,a %xcc,.+8
676 add c_3,t_2,c_3
677 lduw ap(2),a_2
678 mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
679 addcc c_12,t_1,t_1
680 bcs,a %xcc,.+8
681 add c_3,t_2,c_3
682 srlx t_1,32,c_12 !=
683 stuw t_1,rp(1) !r[1]=c2;
684 or c_12,c_3,c_12
685
686 mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
687 addcc c_12,t_1,c_12 !=
688 clr c_3
689 bcs,a %xcc,.+8
690 add c_3,t_2,c_3
691 lduw bp(2),b_2 !=
692 mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
693 addcc c_12,t_1,c_12
694 bcs,a %xcc,.+8
695 add c_3,t_2,c_3 !=
696 lduw bp(3),b_3
697 mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
698 addcc c_12,t_1,t_1
699 bcs,a %xcc,.+8 !=
700 add c_3,t_2,c_3
701 srlx t_1,32,c_12
702 stuw t_1,rp(2) !r[2]=c3;
703 or c_12,c_3,c_12 !=
704
705 mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
706 addcc c_12,t_1,c_12
707 clr c_3
708 bcs,a %xcc,.+8 !=
709 add c_3,t_2,c_3
710 mulx a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3);
711 addcc c_12,t_1,c_12
712 bcs,a %xcc,.+8 !=
713 add c_3,t_2,c_3
714 lduw ap(3),a_3
715 mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
716 addcc c_12,t_1,c_12 !=
717 bcs,a %xcc,.+8
718 add c_3,t_2,c_3
719 lduw ap(4),a_4
720 mulx a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);!=
721 addcc c_12,t_1,t_1
722 bcs,a %xcc,.+8
723 add c_3,t_2,c_3
724 srlx t_1,32,c_12 !=
725 stuw t_1,rp(3) !r[3]=c1;
726 or c_12,c_3,c_12
727
728 mulx a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1);
729 addcc c_12,t_1,c_12 !=
730 clr c_3
731 bcs,a %xcc,.+8
732 add c_3,t_2,c_3
733 mulx a_3,b_1,t_1 !=!mul_add_c(a[3],b[1],c2,c3,c1);
734 addcc c_12,t_1,c_12
735 bcs,a %xcc,.+8
736 add c_3,t_2,c_3
737 mulx a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1);
738 addcc c_12,t_1,c_12
739 bcs,a %xcc,.+8
740 add c_3,t_2,c_3
741 lduw bp(4),b_4 !=
742 mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
743 addcc c_12,t_1,c_12
744 bcs,a %xcc,.+8
745 add c_3,t_2,c_3 !=
746 lduw bp(5),b_5
747 mulx a_0,b_4,t_1 !mul_add_c(a[0],b[4],c2,c3,c1);
748 addcc c_12,t_1,t_1
749 bcs,a %xcc,.+8 !=
750 add c_3,t_2,c_3
751 srlx t_1,32,c_12
752 stuw t_1,rp(4) !r[4]=c2;
753 or c_12,c_3,c_12 !=
754
755 mulx a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2);
756 addcc c_12,t_1,c_12
757 clr c_3
758 bcs,a %xcc,.+8 !=
759 add c_3,t_2,c_3
760 mulx a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2);
761 addcc c_12,t_1,c_12
762 bcs,a %xcc,.+8 !=
763 add c_3,t_2,c_3
764 mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
765 addcc c_12,t_1,c_12
766 bcs,a %xcc,.+8 !=
767 add c_3,t_2,c_3
768 mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
769 addcc c_12,t_1,c_12
770 bcs,a %xcc,.+8 !=
771 add c_3,t_2,c_3
772 lduw ap(5),a_5
773 mulx a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2);
774 addcc c_12,t_1,c_12 !=
775 bcs,a %xcc,.+8
776 add c_3,t_2,c_3
777 lduw ap(6),a_6
778 mulx a_5,b_0,t_1 !=!mul_add_c(a[5],b[0],c3,c1,c2);
779 addcc c_12,t_1,t_1
780 bcs,a %xcc,.+8
781 add c_3,t_2,c_3
782 srlx t_1,32,c_12 !=
783 stuw t_1,rp(5) !r[5]=c3;
784 or c_12,c_3,c_12
785
786 mulx a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3);
787 addcc c_12,t_1,c_12 !=
788 clr c_3
789 bcs,a %xcc,.+8
790 add c_3,t_2,c_3
791 mulx a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3);
792 addcc c_12,t_1,c_12
793 bcs,a %xcc,.+8
794 add c_3,t_2,c_3
795 mulx a_4,b_2,t_1 !=!mul_add_c(a[4],b[2],c1,c2,c3);
796 addcc c_12,t_1,c_12
797 bcs,a %xcc,.+8
798 add c_3,t_2,c_3
799 mulx a_3,b_3,t_1 !=!mul_add_c(a[3],b[3],c1,c2,c3);
800 addcc c_12,t_1,c_12
801 bcs,a %xcc,.+8
802 add c_3,t_2,c_3
803 mulx a_2,b_4,t_1 !=!mul_add_c(a[2],b[4],c1,c2,c3);
804 addcc c_12,t_1,c_12
805 bcs,a %xcc,.+8
806 add c_3,t_2,c_3
807 lduw bp(6),b_6 !=
808 mulx a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3);
809 addcc c_12,t_1,c_12
810 bcs,a %xcc,.+8
811 add c_3,t_2,c_3 !=
812 lduw bp(7),b_7
813 mulx a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3);
814 addcc c_12,t_1,t_1
815 bcs,a %xcc,.+8 !=
816 add c_3,t_2,c_3
817 srlx t_1,32,c_12
818 stuw t_1,rp(6) !r[6]=c1;
819 or c_12,c_3,c_12 !=
820
821 mulx a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1);
822 addcc c_12,t_1,c_12
823 clr c_3
824 bcs,a %xcc,.+8 !=
825 add c_3,t_2,c_3
826 mulx a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1);
827 addcc c_12,t_1,c_12
828 bcs,a %xcc,.+8 !=
829 add c_3,t_2,c_3
830 mulx a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1);
831 addcc c_12,t_1,c_12
832 bcs,a %xcc,.+8 !=
833 add c_3,t_2,c_3
834 mulx a_3,b_4,t_1 !mul_add_c(a[3],b[4],c2,c3,c1);
835 addcc c_12,t_1,c_12
836 bcs,a %xcc,.+8 !=
837 add c_3,t_2,c_3
838 mulx a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1);
839 addcc c_12,t_1,c_12
840 bcs,a %xcc,.+8 !=
841 add c_3,t_2,c_3
842 mulx a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1);
843 addcc c_12,t_1,c_12
844 bcs,a %xcc,.+8 !=
845 add c_3,t_2,c_3
846 lduw ap(7),a_7
847 mulx a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1);
848 addcc c_12,t_1,c_12
849 bcs,a %xcc,.+8
850 add c_3,t_2,c_3
851 mulx a_7,b_0,t_1 !=!mul_add_c(a[7],b[0],c2,c3,c1);
852 addcc c_12,t_1,t_1
853 bcs,a %xcc,.+8
854 add c_3,t_2,c_3
855 srlx t_1,32,c_12 !=
856 stuw t_1,rp(7) !r[7]=c2;
857 or c_12,c_3,c_12
858
859 mulx a_7,b_1,t_1 !=!mul_add_c(a[7],b[1],c3,c1,c2);
860 addcc c_12,t_1,c_12
861 clr c_3
862 bcs,a %xcc,.+8
863 add c_3,t_2,c_3 !=
864 mulx a_6,b_2,t_1 !mul_add_c(a[6],b[2],c3,c1,c2);
865 addcc c_12,t_1,c_12
866 bcs,a %xcc,.+8
867 add c_3,t_2,c_3 !=
868 mulx a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2);
869 addcc c_12,t_1,c_12
870 bcs,a %xcc,.+8
871 add c_3,t_2,c_3 !=
872 mulx a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2);
873 addcc c_12,t_1,c_12
874 bcs,a %xcc,.+8
875 add c_3,t_2,c_3 !=
876 mulx a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2);
877 addcc c_12,t_1,c_12
878 bcs,a %xcc,.+8
879 add c_3,t_2,c_3 !=
880 mulx a_2,b_6,t_1 !mul_add_c(a[2],b[6],c3,c1,c2);
881 addcc c_12,t_1,c_12
882 bcs,a %xcc,.+8
883 add c_3,t_2,c_3 !=
884 mulx a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2);
885 addcc c_12,t_1,t_1
886 bcs,a %xcc,.+8
887 add c_3,t_2,c_3 !=
888 srlx t_1,32,c_12
889 stuw t_1,rp(8) !r[8]=c3;
890 or c_12,c_3,c_12
891
892 mulx a_2,b_7,t_1 !=!mul_add_c(a[2],b[7],c1,c2,c3);
893 addcc c_12,t_1,c_12
894 clr c_3
895 bcs,a %xcc,.+8
896 add c_3,t_2,c_3 !=
897 mulx a_3,b_6,t_1 !mul_add_c(a[3],b[6],c1,c2,c3);
898 addcc c_12,t_1,c_12
899 bcs,a %xcc,.+8 !=
900 add c_3,t_2,c_3
901 mulx a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3);
902 addcc c_12,t_1,c_12
903 bcs,a %xcc,.+8 !=
904 add c_3,t_2,c_3
905 mulx a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3);
906 addcc c_12,t_1,c_12
907 bcs,a %xcc,.+8 !=
908 add c_3,t_2,c_3
909 mulx a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3);
910 addcc c_12,t_1,c_12
911 bcs,a %xcc,.+8 !=
912 add c_3,t_2,c_3
913 mulx a_7,b_2,t_1 !mul_add_c(a[7],b[2],c1,c2,c3);
914 addcc c_12,t_1,t_1
915 bcs,a %xcc,.+8 !=
916 add c_3,t_2,c_3
917 srlx t_1,32,c_12
918 stuw t_1,rp(9) !r[9]=c1;
919 or c_12,c_3,c_12 !=
920
921 mulx a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1);
922 addcc c_12,t_1,c_12
923 clr c_3
924 bcs,a %xcc,.+8 !=
925 add c_3,t_2,c_3
926 mulx a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1);
927 addcc c_12,t_1,c_12
928 bcs,a %xcc,.+8 !=
929 add c_3,t_2,c_3
930 mulx a_5,b_5,t_1 !mul_add_c(a[5],b[5],c2,c3,c1);
931 addcc c_12,t_1,c_12
932 bcs,a %xcc,.+8 !=
933 add c_3,t_2,c_3
934 mulx a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1);
935 addcc c_12,t_1,c_12
936 bcs,a %xcc,.+8 !=
937 add c_3,t_2,c_3
938 mulx a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1);
939 addcc c_12,t_1,t_1
940 bcs,a %xcc,.+8 !=
941 add c_3,t_2,c_3
942 srlx t_1,32,c_12
943 stuw t_1,rp(10) !r[10]=c2;
944 or c_12,c_3,c_12 !=
945
946 mulx a_4,b_7,t_1 !mul_add_c(a[4],b[7],c3,c1,c2);
947 addcc c_12,t_1,c_12
948 clr c_3
949 bcs,a %xcc,.+8 !=
950 add c_3,t_2,c_3
951 mulx a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2);
952 addcc c_12,t_1,c_12
953 bcs,a %xcc,.+8 !=
954 add c_3,t_2,c_3
955 mulx a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2);
956 addcc c_12,t_1,c_12
957 bcs,a %xcc,.+8 !=
958 add c_3,t_2,c_3
959 mulx a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2);
960 addcc c_12,t_1,t_1
961 bcs,a %xcc,.+8 !=
962 add c_3,t_2,c_3
963 srlx t_1,32,c_12
964 stuw t_1,rp(11) !r[11]=c3;
965 or c_12,c_3,c_12 !=
966
967 mulx a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3);
968 addcc c_12,t_1,c_12
969 clr c_3
970 bcs,a %xcc,.+8 !=
971 add c_3,t_2,c_3
972 mulx a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3);
973 addcc c_12,t_1,c_12
974 bcs,a %xcc,.+8 !=
975 add c_3,t_2,c_3
976 mulx a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3);
977 addcc c_12,t_1,t_1
978 bcs,a %xcc,.+8 !=
979 add c_3,t_2,c_3
980 srlx t_1,32,c_12
981 stuw t_1,rp(12) !r[12]=c1;
982 or c_12,c_3,c_12 !=
983
984 mulx a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1);
985 addcc c_12,t_1,c_12
986 clr c_3
987 bcs,a %xcc,.+8 !=
988 add c_3,t_2,c_3
989 mulx a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1);
990 addcc c_12,t_1,t_1
991 bcs,a %xcc,.+8 !=
992 add c_3,t_2,c_3
993 srlx t_1,32,c_12
994 st t_1,rp(13) !r[13]=c2;
995 or c_12,c_3,c_12 !=
996
997 mulx a_7,b_7,t_1 !mul_add_c(a[7],b[7],c3,c1,c2);
998 addcc c_12,t_1,t_1
999 srlx t_1,32,c_12 !=
1000 stuw t_1,rp(14) !r[14]=c3;
1001 stuw c_12,rp(15) !r[15]=c1;
1002
1003 ret
1004 restore %g0,%g0,%o0 !=
1005
1006.type bn_mul_comba8,#function
1007.size bn_mul_comba8,(.-bn_mul_comba8)
1008
1009.align 32
1010
1011.global bn_mul_comba4
1012/*
1013 * void bn_mul_comba4(r,a,b)
1014 * BN_ULONG *r,*a,*b;
1015 */
1016bn_mul_comba4:
1017 save %sp,FRAME_SIZE,%sp
1018 lduw ap(0),a_0
1019 mov 1,t_2
1020 lduw bp(0),b_0
1021 sllx t_2,32,t_2 !=
1022 lduw bp(1),b_1
1023 mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3);
1024 srlx t_1,32,c_12
1025 stuw t_1,rp(0) !=!r[0]=c1;
1026
1027 lduw ap(1),a_1
1028 mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1);
1029 addcc c_12,t_1,c_12
1030 clr c_3 !=
1031 bcs,a %xcc,.+8
1032 add c_3,t_2,c_3
1033 lduw ap(2),a_2
1034 mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
1035 addcc c_12,t_1,t_1
1036 bcs,a %xcc,.+8
1037 add c_3,t_2,c_3
1038 srlx t_1,32,c_12 !=
1039 stuw t_1,rp(1) !r[1]=c2;
1040 or c_12,c_3,c_12
1041
1042 mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
1043 addcc c_12,t_1,c_12 !=
1044 clr c_3
1045 bcs,a %xcc,.+8
1046 add c_3,t_2,c_3
1047 lduw bp(2),b_2 !=
1048 mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
1049 addcc c_12,t_1,c_12
1050 bcs,a %xcc,.+8
1051 add c_3,t_2,c_3 !=
1052 lduw bp(3),b_3
1053 mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
1054 addcc c_12,t_1,t_1
1055 bcs,a %xcc,.+8 !=
1056 add c_3,t_2,c_3
1057 srlx t_1,32,c_12
1058 stuw t_1,rp(2) !r[2]=c3;
1059 or c_12,c_3,c_12 !=
1060
1061 mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
1062 addcc c_12,t_1,c_12
1063 clr c_3
1064 bcs,a %xcc,.+8 !=
1065 add c_3,t_2,c_3
1066 mulx a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3);
1067 addcc c_12,t_1,c_12
1068 bcs,a %xcc,.+8 !=
1069 add c_3,t_2,c_3
1070 lduw ap(3),a_3
1071 mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
1072 addcc c_12,t_1,c_12 !=
1073 bcs,a %xcc,.+8
1074 add c_3,t_2,c_3
1075 mulx a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!=
1076 addcc c_12,t_1,t_1 !=
1077 bcs,a %xcc,.+8
1078 add c_3,t_2,c_3
1079 srlx t_1,32,c_12
1080 stuw t_1,rp(3) !=!r[3]=c1;
1081 or c_12,c_3,c_12
1082
1083 mulx a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
1084 addcc c_12,t_1,c_12
1085 clr c_3 !=
1086 bcs,a %xcc,.+8
1087 add c_3,t_2,c_3
1088 mulx a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1);
1089 addcc c_12,t_1,c_12 !=
1090 bcs,a %xcc,.+8
1091 add c_3,t_2,c_3
1092 mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
1093 addcc c_12,t_1,t_1 !=
1094 bcs,a %xcc,.+8
1095 add c_3,t_2,c_3
1096 srlx t_1,32,c_12
1097 stuw t_1,rp(4) !=!r[4]=c2;
1098 or c_12,c_3,c_12
1099
1100 mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
1101 addcc c_12,t_1,c_12
1102 clr c_3 !=
1103 bcs,a %xcc,.+8
1104 add c_3,t_2,c_3
1105 mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
1106 addcc c_12,t_1,t_1 !=
1107 bcs,a %xcc,.+8
1108 add c_3,t_2,c_3
1109 srlx t_1,32,c_12
1110 stuw t_1,rp(5) !=!r[5]=c3;
1111 or c_12,c_3,c_12
1112
1113 mulx a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
1114 addcc c_12,t_1,t_1
1115 srlx t_1,32,c_12 !=
1116 stuw t_1,rp(6) !r[6]=c1;
1117 stuw c_12,rp(7) !r[7]=c2;
1118
1119 ret
1120 restore %g0,%g0,%o0
1121
1122.type bn_mul_comba4,#function
1123.size bn_mul_comba4,(.-bn_mul_comba4)
1124
1125.align 32
1126
1127.global bn_sqr_comba8
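/*
 * void bn_sqr_comba8(r,a)
 * BN_ULONG *r,*a;
 */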
1128bn_sqr_comba8:
1129 save %sp,FRAME_SIZE,%sp
1130 mov 1,t_2
1131 lduw ap(0),a_0
1132 sllx t_2,32,t_2
1133 lduw ap(1),a_1
1134 mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3);
1135 srlx t_1,32,c_12
1136 stuw t_1,rp(0) !r[0]=c1;
1137
1138 lduw ap(2),a_2
1139 mulx a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
1140 addcc c_12,t_1,c_12
1141 clr c_3
1142 bcs,a %xcc,.+8
1143 add c_3,t_2,c_3
1144 addcc c_12,t_1,t_1
1145 bcs,a %xcc,.+8
1146 add c_3,t_2,c_3
1147 srlx t_1,32,c_12
1148 stuw t_1,rp(1) !r[1]=c2;
1149 or c_12,c_3,c_12
1150
1151 mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1152 addcc c_12,t_1,c_12
1153 clr c_3
1154 bcs,a %xcc,.+8
1155 add c_3,t_2,c_3
1156 addcc c_12,t_1,c_12
1157 bcs,a %xcc,.+8
1158 add c_3,t_2,c_3
1159 lduw ap(3),a_3
1160 mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1161 addcc c_12,t_1,t_1
1162 bcs,a %xcc,.+8
1163 add c_3,t_2,c_3
1164 srlx t_1,32,c_12
1165 stuw t_1,rp(2) !r[2]=c3;
1166 or c_12,c_3,c_12
1167
1168 mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
1169 addcc c_12,t_1,c_12
1170 clr c_3
1171 bcs,a %xcc,.+8
1172 add c_3,t_2,c_3
1173 addcc c_12,t_1,c_12
1174 bcs,a %xcc,.+8
1175 add c_3,t_2,c_3
1176 lduw ap(4),a_4
1177 mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1178 addcc c_12,t_1,c_12
1179 bcs,a %xcc,.+8
1180 add c_3,t_2,c_3
1181 addcc c_12,t_1,t_1
1182 bcs,a %xcc,.+8
1183 add c_3,t_2,c_3
1184 srlx t_1,32,c_12
1185 st t_1,rp(3) !r[3]=c1;
1186 or c_12,c_3,c_12
1187
1188 mulx a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1);
1189 addcc c_12,t_1,c_12
1190 clr c_3
1191 bcs,a %xcc,.+8
1192 add c_3,t_2,c_3
1193 addcc c_12,t_1,c_12
1194 bcs,a %xcc,.+8
1195 add c_3,t_2,c_3
1196 mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1197 addcc c_12,t_1,c_12
1198 bcs,a %xcc,.+8
1199 add c_3,t_2,c_3
1200 addcc c_12,t_1,c_12
1201 bcs,a %xcc,.+8
1202 add c_3,t_2,c_3
1203 lduw ap(5),a_5
1204 mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1205 addcc c_12,t_1,t_1
1206 bcs,a %xcc,.+8
1207 add c_3,t_2,c_3
1208 srlx t_1,32,c_12
1209 stuw t_1,rp(4) !r[4]=c2;
1210 or c_12,c_3,c_12
1211
1212 mulx a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2);
1213 addcc c_12,t_1,c_12
1214 clr c_3
1215 bcs,a %xcc,.+8
1216 add c_3,t_2,c_3
1217 addcc c_12,t_1,c_12
1218 bcs,a %xcc,.+8
1219 add c_3,t_2,c_3
1220 mulx a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2);
1221 addcc c_12,t_1,c_12
1222 bcs,a %xcc,.+8
1223 add c_3,t_2,c_3
1224 addcc c_12,t_1,c_12
1225 bcs,a %xcc,.+8
1226 add c_3,t_2,c_3
1227 lduw ap(6),a_6
1228 mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
1229 addcc c_12,t_1,c_12
1230 bcs,a %xcc,.+8
1231 add c_3,t_2,c_3
1232 addcc c_12,t_1,t_1
1233 bcs,a %xcc,.+8
1234 add c_3,t_2,c_3
1235 srlx t_1,32,c_12
1236 stuw t_1,rp(5) !r[5]=c3;
1237 or c_12,c_3,c_12
1238
1239 mulx a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3);
1240 addcc c_12,t_1,c_12
1241 clr c_3
1242 bcs,a %xcc,.+8
1243 add c_3,t_2,c_3
1244 addcc c_12,t_1,c_12
1245 bcs,a %xcc,.+8
1246 add c_3,t_2,c_3
1247 mulx a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3);
1248 addcc c_12,t_1,c_12
1249 bcs,a %xcc,.+8
1250 add c_3,t_2,c_3
1251 addcc c_12,t_1,c_12
1252 bcs,a %xcc,.+8
1253 add c_3,t_2,c_3
1254 mulx a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3);
1255 addcc c_12,t_1,c_12
1256 bcs,a %xcc,.+8
1257 add c_3,t_2,c_3
1258 addcc c_12,t_1,c_12
1259 bcs,a %xcc,.+8
1260 add c_3,t_2,c_3
1261 lduw ap(7),a_7
1262 mulx a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3);
1263 addcc c_12,t_1,t_1
1264 bcs,a %xcc,.+8
1265 add c_3,t_2,c_3
1266 srlx t_1,32,c_12
1267 stuw t_1,rp(6) !r[6]=c1;
1268 or c_12,c_3,c_12
1269
1270 mulx a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1);
1271 addcc c_12,t_1,c_12
1272 clr c_3
1273 bcs,a %xcc,.+8
1274 add c_3,t_2,c_3
1275 addcc c_12,t_1,c_12
1276 bcs,a %xcc,.+8
1277 add c_3,t_2,c_3
1278 mulx a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1);
1279 addcc c_12,t_1,c_12
1280 bcs,a %xcc,.+8
1281 add c_3,t_2,c_3
1282 addcc c_12,t_1,c_12
1283 bcs,a %xcc,.+8
1284 add c_3,t_2,c_3
1285 mulx a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1);
1286 addcc c_12,t_1,c_12
1287 bcs,a %xcc,.+8
1288 add c_3,t_2,c_3
1289 addcc c_12,t_1,c_12
1290 bcs,a %xcc,.+8
1291 add c_3,t_2,c_3
1292 mulx a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1);
1293 addcc c_12,t_1,c_12
1294 bcs,a %xcc,.+8
1295 add c_3,t_2,c_3
1296 addcc c_12,t_1,t_1
1297 bcs,a %xcc,.+8
1298 add c_3,t_2,c_3
1299 srlx t_1,32,c_12
1300 stuw t_1,rp(7) !r[7]=c2;
1301 or c_12,c_3,c_12
1302
1303 mulx a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2);
1304 addcc c_12,t_1,c_12
1305 clr c_3
1306 bcs,a %xcc,.+8
1307 add c_3,t_2,c_3
1308 addcc c_12,t_1,c_12
1309 bcs,a %xcc,.+8
1310 add c_3,t_2,c_3
1311 mulx a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2);
1312 addcc c_12,t_1,c_12
1313 bcs,a %xcc,.+8
1314 add c_3,t_2,c_3
1315 addcc c_12,t_1,c_12
1316 bcs,a %xcc,.+8
1317 add c_3,t_2,c_3
1318 mulx a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2);
1319 addcc c_12,t_1,c_12
1320 bcs,a %xcc,.+8
1321 add c_3,t_2,c_3
1322 addcc c_12,t_1,c_12
1323 bcs,a %xcc,.+8
1324 add c_3,t_2,c_3
1325 mulx a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2);
1326 addcc c_12,t_1,t_1
1327 bcs,a %xcc,.+8
1328 add c_3,t_2,c_3
1329 srlx t_1,32,c_12
1330 stuw t_1,rp(8) !r[8]=c3;
1331 or c_12,c_3,c_12
1332
1333 mulx a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3);
1334 addcc c_12,t_1,c_12
1335 clr c_3
1336 bcs,a %xcc,.+8
1337 add c_3,t_2,c_3
1338 addcc c_12,t_1,c_12
1339 bcs,a %xcc,.+8
1340 add c_3,t_2,c_3
1341 mulx a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3);
1342 addcc c_12,t_1,c_12
1343 bcs,a %xcc,.+8
1344 add c_3,t_2,c_3
1345 addcc c_12,t_1,c_12
1346 bcs,a %xcc,.+8
1347 add c_3,t_2,c_3
1348 mulx a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3);
1349 addcc c_12,t_1,c_12
1350 bcs,a %xcc,.+8
1351 add c_3,t_2,c_3
1352 addcc c_12,t_1,t_1
1353 bcs,a %xcc,.+8
1354 add c_3,t_2,c_3
1355 srlx t_1,32,c_12
1356 stuw t_1,rp(9) !r[9]=c1;
1357 or c_12,c_3,c_12
1358
1359 mulx a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1);
1360 addcc c_12,t_1,c_12
1361 clr c_3
1362 bcs,a %xcc,.+8
1363 add c_3,t_2,c_3
1364 addcc c_12,t_1,c_12
1365 bcs,a %xcc,.+8
1366 add c_3,t_2,c_3
1367 mulx a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1);
1368 addcc c_12,t_1,c_12
1369 bcs,a %xcc,.+8
1370 add c_3,t_2,c_3
1371 addcc c_12,t_1,c_12
1372 bcs,a %xcc,.+8
1373 add c_3,t_2,c_3
1374 mulx a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1);
1375 addcc c_12,t_1,t_1
1376 bcs,a %xcc,.+8
1377 add c_3,t_2,c_3
1378 srlx t_1,32,c_12
1379 stuw t_1,rp(10) !r[10]=c2;
1380 or c_12,c_3,c_12
1381
1382 mulx a_4,a_7,t_1 !sqr_add_c2(a,7,4,c3,c1,c2);
1383 addcc c_12,t_1,c_12
1384 clr c_3
1385 bcs,a %xcc,.+8
1386 add c_3,t_2,c_3
1387 addcc c_12,t_1,c_12
1388 bcs,a %xcc,.+8
1389 add c_3,t_2,c_3
1390 mulx a_5,a_6,t_1 !sqr_add_c2(a,6,5,c3,c1,c2);
1391 addcc c_12,t_1,c_12
1392 bcs,a %xcc,.+8
1393 add c_3,t_2,c_3
1394 addcc c_12,t_1,t_1
1395 bcs,a %xcc,.+8
1396 add c_3,t_2,c_3
1397 srlx t_1,32,c_12
1398 stuw t_1,rp(11) !r[11]=c3;
1399 or c_12,c_3,c_12
1400
1401 mulx a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3);
1402 addcc c_12,t_1,c_12
1403 clr c_3
1404 bcs,a %xcc,.+8
1405 add c_3,t_2,c_3
1406 addcc c_12,t_1,c_12
1407 bcs,a %xcc,.+8
1408 add c_3,t_2,c_3
1409 mulx a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3);
1410 addcc c_12,t_1,t_1
1411 bcs,a %xcc,.+8
1412 add c_3,t_2,c_3
1413 srlx t_1,32,c_12
1414 stuw t_1,rp(12) !r[12]=c1;
1415 or c_12,c_3,c_12
1416
1417 mulx a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1);
1418 addcc c_12,t_1,c_12
1419 clr c_3
1420 bcs,a %xcc,.+8
1421 add c_3,t_2,c_3
1422 addcc c_12,t_1,t_1
1423 bcs,a %xcc,.+8
1424 add c_3,t_2,c_3
1425 srlx t_1,32,c_12
1426 stuw t_1,rp(13) !r[13]=c2;
1427 or c_12,c_3,c_12
1428
1429 mulx a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2);
1430 addcc c_12,t_1,t_1
1431 srlx t_1,32,c_12
1432 stuw t_1,rp(14) !r[14]=c3;
1433 stuw c_12,rp(15) !r[15]=c1;
1434
1435 ret
1436 restore %g0,%g0,%o0
1437
1438.type bn_sqr_comba8,#function
1439.size bn_sqr_comba8,(.-bn_sqr_comba8)
1440
1441.align 32
1442
1443.global bn_sqr_comba4
1444/*
1445 * void bn_sqr_comba4(r,a)
1446 * BN_ULONG *r,*a;
1447 */
1448bn_sqr_comba4:
1449 save %sp,FRAME_SIZE,%sp
1450 mov 1,t_2
1451 lduw ap(0),a_0
1452 sllx t_2,32,t_2
1453 lduw ap(1),a_1
1454 mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3);
1455 srlx t_1,32,c_12
1456 stuw t_1,rp(0) !r[0]=c1;
1457
1458 lduw ap(2),a_2
1459 mulx a_0,a_1,t_1 !sqr_add_c2(a,1,0,c2,c3,c1);
1460 addcc c_12,t_1,c_12
1461 clr c_3
1462 bcs,a %xcc,.+8
1463 add c_3,t_2,c_3
1464 addcc c_12,t_1,t_1
1465 bcs,a %xcc,.+8
1466 add c_3,t_2,c_3
1467 srlx t_1,32,c_12
1468 stuw t_1,rp(1) !r[1]=c2;
1469 or c_12,c_3,c_12
1470
1471 mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1472 addcc c_12,t_1,c_12
1473 clr c_3
1474 bcs,a %xcc,.+8
1475 add c_3,t_2,c_3
1476 addcc c_12,t_1,c_12
1477 bcs,a %xcc,.+8
1478 add c_3,t_2,c_3
1479 lduw ap(3),a_3
1480 mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1481 addcc c_12,t_1,t_1
1482 bcs,a %xcc,.+8
1483 add c_3,t_2,c_3
1484 srlx t_1,32,c_12
1485 stuw t_1,rp(2) !r[2]=c3;
1486 or c_12,c_3,c_12
1487
1488 mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
1489 addcc c_12,t_1,c_12
1490 clr c_3
1491 bcs,a %xcc,.+8
1492 add c_3,t_2,c_3
1493 addcc c_12,t_1,c_12
1494 bcs,a %xcc,.+8
1495 add c_3,t_2,c_3
1496 mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1497 addcc c_12,t_1,c_12
1498 bcs,a %xcc,.+8
1499 add c_3,t_2,c_3
1500 addcc c_12,t_1,t_1
1501 bcs,a %xcc,.+8
1502 add c_3,t_2,c_3
1503 srlx t_1,32,c_12
1504 stuw t_1,rp(3) !r[3]=c1;
1505 or c_12,c_3,c_12
1506
1507 mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1508 addcc c_12,t_1,c_12
1509 clr c_3
1510 bcs,a %xcc,.+8
1511 add c_3,t_2,c_3
1512 addcc c_12,t_1,c_12
1513 bcs,a %xcc,.+8
1514 add c_3,t_2,c_3
1515 mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1516 addcc c_12,t_1,t_1
1517 bcs,a %xcc,.+8
1518 add c_3,t_2,c_3
1519 srlx t_1,32,c_12
1520 stuw t_1,rp(4) !r[4]=c2;
1521 or c_12,c_3,c_12
1522
1523 mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
1524 addcc c_12,t_1,c_12
1525 clr c_3
1526 bcs,a %xcc,.+8
1527 add c_3,t_2,c_3
1528 addcc c_12,t_1,t_1
1529 bcs,a %xcc,.+8
1530 add c_3,t_2,c_3
1531 srlx t_1,32,c_12
1532 stuw t_1,rp(5) !r[5]=c3;
1533 or c_12,c_3,c_12
1534
1535 mulx a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3);
1536 addcc c_12,t_1,t_1
1537 srlx t_1,32,c_12
1538 stuw t_1,rp(6) !r[6]=c1;
1539 stuw c_12,rp(7) !r[7]=c2;
1540
1541 ret
1542 restore %g0,%g0,%o0
1543
1544.type bn_sqr_comba4,#function
1545.size bn_sqr_comba4,(.-bn_sqr_comba4)
1546
1547.align 32