diff options
Diffstat (limited to 'src/lib/libcrypto/bn/asm/sparcv8plus.S')
-rw-r--r-- | src/lib/libcrypto/bn/asm/sparcv8plus.S | 1558 |
1 files changed, 0 insertions, 1558 deletions
diff --git a/src/lib/libcrypto/bn/asm/sparcv8plus.S b/src/lib/libcrypto/bn/asm/sparcv8plus.S deleted file mode 100644 index 63de1860f2..0000000000 --- a/src/lib/libcrypto/bn/asm/sparcv8plus.S +++ /dev/null | |||
@@ -1,1558 +0,0 @@ | |||
1 | .ident "sparcv8plus.s, Version 1.4" | ||
2 | .ident "SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" | ||
3 | |||
4 | /* | ||
5 | * ==================================================================== | ||
6 | * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
7 | * project. | ||
8 | * | ||
9 | * Rights for redistribution and usage in source and binary forms are | ||
10 | * granted according to the OpenSSL license. Warranty of any kind is | ||
11 | * disclaimed. | ||
12 | * ==================================================================== | ||
13 | */ | ||
14 | |||
15 | /* | ||
16 | * This is my modest contribution to OpenSSL project (see | ||
17 | * http://www.openssl.org/ for more information about it) and is | ||
18 | * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c | ||
19 | * module. For updates see http://fy.chalmers.se/~appro/hpe/. | ||
20 | * | ||
21 | * Questions-n-answers. | ||
22 | * | ||
23 | * Q. How to compile? | ||
24 | * A. With SC4.x/SC5.x: | ||
25 | * | ||
26 | * cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o | ||
27 | * | ||
28 | * and with gcc: | ||
29 | * | ||
30 | * gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o | ||
31 | * | ||
32 | * or if above fails (it does if you have gas installed): | ||
33 | * | ||
34 | * gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o | ||
35 | * | ||
36 | * Quick-n-dirty way to fuse the module into the library. | ||
37 | * Provided that the library is already configured and built | ||
38 | * (in 0.9.2 case with no-asm option): | ||
39 | * | ||
40 | * # cd crypto/bn | ||
41 | * # cp /some/place/bn_asm.sparc.v8plus.S . | ||
42 | * # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o | ||
43 | * # make | ||
44 | * # cd ../.. | ||
45 | * # make; make test | ||
46 | * | ||
47 | * Quick-n-dirty way to get rid of it: | ||
48 | * | ||
49 | * # cd crypto/bn | ||
50 | * # touch bn_asm.c | ||
51 | * # make | ||
52 | * # cd ../.. | ||
53 | * # make; make test | ||
54 | * | ||
55 | * Q. V8plus architecture? What kind of beast is that? | ||
56 | * A. Well, it's rather a programming model than an architecture... | ||
57 | * It's actually v9-compliant, i.e. *any* UltraSPARC, CPU under | ||
58 | * special conditions, namely when kernel doesn't preserve upper | ||
59 | * 32 bits of otherwise 64-bit registers during a context switch. | ||
60 | * | ||
61 | * Q. Why just UltraSPARC? What about SuperSPARC? | ||
62 | * A. Original release did target UltraSPARC only. Now SuperSPARC | ||
63 | * version is provided along. Both versions share bn_*comba[48] | ||
64 | * implementations (see comment later in code for explanation). | ||
65 | * But what's so special about this UltraSPARC implementation? | ||
66 | * Why didn't I let compiler do the job? Trouble is that most of | ||
67 | * available compilers (well, SC5.0 is the only exception) don't | ||
68 | * attempt to take advantage of UltraSPARC's 64-bitness under | ||
69 | * 32-bit kernels even though it's perfectly possible (see next | ||
70 | * question). | ||
71 | * | ||
72 | * Q. 64-bit registers under 32-bit kernels? Didn't you just say it | ||
73 | * doesn't work? | ||
74 | * A. You can't address *all* registers as 64-bit wide:-( The catch is | ||
75 | * that you actually may rely upon %o0-%o5 and %g1-%g4 being fully | ||
76 | * preserved if you're in a leaf function, i.e. such never calling | ||
77 | * any other functions. All functions in this module are leaf and | ||
78 | * 10 registers is a handful. And as a matter of fact non-"comba" | ||
79 | * routines don't require even that much and I could even afford to | ||
80 | * not allocate own stack frame for 'em:-) | ||
81 | * | ||
82 | * Q. What about 64-bit kernels? | ||
83 | * A. What about 'em? Just kidding:-) Pure 64-bit version is currently | ||
84 | * under evaluation and development... | ||
85 | * | ||
86 | * Q. What about shared libraries? | ||
87 | * A. What about 'em? Kidding again:-) Code does *not* contain any | ||
88 | * code position dependencies and it's safe to include it into | ||
89 | * shared library as is. | ||
90 | * | ||
91 | * Q. How much faster does it go? | ||
92 | * A. Do you have a good benchmark? In either case below is what I | ||
93 | * experience with crypto/bn/expspeed.c test program: | ||
94 | * | ||
95 | * v8plus module on U10/300MHz against bn_asm.c compiled with: | ||
96 | * | ||
97 | * cc-5.0 -xarch=v8plus -xO5 -xdepend +7-12% | ||
98 | * cc-4.2 -xarch=v8plus -xO5 -xdepend +25-35% | ||
99 | * egcs-1.1.2 -mcpu=ultrasparc -O3 +35-45% | ||
100 | * | ||
101 | * v8 module on SS10/60MHz against bn_asm.c compiled with: | ||
102 | * | ||
103 | * cc-5.0 -xarch=v8 -xO5 -xdepend +7-10% | ||
104 | * cc-4.2 -xarch=v8 -xO5 -xdepend +10% | ||
105 | * egcs-1.1.2 -mv8 -O3 +35-45% | ||
106 | * | ||
107 | * As you can see it's damn hard to beat the new Sun C compiler | ||
108 | * and it's in first place GNU C users who will appreciate this | ||
109 | * assembler implementation:-) | ||
110 | */ | ||
111 | |||
112 | /* | ||
113 | * Revision history. | ||
114 | * | ||
115 | * 1.0 - initial release; | ||
116 | * 1.1 - new loop unrolling model(*); | ||
117 | * - some more fine tuning; | ||
118 | * 1.2 - made gas friendly; | ||
119 | * - updates to documentation concerning v9; | ||
120 | * - new performance comparison matrix; | ||
121 | * 1.3 - fixed problem with /usr/ccs/lib/cpp; | ||
122 | * 1.4 - native V9 bn_*_comba[48] implementation (15% more efficient) | ||
123 | * resulting in slight overall performance kick; | ||
124 | * - some retunes; | ||
125 | * - support for GNU as added; | ||
126 | * | ||
127 | * (*) Originally unrolled loop looked like this: | ||
128 | * for (;;) { | ||
129 | * op(p+0); if (--n==0) break; | ||
130 | * op(p+1); if (--n==0) break; | ||
131 | * op(p+2); if (--n==0) break; | ||
132 | * op(p+3); if (--n==0) break; | ||
133 | * p+=4; | ||
134 | * } | ||
135 | * I unroll according to following: | ||
136 | * while (n&~3) { | ||
137 | * op(p+0); op(p+1); op(p+2); op(p+3); | ||
138 | * p+=4; n-=4; | ||
139 | * } | ||
140 | * if (n) { | ||
141 | * op(p+0); if (--n==0) return; | ||
142 | * op(p+1); if (--n==0) return; | ||
143 | * op(p+2); return; | ||
144 | * } | ||
145 | */ | ||
146 | |||
147 | #if defined(__SUNPRO_C) && defined(__sparcv9) | ||
148 | /* They've said -xarch=v9 at command line */ | ||
149 | .register %g2,#scratch | ||
150 | .register %g3,#scratch | ||
151 | # define FRAME_SIZE -192 | ||
152 | #elif defined(__GNUC__) && defined(__arch64__) | ||
153 | /* They've said -m64 at command line */ | ||
154 | .register %g2,#scratch | ||
155 | .register %g3,#scratch | ||
156 | # define FRAME_SIZE -192 | ||
157 | #else | ||
158 | # define FRAME_SIZE -96 | ||
159 | #endif | ||
/*
 * FRAME_SIZE is the stack adjustment handed to `save` by the routines
 * further down that open a register window (bn_mul_comba8 and
 * bn_mul_comba4): -192 for the two 64-bit build branches above, -96
 * otherwise. The 64-bit branches additionally declare %g2 and %g3 as
 * scratch registers with .register.
 */
160 | /* | ||
161 | * GNU assembler can't stand stuw:-( | ||
162 | */ | ||
163 | #define stuw st | ||
/* From here on every `stuw` in this file is assembled as a plain `st`. */
164 | |||
165 | .section ".text",#alloc,#execinstr | ||
166 | .file "bn_asm.sparc.v8plus.S" | ||
167 | |||
168 | .align 32 | ||
169 | |||
170 | .global bn_mul_add_words | ||
171 | /* | ||
172 | * BN_ULONG bn_mul_add_words(rp,ap,num,w) | ||
173 | * BN_ULONG *rp,*ap; | ||
174 | * int num; | ||
175 | * BN_ULONG w; | ||
176 | */ | ||
/*
 * Multiply-accumulate: for each of the num 32-bit words,
 *	rp[i] = low32(rp[i] + ap[i]*w + carry), carry = high part,
 * and the final carry is returned in %o0. Returns 0 without touching
 * memory when num <= 0.
 *
 * Leaf routine (no register window). Register roles:
 *	%o0 = rp, %o1 = ap, %o2 = words remaining, %o3 = w,
 *	%o5 = running carry, %o4 = 64-bit sum for the current word,
 *	%g1 = current rp word, %g2/%g3 = current/prefetched ap words.
 *
 * Since rp[i], the carry and both multiplicands are all below 2^32,
 * rp[i] + carry + ap[i]*w never exceeds 2^64-1, so one 64-bit add
 * chain per word is enough and the carry is simply the sum >> 32.
 */
177 | bn_mul_add_words: | ||
178 | sra %o2,%g0,%o2 ! signx %o2 | ||
/* Annulled delay slot: ap[0] is only preloaded when num > 0. */
179 | brgz,a %o2,.L_bn_mul_add_words_proceed | ||
180 | lduw [%o1],%g2 | ||
181 | retl | ||
182 | clr %o0 | ||
/* Padding; the loop label below is annotated as 32-byte aligned. */
183 | nop | ||
184 | nop | ||
185 | nop | ||
186 | |||
187 | .L_bn_mul_add_words_proceed: | ||
188 | srl %o3,%g0,%o3 ! clruw %o3 | ||
/* Fewer than 4 words? Go straight to the tail. carry = 0 either way. */
189 | andcc %o2,-4,%g0 | ||
190 | bz,pn %icc,.L_bn_mul_add_words_tail | ||
191 | clr %o5 | ||
192 | |||
/*
 * Main loop, 4 words per iteration. The next ap word is loaded while
 * the current product is being formed; counter and pointer updates are
 * interleaved with the arithmetic.
 */
193 | .L_bn_mul_add_words_loop: ! wow! 32 aligned! | ||
194 | lduw [%o0],%g1 | ||
195 | lduw [%o1+4],%g3 | ||
196 | mulx %o3,%g2,%g2 | ||
197 | add %g1,%o5,%o4 | ||
198 | nop | ||
199 | add %o4,%g2,%o4 | ||
200 | stuw %o4,[%o0] | ||
201 | srlx %o4,32,%o5 | ||
202 | |||
203 | lduw [%o0+4],%g1 | ||
204 | lduw [%o1+8],%g2 | ||
205 | mulx %o3,%g3,%g3 | ||
206 | add %g1,%o5,%o4 | ||
207 | dec 4,%o2 | ||
208 | add %o4,%g3,%o4 | ||
209 | stuw %o4,[%o0+4] | ||
210 | srlx %o4,32,%o5 | ||
211 | |||
212 | lduw [%o0+8],%g1 | ||
213 | lduw [%o1+12],%g3 | ||
214 | mulx %o3,%g2,%g2 | ||
215 | add %g1,%o5,%o4 | ||
216 | inc 16,%o1 | ||
217 | add %o4,%g2,%o4 | ||
218 | stuw %o4,[%o0+8] | ||
219 | srlx %o4,32,%o5 | ||
220 | |||
221 | lduw [%o0+12],%g1 | ||
222 | mulx %o3,%g3,%g3 | ||
223 | add %g1,%o5,%o4 | ||
224 | inc 16,%o0 | ||
225 | add %o4,%g3,%o4 | ||
226 | andcc %o2,-4,%g0 | ||
/* rp was already advanced by 16, hence the -4 offset for word 3. */
227 | stuw %o4,[%o0-4] | ||
228 | srlx %o4,32,%o5 | ||
229 | bnz,a,pt %icc,.L_bn_mul_add_words_loop | ||
230 | lduw [%o1],%g2 | ||
231 | |||
/* 1..3 words left: handle them in the tail, else return the carry. */
232 | brnz,a,pn %o2,.L_bn_mul_add_words_tail | ||
233 | lduw [%o1],%g2 | ||
234 | .L_bn_mul_add_words_return: | ||
235 | retl | ||
236 | mov %o5,%o0 | ||
237 | |||
/*
 * Tail: up to three remaining words, with an early return after each
 * one. Entered with %g2 = ap[0] already loaded and %o5 = carry.
 */
238 | .L_bn_mul_add_words_tail: | ||
239 | lduw [%o0],%g1 | ||
240 | mulx %o3,%g2,%g2 | ||
241 | add %g1,%o5,%o4 | ||
242 | dec %o2 | ||
243 | add %o4,%g2,%o4 | ||
244 | srlx %o4,32,%o5 | ||
245 | brz,pt %o2,.L_bn_mul_add_words_return | ||
246 | stuw %o4,[%o0] | ||
247 | |||
248 | lduw [%o1+4],%g2 | ||
249 | lduw [%o0+4],%g1 | ||
250 | mulx %o3,%g2,%g2 | ||
251 | add %g1,%o5,%o4 | ||
252 | dec %o2 | ||
253 | add %o4,%g2,%o4 | ||
254 | srlx %o4,32,%o5 | ||
255 | brz,pt %o2,.L_bn_mul_add_words_return | ||
256 | stuw %o4,[%o0+4] | ||
257 | |||
258 | lduw [%o1+8],%g2 | ||
259 | lduw [%o0+8],%g1 | ||
260 | mulx %o3,%g2,%g2 | ||
261 | add %g1,%o5,%o4 | ||
262 | add %o4,%g2,%o4 | ||
263 | stuw %o4,[%o0+8] | ||
/* Third tail word: the carry goes directly into the return register. */
264 | retl | ||
265 | srlx %o4,32,%o0 | ||
266 | |||
267 | .type bn_mul_add_words,#function | ||
268 | .size bn_mul_add_words,(.-bn_mul_add_words) | ||
269 | |||
270 | .align 32 | ||
271 | |||
272 | .global bn_mul_words | ||
273 | /* | ||
274 | * BN_ULONG bn_mul_words(rp,ap,num,w) | ||
275 | * BN_ULONG *rp,*ap; | ||
276 | * int num; | ||
277 | * BN_ULONG w; | ||
278 | */ | ||
/*
 * Word-by-scalar multiply: for each of the num 32-bit words,
 *	rp[i] = low32(ap[i]*w + carry), carry = high part,
 * and the final carry is returned in %o0. Unlike bn_mul_add_words the
 * previous contents of rp are never read. Returns 0 when num <= 0.
 *
 * Leaf routine. Register roles:
 *	%o0 = rp, %o1 = ap, %o2 = words remaining, %o3 = w,
 *	%o5 = running carry, %o4 = 64-bit product+carry,
 *	%g2/%g3 = current/prefetched ap words.
 */
279 | bn_mul_words: | ||
280 | sra %o2,%g0,%o2 ! signx %o2 | ||
/* Annulled delay slot: ap[0] is only preloaded when num > 0. */
281 | brgz,a %o2,.L_bn_mul_words_proceeed | ||
282 | lduw [%o1],%g2 | ||
283 | retl | ||
284 | clr %o0 | ||
/* Padding; the loop label below is annotated as 32-byte aligned. */
285 | nop | ||
286 | nop | ||
287 | nop | ||
288 | |||
289 | .L_bn_mul_words_proceeed: | ||
290 | srl %o3,%g0,%o3 ! clruw %o3 | ||
/* Fewer than 4 words? Go straight to the tail. carry = 0 either way. */
291 | andcc %o2,-4,%g0 | ||
292 | bz,pn %icc,.L_bn_mul_words_tail | ||
293 | clr %o5 | ||
294 | |||
/* Main loop, 4 words per iteration, next ap word prefetched each step. */
295 | .L_bn_mul_words_loop: ! wow! 32 aligned! | ||
296 | lduw [%o1+4],%g3 | ||
297 | mulx %o3,%g2,%g2 | ||
298 | add %g2,%o5,%o4 | ||
299 | nop | ||
300 | stuw %o4,[%o0] | ||
301 | srlx %o4,32,%o5 | ||
302 | |||
303 | lduw [%o1+8],%g2 | ||
304 | mulx %o3,%g3,%g3 | ||
305 | add %g3,%o5,%o4 | ||
306 | dec 4,%o2 | ||
307 | stuw %o4,[%o0+4] | ||
308 | srlx %o4,32,%o5 | ||
309 | |||
310 | lduw [%o1+12],%g3 | ||
311 | mulx %o3,%g2,%g2 | ||
312 | add %g2,%o5,%o4 | ||
313 | inc 16,%o1 | ||
314 | stuw %o4,[%o0+8] | ||
315 | srlx %o4,32,%o5 | ||
316 | |||
317 | mulx %o3,%g3,%g3 | ||
318 | add %g3,%o5,%o4 | ||
319 | inc 16,%o0 | ||
/* rp was already advanced by 16, hence the -4 offset for word 3. */
320 | stuw %o4,[%o0-4] | ||
321 | srlx %o4,32,%o5 | ||
322 | andcc %o2,-4,%g0 | ||
323 | bnz,a,pt %icc,.L_bn_mul_words_loop | ||
324 | lduw [%o1],%g2 | ||
325 | nop | ||
326 | nop | ||
327 | |||
/* 1..3 words left: handle them in the tail, else return the carry. */
328 | brnz,a,pn %o2,.L_bn_mul_words_tail | ||
329 | lduw [%o1],%g2 | ||
330 | .L_bn_mul_words_return: | ||
331 | retl | ||
332 | mov %o5,%o0 | ||
333 | |||
/*
 * Tail: up to three remaining words, with an early return after each
 * one. Entered with %g2 = ap[0] already loaded and %o5 = carry.
 */
334 | .L_bn_mul_words_tail: | ||
335 | mulx %o3,%g2,%g2 | ||
336 | add %g2,%o5,%o4 | ||
337 | dec %o2 | ||
338 | srlx %o4,32,%o5 | ||
339 | brz,pt %o2,.L_bn_mul_words_return | ||
340 | stuw %o4,[%o0] | ||
341 | |||
342 | lduw [%o1+4],%g2 | ||
343 | mulx %o3,%g2,%g2 | ||
344 | add %g2,%o5,%o4 | ||
345 | dec %o2 | ||
346 | srlx %o4,32,%o5 | ||
347 | brz,pt %o2,.L_bn_mul_words_return | ||
348 | stuw %o4,[%o0+4] | ||
349 | |||
350 | lduw [%o1+8],%g2 | ||
351 | mulx %o3,%g2,%g2 | ||
352 | add %g2,%o5,%o4 | ||
353 | stuw %o4,[%o0+8] | ||
/* Third tail word: the carry goes directly into the return register. */
354 | retl | ||
355 | srlx %o4,32,%o0 | ||
356 | |||
357 | .type bn_mul_words,#function | ||
358 | .size bn_mul_words,(.-bn_mul_words) | ||
359 | |||
360 | .align 32 | ||
361 | .global bn_sqr_words | ||
362 | /* | ||
363 | * void bn_sqr_words(r,a,n) | ||
364 | * BN_ULONG *r,*a; | ||
365 | * int n; | ||
366 | */ | ||
/*
 * Word-wise squaring: for each of the n 32-bit input words the 64-bit
 * square a[i]*a[i] is stored as two words, low half at r[2*i] and high
 * half at r[2*i+1]. There is no carry between words. Nothing is stored
 * when n <= 0. %o0 is cleared on every return path.
 *
 * Leaf routine. Register roles:
 *	%o0 = r, %o1 = a, %o2 = words remaining,
 *	%g2/%g3 = current/prefetched a words,
 *	%o4 = 64-bit square, %o5 = its upper 32 bits.
 */
367 | bn_sqr_words: | ||
368 | sra %o2,%g0,%o2 ! signx %o2 | ||
/* Annulled delay slot: a[0] is only preloaded when n > 0. */
369 | brgz,a %o2,.L_bn_sqr_words_proceeed | ||
370 | lduw [%o1],%g2 | ||
371 | retl | ||
372 | clr %o0 | ||
/* Padding; the loop label below is annotated as 32-byte aligned. */
373 | nop | ||
374 | nop | ||
375 | nop | ||
376 | |||
377 | .L_bn_sqr_words_proceeed: | ||
/* Fewer than 4 words? Go straight to the tail. */
378 | andcc %o2,-4,%g0 | ||
379 | nop | ||
380 | bz,pn %icc,.L_bn_sqr_words_tail | ||
381 | nop | ||
382 | |||
/*
 * Main loop: 4 input words -> 8 output words per iteration, so the
 * input pointer advances by 16 bytes and the output pointer by 32.
 */
383 | .L_bn_sqr_words_loop: ! wow! 32 aligned! | ||
384 | lduw [%o1+4],%g3 | ||
385 | mulx %g2,%g2,%o4 | ||
386 | stuw %o4,[%o0] | ||
387 | srlx %o4,32,%o5 | ||
388 | stuw %o5,[%o0+4] | ||
389 | nop | ||
390 | |||
391 | lduw [%o1+8],%g2 | ||
392 | mulx %g3,%g3,%o4 | ||
393 | dec 4,%o2 | ||
394 | stuw %o4,[%o0+8] | ||
395 | srlx %o4,32,%o5 | ||
396 | stuw %o5,[%o0+12] | ||
397 | |||
398 | lduw [%o1+12],%g3 | ||
399 | mulx %g2,%g2,%o4 | ||
400 | srlx %o4,32,%o5 | ||
401 | stuw %o4,[%o0+16] | ||
402 | inc 16,%o1 | ||
403 | stuw %o5,[%o0+20] | ||
404 | |||
405 | mulx %g3,%g3,%o4 | ||
406 | inc 32,%o0 | ||
407 | stuw %o4,[%o0-8] | ||
408 | srlx %o4,32,%o5 | ||
/*
 * This andcc writes its result to %g2 rather than %g0. That is
 * harmless: on both paths that go on to use %g2 it is reloaded from
 * [%o1] in a branch delay slot first.
 */
409 | andcc %o2,-4,%g2 | ||
410 | stuw %o5,[%o0-4] | ||
411 | bnz,a,pt %icc,.L_bn_sqr_words_loop | ||
412 | lduw [%o1],%g2 | ||
413 | nop | ||
414 | |||
/* 1..3 words left: handle them in the tail, else return. */
415 | brnz,a,pn %o2,.L_bn_sqr_words_tail | ||
416 | lduw [%o1],%g2 | ||
417 | .L_bn_sqr_words_return: | ||
418 | retl | ||
419 | clr %o0 | ||
420 | |||
/*
 * Tail: up to three remaining words, with an early return after each
 * one. Entered with %g2 = a[0] already loaded.
 */
421 | .L_bn_sqr_words_tail: | ||
422 | mulx %g2,%g2,%o4 | ||
423 | dec %o2 | ||
424 | stuw %o4,[%o0] | ||
425 | srlx %o4,32,%o5 | ||
426 | brz,pt %o2,.L_bn_sqr_words_return | ||
427 | stuw %o5,[%o0+4] | ||
428 | |||
429 | lduw [%o1+4],%g2 | ||
430 | mulx %g2,%g2,%o4 | ||
431 | dec %o2 | ||
432 | stuw %o4,[%o0+8] | ||
433 | srlx %o4,32,%o5 | ||
434 | brz,pt %o2,.L_bn_sqr_words_return | ||
435 | stuw %o5,[%o0+12] | ||
436 | |||
437 | lduw [%o1+8],%g2 | ||
438 | mulx %g2,%g2,%o4 | ||
439 | srlx %o4,32,%o5 | ||
440 | stuw %o4,[%o0+16] | ||
441 | stuw %o5,[%o0+20] | ||
442 | retl | ||
443 | clr %o0 | ||
444 | |||
445 | .type bn_sqr_words,#function | ||
446 | .size bn_sqr_words,(.-bn_sqr_words) | ||
447 | |||
.align	32
.global bn_div_words
/*
 * BN_ULONG bn_div_words(h,l,d)
 * BN_ULONG h,l,d;
 *
 * Divides the 64-bit value (h<<32 | l) by the 32-bit divisor d with a
 * single udivx and returns the low 32 bits of the quotient in %o0.
 * A quotient that does not fit in 32 bits is truncated.
 *
 * Leaf routine: %o0 = h, %o1 = l, %o2 = d.
 *
 * l and d are 32-bit quantities that take part in 64-bit operations
 * here, so their upper halves are cleared first, exactly as
 * bn_mul_add_words and bn_mul_words do for their 32-bit w argument.
 * Without this, stale upper bits in %o1 would be OR-ed into the
 * dividend's high word and stale upper bits in %o2 would change the
 * divisor. h needs no such treatment because the sllx by 32 discards
 * its upper half.
 */
bn_div_words:
	srl	%o1,%g0,%o1	! clruw	%o1
	sllx	%o0,32,%o0
	srl	%o2,%g0,%o2	! clruw	%o2
	or	%o0,%o1,%o0
	udivx	%o0,%o2,%o0
	retl
	srl	%o0,%g0,%o0	! clruw	%o0

.type	bn_div_words,#function
.size	bn_div_words,(.-bn_div_words)
463 | |||
464 | .align 32 | ||
465 | |||
466 | .global bn_add_words | ||
467 | /* | ||
468 | * BN_ULONG bn_add_words(rp,ap,bp,n) | ||
469 | * BN_ULONG *rp,*ap,*bp; | ||
470 | * int n; | ||
471 | */ | ||
/*
 * Multi-word addition: rp[i] = ap[i] + bp[i] + carry for n 32-bit
 * words. Returns 1 in %o0 if the last addition carried out, else 0.
 * Returns 0 when n <= 0.
 *
 * Leaf routine. Register roles:
 *	%o0 = rp, %o1 = ap, %o2 = bp, %o3 = words remaining,
 *	%o4/%o5, %g1/%g2, %g3/%g4 = operand pairs.
 *
 * The carry lives in the %icc carry flag for the whole routine, so
 * between the initial "clear carry" and the final movcs only the
 * addccc instructions set condition codes. Loop control deliberately
 * uses non-cc forms (and, dec, inc) plus register-value branches
 * (brnz/brz) so that it cannot disturb the carry.
 */
472 | bn_add_words: | ||
473 | sra %o3,%g0,%o3 ! signx %o3 | ||
/* Annulled delay slot: ap[0] is only preloaded when n > 0. */
474 | brgz,a %o3,.L_bn_add_words_proceed | ||
475 | lduw [%o1],%o4 | ||
476 | retl | ||
477 | clr %o0 | ||
478 | |||
479 | .L_bn_add_words_proceed: | ||
/*
 * Fewer than 4 words? Go to the tail. The delay slot is not annulled,
 * so the carry flag is cleared on both paths.
 */
480 | andcc %o3,-4,%g0 | ||
481 | bz,pn %icc,.L_bn_add_words_tail | ||
482 | addcc %g0,0,%g0 ! clear carry flag | ||
483 | |||
/* Main loop, 4 words per iteration; loads are issued ahead of use. */
484 | .L_bn_add_words_loop: ! wow! 32 aligned! | ||
485 | dec 4,%o3 | ||
486 | lduw [%o2],%o5 | ||
487 | lduw [%o1+4],%g1 | ||
488 | lduw [%o2+4],%g2 | ||
489 | lduw [%o1+8],%g3 | ||
490 | lduw [%o2+8],%g4 | ||
491 | addccc %o5,%o4,%o5 | ||
492 | stuw %o5,[%o0] | ||
493 | |||
494 | lduw [%o1+12],%o4 | ||
495 | lduw [%o2+12],%o5 | ||
496 | inc 16,%o1 | ||
497 | addccc %g1,%g2,%g1 | ||
498 | stuw %g1,[%o0+4] | ||
499 | |||
500 | inc 16,%o2 | ||
501 | addccc %g3,%g4,%g3 | ||
502 | stuw %g3,[%o0+8] | ||
503 | |||
504 | inc 16,%o0 | ||
505 | addccc %o5,%o4,%o5 | ||
/* rp was already advanced by 16, hence the -4 offset for word 3. */
506 | stuw %o5,[%o0-4] | ||
/* `and`, not `andcc`: the carry flag must survive the loop test. */
507 | and %o3,-4,%g1 | ||
508 | brnz,a,pt %g1,.L_bn_add_words_loop | ||
509 | lduw [%o1],%o4 | ||
510 | |||
/* 1..3 words left: handle them in the tail, else return the carry. */
511 | brnz,a,pn %o3,.L_bn_add_words_tail | ||
512 | lduw [%o1],%o4 | ||
513 | .L_bn_add_words_return: | ||
/* Result = carry flag: %o0 = 0, then overwritten with 1 if carry set. */
514 | clr %o0 | ||
515 | retl | ||
516 | movcs %icc,1,%o0 | ||
517 | nop | ||
518 | |||
/*
 * Tail: up to three remaining words, with an early return after each
 * one. Entered with %o4 = ap[0] already loaded and the carry flag live.
 */
519 | .L_bn_add_words_tail: | ||
520 | lduw [%o2],%o5 | ||
521 | dec %o3 | ||
522 | addccc %o5,%o4,%o5 | ||
523 | brz,pt %o3,.L_bn_add_words_return | ||
524 | stuw %o5,[%o0] | ||
525 | |||
526 | lduw [%o1+4],%o4 | ||
527 | lduw [%o2+4],%o5 | ||
528 | dec %o3 | ||
529 | addccc %o5,%o4,%o5 | ||
530 | brz,pt %o3,.L_bn_add_words_return | ||
531 | stuw %o5,[%o0+4] | ||
532 | |||
533 | lduw [%o1+8],%o4 | ||
534 | lduw [%o2+8],%o5 | ||
535 | addccc %o5,%o4,%o5 | ||
536 | stuw %o5,[%o0+8] | ||
537 | clr %o0 | ||
538 | retl | ||
539 | movcs %icc,1,%o0 | ||
540 | |||
541 | .type bn_add_words,#function | ||
542 | .size bn_add_words,(.-bn_add_words) | ||
543 | |||
544 | .global bn_sub_words | ||
545 | /* | ||
546 | * BN_ULONG bn_sub_words(rp,ap,bp,n) | ||
547 | * BN_ULONG *rp,*ap,*bp; | ||
548 | * int n; | ||
549 | */ | ||
/*
 * Multi-word subtraction: rp[i] = ap[i] - bp[i] - borrow for n 32-bit
 * words. Returns 1 in %o0 if the last subtraction borrowed, else 0.
 * Returns 0 when n <= 0.
 *
 * Leaf routine. Register roles:
 *	%o0 = rp, %o1 = ap, %o2 = bp, %o3 = words remaining,
 *	%o4/%o5, %g1/%g2, %g3/%g4 = minuend/subtrahend pairs.
 *
 * The borrow lives in the %icc carry flag for the whole routine, so
 * between the initial "clear carry" and the final movcs only the
 * subccc instructions set condition codes. Loop control deliberately
 * uses non-cc forms (and, dec, inc) plus register-value branches
 * (brnz/brz) so that it cannot disturb the flag.
 */
550 | bn_sub_words: | ||
551 | sra %o3,%g0,%o3 ! signx %o3 | ||
/* Annulled delay slot: ap[0] is only preloaded when n > 0. */
552 | brgz,a %o3,.L_bn_sub_words_proceed | ||
553 | lduw [%o1],%o4 | ||
554 | retl | ||
555 | clr %o0 | ||
556 | |||
557 | .L_bn_sub_words_proceed: | ||
/*
 * Fewer than 4 words? Go to the tail. The delay slot is not annulled,
 * so the carry (borrow) flag is cleared on both paths.
 */
558 | andcc %o3,-4,%g0 | ||
559 | bz,pn %icc,.L_bn_sub_words_tail | ||
560 | addcc %g0,0,%g0 ! clear carry flag | ||
561 | |||
/* Main loop, 4 words per iteration; loads are issued ahead of use. */
562 | .L_bn_sub_words_loop: ! wow! 32 aligned! | ||
563 | dec 4,%o3 | ||
564 | lduw [%o2],%o5 | ||
565 | lduw [%o1+4],%g1 | ||
566 | lduw [%o2+4],%g2 | ||
567 | lduw [%o1+8],%g3 | ||
568 | lduw [%o2+8],%g4 | ||
569 | subccc %o4,%o5,%o5 | ||
570 | stuw %o5,[%o0] | ||
571 | |||
572 | lduw [%o1+12],%o4 | ||
573 | lduw [%o2+12],%o5 | ||
574 | inc 16,%o1 | ||
575 | subccc %g1,%g2,%g2 | ||
576 | stuw %g2,[%o0+4] | ||
577 | |||
578 | inc 16,%o2 | ||
579 | subccc %g3,%g4,%g4 | ||
580 | stuw %g4,[%o0+8] | ||
581 | |||
582 | inc 16,%o0 | ||
583 | subccc %o4,%o5,%o5 | ||
/* rp was already advanced by 16, hence the -4 offset for word 3. */
584 | stuw %o5,[%o0-4] | ||
/* `and`, not `andcc`: the borrow flag must survive the loop test. */
585 | and %o3,-4,%g1 | ||
586 | brnz,a,pt %g1,.L_bn_sub_words_loop | ||
587 | lduw [%o1],%o4 | ||
588 | |||
/* 1..3 words left: handle them in the tail, else return the borrow. */
589 | brnz,a,pn %o3,.L_bn_sub_words_tail | ||
590 | lduw [%o1],%o4 | ||
591 | .L_bn_sub_words_return: | ||
/* Result = borrow flag: %o0 = 0, then overwritten with 1 if carry set. */
592 | clr %o0 | ||
593 | retl | ||
594 | movcs %icc,1,%o0 | ||
595 | nop | ||
596 | |||
/*
 * Tail: up to three remaining words, with an early return after each
 * one. Entered with %o4 = ap[0] already loaded and the borrow flag live.
 */
597 | .L_bn_sub_words_tail: ! wow! 32 aligned! | ||
598 | lduw [%o2],%o5 | ||
599 | dec %o3 | ||
600 | subccc %o4,%o5,%o5 | ||
601 | brz,pt %o3,.L_bn_sub_words_return | ||
602 | stuw %o5,[%o0] | ||
603 | |||
604 | lduw [%o1+4],%o4 | ||
605 | lduw [%o2+4],%o5 | ||
606 | dec %o3 | ||
607 | subccc %o4,%o5,%o5 | ||
608 | brz,pt %o3,.L_bn_sub_words_return | ||
609 | stuw %o5,[%o0+4] | ||
610 | |||
611 | lduw [%o1+8],%o4 | ||
612 | lduw [%o2+8],%o5 | ||
613 | subccc %o4,%o5,%o5 | ||
614 | stuw %o5,[%o0+8] | ||
615 | clr %o0 | ||
616 | retl | ||
617 | movcs %icc,1,%o0 | ||
618 | |||
619 | .type bn_sub_words,#function | ||
620 | .size bn_sub_words,(.-bn_sub_words) | ||
621 | |||
622 | /* | ||
623 | * Code below depends on the fact that upper parts of the %l0-%l7 | ||
624 | * and %i0-%i7 are zeroed by kernel after context switch. In | ||
625 | * previous versions this comment stated that "the trouble is that | ||
626 | * it's not feasible to implement the mumbo-jumbo in less V9 | ||
627 | * instructions:-(" which apparently isn't true thanks to | ||
628 | * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement | ||
629 | * results not from the shorter code, but from elimination of | ||
630 | * multicycle non-pairable 'rd %y,%rd' instructions. | ||
631 | * | ||
632 | * Andy. | ||
633 | */ | ||
634 | |||
635 | /* | ||
636 | * Here is register usage map for *all* routines below. | ||
637 | */ | ||
/*
 * As used by bn_mul_comba8:
 *	t_1  - scratch: the current 64-bit mulx product, then the column
 *	       sum whose low word is stored to r[];
 *	t_2  - the constant 1<<32 (built with mov 1 / sllx 32);
 *	c_12 - 64-bit column accumulator;
 *	c_3  - collects carries out of bit 63 of c_12, added in as t_2
 *	       each time, and OR-ed back into c_12 after the shift that
 *	       ends a column.
 */
638 | #define t_1 %o0 | ||
639 | #define t_2 %o1 | ||
640 | #define c_12 %o2 | ||
641 | #define c_3 %o3 | ||
642 | |||
/*
 * Memory operands for 32-bit word I of a, b and r. The pointers are
 * read from %i1, %i2 and %i0, i.e. the incoming arguments as seen
 * after `save`.
 */
643 | #define ap(I) [%i1+4*I] | ||
644 | #define bp(I) [%i2+4*I] | ||
645 | #define rp(I) [%i0+4*I] | ||
646 | |||
/* Registers caching the input words a[0..7]. */
647 | #define a_0 %l0 | ||
648 | #define a_1 %l1 | ||
649 | #define a_2 %l2 | ||
650 | #define a_3 %l3 | ||
651 | #define a_4 %l4 | ||
652 | #define a_5 %l5 | ||
653 | #define a_6 %l6 | ||
654 | #define a_7 %l7 | ||
655 | |||
/* Registers caching the input words b[0..7]. */
656 | #define b_0 %i3 | ||
657 | #define b_1 %i4 | ||
658 | #define b_2 %i5 | ||
659 | #define b_3 %o4 | ||
660 | #define b_4 %o5 | ||
661 | #define b_5 %o7 | ||
662 | #define b_6 %g1 | ||
663 | #define b_7 %g4 | ||
664 | |||
665 | .align 32 | ||
666 | .global bn_mul_comba8 | ||
667 | /* | ||
668 | * void bn_mul_comba8(r,a,b) | ||
669 | * BN_ULONG *r,*a,*b; | ||
670 | */ | ||
/*
 * Fully unrolled 8x8-word multiplication: reads a[0..7] and b[0..7]
 * and stores the 16-word product to r[0..15], one result column at a
 * time (column k sums every a[i]*b[j] with i+j == k).
 *
 * Opens a register window with `save`, so r, a and b are addressed
 * through %i0, %i1 and %i2 (see the ap/bp/rp macros). %o0 is zeroed by
 * the final `restore %g0,%g0,%o0`.
 *
 * Per product the pattern is:
 *	mulx   a_i,b_j,t_1		64-bit product
 *	addcc  c_12,t_1,c_12		accumulate, setting %xcc carry
 *	bcs,a  %xcc,.+8			on carry out of bit 63 ...
 *	add    c_3,t_2,c_3		... add 1<<32 to c_3
 * The branch is annulled, so its delay-slot add executes only when the
 * carry is set; in both cases execution continues with the instruction
 * after the add. c_3 is cleared after the first addcc of each column.
 *
 * The last product of a column is accumulated into t_1 instead, whose
 * low word is stored to r[k]; then c_12 = (t_1 >> 32) | c_3 becomes
 * the starting value for column k+1.
 *
 * The trailing "!mul_add_c(...)" and "!r[k]=..." comments give the
 * equivalent step of the C implementation; the bare "!=" markers carry
 * no information visible in this file.
 */
671 | bn_mul_comba8: | ||
672 | save %sp,FRAME_SIZE,%sp | ||
673 | mov 1,t_2 | ||
674 | lduw ap(0),a_0 | ||
675 | sllx t_2,32,t_2 | ||
676 | lduw bp(0),b_0 != | ||
677 | lduw bp(1),b_1 | ||
678 | mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3); | ||
679 | srlx t_1,32,c_12 | ||
680 | stuw t_1,rp(0) !=!r[0]=c1; | ||
681 | |||
682 | lduw ap(1),a_1 | ||
683 | mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1); | ||
684 | addcc c_12,t_1,c_12 | ||
685 | clr c_3 != | ||
686 | bcs,a %xcc,.+8 | ||
687 | add c_3,t_2,c_3 | ||
688 | lduw ap(2),a_2 | ||
689 | mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1); | ||
690 | addcc c_12,t_1,t_1 | ||
691 | bcs,a %xcc,.+8 | ||
692 | add c_3,t_2,c_3 | ||
693 | srlx t_1,32,c_12 != | ||
694 | stuw t_1,rp(1) !r[1]=c2; | ||
695 | or c_12,c_3,c_12 | ||
696 | |||
697 | mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2); | ||
698 | addcc c_12,t_1,c_12 != | ||
699 | clr c_3 | ||
700 | bcs,a %xcc,.+8 | ||
701 | add c_3,t_2,c_3 | ||
702 | lduw bp(2),b_2 != | ||
703 | mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2); | ||
704 | addcc c_12,t_1,c_12 | ||
705 | bcs,a %xcc,.+8 | ||
706 | add c_3,t_2,c_3 != | ||
707 | lduw bp(3),b_3 | ||
708 | mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2); | ||
709 | addcc c_12,t_1,t_1 | ||
710 | bcs,a %xcc,.+8 != | ||
711 | add c_3,t_2,c_3 | ||
712 | srlx t_1,32,c_12 | ||
713 | stuw t_1,rp(2) !r[2]=c3; | ||
714 | or c_12,c_3,c_12 != | ||
715 | |||
716 | mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3); | ||
717 | addcc c_12,t_1,c_12 | ||
718 | clr c_3 | ||
719 | bcs,a %xcc,.+8 != | ||
720 | add c_3,t_2,c_3 | ||
721 | mulx a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3); | ||
722 | addcc c_12,t_1,c_12 | ||
723 | bcs,a %xcc,.+8 != | ||
724 | add c_3,t_2,c_3 | ||
725 | lduw ap(3),a_3 | ||
726 | mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3); | ||
727 | addcc c_12,t_1,c_12 != | ||
728 | bcs,a %xcc,.+8 | ||
729 | add c_3,t_2,c_3 | ||
730 | lduw ap(4),a_4 | ||
731 | mulx a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);!= | ||
732 | addcc c_12,t_1,t_1 | ||
733 | bcs,a %xcc,.+8 | ||
734 | add c_3,t_2,c_3 | ||
735 | srlx t_1,32,c_12 != | ||
736 | stuw t_1,rp(3) !r[3]=c1; | ||
737 | or c_12,c_3,c_12 | ||
738 | |||
739 | mulx a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1); | ||
740 | addcc c_12,t_1,c_12 != | ||
741 | clr c_3 | ||
742 | bcs,a %xcc,.+8 | ||
743 | add c_3,t_2,c_3 | ||
744 | mulx a_3,b_1,t_1 !=!mul_add_c(a[3],b[1],c2,c3,c1); | ||
745 | addcc c_12,t_1,c_12 | ||
746 | bcs,a %xcc,.+8 | ||
747 | add c_3,t_2,c_3 | ||
748 | mulx a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1); | ||
749 | addcc c_12,t_1,c_12 | ||
750 | bcs,a %xcc,.+8 | ||
751 | add c_3,t_2,c_3 | ||
752 | lduw bp(4),b_4 != | ||
753 | mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1); | ||
754 | addcc c_12,t_1,c_12 | ||
755 | bcs,a %xcc,.+8 | ||
756 | add c_3,t_2,c_3 != | ||
757 | lduw bp(5),b_5 | ||
758 | mulx a_0,b_4,t_1 !mul_add_c(a[0],b[4],c2,c3,c1); | ||
759 | addcc c_12,t_1,t_1 | ||
760 | bcs,a %xcc,.+8 != | ||
761 | add c_3,t_2,c_3 | ||
762 | srlx t_1,32,c_12 | ||
763 | stuw t_1,rp(4) !r[4]=c2; | ||
764 | or c_12,c_3,c_12 != | ||
765 | |||
766 | mulx a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2); | ||
767 | addcc c_12,t_1,c_12 | ||
768 | clr c_3 | ||
769 | bcs,a %xcc,.+8 != | ||
770 | add c_3,t_2,c_3 | ||
771 | mulx a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2); | ||
772 | addcc c_12,t_1,c_12 | ||
773 | bcs,a %xcc,.+8 != | ||
774 | add c_3,t_2,c_3 | ||
775 | mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2); | ||
776 | addcc c_12,t_1,c_12 | ||
777 | bcs,a %xcc,.+8 != | ||
778 | add c_3,t_2,c_3 | ||
779 | mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2); | ||
780 | addcc c_12,t_1,c_12 | ||
781 | bcs,a %xcc,.+8 != | ||
782 | add c_3,t_2,c_3 | ||
783 | lduw ap(5),a_5 | ||
784 | mulx a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2); | ||
785 | addcc c_12,t_1,c_12 != | ||
786 | bcs,a %xcc,.+8 | ||
787 | add c_3,t_2,c_3 | ||
788 | lduw ap(6),a_6 | ||
789 | mulx a_5,b_0,t_1 !=!mul_add_c(a[5],b[0],c3,c1,c2); | ||
790 | addcc c_12,t_1,t_1 | ||
791 | bcs,a %xcc,.+8 | ||
792 | add c_3,t_2,c_3 | ||
793 | srlx t_1,32,c_12 != | ||
794 | stuw t_1,rp(5) !r[5]=c3; | ||
795 | or c_12,c_3,c_12 | ||
796 | |||
797 | mulx a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3); | ||
798 | addcc c_12,t_1,c_12 != | ||
799 | clr c_3 | ||
800 | bcs,a %xcc,.+8 | ||
801 | add c_3,t_2,c_3 | ||
802 | mulx a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3); | ||
803 | addcc c_12,t_1,c_12 | ||
804 | bcs,a %xcc,.+8 | ||
805 | add c_3,t_2,c_3 | ||
806 | mulx a_4,b_2,t_1 !=!mul_add_c(a[4],b[2],c1,c2,c3); | ||
807 | addcc c_12,t_1,c_12 | ||
808 | bcs,a %xcc,.+8 | ||
809 | add c_3,t_2,c_3 | ||
810 | mulx a_3,b_3,t_1 !=!mul_add_c(a[3],b[3],c1,c2,c3); | ||
811 | addcc c_12,t_1,c_12 | ||
812 | bcs,a %xcc,.+8 | ||
813 | add c_3,t_2,c_3 | ||
814 | mulx a_2,b_4,t_1 !=!mul_add_c(a[2],b[4],c1,c2,c3); | ||
815 | addcc c_12,t_1,c_12 | ||
816 | bcs,a %xcc,.+8 | ||
817 | add c_3,t_2,c_3 | ||
818 | lduw bp(6),b_6 != | ||
819 | mulx a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3); | ||
820 | addcc c_12,t_1,c_12 | ||
821 | bcs,a %xcc,.+8 | ||
822 | add c_3,t_2,c_3 != | ||
823 | lduw bp(7),b_7 | ||
824 | mulx a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3); | ||
825 | addcc c_12,t_1,t_1 | ||
826 | bcs,a %xcc,.+8 != | ||
827 | add c_3,t_2,c_3 | ||
828 | srlx t_1,32,c_12 | ||
829 | stuw t_1,rp(6) !r[6]=c1; | ||
830 | or c_12,c_3,c_12 != | ||
831 | |||
832 | mulx a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1); | ||
833 | addcc c_12,t_1,c_12 | ||
834 | clr c_3 | ||
835 | bcs,a %xcc,.+8 != | ||
836 | add c_3,t_2,c_3 | ||
837 | mulx a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1); | ||
838 | addcc c_12,t_1,c_12 | ||
839 | bcs,a %xcc,.+8 != | ||
840 | add c_3,t_2,c_3 | ||
841 | mulx a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1); | ||
842 | addcc c_12,t_1,c_12 | ||
843 | bcs,a %xcc,.+8 != | ||
844 | add c_3,t_2,c_3 | ||
845 | mulx a_3,b_4,t_1 !mul_add_c(a[3],b[4],c2,c3,c1); | ||
846 | addcc c_12,t_1,c_12 | ||
847 | bcs,a %xcc,.+8 != | ||
848 | add c_3,t_2,c_3 | ||
849 | mulx a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1); | ||
850 | addcc c_12,t_1,c_12 | ||
851 | bcs,a %xcc,.+8 != | ||
852 | add c_3,t_2,c_3 | ||
853 | mulx a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1); | ||
854 | addcc c_12,t_1,c_12 | ||
855 | bcs,a %xcc,.+8 != | ||
856 | add c_3,t_2,c_3 | ||
857 | lduw ap(7),a_7 | ||
858 | mulx a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1); | ||
859 | addcc c_12,t_1,c_12 | ||
860 | bcs,a %xcc,.+8 | ||
861 | add c_3,t_2,c_3 | ||
862 | mulx a_7,b_0,t_1 !=!mul_add_c(a[7],b[0],c2,c3,c1); | ||
863 | addcc c_12,t_1,t_1 | ||
864 | bcs,a %xcc,.+8 | ||
865 | add c_3,t_2,c_3 | ||
866 | srlx t_1,32,c_12 != | ||
867 | stuw t_1,rp(7) !r[7]=c2; | ||
868 | or c_12,c_3,c_12 | ||
869 | |||
/* From here on all 16 input words are in registers; no more loads. */
870 | mulx a_7,b_1,t_1 !=!mul_add_c(a[7],b[1],c3,c1,c2); | ||
871 | addcc c_12,t_1,c_12 | ||
872 | clr c_3 | ||
873 | bcs,a %xcc,.+8 | ||
874 | add c_3,t_2,c_3 != | ||
875 | mulx a_6,b_2,t_1 !mul_add_c(a[6],b[2],c3,c1,c2); | ||
876 | addcc c_12,t_1,c_12 | ||
877 | bcs,a %xcc,.+8 | ||
878 | add c_3,t_2,c_3 != | ||
879 | mulx a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2); | ||
880 | addcc c_12,t_1,c_12 | ||
881 | bcs,a %xcc,.+8 | ||
882 | add c_3,t_2,c_3 != | ||
883 | mulx a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2); | ||
884 | addcc c_12,t_1,c_12 | ||
885 | bcs,a %xcc,.+8 | ||
886 | add c_3,t_2,c_3 != | ||
887 | mulx a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2); | ||
888 | addcc c_12,t_1,c_12 | ||
889 | bcs,a %xcc,.+8 | ||
890 | add c_3,t_2,c_3 != | ||
891 | mulx a_2,b_6,t_1 !mul_add_c(a[2],b[6],c3,c1,c2); | ||
892 | addcc c_12,t_1,c_12 | ||
893 | bcs,a %xcc,.+8 | ||
894 | add c_3,t_2,c_3 != | ||
895 | mulx a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2); | ||
896 | addcc c_12,t_1,t_1 | ||
897 | bcs,a %xcc,.+8 | ||
898 | add c_3,t_2,c_3 != | ||
899 | srlx t_1,32,c_12 | ||
900 | stuw t_1,rp(8) !r[8]=c3; | ||
901 | or c_12,c_3,c_12 | ||
902 | |||
903 | mulx a_2,b_7,t_1 !=!mul_add_c(a[2],b[7],c1,c2,c3); | ||
904 | addcc c_12,t_1,c_12 | ||
905 | clr c_3 | ||
906 | bcs,a %xcc,.+8 | ||
907 | add c_3,t_2,c_3 != | ||
908 | mulx a_3,b_6,t_1 !mul_add_c(a[3],b[6],c1,c2,c3); | ||
909 | addcc c_12,t_1,c_12 | ||
910 | bcs,a %xcc,.+8 != | ||
911 | add c_3,t_2,c_3 | ||
912 | mulx a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3); | ||
913 | addcc c_12,t_1,c_12 | ||
914 | bcs,a %xcc,.+8 != | ||
915 | add c_3,t_2,c_3 | ||
916 | mulx a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3); | ||
917 | addcc c_12,t_1,c_12 | ||
918 | bcs,a %xcc,.+8 != | ||
919 | add c_3,t_2,c_3 | ||
920 | mulx a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3); | ||
921 | addcc c_12,t_1,c_12 | ||
922 | bcs,a %xcc,.+8 != | ||
923 | add c_3,t_2,c_3 | ||
924 | mulx a_7,b_2,t_1 !mul_add_c(a[7],b[2],c1,c2,c3); | ||
925 | addcc c_12,t_1,t_1 | ||
926 | bcs,a %xcc,.+8 != | ||
927 | add c_3,t_2,c_3 | ||
928 | srlx t_1,32,c_12 | ||
929 | stuw t_1,rp(9) !r[9]=c1; | ||
930 | or c_12,c_3,c_12 != | ||
931 | |||
932 | mulx a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1); | ||
933 | addcc c_12,t_1,c_12 | ||
934 | clr c_3 | ||
935 | bcs,a %xcc,.+8 != | ||
936 | add c_3,t_2,c_3 | ||
937 | mulx a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1); | ||
938 | addcc c_12,t_1,c_12 | ||
939 | bcs,a %xcc,.+8 != | ||
940 | add c_3,t_2,c_3 | ||
941 | mulx a_5,b_5,t_1 !mul_add_c(a[5],b[5],c2,c3,c1); | ||
942 | addcc c_12,t_1,c_12 | ||
943 | bcs,a %xcc,.+8 != | ||
944 | add c_3,t_2,c_3 | ||
945 | mulx a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1); | ||
946 | addcc c_12,t_1,c_12 | ||
947 | bcs,a %xcc,.+8 != | ||
948 | add c_3,t_2,c_3 | ||
949 | mulx a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1); | ||
950 | addcc c_12,t_1,t_1 | ||
951 | bcs,a %xcc,.+8 != | ||
952 | add c_3,t_2,c_3 | ||
953 | srlx t_1,32,c_12 | ||
954 | stuw t_1,rp(10) !r[10]=c2; | ||
955 | or c_12,c_3,c_12 != | ||
956 | |||
957 | mulx a_4,b_7,t_1 !mul_add_c(a[4],b[7],c3,c1,c2); | ||
958 | addcc c_12,t_1,c_12 | ||
959 | clr c_3 | ||
960 | bcs,a %xcc,.+8 != | ||
961 | add c_3,t_2,c_3 | ||
962 | mulx a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2); | ||
963 | addcc c_12,t_1,c_12 | ||
964 | bcs,a %xcc,.+8 != | ||
965 | add c_3,t_2,c_3 | ||
966 | mulx a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2); | ||
967 | addcc c_12,t_1,c_12 | ||
968 | bcs,a %xcc,.+8 != | ||
969 | add c_3,t_2,c_3 | ||
970 | mulx a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2); | ||
971 | addcc c_12,t_1,t_1 | ||
972 | bcs,a %xcc,.+8 != | ||
973 | add c_3,t_2,c_3 | ||
974 | srlx t_1,32,c_12 | ||
975 | stuw t_1,rp(11) !r[11]=c3; | ||
976 | or c_12,c_3,c_12 != | ||
977 | |||
978 | mulx a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3); | ||
979 | addcc c_12,t_1,c_12 | ||
980 | clr c_3 | ||
981 | bcs,a %xcc,.+8 != | ||
982 | add c_3,t_2,c_3 | ||
983 | mulx a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3); | ||
984 | addcc c_12,t_1,c_12 | ||
985 | bcs,a %xcc,.+8 != | ||
986 | add c_3,t_2,c_3 | ||
987 | mulx a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3); | ||
988 | addcc c_12,t_1,t_1 | ||
989 | bcs,a %xcc,.+8 != | ||
990 | add c_3,t_2,c_3 | ||
991 | srlx t_1,32,c_12 | ||
992 | stuw t_1,rp(12) !r[12]=c1; | ||
993 | or c_12,c_3,c_12 != | ||
994 | |||
995 | mulx a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1); | ||
996 | addcc c_12,t_1,c_12 | ||
997 | clr c_3 | ||
998 | bcs,a %xcc,.+8 != | ||
999 | add c_3,t_2,c_3 | ||
1000 | mulx a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1); | ||
1001 | addcc c_12,t_1,t_1 | ||
1002 | bcs,a %xcc,.+8 != | ||
1003 | add c_3,t_2,c_3 | ||
1004 | srlx t_1,32,c_12 | ||
/*
 * Spelled `st` here where every other store uses `stuw`. Same
 * instruction: this file defines stuw as st.
 */
1005 | st t_1,rp(13) !r[13]=c2; | ||
1006 | or c_12,c_3,c_12 != | ||
1007 | |||
/*
 * Last column: a single product. No carry test follows this addcc;
 * the low word goes to r[14] and the remaining high part to r[15].
 */
1008 | mulx a_7,b_7,t_1 !mul_add_c(a[7],b[7],c3,c1,c2); | ||
1009 | addcc c_12,t_1,t_1 | ||
1010 | srlx t_1,32,c_12 != | ||
1011 | stuw t_1,rp(14) !r[14]=c3; | ||
1012 | stuw c_12,rp(15) !r[15]=c1; | ||
1013 | |||
1014 | ret | ||
1015 | restore %g0,%g0,%o0 != | ||
1016 | |||
1017 | .type bn_mul_comba8,#function | ||
1018 | .size bn_mul_comba8,(.-bn_mul_comba8) | ||
1019 | |||
1020 | .align 32 | ||
1021 | |||
1022 | .global bn_mul_comba4 | ||
1023 | /* | ||
1024 | * void bn_mul_comba4(r,a,b) | ||
1025 | * BN_ULONG *r,*a,*b; | ||
1026 | */ | ||
/*
 * Fully unrolled 4x4-word multiply: reads four 32-bit words from a and
 * from b (lduw) and writes eight 32-bit words to r (stuw).  Work is done
 * one result column at a time: column k sums every a[i]*b[j] with
 * i+j == k and then stores r[k].
 *
 * ap(i)/bp(i)/rp(i) and the a_N, b_N, t_N, c_N names are macros defined
 * outside this excerpt; per the inline comments they address a[i], b[i]
 * and r[i] -- confirm against the definitions earlier in the file.
 *
 * Working registers:
 *   t_1   64-bit product from mulx; also receives the final column sum.
 *         Operands come from lduw (zero-extended), so a product always
 *         fits in 64 bits.
 *   t_2   constant 1<<32, built by "mov 1" followed by "sllx ...,32".
 *   c_12  64-bit running accumulator for the current column.
 *   c_3   number of 64-bit overflows seen in this column, kept
 *         pre-multiplied by 1<<32.  Cleared once per column.
 *
 * Carry idiom, repeated after every accumulation:
 *       addcc  c_12,t_1,<dst>
 *       bcs,a  %xcc,.+8
 *       add    c_3,t_2,c_3
 * The branch target is the instruction after its own delay slot and
 * the annul bit is set, so the delay-slot add executes only when the
 * 64-bit addcc produced a carry.  Where "clr c_3" sits between addcc
 * and bcs it does not disturb the condition codes.
 *
 * Column finish: the last addcc of a column targets t_1.  stuw stores
 * the low 32 bits as r[k], srlx leaves the high 32 bits in c_12, and
 * "or c_12,c_3,c_12" merges the recorded overflows.  The OR acts as an
 * add because c_12 is below 1<<32 after the shift while c_3 is a
 * multiple of 1<<32.
 *
 * The caller-visible %o0 is set to 0 on return (restore %g0,%g0,%o0).
 */
1027 | bn_mul_comba4: | ||
/* Prologue and column 0: new register window, build t_2, r[0] = low(a[0]*b[0]). */
1028 | save %sp,FRAME_SIZE,%sp | ||
1029 | lduw ap(0),a_0 | ||
1030 | mov 1,t_2 | ||
1031 | lduw bp(0),b_0 | ||
1032 | sllx t_2,32,t_2 != | ||
1033 | lduw bp(1),b_1 | ||
1034 | mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3); | ||
1035 | srlx t_1,32,c_12 | ||
1036 | stuw t_1,rp(0) !=!r[0]=c1; | ||
1037 | |||
/* Column 1: a[0]*b[1] + a[1]*b[0].  Loads of later operands are interleaved. */
1038 | lduw ap(1),a_1 | ||
1039 | mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1); | ||
1040 | addcc c_12,t_1,c_12 | ||
1041 | clr c_3 != | ||
1042 | bcs,a %xcc,.+8 | ||
1043 | add c_3,t_2,c_3 | ||
1044 | lduw ap(2),a_2 | ||
1045 | mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1); | ||
1046 | addcc c_12,t_1,t_1 | ||
1047 | bcs,a %xcc,.+8 | ||
1048 | add c_3,t_2,c_3 | ||
1049 | srlx t_1,32,c_12 != | ||
1050 | stuw t_1,rp(1) !r[1]=c2; | ||
1051 | or c_12,c_3,c_12 | ||
1052 | |||
/* Column 2: a[2]*b[0] + a[1]*b[1] + a[0]*b[2]. */
1053 | mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2); | ||
1054 | addcc c_12,t_1,c_12 != | ||
1055 | clr c_3 | ||
1056 | bcs,a %xcc,.+8 | ||
1057 | add c_3,t_2,c_3 | ||
1058 | lduw bp(2),b_2 != | ||
1059 | mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2); | ||
1060 | addcc c_12,t_1,c_12 | ||
1061 | bcs,a %xcc,.+8 | ||
1062 | add c_3,t_2,c_3 != | ||
1063 | lduw bp(3),b_3 | ||
1064 | mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2); | ||
1065 | addcc c_12,t_1,t_1 | ||
1066 | bcs,a %xcc,.+8 != | ||
1067 | add c_3,t_2,c_3 | ||
1068 | srlx t_1,32,c_12 | ||
1069 | stuw t_1,rp(2) !r[2]=c3; | ||
1070 | or c_12,c_3,c_12 != | ||
1071 | |||
/* Column 3 (widest): a[0]*b[3] + a[1]*b[2] + a[2]*b[1] + a[3]*b[0]. */
1072 | mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3); | ||
1073 | addcc c_12,t_1,c_12 | ||
1074 | clr c_3 | ||
1075 | bcs,a %xcc,.+8 != | ||
1076 | add c_3,t_2,c_3 | ||
1077 | mulx a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3); | ||
1078 | addcc c_12,t_1,c_12 | ||
1079 | bcs,a %xcc,.+8 != | ||
1080 | add c_3,t_2,c_3 | ||
1081 | lduw ap(3),a_3 | ||
1082 | mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3); | ||
1083 | addcc c_12,t_1,c_12 != | ||
1084 | bcs,a %xcc,.+8 | ||
1085 | add c_3,t_2,c_3 | ||
1086 | mulx a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!= | ||
1087 | addcc c_12,t_1,t_1 != | ||
1088 | bcs,a %xcc,.+8 | ||
1089 | add c_3,t_2,c_3 | ||
1090 | srlx t_1,32,c_12 | ||
1091 | stuw t_1,rp(3) !=!r[3]=c1; | ||
1092 | or c_12,c_3,c_12 | ||
1093 | |||
/* Column 4: a[3]*b[1] + a[2]*b[2] + a[1]*b[3]. */
1094 | mulx a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1); | ||
1095 | addcc c_12,t_1,c_12 | ||
1096 | clr c_3 != | ||
1097 | bcs,a %xcc,.+8 | ||
1098 | add c_3,t_2,c_3 | ||
1099 | mulx a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1); | ||
1100 | addcc c_12,t_1,c_12 != | ||
1101 | bcs,a %xcc,.+8 | ||
1102 | add c_3,t_2,c_3 | ||
1103 | mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1); | ||
1104 | addcc c_12,t_1,t_1 != | ||
1105 | bcs,a %xcc,.+8 | ||
1106 | add c_3,t_2,c_3 | ||
1107 | srlx t_1,32,c_12 | ||
1108 | stuw t_1,rp(4) !=!r[4]=c2; | ||
1109 | or c_12,c_3,c_12 | ||
1110 | |||
/* Column 5: a[2]*b[3] + a[3]*b[2]. */
1111 | mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2); | ||
1112 | addcc c_12,t_1,c_12 | ||
1113 | clr c_3 != | ||
1114 | bcs,a %xcc,.+8 | ||
1115 | add c_3,t_2,c_3 | ||
1116 | mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2); | ||
1117 | addcc c_12,t_1,t_1 != | ||
1118 | bcs,a %xcc,.+8 | ||
1119 | add c_3,t_2,c_3 | ||
1120 | srlx t_1,32,c_12 | ||
1121 | stuw t_1,rp(5) !=!r[5]=c3; | ||
1122 | or c_12,c_3,c_12 | ||
1123 | |||
/*
 * Column 6 and the top word: a single product, and no carry branch
 * follows this addcc.  The low half of the sum is r[6], the high half
 * is r[7].
 */
1124 | mulx a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3); | ||
1125 | addcc c_12,t_1,t_1 | ||
1126 | srlx t_1,32,c_12 != | ||
1127 | stuw t_1,rp(6) !r[6]=c1; | ||
1128 | stuw c_12,rp(7) !r[7]=c2; | ||
1129 | |||
/* Epilogue: restore sits in the delay slot of ret and zeroes the caller-visible %o0. */
1130 | ret | ||
1131 | restore %g0,%g0,%o0 | ||
1132 | |||
1133 | .type bn_mul_comba4,#function | ||
1134 | .size bn_mul_comba4,(.-bn_mul_comba4) | ||
1135 | |||
1136 | .align 32 | ||
1137 | |||
1138 | .global bn_sqr_comba8 | ||
/*
 * void bn_sqr_comba8(r,a)
 * BN_ULONG *r,*a;
 */
/*
 * Fully unrolled 8-word squaring: reads eight 32-bit words from a (lduw)
 * and writes sixteen 32-bit words to r (stuw), one result column at a
 * time.  Column k sums every a[i]*a[j] with i+j == k.
 *
 * Symmetric cross terms (i != j) are multiplied once and accumulated
 * twice: two consecutive addcc instructions use the same t_1, each with
 * its own carry check (the sqr_add_c2 lines).  Diagonal terms a[i]*a[i]
 * are accumulated once (the sqr_add_c lines).
 *
 * ap(i)/rp(i) and the a_N, t_N, c_N names are macros defined outside
 * this excerpt; per the inline comments they address a[i] and r[i] --
 * confirm against the definitions earlier in the file.
 *
 * Working registers:
 *   t_1   64-bit product from mulx; also receives the final column sum.
 *   t_2   constant 1<<32, built by "mov 1" followed by "sllx ...,32".
 *   c_12  64-bit running accumulator for the current column.
 *   c_3   number of 64-bit overflows seen in this column, kept
 *         pre-multiplied by 1<<32.  Cleared once per column.
 *
 * Carry idiom, repeated after every accumulation:
 *       addcc  c_12,t_1,<dst>
 *       bcs,a  %xcc,.+8
 *       add    c_3,t_2,c_3
 * The branch target is the instruction after its own delay slot and
 * the annul bit is set, so the delay-slot add executes only when the
 * 64-bit addcc produced a carry.  "clr c_3" between addcc and bcs does
 * not disturb the condition codes.
 *
 * Column finish: the last addcc of a column targets t_1.  stuw stores
 * the low 32 bits as r[k], srlx leaves the high 32 bits in c_12, and
 * "or c_12,c_3,c_12" merges the recorded overflows (c_12 is below 1<<32
 * after the shift, c_3 is a multiple of 1<<32, so OR equals ADD).
 *
 * The caller-visible %o0 is set to 0 on return (restore %g0,%g0,%o0).
 */
1139 | bn_sqr_comba8: | ||
/* Prologue and column 0: new register window, build t_2, r[0] = low(a[0]^2). */
1140 | save %sp,FRAME_SIZE,%sp | ||
1141 | mov 1,t_2 | ||
1142 | lduw ap(0),a_0 | ||
1143 | sllx t_2,32,t_2 | ||
1144 | lduw ap(1),a_1 | ||
1145 | mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3); | ||
1146 | srlx t_1,32,c_12 | ||
1147 | stuw t_1,rp(0) !r[0]=c1; | ||
1148 | |||
/* Column 1: 2*a[0]*a[1]. */
1149 | lduw ap(2),a_2 | ||
1150 | mulx a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1); | ||
1151 | addcc c_12,t_1,c_12 | ||
1152 | clr c_3 | ||
1153 | bcs,a %xcc,.+8 | ||
1154 | add c_3,t_2,c_3 | ||
1155 | addcc c_12,t_1,t_1 | ||
1156 | bcs,a %xcc,.+8 | ||
1157 | add c_3,t_2,c_3 | ||
1158 | srlx t_1,32,c_12 | ||
1159 | stuw t_1,rp(1) !r[1]=c2; | ||
1160 | or c_12,c_3,c_12 | ||
1161 | |||
/* Column 2: 2*a[2]*a[0] + a[1]^2. */
1162 | mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2); | ||
1163 | addcc c_12,t_1,c_12 | ||
1164 | clr c_3 | ||
1165 | bcs,a %xcc,.+8 | ||
1166 | add c_3,t_2,c_3 | ||
1167 | addcc c_12,t_1,c_12 | ||
1168 | bcs,a %xcc,.+8 | ||
1169 | add c_3,t_2,c_3 | ||
1170 | lduw ap(3),a_3 | ||
1171 | mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2); | ||
1172 | addcc c_12,t_1,t_1 | ||
1173 | bcs,a %xcc,.+8 | ||
1174 | add c_3,t_2,c_3 | ||
1175 | srlx t_1,32,c_12 | ||
1176 | stuw t_1,rp(2) !r[2]=c3; | ||
1177 | or c_12,c_3,c_12 | ||
1178 | |||
/* Column 3: 2*a[0]*a[3] + 2*a[1]*a[2]. */
1179 | mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3); | ||
1180 | addcc c_12,t_1,c_12 | ||
1181 | clr c_3 | ||
1182 | bcs,a %xcc,.+8 | ||
1183 | add c_3,t_2,c_3 | ||
1184 | addcc c_12,t_1,c_12 | ||
1185 | bcs,a %xcc,.+8 | ||
1186 | add c_3,t_2,c_3 | ||
1187 | lduw ap(4),a_4 | ||
1188 | mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3); | ||
1189 | addcc c_12,t_1,c_12 | ||
1190 | bcs,a %xcc,.+8 | ||
1191 | add c_3,t_2,c_3 | ||
1192 | addcc c_12,t_1,t_1 | ||
1193 | bcs,a %xcc,.+8 | ||
1194 | add c_3,t_2,c_3 | ||
1195 | srlx t_1,32,c_12 | ||
1196 | stuw t_1,rp(3) !r[3]=c1; | ||
1197 | or c_12,c_3,c_12 | ||
1198 | |||
/* Column 4: 2*a[4]*a[0] + 2*a[3]*a[1] + a[2]^2. */
1199 | mulx a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1); | ||
1200 | addcc c_12,t_1,c_12 | ||
1201 | clr c_3 | ||
1202 | bcs,a %xcc,.+8 | ||
1203 | add c_3,t_2,c_3 | ||
1204 | addcc c_12,t_1,c_12 | ||
1205 | bcs,a %xcc,.+8 | ||
1206 | add c_3,t_2,c_3 | ||
1207 | mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1); | ||
1208 | addcc c_12,t_1,c_12 | ||
1209 | bcs,a %xcc,.+8 | ||
1210 | add c_3,t_2,c_3 | ||
1211 | addcc c_12,t_1,c_12 | ||
1212 | bcs,a %xcc,.+8 | ||
1213 | add c_3,t_2,c_3 | ||
1214 | lduw ap(5),a_5 | ||
1215 | mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1); | ||
1216 | addcc c_12,t_1,t_1 | ||
1217 | bcs,a %xcc,.+8 | ||
1218 | add c_3,t_2,c_3 | ||
1219 | srlx t_1,32,c_12 | ||
1220 | stuw t_1,rp(4) !r[4]=c2; | ||
1221 | or c_12,c_3,c_12 | ||
1222 | |||
/* Column 5: 2*a[0]*a[5] + 2*a[1]*a[4] + 2*a[2]*a[3]. */
1223 | mulx a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2); | ||
1224 | addcc c_12,t_1,c_12 | ||
1225 | clr c_3 | ||
1226 | bcs,a %xcc,.+8 | ||
1227 | add c_3,t_2,c_3 | ||
1228 | addcc c_12,t_1,c_12 | ||
1229 | bcs,a %xcc,.+8 | ||
1230 | add c_3,t_2,c_3 | ||
1231 | mulx a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2); | ||
1232 | addcc c_12,t_1,c_12 | ||
1233 | bcs,a %xcc,.+8 | ||
1234 | add c_3,t_2,c_3 | ||
1235 | addcc c_12,t_1,c_12 | ||
1236 | bcs,a %xcc,.+8 | ||
1237 | add c_3,t_2,c_3 | ||
1238 | lduw ap(6),a_6 | ||
1239 | mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2); | ||
1240 | addcc c_12,t_1,c_12 | ||
1241 | bcs,a %xcc,.+8 | ||
1242 | add c_3,t_2,c_3 | ||
1243 | addcc c_12,t_1,t_1 | ||
1244 | bcs,a %xcc,.+8 | ||
1245 | add c_3,t_2,c_3 | ||
1246 | srlx t_1,32,c_12 | ||
1247 | stuw t_1,rp(5) !r[5]=c3; | ||
1248 | or c_12,c_3,c_12 | ||
1249 | |||
/* Column 6: 2*a[6]*a[0] + 2*a[5]*a[1] + 2*a[4]*a[2] + a[3]^2. */
1250 | mulx a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3); | ||
1251 | addcc c_12,t_1,c_12 | ||
1252 | clr c_3 | ||
1253 | bcs,a %xcc,.+8 | ||
1254 | add c_3,t_2,c_3 | ||
1255 | addcc c_12,t_1,c_12 | ||
1256 | bcs,a %xcc,.+8 | ||
1257 | add c_3,t_2,c_3 | ||
1258 | mulx a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3); | ||
1259 | addcc c_12,t_1,c_12 | ||
1260 | bcs,a %xcc,.+8 | ||
1261 | add c_3,t_2,c_3 | ||
1262 | addcc c_12,t_1,c_12 | ||
1263 | bcs,a %xcc,.+8 | ||
1264 | add c_3,t_2,c_3 | ||
1265 | mulx a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3); | ||
1266 | addcc c_12,t_1,c_12 | ||
1267 | bcs,a %xcc,.+8 | ||
1268 | add c_3,t_2,c_3 | ||
1269 | addcc c_12,t_1,c_12 | ||
1270 | bcs,a %xcc,.+8 | ||
1271 | add c_3,t_2,c_3 | ||
1272 | lduw ap(7),a_7 | ||
1273 | mulx a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3); | ||
1274 | addcc c_12,t_1,t_1 | ||
1275 | bcs,a %xcc,.+8 | ||
1276 | add c_3,t_2,c_3 | ||
1277 | srlx t_1,32,c_12 | ||
1278 | stuw t_1,rp(6) !r[6]=c1; | ||
1279 | or c_12,c_3,c_12 | ||
1280 | |||
/* Column 7 (widest): 2*a[0]*a[7] + 2*a[1]*a[6] + 2*a[2]*a[5] + 2*a[3]*a[4]. */
1281 | mulx a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1); | ||
1282 | addcc c_12,t_1,c_12 | ||
1283 | clr c_3 | ||
1284 | bcs,a %xcc,.+8 | ||
1285 | add c_3,t_2,c_3 | ||
1286 | addcc c_12,t_1,c_12 | ||
1287 | bcs,a %xcc,.+8 | ||
1288 | add c_3,t_2,c_3 | ||
1289 | mulx a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1); | ||
1290 | addcc c_12,t_1,c_12 | ||
1291 | bcs,a %xcc,.+8 | ||
1292 | add c_3,t_2,c_3 | ||
1293 | addcc c_12,t_1,c_12 | ||
1294 | bcs,a %xcc,.+8 | ||
1295 | add c_3,t_2,c_3 | ||
1296 | mulx a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1); | ||
1297 | addcc c_12,t_1,c_12 | ||
1298 | bcs,a %xcc,.+8 | ||
1299 | add c_3,t_2,c_3 | ||
1300 | addcc c_12,t_1,c_12 | ||
1301 | bcs,a %xcc,.+8 | ||
1302 | add c_3,t_2,c_3 | ||
1303 | mulx a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1); | ||
1304 | addcc c_12,t_1,c_12 | ||
1305 | bcs,a %xcc,.+8 | ||
1306 | add c_3,t_2,c_3 | ||
1307 | addcc c_12,t_1,t_1 | ||
1308 | bcs,a %xcc,.+8 | ||
1309 | add c_3,t_2,c_3 | ||
1310 | srlx t_1,32,c_12 | ||
1311 | stuw t_1,rp(7) !r[7]=c2; | ||
1312 | or c_12,c_3,c_12 | ||
1313 | |||
/* Column 8: 2*a[7]*a[1] + 2*a[6]*a[2] + 2*a[5]*a[3] + a[4]^2. */
1314 | mulx a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2); | ||
1315 | addcc c_12,t_1,c_12 | ||
1316 | clr c_3 | ||
1317 | bcs,a %xcc,.+8 | ||
1318 | add c_3,t_2,c_3 | ||
1319 | addcc c_12,t_1,c_12 | ||
1320 | bcs,a %xcc,.+8 | ||
1321 | add c_3,t_2,c_3 | ||
1322 | mulx a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2); | ||
1323 | addcc c_12,t_1,c_12 | ||
1324 | bcs,a %xcc,.+8 | ||
1325 | add c_3,t_2,c_3 | ||
1326 | addcc c_12,t_1,c_12 | ||
1327 | bcs,a %xcc,.+8 | ||
1328 | add c_3,t_2,c_3 | ||
1329 | mulx a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2); | ||
1330 | addcc c_12,t_1,c_12 | ||
1331 | bcs,a %xcc,.+8 | ||
1332 | add c_3,t_2,c_3 | ||
1333 | addcc c_12,t_1,c_12 | ||
1334 | bcs,a %xcc,.+8 | ||
1335 | add c_3,t_2,c_3 | ||
1336 | mulx a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2); | ||
1337 | addcc c_12,t_1,t_1 | ||
1338 | bcs,a %xcc,.+8 | ||
1339 | add c_3,t_2,c_3 | ||
1340 | srlx t_1,32,c_12 | ||
1341 | stuw t_1,rp(8) !r[8]=c3; | ||
1342 | or c_12,c_3,c_12 | ||
1343 | |||
/* Column 9: 2*a[2]*a[7] + 2*a[3]*a[6] + 2*a[4]*a[5]. */
1344 | mulx a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3); | ||
1345 | addcc c_12,t_1,c_12 | ||
1346 | clr c_3 | ||
1347 | bcs,a %xcc,.+8 | ||
1348 | add c_3,t_2,c_3 | ||
1349 | addcc c_12,t_1,c_12 | ||
1350 | bcs,a %xcc,.+8 | ||
1351 | add c_3,t_2,c_3 | ||
1352 | mulx a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3); | ||
1353 | addcc c_12,t_1,c_12 | ||
1354 | bcs,a %xcc,.+8 | ||
1355 | add c_3,t_2,c_3 | ||
1356 | addcc c_12,t_1,c_12 | ||
1357 | bcs,a %xcc,.+8 | ||
1358 | add c_3,t_2,c_3 | ||
1359 | mulx a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3); | ||
1360 | addcc c_12,t_1,c_12 | ||
1361 | bcs,a %xcc,.+8 | ||
1362 | add c_3,t_2,c_3 | ||
1363 | addcc c_12,t_1,t_1 | ||
1364 | bcs,a %xcc,.+8 | ||
1365 | add c_3,t_2,c_3 | ||
1366 | srlx t_1,32,c_12 | ||
1367 | stuw t_1,rp(9) !r[9]=c1; | ||
1368 | or c_12,c_3,c_12 | ||
1369 | |||
/* Column 10: 2*a[7]*a[3] + 2*a[6]*a[4] + a[5]^2. */
1370 | mulx a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1); | ||
1371 | addcc c_12,t_1,c_12 | ||
1372 | clr c_3 | ||
1373 | bcs,a %xcc,.+8 | ||
1374 | add c_3,t_2,c_3 | ||
1375 | addcc c_12,t_1,c_12 | ||
1376 | bcs,a %xcc,.+8 | ||
1377 | add c_3,t_2,c_3 | ||
1378 | mulx a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1); | ||
1379 | addcc c_12,t_1,c_12 | ||
1380 | bcs,a %xcc,.+8 | ||
1381 | add c_3,t_2,c_3 | ||
1382 | addcc c_12,t_1,c_12 | ||
1383 | bcs,a %xcc,.+8 | ||
1384 | add c_3,t_2,c_3 | ||
1385 | mulx a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1); | ||
1386 | addcc c_12,t_1,t_1 | ||
1387 | bcs,a %xcc,.+8 | ||
1388 | add c_3,t_2,c_3 | ||
1389 | srlx t_1,32,c_12 | ||
1390 | stuw t_1,rp(10) !r[10]=c2; | ||
1391 | or c_12,c_3,c_12 | ||
1392 | |||
/* Column 11: 2*a[4]*a[7] + 2*a[5]*a[6]. */
1393 | mulx a_4,a_7,t_1 !sqr_add_c2(a,7,4,c3,c1,c2); | ||
1394 | addcc c_12,t_1,c_12 | ||
1395 | clr c_3 | ||
1396 | bcs,a %xcc,.+8 | ||
1397 | add c_3,t_2,c_3 | ||
1398 | addcc c_12,t_1,c_12 | ||
1399 | bcs,a %xcc,.+8 | ||
1400 | add c_3,t_2,c_3 | ||
1401 | mulx a_5,a_6,t_1 !sqr_add_c2(a,6,5,c3,c1,c2); | ||
1402 | addcc c_12,t_1,c_12 | ||
1403 | bcs,a %xcc,.+8 | ||
1404 | add c_3,t_2,c_3 | ||
1405 | addcc c_12,t_1,t_1 | ||
1406 | bcs,a %xcc,.+8 | ||
1407 | add c_3,t_2,c_3 | ||
1408 | srlx t_1,32,c_12 | ||
1409 | stuw t_1,rp(11) !r[11]=c3; | ||
1410 | or c_12,c_3,c_12 | ||
1411 | |||
/* Column 12: 2*a[7]*a[5] + a[6]^2. */
1412 | mulx a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3); | ||
1413 | addcc c_12,t_1,c_12 | ||
1414 | clr c_3 | ||
1415 | bcs,a %xcc,.+8 | ||
1416 | add c_3,t_2,c_3 | ||
1417 | addcc c_12,t_1,c_12 | ||
1418 | bcs,a %xcc,.+8 | ||
1419 | add c_3,t_2,c_3 | ||
1420 | mulx a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3); | ||
1421 | addcc c_12,t_1,t_1 | ||
1422 | bcs,a %xcc,.+8 | ||
1423 | add c_3,t_2,c_3 | ||
1424 | srlx t_1,32,c_12 | ||
1425 | stuw t_1,rp(12) !r[12]=c1; | ||
1426 | or c_12,c_3,c_12 | ||
1427 | |||
/* Column 13: 2*a[6]*a[7]. */
1428 | mulx a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1); | ||
1429 | addcc c_12,t_1,c_12 | ||
1430 | clr c_3 | ||
1431 | bcs,a %xcc,.+8 | ||
1432 | add c_3,t_2,c_3 | ||
1433 | addcc c_12,t_1,t_1 | ||
1434 | bcs,a %xcc,.+8 | ||
1435 | add c_3,t_2,c_3 | ||
1436 | srlx t_1,32,c_12 | ||
1437 | stuw t_1,rp(13) !r[13]=c2; | ||
1438 | or c_12,c_3,c_12 | ||
1439 | |||
/*
 * Column 14 and the top word: a single product, and no carry branch
 * follows this addcc.  The low half of the sum is r[14], the high half
 * is r[15].
 */
1440 | mulx a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2); | ||
1441 | addcc c_12,t_1,t_1 | ||
1442 | srlx t_1,32,c_12 | ||
1443 | stuw t_1,rp(14) !r[14]=c3; | ||
1444 | stuw c_12,rp(15) !r[15]=c1; | ||
1445 | |||
/* Epilogue: restore sits in the delay slot of ret and zeroes the caller-visible %o0. */
1446 | ret | ||
1447 | restore %g0,%g0,%o0 | ||
1448 | |||
1449 | .type bn_sqr_comba8,#function | ||
1450 | .size bn_sqr_comba8,(.-bn_sqr_comba8) | ||
1451 | |||
1452 | .align 32 | ||
1453 | |||
1454 | .global bn_sqr_comba4 | ||
1455 | /* | ||
1456 | * void bn_sqr_comba4(r,a) | ||
1457 | * BN_ULONG *r,*a; | ||
1458 | */ | ||
/*
 * Fully unrolled 4-word squaring: reads four 32-bit words from a (lduw)
 * and writes eight 32-bit words to r (stuw), one result column at a
 * time.  Column k sums every a[i]*a[j] with i+j == k.
 *
 * Symmetric cross terms (i != j) are multiplied once and accumulated
 * twice: two consecutive addcc instructions use the same t_1, each with
 * its own carry check (the sqr_add_c2 lines).  Diagonal terms a[i]*a[i]
 * are accumulated once (the sqr_add_c lines).
 *
 * ap(i)/rp(i) and the a_N, t_N, c_N names are macros defined outside
 * this excerpt; per the inline comments they address a[i] and r[i] --
 * confirm against the definitions earlier in the file.
 *
 * Working registers:
 *   t_1   64-bit product from mulx; also receives the final column sum.
 *   t_2   constant 1<<32, built by "mov 1" followed by "sllx ...,32".
 *   c_12  64-bit running accumulator for the current column.
 *   c_3   number of 64-bit overflows seen in this column, kept
 *         pre-multiplied by 1<<32.  Cleared once per column.
 *
 * Carry idiom, repeated after every accumulation:
 *       addcc  c_12,t_1,<dst>
 *       bcs,a  %xcc,.+8
 *       add    c_3,t_2,c_3
 * The branch target is the instruction after its own delay slot and
 * the annul bit is set, so the delay-slot add executes only when the
 * 64-bit addcc produced a carry.  "clr c_3" between addcc and bcs does
 * not disturb the condition codes.
 *
 * Column finish: the last addcc of a column targets t_1.  stuw stores
 * the low 32 bits as r[k], srlx leaves the high 32 bits in c_12, and
 * "or c_12,c_3,c_12" merges the recorded overflows (c_12 is below 1<<32
 * after the shift, c_3 is a multiple of 1<<32, so OR equals ADD).
 *
 * The caller-visible %o0 is set to 0 on return (restore %g0,%g0,%o0).
 */
1459 | bn_sqr_comba4: | ||
/* Prologue and column 0: new register window, build t_2, r[0] = low(a[0]^2). */
1460 | save %sp,FRAME_SIZE,%sp | ||
1461 | mov 1,t_2 | ||
1462 | lduw ap(0),a_0 | ||
1463 | sllx t_2,32,t_2 | ||
1464 | lduw ap(1),a_1 | ||
1465 | mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3); | ||
1466 | srlx t_1,32,c_12 | ||
1467 | stuw t_1,rp(0) !r[0]=c1; | ||
1468 | |||
/* Column 1: 2*a[0]*a[1]. */
1469 | lduw ap(2),a_2 | ||
1470 | mulx a_0,a_1,t_1 !sqr_add_c2(a,1,0,c2,c3,c1); | ||
1471 | addcc c_12,t_1,c_12 | ||
1472 | clr c_3 | ||
1473 | bcs,a %xcc,.+8 | ||
1474 | add c_3,t_2,c_3 | ||
1475 | addcc c_12,t_1,t_1 | ||
1476 | bcs,a %xcc,.+8 | ||
1477 | add c_3,t_2,c_3 | ||
1478 | srlx t_1,32,c_12 | ||
1479 | stuw t_1,rp(1) !r[1]=c2; | ||
1480 | or c_12,c_3,c_12 | ||
1481 | |||
/* Column 2: 2*a[2]*a[0] + a[1]^2. */
1482 | mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2); | ||
1483 | addcc c_12,t_1,c_12 | ||
1484 | clr c_3 | ||
1485 | bcs,a %xcc,.+8 | ||
1486 | add c_3,t_2,c_3 | ||
1487 | addcc c_12,t_1,c_12 | ||
1488 | bcs,a %xcc,.+8 | ||
1489 | add c_3,t_2,c_3 | ||
1490 | lduw ap(3),a_3 | ||
1491 | mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2); | ||
1492 | addcc c_12,t_1,t_1 | ||
1493 | bcs,a %xcc,.+8 | ||
1494 | add c_3,t_2,c_3 | ||
1495 | srlx t_1,32,c_12 | ||
1496 | stuw t_1,rp(2) !r[2]=c3; | ||
1497 | or c_12,c_3,c_12 | ||
1498 | |||
/* Column 3: 2*a[0]*a[3] + 2*a[1]*a[2]. */
1499 | mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3); | ||
1500 | addcc c_12,t_1,c_12 | ||
1501 | clr c_3 | ||
1502 | bcs,a %xcc,.+8 | ||
1503 | add c_3,t_2,c_3 | ||
1504 | addcc c_12,t_1,c_12 | ||
1505 | bcs,a %xcc,.+8 | ||
1506 | add c_3,t_2,c_3 | ||
1507 | mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3); | ||
1508 | addcc c_12,t_1,c_12 | ||
1509 | bcs,a %xcc,.+8 | ||
1510 | add c_3,t_2,c_3 | ||
1511 | addcc c_12,t_1,t_1 | ||
1512 | bcs,a %xcc,.+8 | ||
1513 | add c_3,t_2,c_3 | ||
1514 | srlx t_1,32,c_12 | ||
1515 | stuw t_1,rp(3) !r[3]=c1; | ||
1516 | or c_12,c_3,c_12 | ||
1517 | |||
/* Column 4: 2*a[3]*a[1] + a[2]^2. */
1518 | mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1); | ||
1519 | addcc c_12,t_1,c_12 | ||
1520 | clr c_3 | ||
1521 | bcs,a %xcc,.+8 | ||
1522 | add c_3,t_2,c_3 | ||
1523 | addcc c_12,t_1,c_12 | ||
1524 | bcs,a %xcc,.+8 | ||
1525 | add c_3,t_2,c_3 | ||
1526 | mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1); | ||
1527 | addcc c_12,t_1,t_1 | ||
1528 | bcs,a %xcc,.+8 | ||
1529 | add c_3,t_2,c_3 | ||
1530 | srlx t_1,32,c_12 | ||
1531 | stuw t_1,rp(4) !r[4]=c2; | ||
1532 | or c_12,c_3,c_12 | ||
1533 | |||
/* Column 5: 2*a[2]*a[3]. */
1534 | mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2); | ||
1535 | addcc c_12,t_1,c_12 | ||
1536 | clr c_3 | ||
1537 | bcs,a %xcc,.+8 | ||
1538 | add c_3,t_2,c_3 | ||
1539 | addcc c_12,t_1,t_1 | ||
1540 | bcs,a %xcc,.+8 | ||
1541 | add c_3,t_2,c_3 | ||
1542 | srlx t_1,32,c_12 | ||
1543 | stuw t_1,rp(5) !r[5]=c3; | ||
1544 | or c_12,c_3,c_12 | ||
1545 | |||
/*
 * Column 6 and the top word: a single product, and no carry branch
 * follows this addcc.  The low half of the sum is r[6], the high half
 * is r[7].
 */
1546 | mulx a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3); | ||
1547 | addcc c_12,t_1,t_1 | ||
1548 | srlx t_1,32,c_12 | ||
1549 | stuw t_1,rp(6) !r[6]=c1; | ||
1550 | stuw c_12,rp(7) !r[7]=c2; | ||
1551 | |||
/* Epilogue: restore sits in the delay slot of ret and zeroes the caller-visible %o0. */
1552 | ret | ||
1553 | restore %g0,%g0,%o0 | ||
1554 | |||
1555 | .type bn_sqr_comba4,#function | ||
1556 | .size bn_sqr_comba4,(.-bn_sqr_comba4) | ||
1557 | |||
1558 | .align 32 | ||