diff options
author | cvs2svn <admin@example.com> | 2015-08-02 21:54:22 +0000 |
---|---|---|
committer | cvs2svn <admin@example.com> | 2015-08-02 21:54:22 +0000 |
commit | ed3760bf4be4a96a89233fb8f8b84a0d44725862 (patch) | |
tree | 5609c82060f75c53af0a7641d9b33a88574876cd /src/lib/libcrypto/bn/asm/mips.pl | |
parent | f8b563fb5ba1524c821d37308f4e6abfc866bc3f (diff) | |
download | openbsd-OPENBSD_5_8_BASE.tar.gz openbsd-OPENBSD_5_8_BASE.tar.bz2 openbsd-OPENBSD_5_8_BASE.zip |
This commit was manufactured by cvs2git to create tag 'OPENBSD_5_8_BASE'.OPENBSD_5_8_BASE
Diffstat (limited to 'src/lib/libcrypto/bn/asm/mips.pl')
-rw-r--r-- | src/lib/libcrypto/bn/asm/mips.pl | 2234 |
1 files changed, 0 insertions, 2234 deletions
diff --git a/src/lib/libcrypto/bn/asm/mips.pl b/src/lib/libcrypto/bn/asm/mips.pl deleted file mode 100644 index 215c9a7483..0000000000 --- a/src/lib/libcrypto/bn/asm/mips.pl +++ /dev/null | |||
@@ -1,2234 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. | ||
6 | # | ||
7 | # Rights for redistribution and usage in source and binary forms are | ||
8 | # granted according to the OpenSSL license. Warranty of any kind is | ||
9 | # disclaimed. | ||
10 | # ==================================================================== | ||
11 | |||
12 | |||
13 | # July 1999 | ||
14 | # | ||
15 | # This is a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c. | ||
16 | # | ||
17 | # The module is designed to work with either of the "new" MIPS ABI(5), | ||
18 | # namely N32 or N64, offered by IRIX 6.x. It's not meant to work under | ||
19 | # IRIX 5.x not only because it doesn't support new ABIs but also | ||
20 | # because 5.x kernels put R4x00 CPU into 32-bit mode and all those | ||
21 | # 64-bit instructions (daddu, dmultu, etc.) found below gonna only | ||
22 | # cause illegal instruction exception:-( | ||
23 | # | ||
24 | # In addition the code depends on preprocessor flags set up by MIPSpro | ||
25 | # compiler driver (either as or cc) and therefore (probably?) can't be | ||
26 | # compiled by the GNU assembler. GNU C driver manages fine though... | ||
27 | # I mean as long as -mmips-as is specified or is the default option, | ||
28 | # because then it simply invokes /usr/bin/as which in turn takes | ||
29 | # perfect care of the preprocessor definitions. Another neat feature | ||
30 | # offered by the MIPSpro assembler is an optimization pass. This gave | ||
31 | # me the opportunity to have the code looking more regular as all those | ||
32 | # architecture dependent instruction rescheduling details were left to | ||
33 | # the assembler. Cool, huh? | ||
34 | # | ||
35 | # Performance improvement is astonishing! 'apps/openssl speed rsa dsa' | ||
36 | # goes way over 3 times faster! | ||
37 | # | ||
38 | # <appro@fy.chalmers.se> | ||
39 | |||
40 | # October 2010 | ||
41 | # | ||
42 | # Adapt the module even for 32-bit ABIs and other OSes. The former was | ||
43 | # achieved by mechanical replacement of 64-bit arithmetic instructions | ||
44 | # such as dmultu, daddu, etc. with their 32-bit counterparts and | ||
45 | # adjusting offsets denoting multiples of BN_ULONG. Above mentioned | ||
46 | # >3x performance improvement naturally does not apply to 32-bit code | ||
47 | # [because there is no instruction 32-bit compiler can't use], one | ||
48 | # has to be content with 40-85% improvement depending on benchmark and | ||
49 | # key length, more for longer keys. | ||
50 | |||
51 | $flavour = shift;	# ABI flavour string; matched against /64|n32/ and /nubi/ below | ||
52 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}	# skip args until one looks like name.ext; $output ends up false if none does | ||
53 | if (defined($output) && $output ne "") { open(STDOUT,">",$output) || die "can't open $output: $!"; }	# 3-arg open, fail loudly; with no output name STDOUT is left as inherited | ||
54 | |||
55 | if ($flavour =~ /64|n32/i) { | ||
56 | $LD="ld"; | ||
57 | $ST="sd"; | ||
58 | $MULTU="dmultu"; | ||
59 | $DIVU="ddivu"; | ||
60 | $ADDU="daddu"; | ||
61 | $SUBU="dsubu"; | ||
62 | $SRL="dsrl"; | ||
63 | $SLL="dsll"; | ||
64 | $BNSZ=8; | ||
65 | $PTR_ADD="daddu"; | ||
66 | $PTR_SUB="dsubu"; | ||
67 | $SZREG=8; | ||
68 | $REG_S="sd"; | ||
69 | $REG_L="ld"; | ||
70 | } else { | ||
71 | $LD="lw"; | ||
72 | $ST="sw"; | ||
73 | $MULTU="multu"; | ||
74 | $DIVU="divu"; | ||
75 | $ADDU="addu"; | ||
76 | $SUBU="subu"; | ||
77 | $SRL="srl"; | ||
78 | $SLL="sll"; | ||
79 | $BNSZ=4; | ||
80 | $PTR_ADD="addu"; | ||
81 | $PTR_SUB="subu"; | ||
82 | $SZREG=4; | ||
83 | $REG_S="sw"; | ||
84 | $REG_L="lw"; | ||
85 | $code=".set mips2\n"; | ||
86 | } | ||
87 | |||
88 | # Below is N32/64 register layout used in the original module. | ||
89 | # | ||
90 | ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | ||
91 | ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
92 | ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | ||
93 | ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | ||
94 | ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | ||
95 | ($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7); | ||
96 | # | ||
97 | # No special adaptation is required for O32. NUBI on the other hand | ||
98 | # is treated by saving/restoring ($v1,$t0..$t3). | ||
99 | |||
100 | $gp=$v1 if ($flavour =~ /nubi/i); | ||
101 | |||
102 | $minus4=$v1; | ||
103 | |||
104 | $code.=<<___; | ||
105 | .rdata | ||
106 | .asciiz "mips3.s, Version 1.2" | ||
107 | .asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>" | ||
108 | |||
109 | .text | ||
110 | .set noat | ||
111 | |||
112 | .align 5 | ||
113 | .globl bn_mul_add_words | ||
114 | .ent bn_mul_add_words | ||
115 | bn_mul_add_words: | ||
116 | .set noreorder | ||
117 | bgtz $a2,bn_mul_add_words_internal | ||
118 | move $v0,$zero | ||
119 | jr $ra | ||
120 | move $a0,$v0 | ||
121 | .end bn_mul_add_words | ||
122 | |||
123 | .align 5 | ||
124 | .ent bn_mul_add_words_internal | ||
125 | bn_mul_add_words_internal: | ||
126 | ___ | ||
127 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
128 | .frame $sp,6*$SZREG,$ra | ||
129 | .mask 0x8000f008,-$SZREG | ||
130 | .set noreorder | ||
131 | $PTR_SUB $sp,6*$SZREG | ||
132 | $REG_S $ra,5*$SZREG($sp) | ||
133 | $REG_S $t3,4*$SZREG($sp) | ||
134 | $REG_S $t2,3*$SZREG($sp) | ||
135 | $REG_S $t1,2*$SZREG($sp) | ||
136 | $REG_S $t0,1*$SZREG($sp) | ||
137 | $REG_S $gp,0*$SZREG($sp) | ||
138 | ___ | ||
139 | $code.=<<___; | ||
140 | .set reorder | ||
141 | li $minus4,-4 | ||
142 | and $ta0,$a2,$minus4 | ||
143 | beqz $ta0,.L_bn_mul_add_words_tail | ||
144 | |||
145 | .L_bn_mul_add_words_loop: | ||
146 | $LD $t0,0($a1) | ||
147 | $MULTU $t0,$a3 | ||
148 | $LD $t1,0($a0) | ||
149 | $LD $t2,$BNSZ($a1) | ||
150 | $LD $t3,$BNSZ($a0) | ||
151 | $LD $ta0,2*$BNSZ($a1) | ||
152 | $LD $ta1,2*$BNSZ($a0) | ||
153 | $ADDU $t1,$v0 | ||
154 | sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit | ||
155 | # values", but it seems to work fine | ||
156 | # even on 64-bit registers. | ||
157 | mflo $at | ||
158 | mfhi $t0 | ||
159 | $ADDU $t1,$at | ||
160 | $ADDU $v0,$t0 | ||
161 | $MULTU $t2,$a3 | ||
162 | sltu $at,$t1,$at | ||
163 | $ST $t1,0($a0) | ||
164 | $ADDU $v0,$at | ||
165 | |||
166 | $LD $ta2,3*$BNSZ($a1) | ||
167 | $LD $ta3,3*$BNSZ($a0) | ||
168 | $ADDU $t3,$v0 | ||
169 | sltu $v0,$t3,$v0 | ||
170 | mflo $at | ||
171 | mfhi $t2 | ||
172 | $ADDU $t3,$at | ||
173 | $ADDU $v0,$t2 | ||
174 | $MULTU $ta0,$a3 | ||
175 | sltu $at,$t3,$at | ||
176 | $ST $t3,$BNSZ($a0) | ||
177 | $ADDU $v0,$at | ||
178 | |||
179 | subu $a2,4 | ||
180 | $PTR_ADD $a0,4*$BNSZ | ||
181 | $PTR_ADD $a1,4*$BNSZ | ||
182 | $ADDU $ta1,$v0 | ||
183 | sltu $v0,$ta1,$v0 | ||
184 | mflo $at | ||
185 | mfhi $ta0 | ||
186 | $ADDU $ta1,$at | ||
187 | $ADDU $v0,$ta0 | ||
188 | $MULTU $ta2,$a3 | ||
189 | sltu $at,$ta1,$at | ||
190 | $ST $ta1,-2*$BNSZ($a0) | ||
191 | $ADDU $v0,$at | ||
192 | |||
193 | |||
194 | and $ta0,$a2,$minus4 | ||
195 | $ADDU $ta3,$v0 | ||
196 | sltu $v0,$ta3,$v0 | ||
197 | mflo $at | ||
198 | mfhi $ta2 | ||
199 | $ADDU $ta3,$at | ||
200 | $ADDU $v0,$ta2 | ||
201 | sltu $at,$ta3,$at | ||
202 | $ST $ta3,-$BNSZ($a0) | ||
203 | .set noreorder | ||
204 | bgtz $ta0,.L_bn_mul_add_words_loop | ||
205 | $ADDU $v0,$at | ||
206 | |||
207 | beqz $a2,.L_bn_mul_add_words_return | ||
208 | nop | ||
209 | |||
210 | .L_bn_mul_add_words_tail: | ||
211 | .set reorder | ||
212 | $LD $t0,0($a1) | ||
213 | $MULTU $t0,$a3 | ||
214 | $LD $t1,0($a0) | ||
215 | subu $a2,1 | ||
216 | $ADDU $t1,$v0 | ||
217 | sltu $v0,$t1,$v0 | ||
218 | mflo $at | ||
219 | mfhi $t0 | ||
220 | $ADDU $t1,$at | ||
221 | $ADDU $v0,$t0 | ||
222 | sltu $at,$t1,$at | ||
223 | $ST $t1,0($a0) | ||
224 | $ADDU $v0,$at | ||
225 | beqz $a2,.L_bn_mul_add_words_return | ||
226 | |||
227 | $LD $t0,$BNSZ($a1) | ||
228 | $MULTU $t0,$a3 | ||
229 | $LD $t1,$BNSZ($a0) | ||
230 | subu $a2,1 | ||
231 | $ADDU $t1,$v0 | ||
232 | sltu $v0,$t1,$v0 | ||
233 | mflo $at | ||
234 | mfhi $t0 | ||
235 | $ADDU $t1,$at | ||
236 | $ADDU $v0,$t0 | ||
237 | sltu $at,$t1,$at | ||
238 | $ST $t1,$BNSZ($a0) | ||
239 | $ADDU $v0,$at | ||
240 | beqz $a2,.L_bn_mul_add_words_return | ||
241 | |||
242 | $LD $t0,2*$BNSZ($a1) | ||
243 | $MULTU $t0,$a3 | ||
244 | $LD $t1,2*$BNSZ($a0) | ||
245 | $ADDU $t1,$v0 | ||
246 | sltu $v0,$t1,$v0 | ||
247 | mflo $at | ||
248 | mfhi $t0 | ||
249 | $ADDU $t1,$at | ||
250 | $ADDU $v0,$t0 | ||
251 | sltu $at,$t1,$at | ||
252 | $ST $t1,2*$BNSZ($a0) | ||
253 | $ADDU $v0,$at | ||
254 | |||
255 | .L_bn_mul_add_words_return: | ||
256 | .set noreorder | ||
257 | ___ | ||
258 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
259 | $REG_L $t3,4*$SZREG($sp) | ||
260 | $REG_L $t2,3*$SZREG($sp) | ||
261 | $REG_L $t1,2*$SZREG($sp) | ||
262 | $REG_L $t0,1*$SZREG($sp) | ||
263 | $REG_L $gp,0*$SZREG($sp) | ||
264 | $PTR_ADD $sp,6*$SZREG | ||
265 | ___ | ||
266 | $code.=<<___; | ||
267 | jr $ra | ||
268 | move $a0,$v0 | ||
269 | .end bn_mul_add_words_internal | ||
270 | |||
271 | .align 5 | ||
272 | .globl bn_mul_words | ||
273 | .ent bn_mul_words | ||
274 | bn_mul_words: | ||
275 | .set noreorder | ||
276 | bgtz $a2,bn_mul_words_internal | ||
277 | move $v0,$zero | ||
278 | jr $ra | ||
279 | move $a0,$v0 | ||
280 | .end bn_mul_words | ||
281 | |||
282 | .align 5 | ||
283 | .ent bn_mul_words_internal | ||
284 | bn_mul_words_internal: | ||
285 | ___ | ||
286 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
287 | .frame $sp,6*$SZREG,$ra | ||
288 | .mask 0x8000f008,-$SZREG | ||
289 | .set noreorder | ||
290 | $PTR_SUB $sp,6*$SZREG | ||
291 | $REG_S $ra,5*$SZREG($sp) | ||
292 | $REG_S $t3,4*$SZREG($sp) | ||
293 | $REG_S $t2,3*$SZREG($sp) | ||
294 | $REG_S $t1,2*$SZREG($sp) | ||
295 | $REG_S $t0,1*$SZREG($sp) | ||
296 | $REG_S $gp,0*$SZREG($sp) | ||
297 | ___ | ||
298 | $code.=<<___; | ||
299 | .set reorder | ||
300 | li $minus4,-4 | ||
301 | and $ta0,$a2,$minus4 | ||
302 | beqz $ta0,.L_bn_mul_words_tail | ||
303 | |||
304 | .L_bn_mul_words_loop: | ||
305 | $LD $t0,0($a1) | ||
306 | $MULTU $t0,$a3 | ||
307 | $LD $t2,$BNSZ($a1) | ||
308 | $LD $ta0,2*$BNSZ($a1) | ||
309 | $LD $ta2,3*$BNSZ($a1) | ||
310 | mflo $at | ||
311 | mfhi $t0 | ||
312 | $ADDU $v0,$at | ||
313 | sltu $t1,$v0,$at | ||
314 | $MULTU $t2,$a3 | ||
315 | $ST $v0,0($a0) | ||
316 | $ADDU $v0,$t1,$t0 | ||
317 | |||
318 | subu $a2,4 | ||
319 | $PTR_ADD $a0,4*$BNSZ | ||
320 | $PTR_ADD $a1,4*$BNSZ | ||
321 | mflo $at | ||
322 | mfhi $t2 | ||
323 | $ADDU $v0,$at | ||
324 | sltu $t3,$v0,$at | ||
325 | $MULTU $ta0,$a3 | ||
326 | $ST $v0,-3*$BNSZ($a0) | ||
327 | $ADDU $v0,$t3,$t2 | ||
328 | |||
329 | mflo $at | ||
330 | mfhi $ta0 | ||
331 | $ADDU $v0,$at | ||
332 | sltu $ta1,$v0,$at | ||
333 | $MULTU $ta2,$a3 | ||
334 | $ST $v0,-2*$BNSZ($a0) | ||
335 | $ADDU $v0,$ta1,$ta0 | ||
336 | |||
337 | and $ta0,$a2,$minus4 | ||
338 | mflo $at | ||
339 | mfhi $ta2 | ||
340 | $ADDU $v0,$at | ||
341 | sltu $ta3,$v0,$at | ||
342 | $ST $v0,-$BNSZ($a0) | ||
343 | .set noreorder | ||
344 | bgtz $ta0,.L_bn_mul_words_loop | ||
345 | $ADDU $v0,$ta3,$ta2 | ||
346 | |||
347 | beqz $a2,.L_bn_mul_words_return | ||
348 | nop | ||
349 | |||
350 | .L_bn_mul_words_tail: | ||
351 | .set reorder | ||
352 | $LD $t0,0($a1) | ||
353 | $MULTU $t0,$a3 | ||
354 | subu $a2,1 | ||
355 | mflo $at | ||
356 | mfhi $t0 | ||
357 | $ADDU $v0,$at | ||
358 | sltu $t1,$v0,$at | ||
359 | $ST $v0,0($a0) | ||
360 | $ADDU $v0,$t1,$t0 | ||
361 | beqz $a2,.L_bn_mul_words_return | ||
362 | |||
363 | $LD $t0,$BNSZ($a1) | ||
364 | $MULTU $t0,$a3 | ||
365 | subu $a2,1 | ||
366 | mflo $at | ||
367 | mfhi $t0 | ||
368 | $ADDU $v0,$at | ||
369 | sltu $t1,$v0,$at | ||
370 | $ST $v0,$BNSZ($a0) | ||
371 | $ADDU $v0,$t1,$t0 | ||
372 | beqz $a2,.L_bn_mul_words_return | ||
373 | |||
374 | $LD $t0,2*$BNSZ($a1) | ||
375 | $MULTU $t0,$a3 | ||
376 | mflo $at | ||
377 | mfhi $t0 | ||
378 | $ADDU $v0,$at | ||
379 | sltu $t1,$v0,$at | ||
380 | $ST $v0,2*$BNSZ($a0) | ||
381 | $ADDU $v0,$t1,$t0 | ||
382 | |||
383 | .L_bn_mul_words_return: | ||
384 | .set noreorder | ||
385 | ___ | ||
386 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
387 | $REG_L $t3,4*$SZREG($sp) | ||
388 | $REG_L $t2,3*$SZREG($sp) | ||
389 | $REG_L $t1,2*$SZREG($sp) | ||
390 | $REG_L $t0,1*$SZREG($sp) | ||
391 | $REG_L $gp,0*$SZREG($sp) | ||
392 | $PTR_ADD $sp,6*$SZREG | ||
393 | ___ | ||
394 | $code.=<<___; | ||
395 | jr $ra | ||
396 | move $a0,$v0 | ||
397 | .end bn_mul_words_internal | ||
398 | |||
399 | .align 5 | ||
400 | .globl bn_sqr_words | ||
401 | .ent bn_sqr_words | ||
402 | bn_sqr_words: | ||
403 | .set noreorder | ||
404 | bgtz $a2,bn_sqr_words_internal | ||
405 | move $v0,$zero | ||
406 | jr $ra | ||
407 | move $a0,$v0 | ||
408 | .end bn_sqr_words | ||
409 | |||
410 | .align 5 | ||
411 | .ent bn_sqr_words_internal | ||
412 | bn_sqr_words_internal: | ||
413 | ___ | ||
414 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
415 | .frame $sp,6*$SZREG,$ra | ||
416 | .mask 0x8000f008,-$SZREG | ||
417 | .set noreorder | ||
418 | $PTR_SUB $sp,6*$SZREG | ||
419 | $REG_S $ra,5*$SZREG($sp) | ||
420 | $REG_S $t3,4*$SZREG($sp) | ||
421 | $REG_S $t2,3*$SZREG($sp) | ||
422 | $REG_S $t1,2*$SZREG($sp) | ||
423 | $REG_S $t0,1*$SZREG($sp) | ||
424 | $REG_S $gp,0*$SZREG($sp) | ||
425 | ___ | ||
426 | $code.=<<___; | ||
427 | .set reorder | ||
428 | li $minus4,-4 | ||
429 | and $ta0,$a2,$minus4 | ||
430 | beqz $ta0,.L_bn_sqr_words_tail | ||
431 | |||
432 | .L_bn_sqr_words_loop: | ||
433 | $LD $t0,0($a1) | ||
434 | $MULTU $t0,$t0 | ||
435 | $LD $t2,$BNSZ($a1) | ||
436 | $LD $ta0,2*$BNSZ($a1) | ||
437 | $LD $ta2,3*$BNSZ($a1) | ||
438 | mflo $t1 | ||
439 | mfhi $t0 | ||
440 | $ST $t1,0($a0) | ||
441 | $ST $t0,$BNSZ($a0) | ||
442 | |||
443 | $MULTU $t2,$t2 | ||
444 | subu $a2,4 | ||
445 | $PTR_ADD $a0,8*$BNSZ | ||
446 | $PTR_ADD $a1,4*$BNSZ | ||
447 | mflo $t3 | ||
448 | mfhi $t2 | ||
449 | $ST $t3,-6*$BNSZ($a0) | ||
450 | $ST $t2,-5*$BNSZ($a0) | ||
451 | |||
452 | $MULTU $ta0,$ta0 | ||
453 | mflo $ta1 | ||
454 | mfhi $ta0 | ||
455 | $ST $ta1,-4*$BNSZ($a0) | ||
456 | $ST $ta0,-3*$BNSZ($a0) | ||
457 | |||
458 | |||
459 | $MULTU $ta2,$ta2 | ||
460 | and $ta0,$a2,$minus4 | ||
461 | mflo $ta3 | ||
462 | mfhi $ta2 | ||
463 | $ST $ta3,-2*$BNSZ($a0) | ||
464 | |||
465 | .set noreorder | ||
466 | bgtz $ta0,.L_bn_sqr_words_loop | ||
467 | $ST $ta2,-$BNSZ($a0) | ||
468 | |||
469 | beqz $a2,.L_bn_sqr_words_return | ||
470 | nop | ||
471 | |||
472 | .L_bn_sqr_words_tail: | ||
473 | .set reorder | ||
474 | $LD $t0,0($a1) | ||
475 | $MULTU $t0,$t0 | ||
476 | subu $a2,1 | ||
477 | mflo $t1 | ||
478 | mfhi $t0 | ||
479 | $ST $t1,0($a0) | ||
480 | $ST $t0,$BNSZ($a0) | ||
481 | beqz $a2,.L_bn_sqr_words_return | ||
482 | |||
483 | $LD $t0,$BNSZ($a1) | ||
484 | $MULTU $t0,$t0 | ||
485 | subu $a2,1 | ||
486 | mflo $t1 | ||
487 | mfhi $t0 | ||
488 | $ST $t1,2*$BNSZ($a0) | ||
489 | $ST $t0,3*$BNSZ($a0) | ||
490 | beqz $a2,.L_bn_sqr_words_return | ||
491 | |||
492 | $LD $t0,2*$BNSZ($a1) | ||
493 | $MULTU $t0,$t0 | ||
494 | mflo $t1 | ||
495 | mfhi $t0 | ||
496 | $ST $t1,4*$BNSZ($a0) | ||
497 | $ST $t0,5*$BNSZ($a0) | ||
498 | |||
499 | .L_bn_sqr_words_return: | ||
500 | .set noreorder | ||
501 | ___ | ||
502 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
503 | $REG_L $t3,4*$SZREG($sp) | ||
504 | $REG_L $t2,3*$SZREG($sp) | ||
505 | $REG_L $t1,2*$SZREG($sp) | ||
506 | $REG_L $t0,1*$SZREG($sp) | ||
507 | $REG_L $gp,0*$SZREG($sp) | ||
508 | $PTR_ADD $sp,6*$SZREG | ||
509 | ___ | ||
510 | $code.=<<___; | ||
511 | jr $ra | ||
512 | move $a0,$v0 | ||
513 | |||
514 | .end bn_sqr_words_internal | ||
515 | |||
516 | .align 5 | ||
517 | .globl bn_add_words | ||
518 | .ent bn_add_words | ||
519 | bn_add_words: | ||
520 | .set noreorder | ||
521 | bgtz $a3,bn_add_words_internal | ||
522 | move $v0,$zero | ||
523 | jr $ra | ||
524 | move $a0,$v0 | ||
525 | .end bn_add_words | ||
526 | |||
527 | .align 5 | ||
528 | .ent bn_add_words_internal | ||
529 | bn_add_words_internal: | ||
530 | ___ | ||
531 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
532 | .frame $sp,6*$SZREG,$ra | ||
533 | .mask 0x8000f008,-$SZREG | ||
534 | .set noreorder | ||
535 | $PTR_SUB $sp,6*$SZREG | ||
536 | $REG_S $ra,5*$SZREG($sp) | ||
537 | $REG_S $t3,4*$SZREG($sp) | ||
538 | $REG_S $t2,3*$SZREG($sp) | ||
539 | $REG_S $t1,2*$SZREG($sp) | ||
540 | $REG_S $t0,1*$SZREG($sp) | ||
541 | $REG_S $gp,0*$SZREG($sp) | ||
542 | ___ | ||
543 | $code.=<<___; | ||
544 | .set reorder | ||
545 | li $minus4,-4 | ||
546 | and $at,$a3,$minus4 | ||
547 | beqz $at,.L_bn_add_words_tail | ||
548 | |||
549 | .L_bn_add_words_loop: | ||
550 | $LD $t0,0($a1) | ||
551 | $LD $ta0,0($a2) | ||
552 | subu $a3,4 | ||
553 | $LD $t1,$BNSZ($a1) | ||
554 | and $at,$a3,$minus4 | ||
555 | $LD $t2,2*$BNSZ($a1) | ||
556 | $PTR_ADD $a2,4*$BNSZ | ||
557 | $LD $t3,3*$BNSZ($a1) | ||
558 | $PTR_ADD $a0,4*$BNSZ | ||
559 | $LD $ta1,-3*$BNSZ($a2) | ||
560 | $PTR_ADD $a1,4*$BNSZ | ||
561 | $LD $ta2,-2*$BNSZ($a2) | ||
562 | $LD $ta3,-$BNSZ($a2) | ||
563 | $ADDU $ta0,$t0 | ||
564 | sltu $t8,$ta0,$t0 | ||
565 | $ADDU $t0,$ta0,$v0 | ||
566 | sltu $v0,$t0,$ta0 | ||
567 | $ST $t0,-4*$BNSZ($a0) | ||
568 | $ADDU $v0,$t8 | ||
569 | |||
570 | $ADDU $ta1,$t1 | ||
571 | sltu $t9,$ta1,$t1 | ||
572 | $ADDU $t1,$ta1,$v0 | ||
573 | sltu $v0,$t1,$ta1 | ||
574 | $ST $t1,-3*$BNSZ($a0) | ||
575 | $ADDU $v0,$t9 | ||
576 | |||
577 | $ADDU $ta2,$t2 | ||
578 | sltu $t8,$ta2,$t2 | ||
579 | $ADDU $t2,$ta2,$v0 | ||
580 | sltu $v0,$t2,$ta2 | ||
581 | $ST $t2,-2*$BNSZ($a0) | ||
582 | $ADDU $v0,$t8 | ||
583 | |||
584 | $ADDU $ta3,$t3 | ||
585 | sltu $t9,$ta3,$t3 | ||
586 | $ADDU $t3,$ta3,$v0 | ||
587 | sltu $v0,$t3,$ta3 | ||
588 | $ST $t3,-$BNSZ($a0) | ||
589 | |||
590 | .set noreorder | ||
591 | bgtz $at,.L_bn_add_words_loop | ||
592 | $ADDU $v0,$t9 | ||
593 | |||
594 | beqz $a3,.L_bn_add_words_return | ||
595 | nop | ||
596 | |||
597 | .L_bn_add_words_tail: | ||
598 | .set reorder | ||
599 | $LD $t0,0($a1) | ||
600 | $LD $ta0,0($a2) | ||
601 | $ADDU $ta0,$t0 | ||
602 | subu $a3,1 | ||
603 | sltu $t8,$ta0,$t0 | ||
604 | $ADDU $t0,$ta0,$v0 | ||
605 | sltu $v0,$t0,$ta0 | ||
606 | $ST $t0,0($a0) | ||
607 | $ADDU $v0,$t8 | ||
608 | beqz $a3,.L_bn_add_words_return | ||
609 | |||
610 | $LD $t1,$BNSZ($a1) | ||
611 | $LD $ta1,$BNSZ($a2) | ||
612 | $ADDU $ta1,$t1 | ||
613 | subu $a3,1 | ||
614 | sltu $t9,$ta1,$t1 | ||
615 | $ADDU $t1,$ta1,$v0 | ||
616 | sltu $v0,$t1,$ta1 | ||
617 | $ST $t1,$BNSZ($a0) | ||
618 | $ADDU $v0,$t9 | ||
619 | beqz $a3,.L_bn_add_words_return | ||
620 | |||
621 | $LD $t2,2*$BNSZ($a1) | ||
622 | $LD $ta2,2*$BNSZ($a2) | ||
623 | $ADDU $ta2,$t2 | ||
624 | sltu $t8,$ta2,$t2 | ||
625 | $ADDU $t2,$ta2,$v0 | ||
626 | sltu $v0,$t2,$ta2 | ||
627 | $ST $t2,2*$BNSZ($a0) | ||
628 | $ADDU $v0,$t8 | ||
629 | |||
630 | .L_bn_add_words_return: | ||
631 | .set noreorder | ||
632 | ___ | ||
633 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
634 | $REG_L $t3,4*$SZREG($sp) | ||
635 | $REG_L $t2,3*$SZREG($sp) | ||
636 | $REG_L $t1,2*$SZREG($sp) | ||
637 | $REG_L $t0,1*$SZREG($sp) | ||
638 | $REG_L $gp,0*$SZREG($sp) | ||
639 | $PTR_ADD $sp,6*$SZREG | ||
640 | ___ | ||
641 | $code.=<<___; | ||
642 | jr $ra | ||
643 | move $a0,$v0 | ||
644 | |||
645 | .end bn_add_words_internal | ||
646 | |||
647 | .align 5 | ||
648 | .globl bn_sub_words | ||
649 | .ent bn_sub_words | ||
650 | bn_sub_words: | ||
651 | .set noreorder | ||
652 | bgtz $a3,bn_sub_words_internal | ||
653 | move $v0,$zero | ||
654 | jr $ra | ||
655 | move $a0,$zero | ||
656 | .end bn_sub_words | ||
657 | |||
658 | .align 5 | ||
659 | .ent bn_sub_words_internal | ||
660 | bn_sub_words_internal: | ||
661 | ___ | ||
662 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
663 | .frame $sp,6*$SZREG,$ra | ||
664 | .mask 0x8000f008,-$SZREG | ||
665 | .set noreorder | ||
666 | $PTR_SUB $sp,6*$SZREG | ||
667 | $REG_S $ra,5*$SZREG($sp) | ||
668 | $REG_S $t3,4*$SZREG($sp) | ||
669 | $REG_S $t2,3*$SZREG($sp) | ||
670 | $REG_S $t1,2*$SZREG($sp) | ||
671 | $REG_S $t0,1*$SZREG($sp) | ||
672 | $REG_S $gp,0*$SZREG($sp) | ||
673 | ___ | ||
674 | $code.=<<___; | ||
675 | .set reorder | ||
676 | li $minus4,-4 | ||
677 | and $at,$a3,$minus4 | ||
678 | beqz $at,.L_bn_sub_words_tail | ||
679 | |||
680 | .L_bn_sub_words_loop: | ||
681 | $LD $t0,0($a1) | ||
682 | $LD $ta0,0($a2) | ||
683 | subu $a3,4 | ||
684 | $LD $t1,$BNSZ($a1) | ||
685 | and $at,$a3,$minus4 | ||
686 | $LD $t2,2*$BNSZ($a1) | ||
687 | $PTR_ADD $a2,4*$BNSZ | ||
688 | $LD $t3,3*$BNSZ($a1) | ||
689 | $PTR_ADD $a0,4*$BNSZ | ||
690 | $LD $ta1,-3*$BNSZ($a2) | ||
691 | $PTR_ADD $a1,4*$BNSZ | ||
692 | $LD $ta2,-2*$BNSZ($a2) | ||
693 | $LD $ta3,-$BNSZ($a2) | ||
694 | sltu $t8,$t0,$ta0 | ||
695 | $SUBU $ta0,$t0,$ta0 | ||
696 | $SUBU $t0,$ta0,$v0 | ||
697 | sgtu $v0,$t0,$ta0 | ||
698 | $ST $t0,-4*$BNSZ($a0) | ||
699 | $ADDU $v0,$t8 | ||
700 | |||
701 | sltu $t9,$t1,$ta1 | ||
702 | $SUBU $ta1,$t1,$ta1 | ||
703 | $SUBU $t1,$ta1,$v0 | ||
704 | sgtu $v0,$t1,$ta1 | ||
705 | $ST $t1,-3*$BNSZ($a0) | ||
706 | $ADDU $v0,$t9 | ||
707 | |||
708 | |||
709 | sltu $t8,$t2,$ta2 | ||
710 | $SUBU $ta2,$t2,$ta2 | ||
711 | $SUBU $t2,$ta2,$v0 | ||
712 | sgtu $v0,$t2,$ta2 | ||
713 | $ST $t2,-2*$BNSZ($a0) | ||
714 | $ADDU $v0,$t8 | ||
715 | |||
716 | sltu $t9,$t3,$ta3 | ||
717 | $SUBU $ta3,$t3,$ta3 | ||
718 | $SUBU $t3,$ta3,$v0 | ||
719 | sgtu $v0,$t3,$ta3 | ||
720 | $ST $t3,-$BNSZ($a0) | ||
721 | |||
722 | .set noreorder | ||
723 | bgtz $at,.L_bn_sub_words_loop | ||
724 | $ADDU $v0,$t9 | ||
725 | |||
726 | beqz $a3,.L_bn_sub_words_return | ||
727 | nop | ||
728 | |||
729 | .L_bn_sub_words_tail: | ||
730 | .set reorder | ||
731 | $LD $t0,0($a1) | ||
732 | $LD $ta0,0($a2) | ||
733 | subu $a3,1 | ||
734 | sltu $t8,$t0,$ta0 | ||
735 | $SUBU $ta0,$t0,$ta0 | ||
736 | $SUBU $t0,$ta0,$v0 | ||
737 | sgtu $v0,$t0,$ta0 | ||
738 | $ST $t0,0($a0) | ||
739 | $ADDU $v0,$t8 | ||
740 | beqz $a3,.L_bn_sub_words_return | ||
741 | |||
742 | $LD $t1,$BNSZ($a1) | ||
743 | subu $a3,1 | ||
744 | $LD $ta1,$BNSZ($a2) | ||
745 | sltu $t9,$t1,$ta1 | ||
746 | $SUBU $ta1,$t1,$ta1 | ||
747 | $SUBU $t1,$ta1,$v0 | ||
748 | sgtu $v0,$t1,$ta1 | ||
749 | $ST $t1,$BNSZ($a0) | ||
750 | $ADDU $v0,$t9 | ||
751 | beqz $a3,.L_bn_sub_words_return | ||
752 | |||
753 | $LD $t2,2*$BNSZ($a1) | ||
754 | $LD $ta2,2*$BNSZ($a2) | ||
755 | sltu $t8,$t2,$ta2 | ||
756 | $SUBU $ta2,$t2,$ta2 | ||
757 | $SUBU $t2,$ta2,$v0 | ||
758 | sgtu $v0,$t2,$ta2 | ||
759 | $ST $t2,2*$BNSZ($a0) | ||
760 | $ADDU $v0,$t8 | ||
761 | |||
762 | .L_bn_sub_words_return: | ||
763 | .set noreorder | ||
764 | ___ | ||
765 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
766 | $REG_L $t3,4*$SZREG($sp) | ||
767 | $REG_L $t2,3*$SZREG($sp) | ||
768 | $REG_L $t1,2*$SZREG($sp) | ||
769 | $REG_L $t0,1*$SZREG($sp) | ||
770 | $REG_L $gp,0*$SZREG($sp) | ||
771 | $PTR_ADD $sp,6*$SZREG | ||
772 | ___ | ||
773 | $code.=<<___; | ||
774 | jr $ra | ||
775 | move $a0,$v0 | ||
776 | .end bn_sub_words_internal | ||
777 | |||
778 | .align 5 | ||
779 | .globl bn_div_3_words | ||
780 | .ent bn_div_3_words | ||
781 | bn_div_3_words: | ||
782 | .set noreorder | ||
783 | move $a3,$a0 # we know that bn_div_words does not | ||
784 | # touch $a3, $ta2, $ta3 and preserves $a2 | ||
785 | # so that we can save two arguments | ||
786 | # and return address in registers | ||
787 | # instead of stack:-) | ||
788 | |||
789 | $LD $a0,($a3) | ||
790 | move $ta2,$a1 | ||
791 | bne $a0,$a2,bn_div_3_words_internal | ||
792 | $LD $a1,-$BNSZ($a3) | ||
793 | li $v0,-1 | ||
794 | jr $ra | ||
795 | move $a0,$v0 | ||
796 | .end bn_div_3_words | ||
797 | |||
798 | .align 5 | ||
799 | .ent bn_div_3_words_internal | ||
800 | bn_div_3_words_internal: | ||
801 | ___ | ||
802 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
803 | .frame $sp,6*$SZREG,$ra | ||
804 | .mask 0x8000f008,-$SZREG | ||
805 | .set noreorder | ||
806 | $PTR_SUB $sp,6*$SZREG | ||
807 | $REG_S $ra,5*$SZREG($sp) | ||
808 | $REG_S $t3,4*$SZREG($sp) | ||
809 | $REG_S $t2,3*$SZREG($sp) | ||
810 | $REG_S $t1,2*$SZREG($sp) | ||
811 | $REG_S $t0,1*$SZREG($sp) | ||
812 | $REG_S $gp,0*$SZREG($sp) | ||
813 | ___ | ||
814 | $code.=<<___; | ||
815 | .set reorder | ||
816 | move $ta3,$ra | ||
817 | bal bn_div_words_internal | ||
818 | move $ra,$ta3 | ||
819 | $MULTU $ta2,$v0 | ||
820 | $LD $t2,-2*$BNSZ($a3) | ||
821 | move $ta0,$zero | ||
822 | mfhi $t1 | ||
823 | mflo $t0 | ||
824 | sltu $t8,$t1,$a1 | ||
825 | .L_bn_div_3_words_inner_loop: | ||
826 | bnez $t8,.L_bn_div_3_words_inner_loop_done | ||
827 | sgeu $at,$t2,$t0 | ||
828 | seq $t9,$t1,$a1 | ||
829 | and $at,$t9 | ||
830 | sltu $t3,$t0,$ta2 | ||
831 | $ADDU $a1,$a2 | ||
832 | $SUBU $t1,$t3 | ||
833 | $SUBU $t0,$ta2 | ||
834 | sltu $t8,$t1,$a1 | ||
835 | sltu $ta0,$a1,$a2 | ||
836 | or $t8,$ta0 | ||
837 | .set noreorder | ||
838 | beqz $at,.L_bn_div_3_words_inner_loop | ||
839 | $SUBU $v0,1 | ||
840 | $ADDU $v0,1 | ||
841 | .set reorder | ||
842 | .L_bn_div_3_words_inner_loop_done: | ||
843 | .set noreorder | ||
844 | ___ | ||
845 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
846 | $REG_L $t3,4*$SZREG($sp) | ||
847 | $REG_L $t2,3*$SZREG($sp) | ||
848 | $REG_L $t1,2*$SZREG($sp) | ||
849 | $REG_L $t0,1*$SZREG($sp) | ||
850 | $REG_L $gp,0*$SZREG($sp) | ||
851 | $PTR_ADD $sp,6*$SZREG | ||
852 | ___ | ||
853 | $code.=<<___; | ||
854 | jr $ra | ||
855 | move $a0,$v0 | ||
856 | .end bn_div_3_words_internal | ||
857 | |||
858 | .align 5 | ||
859 | .globl bn_div_words | ||
860 | .ent bn_div_words | ||
861 | bn_div_words: | ||
862 | .set noreorder | ||
863 | bnez $a2,bn_div_words_internal | ||
864 | li $v0,-1 # I would rather signal div-by-zero | ||
865 | # which can be done with 'break 7' | ||
866 | jr $ra | ||
867 | move $a0,$v0 | ||
868 | .end bn_div_words | ||
869 | |||
870 | .align 5 | ||
871 | .ent bn_div_words_internal | ||
872 | bn_div_words_internal: | ||
873 | ___ | ||
874 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
875 | .frame $sp,6*$SZREG,$ra | ||
876 | .mask 0x8000f008,-$SZREG | ||
877 | .set noreorder | ||
878 | $PTR_SUB $sp,6*$SZREG | ||
879 | $REG_S $ra,5*$SZREG($sp) | ||
880 | $REG_S $t3,4*$SZREG($sp) | ||
881 | $REG_S $t2,3*$SZREG($sp) | ||
882 | $REG_S $t1,2*$SZREG($sp) | ||
883 | $REG_S $t0,1*$SZREG($sp) | ||
884 | $REG_S $gp,0*$SZREG($sp) | ||
885 | ___ | ||
886 | $code.=<<___; | ||
887 | move $v1,$zero | ||
888 | bltz $a2,.L_bn_div_words_body | ||
889 | move $t9,$v1 | ||
890 | $SLL $a2,1 | ||
891 | bgtz $a2,.-4 | ||
892 | addu $t9,1 | ||
893 | |||
894 | .set reorder | ||
895 | negu $t1,$t9 | ||
896 | li $t2,-1 | ||
897 | $SLL $t2,$t1 | ||
898 | and $t2,$a0 | ||
899 | $SRL $at,$a1,$t1 | ||
900 | .set noreorder | ||
901 | beqz $t2,.+12 | ||
902 | nop | ||
903 | break 6 # signal overflow | ||
904 | .set reorder | ||
905 | $SLL $a0,$t9 | ||
906 | $SLL $a1,$t9 | ||
907 | or $a0,$at | ||
908 | ___ | ||
909 | $QT=$ta0; | ||
910 | $HH=$ta1; | ||
911 | $DH=$v1; | ||
912 | $code.=<<___; | ||
913 | .L_bn_div_words_body: | ||
914 | $SRL $DH,$a2,4*$BNSZ # bits | ||
915 | sgeu $at,$a0,$a2 | ||
916 | .set noreorder | ||
917 | beqz $at,.+12 | ||
918 | nop | ||
919 | $SUBU $a0,$a2 | ||
920 | .set reorder | ||
921 | |||
922 | li $QT,-1 | ||
923 | $SRL $HH,$a0,4*$BNSZ # bits | ||
924 | $SRL $QT,4*$BNSZ # q=0xffffffff | ||
925 | beq $DH,$HH,.L_bn_div_words_skip_div1 | ||
926 | $DIVU $zero,$a0,$DH | ||
927 | mflo $QT | ||
928 | .L_bn_div_words_skip_div1: | ||
929 | $MULTU $a2,$QT | ||
930 | $SLL $t3,$a0,4*$BNSZ # bits | ||
931 | $SRL $at,$a1,4*$BNSZ # bits | ||
932 | or $t3,$at | ||
933 | mflo $t0 | ||
934 | mfhi $t1 | ||
935 | .L_bn_div_words_inner_loop1: | ||
936 | sltu $t2,$t3,$t0 | ||
937 | seq $t8,$HH,$t1 | ||
938 | sltu $at,$HH,$t1 | ||
939 | and $t2,$t8 | ||
940 | sltu $v0,$t0,$a2 | ||
941 | or $at,$t2 | ||
942 | .set noreorder | ||
943 | beqz $at,.L_bn_div_words_inner_loop1_done | ||
944 | $SUBU $t1,$v0 | ||
945 | $SUBU $t0,$a2 | ||
946 | b .L_bn_div_words_inner_loop1 | ||
947 | $SUBU $QT,1 | ||
948 | .set reorder | ||
949 | .L_bn_div_words_inner_loop1_done: | ||
950 | |||
951 | $SLL $a1,4*$BNSZ # bits | ||
952 | $SUBU $a0,$t3,$t0 | ||
953 | $SLL $v0,$QT,4*$BNSZ # bits | ||
954 | |||
955 | li $QT,-1 | ||
956 | $SRL $HH,$a0,4*$BNSZ # bits | ||
957 | $SRL $QT,4*$BNSZ # q=0xffffffff | ||
958 | beq $DH,$HH,.L_bn_div_words_skip_div2 | ||
959 | $DIVU $zero,$a0,$DH | ||
960 | mflo $QT | ||
961 | .L_bn_div_words_skip_div2: | ||
962 | $MULTU $a2,$QT | ||
963 | $SLL $t3,$a0,4*$BNSZ # bits | ||
964 | $SRL $at,$a1,4*$BNSZ # bits | ||
965 | or $t3,$at | ||
966 | mflo $t0 | ||
967 | mfhi $t1 | ||
968 | .L_bn_div_words_inner_loop2: | ||
969 | sltu $t2,$t3,$t0 | ||
970 | seq $t8,$HH,$t1 | ||
971 | sltu $at,$HH,$t1 | ||
972 | and $t2,$t8 | ||
973 | sltu $v1,$t0,$a2 | ||
974 | or $at,$t2 | ||
975 | .set noreorder | ||
976 | beqz $at,.L_bn_div_words_inner_loop2_done | ||
977 | $SUBU $t1,$v1 | ||
978 | $SUBU $t0,$a2 | ||
979 | b .L_bn_div_words_inner_loop2 | ||
980 | $SUBU $QT,1 | ||
981 | .set reorder | ||
982 | .L_bn_div_words_inner_loop2_done: | ||
983 | |||
984 | $SUBU $a0,$t3,$t0 | ||
985 | or $v0,$QT | ||
986 | $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it | ||
987 | $SRL $a2,$t9 # restore $a2 | ||
988 | |||
989 | .set noreorder | ||
990 | move $a1,$v1 | ||
991 | ___ | ||
992 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
993 | $REG_L $t3,4*$SZREG($sp) | ||
994 | $REG_L $t2,3*$SZREG($sp) | ||
995 | $REG_L $t1,2*$SZREG($sp) | ||
996 | $REG_L $t0,1*$SZREG($sp) | ||
997 | $REG_L $gp,0*$SZREG($sp) | ||
998 | $PTR_ADD $sp,6*$SZREG | ||
999 | ___ | ||
1000 | $code.=<<___; | ||
1001 | jr $ra | ||
1002 | move $a0,$v0 | ||
1003 | .end bn_div_words_internal | ||
1004 | ___ | ||
1005 | undef $HH; undef $QT; undef $DH; | ||
1006 | |||
1007 | ($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3); | ||
1008 | ($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3); | ||
1009 | |||
1010 | ($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1 | ||
1011 | ($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2 | ||
1012 | |||
1013 | ($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3); | ||
1014 | |||
1015 | $code.=<<___; | ||
1016 | |||
1017 | .align 5 | ||
1018 | .globl bn_mul_comba8 | ||
1019 | .ent bn_mul_comba8 | ||
1020 | bn_mul_comba8: | ||
1021 | .set noreorder | ||
1022 | ___ | ||
1023 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
1024 | .frame $sp,12*$SZREG,$ra | ||
1025 | .mask 0x803ff008,-$SZREG | ||
1026 | $PTR_SUB $sp,12*$SZREG | ||
1027 | $REG_S $ra,11*$SZREG($sp) | ||
1028 | $REG_S $s5,10*$SZREG($sp) | ||
1029 | $REG_S $s4,9*$SZREG($sp) | ||
1030 | $REG_S $s3,8*$SZREG($sp) | ||
1031 | $REG_S $s2,7*$SZREG($sp) | ||
1032 | $REG_S $s1,6*$SZREG($sp) | ||
1033 | $REG_S $s0,5*$SZREG($sp) | ||
1034 | $REG_S $t3,4*$SZREG($sp) | ||
1035 | $REG_S $t2,3*$SZREG($sp) | ||
1036 | $REG_S $t1,2*$SZREG($sp) | ||
1037 | $REG_S $t0,1*$SZREG($sp) | ||
1038 | $REG_S $gp,0*$SZREG($sp) | ||
1039 | ___ | ||
1040 | $code.=<<___ if ($flavour !~ /nubi/i); | ||
1041 | .frame $sp,6*$SZREG,$ra | ||
1042 | .mask 0x003f0000,-$SZREG | ||
1043 | $PTR_SUB $sp,6*$SZREG | ||
1044 | $REG_S $s5,5*$SZREG($sp) | ||
1045 | $REG_S $s4,4*$SZREG($sp) | ||
1046 | $REG_S $s3,3*$SZREG($sp) | ||
1047 | $REG_S $s2,2*$SZREG($sp) | ||
1048 | $REG_S $s1,1*$SZREG($sp) | ||
1049 | $REG_S $s0,0*$SZREG($sp) | ||
1050 | ___ | ||
1051 | $code.=<<___; | ||
1052 | |||
1053 | .set reorder | ||
1054 | $LD $a_0,0($a1) # If compiled with -mips3 option on | ||
1055 | # R5000 box assembler barks on this | ||
1056 | # line with "should not have mult/div | ||
1057 | # as last instruction in bb (R10K | ||
1058 | # bug)" warning. If anybody out there | ||
1059 | # has a clue about how to circumvent | ||
1060 | # this do send me a note. | ||
1061 | # <appro\@fy.chalmers.se> | ||
1062 | |||
1063 | $LD $b_0,0($a2) | ||
1064 | $LD $a_1,$BNSZ($a1) | ||
1065 | $LD $a_2,2*$BNSZ($a1) | ||
1066 | $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); | ||
1067 | $LD $a_3,3*$BNSZ($a1) | ||
1068 | $LD $b_1,$BNSZ($a2) | ||
1069 | $LD $b_2,2*$BNSZ($a2) | ||
1070 | $LD $b_3,3*$BNSZ($a2) | ||
1071 | mflo $c_1 | ||
1072 | mfhi $c_2 | ||
1073 | |||
1074 | $LD $a_4,4*$BNSZ($a1) | ||
1075 | $LD $a_5,5*$BNSZ($a1) | ||
1076 | $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); | ||
1077 | $LD $a_6,6*$BNSZ($a1) | ||
1078 | $LD $a_7,7*$BNSZ($a1) | ||
1079 | $LD $b_4,4*$BNSZ($a2) | ||
1080 | $LD $b_5,5*$BNSZ($a2) | ||
1081 | mflo $t_1 | ||
1082 | mfhi $t_2 | ||
1083 | $ADDU $c_2,$t_1 | ||
1084 | sltu $at,$c_2,$t_1 | ||
1085 | $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); | ||
1086 | $ADDU $c_3,$t_2,$at | ||
1087 | $LD $b_6,6*$BNSZ($a2) | ||
1088 | $LD $b_7,7*$BNSZ($a2) | ||
1089 | $ST $c_1,0($a0) # r[0]=c1; | ||
1090 | mflo $t_1 | ||
1091 | mfhi $t_2 | ||
1092 | $ADDU $c_2,$t_1 | ||
1093 | sltu $at,$c_2,$t_1 | ||
1094 | $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); | ||
1095 | $ADDU $t_2,$at | ||
1096 | $ADDU $c_3,$t_2 | ||
1097 | sltu $c_1,$c_3,$t_2 | ||
1098 | $ST $c_2,$BNSZ($a0) # r[1]=c2; | ||
1099 | |||
1100 | mflo $t_1 | ||
1101 | mfhi $t_2 | ||
1102 | $ADDU $c_3,$t_1 | ||
1103 | sltu $at,$c_3,$t_1 | ||
1104 | $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); | ||
1105 | $ADDU $t_2,$at | ||
1106 | $ADDU $c_1,$t_2 | ||
1107 | mflo $t_1 | ||
1108 | mfhi $t_2 | ||
1109 | $ADDU $c_3,$t_1 | ||
1110 | sltu $at,$c_3,$t_1 | ||
1111 | $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); | ||
1112 | $ADDU $t_2,$at | ||
1113 | $ADDU $c_1,$t_2 | ||
1114 | sltu $c_2,$c_1,$t_2 | ||
1115 | mflo $t_1 | ||
1116 | mfhi $t_2 | ||
1117 | $ADDU $c_3,$t_1 | ||
1118 | sltu $at,$c_3,$t_1 | ||
1119 | $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); | ||
1120 | $ADDU $t_2,$at | ||
1121 | $ADDU $c_1,$t_2 | ||
1122 | sltu $at,$c_1,$t_2 | ||
1123 | $ADDU $c_2,$at | ||
1124 | $ST $c_3,2*$BNSZ($a0) # r[2]=c3; | ||
1125 | |||
1126 | mflo $t_1 | ||
1127 | mfhi $t_2 | ||
1128 | $ADDU $c_1,$t_1 | ||
1129 | sltu $at,$c_1,$t_1 | ||
1130 | $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); | ||
1131 | $ADDU $t_2,$at | ||
1132 | $ADDU $c_2,$t_2 | ||
1133 | sltu $c_3,$c_2,$t_2 | ||
1134 | mflo $t_1 | ||
1135 | mfhi $t_2 | ||
1136 | $ADDU $c_1,$t_1 | ||
1137 | sltu $at,$c_1,$t_1 | ||
1138 | $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); | ||
1139 | $ADDU $t_2,$at | ||
1140 | $ADDU $c_2,$t_2 | ||
1141 | sltu $at,$c_2,$t_2 | ||
1142 | $ADDU $c_3,$at | ||
1143 | mflo $t_1 | ||
1144 | mfhi $t_2 | ||
1145 | $ADDU $c_1,$t_1 | ||
1146 | sltu $at,$c_1,$t_1 | ||
1147 | $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); | ||
1148 | $ADDU $t_2,$at | ||
1149 | $ADDU $c_2,$t_2 | ||
1150 | sltu $at,$c_2,$t_2 | ||
1151 | $ADDU $c_3,$at | ||
1152 | mflo $t_1 | ||
1153 | mfhi $t_2 | ||
1154 | $ADDU $c_1,$t_1 | ||
1155 | sltu $at,$c_1,$t_1 | ||
1156 | $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1); | ||
1157 | $ADDU $t_2,$at | ||
1158 | $ADDU $c_2,$t_2 | ||
1159 | sltu $at,$c_2,$t_2 | ||
1160 | $ADDU $c_3,$at | ||
1161 | $ST $c_1,3*$BNSZ($a0) # r[3]=c1; | ||
1162 | |||
1163 | mflo $t_1 | ||
1164 | mfhi $t_2 | ||
1165 | $ADDU $c_2,$t_1 | ||
1166 | sltu $at,$c_2,$t_1 | ||
1167 | $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); | ||
1168 | $ADDU $t_2,$at | ||
1169 | $ADDU $c_3,$t_2 | ||
1170 | sltu $c_1,$c_3,$t_2 | ||
1171 | mflo $t_1 | ||
1172 | mfhi $t_2 | ||
1173 | $ADDU $c_2,$t_1 | ||
1174 | sltu $at,$c_2,$t_1 | ||
1175 | $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); | ||
1176 | $ADDU $t_2,$at | ||
1177 | $ADDU $c_3,$t_2 | ||
1178 | sltu $at,$c_3,$t_2 | ||
1179 | $ADDU $c_1,$at | ||
1180 | mflo $t_1 | ||
1181 | mfhi $t_2 | ||
1182 | $ADDU $c_2,$t_1 | ||
1183 | sltu $at,$c_2,$t_1 | ||
1184 | $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); | ||
1185 | $ADDU $t_2,$at | ||
1186 | $ADDU $c_3,$t_2 | ||
1187 | sltu $at,$c_3,$t_2 | ||
1188 | $ADDU $c_1,$at | ||
1189 | mflo $t_1 | ||
1190 | mfhi $t_2 | ||
1191 | $ADDU $c_2,$t_1 | ||
1192 | sltu $at,$c_2,$t_1 | ||
1193 | $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1); | ||
1194 | $ADDU $t_2,$at | ||
1195 | $ADDU $c_3,$t_2 | ||
1196 | sltu $at,$c_3,$t_2 | ||
1197 | $ADDU $c_1,$at | ||
1198 | mflo $t_1 | ||
1199 | mfhi $t_2 | ||
1200 | $ADDU $c_2,$t_1 | ||
1201 | sltu $at,$c_2,$t_1 | ||
1202 | $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2); | ||
1203 | $ADDU $t_2,$at | ||
1204 | $ADDU $c_3,$t_2 | ||
1205 | sltu $at,$c_3,$t_2 | ||
1206 | $ADDU $c_1,$at | ||
1207 | $ST $c_2,4*$BNSZ($a0) # r[4]=c2; | ||
1208 | |||
1209 | mflo $t_1 | ||
1210 | mfhi $t_2 | ||
1211 | $ADDU $c_3,$t_1 | ||
1212 | sltu $at,$c_3,$t_1 | ||
1213 | $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2); | ||
1214 | $ADDU $t_2,$at | ||
1215 | $ADDU $c_1,$t_2 | ||
1216 | sltu $c_2,$c_1,$t_2 | ||
1217 | mflo $t_1 | ||
1218 | mfhi $t_2 | ||
1219 | $ADDU $c_3,$t_1 | ||
1220 | sltu $at,$c_3,$t_1 | ||
1221 | $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); | ||
1222 | $ADDU $t_2,$at | ||
1223 | $ADDU $c_1,$t_2 | ||
1224 | sltu $at,$c_1,$t_2 | ||
1225 | $ADDU $c_2,$at | ||
1226 | mflo $t_1 | ||
1227 | mfhi $t_2 | ||
1228 | $ADDU $c_3,$t_1 | ||
1229 | sltu $at,$c_3,$t_1 | ||
1230 | $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); | ||
1231 | $ADDU $t_2,$at | ||
1232 | $ADDU $c_1,$t_2 | ||
1233 | sltu $at,$c_1,$t_2 | ||
1234 | $ADDU $c_2,$at | ||
1235 | mflo $t_1 | ||
1236 | mfhi $t_2 | ||
1237 | $ADDU $c_3,$t_1 | ||
1238 | sltu $at,$c_3,$t_1 | ||
1239 | $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2); | ||
1240 | $ADDU $t_2,$at | ||
1241 | $ADDU $c_1,$t_2 | ||
1242 | sltu $at,$c_1,$t_2 | ||
1243 | $ADDU $c_2,$at | ||
1244 | mflo $t_1 | ||
1245 | mfhi $t_2 | ||
1246 | $ADDU $c_3,$t_1 | ||
1247 | sltu $at,$c_3,$t_1 | ||
1248 | $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2); | ||
1249 | $ADDU $t_2,$at | ||
1250 | $ADDU $c_1,$t_2 | ||
1251 | sltu $at,$c_1,$t_2 | ||
1252 | $ADDU $c_2,$at | ||
1253 | mflo $t_1 | ||
1254 | mfhi $t_2 | ||
1255 | $ADDU $c_3,$t_1 | ||
1256 | sltu $at,$c_3,$t_1 | ||
1257 | $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3); | ||
1258 | $ADDU $t_2,$at | ||
1259 | $ADDU $c_1,$t_2 | ||
1260 | sltu $at,$c_1,$t_2 | ||
1261 | $ADDU $c_2,$at | ||
1262 | $ST $c_3,5*$BNSZ($a0) # r[5]=c3; | ||
1263 | |||
1264 | mflo $t_1 | ||
1265 | mfhi $t_2 | ||
1266 | $ADDU $c_1,$t_1 | ||
1267 | sltu $at,$c_1,$t_1 | ||
1268 | $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3); | ||
1269 | $ADDU $t_2,$at | ||
1270 | $ADDU $c_2,$t_2 | ||
1271 | sltu $c_3,$c_2,$t_2 | ||
1272 | mflo $t_1 | ||
1273 | mfhi $t_2 | ||
1274 | $ADDU $c_1,$t_1 | ||
1275 | sltu $at,$c_1,$t_1 | ||
1276 | $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3); | ||
1277 | $ADDU $t_2,$at | ||
1278 | $ADDU $c_2,$t_2 | ||
1279 | sltu $at,$c_2,$t_2 | ||
1280 | $ADDU $c_3,$at | ||
1281 | mflo $t_1 | ||
1282 | mfhi $t_2 | ||
1283 | $ADDU $c_1,$t_1 | ||
1284 | sltu $at,$c_1,$t_1 | ||
1285 | $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); | ||
1286 | $ADDU $t_2,$at | ||
1287 | $ADDU $c_2,$t_2 | ||
1288 | sltu $at,$c_2,$t_2 | ||
1289 | $ADDU $c_3,$at | ||
1290 | mflo $t_1 | ||
1291 | mfhi $t_2 | ||
1292 | $ADDU $c_1,$t_1 | ||
1293 | sltu $at,$c_1,$t_1 | ||
1294 | $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3); | ||
1295 | $ADDU $t_2,$at | ||
1296 | $ADDU $c_2,$t_2 | ||
1297 | sltu $at,$c_2,$t_2 | ||
1298 | $ADDU $c_3,$at | ||
1299 | mflo $t_1 | ||
1300 | mfhi $t_2 | ||
1301 | $ADDU $c_1,$t_1 | ||
1302 | sltu $at,$c_1,$t_1 | ||
1303 | $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3); | ||
1304 | $ADDU $t_2,$at | ||
1305 | $ADDU $c_2,$t_2 | ||
1306 | sltu $at,$c_2,$t_2 | ||
1307 | $ADDU $c_3,$at | ||
1308 | mflo $t_1 | ||
1309 | mfhi $t_2 | ||
1310 | $ADDU $c_1,$t_1 | ||
1311 | sltu $at,$c_1,$t_1 | ||
1312 | $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3); | ||
1313 | $ADDU $t_2,$at | ||
1314 | $ADDU $c_2,$t_2 | ||
1315 | sltu $at,$c_2,$t_2 | ||
1316 | $ADDU $c_3,$at | ||
1317 | mflo $t_1 | ||
1318 | mfhi $t_2 | ||
1319 | $ADDU $c_1,$t_1 | ||
1320 | sltu $at,$c_1,$t_1 | ||
1321 | $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1); | ||
1322 | $ADDU $t_2,$at | ||
1323 | $ADDU $c_2,$t_2 | ||
1324 | sltu $at,$c_2,$t_2 | ||
1325 | $ADDU $c_3,$at | ||
1326 | $ST $c_1,6*$BNSZ($a0) # r[6]=c1; | ||
1327 | |||
1328 | mflo $t_1 | ||
1329 | mfhi $t_2 | ||
1330 | $ADDU $c_2,$t_1 | ||
1331 | sltu $at,$c_2,$t_1 | ||
1332 | $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1); | ||
1333 | $ADDU $t_2,$at | ||
1334 | $ADDU $c_3,$t_2 | ||
1335 | sltu $c_1,$c_3,$t_2 | ||
1336 | mflo $t_1 | ||
1337 | mfhi $t_2 | ||
1338 | $ADDU $c_2,$t_1 | ||
1339 | sltu $at,$c_2,$t_1 | ||
1340 | $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1); | ||
1341 | $ADDU $t_2,$at | ||
1342 | $ADDU $c_3,$t_2 | ||
1343 | sltu $at,$c_3,$t_2 | ||
1344 | $ADDU $c_1,$at | ||
1345 | mflo $t_1 | ||
1346 | mfhi $t_2 | ||
1347 | $ADDU $c_2,$t_1 | ||
1348 | sltu $at,$c_2,$t_1 | ||
1349 | $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1); | ||
1350 | $ADDU $t_2,$at | ||
1351 | $ADDU $c_3,$t_2 | ||
1352 | sltu $at,$c_3,$t_2 | ||
1353 | $ADDU $c_1,$at | ||
1354 | mflo $t_1 | ||
1355 | mfhi $t_2 | ||
1356 | $ADDU $c_2,$t_1 | ||
1357 | sltu $at,$c_2,$t_1 | ||
1358 | $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1); | ||
1359 | $ADDU $t_2,$at | ||
1360 | $ADDU $c_3,$t_2 | ||
1361 | sltu $at,$c_3,$t_2 | ||
1362 | $ADDU $c_1,$at | ||
1363 | mflo $t_1 | ||
1364 | mfhi $t_2 | ||
1365 | $ADDU $c_2,$t_1 | ||
1366 | sltu $at,$c_2,$t_1 | ||
1367 | $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1); | ||
1368 | $ADDU $t_2,$at | ||
1369 | $ADDU $c_3,$t_2 | ||
1370 | sltu $at,$c_3,$t_2 | ||
1371 | $ADDU $c_1,$at | ||
1372 | mflo $t_1 | ||
1373 | mfhi $t_2 | ||
1374 | $ADDU $c_2,$t_1 | ||
1375 | sltu $at,$c_2,$t_1 | ||
1376 | $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1); | ||
1377 | $ADDU $t_2,$at | ||
1378 | $ADDU $c_3,$t_2 | ||
1379 | sltu $at,$c_3,$t_2 | ||
1380 | $ADDU $c_1,$at | ||
1381 | mflo $t_1 | ||
1382 | mfhi $t_2 | ||
1383 | $ADDU $c_2,$t_1 | ||
1384 | sltu $at,$c_2,$t_1 | ||
1385 | $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1); | ||
1386 | $ADDU $t_2,$at | ||
1387 | $ADDU $c_3,$t_2 | ||
1388 | sltu $at,$c_3,$t_2 | ||
1389 | $ADDU $c_1,$at | ||
1390 | mflo $t_1 | ||
1391 | mfhi $t_2 | ||
1392 | $ADDU $c_2,$t_1 | ||
1393 | sltu $at,$c_2,$t_1 | ||
1394 | $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2); | ||
1395 | $ADDU $t_2,$at | ||
1396 | $ADDU $c_3,$t_2 | ||
1397 | sltu $at,$c_3,$t_2 | ||
1398 | $ADDU $c_1,$at | ||
1399 | $ST $c_2,7*$BNSZ($a0) # r[7]=c2; | ||
1400 | |||
1401 | mflo $t_1 | ||
1402 | mfhi $t_2 | ||
1403 | $ADDU $c_3,$t_1 | ||
1404 | sltu $at,$c_3,$t_1 | ||
1405 | $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2); | ||
1406 | $ADDU $t_2,$at | ||
1407 | $ADDU $c_1,$t_2 | ||
1408 | sltu $c_2,$c_1,$t_2 | ||
1409 | mflo $t_1 | ||
1410 | mfhi $t_2 | ||
1411 | $ADDU $c_3,$t_1 | ||
1412 | sltu $at,$c_3,$t_1 | ||
1413 | $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2); | ||
1414 | $ADDU $t_2,$at | ||
1415 | $ADDU $c_1,$t_2 | ||
1416 | sltu $at,$c_1,$t_2 | ||
1417 | $ADDU $c_2,$at | ||
1418 | mflo $t_1 | ||
1419 | mfhi $t_2 | ||
1420 | $ADDU $c_3,$t_1 | ||
1421 | sltu $at,$c_3,$t_1 | ||
1422 | $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2); | ||
1423 | $ADDU $t_2,$at | ||
1424 | $ADDU $c_1,$t_2 | ||
1425 | sltu $at,$c_1,$t_2 | ||
1426 | $ADDU $c_2,$at | ||
1427 | mflo $t_1 | ||
1428 | mfhi $t_2 | ||
1429 | $ADDU $c_3,$t_1 | ||
1430 | sltu $at,$c_3,$t_1 | ||
1431 | $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2); | ||
1432 | $ADDU $t_2,$at | ||
1433 | $ADDU $c_1,$t_2 | ||
1434 | sltu $at,$c_1,$t_2 | ||
1435 | $ADDU $c_2,$at | ||
1436 | mflo $t_1 | ||
1437 | mfhi $t_2 | ||
1438 | $ADDU $c_3,$t_1 | ||
1439 | sltu $at,$c_3,$t_1 | ||
1440 | $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2); | ||
1441 | $ADDU $t_2,$at | ||
1442 | $ADDU $c_1,$t_2 | ||
1443 | sltu $at,$c_1,$t_2 | ||
1444 | $ADDU $c_2,$at | ||
1445 | mflo $t_1 | ||
1446 | mfhi $t_2 | ||
1447 | $ADDU $c_3,$t_1 | ||
1448 | sltu $at,$c_3,$t_1 | ||
1449 | $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2); | ||
1450 | $ADDU $t_2,$at | ||
1451 | $ADDU $c_1,$t_2 | ||
1452 | sltu $at,$c_1,$t_2 | ||
1453 | $ADDU $c_2,$at | ||
1454 | mflo $t_1 | ||
1455 | mfhi $t_2 | ||
1456 | $ADDU $c_3,$t_1 | ||
1457 | sltu $at,$c_3,$t_1 | ||
1458 | $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3); | ||
1459 | $ADDU $t_2,$at | ||
1460 | $ADDU $c_1,$t_2 | ||
1461 | sltu $at,$c_1,$t_2 | ||
1462 | $ADDU $c_2,$at | ||
1463 | $ST $c_3,8*$BNSZ($a0) # r[8]=c3; | ||
1464 | |||
1465 | mflo $t_1 | ||
1466 | mfhi $t_2 | ||
1467 | $ADDU $c_1,$t_1 | ||
1468 | sltu $at,$c_1,$t_1 | ||
1469 | $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3); | ||
1470 | $ADDU $t_2,$at | ||
1471 | $ADDU $c_2,$t_2 | ||
1472 | sltu $c_3,$c_2,$t_2 | ||
1473 | mflo $t_1 | ||
1474 | mfhi $t_2 | ||
1475 | $ADDU $c_1,$t_1 | ||
1476 | sltu $at,$c_1,$t_1 | ||
1477 | $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3); | ||
1478 | $ADDU $t_2,$at | ||
1479 | $ADDU $c_2,$t_2 | ||
1480 | sltu $at,$c_2,$t_2 | ||
1481 | $ADDU $c_3,$at | ||
1482 | mflo $t_1 | ||
1483 | mfhi $t_2 | ||
1484 | $ADDU $c_1,$t_1 | ||
1485 | sltu $at,$c_1,$t_1 | ||
1486 | $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3); | ||
1487 | $ADDU $t_2,$at | ||
1488 | $ADDU $c_2,$t_2 | ||
1489 | sltu $at,$c_2,$t_2 | ||
1490 | $ADDU $c_3,$at | ||
1491 | mflo $t_1 | ||
1492 | mfhi $t_2 | ||
1493 | $ADDU $c_1,$t_1 | ||
1494 | sltu $at,$c_1,$t_1 | ||
1495 | $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3); | ||
1496 | $ADDU $t_2,$at | ||
1497 | $ADDU $c_2,$t_2 | ||
1498 | sltu $at,$c_2,$t_2 | ||
1499 | $ADDU $c_3,$at | ||
1500 | mflo $t_1 | ||
1501 | mfhi $t_2 | ||
1502 | $ADDU $c_1,$t_1 | ||
1503 | sltu $at,$c_1,$t_1 | ||
1504 | $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3); | ||
1505 | $ADDU $t_2,$at | ||
1506 | $ADDU $c_2,$t_2 | ||
1507 | sltu $at,$c_2,$t_2 | ||
1508 | $ADDU $c_3,$at | ||
1509 | mflo $t_1 | ||
1510 | mfhi $t_2 | ||
1511 | $ADDU $c_1,$t_1 | ||
1512 | sltu $at,$c_1,$t_1 | ||
1513 | $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1); | ||
1514 | $ADDU $t_2,$at | ||
1515 | $ADDU $c_2,$t_2 | ||
1516 | sltu $at,$c_2,$t_2 | ||
1517 | $ADDU $c_3,$at | ||
1518 | $ST $c_1,9*$BNSZ($a0) # r[9]=c1; | ||
1519 | |||
1520 | mflo $t_1 | ||
1521 | mfhi $t_2 | ||
1522 | $ADDU $c_2,$t_1 | ||
1523 | sltu $at,$c_2,$t_1 | ||
1524 | $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1); | ||
1525 | $ADDU $t_2,$at | ||
1526 | $ADDU $c_3,$t_2 | ||
1527 | sltu $c_1,$c_3,$t_2 | ||
1528 | mflo $t_1 | ||
1529 | mfhi $t_2 | ||
1530 | $ADDU $c_2,$t_1 | ||
1531 | sltu $at,$c_2,$t_1 | ||
1532 | $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1); | ||
1533 | $ADDU $t_2,$at | ||
1534 | $ADDU $c_3,$t_2 | ||
1535 | sltu $at,$c_3,$t_2 | ||
1536 | $ADDU $c_1,$at | ||
1537 | mflo $t_1 | ||
1538 | mfhi $t_2 | ||
1539 | $ADDU $c_2,$t_1 | ||
1540 | sltu $at,$c_2,$t_1 | ||
1541 | $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1); | ||
1542 | $ADDU $t_2,$at | ||
1543 | $ADDU $c_3,$t_2 | ||
1544 | sltu $at,$c_3,$t_2 | ||
1545 | $ADDU $c_1,$at | ||
1546 | mflo $t_1 | ||
1547 | mfhi $t_2 | ||
1548 | $ADDU $c_2,$t_1 | ||
1549 | sltu $at,$c_2,$t_1 | ||
1550 | $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1); | ||
1551 | $ADDU $t_2,$at | ||
1552 | $ADDU $c_3,$t_2 | ||
1553 | sltu $at,$c_3,$t_2 | ||
1554 | $ADDU $c_1,$at | ||
1555 | mflo $t_1 | ||
1556 | mfhi $t_2 | ||
1557 | $ADDU $c_2,$t_1 | ||
1558 | sltu $at,$c_2,$t_1 | ||
1559 | $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2); | ||
1560 | $ADDU $t_2,$at | ||
1561 | $ADDU $c_3,$t_2 | ||
1562 | sltu $at,$c_3,$t_2 | ||
1563 | $ADDU $c_1,$at | ||
1564 | $ST $c_2,10*$BNSZ($a0) # r[10]=c2; | ||
1565 | |||
1566 | mflo $t_1 | ||
1567 | mfhi $t_2 | ||
1568 | $ADDU $c_3,$t_1 | ||
1569 | sltu $at,$c_3,$t_1 | ||
1570 | $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2); | ||
1571 | $ADDU $t_2,$at | ||
1572 | $ADDU $c_1,$t_2 | ||
1573 | sltu $c_2,$c_1,$t_2 | ||
1574 | mflo $t_1 | ||
1575 | mfhi $t_2 | ||
1576 | $ADDU $c_3,$t_1 | ||
1577 | sltu $at,$c_3,$t_1 | ||
1578 | $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2); | ||
1579 | $ADDU $t_2,$at | ||
1580 | $ADDU $c_1,$t_2 | ||
1581 | sltu $at,$c_1,$t_2 | ||
1582 | $ADDU $c_2,$at | ||
1583 | mflo $t_1 | ||
1584 | mfhi $t_2 | ||
1585 | $ADDU $c_3,$t_1 | ||
1586 | sltu $at,$c_3,$t_1 | ||
1587 | $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2); | ||
1588 | $ADDU $t_2,$at | ||
1589 | $ADDU $c_1,$t_2 | ||
1590 | sltu $at,$c_1,$t_2 | ||
1591 | $ADDU $c_2,$at | ||
1592 | mflo $t_1 | ||
1593 | mfhi $t_2 | ||
1594 | $ADDU $c_3,$t_1 | ||
1595 | sltu $at,$c_3,$t_1 | ||
1596 | $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3); | ||
1597 | $ADDU $t_2,$at | ||
1598 | $ADDU $c_1,$t_2 | ||
1599 | sltu $at,$c_1,$t_2 | ||
1600 | $ADDU $c_2,$at | ||
1601 | $ST $c_3,11*$BNSZ($a0) # r[11]=c3; | ||
1602 | |||
1603 | mflo $t_1 | ||
1604 | mfhi $t_2 | ||
1605 | $ADDU $c_1,$t_1 | ||
1606 | sltu $at,$c_1,$t_1 | ||
1607 | $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3); | ||
1608 | $ADDU $t_2,$at | ||
1609 | $ADDU $c_2,$t_2 | ||
1610 | sltu $c_3,$c_2,$t_2 | ||
1611 | mflo $t_1 | ||
1612 | mfhi $t_2 | ||
1613 | $ADDU $c_1,$t_1 | ||
1614 | sltu $at,$c_1,$t_1 | ||
1615 | $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3); | ||
1616 | $ADDU $t_2,$at | ||
1617 | $ADDU $c_2,$t_2 | ||
1618 | sltu $at,$c_2,$t_2 | ||
1619 | $ADDU $c_3,$at | ||
1620 | mflo $t_1 | ||
1621 | mfhi $t_2 | ||
1622 | $ADDU $c_1,$t_1 | ||
1623 | sltu $at,$c_1,$t_1 | ||
1624 | $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1); | ||
1625 | $ADDU $t_2,$at | ||
1626 | $ADDU $c_2,$t_2 | ||
1627 | sltu $at,$c_2,$t_2 | ||
1628 | $ADDU $c_3,$at | ||
1629 | $ST $c_1,12*$BNSZ($a0) # r[12]=c1; | ||
1630 | |||
1631 | mflo $t_1 | ||
1632 | mfhi $t_2 | ||
1633 | $ADDU $c_2,$t_1 | ||
1634 | sltu $at,$c_2,$t_1 | ||
1635 | $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1); | ||
1636 | $ADDU $t_2,$at | ||
1637 | $ADDU $c_3,$t_2 | ||
1638 | sltu $c_1,$c_3,$t_2 | ||
1639 | mflo $t_1 | ||
1640 | mfhi $t_2 | ||
1641 | $ADDU $c_2,$t_1 | ||
1642 | sltu $at,$c_2,$t_1 | ||
1643 | $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2); | ||
1644 | $ADDU $t_2,$at | ||
1645 | $ADDU $c_3,$t_2 | ||
1646 | sltu $at,$c_3,$t_2 | ||
1647 | $ADDU $c_1,$at | ||
1648 | $ST $c_2,13*$BNSZ($a0) # r[13]=c2; | ||
1649 | |||
1650 | mflo $t_1 | ||
1651 | mfhi $t_2 | ||
1652 | $ADDU $c_3,$t_1 | ||
1653 | sltu $at,$c_3,$t_1 | ||
1654 | $ADDU $t_2,$at | ||
1655 | $ADDU $c_1,$t_2 | ||
1656 | $ST $c_3,14*$BNSZ($a0) # r[14]=c3; | ||
1657 | $ST $c_1,15*$BNSZ($a0) # r[15]=c1; | ||
1658 | |||
1659 | .set noreorder | ||
1660 | ___ | ||
1661 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
1662 | $REG_L $s5,10*$SZREG($sp) | ||
1663 | $REG_L $s4,9*$SZREG($sp) | ||
1664 | $REG_L $s3,8*$SZREG($sp) | ||
1665 | $REG_L $s2,7*$SZREG($sp) | ||
1666 | $REG_L $s1,6*$SZREG($sp) | ||
1667 | $REG_L $s0,5*$SZREG($sp) | ||
1668 | $REG_L $t3,4*$SZREG($sp) | ||
1669 | $REG_L $t2,3*$SZREG($sp) | ||
1670 | $REG_L $t1,2*$SZREG($sp) | ||
1671 | $REG_L $t0,1*$SZREG($sp) | ||
1672 | $REG_L $gp,0*$SZREG($sp) | ||
1673 | jr $ra | ||
1674 | $PTR_ADD $sp,12*$SZREG | ||
1675 | ___ | ||
1676 | $code.=<<___ if ($flavour !~ /nubi/i); | ||
1677 | $REG_L $s5,5*$SZREG($sp) | ||
1678 | $REG_L $s4,4*$SZREG($sp) | ||
1679 | $REG_L $s3,3*$SZREG($sp) | ||
1680 | $REG_L $s2,2*$SZREG($sp) | ||
1681 | $REG_L $s1,1*$SZREG($sp) | ||
1682 | $REG_L $s0,0*$SZREG($sp) | ||
1683 | jr $ra | ||
1684 | $PTR_ADD $sp,6*$SZREG | ||
1685 | ___ | ||
1686 | $code.=<<___; | ||
1687 | .end bn_mul_comba8 | ||
1688 | |||
1689 | .align 5 | ||
1690 | .globl bn_mul_comba4 | ||
1691 | .ent bn_mul_comba4 | ||
1692 | bn_mul_comba4: | ||
1693 | ___ | ||
1694 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
1695 | .frame $sp,6*$SZREG,$ra | ||
1696 | .mask 0x8000f008,-$SZREG | ||
1697 | .set noreorder | ||
1698 | $PTR_SUB $sp,6*$SZREG | ||
1699 | $REG_S $ra,5*$SZREG($sp) | ||
1700 | $REG_S $t3,4*$SZREG($sp) | ||
1701 | $REG_S $t2,3*$SZREG($sp) | ||
1702 | $REG_S $t1,2*$SZREG($sp) | ||
1703 | $REG_S $t0,1*$SZREG($sp) | ||
1704 | $REG_S $gp,0*$SZREG($sp) | ||
1705 | ___ | ||
1706 | $code.=<<___; | ||
1707 | .set reorder | ||
1708 | $LD $a_0,0($a1) | ||
1709 | $LD $b_0,0($a2) | ||
1710 | $LD $a_1,$BNSZ($a1) | ||
1711 | $LD $a_2,2*$BNSZ($a1) | ||
1712 | $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); | ||
1713 | $LD $a_3,3*$BNSZ($a1) | ||
1714 | $LD $b_1,$BNSZ($a2) | ||
1715 | $LD $b_2,2*$BNSZ($a2) | ||
1716 | $LD $b_3,3*$BNSZ($a2) | ||
1717 | mflo $c_1 | ||
1718 | mfhi $c_2 | ||
1719 | $ST $c_1,0($a0) | ||
1720 | |||
1721 | $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); | ||
1722 | mflo $t_1 | ||
1723 | mfhi $t_2 | ||
1724 | $ADDU $c_2,$t_1 | ||
1725 | sltu $at,$c_2,$t_1 | ||
1726 | $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); | ||
1727 | $ADDU $c_3,$t_2,$at | ||
1728 | mflo $t_1 | ||
1729 | mfhi $t_2 | ||
1730 | $ADDU $c_2,$t_1 | ||
1731 | sltu $at,$c_2,$t_1 | ||
1732 | $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); | ||
1733 | $ADDU $t_2,$at | ||
1734 | $ADDU $c_3,$t_2 | ||
1735 | sltu $c_1,$c_3,$t_2 | ||
1736 | $ST $c_2,$BNSZ($a0) | ||
1737 | |||
1738 | mflo $t_1 | ||
1739 | mfhi $t_2 | ||
1740 | $ADDU $c_3,$t_1 | ||
1741 | sltu $at,$c_3,$t_1 | ||
1742 | $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); | ||
1743 | $ADDU $t_2,$at | ||
1744 | $ADDU $c_1,$t_2 | ||
1745 | mflo $t_1 | ||
1746 | mfhi $t_2 | ||
1747 | $ADDU $c_3,$t_1 | ||
1748 | sltu $at,$c_3,$t_1 | ||
1749 | $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); | ||
1750 | $ADDU $t_2,$at | ||
1751 | $ADDU $c_1,$t_2 | ||
1752 | sltu $c_2,$c_1,$t_2 | ||
1753 | mflo $t_1 | ||
1754 | mfhi $t_2 | ||
1755 | $ADDU $c_3,$t_1 | ||
1756 | sltu $at,$c_3,$t_1 | ||
1757 | $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); | ||
1758 | $ADDU $t_2,$at | ||
1759 | $ADDU $c_1,$t_2 | ||
1760 | sltu $at,$c_1,$t_2 | ||
1761 | $ADDU $c_2,$at | ||
1762 | $ST $c_3,2*$BNSZ($a0) | ||
1763 | |||
1764 | mflo $t_1 | ||
1765 | mfhi $t_2 | ||
1766 | $ADDU $c_1,$t_1 | ||
1767 | sltu $at,$c_1,$t_1 | ||
1768 | $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); | ||
1769 | $ADDU $t_2,$at | ||
1770 | $ADDU $c_2,$t_2 | ||
1771 | sltu $c_3,$c_2,$t_2 | ||
1772 | mflo $t_1 | ||
1773 | mfhi $t_2 | ||
1774 | $ADDU $c_1,$t_1 | ||
1775 | sltu $at,$c_1,$t_1 | ||
1776 | $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); | ||
1777 | $ADDU $t_2,$at | ||
1778 | $ADDU $c_2,$t_2 | ||
1779 | sltu $at,$c_2,$t_2 | ||
1780 | $ADDU $c_3,$at | ||
1781 | mflo $t_1 | ||
1782 | mfhi $t_2 | ||
1783 | $ADDU $c_1,$t_1 | ||
1784 | sltu $at,$c_1,$t_1 | ||
1785 | $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); | ||
1786 | $ADDU $t_2,$at | ||
1787 | $ADDU $c_2,$t_2 | ||
1788 | sltu $at,$c_2,$t_2 | ||
1789 | $ADDU $c_3,$at | ||
1790 | mflo $t_1 | ||
1791 | mfhi $t_2 | ||
1792 | $ADDU $c_1,$t_1 | ||
1793 | sltu $at,$c_1,$t_1 | ||
1794 | $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); | ||
1795 | $ADDU $t_2,$at | ||
1796 | $ADDU $c_2,$t_2 | ||
1797 | sltu $at,$c_2,$t_2 | ||
1798 | $ADDU $c_3,$at | ||
1799 | $ST $c_1,3*$BNSZ($a0) | ||
1800 | |||
1801 | mflo $t_1 | ||
1802 | mfhi $t_2 | ||
1803 | $ADDU $c_2,$t_1 | ||
1804 | sltu $at,$c_2,$t_1 | ||
1805 | $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); | ||
1806 | $ADDU $t_2,$at | ||
1807 | $ADDU $c_3,$t_2 | ||
1808 | sltu $c_1,$c_3,$t_2 | ||
1809 | mflo $t_1 | ||
1810 | mfhi $t_2 | ||
1811 | $ADDU $c_2,$t_1 | ||
1812 | sltu $at,$c_2,$t_1 | ||
1813 | $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); | ||
1814 | $ADDU $t_2,$at | ||
1815 | $ADDU $c_3,$t_2 | ||
1816 | sltu $at,$c_3,$t_2 | ||
1817 | $ADDU $c_1,$at | ||
1818 | mflo $t_1 | ||
1819 | mfhi $t_2 | ||
1820 | $ADDU $c_2,$t_1 | ||
1821 | sltu $at,$c_2,$t_1 | ||
1822 | $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); | ||
1823 | $ADDU $t_2,$at | ||
1824 | $ADDU $c_3,$t_2 | ||
1825 | sltu $at,$c_3,$t_2 | ||
1826 | $ADDU $c_1,$at | ||
1827 | $ST $c_2,4*$BNSZ($a0) | ||
1828 | |||
1829 | mflo $t_1 | ||
1830 | mfhi $t_2 | ||
1831 | $ADDU $c_3,$t_1 | ||
1832 | sltu $at,$c_3,$t_1 | ||
1833 | $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); | ||
1834 | $ADDU $t_2,$at | ||
1835 | $ADDU $c_1,$t_2 | ||
1836 | sltu $c_2,$c_1,$t_2 | ||
1837 | mflo $t_1 | ||
1838 | mfhi $t_2 | ||
1839 | $ADDU $c_3,$t_1 | ||
1840 | sltu $at,$c_3,$t_1 | ||
1841 | $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); | ||
1842 | $ADDU $t_2,$at | ||
1843 | $ADDU $c_1,$t_2 | ||
1844 | sltu $at,$c_1,$t_2 | ||
1845 | $ADDU $c_2,$at | ||
1846 | $ST $c_3,5*$BNSZ($a0) | ||
1847 | |||
1848 | mflo $t_1 | ||
1849 | mfhi $t_2 | ||
1850 | $ADDU $c_1,$t_1 | ||
1851 | sltu $at,$c_1,$t_1 | ||
1852 | $ADDU $t_2,$at | ||
1853 | $ADDU $c_2,$t_2 | ||
1854 | $ST $c_1,6*$BNSZ($a0) | ||
1855 | $ST $c_2,7*$BNSZ($a0) | ||
1856 | |||
1857 | .set noreorder | ||
1858 | ___ | ||
1859 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
1860 | $REG_L $t3,4*$SZREG($sp) | ||
1861 | $REG_L $t2,3*$SZREG($sp) | ||
1862 | $REG_L $t1,2*$SZREG($sp) | ||
1863 | $REG_L $t0,1*$SZREG($sp) | ||
1864 | $REG_L $gp,0*$SZREG($sp) | ||
1865 | $PTR_ADD $sp,6*$SZREG | ||
1866 | ___ | ||
1867 | $code.=<<___; | ||
1868 | jr $ra | ||
1869 | nop | ||
1870 | .end bn_mul_comba4 | ||
1871 | ___ | ||
1872 | |||
# The squaring code has no b[] operand: point a[4..7] at the registers that
# held b[0..3] ($ta0-$ta3) instead of $s0/$s2/$s4/$a1 used by bn_mul_comba8.
1873 | ($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3); | ||
1874 | |||
# add_c2(hi, lo, c0, c1, c2, warm, an, bn)
#
# Appends to $code the MIPS sequence for one *doubled* partial product of a
# squaring column: (c2:c1:c0) += 2*(HI:LO), where HI:LO is the result of the
# multiplication that the previous step started.
#   $hi,$lo      scratch registers that receive HI and LO (both clobbered);
#   $c0,$c1,$c2  accumulator registers, least significant word first;
#   $warm        false: $c2 is overwritten with the carry out of $c1;
#                true:  the carries are added to the existing $c2;
#   $an,$bn      operands of the multiplication issued inside this step,
#                whose result is consumed by the following step.
# $at is used as a temporary for carries.  Nothing is returned.
#
# Declared without a prototype: the former empty prototype "()" claimed the
# sub takes no arguments and only worked because the visible callers use
# the &add_c2(...) form, which bypasses prototype checking.
1875 | sub add_c2 { | ||
1876 | my ($hi,$lo,$c0,$c1,$c2, | ||
1877 | $warm, # !$warm denotes first call with specific sequence of | ||
1878 | # $c_[XYZ] when there is no Z-carry to accumulate yet; | ||
1879 | $an,$bn # these two are arguments for multiplication which | ||
1880 | # result is used in *next* step [which is why it's | ||
1881 | # commented as "forward multiplication" below]; | ||
1882 | )=@_; | ||
# Common part: c0 += 2*lo (two adds, one carry each), at = hi + first carry,
# c1 += at, hi += second carry; the next multiplication is started early.
1883 | $code.=<<___; | ||
1884 | mflo $lo | ||
1885 | mfhi $hi | ||
1886 | $ADDU $c0,$lo | ||
1887 | sltu $at,$c0,$lo | ||
1888 | $MULTU $an,$bn # forward multiplication | ||
1889 | $ADDU $c0,$lo | ||
1890 | $ADDU $at,$hi | ||
1891 | sltu $lo,$c0,$lo | ||
1892 | $ADDU $c1,$at | ||
1893 | $ADDU $hi,$lo | ||
1894 | ___ | ||
# Cold accumulator: $c2 is initialised from the carry of "c1 += at".
1895 | $code.=<<___ if (!$warm); | ||
1896 | sltu $c2,$c1,$at | ||
1897 | $ADDU $c1,$hi | ||
1898 | sltu $hi,$c1,$hi | ||
1899 | $ADDU $c2,$hi | ||
1900 | ___ | ||
# Warm accumulator: both carries out of $c1 are added to the existing $c2.
1901 | $code.=<<___ if ($warm); | ||
1902 | sltu $at,$c1,$at | ||
1903 | $ADDU $c1,$hi | ||
1904 | $ADDU $c2,$at | ||
1905 | sltu $hi,$c1,$hi | ||
1906 | $ADDU $c2,$hi | ||
1907 | ___ | ||
1908 | } | ||
1909 | |||
1910 | $code.=<<___; | ||
1911 | |||
1912 | .align 5 | ||
1913 | .globl bn_sqr_comba8 | ||
1914 | .ent bn_sqr_comba8 | ||
1915 | bn_sqr_comba8: | ||
1916 | ___ | ||
1917 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
1918 | .frame $sp,6*$SZREG,$ra | ||
1919 | .mask 0x8000f008,-$SZREG | ||
1920 | .set noreorder | ||
1921 | $PTR_SUB $sp,6*$SZREG | ||
1922 | $REG_S $ra,5*$SZREG($sp) | ||
1923 | $REG_S $t3,4*$SZREG($sp) | ||
1924 | $REG_S $t2,3*$SZREG($sp) | ||
1925 | $REG_S $t1,2*$SZREG($sp) | ||
1926 | $REG_S $t0,1*$SZREG($sp) | ||
1927 | $REG_S $gp,0*$SZREG($sp) | ||
1928 | ___ | ||
# bn_sqr_comba8, columns 0 and 1: load a[0..3], start a0*a0, load a[4..7],
# store r[0], then accumulate 2*a0*a1 on top of the high word of a0*a0 and
# store r[1].  a2*a0 is started early for the next step.
#
# c_1 first receives the bit shifted out of HI.  The carry out of
# c_3 = t_2 + at must be added to it as well: t_2 (= 2*HI + msb(LO)) can be
# all ones while at = 1, e.g. a0 = 2^w-2, a1 = 2^(w-1)+1, and without the
# extra sltu/addu pair that carry is silently dropped.
1929 | $code.=<<___; | ||
1930 | .set reorder | ||
1931 | $LD $a_0,0($a1) | ||
1932 | $LD $a_1,$BNSZ($a1) | ||
1933 | $LD $a_2,2*$BNSZ($a1) | ||
1934 | $LD $a_3,3*$BNSZ($a1) | ||
1935 | |||
1936 | $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); | ||
1937 | $LD $a_4,4*$BNSZ($a1) | ||
1938 | $LD $a_5,5*$BNSZ($a1) | ||
1939 | $LD $a_6,6*$BNSZ($a1) | ||
1940 | $LD $a_7,7*$BNSZ($a1) | ||
1941 | mflo $c_1 | ||
1942 | mfhi $c_2 | ||
1943 | $ST $c_1,0($a0) | ||
1944 | |||
1945 | $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); | ||
1946 | mflo $t_1 | ||
1947 | mfhi $t_2 | ||
1948 | slt $c_1,$t_2,$zero | ||
1949 | $SLL $t_2,1 | ||
1950 | $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); | ||
1951 | slt $a2,$t_1,$zero | ||
1952 | $ADDU $t_2,$a2 | ||
1953 | $SLL $t_1,1 | ||
1954 | $ADDU $c_2,$t_1 | ||
1955 | sltu $at,$c_2,$t_1 | ||
1956 | $ADDU $c_3,$t_2,$at | ||
	sltu	$at,$c_3,$t_2		# carry out of c3 = t_2 + at
	$ADDU	$c_1,$at		# propagate it into c1
1957 | $ST $c_2,$BNSZ($a0) | ||
1958 | ___ | ||
1959 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, | ||
1960 | $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2); | ||
1961 | $code.=<<___; | ||
1962 | mflo $t_1 | ||
1963 | mfhi $t_2 | ||
1964 | $ADDU $c_3,$t_1 | ||
1965 | sltu $at,$c_3,$t_1 | ||
1966 | $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); | ||
1967 | $ADDU $t_2,$at | ||
1968 | $ADDU $c_1,$t_2 | ||
1969 | sltu $at,$c_1,$t_2 | ||
1970 | $ADDU $c_2,$at | ||
1971 | $ST $c_3,2*$BNSZ($a0) | ||
1972 | ___ | ||
1973 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, | ||
1974 | $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3); | ||
1975 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, | ||
1976 | $a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1); | ||
1977 | $code.=<<___; | ||
1978 | $ST $c_1,3*$BNSZ($a0) | ||
1979 | ___ | ||
1980 | &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, | ||
1981 | $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1); | ||
1982 | &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, | ||
1983 | $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1); | ||
1984 | $code.=<<___; | ||
1985 | mflo $t_1 | ||
1986 | mfhi $t_2 | ||
1987 | $ADDU $c_2,$t_1 | ||
1988 | sltu $at,$c_2,$t_1 | ||
1989 | $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2); | ||
1990 | $ADDU $t_2,$at | ||
1991 | $ADDU $c_3,$t_2 | ||
1992 | sltu $at,$c_3,$t_2 | ||
1993 | $ADDU $c_1,$at | ||
1994 | $ST $c_2,4*$BNSZ($a0) | ||
1995 | ___ | ||
1996 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, | ||
1997 | $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2); | ||
1998 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, | ||
1999 | $a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2); | ||
2000 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, | ||
2001 | $a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3); | ||
2002 | $code.=<<___; | ||
2003 | $ST $c_3,5*$BNSZ($a0) | ||
2004 | ___ | ||
2005 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, | ||
2006 | $a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3); | ||
2007 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, | ||
2008 | $a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3); | ||
2009 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, | ||
2010 | $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3); | ||
2011 | $code.=<<___; | ||
2012 | mflo $t_1 | ||
2013 | mfhi $t_2 | ||
2014 | $ADDU $c_1,$t_1 | ||
2015 | sltu $at,$c_1,$t_1 | ||
2016 | $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1); | ||
2017 | $ADDU $t_2,$at | ||
2018 | $ADDU $c_2,$t_2 | ||
2019 | sltu $at,$c_2,$t_2 | ||
2020 | $ADDU $c_3,$at | ||
2021 | $ST $c_1,6*$BNSZ($a0) | ||
2022 | ___ | ||
2023 | &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, | ||
2024 | $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1); | ||
2025 | &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, | ||
2026 | $a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1); | ||
2027 | &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, | ||
2028 | $a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1); | ||
2029 | &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, | ||
2030 | $a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2); | ||
2031 | $code.=<<___; | ||
2032 | $ST $c_2,7*$BNSZ($a0) | ||
2033 | ___ | ||
2034 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, | ||
2035 | $a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2); | ||
2036 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, | ||
2037 | $a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2); | ||
2038 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, | ||
2039 | $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2); | ||
2040 | $code.=<<___; | ||
2041 | mflo $t_1 | ||
2042 | mfhi $t_2 | ||
2043 | $ADDU $c_3,$t_1 | ||
2044 | sltu $at,$c_3,$t_1 | ||
2045 | $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3); | ||
2046 | $ADDU $t_2,$at | ||
2047 | $ADDU $c_1,$t_2 | ||
2048 | sltu $at,$c_1,$t_2 | ||
2049 | $ADDU $c_2,$at | ||
2050 | $ST $c_3,8*$BNSZ($a0) | ||
2051 | ___ | ||
2052 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, | ||
2053 | $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3); | ||
2054 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, | ||
2055 | $a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3); | ||
2056 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, | ||
2057 | $a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1); | ||
2058 | $code.=<<___; | ||
2059 | $ST $c_1,9*$BNSZ($a0) | ||
2060 | ___ | ||
2061 | &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, | ||
2062 | $a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1); | ||
2063 | &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, | ||
2064 | $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1); | ||
2065 | $code.=<<___; | ||
2066 | mflo $t_1 | ||
2067 | mfhi $t_2 | ||
2068 | $ADDU $c_2,$t_1 | ||
2069 | sltu $at,$c_2,$t_1 | ||
2070 | $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2); | ||
2071 | $ADDU $t_2,$at | ||
2072 | $ADDU $c_3,$t_2 | ||
2073 | sltu $at,$c_3,$t_2 | ||
2074 | $ADDU $c_1,$at | ||
2075 | $ST $c_2,10*$BNSZ($a0) | ||
2076 | ___ | ||
2077 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, | ||
2078 | $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2); | ||
2079 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, | ||
2080 | $a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3); | ||
2081 | $code.=<<___; | ||
2082 | $ST $c_3,11*$BNSZ($a0) | ||
2083 | ___ | ||
2084 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, | ||
2085 | $a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3); | ||
2086 | $code.=<<___; | ||
2087 | mflo $t_1 | ||
2088 | mfhi $t_2 | ||
2089 | $ADDU $c_1,$t_1 | ||
2090 | sltu $at,$c_1,$t_1 | ||
2091 | $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1); | ||
2092 | $ADDU $t_2,$at | ||
2093 | $ADDU $c_2,$t_2 | ||
2094 | sltu $at,$c_2,$t_2 | ||
2095 | $ADDU $c_3,$at | ||
2096 | $ST $c_1,12*$BNSZ($a0) | ||
2097 | ___ | ||
2098 | &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, | ||
2099 | $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2); | ||
2100 | $code.=<<___; | ||
2101 | $ST $c_2,13*$BNSZ($a0) | ||
2102 | |||
2103 | mflo $t_1 | ||
2104 | mfhi $t_2 | ||
2105 | $ADDU $c_3,$t_1 | ||
2106 | sltu $at,$c_3,$t_1 | ||
2107 | $ADDU $t_2,$at | ||
2108 | $ADDU $c_1,$t_2 | ||
2109 | $ST $c_3,14*$BNSZ($a0) | ||
2110 | $ST $c_1,15*$BNSZ($a0) | ||
2111 | |||
2112 | .set noreorder | ||
2113 | ___ | ||
2114 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
2115 | $REG_L $t3,4*$SZREG($sp) | ||
2116 | $REG_L $t2,3*$SZREG($sp) | ||
2117 | $REG_L $t1,2*$SZREG($sp) | ||
2118 | $REG_L $t0,1*$SZREG($sp) | ||
2119 | $REG_L $gp,0*$SZREG($sp) | ||
2120 | $PTR_ADD $sp,6*$SZREG | ||
2121 | ___ | ||
# Generator for bn_sqr_comba4. Each "$code.=<<___" statement appends the
# heredoc text up to the next bare "___" line to $code after interpolating
# the Perl variables in it; "#" inside those regions is part of the emitted
# text, not a Perl comment.
#
# This first heredoc emits the return sequence and .end marker that close
# bn_sqr_comba8, followed by the alignment directive, .globl/.ent header and
# entry label of bn_sqr_comba4.
2122 | $code.=<<___; | ||
2123 | jr $ra | ||
2124 | nop | ||
2125 | .end bn_sqr_comba8 | ||
2126 | |||
2127 | .align 5 | ||
2128 | .globl bn_sqr_comba4 | ||
2129 | .ent bn_sqr_comba4 | ||
2130 | bn_sqr_comba4: | ||
2131 | ___ | ||
# Prologue, emitted only when $flavour matches /nubi/i: declares a frame of
# 6*$SZREG bytes, lowers $sp by that amount and stores $ra, $t3, $t2, $t1,
# $t0 and $gp into slots 5 down to 0.
2132 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
2133 | .frame $sp,6*$SZREG,$ra | ||
2134 | .mask 0x8000f008,-$SZREG | ||
2135 | .set noreorder | ||
2136 | $PTR_SUB $sp,6*$SZREG | ||
2137 | $REG_S $ra,5*$SZREG($sp) | ||
2138 | $REG_S $t3,4*$SZREG($sp) | ||
2139 | $REG_S $t2,3*$SZREG($sp) | ||
2140 | $REG_S $t1,2*$SZREG($sp) | ||
2141 | $REG_S $t0,1*$SZREG($sp) | ||
2142 | $REG_S $gp,0*$SZREG($sp) | ||
2143 | ___ | ||
# Main body. The four input words are read from offsets 0, $BNSZ, 2*$BNSZ and
# 3*$BNSZ off $a1 into $a_0..$a_3; eight result words are written to offsets
# 0..7*$BNSZ off $a0. $c_1/$c_2/$c_3 serve as the column accumulator, and a
# register is reused for a later column once its value has been stored.
#
# This heredoc produces result words 0 and 1:
#   word 0: the low half of a[0]*a[0] goes to $c_1 and is stored; the high
#           half seeds $c_2.
#   word 1: a[0]*a[1] is doubled by hand. Each slt against $zero records the
#           top bit of one half before $SLL shifts it out: the top bit of the
#           high half lands in $c_1, the top bit of the low half is added to
#           the shifted high half via $a2. $a2 is a different Perl variable
#           from $a_2 and is used purely as scratch here.
# The a[2]*a[0] multiply is started in the middle of that sequence, after the
# first product has been read out with mflo/mfhi; its result is presumably
# consumed by the add_c2 call that follows.
2144 | $code.=<<___; | ||
2145 | .set reorder | ||
2146 | $LD $a_0,0($a1) | ||
2147 | $LD $a_1,$BNSZ($a1) | ||
2148 | $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); | ||
2149 | $LD $a_2,2*$BNSZ($a1) | ||
2150 | $LD $a_3,3*$BNSZ($a1) | ||
2151 | mflo $c_1 | ||
2152 | mfhi $c_2 | ||
2153 | $ST $c_1,0($a0) | ||
2154 | |||
2155 | $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); | ||
2156 | mflo $t_1 | ||
2157 | mfhi $t_2 | ||
2158 | slt $c_1,$t_2,$zero | ||
2159 | $SLL $t_2,1 | ||
2160 | $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); | ||
2161 | slt $a2,$t_1,$zero | ||
2162 | $ADDU $t_2,$a2 | ||
2163 | $SLL $t_1,1 | ||
2164 | $ADDU $c_2,$t_1 | ||
2165 | sltu $at,$c_2,$t_1 | ||
2166 | $ADDU $c_3,$t_2,$at | ||
2167 | $ST $c_2,$BNSZ($a0) | ||
2168 | ___ | ||
# NOTE(review): add_c2 is defined earlier in the file and is not visible here.
# From the call pattern it appears to add the pending product twice into the
# three accumulator registers passed as arguments 3-5 and then start the
# multiply named by the last two arguments, so the trailing comment on each
# call describes that *next* multiply rather than the one being accumulated.
# The sixth argument is 0 on the first call of a column and 1 on later calls.
# Confirm all of this against the sub's definition.
# The trailing comments keep the mul_add_c/mul_add_c2 notation with b[]; in
# this routine both operands are always words of a[].
2169 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, | ||
2170 | $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2); | ||
# Finish word 2: the pending product (a[1]*a[1] per the call above) is added
# once, without doubling, into $c_3/$c_1 with carries tracked through $at and
# pushed into $c_2; a[0]*a[3] is started meanwhile and $c_3 is stored as
# word 2.
2171 | $code.=<<___; | ||
2172 | mflo $t_1 | ||
2173 | mfhi $t_2 | ||
2174 | $ADDU $c_3,$t_1 | ||
2175 | sltu $at,$c_3,$t_1 | ||
2176 | $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); | ||
2177 | $ADDU $t_2,$at | ||
2178 | $ADDU $c_1,$t_2 | ||
2179 | sltu $at,$c_1,$t_2 | ||
2180 | $ADDU $c_2,$at | ||
2181 | $ST $c_3,2*$BNSZ($a0) | ||
2182 | ___ | ||
# Word 3: two add_c2 calls on the $c_1/$c_2/$c_3 rotation, one per cross
# product of this column.
2183 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, | ||
2184 | $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3); | ||
2185 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, | ||
2186 | $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1); | ||
# Store word 3 from $c_1.
2187 | $code.=<<___; | ||
2188 | $ST $c_1,3*$BNSZ($a0) | ||
2189 | ___ | ||
# Word 4: one add_c2 call on the $c_2/$c_3/$c_1 rotation, followed below by a
# single (undoubled) addition of the product started by this call.
2190 | &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, | ||
2191 | $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1); | ||
# Add the pending product once into $c_2/$c_3, carry into $c_1, start
# a[2]*a[3], and store $c_2 as word 4.
2192 | $code.=<<___; | ||
2193 | mflo $t_1 | ||
2194 | mfhi $t_2 | ||
2195 | $ADDU $c_2,$t_1 | ||
2196 | sltu $at,$c_2,$t_1 | ||
2197 | $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); | ||
2198 | $ADDU $t_2,$at | ||
2199 | $ADDU $c_3,$t_2 | ||
2200 | sltu $at,$c_3,$t_2 | ||
2201 | $ADDU $c_1,$at | ||
2202 | $ST $c_2,4*$BNSZ($a0) | ||
2203 | ___ | ||
# Word 5: a single add_c2 call on the $c_3/$c_1/$c_2 rotation; its last two
# arguments name a[3]*a[3], whose result is read back in the heredoc below.
2204 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, | ||
2205 | $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3); | ||
# Store word 5 from $c_3, then add the last product once into $c_1/$c_2 and
# store them as words 6 and 7. No carry out of $c_2 is computed here. The
# heredoc ends by switching the assembler to noreorder for the return
# sequence.
2206 | $code.=<<___; | ||
2207 | $ST $c_3,5*$BNSZ($a0) | ||
2208 | |||
2209 | mflo $t_1 | ||
2210 | mfhi $t_2 | ||
2211 | $ADDU $c_1,$t_1 | ||
2212 | sltu $at,$c_1,$t_1 | ||
2213 | $ADDU $t_2,$at | ||
2214 | $ADDU $c_2,$t_2 | ||
2215 | $ST $c_1,6*$BNSZ($a0) | ||
2216 | $ST $c_2,7*$BNSZ($a0) | ||
2217 | |||
2218 | .set noreorder | ||
2219 | ___ | ||
# Epilogue, emitted only when $flavour matches /nubi/i: reloads $t3, $t2,
# $t1, $t0 and $gp from the slots written in the prologue and releases the
# 6*$SZREG frame. $ra, although saved in the prologue, is not reloaded.
2220 | $code.=<<___ if ($flavour =~ /nubi/i); | ||
2221 | $REG_L $t3,4*$SZREG($sp) | ||
2222 | $REG_L $t2,3*$SZREG($sp) | ||
2223 | $REG_L $t1,2*$SZREG($sp) | ||
2224 | $REG_L $t0,1*$SZREG($sp) | ||
2225 | $REG_L $gp,0*$SZREG($sp) | ||
2226 | $PTR_ADD $sp,6*$SZREG | ||
2227 | ___ | ||
# Return to the caller. With noreorder in effect the explicit nop fills the
# slot after jr. The .end directive closes the function opened by .ent.
2228 | $code.=<<___; | ||
2229 | jr $ra | ||
2230 | nop | ||
2231 | .end bn_sqr_comba4 | ||
2232 | ___ | ||
# Write the assembly text accumulated in $code to STDOUT and fail loudly if
# it cannot be delivered. Output is buffered, so a full disk or a closed pipe
# may only be reported when the handle is closed; checking both calls makes
# the script die with a non-zero exit status instead of leaving a silently
# truncated file behind.
2233 | print $code or die "error writing to STDOUT: $!"; | ||
2234 | close STDOUT or die "error closing STDOUT: $!"; | ||