diff options
Diffstat (limited to '')
-rw-r--r-- | src/lib/libcrypto/bn/asm/parisc-mont.pl | 993 |
1 files changed, 993 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/asm/parisc-mont.pl b/src/lib/libcrypto/bn/asm/parisc-mont.pl new file mode 100644 index 0000000000..4a766a87fb --- /dev/null +++ b/src/lib/libcrypto/bn/asm/parisc-mont.pl | |||
@@ -0,0 +1,993 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # On PA-7100LC this module performs ~90-50% better, less for longer | ||
11 | # keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means | ||
12 | # that compiler utilized xmpyu instruction to perform 32x32=64-bit | ||
13 | # multiplication, which in turn means that "baseline" performance was | ||
14 | # optimal in respect to instruction set capabilities. Fair comparison | ||
15 | # with vendor compiler is problematic, because OpenSSL doesn't define | ||
16 | # BN_LLONG [presumably] for historical reasons, which drives compiler | ||
17 | # toward 4 times 16x16=32-bit multiplicatons [plus complementary | ||
18 | # shifts and additions] instead. This means that you should observe | ||
19 | # several times improvement over code generated by vendor compiler | ||
20 | # for PA-RISC 1.1, but the "baseline" is far from optimal. The actual | ||
21 | # improvement coefficient was never collected on PA-7100LC, or any | ||
22 | # other 1.1 CPU, because I don't have access to such machine with | ||
23 | # vendor compiler. But to give you a taste, PA-RISC 1.1 code path | ||
24 | # reportedly outperformed code generated by cc +DA1.1 +O3 by factor | ||
25 | # of ~5x on PA-8600. | ||
26 | # | ||
27 | # On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is | ||
28 | # reportedly ~2x faster than vendor compiler generated code [according | ||
29 | # to comment in pa-risc2[W].s]. Here comes a catch. Execution core of | ||
30 | # this implementation is actually 32-bit one, in the sense that it | ||
31 | # operates on 32-bit values. But pa-risc2[W].s operates on arrays of | ||
32 | # 64-bit BN_LONGs... How do they interoperate then? No problem. This | ||
33 | # module picks halves of 64-bit values in reverse order and pretends | ||
34 | # they were 32-bit BN_LONGs. But can 32-bit core compete with "pure" | ||
35 | # 64-bit code such as pa-risc2[W].s then? Well, the thing is that | ||
36 | # 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do, | ||
37 | # i.e. there is no "wider" multiplication like on most other 64-bit | ||
38 | # platforms. This means that even being effectively 32-bit, this | ||
39 | # implementation performs "64-bit" computational task in same amount | ||
40 | # of arithmetic operations, most notably multiplications. It requires | ||
41 | # more memory references, most notably to tp[num], but this doesn't | ||
42 | # seem to exhaust memory port capacity. And indeed, dedicated PA-RISC | ||
43 | # 2.0 code path, provides virtually same performance as pa-risc2[W].s: | ||
44 | # it's ~10% better for shortest key length and ~10% worse for longest | ||
45 | # one. | ||
46 | # | ||
47 | # In case it wasn't clear. The module has two distinct code paths: | ||
48 | # PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit | ||
49 | # additions and 64-bit integer loads, not to mention specific | ||
50 | # instruction scheduling. In 64-bit build naturally only 2.0 code path | ||
51 | # is assembled. In 32-bit application context both code paths are | ||
52 | # assembled, PA-RISC 2.0 CPU is detected at run-time and proper path | ||
53 | # is taken automatically. Also, in 32-bit build the module imposes | ||
54 | # couple of limitations: vector lengths has to be even and vector | ||
55 | # addresses has to be 64-bit aligned. Normally neither is a problem: | ||
56 | # most common key lengths are even and vectors are commonly malloc-ed, | ||
57 | # which ensures alignment. | ||
58 | # | ||
59 | # Special thanks to polarhome.com for providing HP-UX account on | ||
60 | # PA-RISC 1.1 machine, and to correspondent who chose to remain | ||
61 | # anonymous for testing the code on PA-RISC 2.0 machine. | ||
62 | |||
63 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
64 | |||
65 | $flavour = shift; | ||
66 | $output = shift; | ||
67 | |||
68 | open STDOUT,">$output"; | ||
69 | |||
70 | if ($flavour =~ /64/) { | ||
71 | $LEVEL ="2.0W"; | ||
72 | $SIZE_T =8; | ||
73 | $FRAME_MARKER =80; | ||
74 | $SAVED_RP =16; | ||
75 | $PUSH ="std"; | ||
76 | $PUSHMA ="std,ma"; | ||
77 | $POP ="ldd"; | ||
78 | $POPMB ="ldd,mb"; | ||
79 | $BN_SZ =$SIZE_T; | ||
80 | } else { | ||
81 | $LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0"; | ||
82 | $SIZE_T =4; | ||
83 | $FRAME_MARKER =48; | ||
84 | $SAVED_RP =20; | ||
85 | $PUSH ="stw"; | ||
86 | $PUSHMA ="stwm"; | ||
87 | $POP ="ldw"; | ||
88 | $POPMB ="ldwm"; | ||
89 | $BN_SZ =$SIZE_T; | ||
90 | if (open CONF,"<${dir}../../opensslconf.h") { | ||
91 | while(<CONF>) { | ||
92 | if (m/#\s*define\s+SIXTY_FOUR_BIT/) { | ||
93 | $BN_SZ=8; | ||
94 | $LEVEL="2.0"; | ||
95 | last; | ||
96 | } | ||
97 | } | ||
98 | close CONF; | ||
99 | } | ||
100 | } | ||
101 | |||
102 | $FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker | ||
103 | # [+ argument transfer] | ||
104 | $LOCALS=$FRAME-$FRAME_MARKER; | ||
105 | $FRAME+=32; # local variables | ||
106 | |||
107 | $tp="%r31"; | ||
108 | $ti1="%r29"; | ||
109 | $ti0="%r28"; | ||
110 | |||
111 | $rp="%r26"; | ||
112 | $ap="%r25"; | ||
113 | $bp="%r24"; | ||
114 | $np="%r23"; | ||
115 | $n0="%r22"; # passed through stack in 32-bit | ||
116 | $num="%r21"; # passed through stack in 32-bit | ||
117 | $idx="%r20"; | ||
118 | $arrsz="%r19"; | ||
119 | |||
120 | $nm1="%r7"; | ||
121 | $nm0="%r6"; | ||
122 | $ab1="%r5"; | ||
123 | $ab0="%r4"; | ||
124 | |||
125 | $fp="%r3"; | ||
126 | $hi1="%r2"; | ||
127 | $hi0="%r1"; | ||
128 | |||
129 | $xfer=$n0; # accomodates [-16..15] offset in fld[dw]s | ||
130 | |||
131 | $fm0="%fr4"; $fti=$fm0; | ||
132 | $fbi="%fr5L"; | ||
133 | $fn0="%fr5R"; | ||
134 | $fai="%fr6"; $fab0="%fr7"; $fab1="%fr8"; | ||
135 | $fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11"; | ||
136 | |||
137 | $code=<<___; | ||
138 | .LEVEL $LEVEL | ||
139 | .SPACE \$TEXT\$ | ||
140 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
141 | |||
142 | .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR | ||
143 | .ALIGN 64 | ||
144 | bn_mul_mont | ||
145 | .PROC | ||
146 | .CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6 | ||
147 | .ENTRY | ||
148 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
149 | $PUSHMA %r3,$FRAME(%sp) | ||
150 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
151 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
152 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
153 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) | ||
154 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) | ||
155 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) | ||
156 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) | ||
157 | ldo -$FRAME(%sp),$fp | ||
158 | ___ | ||
159 | $code.=<<___ if ($SIZE_T==4); | ||
160 | ldw `-$FRAME_MARKER-4`($fp),$n0 | ||
161 | ldw `-$FRAME_MARKER-8`($fp),$num | ||
162 | nop | ||
163 | nop ; alignment | ||
164 | ___ | ||
165 | $code.=<<___ if ($BN_SZ==4); | ||
166 | comiclr,<= 6,$num,%r0 ; are vectors long enough? | ||
167 | b L\$abort | ||
168 | ldi 0,%r28 ; signal "unhandled" | ||
169 | add,ev %r0,$num,$num ; is $num even? | ||
170 | b L\$abort | ||
171 | nop | ||
172 | or $ap,$np,$ti1 | ||
173 | extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned? | ||
174 | b L\$abort | ||
175 | nop | ||
176 | nop ; alignment | ||
177 | nop | ||
178 | |||
179 | fldws 0($n0),${fn0} | ||
180 | fldws,ma 4($bp),${fbi} ; bp[0] | ||
181 | ___ | ||
182 | $code.=<<___ if ($BN_SZ==8); | ||
183 | comib,> 3,$num,L\$abort ; are vectors long enough? | ||
184 | ldi 0,%r28 ; signal "unhandled" | ||
185 | addl $num,$num,$num ; I operate on 32-bit values | ||
186 | |||
187 | fldws 4($n0),${fn0} ; only low part of n0 | ||
188 | fldws 4($bp),${fbi} ; bp[0] in flipped word order | ||
189 | ___ | ||
190 | $code.=<<___; | ||
191 | fldds 0($ap),${fai} ; ap[0,1] | ||
192 | fldds 0($np),${fni} ; np[0,1] | ||
193 | |||
194 | sh2addl $num,%r0,$arrsz | ||
195 | ldi 31,$hi0 | ||
196 | ldo 36($arrsz),$hi1 ; space for tp[num+1] | ||
197 | andcm $hi1,$hi0,$hi1 ; align | ||
198 | addl $hi1,%sp,%sp | ||
199 | $PUSH $fp,-$SIZE_T(%sp) | ||
200 | |||
201 | ldo `$LOCALS+16`($fp),$xfer | ||
202 | ldo `$LOCALS+32+4`($fp),$tp | ||
203 | |||
204 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0] | ||
205 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0] | ||
206 | xmpyu ${fn0},${fab0}R,${fm0} | ||
207 | |||
208 | addl $arrsz,$ap,$ap ; point at the end | ||
209 | addl $arrsz,$np,$np | ||
210 | subi 0,$arrsz,$idx ; j=0 | ||
211 | ldo 8($idx),$idx ; j++++ | ||
212 | |||
213 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m | ||
214 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m | ||
215 | fstds ${fab0},-16($xfer) | ||
216 | fstds ${fnm0},-8($xfer) | ||
217 | fstds ${fab1},0($xfer) | ||
218 | fstds ${fnm1},8($xfer) | ||
219 | flddx $idx($ap),${fai} ; ap[2,3] | ||
220 | flddx $idx($np),${fni} ; np[2,3] | ||
221 | ___ | ||
222 | $code.=<<___ if ($BN_SZ==4); | ||
223 | mtctl $hi0,%cr11 ; $hi0 still holds 31 | ||
224 | extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0 | ||
225 | b L\$parisc11 | ||
226 | nop | ||
227 | ___ | ||
228 | $code.=<<___; # PA-RISC 2.0 code-path | ||
229 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] | ||
230 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
231 | ldd -16($xfer),$ab0 | ||
232 | fstds ${fab0},-16($xfer) | ||
233 | |||
234 | extrd,u $ab0,31,32,$hi0 | ||
235 | extrd,u $ab0,63,32,$ab0 | ||
236 | ldd -8($xfer),$nm0 | ||
237 | fstds ${fnm0},-8($xfer) | ||
238 | ldo 8($idx),$idx ; j++++ | ||
239 | addl $ab0,$nm0,$nm0 ; low part is discarded | ||
240 | extrd,u $nm0,31,32,$hi1 | ||
241 | |||
242 | L\$1st | ||
243 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] | ||
244 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m | ||
245 | ldd 0($xfer),$ab1 | ||
246 | fstds ${fab1},0($xfer) | ||
247 | addl $hi0,$ab1,$ab1 | ||
248 | extrd,u $ab1,31,32,$hi0 | ||
249 | ldd 8($xfer),$nm1 | ||
250 | fstds ${fnm1},8($xfer) | ||
251 | extrd,u $ab1,63,32,$ab1 | ||
252 | addl $hi1,$nm1,$nm1 | ||
253 | flddx $idx($ap),${fai} ; ap[j,j+1] | ||
254 | flddx $idx($np),${fni} ; np[j,j+1] | ||
255 | addl $ab1,$nm1,$nm1 | ||
256 | extrd,u $nm1,31,32,$hi1 | ||
257 | |||
258 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] | ||
259 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
260 | ldd -16($xfer),$ab0 | ||
261 | fstds ${fab0},-16($xfer) | ||
262 | addl $hi0,$ab0,$ab0 | ||
263 | extrd,u $ab0,31,32,$hi0 | ||
264 | ldd -8($xfer),$nm0 | ||
265 | fstds ${fnm0},-8($xfer) | ||
266 | extrd,u $ab0,63,32,$ab0 | ||
267 | addl $hi1,$nm0,$nm0 | ||
268 | stw $nm1,-4($tp) ; tp[j-1] | ||
269 | addl $ab0,$nm0,$nm0 | ||
270 | stw,ma $nm0,8($tp) ; tp[j-1] | ||
271 | addib,<> 8,$idx,L\$1st ; j++++ | ||
272 | extrd,u $nm0,31,32,$hi1 | ||
273 | |||
274 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] | ||
275 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m | ||
276 | ldd 0($xfer),$ab1 | ||
277 | fstds ${fab1},0($xfer) | ||
278 | addl $hi0,$ab1,$ab1 | ||
279 | extrd,u $ab1,31,32,$hi0 | ||
280 | ldd 8($xfer),$nm1 | ||
281 | fstds ${fnm1},8($xfer) | ||
282 | extrd,u $ab1,63,32,$ab1 | ||
283 | addl $hi1,$nm1,$nm1 | ||
284 | ldd -16($xfer),$ab0 | ||
285 | addl $ab1,$nm1,$nm1 | ||
286 | ldd -8($xfer),$nm0 | ||
287 | extrd,u $nm1,31,32,$hi1 | ||
288 | |||
289 | addl $hi0,$ab0,$ab0 | ||
290 | extrd,u $ab0,31,32,$hi0 | ||
291 | stw $nm1,-4($tp) ; tp[j-1] | ||
292 | extrd,u $ab0,63,32,$ab0 | ||
293 | addl $hi1,$nm0,$nm0 | ||
294 | ldd 0($xfer),$ab1 | ||
295 | addl $ab0,$nm0,$nm0 | ||
296 | ldd,mb 8($xfer),$nm1 | ||
297 | extrd,u $nm0,31,32,$hi1 | ||
298 | stw,ma $nm0,8($tp) ; tp[j-1] | ||
299 | |||
300 | ldo -1($num),$num ; i-- | ||
301 | subi 0,$arrsz,$idx ; j=0 | ||
302 | ___ | ||
303 | $code.=<<___ if ($BN_SZ==4); | ||
304 | fldws,ma 4($bp),${fbi} ; bp[1] | ||
305 | ___ | ||
306 | $code.=<<___ if ($BN_SZ==8); | ||
307 | fldws 0($bp),${fbi} ; bp[1] in flipped word order | ||
308 | ___ | ||
309 | $code.=<<___; | ||
310 | flddx $idx($ap),${fai} ; ap[0,1] | ||
311 | flddx $idx($np),${fni} ; np[0,1] | ||
312 | fldws 8($xfer),${fti}R ; tp[0] | ||
313 | addl $hi0,$ab1,$ab1 | ||
314 | extrd,u $ab1,31,32,$hi0 | ||
315 | extrd,u $ab1,63,32,$ab1 | ||
316 | ldo 8($idx),$idx ; j++++ | ||
317 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] | ||
318 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] | ||
319 | addl $hi1,$nm1,$nm1 | ||
320 | addl $ab1,$nm1,$nm1 | ||
321 | extrd,u $nm1,31,32,$hi1 | ||
322 | fstws,mb ${fab0}L,-8($xfer) ; save high part | ||
323 | stw $nm1,-4($tp) ; tp[j-1] | ||
324 | |||
325 | fcpy,sgl %fr0,${fti}L ; zero high part | ||
326 | fcpy,sgl %fr0,${fab0}L | ||
327 | addl $hi1,$hi0,$hi0 | ||
328 | extrd,u $hi0,31,32,$hi1 | ||
329 | fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double | ||
330 | fcnvxf,dbl,dbl ${fab0},${fab0} | ||
331 | stw $hi0,0($tp) | ||
332 | stw $hi1,4($tp) | ||
333 | |||
334 | fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] | ||
335 | fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int | ||
336 | xmpyu ${fn0},${fab0}R,${fm0} | ||
337 | ldo `$LOCALS+32+4`($fp),$tp | ||
338 | L\$outer | ||
339 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m | ||
340 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m | ||
341 | fstds ${fab0},-16($xfer) ; 33-bit value | ||
342 | fstds ${fnm0},-8($xfer) | ||
343 | flddx $idx($ap),${fai} ; ap[2] | ||
344 | flddx $idx($np),${fni} ; np[2] | ||
345 | ldo 8($idx),$idx ; j++++ | ||
346 | ldd -16($xfer),$ab0 ; 33-bit value | ||
347 | ldd -8($xfer),$nm0 | ||
348 | ldw 0($xfer),$hi0 ; high part | ||
349 | |||
350 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] | ||
351 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
352 | extrd,u $ab0,31,32,$ti0 ; carry bit | ||
353 | extrd,u $ab0,63,32,$ab0 | ||
354 | fstds ${fab1},0($xfer) | ||
355 | addl $ti0,$hi0,$hi0 ; account carry bit | ||
356 | fstds ${fnm1},8($xfer) | ||
357 | addl $ab0,$nm0,$nm0 ; low part is discarded | ||
358 | ldw 0($tp),$ti1 ; tp[1] | ||
359 | extrd,u $nm0,31,32,$hi1 | ||
360 | fstds ${fab0},-16($xfer) | ||
361 | fstds ${fnm0},-8($xfer) | ||
362 | |||
363 | L\$inner | ||
364 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] | ||
365 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m | ||
366 | ldd 0($xfer),$ab1 | ||
367 | fstds ${fab1},0($xfer) | ||
368 | addl $hi0,$ti1,$ti1 | ||
369 | addl $ti1,$ab1,$ab1 | ||
370 | ldd 8($xfer),$nm1 | ||
371 | fstds ${fnm1},8($xfer) | ||
372 | extrd,u $ab1,31,32,$hi0 | ||
373 | extrd,u $ab1,63,32,$ab1 | ||
374 | flddx $idx($ap),${fai} ; ap[j,j+1] | ||
375 | flddx $idx($np),${fni} ; np[j,j+1] | ||
376 | addl $hi1,$nm1,$nm1 | ||
377 | addl $ab1,$nm1,$nm1 | ||
378 | ldw 4($tp),$ti0 ; tp[j] | ||
379 | stw $nm1,-4($tp) ; tp[j-1] | ||
380 | |||
381 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] | ||
382 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
383 | ldd -16($xfer),$ab0 | ||
384 | fstds ${fab0},-16($xfer) | ||
385 | addl $hi0,$ti0,$ti0 | ||
386 | addl $ti0,$ab0,$ab0 | ||
387 | ldd -8($xfer),$nm0 | ||
388 | fstds ${fnm0},-8($xfer) | ||
389 | extrd,u $ab0,31,32,$hi0 | ||
390 | extrd,u $nm1,31,32,$hi1 | ||
391 | ldw 8($tp),$ti1 ; tp[j] | ||
392 | extrd,u $ab0,63,32,$ab0 | ||
393 | addl $hi1,$nm0,$nm0 | ||
394 | addl $ab0,$nm0,$nm0 | ||
395 | stw,ma $nm0,8($tp) ; tp[j-1] | ||
396 | addib,<> 8,$idx,L\$inner ; j++++ | ||
397 | extrd,u $nm0,31,32,$hi1 | ||
398 | |||
399 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] | ||
400 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m | ||
401 | ldd 0($xfer),$ab1 | ||
402 | fstds ${fab1},0($xfer) | ||
403 | addl $hi0,$ti1,$ti1 | ||
404 | addl $ti1,$ab1,$ab1 | ||
405 | ldd 8($xfer),$nm1 | ||
406 | fstds ${fnm1},8($xfer) | ||
407 | extrd,u $ab1,31,32,$hi0 | ||
408 | extrd,u $ab1,63,32,$ab1 | ||
409 | ldw 4($tp),$ti0 ; tp[j] | ||
410 | addl $hi1,$nm1,$nm1 | ||
411 | addl $ab1,$nm1,$nm1 | ||
412 | ldd -16($xfer),$ab0 | ||
413 | ldd -8($xfer),$nm0 | ||
414 | extrd,u $nm1,31,32,$hi1 | ||
415 | |||
416 | addl $hi0,$ab0,$ab0 | ||
417 | addl $ti0,$ab0,$ab0 | ||
418 | stw $nm1,-4($tp) ; tp[j-1] | ||
419 | extrd,u $ab0,31,32,$hi0 | ||
420 | ldw 8($tp),$ti1 ; tp[j] | ||
421 | extrd,u $ab0,63,32,$ab0 | ||
422 | addl $hi1,$nm0,$nm0 | ||
423 | ldd 0($xfer),$ab1 | ||
424 | addl $ab0,$nm0,$nm0 | ||
425 | ldd,mb 8($xfer),$nm1 | ||
426 | extrd,u $nm0,31,32,$hi1 | ||
427 | stw,ma $nm0,8($tp) ; tp[j-1] | ||
428 | |||
429 | addib,= -1,$num,L\$outerdone ; i-- | ||
430 | subi 0,$arrsz,$idx ; j=0 | ||
431 | ___ | ||
432 | $code.=<<___ if ($BN_SZ==4); | ||
433 | fldws,ma 4($bp),${fbi} ; bp[i] | ||
434 | ___ | ||
435 | $code.=<<___ if ($BN_SZ==8); | ||
436 | ldi 12,$ti0 ; bp[i] in flipped word order | ||
437 | addl,ev %r0,$num,$num | ||
438 | ldi -4,$ti0 | ||
439 | addl $ti0,$bp,$bp | ||
440 | fldws 0($bp),${fbi} | ||
441 | ___ | ||
442 | $code.=<<___; | ||
443 | flddx $idx($ap),${fai} ; ap[0] | ||
444 | addl $hi0,$ab1,$ab1 | ||
445 | flddx $idx($np),${fni} ; np[0] | ||
446 | fldws 8($xfer),${fti}R ; tp[0] | ||
447 | addl $ti1,$ab1,$ab1 | ||
448 | extrd,u $ab1,31,32,$hi0 | ||
449 | extrd,u $ab1,63,32,$ab1 | ||
450 | |||
451 | ldo 8($idx),$idx ; j++++ | ||
452 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] | ||
453 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] | ||
454 | ldw 4($tp),$ti0 ; tp[j] | ||
455 | |||
456 | addl $hi1,$nm1,$nm1 | ||
457 | fstws,mb ${fab0}L,-8($xfer) ; save high part | ||
458 | addl $ab1,$nm1,$nm1 | ||
459 | extrd,u $nm1,31,32,$hi1 | ||
460 | fcpy,sgl %fr0,${fti}L ; zero high part | ||
461 | fcpy,sgl %fr0,${fab0}L | ||
462 | stw $nm1,-4($tp) ; tp[j-1] | ||
463 | |||
464 | fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double | ||
465 | fcnvxf,dbl,dbl ${fab0},${fab0} | ||
466 | addl $hi1,$hi0,$hi0 | ||
467 | fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] | ||
468 | addl $ti0,$hi0,$hi0 | ||
469 | extrd,u $hi0,31,32,$hi1 | ||
470 | fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int | ||
471 | stw $hi0,0($tp) | ||
472 | stw $hi1,4($tp) | ||
473 | xmpyu ${fn0},${fab0}R,${fm0} | ||
474 | |||
475 | b L\$outer | ||
476 | ldo `$LOCALS+32+4`($fp),$tp | ||
477 | |||
478 | L\$outerdone | ||
479 | addl $hi0,$ab1,$ab1 | ||
480 | addl $ti1,$ab1,$ab1 | ||
481 | extrd,u $ab1,31,32,$hi0 | ||
482 | extrd,u $ab1,63,32,$ab1 | ||
483 | |||
484 | ldw 4($tp),$ti0 ; tp[j] | ||
485 | |||
486 | addl $hi1,$nm1,$nm1 | ||
487 | addl $ab1,$nm1,$nm1 | ||
488 | extrd,u $nm1,31,32,$hi1 | ||
489 | stw $nm1,-4($tp) ; tp[j-1] | ||
490 | |||
491 | addl $hi1,$hi0,$hi0 | ||
492 | addl $ti0,$hi0,$hi0 | ||
493 | extrd,u $hi0,31,32,$hi1 | ||
494 | stw $hi0,0($tp) | ||
495 | stw $hi1,4($tp) | ||
496 | |||
497 | ldo `$LOCALS+32`($fp),$tp | ||
498 | sub %r0,%r0,%r0 ; clear borrow | ||
499 | ___ | ||
500 | $code.=<<___ if ($BN_SZ==4); | ||
501 | ldws,ma 4($tp),$ti0 | ||
502 | extru,= $rp,31,3,%r0 ; is rp 64-bit aligned? | ||
503 | b L\$sub_pa11 | ||
504 | addl $tp,$arrsz,$tp | ||
505 | L\$sub | ||
506 | ldwx $idx($np),$hi0 | ||
507 | subb $ti0,$hi0,$hi1 | ||
508 | ldwx $idx($tp),$ti0 | ||
509 | addib,<> 4,$idx,L\$sub | ||
510 | stws,ma $hi1,4($rp) | ||
511 | |||
512 | subb $ti0,%r0,$hi1 | ||
513 | ldo -4($tp),$tp | ||
514 | ___ | ||
515 | $code.=<<___ if ($BN_SZ==8); | ||
516 | ldd,ma 8($tp),$ti0 | ||
517 | L\$sub | ||
518 | ldd $idx($np),$hi0 | ||
519 | shrpd $ti0,$ti0,32,$ti0 ; flip word order | ||
520 | std $ti0,-8($tp) ; save flipped value | ||
521 | sub,db $ti0,$hi0,$hi1 | ||
522 | ldd,ma 8($tp),$ti0 | ||
523 | addib,<> 8,$idx,L\$sub | ||
524 | std,ma $hi1,8($rp) | ||
525 | |||
526 | extrd,u $ti0,31,32,$ti0 ; carry in flipped word order | ||
527 | sub,db $ti0,%r0,$hi1 | ||
528 | ldo -8($tp),$tp | ||
529 | ___ | ||
530 | $code.=<<___; | ||
531 | and $tp,$hi1,$ap | ||
532 | andcm $rp,$hi1,$bp | ||
533 | or $ap,$bp,$np | ||
534 | |||
535 | sub $rp,$arrsz,$rp ; rewind rp | ||
536 | subi 0,$arrsz,$idx | ||
537 | ldo `$LOCALS+32`($fp),$tp | ||
538 | L\$copy | ||
539 | ldd $idx($np),$hi0 | ||
540 | std,ma %r0,8($tp) | ||
541 | addib,<> 8,$idx,.-8 ; L\$copy | ||
542 | std,ma $hi0,8($rp) | ||
543 | ___ | ||
544 | |||
545 | if ($BN_SZ==4) { # PA-RISC 1.1 code-path | ||
546 | $ablo=$ab0; | ||
547 | $abhi=$ab1; | ||
548 | $nmlo0=$nm0; | ||
549 | $nmhi0=$nm1; | ||
550 | $nmlo1="%r9"; | ||
551 | $nmhi1="%r8"; | ||
552 | |||
553 | $code.=<<___; | ||
554 | b L\$done | ||
555 | nop | ||
556 | |||
557 | .ALIGN 8 | ||
558 | L\$parisc11 | ||
559 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] | ||
560 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
561 | ldw -12($xfer),$ablo | ||
562 | ldw -16($xfer),$hi0 | ||
563 | ldw -4($xfer),$nmlo0 | ||
564 | ldw -8($xfer),$nmhi0 | ||
565 | fstds ${fab0},-16($xfer) | ||
566 | fstds ${fnm0},-8($xfer) | ||
567 | |||
568 | ldo 8($idx),$idx ; j++++ | ||
569 | add $ablo,$nmlo0,$nmlo0 ; discarded | ||
570 | addc %r0,$nmhi0,$hi1 | ||
571 | ldw 4($xfer),$ablo | ||
572 | ldw 0($xfer),$abhi | ||
573 | nop | ||
574 | |||
575 | L\$1st_pa11 | ||
576 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] | ||
577 | flddx $idx($ap),${fai} ; ap[j,j+1] | ||
578 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m | ||
579 | flddx $idx($np),${fni} ; np[j,j+1] | ||
580 | add $hi0,$ablo,$ablo | ||
581 | ldw 12($xfer),$nmlo1 | ||
582 | addc %r0,$abhi,$hi0 | ||
583 | ldw 8($xfer),$nmhi1 | ||
584 | add $ablo,$nmlo1,$nmlo1 | ||
585 | fstds ${fab1},0($xfer) | ||
586 | addc %r0,$nmhi1,$nmhi1 | ||
587 | fstds ${fnm1},8($xfer) | ||
588 | add $hi1,$nmlo1,$nmlo1 | ||
589 | ldw -12($xfer),$ablo | ||
590 | addc %r0,$nmhi1,$hi1 | ||
591 | ldw -16($xfer),$abhi | ||
592 | |||
593 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] | ||
594 | ldw -4($xfer),$nmlo0 | ||
595 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
596 | ldw -8($xfer),$nmhi0 | ||
597 | add $hi0,$ablo,$ablo | ||
598 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
599 | addc %r0,$abhi,$hi0 | ||
600 | fstds ${fab0},-16($xfer) | ||
601 | add $ablo,$nmlo0,$nmlo0 | ||
602 | fstds ${fnm0},-8($xfer) | ||
603 | addc %r0,$nmhi0,$nmhi0 | ||
604 | ldw 0($xfer),$abhi | ||
605 | add $hi1,$nmlo0,$nmlo0 | ||
606 | ldw 4($xfer),$ablo | ||
607 | stws,ma $nmlo0,8($tp) ; tp[j-1] | ||
608 | addib,<> 8,$idx,L\$1st_pa11 ; j++++ | ||
609 | addc %r0,$nmhi0,$hi1 | ||
610 | |||
611 | ldw 8($xfer),$nmhi1 | ||
612 | ldw 12($xfer),$nmlo1 | ||
613 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] | ||
614 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m | ||
615 | add $hi0,$ablo,$ablo | ||
616 | fstds ${fab1},0($xfer) | ||
617 | addc %r0,$abhi,$hi0 | ||
618 | fstds ${fnm1},8($xfer) | ||
619 | add $ablo,$nmlo1,$nmlo1 | ||
620 | ldw -16($xfer),$abhi | ||
621 | addc %r0,$nmhi1,$nmhi1 | ||
622 | ldw -12($xfer),$ablo | ||
623 | add $hi1,$nmlo1,$nmlo1 | ||
624 | ldw -8($xfer),$nmhi0 | ||
625 | addc %r0,$nmhi1,$hi1 | ||
626 | ldw -4($xfer),$nmlo0 | ||
627 | |||
628 | add $hi0,$ablo,$ablo | ||
629 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
630 | addc %r0,$abhi,$hi0 | ||
631 | ldw 0($xfer),$abhi | ||
632 | add $ablo,$nmlo0,$nmlo0 | ||
633 | ldw 4($xfer),$ablo | ||
634 | addc %r0,$nmhi0,$nmhi0 | ||
635 | ldws,mb 8($xfer),$nmhi1 | ||
636 | add $hi1,$nmlo0,$nmlo0 | ||
637 | ldw 4($xfer),$nmlo1 | ||
638 | addc %r0,$nmhi0,$hi1 | ||
639 | stws,ma $nmlo0,8($tp) ; tp[j-1] | ||
640 | |||
641 | ldo -1($num),$num ; i-- | ||
642 | subi 0,$arrsz,$idx ; j=0 | ||
643 | |||
644 | fldws,ma 4($bp),${fbi} ; bp[1] | ||
645 | flddx $idx($ap),${fai} ; ap[0,1] | ||
646 | flddx $idx($np),${fni} ; np[0,1] | ||
647 | fldws 8($xfer),${fti}R ; tp[0] | ||
648 | add $hi0,$ablo,$ablo | ||
649 | addc %r0,$abhi,$hi0 | ||
650 | ldo 8($idx),$idx ; j++++ | ||
651 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] | ||
652 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] | ||
653 | add $hi1,$nmlo1,$nmlo1 | ||
654 | addc %r0,$nmhi1,$nmhi1 | ||
655 | add $ablo,$nmlo1,$nmlo1 | ||
656 | addc %r0,$nmhi1,$hi1 | ||
657 | fstws,mb ${fab0}L,-8($xfer) ; save high part | ||
658 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
659 | |||
660 | fcpy,sgl %fr0,${fti}L ; zero high part | ||
661 | fcpy,sgl %fr0,${fab0}L | ||
662 | add $hi1,$hi0,$hi0 | ||
663 | addc %r0,%r0,$hi1 | ||
664 | fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double | ||
665 | fcnvxf,dbl,dbl ${fab0},${fab0} | ||
666 | stw $hi0,0($tp) | ||
667 | stw $hi1,4($tp) | ||
668 | |||
669 | fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] | ||
670 | fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int | ||
671 | xmpyu ${fn0},${fab0}R,${fm0} | ||
672 | ldo `$LOCALS+32+4`($fp),$tp | ||
673 | L\$outer_pa11 | ||
674 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m | ||
675 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m | ||
676 | fstds ${fab0},-16($xfer) ; 33-bit value | ||
677 | fstds ${fnm0},-8($xfer) | ||
678 | flddx $idx($ap),${fai} ; ap[2,3] | ||
679 | flddx $idx($np),${fni} ; np[2,3] | ||
680 | ldw -16($xfer),$abhi ; carry bit actually | ||
681 | ldo 8($idx),$idx ; j++++ | ||
682 | ldw -12($xfer),$ablo | ||
683 | ldw -8($xfer),$nmhi0 | ||
684 | ldw -4($xfer),$nmlo0 | ||
685 | ldw 0($xfer),$hi0 ; high part | ||
686 | |||
687 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] | ||
688 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
689 | fstds ${fab1},0($xfer) | ||
690 | addl $abhi,$hi0,$hi0 ; account carry bit | ||
691 | fstds ${fnm1},8($xfer) | ||
692 | add $ablo,$nmlo0,$nmlo0 ; discarded | ||
693 | ldw 0($tp),$ti1 ; tp[1] | ||
694 | addc %r0,$nmhi0,$hi1 | ||
695 | fstds ${fab0},-16($xfer) | ||
696 | fstds ${fnm0},-8($xfer) | ||
697 | ldw 4($xfer),$ablo | ||
698 | ldw 0($xfer),$abhi | ||
699 | |||
700 | L\$inner_pa11 | ||
701 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] | ||
702 | flddx $idx($ap),${fai} ; ap[j,j+1] | ||
703 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m | ||
704 | flddx $idx($np),${fni} ; np[j,j+1] | ||
705 | add $hi0,$ablo,$ablo | ||
706 | ldw 4($tp),$ti0 ; tp[j] | ||
707 | addc %r0,$abhi,$abhi | ||
708 | ldw 12($xfer),$nmlo1 | ||
709 | add $ti1,$ablo,$ablo | ||
710 | ldw 8($xfer),$nmhi1 | ||
711 | addc %r0,$abhi,$hi0 | ||
712 | fstds ${fab1},0($xfer) | ||
713 | add $ablo,$nmlo1,$nmlo1 | ||
714 | fstds ${fnm1},8($xfer) | ||
715 | addc %r0,$nmhi1,$nmhi1 | ||
716 | ldw -12($xfer),$ablo | ||
717 | add $hi1,$nmlo1,$nmlo1 | ||
718 | ldw -16($xfer),$abhi | ||
719 | addc %r0,$nmhi1,$hi1 | ||
720 | |||
721 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] | ||
722 | ldw 8($tp),$ti1 ; tp[j] | ||
723 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m | ||
724 | ldw -4($xfer),$nmlo0 | ||
725 | add $hi0,$ablo,$ablo | ||
726 | ldw -8($xfer),$nmhi0 | ||
727 | addc %r0,$abhi,$abhi | ||
728 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
729 | add $ti0,$ablo,$ablo | ||
730 | fstds ${fab0},-16($xfer) | ||
731 | addc %r0,$abhi,$hi0 | ||
732 | fstds ${fnm0},-8($xfer) | ||
733 | add $ablo,$nmlo0,$nmlo0 | ||
734 | ldw 4($xfer),$ablo | ||
735 | addc %r0,$nmhi0,$nmhi0 | ||
736 | ldw 0($xfer),$abhi | ||
737 | add $hi1,$nmlo0,$nmlo0 | ||
738 | stws,ma $nmlo0,8($tp) ; tp[j-1] | ||
739 | addib,<> 8,$idx,L\$inner_pa11 ; j++++ | ||
740 | addc %r0,$nmhi0,$hi1 | ||
741 | |||
742 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] | ||
743 | ldw 12($xfer),$nmlo1 | ||
744 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m | ||
745 | ldw 8($xfer),$nmhi1 | ||
746 | add $hi0,$ablo,$ablo | ||
747 | ldw 4($tp),$ti0 ; tp[j] | ||
748 | addc %r0,$abhi,$abhi | ||
749 | fstds ${fab1},0($xfer) | ||
750 | add $ti1,$ablo,$ablo | ||
751 | fstds ${fnm1},8($xfer) | ||
752 | addc %r0,$abhi,$hi0 | ||
753 | ldw -16($xfer),$abhi | ||
754 | add $ablo,$nmlo1,$nmlo1 | ||
755 | ldw -12($xfer),$ablo | ||
756 | addc %r0,$nmhi1,$nmhi1 | ||
757 | ldw -8($xfer),$nmhi0 | ||
758 | add $hi1,$nmlo1,$nmlo1 | ||
759 | ldw -4($xfer),$nmlo0 | ||
760 | addc %r0,$nmhi1,$hi1 | ||
761 | |||
762 | add $hi0,$ablo,$ablo | ||
763 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
764 | addc %r0,$abhi,$abhi | ||
765 | add $ti0,$ablo,$ablo | ||
766 | ldw 8($tp),$ti1 ; tp[j] | ||
767 | addc %r0,$abhi,$hi0 | ||
768 | ldw 0($xfer),$abhi | ||
769 | add $ablo,$nmlo0,$nmlo0 | ||
770 | ldw 4($xfer),$ablo | ||
771 | addc %r0,$nmhi0,$nmhi0 | ||
772 | ldws,mb 8($xfer),$nmhi1 | ||
773 | add $hi1,$nmlo0,$nmlo0 | ||
774 | ldw 4($xfer),$nmlo1 | ||
775 | addc %r0,$nmhi0,$hi1 | ||
776 | stws,ma $nmlo0,8($tp) ; tp[j-1] | ||
777 | |||
778 | addib,= -1,$num,L\$outerdone_pa11; i-- | ||
779 | subi 0,$arrsz,$idx ; j=0 | ||
780 | |||
781 | fldws,ma 4($bp),${fbi} ; bp[i] | ||
782 | flddx $idx($ap),${fai} ; ap[0] | ||
783 | add $hi0,$ablo,$ablo | ||
784 | addc %r0,$abhi,$abhi | ||
785 | flddx $idx($np),${fni} ; np[0] | ||
786 | fldws 8($xfer),${fti}R ; tp[0] | ||
787 | add $ti1,$ablo,$ablo | ||
788 | addc %r0,$abhi,$hi0 | ||
789 | |||
790 | ldo 8($idx),$idx ; j++++ | ||
791 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] | ||
792 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] | ||
793 | ldw 4($tp),$ti0 ; tp[j] | ||
794 | |||
795 | add $hi1,$nmlo1,$nmlo1 | ||
796 | addc %r0,$nmhi1,$nmhi1 | ||
797 | fstws,mb ${fab0}L,-8($xfer) ; save high part | ||
798 | add $ablo,$nmlo1,$nmlo1 | ||
799 | addc %r0,$nmhi1,$hi1 | ||
800 | fcpy,sgl %fr0,${fti}L ; zero high part | ||
801 | fcpy,sgl %fr0,${fab0}L | ||
802 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
803 | |||
804 | fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double | ||
805 | fcnvxf,dbl,dbl ${fab0},${fab0} | ||
806 | add $hi1,$hi0,$hi0 | ||
807 | addc %r0,%r0,$hi1 | ||
808 | fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] | ||
809 | add $ti0,$hi0,$hi0 | ||
810 | addc %r0,$hi1,$hi1 | ||
811 | fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int | ||
812 | stw $hi0,0($tp) | ||
813 | stw $hi1,4($tp) | ||
814 | xmpyu ${fn0},${fab0}R,${fm0} | ||
815 | |||
816 | b L\$outer_pa11 | ||
817 | ldo `$LOCALS+32+4`($fp),$tp | ||
818 | |||
819 | L\$outerdone_pa11 | ||
820 | add $hi0,$ablo,$ablo | ||
821 | addc %r0,$abhi,$abhi | ||
822 | add $ti1,$ablo,$ablo | ||
823 | addc %r0,$abhi,$hi0 | ||
824 | |||
825 | ldw 4($tp),$ti0 ; tp[j] | ||
826 | |||
827 | add $hi1,$nmlo1,$nmlo1 | ||
828 | addc %r0,$nmhi1,$nmhi1 | ||
829 | add $ablo,$nmlo1,$nmlo1 | ||
830 | addc %r0,$nmhi1,$hi1 | ||
831 | stw $nmlo1,-4($tp) ; tp[j-1] | ||
832 | |||
833 | add $hi1,$hi0,$hi0 | ||
834 | addc %r0,%r0,$hi1 | ||
835 | add $ti0,$hi0,$hi0 | ||
836 | addc %r0,$hi1,$hi1 | ||
837 | stw $hi0,0($tp) | ||
838 | stw $hi1,4($tp) | ||
839 | |||
840 | ldo `$LOCALS+32+4`($fp),$tp | ||
841 | sub %r0,%r0,%r0 ; clear borrow | ||
842 | ldw -4($tp),$ti0 | ||
843 | addl $tp,$arrsz,$tp | ||
844 | L\$sub_pa11 | ||
845 | ldwx $idx($np),$hi0 | ||
846 | subb $ti0,$hi0,$hi1 | ||
847 | ldwx $idx($tp),$ti0 | ||
848 | addib,<> 4,$idx,L\$sub_pa11 | ||
849 | stws,ma $hi1,4($rp) | ||
850 | |||
851 | subb $ti0,%r0,$hi1 | ||
852 | ldo -4($tp),$tp | ||
853 | and $tp,$hi1,$ap | ||
854 | andcm $rp,$hi1,$bp | ||
855 | or $ap,$bp,$np | ||
856 | |||
857 | sub $rp,$arrsz,$rp ; rewind rp | ||
858 | subi 0,$arrsz,$idx | ||
859 | ldo `$LOCALS+32`($fp),$tp | ||
860 | L\$copy_pa11 | ||
861 | ldwx $idx($np),$hi0 | ||
862 | stws,ma %r0,4($tp) | ||
863 | addib,<> 4,$idx,L\$copy_pa11 | ||
864 | stws,ma $hi0,4($rp) | ||
865 | |||
866 | nop ; alignment | ||
867 | L\$done | ||
868 | ___ | ||
869 | } | ||
870 | |||
871 | $code.=<<___; | ||
872 | ldi 1,%r28 ; signal "handled" | ||
873 | ldo $FRAME($fp),%sp ; destroy tp[num+1] | ||
874 | |||
875 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue | ||
876 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
877 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
878 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
879 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 | ||
880 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 | ||
881 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 | ||
882 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 | ||
883 | L\$abort | ||
884 | bv (%r2) | ||
885 | .EXIT | ||
886 | $POPMB -$FRAME(%sp),%r3 | ||
887 | .PROCEND | ||
888 | .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" | ||
889 | ___ | ||
890 | |||
891 | # Explicitly encode PA-RISC 2.0 instructions used in this module, so | ||
892 | # that it can be compiled with .LEVEL 1.0. It should be noted that I | ||
893 | # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 | ||
894 | # directive... | ||
895 | |||
896 | my $ldd = sub { | ||
897 | my ($mod,$args) = @_; | ||
898 | my $orig = "ldd$mod\t$args"; | ||
899 | |||
900 | if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4 | ||
901 | { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3; | ||
902 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
903 | } | ||
904 | elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5 | ||
905 | { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3; | ||
906 | $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset | ||
907 | $opcode|=(1<<5) if ($mod =~ /^,m/); | ||
908 | $opcode|=(1<<13) if ($mod =~ /^,mb/); | ||
909 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
910 | } | ||
911 | else { "\t".$orig; } | ||
912 | }; | ||
913 | |||
914 | my $std = sub { | ||
915 | my ($mod,$args) = @_; | ||
916 | my $orig = "std$mod\t$args"; | ||
917 | |||
918 | if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6 | ||
919 | { my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6); | ||
920 | $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset | ||
921 | $opcode|=(1<<5) if ($mod =~ /^,m/); | ||
922 | $opcode|=(1<<13) if ($mod =~ /^,mb/); | ||
923 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
924 | } | ||
925 | else { "\t".$orig; } | ||
926 | }; | ||
927 | |||
928 | my $extrd = sub { | ||
929 | my ($mod,$args) = @_; | ||
930 | my $orig = "extrd$mod\t$args"; | ||
931 | |||
932 | # I only have ",u" completer, it's implicitly encoded... | ||
933 | if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 | ||
934 | { my $opcode=(0x36<<26)|($1<<21)|($4<<16); | ||
935 | my $len=32-$3; | ||
936 | $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos | ||
937 | $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len | ||
938 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
939 | } | ||
940 | elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 | ||
941 | { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); | ||
942 | my $len=32-$2; | ||
943 | $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len | ||
944 | $opcode |= (1<<13) if ($mod =~ /,\**=/); | ||
945 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
946 | } | ||
947 | else { "\t".$orig; } | ||
948 | }; | ||
949 | |||
950 | my $shrpd = sub { | ||
951 | my ($mod,$args) = @_; | ||
952 | my $orig = "shrpd$mod\t$args"; | ||
953 | |||
954 | if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 | ||
955 | { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; | ||
956 | my $cpos=63-$3; | ||
957 | $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa | ||
958 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; | ||
959 | } | ||
960 | else { "\t".$orig; } | ||
961 | }; | ||
962 | |||
963 | my $sub = sub { | ||
964 | my ($mod,$args) = @_; | ||
965 | my $orig = "sub$mod\t$args"; | ||
966 | |||
967 | if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) { | ||
968 | my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3; | ||
969 | $opcode|=(1<<10); # e1 | ||
970 | $opcode|=(1<<8); # e2 | ||
971 | $opcode|=(1<<5); # d | ||
972 | sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig | ||
973 | } | ||
974 | else { "\t".$orig; } | ||
975 | }; | ||
976 | |||
977 | sub assemble { | ||
978 | my ($mnemonic,$mod,$args)=@_; | ||
979 | my $opcode = eval("\$$mnemonic"); | ||
980 | |||
981 | ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; | ||
982 | } | ||
983 | |||
984 | foreach (split("\n",$code)) { | ||
985 | s/\`([^\`]*)\`/eval $1/ge; | ||
986 | # flip word order in 64-bit mode... | ||
987 | s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8); | ||
988 | # assemble 2.0 instructions in 32-bit mode... | ||
989 | s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4); | ||
990 | |||
991 | print $_,"\n"; | ||
992 | } | ||
993 | close STDOUT; | ||