diff options
Diffstat (limited to '')
-rw-r--r-- | src/lib/libcrypto/bn/asm/ppc64-mont.pl | 338 |
1 files changed, 254 insertions, 84 deletions
diff --git a/src/lib/libcrypto/bn/asm/ppc64-mont.pl b/src/lib/libcrypto/bn/asm/ppc64-mont.pl index 3449b35855..a14e769ad0 100644 --- a/src/lib/libcrypto/bn/asm/ppc64-mont.pl +++ b/src/lib/libcrypto/bn/asm/ppc64-mont.pl | |||
@@ -45,23 +45,40 @@ | |||
45 | # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive | 45 | # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive |
46 | # in absolute terms, but it's apparently the way Power 6 is... | 46 | # in absolute terms, but it's apparently the way Power 6 is... |
47 | 47 | ||
48 | # December 2009 | ||
49 | |||
50 | # Adapted for 32-bit build, this module delivers a 25-120% (yes, more | ||
51 | # than *twice* for longer keys) performance improvement over 32-bit | ||
52 | # ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes | ||
53 | # 64-bit integer operations even in 32-bit builds, and the trouble is that most | ||
54 | # PPC operating systems don't preserve the upper halves of general purpose | ||
55 | # registers upon 32-bit signal delivery. They do preserve them upon | ||
56 | # context switch, but not signalling:-( This means that asynchronous | ||
57 | # signals have to be blocked upon entry to this subroutine. Signal | ||
58 | # masking (and of course complementary unmasking) has quite an impact | ||
59 | # on performance, naturally larger for shorter keys. It's so severe | ||
60 | # that 512-bit key performance can be as low as 1/3 of the expected one. | ||
61 | # This is why, on these OSes, this routine can be engaged only for longer | ||
62 | # key operations; see crypto/ppccap.c for further details. MacOS X | ||
63 | # is an exception to this and doesn't require signal masking, and | ||
64 | # that's where the above improvement coefficients were collected. For | ||
65 | # others, the alternative would be to break the dependence on upper halves of | ||
66 | # GPRs by sticking to 32-bit integer operations... | ||
67 | |||
48 | $flavour = shift; | 68 | $flavour = shift; |
49 | 69 | ||
50 | if ($flavour =~ /32/) { | 70 | if ($flavour =~ /32/) { |
51 | $SIZE_T=4; | 71 | $SIZE_T=4; |
52 | $RZONE= 224; | 72 | $RZONE= 224; |
53 | $FRAME= $SIZE_T*12+8*12; | 73 | $fname= "bn_mul_mont_fpu64"; |
54 | $fname= "bn_mul_mont_ppc64"; | ||
55 | 74 | ||
56 | $STUX= "stwux"; # store indexed and update | 75 | $STUX= "stwux"; # store indexed and update |
57 | $PUSH= "stw"; | 76 | $PUSH= "stw"; |
58 | $POP= "lwz"; | 77 | $POP= "lwz"; |
59 | die "not implemented yet"; | ||
60 | } elsif ($flavour =~ /64/) { | 78 | } elsif ($flavour =~ /64/) { |
61 | $SIZE_T=8; | 79 | $SIZE_T=8; |
62 | $RZONE= 288; | 80 | $RZONE= 288; |
63 | $FRAME= $SIZE_T*12+8*12; | 81 | $fname= "bn_mul_mont_fpu64"; |
64 | $fname= "bn_mul_mont"; | ||
65 | 82 | ||
66 | # same as above, but 64-bit mnemonics... | 83 | # same as above, but 64-bit mnemonics... |
67 | $STUX= "stdux"; # store indexed and update | 84 | $STUX= "stdux"; # store indexed and update |
@@ -76,7 +93,7 @@ die "can't locate ppc-xlate.pl"; | |||
76 | 93 | ||
77 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; | 94 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; |
78 | 95 | ||
79 | $FRAME=($FRAME+63)&~63; | 96 | $FRAME=64; # padded frame header |
80 | $TRANSFER=16*8; | 97 | $TRANSFER=16*8; |
81 | 98 | ||
82 | $carry="r0"; | 99 | $carry="r0"; |
@@ -93,16 +110,16 @@ $tp="r10"; | |||
93 | $j="r11"; | 110 | $j="r11"; |
94 | $i="r12"; | 111 | $i="r12"; |
95 | # non-volatile registers | 112 | # non-volatile registers |
96 | $nap_d="r14"; # interleaved ap and np in double format | 113 | $nap_d="r22"; # interleaved ap and np in double format |
97 | $a0="r15"; # ap[0] | 114 | $a0="r23"; # ap[0] |
98 | $t0="r16"; # temporary registers | 115 | $t0="r24"; # temporary registers |
99 | $t1="r17"; | 116 | $t1="r25"; |
100 | $t2="r18"; | 117 | $t2="r26"; |
101 | $t3="r19"; | 118 | $t3="r27"; |
102 | $t4="r20"; | 119 | $t4="r28"; |
103 | $t5="r21"; | 120 | $t5="r29"; |
104 | $t6="r22"; | 121 | $t6="r30"; |
105 | $t7="r23"; | 122 | $t7="r31"; |
106 | 123 | ||
107 | # PPC offers enough register bank capacity to unroll inner loops twice | 124 | # PPC offers enough register bank capacity to unroll inner loops twice |
108 | # | 125 | # |
@@ -132,28 +149,17 @@ $ba="f0"; $bb="f1"; $bc="f2"; $bd="f3"; | |||
132 | $na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; | 149 | $na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; |
133 | $dota="f8"; $dotb="f9"; | 150 | $dota="f8"; $dotb="f9"; |
134 | $A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; | 151 | $A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; |
135 | $N0="f14"; $N1="f15"; $N2="f16"; $N3="f17"; | 152 | $N0="f20"; $N1="f21"; $N2="f22"; $N3="f23"; |
136 | $T0a="f18"; $T0b="f19"; | 153 | $T0a="f24"; $T0b="f25"; |
137 | $T1a="f20"; $T1b="f21"; | 154 | $T1a="f26"; $T1b="f27"; |
138 | $T2a="f22"; $T2b="f23"; | 155 | $T2a="f28"; $T2b="f29"; |
139 | $T3a="f24"; $T3b="f25"; | 156 | $T3a="f30"; $T3b="f31"; |
140 | 157 | ||
141 | # sp----------->+-------------------------------+ | 158 | # sp----------->+-------------------------------+ |
142 | # | saved sp | | 159 | # | saved sp | |
143 | # +-------------------------------+ | 160 | # +-------------------------------+ |
144 | # | | | ||
145 | # +-------------------------------+ | ||
146 | # | 10 saved gpr, r14-r23 | | ||
147 | # . . | ||
148 | # . . | ||
149 | # +12*size_t +-------------------------------+ | ||
150 | # | 12 saved fpr, f14-f25 | | ||
151 | # . . | 161 | # . . |
152 | # . . | 162 | # +64 +-------------------------------+ |
153 | # +12*8 +-------------------------------+ | ||
154 | # | padding to 64 byte boundary | | ||
155 | # . . | ||
156 | # +X +-------------------------------+ | ||
157 | # | 16 gpr<->fpr transfer zone | | 163 | # | 16 gpr<->fpr transfer zone | |
158 | # . . | 164 | # . . |
159 | # . . | 165 | # . . |
@@ -173,6 +179,16 @@ $T3a="f24"; $T3b="f25"; | |||
173 | # . . | 179 | # . . |
174 | # . . | 180 | # . . |
175 | # +-------------------------------+ | 181 | # +-------------------------------+ |
182 | # . . | ||
183 | # -12*size_t +-------------------------------+ | ||
184 | # | 10 saved gpr, r22-r31 | | ||
185 | # . . | ||
186 | # . . | ||
187 | # -12*8 +-------------------------------+ | ||
188 | # | 12 saved fpr, f20-f31 | | ||
189 | # . . | ||
190 | # . . | ||
191 | # +-------------------------------+ | ||
176 | 192 | ||
177 | $code=<<___; | 193 | $code=<<___; |
178 | .machine "any" | 194 | .machine "any" |
@@ -181,14 +197,14 @@ $code=<<___; | |||
181 | .globl .$fname | 197 | .globl .$fname |
182 | .align 5 | 198 | .align 5 |
183 | .$fname: | 199 | .$fname: |
184 | cmpwi $num,4 | 200 | cmpwi $num,`3*8/$SIZE_T` |
185 | mr $rp,r3 ; $rp is reassigned | 201 | mr $rp,r3 ; $rp is reassigned |
186 | li r3,0 ; possible "not handled" return code | 202 | li r3,0 ; possible "not handled" return code |
187 | bltlr- | 203 | bltlr- |
188 | andi. r0,$num,1 ; $num has to be even | 204 | andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even" |
189 | bnelr- | 205 | bnelr- |
190 | 206 | ||
191 | slwi $num,$num,3 ; num*=8 | 207 | slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG) |
192 | li $i,-4096 | 208 | li $i,-4096 |
193 | slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num | 209 | slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num |
194 | add $tp,$tp,$num ; place for tp[num+1] | 210 | add $tp,$tp,$num ; place for tp[num+1] |
@@ -196,35 +212,50 @@ $code=<<___; | |||
196 | subf $tp,$tp,$sp ; $sp-$tp | 212 | subf $tp,$tp,$sp ; $sp-$tp |
197 | and $tp,$tp,$i ; minimize TLB usage | 213 | and $tp,$tp,$i ; minimize TLB usage |
198 | subf $tp,$sp,$tp ; $tp-$sp | 214 | subf $tp,$sp,$tp ; $tp-$sp |
215 | mr $i,$sp | ||
199 | $STUX $sp,$sp,$tp ; alloca | 216 | $STUX $sp,$sp,$tp ; alloca |
200 | 217 | ||
201 | $PUSH r14,`2*$SIZE_T`($sp) | 218 | $PUSH r22,`-12*8-10*$SIZE_T`($i) |
202 | $PUSH r15,`3*$SIZE_T`($sp) | 219 | $PUSH r23,`-12*8-9*$SIZE_T`($i) |
203 | $PUSH r16,`4*$SIZE_T`($sp) | 220 | $PUSH r24,`-12*8-8*$SIZE_T`($i) |
204 | $PUSH r17,`5*$SIZE_T`($sp) | 221 | $PUSH r25,`-12*8-7*$SIZE_T`($i) |
205 | $PUSH r18,`6*$SIZE_T`($sp) | 222 | $PUSH r26,`-12*8-6*$SIZE_T`($i) |
206 | $PUSH r19,`7*$SIZE_T`($sp) | 223 | $PUSH r27,`-12*8-5*$SIZE_T`($i) |
207 | $PUSH r20,`8*$SIZE_T`($sp) | 224 | $PUSH r28,`-12*8-4*$SIZE_T`($i) |
208 | $PUSH r21,`9*$SIZE_T`($sp) | 225 | $PUSH r29,`-12*8-3*$SIZE_T`($i) |
209 | $PUSH r22,`10*$SIZE_T`($sp) | 226 | $PUSH r30,`-12*8-2*$SIZE_T`($i) |
210 | $PUSH r23,`11*$SIZE_T`($sp) | 227 | $PUSH r31,`-12*8-1*$SIZE_T`($i) |
211 | stfd f14,`12*$SIZE_T+0`($sp) | 228 | stfd f20,`-12*8`($i) |
212 | stfd f15,`12*$SIZE_T+8`($sp) | 229 | stfd f21,`-11*8`($i) |
213 | stfd f16,`12*$SIZE_T+16`($sp) | 230 | stfd f22,`-10*8`($i) |
214 | stfd f17,`12*$SIZE_T+24`($sp) | 231 | stfd f23,`-9*8`($i) |
215 | stfd f18,`12*$SIZE_T+32`($sp) | 232 | stfd f24,`-8*8`($i) |
216 | stfd f19,`12*$SIZE_T+40`($sp) | 233 | stfd f25,`-7*8`($i) |
217 | stfd f20,`12*$SIZE_T+48`($sp) | 234 | stfd f26,`-6*8`($i) |
218 | stfd f21,`12*$SIZE_T+56`($sp) | 235 | stfd f27,`-5*8`($i) |
219 | stfd f22,`12*$SIZE_T+64`($sp) | 236 | stfd f28,`-4*8`($i) |
220 | stfd f23,`12*$SIZE_T+72`($sp) | 237 | stfd f29,`-3*8`($i) |
221 | stfd f24,`12*$SIZE_T+80`($sp) | 238 | stfd f30,`-2*8`($i) |
222 | stfd f25,`12*$SIZE_T+88`($sp) | 239 | stfd f31,`-1*8`($i) |
223 | 240 | ___ | |
241 | $code.=<<___ if ($SIZE_T==8); | ||
224 | ld $a0,0($ap) ; pull ap[0] value | 242 | ld $a0,0($ap) ; pull ap[0] value |
225 | ld $n0,0($n0) ; pull n0[0] value | 243 | ld $n0,0($n0) ; pull n0[0] value |
226 | ld $t3,0($bp) ; bp[0] | 244 | ld $t3,0($bp) ; bp[0] |
227 | 245 | ___ | |
246 | $code.=<<___ if ($SIZE_T==4); | ||
247 | mr $t1,$n0 | ||
248 | lwz $a0,0($ap) ; pull ap[0,1] value | ||
249 | lwz $t0,4($ap) | ||
250 | lwz $n0,0($t1) ; pull n0[0,1] value | ||
251 | lwz $t1,4($t1) | ||
252 | lwz $t3,0($bp) ; bp[0,1] | ||
253 | lwz $t2,4($bp) | ||
254 | insrdi $a0,$t0,32,0 | ||
255 | insrdi $n0,$t1,32,0 | ||
256 | insrdi $t3,$t2,32,0 | ||
257 | ___ | ||
258 | $code.=<<___; | ||
228 | addi $tp,$sp,`$FRAME+$TRANSFER+8+64` | 259 | addi $tp,$sp,`$FRAME+$TRANSFER+8+64` |
229 | li $i,-64 | 260 | li $i,-64 |
230 | add $nap_d,$tp,$num | 261 | add $nap_d,$tp,$num |
@@ -258,6 +289,8 @@ $code=<<___; | |||
258 | std $t5,`$FRAME+40`($sp) | 289 | std $t5,`$FRAME+40`($sp) |
259 | std $t6,`$FRAME+48`($sp) | 290 | std $t6,`$FRAME+48`($sp) |
260 | std $t7,`$FRAME+56`($sp) | 291 | std $t7,`$FRAME+56`($sp) |
292 | ___ | ||
293 | $code.=<<___ if ($SIZE_T==8); | ||
261 | lwz $t0,4($ap) ; load a[j] as 32-bit word pair | 294 | lwz $t0,4($ap) ; load a[j] as 32-bit word pair |
262 | lwz $t1,0($ap) | 295 | lwz $t1,0($ap) |
263 | lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair | 296 | lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair |
@@ -266,6 +299,18 @@ $code=<<___; | |||
266 | lwz $t5,0($np) | 299 | lwz $t5,0($np) |
267 | lwz $t6,12($np) ; load n[j+1] as 32-bit word pair | 300 | lwz $t6,12($np) ; load n[j+1] as 32-bit word pair |
268 | lwz $t7,8($np) | 301 | lwz $t7,8($np) |
302 | ___ | ||
303 | $code.=<<___ if ($SIZE_T==4); | ||
304 | lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs | ||
305 | lwz $t1,4($ap) | ||
306 | lwz $t2,8($ap) | ||
307 | lwz $t3,12($ap) | ||
308 | lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs | ||
309 | lwz $t5,4($np) | ||
310 | lwz $t6,8($np) | ||
311 | lwz $t7,12($np) | ||
312 | ___ | ||
313 | $code.=<<___; | ||
269 | lfd $ba,`$FRAME+0`($sp) | 314 | lfd $ba,`$FRAME+0`($sp) |
270 | lfd $bb,`$FRAME+8`($sp) | 315 | lfd $bb,`$FRAME+8`($sp) |
271 | lfd $bc,`$FRAME+16`($sp) | 316 | lfd $bc,`$FRAME+16`($sp) |
@@ -374,6 +419,8 @@ $code=<<___; | |||
374 | 419 | ||
375 | .align 5 | 420 | .align 5 |
376 | L1st: | 421 | L1st: |
422 | ___ | ||
423 | $code.=<<___ if ($SIZE_T==8); | ||
377 | lwz $t0,4($ap) ; load a[j] as 32-bit word pair | 424 | lwz $t0,4($ap) ; load a[j] as 32-bit word pair |
378 | lwz $t1,0($ap) | 425 | lwz $t1,0($ap) |
379 | lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair | 426 | lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair |
@@ -382,6 +429,18 @@ L1st: | |||
382 | lwz $t5,0($np) | 429 | lwz $t5,0($np) |
383 | lwz $t6,12($np) ; load n[j+1] as 32-bit word pair | 430 | lwz $t6,12($np) ; load n[j+1] as 32-bit word pair |
384 | lwz $t7,8($np) | 431 | lwz $t7,8($np) |
432 | ___ | ||
433 | $code.=<<___ if ($SIZE_T==4); | ||
434 | lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs | ||
435 | lwz $t1,4($ap) | ||
436 | lwz $t2,8($ap) | ||
437 | lwz $t3,12($ap) | ||
438 | lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs | ||
439 | lwz $t5,4($np) | ||
440 | lwz $t6,8($np) | ||
441 | lwz $t7,12($np) | ||
442 | ___ | ||
443 | $code.=<<___; | ||
385 | std $t0,`$FRAME+64`($sp) | 444 | std $t0,`$FRAME+64`($sp) |
386 | std $t1,`$FRAME+72`($sp) | 445 | std $t1,`$FRAME+72`($sp) |
387 | std $t2,`$FRAME+80`($sp) | 446 | std $t2,`$FRAME+80`($sp) |
@@ -559,7 +618,17 @@ L1st: | |||
559 | li $i,8 ; i=1 | 618 | li $i,8 ; i=1 |
560 | .align 5 | 619 | .align 5 |
561 | Louter: | 620 | Louter: |
621 | ___ | ||
622 | $code.=<<___ if ($SIZE_T==8); | ||
562 | ldx $t3,$bp,$i ; bp[i] | 623 | ldx $t3,$bp,$i ; bp[i] |
624 | ___ | ||
625 | $code.=<<___ if ($SIZE_T==4); | ||
626 | add $t0,$bp,$i | ||
627 | lwz $t3,0($t0) ; bp[i,i+1] | ||
628 | lwz $t0,4($t0) | ||
629 | insrdi $t3,$t0,32,0 | ||
630 | ___ | ||
631 | $code.=<<___; | ||
563 | ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] | 632 | ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] |
564 | mulld $t7,$a0,$t3 ; ap[0]*bp[i] | 633 | mulld $t7,$a0,$t3 ; ap[0]*bp[i] |
565 | 634 | ||
@@ -761,6 +830,13 @@ Linner: | |||
761 | stfd $T0b,`$FRAME+8`($sp) | 830 | stfd $T0b,`$FRAME+8`($sp) |
762 | add $t7,$t7,$carry | 831 | add $t7,$t7,$carry |
763 | addc $t3,$t0,$t1 | 832 | addc $t3,$t0,$t1 |
833 | ___ | ||
834 | $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] | ||
835 | extrdi $t0,$t0,32,0 | ||
836 | extrdi $t1,$t1,32,0 | ||
837 | adde $t0,$t0,$t1 | ||
838 | ___ | ||
839 | $code.=<<___; | ||
764 | stfd $T1a,`$FRAME+16`($sp) | 840 | stfd $T1a,`$FRAME+16`($sp) |
765 | stfd $T1b,`$FRAME+24`($sp) | 841 | stfd $T1b,`$FRAME+24`($sp) |
766 | insrdi $t4,$t7,16,0 ; 64..127 bits | 842 | insrdi $t4,$t7,16,0 ; 64..127 bits |
@@ -768,6 +844,13 @@ Linner: | |||
768 | stfd $T2a,`$FRAME+32`($sp) | 844 | stfd $T2a,`$FRAME+32`($sp) |
769 | stfd $T2b,`$FRAME+40`($sp) | 845 | stfd $T2b,`$FRAME+40`($sp) |
770 | adde $t5,$t4,$t2 | 846 | adde $t5,$t4,$t2 |
847 | ___ | ||
848 | $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] | ||
849 | extrdi $t4,$t4,32,0 | ||
850 | extrdi $t2,$t2,32,0 | ||
851 | adde $t4,$t4,$t2 | ||
852 | ___ | ||
853 | $code.=<<___; | ||
771 | stfd $T3a,`$FRAME+48`($sp) | 854 | stfd $T3a,`$FRAME+48`($sp) |
772 | stfd $T3b,`$FRAME+56`($sp) | 855 | stfd $T3b,`$FRAME+56`($sp) |
773 | addze $carry,$carry | 856 | addze $carry,$carry |
@@ -816,7 +899,21 @@ Linner: | |||
816 | ld $t7,`$FRAME+72`($sp) | 899 | ld $t7,`$FRAME+72`($sp) |
817 | 900 | ||
818 | addc $t3,$t0,$t1 | 901 | addc $t3,$t0,$t1 |
902 | ___ | ||
903 | $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] | ||
904 | extrdi $t0,$t0,32,0 | ||
905 | extrdi $t1,$t1,32,0 | ||
906 | adde $t0,$t0,$t1 | ||
907 | ___ | ||
908 | $code.=<<___; | ||
819 | adde $t5,$t4,$t2 | 909 | adde $t5,$t4,$t2 |
910 | ___ | ||
911 | $code.=<<___ if ($SIZE_T==4); # adjust XER[CA] | ||
912 | extrdi $t4,$t4,32,0 | ||
913 | extrdi $t2,$t2,32,0 | ||
914 | adde $t4,$t4,$t2 | ||
915 | ___ | ||
916 | $code.=<<___; | ||
820 | addze $carry,$carry | 917 | addze $carry,$carry |
821 | 918 | ||
822 | std $t3,-16($tp) ; tp[j-1] | 919 | std $t3,-16($tp) ; tp[j-1] |
@@ -835,7 +932,9 @@ Linner: | |||
835 | subf $nap_d,$t7,$nap_d ; rewind pointer | 932 | subf $nap_d,$t7,$nap_d ; rewind pointer |
836 | cmpw $i,$num | 933 | cmpw $i,$num |
837 | blt- Louter | 934 | blt- Louter |
935 | ___ | ||
838 | 936 | ||
937 | $code.=<<___ if ($SIZE_T==8); | ||
839 | subf $np,$num,$np ; rewind np | 938 | subf $np,$num,$np ; rewind np |
840 | addi $j,$j,1 ; restore counter | 939 | addi $j,$j,1 ; restore counter |
841 | subfc $i,$i,$i ; j=0 and "clear" XER[CA] | 940 | subfc $i,$i,$i ; j=0 and "clear" XER[CA] |
@@ -883,34 +982,105 @@ Lcopy: ; copy or in-place refresh | |||
883 | stdx $i,$t4,$i | 982 | stdx $i,$t4,$i |
884 | addi $i,$i,16 | 983 | addi $i,$i,16 |
885 | bdnz- Lcopy | 984 | bdnz- Lcopy |
985 | ___ | ||
986 | $code.=<<___ if ($SIZE_T==4); | ||
987 | subf $np,$num,$np ; rewind np | ||
988 | addi $j,$j,1 ; restore counter | ||
989 | subfc $i,$i,$i ; j=0 and "clear" XER[CA] | ||
990 | addi $tp,$sp,`$FRAME+$TRANSFER` | ||
991 | addi $np,$np,-4 | ||
992 | addi $rp,$rp,-4 | ||
993 | addi $ap,$sp,`$FRAME+$TRANSFER+4` | ||
994 | mtctr $j | ||
995 | |||
996 | .align 4 | ||
997 | Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order | ||
998 | ldu $t2,16($tp) | ||
999 | lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order | ||
1000 | lwz $t5,8($np) | ||
1001 | lwz $t6,12($np) | ||
1002 | lwzu $t7,16($np) | ||
1003 | extrdi $t1,$t0,32,0 | ||
1004 | extrdi $t3,$t2,32,0 | ||
1005 | subfe $t4,$t4,$t0 ; tp[j]-np[j] | ||
1006 | stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order | ||
1007 | subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1] | ||
1008 | stw $t1,8($ap) | ||
1009 | subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2] | ||
1010 | stw $t2,12($ap) | ||
1011 | subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3] | ||
1012 | stwu $t3,16($ap) | ||
1013 | stw $t4,4($rp) | ||
1014 | stw $t5,8($rp) | ||
1015 | stw $t6,12($rp) | ||
1016 | stwu $t7,16($rp) | ||
1017 | bdnz- Lsub | ||
1018 | |||
1019 | li $i,0 | ||
1020 | subfe $ovf,$i,$ovf ; handle upmost overflow bit | ||
1021 | addi $tp,$sp,`$FRAME+$TRANSFER+4` | ||
1022 | subf $rp,$num,$rp ; rewind rp | ||
1023 | and $ap,$tp,$ovf | ||
1024 | andc $np,$rp,$ovf | ||
1025 | or $ap,$ap,$np ; ap=borrow?tp:rp | ||
1026 | addi $tp,$sp,`$FRAME+$TRANSFER` | ||
1027 | mtctr $j | ||
1028 | |||
1029 | .align 4 | ||
1030 | Lcopy: ; copy or in-place refresh | ||
1031 | lwz $t0,4($ap) | ||
1032 | lwz $t1,8($ap) | ||
1033 | lwz $t2,12($ap) | ||
1034 | lwzu $t3,16($ap) | ||
1035 | std $i,8($nap_d) ; zap nap_d | ||
1036 | std $i,16($nap_d) | ||
1037 | std $i,24($nap_d) | ||
1038 | std $i,32($nap_d) | ||
1039 | std $i,40($nap_d) | ||
1040 | std $i,48($nap_d) | ||
1041 | std $i,56($nap_d) | ||
1042 | stdu $i,64($nap_d) | ||
1043 | stw $t0,4($rp) | ||
1044 | stw $t1,8($rp) | ||
1045 | stw $t2,12($rp) | ||
1046 | stwu $t3,16($rp) | ||
1047 | std $i,8($tp) ; zap tp at once | ||
1048 | stdu $i,16($tp) | ||
1049 | bdnz- Lcopy | ||
1050 | ___ | ||
886 | 1051 | ||
887 | $POP r14,`2*$SIZE_T`($sp) | 1052 | $code.=<<___; |
888 | $POP r15,`3*$SIZE_T`($sp) | 1053 | $POP $i,0($sp) |
889 | $POP r16,`4*$SIZE_T`($sp) | ||
890 | $POP r17,`5*$SIZE_T`($sp) | ||
891 | $POP r18,`6*$SIZE_T`($sp) | ||
892 | $POP r19,`7*$SIZE_T`($sp) | ||
893 | $POP r20,`8*$SIZE_T`($sp) | ||
894 | $POP r21,`9*$SIZE_T`($sp) | ||
895 | $POP r22,`10*$SIZE_T`($sp) | ||
896 | $POP r23,`11*$SIZE_T`($sp) | ||
897 | lfd f14,`12*$SIZE_T+0`($sp) | ||
898 | lfd f15,`12*$SIZE_T+8`($sp) | ||
899 | lfd f16,`12*$SIZE_T+16`($sp) | ||
900 | lfd f17,`12*$SIZE_T+24`($sp) | ||
901 | lfd f18,`12*$SIZE_T+32`($sp) | ||
902 | lfd f19,`12*$SIZE_T+40`($sp) | ||
903 | lfd f20,`12*$SIZE_T+48`($sp) | ||
904 | lfd f21,`12*$SIZE_T+56`($sp) | ||
905 | lfd f22,`12*$SIZE_T+64`($sp) | ||
906 | lfd f23,`12*$SIZE_T+72`($sp) | ||
907 | lfd f24,`12*$SIZE_T+80`($sp) | ||
908 | lfd f25,`12*$SIZE_T+88`($sp) | ||
909 | $POP $sp,0($sp) | ||
910 | li r3,1 ; signal "handled" | 1054 | li r3,1 ; signal "handled" |
1055 | $POP r22,`-12*8-10*$SIZE_T`($i) | ||
1056 | $POP r23,`-12*8-9*$SIZE_T`($i) | ||
1057 | $POP r24,`-12*8-8*$SIZE_T`($i) | ||
1058 | $POP r25,`-12*8-7*$SIZE_T`($i) | ||
1059 | $POP r26,`-12*8-6*$SIZE_T`($i) | ||
1060 | $POP r27,`-12*8-5*$SIZE_T`($i) | ||
1061 | $POP r28,`-12*8-4*$SIZE_T`($i) | ||
1062 | $POP r29,`-12*8-3*$SIZE_T`($i) | ||
1063 | $POP r30,`-12*8-2*$SIZE_T`($i) | ||
1064 | $POP r31,`-12*8-1*$SIZE_T`($i) | ||
1065 | lfd f20,`-12*8`($i) | ||
1066 | lfd f21,`-11*8`($i) | ||
1067 | lfd f22,`-10*8`($i) | ||
1068 | lfd f23,`-9*8`($i) | ||
1069 | lfd f24,`-8*8`($i) | ||
1070 | lfd f25,`-7*8`($i) | ||
1071 | lfd f26,`-6*8`($i) | ||
1072 | lfd f27,`-5*8`($i) | ||
1073 | lfd f28,`-4*8`($i) | ||
1074 | lfd f29,`-3*8`($i) | ||
1075 | lfd f30,`-2*8`($i) | ||
1076 | lfd f31,`-1*8`($i) | ||
1077 | mr $sp,$i | ||
911 | blr | 1078 | blr |
912 | .long 0 | 1079 | .long 0 |
913 | .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>" | 1080 | .byte 0,12,4,0,0x8c,10,6,0 |
1081 | .long 0 | ||
1082 | |||
1083 | .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>" | ||
914 | ___ | 1084 | ___ |
915 | 1085 | ||
916 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | 1086 | $code =~ s/\`([^\`]*)\`/eval $1/gem; |