Diffstat:
-rw-r--r--  src/lib/libcrypto/bn/asm/ppc64-mont.pl  |  338
1 file changed, 254 insertions, 84 deletions
diff --git a/src/lib/libcrypto/bn/asm/ppc64-mont.pl b/src/lib/libcrypto/bn/asm/ppc64-mont.pl
index 3449b35855..a14e769ad0 100644
--- a/src/lib/libcrypto/bn/asm/ppc64-mont.pl
+++ b/src/lib/libcrypto/bn/asm/ppc64-mont.pl
@@ -45,23 +45,40 @@
 # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
 # in absolute terms, but it's apparently the way Power 6 is...
 
+# December 2009
+
+# Adapted for 32-bit build this module delivers 25-120%, yes, more
+# than *twice* for longer keys, performance improvement over 32-bit
+# ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes
+# even 64-bit integer operations and the trouble is that most PPC
+# operating systems don't preserve upper halves of general purpose
+# registers upon 32-bit signal delivery. They do preserve them upon
+# context switch, but not signalling:-( This means that asynchronous
+# signals have to be blocked upon entry to this subroutine. Signal
+# masking (and of course complementary unmasking) has quite an impact
+# on performance, naturally larger for shorter keys. It's so severe
+# that 512-bit key performance can be as low as 1/3 of expected one.
+# This is why this routine can be engaged for longer key operations
+# only on these OSes, see crypto/ppccap.c for further details. MacOS X
+# is an exception from this and doesn't require signal masking, and
+# that's where above improvement coefficients were collected. For
+# others alternative would be to break dependence on upper halves of
+# GPRs by sticking to 32-bit integer operations...
+
 $flavour = shift;
 
 if ($flavour =~ /32/) {
 	$SIZE_T=4;
 	$RZONE= 224;
-	$FRAME= $SIZE_T*12+8*12;
-	$fname= "bn_mul_mont_ppc64";
+	$fname= "bn_mul_mont_fpu64";
 
 	$STUX= "stwux"; # store indexed and update
 	$PUSH= "stw";
 	$POP= "lwz";
-	die "not implemented yet";
 } elsif ($flavour =~ /64/) {
 	$SIZE_T=8;
 	$RZONE= 288;
-	$FRAME= $SIZE_T*12+8*12;
-	$fname= "bn_mul_mont";
+	$fname= "bn_mul_mont_fpu64";
 
 	# same as above, but 64-bit mnemonics...
 	$STUX= "stdux"; # store indexed and update
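The comment block added above explains the constraint that shapes the rest of this diff: the 32-bit build still performs 64-bit integer arithmetic, and because most 32-bit PPC ABIs do not preserve the upper halves of the general purpose registers across signal delivery, asynchronous signals must be blocked for the duration of the call. The dispatch and masking logic lives in crypto/ppccap.c; the sketch below only illustrates the mask/unmask pattern the comment describes, with a hypothetical C prototype for the generated routine, and is not the actual ppccap.c code.

    #include <signal.h>

    /* hypothetical prototype for the routine this script emits */
    int bn_mul_mont_fpu64(unsigned long *rp, const unsigned long *ap,
                          const unsigned long *bp, const unsigned long *np,
                          const unsigned long *n0, int num);

    /* illustrative only: block asynchronous signals around the call so a
     * handler can never observe clobbered upper GPR halves */
    static int mont_call_masked(unsigned long *rp, const unsigned long *ap,
                                const unsigned long *bp, const unsigned long *np,
                                const unsigned long *n0, int num)
    {
        sigset_t all, saved;
        int ret;

        sigfillset(&all);
        sigprocmask(SIG_SETMASK, &all, &saved);   /* mask */
        ret = bn_mul_mont_fpu64(rp, ap, bp, np, n0, num);
        sigprocmask(SIG_SETMASK, &saved, NULL);   /* unmask */
        return ret;
    }

As the comment notes, this masking overhead is why the routine is only engaged for longer keys on the affected systems, while Mac OS X can call it unconditionally.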
@@ -76,7 +93,7 @@ die "can't locate ppc-xlate.pl";
 
 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
 
-$FRAME=($FRAME+63)&~63;
+$FRAME=64; # padded frame header
 $TRANSFER=16*8;
 
 $carry="r0";
@@ -93,16 +110,16 @@ $tp="r10";
 $j="r11";
 $i="r12";
 # non-volatile registers
-$nap_d="r14"; # interleaved ap and np in double format
-$a0="r15"; # ap[0]
-$t0="r16"; # temporary registers
-$t1="r17";
-$t2="r18";
-$t3="r19";
-$t4="r20";
-$t5="r21";
-$t6="r22";
-$t7="r23";
+$nap_d="r22"; # interleaved ap and np in double format
+$a0="r23"; # ap[0]
+$t0="r24"; # temporary registers
+$t1="r25";
+$t2="r26";
+$t3="r27";
+$t4="r28";
+$t5="r29";
+$t6="r30";
+$t7="r31";
 
 # PPC offers enough register bank capacity to unroll inner loops twice
 #
@@ -132,28 +149,17 @@ $ba="f0"; $bb="f1"; $bc="f2"; $bd="f3";
 $na="f4"; $nb="f5"; $nc="f6"; $nd="f7";
 $dota="f8"; $dotb="f9";
 $A0="f10"; $A1="f11"; $A2="f12"; $A3="f13";
-$N0="f14"; $N1="f15"; $N2="f16"; $N3="f17";
-$T0a="f18"; $T0b="f19";
-$T1a="f20"; $T1b="f21";
-$T2a="f22"; $T2b="f23";
-$T3a="f24"; $T3b="f25";
+$N0="f20"; $N1="f21"; $N2="f22"; $N3="f23";
+$T0a="f24"; $T0b="f25";
+$T1a="f26"; $T1b="f27";
+$T2a="f28"; $T2b="f29";
+$T3a="f30"; $T3b="f31";
 
 # sp----------->+-------------------------------+
 # | saved sp |
 # +-------------------------------+
-# | |
-# +-------------------------------+
-# | 10 saved gpr, r14-r23 |
-# . .
-# . .
-# +12*size_t +-------------------------------+
-# | 12 saved fpr, f14-f25 |
 # . .
-# . .
-# +12*8 +-------------------------------+
-# | padding to 64 byte boundary |
-# . .
-# +X +-------------------------------+
+# +64 +-------------------------------+
 # | 16 gpr<->fpr transfer zone |
 # . .
 # . .
@@ -173,6 +179,16 @@ $T3a="f24"; $T3b="f25";
 # . .
 # . .
 # +-------------------------------+
+# . .
+# -12*size_t +-------------------------------+
+# | 10 saved gpr, r22-r31 |
+# . .
+# . .
+# -12*8 +-------------------------------+
+# | 12 saved fpr, f20-f31 |
+# . .
+# . .
+# +-------------------------------+
 
 $code=<<___;
 .machine "any"
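The relocated save area shown in the new layout sits at negative offsets from the stack pointer on entry (which the prologue further down keeps in $i just before the alloca), and the frame header itself shrinks to a fixed 64 bytes. A small sketch of the slot addresses implied by the `-12*8-10*$SIZE_T` ... `-1*8` offsets used later in this diff; the helper name is illustrative only.

    #include <stddef.h>
    #include <stdio.h>

    /* illustrative only: save-slot offsets relative to the entry stack pointer */
    static void print_save_slots(size_t size_t_bytes)   /* 4 or 8 */
    {
        /* f20..f31: twelve 8-byte slots ending just below the entry sp */
        for (int k = 0; k < 12; k++)
            printf("f%d\tat entry_sp - %zu\n", 20 + k, (size_t)(12 - k) * 8);
        /* r22..r31: ten word-sized slots below the FPR area */
        for (int k = 0; k < 10; k++)
            printf("r%d\tat entry_sp - %zu\n", 22 + k,
                   12 * 8 + (size_t)(10 - k) * size_t_bytes);
    }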
@@ -181,14 +197,14 @@ $code=<<___;
 .globl .$fname
 .align 5
 .$fname:
-	cmpwi $num,4
+	cmpwi $num,`3*8/$SIZE_T`
 	mr $rp,r3 ; $rp is reassigned
 	li r3,0 ; possible "not handled" return code
 	bltlr-
-	andi. r0,$num,1 ; $num has to be even
+	andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even"
 	bnelr-
 
-	slwi $num,$num,3 ; num*=8
+	slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG)
 	li $i,-4096
 	slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
 	add $tp,$tp,$num ; place for tp[num+1]
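With the guards above parameterized by $SIZE_T, the same source covers both builds: for the 64-bit flavour the backticked expressions evaluate to cmpwi $num,3, andi. r0,$num,1 and slwi $num,$num,3, and for the 32-bit flavour to cmpwi $num,6, andi. r0,$num,3 and slwi $num,$num,2, so in either case the modulus must be at least 24 bytes long and a whole multiple of 16 bytes (a whole number of 64-bit limb pairs). A hedged C restatement of the "not handled" checks, with an illustrative helper name:

    /* illustrative only: the prologue's bail-out conditions in C;
     * word_bytes is sizeof(BN_ULONG), i.e. 8 or 4 depending on the build */
    static int fpu64_args_ok(int num, int word_bytes)
    {
        int min_words = 3 * 8 / word_bytes;      /* cmpwi $num,`3*8/$SIZE_T`     */
        int even_mask = 16 / word_bytes - 1;     /* andi. r0,$num,`16/$SIZE_T-1` */

        if (num < min_words)                     /* bltlr- : return "not handled" */
            return 0;
        if (num & even_mask)                     /* bnelr- : return "not handled" */
            return 0;
        return 1;                                /* slwi then scales num to bytes */
    }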
@@ -196,35 +212,50 @@ $code=<<___;
 	subf $tp,$tp,$sp ; $sp-$tp
 	and $tp,$tp,$i ; minimize TLB usage
 	subf $tp,$sp,$tp ; $tp-$sp
+	mr $i,$sp
 	$STUX $sp,$sp,$tp ; alloca
 
-	$PUSH r14,`2*$SIZE_T`($sp)
-	$PUSH r15,`3*$SIZE_T`($sp)
-	$PUSH r16,`4*$SIZE_T`($sp)
-	$PUSH r17,`5*$SIZE_T`($sp)
-	$PUSH r18,`6*$SIZE_T`($sp)
-	$PUSH r19,`7*$SIZE_T`($sp)
-	$PUSH r20,`8*$SIZE_T`($sp)
-	$PUSH r21,`9*$SIZE_T`($sp)
-	$PUSH r22,`10*$SIZE_T`($sp)
-	$PUSH r23,`11*$SIZE_T`($sp)
-	stfd f14,`12*$SIZE_T+0`($sp)
-	stfd f15,`12*$SIZE_T+8`($sp)
-	stfd f16,`12*$SIZE_T+16`($sp)
-	stfd f17,`12*$SIZE_T+24`($sp)
-	stfd f18,`12*$SIZE_T+32`($sp)
-	stfd f19,`12*$SIZE_T+40`($sp)
-	stfd f20,`12*$SIZE_T+48`($sp)
-	stfd f21,`12*$SIZE_T+56`($sp)
-	stfd f22,`12*$SIZE_T+64`($sp)
-	stfd f23,`12*$SIZE_T+72`($sp)
-	stfd f24,`12*$SIZE_T+80`($sp)
-	stfd f25,`12*$SIZE_T+88`($sp)
-
+	$PUSH r22,`-12*8-10*$SIZE_T`($i)
+	$PUSH r23,`-12*8-9*$SIZE_T`($i)
+	$PUSH r24,`-12*8-8*$SIZE_T`($i)
+	$PUSH r25,`-12*8-7*$SIZE_T`($i)
+	$PUSH r26,`-12*8-6*$SIZE_T`($i)
+	$PUSH r27,`-12*8-5*$SIZE_T`($i)
+	$PUSH r28,`-12*8-4*$SIZE_T`($i)
+	$PUSH r29,`-12*8-3*$SIZE_T`($i)
+	$PUSH r30,`-12*8-2*$SIZE_T`($i)
+	$PUSH r31,`-12*8-1*$SIZE_T`($i)
+	stfd f20,`-12*8`($i)
+	stfd f21,`-11*8`($i)
+	stfd f22,`-10*8`($i)
+	stfd f23,`-9*8`($i)
+	stfd f24,`-8*8`($i)
+	stfd f25,`-7*8`($i)
+	stfd f26,`-6*8`($i)
+	stfd f27,`-5*8`($i)
+	stfd f28,`-4*8`($i)
+	stfd f29,`-3*8`($i)
+	stfd f30,`-2*8`($i)
+	stfd f31,`-1*8`($i)
+___
+$code.=<<___ if ($SIZE_T==8);
 	ld $a0,0($ap) ; pull ap[0] value
 	ld $n0,0($n0) ; pull n0[0] value
 	ld $t3,0($bp) ; bp[0]
-
+___
+$code.=<<___ if ($SIZE_T==4);
+	mr $t1,$n0
+	lwz $a0,0($ap) ; pull ap[0,1] value
+	lwz $t0,4($ap)
+	lwz $n0,0($t1) ; pull n0[0,1] value
+	lwz $t1,4($t1)
+	lwz $t3,0($bp) ; bp[0,1]
+	lwz $t2,4($bp)
+	insrdi $a0,$t0,32,0
+	insrdi $n0,$t1,32,0
+	insrdi $t3,$t2,32,0
+___
+$code.=<<___;
 	addi $tp,$sp,`$FRAME+$TRANSFER+8+64`
 	li $i,-64
 	add $nap_d,$tp,$num
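In the new 32-bit entry sequence the three insrdi instructions splice each pair of 32-bit BIGNUM words into one 64-bit register: the word loaded from offset 4 lands in the upper half and the word from offset 0 stays in the lower half, so the register ends up holding the same value a single 64-bit limb would (BIGNUM words are stored least significant first). The 64-bit build goes the opposite way later on, picking a limb's two 32-bit halves straight from memory with lwz 4($ap) / lwz 0($ap) before feeding them to the FPU. A minimal C sketch of the merge, with illustrative names:

    #include <stdint.h>

    /* illustrative only: the value built by lwz/lwz/insrdi ...,32,0 from two
     * consecutive 32-bit words w0 = ap[0] (least significant) and w1 = ap[1] */
    static uint64_t merge_bn_words(uint32_t w0, uint32_t w1)
    {
        return ((uint64_t)w1 << 32) | w0;
    }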
@@ -258,6 +289,8 @@ $code=<<___;
 	std $t5,`$FRAME+40`($sp)
 	std $t6,`$FRAME+48`($sp)
 	std $t7,`$FRAME+56`($sp)
+___
+$code.=<<___ if ($SIZE_T==8);
 	lwz $t0,4($ap) ; load a[j] as 32-bit word pair
 	lwz $t1,0($ap)
 	lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
@@ -266,6 +299,18 @@ $code=<<___;
 	lwz $t5,0($np)
 	lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
 	lwz $t7,8($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+	lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
+	lwz $t1,4($ap)
+	lwz $t2,8($ap)
+	lwz $t3,12($ap)
+	lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
+	lwz $t5,4($np)
+	lwz $t6,8($np)
+	lwz $t7,12($np)
+___
+$code.=<<___;
 	lfd $ba,`$FRAME+0`($sp)
 	lfd $bb,`$FRAME+8`($sp)
 	lfd $bc,`$FRAME+16`($sp)
@@ -374,6 +419,8 @@ $code=<<___;
 
 .align 5
 L1st:
+___
+$code.=<<___ if ($SIZE_T==8);
 	lwz $t0,4($ap) ; load a[j] as 32-bit word pair
 	lwz $t1,0($ap)
 	lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
@@ -382,6 +429,18 @@ L1st:
 	lwz $t5,0($np)
 	lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
 	lwz $t7,8($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+	lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
+	lwz $t1,4($ap)
+	lwz $t2,8($ap)
+	lwz $t3,12($ap)
+	lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
+	lwz $t5,4($np)
+	lwz $t6,8($np)
+	lwz $t7,12($np)
+___
+$code.=<<___;
 	std $t0,`$FRAME+64`($sp)
 	std $t1,`$FRAME+72`($sp)
 	std $t2,`$FRAME+80`($sp)
@@ -559,7 +618,17 @@ L1st:
 	li $i,8 ; i=1
 .align 5
Louter:
+___
+$code.=<<___ if ($SIZE_T==8);
 	ldx $t3,$bp,$i ; bp[i]
+___
+$code.=<<___ if ($SIZE_T==4);
+	add $t0,$bp,$i
+	lwz $t3,0($t0) ; bp[i,i+1]
+	lwz $t0,4($t0)
+	insrdi $t3,$t0,32,0
+___
+$code.=<<___;
 	ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
 	mulld $t7,$a0,$t3 ; ap[0]*bp[i]
 
@@ -761,6 +830,13 @@ Linner:
 	stfd $T0b,`$FRAME+8`($sp)
 	add $t7,$t7,$carry
 	addc $t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+	extrdi $t0,$t0,32,0
+	extrdi $t1,$t1,32,0
+	adde $t0,$t0,$t1
+___
+$code.=<<___;
 	stfd $T1a,`$FRAME+16`($sp)
 	stfd $T1b,`$FRAME+24`($sp)
 	insrdi $t4,$t7,16,0 ; 64..127 bits
@@ -768,6 +844,13 @@ Linner:
 	stfd $T2a,`$FRAME+32`($sp)
 	stfd $T2b,`$FRAME+40`($sp)
 	adde $t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+	extrdi $t4,$t4,32,0
+	extrdi $t2,$t2,32,0
+	adde $t4,$t4,$t2
+___
+$code.=<<___;
 	stfd $T3a,`$FRAME+48`($sp)
 	stfd $T3b,`$FRAME+56`($sp)
 	addze $carry,$carry
@@ -816,7 +899,21 @@ Linner:
 	ld $t7,`$FRAME+72`($sp)
 
 	addc $t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+	extrdi $t0,$t0,32,0
+	extrdi $t1,$t1,32,0
+	adde $t0,$t0,$t1
+___
+$code.=<<___;
 	adde $t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+	extrdi $t4,$t4,32,0
+	extrdi $t2,$t2,32,0
+	adde $t4,$t4,$t2
+___
+$code.=<<___;
 	addze $carry,$carry
 
 	std $t3,-16($tp) ; tp[j-1]
@@ -835,7 +932,9 @@ Linner:
 	subf $nap_d,$t7,$nap_d ; rewind pointer
 	cmpw $i,$num
 	blt- Louter
+___
 
+$code.=<<___ if ($SIZE_T==8);
 	subf $np,$num,$np ; rewind np
 	addi $j,$j,1 ; restore counter
 	subfc $i,$i,$i ; j=0 and "clear" XER[CA]
@@ -883,34 +982,105 @@ Lcopy: ; copy or in-place refresh
 	stdx $i,$t4,$i
 	addi $i,$i,16
 	bdnz- Lcopy
+___
+$code.=<<___ if ($SIZE_T==4);
+	subf $np,$num,$np ; rewind np
+	addi $j,$j,1 ; restore counter
+	subfc $i,$i,$i ; j=0 and "clear" XER[CA]
+	addi $tp,$sp,`$FRAME+$TRANSFER`
+	addi $np,$np,-4
+	addi $rp,$rp,-4
+	addi $ap,$sp,`$FRAME+$TRANSFER+4`
+	mtctr $j
+
+.align 4
+Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order
+	ldu $t2,16($tp)
+	lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order
+	lwz $t5,8($np)
+	lwz $t6,12($np)
+	lwzu $t7,16($np)
+	extrdi $t1,$t0,32,0
+	extrdi $t3,$t2,32,0
+	subfe $t4,$t4,$t0 ; tp[j]-np[j]
+	stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order
+	subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1]
+	stw $t1,8($ap)
+	subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2]
+	stw $t2,12($ap)
+	subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3]
+	stwu $t3,16($ap)
+	stw $t4,4($rp)
+	stw $t5,8($rp)
+	stw $t6,12($rp)
+	stwu $t7,16($rp)
+	bdnz- Lsub
+
+	li $i,0
+	subfe $ovf,$i,$ovf ; handle upmost overflow bit
+	addi $tp,$sp,`$FRAME+$TRANSFER+4`
+	subf $rp,$num,$rp ; rewind rp
+	and $ap,$tp,$ovf
+	andc $np,$rp,$ovf
+	or $ap,$ap,$np ; ap=borrow?tp:rp
+	addi $tp,$sp,`$FRAME+$TRANSFER`
+	mtctr $j
+
+.align 4
+Lcopy: ; copy or in-place refresh
+	lwz $t0,4($ap)
+	lwz $t1,8($ap)
+	lwz $t2,12($ap)
+	lwzu $t3,16($ap)
+	std $i,8($nap_d) ; zap nap_d
+	std $i,16($nap_d)
+	std $i,24($nap_d)
+	std $i,32($nap_d)
+	std $i,40($nap_d)
+	std $i,48($nap_d)
+	std $i,56($nap_d)
+	stdu $i,64($nap_d)
+	stw $t0,4($rp)
+	stw $t1,8($rp)
+	stw $t2,12($rp)
+	stwu $t3,16($rp)
+	std $i,8($tp) ; zap tp at once
+	stdu $i,16($tp)
+	bdnz- Lcopy
+___
 
-	$POP r14,`2*$SIZE_T`($sp)
-	$POP r15,`3*$SIZE_T`($sp)
-	$POP r16,`4*$SIZE_T`($sp)
-	$POP r17,`5*$SIZE_T`($sp)
-	$POP r18,`6*$SIZE_T`($sp)
-	$POP r19,`7*$SIZE_T`($sp)
-	$POP r20,`8*$SIZE_T`($sp)
-	$POP r21,`9*$SIZE_T`($sp)
-	$POP r22,`10*$SIZE_T`($sp)
-	$POP r23,`11*$SIZE_T`($sp)
-	lfd f14,`12*$SIZE_T+0`($sp)
-	lfd f15,`12*$SIZE_T+8`($sp)
-	lfd f16,`12*$SIZE_T+16`($sp)
-	lfd f17,`12*$SIZE_T+24`($sp)
-	lfd f18,`12*$SIZE_T+32`($sp)
-	lfd f19,`12*$SIZE_T+40`($sp)
-	lfd f20,`12*$SIZE_T+48`($sp)
-	lfd f21,`12*$SIZE_T+56`($sp)
-	lfd f22,`12*$SIZE_T+64`($sp)
-	lfd f23,`12*$SIZE_T+72`($sp)
-	lfd f24,`12*$SIZE_T+80`($sp)
-	lfd f25,`12*$SIZE_T+88`($sp)
-	$POP $sp,0($sp)
+$code.=<<___;
+	$POP $i,0($sp)
 	li r3,1 ; signal "handled"
+	$POP r22,`-12*8-10*$SIZE_T`($i)
+	$POP r23,`-12*8-9*$SIZE_T`($i)
+	$POP r24,`-12*8-8*$SIZE_T`($i)
+	$POP r25,`-12*8-7*$SIZE_T`($i)
+	$POP r26,`-12*8-6*$SIZE_T`($i)
+	$POP r27,`-12*8-5*$SIZE_T`($i)
+	$POP r28,`-12*8-4*$SIZE_T`($i)
+	$POP r29,`-12*8-3*$SIZE_T`($i)
+	$POP r30,`-12*8-2*$SIZE_T`($i)
+	$POP r31,`-12*8-1*$SIZE_T`($i)
+	lfd f20,`-12*8`($i)
+	lfd f21,`-11*8`($i)
+	lfd f22,`-10*8`($i)
+	lfd f23,`-9*8`($i)
+	lfd f24,`-8*8`($i)
+	lfd f25,`-7*8`($i)
+	lfd f26,`-6*8`($i)
+	lfd f27,`-5*8`($i)
+	lfd f28,`-4*8`($i)
+	lfd f29,`-3*8`($i)
+	lfd f30,`-2*8`($i)
+	lfd f31,`-1*8`($i)
+	mr $sp,$i
 	blr
 	.long 0
-.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+	.byte 0,12,4,0,0x8c,10,6,0
+	.long 0
+
+.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
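One detail of the new 32-bit epilogue worth spelling out: after the Lsub loop has produced both the untouched value (stashed through the stw ... ($ap) stores) and the subtracted value in rp, the and/andc/or triple selects the copy source for Lcopy without a branch, exactly as the "ap=borrow?tp:rp" comment says. The idiom in C, with illustrative names; the mask is all ones when the subtraction borrowed and zero otherwise:

    #include <stdint.h>

    /* illustrative only: branchless source select, mirroring
     *   and $ap,$tp,$ovf / andc $np,$rp,$ovf / or $ap,$ap,$np */
    static const uint32_t *select_source(const uint32_t *tp, const uint32_t *rp,
                                         uintptr_t mask)
    {
        uintptr_t t = (uintptr_t)tp & mask;      /* keep tp when borrow occurred */
        uintptr_t r = (uintptr_t)rp & ~mask;     /* keep rp (tp-np) otherwise    */
        return (const uint32_t *)(t | r);
    }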