diff options
Diffstat (limited to 'src/lib/libcrypto/bn/asm/ppc-mont.pl')
-rw-r--r-- | src/lib/libcrypto/bn/asm/ppc-mont.pl | 323 |
1 files changed, 0 insertions, 323 deletions
diff --git a/src/lib/libcrypto/bn/asm/ppc-mont.pl b/src/lib/libcrypto/bn/asm/ppc-mont.pl deleted file mode 100644 index 7849eae959..0000000000 --- a/src/lib/libcrypto/bn/asm/ppc-mont.pl +++ /dev/null | |||
@@ -1,323 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # April 2006 | ||
11 | |||
12 | # "Teaser" Montgomery multiplication module for PowerPC. It's possible | ||
13 | # to gain a bit more by modulo-scheduling outer loop, then dedicated | ||
14 | # squaring procedure should give further 20% and code can be adapted | ||
15 | # for 32-bit application running on 64-bit CPU. As for the latter. | ||
16 | # It won't be able to achieve "native" 64-bit performance, because in | ||
17 | # 32-bit application context every addc instruction will have to be | ||
18 | # expanded as addc, twice right shift by 32 and finally adde, etc. | ||
19 | # So far RSA *sign* performance improvement over pre-bn_mul_mont asm | ||
20 | # for 64-bit application running on PPC970/G5 is: | ||
21 | # | ||
22 | # 512-bit +65% | ||
23 | # 1024-bit +35% | ||
24 | # 2048-bit +18% | ||
25 | # 4096-bit +4% | ||
26 | |||
27 | $flavour = shift; | ||
28 | |||
29 | if ($flavour =~ /32/) { | ||
30 | $BITS= 32; | ||
31 | $BNSZ= $BITS/8; | ||
32 | $SIZE_T=4; | ||
33 | $RZONE= 224; | ||
34 | $FRAME= $SIZE_T*16; | ||
35 | |||
36 | $LD= "lwz"; # load | ||
37 | $LDU= "lwzu"; # load and update | ||
38 | $LDX= "lwzx"; # load indexed | ||
39 | $ST= "stw"; # store | ||
40 | $STU= "stwu"; # store and update | ||
41 | $STX= "stwx"; # store indexed | ||
42 | $STUX= "stwux"; # store indexed and update | ||
43 | $UMULL= "mullw"; # unsigned multiply low | ||
44 | $UMULH= "mulhwu"; # unsigned multiply high | ||
45 | $UCMP= "cmplw"; # unsigned compare | ||
46 | $SHRI= "srwi"; # unsigned shift right by immediate | ||
47 | $PUSH= $ST; | ||
48 | $POP= $LD; | ||
49 | } elsif ($flavour =~ /64/) { | ||
50 | $BITS= 64; | ||
51 | $BNSZ= $BITS/8; | ||
52 | $SIZE_T=8; | ||
53 | $RZONE= 288; | ||
54 | $FRAME= $SIZE_T*16; | ||
55 | |||
56 | # same as above, but 64-bit mnemonics... | ||
57 | $LD= "ld"; # load | ||
58 | $LDU= "ldu"; # load and update | ||
59 | $LDX= "ldx"; # load indexed | ||
60 | $ST= "std"; # store | ||
61 | $STU= "stdu"; # store and update | ||
62 | $STX= "stdx"; # store indexed | ||
63 | $STUX= "stdux"; # store indexed and update | ||
64 | $UMULL= "mulld"; # unsigned multiply low | ||
65 | $UMULH= "mulhdu"; # unsigned multiply high | ||
66 | $UCMP= "cmpld"; # unsigned compare | ||
67 | $SHRI= "srdi"; # unsigned shift right by immediate | ||
68 | $PUSH= $ST; | ||
69 | $POP= $LD; | ||
70 | } else { die "nonsense $flavour"; } | ||
71 | |||
72 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
73 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or | ||
74 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or | ||
75 | die "can't locate ppc-xlate.pl"; | ||
76 | |||
77 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; | ||
78 | |||
79 | $sp="r1"; | ||
80 | $toc="r2"; | ||
81 | $rp="r3"; $ovf="r3"; | ||
82 | $ap="r4"; | ||
83 | $bp="r5"; | ||
84 | $np="r6"; | ||
85 | $n0="r7"; | ||
86 | $num="r8"; | ||
87 | $rp="r9"; # $rp is reassigned | ||
88 | $aj="r10"; | ||
89 | $nj="r11"; | ||
90 | $tj="r12"; | ||
91 | # non-volatile registers | ||
92 | $i="r14"; | ||
93 | $j="r15"; | ||
94 | $tp="r16"; | ||
95 | $m0="r17"; | ||
96 | $m1="r18"; | ||
97 | $lo0="r19"; | ||
98 | $hi0="r20"; | ||
99 | $lo1="r21"; | ||
100 | $hi1="r22"; | ||
101 | $alo="r23"; | ||
102 | $ahi="r24"; | ||
103 | $nlo="r25"; | ||
104 | # | ||
105 | $nhi="r0"; | ||
106 | |||
107 | $code=<<___; | ||
108 | .machine "any" | ||
109 | .text | ||
110 | |||
111 | .globl .bn_mul_mont | ||
112 | .align 4 | ||
113 | .bn_mul_mont: | ||
114 | cmpwi $num,4 | ||
115 | mr $rp,r3 ; $rp is reassigned | ||
116 | li r3,0 | ||
117 | bltlr | ||
118 | |||
119 | slwi $num,$num,`log($BNSZ)/log(2)` | ||
120 | li $tj,-4096 | ||
121 | addi $ovf,$num,`$FRAME+$RZONE` | ||
122 | subf $ovf,$ovf,$sp ; $sp-$ovf | ||
123 | and $ovf,$ovf,$tj ; minimize TLB usage | ||
124 | subf $ovf,$sp,$ovf ; $ovf-$sp | ||
125 | srwi $num,$num,`log($BNSZ)/log(2)` | ||
126 | $STUX $sp,$sp,$ovf | ||
127 | |||
128 | $PUSH r14,`4*$SIZE_T`($sp) | ||
129 | $PUSH r15,`5*$SIZE_T`($sp) | ||
130 | $PUSH r16,`6*$SIZE_T`($sp) | ||
131 | $PUSH r17,`7*$SIZE_T`($sp) | ||
132 | $PUSH r18,`8*$SIZE_T`($sp) | ||
133 | $PUSH r19,`9*$SIZE_T`($sp) | ||
134 | $PUSH r20,`10*$SIZE_T`($sp) | ||
135 | $PUSH r21,`11*$SIZE_T`($sp) | ||
136 | $PUSH r22,`12*$SIZE_T`($sp) | ||
137 | $PUSH r23,`13*$SIZE_T`($sp) | ||
138 | $PUSH r24,`14*$SIZE_T`($sp) | ||
139 | $PUSH r25,`15*$SIZE_T`($sp) | ||
140 | |||
141 | $LD $n0,0($n0) ; pull n0[0] value | ||
142 | addi $num,$num,-2 ; adjust $num for counter register | ||
143 | |||
144 | $LD $m0,0($bp) ; m0=bp[0] | ||
145 | $LD $aj,0($ap) ; ap[0] | ||
146 | addi $tp,$sp,$FRAME | ||
147 | $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] | ||
148 | $UMULH $hi0,$aj,$m0 | ||
149 | |||
150 | $LD $aj,$BNSZ($ap) ; ap[1] | ||
151 | $LD $nj,0($np) ; np[0] | ||
152 | |||
153 | $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0 | ||
154 | |||
155 | $UMULL $alo,$aj,$m0 ; ap[1]*bp[0] | ||
156 | $UMULH $ahi,$aj,$m0 | ||
157 | |||
158 | $UMULL $lo1,$nj,$m1 ; np[0]*m1 | ||
159 | $UMULH $hi1,$nj,$m1 | ||
160 | $LD $nj,$BNSZ($np) ; np[1] | ||
161 | addc $lo1,$lo1,$lo0 | ||
162 | addze $hi1,$hi1 | ||
163 | |||
164 | $UMULL $nlo,$nj,$m1 ; np[1]*m1 | ||
165 | $UMULH $nhi,$nj,$m1 | ||
166 | |||
167 | mtctr $num | ||
168 | li $j,`2*$BNSZ` | ||
169 | .align 4 | ||
170 | L1st: | ||
171 | $LDX $aj,$ap,$j ; ap[j] | ||
172 | addc $lo0,$alo,$hi0 | ||
173 | $LDX $nj,$np,$j ; np[j] | ||
174 | addze $hi0,$ahi | ||
175 | $UMULL $alo,$aj,$m0 ; ap[j]*bp[0] | ||
176 | addc $lo1,$nlo,$hi1 | ||
177 | $UMULH $ahi,$aj,$m0 | ||
178 | addze $hi1,$nhi | ||
179 | $UMULL $nlo,$nj,$m1 ; np[j]*m1 | ||
180 | addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] | ||
181 | $UMULH $nhi,$nj,$m1 | ||
182 | addze $hi1,$hi1 | ||
183 | $ST $lo1,0($tp) ; tp[j-1] | ||
184 | |||
185 | addi $j,$j,$BNSZ ; j++ | ||
186 | addi $tp,$tp,$BNSZ ; tp++ | ||
187 | bdnz- L1st | ||
188 | ;L1st | ||
189 | addc $lo0,$alo,$hi0 | ||
190 | addze $hi0,$ahi | ||
191 | |||
192 | addc $lo1,$nlo,$hi1 | ||
193 | addze $hi1,$nhi | ||
194 | addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] | ||
195 | addze $hi1,$hi1 | ||
196 | $ST $lo1,0($tp) ; tp[j-1] | ||
197 | |||
198 | li $ovf,0 | ||
199 | addc $hi1,$hi1,$hi0 | ||
200 | addze $ovf,$ovf ; upmost overflow bit | ||
201 | $ST $hi1,$BNSZ($tp) | ||
202 | |||
203 | li $i,$BNSZ | ||
204 | .align 4 | ||
205 | Louter: | ||
206 | $LDX $m0,$bp,$i ; m0=bp[i] | ||
207 | $LD $aj,0($ap) ; ap[0] | ||
208 | addi $tp,$sp,$FRAME | ||
209 | $LD $tj,$FRAME($sp) ; tp[0] | ||
210 | $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] | ||
211 | $UMULH $hi0,$aj,$m0 | ||
212 | $LD $aj,$BNSZ($ap) ; ap[1] | ||
213 | $LD $nj,0($np) ; np[0] | ||
214 | addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0] | ||
215 | $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] | ||
216 | addze $hi0,$hi0 | ||
217 | $UMULL $m1,$lo0,$n0 ; tp[0]*n0 | ||
218 | $UMULH $ahi,$aj,$m0 | ||
219 | $UMULL $lo1,$nj,$m1 ; np[0]*m1 | ||
220 | $UMULH $hi1,$nj,$m1 | ||
221 | $LD $nj,$BNSZ($np) ; np[1] | ||
222 | addc $lo1,$lo1,$lo0 | ||
223 | $UMULL $nlo,$nj,$m1 ; np[1]*m1 | ||
224 | addze $hi1,$hi1 | ||
225 | $UMULH $nhi,$nj,$m1 | ||
226 | |||
227 | mtctr $num | ||
228 | li $j,`2*$BNSZ` | ||
229 | .align 4 | ||
230 | Linner: | ||
231 | $LDX $aj,$ap,$j ; ap[j] | ||
232 | addc $lo0,$alo,$hi0 | ||
233 | $LD $tj,$BNSZ($tp) ; tp[j] | ||
234 | addze $hi0,$ahi | ||
235 | $LDX $nj,$np,$j ; np[j] | ||
236 | addc $lo1,$nlo,$hi1 | ||
237 | $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] | ||
238 | addze $hi1,$nhi | ||
239 | $UMULH $ahi,$aj,$m0 | ||
240 | addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] | ||
241 | $UMULL $nlo,$nj,$m1 ; np[j]*m1 | ||
242 | addze $hi0,$hi0 | ||
243 | $UMULH $nhi,$nj,$m1 | ||
244 | addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] | ||
245 | addi $j,$j,$BNSZ ; j++ | ||
246 | addze $hi1,$hi1 | ||
247 | $ST $lo1,0($tp) ; tp[j-1] | ||
248 | addi $tp,$tp,$BNSZ ; tp++ | ||
249 | bdnz- Linner | ||
250 | ;Linner | ||
251 | $LD $tj,$BNSZ($tp) ; tp[j] | ||
252 | addc $lo0,$alo,$hi0 | ||
253 | addze $hi0,$ahi | ||
254 | addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] | ||
255 | addze $hi0,$hi0 | ||
256 | |||
257 | addc $lo1,$nlo,$hi1 | ||
258 | addze $hi1,$nhi | ||
259 | addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] | ||
260 | addze $hi1,$hi1 | ||
261 | $ST $lo1,0($tp) ; tp[j-1] | ||
262 | |||
263 | addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA] | ||
264 | li $ovf,0 | ||
265 | adde $hi1,$hi1,$hi0 | ||
266 | addze $ovf,$ovf | ||
267 | $ST $hi1,$BNSZ($tp) | ||
268 | ; | ||
269 | slwi $tj,$num,`log($BNSZ)/log(2)` | ||
270 | $UCMP $i,$tj | ||
271 | addi $i,$i,$BNSZ | ||
272 | ble- Louter | ||
273 | |||
274 | addi $num,$num,2 ; restore $num | ||
275 | subfc $j,$j,$j ; j=0 and "clear" XER[CA] | ||
276 | addi $tp,$sp,$FRAME | ||
277 | mtctr $num | ||
278 | |||
279 | .align 4 | ||
280 | Lsub: $LDX $tj,$tp,$j | ||
281 | $LDX $nj,$np,$j | ||
282 | subfe $aj,$nj,$tj ; tp[j]-np[j] | ||
283 | $STX $aj,$rp,$j | ||
284 | addi $j,$j,$BNSZ | ||
285 | bdnz- Lsub | ||
286 | |||
287 | li $j,0 | ||
288 | mtctr $num | ||
289 | subfe $ovf,$j,$ovf ; handle upmost overflow bit | ||
290 | and $ap,$tp,$ovf | ||
291 | andc $np,$rp,$ovf | ||
292 | or $ap,$ap,$np ; ap=borrow?tp:rp | ||
293 | |||
294 | .align 4 | ||
295 | Lcopy: ; copy or in-place refresh | ||
296 | $LDX $tj,$ap,$j | ||
297 | $STX $tj,$rp,$j | ||
298 | $STX $j,$tp,$j ; zap at once | ||
299 | addi $j,$j,$BNSZ | ||
300 | bdnz- Lcopy | ||
301 | |||
302 | $POP r14,`4*$SIZE_T`($sp) | ||
303 | $POP r15,`5*$SIZE_T`($sp) | ||
304 | $POP r16,`6*$SIZE_T`($sp) | ||
305 | $POP r17,`7*$SIZE_T`($sp) | ||
306 | $POP r18,`8*$SIZE_T`($sp) | ||
307 | $POP r19,`9*$SIZE_T`($sp) | ||
308 | $POP r20,`10*$SIZE_T`($sp) | ||
309 | $POP r21,`11*$SIZE_T`($sp) | ||
310 | $POP r22,`12*$SIZE_T`($sp) | ||
311 | $POP r23,`13*$SIZE_T`($sp) | ||
312 | $POP r24,`14*$SIZE_T`($sp) | ||
313 | $POP r25,`15*$SIZE_T`($sp) | ||
314 | $POP $sp,0($sp) | ||
315 | li r3,1 | ||
316 | blr | ||
317 | .long 0 | ||
318 | .asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" | ||
319 | ___ | ||
320 | |||
321 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
322 | print $code; | ||
323 | close STDOUT; | ||