diff options
Diffstat (limited to 'src/lib/libcrypto/bn/asm/alpha-mont.pl')
-rw-r--r-- | src/lib/libcrypto/bn/asm/alpha-mont.pl | 317 |
1 files changed, 317 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/asm/alpha-mont.pl b/src/lib/libcrypto/bn/asm/alpha-mont.pl new file mode 100644 index 0000000000..7a2cc3173b --- /dev/null +++ b/src/lib/libcrypto/bn/asm/alpha-mont.pl | |||
@@ -0,0 +1,317 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # On 21264 RSA sign performance improves by 70/35/20/15 percent for | ||
11 | # 512/1024/2048/4096 bit key lengths. The baseline is vendor-compiler | ||
12 | # code built with '-tune host' and using in-line assembler. Other | ||
13 | # benchmarks improve by 15-20%. To anchor it to something else, the | ||
14 | # code provides approximately the same performance per GHz as AMD64. | ||
15 | # I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x | ||
16 | # difference. | ||
17 | |||
18 | # int bn_mul_mont( -- returns 1 on success, 0 (nothing computed) when num<4 | ||
19 | $rp="a0"; # BN_ULONG *rp, -- result; num words are stored through it | ||
20 | $ap="a1"; # const BN_ULONG *ap, -- words ap[0..num-1], each multiplied by the current bp[i] | ||
21 | $bp="a2"; # const BN_ULONG *bp, -- one word bp[i] is consumed per outer pass | ||
22 | $np="a3"; # const BN_ULONG *np, -- words np[j]; subtracted from the accumulator at the end (presumably the modulus) | ||
23 | $n0="a4"; # const BN_ULONG *n0, -- pointer; dereferenced once, after which the register holds the value | ||
24 | $num="a5"; # int num); -- word count; sign-extended from 32 bits on entry | ||
25 | |||
26 | $lo0="t0"; | ||
27 | $hi0="t1"; | ||
28 | $lo1="t2"; | ||
29 | $hi1="t3"; | ||
30 | $aj="t4"; | ||
31 | $bi="t5"; | ||
32 | $nj="t6"; | ||
33 | $tp="t7"; | ||
34 | $alo="t8"; | ||
35 | $ahi="t9"; | ||
36 | $nlo="t10"; | ||
37 | $nhi="t11"; | ||
38 | $tj="t12"; | ||
39 | $i="s3"; | ||
40 | $j="s4"; | ||
41 | $m1="s5"; | ||
42 | |||
43 | $code=<<___; | ||
44 | #include <asm.h> | ||
45 | #include <regdef.h> | ||
46 | |||
47 | .text | ||
48 | |||
49 | .set noat | ||
50 | .set noreorder | ||
51 | |||
52 | .globl bn_mul_mont | ||
53 | .align 5 | ||
54 | .ent bn_mul_mont | ||
55 | bn_mul_mont: | ||
56 | lda sp,-40(sp) | ||
57 | stq ra,0(sp) | ||
58 | stq s3,8(sp) | ||
59 | stq s4,16(sp) | ||
60 | stq s5,24(sp) | ||
61 | stq fp,32(sp) | ||
62 | mov sp,fp | ||
63 | .mask 0x0400f000,-40 | ||
64 | .frame fp,40,ra | ||
65 | .prologue 0 | ||
66 | |||
67 | .align 4 | ||
68 | .set reorder | ||
69 | sextl $num,$num | ||
70 | mov 0,v0 | ||
71 | cmplt $num,4,AT | ||
72 | bne AT,.Lexit | ||
73 | |||
74 | ldq $hi0,0($ap) # ap[0] | ||
75 | s8addq $num,16,AT | ||
76 | ldq $aj,8($ap) | ||
77 | subq sp,AT,sp | ||
78 | ldq $bi,0($bp) # bp[0] | ||
79 | mov -4096,AT | ||
80 | ldq $n0,0($n0) | ||
81 | and sp,AT,sp | ||
82 | |||
83 | mulq $hi0,$bi,$lo0 | ||
84 | ldq $hi1,0($np) # np[0] | ||
85 | umulh $hi0,$bi,$hi0 | ||
86 | ldq $nj,8($np) | ||
87 | |||
88 | mulq $lo0,$n0,$m1 | ||
89 | |||
90 | mulq $hi1,$m1,$lo1 | ||
91 | umulh $hi1,$m1,$hi1 | ||
92 | |||
93 | addq $lo1,$lo0,$lo1 | ||
94 | cmpult $lo1,$lo0,AT | ||
95 | addq $hi1,AT,$hi1 | ||
96 | |||
97 | mulq $aj,$bi,$alo | ||
98 | mov 2,$j | ||
99 | umulh $aj,$bi,$ahi | ||
100 | mov sp,$tp | ||
101 | |||
102 | mulq $nj,$m1,$nlo | ||
103 | s8addq $j,$ap,$aj | ||
104 | umulh $nj,$m1,$nhi | ||
105 | s8addq $j,$np,$nj | ||
106 | .align 4 | ||
107 | .L1st: | ||
108 | .set noreorder | ||
109 | ldq $aj,($aj) | ||
110 | addl $j,1,$j | ||
111 | ldq $nj,($nj) | ||
112 | lda $tp,8($tp) | ||
113 | |||
114 | addq $alo,$hi0,$lo0 | ||
115 | mulq $aj,$bi,$alo | ||
116 | cmpult $lo0,$hi0,AT | ||
117 | addq $nlo,$hi1,$lo1 | ||
118 | |||
119 | mulq $nj,$m1,$nlo | ||
120 | addq $ahi,AT,$hi0 | ||
121 | cmpult $lo1,$hi1,v0 | ||
122 | cmplt $j,$num,$tj | ||
123 | |||
124 | umulh $aj,$bi,$ahi | ||
125 | addq $nhi,v0,$hi1 | ||
126 | addq $lo1,$lo0,$lo1 | ||
127 | s8addq $j,$ap,$aj | ||
128 | |||
129 | umulh $nj,$m1,$nhi | ||
130 | cmpult $lo1,$lo0,v0 | ||
131 | addq $hi1,v0,$hi1 | ||
132 | s8addq $j,$np,$nj | ||
133 | |||
134 | stq $lo1,-8($tp) | ||
135 | nop | ||
136 | unop | ||
137 | bne $tj,.L1st | ||
138 | .set reorder | ||
139 | |||
140 | addq $alo,$hi0,$lo0 | ||
141 | addq $nlo,$hi1,$lo1 | ||
142 | cmpult $lo0,$hi0,AT | ||
143 | cmpult $lo1,$hi1,v0 | ||
144 | addq $ahi,AT,$hi0 | ||
145 | addq $nhi,v0,$hi1 | ||
146 | |||
147 | addq $lo1,$lo0,$lo1 | ||
148 | cmpult $lo1,$lo0,v0 | ||
149 | addq $hi1,v0,$hi1 | ||
150 | |||
151 | stq $lo1,0($tp) | ||
152 | |||
153 | addq $hi1,$hi0,$hi1 | ||
154 | cmpult $hi1,$hi0,AT | ||
155 | stq $hi1,8($tp) | ||
156 | stq AT,16($tp) | ||
157 | |||
158 | mov 1,$i | ||
159 | .align 4 | ||
160 | .Louter: | ||
161 | s8addq $i,$bp,$bi | ||
162 | ldq $hi0,($ap) | ||
163 | ldq $aj,8($ap) | ||
164 | ldq $bi,($bi) | ||
165 | ldq $hi1,($np) | ||
166 | ldq $nj,8($np) | ||
167 | ldq $tj,(sp) | ||
168 | |||
169 | mulq $hi0,$bi,$lo0 | ||
170 | umulh $hi0,$bi,$hi0 | ||
171 | |||
172 | addq $lo0,$tj,$lo0 | ||
173 | cmpult $lo0,$tj,AT | ||
174 | addq $hi0,AT,$hi0 | ||
175 | |||
176 | mulq $lo0,$n0,$m1 | ||
177 | |||
178 | mulq $hi1,$m1,$lo1 | ||
179 | umulh $hi1,$m1,$hi1 | ||
180 | |||
181 | addq $lo1,$lo0,$lo1 | ||
182 | cmpult $lo1,$lo0,AT | ||
183 | mov 2,$j | ||
184 | addq $hi1,AT,$hi1 | ||
185 | |||
186 | mulq $aj,$bi,$alo | ||
187 | mov sp,$tp | ||
188 | umulh $aj,$bi,$ahi | ||
189 | |||
190 | mulq $nj,$m1,$nlo | ||
191 | s8addq $j,$ap,$aj | ||
192 | umulh $nj,$m1,$nhi | ||
193 | .align 4 | ||
194 | .Linner: | ||
195 | .set noreorder | ||
196 | ldq $tj,8($tp) #L0 | ||
197 | nop #U1 | ||
198 | ldq $aj,($aj) #L1 | ||
199 | s8addq $j,$np,$nj #U0 | ||
200 | |||
201 | ldq $nj,($nj) #L0 | ||
202 | nop #U1 | ||
203 | addq $alo,$hi0,$lo0 #L1 | ||
204 | lda $tp,8($tp) | ||
205 | |||
206 | mulq $aj,$bi,$alo #U1 | ||
207 | cmpult $lo0,$hi0,AT #L0 | ||
208 | addq $nlo,$hi1,$lo1 #L1 | ||
209 | addl $j,1,$j | ||
210 | |||
211 | mulq $nj,$m1,$nlo #U1 | ||
212 | addq $ahi,AT,$hi0 #L0 | ||
213 | addq $lo0,$tj,$lo0 #L1 | ||
214 | cmpult $lo1,$hi1,v0 #U0 | ||
215 | |||
216 | umulh $aj,$bi,$ahi #U1 | ||
217 | cmpult $lo0,$tj,AT #L0 | ||
218 | addq $lo1,$lo0,$lo1 #L1 | ||
219 | addq $nhi,v0,$hi1 #U0 | ||
220 | |||
221 | umulh $nj,$m1,$nhi #U1 | ||
222 | s8addq $j,$ap,$aj #L0 | ||
223 | cmpult $lo1,$lo0,v0 #L1 | ||
224 | cmplt $j,$num,$tj #U0 # borrow $tj as the j<num loop flag; .Linner reloads it from tp | ||
225 | |||
226 | addq $hi0,AT,$hi0 #L0 | ||
227 | addq $hi1,v0,$hi1 #U1 | ||
228 | stq $lo1,-8($tp) #L1 | ||
229 | bne $tj,.Linner #U0 | ||
230 | .set reorder | ||
231 | |||
232 | ldq $tj,8($tp) | ||
233 | addq $alo,$hi0,$lo0 | ||
234 | addq $nlo,$hi1,$lo1 | ||
235 | cmpult $lo0,$hi0,AT | ||
236 | cmpult $lo1,$hi1,v0 | ||
237 | addq $ahi,AT,$hi0 | ||
238 | addq $nhi,v0,$hi1 | ||
239 | |||
240 | addq $lo0,$tj,$lo0 | ||
241 | cmpult $lo0,$tj,AT | ||
242 | addq $hi0,AT,$hi0 | ||
243 | |||
244 | ldq $tj,16($tp) | ||
245 | addq $lo1,$lo0,$j | ||
246 | cmpult $j,$lo0,v0 | ||
247 | addq $hi1,v0,$hi1 | ||
248 | |||
249 | addq $hi1,$hi0,$lo1 | ||
250 | stq $j,($tp) | ||
251 | cmpult $lo1,$hi0,$hi1 | ||
252 | addq $lo1,$tj,$lo1 | ||
253 | cmpult $lo1,$tj,AT | ||
254 | addl $i,1,$i | ||
255 | addq $hi1,AT,$hi1 | ||
256 | stq $lo1,8($tp) | ||
257 | cmplt $i,$num,$tj # borrow $tj as the i<num loop flag for .Louter | ||
258 | stq $hi1,16($tp) | ||
259 | bne $tj,.Louter | ||
260 | |||
261 | s8addq $num,sp,$tj # &tp[num], end bound of the .Lsub and .Lcopy loops | ||
262 | mov $rp,$bp # put rp aside; bp[] is not read past this point | ||
263 | mov sp,$tp | ||
264 | mov sp,$ap | ||
265 | mov 0,$hi0 # clear borrow bit carried between .Lsub iterations | ||
266 | |||
267 | .align 4 | ||
268 | .Lsub: ldq $lo0,($tp) | ||
269 | ldq $lo1,($np) | ||
270 | lda $tp,8($tp) | ||
271 | lda $np,8($np) | ||
272 | subq $lo0,$lo1,$lo1 # tp[i]-np[i]; the incoming borrow is subtracted two lines down | ||
273 | cmpult $lo0,$lo1,AT | ||
274 | subq $lo1,$hi0,$lo0 | ||
275 | cmpult $lo1,$lo0,$hi0 | ||
276 | or $hi0,AT,$hi0 | ||
277 | stq $lo0,($rp) | ||
278 | cmpult $tp,$tj,v0 | ||
279 | lda $rp,8($rp) | ||
280 | bne v0,.Lsub | ||
281 | |||
282 | subq $hi1,$hi0,$hi0 # handle upmost overflow bit: top carry word minus final borrow, used below as a select mask | ||
283 | mov sp,$tp | ||
284 | mov $bp,$rp # restore rp | ||
285 | |||
286 | and sp,$hi0,$ap | ||
287 | bic $bp,$hi0,$bp | ||
288 | bis $bp,$ap,$ap # ap=borrow?tp:rp, chosen by masking rather than a branch | ||
289 | |||
290 | .align 4 | ||
291 | .Lcopy: ldq $aj,($ap) # copy tp to rp, or in-place refresh of rp, depending on ap | ||
292 | lda $tp,8($tp) | ||
293 | lda $rp,8($rp) | ||
294 | lda $ap,8($ap) | ||
295 | stq zero,-8($tp) # zap tp (scratch words on the stack) | ||
296 | cmpult $tp,$tj,AT | ||
297 | stq $aj,-8($rp) | ||
298 | bne AT,.Lcopy | ||
299 | mov 1,v0 | ||
300 | |||
301 | .Lexit: | ||
302 | .set noreorder | ||
303 | mov fp,sp | ||
304 | /*ldq ra,0(sp)*/ | ||
305 | ldq s3,8(sp) | ||
306 | ldq s4,16(sp) | ||
307 | ldq s5,24(sp) | ||
308 | ldq fp,32(sp) | ||
309 | lda sp,40(sp) | ||
310 | ret (ra) | ||
311 | .end bn_mul_mont | ||
312 | .rdata | ||
313 | .asciiz "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>" | ||
314 | ___ | ||
315 | |||
316 | print $code; | ||
317 | close STDOUT; | ||