summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/bn/asm/alpha-mont.pl
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/libcrypto/bn/asm/alpha-mont.pl')
-rw-r--r--  src/lib/libcrypto/bn/asm/alpha-mont.pl  315
1 file changed, 0 insertions, 315 deletions
diff --git a/src/lib/libcrypto/bn/asm/alpha-mont.pl b/src/lib/libcrypto/bn/asm/alpha-mont.pl
deleted file mode 100644
index 874597f1c0..0000000000
--- a/src/lib/libcrypto/bn/asm/alpha-mont.pl
+++ /dev/null
@@ -1,315 +0,0 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# On 21264 RSA sign performance improves by 70/35/20/15 percent for
# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
# instructed to '-tune host' code with in-line assembler. Other
# benchmarks improve by 15-20%. To anchor it to something else, the
# code provides approximately the same performance per GHz as AMD64.
# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
# difference.
#
# NOTE(review): this script is a code generator — it builds the Alpha
# assembly source for bn_mul_mont() in the $code string and prints it
# to stdout.  The Perl variables below are register name aliases that
# get interpolated into the assembly template.

# Argument registers, per the Alpha calling convention (a0-a5 carry the
# first six integer arguments):
# int bn_mul_mont(
$rp="a0";	# BN_ULONG *rp,
$ap="a1";	# const BN_ULONG *ap,
$bp="a2";	# const BN_ULONG *bp,
$np="a3";	# const BN_ULONG *np,
$n0="a4";	# const BN_ULONG *n0,
$num="a5";	# int num);

# Scratch registers: t0-t12 are caller-saved temporaries; s3-s5 are
# callee-saved and are spilled in the prologue (see .mask below).
$lo0="t0";
$hi0="t1";
$lo1="t2";
$hi1="t3";
$aj="t4";
$bi="t5";
$nj="t6";
$tp="t7";
$alo="t8";
$ahi="t9";
$nlo="t10";
$nhi="t11";
$tj="t12";
$i="s3";
$j="s4";
$m1="s5";

# Assembly template.  The heredoc body is emitted verbatim; $-names are
# substituted with the register aliases defined above.  Frame layout:
# 48 bytes for ra/s3/s4/s5/fp saves, then a 4096-byte-aligned scratch
# area of num+2 quadwords for the Montgomery intermediate vector tp[].
$code=<<___;
#include <machine/asm.h>

.text

.set	noat
.set	noreorder

.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	lda	sp,-48(sp)
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp
	.mask	0x0400f000,-48
	.frame	fp,48,ra
	.prologue 0

	.align	4
	.set	reorder
	sextl	$num,$num
	mov	0,v0
	cmplt	$num,4,AT
	bne	AT,.Lexit

	ldq	$hi0,0($ap)	# ap[0]
	s8addq	$num,16,AT
	ldq	$aj,8($ap)
	subq	sp,AT,sp
	ldq	$bi,0($bp)	# bp[0]
	lda	AT,-4096(zero)	# mov -4096,AT
	ldq	$n0,0($n0)
	and	sp,AT,sp

	mulq	$hi0,$bi,$lo0
	ldq	$hi1,0($np)	# np[0]
	umulh	$hi0,$bi,$hi0
	ldq	$nj,8($np)

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	2,$j
	umulh	$aj,$bi,$ahi
	mov	sp,$tp

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
	s8addq	$j,$np,$nj
.align	4
.L1st:
	.set	noreorder
	ldq	$aj,0($aj)
	addl	$j,1,$j
	ldq	$nj,0($nj)
	lda	$tp,8($tp)

	addq	$alo,$hi0,$lo0
	mulq	$aj,$bi,$alo
	cmpult	$lo0,$hi0,AT
	addq	$nlo,$hi1,$lo1

	mulq	$nj,$m1,$nlo
	addq	$ahi,AT,$hi0
	cmpult	$lo1,$hi1,v0
	cmplt	$j,$num,$tj

	umulh	$aj,$bi,$ahi
	addq	$nhi,v0,$hi1
	addq	$lo1,$lo0,$lo1
	s8addq	$j,$ap,$aj

	umulh	$nj,$m1,$nhi
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1
	s8addq	$j,$np,$nj

	stq	$lo1,-8($tp)
	nop
	unop
	bne	$tj,.L1st
	.set	reorder

	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1

	stq	$lo1,0($tp)

	addq	$hi1,$hi0,$hi1
	cmpult	$hi1,$hi0,AT
	stq	$hi1,8($tp)
	stq	AT,16($tp)

	mov	1,$i
.align	4
.Louter:
	s8addq	$i,$bp,$bi
	ldq	$hi0,0($ap)
	ldq	$aj,8($ap)
	ldq	$bi,0($bi)
	ldq	$hi1,0($np)
	ldq	$nj,8($np)
	ldq	$tj,0(sp)

	mulq	$hi0,$bi,$lo0
	umulh	$hi0,$bi,$hi0

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	mov	2,$j
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	sp,$tp
	umulh	$aj,$bi,$ahi

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
.align	4
.Linner:
	.set	noreorder
	ldq	$tj,8($tp)	#L0
	nop			#U1
	ldq	$aj,0($aj)	#L1
	s8addq	$j,$np,$nj	#U0

	ldq	$nj,0($nj)	#L0
	nop			#U1
	addq	$alo,$hi0,$lo0	#L1
	lda	$tp,8($tp)

	mulq	$aj,$bi,$alo	#U1
	cmpult	$lo0,$hi0,AT	#L0
	addq	$nlo,$hi1,$lo1	#L1
	addl	$j,1,$j

	mulq	$nj,$m1,$nlo	#U1
	addq	$ahi,AT,$hi0	#L0
	addq	$lo0,$tj,$lo0	#L1
	cmpult	$lo1,$hi1,v0	#U0

	umulh	$aj,$bi,$ahi	#U1
	cmpult	$lo0,$tj,AT	#L0
	addq	$lo1,$lo0,$lo1	#L1
	addq	$nhi,v0,$hi1	#U0

	umulh	$nj,$m1,$nhi	#U1
	s8addq	$j,$ap,$aj	#L0
	cmpult	$lo1,$lo0,v0	#L1
	cmplt	$j,$num,$tj	#U0	# borrow $tj

	addq	$hi0,AT,$hi0	#L0
	addq	$hi1,v0,$hi1	#U1
	stq	$lo1,-8($tp)	#L1
	bne	$tj,.Linner	#U0
	.set	reorder

	ldq	$tj,8($tp)
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	ldq	$tj,16($tp)
	addq	$lo1,$lo0,$j
	cmpult	$j,$lo0,v0
	addq	$hi1,v0,$hi1

	addq	$hi1,$hi0,$lo1
	stq	$j,0($tp)
	cmpult	$lo1,$hi0,$hi1
	addq	$lo1,$tj,$lo1
	cmpult	$lo1,$tj,AT
	addl	$i,1,$i
	addq	$hi1,AT,$hi1
	stq	$lo1,8($tp)
	cmplt	$i,$num,$tj	# borrow $tj
	stq	$hi1,16($tp)
	bne	$tj,.Louter

	s8addq	$num,sp,$tj	# &tp[num]
	mov	$rp,$bp		# put rp aside
	mov	sp,$tp
	mov	sp,$ap
	mov	0,$hi0		# clear borrow bit

.align	4
.Lsub:	ldq	$lo0,0($tp)
	ldq	$lo1,0($np)
	lda	$tp,8($tp)
	lda	$np,8($np)
	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
	cmpult	$lo0,$lo1,AT
	subq	$lo1,$hi0,$lo0
	cmpult	$lo1,$lo0,$hi0
	or	$hi0,AT,$hi0
	stq	$lo0,0($rp)
	cmpult	$tp,$tj,v0
	lda	$rp,8($rp)
	bne	v0,.Lsub

	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
	mov	sp,$tp
	mov	$bp,$rp		# restore rp

	and	sp,$hi0,$ap
	bic	$bp,$hi0,$bp
	bis	$bp,$ap,$ap	# ap=borrow?tp:rp

.align	4
.Lcopy:	ldq	$aj,0($ap)	# copy or in-place refresh
	lda	$tp,8($tp)
	lda	$rp,8($rp)
	lda	$ap,8($ap)
	stq	zero,-8($tp)	# zap tp
	cmpult	$tp,$tj,AT
	stq	$aj,-8($rp)
	bne	AT,.Lcopy
	mov	1,v0

.Lexit:
	.set	noreorder
	mov	fp,sp
	/*ldq	ra,0(sp)*/
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,48(sp)
	ret	(ra)
.end	bn_mul_mont
.align	2
___

# Emit the generated assembly on stdout; close STDOUT explicitly so a
# write failure (e.g. full disk when redirected) is not silently lost.
print $code;
close STDOUT;