summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/bn/asm/alpha-mont.pl
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/libcrypto/bn/asm/alpha-mont.pl')
-rw-r--r--src/lib/libcrypto/bn/asm/alpha-mont.pl317
1 files changed, 0 insertions, 317 deletions
diff --git a/src/lib/libcrypto/bn/asm/alpha-mont.pl b/src/lib/libcrypto/bn/asm/alpha-mont.pl
deleted file mode 100644
index f7e0ca1646..0000000000
--- a/src/lib/libcrypto/bn/asm/alpha-mont.pl
+++ /dev/null
@@ -1,317 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# On 21264 RSA sign performance improves by 70/35/20/15 percent for
11# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
12# instructed to '-tune host' code with in-line assembler. Other
13# benchmarks improve by 15-20%. To anchor it to something else, the
14# code provides approximately the same performance per GHz as AMD64.
15# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
16# difference.
17
18# int bn_mul_mont(
19$rp="a0"; # BN_ULONG *rp,
20$ap="a1"; # const BN_ULONG *ap,
21$bp="a2"; # const BN_ULONG *bp,
22$np="a3"; # const BN_ULONG *np,
23$n0="a4"; # const BN_ULONG *n0,
24$num="a5"; # int num);
25
26$lo0="t0"; # low word of the running ap[j]*bp[i] (+tp[j]) sum
27$hi0="t1"; # high/carry word of that ap-side sum; first loaded with ap[0]
28$lo1="t2"; # low word of the running np[j]*m1 sum, later the combined result word
29$hi1="t3"; # high/carry word of the np-side sum; first loaded with np[0]
30$aj="t4"; # alternates between the address of ap[j] and the value ap[j]
31$bi="t5"; # bp[i] (briefly the address of bp[i] in the outer loop)
32$nj="t6"; # alternates between the address of np[j] and the value np[j]
33$tp="t7"; # cursor into the temporary vector kept on the stack
34$alo="t8"; # low half of the in-flight ap[j]*bp[i] product
35$ahi="t9"; # high half of the in-flight ap[j]*bp[i] product
36$nlo="t10"; # low half of the in-flight np[j]*m1 product
37$nhi="t11"; # high half of the in-flight np[j]*m1 product
38$tj="t12"; # tp[j] value; also borrowed for loop conditions and as &tp[num]
39$i="s3"; # outer loop index over bp[]; s3 is saved/restored by the prologue/epilogue
40$j="s4"; # inner loop index over ap[]/np[]; s4 is saved/restored as well
41$m1="s5"; # per-row multiplier, low 64 bits of (lo0 * n0); s5 is saved/restored as well
42
43$code=<<___; # heredoc: Alpha assembly text with the register variables above interpolated
44#include <asm.h>
45#include <regdef.h>
46
47.text
48
49.set noat # AT is written by hand as a scratch register throughout
50.set noreorder
51
52.globl bn_mul_mont
53.align 5
54.ent bn_mul_mont
55bn_mul_mont:
56 lda sp,-48(sp) # allocate the 48-byte register save area
57 stq ra,0(sp) # save return address
58 stq s3,8(sp) # preserve s3, used below as outer index i
59 stq s4,16(sp) # preserve s4, used below as inner index j
60 stq s5,24(sp) # preserve s5, used below as multiplier m1
61 stq fp,32(sp) # preserve the caller fp
62 mov sp,fp # fp remembers the save area so the epilogue can undo the variable-size allocation
63 .mask 0x0400f000,-48
64 .frame fp,48,ra
65 .prologue 0
66
67 .align 4
68 .set reorder
69 sextl $num,$num # sign-extend the 32-bit int argument
70 mov 0,v0 # return value stays 0 on the early exit below
71 cmplt $num,4,AT # AT = 1 when num < 4
72 bne AT,.Lexit # num < 4: return 0 without writing rp
73
74 ldq $hi0,0($ap) # ap[0]
75 s8addq $num,16,AT # AT = 8*num+16 bytes, room for num+2 quadwords of tp
76 ldq $aj,8($ap) # ap[1]
77 subq sp,AT,sp # carve the tp vector out of the stack
78 ldq $bi,0($bp) # bp[0]
79 mov -4096,AT # mask that clears the low 12 bits
80 ldq $n0,0($n0) # replace the n0 pointer by the word it points to
81 and sp,AT,sp # round sp down to a multiple of 4096
82
83 mulq $hi0,$bi,$lo0 # low half of ap[0]*bp[0]
84 ldq $hi1,0($np) # np[0]
85 umulh $hi0,$bi,$hi0 # high half of ap[0]*bp[0]
86 ldq $nj,8($np) # np[1]
87
88 mulq $lo0,$n0,$m1 # m1 = low 64 bits of lo0*n0
89
90 mulq $hi1,$m1,$lo1 # low half of np[0]*m1
91 umulh $hi1,$m1,$hi1 # high half of np[0]*m1
92
93 addq $lo1,$lo0,$lo1 # low-word sum, computed only for its carry and never stored
94 cmpult $lo1,$lo0,AT # AT = carry out of that sum
95 addq $hi1,AT,$hi1 # fold the carry into the np-side high word
96
97 mulq $aj,$bi,$alo # low half of ap[1]*bp[0]
98 mov 2,$j # j = 2, index of the next element to fetch
99 umulh $aj,$bi,$ahi # high half of ap[1]*bp[0]
100 mov sp,$tp # tp cursor starts at the base of the temporary
101
102 mulq $nj,$m1,$nlo # low half of np[1]*m1
103 s8addq $j,$ap,$aj # aj = address of ap[j]
104 umulh $nj,$m1,$nhi # high half of np[1]*m1
105 s8addq $j,$np,$nj # nj = address of np[j]
106.align 4
107.L1st:
108 .set noreorder # first row, bp[0]: products for element j are issued while those for j-1 are summed
109 ldq $aj,($aj) # ap[j], loaded through its own address
110 addl $j,1,$j # j++
111 ldq $nj,($nj) # np[j]
112 lda $tp,8($tp) # advance the tp cursor, the store below uses offset -8
113
114 addq $alo,$hi0,$lo0 # previous ap product low half + ap-side high word
115 mulq $aj,$bi,$alo # issue low half of ap[j]*bp[0]
116 cmpult $lo0,$hi0,AT # carry out of the ap-side add
117 addq $nlo,$hi1,$lo1 # previous np product low half + np-side high word
118
119 mulq $nj,$m1,$nlo # issue low half of np[j]*m1
120 addq $ahi,AT,$hi0 # new ap-side high word
121 cmpult $lo1,$hi1,v0 # carry out of the np-side add, v0 is plain scratch here
122 cmplt $j,$num,$tj # loop condition j < num, parked in tj
123
124 umulh $aj,$bi,$ahi # issue high half of ap[j]*bp[0]
125 addq $nhi,v0,$hi1 # new np-side high word
126 addq $lo1,$lo0,$lo1 # combine ap side and np side
127 s8addq $j,$ap,$aj # address of the next ap element
128
129 umulh $nj,$m1,$nhi # issue high half of np[j]*m1
130 cmpult $lo1,$lo0,v0 # carry out of the combine
131 addq $hi1,v0,$hi1 # fold it into the np-side high word
132 s8addq $j,$np,$nj # address of the next np element
133
134 stq $lo1,-8($tp) # store the finished result word into tp
135 nop
136 unop
137 bne $tj,.L1st # repeat while j < num
138 .set reorder
139
140 addq $alo,$hi0,$lo0 # drain the last pair of products, same sums as in the loop
141 addq $nlo,$hi1,$lo1
142 cmpult $lo0,$hi0,AT
143 cmpult $lo1,$hi1,v0
144 addq $ahi,AT,$hi0
145 addq $nhi,v0,$hi1
146
147 addq $lo1,$lo0,$lo1
148 cmpult $lo1,$lo0,v0
149 addq $hi1,v0,$hi1
150
151 stq $lo1,0($tp) # tp[num-2]
152
153 addq $hi1,$hi0,$hi1 # add the two remaining high words
154 cmpult $hi1,$hi0,AT # carry out of that add
155 stq $hi1,8($tp) # tp[num-1]
156 stq AT,16($tp) # tp[num] = top carry word
157
158 mov 1,$i # i = 1, bp[0] was consumed by the first row above
159.align 4
160.Louter:
161 s8addq $i,$bp,$bi # address of bp[i]
162 ldq $hi0,($ap) # ap[0]
163 ldq $aj,8($ap) # ap[1]
164 ldq $bi,($bi) # bp[i]
165 ldq $hi1,($np) # np[0]
166 ldq $nj,8($np) # np[1]
167 ldq $tj,(sp) # tp[0] from the previous row
168
169 mulq $hi0,$bi,$lo0 # low half of ap[0]*bp[i]
170 umulh $hi0,$bi,$hi0 # high half of ap[0]*bp[i]
171
172 addq $lo0,$tj,$lo0 # add tp[0]
173 cmpult $lo0,$tj,AT # carry out of that add
174 addq $hi0,AT,$hi0 # fold it into the ap-side high word
175
176 mulq $lo0,$n0,$m1 # m1 for this row = low 64 bits of lo0*n0
177
178 mulq $hi1,$m1,$lo1 # low half of np[0]*m1
179 umulh $hi1,$m1,$hi1 # high half of np[0]*m1
180
181 addq $lo1,$lo0,$lo1 # low-word sum, computed only for its carry and never stored
182 cmpult $lo1,$lo0,AT # carry out of that sum
183 mov 2,$j # j = 2, index of the next element to fetch
184 addq $hi1,AT,$hi1 # fold the carry into the np-side high word
185
186 mulq $aj,$bi,$alo # low half of ap[1]*bp[i]
187 mov sp,$tp # rewind the tp cursor to the base
188 umulh $aj,$bi,$ahi # high half of ap[1]*bp[i]
189
190 mulq $nj,$m1,$nlo # low half of np[1]*m1
191 s8addq $j,$ap,$aj # aj = address of ap[j]
192 umulh $nj,$m1,$nhi # high half of np[1]*m1
193.align 4
194.Linner:
195 .set noreorder # same pipeline as the first row, with the old tp word added in; the result lands one slot lower
196 ldq $tj,8($tp) #L0
197 nop #U1
198 ldq $aj,($aj) #L1
199 s8addq $j,$np,$nj #U0
200
201 ldq $nj,($nj) #L0
202 nop #U1
203 addq $alo,$hi0,$lo0 #L1
204 lda $tp,8($tp) # advance the tp cursor, the store below uses offset -8
205
206 mulq $aj,$bi,$alo #U1
207 cmpult $lo0,$hi0,AT #L0
208 addq $nlo,$hi1,$lo1 #L1
209 addl $j,1,$j # j++
210
211 mulq $nj,$m1,$nlo #U1
212 addq $ahi,AT,$hi0 #L0
213 addq $lo0,$tj,$lo0 #L1
214 cmpult $lo1,$hi1,v0 #U0
215
216 umulh $aj,$bi,$ahi #U1
217 cmpult $lo0,$tj,AT #L0
218 addq $lo1,$lo0,$lo1 #L1
219 addq $nhi,v0,$hi1 #U0
220
221 umulh $nj,$m1,$nhi #U1
222 s8addq $j,$ap,$aj #L0
223 cmpult $lo1,$lo0,v0 #L1
224 cmplt $j,$num,$tj #U0 # borrow $tj
225
226 addq $hi0,AT,$hi0 #L0
227 addq $hi1,v0,$hi1 #U1
228 stq $lo1,-8($tp) #L1
229 bne $tj,.Linner #U0
230 .set reorder
231
232 ldq $tj,8($tp) # tp[num-1] from the previous row
233 addq $alo,$hi0,$lo0 # drain the last pair of products, same sums as in the loop
234 addq $nlo,$hi1,$lo1
235 cmpult $lo0,$hi0,AT
236 cmpult $lo1,$hi1,v0
237 addq $ahi,AT,$hi0
238 addq $nhi,v0,$hi1
239
240 addq $lo0,$tj,$lo0 # add the old tp word
241 cmpult $lo0,$tj,AT # carry out of that add
242 addq $hi0,AT,$hi0 # fold it into the ap-side high word
243
244 ldq $tj,16($tp) # tp[num], the carry word left by the previous row
245 addq $lo1,$lo0,$j # j is no longer needed as an index and serves as a temporary
246 cmpult $j,$lo0,v0 # carry out of the combine
247 addq $hi1,v0,$hi1 # fold it into the np-side high word
248
249 addq $hi1,$hi0,$lo1 # add the two remaining high words
250 stq $j,($tp) # tp[num-2]
251 cmpult $lo1,$hi0,$hi1 # hi1 = carry out of that add
252 addq $lo1,$tj,$lo1 # add the previous carry word
253 cmpult $lo1,$tj,AT # carry out of that add
254 addl $i,1,$i # i++
255 addq $hi1,AT,$hi1 # hi1 = new top carry word
256 stq $lo1,8($tp) # tp[num-1]
257 cmplt $i,$num,$tj # borrow $tj
258 stq $hi1,16($tp) # tp[num]; hi1 keeps this value for the final correction
259 bne $tj,.Louter # repeat while i < num
260
261 s8addq $num,sp,$tj # &tp[num]
262 mov $rp,$bp # put rp aside
263 mov sp,$tp # rewind the tp cursor to the base
264 mov sp,$ap # this value is overwritten before the copy loop reads ap
265 mov 0,$hi0 # clear borrow bit
266
267.align 4
268.Lsub: ldq $lo0,($tp) # tp[i]
269 ldq $lo1,($np) # np[i]
270 lda $tp,8($tp) # advance the tp cursor
271 lda $np,8($np) # advance the np cursor
272 subq $lo0,$lo1,$lo1 # tp[i]-np[i]
273 cmpult $lo0,$lo1,AT # borrow out of tp[i]-np[i]
274 subq $lo1,$hi0,$lo0 # subtract the incoming borrow
275 cmpult $lo1,$lo0,$hi0 # borrow out of that subtraction
276 or $hi0,AT,$hi0 # outgoing borrow for the next word
277 stq $lo0,($rp) # rp[i] = difference word
278 cmpult $tp,$tj,v0 # any words left below &tp[num]
279 lda $rp,8($rp) # advance the rp cursor
280 bne v0,.Lsub
281
282 subq $hi1,$hi0,$hi0 # handle upmost overflow bit
283 mov sp,$tp # rewind the tp cursor to the base
284 mov $bp,$rp # restore rp
285
286 and sp,$hi0,$ap # tp base when the mask is all ones, otherwise 0
287 bic $bp,$hi0,$bp # rp when the mask is 0, otherwise 0
288 bis $bp,$ap,$ap # ap=borrow?tp:rp
289
290.align 4
291.Lcopy: ldq $aj,($ap) # copy or in-place refresh
292 lda $tp,8($tp) # advance the tp cursor
293 lda $rp,8($rp) # advance the rp cursor
294 lda $ap,8($ap) # advance the source cursor
295 stq zero,-8($tp) # zap tp
296 cmpult $tp,$tj,AT # any words left below &tp[num]
297 stq $aj,-8($rp) # write the selected word to rp
298 bne AT,.Lcopy
299 mov 1,v0 # return value 1 on the full path
300
301.Lexit:
302 .set noreorder
303 mov fp,sp # drop the tp area, sp points at the save area again
304 /*ldq ra,0(sp)*/
305 ldq s3,8(sp) # restore the saved registers
306 ldq s4,16(sp)
307 ldq s5,24(sp)
308 ldq fp,32(sp)
309 lda sp,48(sp) # release the save area
310 ret (ra)
311.end bn_mul_mont
312.rdata
313.asciiz "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
314___
315
316print $code; # write the generated assembly to standard output
317close STDOUT or die "error closing STDOUT: $!"; # buffered write errors only surface at close; fail loudly instead of leaving a truncated file