path: root/src/lib/libcrypto/bn/asm
Diffstat
-rw-r--r--  src/lib/libcrypto/bn/asm/alpha-mont.pl        315
-rw-r--r--  src/lib/libcrypto/bn/asm/armv4-mont.pl        204
-rw-r--r--  src/lib/libcrypto/bn/asm/bn-586.pl            567
-rw-r--r--  src/lib/libcrypto/bn/asm/co-586.pl            287
-rw-r--r--  src/lib/libcrypto/bn/asm/mips-mont.pl         426
-rw-r--r--  src/lib/libcrypto/bn/asm/mips.pl             2234
-rw-r--r--  src/lib/libcrypto/bn/asm/modexp512-x86_64.pl 1393
-rw-r--r--  src/lib/libcrypto/bn/asm/parisc-mont.pl       985
-rw-r--r--  src/lib/libcrypto/bn/asm/ppc-mont.pl          329
-rw-r--r--  src/lib/libcrypto/bn/asm/ppc.pl              1968
-rwxr-xr-x  src/lib/libcrypto/bn/asm/x86-mont.pl          592
-rwxr-xr-x  src/lib/libcrypto/bn/asm/x86_64-mont.pl      1503
-rwxr-xr-x  src/lib/libcrypto/bn/asm/x86_64-mont5.pl     1192
13 files changed, 0 insertions, 11995 deletions
diff --git a/src/lib/libcrypto/bn/asm/alpha-mont.pl b/src/lib/libcrypto/bn/asm/alpha-mont.pl
deleted file mode 100644
index 874597f1c0..0000000000
--- a/src/lib/libcrypto/bn/asm/alpha-mont.pl
+++ /dev/null
@@ -1,315 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# On 21264 RSA sign performance improves by 70/35/20/15 percent for
11# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
12# instructed to '-tune host' code with in-line assembler. Other
13# benchmarks improve by 15-20%. To anchor it to something else, the
14# code provides approximately the same performance per GHz as AMD64.
15# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
16# difference.
17
18# int bn_mul_mont(
19$rp="a0"; # BN_ULONG *rp,
20$ap="a1"; # const BN_ULONG *ap,
21$bp="a2"; # const BN_ULONG *bp,
22$np="a3"; # const BN_ULONG *np,
23$n0="a4"; # const BN_ULONG *n0,
24$num="a5"; # int num);
25
26$lo0="t0";
27$hi0="t1";
28$lo1="t2";
29$hi1="t3";
30$aj="t4";
31$bi="t5";
32$nj="t6";
33$tp="t7";
34$alo="t8";
35$ahi="t9";
36$nlo="t10";
37$nhi="t11";
38$tj="t12";
39$i="s3";
40$j="s4";
41$m1="s5";
42
43$code=<<___;
44#include <machine/asm.h>
45
46.text
47
48.set noat
49.set noreorder
50
51.globl bn_mul_mont
52.align 5
53.ent bn_mul_mont
54bn_mul_mont:
55 lda sp,-48(sp)
56 stq ra,0(sp)
57 stq s3,8(sp)
58 stq s4,16(sp)
59 stq s5,24(sp)
60 stq fp,32(sp)
61 mov sp,fp
62 .mask 0x0400f000,-48
63 .frame fp,48,ra
64 .prologue 0
65
66 .align 4
67 .set reorder
68 sextl $num,$num
69 mov 0,v0
70 cmplt $num,4,AT
71 bne AT,.Lexit
72
73 ldq $hi0,0($ap) # ap[0]
74 s8addq $num,16,AT
75 ldq $aj,8($ap)
76 subq sp,AT,sp
77 ldq $bi,0($bp) # bp[0]
78 lda AT,-4096(zero) # mov -4096,AT
79 ldq $n0,0($n0)
80 and sp,AT,sp
81
82 mulq $hi0,$bi,$lo0
83 ldq $hi1,0($np) # np[0]
84 umulh $hi0,$bi,$hi0
85 ldq $nj,8($np)
86
87 mulq $lo0,$n0,$m1
88
89 mulq $hi1,$m1,$lo1
90 umulh $hi1,$m1,$hi1
91
92 addq $lo1,$lo0,$lo1
93 cmpult $lo1,$lo0,AT
94 addq $hi1,AT,$hi1
95
96 mulq $aj,$bi,$alo
97 mov 2,$j
98 umulh $aj,$bi,$ahi
99 mov sp,$tp
100
101 mulq $nj,$m1,$nlo
102 s8addq $j,$ap,$aj
103 umulh $nj,$m1,$nhi
104 s8addq $j,$np,$nj
105.align 4
106.L1st:
107 .set noreorder
108 ldq $aj,0($aj)
109 addl $j,1,$j
110 ldq $nj,0($nj)
111 lda $tp,8($tp)
112
113 addq $alo,$hi0,$lo0
114 mulq $aj,$bi,$alo
115 cmpult $lo0,$hi0,AT
116 addq $nlo,$hi1,$lo1
117
118 mulq $nj,$m1,$nlo
119 addq $ahi,AT,$hi0
120 cmpult $lo1,$hi1,v0
121 cmplt $j,$num,$tj
122
123 umulh $aj,$bi,$ahi
124 addq $nhi,v0,$hi1
125 addq $lo1,$lo0,$lo1
126 s8addq $j,$ap,$aj
127
128 umulh $nj,$m1,$nhi
129 cmpult $lo1,$lo0,v0
130 addq $hi1,v0,$hi1
131 s8addq $j,$np,$nj
132
133 stq $lo1,-8($tp)
134 nop
135 unop
136 bne $tj,.L1st
137 .set reorder
138
139 addq $alo,$hi0,$lo0
140 addq $nlo,$hi1,$lo1
141 cmpult $lo0,$hi0,AT
142 cmpult $lo1,$hi1,v0
143 addq $ahi,AT,$hi0
144 addq $nhi,v0,$hi1
145
146 addq $lo1,$lo0,$lo1
147 cmpult $lo1,$lo0,v0
148 addq $hi1,v0,$hi1
149
150 stq $lo1,0($tp)
151
152 addq $hi1,$hi0,$hi1
153 cmpult $hi1,$hi0,AT
154 stq $hi1,8($tp)
155 stq AT,16($tp)
156
157 mov 1,$i
158.align 4
159.Louter:
160 s8addq $i,$bp,$bi
161 ldq $hi0,0($ap)
162 ldq $aj,8($ap)
163 ldq $bi,0($bi)
164 ldq $hi1,0($np)
165 ldq $nj,8($np)
166 ldq $tj,0(sp)
167
168 mulq $hi0,$bi,$lo0
169 umulh $hi0,$bi,$hi0
170
171 addq $lo0,$tj,$lo0
172 cmpult $lo0,$tj,AT
173 addq $hi0,AT,$hi0
174
175 mulq $lo0,$n0,$m1
176
177 mulq $hi1,$m1,$lo1
178 umulh $hi1,$m1,$hi1
179
180 addq $lo1,$lo0,$lo1
181 cmpult $lo1,$lo0,AT
182 mov 2,$j
183 addq $hi1,AT,$hi1
184
185 mulq $aj,$bi,$alo
186 mov sp,$tp
187 umulh $aj,$bi,$ahi
188
189 mulq $nj,$m1,$nlo
190 s8addq $j,$ap,$aj
191 umulh $nj,$m1,$nhi
192.align 4
193.Linner:
194 .set noreorder
195 ldq $tj,8($tp) #L0
196 nop #U1
197 ldq $aj,0($aj) #L1
198 s8addq $j,$np,$nj #U0
199
200 ldq $nj,0($nj) #L0
201 nop #U1
202 addq $alo,$hi0,$lo0 #L1
203 lda $tp,8($tp)
204
205 mulq $aj,$bi,$alo #U1
206 cmpult $lo0,$hi0,AT #L0
207 addq $nlo,$hi1,$lo1 #L1
208 addl $j,1,$j
209
210 mulq $nj,$m1,$nlo #U1
211 addq $ahi,AT,$hi0 #L0
212 addq $lo0,$tj,$lo0 #L1
213 cmpult $lo1,$hi1,v0 #U0
214
215 umulh $aj,$bi,$ahi #U1
216 cmpult $lo0,$tj,AT #L0
217 addq $lo1,$lo0,$lo1 #L1
218 addq $nhi,v0,$hi1 #U0
219
220 umulh $nj,$m1,$nhi #U1
221 s8addq $j,$ap,$aj #L0
222 cmpult $lo1,$lo0,v0 #L1
223 cmplt $j,$num,$tj #U0 # borrow $tj
224
225 addq $hi0,AT,$hi0 #L0
226 addq $hi1,v0,$hi1 #U1
227 stq $lo1,-8($tp) #L1
228 bne $tj,.Linner #U0
229 .set reorder
230
231 ldq $tj,8($tp)
232 addq $alo,$hi0,$lo0
233 addq $nlo,$hi1,$lo1
234 cmpult $lo0,$hi0,AT
235 cmpult $lo1,$hi1,v0
236 addq $ahi,AT,$hi0
237 addq $nhi,v0,$hi1
238
239 addq $lo0,$tj,$lo0
240 cmpult $lo0,$tj,AT
241 addq $hi0,AT,$hi0
242
243 ldq $tj,16($tp)
244 addq $lo1,$lo0,$j
245 cmpult $j,$lo0,v0
246 addq $hi1,v0,$hi1
247
248 addq $hi1,$hi0,$lo1
249 stq $j,0($tp)
250 cmpult $lo1,$hi0,$hi1
251 addq $lo1,$tj,$lo1
252 cmpult $lo1,$tj,AT
253 addl $i,1,$i
254 addq $hi1,AT,$hi1
255 stq $lo1,8($tp)
256 cmplt $i,$num,$tj # borrow $tj
257 stq $hi1,16($tp)
258 bne $tj,.Louter
259
260 s8addq $num,sp,$tj # &tp[num]
261 mov $rp,$bp # put rp aside
262 mov sp,$tp
263 mov sp,$ap
264 mov 0,$hi0 # clear borrow bit
265
266.align 4
267.Lsub: ldq $lo0,0($tp)
268 ldq $lo1,0($np)
269 lda $tp,8($tp)
270 lda $np,8($np)
271 subq $lo0,$lo1,$lo1 # tp[i]-np[i]
272 cmpult $lo0,$lo1,AT
273 subq $lo1,$hi0,$lo0
274 cmpult $lo1,$lo0,$hi0
275 or $hi0,AT,$hi0
276 stq $lo0,0($rp)
277 cmpult $tp,$tj,v0
278 lda $rp,8($rp)
279 bne v0,.Lsub
280
281 subq $hi1,$hi0,$hi0 # handle upmost overflow bit
282 mov sp,$tp
283 mov $bp,$rp # restore rp
284
285 and sp,$hi0,$ap
286 bic $bp,$hi0,$bp
287 bis $bp,$ap,$ap # ap=borrow?tp:rp
288
289.align 4
290.Lcopy: ldq $aj,0($ap) # copy or in-place refresh
291 lda $tp,8($tp)
292 lda $rp,8($rp)
293 lda $ap,8($ap)
294 stq zero,-8($tp) # zap tp
295 cmpult $tp,$tj,AT
296 stq $aj,-8($rp)
297 bne AT,.Lcopy
298 mov 1,v0
299
300.Lexit:
301 .set noreorder
302 mov fp,sp
303 /*ldq ra,0(sp)*/
304 ldq s3,8(sp)
305 ldq s4,16(sp)
306 ldq s5,24(sp)
307 ldq fp,32(sp)
308 lda sp,48(sp)
309 ret (ra)
310.end bn_mul_mont
311.align 2
312___
313
314print $code;
315close STDOUT;
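For reference, the operation each of these bn_mul_mont() modules implements is Montgomery multiplication: rp = ap*bp*R^-1 mod np, with R = 2^(word size * num). A minimal big-number sketch of that reduction follows, written in Perl with Math::BigInt purely for illustration; mont_mul and its arguments are hypothetical names, and it reduces against the full R in one step where the assembler above interleaves the reduction word by word.

#!/usr/bin/env perl
# Illustrative model of the operation bn_mul_mont() implements; not part
# of the deleted module.
use strict;
use warnings;
use Math::BigInt;

sub mont_mul {
    my ($a, $b, $n, $nbits) = @_;     # Math::BigInt inputs, $n odd, $a,$b < $n
    my $R  = Math::BigInt->bone->blsft($nbits);       # R = 2^(word size * num)
    my $n0 = $n->copy->bmodinv($R)->bneg->bmod($R);   # -n^-1 mod R
    my $t  = $a->copy->bmul($b);
    my $m  = $t->copy->bmul($n0)->bmod($R);
    my $u  = $t->badd($m->bmul($n))->brsft($nbits);   # (t + m*n) / R, exact
    $u->bsub($n) if $u->bcmp($n) >= 0;                # single conditional subtraction
    return $u;                                        # a*b*R^-1 mod n
}

# Tiny check in Montgomery form: mont_mul(aR, bR) == a*b*R mod n.
my ($n, $k) = (Math::BigInt->new(13), 8);
my $R   = Math::BigInt->bone->blsft($k);
my $aR  = Math::BigInt->new(7)->bmul($R)->bmod($n);
my $bR  = Math::BigInt->new(9)->bmul($R)->bmod($n);
my $abR = mont_mul($aR, $bR, $n, $k);
print $abR->copy->bmul($R->copy->bmodinv($n))->bmod($n), "\n";   # prints 11 = 7*9 mod 13

Because both inputs are below np, the value left after the shift is below 2*np, which is why the .Lsub/.Lcopy tail above only ever needs one conditional subtraction.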
diff --git a/src/lib/libcrypto/bn/asm/armv4-mont.pl b/src/lib/libcrypto/bn/asm/armv4-mont.pl
deleted file mode 100644
index f78a8b5f0f..0000000000
--- a/src/lib/libcrypto/bn/asm/armv4-mont.pl
+++ /dev/null
@@ -1,204 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# January 2007.
11
12# Montgomery multiplication for ARMv4.
13#
14# Performance improvement naturally varies among CPU implementations
15# and compilers. The code was observed to provide +65-35% improvement
16# [depending on key length, less for longer keys] on ARM920T, and
17# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
18# base and compiler generated code with in-lined umull and even umlal
19# instructions. The latter means that this code didn't really have an
20# "advantage" of utilizing some "secret" instruction.
21#
22# The code is interoperable with Thumb ISA and is rather compact, less
23# than 1/2KB. Windows CE port would be trivial, as it's exclusively
24# about decorations, ABI and instruction syntax are identical.
25
26while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
27open STDOUT,">$output";
28
29$num="r0"; # starts as num argument, but holds &tp[num-1]
30$ap="r1";
31$bp="r2"; $bi="r2"; $rp="r2";
32$np="r3";
33$tp="r4";
34$aj="r5";
35$nj="r6";
36$tj="r7";
37$n0="r8";
38########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
39$alo="r10"; # sl, gcc uses it to keep @GOT
40$ahi="r11"; # fp
41$nlo="r12"; # ip
42########### # r13 is stack pointer
43$nhi="r14"; # lr
44########### # r15 is program counter
45
46#### argument block layout relative to &tp[num-1], a.k.a. $num
47$_rp="$num,#12*4";
48# ap permanently resides in r1
49$_bp="$num,#13*4";
50# np permanently resides in r3
51$_n0="$num,#14*4";
52$_num="$num,#15*4"; $_bpend=$_num;
53
54$code=<<___;
55.text
56
57.global bn_mul_mont
58.type bn_mul_mont,%function
59
60.align 2
61bn_mul_mont:
62 stmdb sp!,{r0,r2} @ sp points at argument block
63 ldr $num,[sp,#3*4] @ load num
64 cmp $num,#2
65 movlt r0,#0
66 addlt sp,sp,#2*4
67 blt .Labrt
68
69 stmdb sp!,{r4-r12,lr} @ save 10 registers
70
71 mov $num,$num,lsl#2 @ rescale $num for byte count
72 sub sp,sp,$num @ alloca(4*num)
73 sub sp,sp,#4 @ +extra dword
74 sub $num,$num,#4 @ "num=num-1"
75 add $tp,$bp,$num @ &bp[num-1]
76
77 add $num,sp,$num @ $num to point at &tp[num-1]
78 ldr $n0,[$_n0] @ &n0
79 ldr $bi,[$bp] @ bp[0]
80 ldr $aj,[$ap],#4 @ ap[0],ap++
81 ldr $nj,[$np],#4 @ np[0],np++
82 ldr $n0,[$n0] @ *n0
83 str $tp,[$_bpend] @ save &bp[num]
84
85 umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
86 str $n0,[$_n0] @ save n0 value
87 mul $n0,$alo,$n0 @ "tp[0]"*n0
88 mov $nlo,#0
89 umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
90 mov $tp,sp
91
92.L1st:
93 ldr $aj,[$ap],#4 @ ap[j],ap++
94 mov $alo,$ahi
95 ldr $nj,[$np],#4 @ np[j],np++
96 mov $ahi,#0
97 umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
98 mov $nhi,#0
99 umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
100 adds $nlo,$nlo,$alo
101 str $nlo,[$tp],#4 @ tp[j-1]=,tp++
102 adc $nlo,$nhi,#0
103 cmp $tp,$num
104 bne .L1st
105
106 adds $nlo,$nlo,$ahi
107 ldr $tp,[$_bp] @ restore bp
108 mov $nhi,#0
109 ldr $n0,[$_n0] @ restore n0
110 adc $nhi,$nhi,#0
111 str $nlo,[$num] @ tp[num-1]=
112 str $nhi,[$num,#4] @ tp[num]=
113
114.Louter:
115 sub $tj,$num,sp @ "original" $num-1 value
116 sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
117 ldr $bi,[$tp,#4]! @ *(++bp)
118 sub $np,$np,$tj @ "rewind" np to &np[1]
119 ldr $aj,[$ap,#-4] @ ap[0]
120 ldr $alo,[sp] @ tp[0]
121 ldr $nj,[$np,#-4] @ np[0]
122 ldr $tj,[sp,#4] @ tp[1]
123
124 mov $ahi,#0
125 umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
126 str $tp,[$_bp] @ save bp
127 mul $n0,$alo,$n0
128 mov $nlo,#0
129 umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
130 mov $tp,sp
131
132.Linner:
133 ldr $aj,[$ap],#4 @ ap[j],ap++
134 adds $alo,$ahi,$tj @ +=tp[j]
135 ldr $nj,[$np],#4 @ np[j],np++
136 mov $ahi,#0
137 umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
138 mov $nhi,#0
139 umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
140 adc $ahi,$ahi,#0
141 ldr $tj,[$tp,#8] @ tp[j+1]
142 adds $nlo,$nlo,$alo
143 str $nlo,[$tp],#4 @ tp[j-1]=,tp++
144 adc $nlo,$nhi,#0
145 cmp $tp,$num
146 bne .Linner
147
148 adds $nlo,$nlo,$ahi
149 mov $nhi,#0
150 ldr $tp,[$_bp] @ restore bp
151 adc $nhi,$nhi,#0
152 ldr $n0,[$_n0] @ restore n0
153 adds $nlo,$nlo,$tj
154 ldr $tj,[$_bpend] @ restore &bp[num]
155 adc $nhi,$nhi,#0
156 str $nlo,[$num] @ tp[num-1]=
157 str $nhi,[$num,#4] @ tp[num]=
158
159 cmp $tp,$tj
160 bne .Louter
161
162 ldr $rp,[$_rp] @ pull rp
163 add $num,$num,#4 @ $num to point at &tp[num]
164 sub $aj,$num,sp @ "original" num value
165 mov $tp,sp @ "rewind" $tp
166 mov $ap,$tp @ "borrow" $ap
167 sub $np,$np,$aj @ "rewind" $np to &np[0]
168
169 subs $tj,$tj,$tj @ "clear" carry flag
170.Lsub: ldr $tj,[$tp],#4
171 ldr $nj,[$np],#4
172 sbcs $tj,$tj,$nj @ tp[j]-np[j]
173 str $tj,[$rp],#4 @ rp[j]=
174 teq $tp,$num @ preserve carry
175 bne .Lsub
176 sbcs $nhi,$nhi,#0 @ upmost carry
177 mov $tp,sp @ "rewind" $tp
178 sub $rp,$rp,$aj @ "rewind" $rp
179
180 and $ap,$tp,$nhi
181 bic $np,$rp,$nhi
182 orr $ap,$ap,$np @ ap=borrow?tp:rp
183
184.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
185 str sp,[$tp],#4 @ zap tp
186 str $tj,[$rp],#4
187 cmp $tp,$num
188 bne .Lcopy
189
190 add sp,$num,#4 @ skip over tp[num+1]
191 ldmia sp!,{r4-r12,lr} @ restore registers
192 add sp,sp,#2*4 @ skip over {r0,r2}
193 mov r0,#1
194.Labrt: tst lr,#1
195 moveq pc,lr @ be binary compatible with V4, yet
196 bx lr @ interoperable with Thumb ISA:-)
197.size bn_mul_mont,.-bn_mul_mont
198.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
199.align 2
200___
201
202$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
203print $code;
204close STDOUT;
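The .L1st/.Louter/.Linner structure above is the usual word-serial form of the same reduction: for each word bp[i], add ap*bp[i] and m*np into the running accumulator tp, with m chosen so the low word cancels, then drop that word. A rough Perl model of that loop follows; it is illustrative only, uses hypothetical names, and works in 16-bit limbs so plain Perl integer arithmetic never loses precision.

# Word-serial Montgomery multiplication, mirroring the .L1st/.Louter/.Linner
# structure above. Illustrative sketch only; limbs are least significant first.
use strict;
use warnings;

my $BITS = 16;
my $MASK = (1 << $BITS) - 1;

sub mont_mul_words {
    my ($ap, $bp, $np, $n0, $num) = @_;   # array refs; $n0 = -np[0]^-1 mod 2^$BITS
    my @tp = (0) x ($num + 2);
    for my $i (0 .. $num - 1) {
        # choose m so the low word of tp + ap*bp[i] + m*np cancels
        my $m = (($tp[0] + $ap->[0] * $bp->[$i]) * $n0) & $MASK;
        my $c = 0;
        for my $j (0 .. $num - 1) {
            my $t = $tp[$j] + $ap->[$j] * $bp->[$i] + $m * $np->[$j] + $c;
            $tp[$j] = $t & $MASK;
            $c      = $t >> $BITS;
        }
        my $t = $tp[$num] + $c;
        $tp[$num]     = $t & $MASK;
        $tp[$num + 1] = $t >> $BITS;
        shift @tp;                        # tp[0] is now zero: divide by the word base
        push @tp, 0;
    }
    return [ @tp[0 .. $num] ];            # num+1 words; the final subtraction of np
}                                         # (the .Lsub/.Lcopy tail) is left out here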
diff --git a/src/lib/libcrypto/bn/asm/bn-586.pl b/src/lib/libcrypto/bn/asm/bn-586.pl
deleted file mode 100644
index 71b775af8d..0000000000
--- a/src/lib/libcrypto/bn/asm/bn-586.pl
+++ /dev/null
@@ -1,567 +0,0 @@
1#!/usr/local/bin/perl
2
3$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
4push(@INC,"${dir}","${dir}../../perlasm");
5require "x86asm.pl";
6
7&asm_init($ARGV[0],$0);
8
9$sse2=0;
10for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
11
12&external_label("OPENSSL_ia32cap_P") if ($sse2);
13
14&bn_mul_add_words("bn_mul_add_words");
15&bn_mul_words("bn_mul_words");
16&bn_sqr_words("bn_sqr_words");
17&bn_div_words("bn_div_words");
18&bn_add_words("bn_add_words");
19&bn_sub_words("bn_sub_words");
20
21&asm_finish();
22
23sub bn_mul_add_words
24 {
25 local($name)=@_;
26
27 &function_begin_B($name,"");
28
29 $r="eax";
30 $a="edx";
31 $c="ecx";
32
33 if ($sse2) {
34 &picsetup("eax");
35 &picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
36 &bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
37 &jnc(&label("maw_non_sse2"));
38
39 &mov($r,&wparam(0));
40 &mov($a,&wparam(1));
41 &mov($c,&wparam(2));
42 &movd("mm0",&wparam(3)); # mm0 = w
43 &pxor("mm1","mm1"); # mm1 = carry_in
44 &jmp(&label("maw_sse2_entry"));
45
46 &set_label("maw_sse2_unrolled",16);
47 &movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
48 &paddq("mm1","mm3"); # mm1 = carry_in + r[0]
49 &movd("mm2",&DWP(0,$a,"",0)); # mm2 = a[0]
50 &pmuludq("mm2","mm0"); # mm2 = w*a[0]
51 &movd("mm4",&DWP(4,$a,"",0)); # mm4 = a[1]
52 &pmuludq("mm4","mm0"); # mm4 = w*a[1]
53 &movd("mm6",&DWP(8,$a,"",0)); # mm6 = a[2]
54 &pmuludq("mm6","mm0"); # mm6 = w*a[2]
55 &movd("mm7",&DWP(12,$a,"",0)); # mm7 = a[3]
56 &pmuludq("mm7","mm0"); # mm7 = w*a[3]
57 &paddq("mm1","mm2"); # mm1 = carry_in + r[0] + w*a[0]
58 &movd("mm3",&DWP(4,$r,"",0)); # mm3 = r[1]
59 &paddq("mm3","mm4"); # mm3 = r[1] + w*a[1]
60 &movd("mm5",&DWP(8,$r,"",0)); # mm5 = r[2]
61 &paddq("mm5","mm6"); # mm5 = r[2] + w*a[2]
62 &movd("mm4",&DWP(12,$r,"",0)); # mm4 = r[3]
63 &paddq("mm7","mm4"); # mm7 = r[3] + w*a[3]
64 &movd(&DWP(0,$r,"",0),"mm1");
65 &movd("mm2",&DWP(16,$a,"",0)); # mm2 = a[4]
66 &pmuludq("mm2","mm0"); # mm2 = w*a[4]
67 &psrlq("mm1",32); # mm1 = carry0
68 &movd("mm4",&DWP(20,$a,"",0)); # mm4 = a[5]
69 &pmuludq("mm4","mm0"); # mm4 = w*a[5]
70 &paddq("mm1","mm3"); # mm1 = carry0 + r[1] + w*a[1]
71 &movd("mm6",&DWP(24,$a,"",0)); # mm6 = a[6]
72 &pmuludq("mm6","mm0"); # mm6 = w*a[6]
73 &movd(&DWP(4,$r,"",0),"mm1");
74 &psrlq("mm1",32); # mm1 = carry1
75 &movd("mm3",&DWP(28,$a,"",0)); # mm3 = a[7]
76 &add($a,32);
77 &pmuludq("mm3","mm0"); # mm3 = w*a[7]
78 &paddq("mm1","mm5"); # mm1 = carry1 + r[2] + w*a[2]
79 &movd("mm5",&DWP(16,$r,"",0)); # mm5 = r[4]
80 &paddq("mm2","mm5"); # mm2 = r[4] + w*a[4]
81 &movd(&DWP(8,$r,"",0),"mm1");
82 &psrlq("mm1",32); # mm1 = carry2
83 &paddq("mm1","mm7"); # mm1 = carry2 + r[3] + w*a[3]
84 &movd("mm5",&DWP(20,$r,"",0)); # mm5 = r[5]
85 &paddq("mm4","mm5"); # mm4 = r[5] + w*a[5]
86 &movd(&DWP(12,$r,"",0),"mm1");
87 &psrlq("mm1",32); # mm1 = carry3
88 &paddq("mm1","mm2"); # mm1 = carry3 + r[4] + w*a[4]
89 &movd("mm5",&DWP(24,$r,"",0)); # mm5 = r[6]
90 &paddq("mm6","mm5"); # mm6 = r[6] + w*a[6]
91 &movd(&DWP(16,$r,"",0),"mm1");
92 &psrlq("mm1",32); # mm1 = carry4
93 &paddq("mm1","mm4"); # mm1 = carry4 + r[5] + w*a[5]
94 &movd("mm5",&DWP(28,$r,"",0)); # mm5 = r[7]
95 &paddq("mm3","mm5"); # mm3 = r[7] + w*a[7]
96 &movd(&DWP(20,$r,"",0),"mm1");
97 &psrlq("mm1",32); # mm1 = carry5
98 &paddq("mm1","mm6"); # mm1 = carry5 + r[6] + w*a[6]
99 &movd(&DWP(24,$r,"",0),"mm1");
100 &psrlq("mm1",32); # mm1 = carry6
101 &paddq("mm1","mm3"); # mm1 = carry6 + r[7] + w*a[7]
102 &movd(&DWP(28,$r,"",0),"mm1");
103 &lea($r,&DWP(32,$r));
104 &psrlq("mm1",32); # mm1 = carry_out
105
106 &sub($c,8);
107 &jz(&label("maw_sse2_exit"));
108 &set_label("maw_sse2_entry");
109 &test($c,0xfffffff8);
110 &jnz(&label("maw_sse2_unrolled"));
111
112 &set_label("maw_sse2_loop",4);
113 &movd("mm2",&DWP(0,$a)); # mm2 = a[i]
114 &movd("mm3",&DWP(0,$r)); # mm3 = r[i]
115 &pmuludq("mm2","mm0"); # a[i] *= w
116 &lea($a,&DWP(4,$a));
117 &paddq("mm1","mm3"); # carry += r[i]
118 &paddq("mm1","mm2"); # carry += a[i]*w
119 &movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
120 &sub($c,1);
121 &psrlq("mm1",32); # carry = carry_high
122 &lea($r,&DWP(4,$r));
123 &jnz(&label("maw_sse2_loop"));
124 &set_label("maw_sse2_exit");
125 &movd("eax","mm1"); # c = carry_out
126 &emms();
127 &ret();
128
129 &set_label("maw_non_sse2",16);
130 }
131
132 # function_begin prologue
133 &push("ebp");
134 &push("ebx");
135 &push("esi");
136 &push("edi");
137
138 &comment("");
139 $Low="eax";
140 $High="edx";
141 $a="ebx";
142 $w="ebp";
143 $r="edi";
144 $c="esi";
145
146 &xor($c,$c); # clear carry
147 &mov($r,&wparam(0)); #
148
149 &mov("ecx",&wparam(2)); #
150 &mov($a,&wparam(1)); #
151
152 &and("ecx",0xfffffff8); # num / 8
153 &mov($w,&wparam(3)); #
154
155 &push("ecx"); # Up the stack for a tmp variable
156
157 &jz(&label("maw_finish"));
158
159 &set_label("maw_loop",16);
160
161 for ($i=0; $i<32; $i+=4)
162 {
163 &comment("Round $i");
164
165 &mov("eax",&DWP($i,$a)); # *a
166 &mul($w); # *a * w
167 &add("eax",$c); # L(t)+= c
168 &adc("edx",0); # H(t)+=carry
169 &add("eax",&DWP($i,$r)); # L(t)+= *r
170 &adc("edx",0); # H(t)+=carry
171 &mov(&DWP($i,$r),"eax"); # *r= L(t);
172 &mov($c,"edx"); # c= H(t);
173 }
174
175 &comment("");
176 &sub("ecx",8);
177 &lea($a,&DWP(32,$a));
178 &lea($r,&DWP(32,$r));
179 &jnz(&label("maw_loop"));
180
181 &set_label("maw_finish",0);
182 &mov("ecx",&wparam(2)); # get num
183 &and("ecx",7);
184 &jnz(&label("maw_finish2")); # helps branch prediction
185 &jmp(&label("maw_end"));
186
187 &set_label("maw_finish2",1);
188 for ($i=0; $i<7; $i++)
189 {
190 &comment("Tail Round $i");
191 &mov("eax",&DWP($i*4,$a)); # *a
192 &mul($w); # *a * w
193 &add("eax",$c); # L(t)+=c
194 &adc("edx",0); # H(t)+=carry
195 &add("eax",&DWP($i*4,$r)); # L(t)+= *r
196 &adc("edx",0); # H(t)+=carry
197 &dec("ecx") if ($i != 7-1);
198 &mov(&DWP($i*4,$r),"eax"); # *r= L(t);
199 &mov($c,"edx"); # c= H(t);
200 &jz(&label("maw_end")) if ($i != 7-1);
201 }
202 &set_label("maw_end",0);
203 &mov("eax",$c);
204
205 &pop("ecx"); # clear variable from
206
207 &function_end($name);
208 }
209
210sub bn_mul_words
211 {
212 local($name)=@_;
213
214 &function_begin_B($name,"");
215
216 $r="eax";
217 $a="edx";
218 $c="ecx";
219
220 if ($sse2) {
221 &picsetup("eax");
222 &picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
223 &bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
224 &jnc(&label("mw_non_sse2"));
225
226 &mov($r,&wparam(0));
227 &mov($a,&wparam(1));
228 &mov($c,&wparam(2));
229 &movd("mm0",&wparam(3)); # mm0 = w
230 &pxor("mm1","mm1"); # mm1 = carry = 0
231
232 &set_label("mw_sse2_loop",16);
233 &movd("mm2",&DWP(0,$a)); # mm2 = a[i]
234 &pmuludq("mm2","mm0"); # a[i] *= w
235 &lea($a,&DWP(4,$a));
236 &paddq("mm1","mm2"); # carry += a[i]*w
237 &movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
238 &sub($c,1);
239 &psrlq("mm1",32); # carry = carry_high
240 &lea($r,&DWP(4,$r));
241 &jnz(&label("mw_sse2_loop"));
242
243 &movd("eax","mm1"); # return carry
244 &emms();
245 &ret();
246 &set_label("mw_non_sse2",16);
247 }
248
249 # function_begin prologue
250 &push("ebp");
251 &push("ebx");
252 &push("esi");
253 &push("edi");
254
255 &comment("");
256 $Low="eax";
257 $High="edx";
258 $a="ebx";
259 $w="ecx";
260 $r="edi";
261 $c="esi";
262 $num="ebp";
263
264 &xor($c,$c); # clear carry
265 &mov($r,&wparam(0)); #
266 &mov($a,&wparam(1)); #
267 &mov($num,&wparam(2)); #
268 &mov($w,&wparam(3)); #
269
270 &and($num,0xfffffff8); # num / 8
271 &jz(&label("mw_finish"));
272
273 &set_label("mw_loop",0);
274 for ($i=0; $i<32; $i+=4)
275 {
276 &comment("Round $i");
277
278 &mov("eax",&DWP($i,$a,"",0)); # *a
279 &mul($w); # *a * w
280 &add("eax",$c); # L(t)+=c
281 # XXX
282
283 &adc("edx",0); # H(t)+=carry
284 &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
285
286 &mov($c,"edx"); # c= H(t);
287 }
288
289 &comment("");
290 &add($a,32);
291 &add($r,32);
292 &sub($num,8);
293 &jz(&label("mw_finish"));
294 &jmp(&label("mw_loop"));
295
296 &set_label("mw_finish",0);
297 &mov($num,&wparam(2)); # get num
298 &and($num,7);
299 &jnz(&label("mw_finish2"));
300 &jmp(&label("mw_end"));
301
302 &set_label("mw_finish2",1);
303 for ($i=0; $i<7; $i++)
304 {
305 &comment("Tail Round $i");
306 &mov("eax",&DWP($i*4,$a,"",0));# *a
307 &mul($w); # *a * w
308 &add("eax",$c); # L(t)+=c
309 # XXX
310 &adc("edx",0); # H(t)+=carry
311 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
312 &mov($c,"edx"); # c= H(t);
313 &dec($num) if ($i != 7-1);
314 &jz(&label("mw_end")) if ($i != 7-1);
315 }
316 &set_label("mw_end",0);
317 &mov("eax",$c);
318
319 &function_end($name);
320 }
321
322sub bn_sqr_words
323 {
324 local($name)=@_;
325
326 &function_begin_B($name,"");
327
328 $r="eax";
329 $a="edx";
330 $c="ecx";
331
332 if ($sse2) {
333 &picsetup("eax");
334 &picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
335 &bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
336 &jnc(&label("sqr_non_sse2"));
337
338 &mov($r,&wparam(0));
339 &mov($a,&wparam(1));
340 &mov($c,&wparam(2));
341
342 &set_label("sqr_sse2_loop",16);
343 &movd("mm0",&DWP(0,$a)); # mm0 = a[i]
344 &pmuludq("mm0","mm0"); # a[i] *= a[i]
345 &lea($a,&DWP(4,$a)); # a++
346 &movq(&QWP(0,$r),"mm0"); # r[i] = a[i]*a[i]
347 &sub($c,1);
348 &lea($r,&DWP(8,$r)); # r += 2
349 &jnz(&label("sqr_sse2_loop"));
350
351 &emms();
352 &ret();
353 &set_label("sqr_non_sse2",16);
354 }
355
356 # function_begin prologue
357 &push("ebp");
358 &push("ebx");
359 &push("esi");
360 &push("edi");
361
362 &comment("");
363 $r="esi";
364 $a="edi";
365 $num="ebx";
366
367 &mov($r,&wparam(0)); #
368 &mov($a,&wparam(1)); #
369 &mov($num,&wparam(2)); #
370
371 &and($num,0xfffffff8); # num / 8
372 &jz(&label("sw_finish"));
373
374 &set_label("sw_loop",0);
375 for ($i=0; $i<32; $i+=4)
376 {
377 &comment("Round $i");
378 &mov("eax",&DWP($i,$a,"",0)); # *a
379 # XXX
380 &mul("eax"); # *a * *a
381 &mov(&DWP($i*2,$r,"",0),"eax"); #
382 &mov(&DWP($i*2+4,$r,"",0),"edx");#
383 }
384
385 &comment("");
386 &add($a,32);
387 &add($r,64);
388 &sub($num,8);
389 &jnz(&label("sw_loop"));
390
391 &set_label("sw_finish",0);
392 &mov($num,&wparam(2)); # get num
393 &and($num,7);
394 &jz(&label("sw_end"));
395
396 for ($i=0; $i<7; $i++)
397 {
398 &comment("Tail Round $i");
399 &mov("eax",&DWP($i*4,$a,"",0)); # *a
400 # XXX
401 &mul("eax"); # *a * *a
402 &mov(&DWP($i*8,$r,"",0),"eax"); #
403 &dec($num) if ($i != 7-1);
404 &mov(&DWP($i*8+4,$r,"",0),"edx");
405 &jz(&label("sw_end")) if ($i != 7-1);
406 }
407 &set_label("sw_end",0);
408
409 &function_end($name);
410 }
411
412sub bn_div_words
413 {
414 local($name)=@_;
415
416 &function_begin_B($name,"");
417 &mov("edx",&wparam(0)); #
418 &mov("eax",&wparam(1)); #
419 &mov("ecx",&wparam(2)); #
420 &div("ecx");
421 &ret();
422 &function_end_B($name);
423 }
424
425sub bn_add_words
426 {
427 local($name)=@_;
428
429 &function_begin($name,"");
430
431 &comment("");
432 $a="esi";
433 $b="edi";
434 $c="eax";
435 $r="ebx";
436 $tmp1="ecx";
437 $tmp2="edx";
438 $num="ebp";
439
440 &mov($r,&wparam(0)); # get r
441 &mov($a,&wparam(1)); # get a
442 &mov($b,&wparam(2)); # get b
443 &mov($num,&wparam(3)); # get num
444 &xor($c,$c); # clear carry
445 &and($num,0xfffffff8); # num / 8
446
447 &jz(&label("aw_finish"));
448
449 &set_label("aw_loop",0);
450 for ($i=0; $i<8; $i++)
451 {
452 &comment("Round $i");
453
454 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
455 &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
456 &add($tmp1,$c);
457 &mov($c,0);
458 &adc($c,$c);
459 &add($tmp1,$tmp2);
460 &adc($c,0);
461 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
462 }
463
464 &comment("");
465 &add($a,32);
466 &add($b,32);
467 &add($r,32);
468 &sub($num,8);
469 &jnz(&label("aw_loop"));
470
471 &set_label("aw_finish",0);
472 &mov($num,&wparam(3)); # get num
473 &and($num,7);
474 &jz(&label("aw_end"));
475
476 for ($i=0; $i<7; $i++)
477 {
478 &comment("Tail Round $i");
479 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
480 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
481 &add($tmp1,$c);
482 &mov($c,0);
483 &adc($c,$c);
484 &add($tmp1,$tmp2);
485 &adc($c,0);
486 &dec($num) if ($i != 6);
487 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
488 &jz(&label("aw_end")) if ($i != 6);
489 }
490 &set_label("aw_end",0);
491
492# &mov("eax",$c); # $c is "eax"
493
494 &function_end($name);
495 }
496
497sub bn_sub_words
498 {
499 local($name)=@_;
500
501 &function_begin($name,"");
502
503 &comment("");
504 $a="esi";
505 $b="edi";
506 $c="eax";
507 $r="ebx";
508 $tmp1="ecx";
509 $tmp2="edx";
510 $num="ebp";
511
512 &mov($r,&wparam(0)); # get r
513 &mov($a,&wparam(1)); # get a
514 &mov($b,&wparam(2)); # get b
515 &mov($num,&wparam(3)); # get num
516 &xor($c,$c); # clear carry
517 &and($num,0xfffffff8); # num / 8
518
519 &jz(&label("aw_finish"));
520
521 &set_label("aw_loop",0);
522 for ($i=0; $i<8; $i++)
523 {
524 &comment("Round $i");
525
526 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
527 &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
528 &sub($tmp1,$c);
529 &mov($c,0);
530 &adc($c,$c);
531 &sub($tmp1,$tmp2);
532 &adc($c,0);
533 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
534 }
535
536 &comment("");
537 &add($a,32);
538 &add($b,32);
539 &add($r,32);
540 &sub($num,8);
541 &jnz(&label("aw_loop"));
542
543 &set_label("aw_finish",0);
544 &mov($num,&wparam(3)); # get num
545 &and($num,7);
546 &jz(&label("aw_end"));
547
548 for ($i=0; $i<7; $i++)
549 {
550 &comment("Tail Round $i");
551 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
552 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
553 &sub($tmp1,$c);
554 &mov($c,0);
555 &adc($c,$c);
556 &sub($tmp1,$tmp2);
557 &adc($c,0);
558 &dec($num) if ($i != 6);
559 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
560 &jz(&label("aw_end")) if ($i != 6);
561 }
562 &set_label("aw_end",0);
563
564# &mov("eax",$c); # $c is "eax"
565
566 &function_end($name);
567 }
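The six routines generated above are the word-array primitives BIGNUM arithmetic is built from, and their contracts are easiest to read in scalar form. As an example, here is a Perl model of what bn_mul_add_words(r, a, num, w) computes; it is illustrative only, with a hypothetical helper name and 16-bit words so native Perl arithmetic stays exact.

# Reference model of bn_mul_add_words(): r[i] += a[i]*w with carry
# propagation, returning the final carry word. Not part of the deleted module.
use strict;
use warnings;

my $BITS = 16;
my $MASK = (1 << $BITS) - 1;

sub mul_add_words {
    my ($r, $a, $num, $w) = @_;       # $r, $a: array refs of $num words each
    my $carry = 0;
    for my $i (0 .. $num - 1) {
        my $t = $r->[$i] + $a->[$i] * $w + $carry;
        $r->[$i] = $t & $MASK;
        $carry   = $t >> $BITS;
    }
    return $carry;                    # the caller accounts for this above r[num-1]
}

my @r = (0xffff, 0x0001);
my $c = mul_add_words(\@r, [ 0x00ff, 0x00ff ], 2, 0x0100);
# @r is now (0xfeff, 0xff02) and $c is 0

The SSE2 path above keeps this running carry in mm1 and the integer path keeps it in esi; both are eight-way unrollings of the same loop.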
diff --git a/src/lib/libcrypto/bn/asm/co-586.pl b/src/lib/libcrypto/bn/asm/co-586.pl
deleted file mode 100644
index 37d79cc0c1..0000000000
--- a/src/lib/libcrypto/bn/asm/co-586.pl
+++ /dev/null
@@ -1,287 +0,0 @@
1#!/usr/local/bin/perl
2
3$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
4push(@INC,"${dir}","${dir}../../perlasm");
5require "x86asm.pl";
6
7&asm_init($ARGV[0],$0);
8
9&bn_mul_comba("bn_mul_comba8",8);
10&bn_mul_comba("bn_mul_comba4",4);
11&bn_sqr_comba("bn_sqr_comba8",8);
12&bn_sqr_comba("bn_sqr_comba4",4);
13
14&asm_finish();
15
16sub mul_add_c
17 {
18 local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
19
20 # pos == -1 if eax and edx are pre-loaded, 0 to load from next
21 # words, and 1 if load return value
22
23 &comment("mul a[$ai]*b[$bi]");
24
25 # "eax" and "edx" will always be pre-loaded.
26 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
27 # &mov("edx",&DWP($bi*4,$b,"",0));
28
29 &mul("edx");
30 &add($c0,"eax");
31 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
32 &mov("eax",&wparam(0)) if $pos > 0; # load r[]
33 ###
34 &adc($c1,"edx");
35 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # load next b
36 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # load next b
37 ###
38 &adc($c2,0);
39 # is pos > 1, it means it is the last loop
40 &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
41 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next a
42 }
43
44sub sqr_add_c
45 {
46 local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
47
48 # pos == -1 if eax and edx are pre-loaded, 0 to load from next
49 # words, and 1 if load return value
50
51 &comment("sqr a[$ai]*a[$bi]");
52
53 # "eax" and "edx" will always be pre-loaded.
54 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
55 # &mov("edx",&DWP($bi*4,$b,"",0));
56
57 if ($ai == $bi)
58 { &mul("eax");}
59 else
60 { &mul("edx");}
61 &add($c0,"eax");
62 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
63 ###
64 &adc($c1,"edx");
65 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
66 ###
67 &adc($c2,0);
68 # is pos > 1, it means it is the last loop
69 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
70 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
71 }
72
73sub sqr_add_c2
74 {
75 local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
76
77 # pos == -1 if eax and edx are pre-loaded, 0 to load from next
78 # words, and 1 if load return value
79
80 &comment("sqr a[$ai]*a[$bi]");
81
82 # "eax" and "edx" will always be pre-loaded.
83 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
84 # &mov("edx",&DWP($bi*4,$a,"",0));
85
86 if ($ai == $bi)
87 { &mul("eax");}
88 else
89 { &mul("edx");}
90 &add("eax","eax");
91 ###
92 &adc("edx","edx");
93 ###
94 &adc($c2,0);
95 &add($c0,"eax");
96 &adc($c1,"edx");
97 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
98 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
99 &adc($c2,0);
100 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
101 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
102 ###
103 }
104
105sub bn_mul_comba
106 {
107 local($name,$num)=@_;
108 local($a,$b,$c0,$c1,$c2);
109 local($i,$as,$ae,$bs,$be,$ai,$bi);
110 local($tot,$end);
111
112 &function_begin_B($name,"");
113
114 $c0="ebx";
115 $c1="ecx";
116 $c2="ebp";
117 $a="esi";
118 $b="edi";
119
120 $as=0;
121 $ae=0;
122 $bs=0;
123 $be=0;
124 $tot=$num+$num-1;
125
126 &push("esi");
127 &mov($a,&wparam(1));
128 &push("edi");
129 &mov($b,&wparam(2));
130 &push("ebp");
131 &push("ebx");
132
133 &xor($c0,$c0);
134 &mov("eax",&DWP(0,$a,"",0)); # load the first word
135 &xor($c1,$c1);
136 &mov("edx",&DWP(0,$b,"",0)); # load the first second
137
138 for ($i=0; $i<$tot; $i++)
139 {
140 $ai=$as;
141 $bi=$bs;
142 $end=$be+1;
143
144 &comment("################## Calculate word $i");
145
146 for ($j=$bs; $j<$end; $j++)
147 {
148 &xor($c2,$c2) if ($j == $bs);
149 if (($j+1) == $end)
150 {
151 $v=1;
152 $v=2 if (($i+1) == $tot);
153 }
154 else
155 { $v=0; }
156 if (($j+1) != $end)
157 {
158 $na=($ai-1);
159 $nb=($bi+1);
160 }
161 else
162 {
163 $na=$as+($i < ($num-1));
164 $nb=$bs+($i >= ($num-1));
165 }
166#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
167 &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
168 if ($v)
169 {
170 &comment("saved r[$i]");
171 # &mov("eax",&wparam(0));
172 # &mov(&DWP($i*4,"eax","",0),$c0);
173 ($c0,$c1,$c2)=($c1,$c2,$c0);
174 }
175 $ai--;
176 $bi++;
177 }
178 $as++ if ($i < ($num-1));
179 $ae++ if ($i >= ($num-1));
180
181 $bs++ if ($i >= ($num-1));
182 $be++ if ($i < ($num-1));
183 }
184 &comment("save r[$i]");
185 # &mov("eax",&wparam(0));
186 &mov(&DWP($i*4,"eax","",0),$c0);
187
188 &pop("ebx");
189 &pop("ebp");
190 &pop("edi");
191 &pop("esi");
192 &ret();
193 &function_end_B($name);
194 }
195
196sub bn_sqr_comba
197 {
198 local($name,$num)=@_;
199 local($r,$a,$c0,$c1,$c2)=@_;
200 local($i,$as,$ae,$bs,$be,$ai,$bi);
201 local($b,$tot,$end,$half);
202
203 &function_begin_B($name,"");
204
205 $c0="ebx";
206 $c1="ecx";
207 $c2="ebp";
208 $a="esi";
209 $r="edi";
210
211 &push("esi");
212 &push("edi");
213 &push("ebp");
214 &push("ebx");
215 &mov($r,&wparam(0));
216 &mov($a,&wparam(1));
217 &xor($c0,$c0);
218 &xor($c1,$c1);
219 &mov("eax",&DWP(0,$a,"",0)); # load the first word
220
221 $as=0;
222 $ae=0;
223 $bs=0;
224 $be=0;
225 $tot=$num+$num-1;
226
227 for ($i=0; $i<$tot; $i++)
228 {
229 $ai=$as;
230 $bi=$bs;
231 $end=$be+1;
232
233 &comment("############### Calculate word $i");
234 for ($j=$bs; $j<$end; $j++)
235 {
236 &xor($c2,$c2) if ($j == $bs);
237 if (($ai-1) < ($bi+1))
238 {
239 $v=1;
240 $v=2 if ($i+1) == $tot;
241 }
242 else
243 { $v=0; }
244 if (!$v)
245 {
246 $na=$ai-1;
247 $nb=$bi+1;
248 }
249 else
250 {
251 $na=$as+($i < ($num-1));
252 $nb=$bs+($i >= ($num-1));
253 }
254 if ($ai == $bi)
255 {
256 &sqr_add_c($r,$a,$ai,$bi,
257 $c0,$c1,$c2,$v,$i,$na,$nb);
258 }
259 else
260 {
261 &sqr_add_c2($r,$a,$ai,$bi,
262 $c0,$c1,$c2,$v,$i,$na,$nb);
263 }
264 if ($v)
265 {
266 &comment("saved r[$i]");
267 #&mov(&DWP($i*4,$r,"",0),$c0);
268 ($c0,$c1,$c2)=($c1,$c2,$c0);
269 last;
270 }
271 $ai--;
272 $bi++;
273 }
274 $as++ if ($i < ($num-1));
275 $ae++ if ($i >= ($num-1));
276
277 $bs++ if ($i >= ($num-1));
278 $be++ if ($i < ($num-1));
279 }
280 &mov(&DWP($i*4,$r,"",0),$c0);
281 &pop("ebx");
282 &pop("ebp");
283 &pop("edi");
284 &pop("esi");
285 &ret();
286 &function_end_B($name);
287 }
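bn_mul_comba above computes the product column by column: each result word r[k] gathers every partial product a[i]*b[j] with i+j == k into a three-word accumulator (c0,c1,c2) before being stored. A compact Perl sketch of that Comba scheme follows; it is illustrative only, with a hypothetical name and 16-bit words.

# Column-wise (Comba) multiplication: r[k] collects every a[i]*b[j] with
# i+j == k before moving on. Illustrative model; not part of the deleted module.
use strict;
use warnings;

my $BITS = 16;
my $MASK = (1 << $BITS) - 1;

sub mul_comba {
    my ($a, $b, $num) = @_;           # array refs of $num words each
    my @r = (0) x (2 * $num);
    my ($c0, $c1, $c2) = (0, 0, 0);   # three-word column accumulator
    for my $k (0 .. 2 * $num - 2) {
        for my $i (0 .. $num - 1) {
            my $j = $k - $i;
            next if $j < 0 || $j >= $num;
            my $t = $c0 + $a->[$i] * $b->[$j];
            $c0 = $t & $MASK;
            $t  = $c1 + ($t >> $BITS);
            $c1 = $t & $MASK;
            $c2 += $t >> $BITS;
        }
        $r[$k] = $c0;                 # column complete: emit and move up one word
        ($c0, $c1, $c2) = ($c1, $c2, 0);
    }
    $r[2 * $num - 1] = $c0;
    return \@r;
}

The generator keeps the three column words in ebx/ecx/ebp and rotates them with ($c0,$c1,$c2)=($c1,$c2,$c0) instead of shifting, but the accumulation is the same.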
diff --git a/src/lib/libcrypto/bn/asm/mips-mont.pl b/src/lib/libcrypto/bn/asm/mips-mont.pl
deleted file mode 100644
index caae04ed3a..0000000000
--- a/src/lib/libcrypto/bn/asm/mips-mont.pl
+++ /dev/null
@@ -1,426 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# This module doesn't present direct interest for OpenSSL, because it
11# doesn't provide better performance for longer keys, at least not on
12# in-order-execution cores. While 512-bit RSA sign operations can be
13# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
14# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
15# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
16# verify:-( All comparisons are against bn_mul_mont-free assembler.
17# The module might be of interest to embedded system developers, as
18# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
19# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
20# code.
21
22######################################################################
23# There is a number of MIPS ABI in use, O32 and N32/64 are most
24# widely used. Then there is a new contender: NUBI. It appears that if
25# one picks the latter, it's possible to arrange code in ABI neutral
26# manner. Therefore let's stick to NUBI register layout:
27#
28($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
29($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
30($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
31($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
32#
33# The return value is placed in $a0. Following coding rules facilitate
34# interoperability:
35#
36# - never ever touch $tp, "thread pointer", former $gp;
37# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
38# old code];
39# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
40#
41# For reference here is register layout for N32/64 MIPS ABIs:
42#
43# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
44# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
45# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
46# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
47# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
48#
49$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
50
51if ($flavour =~ /64|n32/i) {
52 $PTR_ADD="dadd"; # incidentally works even on n32
53 $PTR_SUB="dsub"; # incidentally works even on n32
54 $REG_S="sd";
55 $REG_L="ld";
56 $SZREG=8;
57} else {
58 $PTR_ADD="add";
59 $PTR_SUB="sub";
60 $REG_S="sw";
61 $REG_L="lw";
62 $SZREG=4;
63}
64$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
65#
66# <appro@openssl.org>
67#
68######################################################################
69
70while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
71open STDOUT,">$output";
72
73if ($flavour =~ /64|n32/i) {
74 $LD="ld";
75 $ST="sd";
76 $MULTU="dmultu";
77 $ADDU="daddu";
78 $SUBU="dsubu";
79 $BNSZ=8;
80} else {
81 $LD="lw";
82 $ST="sw";
83 $MULTU="multu";
84 $ADDU="addu";
85 $SUBU="subu";
86 $BNSZ=4;
87}
88
89# int bn_mul_mont(
90$rp=$a0; # BN_ULONG *rp,
91$ap=$a1; # const BN_ULONG *ap,
92$bp=$a2; # const BN_ULONG *bp,
93$np=$a3; # const BN_ULONG *np,
94$n0=$a4; # const BN_ULONG *n0,
95$num=$a5; # int num);
96
97$lo0=$a6;
98$hi0=$a7;
99$lo1=$t1;
100$hi1=$t2;
101$aj=$s0;
102$bi=$s1;
103$nj=$s2;
104$tp=$s3;
105$alo=$s4;
106$ahi=$s5;
107$nlo=$s6;
108$nhi=$s7;
109$tj=$s8;
110$i=$s9;
111$j=$s10;
112$m1=$s11;
113
114$FRAMESIZE=14;
115
116$code=<<___;
117.text
118
119.set noat
120.set noreorder
121
122.align 5
123.globl bn_mul_mont
124.ent bn_mul_mont
125bn_mul_mont:
126___
127$code.=<<___ if ($flavour =~ /o32/i);
128 lw $n0,16($sp)
129 lw $num,20($sp)
130___
131$code.=<<___;
132 slt $at,$num,4
133 bnez $at,1f
134 li $t0,0
135 slt $at,$num,17 # on in-order CPU
136 bnez $at,bn_mul_mont_internal
137 nop
1381: jr $ra
139 li $a0,0
140.end bn_mul_mont
141
142.align 5
143.ent bn_mul_mont_internal
144bn_mul_mont_internal:
145 .frame $fp,$FRAMESIZE*$SZREG,$ra
146 .mask 0x40000000|$SAVED_REGS_MASK,-$SZREG
147 $PTR_SUB $sp,$FRAMESIZE*$SZREG
148 $REG_S $fp,($FRAMESIZE-1)*$SZREG($sp)
149 $REG_S $s11,($FRAMESIZE-2)*$SZREG($sp)
150 $REG_S $s10,($FRAMESIZE-3)*$SZREG($sp)
151 $REG_S $s9,($FRAMESIZE-4)*$SZREG($sp)
152 $REG_S $s8,($FRAMESIZE-5)*$SZREG($sp)
153 $REG_S $s7,($FRAMESIZE-6)*$SZREG($sp)
154 $REG_S $s6,($FRAMESIZE-7)*$SZREG($sp)
155 $REG_S $s5,($FRAMESIZE-8)*$SZREG($sp)
156 $REG_S $s4,($FRAMESIZE-9)*$SZREG($sp)
157___
158$code.=<<___ if ($flavour =~ /nubi/i);
159 $REG_S $s3,($FRAMESIZE-10)*$SZREG($sp)
160 $REG_S $s2,($FRAMESIZE-11)*$SZREG($sp)
161 $REG_S $s1,($FRAMESIZE-12)*$SZREG($sp)
162 $REG_S $s0,($FRAMESIZE-13)*$SZREG($sp)
163___
164$code.=<<___;
165 move $fp,$sp
166
167 .set reorder
168 $LD $n0,0($n0)
169 $LD $bi,0($bp) # bp[0]
170 $LD $aj,0($ap) # ap[0]
171 $LD $nj,0($np) # np[0]
172
173 $PTR_SUB $sp,2*$BNSZ # place for two extra words
174 sll $num,`log($BNSZ)/log(2)`
175 li $at,-4096
176 $PTR_SUB $sp,$num
177 and $sp,$at
178
179 $MULTU $aj,$bi
180 $LD $alo,$BNSZ($ap)
181 $LD $nlo,$BNSZ($np)
182 mflo $lo0
183 mfhi $hi0
184 $MULTU $lo0,$n0
185 mflo $m1
186
187 $MULTU $alo,$bi
188 mflo $alo
189 mfhi $ahi
190
191 $MULTU $nj,$m1
192 mflo $lo1
193 mfhi $hi1
194 $MULTU $nlo,$m1
195 $ADDU $lo1,$lo0
196 sltu $at,$lo1,$lo0
197 $ADDU $hi1,$at
198 mflo $nlo
199 mfhi $nhi
200
201 move $tp,$sp
202 li $j,2*$BNSZ
203.align 4
204.L1st:
205 .set noreorder
206 $PTR_ADD $aj,$ap,$j
207 $PTR_ADD $nj,$np,$j
208 $LD $aj,($aj)
209 $LD $nj,($nj)
210
211 $MULTU $aj,$bi
212 $ADDU $lo0,$alo,$hi0
213 $ADDU $lo1,$nlo,$hi1
214 sltu $at,$lo0,$hi0
215 sltu $t0,$lo1,$hi1
216 $ADDU $hi0,$ahi,$at
217 $ADDU $hi1,$nhi,$t0
218 mflo $alo
219 mfhi $ahi
220
221 $ADDU $lo1,$lo0
222 sltu $at,$lo1,$lo0
223 $MULTU $nj,$m1
224 $ADDU $hi1,$at
225 addu $j,$BNSZ
226 $ST $lo1,($tp)
227 sltu $t0,$j,$num
228 mflo $nlo
229 mfhi $nhi
230
231 bnez $t0,.L1st
232 $PTR_ADD $tp,$BNSZ
233 .set reorder
234
235 $ADDU $lo0,$alo,$hi0
236 sltu $at,$lo0,$hi0
237 $ADDU $hi0,$ahi,$at
238
239 $ADDU $lo1,$nlo,$hi1
240 sltu $t0,$lo1,$hi1
241 $ADDU $hi1,$nhi,$t0
242 $ADDU $lo1,$lo0
243 sltu $at,$lo1,$lo0
244 $ADDU $hi1,$at
245
246 $ST $lo1,($tp)
247
248 $ADDU $hi1,$hi0
249 sltu $at,$hi1,$hi0
250 $ST $hi1,$BNSZ($tp)
251 $ST $at,2*$BNSZ($tp)
252
253 li $i,$BNSZ
254.align 4
255.Louter:
256 $PTR_ADD $bi,$bp,$i
257 $LD $bi,($bi)
258 $LD $aj,($ap)
259 $LD $alo,$BNSZ($ap)
260 $LD $tj,($sp)
261
262 $MULTU $aj,$bi
263 $LD $nj,($np)
264 $LD $nlo,$BNSZ($np)
265 mflo $lo0
266 mfhi $hi0
267 $ADDU $lo0,$tj
268 $MULTU $lo0,$n0
269 sltu $at,$lo0,$tj
270 $ADDU $hi0,$at
271 mflo $m1
272
273 $MULTU $alo,$bi
274 mflo $alo
275 mfhi $ahi
276
277 $MULTU $nj,$m1
278 mflo $lo1
279 mfhi $hi1
280
281 $MULTU $nlo,$m1
282 $ADDU $lo1,$lo0
283 sltu $at,$lo1,$lo0
284 $ADDU $hi1,$at
285 mflo $nlo
286 mfhi $nhi
287
288 move $tp,$sp
289 li $j,2*$BNSZ
290 $LD $tj,$BNSZ($tp)
291.align 4
292.Linner:
293 .set noreorder
294 $PTR_ADD $aj,$ap,$j
295 $PTR_ADD $nj,$np,$j
296 $LD $aj,($aj)
297 $LD $nj,($nj)
298
299 $MULTU $aj,$bi
300 $ADDU $lo0,$alo,$hi0
301 $ADDU $lo1,$nlo,$hi1
302 sltu $at,$lo0,$hi0
303 sltu $t0,$lo1,$hi1
304 $ADDU $hi0,$ahi,$at
305 $ADDU $hi1,$nhi,$t0
306 mflo $alo
307 mfhi $ahi
308
309 $ADDU $lo0,$tj
310 addu $j,$BNSZ
311 $MULTU $nj,$m1
312 sltu $at,$lo0,$tj
313 $ADDU $lo1,$lo0
314 $ADDU $hi0,$at
315 sltu $t0,$lo1,$lo0
316 $LD $tj,2*$BNSZ($tp)
317 $ADDU $hi1,$t0
318 sltu $at,$j,$num
319 mflo $nlo
320 mfhi $nhi
321 $ST $lo1,($tp)
322 bnez $at,.Linner
323 $PTR_ADD $tp,$BNSZ
324 .set reorder
325
326 $ADDU $lo0,$alo,$hi0
327 sltu $at,$lo0,$hi0
328 $ADDU $hi0,$ahi,$at
329 $ADDU $lo0,$tj
330 sltu $t0,$lo0,$tj
331 $ADDU $hi0,$t0
332
333 $LD $tj,2*$BNSZ($tp)
334 $ADDU $lo1,$nlo,$hi1
335 sltu $at,$lo1,$hi1
336 $ADDU $hi1,$nhi,$at
337 $ADDU $lo1,$lo0
338 sltu $t0,$lo1,$lo0
339 $ADDU $hi1,$t0
340 $ST $lo1,($tp)
341
342 $ADDU $lo1,$hi1,$hi0
343 sltu $hi1,$lo1,$hi0
344 $ADDU $lo1,$tj
345 sltu $at,$lo1,$tj
346 $ADDU $hi1,$at
347 $ST $lo1,$BNSZ($tp)
348 $ST $hi1,2*$BNSZ($tp)
349
350 addu $i,$BNSZ
351 sltu $t0,$i,$num
352 bnez $t0,.Louter
353
354 .set noreorder
355 $PTR_ADD $tj,$sp,$num # &tp[num]
356 move $tp,$sp
357 move $ap,$sp
358 li $hi0,0 # clear borrow bit
359
360.align 4
361.Lsub: $LD $lo0,($tp)
362 $LD $lo1,($np)
363 $PTR_ADD $tp,$BNSZ
364 $PTR_ADD $np,$BNSZ
365 $SUBU $lo1,$lo0,$lo1 # tp[i]-np[i]
366 sgtu $at,$lo1,$lo0
367 $SUBU $lo0,$lo1,$hi0
368 sgtu $hi0,$lo0,$lo1
369 $ST $lo0,($rp)
370 or $hi0,$at
371 sltu $at,$tp,$tj
372 bnez $at,.Lsub
373 $PTR_ADD $rp,$BNSZ
374
375 $SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit
376 move $tp,$sp
377 $PTR_SUB $rp,$num # restore rp
378 not $hi1,$hi0
379
380 and $ap,$hi0,$sp
381 and $bp,$hi1,$rp
382 or $ap,$ap,$bp # ap=borrow?tp:rp
383
384.align 4
385.Lcopy: $LD $aj,($ap)
386 $PTR_ADD $ap,$BNSZ
387 $ST $zero,($tp)
388 $PTR_ADD $tp,$BNSZ
389 sltu $at,$tp,$tj
390 $ST $aj,($rp)
391 bnez $at,.Lcopy
392 $PTR_ADD $rp,$BNSZ
393
394 li $a0,1
395 li $t0,1
396
397 .set noreorder
398 move $sp,$fp
399 $REG_L $fp,($FRAMESIZE-1)*$SZREG($sp)
400 $REG_L $s11,($FRAMESIZE-2)*$SZREG($sp)
401 $REG_L $s10,($FRAMESIZE-3)*$SZREG($sp)
402 $REG_L $s9,($FRAMESIZE-4)*$SZREG($sp)
403 $REG_L $s8,($FRAMESIZE-5)*$SZREG($sp)
404 $REG_L $s7,($FRAMESIZE-6)*$SZREG($sp)
405 $REG_L $s6,($FRAMESIZE-7)*$SZREG($sp)
406 $REG_L $s5,($FRAMESIZE-8)*$SZREG($sp)
407 $REG_L $s4,($FRAMESIZE-9)*$SZREG($sp)
408___
409$code.=<<___ if ($flavour =~ /nubi/i);
410 $REG_L $s3,($FRAMESIZE-10)*$SZREG($sp)
411 $REG_L $s2,($FRAMESIZE-11)*$SZREG($sp)
412 $REG_L $s1,($FRAMESIZE-12)*$SZREG($sp)
413 $REG_L $s0,($FRAMESIZE-13)*$SZREG($sp)
414___
415$code.=<<___;
416 jr $ra
417 $PTR_ADD $sp,$FRAMESIZE*$SZREG
418.end bn_mul_mont_internal
419.rdata
420.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
421___
422
423$code =~ s/\`([^\`]*)\`/eval $1/gem;
424
425print $code;
426close STDOUT;
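Every bn_mul_mont flavour in this directory finishes the same way: .Lsub subtracts np from tp while tracking the borrow, and .Lcopy then writes back either the reduced or the original value, selecting between the two with masks (and/not/or here, and/bic/bis on Alpha) rather than a branch. In scalar form that tail looks roughly like the following illustrative Perl, with a hypothetical name and 16-bit words.

# The .Lsub/.Lcopy tail in scalar form: subtract np once while tracking the
# borrow, then copy back whichever value is correct. Illustrative only.
use strict;
use warnings;

my $BITS = 16;

sub mont_final_sub {
    my ($tp, $np, $rp, $num) = @_;        # $tp holds $num words plus a top carry word
    my $borrow = 0;
    for my $i (0 .. $num - 1) {           # rp = tp - np
        my $t = $tp->[$i] - $np->[$i] - $borrow;
        if ($t < 0) { $t += 1 << $BITS; $borrow = 1 } else { $borrow = 0 }
        $rp->[$i] = $t;
    }
    $borrow -= $tp->[$num];               # fold in the upmost carry word
    my $src = $borrow > 0 ? $tp : $rp;    # the asm selects this pointer with masks
    for my $i (0 .. $num - 1) {
        $rp->[$i] = $src->[$i];           # copy or in-place refresh
        $tp->[$i] = 0;                    # zap tp, as .Lcopy does
    }
}

Selecting with a mask keeps the copy loop's memory access pattern independent of whether the subtraction was taken.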
diff --git a/src/lib/libcrypto/bn/asm/mips.pl b/src/lib/libcrypto/bn/asm/mips.pl
deleted file mode 100644
index 02d43e15b0..0000000000
--- a/src/lib/libcrypto/bn/asm/mips.pl
+++ /dev/null
@@ -1,2234 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project.
6#
7# Rights for redistribution and usage in source and binary forms are
8# granted according to the OpenSSL license. Warranty of any kind is
9# disclaimed.
10# ====================================================================
11
12
13# July 1999
14#
15# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
16#
17# The module is designed to work with either of the "new" MIPS ABI(5),
18# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
19# IRIX 5.x not only because it doesn't support new ABIs but also
20# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
21# 64-bit instructions (daddu, dmultu, etc.) found below gonna only
22# cause illegal instruction exception:-(
23#
24# In addition the code depends on preprocessor flags set up by MIPSpro
25# compiler driver (either as or cc) and therefore (probably?) can't be
26# compiled by the GNU assembler. GNU C driver manages fine though...
27# I mean as long as -mmips-as is specified or is the default option,
28# because then it simply invokes /usr/bin/as which in turn takes
29# perfect care of the preprocessor definitions. Another neat feature
30# offered by the MIPSpro assembler is an optimization pass. This gave
31# me the opportunity to have the code looking more regular as all those
32# architecture dependent instruction rescheduling details were left to
33# the assembler. Cool, huh?
34#
35# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
36# goes way over 3 times faster!
37#
38# <appro@fy.chalmers.se>
39
40# October 2010
41#
42# Adapt the module even for 32-bit ABIs and other OSes. The former was
43# achieved by mechanical replacement of 64-bit arithmetic instructions
44# such as dmultu, daddu, etc. with their 32-bit counterparts and
45# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
46# >3x performance improvement naturally does not apply to 32-bit code
47# [because there is no instruction 32-bit compiler can't use], one
48# has to content with 40-85% improvement depending on benchmark and
49# key length, more for longer keys.
50
51$flavour = shift;
52while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
53open STDOUT,">$output";
54
55if ($flavour =~ /64|n32/i) {
56 $LD="ld";
57 $ST="sd";
58 $MULTU="dmultu";
59 $DIVU="ddivu";
60 $ADDU="daddu";
61 $SUBU="dsubu";
62 $SRL="dsrl";
63 $SLL="dsll";
64 $BNSZ=8;
65 $PTR_ADD="daddu";
66 $PTR_SUB="dsubu";
67 $SZREG=8;
68 $REG_S="sd";
69 $REG_L="ld";
70} else {
71 $LD="lw";
72 $ST="sw";
73 $MULTU="multu";
74 $DIVU="divu";
75 $ADDU="addu";
76 $SUBU="subu";
77 $SRL="srl";
78 $SLL="sll";
79 $BNSZ=4;
80 $PTR_ADD="addu";
81 $PTR_SUB="subu";
82 $SZREG=4;
83 $REG_S="sw";
84 $REG_L="lw";
85 $code=".set mips2\n";
86}
87
88# Below is N32/64 register layout used in the original module.
89#
90($zero,$at,$v0,$v1)=map("\$$_",(0..3));
91($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
92($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
93($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
94($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
95($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
96#
97# No special adaptation is required for O32. NUBI on the other hand
98# is treated by saving/restoring ($v1,$t0..$t3).
99
100$gp=$v1 if ($flavour =~ /nubi/i);
101
102$minus4=$v1;
103
104$code.=<<___;
105.rdata
106.asciiz "mips3.s, Version 1.2"
107.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
108
109.text
110.set noat
111
112.align 5
113.globl bn_mul_add_words
114.ent bn_mul_add_words
115bn_mul_add_words:
116 .set noreorder
117 bgtz $a2,bn_mul_add_words_internal
118 move $v0,$zero
119 jr $ra
120 move $a0,$v0
121.end bn_mul_add_words
122
123.align 5
124.ent bn_mul_add_words_internal
125bn_mul_add_words_internal:
126___
127$code.=<<___ if ($flavour =~ /nubi/i);
128 .frame $sp,6*$SZREG,$ra
129 .mask 0x8000f008,-$SZREG
130 .set noreorder
131 $PTR_SUB $sp,6*$SZREG
132 $REG_S $ra,5*$SZREG($sp)
133 $REG_S $t3,4*$SZREG($sp)
134 $REG_S $t2,3*$SZREG($sp)
135 $REG_S $t1,2*$SZREG($sp)
136 $REG_S $t0,1*$SZREG($sp)
137 $REG_S $gp,0*$SZREG($sp)
138___
139$code.=<<___;
140 .set reorder
141 li $minus4,-4
142 and $ta0,$a2,$minus4
143 beqz $ta0,.L_bn_mul_add_words_tail
144
145.L_bn_mul_add_words_loop:
146 $LD $t0,0($a1)
147 $MULTU $t0,$a3
148 $LD $t1,0($a0)
149 $LD $t2,$BNSZ($a1)
150 $LD $t3,$BNSZ($a0)
151 $LD $ta0,2*$BNSZ($a1)
152 $LD $ta1,2*$BNSZ($a0)
153 $ADDU $t1,$v0
154 sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit
155 # values", but it seems to work fine
156 # even on 64-bit registers.
157 mflo $at
158 mfhi $t0
159 $ADDU $t1,$at
160 $ADDU $v0,$t0
161 $MULTU $t2,$a3
162 sltu $at,$t1,$at
163 $ST $t1,0($a0)
164 $ADDU $v0,$at
165
166 $LD $ta2,3*$BNSZ($a1)
167 $LD $ta3,3*$BNSZ($a0)
168 $ADDU $t3,$v0
169 sltu $v0,$t3,$v0
170 mflo $at
171 mfhi $t2
172 $ADDU $t3,$at
173 $ADDU $v0,$t2
174 $MULTU $ta0,$a3
175 sltu $at,$t3,$at
176 $ST $t3,$BNSZ($a0)
177 $ADDU $v0,$at
178
179 subu $a2,4
180 $PTR_ADD $a0,4*$BNSZ
181 $PTR_ADD $a1,4*$BNSZ
182 $ADDU $ta1,$v0
183 sltu $v0,$ta1,$v0
184 mflo $at
185 mfhi $ta0
186 $ADDU $ta1,$at
187 $ADDU $v0,$ta0
188 $MULTU $ta2,$a3
189 sltu $at,$ta1,$at
190 $ST $ta1,-2*$BNSZ($a0)
191 $ADDU $v0,$at
192
193
194 and $ta0,$a2,$minus4
195 $ADDU $ta3,$v0
196 sltu $v0,$ta3,$v0
197 mflo $at
198 mfhi $ta2
199 $ADDU $ta3,$at
200 $ADDU $v0,$ta2
201 sltu $at,$ta3,$at
202 $ST $ta3,-$BNSZ($a0)
203 .set noreorder
204 bgtz $ta0,.L_bn_mul_add_words_loop
205 $ADDU $v0,$at
206
207 beqz $a2,.L_bn_mul_add_words_return
208 nop
209
210.L_bn_mul_add_words_tail:
211 .set reorder
212 $LD $t0,0($a1)
213 $MULTU $t0,$a3
214 $LD $t1,0($a0)
215 subu $a2,1
216 $ADDU $t1,$v0
217 sltu $v0,$t1,$v0
218 mflo $at
219 mfhi $t0
220 $ADDU $t1,$at
221 $ADDU $v0,$t0
222 sltu $at,$t1,$at
223 $ST $t1,0($a0)
224 $ADDU $v0,$at
225 beqz $a2,.L_bn_mul_add_words_return
226
227 $LD $t0,$BNSZ($a1)
228 $MULTU $t0,$a3
229 $LD $t1,$BNSZ($a0)
230 subu $a2,1
231 $ADDU $t1,$v0
232 sltu $v0,$t1,$v0
233 mflo $at
234 mfhi $t0
235 $ADDU $t1,$at
236 $ADDU $v0,$t0
237 sltu $at,$t1,$at
238 $ST $t1,$BNSZ($a0)
239 $ADDU $v0,$at
240 beqz $a2,.L_bn_mul_add_words_return
241
242 $LD $t0,2*$BNSZ($a1)
243 $MULTU $t0,$a3
244 $LD $t1,2*$BNSZ($a0)
245 $ADDU $t1,$v0
246 sltu $v0,$t1,$v0
247 mflo $at
248 mfhi $t0
249 $ADDU $t1,$at
250 $ADDU $v0,$t0
251 sltu $at,$t1,$at
252 $ST $t1,2*$BNSZ($a0)
253 $ADDU $v0,$at
254
255.L_bn_mul_add_words_return:
256 .set noreorder
257___
258$code.=<<___ if ($flavour =~ /nubi/i);
259 $REG_L $t3,4*$SZREG($sp)
260 $REG_L $t2,3*$SZREG($sp)
261 $REG_L $t1,2*$SZREG($sp)
262 $REG_L $t0,1*$SZREG($sp)
263 $REG_L $gp,0*$SZREG($sp)
264 $PTR_ADD $sp,6*$SZREG
265___
266$code.=<<___;
267 jr $ra
268 move $a0,$v0
269.end bn_mul_add_words_internal
270
271.align 5
272.globl bn_mul_words
273.ent bn_mul_words
274bn_mul_words:
275 .set noreorder
276 bgtz $a2,bn_mul_words_internal
277 move $v0,$zero
278 jr $ra
279 move $a0,$v0
280.end bn_mul_words
281
282.align 5
283.ent bn_mul_words_internal
284bn_mul_words_internal:
285___
286$code.=<<___ if ($flavour =~ /nubi/i);
287 .frame $sp,6*$SZREG,$ra
288 .mask 0x8000f008,-$SZREG
289 .set noreorder
290 $PTR_SUB $sp,6*$SZREG
291 $REG_S $ra,5*$SZREG($sp)
292 $REG_S $t3,4*$SZREG($sp)
293 $REG_S $t2,3*$SZREG($sp)
294 $REG_S $t1,2*$SZREG($sp)
295 $REG_S $t0,1*$SZREG($sp)
296 $REG_S $gp,0*$SZREG($sp)
297___
298$code.=<<___;
299 .set reorder
300 li $minus4,-4
301 and $ta0,$a2,$minus4
302 beqz $ta0,.L_bn_mul_words_tail
303
304.L_bn_mul_words_loop:
305 $LD $t0,0($a1)
306 $MULTU $t0,$a3
307 $LD $t2,$BNSZ($a1)
308 $LD $ta0,2*$BNSZ($a1)
309 $LD $ta2,3*$BNSZ($a1)
310 mflo $at
311 mfhi $t0
312 $ADDU $v0,$at
313 sltu $t1,$v0,$at
314 $MULTU $t2,$a3
315 $ST $v0,0($a0)
316 $ADDU $v0,$t1,$t0
317
318 subu $a2,4
319 $PTR_ADD $a0,4*$BNSZ
320 $PTR_ADD $a1,4*$BNSZ
321 mflo $at
322 mfhi $t2
323 $ADDU $v0,$at
324 sltu $t3,$v0,$at
325 $MULTU $ta0,$a3
326 $ST $v0,-3*$BNSZ($a0)
327 $ADDU $v0,$t3,$t2
328
329 mflo $at
330 mfhi $ta0
331 $ADDU $v0,$at
332 sltu $ta1,$v0,$at
333 $MULTU $ta2,$a3
334 $ST $v0,-2*$BNSZ($a0)
335 $ADDU $v0,$ta1,$ta0
336
337 and $ta0,$a2,$minus4
338 mflo $at
339 mfhi $ta2
340 $ADDU $v0,$at
341 sltu $ta3,$v0,$at
342 $ST $v0,-$BNSZ($a0)
343 .set noreorder
344 bgtz $ta0,.L_bn_mul_words_loop
345 $ADDU $v0,$ta3,$ta2
346
347 beqz $a2,.L_bn_mul_words_return
348 nop
349
350.L_bn_mul_words_tail:
351 .set reorder
352 $LD $t0,0($a1)
353 $MULTU $t0,$a3
354 subu $a2,1
355 mflo $at
356 mfhi $t0
357 $ADDU $v0,$at
358 sltu $t1,$v0,$at
359 $ST $v0,0($a0)
360 $ADDU $v0,$t1,$t0
361 beqz $a2,.L_bn_mul_words_return
362
363 $LD $t0,$BNSZ($a1)
364 $MULTU $t0,$a3
365 subu $a2,1
366 mflo $at
367 mfhi $t0
368 $ADDU $v0,$at
369 sltu $t1,$v0,$at
370 $ST $v0,$BNSZ($a0)
371 $ADDU $v0,$t1,$t0
372 beqz $a2,.L_bn_mul_words_return
373
374 $LD $t0,2*$BNSZ($a1)
375 $MULTU $t0,$a3
376 mflo $at
377 mfhi $t0
378 $ADDU $v0,$at
379 sltu $t1,$v0,$at
380 $ST $v0,2*$BNSZ($a0)
381 $ADDU $v0,$t1,$t0
382
383.L_bn_mul_words_return:
384 .set noreorder
385___
386$code.=<<___ if ($flavour =~ /nubi/i);
387 $REG_L $t3,4*$SZREG($sp)
388 $REG_L $t2,3*$SZREG($sp)
389 $REG_L $t1,2*$SZREG($sp)
390 $REG_L $t0,1*$SZREG($sp)
391 $REG_L $gp,0*$SZREG($sp)
392 $PTR_ADD $sp,6*$SZREG
393___
394$code.=<<___;
395 jr $ra
396 move $a0,$v0
397.end bn_mul_words_internal
398
399.align 5
400.globl bn_sqr_words
401.ent bn_sqr_words
402bn_sqr_words:
403 .set noreorder
404 bgtz $a2,bn_sqr_words_internal
405 move $v0,$zero
406 jr $ra
407 move $a0,$v0
408.end bn_sqr_words
409
410.align 5
411.ent bn_sqr_words_internal
412bn_sqr_words_internal:
413___
414$code.=<<___ if ($flavour =~ /nubi/i);
415 .frame $sp,6*$SZREG,$ra
416 .mask 0x8000f008,-$SZREG
417 .set noreorder
418 $PTR_SUB $sp,6*$SZREG
419 $REG_S $ra,5*$SZREG($sp)
420 $REG_S $t3,4*$SZREG($sp)
421 $REG_S $t2,3*$SZREG($sp)
422 $REG_S $t1,2*$SZREG($sp)
423 $REG_S $t0,1*$SZREG($sp)
424 $REG_S $gp,0*$SZREG($sp)
425___
426$code.=<<___;
427 .set reorder
428 li $minus4,-4
429 and $ta0,$a2,$minus4
430 beqz $ta0,.L_bn_sqr_words_tail
431
432.L_bn_sqr_words_loop:
433 $LD $t0,0($a1)
434 $MULTU $t0,$t0
435 $LD $t2,$BNSZ($a1)
436 $LD $ta0,2*$BNSZ($a1)
437 $LD $ta2,3*$BNSZ($a1)
438 mflo $t1
439 mfhi $t0
440 $ST $t1,0($a0)
441 $ST $t0,$BNSZ($a0)
442
443 $MULTU $t2,$t2
444 subu $a2,4
445 $PTR_ADD $a0,8*$BNSZ
446 $PTR_ADD $a1,4*$BNSZ
447 mflo $t3
448 mfhi $t2
449 $ST $t3,-6*$BNSZ($a0)
450 $ST $t2,-5*$BNSZ($a0)
451
452 $MULTU $ta0,$ta0
453 mflo $ta1
454 mfhi $ta0
455 $ST $ta1,-4*$BNSZ($a0)
456 $ST $ta0,-3*$BNSZ($a0)
457
458
459 $MULTU $ta2,$ta2
460 and $ta0,$a2,$minus4
461 mflo $ta3
462 mfhi $ta2
463 $ST $ta3,-2*$BNSZ($a0)
464
465 .set noreorder
466 bgtz $ta0,.L_bn_sqr_words_loop
467 $ST $ta2,-$BNSZ($a0)
468
469 beqz $a2,.L_bn_sqr_words_return
470 nop
471
472.L_bn_sqr_words_tail:
473 .set reorder
474 $LD $t0,0($a1)
475 $MULTU $t0,$t0
476 subu $a2,1
477 mflo $t1
478 mfhi $t0
479 $ST $t1,0($a0)
480 $ST $t0,$BNSZ($a0)
481 beqz $a2,.L_bn_sqr_words_return
482
483 $LD $t0,$BNSZ($a1)
484 $MULTU $t0,$t0
485 subu $a2,1
486 mflo $t1
487 mfhi $t0
488 $ST $t1,2*$BNSZ($a0)
489 $ST $t0,3*$BNSZ($a0)
490 beqz $a2,.L_bn_sqr_words_return
491
492 $LD $t0,2*$BNSZ($a1)
493 $MULTU $t0,$t0
494 mflo $t1
495 mfhi $t0
496 $ST $t1,4*$BNSZ($a0)
497 $ST $t0,5*$BNSZ($a0)
498
499.L_bn_sqr_words_return:
500 .set noreorder
501___
502$code.=<<___ if ($flavour =~ /nubi/i);
503 $REG_L $t3,4*$SZREG($sp)
504 $REG_L $t2,3*$SZREG($sp)
505 $REG_L $t1,2*$SZREG($sp)
506 $REG_L $t0,1*$SZREG($sp)
507 $REG_L $gp,0*$SZREG($sp)
508 $PTR_ADD $sp,6*$SZREG
509___
510$code.=<<___;
511 jr $ra
512 move $a0,$v0
513
514.end bn_sqr_words_internal
515
516.align 5
517.globl bn_add_words
518.ent bn_add_words
519bn_add_words:
520 .set noreorder
521 bgtz $a3,bn_add_words_internal
522 move $v0,$zero
523 jr $ra
524 move $a0,$v0
525.end bn_add_words
526
527.align 5
528.ent bn_add_words_internal
529bn_add_words_internal:
530___
531$code.=<<___ if ($flavour =~ /nubi/i);
532 .frame $sp,6*$SZREG,$ra
533 .mask 0x8000f008,-$SZREG
534 .set noreorder
535 $PTR_SUB $sp,6*$SZREG
536 $REG_S $ra,5*$SZREG($sp)
537 $REG_S $t3,4*$SZREG($sp)
538 $REG_S $t2,3*$SZREG($sp)
539 $REG_S $t1,2*$SZREG($sp)
540 $REG_S $t0,1*$SZREG($sp)
541 $REG_S $gp,0*$SZREG($sp)
542___
543$code.=<<___;
544 .set reorder
545 li $minus4,-4
546 and $at,$a3,$minus4
547 beqz $at,.L_bn_add_words_tail
548
549.L_bn_add_words_loop:
550 $LD $t0,0($a1)
551 $LD $ta0,0($a2)
552 subu $a3,4
553 $LD $t1,$BNSZ($a1)
554 and $at,$a3,$minus4
555 $LD $t2,2*$BNSZ($a1)
556 $PTR_ADD $a2,4*$BNSZ
557 $LD $t3,3*$BNSZ($a1)
558 $PTR_ADD $a0,4*$BNSZ
559 $LD $ta1,-3*$BNSZ($a2)
560 $PTR_ADD $a1,4*$BNSZ
561 $LD $ta2,-2*$BNSZ($a2)
562 $LD $ta3,-$BNSZ($a2)
563 $ADDU $ta0,$t0
564 sltu $t8,$ta0,$t0
565 $ADDU $t0,$ta0,$v0
566 sltu $v0,$t0,$ta0
567 $ST $t0,-4*$BNSZ($a0)
568 $ADDU $v0,$t8
569
570 $ADDU $ta1,$t1
571 sltu $t9,$ta1,$t1
572 $ADDU $t1,$ta1,$v0
573 sltu $v0,$t1,$ta1
574 $ST $t1,-3*$BNSZ($a0)
575 $ADDU $v0,$t9
576
577 $ADDU $ta2,$t2
578 sltu $t8,$ta2,$t2
579 $ADDU $t2,$ta2,$v0
580 sltu $v0,$t2,$ta2
581 $ST $t2,-2*$BNSZ($a0)
582 $ADDU $v0,$t8
583
584 $ADDU $ta3,$t3
585 sltu $t9,$ta3,$t3
586 $ADDU $t3,$ta3,$v0
587 sltu $v0,$t3,$ta3
588 $ST $t3,-$BNSZ($a0)
589
590 .set noreorder
591 bgtz $at,.L_bn_add_words_loop
592 $ADDU $v0,$t9
593
594 beqz $a3,.L_bn_add_words_return
595 nop
596
597.L_bn_add_words_tail:
598 .set reorder
599 $LD $t0,0($a1)
600 $LD $ta0,0($a2)
601 $ADDU $ta0,$t0
602 subu $a3,1
603 sltu $t8,$ta0,$t0
604 $ADDU $t0,$ta0,$v0
605 sltu $v0,$t0,$ta0
606 $ST $t0,0($a0)
607 $ADDU $v0,$t8
608 beqz $a3,.L_bn_add_words_return
609
610 $LD $t1,$BNSZ($a1)
611 $LD $ta1,$BNSZ($a2)
612 $ADDU $ta1,$t1
613 subu $a3,1
614 sltu $t9,$ta1,$t1
615 $ADDU $t1,$ta1,$v0
616 sltu $v0,$t1,$ta1
617 $ST $t1,$BNSZ($a0)
618 $ADDU $v0,$t9
619 beqz $a3,.L_bn_add_words_return
620
621 $LD $t2,2*$BNSZ($a1)
622 $LD $ta2,2*$BNSZ($a2)
623 $ADDU $ta2,$t2
624 sltu $t8,$ta2,$t2
625 $ADDU $t2,$ta2,$v0
626 sltu $v0,$t2,$ta2
627 $ST $t2,2*$BNSZ($a0)
628 $ADDU $v0,$t8
629
630.L_bn_add_words_return:
631 .set noreorder
632___
633$code.=<<___ if ($flavour =~ /nubi/i);
634 $REG_L $t3,4*$SZREG($sp)
635 $REG_L $t2,3*$SZREG($sp)
636 $REG_L $t1,2*$SZREG($sp)
637 $REG_L $t0,1*$SZREG($sp)
638 $REG_L $gp,0*$SZREG($sp)
639 $PTR_ADD $sp,6*$SZREG
640___
641$code.=<<___;
642 jr $ra
643 move $a0,$v0
644
645.end bn_add_words_internal
646
647.align 5
648.globl bn_sub_words
649.ent bn_sub_words
650bn_sub_words:
651 .set noreorder
652 bgtz $a3,bn_sub_words_internal
653 move $v0,$zero
654 jr $ra
655 move $a0,$zero
656.end bn_sub_words
657
658.align 5
659.ent bn_sub_words_internal
660bn_sub_words_internal:
661___
662$code.=<<___ if ($flavour =~ /nubi/i);
663 .frame $sp,6*$SZREG,$ra
664 .mask 0x8000f008,-$SZREG
665 .set noreorder
666 $PTR_SUB $sp,6*$SZREG
667 $REG_S $ra,5*$SZREG($sp)
668 $REG_S $t3,4*$SZREG($sp)
669 $REG_S $t2,3*$SZREG($sp)
670 $REG_S $t1,2*$SZREG($sp)
671 $REG_S $t0,1*$SZREG($sp)
672 $REG_S $gp,0*$SZREG($sp)
673___
674$code.=<<___;
675 .set reorder
676 li $minus4,-4
677 and $at,$a3,$minus4
678 beqz $at,.L_bn_sub_words_tail
679
680.L_bn_sub_words_loop:
681 $LD $t0,0($a1)
682 $LD $ta0,0($a2)
683 subu $a3,4
684 $LD $t1,$BNSZ($a1)
685 and $at,$a3,$minus4
686 $LD $t2,2*$BNSZ($a1)
687 $PTR_ADD $a2,4*$BNSZ
688 $LD $t3,3*$BNSZ($a1)
689 $PTR_ADD $a0,4*$BNSZ
690 $LD $ta1,-3*$BNSZ($a2)
691 $PTR_ADD $a1,4*$BNSZ
692 $LD $ta2,-2*$BNSZ($a2)
693 $LD $ta3,-$BNSZ($a2)
694 sltu $t8,$t0,$ta0
695 $SUBU $ta0,$t0,$ta0
696 $SUBU $t0,$ta0,$v0
697 sgtu $v0,$t0,$ta0
698 $ST $t0,-4*$BNSZ($a0)
699 $ADDU $v0,$t8
700
701 sltu $t9,$t1,$ta1
702 $SUBU $ta1,$t1,$ta1
703 $SUBU $t1,$ta1,$v0
704 sgtu $v0,$t1,$ta1
705 $ST $t1,-3*$BNSZ($a0)
706 $ADDU $v0,$t9
707
708
709 sltu $t8,$t2,$ta2
710 $SUBU $ta2,$t2,$ta2
711 $SUBU $t2,$ta2,$v0
712 sgtu $v0,$t2,$ta2
713 $ST $t2,-2*$BNSZ($a0)
714 $ADDU $v0,$t8
715
716 sltu $t9,$t3,$ta3
717 $SUBU $ta3,$t3,$ta3
718 $SUBU $t3,$ta3,$v0
719 sgtu $v0,$t3,$ta3
720 $ST $t3,-$BNSZ($a0)
721
722 .set noreorder
723 bgtz $at,.L_bn_sub_words_loop
724 $ADDU $v0,$t9
725
726 beqz $a3,.L_bn_sub_words_return
727 nop
728
729.L_bn_sub_words_tail:
730 .set reorder
731 $LD $t0,0($a1)
732 $LD $ta0,0($a2)
733 subu $a3,1
734 sltu $t8,$t0,$ta0
735 $SUBU $ta0,$t0,$ta0
736 $SUBU $t0,$ta0,$v0
737 sgtu $v0,$t0,$ta0
738 $ST $t0,0($a0)
739 $ADDU $v0,$t8
740 beqz $a3,.L_bn_sub_words_return
741
742 $LD $t1,$BNSZ($a1)
743 subu $a3,1
744 $LD $ta1,$BNSZ($a2)
745 sltu $t9,$t1,$ta1
746 $SUBU $ta1,$t1,$ta1
747 $SUBU $t1,$ta1,$v0
748 sgtu $v0,$t1,$ta1
749 $ST $t1,$BNSZ($a0)
750 $ADDU $v0,$t9
751 beqz $a3,.L_bn_sub_words_return
752
753 $LD $t2,2*$BNSZ($a1)
754 $LD $ta2,2*$BNSZ($a2)
755 sltu $t8,$t2,$ta2
756 $SUBU $ta2,$t2,$ta2
757 $SUBU $t2,$ta2,$v0
758 sgtu $v0,$t2,$ta2
759 $ST $t2,2*$BNSZ($a0)
760 $ADDU $v0,$t8
761
762.L_bn_sub_words_return:
763 .set noreorder
764___
765$code.=<<___ if ($flavour =~ /nubi/i);
766 $REG_L $t3,4*$SZREG($sp)
767 $REG_L $t2,3*$SZREG($sp)
768 $REG_L $t1,2*$SZREG($sp)
769 $REG_L $t0,1*$SZREG($sp)
770 $REG_L $gp,0*$SZREG($sp)
771 $PTR_ADD $sp,6*$SZREG
772___
773$code.=<<___;
774 jr $ra
775 move $a0,$v0
776.end bn_sub_words_internal
777
778.align 5
779.globl bn_div_3_words
780.ent bn_div_3_words
781bn_div_3_words:
782 .set noreorder
783 move $a3,$a0 # we know that bn_div_words does not
784 # touch $a3, $ta2, $ta3 and preserves $a2
785 # so that we can save two arguments
786 # and return address in registers
787 # instead of stack:-)
788
789 $LD $a0,($a3)
790 move $ta2,$a1
791 bne $a0,$a2,bn_div_3_words_internal
792 $LD $a1,-$BNSZ($a3)
793 li $v0,-1
794 jr $ra
795 move $a0,$v0
796.end bn_div_3_words
797
798.align 5
799.ent bn_div_3_words_internal
800bn_div_3_words_internal:
801___
802$code.=<<___ if ($flavour =~ /nubi/i);
803 .frame $sp,6*$SZREG,$ra
804 .mask 0x8000f008,-$SZREG
805 .set noreorder
806 $PTR_SUB $sp,6*$SZREG
807 $REG_S $ra,5*$SZREG($sp)
808 $REG_S $t3,4*$SZREG($sp)
809 $REG_S $t2,3*$SZREG($sp)
810 $REG_S $t1,2*$SZREG($sp)
811 $REG_S $t0,1*$SZREG($sp)
812 $REG_S $gp,0*$SZREG($sp)
813___
814$code.=<<___;
815 .set reorder
816 move $ta3,$ra
817 bal bn_div_words_internal
818 move $ra,$ta3
819 $MULTU $ta2,$v0
820 $LD $t2,-2*$BNSZ($a3)
821 move $ta0,$zero
822 mfhi $t1
823 mflo $t0
824 sltu $t8,$t1,$a1
825.L_bn_div_3_words_inner_loop:
826 bnez $t8,.L_bn_div_3_words_inner_loop_done
827 sgeu $at,$t2,$t0
828 seq $t9,$t1,$a1
829 and $at,$t9
830 sltu $t3,$t0,$ta2
831 $ADDU $a1,$a2
832 $SUBU $t1,$t3
833 $SUBU $t0,$ta2
834 sltu $t8,$t1,$a1
835 sltu $ta0,$a1,$a2
836 or $t8,$ta0
837 .set noreorder
838 beqz $at,.L_bn_div_3_words_inner_loop
839 $SUBU $v0,1
840 $ADDU $v0,1
841 .set reorder
842.L_bn_div_3_words_inner_loop_done:
843 .set noreorder
844___
845$code.=<<___ if ($flavour =~ /nubi/i);
846 $REG_L $t3,4*$SZREG($sp)
847 $REG_L $t2,3*$SZREG($sp)
848 $REG_L $t1,2*$SZREG($sp)
849 $REG_L $t0,1*$SZREG($sp)
850 $REG_L $gp,0*$SZREG($sp)
851 $PTR_ADD $sp,6*$SZREG
852___
853$code.=<<___;
854 jr $ra
855 move $a0,$v0
856.end bn_div_3_words_internal
857
858.align 5
859.globl bn_div_words
860.ent bn_div_words
861bn_div_words:
862 .set noreorder
863 bnez $a2,bn_div_words_internal
864 li $v0,-1 # I would rather signal div-by-zero
865 # which can be done with 'break 7'
866 jr $ra
867 move $a0,$v0
868.end bn_div_words
869
870.align 5
871.ent bn_div_words_internal
872bn_div_words_internal:
873___
874$code.=<<___ if ($flavour =~ /nubi/i);
875 .frame $sp,6*$SZREG,$ra
876 .mask 0x8000f008,-$SZREG
877 .set noreorder
878 $PTR_SUB $sp,6*$SZREG
879 $REG_S $ra,5*$SZREG($sp)
880 $REG_S $t3,4*$SZREG($sp)
881 $REG_S $t2,3*$SZREG($sp)
882 $REG_S $t1,2*$SZREG($sp)
883 $REG_S $t0,1*$SZREG($sp)
884 $REG_S $gp,0*$SZREG($sp)
885___
886$code.=<<___;
887 move $v1,$zero
888 bltz $a2,.L_bn_div_words_body
889 move $t9,$v1
890 $SLL $a2,1
891 bgtz $a2,.-4
892 addu $t9,1
893
894 .set reorder
895 negu $t1,$t9
896 li $t2,-1
897 $SLL $t2,$t1
898 and $t2,$a0
899 $SRL $at,$a1,$t1
900 .set noreorder
901 beqz $t2,.+12
902 nop
903 break 6 # signal overflow
904 .set reorder
905 $SLL $a0,$t9
906 $SLL $a1,$t9
907 or $a0,$at
908___
909$QT=$ta0;
910$HH=$ta1;
911$DH=$v1;
912$code.=<<___;
913.L_bn_div_words_body:
914 $SRL $DH,$a2,4*$BNSZ # bits
915 sgeu $at,$a0,$a2
916 .set noreorder
917 beqz $at,.+12
918 nop
919 $SUBU $a0,$a2
920 .set reorder
921
922 li $QT,-1
923 $SRL $HH,$a0,4*$BNSZ # bits
924 $SRL $QT,4*$BNSZ # q=0xffffffff
925 beq $DH,$HH,.L_bn_div_words_skip_div1
926 $DIVU $zero,$a0,$DH
927 mflo $QT
928.L_bn_div_words_skip_div1:
929 $MULTU $a2,$QT
930 $SLL $t3,$a0,4*$BNSZ # bits
931 $SRL $at,$a1,4*$BNSZ # bits
932 or $t3,$at
933 mflo $t0
934 mfhi $t1
935.L_bn_div_words_inner_loop1:
936 sltu $t2,$t3,$t0
937 seq $t8,$HH,$t1
938 sltu $at,$HH,$t1
939 and $t2,$t8
940 sltu $v0,$t0,$a2
941 or $at,$t2
942 .set noreorder
943 beqz $at,.L_bn_div_words_inner_loop1_done
944 $SUBU $t1,$v0
945 $SUBU $t0,$a2
946 b .L_bn_div_words_inner_loop1
947 $SUBU $QT,1
948 .set reorder
949.L_bn_div_words_inner_loop1_done:
950
951 $SLL $a1,4*$BNSZ # bits
952 $SUBU $a0,$t3,$t0
953 $SLL $v0,$QT,4*$BNSZ # bits
954
955 li $QT,-1
956 $SRL $HH,$a0,4*$BNSZ # bits
957 $SRL $QT,4*$BNSZ # q=0xffffffff
958 beq $DH,$HH,.L_bn_div_words_skip_div2
959 $DIVU $zero,$a0,$DH
960 mflo $QT
961.L_bn_div_words_skip_div2:
962 $MULTU $a2,$QT
963 $SLL $t3,$a0,4*$BNSZ # bits
964 $SRL $at,$a1,4*$BNSZ # bits
965 or $t3,$at
966 mflo $t0
967 mfhi $t1
968.L_bn_div_words_inner_loop2:
969 sltu $t2,$t3,$t0
970 seq $t8,$HH,$t1
971 sltu $at,$HH,$t1
972 and $t2,$t8
973 sltu $v1,$t0,$a2
974 or $at,$t2
975 .set noreorder
976 beqz $at,.L_bn_div_words_inner_loop2_done
977 $SUBU $t1,$v1
978 $SUBU $t0,$a2
979 b .L_bn_div_words_inner_loop2
980 $SUBU $QT,1
981 .set reorder
982.L_bn_div_words_inner_loop2_done:
983
984 $SUBU $a0,$t3,$t0
985 or $v0,$QT
986 $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it
987 $SRL $a2,$t9 # restore $a2
988
989 .set noreorder
990 move $a1,$v1
991___
992$code.=<<___ if ($flavour =~ /nubi/i);
993 $REG_L $t3,4*$SZREG($sp)
994 $REG_L $t2,3*$SZREG($sp)
995 $REG_L $t1,2*$SZREG($sp)
996 $REG_L $t0,1*$SZREG($sp)
997 $REG_L $gp,0*$SZREG($sp)
998 $PTR_ADD $sp,6*$SZREG
999___
1000$code.=<<___;
1001 jr $ra
1002 move $a0,$v0
1003.end bn_div_words_internal
1004___
1005undef $HH; undef $QT; undef $DH;
1006
1007($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
1008($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);
1009
1010($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
1011($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2
1012
1013($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
1014
1015$code.=<<___;
1016
1017.align 5
1018.globl bn_mul_comba8
1019.ent bn_mul_comba8
1020bn_mul_comba8:
1021 .set noreorder
1022___
1023$code.=<<___ if ($flavour =~ /nubi/i);
1024 .frame $sp,12*$SZREG,$ra
1025 .mask 0x803ff008,-$SZREG
1026 $PTR_SUB $sp,12*$SZREG
1027 $REG_S $ra,11*$SZREG($sp)
1028 $REG_S $s5,10*$SZREG($sp)
1029 $REG_S $s4,9*$SZREG($sp)
1030 $REG_S $s3,8*$SZREG($sp)
1031 $REG_S $s2,7*$SZREG($sp)
1032 $REG_S $s1,6*$SZREG($sp)
1033 $REG_S $s0,5*$SZREG($sp)
1034 $REG_S $t3,4*$SZREG($sp)
1035 $REG_S $t2,3*$SZREG($sp)
1036 $REG_S $t1,2*$SZREG($sp)
1037 $REG_S $t0,1*$SZREG($sp)
1038 $REG_S $gp,0*$SZREG($sp)
1039___
1040$code.=<<___ if ($flavour !~ /nubi/i);
1041 .frame $sp,6*$SZREG,$ra
1042 .mask 0x003f0000,-$SZREG
1043 $PTR_SUB $sp,6*$SZREG
1044 $REG_S $s5,5*$SZREG($sp)
1045 $REG_S $s4,4*$SZREG($sp)
1046 $REG_S $s3,3*$SZREG($sp)
1047 $REG_S $s2,2*$SZREG($sp)
1048 $REG_S $s1,1*$SZREG($sp)
1049 $REG_S $s0,0*$SZREG($sp)
1050___
1051$code.=<<___;
1052
1053 .set reorder
1054 $LD $a_0,0($a1) # If compiled with -mips3 option on
1055 # R5000 box assembler barks on this
1056				# line with "should not have mult/div
1057 # as last instruction in bb (R10K
1058 # bug)" warning. If anybody out there
1059 # has a clue about how to circumvent
1060				# this, do send me a note.
1061 # <appro\@fy.chalmers.se>
1062
1063 $LD $b_0,0($a2)
1064 $LD $a_1,$BNSZ($a1)
1065 $LD $a_2,2*$BNSZ($a1)
1066 $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1067 $LD $a_3,3*$BNSZ($a1)
1068 $LD $b_1,$BNSZ($a2)
1069 $LD $b_2,2*$BNSZ($a2)
1070 $LD $b_3,3*$BNSZ($a2)
1071 mflo $c_1
1072 mfhi $c_2
1073
1074 $LD $a_4,4*$BNSZ($a1)
1075 $LD $a_5,5*$BNSZ($a1)
1076 $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
1077 $LD $a_6,6*$BNSZ($a1)
1078 $LD $a_7,7*$BNSZ($a1)
1079 $LD $b_4,4*$BNSZ($a2)
1080 $LD $b_5,5*$BNSZ($a2)
1081 mflo $t_1
1082 mfhi $t_2
1083 $ADDU $c_2,$t_1
1084 sltu $at,$c_2,$t_1
1085 $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
1086 $ADDU $c_3,$t_2,$at
1087 $LD $b_6,6*$BNSZ($a2)
1088 $LD $b_7,7*$BNSZ($a2)
1089 $ST $c_1,0($a0) # r[0]=c1;
1090 mflo $t_1
1091 mfhi $t_2
1092 $ADDU $c_2,$t_1
1093 sltu $at,$c_2,$t_1
1094 $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
1095 $ADDU $t_2,$at
1096 $ADDU $c_3,$t_2
1097 sltu $c_1,$c_3,$t_2
1098 $ST $c_2,$BNSZ($a0) # r[1]=c2;
1099
1100 mflo $t_1
1101 mfhi $t_2
1102 $ADDU $c_3,$t_1
1103 sltu $at,$c_3,$t_1
1104 $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
1105 $ADDU $t_2,$at
1106 $ADDU $c_1,$t_2
1107 mflo $t_1
1108 mfhi $t_2
1109 $ADDU $c_3,$t_1
1110 sltu $at,$c_3,$t_1
1111 $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
1112 $ADDU $t_2,$at
1113 $ADDU $c_1,$t_2
1114 sltu $c_2,$c_1,$t_2
1115 mflo $t_1
1116 mfhi $t_2
1117 $ADDU $c_3,$t_1
1118 sltu $at,$c_3,$t_1
1119 $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
1120 $ADDU $t_2,$at
1121 $ADDU $c_1,$t_2
1122 sltu $at,$c_1,$t_2
1123 $ADDU $c_2,$at
1124 $ST $c_3,2*$BNSZ($a0) # r[2]=c3;
1125
1126 mflo $t_1
1127 mfhi $t_2
1128 $ADDU $c_1,$t_1
1129 sltu $at,$c_1,$t_1
1130 $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
1131 $ADDU $t_2,$at
1132 $ADDU $c_2,$t_2
1133 sltu $c_3,$c_2,$t_2
1134 mflo $t_1
1135 mfhi $t_2
1136 $ADDU $c_1,$t_1
1137 sltu $at,$c_1,$t_1
1138 $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
1139 $ADDU $t_2,$at
1140 $ADDU $c_2,$t_2
1141 sltu $at,$c_2,$t_2
1142 $ADDU $c_3,$at
1143 mflo $t_1
1144 mfhi $t_2
1145 $ADDU $c_1,$t_1
1146 sltu $at,$c_1,$t_1
1147 $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
1148 $ADDU $t_2,$at
1149 $ADDU $c_2,$t_2
1150 sltu $at,$c_2,$t_2
1151 $ADDU $c_3,$at
1152 mflo $t_1
1153 mfhi $t_2
1154 $ADDU $c_1,$t_1
1155 sltu $at,$c_1,$t_1
1156 $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1);
1157 $ADDU $t_2,$at
1158 $ADDU $c_2,$t_2
1159 sltu $at,$c_2,$t_2
1160 $ADDU $c_3,$at
1161 $ST $c_1,3*$BNSZ($a0) # r[3]=c1;
1162
1163 mflo $t_1
1164 mfhi $t_2
1165 $ADDU $c_2,$t_1
1166 sltu $at,$c_2,$t_1
1167 $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
1168 $ADDU $t_2,$at
1169 $ADDU $c_3,$t_2
1170 sltu $c_1,$c_3,$t_2
1171 mflo $t_1
1172 mfhi $t_2
1173 $ADDU $c_2,$t_1
1174 sltu $at,$c_2,$t_1
1175 $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
1176 $ADDU $t_2,$at
1177 $ADDU $c_3,$t_2
1178 sltu $at,$c_3,$t_2
1179 $ADDU $c_1,$at
1180 mflo $t_1
1181 mfhi $t_2
1182 $ADDU $c_2,$t_1
1183 sltu $at,$c_2,$t_1
1184 $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
1185 $ADDU $t_2,$at
1186 $ADDU $c_3,$t_2
1187 sltu $at,$c_3,$t_2
1188 $ADDU $c_1,$at
1189 mflo $t_1
1190 mfhi $t_2
1191 $ADDU $c_2,$t_1
1192 sltu $at,$c_2,$t_1
1193 $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1);
1194 $ADDU $t_2,$at
1195 $ADDU $c_3,$t_2
1196 sltu $at,$c_3,$t_2
1197 $ADDU $c_1,$at
1198 mflo $t_1
1199 mfhi $t_2
1200 $ADDU $c_2,$t_1
1201 sltu $at,$c_2,$t_1
1202 $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2);
1203 $ADDU $t_2,$at
1204 $ADDU $c_3,$t_2
1205 sltu $at,$c_3,$t_2
1206 $ADDU $c_1,$at
1207 $ST $c_2,4*$BNSZ($a0) # r[4]=c2;
1208
1209 mflo $t_1
1210 mfhi $t_2
1211 $ADDU $c_3,$t_1
1212 sltu $at,$c_3,$t_1
1213 $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2);
1214 $ADDU $t_2,$at
1215 $ADDU $c_1,$t_2
1216 sltu $c_2,$c_1,$t_2
1217 mflo $t_1
1218 mfhi $t_2
1219 $ADDU $c_3,$t_1
1220 sltu $at,$c_3,$t_1
1221 $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
1222 $ADDU $t_2,$at
1223 $ADDU $c_1,$t_2
1224 sltu $at,$c_1,$t_2
1225 $ADDU $c_2,$at
1226 mflo $t_1
1227 mfhi $t_2
1228 $ADDU $c_3,$t_1
1229 sltu $at,$c_3,$t_1
1230 $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
1231 $ADDU $t_2,$at
1232 $ADDU $c_1,$t_2
1233 sltu $at,$c_1,$t_2
1234 $ADDU $c_2,$at
1235 mflo $t_1
1236 mfhi $t_2
1237 $ADDU $c_3,$t_1
1238 sltu $at,$c_3,$t_1
1239 $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2);
1240 $ADDU $t_2,$at
1241 $ADDU $c_1,$t_2
1242 sltu $at,$c_1,$t_2
1243 $ADDU $c_2,$at
1244 mflo $t_1
1245 mfhi $t_2
1246 $ADDU $c_3,$t_1
1247 sltu $at,$c_3,$t_1
1248 $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2);
1249 $ADDU $t_2,$at
1250 $ADDU $c_1,$t_2
1251 sltu $at,$c_1,$t_2
1252 $ADDU $c_2,$at
1253 mflo $t_1
1254 mfhi $t_2
1255 $ADDU $c_3,$t_1
1256 sltu $at,$c_3,$t_1
1257 $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3);
1258 $ADDU $t_2,$at
1259 $ADDU $c_1,$t_2
1260 sltu $at,$c_1,$t_2
1261 $ADDU $c_2,$at
1262 $ST $c_3,5*$BNSZ($a0) # r[5]=c3;
1263
1264 mflo $t_1
1265 mfhi $t_2
1266 $ADDU $c_1,$t_1
1267 sltu $at,$c_1,$t_1
1268 $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3);
1269 $ADDU $t_2,$at
1270 $ADDU $c_2,$t_2
1271 sltu $c_3,$c_2,$t_2
1272 mflo $t_1
1273 mfhi $t_2
1274 $ADDU $c_1,$t_1
1275 sltu $at,$c_1,$t_1
1276 $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3);
1277 $ADDU $t_2,$at
1278 $ADDU $c_2,$t_2
1279 sltu $at,$c_2,$t_2
1280 $ADDU $c_3,$at
1281 mflo $t_1
1282 mfhi $t_2
1283 $ADDU $c_1,$t_1
1284 sltu $at,$c_1,$t_1
1285 $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
1286 $ADDU $t_2,$at
1287 $ADDU $c_2,$t_2
1288 sltu $at,$c_2,$t_2
1289 $ADDU $c_3,$at
1290 mflo $t_1
1291 mfhi $t_2
1292 $ADDU $c_1,$t_1
1293 sltu $at,$c_1,$t_1
1294 $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3);
1295 $ADDU $t_2,$at
1296 $ADDU $c_2,$t_2
1297 sltu $at,$c_2,$t_2
1298 $ADDU $c_3,$at
1299 mflo $t_1
1300 mfhi $t_2
1301 $ADDU $c_1,$t_1
1302 sltu $at,$c_1,$t_1
1303 $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3);
1304 $ADDU $t_2,$at
1305 $ADDU $c_2,$t_2
1306 sltu $at,$c_2,$t_2
1307 $ADDU $c_3,$at
1308 mflo $t_1
1309 mfhi $t_2
1310 $ADDU $c_1,$t_1
1311 sltu $at,$c_1,$t_1
1312 $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3);
1313 $ADDU $t_2,$at
1314 $ADDU $c_2,$t_2
1315 sltu $at,$c_2,$t_2
1316 $ADDU $c_3,$at
1317 mflo $t_1
1318 mfhi $t_2
1319 $ADDU $c_1,$t_1
1320 sltu $at,$c_1,$t_1
1321 $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1);
1322 $ADDU $t_2,$at
1323 $ADDU $c_2,$t_2
1324 sltu $at,$c_2,$t_2
1325 $ADDU $c_3,$at
1326 $ST $c_1,6*$BNSZ($a0) # r[6]=c1;
1327
1328 mflo $t_1
1329 mfhi $t_2
1330 $ADDU $c_2,$t_1
1331 sltu $at,$c_2,$t_1
1332 $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1);
1333 $ADDU $t_2,$at
1334 $ADDU $c_3,$t_2
1335 sltu $c_1,$c_3,$t_2
1336 mflo $t_1
1337 mfhi $t_2
1338 $ADDU $c_2,$t_1
1339 sltu $at,$c_2,$t_1
1340 $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1);
1341 $ADDU $t_2,$at
1342 $ADDU $c_3,$t_2
1343 sltu $at,$c_3,$t_2
1344 $ADDU $c_1,$at
1345 mflo $t_1
1346 mfhi $t_2
1347 $ADDU $c_2,$t_1
1348 sltu $at,$c_2,$t_1
1349 $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1);
1350 $ADDU $t_2,$at
1351 $ADDU $c_3,$t_2
1352 sltu $at,$c_3,$t_2
1353 $ADDU $c_1,$at
1354 mflo $t_1
1355 mfhi $t_2
1356 $ADDU $c_2,$t_1
1357 sltu $at,$c_2,$t_1
1358 $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1);
1359 $ADDU $t_2,$at
1360 $ADDU $c_3,$t_2
1361 sltu $at,$c_3,$t_2
1362 $ADDU $c_1,$at
1363 mflo $t_1
1364 mfhi $t_2
1365 $ADDU $c_2,$t_1
1366 sltu $at,$c_2,$t_1
1367 $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1);
1368 $ADDU $t_2,$at
1369 $ADDU $c_3,$t_2
1370 sltu $at,$c_3,$t_2
1371 $ADDU $c_1,$at
1372 mflo $t_1
1373 mfhi $t_2
1374 $ADDU $c_2,$t_1
1375 sltu $at,$c_2,$t_1
1376 $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1);
1377 $ADDU $t_2,$at
1378 $ADDU $c_3,$t_2
1379 sltu $at,$c_3,$t_2
1380 $ADDU $c_1,$at
1381 mflo $t_1
1382 mfhi $t_2
1383 $ADDU $c_2,$t_1
1384 sltu $at,$c_2,$t_1
1385 $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1);
1386 $ADDU $t_2,$at
1387 $ADDU $c_3,$t_2
1388 sltu $at,$c_3,$t_2
1389 $ADDU $c_1,$at
1390 mflo $t_1
1391 mfhi $t_2
1392 $ADDU $c_2,$t_1
1393 sltu $at,$c_2,$t_1
1394 $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2);
1395 $ADDU $t_2,$at
1396 $ADDU $c_3,$t_2
1397 sltu $at,$c_3,$t_2
1398 $ADDU $c_1,$at
1399 $ST $c_2,7*$BNSZ($a0) # r[7]=c2;
1400
1401 mflo $t_1
1402 mfhi $t_2
1403 $ADDU $c_3,$t_1
1404 sltu $at,$c_3,$t_1
1405 $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2);
1406 $ADDU $t_2,$at
1407 $ADDU $c_1,$t_2
1408 sltu $c_2,$c_1,$t_2
1409 mflo $t_1
1410 mfhi $t_2
1411 $ADDU $c_3,$t_1
1412 sltu $at,$c_3,$t_1
1413 $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2);
1414 $ADDU $t_2,$at
1415 $ADDU $c_1,$t_2
1416 sltu $at,$c_1,$t_2
1417 $ADDU $c_2,$at
1418 mflo $t_1
1419 mfhi $t_2
1420 $ADDU $c_3,$t_1
1421 sltu $at,$c_3,$t_1
1422 $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2);
1423 $ADDU $t_2,$at
1424 $ADDU $c_1,$t_2
1425 sltu $at,$c_1,$t_2
1426 $ADDU $c_2,$at
1427 mflo $t_1
1428 mfhi $t_2
1429 $ADDU $c_3,$t_1
1430 sltu $at,$c_3,$t_1
1431 $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2);
1432 $ADDU $t_2,$at
1433 $ADDU $c_1,$t_2
1434 sltu $at,$c_1,$t_2
1435 $ADDU $c_2,$at
1436 mflo $t_1
1437 mfhi $t_2
1438 $ADDU $c_3,$t_1
1439 sltu $at,$c_3,$t_1
1440 $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2);
1441 $ADDU $t_2,$at
1442 $ADDU $c_1,$t_2
1443 sltu $at,$c_1,$t_2
1444 $ADDU $c_2,$at
1445 mflo $t_1
1446 mfhi $t_2
1447 $ADDU $c_3,$t_1
1448 sltu $at,$c_3,$t_1
1449 $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2);
1450 $ADDU $t_2,$at
1451 $ADDU $c_1,$t_2
1452 sltu $at,$c_1,$t_2
1453 $ADDU $c_2,$at
1454 mflo $t_1
1455 mfhi $t_2
1456 $ADDU $c_3,$t_1
1457 sltu $at,$c_3,$t_1
1458 $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3);
1459 $ADDU $t_2,$at
1460 $ADDU $c_1,$t_2
1461 sltu $at,$c_1,$t_2
1462 $ADDU $c_2,$at
1463 $ST $c_3,8*$BNSZ($a0) # r[8]=c3;
1464
1465 mflo $t_1
1466 mfhi $t_2
1467 $ADDU $c_1,$t_1
1468 sltu $at,$c_1,$t_1
1469 $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3);
1470 $ADDU $t_2,$at
1471 $ADDU $c_2,$t_2
1472 sltu $c_3,$c_2,$t_2
1473 mflo $t_1
1474 mfhi $t_2
1475 $ADDU $c_1,$t_1
1476 sltu $at,$c_1,$t_1
1477 $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3);
1478 $ADDU $t_2,$at
1479 $ADDU $c_2,$t_2
1480 sltu $at,$c_2,$t_2
1481 $ADDU $c_3,$at
1482 mflo $t_1
1483 mfhi $t_2
1484 $ADDU $c_1,$t_1
1485 sltu $at,$c_1,$t_1
1486 $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3);
1487 $ADDU $t_2,$at
1488 $ADDU $c_2,$t_2
1489 sltu $at,$c_2,$t_2
1490 $ADDU $c_3,$at
1491 mflo $t_1
1492 mfhi $t_2
1493 $ADDU $c_1,$t_1
1494 sltu $at,$c_1,$t_1
1495 $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3);
1496 $ADDU $t_2,$at
1497 $ADDU $c_2,$t_2
1498 sltu $at,$c_2,$t_2
1499 $ADDU $c_3,$at
1500 mflo $t_1
1501 mfhi $t_2
1502 $ADDU $c_1,$t_1
1503 sltu $at,$c_1,$t_1
1504 $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3);
1505 $ADDU $t_2,$at
1506 $ADDU $c_2,$t_2
1507 sltu $at,$c_2,$t_2
1508 $ADDU $c_3,$at
1509 mflo $t_1
1510 mfhi $t_2
1511 $ADDU $c_1,$t_1
1512 sltu $at,$c_1,$t_1
1513 $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1);
1514 $ADDU $t_2,$at
1515 $ADDU $c_2,$t_2
1516 sltu $at,$c_2,$t_2
1517 $ADDU $c_3,$at
1518 $ST $c_1,9*$BNSZ($a0) # r[9]=c1;
1519
1520 mflo $t_1
1521 mfhi $t_2
1522 $ADDU $c_2,$t_1
1523 sltu $at,$c_2,$t_1
1524 $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1);
1525 $ADDU $t_2,$at
1526 $ADDU $c_3,$t_2
1527 sltu $c_1,$c_3,$t_2
1528 mflo $t_1
1529 mfhi $t_2
1530 $ADDU $c_2,$t_1
1531 sltu $at,$c_2,$t_1
1532 $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1);
1533 $ADDU $t_2,$at
1534 $ADDU $c_3,$t_2
1535 sltu $at,$c_3,$t_2
1536 $ADDU $c_1,$at
1537 mflo $t_1
1538 mfhi $t_2
1539 $ADDU $c_2,$t_1
1540 sltu $at,$c_2,$t_1
1541 $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1);
1542 $ADDU $t_2,$at
1543 $ADDU $c_3,$t_2
1544 sltu $at,$c_3,$t_2
1545 $ADDU $c_1,$at
1546 mflo $t_1
1547 mfhi $t_2
1548 $ADDU $c_2,$t_1
1549 sltu $at,$c_2,$t_1
1550 $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1);
1551 $ADDU $t_2,$at
1552 $ADDU $c_3,$t_2
1553 sltu $at,$c_3,$t_2
1554 $ADDU $c_1,$at
1555 mflo $t_1
1556 mfhi $t_2
1557 $ADDU $c_2,$t_1
1558 sltu $at,$c_2,$t_1
1559 $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2);
1560 $ADDU $t_2,$at
1561 $ADDU $c_3,$t_2
1562 sltu $at,$c_3,$t_2
1563 $ADDU $c_1,$at
1564 $ST $c_2,10*$BNSZ($a0) # r[10]=c2;
1565
1566 mflo $t_1
1567 mfhi $t_2
1568 $ADDU $c_3,$t_1
1569 sltu $at,$c_3,$t_1
1570 $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2);
1571 $ADDU $t_2,$at
1572 $ADDU $c_1,$t_2
1573 sltu $c_2,$c_1,$t_2
1574 mflo $t_1
1575 mfhi $t_2
1576 $ADDU $c_3,$t_1
1577 sltu $at,$c_3,$t_1
1578 $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2);
1579 $ADDU $t_2,$at
1580 $ADDU $c_1,$t_2
1581 sltu $at,$c_1,$t_2
1582 $ADDU $c_2,$at
1583 mflo $t_1
1584 mfhi $t_2
1585 $ADDU $c_3,$t_1
1586 sltu $at,$c_3,$t_1
1587 $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2);
1588 $ADDU $t_2,$at
1589 $ADDU $c_1,$t_2
1590 sltu $at,$c_1,$t_2
1591 $ADDU $c_2,$at
1592 mflo $t_1
1593 mfhi $t_2
1594 $ADDU $c_3,$t_1
1595 sltu $at,$c_3,$t_1
1596 $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3);
1597 $ADDU $t_2,$at
1598 $ADDU $c_1,$t_2
1599 sltu $at,$c_1,$t_2
1600 $ADDU $c_2,$at
1601 $ST $c_3,11*$BNSZ($a0) # r[11]=c3;
1602
1603 mflo $t_1
1604 mfhi $t_2
1605 $ADDU $c_1,$t_1
1606 sltu $at,$c_1,$t_1
1607 $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3);
1608 $ADDU $t_2,$at
1609 $ADDU $c_2,$t_2
1610 sltu $c_3,$c_2,$t_2
1611 mflo $t_1
1612 mfhi $t_2
1613 $ADDU $c_1,$t_1
1614 sltu $at,$c_1,$t_1
1615 $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3);
1616 $ADDU $t_2,$at
1617 $ADDU $c_2,$t_2
1618 sltu $at,$c_2,$t_2
1619 $ADDU $c_3,$at
1620 mflo $t_1
1621 mfhi $t_2
1622 $ADDU $c_1,$t_1
1623 sltu $at,$c_1,$t_1
1624 $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1);
1625 $ADDU $t_2,$at
1626 $ADDU $c_2,$t_2
1627 sltu $at,$c_2,$t_2
1628 $ADDU $c_3,$at
1629 $ST $c_1,12*$BNSZ($a0) # r[12]=c1;
1630
1631 mflo $t_1
1632 mfhi $t_2
1633 $ADDU $c_2,$t_1
1634 sltu $at,$c_2,$t_1
1635 $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1);
1636 $ADDU $t_2,$at
1637 $ADDU $c_3,$t_2
1638 sltu $c_1,$c_3,$t_2
1639 mflo $t_1
1640 mfhi $t_2
1641 $ADDU $c_2,$t_1
1642 sltu $at,$c_2,$t_1
1643 $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2);
1644 $ADDU $t_2,$at
1645 $ADDU $c_3,$t_2
1646 sltu $at,$c_3,$t_2
1647 $ADDU $c_1,$at
1648 $ST $c_2,13*$BNSZ($a0) # r[13]=c2;
1649
1650 mflo $t_1
1651 mfhi $t_2
1652 $ADDU $c_3,$t_1
1653 sltu $at,$c_3,$t_1
1654 $ADDU $t_2,$at
1655 $ADDU $c_1,$t_2
1656 $ST $c_3,14*$BNSZ($a0) # r[14]=c3;
1657 $ST $c_1,15*$BNSZ($a0) # r[15]=c1;
1658
1659 .set noreorder
1660___
1661$code.=<<___ if ($flavour =~ /nubi/i);
1662 $REG_L $s5,10*$SZREG($sp)
1663 $REG_L $s4,9*$SZREG($sp)
1664 $REG_L $s3,8*$SZREG($sp)
1665 $REG_L $s2,7*$SZREG($sp)
1666 $REG_L $s1,6*$SZREG($sp)
1667 $REG_L $s0,5*$SZREG($sp)
1668 $REG_L $t3,4*$SZREG($sp)
1669 $REG_L $t2,3*$SZREG($sp)
1670 $REG_L $t1,2*$SZREG($sp)
1671 $REG_L $t0,1*$SZREG($sp)
1672 $REG_L $gp,0*$SZREG($sp)
1673 jr $ra
1674 $PTR_ADD $sp,12*$SZREG
1675___
1676$code.=<<___ if ($flavour !~ /nubi/i);
1677 $REG_L $s5,5*$SZREG($sp)
1678 $REG_L $s4,4*$SZREG($sp)
1679 $REG_L $s3,3*$SZREG($sp)
1680 $REG_L $s2,2*$SZREG($sp)
1681 $REG_L $s1,1*$SZREG($sp)
1682 $REG_L $s0,0*$SZREG($sp)
1683 jr $ra
1684 $PTR_ADD $sp,6*$SZREG
1685___
1686$code.=<<___;
1687.end bn_mul_comba8
1688
1689.align 5
1690.globl bn_mul_comba4
1691.ent bn_mul_comba4
1692bn_mul_comba4:
1693___
1694$code.=<<___ if ($flavour =~ /nubi/i);
1695 .frame $sp,6*$SZREG,$ra
1696 .mask 0x8000f008,-$SZREG
1697 .set noreorder
1698 $PTR_SUB $sp,6*$SZREG
1699 $REG_S $ra,5*$SZREG($sp)
1700 $REG_S $t3,4*$SZREG($sp)
1701 $REG_S $t2,3*$SZREG($sp)
1702 $REG_S $t1,2*$SZREG($sp)
1703 $REG_S $t0,1*$SZREG($sp)
1704 $REG_S $gp,0*$SZREG($sp)
1705___
1706$code.=<<___;
1707 .set reorder
1708 $LD $a_0,0($a1)
1709 $LD $b_0,0($a2)
1710 $LD $a_1,$BNSZ($a1)
1711 $LD $a_2,2*$BNSZ($a1)
1712 $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1713 $LD $a_3,3*$BNSZ($a1)
1714 $LD $b_1,$BNSZ($a2)
1715 $LD $b_2,2*$BNSZ($a2)
1716 $LD $b_3,3*$BNSZ($a2)
1717 mflo $c_1
1718 mfhi $c_2
1719 $ST $c_1,0($a0)
1720
1721 $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
1722 mflo $t_1
1723 mfhi $t_2
1724 $ADDU $c_2,$t_1
1725 sltu $at,$c_2,$t_1
1726 $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
1727 $ADDU $c_3,$t_2,$at
1728 mflo $t_1
1729 mfhi $t_2
1730 $ADDU $c_2,$t_1
1731 sltu $at,$c_2,$t_1
1732 $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
1733 $ADDU $t_2,$at
1734 $ADDU $c_3,$t_2
1735 sltu $c_1,$c_3,$t_2
1736 $ST $c_2,$BNSZ($a0)
1737
1738 mflo $t_1
1739 mfhi $t_2
1740 $ADDU $c_3,$t_1
1741 sltu $at,$c_3,$t_1
1742 $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
1743 $ADDU $t_2,$at
1744 $ADDU $c_1,$t_2
1745 mflo $t_1
1746 mfhi $t_2
1747 $ADDU $c_3,$t_1
1748 sltu $at,$c_3,$t_1
1749 $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
1750 $ADDU $t_2,$at
1751 $ADDU $c_1,$t_2
1752 sltu $c_2,$c_1,$t_2
1753 mflo $t_1
1754 mfhi $t_2
1755 $ADDU $c_3,$t_1
1756 sltu $at,$c_3,$t_1
1757 $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
1758 $ADDU $t_2,$at
1759 $ADDU $c_1,$t_2
1760 sltu $at,$c_1,$t_2
1761 $ADDU $c_2,$at
1762 $ST $c_3,2*$BNSZ($a0)
1763
1764 mflo $t_1
1765 mfhi $t_2
1766 $ADDU $c_1,$t_1
1767 sltu $at,$c_1,$t_1
1768 $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
1769 $ADDU $t_2,$at
1770 $ADDU $c_2,$t_2
1771 sltu $c_3,$c_2,$t_2
1772 mflo $t_1
1773 mfhi $t_2
1774 $ADDU $c_1,$t_1
1775 sltu $at,$c_1,$t_1
1776 $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
1777 $ADDU $t_2,$at
1778 $ADDU $c_2,$t_2
1779 sltu $at,$c_2,$t_2
1780 $ADDU $c_3,$at
1781 mflo $t_1
1782 mfhi $t_2
1783 $ADDU $c_1,$t_1
1784 sltu $at,$c_1,$t_1
1785 $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
1786 $ADDU $t_2,$at
1787 $ADDU $c_2,$t_2
1788 sltu $at,$c_2,$t_2
1789 $ADDU $c_3,$at
1790 mflo $t_1
1791 mfhi $t_2
1792 $ADDU $c_1,$t_1
1793 sltu $at,$c_1,$t_1
1794 $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
1795 $ADDU $t_2,$at
1796 $ADDU $c_2,$t_2
1797 sltu $at,$c_2,$t_2
1798 $ADDU $c_3,$at
1799 $ST $c_1,3*$BNSZ($a0)
1800
1801 mflo $t_1
1802 mfhi $t_2
1803 $ADDU $c_2,$t_1
1804 sltu $at,$c_2,$t_1
1805 $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
1806 $ADDU $t_2,$at
1807 $ADDU $c_3,$t_2
1808 sltu $c_1,$c_3,$t_2
1809 mflo $t_1
1810 mfhi $t_2
1811 $ADDU $c_2,$t_1
1812 sltu $at,$c_2,$t_1
1813 $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
1814 $ADDU $t_2,$at
1815 $ADDU $c_3,$t_2
1816 sltu $at,$c_3,$t_2
1817 $ADDU $c_1,$at
1818 mflo $t_1
1819 mfhi $t_2
1820 $ADDU $c_2,$t_1
1821 sltu $at,$c_2,$t_1
1822 $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
1823 $ADDU $t_2,$at
1824 $ADDU $c_3,$t_2
1825 sltu $at,$c_3,$t_2
1826 $ADDU $c_1,$at
1827 $ST $c_2,4*$BNSZ($a0)
1828
1829 mflo $t_1
1830 mfhi $t_2
1831 $ADDU $c_3,$t_1
1832 sltu $at,$c_3,$t_1
1833 $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
1834 $ADDU $t_2,$at
1835 $ADDU $c_1,$t_2
1836 sltu $c_2,$c_1,$t_2
1837 mflo $t_1
1838 mfhi $t_2
1839 $ADDU $c_3,$t_1
1840 sltu $at,$c_3,$t_1
1841 $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
1842 $ADDU $t_2,$at
1843 $ADDU $c_1,$t_2
1844 sltu $at,$c_1,$t_2
1845 $ADDU $c_2,$at
1846 $ST $c_3,5*$BNSZ($a0)
1847
1848 mflo $t_1
1849 mfhi $t_2
1850 $ADDU $c_1,$t_1
1851 sltu $at,$c_1,$t_1
1852 $ADDU $t_2,$at
1853 $ADDU $c_2,$t_2
1854 $ST $c_1,6*$BNSZ($a0)
1855 $ST $c_2,7*$BNSZ($a0)
1856
1857 .set noreorder
1858___
1859$code.=<<___ if ($flavour =~ /nubi/i);
1860 $REG_L $t3,4*$SZREG($sp)
1861 $REG_L $t2,3*$SZREG($sp)
1862 $REG_L $t1,2*$SZREG($sp)
1863 $REG_L $t0,1*$SZREG($sp)
1864 $REG_L $gp,0*$SZREG($sp)
1865 $PTR_ADD $sp,6*$SZREG
1866___
1867$code.=<<___;
1868 jr $ra
1869 nop
1870.end bn_mul_comba4
1871___
1872
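# A minimal reference model (hypothetical, for illustration only) of the
# comba pattern that the generated bn_mul_comba4/bn_mul_comba8 code above
# follows: every product a[i]*b[j] with i+j == k is folded into a rotating
# three-word carry chain (c1,c2,c3) and one result word is emitted per
# column.  16-bit words keep the arithmetic within native Perl integers.
sub comba4_model {
	my ($a, $b) = @_;	# refs to 4-element arrays of 16-bit words
	my $MASK = 0xffff;
	my @r;
	my ($c1, $c2, $c3) = (0, 0, 0);
	for my $k (0 .. 6) {			# columns r[0]..r[6]
		for my $i (0 .. 3) {
			my $j = $k - $i;
			next if $j < 0 || $j > 3;
			my $p = $a->[$i] * $b->[$j];	# one mul_add_c step
			$c1 += $p & $MASK;
			$c2 += ($p >> 16) + ($c1 >> 16);
			$c1 &= $MASK;
			$c3 += $c2 >> 16;
			$c2 &= $MASK;
		}
		push @r, $c1;			# store r[k]
		($c1, $c2, $c3) = ($c2, $c3, 0);	# rotate the carry chain
	}
	push @r, $c1;				# r[7]
	return @r;
}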
1873($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
1874
1875sub add_c2 () {
1876my ($hi,$lo,$c0,$c1,$c2,
1877    $warm,	# !$warm denotes the first call with a specific sequence of
1878		# $c_[XYZ], when there is no Z-carry to accumulate yet;
1879    $an,$bn	# these two are arguments for a multiplication whose
1880		# result is used in the *next* step [which is why it's
1881		# commented as "forward multiplication" below];
1882 )=@_;
1883$code.=<<___;
1884 mflo $lo
1885 mfhi $hi
1886 $ADDU $c0,$lo
1887 sltu $at,$c0,$lo
1888 $MULTU $an,$bn # forward multiplication
1889 $ADDU $c0,$lo
1890 $ADDU $at,$hi
1891 sltu $lo,$c0,$lo
1892 $ADDU $c1,$at
1893 $ADDU $hi,$lo
1894___
1895$code.=<<___ if (!$warm);
1896 sltu $c2,$c1,$at
1897 $ADDU $c1,$hi
1898 sltu $hi,$c1,$hi
1899 $ADDU $c2,$hi
1900___
1901$code.=<<___ if ($warm);
1902 sltu $at,$c1,$at
1903 $ADDU $c1,$hi
1904 $ADDU $c2,$at
1905 sltu $hi,$c1,$hi
1906 $ADDU $c2,$hi
1907___
1908}
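# A hypothetical model (illustration only, 16-bit words) of the mul_add_c2
# step that add_c2() above emits for the squaring routines below: the
# off-diagonal product a*b is accumulated twice into the (c0,c1,c2) chain,
# so the doubled high part can be up to 17 bits and may carry into c2.
sub mul_add_c2_model {
	my ($a, $b, $c0, $c1, $c2) = @_;
	my $MASK = 0xffff;
	my $p  = 2 * $a * $b;		# doubled cross product
	my $lo = $p & $MASK;
	my $hi = $p >> 16;		# up to 17 bits after doubling
	$c0 += $lo;
	$hi += $c0 >> 16;		# fold c0's carry into the high part
	$c0 &= $MASK;
	$c1 += $hi & $MASK;
	$c2 += ($hi >> 16) + ($c1 >> 16);
	$c1 &= $MASK;
	return ($c0, $c1, $c2);
}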
1909
1910$code.=<<___;
1911
1912.align 5
1913.globl bn_sqr_comba8
1914.ent bn_sqr_comba8
1915bn_sqr_comba8:
1916___
1917$code.=<<___ if ($flavour =~ /nubi/i);
1918 .frame $sp,6*$SZREG,$ra
1919 .mask 0x8000f008,-$SZREG
1920 .set noreorder
1921 $PTR_SUB $sp,6*$SZREG
1922 $REG_S $ra,5*$SZREG($sp)
1923 $REG_S $t3,4*$SZREG($sp)
1924 $REG_S $t2,3*$SZREG($sp)
1925 $REG_S $t1,2*$SZREG($sp)
1926 $REG_S $t0,1*$SZREG($sp)
1927 $REG_S $gp,0*$SZREG($sp)
1928___
1929$code.=<<___;
1930 .set reorder
1931 $LD $a_0,0($a1)
1932 $LD $a_1,$BNSZ($a1)
1933 $LD $a_2,2*$BNSZ($a1)
1934 $LD $a_3,3*$BNSZ($a1)
1935
1936 $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1937 $LD $a_4,4*$BNSZ($a1)
1938 $LD $a_5,5*$BNSZ($a1)
1939 $LD $a_6,6*$BNSZ($a1)
1940 $LD $a_7,7*$BNSZ($a1)
1941 mflo $c_1
1942 mfhi $c_2
1943 $ST $c_1,0($a0)
1944
1945 $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
1946 mflo $t_1
1947 mfhi $t_2
1948 slt $c_1,$t_2,$zero
1949 $SLL $t_2,1
1950 $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
1951 slt $a2,$t_1,$zero
1952 $ADDU $t_2,$a2
1953 $SLL $t_1,1
1954 $ADDU $c_2,$t_1
1955 sltu $at,$c_2,$t_1
1956 $ADDU $c_3,$t_2,$at
1957 $ST $c_2,$BNSZ($a0)
1958___
1959 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
1960 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
1961$code.=<<___;
1962 mflo $t_1
1963 mfhi $t_2
1964 $ADDU $c_3,$t_1
1965 sltu $at,$c_3,$t_1
1966 $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
1967 $ADDU $t_2,$at
1968 $ADDU $c_1,$t_2
1969 sltu $at,$c_1,$t_2
1970 $ADDU $c_2,$at
1971 $ST $c_3,2*$BNSZ($a0)
1972___
1973 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
1974 $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
1975 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
1976 $a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1);
1977$code.=<<___;
1978 $ST $c_1,3*$BNSZ($a0)
1979___
1980 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
1981 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
1982 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
1983 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
1984$code.=<<___;
1985 mflo $t_1
1986 mfhi $t_2
1987 $ADDU $c_2,$t_1
1988 sltu $at,$c_2,$t_1
1989 $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2);
1990 $ADDU $t_2,$at
1991 $ADDU $c_3,$t_2
1992 sltu $at,$c_3,$t_2
1993 $ADDU $c_1,$at
1994 $ST $c_2,4*$BNSZ($a0)
1995___
1996 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
1997 $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2);
1998 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
1999 $a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2);
2000 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2001 $a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3);
2002$code.=<<___;
2003 $ST $c_3,5*$BNSZ($a0)
2004___
2005 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2006 $a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3);
2007 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2008 $a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3);
2009 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2010 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
2011$code.=<<___;
2012 mflo $t_1
2013 mfhi $t_2
2014 $ADDU $c_1,$t_1
2015 sltu $at,$c_1,$t_1
2016 $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1);
2017 $ADDU $t_2,$at
2018 $ADDU $c_2,$t_2
2019 sltu $at,$c_2,$t_2
2020 $ADDU $c_3,$at
2021 $ST $c_1,6*$BNSZ($a0)
2022___
2023 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2024 $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1);
2025 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2026 $a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1);
2027 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2028 $a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1);
2029 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2030 $a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2);
2031$code.=<<___;
2032 $ST $c_2,7*$BNSZ($a0)
2033___
2034 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2035 $a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2);
2036 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2037 $a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2);
2038 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2039 $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2);
2040$code.=<<___;
2041 mflo $t_1
2042 mfhi $t_2
2043 $ADDU $c_3,$t_1
2044 sltu $at,$c_3,$t_1
2045 $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3);
2046 $ADDU $t_2,$at
2047 $ADDU $c_1,$t_2
2048 sltu $at,$c_1,$t_2
2049 $ADDU $c_2,$at
2050 $ST $c_3,8*$BNSZ($a0)
2051___
2052 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2053 $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3);
2054 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2055 $a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3);
2056 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2057 $a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1);
2058$code.=<<___;
2059 $ST $c_1,9*$BNSZ($a0)
2060___
2061 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2062 $a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1);
2063 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2064 $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1);
2065$code.=<<___;
2066 mflo $t_1
2067 mfhi $t_2
2068 $ADDU $c_2,$t_1
2069 sltu $at,$c_2,$t_1
2070 $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2);
2071 $ADDU $t_2,$at
2072 $ADDU $c_3,$t_2
2073 sltu $at,$c_3,$t_2
2074 $ADDU $c_1,$at
2075 $ST $c_2,10*$BNSZ($a0)
2076___
2077 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2078 $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2);
2079 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2080 $a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3);
2081$code.=<<___;
2082 $ST $c_3,11*$BNSZ($a0)
2083___
2084 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2085 $a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3);
2086$code.=<<___;
2087 mflo $t_1
2088 mfhi $t_2
2089 $ADDU $c_1,$t_1
2090 sltu $at,$c_1,$t_1
2091 $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1);
2092 $ADDU $t_2,$at
2093 $ADDU $c_2,$t_2
2094 sltu $at,$c_2,$t_2
2095 $ADDU $c_3,$at
2096 $ST $c_1,12*$BNSZ($a0)
2097___
2098 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2099 $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2);
2100$code.=<<___;
2101 $ST $c_2,13*$BNSZ($a0)
2102
2103 mflo $t_1
2104 mfhi $t_2
2105 $ADDU $c_3,$t_1
2106 sltu $at,$c_3,$t_1
2107 $ADDU $t_2,$at
2108 $ADDU $c_1,$t_2
2109 $ST $c_3,14*$BNSZ($a0)
2110 $ST $c_1,15*$BNSZ($a0)
2111
2112 .set noreorder
2113___
2114$code.=<<___ if ($flavour =~ /nubi/i);
2115 $REG_L $t3,4*$SZREG($sp)
2116 $REG_L $t2,3*$SZREG($sp)
2117 $REG_L $t1,2*$SZREG($sp)
2118 $REG_L $t0,1*$SZREG($sp)
2119 $REG_L $gp,0*$SZREG($sp)
2120 $PTR_ADD $sp,6*$SZREG
2121___
2122$code.=<<___;
2123 jr $ra
2124 nop
2125.end bn_sqr_comba8
2126
2127.align 5
2128.globl bn_sqr_comba4
2129.ent bn_sqr_comba4
2130bn_sqr_comba4:
2131___
2132$code.=<<___ if ($flavour =~ /nubi/i);
2133 .frame $sp,6*$SZREG,$ra
2134 .mask 0x8000f008,-$SZREG
2135 .set noreorder
2136 $PTR_SUB $sp,6*$SZREG
2137 $REG_S $ra,5*$SZREG($sp)
2138 $REG_S $t3,4*$SZREG($sp)
2139 $REG_S $t2,3*$SZREG($sp)
2140 $REG_S $t1,2*$SZREG($sp)
2141 $REG_S $t0,1*$SZREG($sp)
2142 $REG_S $gp,0*$SZREG($sp)
2143___
2144$code.=<<___;
2145 .set reorder
2146 $LD $a_0,0($a1)
2147 $LD $a_1,$BNSZ($a1)
2148 $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
2149 $LD $a_2,2*$BNSZ($a1)
2150 $LD $a_3,3*$BNSZ($a1)
2151 mflo $c_1
2152 mfhi $c_2
2153 $ST $c_1,0($a0)
2154
2155 $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
2156 mflo $t_1
2157 mfhi $t_2
2158 slt $c_1,$t_2,$zero
2159 $SLL $t_2,1
2160 $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
2161 slt $a2,$t_1,$zero
2162 $ADDU $t_2,$a2
2163 $SLL $t_1,1
2164 $ADDU $c_2,$t_1
2165 sltu $at,$c_2,$t_1
2166 $ADDU $c_3,$t_2,$at
2167 $ST $c_2,$BNSZ($a0)
2168___
2169 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2170 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
2171$code.=<<___;
2172 mflo $t_1
2173 mfhi $t_2
2174 $ADDU $c_3,$t_1
2175 sltu $at,$c_3,$t_1
2176 $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
2177 $ADDU $t_2,$at
2178 $ADDU $c_1,$t_2
2179 sltu $at,$c_1,$t_2
2180 $ADDU $c_2,$at
2181 $ST $c_3,2*$BNSZ($a0)
2182___
2183 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2184			$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
2185 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2186 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
2187$code.=<<___;
2188 $ST $c_1,3*$BNSZ($a0)
2189___
2190 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2191 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
2192$code.=<<___;
2193 mflo $t_1
2194 mfhi $t_2
2195 $ADDU $c_2,$t_1
2196 sltu $at,$c_2,$t_1
2197 $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2);
2198 $ADDU $t_2,$at
2199 $ADDU $c_3,$t_2
2200 sltu $at,$c_3,$t_2
2201 $ADDU $c_1,$at
2202 $ST $c_2,4*$BNSZ($a0)
2203___
2204 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2205 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
2206$code.=<<___;
2207 $ST $c_3,5*$BNSZ($a0)
2208
2209 mflo $t_1
2210 mfhi $t_2
2211 $ADDU $c_1,$t_1
2212 sltu $at,$c_1,$t_1
2213 $ADDU $t_2,$at
2214 $ADDU $c_2,$t_2
2215 $ST $c_1,6*$BNSZ($a0)
2216 $ST $c_2,7*$BNSZ($a0)
2217
2218 .set noreorder
2219___
2220$code.=<<___ if ($flavour =~ /nubi/i);
2221 $REG_L $t3,4*$SZREG($sp)
2222 $REG_L $t2,3*$SZREG($sp)
2223 $REG_L $t1,2*$SZREG($sp)
2224 $REG_L $t0,1*$SZREG($sp)
2225 $REG_L $gp,0*$SZREG($sp)
2226 $PTR_ADD $sp,6*$SZREG
2227___
2228$code.=<<___;
2229 jr $ra
2230 nop
2231.end bn_sqr_comba4
2232___
2233print $code;
2234close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl b/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl
deleted file mode 100644
index 8645d5adcc..0000000000
--- a/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl
+++ /dev/null
@@ -1,1393 +0,0 @@
1#!/usr/bin/env perl
2#
3# Copyright (c) 2010-2011 Intel Corp.
4# Author: Vinodh.Gopal@intel.com
5# Jim Guilford
6# Erdinc.Ozturk@intel.com
7# Maxim.Perminov@intel.com
8#
10# More information about the algorithm used can be found at:
10# http://www.cse.buffalo.edu/srds2009/escs2009_submission_Gopal.pdf
11#
12# ====================================================================
13# Copyright (c) 2011 The OpenSSL Project. All rights reserved.
14#
15# Redistribution and use in source and binary forms, with or without
16# modification, are permitted provided that the following conditions
17# are met:
18#
19# 1. Redistributions of source code must retain the above copyright
20# notice, this list of conditions and the following disclaimer.
21#
22# 2. Redistributions in binary form must reproduce the above copyright
23# notice, this list of conditions and the following disclaimer in
24# the documentation and/or other materials provided with the
25# distribution.
26#
27# 3. All advertising materials mentioning features or use of this
28# software must display the following acknowledgment:
29# "This product includes software developed by the OpenSSL Project
30# for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
31#
32# 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
33# endorse or promote products derived from this software without
34# prior written permission. For written permission, please contact
35# licensing@OpenSSL.org.
36#
37# 5. Products derived from this software may not be called "OpenSSL"
38# nor may "OpenSSL" appear in their names without prior written
39# permission of the OpenSSL Project.
40#
41# 6. Redistributions of any form whatsoever must retain the following
42# acknowledgment:
43# "This product includes software developed by the OpenSSL Project
44# for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
45#
46# THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
47# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
49# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
50# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
51# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
52# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
55# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
56# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
57# OF THE POSSIBILITY OF SUCH DAMAGE.
58# ====================================================================
59
60$flavour = shift;
61$output = shift;
62if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
63
64$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
65( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
66( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
67die "can't locate x86_64-xlate.pl";
68
69open OUT,"| \"$^X\" $xlate $flavour $output";
70*STDOUT=*OUT;
71
72use strict;
73my $code=".text\n\n";
74my $m=0;
75
76#
77# Define x512 macros
78#
79
80#MULSTEP_512_ADD MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src1, src2, add_src, tmp1, tmp2
81#
82# uses rax, rdx, and args
83sub MULSTEP_512_ADD
84{
85 my ($x, $DST, $SRC2, $ASRC, $OP, $TMP)=@_;
86 my @X=@$x; # make a copy
87$code.=<<___;
88 mov (+8*0)($SRC2), %rax
89 mul $OP # rdx:rax = %OP * [0]
90 mov ($ASRC), $X[0]
91 add %rax, $X[0]
92 adc \$0, %rdx
93 mov $X[0], $DST
94___
95for(my $i=1;$i<8;$i++) {
96$code.=<<___;
97 mov %rdx, $TMP
98
99 mov (+8*$i)($SRC2), %rax
100 mul $OP # rdx:rax = %OP * [$i]
101 mov (+8*$i)($ASRC), $X[$i]
102 add %rax, $X[$i]
103 adc \$0, %rdx
104 add $TMP, $X[$i]
105 adc \$0, %rdx
106___
107}
108$code.=<<___;
109 mov %rdx, $X[0]
110___
111}
112
113#MULSTEP_512 MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src2, src1_val, tmp
114#
115# uses rax, rdx, and args
116sub MULSTEP_512
117{
118 my ($x, $DST, $SRC2, $OP, $TMP)=@_;
119 my @X=@$x; # make a copy
120$code.=<<___;
121 mov (+8*0)($SRC2), %rax
122 mul $OP # rdx:rax = %OP * [0]
123 add %rax, $X[0]
124 adc \$0, %rdx
125 mov $X[0], $DST
126___
127for(my $i=1;$i<8;$i++) {
128$code.=<<___;
129 mov %rdx, $TMP
130
131 mov (+8*$i)($SRC2), %rax
132 mul $OP # rdx:rax = %OP * [$i]
133 add %rax, $X[$i]
134 adc \$0, %rdx
135 add $TMP, $X[$i]
136 adc \$0, %rdx
137___
138}
139$code.=<<___;
140 mov %rdx, $X[0]
141___
142}
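# A hypothetical reference model (illustration only) of what one emitted
# MULSTEP_512 computes: X := X + OP*SRC2 over eight limbs, the lowest limb
# is written out to DST and the final carry word takes X[0]'s place, after
# which the callers rotate @X with push/shift.  MULSTEP_512_ADD additionally
# folds in the ASRC operand.  16-bit limbs keep the arithmetic within
# native Perl integers.
sub mulstep_model {
	my ($x, $src2, $op) = @_;	# $x, $src2: refs to 8-limb arrays
	my $MASK  = 0xffff;
	my $carry = 0;
	for my $i (0 .. 7) {
		my $p = $x->[$i] + $src2->[$i] * $op + $carry;
		$x->[$i] = $p & $MASK;
		$carry   = $p >> 16;
	}
	my $dst = $x->[0];		# lowest limb goes out to memory (DST)
	$x->[0] = $carry;		# high carry word replaces X[0]
	return $dst;
}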
143
144#
145# Swizzle Macros
146#
147
148# macro to copy data from flat space to swizzled table
149#MACRO swizzle pDst, pSrc, tmp1, tmp2
150# pDst and pSrc are modified
151sub swizzle
152{
153 my ($pDst, $pSrc, $cnt, $d0)=@_;
154$code.=<<___;
155 mov \$8, $cnt
156loop_$m:
157 mov ($pSrc), $d0
158 mov $d0#w, ($pDst)
159 shr \$16, $d0
160 mov $d0#w, (+64*1)($pDst)
161 shr \$16, $d0
162 mov $d0#w, (+64*2)($pDst)
163 shr \$16, $d0
164 mov $d0#w, (+64*3)($pDst)
165 lea 8($pSrc), $pSrc
166 lea 64*4($pDst), $pDst
167 dec $cnt
168 jnz loop_$m
169___
170
171 $m++;
172}
173
174# macro to copy data from swizzled table to flat space
175#MACRO unswizzle pDst, pSrc, tmp*3
176sub unswizzle
177{
178 my ($pDst, $pSrc, $cnt, $d0, $d1)=@_;
179$code.=<<___;
180 mov \$4, $cnt
181loop_$m:
182 movzxw (+64*3+256*0)($pSrc), $d0
183 movzxw (+64*3+256*1)($pSrc), $d1
184 shl \$16, $d0
185 shl \$16, $d1
186 mov (+64*2+256*0)($pSrc), $d0#w
187 mov (+64*2+256*1)($pSrc), $d1#w
188 shl \$16, $d0
189 shl \$16, $d1
190 mov (+64*1+256*0)($pSrc), $d0#w
191 mov (+64*1+256*1)($pSrc), $d1#w
192 shl \$16, $d0
193 shl \$16, $d1
194 mov (+64*0+256*0)($pSrc), $d0#w
195 mov (+64*0+256*1)($pSrc), $d1#w
196 mov $d0, (+8*0)($pDst)
197 mov $d1, (+8*1)($pDst)
198 lea 256*2($pSrc), $pSrc
199 lea 8*2($pDst), $pDst
200 sub \$1, $cnt
201 jnz loop_$m
202___
203
204 $m++;
205}
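# A hypothetical model (illustration only) of the layout produced by
# swizzle() and undone by unswizzle(): each 64-bit source word is split
# into four 16-bit chunks placed 64 bytes apart, and consecutive source
# words start 256 bytes apart in the table.
sub swizzle_model {
	my @src = @_;			# 8 source qwords (one table entry)
	my %tbl;			# byte offset -> 16-bit chunk
	my $off = 0;
	for my $q (@src) {
		for my $j (0 .. 3) {
			$tbl{ $off + 64 * $j } = ($q >> (16 * $j)) & 0xffff;
		}
		$off += 64 * 4;		# next source qword starts 256 bytes on
	}
	return %tbl;
}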
206
207#
208# Data Structures
209#
210
211# Reduce Data
212#
213#
214# Offset Value
215# 0C0 Carries
216# 0B8 X2[10]
217# 0B0 X2[9]
218# 0A8 X2[8]
219# 0A0 X2[7]
220# 098 X2[6]
221# 090 X2[5]
222# 088 X2[4]
223# 080 X2[3]
224# 078 X2[2]
225# 070 X2[1]
226# 068 X2[0]
227# 060 X1[12] P[10]
228# 058 X1[11] P[9] Z[8]
229# 050 X1[10] P[8] Z[7]
230# 048 X1[9] P[7] Z[6]
231# 040 X1[8] P[6] Z[5]
232# 038 X1[7] P[5] Z[4]
233# 030 X1[6] P[4] Z[3]
234# 028 X1[5] P[3] Z[2]
235# 020 X1[4] P[2] Z[1]
236# 018 X1[3] P[1] Z[0]
237# 010 X1[2] P[0] Y[2]
238# 008 X1[1] Q[1] Y[1]
239# 000 X1[0] Q[0] Y[0]
240
241my $X1_offset = 0; # 13 qwords
242my $X2_offset = $X1_offset + 13*8; # 11 qwords
243my $Carries_offset = $X2_offset + 11*8; # 1 qword
244my $Q_offset = 0; # 2 qwords
245my $P_offset = $Q_offset + 2*8; # 11 qwords
246my $Y_offset = 0; # 3 qwords
247my $Z_offset = $Y_offset + 3*8; # 9 qwords
248
249my $Red_Data_Size = $Carries_offset + 1*8; # (25 qwords)
250
251#
252# Stack Frame
253#
254#
255# offset value
256# ... <old stack contents>
257# ...
258# 280 Garray
259
260# 278 tmp16[15]
261# ... ...
262# 200 tmp16[0]
263
264# 1F8 tmp[7]
265# ... ...
266# 1C0 tmp[0]
267
268# 1B8 GT[7]
269# ... ...
270# 180 GT[0]
271
272# 178 Reduce Data
273# ... ...
274# 0B8 Reduce Data
275# 0B0 reserved
276# 0A8 reserved
277# 0A0 reserved
278# 098 reserved
279# 090 reserved
280# 088 reduce result addr
281# 080 exp[8]
282
283# ...
284# 048 exp[1]
285# 040 exp[0]
286
287# 038 reserved
288# 030 loop_idx
289# 028 pg
290# 020 i
291# 018 pData ; arg 4
292# 010 pG ; arg 2
293# 008 pResult ; arg 1
294# 000 rsp ; stack pointer before subtract
295
296my $rsp_offset = 0;
297my $pResult_offset = 8*1 + $rsp_offset;
298my $pG_offset = 8*1 + $pResult_offset;
299my $pData_offset = 8*1 + $pG_offset;
300my $i_offset = 8*1 + $pData_offset;
301my $pg_offset = 8*1 + $i_offset;
302my $loop_idx_offset = 8*1 + $pg_offset;
303my $reserved1_offset = 8*1 + $loop_idx_offset;
304my $exp_offset = 8*1 + $reserved1_offset;
305my $red_result_addr_offset= 8*9 + $exp_offset;
306my $reserved2_offset = 8*1 + $red_result_addr_offset;
307my $Reduce_Data_offset = 8*5 + $reserved2_offset;
308my $GT_offset = $Red_Data_Size + $Reduce_Data_offset;
309my $tmp_offset = 8*8 + $GT_offset;
310my $tmp16_offset = 8*8 + $tmp_offset;
311my $garray_offset = 8*16 + $tmp16_offset;
312my $mem_size = 8*8*32 + $garray_offset;
313
314#
315# Offsets within Reduce Data
316#
317#
318# struct MODF_2FOLD_MONT_512_C1_DATA {
319# UINT64 t[8][8];
320# UINT64 m[8];
321# UINT64 m1[8]; /* 2^768 % m */
322# UINT64 m2[8]; /* 2^640 % m */
323# UINT64 k1[2]; /* (- 1/m) % 2^128 */
324# };
325
326my $T = 0;
327my $M = 512; # = 8 * 8 * 8
328my $M1 = 576; # = 8 * 8 * 9 /* += 8 * 8 */
329my $M2 = 640; # = 8 * 8 * 10 /* += 8 * 8 */
330my $K1 = 704; # = 8 * 8 * 11 /* += 8 * 8 */
331
332#
333# FUNCTIONS
334#
335
336{{{
337#
338# MULADD_128x512 : Function to multiply 128-bits (2 qwords) by 512-bits (8 qwords)
339# and add 512-bits (8 qwords)
340# to get 640 bits (10 qwords)
341# Input: 128-bit mul source: [rdi+8*1], rbp
342# 512-bit mul source: [rsi+8*n]
343# 512-bit add source: r15, r14, ..., r9, r8
344# Output: r9, r8, r15, r14, r13, r12, r11, r10, [rcx+8*1], [rcx+8*0]
345# Clobbers all regs except: rcx, rsi, rdi
346$code.=<<___;
347.type MULADD_128x512,\@abi-omnipotent
348.align 16
349MULADD_128x512:
350 _CET_ENDBR
351___
352 &MULSTEP_512([map("%r$_",(8..15))], "(+8*0)(%rcx)", "%rsi", "%rbp", "%rbx");
353$code.=<<___;
354 mov (+8*1)(%rdi), %rbp
355___
356 &MULSTEP_512([map("%r$_",(9..15,8))], "(+8*1)(%rcx)", "%rsi", "%rbp", "%rbx");
357$code.=<<___;
358 ret
359.size MULADD_128x512,.-MULADD_128x512
360___
361}}}
362
363{{{
364#MULADD_256x512 MACRO pDst, pA, pB, OP, TMP, X7, X6, X5, X4, X3, X2, X1, X0
365#
366# Inputs: pDst: Destination (768 bits, 12 qwords)
367# pA: Multiplicand (1024 bits, 16 qwords)
368# pB: Multiplicand (512 bits, 8 qwords)
369# Dst = Ah * B + Al
370# where Ah is (in qwords) A[15:12] (256 bits) and Al is A[7:0] (512 bits)
371# Results in X3 X2 X1 X0 X7 X6 X5 X4 Dst[3:0]
372# Uses registers: arguments, RAX, RDX
373sub MULADD_256x512
374{
375 my ($pDst, $pA, $pB, $OP, $TMP, $X)=@_;
376$code.=<<___;
377 mov (+8*12)($pA), $OP
378___
379 &MULSTEP_512_ADD($X, "(+8*0)($pDst)", $pB, $pA, $OP, $TMP);
380 push(@$X,shift(@$X));
381
382$code.=<<___;
383 mov (+8*13)($pA), $OP
384___
385 &MULSTEP_512($X, "(+8*1)($pDst)", $pB, $OP, $TMP);
386 push(@$X,shift(@$X));
387
388$code.=<<___;
389 mov (+8*14)($pA), $OP
390___
391 &MULSTEP_512($X, "(+8*2)($pDst)", $pB, $OP, $TMP);
392 push(@$X,shift(@$X));
393
394$code.=<<___;
395 mov (+8*15)($pA), $OP
396___
397 &MULSTEP_512($X, "(+8*3)($pDst)", $pB, $OP, $TMP);
398 push(@$X,shift(@$X));
399}
400
401#
402# mont_reduce(UINT64 *x, /* 1024 bits, 16 qwords */
403# UINT64 *m, /* 512 bits, 8 qwords */
404# MODF_2FOLD_MONT_512_C1_DATA *data,
405# UINT64 *r) /* 512 bits, 8 qwords */
406# Input: x (number to be reduced): tmp16 (Implicit)
407# m (modulus): [pM] (Implicit)
408# data (reduce data): [pData] (Implicit)
409# Output: r (result): Address in [red_res_addr]
410# result also in: r9, r8, r15, r14, r13, r12, r11, r10
411
412my @X=map("%r$_",(8..15));
413
414$code.=<<___;
415.type mont_reduce,\@abi-omnipotent
416.align 16
417mont_reduce:
418 _CET_ENDBR
419___
420
421my $STACK_DEPTH = 8;
422 #
423 # X1 = Xh * M1 + Xl
424$code.=<<___;
425 lea (+$Reduce_Data_offset+$X1_offset+$STACK_DEPTH)(%rsp), %rdi # pX1 (Dst) 769 bits, 13 qwords
426 mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rsi # pM1 (Bsrc) 512 bits, 8 qwords
427 add \$$M1, %rsi
428 lea (+$tmp16_offset+$STACK_DEPTH)(%rsp), %rcx # X (Asrc) 1024 bits, 16 qwords
429
430___
431
432 &MULADD_256x512("%rdi", "%rcx", "%rsi", "%rbp", "%rbx", \@X); # rotates @X 4 times
433 # results in r11, r10, r9, r8, r15, r14, r13, r12, X1[3:0]
434
435$code.=<<___;
436 xor %rax, %rax
437 # X1 += xl
438 add (+8*8)(%rcx), $X[4]
439 adc (+8*9)(%rcx), $X[5]
440 adc (+8*10)(%rcx), $X[6]
441 adc (+8*11)(%rcx), $X[7]
442 adc \$0, %rax
443 # X1 is now rax, r11-r8, r15-r12, tmp16[3:0]
444
445 #
446 # check for carry ;; carry stored in rax
447 mov $X[4], (+8*8)(%rdi) # rdi points to X1
448 mov $X[5], (+8*9)(%rdi)
449 mov $X[6], %rbp
450 mov $X[7], (+8*11)(%rdi)
451
452 mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
453
454 mov (+8*0)(%rdi), $X[4]
455 mov (+8*1)(%rdi), $X[5]
456 mov (+8*2)(%rdi), $X[6]
457 mov (+8*3)(%rdi), $X[7]
458
459 # X1 is now stored in: X1[11], rbp, X1[9:8], r15-r8
460 # rdi -> X1
461 # rsi -> M1
462
463 #
464 # X2 = Xh * M2 + Xl
465 # do first part (X2 = Xh * M2)
466 add \$8*10, %rdi # rdi -> pXh ; 128 bits, 2 qwords
467 # Xh is actually { [rdi+8*1], rbp }
468 add \$`$M2-$M1`, %rsi # rsi -> M2
469 lea (+$Reduce_Data_offset+$X2_offset+$STACK_DEPTH)(%rsp), %rcx # rcx -> pX2 ; 641 bits, 11 qwords
470___
471 unshift(@X,pop(@X)); unshift(@X,pop(@X));
472$code.=<<___;
473
474 call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8
475 # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
476 mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rax
477
478 # X2 += Xl
479 add (+8*8-8*10)(%rdi), $X[6] # (-8*10) is to adjust rdi -> Xh to Xl
480 adc (+8*9-8*10)(%rdi), $X[7]
481 mov $X[6], (+8*8)(%rcx)
482 mov $X[7], (+8*9)(%rcx)
483
484 adc %rax, %rax
485 mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
486
487 lea (+$Reduce_Data_offset+$Q_offset+$STACK_DEPTH)(%rsp), %rdi # rdi -> pQ ; 128 bits, 2 qwords
488 add \$`$K1-$M2`, %rsi # rsi -> pK1 ; 128 bits, 2 qwords
489
490 # MUL_128x128t128 rdi, rcx, rsi ; Q = X2 * K1 (bottom half)
491 # B1:B0 = rsi[1:0] = K1[1:0]
492 # A1:A0 = rcx[1:0] = X2[1:0]
493 # Result = rdi[1],rbp = Q[1],rbp
494 mov (%rsi), %r8 # B0
495 mov (+8*1)(%rsi), %rbx # B1
496
497 mov (%rcx), %rax # A0
498 mul %r8 # B0
499 mov %rax, %rbp
500 mov %rdx, %r9
501
502 mov (+8*1)(%rcx), %rax # A1
503 mul %r8 # B0
504 add %rax, %r9
505
506 mov (%rcx), %rax # A0
507 mul %rbx # B1
508 add %rax, %r9
509
510 mov %r9, (+8*1)(%rdi)
511 # end MUL_128x128t128
512
513 sub \$`$K1-$M`, %rsi
514
515 mov (%rcx), $X[6]
516 mov (+8*1)(%rcx), $X[7] # r9:r8 = X2[1:0]
517
518 call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8
519 # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
520
521 # load first half of m to rdx, rdi, rbx, rax
522 # moved this here for efficiency
523 mov (+8*0)(%rsi), %rax
524 mov (+8*1)(%rsi), %rbx
525 mov (+8*2)(%rsi), %rdi
526 mov (+8*3)(%rsi), %rdx
527
528 # continue with reduction
529 mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rbp
530
531 add (+8*8)(%rcx), $X[6]
532 adc (+8*9)(%rcx), $X[7]
533
534 #accumulate the final carry to rbp
535 adc %rbp, %rbp
536
537 # Add in overflow corrections: R = (X2>>128) += T[overflow]
538 # R = {r9, r8, r15, r14, ..., r10}
539 shl \$3, %rbp
540	mov	(+$pData_offset+$STACK_DEPTH)(%rsp), %rcx	# rcx -> Data (and points to T)
541 add %rcx, %rbp # pT ; 512 bits, 8 qwords, spread out
542
543 # rsi will be used to generate a mask after the addition
544 xor %rsi, %rsi
545
546 add (+8*8*0)(%rbp), $X[0]
547 adc (+8*8*1)(%rbp), $X[1]
548 adc (+8*8*2)(%rbp), $X[2]
549 adc (+8*8*3)(%rbp), $X[3]
550 adc (+8*8*4)(%rbp), $X[4]
551 adc (+8*8*5)(%rbp), $X[5]
552 adc (+8*8*6)(%rbp), $X[6]
553 adc (+8*8*7)(%rbp), $X[7]
554
555 # if there is a carry: rsi = 0xFFFFFFFFFFFFFFFF
556 # if carry is clear: rsi = 0x0000000000000000
557 sbb \$0, %rsi
558
559 # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
560 and %rsi, %rax
561 and %rsi, %rbx
562 and %rsi, %rdi
563 and %rsi, %rdx
564
565 mov \$1, %rbp
566 sub %rax, $X[0]
567 sbb %rbx, $X[1]
568 sbb %rdi, $X[2]
569 sbb %rdx, $X[3]
570
571 # if there is a borrow: rbp = 0
572 # if there is no borrow: rbp = 1
573 # this is used to save the borrows in between the first half and the 2nd half of the subtraction of m
574 sbb \$0, %rbp
575
576 #load second half of m to rdx, rdi, rbx, rax
577
578 add \$$M, %rcx
579 mov (+8*4)(%rcx), %rax
580 mov (+8*5)(%rcx), %rbx
581 mov (+8*6)(%rcx), %rdi
582 mov (+8*7)(%rcx), %rdx
583
584 # use the rsi mask as before
585 # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
586 and %rsi, %rax
587 and %rsi, %rbx
588 and %rsi, %rdi
589 and %rsi, %rdx
590
591 # if rbp = 0, there was a borrow before, it is moved to the carry flag
592 # if rbp = 1, there was not a borrow before, carry flag is cleared
593 sub \$1, %rbp
594
595 sbb %rax, $X[4]
596 sbb %rbx, $X[5]
597 sbb %rdi, $X[6]
598 sbb %rdx, $X[7]
599
600 # write R back to memory
601
602 mov (+$red_result_addr_offset+$STACK_DEPTH)(%rsp), %rsi
603 mov $X[0], (+8*0)(%rsi)
604 mov $X[1], (+8*1)(%rsi)
605 mov $X[2], (+8*2)(%rsi)
606 mov $X[3], (+8*3)(%rsi)
607 mov $X[4], (+8*4)(%rsi)
608 mov $X[5], (+8*5)(%rsi)
609 mov $X[6], (+8*6)(%rsi)
610 mov $X[7], (+8*7)(%rsi)
611
612 ret
613.size mont_reduce,.-mont_reduce
614___
615}}}
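
The tail of mont_reduce above keeps the final correction branch-free: an sbb of 0 from the zeroed %rsi converts the carry flag into 0 or all ones, that mask is ANDed into the loaded words of m, and the subtraction is always executed. The same masked conditional subtraction, as a minimal C sketch over eight 64-bit limbs (a hypothetical helper, not code from this module):

    #include <stdint.h>

    /* Subtract m from r only when carry == 1, without branching.
     * The mask is 0 or all ones; the loop runs either way.
     * Illustrative helper, not part of mont_reduce itself. */
    static void cond_sub_512(uint64_t r[8], const uint64_t m[8], uint64_t carry)
    {
        uint64_t mask = 0 - carry;              /* 0x0... or 0xfff... */
        unsigned __int128 borrow = 0;

        for (int i = 0; i < 8; i++) {
            unsigned __int128 d =
                (unsigned __int128)r[i] - (m[i] & mask) - borrow;
            r[i] = (uint64_t)d;
            borrow = (d >> 64) & 1;             /* 1 if the limb wrapped */
        }
    }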
616
617{{{
618#MUL_512x512 MACRO pDst, pA, pB, x7, x6, x5, x4, x3, x2, x1, x0, tmp*2
619#
620# Inputs: pDst: Destination (1024 bits, 16 qwords)
621# pA: Multiplicand (512 bits, 8 qwords)
622# pB: Multiplicand (512 bits, 8 qwords)
623# Uses registers rax, rdx, args
624# B operand in [pB] and also in x7...x0
625sub MUL_512x512
626{
627 my ($pDst, $pA, $pB, $x, $OP, $TMP, $pDst_o)=@_;
628 my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
629 my @X=@$x; # make a copy
630
631$code.=<<___;
632 mov (+8*0)($pA), $OP
633
634 mov $X[0], %rax
635 mul $OP # rdx:rax = %OP * [0]
636 mov %rax, (+$pDst_o+8*0)($pDst)
637 mov %rdx, $X[0]
638___
639for(my $i=1;$i<8;$i++) {
640$code.=<<___;
641 mov $X[$i], %rax
642 mul $OP # rdx:rax = %OP * [$i]
643 add %rax, $X[$i-1]
644 adc \$0, %rdx
645 mov %rdx, $X[$i]
646___
647}
648
649for(my $i=1;$i<8;$i++) {
650$code.=<<___;
651 mov (+8*$i)($pA), $OP
652___
653
654 &MULSTEP_512(\@X, "(+$pDst_o+8*$i)($pDst)", $pB, $OP, $TMP);
655 push(@X,shift(@X));
656}
657
658$code.=<<___;
659 mov $X[0], (+$pDst_o+8*8)($pDst)
660 mov $X[1], (+$pDst_o+8*9)($pDst)
661 mov $X[2], (+$pDst_o+8*10)($pDst)
662 mov $X[3], (+$pDst_o+8*11)($pDst)
663 mov $X[4], (+$pDst_o+8*12)($pDst)
664 mov $X[5], (+$pDst_o+8*13)($pDst)
665 mov $X[6], (+$pDst_o+8*14)($pDst)
666 mov $X[7], (+$pDst_o+8*15)($pDst)
667___
668}
669
670#
671# mont_mul_a3b : subroutine to compute (Src1 * Src2) % M (all 512-bits)
672# Input: src1: Address of source 1: rdi
673# src2: Address of source 2: rsi
674# Output: dst: Address of destination: [red_res_addr]
675# src2 and result also in: r9, r8, r15, r14, r13, r12, r11, r10
676# Temp: Clobbers [tmp16], all registers
677$code.=<<___;
678.type mont_mul_a3b,\@abi-omnipotent
679.align 16
680mont_mul_a3b:
681 _CET_ENDBR
682 #
683 # multiply tmp = src1 * src2
684 # For multiply: dst = rcx, src1 = rdi, src2 = rsi
685 # stack depth is extra 8 from call
686___
687 &MUL_512x512("%rsp+$tmp16_offset+8", "%rdi", "%rsi", [map("%r$_",(10..15,8..9))], "%rbp", "%rbx");
688$code.=<<___;
689 #
690 # Dst = tmp % m
691 # Call reduce(tmp, m, data, dst)
692
693 # tail recursion optimization: jmp to mont_reduce and return from there
694 jmp mont_reduce
695 # call mont_reduce
696 # ret
697.size mont_mul_a3b,.-mont_mul_a3b
698___
699}}}
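
MUL_512x512 above is a plain schoolbook product: each limb of the first operand is multiplied against all eight limbs of the second and accumulated with carry at the matching offset, and mont_mul_a3b then tail-jumps into mont_reduce to bring the 1024-bit product back below the modulus. The accumulation pattern, sketched in C with 64-bit limbs (an illustration, not the module's code):

    #include <stdint.h>

    /* Schoolbook 512x512 -> 1024-bit multiply on 64-bit limbs. */
    static void mul_512x512(uint64_t dst[16],
                            const uint64_t a[8], const uint64_t b[8])
    {
        for (int i = 0; i < 16; i++)
            dst[i] = 0;

        for (int i = 0; i < 8; i++) {
            uint64_t carry = 0;
            for (int j = 0; j < 8; j++) {
                unsigned __int128 t = (unsigned __int128)a[i] * b[j]
                                    + dst[i + j] + carry;
                dst[i + j] = (uint64_t)t;
                carry = (uint64_t)(t >> 64);
            }
            dst[i + 8] = carry;                 /* column i+8 is still empty */
        }
    }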
700
701{{{
702#SQR_512 MACRO pDest, pA, x7, x6, x5, x4, x3, x2, x1, x0, tmp*4
703#
704# Input in memory [pA] and also in x7...x0
705# Uses all argument registers plus rax and rdx
706#
707# This version computes all of the off-diagonal terms into memory,
708# and then it adds in the diagonal terms
709
710sub SQR_512
711{
712 my ($pDst, $pA, $x, $A, $tmp, $x7, $x6, $pDst_o)=@_;
713 my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
714 my @X=@$x; # make a copy
715$code.=<<___;
716 # ------------------
717 # first pass 01...07
718 # ------------------
719 mov $X[0], $A
720
721 mov $X[1],%rax
722 mul $A
723 mov %rax, (+$pDst_o+8*1)($pDst)
724___
725for(my $i=2;$i<8;$i++) {
726$code.=<<___;
727 mov %rdx, $X[$i-2]
728 mov $X[$i],%rax
729 mul $A
730 add %rax, $X[$i-2]
731 adc \$0, %rdx
732___
733}
734$code.=<<___;
735 mov %rdx, $x7
736
737 mov $X[0], (+$pDst_o+8*2)($pDst)
738
739 # ------------------
740 # second pass 12...17
741 # ------------------
742
743 mov (+8*1)($pA), $A
744
745 mov (+8*2)($pA),%rax
746 mul $A
747 add %rax, $X[1]
748 adc \$0, %rdx
749 mov $X[1], (+$pDst_o+8*3)($pDst)
750
751 mov %rdx, $X[0]
752 mov (+8*3)($pA),%rax
753 mul $A
754 add %rax, $X[2]
755 adc \$0, %rdx
756 add $X[0], $X[2]
757 adc \$0, %rdx
758 mov $X[2], (+$pDst_o+8*4)($pDst)
759
760 mov %rdx, $X[0]
761 mov (+8*4)($pA),%rax
762 mul $A
763 add %rax, $X[3]
764 adc \$0, %rdx
765 add $X[0], $X[3]
766 adc \$0, %rdx
767
768 mov %rdx, $X[0]
769 mov (+8*5)($pA),%rax
770 mul $A
771 add %rax, $X[4]
772 adc \$0, %rdx
773 add $X[0], $X[4]
774 adc \$0, %rdx
775
776 mov %rdx, $X[0]
777 mov $X[6],%rax
778 mul $A
779 add %rax, $X[5]
780 adc \$0, %rdx
781 add $X[0], $X[5]
782 adc \$0, %rdx
783
784 mov %rdx, $X[0]
785 mov $X[7],%rax
786 mul $A
787 add %rax, $x7
788 adc \$0, %rdx
789 add $X[0], $x7
790 adc \$0, %rdx
791
792 mov %rdx, $X[1]
793
794 # ------------------
795 # third pass 23...27
796 # ------------------
797 mov (+8*2)($pA), $A
798
799 mov (+8*3)($pA),%rax
800 mul $A
801 add %rax, $X[3]
802 adc \$0, %rdx
803 mov $X[3], (+$pDst_o+8*5)($pDst)
804
805 mov %rdx, $X[0]
806 mov (+8*4)($pA),%rax
807 mul $A
808 add %rax, $X[4]
809 adc \$0, %rdx
810 add $X[0], $X[4]
811 adc \$0, %rdx
812 mov $X[4], (+$pDst_o+8*6)($pDst)
813
814 mov %rdx, $X[0]
815 mov (+8*5)($pA),%rax
816 mul $A
817 add %rax, $X[5]
818 adc \$0, %rdx
819 add $X[0], $X[5]
820 adc \$0, %rdx
821
822 mov %rdx, $X[0]
823 mov $X[6],%rax
824 mul $A
825 add %rax, $x7
826 adc \$0, %rdx
827 add $X[0], $x7
828 adc \$0, %rdx
829
830 mov %rdx, $X[0]
831 mov $X[7],%rax
832 mul $A
833 add %rax, $X[1]
834 adc \$0, %rdx
835 add $X[0], $X[1]
836 adc \$0, %rdx
837
838 mov %rdx, $X[2]
839
840 # ------------------
841 # fourth pass 34...37
842 # ------------------
843
844 mov (+8*3)($pA), $A
845
846 mov (+8*4)($pA),%rax
847 mul $A
848 add %rax, $X[5]
849 adc \$0, %rdx
850 mov $X[5], (+$pDst_o+8*7)($pDst)
851
852 mov %rdx, $X[0]
853 mov (+8*5)($pA),%rax
854 mul $A
855 add %rax, $x7
856 adc \$0, %rdx
857 add $X[0], $x7
858 adc \$0, %rdx
859 mov $x7, (+$pDst_o+8*8)($pDst)
860
861 mov %rdx, $X[0]
862 mov $X[6],%rax
863 mul $A
864 add %rax, $X[1]
865 adc \$0, %rdx
866 add $X[0], $X[1]
867 adc \$0, %rdx
868
869 mov %rdx, $X[0]
870 mov $X[7],%rax
871 mul $A
872 add %rax, $X[2]
873 adc \$0, %rdx
874 add $X[0], $X[2]
875 adc \$0, %rdx
876
877 mov %rdx, $X[5]
878
879 # ------------------
880 # fifth pass 45...47
881 # ------------------
882 mov (+8*4)($pA), $A
883
884 mov (+8*5)($pA),%rax
885 mul $A
886 add %rax, $X[1]
887 adc \$0, %rdx
888 mov $X[1], (+$pDst_o+8*9)($pDst)
889
890 mov %rdx, $X[0]
891 mov $X[6],%rax
892 mul $A
893 add %rax, $X[2]
894 adc \$0, %rdx
895 add $X[0], $X[2]
896 adc \$0, %rdx
897 mov $X[2], (+$pDst_o+8*10)($pDst)
898
899 mov %rdx, $X[0]
900 mov $X[7],%rax
901 mul $A
902 add %rax, $X[5]
903 adc \$0, %rdx
904 add $X[0], $X[5]
905 adc \$0, %rdx
906
907 mov %rdx, $X[1]
908
909 # ------------------
910 # sixth pass 56...57
911 # ------------------
912 mov (+8*5)($pA), $A
913
914 mov $X[6],%rax
915 mul $A
916 add %rax, $X[5]
917 adc \$0, %rdx
918 mov $X[5], (+$pDst_o+8*11)($pDst)
919
920 mov %rdx, $X[0]
921 mov $X[7],%rax
922 mul $A
923 add %rax, $X[1]
924 adc \$0, %rdx
925 add $X[0], $X[1]
926 adc \$0, %rdx
927 mov $X[1], (+$pDst_o+8*12)($pDst)
928
929 mov %rdx, $X[2]
930
931 # ------------------
932 # seventh pass 67
933 # ------------------
934 mov $X[6], $A
935
936 mov $X[7],%rax
937 mul $A
938 add %rax, $X[2]
939 adc \$0, %rdx
940 mov $X[2], (+$pDst_o+8*13)($pDst)
941
942 mov %rdx, (+$pDst_o+8*14)($pDst)
943
944 # start finalize (add in squares, and double off-terms)
945 mov (+$pDst_o+8*1)($pDst), $X[0]
946 mov (+$pDst_o+8*2)($pDst), $X[1]
947 mov (+$pDst_o+8*3)($pDst), $X[2]
948 mov (+$pDst_o+8*4)($pDst), $X[3]
949 mov (+$pDst_o+8*5)($pDst), $X[4]
950 mov (+$pDst_o+8*6)($pDst), $X[5]
951
952 mov (+8*3)($pA), %rax
953 mul %rax
954 mov %rax, $x6
955 mov %rdx, $X[6]
956
957 add $X[0], $X[0]
958 adc $X[1], $X[1]
959 adc $X[2], $X[2]
960 adc $X[3], $X[3]
961 adc $X[4], $X[4]
962 adc $X[5], $X[5]
963 adc \$0, $X[6]
964
965 mov (+8*0)($pA), %rax
966 mul %rax
967 mov %rax, (+$pDst_o+8*0)($pDst)
968 mov %rdx, $A
969
970 mov (+8*1)($pA), %rax
971 mul %rax
972
973 add $A, $X[0]
974 adc %rax, $X[1]
975 adc \$0, %rdx
976
977 mov %rdx, $A
978 mov $X[0], (+$pDst_o+8*1)($pDst)
979 mov $X[1], (+$pDst_o+8*2)($pDst)
980
981 mov (+8*2)($pA), %rax
982 mul %rax
983
984 add $A, $X[2]
985 adc %rax, $X[3]
986 adc \$0, %rdx
987
988 mov %rdx, $A
989
990 mov $X[2], (+$pDst_o+8*3)($pDst)
991 mov $X[3], (+$pDst_o+8*4)($pDst)
992
993 xor $tmp, $tmp
994 add $A, $X[4]
995 adc $x6, $X[5]
996 adc \$0, $tmp
997
998 mov $X[4], (+$pDst_o+8*5)($pDst)
999 mov $X[5], (+$pDst_o+8*6)($pDst)
1000
1001	# $tmp has 0/1 in column 7
1002	# $X[6] has a full value in column 7
1003
1004 mov (+$pDst_o+8*7)($pDst), $X[0]
1005 mov (+$pDst_o+8*8)($pDst), $X[1]
1006 mov (+$pDst_o+8*9)($pDst), $X[2]
1007 mov (+$pDst_o+8*10)($pDst), $X[3]
1008 mov (+$pDst_o+8*11)($pDst), $X[4]
1009 mov (+$pDst_o+8*12)($pDst), $X[5]
1010 mov (+$pDst_o+8*13)($pDst), $x6
1011 mov (+$pDst_o+8*14)($pDst), $x7
1012
1013 mov $X[7], %rax
1014 mul %rax
1015 mov %rax, $X[7]
1016 mov %rdx, $A
1017
1018 add $X[0], $X[0]
1019 adc $X[1], $X[1]
1020 adc $X[2], $X[2]
1021 adc $X[3], $X[3]
1022 adc $X[4], $X[4]
1023 adc $X[5], $X[5]
1024 adc $x6, $x6
1025 adc $x7, $x7
1026 adc \$0, $A
1027
1028 add $tmp, $X[0]
1029
1030 mov (+8*4)($pA), %rax
1031 mul %rax
1032
1033 add $X[6], $X[0]
1034 adc %rax, $X[1]
1035 adc \$0, %rdx
1036
1037 mov %rdx, $tmp
1038
1039 mov $X[0], (+$pDst_o+8*7)($pDst)
1040 mov $X[1], (+$pDst_o+8*8)($pDst)
1041
1042 mov (+8*5)($pA), %rax
1043 mul %rax
1044
1045 add $tmp, $X[2]
1046 adc %rax, $X[3]
1047 adc \$0, %rdx
1048
1049 mov %rdx, $tmp
1050
1051 mov $X[2], (+$pDst_o+8*9)($pDst)
1052 mov $X[3], (+$pDst_o+8*10)($pDst)
1053
1054 mov (+8*6)($pA), %rax
1055 mul %rax
1056
1057 add $tmp, $X[4]
1058 adc %rax, $X[5]
1059 adc \$0, %rdx
1060
1061 mov $X[4], (+$pDst_o+8*11)($pDst)
1062 mov $X[5], (+$pDst_o+8*12)($pDst)
1063
1064 add %rdx, $x6
1065 adc $X[7], $x7
1066 adc \$0, $A
1067
1068 mov $x6, (+$pDst_o+8*13)($pDst)
1069 mov $x7, (+$pDst_o+8*14)($pDst)
1070 mov $A, (+$pDst_o+8*15)($pDst)
1071___
1072}
1073
1074#
1075# sqr_reduce: subroutine to compute Result = reduce(Result * Result)
1076#
1077# input and result also in: r9, r8, r15, r14, r13, r12, r11, r10
1078#
1079$code.=<<___;
1080.type sqr_reduce,\@abi-omnipotent
1081.align 16
1082sqr_reduce:
1083 _CET_ENDBR
1084 mov (+$pResult_offset+8)(%rsp), %rcx
1085___
1086 &SQR_512("%rsp+$tmp16_offset+8", "%rcx", [map("%r$_",(10..15,8..9))], "%rbx", "%rbp", "%rsi", "%rdi");
1087$code.=<<___;
1088 # tail recursion optimization: jmp to mont_reduce and return from there
1089 jmp mont_reduce
1090 # call mont_reduce
1091 # ret
1092.size sqr_reduce,.-sqr_reduce
1093___
1094}}}
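
SQR_512 above trades multiplications for additions by exploiting symmetry: the off-diagonal products a[i]*a[j] with i < j are formed once and stored, the partial result is doubled, and the diagonal squares a[i]^2 are added in last. The same schedule as an illustrative C sketch, not a transcription of the unrolled code:

    #include <stdint.h>

    /* 512-bit squaring: off-diagonal terms once, doubled, plus diagonal. */
    static void sqr_512(uint64_t dst[16], const uint64_t a[8])
    {
        uint64_t carry;

        for (int i = 0; i < 16; i++)
            dst[i] = 0;

        /* off-diagonal products a[i]*a[j], i < j */
        for (int i = 0; i < 8; i++) {
            carry = 0;
            for (int j = i + 1; j < 8; j++) {
                unsigned __int128 t = (unsigned __int128)a[i] * a[j]
                                    + dst[i + j] + carry;
                dst[i + j] = (uint64_t)t;
                carry = (uint64_t)(t >> 64);
            }
            dst[i + 8] = carry;
        }

        /* double the off-diagonal sum */
        carry = 0;
        for (int i = 0; i < 16; i++) {
            uint64_t top = dst[i] >> 63;
            dst[i] = (dst[i] << 1) | carry;
            carry = top;
        }

        /* add the squares a[i]^2 on the diagonal */
        carry = 0;
        for (int i = 0; i < 8; i++) {
            unsigned __int128 s = (unsigned __int128)a[i] * a[i];
            unsigned __int128 lo = (unsigned __int128)dst[2 * i]
                                 + (uint64_t)s + carry;
            unsigned __int128 hi = (unsigned __int128)dst[2 * i + 1]
                                 + (uint64_t)(s >> 64) + (uint64_t)(lo >> 64);
            dst[2 * i] = (uint64_t)lo;
            dst[2 * i + 1] = (uint64_t)hi;
            carry = (uint64_t)(hi >> 64);
        }
    }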
1095
1096#
1097# MAIN FUNCTION
1098#
1099
1100#mod_exp_512(UINT64 *result, /* 512 bits, 8 qwords */
1101# UINT64 *g, /* 512 bits, 8 qwords */
1102# UINT64 *exp, /* 512 bits, 8 qwords */
1103# struct mod_ctx_512 *data)
1104
1105# window size = 5
1106# table size = 2^5 = 32
1107#table_entries equ 32
1108#table_size equ table_entries * 8
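
The comments above fix the window at 5 bits, giving a 32-entry table of powers of g in Montgomery form; the code below fills that table (init_loop), then walks the exponent from the top, doing a run of squarings followed by one multiply with the entry selected by the next 5 exponent bits (the very top of the exponent is handled slightly differently, to seed the result and align the bit index). The schedule as a toy C routine over a 64-bit modulus, not the 512-bit Montgomery routine itself:

    #include <stdint.h>

    static uint64_t mulmod(uint64_t a, uint64_t b, uint64_t m)
    {
        return (uint64_t)((unsigned __int128)a * b % m);
    }

    /* Fixed 5-bit-window exponentiation: g^e mod m.  The 64-bit exponent
     * splits into a 4-bit top window plus twelve full 5-bit windows. */
    static uint64_t exp_win5(uint64_t g, uint64_t e, uint64_t m)
    {
        uint64_t tab[32];

        tab[0] = 1 % m;
        for (int i = 1; i < 32; i++)
            tab[i] = mulmod(tab[i - 1], g, m);

        uint64_t r = tab[e >> 60];              /* seed from the top bits */
        for (int bit = 55; bit >= 0; bit -= 5) {
            for (int i = 0; i < 5; i++)
                r = mulmod(r, r, m);            /* five squarings */
            r = mulmod(r, tab[(e >> bit) & 0x1f], m);
        }
        return r;
    }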
1109$code.=<<___;
1110.globl mod_exp_512
1111.type mod_exp_512,\@function,4
1112mod_exp_512:
1113 _CET_ENDBR
1114 push %rbp
1115 push %rbx
1116 push %r12
1117 push %r13
1118 push %r14
1119 push %r15
1120
1121 # adjust stack down and then align it with cache boundary
1122 mov %rsp, %r8
1123 sub \$$mem_size, %rsp
1124 and \$-64, %rsp
1125
1126 # store previous stack pointer and arguments
1127 mov %r8, (+$rsp_offset)(%rsp)
1128 mov %rdi, (+$pResult_offset)(%rsp)
1129 mov %rsi, (+$pG_offset)(%rsp)
1130 mov %rcx, (+$pData_offset)(%rsp)
1131.Lbody:
1132 # transform g into montgomery space
1133 # GT = reduce(g * C2) = reduce(g * (2^256))
1134 # reduce expects to have the input in [tmp16]
1135 pxor %xmm4, %xmm4
1136 movdqu (+16*0)(%rsi), %xmm0
1137 movdqu (+16*1)(%rsi), %xmm1
1138 movdqu (+16*2)(%rsi), %xmm2
1139 movdqu (+16*3)(%rsi), %xmm3
1140 movdqa %xmm4, (+$tmp16_offset+16*0)(%rsp)
1141 movdqa %xmm4, (+$tmp16_offset+16*1)(%rsp)
1142 movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp)
1143 movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp)
1144 movdqa %xmm0, (+$tmp16_offset+16*2)(%rsp)
1145 movdqa %xmm1, (+$tmp16_offset+16*3)(%rsp)
1146 movdqa %xmm2, (+$tmp16_offset+16*4)(%rsp)
1147 movdqa %xmm3, (+$tmp16_offset+16*5)(%rsp)
1148
1149 # load pExp before rdx gets blown away
1150 movdqu (+16*0)(%rdx), %xmm0
1151 movdqu (+16*1)(%rdx), %xmm1
1152 movdqu (+16*2)(%rdx), %xmm2
1153 movdqu (+16*3)(%rdx), %xmm3
1154
1155 lea (+$GT_offset)(%rsp), %rbx
1156 mov %rbx, (+$red_result_addr_offset)(%rsp)
1157 call mont_reduce
1158
1159 # Initialize tmp = C
1160 lea (+$tmp_offset)(%rsp), %rcx
1161 xor %rax, %rax
1162 mov %rax, (+8*0)(%rcx)
1163 mov %rax, (+8*1)(%rcx)
1164 mov %rax, (+8*3)(%rcx)
1165 mov %rax, (+8*4)(%rcx)
1166 mov %rax, (+8*5)(%rcx)
1167 mov %rax, (+8*6)(%rcx)
1168 mov %rax, (+8*7)(%rcx)
1169 mov %rax, (+$exp_offset+8*8)(%rsp)
1170 movq \$1, (+8*2)(%rcx)
1171
1172 lea (+$garray_offset)(%rsp), %rbp
1173 mov %rcx, %rsi # pTmp
1174 mov %rbp, %rdi # Garray[][0]
1175___
1176
1177 &swizzle("%rdi", "%rcx", "%rax", "%rbx");
1178
1179 # for (rax = 31; rax != 0; rax--) {
1180 # tmp = reduce(tmp * G)
1181 # swizzle(pg, tmp);
1182 # pg += 2; }
1183$code.=<<___;
1184 mov \$31, %rax
1185 mov %rax, (+$i_offset)(%rsp)
1186 mov %rbp, (+$pg_offset)(%rsp)
1187 # rsi -> pTmp
1188 mov %rsi, (+$red_result_addr_offset)(%rsp)
1189 mov (+8*0)(%rsi), %r10
1190 mov (+8*1)(%rsi), %r11
1191 mov (+8*2)(%rsi), %r12
1192 mov (+8*3)(%rsi), %r13
1193 mov (+8*4)(%rsi), %r14
1194 mov (+8*5)(%rsi), %r15
1195 mov (+8*6)(%rsi), %r8
1196 mov (+8*7)(%rsi), %r9
1197init_loop:
1198 lea (+$GT_offset)(%rsp), %rdi
1199 call mont_mul_a3b
1200 lea (+$tmp_offset)(%rsp), %rsi
1201 mov (+$pg_offset)(%rsp), %rbp
1202 add \$2, %rbp
1203 mov %rbp, (+$pg_offset)(%rsp)
1204 mov %rsi, %rcx # rcx = rsi = addr of tmp
1205___
1206
1207 &swizzle("%rbp", "%rcx", "%rax", "%rbx");
1208$code.=<<___;
1209 mov (+$i_offset)(%rsp), %rax
1210 sub \$1, %rax
1211 mov %rax, (+$i_offset)(%rsp)
1212 jne init_loop
1213
1214 #
1215 # Copy exponent onto stack
1216 movdqa %xmm0, (+$exp_offset+16*0)(%rsp)
1217 movdqa %xmm1, (+$exp_offset+16*1)(%rsp)
1218 movdqa %xmm2, (+$exp_offset+16*2)(%rsp)
1219 movdqa %xmm3, (+$exp_offset+16*3)(%rsp)
1220
1221
1222 #
1223 # Do exponentiation
1224 # Initialize result to G[exp{511:507}]
1225 mov (+$exp_offset+62)(%rsp), %eax
1226 mov %rax, %rdx
1227 shr \$11, %rax
1228 and \$0x07FF, %edx
1229 mov %edx, (+$exp_offset+62)(%rsp)
1230 lea (+$garray_offset)(%rsp,%rax,2), %rsi
1231 mov (+$pResult_offset)(%rsp), %rdx
1232___
1233
1234 &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
1235
1236 #
1237 # Loop variables
1238 # rcx = [loop_idx] = index: 510-5 to 0 by 5
1239$code.=<<___;
1240 movq \$505, (+$loop_idx_offset)(%rsp)
1241
1242 mov (+$pResult_offset)(%rsp), %rcx
1243 mov %rcx, (+$red_result_addr_offset)(%rsp)
1244 mov (+8*0)(%rcx), %r10
1245 mov (+8*1)(%rcx), %r11
1246 mov (+8*2)(%rcx), %r12
1247 mov (+8*3)(%rcx), %r13
1248 mov (+8*4)(%rcx), %r14
1249 mov (+8*5)(%rcx), %r15
1250 mov (+8*6)(%rcx), %r8
1251 mov (+8*7)(%rcx), %r9
1252 jmp sqr_2
1253
1254main_loop_a3b:
1255 call sqr_reduce
1256 call sqr_reduce
1257 call sqr_reduce
1258sqr_2:
1259 call sqr_reduce
1260 call sqr_reduce
1261
1262 #
1263 # Do multiply, first look up proper value in Garray
1264 mov (+$loop_idx_offset)(%rsp), %rcx # bit index
1265 mov %rcx, %rax
1266 shr \$4, %rax # rax is word pointer
1267 mov (+$exp_offset)(%rsp,%rax,2), %edx
1268 and \$15, %rcx
1269 shrq %cl, %rdx
1270 and \$0x1F, %rdx
1271
1272 lea (+$garray_offset)(%rsp,%rdx,2), %rsi
1273 lea (+$tmp_offset)(%rsp), %rdx
1274 mov %rdx, %rdi
1275___
1276
1277 &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
1278 # rdi = tmp = pG
1279
1280 #
1281 # Call mod_mul_a1(pDst, pSrc1, pSrc2, pM, pData)
1282 # result result pG M Data
1283$code.=<<___;
1284 mov (+$pResult_offset)(%rsp), %rsi
1285 call mont_mul_a3b
1286
1287 #
1288 # finish loop
1289 mov (+$loop_idx_offset)(%rsp), %rcx
1290 sub \$5, %rcx
1291 mov %rcx, (+$loop_idx_offset)(%rsp)
1292 jge main_loop_a3b
1293
1294 #
1295
1296end_main_loop_a3b:
1297 # transform result out of Montgomery space
1298 # result = reduce(result)
1299 mov (+$pResult_offset)(%rsp), %rdx
1300 pxor %xmm4, %xmm4
1301 movdqu (+16*0)(%rdx), %xmm0
1302 movdqu (+16*1)(%rdx), %xmm1
1303 movdqu (+16*2)(%rdx), %xmm2
1304 movdqu (+16*3)(%rdx), %xmm3
1305 movdqa %xmm4, (+$tmp16_offset+16*4)(%rsp)
1306 movdqa %xmm4, (+$tmp16_offset+16*5)(%rsp)
1307 movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp)
1308 movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp)
1309 movdqa %xmm0, (+$tmp16_offset+16*0)(%rsp)
1310 movdqa %xmm1, (+$tmp16_offset+16*1)(%rsp)
1311 movdqa %xmm2, (+$tmp16_offset+16*2)(%rsp)
1312 movdqa %xmm3, (+$tmp16_offset+16*3)(%rsp)
1313 call mont_reduce
1314
1315 # If result > m, subtract m
1316 # load result into r15:r8
1317 mov (+$pResult_offset)(%rsp), %rax
1318 mov (+8*0)(%rax), %r8
1319 mov (+8*1)(%rax), %r9
1320 mov (+8*2)(%rax), %r10
1321 mov (+8*3)(%rax), %r11
1322 mov (+8*4)(%rax), %r12
1323 mov (+8*5)(%rax), %r13
1324 mov (+8*6)(%rax), %r14
1325 mov (+8*7)(%rax), %r15
1326
1327 # subtract m
1328 mov (+$pData_offset)(%rsp), %rbx
1329 add \$$M, %rbx
1330
1331 sub (+8*0)(%rbx), %r8
1332 sbb (+8*1)(%rbx), %r9
1333 sbb (+8*2)(%rbx), %r10
1334 sbb (+8*3)(%rbx), %r11
1335 sbb (+8*4)(%rbx), %r12
1336 sbb (+8*5)(%rbx), %r13
1337 sbb (+8*6)(%rbx), %r14
1338 sbb (+8*7)(%rbx), %r15
1339
1340 # if Carry is clear, replace result with difference
1341 mov (+8*0)(%rax), %rsi
1342 mov (+8*1)(%rax), %rdi
1343 mov (+8*2)(%rax), %rcx
1344 mov (+8*3)(%rax), %rdx
1345 cmovnc %r8, %rsi
1346 cmovnc %r9, %rdi
1347 cmovnc %r10, %rcx
1348 cmovnc %r11, %rdx
1349 mov %rsi, (+8*0)(%rax)
1350 mov %rdi, (+8*1)(%rax)
1351 mov %rcx, (+8*2)(%rax)
1352 mov %rdx, (+8*3)(%rax)
1353
1354 mov (+8*4)(%rax), %rsi
1355 mov (+8*5)(%rax), %rdi
1356 mov (+8*6)(%rax), %rcx
1357 mov (+8*7)(%rax), %rdx
1358 cmovnc %r12, %rsi
1359 cmovnc %r13, %rdi
1360 cmovnc %r14, %rcx
1361 cmovnc %r15, %rdx
1362 mov %rsi, (+8*4)(%rax)
1363 mov %rdi, (+8*5)(%rax)
1364 mov %rcx, (+8*6)(%rax)
1365 mov %rdx, (+8*7)(%rax)
1366
1367 mov (+$rsp_offset)(%rsp), %rsi
1368 mov 0(%rsi),%r15
1369 mov 8(%rsi),%r14
1370 mov 16(%rsi),%r13
1371 mov 24(%rsi),%r12
1372 mov 32(%rsi),%rbx
1373 mov 40(%rsi),%rbp
1374 lea 48(%rsi),%rsp
1375.Lepilogue:
1376 ret
1377.size mod_exp_512, . - mod_exp_512
1378___
1379
1380sub reg_part {
1381my ($reg,$conv)=@_;
1382 if ($reg =~ /%r[0-9]+/) { $reg .= $conv; }
1383 elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
1384 elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; }
1385 elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; }
1386 return $reg;
1387}
1388
1389$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
1390$code =~ s/\`([^\`]*)\`/eval $1/gem;
1391$code =~ s/(\(\+[^)]+\))/eval $1/gem;
1392print $code;
1393close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/parisc-mont.pl b/src/lib/libcrypto/bn/asm/parisc-mont.pl
deleted file mode 100644
index 0c7aff93b9..0000000000
--- a/src/lib/libcrypto/bn/asm/parisc-mont.pl
+++ /dev/null
@@ -1,985 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# On PA-7100LC this module performs ~90-50% better, less for longer
11# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
12# that compiler utilized xmpyu instruction to perform 32x32=64-bit
13# multiplication, which in turn means that "baseline" performance was
14# optimal in respect to instruction set capabilities. Fair comparison
15# with vendor compiler is problematic, because OpenSSL doesn't define
16# BN_LLONG [presumably] for historical reasons, which drives compiler
17# toward 4 times 16x16=32-bit multiplications [plus complementary
18# shifts and additions] instead. This means that you should observe
19# several times improvement over code generated by vendor compiler
20# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
21# improvement coefficient was never collected on PA-7100LC, or any
22# other 1.1 CPU, because I don't have access to such machine with
23# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
24# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
25# of ~5x on PA-8600.
26#
27# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
28# reportedly ~2x faster than vendor compiler generated code [according
29# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
30# this implementation is actually 32-bit one, in the sense that it
31# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
32# 64-bit BN_LONGs... How do they interoperate then? No problem. This
33# module picks halves of 64-bit values in reverse order and pretends
34# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
35# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
36# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
37# i.e. there is no "wider" multiplication like on most other 64-bit
38# platforms. This means that even being effectively 32-bit, this
39# implementation performs "64-bit" computational task in same amount
40# of arithmetic operations, most notably multiplications. It requires
41# more memory references, most notably to tp[num], but this doesn't
42# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
43# 2.0 code path provides virtually same performance as pa-risc2[W].s:
44# it's ~10% better for shortest key length and ~10% worse for longest
45# one.
46#
47# In case it wasn't clear. The module has two distinct code paths:
48# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
49# additions and 64-bit integer loads, not to mention specific
50# instruction scheduling. In 64-bit build naturally only 2.0 code path
51# is assembled. In 32-bit application context both code paths are
52# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
53# is taken automatically. Also, in 32-bit build the module imposes
54# a couple of limitations: vector lengths have to be even and vector
55# addresses have to be 64-bit aligned. Normally neither is a problem:
56# most common key lengths are even and vectors are commonly malloc-ed,
57# which ensures alignment.
58#
59# Special thanks to polarhome.com for providing HP-UX account on
60# PA-RISC 1.1 machine, and to correspondent who chose to remain
61# anonymous for testing the code on PA-RISC 2.0 machine.
62
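
The practical consequence of the paragraph above is that a 64-bit limb product on PA-RISC always decomposes into four 32x32=64 partial products, so an effectively 32-bit core spends the same number of multiplications as a nominally 64-bit one. The decomposition in C, for illustration only (not code from the module below):

    #include <stdint.h>

    /* 64x64 -> 128-bit multiply built from four 32x32 -> 64 products. */
    static void mul64x64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
    {
        uint64_t a0 = (uint32_t)a, a1 = a >> 32;
        uint64_t b0 = (uint32_t)b, b1 = b >> 32;

        uint64_t p00 = a0 * b0;
        uint64_t p01 = a0 * b1;
        uint64_t p10 = a1 * b0;
        uint64_t p11 = a1 * b1;

        uint64_t mid = p01 + p10;               /* may wrap past 2^64 */
        uint64_t mid_carry = mid < p01;

        *lo = p00 + (mid << 32);
        uint64_t lo_carry = *lo < p00;

        *hi = p11 + (mid >> 32) + (mid_carry << 32) + lo_carry;
    }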
63$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
64
65$flavour = shift;
66$output = shift;
67
68open STDOUT,">$output";
69
70if ($flavour =~ /64/) {
71 $LEVEL ="2.0W";
72 $SIZE_T =8;
73 $FRAME_MARKER =80;
74 $SAVED_RP =16;
75 $PUSH ="std";
76 $PUSHMA ="std,ma";
77 $POP ="ldd";
78 $POPMB ="ldd,mb";
79 $BN_SZ =$SIZE_T;
80} else {
81 $LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
82 $SIZE_T =4;
83 $FRAME_MARKER =48;
84 $SAVED_RP =20;
85 $PUSH ="stw";
86 $PUSHMA ="stwm";
87 $POP ="ldw";
88 $POPMB ="ldwm";
89 $BN_SZ =$SIZE_T;
90}
91
92$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker
93 # [+ argument transfer]
94$LOCALS=$FRAME-$FRAME_MARKER;
95$FRAME+=32; # local variables
96
97$tp="%r31";
98$ti1="%r29";
99$ti0="%r28";
100
101$rp="%r26";
102$ap="%r25";
103$bp="%r24";
104$np="%r23";
105$n0="%r22"; # passed through stack in 32-bit
106$num="%r21"; # passed through stack in 32-bit
107$idx="%r20";
108$arrsz="%r19";
109
110$nm1="%r7";
111$nm0="%r6";
112$ab1="%r5";
113$ab0="%r4";
114
115$fp="%r3";
116$hi1="%r2";
117$hi0="%r1";
118
119$xfer=$n0; # accommodates [-16..15] offset in fld[dw]s
120
121$fm0="%fr4"; $fti=$fm0;
122$fbi="%fr5L";
123$fn0="%fr5R";
124$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
125$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";
126
127$code=<<___;
128 .LEVEL $LEVEL
129 .text
130
131 .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
132 .ALIGN 64
133bn_mul_mont
134 .PROC
135 .CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
136 .ENTRY
137 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
138 $PUSHMA %r3,$FRAME(%sp)
139 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
140 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
141 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
142 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
143 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
144 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
145 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
146 ldo -$FRAME(%sp),$fp
147___
148$code.=<<___ if ($SIZE_T==4);
149 ldw `-$FRAME_MARKER-4`($fp),$n0
150 ldw `-$FRAME_MARKER-8`($fp),$num
151 nop
152 nop ; alignment
153___
154$code.=<<___ if ($BN_SZ==4);
155 comiclr,<= 6,$num,%r0 ; are vectors long enough?
156 b L\$abort
157 ldi 0,%r28 ; signal "unhandled"
158 add,ev %r0,$num,$num ; is $num even?
159 b L\$abort
160 nop
161 or $ap,$np,$ti1
162 extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned?
163 b L\$abort
164 nop
165 nop ; alignment
166 nop
167
168 fldws 0($n0),${fn0}
169 fldws,ma 4($bp),${fbi} ; bp[0]
170___
171$code.=<<___ if ($BN_SZ==8);
172 comib,> 3,$num,L\$abort ; are vectors long enough?
173 ldi 0,%r28 ; signal "unhandled"
174 addl $num,$num,$num ; I operate on 32-bit values
175
176 fldws 4($n0),${fn0} ; only low part of n0
177 fldws 4($bp),${fbi} ; bp[0] in flipped word order
178___
179$code.=<<___;
180 fldds 0($ap),${fai} ; ap[0,1]
181 fldds 0($np),${fni} ; np[0,1]
182
183 sh2addl $num,%r0,$arrsz
184 ldi 31,$hi0
185 ldo 36($arrsz),$hi1 ; space for tp[num+1]
186 andcm $hi1,$hi0,$hi1 ; align
187 addl $hi1,%sp,%sp
188 $PUSH $fp,-$SIZE_T(%sp)
189
190 ldo `$LOCALS+16`($fp),$xfer
191 ldo `$LOCALS+32+4`($fp),$tp
192
193 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0]
194 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0]
195 xmpyu ${fn0},${fab0}R,${fm0}
196
197 addl $arrsz,$ap,$ap ; point at the end
198 addl $arrsz,$np,$np
199 subi 0,$arrsz,$idx ; j=0
200 ldo 8($idx),$idx ; j++++
201
202 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
203 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
204 fstds ${fab0},-16($xfer)
205 fstds ${fnm0},-8($xfer)
206 fstds ${fab1},0($xfer)
207 fstds ${fnm1},8($xfer)
208 flddx $idx($ap),${fai} ; ap[2,3]
209 flddx $idx($np),${fni} ; np[2,3]
210___
211$code.=<<___ if ($BN_SZ==4);
212#ifdef __LP64__
213 mtctl $hi0,%cr11 ; $hi0 still holds 31
214 extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0
215 b L\$parisc11
216 nop
217___
218$code.=<<___; # PA-RISC 2.0 code-path
219 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
220 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
221 ldd -16($xfer),$ab0
222 fstds ${fab0},-16($xfer)
223
224 extrd,u $ab0,31,32,$hi0
225 extrd,u $ab0,63,32,$ab0
226 ldd -8($xfer),$nm0
227 fstds ${fnm0},-8($xfer)
228 ldo 8($idx),$idx ; j++++
229 addl $ab0,$nm0,$nm0 ; low part is discarded
230 extrd,u $nm0,31,32,$hi1
231
232L\$1st
233 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
234 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
235 ldd 0($xfer),$ab1
236 fstds ${fab1},0($xfer)
237 addl $hi0,$ab1,$ab1
238 extrd,u $ab1,31,32,$hi0
239 ldd 8($xfer),$nm1
240 fstds ${fnm1},8($xfer)
241 extrd,u $ab1,63,32,$ab1
242 addl $hi1,$nm1,$nm1
243 flddx $idx($ap),${fai} ; ap[j,j+1]
244 flddx $idx($np),${fni} ; np[j,j+1]
245 addl $ab1,$nm1,$nm1
246 extrd,u $nm1,31,32,$hi1
247
248 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
249 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
250 ldd -16($xfer),$ab0
251 fstds ${fab0},-16($xfer)
252 addl $hi0,$ab0,$ab0
253 extrd,u $ab0,31,32,$hi0
254 ldd -8($xfer),$nm0
255 fstds ${fnm0},-8($xfer)
256 extrd,u $ab0,63,32,$ab0
257 addl $hi1,$nm0,$nm0
258 stw $nm1,-4($tp) ; tp[j-1]
259 addl $ab0,$nm0,$nm0
260 stw,ma $nm0,8($tp) ; tp[j-1]
261 addib,<> 8,$idx,L\$1st ; j++++
262 extrd,u $nm0,31,32,$hi1
263
264 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
265 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
266 ldd 0($xfer),$ab1
267 fstds ${fab1},0($xfer)
268 addl $hi0,$ab1,$ab1
269 extrd,u $ab1,31,32,$hi0
270 ldd 8($xfer),$nm1
271 fstds ${fnm1},8($xfer)
272 extrd,u $ab1,63,32,$ab1
273 addl $hi1,$nm1,$nm1
274 ldd -16($xfer),$ab0
275 addl $ab1,$nm1,$nm1
276 ldd -8($xfer),$nm0
277 extrd,u $nm1,31,32,$hi1
278
279 addl $hi0,$ab0,$ab0
280 extrd,u $ab0,31,32,$hi0
281 stw $nm1,-4($tp) ; tp[j-1]
282 extrd,u $ab0,63,32,$ab0
283 addl $hi1,$nm0,$nm0
284 ldd 0($xfer),$ab1
285 addl $ab0,$nm0,$nm0
286 ldd,mb 8($xfer),$nm1
287 extrd,u $nm0,31,32,$hi1
288 stw,ma $nm0,8($tp) ; tp[j-1]
289
290 ldo -1($num),$num ; i--
291 subi 0,$arrsz,$idx ; j=0
292___
293$code.=<<___ if ($BN_SZ==4);
294 fldws,ma 4($bp),${fbi} ; bp[1]
295___
296$code.=<<___ if ($BN_SZ==8);
297 fldws 0($bp),${fbi} ; bp[1] in flipped word order
298___
299$code.=<<___;
300 flddx $idx($ap),${fai} ; ap[0,1]
301 flddx $idx($np),${fni} ; np[0,1]
302 fldws 8($xfer),${fti}R ; tp[0]
303 addl $hi0,$ab1,$ab1
304 extrd,u $ab1,31,32,$hi0
305 extrd,u $ab1,63,32,$ab1
306 ldo 8($idx),$idx ; j++++
307 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
308 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
309 addl $hi1,$nm1,$nm1
310 addl $ab1,$nm1,$nm1
311 extrd,u $nm1,31,32,$hi1
312 fstws,mb ${fab0}L,-8($xfer) ; save high part
313 stw $nm1,-4($tp) ; tp[j-1]
314
315 fcpy,sgl %fr0,${fti}L ; zero high part
316 fcpy,sgl %fr0,${fab0}L
317 addl $hi1,$hi0,$hi0
318 extrd,u $hi0,31,32,$hi1
319 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
320 fcnvxf,dbl,dbl ${fab0},${fab0}
321 stw $hi0,0($tp)
322 stw $hi1,4($tp)
323
324 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
325 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
326 xmpyu ${fn0},${fab0}R,${fm0}
327 ldo `$LOCALS+32+4`($fp),$tp
328L\$outer
329 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
330 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
331 fstds ${fab0},-16($xfer) ; 33-bit value
332 fstds ${fnm0},-8($xfer)
333 flddx $idx($ap),${fai} ; ap[2]
334 flddx $idx($np),${fni} ; np[2]
335 ldo 8($idx),$idx ; j++++
336 ldd -16($xfer),$ab0 ; 33-bit value
337 ldd -8($xfer),$nm0
338 ldw 0($xfer),$hi0 ; high part
339
340 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
341 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
342 extrd,u $ab0,31,32,$ti0 ; carry bit
343 extrd,u $ab0,63,32,$ab0
344 fstds ${fab1},0($xfer)
345 addl $ti0,$hi0,$hi0 ; account carry bit
346 fstds ${fnm1},8($xfer)
347 addl $ab0,$nm0,$nm0 ; low part is discarded
348 ldw 0($tp),$ti1 ; tp[1]
349 extrd,u $nm0,31,32,$hi1
350 fstds ${fab0},-16($xfer)
351 fstds ${fnm0},-8($xfer)
352
353L\$inner
354 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
355 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
356 ldd 0($xfer),$ab1
357 fstds ${fab1},0($xfer)
358 addl $hi0,$ti1,$ti1
359 addl $ti1,$ab1,$ab1
360 ldd 8($xfer),$nm1
361 fstds ${fnm1},8($xfer)
362 extrd,u $ab1,31,32,$hi0
363 extrd,u $ab1,63,32,$ab1
364 flddx $idx($ap),${fai} ; ap[j,j+1]
365 flddx $idx($np),${fni} ; np[j,j+1]
366 addl $hi1,$nm1,$nm1
367 addl $ab1,$nm1,$nm1
368 ldw 4($tp),$ti0 ; tp[j]
369 stw $nm1,-4($tp) ; tp[j-1]
370
371 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
372 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
373 ldd -16($xfer),$ab0
374 fstds ${fab0},-16($xfer)
375 addl $hi0,$ti0,$ti0
376 addl $ti0,$ab0,$ab0
377 ldd -8($xfer),$nm0
378 fstds ${fnm0},-8($xfer)
379 extrd,u $ab0,31,32,$hi0
380 extrd,u $nm1,31,32,$hi1
381 ldw 8($tp),$ti1 ; tp[j]
382 extrd,u $ab0,63,32,$ab0
383 addl $hi1,$nm0,$nm0
384 addl $ab0,$nm0,$nm0
385 stw,ma $nm0,8($tp) ; tp[j-1]
386 addib,<> 8,$idx,L\$inner ; j++++
387 extrd,u $nm0,31,32,$hi1
388
389 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
390 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
391 ldd 0($xfer),$ab1
392 fstds ${fab1},0($xfer)
393 addl $hi0,$ti1,$ti1
394 addl $ti1,$ab1,$ab1
395 ldd 8($xfer),$nm1
396 fstds ${fnm1},8($xfer)
397 extrd,u $ab1,31,32,$hi0
398 extrd,u $ab1,63,32,$ab1
399 ldw 4($tp),$ti0 ; tp[j]
400 addl $hi1,$nm1,$nm1
401 addl $ab1,$nm1,$nm1
402 ldd -16($xfer),$ab0
403 ldd -8($xfer),$nm0
404 extrd,u $nm1,31,32,$hi1
405
406 addl $hi0,$ab0,$ab0
407 addl $ti0,$ab0,$ab0
408 stw $nm1,-4($tp) ; tp[j-1]
409 extrd,u $ab0,31,32,$hi0
410 ldw 8($tp),$ti1 ; tp[j]
411 extrd,u $ab0,63,32,$ab0
412 addl $hi1,$nm0,$nm0
413 ldd 0($xfer),$ab1
414 addl $ab0,$nm0,$nm0
415 ldd,mb 8($xfer),$nm1
416 extrd,u $nm0,31,32,$hi1
417 stw,ma $nm0,8($tp) ; tp[j-1]
418
419 addib,= -1,$num,L\$outerdone ; i--
420 subi 0,$arrsz,$idx ; j=0
421___
422$code.=<<___ if ($BN_SZ==4);
423 fldws,ma 4($bp),${fbi} ; bp[i]
424___
425$code.=<<___ if ($BN_SZ==8);
426 ldi 12,$ti0 ; bp[i] in flipped word order
427 addl,ev %r0,$num,$num
428 ldi -4,$ti0
429 addl $ti0,$bp,$bp
430 fldws 0($bp),${fbi}
431___
432$code.=<<___;
433 flddx $idx($ap),${fai} ; ap[0]
434 addl $hi0,$ab1,$ab1
435 flddx $idx($np),${fni} ; np[0]
436 fldws 8($xfer),${fti}R ; tp[0]
437 addl $ti1,$ab1,$ab1
438 extrd,u $ab1,31,32,$hi0
439 extrd,u $ab1,63,32,$ab1
440
441 ldo 8($idx),$idx ; j++++
442 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
443 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
444 ldw 4($tp),$ti0 ; tp[j]
445
446 addl $hi1,$nm1,$nm1
447 fstws,mb ${fab0}L,-8($xfer) ; save high part
448 addl $ab1,$nm1,$nm1
449 extrd,u $nm1,31,32,$hi1
450 fcpy,sgl %fr0,${fti}L ; zero high part
451 fcpy,sgl %fr0,${fab0}L
452 stw $nm1,-4($tp) ; tp[j-1]
453
454 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
455 fcnvxf,dbl,dbl ${fab0},${fab0}
456 addl $hi1,$hi0,$hi0
457 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
458 addl $ti0,$hi0,$hi0
459 extrd,u $hi0,31,32,$hi1
460 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
461 stw $hi0,0($tp)
462 stw $hi1,4($tp)
463 xmpyu ${fn0},${fab0}R,${fm0}
464
465 b L\$outer
466 ldo `$LOCALS+32+4`($fp),$tp
467
468L\$outerdone
469 addl $hi0,$ab1,$ab1
470 addl $ti1,$ab1,$ab1
471 extrd,u $ab1,31,32,$hi0
472 extrd,u $ab1,63,32,$ab1
473
474 ldw 4($tp),$ti0 ; tp[j]
475
476 addl $hi1,$nm1,$nm1
477 addl $ab1,$nm1,$nm1
478 extrd,u $nm1,31,32,$hi1
479 stw $nm1,-4($tp) ; tp[j-1]
480
481 addl $hi1,$hi0,$hi0
482 addl $ti0,$hi0,$hi0
483 extrd,u $hi0,31,32,$hi1
484 stw $hi0,0($tp)
485 stw $hi1,4($tp)
486
487 ldo `$LOCALS+32`($fp),$tp
488 sub %r0,%r0,%r0 ; clear borrow
489___
490$code.=<<___ if ($BN_SZ==4);
491 ldws,ma 4($tp),$ti0
492 extru,= $rp,31,3,%r0 ; is rp 64-bit aligned?
493 b L\$sub_pa11
494 addl $tp,$arrsz,$tp
495L\$sub
496 ldwx $idx($np),$hi0
497 subb $ti0,$hi0,$hi1
498 ldwx $idx($tp),$ti0
499 addib,<> 4,$idx,L\$sub
500 stws,ma $hi1,4($rp)
501
502 subb $ti0,%r0,$hi1
503 ldo -4($tp),$tp
504___
505$code.=<<___ if ($BN_SZ==8);
506 ldd,ma 8($tp),$ti0
507L\$sub
508 ldd $idx($np),$hi0
509 shrpd $ti0,$ti0,32,$ti0 ; flip word order
510 std $ti0,-8($tp) ; save flipped value
511 sub,db $ti0,$hi0,$hi1
512 ldd,ma 8($tp),$ti0
513 addib,<> 8,$idx,L\$sub
514 std,ma $hi1,8($rp)
515
516 extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
517 sub,db $ti0,%r0,$hi1
518 ldo -8($tp),$tp
519___
520$code.=<<___;
521 and $tp,$hi1,$ap
522 andcm $rp,$hi1,$bp
523 or $ap,$bp,$np
524
525 sub $rp,$arrsz,$rp ; rewind rp
526 subi 0,$arrsz,$idx
527 ldo `$LOCALS+32`($fp),$tp
528L\$copy
529 ldd $idx($np),$hi0
530 std,ma %r0,8($tp)
531 addib,<> 8,$idx,.-8 ; L\$copy
532 std,ma $hi0,8($rp)
533___
534
535if ($BN_SZ==4) { # PA-RISC 1.1 code-path
536$ablo=$ab0;
537$abhi=$ab1;
538$nmlo0=$nm0;
539$nmhi0=$nm1;
540$nmlo1="%r9";
541$nmhi1="%r8";
542
543$code.=<<___;
544 b L\$done
545 nop
546
547 .ALIGN 8
548L\$parisc11
549#endif
550 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
551 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
552 ldw -12($xfer),$ablo
553 ldw -16($xfer),$hi0
554 ldw -4($xfer),$nmlo0
555 ldw -8($xfer),$nmhi0
556 fstds ${fab0},-16($xfer)
557 fstds ${fnm0},-8($xfer)
558
559 ldo 8($idx),$idx ; j++++
560 add $ablo,$nmlo0,$nmlo0 ; discarded
561 addc %r0,$nmhi0,$hi1
562 ldw 4($xfer),$ablo
563 ldw 0($xfer),$abhi
564 nop
565
566L\$1st_pa11
567 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
568 flddx $idx($ap),${fai} ; ap[j,j+1]
569 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
570 flddx $idx($np),${fni} ; np[j,j+1]
571 add $hi0,$ablo,$ablo
572 ldw 12($xfer),$nmlo1
573 addc %r0,$abhi,$hi0
574 ldw 8($xfer),$nmhi1
575 add $ablo,$nmlo1,$nmlo1
576 fstds ${fab1},0($xfer)
577 addc %r0,$nmhi1,$nmhi1
578 fstds ${fnm1},8($xfer)
579 add $hi1,$nmlo1,$nmlo1
580 ldw -12($xfer),$ablo
581 addc %r0,$nmhi1,$hi1
582 ldw -16($xfer),$abhi
583
584 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
585 ldw -4($xfer),$nmlo0
586 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
587 ldw -8($xfer),$nmhi0
588 add $hi0,$ablo,$ablo
589 stw $nmlo1,-4($tp) ; tp[j-1]
590 addc %r0,$abhi,$hi0
591 fstds ${fab0},-16($xfer)
592 add $ablo,$nmlo0,$nmlo0
593 fstds ${fnm0},-8($xfer)
594 addc %r0,$nmhi0,$nmhi0
595 ldw 0($xfer),$abhi
596 add $hi1,$nmlo0,$nmlo0
597 ldw 4($xfer),$ablo
598 stws,ma $nmlo0,8($tp) ; tp[j-1]
599 addib,<> 8,$idx,L\$1st_pa11 ; j++++
600 addc %r0,$nmhi0,$hi1
601
602 ldw 8($xfer),$nmhi1
603 ldw 12($xfer),$nmlo1
604 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
605 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
606 add $hi0,$ablo,$ablo
607 fstds ${fab1},0($xfer)
608 addc %r0,$abhi,$hi0
609 fstds ${fnm1},8($xfer)
610 add $ablo,$nmlo1,$nmlo1
611 ldw -16($xfer),$abhi
612 addc %r0,$nmhi1,$nmhi1
613 ldw -12($xfer),$ablo
614 add $hi1,$nmlo1,$nmlo1
615 ldw -8($xfer),$nmhi0
616 addc %r0,$nmhi1,$hi1
617 ldw -4($xfer),$nmlo0
618
619 add $hi0,$ablo,$ablo
620 stw $nmlo1,-4($tp) ; tp[j-1]
621 addc %r0,$abhi,$hi0
622 ldw 0($xfer),$abhi
623 add $ablo,$nmlo0,$nmlo0
624 ldw 4($xfer),$ablo
625 addc %r0,$nmhi0,$nmhi0
626 ldws,mb 8($xfer),$nmhi1
627 add $hi1,$nmlo0,$nmlo0
628 ldw 4($xfer),$nmlo1
629 addc %r0,$nmhi0,$hi1
630 stws,ma $nmlo0,8($tp) ; tp[j-1]
631
632 ldo -1($num),$num ; i--
633 subi 0,$arrsz,$idx ; j=0
634
635 fldws,ma 4($bp),${fbi} ; bp[1]
636 flddx $idx($ap),${fai} ; ap[0,1]
637 flddx $idx($np),${fni} ; np[0,1]
638 fldws 8($xfer),${fti}R ; tp[0]
639 add $hi0,$ablo,$ablo
640 addc %r0,$abhi,$hi0
641 ldo 8($idx),$idx ; j++++
642 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
643 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
644 add $hi1,$nmlo1,$nmlo1
645 addc %r0,$nmhi1,$nmhi1
646 add $ablo,$nmlo1,$nmlo1
647 addc %r0,$nmhi1,$hi1
648 fstws,mb ${fab0}L,-8($xfer) ; save high part
649 stw $nmlo1,-4($tp) ; tp[j-1]
650
651 fcpy,sgl %fr0,${fti}L ; zero high part
652 fcpy,sgl %fr0,${fab0}L
653 add $hi1,$hi0,$hi0
654 addc %r0,%r0,$hi1
655 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
656 fcnvxf,dbl,dbl ${fab0},${fab0}
657 stw $hi0,0($tp)
658 stw $hi1,4($tp)
659
660 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
661 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
662 xmpyu ${fn0},${fab0}R,${fm0}
663 ldo `$LOCALS+32+4`($fp),$tp
664L\$outer_pa11
665 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
666 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
667 fstds ${fab0},-16($xfer) ; 33-bit value
668 fstds ${fnm0},-8($xfer)
669 flddx $idx($ap),${fai} ; ap[2,3]
670 flddx $idx($np),${fni} ; np[2,3]
671 ldw -16($xfer),$abhi ; carry bit actually
672 ldo 8($idx),$idx ; j++++
673 ldw -12($xfer),$ablo
674 ldw -8($xfer),$nmhi0
675 ldw -4($xfer),$nmlo0
676 ldw 0($xfer),$hi0 ; high part
677
678 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
679 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
680 fstds ${fab1},0($xfer)
681 addl $abhi,$hi0,$hi0 ; account carry bit
682 fstds ${fnm1},8($xfer)
683 add $ablo,$nmlo0,$nmlo0 ; discarded
684 ldw 0($tp),$ti1 ; tp[1]
685 addc %r0,$nmhi0,$hi1
686 fstds ${fab0},-16($xfer)
687 fstds ${fnm0},-8($xfer)
688 ldw 4($xfer),$ablo
689 ldw 0($xfer),$abhi
690
691L\$inner_pa11
692 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
693 flddx $idx($ap),${fai} ; ap[j,j+1]
694 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
695 flddx $idx($np),${fni} ; np[j,j+1]
696 add $hi0,$ablo,$ablo
697 ldw 4($tp),$ti0 ; tp[j]
698 addc %r0,$abhi,$abhi
699 ldw 12($xfer),$nmlo1
700 add $ti1,$ablo,$ablo
701 ldw 8($xfer),$nmhi1
702 addc %r0,$abhi,$hi0
703 fstds ${fab1},0($xfer)
704 add $ablo,$nmlo1,$nmlo1
705 fstds ${fnm1},8($xfer)
706 addc %r0,$nmhi1,$nmhi1
707 ldw -12($xfer),$ablo
708 add $hi1,$nmlo1,$nmlo1
709 ldw -16($xfer),$abhi
710 addc %r0,$nmhi1,$hi1
711
712 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
713 ldw 8($tp),$ti1 ; tp[j]
714 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
715 ldw -4($xfer),$nmlo0
716 add $hi0,$ablo,$ablo
717 ldw -8($xfer),$nmhi0
718 addc %r0,$abhi,$abhi
719 stw $nmlo1,-4($tp) ; tp[j-1]
720 add $ti0,$ablo,$ablo
721 fstds ${fab0},-16($xfer)
722 addc %r0,$abhi,$hi0
723 fstds ${fnm0},-8($xfer)
724 add $ablo,$nmlo0,$nmlo0
725 ldw 4($xfer),$ablo
726 addc %r0,$nmhi0,$nmhi0
727 ldw 0($xfer),$abhi
728 add $hi1,$nmlo0,$nmlo0
729 stws,ma $nmlo0,8($tp) ; tp[j-1]
730 addib,<> 8,$idx,L\$inner_pa11 ; j++++
731 addc %r0,$nmhi0,$hi1
732
733 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
734 ldw 12($xfer),$nmlo1
735 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
736 ldw 8($xfer),$nmhi1
737 add $hi0,$ablo,$ablo
738 ldw 4($tp),$ti0 ; tp[j]
739 addc %r0,$abhi,$abhi
740 fstds ${fab1},0($xfer)
741 add $ti1,$ablo,$ablo
742 fstds ${fnm1},8($xfer)
743 addc %r0,$abhi,$hi0
744 ldw -16($xfer),$abhi
745 add $ablo,$nmlo1,$nmlo1
746 ldw -12($xfer),$ablo
747 addc %r0,$nmhi1,$nmhi1
748 ldw -8($xfer),$nmhi0
749 add $hi1,$nmlo1,$nmlo1
750 ldw -4($xfer),$nmlo0
751 addc %r0,$nmhi1,$hi1
752
753 add $hi0,$ablo,$ablo
754 stw $nmlo1,-4($tp) ; tp[j-1]
755 addc %r0,$abhi,$abhi
756 add $ti0,$ablo,$ablo
757 ldw 8($tp),$ti1 ; tp[j]
758 addc %r0,$abhi,$hi0
759 ldw 0($xfer),$abhi
760 add $ablo,$nmlo0,$nmlo0
761 ldw 4($xfer),$ablo
762 addc %r0,$nmhi0,$nmhi0
763 ldws,mb 8($xfer),$nmhi1
764 add $hi1,$nmlo0,$nmlo0
765 ldw 4($xfer),$nmlo1
766 addc %r0,$nmhi0,$hi1
767 stws,ma $nmlo0,8($tp) ; tp[j-1]
768
769 addib,= -1,$num,L\$outerdone_pa11; i--
770 subi 0,$arrsz,$idx ; j=0
771
772 fldws,ma 4($bp),${fbi} ; bp[i]
773 flddx $idx($ap),${fai} ; ap[0]
774 add $hi0,$ablo,$ablo
775 addc %r0,$abhi,$abhi
776 flddx $idx($np),${fni} ; np[0]
777 fldws 8($xfer),${fti}R ; tp[0]
778 add $ti1,$ablo,$ablo
779 addc %r0,$abhi,$hi0
780
781 ldo 8($idx),$idx ; j++++
782 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
783 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
784 ldw 4($tp),$ti0 ; tp[j]
785
786 add $hi1,$nmlo1,$nmlo1
787 addc %r0,$nmhi1,$nmhi1
788 fstws,mb ${fab0}L,-8($xfer) ; save high part
789 add $ablo,$nmlo1,$nmlo1
790 addc %r0,$nmhi1,$hi1
791 fcpy,sgl %fr0,${fti}L ; zero high part
792 fcpy,sgl %fr0,${fab0}L
793 stw $nmlo1,-4($tp) ; tp[j-1]
794
795 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
796 fcnvxf,dbl,dbl ${fab0},${fab0}
797 add $hi1,$hi0,$hi0
798 addc %r0,%r0,$hi1
799 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
800 add $ti0,$hi0,$hi0
801 addc %r0,$hi1,$hi1
802 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
803 stw $hi0,0($tp)
804 stw $hi1,4($tp)
805 xmpyu ${fn0},${fab0}R,${fm0}
806
807 b L\$outer_pa11
808 ldo `$LOCALS+32+4`($fp),$tp
809
810L\$outerdone_pa11
811 add $hi0,$ablo,$ablo
812 addc %r0,$abhi,$abhi
813 add $ti1,$ablo,$ablo
814 addc %r0,$abhi,$hi0
815
816 ldw 4($tp),$ti0 ; tp[j]
817
818 add $hi1,$nmlo1,$nmlo1
819 addc %r0,$nmhi1,$nmhi1
820 add $ablo,$nmlo1,$nmlo1
821 addc %r0,$nmhi1,$hi1
822 stw $nmlo1,-4($tp) ; tp[j-1]
823
824 add $hi1,$hi0,$hi0
825 addc %r0,%r0,$hi1
826 add $ti0,$hi0,$hi0
827 addc %r0,$hi1,$hi1
828 stw $hi0,0($tp)
829 stw $hi1,4($tp)
830
831 ldo `$LOCALS+32+4`($fp),$tp
832 sub %r0,%r0,%r0 ; clear borrow
833 ldw -4($tp),$ti0
834 addl $tp,$arrsz,$tp
835L\$sub_pa11
836 ldwx $idx($np),$hi0
837 subb $ti0,$hi0,$hi1
838 ldwx $idx($tp),$ti0
839 addib,<> 4,$idx,L\$sub_pa11
840 stws,ma $hi1,4($rp)
841
842 subb $ti0,%r0,$hi1
843 ldo -4($tp),$tp
844 and $tp,$hi1,$ap
845 andcm $rp,$hi1,$bp
846 or $ap,$bp,$np
847
848 sub $rp,$arrsz,$rp ; rewind rp
849 subi 0,$arrsz,$idx
850 ldo `$LOCALS+32`($fp),$tp
851L\$copy_pa11
852 ldwx $idx($np),$hi0
853 stws,ma %r0,4($tp)
854 addib,<> 4,$idx,L\$copy_pa11
855 stws,ma $hi0,4($rp)
856
857 nop ; alignment
858L\$done
859___
860}
861
862$code.=<<___;
863 ldi 1,%r28 ; signal "handled"
864 ldo $FRAME($fp),%sp ; destroy tp[num+1]
865
866 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
867 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
868 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
869 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
870 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
871 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
872 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
873 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
874L\$abort
875 bv (%r2)
876 .EXIT
877 $POPMB -$FRAME(%sp),%r3
878 .PROCEND
879___
880
881# Explicitly encode PA-RISC 2.0 instructions used in this module, so
882# that it can be compiled with .LEVEL 1.0. It should be noted that I
883# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
884# directive...
885
886my $ldd = sub {
887 my ($mod,$args) = @_;
888 my $orig = "ldd$mod\t$args";
889
890 if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
891 { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
892 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
893 }
894 elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
895 { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
896 $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
897 $opcode|=(1<<5) if ($mod =~ /^,m/);
898 $opcode|=(1<<13) if ($mod =~ /^,mb/);
899 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
900 }
901 else { "\t".$orig; }
902};
903
904my $std = sub {
905 my ($mod,$args) = @_;
906 my $orig = "std$mod\t$args";
907
908 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6
909 { my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
910 $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset
911 $opcode|=(1<<5) if ($mod =~ /^,m/);
912 $opcode|=(1<<13) if ($mod =~ /^,mb/);
913 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
914 }
915 else { "\t".$orig; }
916};
917
918my $extrd = sub {
919 my ($mod,$args) = @_;
920 my $orig = "extrd$mod\t$args";
921
922 # I only have ",u" completer, it's implicitly encoded...
923 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
924 { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
925 my $len=32-$3;
926 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
927 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
928 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
929 }
930 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
931 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
932 my $len=32-$2;
933 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
934 $opcode |= (1<<13) if ($mod =~ /,\**=/);
935 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
936 }
937 else { "\t".$orig; }
938};
939
940my $shrpd = sub {
941 my ($mod,$args) = @_;
942 my $orig = "shrpd$mod\t$args";
943
944 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
945 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
946 my $cpos=63-$3;
947 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
948 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
949 }
950 else { "\t".$orig; }
951};
952
953my $sub = sub {
954 my ($mod,$args) = @_;
955 my $orig = "sub$mod\t$args";
956
957 if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
958 my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
959 $opcode|=(1<<10); # e1
960 $opcode|=(1<<8); # e2
961 $opcode|=(1<<5); # d
962 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
963 }
964 else { "\t".$orig; }
965};
966
967sub assemble {
968 my ($mnemonic,$mod,$args)=@_;
969 my $opcode = eval("\$$mnemonic");
970
971 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
972}
973
974foreach (split("\n",$code)) {
975 s/\`([^\`]*)\`/eval $1/ge;
976 # flip word order in 64-bit mode...
977 s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
978 # assemble 2.0 instructions in 32-bit mode...
979 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
980
981 s/\bbv\b/bve/gm if ($SIZE_T==8);
982
983 print $_,"\n";
984}
985close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/ppc-mont.pl b/src/lib/libcrypto/bn/asm/ppc-mont.pl
deleted file mode 100644
index 68320a87f7..0000000000
--- a/src/lib/libcrypto/bn/asm/ppc-mont.pl
+++ /dev/null
@@ -1,329 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# April 2006
11
12# "Teaser" Montgomery multiplication module for PowerPC. It's possible
13# to gain a bit more by modulo-scheduling outer loop, then dedicated
14# squaring procedure should give further 20% and code can be adapted
15# for 32-bit application running on 64-bit CPU. As for the latter,
16# it won't be able to achieve "native" 64-bit performance, because in
17# 32-bit application context every addc instruction will have to be
18# expanded as addc, twice right shift by 32 and finally adde, etc.
19# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
20# for 64-bit application running on PPC970/G5 is:
21#
22# 512-bit +65%
23# 1024-bit +35%
24# 2048-bit +18%
25# 4096-bit +4%
26
27$flavour = shift;
28
29if ($flavour =~ /32/) {
30 $BITS= 32;
31 $BNSZ= $BITS/8;
32 $SIZE_T=4;
33 $RZONE= 224;
34
35 $LD= "lwz"; # load
36 $LDU= "lwzu"; # load and update
37 $LDX= "lwzx"; # load indexed
38 $ST= "stw"; # store
39 $STU= "stwu"; # store and update
40 $STX= "stwx"; # store indexed
41 $STUX= "stwux"; # store indexed and update
42 $UMULL= "mullw"; # unsigned multiply low
43 $UMULH= "mulhwu"; # unsigned multiply high
44 $UCMP= "cmplw"; # unsigned compare
45 $SHRI= "srwi"; # unsigned shift right by immediate
46 $PUSH= $ST;
47 $POP= $LD;
48} elsif ($flavour =~ /64/) {
49 $BITS= 64;
50 $BNSZ= $BITS/8;
51 $SIZE_T=8;
52 $RZONE= 288;
53
54 # same as above, but 64-bit mnemonics...
55 $LD= "ld"; # load
56 $LDU= "ldu"; # load and update
57 $LDX= "ldx"; # load indexed
58 $ST= "std"; # store
59 $STU= "stdu"; # store and update
60 $STX= "stdx"; # store indexed
61 $STUX= "stdux"; # store indexed and update
62 $UMULL= "mulld"; # unsigned multiply low
63 $UMULH= "mulhdu"; # unsigned multiply high
64 $UCMP= "cmpld"; # unsigned compare
65 $SHRI= "srdi"; # unsigned shift right by immediate
66 $PUSH= $ST;
67 $POP= $LD;
68} else { die "nonsense $flavour"; }
69
70$FRAME=8*$SIZE_T+$RZONE;
71$LOCALS=8*$SIZE_T;
72
73$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
74( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
75( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
76die "can't locate ppc-xlate.pl";
77
78open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
79
80$sp="r1";
81$toc="r2";
82$rp="r3"; $ovf="r3";
83$ap="r4";
84$bp="r5";
85$np="r6";
86$n0="r7";
87$num="r8";
88$rp="r9"; # $rp is reassigned
89$aj="r10";
90$nj="r11";
91$tj="r12";
92# non-volatile registers
93$i="r20";
94$j="r21";
95$tp="r22";
96$m0="r23";
97$m1="r24";
98$lo0="r25";
99$hi0="r26";
100$lo1="r27";
101$hi1="r28";
102$alo="r29";
103$ahi="r30";
104$nlo="r31";
105#
106$nhi="r0";
107
108$code=<<___;
109.machine "any"
110.text
111
112.globl .bn_mul_mont
113.align 4
114.bn_mul_mont:
115 cmpwi $num,4
116 mr $rp,r3 ; $rp is reassigned
117 li r3,0
118 bltlr
119___
120$code.=<<___ if ($BNSZ==4);
121 cmpwi $num,32 ; longer key performance is not better
122 bgelr
123___
124$code.=<<___;
125 slwi $num,$num,`log($BNSZ)/log(2)`
126 li $tj,-4096
127 addi $ovf,$num,$FRAME
128 subf $ovf,$ovf,$sp ; $sp-$ovf
129 and $ovf,$ovf,$tj ; minimize TLB usage
130 subf $ovf,$sp,$ovf ; $ovf-$sp
131 mr $tj,$sp
132 srwi $num,$num,`log($BNSZ)/log(2)`
133 $STUX $sp,$sp,$ovf
134
135 $PUSH r20,`-12*$SIZE_T`($tj)
136 $PUSH r21,`-11*$SIZE_T`($tj)
137 $PUSH r22,`-10*$SIZE_T`($tj)
138 $PUSH r23,`-9*$SIZE_T`($tj)
139 $PUSH r24,`-8*$SIZE_T`($tj)
140 $PUSH r25,`-7*$SIZE_T`($tj)
141 $PUSH r26,`-6*$SIZE_T`($tj)
142 $PUSH r27,`-5*$SIZE_T`($tj)
143 $PUSH r28,`-4*$SIZE_T`($tj)
144 $PUSH r29,`-3*$SIZE_T`($tj)
145 $PUSH r30,`-2*$SIZE_T`($tj)
146 $PUSH r31,`-1*$SIZE_T`($tj)
147
148 $LD $n0,0($n0) ; pull n0[0] value
149 addi $num,$num,-2 ; adjust $num for counter register
150
151 $LD $m0,0($bp) ; m0=bp[0]
152 $LD $aj,0($ap) ; ap[0]
153 addi $tp,$sp,$LOCALS
154 $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
155 $UMULH $hi0,$aj,$m0
156
157 $LD $aj,$BNSZ($ap) ; ap[1]
158 $LD $nj,0($np) ; np[0]
159
160 $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0
161
162 $UMULL $alo,$aj,$m0 ; ap[1]*bp[0]
163 $UMULH $ahi,$aj,$m0
164
165 $UMULL $lo1,$nj,$m1 ; np[0]*m1
166 $UMULH $hi1,$nj,$m1
167 $LD $nj,$BNSZ($np) ; np[1]
168 addc $lo1,$lo1,$lo0
169 addze $hi1,$hi1
170
171 $UMULL $nlo,$nj,$m1 ; np[1]*m1
172 $UMULH $nhi,$nj,$m1
173
174 mtctr $num
175 li $j,`2*$BNSZ`
176.align 4
177L1st:
178 $LDX $aj,$ap,$j ; ap[j]
179 addc $lo0,$alo,$hi0
180 $LDX $nj,$np,$j ; np[j]
181 addze $hi0,$ahi
182 $UMULL $alo,$aj,$m0 ; ap[j]*bp[0]
183 addc $lo1,$nlo,$hi1
184 $UMULH $ahi,$aj,$m0
185 addze $hi1,$nhi
186 $UMULL $nlo,$nj,$m1 ; np[j]*m1
187 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
188 $UMULH $nhi,$nj,$m1
189 addze $hi1,$hi1
190 $ST $lo1,0($tp) ; tp[j-1]
191
192 addi $j,$j,$BNSZ ; j++
193 addi $tp,$tp,$BNSZ ; tp++
194 bdnz- L1st
195;L1st
196 addc $lo0,$alo,$hi0
197 addze $hi0,$ahi
198
199 addc $lo1,$nlo,$hi1
200 addze $hi1,$nhi
201 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
202 addze $hi1,$hi1
203 $ST $lo1,0($tp) ; tp[j-1]
204
205 li $ovf,0
206 addc $hi1,$hi1,$hi0
207 addze $ovf,$ovf ; upmost overflow bit
208 $ST $hi1,$BNSZ($tp)
209
210 li $i,$BNSZ
211.align 4
212Louter:
213 $LDX $m0,$bp,$i ; m0=bp[i]
214 $LD $aj,0($ap) ; ap[0]
215 addi $tp,$sp,$LOCALS
216 $LD $tj,$LOCALS($sp); tp[0]
217 $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
218 $UMULH $hi0,$aj,$m0
219 $LD $aj,$BNSZ($ap) ; ap[1]
220 $LD $nj,0($np) ; np[0]
221 addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0]
222 $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
223 addze $hi0,$hi0
224 $UMULL $m1,$lo0,$n0 ; tp[0]*n0
225 $UMULH $ahi,$aj,$m0
226 $UMULL $lo1,$nj,$m1 ; np[0]*m1
227 $UMULH $hi1,$nj,$m1
228 $LD $nj,$BNSZ($np) ; np[1]
229 addc $lo1,$lo1,$lo0
230 $UMULL $nlo,$nj,$m1 ; np[1]*m1
231 addze $hi1,$hi1
232 $UMULH $nhi,$nj,$m1
233
234 mtctr $num
235 li $j,`2*$BNSZ`
236.align 4
237Linner:
238 $LDX $aj,$ap,$j ; ap[j]
239 addc $lo0,$alo,$hi0
240 $LD $tj,$BNSZ($tp) ; tp[j]
241 addze $hi0,$ahi
242 $LDX $nj,$np,$j ; np[j]
243 addc $lo1,$nlo,$hi1
244 $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
245 addze $hi1,$nhi
246 $UMULH $ahi,$aj,$m0
247 addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
248 $UMULL $nlo,$nj,$m1 ; np[j]*m1
249 addze $hi0,$hi0
250 $UMULH $nhi,$nj,$m1
251 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
252 addi $j,$j,$BNSZ ; j++
253 addze $hi1,$hi1
254 $ST $lo1,0($tp) ; tp[j-1]
255 addi $tp,$tp,$BNSZ ; tp++
256 bdnz- Linner
257;Linner
258 $LD $tj,$BNSZ($tp) ; tp[j]
259 addc $lo0,$alo,$hi0
260 addze $hi0,$ahi
261 addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
262 addze $hi0,$hi0
263
264 addc $lo1,$nlo,$hi1
265 addze $hi1,$nhi
266 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
267 addze $hi1,$hi1
268 $ST $lo1,0($tp) ; tp[j-1]
269
270 addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA]
271 li $ovf,0
272 adde $hi1,$hi1,$hi0
273 addze $ovf,$ovf
274 $ST $hi1,$BNSZ($tp)
275;
276 slwi $tj,$num,`log($BNSZ)/log(2)`
277 $UCMP $i,$tj
278 addi $i,$i,$BNSZ
279 ble- Louter
280
281 addi $num,$num,2 ; restore $num
282 subfc $j,$j,$j ; j=0 and "clear" XER[CA]
283 addi $tp,$sp,$LOCALS
284 mtctr $num
285
286.align 4
287Lsub: $LDX $tj,$tp,$j
288 $LDX $nj,$np,$j
289 subfe $aj,$nj,$tj ; tp[j]-np[j]
290 $STX $aj,$rp,$j
291 addi $j,$j,$BNSZ
292 bdnz- Lsub
293
294 li $j,0
295 mtctr $num
296 subfe $ovf,$j,$ovf ; handle upmost overflow bit
297 and $ap,$tp,$ovf
298 andc $np,$rp,$ovf
299 or $ap,$ap,$np ; ap=borrow?tp:rp
300
301.align 4
302Lcopy: ; copy or in-place refresh
303 $LDX $tj,$ap,$j
304 $STX $tj,$rp,$j
305 $STX $j,$tp,$j ; zap at once
306 addi $j,$j,$BNSZ
307 bdnz- Lcopy
308
309 $POP $tj,0($sp)
310 li r3,1
311 $POP r20,`-12*$SIZE_T`($tj)
312 $POP r21,`-11*$SIZE_T`($tj)
313 $POP r22,`-10*$SIZE_T`($tj)
314 $POP r23,`-9*$SIZE_T`($tj)
315 $POP r24,`-8*$SIZE_T`($tj)
316 $POP r25,`-7*$SIZE_T`($tj)
317 $POP r26,`-6*$SIZE_T`($tj)
318 $POP r27,`-5*$SIZE_T`($tj)
319 $POP r28,`-4*$SIZE_T`($tj)
320 $POP r29,`-3*$SIZE_T`($tj)
321 $POP r30,`-2*$SIZE_T`($tj)
322 $POP r31,`-1*$SIZE_T`($tj)
323 mr $sp,$tj
324 blr
325___
326
327$code =~ s/\`([^\`]*)\`/eval $1/gem;
328print $code;
329close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/ppc.pl b/src/lib/libcrypto/bn/asm/ppc.pl
deleted file mode 100644
index c9b7f9477d..0000000000
--- a/src/lib/libcrypto/bn/asm/ppc.pl
+++ /dev/null
@@ -1,1968 +0,0 @@
1#!/usr/bin/env perl
2#
3# Implemented as a Perl wrapper as we want to support several different
4# architectures with a single file. We pick up the target based on the
5# file name we are asked to generate.
6#
7# It should be noted though that this perl code is nothing like
8# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
9# as pre-processor to cover for platform differences in name decoration,
10# linker tables, 32-/64-bit instruction sets...
11#
12# As you might know, there are several PowerPC ABIs in use. Most notably
13# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
14# are similar enough to implement leaf(!) functions, which would be ABI
15# neutral. And that's what you find here: ABI neutral leaf functions.
16# In case you wonder what that is...
17#
18# AIX performance
19#
20# MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
21#
22# The following is the performance of 32-bit compiler
23# generated code:
24#
25# OpenSSL 0.9.6c 21 dec 2001
26# built on: Tue Jun 11 11:06:51 EDT 2002
27# options:bn(64,32) ...
28#compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
29# sign verify sign/s verify/s
30#rsa 512 bits 0.0098s 0.0009s 102.0 1170.6
31#rsa 1024 bits 0.0507s 0.0026s 19.7 387.5
32#rsa 2048 bits 0.3036s 0.0085s 3.3 117.1
33#rsa 4096 bits 2.0040s 0.0299s 0.5 33.4
34#dsa 512 bits 0.0087s 0.0106s 114.3 94.5
35#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
36#
37# Same benchmark with this assembler code:
38#
39#rsa 512 bits 0.0056s 0.0005s 178.6 2049.2
40#rsa 1024 bits 0.0283s 0.0015s 35.3 674.1
41#rsa 2048 bits 0.1744s 0.0050s 5.7 201.2
42#rsa 4096 bits 1.1644s 0.0179s 0.9 55.7
43#dsa 512 bits 0.0052s 0.0062s 191.6 162.0
44#dsa 1024 bits 0.0149s 0.0180s 67.0 55.5
45#
46# Number of operations increases by almost 75%
47#
48# Here are performance numbers for 64-bit compiler
49# generated code:
50#
51# OpenSSL 0.9.6g [engine] 9 Aug 2002
52# built on: Fri Apr 18 16:59:20 EDT 2003
53# options:bn(64,64) ...
54# compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
55# sign verify sign/s verify/s
56#rsa 512 bits 0.0028s 0.0003s 357.1 3844.4
57#rsa 1024 bits 0.0148s 0.0008s 67.5 1239.7
58#rsa 2048 bits 0.0963s 0.0028s 10.4 353.0
59#rsa 4096 bits 0.6538s 0.0102s 1.5 98.1
60#dsa 512 bits 0.0026s 0.0032s 382.5 313.7
61#dsa 1024 bits 0.0081s 0.0099s 122.8 100.6
62#
63# Same benchmark with this assembler code:
64#
65#rsa 512 bits 0.0020s 0.0002s 510.4 6273.7
66#rsa 1024 bits 0.0088s 0.0005s 114.1 2128.3
67#rsa 2048 bits 0.0540s 0.0016s 18.5 622.5
68#rsa 4096 bits 0.3700s 0.0058s 2.7 171.0
69#dsa 512 bits 0.0016s 0.0020s 610.7 507.1
70#dsa 1024 bits 0.0047s 0.0058s 212.5 173.2
71#
72# Again, performance increases by about 75%
73#
74# Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
75# OpenSSL 0.9.7c 30 Sep 2003
76#
77# Original code.
78#
79#rsa 512 bits 0.0011s 0.0001s 906.1 11012.5
80#rsa 1024 bits 0.0060s 0.0003s 166.6 3363.1
81#rsa 2048 bits 0.0370s 0.0010s 27.1 982.4
82#rsa 4096 bits 0.2426s 0.0036s 4.1 280.4
83#dsa 512 bits 0.0010s 0.0012s 1038.1 841.5
84#dsa 1024 bits 0.0030s 0.0037s 329.6 269.7
85#dsa 2048 bits 0.0101s 0.0127s 98.9 78.6
86#
87# Same benchmark with this assembler code:
88#
89#rsa 512 bits 0.0007s 0.0001s 1416.2 16645.9
90#rsa 1024 bits 0.0036s 0.0002s 274.4 5380.6
91#rsa 2048 bits 0.0222s 0.0006s 45.1 1589.5
92#rsa 4096 bits 0.1469s 0.0022s 6.8 449.6
93#dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2
94#dsa 1024 bits 0.0018s 0.0023s 545.0 442.2
95#dsa 2048 bits 0.0061s 0.0075s 163.5 132.8
96#
97# Performance increase of ~60%
98#
99# If you have comments or suggestions to improve code send
100# me a note at schari@us.ibm.com
101#
102
103$flavour = shift;
104
105if ($flavour =~ /32/) {
106 $BITS= 32;
107 $BNSZ= $BITS/8;
108 $ISA= "\"ppc\"";
109
110 $LD= "lwz"; # load
111 $LDU= "lwzu"; # load and update
112 $ST= "stw"; # store
113 $STU= "stwu"; # store and update
114 $UMULL= "mullw"; # unsigned multiply low
115 $UMULH= "mulhwu"; # unsigned multiply high
116 $UDIV= "divwu"; # unsigned divide
117 $UCMPI= "cmplwi"; # unsigned compare with immediate
118 $UCMP= "cmplw"; # unsigned compare
119 $CNTLZ= "cntlzw"; # count leading zeros
120 $SHL= "slw"; # shift left
121 $SHR= "srw"; # unsigned shift right
122 $SHRI= "srwi"; # unsigned shift right by immediate
123 $SHLI= "slwi"; # shift left by immediate
124 $CLRU= "clrlwi"; # clear upper bits
125 $INSR= "insrwi"; # insert right
126 $ROTL= "rotlwi"; # rotate left by immediate
127 $TR= "tw"; # conditional trap
128} elsif ($flavour =~ /64/) {
129 $BITS= 64;
130 $BNSZ= $BITS/8;
131 $ISA= "\"ppc64\"";
132
133 # same as above, but 64-bit mnemonics...
134 $LD= "ld"; # load
135 $LDU= "ldu"; # load and update
136 $ST= "std"; # store
137 $STU= "stdu"; # store and update
138 $UMULL= "mulld"; # unsigned multiply low
139 $UMULH= "mulhdu"; # unsigned multiply high
140 $UDIV= "divdu"; # unsigned divide
141 $UCMPI= "cmpldi"; # unsigned compare with immediate
142 $UCMP= "cmpld"; # unsigned compare
143 $CNTLZ= "cntlzd"; # count leading zeros
144 $SHL= "sld"; # shift left
145 $SHR= "srd"; # unsigned shift right
146 $SHRI= "srdi"; # unsigned shift right by immediate
147 $SHLI= "sldi"; # shift left by immediate
148 $CLRU= "clrldi"; # clear upper bits
149 $INSR= "insrdi"; # insert right
150 $ROTL= "rotldi"; # rotate left by immediate
151 $TR= "td"; # conditional trap
152} else { die "nonsense $flavour"; }
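
The mnemonic table above, together with the backtick substitution at the bottom of the file, is the whole trick: the code templates are written once in terms of $LD, $UMULL, $BNSZ and friends, and the chosen flavour turns them into concrete 32- or 64-bit assembly. A stripped-down, standalone sketch of that flow (illustrative only; the heredoc contents are made up):

#!/usr/bin/env perl
# Minimal model of the flavour-driven macro expansion used in this file.
use strict;
use warnings;

my $flavour = shift || "linux32";
my ($LD, $UMULL, $BNSZ) = $flavour =~ /64/
    ? ("ld",  "mulld", 8)          # 64-bit mnemonics, 8-byte BN_ULONG
    : ("lwz", "mullw", 4);         # 32-bit mnemonics, 4-byte BN_ULONG

my $data = <<EOF;
	$LD	r5,`1*$BNSZ`(r4)
	$UMULL	r9,r5,r5
EOF

# same trick as the real script: evaluate the backticked constant expressions
$data =~ s/\`([^\`]*)\`/eval $1/gem;
print $data;            # a 32-bit flavour prints "lwz r5,4(r4)" / "mullw r9,r5,r5"
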
153
154$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
155( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
156( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
157die "can't locate ppc-xlate.pl";
158
159open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
160
161$data=<<EOF;
162#--------------------------------------------------------------------
163#
164#
165#
166#
167# File: ppc32.s
168#
169# Created by: Suresh Chari
170# IBM Thomas J. Watson Research Library
171# Hawthorne, NY
172#
173#
174# Description: Optimized assembly routines for OpenSSL crypto
175# on the 32-bit PowerPC platform.
176#
177#
178# Version History
179#
180# 2. Fixed bn_add,bn_sub and bn_div_words, added comments,
181# cleaned up code. Also made a single version which can
182# be used for both the AIX and Linux compilers. See NOTE
183# below.
184# 12/05/03 Suresh Chari
185# (with lots of help from) Andy Polyakov
186##
187# 1. Initial version 10/20/02 Suresh Chari
188#
189#
190# The following file works for the xlc,cc
191# and gcc compilers.
192#
193# NOTE: To get the file to link correctly with the gcc compiler
194# you have to change the names of the routines and remove
195# the first .(dot) character. This should automatically
196# be done in the build process.
197#
198# Hand optimized assembly code for the following routines
199#
200# bn_sqr_comba4
201# bn_sqr_comba8
202# bn_mul_comba4
203# bn_mul_comba8
204# bn_sub_words
205# bn_add_words
206# bn_div_words
207# bn_sqr_words
208# bn_mul_words
209# bn_mul_add_words
210#
211# NOTE: It is possible to optimize this code more for
212# specific PowerPC or Power architectures. On the Northstar
213# architecture the optimizations in this file do
214# NOT provide much improvement.
215#
216# If you have comments or suggestions to improve code send
217# me a note at schari\@us.ibm.com
218#
219#--------------------------------------------------------------------------
220#
221# Defines to be used in the assembly code.
222#
223#.set r0,0 # we use it as storage for value of 0
224#.set SP,1 # preserved
225#.set RTOC,2 # preserved
226#.set r3,3 # 1st argument/return value
227#.set r4,4 # 2nd argument/volatile register
228#.set r5,5 # 3rd argument/volatile register
229#.set r6,6 # ...
230#.set r7,7
231#.set r8,8
232#.set r9,9
233#.set r10,10
234#.set r11,11
235#.set r12,12
236#.set r13,13 # not used, nor any other "below" it...
237
238# Declare function names to be global
239# NOTE: For gcc these names MUST be changed to remove
240# the first . i.e. for example change ".bn_sqr_comba4"
241# to "bn_sqr_comba4". This should be automatically done
242# in the build.
243
244 .globl .bn_sqr_comba4
245 .globl .bn_sqr_comba8
246 .globl .bn_mul_comba4
247 .globl .bn_mul_comba8
248 .globl .bn_sub_words
249 .globl .bn_add_words
250 .globl .bn_div_words
251 .globl .bn_sqr_words
252 .globl .bn_mul_words
253 .globl .bn_mul_add_words
254
255# .text section
256
257 .machine "any"
258
259#
260# NOTE: The following label name should be changed to
261# "bn_sqr_comba4" i.e. remove the first dot
262# for the gcc compiler. This should be automatically
263# done in the build
264#
265
266.align 4
267.bn_sqr_comba4:
268#
269# Optimized version of bn_sqr_comba4.
270#
271# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
272# r3 contains r
273# r4 contains a
274#
275# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
276#
277# r5,r6 are the two BN_ULONGs being multiplied.
278# r7,r8 are the results of the 32x32 giving 64 bit multiply.
279# r9,r10, r11 are the equivalents of c1,c2, c3.
280# Here's the assembly
281#
282#
283 xor r0,r0,r0 # set r0 = 0. Used in the addze
284 # instructions below
285
286 #sqr_add_c(a,0,c1,c2,c3)
287 $LD r5,`0*$BNSZ`(r4)
288 $UMULL r9,r5,r5
289 $UMULH r10,r5,r5 #in first iteration. No need
290 #to add since c1=c2=c3=0.
291 # Note c3(r11) is NOT set to 0 here,
292 # but will be shortly.
293
294 $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
295 # sqr_add_c2(a,1,0,c2,c3,c1);
296 $LD r6,`1*$BNSZ`(r4)
297 $UMULL r7,r5,r6
298 $UMULH r8,r5,r6
299
300 addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8)
301 adde r8,r8,r8
302 addze r9,r0 # catch carry if any.
303 # r9= r0(=0) and carry
304
305 addc r10,r7,r10 # now add to temp result.
306 addze r11,r8 # r8 added to r11 which is 0
307 addze r9,r9
308
309 $ST r10,`1*$BNSZ`(r3) #r[1]=c2;
310 #sqr_add_c(a,1,c3,c1,c2)
311 $UMULL r7,r6,r6
312 $UMULH r8,r6,r6
313 addc r11,r7,r11
314 adde r9,r8,r9
315 addze r10,r0
316 #sqr_add_c2(a,2,0,c3,c1,c2)
317 $LD r6,`2*$BNSZ`(r4)
318 $UMULL r7,r5,r6
319 $UMULH r8,r5,r6
320
321 addc r7,r7,r7
322 adde r8,r8,r8
323 addze r10,r10
324
325 addc r11,r7,r11
326 adde r9,r8,r9
327 addze r10,r10
328 $ST r11,`2*$BNSZ`(r3) #r[2]=c3
329 #sqr_add_c2(a,3,0,c1,c2,c3);
330 $LD r6,`3*$BNSZ`(r4)
331 $UMULL r7,r5,r6
332 $UMULH r8,r5,r6
333 addc r7,r7,r7
334 adde r8,r8,r8
335 addze r11,r0
336
337 addc r9,r7,r9
338 adde r10,r8,r10
339 addze r11,r11
340 #sqr_add_c2(a,2,1,c1,c2,c3);
341 $LD r5,`1*$BNSZ`(r4)
342 $LD r6,`2*$BNSZ`(r4)
343 $UMULL r7,r5,r6
344 $UMULH r8,r5,r6
345
346 addc r7,r7,r7
347 adde r8,r8,r8
348 addze r11,r11
349 addc r9,r7,r9
350 adde r10,r8,r10
351 addze r11,r11
352 $ST r9,`3*$BNSZ`(r3) #r[3]=c1
353 #sqr_add_c(a,2,c2,c3,c1);
354 $UMULL r7,r6,r6
355 $UMULH r8,r6,r6
356 addc r10,r7,r10
357 adde r11,r8,r11
358 addze r9,r0
359 #sqr_add_c2(a,3,1,c2,c3,c1);
360 $LD r6,`3*$BNSZ`(r4)
361 $UMULL r7,r5,r6
362 $UMULH r8,r5,r6
363 addc r7,r7,r7
364 adde r8,r8,r8
365 addze r9,r9
366
367 addc r10,r7,r10
368 adde r11,r8,r11
369 addze r9,r9
370 $ST r10,`4*$BNSZ`(r3) #r[4]=c2
371 #sqr_add_c2(a,3,2,c3,c1,c2);
372 $LD r5,`2*$BNSZ`(r4)
373 $UMULL r7,r5,r6
374 $UMULH r8,r5,r6
375 addc r7,r7,r7
376 adde r8,r8,r8
377 addze r10,r0
378
379 addc r11,r7,r11
380 adde r9,r8,r9
381 addze r10,r10
382 $ST r11,`5*$BNSZ`(r3) #r[5] = c3
383 #sqr_add_c(a,3,c1,c2,c3);
384 $UMULL r7,r6,r6
385 $UMULH r8,r6,r6
386 addc r9,r7,r9
387 adde r10,r8,r10
388
389 $ST r9,`6*$BNSZ`(r3) #r[6]=c1
390 $ST r10,`7*$BNSZ`(r3) #r[7]=c2
391 blr
392
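
The sqr_add_c/sqr_add_c2 bookkeeping above is easier to audit against a small reference model: each square a[i]*a[i] enters its column once, each cross product a[i]*a[j] (i<j) enters twice, and the columns are then resolved into result words. The assembly keeps only three live column accumulators (c1,c2,c3 in r9,r10,r11) and stores one finished word per column; the sketch below (illustrative only, 16-bit words, invented helper names) accumulates whole columns first and propagates carries at the end, which yields the same digits.

#!/usr/bin/env perl
# Reference model of the comba squaring pattern used by bn_sqr_comba4/8.
use strict;
use warnings;
use Math::BigInt;

my $BITS = 16;
my $MASK = (1 << $BITS) - 1;

sub sqr_comba_ref {
    my @a   = @_;                              # little-endian words
    my @col = (0) x (2 * @a);
    for my $i (0 .. $#a) {
        $col[2 * $i] += $a[$i] * $a[$i];                          # sqr_add_c: once
        $col[$i + $_] += 2 * $a[$i] * $a[$_] for $i + 1 .. $#a;   # sqr_add_c2: twice
    }
    my ($carry, @r) = (0);
    for my $c (@col) {                         # resolve columns into words
        my $t = $c + $carry;
        push @r, $t & $MASK;
        $carry = $t >> $BITS;
    }
    return @r;
}

sub to_int { my $v = Math::BigInt->bzero; $v->bmul(1 << $BITS)->badd($_) for reverse @_; $v }

my @a = (0x1234, 0xfedc, 0x0f0f, 0xabcd);
my @r = sqr_comba_ref(@a);
my $ok = to_int(@r) == to_int(@a) ** 2;
print "comba square ", ($ok ? "matches" : "DOES NOT match"), " a*a\n";
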
393#
394# NOTE: The following label name should be changed to
395# "bn_sqr_comba8" i.e. remove the first dot
396# for the gcc compiler. This should be automatically
397# done in the build
398#
399
400.align 4
401.bn_sqr_comba8:
402#
403# This is an optimized version of the bn_sqr_comba8 routine.
404# Tightly uses the adde instruction
405#
406#
407# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
408# r3 contains r
409# r4 contains a
410#
411# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
412#
413# r5,r6 are the two BN_ULONGs being multiplied.
414# r7,r8 are the results of the 32x32 giving 64 bit multiply.
415# r9,r10, r11 are the equivalents of c1,c2, c3.
416#
417# A possible optimization of loading all 8 longs of a into registers
418# doesn't provide any speedup
419#
420
421 xor r0,r0,r0 #set r0 = 0.Used in addze
422 #instructions below.
423
424 #sqr_add_c(a,0,c1,c2,c3);
425 $LD r5,`0*$BNSZ`(r4)
426 $UMULL r9,r5,r5 #1st iteration: no carries.
427 $UMULH r10,r5,r5
428 $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
429 #sqr_add_c2(a,1,0,c2,c3,c1);
430 $LD r6,`1*$BNSZ`(r4)
431 $UMULL r7,r5,r6
432 $UMULH r8,r5,r6
433
434 addc r10,r7,r10 #add the two register number
435 adde r11,r8,r0 # (r8,r7) to the three register
436 addze r9,r0 # number (r9,r11,r10).NOTE:r0=0
437
438 addc r10,r7,r10 #add the two register number
439 adde r11,r8,r11 # (r8,r7) to the three register
440 addze r9,r9 # number (r9,r11,r10).
441
442 $ST r10,`1*$BNSZ`(r3) # r[1]=c2
443
444 #sqr_add_c(a,1,c3,c1,c2);
445 $UMULL r7,r6,r6
446 $UMULH r8,r6,r6
447 addc r11,r7,r11
448 adde r9,r8,r9
449 addze r10,r0
450 #sqr_add_c2(a,2,0,c3,c1,c2);
451 $LD r6,`2*$BNSZ`(r4)
452 $UMULL r7,r5,r6
453 $UMULH r8,r5,r6
454
455 addc r11,r7,r11
456 adde r9,r8,r9
457 addze r10,r10
458
459 addc r11,r7,r11
460 adde r9,r8,r9
461 addze r10,r10
462
463 $ST r11,`2*$BNSZ`(r3) #r[2]=c3
464 #sqr_add_c2(a,3,0,c1,c2,c3);
465 $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0].
466 $UMULL r7,r5,r6
467 $UMULH r8,r5,r6
468
469 addc r9,r7,r9
470 adde r10,r8,r10
471 addze r11,r0
472
473 addc r9,r7,r9
474 adde r10,r8,r10
475 addze r11,r11
476 #sqr_add_c2(a,2,1,c1,c2,c3);
477 $LD r5,`1*$BNSZ`(r4)
478 $LD r6,`2*$BNSZ`(r4)
479 $UMULL r7,r5,r6
480 $UMULH r8,r5,r6
481
482 addc r9,r7,r9
483 adde r10,r8,r10
484 addze r11,r11
485
486 addc r9,r7,r9
487 adde r10,r8,r10
488 addze r11,r11
489
490 $ST r9,`3*$BNSZ`(r3) #r[3]=c1;
491 #sqr_add_c(a,2,c2,c3,c1);
492 $UMULL r7,r6,r6
493 $UMULH r8,r6,r6
494
495 addc r10,r7,r10
496 adde r11,r8,r11
497 addze r9,r0
498 #sqr_add_c2(a,3,1,c2,c3,c1);
499 $LD r6,`3*$BNSZ`(r4)
500 $UMULL r7,r5,r6
501 $UMULH r8,r5,r6
502
503 addc r10,r7,r10
504 adde r11,r8,r11
505 addze r9,r9
506
507 addc r10,r7,r10
508 adde r11,r8,r11
509 addze r9,r9
510 #sqr_add_c2(a,4,0,c2,c3,c1);
511 $LD r5,`0*$BNSZ`(r4)
512 $LD r6,`4*$BNSZ`(r4)
513 $UMULL r7,r5,r6
514 $UMULH r8,r5,r6
515
516 addc r10,r7,r10
517 adde r11,r8,r11
518 addze r9,r9
519
520 addc r10,r7,r10
521 adde r11,r8,r11
522 addze r9,r9
523 $ST r10,`4*$BNSZ`(r3) #r[4]=c2;
524 #sqr_add_c2(a,5,0,c3,c1,c2);
525 $LD r6,`5*$BNSZ`(r4)
526 $UMULL r7,r5,r6
527 $UMULH r8,r5,r6
528
529 addc r11,r7,r11
530 adde r9,r8,r9
531 addze r10,r0
532
533 addc r11,r7,r11
534 adde r9,r8,r9
535 addze r10,r10
536 #sqr_add_c2(a,4,1,c3,c1,c2);
537 $LD r5,`1*$BNSZ`(r4)
538 $LD r6,`4*$BNSZ`(r4)
539 $UMULL r7,r5,r6
540 $UMULH r8,r5,r6
541
542 addc r11,r7,r11
543 adde r9,r8,r9
544 addze r10,r10
545
546 addc r11,r7,r11
547 adde r9,r8,r9
548 addze r10,r10
549 #sqr_add_c2(a,3,2,c3,c1,c2);
550 $LD r5,`2*$BNSZ`(r4)
551 $LD r6,`3*$BNSZ`(r4)
552 $UMULL r7,r5,r6
553 $UMULH r8,r5,r6
554
555 addc r11,r7,r11
556 adde r9,r8,r9
557 addze r10,r10
558
559 addc r11,r7,r11
560 adde r9,r8,r9
561 addze r10,r10
562 $ST r11,`5*$BNSZ`(r3) #r[5]=c3;
563 #sqr_add_c(a,3,c1,c2,c3);
564 $UMULL r7,r6,r6
565 $UMULH r8,r6,r6
566 addc r9,r7,r9
567 adde r10,r8,r10
568 addze r11,r0
569 #sqr_add_c2(a,4,2,c1,c2,c3);
570 $LD r6,`4*$BNSZ`(r4)
571 $UMULL r7,r5,r6
572 $UMULH r8,r5,r6
573
574 addc r9,r7,r9
575 adde r10,r8,r10
576 addze r11,r11
577
578 addc r9,r7,r9
579 adde r10,r8,r10
580 addze r11,r11
581 #sqr_add_c2(a,5,1,c1,c2,c3);
582 $LD r5,`1*$BNSZ`(r4)
583 $LD r6,`5*$BNSZ`(r4)
584 $UMULL r7,r5,r6
585 $UMULH r8,r5,r6
586
587 addc r9,r7,r9
588 adde r10,r8,r10
589 addze r11,r11
590
591 addc r9,r7,r9
592 adde r10,r8,r10
593 addze r11,r11
594 #sqr_add_c2(a,6,0,c1,c2,c3);
595 $LD r5,`0*$BNSZ`(r4)
596 $LD r6,`6*$BNSZ`(r4)
597 $UMULL r7,r5,r6
598 $UMULH r8,r5,r6
599 addc r9,r7,r9
600 adde r10,r8,r10
601 addze r11,r11
602 addc r9,r7,r9
603 adde r10,r8,r10
604 addze r11,r11
605 $ST r9,`6*$BNSZ`(r3) #r[6]=c1;
606 #sqr_add_c2(a,7,0,c2,c3,c1);
607 $LD r6,`7*$BNSZ`(r4)
608 $UMULL r7,r5,r6
609 $UMULH r8,r5,r6
610
611 addc r10,r7,r10
612 adde r11,r8,r11
613 addze r9,r0
614 addc r10,r7,r10
615 adde r11,r8,r11
616 addze r9,r9
617 #sqr_add_c2(a,6,1,c2,c3,c1);
618 $LD r5,`1*$BNSZ`(r4)
619 $LD r6,`6*$BNSZ`(r4)
620 $UMULL r7,r5,r6
621 $UMULH r8,r5,r6
622
623 addc r10,r7,r10
624 adde r11,r8,r11
625 addze r9,r9
626 addc r10,r7,r10
627 adde r11,r8,r11
628 addze r9,r9
629 #sqr_add_c2(a,5,2,c2,c3,c1);
630 $LD r5,`2*$BNSZ`(r4)
631 $LD r6,`5*$BNSZ`(r4)
632 $UMULL r7,r5,r6
633 $UMULH r8,r5,r6
634 addc r10,r7,r10
635 adde r11,r8,r11
636 addze r9,r9
637 addc r10,r7,r10
638 adde r11,r8,r11
639 addze r9,r9
640 #sqr_add_c2(a,4,3,c2,c3,c1);
641 $LD r5,`3*$BNSZ`(r4)
642 $LD r6,`4*$BNSZ`(r4)
643 $UMULL r7,r5,r6
644 $UMULH r8,r5,r6
645
646 addc r10,r7,r10
647 adde r11,r8,r11
648 addze r9,r9
649 addc r10,r7,r10
650 adde r11,r8,r11
651 addze r9,r9
652 $ST r10,`7*$BNSZ`(r3) #r[7]=c2;
653 #sqr_add_c(a,4,c3,c1,c2);
654 $UMULL r7,r6,r6
655 $UMULH r8,r6,r6
656 addc r11,r7,r11
657 adde r9,r8,r9
658 addze r10,r0
659 #sqr_add_c2(a,5,3,c3,c1,c2);
660 $LD r6,`5*$BNSZ`(r4)
661 $UMULL r7,r5,r6
662 $UMULH r8,r5,r6
663 addc r11,r7,r11
664 adde r9,r8,r9
665 addze r10,r10
666 addc r11,r7,r11
667 adde r9,r8,r9
668 addze r10,r10
669 #sqr_add_c2(a,6,2,c3,c1,c2);
670 $LD r5,`2*$BNSZ`(r4)
671 $LD r6,`6*$BNSZ`(r4)
672 $UMULL r7,r5,r6
673 $UMULH r8,r5,r6
674 addc r11,r7,r11
675 adde r9,r8,r9
676 addze r10,r10
677
678 addc r11,r7,r11
679 adde r9,r8,r9
680 addze r10,r10
681 #sqr_add_c2(a,7,1,c3,c1,c2);
682 $LD r5,`1*$BNSZ`(r4)
683 $LD r6,`7*$BNSZ`(r4)
684 $UMULL r7,r5,r6
685 $UMULH r8,r5,r6
686 addc r11,r7,r11
687 adde r9,r8,r9
688 addze r10,r10
689 addc r11,r7,r11
690 adde r9,r8,r9
691 addze r10,r10
692 $ST r11,`8*$BNSZ`(r3) #r[8]=c3;
693 #sqr_add_c2(a,7,2,c1,c2,c3);
694 $LD r5,`2*$BNSZ`(r4)
695 $UMULL r7,r5,r6
696 $UMULH r8,r5,r6
697
698 addc r9,r7,r9
699 adde r10,r8,r10
700 addze r11,r0
701 addc r9,r7,r9
702 adde r10,r8,r10
703 addze r11,r11
704 #sqr_add_c2(a,6,3,c1,c2,c3);
705 $LD r5,`3*$BNSZ`(r4)
706 $LD r6,`6*$BNSZ`(r4)
707 $UMULL r7,r5,r6
708 $UMULH r8,r5,r6
709 addc r9,r7,r9
710 adde r10,r8,r10
711 addze r11,r11
712 addc r9,r7,r9
713 adde r10,r8,r10
714 addze r11,r11
715 #sqr_add_c2(a,5,4,c1,c2,c3);
716 $LD r5,`4*$BNSZ`(r4)
717 $LD r6,`5*$BNSZ`(r4)
718 $UMULL r7,r5,r6
719 $UMULH r8,r5,r6
720 addc r9,r7,r9
721 adde r10,r8,r10
722 addze r11,r11
723 addc r9,r7,r9
724 adde r10,r8,r10
725 addze r11,r11
726 $ST r9,`9*$BNSZ`(r3) #r[9]=c1;
727 #sqr_add_c(a,5,c2,c3,c1);
728 $UMULL r7,r6,r6
729 $UMULH r8,r6,r6
730 addc r10,r7,r10
731 adde r11,r8,r11
732 addze r9,r0
733 #sqr_add_c2(a,6,4,c2,c3,c1);
734 $LD r6,`6*$BNSZ`(r4)
735 $UMULL r7,r5,r6
736 $UMULH r8,r5,r6
737 addc r10,r7,r10
738 adde r11,r8,r11
739 addze r9,r9
740 addc r10,r7,r10
741 adde r11,r8,r11
742 addze r9,r9
743 #sqr_add_c2(a,7,3,c2,c3,c1);
744 $LD r5,`3*$BNSZ`(r4)
745 $LD r6,`7*$BNSZ`(r4)
746 $UMULL r7,r5,r6
747 $UMULH r8,r5,r6
748 addc r10,r7,r10
749 adde r11,r8,r11
750 addze r9,r9
751 addc r10,r7,r10
752 adde r11,r8,r11
753 addze r9,r9
754 $ST r10,`10*$BNSZ`(r3) #r[10]=c2;
755 #sqr_add_c2(a,7,4,c3,c1,c2);
756 $LD r5,`4*$BNSZ`(r4)
757 $UMULL r7,r5,r6
758 $UMULH r8,r5,r6
759 addc r11,r7,r11
760 adde r9,r8,r9
761 addze r10,r0
762 addc r11,r7,r11
763 adde r9,r8,r9
764 addze r10,r10
765 #sqr_add_c2(a,6,5,c3,c1,c2);
766 $LD r5,`5*$BNSZ`(r4)
767 $LD r6,`6*$BNSZ`(r4)
768 $UMULL r7,r5,r6
769 $UMULH r8,r5,r6
770 addc r11,r7,r11
771 adde r9,r8,r9
772 addze r10,r10
773 addc r11,r7,r11
774 adde r9,r8,r9
775 addze r10,r10
776 $ST r11,`11*$BNSZ`(r3) #r[11]=c3;
777 #sqr_add_c(a,6,c1,c2,c3);
778 $UMULL r7,r6,r6
779 $UMULH r8,r6,r6
780 addc r9,r7,r9
781 adde r10,r8,r10
782 addze r11,r0
783 #sqr_add_c2(a,7,5,c1,c2,c3)
784 $LD r6,`7*$BNSZ`(r4)
785 $UMULL r7,r5,r6
786 $UMULH r8,r5,r6
787 addc r9,r7,r9
788 adde r10,r8,r10
789 addze r11,r11
790 addc r9,r7,r9
791 adde r10,r8,r10
792 addze r11,r11
793 $ST r9,`12*$BNSZ`(r3) #r[12]=c1;
794
795 #sqr_add_c2(a,7,6,c2,c3,c1)
796 $LD r5,`6*$BNSZ`(r4)
797 $UMULL r7,r5,r6
798 $UMULH r8,r5,r6
799 addc r10,r7,r10
800 adde r11,r8,r11
801 addze r9,r0
802 addc r10,r7,r10
803 adde r11,r8,r11
804 addze r9,r9
805 $ST r10,`13*$BNSZ`(r3) #r[13]=c2;
806 #sqr_add_c(a,7,c3,c1,c2);
807 $UMULL r7,r6,r6
808 $UMULH r8,r6,r6
809 addc r11,r7,r11
810 adde r9,r8,r9
811 $ST r11,`14*$BNSZ`(r3) #r[14]=c3;
812 $ST r9, `15*$BNSZ`(r3) #r[15]=c1;
813
814
815 blr
816
817#
818# NOTE: The following label name should be changed to
819# "bn_mul_comba4" i.e. remove the first dot
820# for the gcc compiler. This should be automatically
821# done in the build
822#
823
824.align 4
825.bn_mul_comba4:
826#
827# This is an optimized version of the bn_mul_comba4 routine.
828#
829# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
830# r3 contains r
831# r4 contains a
832# r5 contains b
833# r6, r7 are the 2 BN_ULONGs being multiplied.
834# r8, r9 are the results of the 32x32 giving 64 multiply.
835# r10, r11, r12 are the equivalents of c1, c2, and c3.
836#
837 xor r0,r0,r0 #r0=0. Used in addze below.
838 #mul_add_c(a[0],b[0],c1,c2,c3);
839 $LD r6,`0*$BNSZ`(r4)
840 $LD r7,`0*$BNSZ`(r5)
841 $UMULL r10,r6,r7
842 $UMULH r11,r6,r7
843 $ST r10,`0*$BNSZ`(r3) #r[0]=c1
844 #mul_add_c(a[0],b[1],c2,c3,c1);
845 $LD r7,`1*$BNSZ`(r5)
846 $UMULL r8,r6,r7
847 $UMULH r9,r6,r7
848 addc r11,r8,r11
849 adde r12,r9,r0
850 addze r10,r0
851 #mul_add_c(a[1],b[0],c2,c3,c1);
852 $LD r6, `1*$BNSZ`(r4)
853 $LD r7, `0*$BNSZ`(r5)
854 $UMULL r8,r6,r7
855 $UMULH r9,r6,r7
856 addc r11,r8,r11
857 adde r12,r9,r12
858 addze r10,r10
859 $ST r11,`1*$BNSZ`(r3) #r[1]=c2
860 #mul_add_c(a[2],b[0],c3,c1,c2);
861 $LD r6,`2*$BNSZ`(r4)
862 $UMULL r8,r6,r7
863 $UMULH r9,r6,r7
864 addc r12,r8,r12
865 adde r10,r9,r10
866 addze r11,r0
867 #mul_add_c(a[1],b[1],c3,c1,c2);
868 $LD r6,`1*$BNSZ`(r4)
869 $LD r7,`1*$BNSZ`(r5)
870 $UMULL r8,r6,r7
871 $UMULH r9,r6,r7
872 addc r12,r8,r12
873 adde r10,r9,r10
874 addze r11,r11
875 #mul_add_c(a[0],b[2],c3,c1,c2);
876 $LD r6,`0*$BNSZ`(r4)
877 $LD r7,`2*$BNSZ`(r5)
878 $UMULL r8,r6,r7
879 $UMULH r9,r6,r7
880 addc r12,r8,r12
881 adde r10,r9,r10
882 addze r11,r11
883 $ST r12,`2*$BNSZ`(r3) #r[2]=c3
884 #mul_add_c(a[0],b[3],c1,c2,c3);
885 $LD r7,`3*$BNSZ`(r5)
886 $UMULL r8,r6,r7
887 $UMULH r9,r6,r7
888 addc r10,r8,r10
889 adde r11,r9,r11
890 addze r12,r0
891 #mul_add_c(a[1],b[2],c1,c2,c3);
892 $LD r6,`1*$BNSZ`(r4)
893 $LD r7,`2*$BNSZ`(r5)
894 $UMULL r8,r6,r7
895 $UMULH r9,r6,r7
896 addc r10,r8,r10
897 adde r11,r9,r11
898 addze r12,r12
899 #mul_add_c(a[2],b[1],c1,c2,c3);
900 $LD r6,`2*$BNSZ`(r4)
901 $LD r7,`1*$BNSZ`(r5)
902 $UMULL r8,r6,r7
903 $UMULH r9,r6,r7
904 addc r10,r8,r10
905 adde r11,r9,r11
906 addze r12,r12
907 #mul_add_c(a[3],b[0],c1,c2,c3);
908 $LD r6,`3*$BNSZ`(r4)
909 $LD r7,`0*$BNSZ`(r5)
910 $UMULL r8,r6,r7
911 $UMULH r9,r6,r7
912 addc r10,r8,r10
913 adde r11,r9,r11
914 addze r12,r12
915 $ST r10,`3*$BNSZ`(r3) #r[3]=c1
916 #mul_add_c(a[3],b[1],c2,c3,c1);
917 $LD r7,`1*$BNSZ`(r5)
918 $UMULL r8,r6,r7
919 $UMULH r9,r6,r7
920 addc r11,r8,r11
921 adde r12,r9,r12
922 addze r10,r0
923 #mul_add_c(a[2],b[2],c2,c3,c1);
924 $LD r6,`2*$BNSZ`(r4)
925 $LD r7,`2*$BNSZ`(r5)
926 $UMULL r8,r6,r7
927 $UMULH r9,r6,r7
928 addc r11,r8,r11
929 adde r12,r9,r12
930 addze r10,r10
931 #mul_add_c(a[1],b[3],c2,c3,c1);
932 $LD r6,`1*$BNSZ`(r4)
933 $LD r7,`3*$BNSZ`(r5)
934 $UMULL r8,r6,r7
935 $UMULH r9,r6,r7
936 addc r11,r8,r11
937 adde r12,r9,r12
938 addze r10,r10
939 $ST r11,`4*$BNSZ`(r3) #r[4]=c2
940 #mul_add_c(a[2],b[3],c3,c1,c2);
941 $LD r6,`2*$BNSZ`(r4)
942 $UMULL r8,r6,r7
943 $UMULH r9,r6,r7
944 addc r12,r8,r12
945 adde r10,r9,r10
946 addze r11,r0
947 #mul_add_c(a[3],b[2],c3,c1,c2);
948 $LD r6,`3*$BNSZ`(r4)
949 $LD r7,`2*$BNSZ`(r5)
950 $UMULL r8,r6,r7
951 $UMULH r9,r6,r7
952 addc r12,r8,r12
953 adde r10,r9,r10
954 addze r11,r11
955 $ST r12,`5*$BNSZ`(r3) #r[5]=c3
956 #mul_add_c(a[3],b[3],c1,c2,c3);
957 $LD r7,`3*$BNSZ`(r5)
958 $UMULL r8,r6,r7
959 $UMULH r9,r6,r7
960 addc r10,r8,r10
961 adde r11,r9,r11
962
963 $ST r10,`6*$BNSZ`(r3) #r[6]=c1
964 $ST r11,`7*$BNSZ`(r3) #r[7]=c2
965 blr
966
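
Every mul_add_c(a[i],b[j],c1,c2,c3) block above is the same three-instruction pattern: $UMULL/$UMULH split the product, addc adds the low half into the first accumulator word, adde adds the high half plus the carry into the second, and addze folds the final carry into the third. A small Perl model of one such step (illustrative only; 16-bit words, invented helper name):

#!/usr/bin/env perl
# Model of a single mul_add_c() step: (c1,c2,c3) += a*b over 16-bit words.
use strict;
use warnings;

my $BITS = 16;
my $MASK = (1 << $BITS) - 1;

sub mul_add_c {
    my ($a, $b, $c1, $c2, $c3) = @_;
    my $prod = $a * $b;
    my ($lo, $hi) = ($prod & $MASK, $prod >> $BITS);      # $UMULL / $UMULH
    my $t;
    $t  = $c1 + $lo;                  $c1 = $t & $MASK;   # addc: add low half
    $t  = $c2 + $hi + ($t >> $BITS);  $c2 = $t & $MASK;   # adde: high half + carry
    $c3 = ($c3 + ($t >> $BITS)) & $MASK;                  # addze: fold last carry
    return ($c1, $c2, $c3);
}

# column 2 of a 4x4 comba multiply collects a[2]b[0] + a[1]b[1] + a[0]b[2];
# the real routine rotates c1,c2,c3 between columns, here we start from zero
my @a = (0xffff, 0xffff, 0x0003, 0x0004);
my @b = (0x0005, 0xffff, 0xffff, 0x0008);
my ($c1, $c2, $c3) = (0, 0, 0);
($c1, $c2, $c3) = mul_add_c($a[$_], $b[2 - $_], $c1, $c2, $c3) for 0 .. 2;
printf "column 2: c1=0x%04x c2=0x%04x c3=0x%04x\n", $c1, $c2, $c3;
# prints c1=0x0011 c2=0xfffc c3=0x0001
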
967#
968# NOTE: The following label name should be changed to
969# "bn_mul_comba8" i.e. remove the first dot
970# for the gcc compiler. This should be automatically
971# done in the build
972#
973
974.align 4
975.bn_mul_comba8:
976#
977# Optimized version of the bn_mul_comba8 routine.
978#
979# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
980# r3 contains r
981# r4 contains a
982# r5 contains b
983# r6, r7 are the 2 BN_ULONGs being multiplied.
984# r8, r9 are the results of the 32x32 giving 64 multiply.
985# r10, r11, r12 are the equivalents of c1, c2, and c3.
986#
987 xor r0,r0,r0 #r0=0. Used in addze below.
988
989 #mul_add_c(a[0],b[0],c1,c2,c3);
990 $LD r6,`0*$BNSZ`(r4) #a[0]
991 $LD r7,`0*$BNSZ`(r5) #b[0]
992 $UMULL r10,r6,r7
993 $UMULH r11,r6,r7
994 $ST r10,`0*$BNSZ`(r3) #r[0]=c1;
995 #mul_add_c(a[0],b[1],c2,c3,c1);
996 $LD r7,`1*$BNSZ`(r5)
997 $UMULL r8,r6,r7
998 $UMULH r9,r6,r7
999 addc r11,r11,r8
1000 addze r12,r9 # since we didn't set r12 to zero before.
1001 addze r10,r0
1002 #mul_add_c(a[1],b[0],c2,c3,c1);
1003 $LD r6,`1*$BNSZ`(r4)
1004 $LD r7,`0*$BNSZ`(r5)
1005 $UMULL r8,r6,r7
1006 $UMULH r9,r6,r7
1007 addc r11,r11,r8
1008 adde r12,r12,r9
1009 addze r10,r10
1010 $ST r11,`1*$BNSZ`(r3) #r[1]=c2;
1011 #mul_add_c(a[2],b[0],c3,c1,c2);
1012 $LD r6,`2*$BNSZ`(r4)
1013 $UMULL r8,r6,r7
1014 $UMULH r9,r6,r7
1015 addc r12,r12,r8
1016 adde r10,r10,r9
1017 addze r11,r0
1018 #mul_add_c(a[1],b[1],c3,c1,c2);
1019 $LD r6,`1*$BNSZ`(r4)
1020 $LD r7,`1*$BNSZ`(r5)
1021 $UMULL r8,r6,r7
1022 $UMULH r9,r6,r7
1023 addc r12,r12,r8
1024 adde r10,r10,r9
1025 addze r11,r11
1026 #mul_add_c(a[0],b[2],c3,c1,c2);
1027 $LD r6,`0*$BNSZ`(r4)
1028 $LD r7,`2*$BNSZ`(r5)
1029 $UMULL r8,r6,r7
1030 $UMULH r9,r6,r7
1031 addc r12,r12,r8
1032 adde r10,r10,r9
1033 addze r11,r11
1034 $ST r12,`2*$BNSZ`(r3) #r[2]=c3;
1035 #mul_add_c(a[0],b[3],c1,c2,c3);
1036 $LD r7,`3*$BNSZ`(r5)
1037 $UMULL r8,r6,r7
1038 $UMULH r9,r6,r7
1039 addc r10,r10,r8
1040 adde r11,r11,r9
1041 addze r12,r0
1042 #mul_add_c(a[1],b[2],c1,c2,c3);
1043 $LD r6,`1*$BNSZ`(r4)
1044 $LD r7,`2*$BNSZ`(r5)
1045 $UMULL r8,r6,r7
1046 $UMULH r9,r6,r7
1047 addc r10,r10,r8
1048 adde r11,r11,r9
1049 addze r12,r12
1050
1051 #mul_add_c(a[2],b[1],c1,c2,c3);
1052 $LD r6,`2*$BNSZ`(r4)
1053 $LD r7,`1*$BNSZ`(r5)
1054 $UMULL r8,r6,r7
1055 $UMULH r9,r6,r7
1056 addc r10,r10,r8
1057 adde r11,r11,r9
1058 addze r12,r12
1059 #mul_add_c(a[3],b[0],c1,c2,c3);
1060 $LD r6,`3*$BNSZ`(r4)
1061 $LD r7,`0*$BNSZ`(r5)
1062 $UMULL r8,r6,r7
1063 $UMULH r9,r6,r7
1064 addc r10,r10,r8
1065 adde r11,r11,r9
1066 addze r12,r12
1067 $ST r10,`3*$BNSZ`(r3) #r[3]=c1;
1068 #mul_add_c(a[4],b[0],c2,c3,c1);
1069 $LD r6,`4*$BNSZ`(r4)
1070 $UMULL r8,r6,r7
1071 $UMULH r9,r6,r7
1072 addc r11,r11,r8
1073 adde r12,r12,r9
1074 addze r10,r0
1075 #mul_add_c(a[3],b[1],c2,c3,c1);
1076 $LD r6,`3*$BNSZ`(r4)
1077 $LD r7,`1*$BNSZ`(r5)
1078 $UMULL r8,r6,r7
1079 $UMULH r9,r6,r7
1080 addc r11,r11,r8
1081 adde r12,r12,r9
1082 addze r10,r10
1083 #mul_add_c(a[2],b[2],c2,c3,c1);
1084 $LD r6,`2*$BNSZ`(r4)
1085 $LD r7,`2*$BNSZ`(r5)
1086 $UMULL r8,r6,r7
1087 $UMULH r9,r6,r7
1088 addc r11,r11,r8
1089 adde r12,r12,r9
1090 addze r10,r10
1091 #mul_add_c(a[1],b[3],c2,c3,c1);
1092 $LD r6,`1*$BNSZ`(r4)
1093 $LD r7,`3*$BNSZ`(r5)
1094 $UMULL r8,r6,r7
1095 $UMULH r9,r6,r7
1096 addc r11,r11,r8
1097 adde r12,r12,r9
1098 addze r10,r10
1099 #mul_add_c(a[0],b[4],c2,c3,c1);
1100 $LD r6,`0*$BNSZ`(r4)
1101 $LD r7,`4*$BNSZ`(r5)
1102 $UMULL r8,r6,r7
1103 $UMULH r9,r6,r7
1104 addc r11,r11,r8
1105 adde r12,r12,r9
1106 addze r10,r10
1107 $ST r11,`4*$BNSZ`(r3) #r[4]=c2;
1108 #mul_add_c(a[0],b[5],c3,c1,c2);
1109 $LD r7,`5*$BNSZ`(r5)
1110 $UMULL r8,r6,r7
1111 $UMULH r9,r6,r7
1112 addc r12,r12,r8
1113 adde r10,r10,r9
1114 addze r11,r0
1115 #mul_add_c(a[1],b[4],c3,c1,c2);
1116 $LD r6,`1*$BNSZ`(r4)
1117 $LD r7,`4*$BNSZ`(r5)
1118 $UMULL r8,r6,r7
1119 $UMULH r9,r6,r7
1120 addc r12,r12,r8
1121 adde r10,r10,r9
1122 addze r11,r11
1123 #mul_add_c(a[2],b[3],c3,c1,c2);
1124 $LD r6,`2*$BNSZ`(r4)
1125 $LD r7,`3*$BNSZ`(r5)
1126 $UMULL r8,r6,r7
1127 $UMULH r9,r6,r7
1128 addc r12,r12,r8
1129 adde r10,r10,r9
1130 addze r11,r11
1131 #mul_add_c(a[3],b[2],c3,c1,c2);
1132 $LD r6,`3*$BNSZ`(r4)
1133 $LD r7,`2*$BNSZ`(r5)
1134 $UMULL r8,r6,r7
1135 $UMULH r9,r6,r7
1136 addc r12,r12,r8
1137 adde r10,r10,r9
1138 addze r11,r11
1139 #mul_add_c(a[4],b[1],c3,c1,c2);
1140 $LD r6,`4*$BNSZ`(r4)
1141 $LD r7,`1*$BNSZ`(r5)
1142 $UMULL r8,r6,r7
1143 $UMULH r9,r6,r7
1144 addc r12,r12,r8
1145 adde r10,r10,r9
1146 addze r11,r11
1147 #mul_add_c(a[5],b[0],c3,c1,c2);
1148 $LD r6,`5*$BNSZ`(r4)
1149 $LD r7,`0*$BNSZ`(r5)
1150 $UMULL r8,r6,r7
1151 $UMULH r9,r6,r7
1152 addc r12,r12,r8
1153 adde r10,r10,r9
1154 addze r11,r11
1155 $ST r12,`5*$BNSZ`(r3) #r[5]=c3;
1156 #mul_add_c(a[6],b[0],c1,c2,c3);
1157 $LD r6,`6*$BNSZ`(r4)
1158 $UMULL r8,r6,r7
1159 $UMULH r9,r6,r7
1160 addc r10,r10,r8
1161 adde r11,r11,r9
1162 addze r12,r0
1163 #mul_add_c(a[5],b[1],c1,c2,c3);
1164 $LD r6,`5*$BNSZ`(r4)
1165 $LD r7,`1*$BNSZ`(r5)
1166 $UMULL r8,r6,r7
1167 $UMULH r9,r6,r7
1168 addc r10,r10,r8
1169 adde r11,r11,r9
1170 addze r12,r12
1171 #mul_add_c(a[4],b[2],c1,c2,c3);
1172 $LD r6,`4*$BNSZ`(r4)
1173 $LD r7,`2*$BNSZ`(r5)
1174 $UMULL r8,r6,r7
1175 $UMULH r9,r6,r7
1176 addc r10,r10,r8
1177 adde r11,r11,r9
1178 addze r12,r12
1179 #mul_add_c(a[3],b[3],c1,c2,c3);
1180 $LD r6,`3*$BNSZ`(r4)
1181 $LD r7,`3*$BNSZ`(r5)
1182 $UMULL r8,r6,r7
1183 $UMULH r9,r6,r7
1184 addc r10,r10,r8
1185 adde r11,r11,r9
1186 addze r12,r12
1187 #mul_add_c(a[2],b[4],c1,c2,c3);
1188 $LD r6,`2*$BNSZ`(r4)
1189 $LD r7,`4*$BNSZ`(r5)
1190 $UMULL r8,r6,r7
1191 $UMULH r9,r6,r7
1192 addc r10,r10,r8
1193 adde r11,r11,r9
1194 addze r12,r12
1195 #mul_add_c(a[1],b[5],c1,c2,c3);
1196 $LD r6,`1*$BNSZ`(r4)
1197 $LD r7,`5*$BNSZ`(r5)
1198 $UMULL r8,r6,r7
1199 $UMULH r9,r6,r7
1200 addc r10,r10,r8
1201 adde r11,r11,r9
1202 addze r12,r12
1203 #mul_add_c(a[0],b[6],c1,c2,c3);
1204 $LD r6,`0*$BNSZ`(r4)
1205 $LD r7,`6*$BNSZ`(r5)
1206 $UMULL r8,r6,r7
1207 $UMULH r9,r6,r7
1208 addc r10,r10,r8
1209 adde r11,r11,r9
1210 addze r12,r12
1211 $ST r10,`6*$BNSZ`(r3) #r[6]=c1;
1212 #mul_add_c(a[0],b[7],c2,c3,c1);
1213 $LD r7,`7*$BNSZ`(r5)
1214 $UMULL r8,r6,r7
1215 $UMULH r9,r6,r7
1216 addc r11,r11,r8
1217 adde r12,r12,r9
1218 addze r10,r0
1219 #mul_add_c(a[1],b[6],c2,c3,c1);
1220 $LD r6,`1*$BNSZ`(r4)
1221 $LD r7,`6*$BNSZ`(r5)
1222 $UMULL r8,r6,r7
1223 $UMULH r9,r6,r7
1224 addc r11,r11,r8
1225 adde r12,r12,r9
1226 addze r10,r10
1227 #mul_add_c(a[2],b[5],c2,c3,c1);
1228 $LD r6,`2*$BNSZ`(r4)
1229 $LD r7,`5*$BNSZ`(r5)
1230 $UMULL r8,r6,r7
1231 $UMULH r9,r6,r7
1232 addc r11,r11,r8
1233 adde r12,r12,r9
1234 addze r10,r10
1235 #mul_add_c(a[3],b[4],c2,c3,c1);
1236 $LD r6,`3*$BNSZ`(r4)
1237 $LD r7,`4*$BNSZ`(r5)
1238 $UMULL r8,r6,r7
1239 $UMULH r9,r6,r7
1240 addc r11,r11,r8
1241 adde r12,r12,r9
1242 addze r10,r10
1243 #mul_add_c(a[4],b[3],c2,c3,c1);
1244 $LD r6,`4*$BNSZ`(r4)
1245 $LD r7,`3*$BNSZ`(r5)
1246 $UMULL r8,r6,r7
1247 $UMULH r9,r6,r7
1248 addc r11,r11,r8
1249 adde r12,r12,r9
1250 addze r10,r10
1251 #mul_add_c(a[5],b[2],c2,c3,c1);
1252 $LD r6,`5*$BNSZ`(r4)
1253 $LD r7,`2*$BNSZ`(r5)
1254 $UMULL r8,r6,r7
1255 $UMULH r9,r6,r7
1256 addc r11,r11,r8
1257 adde r12,r12,r9
1258 addze r10,r10
1259 #mul_add_c(a[6],b[1],c2,c3,c1);
1260 $LD r6,`6*$BNSZ`(r4)
1261 $LD r7,`1*$BNSZ`(r5)
1262 $UMULL r8,r6,r7
1263 $UMULH r9,r6,r7
1264 addc r11,r11,r8
1265 adde r12,r12,r9
1266 addze r10,r10
1267 #mul_add_c(a[7],b[0],c2,c3,c1);
1268 $LD r6,`7*$BNSZ`(r4)
1269 $LD r7,`0*$BNSZ`(r5)
1270 $UMULL r8,r6,r7
1271 $UMULH r9,r6,r7
1272 addc r11,r11,r8
1273 adde r12,r12,r9
1274 addze r10,r10
1275 $ST r11,`7*$BNSZ`(r3) #r[7]=c2;
1276 #mul_add_c(a[7],b[1],c3,c1,c2);
1277 $LD r7,`1*$BNSZ`(r5)
1278 $UMULL r8,r6,r7
1279 $UMULH r9,r6,r7
1280 addc r12,r12,r8
1281 adde r10,r10,r9
1282 addze r11,r0
1283 #mul_add_c(a[6],b[2],c3,c1,c2);
1284 $LD r6,`6*$BNSZ`(r4)
1285 $LD r7,`2*$BNSZ`(r5)
1286 $UMULL r8,r6,r7
1287 $UMULH r9,r6,r7
1288 addc r12,r12,r8
1289 adde r10,r10,r9
1290 addze r11,r11
1291 #mul_add_c(a[5],b[3],c3,c1,c2);
1292 $LD r6,`5*$BNSZ`(r4)
1293 $LD r7,`3*$BNSZ`(r5)
1294 $UMULL r8,r6,r7
1295 $UMULH r9,r6,r7
1296 addc r12,r12,r8
1297 adde r10,r10,r9
1298 addze r11,r11
1299 #mul_add_c(a[4],b[4],c3,c1,c2);
1300 $LD r6,`4*$BNSZ`(r4)
1301 $LD r7,`4*$BNSZ`(r5)
1302 $UMULL r8,r6,r7
1303 $UMULH r9,r6,r7
1304 addc r12,r12,r8
1305 adde r10,r10,r9
1306 addze r11,r11
1307 #mul_add_c(a[3],b[5],c3,c1,c2);
1308 $LD r6,`3*$BNSZ`(r4)
1309 $LD r7,`5*$BNSZ`(r5)
1310 $UMULL r8,r6,r7
1311 $UMULH r9,r6,r7
1312 addc r12,r12,r8
1313 adde r10,r10,r9
1314 addze r11,r11
1315 #mul_add_c(a[2],b[6],c3,c1,c2);
1316 $LD r6,`2*$BNSZ`(r4)
1317 $LD r7,`6*$BNSZ`(r5)
1318 $UMULL r8,r6,r7
1319 $UMULH r9,r6,r7
1320 addc r12,r12,r8
1321 adde r10,r10,r9
1322 addze r11,r11
1323 #mul_add_c(a[1],b[7],c3,c1,c2);
1324 $LD r6,`1*$BNSZ`(r4)
1325 $LD r7,`7*$BNSZ`(r5)
1326 $UMULL r8,r6,r7
1327 $UMULH r9,r6,r7
1328 addc r12,r12,r8
1329 adde r10,r10,r9
1330 addze r11,r11
1331 $ST r12,`8*$BNSZ`(r3) #r[8]=c3;
1332 #mul_add_c(a[2],b[7],c1,c2,c3);
1333 $LD r6,`2*$BNSZ`(r4)
1334 $UMULL r8,r6,r7
1335 $UMULH r9,r6,r7
1336 addc r10,r10,r8
1337 adde r11,r11,r9
1338 addze r12,r0
1339 #mul_add_c(a[3],b[6],c1,c2,c3);
1340 $LD r6,`3*$BNSZ`(r4)
1341 $LD r7,`6*$BNSZ`(r5)
1342 $UMULL r8,r6,r7
1343 $UMULH r9,r6,r7
1344 addc r10,r10,r8
1345 adde r11,r11,r9
1346 addze r12,r12
1347 #mul_add_c(a[4],b[5],c1,c2,c3);
1348 $LD r6,`4*$BNSZ`(r4)
1349 $LD r7,`5*$BNSZ`(r5)
1350 $UMULL r8,r6,r7
1351 $UMULH r9,r6,r7
1352 addc r10,r10,r8
1353 adde r11,r11,r9
1354 addze r12,r12
1355 #mul_add_c(a[5],b[4],c1,c2,c3);
1356 $LD r6,`5*$BNSZ`(r4)
1357 $LD r7,`4*$BNSZ`(r5)
1358 $UMULL r8,r6,r7
1359 $UMULH r9,r6,r7
1360 addc r10,r10,r8
1361 adde r11,r11,r9
1362 addze r12,r12
1363 #mul_add_c(a[6],b[3],c1,c2,c3);
1364 $LD r6,`6*$BNSZ`(r4)
1365 $LD r7,`3*$BNSZ`(r5)
1366 $UMULL r8,r6,r7
1367 $UMULH r9,r6,r7
1368 addc r10,r10,r8
1369 adde r11,r11,r9
1370 addze r12,r12
1371 #mul_add_c(a[7],b[2],c1,c2,c3);
1372 $LD r6,`7*$BNSZ`(r4)
1373 $LD r7,`2*$BNSZ`(r5)
1374 $UMULL r8,r6,r7
1375 $UMULH r9,r6,r7
1376 addc r10,r10,r8
1377 adde r11,r11,r9
1378 addze r12,r12
1379 $ST r10,`9*$BNSZ`(r3) #r[9]=c1;
1380 #mul_add_c(a[7],b[3],c2,c3,c1);
1381 $LD r7,`3*$BNSZ`(r5)
1382 $UMULL r8,r6,r7
1383 $UMULH r9,r6,r7
1384 addc r11,r11,r8
1385 adde r12,r12,r9
1386 addze r10,r0
1387 #mul_add_c(a[6],b[4],c2,c3,c1);
1388 $LD r6,`6*$BNSZ`(r4)
1389 $LD r7,`4*$BNSZ`(r5)
1390 $UMULL r8,r6,r7
1391 $UMULH r9,r6,r7
1392 addc r11,r11,r8
1393 adde r12,r12,r9
1394 addze r10,r10
1395 #mul_add_c(a[5],b[5],c2,c3,c1);
1396 $LD r6,`5*$BNSZ`(r4)
1397 $LD r7,`5*$BNSZ`(r5)
1398 $UMULL r8,r6,r7
1399 $UMULH r9,r6,r7
1400 addc r11,r11,r8
1401 adde r12,r12,r9
1402 addze r10,r10
1403 #mul_add_c(a[4],b[6],c2,c3,c1);
1404 $LD r6,`4*$BNSZ`(r4)
1405 $LD r7,`6*$BNSZ`(r5)
1406 $UMULL r8,r6,r7
1407 $UMULH r9,r6,r7
1408 addc r11,r11,r8
1409 adde r12,r12,r9
1410 addze r10,r10
1411 #mul_add_c(a[3],b[7],c2,c3,c1);
1412 $LD r6,`3*$BNSZ`(r4)
1413 $LD r7,`7*$BNSZ`(r5)
1414 $UMULL r8,r6,r7
1415 $UMULH r9,r6,r7
1416 addc r11,r11,r8
1417 adde r12,r12,r9
1418 addze r10,r10
1419 $ST r11,`10*$BNSZ`(r3) #r[10]=c2;
1420 #mul_add_c(a[4],b[7],c3,c1,c2);
1421 $LD r6,`4*$BNSZ`(r4)
1422 $UMULL r8,r6,r7
1423 $UMULH r9,r6,r7
1424 addc r12,r12,r8
1425 adde r10,r10,r9
1426 addze r11,r0
1427 #mul_add_c(a[5],b[6],c3,c1,c2);
1428 $LD r6,`5*$BNSZ`(r4)
1429 $LD r7,`6*$BNSZ`(r5)
1430 $UMULL r8,r6,r7
1431 $UMULH r9,r6,r7
1432 addc r12,r12,r8
1433 adde r10,r10,r9
1434 addze r11,r11
1435 #mul_add_c(a[6],b[5],c3,c1,c2);
1436 $LD r6,`6*$BNSZ`(r4)
1437 $LD r7,`5*$BNSZ`(r5)
1438 $UMULL r8,r6,r7
1439 $UMULH r9,r6,r7
1440 addc r12,r12,r8
1441 adde r10,r10,r9
1442 addze r11,r11
1443 #mul_add_c(a[7],b[4],c3,c1,c2);
1444 $LD r6,`7*$BNSZ`(r4)
1445 $LD r7,`4*$BNSZ`(r5)
1446 $UMULL r8,r6,r7
1447 $UMULH r9,r6,r7
1448 addc r12,r12,r8
1449 adde r10,r10,r9
1450 addze r11,r11
1451 $ST r12,`11*$BNSZ`(r3) #r[11]=c3;
1452 #mul_add_c(a[7],b[5],c1,c2,c3);
1453 $LD r7,`5*$BNSZ`(r5)
1454 $UMULL r8,r6,r7
1455 $UMULH r9,r6,r7
1456 addc r10,r10,r8
1457 adde r11,r11,r9
1458 addze r12,r0
1459 #mul_add_c(a[6],b[6],c1,c2,c3);
1460 $LD r6,`6*$BNSZ`(r4)
1461 $LD r7,`6*$BNSZ`(r5)
1462 $UMULL r8,r6,r7
1463 $UMULH r9,r6,r7
1464 addc r10,r10,r8
1465 adde r11,r11,r9
1466 addze r12,r12
1467 #mul_add_c(a[5],b[7],c1,c2,c3);
1468 $LD r6,`5*$BNSZ`(r4)
1469 $LD r7,`7*$BNSZ`(r5)
1470 $UMULL r8,r6,r7
1471 $UMULH r9,r6,r7
1472 addc r10,r10,r8
1473 adde r11,r11,r9
1474 addze r12,r12
1475 $ST r10,`12*$BNSZ`(r3) #r[12]=c1;
1476 #mul_add_c(a[6],b[7],c2,c3,c1);
1477 $LD r6,`6*$BNSZ`(r4)
1478 $UMULL r8,r6,r7
1479 $UMULH r9,r6,r7
1480 addc r11,r11,r8
1481 adde r12,r12,r9
1482 addze r10,r0
1483 #mul_add_c(a[7],b[6],c2,c3,c1);
1484 $LD r6,`7*$BNSZ`(r4)
1485 $LD r7,`6*$BNSZ`(r5)
1486 $UMULL r8,r6,r7
1487 $UMULH r9,r6,r7
1488 addc r11,r11,r8
1489 adde r12,r12,r9
1490 addze r10,r10
1491 $ST r11,`13*$BNSZ`(r3) #r[13]=c2;
1492 #mul_add_c(a[7],b[7],c3,c1,c2);
1493 $LD r7,`7*$BNSZ`(r5)
1494 $UMULL r8,r6,r7
1495 $UMULH r9,r6,r7
1496 addc r12,r12,r8
1497 adde r10,r10,r9
1498 $ST r12,`14*$BNSZ`(r3) #r[14]=c3;
1499 $ST r10,`15*$BNSZ`(r3) #r[15]=c1;
1500 blr
1501
1502#
1503# NOTE: The following label name should be changed to
1504# "bn_sub_words" i.e. remove the first dot
1505# for the gcc compiler. This should be automatically
1506# done in the build
1507#
1508#
1509.align 4
1510.bn_sub_words:
1511#
1512# Handcoded version of bn_sub_words
1513#
1514#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1515#
1516# r3 = r
1517# r4 = a
1518# r5 = b
1519# r6 = n
1520#
1521# Note: No loop unrolling done since this is not a performance
1522# critical loop.
1523
1524 xor r0,r0,r0 #set r0 = 0
1525#
1526# check for r6 = 0 AND set carry bit.
1527#
1528 subfc. r7,r0,r6 # If r6 is 0 then result is 0.
1529 # if r6 > 0 then result !=0
1530 # In either case carry bit is set.
1531 beq Lppcasm_sub_adios
1532 addi r4,r4,-$BNSZ
1533 addi r3,r3,-$BNSZ
1534 addi r5,r5,-$BNSZ
1535 mtctr r6
1536Lppcasm_sub_mainloop:
1537 $LDU r7,$BNSZ(r4)
1538 $LDU r8,$BNSZ(r5)
1539 subfe r6,r8,r7 # r6 = r7 + carry bit + ones-complement of r8
1540 # if carry = 1 this is r7-r8. Else it
1541 # is r7-r8-1, as we need.
1542 $STU r6,$BNSZ(r3)
1543 bdnz- Lppcasm_sub_mainloop
1544Lppcasm_sub_adios:
1545 subfze r3,r0 # if carry bit is set then r3 = 0 else -1
1546 andi. r3,r3,1 # keep only last bit.
1547 blr
1548
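
The only subtle part of bn_sub_words is the borrow convention: subfe computes r7 + ~r8 + CA, so the PowerPC carry bit is the complement of a borrow, and the closing subfze/andi. pair turns the final state into the 0/1 return value. A plain Perl mirror of the semantics (illustrative only, 16-bit words):

#!/usr/bin/env perl
# Reference model of bn_sub_words(): word-wise subtraction with a borrow chain.
use strict;
use warnings;

my $BITS = 16;
my $MASK = (1 << $BITS) - 1;

sub bn_sub_words_ref {
    my ($a, $b) = @_;                            # array refs, little-endian words
    my ($borrow, @r) = (0);
    for my $i (0 .. $#$a) {
        my $d = $a->[$i] - $b->[$i] - $borrow;   # what subfe tracks in XER[CA]
        $borrow = $d < 0 ? 1 : 0;
        push @r, $d & $MASK;
    }
    return ($borrow, @r);                        # final borrow is the return value
}

my ($borrow, @r) = bn_sub_words_ref([0x0000, 0x0001], [0x0001, 0x0000]);
printf "borrow=%d r=[0x%04x 0x%04x]\n", $borrow, @r;   # 0x10000 - 1: no net borrow
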
1549#
1550# NOTE: The following label name should be changed to
1551# "bn_add_words" i.e. remove the first dot
1552# for the gcc compiler. This should be automatically
1553# done in the build
1554#
1555
1556.align 4
1557.bn_add_words:
1558#
1559# Handcoded version of bn_add_words
1560#
1561#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1562#
1563# r3 = r
1564# r4 = a
1565# r5 = b
1566# r6 = n
1567#
1568# Note: No loop unrolling done since this is not a performance
1569# critical loop.
1570
1571 xor r0,r0,r0
1572#
1573# check for r6 = 0. Is this needed?
1574#
1575 addic. r6,r6,0 #test r6 and clear carry bit.
1576 beq Lppcasm_add_adios
1577 addi r4,r4,-$BNSZ
1578 addi r3,r3,-$BNSZ
1579 addi r5,r5,-$BNSZ
1580 mtctr r6
1581Lppcasm_add_mainloop:
1582 $LDU r7,$BNSZ(r4)
1583 $LDU r8,$BNSZ(r5)
1584 adde r8,r7,r8
1585 $STU r8,$BNSZ(r3)
1586 bdnz- Lppcasm_add_mainloop
1587Lppcasm_add_adios:
1588 addze r3,r0 #return carry bit.
1589 blr
1590
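
bn_add_words is the mirror image of the routine above: adde accumulates a[i] + b[i] plus the incoming carry and addze returns the carry out of the top word. The same loop in plain Perl (illustrative only, 16-bit words):

#!/usr/bin/env perl
# Reference model of bn_add_words(): word-wise addition with a carry chain.
use strict;
use warnings;

my $BITS = 16;
my $MASK = (1 << $BITS) - 1;

sub bn_add_words_ref {
    my ($a, $b) = @_;
    my ($carry, @r) = (0);
    for my $i (0 .. $#$a) {
        my $s = $a->[$i] + $b->[$i] + $carry;    # adde chains through XER[CA]
        $carry = $s >> $BITS;
        push @r, $s & $MASK;
    }
    return ($carry, @r);                         # addze r3,r0: carry out of the top
}

my ($carry, @r) = bn_add_words_ref([0xffff, 0xffff], [0x0001, 0x0000]);
printf "carry=%d r=[0x%04x 0x%04x]\n", $carry, @r;   # carries all the way out
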
1591#
1592# NOTE: The following label name should be changed to
1593# "bn_div_words" i.e. remove the first dot
1594# for the gcc compiler. This should be automatically
1595# done in the build
1596#
1597
1598.align 4
1599.bn_div_words:
1600#
1601# This is a cleaned up version of code generated by
1602# the AIX compiler. The only optimization is to use
1603# the PPC instruction to count leading zeros instead
1604 # of a call to num_bits_word. Since this was compiled
1605 # only at -O2, it can probably be squeezed further.
1606#
1607# r3 = h
1608# r4 = l
1609# r5 = d
1610
1611 $UCMPI 0,r5,0 # compare r5 and 0
1612 bne Lppcasm_div1 # proceed if d!=0
1613 li r3,-1 # d=0 return -1
1614 blr
1615Lppcasm_div1:
1616 xor r0,r0,r0 #r0=0
1617 li r8,$BITS
1618 $CNTLZ. r7,r5 #r7 = num leading 0s in d.
1619 beq Lppcasm_div2 #proceed if no leading zeros
1620 subf r8,r7,r8 #r8 = BN_num_bits_word(d)
1621 $SHR. r9,r3,r8 #are there any bits above r8'th?
1622 $TR 16,r9,r0 #if there are, signal to dump core...
1623Lppcasm_div2:
1624 $UCMP 0,r3,r5 #h>=d?
1625 blt Lppcasm_div3 #goto Lppcasm_div3 if not
1626 subf r3,r5,r3 #h-=d ;
1627Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i
1628 cmpi 0,0,r7,0 # is (i == 0)?
1629 beq Lppcasm_div4
1630 $SHL r3,r3,r7 # h = (h<< i)
1631 $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i)
1632 $SHL r5,r5,r7 # d<<=i
1633 or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i))
1634 $SHL r4,r4,r7 # l <<=i
1635Lppcasm_div4:
1636 $SHRI r9,r5,`$BITS/2` # r9 = dh
1637 # dl will be computed when needed
1638 # as it saves registers.
1639 li r6,2 #r6=2
1640 mtctr r6 #counter will be in count.
1641Lppcasm_divouterloop:
1642 $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4)
1643 $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4
1644 # compute here for innerloop.
1645 $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh
1646 bne Lppcasm_div5 # goto Lppcasm_div5 if not
1647
1648 li r8,-1
1649 $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
1650 b Lppcasm_div6
1651Lppcasm_div5:
1652 $UDIV r8,r3,r9 #q = h/dh
1653Lppcasm_div6:
1654 $UMULL r12,r9,r8 #th = q*dh
1655 $CLRU r10,r5,`$BITS/2` #r10=dl
1656 $UMULL r6,r8,r10 #tl = q*dl
1657
1658Lppcasm_divinnerloop:
1659 subf r10,r12,r3 #t = h -th
1660 $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of...
1661 addic. r7,r7,0 #test if r7 == 0. used below.
1662 # now want to compute
1663 # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
1664 # the following 2 instructions do that
1665 $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4)
1666 or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4)
1667 $UCMP cr1,r6,r7 # compare (tl <= r7)
1668 bne Lppcasm_divinnerexit
1669 ble cr1,Lppcasm_divinnerexit
1670 addi r8,r8,-1 #q--
1671 subf r12,r9,r12 #th -=dh
1672 $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop.
1673 subf r6,r10,r6 #tl -=dl
1674 b Lppcasm_divinnerloop
1675Lppcasm_divinnerexit:
1676 $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4)
1677 $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h;
1678 $UCMP cr1,r4,r11 # compare l and tl
1679 add r12,r12,r10 # th+=t
1680 bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7
1681 addi r12,r12,1 # th++
1682Lppcasm_div7:
1683 subf r11,r11,r4 #r11=l-tl
1684 $UCMP cr1,r3,r12 #compare h and th
1685 bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8
1686 addi r8,r8,-1 # q--
1687 add r3,r5,r3 # h+=d
1688Lppcasm_div8:
1689 subf r12,r12,r3 #r12 = h-th
1690 $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4
1691 # want to compute
1692 # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
1693 # the following 2 instructions will do this.
1694 $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2.
1695 $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3
1696 bdz Lppcasm_div9 #if (count==0) break ;
1697 $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4
1698 b Lppcasm_divouterloop
1699Lppcasm_div9:
1700 or r3,r8,r0
1701 blr
1702
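
In C terms, bn_div_words(h, l, d) returns the one-word quotient of the double word (h:l) divided by d, and the loop above builds it from two BN_BITS4-sized quotient digits after normalising d by its leading-zero count. A sketch of just that input/output contract, useful as a test oracle for the assembly (illustrative only, 16-bit words):

#!/usr/bin/env perl
# Contract of bn_div_words(): the 16-bit quotient of (h:l)/d, assuming h < d.
use strict;
use warnings;

my $BITS = 16;
my $MASK = (1 << $BITS) - 1;

sub bn_div_words_ref {
    my ($h, $l, $d) = @_;
    return $MASK if $d == 0;                    # the assembly returns -1 ("li r3,-1")
    return int((($h << $BITS) | $l) / $d) & $MASK;
}

printf "0x%04x\n", bn_div_words_ref(0x0001, 0x2345, 0x0003);   # 0x12345/3 = 0x6117
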
1703#
1704# NOTE: The following label name should be changed to
1705# "bn_sqr_words" i.e. remove the first dot
1706# for the gcc compiler. This should be automatically
1707# done in the build
1708#
1709.align 4
1710.bn_sqr_words:
1711#
1712# Optimized version of bn_sqr_words
1713#
1714# void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
1715#
1716# r3 = r
1717# r4 = a
1718# r5 = n
1719#
1720# r6 = a[i].
1721# r7,r8 = product.
1722#
1723# No unrolling done here. Not performance critical.
1724
1725 addic. r5,r5,0 #test r5.
1726 beq Lppcasm_sqr_adios
1727 addi r4,r4,-$BNSZ
1728 addi r3,r3,-$BNSZ
1729 mtctr r5
1730Lppcasm_sqr_mainloop:
1731 #sqr(r[0],r[1],a[0]);
1732 $LDU r6,$BNSZ(r4)
1733 $UMULL r7,r6,r6
1734 $UMULH r8,r6,r6
1735 $STU r7,$BNSZ(r3)
1736 $STU r8,$BNSZ(r3)
1737 bdnz- Lppcasm_sqr_mainloop
1738Lppcasm_sqr_adios:
1739 blr
1740
1741#
1742# NOTE: The following label name should be changed to
1743# "bn_mul_words" i.e. remove the first dot
1744# for the gcc compiler. This should be automatically
1745# done in the build
1746#
1747
1748.align 4
1749.bn_mul_words:
1750#
1751# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1752#
1753# r3 = rp
1754# r4 = ap
1755# r5 = num
1756# r6 = w
1757 xor r0,r0,r0
1758 xor r12,r12,r12 # used for carry
1759 rlwinm. r7,r5,30,2,31 # num >> 2
1760 beq Lppcasm_mw_REM
1761 mtctr r7
1762Lppcasm_mw_LOOP:
1763 #mul(rp[0],ap[0],w,c1);
1764 $LD r8,`0*$BNSZ`(r4)
1765 $UMULL r9,r6,r8
1766 $UMULH r10,r6,r8
1767 addc r9,r9,r12
1768 #addze r10,r10 #carry is NOT ignored.
1769 #will be taken care of
1770 #in second spin below
1771 #using adde.
1772 $ST r9,`0*$BNSZ`(r3)
1773 #mul(rp[1],ap[1],w,c1);
1774 $LD r8,`1*$BNSZ`(r4)
1775 $UMULL r11,r6,r8
1776 $UMULH r12,r6,r8
1777 adde r11,r11,r10
1778 #addze r12,r12
1779 $ST r11,`1*$BNSZ`(r3)
1780 #mul(rp[2],ap[2],w,c1);
1781 $LD r8,`2*$BNSZ`(r4)
1782 $UMULL r9,r6,r8
1783 $UMULH r10,r6,r8
1784 adde r9,r9,r12
1785 #addze r10,r10
1786 $ST r9,`2*$BNSZ`(r3)
1787 #mul_add(rp[3],ap[3],w,c1);
1788 $LD r8,`3*$BNSZ`(r4)
1789 $UMULL r11,r6,r8
1790 $UMULH r12,r6,r8
1791 adde r11,r11,r10
1792 addze r12,r12 #this spin we collect carry into
1793 #r12
1794 $ST r11,`3*$BNSZ`(r3)
1795
1796 addi r3,r3,`4*$BNSZ`
1797 addi r4,r4,`4*$BNSZ`
1798 bdnz- Lppcasm_mw_LOOP
1799
1800Lppcasm_mw_REM:
1801 andi. r5,r5,0x3
1802 beq Lppcasm_mw_OVER
1803 #mul(rp[0],ap[0],w,c1);
1804 $LD r8,`0*$BNSZ`(r4)
1805 $UMULL r9,r6,r8
1806 $UMULH r10,r6,r8
1807 addc r9,r9,r12
1808 addze r10,r10
1809 $ST r9,`0*$BNSZ`(r3)
1810 addi r12,r10,0
1811
1812 addi r5,r5,-1
1813 cmpli 0,0,r5,0
1814 beq Lppcasm_mw_OVER
1815
1816
1817 #mul(rp[1],ap[1],w,c1);
1818 $LD r8,`1*$BNSZ`(r4)
1819 $UMULL r9,r6,r8
1820 $UMULH r10,r6,r8
1821 addc r9,r9,r12
1822 addze r10,r10
1823 $ST r9,`1*$BNSZ`(r3)
1824 addi r12,r10,0
1825
1826 addi r5,r5,-1
1827 cmpli 0,0,r5,0
1828 beq Lppcasm_mw_OVER
1829
1830 #mul_add(rp[2],ap[2],w,c1);
1831 $LD r8,`2*$BNSZ`(r4)
1832 $UMULL r9,r6,r8
1833 $UMULH r10,r6,r8
1834 addc r9,r9,r12
1835 addze r10,r10
1836 $ST r9,`2*$BNSZ`(r3)
1837 addi r12,r10,0
1838
1839Lppcasm_mw_OVER:
1840 addi r3,r12,0
1841 blr
1842
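
bn_mul_words is a plain multiply-by-one-word pass: each rp[j] receives the low half of ap[j]*w plus the high half carried in from the previous word, and the final carry is returned (bn_mul_add_words below folds the old rp[j] into the same chain). A Perl mirror (illustrative only, 16-bit words):

#!/usr/bin/env perl
# Reference model of bn_mul_words(): rp = ap * w, returning the top carry word.
use strict;
use warnings;

my $BITS = 16;
my $MASK = (1 << $BITS) - 1;

sub bn_mul_words_ref {
    my ($ap, $w) = @_;
    my ($carry, @rp) = (0);
    for my $a (@$ap) {
        my $t = $a * $w + $carry;        # $UMULL/$UMULH plus the addc/adde chain
        push @rp, $t & $MASK;
        $carry = $t >> $BITS;
    }
    return ($carry, @rp);
}

my ($carry, @rp) = bn_mul_words_ref([0xffff, 0x0002], 0x0010);
printf "carry=0x%04x rp=[0x%04x 0x%04x]\n", $carry, @rp;   # 0x2ffff * 0x10 = 0x2ffff0
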
1843#
1844# NOTE: The following label name should be changed to
1845# "bn_mul_add_words" i.e. remove the first dot
1846# for the gcc compiler. This should be automatically
1847# done in the build
1848#
1849
1850.align 4
1851.bn_mul_add_words:
1852#
1853# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1854#
1855# r3 = rp
1856# r4 = ap
1857# r5 = num
1858# r6 = w
1859#
1860# empirical evidence suggests that the unrolled version performs best!!
1861#
1862 xor r0,r0,r0 #r0 = 0
1863 xor r12,r12,r12 #r12 = 0 . used for carry
1864 rlwinm. r7,r5,30,2,31 # num >> 2
1865 beq Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover
1866 mtctr r7
1867Lppcasm_maw_mainloop:
1868 #mul_add(rp[0],ap[0],w,c1);
1869 $LD r8,`0*$BNSZ`(r4)
1870 $LD r11,`0*$BNSZ`(r3)
1871 $UMULL r9,r6,r8
1872 $UMULH r10,r6,r8
1873 addc r9,r9,r12 #r12 is carry.
1874 addze r10,r10
1875 addc r9,r9,r11
1876 #addze r10,r10
1877 #the above instruction addze
1878 #is NOT needed. Carry will NOT
1879 #be ignored. It's not affected
1880 #by multiply and will be collected
1881 #in the next spin
1882 $ST r9,`0*$BNSZ`(r3)
1883
1884 #mul_add(rp[1],ap[1],w,c1);
1885 $LD r8,`1*$BNSZ`(r4)
1886 $LD r9,`1*$BNSZ`(r3)
1887 $UMULL r11,r6,r8
1888 $UMULH r12,r6,r8
1889 adde r11,r11,r10 #r10 is carry.
1890 addze r12,r12
1891 addc r11,r11,r9
1892 #addze r12,r12
1893 $ST r11,`1*$BNSZ`(r3)
1894
1895 #mul_add(rp[2],ap[2],w,c1);
1896 $LD r8,`2*$BNSZ`(r4)
1897 $UMULL r9,r6,r8
1898 $LD r11,`2*$BNSZ`(r3)
1899 $UMULH r10,r6,r8
1900 adde r9,r9,r12
1901 addze r10,r10
1902 addc r9,r9,r11
1903 #addze r10,r10
1904 $ST r9,`2*$BNSZ`(r3)
1905
1906 #mul_add(rp[3],ap[3],w,c1);
1907 $LD r8,`3*$BNSZ`(r4)
1908 $UMULL r11,r6,r8
1909 $LD r9,`3*$BNSZ`(r3)
1910 $UMULH r12,r6,r8
1911 adde r11,r11,r10
1912 addze r12,r12
1913 addc r11,r11,r9
1914 addze r12,r12
1915 $ST r11,`3*$BNSZ`(r3)
1916 addi r3,r3,`4*$BNSZ`
1917 addi r4,r4,`4*$BNSZ`
1918 bdnz- Lppcasm_maw_mainloop
1919
1920Lppcasm_maw_leftover:
1921 andi. r5,r5,0x3
1922 beq Lppcasm_maw_adios
1923 addi r3,r3,-$BNSZ
1924 addi r4,r4,-$BNSZ
1925 #mul_add(rp[0],ap[0],w,c1);
1926 mtctr r5
1927 $LDU r8,$BNSZ(r4)
1928 $UMULL r9,r6,r8
1929 $UMULH r10,r6,r8
1930 $LDU r11,$BNSZ(r3)
1931 addc r9,r9,r11
1932 addze r10,r10
1933 addc r9,r9,r12
1934 addze r12,r10
1935 $ST r9,0(r3)
1936
1937 bdz Lppcasm_maw_adios
1938 #mul_add(rp[1],ap[1],w,c1);
1939 $LDU r8,$BNSZ(r4)
1940 $UMULL r9,r6,r8
1941 $UMULH r10,r6,r8
1942 $LDU r11,$BNSZ(r3)
1943 addc r9,r9,r11
1944 addze r10,r10
1945 addc r9,r9,r12
1946 addze r12,r10
1947 $ST r9,0(r3)
1948
1949 bdz Lppcasm_maw_adios
1950 #mul_add(rp[2],ap[2],w,c1);
1951 $LDU r8,$BNSZ(r4)
1952 $UMULL r9,r6,r8
1953 $UMULH r10,r6,r8
1954 $LDU r11,$BNSZ(r3)
1955 addc r9,r9,r11
1956 addze r10,r10
1957 addc r9,r9,r12
1958 addze r12,r10
1959 $ST r9,0(r3)
1960
1961Lppcasm_maw_adios:
1962 addi r3,r12,0
1963 blr
1964 .align 4
1965EOF
1966$data =~ s/\`([^\`]*)\`/eval $1/gem;
1967print $data;
1968close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/x86-mont.pl b/src/lib/libcrypto/bn/asm/x86-mont.pl
deleted file mode 100755
index 6524651748..0000000000
--- a/src/lib/libcrypto/bn/asm/x86-mont.pl
+++ /dev/null
@@ -1,592 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# October 2005
11#
12# This is "teaser" code, as it can be improved in several ways...
13# First of all, a non-SSE2 path should be implemented (yes, for now it
14# performs Montgomery multiplication/convolution only on SSE2-capable
15# CPUs such as P4; others fall back to the original code). Then the inner loop
16# can be unrolled and modulo-scheduled to improve ILP and possibly
17# moved to 128-bit XMM register bank (though it would require input
18# rearrangement and/or increase bus bandwidth utilization). Dedicated
19# squaring procedure should give further performance improvement...
20# Yet, even as a draft, the code improves the rsa512 *sign* benchmark by
21# 110%(!), rsa1024 by 70% and rsa4096 by 20%:-)
22
23# December 2006
24#
25# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
26# Integer-only code [being equipped with dedicated squaring procedure]
27# gives ~40% on rsa512 sign benchmark...
28
29$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
30push(@INC,"${dir}","${dir}../../perlasm");
31require "x86asm.pl";
32
33&asm_init($ARGV[0],$0);
34
35$sse2=0;
36for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
37
38&external_label("OPENSSL_ia32cap_P") if ($sse2);
39
40&function_begin("bn_mul_mont");
41
42$i="edx";
43$j="ecx";
44$ap="esi"; $tp="esi"; # overlapping variables!!!
45$rp="edi"; $bp="edi"; # overlapping variables!!!
46$np="ebp";
47$num="ebx";
48
49$_num=&DWP(4*0,"esp"); # stack top layout
50$_rp=&DWP(4*1,"esp");
51$_ap=&DWP(4*2,"esp");
52$_bp=&DWP(4*3,"esp");
53$_np=&DWP(4*4,"esp");
54$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
55$_sp=&DWP(4*6,"esp");
56$_bpend=&DWP(4*7,"esp");
57$frame=32; # size of above frame rounded up to 16n
58
59 &xor ("eax","eax");
60 &mov ("edi",&wparam(5)); # int num
61 &cmp ("edi",4);
62 &jl (&label("just_leave"));
63
64 &lea ("esi",&wparam(0)); # put aside pointer to argument block
65 &lea ("edx",&wparam(1)); # load ap
66 &mov ("ebp","esp"); # saved stack pointer!
67 &add ("edi",2); # extra two words on top of tp
68 &neg ("edi");
69 &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
70 &neg ("edi");
71
72 # minimize cache contention by arranging 2K window between stack
73 # pointer and ap argument [np is also position sensitive vector,
74 # but it's assumed to be near ap, as it's allocated at ~same
75 # time].
76 &mov ("eax","esp");
77 &sub ("eax","edx");
78 &and ("eax",2047);
79 &sub ("esp","eax"); # this aligns sp and ap modulo 2048
80
81 &xor ("edx","esp");
82 &and ("edx",2048);
83 &xor ("edx",2048);
84 &sub ("esp","edx"); # this splits them apart modulo 4096
85
86 &and ("esp",-64); # align to cache line
87
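
The three adjustments above are pure address arithmetic: the first subtraction makes esp and the address held in edx agree modulo 2048, the xor/and pair then forces them to differ by 2048 modulo 4096, and the final and aligns the scratch area to a 64-byte cache line (which shifts the offset by at most 63 bytes). The same arithmetic in standalone Perl, with made-up example addresses (illustrative only):

#!/usr/bin/env perl
# Model of the tp placement: keep the scratch area ~2K away from ap modulo 4K.
use strict;
use warnings;

sub place_tp {
    my ($sp, $addr) = @_;                   # current esp and the address held in edx
    $sp -= ($sp - $addr) & 2047;            # align sp and addr modulo 2048
    $sp -= (($addr ^ $sp) & 2048) ^ 2048;   # then split them apart modulo 4096
    $sp &= ~63;                             # and align to a cache line
    return $sp;
}

my ($sp, $addr) = (0xbfffe700, 0x0805d123);     # hypothetical example values
my $tp = place_tp($sp, $addr);
printf "tp=%#010x, (tp-addr) mod 4096 = %d\n", $tp, ($tp - $addr) % 4096;
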
88 ################################# load argument block...
89 &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
90 &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
91 &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
92 &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
93 &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
94 #&mov ("edi",&DWP(5*4,"esi"));# int num
95
96 &mov ("esi",&DWP(0,"esi")); # pull n0[0]
97 &mov ($_rp,"eax"); # ... save a copy of argument block
98 &mov ($_ap,"ebx");
99 &mov ($_bp,"ecx");
100 &mov ($_np,"edx");
101 &mov ($_n0,"esi");
102 &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
103 #&mov ($_num,$num); # redundant as $num is not reused
104 &mov ($_sp,"ebp"); # saved stack pointer!
105
106if($sse2) {
107$acc0="mm0"; # mmx register bank layout
108$acc1="mm1";
109$car0="mm2";
110$car1="mm3";
111$mul0="mm4";
112$mul1="mm5";
113$temp="mm6";
114$mask="mm7";
115
116 &picsetup("eax");
117 &picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
118 &bt (&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
119 &jnc (&label("non_sse2"));
120
121 &mov ("eax",-1);
122 &movd ($mask,"eax"); # mask 32 lower bits
123
124 &mov ($ap,$_ap); # load input pointers
125 &mov ($bp,$_bp);
126 &mov ($np,$_np);
127
128 &xor ($i,$i); # i=0
129 &xor ($j,$j); # j=0
130
131 &movd ($mul0,&DWP(0,$bp)); # bp[0]
132 &movd ($mul1,&DWP(0,$ap)); # ap[0]
133 &movd ($car1,&DWP(0,$np)); # np[0]
134
135 &pmuludq($mul1,$mul0); # ap[0]*bp[0]
136 &movq ($car0,$mul1);
137 &movq ($acc0,$mul1); # I wish movd worked for
138 &pand ($acc0,$mask); # inter-register transfers
139
140 &pmuludq($mul1,$_n0q); # *=n0
141
142 &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
143 &paddq ($car1,$acc0);
144
145 &movd ($acc1,&DWP(4,$np)); # np[1]
146 &movd ($acc0,&DWP(4,$ap)); # ap[1]
147
148 &psrlq ($car0,32);
149 &psrlq ($car1,32);
150
151 &inc ($j); # j++
152&set_label("1st",16);
153 &pmuludq($acc0,$mul0); # ap[j]*bp[0]
154 &pmuludq($acc1,$mul1); # np[j]*m1
155 &paddq ($car0,$acc0); # +=c0
156 &paddq ($car1,$acc1); # +=c1
157
158 &movq ($acc0,$car0);
159 &pand ($acc0,$mask);
160 &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
161 &paddq ($car1,$acc0); # +=ap[j]*bp[0];
162 &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
163 &psrlq ($car0,32);
164 &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
165 &psrlq ($car1,32);
166
167 &lea ($j,&DWP(1,$j));
168 &cmp ($j,$num);
169 &jl (&label("1st"));
170
171 &pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
172 &pmuludq($acc1,$mul1); # np[num-1]*m1
173 &paddq ($car0,$acc0); # +=c0
174 &paddq ($car1,$acc1); # +=c1
175
176 &movq ($acc0,$car0);
177 &pand ($acc0,$mask);
178 &paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
179 &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
180
181 &psrlq ($car0,32);
182 &psrlq ($car1,32);
183
184 &paddq ($car1,$car0);
185 &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
186
187 &inc ($i); # i++
188&set_label("outer");
189 &xor ($j,$j); # j=0
190
191 &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
192 &movd ($mul1,&DWP(0,$ap)); # ap[0]
193 &movd ($temp,&DWP($frame,"esp")); # tp[0]
194 &movd ($car1,&DWP(0,$np)); # np[0]
195 &pmuludq($mul1,$mul0); # ap[0]*bp[i]
196
197 &paddq ($mul1,$temp); # +=tp[0]
198 &movq ($acc0,$mul1);
199 &movq ($car0,$mul1);
200 &pand ($acc0,$mask);
201
202 &pmuludq($mul1,$_n0q); # *=n0
203
204 &pmuludq($car1,$mul1);
205 &paddq ($car1,$acc0);
206
207 &movd ($temp,&DWP($frame+4,"esp")); # tp[1]
208 &movd ($acc1,&DWP(4,$np)); # np[1]
209 &movd ($acc0,&DWP(4,$ap)); # ap[1]
210
211 &psrlq ($car0,32);
212 &psrlq ($car1,32);
213 &paddq ($car0,$temp); # +=tp[1]
214
215 &inc ($j); # j++
216 &dec ($num);
217&set_label("inner");
218 &pmuludq($acc0,$mul0); # ap[j]*bp[i]
219 &pmuludq($acc1,$mul1); # np[j]*m1
220 &paddq ($car0,$acc0); # +=c0
221 &paddq ($car1,$acc1); # +=c1
222
223 &movq ($acc0,$car0);
224 &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
225 &pand ($acc0,$mask);
226 &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
227 &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
228 &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
229 &psrlq ($car0,32);
230 &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
231 &psrlq ($car1,32);
232 &paddq ($car0,$temp); # +=tp[j+1]
233
234 &dec ($num);
235 &lea ($j,&DWP(1,$j)); # j++
236 &jnz (&label("inner"));
237
238 &mov ($num,$j);
239 &pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
240 &pmuludq($acc1,$mul1); # np[num-1]*m1
241 &paddq ($car0,$acc0); # +=c0
242 &paddq ($car1,$acc1); # +=c1
243
244 &movq ($acc0,$car0);
245 &pand ($acc0,$mask);
246 &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
247 &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
248 &psrlq ($car0,32);
249 &psrlq ($car1,32);
250
251 &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
252 &paddq ($car1,$car0);
253 &paddq ($car1,$temp);
254 &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
255
256 &lea ($i,&DWP(1,$i)); # i++
257 &cmp ($i,$num);
258 &jle (&label("outer"));
259
260 &emms (); # done with mmx bank
261 &jmp (&label("common_tail"));
262
263&set_label("non_sse2",16);
264}
265
266if (0) {
267 &mov ("esp",$_sp);
268 &xor ("eax","eax"); # signal "not fast enough [yet]"
269 &jmp (&label("just_leave"));
270 # While the below code provides competitive performance for
271 # all key lengths on modern Intel cores, it's still more
272 # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
273 # means compared to the original integer-only assembler.
274 # 512-bit RSA sign is better by ~40%, but that's about all
275 # one can say about all CPUs...
276} else {
277$inp="esi"; # integer path uses these registers differently
278$word="edi";
279$carry="ebp";
280
281 &mov ($inp,$_ap);
282 &lea ($carry,&DWP(1,$num));
283 &mov ($word,$_bp);
284 &xor ($j,$j); # j=0
285 &mov ("edx",$inp);
286 &and ($carry,1); # see if num is even
287 &sub ("edx",$word); # see if ap==bp
288 &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
289 &or ($carry,"edx");
290 &mov ($word,&DWP(0,$word)); # bp[0]
291 &jz (&label("bn_sqr_mont"));
292 &mov ($_bpend,"eax");
293 &mov ("eax",&DWP(0,$inp));
294 &xor ("edx","edx");
295
296&set_label("mull",16);
297 &mov ($carry,"edx");
298 &mul ($word); # ap[j]*bp[0]
299 &add ($carry,"eax");
300 &lea ($j,&DWP(1,$j));
301 &adc ("edx",0);
302 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
303 &cmp ($j,$num);
304 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
305 &jl (&label("mull"));
306
307 &mov ($carry,"edx");
308 &mul ($word); # ap[num-1]*bp[0]
309 &mov ($word,$_n0);
310 &add ("eax",$carry);
311 &mov ($inp,$_np);
312 &adc ("edx",0);
313 &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
314
315 &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
316 &xor ($j,$j);
317 &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
318 &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
319
320 &mov ("eax",&DWP(0,$inp)); # np[0]
321 &mul ($word); # np[0]*m
322 &add ("eax",&DWP($frame,"esp")); # +=tp[0]
323 &mov ("eax",&DWP(4,$inp)); # np[1]
324 &adc ("edx",0);
325 &inc ($j);
326
327 &jmp (&label("2ndmadd"));
328
329&set_label("1stmadd",16);
330 &mov ($carry,"edx");
331 &mul ($word); # ap[j]*bp[i]
332 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
333 &lea ($j,&DWP(1,$j));
334 &adc ("edx",0);
335 &add ($carry,"eax");
336 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
337 &adc ("edx",0);
338 &cmp ($j,$num);
339 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
340 &jl (&label("1stmadd"));
341
342 &mov ($carry,"edx");
343 &mul ($word); # ap[num-1]*bp[i]
344 &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
345 &mov ($word,$_n0);
346 &adc ("edx",0);
347 &mov ($inp,$_np);
348 &add ($carry,"eax");
349 &adc ("edx",0);
350 &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
351
352 &xor ($j,$j);
353 &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
354 &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
355 &adc ($j,0);
356 &mov ("eax",&DWP(0,$inp)); # np[0]
357 &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
358 &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
359
360 &mul ($word); # np[0]*m
361 &add ("eax",&DWP($frame,"esp")); # +=tp[0]
362 &mov ("eax",&DWP(4,$inp)); # np[1]
363 &adc ("edx",0);
364 &mov ($j,1);
365
366&set_label("2ndmadd",16);
367 &mov ($carry,"edx");
368 &mul ($word); # np[j]*m
369 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
370 &lea ($j,&DWP(1,$j));
371 &adc ("edx",0);
372 &add ($carry,"eax");
373 &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
374 &adc ("edx",0);
375 &cmp ($j,$num);
376 &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
377 &jl (&label("2ndmadd"));
378
379 &mov ($carry,"edx");
380 &mul ($word); # np[j]*m
381 &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
382 &adc ("edx",0);
383 &add ($carry,"eax");
384 &adc ("edx",0);
385 &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
386
387 &xor ("eax","eax");
388 &mov ($j,$_bp); # &bp[i]
389 &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
390 &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
391 &lea ($j,&DWP(4,$j));
392 &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
393 &cmp ($j,$_bpend);
394 &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
395 &je (&label("common_tail"));
396
397 &mov ($word,&DWP(0,$j)); # bp[i+1]
398 &mov ($inp,$_ap);
399 &mov ($_bp,$j); # &bp[++i]
400 &xor ($j,$j);
401 &xor ("edx","edx");
402 &mov ("eax",&DWP(0,$inp));
403 &jmp (&label("1stmadd"));
404
405&set_label("bn_sqr_mont",16);
406$sbit=$num;
407 &mov ($_num,$num);
408 &mov ($_bp,$j); # i=0
409
410 &mov ("eax",$word); # ap[0]
411 &mul ($word); # ap[0]*ap[0]
412 &mov (&DWP($frame,"esp"),"eax"); # tp[0]=
413 &mov ($sbit,"edx");
414 &shr ("edx",1);
415 &and ($sbit,1);
416 &inc ($j);
417&set_label("sqr",16);
418 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
419 &mov ($carry,"edx");
420 &mul ($word); # ap[j]*ap[0]
421 &add ("eax",$carry);
422 &lea ($j,&DWP(1,$j));
423 &adc ("edx",0);
424 &lea ($carry,&DWP(0,$sbit,"eax",2));
425 &shr ("eax",31);
426 &cmp ($j,$_num);
427 &mov ($sbit,"eax");
428 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
429 &jl (&label("sqr"));
430
431 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
432 &mov ($carry,"edx");
433 &mul ($word); # ap[num-1]*ap[0]
434 &add ("eax",$carry);
435 &mov ($word,$_n0);
436 &adc ("edx",0);
437 &mov ($inp,$_np);
438 &lea ($carry,&DWP(0,$sbit,"eax",2));
439 &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
440 &shr ("eax",31);
441 &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
442
443 &lea ($carry,&DWP(0,"eax","edx",2));
444 &mov ("eax",&DWP(0,$inp)); # np[0]
445 &shr ("edx",31);
446 &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
447 &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
448
449 &mul ($word); # np[0]*m
450 &add ("eax",&DWP($frame,"esp")); # +=tp[0]
451 &mov ($num,$j);
452 &adc ("edx",0);
453 &mov ("eax",&DWP(4,$inp)); # np[1]
454 &mov ($j,1);
455
456&set_label("3rdmadd",16);
457 &mov ($carry,"edx");
458 &mul ($word); # np[j]*m
459 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
460 &adc ("edx",0);
461 &add ($carry,"eax");
462 &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
463 &adc ("edx",0);
464 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
465
466 &mov ($carry,"edx");
467 &mul ($word); # np[j+1]*m
468 &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
469 &lea ($j,&DWP(2,$j));
470 &adc ("edx",0);
471 &add ($carry,"eax");
472 &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
473 &adc ("edx",0);
474 &cmp ($j,$num);
475 &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
476 &jl (&label("3rdmadd"));
477
478 &mov ($carry,"edx");
479 &mul ($word); # np[j]*m
480 &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
481 &adc ("edx",0);
482 &add ($carry,"eax");
483 &adc ("edx",0);
484 &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
485
486 &mov ($j,$_bp); # i
487 &xor ("eax","eax");
488 &mov ($inp,$_ap);
489 &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
490 &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
491 &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
492 &cmp ($j,$num);
493 &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
494 &je (&label("common_tail"));
495
496 &mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
497 &lea ($j,&DWP(1,$j));
498 &mov ("eax",$word);
499 &mov ($_bp,$j); # ++i
500 &mul ($word); # ap[i]*ap[i]
501 &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
502 &adc ("edx",0);
503 &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
504 &xor ($carry,$carry);
505 &cmp ($j,$num);
506 &lea ($j,&DWP(1,$j));
507 &je (&label("sqrlast"));
508
509 &mov ($sbit,"edx"); # zaps $num
510 &shr ("edx",1);
511 &and ($sbit,1);
512&set_label("sqradd",16);
513 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
514 &mov ($carry,"edx");
515 &mul ($word); # ap[j]*ap[i]
516 &add ("eax",$carry);
517 &lea ($carry,&DWP(0,"eax","eax"));
518 &adc ("edx",0);
519 &shr ("eax",31);
520 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
521 &lea ($j,&DWP(1,$j));
522 &adc ("eax",0);
523 &add ($carry,$sbit);
524 &adc ("eax",0);
525 &cmp ($j,$_num);
526 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
527 &mov ($sbit,"eax");
528 &jle (&label("sqradd"));
529
530 &mov ($carry,"edx");
531 &add ("edx","edx");
532 &shr ($carry,31);
533 &add ("edx",$sbit);
534 &adc ($carry,0);
535&set_label("sqrlast");
536 &mov ($word,$_n0);
537 &mov ($inp,$_np);
538 &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
539
540 &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
541 &mov ("eax",&DWP(0,$inp)); # np[0]
542 &adc ($carry,0);
543 &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
544 &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
545
546 &mul ($word); # np[0]*m
547 &add ("eax",&DWP($frame,"esp")); # +=tp[0]
548 &lea ($num,&DWP(-1,$j));
549 &adc ("edx",0);
550 &mov ($j,1);
551 &mov ("eax",&DWP(4,$inp)); # np[1]
552
553 &jmp (&label("3rdmadd"));
554}
555
556&set_label("common_tail",16);
557 &mov ($np,$_np); # load modulus pointer
558 &mov ($rp,$_rp); # load result pointer
559 &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
560
561 &mov ("eax",&DWP(0,$tp)); # tp[0]
562 &mov ($j,$num); # j=num-1
563 &xor ($i,$i); # i=0 and clear CF!
564
565&set_label("sub",16);
566 &sbb ("eax",&DWP(0,$np,$i,4));
567 &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
568 &dec ($j); # doesn't affect CF!
569 &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
570 &lea ($i,&DWP(1,$i)); # i++
571 &jge (&label("sub"));
572
573 &sbb ("eax",0); # handle upmost overflow bit
574 &and ($tp,"eax");
575 &not ("eax");
576 &mov ($np,$rp);
577 &and ($np,"eax");
578 &or ($tp,$np); # tp=carry?tp:rp
579
580&set_label("copy",16); # copy or in-place refresh
581 &mov ("eax",&DWP(0,$tp,$num,4));
582 &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
583 &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
584 &dec ($num);
585 &jge (&label("copy"));
586
587 &mov ("esp",$_sp); # pull saved stack pointer
588 &mov ("eax",1);
589&set_label("just_leave");
590&function_end("bn_mul_mont");
591
592&asm_finish();
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont.pl b/src/lib/libcrypto/bn/asm/x86_64-mont.pl
deleted file mode 100755
index 30cfab4fce..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-mont.pl
+++ /dev/null
@@ -1,1503 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# October 2005.
11#
12# Montgomery multiplication routine for x86_64. While it gives a modest
13# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
14# than twice as fast. Most common rsa1024 sign is improved by a
15# respectable 50%. It remains to be seen if loop unrolling and
16# dedicated squaring routine can provide further improvement...
17
18# July 2011.
19#
20# Add dedicated squaring procedure. Performance improvement varies
21# from platform to platform, but on average it's ~5%/15%/25%/33%
22# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
23
24# August 2011.
25#
26# Unroll and modulo-schedule inner loops in such a manner that they
27# are "fallen through" for input lengths of 8, which is critical for
28# 1024-bit RSA *sign*. Average performance improvement in comparison
29# to *initial* version of this module from 2005 is ~0%/30%/40%/45%
30# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
31
32$flavour = shift;
33$output = shift;
34if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
35
36$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
37( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
38( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
39die "can't locate x86_64-xlate.pl";
40
41open OUT,"| \"$^X\" $xlate $flavour $output";
42*STDOUT=*OUT;
43
44# int bn_mul_mont(
45$rp="%rdi"; # BN_ULONG *rp,
46$ap="%rsi"; # const BN_ULONG *ap,
47$bp="%rdx"; # const BN_ULONG *bp,
48$np="%rcx"; # const BN_ULONG *np,
49$n0="%r8"; # const BN_ULONG *n0,
50$num="%r9"; # int num);
51$lo0="%r10";
52$hi0="%r11";
53$hi1="%r13";
54$i="%r14";
55$j="%r15";
56$m0="%rbx";
57$m1="%rbp";
58
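# A minimal reference sketch, not part of the original module (the helper
# name below is illustrative only): it restates what bn_mul_mont computes,
# namely rp = ap*bp*R^-1 mod np with R = 2^(64*num), as a single Montgomery
# reduction over Math::BigInt values.  The assembler that follows interleaves
# the same reduction word by word, driven by n0 = -np[0]^-1 mod 2^64.
use Math::BigInt;
sub bn_mul_mont_ref {
	my ($a, $b, $n, $num) = @_;		# Math::BigInt values, limb count
	my $R = Math::BigInt->new(1)->blsft(64*$num);
	my $nprime = $n->copy->bmodinv($R)->bneg->bmod($R);	# -n^-1 mod R
	my $t = $a->copy->bmul($b);				# t = a*b
	my $m = $t->copy->bmod($R)->bmul($nprime)->bmod($R);	# m = t*n' mod R
	$t->badd($m->bmul($n))->brsft(64*$num);			# (t+m*n)/R, exact
	$t->bsub($n) if $t->bcmp($n) >= 0;			# final subtraction
	return $t;				# a*b*R^-1 mod n, in [0,n)
}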
59$code=<<___;
60.text
61
62.globl bn_mul_mont
63.type bn_mul_mont,\@function,6
64.align 16
65bn_mul_mont:
66 _CET_ENDBR
67 test \$3,${num}d
68 jnz .Lmul_enter
69 cmp \$8,${num}d
70 jb .Lmul_enter
71 cmp $ap,$bp
72 jne .Lmul4x_enter
73 jmp .Lsqr4x_enter
74
75.align 16
76.Lmul_enter:
77 push %rbx
78 push %rbp
79 push %r12
80 push %r13
81 push %r14
82 push %r15
83
84 mov ${num}d,${num}d
85 lea 2($num),%r10
86 mov %rsp,%r11
87 neg %r10
88 lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2))
89 and \$-1024,%rsp # minimize TLB usage
90
91 mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
92.Lmul_body:
93 mov $bp,%r12 # reassign $bp
94___
95 $bp="%r12";
96$code.=<<___;
97 mov ($n0),$n0 # pull n0[0] value
98 mov ($bp),$m0 # m0=bp[0]
99 mov ($ap),%rax
100
101 xor $i,$i # i=0
102 xor $j,$j # j=0
103
104 mov $n0,$m1
105 mulq $m0 # ap[0]*bp[0]
106 mov %rax,$lo0
107 mov ($np),%rax
108
109 imulq $lo0,$m1 # "tp[0]"*n0
110 mov %rdx,$hi0
111
112 mulq $m1 # np[0]*m1
113 add %rax,$lo0 # discarded
114 mov 8($ap),%rax
115 adc \$0,%rdx
116 mov %rdx,$hi1
117
118 lea 1($j),$j # j++
119 jmp .L1st_enter
120
121.align 16
122.L1st:
123 add %rax,$hi1
124 mov ($ap,$j,8),%rax
125 adc \$0,%rdx
126 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
127 mov $lo0,$hi0
128 adc \$0,%rdx
129 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
130 mov %rdx,$hi1
131
132.L1st_enter:
133 mulq $m0 # ap[j]*bp[0]
134 add %rax,$hi0
135 mov ($np,$j,8),%rax
136 adc \$0,%rdx
137 lea 1($j),$j # j++
138 mov %rdx,$lo0
139
140 mulq $m1 # np[j]*m1
141 cmp $num,$j
142 jl .L1st
143
144 add %rax,$hi1
145 mov ($ap),%rax # ap[0]
146 adc \$0,%rdx
147 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
148 adc \$0,%rdx
149 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
150 mov %rdx,$hi1
151 mov $lo0,$hi0
152
153 xor %rdx,%rdx
154 add $hi0,$hi1
155 adc \$0,%rdx
156 mov $hi1,-8(%rsp,$num,8)
157 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
158
159 lea 1($i),$i # i++
160 jmp .Louter
161.align 16
162.Louter:
163 mov ($bp,$i,8),$m0 # m0=bp[i]
164 xor $j,$j # j=0
165 mov $n0,$m1
166 mov (%rsp),$lo0
167 mulq $m0 # ap[0]*bp[i]
168 add %rax,$lo0 # ap[0]*bp[i]+tp[0]
169 mov ($np),%rax
170 adc \$0,%rdx
171
172 imulq $lo0,$m1 # tp[0]*n0
173 mov %rdx,$hi0
174
175 mulq $m1 # np[0]*m1
176 add %rax,$lo0 # discarded
177 mov 8($ap),%rax
178 adc \$0,%rdx
179 mov 8(%rsp),$lo0 # tp[1]
180 mov %rdx,$hi1
181
182 lea 1($j),$j # j++
183 jmp .Linner_enter
184
185.align 16
186.Linner:
187 add %rax,$hi1
188 mov ($ap,$j,8),%rax
189 adc \$0,%rdx
190 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
191 mov (%rsp,$j,8),$lo0
192 adc \$0,%rdx
193 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
194 mov %rdx,$hi1
195
196.Linner_enter:
197 mulq $m0 # ap[j]*bp[i]
198 add %rax,$hi0
199 mov ($np,$j,8),%rax
200 adc \$0,%rdx
201 add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
202 mov %rdx,$hi0
203 adc \$0,$hi0
204 lea 1($j),$j # j++
205
206 mulq $m1 # np[j]*m1
207 cmp $num,$j
208 jl .Linner
209
210 add %rax,$hi1
211 mov ($ap),%rax # ap[0]
212 adc \$0,%rdx
213 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
214 mov (%rsp,$j,8),$lo0
215 adc \$0,%rdx
216 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
217 mov %rdx,$hi1
218
219 xor %rdx,%rdx
220 add $hi0,$hi1
221 adc \$0,%rdx
222 add $lo0,$hi1 # pull upmost overflow bit
223 adc \$0,%rdx
224 mov $hi1,-8(%rsp,$num,8)
225 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
226
227 lea 1($i),$i # i++
228 cmp $num,$i
229 jl .Louter
230
231 xor $i,$i # i=0 and clear CF!
232 mov (%rsp),%rax # tp[0]
233 lea (%rsp),$ap # borrow ap for tp
234 mov $num,$j # j=num
235 jmp .Lsub
236.align 16
237.Lsub: sbb ($np,$i,8),%rax
238 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
239 mov 8($ap,$i,8),%rax # tp[i+1]
240 lea 1($i),$i # i++
241	dec	$j		# doesn't affect CF!
242 jnz .Lsub
243
244 sbb \$0,%rax # handle upmost overflow bit
245 xor $i,$i
246 and %rax,$ap
247 not %rax
248 mov $rp,$np
249 and %rax,$np
250 mov $num,$j # j=num
251 or $np,$ap # ap=borrow?tp:rp
252.align 16
253.Lcopy: # copy or in-place refresh
254 mov ($ap,$i,8),%rax
255 mov $i,(%rsp,$i,8) # zap temporary vector
256 mov %rax,($rp,$i,8) # rp[i]=tp[i]
257 lea 1($i),$i
258 sub \$1,$j
259 jnz .Lcopy
260
261 mov 8(%rsp,$num,8),%rsi # restore %rsp
262 mov \$1,%rax
263 mov (%rsi),%r15
264 mov 8(%rsi),%r14
265 mov 16(%rsi),%r13
266 mov 24(%rsi),%r12
267 mov 32(%rsi),%rbp
268 mov 40(%rsi),%rbx
269 lea 48(%rsi),%rsp
270.Lmul_epilogue:
271 ret
272.size bn_mul_mont,.-bn_mul_mont
273___
274{{{
275my @A=("%r10","%r11");
276my @N=("%r13","%rdi");
277$code.=<<___;
278.type bn_mul4x_mont,\@function,6
279.align 16
280bn_mul4x_mont:
281.Lmul4x_enter:
282 _CET_ENDBR
283 push %rbx
284 push %rbp
285 push %r12
286 push %r13
287 push %r14
288 push %r15
289
290 mov ${num}d,${num}d
291 lea 4($num),%r10
292 mov %rsp,%r11
293 neg %r10
294 lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4))
295 and \$-1024,%rsp # minimize TLB usage
296
297 mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
298.Lmul4x_body:
299 mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
300 mov %rdx,%r12 # reassign $bp
301___
302 $bp="%r12";
303$code.=<<___;
304 mov ($n0),$n0 # pull n0[0] value
305 mov ($bp),$m0 # m0=bp[0]
306 mov ($ap),%rax
307
308 xor $i,$i # i=0
309 xor $j,$j # j=0
310
311 mov $n0,$m1
312 mulq $m0 # ap[0]*bp[0]
313 mov %rax,$A[0]
314 mov ($np),%rax
315
316 imulq $A[0],$m1 # "tp[0]"*n0
317 mov %rdx,$A[1]
318
319 mulq $m1 # np[0]*m1
320 add %rax,$A[0] # discarded
321 mov 8($ap),%rax
322 adc \$0,%rdx
323 mov %rdx,$N[1]
324
325 mulq $m0
326 add %rax,$A[1]
327 mov 8($np),%rax
328 adc \$0,%rdx
329 mov %rdx,$A[0]
330
331 mulq $m1
332 add %rax,$N[1]
333 mov 16($ap),%rax
334 adc \$0,%rdx
335 add $A[1],$N[1]
336 lea 4($j),$j # j++
337 adc \$0,%rdx
338 mov $N[1],(%rsp)
339 mov %rdx,$N[0]
340 jmp .L1st4x
341.align 16
342.L1st4x:
343 mulq $m0 # ap[j]*bp[0]
344 add %rax,$A[0]
345 mov -16($np,$j,8),%rax
346 adc \$0,%rdx
347 mov %rdx,$A[1]
348
349 mulq $m1 # np[j]*m1
350 add %rax,$N[0]
351 mov -8($ap,$j,8),%rax
352 adc \$0,%rdx
353 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
354 adc \$0,%rdx
355 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
356 mov %rdx,$N[1]
357
358 mulq $m0 # ap[j]*bp[0]
359 add %rax,$A[1]
360 mov -8($np,$j,8),%rax
361 adc \$0,%rdx
362 mov %rdx,$A[0]
363
364 mulq $m1 # np[j]*m1
365 add %rax,$N[1]
366 mov ($ap,$j,8),%rax
367 adc \$0,%rdx
368 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
369 adc \$0,%rdx
370 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
371 mov %rdx,$N[0]
372
373 mulq $m0 # ap[j]*bp[0]
374 add %rax,$A[0]
375 mov ($np,$j,8),%rax
376 adc \$0,%rdx
377 mov %rdx,$A[1]
378
379 mulq $m1 # np[j]*m1
380 add %rax,$N[0]
381 mov 8($ap,$j,8),%rax
382 adc \$0,%rdx
383 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
384 adc \$0,%rdx
385 mov $N[0],-8(%rsp,$j,8) # tp[j-1]
386 mov %rdx,$N[1]
387
388 mulq $m0 # ap[j]*bp[0]
389 add %rax,$A[1]
390 mov 8($np,$j,8),%rax
391 adc \$0,%rdx
392 lea 4($j),$j # j++
393 mov %rdx,$A[0]
394
395 mulq $m1 # np[j]*m1
396 add %rax,$N[1]
397 mov -16($ap,$j,8),%rax
398 adc \$0,%rdx
399 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
400 adc \$0,%rdx
401 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
402 mov %rdx,$N[0]
403 cmp $num,$j
404 jl .L1st4x
405
406 mulq $m0 # ap[j]*bp[0]
407 add %rax,$A[0]
408 mov -16($np,$j,8),%rax
409 adc \$0,%rdx
410 mov %rdx,$A[1]
411
412 mulq $m1 # np[j]*m1
413 add %rax,$N[0]
414 mov -8($ap,$j,8),%rax
415 adc \$0,%rdx
416 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
417 adc \$0,%rdx
418 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
419 mov %rdx,$N[1]
420
421 mulq $m0 # ap[j]*bp[0]
422 add %rax,$A[1]
423 mov -8($np,$j,8),%rax
424 adc \$0,%rdx
425 mov %rdx,$A[0]
426
427 mulq $m1 # np[j]*m1
428 add %rax,$N[1]
429 mov ($ap),%rax # ap[0]
430 adc \$0,%rdx
431 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
432 adc \$0,%rdx
433 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
434 mov %rdx,$N[0]
435
436 xor $N[1],$N[1]
437 add $A[0],$N[0]
438 adc \$0,$N[1]
439 mov $N[0],-8(%rsp,$j,8)
440 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
441
442 lea 1($i),$i # i++
443.align 4
444.Louter4x:
445 mov ($bp,$i,8),$m0 # m0=bp[i]
446 xor $j,$j # j=0
447 mov (%rsp),$A[0]
448 mov $n0,$m1
449 mulq $m0 # ap[0]*bp[i]
450 add %rax,$A[0] # ap[0]*bp[i]+tp[0]
451 mov ($np),%rax
452 adc \$0,%rdx
453
454 imulq $A[0],$m1 # tp[0]*n0
455 mov %rdx,$A[1]
456
457 mulq $m1 # np[0]*m1
458 add %rax,$A[0] # "$N[0]", discarded
459 mov 8($ap),%rax
460 adc \$0,%rdx
461 mov %rdx,$N[1]
462
463 mulq $m0 # ap[j]*bp[i]
464 add %rax,$A[1]
465 mov 8($np),%rax
466 adc \$0,%rdx
467 add 8(%rsp),$A[1] # +tp[1]
468 adc \$0,%rdx
469 mov %rdx,$A[0]
470
471 mulq $m1 # np[j]*m1
472 add %rax,$N[1]
473 mov 16($ap),%rax
474 adc \$0,%rdx
475 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
476 lea 4($j),$j # j+=2
477 adc \$0,%rdx
478 mov $N[1],(%rsp) # tp[j-1]
479 mov %rdx,$N[0]
480 jmp .Linner4x
481.align 16
482.Linner4x:
483 mulq $m0 # ap[j]*bp[i]
484 add %rax,$A[0]
485 mov -16($np,$j,8),%rax
486 adc \$0,%rdx
487 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
488 adc \$0,%rdx
489 mov %rdx,$A[1]
490
491 mulq $m1 # np[j]*m1
492 add %rax,$N[0]
493 mov -8($ap,$j,8),%rax
494 adc \$0,%rdx
495 add $A[0],$N[0]
496 adc \$0,%rdx
497 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
498 mov %rdx,$N[1]
499
500 mulq $m0 # ap[j]*bp[i]
501 add %rax,$A[1]
502 mov -8($np,$j,8),%rax
503 adc \$0,%rdx
504 add -8(%rsp,$j,8),$A[1]
505 adc \$0,%rdx
506 mov %rdx,$A[0]
507
508 mulq $m1 # np[j]*m1
509 add %rax,$N[1]
510 mov ($ap,$j,8),%rax
511 adc \$0,%rdx
512 add $A[1],$N[1]
513 adc \$0,%rdx
514 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
515 mov %rdx,$N[0]
516
517 mulq $m0 # ap[j]*bp[i]
518 add %rax,$A[0]
519 mov ($np,$j,8),%rax
520 adc \$0,%rdx
521 add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
522 adc \$0,%rdx
523 mov %rdx,$A[1]
524
525 mulq $m1 # np[j]*m1
526 add %rax,$N[0]
527 mov 8($ap,$j,8),%rax
528 adc \$0,%rdx
529 add $A[0],$N[0]
530 adc \$0,%rdx
531 mov $N[0],-8(%rsp,$j,8) # tp[j-1]
532 mov %rdx,$N[1]
533
534 mulq $m0 # ap[j]*bp[i]
535 add %rax,$A[1]
536 mov 8($np,$j,8),%rax
537 adc \$0,%rdx
538 add 8(%rsp,$j,8),$A[1]
539 adc \$0,%rdx
540 lea 4($j),$j # j++
541 mov %rdx,$A[0]
542
543 mulq $m1 # np[j]*m1
544 add %rax,$N[1]
545 mov -16($ap,$j,8),%rax
546 adc \$0,%rdx
547 add $A[1],$N[1]
548 adc \$0,%rdx
549 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
550 mov %rdx,$N[0]
551 cmp $num,$j
552 jl .Linner4x
553
554 mulq $m0 # ap[j]*bp[i]
555 add %rax,$A[0]
556 mov -16($np,$j,8),%rax
557 adc \$0,%rdx
558 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
559 adc \$0,%rdx
560 mov %rdx,$A[1]
561
562 mulq $m1 # np[j]*m1
563 add %rax,$N[0]
564 mov -8($ap,$j,8),%rax
565 adc \$0,%rdx
566 add $A[0],$N[0]
567 adc \$0,%rdx
568 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
569 mov %rdx,$N[1]
570
571 mulq $m0 # ap[j]*bp[i]
572 add %rax,$A[1]
573 mov -8($np,$j,8),%rax
574 adc \$0,%rdx
575 add -8(%rsp,$j,8),$A[1]
576 adc \$0,%rdx
577 lea 1($i),$i # i++
578 mov %rdx,$A[0]
579
580 mulq $m1 # np[j]*m1
581 add %rax,$N[1]
582 mov ($ap),%rax # ap[0]
583 adc \$0,%rdx
584 add $A[1],$N[1]
585 adc \$0,%rdx
586 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
587 mov %rdx,$N[0]
588
589 xor $N[1],$N[1]
590 add $A[0],$N[0]
591 adc \$0,$N[1]
592 add (%rsp,$num,8),$N[0] # pull upmost overflow bit
593 adc \$0,$N[1]
594 mov $N[0],-8(%rsp,$j,8)
595 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
596
597 cmp $num,$i
598 jl .Louter4x
599___
600{
601my @ri=("%rax","%rdx",$m0,$m1);
602$code.=<<___;
603 mov 16(%rsp,$num,8),$rp # restore $rp
604 mov 0(%rsp),@ri[0] # tp[0]
605 pxor %xmm0,%xmm0
606 mov 8(%rsp),@ri[1] # tp[1]
607 shr \$2,$num # num/=4
608 lea (%rsp),$ap # borrow ap for tp
609 xor $i,$i # i=0 and clear CF!
610
611 sub 0($np),@ri[0]
612 mov 16($ap),@ri[2] # tp[2]
613 mov 24($ap),@ri[3] # tp[3]
614 sbb 8($np),@ri[1]
615 lea -1($num),$j # j=num/4-1
616 jmp .Lsub4x
617.align 16
618.Lsub4x:
619 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
620 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
621 sbb 16($np,$i,8),@ri[2]
622 mov 32($ap,$i,8),@ri[0] # tp[i+1]
623 mov 40($ap,$i,8),@ri[1]
624 sbb 24($np,$i,8),@ri[3]
625 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
626 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
627 sbb 32($np,$i,8),@ri[0]
628 mov 48($ap,$i,8),@ri[2]
629 mov 56($ap,$i,8),@ri[3]
630 sbb 40($np,$i,8),@ri[1]
631 lea 4($i),$i # i++
632	dec	$j		# doesn't affect CF!
633 jnz .Lsub4x
634
635 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
636 mov 32($ap,$i,8),@ri[0] # load overflow bit
637 sbb 16($np,$i,8),@ri[2]
638 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
639 sbb 24($np,$i,8),@ri[3]
640 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
641
642 sbb \$0,@ri[0] # handle upmost overflow bit
643 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
644 xor $i,$i # i=0
645 and @ri[0],$ap
646 not @ri[0]
647 mov $rp,$np
648 and @ri[0],$np
649 lea -1($num),$j
650 or $np,$ap # ap=borrow?tp:rp
651
652 movdqu ($ap),%xmm1
653 movdqa %xmm0,(%rsp)
654 movdqu %xmm1,($rp)
655 jmp .Lcopy4x
656.align 16
657.Lcopy4x: # copy or in-place refresh
658 movdqu 16($ap,$i),%xmm2
659 movdqu 32($ap,$i),%xmm1
660 movdqa %xmm0,16(%rsp,$i)
661 movdqu %xmm2,16($rp,$i)
662 movdqa %xmm0,32(%rsp,$i)
663 movdqu %xmm1,32($rp,$i)
664 lea 32($i),$i
665 dec $j
666 jnz .Lcopy4x
667
668 shl \$2,$num
669 movdqu 16($ap,$i),%xmm2
670 movdqa %xmm0,16(%rsp,$i)
671 movdqu %xmm2,16($rp,$i)
672___
673}
674$code.=<<___;
675 mov 8(%rsp,$num,8),%rsi # restore %rsp
676 mov \$1,%rax
677 mov (%rsi),%r15
678 mov 8(%rsi),%r14
679 mov 16(%rsi),%r13
680 mov 24(%rsi),%r12
681 mov 32(%rsi),%rbp
682 mov 40(%rsi),%rbx
683 lea 48(%rsi),%rsp
684.Lmul4x_epilogue:
685 ret
686.size bn_mul4x_mont,.-bn_mul4x_mont
687___
688}}}
689 {{{
690######################################################################
691# void bn_sqr4x_mont(
692my $rptr="%rdi"; # const BN_ULONG *rptr,
693my $aptr="%rsi"; # const BN_ULONG *aptr,
694my $bptr="%rdx"; # not used
695my $nptr="%rcx"; # const BN_ULONG *nptr,
696my $n0 ="%r8"; # const BN_ULONG *n0);
697my $num ="%r9"; # int num, has to be divisible by 4 and
698 # not less than 8
699
700my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
701my @A0=("%r10","%r11");
702my @A1=("%r12","%r13");
703my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
704
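# A minimal sketch, not part of the original (illustrative helper name,
# $w-bit limbs assumed): it models the squaring strategy described in the
# comments inside the routine below, i.e. a) accumulate every cross product
# a[i]*a[j] with i<j, then b) double the whole partial result with a 1-bit
# left shift and only then fold in the a[i]*a[i] terms.
use Math::BigInt;
sub sqr_ref {
	my ($a, $num, $w) = @_;			# $a: limb array ref, $w: limb width
	my $t = Math::BigInt->bzero();
	for my $i (0 .. $num-1) {		# a) everything but a[i]*a[i]
		for my $j ($i+1 .. $num-1) {
			$t->badd(Math::BigInt->new($a->[$i])->bmul($a->[$j])
				 ->blsft($w*($i+$j)));
		}
	}
	$t->blsft(1);				# b) double the cross products...
	for my $i (0 .. $num-1) {		#    ...and add the squares
		$t->badd(Math::BigInt->new($a->[$i])->bmul($a->[$i])->blsft(2*$w*$i));
	}
	return $t;				# == (sum a[i]*2^(w*i))^2
}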
705$code.=<<___;
706.type bn_sqr4x_mont,\@function,6
707.align 16
708bn_sqr4x_mont:
709.Lsqr4x_enter:
710 _CET_ENDBR
711 push %rbx
712 push %rbp
713 push %r12
714 push %r13
715 push %r14
716 push %r15
717
718 shl \$3,${num}d # convert $num to bytes
719 xor %r10,%r10
720 mov %rsp,%r11 # put aside %rsp
721 sub $num,%r10 # -$num
722 mov ($n0),$n0 # *n0
723 lea -72(%rsp,%r10,2),%rsp # alloca(frame+2*$num)
724 and \$-1024,%rsp # minimize TLB usage
725 ##############################################################
726 # Stack layout
727 #
728 # +0 saved $num, used in reduction section
729 # +8 &t[2*$num], used in reduction section
730 # +32 saved $rptr
731 # +40 saved $nptr
732 # +48 saved *n0
733 # +56 saved %rsp
734 # +64 t[2*$num]
735 #
736 mov $rptr,32(%rsp) # save $rptr
737 mov $nptr,40(%rsp)
738 mov $n0, 48(%rsp)
739 mov %r11, 56(%rsp) # save original %rsp
740.Lsqr4x_body:
741 ##############################################################
742 # Squaring part:
743 #
744 # a) multiply-n-add everything but a[i]*a[i];
745 # b) shift result of a) by 1 to the left and accumulate
746 # a[i]*a[i] products;
747 #
748 lea 32(%r10),$i # $i=-($num-32)
749 lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2]
750
751 mov $num,$j # $j=$num
752
753 # comments apply to $num==8 case
754 mov -32($aptr,$i),$a0 # a[0]
755 lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
756 mov -24($aptr,$i),%rax # a[1]
757 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
758 mov -16($aptr,$i),$ai # a[2]
759 mov %rax,$a1
760
761 mul $a0 # a[1]*a[0]
762 mov %rax,$A0[0] # a[1]*a[0]
763 mov $ai,%rax # a[2]
764 mov %rdx,$A0[1]
765 mov $A0[0],-24($tptr,$i) # t[1]
766
767 xor $A0[0],$A0[0]
768 mul $a0 # a[2]*a[0]
769 add %rax,$A0[1]
770 mov $ai,%rax
771 adc %rdx,$A0[0]
772 mov $A0[1],-16($tptr,$i) # t[2]
773
774 lea -16($i),$j # j=-16
775
776
777 mov 8($aptr,$j),$ai # a[3]
778 mul $a1 # a[2]*a[1]
779 mov %rax,$A1[0] # a[2]*a[1]+t[3]
780 mov $ai,%rax
781 mov %rdx,$A1[1]
782
783 xor $A0[1],$A0[1]
784 add $A1[0],$A0[0]
785 lea 16($j),$j
786 adc \$0,$A0[1]
787 mul $a0 # a[3]*a[0]
788 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
789 mov $ai,%rax
790 adc %rdx,$A0[1]
791 mov $A0[0],-8($tptr,$j) # t[3]
792 jmp .Lsqr4x_1st
793
794.align 16
795.Lsqr4x_1st:
796 mov ($aptr,$j),$ai # a[4]
797 xor $A1[0],$A1[0]
798 mul $a1 # a[3]*a[1]
799 add %rax,$A1[1] # a[3]*a[1]+t[4]
800 mov $ai,%rax
801 adc %rdx,$A1[0]
802
803 xor $A0[0],$A0[0]
804 add $A1[1],$A0[1]
805 adc \$0,$A0[0]
806 mul $a0 # a[4]*a[0]
807 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
808 mov $ai,%rax # a[3]
809 adc %rdx,$A0[0]
810 mov $A0[1],($tptr,$j) # t[4]
811
812
813 mov 8($aptr,$j),$ai # a[5]
814 xor $A1[1],$A1[1]
815 mul $a1 # a[4]*a[3]
816 add %rax,$A1[0] # a[4]*a[3]+t[5]
817 mov $ai,%rax
818 adc %rdx,$A1[1]
819
820 xor $A0[1],$A0[1]
821 add $A1[0],$A0[0]
822 adc \$0,$A0[1]
823 mul $a0 # a[5]*a[2]
824 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
825 mov $ai,%rax
826 adc %rdx,$A0[1]
827 mov $A0[0],8($tptr,$j) # t[5]
828
829 mov 16($aptr,$j),$ai # a[6]
830 xor $A1[0],$A1[0]
831 mul $a1 # a[5]*a[3]
832 add %rax,$A1[1] # a[5]*a[3]+t[6]
833 mov $ai,%rax
834 adc %rdx,$A1[0]
835
836 xor $A0[0],$A0[0]
837 add $A1[1],$A0[1]
838 adc \$0,$A0[0]
839 mul $a0 # a[6]*a[2]
840 add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6]
841 mov $ai,%rax # a[3]
842 adc %rdx,$A0[0]
843 mov $A0[1],16($tptr,$j) # t[6]
844
845
846 mov 24($aptr,$j),$ai # a[7]
847 xor $A1[1],$A1[1]
848 mul $a1 # a[6]*a[5]
849 add %rax,$A1[0] # a[6]*a[5]+t[7]
850 mov $ai,%rax
851 adc %rdx,$A1[1]
852
853 xor $A0[1],$A0[1]
854 add $A1[0],$A0[0]
855 lea 32($j),$j
856 adc \$0,$A0[1]
857 mul $a0 # a[7]*a[4]
858 add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6]
859 mov $ai,%rax
860 adc %rdx,$A0[1]
861 mov $A0[0],-8($tptr,$j) # t[7]
862
863 cmp \$0,$j
864 jne .Lsqr4x_1st
865
866 xor $A1[0],$A1[0]
867 add $A0[1],$A1[1]
868 adc \$0,$A1[0]
869 mul $a1 # a[7]*a[5]
870 add %rax,$A1[1]
871 adc %rdx,$A1[0]
872
873 mov $A1[1],($tptr) # t[8]
874 lea 16($i),$i
875 mov $A1[0],8($tptr) # t[9]
876 jmp .Lsqr4x_outer
877
878.align 16
879.Lsqr4x_outer: # comments apply to $num==6 case
880 mov -32($aptr,$i),$a0 # a[0]
881 lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
882 mov -24($aptr,$i),%rax # a[1]
883 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
884 mov -16($aptr,$i),$ai # a[2]
885 mov %rax,$a1
886
887 mov -24($tptr,$i),$A0[0] # t[1]
888 xor $A0[1],$A0[1]
889 mul $a0 # a[1]*a[0]
890 add %rax,$A0[0] # a[1]*a[0]+t[1]
891 mov $ai,%rax # a[2]
892 adc %rdx,$A0[1]
893 mov $A0[0],-24($tptr,$i) # t[1]
894
895 xor $A0[0],$A0[0]
896 add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2]
897 adc \$0,$A0[0]
898 mul $a0 # a[2]*a[0]
899 add %rax,$A0[1]
900 mov $ai,%rax
901 adc %rdx,$A0[0]
902 mov $A0[1],-16($tptr,$i) # t[2]
903
904 lea -16($i),$j # j=-16
905 xor $A1[0],$A1[0]
906
907
908 mov 8($aptr,$j),$ai # a[3]
909 xor $A1[1],$A1[1]
910 add 8($tptr,$j),$A1[0]
911 adc \$0,$A1[1]
912 mul $a1 # a[2]*a[1]
913 add %rax,$A1[0] # a[2]*a[1]+t[3]
914 mov $ai,%rax
915 adc %rdx,$A1[1]
916
917 xor $A0[1],$A0[1]
918 add $A1[0],$A0[0]
919 adc \$0,$A0[1]
920 mul $a0 # a[3]*a[0]
921 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
922 mov $ai,%rax
923 adc %rdx,$A0[1]
924 mov $A0[0],8($tptr,$j) # t[3]
925
926 lea 16($j),$j
927 jmp .Lsqr4x_inner
928
929.align 16
930.Lsqr4x_inner:
931 mov ($aptr,$j),$ai # a[4]
932 xor $A1[0],$A1[0]
933 add ($tptr,$j),$A1[1]
934 adc \$0,$A1[0]
935 mul $a1 # a[3]*a[1]
936 add %rax,$A1[1] # a[3]*a[1]+t[4]
937 mov $ai,%rax
938 adc %rdx,$A1[0]
939
940 xor $A0[0],$A0[0]
941 add $A1[1],$A0[1]
942 adc \$0,$A0[0]
943 mul $a0 # a[4]*a[0]
944 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
945 mov $ai,%rax # a[3]
946 adc %rdx,$A0[0]
947 mov $A0[1],($tptr,$j) # t[4]
948
949 mov 8($aptr,$j),$ai # a[5]
950 xor $A1[1],$A1[1]
951 add 8($tptr,$j),$A1[0]
952 adc \$0,$A1[1]
953 mul $a1 # a[4]*a[3]
954 add %rax,$A1[0] # a[4]*a[3]+t[5]
955 mov $ai,%rax
956 adc %rdx,$A1[1]
957
958 xor $A0[1],$A0[1]
959 add $A1[0],$A0[0]
960 lea 16($j),$j # j++
961 adc \$0,$A0[1]
962 mul $a0 # a[5]*a[2]
963 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
964 mov $ai,%rax
965 adc %rdx,$A0[1]
966 mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below
967
968 cmp \$0,$j
969 jne .Lsqr4x_inner
970
971 xor $A1[0],$A1[0]
972 add $A0[1],$A1[1]
973 adc \$0,$A1[0]
974 mul $a1 # a[5]*a[3]
975 add %rax,$A1[1]
976 adc %rdx,$A1[0]
977
978 mov $A1[1],($tptr) # t[6], "preloaded t[2]" below
979 mov $A1[0],8($tptr) # t[7], "preloaded t[3]" below
980
981 add \$16,$i
982 jnz .Lsqr4x_outer
983
984 # comments apply to $num==4 case
985 mov -32($aptr),$a0 # a[0]
986 lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
987 mov -24($aptr),%rax # a[1]
988 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
989 mov -16($aptr),$ai # a[2]
990 mov %rax,$a1
991
992 xor $A0[1],$A0[1]
993 mul $a0 # a[1]*a[0]
994 add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1]
995 mov $ai,%rax # a[2]
996 adc %rdx,$A0[1]
997 mov $A0[0],-24($tptr) # t[1]
998
999 xor $A0[0],$A0[0]
1000 add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2]
1001 adc \$0,$A0[0]
1002 mul $a0 # a[2]*a[0]
1003 add %rax,$A0[1]
1004 mov $ai,%rax
1005 adc %rdx,$A0[0]
1006 mov $A0[1],-16($tptr) # t[2]
1007
1008 mov -8($aptr),$ai # a[3]
1009 mul $a1 # a[2]*a[1]
1010 add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3]
1011 mov $ai,%rax
1012 adc \$0,%rdx
1013
1014 xor $A0[1],$A0[1]
1015 add $A1[0],$A0[0]
1016 mov %rdx,$A1[1]
1017 adc \$0,$A0[1]
1018 mul $a0 # a[3]*a[0]
1019 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
1020 mov $ai,%rax
1021 adc %rdx,$A0[1]
1022 mov $A0[0],-8($tptr) # t[3]
1023
1024 xor $A1[0],$A1[0]
1025 add $A0[1],$A1[1]
1026 adc \$0,$A1[0]
1027 mul $a1 # a[3]*a[1]
1028 add %rax,$A1[1]
1029 mov -16($aptr),%rax # a[2]
1030 adc %rdx,$A1[0]
1031
1032 mov $A1[1],($tptr) # t[4]
1033 mov $A1[0],8($tptr) # t[5]
1034
1035 mul $ai # a[2]*a[3]
1036___
1037{
1038my ($shift,$carry)=($a0,$a1);
1039my @S=(@A1,$ai,$n0);
1040$code.=<<___;
1041 add \$16,$i
1042 xor $shift,$shift
1043 sub $num,$i # $i=16-$num
1044 xor $carry,$carry
1045
1046 add $A1[0],%rax # t[5]
1047 adc \$0,%rdx
1048 mov %rax,8($tptr) # t[5]
1049 mov %rdx,16($tptr) # t[6]
1050 mov $carry,24($tptr) # t[7]
1051
1052 mov -16($aptr,$i),%rax # a[0]
1053 lea 64(%rsp,$num,2),$tptr
1054 xor $A0[0],$A0[0] # t[0]
1055 mov -24($tptr,$i,2),$A0[1] # t[1]
1056
1057 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1058 shr \$63,$A0[0]
1059 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1060 shr \$63,$A0[1]
1061 or $A0[0],$S[1] # | t[2*i]>>63
1062 mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
1063 mov $A0[1],$shift # shift=t[2*i+1]>>63
1064 mul %rax # a[i]*a[i]
1065 neg $carry # mov $carry,cf
1066 mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
1067 adc %rax,$S[0]
1068 mov -8($aptr,$i),%rax # a[i+1] # prefetch
1069 mov $S[0],-32($tptr,$i,2)
1070 adc %rdx,$S[1]
1071
1072 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1073 mov $S[1],-24($tptr,$i,2)
1074 sbb $carry,$carry # mov cf,$carry
1075 shr \$63,$A0[0]
1076 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1077 shr \$63,$A0[1]
1078 or $A0[0],$S[3] # | t[2*i]>>63
1079 mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
1080 mov $A0[1],$shift # shift=t[2*i+1]>>63
1081 mul %rax # a[i]*a[i]
1082 neg $carry # mov $carry,cf
1083 mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
1084 adc %rax,$S[2]
1085 mov 0($aptr,$i),%rax # a[i+1] # prefetch
1086 mov $S[2],-16($tptr,$i,2)
1087 adc %rdx,$S[3]
1088 lea 16($i),$i
1089 mov $S[3],-40($tptr,$i,2)
1090 sbb $carry,$carry # mov cf,$carry
1091 jmp .Lsqr4x_shift_n_add
1092
1093.align 16
1094.Lsqr4x_shift_n_add:
1095 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1096 shr \$63,$A0[0]
1097 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1098 shr \$63,$A0[1]
1099 or $A0[0],$S[1] # | t[2*i]>>63
1100 mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
1101 mov $A0[1],$shift # shift=t[2*i+1]>>63
1102 mul %rax # a[i]*a[i]
1103 neg $carry # mov $carry,cf
1104 mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
1105 adc %rax,$S[0]
1106 mov -8($aptr,$i),%rax # a[i+1] # prefetch
1107 mov $S[0],-32($tptr,$i,2)
1108 adc %rdx,$S[1]
1109
1110 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1111 mov $S[1],-24($tptr,$i,2)
1112 sbb $carry,$carry # mov cf,$carry
1113 shr \$63,$A0[0]
1114 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1115 shr \$63,$A0[1]
1116 or $A0[0],$S[3] # | t[2*i]>>63
1117 mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
1118 mov $A0[1],$shift # shift=t[2*i+1]>>63
1119 mul %rax # a[i]*a[i]
1120 neg $carry # mov $carry,cf
1121 mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
1122 adc %rax,$S[2]
1123 mov 0($aptr,$i),%rax # a[i+1] # prefetch
1124 mov $S[2],-16($tptr,$i,2)
1125 adc %rdx,$S[3]
1126
1127 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1128 mov $S[3],-8($tptr,$i,2)
1129 sbb $carry,$carry # mov cf,$carry
1130 shr \$63,$A0[0]
1131 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1132 shr \$63,$A0[1]
1133 or $A0[0],$S[1] # | t[2*i]>>63
1134 mov 16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
1135 mov $A0[1],$shift # shift=t[2*i+1]>>63
1136 mul %rax # a[i]*a[i]
1137 neg $carry # mov $carry,cf
1138 mov 24($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
1139 adc %rax,$S[0]
1140 mov 8($aptr,$i),%rax # a[i+1] # prefetch
1141 mov $S[0],0($tptr,$i,2)
1142 adc %rdx,$S[1]
1143
1144 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1145 mov $S[1],8($tptr,$i,2)
1146 sbb $carry,$carry # mov cf,$carry
1147 shr \$63,$A0[0]
1148 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1149 shr \$63,$A0[1]
1150 or $A0[0],$S[3] # | t[2*i]>>63
1151 mov 32($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
1152 mov $A0[1],$shift # shift=t[2*i+1]>>63
1153 mul %rax # a[i]*a[i]
1154 neg $carry # mov $carry,cf
1155 mov 40($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
1156 adc %rax,$S[2]
1157 mov 16($aptr,$i),%rax # a[i+1] # prefetch
1158 mov $S[2],16($tptr,$i,2)
1159 adc %rdx,$S[3]
1160 mov $S[3],24($tptr,$i,2)
1161 sbb $carry,$carry # mov cf,$carry
1162 add \$32,$i
1163 jnz .Lsqr4x_shift_n_add
1164
1165 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1166 shr \$63,$A0[0]
1167 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1168 shr \$63,$A0[1]
1169 or $A0[0],$S[1] # | t[2*i]>>63
1170 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
1171 mov $A0[1],$shift # shift=t[2*i+1]>>63
1172 mul %rax # a[i]*a[i]
1173 neg $carry # mov $carry,cf
1174 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
1175 adc %rax,$S[0]
1176 mov -8($aptr),%rax # a[i+1] # prefetch
1177 mov $S[0],-32($tptr)
1178 adc %rdx,$S[1]
1179
1180 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
1181 mov $S[1],-24($tptr)
1182 sbb $carry,$carry # mov cf,$carry
1183 shr \$63,$A0[0]
1184 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1185 shr \$63,$A0[1]
1186 or $A0[0],$S[3] # | t[2*i]>>63
1187 mul %rax # a[i]*a[i]
1188 neg $carry # mov $carry,cf
1189 adc %rax,$S[2]
1190 adc %rdx,$S[3]
1191 mov $S[2],-16($tptr)
1192 mov $S[3],-8($tptr)
1193___
1194}
1195##############################################################
1196# Montgomery reduction part, "word-by-word" algorithm.
1197#
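# A minimal textbook model, not part of the original (illustrative helper
# name): the reduction below is the same word-by-word algorithm, just
# modulo-scheduled.  For each limb position i, m = t[i]*n0 mod 2^w is chosen
# so that adding m*n*2^(w*i) clears limb i; after num steps t is divisible
# by R = 2^(w*num).  Assumes t < R*n on entry, as holds for a squaring
# result, and reduces $t in place.
use Math::BigInt;
sub mont_redc_ref {
	my ($t, $n, $n0, $num, $w) = @_;	# Math::BigInt t,n; n0 = -n^-1 mod 2^w
	my $mask = Math::BigInt->new(1)->blsft($w)->bdec();	# 2^w - 1
	for my $i (0 .. $num-1) {
		my $m = $t->copy->brsft($w*$i)->band($mask)	# t[i]
			    ->bmul($n0)->band($mask);		# m = t[i]*n0 mod 2^w
		$t->badd($m->bmul($n)->blsft($w*$i));		# limb i of t -> 0
	}
	$t->brsft($w*$num);			# exact division by R
	$t->bsub($n) if $t->bcmp($n) >= 0;	# conditional final subtraction
	return $t;				# t_in * R^-1 mod n
}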
1198{
1199my ($topbit,$nptr)=("%rbp",$aptr);
1200my ($m0,$m1)=($a0,$a1);
1201my @Ni=("%rbx","%r9");
1202$code.=<<___;
1203 mov 40(%rsp),$nptr # restore $nptr
1204 mov 48(%rsp),$n0 # restore *n0
1205 xor $j,$j
1206 mov $num,0(%rsp) # save $num
1207 sub $num,$j # $j=-$num
1208 mov 64(%rsp),$A0[0] # t[0] # modsched #
1209 mov $n0,$m0 # # modsched #
1210 lea 64(%rsp,$num,2),%rax # end of t[] buffer
1211 lea 64(%rsp,$num),$tptr # end of t[] window
1212 mov %rax,8(%rsp) # save end of t[] buffer
1213 lea ($nptr,$num),$nptr # end of n[] buffer
1214 xor $topbit,$topbit # $topbit=0
1215
1216 mov 0($nptr,$j),%rax # n[0] # modsched #
1217 mov 8($nptr,$j),$Ni[1] # n[1] # modsched #
1218 imulq $A0[0],$m0 # m0=t[0]*n0 # modsched #
1219 mov %rax,$Ni[0] # # modsched #
1220 jmp .Lsqr4x_mont_outer
1221
1222.align 16
1223.Lsqr4x_mont_outer:
1224 xor $A0[1],$A0[1]
1225 mul $m0 # n[0]*m0
1226 add %rax,$A0[0] # n[0]*m0+t[0]
1227 mov $Ni[1],%rax
1228 adc %rdx,$A0[1]
1229 mov $n0,$m1
1230
1231 xor $A0[0],$A0[0]
1232 add 8($tptr,$j),$A0[1]
1233 adc \$0,$A0[0]
1234 mul $m0 # n[1]*m0
1235 add %rax,$A0[1] # n[1]*m0+t[1]
1236 mov $Ni[0],%rax
1237 adc %rdx,$A0[0]
1238
1239 imulq $A0[1],$m1
1240
1241 mov 16($nptr,$j),$Ni[0] # n[2]
1242 xor $A1[1],$A1[1]
1243 add $A0[1],$A1[0]
1244 adc \$0,$A1[1]
1245 mul $m1 # n[0]*m1
1246 add %rax,$A1[0] # n[0]*m1+"t[1]"
1247 mov $Ni[0],%rax
1248 adc %rdx,$A1[1]
1249 mov $A1[0],8($tptr,$j) # "t[1]"
1250
1251 xor $A0[1],$A0[1]
1252 add 16($tptr,$j),$A0[0]
1253 adc \$0,$A0[1]
1254 mul $m0 # n[2]*m0
1255 add %rax,$A0[0] # n[2]*m0+t[2]
1256 mov $Ni[1],%rax
1257 adc %rdx,$A0[1]
1258
1259 mov 24($nptr,$j),$Ni[1] # n[3]
1260 xor $A1[0],$A1[0]
1261 add $A0[0],$A1[1]
1262 adc \$0,$A1[0]
1263 mul $m1 # n[1]*m1
1264 add %rax,$A1[1] # n[1]*m1+"t[2]"
1265 mov $Ni[1],%rax
1266 adc %rdx,$A1[0]
1267 mov $A1[1],16($tptr,$j) # "t[2]"
1268
1269 xor $A0[0],$A0[0]
1270 add 24($tptr,$j),$A0[1]
1271 lea 32($j),$j
1272 adc \$0,$A0[0]
1273 mul $m0 # n[3]*m0
1274 add %rax,$A0[1] # n[3]*m0+t[3]
1275 mov $Ni[0],%rax
1276 adc %rdx,$A0[0]
1277 jmp .Lsqr4x_mont_inner
1278
1279.align 16
1280.Lsqr4x_mont_inner:
1281 mov ($nptr,$j),$Ni[0] # n[4]
1282 xor $A1[1],$A1[1]
1283 add $A0[1],$A1[0]
1284 adc \$0,$A1[1]
1285 mul $m1 # n[2]*m1
1286 add %rax,$A1[0] # n[2]*m1+"t[3]"
1287 mov $Ni[0],%rax
1288 adc %rdx,$A1[1]
1289 mov $A1[0],-8($tptr,$j) # "t[3]"
1290
1291 xor $A0[1],$A0[1]
1292 add ($tptr,$j),$A0[0]
1293 adc \$0,$A0[1]
1294 mul $m0 # n[4]*m0
1295 add %rax,$A0[0] # n[4]*m0+t[4]
1296 mov $Ni[1],%rax
1297 adc %rdx,$A0[1]
1298
1299 mov 8($nptr,$j),$Ni[1] # n[5]
1300 xor $A1[0],$A1[0]
1301 add $A0[0],$A1[1]
1302 adc \$0,$A1[0]
1303 mul $m1 # n[3]*m1
1304 add %rax,$A1[1] # n[3]*m1+"t[4]"
1305 mov $Ni[1],%rax
1306 adc %rdx,$A1[0]
1307 mov $A1[1],($tptr,$j) # "t[4]"
1308
1309 xor $A0[0],$A0[0]
1310 add 8($tptr,$j),$A0[1]
1311 adc \$0,$A0[0]
1312 mul $m0 # n[5]*m0
1313 add %rax,$A0[1] # n[5]*m0+t[5]
1314 mov $Ni[0],%rax
1315 adc %rdx,$A0[0]
1316
1317
1318 mov 16($nptr,$j),$Ni[0] # n[6]
1319 xor $A1[1],$A1[1]
1320 add $A0[1],$A1[0]
1321 adc \$0,$A1[1]
1322 mul $m1 # n[4]*m1
1323 add %rax,$A1[0] # n[4]*m1+"t[5]"
1324 mov $Ni[0],%rax
1325 adc %rdx,$A1[1]
1326 mov $A1[0],8($tptr,$j) # "t[5]"
1327
1328 xor $A0[1],$A0[1]
1329 add 16($tptr,$j),$A0[0]
1330 adc \$0,$A0[1]
1331 mul $m0 # n[6]*m0
1332 add %rax,$A0[0] # n[6]*m0+t[6]
1333 mov $Ni[1],%rax
1334 adc %rdx,$A0[1]
1335
1336 mov 24($nptr,$j),$Ni[1] # n[7]
1337 xor $A1[0],$A1[0]
1338 add $A0[0],$A1[1]
1339 adc \$0,$A1[0]
1340 mul $m1 # n[5]*m1
1341 add %rax,$A1[1] # n[5]*m1+"t[6]"
1342 mov $Ni[1],%rax
1343 adc %rdx,$A1[0]
1344 mov $A1[1],16($tptr,$j) # "t[6]"
1345
1346 xor $A0[0],$A0[0]
1347 add 24($tptr,$j),$A0[1]
1348 lea 32($j),$j
1349 adc \$0,$A0[0]
1350 mul $m0 # n[7]*m0
1351 add %rax,$A0[1] # n[7]*m0+t[7]
1352 mov $Ni[0],%rax
1353 adc %rdx,$A0[0]
1354 cmp \$0,$j
1355 jne .Lsqr4x_mont_inner
1356
1357 sub 0(%rsp),$j # $j=-$num # modsched #
1358 mov $n0,$m0 # # modsched #
1359
1360 xor $A1[1],$A1[1]
1361 add $A0[1],$A1[0]
1362 adc \$0,$A1[1]
1363 mul $m1 # n[6]*m1
1364 add %rax,$A1[0] # n[6]*m1+"t[7]"
1365 mov $Ni[1],%rax
1366 adc %rdx,$A1[1]
1367 mov $A1[0],-8($tptr) # "t[7]"
1368
1369 xor $A0[1],$A0[1]
1370 add ($tptr),$A0[0] # +t[8]
1371 adc \$0,$A0[1]
1372 mov 0($nptr,$j),$Ni[0] # n[0] # modsched #
1373 add $topbit,$A0[0]
1374 adc \$0,$A0[1]
1375
1376 imulq 16($tptr,$j),$m0 # m0=t[0]*n0 # modsched #
1377 xor $A1[0],$A1[0]
1378 mov 8($nptr,$j),$Ni[1] # n[1] # modsched #
1379 add $A0[0],$A1[1]
1380 mov 16($tptr,$j),$A0[0] # t[0] # modsched #
1381 adc \$0,$A1[0]
1382 mul $m1 # n[7]*m1
1383 add %rax,$A1[1] # n[7]*m1+"t[8]"
1384 mov $Ni[0],%rax # # modsched #
1385 adc %rdx,$A1[0]
1386 mov $A1[1],($tptr) # "t[8]"
1387
1388 xor $topbit,$topbit
1389 add 8($tptr),$A1[0] # +t[9]
1390 adc $topbit,$topbit
1391 add $A0[1],$A1[0]
1392 lea 16($tptr),$tptr # "t[$num]>>128"
1393 adc \$0,$topbit
1394 mov $A1[0],-8($tptr) # "t[9]"
1395 cmp 8(%rsp),$tptr # are we done?
1396 jb .Lsqr4x_mont_outer
1397
1398 mov 0(%rsp),$num # restore $num
1399 mov $topbit,($tptr) # save $topbit
1400___
1401}
1402##############################################################
1403# Post-condition, 4x unrolled copy from bn_mul_mont
1404#
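# A minimal sketch, not part of the original (illustrative helper name): the
# tail below subtracts n from t with borrow, then chooses the copy source
# without a branch, exactly like the and/not/or sequence on the two pointers
# after the .Lsqr4x_sub loop; $mask is all-ones when the subtraction
# borrowed, zero otherwise.
sub ct_select {
	my ($mask, $if_borrow, $if_not) = @_;	# $mask: 0 or ~0
	return ($if_borrow & $mask) | ($if_not & ~$mask);
}
# conceptually, the copy loop reads from ct_select($borrow_mask, $tp, $rp).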
1405{
1406my ($tptr,$nptr)=("%rbx",$aptr);
1407my @ri=("%rax","%rdx","%r10","%r11");
1408$code.=<<___;
1409 mov 64(%rsp,$num),@ri[0] # tp[0]
1410 lea 64(%rsp,$num),$tptr # upper half of t[2*$num] holds result
1411 mov 40(%rsp),$nptr # restore $nptr
1412 shr \$5,$num # num/4
1413 mov 8($tptr),@ri[1] # t[1]
1414 xor $i,$i # i=0 and clear CF!
1415
1416 mov 32(%rsp),$rptr # restore $rptr
1417 sub 0($nptr),@ri[0]
1418 mov 16($tptr),@ri[2] # t[2]
1419 mov 24($tptr),@ri[3] # t[3]
1420 sbb 8($nptr),@ri[1]
1421 lea -1($num),$j # j=num/4-1
1422 jmp .Lsqr4x_sub
1423.align 16
1424.Lsqr4x_sub:
1425 mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i]
1426 mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i]
1427 sbb 16($nptr,$i,8),@ri[2]
1428 mov 32($tptr,$i,8),@ri[0] # tp[i+1]
1429 mov 40($tptr,$i,8),@ri[1]
1430 sbb 24($nptr,$i,8),@ri[3]
1431 mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i]
1432 mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i]
1433 sbb 32($nptr,$i,8),@ri[0]
1434 mov 48($tptr,$i,8),@ri[2]
1435 mov 56($tptr,$i,8),@ri[3]
1436 sbb 40($nptr,$i,8),@ri[1]
1437 lea 4($i),$i # i++
1438	dec	$j			# doesn't affect CF!
1439 jnz .Lsqr4x_sub
1440
1441 mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i]
1442 mov 32($tptr,$i,8),@ri[0] # load overflow bit
1443 sbb 16($nptr,$i,8),@ri[2]
1444 mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i]
1445 sbb 24($nptr,$i,8),@ri[3]
1446 mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i]
1447
1448 sbb \$0,@ri[0] # handle upmost overflow bit
1449 mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i]
1450 xor $i,$i # i=0
1451 and @ri[0],$tptr
1452 not @ri[0]
1453 mov $rptr,$nptr
1454 and @ri[0],$nptr
1455 lea -1($num),$j
1456 or $nptr,$tptr # tp=borrow?tp:rp
1457
1458 pxor %xmm0,%xmm0
1459 lea 64(%rsp,$num,8),$nptr
1460 movdqu ($tptr),%xmm1
1461 lea ($nptr,$num,8),$nptr
1462 movdqa %xmm0,64(%rsp) # zap lower half of temporary vector
1463 movdqa %xmm0,($nptr) # zap upper half of temporary vector
1464 movdqu %xmm1,($rptr)
1465 jmp .Lsqr4x_copy
1466.align 16
1467.Lsqr4x_copy: # copy or in-place refresh
1468 movdqu 16($tptr,$i),%xmm2
1469 movdqu 32($tptr,$i),%xmm1
1470 movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector
1471 movdqa %xmm0,96(%rsp,$i) # zap lower half of temporary vector
1472 movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector
1473 movdqa %xmm0,32($nptr,$i) # zap upper half of temporary vector
1474 movdqu %xmm2,16($rptr,$i)
1475 movdqu %xmm1,32($rptr,$i)
1476 lea 32($i),$i
1477 dec $j
1478 jnz .Lsqr4x_copy
1479
1480 movdqu 16($tptr,$i),%xmm2
1481 movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector
1482 movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector
1483 movdqu %xmm2,16($rptr,$i)
1484___
1485}
1486$code.=<<___;
1487 mov 56(%rsp),%rsi # restore %rsp
1488 mov \$1,%rax
1489 mov 0(%rsi),%r15
1490 mov 8(%rsi),%r14
1491 mov 16(%rsi),%r13
1492 mov 24(%rsi),%r12
1493 mov 32(%rsi),%rbp
1494 mov 40(%rsi),%rbx
1495 lea 48(%rsi),%rsp
1496.Lsqr4x_epilogue:
1497 ret
1498.size bn_sqr4x_mont,.-bn_sqr4x_mont
1499___
1500}}}
1501
1502print $code;
1503close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont5.pl b/src/lib/libcrypto/bn/asm/x86_64-mont5.pl
deleted file mode 100755
index 38751ec5de..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-mont5.pl
+++ /dev/null
@@ -1,1192 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# August 2011.
11#
12# Companion to x86_64-mont.pl that optimizes cache-timing attack
13# countermeasures. The subroutines are produced by replacing bp[i]
14# references in their x86_64-mont.pl counterparts with cache-neutral
15# references to the powers table computed in BN_mod_exp_mont_consttime.
16# In addition, a subroutine that scatters elements of the powers table
17# is implemented, so that scatter-/gathering can be tuned without
18# bn_exp.c modifications.
19
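# A minimal sketch, not part of the original (illustrative helper name): the
# cache-neutral gather described above touches every one of the 2^5 table
# entries and keeps only the one selected by the secret index, so the memory
# trace is independent of idx.  Limb $j of power $idx is taken from
# $tbl->[$j*32 + $idx], matching the interlacing described for the idx
# argument further below.
sub gather5_ref {
	my ($tbl, $idx, $j) = @_;	# flat limb array, secret index, limb number
	my $val = 0;
	for my $k (0 .. 31) {
		# the assembler builds this mask branch-free with pcmpeqd; a
		# comparison is used here only to keep the model short
		my $mask = ($k == $idx) ? ~0 : 0;
		$val |= $tbl->[$j*32 + $k] & $mask;	# every entry is read
	}
	return $val;
}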
20$flavour = shift;
21$output = shift;
22if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
23
24$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
25
26$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
27( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
28( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
29die "can't locate x86_64-xlate.pl";
30
31open OUT,"| \"$^X\" $xlate $flavour $output";
32*STDOUT=*OUT;
33
34# int bn_mul_mont_gather5(
35$rp="%rdi"; # BN_ULONG *rp,
36$ap="%rsi"; # const BN_ULONG *ap,
37$bp="%rdx"; # const BN_ULONG *bp,
38$np="%rcx"; # const BN_ULONG *np,
39$n0="%r8"; # const BN_ULONG *n0,
40$num="%r9"; # int num,
41 # int idx); # 0 to 2^5-1, "index" in $bp holding
42 # pre-computed powers of a', interlaced
43				# in such a manner that b[0] is $bp[idx],
44 # b[1] is [2^5+idx], etc.
45$lo0="%r10";
46$hi0="%r11";
47$hi1="%r13";
48$i="%r14";
49$j="%r15";
50$m0="%rbx";
51$m1="%rbp";
52
53$code=<<___;
54.text
55
56.globl bn_mul_mont_gather5
57.type bn_mul_mont_gather5,\@function,6
58.align 64
59bn_mul_mont_gather5:
60 _CET_ENDBR
61 test \$3,${num}d
62 jnz .Lmul_enter
63 cmp \$8,${num}d
64 jb .Lmul_enter
65 jmp .Lmul4x_enter
66
67.align 16
68.Lmul_enter:
69 mov ${num}d,${num}d
70 movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
71 lea .Linc(%rip),%r10
72 push %rbx
73 push %rbp
74 push %r12
75 push %r13
76 push %r14
77 push %r15
78
79.Lmul_alloca:
80 mov %rsp,%rax
81 lea 2($num),%r11
82 neg %r11
83 lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8)
84 and \$-1024,%rsp # minimize TLB usage
85
86 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
87.Lmul_body:
88 lea 128($bp),%r12 # reassign $bp (+size optimization)
89___
90 $bp="%r12";
91 $STRIDE=2**5*8; # 5 is "window size"
92 $N=$STRIDE/4; # should match cache line size
93$code.=<<___;
94 movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000
95 movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002
96 lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
97 and \$-16,%r10
98
99 pshufd \$0,%xmm5,%xmm5 # broadcast index
100 movdqa %xmm1,%xmm4
101 movdqa %xmm1,%xmm2
102___
103########################################################################
104# calculate mask by comparing 0..31 to index and save result to stack
105#
106$code.=<<___;
107 paddd %xmm0,%xmm1
108 pcmpeqd %xmm5,%xmm0 # compare to 1,0
109 .byte 0x67
110 movdqa %xmm4,%xmm3
111___
112for($k=0;$k<$STRIDE/16-4;$k+=4) {
113$code.=<<___;
114 paddd %xmm1,%xmm2
115 pcmpeqd %xmm5,%xmm1 # compare to 3,2
116 movdqa %xmm0,`16*($k+0)+112`(%r10)
117 movdqa %xmm4,%xmm0
118
119 paddd %xmm2,%xmm3
120 pcmpeqd %xmm5,%xmm2 # compare to 5,4
121 movdqa %xmm1,`16*($k+1)+112`(%r10)
122 movdqa %xmm4,%xmm1
123
124 paddd %xmm3,%xmm0
125 pcmpeqd %xmm5,%xmm3 # compare to 7,6
126 movdqa %xmm2,`16*($k+2)+112`(%r10)
127 movdqa %xmm4,%xmm2
128
129 paddd %xmm0,%xmm1
130 pcmpeqd %xmm5,%xmm0
131 movdqa %xmm3,`16*($k+3)+112`(%r10)
132 movdqa %xmm4,%xmm3
133___
134}
135$code.=<<___; # last iteration can be optimized
136 paddd %xmm1,%xmm2
137 pcmpeqd %xmm5,%xmm1
138 movdqa %xmm0,`16*($k+0)+112`(%r10)
139
140 paddd %xmm2,%xmm3
141 .byte 0x67
142 pcmpeqd %xmm5,%xmm2
143 movdqa %xmm1,`16*($k+1)+112`(%r10)
144
145 pcmpeqd %xmm5,%xmm3
146 movdqa %xmm2,`16*($k+2)+112`(%r10)
147 pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register
148
149 pand `16*($k+1)-128`($bp),%xmm1
150 pand `16*($k+2)-128`($bp),%xmm2
151 movdqa %xmm3,`16*($k+3)+112`(%r10)
152 pand `16*($k+3)-128`($bp),%xmm3
153 por %xmm2,%xmm0
154 por %xmm3,%xmm1
155___
156for($k=0;$k<$STRIDE/16-4;$k+=4) {
157$code.=<<___;
158 movdqa `16*($k+0)-128`($bp),%xmm4
159 movdqa `16*($k+1)-128`($bp),%xmm5
160 movdqa `16*($k+2)-128`($bp),%xmm2
161 pand `16*($k+0)+112`(%r10),%xmm4
162 movdqa `16*($k+3)-128`($bp),%xmm3
163 pand `16*($k+1)+112`(%r10),%xmm5
164 por %xmm4,%xmm0
165 pand `16*($k+2)+112`(%r10),%xmm2
166 por %xmm5,%xmm1
167 pand `16*($k+3)+112`(%r10),%xmm3
168 por %xmm2,%xmm0
169 por %xmm3,%xmm1
170___
171}
172$code.=<<___;
173 por %xmm1,%xmm0
174 pshufd \$0x4e,%xmm0,%xmm1
175 por %xmm1,%xmm0
176 lea $STRIDE($bp),$bp
177 movd %xmm0,$m0 # m0=bp[0]
178
179 mov ($n0),$n0 # pull n0[0] value
180 mov ($ap),%rax
181
182 xor $i,$i # i=0
183 xor $j,$j # j=0
184
185 mov $n0,$m1
186 mulq $m0 # ap[0]*bp[0]
187 mov %rax,$lo0
188 mov ($np),%rax
189
190 imulq $lo0,$m1 # "tp[0]"*n0
191 mov %rdx,$hi0
192
193 mulq $m1 # np[0]*m1
194 add %rax,$lo0 # discarded
195 mov 8($ap),%rax
196 adc \$0,%rdx
197 mov %rdx,$hi1
198
199 lea 1($j),$j # j++
200 jmp .L1st_enter
201
202.align 16
203.L1st:
204 add %rax,$hi1
205 mov ($ap,$j,8),%rax
206 adc \$0,%rdx
207 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
208 mov $lo0,$hi0
209 adc \$0,%rdx
210 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
211 mov %rdx,$hi1
212
213.L1st_enter:
214 mulq $m0 # ap[j]*bp[0]
215 add %rax,$hi0
216 mov ($np,$j,8),%rax
217 adc \$0,%rdx
218 lea 1($j),$j # j++
219 mov %rdx,$lo0
220
221 mulq $m1 # np[j]*m1
222 cmp $num,$j
223 jl .L1st
224
225 add %rax,$hi1
226 mov ($ap),%rax # ap[0]
227 adc \$0,%rdx
228 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
229 adc \$0,%rdx
230 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
231 mov %rdx,$hi1
232 mov $lo0,$hi0
233
234 xor %rdx,%rdx
235 add $hi0,$hi1
236 adc \$0,%rdx
237 mov $hi1,-8(%rsp,$num,8)
238 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
239
240 lea 1($i),$i # i++
241 jmp .Louter
242.align 16
243.Louter:
244 lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization)
245 and \$-16,%rdx
246 pxor %xmm4,%xmm4
247 pxor %xmm5,%xmm5
248___
249for($k=0;$k<$STRIDE/16;$k+=4) {
250$code.=<<___;
251 movdqa `16*($k+0)-128`($bp),%xmm0
252 movdqa `16*($k+1)-128`($bp),%xmm1
253 movdqa `16*($k+2)-128`($bp),%xmm2
254 movdqa `16*($k+3)-128`($bp),%xmm3
255 pand `16*($k+0)-128`(%rdx),%xmm0
256 pand `16*($k+1)-128`(%rdx),%xmm1
257 por %xmm0,%xmm4
258 pand `16*($k+2)-128`(%rdx),%xmm2
259 por %xmm1,%xmm5
260 pand `16*($k+3)-128`(%rdx),%xmm3
261 por %xmm2,%xmm4
262 por %xmm3,%xmm5
263___
264}
265$code.=<<___;
266 por %xmm5,%xmm4
267 pshufd \$0x4e,%xmm4,%xmm0
268 por %xmm4,%xmm0
269 lea $STRIDE($bp),$bp
270 movd %xmm0,$m0 # m0=bp[i]
271
272 xor $j,$j # j=0
273 mov $n0,$m1
274 mov (%rsp),$lo0
275
276 mulq $m0 # ap[0]*bp[i]
277 add %rax,$lo0 # ap[0]*bp[i]+tp[0]
278 mov ($np),%rax
279 adc \$0,%rdx
280
281 imulq $lo0,$m1 # tp[0]*n0
282 mov %rdx,$hi0
283
284 mulq $m1 # np[0]*m1
285 add %rax,$lo0 # discarded
286 mov 8($ap),%rax
287 adc \$0,%rdx
288 mov 8(%rsp),$lo0 # tp[1]
289 mov %rdx,$hi1
290
291 lea 1($j),$j # j++
292 jmp .Linner_enter
293
294.align 16
295.Linner:
296 add %rax,$hi1
297 mov ($ap,$j,8),%rax
298 adc \$0,%rdx
299 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
300 mov (%rsp,$j,8),$lo0
301 adc \$0,%rdx
302 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
303 mov %rdx,$hi1
304
305.Linner_enter:
306 mulq $m0 # ap[j]*bp[i]
307 add %rax,$hi0
308 mov ($np,$j,8),%rax
309 adc \$0,%rdx
310 add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
311 mov %rdx,$hi0
312 adc \$0,$hi0
313 lea 1($j),$j # j++
314
315 mulq $m1 # np[j]*m1
316 cmp $num,$j
317 jl .Linner
318
319 add %rax,$hi1
320 mov ($ap),%rax # ap[0]
321 adc \$0,%rdx
322 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
323 mov (%rsp,$j,8),$lo0
324 adc \$0,%rdx
325 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
326 mov %rdx,$hi1
327
328 xor %rdx,%rdx
329 add $hi0,$hi1
330 adc \$0,%rdx
331 add $lo0,$hi1 # pull upmost overflow bit
332 adc \$0,%rdx
333 mov $hi1,-8(%rsp,$num,8)
334 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
335
336 lea 1($i),$i # i++
337 cmp $num,$i
338 jl .Louter
339
340 xor $i,$i # i=0 and clear CF!
341 mov (%rsp),%rax # tp[0]
342 lea (%rsp),$ap # borrow ap for tp
343 mov $num,$j # j=num
344 jmp .Lsub
345.align 16
346.Lsub: sbb ($np,$i,8),%rax
347 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
348 mov 8($ap,$i,8),%rax # tp[i+1]
349 lea 1($i),$i # i++
350	dec	$j		# doesn't affect CF!
351 jnz .Lsub
352
353 sbb \$0,%rax # handle upmost overflow bit
354 xor $i,$i
355 and %rax,$ap
356 not %rax
357 mov $rp,$np
358 and %rax,$np
359 mov $num,$j # j=num
360 or $np,$ap # ap=borrow?tp:rp
361.align 16
362.Lcopy: # copy or in-place refresh
363 mov ($ap,$i,8),%rax
364 mov $i,(%rsp,$i,8) # zap temporary vector
365 mov %rax,($rp,$i,8) # rp[i]=tp[i]
366 lea 1($i),$i
367 sub \$1,$j
368 jnz .Lcopy
369
370 mov 8(%rsp,$num,8),%rsi # restore %rsp
371 mov \$1,%rax
372
373 mov (%rsi),%r15
374 mov 8(%rsi),%r14
375 mov 16(%rsi),%r13
376 mov 24(%rsi),%r12
377 mov 32(%rsi),%rbp
378 mov 40(%rsi),%rbx
379 lea 48(%rsi),%rsp
380.Lmul_epilogue:
381 ret
382.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
383___
384{{{
385my @A=("%r10","%r11");
386my @N=("%r13","%rdi");
387$code.=<<___;
388.type bn_mul4x_mont_gather5,\@function,6
389.align 16
390bn_mul4x_mont_gather5:
391 _CET_ENDBR
392.Lmul4x_enter:
393 mov ${num}d,${num}d
394 movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
395 lea .Linc(%rip),%r10
396 push %rbx
397 push %rbp
398 push %r12
399 push %r13
400 push %r14
401 push %r15
402
403.Lmul4x_alloca:
404 mov %rsp,%rax
405 lea 4($num),%r11
406 neg %r11
407 lea -256(%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)+256)
408 and \$-1024,%rsp # minimize TLB usage
409
410 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
411.Lmul4x_body:
412 mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
413 lea 128(%rdx),%r12 # reassign $bp (+size optimization)
414___
415 $bp="%r12";
416 $STRIDE=2**5*8; # 5 is "window size"
417 $N=$STRIDE/4; # should match cache line size
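# 2**5 = 32 table entries, one qword per possible value of the 5-bit
# window, so consecutive words of one power sit $STRIDE = 256 bytes
# apart (this matches the layout produced by bn_scatter5 below).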
418$code.=<<___;
419 movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000
420 movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002
421 lea 32-112(%rsp,$num,8),%r10 # place the mask after tp[num+4] (+ICache optimization)
422
423 pshufd \$0,%xmm5,%xmm5 # broadcast index
424 movdqa %xmm1,%xmm4
425 .byte 0x67,0x67
426 movdqa %xmm1,%xmm2
427___
428########################################################################
429# calculate mask by comparing 0..31 to index and save result to stack
430#
431$code.=<<___;
432 paddd %xmm0,%xmm1
433 pcmpeqd %xmm5,%xmm0 # compare to 1,0
434 .byte 0x67
435 movdqa %xmm4,%xmm3
436___
437for($k=0;$k<$STRIDE/16-4;$k+=4) {
438$code.=<<___;
439 paddd %xmm1,%xmm2
440 pcmpeqd %xmm5,%xmm1 # compare to 3,2
441 movdqa %xmm0,`16*($k+0)+112`(%r10)
442 movdqa %xmm4,%xmm0
443
444 paddd %xmm2,%xmm3
445 pcmpeqd %xmm5,%xmm2 # compare to 5,4
446 movdqa %xmm1,`16*($k+1)+112`(%r10)
447 movdqa %xmm4,%xmm1
448
449 paddd %xmm3,%xmm0
450 pcmpeqd %xmm5,%xmm3 # compare to 7,6
451 movdqa %xmm2,`16*($k+2)+112`(%r10)
452 movdqa %xmm4,%xmm2
453
454 paddd %xmm0,%xmm1
455 pcmpeqd %xmm5,%xmm0
456 movdqa %xmm3,`16*($k+3)+112`(%r10)
457 movdqa %xmm4,%xmm3
458___
459}
460$code.=<<___; # last iteration can be optimized
461 paddd %xmm1,%xmm2
462 pcmpeqd %xmm5,%xmm1
463 movdqa %xmm0,`16*($k+0)+112`(%r10)
464
465 paddd %xmm2,%xmm3
466 .byte 0x67
467 pcmpeqd %xmm5,%xmm2
468 movdqa %xmm1,`16*($k+1)+112`(%r10)
469
470 pcmpeqd %xmm5,%xmm3
471 movdqa %xmm2,`16*($k+2)+112`(%r10)
472 pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register
473
474 pand `16*($k+1)-128`($bp),%xmm1
475 pand `16*($k+2)-128`($bp),%xmm2
476 movdqa %xmm3,`16*($k+3)+112`(%r10)
477 pand `16*($k+3)-128`($bp),%xmm3
478 por %xmm2,%xmm0
479 por %xmm3,%xmm1
480___
481for($k=0;$k<$STRIDE/16-4;$k+=4) {
482$code.=<<___;
483 movdqa `16*($k+0)-128`($bp),%xmm4
484 movdqa `16*($k+1)-128`($bp),%xmm5
485 movdqa `16*($k+2)-128`($bp),%xmm2
486 pand `16*($k+0)+112`(%r10),%xmm4
487 movdqa `16*($k+3)-128`($bp),%xmm3
488 pand `16*($k+1)+112`(%r10),%xmm5
489 por %xmm4,%xmm0
490 pand `16*($k+2)+112`(%r10),%xmm2
491 por %xmm5,%xmm1
492 pand `16*($k+3)+112`(%r10),%xmm3
493 por %xmm2,%xmm0
494 por %xmm3,%xmm1
495___
496}
497$code.=<<___;
498 por %xmm1,%xmm0
499 pshufd \$0x4e,%xmm0,%xmm1
500 por %xmm1,%xmm0
501 lea $STRIDE($bp),$bp
502 movd %xmm0,$m0 # m0=bp[0]
503
504 mov ($n0),$n0 # pull n0[0] value
505 mov ($ap),%rax
506
507 xor $i,$i # i=0
508 xor $j,$j # j=0
509
510 mov $n0,$m1
511 mulq $m0 # ap[0]*bp[0]
512 mov %rax,$A[0]
513 mov ($np),%rax
514
515 imulq $A[0],$m1 # "tp[0]"*n0
516 mov %rdx,$A[1]
517
518 mulq $m1 # np[0]*m1
519 add %rax,$A[0] # discarded
520 mov 8($ap),%rax
521 adc \$0,%rdx
522 mov %rdx,$N[1]
523
524 mulq $m0
525 add %rax,$A[1]
526 mov 8($np),%rax
527 adc \$0,%rdx
528 mov %rdx,$A[0]
529
530 mulq $m1
531 add %rax,$N[1]
532 mov 16($ap),%rax
533 adc \$0,%rdx
534 add $A[1],$N[1]
535 lea 4($j),$j # j++
536 adc \$0,%rdx
537 mov $N[1],(%rsp)
538 mov %rdx,$N[0]
539 jmp .L1st4x
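 # .L1st4x: first pass (bp[0]); four limbs are processed per iteration,
 # and each ap[j]*bp[0] product is followed immediately by the matching
 # np[j]*m1 term, so the Montgomery reduction is interleaved with the
 # multiplication.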
540.align 16
541.L1st4x:
542 mulq $m0 # ap[j]*bp[0]
543 add %rax,$A[0]
544 mov -16($np,$j,8),%rax
545 adc \$0,%rdx
546 mov %rdx,$A[1]
547
548 mulq $m1 # np[j]*m1
549 add %rax,$N[0]
550 mov -8($ap,$j,8),%rax
551 adc \$0,%rdx
552 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
553 adc \$0,%rdx
554 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
555 mov %rdx,$N[1]
556
557 mulq $m0 # ap[j]*bp[0]
558 add %rax,$A[1]
559 mov -8($np,$j,8),%rax
560 adc \$0,%rdx
561 mov %rdx,$A[0]
562
563 mulq $m1 # np[j]*m1
564 add %rax,$N[1]
565 mov ($ap,$j,8),%rax
566 adc \$0,%rdx
567 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
568 adc \$0,%rdx
569 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
570 mov %rdx,$N[0]
571
572 mulq $m0 # ap[j]*bp[0]
573 add %rax,$A[0]
574 mov ($np,$j,8),%rax
575 adc \$0,%rdx
576 mov %rdx,$A[1]
577
578 mulq $m1 # np[j]*m1
579 add %rax,$N[0]
580 mov 8($ap,$j,8),%rax
581 adc \$0,%rdx
582 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
583 adc \$0,%rdx
584 mov $N[0],-8(%rsp,$j,8) # tp[j-1]
585 mov %rdx,$N[1]
586
587 mulq $m0 # ap[j]*bp[0]
588 add %rax,$A[1]
589 mov 8($np,$j,8),%rax
590 adc \$0,%rdx
591 lea 4($j),$j # j++
592 mov %rdx,$A[0]
593
594 mulq $m1 # np[j]*m1
595 add %rax,$N[1]
596 mov -16($ap,$j,8),%rax
597 adc \$0,%rdx
598 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
599 adc \$0,%rdx
600 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
601 mov %rdx,$N[0]
602 cmp $num,$j
603 jl .L1st4x
604
605 mulq $m0 # ap[j]*bp[0]
606 add %rax,$A[0]
607 mov -16($np,$j,8),%rax
608 adc \$0,%rdx
609 mov %rdx,$A[1]
610
611 mulq $m1 # np[j]*m1
612 add %rax,$N[0]
613 mov -8($ap,$j,8),%rax
614 adc \$0,%rdx
615 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
616 adc \$0,%rdx
617 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
618 mov %rdx,$N[1]
619
620 mulq $m0 # ap[j]*bp[0]
621 add %rax,$A[1]
622 mov -8($np,$j,8),%rax
623 adc \$0,%rdx
624 mov %rdx,$A[0]
625
626 mulq $m1 # np[j]*m1
627 add %rax,$N[1]
628 mov ($ap),%rax # ap[0]
629 adc \$0,%rdx
630 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
631 adc \$0,%rdx
632 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
633 mov %rdx,$N[0]
634
635 xor $N[1],$N[1]
636 add $A[0],$N[0]
637 adc \$0,$N[1]
638 mov $N[0],-8(%rsp,$j,8)
639 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
640
641 lea 1($i),$i # i++
642.align 4
643.Louter4x:
644 lea 32+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization)
645 pxor %xmm4,%xmm4
646 pxor %xmm5,%xmm5
647___
648for($k=0;$k<$STRIDE/16;$k+=4) {
649$code.=<<___;
650 movdqa `16*($k+0)-128`($bp),%xmm0
651 movdqa `16*($k+1)-128`($bp),%xmm1
652 movdqa `16*($k+2)-128`($bp),%xmm2
653 movdqa `16*($k+3)-128`($bp),%xmm3
654 pand `16*($k+0)-128`(%rdx),%xmm0
655 pand `16*($k+1)-128`(%rdx),%xmm1
656 por %xmm0,%xmm4
657 pand `16*($k+2)-128`(%rdx),%xmm2
658 por %xmm1,%xmm5
659 pand `16*($k+3)-128`(%rdx),%xmm3
660 por %xmm2,%xmm4
661 por %xmm3,%xmm5
662___
663}
664$code.=<<___;
665 por %xmm5,%xmm4
666 pshufd \$0x4e,%xmm4,%xmm0
667 por %xmm4,%xmm0
668 lea $STRIDE($bp),$bp
669 movd %xmm0,$m0 # m0=bp[i]
670
671 xor $j,$j # j=0
672
673 mov (%rsp),$A[0]
674 mov $n0,$m1
675 mulq $m0 # ap[0]*bp[i]
676 add %rax,$A[0] # ap[0]*bp[i]+tp[0]
677 mov ($np),%rax
678 adc \$0,%rdx
679
680 imulq $A[0],$m1 # tp[0]*n0
681 mov %rdx,$A[1]
682
683 mulq $m1 # np[0]*m1
684 add %rax,$A[0] # "$N[0]", discarded
685 mov 8($ap),%rax
686 adc \$0,%rdx
687 mov %rdx,$N[1]
688
689 mulq $m0 # ap[j]*bp[i]
690 add %rax,$A[1]
691 mov 8($np),%rax
692 adc \$0,%rdx
693 add 8(%rsp),$A[1] # +tp[1]
694 adc \$0,%rdx
695 mov %rdx,$A[0]
696
697 mulq $m1 # np[j]*m1
698 add %rax,$N[1]
699 mov 16($ap),%rax
700 adc \$0,%rdx
701 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
702 lea 4($j),$j # j++
703 adc \$0,%rdx
704 mov %rdx,$N[0]
705 jmp .Linner4x
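 # .Linner4x: same schedule as .L1st4x, but each step also folds in
 # tp[j] left over from the previous outer iteration.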
706.align 16
707.Linner4x:
708 mulq $m0 # ap[j]*bp[i]
709 add %rax,$A[0]
710 mov -16($np,$j,8),%rax
711 adc \$0,%rdx
712 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
713 adc \$0,%rdx
714 mov %rdx,$A[1]
715
716 mulq $m1 # np[j]*m1
717 add %rax,$N[0]
718 mov -8($ap,$j,8),%rax
719 adc \$0,%rdx
720 add $A[0],$N[0]
721 adc \$0,%rdx
722 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
723 mov %rdx,$N[1]
724
725 mulq $m0 # ap[j]*bp[i]
726 add %rax,$A[1]
727 mov -8($np,$j,8),%rax
728 adc \$0,%rdx
729 add -8(%rsp,$j,8),$A[1]
730 adc \$0,%rdx
731 mov %rdx,$A[0]
732
733 mulq $m1 # np[j]*m1
734 add %rax,$N[1]
735 mov ($ap,$j,8),%rax
736 adc \$0,%rdx
737 add $A[1],$N[1]
738 adc \$0,%rdx
739 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
740 mov %rdx,$N[0]
741
742 mulq $m0 # ap[j]*bp[i]
743 add %rax,$A[0]
744 mov ($np,$j,8),%rax
745 adc \$0,%rdx
746 add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
747 adc \$0,%rdx
748 mov %rdx,$A[1]
749
750 mulq $m1 # np[j]*m1
751 add %rax,$N[0]
752 mov 8($ap,$j,8),%rax
753 adc \$0,%rdx
754 add $A[0],$N[0]
755 adc \$0,%rdx
756 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
757 mov %rdx,$N[1]
758
759 mulq $m0 # ap[j]*bp[i]
760 add %rax,$A[1]
761 mov 8($np,$j,8),%rax
762 adc \$0,%rdx
763 add 8(%rsp,$j,8),$A[1]
764 adc \$0,%rdx
765 lea 4($j),$j # j++
766 mov %rdx,$A[0]
767
768 mulq $m1 # np[j]*m1
769 add %rax,$N[1]
770 mov -16($ap,$j,8),%rax
771 adc \$0,%rdx
772 add $A[1],$N[1]
773 adc \$0,%rdx
774 mov $N[0],-40(%rsp,$j,8) # tp[j-1]
775 mov %rdx,$N[0]
776 cmp $num,$j
777 jl .Linner4x
778
779 mulq $m0 # ap[j]*bp[i]
780 add %rax,$A[0]
781 mov -16($np,$j,8),%rax
782 adc \$0,%rdx
783 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
784 adc \$0,%rdx
785 mov %rdx,$A[1]
786
787 mulq $m1 # np[j]*m1
788 add %rax,$N[0]
789 mov -8($ap,$j,8),%rax
790 adc \$0,%rdx
791 add $A[0],$N[0]
792 adc \$0,%rdx
793 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
794 mov %rdx,$N[1]
795
796 mulq $m0 # ap[j]*bp[i]
797 add %rax,$A[1]
798 mov -8($np,$j,8),%rax
799 adc \$0,%rdx
800 add -8(%rsp,$j,8),$A[1]
801 adc \$0,%rdx
802 lea 1($i),$i # i++
803 mov %rdx,$A[0]
804
805 mulq $m1 # np[j]*m1
806 add %rax,$N[1]
807 mov ($ap),%rax # ap[0]
808 adc \$0,%rdx
809 add $A[1],$N[1]
810 adc \$0,%rdx
811 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
812 mov %rdx,$N[0]
813
814 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
815
816 xor $N[1],$N[1]
817 add $A[0],$N[0]
818 adc \$0,$N[1]
819 add (%rsp,$num,8),$N[0] # pull upmost overflow bit
820 adc \$0,$N[1]
821 mov $N[0],-8(%rsp,$j,8)
822 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
823
824 cmp $num,$i
825 jl .Louter4x
826___
827{
828my @ri=("%rax","%rdx",$m0,$m1);
829$code.=<<___;
830 mov 16(%rsp,$num,8),$rp # restore $rp
831 mov 0(%rsp),@ri[0] # tp[0]
832 pxor %xmm0,%xmm0
833 mov 8(%rsp),@ri[1] # tp[1]
834 shr \$2,$num # num/=4
835 lea (%rsp),$ap # borrow ap for tp
836 xor $i,$i # i=0 and clear CF!
837
838 sub 0($np),@ri[0]
839 mov 16($ap),@ri[2] # tp[2]
840 mov 24($ap),@ri[3] # tp[3]
841 sbb 8($np),@ri[1]
842 lea -1($num),$j # j=num/4-1
843 jmp .Lsub4x
844.align 16
845.Lsub4x:
846 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
847 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
848 sbb 16($np,$i,8),@ri[2]
849 mov 32($ap,$i,8),@ri[0] # tp[i+1]
850 mov 40($ap,$i,8),@ri[1]
851 sbb 24($np,$i,8),@ri[3]
852 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
853 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
854 sbb 32($np,$i,8),@ri[0]
855 mov 48($ap,$i,8),@ri[2]
856 mov 56($ap,$i,8),@ri[3]
857 sbb 40($np,$i,8),@ri[1]
858 lea 4($i),$i # i++
859 dec $j # doesn't affect CF!
860 jnz .Lsub4x
861
862 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
863 mov 32($ap,$i,8),@ri[0] # load overflow bit
864 sbb 16($np,$i,8),@ri[2]
865 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
866 sbb 24($np,$i,8),@ri[3]
867 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
868
869 sbb \$0,@ri[0] # handle upmost overflow bit
870 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
871 xor $i,$i # i=0
872 and @ri[0],$ap
873 not @ri[0]
874 mov $rp,$np
875 and @ri[0],$np
876 lea -1($num),$j
877 or $np,$ap # ap=borrow?tp:rp
878
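 # Copy the selected result out 16 bytes at a time; the stores of the
 # zeroed xmm0 wipe the temporary vector on the stack as we go.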
879 movdqu ($ap),%xmm1
880 movdqa %xmm0,(%rsp)
881 movdqu %xmm1,($rp)
882 jmp .Lcopy4x
883.align 16
884.Lcopy4x: # copy or in-place refresh
885 movdqu 16($ap,$i),%xmm2
886 movdqu 32($ap,$i),%xmm1
887 movdqa %xmm0,16(%rsp,$i)
888 movdqu %xmm2,16($rp,$i)
889 movdqa %xmm0,32(%rsp,$i)
890 movdqu %xmm1,32($rp,$i)
891 lea 32($i),$i
892 dec $j
893 jnz .Lcopy4x
894
895 shl \$2,$num
896 movdqu 16($ap,$i),%xmm2
897 movdqa %xmm0,16(%rsp,$i)
898 movdqu %xmm2,16($rp,$i)
899___
900}
901$code.=<<___;
902 mov 8(%rsp,$num,8),%rsi # restore %rsp
903 mov \$1,%rax
904
905 mov (%rsi),%r15
906 mov 8(%rsi),%r14
907 mov 16(%rsi),%r13
908 mov 24(%rsi),%r12
909 mov 32(%rsi),%rbp
910 mov 40(%rsi),%rbx
911 lea 48(%rsi),%rsp
912.Lmul4x_epilogue:
913 ret
914.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
915___
916}}}
917
918{
919my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9d") : # Win64 order
920 ("%rdi","%rsi","%rdx","%ecx"); # Unix order
921my $out=$inp;
922my $STRIDE=2**5*8;
923my $N=$STRIDE/4;
924
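# bn_scatter5 stores a num-word value into column idx of a 32-column
# table, so word i lands at tbl + idx*8 + i*32*8; bn_gather5 reads it
# back with the same masked, full-row loads used above.  Roughly, in C
# (illustrative sketch only, assuming 64-bit BN_ULONG):
#
#	scatter: for (i = 0; i < num; i++) tbl[32*i + idx] = inp[i];
#	gather:  for (i = 0; i < num; i++) out[i] = tbl[32*i + idx];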
925$code.=<<___;
926.globl bn_scatter5
927.type bn_scatter5,\@abi-omnipotent
928.align 16
929bn_scatter5:
930 _CET_ENDBR
931 cmp \$0, $num
932 jz .Lscatter_epilogue
933 lea ($tbl,$idx,8),$tbl
934.Lscatter:
935 mov ($inp),%rax
936 lea 8($inp),$inp
937 mov %rax,($tbl)
938 lea 32*8($tbl),$tbl
939 sub \$1,$num
940 jnz .Lscatter
941.Lscatter_epilogue:
942 ret
943.size bn_scatter5,.-bn_scatter5
944
945.globl bn_gather5
946.type bn_gather5,\@abi-omnipotent
947.align 16
948bn_gather5:
949 _CET_ENDBR
950.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases
951 # I can't trust assembler to use specific encoding:-(
952 .byte 0x4c,0x8d,0x14,0x24 # lea (%rsp),%r10
953 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 # sub $0x108,%rsp
954 lea .Linc(%rip),%rax
955 and \$-16,%rsp # shouldn't be formally required
956
957 movd $idx,%xmm5
958 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
959 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
960 lea 128($tbl),%r11 # size optimization
961 lea 128(%rsp),%rax # size optimization
962
963 pshufd \$0,%xmm5,%xmm5 # broadcast $idx
964 movdqa %xmm1,%xmm4
965 movdqa %xmm1,%xmm2
966___
967########################################################################
968# calculate mask by comparing 0..31 to $idx and save result to stack
969#
970for($i=0;$i<$STRIDE/16;$i+=4) {
971$code.=<<___;
972 paddd %xmm0,%xmm1
973 pcmpeqd %xmm5,%xmm0 # compare to 1,0
974___
975$code.=<<___ if ($i);
976 movdqa %xmm3,`16*($i-1)-128`(%rax)
977___
978$code.=<<___;
979 movdqa %xmm4,%xmm3
980
981 paddd %xmm1,%xmm2
982 pcmpeqd %xmm5,%xmm1 # compare to 3,2
983 movdqa %xmm0,`16*($i+0)-128`(%rax)
984 movdqa %xmm4,%xmm0
985
986 paddd %xmm2,%xmm3
987 pcmpeqd %xmm5,%xmm2 # compare to 5,4
988 movdqa %xmm1,`16*($i+1)-128`(%rax)
989 movdqa %xmm4,%xmm1
990
991 paddd %xmm3,%xmm0
992 pcmpeqd %xmm5,%xmm3 # compare to 7,6
993 movdqa %xmm2,`16*($i+2)-128`(%rax)
994 movdqa %xmm4,%xmm2
995___
996}
997$code.=<<___;
998 movdqa %xmm3,`16*($i-1)-128`(%rax)
999 jmp .Lgather
1000
1001.align 32
1002.Lgather:
1003 pxor %xmm4,%xmm4
1004 pxor %xmm5,%xmm5
1005___
1006for($i=0;$i<$STRIDE/16;$i+=4) {
1007$code.=<<___;
1008 movdqa `16*($i+0)-128`(%r11),%xmm0
1009 movdqa `16*($i+1)-128`(%r11),%xmm1
1010 movdqa `16*($i+2)-128`(%r11),%xmm2
1011 pand `16*($i+0)-128`(%rax),%xmm0
1012 movdqa `16*($i+3)-128`(%r11),%xmm3
1013 pand `16*($i+1)-128`(%rax),%xmm1
1014 por %xmm0,%xmm4
1015 pand `16*($i+2)-128`(%rax),%xmm2
1016 por %xmm1,%xmm5
1017 pand `16*($i+3)-128`(%rax),%xmm3
1018 por %xmm2,%xmm4
1019 por %xmm3,%xmm5
1020___
1021}
1022$code.=<<___;
1023 por %xmm5,%xmm4
1024 lea $STRIDE(%r11),%r11
1025 pshufd \$0x4e,%xmm4,%xmm0
1026 por %xmm4,%xmm0
1027 movq %xmm0,($out) # m0=bp[0]
1028 lea 8($out),$out
1029 sub \$1,$num
1030 jnz .Lgather
1031
1032 lea (%r10),%rsp
1033 ret
1034.LSEH_end_bn_gather5:
1035.size bn_gather5,.-bn_gather5
1036___
1037}
1038$code.=<<___;
1039.section .rodata
1040.align 64
1041.Linc:
1042 .long 0,0, 1,1
1043 .long 2,2, 2,2
1044.text
1045___
1046
1047# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1048# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1049if ($win64) {
1050$rec="%rcx";
1051$frame="%rdx";
1052$context="%r8";
1053$disp="%r9";
1054
1055$code.=<<___;
1056.extern __imp_RtlVirtualUnwind
1057.type mul_handler,\@abi-omnipotent
1058.align 16
1059mul_handler:
1060 _CET_ENDBR
1061 push %rsi
1062 push %rdi
1063 push %rbx
1064 push %rbp
1065 push %r12
1066 push %r13
1067 push %r14
1068 push %r15
1069 pushfq
1070 sub \$64,%rsp
1071
1072 mov 120($context),%rax # pull context->Rax
1073 mov 248($context),%rbx # pull context->Rip
1074
1075 mov 8($disp),%rsi # disp->ImageBase
1076 mov 56($disp),%r11 # disp->HandlerData
1077
1078 mov 0(%r11),%r10d # HandlerData[0]
1079 lea (%rsi,%r10),%r10 # end of prologue label
1080 cmp %r10,%rbx # context->Rip<end of prologue label
1081 jb .Lcommon_seh_tail
1082
1083 lea 48(%rax),%rax
1084
1085 mov 4(%r11),%r10d # HandlerData[1]
1086 lea (%rsi,%r10),%r10 # end of alloca label
1087 cmp %r10,%rbx # context->Rip<end of alloca label
1088 jb .Lcommon_seh_tail
1089
1090 mov 152($context),%rax # pull context->Rsp
1091
1092 mov 8(%r11),%r10d # HandlerData[2]
1093 lea (%rsi,%r10),%r10 # epilogue label
1094 cmp %r10,%rbx # context->Rip>=epilogue label
1095 jae .Lcommon_seh_tail
1096
1097 mov 192($context),%r10 # pull $num
1098 mov 8(%rax,%r10,8),%rax # pull saved stack pointer
1099
1100 lea 48(%rax),%rax
1101
1102 mov -8(%rax),%rbx
1103 mov -16(%rax),%rbp
1104 mov -24(%rax),%r12
1105 mov -32(%rax),%r13
1106 mov -40(%rax),%r14
1107 mov -48(%rax),%r15
1108 mov %rbx,144($context) # restore context->Rbx
1109 mov %rbp,160($context) # restore context->Rbp
1110 mov %r12,216($context) # restore context->R12
1111 mov %r13,224($context) # restore context->R13
1112 mov %r14,232($context) # restore context->R14
1113 mov %r15,240($context) # restore context->R15
1114
1115.Lcommon_seh_tail:
1116 mov 8(%rax),%rdi
1117 mov 16(%rax),%rsi
1118 mov %rax,152($context) # restore context->Rsp
1119 mov %rsi,168($context) # restore context->Rsi
1120 mov %rdi,176($context) # restore context->Rdi
1121
1122 mov 40($disp),%rdi # disp->ContextRecord
1123 mov $context,%rsi # context
1124 mov \$154,%ecx # sizeof(CONTEXT) in quadwords
1125 .long 0xa548f3fc # cld; rep movsq
1126
1127 mov $disp,%rsi
1128 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1129 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1130 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1131 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1132 mov 40(%rsi),%r10 # disp->ContextRecord
1133 lea 56(%rsi),%r11 # &disp->HandlerData
1134 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1135 mov %r10,32(%rsp) # arg5
1136 mov %r11,40(%rsp) # arg6
1137 mov %r12,48(%rsp) # arg7
1138 mov %rcx,56(%rsp) # arg8, (NULL)
1139 call *__imp_RtlVirtualUnwind(%rip)
1140
1141 mov \$1,%eax # ExceptionContinueSearch
1142 add \$64,%rsp
1143 popfq
1144 pop %r15
1145 pop %r14
1146 pop %r13
1147 pop %r12
1148 pop %rbp
1149 pop %rbx
1150 pop %rdi
1151 pop %rsi
1152 ret
1153.size mul_handler,.-mul_handler
1154
1155.section .pdata
1156.align 4
1157 .rva .LSEH_begin_bn_mul_mont_gather5
1158 .rva .LSEH_end_bn_mul_mont_gather5
1159 .rva .LSEH_info_bn_mul_mont_gather5
1160
1161 .rva .LSEH_begin_bn_mul4x_mont_gather5
1162 .rva .LSEH_end_bn_mul4x_mont_gather5
1163 .rva .LSEH_info_bn_mul4x_mont_gather5
1164
1165 .rva .LSEH_begin_bn_gather5
1166 .rva .LSEH_end_bn_gather5
1167 .rva .LSEH_info_bn_gather5
1168
1169.section .xdata
1170.align 8
1171.LSEH_info_bn_mul_mont_gather5:
1172 .byte 9,0,0,0
1173 .rva mul_handler
1174 .rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[]
1175.align 8
1176.LSEH_info_bn_mul4x_mont_gather5:
1177 .byte 9,0,0,0
1178 .rva mul_handler
1179 .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
1180.align 8
1181.LSEH_info_bn_gather5:
1182 .byte 0x01,0x0b,0x03,0x0a
1183 .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108
1184 .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp), set_frame r10
1185.align 8
1186___
1187}
1188
1189$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1190
1191print $code;
1192close STDOUT;