summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/bn
diff options
context:
space:
mode:
authorcvs2svn <admin@example.com>2002-05-15 02:29:24 +0000
committercvs2svn <admin@example.com>2002-05-15 02:29:24 +0000
commit027351f729b9e837200dae6e1520cda6577ab930 (patch)
treee25a717057aa4529e433fc3b1fac8d4df8db3a5c /src/lib/libcrypto/bn
parentaeeae06a79815dc190061534d47236cec09f9e32 (diff)
downloadopenbsd-027351f729b9e837200dae6e1520cda6577ab930.tar.gz
openbsd-027351f729b9e837200dae6e1520cda6577ab930.tar.bz2
openbsd-027351f729b9e837200dae6e1520cda6577ab930.zip
This commit was manufactured by cvs2git to create branch 'unlabeled-1.1.1'.
Diffstat (limited to 'src/lib/libcrypto/bn')
-rw-r--r--src/lib/libcrypto/bn/asm/co-586.pl286
-rw-r--r--src/lib/libcrypto/bn/asm/ia64.S1498
-rw-r--r--src/lib/libcrypto/bn/asm/pa-risc2W.s1605
-rw-r--r--src/lib/libcrypto/bn/asm/sparcv8.S1458
-rw-r--r--src/lib/libcrypto/bn/asm/sparcv8plus.S1535
-rw-r--r--src/lib/libcrypto/bn/asm/x86.pl28
-rw-r--r--src/lib/libcrypto/bn/asm/x86/add.pl76
-rw-r--r--src/lib/libcrypto/bn/asm/x86/comba.pl277
-rw-r--r--src/lib/libcrypto/bn/asm/x86/div.pl15
-rw-r--r--src/lib/libcrypto/bn/asm/x86/mul.pl77
-rw-r--r--src/lib/libcrypto/bn/asm/x86/mul_add.pl87
-rw-r--r--src/lib/libcrypto/bn/asm/x86/sqr.pl60
-rw-r--r--src/lib/libcrypto/bn/asm/x86/sub.pl76
-rw-r--r--src/lib/libcrypto/bn/bn.h467
-rw-r--r--src/lib/libcrypto/bn/bn_asm.c802
-rw-r--r--src/lib/libcrypto/bn/bn_ctx.c144
-rw-r--r--src/lib/libcrypto/bn/bn_exp2.c195
-rw-r--r--src/lib/libcrypto/bn/bn_kron.c182
-rw-r--r--src/lib/libcrypto/bn/bn_sqrt.c387
19 files changed, 9255 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/asm/co-586.pl b/src/lib/libcrypto/bn/asm/co-586.pl
new file mode 100644
index 0000000000..5d962cb957
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/co-586.pl
@@ -0,0 +1,286 @@
1#!/usr/local/bin/perl
2
3push(@INC,"perlasm","../../perlasm");
4require "x86asm.pl";
5
6&asm_init($ARGV[0],$0);
7
8&bn_mul_comba("bn_mul_comba8",8);
9&bn_mul_comba("bn_mul_comba4",4);
10&bn_sqr_comba("bn_sqr_comba8",8);
11&bn_sqr_comba("bn_sqr_comba4",4);
12
13&asm_finish();
14
15sub mul_add_c
16 {
17 local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
18
19 # pos == -1 if eax and edx are pre-loaded, 0 to load from next
20 # words, and 1 if load return value
21
22 &comment("mul a[$ai]*b[$bi]");
23
24 # "eax" and "edx" will always be pre-loaded.
25 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
26 # &mov("edx",&DWP($bi*4,$b,"",0));
27
28 &mul("edx");
29 &add($c0,"eax");
30 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a
31 &mov("eax",&wparam(0)) if $pos > 0; # load r[]
32 ###
33 &adc($c1,"edx");
34 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b
35 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
36 ###
37 &adc($c2,0);
38 # is pos > 1, it means it is the last loop
39 &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
40 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
41 }
42
43sub sqr_add_c
44 {
45 local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
46
47 # pos == -1 if eax and edx are pre-loaded, 0 to load from next
48 # words, and 1 if load return value
49
50 &comment("sqr a[$ai]*a[$bi]");
51
52 # "eax" and "edx" will always be pre-loaded.
53 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
54 # &mov("edx",&DWP($bi*4,$b,"",0));
55
56 if ($ai == $bi)
57 { &mul("eax");}
58 else
59 { &mul("edx");}
60 &add($c0,"eax");
61 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
62 ###
63 &adc($c1,"edx");
64 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
65 ###
66 &adc($c2,0);
67 # is pos > 1, it means it is the last loop
68 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
69 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
70 }
71
72sub sqr_add_c2
73 {
74 local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
75
76 # pos == -1 if eax and edx are pre-loaded, 0 to load from next
77 # words, and 1 if load return value
78
79 &comment("sqr a[$ai]*a[$bi]");
80
81 # "eax" and "edx" will always be pre-loaded.
82 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
83 # &mov("edx",&DWP($bi*4,$a,"",0));
84
85 if ($ai == $bi)
86 { &mul("eax");}
87 else
88 { &mul("edx");}
89 &add("eax","eax");
90 ###
91 &adc("edx","edx");
92 ###
93 &adc($c2,0);
94 &add($c0,"eax");
95 &adc($c1,"edx");
96 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
97 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
98 &adc($c2,0);
99 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
100 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
101 ###
102 }
103
104sub bn_mul_comba
105 {
106 local($name,$num)=@_;
107 local($a,$b,$c0,$c1,$c2);
108 local($i,$as,$ae,$bs,$be,$ai,$bi);
109 local($tot,$end);
110
111 &function_begin_B($name,"");
112
113 $c0="ebx";
114 $c1="ecx";
115 $c2="ebp";
116 $a="esi";
117 $b="edi";
118
119 $as=0;
120 $ae=0;
121 $bs=0;
122 $be=0;
123 $tot=$num+$num-1;
124
125 &push("esi");
126 &mov($a,&wparam(1));
127 &push("edi");
128 &mov($b,&wparam(2));
129 &push("ebp");
130 &push("ebx");
131
132 &xor($c0,$c0);
133 &mov("eax",&DWP(0,$a,"",0)); # load the first word
134 &xor($c1,$c1);
135 &mov("edx",&DWP(0,$b,"",0)); # load the first second
136
137 for ($i=0; $i<$tot; $i++)
138 {
139 $ai=$as;
140 $bi=$bs;
141 $end=$be+1;
142
143 &comment("################## Calculate word $i");
144
145 for ($j=$bs; $j<$end; $j++)
146 {
147 &xor($c2,$c2) if ($j == $bs);
148 if (($j+1) == $end)
149 {
150 $v=1;
151 $v=2 if (($i+1) == $tot);
152 }
153 else
154 { $v=0; }
155 if (($j+1) != $end)
156 {
157 $na=($ai-1);
158 $nb=($bi+1);
159 }
160 else
161 {
162 $na=$as+($i < ($num-1));
163 $nb=$bs+($i >= ($num-1));
164 }
165#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
166 &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
167 if ($v)
168 {
169 &comment("saved r[$i]");
170 # &mov("eax",&wparam(0));
171 # &mov(&DWP($i*4,"eax","",0),$c0);
172 ($c0,$c1,$c2)=($c1,$c2,$c0);
173 }
174 $ai--;
175 $bi++;
176 }
177 $as++ if ($i < ($num-1));
178 $ae++ if ($i >= ($num-1));
179
180 $bs++ if ($i >= ($num-1));
181 $be++ if ($i < ($num-1));
182 }
183 &comment("save r[$i]");
184 # &mov("eax",&wparam(0));
185 &mov(&DWP($i*4,"eax","",0),$c0);
186
187 &pop("ebx");
188 &pop("ebp");
189 &pop("edi");
190 &pop("esi");
191 &ret();
192 &function_end_B($name);
193 }
194
195sub bn_sqr_comba
196 {
197 local($name,$num)=@_;
198 local($r,$a,$c0,$c1,$c2)=@_;
199 local($i,$as,$ae,$bs,$be,$ai,$bi);
200 local($b,$tot,$end,$half);
201
202 &function_begin_B($name,"");
203
204 $c0="ebx";
205 $c1="ecx";
206 $c2="ebp";
207 $a="esi";
208 $r="edi";
209
210 &push("esi");
211 &push("edi");
212 &push("ebp");
213 &push("ebx");
214 &mov($r,&wparam(0));
215 &mov($a,&wparam(1));
216 &xor($c0,$c0);
217 &xor($c1,$c1);
218 &mov("eax",&DWP(0,$a,"",0)); # load the first word
219
220 $as=0;
221 $ae=0;
222 $bs=0;
223 $be=0;
224 $tot=$num+$num-1;
225
226 for ($i=0; $i<$tot; $i++)
227 {
228 $ai=$as;
229 $bi=$bs;
230 $end=$be+1;
231
232 &comment("############### Calculate word $i");
233 for ($j=$bs; $j<$end; $j++)
234 {
235 &xor($c2,$c2) if ($j == $bs);
236 if (($ai-1) < ($bi+1))
237 {
238 $v=1;
239 $v=2 if ($i+1) == $tot;
240 }
241 else
242 { $v=0; }
243 if (!$v)
244 {
245 $na=$ai-1;
246 $nb=$bi+1;
247 }
248 else
249 {
250 $na=$as+($i < ($num-1));
251 $nb=$bs+($i >= ($num-1));
252 }
253 if ($ai == $bi)
254 {
255 &sqr_add_c($r,$a,$ai,$bi,
256 $c0,$c1,$c2,$v,$i,$na,$nb);
257 }
258 else
259 {
260 &sqr_add_c2($r,$a,$ai,$bi,
261 $c0,$c1,$c2,$v,$i,$na,$nb);
262 }
263 if ($v)
264 {
265 &comment("saved r[$i]");
266 #&mov(&DWP($i*4,$r,"",0),$c0);
267 ($c0,$c1,$c2)=($c1,$c2,$c0);
268 last;
269 }
270 $ai--;
271 $bi++;
272 }
273 $as++ if ($i < ($num-1));
274 $ae++ if ($i >= ($num-1));
275
276 $bs++ if ($i >= ($num-1));
277 $be++ if ($i < ($num-1));
278 }
279 &mov(&DWP($i*4,$r,"",0),$c0);
280 &pop("ebx");
281 &pop("ebp");
282 &pop("edi");
283 &pop("esi");
284 &ret();
285 &function_end_B($name);
286 }
diff --git a/src/lib/libcrypto/bn/asm/ia64.S b/src/lib/libcrypto/bn/asm/ia64.S
new file mode 100644
index 0000000000..ae56066310
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/ia64.S
@@ -0,0 +1,1498 @@
1.explicit
2.text
3.ident "ia64.S, Version 1.1"
4.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
5
6//
7// ====================================================================
8// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9// project.
10//
11// Rights for redistribution and usage in source and binary forms are
12// granted according to the OpenSSL license. Warranty of any kind is
13// disclaimed.
14// ====================================================================
15//
16
17// Q. How much faster does it get?
18// A. Here is the output from 'openssl speed rsa dsa' for vanilla
19// 0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
20// Linux 7.1 2.96-81):
21//
22// sign verify sign/s verify/s
23// rsa 512 bits 0.0036s 0.0003s 275.3 2999.2
24// rsa 1024 bits 0.0203s 0.0011s 49.3 894.1
25// rsa 2048 bits 0.1331s 0.0040s 7.5 250.9
26// rsa 4096 bits 0.9270s 0.0147s 1.1 68.1
27// sign verify sign/s verify/s
28// dsa 512 bits 0.0035s 0.0043s 288.3 234.8
29// dsa 1024 bits 0.0111s 0.0135s 90.0 74.2
30//
31// And here is similar output but for this assembler
32// implementation:-)
33//
34// sign verify sign/s verify/s
35// rsa 512 bits 0.0021s 0.0001s 549.4 9638.5
36// rsa 1024 bits 0.0055s 0.0002s 183.8 4481.1
37// rsa 2048 bits 0.0244s 0.0006s 41.4 1726.3
38// rsa 4096 bits 0.1295s 0.0018s 7.7 561.5
39// sign verify sign/s verify/s
40// dsa 512 bits 0.0012s 0.0013s 891.9 756.6
41// dsa 1024 bits 0.0023s 0.0028s 440.4 376.2
42//
43// Yes, you may argue that it's not fair comparison as it's
44// possible to craft the C implementation with BN_UMULT_HIGH
45// inline assembler macro. But of course! Here is the output
46// with the macro:
47//
48// sign verify sign/s verify/s
49// rsa 512 bits 0.0020s 0.0002s 495.0 6561.0
50// rsa 1024 bits 0.0086s 0.0004s 116.2 2235.7
51// rsa 2048 bits 0.0519s 0.0015s 19.3 667.3
52// rsa 4096 bits 0.3464s 0.0053s 2.9 187.7
53// sign verify sign/s verify/s
54// dsa 512 bits 0.0016s 0.0020s 613.1 510.5
55// dsa 1024 bits 0.0045s 0.0054s 221.0 183.9
56//
57// My code is still way faster, huh:-) And I believe that even
58// higher performance can be achieved. Note that as keys get
59// longer, performance gain is larger. Why? According to the
60// profiler there is another player in the field, namely
61// BN_from_montgomery consuming larger and larger portion of CPU
62// time as keysize decreases. I therefore consider putting effort
63// to assembler implementation of the following routine:
64//
65// void bn_mul_add_mont (BN_ULONG *rp,BN_ULONG *np,int nl,BN_ULONG n0)
66// {
67// int i,j;
68// BN_ULONG v;
69//
70// for (i=0; i<nl; i++)
71// {
72// v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
73// nrp++;
74// rp++;
75// if (((nrp[-1]+=v)&BN_MASK2) < v)
76// for (j=0; ((++nrp[j])&BN_MASK2) == 0; j++) ;
77// }
78// }
79//
80// It might as well be beneficial to implement even combaX
81// variants, as it appears as it can literally unleash the
82// performance (see comment section to bn_mul_comba8 below).
83//
84// And finally for your reference the output for 0.9.6a compiled
85// with SGIcc version 0.01.0-12 (keep in mind that for the moment
86// of this writing it's not possible to convince SGIcc to use
87// BN_UMULT_HIGH inline assembler macro, yet the code is fast,
88// i.e. for a compiler generated one:-):
89//
90// sign verify sign/s verify/s
91// rsa 512 bits 0.0022s 0.0002s 452.7 5894.3
92// rsa 1024 bits 0.0097s 0.0005s 102.7 2002.9
93// rsa 2048 bits 0.0578s 0.0017s 17.3 600.2
94// rsa 4096 bits 0.3838s 0.0061s 2.6 164.5
95// sign verify sign/s verify/s
96// dsa 512 bits 0.0018s 0.0022s 547.3 459.6
97// dsa 1024 bits 0.0051s 0.0062s 196.6 161.3
98//
99// Oh! Benchmarks were performed on 733MHz Lion-class Itanium
100// system running Redhat Linux 7.1 (very special thanks to Ray
101// McCaffity of Williams Communications for providing an account).
102//
103// Q. What's the heck with 'rum 1<<5' at the end of every function?
104// A. Well, by clearing the "upper FP registers written" bit of the
105// User Mask I want to excuse the kernel from preserving upper
106// (f32-f128) FP register bank over process context switch, thus
107// minimizing bus bandwidth consumption during the switch (i.e.
108// after PKI opration completes and the program is off doing
109// something else like bulk symmetric encryption). Having said
110// this, I also want to point out that it might be good idea
111// to compile the whole toolkit (as well as majority of the
112// programs for that matter) with -mfixed-range=f32-f127 command
113// line option. No, it doesn't prevent the compiler from writing
114// to upper bank, but at least discourages to do so. If you don't
115// like the idea you have the option to compile the module with
116// -Drum=nop.m in command line.
117//
118
119#if 1
120//
121// bn_[add|sub]_words routines.
122//
123// Loops are spinning in 2*(n+5) ticks on Itanuim (provided that the
124// data reside in L1 cache, i.e. 2 ticks away). It's possible to
125// compress the epilogue and get down to 2*n+6, but at the cost of
126// scalability (the neat feature of this implementation is that it
127// shall automagically spin in n+5 on "wider" IA-64 implementations:-)
128// I consider that the epilogue is short enough as it is to trade tiny
129// performance loss on Itanium for scalability.
130//
131// BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
132//
133.global bn_add_words#
134.proc bn_add_words#
135.align 64
136.skip 32 // makes the loop body aligned at 64-byte boundary
137bn_add_words:
138 .prologue
139 .fframe 0
140 .save ar.pfs,r2
141{ .mii; alloc r2=ar.pfs,4,12,0,16
142 cmp4.le p6,p0=r35,r0 };;
143{ .mfb; mov r8=r0 // return value
144(p6) br.ret.spnt.many b0 };;
145
146 .save ar.lc,r3
147{ .mib; sub r10=r35,r0,1
148 mov r3=ar.lc
149 brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16
150 }
151 .body
152{ .mib; mov r14=r32 // rp
153 mov r9=pr };;
154{ .mii; mov r15=r33 // ap
155 mov ar.lc=r10
156 mov ar.ec=6 }
157{ .mib; mov r16=r34 // bp
158 mov pr.rot=1<<16 };;
159
160.L_bn_add_words_ctop:
161{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
162 (p18) add r39=r37,r34
163 (p19) cmp.ltu.unc p56,p0=r40,r38 }
164{ .mfb; (p0) nop.m 0x0
165 (p0) nop.f 0x0
166 (p0) nop.b 0x0 }
167{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++)
168 (p58) cmp.eq.or p57,p0=-1,r41 // (p20)
169 (p58) add r41=1,r41 } // (p20)
170{ .mfb; (p21) st8 [r14]=r42,8 // *(rp++)=r
171 (p0) nop.f 0x0
172 br.ctop.sptk .L_bn_add_words_ctop };;
173.L_bn_add_words_cend:
174
175{ .mii;
176(p59) add r8=1,r8 // return value
177 mov pr=r9,-1
178 mov ar.lc=r3 }
179{ .mbb; nop.b 0x0
180 br.ret.sptk.many b0 };;
181.endp bn_add_words#
182
183//
184// BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
185//
186.global bn_sub_words#
187.proc bn_sub_words#
188.align 64
189.skip 32 // makes the loop body aligned at 64-byte boundary
190bn_sub_words:
191 .prologue
192 .fframe 0
193 .save ar.pfs,r2
194{ .mii; alloc r2=ar.pfs,4,12,0,16
195 cmp4.le p6,p0=r35,r0 };;
196{ .mfb; mov r8=r0 // return value
197(p6) br.ret.spnt.many b0 };;
198
199 .save ar.lc,r3
200{ .mib; sub r10=r35,r0,1
201 mov r3=ar.lc
202 brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
203 }
204 .body
205{ .mib; mov r14=r32 // rp
206 mov r9=pr };;
207{ .mii; mov r15=r33 // ap
208 mov ar.lc=r10
209 mov ar.ec=6 }
210{ .mib; mov r16=r34 // bp
211 mov pr.rot=1<<16 };;
212
213.L_bn_sub_words_ctop:
214{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
215 (p18) sub r39=r37,r34
216 (p19) cmp.gtu.unc p56,p0=r40,r38 }
217{ .mfb; (p0) nop.m 0x0
218 (p0) nop.f 0x0
219 (p0) nop.b 0x0 }
220{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++)
221 (p58) cmp.eq.or p57,p0=0,r41 // (p20)
222 (p58) add r41=-1,r41 } // (p20)
223{ .mbb; (p21) st8 [r14]=r42,8 // *(rp++)=r
224 (p0) nop.b 0x0
225 br.ctop.sptk .L_bn_sub_words_ctop };;
226.L_bn_sub_words_cend:
227
228{ .mii;
229(p59) add r8=1,r8 // return value
230 mov pr=r9,-1
231 mov ar.lc=r3 }
232{ .mbb; nop.b 0x0
233 br.ret.sptk.many b0 };;
234.endp bn_sub_words#
235#endif
236
237#if 0
238#define XMA_TEMPTATION
239#endif
240
241#if 1
242//
243// BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
244//
245.global bn_mul_words#
246.proc bn_mul_words#
247.align 64
248.skip 32 // makes the loop body aligned at 64-byte boundary
249bn_mul_words:
250 .prologue
251 .fframe 0
252 .save ar.pfs,r2
253#ifdef XMA_TEMPTATION
254{ .mfi; alloc r2=ar.pfs,4,0,0,0 };;
255#else
256{ .mfi; alloc r2=ar.pfs,4,4,0,8 };;
257#endif
258{ .mib; mov r8=r0 // return value
259 cmp4.le p6,p0=r34,r0
260(p6) br.ret.spnt.many b0 };;
261
262 .save ar.lc,r3
263{ .mii; sub r10=r34,r0,1
264 mov r3=ar.lc
265 mov r9=pr };;
266
267 .body
268{ .mib; setf.sig f8=r35 // w
269 mov pr.rot=0x400001<<16
270 // ------^----- serves as (p48) at first (p26)
271 brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
272 }
273
274#ifndef XMA_TEMPTATION
275
276{ .mii; mov r14=r32 // rp
277 mov r15=r33 // ap
278 mov ar.lc=r10 }
279{ .mii; mov r39=0 // serves as r33 at first (p26)
280 mov ar.ec=12 };;
281
282// This loop spins in 2*(n+11) ticks. It's scheduled for data in L2
283// cache (i.e. 9 ticks away) as floating point load/store instructions
284// bypass L1 cache and L2 latency is actually best-case scenario for
285// ldf8. The loop is not scalable and shall run in 2*(n+11) even on
286// "wider" IA-64 implementations. It's a trade-off here. n+22 loop
287// would give us ~5% in *overall* performance improvement on "wider"
288// IA-64, but would hurt Itanium for about same because of longer
289// epilogue. As it's a matter of few percents in either case I've
290// chosen to trade the scalability for development time (you can see
291// this very instruction sequence in bn_mul_add_words loop which in
292// turn is scalable).
293.L_bn_mul_words_ctop:
294{ .mfi; (p25) getf.sig r36=f49 // low
295 (p21) xmpy.lu f45=f37,f8
296 (p27) cmp.ltu p52,p48=r39,r38 }
297{ .mfi; (p16) ldf8 f32=[r15],8
298 (p21) xmpy.hu f38=f37,f8
299 (p0) nop.i 0x0 };;
300{ .mii; (p26) getf.sig r32=f43 // high
301 .pred.rel "mutex",p48,p52
302 (p48) add r38=r37,r33 // (p26)
303 (p52) add r38=r37,r33,1 } // (p26)
304{ .mfb; (p27) st8 [r14]=r39,8
305 (p0) nop.f 0x0
306 br.ctop.sptk .L_bn_mul_words_ctop };;
307.L_bn_mul_words_cend:
308
309{ .mii; nop.m 0x0
310.pred.rel "mutex",p49,p53
311(p49) add r8=r34,r0
312(p53) add r8=r34,r0,1 }
313{ .mfb; nop.m 0x0
314 nop.f 0x0
315 nop.b 0x0 }
316
317#else // XMA_TEMPTATION
318
319 setf.sig f37=r0 // serves as carry at (p18) tick
320 mov ar.lc=r10
321 mov ar.ec=5;;
322
323// Most of you examining this code very likely wonder why in the name
324// of Intel the following loop is commented out? Indeed, it looks so
325// neat that you find it hard to believe that it's something wrong
326// with it, right? The catch is that every iteration depends on the
327// result from previous one and the latter isn't available instantly.
328// The loop therefore spins at the latency of xma minus 1, or in other
329// words at 6*(n+4) ticks:-( Compare to the "production" loop above
330// that runs in 2*(n+11) where the low latency problem is worked around
331// by moving the dependency to one-tick latent interger ALU. Note that
332// "distance" between ldf8 and xma is not latency of ldf8, but the
333// *difference* between xma and ldf8 latencies.
334.L_bn_mul_words_ctop:
335{ .mfi; (p16) ldf8 f32=[r33],8
336 (p18) xma.hu f38=f34,f8,f39 }
337{ .mfb; (p20) stf8 [r32]=f37,8
338 (p18) xma.lu f35=f34,f8,f39
339 br.ctop.sptk .L_bn_mul_words_ctop };;
340.L_bn_mul_words_cend:
341
342 getf.sig r8=f41 // the return value
343
344#endif // XMA_TEMPTATION
345
346{ .mii; nop.m 0x0
347 mov pr=r9,-1
348 mov ar.lc=r3 }
349{ .mfb; rum 1<<5 // clear um.mfh
350 nop.f 0x0
351 br.ret.sptk.many b0 };;
352.endp bn_mul_words#
353#endif
354
355#if 1
356//
357// BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
358//
359.global bn_mul_add_words#
360.proc bn_mul_add_words#
361.align 64
362//.skip 0 // makes the loop split at 64-byte boundary
363bn_mul_add_words:
364 .prologue
365 .fframe 0
366 .save ar.pfs,r2
367{ .mii; alloc r2=ar.pfs,4,12,0,16
368 cmp4.le p6,p0=r34,r0 };;
369{ .mfb; mov r8=r0 // return value
370(p6) br.ret.spnt.many b0 };;
371
372 .save ar.lc,r3
373{ .mii; sub r10=r34,r0,1
374 mov r3=ar.lc
375 mov r9=pr };;
376
377 .body
378{ .mib; setf.sig f8=r35 // w
379 mov pr.rot=0x400001<<16
380 // ------^----- serves as (p48) at first (p26)
381 brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
382 }
383{ .mii; mov r14=r32 // rp
384 mov r15=r33 // ap
385 mov ar.lc=r10 }
386{ .mii; mov r39=0 // serves as r33 at first (p26)
387 mov r18=r32 // rp copy
388 mov ar.ec=14 };;
389
390// This loop spins in 3*(n+13) ticks on Itanium and should spin in
391// 2*(n+13) on "wider" IA-64 implementations (to be verified with new
392// µ-architecture manuals as they become available). As usual it's
393// possible to compress the epilogue, down to 10 in this case, at the
394// cost of scalability. Compressed (and therefore non-scalable) loop
395// running at 3*(n+10) would buy you ~10% on Itanium but take ~35%
396// from "wider" IA-64 so let it be scalable! Special attention was
397// paid for having the loop body split at 64-byte boundary. ld8 is
398// scheduled for L1 cache as the data is more than likely there.
399// Indeed, bn_mul_words has put it there a moment ago:-)
400.L_bn_mul_add_words_ctop:
401{ .mfi; (p25) getf.sig r36=f49 // low
402 (p21) xmpy.lu f45=f37,f8
403 (p27) cmp.ltu p52,p48=r39,r38 }
404{ .mfi; (p16) ldf8 f32=[r15],8
405 (p21) xmpy.hu f38=f37,f8
406 (p27) add r43=r43,r39 };;
407{ .mii; (p26) getf.sig r32=f43 // high
408 .pred.rel "mutex",p48,p52
409 (p48) add r38=r37,r33 // (p26)
410 (p52) add r38=r37,r33,1 } // (p26)
411{ .mfb; (p27) cmp.ltu.unc p56,p0=r43,r39
412 (p0) nop.f 0x0
413 (p0) nop.b 0x0 }
414{ .mii; (p26) ld8 r42=[r18],8
415 (p58) cmp.eq.or p57,p0=-1,r44
416 (p58) add r44=1,r44 }
417{ .mfb; (p29) st8 [r14]=r45,8
418 (p0) nop.f 0x0
419 br.ctop.sptk .L_bn_mul_add_words_ctop};;
420.L_bn_mul_add_words_cend:
421
422{ .mii; nop.m 0x0
423.pred.rel "mutex",p51,p55
424(p51) add r8=r36,r0
425(p55) add r8=r36,r0,1 }
426{ .mfb; nop.m 0x0
427 nop.f 0x0
428 nop.b 0x0 };;
429{ .mii;
430(p59) add r8=1,r8
431 mov pr=r9,-1
432 mov ar.lc=r3 }
433{ .mfb; rum 1<<5 // clear um.mfh
434 nop.f 0x0
435 br.ret.sptk.many b0 };;
436.endp bn_mul_add_words#
437#endif
438
439#if 1
440//
441// void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
442//
443.global bn_sqr_words#
444.proc bn_sqr_words#
445.align 64
446.skip 32 // makes the loop body aligned at 64-byte boundary
447bn_sqr_words:
448 .prologue
449 .fframe 0
450 .save ar.pfs,r2
451{ .mii; alloc r2=ar.pfs,3,0,0,0
452 sxt4 r34=r34 };;
453{ .mii; cmp.le p6,p0=r34,r0
454 mov r8=r0 } // return value
455{ .mfb; nop.f 0x0
456(p6) br.ret.spnt.many b0 };;
457
458 .save ar.lc,r3
459{ .mii; sub r10=r34,r0,1
460 mov r3=ar.lc
461 mov r9=pr };;
462
463 .body
464{ .mib;
465 mov pr.rot=1<<16
466 brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
467 }
468{ .mii; add r34=8,r32
469 mov ar.lc=r10
470 mov ar.ec=18 };;
471
472// 2*(n+17) on Itanium, (n+17) on "wider" IA-64 implementations. It's
473// possible to compress the epilogue (I'm getting tired to write this
474// comment over and over) and get down to 2*n+16 at the cost of
475// scalability. The decision will very likely be reconsidered after the
476// benchmark program is profiled. I.e. if perfomance gain on Itanium
477// will appear larger than loss on "wider" IA-64, then the loop should
478// be explicitely split and the epilogue compressed.
479.L_bn_sqr_words_ctop:
480{ .mfi; (p16) ldf8 f32=[r33],8
481 (p25) xmpy.lu f42=f41,f41
482 (p0) nop.i 0x0 }
483{ .mib; (p33) stf8 [r32]=f50,16
484 (p0) nop.i 0x0
485 (p0) nop.b 0x0 }
486{ .mfi; (p0) nop.m 0x0
487 (p25) xmpy.hu f52=f41,f41
488 (p0) nop.i 0x0 }
489{ .mib; (p33) stf8 [r34]=f60,16
490 (p0) nop.i 0x0
491 br.ctop.sptk .L_bn_sqr_words_ctop };;
492.L_bn_sqr_words_cend:
493
494{ .mii; nop.m 0x0
495 mov pr=r9,-1
496 mov ar.lc=r3 }
497{ .mfb; rum 1<<5 // clear um.mfh
498 nop.f 0x0
499 br.ret.sptk.many b0 };;
500.endp bn_sqr_words#
501#endif
502
503#if 1
504// Apparently we win nothing by implementing special bn_sqr_comba8.
505// Yes, it is possible to reduce the number of multiplications by
506// almost factor of two, but then the amount of additions would
507// increase by factor of two (as we would have to perform those
508// otherwise performed by xma ourselves). Normally we would trade
509// anyway as multiplications are way more expensive, but not this
510// time... Multiplication kernel is fully pipelined and as we drain
511// one 128-bit multiplication result per clock cycle multiplications
512// are effectively as inexpensive as additions. Special implementation
513// might become of interest for "wider" IA-64 implementation as you'll
514// be able to get through the multiplication phase faster (there won't
515// be any stall issues as discussed in the commentary section below and
516// you therefore will be able to employ all 4 FP units)... But these
517// Itanium days it's simply too hard to justify the effort so I just
518// drop down to bn_mul_comba8 code:-)
519//
520// void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
521//
522.global bn_sqr_comba8#
523.proc bn_sqr_comba8#
524.align 64
525bn_sqr_comba8:
526 .prologue
527 .fframe 0
528 .save ar.pfs,r2
529{ .mii; alloc r2=ar.pfs,2,1,0,0
530 mov r34=r33
531 add r14=8,r33 };;
532 .body
533{ .mii; add r17=8,r34
534 add r15=16,r33
535 add r18=16,r34 }
536{ .mfb; add r16=24,r33
537 br .L_cheat_entry_point8 };;
538.endp bn_sqr_comba8#
539#endif
540
541#if 1
542// I've estimated this routine to run in ~120 ticks, but in reality
543// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
544// cycles consumed for instructions fetch? Or did I misinterpret some
545// clause in Itanium µ-architecture manual? Comments are welcomed and
546// highly appreciated.
547//
548// However! It should be noted that even 160 ticks is darn good result
549// as it's over 10 (yes, ten, spelled as t-e-n) times faster than the
550// C version (compiled with gcc with inline assembler). I really
551// kicked compiler's butt here, didn't I? Yeah! This brings us to the
552// following statement. It's damn shame that this routine isn't called
553// very often nowadays! According to the profiler most CPU time is
554// consumed by bn_mul_add_words called from BN_from_montgomery. In
555// order to estimate what we're missing, I've compared the performance
556// of this routine against "traditional" implementation, i.e. against
557// following routine:
558//
559// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
560// { r[ 8]=bn_mul_words( &(r[0]),a,8,b[0]);
561// r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
562// r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
563// r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
564// r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
565// r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
566// r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
567// r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
568// }
569//
570// The one below is over 8 times faster than the one above:-( Even
571// more reasons to "combafy" bn_mul_add_mont...
572//
573// And yes, this routine really made me wish there were an optimizing
574// assembler! It also feels like it deserves a dedication.
575//
576// To my wife for being there and to my kids...
577//
578// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
579//
580#define carry1 r14
581#define carry2 r15
582#define carry3 r34
583.global bn_mul_comba8#
584.proc bn_mul_comba8#
585.align 64
586bn_mul_comba8:
587 .prologue
588 .fframe 0
589 .save ar.pfs,r2
590{ .mii; alloc r2=ar.pfs,3,0,0,0
591 add r14=8,r33
592 add r17=8,r34 }
593 .body
594{ .mii; add r15=16,r33
595 add r18=16,r34
596 add r16=24,r33 }
597.L_cheat_entry_point8:
598{ .mmi; add r19=24,r34
599
600 ldf8 f32=[r33],32 };;
601
602{ .mmi; ldf8 f120=[r34],32
603 ldf8 f121=[r17],32 }
604{ .mmi; ldf8 f122=[r18],32
605 ldf8 f123=[r19],32 };;
606{ .mmi; ldf8 f124=[r34]
607 ldf8 f125=[r17] }
608{ .mmi; ldf8 f126=[r18]
609 ldf8 f127=[r19] }
610
611{ .mmi; ldf8 f33=[r14],32
612 ldf8 f34=[r15],32 }
613{ .mmi; ldf8 f35=[r16],32;;
614 ldf8 f36=[r33] }
615{ .mmi; ldf8 f37=[r14]
616 ldf8 f38=[r15] }
617{ .mfi; ldf8 f39=[r16]
618// -------\ Entering multiplier's heaven /-------
619// ------------\ /------------
620// -----------------\ /-----------------
621// ----------------------\/----------------------
622 xma.hu f41=f32,f120,f0 }
623{ .mfi; xma.lu f40=f32,f120,f0 };; // (*)
624{ .mfi; xma.hu f51=f32,f121,f0 }
625{ .mfi; xma.lu f50=f32,f121,f0 };;
626{ .mfi; xma.hu f61=f32,f122,f0 }
627{ .mfi; xma.lu f60=f32,f122,f0 };;
628{ .mfi; xma.hu f71=f32,f123,f0 }
629{ .mfi; xma.lu f70=f32,f123,f0 };;
630{ .mfi; xma.hu f81=f32,f124,f0 }
631{ .mfi; xma.lu f80=f32,f124,f0 };;
632{ .mfi; xma.hu f91=f32,f125,f0 }
633{ .mfi; xma.lu f90=f32,f125,f0 };;
634{ .mfi; xma.hu f101=f32,f126,f0 }
635{ .mfi; xma.lu f100=f32,f126,f0 };;
636{ .mfi; xma.hu f111=f32,f127,f0 }
637{ .mfi; xma.lu f110=f32,f127,f0 };;//
638// (*) You can argue that splitting at every second bundle would
639// prevent "wider" IA-64 implementations from achieving the peak
640// performance. Well, not really... The catch is that if you
641// intend to keep 4 FP units busy by splitting at every fourth
642// bundle and thus perform these 16 multiplications in 4 ticks,
643// the first bundle *below* would stall because the result from
644// the first xma bundle *above* won't be available for another 3
645// ticks (if not more, being an optimist, I assume that "wider"
646// implementation will have same latency:-). This stall will hold
647// you back and the performance would be as if every second bundle
648// were split *anyway*...
649{ .mfi; getf.sig r16=f40
650 xma.hu f42=f33,f120,f41
651 add r33=8,r32 }
652{ .mfi; xma.lu f41=f33,f120,f41 };;
653{ .mfi; getf.sig r24=f50
654 xma.hu f52=f33,f121,f51 }
655{ .mfi; xma.lu f51=f33,f121,f51 };;
656{ .mfi; st8 [r32]=r16,16
657 xma.hu f62=f33,f122,f61 }
658{ .mfi; xma.lu f61=f33,f122,f61 };;
659{ .mfi; xma.hu f72=f33,f123,f71 }
660{ .mfi; xma.lu f71=f33,f123,f71 };;
661{ .mfi; xma.hu f82=f33,f124,f81 }
662{ .mfi; xma.lu f81=f33,f124,f81 };;
663{ .mfi; xma.hu f92=f33,f125,f91 }
664{ .mfi; xma.lu f91=f33,f125,f91 };;
665{ .mfi; xma.hu f102=f33,f126,f101 }
666{ .mfi; xma.lu f101=f33,f126,f101 };;
667{ .mfi; xma.hu f112=f33,f127,f111 }
668{ .mfi; xma.lu f111=f33,f127,f111 };;//
669//-------------------------------------------------//
670{ .mfi; getf.sig r25=f41
671 xma.hu f43=f34,f120,f42 }
672{ .mfi; xma.lu f42=f34,f120,f42 };;
673{ .mfi; getf.sig r16=f60
674 xma.hu f53=f34,f121,f52 }
675{ .mfi; xma.lu f52=f34,f121,f52 };;
676{ .mfi; getf.sig r17=f51
677 xma.hu f63=f34,f122,f62
678 add r25=r25,r24 }
679{ .mfi; xma.lu f62=f34,f122,f62
680 mov carry1=0 };;
681{ .mfi; cmp.ltu p6,p0=r25,r24
682 xma.hu f73=f34,f123,f72 }
683{ .mfi; xma.lu f72=f34,f123,f72 };;
684{ .mfi; st8 [r33]=r25,16
685 xma.hu f83=f34,f124,f82
686(p6) add carry1=1,carry1 }
687{ .mfi; xma.lu f82=f34,f124,f82 };;
688{ .mfi; xma.hu f93=f34,f125,f92 }
689{ .mfi; xma.lu f92=f34,f125,f92 };;
690{ .mfi; xma.hu f103=f34,f126,f102 }
691{ .mfi; xma.lu f102=f34,f126,f102 };;
692{ .mfi; xma.hu f113=f34,f127,f112 }
693{ .mfi; xma.lu f112=f34,f127,f112 };;//
694//-------------------------------------------------//
695{ .mfi; getf.sig r18=f42
696 xma.hu f44=f35,f120,f43
697 add r17=r17,r16 }
698{ .mfi; xma.lu f43=f35,f120,f43 };;
699{ .mfi; getf.sig r24=f70
700 xma.hu f54=f35,f121,f53 }
701{ .mfi; mov carry2=0
702 xma.lu f53=f35,f121,f53 };;
703{ .mfi; getf.sig r25=f61
704 xma.hu f64=f35,f122,f63
705 cmp.ltu p7,p0=r17,r16 }
706{ .mfi; add r18=r18,r17
707 xma.lu f63=f35,f122,f63 };;
708{ .mfi; getf.sig r26=f52
709 xma.hu f74=f35,f123,f73
710(p7) add carry2=1,carry2 }
711{ .mfi; cmp.ltu p7,p0=r18,r17
712 xma.lu f73=f35,f123,f73
713 add r18=r18,carry1 };;
714{ .mfi;
715 xma.hu f84=f35,f124,f83
716(p7) add carry2=1,carry2 }
717{ .mfi; cmp.ltu p7,p0=r18,carry1
718 xma.lu f83=f35,f124,f83 };;
719{ .mfi; st8 [r32]=r18,16
720 xma.hu f94=f35,f125,f93
721(p7) add carry2=1,carry2 }
722{ .mfi; xma.lu f93=f35,f125,f93 };;
723{ .mfi; xma.hu f104=f35,f126,f103 }
724{ .mfi; xma.lu f103=f35,f126,f103 };;
725{ .mfi; xma.hu f114=f35,f127,f113 }
726{ .mfi; mov carry1=0
727 xma.lu f113=f35,f127,f113
728 add r25=r25,r24 };;//
729//-------------------------------------------------//
730{ .mfi; getf.sig r27=f43
731 xma.hu f45=f36,f120,f44
732 cmp.ltu p6,p0=r25,r24 }
733{ .mfi; xma.lu f44=f36,f120,f44
734 add r26=r26,r25 };;
735{ .mfi; getf.sig r16=f80
736 xma.hu f55=f36,f121,f54
737(p6) add carry1=1,carry1 }
738{ .mfi; xma.lu f54=f36,f121,f54 };;
739{ .mfi; getf.sig r17=f71
740 xma.hu f65=f36,f122,f64
741 cmp.ltu p6,p0=r26,r25 }
742{ .mfi; xma.lu f64=f36,f122,f64
743 add r27=r27,r26 };;
744{ .mfi; getf.sig r18=f62
745 xma.hu f75=f36,f123,f74
746(p6) add carry1=1,carry1 }
747{ .mfi; cmp.ltu p6,p0=r27,r26
748 xma.lu f74=f36,f123,f74
749 add r27=r27,carry2 };;
750{ .mfi; getf.sig r19=f53
751 xma.hu f85=f36,f124,f84
752(p6) add carry1=1,carry1 }
753{ .mfi; xma.lu f84=f36,f124,f84
754 cmp.ltu p6,p0=r27,carry2 };;
755{ .mfi; st8 [r33]=r27,16
756 xma.hu f95=f36,f125,f94
757(p6) add carry1=1,carry1 }
758{ .mfi; xma.lu f94=f36,f125,f94 };;
759{ .mfi; xma.hu f105=f36,f126,f104 }
760{ .mfi; mov carry2=0
761 xma.lu f104=f36,f126,f104
762 add r17=r17,r16 };;
763{ .mfi; xma.hu f115=f36,f127,f114
764 cmp.ltu p7,p0=r17,r16 }
765{ .mfi; xma.lu f114=f36,f127,f114
766 add r18=r18,r17 };;//
767//-------------------------------------------------//
768{ .mfi; getf.sig r20=f44
769 xma.hu f46=f37,f120,f45
770(p7) add carry2=1,carry2 }
771{ .mfi; cmp.ltu p7,p0=r18,r17
772 xma.lu f45=f37,f120,f45
773 add r19=r19,r18 };;
774{ .mfi; getf.sig r24=f90
775 xma.hu f56=f37,f121,f55 }
776{ .mfi; xma.lu f55=f37,f121,f55 };;
777{ .mfi; getf.sig r25=f81
778 xma.hu f66=f37,f122,f65
779(p7) add carry2=1,carry2 }
780{ .mfi; cmp.ltu p7,p0=r19,r18
781 xma.lu f65=f37,f122,f65
782 add r20=r20,r19 };;
783{ .mfi; getf.sig r26=f72
784 xma.hu f76=f37,f123,f75
785(p7) add carry2=1,carry2 }
786{ .mfi; cmp.ltu p7,p0=r20,r19
787 xma.lu f75=f37,f123,f75
788 add r20=r20,carry1 };;
789{ .mfi; getf.sig r27=f63
790 xma.hu f86=f37,f124,f85
791(p7) add carry2=1,carry2 }
792{ .mfi; xma.lu f85=f37,f124,f85
793 cmp.ltu p7,p0=r20,carry1 };;
794{ .mfi; getf.sig r28=f54
795 xma.hu f96=f37,f125,f95
796(p7) add carry2=1,carry2 }
797{ .mfi; st8 [r32]=r20,16
798 xma.lu f95=f37,f125,f95 };;
799{ .mfi; xma.hu f106=f37,f126,f105 }
800{ .mfi; mov carry1=0
801 xma.lu f105=f37,f126,f105
802 add r25=r25,r24 };;
803{ .mfi; xma.hu f116=f37,f127,f115
804 cmp.ltu p6,p0=r25,r24 }
805{ .mfi; xma.lu f115=f37,f127,f115
806 add r26=r26,r25 };;//
807//-------------------------------------------------//
808{ .mfi; getf.sig r29=f45
809 xma.hu f47=f38,f120,f46
810(p6) add carry1=1,carry1 }
811{ .mfi; cmp.ltu p6,p0=r26,r25
812 xma.lu f46=f38,f120,f46
813 add r27=r27,r26 };;
814{ .mfi; getf.sig r16=f100
815 xma.hu f57=f38,f121,f56
816(p6) add carry1=1,carry1 }
817{ .mfi; cmp.ltu p6,p0=r27,r26
818 xma.lu f56=f38,f121,f56
819 add r28=r28,r27 };;
820{ .mfi; getf.sig r17=f91
821 xma.hu f67=f38,f122,f66
822(p6) add carry1=1,carry1 }
823{ .mfi; cmp.ltu p6,p0=r28,r27
824 xma.lu f66=f38,f122,f66
825 add r29=r29,r28 };;
826{ .mfi; getf.sig r18=f82
827 xma.hu f77=f38,f123,f76
828(p6) add carry1=1,carry1 }
829{ .mfi; cmp.ltu p6,p0=r29,r28
830 xma.lu f76=f38,f123,f76
831 add r29=r29,carry2 };;
832{ .mfi; getf.sig r19=f73
833 xma.hu f87=f38,f124,f86
834(p6) add carry1=1,carry1 }
835{ .mfi; xma.lu f86=f38,f124,f86
836 cmp.ltu p6,p0=r29,carry2 };;
837{ .mfi; getf.sig r20=f64
838 xma.hu f97=f38,f125,f96
839(p6) add carry1=1,carry1 }
840{ .mfi; st8 [r33]=r29,16
841 xma.lu f96=f38,f125,f96 };;
842{ .mfi; getf.sig r21=f55
843 xma.hu f107=f38,f126,f106 }
844{ .mfi; mov carry2=0
845 xma.lu f106=f38,f126,f106
846 add r17=r17,r16 };;
847{ .mfi; xma.hu f117=f38,f127,f116
848 cmp.ltu p7,p0=r17,r16 }
849{ .mfi; xma.lu f116=f38,f127,f116
850 add r18=r18,r17 };;//
851//-------------------------------------------------//
852{ .mfi; getf.sig r22=f46
853 xma.hu f48=f39,f120,f47
854(p7) add carry2=1,carry2 }
855{ .mfi; cmp.ltu p7,p0=r18,r17
856 xma.lu f47=f39,f120,f47
857 add r19=r19,r18 };;
858{ .mfi; getf.sig r24=f110
859 xma.hu f58=f39,f121,f57
860(p7) add carry2=1,carry2 }
861{ .mfi; cmp.ltu p7,p0=r19,r18
862 xma.lu f57=f39,f121,f57
863 add r20=r20,r19 };;
864{ .mfi; getf.sig r25=f101
865 xma.hu f68=f39,f122,f67
866(p7) add carry2=1,carry2 }
867{ .mfi; cmp.ltu p7,p0=r20,r19
868 xma.lu f67=f39,f122,f67
869 add r21=r21,r20 };;
870{ .mfi; getf.sig r26=f92
871 xma.hu f78=f39,f123,f77
872(p7) add carry2=1,carry2 }
873{ .mfi; cmp.ltu p7,p0=r21,r20
874 xma.lu f77=f39,f123,f77
875 add r22=r22,r21 };;
876{ .mfi; getf.sig r27=f83
877 xma.hu f88=f39,f124,f87
878(p7) add carry2=1,carry2 }
879{ .mfi; cmp.ltu p7,p0=r22,r21
880 xma.lu f87=f39,f124,f87
881 add r22=r22,carry1 };;
882{ .mfi; getf.sig r28=f74
883 xma.hu f98=f39,f125,f97
884(p7) add carry2=1,carry2 }
885{ .mfi; xma.lu f97=f39,f125,f97
886 cmp.ltu p7,p0=r22,carry1 };;
887{ .mfi; getf.sig r29=f65
888 xma.hu f108=f39,f126,f107
889(p7) add carry2=1,carry2 }
890{ .mfi; st8 [r32]=r22,16
891 xma.lu f107=f39,f126,f107 };;
892{ .mfi; getf.sig r30=f56
893 xma.hu f118=f39,f127,f117 }
894{ .mfi; xma.lu f117=f39,f127,f117 };;//
895//-------------------------------------------------//
896// Leaving muliplier's heaven... Quite a ride, huh?
897
898{ .mii; getf.sig r31=f47
899 add r25=r25,r24
900 mov carry1=0 };;
901{ .mii; getf.sig r16=f111
902 cmp.ltu p6,p0=r25,r24
903 add r26=r26,r25 };;
904{ .mfb; getf.sig r17=f102 }
905{ .mii;
906(p6) add carry1=1,carry1
907 cmp.ltu p6,p0=r26,r25
908 add r27=r27,r26 };;
909{ .mfb; nop.m 0x0 }
910{ .mii;
911(p6) add carry1=1,carry1
912 cmp.ltu p6,p0=r27,r26
913 add r28=r28,r27 };;
914{ .mii; getf.sig r18=f93
915 add r17=r17,r16
916 mov carry3=0 }
917{ .mii;
918(p6) add carry1=1,carry1
919 cmp.ltu p6,p0=r28,r27
920 add r29=r29,r28 };;
921{ .mii; getf.sig r19=f84
922 cmp.ltu p7,p0=r17,r16 }
923{ .mii;
924(p6) add carry1=1,carry1
925 cmp.ltu p6,p0=r29,r28
926 add r30=r30,r29 };;
927{ .mii; getf.sig r20=f75
928 add r18=r18,r17 }
929{ .mii;
930(p6) add carry1=1,carry1
931 cmp.ltu p6,p0=r30,r29
932 add r31=r31,r30 };;
933{ .mfb; getf.sig r21=f66 }
934{ .mii; (p7) add carry3=1,carry3
935 cmp.ltu p7,p0=r18,r17
936 add r19=r19,r18 }
937{ .mfb; nop.m 0x0 }
938{ .mii;
939(p6) add carry1=1,carry1
940 cmp.ltu p6,p0=r31,r30
941 add r31=r31,carry2 };;
942{ .mfb; getf.sig r22=f57 }
943{ .mii; (p7) add carry3=1,carry3
944 cmp.ltu p7,p0=r19,r18
945 add r20=r20,r19 }
946{ .mfb; nop.m 0x0 }
947{ .mii;
948(p6) add carry1=1,carry1
949 cmp.ltu p6,p0=r31,carry2 };;
950{ .mfb; getf.sig r23=f48 }
951{ .mii; (p7) add carry3=1,carry3
952 cmp.ltu p7,p0=r20,r19
953 add r21=r21,r20 }
954{ .mii;
955(p6) add carry1=1,carry1 }
956{ .mfb; st8 [r33]=r31,16 };;
957
958{ .mfb; getf.sig r24=f112 }
959{ .mii; (p7) add carry3=1,carry3
960 cmp.ltu p7,p0=r21,r20
961 add r22=r22,r21 };;
962{ .mfb; getf.sig r25=f103 }
963{ .mii; (p7) add carry3=1,carry3
964 cmp.ltu p7,p0=r22,r21
965 add r23=r23,r22 };;
966{ .mfb; getf.sig r26=f94 }
967{ .mii; (p7) add carry3=1,carry3
968 cmp.ltu p7,p0=r23,r22
969 add r23=r23,carry1 };;
970{ .mfb; getf.sig r27=f85 }
971{ .mii; (p7) add carry3=1,carry3
972 cmp.ltu p7,p8=r23,carry1};;
973{ .mii; getf.sig r28=f76
974 add r25=r25,r24
975 mov carry1=0 }
976{ .mii; st8 [r32]=r23,16
977 (p7) add carry2=1,carry3
978 (p8) add carry2=0,carry3 };;
979
980{ .mfb; nop.m 0x0 }
981{ .mii; getf.sig r29=f67
982 cmp.ltu p6,p0=r25,r24
983 add r26=r26,r25 };;
984{ .mfb; getf.sig r30=f58 }
985{ .mii;
986(p6) add carry1=1,carry1
987 cmp.ltu p6,p0=r26,r25
988 add r27=r27,r26 };;
989{ .mfb; getf.sig r16=f113 }
990{ .mii;
991(p6) add carry1=1,carry1
992 cmp.ltu p6,p0=r27,r26
993 add r28=r28,r27 };;
994{ .mfb; getf.sig r17=f104 }
995{ .mii;
996(p6) add carry1=1,carry1
997 cmp.ltu p6,p0=r28,r27
998 add r29=r29,r28 };;
999{ .mfb; getf.sig r18=f95 }
1000{ .mii;
1001(p6) add carry1=1,carry1
1002 cmp.ltu p6,p0=r29,r28
1003 add r30=r30,r29 };;
1004{ .mii; getf.sig r19=f86
1005 add r17=r17,r16
1006 mov carry3=0 }
1007{ .mii;
1008(p6) add carry1=1,carry1
1009 cmp.ltu p6,p0=r30,r29
1010 add r30=r30,carry2 };;
1011{ .mii; getf.sig r20=f77
1012 cmp.ltu p7,p0=r17,r16
1013 add r18=r18,r17 }
1014{ .mii;
1015(p6) add carry1=1,carry1
1016 cmp.ltu p6,p0=r30,carry2 };;
1017{ .mfb; getf.sig r21=f68 }
1018{ .mii; st8 [r33]=r30,16
1019(p6) add carry1=1,carry1 };;
1020
1021{ .mfb; getf.sig r24=f114 }
1022{ .mii; (p7) add carry3=1,carry3
1023 cmp.ltu p7,p0=r18,r17
1024 add r19=r19,r18 };;
1025{ .mfb; getf.sig r25=f105 }
1026{ .mii; (p7) add carry3=1,carry3
1027 cmp.ltu p7,p0=r19,r18
1028 add r20=r20,r19 };;
1029{ .mfb; getf.sig r26=f96 }
1030{ .mii; (p7) add carry3=1,carry3
1031 cmp.ltu p7,p0=r20,r19
1032 add r21=r21,r20 };;
1033{ .mfb; getf.sig r27=f87 }
1034{ .mii; (p7) add carry3=1,carry3
1035 cmp.ltu p7,p0=r21,r20
1036 add r21=r21,carry1 };;
1037{ .mib; getf.sig r28=f78
1038 add r25=r25,r24 }
1039{ .mib; (p7) add carry3=1,carry3
1040 cmp.ltu p7,p8=r21,carry1};;
1041{ .mii; st8 [r32]=r21,16
1042 (p7) add carry2=1,carry3
1043 (p8) add carry2=0,carry3 }
1044
1045{ .mii; mov carry1=0
1046 cmp.ltu p6,p0=r25,r24
1047 add r26=r26,r25 };;
1048{ .mfb; getf.sig r16=f115 }
1049{ .mii;
1050(p6) add carry1=1,carry1
1051 cmp.ltu p6,p0=r26,r25
1052 add r27=r27,r26 };;
1053{ .mfb; getf.sig r17=f106 }
1054{ .mii;
1055(p6) add carry1=1,carry1
1056 cmp.ltu p6,p0=r27,r26
1057 add r28=r28,r27 };;
1058{ .mfb; getf.sig r18=f97 }
1059{ .mii;
1060(p6) add carry1=1,carry1
1061 cmp.ltu p6,p0=r28,r27
1062 add r28=r28,carry2 };;
1063{ .mib; getf.sig r19=f88
1064 add r17=r17,r16 }
1065{ .mib;
1066(p6) add carry1=1,carry1
1067 cmp.ltu p6,p0=r28,carry2 };;
1068{ .mii; st8 [r33]=r28,16
1069(p6) add carry1=1,carry1 }
1070
1071{ .mii; mov carry2=0
1072 cmp.ltu p7,p0=r17,r16
1073 add r18=r18,r17 };;
1074{ .mfb; getf.sig r24=f116 }
1075{ .mii; (p7) add carry2=1,carry2
1076 cmp.ltu p7,p0=r18,r17
1077 add r19=r19,r18 };;
1078{ .mfb; getf.sig r25=f107 }
1079{ .mii; (p7) add carry2=1,carry2
1080 cmp.ltu p7,p0=r19,r18
1081 add r19=r19,carry1 };;
1082{ .mfb; getf.sig r26=f98 }
1083{ .mii; (p7) add carry2=1,carry2
1084 cmp.ltu p7,p0=r19,carry1};;
1085{ .mii; st8 [r32]=r19,16
1086 (p7) add carry2=1,carry2 }
1087
1088{ .mfb; add r25=r25,r24 };;
1089
1090{ .mfb; getf.sig r16=f117 }
1091{ .mii; mov carry1=0
1092 cmp.ltu p6,p0=r25,r24
1093 add r26=r26,r25 };;
1094{ .mfb; getf.sig r17=f108 }
1095{ .mii;
1096(p6) add carry1=1,carry1
1097 cmp.ltu p6,p0=r26,r25
1098 add r26=r26,carry2 };;
1099{ .mfb; nop.m 0x0 }
1100{ .mii;
1101(p6) add carry1=1,carry1
1102 cmp.ltu p6,p0=r26,carry2 };;
1103{ .mii; st8 [r33]=r26,16
1104(p6) add carry1=1,carry1 }
1105
1106{ .mfb; add r17=r17,r16 };;
1107{ .mfb; getf.sig r24=f118 }
1108{ .mii; mov carry2=0
1109 cmp.ltu p7,p0=r17,r16
1110 add r17=r17,carry1 };;
1111{ .mii; (p7) add carry2=1,carry2
1112 cmp.ltu p7,p0=r17,carry1};;
1113{ .mii; st8 [r32]=r17
1114 (p7) add carry2=1,carry2 };;
1115{ .mfb; add r24=r24,carry2 };;
1116{ .mib; st8 [r33]=r24 }
1117
1118{ .mib; rum 1<<5 // clear um.mfh
1119 br.ret.sptk.many b0 };;
1120.endp bn_mul_comba8#
1121#undef carry3
1122#undef carry2
1123#undef carry1
1124#endif
1125
1126#if 1
1127// It's possible to make it faster (see comment to bn_sqr_comba8), but
1128// I reckon it doesn't worth the effort. Basically because the routine
1129// (actually both of them) practically never called... So I just play
1130// same trick as with bn_sqr_comba8.
1131//
1132// void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
1133//
1134.global bn_sqr_comba4#
1135.proc bn_sqr_comba4#
1136.align 64
1137bn_sqr_comba4:
1138 .prologue
1139 .fframe 0
1140 .save ar.pfs,r2
1141{ .mii; alloc r2=ar.pfs,2,1,0,0
1142 mov r34=r33
1143 add r14=8,r33 };;
1144 .body
1145{ .mii; add r17=8,r34
1146 add r15=16,r33
1147 add r18=16,r34 }
1148{ .mfb; add r16=24,r33
1149 br .L_cheat_entry_point4 };;
1150.endp bn_sqr_comba4#
1151#endif
1152
1153#if 1
1154// Runs in ~115 cycles and ~4.5 times faster than C. Well, whatever...
1155//
1156// void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1157//
1158#define carry1 r14
1159#define carry2 r15
1160.global bn_mul_comba4#
1161.proc bn_mul_comba4#
1162.align 64
1163bn_mul_comba4:
1164 .prologue
1165 .fframe 0
1166 .save ar.pfs,r2
1167{ .mii; alloc r2=ar.pfs,3,0,0,0
1168 add r14=8,r33
1169 add r17=8,r34 }
1170 .body
1171{ .mii; add r15=16,r33
1172 add r18=16,r34
1173 add r16=24,r33 };;
1174.L_cheat_entry_point4:
1175{ .mmi; add r19=24,r34
1176
1177 ldf8 f32=[r33] }
1178
1179{ .mmi; ldf8 f120=[r34]
1180 ldf8 f121=[r17] };;
1181{ .mmi; ldf8 f122=[r18]
1182 ldf8 f123=[r19] }
1183
1184{ .mmi; ldf8 f33=[r14]
1185 ldf8 f34=[r15] }
1186{ .mfi; ldf8 f35=[r16]
1187
1188 xma.hu f41=f32,f120,f0 }
1189{ .mfi; xma.lu f40=f32,f120,f0 };;
1190{ .mfi; xma.hu f51=f32,f121,f0 }
1191{ .mfi; xma.lu f50=f32,f121,f0 };;
1192{ .mfi; xma.hu f61=f32,f122,f0 }
1193{ .mfi; xma.lu f60=f32,f122,f0 };;
1194{ .mfi; xma.hu f71=f32,f123,f0 }
1195{ .mfi; xma.lu f70=f32,f123,f0 };;//
1196// Major stall takes place here, and 3 more places below. Result from
1197// first xma is not available for another 3 ticks.
1198{ .mfi; getf.sig r16=f40
1199 xma.hu f42=f33,f120,f41
1200 add r33=8,r32 }
1201{ .mfi; xma.lu f41=f33,f120,f41 };;
1202{ .mfi; getf.sig r24=f50
1203 xma.hu f52=f33,f121,f51 }
1204{ .mfi; xma.lu f51=f33,f121,f51 };;
1205{ .mfi; st8 [r32]=r16,16
1206 xma.hu f62=f33,f122,f61 }
1207{ .mfi; xma.lu f61=f33,f122,f61 };;
1208{ .mfi; xma.hu f72=f33,f123,f71 }
1209{ .mfi; xma.lu f71=f33,f123,f71 };;//
1210//-------------------------------------------------//
1211{ .mfi; getf.sig r25=f41
1212 xma.hu f43=f34,f120,f42 }
1213{ .mfi; xma.lu f42=f34,f120,f42 };;
1214{ .mfi; getf.sig r16=f60
1215 xma.hu f53=f34,f121,f52 }
1216{ .mfi; xma.lu f52=f34,f121,f52 };;
1217{ .mfi; getf.sig r17=f51
1218 xma.hu f63=f34,f122,f62
1219 add r25=r25,r24 }
1220{ .mfi; mov carry1=0
1221 xma.lu f62=f34,f122,f62 };;
1222{ .mfi; st8 [r33]=r25,16
1223 xma.hu f73=f34,f123,f72
1224 cmp.ltu p6,p0=r25,r24 }
1225{ .mfi; xma.lu f72=f34,f123,f72 };;//
1226//-------------------------------------------------//
1227{ .mfi; getf.sig r18=f42
1228 xma.hu f44=f35,f120,f43
1229(p6) add carry1=1,carry1 }
1230{ .mfi; add r17=r17,r16
1231 xma.lu f43=f35,f120,f43
1232 mov carry2=0 };;
1233{ .mfi; getf.sig r24=f70
1234 xma.hu f54=f35,f121,f53
1235 cmp.ltu p7,p0=r17,r16 }
1236{ .mfi; xma.lu f53=f35,f121,f53 };;
1237{ .mfi; getf.sig r25=f61
1238 xma.hu f64=f35,f122,f63
1239 add r18=r18,r17 }
1240{ .mfi; xma.lu f63=f35,f122,f63
1241(p7) add carry2=1,carry2 };;
1242{ .mfi; getf.sig r26=f52
1243 xma.hu f74=f35,f123,f73
1244 cmp.ltu p7,p0=r18,r17 }
1245{ .mfi; xma.lu f73=f35,f123,f73
1246 add r18=r18,carry1 };;
1247//-------------------------------------------------//
1248{ .mii; st8 [r32]=r18,16
1249(p7) add carry2=1,carry2
1250 cmp.ltu p7,p0=r18,carry1 };;
1251
1252{ .mfi; getf.sig r27=f43 // last major stall
1253(p7) add carry2=1,carry2 };;
1254{ .mii; getf.sig r16=f71
1255 add r25=r25,r24
1256 mov carry1=0 };;
1257{ .mii; getf.sig r17=f62
1258 cmp.ltu p6,p0=r25,r24
1259 add r26=r26,r25 };;
1260{ .mii;
1261(p6) add carry1=1,carry1
1262 cmp.ltu p6,p0=r26,r25
1263 add r27=r27,r26 };;
1264{ .mii;
1265(p6) add carry1=1,carry1
1266 cmp.ltu p6,p0=r27,r26
1267 add r27=r27,carry2 };;
1268{ .mii; getf.sig r18=f53
1269(p6) add carry1=1,carry1
1270 cmp.ltu p6,p0=r27,carry2 };;
1271{ .mfi; st8 [r33]=r27,16
1272(p6) add carry1=1,carry1 }
1273
1274{ .mii; getf.sig r19=f44
1275 add r17=r17,r16
1276 mov carry2=0 };;
1277{ .mii; getf.sig r24=f72
1278 cmp.ltu p7,p0=r17,r16
1279 add r18=r18,r17 };;
1280{ .mii; (p7) add carry2=1,carry2
1281 cmp.ltu p7,p0=r18,r17
1282 add r19=r19,r18 };;
1283{ .mii; (p7) add carry2=1,carry2
1284 cmp.ltu p7,p0=r19,r18
1285 add r19=r19,carry1 };;
1286{ .mii; getf.sig r25=f63
1287 (p7) add carry2=1,carry2
1288 cmp.ltu p7,p0=r19,carry1};;
1289{ .mii; st8 [r32]=r19,16
1290 (p7) add carry2=1,carry2 }
1291
1292{ .mii; getf.sig r26=f54
1293 add r25=r25,r24
1294 mov carry1=0 };;
1295{ .mii; getf.sig r16=f73
1296 cmp.ltu p6,p0=r25,r24
1297 add r26=r26,r25 };;
1298{ .mii;
1299(p6) add carry1=1,carry1
1300 cmp.ltu p6,p0=r26,r25
1301 add r26=r26,carry2 };;
1302{ .mii; getf.sig r17=f64
1303(p6) add carry1=1,carry1
1304 cmp.ltu p6,p0=r26,carry2 };;
1305{ .mii; st8 [r33]=r26,16
1306(p6) add carry1=1,carry1 }
1307
1308{ .mii; getf.sig r24=f74
1309 add r17=r17,r16
1310 mov carry2=0 };;
1311{ .mii; cmp.ltu p7,p0=r17,r16
1312 add r17=r17,carry1 };;
1313
1314{ .mii; (p7) add carry2=1,carry2
1315 cmp.ltu p7,p0=r17,carry1};;
1316{ .mii; st8 [r32]=r17,16
1317 (p7) add carry2=1,carry2 };;
1318
1319{ .mii; add r24=r24,carry2 };;
1320{ .mii; st8 [r33]=r24 }
1321
1322{ .mib; rum 1<<5 // clear um.mfh
1323 br.ret.sptk.many b0 };;
1324.endp bn_mul_comba4#
1325#undef carry2
1326#undef carry1
1327#endif
1328
1329#if 1
1330//
1331// BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
1332//
1333// In the nutshell it's a port of my MIPS III/IV implementation.
1334//
1335#define AT r14
1336#define H r16
1337#define HH r20
1338#define L r17
1339#define D r18
1340#define DH r22
1341#define I r21
1342
1343#if 0
1344// Some preprocessors (most notably HP-UX) apper to be allergic to
1345// macros enclosed to parenthesis as these three will be.
1346#define cont p16
1347#define break p0 // p20
1348#define equ p24
1349#else
1350cont=p16
1351break=p0
1352equ=p24
1353#endif
1354
1355.global abort#
1356.global bn_div_words#
1357.proc bn_div_words#
1358.align 64
1359bn_div_words:
1360 .prologue
1361 .fframe 0
1362 .save ar.pfs,r2
1363 .save b0,r3
1364{ .mii; alloc r2=ar.pfs,3,5,0,8
1365 mov r3=b0
1366 mov r10=pr };;
1367{ .mmb; cmp.eq p6,p0=r34,r0
1368 mov r8=-1
1369(p6) br.ret.spnt.many b0 };;
1370
1371 .body
1372{ .mii; mov H=r32 // save h
1373 mov ar.ec=0 // don't rotate at exit
1374 mov pr.rot=0 }
1375{ .mii; mov L=r33 // save l
1376 mov r36=r0 };;
1377
1378.L_divw_shift: // -vv- note signed comparison
1379{ .mfi; (p0) cmp.lt p16,p0=r0,r34 // d
1380 (p0) shladd r33=r34,1,r0 }
1381{ .mfb; (p0) add r35=1,r36
1382 (p0) nop.f 0x0
1383(p16) br.wtop.dpnt .L_divw_shift };;
1384
1385{ .mii; mov D=r34
1386 shr.u DH=r34,32
1387 sub r35=64,r36 };;
1388{ .mii; setf.sig f7=DH
1389 shr.u AT=H,r35
1390 mov I=r36 };;
1391{ .mib; cmp.ne p6,p0=r0,AT
1392 shl H=H,r36
1393(p6) br.call.spnt.clr b0=abort };; // overflow, die...
1394
1395{ .mfi; fcvt.xuf.s1 f7=f7
1396 shr.u AT=L,r35 };;
1397{ .mii; shl L=L,r36
1398 or H=H,AT };;
1399
1400{ .mii; nop.m 0x0
1401 cmp.leu p6,p0=D,H;;
1402(p6) sub H=H,D }
1403
1404{ .mlx; setf.sig f14=D
1405 movl AT=0xffffffff };;
1406///////////////////////////////////////////////////////////
1407{ .mii; setf.sig f6=H
1408 shr.u HH=H,32;;
1409 cmp.eq p6,p7=HH,DH };;
1410{ .mfb;
1411(p6) setf.sig f8=AT
1412(p7) fcvt.xuf.s1 f6=f6
1413(p7) br.call.sptk b6=.L_udiv64_32_b6 };;
1414
1415{ .mfi; getf.sig r33=f8 // q
1416 xmpy.lu f9=f8,f14 }
1417{ .mfi; xmpy.hu f10=f8,f14
1418 shrp H=H,L,32 };;
1419
1420{ .mmi; getf.sig r35=f9 // tl
1421 getf.sig r31=f10 };; // th
1422
1423.L_divw_1st_iter:
1424{ .mii; (p0) add r32=-1,r33
1425 (p0) cmp.eq equ,cont=HH,r31 };;
1426{ .mii; (p0) cmp.ltu p8,p0=r35,D
1427 (p0) sub r34=r35,D
1428 (equ) cmp.leu break,cont=r35,H };;
1429{ .mib; (cont) cmp.leu cont,break=HH,r31
1430 (p8) add r31=-1,r31
1431(cont) br.wtop.spnt .L_divw_1st_iter };;
1432///////////////////////////////////////////////////////////
1433{ .mii; sub H=H,r35
1434 shl r8=r33,32
1435 shl L=L,32 };;
1436///////////////////////////////////////////////////////////
1437{ .mii; setf.sig f6=H
1438 shr.u HH=H,32;;
1439 cmp.eq p6,p7=HH,DH };;
1440{ .mfb;
1441(p6) setf.sig f8=AT
1442(p7) fcvt.xuf.s1 f6=f6
1443(p7) br.call.sptk b6=.L_udiv64_32_b6 };;
1444
1445{ .mfi; getf.sig r33=f8 // q
1446 xmpy.lu f9=f8,f14 }
1447{ .mfi; xmpy.hu f10=f8,f14
1448 shrp H=H,L,32 };;
1449
1450{ .mmi; getf.sig r35=f9 // tl
1451 getf.sig r31=f10 };; // th
1452
1453.L_divw_2nd_iter:
1454{ .mii; (p0) add r32=-1,r33
1455 (p0) cmp.eq equ,cont=HH,r31 };;
1456{ .mii; (p0) cmp.ltu p8,p0=r35,D
1457 (p0) sub r34=r35,D
1458 (equ) cmp.leu break,cont=r35,H };;
1459{ .mib; (cont) cmp.leu cont,break=HH,r31
1460 (p8) add r31=-1,r31
1461(cont) br.wtop.spnt .L_divw_2nd_iter };;
1462///////////////////////////////////////////////////////////
1463{ .mii; sub H=H,r35
1464 or r8=r8,r33
1465 mov ar.pfs=r2 };;
1466{ .mii; shr.u r9=H,I // remainder if anybody wants it
1467 mov pr=r10,-1 }
1468{ .mfb; br.ret.sptk.many b0 };;
1469
1470// Unsigned 64 by 32 (well, by 64 for the moment) bit integer division
1471// procedure.
1472//
1473// inputs: f6 = (double)a, f7 = (double)b
1474// output: f8 = (int)(a/b)
1475// clobbered: f8,f9,f10,f11,pred
1476pred=p15
1477// This procedure is essentially Intel code and therefore is
1478// copyrighted to Intel Corporation (I suppose...). It's sligtly
1479// modified for specific needs.
1480.align 32
1481.skip 16
1482.L_udiv64_32_b6:
1483 frcpa.s1 f8,pred=f6,f7;; // [0] y0 = 1 / b
1484
1485(pred) fnma.s1 f9=f7,f8,f1 // [5] e0 = 1 - b * y0
1486(pred) fmpy.s1 f10=f6,f8;; // [5] q0 = a * y0
1487(pred) fmpy.s1 f11=f9,f9 // [10] e1 = e0 * e0
1488(pred) fma.s1 f10=f9,f10,f10;; // [10] q1 = q0 + e0 * q0
1489(pred) fma.s1 f8=f9,f8,f8 //;; // [15] y1 = y0 + e0 * y0
1490(pred) fma.s1 f9=f11,f10,f10;; // [15] q2 = q1 + e1 * q1
1491(pred) fma.s1 f8=f11,f8,f8 //;; // [20] y2 = y1 + e1 * y1
1492(pred) fnma.s1 f10=f7,f9,f6;; // [20] r2 = a - b * q2
1493(pred) fma.s1 f8=f10,f8,f9;; // [25] q3 = q2 + r2 * y2
1494
1495 fcvt.fxu.trunc.s1 f8=f8 // [30] q = trunc(q3)
1496 br.ret.sptk.many b6;;
1497.endp bn_div_words#
1498#endif
diff --git a/src/lib/libcrypto/bn/asm/pa-risc2W.s b/src/lib/libcrypto/bn/asm/pa-risc2W.s
new file mode 100644
index 0000000000..54b6606252
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/pa-risc2W.s
@@ -0,0 +1,1605 @@
1;
2; PA-RISC 64-bit implementation of bn_asm code
3;
4; This code is approximately 2x faster than the C version
5; for RSA/DSA.
6;
7; See http://devresource.hp.com/ for more details on the PA-RISC
8; architecture. Also see the book "PA-RISC 2.0 Architecture"
9; by Gerry Kane for information on the instruction set architecture.
10;
11; Code written by Chris Ruemmler (with some help from the HP C
12; compiler).
13;
14; The code compiles with HP's assembler
15;
16
17 .level 2.0W
18 .space $TEXT$
19 .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
20
21;
22; Global Register definitions used for the routines.
23;
24; Some information about HP's runtime architecture for 64-bits.
25;
26; "Caller save" means the calling function must save the register
27; if it wants the register to be preserved.
28; "Callee save" means if a function uses the register, it must save
29; the value before using it.
30;
31; For the floating point registers
32;
33; "caller save" registers: fr4-fr11, fr22-fr31
34; "callee save" registers: fr12-fr21
35; "special" registers: fr0-fr3 (status and exception registers)
36;
37; For the integer registers
38; value zero : r0
39; "caller save" registers: r1,r19-r26
40; "callee save" registers: r3-r18
41; return register : r2 (rp)
42; return values ; r28 (ret0,ret1)
43; Stack pointer ; r30 (sp)
44; global data pointer ; r27 (dp)
45; argument pointer ; r29 (ap)
46; millicode return ptr ; r31 (also a caller save register)
47
48
49;
50; Arguments to the routines
51;
52r_ptr .reg %r26
53a_ptr .reg %r25
54b_ptr .reg %r24
55num .reg %r24
56w .reg %r23
57n .reg %r23
58
59
60;
61; Globals used in some routines
62;
63
64top_overflow .reg %r29
65high_mask .reg %r22 ; value 0xffffffff80000000L
66
67
68;------------------------------------------------------------------------------
69;
70; bn_mul_add_words
71;
72;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
73; int num, BN_ULONG w)
74;
75; arg0 = r_ptr
76; arg1 = a_ptr
77; arg2 = num
78; arg3 = w
79;
80; Local register definitions
81;
82
83fm1 .reg %fr22
84fm .reg %fr23
85ht_temp .reg %fr24
86ht_temp_1 .reg %fr25
87lt_temp .reg %fr26
88lt_temp_1 .reg %fr27
89fm1_1 .reg %fr28
90fm_1 .reg %fr29
91
92fw_h .reg %fr7L
93fw_l .reg %fr7R
94fw .reg %fr7
95
96fht_0 .reg %fr8L
97flt_0 .reg %fr8R
98t_float_0 .reg %fr8
99
100fht_1 .reg %fr9L
101flt_1 .reg %fr9R
102t_float_1 .reg %fr9
103
104tmp_0 .reg %r31
105tmp_1 .reg %r21
106m_0 .reg %r20
107m_1 .reg %r19
108ht_0 .reg %r1
109ht_1 .reg %r3
110lt_0 .reg %r4
111lt_1 .reg %r5
112m1_0 .reg %r6
113m1_1 .reg %r7
114rp_val .reg %r8
115rp_val_1 .reg %r9
116
117bn_mul_add_words
118 .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
119 .proc
120 .callinfo frame=128
121 .entry
122 .align 64
123
124 STD %r3,0(%sp) ; save r3
125 STD %r4,8(%sp) ; save r4
126 NOP ; Needed to make the loop 16-byte aligned
127 NOP ; Needed to make the loop 16-byte aligned
128
129 STD %r5,16(%sp) ; save r5
130 STD %r6,24(%sp) ; save r6
131 STD %r7,32(%sp) ; save r7
132 STD %r8,40(%sp) ; save r8
133
134 STD %r9,48(%sp) ; save r9
135 COPY %r0,%ret0 ; return 0 by default
136 DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
137 STD w,56(%sp) ; store w on stack
138
139 CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit
140 LDO 128(%sp),%sp ; bump stack
141
142 ;
143 ; The loop is unrolled twice, so if there is only 1 number
144 ; then go straight to the cleanup code.
145 ;
146 CMPIB,= 1,num,bn_mul_add_words_single_top
147 FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l)
148
149 ;
150 ; This loop is unrolled 2 times (64-byte aligned as well)
151 ;
152 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
153 ; two 32-bit mutiplies can be issued per cycle.
154 ;
155bn_mul_add_words_unroll2
156
157 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
158 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
159 LDD 0(r_ptr),rp_val ; rp[0]
160 LDD 8(r_ptr),rp_val_1 ; rp[1]
161
162 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
163 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
164 FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
165 FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
166
167 XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
168 XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
169 FSTD fm,-8(%sp) ; -8(sp) = m[0]
170 FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
171
172 XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
173 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
174 FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp
175 FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1
176
177 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
178 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
179 FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp
180 FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1
181
182 LDD -8(%sp),m_0 ; m[0]
183 LDD -40(%sp),m_1 ; m[1]
184 LDD -16(%sp),m1_0 ; m1[0]
185 LDD -48(%sp),m1_1 ; m1[1]
186
187 LDD -24(%sp),ht_0 ; ht[0]
188 LDD -56(%sp),ht_1 ; ht[1]
189 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
190 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
191
192 LDD -32(%sp),lt_0
193 LDD -64(%sp),lt_1
194 CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0])
195 ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
196
197 CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1])
198 ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
199 EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32
200 DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32
201
202 EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32
203 DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32
204 ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
205 ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
206
207 ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
208 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
209 ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
210 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
211
212 ADD %ret0,lt_0,lt_0 ; lt[0] = lt[0] + c;
213 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
214 ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0]
215 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
216
217 LDO -2(num),num ; num = num - 2;
218 ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c);
219 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
220 STD lt_0,0(r_ptr) ; rp[0] = lt[0]
221
222 ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1]
223 ADD,DC ht_1,%r0,%ret0 ; ht[1]++
224 LDO 16(a_ptr),a_ptr ; a_ptr += 2
225
226 STD lt_1,8(r_ptr) ; rp[1] = lt[1]
227 CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
228 LDO 16(r_ptr),r_ptr ; r_ptr += 2
229
230 CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
231
232 ;
233 ; Top of loop aligned on 64-byte boundary
234 ;
235bn_mul_add_words_single_top
236 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
237 LDD 0(r_ptr),rp_val ; rp[0]
238 LDO 8(a_ptr),a_ptr ; a_ptr++
239 XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
240 FSTD fm1,-16(%sp) ; -16(sp) = m1
241 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
242 FSTD fm,-8(%sp) ; -8(sp) = m
243 XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
244 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
245 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
246 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
247
248 LDD -8(%sp),m_0
249 LDD -16(%sp),m1_0 ; m1 = temp1
250 ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
251 LDD -24(%sp),ht_0
252 LDD -32(%sp),lt_0
253
254 CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
255 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
256
257 EXTRD,U tmp_0,31,32,m_0 ; m>>32
258 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
259
260 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
261 ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1;
262 ADD,DC ht_0,%r0,ht_0 ; ht++
263 ADD %ret0,tmp_0,lt_0 ; lt = lt + c;
264 ADD,DC ht_0,%r0,ht_0 ; ht++
265 ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0]
266 ADD,DC ht_0,%r0,%ret0 ; ht++
267 STD lt_0,0(r_ptr) ; rp[0] = lt
268
269bn_mul_add_words_exit
270 .EXIT
271 LDD -80(%sp),%r9 ; restore r9
272 LDD -88(%sp),%r8 ; restore r8
273 LDD -96(%sp),%r7 ; restore r7
274 LDD -104(%sp),%r6 ; restore r6
275 LDD -112(%sp),%r5 ; restore r5
276 LDD -120(%sp),%r4 ; restore r4
277 BVE (%rp)
278 LDD,MB -128(%sp),%r3 ; restore r3
279 .PROCEND ;in=23,24,25,26,29;out=28;
280
281;----------------------------------------------------------------------------
282;
283;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
284;
285; arg0 = rp
286; arg1 = ap
287; arg2 = num
288; arg3 = w
289
290bn_mul_words
291 .proc
292 .callinfo frame=128
293 .entry
294 .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
295 .align 64
296
297 STD %r3,0(%sp) ; save r3
298 STD %r4,8(%sp) ; save r4
299 STD %r5,16(%sp) ; save r5
300 STD %r6,24(%sp) ; save r6
301
302 STD %r7,32(%sp) ; save r7
303 COPY %r0,%ret0 ; return 0 by default
304 DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
305 STD w,56(%sp) ; w on stack
306
307 CMPIB,>= 0,num,bn_mul_words_exit
308 LDO 128(%sp),%sp ; bump stack
309
310 ;
311 ; See if only 1 word to do, thus just do cleanup
312 ;
313 CMPIB,= 1,num,bn_mul_words_single_top
314 FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l)
315
316 ;
317 ; This loop is unrolled 2 times (64-byte aligned as well)
318 ;
319 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
320 ; two 32-bit mutiplies can be issued per cycle.
321 ;
322bn_mul_words_unroll2
323
324 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
325 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
326 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
327 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l
328
329 FSTD fm1,-16(%sp) ; -16(sp) = m1
330 FSTD fm1_1,-48(%sp) ; -48(sp) = m1
331 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
332 XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h
333
334 FSTD fm,-8(%sp) ; -8(sp) = m
335 FSTD fm_1,-40(%sp) ; -40(sp) = m
336 XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
337 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h
338
339 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
340 FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht
341 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
342 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
343
344 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
345 FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt
346 LDD -8(%sp),m_0
347 LDD -40(%sp),m_1
348
349 LDD -16(%sp),m1_0
350 LDD -48(%sp),m1_1
351 LDD -24(%sp),ht_0
352 LDD -56(%sp),ht_1
353
354 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1;
355 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1;
356 LDD -32(%sp),lt_0
357 LDD -64(%sp),lt_1
358
359 CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1)
360 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
361 CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1)
362 ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32)
363
364 EXTRD,U tmp_0,31,32,m_0 ; m>>32
365 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
366 EXTRD,U tmp_1,31,32,m_1 ; m>>32
367 DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32
368
369 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
370 ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32)
371 ADD lt_0,m1_0,lt_0 ; lt = lt+m1;
372 ADD,DC ht_0,%r0,ht_0 ; ht++
373
374 ADD lt_1,m1_1,lt_1 ; lt = lt+m1;
375 ADD,DC ht_1,%r0,ht_1 ; ht++
376 ADD %ret0,lt_0,lt_0 ; lt = lt + c (ret0);
377 ADD,DC ht_0,%r0,ht_0 ; ht++
378
379 ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0)
380 ADD,DC ht_1,%r0,ht_1 ; ht++
381 STD lt_0,0(r_ptr) ; rp[0] = lt
382 STD lt_1,8(r_ptr) ; rp[1] = lt
383
384 COPY ht_1,%ret0 ; carry = ht
385 LDO -2(num),num ; num = num - 2;
386 LDO 16(a_ptr),a_ptr ; ap += 2
387 CMPIB,<= 2,num,bn_mul_words_unroll2
388 LDO 16(r_ptr),r_ptr ; rp++
389
390 CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
391
392 ;
393 ; Top of loop aligned on 64-byte boundary
394 ;
395bn_mul_words_single_top
396 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
397
398 XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
399 FSTD fm1,-16(%sp) ; -16(sp) = m1
400 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
401 FSTD fm,-8(%sp) ; -8(sp) = m
402 XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
403 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
404 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
405 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
406
407 LDD -8(%sp),m_0
408 LDD -16(%sp),m1_0
409 ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
410 LDD -24(%sp),ht_0
411 LDD -32(%sp),lt_0
412
413 CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
414 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
415
416 EXTRD,U tmp_0,31,32,m_0 ; m>>32
417 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
418
419 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
420 ADD lt_0,m1_0,lt_0 ; lt= lt+m1;
421 ADD,DC ht_0,%r0,ht_0 ; ht++
422
423 ADD %ret0,lt_0,lt_0 ; lt = lt + c;
424 ADD,DC ht_0,%r0,ht_0 ; ht++
425
426 COPY ht_0,%ret0 ; copy carry
427 STD lt_0,0(r_ptr) ; rp[0] = lt
428
429bn_mul_words_exit
430 .EXIT
431 LDD -96(%sp),%r7 ; restore r7
432 LDD -104(%sp),%r6 ; restore r6
433 LDD -112(%sp),%r5 ; restore r5
434 LDD -120(%sp),%r4 ; restore r4
435 BVE (%rp)
436 LDD,MB -128(%sp),%r3 ; restore r3
437 .PROCEND ;in=23,24,25,26,29;out=28;
438
439;----------------------------------------------------------------------------
440;
441;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
442;
443; arg0 = rp
444; arg1 = ap
445; arg2 = num
446;
447
448bn_sqr_words
449 .proc
450 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
451 .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
452 .entry
453 .align 64
454
455 STD %r3,0(%sp) ; save r3
456 STD %r4,8(%sp) ; save r4
457 NOP
458 STD %r5,16(%sp) ; save r5
459
460 CMPIB,>= 0,num,bn_sqr_words_exit
461 LDO 128(%sp),%sp ; bump stack
462
463 ;
464 ; If only 1, the goto straight to cleanup
465 ;
466 CMPIB,= 1,num,bn_sqr_words_single_top
467 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
468
469 ;
470 ; This loop is unrolled 2 times (64-byte aligned as well)
471 ;
472
473bn_sqr_words_unroll2
474 FLDD 0(a_ptr),t_float_0 ; a[0]
475 FLDD 8(a_ptr),t_float_1 ; a[1]
476 XMPYU fht_0,flt_0,fm ; m[0]
477 XMPYU fht_1,flt_1,fm_1 ; m[1]
478
479 FSTD fm,-24(%sp) ; store m[0]
480 FSTD fm_1,-56(%sp) ; store m[1]
481 XMPYU flt_0,flt_0,lt_temp ; lt[0]
482 XMPYU flt_1,flt_1,lt_temp_1 ; lt[1]
483
484 FSTD lt_temp,-16(%sp) ; store lt[0]
485 FSTD lt_temp_1,-48(%sp) ; store lt[1]
486 XMPYU fht_0,fht_0,ht_temp ; ht[0]
487 XMPYU fht_1,fht_1,ht_temp_1 ; ht[1]
488
489 FSTD ht_temp,-8(%sp) ; store ht[0]
490 FSTD ht_temp_1,-40(%sp) ; store ht[1]
491 LDD -24(%sp),m_0
492 LDD -56(%sp),m_1
493
494 AND m_0,high_mask,tmp_0 ; m[0] & Mask
495 AND m_1,high_mask,tmp_1 ; m[1] & Mask
496 DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1
497 DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1
498
499 LDD -16(%sp),lt_0
500 LDD -48(%sp),lt_1
501 EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1
502 EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1
503
504 LDD -8(%sp),ht_0
505 LDD -40(%sp),ht_1
506 ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0
507 ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1
508
509 ADD lt_0,m_0,lt_0 ; lt = lt+m
510 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
511 STD lt_0,0(r_ptr) ; rp[0] = lt[0]
512 STD ht_0,8(r_ptr) ; rp[1] = ht[1]
513
514 ADD lt_1,m_1,lt_1 ; lt = lt+m
515 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
516 STD lt_1,16(r_ptr) ; rp[2] = lt[1]
517 STD ht_1,24(r_ptr) ; rp[3] = ht[1]
518
519 LDO -2(num),num ; num = num - 2;
520 LDO 16(a_ptr),a_ptr ; ap += 2
521 CMPIB,<= 2,num,bn_sqr_words_unroll2
522 LDO 32(r_ptr),r_ptr ; rp += 4
523
524 CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
525
526 ;
527 ; Top of loop aligned on 64-byte boundary
528 ;
529bn_sqr_words_single_top
530 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
531
532 XMPYU fht_0,flt_0,fm ; m
533 FSTD fm,-24(%sp) ; store m
534
535 XMPYU flt_0,flt_0,lt_temp ; lt
536 FSTD lt_temp,-16(%sp) ; store lt
537
538 XMPYU fht_0,fht_0,ht_temp ; ht
539 FSTD ht_temp,-8(%sp) ; store ht
540
541 LDD -24(%sp),m_0 ; load m
542 AND m_0,high_mask,tmp_0 ; m & Mask
543 DEPD,Z m_0,30,31,m_0 ; m << 32+1
544 LDD -16(%sp),lt_0 ; lt
545
546 LDD -8(%sp),ht_0 ; ht
547 EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1
548 ADD m_0,lt_0,lt_0 ; lt = lt+m
549 ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0
550 ADD,DC ht_0,%r0,ht_0 ; ht++
551
552 STD lt_0,0(r_ptr) ; rp[0] = lt
553 STD ht_0,8(r_ptr) ; rp[1] = ht
554
555bn_sqr_words_exit
556 .EXIT
557 LDD -112(%sp),%r5 ; restore r5
558 LDD -120(%sp),%r4 ; restore r4
559 BVE (%rp)
560 LDD,MB -128(%sp),%r3
561 .PROCEND ;in=23,24,25,26,29;out=28;
562
563
564;----------------------------------------------------------------------------
565;
566;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
567;
568; arg0 = rp
569; arg1 = ap
570; arg2 = bp
571; arg3 = n
572
573t .reg %r22
574b .reg %r21
575l .reg %r20
576
577bn_add_words
578 .proc
579 .entry
580 .callinfo
581 .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
582 .align 64
583
584 CMPIB,>= 0,n,bn_add_words_exit
585 COPY %r0,%ret0 ; return 0 by default
586
587 ;
588 ; If 2 or more numbers do the loop
589 ;
590 CMPIB,= 1,n,bn_add_words_single_top
591 NOP
592
593 ;
594 ; This loop is unrolled 2 times (64-byte aligned as well)
595 ;
596bn_add_words_unroll2
597 LDD 0(a_ptr),t
598 LDD 0(b_ptr),b
599 ADD t,%ret0,t ; t = t+c;
600 ADD,DC %r0,%r0,%ret0 ; set c to carry
601 ADD t,b,l ; l = t + b[0]
602 ADD,DC %ret0,%r0,%ret0 ; c+= carry
603 STD l,0(r_ptr)
604
605 LDD 8(a_ptr),t
606 LDD 8(b_ptr),b
607 ADD t,%ret0,t ; t = t+c;
608 ADD,DC %r0,%r0,%ret0 ; set c to carry
609 ADD t,b,l ; l = t + b[0]
610 ADD,DC %ret0,%r0,%ret0 ; c+= carry
611 STD l,8(r_ptr)
612
613 LDO -2(n),n
614 LDO 16(a_ptr),a_ptr
615 LDO 16(b_ptr),b_ptr
616
617 CMPIB,<= 2,n,bn_add_words_unroll2
618 LDO 16(r_ptr),r_ptr
619
620 CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
621
622bn_add_words_single_top
623 LDD 0(a_ptr),t
624 LDD 0(b_ptr),b
625
626 ADD t,%ret0,t ; t = t+c;
627 ADD,DC %r0,%r0,%ret0 ; set c to carry (could use CMPCLR??)
628 ADD t,b,l ; l = t + b[0]
629 ADD,DC %ret0,%r0,%ret0 ; c+= carry
630 STD l,0(r_ptr)
631
632bn_add_words_exit
633 .EXIT
634 BVE (%rp)
635 NOP
636 .PROCEND ;in=23,24,25,26,29;out=28;
637
638;----------------------------------------------------------------------------
639;
640;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
641;
642; arg0 = rp
643; arg1 = ap
644; arg2 = bp
645; arg3 = n
646
647t1 .reg %r22
648t2 .reg %r21
649sub_tmp1 .reg %r20
650sub_tmp2 .reg %r19
651
652
653bn_sub_words
654 .proc
655 .callinfo
656 .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
657 .entry
658 .align 64
659
660 CMPIB,>= 0,n,bn_sub_words_exit
661 COPY %r0,%ret0 ; return 0 by default
662
663 ;
664 ; If 2 or more numbers do the loop
665 ;
666 CMPIB,= 1,n,bn_sub_words_single_top
667 NOP
668
669 ;
670 ; This loop is unrolled 2 times (64-byte aligned as well)
671 ;
672bn_sub_words_unroll2
673 LDD 0(a_ptr),t1
674 LDD 0(b_ptr),t2
675 SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
676 SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
677
678 CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
679 LDO 1(%r0),sub_tmp2
680
681 CMPCLR,*= t1,t2,%r0
682 COPY sub_tmp2,%ret0
683 STD sub_tmp1,0(r_ptr)
684
685 LDD 8(a_ptr),t1
686 LDD 8(b_ptr),t2
687 SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
688 SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
689 CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
690 LDO 1(%r0),sub_tmp2
691
692 CMPCLR,*= t1,t2,%r0
693 COPY sub_tmp2,%ret0
694 STD sub_tmp1,8(r_ptr)
695
696 LDO -2(n),n
697 LDO 16(a_ptr),a_ptr
698 LDO 16(b_ptr),b_ptr
699
700 CMPIB,<= 2,n,bn_sub_words_unroll2
701 LDO 16(r_ptr),r_ptr
702
703 CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
704
705bn_sub_words_single_top
706 LDD 0(a_ptr),t1
707 LDD 0(b_ptr),t2
708 SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
709 SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
710 CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
711 LDO 1(%r0),sub_tmp2
712
713 CMPCLR,*= t1,t2,%r0
714 COPY sub_tmp2,%ret0
715
716 STD sub_tmp1,0(r_ptr)
717
718bn_sub_words_exit
719 .EXIT
720 BVE (%rp)
721 NOP
722 .PROCEND ;in=23,24,25,26,29;out=28;
723
724;------------------------------------------------------------------------------
725;
726; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
727;
728; arg0 = h
729; arg1 = l
730; arg2 = d
731;
732; This is mainly just modified assembly from the compiler, thus the
733; lack of variable names.
734;
735;------------------------------------------------------------------------------
736bn_div_words
737 .proc
738 .callinfo CALLER,FRAME=272,ENTRY_GR=%r10,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
739 .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
740 .IMPORT BN_num_bits_word,CODE,NO_RELOCATION
741 .IMPORT __iob,DATA
742 .IMPORT fprintf,CODE,NO_RELOCATION
743 .IMPORT abort,CODE,NO_RELOCATION
744 .IMPORT $$div2U,MILLICODE
745 .entry
746 STD %r2,-16(%r30)
747 STD,MA %r3,352(%r30)
748 STD %r4,-344(%r30)
749 STD %r5,-336(%r30)
750 STD %r6,-328(%r30)
751 STD %r7,-320(%r30)
752 STD %r8,-312(%r30)
753 STD %r9,-304(%r30)
754 STD %r10,-296(%r30)
755
756 STD %r27,-288(%r30) ; save gp
757
758 COPY %r24,%r3 ; save d
759 COPY %r26,%r4 ; save h (high 64-bits)
760 LDO -1(%r0),%ret0 ; return -1 by default
761
762 CMPB,*= %r0,%arg2,$D3 ; if (d == 0)
763 COPY %r25,%r5 ; save l (low 64-bits)
764
765 LDO -48(%r30),%r29 ; create ap
766 .CALL ;in=26,29;out=28;
767 B,L BN_num_bits_word,%r2
768 COPY %r3,%r26
769 LDD -288(%r30),%r27 ; restore gp
770 LDI 64,%r21
771
772 CMPB,= %r21,%ret0,$00000012 ;if (i == 64) (forward)
773 COPY %ret0,%r24 ; i
774 MTSARCM %r24
775 DEPDI,Z -1,%sar,1,%r29
776 CMPB,*<<,N %r29,%r4,bn_div_err_case ; if (h > 1<<i) (forward)
777
778$00000012
779 SUBI 64,%r24,%r31 ; i = 64 - i;
780 CMPCLR,*<< %r4,%r3,%r0 ; if (h >= d)
781 SUB %r4,%r3,%r4 ; h -= d
782 CMPB,= %r31,%r0,$0000001A ; if (i)
783 COPY %r0,%r10 ; ret = 0
784 MTSARCM %r31 ; i to shift
785 DEPD,Z %r3,%sar,64,%r3 ; d <<= i;
786 SUBI 64,%r31,%r19 ; 64 - i; redundent
787 MTSAR %r19 ; (64 -i) to shift
788 SHRPD %r4,%r5,%sar,%r4 ; l>> (64-i)
789 MTSARCM %r31 ; i to shift
790 DEPD,Z %r5,%sar,64,%r5 ; l <<= i;
791
792$0000001A
793 DEPDI,Z -1,31,32,%r19
794 EXTRD,U %r3,31,32,%r6 ; dh=(d&0xfff)>>32
795 EXTRD,U %r3,63,32,%r8 ; dl = d&0xffffff
796 LDO 2(%r0),%r9
797 STD %r3,-280(%r30) ; "d" to stack
798
799$0000001C
800 DEPDI,Z -1,63,32,%r29 ;
801 EXTRD,U %r4,31,32,%r31 ; h >> 32
802 CMPB,*=,N %r31,%r6,$D2 ; if ((h>>32) != dh)(forward) div
803 COPY %r4,%r26
804 EXTRD,U %r4,31,32,%r25
805 COPY %r6,%r24
806 .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
807 B,L $$div2U,%r2
808 EXTRD,U %r6,31,32,%r23
809 DEPD %r28,31,32,%r29
810$D2
811 STD %r29,-272(%r30) ; q
812 AND %r5,%r19,%r24 ; t & 0xffffffff00000000;
813 EXTRD,U %r24,31,32,%r24 ; ???
814 FLDD -272(%r30),%fr7 ; q
815 FLDD -280(%r30),%fr8 ; d
816 XMPYU %fr8L,%fr7L,%fr10
817 FSTD %fr10,-256(%r30)
818 XMPYU %fr8L,%fr7R,%fr22
819 FSTD %fr22,-264(%r30)
820 XMPYU %fr8R,%fr7L,%fr11
821 XMPYU %fr8R,%fr7R,%fr23
822 FSTD %fr11,-232(%r30)
823 FSTD %fr23,-240(%r30)
824 LDD -256(%r30),%r28
825 DEPD,Z %r28,31,32,%r2
826 LDD -264(%r30),%r20
827 ADD,L %r20,%r2,%r31
828 LDD -232(%r30),%r22
829 DEPD,Z %r22,31,32,%r22
830 LDD -240(%r30),%r21
831 B $00000024 ; enter loop
832 ADD,L %r21,%r22,%r23
833
834$0000002A
835 LDO -1(%r29),%r29
836 SUB %r23,%r8,%r23
837$00000024
838 SUB %r4,%r31,%r25
839 AND %r25,%r19,%r26
840 CMPB,*<>,N %r0,%r26,$00000046 ; (forward)
841 DEPD,Z %r25,31,32,%r20
842 OR %r20,%r24,%r21
843 CMPB,*<<,N %r21,%r23,$0000002A ;(backward)
844 SUB %r31,%r6,%r31
845;-------------Break path---------------------
846
847$00000046
848 DEPD,Z %r23,31,32,%r25 ;tl
849 EXTRD,U %r23,31,32,%r26 ;t
850 AND %r25,%r19,%r24 ;tl = (tl<<32)&0xfffffff0000000L
851 ADD,L %r31,%r26,%r31 ;th += t;
852 CMPCLR,*>>= %r5,%r24,%r0 ;if (l<tl)
853 LDO 1(%r31),%r31 ; th++;
854 CMPB,*<<=,N %r31,%r4,$00000036 ;if (n < th) (forward)
855 LDO -1(%r29),%r29 ;q--;
856 ADD,L %r4,%r3,%r4 ;h += d;
857$00000036
858 ADDIB,=,N -1,%r9,$D1 ;if (--count == 0) break (forward)
859 SUB %r5,%r24,%r28 ; l -= tl;
860 SUB %r4,%r31,%r24 ; h -= th;
861 SHRPD %r24,%r28,32,%r4 ; h = ((h<<32)|(l>>32));
862 DEPD,Z %r29,31,32,%r10 ; ret = q<<32
863 b $0000001C
864 DEPD,Z %r28,31,32,%r5 ; l = l << 32
865
866$D1
867 OR %r10,%r29,%r28 ; ret |= q
868$D3
869 LDD -368(%r30),%r2
870$D0
871 LDD -296(%r30),%r10
872 LDD -304(%r30),%r9
873 LDD -312(%r30),%r8
874 LDD -320(%r30),%r7
875 LDD -328(%r30),%r6
876 LDD -336(%r30),%r5
877 LDD -344(%r30),%r4
878 BVE (%r2)
879 .EXIT
880 LDD,MB -352(%r30),%r3
881
882bn_div_err_case
883 MFIA %r6
884 ADDIL L'bn_div_words-bn_div_err_case,%r6,%r1
885 LDO R'bn_div_words-bn_div_err_case(%r1),%r6
886 ADDIL LT'__iob,%r27,%r1
887 LDD RT'__iob(%r1),%r26
888 ADDIL L'C$4-bn_div_words,%r6,%r1
889 LDO R'C$4-bn_div_words(%r1),%r25
890 LDO 64(%r26),%r26
891 .CALL ;in=24,25,26,29;out=28;
892 B,L fprintf,%r2
893 LDO -48(%r30),%r29
894 LDD -288(%r30),%r27
895 .CALL ;in=29;
896 B,L abort,%r2
897 LDO -48(%r30),%r29
898 LDD -288(%r30),%r27
899 B $D0
900 LDD -368(%r30),%r2
901 .PROCEND ;in=24,25,26,29;out=28;
902
903;----------------------------------------------------------------------------
904;
905; Registers to hold 64-bit values to manipulate. The "L" part
906; of the register corresponds to the upper 32-bits, while the "R"
907; part corresponds to the lower 32-bits
908;
909; Note, that when using b6 and b7, the code must save these before
910; using them because they are callee save registers
911;
912;
913; Floating point registers to use to save values that
914; are manipulated. These don't collide with ftemp1-6 and
915; are all caller save registers
916;
917a0 .reg %fr22
918a0L .reg %fr22L
919a0R .reg %fr22R
920
921a1 .reg %fr23
922a1L .reg %fr23L
923a1R .reg %fr23R
924
925a2 .reg %fr24
926a2L .reg %fr24L
927a2R .reg %fr24R
928
929a3 .reg %fr25
930a3L .reg %fr25L
931a3R .reg %fr25R
932
933a4 .reg %fr26
934a4L .reg %fr26L
935a4R .reg %fr26R
936
937a5 .reg %fr27
938a5L .reg %fr27L
939a5R .reg %fr27R
940
941a6 .reg %fr28
942a6L .reg %fr28L
943a6R .reg %fr28R
944
945a7 .reg %fr29
946a7L .reg %fr29L
947a7R .reg %fr29R
948
949b0 .reg %fr30
950b0L .reg %fr30L
951b0R .reg %fr30R
952
953b1 .reg %fr31
954b1L .reg %fr31L
955b1R .reg %fr31R
956
957;
958; Temporary floating point variables, these are all caller save
959; registers
960;
961ftemp1 .reg %fr4
962ftemp2 .reg %fr5
963ftemp3 .reg %fr6
964ftemp4 .reg %fr7
965
966;
967; The B set of registers when used.
968;
969
970b2 .reg %fr8
971b2L .reg %fr8L
972b2R .reg %fr8R
973
974b3 .reg %fr9
975b3L .reg %fr9L
976b3R .reg %fr9R
977
978b4 .reg %fr10
979b4L .reg %fr10L
980b4R .reg %fr10R
981
982b5 .reg %fr11
983b5L .reg %fr11L
984b5R .reg %fr11R
985
986b6 .reg %fr12
987b6L .reg %fr12L
988b6R .reg %fr12R
989
990b7 .reg %fr13
991b7L .reg %fr13L
992b7R .reg %fr13R
993
994c1 .reg %r21 ; only reg
995temp1 .reg %r20 ; only reg
996temp2 .reg %r19 ; only reg
997temp3 .reg %r31 ; only reg
998
999m1 .reg %r28
1000c2 .reg %r23
1001high_one .reg %r1
1002ht .reg %r6
1003lt .reg %r5
1004m .reg %r4
1005c3 .reg %r3
1006
1007SQR_ADD_C .macro A0L,A0R,C1,C2,C3
1008 XMPYU A0L,A0R,ftemp1 ; m
1009 FSTD ftemp1,-24(%sp) ; store m
1010
1011 XMPYU A0R,A0R,ftemp2 ; lt
1012 FSTD ftemp2,-16(%sp) ; store lt
1013
1014 XMPYU A0L,A0L,ftemp3 ; ht
1015 FSTD ftemp3,-8(%sp) ; store ht
1016
1017 LDD -24(%sp),m ; load m
1018 AND m,high_mask,temp2 ; m & Mask
1019 DEPD,Z m,30,31,temp3 ; m << 32+1
1020 LDD -16(%sp),lt ; lt
1021
1022 LDD -8(%sp),ht ; ht
1023 EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1
1024 ADD temp3,lt,lt ; lt = lt+m
1025 ADD,L ht,temp1,ht ; ht += temp1
1026 ADD,DC ht,%r0,ht ; ht++
1027
1028 ADD C1,lt,C1 ; c1=c1+lt
1029 ADD,DC ht,%r0,ht ; ht++
1030
1031 ADD C2,ht,C2 ; c2=c2+ht
1032 ADD,DC C3,%r0,C3 ; c3++
1033.endm
1034
1035SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
1036 XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht
1037 FSTD ftemp1,-16(%sp) ;
1038 XMPYU A0R,A1L,ftemp2 ; m = bh*lt
1039 FSTD ftemp2,-8(%sp) ;
1040 XMPYU A0R,A1R,ftemp3 ; lt = bl*lt
1041 FSTD ftemp3,-32(%sp)
1042 XMPYU A0L,A1L,ftemp4 ; ht = bh*ht
1043 FSTD ftemp4,-24(%sp) ;
1044
1045 LDD -8(%sp),m ; r21 = m
1046 LDD -16(%sp),m1 ; r19 = m1
1047 ADD,L m,m1,m ; m+m1
1048
1049 DEPD,Z m,31,32,temp3 ; (m+m1<<32)
1050 LDD -24(%sp),ht ; r24 = ht
1051
1052 CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
1053 ADD,L ht,high_one,ht ; ht+=high_one
1054
1055 EXTRD,U m,31,32,temp1 ; m >> 32
1056 LDD -32(%sp),lt ; lt
1057 ADD,L ht,temp1,ht ; ht+= m>>32
1058 ADD lt,temp3,lt ; lt = lt+m1
1059 ADD,DC ht,%r0,ht ; ht++
1060
1061 ADD ht,ht,ht ; ht=ht+ht;
1062 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1063
1064 ADD lt,lt,lt ; lt=lt+lt;
1065 ADD,DC ht,%r0,ht ; add in carry (ht++)
1066
1067 ADD C1,lt,C1 ; c1=c1+lt
1068 ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++)
1069 LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise
1070
1071 ADD C2,ht,C2 ; c2 = c2 + ht
1072 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1073.endm
1074
1075;
1076;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
1077; arg0 = r_ptr
1078; arg1 = a_ptr
1079;
1080
1081bn_sqr_comba8
1082 .PROC
1083 .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1084 .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1085 .ENTRY
1086 .align 64
1087
1088 STD %r3,0(%sp) ; save r3
1089 STD %r4,8(%sp) ; save r4
1090 STD %r5,16(%sp) ; save r5
1091 STD %r6,24(%sp) ; save r6
1092
1093 ;
1094 ; Zero out carries
1095 ;
1096 COPY %r0,c1
1097 COPY %r0,c2
1098 COPY %r0,c3
1099
1100 LDO 128(%sp),%sp ; bump stack
1101 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
1102 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1103
1104 ;
1105 ; Load up all of the values we are going to use
1106 ;
1107 FLDD 0(a_ptr),a0
1108 FLDD 8(a_ptr),a1
1109 FLDD 16(a_ptr),a2
1110 FLDD 24(a_ptr),a3
1111 FLDD 32(a_ptr),a4
1112 FLDD 40(a_ptr),a5
1113 FLDD 48(a_ptr),a6
1114 FLDD 56(a_ptr),a7
1115
1116 SQR_ADD_C a0L,a0R,c1,c2,c3
1117 STD c1,0(r_ptr) ; r[0] = c1;
1118 COPY %r0,c1
1119
1120 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
1121 STD c2,8(r_ptr) ; r[1] = c2;
1122 COPY %r0,c2
1123
1124 SQR_ADD_C a1L,a1R,c3,c1,c2
1125 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
1126 STD c3,16(r_ptr) ; r[2] = c3;
1127 COPY %r0,c3
1128
1129 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
1130 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
1131 STD c1,24(r_ptr) ; r[3] = c1;
1132 COPY %r0,c1
1133
1134 SQR_ADD_C a2L,a2R,c2,c3,c1
1135 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
1136 SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
1137 STD c2,32(r_ptr) ; r[4] = c2;
1138 COPY %r0,c2
1139
1140 SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
1141 SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
1142 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
1143 STD c3,40(r_ptr) ; r[5] = c3;
1144 COPY %r0,c3
1145
1146 SQR_ADD_C a3L,a3R,c1,c2,c3
1147 SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
1148 SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
1149 SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
1150 STD c1,48(r_ptr) ; r[6] = c1;
1151 COPY %r0,c1
1152
1153 SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
1154 SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
1155 SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
1156 SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
1157 STD c2,56(r_ptr) ; r[7] = c2;
1158 COPY %r0,c2
1159
1160 SQR_ADD_C a4L,a4R,c3,c1,c2
1161 SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
1162 SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
1163 SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
1164 STD c3,64(r_ptr) ; r[8] = c3;
1165 COPY %r0,c3
1166
1167 SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
1168 SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
1169 SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
1170 STD c1,72(r_ptr) ; r[9] = c1;
1171 COPY %r0,c1
1172
1173 SQR_ADD_C a5L,a5R,c2,c3,c1
1174 SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
1175 SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
1176 STD c2,80(r_ptr) ; r[10] = c2;
1177 COPY %r0,c2
1178
1179 SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
1180 SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
1181 STD c3,88(r_ptr) ; r[11] = c3;
1182 COPY %r0,c3
1183
1184 SQR_ADD_C a6L,a6R,c1,c2,c3
1185 SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
1186 STD c1,96(r_ptr) ; r[12] = c1;
1187 COPY %r0,c1
1188
1189 SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
1190 STD c2,104(r_ptr) ; r[13] = c2;
1191 COPY %r0,c2
1192
1193 SQR_ADD_C a7L,a7R,c3,c1,c2
1194 STD c3, 112(r_ptr) ; r[14] = c3
1195 STD c1, 120(r_ptr) ; r[15] = c1
1196
1197 .EXIT
1198 LDD -104(%sp),%r6 ; restore r6
1199 LDD -112(%sp),%r5 ; restore r5
1200 LDD -120(%sp),%r4 ; restore r4
1201 BVE (%rp)
1202 LDD,MB -128(%sp),%r3
1203
1204 .PROCEND
1205
1206;-----------------------------------------------------------------------------
1207;
1208;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
1209; arg0 = r_ptr
1210; arg1 = a_ptr
1211;
1212
1213bn_sqr_comba4
1214 .proc
1215 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1216 .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1217 .entry
1218 .align 64
1219 STD %r3,0(%sp) ; save r3
1220 STD %r4,8(%sp) ; save r4
1221 STD %r5,16(%sp) ; save r5
1222 STD %r6,24(%sp) ; save r6
1223
1224 ;
1225 ; Zero out carries
1226 ;
1227 COPY %r0,c1
1228 COPY %r0,c2
1229 COPY %r0,c3
1230
1231 LDO 128(%sp),%sp ; bump stack
1232 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
1233 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1234
1235 ;
1236 ; Load up all of the values we are going to use
1237 ;
1238 FLDD 0(a_ptr),a0
1239 FLDD 8(a_ptr),a1
1240 FLDD 16(a_ptr),a2
1241 FLDD 24(a_ptr),a3
1242 FLDD 32(a_ptr),a4
1243 FLDD 40(a_ptr),a5
1244 FLDD 48(a_ptr),a6
1245 FLDD 56(a_ptr),a7
1246
1247 SQR_ADD_C a0L,a0R,c1,c2,c3
1248
1249 STD c1,0(r_ptr) ; r[0] = c1;
1250 COPY %r0,c1
1251
1252 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
1253
1254 STD c2,8(r_ptr) ; r[1] = c2;
1255 COPY %r0,c2
1256
1257 SQR_ADD_C a1L,a1R,c3,c1,c2
1258 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
1259
1260 STD c3,16(r_ptr) ; r[2] = c3;
1261 COPY %r0,c3
1262
1263 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
1264 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
1265
1266 STD c1,24(r_ptr) ; r[3] = c1;
1267 COPY %r0,c1
1268
1269 SQR_ADD_C a2L,a2R,c2,c3,c1
1270 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
1271
1272 STD c2,32(r_ptr) ; r[4] = c2;
1273 COPY %r0,c2
1274
1275 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
1276 STD c3,40(r_ptr) ; r[5] = c3;
1277 COPY %r0,c3
1278
1279 SQR_ADD_C a3L,a3R,c1,c2,c3
1280 STD c1,48(r_ptr) ; r[6] = c1;
1281 STD c2,56(r_ptr) ; r[7] = c2;
1282
1283 .EXIT
1284 LDD -104(%sp),%r6 ; restore r6
1285 LDD -112(%sp),%r5 ; restore r5
1286 LDD -120(%sp),%r4 ; restore r4
1287 BVE (%rp)
1288 LDD,MB -128(%sp),%r3
1289
1290 .PROCEND
1291
1292
1293;---------------------------------------------------------------------------
1294
1295MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
1296 XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht
1297 FSTD ftemp1,-16(%sp) ;
1298 XMPYU A0R,B0L,ftemp2 ; m = bh*lt
1299 FSTD ftemp2,-8(%sp) ;
1300 XMPYU A0R,B0R,ftemp3 ; lt = bl*lt
1301 FSTD ftemp3,-32(%sp)
1302 XMPYU A0L,B0L,ftemp4 ; ht = bh*ht
1303 FSTD ftemp4,-24(%sp) ;
1304
1305 LDD -8(%sp),m ; r21 = m
1306 LDD -16(%sp),m1 ; r19 = m1
1307 ADD,L m,m1,m ; m+m1
1308
1309 DEPD,Z m,31,32,temp3 ; (m+m1<<32)
1310 LDD -24(%sp),ht ; r24 = ht
1311
1312 CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
1313 ADD,L ht,high_one,ht ; ht+=high_one
1314
1315 EXTRD,U m,31,32,temp1 ; m >> 32
1316 LDD -32(%sp),lt ; lt
1317 ADD,L ht,temp1,ht ; ht+= m>>32
1318 ADD lt,temp3,lt ; lt = lt+m1
1319 ADD,DC ht,%r0,ht ; ht++
1320
1321 ADD C1,lt,C1 ; c1=c1+lt
1322 ADD,DC ht,%r0,ht ; bump c3 if overflow,nullify otherwise
1323
1324 ADD C2,ht,C2 ; c2 = c2 + ht
1325 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1326.endm
1327
1328
1329;
1330;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1331; arg0 = r_ptr
1332; arg1 = a_ptr
1333; arg2 = b_ptr
1334;
1335
1336bn_mul_comba8
1337 .proc
1338 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1339 .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1340 .entry
1341 .align 64
1342
1343 STD %r3,0(%sp) ; save r3
1344 STD %r4,8(%sp) ; save r4
1345 STD %r5,16(%sp) ; save r5
1346 STD %r6,24(%sp) ; save r6
1347 FSTD %fr12,32(%sp) ; save r6
1348 FSTD %fr13,40(%sp) ; save r7
1349
1350 ;
1351 ; Zero out carries
1352 ;
1353 COPY %r0,c1
1354 COPY %r0,c2
1355 COPY %r0,c3
1356
1357 LDO 128(%sp),%sp ; bump stack
1358 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1359
1360 ;
1361 ; Load up all of the values we are going to use
1362 ;
1363 FLDD 0(a_ptr),a0
1364 FLDD 8(a_ptr),a1
1365 FLDD 16(a_ptr),a2
1366 FLDD 24(a_ptr),a3
1367 FLDD 32(a_ptr),a4
1368 FLDD 40(a_ptr),a5
1369 FLDD 48(a_ptr),a6
1370 FLDD 56(a_ptr),a7
1371
1372 FLDD 0(b_ptr),b0
1373 FLDD 8(b_ptr),b1
1374 FLDD 16(b_ptr),b2
1375 FLDD 24(b_ptr),b3
1376 FLDD 32(b_ptr),b4
1377 FLDD 40(b_ptr),b5
1378 FLDD 48(b_ptr),b6
1379 FLDD 56(b_ptr),b7
1380
1381 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
1382 STD c1,0(r_ptr)
1383 COPY %r0,c1
1384
1385 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
1386 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
1387 STD c2,8(r_ptr)
1388 COPY %r0,c2
1389
1390 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
1391 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
1392 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
1393 STD c3,16(r_ptr)
1394 COPY %r0,c3
1395
1396 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
1397 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
1398 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
1399 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
1400 STD c1,24(r_ptr)
1401 COPY %r0,c1
1402
1403 MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
1404 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
1405 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
1406 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
1407 MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
1408 STD c2,32(r_ptr)
1409 COPY %r0,c2
1410
1411 MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
1412 MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
1413 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
1414 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
1415 MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
1416 MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
1417 STD c3,40(r_ptr)
1418 COPY %r0,c3
1419
1420 MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
1421 MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
1422 MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
1423 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
1424 MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
1425 MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
1426 MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
1427 STD c1,48(r_ptr)
1428 COPY %r0,c1
1429
1430 MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
1431 MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
1432 MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
1433 MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
1434 MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
1435 MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
1436 MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
1437 MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
1438 STD c2,56(r_ptr)
1439 COPY %r0,c2
1440
1441 MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
1442 MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
1443 MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
1444 MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
1445 MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
1446 MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
1447 MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
1448 STD c3,64(r_ptr)
1449 COPY %r0,c3
1450
1451 MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
1452 MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
1453 MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
1454 MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
1455 MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
1456 MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
1457 STD c1,72(r_ptr)
1458 COPY %r0,c1
1459
1460 MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
1461 MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
1462 MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
1463 MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
1464 MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
1465 STD c2,80(r_ptr)
1466 COPY %r0,c2
1467
1468 MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
1469 MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
1470 MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
1471 MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
1472 STD c3,88(r_ptr)
1473 COPY %r0,c3
1474
1475 MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
1476 MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
1477 MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
1478 STD c1,96(r_ptr)
1479 COPY %r0,c1
1480
1481 MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
1482 MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
1483 STD c2,104(r_ptr)
1484 COPY %r0,c2
1485
1486 MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
1487 STD c3,112(r_ptr)
1488 STD c1,120(r_ptr)
1489
1490 .EXIT
1491 FLDD -88(%sp),%fr13
1492 FLDD -96(%sp),%fr12
1493 LDD -104(%sp),%r6 ; restore r6
1494 LDD -112(%sp),%r5 ; restore r5
1495 LDD -120(%sp),%r4 ; restore r4
1496 BVE (%rp)
1497 LDD,MB -128(%sp),%r3
1498
1499 .PROCEND
1500
1501;-----------------------------------------------------------------------------
1502;
1503;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1504; arg0 = r_ptr
1505; arg1 = a_ptr
1506; arg2 = b_ptr
1507;
1508
1509bn_mul_comba4
1510 .proc
1511 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1512 .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1513 .entry
1514 .align 64
1515
1516 STD %r3,0(%sp) ; save r3
1517 STD %r4,8(%sp) ; save r4
1518 STD %r5,16(%sp) ; save r5
1519 STD %r6,24(%sp) ; save r6
1520 FSTD %fr12,32(%sp) ; save r6
1521 FSTD %fr13,40(%sp) ; save r7
1522
1523 ;
1524 ; Zero out carries
1525 ;
1526 COPY %r0,c1
1527 COPY %r0,c2
1528 COPY %r0,c3
1529
1530 LDO 128(%sp),%sp ; bump stack
1531 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1532
1533 ;
1534 ; Load up all of the values we are going to use
1535 ;
1536 FLDD 0(a_ptr),a0
1537 FLDD 8(a_ptr),a1
1538 FLDD 16(a_ptr),a2
1539 FLDD 24(a_ptr),a3
1540
1541 FLDD 0(b_ptr),b0
1542 FLDD 8(b_ptr),b1
1543 FLDD 16(b_ptr),b2
1544 FLDD 24(b_ptr),b3
1545
1546 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
1547 STD c1,0(r_ptr)
1548 COPY %r0,c1
1549
1550 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
1551 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
1552 STD c2,8(r_ptr)
1553 COPY %r0,c2
1554
1555 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
1556 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
1557 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
1558 STD c3,16(r_ptr)
1559 COPY %r0,c3
1560
1561 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
1562 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
1563 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
1564 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
1565 STD c1,24(r_ptr)
1566 COPY %r0,c1
1567
1568 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
1569 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
1570 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
1571 STD c2,32(r_ptr)
1572 COPY %r0,c2
1573
1574 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
1575 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
1576 STD c3,40(r_ptr)
1577 COPY %r0,c3
1578
1579 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
1580 STD c1,48(r_ptr)
1581 STD c2,56(r_ptr)
1582
1583 .EXIT
1584 FLDD -88(%sp),%fr13
1585 FLDD -96(%sp),%fr12
1586 LDD -104(%sp),%r6 ; restore r6
1587 LDD -112(%sp),%r5 ; restore r5
1588 LDD -120(%sp),%r4 ; restore r4
1589 BVE (%rp)
1590 LDD,MB -128(%sp),%r3
1591
1592 .PROCEND
1593
1594
1595 .SPACE $TEXT$
1596 .SUBSPA $CODE$
1597 .SPACE $PRIVATE$,SORT=16
1598 .IMPORT $global$,DATA
1599 .SPACE $TEXT$
1600 .SUBSPA $CODE$
1601 .SUBSPA $LIT$,QUAD=0,ALIGN=8,ACCESS=0x2c,SORT=16
1602C$4
1603 .ALIGN 8
1604 .STRINGZ "Division would overflow (%d)\n"
1605 .END
diff --git a/src/lib/libcrypto/bn/asm/sparcv8.S b/src/lib/libcrypto/bn/asm/sparcv8.S
new file mode 100644
index 0000000000..88c5dc480a
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/sparcv8.S
@@ -0,0 +1,1458 @@
1.ident "sparcv8.s, Version 1.4"
2.ident "SPARC v8 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
3
4/*
5 * ====================================================================
6 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
7 * project.
8 *
9 * Rights for redistribution and usage in source and binary forms are
10 * granted according to the OpenSSL license. Warranty of any kind is
11 * disclaimed.
12 * ====================================================================
13 */
14
15/*
16 * This is my modest contributon to OpenSSL project (see
17 * http://www.openssl.org/ for more information about it) and is
18 * a drop-in SuperSPARC ISA replacement for crypto/bn/bn_asm.c
19 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
20 *
21 * See bn_asm.sparc.v8plus.S for more details.
22 */
23
24/*
25 * Revision history.
26 *
27 * 1.1 - new loop unrolling model(*);
28 * 1.2 - made gas friendly;
29 * 1.3 - fixed problem with /usr/ccs/lib/cpp;
30 * 1.4 - some retunes;
31 *
32 * (*) see bn_asm.sparc.v8plus.S for details
33 */
34
35.section ".text",#alloc,#execinstr
36.file "bn_asm.sparc.v8.S"
37
38.align 32
39
40.global bn_mul_add_words
41/*
42 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
43 * BN_ULONG *rp,*ap;
44 * int num;
45 * BN_ULONG w;
46 */
47bn_mul_add_words:
48 cmp %o2,0
49 bg,a .L_bn_mul_add_words_proceed
50 ld [%o1],%g2
51 retl
52 clr %o0
53
54.L_bn_mul_add_words_proceed:
55 andcc %o2,-4,%g0
56 bz .L_bn_mul_add_words_tail
57 clr %o5
58
59.L_bn_mul_add_words_loop:
60 ld [%o0],%o4
61 ld [%o1+4],%g3
62 umul %o3,%g2,%g2
63 rd %y,%g1
64 addcc %o4,%o5,%o4
65 addx %g1,0,%g1
66 addcc %o4,%g2,%o4
67 st %o4,[%o0]
68 addx %g1,0,%o5
69
70 ld [%o0+4],%o4
71 ld [%o1+8],%g2
72 umul %o3,%g3,%g3
73 dec 4,%o2
74 rd %y,%g1
75 addcc %o4,%o5,%o4
76 addx %g1,0,%g1
77 addcc %o4,%g3,%o4
78 st %o4,[%o0+4]
79 addx %g1,0,%o5
80
81 ld [%o0+8],%o4
82 ld [%o1+12],%g3
83 umul %o3,%g2,%g2
84 inc 16,%o1
85 rd %y,%g1
86 addcc %o4,%o5,%o4
87 addx %g1,0,%g1
88 addcc %o4,%g2,%o4
89 st %o4,[%o0+8]
90 addx %g1,0,%o5
91
92 ld [%o0+12],%o4
93 umul %o3,%g3,%g3
94 inc 16,%o0
95 rd %y,%g1
96 addcc %o4,%o5,%o4
97 addx %g1,0,%g1
98 addcc %o4,%g3,%o4
99 st %o4,[%o0-4]
100 addx %g1,0,%o5
101 andcc %o2,-4,%g0
102 bnz,a .L_bn_mul_add_words_loop
103 ld [%o1],%g2
104
105 tst %o2
106 bnz,a .L_bn_mul_add_words_tail
107 ld [%o1],%g2
108.L_bn_mul_add_words_return:
109 retl
110 mov %o5,%o0
111 nop
112
113.L_bn_mul_add_words_tail:
114 ld [%o0],%o4
115 umul %o3,%g2,%g2
116 addcc %o4,%o5,%o4
117 rd %y,%g1
118 addx %g1,0,%g1
119 addcc %o4,%g2,%o4
120 addx %g1,0,%o5
121 deccc %o2
122 bz .L_bn_mul_add_words_return
123 st %o4,[%o0]
124
125 ld [%o1+4],%g2
126 ld [%o0+4],%o4
127 umul %o3,%g2,%g2
128 rd %y,%g1
129 addcc %o4,%o5,%o4
130 addx %g1,0,%g1
131 addcc %o4,%g2,%o4
132 addx %g1,0,%o5
133 deccc %o2
134 bz .L_bn_mul_add_words_return
135 st %o4,[%o0+4]
136
137 ld [%o1+8],%g2
138 ld [%o0+8],%o4
139 umul %o3,%g2,%g2
140 rd %y,%g1
141 addcc %o4,%o5,%o4
142 addx %g1,0,%g1
143 addcc %o4,%g2,%o4
144 st %o4,[%o0+8]
145 retl
146 addx %g1,0,%o0
147
148.type bn_mul_add_words,#function
149.size bn_mul_add_words,(.-bn_mul_add_words)
150
151.align 32
152
153.global bn_mul_words
154/*
155 * BN_ULONG bn_mul_words(rp,ap,num,w)
156 * BN_ULONG *rp,*ap;
157 * int num;
158 * BN_ULONG w;
159 */
160bn_mul_words:
161 cmp %o2,0
162 bg,a .L_bn_mul_words_proceeed
163 ld [%o1],%g2
164 retl
165 clr %o0
166
167.L_bn_mul_words_proceeed:
168 andcc %o2,-4,%g0
169 bz .L_bn_mul_words_tail
170 clr %o5
171
172.L_bn_mul_words_loop:
173 ld [%o1+4],%g3
174 umul %o3,%g2,%g2
175 addcc %g2,%o5,%g2
176 rd %y,%g1
177 addx %g1,0,%o5
178 st %g2,[%o0]
179
180 ld [%o1+8],%g2
181 umul %o3,%g3,%g3
182 addcc %g3,%o5,%g3
183 rd %y,%g1
184 dec 4,%o2
185 addx %g1,0,%o5
186 st %g3,[%o0+4]
187
188 ld [%o1+12],%g3
189 umul %o3,%g2,%g2
190 addcc %g2,%o5,%g2
191 rd %y,%g1
192 inc 16,%o1
193 st %g2,[%o0+8]
194 addx %g1,0,%o5
195
196 umul %o3,%g3,%g3
197 addcc %g3,%o5,%g3
198 rd %y,%g1
199 inc 16,%o0
200 addx %g1,0,%o5
201 st %g3,[%o0-4]
202 andcc %o2,-4,%g0
203 nop
204 bnz,a .L_bn_mul_words_loop
205 ld [%o1],%g2
206
207 tst %o2
208 bnz,a .L_bn_mul_words_tail
209 ld [%o1],%g2
210.L_bn_mul_words_return:
211 retl
212 mov %o5,%o0
213 nop
214
215.L_bn_mul_words_tail:
216 umul %o3,%g2,%g2
217 addcc %g2,%o5,%g2
218 rd %y,%g1
219 addx %g1,0,%o5
220 deccc %o2
221 bz .L_bn_mul_words_return
222 st %g2,[%o0]
223 nop
224
225 ld [%o1+4],%g2
226 umul %o3,%g2,%g2
227 addcc %g2,%o5,%g2
228 rd %y,%g1
229 addx %g1,0,%o5
230 deccc %o2
231 bz .L_bn_mul_words_return
232 st %g2,[%o0+4]
233
234 ld [%o1+8],%g2
235 umul %o3,%g2,%g2
236 addcc %g2,%o5,%g2
237 rd %y,%g1
238 st %g2,[%o0+8]
239 retl
240 addx %g1,0,%o0
241
242.type bn_mul_words,#function
243.size bn_mul_words,(.-bn_mul_words)
244
245.align 32
246.global bn_sqr_words
247/*
248 * void bn_sqr_words(r,a,n)
249 * BN_ULONG *r,*a;
250 * int n;
251 */
252bn_sqr_words:
253 cmp %o2,0
254 bg,a .L_bn_sqr_words_proceeed
255 ld [%o1],%g2
256 retl
257 clr %o0
258
259.L_bn_sqr_words_proceeed:
260 andcc %o2,-4,%g0
261 bz .L_bn_sqr_words_tail
262 clr %o5
263
264.L_bn_sqr_words_loop:
265 ld [%o1+4],%g3
266 umul %g2,%g2,%o4
267 st %o4,[%o0]
268 rd %y,%o5
269 st %o5,[%o0+4]
270
271 ld [%o1+8],%g2
272 umul %g3,%g3,%o4
273 dec 4,%o2
274 st %o4,[%o0+8]
275 rd %y,%o5
276 st %o5,[%o0+12]
277 nop
278
279 ld [%o1+12],%g3
280 umul %g2,%g2,%o4
281 st %o4,[%o0+16]
282 rd %y,%o5
283 inc 16,%o1
284 st %o5,[%o0+20]
285
286 umul %g3,%g3,%o4
287 inc 32,%o0
288 st %o4,[%o0-8]
289 rd %y,%o5
290 st %o5,[%o0-4]
291 andcc %o2,-4,%g2
292 bnz,a .L_bn_sqr_words_loop
293 ld [%o1],%g2
294
295 tst %o2
296 nop
297 bnz,a .L_bn_sqr_words_tail
298 ld [%o1],%g2
299.L_bn_sqr_words_return:
300 retl
301 clr %o0
302
303.L_bn_sqr_words_tail:
304 umul %g2,%g2,%o4
305 st %o4,[%o0]
306 deccc %o2
307 rd %y,%o5
308 bz .L_bn_sqr_words_return
309 st %o5,[%o0+4]
310
311 ld [%o1+4],%g2
312 umul %g2,%g2,%o4
313 st %o4,[%o0+8]
314 deccc %o2
315 rd %y,%o5
316 nop
317 bz .L_bn_sqr_words_return
318 st %o5,[%o0+12]
319
320 ld [%o1+8],%g2
321 umul %g2,%g2,%o4
322 st %o4,[%o0+16]
323 rd %y,%o5
324 st %o5,[%o0+20]
325 retl
326 clr %o0
327
328.type bn_sqr_words,#function
329.size bn_sqr_words,(.-bn_sqr_words)
330
331.align 32
332
333.global bn_div_words
334/*
335 * BN_ULONG bn_div_words(h,l,d)
336 * BN_ULONG h,l,d;
337 */
338bn_div_words:
339 wr %o0,%y
340 udiv %o1,%o2,%o0
341 retl
342 nop
343
344.type bn_div_words,#function
345.size bn_div_words,(.-bn_div_words)
346
347.align 32
348
349.global bn_add_words
350/*
351 * BN_ULONG bn_add_words(rp,ap,bp,n)
352 * BN_ULONG *rp,*ap,*bp;
353 * int n;
354 */
355bn_add_words:
356 cmp %o3,0
357 bg,a .L_bn_add_words_proceed
358 ld [%o1],%o4
359 retl
360 clr %o0
361
362.L_bn_add_words_proceed:
363 andcc %o3,-4,%g0
364 bz .L_bn_add_words_tail
365 clr %g1
366 ba .L_bn_add_words_warn_loop
367 addcc %g0,0,%g0 ! clear carry flag
368
369.L_bn_add_words_loop:
370 ld [%o1],%o4
371.L_bn_add_words_warn_loop:
372 ld [%o2],%o5
373 ld [%o1+4],%g3
374 ld [%o2+4],%g4
375 dec 4,%o3
376 addxcc %o5,%o4,%o5
377 st %o5,[%o0]
378
379 ld [%o1+8],%o4
380 ld [%o2+8],%o5
381 inc 16,%o1
382 addxcc %g3,%g4,%g3
383 st %g3,[%o0+4]
384
385 ld [%o1-4],%g3
386 ld [%o2+12],%g4
387 inc 16,%o2
388 addxcc %o5,%o4,%o5
389 st %o5,[%o0+8]
390
391 inc 16,%o0
392 addxcc %g3,%g4,%g3
393 st %g3,[%o0-4]
394 addx %g0,0,%g1
395 andcc %o3,-4,%g0
396 bnz,a .L_bn_add_words_loop
397 addcc %g1,-1,%g0
398
399 tst %o3
400 bnz,a .L_bn_add_words_tail
401 ld [%o1],%o4
402.L_bn_add_words_return:
403 retl
404 mov %g1,%o0
405
406.L_bn_add_words_tail:
407 addcc %g1,-1,%g0
408 ld [%o2],%o5
409 addxcc %o5,%o4,%o5
410 addx %g0,0,%g1
411 deccc %o3
412 bz .L_bn_add_words_return
413 st %o5,[%o0]
414
415 ld [%o1+4],%o4
416 addcc %g1,-1,%g0
417 ld [%o2+4],%o5
418 addxcc %o5,%o4,%o5
419 addx %g0,0,%g1
420 deccc %o3
421 bz .L_bn_add_words_return
422 st %o5,[%o0+4]
423
424 ld [%o1+8],%o4
425 addcc %g1,-1,%g0
426 ld [%o2+8],%o5
427 addxcc %o5,%o4,%o5
428 st %o5,[%o0+8]
429 retl
430 addx %g0,0,%o0
431
432.type bn_add_words,#function
433.size bn_add_words,(.-bn_add_words)
434
435.align 32
436
437.global bn_sub_words
438/*
439 * BN_ULONG bn_sub_words(rp,ap,bp,n)
440 * BN_ULONG *rp,*ap,*bp;
441 * int n;
442 */
443bn_sub_words:
444 cmp %o3,0
445 bg,a .L_bn_sub_words_proceed
446 ld [%o1],%o4
447 retl
448 clr %o0
449
450.L_bn_sub_words_proceed:
451 andcc %o3,-4,%g0
452 bz .L_bn_sub_words_tail
453 clr %g1
454 ba .L_bn_sub_words_warm_loop
455 addcc %g0,0,%g0 ! clear carry flag
456
457.L_bn_sub_words_loop:
458 ld [%o1],%o4
459.L_bn_sub_words_warm_loop:
460 ld [%o2],%o5
461 ld [%o1+4],%g3
462 ld [%o2+4],%g4
463 dec 4,%o3
464 subxcc %o4,%o5,%o5
465 st %o5,[%o0]
466
467 ld [%o1+8],%o4
468 ld [%o2+8],%o5
469 inc 16,%o1
470 subxcc %g3,%g4,%g4
471 st %g4,[%o0+4]
472
473 ld [%o1-4],%g3
474 ld [%o2+12],%g4
475 inc 16,%o2
476 subxcc %o4,%o5,%o5
477 st %o5,[%o0+8]
478
479 inc 16,%o0
480 subxcc %g3,%g4,%g4
481 st %g4,[%o0-4]
482 addx %g0,0,%g1
483 andcc %o3,-4,%g0
484 bnz,a .L_bn_sub_words_loop
485 addcc %g1,-1,%g0
486
487 tst %o3
488 nop
489 bnz,a .L_bn_sub_words_tail
490 ld [%o1],%o4
491.L_bn_sub_words_return:
492 retl
493 mov %g1,%o0
494
495.L_bn_sub_words_tail:
496 addcc %g1,-1,%g0
497 ld [%o2],%o5
498 subxcc %o4,%o5,%o5
499 addx %g0,0,%g1
500 deccc %o3
501 bz .L_bn_sub_words_return
502 st %o5,[%o0]
503 nop
504
505 ld [%o1+4],%o4
506 addcc %g1,-1,%g0
507 ld [%o2+4],%o5
508 subxcc %o4,%o5,%o5
509 addx %g0,0,%g1
510 deccc %o3
511 bz .L_bn_sub_words_return
512 st %o5,[%o0+4]
513
514 ld [%o1+8],%o4
515 addcc %g1,-1,%g0
516 ld [%o2+8],%o5
517 subxcc %o4,%o5,%o5
518 st %o5,[%o0+8]
519 retl
520 addx %g0,0,%o0
521
522.type bn_sub_words,#function
523.size bn_sub_words,(.-bn_sub_words)
524
525#define FRAME_SIZE -96
526
527/*
528 * Here is register usage map for *all* routines below.
529 */
530#define t_1 %o0
531#define t_2 %o1
532#define c_1 %o2
533#define c_2 %o3
534#define c_3 %o4
535
536#define ap(I) [%i1+4*I]
537#define bp(I) [%i2+4*I]
538#define rp(I) [%i0+4*I]
539
540#define a_0 %l0
541#define a_1 %l1
542#define a_2 %l2
543#define a_3 %l3
544#define a_4 %l4
545#define a_5 %l5
546#define a_6 %l6
547#define a_7 %l7
548
549#define b_0 %i3
550#define b_1 %i4
551#define b_2 %i5
552#define b_3 %o5
553#define b_4 %g1
554#define b_5 %g2
555#define b_6 %g3
556#define b_7 %g4
557
558.align 32
559.global bn_mul_comba8
560/*
561 * void bn_mul_comba8(r,a,b)
562 * BN_ULONG *r,*a,*b;
563 */
564bn_mul_comba8:
565 save %sp,FRAME_SIZE,%sp
566 ld ap(0),a_0
567 ld bp(0),b_0
568 umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3);
569 ld bp(1),b_1
570 rd %y,c_2
571 st c_1,rp(0) !r[0]=c1;
572
573 umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1);
574 ld ap(1),a_1
575 addcc c_2,t_1,c_2
576 rd %y,t_2
577 addxcc %g0,t_2,c_3 !=
578 addx %g0,%g0,c_1
579 ld ap(2),a_2
580 umul a_1,b_0,t_1 !mul_add_c(a[1],b[0],c2,c3,c1);
581 addcc c_2,t_1,c_2 !=
582 rd %y,t_2
583 addxcc c_3,t_2,c_3
584 st c_2,rp(1) !r[1]=c2;
585 addx c_1,%g0,c_1 !=
586
587 umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
588 addcc c_3,t_1,c_3
589 rd %y,t_2
590 addxcc c_1,t_2,c_1 !=
591 addx %g0,%g0,c_2
592 ld bp(2),b_2
593 umul a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
594 addcc c_3,t_1,c_3 !=
595 rd %y,t_2
596 addxcc c_1,t_2,c_1
597 ld bp(3),b_3
598 addx c_2,%g0,c_2 !=
599 umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
600 addcc c_3,t_1,c_3
601 rd %y,t_2
602 addxcc c_1,t_2,c_1 !=
603 addx c_2,%g0,c_2
604 st c_3,rp(2) !r[2]=c3;
605
606 umul a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
607 addcc c_1,t_1,c_1 !=
608 rd %y,t_2
609 addxcc c_2,t_2,c_2
610 addx %g0,%g0,c_3
611 umul a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3);
612 addcc c_1,t_1,c_1
613 rd %y,t_2
614 addxcc c_2,t_2,c_2
615 addx c_3,%g0,c_3 !=
616 ld ap(3),a_3
617 umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
618 addcc c_1,t_1,c_1
619 rd %y,t_2 !=
620 addxcc c_2,t_2,c_2
621 addx c_3,%g0,c_3
622 ld ap(4),a_4
623 umul a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!=
624 addcc c_1,t_1,c_1
625 rd %y,t_2
626 addxcc c_2,t_2,c_2
627 addx c_3,%g0,c_3 !=
628 st c_1,rp(3) !r[3]=c1;
629
630 umul a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1);
631 addcc c_2,t_1,c_2
632 rd %y,t_2 !=
633 addxcc c_3,t_2,c_3
634 addx %g0,%g0,c_1
635 umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
636 addcc c_2,t_1,c_2 !=
637 rd %y,t_2
638 addxcc c_3,t_2,c_3
639 addx c_1,%g0,c_1
640 umul a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1);
641 addcc c_2,t_1,c_2
642 rd %y,t_2
643 addxcc c_3,t_2,c_3
644 addx c_1,%g0,c_1 !=
645 ld bp(4),b_4
646 umul a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
647 addcc c_2,t_1,c_2
648 rd %y,t_2 !=
649 addxcc c_3,t_2,c_3
650 addx c_1,%g0,c_1
651 ld bp(5),b_5
652 umul a_0,b_4,t_1 !=!mul_add_c(a[0],b[4],c2,c3,c1);
653 addcc c_2,t_1,c_2
654 rd %y,t_2
655 addxcc c_3,t_2,c_3
656 addx c_1,%g0,c_1 !=
657 st c_2,rp(4) !r[4]=c2;
658
659 umul a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2);
660 addcc c_3,t_1,c_3
661 rd %y,t_2 !=
662 addxcc c_1,t_2,c_1
663 addx %g0,%g0,c_2
664 umul a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2);
665 addcc c_3,t_1,c_3 !=
666 rd %y,t_2
667 addxcc c_1,t_2,c_1
668 addx c_2,%g0,c_2
669 umul a_2,b_3,t_1 !=!mul_add_c(a[2],b[3],c3,c1,c2);
670 addcc c_3,t_1,c_3
671 rd %y,t_2
672 addxcc c_1,t_2,c_1
673 addx c_2,%g0,c_2 !=
674 umul a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
675 addcc c_3,t_1,c_3
676 rd %y,t_2
677 addxcc c_1,t_2,c_1 !=
678 addx c_2,%g0,c_2
679 ld ap(5),a_5
680 umul a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2);
681 addcc c_3,t_1,c_3 !=
682 rd %y,t_2
683 addxcc c_1,t_2,c_1
684 ld ap(6),a_6
685 addx c_2,%g0,c_2 !=
686 umul a_5,b_0,t_1 !mul_add_c(a[5],b[0],c3,c1,c2);
687 addcc c_3,t_1,c_3
688 rd %y,t_2
689 addxcc c_1,t_2,c_1 !=
690 addx c_2,%g0,c_2
691 st c_3,rp(5) !r[5]=c3;
692
693 umul a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3);
694 addcc c_1,t_1,c_1 !=
695 rd %y,t_2
696 addxcc c_2,t_2,c_2
697 addx %g0,%g0,c_3
698 umul a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3);
699 addcc c_1,t_1,c_1
700 rd %y,t_2
701 addxcc c_2,t_2,c_2
702 addx c_3,%g0,c_3 !=
703 umul a_4,b_2,t_1 !mul_add_c(a[4],b[2],c1,c2,c3);
704 addcc c_1,t_1,c_1
705 rd %y,t_2
706 addxcc c_2,t_2,c_2 !=
707 addx c_3,%g0,c_3
708 umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
709 addcc c_1,t_1,c_1
710 rd %y,t_2 !=
711 addxcc c_2,t_2,c_2
712 addx c_3,%g0,c_3
713 umul a_2,b_4,t_1 !mul_add_c(a[2],b[4],c1,c2,c3);
714 addcc c_1,t_1,c_1 !=
715 rd %y,t_2
716 addxcc c_2,t_2,c_2
717 ld bp(6),b_6
718 addx c_3,%g0,c_3 !=
719 umul a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3);
720 addcc c_1,t_1,c_1
721 rd %y,t_2
722 addxcc c_2,t_2,c_2 !=
723 addx c_3,%g0,c_3
724 ld bp(7),b_7
725 umul a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3);
726 addcc c_1,t_1,c_1 !=
727 rd %y,t_2
728 addxcc c_2,t_2,c_2
729 st c_1,rp(6) !r[6]=c1;
730 addx c_3,%g0,c_3 !=
731
732 umul a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1);
733 addcc c_2,t_1,c_2
734 rd %y,t_2
735 addxcc c_3,t_2,c_3 !=
736 addx %g0,%g0,c_1
737 umul a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1);
738 addcc c_2,t_1,c_2
739 rd %y,t_2 !=
740 addxcc c_3,t_2,c_3
741 addx c_1,%g0,c_1
742 umul a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1);
743 addcc c_2,t_1,c_2 !=
744 rd %y,t_2
745 addxcc c_3,t_2,c_3
746 addx c_1,%g0,c_1
747 umul a_3,b_4,t_1 !=!mul_add_c(a[3],b[4],c2,c3,c1);
748 addcc c_2,t_1,c_2
749 rd %y,t_2
750 addxcc c_3,t_2,c_3
751 addx c_1,%g0,c_1 !=
752 umul a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1);
753 addcc c_2,t_1,c_2
754 rd %y,t_2
755 addxcc c_3,t_2,c_3 !=
756 addx c_1,%g0,c_1
757 umul a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1);
758 addcc c_2,t_1,c_2
759 rd %y,t_2 !=
760 addxcc c_3,t_2,c_3
761 addx c_1,%g0,c_1
762 ld ap(7),a_7
763 umul a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1);
764 addcc c_2,t_1,c_2
765 rd %y,t_2
766 addxcc c_3,t_2,c_3
767 addx c_1,%g0,c_1 !=
768 umul a_7,b_0,t_1 !mul_add_c(a[7],b[0],c2,c3,c1);
769 addcc c_2,t_1,c_2
770 rd %y,t_2
771 addxcc c_3,t_2,c_3 !=
772 addx c_1,%g0,c_1
773 st c_2,rp(7) !r[7]=c2;
774
775 umul a_7,b_1,t_1 !mul_add_c(a[7],b[1],c3,c1,c2);
776 addcc c_3,t_1,c_3 !=
777 rd %y,t_2
778 addxcc c_1,t_2,c_1
779 addx %g0,%g0,c_2
780 umul a_6,b_2,t_1 !=!mul_add_c(a[6],b[2],c3,c1,c2);
781 addcc c_3,t_1,c_3
782 rd %y,t_2
783 addxcc c_1,t_2,c_1
784 addx c_2,%g0,c_2 !=
785 umul a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2);
786 addcc c_3,t_1,c_3
787 rd %y,t_2
788 addxcc c_1,t_2,c_1 !=
789 addx c_2,%g0,c_2
790 umul a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2);
791 addcc c_3,t_1,c_3
792 rd %y,t_2 !=
793 addxcc c_1,t_2,c_1
794 addx c_2,%g0,c_2
795 umul a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2);
796 addcc c_3,t_1,c_3 !=
797 rd %y,t_2
798 addxcc c_1,t_2,c_1
799 addx c_2,%g0,c_2
800 umul a_2,b_6,t_1 !=!mul_add_c(a[2],b[6],c3,c1,c2);
801 addcc c_3,t_1,c_3
802 rd %y,t_2
803 addxcc c_1,t_2,c_1
804 addx c_2,%g0,c_2 !=
805 umul a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2);
806 addcc c_3,t_1,c_3
807 rd %y,t_2
808 addxcc c_1,t_2,c_1 !
809 addx c_2,%g0,c_2
810 st c_3,rp(8) !r[8]=c3;
811
812 umul a_2,b_7,t_1 !mul_add_c(a[2],b[7],c1,c2,c3);
813 addcc c_1,t_1,c_1 !=
814 rd %y,t_2
815 addxcc c_2,t_2,c_2
816 addx %g0,%g0,c_3
817 umul a_3,b_6,t_1 !=!mul_add_c(a[3],b[6],c1,c2,c3);
818 addcc c_1,t_1,c_1
819 rd %y,t_2
820 addxcc c_2,t_2,c_2
821 addx c_3,%g0,c_3 !=
822 umul a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3);
823 addcc c_1,t_1,c_1
824 rd %y,t_2
825 addxcc c_2,t_2,c_2 !=
826 addx c_3,%g0,c_3
827 umul a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3);
828 addcc c_1,t_1,c_1
829 rd %y,t_2 !=
830 addxcc c_2,t_2,c_2
831 addx c_3,%g0,c_3
832 umul a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3);
833 addcc c_1,t_1,c_1 !=
834 rd %y,t_2
835 addxcc c_2,t_2,c_2
836 addx c_3,%g0,c_3
837 umul a_7,b_2,t_1 !=!mul_add_c(a[7],b[2],c1,c2,c3);
838 addcc c_1,t_1,c_1
839 rd %y,t_2
840 addxcc c_2,t_2,c_2
841 addx c_3,%g0,c_3 !=
842 st c_1,rp(9) !r[9]=c1;
843
844 umul a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1);
845 addcc c_2,t_1,c_2
846 rd %y,t_2 !=
847 addxcc c_3,t_2,c_3
848 addx %g0,%g0,c_1
849 umul a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1);
850 addcc c_2,t_1,c_2 !=
851 rd %y,t_2
852 addxcc c_3,t_2,c_3
853 addx c_1,%g0,c_1
854 umul a_5,b_5,t_1 !=!mul_add_c(a[5],b[5],c2,c3,c1);
855 addcc c_2,t_1,c_2
856 rd %y,t_2
857 addxcc c_3,t_2,c_3
858 addx c_1,%g0,c_1 !=
859 umul a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1);
860 addcc c_2,t_1,c_2
861 rd %y,t_2
862 addxcc c_3,t_2,c_3 !=
863 addx c_1,%g0,c_1
864 umul a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1);
865 addcc c_2,t_1,c_2
866 rd %y,t_2 !=
867 addxcc c_3,t_2,c_3
868 addx c_1,%g0,c_1
869 st c_2,rp(10) !r[10]=c2;
870
871 umul a_4,b_7,t_1 !=!mul_add_c(a[4],b[7],c3,c1,c2);
872 addcc c_3,t_1,c_3
873 rd %y,t_2
874 addxcc c_1,t_2,c_1
875 addx %g0,%g0,c_2 !=
876 umul a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2);
877 addcc c_3,t_1,c_3
878 rd %y,t_2
879 addxcc c_1,t_2,c_1 !=
880 addx c_2,%g0,c_2
881 umul a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2);
882 addcc c_3,t_1,c_3
883 rd %y,t_2 !=
884 addxcc c_1,t_2,c_1
885 addx c_2,%g0,c_2
886 umul a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2);
887 addcc c_3,t_1,c_3 !=
888 rd %y,t_2
889 addxcc c_1,t_2,c_1
890 st c_3,rp(11) !r[11]=c3;
891 addx c_2,%g0,c_2 !=
892
893 umul a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3);
894 addcc c_1,t_1,c_1
895 rd %y,t_2
896 addxcc c_2,t_2,c_2 !=
897 addx %g0,%g0,c_3
898 umul a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3);
899 addcc c_1,t_1,c_1
900 rd %y,t_2 !=
901 addxcc c_2,t_2,c_2
902 addx c_3,%g0,c_3
903 umul a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3);
904 addcc c_1,t_1,c_1 !=
905 rd %y,t_2
906 addxcc c_2,t_2,c_2
907 st c_1,rp(12) !r[12]=c1;
908 addx c_3,%g0,c_3 !=
909
910 umul a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1);
911 addcc c_2,t_1,c_2
912 rd %y,t_2
913 addxcc c_3,t_2,c_3 !=
914 addx %g0,%g0,c_1
915 umul a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1);
916 addcc c_2,t_1,c_2
917 rd %y,t_2 !=
918 addxcc c_3,t_2,c_3
919 addx c_1,%g0,c_1
920 st c_2,rp(13) !r[13]=c2;
921
922 umul a_7,b_7,t_1 !=!mul_add_c(a[7],b[7],c3,c1,c2);
923 addcc c_3,t_1,c_3
924 rd %y,t_2
925 addxcc c_1,t_2,c_1
926 nop !=
927 st c_3,rp(14) !r[14]=c3;
928 st c_1,rp(15) !r[15]=c1;
929
930 ret
931 restore %g0,%g0,%o0
932
933.type bn_mul_comba8,#function
934.size bn_mul_comba8,(.-bn_mul_comba8)
935
936.align 32
937
938.global bn_mul_comba4
939/*
940 * void bn_mul_comba4(r,a,b)
941 * BN_ULONG *r,*a,*b;
942 */
943bn_mul_comba4:
944 save %sp,FRAME_SIZE,%sp
945 ld ap(0),a_0
946 ld bp(0),b_0
947 umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3);
948 ld bp(1),b_1
949 rd %y,c_2
950 st c_1,rp(0) !r[0]=c1;
951
952 umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1);
953 ld ap(1),a_1
954 addcc c_2,t_1,c_2
955 rd %y,t_2 !=
956 addxcc %g0,t_2,c_3
957 addx %g0,%g0,c_1
958 ld ap(2),a_2
959 umul a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
960 addcc c_2,t_1,c_2
961 rd %y,t_2
962 addxcc c_3,t_2,c_3
963 addx c_1,%g0,c_1 !=
964 st c_2,rp(1) !r[1]=c2;
965
966 umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
967 addcc c_3,t_1,c_3
968 rd %y,t_2 !=
969 addxcc c_1,t_2,c_1
970 addx %g0,%g0,c_2
971 ld bp(2),b_2
972 umul a_1,b_1,t_1 !=!mul_add_c(a[1],b[1],c3,c1,c2);
973 addcc c_3,t_1,c_3
974 rd %y,t_2
975 addxcc c_1,t_2,c_1
976 addx c_2,%g0,c_2 !=
977 ld bp(3),b_3
978 umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
979 addcc c_3,t_1,c_3
980 rd %y,t_2 !=
981 addxcc c_1,t_2,c_1
982 addx c_2,%g0,c_2
983 st c_3,rp(2) !r[2]=c3;
984
985 umul a_0,b_3,t_1 !=!mul_add_c(a[0],b[3],c1,c2,c3);
986 addcc c_1,t_1,c_1
987 rd %y,t_2
988 addxcc c_2,t_2,c_2
989 addx %g0,%g0,c_3 !=
990 umul a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3);
991 addcc c_1,t_1,c_1
992 rd %y,t_2
993 addxcc c_2,t_2,c_2 !=
994 addx c_3,%g0,c_3
995 ld ap(3),a_3
996 umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
997 addcc c_1,t_1,c_1 !=
998 rd %y,t_2
999 addxcc c_2,t_2,c_2
1000 addx c_3,%g0,c_3
1001 umul a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);
1002 addcc c_1,t_1,c_1
1003 rd %y,t_2
1004 addxcc c_2,t_2,c_2
1005 addx c_3,%g0,c_3 !=
1006 st c_1,rp(3) !r[3]=c1;
1007
1008 umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
1009 addcc c_2,t_1,c_2
1010 rd %y,t_2 !=
1011 addxcc c_3,t_2,c_3
1012 addx %g0,%g0,c_1
1013 umul a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1);
1014 addcc c_2,t_1,c_2 !=
1015 rd %y,t_2
1016 addxcc c_3,t_2,c_3
1017 addx c_1,%g0,c_1
1018 umul a_1,b_3,t_1 !=!mul_add_c(a[1],b[3],c2,c3,c1);
1019 addcc c_2,t_1,c_2
1020 rd %y,t_2
1021 addxcc c_3,t_2,c_3
1022 addx c_1,%g0,c_1 !=
1023 st c_2,rp(4) !r[4]=c2;
1024
1025 umul a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
1026 addcc c_3,t_1,c_3
1027 rd %y,t_2 !=
1028 addxcc c_1,t_2,c_1
1029 addx %g0,%g0,c_2
1030 umul a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
1031 addcc c_3,t_1,c_3 !=
1032 rd %y,t_2
1033 addxcc c_1,t_2,c_1
1034 st c_3,rp(5) !r[5]=c3;
1035 addx c_2,%g0,c_2 !=
1036
1037 umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
1038 addcc c_1,t_1,c_1
1039 rd %y,t_2
1040 addxcc c_2,t_2,c_2 !=
1041 st c_1,rp(6) !r[6]=c1;
1042 st c_2,rp(7) !r[7]=c2;
1043
1044 ret
1045 restore %g0,%g0,%o0
1046
1047.type bn_mul_comba4,#function
1048.size bn_mul_comba4,(.-bn_mul_comba4)
1049
1050.align 32
1051
1052.global bn_sqr_comba8
1053bn_sqr_comba8:
1054 save %sp,FRAME_SIZE,%sp
1055 ld ap(0),a_0
1056 ld ap(1),a_1
1057 umul a_0,a_0,c_1 !=!sqr_add_c(a,0,c1,c2,c3);
1058 rd %y,c_2
1059 st c_1,rp(0) !r[0]=c1;
1060
1061 ld ap(2),a_2
1062 umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
1063 addcc c_2,t_1,c_2
1064 rd %y,t_2
1065 addxcc %g0,t_2,c_3
1066 addx %g0,%g0,c_1 !=
1067 addcc c_2,t_1,c_2
1068 addxcc c_3,t_2,c_3
1069 st c_2,rp(1) !r[1]=c2;
1070 addx c_1,%g0,c_1 !=
1071
1072 umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1073 addcc c_3,t_1,c_3
1074 rd %y,t_2
1075 addxcc c_1,t_2,c_1 !=
1076 addx %g0,%g0,c_2
1077 addcc c_3,t_1,c_3
1078 addxcc c_1,t_2,c_1
1079 addx c_2,%g0,c_2 !=
1080 ld ap(3),a_3
1081 umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1082 addcc c_3,t_1,c_3
1083 rd %y,t_2 !=
1084 addxcc c_1,t_2,c_1
1085 addx c_2,%g0,c_2
1086 st c_3,rp(2) !r[2]=c3;
1087
1088 umul a_0,a_3,t_1 !=!sqr_add_c2(a,3,0,c1,c2,c3);
1089 addcc c_1,t_1,c_1
1090 rd %y,t_2
1091 addxcc c_2,t_2,c_2
1092 addx %g0,%g0,c_3 !=
1093 addcc c_1,t_1,c_1
1094 addxcc c_2,t_2,c_2
1095 ld ap(4),a_4
1096 addx c_3,%g0,c_3 !=
1097 umul a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1098 addcc c_1,t_1,c_1
1099 rd %y,t_2
1100 addxcc c_2,t_2,c_2 !=
1101 addx c_3,%g0,c_3
1102 addcc c_1,t_1,c_1
1103 addxcc c_2,t_2,c_2
1104 addx c_3,%g0,c_3 !=
1105 st c_1,rp(3) !r[3]=c1;
1106
1107 umul a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1);
1108 addcc c_2,t_1,c_2
1109 rd %y,t_2 !=
1110 addxcc c_3,t_2,c_3
1111 addx %g0,%g0,c_1
1112 addcc c_2,t_1,c_2
1113 addxcc c_3,t_2,c_3 !=
1114 addx c_1,%g0,c_1
1115 umul a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1116 addcc c_2,t_1,c_2
1117 rd %y,t_2 !=
1118 addxcc c_3,t_2,c_3
1119 addx c_1,%g0,c_1
1120 addcc c_2,t_1,c_2
1121 addxcc c_3,t_2,c_3 !=
1122 addx c_1,%g0,c_1
1123 ld ap(5),a_5
1124 umul a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1125 addcc c_2,t_1,c_2 !=
1126 rd %y,t_2
1127 addxcc c_3,t_2,c_3
1128 st c_2,rp(4) !r[4]=c2;
1129 addx c_1,%g0,c_1 !=
1130
1131 umul a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2);
1132 addcc c_3,t_1,c_3
1133 rd %y,t_2
1134 addxcc c_1,t_2,c_1 !=
1135 addx %g0,%g0,c_2
1136 addcc c_3,t_1,c_3
1137 addxcc c_1,t_2,c_1
1138 addx c_2,%g0,c_2 !=
1139 umul a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2);
1140 addcc c_3,t_1,c_3
1141 rd %y,t_2
1142 addxcc c_1,t_2,c_1 !=
1143 addx c_2,%g0,c_2
1144 addcc c_3,t_1,c_3
1145 addxcc c_1,t_2,c_1
1146 addx c_2,%g0,c_2 !=
1147 ld ap(6),a_6
1148 umul a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
1149 addcc c_3,t_1,c_3
1150 rd %y,t_2 !=
1151 addxcc c_1,t_2,c_1
1152 addx c_2,%g0,c_2
1153 addcc c_3,t_1,c_3
1154 addxcc c_1,t_2,c_1 !=
1155 addx c_2,%g0,c_2
1156 st c_3,rp(5) !r[5]=c3;
1157
1158 umul a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3);
1159 addcc c_1,t_1,c_1 !=
1160 rd %y,t_2
1161 addxcc c_2,t_2,c_2
1162 addx %g0,%g0,c_3
1163 addcc c_1,t_1,c_1 !=
1164 addxcc c_2,t_2,c_2
1165 addx c_3,%g0,c_3
1166 umul a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3);
1167 addcc c_1,t_1,c_1 !=
1168 rd %y,t_2
1169 addxcc c_2,t_2,c_2
1170 addx c_3,%g0,c_3
1171 addcc c_1,t_1,c_1 !=
1172 addxcc c_2,t_2,c_2
1173 addx c_3,%g0,c_3
1174 umul a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3);
1175 addcc c_1,t_1,c_1 !=
1176 rd %y,t_2
1177 addxcc c_2,t_2,c_2
1178 addx c_3,%g0,c_3
1179 addcc c_1,t_1,c_1 !=
1180 addxcc c_2,t_2,c_2
1181 addx c_3,%g0,c_3
1182 ld ap(7),a_7
1183 umul a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3);
1184 addcc c_1,t_1,c_1
1185 rd %y,t_2
1186 addxcc c_2,t_2,c_2
1187 addx c_3,%g0,c_3 !=
1188 st c_1,rp(6) !r[6]=c1;
1189
1190 umul a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1);
1191 addcc c_2,t_1,c_2
1192 rd %y,t_2 !=
1193 addxcc c_3,t_2,c_3
1194 addx %g0,%g0,c_1
1195 addcc c_2,t_1,c_2
1196 addxcc c_3,t_2,c_3 !=
1197 addx c_1,%g0,c_1
1198 umul a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1);
1199 addcc c_2,t_1,c_2
1200 rd %y,t_2 !=
1201 addxcc c_3,t_2,c_3
1202 addx c_1,%g0,c_1
1203 addcc c_2,t_1,c_2
1204 addxcc c_3,t_2,c_3 !=
1205 addx c_1,%g0,c_1
1206 umul a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1);
1207 addcc c_2,t_1,c_2
1208 rd %y,t_2 !=
1209 addxcc c_3,t_2,c_3
1210 addx c_1,%g0,c_1
1211 addcc c_2,t_1,c_2
1212 addxcc c_3,t_2,c_3 !=
1213 addx c_1,%g0,c_1
1214 umul a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1);
1215 addcc c_2,t_1,c_2
1216 rd %y,t_2 !=
1217 addxcc c_3,t_2,c_3
1218 addx c_1,%g0,c_1
1219 addcc c_2,t_1,c_2
1220 addxcc c_3,t_2,c_3 !=
1221 addx c_1,%g0,c_1
1222 st c_2,rp(7) !r[7]=c2;
1223
1224 umul a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2);
1225 addcc c_3,t_1,c_3 !=
1226 rd %y,t_2
1227 addxcc c_1,t_2,c_1
1228 addx %g0,%g0,c_2
1229 addcc c_3,t_1,c_3 !=
1230 addxcc c_1,t_2,c_1
1231 addx c_2,%g0,c_2
1232 umul a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2);
1233 addcc c_3,t_1,c_3 !=
1234 rd %y,t_2
1235 addxcc c_1,t_2,c_1
1236 addx c_2,%g0,c_2
1237 addcc c_3,t_1,c_3 !=
1238 addxcc c_1,t_2,c_1
1239 addx c_2,%g0,c_2
1240 umul a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2);
1241 addcc c_3,t_1,c_3 !=
1242 rd %y,t_2
1243 addxcc c_1,t_2,c_1
1244 addx c_2,%g0,c_2
1245 addcc c_3,t_1,c_3 !=
1246 addxcc c_1,t_2,c_1
1247 addx c_2,%g0,c_2
1248 umul a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2);
1249 addcc c_3,t_1,c_3 !=
1250 rd %y,t_2
1251 addxcc c_1,t_2,c_1
1252 st c_3,rp(8) !r[8]=c3;
1253 addx c_2,%g0,c_2 !=
1254
1255 umul a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3);
1256 addcc c_1,t_1,c_1
1257 rd %y,t_2
1258 addxcc c_2,t_2,c_2 !=
1259 addx %g0,%g0,c_3
1260 addcc c_1,t_1,c_1
1261 addxcc c_2,t_2,c_2
1262 addx c_3,%g0,c_3 !=
1263 umul a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3);
1264 addcc c_1,t_1,c_1
1265 rd %y,t_2
1266 addxcc c_2,t_2,c_2 !=
1267 addx c_3,%g0,c_3
1268 addcc c_1,t_1,c_1
1269 addxcc c_2,t_2,c_2
1270 addx c_3,%g0,c_3 !=
1271 umul a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3);
1272 addcc c_1,t_1,c_1
1273 rd %y,t_2
1274 addxcc c_2,t_2,c_2 !=
1275 addx c_3,%g0,c_3
1276 addcc c_1,t_1,c_1
1277 addxcc c_2,t_2,c_2
1278 addx c_3,%g0,c_3 !=
1279 st c_1,rp(9) !r[9]=c1;
1280
1281 umul a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1);
1282 addcc c_2,t_1,c_2
1283 rd %y,t_2 !=
1284 addxcc c_3,t_2,c_3
1285 addx %g0,%g0,c_1
1286 addcc c_2,t_1,c_2
1287 addxcc c_3,t_2,c_3 !=
1288 addx c_1,%g0,c_1
1289 umul a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1);
1290 addcc c_2,t_1,c_2
1291 rd %y,t_2 !=
1292 addxcc c_3,t_2,c_3
1293 addx c_1,%g0,c_1
1294 addcc c_2,t_1,c_2
1295 addxcc c_3,t_2,c_3 !=
1296 addx c_1,%g0,c_1
1297 umul a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1);
1298 addcc c_2,t_1,c_2
1299 rd %y,t_2 !=
1300 addxcc c_3,t_2,c_3
1301 addx c_1,%g0,c_1
1302 st c_2,rp(10) !r[10]=c2;
1303
1304 umul a_4,a_7,t_1 !=!sqr_add_c2(a,7,4,c3,c1,c2);
1305 addcc c_3,t_1,c_3
1306 rd %y,t_2
1307 addxcc c_1,t_2,c_1
1308 addx %g0,%g0,c_2 !=
1309 addcc c_3,t_1,c_3
1310 addxcc c_1,t_2,c_1
1311 addx c_2,%g0,c_2
1312 umul a_5,a_6,t_1 !=!sqr_add_c2(a,6,5,c3,c1,c2);
1313 addcc c_3,t_1,c_3
1314 rd %y,t_2
1315 addxcc c_1,t_2,c_1
1316 addx c_2,%g0,c_2 !=
1317 addcc c_3,t_1,c_3
1318 addxcc c_1,t_2,c_1
1319 st c_3,rp(11) !r[11]=c3;
1320 addx c_2,%g0,c_2 !=
1321
1322 umul a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3);
1323 addcc c_1,t_1,c_1
1324 rd %y,t_2
1325 addxcc c_2,t_2,c_2 !=
1326 addx %g0,%g0,c_3
1327 addcc c_1,t_1,c_1
1328 addxcc c_2,t_2,c_2
1329 addx c_3,%g0,c_3 !=
1330 umul a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3);
1331 addcc c_1,t_1,c_1
1332 rd %y,t_2
1333 addxcc c_2,t_2,c_2 !=
1334 addx c_3,%g0,c_3
1335 st c_1,rp(12) !r[12]=c1;
1336
1337 umul a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1);
1338 addcc c_2,t_1,c_2 !=
1339 rd %y,t_2
1340 addxcc c_3,t_2,c_3
1341 addx %g0,%g0,c_1
1342 addcc c_2,t_1,c_2 !=
1343 addxcc c_3,t_2,c_3
1344 st c_2,rp(13) !r[13]=c2;
1345 addx c_1,%g0,c_1 !=
1346
1347 umul a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2);
1348 addcc c_3,t_1,c_3
1349 rd %y,t_2
1350 addxcc c_1,t_2,c_1 !=
1351 st c_3,rp(14) !r[14]=c3;
1352 st c_1,rp(15) !r[15]=c1;
1353
1354 ret
1355 restore %g0,%g0,%o0
1356
1357.type bn_sqr_comba8,#function
1358.size bn_sqr_comba8,(.-bn_sqr_comba8)
1359
1360.align 32
1361
1362.global bn_sqr_comba4
1363/*
1364 * void bn_sqr_comba4(r,a)
1365 * BN_ULONG *r,*a;
1366 */
1367bn_sqr_comba4:
1368 save %sp,FRAME_SIZE,%sp
1369 ld ap(0),a_0
1370 umul a_0,a_0,c_1 !sqr_add_c(a,0,c1,c2,c3);
1371 ld ap(1),a_1 !=
1372 rd %y,c_2
1373 st c_1,rp(0) !r[0]=c1;
1374
1375 ld ap(2),a_2
1376 umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
1377 addcc c_2,t_1,c_2
1378 rd %y,t_2
1379 addxcc %g0,t_2,c_3
1380 addx %g0,%g0,c_1 !=
1381 addcc c_2,t_1,c_2
1382 addxcc c_3,t_2,c_3
1383 addx c_1,%g0,c_1 !=
1384 st c_2,rp(1) !r[1]=c2;
1385
1386 umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1387 addcc c_3,t_1,c_3
1388 rd %y,t_2 !=
1389 addxcc c_1,t_2,c_1
1390 addx %g0,%g0,c_2
1391 addcc c_3,t_1,c_3
1392 addxcc c_1,t_2,c_1 !=
1393 addx c_2,%g0,c_2
1394 ld ap(3),a_3
1395 umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1396 addcc c_3,t_1,c_3 !=
1397 rd %y,t_2
1398 addxcc c_1,t_2,c_1
1399 st c_3,rp(2) !r[2]=c3;
1400 addx c_2,%g0,c_2 !=
1401
1402 umul a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
1403 addcc c_1,t_1,c_1
1404 rd %y,t_2
1405 addxcc c_2,t_2,c_2 !=
1406 addx %g0,%g0,c_3
1407 addcc c_1,t_1,c_1
1408 addxcc c_2,t_2,c_2
1409 addx c_3,%g0,c_3 !=
1410 umul a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1411 addcc c_1,t_1,c_1
1412 rd %y,t_2
1413 addxcc c_2,t_2,c_2 !=
1414 addx c_3,%g0,c_3
1415 addcc c_1,t_1,c_1
1416 addxcc c_2,t_2,c_2
1417 addx c_3,%g0,c_3 !=
1418 st c_1,rp(3) !r[3]=c1;
1419
1420 umul a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1421 addcc c_2,t_1,c_2
1422 rd %y,t_2 !=
1423 addxcc c_3,t_2,c_3
1424 addx %g0,%g0,c_1
1425 addcc c_2,t_1,c_2
1426 addxcc c_3,t_2,c_3 !=
1427 addx c_1,%g0,c_1
1428 umul a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1429 addcc c_2,t_1,c_2
1430 rd %y,t_2 !=
1431 addxcc c_3,t_2,c_3
1432 addx c_1,%g0,c_1
1433 st c_2,rp(4) !r[4]=c2;
1434
1435 umul a_2,a_3,t_1 !=!sqr_add_c2(a,3,2,c3,c1,c2);
1436 addcc c_3,t_1,c_3
1437 rd %y,t_2
1438 addxcc c_1,t_2,c_1
1439 addx %g0,%g0,c_2 !=
1440 addcc c_3,t_1,c_3
1441 addxcc c_1,t_2,c_1
1442 st c_3,rp(5) !r[5]=c3;
1443 addx c_2,%g0,c_2 !=
1444
1445 umul a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3);
1446 addcc c_1,t_1,c_1
1447 rd %y,t_2
1448 addxcc c_2,t_2,c_2 !=
1449 st c_1,rp(6) !r[6]=c1;
1450 st c_2,rp(7) !r[7]=c2;
1451
1452 ret
1453 restore %g0,%g0,%o0
1454
1455.type bn_sqr_comba4,#function
1456.size bn_sqr_comba4,(.-bn_sqr_comba4)
1457
1458.align 32
diff --git a/src/lib/libcrypto/bn/asm/sparcv8plus.S b/src/lib/libcrypto/bn/asm/sparcv8plus.S
new file mode 100644
index 0000000000..0074dfdb75
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/sparcv8plus.S
@@ -0,0 +1,1535 @@
1.ident "sparcv8plus.s, Version 1.4"
2.ident "SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
3
4/*
5 * ====================================================================
6 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
7 * project.
8 *
9 * Rights for redistribution and usage in source and binary forms are
10 * granted according to the OpenSSL license. Warranty of any kind is
11 * disclaimed.
12 * ====================================================================
13 */
14
15/*
16 * This is my modest contributon to OpenSSL project (see
17 * http://www.openssl.org/ for more information about it) and is
18 * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c
19 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
20 *
21 * Questions-n-answers.
22 *
23 * Q. How to compile?
24 * A. With SC4.x/SC5.x:
25 *
26 * cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
27 *
28 * and with gcc:
29 *
30 * gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
31 *
32 * or if above fails (it does if you have gas installed):
33 *
34 * gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
35 *
36 * Quick-n-dirty way to fuse the module into the library.
37 * Provided that the library is already configured and built
38 * (in 0.9.2 case with no-asm option):
39 *
40 * # cd crypto/bn
41 * # cp /some/place/bn_asm.sparc.v8plus.S .
42 * # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
43 * # make
44 * # cd ../..
45 * # make; make test
46 *
47 * Quick-n-dirty way to get rid of it:
48 *
49 * # cd crypto/bn
50 * # touch bn_asm.c
51 * # make
52 * # cd ../..
53 * # make; make test
54 *
55 * Q. V8plus achitecture? What kind of beast is that?
56 * A. Well, it's rather a programming model than an architecture...
57 * It's actually v9-compliant, i.e. *any* UltraSPARC, CPU under
58 * special conditions, namely when kernel doesn't preserve upper
59 * 32 bits of otherwise 64-bit registers during a context switch.
60 *
61 * Q. Why just UltraSPARC? What about SuperSPARC?
62 * A. Original release did target UltraSPARC only. Now SuperSPARC
63 * version is provided along. Both version share bn_*comba[48]
64 * implementations (see comment later in code for explanation).
65 * But what's so special about this UltraSPARC implementation?
66 * Why didn't I let compiler do the job? Trouble is that most of
67 * available compilers (well, SC5.0 is the only exception) don't
68 * attempt to take advantage of UltraSPARC's 64-bitness under
69 * 32-bit kernels even though it's perfectly possible (see next
70 * question).
71 *
72 * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
73 * doesn't work?
74 * A. You can't adress *all* registers as 64-bit wide:-( The catch is
75 * that you actually may rely upon %o0-%o5 and %g1-%g4 being fully
76 * preserved if you're in a leaf function, i.e. such never calling
77 * any other functions. All functions in this module are leaf and
78 * 10 registers is a handful. And as a matter of fact none-"comba"
79 * routines don't require even that much and I could even afford to
80 * not allocate own stack frame for 'em:-)
81 *
82 * Q. What about 64-bit kernels?
83 * A. What about 'em? Just kidding:-) Pure 64-bit version is currently
84 * under evaluation and development...
85 *
86 * Q. What about shared libraries?
87 * A. What about 'em? Kidding again:-) Code does *not* contain any
88 * code position dependencies and it's safe to include it into
89 * shared library as is.
90 *
91 * Q. How much faster does it go?
92 * A. Do you have a good benchmark? In either case below is what I
93 * experience with crypto/bn/expspeed.c test program:
94 *
95 * v8plus module on U10/300MHz against bn_asm.c compiled with:
96 *
97 * cc-5.0 -xarch=v8plus -xO5 -xdepend +7-12%
98 * cc-4.2 -xarch=v8plus -xO5 -xdepend +25-35%
99 * egcs-1.1.2 -mcpu=ultrasparc -O3 +35-45%
100 *
101 * v8 module on SS10/60MHz against bn_asm.c compiled with:
102 *
103 * cc-5.0 -xarch=v8 -xO5 -xdepend +7-10%
104 * cc-4.2 -xarch=v8 -xO5 -xdepend +10%
105 * egcs-1.1.2 -mv8 -O3 +35-45%
106 *
107 * As you can see it's damn hard to beat the new Sun C compiler
108 * and it's in first place GNU C users who will appreciate this
109 * assembler implementation:-)
110 */
111
112/*
113 * Revision history.
114 *
115 * 1.0 - initial release;
116 * 1.1 - new loop unrolling model(*);
117 * - some more fine tuning;
118 * 1.2 - made gas friendly;
119 * - updates to documentation concerning v9;
120 * - new performance comparison matrix;
121 * 1.3 - fixed problem with /usr/ccs/lib/cpp;
122 * 1.4 - native V9 bn_*_comba[48] implementation (15% more efficient)
123 * resulting in slight overall performance kick;
124 * - some retunes;
125 * - support for GNU as added;
126 *
127 * (*) Originally unrolled loop looked like this:
128 * for (;;) {
129 * op(p+0); if (--n==0) break;
130 * op(p+1); if (--n==0) break;
131 * op(p+2); if (--n==0) break;
132 * op(p+3); if (--n==0) break;
133 * p+=4;
134 * }
135 * I unroll according to following:
136 * while (n&~3) {
137 * op(p+0); op(p+1); op(p+2); op(p+3);
138 * p+=4; n=-4;
139 * }
140 * if (n) {
141 * op(p+0); if (--n==0) return;
142 * op(p+2); if (--n==0) return;
143 * op(p+3); return;
144 * }
145 */
146
147/*
148 * GNU assembler can't stand stuw:-(
149 */
150#define stuw st
151
152.section ".text",#alloc,#execinstr
153.file "bn_asm.sparc.v8plus.S"
154
155.align 32
156
157.global bn_mul_add_words
158/*
159 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
160 * BN_ULONG *rp,*ap;
161 * int num;
162 * BN_ULONG w;
163 */
164bn_mul_add_words:
165 brgz,a %o2,.L_bn_mul_add_words_proceed
166 lduw [%o1],%g2
167 retl
168 clr %o0
169
170.L_bn_mul_add_words_proceed:
171 srl %o3,%g0,%o3 ! clruw %o3
172 andcc %o2,-4,%g0
173 bz,pn %icc,.L_bn_mul_add_words_tail
174 clr %o5
175
176.L_bn_mul_add_words_loop: ! wow! 32 aligned!
177 lduw [%o0],%g1
178 lduw [%o1+4],%g3
179 mulx %o3,%g2,%g2
180 add %g1,%o5,%o4
181 nop
182 add %o4,%g2,%o4
183 stuw %o4,[%o0]
184 srlx %o4,32,%o5
185
186 lduw [%o0+4],%g1
187 lduw [%o1+8],%g2
188 mulx %o3,%g3,%g3
189 add %g1,%o5,%o4
190 dec 4,%o2
191 add %o4,%g3,%o4
192 stuw %o4,[%o0+4]
193 srlx %o4,32,%o5
194
195 lduw [%o0+8],%g1
196 lduw [%o1+12],%g3
197 mulx %o3,%g2,%g2
198 add %g1,%o5,%o4
199 inc 16,%o1
200 add %o4,%g2,%o4
201 stuw %o4,[%o0+8]
202 srlx %o4,32,%o5
203
204 lduw [%o0+12],%g1
205 mulx %o3,%g3,%g3
206 add %g1,%o5,%o4
207 inc 16,%o0
208 add %o4,%g3,%o4
209 andcc %o2,-4,%g0
210 stuw %o4,[%o0-4]
211 srlx %o4,32,%o5
212 bnz,a,pt %icc,.L_bn_mul_add_words_loop
213 lduw [%o1],%g2
214
215 brnz,a,pn %o2,.L_bn_mul_add_words_tail
216 lduw [%o1],%g2
217.L_bn_mul_add_words_return:
218 retl
219 mov %o5,%o0
220
221.L_bn_mul_add_words_tail:
222 lduw [%o0],%g1
223 mulx %o3,%g2,%g2
224 add %g1,%o5,%o4
225 dec %o2
226 add %o4,%g2,%o4
227 srlx %o4,32,%o5
228 brz,pt %o2,.L_bn_mul_add_words_return
229 stuw %o4,[%o0]
230
231 lduw [%o1+4],%g2
232 lduw [%o0+4],%g1
233 mulx %o3,%g2,%g2
234 add %g1,%o5,%o4
235 dec %o2
236 add %o4,%g2,%o4
237 srlx %o4,32,%o5
238 brz,pt %o2,.L_bn_mul_add_words_return
239 stuw %o4,[%o0+4]
240
241 lduw [%o1+8],%g2
242 lduw [%o0+8],%g1
243 mulx %o3,%g2,%g2
244 add %g1,%o5,%o4
245 add %o4,%g2,%o4
246 stuw %o4,[%o0+8]
247 retl
248 srlx %o4,32,%o0
249
250.type bn_mul_add_words,#function
251.size bn_mul_add_words,(.-bn_mul_add_words)
252
253.align 32
254
255.global bn_mul_words
256/*
257 * BN_ULONG bn_mul_words(rp,ap,num,w)
258 * BN_ULONG *rp,*ap;
259 * int num;
260 * BN_ULONG w;
261 */
262bn_mul_words:
263 brgz,a %o2,.L_bn_mul_words_proceeed
264 lduw [%o1],%g2
265 retl
266 clr %o0
267
268.L_bn_mul_words_proceeed:
269 srl %o3,%g0,%o3 ! clruw %o3
270 andcc %o2,-4,%g0
271 bz,pn %icc,.L_bn_mul_words_tail
272 clr %o5
273
274.L_bn_mul_words_loop: ! wow! 32 aligned!
275 lduw [%o1+4],%g3
276 mulx %o3,%g2,%g2
277 add %g2,%o5,%o4
278 nop
279 stuw %o4,[%o0]
280 srlx %o4,32,%o5
281
282 lduw [%o1+8],%g2
283 mulx %o3,%g3,%g3
284 add %g3,%o5,%o4
285 dec 4,%o2
286 stuw %o4,[%o0+4]
287 srlx %o4,32,%o5
288
289 lduw [%o1+12],%g3
290 mulx %o3,%g2,%g2
291 add %g2,%o5,%o4
292 inc 16,%o1
293 stuw %o4,[%o0+8]
294 srlx %o4,32,%o5
295
296 mulx %o3,%g3,%g3
297 add %g3,%o5,%o4
298 inc 16,%o0
299 stuw %o4,[%o0-4]
300 srlx %o4,32,%o5
301 andcc %o2,-4,%g0
302 bnz,a,pt %icc,.L_bn_mul_words_loop
303 lduw [%o1],%g2
304 nop
305 nop
306
307 brnz,a,pn %o2,.L_bn_mul_words_tail
308 lduw [%o1],%g2
309.L_bn_mul_words_return:
310 retl
311 mov %o5,%o0
312
313.L_bn_mul_words_tail:
314 mulx %o3,%g2,%g2
315 add %g2,%o5,%o4
316 dec %o2
317 srlx %o4,32,%o5
318 brz,pt %o2,.L_bn_mul_words_return
319 stuw %o4,[%o0]
320
321 lduw [%o1+4],%g2
322 mulx %o3,%g2,%g2
323 add %g2,%o5,%o4
324 dec %o2
325 srlx %o4,32,%o5
326 brz,pt %o2,.L_bn_mul_words_return
327 stuw %o4,[%o0+4]
328
329 lduw [%o1+8],%g2
330 mulx %o3,%g2,%g2
331 add %g2,%o5,%o4
332 stuw %o4,[%o0+8]
333 retl
334 srlx %o4,32,%o0
335
336.type bn_mul_words,#function
337.size bn_mul_words,(.-bn_mul_words)
338
339.align 32
340.global bn_sqr_words
341/*
342 * void bn_sqr_words(r,a,n)
343 * BN_ULONG *r,*a;
344 * int n;
345 */
346bn_sqr_words:
347 brgz,a %o2,.L_bn_sqr_words_proceeed
348 lduw [%o1],%g2
349 retl
350 clr %o0
351
352.L_bn_sqr_words_proceeed:
353 andcc %o2,-4,%g0
354 nop
355 bz,pn %icc,.L_bn_sqr_words_tail
356 nop
357
358.L_bn_sqr_words_loop: ! wow! 32 aligned!
359 lduw [%o1+4],%g3
360 mulx %g2,%g2,%o4
361 stuw %o4,[%o0]
362 srlx %o4,32,%o5
363 stuw %o5,[%o0+4]
364 nop
365
366 lduw [%o1+8],%g2
367 mulx %g3,%g3,%o4
368 dec 4,%o2
369 stuw %o4,[%o0+8]
370 srlx %o4,32,%o5
371 stuw %o5,[%o0+12]
372
373 lduw [%o1+12],%g3
374 mulx %g2,%g2,%o4
375 srlx %o4,32,%o5
376 stuw %o4,[%o0+16]
377 inc 16,%o1
378 stuw %o5,[%o0+20]
379
380 mulx %g3,%g3,%o4
381 inc 32,%o0
382 stuw %o4,[%o0-8]
383 srlx %o4,32,%o5
384 andcc %o2,-4,%g2
385 stuw %o5,[%o0-4]
386 bnz,a,pt %icc,.L_bn_sqr_words_loop
387 lduw [%o1],%g2
388 nop
389
390 brnz,a,pn %o2,.L_bn_sqr_words_tail
391 lduw [%o1],%g2
392.L_bn_sqr_words_return:
393 retl
394 clr %o0
395
396.L_bn_sqr_words_tail:
397 mulx %g2,%g2,%o4
398 dec %o2
399 stuw %o4,[%o0]
400 srlx %o4,32,%o5
401 brz,pt %o2,.L_bn_sqr_words_return
402 stuw %o5,[%o0+4]
403
404 lduw [%o1+4],%g2
405 mulx %g2,%g2,%o4
406 dec %o2
407 stuw %o4,[%o0+8]
408 srlx %o4,32,%o5
409 brz,pt %o2,.L_bn_sqr_words_return
410 stuw %o5,[%o0+12]
411
412 lduw [%o1+8],%g2
413 mulx %g2,%g2,%o4
414 srlx %o4,32,%o5
415 stuw %o4,[%o0+16]
416 stuw %o5,[%o0+20]
417 retl
418 clr %o0
419
420.type bn_sqr_words,#function
421.size bn_sqr_words,(.-bn_sqr_words)
422
423.align 32
424.global bn_div_words
425/*
426 * BN_ULONG bn_div_words(h,l,d)
427 * BN_ULONG h,l,d;
428 */
429bn_div_words:
430 sllx %o0,32,%o0
431 or %o0,%o1,%o0
432 udivx %o0,%o2,%o0
433 retl
434 srl %o0,%g0,%o0 ! clruw %o0
435
436.type bn_div_words,#function
437.size bn_div_words,(.-bn_div_words)
438
439.align 32
440
441.global bn_add_words
442/*
443 * BN_ULONG bn_add_words(rp,ap,bp,n)
444 * BN_ULONG *rp,*ap,*bp;
445 * int n;
446 */
447bn_add_words:
448 brgz,a %o3,.L_bn_add_words_proceed
449 lduw [%o1],%o4
450 retl
451 clr %o0
452
453.L_bn_add_words_proceed:
454 andcc %o3,-4,%g0
455 bz,pn %icc,.L_bn_add_words_tail
456 addcc %g0,0,%g0 ! clear carry flag
457 nop
458
459.L_bn_add_words_loop: ! wow! 32 aligned!
460 dec 4,%o3
461 lduw [%o2],%o5
462 lduw [%o1+4],%g1
463 lduw [%o2+4],%g2
464 lduw [%o1+8],%g3
465 lduw [%o2+8],%g4
466 addccc %o5,%o4,%o5
467 stuw %o5,[%o0]
468
469 lduw [%o1+12],%o4
470 lduw [%o2+12],%o5
471 inc 16,%o1
472 addccc %g1,%g2,%g1
473 stuw %g1,[%o0+4]
474
475 inc 16,%o2
476 addccc %g3,%g4,%g3
477 stuw %g3,[%o0+8]
478
479 inc 16,%o0
480 addccc %o5,%o4,%o5
481 stuw %o5,[%o0-4]
482 and %o3,-4,%g1
483 brnz,a,pt %g1,.L_bn_add_words_loop
484 lduw [%o1],%o4
485
486 brnz,a,pn %o3,.L_bn_add_words_tail
487 lduw [%o1],%o4
488.L_bn_add_words_return:
489 clr %o0
490 retl
491 movcs %icc,1,%o0
492 nop
493
494.L_bn_add_words_tail:
495 lduw [%o2],%o5
496 dec %o3
497 addccc %o5,%o4,%o5
498 brz,pt %o3,.L_bn_add_words_return
499 stuw %o5,[%o0]
500
501 lduw [%o1+4],%o4
502 lduw [%o2+4],%o5
503 dec %o3
504 addccc %o5,%o4,%o5
505 brz,pt %o3,.L_bn_add_words_return
506 stuw %o5,[%o0+4]
507
508 lduw [%o1+8],%o4
509 lduw [%o2+8],%o5
510 addccc %o5,%o4,%o5
511 stuw %o5,[%o0+8]
512 clr %o0
513 retl
514 movcs %icc,1,%o0
515
516.type bn_add_words,#function
517.size bn_add_words,(.-bn_add_words)
518
519.global bn_sub_words
520/*
521 * BN_ULONG bn_sub_words(rp,ap,bp,n)
522 * BN_ULONG *rp,*ap,*bp;
523 * int n;
524 */
525bn_sub_words:
526 brgz,a %o3,.L_bn_sub_words_proceed
527 lduw [%o1],%o4
528 retl
529 clr %o0
530
531.L_bn_sub_words_proceed:
532 andcc %o3,-4,%g0
533 bz,pn %icc,.L_bn_sub_words_tail
534 addcc %g0,0,%g0 ! clear carry flag
535 nop
536
537.L_bn_sub_words_loop: ! wow! 32 aligned!
538 dec 4,%o3
539 lduw [%o2],%o5
540 lduw [%o1+4],%g1
541 lduw [%o2+4],%g2
542 lduw [%o1+8],%g3
543 lduw [%o2+8],%g4
544 subccc %o4,%o5,%o5
545 stuw %o5,[%o0]
546
547 lduw [%o1+12],%o4
548 lduw [%o2+12],%o5
549 inc 16,%o1
550 subccc %g1,%g2,%g2
551 stuw %g2,[%o0+4]
552
553 inc 16,%o2
554 subccc %g3,%g4,%g4
555 stuw %g4,[%o0+8]
556
557 inc 16,%o0
558 subccc %o4,%o5,%o5
559 stuw %o5,[%o0-4]
560 and %o3,-4,%g1
561 brnz,a,pt %g1,.L_bn_sub_words_loop
562 lduw [%o1],%o4
563
564 brnz,a,pn %o3,.L_bn_sub_words_tail
565 lduw [%o1],%o4
566.L_bn_sub_words_return:
567 clr %o0
568 retl
569 movcs %icc,1,%o0
570 nop
571
572.L_bn_sub_words_tail: ! wow! 32 aligned!
573 lduw [%o2],%o5
574 dec %o3
575 subccc %o4,%o5,%o5
576 brz,pt %o3,.L_bn_sub_words_return
577 stuw %o5,[%o0]
578
579 lduw [%o1+4],%o4
580 lduw [%o2+4],%o5
581 dec %o3
582 subccc %o4,%o5,%o5
583 brz,pt %o3,.L_bn_sub_words_return
584 stuw %o5,[%o0+4]
585
586 lduw [%o1+8],%o4
587 lduw [%o2+8],%o5
588 subccc %o4,%o5,%o5
589 stuw %o5,[%o0+8]
590 clr %o0
591 retl
592 movcs %icc,1,%o0
593
594.type bn_sub_words,#function
595.size bn_sub_words,(.-bn_sub_words)
596
597/*
598 * Code below depends on the fact that upper parts of the %l0-%l7
599 * and %i0-%i7 are zeroed by kernel after context switch. In
600 * previous versions this comment stated that "the trouble is that
601 * it's not feasible to implement the mumbo-jumbo in less V9
602 * instructions:-(" which apparently isn't true thanks to
603 * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
604 * results not from the shorter code, but from elimination of
605 * multicycle none-pairable 'rd %y,%rd' instructions.
606 *
607 * Andy.
608 */
609
610#define FRAME_SIZE -96
611
612/*
613 * Here is register usage map for *all* routines below.
614 */
615#define t_1 %o0
616#define t_2 %o1
617#define c_12 %o2
618#define c_3 %o3
619
620#define ap(I) [%i1+4*I]
621#define bp(I) [%i2+4*I]
622#define rp(I) [%i0+4*I]
623
624#define a_0 %l0
625#define a_1 %l1
626#define a_2 %l2
627#define a_3 %l3
628#define a_4 %l4
629#define a_5 %l5
630#define a_6 %l6
631#define a_7 %l7
632
633#define b_0 %i3
634#define b_1 %i4
635#define b_2 %i5
636#define b_3 %o4
637#define b_4 %o5
638#define b_5 %o7
639#define b_6 %g1
640#define b_7 %g4
641
642.align 32
643.global bn_mul_comba8
644/*
645 * void bn_mul_comba8(r,a,b)
646 * BN_ULONG *r,*a,*b;
647 */
648bn_mul_comba8:
649 save %sp,FRAME_SIZE,%sp
650 mov 1,t_2
651 lduw ap(0),a_0
652 sllx t_2,32,t_2
653 lduw bp(0),b_0 !=
654 lduw bp(1),b_1
655 mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3);
656 srlx t_1,32,c_12
657 stuw t_1,rp(0) !=!r[0]=c1;
658
659 lduw ap(1),a_1
660 mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1);
661 addcc c_12,t_1,c_12
662 clr c_3 !=
663 bcs,a %xcc,.+8
664 add c_3,t_2,c_3
665 lduw ap(2),a_2
666 mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
667 addcc c_12,t_1,t_1
668 bcs,a %xcc,.+8
669 add c_3,t_2,c_3
670 srlx t_1,32,c_12 !=
671 stuw t_1,rp(1) !r[1]=c2;
672 or c_12,c_3,c_12
673
674 mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
675 addcc c_12,t_1,c_12 !=
676 clr c_3
677 bcs,a %xcc,.+8
678 add c_3,t_2,c_3
679 lduw bp(2),b_2 !=
680 mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
681 addcc c_12,t_1,c_12
682 bcs,a %xcc,.+8
683 add c_3,t_2,c_3 !=
684 lduw bp(3),b_3
685 mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
686 addcc c_12,t_1,t_1
687 bcs,a %xcc,.+8 !=
688 add c_3,t_2,c_3
689 srlx t_1,32,c_12
690 stuw t_1,rp(2) !r[2]=c3;
691 or c_12,c_3,c_12 !=
692
693 mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
694 addcc c_12,t_1,c_12
695 clr c_3
696 bcs,a %xcc,.+8 !=
697 add c_3,t_2,c_3
698 mulx a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3);
699 addcc c_12,t_1,c_12
700 bcs,a %xcc,.+8 !=
701 add c_3,t_2,c_3
702 lduw ap(3),a_3
703 mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
704 addcc c_12,t_1,c_12 !=
705 bcs,a %xcc,.+8
706 add c_3,t_2,c_3
707 lduw ap(4),a_4
708 mulx a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);!=
709 addcc c_12,t_1,t_1
710 bcs,a %xcc,.+8
711 add c_3,t_2,c_3
712 srlx t_1,32,c_12 !=
713 stuw t_1,rp(3) !r[3]=c1;
714 or c_12,c_3,c_12
715
716 mulx a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1);
717 addcc c_12,t_1,c_12 !=
718 clr c_3
719 bcs,a %xcc,.+8
720 add c_3,t_2,c_3
721 mulx a_3,b_1,t_1 !=!mul_add_c(a[3],b[1],c2,c3,c1);
722 addcc c_12,t_1,c_12
723 bcs,a %xcc,.+8
724 add c_3,t_2,c_3
725 mulx a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1);
726 addcc c_12,t_1,c_12
727 bcs,a %xcc,.+8
728 add c_3,t_2,c_3
729 lduw bp(4),b_4 !=
730 mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
731 addcc c_12,t_1,c_12
732 bcs,a %xcc,.+8
733 add c_3,t_2,c_3 !=
734 lduw bp(5),b_5
735 mulx a_0,b_4,t_1 !mul_add_c(a[0],b[4],c2,c3,c1);
736 addcc c_12,t_1,t_1
737 bcs,a %xcc,.+8 !=
738 add c_3,t_2,c_3
739 srlx t_1,32,c_12
740 stuw t_1,rp(4) !r[4]=c2;
741 or c_12,c_3,c_12 !=
742
743 mulx a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2);
744 addcc c_12,t_1,c_12
745 clr c_3
746 bcs,a %xcc,.+8 !=
747 add c_3,t_2,c_3
748 mulx a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2);
749 addcc c_12,t_1,c_12
750 bcs,a %xcc,.+8 !=
751 add c_3,t_2,c_3
752 mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
753 addcc c_12,t_1,c_12
754 bcs,a %xcc,.+8 !=
755 add c_3,t_2,c_3
756 mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
757 addcc c_12,t_1,c_12
758 bcs,a %xcc,.+8 !=
759 add c_3,t_2,c_3
760 lduw ap(5),a_5
761 mulx a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2);
762 addcc c_12,t_1,c_12 !=
763 bcs,a %xcc,.+8
764 add c_3,t_2,c_3
765 lduw ap(6),a_6
766 mulx a_5,b_0,t_1 !=!mul_add_c(a[5],b[0],c3,c1,c2);
767 addcc c_12,t_1,t_1
768 bcs,a %xcc,.+8
769 add c_3,t_2,c_3
770 srlx t_1,32,c_12 !=
771 stuw t_1,rp(5) !r[5]=c3;
772 or c_12,c_3,c_12
773
774 mulx a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3);
775 addcc c_12,t_1,c_12 !=
776 clr c_3
777 bcs,a %xcc,.+8
778 add c_3,t_2,c_3
779 mulx a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3);
780 addcc c_12,t_1,c_12
781 bcs,a %xcc,.+8
782 add c_3,t_2,c_3
783 mulx a_4,b_2,t_1 !=!mul_add_c(a[4],b[2],c1,c2,c3);
784 addcc c_12,t_1,c_12
785 bcs,a %xcc,.+8
786 add c_3,t_2,c_3
787 mulx a_3,b_3,t_1 !=!mul_add_c(a[3],b[3],c1,c2,c3);
788 addcc c_12,t_1,c_12
789 bcs,a %xcc,.+8
790 add c_3,t_2,c_3
791 mulx a_2,b_4,t_1 !=!mul_add_c(a[2],b[4],c1,c2,c3);
792 addcc c_12,t_1,c_12
793 bcs,a %xcc,.+8
794 add c_3,t_2,c_3
795 lduw bp(6),b_6 !=
796 mulx a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3);
797 addcc c_12,t_1,c_12
798 bcs,a %xcc,.+8
799 add c_3,t_2,c_3 !=
800 lduw bp(7),b_7
801 mulx a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3);
802 addcc c_12,t_1,t_1
803 bcs,a %xcc,.+8 !=
804 add c_3,t_2,c_3
805 srlx t_1,32,c_12
806 stuw t_1,rp(6) !r[6]=c1;
807 or c_12,c_3,c_12 !=
808
809 mulx a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1);
810 addcc c_12,t_1,c_12
811 clr c_3
812 bcs,a %xcc,.+8 !=
813 add c_3,t_2,c_3
814 mulx a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1);
815 addcc c_12,t_1,c_12
816 bcs,a %xcc,.+8 !=
817 add c_3,t_2,c_3
818 mulx a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1);
819 addcc c_12,t_1,c_12
820 bcs,a %xcc,.+8 !=
821 add c_3,t_2,c_3
822 mulx a_3,b_4,t_1 !mul_add_c(a[3],b[4],c2,c3,c1);
823 addcc c_12,t_1,c_12
824 bcs,a %xcc,.+8 !=
825 add c_3,t_2,c_3
826 mulx a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1);
827 addcc c_12,t_1,c_12
828 bcs,a %xcc,.+8 !=
829 add c_3,t_2,c_3
830 mulx a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1);
831 addcc c_12,t_1,c_12
832 bcs,a %xcc,.+8 !=
833 add c_3,t_2,c_3
834 lduw ap(7),a_7
835 mulx a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1);
836 addcc c_12,t_1,c_12
837 bcs,a %xcc,.+8
838 add c_3,t_2,c_3
839 mulx a_7,b_0,t_1 !=!mul_add_c(a[7],b[0],c2,c3,c1);
840 addcc c_12,t_1,t_1
841 bcs,a %xcc,.+8
842 add c_3,t_2,c_3
843 srlx t_1,32,c_12 !=
844 stuw t_1,rp(7) !r[7]=c2;
845 or c_12,c_3,c_12
846
847 mulx a_7,b_1,t_1 !=!mul_add_c(a[7],b[1],c3,c1,c2);
848 addcc c_12,t_1,c_12
849 clr c_3
850 bcs,a %xcc,.+8
851 add c_3,t_2,c_3 !=
852 mulx a_6,b_2,t_1 !mul_add_c(a[6],b[2],c3,c1,c2);
853 addcc c_12,t_1,c_12
854 bcs,a %xcc,.+8
855 add c_3,t_2,c_3 !=
856 mulx a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2);
857 addcc c_12,t_1,c_12
858 bcs,a %xcc,.+8
859 add c_3,t_2,c_3 !=
860 mulx a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2);
861 addcc c_12,t_1,c_12
862 bcs,a %xcc,.+8
863 add c_3,t_2,c_3 !=
864 mulx a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2);
865 addcc c_12,t_1,c_12
866 bcs,a %xcc,.+8
867 add c_3,t_2,c_3 !=
868 mulx a_2,b_6,t_1 !mul_add_c(a[2],b[6],c3,c1,c2);
869 addcc c_12,t_1,c_12
870 bcs,a %xcc,.+8
871 add c_3,t_2,c_3 !=
872 mulx a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2);
873 addcc c_12,t_1,t_1
874 bcs,a %xcc,.+8
875 add c_3,t_2,c_3 !=
876 srlx t_1,32,c_12
877 stuw t_1,rp(8) !r[8]=c3;
878 or c_12,c_3,c_12
879
880 mulx a_2,b_7,t_1 !=!mul_add_c(a[2],b[7],c1,c2,c3);
881 addcc c_12,t_1,c_12
882 clr c_3
883 bcs,a %xcc,.+8
884 add c_3,t_2,c_3 !=
885 mulx a_3,b_6,t_1 !mul_add_c(a[3],b[6],c1,c2,c3);
886 addcc c_12,t_1,c_12
887 bcs,a %xcc,.+8 !=
888 add c_3,t_2,c_3
889 mulx a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3);
890 addcc c_12,t_1,c_12
891 bcs,a %xcc,.+8 !=
892 add c_3,t_2,c_3
893 mulx a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3);
894 addcc c_12,t_1,c_12
895 bcs,a %xcc,.+8 !=
896 add c_3,t_2,c_3
897 mulx a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3);
898 addcc c_12,t_1,c_12
899 bcs,a %xcc,.+8 !=
900 add c_3,t_2,c_3
901 mulx a_7,b_2,t_1 !mul_add_c(a[7],b[2],c1,c2,c3);
902 addcc c_12,t_1,t_1
903 bcs,a %xcc,.+8 !=
904 add c_3,t_2,c_3
905 srlx t_1,32,c_12
906 stuw t_1,rp(9) !r[9]=c1;
907 or c_12,c_3,c_12 !=
908
909 mulx a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1);
910 addcc c_12,t_1,c_12
911 clr c_3
912 bcs,a %xcc,.+8 !=
913 add c_3,t_2,c_3
914 mulx a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1);
915 addcc c_12,t_1,c_12
916 bcs,a %xcc,.+8 !=
917 add c_3,t_2,c_3
918 mulx a_5,b_5,t_1 !mul_add_c(a[5],b[5],c2,c3,c1);
919 addcc c_12,t_1,c_12
920 bcs,a %xcc,.+8 !=
921 add c_3,t_2,c_3
922 mulx a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1);
923 addcc c_12,t_1,c_12
924 bcs,a %xcc,.+8 !=
925 add c_3,t_2,c_3
926 mulx a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1);
927 addcc c_12,t_1,t_1
928 bcs,a %xcc,.+8 !=
929 add c_3,t_2,c_3
930 srlx t_1,32,c_12
931 stuw t_1,rp(10) !r[10]=c2;
932 or c_12,c_3,c_12 !=
933
934 mulx a_4,b_7,t_1 !mul_add_c(a[4],b[7],c3,c1,c2);
935 addcc c_12,t_1,c_12
936 clr c_3
937 bcs,a %xcc,.+8 !=
938 add c_3,t_2,c_3
939 mulx a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2);
940 addcc c_12,t_1,c_12
941 bcs,a %xcc,.+8 !=
942 add c_3,t_2,c_3
943 mulx a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2);
944 addcc c_12,t_1,c_12
945 bcs,a %xcc,.+8 !=
946 add c_3,t_2,c_3
947 mulx a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2);
948 addcc c_12,t_1,t_1
949 bcs,a %xcc,.+8 !=
950 add c_3,t_2,c_3
951 srlx t_1,32,c_12
952 stuw t_1,rp(11) !r[11]=c3;
953 or c_12,c_3,c_12 !=
954
955 mulx a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3);
956 addcc c_12,t_1,c_12
957 clr c_3
958 bcs,a %xcc,.+8 !=
959 add c_3,t_2,c_3
960 mulx a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3);
961 addcc c_12,t_1,c_12
962 bcs,a %xcc,.+8 !=
963 add c_3,t_2,c_3
964 mulx a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3);
965 addcc c_12,t_1,t_1
966 bcs,a %xcc,.+8 !=
967 add c_3,t_2,c_3
968 srlx t_1,32,c_12
969 stuw t_1,rp(12) !r[12]=c1;
970 or c_12,c_3,c_12 !=
971
972 mulx a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1);
973 addcc c_12,t_1,c_12
974 clr c_3
975 bcs,a %xcc,.+8 !=
976 add c_3,t_2,c_3
977 mulx a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1);
978 addcc c_12,t_1,t_1
979 bcs,a %xcc,.+8 !=
980 add c_3,t_2,c_3
981 srlx t_1,32,c_12
982 st t_1,rp(13) !r[13]=c2;
983 or c_12,c_3,c_12 !=
984
985 mulx a_7,b_7,t_1 !mul_add_c(a[7],b[7],c3,c1,c2);
986 addcc c_12,t_1,t_1
987 srlx t_1,32,c_12 !=
988 stuw t_1,rp(14) !r[14]=c3;
989 stuw c_12,rp(15) !r[15]=c1;
990
991 ret
992 restore %g0,%g0,%o0 !=
993
994.type bn_mul_comba8,#function
995.size bn_mul_comba8,(.-bn_mul_comba8)
996
997.align 32
998
999.global bn_mul_comba4
1000/*
1001 * void bn_mul_comba4(r,a,b)
1002 * BN_ULONG *r,*a,*b;
1003 */
1004bn_mul_comba4:
1005 save %sp,FRAME_SIZE,%sp
1006 lduw ap(0),a_0
1007 mov 1,t_2
1008 lduw bp(0),b_0
1009 sllx t_2,32,t_2 !=
1010 lduw bp(1),b_1
1011 mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3);
1012 srlx t_1,32,c_12
1013 stuw t_1,rp(0) !=!r[0]=c1;
1014
1015 lduw ap(1),a_1
1016 mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1);
1017 addcc c_12,t_1,c_12
1018 clr c_3 !=
1019 bcs,a %xcc,.+8
1020 add c_3,t_2,c_3
1021 lduw ap(2),a_2
1022 mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
1023 addcc c_12,t_1,t_1
1024 bcs,a %xcc,.+8
1025 add c_3,t_2,c_3
1026 srlx t_1,32,c_12 !=
1027 stuw t_1,rp(1) !r[1]=c2;
1028 or c_12,c_3,c_12
1029
1030 mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
1031 addcc c_12,t_1,c_12 !=
1032 clr c_3
1033 bcs,a %xcc,.+8
1034 add c_3,t_2,c_3
1035 lduw bp(2),b_2 !=
1036 mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
1037 addcc c_12,t_1,c_12
1038 bcs,a %xcc,.+8
1039 add c_3,t_2,c_3 !=
1040 lduw bp(3),b_3
1041 mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
1042 addcc c_12,t_1,t_1
1043 bcs,a %xcc,.+8 !=
1044 add c_3,t_2,c_3
1045 srlx t_1,32,c_12
1046 stuw t_1,rp(2) !r[2]=c3;
1047 or c_12,c_3,c_12 !=
1048
1049 mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
1050 addcc c_12,t_1,c_12
1051 clr c_3
1052 bcs,a %xcc,.+8 !=
1053 add c_3,t_2,c_3
1054 mulx a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3);
1055 addcc c_12,t_1,c_12
1056 bcs,a %xcc,.+8 !=
1057 add c_3,t_2,c_3
1058 lduw ap(3),a_3
1059 mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
1060 addcc c_12,t_1,c_12 !=
1061 bcs,a %xcc,.+8
1062 add c_3,t_2,c_3
1063 mulx a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!=
1064 addcc c_12,t_1,t_1 !=
1065 bcs,a %xcc,.+8
1066 add c_3,t_2,c_3
1067 srlx t_1,32,c_12
1068 stuw t_1,rp(3) !=!r[3]=c1;
1069 or c_12,c_3,c_12
1070
1071 mulx a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
1072 addcc c_12,t_1,c_12
1073 clr c_3 !=
1074 bcs,a %xcc,.+8
1075 add c_3,t_2,c_3
1076 mulx a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1);
1077 addcc c_12,t_1,c_12 !=
1078 bcs,a %xcc,.+8
1079 add c_3,t_2,c_3
1080 mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
1081 addcc c_12,t_1,t_1 !=
1082 bcs,a %xcc,.+8
1083 add c_3,t_2,c_3
1084 srlx t_1,32,c_12
1085 stuw t_1,rp(4) !=!r[4]=c2;
1086 or c_12,c_3,c_12
1087
1088 mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
1089 addcc c_12,t_1,c_12
1090 clr c_3 !=
1091 bcs,a %xcc,.+8
1092 add c_3,t_2,c_3
1093 mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
1094 addcc c_12,t_1,t_1 !=
1095 bcs,a %xcc,.+8
1096 add c_3,t_2,c_3
1097 srlx t_1,32,c_12
1098 stuw t_1,rp(5) !=!r[5]=c3;
1099 or c_12,c_3,c_12
1100
1101 mulx a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
1102 addcc c_12,t_1,t_1
1103 srlx t_1,32,c_12 !=
1104 stuw t_1,rp(6) !r[6]=c1;
1105 stuw c_12,rp(7) !r[7]=c2;
1106
1107 ret
1108 restore %g0,%g0,%o0
1109
1110.type bn_mul_comba4,#function
1111.size bn_mul_comba4,(.-bn_mul_comba4)
1112
1113.align 32
1114
1115.global bn_sqr_comba8
1116bn_sqr_comba8:
1117 save %sp,FRAME_SIZE,%sp
1118 mov 1,t_2
1119 lduw ap(0),a_0
1120 sllx t_2,32,t_2
1121 lduw ap(1),a_1
1122 mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3);
1123 srlx t_1,32,c_12
1124 stuw t_1,rp(0) !r[0]=c1;
1125
1126 lduw ap(2),a_2
1127 mulx a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
1128 addcc c_12,t_1,c_12
1129 clr c_3
1130 bcs,a %xcc,.+8
1131 add c_3,t_2,c_3
1132 addcc c_12,t_1,t_1
1133 bcs,a %xcc,.+8
1134 add c_3,t_2,c_3
1135 srlx t_1,32,c_12
1136 stuw t_1,rp(1) !r[1]=c2;
1137 or c_12,c_3,c_12
1138
1139 mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1140 addcc c_12,t_1,c_12
1141 clr c_3
1142 bcs,a %xcc,.+8
1143 add c_3,t_2,c_3
1144 addcc c_12,t_1,c_12
1145 bcs,a %xcc,.+8
1146 add c_3,t_2,c_3
1147 lduw ap(3),a_3
1148 mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1149 addcc c_12,t_1,t_1
1150 bcs,a %xcc,.+8
1151 add c_3,t_2,c_3
1152 srlx t_1,32,c_12
1153 stuw t_1,rp(2) !r[2]=c3;
1154 or c_12,c_3,c_12
1155
1156 mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
1157 addcc c_12,t_1,c_12
1158 clr c_3
1159 bcs,a %xcc,.+8
1160 add c_3,t_2,c_3
1161 addcc c_12,t_1,c_12
1162 bcs,a %xcc,.+8
1163 add c_3,t_2,c_3
1164 lduw ap(4),a_4
1165 mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1166 addcc c_12,t_1,c_12
1167 bcs,a %xcc,.+8
1168 add c_3,t_2,c_3
1169 addcc c_12,t_1,t_1
1170 bcs,a %xcc,.+8
1171 add c_3,t_2,c_3
1172 srlx t_1,32,c_12
1173 st t_1,rp(3) !r[3]=c1;
1174 or c_12,c_3,c_12
1175
1176 mulx a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1);
1177 addcc c_12,t_1,c_12
1178 clr c_3
1179 bcs,a %xcc,.+8
1180 add c_3,t_2,c_3
1181 addcc c_12,t_1,c_12
1182 bcs,a %xcc,.+8
1183 add c_3,t_2,c_3
1184 mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1185 addcc c_12,t_1,c_12
1186 bcs,a %xcc,.+8
1187 add c_3,t_2,c_3
1188 addcc c_12,t_1,c_12
1189 bcs,a %xcc,.+8
1190 add c_3,t_2,c_3
1191 lduw ap(5),a_5
1192 mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1193 addcc c_12,t_1,t_1
1194 bcs,a %xcc,.+8
1195 add c_3,t_2,c_3
1196 srlx t_1,32,c_12
1197 stuw t_1,rp(4) !r[4]=c2;
1198 or c_12,c_3,c_12
1199
1200 mulx a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2);
1201 addcc c_12,t_1,c_12
1202 clr c_3
1203 bcs,a %xcc,.+8
1204 add c_3,t_2,c_3
1205 addcc c_12,t_1,c_12
1206 bcs,a %xcc,.+8
1207 add c_3,t_2,c_3
1208 mulx a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2);
1209 addcc c_12,t_1,c_12
1210 bcs,a %xcc,.+8
1211 add c_3,t_2,c_3
1212 addcc c_12,t_1,c_12
1213 bcs,a %xcc,.+8
1214 add c_3,t_2,c_3
1215 lduw ap(6),a_6
1216 mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
1217 addcc c_12,t_1,c_12
1218 bcs,a %xcc,.+8
1219 add c_3,t_2,c_3
1220 addcc c_12,t_1,t_1
1221 bcs,a %xcc,.+8
1222 add c_3,t_2,c_3
1223 srlx t_1,32,c_12
1224 stuw t_1,rp(5) !r[5]=c3;
1225 or c_12,c_3,c_12
1226
1227 mulx a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3);
1228 addcc c_12,t_1,c_12
1229 clr c_3
1230 bcs,a %xcc,.+8
1231 add c_3,t_2,c_3
1232 addcc c_12,t_1,c_12
1233 bcs,a %xcc,.+8
1234 add c_3,t_2,c_3
1235 mulx a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3);
1236 addcc c_12,t_1,c_12
1237 bcs,a %xcc,.+8
1238 add c_3,t_2,c_3
1239 addcc c_12,t_1,c_12
1240 bcs,a %xcc,.+8
1241 add c_3,t_2,c_3
1242 mulx a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3);
1243 addcc c_12,t_1,c_12
1244 bcs,a %xcc,.+8
1245 add c_3,t_2,c_3
1246 addcc c_12,t_1,c_12
1247 bcs,a %xcc,.+8
1248 add c_3,t_2,c_3
1249 lduw ap(7),a_7
1250 mulx a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3);
1251 addcc c_12,t_1,t_1
1252 bcs,a %xcc,.+8
1253 add c_3,t_2,c_3
1254 srlx t_1,32,c_12
1255 stuw t_1,rp(6) !r[6]=c1;
1256 or c_12,c_3,c_12
1257
1258 mulx a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1);
1259 addcc c_12,t_1,c_12
1260 clr c_3
1261 bcs,a %xcc,.+8
1262 add c_3,t_2,c_3
1263 addcc c_12,t_1,c_12
1264 bcs,a %xcc,.+8
1265 add c_3,t_2,c_3
1266 mulx a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1);
1267 addcc c_12,t_1,c_12
1268 bcs,a %xcc,.+8
1269 add c_3,t_2,c_3
1270 addcc c_12,t_1,c_12
1271 bcs,a %xcc,.+8
1272 add c_3,t_2,c_3
1273 mulx a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1);
1274 addcc c_12,t_1,c_12
1275 bcs,a %xcc,.+8
1276 add c_3,t_2,c_3
1277 addcc c_12,t_1,c_12
1278 bcs,a %xcc,.+8
1279 add c_3,t_2,c_3
1280 mulx a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1);
1281 addcc c_12,t_1,c_12
1282 bcs,a %xcc,.+8
1283 add c_3,t_2,c_3
1284 addcc c_12,t_1,t_1
1285 bcs,a %xcc,.+8
1286 add c_3,t_2,c_3
1287 srlx t_1,32,c_12
1288 stuw t_1,rp(7) !r[7]=c2;
1289 or c_12,c_3,c_12
1290
1291 mulx a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2);
1292 addcc c_12,t_1,c_12
1293 clr c_3
1294 bcs,a %xcc,.+8
1295 add c_3,t_2,c_3
1296 addcc c_12,t_1,c_12
1297 bcs,a %xcc,.+8
1298 add c_3,t_2,c_3
1299 mulx a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2);
1300 addcc c_12,t_1,c_12
1301 bcs,a %xcc,.+8
1302 add c_3,t_2,c_3
1303 addcc c_12,t_1,c_12
1304 bcs,a %xcc,.+8
1305 add c_3,t_2,c_3
1306 mulx a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2);
1307 addcc c_12,t_1,c_12
1308 bcs,a %xcc,.+8
1309 add c_3,t_2,c_3
1310 addcc c_12,t_1,c_12
1311 bcs,a %xcc,.+8
1312 add c_3,t_2,c_3
1313 mulx a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2);
1314 addcc c_12,t_1,t_1
1315 bcs,a %xcc,.+8
1316 add c_3,t_2,c_3
1317 srlx t_1,32,c_12
1318 stuw t_1,rp(8) !r[8]=c3;
1319 or c_12,c_3,c_12
1320
1321 mulx a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3);
1322 addcc c_12,t_1,c_12
1323 clr c_3
1324 bcs,a %xcc,.+8
1325 add c_3,t_2,c_3
1326 addcc c_12,t_1,c_12
1327 bcs,a %xcc,.+8
1328 add c_3,t_2,c_3
1329 mulx a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3);
1330 addcc c_12,t_1,c_12
1331 bcs,a %xcc,.+8
1332 add c_3,t_2,c_3
1333 addcc c_12,t_1,c_12
1334 bcs,a %xcc,.+8
1335 add c_3,t_2,c_3
1336 mulx a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3);
1337 addcc c_12,t_1,c_12
1338 bcs,a %xcc,.+8
1339 add c_3,t_2,c_3
1340 addcc c_12,t_1,t_1
1341 bcs,a %xcc,.+8
1342 add c_3,t_2,c_3
1343 srlx t_1,32,c_12
1344 stuw t_1,rp(9) !r[9]=c1;
1345 or c_12,c_3,c_12
1346
1347 mulx a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1);
1348 addcc c_12,t_1,c_12
1349 clr c_3
1350 bcs,a %xcc,.+8
1351 add c_3,t_2,c_3
1352 addcc c_12,t_1,c_12
1353 bcs,a %xcc,.+8
1354 add c_3,t_2,c_3
1355 mulx a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1);
1356 addcc c_12,t_1,c_12
1357 bcs,a %xcc,.+8
1358 add c_3,t_2,c_3
1359 addcc c_12,t_1,c_12
1360 bcs,a %xcc,.+8
1361 add c_3,t_2,c_3
1362 mulx a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1);
1363 addcc c_12,t_1,t_1
1364 bcs,a %xcc,.+8
1365 add c_3,t_2,c_3
1366 srlx t_1,32,c_12
1367 stuw t_1,rp(10) !r[10]=c2;
1368 or c_12,c_3,c_12
1369
1370 mulx a_4,a_7,t_1 !sqr_add_c2(a,7,4,c3,c1,c2);
1371 addcc c_12,t_1,c_12
1372 clr c_3
1373 bcs,a %xcc,.+8
1374 add c_3,t_2,c_3
1375 addcc c_12,t_1,c_12
1376 bcs,a %xcc,.+8
1377 add c_3,t_2,c_3
1378 mulx a_5,a_6,t_1 !sqr_add_c2(a,6,5,c3,c1,c2);
1379 addcc c_12,t_1,c_12
1380 bcs,a %xcc,.+8
1381 add c_3,t_2,c_3
1382 addcc c_12,t_1,t_1
1383 bcs,a %xcc,.+8
1384 add c_3,t_2,c_3
1385 srlx t_1,32,c_12
1386 stuw t_1,rp(11) !r[11]=c3;
1387 or c_12,c_3,c_12
1388
1389 mulx a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3);
1390 addcc c_12,t_1,c_12
1391 clr c_3
1392 bcs,a %xcc,.+8
1393 add c_3,t_2,c_3
1394 addcc c_12,t_1,c_12
1395 bcs,a %xcc,.+8
1396 add c_3,t_2,c_3
1397 mulx a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3);
1398 addcc c_12,t_1,t_1
1399 bcs,a %xcc,.+8
1400 add c_3,t_2,c_3
1401 srlx t_1,32,c_12
1402 stuw t_1,rp(12) !r[12]=c1;
1403 or c_12,c_3,c_12
1404
1405 mulx a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1);
1406 addcc c_12,t_1,c_12
1407 clr c_3
1408 bcs,a %xcc,.+8
1409 add c_3,t_2,c_3
1410 addcc c_12,t_1,t_1
1411 bcs,a %xcc,.+8
1412 add c_3,t_2,c_3
1413 srlx t_1,32,c_12
1414 stuw t_1,rp(13) !r[13]=c2;
1415 or c_12,c_3,c_12
1416
1417 mulx a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2);
1418 addcc c_12,t_1,t_1
1419 srlx t_1,32,c_12
1420 stuw t_1,rp(14) !r[14]=c3;
1421 stuw c_12,rp(15) !r[15]=c1;
1422
1423 ret
1424 restore %g0,%g0,%o0
1425
1426.type bn_sqr_comba8,#function
1427.size bn_sqr_comba8,(.-bn_sqr_comba8)
1428
1429.align 32
1430
1431.global bn_sqr_comba4
1432/*
1433 * void bn_sqr_comba4(r,a)
1434 * BN_ULONG *r,*a;
1435 */
1436bn_sqr_comba4:
1437 save %sp,FRAME_SIZE,%sp
1438 mov 1,t_2
1439 lduw ap(0),a_0
1440 sllx t_2,32,t_2
1441 lduw ap(1),a_1
1442 mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3);
1443 srlx t_1,32,c_12
1444 stuw t_1,rp(0) !r[0]=c1;
1445
1446 lduw ap(2),a_2
1447 mulx a_0,a_1,t_1 !sqr_add_c2(a,1,0,c2,c3,c1);
1448 addcc c_12,t_1,c_12
1449 clr c_3
1450 bcs,a %xcc,.+8
1451 add c_3,t_2,c_3
1452 addcc c_12,t_1,t_1
1453 bcs,a %xcc,.+8
1454 add c_3,t_2,c_3
1455 srlx t_1,32,c_12
1456 stuw t_1,rp(1) !r[1]=c2;
1457 or c_12,c_3,c_12
1458
1459 mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1460 addcc c_12,t_1,c_12
1461 clr c_3
1462 bcs,a %xcc,.+8
1463 add c_3,t_2,c_3
1464 addcc c_12,t_1,c_12
1465 bcs,a %xcc,.+8
1466 add c_3,t_2,c_3
1467 lduw ap(3),a_3
1468 mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1469 addcc c_12,t_1,t_1
1470 bcs,a %xcc,.+8
1471 add c_3,t_2,c_3
1472 srlx t_1,32,c_12
1473 stuw t_1,rp(2) !r[2]=c3;
1474 or c_12,c_3,c_12
1475
1476 mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
1477 addcc c_12,t_1,c_12
1478 clr c_3
1479 bcs,a %xcc,.+8
1480 add c_3,t_2,c_3
1481 addcc c_12,t_1,c_12
1482 bcs,a %xcc,.+8
1483 add c_3,t_2,c_3
1484 mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1485 addcc c_12,t_1,c_12
1486 bcs,a %xcc,.+8
1487 add c_3,t_2,c_3
1488 addcc c_12,t_1,t_1
1489 bcs,a %xcc,.+8
1490 add c_3,t_2,c_3
1491 srlx t_1,32,c_12
1492 stuw t_1,rp(3) !r[3]=c1;
1493 or c_12,c_3,c_12
1494
1495 mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1496 addcc c_12,t_1,c_12
1497 clr c_3
1498 bcs,a %xcc,.+8
1499 add c_3,t_2,c_3
1500 addcc c_12,t_1,c_12
1501 bcs,a %xcc,.+8
1502 add c_3,t_2,c_3
1503 mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1504 addcc c_12,t_1,t_1
1505 bcs,a %xcc,.+8
1506 add c_3,t_2,c_3
1507 srlx t_1,32,c_12
1508 stuw t_1,rp(4) !r[4]=c2;
1509 or c_12,c_3,c_12
1510
1511 mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
1512 addcc c_12,t_1,c_12
1513 clr c_3
1514 bcs,a %xcc,.+8
1515 add c_3,t_2,c_3
1516 addcc c_12,t_1,t_1
1517 bcs,a %xcc,.+8
1518 add c_3,t_2,c_3
1519 srlx t_1,32,c_12
1520 stuw t_1,rp(5) !r[5]=c3;
1521 or c_12,c_3,c_12
1522
1523 mulx a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3);
1524 addcc c_12,t_1,t_1
1525 srlx t_1,32,c_12
1526 stuw t_1,rp(6) !r[6]=c1;
1527 stuw c_12,rp(7) !r[7]=c2;
1528
1529 ret
1530 restore %g0,%g0,%o0
1531
1532.type bn_sqr_comba4,#function
1533.size bn_sqr_comba4,(.-bn_sqr_comba4)
1534
1535.align 32
diff --git a/src/lib/libcrypto/bn/asm/x86.pl b/src/lib/libcrypto/bn/asm/x86.pl
new file mode 100644
index 0000000000..1bc4f1bb27
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/x86.pl
@@ -0,0 +1,28 @@
1#!/usr/local/bin/perl
2
3push(@INC,"perlasm","../../perlasm");
4require "x86asm.pl";
5
6require("x86/mul_add.pl");
7require("x86/mul.pl");
8require("x86/sqr.pl");
9require("x86/div.pl");
10require("x86/add.pl");
11require("x86/sub.pl");
12require("x86/comba.pl");
13
14&asm_init($ARGV[0],$0);
15
16&bn_mul_add_words("bn_mul_add_words");
17&bn_mul_words("bn_mul_words");
18&bn_sqr_words("bn_sqr_words");
19&bn_div_words("bn_div_words");
20&bn_add_words("bn_add_words");
21&bn_sub_words("bn_sub_words");
22&bn_mul_comba("bn_mul_comba8",8);
23&bn_mul_comba("bn_mul_comba4",4);
24&bn_sqr_comba("bn_sqr_comba8",8);
25&bn_sqr_comba("bn_sqr_comba4",4);
26
27&asm_finish();
28
diff --git a/src/lib/libcrypto/bn/asm/x86/add.pl b/src/lib/libcrypto/bn/asm/x86/add.pl
new file mode 100644
index 0000000000..0b5cf583e3
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/x86/add.pl
@@ -0,0 +1,76 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub bn_add_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9
10 &comment("");
11 $a="esi";
12 $b="edi";
13 $c="eax";
14 $r="ebx";
15 $tmp1="ecx";
16 $tmp2="edx";
17 $num="ebp";
18
19 &mov($r,&wparam(0)); # get r
20 &mov($a,&wparam(1)); # get a
21 &mov($b,&wparam(2)); # get b
22 &mov($num,&wparam(3)); # get num
23 &xor($c,$c); # clear carry
24 &and($num,0xfffffff8); # num / 8
25
26 &jz(&label("aw_finish"));
27
28 &set_label("aw_loop",0);
29 for ($i=0; $i<8; $i++)
30 {
31 &comment("Round $i");
32
33 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
34 &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
35 &add($tmp1,$c);
36 &mov($c,0);
37 &adc($c,$c);
38 &add($tmp1,$tmp2);
39 &adc($c,0);
40 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
41 }
42
43 &comment("");
44 &add($a,32);
45 &add($b,32);
46 &add($r,32);
47 &sub($num,8);
48 &jnz(&label("aw_loop"));
49
50 &set_label("aw_finish",0);
51 &mov($num,&wparam(3)); # get num
52 &and($num,7);
53 &jz(&label("aw_end"));
54
55 for ($i=0; $i<7; $i++)
56 {
57 &comment("Tail Round $i");
58 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
59 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
60 &add($tmp1,$c);
61 &mov($c,0);
62 &adc($c,$c);
63 &add($tmp1,$tmp2);
64 &adc($c,0);
65 &dec($num) if ($i != 6);
66 &mov(&DWP($i*4,$r,"",0),$tmp1); # *a
67 &jz(&label("aw_end")) if ($i != 6);
68 }
69 &set_label("aw_end",0);
70
71# &mov("eax",$c); # $c is "eax"
72
73 &function_end($name);
74 }
75
761;
diff --git a/src/lib/libcrypto/bn/asm/x86/comba.pl b/src/lib/libcrypto/bn/asm/x86/comba.pl
new file mode 100644
index 0000000000..2291253629
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/x86/comba.pl
@@ -0,0 +1,277 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub mul_add_c
5 {
6 local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
7
8 # pos == -1 if eax and edx are pre-loaded, 0 to load from next
9 # words, and 1 if load return value
10
11 &comment("mul a[$ai]*b[$bi]");
12
13 # "eax" and "edx" will always be pre-loaded.
14 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
15 # &mov("edx",&DWP($bi*4,$b,"",0));
16
17 &mul("edx");
18 &add($c0,"eax");
19 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a
20 &mov("eax",&wparam(0)) if $pos > 0; # load r[]
21 ###
22 &adc($c1,"edx");
23 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b
24 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
25 ###
26 &adc($c2,0);
27 # is pos > 1, it means it is the last loop
28 &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
29 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
30 }
31
32sub sqr_add_c
33 {
34 local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
35
36 # pos == -1 if eax and edx are pre-loaded, 0 to load from next
37 # words, and 1 if load return value
38
39 &comment("sqr a[$ai]*a[$bi]");
40
41 # "eax" and "edx" will always be pre-loaded.
42 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
43 # &mov("edx",&DWP($bi*4,$b,"",0));
44
45 if ($ai == $bi)
46 { &mul("eax");}
47 else
48 { &mul("edx");}
49 &add($c0,"eax");
50 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
51 ###
52 &adc($c1,"edx");
53 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
54 ###
55 &adc($c2,0);
56 # is pos > 1, it means it is the last loop
57 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
58 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
59 }
60
61sub sqr_add_c2
62 {
63 local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
64
65 # pos == -1 if eax and edx are pre-loaded, 0 to load from next
66 # words, and 1 if load return value
67
68 &comment("sqr a[$ai]*a[$bi]");
69
70 # "eax" and "edx" will always be pre-loaded.
71 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
72 # &mov("edx",&DWP($bi*4,$a,"",0));
73
74 if ($ai == $bi)
75 { &mul("eax");}
76 else
77 { &mul("edx");}
78 &add("eax","eax");
79 ###
80 &adc("edx","edx");
81 ###
82 &adc($c2,0);
83 &add($c0,"eax");
84 &adc($c1,"edx");
85 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
86 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
87 &adc($c2,0);
88 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
89 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
90 ###
91 }
92
93sub bn_mul_comba
94 {
95 local($name,$num)=@_;
96 local($a,$b,$c0,$c1,$c2);
97 local($i,$as,$ae,$bs,$be,$ai,$bi);
98 local($tot,$end);
99
100 &function_begin_B($name,"");
101
102 $c0="ebx";
103 $c1="ecx";
104 $c2="ebp";
105 $a="esi";
106 $b="edi";
107
108 $as=0;
109 $ae=0;
110 $bs=0;
111 $be=0;
112 $tot=$num+$num-1;
113
114 &push("esi");
115 &mov($a,&wparam(1));
116 &push("edi");
117 &mov($b,&wparam(2));
118 &push("ebp");
119 &push("ebx");
120
121 &xor($c0,$c0);
122 &mov("eax",&DWP(0,$a,"",0)); # load the first word
123 &xor($c1,$c1);
124 &mov("edx",&DWP(0,$b,"",0)); # load the first second
125
126 for ($i=0; $i<$tot; $i++)
127 {
128 $ai=$as;
129 $bi=$bs;
130 $end=$be+1;
131
132 &comment("################## Calculate word $i");
133
134 for ($j=$bs; $j<$end; $j++)
135 {
136 &xor($c2,$c2) if ($j == $bs);
137 if (($j+1) == $end)
138 {
139 $v=1;
140 $v=2 if (($i+1) == $tot);
141 }
142 else
143 { $v=0; }
144 if (($j+1) != $end)
145 {
146 $na=($ai-1);
147 $nb=($bi+1);
148 }
149 else
150 {
151 $na=$as+($i < ($num-1));
152 $nb=$bs+($i >= ($num-1));
153 }
154#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
155 &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
156 if ($v)
157 {
158 &comment("saved r[$i]");
159 # &mov("eax",&wparam(0));
160 # &mov(&DWP($i*4,"eax","",0),$c0);
161 ($c0,$c1,$c2)=($c1,$c2,$c0);
162 }
163 $ai--;
164 $bi++;
165 }
166 $as++ if ($i < ($num-1));
167 $ae++ if ($i >= ($num-1));
168
169 $bs++ if ($i >= ($num-1));
170 $be++ if ($i < ($num-1));
171 }
172 &comment("save r[$i]");
173 # &mov("eax",&wparam(0));
174 &mov(&DWP($i*4,"eax","",0),$c0);
175
176 &pop("ebx");
177 &pop("ebp");
178 &pop("edi");
179 &pop("esi");
180 &ret();
181 &function_end_B($name);
182 }
183
184sub bn_sqr_comba
185 {
186 local($name,$num)=@_;
187 local($r,$a,$c0,$c1,$c2)=@_;
188 local($i,$as,$ae,$bs,$be,$ai,$bi);
189 local($b,$tot,$end,$half);
190
191 &function_begin_B($name,"");
192
193 $c0="ebx";
194 $c1="ecx";
195 $c2="ebp";
196 $a="esi";
197 $r="edi";
198
199 &push("esi");
200 &push("edi");
201 &push("ebp");
202 &push("ebx");
203 &mov($r,&wparam(0));
204 &mov($a,&wparam(1));
205 &xor($c0,$c0);
206 &xor($c1,$c1);
207 &mov("eax",&DWP(0,$a,"",0)); # load the first word
208
209 $as=0;
210 $ae=0;
211 $bs=0;
212 $be=0;
213 $tot=$num+$num-1;
214
215 for ($i=0; $i<$tot; $i++)
216 {
217 $ai=$as;
218 $bi=$bs;
219 $end=$be+1;
220
221 &comment("############### Calculate word $i");
222 for ($j=$bs; $j<$end; $j++)
223 {
224 &xor($c2,$c2) if ($j == $bs);
225 if (($ai-1) < ($bi+1))
226 {
227 $v=1;
228 $v=2 if ($i+1) == $tot;
229 }
230 else
231 { $v=0; }
232 if (!$v)
233 {
234 $na=$ai-1;
235 $nb=$bi+1;
236 }
237 else
238 {
239 $na=$as+($i < ($num-1));
240 $nb=$bs+($i >= ($num-1));
241 }
242 if ($ai == $bi)
243 {
244 &sqr_add_c($r,$a,$ai,$bi,
245 $c0,$c1,$c2,$v,$i,$na,$nb);
246 }
247 else
248 {
249 &sqr_add_c2($r,$a,$ai,$bi,
250 $c0,$c1,$c2,$v,$i,$na,$nb);
251 }
252 if ($v)
253 {
254 &comment("saved r[$i]");
255 #&mov(&DWP($i*4,$r,"",0),$c0);
256 ($c0,$c1,$c2)=($c1,$c2,$c0);
257 last;
258 }
259 $ai--;
260 $bi++;
261 }
262 $as++ if ($i < ($num-1));
263 $ae++ if ($i >= ($num-1));
264
265 $bs++ if ($i >= ($num-1));
266 $be++ if ($i < ($num-1));
267 }
268 &mov(&DWP($i*4,$r,"",0),$c0);
269 &pop("ebx");
270 &pop("ebp");
271 &pop("edi");
272 &pop("esi");
273 &ret();
274 &function_end_B($name);
275 }
276
2771;
diff --git a/src/lib/libcrypto/bn/asm/x86/div.pl b/src/lib/libcrypto/bn/asm/x86/div.pl
new file mode 100644
index 0000000000..0e90152caa
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/x86/div.pl
@@ -0,0 +1,15 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub bn_div_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9 &mov("edx",&wparam(0)); #
10 &mov("eax",&wparam(1)); #
11 &mov("ebx",&wparam(2)); #
12 &div("ebx");
13 &function_end($name);
14 }
151;
diff --git a/src/lib/libcrypto/bn/asm/x86/mul.pl b/src/lib/libcrypto/bn/asm/x86/mul.pl
new file mode 100644
index 0000000000..674cb9b055
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/x86/mul.pl
@@ -0,0 +1,77 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub bn_mul_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9
10 &comment("");
11 $Low="eax";
12 $High="edx";
13 $a="ebx";
14 $w="ecx";
15 $r="edi";
16 $c="esi";
17 $num="ebp";
18
19 &xor($c,$c); # clear carry
20 &mov($r,&wparam(0)); #
21 &mov($a,&wparam(1)); #
22 &mov($num,&wparam(2)); #
23 &mov($w,&wparam(3)); #
24
25 &and($num,0xfffffff8); # num / 8
26 &jz(&label("mw_finish"));
27
28 &set_label("mw_loop",0);
29 for ($i=0; $i<32; $i+=4)
30 {
31 &comment("Round $i");
32
33 &mov("eax",&DWP($i,$a,"",0)); # *a
34 &mul($w); # *a * w
35 &add("eax",$c); # L(t)+=c
36 # XXX
37
38 &adc("edx",0); # H(t)+=carry
39 &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
40
41 &mov($c,"edx"); # c= H(t);
42 }
43
44 &comment("");
45 &add($a,32);
46 &add($r,32);
47 &sub($num,8);
48 &jz(&label("mw_finish"));
49 &jmp(&label("mw_loop"));
50
51 &set_label("mw_finish",0);
52 &mov($num,&wparam(2)); # get num
53 &and($num,7);
54 &jnz(&label("mw_finish2"));
55 &jmp(&label("mw_end"));
56
57 &set_label("mw_finish2",1);
58 for ($i=0; $i<7; $i++)
59 {
60 &comment("Tail Round $i");
61 &mov("eax",&DWP($i*4,$a,"",0));# *a
62 &mul($w); # *a * w
63 &add("eax",$c); # L(t)+=c
64 # XXX
65 &adc("edx",0); # H(t)+=carry
66 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
67 &mov($c,"edx"); # c= H(t);
68 &dec($num) if ($i != 7-1);
69 &jz(&label("mw_end")) if ($i != 7-1);
70 }
71 &set_label("mw_end",0);
72 &mov("eax",$c);
73
74 &function_end($name);
75 }
76
771;
diff --git a/src/lib/libcrypto/bn/asm/x86/mul_add.pl b/src/lib/libcrypto/bn/asm/x86/mul_add.pl
new file mode 100644
index 0000000000..61830d3a90
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/x86/mul_add.pl
@@ -0,0 +1,87 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub bn_mul_add_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9
10 &comment("");
11 $Low="eax";
12 $High="edx";
13 $a="ebx";
14 $w="ebp";
15 $r="edi";
16 $c="esi";
17
18 &xor($c,$c); # clear carry
19 &mov($r,&wparam(0)); #
20
21 &mov("ecx",&wparam(2)); #
22 &mov($a,&wparam(1)); #
23
24 &and("ecx",0xfffffff8); # num / 8
25 &mov($w,&wparam(3)); #
26
27 &push("ecx"); # Up the stack for a tmp variable
28
29 &jz(&label("maw_finish"));
30
31 &set_label("maw_loop",0);
32
33 &mov(&swtmp(0),"ecx"); #
34
35 for ($i=0; $i<32; $i+=4)
36 {
37 &comment("Round $i");
38
39 &mov("eax",&DWP($i,$a,"",0)); # *a
40 &mul($w); # *a * w
41 &add("eax",$c); # L(t)+= *r
42 &mov($c,&DWP($i,$r,"",0)); # L(t)+= *r
43 &adc("edx",0); # H(t)+=carry
44 &add("eax",$c); # L(t)+=c
45 &adc("edx",0); # H(t)+=carry
46 &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
47 &mov($c,"edx"); # c= H(t);
48 }
49
50 &comment("");
51 &mov("ecx",&swtmp(0)); #
52 &add($a,32);
53 &add($r,32);
54 &sub("ecx",8);
55 &jnz(&label("maw_loop"));
56
57 &set_label("maw_finish",0);
58 &mov("ecx",&wparam(2)); # get num
59 &and("ecx",7);
60 &jnz(&label("maw_finish2")); # helps branch prediction
61 &jmp(&label("maw_end"));
62
63 &set_label("maw_finish2",1);
64 for ($i=0; $i<7; $i++)
65 {
66 &comment("Tail Round $i");
67 &mov("eax",&DWP($i*4,$a,"",0));# *a
68 &mul($w); # *a * w
69 &add("eax",$c); # L(t)+=c
70 &mov($c,&DWP($i*4,$r,"",0)); # L(t)+= *r
71 &adc("edx",0); # H(t)+=carry
72 &add("eax",$c);
73 &adc("edx",0); # H(t)+=carry
74 &dec("ecx") if ($i != 7-1);
75 &mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t);
76 &mov($c,"edx"); # c= H(t);
77 &jz(&label("maw_end")) if ($i != 7-1);
78 }
79 &set_label("maw_end",0);
80 &mov("eax",$c);
81
82 &pop("ecx"); # clear variable from
83
84 &function_end($name);
85 }
86
871;
diff --git a/src/lib/libcrypto/bn/asm/x86/sqr.pl b/src/lib/libcrypto/bn/asm/x86/sqr.pl
new file mode 100644
index 0000000000..1f90993cf6
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/x86/sqr.pl
@@ -0,0 +1,60 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub bn_sqr_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9
10 &comment("");
11 $r="esi";
12 $a="edi";
13 $num="ebx";
14
15 &mov($r,&wparam(0)); #
16 &mov($a,&wparam(1)); #
17 &mov($num,&wparam(2)); #
18
19 &and($num,0xfffffff8); # num / 8
20 &jz(&label("sw_finish"));
21
22 &set_label("sw_loop",0);
23 for ($i=0; $i<32; $i+=4)
24 {
25 &comment("Round $i");
26 &mov("eax",&DWP($i,$a,"",0)); # *a
27 # XXX
28 &mul("eax"); # *a * *a
29 &mov(&DWP($i*2,$r,"",0),"eax"); #
30 &mov(&DWP($i*2+4,$r,"",0),"edx");#
31 }
32
33 &comment("");
34 &add($a,32);
35 &add($r,64);
36 &sub($num,8);
37 &jnz(&label("sw_loop"));
38
39 &set_label("sw_finish",0);
40 &mov($num,&wparam(2)); # get num
41 &and($num,7);
42 &jz(&label("sw_end"));
43
44 for ($i=0; $i<7; $i++)
45 {
46 &comment("Tail Round $i");
47 &mov("eax",&DWP($i*4,$a,"",0)); # *a
48 # XXX
49 &mul("eax"); # *a * *a
50 &mov(&DWP($i*8,$r,"",0),"eax"); #
51 &dec($num) if ($i != 7-1);
52 &mov(&DWP($i*8+4,$r,"",0),"edx");
53 &jz(&label("sw_end")) if ($i != 7-1);
54 }
55 &set_label("sw_end",0);
56
57 &function_end($name);
58 }
59
601;
diff --git a/src/lib/libcrypto/bn/asm/x86/sub.pl b/src/lib/libcrypto/bn/asm/x86/sub.pl
new file mode 100644
index 0000000000..837b0e1b07
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/x86/sub.pl
@@ -0,0 +1,76 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub bn_sub_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9
10 &comment("");
11 $a="esi";
12 $b="edi";
13 $c="eax";
14 $r="ebx";
15 $tmp1="ecx";
16 $tmp2="edx";
17 $num="ebp";
18
19 &mov($r,&wparam(0)); # get r
20 &mov($a,&wparam(1)); # get a
21 &mov($b,&wparam(2)); # get b
22 &mov($num,&wparam(3)); # get num
23 &xor($c,$c); # clear carry
24 &and($num,0xfffffff8); # num / 8
25
26 &jz(&label("aw_finish"));
27
28 &set_label("aw_loop",0);
29 for ($i=0; $i<8; $i++)
30 {
31 &comment("Round $i");
32
33 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
34 &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
35 &sub($tmp1,$c);
36 &mov($c,0);
37 &adc($c,$c);
38 &sub($tmp1,$tmp2);
39 &adc($c,0);
40 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
41 }
42
43 &comment("");
44 &add($a,32);
45 &add($b,32);
46 &add($r,32);
47 &sub($num,8);
48 &jnz(&label("aw_loop"));
49
50 &set_label("aw_finish",0);
51 &mov($num,&wparam(3)); # get num
52 &and($num,7);
53 &jz(&label("aw_end"));
54
55 for ($i=0; $i<7; $i++)
56 {
57 &comment("Tail Round $i");
58 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
59 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
60 &sub($tmp1,$c);
61 &mov($c,0);
62 &adc($c,$c);
63 &sub($tmp1,$tmp2);
64 &adc($c,0);
65 &dec($num) if ($i != 6);
66 &mov(&DWP($i*4,$r,"",0),$tmp1); # *a
67 &jz(&label("aw_end")) if ($i != 6);
68 }
69 &set_label("aw_end",0);
70
71# &mov("eax",$c); # $c is "eax"
72
73 &function_end($name);
74 }
75
761;
diff --git a/src/lib/libcrypto/bn/bn.h b/src/lib/libcrypto/bn/bn.h
new file mode 100644
index 0000000000..f935e1ca79
--- /dev/null
+++ b/src/lib/libcrypto/bn/bn.h
@@ -0,0 +1,467 @@
1/* crypto/bn/bn.h */
2/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#ifndef HEADER_BN_H
60#define HEADER_BN_H
61
62#ifndef WIN16
63#include <stdio.h> /* FILE */
64#endif
65#include <openssl/opensslconf.h>
66
67#ifdef __cplusplus
68extern "C" {
69#endif
70
71#ifdef VMS
72#undef BN_LLONG /* experimental, so far... */
73#endif
74
75#define BN_MUL_COMBA
76#define BN_SQR_COMBA
77#define BN_RECURSION
78#define RECP_MUL_MOD
79#define MONT_MUL_MOD
80
81/* This next option uses the C libraries (2 word)/(1 word) function.
82 * If it is not defined, I use my C version (which is slower).
83 * The reason for this flag is that when the particular C compiler
84 * library routine is used, and the library is linked with a different
85 * compiler, the library is missing. This mostly happens when the
86 * library is built with gcc and then linked using nornal cc. This would
87 * be a common occurance because gcc normally produces code that is
88 * 2 times faster than system compilers for the big number stuff.
89 * For machines with only one compiler (or shared libraries), this should
90 * be on. Again this in only really a problem on machines
91 * using "long long's", are 32bit, and are not using my assember code. */
92#if defined(MSDOS) || defined(WINDOWS) || defined(linux)
93#define BN_DIV2W
94#endif
95
96/* assuming long is 64bit - this is the DEC Alpha
97 * unsigned long long is only 64 bits :-(, don't define
98 * BN_LLONG for the DEC Alpha */
99#ifdef SIXTY_FOUR_BIT_LONG
100#define BN_ULLONG unsigned long long
101#define BN_ULONG unsigned long
102#define BN_LONG long
103#define BN_BITS 128
104#define BN_BYTES 8
105#define BN_BITS2 64
106#define BN_BITS4 32
107#define BN_MASK (0xffffffffffffffffffffffffffffffffLL)
108#define BN_MASK2 (0xffffffffffffffffL)
109#define BN_MASK2l (0xffffffffL)
110#define BN_MASK2h (0xffffffff00000000L)
111#define BN_MASK2h1 (0xffffffff80000000L)
112#define BN_TBIT (0x8000000000000000L)
113#define BN_DEC_CONV (10000000000000000000UL)
114#define BN_DEC_FMT1 "%lu"
115#define BN_DEC_FMT2 "%019lu"
116#define BN_DEC_NUM 19
117#endif
118
119/* This is where the long long data type is 64 bits, but long is 32.
120 * For machines where there are 64bit registers, this is the mode to use.
121 * IRIX, on R4000 and above should use this mode, along with the relevent
122 * assember code :-). Do NOT define BN_LLONG.
123 */
124#ifdef SIXTY_FOUR_BIT
125#undef BN_LLONG
126#undef BN_ULLONG
127#define BN_ULONG unsigned long long
128#define BN_LONG long long
129#define BN_BITS 128
130#define BN_BYTES 8
131#define BN_BITS2 64
132#define BN_BITS4 32
133#define BN_MASK2 (0xffffffffffffffffLL)
134#define BN_MASK2l (0xffffffffL)
135#define BN_MASK2h (0xffffffff00000000LL)
136#define BN_MASK2h1 (0xffffffff80000000LL)
137#define BN_TBIT (0x8000000000000000LL)
138#define BN_DEC_CONV (10000000000000000000LL)
139#define BN_DEC_FMT1 "%llu"
140#define BN_DEC_FMT2 "%019llu"
141#define BN_DEC_NUM 19
142#endif
143
144#ifdef THIRTY_TWO_BIT
145#if defined(WIN32) && !defined(__GNUC__)
146#define BN_ULLONG unsigned _int64
147#else
148#define BN_ULLONG unsigned long long
149#endif
150#define BN_ULONG unsigned long
151#define BN_LONG long
152#define BN_BITS 64
153#define BN_BYTES 4
154#define BN_BITS2 32
155#define BN_BITS4 16
156#ifdef WIN32
157/* VC++ doesn't like the LL suffix */
158#define BN_MASK (0xffffffffffffffffL)
159#else
160#define BN_MASK (0xffffffffffffffffLL)
161#endif
162#define BN_MASK2 (0xffffffffL)
163#define BN_MASK2l (0xffff)
164#define BN_MASK2h1 (0xffff8000L)
165#define BN_MASK2h (0xffff0000L)
166#define BN_TBIT (0x80000000L)
167#define BN_DEC_CONV (1000000000L)
168#define BN_DEC_FMT1 "%lu"
169#define BN_DEC_FMT2 "%09lu"
170#define BN_DEC_NUM 9
171#endif
172
173#ifdef SIXTEEN_BIT
174#ifndef BN_DIV2W
175#define BN_DIV2W
176#endif
177#define BN_ULLONG unsigned long
178#define BN_ULONG unsigned short
179#define BN_LONG short
180#define BN_BITS 32
181#define BN_BYTES 2
182#define BN_BITS2 16
183#define BN_BITS4 8
184#define BN_MASK (0xffffffff)
185#define BN_MASK2 (0xffff)
186#define BN_MASK2l (0xff)
187#define BN_MASK2h1 (0xff80)
188#define BN_MASK2h (0xff00)
189#define BN_TBIT (0x8000)
190#define BN_DEC_CONV (100000)
191#define BN_DEC_FMT1 "%u"
192#define BN_DEC_FMT2 "%05u"
193#define BN_DEC_NUM 5
194#endif
195
196#ifdef EIGHT_BIT
197#ifndef BN_DIV2W
198#define BN_DIV2W
199#endif
200#define BN_ULLONG unsigned short
201#define BN_ULONG unsigned char
202#define BN_LONG char
203#define BN_BITS 16
204#define BN_BYTES 1
205#define BN_BITS2 8
206#define BN_BITS4 4
207#define BN_MASK (0xffff)
208#define BN_MASK2 (0xff)
209#define BN_MASK2l (0xf)
210#define BN_MASK2h1 (0xf8)
211#define BN_MASK2h (0xf0)
212#define BN_TBIT (0x80)
213#define BN_DEC_CONV (100)
214#define BN_DEC_FMT1 "%u"
215#define BN_DEC_FMT2 "%02u"
216#define BN_DEC_NUM 2
217#endif
218
219#define BN_DEFAULT_BITS 1280
220
221#ifdef BIGNUM
222#undef BIGNUM
223#endif
224
225#define BN_FLG_MALLOCED 0x01
226#define BN_FLG_STATIC_DATA 0x02
227#define BN_FLG_FREE 0x8000 /* used for debuging */
228#define BN_set_flags(b,n) ((b)->flags|=(n))
229#define BN_get_flags(b,n) ((b)->flags&(n))
230
231typedef struct bignum_st
232 {
233 BN_ULONG *d; /* Pointer to an array of 'BN_BITS2' bit chunks. */
234 int top; /* Index of last used d +1. */
235 /* The next are internal book keeping for bn_expand. */
236 int max; /* Size of the d array. */
237 int neg; /* one if the number is negative */
238 int flags;
239 } BIGNUM;
240
241/* Used for temp variables */
242#define BN_CTX_NUM 12
243typedef struct bignum_ctx
244 {
245 int tos;
246 BIGNUM bn[BN_CTX_NUM+1];
247 int flags;
248 } BN_CTX;
249
250typedef struct bn_blinding_st
251 {
252 int init;
253 BIGNUM *A;
254 BIGNUM *Ai;
255 BIGNUM *mod; /* just a reference */
256 } BN_BLINDING;
257
258/* Used for montgomery multiplication */
259typedef struct bn_mont_ctx_st
260 {
261 int use_word; /* 0 for word form, 1 for long form */
262 int ri; /* number of bits in R */
263 BIGNUM RR; /* used to convert to montgomery form */
264 BIGNUM N; /* The modulus */
265 BIGNUM Ni; /* The inverse of N */
266 BN_ULONG n0; /* word form of inverse, normally only one of
267 * Ni or n0 is defined */
268 int flags;
269 } BN_MONT_CTX;
270
271/* Used for reciprocal division/mod functions
272 * It cannot be shared between threads
273 */
274typedef struct bn_recp_ctx_st
275 {
276 BIGNUM N; /* the divisor */
277 BIGNUM Nr; /* the reciprocal */
278 int num_bits;
279 int shift;
280 int flags;
281 } BN_RECP_CTX;
282
283#define BN_to_montgomery(r,a,mont,ctx) BN_mod_mul_montgomery(\
284 r,a,&((mont)->RR),(mont),ctx)
285
286#define BN_prime_checks (5)
287
288#define BN_num_bytes(a) ((BN_num_bits(a)+7)/8)
289#define BN_is_word(a,w) (((a)->top == 1) && ((a)->d[0] == (BN_ULONG)(w)))
290#define BN_is_zero(a) (((a)->top == 0) || BN_is_word(a,0))
291#define BN_is_one(a) (BN_is_word((a),1))
292#define BN_is_odd(a) (((a)->top > 0) && ((a)->d[0] & 1))
293#define BN_one(a) (BN_set_word((a),1))
294#define BN_zero(a) (BN_set_word((a),0))
295
296/*#define BN_ascii2bn(a) BN_hex2bn(a) */
297/*#define BN_bn2ascii(a) BN_bn2hex(a) */
298
299#define bn_expand(n,b) ((((((b+BN_BITS2-1))/BN_BITS2)) <= (n)->max)?\
300 (n):bn_expand2((n),(b)/BN_BITS2+1))
301#define bn_wexpand(n,b) (((b) <= (n)->max)?(n):bn_expand2((n),(b)))
302
303#define bn_fix_top(a) \
304 { \
305 BN_ULONG *ftl; \
306 if ((a)->top > 0) \
307 { \
308 for (ftl= &((a)->d[(a)->top-1]); (a)->top > 0; (a)->top--) \
309 if (*(ftl--)) break; \
310 } \
311 }
312
313BIGNUM *BN_value_one(void);
314char * BN_options(void);
315BN_CTX *BN_CTX_new(void);
316void BN_CTX_init(BN_CTX *c);
317void BN_CTX_free(BN_CTX *c);
318int BN_rand(BIGNUM *rnd, int bits, int top,int bottom);
319int BN_num_bits(const BIGNUM *a);
320int BN_num_bits_word(BN_ULONG);
321BIGNUM *BN_new(void);
322void BN_init(BIGNUM *);
323void BN_clear_free(BIGNUM *a);
324BIGNUM *BN_copy(BIGNUM *a, const BIGNUM *b);
325BIGNUM *BN_bin2bn(const unsigned char *s,int len,BIGNUM *ret);
326int BN_bn2bin(const BIGNUM *a, unsigned char *to);
327BIGNUM *BN_mpi2bn(unsigned char *s,int len,BIGNUM *ret);
328int BN_bn2mpi(const BIGNUM *a, unsigned char *to);
329int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
330int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
331int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
332int BN_add(BIGNUM *r, BIGNUM *a, BIGNUM *b);
333int BN_mod(BIGNUM *rem, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx);
334int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
335 BN_CTX *ctx);
336int BN_mul(BIGNUM *r, BIGNUM *a, BIGNUM *b,BN_CTX *ctx);
337int BN_sqr(BIGNUM *r, BIGNUM *a,BN_CTX *ctx);
338BN_ULONG BN_mod_word(BIGNUM *a, BN_ULONG w);
339BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w);
340int BN_mul_word(BIGNUM *a, BN_ULONG w);
341int BN_add_word(BIGNUM *a, BN_ULONG w);
342int BN_sub_word(BIGNUM *a, BN_ULONG w);
343int BN_set_word(BIGNUM *a, BN_ULONG w);
344BN_ULONG BN_get_word(BIGNUM *a);
345int BN_cmp(const BIGNUM *a, const BIGNUM *b);
346void BN_free(BIGNUM *a);
347int BN_is_bit_set(const BIGNUM *a, int n);
348int BN_lshift(BIGNUM *r, const BIGNUM *a, int n);
349int BN_lshift1(BIGNUM *r, BIGNUM *a);
350int BN_exp(BIGNUM *r, BIGNUM *a, BIGNUM *p,BN_CTX *ctx);
351int BN_mod_exp(BIGNUM *r, BIGNUM *a, const BIGNUM *p,
352 const BIGNUM *m,BN_CTX *ctx);
353int BN_mod_exp_mont(BIGNUM *r, BIGNUM *a, const BIGNUM *p,
354 const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
355int BN_mod_exp2_mont(BIGNUM *r, BIGNUM *a1, BIGNUM *p1,BIGNUM *a2,
356 BIGNUM *p2,BIGNUM *m,BN_CTX *ctx,BN_MONT_CTX *m_ctx);
357int BN_mod_exp_simple(BIGNUM *r, BIGNUM *a, BIGNUM *p,
358 BIGNUM *m,BN_CTX *ctx);
359int BN_mask_bits(BIGNUM *a,int n);
360int BN_mod_mul(BIGNUM *ret, BIGNUM *a, BIGNUM *b, const BIGNUM *m, BN_CTX *ctx);
361#ifndef WIN16
362int BN_print_fp(FILE *fp, BIGNUM *a);
363#endif
364#ifdef HEADER_BIO_H
365int BN_print(BIO *fp, const BIGNUM *a);
366#else
367int BN_print(char *fp, const BIGNUM *a);
368#endif
369int BN_reciprocal(BIGNUM *r, BIGNUM *m, int len, BN_CTX *ctx);
370int BN_rshift(BIGNUM *r, BIGNUM *a, int n);
371int BN_rshift1(BIGNUM *r, BIGNUM *a);
372void BN_clear(BIGNUM *a);
373BIGNUM *bn_expand2(BIGNUM *b, int bits);
374BIGNUM *BN_dup(const BIGNUM *a);
375int BN_ucmp(const BIGNUM *a, const BIGNUM *b);
376int BN_set_bit(BIGNUM *a, int n);
377int BN_clear_bit(BIGNUM *a, int n);
378char * BN_bn2hex(const BIGNUM *a);
379char * BN_bn2dec(const BIGNUM *a);
380int BN_hex2bn(BIGNUM **a, const char *str);
381int BN_dec2bn(BIGNUM **a, const char *str);
382int BN_gcd(BIGNUM *r,BIGNUM *in_a,BIGNUM *in_b,BN_CTX *ctx);
383BIGNUM *BN_mod_inverse(BIGNUM *ret,BIGNUM *a, const BIGNUM *n,BN_CTX *ctx);
384BIGNUM *BN_generate_prime(BIGNUM *ret,int bits,int strong,BIGNUM *add,
385 BIGNUM *rem,void (*callback)(int,int,void *),void *cb_arg);
386int BN_is_prime(BIGNUM *p,int nchecks,void (*callback)(int,int,void *),
387 BN_CTX *ctx,void *cb_arg);
388void ERR_load_BN_strings(void );
389
390BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w);
391BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w);
392void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num);
393BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
394BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num);
395BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num);
396
397BN_MONT_CTX *BN_MONT_CTX_new(void );
398void BN_MONT_CTX_init(BN_MONT_CTX *ctx);
399int BN_mod_mul_montgomery(BIGNUM *r,BIGNUM *a,BIGNUM *b,BN_MONT_CTX *mont,
400 BN_CTX *ctx);
401int BN_from_montgomery(BIGNUM *r,BIGNUM *a,BN_MONT_CTX *mont,BN_CTX *ctx);
402void BN_MONT_CTX_free(BN_MONT_CTX *mont);
403int BN_MONT_CTX_set(BN_MONT_CTX *mont,const BIGNUM *modulus,BN_CTX *ctx);
404BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to,BN_MONT_CTX *from);
405
406BN_BLINDING *BN_BLINDING_new(BIGNUM *A,BIGNUM *Ai,BIGNUM *mod);
407void BN_BLINDING_free(BN_BLINDING *b);
408int BN_BLINDING_update(BN_BLINDING *b,BN_CTX *ctx);
409int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *r, BN_CTX *ctx);
410int BN_BLINDING_invert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx);
411
412void BN_set_params(int mul,int high,int low,int mont);
413int BN_get_params(int which); /* 0, mul, 1 high, 2 low, 3 mont */
414
415void BN_RECP_CTX_init(BN_RECP_CTX *recp);
416BN_RECP_CTX *BN_RECP_CTX_new(void);
417void BN_RECP_CTX_free(BN_RECP_CTX *recp);
418int BN_RECP_CTX_set(BN_RECP_CTX *recp,const BIGNUM *rdiv,BN_CTX *ctx);
419int BN_mod_mul_reciprocal(BIGNUM *r, BIGNUM *x, BIGNUM *y,
420 BN_RECP_CTX *recp,BN_CTX *ctx);
421int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
422 const BIGNUM *m, BN_CTX *ctx);
423int BN_div_recp(BIGNUM *dv, BIGNUM *rem, BIGNUM *m,
424 BN_RECP_CTX *recp, BN_CTX *ctx);
425
426
427/* BEGIN ERROR CODES */
428/* The following lines are auto generated by the script mkerr.pl. Any changes
429 * made after this point may be overwritten when the script is next run.
430 */
431
432/* Error codes for the BN functions. */
433
434/* Function codes. */
435#define BN_F_BN_BLINDING_CONVERT 100
436#define BN_F_BN_BLINDING_INVERT 101
437#define BN_F_BN_BLINDING_NEW 102
438#define BN_F_BN_BLINDING_UPDATE 103
439#define BN_F_BN_BN2DEC 104
440#define BN_F_BN_BN2HEX 105
441#define BN_F_BN_CTX_NEW 106
442#define BN_F_BN_DIV 107
443#define BN_F_BN_EXPAND2 108
444#define BN_F_BN_MOD_EXP_MONT 109
445#define BN_F_BN_MOD_INVERSE 110
446#define BN_F_BN_MOD_MUL_RECIPROCAL 111
447#define BN_F_BN_MPI2BN 112
448#define BN_F_BN_NEW 113
449#define BN_F_BN_RAND 114
450#define BN_F_BN_USUB 115
451
452/* Reason codes. */
453#define BN_R_ARG2_LT_ARG3 100
454#define BN_R_BAD_RECIPROCAL 101
455#define BN_R_CALLED_WITH_EVEN_MODULUS 102
456#define BN_R_DIV_BY_ZERO 103
457#define BN_R_ENCODING_ERROR 104
458#define BN_R_EXPAND_ON_STATIC_BIGNUM_DATA 105
459#define BN_R_INVALID_LENGTH 106
460#define BN_R_NOT_INITIALIZED 107
461#define BN_R_NO_INVERSE 108
462
463#ifdef __cplusplus
464}
465#endif
466#endif
467
diff --git a/src/lib/libcrypto/bn/bn_asm.c b/src/lib/libcrypto/bn/bn_asm.c
new file mode 100644
index 0000000000..4d3da16a0c
--- /dev/null
+++ b/src/lib/libcrypto/bn/bn_asm.c
@@ -0,0 +1,802 @@
1/* crypto/bn/bn_asm.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60#include "cryptlib.h"
61#include "bn_lcl.h"
62
63#ifdef BN_LLONG
64
65BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
66 {
67 BN_ULONG c1=0;
68
69 bn_check_num(num);
70 if (num <= 0) return(c1);
71
72 for (;;)
73 {
74 mul_add(rp[0],ap[0],w,c1);
75 if (--num == 0) break;
76 mul_add(rp[1],ap[1],w,c1);
77 if (--num == 0) break;
78 mul_add(rp[2],ap[2],w,c1);
79 if (--num == 0) break;
80 mul_add(rp[3],ap[3],w,c1);
81 if (--num == 0) break;
82 ap+=4;
83 rp+=4;
84 }
85
86 return(c1);
87 }
88
89BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
90 {
91 BN_ULONG c1=0;
92
93 bn_check_num(num);
94 if (num <= 0) return(c1);
95
96 /* for (;;) */
97 while (1) /* circumvent egcs-1.1.2 bug */
98 {
99 mul(rp[0],ap[0],w,c1);
100 if (--num == 0) break;
101 mul(rp[1],ap[1],w,c1);
102 if (--num == 0) break;
103 mul(rp[2],ap[2],w,c1);
104 if (--num == 0) break;
105 mul(rp[3],ap[3],w,c1);
106 if (--num == 0) break;
107 ap+=4;
108 rp+=4;
109 }
110 return(c1);
111 }
112
113void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
114 {
115 bn_check_num(n);
116 if (n <= 0) return;
117 for (;;)
118 {
119 BN_ULLONG t;
120
121 t=(BN_ULLONG)(a[0])*(a[0]);
122 r[0]=Lw(t); r[1]=Hw(t);
123 if (--n == 0) break;
124
125 t=(BN_ULLONG)(a[1])*(a[1]);
126 r[2]=Lw(t); r[3]=Hw(t);
127 if (--n == 0) break;
128
129 t=(BN_ULLONG)(a[2])*(a[2]);
130 r[4]=Lw(t); r[5]=Hw(t);
131 if (--n == 0) break;
132
133 t=(BN_ULLONG)(a[3])*(a[3]);
134 r[6]=Lw(t); r[7]=Hw(t);
135 if (--n == 0) break;
136
137 a+=4;
138 r+=8;
139 }
140 }
141
142#else
143
144BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
145 {
146 BN_ULONG c=0;
147 BN_ULONG bl,bh;
148
149 bn_check_num(num);
150 if (num <= 0) return((BN_ULONG)0);
151
152 bl=LBITS(w);
153 bh=HBITS(w);
154
155 for (;;)
156 {
157 mul_add(rp[0],ap[0],bl,bh,c);
158 if (--num == 0) break;
159 mul_add(rp[1],ap[1],bl,bh,c);
160 if (--num == 0) break;
161 mul_add(rp[2],ap[2],bl,bh,c);
162 if (--num == 0) break;
163 mul_add(rp[3],ap[3],bl,bh,c);
164 if (--num == 0) break;
165 ap+=4;
166 rp+=4;
167 }
168 return(c);
169 }
170
171BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
172 {
173 BN_ULONG carry=0;
174 BN_ULONG bl,bh;
175
176 bn_check_num(num);
177 if (num <= 0) return((BN_ULONG)0);
178
179 bl=LBITS(w);
180 bh=HBITS(w);
181
182 for (;;)
183 {
184 mul(rp[0],ap[0],bl,bh,carry);
185 if (--num == 0) break;
186 mul(rp[1],ap[1],bl,bh,carry);
187 if (--num == 0) break;
188 mul(rp[2],ap[2],bl,bh,carry);
189 if (--num == 0) break;
190 mul(rp[3],ap[3],bl,bh,carry);
191 if (--num == 0) break;
192 ap+=4;
193 rp+=4;
194 }
195 return(carry);
196 }
197
198void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
199 {
200 bn_check_num(n);
201 if (n <= 0) return;
202 for (;;)
203 {
204 sqr64(r[0],r[1],a[0]);
205 if (--n == 0) break;
206
207 sqr64(r[2],r[3],a[1]);
208 if (--n == 0) break;
209
210 sqr64(r[4],r[5],a[2]);
211 if (--n == 0) break;
212
213 sqr64(r[6],r[7],a[3]);
214 if (--n == 0) break;
215
216 a+=4;
217 r+=8;
218 }
219 }
220
221#endif
222
223#if defined(BN_LLONG) && defined(BN_DIV2W)
224
225BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
226 {
227 return((BN_ULONG)(((((BN_ULLONG)h)<<BN_BITS2)|l)/(BN_ULLONG)d));
228 }
229
230#else
231
232/* Divide h-l by d and return the result. */
233/* I need to test this some more :-( */
234BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
235 {
236 BN_ULONG dh,dl,q,ret=0,th,tl,t;
237 int i,count=2;
238
239 if (d == 0) return(BN_MASK2);
240
241 i=BN_num_bits_word(d);
242 if ((i != BN_BITS2) && (h > (BN_ULONG)1<<i))
243 {
244#if !defined(NO_STDIO) && !defined(WIN16)
245 fprintf(stderr,"Division would overflow (%d)\n",i);
246#endif
247 abort();
248 }
249 i=BN_BITS2-i;
250 if (h >= d) h-=d;
251
252 if (i)
253 {
254 d<<=i;
255 h=(h<<i)|(l>>(BN_BITS2-i));
256 l<<=i;
257 }
258 dh=(d&BN_MASK2h)>>BN_BITS4;
259 dl=(d&BN_MASK2l);
260 for (;;)
261 {
262 if ((h>>BN_BITS4) == dh)
263 q=BN_MASK2l;
264 else
265 q=h/dh;
266
267 th=q*dh;
268 tl=dl*q;
269 for (;;)
270 {
271 t=h-th;
272 if ((t&BN_MASK2h) ||
273 ((tl) <= (
274 (t<<BN_BITS4)|
275 ((l&BN_MASK2h)>>BN_BITS4))))
276 break;
277 q--;
278 th-=dh;
279 tl-=dl;
280 }
281 t=(tl>>BN_BITS4);
282 tl=(tl<<BN_BITS4)&BN_MASK2h;
283 th+=t;
284
285 if (l < tl) th++;
286 l-=tl;
287 if (h < th)
288 {
289 h+=d;
290 q--;
291 }
292 h-=th;
293
294 if (--count == 0) break;
295
296 ret=q<<BN_BITS4;
297 h=((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2;
298 l=(l&BN_MASK2l)<<BN_BITS4;
299 }
300 ret|=q;
301 return(ret);
302 }
303#endif
304
305#ifdef BN_LLONG
306BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
307 {
308 BN_ULLONG ll=0;
309
310 bn_check_num(n);
311 if (n <= 0) return((BN_ULONG)0);
312
313 for (;;)
314 {
315 ll+=(BN_ULLONG)a[0]+b[0];
316 r[0]=(BN_ULONG)ll&BN_MASK2;
317 ll>>=BN_BITS2;
318 if (--n <= 0) break;
319
320 ll+=(BN_ULLONG)a[1]+b[1];
321 r[1]=(BN_ULONG)ll&BN_MASK2;
322 ll>>=BN_BITS2;
323 if (--n <= 0) break;
324
325 ll+=(BN_ULLONG)a[2]+b[2];
326 r[2]=(BN_ULONG)ll&BN_MASK2;
327 ll>>=BN_BITS2;
328 if (--n <= 0) break;
329
330 ll+=(BN_ULLONG)a[3]+b[3];
331 r[3]=(BN_ULONG)ll&BN_MASK2;
332 ll>>=BN_BITS2;
333 if (--n <= 0) break;
334
335 a+=4;
336 b+=4;
337 r+=4;
338 }
339 return((BN_ULONG)ll);
340 }
341#else
342BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
343 {
344 BN_ULONG c,l,t;
345
346 bn_check_num(n);
347 if (n <= 0) return((BN_ULONG)0);
348
349 c=0;
350 for (;;)
351 {
352 t=a[0];
353 t=(t+c)&BN_MASK2;
354 c=(t < c);
355 l=(t+b[0])&BN_MASK2;
356 c+=(l < t);
357 r[0]=l;
358 if (--n <= 0) break;
359
360 t=a[1];
361 t=(t+c)&BN_MASK2;
362 c=(t < c);
363 l=(t+b[1])&BN_MASK2;
364 c+=(l < t);
365 r[1]=l;
366 if (--n <= 0) break;
367
368 t=a[2];
369 t=(t+c)&BN_MASK2;
370 c=(t < c);
371 l=(t+b[2])&BN_MASK2;
372 c+=(l < t);
373 r[2]=l;
374 if (--n <= 0) break;
375
376 t=a[3];
377 t=(t+c)&BN_MASK2;
378 c=(t < c);
379 l=(t+b[3])&BN_MASK2;
380 c+=(l < t);
381 r[3]=l;
382 if (--n <= 0) break;
383
384 a+=4;
385 b+=4;
386 r+=4;
387 }
388 return((BN_ULONG)c);
389 }
390#endif
391
392BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
393 {
394 BN_ULONG t1,t2;
395 int c=0;
396
397 bn_check_num(n);
398 if (n <= 0) return((BN_ULONG)0);
399
400 for (;;)
401 {
402 t1=a[0]; t2=b[0];
403 r[0]=(t1-t2-c)&BN_MASK2;
404 if (t1 != t2) c=(t1 < t2);
405 if (--n <= 0) break;
406
407 t1=a[1]; t2=b[1];
408 r[1]=(t1-t2-c)&BN_MASK2;
409 if (t1 != t2) c=(t1 < t2);
410 if (--n <= 0) break;
411
412 t1=a[2]; t2=b[2];
413 r[2]=(t1-t2-c)&BN_MASK2;
414 if (t1 != t2) c=(t1 < t2);
415 if (--n <= 0) break;
416
417 t1=a[3]; t2=b[3];
418 r[3]=(t1-t2-c)&BN_MASK2;
419 if (t1 != t2) c=(t1 < t2);
420 if (--n <= 0) break;
421
422 a+=4;
423 b+=4;
424 r+=4;
425 }
426 return(c);
427 }
428
429#ifdef BN_MUL_COMBA
430
431#undef bn_mul_comba8
432#undef bn_mul_comba4
433#undef bn_sqr_comba8
434#undef bn_sqr_comba4
435
436#ifdef BN_LLONG
437#define mul_add_c(a,b,c0,c1,c2) \
438 t=(BN_ULLONG)a*b; \
439 t1=(BN_ULONG)Lw(t); \
440 t2=(BN_ULONG)Hw(t); \
441 c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
442 c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
443
444#define mul_add_c2(a,b,c0,c1,c2) \
445 t=(BN_ULLONG)a*b; \
446 tt=(t+t)&BN_MASK; \
447 if (tt < t) c2++; \
448 t1=(BN_ULONG)Lw(tt); \
449 t2=(BN_ULONG)Hw(tt); \
450 c0=(c0+t1)&BN_MASK2; \
451 if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
452 c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
453
454#define sqr_add_c(a,i,c0,c1,c2) \
455 t=(BN_ULLONG)a[i]*a[i]; \
456 t1=(BN_ULONG)Lw(t); \
457 t2=(BN_ULONG)Hw(t); \
458 c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
459 c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
460
461#define sqr_add_c2(a,i,j,c0,c1,c2) \
462 mul_add_c2((a)[i],(a)[j],c0,c1,c2)
463#else
464#define mul_add_c(a,b,c0,c1,c2) \
465 t1=LBITS(a); t2=HBITS(a); \
466 bl=LBITS(b); bh=HBITS(b); \
467 mul64(t1,t2,bl,bh); \
468 c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
469 c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
470
471#define mul_add_c2(a,b,c0,c1,c2) \
472 t1=LBITS(a); t2=HBITS(a); \
473 bl=LBITS(b); bh=HBITS(b); \
474 mul64(t1,t2,bl,bh); \
475 if (t2 & BN_TBIT) c2++; \
476 t2=(t2+t2)&BN_MASK2; \
477 if (t1 & BN_TBIT) t2++; \
478 t1=(t1+t1)&BN_MASK2; \
479 c0=(c0+t1)&BN_MASK2; \
480 if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
481 c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
482
483#define sqr_add_c(a,i,c0,c1,c2) \
484 sqr64(t1,t2,(a)[i]); \
485 c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
486 c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
487
488#define sqr_add_c2(a,i,j,c0,c1,c2) \
489 mul_add_c2((a)[i],(a)[j],c0,c1,c2)
490#endif
491
492void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
493 {
494#ifdef BN_LLONG
495 BN_ULLONG t;
496#else
497 BN_ULONG bl,bh;
498#endif
499 BN_ULONG t1,t2;
500 BN_ULONG c1,c2,c3;
501
502 c1=0;
503 c2=0;
504 c3=0;
505 mul_add_c(a[0],b[0],c1,c2,c3);
506 r[0]=c1;
507 c1=0;
508 mul_add_c(a[0],b[1],c2,c3,c1);
509 mul_add_c(a[1],b[0],c2,c3,c1);
510 r[1]=c2;
511 c2=0;
512 mul_add_c(a[2],b[0],c3,c1,c2);
513 mul_add_c(a[1],b[1],c3,c1,c2);
514 mul_add_c(a[0],b[2],c3,c1,c2);
515 r[2]=c3;
516 c3=0;
517 mul_add_c(a[0],b[3],c1,c2,c3);
518 mul_add_c(a[1],b[2],c1,c2,c3);
519 mul_add_c(a[2],b[1],c1,c2,c3);
520 mul_add_c(a[3],b[0],c1,c2,c3);
521 r[3]=c1;
522 c1=0;
523 mul_add_c(a[4],b[0],c2,c3,c1);
524 mul_add_c(a[3],b[1],c2,c3,c1);
525 mul_add_c(a[2],b[2],c2,c3,c1);
526 mul_add_c(a[1],b[3],c2,c3,c1);
527 mul_add_c(a[0],b[4],c2,c3,c1);
528 r[4]=c2;
529 c2=0;
530 mul_add_c(a[0],b[5],c3,c1,c2);
531 mul_add_c(a[1],b[4],c3,c1,c2);
532 mul_add_c(a[2],b[3],c3,c1,c2);
533 mul_add_c(a[3],b[2],c3,c1,c2);
534 mul_add_c(a[4],b[1],c3,c1,c2);
535 mul_add_c(a[5],b[0],c3,c1,c2);
536 r[5]=c3;
537 c3=0;
538 mul_add_c(a[6],b[0],c1,c2,c3);
539 mul_add_c(a[5],b[1],c1,c2,c3);
540 mul_add_c(a[4],b[2],c1,c2,c3);
541 mul_add_c(a[3],b[3],c1,c2,c3);
542 mul_add_c(a[2],b[4],c1,c2,c3);
543 mul_add_c(a[1],b[5],c1,c2,c3);
544 mul_add_c(a[0],b[6],c1,c2,c3);
545 r[6]=c1;
546 c1=0;
547 mul_add_c(a[0],b[7],c2,c3,c1);
548 mul_add_c(a[1],b[6],c2,c3,c1);
549 mul_add_c(a[2],b[5],c2,c3,c1);
550 mul_add_c(a[3],b[4],c2,c3,c1);
551 mul_add_c(a[4],b[3],c2,c3,c1);
552 mul_add_c(a[5],b[2],c2,c3,c1);
553 mul_add_c(a[6],b[1],c2,c3,c1);
554 mul_add_c(a[7],b[0],c2,c3,c1);
555 r[7]=c2;
556 c2=0;
557 mul_add_c(a[7],b[1],c3,c1,c2);
558 mul_add_c(a[6],b[2],c3,c1,c2);
559 mul_add_c(a[5],b[3],c3,c1,c2);
560 mul_add_c(a[4],b[4],c3,c1,c2);
561 mul_add_c(a[3],b[5],c3,c1,c2);
562 mul_add_c(a[2],b[6],c3,c1,c2);
563 mul_add_c(a[1],b[7],c3,c1,c2);
564 r[8]=c3;
565 c3=0;
566 mul_add_c(a[2],b[7],c1,c2,c3);
567 mul_add_c(a[3],b[6],c1,c2,c3);
568 mul_add_c(a[4],b[5],c1,c2,c3);
569 mul_add_c(a[5],b[4],c1,c2,c3);
570 mul_add_c(a[6],b[3],c1,c2,c3);
571 mul_add_c(a[7],b[2],c1,c2,c3);
572 r[9]=c1;
573 c1=0;
574 mul_add_c(a[7],b[3],c2,c3,c1);
575 mul_add_c(a[6],b[4],c2,c3,c1);
576 mul_add_c(a[5],b[5],c2,c3,c1);
577 mul_add_c(a[4],b[6],c2,c3,c1);
578 mul_add_c(a[3],b[7],c2,c3,c1);
579 r[10]=c2;
580 c2=0;
581 mul_add_c(a[4],b[7],c3,c1,c2);
582 mul_add_c(a[5],b[6],c3,c1,c2);
583 mul_add_c(a[6],b[5],c3,c1,c2);
584 mul_add_c(a[7],b[4],c3,c1,c2);
585 r[11]=c3;
586 c3=0;
587 mul_add_c(a[7],b[5],c1,c2,c3);
588 mul_add_c(a[6],b[6],c1,c2,c3);
589 mul_add_c(a[5],b[7],c1,c2,c3);
590 r[12]=c1;
591 c1=0;
592 mul_add_c(a[6],b[7],c2,c3,c1);
593 mul_add_c(a[7],b[6],c2,c3,c1);
594 r[13]=c2;
595 c2=0;
596 mul_add_c(a[7],b[7],c3,c1,c2);
597 r[14]=c3;
598 r[15]=c1;
599 }
600
601void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
602 {
603#ifdef BN_LLONG
604 BN_ULLONG t;
605#else
606 BN_ULONG bl,bh;
607#endif
608 BN_ULONG t1,t2;
609 BN_ULONG c1,c2,c3;
610
611 c1=0;
612 c2=0;
613 c3=0;
614 mul_add_c(a[0],b[0],c1,c2,c3);
615 r[0]=c1;
616 c1=0;
617 mul_add_c(a[0],b[1],c2,c3,c1);
618 mul_add_c(a[1],b[0],c2,c3,c1);
619 r[1]=c2;
620 c2=0;
621 mul_add_c(a[2],b[0],c3,c1,c2);
622 mul_add_c(a[1],b[1],c3,c1,c2);
623 mul_add_c(a[0],b[2],c3,c1,c2);
624 r[2]=c3;
625 c3=0;
626 mul_add_c(a[0],b[3],c1,c2,c3);
627 mul_add_c(a[1],b[2],c1,c2,c3);
628 mul_add_c(a[2],b[1],c1,c2,c3);
629 mul_add_c(a[3],b[0],c1,c2,c3);
630 r[3]=c1;
631 c1=0;
632 mul_add_c(a[3],b[1],c2,c3,c1);
633 mul_add_c(a[2],b[2],c2,c3,c1);
634 mul_add_c(a[1],b[3],c2,c3,c1);
635 r[4]=c2;
636 c2=0;
637 mul_add_c(a[2],b[3],c3,c1,c2);
638 mul_add_c(a[3],b[2],c3,c1,c2);
639 r[5]=c3;
640 c3=0;
641 mul_add_c(a[3],b[3],c1,c2,c3);
642 r[6]=c1;
643 r[7]=c2;
644 }
645
646void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
647 {
648#ifdef BN_LLONG
649 BN_ULLONG t,tt;
650#else
651 BN_ULONG bl,bh;
652#endif
653 BN_ULONG t1,t2;
654 BN_ULONG c1,c2,c3;
655
656 c1=0;
657 c2=0;
658 c3=0;
659 sqr_add_c(a,0,c1,c2,c3);
660 r[0]=c1;
661 c1=0;
662 sqr_add_c2(a,1,0,c2,c3,c1);
663 r[1]=c2;
664 c2=0;
665 sqr_add_c(a,1,c3,c1,c2);
666 sqr_add_c2(a,2,0,c3,c1,c2);
667 r[2]=c3;
668 c3=0;
669 sqr_add_c2(a,3,0,c1,c2,c3);
670 sqr_add_c2(a,2,1,c1,c2,c3);
671 r[3]=c1;
672 c1=0;
673 sqr_add_c(a,2,c2,c3,c1);
674 sqr_add_c2(a,3,1,c2,c3,c1);
675 sqr_add_c2(a,4,0,c2,c3,c1);
676 r[4]=c2;
677 c2=0;
678 sqr_add_c2(a,5,0,c3,c1,c2);
679 sqr_add_c2(a,4,1,c3,c1,c2);
680 sqr_add_c2(a,3,2,c3,c1,c2);
681 r[5]=c3;
682 c3=0;
683 sqr_add_c(a,3,c1,c2,c3);
684 sqr_add_c2(a,4,2,c1,c2,c3);
685 sqr_add_c2(a,5,1,c1,c2,c3);
686 sqr_add_c2(a,6,0,c1,c2,c3);
687 r[6]=c1;
688 c1=0;
689 sqr_add_c2(a,7,0,c2,c3,c1);
690 sqr_add_c2(a,6,1,c2,c3,c1);
691 sqr_add_c2(a,5,2,c2,c3,c1);
692 sqr_add_c2(a,4,3,c2,c3,c1);
693 r[7]=c2;
694 c2=0;
695 sqr_add_c(a,4,c3,c1,c2);
696 sqr_add_c2(a,5,3,c3,c1,c2);
697 sqr_add_c2(a,6,2,c3,c1,c2);
698 sqr_add_c2(a,7,1,c3,c1,c2);
699 r[8]=c3;
700 c3=0;
701 sqr_add_c2(a,7,2,c1,c2,c3);
702 sqr_add_c2(a,6,3,c1,c2,c3);
703 sqr_add_c2(a,5,4,c1,c2,c3);
704 r[9]=c1;
705 c1=0;
706 sqr_add_c(a,5,c2,c3,c1);
707 sqr_add_c2(a,6,4,c2,c3,c1);
708 sqr_add_c2(a,7,3,c2,c3,c1);
709 r[10]=c2;
710 c2=0;
711 sqr_add_c2(a,7,4,c3,c1,c2);
712 sqr_add_c2(a,6,5,c3,c1,c2);
713 r[11]=c3;
714 c3=0;
715 sqr_add_c(a,6,c1,c2,c3);
716 sqr_add_c2(a,7,5,c1,c2,c3);
717 r[12]=c1;
718 c1=0;
719 sqr_add_c2(a,7,6,c2,c3,c1);
720 r[13]=c2;
721 c2=0;
722 sqr_add_c(a,7,c3,c1,c2);
723 r[14]=c3;
724 r[15]=c1;
725 }
726
727void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
728 {
729#ifdef BN_LLONG
730 BN_ULLONG t,tt;
731#else
732 BN_ULONG bl,bh;
733#endif
734 BN_ULONG t1,t2;
735 BN_ULONG c1,c2,c3;
736
737 c1=0;
738 c2=0;
739 c3=0;
740 sqr_add_c(a,0,c1,c2,c3);
741 r[0]=c1;
742 c1=0;
743 sqr_add_c2(a,1,0,c2,c3,c1);
744 r[1]=c2;
745 c2=0;
746 sqr_add_c(a,1,c3,c1,c2);
747 sqr_add_c2(a,2,0,c3,c1,c2);
748 r[2]=c3;
749 c3=0;
750 sqr_add_c2(a,3,0,c1,c2,c3);
751 sqr_add_c2(a,2,1,c1,c2,c3);
752 r[3]=c1;
753 c1=0;
754 sqr_add_c(a,2,c2,c3,c1);
755 sqr_add_c2(a,3,1,c2,c3,c1);
756 r[4]=c2;
757 c2=0;
758 sqr_add_c2(a,3,2,c3,c1,c2);
759 r[5]=c3;
760 c3=0;
761 sqr_add_c(a,3,c1,c2,c3);
762 r[6]=c1;
763 r[7]=c2;
764 }
765#else
766
767/* hmm... is it faster just to do a multiply? */
768#undef bn_sqr_comba4
769void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
770 {
771 BN_ULONG t[8];
772 bn_sqr_normal(r,a,4,t);
773 }
774
775#undef bn_sqr_comba8
776void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
777 {
778 BN_ULONG t[16];
779 bn_sqr_normal(r,a,8,t);
780 }
781
782void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
783 {
784 r[4]=bn_mul_words( &(r[0]),a,4,b[0]);
785 r[5]=bn_mul_add_words(&(r[1]),a,4,b[1]);
786 r[6]=bn_mul_add_words(&(r[2]),a,4,b[2]);
787 r[7]=bn_mul_add_words(&(r[3]),a,4,b[3]);
788 }
789
790void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
791 {
792 r[ 8]=bn_mul_words( &(r[0]),a,8,b[0]);
793 r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
794 r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
795 r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
796 r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
797 r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
798 r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
799 r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
800 }
801
802#endif /* BN_COMBA */
diff --git a/src/lib/libcrypto/bn/bn_ctx.c b/src/lib/libcrypto/bn/bn_ctx.c
new file mode 100644
index 0000000000..46132fd180
--- /dev/null
+++ b/src/lib/libcrypto/bn/bn_ctx.c
@@ -0,0 +1,144 @@
1/* crypto/bn/bn_ctx.c */
2/* Written by Ulf Moeller for the OpenSSL project. */
3/* ====================================================================
4 * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * 3. All advertising materials mentioning features or use of this
19 * software must display the following acknowledgment:
20 * "This product includes software developed by the OpenSSL Project
21 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
22 *
23 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
24 * endorse or promote products derived from this software without
25 * prior written permission. For written permission, please contact
26 * openssl-core@openssl.org.
27 *
28 * 5. Products derived from this software may not be called "OpenSSL"
29 * nor may "OpenSSL" appear in their names without prior written
30 * permission of the OpenSSL Project.
31 *
32 * 6. Redistributions of any form whatsoever must retain the following
33 * acknowledgment:
34 * "This product includes software developed by the OpenSSL Project
35 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
38 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
40 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
41 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
43 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
44 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
45 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
46 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
47 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
48 * OF THE POSSIBILITY OF SUCH DAMAGE.
49 * ====================================================================
50 *
51 * This product includes cryptographic software written by Eric Young
52 * (eay@cryptsoft.com). This product includes software written by Tim
53 * Hudson (tjh@cryptsoft.com).
54 *
55 */
56
57#ifndef BN_CTX_DEBUG
58# undef NDEBUG /* avoid conflicting definitions */
59# define NDEBUG
60#endif
61
62#include <stdio.h>
63#include <assert.h>
64#include "cryptlib.h"
65#include <openssl/bn.h>
66
67
68BN_CTX *BN_CTX_new(void)
69 {
70 BN_CTX *ret;
71
72 ret=(BN_CTX *)Malloc(sizeof(BN_CTX));
73 if (ret == NULL)
74 {
75 BNerr(BN_F_BN_CTX_NEW,ERR_R_MALLOC_FAILURE);
76 return(NULL);
77 }
78
79 BN_CTX_init(ret);
80 ret->flags=BN_FLG_MALLOCED;
81 return(ret);
82 }
83
84void BN_CTX_init(BN_CTX *ctx)
85 {
86 int i;
87 ctx->tos = 0;
88 ctx->flags = 0;
89 ctx->depth = 0;
90 ctx->too_many = 0;
91 for (i = 0; i < BN_CTX_NUM; i++)
92 BN_init(&(ctx->bn[i]));
93 }
94
95void BN_CTX_free(BN_CTX *ctx)
96 {
97 int i;
98
99 if (ctx == NULL) return;
100 assert(ctx->depth == 0);
101
102 for (i=0; i < BN_CTX_NUM; i++)
103 BN_clear_free(&(ctx->bn[i]));
104 if (ctx->flags & BN_FLG_MALLOCED)
105 Free(ctx);
106 }
107
108void BN_CTX_start(BN_CTX *ctx)
109 {
110 if (ctx->depth < BN_CTX_NUM_POS)
111 ctx->pos[ctx->depth] = ctx->tos;
112 ctx->depth++;
113 }
114
115BIGNUM *BN_CTX_get(BN_CTX *ctx)
116 {
117 if (ctx->depth > BN_CTX_NUM_POS || ctx->tos >= BN_CTX_NUM)
118 {
119 if (!ctx->too_many)
120 {
121 BNerr(BN_F_BN_CTX_GET,BN_R_TOO_MANY_TEMPORARY_VARIABLES);
122 /* disable error code until BN_CTX_end is called: */
123 ctx->too_many = 1;
124 }
125 return NULL;
126 }
127 return (&(ctx->bn[ctx->tos++]));
128 }
129
130void BN_CTX_end(BN_CTX *ctx)
131 {
132 if (ctx == NULL) return;
133 assert(ctx->depth > 0);
134 if (ctx->depth == 0)
135 /* should never happen, but we can tolerate it if not in
136 * debug mode (could be a 'goto err' in the calling function
137 * before BN_CTX_start was reached) */
138 BN_CTX_start(ctx);
139
140 ctx->too_many = 0;
141 ctx->depth--;
142 if (ctx->depth < BN_CTX_NUM_POS)
143 ctx->tos = ctx->pos[ctx->depth];
144 }
diff --git a/src/lib/libcrypto/bn/bn_exp2.c b/src/lib/libcrypto/bn/bn_exp2.c
new file mode 100644
index 0000000000..1132d53365
--- /dev/null
+++ b/src/lib/libcrypto/bn/bn_exp2.c
@@ -0,0 +1,195 @@
1#include <stdio.h>
2#include "cryptlib.h"
3#include "bn_lcl.h"
4
5/* I've done some timing with different table sizes.
6 * The main hassle is that even with bits set at 3, this requires
7 * 63 BIGNUMs to store the pre-calculated values.
8 * 512 1024
9 * bits=1 75.4% 79.4%
10 * bits=2 61.2% 62.4%
11 * bits=3 61.3% 59.3%
12 * The lack of speed improvment is also a function of the pre-calculation
13 * which could be removed.
14 */
15#define EXP2_TABLE_BITS 2 /* 1 2 3 4 5 */
16#define EXP2_TABLE_SIZE 4 /* 2 4 8 16 32 */
17
18int BN_mod_exp2_mont(BIGNUM *rr, BIGNUM *a1, BIGNUM *p1, BIGNUM *a2,
19 BIGNUM *p2, BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
20 {
21 int i,j,k,bits,bits1,bits2,ret=0,wstart,wend,window,xvalue,yvalue;
22 int start=1,ts=0,x,y;
23 BIGNUM *d,*aa1,*aa2,*r;
24 BIGNUM val[EXP2_TABLE_SIZE][EXP2_TABLE_SIZE];
25 BN_MONT_CTX *mont=NULL;
26
27 bn_check_top(a1);
28 bn_check_top(p1);
29 bn_check_top(a2);
30 bn_check_top(p2);
31 bn_check_top(m);
32
33 if (!(m->d[0] & 1))
34 {
35 BNerr(BN_F_BN_MOD_EXP_MONT,BN_R_CALLED_WITH_EVEN_MODULUS);
36 return(0);
37 }
38 d= &(ctx->bn[ctx->tos++]);
39 r= &(ctx->bn[ctx->tos++]);
40 bits1=BN_num_bits(p1);
41 bits2=BN_num_bits(p2);
42 if ((bits1 == 0) && (bits2 == 0))
43 {
44 BN_one(r);
45 return(1);
46 }
47 bits=(bits1 > bits2)?bits1:bits2;
48
49 /* If this is not done, things will break in the montgomery
50 * part */
51
52 if (in_mont != NULL)
53 mont=in_mont;
54 else
55 {
56 if ((mont=BN_MONT_CTX_new()) == NULL) goto err;
57 if (!BN_MONT_CTX_set(mont,m,ctx)) goto err;
58 }
59
60 BN_init(&(val[0][0]));
61 BN_init(&(val[1][1]));
62 BN_init(&(val[0][1]));
63 BN_init(&(val[1][0]));
64 ts=1;
65 if (BN_ucmp(a1,m) >= 0)
66 {
67 BN_mod(&(val[1][0]),a1,m,ctx);
68 aa1= &(val[1][0]);
69 }
70 else
71 aa1=a1;
72 if (BN_ucmp(a2,m) >= 0)
73 {
74 BN_mod(&(val[0][1]),a2,m,ctx);
75 aa2= &(val[0][1]);
76 }
77 else
78 aa2=a2;
79 if (!BN_to_montgomery(&(val[1][0]),aa1,mont,ctx)) goto err;
80 if (!BN_to_montgomery(&(val[0][1]),aa2,mont,ctx)) goto err;
81 if (!BN_mod_mul_montgomery(&(val[1][1]),
82 &(val[1][0]),&(val[0][1]),mont,ctx))
83 goto err;
84
85#if 0
86 if (bits <= 20) /* This is probably 3 or 0x10001, so just do singles */
87 window=1;
88 else if (bits > 250)
89 window=5; /* max size of window */
90 else if (bits >= 120)
91 window=4;
92 else
93 window=3;
94#else
95 window=EXP2_TABLE_BITS;
96#endif
97
98 k=1<<window;
99 for (x=0; x<k; x++)
100 {
101 if (x >= 2)
102 {
103 BN_init(&(val[x][0]));
104 BN_init(&(val[x][1]));
105 if (!BN_mod_mul_montgomery(&(val[x][0]),
106 &(val[1][0]),&(val[x-1][0]),mont,ctx)) goto err;
107 if (!BN_mod_mul_montgomery(&(val[x][1]),
108 &(val[1][0]),&(val[x-1][1]),mont,ctx)) goto err;
109 }
110 for (y=2; y<k; y++)
111 {
112 BN_init(&(val[x][y]));
113 if (!BN_mod_mul_montgomery(&(val[x][y]),
114 &(val[x][y-1]),&(val[0][1]),mont,ctx))
115 goto err;
116 }
117 }
118 ts=k;
119
120 start=1; /* This is used to avoid multiplication etc
121 * when there is only the value '1' in the
122 * buffer. */
123 xvalue=0; /* The 'x value' of the window */
124 yvalue=0; /* The 'y value' of the window */
125 wstart=bits-1; /* The top bit of the window */
126 wend=0; /* The bottom bit of the window */
127
128 if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
129 for (;;)
130 {
131 xvalue=BN_is_bit_set(p1,wstart);
132 yvalue=BN_is_bit_set(p2,wstart);
133 if (!(xvalue || yvalue))
134 {
135 if (!start)
136 {
137 if (!BN_mod_mul_montgomery(r,r,r,mont,ctx))
138 goto err;
139 }
140 wstart--;
141 if (wstart < 0) break;
142 continue;
143 }
144 /* We now have wstart on a 'set' bit, we now need to work out
145 * how bit a window to do. To do this we need to scan
146 * forward until the last set bit before the end of the
147 * window */
148 j=wstart;
149 /* xvalue=BN_is_bit_set(p1,wstart); already set */
150 /* yvalue=BN_is_bit_set(p1,wstart); already set */
151 wend=0;
152 for (i=1; i<window; i++)
153 {
154 if (wstart-i < 0) break;
155 xvalue+=xvalue;
156 xvalue|=BN_is_bit_set(p1,wstart-i);
157 yvalue+=yvalue;
158 yvalue|=BN_is_bit_set(p2,wstart-i);
159 }
160
161 /* i is the size of the current window */
162 /* add the 'bytes above' */
163 if (!start)
164 for (j=0; j<i; j++)
165 {
166 if (!BN_mod_mul_montgomery(r,r,r,mont,ctx))
167 goto err;
168 }
169
170 /* wvalue will be an odd number < 2^window */
171 if (xvalue || yvalue)
172 {
173 if (!BN_mod_mul_montgomery(r,r,&(val[xvalue][yvalue]),
174 mont,ctx)) goto err;
175 }
176
177 /* move the 'window' down further */
178 wstart-=i;
179 start=0;
180 if (wstart < 0) break;
181 }
182 BN_from_montgomery(rr,r,mont,ctx);
183 ret=1;
184err:
185 if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont);
186 ctx->tos-=2;
187 for (i=0; i<ts; i++)
188 {
189 for (j=0; j<ts; j++)
190 {
191 BN_clear_free(&(val[i][j]));
192 }
193 }
194 return(ret);
195 }
diff --git a/src/lib/libcrypto/bn/bn_kron.c b/src/lib/libcrypto/bn/bn_kron.c
new file mode 100644
index 0000000000..49f75594ae
--- /dev/null
+++ b/src/lib/libcrypto/bn/bn_kron.c
@@ -0,0 +1,182 @@
1/* crypto/bn/bn_kron.c */
2/* ====================================================================
3 * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 * This product includes cryptographic software written by Eric Young
51 * (eay@cryptsoft.com). This product includes software written by Tim
52 * Hudson (tjh@cryptsoft.com).
53 *
54 */
55
56#include "bn_lcl.h"
57
58
59/* least significant word */
60#define BN_lsw(n) (((n)->top == 0) ? (BN_ULONG) 0 : (n)->d[0])
61
62/* Returns -2 for errors because both -1 and 0 are valid results. */
63int BN_kronecker(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
64 {
65 int i;
66 int ret = -2; /* avoid 'uninitialized' warning */
67 int err = 0;
68 BIGNUM *A, *B, *tmp;
69 /* In 'tab', only odd-indexed entries are relevant:
70 * For any odd BIGNUM n,
71 * tab[BN_lsw(n) & 7]
72 * is $(-1)^{(n^2-1)/8}$ (using TeX notation).
73 * Note that the sign of n does not matter.
74 */
75 static const int tab[8] = {0, 1, 0, -1, 0, -1, 0, 1};
76
77 BN_CTX_start(ctx);
78 A = BN_CTX_get(ctx);
79 B = BN_CTX_get(ctx);
80 if (B == NULL) goto end;
81
82 err = !BN_copy(A, a);
83 if (err) goto end;
84 err = !BN_copy(B, b);
85 if (err) goto end;
86
87 /*
88 * Kronecker symbol, imlemented according to Henri Cohen,
89 * "A Course in Computational Algebraic Number Theory"
90 * (algorithm 1.4.10).
91 */
92
93 /* Cohen's step 1: */
94
95 if (BN_is_zero(B))
96 {
97 ret = BN_abs_is_word(A, 1);
98 goto end;
99 }
100
101 /* Cohen's step 2: */
102
103 if (!BN_is_odd(A) && !BN_is_odd(B))
104 {
105 ret = 0;
106 goto end;
107 }
108
109 /* now B is non-zero */
110 i = 0;
111 while (!BN_is_bit_set(B, i))
112 i++;
113 err = !BN_rshift(B, B, i);
114 if (err) goto end;
115 if (i & 1)
116 {
117 /* i is odd */
118 /* (thus B was even, thus A must be odd!) */
119
120 /* set 'ret' to $(-1)^{(A^2-1)/8}$ */
121 ret = tab[BN_lsw(A) & 7];
122 }
123 else
124 {
125 /* i is even */
126 ret = 1;
127 }
128
129 if (B->neg)
130 {
131 B->neg = 0;
132 if (A->neg)
133 ret = -ret;
134 }
135
136 /* now B is positive and odd, so what remains to be done is
137 * to compute the Jacobi symbol (A/B) and multiply it by 'ret' */
138
139 while (1)
140 {
141 /* Cohen's step 3: */
142
143 /* B is positive and odd */
144
145 if (BN_is_zero(A))
146 {
147 ret = BN_is_one(B) ? ret : 0;
148 goto end;
149 }
150
151 /* now A is non-zero */
152 i = 0;
153 while (!BN_is_bit_set(A, i))
154 i++;
155 err = !BN_rshift(A, A, i);
156 if (err) goto end;
157 if (i & 1)
158 {
159 /* i is odd */
160 /* multiply 'ret' by $(-1)^{(B^2-1)/8}$ */
161 ret = ret * tab[BN_lsw(B) & 7];
162 }
163
164 /* Cohen's step 4: */
165 /* multiply 'ret' by $(-1)^{(A-1)(B-1)/4}$ */
166 if ((A->neg ? ~BN_lsw(A) : BN_lsw(A)) & BN_lsw(B) & 2)
167 ret = -ret;
168
169 /* (A, B) := (B mod |A|, |A|) */
170 err = !BN_nnmod(B, B, A, ctx);
171 if (err) goto end;
172 tmp = A; A = B; B = tmp;
173 tmp->neg = 0;
174 }
175
176 end:
177 BN_CTX_end(ctx);
178 if (err)
179 return -2;
180 else
181 return ret;
182 }
diff --git a/src/lib/libcrypto/bn/bn_sqrt.c b/src/lib/libcrypto/bn/bn_sqrt.c
new file mode 100644
index 0000000000..e2a1105dc8
--- /dev/null
+++ b/src/lib/libcrypto/bn/bn_sqrt.c
@@ -0,0 +1,387 @@
1/* crypto/bn/bn_mod.c */
2/* Written by Lenka Fibikova <fibikova@exp-math.uni-essen.de>
3 * and Bodo Moeller for the OpenSSL project. */
4/* ====================================================================
5 * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 *
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
17 * distribution.
18 *
19 * 3. All advertising materials mentioning features or use of this
20 * software must display the following acknowledgment:
21 * "This product includes software developed by the OpenSSL Project
22 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
23 *
24 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
25 * endorse or promote products derived from this software without
26 * prior written permission. For written permission, please contact
27 * openssl-core@openssl.org.
28 *
29 * 5. Products derived from this software may not be called "OpenSSL"
30 * nor may "OpenSSL" appear in their names without prior written
31 * permission of the OpenSSL Project.
32 *
33 * 6. Redistributions of any form whatsoever must retain the following
34 * acknowledgment:
35 * "This product includes software developed by the OpenSSL Project
36 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
37 *
38 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
39 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
41 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
42 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
44 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
45 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
46 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
47 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
48 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
49 * OF THE POSSIBILITY OF SUCH DAMAGE.
50 * ====================================================================
51 *
52 * This product includes cryptographic software written by Eric Young
53 * (eay@cryptsoft.com). This product includes software written by Tim
54 * Hudson (tjh@cryptsoft.com).
55 *
56 */
57
58#include "cryptlib.h"
59#include "bn_lcl.h"
60
61
62BIGNUM *BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
63/* Returns 'ret' such that
64 * ret^2 == a (mod p),
65 * using the Tonelli/Shanks algorithm (cf. Henri Cohen, "A Course
66 * in Algebraic Computational Number Theory", algorithm 1.5.1).
67 * 'p' must be prime!
68 * If 'a' is not a square, this is not necessarily detected by
69 * the algorithms; a bogus result must be expected in this case.
70 */
71 {
72 BIGNUM *ret = in;
73 int err = 1;
74 int r;
75 BIGNUM *b, *q, *t, *x, *y;
76 int e, i, j;
77
78 if (!BN_is_odd(p) || BN_abs_is_word(p, 1))
79 {
80 if (BN_abs_is_word(p, 2))
81 {
82 if (ret == NULL)
83 ret = BN_new();
84 if (ret == NULL)
85 goto end;
86 if (!BN_set_word(ret, BN_is_bit_set(a, 0)))
87 {
88 BN_free(ret);
89 return NULL;
90 }
91 return ret;
92 }
93
94 BNerr(BN_F_BN_MOD_SQRT, BN_R_P_IS_NOT_PRIME);
95 return(NULL);
96 }
97
98 if (BN_is_zero(a) || BN_is_one(a))
99 {
100 if (ret == NULL)
101 ret = BN_new();
102 if (ret == NULL)
103 goto end;
104 if (!BN_set_word(ret, BN_is_one(a)))
105 {
106 BN_free(ret);
107 return NULL;
108 }
109 return ret;
110 }
111
112#if 0 /* if BN_mod_sqrt is used with correct input, this just wastes time */
113 r = BN_kronecker(a, p, ctx);
114 if (r < -1) return NULL;
115 if (r == -1)
116 {
117 BNerr(BN_F_BN_MOD_SQRT, BN_R_NOT_A_SQUARE);
118 return(NULL);
119 }
120#endif
121
122 BN_CTX_start(ctx);
123 b = BN_CTX_get(ctx);
124 q = BN_CTX_get(ctx);
125 t = BN_CTX_get(ctx);
126 x = BN_CTX_get(ctx);
127 y = BN_CTX_get(ctx);
128 if (y == NULL) goto end;
129
130 if (ret == NULL)
131 ret = BN_new();
132 if (ret == NULL) goto end;
133
134 /* now write |p| - 1 as 2^e*q where q is odd */
135 e = 1;
136 while (!BN_is_bit_set(p, e))
137 e++;
138 /* we'll set q later (if needed) */
139
140 if (e == 1)
141 {
142 /* The easy case: (|p|-1)/2 is odd, so 2 has an inverse
143 * modulo (|p|-1)/2, and square roots can be computed
144 * directly by modular exponentiation.
145 * We have
146 * 2 * (|p|+1)/4 == 1 (mod (|p|-1)/2),
147 * so we can use exponent (|p|+1)/4, i.e. (|p|-3)/4 + 1.
148 */
149 if (!BN_rshift(q, p, 2)) goto end;
150 q->neg = 0;
151 if (!BN_add_word(q, 1)) goto end;
152 if (!BN_mod_exp(ret, a, q, p, ctx)) goto end;
153 err = 0;
154 goto end;
155 }
156
157 if (e == 2)
158 {
159 /* |p| == 5 (mod 8)
160 *
161 * In this case 2 is always a non-square since
162 * Legendre(2,p) = (-1)^((p^2-1)/8) for any odd prime.
163 * So if a really is a square, then 2*a is a non-square.
164 * Thus for
165 * b := (2*a)^((|p|-5)/8),
166 * i := (2*a)*b^2
167 * we have
168 * i^2 = (2*a)^((1 + (|p|-5)/4)*2)
169 * = (2*a)^((p-1)/2)
170 * = -1;
171 * so if we set
172 * x := a*b*(i-1),
173 * then
174 * x^2 = a^2 * b^2 * (i^2 - 2*i + 1)
175 * = a^2 * b^2 * (-2*i)
176 * = a*(-i)*(2*a*b^2)
177 * = a*(-i)*i
178 * = a.
179 *
180 * (This is due to A.O.L. Atkin,
181 * <URL: http://listserv.nodak.edu/scripts/wa.exe?A2=ind9211&L=nmbrthry&O=T&P=562>,
182 * November 1992.)
183 */
184
185 /* make sure that a is reduced modulo p */
186 if (a->neg || BN_ucmp(a, p) >= 0)
187 {
188 if (!BN_nnmod(x, a, p, ctx)) goto end;
189 a = x; /* use x as temporary variable */
190 }
191
192 /* t := 2*a */
193 if (!BN_mod_lshift1_quick(t, a, p)) goto end;
194
195 /* b := (2*a)^((|p|-5)/8) */
196 if (!BN_rshift(q, p, 3)) goto end;
197 q->neg = 0;
198 if (!BN_mod_exp(b, t, q, p, ctx)) goto end;
199
200 /* y := b^2 */
201 if (!BN_mod_sqr(y, b, p, ctx)) goto end;
202
203 /* t := (2*a)*b^2 - 1*/
204 if (!BN_mod_mul(t, t, y, p, ctx)) goto end;
205 if (!BN_sub_word(t, 1)) goto end;
206
207 /* x = a*b*t */
208 if (!BN_mod_mul(x, a, b, p, ctx)) goto end;
209 if (!BN_mod_mul(x, x, t, p, ctx)) goto end;
210
211 if (!BN_copy(ret, x)) goto end;
212 err = 0;
213 goto end;
214 }
215
216 /* e > 2, so we really have to use the Tonelli/Shanks algorithm.
217 * First, find some y that is not a square. */
218 if (!BN_copy(q, p)) goto end; /* use 'q' as temp */
219 q->neg = 0;
220 i = 2;
221 do
222 {
223 /* For efficiency, try small numbers first;
224 * if this fails, try random numbers.
225 */
226 if (i < 22)
227 {
228 if (!BN_set_word(y, i)) goto end;
229 }
230 else
231 {
232 if (!BN_pseudo_rand(y, BN_num_bits(p), 0, 0)) goto end;
233 if (BN_ucmp(y, p) >= 0)
234 {
235 if (!(p->neg ? BN_add : BN_sub)(y, y, p)) goto end;
236 }
237 /* now 0 <= y < |p| */
238 if (BN_is_zero(y))
239 if (!BN_set_word(y, i)) goto end;
240 }
241
242 r = BN_kronecker(y, q, ctx); /* here 'q' is |p| */
243 if (r < -1) goto end;
244 if (r == 0)
245 {
246 /* m divides p */
247 BNerr(BN_F_BN_MOD_SQRT, BN_R_P_IS_NOT_PRIME);
248 goto end;
249 }
250 }
251 while (r == 1 && ++i < 82);
252
253 if (r != -1)
254 {
255 /* Many rounds and still no non-square -- this is more likely
256 * a bug than just bad luck.
257 * Even if p is not prime, we should have found some y
258 * such that r == -1.
259 */
260 BNerr(BN_F_BN_MOD_SQRT, BN_R_TOO_MANY_ITERATIONS);
261 goto end;
262 }
263
264 /* Here's our actual 'q': */
265 if (!BN_rshift(q, q, e)) goto end;
266
267 /* Now that we have some non-square, we can find an element
268 * of order 2^e by computing its q'th power. */
269 if (!BN_mod_exp(y, y, q, p, ctx)) goto end;
270 if (BN_is_one(y))
271 {
272 BNerr(BN_F_BN_MOD_SQRT, BN_R_P_IS_NOT_PRIME);
273 goto end;
274 }
275
276 /* Now we know that (if p is indeed prime) there is an integer
277 * k, 0 <= k < 2^e, such that
278 *
279 * a^q * y^k == 1 (mod p).
280 *
281 * As a^q is a square and y is not, k must be even.
282 * q+1 is even, too, so there is an element
283 *
284 * X := a^((q+1)/2) * y^(k/2),
285 *
286 * and it satisfies
287 *
288 * X^2 = a^q * a * y^k
289 * = a,
290 *
291 * so it is the square root that we are looking for.
292 */
293
294 /* t := (q-1)/2 (note that q is odd) */
295 if (!BN_rshift1(t, q)) goto end;
296
297 /* x := a^((q-1)/2) */
298 if (BN_is_zero(t)) /* special case: p = 2^e + 1 */
299 {
300 if (!BN_nnmod(t, a, p, ctx)) goto end;
301 if (BN_is_zero(t))
302 {
303 /* special case: a == 0 (mod p) */
304 if (!BN_zero(ret)) goto end;
305 err = 0;
306 goto end;
307 }
308 else
309 if (!BN_one(x)) goto end;
310 }
311 else
312 {
313 if (!BN_mod_exp(x, a, t, p, ctx)) goto end;
314 if (BN_is_zero(x))
315 {
316 /* special case: a == 0 (mod p) */
317 if (!BN_zero(ret)) goto end;
318 err = 0;
319 goto end;
320 }
321 }
322
323 /* b := a*x^2 (= a^q) */
324 if (!BN_mod_sqr(b, x, p, ctx)) goto end;
325 if (!BN_mod_mul(b, b, a, p, ctx)) goto end;
326
327 /* x := a*x (= a^((q+1)/2)) */
328 if (!BN_mod_mul(x, x, a, p, ctx)) goto end;
329
330 while (1)
331 {
332 /* Now b is a^q * y^k for some even k (0 <= k < 2^E
333 * where E refers to the original value of e, which we
334 * don't keep in a variable), and x is a^((q+1)/2) * y^(k/2).
335 *
336 * We have a*b = x^2,
337 * y^2^(e-1) = -1,
338 * b^2^(e-1) = 1.
339 */
340
341 if (BN_is_one(b))
342 {
343 if (!BN_copy(ret, x)) goto end;
344 err = 0;
345 goto end;
346 }
347
348
349 /* find smallest i such that b^(2^i) = 1 */
350 i = 1;
351 if (!BN_mod_sqr(t, b, p, ctx)) goto end;
352 while (!BN_is_one(t))
353 {
354 i++;
355 if (i == e)
356 {
357 BNerr(BN_F_BN_MOD_SQRT, BN_R_NOT_A_SQUARE);
358 goto end;
359 }
360 if (!BN_mod_mul(t, t, t, p, ctx)) goto end;
361 }
362
363
364 /* t := y^2^(e - i - 1) */
365 if (!BN_copy(t, y)) goto end;
366 for (j = e - i - 1; j > 0; j--)
367 {
368 if (!BN_mod_sqr(t, t, p, ctx)) goto end;
369 }
370 if (!BN_mod_mul(y, t, t, p, ctx)) goto end;
371 if (!BN_mod_mul(x, x, t, p, ctx)) goto end;
372 if (!BN_mod_mul(b, b, y, p, ctx)) goto end;
373 e = i;
374 }
375
376 end:
377 if (err)
378 {
379 if (ret != NULL && ret != in)
380 {
381 BN_clear_free(ret);
382 }
383 ret = NULL;
384 }
385 BN_CTX_end(ctx);
386 return ret;
387 }