author     cvs2svn <admin@example.com>  2015-08-02 21:54:22 +0000
committer  cvs2svn <admin@example.com>  2015-08-02 21:54:22 +0000
commit     ed3760bf4be4a96a89233fb8f8b84a0d44725862 (patch)
tree       5609c82060f75c53af0a7641d9b33a88574876cd /src/lib/libcrypto/bn
parent     f8b563fb5ba1524c821d37308f4e6abfc866bc3f (diff)
This commit was manufactured by cvs2git to create tag 'OPENBSD_5_8_BASE'.
Diffstat
-rw-r--r--  src/lib/libcrypto/bn/asm/alpha-mont.pl      |  316
-rw-r--r--  src/lib/libcrypto/bn/asm/armv4-gf2m.pl      |  278
-rw-r--r--  src/lib/libcrypto/bn/asm/armv4-mont.pl      |  204
-rw-r--r--  src/lib/libcrypto/bn/asm/bn-586.pl          |  774
-rw-r--r--  src/lib/libcrypto/bn/asm/co-586.pl          |  287
-rw-r--r--  src/lib/libcrypto/bn/asm/ia64-mont.pl       |  851
-rw-r--r--  src/lib/libcrypto/bn/asm/ia64.S             | 1555
-rw-r--r--  src/lib/libcrypto/bn/asm/mips-mont.pl       |  426
-rw-r--r--  src/lib/libcrypto/bn/asm/mips.pl            | 2234
-rw-r--r--  src/lib/libcrypto/bn/asm/modexp512-x86_64.pl | 1388
-rw-r--r--  src/lib/libcrypto/bn/asm/pa-risc2.s         | 1618
-rw-r--r--  src/lib/libcrypto/bn/asm/pa-risc2W.s        | 1605
-rw-r--r--  src/lib/libcrypto/bn/asm/parisc-mont.pl     |  993
-rw-r--r--  src/lib/libcrypto/bn/asm/ppc-mont.pl        |  334
-rw-r--r--  src/lib/libcrypto/bn/asm/ppc.pl             | 1998
-rw-r--r--  src/lib/libcrypto/bn/asm/ppc64-mont.pl      | 1088
-rw-r--r--  src/lib/libcrypto/bn/asm/s390x-gf2m.pl      |  221
-rw-r--r--  src/lib/libcrypto/bn/asm/s390x-mont.pl      |  277
-rwxr-xr-x  src/lib/libcrypto/bn/asm/s390x.S            |  678
-rw-r--r--  src/lib/libcrypto/bn/asm/sparcv8.S          | 1458
-rw-r--r--  src/lib/libcrypto/bn/asm/sparcv8plus.S      | 1558
-rw-r--r--  src/lib/libcrypto/bn/asm/sparcv9-mont.pl    |  606
-rwxr-xr-x  src/lib/libcrypto/bn/asm/sparcv9a-mont.pl   |  882
-rw-r--r--  src/lib/libcrypto/bn/asm/via-mont.pl        |  242
-rw-r--r--  src/lib/libcrypto/bn/asm/x86-gf2m.pl        |  313
-rwxr-xr-x  src/lib/libcrypto/bn/asm/x86-mont.pl        |  593
-rw-r--r--  src/lib/libcrypto/bn/asm/x86.pl             |   28
-rw-r--r--  src/lib/libcrypto/bn/asm/x86/add.pl         |   76
-rw-r--r--  src/lib/libcrypto/bn/asm/x86/comba.pl       |  277
-rw-r--r--  src/lib/libcrypto/bn/asm/x86/div.pl         |   15
-rw-r--r--  src/lib/libcrypto/bn/asm/x86/mul.pl         |   77
-rw-r--r--  src/lib/libcrypto/bn/asm/x86/mul_add.pl     |   87
-rw-r--r--  src/lib/libcrypto/bn/asm/x86/sqr.pl         |   60
-rw-r--r--  src/lib/libcrypto/bn/asm/x86/sub.pl         |   76
-rw-r--r--  src/lib/libcrypto/bn/asm/x86_64-gcc.c       |  598
-rw-r--r--  src/lib/libcrypto/bn/asm/x86_64-gf2m.pl     |  390
-rwxr-xr-x  src/lib/libcrypto/bn/asm/x86_64-mont.pl     | 1504
-rwxr-xr-x  src/lib/libcrypto/bn/asm/x86_64-mont5.pl    | 1071
-rw-r--r--  src/lib/libcrypto/bn/bn.h                   |  821
-rw-r--r--  src/lib/libcrypto/bn/bn_add.c               |  313
-rw-r--r--  src/lib/libcrypto/bn/bn_asm.c               | 1098
-rw-r--r--  src/lib/libcrypto/bn/bn_blind.c             |  388
-rw-r--r--  src/lib/libcrypto/bn/bn_const.c             |  409
-rw-r--r--  src/lib/libcrypto/bn/bn_ctx.c               |  478
-rw-r--r--  src/lib/libcrypto/bn/bn_depr.c              |  115
-rw-r--r--  src/lib/libcrypto/bn/bn_div.c               |  381
-rw-r--r--  src/lib/libcrypto/bn/bn_err.c               |  150
-rw-r--r--  src/lib/libcrypto/bn/bn_exp.c               | 1097
-rw-r--r--  src/lib/libcrypto/bn/bn_exp2.c              |  308
-rw-r--r--  src/lib/libcrypto/bn/bn_gcd.c               |  688
-rw-r--r--  src/lib/libcrypto/bn/bn_gf2m.c              | 1320
-rw-r--r--  src/lib/libcrypto/bn/bn_kron.c              |  185
-rw-r--r--  src/lib/libcrypto/bn/bn_lcl.h               |  484
-rw-r--r--  src/lib/libcrypto/bn/bn_lib.c               |  883
-rw-r--r--  src/lib/libcrypto/bn/bn_mod.c               |  305
-rw-r--r--  src/lib/libcrypto/bn/bn_mont.c              |  538
-rw-r--r--  src/lib/libcrypto/bn/bn_mpi.c               |  132
-rw-r--r--  src/lib/libcrypto/bn/bn_mul.c               | 1171
-rw-r--r--  src/lib/libcrypto/bn/bn_nist.c              | 1270
-rw-r--r--  src/lib/libcrypto/bn/bn_prime.c             |  518
-rw-r--r--  src/lib/libcrypto/bn/bn_prime.h             |  319
-rw-r--r--  src/lib/libcrypto/bn/bn_prime.pl            |  103
-rw-r--r--  src/lib/libcrypto/bn/bn_print.c             |  393
-rw-r--r--  src/lib/libcrypto/bn/bn_rand.c              |  290
-rw-r--r--  src/lib/libcrypto/bn/bn_recp.c              |  263
-rw-r--r--  src/lib/libcrypto/bn/bn_shift.c             |  218
-rw-r--r--  src/lib/libcrypto/bn/bn_sqr.c               |  286
-rw-r--r--  src/lib/libcrypto/bn/bn_sqrt.c              |  405
-rw-r--r--  src/lib/libcrypto/bn/bn_word.c              |  233
-rw-r--r--  src/lib/libcrypto/bn/bn_x931p.c             |  279
70 files changed, 0 insertions, 42877 deletions
diff --git a/src/lib/libcrypto/bn/asm/alpha-mont.pl b/src/lib/libcrypto/bn/asm/alpha-mont.pl
deleted file mode 100644
index 41700d5bd5..0000000000
--- a/src/lib/libcrypto/bn/asm/alpha-mont.pl
+++ /dev/null
@@ -1,316 +0,0 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# On 21264 RSA sign performance improves by 70/35/20/15 percent for
# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
# instructed to '-tune host' code with in-line assembler. Other
# benchmarks improve by 15-20%. To anchor it to something else, the
# code provides approximately the same performance per GHz as AMD64.
# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
# difference.

# int bn_mul_mont(
$rp="a0";	# BN_ULONG *rp,
$ap="a1";	# const BN_ULONG *ap,
$bp="a2";	# const BN_ULONG *bp,
$np="a3";	# const BN_ULONG *np,
$n0="a4";	# const BN_ULONG *n0,
$num="a5";	# int num);

$lo0="t0";
$hi0="t1";
$lo1="t2";
$hi1="t3";
$aj="t4";
$bi="t5";
$nj="t6";
$tp="t7";
$alo="t8";
$ahi="t9";
$nlo="t10";
$nhi="t11";
$tj="t12";
$i="s3";
$j="s4";
$m1="s5";

$code=<<___;
#include <machine/asm.h>

.text

.set	noat
.set	noreorder

.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	lda	sp,-48(sp)
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp
	.mask	0x0400f000,-48
	.frame	fp,48,ra
	.prologue 0

	.align	4
	.set	reorder
	sextl	$num,$num
	mov	0,v0
	cmplt	$num,4,AT
	bne	AT,.Lexit

	ldq	$hi0,0($ap)	# ap[0]
	s8addq	$num,16,AT
	ldq	$aj,8($ap)
	subq	sp,AT,sp
	ldq	$bi,0($bp)	# bp[0]
	lda	AT,-4096(zero)	# mov	-4096,AT
	ldq	$n0,0($n0)
	and	sp,AT,sp

	mulq	$hi0,$bi,$lo0
	ldq	$hi1,0($np)	# np[0]
	umulh	$hi0,$bi,$hi0
	ldq	$nj,8($np)

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	2,$j
	umulh	$aj,$bi,$ahi
	mov	sp,$tp

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
	s8addq	$j,$np,$nj
.align	4
.L1st:
	.set	noreorder
	ldq	$aj,0($aj)
	addl	$j,1,$j
	ldq	$nj,0($nj)
	lda	$tp,8($tp)

	addq	$alo,$hi0,$lo0
	mulq	$aj,$bi,$alo
	cmpult	$lo0,$hi0,AT
	addq	$nlo,$hi1,$lo1

	mulq	$nj,$m1,$nlo
	addq	$ahi,AT,$hi0
	cmpult	$lo1,$hi1,v0
	cmplt	$j,$num,$tj

	umulh	$aj,$bi,$ahi
	addq	$nhi,v0,$hi1
	addq	$lo1,$lo0,$lo1
	s8addq	$j,$ap,$aj

	umulh	$nj,$m1,$nhi
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1
	s8addq	$j,$np,$nj

	stq	$lo1,-8($tp)
	nop
	unop
	bne	$tj,.L1st
	.set	reorder

	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1

	stq	$lo1,0($tp)

	addq	$hi1,$hi0,$hi1
	cmpult	$hi1,$hi0,AT
	stq	$hi1,8($tp)
	stq	AT,16($tp)

	mov	1,$i
.align	4
.Louter:
	s8addq	$i,$bp,$bi
	ldq	$hi0,0($ap)
	ldq	$aj,8($ap)
	ldq	$bi,0($bi)
	ldq	$hi1,0($np)
	ldq	$nj,8($np)
	ldq	$tj,0(sp)

	mulq	$hi0,$bi,$lo0
	umulh	$hi0,$bi,$hi0

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	mov	2,$j
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	sp,$tp
	umulh	$aj,$bi,$ahi

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
.align	4
.Linner:
	.set	noreorder
	ldq	$tj,8($tp)	#L0
	nop			#U1
	ldq	$aj,0($aj)	#L1
	s8addq	$j,$np,$nj	#U0

	ldq	$nj,0($nj)	#L0
	nop			#U1
	addq	$alo,$hi0,$lo0	#L1
	lda	$tp,8($tp)

	mulq	$aj,$bi,$alo	#U1
	cmpult	$lo0,$hi0,AT	#L0
	addq	$nlo,$hi1,$lo1	#L1
	addl	$j,1,$j

	mulq	$nj,$m1,$nlo	#U1
	addq	$ahi,AT,$hi0	#L0
	addq	$lo0,$tj,$lo0	#L1
	cmpult	$lo1,$hi1,v0	#U0

	umulh	$aj,$bi,$ahi	#U1
	cmpult	$lo0,$tj,AT	#L0
	addq	$lo1,$lo0,$lo1	#L1
	addq	$nhi,v0,$hi1	#U0

	umulh	$nj,$m1,$nhi	#U1
	s8addq	$j,$ap,$aj	#L0
	cmpult	$lo1,$lo0,v0	#L1
	cmplt	$j,$num,$tj	#U0	# borrow $tj

	addq	$hi0,AT,$hi0	#L0
	addq	$hi1,v0,$hi1	#U1
	stq	$lo1,-8($tp)	#L1
	bne	$tj,.Linner	#U0
	.set	reorder

	ldq	$tj,8($tp)
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	ldq	$tj,16($tp)
	addq	$lo1,$lo0,$j
	cmpult	$j,$lo0,v0
	addq	$hi1,v0,$hi1

	addq	$hi1,$hi0,$lo1
	stq	$j,0($tp)
	cmpult	$lo1,$hi0,$hi1
	addq	$lo1,$tj,$lo1
	cmpult	$lo1,$tj,AT
	addl	$i,1,$i
	addq	$hi1,AT,$hi1
	stq	$lo1,8($tp)
	cmplt	$i,$num,$tj	# borrow $tj
	stq	$hi1,16($tp)
	bne	$tj,.Louter

	s8addq	$num,sp,$tj	# &tp[num]
	mov	$rp,$bp		# put rp aside
	mov	sp,$tp
	mov	sp,$ap
	mov	0,$hi0		# clear borrow bit

.align	4
.Lsub:	ldq	$lo0,0($tp)
	ldq	$lo1,0($np)
	lda	$tp,8($tp)
	lda	$np,8($np)
	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
	cmpult	$lo0,$lo1,AT
	subq	$lo1,$hi0,$lo0
	cmpult	$lo1,$lo0,$hi0
	or	$hi0,AT,$hi0
	stq	$lo0,0($rp)
	cmpult	$tp,$tj,v0
	lda	$rp,8($rp)
	bne	v0,.Lsub

	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
	mov	sp,$tp
	mov	$bp,$rp		# restore rp

	and	sp,$hi0,$ap
	bic	$bp,$hi0,$bp
	bis	$bp,$ap,$ap	# ap=borrow?tp:rp

.align	4
.Lcopy:	ldq	$aj,0($ap)	# copy or in-place refresh
	lda	$tp,8($tp)
	lda	$rp,8($rp)
	lda	$ap,8($ap)
	stq	zero,-8($tp)	# zap tp
	cmpult	$tp,$tj,AT
	stq	$aj,-8($rp)
	bne	AT,.Lcopy
	mov	1,v0

.Lexit:
	.set	noreorder
	mov	fp,sp
	/*ldq	ra,0(sp)*/
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,48(sp)
	ret	(ra)
.end	bn_mul_mont
.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

print $code;
close STDOUT;
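Editor's note: the module above unrolls the textbook word-by-word Montgomery loop. The following is a minimal Perl sketch of that loop, assuming 16-bit limbs so every product stays exact in native Perl integers (the assembler uses full 64-bit words); names such as $m1 and @tp mirror the register names above, but the code is purely illustrative and is not part of the deleted sources.

#!/usr/bin/env perl
# Minimal sketch of the word-level Montgomery multiplication that
# alpha-mont.pl unrolls. 16-bit limbs keep products within native
# Perl integer precision; limb arrays are little-endian.
use strict;
use warnings;

my $BITS = 16;
my $MASK = (1 << $BITS) - 1;

# n0 = -np[0]^{-1} mod 2^BITS, by Newton iteration on the odd low word
sub mont_n0 {
	my $w = shift;
	my $x = $w;			# seed is correct to 3 bits for odd $w
	$x = ($x * ((2 - $w * $x) & $MASK)) & $MASK for 1 .. 4;
	return (-$x) & $MASK;
}

# rp = ap * bp * 2^(-BITS*num) mod np
sub bn_mul_mont_ref {
	my ($ap, $bp, $np, $num) = @_;
	my $n0 = mont_n0($np->[0]);
	my @tp = (0) x ($num + 1);
	for my $i (0 .. $num - 1) {
		my $bi = $bp->[$i];
		# m1 makes tp[0] + ap[0]*bi + np[0]*m1 divisible by 2^BITS
		my $m1 = (($tp[0] + $ap->[0] * $bi) * $n0) & $MASK;
		my $c = ($tp[0] + $ap->[0] * $bi + $np->[0] * $m1) >> $BITS;
		for my $j (1 .. $num - 1) {
			my $t = $tp[$j] + $ap->[$j] * $bi + $np->[$j] * $m1 + $c;
			$tp[$j - 1] = $t & $MASK;	# stored one slot down:
			$c = $t >> $BITS;		# the implicit /2^BITS
		}
		my $t = $tp[$num] + $c;
		$tp[$num - 1] = $t & $MASK;
		$tp[$num] = $t >> $BITS;
	}
	# the .Lsub/.Lcopy tail: subtract np once, keep tp if that borrowed
	my @r;
	my $borrow = 0;
	for my $j (0 .. $num - 1) {
		my $d = $tp[$j] - $np->[$j] - $borrow;
		$borrow = $d < 0 ? 1 : 0;
		push @r, $d & $MASK;
	}
	return $tp[$num] < $borrow ? [@tp[0 .. $num - 1]] : \@r;
}

Each pass of the outer loop corresponds to one .Louter iteration above; the inner loop fuses the ap[j]*bi and np[j]*m1 products exactly as the assembler's .L1st/.Linner bodies do.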
diff --git a/src/lib/libcrypto/bn/asm/armv4-gf2m.pl b/src/lib/libcrypto/bn/asm/armv4-gf2m.pl
deleted file mode 100644
index c52e0b75b5..0000000000
--- a/src/lib/libcrypto/bn/asm/armv4-gf2m.pl
+++ /dev/null
@@ -1,278 +0,0 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
# C for the time being... Except that it has two code paths: pure
# integer code suitable for any ARMv4 and later CPU and NEON code
# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
# faster than compiler-generated code. For ECDH and ECDSA verify (but
# not for ECDSA sign) it means 25%-45% improvement depending on key
# length, more for longer keys. Even though NEON 1x1 multiplication
# runs in even less cycles, ~30, improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get NEON glow...

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }

$code=<<___;
#include "arm_arch.h"

.text
.code	32

#if __ARM_ARCH__>=7
.fpu	neon

.type	mul_1x1_neon,%function
.align	5
mul_1x1_neon:
	vshl.u64	`&Dlo("q1")`,d16,#8	@ q1-q3 are shifted copies of $a
	vmull.p8	`&Q("d0")`,d16,d17	@ a·bb
	vshl.u64	`&Dlo("q2")`,d16,#16
	vmull.p8	q1,`&Dlo("q1")`,d17	@ a<<8·bb
	vshl.u64	`&Dlo("q3")`,d16,#24
	vmull.p8	q2,`&Dlo("q2")`,d17	@ a<<16·bb
	vshr.u64	`&Dlo("q1")`,#8
	vmull.p8	q3,`&Dlo("q3")`,d17	@ a<<24·bb
	vshl.u64	`&Dhi("q1")`,#24
	veor		d0,`&Dlo("q1")`
	vshr.u64	`&Dlo("q2")`,#16
	veor		d0,`&Dhi("q1")`
	vshl.u64	`&Dhi("q2")`,#16
	veor		d0,`&Dlo("q2")`
	vshr.u64	`&Dlo("q3")`,#24
	veor		d0,`&Dhi("q2")`
	vshl.u64	`&Dhi("q3")`,#8
	veor		d0,`&Dlo("q3")`
	veor		d0,`&Dhi("q3")`
	bx	lr
.size	mul_1x1_neon,.-mul_1x1_neon
#endif
___
################
# private interface to mul_1x1_ialu
#
$a="r1";
$b="r0";

($a0,$a1,$a2,$a12,$a4,$a14)=
($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);

$mask="r12";

$code.=<<___;
.type	mul_1x1_ialu,%function
.align	5
mul_1x1_ialu:
	mov	$a0,#0
	bic	$a1,$a,#3<<30		@ a1=a&0x3fffffff
	str	$a0,[sp,#0]		@ tab[0]=0
	add	$a2,$a1,$a1		@ a2=a1<<1
	str	$a1,[sp,#4]		@ tab[1]=a1
	eor	$a12,$a1,$a2		@ a1^a2
	str	$a2,[sp,#8]		@ tab[2]=a2
	mov	$a4,$a1,lsl#2		@ a4=a1<<2
	str	$a12,[sp,#12]		@ tab[3]=a1^a2
	eor	$a14,$a1,$a4		@ a1^a4
	str	$a4,[sp,#16]		@ tab[4]=a4
	eor	$a0,$a2,$a4		@ a2^a4
	str	$a14,[sp,#20]		@ tab[5]=a1^a4
	eor	$a12,$a12,$a4		@ a1^a2^a4
	str	$a0,[sp,#24]		@ tab[6]=a2^a4
	and	$i0,$mask,$b,lsl#2
	str	$a12,[sp,#28]		@ tab[7]=a1^a2^a4

	and	$i1,$mask,$b,lsr#1
	ldr	$lo,[sp,$i0]		@ tab[b       & 0x7]
	and	$i0,$mask,$b,lsr#4
	ldr	$t1,[sp,$i1]		@ tab[b >>  3 & 0x7]
	and	$i1,$mask,$b,lsr#7
	ldr	$t0,[sp,$i0]		@ tab[b >>  6 & 0x7]
	eor	$lo,$lo,$t1,lsl#3	@ stall
	mov	$hi,$t1,lsr#29
	ldr	$t1,[sp,$i1]		@ tab[b >>  9 & 0x7]

	and	$i0,$mask,$b,lsr#10
	eor	$lo,$lo,$t0,lsl#6
	eor	$hi,$hi,$t0,lsr#26
	ldr	$t0,[sp,$i0]		@ tab[b >> 12 & 0x7]

	and	$i1,$mask,$b,lsr#13
	eor	$lo,$lo,$t1,lsl#9
	eor	$hi,$hi,$t1,lsr#23
	ldr	$t1,[sp,$i1]		@ tab[b >> 15 & 0x7]

	and	$i0,$mask,$b,lsr#16
	eor	$lo,$lo,$t0,lsl#12
	eor	$hi,$hi,$t0,lsr#20
	ldr	$t0,[sp,$i0]		@ tab[b >> 18 & 0x7]

	and	$i1,$mask,$b,lsr#19
	eor	$lo,$lo,$t1,lsl#15
	eor	$hi,$hi,$t1,lsr#17
	ldr	$t1,[sp,$i1]		@ tab[b >> 21 & 0x7]

	and	$i0,$mask,$b,lsr#22
	eor	$lo,$lo,$t0,lsl#18
	eor	$hi,$hi,$t0,lsr#14
	ldr	$t0,[sp,$i0]		@ tab[b >> 24 & 0x7]

	and	$i1,$mask,$b,lsr#25
	eor	$lo,$lo,$t1,lsl#21
	eor	$hi,$hi,$t1,lsr#11
	ldr	$t1,[sp,$i1]		@ tab[b >> 27 & 0x7]

	tst	$a,#1<<30
	and	$i0,$mask,$b,lsr#28
	eor	$lo,$lo,$t0,lsl#24
	eor	$hi,$hi,$t0,lsr#8
	ldr	$t0,[sp,$i0]		@ tab[b >> 30     ]

	eorne	$lo,$lo,$b,lsl#30
	eorne	$hi,$hi,$b,lsr#2
	tst	$a,#1<<31
	eor	$lo,$lo,$t1,lsl#27
	eor	$hi,$hi,$t1,lsr#5
	eorne	$lo,$lo,$b,lsl#31
	eorne	$hi,$hi,$b,lsr#1
	eor	$lo,$lo,$t0,lsl#30
	eor	$hi,$hi,$t0,lsr#2

	mov	pc,lr
.size	mul_1x1_ialu,.-mul_1x1_ialu
___
################
# void	bn_GF2m_mul_2x2(BN_ULONG *r,
#	BN_ULONG a1,BN_ULONG a0,
#	BN_ULONG b1,BN_ULONG b0);	# r[3..0]=a1a0·b1b0

($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));

$code.=<<___;
.global	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,%function
.align	5
bn_GF2m_mul_2x2:
#if __ARM_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
.Lpic:	ldr	r12,[pc,r12]
	tst	r12,#1
	beq	.Lialu

	veor	$A1,$A1
	vmov.32	$B1,r3,r3		@ two copies of b1
	vmov.32	${A1}[0],r1		@ a1

	veor	$A0,$A0
	vld1.32	${B0}[],[sp,:32]	@ two copies of b0
	vmov.32	${A0}[0],r2		@ a0
	mov	r12,lr

	vmov	d16,$A1
	vmov	d17,$B1
	bl	mul_1x1_neon		@ a1·b1
	vmov	$A1B1,d0

	vmov	d16,$A0
	vmov	d17,$B0
	bl	mul_1x1_neon		@ a0·b0
	vmov	$A0B0,d0

	veor	d16,$A0,$A1
	veor	d17,$B0,$B1
	veor	$A0,$A0B0,$A1B1
	bl	mul_1x1_neon		@ (a0+a1)·(b0+b1)

	veor	d0,$A0			@ (a0+a1)·(b0+b1)-a0·b0-a1·b1
	vshl.u64 d1,d0,#32
	vshr.u64 d0,d0,#32
	veor	$A0B0,d1
	veor	$A1B1,d0
	vst1.32	{${A0B0}[0]},[r0,:32]!
	vst1.32	{${A0B0}[1]},[r0,:32]!
	vst1.32	{${A1B1}[0]},[r0,:32]!
	vst1.32	{${A1B1}[1]},[r0,:32]
	bx	r12
.align	4
.Lialu:
#endif
___
$ret="r10";	# reassigned 1st argument
$code.=<<___;
	stmdb	sp!,{r4-r10,lr}
	mov	$ret,r0			@ reassign 1st argument
	mov	$b,r3			@ $b=b1
	ldr	r3,[sp,#32]		@ load b0
	mov	$mask,#7<<2
	sub	sp,sp,#32		@ allocate tab[8]

	bl	mul_1x1_ialu		@ a1·b1
	str	$lo,[$ret,#8]
	str	$hi,[$ret,#12]

	eor	$b,$b,r3		@ flip b0 and b1
	eor	$a,$a,r2		@ flip a0 and a1
	eor	r3,r3,$b
	eor	r2,r2,$a
	eor	$b,$b,r3
	eor	$a,$a,r2
	bl	mul_1x1_ialu		@ a0·b0
	str	$lo,[$ret]
	str	$hi,[$ret,#4]

	eor	$a,$a,r2
	eor	$b,$b,r3
	bl	mul_1x1_ialu		@ (a1+a0)·(b1+b0)
___
@r=map("r$_",(6..9));
$code.=<<___;
	ldmia	$ret,{@r[0]-@r[3]}
	eor	$lo,$lo,$hi
	eor	$hi,$hi,@r[1]
	eor	$lo,$lo,@r[0]
	eor	$hi,$hi,@r[2]
	eor	$lo,$lo,@r[3]
	eor	$hi,$hi,@r[3]
	str	$hi,[$ret,#8]
	eor	$lo,$lo,$hi
	add	sp,sp,#32		@ destroy tab[8]
	str	$lo,[$ret,#4]

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r10,pc}
#else
	ldmia	sp!,{r4-r10,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-(.Lpic+8)
#endif
.asciz	"GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	5

.comm	OPENSSL_armcap_P,4,4
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
close STDOUT;	# enforce flush
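Editor's note: mul_1x1_ialu above is a table-driven carry-less multiply, and bn_GF2m_mul_2x2 combines three such products Karatsuba-style. A Perl sketch of both, assuming a 64-bit Perl build; illustrative only, not part of the deleted sources.

#!/usr/bin/env perl
# Carry-less 32x32->64 multiply as mul_1x1_ialu does it: tab[] holds
# the GF(2) products of (a & 0x3fffffff) by 0..7, b is consumed three
# bits at a time, and a's top two bits are folded in afterwards with
# conditional XORs (the eorne pairs above).
use strict;
use warnings;

sub gf2_mul_1x1 {
	my ($a, $b) = @_;
	my $a1 = $a & 0x3fffffff;		# low 30 bits of a
	my @tab = (0) x 8;
	for my $i (1 .. 7) {			# tab[i] = a1 "times" i over GF(2)
		$tab[$i] = ($i & 1 ? $a1      : 0)
			 ^ ($i & 2 ? $a1 << 1 : 0)
			 ^ ($i & 4 ? $a1 << 2 : 0);
	}
	my $r = 0;
	for (my $k = 0; $k < 32; $k += 3) {	# 11 groups of 3 bits of b
		$r ^= $tab[($b >> $k) & 7] << $k;
	}
	$r ^= $b << 30 if $a & (1 << 30);	# fold in a's top two bits
	$r ^= $b << 31 if $a & (1 << 31);
	return $r & 0xffffffffffffffff;
}

# bn_GF2m_mul_2x2 then needs only three 1x1 products (Karatsuba):
sub gf2_mul_2x2 {
	my ($a1, $a0, $b1, $b0) = @_;
	my $hh  = gf2_mul_1x1($a1, $b1);
	my $ll  = gf2_mul_1x1($a0, $b0);
	my $mid = gf2_mul_1x1($a1 ^ $a0, $b1 ^ $b0) ^ $hh ^ $ll;
	return ($hh ^ ($mid >> 32),			# high 64 bits
		$ll ^ (($mid << 32) & 0xffffffffffffffff));	# low 64 bits
}

Because addition in GF(2)[x] is XOR, the middle Karatsuba term needs no carries, which is why the assembler can compute it with three calls and a handful of eor instructions.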
diff --git a/src/lib/libcrypto/bn/asm/armv4-mont.pl b/src/lib/libcrypto/bn/asm/armv4-mont.pl
deleted file mode 100644
index f78a8b5f0f..0000000000
--- a/src/lib/libcrypto/bn/asm/armv4-mont.pl
+++ /dev/null
@@ -1,204 +0,0 @@
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations; ABI and instruction syntax are identical.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
$ahi="r11";	# fp
$nlo="r12";	# ip
###########	# r13 is stack pointer
$nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";	$_bpend=$_num;

$code=<<___;
.text

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	2
bn_mul_mont:
	stmdb	sp!,{r0,r2}		@ sp points at argument block
	ldr	$num,[sp,#3*4]		@ load num
	cmp	$num,#2
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

.Louter:
	sub	$tj,$num,sp		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

	and	$ap,$tp,$nhi
	bic	$np,$rp,$nhi
	orr	$ap,$ap,$np		@ ap=borrow?tp:rp

.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
	str	sp,[$tp],#4		@ zap tp
	str	$tj,[$rp],#4
	cmp	$tp,$num
	bne	.Lcopy

	add	sp,$num,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.size	bn_mul_mont,.-bn_mul_mont
.asciz	"Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
close STDOUT;
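Editor's note: both Montgomery modules in this diff finish the same way (.Lsub/.Lcopy): one unconditional subtraction of np, then a mask-based select between tp and tp-np so the choice does not branch on secret data. A small Perl sketch of that tail, with 32-bit words (no products, so native integers suffice); illustrative only.

# Mask-based result selection as in the .Lsub/.Lcopy tails: subtract
# np from tp once, then choose tp or the difference via a borrow-derived
# mask. $top is tp's extra carry word; limb arrays are little-endian.
sub mont_final_select {
	my ($tp, $np, $top) = @_;
	my @d;
	my $borrow = 0;
	for my $i (0 .. $#$tp) {
		my $t = $tp->[$i] - $np->[$i] - $borrow;
		$borrow = $t < 0 ? 1 : 0;
		push @d, $t & 0xffffffff;
	}
	my $mask = $top < $borrow ? 0xffffffff : 0;	# all-ones iff tp < np
	return [ map { ($tp->[$_] & $mask) | ($d[$_] & ~$mask & 0xffffffff) }
		 0 .. $#$tp ];
}

The and/bic/orr (ARM) and and/bic/bis (Alpha) triples above are the register-level form of the masked merge in the map block.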
diff --git a/src/lib/libcrypto/bn/asm/bn-586.pl b/src/lib/libcrypto/bn/asm/bn-586.pl
deleted file mode 100644
index 332ef3e91d..0000000000
--- a/src/lib/libcrypto/bn/asm/bn-586.pl
+++ /dev/null
@@ -1,774 +0,0 @@
#!/usr/local/bin/perl

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");

&asm_finish();

sub bn_mul_add_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("maw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry_in
		&jmp(&label("maw_sse2_entry"));

	&set_label("maw_sse2_unrolled",16);
		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
		&movd(&DWP(0,$r,"",0),"mm1");
		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
		&psrlq("mm1",32);		# mm1 = carry0
		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
		&movd(&DWP(4,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry1
		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
		&add($a,32);
		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
		&movd(&DWP(8,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry2
		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
		&movd(&DWP(12,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry3
		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
		&movd(&DWP(16,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry4
		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
		&movd(&DWP(20,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry5
		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
		&movd(&DWP(24,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry6
		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
		&movd(&DWP(28,$r,"",0),"mm1");
		&lea($r,&DWP(32,$r));
		&psrlq("mm1",32);		# mm1 = carry_out

		&sub($c,8);
		&jz(&label("maw_sse2_exit"));
	&set_label("maw_sse2_entry");
		&test($c,0xfffffff8);
		&jnz(&label("maw_sse2_unrolled"));

	&set_label("maw_sse2_loop",4);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&movd("mm3",&DWP(0,$r));	# mm3 = r[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm3");		# carry += r[i]
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));
		&jnz(&label("maw_sse2_loop"));
	&set_label("maw_sse2_exit");
		&movd("eax","mm1");		# c = carry_out
		&emms();
		&ret();

	&set_label("maw_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ebp";
	$r="edi";
	$c="esi";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	#

	&mov("ecx",&wparam(2));	#
	&mov($a,&wparam(1));	#

	&and("ecx",0xfffffff8);	# num / 8
	&mov($w,&wparam(3));	#

	&push("ecx");		# Up the stack for a tmp variable

	&jz(&label("maw_finish"));

	&set_label("maw_loop",16);

	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		&mov("eax",&DWP($i,$a));	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+= c
		&adc("edx",0);			# H(t)+=carry
		&add("eax",&DWP($i,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		&mov(&DWP($i,$r),"eax");	# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		}

	&comment("");
	&sub("ecx",8);
	&lea($a,&DWP(32,$a));
	&lea($r,&DWP(32,$r));
	&jnz(&label("maw_loop"));

	&set_label("maw_finish",0);
	&mov("ecx",&wparam(2));	# get num
	&and("ecx",7);
	&jnz(&label("maw_finish2"));	# helps branch prediction
	&jmp(&label("maw_end"));

	&set_label("maw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a));	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		&adc("edx",0);			# H(t)+=carry
		&add("eax",&DWP($i*4,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		&dec("ecx") if ($i != 7-1);
		&mov(&DWP($i*4,$r),"eax");	# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		&jz(&label("maw_end")) if ($i != 7-1);
		}
	&set_label("maw_end",0);
	&mov("eax",$c);

	&pop("ecx");	# clear variable from stack

	&function_end($name);
	}

sub bn_mul_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("mw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry = 0

	&set_label("mw_sse2_loop",16);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));
		&jnz(&label("mw_sse2_loop"));

		&movd("eax","mm1");		# return carry
		&emms();
		&ret();
	&set_label("mw_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ecx";
	$r="edi";
	$c="esi";
	$num="ebp";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	#
	&mov($a,&wparam(1));	#
	&mov($num,&wparam(2));	#
	&mov($w,&wparam(3));	#

	&and($num,0xfffffff8);	# num / 8
	&jz(&label("mw_finish"));

	&set_label("mw_loop",0);
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		&mov("eax",&DWP($i,$a,"",0));	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		# XXX

		&adc("edx",0);			# H(t)+=carry
		&mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);

		&mov($c,"edx");			# c=  H(t);
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jz(&label("mw_finish"));
	&jmp(&label("mw_loop"));

	&set_label("mw_finish",0);
	&mov($num,&wparam(2));	# get num
	&and($num,7);
	&jnz(&label("mw_finish2"));
	&jmp(&label("mw_end"));

	&set_label("mw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a,"",0));# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		# XXX
		&adc("edx",0);			# H(t)+=carry
		&mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		&dec($num) if ($i != 7-1);
		&jz(&label("mw_end")) if ($i != 7-1);
		}
	&set_label("mw_end",0);
	&mov("eax",$c);

	&function_end($name);
	}

sub bn_sqr_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("sqr_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));

	&set_label("sqr_sse2_loop",16);
		&movd("mm0",&DWP(0,$a));	# mm0 = a[i]
		&pmuludq("mm0","mm0");		# a[i] *= a[i]
		&lea($a,&DWP(4,$a));		# a++
		&movq(&QWP(0,$r),"mm0");	# r[i] = a[i]*a[i]
		&sub($c,1);
		&lea($r,&DWP(8,$r));		# r += 2
		&jnz(&label("sqr_sse2_loop"));

		&emms();
		&ret();
	&set_label("sqr_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$r="esi";
	$a="edi";
	$num="ebx";

	&mov($r,&wparam(0));	#
	&mov($a,&wparam(1));	#
	&mov($num,&wparam(2));	#

	&and($num,0xfffffff8);	# num / 8
	&jz(&label("sw_finish"));

	&set_label("sw_loop",0);
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");
		&mov("eax",&DWP($i,$a,"",0));	# *a
		# XXX
		&mul("eax");			# *a * *a
		&mov(&DWP($i*2,$r,"",0),"eax");	#
		&mov(&DWP($i*2+4,$r,"",0),"edx");#
		}

	&comment("");
	&add($a,32);
	&add($r,64);
	&sub($num,8);
	&jnz(&label("sw_loop"));

	&set_label("sw_finish",0);
	&mov($num,&wparam(2));	# get num
	&and($num,7);
	&jz(&label("sw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a,"",0));	# *a
		# XXX
		&mul("eax");			# *a * *a
		&mov(&DWP($i*8,$r,"",0),"eax");	#
		&dec($num) if ($i != 7-1);
		&mov(&DWP($i*8+4,$r,"",0),"edx");
		&jz(&label("sw_end")) if ($i != 7-1);
		}
	&set_label("sw_end",0);

	&function_end($name);
	}

sub bn_div_words
	{
	local($name)=@_;

	&function_begin_B($name,"");
	&mov("edx",&wparam(0));	#
	&mov("eax",&wparam(1));	#
	&mov("ecx",&wparam(2));	#
	&div("ecx");
	&ret();
	&function_end_B($name);
	}

sub bn_add_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	&mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	&mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	&and($num,0xfffffff8);	# num / 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov($tmp2,&DWP($i*4,$b,"",0));	# *b
		&add($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&add($tmp1,$tmp2);
		&adc($c,0);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	&jz(&label("aw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov($tmp2,&DWP($i*4,$b,"",0));# *b
		&add($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&add($tmp1,$tmp2);
		&adc($c,0);
		&dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}

sub bn_sub_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	&mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	&mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	&and($num,0xfffffff8);	# num / 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov($tmp2,&DWP($i*4,$b,"",0));	# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	&jz(&label("aw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov($tmp2,&DWP($i*4,$b,"",0));# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}

sub bn_sub_part_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	&mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	&mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	&and($num,0xfffffff8);	# num / 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov($tmp2,&DWP($i*4,$b,"",0));	# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	&jz(&label("aw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP(0,$a,"",0));	# *a
		&mov($tmp2,&DWP(0,$b,"",0));# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&mov(&DWP(0,$r,"",0),$tmp1);	# *r
		&add($a, 4);
		&add($b, 4);
		&add($r, 4);
		&dec($num) if ($i != 6);
		&jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

	&cmp(&wparam(4),0);
	&je(&label("pw_end"));

	&mov($num,&wparam(4));	# get dl
	&cmp($num,0);
	&je(&label("pw_end"));
	&jge(&label("pw_pos"));

	&comment("pw_neg");
	&mov($tmp2,0);
	&sub($tmp2,$num);
	&mov($num,$tmp2);
	&and($num,0xfffffff8);	# num / 8
	&jz(&label("pw_neg_finish"));

	&set_label("pw_neg_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("dl<0 Round $i");

		&mov($tmp1,0);
		&mov($tmp2,&DWP($i*4,$b,"",0));	# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		}

	&comment("");
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_neg_loop"));

	&set_label("pw_neg_finish",0);
	&mov($tmp2,&wparam(4));	# get dl
	&mov($num,0);
	&sub($num,$tmp2);
	&and($num,7);
	&jz(&label("pw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("dl<0 Tail Round $i");
		&mov($tmp1,0);
		&mov($tmp2,&DWP($i*4,$b,"",0));# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&jz(&label("pw_end")) if ($i != 6);
		}

	&jmp(&label("pw_end"));

	&set_label("pw_pos",0);

	&and($num,0xfffffff8);	# num / 8
	&jz(&label("pw_pos_finish"));

	&set_label("pw_pos_loop",0);

	for ($i=0; $i<8; $i++)
		{
		&comment("dl>0 Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&sub($tmp1,$c);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&jnc(&label("pw_nc".$i));
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_pos_loop"));

	&set_label("pw_pos_finish",0);
	&mov($num,&wparam(4));	# get dl
	&and($num,7);
	&jz(&label("pw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("dl>0 Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&sub($tmp1,$c);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&jnc(&label("pw_tail_nc".$i));
		&dec($num) if ($i != 6);
		&jz(&label("pw_end")) if ($i != 6);
		}
	&mov($c,1);
	&jmp(&label("pw_end"));

	&set_label("pw_nc_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&set_label("pw_nc".$i,0);
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_nc_loop"));

	&mov($num,&wparam(4));	# get dl
	&and($num,7);
	&jz(&label("pw_nc_end"));

	for ($i=0; $i<7; $i++)
		{
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&set_label("pw_tail_nc".$i,0);
		&dec($num) if ($i != 6);
		&jz(&label("pw_nc_end")) if ($i != 6);
		}

	&set_label("pw_nc_end",0);
	&mov($c,0);

	&set_label("pw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}

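Editor's note: the perlasm above emits seven flat word-array primitives; their contracts are easiest to read in plain code. Below is a sketch of the central one, bn_mul_add_words (r[] += a[]*w, returning the final carry), assuming 16-bit words so native Perl arithmetic stays exact; the real BN_ULONG is a full machine word, and this is illustrative only.

# Reference semantics of bn_mul_add_words: r[] += a[] * w, returning
# the carry out of the top word. With 16-bit words, every intermediate
# sum is below 2^32 and therefore exact in a native Perl integer.
sub bn_mul_add_words_ref {
	my ($r, $a, $num, $w) = @_;
	my $c = 0;
	for my $i (0 .. $num - 1) {
		my $t = $r->[$i] + $a->[$i] * $w + $c;	# < 2^32, exact
		$r->[$i] = $t & 0xffff;
		$c = $t >> 16;
	}
	return $c;
}

The assembler versions differ only in unrolling (eight rounds per loop, then a seven-round tail) and in the optional SSE2 path that keeps the running carry in mm1.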
diff --git a/src/lib/libcrypto/bn/asm/co-586.pl b/src/lib/libcrypto/bn/asm/co-586.pl
deleted file mode 100644
index 57101a6bd7..0000000000
--- a/src/lib/libcrypto/bn/asm/co-586.pl
+++ /dev/null
@@ -1,287 +0,0 @@
#!/usr/local/bin/perl

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

&bn_mul_comba("bn_mul_comba8",8);
&bn_mul_comba("bn_mul_comba4",4);
&bn_sqr_comba("bn_sqr_comba8",8);
&bn_sqr_comba("bn_sqr_comba4",4);

&asm_finish();

sub mul_add_c
	{
	local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;

	# pos == -1 if eax and edx are pre-loaded, 0 to load from next
	# words, and 1 if load return value

	&comment("mul a[$ai]*b[$bi]");

	# "eax" and "edx" will always be pre-loaded.
	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
	# &mov("edx",&DWP($bi*4,$b,"",0));

	&mul("edx");
	&add($c0,"eax");
	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
	 &mov("eax",&wparam(0)) if $pos > 0;			# load r[]
	 ###
	&adc($c1,"edx");
	 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0;	# load next b
	 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1;	# load next b
	 ###
	&adc($c2,0);
	 # if pos > 1, it means it is the last loop
	 &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0;		# save r[];
	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;		# load next a
	}

sub sqr_add_c
	{
	local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;

	# pos == -1 if eax and edx are pre-loaded, 0 to load from next
	# words, and 1 if load return value

	&comment("sqr a[$ai]*a[$bi]");

	# "eax" and "edx" will always be pre-loaded.
	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
	# &mov("edx",&DWP($bi*4,$b,"",0));

	if ($ai == $bi)
		{ &mul("eax");}
	else
		{ &mul("edx");}
	&add($c0,"eax");
	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
	 ###
	&adc($c1,"edx");
	 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
	 ###
	&adc($c2,0);
	 # if pos > 1, it means it is the last loop
	 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0;		# save r[];
	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;		# load next b
	}

sub sqr_add_c2
	{
	local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;

	# pos == -1 if eax and edx are pre-loaded, 0 to load from next
	# words, and 1 if load return value

	&comment("sqr a[$ai]*a[$bi]");

	# "eax" and "edx" will always be pre-loaded.
	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
	# &mov("edx",&DWP($bi*4,$a,"",0));

	if ($ai == $bi)
		{ &mul("eax");}
	else
		{ &mul("edx");}
	&add("eax","eax");
	 ###
	&adc("edx","edx");
	 ###
	&adc($c2,0);
	 &add($c0,"eax");
	&adc($c1,"edx");
	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;	# load next b
	&adc($c2,0);
	 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0;		# save r[];
	 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
	 ###
	}

sub bn_mul_comba
	{
	local($name,$num)=@_;
	local($a,$b,$c0,$c1,$c2);
	local($i,$as,$ae,$bs,$be,$ai,$bi);
	local($tot,$end);

	&function_begin_B($name,"");

	$c0="ebx";
	$c1="ecx";
	$c2="ebp";
	$a="esi";
	$b="edi";

	$as=0;
	$ae=0;
	$bs=0;
	$be=0;
	$tot=$num+$num-1;

	&push("esi");
	 &mov($a,&wparam(1));
	&push("edi");
	 &mov($b,&wparam(2));
	&push("ebp");
	 &push("ebx");

	&xor($c0,$c0);
	 &mov("eax",&DWP(0,$a,"",0));	# load the first word
	&xor($c1,$c1);
	 &mov("edx",&DWP(0,$b,"",0));	# load the first word of b

	for ($i=0; $i<$tot; $i++)
		{
		$ai=$as;
		$bi=$bs;
		$end=$be+1;

		&comment("################## Calculate word $i");

		for ($j=$bs; $j<$end; $j++)
			{
			&xor($c2,$c2) if ($j == $bs);
			if (($j+1) == $end)
				{
				$v=1;
				$v=2 if (($i+1) == $tot);
				}
			else
				{ $v=0; }
			if (($j+1) != $end)
				{
				$na=($ai-1);
				$nb=($bi+1);
				}
			else
				{
				$na=$as+($i < ($num-1));
				$nb=$bs+($i >= ($num-1));
				}
#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
			&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
			if ($v)
				{
				&comment("saved r[$i]");
				# &mov("eax",&wparam(0));
				# &mov(&DWP($i*4,"eax","",0),$c0);
				($c0,$c1,$c2)=($c1,$c2,$c0);
				}
			$ai--;
			$bi++;
			}
		$as++ if ($i < ($num-1));
		$ae++ if ($i >= ($num-1));

		$bs++ if ($i >= ($num-1));
		$be++ if ($i < ($num-1));
		}
	&comment("save r[$i]");
	# &mov("eax",&wparam(0));
	&mov(&DWP($i*4,"eax","",0),$c0);

	&pop("ebx");
	&pop("ebp");
	&pop("edi");
	&pop("esi");
	&ret();
	&function_end_B($name);
	}

sub bn_sqr_comba
	{
	local($name,$num)=@_;
	local($r,$a,$c0,$c1,$c2)=@_;
	local($i,$as,$ae,$bs,$be,$ai,$bi);
	local($b,$tot,$end,$half);

	&function_begin_B($name,"");

	$c0="ebx";
	$c1="ecx";
	$c2="ebp";
	$a="esi";
	$r="edi";

	&push("esi");
	 &push("edi");
	&push("ebp");
	 &push("ebx");
	&mov($r,&wparam(0));
	 &mov($a,&wparam(1));
	&xor($c0,$c0);
	 &xor($c1,$c1);
	&mov("eax",&DWP(0,$a,"",0));	# load the first word

	$as=0;
	$ae=0;
	$bs=0;
	$be=0;
	$tot=$num+$num-1;

	for ($i=0; $i<$tot; $i++)
		{
		$ai=$as;
		$bi=$bs;
		$end=$be+1;

		&comment("############### Calculate word $i");
		for ($j=$bs; $j<$end; $j++)
			{
			&xor($c2,$c2) if ($j == $bs);
			if (($ai-1) < ($bi+1))
				{
				$v=1;
				$v=2 if ($i+1) == $tot;
				}
			else
				{ $v=0; }
			if (!$v)
				{
				$na=$ai-1;
				$nb=$bi+1;
				}
			else
				{
				$na=$as+($i < ($num-1));
				$nb=$bs+($i >= ($num-1));
				}
			if ($ai == $bi)
				{
				&sqr_add_c($r,$a,$ai,$bi,
					$c0,$c1,$c2,$v,$i,$na,$nb);
				}
			else
				{
				&sqr_add_c2($r,$a,$ai,$bi,
					$c0,$c1,$c2,$v,$i,$na,$nb);
				}
			if ($v)
				{
				&comment("saved r[$i]");
				#&mov(&DWP($i*4,$r,"",0),$c0);
				($c0,$c1,$c2)=($c1,$c2,$c0);
				last;
				}
			$ai--;
			$bi++;
			}
		$as++ if ($i < ($num-1));
		$ae++ if ($i >= ($num-1));

		$bs++ if ($i >= ($num-1));
		$be++ if ($i < ($num-1));
		}
	&mov(&DWP($i*4,$r,"",0),$c0);
	&pop("ebx");
	&pop("ebp");
	&pop("edi");
	&pop("esi");
	&ret();
	&function_end_B($name);
	}
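Editor's note: the generators above unroll column-wise ("comba") multiplication: each result word is produced in one go by summing every a[i]*b[j] with i+j equal to the column index into a three-word accumulator that then rotates. A minimal Perl sketch of the same schedule, assuming 16-bit words so the arithmetic stays exact; illustrative only.

# Column-wise comba multiplication as bn_mul_comba unrolls it.
# ($c0,$c1,$c2) is the rolling three-word accumulator.
sub bn_mul_comba_ref {
	my ($a, $b, $num) = @_;
	my @r = (0) x (2 * $num);
	my ($c0, $c1, $c2) = (0, 0, 0);
	for my $k (0 .. 2 * $num - 2) {		# one column per result word
		for my $i (0 .. $num - 1) {
			my $j = $k - $i;
			next if $j < 0 || $j >= $num;
			my $t = $c0 + $a->[$i] * $b->[$j];
			$c0 = $t & 0xffff;
			$t  = $c1 + ($t >> 16);
			$c1 = $t & 0xffff;
			$c2 += $t >> 16;
		}
		$r[$k] = $c0;
		($c0, $c1, $c2) = ($c1, $c2, 0);	# rotate accumulators,
	}						# as the assembler does
	$r[2 * $num - 1] = $c0;
	return \@r;
}

The squaring variants exploit symmetry: off-diagonal products a[i]*a[j] are doubled (sqr_add_c2) instead of being computed twice, which is why the diagonal and off-diagonal helpers are separate.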
diff --git a/src/lib/libcrypto/bn/asm/ia64-mont.pl b/src/lib/libcrypto/bn/asm/ia64-mont.pl
deleted file mode 100644
index e258658428..0000000000
--- a/src/lib/libcrypto/bn/asm/ia64-mont.pl
+++ /dev/null
@@ -1,851 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# January 2010
11#
12# "Teaser" Montgomery multiplication module for IA-64. There are
13# several possibilities for improvement:
14#
15# - modulo-scheduling outer loop would eliminate quite a number of
16# stalls after ldf8, xma and getf.sig outside inner loop and
17# improve shorter key performance;
18# - shorter vector support [with input vectors being fetched only
19# once] should be added;
20# - 2x unroll with help of n0[1] would make the code scalable on
21# "wider" IA-64, "wider" than Itanium 2 that is, which is not of
22# acute interest, because upcoming Tukwila's individual cores are
23# reportedly based on Itanium 2 design;
24# - dedicated squaring procedure(?);
25#
26# January 2010
27#
28# Shorter vector support is implemented by zero-padding ap and np
29# vectors up to 8 elements, or 512 bits. This means that 256-bit
30# inputs will be processed only 2 times faster than 512-bit inputs,
31# not 4 [as one would expect, because algorithm complexity is n^2].
32# The reason for padding is that inputs shorter than 512 bits won't
33# be processed faster anyway, because minimal critical path of the
34# core loop happens to match 512-bit timing. Either way, it resulted
35# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
36# 1024-bit one [in comparison to original version of *this* module].
37#
38# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
39# this module is:
40# sign verify sign/s verify/s
41# rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4
42# rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0
43# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
44# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
45# dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0
46# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
47# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
48#
49# ... and *without* (but still with ia64.S):
50#
51# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
52# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
53# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
54# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
55# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
56# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
57# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
58#
59# As it can be seen, RSA sign performance improves by 130-30%,
60# hereafter less for longer keys, while verify - by 74-13%.
61# DSA performance improves by 115-30%.
62
63if ($^O eq "hpux") {
64 $ADDP="addp4";
65 for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
66} else { $ADDP="add"; }
67
68$code=<<___;
69.explicit
70.text
71
72// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
73// const BN_ULONG *bp,const BN_ULONG *np,
74// const BN_ULONG *n0p,int num);
75.align 64
76.global bn_mul_mont#
77.proc bn_mul_mont#
78bn_mul_mont:
79 .prologue
80 .body
81{ .mmi; cmp4.le p6,p7=2,r37;;
82(p6) cmp4.lt.unc p8,p9=8,r37
83 mov ret0=r0 };;
84{ .bbb;
85(p9) br.cond.dptk.many bn_mul_mont_8
86(p8) br.cond.dpnt.many bn_mul_mont_general
87(p7) br.ret.spnt.many b0 };;
88.endp bn_mul_mont#
89
90prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
91
92rptr=r8; aptr=r9; bptr=r14; nptr=r15;
93tptr=r16; // &tp[0]
94tp_1=r17; // &tp[-1]
95num=r18; len=r19; lc=r20;
96topbit=r21; // carry bit from tmp[num]
97
98n0=f6;
99m0=f7;
100bi=f8;
101
102.align 64
103.local bn_mul_mont_general#
104.proc bn_mul_mont_general#
105bn_mul_mont_general:
106 .prologue
107{ .mmi; .save ar.pfs,prevfs
108 alloc prevfs=ar.pfs,6,2,0,8
109 $ADDP aptr=0,in1
110 .save ar.lc,prevlc
111 mov prevlc=ar.lc }
112{ .mmi; .vframe prevsp
113 mov prevsp=sp
114 $ADDP bptr=0,in2
115 .save pr,prevpr
116 mov prevpr=pr };;
117
118 .body
119 .rotf alo[6],nlo[4],ahi[8],nhi[6]
120 .rotr a[3],n[3],t[2]
121
122{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
123 ldf8 alo[4]=[aptr],16 // ap[0]
124 $ADDP r30=8,in1 };;
125{ .mmi; ldf8 alo[3]=[r30],16 // ap[1]
126 ldf8 alo[2]=[aptr],16 // ap[2]
127 $ADDP in4=0,in4 };;
128{ .mmi; ldf8 alo[1]=[r30] // ap[3]
129 ldf8 n0=[in4] // n0
130 $ADDP rptr=0,in0 }
131{ .mmi; $ADDP nptr=0,in3
132 mov r31=16
133 zxt4 num=in5 };;
134{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
135 shladd len=num,3,r0
136 shladd r31=num,3,r31 };;
137{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
138 add lc=-5,num
139 sub r31=sp,r31 };;
140{ .mfb; and sp=-16,r31 // alloca
141 xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
142 nop.b 0 }
143{ .mfb; nop.m 0
144 xmpy.lu alo[4]=alo[4],bi
145 brp.loop.imp .L1st_ctop,.L1st_cend-16
146 };;
147{ .mfi; nop.m 0
148 xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
149 add tp_1=8,sp }
150{ .mfi; nop.m 0
151 xma.lu alo[3]=alo[3],bi,ahi[2]
152 mov pr.rot=0x20001f<<16
153 // ------^----- (p40) at first (p23)
154 // ----------^^ p[16:20]=1
155 };;
156{ .mfi; nop.m 0
157 xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
158 mov ar.lc=lc }
159{ .mfi; nop.m 0
160 fcvt.fxu.s1 nhi[1]=f0
161 mov ar.ec=8 };;
162
163.align 32
164.L1st_ctop:
165.pred.rel "mutex",p40,p42
166{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
167 (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
168 (p40) add n[2]=n[2],a[2] } // (p23) }
169{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
170 (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
171 (p42) add n[2]=n[2],a[2],1 };; // (p23)
172{ .mfi; (p21) getf.sig a[0]=alo[5]
173 (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
174 (p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
175{ .mfi; (p23) st8 [tp_1]=n[2],8
176 (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
177 (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
178{ .mmb; (p21) getf.sig n[0]=nlo[3]
179 (p16) nop.m 0
180 br.ctop.sptk .L1st_ctop };;
181.L1st_cend:
182
183{ .mmi; getf.sig a[0]=ahi[6] // (p24)
184 getf.sig n[0]=nhi[4]
185 add num=-1,num };; // num--
186{ .mmi; .pred.rel "mutex",p40,p42
187(p40) add n[0]=n[0],a[0]
188(p42) add n[0]=n[0],a[0],1
189 sub aptr=aptr,len };; // rewind
190{ .mmi; .pred.rel "mutex",p40,p42
191(p40) cmp.ltu p41,p39=n[0],a[0]
192(p42) cmp.leu p41,p39=n[0],a[0]
193 sub nptr=nptr,len };;
194{ .mmi; .pred.rel "mutex",p39,p41
195(p39) add topbit=r0,r0
196(p41) add topbit=r0,r0,1
197 nop.i 0 }
198{ .mmi; st8 [tp_1]=n[0]
199 add tptr=16,sp
200 add tp_1=8,sp };;
201
202.Louter:
203{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
204 ldf8 ahi[3]=[tptr] // tp[0]
205 add r30=8,aptr };;
206{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
207 ldf8 alo[3]=[r30],16 // ap[1]
208 add r31=8,nptr };;
209{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
210 xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
211 brp.loop.imp .Linner_ctop,.Linner_cend-16
212 }
213{ .mfb; ldf8 alo[1]=[r30] // ap[3]
214 xma.lu alo[4]=alo[4],bi,ahi[3]
215 clrrrb.pr };;
216{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
217 xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
218 nop.i 0 }
219{ .mfi; ldf8 nlo[1]=[r31] // np[1]
220 xma.lu alo[3]=alo[3],bi,ahi[2]
221 mov pr.rot=0x20101f<<16
222 // ------^----- (p40) at first (p23)
223 // --------^--- (p30) at first (p22)
224 // ----------^^ p[16:20]=1
225 };;
226{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted
227 xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
228 mov ar.lc=lc }
229{ .mfi;
230 fcvt.fxu.s1 nhi[1]=f0
231 mov ar.ec=8 };;
232
233// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
234// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
235// in latter case accounts for two-tick pipeline stall, which means
236// that its performance would be ~20% lower than optimal one. No
237// attempt was made to address this, because original Itanium is
238// hardly represented out in the wild...
239.align 32
240.Linner_ctop:
241.pred.rel "mutex",p40,p42
242.pred.rel "mutex",p30,p32
243{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
244 (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
245 (p40) add n[2]=n[2],a[2] } // (p23)
246{ .mfi; (p16) nop.m 0
247 (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
248 (p42) add n[2]=n[2],a[2],1 };; // (p23)
249{ .mfi; (p21) getf.sig a[0]=alo[5]
250 (p16) nop.f 0
251 (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
252{ .mfi; (p21) ld8 t[0]=[tptr],8
253 (p16) nop.f 0
254 (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
255{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
256 (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
257 (p30) add a[1]=a[1],t[1] } // (p22)
258{ .mfi; (p16) nop.m 0
259 (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
260 (p32) add a[1]=a[1],t[1],1 };; // (p22)
261{ .mmi; (p21) getf.sig n[0]=nlo[3]
262 (p16) nop.m 0
263 (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
264{ .mmb; (p23) st8 [tp_1]=n[2],8
265 (p32) cmp.leu p31,p29=a[1],t[1] // (p22)
266 br.ctop.sptk .Linner_ctop };;
267.Linner_cend:
268
269{ .mmi; getf.sig a[0]=ahi[6] // (p24)
270 getf.sig n[0]=nhi[4]
271 nop.i 0 };;
272
273{ .mmi; .pred.rel "mutex",p31,p33
274(p31) add a[0]=a[0],topbit
275(p33) add a[0]=a[0],topbit,1
276 mov topbit=r0 };;
277{ .mfi; .pred.rel "mutex",p31,p33
278(p31) cmp.ltu p32,p30=a[0],topbit
279(p33) cmp.leu p32,p30=a[0],topbit
280 }
281{ .mfi; .pred.rel "mutex",p40,p42
282(p40) add n[0]=n[0],a[0]
283(p42) add n[0]=n[0],a[0],1
284 };;
285{ .mmi; .pred.rel "mutex",p40,p42
286(p40) cmp.ltu p41,p39=n[0],a[0]
287(p42) cmp.leu p41,p39=n[0],a[0]
288(p32) add topbit=r0,r0,1 }
289
290{ .mmi; st8 [tp_1]=n[0],8
291 cmp4.ne p6,p0=1,num
292 sub aptr=aptr,len };; // rewind
293{ .mmi; sub nptr=nptr,len
294(p41) add topbit=r0,r0,1
295 add tptr=16,sp }
296{ .mmb; add tp_1=8,sp
297 add num=-1,num // num--
298(p6) br.cond.sptk.many .Louter };;
299
300{ .mbb; add lc=4,lc
301 brp.loop.imp .Lsub_ctop,.Lsub_cend-16
302 clrrrb.pr };;
303{ .mii; nop.m 0
304 mov pr.rot=0x10001<<16
305 // ------^---- (p33) at first (p17)
306 mov ar.lc=lc }
307{ .mii; nop.m 0
308 mov ar.ec=3
309 nop.i 0 };;
310
311.Lsub_ctop:
312.pred.rel "mutex",p33,p35
313{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
314 (p16) nop.f 0
315 (p33) sub n[1]=t[1],n[1] } // (p17)
316{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
317 (p16) nop.f 0
318 (p35) sub n[1]=t[1],n[1],1 };; // (p17)
319{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
320 (p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
321 (p18) nop.b 0 }
322{ .mib; (p18) nop.m 0
323 (p35) cmp.geu p34,p32=n[1],t[1] // (p17)
324 br.ctop.sptk .Lsub_ctop };;
325.Lsub_cend:
326
327{ .mmb; .pred.rel "mutex",p34,p36
328(p34) sub topbit=topbit,r0 // (p19)
329(p36) sub topbit=topbit,r0,1
330 brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
331 }
332{ .mmb; sub rptr=rptr,len // rewind
333 sub tptr=tptr,len
334 clrrrb.pr };;
335{ .mmi; and aptr=tptr,topbit
336 andcm bptr=rptr,topbit
337 mov pr.rot=1<<16 };;
338{ .mii; or nptr=aptr,bptr
339 mov ar.lc=lc
340 mov ar.ec=3 };;
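// The and/andcm/or triple above is a branchless select: topbit is by now
// either 0 or all-ones, so nptr = (tptr & topbit) | (rptr & ~topbit).
// The copy loop below thus reads tmp[] back if the subtraction borrowed,
// and (trivially) copies rp[] over itself otherwise, zeroing tmp[] on
// the fly.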
341
342.Lcopy_ctop:
343{ .mmb; (p16) ld8 n[0]=[nptr],8
344 (p18) st8 [tptr]=r0,8
345 (p16) nop.b 0 }
346{ .mmb; (p16) nop.m 0
347 (p18) st8 [rptr]=n[2],8
348 br.ctop.sptk .Lcopy_ctop };;
349.Lcopy_cend:
350
351{ .mmi; mov ret0=1 // signal "handled"
352 rum 1<<5 // clear um.mfh
353 mov ar.lc=prevlc }
354{ .mib; .restore sp
355 mov sp=prevsp
356 mov pr=prevpr,0x1ffff
357 br.ret.sptk.many b0 };;
358.endp bn_mul_mont_general#
359
360a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
361n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
362t0=r15;
363
364ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
365ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
366
367.align 64
368.skip 48 // aligns loop body
369.local bn_mul_mont_8#
370.proc bn_mul_mont_8#
371bn_mul_mont_8:
372 .prologue
373{ .mmi; .save ar.pfs,prevfs
374 alloc prevfs=ar.pfs,6,2,0,8
375 .vframe prevsp
376 mov prevsp=sp
377 .save ar.lc,prevlc
378 mov prevlc=ar.lc }
379{ .mmi; add r17=-6*16,sp
380 add sp=-7*16,sp
381 .save pr,prevpr
382 mov prevpr=pr };;
383
384{ .mmi; .save.gf 0,0x10
385 stf.spill [sp]=f16,-16
386 .save.gf 0,0x20
387 stf.spill [r17]=f17,32
388 add r16=-5*16,prevsp};;
389{ .mmi; .save.gf 0,0x40
390 stf.spill [r16]=f18,32
391 .save.gf 0,0x80
392 stf.spill [r17]=f19,32
393 $ADDP aptr=0,in1 };;
394{ .mmi; .save.gf 0,0x100
395 stf.spill [r16]=f20,32
396 .save.gf 0,0x200
397 stf.spill [r17]=f21,32
398 $ADDP r29=8,in1 };;
399{ .mmi; .save.gf 0,0x400
400 stf.spill [r16]=f22
401 .save.gf 0,0x800
402 stf.spill [r17]=f23
403 $ADDP rptr=0,in0 };;
404
405 .body
406 .rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
407 .rotr t[8]
408
409// load input vectors padding them to 8 elements
410{ .mmi; ldf8 ai0=[aptr],16 // ap[0]
411 ldf8 ai1=[r29],16 // ap[1]
412 $ADDP bptr=0,in2 }
413{ .mmi; $ADDP r30=8,in2
414 $ADDP nptr=0,in3
415 $ADDP r31=8,in3 };;
416{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
417 ldf8 bj[6]=[r30],16 // bp[1]
418 cmp4.le p4,p5=3,in5 }
419{ .mmi; ldf8 ni0=[nptr],16 // np[0]
420 ldf8 ni1=[r31],16 // np[1]
421 cmp4.le p6,p7=4,in5 };;
422
423{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
424 (p5)fcvt.fxu ai2=f0
425 cmp4.le p8,p9=5,in5 }
426{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
427 (p7)fcvt.fxu ai3=f0
428 cmp4.le p10,p11=6,in5 }
429{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
430 (p5)fcvt.fxu bj[5]=f0
431 cmp4.le p12,p13=7,in5 }
432{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
433 (p7)fcvt.fxu bj[4]=f0
434 cmp4.le p14,p15=8,in5 }
435{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
436 (p5)fcvt.fxu ni2=f0
437 addp4 r28=-1,in5 }
438{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
439 (p7)fcvt.fxu ni3=f0
440 $ADDP in4=0,in4 };;
441
442{ .mfi; ldf8 n0=[in4]
443 fcvt.fxu tf[1]=f0
444 nop.i 0 }
445
446{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
447 (p9)fcvt.fxu ai4=f0
448 mov t[0]=r0 }
449{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
450 (p11)fcvt.fxu ai5=f0
451 mov t[1]=r0 }
452{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
453 (p9)fcvt.fxu bj[3]=f0
454 mov t[2]=r0 }
455{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
456 (p11)fcvt.fxu bj[2]=f0
457 mov t[3]=r0 }
458{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
459 (p9)fcvt.fxu ni4=f0
460 mov t[4]=r0 }
461{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
462 (p11)fcvt.fxu ni5=f0
463 mov t[5]=r0 };;
464
465{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
466 (p13)fcvt.fxu ai6=f0
467 mov t[6]=r0 }
468{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
469 (p15)fcvt.fxu ai7=f0
470 mov t[7]=r0 }
471{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
472 (p13)fcvt.fxu bj[1]=f0
473 mov ar.lc=r28 }
474{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
475 (p15)fcvt.fxu bj[0]=f0
476 mov ar.ec=1 }
477{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
478 (p13)fcvt.fxu ni6=f0
479 mov pr.rot=1<<16 }
480{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
481 (p15)fcvt.fxu ni7=f0
482 brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
483 };;
484
485// The loop is scheduled for 32*n ticks on Itanium 2. An actual attempt
486// to measure it with the help of the Interval Time Counter indicated that
487// the factor is a tad higher: 33 or 34, if not 35. Exact measurement and
488// addressing the issue are problematic, because I don't have access
489// to a platform-specific instruction-level profiler. On Itanium it
490// should run in 56*n ticks, because of the higher xma latency...
491.Louter_8_ctop:
492 .pred.rel "mutex",p40,p42
493 .pred.rel "mutex",p48,p50
494{ .mfi; (p16) nop.m 0 // 0:
495 (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
496 (p40) add a3=a3,n3 } // (p17) a3+=n3
497{ .mfi; (p42) add a3=a3,n3,1
498 (p16) xma.lu alo[0]=ai0,bj[7],tf[1]
499 (p16) nop.i 0 };;
500{ .mii; (p17) getf.sig a7=alo[8] // 1:
501 (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
502 (p50) add t[6]=t[6],a3,1 };;
503{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
504 (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
505 (p40) cmp.ltu p43,p41=a3,n3 }
506{ .mfi; (p42) cmp.leu p43,p41=a3,n3
507 (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
508 (p16) nop.i 0 };;
509{ .mii; (p17) getf.sig n5=nlo[6] // 3:
510 (p48) cmp.ltu p51,p49=t[6],a3
511 (p50) cmp.leu p51,p49=t[6],a3 };;
512 .pred.rel "mutex",p41,p43
513 .pred.rel "mutex",p49,p51
514{ .mfi; (p16) nop.m 0 // 4:
515 (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
516 (p41) add a4=a4,n4 } // (p17) a4+=n4
517{ .mfi; (p43) add a4=a4,n4,1
518 (p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
519 (p16) nop.i 0 };;
520{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
521 (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
522 (p51) add t[5]=t[5],a4,1 };;
523{ .mfi; (p16) nop.m 0 // 6:
524 (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
525 (p41) cmp.ltu p42,p40=a4,n4 }
526{ .mfi; (p43) cmp.leu p42,p40=a4,n4
527 (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
528 (p16) nop.i 0 };;
529{ .mii; (p17) getf.sig n6=nlo[7] // 7:
530 (p49) cmp.ltu p50,p48=t[5],a4
531 (p51) cmp.leu p50,p48=t[5],a4 };;
532 .pred.rel "mutex",p40,p42
533 .pred.rel "mutex",p48,p50
534{ .mfi; (p16) nop.m 0 // 8:
535 (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
536 (p40) add a5=a5,n5 } // (p17) a5+=n5
537{ .mfi; (p42) add a5=a5,n5,1
538 (p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
539 (p16) nop.i 0 };;
540{ .mii; (p16) getf.sig a1=alo[1] // 9:
541 (p48) add t[4]=t[4],a5 // (p17) t[4]+=a5
542 (p50) add t[4]=t[4],a5,1 };;
543{ .mfi; (p16) nop.m 0 // 10:
544 (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
545 (p40) cmp.ltu p43,p41=a5,n5 }
546{ .mfi; (p42) cmp.leu p43,p41=a5,n5
547 (p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
548 (p16) nop.i 0 };;
549{ .mii; (p17) getf.sig n7=nlo[8] // 11:
550 (p48) cmp.ltu p51,p49=t[4],a5
551 (p50) cmp.leu p51,p49=t[4],a5 };;
552 .pred.rel "mutex",p41,p43
553 .pred.rel "mutex",p49,p51
554{ .mfi; (p17) getf.sig n8=nhi[8] // 12:
555 (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
556 (p41) add a6=a6,n6 } // (p17) a6+=n6
557{ .mfi; (p43) add a6=a6,n6,1
558 (p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
559 (p16) nop.i 0 };;
560{ .mii; (p16) getf.sig a2=alo[2] // 13:
561 (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
562 (p51) add t[3]=t[3],a6,1 };;
563{ .mfi; (p16) nop.m 0 // 14:
564 (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
565 (p41) cmp.ltu p42,p40=a6,n6 }
566{ .mfi; (p43) cmp.leu p42,p40=a6,n6
567 (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
568 (p16) nop.i 0 };;
569{ .mii; (p16) nop.m 0 // 15:
570 (p49) cmp.ltu p50,p48=t[3],a6
571 (p51) cmp.leu p50,p48=t[3],a6 };;
572 .pred.rel "mutex",p40,p42
573 .pred.rel "mutex",p48,p50
574{ .mfi; (p16) nop.m 0 // 16:
575 (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
576 (p40) add a7=a7,n7 } // (p17) a7+=n7
577{ .mfi; (p42) add a7=a7,n7,1
578 (p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
579 (p16) nop.i 0 };;
580{ .mii; (p16) getf.sig a3=alo[3] // 17:
581 (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
582 (p50) add t[2]=t[2],a7,1 };;
583{ .mfi; (p16) nop.m 0 // 18:
584 (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
585 (p40) cmp.ltu p43,p41=a7,n7 }
586{ .mfi; (p42) cmp.leu p43,p41=a7,n7
587 (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
588 (p16) nop.i 0 };;
589{ .mii; (p16) getf.sig n1=nlo[1] // 19:
590 (p48) cmp.ltu p51,p49=t[2],a7
591 (p50) cmp.leu p51,p49=t[2],a7 };;
592 .pred.rel "mutex",p41,p43
593 .pred.rel "mutex",p49,p51
594{ .mfi; (p16) nop.m 0 // 20:
595 (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
596 (p41) add a8=a8,n8 } // (p17) a8+=n8
597{ .mfi; (p43) add a8=a8,n8,1
598 (p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
599 (p16) nop.i 0 };;
600{ .mii; (p16) getf.sig a4=alo[4] // 21:
601 (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
602 (p51) add t[1]=t[1],a8,1 };;
603{ .mfi; (p16) nop.m 0 // 22:
604 (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
605 (p41) cmp.ltu p42,p40=a8,n8 }
606{ .mfi; (p43) cmp.leu p42,p40=a8,n8
607 (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
608 (p16) nop.i 0 };;
609{ .mii; (p16) getf.sig n2=nlo[2] // 23:
610 (p49) cmp.ltu p50,p48=t[1],a8
611 (p51) cmp.leu p50,p48=t[1],a8 };;
612{ .mfi; (p16) nop.m 0 // 24:
613 (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
614 (p16) add a1=a1,n1 } // (p16) a1+=n1
615{ .mfi; (p16) nop.m 0
616 (p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
617 (p17) mov t[0]=r0 };;
618{ .mii; (p16) getf.sig a5=alo[5] // 25:
619 (p16) add t0=t[7],a1 // (p16) t[7]+=a1
620 (p42) add t[0]=t[0],r0,1 };;
621{ .mfi; (p16) setf.sig tf[0]=t0 // 26:
622 (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
623 (p50) add t[0]=t[0],r0,1 }
624{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
625 (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
626 (p16) nop.i 0 };;
627{ .mii; (p16) getf.sig n3=nlo[3] // 27:
628 (p16) cmp.ltu.unc p50,p48=t0,a1
629 (p16) nop.i 0 };;
630 .pred.rel "mutex",p40,p42
631 .pred.rel "mutex",p48,p50
632{ .mfi; (p16) nop.m 0 // 28:
633 (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
634 (p40) add a2=a2,n2 } // (p16) a2+=n2
635{ .mfi; (p42) add a2=a2,n2,1
636 (p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
637 (p16) nop.i 0 };;
638{ .mii; (p16) getf.sig a6=alo[6] // 29:
639 (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
640 (p50) add t[6]=t[6],a2,1 };;
641{ .mfi; (p16) nop.m 0 // 30:
642 (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
643 (p40) cmp.ltu p41,p39=a2,n2 }
644{ .mfi; (p42) cmp.leu p41,p39=a2,n2
645 (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
646 (p16) nop.i 0 };;
647{ .mfi; (p16) getf.sig n4=nlo[4] // 31:
648 (p16) nop.f 0
649 (p48) cmp.ltu p49,p47=t[6],a2 }
650{ .mfb; (p50) cmp.leu p49,p47=t[6],a2
651 (p16) nop.f 0
652 br.ctop.sptk.many .Louter_8_ctop };;
653.Louter_8_cend:
654
655// The above loop has to execute one more time, without (p16), which is
656// replaced with a merged move of np[8] to the GPR bank.
657 .pred.rel "mutex",p40,p42
658 .pred.rel "mutex",p48,p50
659{ .mmi; (p0) getf.sig n1=ni0 // 0:
660 (p40) add a3=a3,n3 // (p17) a3+=n3
661 (p42) add a3=a3,n3,1 };;
662{ .mii; (p17) getf.sig a7=alo[8] // 1:
663 (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
664 (p50) add t[6]=t[6],a3,1 };;
665{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
666 (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
667 (p40) cmp.ltu p43,p41=a3,n3 }
668{ .mfi; (p42) cmp.leu p43,p41=a3,n3
669 (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
670 (p0) nop.i 0 };;
671{ .mii; (p17) getf.sig n5=nlo[6] // 3:
672 (p48) cmp.ltu p51,p49=t[6],a3
673 (p50) cmp.leu p51,p49=t[6],a3 };;
674 .pred.rel "mutex",p41,p43
675 .pred.rel "mutex",p49,p51
676{ .mmi; (p0) getf.sig n2=ni1 // 4:
677 (p41) add a4=a4,n4 // (p17) a4+=n4
678 (p43) add a4=a4,n4,1 };;
679{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
680 (p0) nop.f 0
681 (p51) add t[5]=t[5],a4,1 };;
682{ .mfi; (p0) getf.sig n3=ni2 // 6:
683 (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
684 (p41) cmp.ltu p42,p40=a4,n4 }
685{ .mfi; (p43) cmp.leu p42,p40=a4,n4
686 (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
687 (p0) nop.i 0 };;
688{ .mii; (p17) getf.sig n6=nlo[7] // 7:
689 (p49) cmp.ltu p50,p48=t[5],a4
690 (p51) cmp.leu p50,p48=t[5],a4 };;
691 .pred.rel "mutex",p40,p42
692 .pred.rel "mutex",p48,p50
693{ .mii; (p0) getf.sig n4=ni3 // 8:
694 (p40) add a5=a5,n5 // (p17) a5+=n5
695 (p42) add a5=a5,n5,1 };;
696{ .mii; (p0) nop.m 0 // 9:
697 (p48) add t[4]=t[4],a5 // (p17) t[4]+=a5
698 (p50) add t[4]=t[4],a5,1 };;
699{ .mii; (p0) nop.m 0 // 10:
700 (p40) cmp.ltu p43,p41=a5,n5
701 (p42) cmp.leu p43,p41=a5,n5 };;
702{ .mii; (p17) getf.sig n7=nlo[8] // 11:
703 (p48) cmp.ltu p51,p49=t[4],a5
704 (p50) cmp.leu p51,p49=t[4],a5 };;
705 .pred.rel "mutex",p41,p43
706 .pred.rel "mutex",p49,p51
707{ .mii; (p17) getf.sig n8=nhi[8] // 12:
708 (p41) add a6=a6,n6 // (p17) a6+=n6
709 (p43) add a6=a6,n6,1 };;
710{ .mii; (p0) getf.sig n5=ni4 // 13:
711 (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
712 (p51) add t[3]=t[3],a6,1 };;
713{ .mii; (p0) nop.m 0 // 14:
714 (p41) cmp.ltu p42,p40=a6,n6
715 (p43) cmp.leu p42,p40=a6,n6 };;
716{ .mii; (p0) getf.sig n6=ni5 // 15:
717 (p49) cmp.ltu p50,p48=t[3],a6
718 (p51) cmp.leu p50,p48=t[3],a6 };;
719 .pred.rel "mutex",p40,p42
720 .pred.rel "mutex",p48,p50
721{ .mii; (p0) nop.m 0 // 16:
722 (p40) add a7=a7,n7 // (p17) a7+=n7
723 (p42) add a7=a7,n7,1 };;
724{ .mii; (p0) nop.m 0 // 17:
725 (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
726 (p50) add t[2]=t[2],a7,1 };;
727{ .mii; (p0) nop.m 0 // 18:
728 (p40) cmp.ltu p43,p41=a7,n7
729 (p42) cmp.leu p43,p41=a7,n7 };;
730{ .mii; (p0) getf.sig n7=ni6 // 19:
731 (p48) cmp.ltu p51,p49=t[2],a7
732 (p50) cmp.leu p51,p49=t[2],a7 };;
733 .pred.rel "mutex",p41,p43
734 .pred.rel "mutex",p49,p51
735{ .mii; (p0) nop.m 0 // 20:
736 (p41) add a8=a8,n8 // (p17) a8+=n8
737 (p43) add a8=a8,n8,1 };;
738{ .mmi; (p0) nop.m 0 // 21:
739 (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
740 (p51) add t[1]=t[1],a8,1 }
741{ .mmi; (p17) mov t[0]=r0
742 (p41) cmp.ltu p42,p40=a8,n8
743 (p43) cmp.leu p42,p40=a8,n8 };;
744{ .mmi; (p0) getf.sig n8=ni7 // 22:
745 (p49) cmp.ltu p50,p48=t[1],a8
746 (p51) cmp.leu p50,p48=t[1],a8 }
747{ .mmi; (p42) add t[0]=t[0],r0,1
748 (p0) add r16=-7*16,prevsp
749 (p0) add r17=-6*16,prevsp };;
750
751// subtract np[8] from carrybit|tmp[8]
752// the carrybit|tmp[8] layout upon exit from the above loop is:
753// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
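//
// In C terms the borrow chain below is roughly (a hedged sketch; tmp[0..8]
// stands for the nine words listed above, least significant first, with
// np[8]==0, and res[] is illustrative for where the difference lands):
//
// BN_ULONG b=0; // borrow
// for (i=0; i<9; i++)
// { BN_ULONG d=tmp[i]-np[i]-b;
// b = b ? (d>=tmp[i]) : (d>tmp[i]);
// res[i]=d;
// }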
754{ .mmi; (p50)add t[0]=t[0],r0,1
755 add r18=-5*16,prevsp
756 sub n1=t0,n1 };;
757{ .mmi; cmp.gtu p34,p32=n1,t0;;
758 .pred.rel "mutex",p32,p34
759 (p32)sub n2=t[7],n2
760 (p34)sub n2=t[7],n2,1 };;
761{ .mii; (p32)cmp.gtu p35,p33=n2,t[7]
762 (p34)cmp.geu p35,p33=n2,t[7];;
763 .pred.rel "mutex",p33,p35
764 (p33)sub n3=t[6],n3 }
765{ .mmi; (p35)sub n3=t[6],n3,1;;
766 (p33)cmp.gtu p34,p32=n3,t[6]
767 (p35)cmp.geu p34,p32=n3,t[6] };;
768 .pred.rel "mutex",p32,p34
769{ .mii; (p32)sub n4=t[5],n4
770 (p34)sub n4=t[5],n4,1;;
771 (p32)cmp.gtu p35,p33=n4,t[5] }
772{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
773 .pred.rel "mutex",p33,p35
774 (p33)sub n5=t[4],n5
775 (p35)sub n5=t[4],n5,1 };;
776{ .mii; (p33)cmp.gtu p34,p32=n5,t[4]
777 (p35)cmp.geu p34,p32=n5,t[4];;
778 .pred.rel "mutex",p32,p34
779 (p32)sub n6=t[3],n6 }
780{ .mmi; (p34)sub n6=t[3],n6,1;;
781 (p32)cmp.gtu p35,p33=n6,t[3]
782 (p34)cmp.geu p35,p33=n6,t[3] };;
783 .pred.rel "mutex",p33,p35
784{ .mii; (p33)sub n7=t[2],n7
785 (p35)sub n7=t[2],n7,1;;
786 (p33)cmp.gtu p34,p32=n7,t[2] }
787{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
788 .pred.rel "mutex",p32,p34
789 (p32)sub n8=t[1],n8
790 (p34)sub n8=t[1],n8,1 };;
791{ .mii; (p32)cmp.gtu p35,p33=n8,t[1]
792 (p34)cmp.geu p35,p33=n8,t[1];;
793 .pred.rel "mutex",p33,p35
794 (p33)sub a8=t[0],r0 }
795{ .mmi; (p35)sub a8=t[0],r0,1;;
796 (p33)cmp.gtu p34,p32=a8,t[0]
797 (p35)cmp.geu p34,p32=a8,t[0] };;
798
799// save the result, either tmp[num] or tmp[num]-np[num]
800 .pred.rel "mutex",p32,p34
801{ .mmi; (p32)st8 [rptr]=n1,8
802 (p34)st8 [rptr]=t0,8
803 add r19=-4*16,prevsp};;
804{ .mmb; (p32)st8 [rptr]=n2,8
805 (p34)st8 [rptr]=t[7],8
806 (p5)br.cond.dpnt.few .Ldone };;
807{ .mmb; (p32)st8 [rptr]=n3,8
808 (p34)st8 [rptr]=t[6],8
809 (p7)br.cond.dpnt.few .Ldone };;
810{ .mmb; (p32)st8 [rptr]=n4,8
811 (p34)st8 [rptr]=t[5],8
812 (p9)br.cond.dpnt.few .Ldone };;
813{ .mmb; (p32)st8 [rptr]=n5,8
814 (p34)st8 [rptr]=t[4],8
815 (p11)br.cond.dpnt.few .Ldone };;
816{ .mmb; (p32)st8 [rptr]=n6,8
817 (p34)st8 [rptr]=t[3],8
818 (p13)br.cond.dpnt.few .Ldone };;
819{ .mmb; (p32)st8 [rptr]=n7,8
820 (p34)st8 [rptr]=t[2],8
821 (p15)br.cond.dpnt.few .Ldone };;
822{ .mmb; (p32)st8 [rptr]=n8,8
823 (p34)st8 [rptr]=t[1],8
824 nop.b 0 };;
825.Ldone: // epilogue
826{ .mmi; ldf.fill f16=[r16],64
827 ldf.fill f17=[r17],64
828 nop.i 0 }
829{ .mmi; ldf.fill f18=[r18],64
830 ldf.fill f19=[r19],64
831 mov pr=prevpr,0x1ffff };;
832{ .mmi; ldf.fill f20=[r16]
833 ldf.fill f21=[r17]
834 mov ar.lc=prevlc }
835{ .mmi; ldf.fill f22=[r18]
836 ldf.fill f23=[r19]
837 mov ret0=1 } // signal "handled"
838{ .mib; rum 1<<5
839 .restore sp
840 mov sp=prevsp
841 br.ret.sptk.many b0 };;
842.endp bn_mul_mont_8#
843
844.type copyright#,\@object
845copyright:
846stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
847___
848
849$output=shift and open STDOUT,">$output";
850print $code;
851close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/ia64.S b/src/lib/libcrypto/bn/asm/ia64.S
deleted file mode 100644
index 7c4fbd3118..0000000000
--- a/src/lib/libcrypto/bn/asm/ia64.S
+++ /dev/null
@@ -1,1555 +0,0 @@
1.explicit
2.text
3.ident "ia64.S, Version 2.1"
4.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
5
6//
7// ====================================================================
8// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9// project.
10//
11// Rights for redistribution and usage in source and binary forms are
12// granted according to the OpenSSL license. Warranty of any kind is
13// disclaimed.
14// ====================================================================
15//
16// Version 2.x is an Itanium 2 re-tune. A few words about how Itanium 2
17// differs from Itanium from this module's viewpoint. Most notably, is it
18// "wider" than Itanium? Can you experience loop scalability as
19// discussed in commentary sections? Not really:-( Itanium 2 has 6
20// integer ALU ports, i.e. it's 2 ports wider, but that's not enough to
21// spin twice as fast, as I need 8 IALU ports. The number of floating point
22// ports is the same, i.e. 2, while I need 4. In other words, to this
23// module Itanium 2 remains effectively as "wide" as Itanium. Yet it's
24// essentially different with respect to this module, and a re-tune was
25// required, because some instruction latencies have changed. Most
26// noticeably, those intensively used:
27//
28// Itanium Itanium2
29// ldf8 9 6 L2 hit
30// ld8 2 1 L1 hit
31// getf 2 5
32// xma[->getf] 7[+1] 4[+0]
33// add[->st8] 1[+1] 1[+0]
34//
35// What does it mean? You might ratiocinate that the original code
36// should simply run faster... because the sum of latencies is smaller...
37// Wrong! Note that the getf latency increased. This means that if a loop
38// is scheduled for lower latency (as these were), then it will suffer from
39// stall conditions and the code will therefore turn anti-scalable; e.g. the
40// original bn_mul_words spun at 5*n, or 2.5 times slower than expected,
41// on Itanium 2! What to do? Reschedule the loops for Itanium 2? But then
42// Itanium would exhibit anti-scalability. So I've chosen to schedule
43// for the worst latency of every instruction, aiming for the best
44// *all-round* performance.
45
46// Q. How much faster does it get?
47// A. Here is the output from 'openssl speed rsa dsa' for vanilla
48// 0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
49// Linux 7.1 2.96-81):
50//
51// sign verify sign/s verify/s
52// rsa 512 bits 0.0036s 0.0003s 275.3 2999.2
53// rsa 1024 bits 0.0203s 0.0011s 49.3 894.1
54// rsa 2048 bits 0.1331s 0.0040s 7.5 250.9
55// rsa 4096 bits 0.9270s 0.0147s 1.1 68.1
56// sign verify sign/s verify/s
57// dsa 512 bits 0.0035s 0.0043s 288.3 234.8
58// dsa 1024 bits 0.0111s 0.0135s 90.0 74.2
59//
60// And here is the same output, but for this assembler
61// implementation:-)
62//
63// sign verify sign/s verify/s
64// rsa 512 bits 0.0021s 0.0001s 549.4 9638.5
65// rsa 1024 bits 0.0055s 0.0002s 183.8 4481.1
66// rsa 2048 bits 0.0244s 0.0006s 41.4 1726.3
67// rsa 4096 bits 0.1295s 0.0018s 7.7 561.5
68// sign verify sign/s verify/s
69// dsa 512 bits 0.0012s 0.0013s 891.9 756.6
70// dsa 1024 bits 0.0023s 0.0028s 440.4 376.2
71//
72// Yes, you may argue that it's not a fair comparison, as it's
73// possible to craft the C implementation with the BN_UMULT_HIGH
74// inline assembler macro. But of course! Here is the output
75// with the macro:
76//
77// sign verify sign/s verify/s
78// rsa 512 bits 0.0020s 0.0002s 495.0 6561.0
79// rsa 1024 bits 0.0086s 0.0004s 116.2 2235.7
80// rsa 2048 bits 0.0519s 0.0015s 19.3 667.3
81// rsa 4096 bits 0.3464s 0.0053s 2.9 187.7
82// sign verify sign/s verify/s
83// dsa 512 bits 0.0016s 0.0020s 613.1 510.5
84// dsa 1024 bits 0.0045s 0.0054s 221.0 183.9
85//
86// My code is still way faster, huh:-) And I believe that even
87// higher performance can be achieved. Note that as keys get
88// longer, the performance gain is larger. Why? According to the
89// profiler there is another player in the field, namely
90// BN_from_montgomery, consuming a larger and larger portion of CPU
91// time as the keysize decreases. I therefore consider putting effort
92// into an assembler implementation of the following routine:
93//
94// void bn_mul_add_mont (BN_ULONG *rp,BN_ULONG *np,int nl,BN_ULONG n0)
95// {
96// int i,j;
97// BN_ULONG v,*nrp=&rp[nl]; // nrp walks the upper half of rp
98//
99// for (i=0; i<nl; i++)
100// {
101// v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
102// nrp++;
103// rp++;
104// if (((nrp[-1]+=v)&BN_MASK2) < v)
105// for (j=0; ((++nrp[j])&BN_MASK2) == 0; j++) ;
106// }
107// }
108//
109// It might also be beneficial to implement even the combaX
110// variants, as it appears they can literally unleash the
111// performance (see the comment section of bn_mul_comba8 below).
112//
113// And finally, for your reference, the output for 0.9.6a compiled
114// with SGIcc version 0.01.0-12 (keep in mind that at the moment
115// of this writing it's not possible to convince SGIcc to use the
116// BN_UMULT_HIGH inline assembler macro, yet the code is fast,
117// i.e. for compiler-generated code:-):
118//
119// sign verify sign/s verify/s
120// rsa 512 bits 0.0022s 0.0002s 452.7 5894.3
121// rsa 1024 bits 0.0097s 0.0005s 102.7 2002.9
122// rsa 2048 bits 0.0578s 0.0017s 17.3 600.2
123// rsa 4096 bits 0.3838s 0.0061s 2.6 164.5
124// sign verify sign/s verify/s
125// dsa 512 bits 0.0018s 0.0022s 547.3 459.6
126// dsa 1024 bits 0.0051s 0.0062s 196.6 161.3
127//
128// Oh! Benchmarks were performed on a 733MHz Lion-class Itanium
129// system running Red Hat Linux 7.1 (very special thanks to Ray
130// McCaffity of Williams Communications for providing an account).
131//
132// Q. What the heck is with 'rum 1<<5' at the end of every function?
133// A. Well, by clearing the "upper FP registers written" bit of the
134// User Mask I want to excuse the kernel from preserving the upper
135// (f32-f127) FP register bank over a process context switch, thus
136// minimizing bus bandwidth consumption during the switch (i.e.
137// after the PKI operation completes and the program is off doing
138// something else like bulk symmetric encryption). Having said
139// this, I also want to point out that it might be a good idea
140// to compile the whole toolkit (as well as the majority of
141// programs for that matter) with the -mfixed-range=f32-f127 command
142// line option. No, it doesn't prevent the compiler from writing
143// to the upper bank, but at least discourages it from doing so. If you
144// don't like the idea you have the option to compile the module with
145// -Drum=nop.m on the command line.
146//
147
148#if defined(_HPUX_SOURCE) && !defined(_LP64)
149#define ADDP addp4
150#else
151#define ADDP add
152#endif
153
154#if 1
155//
156// bn_[add|sub]_words routines.
157//
158// Loops are spinning in 2*(n+5) ticks on Itanium (provided that the
159// data reside in L1 cache, i.e. 2 ticks away). It's possible to
160// compress the epilogue and get down to 2*n+6, but at the cost of
161// scalability (the neat feature of this implementation is that it
162// shall automagically spin in n+5 on "wider" IA-64 implementations:-)
163// I consider the epilogue short enough as it is to trade a tiny
164// performance loss on Itanium for scalability.
165//
166// BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
167//
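// As a reference point, here is a hedged C sketch of the contract the
// loop below implements (not the OpenSSL C source; assumes a 64-bit
// BN_ULONG). The return value is the carry out of the top word:
//
// BN_ULONG c=0;
// while (num-- > 0)
// { BN_ULONG a=*ap++, r=a + *bp++ + c;
// c = c ? (r<=a) : (r<a);
// *rp++=r;
// }
// return c;
//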
168.global bn_add_words#
169.proc bn_add_words#
170.align 64
171.skip 32 // makes the loop body aligned at 64-byte boundary
172bn_add_words:
173 .prologue
174 .save ar.pfs,r2
175{ .mii; alloc r2=ar.pfs,4,12,0,16
176 cmp4.le p6,p0=r35,r0 };;
177{ .mfb; mov r8=r0 // return value
178(p6) br.ret.spnt.many b0 };;
179
180{ .mib; sub r10=r35,r0,1
181 .save ar.lc,r3
182 mov r3=ar.lc
183 brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16
184 }
185{ .mib; ADDP r14=0,r32 // rp
186 .save pr,r9
187 mov r9=pr };;
188 .body
189{ .mii; ADDP r15=0,r33 // ap
190 mov ar.lc=r10
191 mov ar.ec=6 }
192{ .mib; ADDP r16=0,r34 // bp
193 mov pr.rot=1<<16 };;
194
195.L_bn_add_words_ctop:
196{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
197 (p18) add r39=r37,r34
198 (p19) cmp.ltu.unc p56,p0=r40,r38 }
199{ .mfb; (p0) nop.m 0x0
200 (p0) nop.f 0x0
201 (p0) nop.b 0x0 }
202{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++)
203 (p58) cmp.eq.or p57,p0=-1,r41 // (p20)
204 (p58) add r41=1,r41 } // (p20)
205{ .mfb; (p21) st8 [r14]=r42,8 // *(rp++)=r
206 (p0) nop.f 0x0
207 br.ctop.sptk .L_bn_add_words_ctop };;
208.L_bn_add_words_cend:
209
210{ .mii;
211(p59) add r8=1,r8 // return value
212 mov pr=r9,0x1ffff
213 mov ar.lc=r3 }
214{ .mbb; nop.b 0x0
215 br.ret.sptk.many b0 };;
216.endp bn_add_words#
217
218//
219// BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
220//
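// Analogously, a hedged C sketch of the semantics (not the OpenSSL C
// source; assumes a 64-bit BN_ULONG). The return value is the borrow
// out of the top word:
//
// BN_ULONG b=0;
// while (num-- > 0)
// { BN_ULONG a=*ap++, r=a - *bp++ - b;
// b = b ? (r>=a) : (r>a);
// *rp++=r;
// }
// return b;
//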
221.global bn_sub_words#
222.proc bn_sub_words#
223.align 64
224.skip 32 // makes the loop body aligned at 64-byte boundary
225bn_sub_words:
226 .prologue
227 .save ar.pfs,r2
228{ .mii; alloc r2=ar.pfs,4,12,0,16
229 cmp4.le p6,p0=r35,r0 };;
230{ .mfb; mov r8=r0 // return value
231(p6) br.ret.spnt.many b0 };;
232
233{ .mib; sub r10=r35,r0,1
234 .save ar.lc,r3
235 mov r3=ar.lc
236 brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
237 }
238{ .mib; ADDP r14=0,r32 // rp
239 .save pr,r9
240 mov r9=pr };;
241 .body
242{ .mii; ADDP r15=0,r33 // ap
243 mov ar.lc=r10
244 mov ar.ec=6 }
245{ .mib; ADDP r16=0,r34 // bp
246 mov pr.rot=1<<16 };;
247
248.L_bn_sub_words_ctop:
249{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
250 (p18) sub r39=r37,r34
251 (p19) cmp.gtu.unc p56,p0=r40,r38 }
252{ .mfb; (p0) nop.m 0x0
253 (p0) nop.f 0x0
254 (p0) nop.b 0x0 }
255{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++)
256 (p58) cmp.eq.or p57,p0=0,r41 // (p20)
257 (p58) add r41=-1,r41 } // (p20)
258{ .mbb; (p21) st8 [r14]=r42,8 // *(rp++)=r
259 (p0) nop.b 0x0
260 br.ctop.sptk .L_bn_sub_words_ctop };;
261.L_bn_sub_words_cend:
262
263{ .mii;
264(p59) add r8=1,r8 // return value
265 mov pr=r9,0x1ffff
266 mov ar.lc=r3 }
267{ .mbb; nop.b 0x0
268 br.ret.sptk.many b0 };;
269.endp bn_sub_words#
270#endif
271
272#if 0
273#define XMA_TEMPTATION
274#endif
275
276#if 1
277//
278// BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
279//
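// For reference, a hedged C sketch of the semantics (not the OpenSSL C
// source; assumes a 64-bit BN_ULONG and unsigned __int128 support). The
// return value is the high word left over after the last multiplication:
//
// BN_ULONG c=0;
// while (num-- > 0)
// { unsigned __int128 t=(unsigned __int128)*ap++ * w + c;
// *rp++=(BN_ULONG)t;
// c=(BN_ULONG)(t>>64);
// }
// return c;
//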
280.global bn_mul_words#
281.proc bn_mul_words#
282.align 64
283.skip 32 // makes the loop body aligned at 64-byte boundary
284bn_mul_words:
285 .prologue
286 .save ar.pfs,r2
287#ifdef XMA_TEMPTATION
288{ .mfi; alloc r2=ar.pfs,4,0,0,0 };;
289#else
290{ .mfi; alloc r2=ar.pfs,4,12,0,16 };;
291#endif
292{ .mib; mov r8=r0 // return value
293 cmp4.le p6,p0=r34,r0
294(p6) br.ret.spnt.many b0 };;
295
296{ .mii; sub r10=r34,r0,1
297 .save ar.lc,r3
298 mov r3=ar.lc
299 .save pr,r9
300 mov r9=pr };;
301
302 .body
303{ .mib; setf.sig f8=r35 // w
304 mov pr.rot=0x800001<<16
305 // ------^----- serves as (p50) at first (p27)
306 brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
307 }
308
309#ifndef XMA_TEMPTATION
310
311{ .mmi; ADDP r14=0,r32 // rp
312 ADDP r15=0,r33 // ap
313 mov ar.lc=r10 }
314{ .mmi; mov r40=0 // serves as r35 at first (p27)
315 mov ar.ec=13 };;
316
317// This loop spins in 2*(n+12) ticks. It's scheduled for data in the
318// Itanium L2 cache (i.e. 9 ticks away), as floating point load/store
319// instructions bypass the L1 cache and L2 latency is actually the
320// best-case scenario for ldf8. The loop is not scalable and shall run in
321// 2*(n+12) even on "wider" IA-64 implementations. It's a trade-off here.
322// An n+24 loop would give us ~5% *overall* performance improvement on
323// "wider" IA-64, but would hurt Itanium by about the same because of the
324// longer epilogue. As it's a matter of a few percent either way, I've
325// chosen to trade the scalability for development time (you can see
326// this very instruction sequence in the bn_mul_add_words loop, which in
327// turn is scalable).
328.L_bn_mul_words_ctop:
329{ .mfi; (p25) getf.sig r36=f52 // low
330 (p21) xmpy.lu f48=f37,f8
331 (p28) cmp.ltu p54,p50=r41,r39 }
332{ .mfi; (p16) ldf8 f32=[r15],8
333 (p21) xmpy.hu f40=f37,f8
334 (p0) nop.i 0x0 };;
335{ .mii; (p25) getf.sig r32=f44 // high
336 .pred.rel "mutex",p50,p54
337 (p50) add r40=r38,r35 // (p27)
338 (p54) add r40=r38,r35,1 } // (p27)
339{ .mfb; (p28) st8 [r14]=r41,8
340 (p0) nop.f 0x0
341 br.ctop.sptk .L_bn_mul_words_ctop };;
342.L_bn_mul_words_cend:
343
344{ .mii; nop.m 0x0
345.pred.rel "mutex",p51,p55
346(p51) add r8=r36,r0
347(p55) add r8=r36,r0,1 }
348{ .mfb; nop.m 0x0
349 nop.f 0x0
350 nop.b 0x0 }
351
352#else // XMA_TEMPTATION
353
354 setf.sig f37=r0 // serves as carry at (p18) tick
355 mov ar.lc=r10
356 mov ar.ec=5;;
357
358// Most of you examining this code very likely wonder why in the name
359// of Intel the following loop is commented out. Indeed, it looks so
360// neat that you find it hard to believe that there's something wrong
361// with it, right? The catch is that every iteration depends on the
362// result of the previous one, and the latter isn't available instantly.
363// The loop therefore spins at the latency of xma minus 1, or in other
364// words at 6*(n+4) ticks:-( Compare to the "production" loop above
365// that runs in 2*(n+11), where the latency problem is worked around
366// by moving the dependency to the one-tick-latency integer ALU. Note that
367// the "distance" between ldf8 and xma is not the latency of ldf8, but the
368// *difference* between the xma and ldf8 latencies.
369.L_bn_mul_words_ctop:
370{ .mfi; (p16) ldf8 f32=[r33],8
371 (p18) xma.hu f38=f34,f8,f39 }
372{ .mfb; (p20) stf8 [r32]=f37,8
373 (p18) xma.lu f35=f34,f8,f39
374 br.ctop.sptk .L_bn_mul_words_ctop };;
375.L_bn_mul_words_cend:
376
377 getf.sig r8=f41 // the return value
378
379#endif // XMA_TEMPTATION
380
381{ .mii; nop.m 0x0
382 mov pr=r9,0x1ffff
383 mov ar.lc=r3 }
384{ .mfb; rum 1<<5 // clear um.mfh
385 nop.f 0x0
386 br.ret.sptk.many b0 };;
387.endp bn_mul_words#
388#endif
389
390#if 1
391//
392// BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
393//
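// Same deal: a hedged C sketch of the contract (not the OpenSSL C source;
// assumes a 64-bit BN_ULONG and unsigned __int128 support). Note the
// extra *rp term compared to bn_mul_words:
//
// BN_ULONG c=0;
// while (num-- > 0)
// { unsigned __int128 t=(unsigned __int128)*ap++ * w + *rp + c;
// *rp++=(BN_ULONG)t;
// c=(BN_ULONG)(t>>64);
// }
// return c;
//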
394.global bn_mul_add_words#
395.proc bn_mul_add_words#
396.align 64
397.skip 48 // makes the loop body aligned at 64-byte boundary
398bn_mul_add_words:
399 .prologue
400 .save ar.pfs,r2
401{ .mmi; alloc r2=ar.pfs,4,4,0,8
402 cmp4.le p6,p0=r34,r0
403 .save ar.lc,r3
404 mov r3=ar.lc };;
405{ .mib; mov r8=r0 // return value
406 sub r10=r34,r0,1
407(p6) br.ret.spnt.many b0 };;
408
409{ .mib; setf.sig f8=r35 // w
410 .save pr,r9
411 mov r9=pr
412 brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
413 }
414 .body
415{ .mmi; ADDP r14=0,r32 // rp
416 ADDP r15=0,r33 // ap
417 mov ar.lc=r10 }
418{ .mii; ADDP r16=0,r32 // rp copy
419 mov pr.rot=0x2001<<16
420 // ------^----- serves as (p40) at first (p27)
421 mov ar.ec=11 };;
422
423// This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
424// Itanium 2. Yes, unlike previous versions it scales:-) The previous
425// version performed *all* additions in the IALU and was starving
426// for those even on Itanium 2. In this version one addition is
427// moved to the FPU and folded with the multiplication. This comes at
428// the cost of propagating the result of the previous call to this
429// subroutine to L2 cache... In other words, negligible even for
430// shorter keys. The *overall* performance improvement [over the
431// previous version] varies from 11 to 22 percent depending on key length.
432.L_bn_mul_add_words_ctop:
433.pred.rel "mutex",p40,p42
434{ .mfi; (p23) getf.sig r36=f45 // low
435 (p20) xma.lu f42=f36,f8,f50 // low
436 (p40) add r39=r39,r35 } // (p27)
437{ .mfi; (p16) ldf8 f32=[r15],8 // *(ap++)
438 (p20) xma.hu f36=f36,f8,f50 // high
439 (p42) add r39=r39,r35,1 };; // (p27)
440{ .mmi; (p24) getf.sig r32=f40 // high
441 (p16) ldf8 f46=[r16],8 // *(rp1++)
442 (p40) cmp.ltu p41,p39=r39,r35 } // (p27)
443{ .mib; (p26) st8 [r14]=r39,8 // *(rp2++)
444 (p42) cmp.leu p41,p39=r39,r35 // (p27)
445 br.ctop.sptk .L_bn_mul_add_words_ctop};;
446.L_bn_mul_add_words_cend:
447
448{ .mmi; .pred.rel "mutex",p40,p42
449(p40) add r8=r35,r0
450(p42) add r8=r35,r0,1
451 mov pr=r9,0x1ffff }
452{ .mib; rum 1<<5 // clear um.mfh
453 mov ar.lc=r3
454 br.ret.sptk.many b0 };;
455.endp bn_mul_add_words#
456#endif
457
458#if 1
459//
460// void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
461//
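// And a hedged C sketch of this one (not the OpenSSL C source; assumes
// a 64-bit BN_ULONG and unsigned __int128 support): each input word
// yields two output words, low half first:
//
// while (num-- > 0)
// { unsigned __int128 t=(unsigned __int128)*ap * *ap; ap++;
// *rp++=(BN_ULONG)t;
// *rp++=(BN_ULONG)(t>>64);
// }
//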
462.global bn_sqr_words#
463.proc bn_sqr_words#
464.align 64
465.skip 32 // makes the loop body aligned at 64-byte boundary
466bn_sqr_words:
467 .prologue
468 .save ar.pfs,r2
469{ .mii; alloc r2=ar.pfs,3,0,0,0
470 sxt4 r34=r34 };;
471{ .mii; cmp.le p6,p0=r34,r0
472 mov r8=r0 } // return value
473{ .mfb; ADDP r32=0,r32
474 nop.f 0x0
475(p6) br.ret.spnt.many b0 };;
476
477{ .mii; sub r10=r34,r0,1
478 .save ar.lc,r3
479 mov r3=ar.lc
480 .save pr,r9
481 mov r9=pr };;
482
483 .body
484{ .mib; ADDP r33=0,r33
485 mov pr.rot=1<<16
486 brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
487 }
488{ .mii; add r34=8,r32
489 mov ar.lc=r10
490 mov ar.ec=18 };;
491
492// 2*(n+17) on Itanium, (n+17) on "wider" IA-64 implementations. It's
493// possible to compress the epilogue (I'm getting tired of writing this
494// comment over and over) and get down to 2*n+16 at the cost of
495// scalability. The decision will very likely be reconsidered after the
496// benchmark program is profiled. I.e. if the performance gain on Itanium
497// appears larger than the loss on "wider" IA-64, then the loop should
498// be explicitly split and the epilogue compressed.
499.L_bn_sqr_words_ctop:
500{ .mfi; (p16) ldf8 f32=[r33],8
501 (p25) xmpy.lu f42=f41,f41
502 (p0) nop.i 0x0 }
503{ .mib; (p33) stf8 [r32]=f50,16
504 (p0) nop.i 0x0
505 (p0) nop.b 0x0 }
506{ .mfi; (p0) nop.m 0x0
507 (p25) xmpy.hu f52=f41,f41
508 (p0) nop.i 0x0 }
509{ .mib; (p33) stf8 [r34]=f60,16
510 (p0) nop.i 0x0
511 br.ctop.sptk .L_bn_sqr_words_ctop };;
512.L_bn_sqr_words_cend:
513
514{ .mii; nop.m 0x0
515 mov pr=r9,0x1ffff
516 mov ar.lc=r3 }
517{ .mfb; rum 1<<5 // clear um.mfh
518 nop.f 0x0
519 br.ret.sptk.many b0 };;
520.endp bn_sqr_words#
521#endif
522
523#if 1
524// Apparently we win nothing by implementing a special bn_sqr_comba8.
525// Yes, it is possible to reduce the number of multiplications by
526// almost a factor of two, but then the number of additions would
527// increase by a factor of two (as we would have to perform those
528// otherwise performed by xma ourselves). Normally we would trade
529// anyway, as multiplications are way more expensive, but not this
530// time... The multiplication kernel is fully pipelined, and as we drain
531// one 128-bit multiplication result per clock cycle, multiplications
532// are effectively as inexpensive as additions. A special implementation
533// might become of interest for "wider" IA-64 implementations, as you'd
534// be able to get through the multiplication phase faster (there won't
535// be any stall issues as discussed in the commentary section below, and
536// you will therefore be able to employ all 4 FP units)... But in these
537// Itanium days it's simply too hard to justify the effort, so I just
538// drop down to the bn_mul_comba8 code:-)
539//
540// void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
541//
542.global bn_sqr_comba8#
543.proc bn_sqr_comba8#
544.align 64
545bn_sqr_comba8:
546 .prologue
547 .save ar.pfs,r2
548#if defined(_HPUX_SOURCE) && !defined(_LP64)
549{ .mii; alloc r2=ar.pfs,2,1,0,0
550 addp4 r33=0,r33
551 addp4 r32=0,r32 };;
552{ .mii;
553#else
554{ .mii; alloc r2=ar.pfs,2,1,0,0
555#endif
556 mov r34=r33
557 add r14=8,r33 };;
558 .body
559{ .mii; add r17=8,r34
560 add r15=16,r33
561 add r18=16,r34 }
562{ .mfb; add r16=24,r33
563 br .L_cheat_entry_point8 };;
564.endp bn_sqr_comba8#
565#endif
566
567#if 1
568// I've estimated this routine to run in ~120 ticks, but in reality
569// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
570// cycles consumed by instruction fetch? Or did I misinterpret some
571// clause in the Itanium µ-architecture manual? Comments are welcome and
572// highly appreciated.
573//
574// On Itanium 2 it takes ~190 ticks. This is because of stalls on
575// result from getf.sig. I do nothing about it at this point for
576// reasons depicted below.
577//
578// However! It should be noted that even 160 ticks is a darn good result,
579// as it's over 10 (yes, ten, spelled as t-e-n) times faster than the
580// C version (compiled with gcc with the inline assembler). I really
581// kicked the compiler's butt here, didn't I? Yeah! This brings us to the
582// following statement. It's a damn shame that this routine isn't called
583// very often nowadays! According to the profiler, most CPU time is
584// consumed by bn_mul_add_words called from BN_from_montgomery. In
585// order to estimate what we're missing, I've compared the performance
586// of this routine against a "traditional" implementation, i.e. against
587// the following routine:
588//
589// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
590// { r[ 8]=bn_mul_words( &(r[0]),a,8,b[0]);
591// r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
592// r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
593// r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
594// r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
595// r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
596// r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
597// r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
598// }
599//
600// The one below is over 8 times faster than the one above:-( Even
601// more reason to "combafy" bn_mul_add_mont...
602//
603// And yes, this routine really made me wish there were an optimizing
604// assembler! It also feels like it deserves a dedication.
605//
606// To my wife for being there and to my kids...
607//
608// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
609//
610#define carry1 r14
611#define carry2 r15
612#define carry3 r34
613.global bn_mul_comba8#
614.proc bn_mul_comba8#
615.align 64
616bn_mul_comba8:
617 .prologue
618 .save ar.pfs,r2
619#if defined(_HPUX_SOURCE) && !defined(_LP64)
620{ .mii; alloc r2=ar.pfs,3,0,0,0
621 addp4 r33=0,r33
622 addp4 r34=0,r34 };;
623{ .mii; addp4 r32=0,r32
624#else
625{ .mii; alloc r2=ar.pfs,3,0,0,0
626#endif
627 add r14=8,r33
628 add r17=8,r34 }
629 .body
630{ .mii; add r15=16,r33
631 add r18=16,r34
632 add r16=24,r33 }
633.L_cheat_entry_point8:
634{ .mmi; add r19=24,r34
635
636 ldf8 f32=[r33],32 };;
637
638{ .mmi; ldf8 f120=[r34],32
639 ldf8 f121=[r17],32 }
640{ .mmi; ldf8 f122=[r18],32
641 ldf8 f123=[r19],32 };;
642{ .mmi; ldf8 f124=[r34]
643 ldf8 f125=[r17] }
644{ .mmi; ldf8 f126=[r18]
645 ldf8 f127=[r19] }
646
647{ .mmi; ldf8 f33=[r14],32
648 ldf8 f34=[r15],32 }
649{ .mmi; ldf8 f35=[r16],32;;
650 ldf8 f36=[r33] }
651{ .mmi; ldf8 f37=[r14]
652 ldf8 f38=[r15] }
653{ .mfi; ldf8 f39=[r16]
654// -------\ Entering multiplier's heaven /-------
655// ------------\ /------------
656// -----------------\ /-----------------
657// ----------------------\/----------------------
658 xma.hu f41=f32,f120,f0 }
659{ .mfi; xma.lu f40=f32,f120,f0 };; // (*)
660{ .mfi; xma.hu f51=f32,f121,f0 }
661{ .mfi; xma.lu f50=f32,f121,f0 };;
662{ .mfi; xma.hu f61=f32,f122,f0 }
663{ .mfi; xma.lu f60=f32,f122,f0 };;
664{ .mfi; xma.hu f71=f32,f123,f0 }
665{ .mfi; xma.lu f70=f32,f123,f0 };;
666{ .mfi; xma.hu f81=f32,f124,f0 }
667{ .mfi; xma.lu f80=f32,f124,f0 };;
668{ .mfi; xma.hu f91=f32,f125,f0 }
669{ .mfi; xma.lu f90=f32,f125,f0 };;
670{ .mfi; xma.hu f101=f32,f126,f0 }
671{ .mfi; xma.lu f100=f32,f126,f0 };;
672{ .mfi; xma.hu f111=f32,f127,f0 }
673{ .mfi; xma.lu f110=f32,f127,f0 };;//
674// (*) You can argue that splitting at every second bundle would
675// prevent "wider" IA-64 implementations from achieving the peak
676// performance. Well, not really... The catch is that if you
677// intend to keep 4 FP units busy by splitting at every fourth
678// bundle and thus perform these 16 multiplications in 4 ticks,
679// the first bundle *below* would stall, because the result from
680// the first xma bundle *above* won't be available for another 3
681// ticks (if not more; being an optimist, I assume that a "wider"
682// implementation will have the same latency:-). This stall will hold
683// you back, and the performance would be as if every second bundle
684// were split *anyway*...
685{ .mfi; getf.sig r16=f40
686 xma.hu f42=f33,f120,f41
687 add r33=8,r32 }
688{ .mfi; xma.lu f41=f33,f120,f41 };;
689{ .mfi; getf.sig r24=f50
690 xma.hu f52=f33,f121,f51 }
691{ .mfi; xma.lu f51=f33,f121,f51 };;
692{ .mfi; st8 [r32]=r16,16
693 xma.hu f62=f33,f122,f61 }
694{ .mfi; xma.lu f61=f33,f122,f61 };;
695{ .mfi; xma.hu f72=f33,f123,f71 }
696{ .mfi; xma.lu f71=f33,f123,f71 };;
697{ .mfi; xma.hu f82=f33,f124,f81 }
698{ .mfi; xma.lu f81=f33,f124,f81 };;
699{ .mfi; xma.hu f92=f33,f125,f91 }
700{ .mfi; xma.lu f91=f33,f125,f91 };;
701{ .mfi; xma.hu f102=f33,f126,f101 }
702{ .mfi; xma.lu f101=f33,f126,f101 };;
703{ .mfi; xma.hu f112=f33,f127,f111 }
704{ .mfi; xma.lu f111=f33,f127,f111 };;//
705//-------------------------------------------------//
706{ .mfi; getf.sig r25=f41
707 xma.hu f43=f34,f120,f42 }
708{ .mfi; xma.lu f42=f34,f120,f42 };;
709{ .mfi; getf.sig r16=f60
710 xma.hu f53=f34,f121,f52 }
711{ .mfi; xma.lu f52=f34,f121,f52 };;
712{ .mfi; getf.sig r17=f51
713 xma.hu f63=f34,f122,f62
714 add r25=r25,r24 }
715{ .mfi; xma.lu f62=f34,f122,f62
716 mov carry1=0 };;
717{ .mfi; cmp.ltu p6,p0=r25,r24
718 xma.hu f73=f34,f123,f72 }
719{ .mfi; xma.lu f72=f34,f123,f72 };;
720{ .mfi; st8 [r33]=r25,16
721 xma.hu f83=f34,f124,f82
722(p6) add carry1=1,carry1 }
723{ .mfi; xma.lu f82=f34,f124,f82 };;
724{ .mfi; xma.hu f93=f34,f125,f92 }
725{ .mfi; xma.lu f92=f34,f125,f92 };;
726{ .mfi; xma.hu f103=f34,f126,f102 }
727{ .mfi; xma.lu f102=f34,f126,f102 };;
728{ .mfi; xma.hu f113=f34,f127,f112 }
729{ .mfi; xma.lu f112=f34,f127,f112 };;//
730//-------------------------------------------------//
731{ .mfi; getf.sig r18=f42
732 xma.hu f44=f35,f120,f43
733 add r17=r17,r16 }
734{ .mfi; xma.lu f43=f35,f120,f43 };;
735{ .mfi; getf.sig r24=f70
736 xma.hu f54=f35,f121,f53 }
737{ .mfi; mov carry2=0
738 xma.lu f53=f35,f121,f53 };;
739{ .mfi; getf.sig r25=f61
740 xma.hu f64=f35,f122,f63
741 cmp.ltu p7,p0=r17,r16 }
742{ .mfi; add r18=r18,r17
743 xma.lu f63=f35,f122,f63 };;
744{ .mfi; getf.sig r26=f52
745 xma.hu f74=f35,f123,f73
746(p7) add carry2=1,carry2 }
747{ .mfi; cmp.ltu p7,p0=r18,r17
748 xma.lu f73=f35,f123,f73
749 add r18=r18,carry1 };;
750{ .mfi;
751 xma.hu f84=f35,f124,f83
752(p7) add carry2=1,carry2 }
753{ .mfi; cmp.ltu p7,p0=r18,carry1
754 xma.lu f83=f35,f124,f83 };;
755{ .mfi; st8 [r32]=r18,16
756 xma.hu f94=f35,f125,f93
757(p7) add carry2=1,carry2 }
758{ .mfi; xma.lu f93=f35,f125,f93 };;
759{ .mfi; xma.hu f104=f35,f126,f103 }
760{ .mfi; xma.lu f103=f35,f126,f103 };;
761{ .mfi; xma.hu f114=f35,f127,f113 }
762{ .mfi; mov carry1=0
763 xma.lu f113=f35,f127,f113
764 add r25=r25,r24 };;//
765//-------------------------------------------------//
766{ .mfi; getf.sig r27=f43
767 xma.hu f45=f36,f120,f44
768 cmp.ltu p6,p0=r25,r24 }
769{ .mfi; xma.lu f44=f36,f120,f44
770 add r26=r26,r25 };;
771{ .mfi; getf.sig r16=f80
772 xma.hu f55=f36,f121,f54
773(p6) add carry1=1,carry1 }
774{ .mfi; xma.lu f54=f36,f121,f54 };;
775{ .mfi; getf.sig r17=f71
776 xma.hu f65=f36,f122,f64
777 cmp.ltu p6,p0=r26,r25 }
778{ .mfi; xma.lu f64=f36,f122,f64
779 add r27=r27,r26 };;
780{ .mfi; getf.sig r18=f62
781 xma.hu f75=f36,f123,f74
782(p6) add carry1=1,carry1 }
783{ .mfi; cmp.ltu p6,p0=r27,r26
784 xma.lu f74=f36,f123,f74
785 add r27=r27,carry2 };;
786{ .mfi; getf.sig r19=f53
787 xma.hu f85=f36,f124,f84
788(p6) add carry1=1,carry1 }
789{ .mfi; xma.lu f84=f36,f124,f84
790 cmp.ltu p6,p0=r27,carry2 };;
791{ .mfi; st8 [r33]=r27,16
792 xma.hu f95=f36,f125,f94
793(p6) add carry1=1,carry1 }
794{ .mfi; xma.lu f94=f36,f125,f94 };;
795{ .mfi; xma.hu f105=f36,f126,f104 }
796{ .mfi; mov carry2=0
797 xma.lu f104=f36,f126,f104
798 add r17=r17,r16 };;
799{ .mfi; xma.hu f115=f36,f127,f114
800 cmp.ltu p7,p0=r17,r16 }
801{ .mfi; xma.lu f114=f36,f127,f114
802 add r18=r18,r17 };;//
803//-------------------------------------------------//
804{ .mfi; getf.sig r20=f44
805 xma.hu f46=f37,f120,f45
806(p7) add carry2=1,carry2 }
807{ .mfi; cmp.ltu p7,p0=r18,r17
808 xma.lu f45=f37,f120,f45
809 add r19=r19,r18 };;
810{ .mfi; getf.sig r24=f90
811 xma.hu f56=f37,f121,f55 }
812{ .mfi; xma.lu f55=f37,f121,f55 };;
813{ .mfi; getf.sig r25=f81
814 xma.hu f66=f37,f122,f65
815(p7) add carry2=1,carry2 }
816{ .mfi; cmp.ltu p7,p0=r19,r18
817 xma.lu f65=f37,f122,f65
818 add r20=r20,r19 };;
819{ .mfi; getf.sig r26=f72
820 xma.hu f76=f37,f123,f75
821(p7) add carry2=1,carry2 }
822{ .mfi; cmp.ltu p7,p0=r20,r19
823 xma.lu f75=f37,f123,f75
824 add r20=r20,carry1 };;
825{ .mfi; getf.sig r27=f63
826 xma.hu f86=f37,f124,f85
827(p7) add carry2=1,carry2 }
828{ .mfi; xma.lu f85=f37,f124,f85
829 cmp.ltu p7,p0=r20,carry1 };;
830{ .mfi; getf.sig r28=f54
831 xma.hu f96=f37,f125,f95
832(p7) add carry2=1,carry2 }
833{ .mfi; st8 [r32]=r20,16
834 xma.lu f95=f37,f125,f95 };;
835{ .mfi; xma.hu f106=f37,f126,f105 }
836{ .mfi; mov carry1=0
837 xma.lu f105=f37,f126,f105
838 add r25=r25,r24 };;
839{ .mfi; xma.hu f116=f37,f127,f115
840 cmp.ltu p6,p0=r25,r24 }
841{ .mfi; xma.lu f115=f37,f127,f115
842 add r26=r26,r25 };;//
843//-------------------------------------------------//
844{ .mfi; getf.sig r29=f45
845 xma.hu f47=f38,f120,f46
846(p6) add carry1=1,carry1 }
847{ .mfi; cmp.ltu p6,p0=r26,r25
848 xma.lu f46=f38,f120,f46
849 add r27=r27,r26 };;
850{ .mfi; getf.sig r16=f100
851 xma.hu f57=f38,f121,f56
852(p6) add carry1=1,carry1 }
853{ .mfi; cmp.ltu p6,p0=r27,r26
854 xma.lu f56=f38,f121,f56
855 add r28=r28,r27 };;
856{ .mfi; getf.sig r17=f91
857 xma.hu f67=f38,f122,f66
858(p6) add carry1=1,carry1 }
859{ .mfi; cmp.ltu p6,p0=r28,r27
860 xma.lu f66=f38,f122,f66
861 add r29=r29,r28 };;
862{ .mfi; getf.sig r18=f82
863 xma.hu f77=f38,f123,f76
864(p6) add carry1=1,carry1 }
865{ .mfi; cmp.ltu p6,p0=r29,r28
866 xma.lu f76=f38,f123,f76
867 add r29=r29,carry2 };;
868{ .mfi; getf.sig r19=f73
869 xma.hu f87=f38,f124,f86
870(p6) add carry1=1,carry1 }
871{ .mfi; xma.lu f86=f38,f124,f86
872 cmp.ltu p6,p0=r29,carry2 };;
873{ .mfi; getf.sig r20=f64
874 xma.hu f97=f38,f125,f96
875(p6) add carry1=1,carry1 }
876{ .mfi; st8 [r33]=r29,16
877 xma.lu f96=f38,f125,f96 };;
878{ .mfi; getf.sig r21=f55
879 xma.hu f107=f38,f126,f106 }
880{ .mfi; mov carry2=0
881 xma.lu f106=f38,f126,f106
882 add r17=r17,r16 };;
883{ .mfi; xma.hu f117=f38,f127,f116
884 cmp.ltu p7,p0=r17,r16 }
885{ .mfi; xma.lu f116=f38,f127,f116
886 add r18=r18,r17 };;//
887//-------------------------------------------------//
888{ .mfi; getf.sig r22=f46
889 xma.hu f48=f39,f120,f47
890(p7) add carry2=1,carry2 }
891{ .mfi; cmp.ltu p7,p0=r18,r17
892 xma.lu f47=f39,f120,f47
893 add r19=r19,r18 };;
894{ .mfi; getf.sig r24=f110
895 xma.hu f58=f39,f121,f57
896(p7) add carry2=1,carry2 }
897{ .mfi; cmp.ltu p7,p0=r19,r18
898 xma.lu f57=f39,f121,f57
899 add r20=r20,r19 };;
900{ .mfi; getf.sig r25=f101
901 xma.hu f68=f39,f122,f67
902(p7) add carry2=1,carry2 }
903{ .mfi; cmp.ltu p7,p0=r20,r19
904 xma.lu f67=f39,f122,f67
905 add r21=r21,r20 };;
906{ .mfi; getf.sig r26=f92
907 xma.hu f78=f39,f123,f77
908(p7) add carry2=1,carry2 }
909{ .mfi; cmp.ltu p7,p0=r21,r20
910 xma.lu f77=f39,f123,f77
911 add r22=r22,r21 };;
912{ .mfi; getf.sig r27=f83
913 xma.hu f88=f39,f124,f87
914(p7) add carry2=1,carry2 }
915{ .mfi; cmp.ltu p7,p0=r22,r21
916 xma.lu f87=f39,f124,f87
917 add r22=r22,carry1 };;
918{ .mfi; getf.sig r28=f74
919 xma.hu f98=f39,f125,f97
920(p7) add carry2=1,carry2 }
921{ .mfi; xma.lu f97=f39,f125,f97
922 cmp.ltu p7,p0=r22,carry1 };;
923{ .mfi; getf.sig r29=f65
924 xma.hu f108=f39,f126,f107
925(p7) add carry2=1,carry2 }
926{ .mfi; st8 [r32]=r22,16
927 xma.lu f107=f39,f126,f107 };;
928{ .mfi; getf.sig r30=f56
929 xma.hu f118=f39,f127,f117 }
930{ .mfi; xma.lu f117=f39,f127,f117 };;//
931//-------------------------------------------------//
932// Leaving multiplier's heaven... Quite a ride, huh?
933
934{ .mii; getf.sig r31=f47
935 add r25=r25,r24
936 mov carry1=0 };;
937{ .mii; getf.sig r16=f111
938 cmp.ltu p6,p0=r25,r24
939 add r26=r26,r25 };;
940{ .mfb; getf.sig r17=f102 }
941{ .mii;
942(p6) add carry1=1,carry1
943 cmp.ltu p6,p0=r26,r25
944 add r27=r27,r26 };;
945{ .mfb; nop.m 0x0 }
946{ .mii;
947(p6) add carry1=1,carry1
948 cmp.ltu p6,p0=r27,r26
949 add r28=r28,r27 };;
950{ .mii; getf.sig r18=f93
951 add r17=r17,r16
952 mov carry3=0 }
953{ .mii;
954(p6) add carry1=1,carry1
955 cmp.ltu p6,p0=r28,r27
956 add r29=r29,r28 };;
957{ .mii; getf.sig r19=f84
958 cmp.ltu p7,p0=r17,r16 }
959{ .mii;
960(p6) add carry1=1,carry1
961 cmp.ltu p6,p0=r29,r28
962 add r30=r30,r29 };;
963{ .mii; getf.sig r20=f75
964 add r18=r18,r17 }
965{ .mii;
966(p6) add carry1=1,carry1
967 cmp.ltu p6,p0=r30,r29
968 add r31=r31,r30 };;
969{ .mfb; getf.sig r21=f66 }
970{ .mii; (p7) add carry3=1,carry3
971 cmp.ltu p7,p0=r18,r17
972 add r19=r19,r18 }
973{ .mfb; nop.m 0x0 }
974{ .mii;
975(p6) add carry1=1,carry1
976 cmp.ltu p6,p0=r31,r30
977 add r31=r31,carry2 };;
978{ .mfb; getf.sig r22=f57 }
979{ .mii; (p7) add carry3=1,carry3
980 cmp.ltu p7,p0=r19,r18
981 add r20=r20,r19 }
982{ .mfb; nop.m 0x0 }
983{ .mii;
984(p6) add carry1=1,carry1
985 cmp.ltu p6,p0=r31,carry2 };;
986{ .mfb; getf.sig r23=f48 }
987{ .mii; (p7) add carry3=1,carry3
988 cmp.ltu p7,p0=r20,r19
989 add r21=r21,r20 }
990{ .mii;
991(p6) add carry1=1,carry1 }
992{ .mfb; st8 [r33]=r31,16 };;
993
994{ .mfb; getf.sig r24=f112 }
995{ .mii; (p7) add carry3=1,carry3
996 cmp.ltu p7,p0=r21,r20
997 add r22=r22,r21 };;
998{ .mfb; getf.sig r25=f103 }
999{ .mii; (p7) add carry3=1,carry3
1000 cmp.ltu p7,p0=r22,r21
1001 add r23=r23,r22 };;
1002{ .mfb; getf.sig r26=f94 }
1003{ .mii; (p7) add carry3=1,carry3
1004 cmp.ltu p7,p0=r23,r22
1005 add r23=r23,carry1 };;
1006{ .mfb; getf.sig r27=f85 }
1007{ .mii; (p7) add carry3=1,carry3
1008 cmp.ltu p7,p8=r23,carry1};;
1009{ .mii; getf.sig r28=f76
1010 add r25=r25,r24
1011 mov carry1=0 }
1012{ .mii; st8 [r32]=r23,16
1013 (p7) add carry2=1,carry3
1014 (p8) add carry2=0,carry3 };;
1015
1016{ .mfb; nop.m 0x0 }
1017{ .mii; getf.sig r29=f67
1018 cmp.ltu p6,p0=r25,r24
1019 add r26=r26,r25 };;
1020{ .mfb; getf.sig r30=f58 }
1021{ .mii;
1022(p6) add carry1=1,carry1
1023 cmp.ltu p6,p0=r26,r25
1024 add r27=r27,r26 };;
1025{ .mfb; getf.sig r16=f113 }
1026{ .mii;
1027(p6) add carry1=1,carry1
1028 cmp.ltu p6,p0=r27,r26
1029 add r28=r28,r27 };;
1030{ .mfb; getf.sig r17=f104 }
1031{ .mii;
1032(p6) add carry1=1,carry1
1033 cmp.ltu p6,p0=r28,r27
1034 add r29=r29,r28 };;
1035{ .mfb; getf.sig r18=f95 }
1036{ .mii;
1037(p6) add carry1=1,carry1
1038 cmp.ltu p6,p0=r29,r28
1039 add r30=r30,r29 };;
1040{ .mii; getf.sig r19=f86
1041 add r17=r17,r16
1042 mov carry3=0 }
1043{ .mii;
1044(p6) add carry1=1,carry1
1045 cmp.ltu p6,p0=r30,r29
1046 add r30=r30,carry2 };;
1047{ .mii; getf.sig r20=f77
1048 cmp.ltu p7,p0=r17,r16
1049 add r18=r18,r17 }
1050{ .mii;
1051(p6) add carry1=1,carry1
1052 cmp.ltu p6,p0=r30,carry2 };;
1053{ .mfb; getf.sig r21=f68 }
1054{ .mii; st8 [r33]=r30,16
1055(p6) add carry1=1,carry1 };;
1056
1057{ .mfb; getf.sig r24=f114 }
1058{ .mii; (p7) add carry3=1,carry3
1059 cmp.ltu p7,p0=r18,r17
1060 add r19=r19,r18 };;
1061{ .mfb; getf.sig r25=f105 }
1062{ .mii; (p7) add carry3=1,carry3
1063 cmp.ltu p7,p0=r19,r18
1064 add r20=r20,r19 };;
1065{ .mfb; getf.sig r26=f96 }
1066{ .mii; (p7) add carry3=1,carry3
1067 cmp.ltu p7,p0=r20,r19
1068 add r21=r21,r20 };;
1069{ .mfb; getf.sig r27=f87 }
1070{ .mii; (p7) add carry3=1,carry3
1071 cmp.ltu p7,p0=r21,r20
1072 add r21=r21,carry1 };;
1073{ .mib; getf.sig r28=f78
1074 add r25=r25,r24 }
1075{ .mib; (p7) add carry3=1,carry3
1076 cmp.ltu p7,p8=r21,carry1};;
1077{ .mii; st8 [r32]=r21,16
1078 (p7) add carry2=1,carry3
1079 (p8) add carry2=0,carry3 }
1080
1081{ .mii; mov carry1=0
1082 cmp.ltu p6,p0=r25,r24
1083 add r26=r26,r25 };;
1084{ .mfb; getf.sig r16=f115 }
1085{ .mii;
1086(p6) add carry1=1,carry1
1087 cmp.ltu p6,p0=r26,r25
1088 add r27=r27,r26 };;
1089{ .mfb; getf.sig r17=f106 }
1090{ .mii;
1091(p6) add carry1=1,carry1
1092 cmp.ltu p6,p0=r27,r26
1093 add r28=r28,r27 };;
1094{ .mfb; getf.sig r18=f97 }
1095{ .mii;
1096(p6) add carry1=1,carry1
1097 cmp.ltu p6,p0=r28,r27
1098 add r28=r28,carry2 };;
1099{ .mib; getf.sig r19=f88
1100 add r17=r17,r16 }
1101{ .mib;
1102(p6) add carry1=1,carry1
1103 cmp.ltu p6,p0=r28,carry2 };;
1104{ .mii; st8 [r33]=r28,16
1105(p6) add carry1=1,carry1 }
1106
1107{ .mii; mov carry2=0
1108 cmp.ltu p7,p0=r17,r16
1109 add r18=r18,r17 };;
1110{ .mfb; getf.sig r24=f116 }
1111{ .mii; (p7) add carry2=1,carry2
1112 cmp.ltu p7,p0=r18,r17
1113 add r19=r19,r18 };;
1114{ .mfb; getf.sig r25=f107 }
1115{ .mii; (p7) add carry2=1,carry2
1116 cmp.ltu p7,p0=r19,r18
1117 add r19=r19,carry1 };;
1118{ .mfb; getf.sig r26=f98 }
1119{ .mii; (p7) add carry2=1,carry2
1120 cmp.ltu p7,p0=r19,carry1};;
1121{ .mii; st8 [r32]=r19,16
1122 (p7) add carry2=1,carry2 }
1123
1124{ .mfb; add r25=r25,r24 };;
1125
1126{ .mfb; getf.sig r16=f117 }
1127{ .mii; mov carry1=0
1128 cmp.ltu p6,p0=r25,r24
1129 add r26=r26,r25 };;
1130{ .mfb; getf.sig r17=f108 }
1131{ .mii;
1132(p6) add carry1=1,carry1
1133 cmp.ltu p6,p0=r26,r25
1134 add r26=r26,carry2 };;
1135{ .mfb; nop.m 0x0 }
1136{ .mii;
1137(p6) add carry1=1,carry1
1138 cmp.ltu p6,p0=r26,carry2 };;
1139{ .mii; st8 [r33]=r26,16
1140(p6) add carry1=1,carry1 }
1141
1142{ .mfb; add r17=r17,r16 };;
1143{ .mfb; getf.sig r24=f118 }
1144{ .mii; mov carry2=0
1145 cmp.ltu p7,p0=r17,r16
1146 add r17=r17,carry1 };;
1147{ .mii; (p7) add carry2=1,carry2
1148 cmp.ltu p7,p0=r17,carry1};;
1149{ .mii; st8 [r32]=r17
1150 (p7) add carry2=1,carry2 };;
1151{ .mfb; add r24=r24,carry2 };;
1152{ .mib; st8 [r33]=r24 }
1153
1154{ .mib; rum 1<<5 // clear um.mfh
1155 br.ret.sptk.many b0 };;
1156.endp bn_mul_comba8#
1157#undef carry3
1158#undef carry2
1159#undef carry1
1160#endif
1161
1162#if 1
1163// It's possible to make it faster (see comment to bn_sqr_comba8), but
1164// I reckon it isn't worth the effort, basically because the routine
1165// (actually both of them) is practically never called... So I just play
1166// the same trick as with bn_sqr_comba8.
1167//
1168// void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
1169//
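// The trick in question: squaring is just self-multiplication, so the
// routine below only marshals its arguments and branches into
// bn_mul_comba4. A rough C sketch of the intent (expressed as a plain
// call for clarity; the real code branches straight to
// .L_cheat_entry_point4 instead of calling):
//
//	void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
//	{
//		bn_mul_comba4(r, a, a);	/* square == self-multiply */
//	}
//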
1170.global bn_sqr_comba4#
1171.proc bn_sqr_comba4#
1172.align 64
1173bn_sqr_comba4:
1174 .prologue
1175 .save ar.pfs,r2
1176#if defined(_HPUX_SOURCE) && !defined(_LP64)
1177{ .mii; alloc r2=ar.pfs,2,1,0,0
1178 addp4 r32=0,r32
1179 addp4 r33=0,r33 };;
1180{ .mii;
1181#else
1182{ .mii; alloc r2=ar.pfs,2,1,0,0
1183#endif
1184 mov r34=r33
1185 add r14=8,r33 };;
1186 .body
1187{ .mii; add r17=8,r34
1188 add r15=16,r33
1189 add r18=16,r34 }
1190{ .mfb; add r16=24,r33
1191 br .L_cheat_entry_point4 };;
1192.endp bn_sqr_comba4#
1193#endif
1194
1195#if 1
1196// Runs in ~115 cycles and is ~4.5 times faster than C. Well, whatever...
1197//
1198// void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1199//
1200#define carry1 r14
1201#define carry2 r15
1202.global bn_mul_comba4#
1203.proc bn_mul_comba4#
1204.align 64
1205bn_mul_comba4:
1206 .prologue
1207 .save ar.pfs,r2
1208#if defined(_HPUX_SOURCE) && !defined(_LP64)
1209{ .mii; alloc r2=ar.pfs,3,0,0,0
1210 addp4 r33=0,r33
1211 addp4 r34=0,r34 };;
1212{ .mii; addp4 r32=0,r32
1213#else
1214{ .mii; alloc r2=ar.pfs,3,0,0,0
1215#endif
1216 add r14=8,r33
1217 add r17=8,r34 }
1218 .body
1219{ .mii; add r15=16,r33
1220 add r18=16,r34
1221 add r16=24,r33 };;
1222.L_cheat_entry_point4:
1223{ .mmi; add r19=24,r34
1224
1225 ldf8 f32=[r33] }
1226
1227{ .mmi; ldf8 f120=[r34]
1228 ldf8 f121=[r17] };;
1229{ .mmi; ldf8 f122=[r18]
1230 ldf8 f123=[r19] }
1231
1232{ .mmi; ldf8 f33=[r14]
1233 ldf8 f34=[r15] }
1234{ .mfi; ldf8 f35=[r16]
1235
1236 xma.hu f41=f32,f120,f0 }
1237{ .mfi; xma.lu f40=f32,f120,f0 };;
1238{ .mfi; xma.hu f51=f32,f121,f0 }
1239{ .mfi; xma.lu f50=f32,f121,f0 };;
1240{ .mfi; xma.hu f61=f32,f122,f0 }
1241{ .mfi; xma.lu f60=f32,f122,f0 };;
1242{ .mfi; xma.hu f71=f32,f123,f0 }
1243{ .mfi; xma.lu f70=f32,f123,f0 };;//
1244// Major stall takes place here, and 3 more places below. Result from
1245// first xma is not available for another 3 ticks.
1246{ .mfi; getf.sig r16=f40
1247 xma.hu f42=f33,f120,f41
1248 add r33=8,r32 }
1249{ .mfi; xma.lu f41=f33,f120,f41 };;
1250{ .mfi; getf.sig r24=f50
1251 xma.hu f52=f33,f121,f51 }
1252{ .mfi; xma.lu f51=f33,f121,f51 };;
1253{ .mfi; st8 [r32]=r16,16
1254 xma.hu f62=f33,f122,f61 }
1255{ .mfi; xma.lu f61=f33,f122,f61 };;
1256{ .mfi; xma.hu f72=f33,f123,f71 }
1257{ .mfi; xma.lu f71=f33,f123,f71 };;//
1258//-------------------------------------------------//
1259{ .mfi; getf.sig r25=f41
1260 xma.hu f43=f34,f120,f42 }
1261{ .mfi; xma.lu f42=f34,f120,f42 };;
1262{ .mfi; getf.sig r16=f60
1263 xma.hu f53=f34,f121,f52 }
1264{ .mfi; xma.lu f52=f34,f121,f52 };;
1265{ .mfi; getf.sig r17=f51
1266 xma.hu f63=f34,f122,f62
1267 add r25=r25,r24 }
1268{ .mfi; mov carry1=0
1269 xma.lu f62=f34,f122,f62 };;
1270{ .mfi; st8 [r33]=r25,16
1271 xma.hu f73=f34,f123,f72
1272 cmp.ltu p6,p0=r25,r24 }
1273{ .mfi; xma.lu f72=f34,f123,f72 };;//
1274//-------------------------------------------------//
1275{ .mfi; getf.sig r18=f42
1276 xma.hu f44=f35,f120,f43
1277(p6) add carry1=1,carry1 }
1278{ .mfi; add r17=r17,r16
1279 xma.lu f43=f35,f120,f43
1280 mov carry2=0 };;
1281{ .mfi; getf.sig r24=f70
1282 xma.hu f54=f35,f121,f53
1283 cmp.ltu p7,p0=r17,r16 }
1284{ .mfi; xma.lu f53=f35,f121,f53 };;
1285{ .mfi; getf.sig r25=f61
1286 xma.hu f64=f35,f122,f63
1287 add r18=r18,r17 }
1288{ .mfi; xma.lu f63=f35,f122,f63
1289(p7) add carry2=1,carry2 };;
1290{ .mfi; getf.sig r26=f52
1291 xma.hu f74=f35,f123,f73
1292 cmp.ltu p7,p0=r18,r17 }
1293{ .mfi; xma.lu f73=f35,f123,f73
1294 add r18=r18,carry1 };;
1295//-------------------------------------------------//
1296{ .mii; st8 [r32]=r18,16
1297(p7) add carry2=1,carry2
1298 cmp.ltu p7,p0=r18,carry1 };;
1299
1300{ .mfi; getf.sig r27=f43 // last major stall
1301(p7) add carry2=1,carry2 };;
1302{ .mii; getf.sig r16=f71
1303 add r25=r25,r24
1304 mov carry1=0 };;
1305{ .mii; getf.sig r17=f62
1306 cmp.ltu p6,p0=r25,r24
1307 add r26=r26,r25 };;
1308{ .mii;
1309(p6) add carry1=1,carry1
1310 cmp.ltu p6,p0=r26,r25
1311 add r27=r27,r26 };;
1312{ .mii;
1313(p6) add carry1=1,carry1
1314 cmp.ltu p6,p0=r27,r26
1315 add r27=r27,carry2 };;
1316{ .mii; getf.sig r18=f53
1317(p6) add carry1=1,carry1
1318 cmp.ltu p6,p0=r27,carry2 };;
1319{ .mfi; st8 [r33]=r27,16
1320(p6) add carry1=1,carry1 }
1321
1322{ .mii; getf.sig r19=f44
1323 add r17=r17,r16
1324 mov carry2=0 };;
1325{ .mii; getf.sig r24=f72
1326 cmp.ltu p7,p0=r17,r16
1327 add r18=r18,r17 };;
1328{ .mii; (p7) add carry2=1,carry2
1329 cmp.ltu p7,p0=r18,r17
1330 add r19=r19,r18 };;
1331{ .mii; (p7) add carry2=1,carry2
1332 cmp.ltu p7,p0=r19,r18
1333 add r19=r19,carry1 };;
1334{ .mii; getf.sig r25=f63
1335 (p7) add carry2=1,carry2
1336 cmp.ltu p7,p0=r19,carry1};;
1337{ .mii; st8 [r32]=r19,16
1338 (p7) add carry2=1,carry2 }
1339
1340{ .mii; getf.sig r26=f54
1341 add r25=r25,r24
1342 mov carry1=0 };;
1343{ .mii; getf.sig r16=f73
1344 cmp.ltu p6,p0=r25,r24
1345 add r26=r26,r25 };;
1346{ .mii;
1347(p6) add carry1=1,carry1
1348 cmp.ltu p6,p0=r26,r25
1349 add r26=r26,carry2 };;
1350{ .mii; getf.sig r17=f64
1351(p6) add carry1=1,carry1
1352 cmp.ltu p6,p0=r26,carry2 };;
1353{ .mii; st8 [r33]=r26,16
1354(p6) add carry1=1,carry1 }
1355
1356{ .mii; getf.sig r24=f74
1357 add r17=r17,r16
1358 mov carry2=0 };;
1359{ .mii; cmp.ltu p7,p0=r17,r16
1360 add r17=r17,carry1 };;
1361
1362{ .mii; (p7) add carry2=1,carry2
1363 cmp.ltu p7,p0=r17,carry1};;
1364{ .mii; st8 [r32]=r17,16
1365 (p7) add carry2=1,carry2 };;
1366
1367{ .mii; add r24=r24,carry2 };;
1368{ .mii; st8 [r33]=r24 }
1369
1370{ .mib; rum 1<<5 // clear um.mfh
1371 br.ret.sptk.many b0 };;
1372.endp bn_mul_comba4#
1373#undef carry2
1374#undef carry1
1375#endif
1376
1377#if 1
1378//
1379// BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
1380//
1381// In a nutshell, it's a port of my MIPS III/IV implementation.
1382//
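// For reference: this is the usual double-word-by-single-word division
// primitive from crypto/bn, returning (BN_ULONG)((h*2^64 + l) / d),
// i.e. the quotient truncated to one word. A minimal (and slow) C
// sketch of just the semantics -- bn_div_words_ref is a hypothetical
// reference helper, not part of this module -- assuming a compiler
// with unsigned __int128:
//
//	BN_ULONG bn_div_words_ref(BN_ULONG h, BN_ULONG l, BN_ULONG d)
//	{
//		unsigned __int128 n = ((unsigned __int128)h << 64) | l;
//		return (BN_ULONG)(n / d);	/* truncates like the asm */
//	}
//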
1383#define AT r14
1384#define H r16
1385#define HH r20
1386#define L r17
1387#define D r18
1388#define DH r22
1389#define I r21
1390
1391#if 0
1392// Some preprocessors (most notably HP-UX's) appear to be allergic to
1393// macros enclosed in parentheses [as these three were].
1394#define cont p16
1395#define break p0 // p20
1396#define equ p24
1397#else
1398cont=p16
1399break=p0
1400equ=p24
1401#endif
1402
1403.global abort#
1404.global bn_div_words#
1405.proc bn_div_words#
1406.align 64
1407bn_div_words:
1408 .prologue
1409 .save ar.pfs,r2
1410{ .mii; alloc r2=ar.pfs,3,5,0,8
1411 .save b0,r3
1412 mov r3=b0
1413 .save pr,r10
1414 mov r10=pr };;
1415{ .mmb; cmp.eq p6,p0=r34,r0
1416 mov r8=-1
1417(p6) br.ret.spnt.many b0 };;
1418
1419 .body
1420{ .mii; mov H=r32 // save h
1421 mov ar.ec=0 // don't rotate at exit
1422 mov pr.rot=0 }
1423{ .mii; mov L=r33 // save l
1424 mov r36=r0 };;
1425
1426.L_divw_shift: // -vv- note signed comparison
1427{ .mfi; (p0) cmp.lt p16,p0=r0,r34 // d
1428 (p0) shladd r33=r34,1,r0 }
1429{ .mfb; (p0) add r35=1,r36
1430 (p0) nop.f 0x0
1431(p16) br.wtop.dpnt .L_divw_shift };;
1432
1433{ .mii; mov D=r34
1434 shr.u DH=r34,32
1435 sub r35=64,r36 };;
1436{ .mii; setf.sig f7=DH
1437 shr.u AT=H,r35
1438 mov I=r36 };;
1439{ .mib; cmp.ne p6,p0=r0,AT
1440 shl H=H,r36
1441(p6) br.call.spnt.clr b0=abort };; // overflow, die...
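// (The rotating-register loop above normalizes the divisor: d is
// shifted left until its sign bit is set, the step count lands in I,
// and h:l is shifted by the same amount below; if any non-zero bit of
// h would be shifted out, the quotient cannot fit in one word, hence
// the call to abort().)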
1442
1443{ .mfi; fcvt.xuf.s1 f7=f7
1444 shr.u AT=L,r35 };;
1445{ .mii; shl L=L,r36
1446 or H=H,AT };;
1447
1448{ .mii; nop.m 0x0
1449 cmp.leu p6,p0=D,H;;
1450(p6) sub H=H,D }
1451
1452{ .mlx; setf.sig f14=D
1453 movl AT=0xffffffff };;
1454///////////////////////////////////////////////////////////
1455{ .mii; setf.sig f6=H
1456 shr.u HH=H,32;;
1457 cmp.eq p6,p7=HH,DH };;
1458{ .mfb;
1459(p6) setf.sig f8=AT
1460(p7) fcvt.xuf.s1 f6=f6
1461(p7) br.call.sptk b6=.L_udiv64_32_b6 };;
1462
1463{ .mfi; getf.sig r33=f8 // q
1464 xmpy.lu f9=f8,f14 }
1465{ .mfi; xmpy.hu f10=f8,f14
1466 shrp H=H,L,32 };;
1467
1468{ .mmi; getf.sig r35=f9 // tl
1469 getf.sig r31=f10 };; // th
1470
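// The 32-bit quotient estimate in f8 may overshoot slightly: the loop
// below repeatedly decrements it (the rotating registers keep q-1 and
// tl-D ready) while the partial product th:tl still exceeds the
// current remainder; the cont/break/equ predicates drive the
// multi-word unsigned comparison.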
1471.L_divw_1st_iter:
1472{ .mii; (p0) add r32=-1,r33
1473 (p0) cmp.eq equ,cont=HH,r31 };;
1474{ .mii; (p0) cmp.ltu p8,p0=r35,D
1475 (p0) sub r34=r35,D
1476 (equ) cmp.leu break,cont=r35,H };;
1477{ .mib; (cont) cmp.leu cont,break=HH,r31
1478 (p8) add r31=-1,r31
1479(cont) br.wtop.spnt .L_divw_1st_iter };;
1480///////////////////////////////////////////////////////////
1481{ .mii; sub H=H,r35
1482 shl r8=r33,32
1483 shl L=L,32 };;
1484///////////////////////////////////////////////////////////
1485{ .mii; setf.sig f6=H
1486 shr.u HH=H,32;;
1487 cmp.eq p6,p7=HH,DH };;
1488{ .mfb;
1489(p6) setf.sig f8=AT
1490(p7) fcvt.xuf.s1 f6=f6
1491(p7) br.call.sptk b6=.L_udiv64_32_b6 };;
1492
1493{ .mfi; getf.sig r33=f8 // q
1494 xmpy.lu f9=f8,f14 }
1495{ .mfi; xmpy.hu f10=f8,f14
1496 shrp H=H,L,32 };;
1497
1498{ .mmi; getf.sig r35=f9 // tl
1499 getf.sig r31=f10 };; // th
1500
1501.L_divw_2nd_iter:
1502{ .mii; (p0) add r32=-1,r33
1503 (p0) cmp.eq equ,cont=HH,r31 };;
1504{ .mii; (p0) cmp.ltu p8,p0=r35,D
1505 (p0) sub r34=r35,D
1506 (equ) cmp.leu break,cont=r35,H };;
1507{ .mib; (cont) cmp.leu cont,break=HH,r31
1508 (p8) add r31=-1,r31
1509(cont) br.wtop.spnt .L_divw_2nd_iter };;
1510///////////////////////////////////////////////////////////
1511{ .mii; sub H=H,r35
1512 or r8=r8,r33
1513 mov ar.pfs=r2 };;
1514{ .mii; shr.u r9=H,I // remainder if anybody wants it
1515 mov pr=r10,0x1ffff }
1516{ .mfb; br.ret.sptk.many b0 };;
1517
1518// Unsigned 64 by 32 (well, by 64 for the moment) bit integer division
1519// procedure.
1520//
1521// inputs: f6 = (double)a, f7 = (double)b
1522// output: f8 = (int)(a/b)
1523// clobbered: f8,f9,f10,f11,pred
1524pred=p15
1525// One can argue that this snippet is copyrighted to Intel
1526// Corporation, as it's essentially identical to one of those
1527// found in the "Divide, Square Root and Remainder" section at
1528// http://www.intel.com/software/products/opensource/libraries/num.htm.
1529// Yes, I admit that the referred code was used as a template, but
1530// only after I realized that there hardly is any other instruction
1531// sequence which would perform this operation: any independent
1532// attempt to implement high-performance division is bound to
1533// result in code virtually identical to the Intel code. It should
1534// be noted though that the division kernel below is 1 cycle faster
1535// than Intel's (note the commented-out splits:-), not to mention
1536// the original prologue (rather the lack of one) and epilogue.
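// For readers unfamiliar with the idiom: the kernel below is a classic
// Newton-Raphson reciprocal refinement. The same dependency chain in
// C-style shorthand (a sketch only; the bracketed cycle counts in the
// comments below map onto these statements):
//
//	y0 = frcpa(b);			/* ~8-bit reciprocal estimate */
//	e0 = 1 - b*y0;		q0 = a*y0;
//	e1 = e0*e0;		q1 = q0 + e0*q0;	y1 = y0 + e0*y0;
//	q2 = q1 + e1*q1;	y2 = y1 + e1*y1;
//	r2 = a - b*q2;
//	q3 = q2 + r2*y2;	/* then q = trunc(q3) */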
1537.align 32
1538.skip 16
1539.L_udiv64_32_b6:
1540 frcpa.s1 f8,pred=f6,f7;; // [0] y0 = 1 / b
1541
1542(pred) fnma.s1 f9=f7,f8,f1 // [5] e0 = 1 - b * y0
1543(pred) fmpy.s1 f10=f6,f8;; // [5] q0 = a * y0
1544(pred) fmpy.s1 f11=f9,f9 // [10] e1 = e0 * e0
1545(pred) fma.s1 f10=f9,f10,f10;; // [10] q1 = q0 + e0 * q0
1546(pred) fma.s1 f8=f9,f8,f8 //;; // [15] y1 = y0 + e0 * y0
1547(pred) fma.s1 f9=f11,f10,f10;; // [15] q2 = q1 + e1 * q1
1548(pred) fma.s1 f8=f11,f8,f8 //;; // [20] y2 = y1 + e1 * y1
1549(pred) fnma.s1 f10=f7,f9,f6;; // [20] r2 = a - b * q2
1550(pred) fma.s1 f8=f10,f8,f9;; // [25] q3 = q2 + r2 * y2
1551
1552 fcvt.fxu.trunc.s1 f8=f8 // [30] q = trunc(q3)
1553 br.ret.sptk.many b6;;
1554.endp bn_div_words#
1555#endif
diff --git a/src/lib/libcrypto/bn/asm/mips-mont.pl b/src/lib/libcrypto/bn/asm/mips-mont.pl
deleted file mode 100644
index caae04ed3a..0000000000
--- a/src/lib/libcrypto/bn/asm/mips-mont.pl
+++ /dev/null
@@ -1,426 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# This module isn't of direct interest to OpenSSL, because it
11# doesn't provide better performance for longer keys, at least not on
12# in-order-execution cores. While 512-bit RSA sign operations can be
13# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
14# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from a
15# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
16# verify:-( All comparisons are against bn_mul_mont-free assembler.
17# The module might be of interest to embedded system developers, as
18# the code is smaller than 1KB, yet offers a >3x improvement on MIPS64
19# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
20# code.
21
22######################################################################
23# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
24# widely used. Then there is a new contender: NUBI. It appears that if
25# one picks the latter, it's possible to arrange code in an ABI-neutral
26# manner. Therefore let's stick to the NUBI register layout:
27#
28($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
29($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
30($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
31($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
32#
33# The return value is placed in $a0. The following coding rules
34# facilitate interoperability:
35#
36# - never ever touch $tp, "thread pointer", former $gp;
37# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
38# old code];
39# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
40#
41# For reference here is register layout for N32/64 MIPS ABIs:
42#
43# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
44# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
45# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
46# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
47# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
48#
49$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
50
51if ($flavour =~ /64|n32/i) {
52 $PTR_ADD="dadd"; # incidentally works even on n32
53 $PTR_SUB="dsub"; # incidentally works even on n32
54 $REG_S="sd";
55 $REG_L="ld";
56 $SZREG=8;
57} else {
58 $PTR_ADD="add";
59 $PTR_SUB="sub";
60 $REG_S="sw";
61 $REG_L="lw";
62 $SZREG=4;
63}
64$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
65#
66# <appro@openssl.org>
67#
68######################################################################
69
70while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
71open STDOUT,">$output";
72
73if ($flavour =~ /64|n32/i) {
74 $LD="ld";
75 $ST="sd";
76 $MULTU="dmultu";
77 $ADDU="daddu";
78 $SUBU="dsubu";
79 $BNSZ=8;
80} else {
81 $LD="lw";
82 $ST="sw";
83 $MULTU="multu";
84 $ADDU="addu";
85 $SUBU="subu";
86 $BNSZ=4;
87}
88
89# int bn_mul_mont(
90$rp=$a0; # BN_ULONG *rp,
91$ap=$a1; # const BN_ULONG *ap,
92$bp=$a2; # const BN_ULONG *bp,
93$np=$a3; # const BN_ULONG *np,
94$n0=$a4; # const BN_ULONG *n0,
95$num=$a5; # int num);
96
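# For orientation: the code below computes the word-serial Montgomery
# product rp[] = ap[]*bp[]*2^(-num*BN_BITS2) mod np[], with n0 being
# the precomputed -np[0]^-1 mod 2^BN_BITS2. A rough C-style sketch of
# the shape of the loops (not the actual OpenSSL reference code):
#
#	for (i = 0; i < num; i++) {		# .Louter
#		m1 = (tp[0] + ap[0]*bp[i]) * n0;  # mod 2^BN_BITS2
#		for (j = 0; j < num; j++)	# .L1st/.Linner
#			tp[] += ap[j]*bp[i] + np[j]*m1;  # low word of
#							 # tp[] cancels
#		tp[] >>= BN_BITS2;		# drop the zero word
#	}
#	if (tp >= np) tp -= np;			# .Lsub
#	copy tp to rp, wipe tp			# .Lcopy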
97$lo0=$a6;
98$hi0=$a7;
99$lo1=$t1;
100$hi1=$t2;
101$aj=$s0;
102$bi=$s1;
103$nj=$s2;
104$tp=$s3;
105$alo=$s4;
106$ahi=$s5;
107$nlo=$s6;
108$nhi=$s7;
109$tj=$s8;
110$i=$s9;
111$j=$s10;
112$m1=$s11;
113
114$FRAMESIZE=14;
115
116$code=<<___;
117.text
118
119.set noat
120.set noreorder
121
122.align 5
123.globl bn_mul_mont
124.ent bn_mul_mont
125bn_mul_mont:
126___
127$code.=<<___ if ($flavour =~ /o32/i);
128 lw $n0,16($sp)
129 lw $num,20($sp)
130___
131$code.=<<___;
132 slt $at,$num,4
133 bnez $at,1f
134 li $t0,0
135 slt $at,$num,17 # on in-order CPU
136 bnez $at,bn_mul_mont_internal
137 nop
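	# i.e. only 4 <= num <= 16 is handled by bn_mul_mont_internal;
	# anything else falls through and returns 0, so the caller can
	# use the generic (non-assembler) Montgomery code path instead.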
1381: jr $ra
139 li $a0,0
140.end bn_mul_mont
141
142.align 5
143.ent bn_mul_mont_internal
144bn_mul_mont_internal:
145 .frame $fp,$FRAMESIZE*$SZREG,$ra
146 .mask 0x40000000|$SAVED_REGS_MASK,-$SZREG
147 $PTR_SUB $sp,$FRAMESIZE*$SZREG
148 $REG_S $fp,($FRAMESIZE-1)*$SZREG($sp)
149 $REG_S $s11,($FRAMESIZE-2)*$SZREG($sp)
150 $REG_S $s10,($FRAMESIZE-3)*$SZREG($sp)
151 $REG_S $s9,($FRAMESIZE-4)*$SZREG($sp)
152 $REG_S $s8,($FRAMESIZE-5)*$SZREG($sp)
153 $REG_S $s7,($FRAMESIZE-6)*$SZREG($sp)
154 $REG_S $s6,($FRAMESIZE-7)*$SZREG($sp)
155 $REG_S $s5,($FRAMESIZE-8)*$SZREG($sp)
156 $REG_S $s4,($FRAMESIZE-9)*$SZREG($sp)
157___
158$code.=<<___ if ($flavour =~ /nubi/i);
159 $REG_S $s3,($FRAMESIZE-10)*$SZREG($sp)
160 $REG_S $s2,($FRAMESIZE-11)*$SZREG($sp)
161 $REG_S $s1,($FRAMESIZE-12)*$SZREG($sp)
162 $REG_S $s0,($FRAMESIZE-13)*$SZREG($sp)
163___
164$code.=<<___;
165 move $fp,$sp
166
167 .set reorder
168 $LD $n0,0($n0)
169 $LD $bi,0($bp) # bp[0]
170 $LD $aj,0($ap) # ap[0]
171 $LD $nj,0($np) # np[0]
172
173 $PTR_SUB $sp,2*$BNSZ # place for two extra words
174 sll $num,`log($BNSZ)/log(2)`
175 li $at,-4096
176 $PTR_SUB $sp,$num
177 and $sp,$at
178
179 $MULTU $aj,$bi
180 $LD $alo,$BNSZ($ap)
181 $LD $nlo,$BNSZ($np)
182 mflo $lo0
183 mfhi $hi0
184 $MULTU $lo0,$n0
185 mflo $m1
186
187 $MULTU $alo,$bi
188 mflo $alo
189 mfhi $ahi
190
191 $MULTU $nj,$m1
192 mflo $lo1
193 mfhi $hi1
194 $MULTU $nlo,$m1
195 $ADDU $lo1,$lo0
196 sltu $at,$lo1,$lo0
197 $ADDU $hi1,$at
198 mflo $nlo
199 mfhi $nhi
200
201 move $tp,$sp
202 li $j,2*$BNSZ
203.align 4
204.L1st:
205 .set noreorder
206 $PTR_ADD $aj,$ap,$j
207 $PTR_ADD $nj,$np,$j
208 $LD $aj,($aj)
209 $LD $nj,($nj)
210
211 $MULTU $aj,$bi
212 $ADDU $lo0,$alo,$hi0
213 $ADDU $lo1,$nlo,$hi1
214 sltu $at,$lo0,$hi0
215 sltu $t0,$lo1,$hi1
216 $ADDU $hi0,$ahi,$at
217 $ADDU $hi1,$nhi,$t0
218 mflo $alo
219 mfhi $ahi
220
221 $ADDU $lo1,$lo0
222 sltu $at,$lo1,$lo0
223 $MULTU $nj,$m1
224 $ADDU $hi1,$at
225 addu $j,$BNSZ
226 $ST $lo1,($tp)
227 sltu $t0,$j,$num
228 mflo $nlo
229 mfhi $nhi
230
231 bnez $t0,.L1st
232 $PTR_ADD $tp,$BNSZ
233 .set reorder
234
235 $ADDU $lo0,$alo,$hi0
236 sltu $at,$lo0,$hi0
237 $ADDU $hi0,$ahi,$at
238
239 $ADDU $lo1,$nlo,$hi1
240 sltu $t0,$lo1,$hi1
241 $ADDU $hi1,$nhi,$t0
242 $ADDU $lo1,$lo0
243 sltu $at,$lo1,$lo0
244 $ADDU $hi1,$at
245
246 $ST $lo1,($tp)
247
248 $ADDU $hi1,$hi0
249 sltu $at,$hi1,$hi0
250 $ST $hi1,$BNSZ($tp)
251 $ST $at,2*$BNSZ($tp)
252
253 li $i,$BNSZ
254.align 4
255.Louter:
256 $PTR_ADD $bi,$bp,$i
257 $LD $bi,($bi)
258 $LD $aj,($ap)
259 $LD $alo,$BNSZ($ap)
260 $LD $tj,($sp)
261
262 $MULTU $aj,$bi
263 $LD $nj,($np)
264 $LD $nlo,$BNSZ($np)
265 mflo $lo0
266 mfhi $hi0
267 $ADDU $lo0,$tj
268 $MULTU $lo0,$n0
269 sltu $at,$lo0,$tj
270 $ADDU $hi0,$at
271 mflo $m1
272
273 $MULTU $alo,$bi
274 mflo $alo
275 mfhi $ahi
276
277 $MULTU $nj,$m1
278 mflo $lo1
279 mfhi $hi1
280
281 $MULTU $nlo,$m1
282 $ADDU $lo1,$lo0
283 sltu $at,$lo1,$lo0
284 $ADDU $hi1,$at
285 mflo $nlo
286 mfhi $nhi
287
288 move $tp,$sp
289 li $j,2*$BNSZ
290 $LD $tj,$BNSZ($tp)
291.align 4
292.Linner:
293 .set noreorder
294 $PTR_ADD $aj,$ap,$j
295 $PTR_ADD $nj,$np,$j
296 $LD $aj,($aj)
297 $LD $nj,($nj)
298
299 $MULTU $aj,$bi
300 $ADDU $lo0,$alo,$hi0
301 $ADDU $lo1,$nlo,$hi1
302 sltu $at,$lo0,$hi0
303 sltu $t0,$lo1,$hi1
304 $ADDU $hi0,$ahi,$at
305 $ADDU $hi1,$nhi,$t0
306 mflo $alo
307 mfhi $ahi
308
309 $ADDU $lo0,$tj
310 addu $j,$BNSZ
311 $MULTU $nj,$m1
312 sltu $at,$lo0,$tj
313 $ADDU $lo1,$lo0
314 $ADDU $hi0,$at
315 sltu $t0,$lo1,$lo0
316 $LD $tj,2*$BNSZ($tp)
317 $ADDU $hi1,$t0
318 sltu $at,$j,$num
319 mflo $nlo
320 mfhi $nhi
321 $ST $lo1,($tp)
322 bnez $at,.Linner
323 $PTR_ADD $tp,$BNSZ
324 .set reorder
325
326 $ADDU $lo0,$alo,$hi0
327 sltu $at,$lo0,$hi0
328 $ADDU $hi0,$ahi,$at
329 $ADDU $lo0,$tj
330 sltu $t0,$lo0,$tj
331 $ADDU $hi0,$t0
332
333 $LD $tj,2*$BNSZ($tp)
334 $ADDU $lo1,$nlo,$hi1
335 sltu $at,$lo1,$hi1
336 $ADDU $hi1,$nhi,$at
337 $ADDU $lo1,$lo0
338 sltu $t0,$lo1,$lo0
339 $ADDU $hi1,$t0
340 $ST $lo1,($tp)
341
342 $ADDU $lo1,$hi1,$hi0
343 sltu $hi1,$lo1,$hi0
344 $ADDU $lo1,$tj
345 sltu $at,$lo1,$tj
346 $ADDU $hi1,$at
347 $ST $lo1,$BNSZ($tp)
348 $ST $hi1,2*$BNSZ($tp)
349
350 addu $i,$BNSZ
351 sltu $t0,$i,$num
352 bnez $t0,.Louter
353
354 .set noreorder
355 $PTR_ADD $tj,$sp,$num # &tp[num]
356 move $tp,$sp
357 move $ap,$sp
358 li $hi0,0 # clear borrow bit
359
360.align 4
361.Lsub: $LD $lo0,($tp)
362 $LD $lo1,($np)
363 $PTR_ADD $tp,$BNSZ
364 $PTR_ADD $np,$BNSZ
365 $SUBU $lo1,$lo0,$lo1 # tp[i]-np[i]
366 sgtu $at,$lo1,$lo0
367 $SUBU $lo0,$lo1,$hi0
368 sgtu $hi0,$lo0,$lo1
369 $ST $lo0,($rp)
370 or $hi0,$at
371 sltu $at,$tp,$tj
372 bnez $at,.Lsub
373 $PTR_ADD $rp,$BNSZ
374
375 $SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit
376 move $tp,$sp
377 $PTR_SUB $rp,$num # restore rp
378 not $hi1,$hi0
379
380 and $ap,$hi0,$sp
381 and $bp,$hi1,$rp
382 or $ap,$ap,$bp # ap=borrow?tp:rp
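	# Branchless select of the copy source: thanks to the invariant
	# tp < 2*np, $hi0 is either 0 or all-ones here, so masking the
	# two pointers picks one without a branch. In C terms (sketch):
	#	mask = borrow ? ~(uintptr_t)0 : 0;
	#	src  = (tp_addr & mask) | (rp_addr & ~mask);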
383
384.align 4
385.Lcopy: $LD $aj,($ap)
386 $PTR_ADD $ap,$BNSZ
387 $ST $zero,($tp)
388 $PTR_ADD $tp,$BNSZ
389 sltu $at,$tp,$tj
390 $ST $aj,($rp)
391 bnez $at,.Lcopy
392 $PTR_ADD $rp,$BNSZ
393
394 li $a0,1
395 li $t0,1
396
397 .set noreorder
398 move $sp,$fp
399 $REG_L $fp,($FRAMESIZE-1)*$SZREG($sp)
400 $REG_L $s11,($FRAMESIZE-2)*$SZREG($sp)
401 $REG_L $s10,($FRAMESIZE-3)*$SZREG($sp)
402 $REG_L $s9,($FRAMESIZE-4)*$SZREG($sp)
403 $REG_L $s8,($FRAMESIZE-5)*$SZREG($sp)
404 $REG_L $s7,($FRAMESIZE-6)*$SZREG($sp)
405 $REG_L $s6,($FRAMESIZE-7)*$SZREG($sp)
406 $REG_L $s5,($FRAMESIZE-8)*$SZREG($sp)
407 $REG_L $s4,($FRAMESIZE-9)*$SZREG($sp)
408___
409$code.=<<___ if ($flavour =~ /nubi/i);
410 $REG_L $s3,($FRAMESIZE-10)*$SZREG($sp)
411 $REG_L $s2,($FRAMESIZE-11)*$SZREG($sp)
412 $REG_L $s1,($FRAMESIZE-12)*$SZREG($sp)
413 $REG_L $s0,($FRAMESIZE-13)*$SZREG($sp)
414___
415$code.=<<___;
416 jr $ra
417 $PTR_ADD $sp,$FRAMESIZE*$SZREG
418.end bn_mul_mont_internal
419.rdata
420.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
421___
422
423$code =~ s/\`([^\`]*)\`/eval $1/gem;
424
425print $code;
426close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/mips.pl b/src/lib/libcrypto/bn/asm/mips.pl
deleted file mode 100644
index 215c9a7483..0000000000
--- a/src/lib/libcrypto/bn/asm/mips.pl
+++ /dev/null
@@ -1,2234 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project.
6#
7# Rights for redistribution and usage in source and binary forms are
8# granted according to the OpenSSL license. Warranty of any kind is
9# disclaimed.
10# ====================================================================
11
12
13# July 1999
14#
15# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
16#
17# The module is designed to work with either of the "new" MIPS ABI(5),
18# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
19# IRIX 5.x, not only because 5.x doesn't support the new ABIs, but also
20# because 5.x kernels put the R4x00 CPU into 32-bit mode and all those
21# 64-bit instructions (daddu, dmultu, etc.) found below would only
22# cause an illegal instruction exception:-(
23#
24# In addition the code depends on preprocessor flags set up by the
25# MIPSpro compiler driver (either as or cc) and therefore (probably?)
26# can't be compiled by the GNU assembler. The GNU C driver manages fine
27# though, as long as -mmips-as is specified or is the default option,
28# because then it simply invokes /usr/bin/as, which in turn takes
29# perfect care of the preprocessor definitions. Another neat feature
30# offered by the MIPSpro assembler is an optimization pass. This gave
31# me the opportunity to make the code look more regular, as all those
32# architecture-dependent instruction rescheduling details were left to
33# the assembler. Cool, huh?
34#
35# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
36# goes way over 3 times faster!
37#
38# <appro@fy.chalmers.se>
39
40# October 2010
41#
42# Adapt the module even for 32-bit ABIs and other OSes. The former was
43# achieved by mechanical replacement of 64-bit arithmetic instructions
44# such as dmultu, daddu, etc. with their 32-bit counterparts and by
45# adjusting offsets denoting multiples of BN_ULONG. The above-mentioned
46# >3x performance improvement naturally does not apply to 32-bit code
47# [because there is no instruction a 32-bit compiler can't use]; one
48# has to be content with a 40-85% improvement depending on benchmark
49# and key length, more for longer keys.
50
51$flavour = shift;
52while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
53open STDOUT,">$output";
54
55if ($flavour =~ /64|n32/i) {
56 $LD="ld";
57 $ST="sd";
58 $MULTU="dmultu";
59 $DIVU="ddivu";
60 $ADDU="daddu";
61 $SUBU="dsubu";
62 $SRL="dsrl";
63 $SLL="dsll";
64 $BNSZ=8;
65 $PTR_ADD="daddu";
66 $PTR_SUB="dsubu";
67 $SZREG=8;
68 $REG_S="sd";
69 $REG_L="ld";
70} else {
71 $LD="lw";
72 $ST="sw";
73 $MULTU="multu";
74 $DIVU="divu";
75 $ADDU="addu";
76 $SUBU="subu";
77 $SRL="srl";
78 $SLL="sll";
79 $BNSZ=4;
80 $PTR_ADD="addu";
81 $PTR_SUB="subu";
82 $SZREG=4;
83 $REG_S="sw";
84 $REG_L="lw";
85 $code=".set mips2\n";
86}
87
88# Below is N32/64 register layout used in the original module.
89#
90($zero,$at,$v0,$v1)=map("\$$_",(0..3));
91($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
92($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
93($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
94($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
95($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
96#
97# No special adaptation is required for O32. NUBI on the other hand
98# is treated by saving/restoring ($v1,$t0..$t3).
99
100$gp=$v1 if ($flavour =~ /nubi/i);
101
102$minus4=$v1;
103
104$code.=<<___;
105.rdata
106.asciiz "mips3.s, Version 1.2"
107.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
108
109.text
110.set noat
111
112.align 5
113.globl bn_mul_add_words
114.ent bn_mul_add_words
115bn_mul_add_words:
116 .set noreorder
117 bgtz $a2,bn_mul_add_words_internal
118 move $v0,$zero
119 jr $ra
120 move $a0,$v0
121.end bn_mul_add_words
122
123.align 5
124.ent bn_mul_add_words_internal
125bn_mul_add_words_internal:
126___
127$code.=<<___ if ($flavour =~ /nubi/i);
128 .frame $sp,6*$SZREG,$ra
129 .mask 0x8000f008,-$SZREG
130 .set noreorder
131 $PTR_SUB $sp,6*$SZREG
132 $REG_S $ra,5*$SZREG($sp)
133 $REG_S $t3,4*$SZREG($sp)
134 $REG_S $t2,3*$SZREG($sp)
135 $REG_S $t1,2*$SZREG($sp)
136 $REG_S $t0,1*$SZREG($sp)
137 $REG_S $gp,0*$SZREG($sp)
138___
139$code.=<<___;
140 .set reorder
141 li $minus4,-4
142 and $ta0,$a2,$minus4
143 beqz $ta0,.L_bn_mul_add_words_tail
144
145.L_bn_mul_add_words_loop:
146 $LD $t0,0($a1)
147 $MULTU $t0,$a3
148 $LD $t1,0($a0)
149 $LD $t2,$BNSZ($a1)
150 $LD $t3,$BNSZ($a0)
151 $LD $ta0,2*$BNSZ($a1)
152 $LD $ta1,2*$BNSZ($a0)
153 $ADDU $t1,$v0
154 sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit
155 # values", but it seems to work fine
156 # even on 64-bit registers.
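				# This is the carry idiom used
				# throughout the file; in C terms:
				#	s = x + y;   /* mod 2^BN_BITS2 */
				#	carry = (s < x);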
157 mflo $at
158 mfhi $t0
159 $ADDU $t1,$at
160 $ADDU $v0,$t0
161 $MULTU $t2,$a3
162 sltu $at,$t1,$at
163 $ST $t1,0($a0)
164 $ADDU $v0,$at
165
166 $LD $ta2,3*$BNSZ($a1)
167 $LD $ta3,3*$BNSZ($a0)
168 $ADDU $t3,$v0
169 sltu $v0,$t3,$v0
170 mflo $at
171 mfhi $t2
172 $ADDU $t3,$at
173 $ADDU $v0,$t2
174 $MULTU $ta0,$a3
175 sltu $at,$t3,$at
176 $ST $t3,$BNSZ($a0)
177 $ADDU $v0,$at
178
179 subu $a2,4
180 $PTR_ADD $a0,4*$BNSZ
181 $PTR_ADD $a1,4*$BNSZ
182 $ADDU $ta1,$v0
183 sltu $v0,$ta1,$v0
184 mflo $at
185 mfhi $ta0
186 $ADDU $ta1,$at
187 $ADDU $v0,$ta0
188 $MULTU $ta2,$a3
189 sltu $at,$ta1,$at
190 $ST $ta1,-2*$BNSZ($a0)
191 $ADDU $v0,$at
192
193
194 and $ta0,$a2,$minus4
195 $ADDU $ta3,$v0
196 sltu $v0,$ta3,$v0
197 mflo $at
198 mfhi $ta2
199 $ADDU $ta3,$at
200 $ADDU $v0,$ta2
201 sltu $at,$ta3,$at
202 $ST $ta3,-$BNSZ($a0)
203 .set noreorder
204 bgtz $ta0,.L_bn_mul_add_words_loop
205 $ADDU $v0,$at
206
207 beqz $a2,.L_bn_mul_add_words_return
208 nop
209
210.L_bn_mul_add_words_tail:
211 .set reorder
212 $LD $t0,0($a1)
213 $MULTU $t0,$a3
214 $LD $t1,0($a0)
215 subu $a2,1
216 $ADDU $t1,$v0
217 sltu $v0,$t1,$v0
218 mflo $at
219 mfhi $t0
220 $ADDU $t1,$at
221 $ADDU $v0,$t0
222 sltu $at,$t1,$at
223 $ST $t1,0($a0)
224 $ADDU $v0,$at
225 beqz $a2,.L_bn_mul_add_words_return
226
227 $LD $t0,$BNSZ($a1)
228 $MULTU $t0,$a3
229 $LD $t1,$BNSZ($a0)
230 subu $a2,1
231 $ADDU $t1,$v0
232 sltu $v0,$t1,$v0
233 mflo $at
234 mfhi $t0
235 $ADDU $t1,$at
236 $ADDU $v0,$t0
237 sltu $at,$t1,$at
238 $ST $t1,$BNSZ($a0)
239 $ADDU $v0,$at
240 beqz $a2,.L_bn_mul_add_words_return
241
242 $LD $t0,2*$BNSZ($a1)
243 $MULTU $t0,$a3
244 $LD $t1,2*$BNSZ($a0)
245 $ADDU $t1,$v0
246 sltu $v0,$t1,$v0
247 mflo $at
248 mfhi $t0
249 $ADDU $t1,$at
250 $ADDU $v0,$t0
251 sltu $at,$t1,$at
252 $ST $t1,2*$BNSZ($a0)
253 $ADDU $v0,$at
254
255.L_bn_mul_add_words_return:
256 .set noreorder
257___
258$code.=<<___ if ($flavour =~ /nubi/i);
259 $REG_L $t3,4*$SZREG($sp)
260 $REG_L $t2,3*$SZREG($sp)
261 $REG_L $t1,2*$SZREG($sp)
262 $REG_L $t0,1*$SZREG($sp)
263 $REG_L $gp,0*$SZREG($sp)
264 $PTR_ADD $sp,6*$SZREG
265___
266$code.=<<___;
267 jr $ra
268 move $a0,$v0
269.end bn_mul_add_words_internal
270
271.align 5
272.globl bn_mul_words
273.ent bn_mul_words
274bn_mul_words:
275 .set noreorder
276 bgtz $a2,bn_mul_words_internal
277 move $v0,$zero
278 jr $ra
279 move $a0,$v0
280.end bn_mul_words
281
282.align 5
283.ent bn_mul_words_internal
284bn_mul_words_internal:
285___
286$code.=<<___ if ($flavour =~ /nubi/i);
287 .frame $sp,6*$SZREG,$ra
288 .mask 0x8000f008,-$SZREG
289 .set noreorder
290 $PTR_SUB $sp,6*$SZREG
291 $REG_S $ra,5*$SZREG($sp)
292 $REG_S $t3,4*$SZREG($sp)
293 $REG_S $t2,3*$SZREG($sp)
294 $REG_S $t1,2*$SZREG($sp)
295 $REG_S $t0,1*$SZREG($sp)
296 $REG_S $gp,0*$SZREG($sp)
297___
298$code.=<<___;
299 .set reorder
300 li $minus4,-4
301 and $ta0,$a2,$minus4
302 beqz $ta0,.L_bn_mul_words_tail
303
304.L_bn_mul_words_loop:
305 $LD $t0,0($a1)
306 $MULTU $t0,$a3
307 $LD $t2,$BNSZ($a1)
308 $LD $ta0,2*$BNSZ($a1)
309 $LD $ta2,3*$BNSZ($a1)
310 mflo $at
311 mfhi $t0
312 $ADDU $v0,$at
313 sltu $t1,$v0,$at
314 $MULTU $t2,$a3
315 $ST $v0,0($a0)
316 $ADDU $v0,$t1,$t0
317
318 subu $a2,4
319 $PTR_ADD $a0,4*$BNSZ
320 $PTR_ADD $a1,4*$BNSZ
321 mflo $at
322 mfhi $t2
323 $ADDU $v0,$at
324 sltu $t3,$v0,$at
325 $MULTU $ta0,$a3
326 $ST $v0,-3*$BNSZ($a0)
327 $ADDU $v0,$t3,$t2
328
329 mflo $at
330 mfhi $ta0
331 $ADDU $v0,$at
332 sltu $ta1,$v0,$at
333 $MULTU $ta2,$a3
334 $ST $v0,-2*$BNSZ($a0)
335 $ADDU $v0,$ta1,$ta0
336
337 and $ta0,$a2,$minus4
338 mflo $at
339 mfhi $ta2
340 $ADDU $v0,$at
341 sltu $ta3,$v0,$at
342 $ST $v0,-$BNSZ($a0)
343 .set noreorder
344 bgtz $ta0,.L_bn_mul_words_loop
345 $ADDU $v0,$ta3,$ta2
346
347 beqz $a2,.L_bn_mul_words_return
348 nop
349
350.L_bn_mul_words_tail:
351 .set reorder
352 $LD $t0,0($a1)
353 $MULTU $t0,$a3
354 subu $a2,1
355 mflo $at
356 mfhi $t0
357 $ADDU $v0,$at
358 sltu $t1,$v0,$at
359 $ST $v0,0($a0)
360 $ADDU $v0,$t1,$t0
361 beqz $a2,.L_bn_mul_words_return
362
363 $LD $t0,$BNSZ($a1)
364 $MULTU $t0,$a3
365 subu $a2,1
366 mflo $at
367 mfhi $t0
368 $ADDU $v0,$at
369 sltu $t1,$v0,$at
370 $ST $v0,$BNSZ($a0)
371 $ADDU $v0,$t1,$t0
372 beqz $a2,.L_bn_mul_words_return
373
374 $LD $t0,2*$BNSZ($a1)
375 $MULTU $t0,$a3
376 mflo $at
377 mfhi $t0
378 $ADDU $v0,$at
379 sltu $t1,$v0,$at
380 $ST $v0,2*$BNSZ($a0)
381 $ADDU $v0,$t1,$t0
382
383.L_bn_mul_words_return:
384 .set noreorder
385___
386$code.=<<___ if ($flavour =~ /nubi/i);
387 $REG_L $t3,4*$SZREG($sp)
388 $REG_L $t2,3*$SZREG($sp)
389 $REG_L $t1,2*$SZREG($sp)
390 $REG_L $t0,1*$SZREG($sp)
391 $REG_L $gp,0*$SZREG($sp)
392 $PTR_ADD $sp,6*$SZREG
393___
394$code.=<<___;
395 jr $ra
396 move $a0,$v0
397.end bn_mul_words_internal
398
399.align 5
400.globl bn_sqr_words
401.ent bn_sqr_words
402bn_sqr_words:
403 .set noreorder
404 bgtz $a2,bn_sqr_words_internal
405 move $v0,$zero
406 jr $ra
407 move $a0,$v0
408.end bn_sqr_words
409
410.align 5
411.ent bn_sqr_words_internal
412bn_sqr_words_internal:
413___
414$code.=<<___ if ($flavour =~ /nubi/i);
415 .frame $sp,6*$SZREG,$ra
416 .mask 0x8000f008,-$SZREG
417 .set noreorder
418 $PTR_SUB $sp,6*$SZREG
419 $REG_S $ra,5*$SZREG($sp)
420 $REG_S $t3,4*$SZREG($sp)
421 $REG_S $t2,3*$SZREG($sp)
422 $REG_S $t1,2*$SZREG($sp)
423 $REG_S $t0,1*$SZREG($sp)
424 $REG_S $gp,0*$SZREG($sp)
425___
426$code.=<<___;
427 .set reorder
428 li $minus4,-4
429 and $ta0,$a2,$minus4
430 beqz $ta0,.L_bn_sqr_words_tail
431
432.L_bn_sqr_words_loop:
433 $LD $t0,0($a1)
434 $MULTU $t0,$t0
435 $LD $t2,$BNSZ($a1)
436 $LD $ta0,2*$BNSZ($a1)
437 $LD $ta2,3*$BNSZ($a1)
438 mflo $t1
439 mfhi $t0
440 $ST $t1,0($a0)
441 $ST $t0,$BNSZ($a0)
442
443 $MULTU $t2,$t2
444 subu $a2,4
445 $PTR_ADD $a0,8*$BNSZ
446 $PTR_ADD $a1,4*$BNSZ
447 mflo $t3
448 mfhi $t2
449 $ST $t3,-6*$BNSZ($a0)
450 $ST $t2,-5*$BNSZ($a0)
451
452 $MULTU $ta0,$ta0
453 mflo $ta1
454 mfhi $ta0
455 $ST $ta1,-4*$BNSZ($a0)
456 $ST $ta0,-3*$BNSZ($a0)
457
458
459 $MULTU $ta2,$ta2
460 and $ta0,$a2,$minus4
461 mflo $ta3
462 mfhi $ta2
463 $ST $ta3,-2*$BNSZ($a0)
464
465 .set noreorder
466 bgtz $ta0,.L_bn_sqr_words_loop
467 $ST $ta2,-$BNSZ($a0)
468
469 beqz $a2,.L_bn_sqr_words_return
470 nop
471
472.L_bn_sqr_words_tail:
473 .set reorder
474 $LD $t0,0($a1)
475 $MULTU $t0,$t0
476 subu $a2,1
477 mflo $t1
478 mfhi $t0
479 $ST $t1,0($a0)
480 $ST $t0,$BNSZ($a0)
481 beqz $a2,.L_bn_sqr_words_return
482
483 $LD $t0,$BNSZ($a1)
484 $MULTU $t0,$t0
485 subu $a2,1
486 mflo $t1
487 mfhi $t0
488 $ST $t1,2*$BNSZ($a0)
489 $ST $t0,3*$BNSZ($a0)
490 beqz $a2,.L_bn_sqr_words_return
491
492 $LD $t0,2*$BNSZ($a1)
493 $MULTU $t0,$t0
494 mflo $t1
495 mfhi $t0
496 $ST $t1,4*$BNSZ($a0)
497 $ST $t0,5*$BNSZ($a0)
498
499.L_bn_sqr_words_return:
500 .set noreorder
501___
502$code.=<<___ if ($flavour =~ /nubi/i);
503 $REG_L $t3,4*$SZREG($sp)
504 $REG_L $t2,3*$SZREG($sp)
505 $REG_L $t1,2*$SZREG($sp)
506 $REG_L $t0,1*$SZREG($sp)
507 $REG_L $gp,0*$SZREG($sp)
508 $PTR_ADD $sp,6*$SZREG
509___
510$code.=<<___;
511 jr $ra
512 move $a0,$v0
513
514.end bn_sqr_words_internal
515
516.align 5
517.globl bn_add_words
518.ent bn_add_words
519bn_add_words:
520 .set noreorder
521 bgtz $a3,bn_add_words_internal
522 move $v0,$zero
523 jr $ra
524 move $a0,$v0
525.end bn_add_words
526
527.align 5
528.ent bn_add_words_internal
529bn_add_words_internal:
530___
531$code.=<<___ if ($flavour =~ /nubi/i);
532 .frame $sp,6*$SZREG,$ra
533 .mask 0x8000f008,-$SZREG
534 .set noreorder
535 $PTR_SUB $sp,6*$SZREG
536 $REG_S $ra,5*$SZREG($sp)
537 $REG_S $t3,4*$SZREG($sp)
538 $REG_S $t2,3*$SZREG($sp)
539 $REG_S $t1,2*$SZREG($sp)
540 $REG_S $t0,1*$SZREG($sp)
541 $REG_S $gp,0*$SZREG($sp)
542___
543$code.=<<___;
544 .set reorder
545 li $minus4,-4
546 and $at,$a3,$minus4
547 beqz $at,.L_bn_add_words_tail
548
549.L_bn_add_words_loop:
550 $LD $t0,0($a1)
551 $LD $ta0,0($a2)
552 subu $a3,4
553 $LD $t1,$BNSZ($a1)
554 and $at,$a3,$minus4
555 $LD $t2,2*$BNSZ($a1)
556 $PTR_ADD $a2,4*$BNSZ
557 $LD $t3,3*$BNSZ($a1)
558 $PTR_ADD $a0,4*$BNSZ
559 $LD $ta1,-3*$BNSZ($a2)
560 $PTR_ADD $a1,4*$BNSZ
561 $LD $ta2,-2*$BNSZ($a2)
562 $LD $ta3,-$BNSZ($a2)
563 $ADDU $ta0,$t0
564 sltu $t8,$ta0,$t0
565 $ADDU $t0,$ta0,$v0
566 sltu $v0,$t0,$ta0
567 $ST $t0,-4*$BNSZ($a0)
568 $ADDU $v0,$t8
569
570 $ADDU $ta1,$t1
571 sltu $t9,$ta1,$t1
572 $ADDU $t1,$ta1,$v0
573 sltu $v0,$t1,$ta1
574 $ST $t1,-3*$BNSZ($a0)
575 $ADDU $v0,$t9
576
577 $ADDU $ta2,$t2
578 sltu $t8,$ta2,$t2
579 $ADDU $t2,$ta2,$v0
580 sltu $v0,$t2,$ta2
581 $ST $t2,-2*$BNSZ($a0)
582 $ADDU $v0,$t8
583
584 $ADDU $ta3,$t3
585 sltu $t9,$ta3,$t3
586 $ADDU $t3,$ta3,$v0
587 sltu $v0,$t3,$ta3
588 $ST $t3,-$BNSZ($a0)
589
590 .set noreorder
591 bgtz $at,.L_bn_add_words_loop
592 $ADDU $v0,$t9
593
594 beqz $a3,.L_bn_add_words_return
595 nop
596
597.L_bn_add_words_tail:
598 .set reorder
599 $LD $t0,0($a1)
600 $LD $ta0,0($a2)
601 $ADDU $ta0,$t0
602 subu $a3,1
603 sltu $t8,$ta0,$t0
604 $ADDU $t0,$ta0,$v0
605 sltu $v0,$t0,$ta0
606 $ST $t0,0($a0)
607 $ADDU $v0,$t8
608 beqz $a3,.L_bn_add_words_return
609
610 $LD $t1,$BNSZ($a1)
611 $LD $ta1,$BNSZ($a2)
612 $ADDU $ta1,$t1
613 subu $a3,1
614 sltu $t9,$ta1,$t1
615 $ADDU $t1,$ta1,$v0
616 sltu $v0,$t1,$ta1
617 $ST $t1,$BNSZ($a0)
618 $ADDU $v0,$t9
619 beqz $a3,.L_bn_add_words_return
620
621 $LD $t2,2*$BNSZ($a1)
622 $LD $ta2,2*$BNSZ($a2)
623 $ADDU $ta2,$t2
624 sltu $t8,$ta2,$t2
625 $ADDU $t2,$ta2,$v0
626 sltu $v0,$t2,$ta2
627 $ST $t2,2*$BNSZ($a0)
628 $ADDU $v0,$t8
629
630.L_bn_add_words_return:
631 .set noreorder
632___
633$code.=<<___ if ($flavour =~ /nubi/i);
634 $REG_L $t3,4*$SZREG($sp)
635 $REG_L $t2,3*$SZREG($sp)
636 $REG_L $t1,2*$SZREG($sp)
637 $REG_L $t0,1*$SZREG($sp)
638 $REG_L $gp,0*$SZREG($sp)
639 $PTR_ADD $sp,6*$SZREG
640___
641$code.=<<___;
642 jr $ra
643 move $a0,$v0
644
645.end bn_add_words_internal
646
647.align 5
648.globl bn_sub_words
649.ent bn_sub_words
650bn_sub_words:
651 .set noreorder
652 bgtz $a3,bn_sub_words_internal
653 move $v0,$zero
654 jr $ra
655 move $a0,$zero
656.end bn_sub_words
657
658.align 5
659.ent bn_sub_words_internal
660bn_sub_words_internal:
661___
662$code.=<<___ if ($flavour =~ /nubi/i);
663 .frame $sp,6*$SZREG,$ra
664 .mask 0x8000f008,-$SZREG
665 .set noreorder
666 $PTR_SUB $sp,6*$SZREG
667 $REG_S $ra,5*$SZREG($sp)
668 $REG_S $t3,4*$SZREG($sp)
669 $REG_S $t2,3*$SZREG($sp)
670 $REG_S $t1,2*$SZREG($sp)
671 $REG_S $t0,1*$SZREG($sp)
672 $REG_S $gp,0*$SZREG($sp)
673___
674$code.=<<___;
675 .set reorder
676 li $minus4,-4
677 and $at,$a3,$minus4
678 beqz $at,.L_bn_sub_words_tail
679
680.L_bn_sub_words_loop:
681 $LD $t0,0($a1)
682 $LD $ta0,0($a2)
683 subu $a3,4
684 $LD $t1,$BNSZ($a1)
685 and $at,$a3,$minus4
686 $LD $t2,2*$BNSZ($a1)
687 $PTR_ADD $a2,4*$BNSZ
688 $LD $t3,3*$BNSZ($a1)
689 $PTR_ADD $a0,4*$BNSZ
690 $LD $ta1,-3*$BNSZ($a2)
691 $PTR_ADD $a1,4*$BNSZ
692 $LD $ta2,-2*$BNSZ($a2)
693 $LD $ta3,-$BNSZ($a2)
694 sltu $t8,$t0,$ta0
695 $SUBU $ta0,$t0,$ta0
696 $SUBU $t0,$ta0,$v0
697 sgtu $v0,$t0,$ta0
698 $ST $t0,-4*$BNSZ($a0)
699 $ADDU $v0,$t8
700
701 sltu $t9,$t1,$ta1
702 $SUBU $ta1,$t1,$ta1
703 $SUBU $t1,$ta1,$v0
704 sgtu $v0,$t1,$ta1
705 $ST $t1,-3*$BNSZ($a0)
706 $ADDU $v0,$t9
707
708
709 sltu $t8,$t2,$ta2
710 $SUBU $ta2,$t2,$ta2
711 $SUBU $t2,$ta2,$v0
712 sgtu $v0,$t2,$ta2
713 $ST $t2,-2*$BNSZ($a0)
714 $ADDU $v0,$t8
715
716 sltu $t9,$t3,$ta3
717 $SUBU $ta3,$t3,$ta3
718 $SUBU $t3,$ta3,$v0
719 sgtu $v0,$t3,$ta3
720 $ST $t3,-$BNSZ($a0)
721
722 .set noreorder
723 bgtz $at,.L_bn_sub_words_loop
724 $ADDU $v0,$t9
725
726 beqz $a3,.L_bn_sub_words_return
727 nop
728
729.L_bn_sub_words_tail:
730 .set reorder
731 $LD $t0,0($a1)
732 $LD $ta0,0($a2)
733 subu $a3,1
734 sltu $t8,$t0,$ta0
735 $SUBU $ta0,$t0,$ta0
736 $SUBU $t0,$ta0,$v0
737 sgtu $v0,$t0,$ta0
738 $ST $t0,0($a0)
739 $ADDU $v0,$t8
740 beqz $a3,.L_bn_sub_words_return
741
742 $LD $t1,$BNSZ($a1)
743 subu $a3,1
744 $LD $ta1,$BNSZ($a2)
745 sltu $t9,$t1,$ta1
746 $SUBU $ta1,$t1,$ta1
747 $SUBU $t1,$ta1,$v0
748 sgtu $v0,$t1,$ta1
749 $ST $t1,$BNSZ($a0)
750 $ADDU $v0,$t9
751 beqz $a3,.L_bn_sub_words_return
752
753 $LD $t2,2*$BNSZ($a1)
754 $LD $ta2,2*$BNSZ($a2)
755 sltu $t8,$t2,$ta2
756 $SUBU $ta2,$t2,$ta2
757 $SUBU $t2,$ta2,$v0
758 sgtu $v0,$t2,$ta2
759 $ST $t2,2*$BNSZ($a0)
760 $ADDU $v0,$t8
761
762.L_bn_sub_words_return:
763 .set noreorder
764___
765$code.=<<___ if ($flavour =~ /nubi/i);
766 $REG_L $t3,4*$SZREG($sp)
767 $REG_L $t2,3*$SZREG($sp)
768 $REG_L $t1,2*$SZREG($sp)
769 $REG_L $t0,1*$SZREG($sp)
770 $REG_L $gp,0*$SZREG($sp)
771 $PTR_ADD $sp,6*$SZREG
772___
773$code.=<<___;
774 jr $ra
775 move $a0,$v0
776.end bn_sub_words_internal
777
778.align 5
779.globl bn_div_3_words
780.ent bn_div_3_words
781bn_div_3_words:
782 .set noreorder
783	move	$a3,$a0	# we know that bn_div_words does not
784				# touch $a3, $ta2, $ta3 and preserves $a2,
785				# so we can pass two arguments and the
786				# return address in registers instead
787				# of on the stack:-)
788
789 $LD $a0,($a3)
790 move $ta2,$a1
791 bne $a0,$a2,bn_div_3_words_internal
792 $LD $a1,-$BNSZ($a3)
793 li $v0,-1
794 jr $ra
795 move $a0,$v0
796.end bn_div_3_words
797
798.align 5
799.ent bn_div_3_words_internal
800bn_div_3_words_internal:
801___
802$code.=<<___ if ($flavour =~ /nubi/i);
803 .frame $sp,6*$SZREG,$ra
804 .mask 0x8000f008,-$SZREG
805 .set noreorder
806 $PTR_SUB $sp,6*$SZREG
807 $REG_S $ra,5*$SZREG($sp)
808 $REG_S $t3,4*$SZREG($sp)
809 $REG_S $t2,3*$SZREG($sp)
810 $REG_S $t1,2*$SZREG($sp)
811 $REG_S $t0,1*$SZREG($sp)
812 $REG_S $gp,0*$SZREG($sp)
813___
814$code.=<<___;
815 .set reorder
816 move $ta3,$ra
817 bal bn_div_words_internal
818 move $ra,$ta3
819 $MULTU $ta2,$v0
820 $LD $t2,-2*$BNSZ($a3)
821 move $ta0,$zero
822 mfhi $t1
823 mflo $t0
824 sltu $t8,$t1,$a1
825.L_bn_div_3_words_inner_loop:
826 bnez $t8,.L_bn_div_3_words_inner_loop_done
827 sgeu $at,$t2,$t0
828 seq $t9,$t1,$a1
829 and $at,$t9
830 sltu $t3,$t0,$ta2
831 $ADDU $a1,$a2
832 $SUBU $t1,$t3
833 $SUBU $t0,$ta2
834 sltu $t8,$t1,$a1
835 sltu $ta0,$a1,$a2
836 or $t8,$ta0
837 .set noreorder
838 beqz $at,.L_bn_div_3_words_inner_loop
839 $SUBU $v0,1
840 $ADDU $v0,1
841 .set reorder
842.L_bn_div_3_words_inner_loop_done:
843 .set noreorder
844___
845$code.=<<___ if ($flavour =~ /nubi/i);
846 $REG_L $t3,4*$SZREG($sp)
847 $REG_L $t2,3*$SZREG($sp)
848 $REG_L $t1,2*$SZREG($sp)
849 $REG_L $t0,1*$SZREG($sp)
850 $REG_L $gp,0*$SZREG($sp)
851 $PTR_ADD $sp,6*$SZREG
852___
853$code.=<<___;
854 jr $ra
855 move $a0,$v0
856.end bn_div_3_words_internal
857
858.align 5
859.globl bn_div_words
860.ent bn_div_words
861bn_div_words:
862 .set noreorder
863 bnez $a2,bn_div_words_internal
864 li $v0,-1 # I would rather signal div-by-zero
865 # which can be done with 'break 7'
866 jr $ra
867 move $a0,$v0
868.end bn_div_words
869
870.align 5
871.ent bn_div_words_internal
872bn_div_words_internal:
873___
874$code.=<<___ if ($flavour =~ /nubi/i);
875 .frame $sp,6*$SZREG,$ra
876 .mask 0x8000f008,-$SZREG
877 .set noreorder
878 $PTR_SUB $sp,6*$SZREG
879 $REG_S $ra,5*$SZREG($sp)
880 $REG_S $t3,4*$SZREG($sp)
881 $REG_S $t2,3*$SZREG($sp)
882 $REG_S $t1,2*$SZREG($sp)
883 $REG_S $t0,1*$SZREG($sp)
884 $REG_S $gp,0*$SZREG($sp)
885___
886$code.=<<___;
887 move $v1,$zero
888 bltz $a2,.L_bn_div_words_body
889 move $t9,$v1
890 $SLL $a2,1
891 bgtz $a2,.-4
892 addu $t9,1
893
894 .set reorder
895 negu $t1,$t9
896 li $t2,-1
897 $SLL $t2,$t1
898 and $t2,$a0
899 $SRL $at,$a1,$t1
900 .set noreorder
901 beqz $t2,.+12
902 nop
903 break 6 # signal overflow
904 .set reorder
905 $SLL $a0,$t9
906 $SLL $a1,$t9
907 or $a0,$at
908___
909$QT=$ta0;
910$HH=$ta1;
911$DH=$v1;
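# The body below is schoolbook long division with each word split into
# two half-words: a quotient half is first estimated with a hardware
# divide of the high halves ($DIVU of the running remainder by $DH, or
# the half-word mask 0xff..f when the high halves are equal), then
# corrected downwards by an inner loop that rarely runs more than a
# couple of times. Rough sketch of one digit step:
#
#	qhat = (HH == DH) ? HALF_MASK : rem_hi / DH;
#	while (qhat * d > current partial remainder) qhat--;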
912$code.=<<___;
913.L_bn_div_words_body:
914 $SRL $DH,$a2,4*$BNSZ # bits
915 sgeu $at,$a0,$a2
916 .set noreorder
917 beqz $at,.+12
918 nop
919 $SUBU $a0,$a2
920 .set reorder
921
922 li $QT,-1
923 $SRL $HH,$a0,4*$BNSZ # bits
924 $SRL $QT,4*$BNSZ # q=0xffffffff
925 beq $DH,$HH,.L_bn_div_words_skip_div1
926 $DIVU $zero,$a0,$DH
927 mflo $QT
928.L_bn_div_words_skip_div1:
929 $MULTU $a2,$QT
930 $SLL $t3,$a0,4*$BNSZ # bits
931 $SRL $at,$a1,4*$BNSZ # bits
932 or $t3,$at
933 mflo $t0
934 mfhi $t1
935.L_bn_div_words_inner_loop1:
936 sltu $t2,$t3,$t0
937 seq $t8,$HH,$t1
938 sltu $at,$HH,$t1
939 and $t2,$t8
940 sltu $v0,$t0,$a2
941 or $at,$t2
942 .set noreorder
943 beqz $at,.L_bn_div_words_inner_loop1_done
944 $SUBU $t1,$v0
945 $SUBU $t0,$a2
946 b .L_bn_div_words_inner_loop1
947 $SUBU $QT,1
948 .set reorder
949.L_bn_div_words_inner_loop1_done:
950
951 $SLL $a1,4*$BNSZ # bits
952 $SUBU $a0,$t3,$t0
953 $SLL $v0,$QT,4*$BNSZ # bits
954
955 li $QT,-1
956 $SRL $HH,$a0,4*$BNSZ # bits
957 $SRL $QT,4*$BNSZ # q=0xffffffff
958 beq $DH,$HH,.L_bn_div_words_skip_div2
959 $DIVU $zero,$a0,$DH
960 mflo $QT
961.L_bn_div_words_skip_div2:
962 $MULTU $a2,$QT
963 $SLL $t3,$a0,4*$BNSZ # bits
964 $SRL $at,$a1,4*$BNSZ # bits
965 or $t3,$at
966 mflo $t0
967 mfhi $t1
968.L_bn_div_words_inner_loop2:
969 sltu $t2,$t3,$t0
970 seq $t8,$HH,$t1
971 sltu $at,$HH,$t1
972 and $t2,$t8
973 sltu $v1,$t0,$a2
974 or $at,$t2
975 .set noreorder
976 beqz $at,.L_bn_div_words_inner_loop2_done
977 $SUBU $t1,$v1
978 $SUBU $t0,$a2
979 b .L_bn_div_words_inner_loop2
980 $SUBU $QT,1
981 .set reorder
982.L_bn_div_words_inner_loop2_done:
983
984 $SUBU $a0,$t3,$t0
985 or $v0,$QT
986 $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it
987 $SRL $a2,$t9 # restore $a2
988
989 .set noreorder
990 move $a1,$v1
991___
992$code.=<<___ if ($flavour =~ /nubi/i);
993 $REG_L $t3,4*$SZREG($sp)
994 $REG_L $t2,3*$SZREG($sp)
995 $REG_L $t1,2*$SZREG($sp)
996 $REG_L $t0,1*$SZREG($sp)
997 $REG_L $gp,0*$SZREG($sp)
998 $PTR_ADD $sp,6*$SZREG
999___
1000$code.=<<___;
1001 jr $ra
1002 move $a0,$v0
1003.end bn_div_words_internal
1004___
1005undef $HH; undef $QT; undef $DH;
1006
1007($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
1008($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);
1009
1010($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
1011($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2
1012
1013($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
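# The comba routines below are straight-line expansions of crypto/bn's
# mul_add_c(a,b,c1,c2,c3) primitive: multiply one word pair and add the
# double-word product into a three-word accumulator. Each
# mflo/mfhi/sltu group implements, in C terms:
#
#	t  = (double-word)a * b;
#	c1 += lo(t);	carry = (c1 < lo(t));
#	hi  = hi(t) + carry;
#	c2 += hi;	c3 += (c2 < hi);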
1014
1015$code.=<<___;
1016
1017.align 5
1018.globl bn_mul_comba8
1019.ent bn_mul_comba8
1020bn_mul_comba8:
1021 .set noreorder
1022___
1023$code.=<<___ if ($flavour =~ /nubi/i);
1024 .frame $sp,12*$SZREG,$ra
1025 .mask 0x803ff008,-$SZREG
1026 $PTR_SUB $sp,12*$SZREG
1027 $REG_S $ra,11*$SZREG($sp)
1028 $REG_S $s5,10*$SZREG($sp)
1029 $REG_S $s4,9*$SZREG($sp)
1030 $REG_S $s3,8*$SZREG($sp)
1031 $REG_S $s2,7*$SZREG($sp)
1032 $REG_S $s1,6*$SZREG($sp)
1033 $REG_S $s0,5*$SZREG($sp)
1034 $REG_S $t3,4*$SZREG($sp)
1035 $REG_S $t2,3*$SZREG($sp)
1036 $REG_S $t1,2*$SZREG($sp)
1037 $REG_S $t0,1*$SZREG($sp)
1038 $REG_S $gp,0*$SZREG($sp)
1039___
1040$code.=<<___ if ($flavour !~ /nubi/i);
1041 .frame $sp,6*$SZREG,$ra
1042 .mask 0x003f0000,-$SZREG
1043 $PTR_SUB $sp,6*$SZREG
1044 $REG_S $s5,5*$SZREG($sp)
1045 $REG_S $s4,4*$SZREG($sp)
1046 $REG_S $s3,3*$SZREG($sp)
1047 $REG_S $s2,2*$SZREG($sp)
1048 $REG_S $s1,1*$SZREG($sp)
1049 $REG_S $s0,0*$SZREG($sp)
1050___
1051$code.=<<___;
1052
1053 .set reorder
1054	$LD	$a_0,0($a1)	# If compiled with the -mips3 option on
1055				# an R5000 box, the assembler barks on
1056				# this line with a "should not have
1057				# mult/div as last instruction in bb
1058				# (R10K bug)" warning. If anybody out
1059				# there has a clue about how to
1060				# circumvent this, do send me a note.
1061				# <appro\@fy.chalmers.se>
1062
1063 $LD $b_0,0($a2)
1064 $LD $a_1,$BNSZ($a1)
1065 $LD $a_2,2*$BNSZ($a1)
1066 $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1067 $LD $a_3,3*$BNSZ($a1)
1068 $LD $b_1,$BNSZ($a2)
1069 $LD $b_2,2*$BNSZ($a2)
1070 $LD $b_3,3*$BNSZ($a2)
1071 mflo $c_1
1072 mfhi $c_2
1073
1074 $LD $a_4,4*$BNSZ($a1)
1075 $LD $a_5,5*$BNSZ($a1)
1076 $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
1077 $LD $a_6,6*$BNSZ($a1)
1078 $LD $a_7,7*$BNSZ($a1)
1079 $LD $b_4,4*$BNSZ($a2)
1080 $LD $b_5,5*$BNSZ($a2)
1081 mflo $t_1
1082 mfhi $t_2
1083 $ADDU $c_2,$t_1
1084 sltu $at,$c_2,$t_1
1085 $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
1086 $ADDU $c_3,$t_2,$at
1087 $LD $b_6,6*$BNSZ($a2)
1088 $LD $b_7,7*$BNSZ($a2)
1089 $ST $c_1,0($a0) # r[0]=c1;
1090 mflo $t_1
1091 mfhi $t_2
1092 $ADDU $c_2,$t_1
1093 sltu $at,$c_2,$t_1
1094 $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
1095 $ADDU $t_2,$at
1096 $ADDU $c_3,$t_2
1097 sltu $c_1,$c_3,$t_2
1098 $ST $c_2,$BNSZ($a0) # r[1]=c2;
1099
1100 mflo $t_1
1101 mfhi $t_2
1102 $ADDU $c_3,$t_1
1103 sltu $at,$c_3,$t_1
1104 $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
1105 $ADDU $t_2,$at
1106 $ADDU $c_1,$t_2
1107 mflo $t_1
1108 mfhi $t_2
1109 $ADDU $c_3,$t_1
1110 sltu $at,$c_3,$t_1
1111 $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
1112 $ADDU $t_2,$at
1113 $ADDU $c_1,$t_2
1114 sltu $c_2,$c_1,$t_2
1115 mflo $t_1
1116 mfhi $t_2
1117 $ADDU $c_3,$t_1
1118 sltu $at,$c_3,$t_1
1119 $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
1120 $ADDU $t_2,$at
1121 $ADDU $c_1,$t_2
1122 sltu $at,$c_1,$t_2
1123 $ADDU $c_2,$at
1124 $ST $c_3,2*$BNSZ($a0) # r[2]=c3;
1125
1126 mflo $t_1
1127 mfhi $t_2
1128 $ADDU $c_1,$t_1
1129 sltu $at,$c_1,$t_1
1130 $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
1131 $ADDU $t_2,$at
1132 $ADDU $c_2,$t_2
1133 sltu $c_3,$c_2,$t_2
1134 mflo $t_1
1135 mfhi $t_2
1136 $ADDU $c_1,$t_1
1137 sltu $at,$c_1,$t_1
1138 $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
1139 $ADDU $t_2,$at
1140 $ADDU $c_2,$t_2
1141 sltu $at,$c_2,$t_2
1142 $ADDU $c_3,$at
1143 mflo $t_1
1144 mfhi $t_2
1145 $ADDU $c_1,$t_1
1146 sltu $at,$c_1,$t_1
1147 $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
1148 $ADDU $t_2,$at
1149 $ADDU $c_2,$t_2
1150 sltu $at,$c_2,$t_2
1151 $ADDU $c_3,$at
1152 mflo $t_1
1153 mfhi $t_2
1154 $ADDU $c_1,$t_1
1155 sltu $at,$c_1,$t_1
1156 $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1);
1157 $ADDU $t_2,$at
1158 $ADDU $c_2,$t_2
1159 sltu $at,$c_2,$t_2
1160 $ADDU $c_3,$at
1161 $ST $c_1,3*$BNSZ($a0) # r[3]=c1;
1162
1163 mflo $t_1
1164 mfhi $t_2
1165 $ADDU $c_2,$t_1
1166 sltu $at,$c_2,$t_1
1167 $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
1168 $ADDU $t_2,$at
1169 $ADDU $c_3,$t_2
1170 sltu $c_1,$c_3,$t_2
1171 mflo $t_1
1172 mfhi $t_2
1173 $ADDU $c_2,$t_1
1174 sltu $at,$c_2,$t_1
1175 $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
1176 $ADDU $t_2,$at
1177 $ADDU $c_3,$t_2
1178 sltu $at,$c_3,$t_2
1179 $ADDU $c_1,$at
1180 mflo $t_1
1181 mfhi $t_2
1182 $ADDU $c_2,$t_1
1183 sltu $at,$c_2,$t_1
1184 $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
1185 $ADDU $t_2,$at
1186 $ADDU $c_3,$t_2
1187 sltu $at,$c_3,$t_2
1188 $ADDU $c_1,$at
1189 mflo $t_1
1190 mfhi $t_2
1191 $ADDU $c_2,$t_1
1192 sltu $at,$c_2,$t_1
1193 $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1);
1194 $ADDU $t_2,$at
1195 $ADDU $c_3,$t_2
1196 sltu $at,$c_3,$t_2
1197 $ADDU $c_1,$at
1198 mflo $t_1
1199 mfhi $t_2
1200 $ADDU $c_2,$t_1
1201 sltu $at,$c_2,$t_1
1202 $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2);
1203 $ADDU $t_2,$at
1204 $ADDU $c_3,$t_2
1205 sltu $at,$c_3,$t_2
1206 $ADDU $c_1,$at
1207 $ST $c_2,4*$BNSZ($a0) # r[4]=c2;
1208
1209 mflo $t_1
1210 mfhi $t_2
1211 $ADDU $c_3,$t_1
1212 sltu $at,$c_3,$t_1
1213 $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2);
1214 $ADDU $t_2,$at
1215 $ADDU $c_1,$t_2
1216 sltu $c_2,$c_1,$t_2
1217 mflo $t_1
1218 mfhi $t_2
1219 $ADDU $c_3,$t_1
1220 sltu $at,$c_3,$t_1
1221 $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
1222 $ADDU $t_2,$at
1223 $ADDU $c_1,$t_2
1224 sltu $at,$c_1,$t_2
1225 $ADDU $c_2,$at
1226 mflo $t_1
1227 mfhi $t_2
1228 $ADDU $c_3,$t_1
1229 sltu $at,$c_3,$t_1
1230 $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
1231 $ADDU $t_2,$at
1232 $ADDU $c_1,$t_2
1233 sltu $at,$c_1,$t_2
1234 $ADDU $c_2,$at
1235 mflo $t_1
1236 mfhi $t_2
1237 $ADDU $c_3,$t_1
1238 sltu $at,$c_3,$t_1
1239 $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2);
1240 $ADDU $t_2,$at
1241 $ADDU $c_1,$t_2
1242 sltu $at,$c_1,$t_2
1243 $ADDU $c_2,$at
1244 mflo $t_1
1245 mfhi $t_2
1246 $ADDU $c_3,$t_1
1247 sltu $at,$c_3,$t_1
1248 $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2);
1249 $ADDU $t_2,$at
1250 $ADDU $c_1,$t_2
1251 sltu $at,$c_1,$t_2
1252 $ADDU $c_2,$at
1253 mflo $t_1
1254 mfhi $t_2
1255 $ADDU $c_3,$t_1
1256 sltu $at,$c_3,$t_1
1257 $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3);
1258 $ADDU $t_2,$at
1259 $ADDU $c_1,$t_2
1260 sltu $at,$c_1,$t_2
1261 $ADDU $c_2,$at
1262 $ST $c_3,5*$BNSZ($a0) # r[5]=c3;
1263
1264 mflo $t_1
1265 mfhi $t_2
1266 $ADDU $c_1,$t_1
1267 sltu $at,$c_1,$t_1
1268 $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3);
1269 $ADDU $t_2,$at
1270 $ADDU $c_2,$t_2
1271 sltu $c_3,$c_2,$t_2
1272 mflo $t_1
1273 mfhi $t_2
1274 $ADDU $c_1,$t_1
1275 sltu $at,$c_1,$t_1
1276 $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3);
1277 $ADDU $t_2,$at
1278 $ADDU $c_2,$t_2
1279 sltu $at,$c_2,$t_2
1280 $ADDU $c_3,$at
1281 mflo $t_1
1282 mfhi $t_2
1283 $ADDU $c_1,$t_1
1284 sltu $at,$c_1,$t_1
1285 $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
1286 $ADDU $t_2,$at
1287 $ADDU $c_2,$t_2
1288 sltu $at,$c_2,$t_2
1289 $ADDU $c_3,$at
1290 mflo $t_1
1291 mfhi $t_2
1292 $ADDU $c_1,$t_1
1293 sltu $at,$c_1,$t_1
1294 $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3);
1295 $ADDU $t_2,$at
1296 $ADDU $c_2,$t_2
1297 sltu $at,$c_2,$t_2
1298 $ADDU $c_3,$at
1299 mflo $t_1
1300 mfhi $t_2
1301 $ADDU $c_1,$t_1
1302 sltu $at,$c_1,$t_1
1303 $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3);
1304 $ADDU $t_2,$at
1305 $ADDU $c_2,$t_2
1306 sltu $at,$c_2,$t_2
1307 $ADDU $c_3,$at
1308 mflo $t_1
1309 mfhi $t_2
1310 $ADDU $c_1,$t_1
1311 sltu $at,$c_1,$t_1
1312 $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3);
1313 $ADDU $t_2,$at
1314 $ADDU $c_2,$t_2
1315 sltu $at,$c_2,$t_2
1316 $ADDU $c_3,$at
1317 mflo $t_1
1318 mfhi $t_2
1319 $ADDU $c_1,$t_1
1320 sltu $at,$c_1,$t_1
1321 $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1);
1322 $ADDU $t_2,$at
1323 $ADDU $c_2,$t_2
1324 sltu $at,$c_2,$t_2
1325 $ADDU $c_3,$at
1326 $ST $c_1,6*$BNSZ($a0) # r[6]=c1;
1327
1328 mflo $t_1
1329 mfhi $t_2
1330 $ADDU $c_2,$t_1
1331 sltu $at,$c_2,$t_1
1332 $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1);
1333 $ADDU $t_2,$at
1334 $ADDU $c_3,$t_2
1335 sltu $c_1,$c_3,$t_2
1336 mflo $t_1
1337 mfhi $t_2
1338 $ADDU $c_2,$t_1
1339 sltu $at,$c_2,$t_1
1340 $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1);
1341 $ADDU $t_2,$at
1342 $ADDU $c_3,$t_2
1343 sltu $at,$c_3,$t_2
1344 $ADDU $c_1,$at
1345 mflo $t_1
1346 mfhi $t_2
1347 $ADDU $c_2,$t_1
1348 sltu $at,$c_2,$t_1
1349 $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1);
1350 $ADDU $t_2,$at
1351 $ADDU $c_3,$t_2
1352 sltu $at,$c_3,$t_2
1353 $ADDU $c_1,$at
1354 mflo $t_1
1355 mfhi $t_2
1356 $ADDU $c_2,$t_1
1357 sltu $at,$c_2,$t_1
1358 $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1);
1359 $ADDU $t_2,$at
1360 $ADDU $c_3,$t_2
1361 sltu $at,$c_3,$t_2
1362 $ADDU $c_1,$at
1363 mflo $t_1
1364 mfhi $t_2
1365 $ADDU $c_2,$t_1
1366 sltu $at,$c_2,$t_1
1367 $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1);
1368 $ADDU $t_2,$at
1369 $ADDU $c_3,$t_2
1370 sltu $at,$c_3,$t_2
1371 $ADDU $c_1,$at
1372 mflo $t_1
1373 mfhi $t_2
1374 $ADDU $c_2,$t_1
1375 sltu $at,$c_2,$t_1
1376 $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1);
1377 $ADDU $t_2,$at
1378 $ADDU $c_3,$t_2
1379 sltu $at,$c_3,$t_2
1380 $ADDU $c_1,$at
1381 mflo $t_1
1382 mfhi $t_2
1383 $ADDU $c_2,$t_1
1384 sltu $at,$c_2,$t_1
1385 $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1);
1386 $ADDU $t_2,$at
1387 $ADDU $c_3,$t_2
1388 sltu $at,$c_3,$t_2
1389 $ADDU $c_1,$at
1390 mflo $t_1
1391 mfhi $t_2
1392 $ADDU $c_2,$t_1
1393 sltu $at,$c_2,$t_1
1394 $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2);
1395 $ADDU $t_2,$at
1396 $ADDU $c_3,$t_2
1397 sltu $at,$c_3,$t_2
1398 $ADDU $c_1,$at
1399 $ST $c_2,7*$BNSZ($a0) # r[7]=c2;
1400
1401 mflo $t_1
1402 mfhi $t_2
1403 $ADDU $c_3,$t_1
1404 sltu $at,$c_3,$t_1
1405 $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2);
1406 $ADDU $t_2,$at
1407 $ADDU $c_1,$t_2
1408 sltu $c_2,$c_1,$t_2
1409 mflo $t_1
1410 mfhi $t_2
1411 $ADDU $c_3,$t_1
1412 sltu $at,$c_3,$t_1
1413 $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2);
1414 $ADDU $t_2,$at
1415 $ADDU $c_1,$t_2
1416 sltu $at,$c_1,$t_2
1417 $ADDU $c_2,$at
1418 mflo $t_1
1419 mfhi $t_2
1420 $ADDU $c_3,$t_1
1421 sltu $at,$c_3,$t_1
1422 $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2);
1423 $ADDU $t_2,$at
1424 $ADDU $c_1,$t_2
1425 sltu $at,$c_1,$t_2
1426 $ADDU $c_2,$at
1427 mflo $t_1
1428 mfhi $t_2
1429 $ADDU $c_3,$t_1
1430 sltu $at,$c_3,$t_1
1431 $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2);
1432 $ADDU $t_2,$at
1433 $ADDU $c_1,$t_2
1434 sltu $at,$c_1,$t_2
1435 $ADDU $c_2,$at
1436 mflo $t_1
1437 mfhi $t_2
1438 $ADDU $c_3,$t_1
1439 sltu $at,$c_3,$t_1
1440 $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2);
1441 $ADDU $t_2,$at
1442 $ADDU $c_1,$t_2
1443 sltu $at,$c_1,$t_2
1444 $ADDU $c_2,$at
1445 mflo $t_1
1446 mfhi $t_2
1447 $ADDU $c_3,$t_1
1448 sltu $at,$c_3,$t_1
1449 $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2);
1450 $ADDU $t_2,$at
1451 $ADDU $c_1,$t_2
1452 sltu $at,$c_1,$t_2
1453 $ADDU $c_2,$at
1454 mflo $t_1
1455 mfhi $t_2
1456 $ADDU $c_3,$t_1
1457 sltu $at,$c_3,$t_1
1458 $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3);
1459 $ADDU $t_2,$at
1460 $ADDU $c_1,$t_2
1461 sltu $at,$c_1,$t_2
1462 $ADDU $c_2,$at
1463 $ST $c_3,8*$BNSZ($a0) # r[8]=c3;
1464
1465 mflo $t_1
1466 mfhi $t_2
1467 $ADDU $c_1,$t_1
1468 sltu $at,$c_1,$t_1
1469 $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3);
1470 $ADDU $t_2,$at
1471 $ADDU $c_2,$t_2
1472 sltu $c_3,$c_2,$t_2
1473 mflo $t_1
1474 mfhi $t_2
1475 $ADDU $c_1,$t_1
1476 sltu $at,$c_1,$t_1
1477 $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3);
1478 $ADDU $t_2,$at
1479 $ADDU $c_2,$t_2
1480 sltu $at,$c_2,$t_2
1481 $ADDU $c_3,$at
1482 mflo $t_1
1483 mfhi $t_2
1484 $ADDU $c_1,$t_1
1485 sltu $at,$c_1,$t_1
1486 $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3);
1487 $ADDU $t_2,$at
1488 $ADDU $c_2,$t_2
1489 sltu $at,$c_2,$t_2
1490 $ADDU $c_3,$at
1491 mflo $t_1
1492 mfhi $t_2
1493 $ADDU $c_1,$t_1
1494 sltu $at,$c_1,$t_1
1495 $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3);
1496 $ADDU $t_2,$at
1497 $ADDU $c_2,$t_2
1498 sltu $at,$c_2,$t_2
1499 $ADDU $c_3,$at
1500 mflo $t_1
1501 mfhi $t_2
1502 $ADDU $c_1,$t_1
1503 sltu $at,$c_1,$t_1
1504 $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3);
1505 $ADDU $t_2,$at
1506 $ADDU $c_2,$t_2
1507 sltu $at,$c_2,$t_2
1508 $ADDU $c_3,$at
1509 mflo $t_1
1510 mfhi $t_2
1511 $ADDU $c_1,$t_1
1512 sltu $at,$c_1,$t_1
1513 $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1);
1514 $ADDU $t_2,$at
1515 $ADDU $c_2,$t_2
1516 sltu $at,$c_2,$t_2
1517 $ADDU $c_3,$at
1518 $ST $c_1,9*$BNSZ($a0) # r[9]=c1;
1519
1520 mflo $t_1
1521 mfhi $t_2
1522 $ADDU $c_2,$t_1
1523 sltu $at,$c_2,$t_1
1524 $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1);
1525 $ADDU $t_2,$at
1526 $ADDU $c_3,$t_2
1527 sltu $c_1,$c_3,$t_2
1528 mflo $t_1
1529 mfhi $t_2
1530 $ADDU $c_2,$t_1
1531 sltu $at,$c_2,$t_1
1532 $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1);
1533 $ADDU $t_2,$at
1534 $ADDU $c_3,$t_2
1535 sltu $at,$c_3,$t_2
1536 $ADDU $c_1,$at
1537 mflo $t_1
1538 mfhi $t_2
1539 $ADDU $c_2,$t_1
1540 sltu $at,$c_2,$t_1
1541 $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1);
1542 $ADDU $t_2,$at
1543 $ADDU $c_3,$t_2
1544 sltu $at,$c_3,$t_2
1545 $ADDU $c_1,$at
1546 mflo $t_1
1547 mfhi $t_2
1548 $ADDU $c_2,$t_1
1549 sltu $at,$c_2,$t_1
1550 $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1);
1551 $ADDU $t_2,$at
1552 $ADDU $c_3,$t_2
1553 sltu $at,$c_3,$t_2
1554 $ADDU $c_1,$at
1555 mflo $t_1
1556 mfhi $t_2
1557 $ADDU $c_2,$t_1
1558 sltu $at,$c_2,$t_1
1559 $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2);
1560 $ADDU $t_2,$at
1561 $ADDU $c_3,$t_2
1562 sltu $at,$c_3,$t_2
1563 $ADDU $c_1,$at
1564 $ST $c_2,10*$BNSZ($a0) # r[10]=c2;
1565
1566 mflo $t_1
1567 mfhi $t_2
1568 $ADDU $c_3,$t_1
1569 sltu $at,$c_3,$t_1
1570 $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2);
1571 $ADDU $t_2,$at
1572 $ADDU $c_1,$t_2
1573 sltu $c_2,$c_1,$t_2
1574 mflo $t_1
1575 mfhi $t_2
1576 $ADDU $c_3,$t_1
1577 sltu $at,$c_3,$t_1
1578 $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2);
1579 $ADDU $t_2,$at
1580 $ADDU $c_1,$t_2
1581 sltu $at,$c_1,$t_2
1582 $ADDU $c_2,$at
1583 mflo $t_1
1584 mfhi $t_2
1585 $ADDU $c_3,$t_1
1586 sltu $at,$c_3,$t_1
1587 $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2);
1588 $ADDU $t_2,$at
1589 $ADDU $c_1,$t_2
1590 sltu $at,$c_1,$t_2
1591 $ADDU $c_2,$at
1592 mflo $t_1
1593 mfhi $t_2
1594 $ADDU $c_3,$t_1
1595 sltu $at,$c_3,$t_1
1596 $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3);
1597 $ADDU $t_2,$at
1598 $ADDU $c_1,$t_2
1599 sltu $at,$c_1,$t_2
1600 $ADDU $c_2,$at
1601 $ST $c_3,11*$BNSZ($a0) # r[11]=c3;
1602
1603 mflo $t_1
1604 mfhi $t_2
1605 $ADDU $c_1,$t_1
1606 sltu $at,$c_1,$t_1
1607 $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3);
1608 $ADDU $t_2,$at
1609 $ADDU $c_2,$t_2
1610 sltu $c_3,$c_2,$t_2
1611 mflo $t_1
1612 mfhi $t_2
1613 $ADDU $c_1,$t_1
1614 sltu $at,$c_1,$t_1
1615 $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3);
1616 $ADDU $t_2,$at
1617 $ADDU $c_2,$t_2
1618 sltu $at,$c_2,$t_2
1619 $ADDU $c_3,$at
1620 mflo $t_1
1621 mfhi $t_2
1622 $ADDU $c_1,$t_1
1623 sltu $at,$c_1,$t_1
1624 $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1);
1625 $ADDU $t_2,$at
1626 $ADDU $c_2,$t_2
1627 sltu $at,$c_2,$t_2
1628 $ADDU $c_3,$at
1629 $ST $c_1,12*$BNSZ($a0) # r[12]=c1;
1630
1631 mflo $t_1
1632 mfhi $t_2
1633 $ADDU $c_2,$t_1
1634 sltu $at,$c_2,$t_1
1635 $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1);
1636 $ADDU $t_2,$at
1637 $ADDU $c_3,$t_2
1638 sltu $c_1,$c_3,$t_2
1639 mflo $t_1
1640 mfhi $t_2
1641 $ADDU $c_2,$t_1
1642 sltu $at,$c_2,$t_1
1643 $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2);
1644 $ADDU $t_2,$at
1645 $ADDU $c_3,$t_2
1646 sltu $at,$c_3,$t_2
1647 $ADDU $c_1,$at
1648 $ST $c_2,13*$BNSZ($a0) # r[13]=c2;
1649
1650 mflo $t_1
1651 mfhi $t_2
1652 $ADDU $c_3,$t_1
1653 sltu $at,$c_3,$t_1
1654 $ADDU $t_2,$at
1655 $ADDU $c_1,$t_2
1656 $ST $c_3,14*$BNSZ($a0) # r[14]=c3;
1657 $ST $c_1,15*$BNSZ($a0) # r[15]=c1;
1658
1659 .set noreorder
1660___
1661$code.=<<___ if ($flavour =~ /nubi/i);
1662 $REG_L $s5,10*$SZREG($sp)
1663 $REG_L $s4,9*$SZREG($sp)
1664 $REG_L $s3,8*$SZREG($sp)
1665 $REG_L $s2,7*$SZREG($sp)
1666 $REG_L $s1,6*$SZREG($sp)
1667 $REG_L $s0,5*$SZREG($sp)
1668 $REG_L $t3,4*$SZREG($sp)
1669 $REG_L $t2,3*$SZREG($sp)
1670 $REG_L $t1,2*$SZREG($sp)
1671 $REG_L $t0,1*$SZREG($sp)
1672 $REG_L $gp,0*$SZREG($sp)
1673 jr $ra
1674 $PTR_ADD $sp,12*$SZREG
1675___
1676$code.=<<___ if ($flavour !~ /nubi/i);
1677 $REG_L $s5,5*$SZREG($sp)
1678 $REG_L $s4,4*$SZREG($sp)
1679 $REG_L $s3,3*$SZREG($sp)
1680 $REG_L $s2,2*$SZREG($sp)
1681 $REG_L $s1,1*$SZREG($sp)
1682 $REG_L $s0,0*$SZREG($sp)
1683 jr $ra
1684 $PTR_ADD $sp,6*$SZREG
1685___
1686$code.=<<___;
1687.end bn_mul_comba8
1688
1689.align 5
1690.globl bn_mul_comba4
1691.ent bn_mul_comba4
1692bn_mul_comba4:
1693___
1694$code.=<<___ if ($flavour =~ /nubi/i);
1695 .frame $sp,6*$SZREG,$ra
1696 .mask 0x8000f008,-$SZREG
1697 .set noreorder
1698 $PTR_SUB $sp,6*$SZREG
1699 $REG_S $ra,5*$SZREG($sp)
1700 $REG_S $t3,4*$SZREG($sp)
1701 $REG_S $t2,3*$SZREG($sp)
1702 $REG_S $t1,2*$SZREG($sp)
1703 $REG_S $t0,1*$SZREG($sp)
1704 $REG_S $gp,0*$SZREG($sp)
1705___
1706$code.=<<___;
1707 .set reorder
1708 $LD $a_0,0($a1)
1709 $LD $b_0,0($a2)
1710 $LD $a_1,$BNSZ($a1)
1711 $LD $a_2,2*$BNSZ($a1)
1712 $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1713 $LD $a_3,3*$BNSZ($a1)
1714 $LD $b_1,$BNSZ($a2)
1715 $LD $b_2,2*$BNSZ($a2)
1716 $LD $b_3,3*$BNSZ($a2)
1717 mflo $c_1
1718 mfhi $c_2
1719 $ST $c_1,0($a0)
1720
1721 $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
1722 mflo $t_1
1723 mfhi $t_2
1724 $ADDU $c_2,$t_1
1725 sltu $at,$c_2,$t_1
1726 $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
1727 $ADDU $c_3,$t_2,$at
1728 mflo $t_1
1729 mfhi $t_2
1730 $ADDU $c_2,$t_1
1731 sltu $at,$c_2,$t_1
1732 $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
1733 $ADDU $t_2,$at
1734 $ADDU $c_3,$t_2
1735 sltu $c_1,$c_3,$t_2
1736 $ST $c_2,$BNSZ($a0)
1737
1738 mflo $t_1
1739 mfhi $t_2
1740 $ADDU $c_3,$t_1
1741 sltu $at,$c_3,$t_1
1742 $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
1743 $ADDU $t_2,$at
1744 $ADDU $c_1,$t_2
1745 mflo $t_1
1746 mfhi $t_2
1747 $ADDU $c_3,$t_1
1748 sltu $at,$c_3,$t_1
1749 $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
1750 $ADDU $t_2,$at
1751 $ADDU $c_1,$t_2
1752 sltu $c_2,$c_1,$t_2
1753 mflo $t_1
1754 mfhi $t_2
1755 $ADDU $c_3,$t_1
1756 sltu $at,$c_3,$t_1
1757 $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
1758 $ADDU $t_2,$at
1759 $ADDU $c_1,$t_2
1760 sltu $at,$c_1,$t_2
1761 $ADDU $c_2,$at
1762 $ST $c_3,2*$BNSZ($a0)
1763
1764 mflo $t_1
1765 mfhi $t_2
1766 $ADDU $c_1,$t_1
1767 sltu $at,$c_1,$t_1
1768 $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
1769 $ADDU $t_2,$at
1770 $ADDU $c_2,$t_2
1771 sltu $c_3,$c_2,$t_2
1772 mflo $t_1
1773 mfhi $t_2
1774 $ADDU $c_1,$t_1
1775 sltu $at,$c_1,$t_1
1776 $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
1777 $ADDU $t_2,$at
1778 $ADDU $c_2,$t_2
1779 sltu $at,$c_2,$t_2
1780 $ADDU $c_3,$at
1781 mflo $t_1
1782 mfhi $t_2
1783 $ADDU $c_1,$t_1
1784 sltu $at,$c_1,$t_1
1785 $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
1786 $ADDU $t_2,$at
1787 $ADDU $c_2,$t_2
1788 sltu $at,$c_2,$t_2
1789 $ADDU $c_3,$at
1790 mflo $t_1
1791 mfhi $t_2
1792 $ADDU $c_1,$t_1
1793 sltu $at,$c_1,$t_1
1794 $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
1795 $ADDU $t_2,$at
1796 $ADDU $c_2,$t_2
1797 sltu $at,$c_2,$t_2
1798 $ADDU $c_3,$at
1799 $ST $c_1,3*$BNSZ($a0)
1800
1801 mflo $t_1
1802 mfhi $t_2
1803 $ADDU $c_2,$t_1
1804 sltu $at,$c_2,$t_1
1805 $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
1806 $ADDU $t_2,$at
1807 $ADDU $c_3,$t_2
1808 sltu $c_1,$c_3,$t_2
1809 mflo $t_1
1810 mfhi $t_2
1811 $ADDU $c_2,$t_1
1812 sltu $at,$c_2,$t_1
1813 $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
1814 $ADDU $t_2,$at
1815 $ADDU $c_3,$t_2
1816 sltu $at,$c_3,$t_2
1817 $ADDU $c_1,$at
1818 mflo $t_1
1819 mfhi $t_2
1820 $ADDU $c_2,$t_1
1821 sltu $at,$c_2,$t_1
1822 $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
1823 $ADDU $t_2,$at
1824 $ADDU $c_3,$t_2
1825 sltu $at,$c_3,$t_2
1826 $ADDU $c_1,$at
1827 $ST $c_2,4*$BNSZ($a0)
1828
1829 mflo $t_1
1830 mfhi $t_2
1831 $ADDU $c_3,$t_1
1832 sltu $at,$c_3,$t_1
1833 $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
1834 $ADDU $t_2,$at
1835 $ADDU $c_1,$t_2
1836 sltu $c_2,$c_1,$t_2
1837 mflo $t_1
1838 mfhi $t_2
1839 $ADDU $c_3,$t_1
1840 sltu $at,$c_3,$t_1
1841 $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
1842 $ADDU $t_2,$at
1843 $ADDU $c_1,$t_2
1844 sltu $at,$c_1,$t_2
1845 $ADDU $c_2,$at
1846 $ST $c_3,5*$BNSZ($a0)
1847
1848 mflo $t_1
1849 mfhi $t_2
1850 $ADDU $c_1,$t_1
1851 sltu $at,$c_1,$t_1
1852 $ADDU $t_2,$at
1853 $ADDU $c_2,$t_2
1854 $ST $c_1,6*$BNSZ($a0)
1855 $ST $c_2,7*$BNSZ($a0)
1856
1857 .set noreorder
1858___
1859$code.=<<___ if ($flavour =~ /nubi/i);
1860 $REG_L $t3,4*$SZREG($sp)
1861 $REG_L $t2,3*$SZREG($sp)
1862 $REG_L $t1,2*$SZREG($sp)
1863 $REG_L $t0,1*$SZREG($sp)
1864 $REG_L $gp,0*$SZREG($sp)
1865 $PTR_ADD $sp,6*$SZREG
1866___
1867$code.=<<___;
1868 jr $ra
1869 nop
1870.end bn_mul_comba4
1871___
1872
1873($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
1874
1875sub add_c2 () {
1876my ($hi,$lo,$c0,$c1,$c2,
1877	$warm,	# !$warm denotes the first call with a given sequence of
1878		# $c_[XYZ], when there is no Z-carry to accumulate yet;
1879	$an,$bn	# these two are the arguments of the multiplication
1880		# whose result is used in the *next* step [which is why
1881		# it's commented as "forward multiplication" below];
1882 )=@_;
1883$code.=<<___;
1884 mflo $lo
1885 mfhi $hi
1886 $ADDU $c0,$lo
1887 sltu $at,$c0,$lo
1888 $MULTU $an,$bn # forward multiplication
1889 $ADDU $c0,$lo
1890 $ADDU $at,$hi
1891 sltu $lo,$c0,$lo
1892 $ADDU $c1,$at
1893 $ADDU $hi,$lo
1894___
1895$code.=<<___ if (!$warm);
1896 sltu $c2,$c1,$at
1897 $ADDU $c1,$hi
1898 sltu $hi,$c1,$hi
1899 $ADDU $c2,$hi
1900___
1901$code.=<<___ if ($warm);
1902 sltu $at,$c1,$at
1903 $ADDU $c1,$hi
1904 $ADDU $c2,$at
1905 sltu $hi,$c1,$hi
1906 $ADDU $c2,$hi
1907___
1908}
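
For reference, the carry discipline add_c2() emits matches the mul_add_c2()
convention referenced in the comments: the product a*b is accumulated twice
(hence "c2") into a rotating three-word accumulator, and the "forward"
$MULTU is issued early so the multiplier runs while the adds retire.  A
minimal C sketch of that accumulation, assuming 32-bit limbs and a 64-bit
intermediate product (the function name and types are illustrative, not
part of this script):

	#include <stdint.h>

	static void
	mul_add_c2(uint32_t a, uint32_t b,
	    uint32_t *c0, uint32_t *c1, uint32_t *c2)
	{
		uint64_t t = (uint64_t)a * b;
		int i;

		for (i = 0; i < 2; i++) {	/* the product is added twice */
			uint32_t lo = (uint32_t)t;
			uint32_t hi = (uint32_t)(t >> 32);

			*c0 += lo;
			hi  += (*c0 < lo);	/* cannot wrap: hi <= 2^32-2 */
			*c1 += hi;
			*c2 += (*c1 < hi);	/* carry into the third word */
		}
	}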
1909
1910$code.=<<___;
1911
1912.align 5
1913.globl bn_sqr_comba8
1914.ent bn_sqr_comba8
1915bn_sqr_comba8:
1916___
1917$code.=<<___ if ($flavour =~ /nubi/i);
1918 .frame $sp,6*$SZREG,$ra
1919 .mask 0x8000f008,-$SZREG
1920 .set noreorder
1921 $PTR_SUB $sp,6*$SZREG
1922 $REG_S $ra,5*$SZREG($sp)
1923 $REG_S $t3,4*$SZREG($sp)
1924 $REG_S $t2,3*$SZREG($sp)
1925 $REG_S $t1,2*$SZREG($sp)
1926 $REG_S $t0,1*$SZREG($sp)
1927 $REG_S $gp,0*$SZREG($sp)
1928___
1929$code.=<<___;
1930 .set reorder
1931 $LD $a_0,0($a1)
1932 $LD $a_1,$BNSZ($a1)
1933 $LD $a_2,2*$BNSZ($a1)
1934 $LD $a_3,3*$BNSZ($a1)
1935
1936 $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1937 $LD $a_4,4*$BNSZ($a1)
1938 $LD $a_5,5*$BNSZ($a1)
1939 $LD $a_6,6*$BNSZ($a1)
1940 $LD $a_7,7*$BNSZ($a1)
1941 mflo $c_1
1942 mfhi $c_2
1943 $ST $c_1,0($a0)
1944
1945 $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
1946 mflo $t_1
1947 mfhi $t_2
1948 slt $c_1,$t_2,$zero
1949 $SLL $t_2,1
1950 $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
1951 slt $a2,$t_1,$zero
1952 $ADDU $t_2,$a2
1953 $SLL $t_1,1
1954 $ADDU $c_2,$t_1
1955 sltu $at,$c_2,$t_1
1956 $ADDU $c_3,$t_2,$at
1957 $ST $c_2,$BNSZ($a0)
1958___
1959 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
1960 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
1961$code.=<<___;
1962 mflo $t_1
1963 mfhi $t_2
1964 $ADDU $c_3,$t_1
1965 sltu $at,$c_3,$t_1
1966 $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
1967 $ADDU $t_2,$at
1968 $ADDU $c_1,$t_2
1969 sltu $at,$c_1,$t_2
1970 $ADDU $c_2,$at
1971 $ST $c_3,2*$BNSZ($a0)
1972___
1973 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
1974 $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
1975 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
1976 $a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1);
1977$code.=<<___;
1978 $ST $c_1,3*$BNSZ($a0)
1979___
1980 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
1981 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
1982 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
1983 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
1984$code.=<<___;
1985 mflo $t_1
1986 mfhi $t_2
1987 $ADDU $c_2,$t_1
1988 sltu $at,$c_2,$t_1
1989 $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2);
1990 $ADDU $t_2,$at
1991 $ADDU $c_3,$t_2
1992 sltu $at,$c_3,$t_2
1993 $ADDU $c_1,$at
1994 $ST $c_2,4*$BNSZ($a0)
1995___
1996 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
1997 $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2);
1998 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
1999 $a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2);
2000 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2001 $a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3);
2002$code.=<<___;
2003 $ST $c_3,5*$BNSZ($a0)
2004___
2005 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2006 $a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3);
2007 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2008 $a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3);
2009 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2010 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
2011$code.=<<___;
2012 mflo $t_1
2013 mfhi $t_2
2014 $ADDU $c_1,$t_1
2015 sltu $at,$c_1,$t_1
2016 $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1);
2017 $ADDU $t_2,$at
2018 $ADDU $c_2,$t_2
2019 sltu $at,$c_2,$t_2
2020 $ADDU $c_3,$at
2021 $ST $c_1,6*$BNSZ($a0)
2022___
2023 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2024 $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1);
2025 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2026 $a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1);
2027 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2028 $a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1);
2029 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2030 $a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2);
2031$code.=<<___;
2032 $ST $c_2,7*$BNSZ($a0)
2033___
2034 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2035 $a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2);
2036 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2037 $a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2);
2038 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2039 $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2);
2040$code.=<<___;
2041 mflo $t_1
2042 mfhi $t_2
2043 $ADDU $c_3,$t_1
2044 sltu $at,$c_3,$t_1
2045 $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3);
2046 $ADDU $t_2,$at
2047 $ADDU $c_1,$t_2
2048 sltu $at,$c_1,$t_2
2049 $ADDU $c_2,$at
2050 $ST $c_3,8*$BNSZ($a0)
2051___
2052 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2053 $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3);
2054 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2055 $a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3);
2056 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2057 $a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1);
2058$code.=<<___;
2059 $ST $c_1,9*$BNSZ($a0)
2060___
2061 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2062 $a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1);
2063 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2064 $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1);
2065$code.=<<___;
2066 mflo $t_1
2067 mfhi $t_2
2068 $ADDU $c_2,$t_1
2069 sltu $at,$c_2,$t_1
2070 $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2);
2071 $ADDU $t_2,$at
2072 $ADDU $c_3,$t_2
2073 sltu $at,$c_3,$t_2
2074 $ADDU $c_1,$at
2075 $ST $c_2,10*$BNSZ($a0)
2076___
2077 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2078 $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2);
2079 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2080 $a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3);
2081$code.=<<___;
2082 $ST $c_3,11*$BNSZ($a0)
2083___
2084 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2085 $a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3);
2086$code.=<<___;
2087 mflo $t_1
2088 mfhi $t_2
2089 $ADDU $c_1,$t_1
2090 sltu $at,$c_1,$t_1
2091 $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1);
2092 $ADDU $t_2,$at
2093 $ADDU $c_2,$t_2
2094 sltu $at,$c_2,$t_2
2095 $ADDU $c_3,$at
2096 $ST $c_1,12*$BNSZ($a0)
2097___
2098 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2099 $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2);
2100$code.=<<___;
2101 $ST $c_2,13*$BNSZ($a0)
2102
2103 mflo $t_1
2104 mfhi $t_2
2105 $ADDU $c_3,$t_1
2106 sltu $at,$c_3,$t_1
2107 $ADDU $t_2,$at
2108 $ADDU $c_1,$t_2
2109 $ST $c_3,14*$BNSZ($a0)
2110 $ST $c_1,15*$BNSZ($a0)
2111
2112 .set noreorder
2113___
2114$code.=<<___ if ($flavour =~ /nubi/i);
2115 $REG_L $t3,4*$SZREG($sp)
2116 $REG_L $t2,3*$SZREG($sp)
2117 $REG_L $t1,2*$SZREG($sp)
2118 $REG_L $t0,1*$SZREG($sp)
2119 $REG_L $gp,0*$SZREG($sp)
2120 $PTR_ADD $sp,6*$SZREG
2121___
2122$code.=<<___;
2123 jr $ra
2124 nop
2125.end bn_sqr_comba8
2126
2127.align 5
2128.globl bn_sqr_comba4
2129.ent bn_sqr_comba4
2130bn_sqr_comba4:
2131___
2132$code.=<<___ if ($flavour =~ /nubi/i);
2133 .frame $sp,6*$SZREG,$ra
2134 .mask 0x8000f008,-$SZREG
2135 .set noreorder
2136 $PTR_SUB $sp,6*$SZREG
2137 $REG_S $ra,5*$SZREG($sp)
2138 $REG_S $t3,4*$SZREG($sp)
2139 $REG_S $t2,3*$SZREG($sp)
2140 $REG_S $t1,2*$SZREG($sp)
2141 $REG_S $t0,1*$SZREG($sp)
2142 $REG_S $gp,0*$SZREG($sp)
2143___
2144$code.=<<___;
2145 .set reorder
2146 $LD $a_0,0($a1)
2147 $LD $a_1,$BNSZ($a1)
2148 $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
2149 $LD $a_2,2*$BNSZ($a1)
2150 $LD $a_3,3*$BNSZ($a1)
2151 mflo $c_1
2152 mfhi $c_2
2153 $ST $c_1,0($a0)
2154
2155 $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
2156 mflo $t_1
2157 mfhi $t_2
2158 slt $c_1,$t_2,$zero
2159 $SLL $t_2,1
2160 $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
2161 slt $a2,$t_1,$zero
2162 $ADDU $t_2,$a2
2163 $SLL $t_1,1
2164 $ADDU $c_2,$t_1
2165 sltu $at,$c_2,$t_1
2166 $ADDU $c_3,$t_2,$at
2167 $ST $c_2,$BNSZ($a0)
2168___
2169 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2170 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
2171$code.=<<___;
2172 mflo $t_1
2173 mfhi $t_2
2174 $ADDU $c_3,$t_1
2175 sltu $at,$c_3,$t_1
2176 $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
2177 $ADDU $t_2,$at
2178 $ADDU $c_1,$t_2
2179 sltu $at,$c_1,$t_2
2180 $ADDU $c_2,$at
2181 $ST $c_3,2*$BNSZ($a0)
2182___
2183 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2184	$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
2185 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2186 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
2187$code.=<<___;
2188 $ST $c_1,3*$BNSZ($a0)
2189___
2190 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2191 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
2192$code.=<<___;
2193 mflo $t_1
2194 mfhi $t_2
2195 $ADDU $c_2,$t_1
2196 sltu $at,$c_2,$t_1
2197 $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2);
2198 $ADDU $t_2,$at
2199 $ADDU $c_3,$t_2
2200 sltu $at,$c_3,$t_2
2201 $ADDU $c_1,$at
2202 $ST $c_2,4*$BNSZ($a0)
2203___
2204 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2205 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
2206$code.=<<___;
2207 $ST $c_3,5*$BNSZ($a0)
2208
2209 mflo $t_1
2210 mfhi $t_2
2211 $ADDU $c_1,$t_1
2212 sltu $at,$c_1,$t_1
2213 $ADDU $t_2,$at
2214 $ADDU $c_2,$t_2
2215 $ST $c_1,6*$BNSZ($a0)
2216 $ST $c_2,7*$BNSZ($a0)
2217
2218 .set noreorder
2219___
2220$code.=<<___ if ($flavour =~ /nubi/i);
2221 $REG_L $t3,4*$SZREG($sp)
2222 $REG_L $t2,3*$SZREG($sp)
2223 $REG_L $t1,2*$SZREG($sp)
2224 $REG_L $t0,1*$SZREG($sp)
2225 $REG_L $gp,0*$SZREG($sp)
2226 $PTR_ADD $sp,6*$SZREG
2227___
2228$code.=<<___;
2229 jr $ra
2230 nop
2231.end bn_sqr_comba4
2232___
2233print $code;
2234close STDOUT;
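
Both bn_mul_comba* routines above are Comba (product-scanning) multipliers:
every column of partial products a[i]*b[j] with i+j constant is summed into
a three-register accumulator, one result word is stored per column, and the
three registers rotate roles.  A minimal C sketch of the 4x4 case in the
same column order as bn_mul_comba4, assuming 32-bit limbs and a 64-bit
intermediate product (the function and macro names are illustrative):

	#include <stdint.h>

	#define mul_add_c(a, b, c0, c1, c2) do {		\
		uint64_t t = (uint64_t)(a) * (b);		\
		uint32_t lo = (uint32_t)t;			\
		uint32_t hi = (uint32_t)(t >> 32);		\
		(c0) += lo; hi += ((c0) < lo);			\
		(c1) += hi; (c2) += ((c1) < hi);		\
	} while (0)

	static void
	mul_comba4(uint32_t r[8], const uint32_t a[4], const uint32_t b[4])
	{
		uint32_t c1 = 0, c2 = 0, c3 = 0;

		mul_add_c(a[0], b[0], c1, c2, c3); r[0] = c1; c1 = 0;
		mul_add_c(a[0], b[1], c2, c3, c1);
		mul_add_c(a[1], b[0], c2, c3, c1); r[1] = c2; c2 = 0;
		mul_add_c(a[2], b[0], c3, c1, c2);
		mul_add_c(a[1], b[1], c3, c1, c2);
		mul_add_c(a[0], b[2], c3, c1, c2); r[2] = c3; c3 = 0;
		mul_add_c(a[0], b[3], c1, c2, c3);
		mul_add_c(a[1], b[2], c1, c2, c3);
		mul_add_c(a[2], b[1], c1, c2, c3);
		mul_add_c(a[3], b[0], c1, c2, c3); r[3] = c1; c1 = 0;
		mul_add_c(a[3], b[1], c2, c3, c1);
		mul_add_c(a[2], b[2], c2, c3, c1);
		mul_add_c(a[1], b[3], c2, c3, c1); r[4] = c2; c2 = 0;
		mul_add_c(a[2], b[3], c3, c1, c2);
		mul_add_c(a[3], b[2], c3, c1, c2); r[5] = c3; c3 = 0;
		mul_add_c(a[3], b[3], c1, c2, c3); r[6] = c1;
		r[7] = c2;
	}

The zeroed register after each store is what the generated code achieves
with its interleaved sltu/$ADDU sequences; the payoff over operand
scanning is that each result word is written to memory exactly once.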
diff --git a/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl b/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl
deleted file mode 100644
index 4317282835..0000000000
--- a/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl
+++ /dev/null
@@ -1,1388 +0,0 @@
1#!/usr/bin/env perl
2#
3# Copyright (c) 2010-2011 Intel Corp.
4# Author: Vinodh.Gopal@intel.com
5# Jim Guilford
6# Erdinc.Ozturk@intel.com
7# Maxim.Perminov@intel.com
8#
9# More information about algorithm used can be found at:
10# http://www.cse.buffalo.edu/srds2009/escs2009_submission_Gopal.pdf
11#
12# ====================================================================
13# Copyright (c) 2011 The OpenSSL Project. All rights reserved.
14#
15# Redistribution and use in source and binary forms, with or without
16# modification, are permitted provided that the following conditions
17# are met:
18#
19# 1. Redistributions of source code must retain the above copyright
20# notice, this list of conditions and the following disclaimer.
21#
22# 2. Redistributions in binary form must reproduce the above copyright
23# notice, this list of conditions and the following disclaimer in
24# the documentation and/or other materials provided with the
25# distribution.
26#
27# 3. All advertising materials mentioning features or use of this
28# software must display the following acknowledgment:
29# "This product includes software developed by the OpenSSL Project
30# for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
31#
32# 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
33# endorse or promote products derived from this software without
34# prior written permission. For written permission, please contact
35# licensing@OpenSSL.org.
36#
37# 5. Products derived from this software may not be called "OpenSSL"
38# nor may "OpenSSL" appear in their names without prior written
39# permission of the OpenSSL Project.
40#
41# 6. Redistributions of any form whatsoever must retain the following
42# acknowledgment:
43# "This product includes software developed by the OpenSSL Project
44# for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
45#
46# THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
47# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
49# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
50# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
51# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
52# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
55# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
56# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
57# OF THE POSSIBILITY OF SUCH DAMAGE.
58# ====================================================================
59
60$flavour = shift;
61$output = shift;
62if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
63
64$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
65( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
66( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
67die "can't locate x86_64-xlate.pl";
68
69open OUT,"| \"$^X\" $xlate $flavour $output";
70*STDOUT=*OUT;
71
72use strict;
73my $code=".text\n\n";
74my $m=0;
75
76#
77# Define x512 macros
78#
79
80#MULSTEP_512_ADD MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src1, src2, add_src, tmp1, tmp2
81#
82# uses rax, rdx, and args
83sub MULSTEP_512_ADD
84{
85 my ($x, $DST, $SRC2, $ASRC, $OP, $TMP)=@_;
86 my @X=@$x; # make a copy
87$code.=<<___;
88 mov (+8*0)($SRC2), %rax
89 mul $OP # rdx:rax = %OP * [0]
90 mov ($ASRC), $X[0]
91 add %rax, $X[0]
92 adc \$0, %rdx
93 mov $X[0], $DST
94___
95for(my $i=1;$i<8;$i++) {
96$code.=<<___;
97 mov %rdx, $TMP
98
99 mov (+8*$i)($SRC2), %rax
100 mul $OP # rdx:rax = %OP * [$i]
101 mov (+8*$i)($ASRC), $X[$i]
102 add %rax, $X[$i]
103 adc \$0, %rdx
104 add $TMP, $X[$i]
105 adc \$0, %rdx
106___
107}
108$code.=<<___;
109 mov %rdx, $X[0]
110___
111}
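
Each expansion of MULSTEP_512_ADD is one operand-scanning row: the eight
qwords at $SRC2 are multiplied by the single qword $OP, the eight qwords
at $ASRC are added in, the lowest result word is stored to $DST, and the
upper words stay in the register window for the caller to rotate.  A C
sketch, assuming the gcc/clang unsigned __int128 extension; the function
name and prototype are illustrative:

	#include <stdint.h>

	typedef unsigned __int128 u128;

	static void
	mulstep_512_add(uint64_t X[8], uint64_t *dst,
	    const uint64_t src2[8], const uint64_t add_src[8], uint64_t op)
	{
		uint64_t carry = 0;
		int i;

		for (i = 0; i < 8; i++) {
			u128 t = (u128)src2[i] * op + add_src[i] + carry;

			X[i] = (uint64_t)t;
			carry = (uint64_t)(t >> 64);
		}
		*dst = X[0];	/* lowest word retires to memory */
		X[0] = carry;	/* high word joins the window; the caller's
				 * push(@$X,shift(@$X)) makes it the top */
	}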
112
113#MULSTEP_512 MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src2, src1_val, tmp
114#
115# uses rax, rdx, and args
116sub MULSTEP_512
117{
118 my ($x, $DST, $SRC2, $OP, $TMP)=@_;
119 my @X=@$x; # make a copy
120$code.=<<___;
121 mov (+8*0)($SRC2), %rax
122 mul $OP # rdx:rax = %OP * [0]
123 add %rax, $X[0]
124 adc \$0, %rdx
125 mov $X[0], $DST
126___
127for(my $i=1;$i<8;$i++) {
128$code.=<<___;
129 mov %rdx, $TMP
130
131 mov (+8*$i)($SRC2), %rax
132 mul $OP # rdx:rax = %OP * [$i]
133 add %rax, $X[$i]
134 adc \$0, %rdx
135 add $TMP, $X[$i]
136 adc \$0, %rdx
137___
138}
139$code.=<<___;
140 mov %rdx, $X[0]
141___
142}
143
144#
145# Swizzle Macros
146#
147
148# macro to copy data from flat space to swizzled table
149#MACRO swizzle pDst, pSrc, tmp1, tmp2
150# pDst and pSrc are modified
151sub swizzle
152{
153 my ($pDst, $pSrc, $cnt, $d0)=@_;
154$code.=<<___;
155 mov \$8, $cnt
156loop_$m:
157 mov ($pSrc), $d0
158 mov $d0#w, ($pDst)
159 shr \$16, $d0
160 mov $d0#w, (+64*1)($pDst)
161 shr \$16, $d0
162 mov $d0#w, (+64*2)($pDst)
163 shr \$16, $d0
164 mov $d0#w, (+64*3)($pDst)
165 lea 8($pSrc), $pSrc
166 lea 64*4($pDst), $pDst
167 dec $cnt
168 jnz loop_$m
169___
170
171 $m++;
172}
173
174# macro to copy data from swizzled table to flat space
175#MACRO unswizzle pDst, pSrc, tmp*3
176sub unswizzle
177{
178 my ($pDst, $pSrc, $cnt, $d0, $d1)=@_;
179$code.=<<___;
180 mov \$4, $cnt
181loop_$m:
182 movzxw (+64*3+256*0)($pSrc), $d0
183 movzxw (+64*3+256*1)($pSrc), $d1
184 shl \$16, $d0
185 shl \$16, $d1
186 mov (+64*2+256*0)($pSrc), $d0#w
187 mov (+64*2+256*1)($pSrc), $d1#w
188 shl \$16, $d0
189 shl \$16, $d1
190 mov (+64*1+256*0)($pSrc), $d0#w
191 mov (+64*1+256*1)($pSrc), $d1#w
192 shl \$16, $d0
193 shl \$16, $d1
194 mov (+64*0+256*0)($pSrc), $d0#w
195 mov (+64*0+256*1)($pSrc), $d1#w
196 mov $d0, (+8*0)($pDst)
197 mov $d1, (+8*1)($pDst)
198 lea 256*2($pSrc), $pSrc
199 lea 8*2($pDst), $pDst
200 sub \$1, $cnt
201 jnz loop_$m
202___
203
204 $m++;
205}
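
The swizzled table defeats cache-timing leaks: each of the 32 window-table
entries is stored as 16-bit slices strided 64 bytes apart, with the entry
index selecting only the 2-byte offset inside every 64-byte line, so a
lookup touches the same cache lines whichever entry is read.  A C sketch
of the same layout (the function names are illustrative; the index
arithmetic mirrors the 64*4-byte stride per qword and the 2-byte-per-entry
step used by the callers below):

	#include <stdint.h>

	static void
	swizzle_entry(uint16_t *tbl, int g, const uint64_t src[8])
	{
		int i, s;

		for (i = 0; i < 8; i++)		/* 8 qwords per entry */
			for (s = 0; s < 4; s++)	/* 4 16-bit slices per qword */
				/* 32 entries x 2 bytes fill one 64-byte line */
				tbl[(i * 4 + s) * 32 + g] =
				    (uint16_t)(src[i] >> (16 * s));
	}

	static void
	unswizzle_entry(uint64_t dst[8], const uint16_t *tbl, int g)
	{
		int i, s;

		for (i = 0; i < 8; i++) {
			uint64_t w = 0;

			for (s = 0; s < 4; s++)
				w |= (uint64_t)tbl[(i * 4 + s) * 32 + g]
				    << (16 * s);
			dst[i] = w;
		}
	}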
206
207#
208# Data Structures
209#
210
211# Reduce Data
212#
213#
214# Offset Value
215# 0C0 Carries
216# 0B8 X2[10]
217# 0B0 X2[9]
218# 0A8 X2[8]
219# 0A0 X2[7]
220# 098 X2[6]
221# 090 X2[5]
222# 088 X2[4]
223# 080 X2[3]
224# 078 X2[2]
225# 070 X2[1]
226# 068 X2[0]
227# 060 X1[12] P[10]
228# 058 X1[11] P[9] Z[8]
229# 050 X1[10] P[8] Z[7]
230# 048 X1[9] P[7] Z[6]
231# 040 X1[8] P[6] Z[5]
232# 038 X1[7] P[5] Z[4]
233# 030 X1[6] P[4] Z[3]
234# 028 X1[5] P[3] Z[2]
235# 020 X1[4] P[2] Z[1]
236# 018 X1[3] P[1] Z[0]
237# 010 X1[2] P[0] Y[2]
238# 008 X1[1] Q[1] Y[1]
239# 000 X1[0] Q[0] Y[0]
240
241my $X1_offset = 0; # 13 qwords
242my $X2_offset = $X1_offset + 13*8; # 11 qwords
243my $Carries_offset = $X2_offset + 11*8; # 1 qword
244my $Q_offset = 0; # 2 qwords
245my $P_offset = $Q_offset + 2*8; # 11 qwords
246my $Y_offset = 0; # 3 qwords
247my $Z_offset = $Y_offset + 3*8; # 9 qwords
248
249my $Red_Data_Size = $Carries_offset + 1*8; # (25 qwords)
250
251#
252# Stack Frame
253#
254#
255# offset value
256# ... <old stack contents>
257# ...
258# 280 Garray
259
260# 278 tmp16[15]
261# ... ...
262# 200 tmp16[0]
263
264# 1F8 tmp[7]
265# ... ...
266# 1C0 tmp[0]
267
268# 1B8 GT[7]
269# ... ...
270# 180 GT[0]
271
272# 178 Reduce Data
273# ... ...
274# 0B8 Reduce Data
275# 0B0 reserved
276# 0A8 reserved
277# 0A0 reserved
278# 098 reserved
279# 090 reserved
280# 088 reduce result addr
281# 080 exp[8]
282
283# ...
284# 048 exp[1]
285# 040 exp[0]
286
287# 038 reserved
288# 030 loop_idx
289# 028 pg
290# 020 i
291# 018 pData ; arg 4
292# 010 pG ; arg 2
293# 008 pResult ; arg 1
294# 000 rsp ; stack pointer before subtract
295
296my $rsp_offset = 0;
297my $pResult_offset = 8*1 + $rsp_offset;
298my $pG_offset = 8*1 + $pResult_offset;
299my $pData_offset = 8*1 + $pG_offset;
300my $i_offset = 8*1 + $pData_offset;
301my $pg_offset = 8*1 + $i_offset;
302my $loop_idx_offset = 8*1 + $pg_offset;
303my $reserved1_offset = 8*1 + $loop_idx_offset;
304my $exp_offset = 8*1 + $reserved1_offset;
305my $red_result_addr_offset= 8*9 + $exp_offset;
306my $reserved2_offset = 8*1 + $red_result_addr_offset;
307my $Reduce_Data_offset = 8*5 + $reserved2_offset;
308my $GT_offset = $Red_Data_Size + $Reduce_Data_offset;
309my $tmp_offset = 8*8 + $GT_offset;
310my $tmp16_offset = 8*8 + $tmp_offset;
311my $garray_offset = 8*16 + $tmp16_offset;
312my $mem_size = 8*8*32 + $garray_offset;
313
314#
315# Offsets within Reduce Data
316#
317#
318# struct MODF_2FOLD_MONT_512_C1_DATA {
319# UINT64 t[8][8];
320# UINT64 m[8];
321# UINT64 m1[8]; /* 2^768 % m */
322# UINT64 m2[8]; /* 2^640 % m */
323# UINT64 k1[2]; /* (- 1/m) % 2^128 */
324# };
325
326my $T = 0;
327my $M = 512; # = 8 * 8 * 8
328my $M1 = 576; # = 8 * 8 * 9 /* += 8 * 8 */
329my $M2 = 640; # = 8 * 8 * 10 /* += 8 * 8 */
330my $K1 = 704; # = 8 * 8 * 11 /* += 8 * 8 */
331
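
In terms of these constants, mont_reduce() below folds a 1024-bit input
twice before a single 128-bit Montgomery step (a hedged summary of the
scheme from the paper cited in the header; the Montgomery factor here is
2^128, which is also why mod_exp_512 enters Montgomery space by reducing
g * 2^256 once):

	\begin{align*}
	x   &= x_h\,2^{768} + x_\ell,  &  X_1 &= x_h m_1 + x_\ell \equiv x \pmod{m},\\
	X_1 &= y_h\,2^{640} + y_\ell,  &  X_2 &= y_h m_2 + y_\ell \equiv x \pmod{m},\\
	q   &= X_2 k_1 \bmod 2^{128},  &  r   &= (X_2 + q\,m)/2^{128} \equiv x\,2^{-128} \pmod{m}.
	\end{align*}

Each fold shrinks the operand (X1 fits in 769 bits, X2 in 641) at the cost
of one short multiplication by a precomputed residue, and the final
division by 2^128 is exact because q is chosen with k1 = -1/m mod 2^128.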
332#
333# FUNCTIONS
334#
335
336{{{
337#
338# MULADD_128x512 : Function to multiply 128-bits (2 qwords) by 512-bits (8 qwords)
339# and add 512-bits (8 qwords)
340# to get 640 bits (10 qwords)
341# Input: 128-bit mul source: [rdi+8*1], rbp
342# 512-bit mul source: [rsi+8*n]
343# 512-bit add source: r15, r14, ..., r9, r8
344# Output: r9, r8, r15, r14, r13, r12, r11, r10, [rcx+8*1], [rcx+8*0]
345# Clobbers all regs except: rcx, rsi, rdi
346$code.=<<___;
347.type MULADD_128x512,\@abi-omnipotent
348.align 16
349MULADD_128x512:
350___
351 &MULSTEP_512([map("%r$_",(8..15))], "(+8*0)(%rcx)", "%rsi", "%rbp", "%rbx");
352$code.=<<___;
353 mov (+8*1)(%rdi), %rbp
354___
355 &MULSTEP_512([map("%r$_",(9..15,8))], "(+8*1)(%rcx)", "%rsi", "%rbp", "%rbx");
356$code.=<<___;
357 ret
358.size MULADD_128x512,.-MULADD_128x512
359___
360}}}
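
MULADD_128x512 is simply two chained MULSTEP_512 rows, one per qword of
the 128-bit multiplier: each row retires one low word to memory and the
register window rotates.  A C sketch under the same assumptions as the
MULSTEP sketch above (unsigned __int128; the name is illustrative), which
makes the 640-bit identity B*S + X = dst[0] + 2^64*dst[1] + 2^128*X'
explicit:

	#include <stdint.h>

	typedef unsigned __int128 u128;

	static void
	muladd_128x512(uint64_t dst[2], uint64_t X[8],
	    const uint64_t B[2], const uint64_t S[8])
	{
		int row, i;

		for (row = 0; row < 2; row++) {
			uint64_t carry = 0;

			for (i = 0; i < 8; i++) {
				u128 t = (u128)S[i] * B[row] + X[i] + carry;

				X[i] = (uint64_t)t;
				carry = (uint64_t)(t >> 64);
			}
			dst[row] = X[0];	/* retire the lowest word */
			for (i = 0; i < 7; i++)	/* rotate the window down */
				X[i] = X[i + 1];
			X[7] = carry;
		}
	}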
361
362{{{
363#MULADD_256x512 MACRO pDst, pA, pB, OP, TMP, X7, X6, X5, X4, X3, X2, X1, X0
364#
365# Inputs: pDst: Destination (768 bits, 12 qwords)
366# pA: Multiplicand (1024 bits, 16 qwords)
367# pB: Multiplicand (512 bits, 8 qwords)
368# Dst = Ah * B + Al
369# where Ah is (in qwords) A[15:12] (256 bits) and Al is A[7:0] (512 bits)
370# Results in X3 X2 X1 X0 X7 X6 X5 X4 Dst[3:0]
371# Uses registers: arguments, RAX, RDX
372sub MULADD_256x512
373{
374 my ($pDst, $pA, $pB, $OP, $TMP, $X)=@_;
375$code.=<<___;
376 mov (+8*12)($pA), $OP
377___
378 &MULSTEP_512_ADD($X, "(+8*0)($pDst)", $pB, $pA, $OP, $TMP);
379 push(@$X,shift(@$X));
380
381$code.=<<___;
382 mov (+8*13)($pA), $OP
383___
384 &MULSTEP_512($X, "(+8*1)($pDst)", $pB, $OP, $TMP);
385 push(@$X,shift(@$X));
386
387$code.=<<___;
388 mov (+8*14)($pA), $OP
389___
390 &MULSTEP_512($X, "(+8*2)($pDst)", $pB, $OP, $TMP);
391 push(@$X,shift(@$X));
392
393$code.=<<___;
394 mov (+8*15)($pA), $OP
395___
396 &MULSTEP_512($X, "(+8*3)($pDst)", $pB, $OP, $TMP);
397 push(@$X,shift(@$X));
398}
399
400#
401# mont_reduce(UINT64 *x, /* 1024 bits, 16 qwords */
402# UINT64 *m, /* 512 bits, 8 qwords */
403# MODF_2FOLD_MONT_512_C1_DATA *data,
404# UINT64 *r) /* 512 bits, 8 qwords */
405# Input: x (number to be reduced): tmp16 (Implicit)
406# m (modulus): [pM] (Implicit)
407# data (reduce data): [pData] (Implicit)
408# Output: r (result): Address in [red_res_addr]
409# result also in: r9, r8, r15, r14, r13, r12, r11, r10
410
411my @X=map("%r$_",(8..15));
412
413$code.=<<___;
414.type mont_reduce,\@abi-omnipotent
415.align 16
416mont_reduce:
417___
418
419my $STACK_DEPTH = 8;
420 #
421 # X1 = Xh * M1 + Xl
422$code.=<<___;
423 lea (+$Reduce_Data_offset+$X1_offset+$STACK_DEPTH)(%rsp), %rdi # pX1 (Dst) 769 bits, 13 qwords
424 mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rsi # pM1 (Bsrc) 512 bits, 8 qwords
425 add \$$M1, %rsi
426 lea (+$tmp16_offset+$STACK_DEPTH)(%rsp), %rcx # X (Asrc) 1024 bits, 16 qwords
427
428___
429
430 &MULADD_256x512("%rdi", "%rcx", "%rsi", "%rbp", "%rbx", \@X); # rotates @X 4 times
431 # results in r11, r10, r9, r8, r15, r14, r13, r12, X1[3:0]
432
433$code.=<<___;
434 xor %rax, %rax
435 # X1 += xl
436 add (+8*8)(%rcx), $X[4]
437 adc (+8*9)(%rcx), $X[5]
438 adc (+8*10)(%rcx), $X[6]
439 adc (+8*11)(%rcx), $X[7]
440 adc \$0, %rax
441 # X1 is now rax, r11-r8, r15-r12, tmp16[3:0]
442
443 #
444 # check for carry ;; carry stored in rax
445 mov $X[4], (+8*8)(%rdi) # rdi points to X1
446 mov $X[5], (+8*9)(%rdi)
447 mov $X[6], %rbp
448 mov $X[7], (+8*11)(%rdi)
449
450 mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
451
452 mov (+8*0)(%rdi), $X[4]
453 mov (+8*1)(%rdi), $X[5]
454 mov (+8*2)(%rdi), $X[6]
455 mov (+8*3)(%rdi), $X[7]
456
457 # X1 is now stored in: X1[11], rbp, X1[9:8], r15-r8
458 # rdi -> X1
459 # rsi -> M1
460
461 #
462 # X2 = Xh * M2 + Xl
463 # do first part (X2 = Xh * M2)
464 add \$8*10, %rdi # rdi -> pXh ; 128 bits, 2 qwords
465 # Xh is actually { [rdi+8*1], rbp }
466 add \$`$M2-$M1`, %rsi # rsi -> M2
467 lea (+$Reduce_Data_offset+$X2_offset+$STACK_DEPTH)(%rsp), %rcx # rcx -> pX2 ; 641 bits, 11 qwords
468___
469 unshift(@X,pop(@X)); unshift(@X,pop(@X));
470$code.=<<___;
471
472 call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8
473 # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
474 mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rax
475
476 # X2 += Xl
477 add (+8*8-8*10)(%rdi), $X[6] # (-8*10) is to adjust rdi -> Xh to Xl
478 adc (+8*9-8*10)(%rdi), $X[7]
479 mov $X[6], (+8*8)(%rcx)
480 mov $X[7], (+8*9)(%rcx)
481
482 adc %rax, %rax
483 mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
484
485 lea (+$Reduce_Data_offset+$Q_offset+$STACK_DEPTH)(%rsp), %rdi # rdi -> pQ ; 128 bits, 2 qwords
486 add \$`$K1-$M2`, %rsi # rsi -> pK1 ; 128 bits, 2 qwords
487
488 # MUL_128x128t128 rdi, rcx, rsi ; Q = X2 * K1 (bottom half)
489 # B1:B0 = rsi[1:0] = K1[1:0]
490 # A1:A0 = rcx[1:0] = X2[1:0]
491 # Result = rdi[1],rbp = Q[1],rbp
492 mov (%rsi), %r8 # B0
493 mov (+8*1)(%rsi), %rbx # B1
494
495 mov (%rcx), %rax # A0
496 mul %r8 # B0
497 mov %rax, %rbp
498 mov %rdx, %r9
499
500 mov (+8*1)(%rcx), %rax # A1
501 mul %r8 # B0
502 add %rax, %r9
503
504 mov (%rcx), %rax # A0
505 mul %rbx # B1
506 add %rax, %r9
507
508 mov %r9, (+8*1)(%rdi)
509 # end MUL_128x128t128
510
511 sub \$`$K1-$M`, %rsi
512
513 mov (%rcx), $X[6]
514 mov (+8*1)(%rcx), $X[7] # r9:r8 = X2[1:0]
515
516 call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8
517 # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
518
519 # load first half of m to rdx, rdi, rbx, rax
520 # moved this here for efficiency
521 mov (+8*0)(%rsi), %rax
522 mov (+8*1)(%rsi), %rbx
523 mov (+8*2)(%rsi), %rdi
524 mov (+8*3)(%rsi), %rdx
525
526 # continue with reduction
527 mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rbp
528
529 add (+8*8)(%rcx), $X[6]
530 adc (+8*9)(%rcx), $X[7]
531
532	# accumulate the final carry to rbp
533 adc %rbp, %rbp
534
535 # Add in overflow corrections: R = (X2>>128) += T[overflow]
536 # R = {r9, r8, r15, r14, ..., r10}
537 shl \$3, %rbp
538	mov	(+$pData_offset+$STACK_DEPTH)(%rsp), %rcx	# rcx -> Data (and points to T)
539 add %rcx, %rbp # pT ; 512 bits, 8 qwords, spread out
540
541 # rsi will be used to generate a mask after the addition
542 xor %rsi, %rsi
543
544 add (+8*8*0)(%rbp), $X[0]
545 adc (+8*8*1)(%rbp), $X[1]
546 adc (+8*8*2)(%rbp), $X[2]
547 adc (+8*8*3)(%rbp), $X[3]
548 adc (+8*8*4)(%rbp), $X[4]
549 adc (+8*8*5)(%rbp), $X[5]
550 adc (+8*8*6)(%rbp), $X[6]
551 adc (+8*8*7)(%rbp), $X[7]
552
553 # if there is a carry: rsi = 0xFFFFFFFFFFFFFFFF
554 # if carry is clear: rsi = 0x0000000000000000
555 sbb \$0, %rsi
556
557 # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
558 and %rsi, %rax
559 and %rsi, %rbx
560 and %rsi, %rdi
561 and %rsi, %rdx
562
563 mov \$1, %rbp
564 sub %rax, $X[0]
565 sbb %rbx, $X[1]
566 sbb %rdi, $X[2]
567 sbb %rdx, $X[3]
568
569 # if there is a borrow: rbp = 0
570 # if there is no borrow: rbp = 1
571 # this is used to save the borrows in between the first half and the 2nd half of the subtraction of m
572 sbb \$0, %rbp
573
574	# load second half of m to rdx, rdi, rbx, rax
575
576 add \$$M, %rcx
577 mov (+8*4)(%rcx), %rax
578 mov (+8*5)(%rcx), %rbx
579 mov (+8*6)(%rcx), %rdi
580 mov (+8*7)(%rcx), %rdx
581
582 # use the rsi mask as before
583 # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
584 and %rsi, %rax
585 and %rsi, %rbx
586 and %rsi, %rdi
587 and %rsi, %rdx
588
589 # if rbp = 0, there was a borrow before, it is moved to the carry flag
590 # if rbp = 1, there was not a borrow before, carry flag is cleared
591 sub \$1, %rbp
592
593 sbb %rax, $X[4]
594 sbb %rbx, $X[5]
595 sbb %rdi, $X[6]
596 sbb %rdx, $X[7]
597
598 # write R back to memory
599
600 mov (+$red_result_addr_offset+$STACK_DEPTH)(%rsp), %rsi
601 mov $X[0], (+8*0)(%rsi)
602 mov $X[1], (+8*1)(%rsi)
603 mov $X[2], (+8*2)(%rsi)
604 mov $X[3], (+8*3)(%rsi)
605 mov $X[4], (+8*4)(%rsi)
606 mov $X[5], (+8*5)(%rsi)
607 mov $X[6], (+8*6)(%rsi)
608 mov $X[7], (+8*7)(%rsi)
609
610 ret
611.size mont_reduce,.-mont_reduce
612___
613}}}
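
The tail of mont_reduce() keeps the final correction branch-free: the
carry out of the T-table addition is stretched into an all-ones or
all-zeros mask with sbb, and (m AND mask) is subtracted unconditionally,
so control flow and the memory access pattern do not depend on the
secret-derived carry.  A C sketch of that masked subtraction (the name is
illustrative):

	#include <stdint.h>

	static void
	cond_sub_512(uint64_t r[8], const uint64_t m[8], uint64_t carry)
	{
		uint64_t mask = 0 - carry;	/* carry is 0 or 1 */
		uint64_t borrow = 0;
		int i;

		for (i = 0; i < 8; i++) {
			uint64_t mi = m[i] & mask;	/* m[i] or 0 */
			uint64_t t = r[i] - mi - borrow;

			borrow = (r[i] < mi) || (r[i] - mi < borrow);
			r[i] = t;
		}
	}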
614
615{{{
616#MUL_512x512 MACRO pDst, pA, pB, x7, x6, x5, x4, x3, x2, x1, x0, tmp*2
617#
618# Inputs: pDst: Destination (1024 bits, 16 qwords)
619# pA: Multiplicand (512 bits, 8 qwords)
620# pB: Multiplicand (512 bits, 8 qwords)
621# Uses registers rax, rdx, args
622# B operand in [pB] and also in x7...x0
623sub MUL_512x512
624{
625 my ($pDst, $pA, $pB, $x, $OP, $TMP, $pDst_o)=@_;
626 my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
627 my @X=@$x; # make a copy
628
629$code.=<<___;
630 mov (+8*0)($pA), $OP
631
632 mov $X[0], %rax
633 mul $OP # rdx:rax = %OP * [0]
634 mov %rax, (+$pDst_o+8*0)($pDst)
635 mov %rdx, $X[0]
636___
637for(my $i=1;$i<8;$i++) {
638$code.=<<___;
639 mov $X[$i], %rax
640 mul $OP # rdx:rax = %OP * [$i]
641 add %rax, $X[$i-1]
642 adc \$0, %rdx
643 mov %rdx, $X[$i]
644___
645}
646
647for(my $i=1;$i<8;$i++) {
648$code.=<<___;
649 mov (+8*$i)($pA), $OP
650___
651
652 &MULSTEP_512(\@X, "(+$pDst_o+8*$i)($pDst)", $pB, $OP, $TMP);
653 push(@X,shift(@X));
654}
655
656$code.=<<___;
657 mov $X[0], (+$pDst_o+8*8)($pDst)
658 mov $X[1], (+$pDst_o+8*9)($pDst)
659 mov $X[2], (+$pDst_o+8*10)($pDst)
660 mov $X[3], (+$pDst_o+8*11)($pDst)
661 mov $X[4], (+$pDst_o+8*12)($pDst)
662 mov $X[5], (+$pDst_o+8*13)($pDst)
663 mov $X[6], (+$pDst_o+8*14)($pDst)
664 mov $X[7], (+$pDst_o+8*15)($pDst)
665___
666}
667
668#
669# mont_mul_a3b : subroutine to compute (Src1 * Src2) % M (all 512-bits)
670# Input: src1: Address of source 1: rdi
671# src2: Address of source 2: rsi
672# Output: dst: Address of destination: [red_res_addr]
673# src2 and result also in: r9, r8, r15, r14, r13, r12, r11, r10
674# Temp: Clobbers [tmp16], all registers
675$code.=<<___;
676.type mont_mul_a3b,\@abi-omnipotent
677.align 16
678mont_mul_a3b:
679 #
680 # multiply tmp = src1 * src2
681 # For multiply: dst = rcx, src1 = rdi, src2 = rsi
682 # stack depth is extra 8 from call
683___
684 &MUL_512x512("%rsp+$tmp16_offset+8", "%rdi", "%rsi", [map("%r$_",(10..15,8..9))], "%rbp", "%rbx");
685$code.=<<___;
686 #
687 # Dst = tmp % m
688 # Call reduce(tmp, m, data, dst)
689
690 # tail recursion optimization: jmp to mont_reduce and return from there
691 jmp mont_reduce
692 # call mont_reduce
693 # ret
694.size mont_mul_a3b,.-mont_mul_a3b
695___
696}}}
697
698{{{
699#SQR_512 MACRO pDest, pA, x7, x6, x5, x4, x3, x2, x1, x0, tmp*4
700#
701# Input in memory [pA] and also in x7...x0
702# Uses all argument registers plus rax and rdx
703#
704# This version computes all of the off-diagonal terms into memory,
705# and then it adds in the diagonal terms
706
707sub SQR_512
708{
709 my ($pDst, $pA, $x, $A, $tmp, $x7, $x6, $pDst_o)=@_;
710 my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
711 my @X=@$x; # make a copy
712$code.=<<___;
713 # ------------------
714 # first pass 01...07
715 # ------------------
716 mov $X[0], $A
717
718 mov $X[1],%rax
719 mul $A
720 mov %rax, (+$pDst_o+8*1)($pDst)
721___
722for(my $i=2;$i<8;$i++) {
723$code.=<<___;
724 mov %rdx, $X[$i-2]
725 mov $X[$i],%rax
726 mul $A
727 add %rax, $X[$i-2]
728 adc \$0, %rdx
729___
730}
731$code.=<<___;
732 mov %rdx, $x7
733
734 mov $X[0], (+$pDst_o+8*2)($pDst)
735
736 # ------------------
737 # second pass 12...17
738 # ------------------
739
740 mov (+8*1)($pA), $A
741
742 mov (+8*2)($pA),%rax
743 mul $A
744 add %rax, $X[1]
745 adc \$0, %rdx
746 mov $X[1], (+$pDst_o+8*3)($pDst)
747
748 mov %rdx, $X[0]
749 mov (+8*3)($pA),%rax
750 mul $A
751 add %rax, $X[2]
752 adc \$0, %rdx
753 add $X[0], $X[2]
754 adc \$0, %rdx
755 mov $X[2], (+$pDst_o+8*4)($pDst)
756
757 mov %rdx, $X[0]
758 mov (+8*4)($pA),%rax
759 mul $A
760 add %rax, $X[3]
761 adc \$0, %rdx
762 add $X[0], $X[3]
763 adc \$0, %rdx
764
765 mov %rdx, $X[0]
766 mov (+8*5)($pA),%rax
767 mul $A
768 add %rax, $X[4]
769 adc \$0, %rdx
770 add $X[0], $X[4]
771 adc \$0, %rdx
772
773 mov %rdx, $X[0]
774 mov $X[6],%rax
775 mul $A
776 add %rax, $X[5]
777 adc \$0, %rdx
778 add $X[0], $X[5]
779 adc \$0, %rdx
780
781 mov %rdx, $X[0]
782 mov $X[7],%rax
783 mul $A
784 add %rax, $x7
785 adc \$0, %rdx
786 add $X[0], $x7
787 adc \$0, %rdx
788
789 mov %rdx, $X[1]
790
791 # ------------------
792 # third pass 23...27
793 # ------------------
794 mov (+8*2)($pA), $A
795
796 mov (+8*3)($pA),%rax
797 mul $A
798 add %rax, $X[3]
799 adc \$0, %rdx
800 mov $X[3], (+$pDst_o+8*5)($pDst)
801
802 mov %rdx, $X[0]
803 mov (+8*4)($pA),%rax
804 mul $A
805 add %rax, $X[4]
806 adc \$0, %rdx
807 add $X[0], $X[4]
808 adc \$0, %rdx
809 mov $X[4], (+$pDst_o+8*6)($pDst)
810
811 mov %rdx, $X[0]
812 mov (+8*5)($pA),%rax
813 mul $A
814 add %rax, $X[5]
815 adc \$0, %rdx
816 add $X[0], $X[5]
817 adc \$0, %rdx
818
819 mov %rdx, $X[0]
820 mov $X[6],%rax
821 mul $A
822 add %rax, $x7
823 adc \$0, %rdx
824 add $X[0], $x7
825 adc \$0, %rdx
826
827 mov %rdx, $X[0]
828 mov $X[7],%rax
829 mul $A
830 add %rax, $X[1]
831 adc \$0, %rdx
832 add $X[0], $X[1]
833 adc \$0, %rdx
834
835 mov %rdx, $X[2]
836
837 # ------------------
838 # fourth pass 34...37
839 # ------------------
840
841 mov (+8*3)($pA), $A
842
843 mov (+8*4)($pA),%rax
844 mul $A
845 add %rax, $X[5]
846 adc \$0, %rdx
847 mov $X[5], (+$pDst_o+8*7)($pDst)
848
849 mov %rdx, $X[0]
850 mov (+8*5)($pA),%rax
851 mul $A
852 add %rax, $x7
853 adc \$0, %rdx
854 add $X[0], $x7
855 adc \$0, %rdx
856 mov $x7, (+$pDst_o+8*8)($pDst)
857
858 mov %rdx, $X[0]
859 mov $X[6],%rax
860 mul $A
861 add %rax, $X[1]
862 adc \$0, %rdx
863 add $X[0], $X[1]
864 adc \$0, %rdx
865
866 mov %rdx, $X[0]
867 mov $X[7],%rax
868 mul $A
869 add %rax, $X[2]
870 adc \$0, %rdx
871 add $X[0], $X[2]
872 adc \$0, %rdx
873
874 mov %rdx, $X[5]
875
876 # ------------------
877 # fifth pass 45...47
878 # ------------------
879 mov (+8*4)($pA), $A
880
881 mov (+8*5)($pA),%rax
882 mul $A
883 add %rax, $X[1]
884 adc \$0, %rdx
885 mov $X[1], (+$pDst_o+8*9)($pDst)
886
887 mov %rdx, $X[0]
888 mov $X[6],%rax
889 mul $A
890 add %rax, $X[2]
891 adc \$0, %rdx
892 add $X[0], $X[2]
893 adc \$0, %rdx
894 mov $X[2], (+$pDst_o+8*10)($pDst)
895
896 mov %rdx, $X[0]
897 mov $X[7],%rax
898 mul $A
899 add %rax, $X[5]
900 adc \$0, %rdx
901 add $X[0], $X[5]
902 adc \$0, %rdx
903
904 mov %rdx, $X[1]
905
906 # ------------------
907 # sixth pass 56...57
908 # ------------------
909 mov (+8*5)($pA), $A
910
911 mov $X[6],%rax
912 mul $A
913 add %rax, $X[5]
914 adc \$0, %rdx
915 mov $X[5], (+$pDst_o+8*11)($pDst)
916
917 mov %rdx, $X[0]
918 mov $X[7],%rax
919 mul $A
920 add %rax, $X[1]
921 adc \$0, %rdx
922 add $X[0], $X[1]
923 adc \$0, %rdx
924 mov $X[1], (+$pDst_o+8*12)($pDst)
925
926 mov %rdx, $X[2]
927
928 # ------------------
929 # seventh pass 67
930 # ------------------
931 mov $X[6], $A
932
933 mov $X[7],%rax
934 mul $A
935 add %rax, $X[2]
936 adc \$0, %rdx
937 mov $X[2], (+$pDst_o+8*13)($pDst)
938
939 mov %rdx, (+$pDst_o+8*14)($pDst)
940
941 # start finalize (add in squares, and double off-terms)
942 mov (+$pDst_o+8*1)($pDst), $X[0]
943 mov (+$pDst_o+8*2)($pDst), $X[1]
944 mov (+$pDst_o+8*3)($pDst), $X[2]
945 mov (+$pDst_o+8*4)($pDst), $X[3]
946 mov (+$pDst_o+8*5)($pDst), $X[4]
947 mov (+$pDst_o+8*6)($pDst), $X[5]
948
949 mov (+8*3)($pA), %rax
950 mul %rax
951 mov %rax, $x6
952 mov %rdx, $X[6]
953
954 add $X[0], $X[0]
955 adc $X[1], $X[1]
956 adc $X[2], $X[2]
957 adc $X[3], $X[3]
958 adc $X[4], $X[4]
959 adc $X[5], $X[5]
960 adc \$0, $X[6]
961
962 mov (+8*0)($pA), %rax
963 mul %rax
964 mov %rax, (+$pDst_o+8*0)($pDst)
965 mov %rdx, $A
966
967 mov (+8*1)($pA), %rax
968 mul %rax
969
970 add $A, $X[0]
971 adc %rax, $X[1]
972 adc \$0, %rdx
973
974 mov %rdx, $A
975 mov $X[0], (+$pDst_o+8*1)($pDst)
976 mov $X[1], (+$pDst_o+8*2)($pDst)
977
978 mov (+8*2)($pA), %rax
979 mul %rax
980
981 add $A, $X[2]
982 adc %rax, $X[3]
983 adc \$0, %rdx
984
985 mov %rdx, $A
986
987 mov $X[2], (+$pDst_o+8*3)($pDst)
988 mov $X[3], (+$pDst_o+8*4)($pDst)
989
990 xor $tmp, $tmp
991 add $A, $X[4]
992 adc $x6, $X[5]
993 adc \$0, $tmp
994
995 mov $X[4], (+$pDst_o+8*5)($pDst)
996 mov $X[5], (+$pDst_o+8*6)($pDst)
997
998	# $tmp has 0/1 in column 7
999	# $X[6] has a full value in column 7
1000
1001 mov (+$pDst_o+8*7)($pDst), $X[0]
1002 mov (+$pDst_o+8*8)($pDst), $X[1]
1003 mov (+$pDst_o+8*9)($pDst), $X[2]
1004 mov (+$pDst_o+8*10)($pDst), $X[3]
1005 mov (+$pDst_o+8*11)($pDst), $X[4]
1006 mov (+$pDst_o+8*12)($pDst), $X[5]
1007 mov (+$pDst_o+8*13)($pDst), $x6
1008 mov (+$pDst_o+8*14)($pDst), $x7
1009
1010 mov $X[7], %rax
1011 mul %rax
1012 mov %rax, $X[7]
1013 mov %rdx, $A
1014
1015 add $X[0], $X[0]
1016 adc $X[1], $X[1]
1017 adc $X[2], $X[2]
1018 adc $X[3], $X[3]
1019 adc $X[4], $X[4]
1020 adc $X[5], $X[5]
1021 adc $x6, $x6
1022 adc $x7, $x7
1023 adc \$0, $A
1024
1025 add $tmp, $X[0]
1026
1027 mov (+8*4)($pA), %rax
1028 mul %rax
1029
1030 add $X[6], $X[0]
1031 adc %rax, $X[1]
1032 adc \$0, %rdx
1033
1034 mov %rdx, $tmp
1035
1036 mov $X[0], (+$pDst_o+8*7)($pDst)
1037 mov $X[1], (+$pDst_o+8*8)($pDst)
1038
1039 mov (+8*5)($pA), %rax
1040 mul %rax
1041
1042 add $tmp, $X[2]
1043 adc %rax, $X[3]
1044 adc \$0, %rdx
1045
1046 mov %rdx, $tmp
1047
1048 mov $X[2], (+$pDst_o+8*9)($pDst)
1049 mov $X[3], (+$pDst_o+8*10)($pDst)
1050
1051 mov (+8*6)($pA), %rax
1052 mul %rax
1053
1054 add $tmp, $X[4]
1055 adc %rax, $X[5]
1056 adc \$0, %rdx
1057
1058 mov $X[4], (+$pDst_o+8*11)($pDst)
1059 mov $X[5], (+$pDst_o+8*12)($pDst)
1060
1061 add %rdx, $x6
1062 adc $X[7], $x7
1063 adc \$0, $A
1064
1065 mov $x6, (+$pDst_o+8*13)($pDst)
1066 mov $x7, (+$pDst_o+8*14)($pDst)
1067 mov $A, (+$pDst_o+8*15)($pDst)
1068___
1069}
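
As the header comment says, SQR_512 computes each off-diagonal product
a[i]*a[j] (i < j) only once, doubles the whole partial result, and then
adds the diagonal squares a[i]^2.  A 4-limb C sketch of the same
strategy, assuming unsigned __int128 (the name is illustrative):

	#include <stdint.h>

	typedef unsigned __int128 u128;

	static void
	sqr4(uint64_t r[8], const uint64_t a[4])
	{
		uint64_t carry;
		u128 t;
		int i, j;

		for (i = 0; i < 8; i++)
			r[i] = 0;
		for (i = 0; i < 4; i++) {	/* off-diagonal terms once */
			carry = 0;
			for (j = i + 1; j < 4; j++) {
				t = (u128)a[i] * a[j] + r[i + j] + carry;
				r[i + j] = (uint64_t)t;
				carry = (uint64_t)(t >> 64);
			}
			r[i + 4] = carry;
		}
		carry = 0;			/* double the off-diagonal sum */
		for (i = 0; i < 8; i++) {
			uint64_t top = r[i] >> 63;

			r[i] = (r[i] << 1) | carry;
			carry = top;
		}
		carry = 0;			/* add the diagonal squares */
		for (i = 0; i < 4; i++) {
			t = (u128)a[i] * a[i] + r[2 * i] + carry;
			r[2 * i] = (uint64_t)t;
			t = (t >> 64) + r[2 * i + 1];
			r[2 * i + 1] = (uint64_t)t;
			carry = (uint64_t)(t >> 64);
		}
	}

This halves the number of multiplications relative to a general multiply;
the assembly does the doubling and the diagonal additions in two
register-wide chunks rather than in separate full passes.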
1070
1071#
1072# sqr_reduce: subroutine to compute Result = reduce(Result * Result)
1073#
1074# input and result also in: r9, r8, r15, r14, r13, r12, r11, r10
1075#
1076$code.=<<___;
1077.type sqr_reduce,\@abi-omnipotent
1078.align 16
1079sqr_reduce:
1080 mov (+$pResult_offset+8)(%rsp), %rcx
1081___
1082 &SQR_512("%rsp+$tmp16_offset+8", "%rcx", [map("%r$_",(10..15,8..9))], "%rbx", "%rbp", "%rsi", "%rdi");
1083$code.=<<___;
1084 # tail recursion optimization: jmp to mont_reduce and return from there
1085 jmp mont_reduce
1086 # call mont_reduce
1087 # ret
1088.size sqr_reduce,.-sqr_reduce
1089___
1090}}}
1091
1092#
1093# MAIN FUNCTION
1094#
1095
1096#mod_exp_512(UINT64 *result, /* 512 bits, 8 qwords */
1097# UINT64 *g, /* 512 bits, 8 qwords */
1098# UINT64 *exp, /* 512 bits, 8 qwords */
1099# struct mod_ctx_512 *data)
1100
1101# window size = 5
1102# table size = 2^5 = 32
1103#table_entries equ 32
1104#table_size equ table_entries * 8
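
The driver below is left-to-right fixed-window exponentiation: the result
is seeded from the table with the top window exp{511:507} (those five bits
are then masked out of the on-stack copy), and every later pass does one
squaring per bit followed by a multiply with the table entry selected by
the next five bits.  The first pass enters at sqr_2, so it performs only
two squarings for the short window exp{506:505}; after that each pass is
five squarings plus one multiply, for windows at idx = 500, 495, ..., 0.
A self-contained C sketch of the window extraction, assuming the exponent
is viewed as 66 bytes with two zero guard bytes appended (mirroring the
zeroed qword exp[8] set up below):

	#include <stdint.h>

	/* (exp >> idx) & 0x1F -- any 5-bit window spans at most two
	 * adjacent bytes, much as the assembly reads a 16-bit-aligned
	 * word and shifts right by idx & 15. */
	static unsigned
	get_window(const uint8_t exp[66], int idx)
	{
		unsigned v = exp[idx / 8] |
		    ((unsigned)exp[idx / 8 + 1] << 8);

		return (v >> (idx % 8)) & 0x1F;
	}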
1105$code.=<<___;
1106.globl mod_exp_512
1107.type mod_exp_512,\@function,4
1108mod_exp_512:
1109 push %rbp
1110 push %rbx
1111 push %r12
1112 push %r13
1113 push %r14
1114 push %r15
1115
1116 # adjust stack down and then align it with cache boundary
1117 mov %rsp, %r8
1118 sub \$$mem_size, %rsp
1119 and \$-64, %rsp
1120
1121 # store previous stack pointer and arguments
1122 mov %r8, (+$rsp_offset)(%rsp)
1123 mov %rdi, (+$pResult_offset)(%rsp)
1124 mov %rsi, (+$pG_offset)(%rsp)
1125 mov %rcx, (+$pData_offset)(%rsp)
1126.Lbody:
1127 # transform g into montgomery space
1128 # GT = reduce(g * C2) = reduce(g * (2^256))
1129 # reduce expects to have the input in [tmp16]
1130 pxor %xmm4, %xmm4
1131 movdqu (+16*0)(%rsi), %xmm0
1132 movdqu (+16*1)(%rsi), %xmm1
1133 movdqu (+16*2)(%rsi), %xmm2
1134 movdqu (+16*3)(%rsi), %xmm3
1135 movdqa %xmm4, (+$tmp16_offset+16*0)(%rsp)
1136 movdqa %xmm4, (+$tmp16_offset+16*1)(%rsp)
1137 movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp)
1138 movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp)
1139 movdqa %xmm0, (+$tmp16_offset+16*2)(%rsp)
1140 movdqa %xmm1, (+$tmp16_offset+16*3)(%rsp)
1141 movdqa %xmm2, (+$tmp16_offset+16*4)(%rsp)
1142 movdqa %xmm3, (+$tmp16_offset+16*5)(%rsp)
1143
1144 # load pExp before rdx gets blown away
1145 movdqu (+16*0)(%rdx), %xmm0
1146 movdqu (+16*1)(%rdx), %xmm1
1147 movdqu (+16*2)(%rdx), %xmm2
1148 movdqu (+16*3)(%rdx), %xmm3
1149
1150 lea (+$GT_offset)(%rsp), %rbx
1151 mov %rbx, (+$red_result_addr_offset)(%rsp)
1152 call mont_reduce
1153
1154	# Initialize tmp = C (Montgomery one: tmp = 2^128)
1155 lea (+$tmp_offset)(%rsp), %rcx
1156 xor %rax, %rax
1157 mov %rax, (+8*0)(%rcx)
1158 mov %rax, (+8*1)(%rcx)
1159 mov %rax, (+8*3)(%rcx)
1160 mov %rax, (+8*4)(%rcx)
1161 mov %rax, (+8*5)(%rcx)
1162 mov %rax, (+8*6)(%rcx)
1163 mov %rax, (+8*7)(%rcx)
1164	mov %rax, (+$exp_offset+8*8)(%rsp)	# zero the guard qword above exp[]
1165 movq \$1, (+8*2)(%rcx)
1166
1167 lea (+$garray_offset)(%rsp), %rbp
1168 mov %rcx, %rsi # pTmp
1169 mov %rbp, %rdi # Garray[][0]
1170___
1171
1172 &swizzle("%rdi", "%rcx", "%rax", "%rbx");
1173
1174 # for (rax = 31; rax != 0; rax--) {
1175 # tmp = reduce(tmp * G)
1176 # swizzle(pg, tmp);
1177 # pg += 2; }
1178$code.=<<___;
1179 mov \$31, %rax
1180 mov %rax, (+$i_offset)(%rsp)
1181 mov %rbp, (+$pg_offset)(%rsp)
1182 # rsi -> pTmp
1183 mov %rsi, (+$red_result_addr_offset)(%rsp)
1184 mov (+8*0)(%rsi), %r10
1185 mov (+8*1)(%rsi), %r11
1186 mov (+8*2)(%rsi), %r12
1187 mov (+8*3)(%rsi), %r13
1188 mov (+8*4)(%rsi), %r14
1189 mov (+8*5)(%rsi), %r15
1190 mov (+8*6)(%rsi), %r8
1191 mov (+8*7)(%rsi), %r9
1192init_loop:
1193 lea (+$GT_offset)(%rsp), %rdi
1194 call mont_mul_a3b
1195 lea (+$tmp_offset)(%rsp), %rsi
1196 mov (+$pg_offset)(%rsp), %rbp
1197 add \$2, %rbp
1198 mov %rbp, (+$pg_offset)(%rsp)
1199 mov %rsi, %rcx # rcx = rsi = addr of tmp
1200___
1201
1202 &swizzle("%rbp", "%rcx", "%rax", "%rbx");
1203$code.=<<___;
1204 mov (+$i_offset)(%rsp), %rax
1205 sub \$1, %rax
1206 mov %rax, (+$i_offset)(%rsp)
1207 jne init_loop
1208
1209 #
1210 # Copy exponent onto stack
1211 movdqa %xmm0, (+$exp_offset+16*0)(%rsp)
1212 movdqa %xmm1, (+$exp_offset+16*1)(%rsp)
1213 movdqa %xmm2, (+$exp_offset+16*2)(%rsp)
1214 movdqa %xmm3, (+$exp_offset+16*3)(%rsp)
1215
1216
1217 #
1218 # Do exponentiation
1219 # Initialize result to G[exp{511:507}]
1220 mov (+$exp_offset+62)(%rsp), %eax
1221 mov %rax, %rdx
1222 shr \$11, %rax
1223 and \$0x07FF, %edx
1224 mov %edx, (+$exp_offset+62)(%rsp)
1225 lea (+$garray_offset)(%rsp,%rax,2), %rsi
1226 mov (+$pResult_offset)(%rsp), %rdx
1227___
1228
1229 &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
1230
1231 #
1232 # Loop variables
1233 # rcx = [loop_idx] = index: 510-5 to 0 by 5
1234$code.=<<___;
1235 movq \$505, (+$loop_idx_offset)(%rsp)
1236
1237 mov (+$pResult_offset)(%rsp), %rcx
1238 mov %rcx, (+$red_result_addr_offset)(%rsp)
1239 mov (+8*0)(%rcx), %r10
1240 mov (+8*1)(%rcx), %r11
1241 mov (+8*2)(%rcx), %r12
1242 mov (+8*3)(%rcx), %r13
1243 mov (+8*4)(%rcx), %r14
1244 mov (+8*5)(%rcx), %r15
1245 mov (+8*6)(%rcx), %r8
1246 mov (+8*7)(%rcx), %r9
1247 jmp sqr_2
1248
1249main_loop_a3b:
1250 call sqr_reduce
1251 call sqr_reduce
1252 call sqr_reduce
1253sqr_2:
1254 call sqr_reduce
1255 call sqr_reduce
1256
1257 #
1258 # Do multiply, first look up proper value in Garray
1259 mov (+$loop_idx_offset)(%rsp), %rcx # bit index
1260 mov %rcx, %rax
1261 shr \$4, %rax # rax is word pointer
1262 mov (+$exp_offset)(%rsp,%rax,2), %edx
1263 and \$15, %rcx
1264 shrq %cl, %rdx
1265 and \$0x1F, %rdx
1266
1267 lea (+$garray_offset)(%rsp,%rdx,2), %rsi
1268 lea (+$tmp_offset)(%rsp), %rdx
1269 mov %rdx, %rdi
1270___
1271
1272 &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
1273 # rdi = tmp = pG
1274
1275 #
1276 # Call mod_mul_a1(pDst, pSrc1, pSrc2, pM, pData)
1277 # result result pG M Data
1278$code.=<<___;
1279 mov (+$pResult_offset)(%rsp), %rsi
1280 call mont_mul_a3b
1281
1282 #
1283 # finish loop
1284 mov (+$loop_idx_offset)(%rsp), %rcx
1285 sub \$5, %rcx
1286 mov %rcx, (+$loop_idx_offset)(%rsp)
1287 jge main_loop_a3b
1288
1289 #
1290
1291end_main_loop_a3b:
1292 # transform result out of Montgomery space
1293 # result = reduce(result)
1294 mov (+$pResult_offset)(%rsp), %rdx
1295 pxor %xmm4, %xmm4
1296 movdqu (+16*0)(%rdx), %xmm0
1297 movdqu (+16*1)(%rdx), %xmm1
1298 movdqu (+16*2)(%rdx), %xmm2
1299 movdqu (+16*3)(%rdx), %xmm3
1300 movdqa %xmm4, (+$tmp16_offset+16*4)(%rsp)
1301 movdqa %xmm4, (+$tmp16_offset+16*5)(%rsp)
1302 movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp)
1303 movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp)
1304 movdqa %xmm0, (+$tmp16_offset+16*0)(%rsp)
1305 movdqa %xmm1, (+$tmp16_offset+16*1)(%rsp)
1306 movdqa %xmm2, (+$tmp16_offset+16*2)(%rsp)
1307 movdqa %xmm3, (+$tmp16_offset+16*3)(%rsp)
1308 call mont_reduce
1309
1310	# If result >= m, subtract m
1311 # load result into r15:r8
1312 mov (+$pResult_offset)(%rsp), %rax
1313 mov (+8*0)(%rax), %r8
1314 mov (+8*1)(%rax), %r9
1315 mov (+8*2)(%rax), %r10
1316 mov (+8*3)(%rax), %r11
1317 mov (+8*4)(%rax), %r12
1318 mov (+8*5)(%rax), %r13
1319 mov (+8*6)(%rax), %r14
1320 mov (+8*7)(%rax), %r15
1321
1322 # subtract m
1323 mov (+$pData_offset)(%rsp), %rbx
1324 add \$$M, %rbx
1325
1326 sub (+8*0)(%rbx), %r8
1327 sbb (+8*1)(%rbx), %r9
1328 sbb (+8*2)(%rbx), %r10
1329 sbb (+8*3)(%rbx), %r11
1330 sbb (+8*4)(%rbx), %r12
1331 sbb (+8*5)(%rbx), %r13
1332 sbb (+8*6)(%rbx), %r14
1333 sbb (+8*7)(%rbx), %r15
1334
1335 # if Carry is clear, replace result with difference
1336 mov (+8*0)(%rax), %rsi
1337 mov (+8*1)(%rax), %rdi
1338 mov (+8*2)(%rax), %rcx
1339 mov (+8*3)(%rax), %rdx
1340 cmovnc %r8, %rsi
1341 cmovnc %r9, %rdi
1342 cmovnc %r10, %rcx
1343 cmovnc %r11, %rdx
1344 mov %rsi, (+8*0)(%rax)
1345 mov %rdi, (+8*1)(%rax)
1346 mov %rcx, (+8*2)(%rax)
1347 mov %rdx, (+8*3)(%rax)
1348
1349 mov (+8*4)(%rax), %rsi
1350 mov (+8*5)(%rax), %rdi
1351 mov (+8*6)(%rax), %rcx
1352 mov (+8*7)(%rax), %rdx
1353 cmovnc %r12, %rsi
1354 cmovnc %r13, %rdi
1355 cmovnc %r14, %rcx
1356 cmovnc %r15, %rdx
1357 mov %rsi, (+8*4)(%rax)
1358 mov %rdi, (+8*5)(%rax)
1359 mov %rcx, (+8*6)(%rax)
1360 mov %rdx, (+8*7)(%rax)
1361
1362 mov (+$rsp_offset)(%rsp), %rsi
1363 mov 0(%rsi),%r15
1364 mov 8(%rsi),%r14
1365 mov 16(%rsi),%r13
1366 mov 24(%rsi),%r12
1367 mov 32(%rsi),%rbx
1368 mov 40(%rsi),%rbp
1369 lea 48(%rsi),%rsp
1370.Lepilogue:
1371 ret
1372.size mod_exp_512, . - mod_exp_512
1373___
1374
1375sub reg_part {
1376my ($reg,$conv)=@_;
1377 if ($reg =~ /%r[0-9]+/) { $reg .= $conv; }
1378 elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
1379 elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; }
1380 elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; }
1381 return $reg;
1382}
1383
1384$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
1385$code =~ s/\`([^\`]*)\`/eval $1/gem;
1386$code =~ s/(\(\+[^)]+\))/eval $1/gem;
1387print $code;
1388close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/pa-risc2.s b/src/lib/libcrypto/bn/asm/pa-risc2.s
deleted file mode 100644
index f3b16290eb..0000000000
--- a/src/lib/libcrypto/bn/asm/pa-risc2.s
+++ /dev/null
@@ -1,1618 +0,0 @@
1;
2; PA-RISC 2.0 implementation of bn_asm code, based on the
3; 64-bit version of the code. This code is effectively the
4; same as the 64-bit version except the register model is
5; slightly different given that all values must be 32-bit between
6; function calls. Thus the 64-bit return values are returned
7; in %ret0 and %ret1 instead of just %ret0 as in the 64-bit version.
8;
9;
10; This code is approximately 2x faster than the C version
11; for RSA/DSA.
12;
13; See http://devresource.hp.com/ for more details on the PA-RISC
14; architecture. Also see the book "PA-RISC 2.0 Architecture"
15; by Gerry Kane for information on the instruction set architecture.
16;
17; Code written by Chris Ruemmler (with some help from the HP C
18; compiler).
19;
20; The code compiles with HP's assembler
21;
22
23 .level 2.0N
24 .space $TEXT$
25 .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
26
27;
28; Global Register definitions used for the routines.
29;
30; Some information about HP's runtime architecture for 32-bits.
31;
32; "Caller save" means the calling function must save the register
33; if it wants the register to be preserved.
34; "Callee save" means if a function uses the register, it must save
35; the value before using it.
36;
37; For the floating point registers
38;
39; "caller save" registers: fr4-fr11, fr22-fr31
40; "callee save" registers: fr12-fr21
41; "special" registers: fr0-fr3 (status and exception registers)
42;
43; For the integer registers
44; value zero : r0
45; "caller save" registers: r1,r19-r26
46; "callee save" registers: r3-r18
47; return register : r2 (rp)
48; return values ; r28,r29 (ret0,ret1)
49; Stack pointer ; r30 (sp)
50; millicode return ptr ; r31 (also a caller save register)
51
52
53;
54; Arguments to the routines
55;
56r_ptr .reg %r26
57a_ptr .reg %r25
58b_ptr .reg %r24
59num .reg %r24
60n .reg %r23
61
62;
63; Note that the "w" argument for bn_mul_add_words and bn_mul_words
64; is passed on the stack at a delta of -56 from the top of stack
65; as the routine is entered.
66;
67
68;
69; Globals used in some routines
70;
71
72top_overflow .reg %r23
73high_mask .reg %r22 ; value 0xffffffff80000000L
74
75
76;------------------------------------------------------------------------------
77;
78; bn_mul_add_words
79;
80;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
81; int num, BN_ULONG w)
82;
83; arg0 = r_ptr
84; arg1 = a_ptr
85; arg3 = num
86; -56(sp) = w
87;
88; Local register definitions
89;
90
91fm1 .reg %fr22
92fm .reg %fr23
93ht_temp .reg %fr24
94ht_temp_1 .reg %fr25
95lt_temp .reg %fr26
96lt_temp_1 .reg %fr27
97fm1_1 .reg %fr28
98fm_1 .reg %fr29
99
100fw_h .reg %fr7L
101fw_l .reg %fr7R
102fw .reg %fr7
103
104fht_0 .reg %fr8L
105flt_0 .reg %fr8R
106t_float_0 .reg %fr8
107
108fht_1 .reg %fr9L
109flt_1 .reg %fr9R
110t_float_1 .reg %fr9
111
112tmp_0 .reg %r31
113tmp_1 .reg %r21
114m_0 .reg %r20
115m_1 .reg %r19
116ht_0 .reg %r1
117ht_1 .reg %r3
118lt_0 .reg %r4
119lt_1 .reg %r5
120m1_0 .reg %r6
121m1_1 .reg %r7
122rp_val .reg %r8
123rp_val_1 .reg %r9
124
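
The unrolled loop below assembles each 64x64 product from four XMPYU
32x32 multiplies -- ht*w_l and lt*w_h (the cross terms m1 and m), plus
ht*w_h and lt*w_l (ht_temp and lt_temp) -- because the FP multipliers
only produce 64-bit products of 32-bit halves.  A hedged C sketch of one
r[i] = r[i] + a[i]*w + carry step using the same decomposition (the
function name is illustrative):

	#include <stdint.h>

	static uint64_t			/* returns the carry out (ht) */
	mul_add_word(uint64_t a, uint64_t w, uint64_t r, uint64_t c,
	    uint64_t *lo_out)
	{
		uint64_t ah = a >> 32, al = a & 0xffffffff;
		uint64_t wh = w >> 32, wl = w & 0xffffffff;
		uint64_t m  = al * wh;		/* XMPYU flt,fw_h,fm  */
		uint64_t m1 = ah * wl;		/* XMPYU fht,fw_l,fm1 */
		uint64_t ht = ah * wh;		/* XMPYU fht,fw_h     */
		uint64_t lt = al * wl;		/* XMPYU flt,fw_l     */
		uint64_t mlo;

		m += m1;
		if (m < m1)			/* cross-term overflow:    */
			ht += (uint64_t)1 << 32;	/* ht += top_overflow */
		ht += m >> 32;
		mlo = m << 32;
		lt += mlo; ht += (lt < mlo);	/* the ADD / ADD,DC pairs  */
		lt += c;   ht += (lt < c);	/* add carry-in            */
		lt += r;   ht += (lt < r);	/* add rp[i]               */
		*lo_out = lt;
		return ht;
	}

bn_mul_add_words then just chains c = mul_add_word(a[i], w, r[i], c,
&r[i]) across the vector, two elements per iteration in the unrolled
loop.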
125bn_mul_add_words
126 .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
127 .proc
128 .callinfo frame=128
129 .entry
130 .align 64
131
132 STD %r3,0(%sp) ; save r3
133 STD %r4,8(%sp) ; save r4
134 NOP ; Needed to make the loop 16-byte aligned
135 NOP ; Needed to make the loop 16-byte aligned
136
137 STD %r5,16(%sp) ; save r5
138 NOP
139 STD %r6,24(%sp) ; save r6
140 STD %r7,32(%sp) ; save r7
141
142 STD %r8,40(%sp) ; save r8
143 STD %r9,48(%sp) ; save r9
144 COPY %r0,%ret1 ; return 0 by default
145 DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
146
147 CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit
148 LDO 128(%sp),%sp ; bump stack
149
150 ;
151 ; The loop is unrolled twice, so if there is only 1 number
152 ; then go straight to the cleanup code.
153 ;
154 CMPIB,= 1,num,bn_mul_add_words_single_top
155 FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l)
156
157 ;
158 ; This loop is unrolled 2 times (64-byte aligned as well)
159 ;
160 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
161; two 32-bit multiplies can be issued per cycle.
162 ;
163bn_mul_add_words_unroll2
164
165 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
166 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
167 LDD 0(r_ptr),rp_val ; rp[0]
168 LDD 8(r_ptr),rp_val_1 ; rp[1]
169
170 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
171 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
172 FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
173 FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
174
175 XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
176 XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
177 FSTD fm,-8(%sp) ; -8(sp) = m[0]
178 FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
179
180 XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
181 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
182 FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp
183 FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1
184
185 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
186 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp_1 = flt_1*fw_l
187 FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp
188 FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1
189
190 LDD -8(%sp),m_0 ; m[0]
191 LDD -40(%sp),m_1 ; m[1]
192 LDD -16(%sp),m1_0 ; m1[0]
193 LDD -48(%sp),m1_1 ; m1[1]
194
195 LDD -24(%sp),ht_0 ; ht[0]
196 LDD -56(%sp),ht_1 ; ht[1]
197 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
198 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
199
200 LDD -32(%sp),lt_0
201 LDD -64(%sp),lt_1
202 CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0])
203 ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
204
205 CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1])
206 ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
207 EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32
208 DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32
209
210 EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32
211 DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32
212 ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
213 ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
214
215 ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
216 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
217 ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
218 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
219
220 ADD %ret1,lt_0,lt_0 ; lt[0] = lt[0] + c;
221 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
222 ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0]
223 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
224
225 LDO -2(num),num ; num = num - 2;
226 ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c);
227 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
228 STD lt_0,0(r_ptr) ; rp[0] = lt[0]
229
230 ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1]
231 ADD,DC ht_1,%r0,%ret1 ; ht[1]++
232 LDO 16(a_ptr),a_ptr ; a_ptr += 2
233
234 STD lt_1,8(r_ptr) ; rp[1] = lt[1]
235 CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
236 LDO 16(r_ptr),r_ptr ; r_ptr += 2
237
238 CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
239
240 ;
241 ; Top of loop aligned on 64-byte boundary
242 ;
243bn_mul_add_words_single_top
244 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
245 LDD 0(r_ptr),rp_val ; rp[0]
246 LDO 8(a_ptr),a_ptr ; a_ptr++
247 XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
248 FSTD fm1,-16(%sp) ; -16(sp) = m1
249 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
250 FSTD fm,-8(%sp) ; -8(sp) = m
251 XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
252 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
253 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
254 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
255
256 LDD -8(%sp),m_0
257 LDD -16(%sp),m1_0 ; m1 = temp1
258 ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
259 LDD -24(%sp),ht_0
260 LDD -32(%sp),lt_0
261
262 CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
263 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
264
265 EXTRD,U tmp_0,31,32,m_0 ; m>>32
266 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
267
268 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
269 ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1;
270 ADD,DC ht_0,%r0,ht_0 ; ht++
271 ADD %ret1,tmp_0,lt_0 ; lt = lt + c;
272 ADD,DC ht_0,%r0,ht_0 ; ht++
273 ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0]
274 ADD,DC ht_0,%r0,%ret1 ; ht++
275 STD lt_0,0(r_ptr) ; rp[0] = lt
276
277bn_mul_add_words_exit
278 .EXIT
279
280 EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
281 LDD -80(%sp),%r9 ; restore r9
282 LDD -88(%sp),%r8 ; restore r8
283 LDD -96(%sp),%r7 ; restore r7
284 LDD -104(%sp),%r6 ; restore r6
285 LDD -112(%sp),%r5 ; restore r5
286 LDD -120(%sp),%r4 ; restore r4
287 BVE (%rp)
288 LDD,MB -128(%sp),%r3 ; restore r3
289 .PROCEND ;in=23,24,25,26,29;out=28;
290
291;----------------------------------------------------------------------------
292;
293;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
294;
295; arg0 = rp
296; arg1 = ap
297; arg2 = num
298; w on stack at -56(sp)
299
300bn_mul_words
301 .proc
302 .callinfo frame=128
303 .entry
304 .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
305 .align 64
306
307 STD %r3,0(%sp) ; save r3
308 STD %r4,8(%sp) ; save r4
309 NOP
310 STD %r5,16(%sp) ; save r5
311
312 STD %r6,24(%sp) ; save r6
313 STD %r7,32(%sp) ; save r7
314 COPY %r0,%ret1 ; return 0 by default
315 DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
316
317 CMPIB,>= 0,num,bn_mul_words_exit
318 LDO 128(%sp),%sp ; bump stack
319
320 ;
321 ; See if only 1 word to do, thus just do cleanup
322 ;
323 CMPIB,= 1,num,bn_mul_words_single_top
324 FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l)
325
326 ;
327 ; This loop is unrolled 2 times (64-byte aligned as well)
328 ;
329 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
330; two 32-bit multiplies can be issued per cycle.
331 ;
332bn_mul_words_unroll2
333
334 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
335 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
336 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
337 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
338
339 FSTD fm1,-16(%sp) ; -16(sp) = m1
340 FSTD fm1_1,-48(%sp) ; -48(sp) = m1
341 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
342 XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
343
344 FSTD fm,-8(%sp) ; -8(sp) = m
345 FSTD fm_1,-40(%sp) ; -40(sp) = m
346 XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
347 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
348
349 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
350 FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht
351 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
352 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp_1 = flt_1*fw_l
353
354 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
355 FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt
356 LDD -8(%sp),m_0
357 LDD -40(%sp),m_1
358
359 LDD -16(%sp),m1_0
360 LDD -48(%sp),m1_1
361 LDD -24(%sp),ht_0
362 LDD -56(%sp),ht_1
363
364 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1;
365 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1;
366 LDD -32(%sp),lt_0
367 LDD -64(%sp),lt_1
368
369 CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1)
370 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
371 CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1)
372 ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32)
373
374 EXTRD,U tmp_0,31,32,m_0 ; m>>32
375 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
376 EXTRD,U tmp_1,31,32,m_1 ; m>>32
377 DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32
378
379 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
380 ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32)
381 ADD lt_0,m1_0,lt_0 ; lt = lt+m1;
382 ADD,DC ht_0,%r0,ht_0 ; ht++
383
384 ADD lt_1,m1_1,lt_1 ; lt = lt+m1;
385 ADD,DC ht_1,%r0,ht_1 ; ht++
386 ADD %ret1,lt_0,lt_0 ; lt = lt + c (ret1);
387 ADD,DC ht_0,%r0,ht_0 ; ht++
388
389 ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0)
390 ADD,DC ht_1,%r0,ht_1 ; ht++
391 STD lt_0,0(r_ptr) ; rp[0] = lt
392 STD lt_1,8(r_ptr) ; rp[1] = lt
393
394 COPY ht_1,%ret1 ; carry = ht
395 LDO -2(num),num ; num = num - 2;
396 LDO 16(a_ptr),a_ptr ; ap += 2
397 CMPIB,<= 2,num,bn_mul_words_unroll2
398 LDO 16(r_ptr),r_ptr ; rp++
399
400 CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
401
402 ;
403 ; Top of loop aligned on 64-byte boundary
404 ;
405bn_mul_words_single_top
406 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
407
408 XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
409 FSTD fm1,-16(%sp) ; -16(sp) = m1
410 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
411 FSTD fm,-8(%sp) ; -8(sp) = m
412 XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
413 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
414 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
415 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
416
417 LDD -8(%sp),m_0
418 LDD -16(%sp),m1_0
419 ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
420 LDD -24(%sp),ht_0
421 LDD -32(%sp),lt_0
422
423 CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
424 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
425
426 EXTRD,U tmp_0,31,32,m_0 ; m>>32
427 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
428
429 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
430 ADD lt_0,m1_0,lt_0 ; lt= lt+m1;
431 ADD,DC ht_0,%r0,ht_0 ; ht++
432
433 ADD %ret1,lt_0,lt_0 ; lt = lt + c;
434 ADD,DC ht_0,%r0,ht_0 ; ht++
435
436 COPY ht_0,%ret1 ; copy carry
437 STD lt_0,0(r_ptr) ; rp[0] = lt
438
439bn_mul_words_exit
440 .EXIT
441 EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
442 LDD -96(%sp),%r7 ; restore r7
443 LDD -104(%sp),%r6 ; restore r6
444 LDD -112(%sp),%r5 ; restore r5
445 LDD -120(%sp),%r4 ; restore r4
446 BVE (%rp)
447 LDD,MB -128(%sp),%r3 ; restore r3
448 .PROCEND
449
450;----------------------------------------------------------------------------
451;
452;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
453;
454; arg0 = rp
455; arg1 = ap
456; arg2 = num
457;
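;
; Only three 32x32 multiplies are needed per word because the two
; cross products of a square are equal. The routine's C equivalent
; (a sketch, using a 128-bit intermediate for clarity):
;
;   void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
;   {
;           while (num-- > 0) {
;                   unsigned __int128 t = (unsigned __int128)*ap * *ap;
;                   rp[0] = (BN_ULONG)t;          /* low word  */
;                   rp[1] = (BN_ULONG)(t >> 64);  /* high word */
;                   ap++;  rp += 2;
;           }
;   }
;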
458
459bn_sqr_words
460 .proc
461 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
462 .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
463 .entry
464 .align 64
465
466 STD %r3,0(%sp) ; save r3
467 STD %r4,8(%sp) ; save r4
468 NOP
469 STD %r5,16(%sp) ; save r5
470
471 CMPIB,>= 0,num,bn_sqr_words_exit
472 LDO 128(%sp),%sp ; bump stack
473
474 ;
475 ; If only 1, then go straight to cleanup
476 ;
477 CMPIB,= 1,num,bn_sqr_words_single_top
478 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
479
480 ;
481 ; This loop is unrolled 2 times (64-byte aligned as well)
482 ;
483
484bn_sqr_words_unroll2
485 FLDD 0(a_ptr),t_float_0 ; a[0]
486 FLDD 8(a_ptr),t_float_1 ; a[1]
487 XMPYU fht_0,flt_0,fm ; m[0]
488 XMPYU fht_1,flt_1,fm_1 ; m[1]
489
490 FSTD fm,-24(%sp) ; store m[0]
491 FSTD fm_1,-56(%sp) ; store m[1]
492 XMPYU flt_0,flt_0,lt_temp ; lt[0]
493 XMPYU flt_1,flt_1,lt_temp_1 ; lt[1]
494
495 FSTD lt_temp,-16(%sp) ; store lt[0]
496 FSTD lt_temp_1,-48(%sp) ; store lt[1]
497 XMPYU fht_0,fht_0,ht_temp ; ht[0]
498 XMPYU fht_1,fht_1,ht_temp_1 ; ht[1]
499
500 FSTD ht_temp,-8(%sp) ; store ht[0]
501 FSTD ht_temp_1,-40(%sp) ; store ht[1]
502 LDD -24(%sp),m_0
503 LDD -56(%sp),m_1
504
505 AND m_0,high_mask,tmp_0 ; m[0] & Mask
506 AND m_1,high_mask,tmp_1 ; m[1] & Mask
507 DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1
508 DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1
509
510 LDD -16(%sp),lt_0
511 LDD -48(%sp),lt_1
512 EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1
513 EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1
514
515 LDD -8(%sp),ht_0
516 LDD -40(%sp),ht_1
517 ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0
518 ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1
519
520 ADD lt_0,m_0,lt_0 ; lt = lt+m
521 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
522 STD lt_0,0(r_ptr) ; rp[0] = lt[0]
523 STD ht_0,8(r_ptr) ; rp[1] = ht[0]
524
525 ADD lt_1,m_1,lt_1 ; lt = lt+m
526 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
527 STD lt_1,16(r_ptr) ; rp[2] = lt[1]
528 STD ht_1,24(r_ptr) ; rp[3] = ht[1]
529
530 LDO -2(num),num ; num = num - 2;
531 LDO 16(a_ptr),a_ptr ; ap += 2
532 CMPIB,<= 2,num,bn_sqr_words_unroll2
533 LDO 32(r_ptr),r_ptr ; rp += 4
534
535 CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
536
537 ;
538 ; Top of loop aligned on 64-byte boundary
539 ;
540bn_sqr_words_single_top
541 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
542
543 XMPYU fht_0,flt_0,fm ; m
544 FSTD fm,-24(%sp) ; store m
545
546 XMPYU flt_0,flt_0,lt_temp ; lt
547 FSTD lt_temp,-16(%sp) ; store lt
548
549 XMPYU fht_0,fht_0,ht_temp ; ht
550 FSTD ht_temp,-8(%sp) ; store ht
551
552 LDD -24(%sp),m_0 ; load m
553 AND m_0,high_mask,tmp_0 ; m & Mask
554 DEPD,Z m_0,30,31,m_0 ; m << 32+1
555 LDD -16(%sp),lt_0 ; lt
556
557 LDD -8(%sp),ht_0 ; ht
558 EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1
559 ADD m_0,lt_0,lt_0 ; lt = lt+m
560 ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0
561 ADD,DC ht_0,%r0,ht_0 ; ht++
562
563 STD lt_0,0(r_ptr) ; rp[0] = lt
564 STD ht_0,8(r_ptr) ; rp[1] = ht
565
566bn_sqr_words_exit
567 .EXIT
568 LDD -112(%sp),%r5 ; restore r5
569 LDD -120(%sp),%r4 ; restore r4
570 BVE (%rp)
571 LDD,MB -128(%sp),%r3
572 .PROCEND ;in=23,24,25,26,29;out=28;
573
574
575;----------------------------------------------------------------------------
576;
577;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
578;
579; arg0 = rp
580; arg1 = ap
581; arg2 = bp
582; arg3 = n
583
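;
; The ADD/ADD,DC pairs below implement a word-by-word add with carry
; propagation; in C (a sketch):
;
;   BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp, int n)
;   {
;           BN_ULONG c = 0;                /* running carry, 0 or 1 */
;           while (n-- > 0) {
;                   BN_ULONG t = *ap++ + c;
;                   c = (t < c);           /* carry out of a+c      */
;                   BN_ULONG l = t + *bp++;
;                   c += (l < t);          /* carry out of t+b      */
;                   *rp++ = l;
;           }
;           return c;
;   }
;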
584t .reg %r22
585b .reg %r21
586l .reg %r20
587
588bn_add_words
589 .proc
590 .entry
591 .callinfo
592 .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
593 .align 64
594
595 CMPIB,>= 0,n,bn_add_words_exit
596 COPY %r0,%ret1 ; return 0 by default
597
598 ;
599 ; If 2 or more numbers do the loop
600 ;
601 CMPIB,= 1,n,bn_add_words_single_top
602 NOP
603
604 ;
605 ; This loop is unrolled 2 times (64-byte aligned as well)
606 ;
607bn_add_words_unroll2
608 LDD 0(a_ptr),t
609 LDD 0(b_ptr),b
610 ADD t,%ret1,t ; t = t+c;
611 ADD,DC %r0,%r0,%ret1 ; set c to carry
612 ADD t,b,l ; l = t + b[0]
613 ADD,DC %ret1,%r0,%ret1 ; c+= carry
614 STD l,0(r_ptr)
615
616 LDD 8(a_ptr),t
617 LDD 8(b_ptr),b
618 ADD t,%ret1,t ; t = t+c;
619 ADD,DC %r0,%r0,%ret1 ; set c to carry
620 ADD t,b,l ; l = t + b[0]
621 ADD,DC %ret1,%r0,%ret1 ; c+= carry
622 STD l,8(r_ptr)
623
624 LDO -2(n),n
625 LDO 16(a_ptr),a_ptr
626 LDO 16(b_ptr),b_ptr
627
628 CMPIB,<= 2,n,bn_add_words_unroll2
629 LDO 16(r_ptr),r_ptr
630
631 CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
632
633bn_add_words_single_top
634 LDD 0(a_ptr),t
635 LDD 0(b_ptr),b
636
637 ADD t,%ret1,t ; t = t+c;
638 ADD,DC %r0,%r0,%ret1 ; set c to carry (could use CMPCLR??)
639 ADD t,b,l ; l = t + b[0]
640 ADD,DC %ret1,%r0,%ret1 ; c+= carry
641 STD l,0(r_ptr)
642
643bn_add_words_exit
644 .EXIT
645 BVE (%rp)
646 EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
647 .PROCEND ;in=23,24,25,26,29;out=28;
648
649;----------------------------------------------------------------------------
650;
651;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
652;
653; arg0 = rp
654; arg1 = ap
655; arg2 = bp
656; arg3 = n
657
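;
; The CMPCLR pairs below implement the borrow update: on equality the
; previous borrow is kept, otherwise it becomes (t1 < t2). In C
; (a sketch):
;
;   BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp, int n)
;   {
;           BN_ULONG c = 0;                /* running borrow, 0 or 1 */
;           while (n-- > 0) {
;                   BN_ULONG t1 = *ap++, t2 = *bp++;
;                   *rp++ = t1 - t2 - c;
;                   if (t1 != t2)
;                           c = (t1 < t2); /* else keep old borrow   */
;           }
;           return c;
;   }
;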
658t1 .reg %r22
659t2 .reg %r21
660sub_tmp1 .reg %r20
661sub_tmp2 .reg %r19
662
663
664bn_sub_words
665 .proc
666 .callinfo
667 .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
668 .entry
669 .align 64
670
671 CMPIB,>= 0,n,bn_sub_words_exit
672 COPY %r0,%ret1 ; return 0 by default
673
674 ;
675 ; If 2 or more numbers do the loop
676 ;
677 CMPIB,= 1,n,bn_sub_words_single_top
678 NOP
679
680 ;
681 ; This loop is unrolled 2 times (64-byte aligned as well)
682 ;
683bn_sub_words_unroll2
684 LDD 0(a_ptr),t1
685 LDD 0(b_ptr),t2
686 SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
687 SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
688
689 CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
690 LDO 1(%r0),sub_tmp2
691
692 CMPCLR,*= t1,t2,%r0
693 COPY sub_tmp2,%ret1
694 STD sub_tmp1,0(r_ptr)
695
696 LDD 8(a_ptr),t1
697 LDD 8(b_ptr),t2
698 SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
699 SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
700 CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
701 LDO 1(%r0),sub_tmp2
702
703 CMPCLR,*= t1,t2,%r0
704 COPY sub_tmp2,%ret1
705 STD sub_tmp1,8(r_ptr)
706
707 LDO -2(n),n
708 LDO 16(a_ptr),a_ptr
709 LDO 16(b_ptr),b_ptr
710
711 CMPIB,<= 2,n,bn_sub_words_unroll2
712 LDO 16(r_ptr),r_ptr
713
714 CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
715
716bn_sub_words_single_top
717 LDD 0(a_ptr),t1
718 LDD 0(b_ptr),t2
719 SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
720 SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
721 CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
722 LDO 1(%r0),sub_tmp2
723
724 CMPCLR,*= t1,t2,%r0
725 COPY sub_tmp2,%ret1
726
727 STD sub_tmp1,0(r_ptr)
728
729bn_sub_words_exit
730 .EXIT
731 BVE (%rp)
732 EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
733 .PROCEND ;in=23,24,25,26,29;out=28;
734
735;------------------------------------------------------------------------------
736;
737; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
738;
739; arg0 = h
740; arg1 = l
741; arg2 = d
742;
743; This is mainly just output from the HP C compiler.
744;
745;------------------------------------------------------------------------------
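;
; Functionally it divides the double word h:l by d and returns the
; one-word quotient; the caller must ensure h < d so the quotient
; fits (otherwise the code below aborts). A sketch of the contract,
; not of the compiler output:
;
;   BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
;   {
;           unsigned __int128 t = ((unsigned __int128)h << 64) | l;
;           return (BN_ULONG)(t / d);      /* requires h < d */
;   }
;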
746bn_div_words
747 .PROC
748 .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN
749 .IMPORT BN_num_bits_word,CODE
750 ;--- not PIC .IMPORT __iob,DATA
751 ;--- not PIC .IMPORT fprintf,CODE
752 .IMPORT abort,CODE
753 .IMPORT $$div2U,MILLICODE
754 .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
755 .ENTRY
756 STW %r2,-20(%r30) ;offset 0x8ec
757 STW,MA %r3,192(%r30) ;offset 0x8f0
758 STW %r4,-188(%r30) ;offset 0x8f4
759 DEPD %r5,31,32,%r6 ;offset 0x8f8
760 STD %r6,-184(%r30) ;offset 0x8fc
761 DEPD %r7,31,32,%r8 ;offset 0x900
762 STD %r8,-176(%r30) ;offset 0x904
763 STW %r9,-168(%r30) ;offset 0x908
764 LDD -248(%r30),%r3 ;offset 0x90c
765 COPY %r26,%r4 ;offset 0x910
766 COPY %r24,%r5 ;offset 0x914
767 DEPD %r25,31,32,%r4 ;offset 0x918
768 CMPB,*<> %r3,%r0,$0006000C ;offset 0x91c
769 DEPD %r23,31,32,%r5 ;offset 0x920
770 MOVIB,TR -1,%r29,$00060002 ;offset 0x924
771 EXTRD,U %r29,31,32,%r28 ;offset 0x928
772$0006002A
773 LDO -1(%r29),%r29 ;offset 0x92c
774 SUB %r23,%r7,%r23 ;offset 0x930
775$00060024
776 SUB %r4,%r31,%r25 ;offset 0x934
777 AND %r25,%r19,%r26 ;offset 0x938
778 CMPB,*<>,N %r0,%r26,$00060046 ;offset 0x93c
779 DEPD,Z %r25,31,32,%r20 ;offset 0x940
780 OR %r20,%r24,%r21 ;offset 0x944
781 CMPB,*<<,N %r21,%r23,$0006002A ;offset 0x948
782 SUB %r31,%r2,%r31 ;offset 0x94c
783$00060046
784$0006002E
785 DEPD,Z %r23,31,32,%r25 ;offset 0x950
786 EXTRD,U %r23,31,32,%r26 ;offset 0x954
787 AND %r25,%r19,%r24 ;offset 0x958
788 ADD,L %r31,%r26,%r31 ;offset 0x95c
789 CMPCLR,*>>= %r5,%r24,%r0 ;offset 0x960
790 LDO 1(%r31),%r31 ;offset 0x964
791$00060032
792 CMPB,*<<=,N %r31,%r4,$00060036 ;offset 0x968
793 LDO -1(%r29),%r29 ;offset 0x96c
794 ADD,L %r4,%r3,%r4 ;offset 0x970
795$00060036
796 ADDIB,=,N -1,%r8,$D0 ;offset 0x974
797 SUB %r5,%r24,%r28 ;offset 0x978
798$0006003A
799 SUB %r4,%r31,%r24 ;offset 0x97c
800 SHRPD %r24,%r28,32,%r4 ;offset 0x980
801 DEPD,Z %r29,31,32,%r9 ;offset 0x984
802 DEPD,Z %r28,31,32,%r5 ;offset 0x988
803$0006001C
804 EXTRD,U %r4,31,32,%r31 ;offset 0x98c
805 CMPB,*<>,N %r31,%r2,$00060020 ;offset 0x990
806 MOVB,TR %r6,%r29,$D1 ;offset 0x994
807 STD %r29,-152(%r30) ;offset 0x998
808$0006000C
809 EXTRD,U %r3,31,32,%r25 ;offset 0x99c
810 COPY %r3,%r26 ;offset 0x9a0
811 EXTRD,U %r3,31,32,%r9 ;offset 0x9a4
812 EXTRD,U %r4,31,32,%r8 ;offset 0x9a8
813 .CALL ARGW0=GR,ARGW1=GR,RTNVAL=GR ;in=25,26;out=28;
814 B,L BN_num_bits_word,%r2 ;offset 0x9ac
815 EXTRD,U %r5,31,32,%r7 ;offset 0x9b0
816 LDI 64,%r20 ;offset 0x9b4
817 DEPD %r7,31,32,%r5 ;offset 0x9b8
818 DEPD %r8,31,32,%r4 ;offset 0x9bc
819 DEPD %r9,31,32,%r3 ;offset 0x9c0
820 CMPB,= %r28,%r20,$00060012 ;offset 0x9c4
821 COPY %r28,%r24 ;offset 0x9c8
822 MTSARCM %r24 ;offset 0x9cc
823 DEPDI,Z -1,%sar,1,%r19 ;offset 0x9d0
824 CMPB,*>>,N %r4,%r19,$D2 ;offset 0x9d4
825$00060012
826 SUBI 64,%r24,%r31 ;offset 0x9d8
827 CMPCLR,*<< %r4,%r3,%r0 ;offset 0x9dc
828 SUB %r4,%r3,%r4 ;offset 0x9e0
829$00060016
830 CMPB,= %r31,%r0,$0006001A ;offset 0x9e4
831 COPY %r0,%r9 ;offset 0x9e8
832 MTSARCM %r31 ;offset 0x9ec
833 DEPD,Z %r3,%sar,64,%r3 ;offset 0x9f0
834 SUBI 64,%r31,%r26 ;offset 0x9f4
835 MTSAR %r26 ;offset 0x9f8
836 SHRPD %r4,%r5,%sar,%r4 ;offset 0x9fc
837 MTSARCM %r31 ;offset 0xa00
838 DEPD,Z %r5,%sar,64,%r5 ;offset 0xa04
839$0006001A
840 DEPDI,Z -1,31,32,%r19 ;offset 0xa08
841 AND %r3,%r19,%r29 ;offset 0xa0c
842 EXTRD,U %r29,31,32,%r2 ;offset 0xa10
843 DEPDI,Z -1,63,32,%r6 ;offset 0xa14
844 MOVIB,TR 2,%r8,$0006001C ;offset 0xa18
845 EXTRD,U %r3,63,32,%r7 ;offset 0xa1c
846$D2
847 ;--- not PIC ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20
848 ;--- not PIC LDIL LR'C$7,%r21 ;offset 0xa24
849 ;--- not PIC LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28
850 ;--- not PIC .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28;
851 ;--- not PIC B,L fprintf,%r2 ;offset 0xa2c
852 ;--- not PIC LDO RR'C$7(%r21),%r25 ;offset 0xa30
853 .CALL ;
854 B,L abort,%r2 ;offset 0xa34
855 NOP ;offset 0xa38
856 B $D3 ;offset 0xa3c
857 LDW -212(%r30),%r2 ;offset 0xa40
858$00060020
859 COPY %r4,%r26 ;offset 0xa44
860 EXTRD,U %r4,31,32,%r25 ;offset 0xa48
861 COPY %r2,%r24 ;offset 0xa4c
862 .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
863 B,L $$div2U,%r31 ;offset 0xa50
864 EXTRD,U %r2,31,32,%r23 ;offset 0xa54
865 DEPD %r28,31,32,%r29 ;offset 0xa58
866$00060022
867 STD %r29,-152(%r30) ;offset 0xa5c
868$D1
869 AND %r5,%r19,%r24 ;offset 0xa60
870 EXTRD,U %r24,31,32,%r24 ;offset 0xa64
871 STW %r2,-160(%r30) ;offset 0xa68
872 STW %r7,-128(%r30) ;offset 0xa6c
873 FLDD -152(%r30),%fr4 ;offset 0xa70
874 FLDD -152(%r30),%fr7 ;offset 0xa74
875 FLDW -160(%r30),%fr8L ;offset 0xa78
876 FLDW -128(%r30),%fr5L ;offset 0xa7c
877 XMPYU %fr8L,%fr7L,%fr10 ;offset 0xa80
878 FSTD %fr10,-136(%r30) ;offset 0xa84
879 XMPYU %fr8L,%fr7R,%fr22 ;offset 0xa88
880 FSTD %fr22,-144(%r30) ;offset 0xa8c
881 XMPYU %fr5L,%fr4L,%fr11 ;offset 0xa90
882 XMPYU %fr5L,%fr4R,%fr23 ;offset 0xa94
883 FSTD %fr11,-112(%r30) ;offset 0xa98
884 FSTD %fr23,-120(%r30) ;offset 0xa9c
885 LDD -136(%r30),%r28 ;offset 0xaa0
886 DEPD,Z %r28,31,32,%r31 ;offset 0xaa4
887 LDD -144(%r30),%r20 ;offset 0xaa8
888 ADD,L %r20,%r31,%r31 ;offset 0xaac
889 LDD -112(%r30),%r22 ;offset 0xab0
890 DEPD,Z %r22,31,32,%r22 ;offset 0xab4
891 LDD -120(%r30),%r21 ;offset 0xab8
892 B $00060024 ;offset 0xabc
893 ADD,L %r21,%r22,%r23 ;offset 0xac0
894$D0
895 OR %r9,%r29,%r29 ;offset 0xac4
896$00060040
897 EXTRD,U %r29,31,32,%r28 ;offset 0xac8
898$00060002
899$L2
900 LDW -212(%r30),%r2 ;offset 0xacc
901$D3
902 LDW -168(%r30),%r9 ;offset 0xad0
903 LDD -176(%r30),%r8 ;offset 0xad4
904 EXTRD,U %r8,31,32,%r7 ;offset 0xad8
905 LDD -184(%r30),%r6 ;offset 0xadc
906 EXTRD,U %r6,31,32,%r5 ;offset 0xae0
907 LDW -188(%r30),%r4 ;offset 0xae4
908 BVE (%r2) ;offset 0xae8
909 .EXIT
910 LDW,MB -192(%r30),%r3 ;offset 0xaec
911 .PROCEND ;in=23,25;out=28,29;fpin=105,107;
912
913
914
915
916;----------------------------------------------------------------------------
917;
918; Registers to hold 64-bit values to manipulate. The "L" part
919; of the register corresponds to the upper 32-bits, while the "R"
920; part corresponds to the lower 32-bits
921;
922; Note that when using b6 and b7, the code must save them
923; before use because they are callee-save registers
924;
925;
926; Floating point registers to use to save values that
927; are manipulated. These don't collide with ftemp1-6 and
928; are all caller save registers
929;
930a0 .reg %fr22
931a0L .reg %fr22L
932a0R .reg %fr22R
933
934a1 .reg %fr23
935a1L .reg %fr23L
936a1R .reg %fr23R
937
938a2 .reg %fr24
939a2L .reg %fr24L
940a2R .reg %fr24R
941
942a3 .reg %fr25
943a3L .reg %fr25L
944a3R .reg %fr25R
945
946a4 .reg %fr26
947a4L .reg %fr26L
948a4R .reg %fr26R
949
950a5 .reg %fr27
951a5L .reg %fr27L
952a5R .reg %fr27R
953
954a6 .reg %fr28
955a6L .reg %fr28L
956a6R .reg %fr28R
957
958a7 .reg %fr29
959a7L .reg %fr29L
960a7R .reg %fr29R
961
962b0 .reg %fr30
963b0L .reg %fr30L
964b0R .reg %fr30R
965
966b1 .reg %fr31
967b1L .reg %fr31L
968b1R .reg %fr31R
969
970;
971; Temporary floating point variables, these are all caller save
972; registers
973;
974ftemp1 .reg %fr4
975ftemp2 .reg %fr5
976ftemp3 .reg %fr6
977ftemp4 .reg %fr7
978
979;
980; The B set of registers when used.
981;
982
983b2 .reg %fr8
984b2L .reg %fr8L
985b2R .reg %fr8R
986
987b3 .reg %fr9
988b3L .reg %fr9L
989b3R .reg %fr9R
990
991b4 .reg %fr10
992b4L .reg %fr10L
993b4R .reg %fr10R
994
995b5 .reg %fr11
996b5L .reg %fr11L
997b5R .reg %fr11R
998
999b6 .reg %fr12
1000b6L .reg %fr12L
1001b6R .reg %fr12R
1002
1003b7 .reg %fr13
1004b7L .reg %fr13L
1005b7R .reg %fr13R
1006
1007c1 .reg %r21 ; only reg
1008temp1 .reg %r20 ; only reg
1009temp2 .reg %r19 ; only reg
1010temp3 .reg %r31 ; only reg
1011
1012m1 .reg %r28
1013c2 .reg %r23
1014high_one .reg %r1
1015ht .reg %r6
1016lt .reg %r5
1017m .reg %r4
1018c3 .reg %r3
1019
1020SQR_ADD_C .macro A0L,A0R,C1,C2,C3
1021 XMPYU A0L,A0R,ftemp1 ; m
1022 FSTD ftemp1,-24(%sp) ; store m
1023
1024 XMPYU A0R,A0R,ftemp2 ; lt
1025 FSTD ftemp2,-16(%sp) ; store lt
1026
1027 XMPYU A0L,A0L,ftemp3 ; ht
1028 FSTD ftemp3,-8(%sp) ; store ht
1029
1030 LDD -24(%sp),m ; load m
1031 AND m,high_mask,temp2 ; m & Mask
1032 DEPD,Z m,30,31,temp3 ; m << 32+1
1033 LDD -16(%sp),lt ; lt
1034
1035 LDD -8(%sp),ht ; ht
1036 EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1
1037 ADD temp3,lt,lt ; lt = lt+m
1038 ADD,L ht,temp1,ht ; ht += temp1
1039 ADD,DC ht,%r0,ht ; ht++
1040
1041 ADD C1,lt,C1 ; c1=c1+lt
1042 ADD,DC ht,%r0,ht ; ht++
1043
1044 ADD C2,ht,C2 ; c2=c2+ht
1045 ADD,DC C3,%r0,C3 ; c3++
1046.endm
1047
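;
; SQR_ADD_C accumulates the 128-bit square a*a into the three-word
; accumulator (C1,C2,C3); SQR_ADD_C2 below does the same for the
; doubled cross product 2*a*b. In C terms (a sketch, 128-bit
; intermediate for clarity):
;
;   /* (c3,c2,c1) += a*a */
;   unsigned __int128 t = (unsigned __int128)a * a;
;   BN_ULONG lt = (BN_ULONG)t, ht = (BN_ULONG)(t >> 64);
;   c1 += lt;  ht += (c1 < lt);    /* carry into high word      */
;   c2 += ht;  c3 += (c2 < ht);    /* carry into the third word */
;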
1048SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
1049 XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht
1050 FSTD ftemp1,-16(%sp) ;
1051 XMPYU A0R,A1L,ftemp2 ; m = bh*lt
1052 FSTD ftemp2,-8(%sp) ;
1053 XMPYU A0R,A1R,ftemp3 ; lt = bl*lt
1054 FSTD ftemp3,-32(%sp)
1055 XMPYU A0L,A1L,ftemp4 ; ht = bh*ht
1056 FSTD ftemp4,-24(%sp) ;
1057
1058 LDD -8(%sp),m ; r21 = m
1059 LDD -16(%sp),m1 ; r19 = m1
1060 ADD,L m,m1,m ; m+m1
1061
1062 DEPD,Z m,31,32,temp3 ; temp3 = (m+m1)<<32
1063 LDD -24(%sp),ht ; r24 = ht
1064
1065 CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
1066 ADD,L ht,high_one,ht ; ht+=high_one
1067
1068 EXTRD,U m,31,32,temp1 ; m >> 32
1069 LDD -32(%sp),lt ; lt
1070 ADD,L ht,temp1,ht ; ht+= m>>32
1071 ADD lt,temp3,lt ; lt = lt + ((m+m1)<<32)
1072 ADD,DC ht,%r0,ht ; ht++
1073
1074 ADD ht,ht,ht ; ht=ht+ht;
1075 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1076
1077 ADD lt,lt,lt ; lt=lt+lt;
1078 ADD,DC ht,%r0,ht ; add in carry (ht++)
1079
1080 ADD C1,lt,C1 ; c1=c1+lt
1081 ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++)
1082 LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise
1083
1084 ADD C2,ht,C2 ; c2 = c2 + ht
1085 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1086.endm
1087
1088;
1089;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
1090; arg0 = r_ptr
1091; arg1 = a_ptr
1092;
1093
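;
; The squaring walks the result words r[0]..r[15] one diagonal at a
; time, rotating the three carry words as it goes. Output word k is
; built roughly as follows (a sketch; sqr_add_c/sqr_add_c2 are
; hypothetical C analogues of the macros above, indices outside 0..7
; omitted):
;
;   for (i = 0, j = k; i <= j; i++, j--) {
;           if (i == j)
;                   sqr_add_c(a[i], &c1, &c2, &c3);        /* a[i]^2      */
;           else
;                   sqr_add_c2(a[j], a[i], &c1, &c2, &c3); /* 2*a[i]*a[j] */
;   }
;   r[k] = c1;  c1 = c2;  c2 = c3;  c3 = 0;
;
; In the assembly the carry rotation is done by permuting the c1/c2/c3
; macro arguments rather than by copying.
;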
1094bn_sqr_comba8
1095 .PROC
1096 .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1097 .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1098 .ENTRY
1099 .align 64
1100
1101 STD %r3,0(%sp) ; save r3
1102 STD %r4,8(%sp) ; save r4
1103 STD %r5,16(%sp) ; save r5
1104 STD %r6,24(%sp) ; save r6
1105
1106 ;
1107 ; Zero out carries
1108 ;
1109 COPY %r0,c1
1110 COPY %r0,c2
1111 COPY %r0,c3
1112
1113 LDO 128(%sp),%sp ; bump stack
1114 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
1115 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1116
1117 ;
1118 ; Load up all of the values we are going to use
1119 ;
1120 FLDD 0(a_ptr),a0
1121 FLDD 8(a_ptr),a1
1122 FLDD 16(a_ptr),a2
1123 FLDD 24(a_ptr),a3
1124 FLDD 32(a_ptr),a4
1125 FLDD 40(a_ptr),a5
1126 FLDD 48(a_ptr),a6
1127 FLDD 56(a_ptr),a7
1128
1129 SQR_ADD_C a0L,a0R,c1,c2,c3
1130 STD c1,0(r_ptr) ; r[0] = c1;
1131 COPY %r0,c1
1132
1133 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
1134 STD c2,8(r_ptr) ; r[1] = c2;
1135 COPY %r0,c2
1136
1137 SQR_ADD_C a1L,a1R,c3,c1,c2
1138 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
1139 STD c3,16(r_ptr) ; r[2] = c3;
1140 COPY %r0,c3
1141
1142 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
1143 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
1144 STD c1,24(r_ptr) ; r[3] = c1;
1145 COPY %r0,c1
1146
1147 SQR_ADD_C a2L,a2R,c2,c3,c1
1148 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
1149 SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
1150 STD c2,32(r_ptr) ; r[4] = c2;
1151 COPY %r0,c2
1152
1153 SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
1154 SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
1155 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
1156 STD c3,40(r_ptr) ; r[5] = c3;
1157 COPY %r0,c3
1158
1159 SQR_ADD_C a3L,a3R,c1,c2,c3
1160 SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
1161 SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
1162 SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
1163 STD c1,48(r_ptr) ; r[6] = c1;
1164 COPY %r0,c1
1165
1166 SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
1167 SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
1168 SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
1169 SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
1170 STD c2,56(r_ptr) ; r[7] = c2;
1171 COPY %r0,c2
1172
1173 SQR_ADD_C a4L,a4R,c3,c1,c2
1174 SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
1175 SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
1176 SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
1177 STD c3,64(r_ptr) ; r[8] = c3;
1178 COPY %r0,c3
1179
1180 SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
1181 SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
1182 SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
1183 STD c1,72(r_ptr) ; r[9] = c1;
1184 COPY %r0,c1
1185
1186 SQR_ADD_C a5L,a5R,c2,c3,c1
1187 SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
1188 SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
1189 STD c2,80(r_ptr) ; r[10] = c2;
1190 COPY %r0,c2
1191
1192 SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
1193 SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
1194 STD c3,88(r_ptr) ; r[11] = c3;
1195 COPY %r0,c3
1196
1197 SQR_ADD_C a6L,a6R,c1,c2,c3
1198 SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
1199 STD c1,96(r_ptr) ; r[12] = c1;
1200 COPY %r0,c1
1201
1202 SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
1203 STD c2,104(r_ptr) ; r[13] = c2;
1204 COPY %r0,c2
1205
1206 SQR_ADD_C a7L,a7R,c3,c1,c2
1207 STD c3, 112(r_ptr) ; r[14] = c3
1208 STD c1, 120(r_ptr) ; r[15] = c1
1209
1210 .EXIT
1211 LDD -104(%sp),%r6 ; restore r6
1212 LDD -112(%sp),%r5 ; restore r5
1213 LDD -120(%sp),%r4 ; restore r4
1214 BVE (%rp)
1215 LDD,MB -128(%sp),%r3
1216
1217 .PROCEND
1218
1219;-----------------------------------------------------------------------------
1220;
1221;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
1222; arg0 = r_ptr
1223; arg1 = a_ptr
1224;
1225
1226bn_sqr_comba4
1227 .proc
1228 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1229 .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1230 .entry
1231 .align 64
1232 STD %r3,0(%sp) ; save r3
1233 STD %r4,8(%sp) ; save r4
1234 STD %r5,16(%sp) ; save r5
1235 STD %r6,24(%sp) ; save r6
1236
1237 ;
1238 ; Zero out carries
1239 ;
1240 COPY %r0,c1
1241 COPY %r0,c2
1242 COPY %r0,c3
1243
1244 LDO 128(%sp),%sp ; bump stack
1245 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
1246 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1247
1248 ;
1249 ; Load up all of the values we are going to use
1250 ;
1251 FLDD 0(a_ptr),a0
1252 FLDD 8(a_ptr),a1
1253 FLDD 16(a_ptr),a2
1254 FLDD 24(a_ptr),a3
1255 FLDD 32(a_ptr),a4
1256 FLDD 40(a_ptr),a5
1257 FLDD 48(a_ptr),a6
1258 FLDD 56(a_ptr),a7
1259
1260 SQR_ADD_C a0L,a0R,c1,c2,c3
1261
1262 STD c1,0(r_ptr) ; r[0] = c1;
1263 COPY %r0,c1
1264
1265 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
1266
1267 STD c2,8(r_ptr) ; r[1] = c2;
1268 COPY %r0,c2
1269
1270 SQR_ADD_C a1L,a1R,c3,c1,c2
1271 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
1272
1273 STD c3,16(r_ptr) ; r[2] = c3;
1274 COPY %r0,c3
1275
1276 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
1277 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
1278
1279 STD c1,24(r_ptr) ; r[3] = c1;
1280 COPY %r0,c1
1281
1282 SQR_ADD_C a2L,a2R,c2,c3,c1
1283 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
1284
1285 STD c2,32(r_ptr) ; r[4] = c2;
1286 COPY %r0,c2
1287
1288 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
1289 STD c3,40(r_ptr) ; r[5] = c3;
1290 COPY %r0,c3
1291
1292 SQR_ADD_C a3L,a3R,c1,c2,c3
1293 STD c1,48(r_ptr) ; r[6] = c1;
1294 STD c2,56(r_ptr) ; r[7] = c2;
1295
1296 .EXIT
1297 LDD -104(%sp),%r6 ; restore r6
1298 LDD -112(%sp),%r5 ; restore r5
1299 LDD -120(%sp),%r4 ; restore r4
1300 BVE (%rp)
1301 LDD,MB -128(%sp),%r3
1302
1303 .PROCEND
1304
1305
1306;---------------------------------------------------------------------------
1307
1308MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
1309 XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht
1310 FSTD ftemp1,-16(%sp) ;
1311 XMPYU A0R,B0L,ftemp2 ; m = bh*lt
1312 FSTD ftemp2,-8(%sp) ;
1313 XMPYU A0R,B0R,ftemp3 ; lt = bl*lt
1314 FSTD ftemp3,-32(%sp)
1315 XMPYU A0L,B0L,ftemp4 ; ht = bh*ht
1316 FSTD ftemp4,-24(%sp) ;
1317
1318 LDD -8(%sp),m ; r21 = m
1319 LDD -16(%sp),m1 ; r19 = m1
1320 ADD,L m,m1,m ; m+m1
1321
1322 DEPD,Z m,31,32,temp3 ; temp3 = (m+m1)<<32
1323 LDD -24(%sp),ht ; r24 = ht
1324
1325 CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
1326 ADD,L ht,high_one,ht ; ht+=high_one
1327
1328 EXTRD,U m,31,32,temp1 ; m >> 32
1329 LDD -32(%sp),lt ; lt
1330 ADD,L ht,temp1,ht ; ht+= m>>32
1331 ADD lt,temp3,lt ; lt = lt + ((m+m1)<<32)
1332 ADD,DC ht,%r0,ht ; ht++
1333
1334 ADD C1,lt,C1 ; c1=c1+lt
1335 ADD,DC ht,%r0,ht ; add in carry (ht++)
1336
1337 ADD C2,ht,C2 ; c2 = c2 + ht
1338 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1339.endm
1340
1341
1342;
1343;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1344; arg0 = r_ptr
1345; arg1 = a_ptr
1346; arg2 = b_ptr
1347;
1348
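;
; MUL_ADD_C above is the multiply-accumulate analogue of SQR_ADD_C:
; it adds the full 128-bit product a*b into (C1,C2,C3) without the
; doubling that SQR_ADD_C2 performs. A usage sketch of the routine it
; builds (illustrative only):
;
;   BN_ULONG r[16], a[8], b[8];
;   /* ... fill a and b ... */
;   bn_mul_comba8(r, a, b);        /* r = a * b, no carry in or out */
;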
1349bn_mul_comba8
1350 .proc
1351 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1352 .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1353 .entry
1354 .align 64
1355
1356 STD %r3,0(%sp) ; save r3
1357 STD %r4,8(%sp) ; save r4
1358 STD %r5,16(%sp) ; save r5
1359 STD %r6,24(%sp) ; save r6
1360 FSTD %fr12,32(%sp) ; save fr12
1361 FSTD %fr13,40(%sp) ; save fr13
1362
1363 ;
1364 ; Zero out carries
1365 ;
1366 COPY %r0,c1
1367 COPY %r0,c2
1368 COPY %r0,c3
1369
1370 LDO 128(%sp),%sp ; bump stack
1371 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1372
1373 ;
1374 ; Load up all of the values we are going to use
1375 ;
1376 FLDD 0(a_ptr),a0
1377 FLDD 8(a_ptr),a1
1378 FLDD 16(a_ptr),a2
1379 FLDD 24(a_ptr),a3
1380 FLDD 32(a_ptr),a4
1381 FLDD 40(a_ptr),a5
1382 FLDD 48(a_ptr),a6
1383 FLDD 56(a_ptr),a7
1384
1385 FLDD 0(b_ptr),b0
1386 FLDD 8(b_ptr),b1
1387 FLDD 16(b_ptr),b2
1388 FLDD 24(b_ptr),b3
1389 FLDD 32(b_ptr),b4
1390 FLDD 40(b_ptr),b5
1391 FLDD 48(b_ptr),b6
1392 FLDD 56(b_ptr),b7
1393
1394 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
1395 STD c1,0(r_ptr)
1396 COPY %r0,c1
1397
1398 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
1399 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
1400 STD c2,8(r_ptr)
1401 COPY %r0,c2
1402
1403 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
1404 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
1405 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
1406 STD c3,16(r_ptr)
1407 COPY %r0,c3
1408
1409 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
1410 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
1411 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
1412 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
1413 STD c1,24(r_ptr)
1414 COPY %r0,c1
1415
1416 MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
1417 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
1418 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
1419 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
1420 MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
1421 STD c2,32(r_ptr)
1422 COPY %r0,c2
1423
1424 MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
1425 MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
1426 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
1427 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
1428 MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
1429 MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
1430 STD c3,40(r_ptr)
1431 COPY %r0,c3
1432
1433 MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
1434 MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
1435 MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
1436 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
1437 MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
1438 MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
1439 MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
1440 STD c1,48(r_ptr)
1441 COPY %r0,c1
1442
1443 MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
1444 MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
1445 MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
1446 MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
1447 MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
1448 MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
1449 MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
1450 MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
1451 STD c2,56(r_ptr)
1452 COPY %r0,c2
1453
1454 MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
1455 MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
1456 MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
1457 MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
1458 MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
1459 MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
1460 MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
1461 STD c3,64(r_ptr)
1462 COPY %r0,c3
1463
1464 MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
1465 MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
1466 MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
1467 MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
1468 MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
1469 MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
1470 STD c1,72(r_ptr)
1471 COPY %r0,c1
1472
1473 MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
1474 MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
1475 MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
1476 MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
1477 MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
1478 STD c2,80(r_ptr)
1479 COPY %r0,c2
1480
1481 MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
1482 MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
1483 MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
1484 MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
1485 STD c3,88(r_ptr)
1486 COPY %r0,c3
1487
1488 MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
1489 MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
1490 MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
1491 STD c1,96(r_ptr)
1492 COPY %r0,c1
1493
1494 MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
1495 MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
1496 STD c2,104(r_ptr)
1497 COPY %r0,c2
1498
1499 MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
1500 STD c3,112(r_ptr)
1501 STD c1,120(r_ptr)
1502
1503 .EXIT
1504 FLDD -88(%sp),%fr13
1505 FLDD -96(%sp),%fr12
1506 LDD -104(%sp),%r6 ; restore r6
1507 LDD -112(%sp),%r5 ; restore r5
1508 LDD -120(%sp),%r4 ; restore r4
1509 BVE (%rp)
1510 LDD,MB -128(%sp),%r3
1511
1512 .PROCEND
1513
1514;-----------------------------------------------------------------------------
1515;
1516;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1517; arg0 = r_ptr
1518; arg1 = a_ptr
1519; arg2 = b_ptr
1520;
1521
1522bn_mul_comba4
1523 .proc
1524 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1525 .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1526 .entry
1527 .align 64
1528
1529 STD %r3,0(%sp) ; save r3
1530 STD %r4,8(%sp) ; save r4
1531 STD %r5,16(%sp) ; save r5
1532 STD %r6,24(%sp) ; save r6
1533 FSTD %fr12,32(%sp) ; save fr12
1534 FSTD %fr13,40(%sp) ; save fr13
1535
1536 ;
1537 ; Zero out carries
1538 ;
1539 COPY %r0,c1
1540 COPY %r0,c2
1541 COPY %r0,c3
1542
1543 LDO 128(%sp),%sp ; bump stack
1544 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1545
1546 ;
1547 ; Load up all of the values we are going to use
1548 ;
1549 FLDD 0(a_ptr),a0
1550 FLDD 8(a_ptr),a1
1551 FLDD 16(a_ptr),a2
1552 FLDD 24(a_ptr),a3
1553
1554 FLDD 0(b_ptr),b0
1555 FLDD 8(b_ptr),b1
1556 FLDD 16(b_ptr),b2
1557 FLDD 24(b_ptr),b3
1558
1559 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
1560 STD c1,0(r_ptr)
1561 COPY %r0,c1
1562
1563 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
1564 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
1565 STD c2,8(r_ptr)
1566 COPY %r0,c2
1567
1568 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
1569 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
1570 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
1571 STD c3,16(r_ptr)
1572 COPY %r0,c3
1573
1574 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
1575 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
1576 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
1577 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
1578 STD c1,24(r_ptr)
1579 COPY %r0,c1
1580
1581 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
1582 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
1583 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
1584 STD c2,32(r_ptr)
1585 COPY %r0,c2
1586
1587 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
1588 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
1589 STD c3,40(r_ptr)
1590 COPY %r0,c3
1591
1592 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
1593 STD c1,48(r_ptr)
1594 STD c2,56(r_ptr)
1595
1596 .EXIT
1597 FLDD -88(%sp),%fr13
1598 FLDD -96(%sp),%fr12
1599 LDD -104(%sp),%r6 ; restore r6
1600 LDD -112(%sp),%r5 ; restore r5
1601 LDD -120(%sp),%r4 ; restore r4
1602 BVE (%rp)
1603 LDD,MB -128(%sp),%r3
1604
1605 .PROCEND
1606
1607
1608;--- not PIC .SPACE $TEXT$
1609;--- not PIC .SUBSPA $CODE$
1610;--- not PIC .SPACE $PRIVATE$,SORT=16
1611;--- not PIC .IMPORT $global$,DATA
1612;--- not PIC .SPACE $TEXT$
1613;--- not PIC .SUBSPA $CODE$
1614;--- not PIC .SUBSPA $LIT$,ACCESS=0x2c
1615;--- not PIC C$7
1616;--- not PIC .ALIGN 8
1617;--- not PIC .STRINGZ "Division would overflow (%d)\n"
1618 .END
diff --git a/src/lib/libcrypto/bn/asm/pa-risc2W.s b/src/lib/libcrypto/bn/asm/pa-risc2W.s
deleted file mode 100644
index a99545754d..0000000000
--- a/src/lib/libcrypto/bn/asm/pa-risc2W.s
+++ /dev/null
@@ -1,1605 +0,0 @@
1;
2; PA-RISC 64-bit implementation of bn_asm code
3;
4; This code is approximately 2x faster than the C version
5; for RSA/DSA.
6;
7; See http://devresource.hp.com/ for more details on the PA-RISC
8; architecture. Also see the book "PA-RISC 2.0 Architecture"
9; by Gerry Kane for information on the instruction set architecture.
10;
11; Code written by Chris Ruemmler (with some help from the HP C
12; compiler).
13;
14; The code compiles with HP's assembler
15;
16
17 .level 2.0W
18 .space $TEXT$
19 .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
20
21;
22; Global Register definitions used for the routines.
23;
24; Some information about HP's runtime architecture for 64-bits.
25;
26; "Caller save" means the calling function must save the register
27; if it wants the register to be preserved.
28; "Callee save" means if a function uses the register, it must save
29; the value before using it.
30;
31; For the floating point registers
32;
33; "caller save" registers: fr4-fr11, fr22-fr31
34; "callee save" registers: fr12-fr21
35; "special" registers: fr0-fr3 (status and exception registers)
36;
37; For the integer registers
38; value zero : r0
39; "caller save" registers: r1,r19-r26
40; "callee save" registers: r3-r18
41; return register : r2 (rp)
42; return value ; r28 (ret0)
43; Stack pointer ; r30 (sp)
44; global data pointer ; r27 (dp)
45; argument pointer ; r29 (ap)
46; millicode return ptr ; r31 (also a caller save register)
47
48
49;
50; Arguments to the routines
51;
52r_ptr .reg %r26
53a_ptr .reg %r25
54b_ptr .reg %r24
55num .reg %r24
56w .reg %r23
57n .reg %r23
58
59
60;
61; Globals used in some routines
62;
63
64top_overflow .reg %r29
65high_mask .reg %r22 ; value 0xffffffff80000000L
66
67
68;------------------------------------------------------------------------------
69;
70; bn_mul_add_words
71;
72;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
73; int num, BN_ULONG w)
74;
75; arg0 = r_ptr
76; arg1 = a_ptr
77; arg2 = num
78; arg3 = w
79;
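;
; Unlike the 32-bit variant in pa-risc2.s, the 2.0W runtime passes w
; in a register and returns the full 64-bit result in %ret0 alone.
; A sketch of the register mapping (illustrative only):
;
;   BN_ULONG bn_mul_add_words(BN_ULONG *rp /* %r26 */,
;       BN_ULONG *ap /* %r25 */, int num /* %r24 */,
;       BN_ULONG w /* %r23 */);    /* result in %r28 (%ret0) */
;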
80; Local register definitions
81;
82
83fm1 .reg %fr22
84fm .reg %fr23
85ht_temp .reg %fr24
86ht_temp_1 .reg %fr25
87lt_temp .reg %fr26
88lt_temp_1 .reg %fr27
89fm1_1 .reg %fr28
90fm_1 .reg %fr29
91
92fw_h .reg %fr7L
93fw_l .reg %fr7R
94fw .reg %fr7
95
96fht_0 .reg %fr8L
97flt_0 .reg %fr8R
98t_float_0 .reg %fr8
99
100fht_1 .reg %fr9L
101flt_1 .reg %fr9R
102t_float_1 .reg %fr9
103
104tmp_0 .reg %r31
105tmp_1 .reg %r21
106m_0 .reg %r20
107m_1 .reg %r19
108ht_0 .reg %r1
109ht_1 .reg %r3
110lt_0 .reg %r4
111lt_1 .reg %r5
112m1_0 .reg %r6
113m1_1 .reg %r7
114rp_val .reg %r8
115rp_val_1 .reg %r9
116
117bn_mul_add_words
118 .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
119 .proc
120 .callinfo frame=128
121 .entry
122 .align 64
123
124 STD %r3,0(%sp) ; save r3
125 STD %r4,8(%sp) ; save r4
126 NOP ; Needed to make the loop 16-byte aligned
127 NOP ; Needed to make the loop 16-byte aligned
128
129 STD %r5,16(%sp) ; save r5
130 STD %r6,24(%sp) ; save r6
131 STD %r7,32(%sp) ; save r7
132 STD %r8,40(%sp) ; save r8
133
134 STD %r9,48(%sp) ; save r9
135 COPY %r0,%ret0 ; return 0 by default
136 DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
137 STD w,56(%sp) ; store w on stack
138
139 CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit
140 LDO 128(%sp),%sp ; bump stack
141
142 ;
143 ; The loop is unrolled twice, so if there is only 1 number
144 ; then go straight to the cleanup code.
145 ;
146 CMPIB,= 1,num,bn_mul_add_words_single_top
147 FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l)
148
149 ;
150 ; This loop is unrolled 2 times (64-byte aligned as well)
151 ;
152 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
153; two 32-bit multiplies can be issued per cycle.
154 ;
155bn_mul_add_words_unroll2
156
157 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
158 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
159 LDD 0(r_ptr),rp_val ; rp[0]
160 LDD 8(r_ptr),rp_val_1 ; rp[1]
161
162 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
163 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
164 FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
165 FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
166
167 XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
168 XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
169 FSTD fm,-8(%sp) ; -8(sp) = m[0]
170 FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
171
172 XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
173 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
174 FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp
175 FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1
176
177 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
178 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp_1 = flt_1*fw_l
179 FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp
180 FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1
181
182 LDD -8(%sp),m_0 ; m[0]
183 LDD -40(%sp),m_1 ; m[1]
184 LDD -16(%sp),m1_0 ; m1[0]
185 LDD -48(%sp),m1_1 ; m1[1]
186
187 LDD -24(%sp),ht_0 ; ht[0]
188 LDD -56(%sp),ht_1 ; ht[1]
189 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
190 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
191
192 LDD -32(%sp),lt_0
193 LDD -64(%sp),lt_1
194 CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0])
195 ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
196
197 CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1])
198 ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
199 EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32
200 DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32
201
202 EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32
203 DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32
204 ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
205 ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
206
207 ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
208 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
209 ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
210 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
211
212 ADD %ret0,lt_0,lt_0 ; lt[0] = lt[0] + c;
213 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
214 ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0]
215 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
216
217 LDO -2(num),num ; num = num - 2;
218 ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c);
219 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
220 STD lt_0,0(r_ptr) ; rp[0] = lt[0]
221
222 ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1]
223 ADD,DC ht_1,%r0,%ret0 ; ht[1]++
224 LDO 16(a_ptr),a_ptr ; a_ptr += 2
225
226 STD lt_1,8(r_ptr) ; rp[1] = lt[1]
227 CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
228 LDO 16(r_ptr),r_ptr ; r_ptr += 2
229
230 CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
231
232 ;
233 ; Top of loop aligned on 64-byte boundary
234 ;
235bn_mul_add_words_single_top
236 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
237 LDD 0(r_ptr),rp_val ; rp[0]
238 LDO 8(a_ptr),a_ptr ; a_ptr++
239 XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
240 FSTD fm1,-16(%sp) ; -16(sp) = m1
241 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
242 FSTD fm,-8(%sp) ; -8(sp) = m
243 XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
244 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
245 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
246 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
247
248 LDD -8(%sp),m_0
249 LDD -16(%sp),m1_0 ; m1 = temp1
250 ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
251 LDD -24(%sp),ht_0
252 LDD -32(%sp),lt_0
253
254 CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
255 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
256
257 EXTRD,U tmp_0,31,32,m_0 ; m>>32
258 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
259
260 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
261 ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1;
262 ADD,DC ht_0,%r0,ht_0 ; ht++
263 ADD %ret0,tmp_0,lt_0 ; lt = lt + c;
264 ADD,DC ht_0,%r0,ht_0 ; ht++
265 ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0]
266 ADD,DC ht_0,%r0,%ret0 ; ht++
267 STD lt_0,0(r_ptr) ; rp[0] = lt
268
269bn_mul_add_words_exit
270 .EXIT
271 LDD -80(%sp),%r9 ; restore r9
272 LDD -88(%sp),%r8 ; restore r8
273 LDD -96(%sp),%r7 ; restore r7
274 LDD -104(%sp),%r6 ; restore r6
275 LDD -112(%sp),%r5 ; restore r5
276 LDD -120(%sp),%r4 ; restore r4
277 BVE (%rp)
278 LDD,MB -128(%sp),%r3 ; restore r3
279 .PROCEND ;in=23,24,25,26,29;out=28;
280
281;----------------------------------------------------------------------------
282;
283;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
284;
285; arg0 = rp
286; arg1 = ap
287; arg2 = num
288; arg3 = w
289
290bn_mul_words
291 .proc
292 .callinfo frame=128
293 .entry
294 .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
295 .align 64
296
297 STD %r3,0(%sp) ; save r3
298 STD %r4,8(%sp) ; save r4
299 STD %r5,16(%sp) ; save r5
300 STD %r6,24(%sp) ; save r6
301
302 STD %r7,32(%sp) ; save r7
303 COPY %r0,%ret0 ; return 0 by default
304 DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
305 STD w,56(%sp) ; w on stack
306
307 CMPIB,>= 0,num,bn_mul_words_exit
308 LDO 128(%sp),%sp ; bump stack
309
310 ;
311 ; See if only 1 word to do, thus just do cleanup
312 ;
313 CMPIB,= 1,num,bn_mul_words_single_top
314 FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l)
315
316 ;
317 ; This loop is unrolled 2 times (64-byte aligned as well)
318 ;
319 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
320; two 32-bit multiplies can be issued per cycle.
321 ;
322bn_mul_words_unroll2
323
324 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
325 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
326 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
327 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
328
329 FSTD fm1,-16(%sp) ; -16(sp) = m1
330 FSTD fm1_1,-48(%sp) ; -48(sp) = m1
331 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
332 XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
333
334 FSTD fm,-8(%sp) ; -8(sp) = m
335 FSTD fm_1,-40(%sp) ; -40(sp) = m
336 XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
337 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
338
339 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
340 FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht
341 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
342 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp_1 = flt_1*fw_l
343
344 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
345 FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt
346 LDD -8(%sp),m_0
347 LDD -40(%sp),m_1
348
349 LDD -16(%sp),m1_0
350 LDD -48(%sp),m1_1
351 LDD -24(%sp),ht_0
352 LDD -56(%sp),ht_1
353
354 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1;
355 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1;
356 LDD -32(%sp),lt_0
357 LDD -64(%sp),lt_1
358
359 CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1)
360 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
361 CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1)
362 ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32)
363
364 EXTRD,U tmp_0,31,32,m_0 ; m>>32
365 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
366 EXTRD,U tmp_1,31,32,m_1 ; m>>32
367 DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32
368
369 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
370 ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32)
371 ADD lt_0,m1_0,lt_0 ; lt = lt+m1;
372 ADD,DC ht_0,%r0,ht_0 ; ht++
373
374 ADD lt_1,m1_1,lt_1 ; lt = lt+m1;
375 ADD,DC ht_1,%r0,ht_1 ; ht++
376 ADD %ret0,lt_0,lt_0 ; lt = lt + c (ret0);
377 ADD,DC ht_0,%r0,ht_0 ; ht++
378
379 ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0)
380 ADD,DC ht_1,%r0,ht_1 ; ht++
381 STD lt_0,0(r_ptr) ; rp[0] = lt
382 STD lt_1,8(r_ptr) ; rp[1] = lt
383
384 COPY ht_1,%ret0 ; carry = ht
385 LDO -2(num),num ; num = num - 2;
386 LDO 16(a_ptr),a_ptr ; ap += 2
387 CMPIB,<= 2,num,bn_mul_words_unroll2
388 LDO 16(r_ptr),r_ptr ; rp++
389
390 CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
391
392 ;
393 ; Top of loop aligned on 64-byte boundary
394 ;
395bn_mul_words_single_top
396 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
397
398 XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
399 FSTD fm1,-16(%sp) ; -16(sp) = m1
400 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
401 FSTD fm,-8(%sp) ; -8(sp) = m
402 XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
403 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
404 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
405 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
406
407 LDD -8(%sp),m_0
408 LDD -16(%sp),m1_0
409 ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
410 LDD -24(%sp),ht_0
411 LDD -32(%sp),lt_0
412
413 CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
414 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
415
416 EXTRD,U tmp_0,31,32,m_0 ; m>>32
417 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
418
419 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
420 ADD lt_0,m1_0,lt_0 ; lt= lt+m1;
421 ADD,DC ht_0,%r0,ht_0 ; ht++
422
423 ADD %ret0,lt_0,lt_0 ; lt = lt + c;
424 ADD,DC ht_0,%r0,ht_0 ; ht++
425
426 COPY ht_0,%ret0 ; copy carry
427 STD lt_0,0(r_ptr) ; rp[0] = lt
428
429bn_mul_words_exit
430 .EXIT
431 LDD -96(%sp),%r7 ; restore r7
432 LDD -104(%sp),%r6 ; restore r6
433 LDD -112(%sp),%r5 ; restore r5
434 LDD -120(%sp),%r4 ; restore r4
435 BVE (%rp)
436 LDD,MB -128(%sp),%r3 ; restore r3
437 .PROCEND ;in=23,24,25,26,29;out=28;
438
439;----------------------------------------------------------------------------
440;
441;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
442;
443; arg0 = rp
444; arg1 = ap
445; arg2 = num
446;
447
448bn_sqr_words
449 .proc
450 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
451 .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
452 .entry
453 .align 64
454
455 STD %r3,0(%sp) ; save r3
456 STD %r4,8(%sp) ; save r4
457 NOP
458 STD %r5,16(%sp) ; save r5
459
460 CMPIB,>= 0,num,bn_sqr_words_exit
461 LDO 128(%sp),%sp ; bump stack
462
463 ;
464 ; If only 1, then go straight to cleanup
465 ;
466 CMPIB,= 1,num,bn_sqr_words_single_top
467 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
468
469 ;
470 ; This loop is unrolled 2 times (64-byte aligned as well)
471 ;
472
473bn_sqr_words_unroll2
474 FLDD 0(a_ptr),t_float_0 ; a[0]
475 FLDD 8(a_ptr),t_float_1 ; a[1]
476 XMPYU fht_0,flt_0,fm ; m[0]
477 XMPYU fht_1,flt_1,fm_1 ; m[1]
478
479 FSTD fm,-24(%sp) ; store m[0]
480 FSTD fm_1,-56(%sp) ; store m[1]
481 XMPYU flt_0,flt_0,lt_temp ; lt[0]
482 XMPYU flt_1,flt_1,lt_temp_1 ; lt[1]
483
484 FSTD lt_temp,-16(%sp) ; store lt[0]
485 FSTD lt_temp_1,-48(%sp) ; store lt[1]
486 XMPYU fht_0,fht_0,ht_temp ; ht[0]
487 XMPYU fht_1,fht_1,ht_temp_1 ; ht[1]
488
489 FSTD ht_temp,-8(%sp) ; store ht[0]
490 FSTD ht_temp_1,-40(%sp) ; store ht[1]
491 LDD -24(%sp),m_0
492 LDD -56(%sp),m_1
493
494 AND m_0,high_mask,tmp_0 ; m[0] & Mask
495 AND m_1,high_mask,tmp_1 ; m[1] & Mask
496 DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1
497 DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1
498
499 LDD -16(%sp),lt_0
500 LDD -48(%sp),lt_1
501 EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1
502 EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1
503
504 LDD -8(%sp),ht_0
505 LDD -40(%sp),ht_1
506 ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0
507 ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1
508
509 ADD lt_0,m_0,lt_0 ; lt = lt+m
510 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
511 STD lt_0,0(r_ptr) ; rp[0] = lt[0]
512 STD ht_0,8(r_ptr) ; rp[1] = ht[0]
513
514 ADD lt_1,m_1,lt_1 ; lt = lt+m
515 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
516 STD lt_1,16(r_ptr) ; rp[2] = lt[1]
517 STD ht_1,24(r_ptr) ; rp[3] = ht[1]
518
519 LDO -2(num),num ; num = num - 2;
520 LDO 16(a_ptr),a_ptr ; ap += 2
521 CMPIB,<= 2,num,bn_sqr_words_unroll2
522 LDO 32(r_ptr),r_ptr ; rp += 4
523
524 CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
525
526 ;
527 ; Top of loop aligned on 64-byte boundary
528 ;
529bn_sqr_words_single_top
530 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
531
532 XMPYU fht_0,flt_0,fm ; m
533 FSTD fm,-24(%sp) ; store m
534
535 XMPYU flt_0,flt_0,lt_temp ; lt
536 FSTD lt_temp,-16(%sp) ; store lt
537
538 XMPYU fht_0,fht_0,ht_temp ; ht
539 FSTD ht_temp,-8(%sp) ; store ht
540
541 LDD -24(%sp),m_0 ; load m
542 AND m_0,high_mask,tmp_0 ; m & Mask
543 DEPD,Z m_0,30,31,m_0 ; m << 32+1
544 LDD -16(%sp),lt_0 ; lt
545
546 LDD -8(%sp),ht_0 ; ht
547 EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1
548 ADD m_0,lt_0,lt_0 ; lt = lt+m
549 ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0
550 ADD,DC ht_0,%r0,ht_0 ; ht++
551
552 STD lt_0,0(r_ptr) ; rp[0] = lt
553 STD ht_0,8(r_ptr) ; rp[1] = ht
554
555bn_sqr_words_exit
556 .EXIT
557 LDD -112(%sp),%r5 ; restore r5
558 LDD -120(%sp),%r4 ; restore r4
559 BVE (%rp)
560 LDD,MB -128(%sp),%r3
561 .PROCEND ;in=23,24,25,26,29;out=28;
562
563
564;----------------------------------------------------------------------------
565;
566;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
567;
568; arg0 = rp
569; arg1 = ap
570; arg2 = bp
571; arg3 = n
572
573t .reg %r22
574b .reg %r21
575l .reg %r20
576
577bn_add_words
578 .proc
579 .entry
580 .callinfo
581 .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
582 .align 64
583
584 CMPIB,>= 0,n,bn_add_words_exit
585 COPY %r0,%ret0 ; return 0 by default
586
587 ;
588	; If 2 or more numbers, do the loop
589 ;
590 CMPIB,= 1,n,bn_add_words_single_top
591 NOP
592
593 ;
594 ; This loop is unrolled 2 times (64-byte aligned as well)
595 ;
596bn_add_words_unroll2
597 LDD 0(a_ptr),t
598 LDD 0(b_ptr),b
599 ADD t,%ret0,t ; t = t+c;
600 ADD,DC %r0,%r0,%ret0 ; set c to carry
601 ADD t,b,l ; l = t + b[0]
602 ADD,DC %ret0,%r0,%ret0 ; c+= carry
603 STD l,0(r_ptr)
604
605 LDD 8(a_ptr),t
606 LDD 8(b_ptr),b
607 ADD t,%ret0,t ; t = t+c;
608 ADD,DC %r0,%r0,%ret0 ; set c to carry
609 ADD t,b,l ; l = t + b[0]
610 ADD,DC %ret0,%r0,%ret0 ; c+= carry
611 STD l,8(r_ptr)
612
613 LDO -2(n),n
614 LDO 16(a_ptr),a_ptr
615 LDO 16(b_ptr),b_ptr
616
617 CMPIB,<= 2,n,bn_add_words_unroll2
618 LDO 16(r_ptr),r_ptr
619
620 CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
621
622bn_add_words_single_top
623 LDD 0(a_ptr),t
624 LDD 0(b_ptr),b
625
626 ADD t,%ret0,t ; t = t+c;
627 ADD,DC %r0,%r0,%ret0 ; set c to carry (could use CMPCLR??)
628 ADD t,b,l ; l = t + b[0]
629 ADD,DC %ret0,%r0,%ret0 ; c+= carry
630 STD l,0(r_ptr)
631
632bn_add_words_exit
633 .EXIT
634 BVE (%rp)
635 NOP
636 .PROCEND ;in=23,24,25,26,29;out=28;
637
638;----------------------------------------------------------------------------
639;
640;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
641;
642; arg0 = rp
643; arg1 = ap
644; arg2 = bp
645; arg3 = n
646
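;
; A hedged C sketch of the borrow propagation below (illustration only;
; the _ref name is hypothetical): the CMPCLR pair computes the new
; borrow but keeps the old one when t1 == t2, which is the
; "a[i] == b[i]" case here.
;
;	BN_ULONG bn_sub_words_ref(BN_ULONG *r, const BN_ULONG *a,
;	    const BN_ULONG *b, int n)
;	{
;		BN_ULONG c = 0;
;		int i;
;		for (i = 0; i < n; i++) {
;			r[i] = a[i] - b[i] - c;
;			if (a[i] != b[i])
;				c = (a[i] < b[i]);
;		}
;		return (c);
;	}
;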
647t1 .reg %r22
648t2 .reg %r21
649sub_tmp1 .reg %r20
650sub_tmp2 .reg %r19
651
652
653bn_sub_words
654 .proc
655 .callinfo
656 .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
657 .entry
658 .align 64
659
660 CMPIB,>= 0,n,bn_sub_words_exit
661 COPY %r0,%ret0 ; return 0 by default
662
663 ;
664	; If 2 or more numbers, do the loop
665 ;
666 CMPIB,= 1,n,bn_sub_words_single_top
667 NOP
668
669 ;
670 ; This loop is unrolled 2 times (64-byte aligned as well)
671 ;
672bn_sub_words_unroll2
673 LDD 0(a_ptr),t1
674 LDD 0(b_ptr),t2
675 SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
676 SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
677
678 CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
679 LDO 1(%r0),sub_tmp2
680
681 CMPCLR,*= t1,t2,%r0
682 COPY sub_tmp2,%ret0
683 STD sub_tmp1,0(r_ptr)
684
685 LDD 8(a_ptr),t1
686 LDD 8(b_ptr),t2
687 SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
688 SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
689 CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
690 LDO 1(%r0),sub_tmp2
691
692 CMPCLR,*= t1,t2,%r0
693 COPY sub_tmp2,%ret0
694 STD sub_tmp1,8(r_ptr)
695
696 LDO -2(n),n
697 LDO 16(a_ptr),a_ptr
698 LDO 16(b_ptr),b_ptr
699
700 CMPIB,<= 2,n,bn_sub_words_unroll2
701 LDO 16(r_ptr),r_ptr
702
703 CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
704
705bn_sub_words_single_top
706 LDD 0(a_ptr),t1
707 LDD 0(b_ptr),t2
708 SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
709 SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
710 CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
711 LDO 1(%r0),sub_tmp2
712
713 CMPCLR,*= t1,t2,%r0
714 COPY sub_tmp2,%ret0
715
716 STD sub_tmp1,0(r_ptr)
717
718bn_sub_words_exit
719 .EXIT
720 BVE (%rp)
721 NOP
722 .PROCEND ;in=23,24,25,26,29;out=28;
723
724;------------------------------------------------------------------------------
725;
726; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
727;
728; arg0 = h
729; arg1 = l
730; arg2 = d
731;
732; This is mainly just modified assembly from the compiler, thus the
733; lack of variable names.
734;
735;------------------------------------------------------------------------------
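;
; The contract, as a hedged behavioural reference only (not how the
; code below works: it has no 128-bit type, so it instead normalizes d
; and produces two estimated-and-corrected 32-bit quotient digits):
;
;	BN_ULONG bn_div_words_ref(BN_ULONG h, BN_ULONG l, BN_ULONG d)
;	{
;		if (d == 0)
;			return ((BN_ULONG)-1);
;		/* gcc extension; assumes h < d so the quotient fits */
;		return ((BN_ULONG)((((unsigned __int128)h << 64) | l) / d));
;	}
;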
736bn_div_words
737 .proc
738 .callinfo CALLER,FRAME=272,ENTRY_GR=%r10,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
739 .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
740 .IMPORT BN_num_bits_word,CODE,NO_RELOCATION
741 .IMPORT __iob,DATA
742 .IMPORT fprintf,CODE,NO_RELOCATION
743 .IMPORT abort,CODE,NO_RELOCATION
744 .IMPORT $$div2U,MILLICODE
745 .entry
746 STD %r2,-16(%r30)
747 STD,MA %r3,352(%r30)
748 STD %r4,-344(%r30)
749 STD %r5,-336(%r30)
750 STD %r6,-328(%r30)
751 STD %r7,-320(%r30)
752 STD %r8,-312(%r30)
753 STD %r9,-304(%r30)
754 STD %r10,-296(%r30)
755
756 STD %r27,-288(%r30) ; save gp
757
758 COPY %r24,%r3 ; save d
759 COPY %r26,%r4 ; save h (high 64-bits)
760 LDO -1(%r0),%ret0 ; return -1 by default
761
762 CMPB,*= %r0,%arg2,$D3 ; if (d == 0)
763 COPY %r25,%r5 ; save l (low 64-bits)
764
765 LDO -48(%r30),%r29 ; create ap
766 .CALL ;in=26,29;out=28;
767 B,L BN_num_bits_word,%r2
768 COPY %r3,%r26
769 LDD -288(%r30),%r27 ; restore gp
770 LDI 64,%r21
771
772 CMPB,= %r21,%ret0,$00000012 ;if (i == 64) (forward)
773 COPY %ret0,%r24 ; i
774 MTSARCM %r24
775 DEPDI,Z -1,%sar,1,%r29
776 CMPB,*<<,N %r29,%r4,bn_div_err_case ; if (h > 1<<i) (forward)
777
778$00000012
779 SUBI 64,%r24,%r31 ; i = 64 - i;
780 CMPCLR,*<< %r4,%r3,%r0 ; if (h >= d)
781 SUB %r4,%r3,%r4 ; h -= d
782 CMPB,= %r31,%r0,$0000001A ; if (i)
783 COPY %r0,%r10 ; ret = 0
784 MTSARCM %r31 ; i to shift
785 DEPD,Z %r3,%sar,64,%r3 ; d <<= i;
786	SUBI	64,%r31,%r19	; 64 - i; redundant
787 MTSAR %r19 ; (64 -i) to shift
788 SHRPD %r4,%r5,%sar,%r4 ; l>> (64-i)
789 MTSARCM %r31 ; i to shift
790 DEPD,Z %r5,%sar,64,%r5 ; l <<= i;
791
792$0000001A
793 DEPDI,Z -1,31,32,%r19
794	EXTRD,U	%r3,31,32,%r6	; dh = d >> 32
795	EXTRD,U	%r3,63,32,%r8	; dl = d & 0xffffffff
796 LDO 2(%r0),%r9
797 STD %r3,-280(%r30) ; "d" to stack
798
799$0000001C
800 DEPDI,Z -1,63,32,%r29 ;
801 EXTRD,U %r4,31,32,%r31 ; h >> 32
802 CMPB,*=,N %r31,%r6,$D2 ; if ((h>>32) != dh)(forward) div
803 COPY %r4,%r26
804 EXTRD,U %r4,31,32,%r25
805 COPY %r6,%r24
806 .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
807 B,L $$div2U,%r2
808 EXTRD,U %r6,31,32,%r23
809 DEPD %r28,31,32,%r29
810$D2
811 STD %r29,-272(%r30) ; q
812	AND	%r5,%r19,%r24		; l & 0xffffffff00000000
813	EXTRD,U	%r24,31,32,%r24		; %r24 = l >> 32
814 FLDD -272(%r30),%fr7 ; q
815 FLDD -280(%r30),%fr8 ; d
816 XMPYU %fr8L,%fr7L,%fr10
817 FSTD %fr10,-256(%r30)
818 XMPYU %fr8L,%fr7R,%fr22
819 FSTD %fr22,-264(%r30)
820 XMPYU %fr8R,%fr7L,%fr11
821 XMPYU %fr8R,%fr7R,%fr23
822 FSTD %fr11,-232(%r30)
823 FSTD %fr23,-240(%r30)
824 LDD -256(%r30),%r28
825 DEPD,Z %r28,31,32,%r2
826 LDD -264(%r30),%r20
827 ADD,L %r20,%r2,%r31
828 LDD -232(%r30),%r22
829 DEPD,Z %r22,31,32,%r22
830 LDD -240(%r30),%r21
831 B $00000024 ; enter loop
832 ADD,L %r21,%r22,%r23
833
834$0000002A
835 LDO -1(%r29),%r29
836 SUB %r23,%r8,%r23
837$00000024
838 SUB %r4,%r31,%r25
839 AND %r25,%r19,%r26
840 CMPB,*<>,N %r0,%r26,$00000046 ; (forward)
841 DEPD,Z %r25,31,32,%r20
842 OR %r20,%r24,%r21
843 CMPB,*<<,N %r21,%r23,$0000002A ;(backward)
844 SUB %r31,%r6,%r31
845;-------------Break path---------------------
846
847$00000046
848 DEPD,Z %r23,31,32,%r25 ;tl
849 EXTRD,U %r23,31,32,%r26 ;t
850	AND	%r25,%r19,%r24		;tl = (tl<<32)&0xffffffff00000000L
851 ADD,L %r31,%r26,%r31 ;th += t;
852 CMPCLR,*>>= %r5,%r24,%r0 ;if (l<tl)
853 LDO 1(%r31),%r31 ; th++;
854	CMPB,*<<=,N	%r31,%r4,$00000036	;if (h < th) (forward)
855 LDO -1(%r29),%r29 ;q--;
856 ADD,L %r4,%r3,%r4 ;h += d;
857$00000036
858 ADDIB,=,N -1,%r9,$D1 ;if (--count == 0) break (forward)
859 SUB %r5,%r24,%r28 ; l -= tl;
860 SUB %r4,%r31,%r24 ; h -= th;
861 SHRPD %r24,%r28,32,%r4 ; h = ((h<<32)|(l>>32));
862 DEPD,Z %r29,31,32,%r10 ; ret = q<<32
863 b $0000001C
864 DEPD,Z %r28,31,32,%r5 ; l = l << 32
865
866$D1
867 OR %r10,%r29,%r28 ; ret |= q
868$D3
869 LDD -368(%r30),%r2
870$D0
871 LDD -296(%r30),%r10
872 LDD -304(%r30),%r9
873 LDD -312(%r30),%r8
874 LDD -320(%r30),%r7
875 LDD -328(%r30),%r6
876 LDD -336(%r30),%r5
877 LDD -344(%r30),%r4
878 BVE (%r2)
879 .EXIT
880 LDD,MB -352(%r30),%r3
881
882bn_div_err_case
883 MFIA %r6
884 ADDIL L'bn_div_words-bn_div_err_case,%r6,%r1
885 LDO R'bn_div_words-bn_div_err_case(%r1),%r6
886 ADDIL LT'__iob,%r27,%r1
887 LDD RT'__iob(%r1),%r26
888 ADDIL L'C$4-bn_div_words,%r6,%r1
889 LDO R'C$4-bn_div_words(%r1),%r25
890 LDO 64(%r26),%r26
891 .CALL ;in=24,25,26,29;out=28;
892 B,L fprintf,%r2
893 LDO -48(%r30),%r29
894 LDD -288(%r30),%r27
895 .CALL ;in=29;
896 B,L abort,%r2
897 LDO -48(%r30),%r29
898 LDD -288(%r30),%r27
899 B $D0
900 LDD -368(%r30),%r2
901 .PROCEND ;in=24,25,26,29;out=28;
902
903;----------------------------------------------------------------------------
904;
905; Registers to hold 64-bit values to manipulate. The "L" part
906; of the register corresponds to the upper 32-bits, while the "R"
907; part corresponds to the lower 32-bits
908;
909; Note that when using b6 and b7, the code must save them before
910; use, because they are callee-save registers
911;
912;
913; Floating point registers to use to save values that
914; are manipulated. These don't collide with ftemp1-4 and
915; are all caller-save registers
916;
917a0 .reg %fr22
918a0L .reg %fr22L
919a0R .reg %fr22R
920
921a1 .reg %fr23
922a1L .reg %fr23L
923a1R .reg %fr23R
924
925a2 .reg %fr24
926a2L .reg %fr24L
927a2R .reg %fr24R
928
929a3 .reg %fr25
930a3L .reg %fr25L
931a3R .reg %fr25R
932
933a4 .reg %fr26
934a4L .reg %fr26L
935a4R .reg %fr26R
936
937a5 .reg %fr27
938a5L .reg %fr27L
939a5R .reg %fr27R
940
941a6 .reg %fr28
942a6L .reg %fr28L
943a6R .reg %fr28R
944
945a7 .reg %fr29
946a7L .reg %fr29L
947a7R .reg %fr29R
948
949b0 .reg %fr30
950b0L .reg %fr30L
951b0R .reg %fr30R
952
953b1 .reg %fr31
954b1L .reg %fr31L
955b1R .reg %fr31R
956
957;
958; Temporary floating-point variables; these are all caller-save
959; registers
960;
961ftemp1 .reg %fr4
962ftemp2 .reg %fr5
963ftemp3 .reg %fr6
964ftemp4 .reg %fr7
965
966;
967; The B set of registers when used.
968;
969
970b2 .reg %fr8
971b2L .reg %fr8L
972b2R .reg %fr8R
973
974b3 .reg %fr9
975b3L .reg %fr9L
976b3R .reg %fr9R
977
978b4 .reg %fr10
979b4L .reg %fr10L
980b4R .reg %fr10R
981
982b5 .reg %fr11
983b5L .reg %fr11L
984b5R .reg %fr11R
985
986b6 .reg %fr12
987b6L .reg %fr12L
988b6R .reg %fr12R
989
990b7 .reg %fr13
991b7L .reg %fr13L
992b7R .reg %fr13R
993
994c1 .reg %r21 ; only reg
995temp1 .reg %r20 ; only reg
996temp2 .reg %r19 ; only reg
997temp3 .reg %r31 ; only reg
998
999m1 .reg %r28
1000c2 .reg %r23
1001high_one .reg %r1
1002ht .reg %r6
1003lt .reg %r5
1004m .reg %r4
1005c3 .reg %r3
1006
1007SQR_ADD_C .macro A0L,A0R,C1,C2,C3
1008 XMPYU A0L,A0R,ftemp1 ; m
1009 FSTD ftemp1,-24(%sp) ; store m
1010
1011 XMPYU A0R,A0R,ftemp2 ; lt
1012 FSTD ftemp2,-16(%sp) ; store lt
1013
1014 XMPYU A0L,A0L,ftemp3 ; ht
1015 FSTD ftemp3,-8(%sp) ; store ht
1016
1017 LDD -24(%sp),m ; load m
1018 AND m,high_mask,temp2 ; m & Mask
1019 DEPD,Z m,30,31,temp3 ; m << 32+1
1020 LDD -16(%sp),lt ; lt
1021
1022 LDD -8(%sp),ht ; ht
1023 EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1
1024 ADD temp3,lt,lt ; lt = lt+m
1025 ADD,L ht,temp1,ht ; ht += temp1
1026 ADD,DC ht,%r0,ht ; ht++
1027
1028 ADD C1,lt,C1 ; c1=c1+lt
1029 ADD,DC ht,%r0,ht ; ht++
1030
1031 ADD C2,ht,C2 ; c2=c2+ht
1032 ADD,DC C3,%r0,C3 ; c3++
1033.endm
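;
; In C terms, a hedged model of SQR_ADD_C (hypothetical helper, 64-bit
; BN_ULONG assumed): add the 128-bit square of one word into the
; three-word comba accumulator.
;
;	/* (c3:c2:c1) += a * a */
;	static void sqr_add_c(BN_ULONG a, BN_ULONG *c1, BN_ULONG *c2,
;	    BN_ULONG *c3)
;	{
;		BN_ULONG h = a >> 32, l = a & 0xffffffffUL;
;		BN_ULONG m = h * l;
;		BN_ULONG lt = l * l, ht = h * h, t = m << 33;
;
;		ht += m >> 31;
;		lt += t; ht += (lt < t);
;		*c1 += lt; ht += (*c1 < lt);
;		*c2 += ht; *c3 += (*c2 < ht);
;	}
;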
1034
1035SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
1036 XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht
1037 FSTD ftemp1,-16(%sp) ;
1038 XMPYU A0R,A1L,ftemp2 ; m = bh*lt
1039 FSTD ftemp2,-8(%sp) ;
1040 XMPYU A0R,A1R,ftemp3 ; lt = bl*lt
1041 FSTD ftemp3,-32(%sp)
1042 XMPYU A0L,A1L,ftemp4 ; ht = bh*ht
1043 FSTD ftemp4,-24(%sp) ;
1044
1045	LDD	-8(%sp),m	; load m
1046	LDD	-16(%sp),m1	; load m1
1047 ADD,L m,m1,m ; m+m1
1048
1049 DEPD,Z m,31,32,temp3 ; (m+m1<<32)
1050	LDD	-24(%sp),ht	; load ht
1051
1052 CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
1053 ADD,L ht,high_one,ht ; ht+=high_one
1054
1055 EXTRD,U m,31,32,temp1 ; m >> 32
1056 LDD -32(%sp),lt ; lt
1057 ADD,L ht,temp1,ht ; ht+= m>>32
1058	ADD	lt,temp3,lt	; lt += (m+m1) << 32
1059 ADD,DC ht,%r0,ht ; ht++
1060
1061 ADD ht,ht,ht ; ht=ht+ht;
1062 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1063
1064 ADD lt,lt,lt ; lt=lt+lt;
1065 ADD,DC ht,%r0,ht ; add in carry (ht++)
1066
1067 ADD C1,lt,C1 ; c1=c1+lt
1068 ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++)
1069 LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise
1070
1071 ADD C2,ht,C2 ; c2 = c2 + ht
1072 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1073.endm
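;
; And a hedged C model of SQR_ADD_C2 (hypothetical helper): accumulate
; the doubled cross product 2*a*b, with the extra carries the ,*NUV and
; high_one tricks above account for.
;
;	/* (c3:c2:c1) += 2 * a * b */
;	static void sqr_add_c2(BN_ULONG a, BN_ULONG b, BN_ULONG *c1,
;	    BN_ULONG *c2, BN_ULONG *c3)
;	{
;		BN_ULONG ah = a >> 32, al = a & 0xffffffffUL;
;		BN_ULONG bh = b >> 32, bl = b & 0xffffffffUL;
;		BN_ULONG m1 = ah * bl, m = al * bh;
;		BN_ULONG lt = al * bl, ht = ah * bh, t, carry;
;
;		m += m1;
;		if (m < m1)			/* cross terms overflowed: */
;			ht += (BN_ULONG)1 << 32;	/* the high_one fixup */
;		ht += m >> 32;
;		t = m << 32;
;		lt += t; ht += (lt < t);
;		*c3 += ht >> 63;		/* carry from doubling ht */
;		ht = (ht << 1) | (lt >> 63);	/* double, absorb 2*lt carry */
;		lt <<= 1;
;		*c1 += lt; carry = (*c1 < lt);
;		ht += carry;
;		*c3 += (carry && ht == 0);	/* ht wrapped on the bump */
;		*c2 += ht; *c3 += (*c2 < ht);
;	}
;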
1074
1075;
1076;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
1077; arg0 = r_ptr
1078; arg1 = a_ptr
1079;
1080
1081bn_sqr_comba8
1082 .PROC
1083 .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1084 .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1085 .ENTRY
1086 .align 64
1087
1088 STD %r3,0(%sp) ; save r3
1089 STD %r4,8(%sp) ; save r4
1090 STD %r5,16(%sp) ; save r5
1091 STD %r6,24(%sp) ; save r6
1092
1093 ;
1094 ; Zero out carries
1095 ;
1096 COPY %r0,c1
1097 COPY %r0,c2
1098 COPY %r0,c3
1099
1100 LDO 128(%sp),%sp ; bump stack
1101 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
1102 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1103
1104 ;
1105 ; Load up all of the values we are going to use
1106 ;
1107 FLDD 0(a_ptr),a0
1108 FLDD 8(a_ptr),a1
1109 FLDD 16(a_ptr),a2
1110 FLDD 24(a_ptr),a3
1111 FLDD 32(a_ptr),a4
1112 FLDD 40(a_ptr),a5
1113 FLDD 48(a_ptr),a6
1114 FLDD 56(a_ptr),a7
1115
1116 SQR_ADD_C a0L,a0R,c1,c2,c3
1117 STD c1,0(r_ptr) ; r[0] = c1;
1118 COPY %r0,c1
1119
1120 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
1121 STD c2,8(r_ptr) ; r[1] = c2;
1122 COPY %r0,c2
1123
1124 SQR_ADD_C a1L,a1R,c3,c1,c2
1125 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
1126 STD c3,16(r_ptr) ; r[2] = c3;
1127 COPY %r0,c3
1128
1129 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
1130 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
1131 STD c1,24(r_ptr) ; r[3] = c1;
1132 COPY %r0,c1
1133
1134 SQR_ADD_C a2L,a2R,c2,c3,c1
1135 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
1136 SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
1137 STD c2,32(r_ptr) ; r[4] = c2;
1138 COPY %r0,c2
1139
1140 SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
1141 SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
1142 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
1143 STD c3,40(r_ptr) ; r[5] = c3;
1144 COPY %r0,c3
1145
1146 SQR_ADD_C a3L,a3R,c1,c2,c3
1147 SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
1148 SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
1149 SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
1150 STD c1,48(r_ptr) ; r[6] = c1;
1151 COPY %r0,c1
1152
1153 SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
1154 SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
1155 SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
1156 SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
1157 STD c2,56(r_ptr) ; r[7] = c2;
1158 COPY %r0,c2
1159
1160 SQR_ADD_C a4L,a4R,c3,c1,c2
1161 SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
1162 SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
1163 SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
1164 STD c3,64(r_ptr) ; r[8] = c3;
1165 COPY %r0,c3
1166
1167 SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
1168 SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
1169 SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
1170 STD c1,72(r_ptr) ; r[9] = c1;
1171 COPY %r0,c1
1172
1173 SQR_ADD_C a5L,a5R,c2,c3,c1
1174 SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
1175 SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
1176 STD c2,80(r_ptr) ; r[10] = c2;
1177 COPY %r0,c2
1178
1179 SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
1180 SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
1181 STD c3,88(r_ptr) ; r[11] = c3;
1182 COPY %r0,c3
1183
1184 SQR_ADD_C a6L,a6R,c1,c2,c3
1185 SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
1186 STD c1,96(r_ptr) ; r[12] = c1;
1187 COPY %r0,c1
1188
1189 SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
1190 STD c2,104(r_ptr) ; r[13] = c2;
1191 COPY %r0,c2
1192
1193 SQR_ADD_C a7L,a7R,c3,c1,c2
1194 STD c3, 112(r_ptr) ; r[14] = c3
1195 STD c1, 120(r_ptr) ; r[15] = c1
1196
1197 .EXIT
1198 LDD -104(%sp),%r6 ; restore r6
1199 LDD -112(%sp),%r5 ; restore r5
1200 LDD -120(%sp),%r4 ; restore r4
1201 BVE (%rp)
1202 LDD,MB -128(%sp),%r3
1203
1204 .PROCEND
1205
1206;-----------------------------------------------------------------------------
1207;
1208;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
1209; arg0 = r_ptr
1210; arg1 = a_ptr
1211;
1212
1213bn_sqr_comba4
1214 .proc
1215 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1216 .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1217 .entry
1218 .align 64
1219 STD %r3,0(%sp) ; save r3
1220 STD %r4,8(%sp) ; save r4
1221 STD %r5,16(%sp) ; save r5
1222 STD %r6,24(%sp) ; save r6
1223
1224 ;
1225 ; Zero out carries
1226 ;
1227 COPY %r0,c1
1228 COPY %r0,c2
1229 COPY %r0,c3
1230
1231 LDO 128(%sp),%sp ; bump stack
1232 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
1233 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1234
1235 ;
1236 ; Load up all of the values we are going to use
1237 ;
1238 FLDD 0(a_ptr),a0
1239 FLDD 8(a_ptr),a1
1240 FLDD 16(a_ptr),a2
1241 FLDD 24(a_ptr),a3
1242 FLDD 32(a_ptr),a4
1243 FLDD 40(a_ptr),a5
1244 FLDD 48(a_ptr),a6
1245 FLDD 56(a_ptr),a7
1246
1247 SQR_ADD_C a0L,a0R,c1,c2,c3
1248
1249 STD c1,0(r_ptr) ; r[0] = c1;
1250 COPY %r0,c1
1251
1252 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
1253
1254 STD c2,8(r_ptr) ; r[1] = c2;
1255 COPY %r0,c2
1256
1257 SQR_ADD_C a1L,a1R,c3,c1,c2
1258 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
1259
1260 STD c3,16(r_ptr) ; r[2] = c3;
1261 COPY %r0,c3
1262
1263 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
1264 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
1265
1266 STD c1,24(r_ptr) ; r[3] = c1;
1267 COPY %r0,c1
1268
1269 SQR_ADD_C a2L,a2R,c2,c3,c1
1270 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
1271
1272 STD c2,32(r_ptr) ; r[4] = c2;
1273 COPY %r0,c2
1274
1275 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
1276 STD c3,40(r_ptr) ; r[5] = c3;
1277 COPY %r0,c3
1278
1279 SQR_ADD_C a3L,a3R,c1,c2,c3
1280 STD c1,48(r_ptr) ; r[6] = c1;
1281 STD c2,56(r_ptr) ; r[7] = c2;
1282
1283 .EXIT
1284 LDD -104(%sp),%r6 ; restore r6
1285 LDD -112(%sp),%r5 ; restore r5
1286 LDD -120(%sp),%r4 ; restore r4
1287 BVE (%rp)
1288 LDD,MB -128(%sp),%r3
1289
1290 .PROCEND
1291
1292
1293;---------------------------------------------------------------------------
1294
1295MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
1296 XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht
1297 FSTD ftemp1,-16(%sp) ;
1298 XMPYU A0R,B0L,ftemp2 ; m = bh*lt
1299 FSTD ftemp2,-8(%sp) ;
1300 XMPYU A0R,B0R,ftemp3 ; lt = bl*lt
1301 FSTD ftemp3,-32(%sp)
1302 XMPYU A0L,B0L,ftemp4 ; ht = bh*ht
1303 FSTD ftemp4,-24(%sp) ;
1304
1305	LDD	-8(%sp),m	; load m
1306	LDD	-16(%sp),m1	; load m1
1307 ADD,L m,m1,m ; m+m1
1308
1309 DEPD,Z m,31,32,temp3 ; (m+m1<<32)
1310	LDD	-24(%sp),ht	; load ht
1311
1312 CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
1313 ADD,L ht,high_one,ht ; ht+=high_one
1314
1315 EXTRD,U m,31,32,temp1 ; m >> 32
1316 LDD -32(%sp),lt ; lt
1317 ADD,L ht,temp1,ht ; ht+= m>>32
1318	ADD	lt,temp3,lt	; lt += (m+m1) << 32
1319 ADD,DC ht,%r0,ht ; ht++
1320
1321 ADD C1,lt,C1 ; c1=c1+lt
1322	ADD,DC	ht,%r0,ht	; ht++ (add in carry)
1323
1324 ADD C2,ht,C2 ; c2 = c2 + ht
1325 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1326.endm
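;
; A hedged C model of MUL_ADD_C (hypothetical helper, mirroring
; sqr_add_c2 above without the doubling):
;
;	/* (c3:c2:c1) += a * b */
;	static void mul_add_c(BN_ULONG a, BN_ULONG b, BN_ULONG *c1,
;	    BN_ULONG *c2, BN_ULONG *c3)
;	{
;		BN_ULONG ah = a >> 32, al = a & 0xffffffffUL;
;		BN_ULONG bh = b >> 32, bl = b & 0xffffffffUL;
;		BN_ULONG m1 = ah * bl, m = al * bh;
;		BN_ULONG lt = al * bl, ht = ah * bh, t;
;
;		m += m1;
;		if (m < m1)
;			ht += (BN_ULONG)1 << 32;
;		ht += m >> 32;
;		t = m << 32;
;		lt += t; ht += (lt < t);
;		*c1 += lt; ht += (*c1 < lt);
;		*c2 += ht; *c3 += (*c2 < ht);
;	}
;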
1327
1328
1329;
1330;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1331; arg0 = r_ptr
1332; arg1 = a_ptr
1333; arg2 = b_ptr
1334;
1335
1336bn_mul_comba8
1337 .proc
1338 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1339 .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1340 .entry
1341 .align 64
1342
1343 STD %r3,0(%sp) ; save r3
1344 STD %r4,8(%sp) ; save r4
1345 STD %r5,16(%sp) ; save r5
1346 STD %r6,24(%sp) ; save r6
1347	FSTD	%fr12,32(%sp)	; save fr12
1348	FSTD	%fr13,40(%sp)	; save fr13
1349
1350 ;
1351 ; Zero out carries
1352 ;
1353 COPY %r0,c1
1354 COPY %r0,c2
1355 COPY %r0,c3
1356
1357 LDO 128(%sp),%sp ; bump stack
1358 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1359
1360 ;
1361 ; Load up all of the values we are going to use
1362 ;
1363 FLDD 0(a_ptr),a0
1364 FLDD 8(a_ptr),a1
1365 FLDD 16(a_ptr),a2
1366 FLDD 24(a_ptr),a3
1367 FLDD 32(a_ptr),a4
1368 FLDD 40(a_ptr),a5
1369 FLDD 48(a_ptr),a6
1370 FLDD 56(a_ptr),a7
1371
1372 FLDD 0(b_ptr),b0
1373 FLDD 8(b_ptr),b1
1374 FLDD 16(b_ptr),b2
1375 FLDD 24(b_ptr),b3
1376 FLDD 32(b_ptr),b4
1377 FLDD 40(b_ptr),b5
1378 FLDD 48(b_ptr),b6
1379 FLDD 56(b_ptr),b7
1380
1381 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
1382 STD c1,0(r_ptr)
1383 COPY %r0,c1
1384
1385 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
1386 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
1387 STD c2,8(r_ptr)
1388 COPY %r0,c2
1389
1390 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
1391 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
1392 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
1393 STD c3,16(r_ptr)
1394 COPY %r0,c3
1395
1396 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
1397 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
1398 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
1399 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
1400 STD c1,24(r_ptr)
1401 COPY %r0,c1
1402
1403 MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
1404 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
1405 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
1406 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
1407 MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
1408 STD c2,32(r_ptr)
1409 COPY %r0,c2
1410
1411 MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
1412 MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
1413 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
1414 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
1415 MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
1416 MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
1417 STD c3,40(r_ptr)
1418 COPY %r0,c3
1419
1420 MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
1421 MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
1422 MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
1423 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
1424 MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
1425 MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
1426 MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
1427 STD c1,48(r_ptr)
1428 COPY %r0,c1
1429
1430 MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
1431 MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
1432 MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
1433 MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
1434 MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
1435 MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
1436 MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
1437 MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
1438 STD c2,56(r_ptr)
1439 COPY %r0,c2
1440
1441 MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
1442 MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
1443 MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
1444 MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
1445 MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
1446 MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
1447 MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
1448 STD c3,64(r_ptr)
1449 COPY %r0,c3
1450
1451 MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
1452 MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
1453 MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
1454 MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
1455 MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
1456 MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
1457 STD c1,72(r_ptr)
1458 COPY %r0,c1
1459
1460 MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
1461 MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
1462 MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
1463 MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
1464 MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
1465 STD c2,80(r_ptr)
1466 COPY %r0,c2
1467
1468 MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
1469 MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
1470 MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
1471 MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
1472 STD c3,88(r_ptr)
1473 COPY %r0,c3
1474
1475 MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
1476 MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
1477 MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
1478 STD c1,96(r_ptr)
1479 COPY %r0,c1
1480
1481 MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
1482 MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
1483 STD c2,104(r_ptr)
1484 COPY %r0,c2
1485
1486 MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
1487 STD c3,112(r_ptr)
1488 STD c1,120(r_ptr)
1489
1490 .EXIT
1491 FLDD -88(%sp),%fr13
1492 FLDD -96(%sp),%fr12
1493 LDD -104(%sp),%r6 ; restore r6
1494 LDD -112(%sp),%r5 ; restore r5
1495 LDD -120(%sp),%r4 ; restore r4
1496 BVE (%rp)
1497 LDD,MB -128(%sp),%r3
1498
1499 .PROCEND
1500
1501;-----------------------------------------------------------------------------
1502;
1503;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1504; arg0 = r_ptr
1505; arg1 = a_ptr
1506; arg2 = b_ptr
1507;
1508
1509bn_mul_comba4
1510 .proc
1511 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1512 .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1513 .entry
1514 .align 64
1515
1516 STD %r3,0(%sp) ; save r3
1517 STD %r4,8(%sp) ; save r4
1518 STD %r5,16(%sp) ; save r5
1519 STD %r6,24(%sp) ; save r6
1520	FSTD	%fr12,32(%sp)	; save fr12
1521	FSTD	%fr13,40(%sp)	; save fr13
1522
1523 ;
1524 ; Zero out carries
1525 ;
1526 COPY %r0,c1
1527 COPY %r0,c2
1528 COPY %r0,c3
1529
1530 LDO 128(%sp),%sp ; bump stack
1531 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1532
1533 ;
1534 ; Load up all of the values we are going to use
1535 ;
1536 FLDD 0(a_ptr),a0
1537 FLDD 8(a_ptr),a1
1538 FLDD 16(a_ptr),a2
1539 FLDD 24(a_ptr),a3
1540
1541 FLDD 0(b_ptr),b0
1542 FLDD 8(b_ptr),b1
1543 FLDD 16(b_ptr),b2
1544 FLDD 24(b_ptr),b3
1545
1546 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
1547 STD c1,0(r_ptr)
1548 COPY %r0,c1
1549
1550 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
1551 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
1552 STD c2,8(r_ptr)
1553 COPY %r0,c2
1554
1555 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
1556 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
1557 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
1558 STD c3,16(r_ptr)
1559 COPY %r0,c3
1560
1561 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
1562 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
1563 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
1564 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
1565 STD c1,24(r_ptr)
1566 COPY %r0,c1
1567
1568 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
1569 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
1570 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
1571 STD c2,32(r_ptr)
1572 COPY %r0,c2
1573
1574 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
1575 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
1576 STD c3,40(r_ptr)
1577 COPY %r0,c3
1578
1579 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
1580 STD c1,48(r_ptr)
1581 STD c2,56(r_ptr)
1582
1583 .EXIT
1584 FLDD -88(%sp),%fr13
1585 FLDD -96(%sp),%fr12
1586 LDD -104(%sp),%r6 ; restore r6
1587 LDD -112(%sp),%r5 ; restore r5
1588 LDD -120(%sp),%r4 ; restore r4
1589 BVE (%rp)
1590 LDD,MB -128(%sp),%r3
1591
1592 .PROCEND
1593
1594
1595 .SPACE $TEXT$
1596 .SUBSPA $CODE$
1597 .SPACE $PRIVATE$,SORT=16
1598 .IMPORT $global$,DATA
1599 .SPACE $TEXT$
1600 .SUBSPA $CODE$
1601 .SUBSPA $LIT$,ACCESS=0x2c
1602C$4
1603 .ALIGN 8
1604 .STRINGZ "Division would overflow (%d)\n"
1605 .END
diff --git a/src/lib/libcrypto/bn/asm/parisc-mont.pl b/src/lib/libcrypto/bn/asm/parisc-mont.pl
deleted file mode 100644
index fcfdee1f1f..0000000000
--- a/src/lib/libcrypto/bn/asm/parisc-mont.pl
+++ /dev/null
@@ -1,993 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# On PA-7100LC this module performs ~90-50% better, less for longer
11# keys, than code generated by gcc 3.2 for PA-RISC 1.1. The latter means
12# that compiler utilized xmpyu instruction to perform 32x32=64-bit
13# multiplication, which in turn means that "baseline" performance was
14# optimal in respect to instruction set capabilities. Fair comparison
15# with vendor compiler is problematic, because OpenSSL doesn't define
16# BN_LLONG [presumably] for historical reasons, which drives compiler
17# toward 4 times 16x16=32-bit multiplications [plus complementary
18# shifts and additions] instead. This means that you should observe
19# several times improvement over code generated by vendor compiler
20# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
21# improvement coefficient was never collected on PA-7100LC, or any
22# other 1.1 CPU, because I don't have access to such machine with
23# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
24# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
25# of ~5x on PA-8600.
26#
27# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
28# reportedly ~2x faster than vendor compiler generated code [according
29# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
30# this implementation is actually 32-bit one, in the sense that it
31# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
32# 64-bit BN_LONGs... How do they interoperate then? No problem. This
33# module picks halves of 64-bit values in reverse order and pretends
34# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
35# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
36# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
37# i.e. there is no "wider" multiplication like on most other 64-bit
38# platforms. This means that even being effectively 32-bit, this
39# implementation performs "64-bit" computational task in same amount
40# of arithmetic operations, most notably multiplications. It requires
41# more memory references, most notably to tp[num], but this doesn't
42# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
43# 2.0 code path provides virtually same performance as pa-risc2[W].s:
44# it's ~10% better for shortest key length and ~10% worse for longest
45# one.
46#
47# In case it wasn't clear. The module has two distinct code paths:
48# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
49# additions and 64-bit integer loads, not to mention specific
50# instruction scheduling. In 64-bit build naturally only 2.0 code path
51# is assembled. In 32-bit application context both code paths are
52# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
53# is taken automatically. Also, in 32-bit build the module imposes
54# a couple of limitations: vector lengths have to be even and vector
55# addresses have to be 64-bit aligned. Normally neither is a problem:
56# most common key lengths are even and vectors are commonly malloc-ed,
57# which ensures alignment.
58#
59# Special thanks to polarhome.com for providing HP-UX account on
60# PA-RISC 1.1 machine, and to a correspondent who chose to remain
61# anonymous for testing the code on PA-RISC 2.0 machine.
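#
# To illustrate the flipped word order described above (a hedged
# sketch, not from the original; the helper name is hypothetical): on
# big-endian PA-RISC a 64-bit BN_ULONG sits in memory as (high, low),
# and the module consumes it low half first, i.e.
#
#	/* j-th 32-bit digit of a 64-bit BN_ULONG array */
#	uint32_t digit(const uint64_t *ap, int j)
#	{
#		return ((uint32_t)(ap[j / 2] >> ((j & 1) ? 32 : 0)));
#	}
#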
62
63$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
64
65$flavour = shift;
66$output = shift;
67
68open STDOUT,">$output";
69
70if ($flavour =~ /64/) {
71 $LEVEL ="2.0W";
72 $SIZE_T =8;
73 $FRAME_MARKER =80;
74 $SAVED_RP =16;
75 $PUSH ="std";
76 $PUSHMA ="std,ma";
77 $POP ="ldd";
78 $POPMB ="ldd,mb";
79 $BN_SZ =$SIZE_T;
80} else {
81 $LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
82 $SIZE_T =4;
83 $FRAME_MARKER =48;
84 $SAVED_RP =20;
85 $PUSH ="stw";
86 $PUSHMA ="stwm";
87 $POP ="ldw";
88 $POPMB ="ldwm";
89 $BN_SZ =$SIZE_T;
90}
91
92$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker
93 # [+ argument transfer]
94$LOCALS=$FRAME-$FRAME_MARKER;
95$FRAME+=32; # local variables
96
97$tp="%r31";
98$ti1="%r29";
99$ti0="%r28";
100
101$rp="%r26";
102$ap="%r25";
103$bp="%r24";
104$np="%r23";
105$n0="%r22"; # passed through stack in 32-bit
106$num="%r21"; # passed through stack in 32-bit
107$idx="%r20";
108$arrsz="%r19";
109
110$nm1="%r7";
111$nm0="%r6";
112$ab1="%r5";
113$ab0="%r4";
114
115$fp="%r3";
116$hi1="%r2";
117$hi0="%r1";
118
119$xfer=$n0;	# accommodates [-16..15] offset in fld[dw]s
120
121$fm0="%fr4"; $fti=$fm0;
122$fbi="%fr5L";
123$fn0="%fr5R";
124$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
125$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";
126
127$code=<<___;
128 .LEVEL $LEVEL
129#if 0
130 .SPACE \$TEXT\$
131 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
132#else
133 .text
134#endif
135
136 .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
137 .ALIGN 64
138bn_mul_mont
139 .PROC
140 .CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
141 .ENTRY
142 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
143 $PUSHMA %r3,$FRAME(%sp)
144 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
145 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
146 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
147 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
148 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
149 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
150 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
151 ldo -$FRAME(%sp),$fp
152___
153$code.=<<___ if ($SIZE_T==4);
154 ldw `-$FRAME_MARKER-4`($fp),$n0
155 ldw `-$FRAME_MARKER-8`($fp),$num
156 nop
157 nop ; alignment
158___
159$code.=<<___ if ($BN_SZ==4);
160 comiclr,<= 6,$num,%r0 ; are vectors long enough?
161 b L\$abort
162 ldi 0,%r28 ; signal "unhandled"
163 add,ev %r0,$num,$num ; is $num even?
164 b L\$abort
165 nop
166 or $ap,$np,$ti1
167 extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned?
168 b L\$abort
169 nop
170 nop ; alignment
171 nop
172
173 fldws 0($n0),${fn0}
174 fldws,ma 4($bp),${fbi} ; bp[0]
175___
176$code.=<<___ if ($BN_SZ==8);
177 comib,> 3,$num,L\$abort ; are vectors long enough?
178 ldi 0,%r28 ; signal "unhandled"
179 addl $num,$num,$num ; I operate on 32-bit values
180
181 fldws 4($n0),${fn0} ; only low part of n0
182 fldws 4($bp),${fbi} ; bp[0] in flipped word order
183___
184$code.=<<___;
185 fldds 0($ap),${fai} ; ap[0,1]
186 fldds 0($np),${fni} ; np[0,1]
187
188 sh2addl $num,%r0,$arrsz
189 ldi 31,$hi0
190 ldo 36($arrsz),$hi1 ; space for tp[num+1]
191 andcm $hi1,$hi0,$hi1 ; align
192 addl $hi1,%sp,%sp
193 $PUSH $fp,-$SIZE_T(%sp)
194
195 ldo `$LOCALS+16`($fp),$xfer
196 ldo `$LOCALS+32+4`($fp),$tp
197
198 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0]
199 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0]
200 xmpyu ${fn0},${fab0}R,${fm0}
201
202 addl $arrsz,$ap,$ap ; point at the end
203 addl $arrsz,$np,$np
204 subi 0,$arrsz,$idx ; j=0
205 ldo 8($idx),$idx ; j++++
206
207 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
208 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
209 fstds ${fab0},-16($xfer)
210 fstds ${fnm0},-8($xfer)
211 fstds ${fab1},0($xfer)
212 fstds ${fnm1},8($xfer)
213 flddx $idx($ap),${fai} ; ap[2,3]
214 flddx $idx($np),${fni} ; np[2,3]
215___
216$code.=<<___ if ($BN_SZ==4);
217#ifndef __OpenBSD__
218 mtctl $hi0,%cr11 ; $hi0 still holds 31
219 extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0
220 b L\$parisc11
221 nop
222___
223$code.=<<___; # PA-RISC 2.0 code-path
224 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
225 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
226 ldd -16($xfer),$ab0
227 fstds ${fab0},-16($xfer)
228
229 extrd,u $ab0,31,32,$hi0
230 extrd,u $ab0,63,32,$ab0
231 ldd -8($xfer),$nm0
232 fstds ${fnm0},-8($xfer)
233 ldo 8($idx),$idx ; j++++
234 addl $ab0,$nm0,$nm0 ; low part is discarded
235 extrd,u $nm0,31,32,$hi1
236
237L\$1st
238 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
239 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
240 ldd 0($xfer),$ab1
241 fstds ${fab1},0($xfer)
242 addl $hi0,$ab1,$ab1
243 extrd,u $ab1,31,32,$hi0
244 ldd 8($xfer),$nm1
245 fstds ${fnm1},8($xfer)
246 extrd,u $ab1,63,32,$ab1
247 addl $hi1,$nm1,$nm1
248 flddx $idx($ap),${fai} ; ap[j,j+1]
249 flddx $idx($np),${fni} ; np[j,j+1]
250 addl $ab1,$nm1,$nm1
251 extrd,u $nm1,31,32,$hi1
252
253 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
254 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
255 ldd -16($xfer),$ab0
256 fstds ${fab0},-16($xfer)
257 addl $hi0,$ab0,$ab0
258 extrd,u $ab0,31,32,$hi0
259 ldd -8($xfer),$nm0
260 fstds ${fnm0},-8($xfer)
261 extrd,u $ab0,63,32,$ab0
262 addl $hi1,$nm0,$nm0
263 stw $nm1,-4($tp) ; tp[j-1]
264 addl $ab0,$nm0,$nm0
265 stw,ma $nm0,8($tp) ; tp[j-1]
266 addib,<> 8,$idx,L\$1st ; j++++
267 extrd,u $nm0,31,32,$hi1
268
269 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
270 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
271 ldd 0($xfer),$ab1
272 fstds ${fab1},0($xfer)
273 addl $hi0,$ab1,$ab1
274 extrd,u $ab1,31,32,$hi0
275 ldd 8($xfer),$nm1
276 fstds ${fnm1},8($xfer)
277 extrd,u $ab1,63,32,$ab1
278 addl $hi1,$nm1,$nm1
279 ldd -16($xfer),$ab0
280 addl $ab1,$nm1,$nm1
281 ldd -8($xfer),$nm0
282 extrd,u $nm1,31,32,$hi1
283
284 addl $hi0,$ab0,$ab0
285 extrd,u $ab0,31,32,$hi0
286 stw $nm1,-4($tp) ; tp[j-1]
287 extrd,u $ab0,63,32,$ab0
288 addl $hi1,$nm0,$nm0
289 ldd 0($xfer),$ab1
290 addl $ab0,$nm0,$nm0
291 ldd,mb 8($xfer),$nm1
292 extrd,u $nm0,31,32,$hi1
293 stw,ma $nm0,8($tp) ; tp[j-1]
294
295 ldo -1($num),$num ; i--
296 subi 0,$arrsz,$idx ; j=0
297___
298$code.=<<___ if ($BN_SZ==4);
299 fldws,ma 4($bp),${fbi} ; bp[1]
300___
301$code.=<<___ if ($BN_SZ==8);
302 fldws 0($bp),${fbi} ; bp[1] in flipped word order
303___
304$code.=<<___;
305 flddx $idx($ap),${fai} ; ap[0,1]
306 flddx $idx($np),${fni} ; np[0,1]
307 fldws 8($xfer),${fti}R ; tp[0]
308 addl $hi0,$ab1,$ab1
309 extrd,u $ab1,31,32,$hi0
310 extrd,u $ab1,63,32,$ab1
311 ldo 8($idx),$idx ; j++++
312 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
313 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
314 addl $hi1,$nm1,$nm1
315 addl $ab1,$nm1,$nm1
316 extrd,u $nm1,31,32,$hi1
317 fstws,mb ${fab0}L,-8($xfer) ; save high part
318 stw $nm1,-4($tp) ; tp[j-1]
319
320 fcpy,sgl %fr0,${fti}L ; zero high part
321 fcpy,sgl %fr0,${fab0}L
322 addl $hi1,$hi0,$hi0
323 extrd,u $hi0,31,32,$hi1
324 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
325 fcnvxf,dbl,dbl ${fab0},${fab0}
326 stw $hi0,0($tp)
327 stw $hi1,4($tp)
328
329 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
330 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
331 xmpyu ${fn0},${fab0}R,${fm0}
332 ldo `$LOCALS+32+4`($fp),$tp
333L\$outer
334 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
335 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
336 fstds ${fab0},-16($xfer) ; 33-bit value
337 fstds ${fnm0},-8($xfer)
338 flddx $idx($ap),${fai} ; ap[2]
339 flddx $idx($np),${fni} ; np[2]
340 ldo 8($idx),$idx ; j++++
341 ldd -16($xfer),$ab0 ; 33-bit value
342 ldd -8($xfer),$nm0
343 ldw 0($xfer),$hi0 ; high part
344
345 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
346 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
347 extrd,u $ab0,31,32,$ti0 ; carry bit
348 extrd,u $ab0,63,32,$ab0
349 fstds ${fab1},0($xfer)
350 addl $ti0,$hi0,$hi0 ; account carry bit
351 fstds ${fnm1},8($xfer)
352 addl $ab0,$nm0,$nm0 ; low part is discarded
353 ldw 0($tp),$ti1 ; tp[1]
354 extrd,u $nm0,31,32,$hi1
355 fstds ${fab0},-16($xfer)
356 fstds ${fnm0},-8($xfer)
357
358L\$inner
359 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
360 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
361 ldd 0($xfer),$ab1
362 fstds ${fab1},0($xfer)
363 addl $hi0,$ti1,$ti1
364 addl $ti1,$ab1,$ab1
365 ldd 8($xfer),$nm1
366 fstds ${fnm1},8($xfer)
367 extrd,u $ab1,31,32,$hi0
368 extrd,u $ab1,63,32,$ab1
369 flddx $idx($ap),${fai} ; ap[j,j+1]
370 flddx $idx($np),${fni} ; np[j,j+1]
371 addl $hi1,$nm1,$nm1
372 addl $ab1,$nm1,$nm1
373 ldw 4($tp),$ti0 ; tp[j]
374 stw $nm1,-4($tp) ; tp[j-1]
375
376 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
377 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
378 ldd -16($xfer),$ab0
379 fstds ${fab0},-16($xfer)
380 addl $hi0,$ti0,$ti0
381 addl $ti0,$ab0,$ab0
382 ldd -8($xfer),$nm0
383 fstds ${fnm0},-8($xfer)
384 extrd,u $ab0,31,32,$hi0
385 extrd,u $nm1,31,32,$hi1
386 ldw 8($tp),$ti1 ; tp[j]
387 extrd,u $ab0,63,32,$ab0
388 addl $hi1,$nm0,$nm0
389 addl $ab0,$nm0,$nm0
390 stw,ma $nm0,8($tp) ; tp[j-1]
391 addib,<> 8,$idx,L\$inner ; j++++
392 extrd,u $nm0,31,32,$hi1
393
394 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
395 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
396 ldd 0($xfer),$ab1
397 fstds ${fab1},0($xfer)
398 addl $hi0,$ti1,$ti1
399 addl $ti1,$ab1,$ab1
400 ldd 8($xfer),$nm1
401 fstds ${fnm1},8($xfer)
402 extrd,u $ab1,31,32,$hi0
403 extrd,u $ab1,63,32,$ab1
404 ldw 4($tp),$ti0 ; tp[j]
405 addl $hi1,$nm1,$nm1
406 addl $ab1,$nm1,$nm1
407 ldd -16($xfer),$ab0
408 ldd -8($xfer),$nm0
409 extrd,u $nm1,31,32,$hi1
410
411 addl $hi0,$ab0,$ab0
412 addl $ti0,$ab0,$ab0
413 stw $nm1,-4($tp) ; tp[j-1]
414 extrd,u $ab0,31,32,$hi0
415 ldw 8($tp),$ti1 ; tp[j]
416 extrd,u $ab0,63,32,$ab0
417 addl $hi1,$nm0,$nm0
418 ldd 0($xfer),$ab1
419 addl $ab0,$nm0,$nm0
420 ldd,mb 8($xfer),$nm1
421 extrd,u $nm0,31,32,$hi1
422 stw,ma $nm0,8($tp) ; tp[j-1]
423
424 addib,= -1,$num,L\$outerdone ; i--
425 subi 0,$arrsz,$idx ; j=0
426___
427$code.=<<___ if ($BN_SZ==4);
428 fldws,ma 4($bp),${fbi} ; bp[i]
429___
430$code.=<<___ if ($BN_SZ==8);
431 ldi 12,$ti0 ; bp[i] in flipped word order
432 addl,ev %r0,$num,$num
433 ldi -4,$ti0
434 addl $ti0,$bp,$bp
435 fldws 0($bp),${fbi}
436___
437$code.=<<___;
438 flddx $idx($ap),${fai} ; ap[0]
439 addl $hi0,$ab1,$ab1
440 flddx $idx($np),${fni} ; np[0]
441 fldws 8($xfer),${fti}R ; tp[0]
442 addl $ti1,$ab1,$ab1
443 extrd,u $ab1,31,32,$hi0
444 extrd,u $ab1,63,32,$ab1
445
446 ldo 8($idx),$idx ; j++++
447 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
448 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
449 ldw 4($tp),$ti0 ; tp[j]
450
451 addl $hi1,$nm1,$nm1
452 fstws,mb ${fab0}L,-8($xfer) ; save high part
453 addl $ab1,$nm1,$nm1
454 extrd,u $nm1,31,32,$hi1
455 fcpy,sgl %fr0,${fti}L ; zero high part
456 fcpy,sgl %fr0,${fab0}L
457 stw $nm1,-4($tp) ; tp[j-1]
458
459 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
460 fcnvxf,dbl,dbl ${fab0},${fab0}
461 addl $hi1,$hi0,$hi0
462 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
463 addl $ti0,$hi0,$hi0
464 extrd,u $hi0,31,32,$hi1
465 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
466 stw $hi0,0($tp)
467 stw $hi1,4($tp)
468 xmpyu ${fn0},${fab0}R,${fm0}
469
470 b L\$outer
471 ldo `$LOCALS+32+4`($fp),$tp
472
473L\$outerdone
474 addl $hi0,$ab1,$ab1
475 addl $ti1,$ab1,$ab1
476 extrd,u $ab1,31,32,$hi0
477 extrd,u $ab1,63,32,$ab1
478
479 ldw 4($tp),$ti0 ; tp[j]
480
481 addl $hi1,$nm1,$nm1
482 addl $ab1,$nm1,$nm1
483 extrd,u $nm1,31,32,$hi1
484 stw $nm1,-4($tp) ; tp[j-1]
485
486 addl $hi1,$hi0,$hi0
487 addl $ti0,$hi0,$hi0
488 extrd,u $hi0,31,32,$hi1
489 stw $hi0,0($tp)
490 stw $hi1,4($tp)
491
492 ldo `$LOCALS+32`($fp),$tp
493 sub %r0,%r0,%r0 ; clear borrow
494___
495$code.=<<___ if ($BN_SZ==4);
496 ldws,ma 4($tp),$ti0
497 extru,= $rp,31,3,%r0 ; is rp 64-bit aligned?
498 b L\$sub_pa11
499 addl $tp,$arrsz,$tp
500L\$sub
501 ldwx $idx($np),$hi0
502 subb $ti0,$hi0,$hi1
503 ldwx $idx($tp),$ti0
504 addib,<> 4,$idx,L\$sub
505 stws,ma $hi1,4($rp)
506
507 subb $ti0,%r0,$hi1
508 ldo -4($tp),$tp
509___
510$code.=<<___ if ($BN_SZ==8);
511 ldd,ma 8($tp),$ti0
512L\$sub
513 ldd $idx($np),$hi0
514 shrpd $ti0,$ti0,32,$ti0 ; flip word order
515 std $ti0,-8($tp) ; save flipped value
516 sub,db $ti0,$hi0,$hi1
517 ldd,ma 8($tp),$ti0
518 addib,<> 8,$idx,L\$sub
519 std,ma $hi1,8($rp)
520
521 extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
522 sub,db $ti0,%r0,$hi1
523 ldo -8($tp),$tp
524___
525$code.=<<___;
526 and $tp,$hi1,$ap
527 andcm $rp,$hi1,$bp
528 or $ap,$bp,$np
529
530 sub $rp,$arrsz,$rp ; rewind rp
531 subi 0,$arrsz,$idx
532 ldo `$LOCALS+32`($fp),$tp
533L\$copy
534 ldd $idx($np),$hi0
535 std,ma %r0,8($tp)
536 addib,<> 8,$idx,.-8 ; L\$copy
537 std,ma $hi0,8($rp)
538___
539
540if ($BN_SZ==4) { # PA-RISC 1.1 code-path
541$ablo=$ab0;
542$abhi=$ab1;
543$nmlo0=$nm0;
544$nmhi0=$nm1;
545$nmlo1="%r9";
546$nmhi1="%r8";
547
548$code.=<<___;
549 b L\$done
550 nop
551
552 .ALIGN 8
553L\$parisc11
554#endif
555 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
556 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
557 ldw -12($xfer),$ablo
558 ldw -16($xfer),$hi0
559 ldw -4($xfer),$nmlo0
560 ldw -8($xfer),$nmhi0
561 fstds ${fab0},-16($xfer)
562 fstds ${fnm0},-8($xfer)
563
564 ldo 8($idx),$idx ; j++++
565 add $ablo,$nmlo0,$nmlo0 ; discarded
566 addc %r0,$nmhi0,$hi1
567 ldw 4($xfer),$ablo
568 ldw 0($xfer),$abhi
569 nop
570
571L\$1st_pa11
572 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
573 flddx $idx($ap),${fai} ; ap[j,j+1]
574 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
575 flddx $idx($np),${fni} ; np[j,j+1]
576 add $hi0,$ablo,$ablo
577 ldw 12($xfer),$nmlo1
578 addc %r0,$abhi,$hi0
579 ldw 8($xfer),$nmhi1
580 add $ablo,$nmlo1,$nmlo1
581 fstds ${fab1},0($xfer)
582 addc %r0,$nmhi1,$nmhi1
583 fstds ${fnm1},8($xfer)
584 add $hi1,$nmlo1,$nmlo1
585 ldw -12($xfer),$ablo
586 addc %r0,$nmhi1,$hi1
587 ldw -16($xfer),$abhi
588
589 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
590 ldw -4($xfer),$nmlo0
591 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
592 ldw -8($xfer),$nmhi0
593 add $hi0,$ablo,$ablo
594 stw $nmlo1,-4($tp) ; tp[j-1]
595 addc %r0,$abhi,$hi0
596 fstds ${fab0},-16($xfer)
597 add $ablo,$nmlo0,$nmlo0
598 fstds ${fnm0},-8($xfer)
599 addc %r0,$nmhi0,$nmhi0
600 ldw 0($xfer),$abhi
601 add $hi1,$nmlo0,$nmlo0
602 ldw 4($xfer),$ablo
603 stws,ma $nmlo0,8($tp) ; tp[j-1]
604 addib,<> 8,$idx,L\$1st_pa11 ; j++++
605 addc %r0,$nmhi0,$hi1
606
607 ldw 8($xfer),$nmhi1
608 ldw 12($xfer),$nmlo1
609 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
610 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
611 add $hi0,$ablo,$ablo
612 fstds ${fab1},0($xfer)
613 addc %r0,$abhi,$hi0
614 fstds ${fnm1},8($xfer)
615 add $ablo,$nmlo1,$nmlo1
616 ldw -16($xfer),$abhi
617 addc %r0,$nmhi1,$nmhi1
618 ldw -12($xfer),$ablo
619 add $hi1,$nmlo1,$nmlo1
620 ldw -8($xfer),$nmhi0
621 addc %r0,$nmhi1,$hi1
622 ldw -4($xfer),$nmlo0
623
624 add $hi0,$ablo,$ablo
625 stw $nmlo1,-4($tp) ; tp[j-1]
626 addc %r0,$abhi,$hi0
627 ldw 0($xfer),$abhi
628 add $ablo,$nmlo0,$nmlo0
629 ldw 4($xfer),$ablo
630 addc %r0,$nmhi0,$nmhi0
631 ldws,mb 8($xfer),$nmhi1
632 add $hi1,$nmlo0,$nmlo0
633 ldw 4($xfer),$nmlo1
634 addc %r0,$nmhi0,$hi1
635 stws,ma $nmlo0,8($tp) ; tp[j-1]
636
637 ldo -1($num),$num ; i--
638 subi 0,$arrsz,$idx ; j=0
639
640 fldws,ma 4($bp),${fbi} ; bp[1]
641 flddx $idx($ap),${fai} ; ap[0,1]
642 flddx $idx($np),${fni} ; np[0,1]
643 fldws 8($xfer),${fti}R ; tp[0]
644 add $hi0,$ablo,$ablo
645 addc %r0,$abhi,$hi0
646 ldo 8($idx),$idx ; j++++
647 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
648 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
649 add $hi1,$nmlo1,$nmlo1
650 addc %r0,$nmhi1,$nmhi1
651 add $ablo,$nmlo1,$nmlo1
652 addc %r0,$nmhi1,$hi1
653 fstws,mb ${fab0}L,-8($xfer) ; save high part
654 stw $nmlo1,-4($tp) ; tp[j-1]
655
656 fcpy,sgl %fr0,${fti}L ; zero high part
657 fcpy,sgl %fr0,${fab0}L
658 add $hi1,$hi0,$hi0
659 addc %r0,%r0,$hi1
660 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
661 fcnvxf,dbl,dbl ${fab0},${fab0}
662 stw $hi0,0($tp)
663 stw $hi1,4($tp)
664
665 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
666 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
667 xmpyu ${fn0},${fab0}R,${fm0}
668 ldo `$LOCALS+32+4`($fp),$tp
669L\$outer_pa11
670 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
671 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
672 fstds ${fab0},-16($xfer) ; 33-bit value
673 fstds ${fnm0},-8($xfer)
674 flddx $idx($ap),${fai} ; ap[2,3]
675 flddx $idx($np),${fni} ; np[2,3]
676 ldw -16($xfer),$abhi ; carry bit actually
677 ldo 8($idx),$idx ; j++++
678 ldw -12($xfer),$ablo
679 ldw -8($xfer),$nmhi0
680 ldw -4($xfer),$nmlo0
681 ldw 0($xfer),$hi0 ; high part
682
683 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
684 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
685 fstds ${fab1},0($xfer)
686 addl $abhi,$hi0,$hi0 ; account carry bit
687 fstds ${fnm1},8($xfer)
688 add $ablo,$nmlo0,$nmlo0 ; discarded
689 ldw 0($tp),$ti1 ; tp[1]
690 addc %r0,$nmhi0,$hi1
691 fstds ${fab0},-16($xfer)
692 fstds ${fnm0},-8($xfer)
693 ldw 4($xfer),$ablo
694 ldw 0($xfer),$abhi
695
696L\$inner_pa11
697 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
698 flddx $idx($ap),${fai} ; ap[j,j+1]
699 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
700 flddx $idx($np),${fni} ; np[j,j+1]
701 add $hi0,$ablo,$ablo
702 ldw 4($tp),$ti0 ; tp[j]
703 addc %r0,$abhi,$abhi
704 ldw 12($xfer),$nmlo1
705 add $ti1,$ablo,$ablo
706 ldw 8($xfer),$nmhi1
707 addc %r0,$abhi,$hi0
708 fstds ${fab1},0($xfer)
709 add $ablo,$nmlo1,$nmlo1
710 fstds ${fnm1},8($xfer)
711 addc %r0,$nmhi1,$nmhi1
712 ldw -12($xfer),$ablo
713 add $hi1,$nmlo1,$nmlo1
714 ldw -16($xfer),$abhi
715 addc %r0,$nmhi1,$hi1
716
717 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
718 ldw 8($tp),$ti1 ; tp[j]
719 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
720 ldw -4($xfer),$nmlo0
721 add $hi0,$ablo,$ablo
722 ldw -8($xfer),$nmhi0
723 addc %r0,$abhi,$abhi
724 stw $nmlo1,-4($tp) ; tp[j-1]
725 add $ti0,$ablo,$ablo
726 fstds ${fab0},-16($xfer)
727 addc %r0,$abhi,$hi0
728 fstds ${fnm0},-8($xfer)
729 add $ablo,$nmlo0,$nmlo0
730 ldw 4($xfer),$ablo
731 addc %r0,$nmhi0,$nmhi0
732 ldw 0($xfer),$abhi
733 add $hi1,$nmlo0,$nmlo0
734 stws,ma $nmlo0,8($tp) ; tp[j-1]
735 addib,<> 8,$idx,L\$inner_pa11 ; j++++
736 addc %r0,$nmhi0,$hi1
737
738 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
739 ldw 12($xfer),$nmlo1
740 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
741 ldw 8($xfer),$nmhi1
742 add $hi0,$ablo,$ablo
743 ldw 4($tp),$ti0 ; tp[j]
744 addc %r0,$abhi,$abhi
745 fstds ${fab1},0($xfer)
746 add $ti1,$ablo,$ablo
747 fstds ${fnm1},8($xfer)
748 addc %r0,$abhi,$hi0
749 ldw -16($xfer),$abhi
750 add $ablo,$nmlo1,$nmlo1
751 ldw -12($xfer),$ablo
752 addc %r0,$nmhi1,$nmhi1
753 ldw -8($xfer),$nmhi0
754 add $hi1,$nmlo1,$nmlo1
755 ldw -4($xfer),$nmlo0
756 addc %r0,$nmhi1,$hi1
757
758 add $hi0,$ablo,$ablo
759 stw $nmlo1,-4($tp) ; tp[j-1]
760 addc %r0,$abhi,$abhi
761 add $ti0,$ablo,$ablo
762 ldw 8($tp),$ti1 ; tp[j]
763 addc %r0,$abhi,$hi0
764 ldw 0($xfer),$abhi
765 add $ablo,$nmlo0,$nmlo0
766 ldw 4($xfer),$ablo
767 addc %r0,$nmhi0,$nmhi0
768 ldws,mb 8($xfer),$nmhi1
769 add $hi1,$nmlo0,$nmlo0
770 ldw 4($xfer),$nmlo1
771 addc %r0,$nmhi0,$hi1
772 stws,ma $nmlo0,8($tp) ; tp[j-1]
773
774 addib,= -1,$num,L\$outerdone_pa11; i--
775 subi 0,$arrsz,$idx ; j=0
776
777 fldws,ma 4($bp),${fbi} ; bp[i]
778 flddx $idx($ap),${fai} ; ap[0]
779 add $hi0,$ablo,$ablo
780 addc %r0,$abhi,$abhi
781 flddx $idx($np),${fni} ; np[0]
782 fldws 8($xfer),${fti}R ; tp[0]
783 add $ti1,$ablo,$ablo
784 addc %r0,$abhi,$hi0
785
786 ldo 8($idx),$idx ; j++++
787 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
788 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
789 ldw 4($tp),$ti0 ; tp[j]
790
791 add $hi1,$nmlo1,$nmlo1
792 addc %r0,$nmhi1,$nmhi1
793 fstws,mb ${fab0}L,-8($xfer) ; save high part
794 add $ablo,$nmlo1,$nmlo1
795 addc %r0,$nmhi1,$hi1
796 fcpy,sgl %fr0,${fti}L ; zero high part
797 fcpy,sgl %fr0,${fab0}L
798 stw $nmlo1,-4($tp) ; tp[j-1]
799
800 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
801 fcnvxf,dbl,dbl ${fab0},${fab0}
802 add $hi1,$hi0,$hi0
803 addc %r0,%r0,$hi1
804 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
805 add $ti0,$hi0,$hi0
806 addc %r0,$hi1,$hi1
807 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
808 stw $hi0,0($tp)
809 stw $hi1,4($tp)
810 xmpyu ${fn0},${fab0}R,${fm0}
811
812 b L\$outer_pa11
813 ldo `$LOCALS+32+4`($fp),$tp
814
815L\$outerdone_pa11
816 add $hi0,$ablo,$ablo
817 addc %r0,$abhi,$abhi
818 add $ti1,$ablo,$ablo
819 addc %r0,$abhi,$hi0
820
821 ldw 4($tp),$ti0 ; tp[j]
822
823 add $hi1,$nmlo1,$nmlo1
824 addc %r0,$nmhi1,$nmhi1
825 add $ablo,$nmlo1,$nmlo1
826 addc %r0,$nmhi1,$hi1
827 stw $nmlo1,-4($tp) ; tp[j-1]
828
829 add $hi1,$hi0,$hi0
830 addc %r0,%r0,$hi1
831 add $ti0,$hi0,$hi0
832 addc %r0,$hi1,$hi1
833 stw $hi0,0($tp)
834 stw $hi1,4($tp)
835
836 ldo `$LOCALS+32+4`($fp),$tp
837 sub %r0,%r0,%r0 ; clear borrow
838 ldw -4($tp),$ti0
839 addl $tp,$arrsz,$tp
840L\$sub_pa11
841 ldwx $idx($np),$hi0
842 subb $ti0,$hi0,$hi1
843 ldwx $idx($tp),$ti0
844 addib,<> 4,$idx,L\$sub_pa11
845 stws,ma $hi1,4($rp)
846
847 subb $ti0,%r0,$hi1
848 ldo -4($tp),$tp
849 and $tp,$hi1,$ap
850 andcm $rp,$hi1,$bp
851 or $ap,$bp,$np
852
853 sub $rp,$arrsz,$rp ; rewind rp
854 subi 0,$arrsz,$idx
855 ldo `$LOCALS+32`($fp),$tp
856L\$copy_pa11
857 ldwx $idx($np),$hi0
858 stws,ma %r0,4($tp)
859 addib,<> 4,$idx,L\$copy_pa11
860 stws,ma $hi0,4($rp)
861
862 nop ; alignment
863L\$done
864___
865}
866
867$code.=<<___;
868 ldi 1,%r28 ; signal "handled"
869 ldo $FRAME($fp),%sp ; destroy tp[num+1]
870
871 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
872 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
873 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
874 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
875 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
876 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
877 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
878 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
879L\$abort
880 bv (%r2)
881 .EXIT
882 $POPMB -$FRAME(%sp),%r3
883 .PROCEND
884
885 .data
886 .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
887___
888
889# Explicitly encode PA-RISC 2.0 instructions used in this module, so
890# that it can be compiled with .LEVEL 1.0. It should be noted that I
891# wouldn't have to do this if GNU assembler understood the .ALLOW 2.0
892# directive...
893
894my $ldd = sub {
895 my ($mod,$args) = @_;
896 my $orig = "ldd$mod\t$args";
897
898 if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
899 { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
900 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
901 }
902 elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
903 { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
904 $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
905 $opcode|=(1<<5) if ($mod =~ /^,m/);
906 $opcode|=(1<<13) if ($mod =~ /^,mb/);
907 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
908 }
909 else { "\t".$orig; }
910};
911
912my $std = sub {
913 my ($mod,$args) = @_;
914 my $orig = "std$mod\t$args";
915
916 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6
917 { my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
918 $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset
919 $opcode|=(1<<5) if ($mod =~ /^,m/);
920 $opcode|=(1<<13) if ($mod =~ /^,mb/);
921 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
922 }
923 else { "\t".$orig; }
924};
925
926my $extrd = sub {
927 my ($mod,$args) = @_;
928 my $orig = "extrd$mod\t$args";
929
930 # I only have ",u" completer, it's implicitly encoded...
931 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
932 { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
933 my $len=32-$3;
934 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
935 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
936 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
937 }
938 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
939 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
940 my $len=32-$2;
941 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
942 $opcode |= (1<<13) if ($mod =~ /,\**=/);
943 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
944 }
945 else { "\t".$orig; }
946};
947
948my $shrpd = sub {
949 my ($mod,$args) = @_;
950 my $orig = "shrpd$mod\t$args";
951
952 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
953 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
954 my $cpos=63-$3;
955 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
956 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
957 }
958 else { "\t".$orig; }
959};
960
961my $sub = sub {
962 my ($mod,$args) = @_;
963 my $orig = "sub$mod\t$args";
964
965 if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
966 my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
967 $opcode|=(1<<10); # e1
968 $opcode|=(1<<8); # e2
969 $opcode|=(1<<5); # d
970 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
971 }
972 else { "\t".$orig; }
973};
974
975sub assemble {
976 my ($mnemonic,$mod,$args)=@_;
977 my $opcode = eval("\$$mnemonic");
978
979 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
980}
981
982foreach (split("\n",$code)) {
983 s/\`([^\`]*)\`/eval $1/ge;
984 # flip word order in 64-bit mode...
985 s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
986 # assemble 2.0 instructions in 32-bit mode...
987 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
988
989 s/\bbv\b/bve/gm if ($SIZE_T==8);
990
991 print $_,"\n";
992}
993close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/ppc-mont.pl b/src/lib/libcrypto/bn/asm/ppc-mont.pl
deleted file mode 100644
index f9b6992ccc..0000000000
--- a/src/lib/libcrypto/bn/asm/ppc-mont.pl
+++ /dev/null
@@ -1,334 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# April 2006
11
12# "Teaser" Montgomery multiplication module for PowerPC. It's possible
13# to gain a bit more by modulo-scheduling the outer loop; a dedicated
14# squaring procedure should give a further 20%, and the code can be
15# adapted for a 32-bit application running on a 64-bit CPU. As for the
16# latter: it won't achieve "native" 64-bit performance, because in
17# 32-bit application context every addc instruction will have to be
18# expanded as addc, twice right shift by 32 and finally adde, etc.
19# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
20# for 64-bit application running on PPC970/G5 is:
21#
22# 512-bit +65%
23# 1024-bit +35%
24# 2048-bit +18%
25# 4096-bit +4%
26
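#
# For orientation, a hedged C reference of the word-by-word Montgomery
# loop this module implements (32-bit words for clarity; the name and
# signature are illustrative, and the final conditional subtraction of
# np is omitted):
#
#	void mont_mul_ref(uint32_t *rp, const uint32_t *ap,
#	    const uint32_t *bp, const uint32_t *np, uint32_t n0, int num)
#	{
#		uint32_t tp[num + 2];	/* C99 VLA, top two words for overflow */
#		uint64_t c;
#		int i, j;
#
#		for (j = 0; j < num + 2; j++)
#			tp[j] = 0;
#		for (i = 0; i < num; i++) {
#			uint32_t m1;
#			c = 0;
#			for (j = 0; j < num; j++) {	/* tp += ap*bp[i] */
#				c += (uint64_t)ap[j] * bp[i] + tp[j];
#				tp[j] = (uint32_t)c; c >>= 32;
#			}
#			c += tp[num];
#			tp[num] = (uint32_t)c;
#			tp[num + 1] = (uint32_t)(c >> 32);
#			m1 = tp[0] * n0;	/* n0 = -np[0]^-1 mod 2^32 */
#			c = ((uint64_t)np[0] * m1 + tp[0]) >> 32;
#			for (j = 1; j < num; j++) {	/* tp = (tp+np*m1)/2^32 */
#				c += (uint64_t)np[j] * m1 + tp[j];
#				tp[j - 1] = (uint32_t)c; c >>= 32;
#			}
#			c += tp[num];
#			tp[num - 1] = (uint32_t)c;
#			tp[num] = tp[num + 1] + (uint32_t)(c >> 32);
#			tp[num + 1] = 0;
#		}
#		for (i = 0; i < num; i++)
#			rp[i] = tp[i];
#	}
#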
27$flavour = shift;
28
29if ($flavour =~ /32/) {
30 $BITS= 32;
31 $BNSZ= $BITS/8;
32 $SIZE_T=4;
33 $RZONE= 224;
34
35 $LD= "lwz"; # load
36 $LDU= "lwzu"; # load and update
37 $LDX= "lwzx"; # load indexed
38 $ST= "stw"; # store
39 $STU= "stwu"; # store and update
40 $STX= "stwx"; # store indexed
41 $STUX= "stwux"; # store indexed and update
42 $UMULL= "mullw"; # unsigned multiply low
43 $UMULH= "mulhwu"; # unsigned multiply high
44 $UCMP= "cmplw"; # unsigned compare
45 $SHRI= "srwi"; # unsigned shift right by immediate
46 $PUSH= $ST;
47 $POP= $LD;
48} elsif ($flavour =~ /64/) {
49 $BITS= 64;
50 $BNSZ= $BITS/8;
51 $SIZE_T=8;
52 $RZONE= 288;
53
54 # same as above, but 64-bit mnemonics...
55 $LD= "ld"; # load
56 $LDU= "ldu"; # load and update
57 $LDX= "ldx"; # load indexed
58 $ST= "std"; # store
59 $STU= "stdu"; # store and update
60 $STX= "stdx"; # store indexed
61 $STUX= "stdux"; # store indexed and update
62 $UMULL= "mulld"; # unsigned multiply low
63 $UMULH= "mulhdu"; # unsigned multiply high
64 $UCMP= "cmpld"; # unsigned compare
65 $SHRI= "srdi"; # unsigned shift right by immediate
66 $PUSH= $ST;
67 $POP= $LD;
68} else { die "nonsense $flavour"; }
69
70$FRAME=8*$SIZE_T+$RZONE;
71$LOCALS=8*$SIZE_T;
72
73$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
74( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
75( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
76die "can't locate ppc-xlate.pl";
77
78open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
79
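# (Note: everything printed below is piped through ppc-xlate.pl, which
# rewrites the ABI-neutral mnemonics and directives for the requested
# flavour, e.g. 32- or 64-bit Linux, AIX or OS X.)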
80$sp="r1";
81$toc="r2";
82$rp="r3"; $ovf="r3";
83$ap="r4";
84$bp="r5";
85$np="r6";
86$n0="r7";
87$num="r8";
88$rp="r9"; # $rp is reassigned
89$aj="r10";
90$nj="r11";
91$tj="r12";
92# non-volatile registers
93$i="r20";
94$j="r21";
95$tp="r22";
96$m0="r23";
97$m1="r24";
98$lo0="r25";
99$hi0="r26";
100$lo1="r27";
101$hi1="r28";
102$alo="r29";
103$ahi="r30";
104$nlo="r31";
105#
106$nhi="r0";
107
108$code=<<___;
109.machine "any"
110.text
111
112.globl .bn_mul_mont_int
113.align 4
114.bn_mul_mont_int:
115 cmpwi $num,4
116 mr $rp,r3 ; $rp is reassigned
117 li r3,0
118 bltlr
119___
120$code.=<<___ if ($BNSZ==4);
121 cmpwi $num,32 ; longer key performance is not better
122 bgelr
123___
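# One iteration of the Louter loop below computes, in effect,
# tp = (tp + ap*bp[i] + np*m1) / 2^BITS, with m1 chosen so that the
# division is exact. A minimal Perl model of that step (illustrative
# only, never called by the generator; arrays hold $w-bit words, least
# significant word first, with tp pre-zeroed before the first call;
# keep $w <= 15 so that native integer arithmetic cannot overflow):
sub mont_outer_ref {
	my ($tp, $ap, $np, $bi, $n0, $num, $w) = @_;
	my $mask = (1 << $w) - 1;
	# n0 = -np[0]^-1 mod 2^w, so m1 makes the low word of the sum vanish
	my $m1 = (($tp->[0] + $ap->[0]*$bi) * $n0) & $mask;
	my $carry = 0;
	for my $j (0 .. $num-1) {
		my $t = $tp->[$j] + $ap->[$j]*$bi + $np->[$j]*$m1 + $carry;
		$tp->[$j-1] = $t & $mask if $j;	# shift result down one word
		$carry = $t >> $w;
	}
	$carry += $tp->[$num] || 0;		# previous upmost overflow word
	$tp->[$num-1] = $carry & $mask;
	$tp->[$num] = $carry >> $w;		# new upmost overflow
}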
124$code.=<<___;
125 slwi $num,$num,`log($BNSZ)/log(2)`
126 li $tj,-4096
127 addi $ovf,$num,$FRAME
128 subf $ovf,$ovf,$sp ; $sp-$ovf
129 and $ovf,$ovf,$tj ; minimize TLB usage
130 subf $ovf,$sp,$ovf ; $ovf-$sp
131 mr $tj,$sp
132 srwi $num,$num,`log($BNSZ)/log(2)`
133 $STUX $sp,$sp,$ovf
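	; (the li/addi/subf/and/subf sequence above computes the negative
	; offset ovf = ((sp - (num*BNSZ + FRAME)) & ~4095) - sp, dropping
	; the new frame to a 4096-byte boundary so that the temporary
	; vector tp[] spans as few pages, and hence TLB entries, as possible)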
134
135 $PUSH r20,`-12*$SIZE_T`($tj)
136 $PUSH r21,`-11*$SIZE_T`($tj)
137 $PUSH r22,`-10*$SIZE_T`($tj)
138 $PUSH r23,`-9*$SIZE_T`($tj)
139 $PUSH r24,`-8*$SIZE_T`($tj)
140 $PUSH r25,`-7*$SIZE_T`($tj)
141 $PUSH r26,`-6*$SIZE_T`($tj)
142 $PUSH r27,`-5*$SIZE_T`($tj)
143 $PUSH r28,`-4*$SIZE_T`($tj)
144 $PUSH r29,`-3*$SIZE_T`($tj)
145 $PUSH r30,`-2*$SIZE_T`($tj)
146 $PUSH r31,`-1*$SIZE_T`($tj)
147
148 $LD $n0,0($n0) ; pull n0[0] value
149 addi $num,$num,-2 ; adjust $num for counter register
150
151 $LD $m0,0($bp) ; m0=bp[0]
152 $LD $aj,0($ap) ; ap[0]
153 addi $tp,$sp,$LOCALS
154 $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
155 $UMULH $hi0,$aj,$m0
156
157 $LD $aj,$BNSZ($ap) ; ap[1]
158 $LD $nj,0($np) ; np[0]
159
160 $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0
161
162 $UMULL $alo,$aj,$m0 ; ap[1]*bp[0]
163 $UMULH $ahi,$aj,$m0
164
165 $UMULL $lo1,$nj,$m1 ; np[0]*m1
166 $UMULH $hi1,$nj,$m1
167 $LD $nj,$BNSZ($np) ; np[1]
168 addc $lo1,$lo1,$lo0
169 addze $hi1,$hi1
170
171 $UMULL $nlo,$nj,$m1 ; np[1]*m1
172 $UMULH $nhi,$nj,$m1
173
174 mtctr $num
175 li $j,`2*$BNSZ`
176.align 4
177L1st:
178 $LDX $aj,$ap,$j ; ap[j]
179 addc $lo0,$alo,$hi0
180 $LDX $nj,$np,$j ; np[j]
181 addze $hi0,$ahi
182 $UMULL $alo,$aj,$m0 ; ap[j]*bp[0]
183 addc $lo1,$nlo,$hi1
184 $UMULH $ahi,$aj,$m0
185 addze $hi1,$nhi
186 $UMULL $nlo,$nj,$m1 ; np[j]*m1
187 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
188 $UMULH $nhi,$nj,$m1
189 addze $hi1,$hi1
190 $ST $lo1,0($tp) ; tp[j-1]
191
192 addi $j,$j,$BNSZ ; j++
193 addi $tp,$tp,$BNSZ ; tp++
194 bdnz- L1st
195;L1st
196 addc $lo0,$alo,$hi0
197 addze $hi0,$ahi
198
199 addc $lo1,$nlo,$hi1
200 addze $hi1,$nhi
201 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
202 addze $hi1,$hi1
203 $ST $lo1,0($tp) ; tp[j-1]
204
205 li $ovf,0
206 addc $hi1,$hi1,$hi0
207 addze $ovf,$ovf ; upmost overflow bit
208 $ST $hi1,$BNSZ($tp)
209
210 li $i,$BNSZ
211.align 4
212Louter:
213 $LDX $m0,$bp,$i ; m0=bp[i]
214 $LD $aj,0($ap) ; ap[0]
215 addi $tp,$sp,$LOCALS
216 $LD $tj,$LOCALS($sp); tp[0]
217 $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
218 $UMULH $hi0,$aj,$m0
219 $LD $aj,$BNSZ($ap) ; ap[1]
220 $LD $nj,0($np) ; np[0]
221 addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0]
222 $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
223 addze $hi0,$hi0
224 $UMULL $m1,$lo0,$n0 ; tp[0]*n0
225 $UMULH $ahi,$aj,$m0
226 $UMULL $lo1,$nj,$m1 ; np[0]*m1
227 $UMULH $hi1,$nj,$m1
228 $LD $nj,$BNSZ($np) ; np[1]
229 addc $lo1,$lo1,$lo0
230 $UMULL $nlo,$nj,$m1 ; np[1]*m1
231 addze $hi1,$hi1
232 $UMULH $nhi,$nj,$m1
233
234 mtctr $num
235 li $j,`2*$BNSZ`
236.align 4
237Linner:
238 $LDX $aj,$ap,$j ; ap[j]
239 addc $lo0,$alo,$hi0
240 $LD $tj,$BNSZ($tp) ; tp[j]
241 addze $hi0,$ahi
242 $LDX $nj,$np,$j ; np[j]
243 addc $lo1,$nlo,$hi1
244 $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
245 addze $hi1,$nhi
246 $UMULH $ahi,$aj,$m0
247 addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
248 $UMULL $nlo,$nj,$m1 ; np[j]*m1
249 addze $hi0,$hi0
250 $UMULH $nhi,$nj,$m1
251 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
252 addi $j,$j,$BNSZ ; j++
253 addze $hi1,$hi1
254 $ST $lo1,0($tp) ; tp[j-1]
255 addi $tp,$tp,$BNSZ ; tp++
256 bdnz- Linner
257;Linner
258 $LD $tj,$BNSZ($tp) ; tp[j]
259 addc $lo0,$alo,$hi0
260 addze $hi0,$ahi
261 addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
262 addze $hi0,$hi0
263
264 addc $lo1,$nlo,$hi1
265 addze $hi1,$nhi
266 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
267 addze $hi1,$hi1
268 $ST $lo1,0($tp) ; tp[j-1]
269
270 addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA]
271 li $ovf,0
272 adde $hi1,$hi1,$hi0
273 addze $ovf,$ovf
274 $ST $hi1,$BNSZ($tp)
275;
276 slwi $tj,$num,`log($BNSZ)/log(2)`
277 $UCMP $i,$tj
278 addi $i,$i,$BNSZ
279 ble- Louter
280
281 addi $num,$num,2 ; restore $num
282 subfc $j,$j,$j ; j=0 and "clear" XER[CA]
283 addi $tp,$sp,$LOCALS
284 mtctr $num
285
286.align 4
287Lsub: $LDX $tj,$tp,$j
288 $LDX $nj,$np,$j
289 subfe $aj,$nj,$tj ; tp[j]-np[j]
290 $STX $aj,$rp,$j
291 addi $j,$j,$BNSZ
292 bdnz- Lsub
293
294 li $j,0
295 mtctr $num
296 subfe $ovf,$j,$ovf ; handle upmost overflow bit
297 and $ap,$tp,$ovf
298 andc $np,$rp,$ovf
299 or $ap,$ap,$np ; ap=borrow?tp:rp
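	; (given tp < 2*np, ovf at this point is either 0 or all ones, so
	; the and/andc/or triplet above is a branch-free select of tp or rp)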
300
301.align 4
302Lcopy: ; copy or in-place refresh
303 $LDX $tj,$ap,$j
304 $STX $tj,$rp,$j
305 $STX $j,$tp,$j ; zap at once
306 addi $j,$j,$BNSZ
307 bdnz- Lcopy
308
309 $POP $tj,0($sp)
310 li r3,1
311 $POP r20,`-12*$SIZE_T`($tj)
312 $POP r21,`-11*$SIZE_T`($tj)
313 $POP r22,`-10*$SIZE_T`($tj)
314 $POP r23,`-9*$SIZE_T`($tj)
315 $POP r24,`-8*$SIZE_T`($tj)
316 $POP r25,`-7*$SIZE_T`($tj)
317 $POP r26,`-6*$SIZE_T`($tj)
318 $POP r27,`-5*$SIZE_T`($tj)
319 $POP r28,`-4*$SIZE_T`($tj)
320 $POP r29,`-3*$SIZE_T`($tj)
321 $POP r30,`-2*$SIZE_T`($tj)
322 $POP r31,`-1*$SIZE_T`($tj)
323 mr $sp,$tj
324 blr
325 .long 0
326 .byte 0,12,4,0,0x80,12,6,0
327 .long 0
328
329.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
330___
331
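# evaluate the backquoted expressions (such as `-12*$SIZE_T`) embedded
# in the code, now that all parameters are known, and emit the result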
332$code =~ s/\`([^\`]*)\`/eval $1/gem;
333print $code;
334close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/ppc.pl b/src/lib/libcrypto/bn/asm/ppc.pl
deleted file mode 100644
index 1249ce2299..0000000000
--- a/src/lib/libcrypto/bn/asm/ppc.pl
+++ /dev/null
@@ -1,1998 +0,0 @@
1#!/usr/bin/env perl
2#
3# Implemented as a Perl wrapper as we want to support several different
4# architectures with a single file. We pick the target based on the
5# name of the file we are asked to generate.
6#
7# It should be noted though that this perl code is nothing like
8# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
9# as a pre-processor to cover platform differences in name decoration,
10# linker tables, 32-/64-bit instruction sets...
11#
12# As you might know there are several PowerPC ABIs in use. Most notably
13# Linux and AIX use different 32-bit ABIs. The good news is that these
14# ABIs are similar enough to implement leaf(!) functions that are ABI
15# neutral. And that's what you find here: ABI-neutral leaf functions.
16# In case you wonder what that is...
17#
18# AIX performance
19#
20# MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
21#
22# The following is the performance of 32-bit compiler
23# generated code:
24#
25# OpenSSL 0.9.6c 21 dec 2001
26# built on: Tue Jun 11 11:06:51 EDT 2002
27# options:bn(64,32) ...
28#compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
29# sign verify sign/s verify/s
30#rsa 512 bits 0.0098s 0.0009s 102.0 1170.6
31#rsa 1024 bits 0.0507s 0.0026s 19.7 387.5
32#rsa 2048 bits 0.3036s 0.0085s 3.3 117.1
33#rsa 4096 bits 2.0040s 0.0299s 0.5 33.4
34#dsa 512 bits 0.0087s 0.0106s 114.3 94.5
35#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
36#
37# Same benchmark with this assembler code:
38#
39#rsa 512 bits 0.0056s 0.0005s 178.6 2049.2
40#rsa 1024 bits 0.0283s 0.0015s 35.3 674.1
41#rsa 2048 bits 0.1744s 0.0050s 5.7 201.2
42#rsa 4096 bits 1.1644s 0.0179s 0.9 55.7
43#dsa 512 bits 0.0052s 0.0062s 191.6 162.0
44#dsa 1024 bits 0.0149s 0.0180s 67.0 55.5
45#
46# Number of operations increases by almost 75%
47#
48# Here are performance numbers for 64-bit compiler
49# generated code:
50#
51# OpenSSL 0.9.6g [engine] 9 Aug 2002
52# built on: Fri Apr 18 16:59:20 EDT 2003
53# options:bn(64,64) ...
54# compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
55# sign verify sign/s verify/s
56#rsa 512 bits 0.0028s 0.0003s 357.1 3844.4
57#rsa 1024 bits 0.0148s 0.0008s 67.5 1239.7
58#rsa 2048 bits 0.0963s 0.0028s 10.4 353.0
59#rsa 4096 bits 0.6538s 0.0102s 1.5 98.1
60#dsa 512 bits 0.0026s 0.0032s 382.5 313.7
61#dsa 1024 bits 0.0081s 0.0099s 122.8 100.6
62#
63# Same benchmark with this assembler code:
64#
65#rsa 512 bits 0.0020s 0.0002s 510.4 6273.7
66#rsa 1024 bits 0.0088s 0.0005s 114.1 2128.3
67#rsa 2048 bits 0.0540s 0.0016s 18.5 622.5
68#rsa 4096 bits 0.3700s 0.0058s 2.7 171.0
69#dsa 512 bits 0.0016s 0.0020s 610.7 507.1
70#dsa 1024 bits 0.0047s 0.0058s 212.5 173.2
71#
72# Again, performance increases by about 75%
73#
74# Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
75# OpenSSL 0.9.7c 30 Sep 2003
76#
77# Original code.
78#
79#rsa 512 bits 0.0011s 0.0001s 906.1 11012.5
80#rsa 1024 bits 0.0060s 0.0003s 166.6 3363.1
81#rsa 2048 bits 0.0370s 0.0010s 27.1 982.4
82#rsa 4096 bits 0.2426s 0.0036s 4.1 280.4
83#dsa 512 bits 0.0010s 0.0012s 1038.1 841.5
84#dsa 1024 bits 0.0030s 0.0037s 329.6 269.7
85#dsa 2048 bits 0.0101s 0.0127s 98.9 78.6
86#
87# Same benchmark with this assembler code:
88#
89#rsa 512 bits 0.0007s 0.0001s 1416.2 16645.9
90#rsa 1024 bits 0.0036s 0.0002s 274.4 5380.6
91#rsa 2048 bits 0.0222s 0.0006s 45.1 1589.5
92#rsa 4096 bits 0.1469s 0.0022s 6.8 449.6
93#dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2
94#dsa 1024 bits 0.0018s 0.0023s 545.0 442.2
95#dsa 2048 bits 0.0061s 0.0075s 163.5 132.8
96#
97# Performance increase of ~60%
98#
99# If you have comments or suggestions to improve the code, send
100# me a note at schari@us.ibm.com
101#
102
103$flavour = shift;
104
105if ($flavour =~ /32/) {
106 $BITS= 32;
107 $BNSZ= $BITS/8;
108 $ISA= "\"ppc\"";
109
110 $LD= "lwz"; # load
111 $LDU= "lwzu"; # load and update
112 $ST= "stw"; # store
113 $STU= "stwu"; # store and update
114 $UMULL= "mullw"; # unsigned multiply low
115 $UMULH= "mulhwu"; # unsigned multiply high
116 $UDIV= "divwu"; # unsigned divide
117 $UCMPI= "cmplwi"; # unsigned compare with immediate
118 $UCMP= "cmplw"; # unsigned compare
119 $CNTLZ= "cntlzw"; # count leading zeros
120 $SHL= "slw"; # shift left
121 $SHR= "srw"; # unsigned shift right
122 $SHRI= "srwi"; # unsigned shift right by immediate
123 $SHLI= "slwi"; # shift left by immediate
124 $CLRU= "clrlwi"; # clear upper bits
125 $INSR= "insrwi"; # insert right
126 $ROTL= "rotlwi"; # rotate left by immediate
127 $TR= "tw"; # conditional trap
128} elsif ($flavour =~ /64/) {
129 $BITS= 64;
130 $BNSZ= $BITS/8;
131 $ISA= "\"ppc64\"";
132
133 # same as above, but 64-bit mnemonics...
134 $LD= "ld"; # load
135 $LDU= "ldu"; # load and update
136 $ST= "std"; # store
137 $STU= "stdu"; # store and update
138 $UMULL= "mulld"; # unsigned multiply low
139 $UMULH= "mulhdu"; # unsigned multiply high
140 $UDIV= "divdu"; # unsigned divide
141 $UCMPI= "cmpldi"; # unsigned compare with immediate
142 $UCMP= "cmpld"; # unsigned compare
143 $CNTLZ= "cntlzd"; # count leading zeros
144 $SHL= "sld"; # shift left
145 $SHR= "srd"; # unsigned shift right
146 $SHRI= "srdi"; # unsigned shift right by immediate
147 $SHLI= "sldi"; # shift left by immediate
148 $CLRU= "clrldi"; # clear upper bits
149 $INSR= "insrdi"; # insert right
150 $ROTL= "rotldi"; # rotate left by immediate
151 $TR= "td"; # conditional trap
152} else { die "nonsense $flavour"; }
153
154$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
155( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
156( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
157die "can't locate ppc-xlate.pl";
158
159open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
160
161$data=<<EOF;
162#--------------------------------------------------------------------
163#
164#
165#
166#
167# File: ppc32.s
168#
169# Created by: Suresh Chari
170# IBM Thomas J. Watson Research Library
171# Hawthorne, NY
172#
173#
174# Description: Optimized assembly routines for OpenSSL crypto
175#		on the 32-bit PowerPC platform.
176#
177#
178# Version History
179#
180# 2. Fixed bn_add,bn_sub and bn_div_words, added comments,
181# cleaned up code. Also made a single version which can
182# be used for both the AIX and Linux compilers. See NOTE
183# below.
184# 12/05/03 Suresh Chari
185# (with lots of help from) Andy Polyakov
186##
187# 1. Initial version 10/20/02 Suresh Chari
188#
189#
190#	The following file works for the xlc, cc
191# and gcc compilers.
192#
193# NOTE: To get the file to link correctly with the gcc compiler
194# you have to change the names of the routines and remove
195# the first .(dot) character. This should automatically
196# be done in the build process.
197#
198# Hand optimized assembly code for the following routines
199#
200# bn_sqr_comba4
201# bn_sqr_comba8
202# bn_mul_comba4
203# bn_mul_comba8
204# bn_sub_words
205# bn_add_words
206# bn_div_words
207# bn_sqr_words
208# bn_mul_words
209# bn_mul_add_words
210#
211# NOTE: It is possible to optimize this code more for
212# specific PowerPC or Power architectures. On the Northstar
213# architecture the optimizations in this file do
214# NOT provide much improvement.
215#
216#	If you have comments or suggestions to improve the code, send
217#	me a note at schari\@us.ibm.com
218#
219#--------------------------------------------------------------------------
220#
221# Defines to be used in the assembly code.
222#
223#.set r0,0 # we use it as storage for value of 0
224#.set SP,1 # preserved
225#.set RTOC,2 # preserved
226#.set r3,3 # 1st argument/return value
227#.set r4,4 # 2nd argument/volatile register
228#.set r5,5 # 3rd argument/volatile register
229#.set r6,6 # ...
230#.set r7,7
231#.set r8,8
232#.set r9,9
233#.set r10,10
234#.set r11,11
235#.set r12,12
236#.set r13,13 # not used, nor any other "below" it...
237
238# Declare function names to be global
239# NOTE: For gcc these names MUST be changed to remove
240# the first . i.e. for example change ".bn_sqr_comba4"
241# to "bn_sqr_comba4". This should be automatically done
242# in the build.
243
244 .globl .bn_sqr_comba4
245 .globl .bn_sqr_comba8
246 .globl .bn_mul_comba4
247 .globl .bn_mul_comba8
248 .globl .bn_sub_words
249 .globl .bn_add_words
250 .globl .bn_div_words
251 .globl .bn_sqr_words
252 .globl .bn_mul_words
253 .globl .bn_mul_add_words
254
255# .text section
256
257 .machine "any"
258
259#
260# NOTE: The following label name should be changed to
261# "bn_sqr_comba4" i.e. remove the first dot
262# for the gcc compiler. This should be automatically
263# done in the build
264#
265
266.align 4
267.bn_sqr_comba4:
268#
269# Optimized version of bn_sqr_comba4.
270#
271# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
272# r3 contains r
273# r4 contains a
274#
275# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
276#
277# r5,r6 are the two BN_ULONGs being multiplied.
278# r7,r8 hold the 64-bit result of the 32x32-bit multiply.
279# r9,r10,r11 are the equivalents of c1,c2,c3.
280# Here's the assembly
281#
282#
283 xor r0,r0,r0 # set r0 = 0. Used in the addze
284 # instructions below
285
286 #sqr_add_c(a,0,c1,c2,c3)
287 $LD r5,`0*$BNSZ`(r4)
288 $UMULL r9,r5,r5
289 $UMULH r10,r5,r5 #in first iteration. No need
290 #to add since c1=c2=c3=0.
291 # Note c3(r11) is NOT set to 0
292					# here, but will be below.
293
294 $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
295 # sqr_add_c2(a,1,0,c2,c3,c1);
296 $LD r6,`1*$BNSZ`(r4)
297 $UMULL r7,r5,r6
298 $UMULH r8,r5,r6
299
300 addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8)
301 adde r8,r8,r8
302 addze r9,r0 # catch carry if any.
303 # r9= r0(=0) and carry
304
305 addc r10,r7,r10 # now add to temp result.
306	addze	r11,r8		# r11 = r8 + carry (r11 was 0)
307 addze r9,r9
308
309 $ST r10,`1*$BNSZ`(r3) #r[1]=c2;
310 #sqr_add_c(a,1,c3,c1,c2)
311 $UMULL r7,r6,r6
312 $UMULH r8,r6,r6
313 addc r11,r7,r11
314 adde r9,r8,r9
315 addze r10,r0
316 #sqr_add_c2(a,2,0,c3,c1,c2)
317 $LD r6,`2*$BNSZ`(r4)
318 $UMULL r7,r5,r6
319 $UMULH r8,r5,r6
320
321 addc r7,r7,r7
322 adde r8,r8,r8
323 addze r10,r10
324
325 addc r11,r7,r11
326 adde r9,r8,r9
327 addze r10,r10
328 $ST r11,`2*$BNSZ`(r3) #r[2]=c3
329 #sqr_add_c2(a,3,0,c1,c2,c3);
330 $LD r6,`3*$BNSZ`(r4)
331 $UMULL r7,r5,r6
332 $UMULH r8,r5,r6
333 addc r7,r7,r7
334 adde r8,r8,r8
335 addze r11,r0
336
337 addc r9,r7,r9
338 adde r10,r8,r10
339 addze r11,r11
340 #sqr_add_c2(a,2,1,c1,c2,c3);
341 $LD r5,`1*$BNSZ`(r4)
342 $LD r6,`2*$BNSZ`(r4)
343 $UMULL r7,r5,r6
344 $UMULH r8,r5,r6
345
346 addc r7,r7,r7
347 adde r8,r8,r8
348 addze r11,r11
349 addc r9,r7,r9
350 adde r10,r8,r10
351 addze r11,r11
352 $ST r9,`3*$BNSZ`(r3) #r[3]=c1
353 #sqr_add_c(a,2,c2,c3,c1);
354 $UMULL r7,r6,r6
355 $UMULH r8,r6,r6
356 addc r10,r7,r10
357 adde r11,r8,r11
358 addze r9,r0
359 #sqr_add_c2(a,3,1,c2,c3,c1);
360 $LD r6,`3*$BNSZ`(r4)
361 $UMULL r7,r5,r6
362 $UMULH r8,r5,r6
363 addc r7,r7,r7
364 adde r8,r8,r8
365 addze r9,r9
366
367 addc r10,r7,r10
368 adde r11,r8,r11
369 addze r9,r9
370 $ST r10,`4*$BNSZ`(r3) #r[4]=c2
371 #sqr_add_c2(a,3,2,c3,c1,c2);
372 $LD r5,`2*$BNSZ`(r4)
373 $UMULL r7,r5,r6
374 $UMULH r8,r5,r6
375 addc r7,r7,r7
376 adde r8,r8,r8
377 addze r10,r0
378
379 addc r11,r7,r11
380 adde r9,r8,r9
381 addze r10,r10
382 $ST r11,`5*$BNSZ`(r3) #r[5] = c3
383 #sqr_add_c(a,3,c1,c2,c3);
384 $UMULL r7,r6,r6
385 $UMULH r8,r6,r6
386 addc r9,r7,r9
387 adde r10,r8,r10
388
389 $ST r9,`6*$BNSZ`(r3) #r[6]=c1
390 $ST r10,`7*$BNSZ`(r3) #r[7]=c2
391 blr
392 .long 0
393 .byte 0,12,0x14,0,0,0,2,0
394 .long 0
395
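#
# (Sketch of the comba pattern used above: the square is formed column
#  by column; for i<j, sqr_add_c2 adds 2*a[i]*a[j] to the three-word
#  accumulator (c1,c2,c3), doubling the 64-bit product (r7,r8) with
#  addc/adde before folding it in, while sqr_add_c adds the diagonal
#  term a[i]*a[i] once. bn_sqr_comba8 below gets the same effect by
#  folding the product in twice instead of doubling it.)
#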
396#
397# NOTE: The following label name should be changed to
398# "bn_sqr_comba8" i.e. remove the first dot
399# for the gcc compiler. This should be automatically
400# done in the build
401#
402
403.align 4
404.bn_sqr_comba8:
405#
406# This is an optimized version of the bn_sqr_comba8 routine.
407# Tightly uses the adde instruction
408#
409#
410# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
411# r3 contains r
412# r4 contains a
413#
414# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
415#
416# r5,r6 are the two BN_ULONGs being multiplied.
417# r7,r8 hold the 64-bit result of the 32x32-bit multiply.
418# r9,r10,r11 are the equivalents of c1,c2,c3.
419#
420# A possible optimization of loading all 8 longs of a into registers
421# doesn't provide any speedup
422#
423
424	xor	r0,r0,r0	#set r0 = 0. Used in addze
425 #instructions below.
426
427 #sqr_add_c(a,0,c1,c2,c3);
428 $LD r5,`0*$BNSZ`(r4)
429 $UMULL r9,r5,r5 #1st iteration: no carries.
430 $UMULH r10,r5,r5
431 $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
432 #sqr_add_c2(a,1,0,c2,c3,c1);
433 $LD r6,`1*$BNSZ`(r4)
434 $UMULL r7,r5,r6
435 $UMULH r8,r5,r6
436
437	addc	r10,r7,r10	#add the two-register number
438	adde	r11,r8,r0	# (r8,r7) to the three-register
439	addze	r9,r0		# number (r9,r11,r10). NOTE: r0=0
440
441	addc	r10,r7,r10	#add the two-register number
442	adde	r11,r8,r11	# (r8,r7) to the three-register
443	addze	r9,r9		# number (r9,r11,r10).
444
445 $ST r10,`1*$BNSZ`(r3) # r[1]=c2
446
447 #sqr_add_c(a,1,c3,c1,c2);
448 $UMULL r7,r6,r6
449 $UMULH r8,r6,r6
450 addc r11,r7,r11
451 adde r9,r8,r9
452 addze r10,r0
453 #sqr_add_c2(a,2,0,c3,c1,c2);
454 $LD r6,`2*$BNSZ`(r4)
455 $UMULL r7,r5,r6
456 $UMULH r8,r5,r6
457
458 addc r11,r7,r11
459 adde r9,r8,r9
460 addze r10,r10
461
462 addc r11,r7,r11
463 adde r9,r8,r9
464 addze r10,r10
465
466 $ST r11,`2*$BNSZ`(r3) #r[2]=c3
467 #sqr_add_c2(a,3,0,c1,c2,c3);
468 $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0].
469 $UMULL r7,r5,r6
470 $UMULH r8,r5,r6
471
472 addc r9,r7,r9
473 adde r10,r8,r10
474 addze r11,r0
475
476 addc r9,r7,r9
477 adde r10,r8,r10
478 addze r11,r11
479 #sqr_add_c2(a,2,1,c1,c2,c3);
480 $LD r5,`1*$BNSZ`(r4)
481 $LD r6,`2*$BNSZ`(r4)
482 $UMULL r7,r5,r6
483 $UMULH r8,r5,r6
484
485 addc r9,r7,r9
486 adde r10,r8,r10
487 addze r11,r11
488
489 addc r9,r7,r9
490 adde r10,r8,r10
491 addze r11,r11
492
493 $ST r9,`3*$BNSZ`(r3) #r[3]=c1;
494 #sqr_add_c(a,2,c2,c3,c1);
495 $UMULL r7,r6,r6
496 $UMULH r8,r6,r6
497
498 addc r10,r7,r10
499 adde r11,r8,r11
500 addze r9,r0
501 #sqr_add_c2(a,3,1,c2,c3,c1);
502 $LD r6,`3*$BNSZ`(r4)
503 $UMULL r7,r5,r6
504 $UMULH r8,r5,r6
505
506 addc r10,r7,r10
507 adde r11,r8,r11
508 addze r9,r9
509
510 addc r10,r7,r10
511 adde r11,r8,r11
512 addze r9,r9
513 #sqr_add_c2(a,4,0,c2,c3,c1);
514 $LD r5,`0*$BNSZ`(r4)
515 $LD r6,`4*$BNSZ`(r4)
516 $UMULL r7,r5,r6
517 $UMULH r8,r5,r6
518
519 addc r10,r7,r10
520 adde r11,r8,r11
521 addze r9,r9
522
523 addc r10,r7,r10
524 adde r11,r8,r11
525 addze r9,r9
526 $ST r10,`4*$BNSZ`(r3) #r[4]=c2;
527 #sqr_add_c2(a,5,0,c3,c1,c2);
528 $LD r6,`5*$BNSZ`(r4)
529 $UMULL r7,r5,r6
530 $UMULH r8,r5,r6
531
532 addc r11,r7,r11
533 adde r9,r8,r9
534 addze r10,r0
535
536 addc r11,r7,r11
537 adde r9,r8,r9
538 addze r10,r10
539 #sqr_add_c2(a,4,1,c3,c1,c2);
540 $LD r5,`1*$BNSZ`(r4)
541 $LD r6,`4*$BNSZ`(r4)
542 $UMULL r7,r5,r6
543 $UMULH r8,r5,r6
544
545 addc r11,r7,r11
546 adde r9,r8,r9
547 addze r10,r10
548
549 addc r11,r7,r11
550 adde r9,r8,r9
551 addze r10,r10
552 #sqr_add_c2(a,3,2,c3,c1,c2);
553 $LD r5,`2*$BNSZ`(r4)
554 $LD r6,`3*$BNSZ`(r4)
555 $UMULL r7,r5,r6
556 $UMULH r8,r5,r6
557
558 addc r11,r7,r11
559 adde r9,r8,r9
560 addze r10,r10
561
562 addc r11,r7,r11
563 adde r9,r8,r9
564 addze r10,r10
565 $ST r11,`5*$BNSZ`(r3) #r[5]=c3;
566 #sqr_add_c(a,3,c1,c2,c3);
567 $UMULL r7,r6,r6
568 $UMULH r8,r6,r6
569 addc r9,r7,r9
570 adde r10,r8,r10
571 addze r11,r0
572 #sqr_add_c2(a,4,2,c1,c2,c3);
573 $LD r6,`4*$BNSZ`(r4)
574 $UMULL r7,r5,r6
575 $UMULH r8,r5,r6
576
577 addc r9,r7,r9
578 adde r10,r8,r10
579 addze r11,r11
580
581 addc r9,r7,r9
582 adde r10,r8,r10
583 addze r11,r11
584 #sqr_add_c2(a,5,1,c1,c2,c3);
585 $LD r5,`1*$BNSZ`(r4)
586 $LD r6,`5*$BNSZ`(r4)
587 $UMULL r7,r5,r6
588 $UMULH r8,r5,r6
589
590 addc r9,r7,r9
591 adde r10,r8,r10
592 addze r11,r11
593
594 addc r9,r7,r9
595 adde r10,r8,r10
596 addze r11,r11
597 #sqr_add_c2(a,6,0,c1,c2,c3);
598 $LD r5,`0*$BNSZ`(r4)
599 $LD r6,`6*$BNSZ`(r4)
600 $UMULL r7,r5,r6
601 $UMULH r8,r5,r6
602 addc r9,r7,r9
603 adde r10,r8,r10
604 addze r11,r11
605 addc r9,r7,r9
606 adde r10,r8,r10
607 addze r11,r11
608 $ST r9,`6*$BNSZ`(r3) #r[6]=c1;
609 #sqr_add_c2(a,7,0,c2,c3,c1);
610 $LD r6,`7*$BNSZ`(r4)
611 $UMULL r7,r5,r6
612 $UMULH r8,r5,r6
613
614 addc r10,r7,r10
615 adde r11,r8,r11
616 addze r9,r0
617 addc r10,r7,r10
618 adde r11,r8,r11
619 addze r9,r9
620 #sqr_add_c2(a,6,1,c2,c3,c1);
621 $LD r5,`1*$BNSZ`(r4)
622 $LD r6,`6*$BNSZ`(r4)
623 $UMULL r7,r5,r6
624 $UMULH r8,r5,r6
625
626 addc r10,r7,r10
627 adde r11,r8,r11
628 addze r9,r9
629 addc r10,r7,r10
630 adde r11,r8,r11
631 addze r9,r9
632 #sqr_add_c2(a,5,2,c2,c3,c1);
633 $LD r5,`2*$BNSZ`(r4)
634 $LD r6,`5*$BNSZ`(r4)
635 $UMULL r7,r5,r6
636 $UMULH r8,r5,r6
637 addc r10,r7,r10
638 adde r11,r8,r11
639 addze r9,r9
640 addc r10,r7,r10
641 adde r11,r8,r11
642 addze r9,r9
643 #sqr_add_c2(a,4,3,c2,c3,c1);
644 $LD r5,`3*$BNSZ`(r4)
645 $LD r6,`4*$BNSZ`(r4)
646 $UMULL r7,r5,r6
647 $UMULH r8,r5,r6
648
649 addc r10,r7,r10
650 adde r11,r8,r11
651 addze r9,r9
652 addc r10,r7,r10
653 adde r11,r8,r11
654 addze r9,r9
655 $ST r10,`7*$BNSZ`(r3) #r[7]=c2;
656 #sqr_add_c(a,4,c3,c1,c2);
657 $UMULL r7,r6,r6
658 $UMULH r8,r6,r6
659 addc r11,r7,r11
660 adde r9,r8,r9
661 addze r10,r0
662 #sqr_add_c2(a,5,3,c3,c1,c2);
663 $LD r6,`5*$BNSZ`(r4)
664 $UMULL r7,r5,r6
665 $UMULH r8,r5,r6
666 addc r11,r7,r11
667 adde r9,r8,r9
668 addze r10,r10
669 addc r11,r7,r11
670 adde r9,r8,r9
671 addze r10,r10
672 #sqr_add_c2(a,6,2,c3,c1,c2);
673 $LD r5,`2*$BNSZ`(r4)
674 $LD r6,`6*$BNSZ`(r4)
675 $UMULL r7,r5,r6
676 $UMULH r8,r5,r6
677 addc r11,r7,r11
678 adde r9,r8,r9
679 addze r10,r10
680
681 addc r11,r7,r11
682 adde r9,r8,r9
683 addze r10,r10
684 #sqr_add_c2(a,7,1,c3,c1,c2);
685 $LD r5,`1*$BNSZ`(r4)
686 $LD r6,`7*$BNSZ`(r4)
687 $UMULL r7,r5,r6
688 $UMULH r8,r5,r6
689 addc r11,r7,r11
690 adde r9,r8,r9
691 addze r10,r10
692 addc r11,r7,r11
693 adde r9,r8,r9
694 addze r10,r10
695 $ST r11,`8*$BNSZ`(r3) #r[8]=c3;
696 #sqr_add_c2(a,7,2,c1,c2,c3);
697 $LD r5,`2*$BNSZ`(r4)
698 $UMULL r7,r5,r6
699 $UMULH r8,r5,r6
700
701 addc r9,r7,r9
702 adde r10,r8,r10
703 addze r11,r0
704 addc r9,r7,r9
705 adde r10,r8,r10
706 addze r11,r11
707 #sqr_add_c2(a,6,3,c1,c2,c3);
708 $LD r5,`3*$BNSZ`(r4)
709 $LD r6,`6*$BNSZ`(r4)
710 $UMULL r7,r5,r6
711 $UMULH r8,r5,r6
712 addc r9,r7,r9
713 adde r10,r8,r10
714 addze r11,r11
715 addc r9,r7,r9
716 adde r10,r8,r10
717 addze r11,r11
718 #sqr_add_c2(a,5,4,c1,c2,c3);
719 $LD r5,`4*$BNSZ`(r4)
720 $LD r6,`5*$BNSZ`(r4)
721 $UMULL r7,r5,r6
722 $UMULH r8,r5,r6
723 addc r9,r7,r9
724 adde r10,r8,r10
725 addze r11,r11
726 addc r9,r7,r9
727 adde r10,r8,r10
728 addze r11,r11
729 $ST r9,`9*$BNSZ`(r3) #r[9]=c1;
730 #sqr_add_c(a,5,c2,c3,c1);
731 $UMULL r7,r6,r6
732 $UMULH r8,r6,r6
733 addc r10,r7,r10
734 adde r11,r8,r11
735 addze r9,r0
736 #sqr_add_c2(a,6,4,c2,c3,c1);
737 $LD r6,`6*$BNSZ`(r4)
738 $UMULL r7,r5,r6
739 $UMULH r8,r5,r6
740 addc r10,r7,r10
741 adde r11,r8,r11
742 addze r9,r9
743 addc r10,r7,r10
744 adde r11,r8,r11
745 addze r9,r9
746 #sqr_add_c2(a,7,3,c2,c3,c1);
747 $LD r5,`3*$BNSZ`(r4)
748 $LD r6,`7*$BNSZ`(r4)
749 $UMULL r7,r5,r6
750 $UMULH r8,r5,r6
751 addc r10,r7,r10
752 adde r11,r8,r11
753 addze r9,r9
754 addc r10,r7,r10
755 adde r11,r8,r11
756 addze r9,r9
757 $ST r10,`10*$BNSZ`(r3) #r[10]=c2;
758 #sqr_add_c2(a,7,4,c3,c1,c2);
759 $LD r5,`4*$BNSZ`(r4)
760 $UMULL r7,r5,r6
761 $UMULH r8,r5,r6
762 addc r11,r7,r11
763 adde r9,r8,r9
764 addze r10,r0
765 addc r11,r7,r11
766 adde r9,r8,r9
767 addze r10,r10
768 #sqr_add_c2(a,6,5,c3,c1,c2);
769 $LD r5,`5*$BNSZ`(r4)
770 $LD r6,`6*$BNSZ`(r4)
771 $UMULL r7,r5,r6
772 $UMULH r8,r5,r6
773 addc r11,r7,r11
774 adde r9,r8,r9
775 addze r10,r10
776 addc r11,r7,r11
777 adde r9,r8,r9
778 addze r10,r10
779 $ST r11,`11*$BNSZ`(r3) #r[11]=c3;
780 #sqr_add_c(a,6,c1,c2,c3);
781 $UMULL r7,r6,r6
782 $UMULH r8,r6,r6
783 addc r9,r7,r9
784 adde r10,r8,r10
785 addze r11,r0
786 #sqr_add_c2(a,7,5,c1,c2,c3)
787 $LD r6,`7*$BNSZ`(r4)
788 $UMULL r7,r5,r6
789 $UMULH r8,r5,r6
790 addc r9,r7,r9
791 adde r10,r8,r10
792 addze r11,r11
793 addc r9,r7,r9
794 adde r10,r8,r10
795 addze r11,r11
796 $ST r9,`12*$BNSZ`(r3) #r[12]=c1;
797
798 #sqr_add_c2(a,7,6,c2,c3,c1)
799 $LD r5,`6*$BNSZ`(r4)
800 $UMULL r7,r5,r6
801 $UMULH r8,r5,r6
802 addc r10,r7,r10
803 adde r11,r8,r11
804 addze r9,r0
805 addc r10,r7,r10
806 adde r11,r8,r11
807 addze r9,r9
808 $ST r10,`13*$BNSZ`(r3) #r[13]=c2;
809 #sqr_add_c(a,7,c3,c1,c2);
810 $UMULL r7,r6,r6
811 $UMULH r8,r6,r6
812 addc r11,r7,r11
813 adde r9,r8,r9
814 $ST r11,`14*$BNSZ`(r3) #r[14]=c3;
815 $ST r9, `15*$BNSZ`(r3) #r[15]=c1;
816
817
818 blr
819 .long 0
820 .byte 0,12,0x14,0,0,0,2,0
821 .long 0
822
823#
824# NOTE: The following label name should be changed to
825# "bn_mul_comba4" i.e. remove the first dot
826# for the gcc compiler. This should be automatically
827# done in the build
828#
829
830.align 4
831.bn_mul_comba4:
832#
833# This is an optimized version of the bn_mul_comba4 routine.
834#
835# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
836# r3 contains r
837# r4 contains a
838# r5 contains b
839# r6, r7 are the 2 BN_ULONGs being multiplied.
840# r8, r9 hold the 64-bit result of the 32x32-bit multiply.
841# r10, r11, r12 are the equivalents of c1, c2, and c3.
842#
843 xor r0,r0,r0 #r0=0. Used in addze below.
844 #mul_add_c(a[0],b[0],c1,c2,c3);
845 $LD r6,`0*$BNSZ`(r4)
846 $LD r7,`0*$BNSZ`(r5)
847 $UMULL r10,r6,r7
848 $UMULH r11,r6,r7
849 $ST r10,`0*$BNSZ`(r3) #r[0]=c1
850 #mul_add_c(a[0],b[1],c2,c3,c1);
851 $LD r7,`1*$BNSZ`(r5)
852 $UMULL r8,r6,r7
853 $UMULH r9,r6,r7
854 addc r11,r8,r11
855 adde r12,r9,r0
856 addze r10,r0
857 #mul_add_c(a[1],b[0],c2,c3,c1);
858 $LD r6, `1*$BNSZ`(r4)
859 $LD r7, `0*$BNSZ`(r5)
860 $UMULL r8,r6,r7
861 $UMULH r9,r6,r7
862 addc r11,r8,r11
863 adde r12,r9,r12
864 addze r10,r10
865 $ST r11,`1*$BNSZ`(r3) #r[1]=c2
866 #mul_add_c(a[2],b[0],c3,c1,c2);
867 $LD r6,`2*$BNSZ`(r4)
868 $UMULL r8,r6,r7
869 $UMULH r9,r6,r7
870 addc r12,r8,r12
871 adde r10,r9,r10
872 addze r11,r0
873 #mul_add_c(a[1],b[1],c3,c1,c2);
874 $LD r6,`1*$BNSZ`(r4)
875 $LD r7,`1*$BNSZ`(r5)
876 $UMULL r8,r6,r7
877 $UMULH r9,r6,r7
878 addc r12,r8,r12
879 adde r10,r9,r10
880 addze r11,r11
881 #mul_add_c(a[0],b[2],c3,c1,c2);
882 $LD r6,`0*$BNSZ`(r4)
883 $LD r7,`2*$BNSZ`(r5)
884 $UMULL r8,r6,r7
885 $UMULH r9,r6,r7
886 addc r12,r8,r12
887 adde r10,r9,r10
888 addze r11,r11
889 $ST r12,`2*$BNSZ`(r3) #r[2]=c3
890 #mul_add_c(a[0],b[3],c1,c2,c3);
891 $LD r7,`3*$BNSZ`(r5)
892 $UMULL r8,r6,r7
893 $UMULH r9,r6,r7
894 addc r10,r8,r10
895 adde r11,r9,r11
896 addze r12,r0
897 #mul_add_c(a[1],b[2],c1,c2,c3);
898 $LD r6,`1*$BNSZ`(r4)
899 $LD r7,`2*$BNSZ`(r5)
900 $UMULL r8,r6,r7
901 $UMULH r9,r6,r7
902 addc r10,r8,r10
903 adde r11,r9,r11
904 addze r12,r12
905 #mul_add_c(a[2],b[1],c1,c2,c3);
906 $LD r6,`2*$BNSZ`(r4)
907 $LD r7,`1*$BNSZ`(r5)
908 $UMULL r8,r6,r7
909 $UMULH r9,r6,r7
910 addc r10,r8,r10
911 adde r11,r9,r11
912 addze r12,r12
913 #mul_add_c(a[3],b[0],c1,c2,c3);
914 $LD r6,`3*$BNSZ`(r4)
915 $LD r7,`0*$BNSZ`(r5)
916 $UMULL r8,r6,r7
917 $UMULH r9,r6,r7
918 addc r10,r8,r10
919 adde r11,r9,r11
920 addze r12,r12
921 $ST r10,`3*$BNSZ`(r3) #r[3]=c1
922 #mul_add_c(a[3],b[1],c2,c3,c1);
923 $LD r7,`1*$BNSZ`(r5)
924 $UMULL r8,r6,r7
925 $UMULH r9,r6,r7
926 addc r11,r8,r11
927 adde r12,r9,r12
928 addze r10,r0
929 #mul_add_c(a[2],b[2],c2,c3,c1);
930 $LD r6,`2*$BNSZ`(r4)
931 $LD r7,`2*$BNSZ`(r5)
932 $UMULL r8,r6,r7
933 $UMULH r9,r6,r7
934 addc r11,r8,r11
935 adde r12,r9,r12
936 addze r10,r10
937 #mul_add_c(a[1],b[3],c2,c3,c1);
938 $LD r6,`1*$BNSZ`(r4)
939 $LD r7,`3*$BNSZ`(r5)
940 $UMULL r8,r6,r7
941 $UMULH r9,r6,r7
942 addc r11,r8,r11
943 adde r12,r9,r12
944 addze r10,r10
945 $ST r11,`4*$BNSZ`(r3) #r[4]=c2
946 #mul_add_c(a[2],b[3],c3,c1,c2);
947 $LD r6,`2*$BNSZ`(r4)
948 $UMULL r8,r6,r7
949 $UMULH r9,r6,r7
950 addc r12,r8,r12
951 adde r10,r9,r10
952 addze r11,r0
953 #mul_add_c(a[3],b[2],c3,c1,c2);
954 $LD r6,`3*$BNSZ`(r4)
955 $LD r7,`2*$BNSZ`(r5)
956 $UMULL r8,r6,r7
957 $UMULH r9,r6,r7
958 addc r12,r8,r12
959 adde r10,r9,r10
960 addze r11,r11
961 $ST r12,`5*$BNSZ`(r3) #r[5]=c3
962 #mul_add_c(a[3],b[3],c1,c2,c3);
963 $LD r7,`3*$BNSZ`(r5)
964 $UMULL r8,r6,r7
965 $UMULH r9,r6,r7
966 addc r10,r8,r10
967 adde r11,r9,r11
968
969 $ST r10,`6*$BNSZ`(r3) #r[6]=c1
970 $ST r11,`7*$BNSZ`(r3) #r[7]=c2
971 blr
972 .long 0
973 .byte 0,12,0x14,0,0,0,3,0
974 .long 0
975
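#
# (As in bn_sqr_comba4 above, the mul_add_c steps walk the product
#  diagonal by diagonal: column k accumulates every a[i]*b[j] with
#  i+j == k into the rotating accumulator (c1,c2,c3) and stores the
#  completed low word at r[k].)
#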
976#
977# NOTE: The following label name should be changed to
978# "bn_mul_comba8" i.e. remove the first dot
979# for the gcc compiler. This should be automatically
980# done in the build
981#
982
983.align 4
984.bn_mul_comba8:
985#
986# Optimized version of the bn_mul_comba8 routine.
987#
988# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
989# r3 contains r
990# r4 contains a
991# r5 contains b
992# r6, r7 are the 2 BN_ULONGs being multiplied.
993# r8, r9 hold the 64-bit result of the 32x32-bit multiply.
994# r10, r11, r12 are the equivalents of c1, c2, and c3.
995#
996 xor r0,r0,r0 #r0=0. Used in addze below.
997
998 #mul_add_c(a[0],b[0],c1,c2,c3);
999 $LD r6,`0*$BNSZ`(r4) #a[0]
1000 $LD r7,`0*$BNSZ`(r5) #b[0]
1001 $UMULL r10,r6,r7
1002 $UMULH r11,r6,r7
1003 $ST r10,`0*$BNSZ`(r3) #r[0]=c1;
1004 #mul_add_c(a[0],b[1],c2,c3,c1);
1005 $LD r7,`1*$BNSZ`(r5)
1006 $UMULL r8,r6,r7
1007 $UMULH r9,r6,r7
1008 addc r11,r11,r8
1009	addze	r12,r9		# since we didn't set r12 to zero before.
1010 addze r10,r0
1011 #mul_add_c(a[1],b[0],c2,c3,c1);
1012 $LD r6,`1*$BNSZ`(r4)
1013 $LD r7,`0*$BNSZ`(r5)
1014 $UMULL r8,r6,r7
1015 $UMULH r9,r6,r7
1016 addc r11,r11,r8
1017 adde r12,r12,r9
1018 addze r10,r10
1019 $ST r11,`1*$BNSZ`(r3) #r[1]=c2;
1020 #mul_add_c(a[2],b[0],c3,c1,c2);
1021 $LD r6,`2*$BNSZ`(r4)
1022 $UMULL r8,r6,r7
1023 $UMULH r9,r6,r7
1024 addc r12,r12,r8
1025 adde r10,r10,r9
1026 addze r11,r0
1027 #mul_add_c(a[1],b[1],c3,c1,c2);
1028 $LD r6,`1*$BNSZ`(r4)
1029 $LD r7,`1*$BNSZ`(r5)
1030 $UMULL r8,r6,r7
1031 $UMULH r9,r6,r7
1032 addc r12,r12,r8
1033 adde r10,r10,r9
1034 addze r11,r11
1035 #mul_add_c(a[0],b[2],c3,c1,c2);
1036 $LD r6,`0*$BNSZ`(r4)
1037 $LD r7,`2*$BNSZ`(r5)
1038 $UMULL r8,r6,r7
1039 $UMULH r9,r6,r7
1040 addc r12,r12,r8
1041 adde r10,r10,r9
1042 addze r11,r11
1043 $ST r12,`2*$BNSZ`(r3) #r[2]=c3;
1044 #mul_add_c(a[0],b[3],c1,c2,c3);
1045 $LD r7,`3*$BNSZ`(r5)
1046 $UMULL r8,r6,r7
1047 $UMULH r9,r6,r7
1048 addc r10,r10,r8
1049 adde r11,r11,r9
1050 addze r12,r0
1051 #mul_add_c(a[1],b[2],c1,c2,c3);
1052 $LD r6,`1*$BNSZ`(r4)
1053 $LD r7,`2*$BNSZ`(r5)
1054 $UMULL r8,r6,r7
1055 $UMULH r9,r6,r7
1056 addc r10,r10,r8
1057 adde r11,r11,r9
1058 addze r12,r12
1059
1060 #mul_add_c(a[2],b[1],c1,c2,c3);
1061 $LD r6,`2*$BNSZ`(r4)
1062 $LD r7,`1*$BNSZ`(r5)
1063 $UMULL r8,r6,r7
1064 $UMULH r9,r6,r7
1065 addc r10,r10,r8
1066 adde r11,r11,r9
1067 addze r12,r12
1068 #mul_add_c(a[3],b[0],c1,c2,c3);
1069 $LD r6,`3*$BNSZ`(r4)
1070 $LD r7,`0*$BNSZ`(r5)
1071 $UMULL r8,r6,r7
1072 $UMULH r9,r6,r7
1073 addc r10,r10,r8
1074 adde r11,r11,r9
1075 addze r12,r12
1076 $ST r10,`3*$BNSZ`(r3) #r[3]=c1;
1077 #mul_add_c(a[4],b[0],c2,c3,c1);
1078 $LD r6,`4*$BNSZ`(r4)
1079 $UMULL r8,r6,r7
1080 $UMULH r9,r6,r7
1081 addc r11,r11,r8
1082 adde r12,r12,r9
1083 addze r10,r0
1084 #mul_add_c(a[3],b[1],c2,c3,c1);
1085 $LD r6,`3*$BNSZ`(r4)
1086 $LD r7,`1*$BNSZ`(r5)
1087 $UMULL r8,r6,r7
1088 $UMULH r9,r6,r7
1089 addc r11,r11,r8
1090 adde r12,r12,r9
1091 addze r10,r10
1092 #mul_add_c(a[2],b[2],c2,c3,c1);
1093 $LD r6,`2*$BNSZ`(r4)
1094 $LD r7,`2*$BNSZ`(r5)
1095 $UMULL r8,r6,r7
1096 $UMULH r9,r6,r7
1097 addc r11,r11,r8
1098 adde r12,r12,r9
1099 addze r10,r10
1100 #mul_add_c(a[1],b[3],c2,c3,c1);
1101 $LD r6,`1*$BNSZ`(r4)
1102 $LD r7,`3*$BNSZ`(r5)
1103 $UMULL r8,r6,r7
1104 $UMULH r9,r6,r7
1105 addc r11,r11,r8
1106 adde r12,r12,r9
1107 addze r10,r10
1108 #mul_add_c(a[0],b[4],c2,c3,c1);
1109 $LD r6,`0*$BNSZ`(r4)
1110 $LD r7,`4*$BNSZ`(r5)
1111 $UMULL r8,r6,r7
1112 $UMULH r9,r6,r7
1113 addc r11,r11,r8
1114 adde r12,r12,r9
1115 addze r10,r10
1116 $ST r11,`4*$BNSZ`(r3) #r[4]=c2;
1117 #mul_add_c(a[0],b[5],c3,c1,c2);
1118 $LD r7,`5*$BNSZ`(r5)
1119 $UMULL r8,r6,r7
1120 $UMULH r9,r6,r7
1121 addc r12,r12,r8
1122 adde r10,r10,r9
1123 addze r11,r0
1124 #mul_add_c(a[1],b[4],c3,c1,c2);
1125 $LD r6,`1*$BNSZ`(r4)
1126 $LD r7,`4*$BNSZ`(r5)
1127 $UMULL r8,r6,r7
1128 $UMULH r9,r6,r7
1129 addc r12,r12,r8
1130 adde r10,r10,r9
1131 addze r11,r11
1132 #mul_add_c(a[2],b[3],c3,c1,c2);
1133 $LD r6,`2*$BNSZ`(r4)
1134 $LD r7,`3*$BNSZ`(r5)
1135 $UMULL r8,r6,r7
1136 $UMULH r9,r6,r7
1137 addc r12,r12,r8
1138 adde r10,r10,r9
1139 addze r11,r11
1140 #mul_add_c(a[3],b[2],c3,c1,c2);
1141 $LD r6,`3*$BNSZ`(r4)
1142 $LD r7,`2*$BNSZ`(r5)
1143 $UMULL r8,r6,r7
1144 $UMULH r9,r6,r7
1145 addc r12,r12,r8
1146 adde r10,r10,r9
1147 addze r11,r11
1148 #mul_add_c(a[4],b[1],c3,c1,c2);
1149 $LD r6,`4*$BNSZ`(r4)
1150 $LD r7,`1*$BNSZ`(r5)
1151 $UMULL r8,r6,r7
1152 $UMULH r9,r6,r7
1153 addc r12,r12,r8
1154 adde r10,r10,r9
1155 addze r11,r11
1156 #mul_add_c(a[5],b[0],c3,c1,c2);
1157 $LD r6,`5*$BNSZ`(r4)
1158 $LD r7,`0*$BNSZ`(r5)
1159 $UMULL r8,r6,r7
1160 $UMULH r9,r6,r7
1161 addc r12,r12,r8
1162 adde r10,r10,r9
1163 addze r11,r11
1164 $ST r12,`5*$BNSZ`(r3) #r[5]=c3;
1165 #mul_add_c(a[6],b[0],c1,c2,c3);
1166 $LD r6,`6*$BNSZ`(r4)
1167 $UMULL r8,r6,r7
1168 $UMULH r9,r6,r7
1169 addc r10,r10,r8
1170 adde r11,r11,r9
1171 addze r12,r0
1172 #mul_add_c(a[5],b[1],c1,c2,c3);
1173 $LD r6,`5*$BNSZ`(r4)
1174 $LD r7,`1*$BNSZ`(r5)
1175 $UMULL r8,r6,r7
1176 $UMULH r9,r6,r7
1177 addc r10,r10,r8
1178 adde r11,r11,r9
1179 addze r12,r12
1180 #mul_add_c(a[4],b[2],c1,c2,c3);
1181 $LD r6,`4*$BNSZ`(r4)
1182 $LD r7,`2*$BNSZ`(r5)
1183 $UMULL r8,r6,r7
1184 $UMULH r9,r6,r7
1185 addc r10,r10,r8
1186 adde r11,r11,r9
1187 addze r12,r12
1188 #mul_add_c(a[3],b[3],c1,c2,c3);
1189 $LD r6,`3*$BNSZ`(r4)
1190 $LD r7,`3*$BNSZ`(r5)
1191 $UMULL r8,r6,r7
1192 $UMULH r9,r6,r7
1193 addc r10,r10,r8
1194 adde r11,r11,r9
1195 addze r12,r12
1196 #mul_add_c(a[2],b[4],c1,c2,c3);
1197 $LD r6,`2*$BNSZ`(r4)
1198 $LD r7,`4*$BNSZ`(r5)
1199 $UMULL r8,r6,r7
1200 $UMULH r9,r6,r7
1201 addc r10,r10,r8
1202 adde r11,r11,r9
1203 addze r12,r12
1204 #mul_add_c(a[1],b[5],c1,c2,c3);
1205 $LD r6,`1*$BNSZ`(r4)
1206 $LD r7,`5*$BNSZ`(r5)
1207 $UMULL r8,r6,r7
1208 $UMULH r9,r6,r7
1209 addc r10,r10,r8
1210 adde r11,r11,r9
1211 addze r12,r12
1212 #mul_add_c(a[0],b[6],c1,c2,c3);
1213 $LD r6,`0*$BNSZ`(r4)
1214 $LD r7,`6*$BNSZ`(r5)
1215 $UMULL r8,r6,r7
1216 $UMULH r9,r6,r7
1217 addc r10,r10,r8
1218 adde r11,r11,r9
1219 addze r12,r12
1220 $ST r10,`6*$BNSZ`(r3) #r[6]=c1;
1221 #mul_add_c(a[0],b[7],c2,c3,c1);
1222 $LD r7,`7*$BNSZ`(r5)
1223 $UMULL r8,r6,r7
1224 $UMULH r9,r6,r7
1225 addc r11,r11,r8
1226 adde r12,r12,r9
1227 addze r10,r0
1228 #mul_add_c(a[1],b[6],c2,c3,c1);
1229 $LD r6,`1*$BNSZ`(r4)
1230 $LD r7,`6*$BNSZ`(r5)
1231 $UMULL r8,r6,r7
1232 $UMULH r9,r6,r7
1233 addc r11,r11,r8
1234 adde r12,r12,r9
1235 addze r10,r10
1236 #mul_add_c(a[2],b[5],c2,c3,c1);
1237 $LD r6,`2*$BNSZ`(r4)
1238 $LD r7,`5*$BNSZ`(r5)
1239 $UMULL r8,r6,r7
1240 $UMULH r9,r6,r7
1241 addc r11,r11,r8
1242 adde r12,r12,r9
1243 addze r10,r10
1244 #mul_add_c(a[3],b[4],c2,c3,c1);
1245 $LD r6,`3*$BNSZ`(r4)
1246 $LD r7,`4*$BNSZ`(r5)
1247 $UMULL r8,r6,r7
1248 $UMULH r9,r6,r7
1249 addc r11,r11,r8
1250 adde r12,r12,r9
1251 addze r10,r10
1252 #mul_add_c(a[4],b[3],c2,c3,c1);
1253 $LD r6,`4*$BNSZ`(r4)
1254 $LD r7,`3*$BNSZ`(r5)
1255 $UMULL r8,r6,r7
1256 $UMULH r9,r6,r7
1257 addc r11,r11,r8
1258 adde r12,r12,r9
1259 addze r10,r10
1260 #mul_add_c(a[5],b[2],c2,c3,c1);
1261 $LD r6,`5*$BNSZ`(r4)
1262 $LD r7,`2*$BNSZ`(r5)
1263 $UMULL r8,r6,r7
1264 $UMULH r9,r6,r7
1265 addc r11,r11,r8
1266 adde r12,r12,r9
1267 addze r10,r10
1268 #mul_add_c(a[6],b[1],c2,c3,c1);
1269 $LD r6,`6*$BNSZ`(r4)
1270 $LD r7,`1*$BNSZ`(r5)
1271 $UMULL r8,r6,r7
1272 $UMULH r9,r6,r7
1273 addc r11,r11,r8
1274 adde r12,r12,r9
1275 addze r10,r10
1276 #mul_add_c(a[7],b[0],c2,c3,c1);
1277 $LD r6,`7*$BNSZ`(r4)
1278 $LD r7,`0*$BNSZ`(r5)
1279 $UMULL r8,r6,r7
1280 $UMULH r9,r6,r7
1281 addc r11,r11,r8
1282 adde r12,r12,r9
1283 addze r10,r10
1284 $ST r11,`7*$BNSZ`(r3) #r[7]=c2;
1285 #mul_add_c(a[7],b[1],c3,c1,c2);
1286 $LD r7,`1*$BNSZ`(r5)
1287 $UMULL r8,r6,r7
1288 $UMULH r9,r6,r7
1289 addc r12,r12,r8
1290 adde r10,r10,r9
1291 addze r11,r0
1292 #mul_add_c(a[6],b[2],c3,c1,c2);
1293 $LD r6,`6*$BNSZ`(r4)
1294 $LD r7,`2*$BNSZ`(r5)
1295 $UMULL r8,r6,r7
1296 $UMULH r9,r6,r7
1297 addc r12,r12,r8
1298 adde r10,r10,r9
1299 addze r11,r11
1300 #mul_add_c(a[5],b[3],c3,c1,c2);
1301 $LD r6,`5*$BNSZ`(r4)
1302 $LD r7,`3*$BNSZ`(r5)
1303 $UMULL r8,r6,r7
1304 $UMULH r9,r6,r7
1305 addc r12,r12,r8
1306 adde r10,r10,r9
1307 addze r11,r11
1308 #mul_add_c(a[4],b[4],c3,c1,c2);
1309 $LD r6,`4*$BNSZ`(r4)
1310 $LD r7,`4*$BNSZ`(r5)
1311 $UMULL r8,r6,r7
1312 $UMULH r9,r6,r7
1313 addc r12,r12,r8
1314 adde r10,r10,r9
1315 addze r11,r11
1316 #mul_add_c(a[3],b[5],c3,c1,c2);
1317 $LD r6,`3*$BNSZ`(r4)
1318 $LD r7,`5*$BNSZ`(r5)
1319 $UMULL r8,r6,r7
1320 $UMULH r9,r6,r7
1321 addc r12,r12,r8
1322 adde r10,r10,r9
1323 addze r11,r11
1324 #mul_add_c(a[2],b[6],c3,c1,c2);
1325 $LD r6,`2*$BNSZ`(r4)
1326 $LD r7,`6*$BNSZ`(r5)
1327 $UMULL r8,r6,r7
1328 $UMULH r9,r6,r7
1329 addc r12,r12,r8
1330 adde r10,r10,r9
1331 addze r11,r11
1332 #mul_add_c(a[1],b[7],c3,c1,c2);
1333 $LD r6,`1*$BNSZ`(r4)
1334 $LD r7,`7*$BNSZ`(r5)
1335 $UMULL r8,r6,r7
1336 $UMULH r9,r6,r7
1337 addc r12,r12,r8
1338 adde r10,r10,r9
1339 addze r11,r11
1340 $ST r12,`8*$BNSZ`(r3) #r[8]=c3;
1341 #mul_add_c(a[2],b[7],c1,c2,c3);
1342 $LD r6,`2*$BNSZ`(r4)
1343 $UMULL r8,r6,r7
1344 $UMULH r9,r6,r7
1345 addc r10,r10,r8
1346 adde r11,r11,r9
1347 addze r12,r0
1348 #mul_add_c(a[3],b[6],c1,c2,c3);
1349 $LD r6,`3*$BNSZ`(r4)
1350 $LD r7,`6*$BNSZ`(r5)
1351 $UMULL r8,r6,r7
1352 $UMULH r9,r6,r7
1353 addc r10,r10,r8
1354 adde r11,r11,r9
1355 addze r12,r12
1356 #mul_add_c(a[4],b[5],c1,c2,c3);
1357 $LD r6,`4*$BNSZ`(r4)
1358 $LD r7,`5*$BNSZ`(r5)
1359 $UMULL r8,r6,r7
1360 $UMULH r9,r6,r7
1361 addc r10,r10,r8
1362 adde r11,r11,r9
1363 addze r12,r12
1364 #mul_add_c(a[5],b[4],c1,c2,c3);
1365 $LD r6,`5*$BNSZ`(r4)
1366 $LD r7,`4*$BNSZ`(r5)
1367 $UMULL r8,r6,r7
1368 $UMULH r9,r6,r7
1369 addc r10,r10,r8
1370 adde r11,r11,r9
1371 addze r12,r12
1372 #mul_add_c(a[6],b[3],c1,c2,c3);
1373 $LD r6,`6*$BNSZ`(r4)
1374 $LD r7,`3*$BNSZ`(r5)
1375 $UMULL r8,r6,r7
1376 $UMULH r9,r6,r7
1377 addc r10,r10,r8
1378 adde r11,r11,r9
1379 addze r12,r12
1380 #mul_add_c(a[7],b[2],c1,c2,c3);
1381 $LD r6,`7*$BNSZ`(r4)
1382 $LD r7,`2*$BNSZ`(r5)
1383 $UMULL r8,r6,r7
1384 $UMULH r9,r6,r7
1385 addc r10,r10,r8
1386 adde r11,r11,r9
1387 addze r12,r12
1388 $ST r10,`9*$BNSZ`(r3) #r[9]=c1;
1389 #mul_add_c(a[7],b[3],c2,c3,c1);
1390 $LD r7,`3*$BNSZ`(r5)
1391 $UMULL r8,r6,r7
1392 $UMULH r9,r6,r7
1393 addc r11,r11,r8
1394 adde r12,r12,r9
1395 addze r10,r0
1396 #mul_add_c(a[6],b[4],c2,c3,c1);
1397 $LD r6,`6*$BNSZ`(r4)
1398 $LD r7,`4*$BNSZ`(r5)
1399 $UMULL r8,r6,r7
1400 $UMULH r9,r6,r7
1401 addc r11,r11,r8
1402 adde r12,r12,r9
1403 addze r10,r10
1404 #mul_add_c(a[5],b[5],c2,c3,c1);
1405 $LD r6,`5*$BNSZ`(r4)
1406 $LD r7,`5*$BNSZ`(r5)
1407 $UMULL r8,r6,r7
1408 $UMULH r9,r6,r7
1409 addc r11,r11,r8
1410 adde r12,r12,r9
1411 addze r10,r10
1412 #mul_add_c(a[4],b[6],c2,c3,c1);
1413 $LD r6,`4*$BNSZ`(r4)
1414 $LD r7,`6*$BNSZ`(r5)
1415 $UMULL r8,r6,r7
1416 $UMULH r9,r6,r7
1417 addc r11,r11,r8
1418 adde r12,r12,r9
1419 addze r10,r10
1420 #mul_add_c(a[3],b[7],c2,c3,c1);
1421 $LD r6,`3*$BNSZ`(r4)
1422 $LD r7,`7*$BNSZ`(r5)
1423 $UMULL r8,r6,r7
1424 $UMULH r9,r6,r7
1425 addc r11,r11,r8
1426 adde r12,r12,r9
1427 addze r10,r10
1428 $ST r11,`10*$BNSZ`(r3) #r[10]=c2;
1429 #mul_add_c(a[4],b[7],c3,c1,c2);
1430 $LD r6,`4*$BNSZ`(r4)
1431 $UMULL r8,r6,r7
1432 $UMULH r9,r6,r7
1433 addc r12,r12,r8
1434 adde r10,r10,r9
1435 addze r11,r0
1436 #mul_add_c(a[5],b[6],c3,c1,c2);
1437 $LD r6,`5*$BNSZ`(r4)
1438 $LD r7,`6*$BNSZ`(r5)
1439 $UMULL r8,r6,r7
1440 $UMULH r9,r6,r7
1441 addc r12,r12,r8
1442 adde r10,r10,r9
1443 addze r11,r11
1444 #mul_add_c(a[6],b[5],c3,c1,c2);
1445 $LD r6,`6*$BNSZ`(r4)
1446 $LD r7,`5*$BNSZ`(r5)
1447 $UMULL r8,r6,r7
1448 $UMULH r9,r6,r7
1449 addc r12,r12,r8
1450 adde r10,r10,r9
1451 addze r11,r11
1452 #mul_add_c(a[7],b[4],c3,c1,c2);
1453 $LD r6,`7*$BNSZ`(r4)
1454 $LD r7,`4*$BNSZ`(r5)
1455 $UMULL r8,r6,r7
1456 $UMULH r9,r6,r7
1457 addc r12,r12,r8
1458 adde r10,r10,r9
1459 addze r11,r11
1460 $ST r12,`11*$BNSZ`(r3) #r[11]=c3;
1461 #mul_add_c(a[7],b[5],c1,c2,c3);
1462 $LD r7,`5*$BNSZ`(r5)
1463 $UMULL r8,r6,r7
1464 $UMULH r9,r6,r7
1465 addc r10,r10,r8
1466 adde r11,r11,r9
1467 addze r12,r0
1468 #mul_add_c(a[6],b[6],c1,c2,c3);
1469 $LD r6,`6*$BNSZ`(r4)
1470 $LD r7,`6*$BNSZ`(r5)
1471 $UMULL r8,r6,r7
1472 $UMULH r9,r6,r7
1473 addc r10,r10,r8
1474 adde r11,r11,r9
1475 addze r12,r12
1476 #mul_add_c(a[5],b[7],c1,c2,c3);
1477 $LD r6,`5*$BNSZ`(r4)
1478 $LD r7,`7*$BNSZ`(r5)
1479 $UMULL r8,r6,r7
1480 $UMULH r9,r6,r7
1481 addc r10,r10,r8
1482 adde r11,r11,r9
1483 addze r12,r12
1484 $ST r10,`12*$BNSZ`(r3) #r[12]=c1;
1485 #mul_add_c(a[6],b[7],c2,c3,c1);
1486 $LD r6,`6*$BNSZ`(r4)
1487 $UMULL r8,r6,r7
1488 $UMULH r9,r6,r7
1489 addc r11,r11,r8
1490 adde r12,r12,r9
1491 addze r10,r0
1492 #mul_add_c(a[7],b[6],c2,c3,c1);
1493 $LD r6,`7*$BNSZ`(r4)
1494 $LD r7,`6*$BNSZ`(r5)
1495 $UMULL r8,r6,r7
1496 $UMULH r9,r6,r7
1497 addc r11,r11,r8
1498 adde r12,r12,r9
1499 addze r10,r10
1500 $ST r11,`13*$BNSZ`(r3) #r[13]=c2;
1501 #mul_add_c(a[7],b[7],c3,c1,c2);
1502 $LD r7,`7*$BNSZ`(r5)
1503 $UMULL r8,r6,r7
1504 $UMULH r9,r6,r7
1505 addc r12,r12,r8
1506 adde r10,r10,r9
1507 $ST r12,`14*$BNSZ`(r3) #r[14]=c3;
1508 $ST r10,`15*$BNSZ`(r3) #r[15]=c1;
1509 blr
1510 .long 0
1511 .byte 0,12,0x14,0,0,0,3,0
1512 .long 0
1513
1514#
1515# NOTE: The following label name should be changed to
1516# "bn_sub_words" i.e. remove the first dot
1517# for the gcc compiler. This should be automatically
1518# done in the build
1519#
1520#
1521.align 4
1522.bn_sub_words:
1523#
1524# Handcoded version of bn_sub_words
1525#
1526#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1527#
1528# r3 = r
1529# r4 = a
1530# r5 = b
1531# r6 = n
1532#
1533# Note: No loop unrolling done since this is not a performance
1534# critical loop.
1535
1536 xor r0,r0,r0 #set r0 = 0
1537#
1538# check for r6 = 0 AND set carry bit.
1539#
1540 subfc. r7,r0,r6 # If r6 is 0 then result is 0.
1541 # if r6 > 0 then result !=0
1542 # In either case carry bit is set.
1543 beq Lppcasm_sub_adios
1544 addi r4,r4,-$BNSZ
1545 addi r3,r3,-$BNSZ
1546 addi r5,r5,-$BNSZ
1547 mtctr r6
1548Lppcasm_sub_mainloop:
1549 $LDU r7,$BNSZ(r4)
1550 $LDU r8,$BNSZ(r5)
1551	subfe	r6,r8,r7	# r6 = r7 + carry bit + ones-complement(r8).
1552				# If carry = 1 this is r7-r8. Else it
1553				# is r7-r8-1, as we need.
1554 $STU r6,$BNSZ(r3)
1555 bdnz- Lppcasm_sub_mainloop
1556Lppcasm_sub_adios:
1557 subfze r3,r0 # if carry bit is set then r3 = 0 else -1
1558 andi. r3,r3,1 # keep only last bit.
1559 blr
1560 .long 0
1561 .byte 0,12,0x14,0,0,0,4,0
1562 .long 0
1563
1564#
1565# NOTE: The following label name should be changed to
1566# "bn_add_words" i.e. remove the first dot
1567# for the gcc compiler. This should be automatically
1568# done in the build
1569#
1570
1571.align 4
1572.bn_add_words:
1573#
1574# Handcoded version of bn_add_words
1575#
1576#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1577#
1578# r3 = r
1579# r4 = a
1580# r5 = b
1581# r6 = n
1582#
1583# Note: No loop unrolling done since this is not a performance
1584# critical loop.
1585
1586 xor r0,r0,r0
1587#
1588# check for r6 = 0. Is this needed?
1589#
1590 addic. r6,r6,0 #test r6 and clear carry bit.
1591 beq Lppcasm_add_adios
1592 addi r4,r4,-$BNSZ
1593 addi r3,r3,-$BNSZ
1594 addi r5,r5,-$BNSZ
1595 mtctr r6
1596Lppcasm_add_mainloop:
1597 $LDU r7,$BNSZ(r4)
1598 $LDU r8,$BNSZ(r5)
1599 adde r8,r7,r8
1600 $STU r8,$BNSZ(r3)
1601 bdnz- Lppcasm_add_mainloop
1602Lppcasm_add_adios:
1603 addze r3,r0 #return carry bit.
1604 blr
1605 .long 0
1606 .byte 0,12,0x14,0,0,0,4,0
1607 .long 0
1608
1609#
1610# NOTE: The following label name should be changed to
1611# "bn_div_words" i.e. remove the first dot
1612# for the gcc compiler. This should be automatically
1613# done in the build
1614#
1615
1616.align 4
1617.bn_div_words:
1618#
1619# This is a cleaned up version of code generated by
1620# the AIX compiler. The only optimization is to use
1621# the PPC instruction to count leading zeros instead
1622# of a call to num_bits_word. Since this was compiled
1623# only at level -O2, we could possibly squeeze it more?
1624#
1625# r3 = h
1626# r4 = l
1627# r5 = d
1628
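#
# (Overview: the routine returns q = (h*2^BITS + l)/d. d is first
#  normalized so that its top bit is set and h:l is shifted to match;
#  the quotient is then formed as two BITS/2-bit digits, each estimated
#  as h/dh and corrected downward in Lppcasm_divinnerloop.)
#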
1629 $UCMPI 0,r5,0 # compare r5 and 0
1630 bne Lppcasm_div1 # proceed if d!=0
1631 li r3,-1 # d=0 return -1
1632 blr
1633Lppcasm_div1:
1634 xor r0,r0,r0 #r0=0
1635 li r8,$BITS
1636 $CNTLZ. r7,r5 #r7 = num leading 0s in d.
1637 beq Lppcasm_div2 #proceed if no leading zeros
1638 subf r8,r7,r8 #r8 = BN_num_bits_word(d)
1639 $SHR. r9,r3,r8 #are there any bits above r8'th?
1640	$TR	16,r9,r0	#if there are, trap to dump core...
1641Lppcasm_div2:
1642 $UCMP 0,r3,r5 #h>=d?
1643 blt Lppcasm_div3 #goto Lppcasm_div3 if not
1644 subf r3,r5,r3 #h-=d ;
1645Lppcasm_div3:				#r7 = BN_BITS2 - num_bits(d), i.e. the shift count i
1646 cmpi 0,0,r7,0 # is (i == 0)?
1647 beq Lppcasm_div4
1648 $SHL r3,r3,r7 # h = (h<< i)
1649 $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i)
1650 $SHL r5,r5,r7 # d<<=i
1651 or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i))
1652 $SHL r4,r4,r7 # l <<=i
1653Lppcasm_div4:
1654 $SHRI r9,r5,`$BITS/2` # r9 = dh
1655 # dl will be computed when needed
1656 # as it saves registers.
1657 li r6,2 #r6=2
1658	mtctr	r6		#counter will be in CTR.
1659Lppcasm_divouterloop:
1660 $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4)
1661 $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4
1662 # compute here for innerloop.
1663 $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh
1664 bne Lppcasm_div5 # goto Lppcasm_div5 if not
1665
1666 li r8,-1
1667 $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
1668 b Lppcasm_div6
1669Lppcasm_div5:
1670 $UDIV r8,r3,r9 #q = h/dh
1671Lppcasm_div6:
1672 $UMULL r12,r9,r8 #th = q*dh
1673 $CLRU r10,r5,`$BITS/2` #r10=dl
1674 $UMULL r6,r8,r10 #tl = q*dl
1675
1676Lppcasm_divinnerloop:
1677 subf r10,r12,r3 #t = h -th
1678 $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of...
1679 addic. r7,r7,0 #test if r7 == 0. used below.
1680 # now want to compute
1681 # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
1682 # the following 2 instructions do that
1683 $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4)
1684 or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4)
1685 $UCMP cr1,r6,r7 # compare (tl <= r7)
1686 bne Lppcasm_divinnerexit
1687 ble cr1,Lppcasm_divinnerexit
1688 addi r8,r8,-1 #q--
1689 subf r12,r9,r12 #th -=dh
1690 $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop.
1691 subf r6,r10,r6 #tl -=dl
1692 b Lppcasm_divinnerloop
1693Lppcasm_divinnerexit:
1694 $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4)
1695 $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h;
1696 $UCMP cr1,r4,r11 # compare l and tl
1697 add r12,r12,r10 # th+=t
1698 bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7
1699 addi r12,r12,1 # th++
1700Lppcasm_div7:
1701 subf r11,r11,r4 #r11=l-tl
1702 $UCMP cr1,r3,r12 #compare h and th
1703 bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8
1704 addi r8,r8,-1 # q--
1705 add r3,r5,r3 # h+=d
1706Lppcasm_div8:
1707 subf r12,r12,r3 #r12 = h-th
1708 $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4
1709 # want to compute
1710 # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
1711 # the following 2 instructions will do this.
1712 $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2.
1713 $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3
1714 bdz Lppcasm_div9 #if (count==0) break ;
1715 $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4
1716 b Lppcasm_divouterloop
1717Lppcasm_div9:
1718 or r3,r8,r0
1719 blr
1720 .long 0
1721 .byte 0,12,0x14,0,0,0,3,0
1722 .long 0
1723
1724#
1725# NOTE: The following label name should be changed to
1726# "bn_sqr_words" i.e. remove the first dot
1727# for the gcc compiler. This should be automatically
1728# done in the build
1729#
1730.align 4
1731.bn_sqr_words:
1732#
1733# Optimized version of bn_sqr_words
1734#
1735# void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
1736#
1737# r3 = r
1738# r4 = a
1739# r5 = n
1740#
1741# r6 = a[i].
1742# r7,r8 = product.
1743#
1744# No unrolling done here. Not performance critical.
1745
1746 addic. r5,r5,0 #test r5.
1747 beq Lppcasm_sqr_adios
1748 addi r4,r4,-$BNSZ
1749 addi r3,r3,-$BNSZ
1750 mtctr r5
1751Lppcasm_sqr_mainloop:
1752 #sqr(r[0],r[1],a[0]);
1753 $LDU r6,$BNSZ(r4)
1754 $UMULL r7,r6,r6
1755 $UMULH r8,r6,r6
1756 $STU r7,$BNSZ(r3)
1757 $STU r8,$BNSZ(r3)
1758 bdnz- Lppcasm_sqr_mainloop
1759Lppcasm_sqr_adios:
1760 blr
1761 .long 0
1762 .byte 0,12,0x14,0,0,0,3,0
1763 .long 0
1764
1765#
1766# NOTE: The following label name should be changed to
1767# "bn_mul_words" i.e. remove the first dot
1768# for the gcc compiler. This should be automatically
1769# done in the build
1770#
1771
1772.align 4
1773.bn_mul_words:
1774#
1775# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1776#
1777# r3 = rp
1778# r4 = ap
1779# r5 = num
1780# r6 = w
1781 xor r0,r0,r0
1782 xor r12,r12,r12 # used for carry
1783 rlwinm. r7,r5,30,2,31 # num >> 2
1784 beq Lppcasm_mw_REM
1785 mtctr r7
1786Lppcasm_mw_LOOP:
1787 #mul(rp[0],ap[0],w,c1);
1788 $LD r8,`0*$BNSZ`(r4)
1789 $UMULL r9,r6,r8
1790 $UMULH r10,r6,r8
1791 addc r9,r9,r12
1792 #addze r10,r10 #carry is NOT ignored.
1793 #will be taken care of
1794 #in second spin below
1795 #using adde.
1796 $ST r9,`0*$BNSZ`(r3)
1797 #mul(rp[1],ap[1],w,c1);
1798 $LD r8,`1*$BNSZ`(r4)
1799 $UMULL r11,r6,r8
1800 $UMULH r12,r6,r8
1801 adde r11,r11,r10
1802 #addze r12,r12
1803 $ST r11,`1*$BNSZ`(r3)
1804 #mul(rp[2],ap[2],w,c1);
1805 $LD r8,`2*$BNSZ`(r4)
1806 $UMULL r9,r6,r8
1807 $UMULH r10,r6,r8
1808 adde r9,r9,r12
1809 #addze r10,r10
1810 $ST r9,`2*$BNSZ`(r3)
1811 #mul_add(rp[3],ap[3],w,c1);
1812 $LD r8,`3*$BNSZ`(r4)
1813 $UMULL r11,r6,r8
1814 $UMULH r12,r6,r8
1815 adde r11,r11,r10
1816 addze r12,r12 #this spin we collect carry into
1817 #r12
1818 $ST r11,`3*$BNSZ`(r3)
1819
1820 addi r3,r3,`4*$BNSZ`
1821 addi r4,r4,`4*$BNSZ`
1822 bdnz- Lppcasm_mw_LOOP
1823
1824Lppcasm_mw_REM:
1825 andi. r5,r5,0x3
1826 beq Lppcasm_mw_OVER
1827 #mul(rp[0],ap[0],w,c1);
1828 $LD r8,`0*$BNSZ`(r4)
1829 $UMULL r9,r6,r8
1830 $UMULH r10,r6,r8
1831 addc r9,r9,r12
1832 addze r10,r10
1833 $ST r9,`0*$BNSZ`(r3)
1834 addi r12,r10,0
1835
1836 addi r5,r5,-1
1837 cmpli 0,0,r5,0
1838 beq Lppcasm_mw_OVER
1839
1840
1841 #mul(rp[1],ap[1],w,c1);
1842 $LD r8,`1*$BNSZ`(r4)
1843 $UMULL r9,r6,r8
1844 $UMULH r10,r6,r8
1845 addc r9,r9,r12
1846 addze r10,r10
1847 $ST r9,`1*$BNSZ`(r3)
1848 addi r12,r10,0
1849
1850 addi r5,r5,-1
1851 cmpli 0,0,r5,0
1852 beq Lppcasm_mw_OVER
1853
1854 #mul_add(rp[2],ap[2],w,c1);
1855 $LD r8,`2*$BNSZ`(r4)
1856 $UMULL r9,r6,r8
1857 $UMULH r10,r6,r8
1858 addc r9,r9,r12
1859 addze r10,r10
1860 $ST r9,`2*$BNSZ`(r3)
1861 addi r12,r10,0
1862
1863Lppcasm_mw_OVER:
1864 addi r3,r12,0
1865 blr
1866 .long 0
1867 .byte 0,12,0x14,0,0,0,4,0
1868 .long 0
1869
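#
# (The 4x unrolled loop above alternates the (r9,r10) and (r11,r12)
#  product pairs so that each high word is consumed by the adde of the
#  following multiply; only the fourth step needs an explicit addze,
#  i.e. one carry fold-up per four multiplies.)
#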
1870#
1871# NOTE: The following label name should be changed to
1872# "bn_mul_add_words" i.e. remove the first dot
1873# for the gcc compiler. This should be automatically
1874# done in the build
1875#
1876
1877.align 4
1878.bn_mul_add_words:
1879#
1880# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1881#
1882# r3 = rp
1883# r4 = ap
1884# r5 = num
1885# r6 = w
1886#
1887# empirical evidence suggests that the unrolled version performs best!!
1888#
1889 xor r0,r0,r0 #r0 = 0
1890 xor r12,r12,r12 #r12 = 0 . used for carry
1891 rlwinm. r7,r5,30,2,31 # num >> 2
1892	beq	Lppcasm_maw_leftover	# if (num < 4) goto Lppcasm_maw_leftover
1893 mtctr r7
1894Lppcasm_maw_mainloop:
1895 #mul_add(rp[0],ap[0],w,c1);
1896 $LD r8,`0*$BNSZ`(r4)
1897 $LD r11,`0*$BNSZ`(r3)
1898 $UMULL r9,r6,r8
1899 $UMULH r10,r6,r8
1900 addc r9,r9,r12 #r12 is carry.
1901 addze r10,r10
1902 addc r9,r9,r11
1903 #addze r10,r10
1904 #the above instruction addze
1905 #is NOT needed. Carry will NOT
1906 #be ignored. It's not affected
1907 #by multiply and will be collected
1908 #in the next spin
1909 $ST r9,`0*$BNSZ`(r3)
1910
1911 #mul_add(rp[1],ap[1],w,c1);
1912 $LD r8,`1*$BNSZ`(r4)
1913 $LD r9,`1*$BNSZ`(r3)
1914 $UMULL r11,r6,r8
1915 $UMULH r12,r6,r8
1916 adde r11,r11,r10 #r10 is carry.
1917 addze r12,r12
1918 addc r11,r11,r9
1919 #addze r12,r12
1920 $ST r11,`1*$BNSZ`(r3)
1921
1922 #mul_add(rp[2],ap[2],w,c1);
1923 $LD r8,`2*$BNSZ`(r4)
1924 $UMULL r9,r6,r8
1925 $LD r11,`2*$BNSZ`(r3)
1926 $UMULH r10,r6,r8
1927 adde r9,r9,r12
1928 addze r10,r10
1929 addc r9,r9,r11
1930 #addze r10,r10
1931 $ST r9,`2*$BNSZ`(r3)
1932
1933 #mul_add(rp[3],ap[3],w,c1);
1934 $LD r8,`3*$BNSZ`(r4)
1935 $UMULL r11,r6,r8
1936 $LD r9,`3*$BNSZ`(r3)
1937 $UMULH r12,r6,r8
1938 adde r11,r11,r10
1939 addze r12,r12
1940 addc r11,r11,r9
1941 addze r12,r12
1942 $ST r11,`3*$BNSZ`(r3)
1943 addi r3,r3,`4*$BNSZ`
1944 addi r4,r4,`4*$BNSZ`
1945 bdnz- Lppcasm_maw_mainloop
1946
1947Lppcasm_maw_leftover:
1948 andi. r5,r5,0x3
1949 beq Lppcasm_maw_adios
1950 addi r3,r3,-$BNSZ
1951 addi r4,r4,-$BNSZ
1952 #mul_add(rp[0],ap[0],w,c1);
1953 mtctr r5
1954 $LDU r8,$BNSZ(r4)
1955 $UMULL r9,r6,r8
1956 $UMULH r10,r6,r8
1957 $LDU r11,$BNSZ(r3)
1958 addc r9,r9,r11
1959 addze r10,r10
1960 addc r9,r9,r12
1961 addze r12,r10
1962 $ST r9,0(r3)
1963
1964 bdz Lppcasm_maw_adios
1965 #mul_add(rp[1],ap[1],w,c1);
1966 $LDU r8,$BNSZ(r4)
1967 $UMULL r9,r6,r8
1968 $UMULH r10,r6,r8
1969 $LDU r11,$BNSZ(r3)
1970 addc r9,r9,r11
1971 addze r10,r10
1972 addc r9,r9,r12
1973 addze r12,r10
1974 $ST r9,0(r3)
1975
1976 bdz Lppcasm_maw_adios
1977 #mul_add(rp[2],ap[2],w,c1);
1978 $LDU r8,$BNSZ(r4)
1979 $UMULL r9,r6,r8
1980 $UMULH r10,r6,r8
1981 $LDU r11,$BNSZ(r3)
1982 addc r9,r9,r11
1983 addze r10,r10
1984 addc r9,r9,r12
1985 addze r12,r10
1986 $ST r9,0(r3)
1987
1988Lppcasm_maw_adios:
1989 addi r3,r12,0
1990 blr
1991 .long 0
1992 .byte 0,12,0x14,0,0,0,4,0
1993 .long 0
1994 .align 4
1995EOF
1996$data =~ s/\`([^\`]*)\`/eval $1/gem;
1997print $data;
1998close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/ppc64-mont.pl b/src/lib/libcrypto/bn/asm/ppc64-mont.pl
deleted file mode 100644
index a14e769ad0..0000000000
--- a/src/lib/libcrypto/bn/asm/ppc64-mont.pl
+++ /dev/null
@@ -1,1088 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# December 2007
11
12# The reason for undertaking this effort is basically the following. Even though
13# the Power 6 CPU operates at an incredible 4.7GHz clock frequency, its PKI
14# performance was observed to be less than impressive, essentially as
15# fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
16# Well, it's not surprising that IBM had to make some sacrifices to
17# boost the clock frequency that much, but no overall improvement?
18# Having observed how much difference switching to the FPU made on
19# UltraSPARC, playing the same stunt on Power 6 appeared appropriate...
20# Unfortunately the resulting performance improvement is not as
21# impressive, ~30%, and in absolute terms is still very far from what
22# one would expect from a 4.7GHz CPU. There is a chance that I'm doing
23# something wrong, but in the absence of assembler-level micro-profiling
24# data, or at least a decent platform guide, I can't tell... Or better
25# results might be achieved with VMX... Anyway, this module provides
26# *worse* performance on other PowerPC implementations, ~40-15% slower
27# on PPC970 depending on key length and ~40% slower on Power 5 for all
28# key lengths. As it's obviously inappropriate as "best all-round"
29# alternative, it has to be complemented with run-time CPU family
30# detection. Oh! It should also be noted that, unlike on other PowerPC
31# implementations, the IALU ppc-mont.pl module performs *suboptimally* on
32# >=1024-bit key lengths on Power 6. It should also be noted that
33# *everything* said so far applies to 64-bit builds! As far as 32-bit
34# application executed on 64-bit CPU goes, this module is likely to
35# become preferred choice, because it's easy to adapt it for such
36# case and *is* faster than 32-bit ppc-mont.pl on *all* processors.
37
38# February 2008
39
40# Micro-profiling assisted optimization results in ~15% improvement
41# over original ppc64-mont.pl version, or overall ~50% improvement
42# over ppc.pl module on Power 6. If compared to ppc-mont.pl on same
43# Power 6 CPU, this module is 5-150% faster depending on key length,
44# [hereafter] more for longer keys. But if compared to ppc-mont.pl
45# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
46# in absolute terms, but it's apparently the way Power 6 is...
47
48# December 2009
49
50# Adapted for a 32-bit build, this module delivers a 25-120% (yes, more
51# than *twice* for longer keys) performance improvement over 32-bit
52# ppc-mont.pl on a 1.8GHz PPC970. However! This implementation uses
53# 64-bit integer operations even then, and the trouble is that most PPC
54# operating systems don't preserve upper halves of general purpose
55# registers upon 32-bit signal delivery. They do preserve them upon
56# context switch, but not signalling:-( This means that asynchronous
57# signals have to be blocked upon entry to this subroutine. Signal
58# masking (and of course complementary unmasking) has quite an impact
59# on performance, naturally larger for shorter keys. It's so severe
60# that 512-bit key performance can be as low as 1/3 of the expected one.
61# This is why this routine can be engaged for longer key operations
62# only on these OSes; see crypto/ppccap.c for further details. MacOS X
63# is an exception from this and doesn't require signal masking, and
64# that's where the above improvement coefficients were collected. For
65# others the alternative would be to break the dependence on upper halves of
66# GPRs by sticking to 32-bit integer operations...
67
68$flavour = shift;
69
70if ($flavour =~ /32/) {
71 $SIZE_T=4;
72 $RZONE= 224;
73 $fname= "bn_mul_mont_fpu64";
74
75 $STUX= "stwux"; # store indexed and update
76 $PUSH= "stw";
77 $POP= "lwz";
78} elsif ($flavour =~ /64/) {
79 $SIZE_T=8;
80 $RZONE= 288;
81 $fname= "bn_mul_mont_fpu64";
82
83 # same as above, but 64-bit mnemonics...
84 $STUX= "stdux"; # store indexed and update
85 $PUSH= "std";
86 $POP= "ld";
87} else { die "nonsense $flavour"; }
88
89$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
90( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
91( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
92die "can't locate ppc-xlate.pl";
93
94open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
95
96$FRAME=64; # padded frame header
97$TRANSFER=16*8;
98
99$carry="r0";
100$sp="r1";
101$toc="r2";
102$rp="r3"; $ovf="r3";
103$ap="r4";
104$bp="r5";
105$np="r6";
106$n0="r7";
107$num="r8";
108$rp="r9"; # $rp is reassigned
109$tp="r10";
110$j="r11";
111$i="r12";
112# non-volatile registers
113$nap_d="r22"; # interleaved ap and np in double format
114$a0="r23"; # ap[0]
115$t0="r24"; # temporary registers
116$t1="r25";
117$t2="r26";
118$t3="r27";
119$t4="r28";
120$t5="r29";
121$t6="r30";
122$t7="r31";
123
124# PPC offers enough register bank capacity to unroll inner loops twice
125#
126# ..A3A2A1A0
127# dcba
128# -----------
129# A0a
130# A0b
131# A0c
132# A0d
133# A1a
134# A1b
135# A1c
136# A1d
137# A2a
138# A2b
139# A2c
140# A2d
141# A3a
142# A3b
143# A3c
144# A3d
145# ..a
146# ..b
147#
148$ba="f0"; $bb="f1"; $bc="f2"; $bd="f3";
149$na="f4"; $nb="f5"; $nc="f6"; $nd="f7";
150$dota="f8"; $dotb="f9";
151$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13";
152$N0="f20"; $N1="f21"; $N2="f22"; $N3="f23";
153$T0a="f24"; $T0b="f25";
154$T1a="f26"; $T1b="f27";
155$T2a="f28"; $T2b="f29";
156$T3a="f30"; $T3b="f31";
157
158# sp----------->+-------------------------------+
159# | saved sp |
160# +-------------------------------+
161# . .
162# +64 +-------------------------------+
163# | 16 gpr<->fpr transfer zone |
164# . .
165# . .
166# +16*8 +-------------------------------+
167# | __int64 tmp[-1] |
168# +-------------------------------+
169# | __int64 tmp[num] |
170# . .
171# . .
172# . .
173# +(num+1)*8 +-------------------------------+
174# | padding to 64 byte boundary |
175# . .
176# +X +-------------------------------+
177# | double nap_d[4*num] |
178# . .
179# . .
180# . .
181# +-------------------------------+
182# . .
183# -12*size_t +-------------------------------+
184# | 10 saved gpr, r22-r31 |
185# . .
186# . .
187# -12*8 +-------------------------------+
188# | 12 saved fpr, f20-f31 |
189# . .
190# . .
191# +-------------------------------+
192
193$code=<<___;
194.machine "any"
195.text
196
197.globl .$fname
198.align 5
199.$fname:
200 cmpwi $num,`3*8/$SIZE_T`
201 mr $rp,r3 ; $rp is reassigned
202 li r3,0 ; possible "not handled" return code
203 bltlr-
204 andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even"
205 bnelr-
206
207 slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG)
208 li $i,-4096
209 slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
210 add $tp,$tp,$num ; place for tp[num+1]
211 addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
212 subf $tp,$tp,$sp ; $sp-$tp
213 and $tp,$tp,$i ; minimize TLB usage
214 subf $tp,$sp,$tp ; $tp-$sp
215 mr $i,$sp
216 $STUX $sp,$sp,$tp ; alloca
217
218 $PUSH r22,`-12*8-10*$SIZE_T`($i)
219 $PUSH r23,`-12*8-9*$SIZE_T`($i)
220 $PUSH r24,`-12*8-8*$SIZE_T`($i)
221 $PUSH r25,`-12*8-7*$SIZE_T`($i)
222 $PUSH r26,`-12*8-6*$SIZE_T`($i)
223 $PUSH r27,`-12*8-5*$SIZE_T`($i)
224 $PUSH r28,`-12*8-4*$SIZE_T`($i)
225 $PUSH r29,`-12*8-3*$SIZE_T`($i)
226 $PUSH r30,`-12*8-2*$SIZE_T`($i)
227 $PUSH r31,`-12*8-1*$SIZE_T`($i)
228 stfd f20,`-12*8`($i)
229 stfd f21,`-11*8`($i)
230 stfd f22,`-10*8`($i)
231 stfd f23,`-9*8`($i)
232 stfd f24,`-8*8`($i)
233 stfd f25,`-7*8`($i)
234 stfd f26,`-6*8`($i)
235 stfd f27,`-5*8`($i)
236 stfd f28,`-4*8`($i)
237 stfd f29,`-3*8`($i)
238 stfd f30,`-2*8`($i)
239 stfd f31,`-1*8`($i)
240___
241$code.=<<___ if ($SIZE_T==8);
242 ld $a0,0($ap) ; pull ap[0] value
243 ld $n0,0($n0) ; pull n0[0] value
244 ld $t3,0($bp) ; bp[0]
245___
246$code.=<<___ if ($SIZE_T==4);
247 mr $t1,$n0
248 lwz $a0,0($ap) ; pull ap[0,1] value
249 lwz $t0,4($ap)
250 lwz $n0,0($t1) ; pull n0[0,1] value
251 lwz $t1,4($t1)
252 lwz $t3,0($bp) ; bp[0,1]
253 lwz $t2,4($bp)
254 insrdi $a0,$t0,32,0
255 insrdi $n0,$t1,32,0
256 insrdi $t3,$t2,32,0
257___
258$code.=<<___;
259 addi $tp,$sp,`$FRAME+$TRANSFER+8+64`
260 li $i,-64
261 add $nap_d,$tp,$num
262 and $nap_d,$nap_d,$i ; align to 64 bytes
263
264 mulld $t7,$a0,$t3 ; ap[0]*bp[0]
265 ; nap_d is off by 1, because it's used with stfdu/lfdu
266 addi $nap_d,$nap_d,-8
267 srwi $j,$num,`3+1` ; counter register, num/2
268 mulld $t7,$t7,$n0 ; tp[0]*n0
269 addi $j,$j,-1
270 addi $tp,$sp,`$FRAME+$TRANSFER-8`
271 li $carry,0
272 mtctr $j
273
274 ; transfer bp[0] to FPU as 4x16-bit values
275 extrdi $t0,$t3,16,48
276 extrdi $t1,$t3,16,32
277 extrdi $t2,$t3,16,16
278 extrdi $t3,$t3,16,0
279 std $t0,`$FRAME+0`($sp)
280 std $t1,`$FRAME+8`($sp)
281 std $t2,`$FRAME+16`($sp)
282 std $t3,`$FRAME+24`($sp)
283 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
284 extrdi $t4,$t7,16,48
285 extrdi $t5,$t7,16,32
286 extrdi $t6,$t7,16,16
287 extrdi $t7,$t7,16,0
288 std $t4,`$FRAME+32`($sp)
289 std $t5,`$FRAME+40`($sp)
290 std $t6,`$FRAME+48`($sp)
291 std $t7,`$FRAME+56`($sp)
292___
293$code.=<<___ if ($SIZE_T==8);
294 lwz $t0,4($ap) ; load a[j] as 32-bit word pair
295 lwz $t1,0($ap)
296 lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
297 lwz $t3,8($ap)
298 lwz $t4,4($np) ; load n[j] as 32-bit word pair
299 lwz $t5,0($np)
300 lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
301 lwz $t7,8($np)
302___
303$code.=<<___ if ($SIZE_T==4);
304 lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
305 lwz $t1,4($ap)
306 lwz $t2,8($ap)
307 lwz $t3,12($ap)
308 lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
309 lwz $t5,4($np)
310 lwz $t6,8($np)
311 lwz $t7,12($np)
312___
313$code.=<<___;
314 lfd $ba,`$FRAME+0`($sp)
315 lfd $bb,`$FRAME+8`($sp)
316 lfd $bc,`$FRAME+16`($sp)
317 lfd $bd,`$FRAME+24`($sp)
318 lfd $na,`$FRAME+32`($sp)
319 lfd $nb,`$FRAME+40`($sp)
320 lfd $nc,`$FRAME+48`($sp)
321 lfd $nd,`$FRAME+56`($sp)
322 std $t0,`$FRAME+64`($sp)
323 std $t1,`$FRAME+72`($sp)
324 std $t2,`$FRAME+80`($sp)
325 std $t3,`$FRAME+88`($sp)
326 std $t4,`$FRAME+96`($sp)
327 std $t5,`$FRAME+104`($sp)
328 std $t6,`$FRAME+112`($sp)
329 std $t7,`$FRAME+120`($sp)
330 fcfid $ba,$ba
331 fcfid $bb,$bb
332 fcfid $bc,$bc
333 fcfid $bd,$bd
334 fcfid $na,$na
335 fcfid $nb,$nb
336 fcfid $nc,$nc
337 fcfid $nd,$nd
338
339 lfd $A0,`$FRAME+64`($sp)
340 lfd $A1,`$FRAME+72`($sp)
341 lfd $A2,`$FRAME+80`($sp)
342 lfd $A3,`$FRAME+88`($sp)
343 lfd $N0,`$FRAME+96`($sp)
344 lfd $N1,`$FRAME+104`($sp)
345 lfd $N2,`$FRAME+112`($sp)
346 lfd $N3,`$FRAME+120`($sp)
347 fcfid $A0,$A0
348 fcfid $A1,$A1
349 fcfid $A2,$A2
350 fcfid $A3,$A3
351 fcfid $N0,$N0
352 fcfid $N1,$N1
353 fcfid $N2,$N2
354 fcfid $N3,$N3
355 addi $ap,$ap,16
356 addi $np,$np,16
357
358 fmul $T1a,$A1,$ba
359 fmul $T1b,$A1,$bb
360 stfd $A0,8($nap_d) ; save a[j] in double format
361 stfd $A1,16($nap_d)
362 fmul $T2a,$A2,$ba
363 fmul $T2b,$A2,$bb
364 stfd $A2,24($nap_d) ; save a[j+1] in double format
365 stfd $A3,32($nap_d)
366 fmul $T3a,$A3,$ba
367 fmul $T3b,$A3,$bb
368 stfd $N0,40($nap_d) ; save n[j] in double format
369 stfd $N1,48($nap_d)
370 fmul $T0a,$A0,$ba
371 fmul $T0b,$A0,$bb
372 stfd $N2,56($nap_d) ; save n[j+1] in double format
373 stfdu $N3,64($nap_d)
374
375 fmadd $T1a,$A0,$bc,$T1a
376 fmadd $T1b,$A0,$bd,$T1b
377 fmadd $T2a,$A1,$bc,$T2a
378 fmadd $T2b,$A1,$bd,$T2b
379 fmadd $T3a,$A2,$bc,$T3a
380 fmadd $T3b,$A2,$bd,$T3b
381 fmul $dota,$A3,$bc
382 fmul $dotb,$A3,$bd
383
384 fmadd $T1a,$N1,$na,$T1a
385 fmadd $T1b,$N1,$nb,$T1b
386 fmadd $T2a,$N2,$na,$T2a
387 fmadd $T2b,$N2,$nb,$T2b
388 fmadd $T3a,$N3,$na,$T3a
389 fmadd $T3b,$N3,$nb,$T3b
390 fmadd $T0a,$N0,$na,$T0a
391 fmadd $T0b,$N0,$nb,$T0b
392
393 fmadd $T1a,$N0,$nc,$T1a
394 fmadd $T1b,$N0,$nd,$T1b
395 fmadd $T2a,$N1,$nc,$T2a
396 fmadd $T2b,$N1,$nd,$T2b
397 fmadd $T3a,$N2,$nc,$T3a
398 fmadd $T3b,$N2,$nd,$T3b
399 fmadd $dota,$N3,$nc,$dota
400 fmadd $dotb,$N3,$nd,$dotb
401
402 fctid $T0a,$T0a
403 fctid $T0b,$T0b
404 fctid $T1a,$T1a
405 fctid $T1b,$T1b
406 fctid $T2a,$T2a
407 fctid $T2b,$T2b
408 fctid $T3a,$T3a
409 fctid $T3b,$T3b
410
411 stfd $T0a,`$FRAME+0`($sp)
412 stfd $T0b,`$FRAME+8`($sp)
413 stfd $T1a,`$FRAME+16`($sp)
414 stfd $T1b,`$FRAME+24`($sp)
415 stfd $T2a,`$FRAME+32`($sp)
416 stfd $T2b,`$FRAME+40`($sp)
417 stfd $T3a,`$FRAME+48`($sp)
418 stfd $T3b,`$FRAME+56`($sp)
419
420.align 5
421L1st:
422___
423$code.=<<___ if ($SIZE_T==8);
424 lwz $t0,4($ap) ; load a[j] as 32-bit word pair
425 lwz $t1,0($ap)
426 lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
427 lwz $t3,8($ap)
428 lwz $t4,4($np) ; load n[j] as 32-bit word pair
429 lwz $t5,0($np)
430 lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
431 lwz $t7,8($np)
432___
433$code.=<<___ if ($SIZE_T==4);
434 lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
435 lwz $t1,4($ap)
436 lwz $t2,8($ap)
437 lwz $t3,12($ap)
438 lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
439 lwz $t5,4($np)
440 lwz $t6,8($np)
441 lwz $t7,12($np)
442___
443$code.=<<___;
444 std $t0,`$FRAME+64`($sp)
445 std $t1,`$FRAME+72`($sp)
446 std $t2,`$FRAME+80`($sp)
447 std $t3,`$FRAME+88`($sp)
448 std $t4,`$FRAME+96`($sp)
449 std $t5,`$FRAME+104`($sp)
450 std $t6,`$FRAME+112`($sp)
451 std $t7,`$FRAME+120`($sp)
452 ld $t0,`$FRAME+0`($sp)
453 ld $t1,`$FRAME+8`($sp)
454 ld $t2,`$FRAME+16`($sp)
455 ld $t3,`$FRAME+24`($sp)
456 ld $t4,`$FRAME+32`($sp)
457 ld $t5,`$FRAME+40`($sp)
458 ld $t6,`$FRAME+48`($sp)
459 ld $t7,`$FRAME+56`($sp)
460 lfd $A0,`$FRAME+64`($sp)
461 lfd $A1,`$FRAME+72`($sp)
462 lfd $A2,`$FRAME+80`($sp)
463 lfd $A3,`$FRAME+88`($sp)
464 lfd $N0,`$FRAME+96`($sp)
465 lfd $N1,`$FRAME+104`($sp)
466 lfd $N2,`$FRAME+112`($sp)
467 lfd $N3,`$FRAME+120`($sp)
468 fcfid $A0,$A0
469 fcfid $A1,$A1
470 fcfid $A2,$A2
471 fcfid $A3,$A3
472 fcfid $N0,$N0
473 fcfid $N1,$N1
474 fcfid $N2,$N2
475 fcfid $N3,$N3
476 addi $ap,$ap,16
477 addi $np,$np,16
478
479 fmul $T1a,$A1,$ba
480 fmul $T1b,$A1,$bb
481 fmul $T2a,$A2,$ba
482 fmul $T2b,$A2,$bb
483 stfd $A0,8($nap_d) ; save a[j] in double format
484 stfd $A1,16($nap_d)
485 fmul $T3a,$A3,$ba
486 fmul $T3b,$A3,$bb
487 fmadd $T0a,$A0,$ba,$dota
488 fmadd $T0b,$A0,$bb,$dotb
489 stfd $A2,24($nap_d) ; save a[j+1] in double format
490 stfd $A3,32($nap_d)
491
492 fmadd $T1a,$A0,$bc,$T1a
493 fmadd $T1b,$A0,$bd,$T1b
494 fmadd $T2a,$A1,$bc,$T2a
495 fmadd $T2b,$A1,$bd,$T2b
496 stfd $N0,40($nap_d) ; save n[j] in double format
497 stfd $N1,48($nap_d)
498 fmadd $T3a,$A2,$bc,$T3a
499 fmadd $T3b,$A2,$bd,$T3b
500 add $t0,$t0,$carry ; can not overflow
501 fmul $dota,$A3,$bc
502 fmul $dotb,$A3,$bd
503 stfd $N2,56($nap_d) ; save n[j+1] in double format
504 stfdu $N3,64($nap_d)
505 srdi $carry,$t0,16
506 add $t1,$t1,$carry
507 srdi $carry,$t1,16
508
509 fmadd $T1a,$N1,$na,$T1a
510 fmadd $T1b,$N1,$nb,$T1b
511 insrdi $t0,$t1,16,32
512 fmadd $T2a,$N2,$na,$T2a
513 fmadd $T2b,$N2,$nb,$T2b
514 add $t2,$t2,$carry
515 fmadd $T3a,$N3,$na,$T3a
516 fmadd $T3b,$N3,$nb,$T3b
517 srdi $carry,$t2,16
518 fmadd $T0a,$N0,$na,$T0a
519 fmadd $T0b,$N0,$nb,$T0b
520 insrdi $t0,$t2,16,16
521 add $t3,$t3,$carry
522 srdi $carry,$t3,16
523
524 fmadd $T1a,$N0,$nc,$T1a
525 fmadd $T1b,$N0,$nd,$T1b
526 insrdi $t0,$t3,16,0 ; 0..63 bits
527 fmadd $T2a,$N1,$nc,$T2a
528 fmadd $T2b,$N1,$nd,$T2b
529 add $t4,$t4,$carry
530 fmadd $T3a,$N2,$nc,$T3a
531 fmadd $T3b,$N2,$nd,$T3b
532 srdi $carry,$t4,16
533 fmadd $dota,$N3,$nc,$dota
534 fmadd $dotb,$N3,$nd,$dotb
535 add $t5,$t5,$carry
536 srdi $carry,$t5,16
537 insrdi $t4,$t5,16,32
538
539 fctid $T0a,$T0a
540 fctid $T0b,$T0b
541 add $t6,$t6,$carry
542 fctid $T1a,$T1a
543 fctid $T1b,$T1b
544 srdi $carry,$t6,16
545 fctid $T2a,$T2a
546 fctid $T2b,$T2b
547 insrdi $t4,$t6,16,16
548 fctid $T3a,$T3a
549 fctid $T3b,$T3b
550 add $t7,$t7,$carry
551 insrdi $t4,$t7,16,0 ; 64..127 bits
552 srdi $carry,$t7,16 ; upper 33 bits
553
554 stfd $T0a,`$FRAME+0`($sp)
555 stfd $T0b,`$FRAME+8`($sp)
556 stfd $T1a,`$FRAME+16`($sp)
557 stfd $T1b,`$FRAME+24`($sp)
558 stfd $T2a,`$FRAME+32`($sp)
559 stfd $T2b,`$FRAME+40`($sp)
560 stfd $T3a,`$FRAME+48`($sp)
561 stfd $T3b,`$FRAME+56`($sp)
562 std $t0,8($tp) ; tp[j-1]
563 stdu $t4,16($tp) ; tp[j]
564 bdnz- L1st
565
566 fctid $dota,$dota
567 fctid $dotb,$dotb
568
569 ld $t0,`$FRAME+0`($sp)
570 ld $t1,`$FRAME+8`($sp)
571 ld $t2,`$FRAME+16`($sp)
572 ld $t3,`$FRAME+24`($sp)
573 ld $t4,`$FRAME+32`($sp)
574 ld $t5,`$FRAME+40`($sp)
575 ld $t6,`$FRAME+48`($sp)
576 ld $t7,`$FRAME+56`($sp)
577 stfd $dota,`$FRAME+64`($sp)
578 stfd $dotb,`$FRAME+72`($sp)
579
580 add $t0,$t0,$carry ; can not overflow
581 srdi $carry,$t0,16
582 add $t1,$t1,$carry
583 srdi $carry,$t1,16
584 insrdi $t0,$t1,16,32
585 add $t2,$t2,$carry
586 srdi $carry,$t2,16
587 insrdi $t0,$t2,16,16
588 add $t3,$t3,$carry
589 srdi $carry,$t3,16
590 insrdi $t0,$t3,16,0 ; 0..63 bits
591 add $t4,$t4,$carry
592 srdi $carry,$t4,16
593 add $t5,$t5,$carry
594 srdi $carry,$t5,16
595 insrdi $t4,$t5,16,32
596 add $t6,$t6,$carry
597 srdi $carry,$t6,16
598 insrdi $t4,$t6,16,16
599 add $t7,$t7,$carry
600 insrdi $t4,$t7,16,0 ; 64..127 bits
601 srdi $carry,$t7,16 ; upper 33 bits
602 ld $t6,`$FRAME+64`($sp)
603 ld $t7,`$FRAME+72`($sp)
604
605 std $t0,8($tp) ; tp[j-1]
606 stdu $t4,16($tp) ; tp[j]
607
608 add $t6,$t6,$carry ; can not overflow
609 srdi $carry,$t6,16
610 add $t7,$t7,$carry
611 insrdi $t6,$t7,48,0
612 srdi $ovf,$t7,48
613 std $t6,8($tp) ; tp[num-1]
614
615 slwi $t7,$num,2
616 subf $nap_d,$t7,$nap_d ; rewind pointer
617
618 li $i,8 ; i=1
619.align 5
620Louter:
621___
622$code.=<<___ if ($SIZE_T==8);
623 ldx $t3,$bp,$i ; bp[i]
624___
625$code.=<<___ if ($SIZE_T==4);
626 add $t0,$bp,$i
627 lwz $t3,0($t0) ; bp[i,i+1]
628 lwz $t0,4($t0)
629 insrdi $t3,$t0,32,0
630___
631$code.=<<___;
632 ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
633 mulld $t7,$a0,$t3 ; ap[0]*bp[i]
634
635 addi $tp,$sp,`$FRAME+$TRANSFER`
636 add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0]
637 li $carry,0
638 mulld $t7,$t7,$n0 ; tp[0]*n0
639 mtctr $j
640
641 ; transfer bp[i] to FPU as 4x16-bit values
642 extrdi $t0,$t3,16,48
643 extrdi $t1,$t3,16,32
644 extrdi $t2,$t3,16,16
645 extrdi $t3,$t3,16,0
646 std $t0,`$FRAME+0`($sp)
647 std $t1,`$FRAME+8`($sp)
648 std $t2,`$FRAME+16`($sp)
649 std $t3,`$FRAME+24`($sp)
650 ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
651 extrdi $t4,$t7,16,48
652 extrdi $t5,$t7,16,32
653 extrdi $t6,$t7,16,16
654 extrdi $t7,$t7,16,0
655 std $t4,`$FRAME+32`($sp)
656 std $t5,`$FRAME+40`($sp)
657 std $t6,`$FRAME+48`($sp)
658 std $t7,`$FRAME+56`($sp)
659
660 lfd $A0,8($nap_d) ; load a[j] in double format
661 lfd $A1,16($nap_d)
662 lfd $A2,24($nap_d) ; load a[j+1] in double format
663 lfd $A3,32($nap_d)
664 lfd $N0,40($nap_d) ; load n[j] in double format
665 lfd $N1,48($nap_d)
666 lfd $N2,56($nap_d) ; load n[j+1] in double format
667 lfdu $N3,64($nap_d)
668
669 lfd $ba,`$FRAME+0`($sp)
670 lfd $bb,`$FRAME+8`($sp)
671 lfd $bc,`$FRAME+16`($sp)
672 lfd $bd,`$FRAME+24`($sp)
673 lfd $na,`$FRAME+32`($sp)
674 lfd $nb,`$FRAME+40`($sp)
675 lfd $nc,`$FRAME+48`($sp)
676 lfd $nd,`$FRAME+56`($sp)
677
678 fcfid $ba,$ba
679 fcfid $bb,$bb
680 fcfid $bc,$bc
681 fcfid $bd,$bd
682 fcfid $na,$na
683 fcfid $nb,$nb
684 fcfid $nc,$nc
685 fcfid $nd,$nd
686
687 fmul $T1a,$A1,$ba
688 fmul $T1b,$A1,$bb
689 fmul $T2a,$A2,$ba
690 fmul $T2b,$A2,$bb
691 fmul $T3a,$A3,$ba
692 fmul $T3b,$A3,$bb
693 fmul $T0a,$A0,$ba
694 fmul $T0b,$A0,$bb
695
696 fmadd $T1a,$A0,$bc,$T1a
697 fmadd $T1b,$A0,$bd,$T1b
698 fmadd $T2a,$A1,$bc,$T2a
699 fmadd $T2b,$A1,$bd,$T2b
700 fmadd $T3a,$A2,$bc,$T3a
701 fmadd $T3b,$A2,$bd,$T3b
702 fmul $dota,$A3,$bc
703 fmul $dotb,$A3,$bd
704
705 fmadd $T1a,$N1,$na,$T1a
706 fmadd $T1b,$N1,$nb,$T1b
707 lfd $A0,8($nap_d) ; load a[j] in double format
708 lfd $A1,16($nap_d)
709 fmadd $T2a,$N2,$na,$T2a
710 fmadd $T2b,$N2,$nb,$T2b
711 lfd $A2,24($nap_d) ; load a[j+1] in double format
712 lfd $A3,32($nap_d)
713 fmadd $T3a,$N3,$na,$T3a
714 fmadd $T3b,$N3,$nb,$T3b
715 fmadd $T0a,$N0,$na,$T0a
716 fmadd $T0b,$N0,$nb,$T0b
717
718 fmadd $T1a,$N0,$nc,$T1a
719 fmadd $T1b,$N0,$nd,$T1b
720 fmadd $T2a,$N1,$nc,$T2a
721 fmadd $T2b,$N1,$nd,$T2b
722 fmadd $T3a,$N2,$nc,$T3a
723 fmadd $T3b,$N2,$nd,$T3b
724 fmadd $dota,$N3,$nc,$dota
725 fmadd $dotb,$N3,$nd,$dotb
726
727 fctid $T0a,$T0a
728 fctid $T0b,$T0b
729 fctid $T1a,$T1a
730 fctid $T1b,$T1b
731 fctid $T2a,$T2a
732 fctid $T2b,$T2b
733 fctid $T3a,$T3a
734 fctid $T3b,$T3b
735
736 stfd $T0a,`$FRAME+0`($sp)
737 stfd $T0b,`$FRAME+8`($sp)
738 stfd $T1a,`$FRAME+16`($sp)
739 stfd $T1b,`$FRAME+24`($sp)
740 stfd $T2a,`$FRAME+32`($sp)
741 stfd $T2b,`$FRAME+40`($sp)
742 stfd $T3a,`$FRAME+48`($sp)
743 stfd $T3b,`$FRAME+56`($sp)
744
745.align 5
746Linner:
747 fmul $T1a,$A1,$ba
748 fmul $T1b,$A1,$bb
749 fmul $T2a,$A2,$ba
750 fmul $T2b,$A2,$bb
751 lfd $N0,40($nap_d) ; load n[j] in double format
752 lfd $N1,48($nap_d)
753 fmul $T3a,$A3,$ba
754 fmul $T3b,$A3,$bb
755 fmadd $T0a,$A0,$ba,$dota
756 fmadd $T0b,$A0,$bb,$dotb
757 lfd $N2,56($nap_d) ; load n[j+1] in double format
758 lfdu $N3,64($nap_d)
759
760 fmadd $T1a,$A0,$bc,$T1a
761 fmadd $T1b,$A0,$bd,$T1b
762 fmadd $T2a,$A1,$bc,$T2a
763 fmadd $T2b,$A1,$bd,$T2b
764 lfd $A0,8($nap_d) ; load a[j] in double format
765 lfd $A1,16($nap_d)
766 fmadd $T3a,$A2,$bc,$T3a
767 fmadd $T3b,$A2,$bd,$T3b
768 fmul $dota,$A3,$bc
769 fmul $dotb,$A3,$bd
770 lfd $A2,24($nap_d) ; load a[j+1] in double format
771 lfd $A3,32($nap_d)
772
773 fmadd $T1a,$N1,$na,$T1a
774 fmadd $T1b,$N1,$nb,$T1b
775 ld $t0,`$FRAME+0`($sp)
776 ld $t1,`$FRAME+8`($sp)
777 fmadd $T2a,$N2,$na,$T2a
778 fmadd $T2b,$N2,$nb,$T2b
779 ld $t2,`$FRAME+16`($sp)
780 ld $t3,`$FRAME+24`($sp)
781 fmadd $T3a,$N3,$na,$T3a
782 fmadd $T3b,$N3,$nb,$T3b
783 add $t0,$t0,$carry ; can not overflow
784 ld $t4,`$FRAME+32`($sp)
785 ld $t5,`$FRAME+40`($sp)
786 fmadd $T0a,$N0,$na,$T0a
787 fmadd $T0b,$N0,$nb,$T0b
788 srdi $carry,$t0,16
789 add $t1,$t1,$carry
790 srdi $carry,$t1,16
791 ld $t6,`$FRAME+48`($sp)
792 ld $t7,`$FRAME+56`($sp)
793
794 fmadd $T1a,$N0,$nc,$T1a
795 fmadd $T1b,$N0,$nd,$T1b
796 insrdi $t0,$t1,16,32
797 ld $t1,8($tp) ; tp[j]
798 fmadd $T2a,$N1,$nc,$T2a
799 fmadd $T2b,$N1,$nd,$T2b
800 add $t2,$t2,$carry
801 fmadd $T3a,$N2,$nc,$T3a
802 fmadd $T3b,$N2,$nd,$T3b
803 srdi $carry,$t2,16
804 insrdi $t0,$t2,16,16
805 fmadd $dota,$N3,$nc,$dota
806 fmadd $dotb,$N3,$nd,$dotb
807 add $t3,$t3,$carry
808 ldu $t2,16($tp) ; tp[j+1]
809 srdi $carry,$t3,16
810 insrdi $t0,$t3,16,0 ; 0..63 bits
811 add $t4,$t4,$carry
812
813 fctid $T0a,$T0a
814 fctid $T0b,$T0b
815 srdi $carry,$t4,16
816 fctid $T1a,$T1a
817 fctid $T1b,$T1b
818 add $t5,$t5,$carry
819 fctid $T2a,$T2a
820 fctid $T2b,$T2b
821 srdi $carry,$t5,16
822 insrdi $t4,$t5,16,32
823 fctid $T3a,$T3a
824 fctid $T3b,$T3b
825 add $t6,$t6,$carry
826 srdi $carry,$t6,16
827 insrdi $t4,$t6,16,16
828
829 stfd $T0a,`$FRAME+0`($sp)
830 stfd $T0b,`$FRAME+8`($sp)
831 add $t7,$t7,$carry
832 addc $t3,$t0,$t1
833___
834$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
835 extrdi $t0,$t0,32,0
836 extrdi $t1,$t1,32,0
837 adde $t0,$t0,$t1
838___
839$code.=<<___;
840 stfd $T1a,`$FRAME+16`($sp)
841 stfd $T1b,`$FRAME+24`($sp)
842 insrdi $t4,$t7,16,0 ; 64..127 bits
843 srdi $carry,$t7,16 ; upper 33 bits
844 stfd $T2a,`$FRAME+32`($sp)
845 stfd $T2b,`$FRAME+40`($sp)
846 adde $t5,$t4,$t2
847___
848$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
849 extrdi $t4,$t4,32,0
850 extrdi $t2,$t2,32,0
851 adde $t4,$t4,$t2
852___
853$code.=<<___;
854 stfd $T3a,`$FRAME+48`($sp)
855 stfd $T3b,`$FRAME+56`($sp)
856 addze $carry,$carry
857 std $t3,-16($tp) ; tp[j-1]
858 std $t5,-8($tp) ; tp[j]
859 bdnz- Linner
860
861 fctid $dota,$dota
862 fctid $dotb,$dotb
863 ld $t0,`$FRAME+0`($sp)
864 ld $t1,`$FRAME+8`($sp)
865 ld $t2,`$FRAME+16`($sp)
866 ld $t3,`$FRAME+24`($sp)
867 ld $t4,`$FRAME+32`($sp)
868 ld $t5,`$FRAME+40`($sp)
869 ld $t6,`$FRAME+48`($sp)
870 ld $t7,`$FRAME+56`($sp)
871 stfd $dota,`$FRAME+64`($sp)
872 stfd $dotb,`$FRAME+72`($sp)
873
874 add $t0,$t0,$carry ; can not overflow
875 srdi $carry,$t0,16
876 add $t1,$t1,$carry
877 srdi $carry,$t1,16
878 insrdi $t0,$t1,16,32
879 add $t2,$t2,$carry
880 ld $t1,8($tp) ; tp[j]
881 srdi $carry,$t2,16
882 insrdi $t0,$t2,16,16
883 add $t3,$t3,$carry
884 ldu $t2,16($tp) ; tp[j+1]
885 srdi $carry,$t3,16
886 insrdi $t0,$t3,16,0 ; 0..63 bits
887 add $t4,$t4,$carry
888 srdi $carry,$t4,16
889 add $t5,$t5,$carry
890 srdi $carry,$t5,16
891 insrdi $t4,$t5,16,32
892 add $t6,$t6,$carry
893 srdi $carry,$t6,16
894 insrdi $t4,$t6,16,16
895 add $t7,$t7,$carry
896 insrdi $t4,$t7,16,0 ; 64..127 bits
897 srdi $carry,$t7,16 ; upper 33 bits
898 ld $t6,`$FRAME+64`($sp)
899 ld $t7,`$FRAME+72`($sp)
900
901 addc $t3,$t0,$t1
902___
903$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
904 extrdi $t0,$t0,32,0
905 extrdi $t1,$t1,32,0
906 adde $t0,$t0,$t1
907___
908$code.=<<___;
909 adde $t5,$t4,$t2
910___
911$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
912 extrdi $t4,$t4,32,0
913 extrdi $t2,$t2,32,0
914 adde $t4,$t4,$t2
915___
916$code.=<<___;
917 addze $carry,$carry
918
919 std $t3,-16($tp) ; tp[j-1]
920 std $t5,-8($tp) ; tp[j]
921
922	add	$carry,$carry,$ovf	; consume upmost overflow
923 add $t6,$t6,$carry ; can not overflow
924 srdi $carry,$t6,16
925 add $t7,$t7,$carry
926 insrdi $t6,$t7,48,0
927 srdi $ovf,$t7,48
928 std $t6,0($tp) ; tp[num-1]
929
930 slwi $t7,$num,2
931 addi $i,$i,8
932 subf $nap_d,$t7,$nap_d ; rewind pointer
933 cmpw $i,$num
934 blt- Louter
935___
936
937$code.=<<___ if ($SIZE_T==8);
938 subf $np,$num,$np ; rewind np
939 addi $j,$j,1 ; restore counter
940 subfc $i,$i,$i ; j=0 and "clear" XER[CA]
941 addi $tp,$sp,`$FRAME+$TRANSFER+8`
942 addi $t4,$sp,`$FRAME+$TRANSFER+16`
943 addi $t5,$np,8
944 addi $t6,$rp,8
945 mtctr $j
946
947.align 4
948Lsub: ldx $t0,$tp,$i
949 ldx $t1,$np,$i
950 ldx $t2,$t4,$i
951 ldx $t3,$t5,$i
952 subfe $t0,$t1,$t0 ; tp[j]-np[j]
953 subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1]
954 stdx $t0,$rp,$i
955 stdx $t2,$t6,$i
956 addi $i,$i,16
957 bdnz- Lsub
958
959 li $i,0
960 subfe $ovf,$i,$ovf ; handle upmost overflow bit
961 and $ap,$tp,$ovf
962 andc $np,$rp,$ovf
963 or $ap,$ap,$np ; ap=borrow?tp:rp
964 addi $t7,$ap,8
965 mtctr $j
966
967.align 4
968Lcopy: ; copy or in-place refresh
969 ldx $t0,$ap,$i
970 ldx $t1,$t7,$i
971 std $i,8($nap_d) ; zap nap_d
972 std $i,16($nap_d)
973 std $i,24($nap_d)
974 std $i,32($nap_d)
975 std $i,40($nap_d)
976 std $i,48($nap_d)
977 std $i,56($nap_d)
978 stdu $i,64($nap_d)
979 stdx $t0,$rp,$i
980 stdx $t1,$t6,$i
981 stdx $i,$tp,$i ; zap tp at once
982 stdx $i,$t4,$i
983 addi $i,$i,16
984 bdnz- Lcopy
985___
986$code.=<<___ if ($SIZE_T==4);
987 subf $np,$num,$np ; rewind np
988 addi $j,$j,1 ; restore counter
989 subfc $i,$i,$i ; j=0 and "clear" XER[CA]
990 addi $tp,$sp,`$FRAME+$TRANSFER`
991 addi $np,$np,-4
992 addi $rp,$rp,-4
993 addi $ap,$sp,`$FRAME+$TRANSFER+4`
994 mtctr $j
995
996.align 4
997Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order
998 ldu $t2,16($tp)
999 lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order
1000 lwz $t5,8($np)
1001 lwz $t6,12($np)
1002 lwzu $t7,16($np)
1003 extrdi $t1,$t0,32,0
1004 extrdi $t3,$t2,32,0
1005 subfe $t4,$t4,$t0 ; tp[j]-np[j]
1006 stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order
1007 subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1]
1008 stw $t1,8($ap)
1009 subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2]
1010 stw $t2,12($ap)
1011 subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3]
1012 stwu $t3,16($ap)
1013 stw $t4,4($rp)
1014 stw $t5,8($rp)
1015 stw $t6,12($rp)
1016 stwu $t7,16($rp)
1017 bdnz- Lsub
1018
1019 li $i,0
1020 subfe $ovf,$i,$ovf ; handle upmost overflow bit
1021 addi $tp,$sp,`$FRAME+$TRANSFER+4`
1022 subf $rp,$num,$rp ; rewind rp
1023 and $ap,$tp,$ovf
1024 andc $np,$rp,$ovf
1025 or $ap,$ap,$np ; ap=borrow?tp:rp
1026 addi $tp,$sp,`$FRAME+$TRANSFER`
1027 mtctr $j
1028
1029.align 4
1030Lcopy: ; copy or in-place refresh
1031 lwz $t0,4($ap)
1032 lwz $t1,8($ap)
1033 lwz $t2,12($ap)
1034 lwzu $t3,16($ap)
1035 std $i,8($nap_d) ; zap nap_d
1036 std $i,16($nap_d)
1037 std $i,24($nap_d)
1038 std $i,32($nap_d)
1039 std $i,40($nap_d)
1040 std $i,48($nap_d)
1041 std $i,56($nap_d)
1042 stdu $i,64($nap_d)
1043 stw $t0,4($rp)
1044 stw $t1,8($rp)
1045 stw $t2,12($rp)
1046 stwu $t3,16($rp)
1047 std $i,8($tp) ; zap tp at once
1048 stdu $i,16($tp)
1049 bdnz- Lcopy
1050___
1051
1052$code.=<<___;
1053 $POP $i,0($sp)
1054 li r3,1 ; signal "handled"
1055 $POP r22,`-12*8-10*$SIZE_T`($i)
1056 $POP r23,`-12*8-9*$SIZE_T`($i)
1057 $POP r24,`-12*8-8*$SIZE_T`($i)
1058 $POP r25,`-12*8-7*$SIZE_T`($i)
1059 $POP r26,`-12*8-6*$SIZE_T`($i)
1060 $POP r27,`-12*8-5*$SIZE_T`($i)
1061 $POP r28,`-12*8-4*$SIZE_T`($i)
1062 $POP r29,`-12*8-3*$SIZE_T`($i)
1063 $POP r30,`-12*8-2*$SIZE_T`($i)
1064 $POP r31,`-12*8-1*$SIZE_T`($i)
1065 lfd f20,`-12*8`($i)
1066 lfd f21,`-11*8`($i)
1067 lfd f22,`-10*8`($i)
1068 lfd f23,`-9*8`($i)
1069 lfd f24,`-8*8`($i)
1070 lfd f25,`-7*8`($i)
1071 lfd f26,`-6*8`($i)
1072 lfd f27,`-5*8`($i)
1073 lfd f28,`-4*8`($i)
1074 lfd f29,`-3*8`($i)
1075 lfd f30,`-2*8`($i)
1076 lfd f31,`-1*8`($i)
1077 mr $sp,$i
1078 blr
1079 .long 0
1080 .byte 0,12,4,0,0x8c,10,6,0
1081 .long 0
1082
1083.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
1084___
1085
1086$code =~ s/\`([^\`]*)\`/eval $1/gem;
1087print $code;
1088close STDOUT;
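The FPU strategy above hinges on one observation: a double's 53-bit mantissa cannot hold a 64x64-bit product, but it holds 16x16-bit digit products, and even sums of several of them, exactly. That is why bp[i] and the Montgomery multiplier are split into 4x16-bit values (the extrdi/std/fcfid sequences), products are accumulated with fmadd, and the integer unit renormalizes the column sums afterwards with the add/srdi/insrdi carry chains. A small Perl sketch of that split-and-renormalize round trip (illustration only; the values are arbitrary and a perl with 64-bit integers is assumed):

#!/usr/bin/env perl
# Sketch of the trick the FPU path above relies on: split 64-bit words
# into 4x16-bit digits, accumulate digit products (every column sum
# stays below 2^53, so doubles hold it exactly), then renormalize the
# columns with integer carries.  Illustration only.
use strict;
use warnings;
use Math::BigInt;

sub to_digits {        # 64-bit value -> 4x16-bit digits, LSW first
    my ($x) = @_;
    return map { $x->copy->brsft(16 * $_)->band(0xffff)->numify } 0 .. 3;
}

my $a = Math::BigInt->new("0xfedcba9876543210");
my $b = Math::BigInt->new("0x0123456789abcdef");
my @a = to_digits($a);
my @b = to_digits($b);

# schoolbook digit products; each column sum is < 4*(2^16-1)^2 < 2^35,
# well within the 53-bit mantissa the fmadd chains accumulate into
my @col = (0) x 8;
for my $i (0 .. 3) {
    for my $j (0 .. 3) {
        $col[ $i + $j ] += $a[$i] * $b[$j];
    }
}

# renormalize: the add/srdi/insrdi carry chains in integer registers
# (assumes a perl with 64-bit native integers)
my $carry = 0;
my $r = Math::BigInt->new(0);
for my $k (0 .. 7) {
    my $t = $col[$k] + $carry;
    $r->badd(Math::BigInt->new($t & 0xffff)->blsft(16 * $k));
    $carry = $t >> 16;
}
$r->badd(Math::BigInt->new($carry)->blsft(128));

print $r == $a * $b ? "ok\n" : "mismatch\n";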
diff --git a/src/lib/libcrypto/bn/asm/s390x-gf2m.pl b/src/lib/libcrypto/bn/asm/s390x-gf2m.pl
deleted file mode 100644
index cd9f13eca2..0000000000
--- a/src/lib/libcrypto/bn/asm/s390x-gf2m.pl
+++ /dev/null
@@ -1,221 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# May 2011
11#
12# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
13# in bn_gf2m.c. It's a kind of low-hanging mechanical port from C for
14# the time being... gcc 4.3 appeared to generate poor code, hence
15# the effort. And indeed, the module delivers 55%-90%(*) improvement
16# on the heaviest ECDSA verify and ECDH benchmarks for 163- and 571-bit
17# key lengths on z990, 30%-55%(*) on z10, and 70%-110%(*) on z196.
18# This is for a 64-bit build. In the 32-bit "highgprs" case the improvement
19# is even higher; for example, on z990 it was measured at 80%-150%. ECDSA
20# sign is a modest 9%-12% faster. Keep in mind that these coefficients
21# are not the ones for bn_GF2m_mul_2x2 itself, as not all CPU time is
22# burnt in it...
23#
24# (*) gcc 4.1 was observed to deliver better results than gcc 4.3,
25# so that improvement coefficients can vary from one specific
26# setup to another.
27
28$flavour = shift;
29
30if ($flavour =~ /3[12]/) {
31 $SIZE_T=4;
32 $g="";
33} else {
34 $SIZE_T=8;
35 $g="g";
36}
37
38while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
39open STDOUT,">$output";
40
41$stdframe=16*$SIZE_T+4*8;
42
43$rp="%r2";
44$a1="%r3";
45$a0="%r4";
46$b1="%r5";
47$b0="%r6";
48
49$ra="%r14";
50$sp="%r15";
51
52@T=("%r0","%r1");
53@i=("%r12","%r13");
54
55($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
56($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;
57
58$code.=<<___;
59.text
60
61.type _mul_1x1,\@function
62.align 16
63_mul_1x1:
64 lgr $a1,$a
65 sllg $a2,$a,1
66 sllg $a4,$a,2
67 sllg $a8,$a,3
68
69 srag $lo,$a1,63 # broadcast 63rd bit
70 nihh $a1,0x1fff
71 srag @i[0],$a2,63 # broadcast 62nd bit
72 nihh $a2,0x3fff
73 srag @i[1],$a4,63 # broadcast 61st bit
74 nihh $a4,0x7fff
75 ngr $lo,$b
76 ngr @i[0],$b
77 ngr @i[1],$b
78
79 lghi @T[0],0
80 lgr $a12,$a1
81 stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0
82 xgr $a12,$a2
83 stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1
84 lgr $a48,$a4
85 stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2
86 xgr $a48,$a8
87 stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2
88 xgr $a1,$a4
89
90 stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4
91 xgr $a2,$a4
92 stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4
93 xgr $a12,$a4
94 stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4
95 xgr $a1,$a48
96 stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4
97 xgr $a2,$a48
98
99 stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8
100 xgr $a12,$a48
101 stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8
102 xgr $a1,$a4
103 stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8
104 xgr $a2,$a4
105 stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8
106
107 xgr $a12,$a4
108 stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8
109 srlg $hi,$lo,1
110 stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8
111 sllg $lo,$lo,63
112 stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8
113 srlg @T[0],@i[0],2
114 stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8
115
116 lghi $mask,`0xf<<3`
117 sllg $a1,@i[0],62
118 sllg @i[0],$b,3
119 srlg @T[1],@i[1],3
120 ngr @i[0],$mask
121 sllg $a2,@i[1],61
122 srlg @i[1],$b,4-3
123 xgr $hi,@T[0]
124 ngr @i[1],$mask
125 xgr $lo,$a1
126 xgr $hi,@T[1]
127 xgr $lo,$a2
128
129 xg $lo,$stdframe(@i[0],$sp)
130 srlg @i[0],$b,8-3
131 ngr @i[0],$mask
132___
133for($n=1;$n<14;$n++) {
134$code.=<<___;
135 lg @T[1],$stdframe(@i[1],$sp)
136 srlg @i[1],$b,`($n+2)*4`-3
137 sllg @T[0],@T[1],`$n*4`
138 ngr @i[1],$mask
139 srlg @T[1],@T[1],`64-$n*4`
140 xgr $lo,@T[0]
141 xgr $hi,@T[1]
142___
143 push(@i,shift(@i)); push(@T,shift(@T));
144}
145$code.=<<___;
146 lg @T[1],$stdframe(@i[1],$sp)
147 sllg @T[0],@T[1],`$n*4`
148 srlg @T[1],@T[1],`64-$n*4`
149 xgr $lo,@T[0]
150 xgr $hi,@T[1]
151
152 lg @T[0],$stdframe(@i[0],$sp)
153 sllg @T[1],@T[0],`($n+1)*4`
154 srlg @T[0],@T[0],`64-($n+1)*4`
155 xgr $lo,@T[1]
156 xgr $hi,@T[0]
157
158 br $ra
159.size _mul_1x1,.-_mul_1x1
160
161.globl bn_GF2m_mul_2x2
162.type bn_GF2m_mul_2x2,\@function
163.align 16
164bn_GF2m_mul_2x2:
165 stm${g} %r3,%r15,3*$SIZE_T($sp)
166
167 lghi %r1,-$stdframe-128
168 la %r0,0($sp)
169 la $sp,0(%r1,$sp) # alloca
170 st${g} %r0,0($sp) # back chain
171___
172if ($SIZE_T==8) {
173my @r=map("%r$_",(6..9));
174$code.=<<___;
175 bras $ra,_mul_1x1 # a1·b1
176 stmg $lo,$hi,16($rp)
177
178 lg $a,`$stdframe+128+4*$SIZE_T`($sp)
179 lg $b,`$stdframe+128+6*$SIZE_T`($sp)
180 bras $ra,_mul_1x1 # a0·b0
181 stmg $lo,$hi,0($rp)
182
183 lg $a,`$stdframe+128+3*$SIZE_T`($sp)
184 lg $b,`$stdframe+128+5*$SIZE_T`($sp)
185 xg $a,`$stdframe+128+4*$SIZE_T`($sp)
186 xg $b,`$stdframe+128+6*$SIZE_T`($sp)
187 bras $ra,_mul_1x1 # (a0+a1)·(b0+b1)
188 lmg @r[0],@r[3],0($rp)
189
190 xgr $lo,$hi
191 xgr $hi,@r[1]
192 xgr $lo,@r[0]
193 xgr $hi,@r[2]
194 xgr $lo,@r[3]
195 xgr $hi,@r[3]
196 xgr $lo,$hi
197 stg $hi,16($rp)
198 stg $lo,8($rp)
199___
200} else {
201$code.=<<___;
202 sllg %r3,%r3,32
203 sllg %r5,%r5,32
204 or %r3,%r4
205 or %r5,%r6
206 bras $ra,_mul_1x1
207 rllg $lo,$lo,32
208 rllg $hi,$hi,32
209 stmg $lo,$hi,0($rp)
210___
211}
212$code.=<<___;
213 lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
214 br $ra
215.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
216.string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
217___
218
219$code =~ s/\`([^\`]*)\`/eval($1)/gem;
220print $code;
221close STDOUT;
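_mul_1x1 above performs a branchless carry-less (GF(2)[x]) multiplication: a 16-entry stack table, with tab[i] holding the XOR of a, a<<1, a<<2 and a<<3 selected by the bits of i, is built once, and b is then consumed four bits at a time, XOR-ing shifted table entries into the 128-bit product; the srag/nihh prologue patches back the top bits of a that the shifts would push out of a 64-bit register, and bn_GF2m_mul_2x2 combines three such 1x1 products Karatsuba-style (a1·b1, a0·b0, (a0+a1)·(b0+b1)). A Perl sketch of the windowed table walk (illustration only; with bigints there is no 64-bit truncation, so the top-bit patch-up disappears):

#!/usr/bin/env perl
# Sketch of the windowed GF(2)[x] multiplication behind _mul_1x1 above:
# build tab[i] as the GF(2) combination of (a, a<<1, a<<2, a<<3)
# selected by the bits of i, then consume b four bits at a time,
# XOR-ing shifted table entries into the product.  Illustration only.
use strict;
use warnings;
use Math::BigInt;

sub gf2m_mul_1x1 {
    my ($a, $b) = @_;              # 64-bit polynomials as Math::BigInt
    my @tab;
    for my $i (0 .. 15) {          # the 16 stack slots tab[0..15]
        my $t = Math::BigInt->new(0);
        $t->bxor($a->copy)           if $i & 1;
        $t->bxor($a->copy->blsft(1)) if $i & 2;
        $t->bxor($a->copy->blsft(2)) if $i & 4;
        $t->bxor($a->copy->blsft(3)) if $i & 8;
        $tab[$i] = $t;
    }
    my $r = Math::BigInt->new(0);
    for my $k (0 .. 15) {          # 16 nibbles of the 64-bit b
        my $nib = $b->copy->brsft(4 * $k)->band(0xf)->numify;
        $r->bxor($tab[$nib]->copy->blsft(4 * $k));
    }
    return $r;                     # up to 127 bits, i.e. the ($hi,$lo) pair
}

# quick check: squaring x^3+x+1 over GF(2) just spreads the bits out
my $x = Math::BigInt->new("0b1011");
print gf2m_mul_1x1($x, $x)->as_bin, "\n";    # prints 0b1000101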
diff --git a/src/lib/libcrypto/bn/asm/s390x-mont.pl b/src/lib/libcrypto/bn/asm/s390x-mont.pl
deleted file mode 100644
index 9fd64e81ee..0000000000
--- a/src/lib/libcrypto/bn/asm/s390x-mont.pl
+++ /dev/null
@@ -1,277 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# April 2007.
11#
12# Performance improvement over vanilla C code varies from 85% to 45%
13# depending on key length and benchmark. Unfortunately in this context
14# these are not very impressive results [for code that utilizes "wide"
15# 64x64=128-bit multiplication, which is not commonly available to C
16# programmers], at least a hand-coded bn_asm.c replacement is known to
17# provide 30-40% better results for the longest keys. Well, on second
18# thought it's not very surprising, because z-CPUs are single-issue
19# and execute _strictly_ in order, while bn_mul_mont is more or less
20# dependent on the CPU's ability to pipeline instructions and have several
21# of them "in-flight" at the same time. I mean, while other methods,
22# for example Karatsuba, aim to minimize the amount of multiplications at
23# the cost of an increase in other operations, bn_mul_mont aims to neatly
24# "overlap" multiplications and the other operations [and on most
25# platforms even minimize the amount of the other operations, in
26# particular references to memory]. But it's possible to improve this
27# module performance by implementing dedicated squaring code-path and
28# possibly by unrolling loops...
29
30# January 2009.
31#
32# Reschedule to minimize/avoid Address Generation Interlock hazard,
33# make inner loops counter-based.
34
35# November 2010.
36#
37# Adapt for -m31 build. If kernel supports what's called "highgprs"
38# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
39# instructions and achieve "64-bit" performance even in 31-bit legacy
40# application context. The feature is not specific to any particular
41# processor, as long as it's a "z-CPU". The latter implies that the code
42# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
43# is achieved by swapping words after 64-bit loads; follow the _dswap-s.
44# On z990 it was measured to perform 2.6-2.2 times better than
45# compiler-generated code, less for longer keys...
46
47$flavour = shift;
48
49if ($flavour =~ /3[12]/) {
50 $SIZE_T=4;
51 $g="";
52} else {
53 $SIZE_T=8;
54 $g="g";
55}
56
57while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
58open STDOUT,">$output";
59
60$stdframe=16*$SIZE_T+4*8;
61
62$mn0="%r0";
63$num="%r1";
64
65# int bn_mul_mont(
66$rp="%r2"; # BN_ULONG *rp,
67$ap="%r3"; # const BN_ULONG *ap,
68$bp="%r4"; # const BN_ULONG *bp,
69$np="%r5"; # const BN_ULONG *np,
70$n0="%r6"; # const BN_ULONG *n0,
71#$num="160(%r15)" # int num);
72
73$bi="%r2"; # zaps rp
74$j="%r7";
75
76$ahi="%r8";
77$alo="%r9";
78$nhi="%r10";
79$nlo="%r11";
80$AHI="%r12";
81$NHI="%r13";
82$count="%r14";
83$sp="%r15";
84
85$code.=<<___;
86.text
87.globl bn_mul_mont
88.type bn_mul_mont,\@function
89bn_mul_mont:
90 lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num
91 sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes
92 la $bp,0($num,$bp)
93
94 st${g} %r2,2*$SIZE_T($sp)
95
96 cghi $num,16 #
97 lghi %r2,0 #
98 blr %r14 # if($num<16) return 0;
99___
100$code.=<<___ if ($flavour =~ /3[12]/);
101 tmll $num,4
102 bnzr %r14 # if ($num&1) return 0;
103___
104$code.=<<___ if ($flavour !~ /3[12]/);
105 cghi $num,96 #
106 bhr %r14 # if($num>96) return 0;
107___
108$code.=<<___;
109 stm${g} %r3,%r15,3*$SIZE_T($sp)
110
111 lghi $rp,-$stdframe-8 # leave room for carry bit
112 lcgr $j,$num # -$num
113 lgr %r0,$sp
114 la $rp,0($rp,$sp)
115 la $sp,0($j,$rp) # alloca
116 st${g} %r0,0($sp) # back chain
117
118 sra $num,3 # restore $num
119 la $bp,0($j,$bp) # restore $bp
120 ahi $num,-1 # adjust $num for inner loop
121 lg $n0,0($n0) # pull n0
122 _dswap $n0
123
124 lg $bi,0($bp)
125 _dswap $bi
126 lg $alo,0($ap)
127 _dswap $alo
128 mlgr $ahi,$bi # ap[0]*bp[0]
129 lgr $AHI,$ahi
130
131 lgr $mn0,$alo # "tp[0]"*n0
132 msgr $mn0,$n0
133
134 lg $nlo,0($np) #
135 _dswap $nlo
136 mlgr $nhi,$mn0 # np[0]*m1
137 algr $nlo,$alo # +="tp[0]"
138 lghi $NHI,0
139 alcgr $NHI,$nhi
140
141 la $j,8(%r0) # j=1
142 lr $count,$num
143
144.align 16
145.L1st:
146 lg $alo,0($j,$ap)
147 _dswap $alo
148 mlgr $ahi,$bi # ap[j]*bp[0]
149 algr $alo,$AHI
150 lghi $AHI,0
151 alcgr $AHI,$ahi
152
153 lg $nlo,0($j,$np)
154 _dswap $nlo
155 mlgr $nhi,$mn0 # np[j]*m1
156 algr $nlo,$NHI
157 lghi $NHI,0
158 alcgr $nhi,$NHI # +="tp[j]"
159 algr $nlo,$alo
160 alcgr $NHI,$nhi
161
162 stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
163 la $j,8($j) # j++
164 brct $count,.L1st
165
166 algr $NHI,$AHI
167 lghi $AHI,0
168 alcgr $AHI,$AHI # upmost overflow bit
169 stg $NHI,$stdframe-8($j,$sp)
170 stg $AHI,$stdframe($j,$sp)
171 la $bp,8($bp) # bp++
172
173.Louter:
174 lg $bi,0($bp) # bp[i]
175 _dswap $bi
176 lg $alo,0($ap)
177 _dswap $alo
178 mlgr $ahi,$bi # ap[0]*bp[i]
179 alg $alo,$stdframe($sp) # +=tp[0]
180 lghi $AHI,0
181 alcgr $AHI,$ahi
182
183 lgr $mn0,$alo
184 msgr $mn0,$n0 # tp[0]*n0
185
186 lg $nlo,0($np) # np[0]
187 _dswap $nlo
188 mlgr $nhi,$mn0 # np[0]*m1
189 algr $nlo,$alo # +="tp[0]"
190 lghi $NHI,0
191 alcgr $NHI,$nhi
192
193 la $j,8(%r0) # j=1
194 lr $count,$num
195
196.align 16
197.Linner:
198 lg $alo,0($j,$ap)
199 _dswap $alo
200 mlgr $ahi,$bi # ap[j]*bp[i]
201 algr $alo,$AHI
202 lghi $AHI,0
203 alcgr $ahi,$AHI
204 alg $alo,$stdframe($j,$sp)# +=tp[j]
205 alcgr $AHI,$ahi
206
207 lg $nlo,0($j,$np)
208 _dswap $nlo
209 mlgr $nhi,$mn0 # np[j]*m1
210 algr $nlo,$NHI
211 lghi $NHI,0
212 alcgr $nhi,$NHI
213 algr $nlo,$alo # +="tp[j]"
214 alcgr $NHI,$nhi
215
216 stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
217 la $j,8($j) # j++
218 brct $count,.Linner
219
220 algr $NHI,$AHI
221 lghi $AHI,0
222 alcgr $AHI,$AHI
223 alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
224 lghi $ahi,0
225 alcgr $AHI,$ahi # new upmost overflow bit
226 stg $NHI,$stdframe-8($j,$sp)
227 stg $AHI,$stdframe($j,$sp)
228
229 la $bp,8($bp) # bp++
230 cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num]
231 jne .Louter
232
233 l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp
234 la $ap,$stdframe($sp)
235 ahi $num,1 # restore $num, incidentally clears "borrow"
236
237 la $j,0(%r0)
238 lr $count,$num
239.Lsub: lg $alo,0($j,$ap)
240 lg $nlo,0($j,$np)
241 _dswap $nlo
242 slbgr $alo,$nlo
243 stg $alo,0($j,$rp)
244 la $j,8($j)
245 brct $count,.Lsub
246 lghi $ahi,0
247 slbgr $AHI,$ahi # handle upmost carry
248
249 ngr $ap,$AHI
250 lghi $np,-1
251 xgr $np,$AHI
252 ngr $np,$rp
253 ogr $ap,$np # ap=borrow?tp:rp
254
255 la $j,0(%r0)
256 lgr $count,$num
257.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
258 _dswap $alo
259 stg $j,$stdframe($j,$sp) # zap tp
260 stg $alo,0($j,$rp)
261 la $j,8($j)
262 brct $count,.Lcopy
263
264 la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
265 lm${g} %r6,%r15,0(%r1)
266 lghi %r2,1 # signal "processed"
267 br %r14
268.size bn_mul_mont,.-bn_mul_mont
269.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
270___
271
272foreach (split("\n",$code)) {
273 s/\`([^\`]*)\`/eval $1/ge;
274 s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
275 print $_,"\n";
276}
277close STDOUT;
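The .L1st/.Louter/.Linner structure above is the textbook word-serial Montgomery multiplication: for each word bp[i], add ap*bp[i] into the running tp, derive m = tp[0]*n0 mod 2^64 so that adding m*np makes the low word vanish, shift tp down one word, and finish with the conditional subtraction of .Lsub/.Lcopy. A compact Perl model of that loop (illustration only; $n0 is assumed to be the usual precomputed -np[0]^-1 mod 2^64 from BN_MONT_CTX, and the names merely mirror the C prototype):

#!/usr/bin/env perl
# Word-serial Montgomery multiplication, the algorithm the .L1st /
# .Louter / .Linner loops above implement: returns a*b*R^-1 mod n with
# R = 2^(64*num).  Reference model only.
use strict;
use warnings;
use Math::BigInt;

my $W = Math::BigInt->new(1)->blsft(64);    # the word base, 2^64

sub bn_mul_mont {
    my ($ap, $bp, $np, $n0, $num) = @_;     # arrays of BigInt words, LSW first
    my @tp = map { Math::BigInt->new(0) } 0 .. $num;
    for my $i (0 .. $num - 1) {
        # tp += ap * bp[i]  (the ap[j]*bp[i] mlgr/alcgr chain)
        my $c = Math::BigInt->new(0);
        for my $j (0 .. $num - 1) {
            my $t = $tp[$j] + $ap->[$j] * $bp->[$i] + $c;
            ($tp[$j], $c) = ($t % $W, $t / $W);
        }
        $tp[$num] += $c;
        # pick m so that tp + m*np is divisible by the word base,
        # then add m*np and shift tp down one word
        my $m = ($tp[0] * $n0) % $W;
        $c = ($tp[0] + $m * $np->[0]) / $W;  # low word cancels exactly
        for my $j (1 .. $num - 1) {
            my $t = $tp[$j] + $m * $np->[$j] + $c;
            ($tp[$j - 1], $c) = ($t % $W, $t / $W);
        }
        $tp[$num - 1] = $tp[$num] + $c;
        $tp[$num]     = Math::BigInt->new(0);
    }
    # conditional subtraction, as in .Lsub/.Lcopy
    my ($t, $n) = map { Math::BigInt->new(0) } 1 .. 2;
    $t = $t * $W + $tp[$_]   for reverse 0 .. $num - 1;
    $n = $n * $W + $np->[$_] for reverse 0 .. $num - 1;
    $t -= $n if $t >= $n;
    return $t;
}

Note that the model's $t >= $n comparison is a data-dependent branch; the assembly instead selects between tp and rp with the ngr/xgr/ogr mask sequence after .Lsub, so the copy-back never branches on the borrow.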
diff --git a/src/lib/libcrypto/bn/asm/s390x.S b/src/lib/libcrypto/bn/asm/s390x.S
deleted file mode 100755
index 43fcb79bc0..0000000000
--- a/src/lib/libcrypto/bn/asm/s390x.S
+++ /dev/null
@@ -1,678 +0,0 @@
1.ident "s390x.S, version 1.1"
2// ====================================================================
3// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
4// project.
5//
6// Rights for redistribution and usage in source and binary forms are
7// granted according to the OpenSSL license. Warranty of any kind is
8// disclaimed.
9// ====================================================================
10
11.text
12
13#define zero %r0
14
15// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
16.globl bn_mul_add_words
17.type bn_mul_add_words,@function
18.align 4
19bn_mul_add_words:
20 lghi zero,0 // zero = 0
21 la %r1,0(%r2) // put rp aside
22 lghi %r2,0 // i=0;
23 ltgfr %r4,%r4
24 bler %r14 // if (len<=0) return 0;
25
26 stmg %r6,%r10,48(%r15)
27 lghi %r10,3
28 lghi %r8,0 // carry = 0
29 nr %r10,%r4 // len%4
30 sra %r4,2 // cnt=len/4
31 jz .Loop1_madd // carry is incidentally cleared if branch taken
32 algr zero,zero // clear carry
33
34.Loop4_madd:
35 lg %r7,0(%r2,%r3) // ap[i]
36 mlgr %r6,%r5 // *=w
37 alcgr %r7,%r8 // +=carry
38 alcgr %r6,zero
39 alg %r7,0(%r2,%r1) // +=rp[i]
40 stg %r7,0(%r2,%r1) // rp[i]=
41
42 lg %r9,8(%r2,%r3)
43 mlgr %r8,%r5
44 alcgr %r9,%r6
45 alcgr %r8,zero
46 alg %r9,8(%r2,%r1)
47 stg %r9,8(%r2,%r1)
48
49 lg %r7,16(%r2,%r3)
50 mlgr %r6,%r5
51 alcgr %r7,%r8
52 alcgr %r6,zero
53 alg %r7,16(%r2,%r1)
54 stg %r7,16(%r2,%r1)
55
56 lg %r9,24(%r2,%r3)
57 mlgr %r8,%r5
58 alcgr %r9,%r6
59 alcgr %r8,zero
60 alg %r9,24(%r2,%r1)
61 stg %r9,24(%r2,%r1)
62
63 la %r2,32(%r2) // i+=4
64 brct %r4,.Loop4_madd
65
66 la %r10,1(%r10) // see if len%4 is zero ...
67 brct %r10,.Loop1_madd // without touching condition code:-)
68
69.Lend_madd:
70 alcgr %r8,zero // collect carry bit
71 lgr %r2,%r8
72 lmg %r6,%r10,48(%r15)
73 br %r14
74
75.Loop1_madd:
76 lg %r7,0(%r2,%r3) // ap[i]
77 mlgr %r6,%r5 // *=w
78 alcgr %r7,%r8 // +=carry
79 alcgr %r6,zero
80 alg %r7,0(%r2,%r1) // +=rp[i]
81 stg %r7,0(%r2,%r1) // rp[i]=
82
83 lgr %r8,%r6
84 la %r2,8(%r2) // i++
85 brct %r10,.Loop1_madd
86
87 j .Lend_madd
88.size bn_mul_add_words,.-bn_mul_add_words
89
90// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
91.globl bn_mul_words
92.type bn_mul_words,@function
93.align 4
94bn_mul_words:
95 lghi zero,0 // zero = 0
96 la %r1,0(%r2) // put rp aside
97 lghi %r2,0 // i=0;
98 ltgfr %r4,%r4
99 bler %r14 // if (len<=0) return 0;
100
101 stmg %r6,%r10,48(%r15)
102 lghi %r10,3
103 lghi %r8,0 // carry = 0
104 nr %r10,%r4 // len%4
105 sra %r4,2 // cnt=len/4
106 jz .Loop1_mul // carry is incidentally cleared if branch taken
107 algr zero,zero // clear carry
108
109.Loop4_mul:
110 lg %r7,0(%r2,%r3) // ap[i]
111 mlgr %r6,%r5 // *=w
112 alcgr %r7,%r8 // +=carry
113 stg %r7,0(%r2,%r1) // rp[i]=
114
115 lg %r9,8(%r2,%r3)
116 mlgr %r8,%r5
117 alcgr %r9,%r6
118 stg %r9,8(%r2,%r1)
119
120 lg %r7,16(%r2,%r3)
121 mlgr %r6,%r5
122 alcgr %r7,%r8
123 stg %r7,16(%r2,%r1)
124
125 lg %r9,24(%r2,%r3)
126 mlgr %r8,%r5
127 alcgr %r9,%r6
128 stg %r9,24(%r2,%r1)
129
130 la %r2,32(%r2) // i+=4
131 brct %r4,.Loop4_mul
132
133 la %r10,1(%r10) // see if len%4 is zero ...
134 brct %r10,.Loop1_mul // without touching condition code:-)
135
136.Lend_mul:
137 alcgr %r8,zero // collect carry bit
138 lgr %r2,%r8
139 lmg %r6,%r10,48(%r15)
140 br %r14
141
142.Loop1_mul:
143 lg %r7,0(%r2,%r3) // ap[i]
144 mlgr %r6,%r5 // *=w
145 alcgr %r7,%r8 // +=carry
146 stg %r7,0(%r2,%r1) // rp[i]=
147
148 lgr %r8,%r6
149 la %r2,8(%r2) // i++
150 brct %r10,.Loop1_mul
151
152 j .Lend_mul
153.size bn_mul_words,.-bn_mul_words
154
155// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r2,int r4)
156.globl bn_sqr_words
157.type bn_sqr_words,@function
158.align 4
159bn_sqr_words:
160 ltgfr %r4,%r4
161 bler %r14
162
163 stmg %r6,%r7,48(%r15)
164 srag %r1,%r4,2 // cnt=len/4
165 jz .Loop1_sqr
166
167.Loop4_sqr:
168 lg %r7,0(%r3)
169 mlgr %r6,%r7
170 stg %r7,0(%r2)
171 stg %r6,8(%r2)
172
173 lg %r7,8(%r3)
174 mlgr %r6,%r7
175 stg %r7,16(%r2)
176 stg %r6,24(%r2)
177
178 lg %r7,16(%r3)
179 mlgr %r6,%r7
180 stg %r7,32(%r2)
181 stg %r6,40(%r2)
182
183 lg %r7,24(%r3)
184 mlgr %r6,%r7
185 stg %r7,48(%r2)
186 stg %r6,56(%r2)
187
188 la %r3,32(%r3)
189 la %r2,64(%r2)
190 brct %r1,.Loop4_sqr
191
192 lghi %r1,3
193 nr %r4,%r1 // cnt=len%4
194 jz .Lend_sqr
195
196.Loop1_sqr:
197 lg %r7,0(%r3)
198 mlgr %r6,%r7
199 stg %r7,0(%r2)
200 stg %r6,8(%r2)
201
202 la %r3,8(%r3)
203 la %r2,16(%r2)
204 brct %r4,.Loop1_sqr
205
206.Lend_sqr:
207 lmg %r6,%r7,48(%r15)
208 br %r14
209.size bn_sqr_words,.-bn_sqr_words
210
211// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
212.globl bn_div_words
213.type bn_div_words,@function
214.align 4
215bn_div_words:
216 dlgr %r2,%r4
217 lgr %r2,%r3
218 br %r14
219.size bn_div_words,.-bn_div_words
220
221// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
222.globl bn_add_words
223.type bn_add_words,@function
224.align 4
225bn_add_words:
226 la %r1,0(%r2) // put rp aside
227 lghi %r2,0 // i=0
228 ltgfr %r5,%r5
229 bler %r14 // if (len<=0) return 0;
230
231 stg %r6,48(%r15)
232 lghi %r6,3
233 nr %r6,%r5 // len%4
234 sra %r5,2 // len/4, use sra because it sets condition code
235 jz .Loop1_add // carry is incidentally cleared if branch taken
236 algr %r2,%r2 // clear carry
237
238.Loop4_add:
239 lg %r0,0(%r2,%r3)
240 alcg %r0,0(%r2,%r4)
241 stg %r0,0(%r2,%r1)
242 lg %r0,8(%r2,%r3)
243 alcg %r0,8(%r2,%r4)
244 stg %r0,8(%r2,%r1)
245 lg %r0,16(%r2,%r3)
246 alcg %r0,16(%r2,%r4)
247 stg %r0,16(%r2,%r1)
248 lg %r0,24(%r2,%r3)
249 alcg %r0,24(%r2,%r4)
250 stg %r0,24(%r2,%r1)
251
252 la %r2,32(%r2) // i+=4
253 brct %r5,.Loop4_add
254
255 la %r6,1(%r6) // see if len%4 is zero ...
256 brct %r6,.Loop1_add // without touching condition code:-)
257
258.Lexit_add:
259 lghi %r2,0
260 alcgr %r2,%r2
261 lg %r6,48(%r15)
262 br %r14
263
264.Loop1_add:
265 lg %r0,0(%r2,%r3)
266 alcg %r0,0(%r2,%r4)
267 stg %r0,0(%r2,%r1)
268
269 la %r2,8(%r2) // i++
270 brct %r6,.Loop1_add
271
272 j .Lexit_add
273.size bn_add_words,.-bn_add_words
274
275// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
276.globl bn_sub_words
277.type bn_sub_words,@function
278.align 4
279bn_sub_words:
280 la %r1,0(%r2) // put rp aside
281 lghi %r2,0 // i=0
282 ltgfr %r5,%r5
283 bler %r14 // if (len<=0) return 0;
284
285 stg %r6,48(%r15)
286 lghi %r6,3
287 nr %r6,%r5 // len%4
288 sra %r5,2 // len/4, use sra because it sets condition code
289 jnz .Loop4_sub // borrow is incidentally cleared if branch taken
290 slgr %r2,%r2 // clear borrow
291
292.Loop1_sub:
293 lg %r0,0(%r2,%r3)
294 slbg %r0,0(%r2,%r4)
295 stg %r0,0(%r2,%r1)
296
297 la %r2,8(%r2) // i++
298 brct %r6,.Loop1_sub
299 j .Lexit_sub
300
301.Loop4_sub:
302 lg %r0,0(%r2,%r3)
303 slbg %r0,0(%r2,%r4)
304 stg %r0,0(%r2,%r1)
305 lg %r0,8(%r2,%r3)
306 slbg %r0,8(%r2,%r4)
307 stg %r0,8(%r2,%r1)
308 lg %r0,16(%r2,%r3)
309 slbg %r0,16(%r2,%r4)
310 stg %r0,16(%r2,%r1)
311 lg %r0,24(%r2,%r3)
312 slbg %r0,24(%r2,%r4)
313 stg %r0,24(%r2,%r1)
314
315 la %r2,32(%r2) // i+=4
316 brct %r5,.Loop4_sub
317
318 la %r6,1(%r6) // see if len%4 is zero ...
319 brct %r6,.Loop1_sub // without touching condition code:-)
320
321.Lexit_sub:
322 lghi %r2,0
323 slbgr %r2,%r2
324 lcgr %r2,%r2
325 lg %r6,48(%r15)
326 br %r14
327.size bn_sub_words,.-bn_sub_words
328
329#define c1 %r1
330#define c2 %r5
331#define c3 %r8
332
333#define mul_add_c(ai,bi,c1,c2,c3) \
334 lg %r7,ai*8(%r3); \
335 mlg %r6,bi*8(%r4); \
336 algr c1,%r7; \
337 alcgr c2,%r6; \
338 alcgr c3,zero
339
340// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
341.globl bn_mul_comba8
342.type bn_mul_comba8,@function
343.align 4
344bn_mul_comba8:
345 stmg %r6,%r8,48(%r15)
346
347 lghi c1,0
348 lghi c2,0
349 lghi c3,0
350 lghi zero,0
351
352 mul_add_c(0,0,c1,c2,c3);
353 stg c1,0*8(%r2)
354 lghi c1,0
355
356 mul_add_c(0,1,c2,c3,c1);
357 mul_add_c(1,0,c2,c3,c1);
358 stg c2,1*8(%r2)
359 lghi c2,0
360
361 mul_add_c(2,0,c3,c1,c2);
362 mul_add_c(1,1,c3,c1,c2);
363 mul_add_c(0,2,c3,c1,c2);
364 stg c3,2*8(%r2)
365 lghi c3,0
366
367 mul_add_c(0,3,c1,c2,c3);
368 mul_add_c(1,2,c1,c2,c3);
369 mul_add_c(2,1,c1,c2,c3);
370 mul_add_c(3,0,c1,c2,c3);
371 stg c1,3*8(%r2)
372 lghi c1,0
373
374 mul_add_c(4,0,c2,c3,c1);
375 mul_add_c(3,1,c2,c3,c1);
376 mul_add_c(2,2,c2,c3,c1);
377 mul_add_c(1,3,c2,c3,c1);
378 mul_add_c(0,4,c2,c3,c1);
379 stg c2,4*8(%r2)
380 lghi c2,0
381
382 mul_add_c(0,5,c3,c1,c2);
383 mul_add_c(1,4,c3,c1,c2);
384 mul_add_c(2,3,c3,c1,c2);
385 mul_add_c(3,2,c3,c1,c2);
386 mul_add_c(4,1,c3,c1,c2);
387 mul_add_c(5,0,c3,c1,c2);
388 stg c3,5*8(%r2)
389 lghi c3,0
390
391 mul_add_c(6,0,c1,c2,c3);
392 mul_add_c(5,1,c1,c2,c3);
393 mul_add_c(4,2,c1,c2,c3);
394 mul_add_c(3,3,c1,c2,c3);
395 mul_add_c(2,4,c1,c2,c3);
396 mul_add_c(1,5,c1,c2,c3);
397 mul_add_c(0,6,c1,c2,c3);
398 stg c1,6*8(%r2)
399 lghi c1,0
400
401 mul_add_c(0,7,c2,c3,c1);
402 mul_add_c(1,6,c2,c3,c1);
403 mul_add_c(2,5,c2,c3,c1);
404 mul_add_c(3,4,c2,c3,c1);
405 mul_add_c(4,3,c2,c3,c1);
406 mul_add_c(5,2,c2,c3,c1);
407 mul_add_c(6,1,c2,c3,c1);
408 mul_add_c(7,0,c2,c3,c1);
409 stg c2,7*8(%r2)
410 lghi c2,0
411
412 mul_add_c(7,1,c3,c1,c2);
413 mul_add_c(6,2,c3,c1,c2);
414 mul_add_c(5,3,c3,c1,c2);
415 mul_add_c(4,4,c3,c1,c2);
416 mul_add_c(3,5,c3,c1,c2);
417 mul_add_c(2,6,c3,c1,c2);
418 mul_add_c(1,7,c3,c1,c2);
419 stg c3,8*8(%r2)
420 lghi c3,0
421
422 mul_add_c(2,7,c1,c2,c3);
423 mul_add_c(3,6,c1,c2,c3);
424 mul_add_c(4,5,c1,c2,c3);
425 mul_add_c(5,4,c1,c2,c3);
426 mul_add_c(6,3,c1,c2,c3);
427 mul_add_c(7,2,c1,c2,c3);
428 stg c1,9*8(%r2)
429 lghi c1,0
430
431 mul_add_c(7,3,c2,c3,c1);
432 mul_add_c(6,4,c2,c3,c1);
433 mul_add_c(5,5,c2,c3,c1);
434 mul_add_c(4,6,c2,c3,c1);
435 mul_add_c(3,7,c2,c3,c1);
436 stg c2,10*8(%r2)
437 lghi c2,0
438
439 mul_add_c(4,7,c3,c1,c2);
440 mul_add_c(5,6,c3,c1,c2);
441 mul_add_c(6,5,c3,c1,c2);
442 mul_add_c(7,4,c3,c1,c2);
443 stg c3,11*8(%r2)
444 lghi c3,0
445
446 mul_add_c(7,5,c1,c2,c3);
447 mul_add_c(6,6,c1,c2,c3);
448 mul_add_c(5,7,c1,c2,c3);
449 stg c1,12*8(%r2)
450 lghi c1,0
451
452
453 mul_add_c(6,7,c2,c3,c1);
454 mul_add_c(7,6,c2,c3,c1);
455 stg c2,13*8(%r2)
456 lghi c2,0
457
458 mul_add_c(7,7,c3,c1,c2);
459 stg c3,14*8(%r2)
460 stg c1,15*8(%r2)
461
462 lmg %r6,%r8,48(%r15)
463 br %r14
464.size bn_mul_comba8,.-bn_mul_comba8
465
466// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
467.globl bn_mul_comba4
468.type bn_mul_comba4,@function
469.align 4
470bn_mul_comba4:
471 stmg %r6,%r8,48(%r15)
472
473 lghi c1,0
474 lghi c2,0
475 lghi c3,0
476 lghi zero,0
477
478 mul_add_c(0,0,c1,c2,c3);
479	stg	c1,0*8(%r2)
480 lghi c1,0
481
482 mul_add_c(0,1,c2,c3,c1);
483 mul_add_c(1,0,c2,c3,c1);
484 stg c2,1*8(%r2)
485 lghi c2,0
486
487 mul_add_c(2,0,c3,c1,c2);
488 mul_add_c(1,1,c3,c1,c2);
489 mul_add_c(0,2,c3,c1,c2);
490 stg c3,2*8(%r2)
491 lghi c3,0
492
493 mul_add_c(0,3,c1,c2,c3);
494 mul_add_c(1,2,c1,c2,c3);
495 mul_add_c(2,1,c1,c2,c3);
496 mul_add_c(3,0,c1,c2,c3);
497 stg c1,3*8(%r2)
498 lghi c1,0
499
500 mul_add_c(3,1,c2,c3,c1);
501 mul_add_c(2,2,c2,c3,c1);
502 mul_add_c(1,3,c2,c3,c1);
503 stg c2,4*8(%r2)
504 lghi c2,0
505
506 mul_add_c(2,3,c3,c1,c2);
507 mul_add_c(3,2,c3,c1,c2);
508 stg c3,5*8(%r2)
509 lghi c3,0
510
511 mul_add_c(3,3,c1,c2,c3);
512 stg c1,6*8(%r2)
513 stg c2,7*8(%r2)
514
515	lmg	%r6,%r8,48(%r15)
516 br %r14
517.size bn_mul_comba4,.-bn_mul_comba4
518
519#define sqr_add_c(ai,c1,c2,c3) \
520 lg %r7,ai*8(%r3); \
521 mlgr %r6,%r7; \
522 algr c1,%r7; \
523 alcgr c2,%r6; \
524 alcgr c3,zero
525
526#define sqr_add_c2(ai,aj,c1,c2,c3) \
527 lg %r7,ai*8(%r3); \
528 mlg %r6,aj*8(%r3); \
529 algr c1,%r7; \
530 alcgr c2,%r6; \
531 alcgr c3,zero; \
532 algr c1,%r7; \
533 alcgr c2,%r6; \
534 alcgr c3,zero
535
536// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
537.globl bn_sqr_comba8
538.type bn_sqr_comba8,@function
539.align 4
540bn_sqr_comba8:
541 stmg %r6,%r8,48(%r15)
542
543 lghi c1,0
544 lghi c2,0
545 lghi c3,0
546 lghi zero,0
547
548 sqr_add_c(0,c1,c2,c3);
549 stg c1,0*8(%r2)
550 lghi c1,0
551
552 sqr_add_c2(1,0,c2,c3,c1);
553 stg c2,1*8(%r2)
554 lghi c2,0
555
556 sqr_add_c(1,c3,c1,c2);
557 sqr_add_c2(2,0,c3,c1,c2);
558 stg c3,2*8(%r2)
559 lghi c3,0
560
561 sqr_add_c2(3,0,c1,c2,c3);
562 sqr_add_c2(2,1,c1,c2,c3);
563 stg c1,3*8(%r2)
564 lghi c1,0
565
566 sqr_add_c(2,c2,c3,c1);
567 sqr_add_c2(3,1,c2,c3,c1);
568 sqr_add_c2(4,0,c2,c3,c1);
569 stg c2,4*8(%r2)
570 lghi c2,0
571
572 sqr_add_c2(5,0,c3,c1,c2);
573 sqr_add_c2(4,1,c3,c1,c2);
574 sqr_add_c2(3,2,c3,c1,c2);
575 stg c3,5*8(%r2)
576 lghi c3,0
577
578 sqr_add_c(3,c1,c2,c3);
579 sqr_add_c2(4,2,c1,c2,c3);
580 sqr_add_c2(5,1,c1,c2,c3);
581 sqr_add_c2(6,0,c1,c2,c3);
582 stg c1,6*8(%r2)
583 lghi c1,0
584
585 sqr_add_c2(7,0,c2,c3,c1);
586 sqr_add_c2(6,1,c2,c3,c1);
587 sqr_add_c2(5,2,c2,c3,c1);
588 sqr_add_c2(4,3,c2,c3,c1);
589 stg c2,7*8(%r2)
590 lghi c2,0
591
592 sqr_add_c(4,c3,c1,c2);
593 sqr_add_c2(5,3,c3,c1,c2);
594 sqr_add_c2(6,2,c3,c1,c2);
595 sqr_add_c2(7,1,c3,c1,c2);
596 stg c3,8*8(%r2)
597 lghi c3,0
598
599 sqr_add_c2(7,2,c1,c2,c3);
600 sqr_add_c2(6,3,c1,c2,c3);
601 sqr_add_c2(5,4,c1,c2,c3);
602 stg c1,9*8(%r2)
603 lghi c1,0
604
605 sqr_add_c(5,c2,c3,c1);
606 sqr_add_c2(6,4,c2,c3,c1);
607 sqr_add_c2(7,3,c2,c3,c1);
608 stg c2,10*8(%r2)
609 lghi c2,0
610
611 sqr_add_c2(7,4,c3,c1,c2);
612 sqr_add_c2(6,5,c3,c1,c2);
613 stg c3,11*8(%r2)
614 lghi c3,0
615
616 sqr_add_c(6,c1,c2,c3);
617 sqr_add_c2(7,5,c1,c2,c3);
618 stg c1,12*8(%r2)
619 lghi c1,0
620
621 sqr_add_c2(7,6,c2,c3,c1);
622 stg c2,13*8(%r2)
623 lghi c2,0
624
625 sqr_add_c(7,c3,c1,c2);
626 stg c3,14*8(%r2)
627 stg c1,15*8(%r2)
628
629 lmg %r6,%r8,48(%r15)
630 br %r14
631.size bn_sqr_comba8,.-bn_sqr_comba8
632
633// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
634.globl bn_sqr_comba4
635.type bn_sqr_comba4,@function
636.align 4
637bn_sqr_comba4:
638 stmg %r6,%r8,48(%r15)
639
640 lghi c1,0
641 lghi c2,0
642 lghi c3,0
643 lghi zero,0
644
645 sqr_add_c(0,c1,c2,c3);
646 stg c1,0*8(%r2)
647 lghi c1,0
648
649 sqr_add_c2(1,0,c2,c3,c1);
650 stg c2,1*8(%r2)
651 lghi c2,0
652
653 sqr_add_c(1,c3,c1,c2);
654 sqr_add_c2(2,0,c3,c1,c2);
655 stg c3,2*8(%r2)
656 lghi c3,0
657
658 sqr_add_c2(3,0,c1,c2,c3);
659 sqr_add_c2(2,1,c1,c2,c3);
660 stg c1,3*8(%r2)
661 lghi c1,0
662
663 sqr_add_c(2,c2,c3,c1);
664 sqr_add_c2(3,1,c2,c3,c1);
665 stg c2,4*8(%r2)
666 lghi c2,0
667
668 sqr_add_c2(3,2,c3,c1,c2);
669 stg c3,5*8(%r2)
670 lghi c3,0
671
672 sqr_add_c(3,c1,c2,c3);
673 stg c1,6*8(%r2)
674 stg c2,7*8(%r2)
675
676 lmg %r6,%r8,48(%r15)
677 br %r14
678.size bn_sqr_comba4,.-bn_sqr_comba4
diff --git a/src/lib/libcrypto/bn/asm/sparcv8.S b/src/lib/libcrypto/bn/asm/sparcv8.S
deleted file mode 100644
index 88c5dc480a..0000000000
--- a/src/lib/libcrypto/bn/asm/sparcv8.S
+++ /dev/null
@@ -1,1458 +0,0 @@
1.ident "sparcv8.s, Version 1.4"
2.ident "SPARC v8 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
3
4/*
5 * ====================================================================
6 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
7 * project.
8 *
9 * Rights for redistribution and usage in source and binary forms are
10 * granted according to the OpenSSL license. Warranty of any kind is
11 * disclaimed.
12 * ====================================================================
13 */
14
15/*
16 * This is my modest contribution to the OpenSSL project (see
17 * http://www.openssl.org/ for more information about it) and is
18 * a drop-in SuperSPARC ISA replacement for the crypto/bn/bn_asm.c
19 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
20 *
21 * See bn_asm.sparc.v8plus.S for more details.
22 */
23
24/*
25 * Revision history.
26 *
27 * 1.1 - new loop unrolling model(*);
28 * 1.2 - made gas friendly;
29 * 1.3 - fixed problem with /usr/ccs/lib/cpp;
30 * 1.4 - some retunes;
31 *
32 * (*) see bn_asm.sparc.v8plus.S for details
33 */
34
35.section ".text",#alloc,#execinstr
36.file "bn_asm.sparc.v8.S"
37
38.align 32
39
40.global bn_mul_add_words
41/*
42 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
43 * BN_ULONG *rp,*ap;
44 * int num;
45 * BN_ULONG w;
46 */
47bn_mul_add_words:
48 cmp %o2,0
49 bg,a .L_bn_mul_add_words_proceed
50 ld [%o1],%g2
51 retl
52 clr %o0
53
54.L_bn_mul_add_words_proceed:
55 andcc %o2,-4,%g0
56 bz .L_bn_mul_add_words_tail
57 clr %o5
58
59.L_bn_mul_add_words_loop:
60 ld [%o0],%o4
61 ld [%o1+4],%g3
62 umul %o3,%g2,%g2
63 rd %y,%g1
64 addcc %o4,%o5,%o4
65 addx %g1,0,%g1
66 addcc %o4,%g2,%o4
67 st %o4,[%o0]
68 addx %g1,0,%o5
69
70 ld [%o0+4],%o4
71 ld [%o1+8],%g2
72 umul %o3,%g3,%g3
73 dec 4,%o2
74 rd %y,%g1
75 addcc %o4,%o5,%o4
76 addx %g1,0,%g1
77 addcc %o4,%g3,%o4
78 st %o4,[%o0+4]
79 addx %g1,0,%o5
80
81 ld [%o0+8],%o4
82 ld [%o1+12],%g3
83 umul %o3,%g2,%g2
84 inc 16,%o1
85 rd %y,%g1
86 addcc %o4,%o5,%o4
87 addx %g1,0,%g1
88 addcc %o4,%g2,%o4
89 st %o4,[%o0+8]
90 addx %g1,0,%o5
91
92 ld [%o0+12],%o4
93 umul %o3,%g3,%g3
94 inc 16,%o0
95 rd %y,%g1
96 addcc %o4,%o5,%o4
97 addx %g1,0,%g1
98 addcc %o4,%g3,%o4
99 st %o4,[%o0-4]
100 addx %g1,0,%o5
101 andcc %o2,-4,%g0
102 bnz,a .L_bn_mul_add_words_loop
103 ld [%o1],%g2
104
105 tst %o2
106 bnz,a .L_bn_mul_add_words_tail
107 ld [%o1],%g2
108.L_bn_mul_add_words_return:
109 retl
110 mov %o5,%o0
111 nop
112
113.L_bn_mul_add_words_tail:
114 ld [%o0],%o4
115 umul %o3,%g2,%g2
116 addcc %o4,%o5,%o4
117 rd %y,%g1
118 addx %g1,0,%g1
119 addcc %o4,%g2,%o4
120 addx %g1,0,%o5
121 deccc %o2
122 bz .L_bn_mul_add_words_return
123 st %o4,[%o0]
124
125 ld [%o1+4],%g2
126 ld [%o0+4],%o4
127 umul %o3,%g2,%g2
128 rd %y,%g1
129 addcc %o4,%o5,%o4
130 addx %g1,0,%g1
131 addcc %o4,%g2,%o4
132 addx %g1,0,%o5
133 deccc %o2
134 bz .L_bn_mul_add_words_return
135 st %o4,[%o0+4]
136
137 ld [%o1+8],%g2
138 ld [%o0+8],%o4
139 umul %o3,%g2,%g2
140 rd %y,%g1
141 addcc %o4,%o5,%o4
142 addx %g1,0,%g1
143 addcc %o4,%g2,%o4
144 st %o4,[%o0+8]
145 retl
146 addx %g1,0,%o0
147
148.type bn_mul_add_words,#function
149.size bn_mul_add_words,(.-bn_mul_add_words)
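The routine above is a drop-in for the portable version in crypto/bn/bn_asm.c. As a reading aid, here is a sketch of the contract it implements, with a 32-bit BN_ULONG as on SPARC v8 (the reference function name is ours):

#include <stdint.h>
typedef uint32_t BN_ULONG;

/* rp[i] += ap[i] * w for i < num; returns the final carry word. */
static BN_ULONG ref_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap,
                                  int num, BN_ULONG w)
{
        uint64_t carry = 0;                  /* high half comes from rd %y */
        while (num-- > 0) {
                uint64_t t = (uint64_t)*ap++ * w + *rp + carry;
                *rp++ = (BN_ULONG)t;         /* low 32 bits back to memory */
                carry = t >> 32;
        }
        return (BN_ULONG)carry;
}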
150
151.align 32
152
153.global bn_mul_words
154/*
155 * BN_ULONG bn_mul_words(rp,ap,num,w)
156 * BN_ULONG *rp,*ap;
157 * int num;
158 * BN_ULONG w;
159 */
160bn_mul_words:
161 cmp %o2,0
162 bg,a .L_bn_mul_words_proceed
163 ld [%o1],%g2
164 retl
165 clr %o0
166
167.L_bn_mul_words_proceed:
168 andcc %o2,-4,%g0
169 bz .L_bn_mul_words_tail
170 clr %o5
171
172.L_bn_mul_words_loop:
173 ld [%o1+4],%g3
174 umul %o3,%g2,%g2
175 addcc %g2,%o5,%g2
176 rd %y,%g1
177 addx %g1,0,%o5
178 st %g2,[%o0]
179
180 ld [%o1+8],%g2
181 umul %o3,%g3,%g3
182 addcc %g3,%o5,%g3
183 rd %y,%g1
184 dec 4,%o2
185 addx %g1,0,%o5
186 st %g3,[%o0+4]
187
188 ld [%o1+12],%g3
189 umul %o3,%g2,%g2
190 addcc %g2,%o5,%g2
191 rd %y,%g1
192 inc 16,%o1
193 st %g2,[%o0+8]
194 addx %g1,0,%o5
195
196 umul %o3,%g3,%g3
197 addcc %g3,%o5,%g3
198 rd %y,%g1
199 inc 16,%o0
200 addx %g1,0,%o5
201 st %g3,[%o0-4]
202 andcc %o2,-4,%g0
203 nop
204 bnz,a .L_bn_mul_words_loop
205 ld [%o1],%g2
206
207 tst %o2
208 bnz,a .L_bn_mul_words_tail
209 ld [%o1],%g2
210.L_bn_mul_words_return:
211 retl
212 mov %o5,%o0
213 nop
214
215.L_bn_mul_words_tail:
216 umul %o3,%g2,%g2
217 addcc %g2,%o5,%g2
218 rd %y,%g1
219 addx %g1,0,%o5
220 deccc %o2
221 bz .L_bn_mul_words_return
222 st %g2,[%o0]
223 nop
224
225 ld [%o1+4],%g2
226 umul %o3,%g2,%g2
227 addcc %g2,%o5,%g2
228 rd %y,%g1
229 addx %g1,0,%o5
230 deccc %o2
231 bz .L_bn_mul_words_return
232 st %g2,[%o0+4]
233
234 ld [%o1+8],%g2
235 umul %o3,%g2,%g2
236 addcc %g2,%o5,%g2
237 rd %y,%g1
238 st %g2,[%o0+8]
239 retl
240 addx %g1,0,%o0
241
242.type bn_mul_words,#function
243.size bn_mul_words,(.-bn_mul_words)
244
245.align 32
246.global bn_sqr_words
247/*
248 * void bn_sqr_words(r,a,n)
249 * BN_ULONG *r,*a;
250 * int n;
251 */
252bn_sqr_words:
253 cmp %o2,0
254 bg,a .L_bn_sqr_words_proceed
255 ld [%o1],%g2
256 retl
257 clr %o0
258
259.L_bn_sqr_words_proceed:
260 andcc %o2,-4,%g0
261 bz .L_bn_sqr_words_tail
262 clr %o5
263
264.L_bn_sqr_words_loop:
265 ld [%o1+4],%g3
266 umul %g2,%g2,%o4
267 st %o4,[%o0]
268 rd %y,%o5
269 st %o5,[%o0+4]
270
271 ld [%o1+8],%g2
272 umul %g3,%g3,%o4
273 dec 4,%o2
274 st %o4,[%o0+8]
275 rd %y,%o5
276 st %o5,[%o0+12]
277 nop
278
279 ld [%o1+12],%g3
280 umul %g2,%g2,%o4
281 st %o4,[%o0+16]
282 rd %y,%o5
283 inc 16,%o1
284 st %o5,[%o0+20]
285
286 umul %g3,%g3,%o4
287 inc 32,%o0
288 st %o4,[%o0-8]
289 rd %y,%o5
290 st %o5,[%o0-4]
291 andcc %o2,-4,%g2
292 bnz,a .L_bn_sqr_words_loop
293 ld [%o1],%g2
294
295 tst %o2
296 nop
297 bnz,a .L_bn_sqr_words_tail
298 ld [%o1],%g2
299.L_bn_sqr_words_return:
300 retl
301 clr %o0
302
303.L_bn_sqr_words_tail:
304 umul %g2,%g2,%o4
305 st %o4,[%o0]
306 deccc %o2
307 rd %y,%o5
308 bz .L_bn_sqr_words_return
309 st %o5,[%o0+4]
310
311 ld [%o1+4],%g2
312 umul %g2,%g2,%o4
313 st %o4,[%o0+8]
314 deccc %o2
315 rd %y,%o5
316 nop
317 bz .L_bn_sqr_words_return
318 st %o5,[%o0+12]
319
320 ld [%o1+8],%g2
321 umul %g2,%g2,%o4
322 st %o4,[%o0+16]
323 rd %y,%o5
324 st %o5,[%o0+20]
325 retl
326 clr %o0
327
328.type bn_sqr_words,#function
329.size bn_sqr_words,(.-bn_sqr_words)
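bn_sqr_words produces a double-length result: each input word yields a 64-bit square stored as two consecutive words. A sketch of the contract, under the same 32-bit BN_ULONG assumption (the name ref_sqr_words is ours):

/* r[2i] and r[2i+1] receive the low and high words of a[i]^2, i < n. */
static void ref_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
        while (n-- > 0) {
                uint64_t t = (uint64_t)*a * *a;   /* umul + rd %y */
                a++;
                *r++ = (BN_ULONG)t;
                *r++ = (BN_ULONG)(t >> 32);
        }
}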
330
331.align 32
332
333.global bn_div_words
334/*
335 * BN_ULONG bn_div_words(h,l,d)
336 * BN_ULONG h,l,d;
337 */
338bn_div_words:
339 wr %o0,%y
340 udiv %o1,%o2,%o0
341 retl
342 nop
343
344.type bn_div_words,#function
345.size bn_div_words,(.-bn_div_words)
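The wr %o0,%y / udiv pair above divides the double word formed by h (placed in the Y register) and l by d. In C terms, under the usual bn_div_words precondition that h < d so the quotient fits one word:

/* Quotient of the 64-bit value (h:l) by d. */
static BN_ULONG ref_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
        return (BN_ULONG)((((uint64_t)h << 32) | l) / d);
}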
346
347.align 32
348
349.global bn_add_words
350/*
351 * BN_ULONG bn_add_words(rp,ap,bp,n)
352 * BN_ULONG *rp,*ap,*bp;
353 * int n;
354 */
355bn_add_words:
356 cmp %o3,0
357 bg,a .L_bn_add_words_proceed
358 ld [%o1],%o4
359 retl
360 clr %o0
361
362.L_bn_add_words_proceed:
363 andcc %o3,-4,%g0
364 bz .L_bn_add_words_tail
365 clr %g1
366 ba .L_bn_add_words_warm_loop
367 addcc %g0,0,%g0 ! clear carry flag
368
369.L_bn_add_words_loop:
370 ld [%o1],%o4
371.L_bn_add_words_warm_loop:
372 ld [%o2],%o5
373 ld [%o1+4],%g3
374 ld [%o2+4],%g4
375 dec 4,%o3
376 addxcc %o5,%o4,%o5
377 st %o5,[%o0]
378
379 ld [%o1+8],%o4
380 ld [%o2+8],%o5
381 inc 16,%o1
382 addxcc %g3,%g4,%g3
383 st %g3,[%o0+4]
384
385 ld [%o1-4],%g3
386 ld [%o2+12],%g4
387 inc 16,%o2
388 addxcc %o5,%o4,%o5
389 st %o5,[%o0+8]
390
391 inc 16,%o0
392 addxcc %g3,%g4,%g3
393 st %g3,[%o0-4]
394 addx %g0,0,%g1
395 andcc %o3,-4,%g0
396 bnz,a .L_bn_add_words_loop
397 addcc %g1,-1,%g0
398
399 tst %o3
400 bnz,a .L_bn_add_words_tail
401 ld [%o1],%o4
402.L_bn_add_words_return:
403 retl
404 mov %g1,%o0
405
406.L_bn_add_words_tail:
407 addcc %g1,-1,%g0
408 ld [%o2],%o5
409 addxcc %o5,%o4,%o5
410 addx %g0,0,%g1
411 deccc %o3
412 bz .L_bn_add_words_return
413 st %o5,[%o0]
414
415 ld [%o1+4],%o4
416 addcc %g1,-1,%g0
417 ld [%o2+4],%o5
418 addxcc %o5,%o4,%o5
419 addx %g0,0,%g1
420 deccc %o3
421 bz .L_bn_add_words_return
422 st %o5,[%o0+4]
423
424 ld [%o1+8],%o4
425 addcc %g1,-1,%g0
426 ld [%o2+8],%o5
427 addxcc %o5,%o4,%o5
428 st %o5,[%o0+8]
429 retl
430 addx %g0,0,%o0
431
432.type bn_add_words,#function
433.size bn_add_words,(.-bn_add_words)
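bn_add_words above (and bn_sub_words below, its mirror image built on subxcc) returns the carry out of the top word; the unrolled loop keeps that carry live in the condition codes between iterations rather than in a register. The equivalent word-at-a-time C contract:

/* rp[i] = ap[i] + bp[i] + carry; returns the final carry (0 or 1). */
static BN_ULONG ref_add_words(BN_ULONG *rp, const BN_ULONG *ap,
                              const BN_ULONG *bp, int n)
{
        uint64_t carry = 0;
        while (n-- > 0) {
                uint64_t t = (uint64_t)*ap++ + *bp++ + carry;
                *rp++ = (BN_ULONG)t;
                carry = t >> 32;
        }
        return (BN_ULONG)carry;
}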
434
435.align 32
436
437.global bn_sub_words
438/*
439 * BN_ULONG bn_sub_words(rp,ap,bp,n)
440 * BN_ULONG *rp,*ap,*bp;
441 * int n;
442 */
443bn_sub_words:
444 cmp %o3,0
445 bg,a .L_bn_sub_words_proceed
446 ld [%o1],%o4
447 retl
448 clr %o0
449
450.L_bn_sub_words_proceed:
451 andcc %o3,-4,%g0
452 bz .L_bn_sub_words_tail
453 clr %g1
454 ba .L_bn_sub_words_warm_loop
455 addcc %g0,0,%g0 ! clear carry flag
456
457.L_bn_sub_words_loop:
458 ld [%o1],%o4
459.L_bn_sub_words_warm_loop:
460 ld [%o2],%o5
461 ld [%o1+4],%g3
462 ld [%o2+4],%g4
463 dec 4,%o3
464 subxcc %o4,%o5,%o5
465 st %o5,[%o0]
466
467 ld [%o1+8],%o4
468 ld [%o2+8],%o5
469 inc 16,%o1
470 subxcc %g3,%g4,%g4
471 st %g4,[%o0+4]
472
473 ld [%o1-4],%g3
474 ld [%o2+12],%g4
475 inc 16,%o2
476 subxcc %o4,%o5,%o5
477 st %o5,[%o0+8]
478
479 inc 16,%o0
480 subxcc %g3,%g4,%g4
481 st %g4,[%o0-4]
482 addx %g0,0,%g1
483 andcc %o3,-4,%g0
484 bnz,a .L_bn_sub_words_loop
485 addcc %g1,-1,%g0
486
487 tst %o3
488 nop
489 bnz,a .L_bn_sub_words_tail
490 ld [%o1],%o4
491.L_bn_sub_words_return:
492 retl
493 mov %g1,%o0
494
495.L_bn_sub_words_tail:
496 addcc %g1,-1,%g0
497 ld [%o2],%o5
498 subxcc %o4,%o5,%o5
499 addx %g0,0,%g1
500 deccc %o3
501 bz .L_bn_sub_words_return
502 st %o5,[%o0]
503 nop
504
505 ld [%o1+4],%o4
506 addcc %g1,-1,%g0
507 ld [%o2+4],%o5
508 subxcc %o4,%o5,%o5
509 addx %g0,0,%g1
510 deccc %o3
511 bz .L_bn_sub_words_return
512 st %o5,[%o0+4]
513
514 ld [%o1+8],%o4
515 addcc %g1,-1,%g0
516 ld [%o2+8],%o5
517 subxcc %o4,%o5,%o5
518 st %o5,[%o0+8]
519 retl
520 addx %g0,0,%o0
521
522.type bn_sub_words,#function
523.size bn_sub_words,(.-bn_sub_words)
524
525#define FRAME_SIZE -96
526
527/*
528 * Here is register usage map for *all* routines below.
529 */
530#define t_1 %o0
531#define t_2 %o1
532#define c_1 %o2
533#define c_2 %o3
534#define c_3 %o4
535
536#define ap(I) [%i1+4*I]
537#define bp(I) [%i2+4*I]
538#define rp(I) [%i0+4*I]
539
540#define a_0 %l0
541#define a_1 %l1
542#define a_2 %l2
543#define a_3 %l3
544#define a_4 %l4
545#define a_5 %l5
546#define a_6 %l6
547#define a_7 %l7
548
549#define b_0 %i3
550#define b_1 %i4
551#define b_2 %i5
552#define b_3 %o5
553#define b_4 %g1
554#define b_5 %g2
555#define b_6 %g3
556#define b_7 %g4
557
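The !mul_add_c(...) annotations in the comba routines below name the accumulation step from OpenSSL's bn_lcl.h: add one 64-bit product into the running three-word column. A compact C rendering of a single step, mirroring the umul/addcc/addxcc/addx pattern (the function form is ours):

/* (c1,c2,c3) += a * b, carries rippling upward. */
static void mul_add_c(BN_ULONG a, BN_ULONG b,
                      BN_ULONG *c1, BN_ULONG *c2, BN_ULONG *c3)
{
        uint64_t t = (uint64_t)a * b;              /* umul + rd %y   */
        uint64_t s = (uint64_t)*c1 + (BN_ULONG)t;  /* addcc  c_x,t_1 */
        *c1 = (BN_ULONG)s;
        s = (uint64_t)*c2 + (BN_ULONG)(t >> 32) + (s >> 32); /* addxcc */
        *c2 = (BN_ULONG)s;
        *c3 += (BN_ULONG)(s >> 32);                /* addx           */
}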
558.align 32
559.global bn_mul_comba8
560/*
561 * void bn_mul_comba8(r,a,b)
562 * BN_ULONG *r,*a,*b;
563 */
564bn_mul_comba8:
565 save %sp,FRAME_SIZE,%sp
566 ld ap(0),a_0
567 ld bp(0),b_0
568 umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3);
569 ld bp(1),b_1
570 rd %y,c_2
571 st c_1,rp(0) !r[0]=c1;
572
573 umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1);
574 ld ap(1),a_1
575 addcc c_2,t_1,c_2
576 rd %y,t_2
577 addxcc %g0,t_2,c_3 !=
578 addx %g0,%g0,c_1
579 ld ap(2),a_2
580 umul a_1,b_0,t_1 !mul_add_c(a[1],b[0],c2,c3,c1);
581 addcc c_2,t_1,c_2 !=
582 rd %y,t_2
583 addxcc c_3,t_2,c_3
584 st c_2,rp(1) !r[1]=c2;
585 addx c_1,%g0,c_1 !=
586
587 umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
588 addcc c_3,t_1,c_3
589 rd %y,t_2
590 addxcc c_1,t_2,c_1 !=
591 addx %g0,%g0,c_2
592 ld bp(2),b_2
593 umul a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
594 addcc c_3,t_1,c_3 !=
595 rd %y,t_2
596 addxcc c_1,t_2,c_1
597 ld bp(3),b_3
598 addx c_2,%g0,c_2 !=
599 umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
600 addcc c_3,t_1,c_3
601 rd %y,t_2
602 addxcc c_1,t_2,c_1 !=
603 addx c_2,%g0,c_2
604 st c_3,rp(2) !r[2]=c3;
605
606 umul a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
607 addcc c_1,t_1,c_1 !=
608 rd %y,t_2
609 addxcc c_2,t_2,c_2
610 addx %g0,%g0,c_3
611 umul a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3);
612 addcc c_1,t_1,c_1
613 rd %y,t_2
614 addxcc c_2,t_2,c_2
615 addx c_3,%g0,c_3 !=
616 ld ap(3),a_3
617 umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
618 addcc c_1,t_1,c_1
619 rd %y,t_2 !=
620 addxcc c_2,t_2,c_2
621 addx c_3,%g0,c_3
622 ld ap(4),a_4
623 umul a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!=
624 addcc c_1,t_1,c_1
625 rd %y,t_2
626 addxcc c_2,t_2,c_2
627 addx c_3,%g0,c_3 !=
628 st c_1,rp(3) !r[3]=c1;
629
630 umul a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1);
631 addcc c_2,t_1,c_2
632 rd %y,t_2 !=
633 addxcc c_3,t_2,c_3
634 addx %g0,%g0,c_1
635 umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
636 addcc c_2,t_1,c_2 !=
637 rd %y,t_2
638 addxcc c_3,t_2,c_3
639 addx c_1,%g0,c_1
640 umul a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1);
641 addcc c_2,t_1,c_2
642 rd %y,t_2
643 addxcc c_3,t_2,c_3
644 addx c_1,%g0,c_1 !=
645 ld bp(4),b_4
646 umul a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
647 addcc c_2,t_1,c_2
648 rd %y,t_2 !=
649 addxcc c_3,t_2,c_3
650 addx c_1,%g0,c_1
651 ld bp(5),b_5
652 umul a_0,b_4,t_1 !=!mul_add_c(a[0],b[4],c2,c3,c1);
653 addcc c_2,t_1,c_2
654 rd %y,t_2
655 addxcc c_3,t_2,c_3
656 addx c_1,%g0,c_1 !=
657 st c_2,rp(4) !r[4]=c2;
658
659 umul a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2);
660 addcc c_3,t_1,c_3
661 rd %y,t_2 !=
662 addxcc c_1,t_2,c_1
663 addx %g0,%g0,c_2
664 umul a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2);
665 addcc c_3,t_1,c_3 !=
666 rd %y,t_2
667 addxcc c_1,t_2,c_1
668 addx c_2,%g0,c_2
669 umul a_2,b_3,t_1 !=!mul_add_c(a[2],b[3],c3,c1,c2);
670 addcc c_3,t_1,c_3
671 rd %y,t_2
672 addxcc c_1,t_2,c_1
673 addx c_2,%g0,c_2 !=
674 umul a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
675 addcc c_3,t_1,c_3
676 rd %y,t_2
677 addxcc c_1,t_2,c_1 !=
678 addx c_2,%g0,c_2
679 ld ap(5),a_5
680 umul a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2);
681 addcc c_3,t_1,c_3 !=
682 rd %y,t_2
683 addxcc c_1,t_2,c_1
684 ld ap(6),a_6
685 addx c_2,%g0,c_2 !=
686 umul a_5,b_0,t_1 !mul_add_c(a[5],b[0],c3,c1,c2);
687 addcc c_3,t_1,c_3
688 rd %y,t_2
689 addxcc c_1,t_2,c_1 !=
690 addx c_2,%g0,c_2
691 st c_3,rp(5) !r[5]=c3;
692
693 umul a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3);
694 addcc c_1,t_1,c_1 !=
695 rd %y,t_2
696 addxcc c_2,t_2,c_2
697 addx %g0,%g0,c_3
698 umul a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3);
699 addcc c_1,t_1,c_1
700 rd %y,t_2
701 addxcc c_2,t_2,c_2
702 addx c_3,%g0,c_3 !=
703 umul a_4,b_2,t_1 !mul_add_c(a[4],b[2],c1,c2,c3);
704 addcc c_1,t_1,c_1
705 rd %y,t_2
706 addxcc c_2,t_2,c_2 !=
707 addx c_3,%g0,c_3
708 umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
709 addcc c_1,t_1,c_1
710 rd %y,t_2 !=
711 addxcc c_2,t_2,c_2
712 addx c_3,%g0,c_3
713 umul a_2,b_4,t_1 !mul_add_c(a[2],b[4],c1,c2,c3);
714 addcc c_1,t_1,c_1 !=
715 rd %y,t_2
716 addxcc c_2,t_2,c_2
717 ld bp(6),b_6
718 addx c_3,%g0,c_3 !=
719 umul a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3);
720 addcc c_1,t_1,c_1
721 rd %y,t_2
722 addxcc c_2,t_2,c_2 !=
723 addx c_3,%g0,c_3
724 ld bp(7),b_7
725 umul a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3);
726 addcc c_1,t_1,c_1 !=
727 rd %y,t_2
728 addxcc c_2,t_2,c_2
729 st c_1,rp(6) !r[6]=c1;
730 addx c_3,%g0,c_3 !=
731
732 umul a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1);
733 addcc c_2,t_1,c_2
734 rd %y,t_2
735 addxcc c_3,t_2,c_3 !=
736 addx %g0,%g0,c_1
737 umul a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1);
738 addcc c_2,t_1,c_2
739 rd %y,t_2 !=
740 addxcc c_3,t_2,c_3
741 addx c_1,%g0,c_1
742 umul a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1);
743 addcc c_2,t_1,c_2 !=
744 rd %y,t_2
745 addxcc c_3,t_2,c_3
746 addx c_1,%g0,c_1
747 umul a_3,b_4,t_1 !=!mul_add_c(a[3],b[4],c2,c3,c1);
748 addcc c_2,t_1,c_2
749 rd %y,t_2
750 addxcc c_3,t_2,c_3
751 addx c_1,%g0,c_1 !=
752 umul a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1);
753 addcc c_2,t_1,c_2
754 rd %y,t_2
755 addxcc c_3,t_2,c_3 !=
756 addx c_1,%g0,c_1
757 umul a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1);
758 addcc c_2,t_1,c_2
759 rd %y,t_2 !=
760 addxcc c_3,t_2,c_3
761 addx c_1,%g0,c_1
762 ld ap(7),a_7
763 umul a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1);
764 addcc c_2,t_1,c_2
765 rd %y,t_2
766 addxcc c_3,t_2,c_3
767 addx c_1,%g0,c_1 !=
768 umul a_7,b_0,t_1 !mul_add_c(a[7],b[0],c2,c3,c1);
769 addcc c_2,t_1,c_2
770 rd %y,t_2
771 addxcc c_3,t_2,c_3 !=
772 addx c_1,%g0,c_1
773 st c_2,rp(7) !r[7]=c2;
774
775 umul a_7,b_1,t_1 !mul_add_c(a[7],b[1],c3,c1,c2);
776 addcc c_3,t_1,c_3 !=
777 rd %y,t_2
778 addxcc c_1,t_2,c_1
779 addx %g0,%g0,c_2
780 umul a_6,b_2,t_1 !=!mul_add_c(a[6],b[2],c3,c1,c2);
781 addcc c_3,t_1,c_3
782 rd %y,t_2
783 addxcc c_1,t_2,c_1
784 addx c_2,%g0,c_2 !=
785 umul a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2);
786 addcc c_3,t_1,c_3
787 rd %y,t_2
788 addxcc c_1,t_2,c_1 !=
789 addx c_2,%g0,c_2
790 umul a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2);
791 addcc c_3,t_1,c_3
792 rd %y,t_2 !=
793 addxcc c_1,t_2,c_1
794 addx c_2,%g0,c_2
795 umul a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2);
796 addcc c_3,t_1,c_3 !=
797 rd %y,t_2
798 addxcc c_1,t_2,c_1
799 addx c_2,%g0,c_2
800 umul a_2,b_6,t_1 !=!mul_add_c(a[2],b[6],c3,c1,c2);
801 addcc c_3,t_1,c_3
802 rd %y,t_2
803 addxcc c_1,t_2,c_1
804 addx c_2,%g0,c_2 !=
805 umul a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2);
806 addcc c_3,t_1,c_3
807 rd %y,t_2
808 addxcc c_1,t_2,c_1 !
809 addx c_2,%g0,c_2
810 st c_3,rp(8) !r[8]=c3;
811
812 umul a_2,b_7,t_1 !mul_add_c(a[2],b[7],c1,c2,c3);
813 addcc c_1,t_1,c_1 !=
814 rd %y,t_2
815 addxcc c_2,t_2,c_2
816 addx %g0,%g0,c_3
817 umul a_3,b_6,t_1 !=!mul_add_c(a[3],b[6],c1,c2,c3);
818 addcc c_1,t_1,c_1
819 rd %y,t_2
820 addxcc c_2,t_2,c_2
821 addx c_3,%g0,c_3 !=
822 umul a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3);
823 addcc c_1,t_1,c_1
824 rd %y,t_2
825 addxcc c_2,t_2,c_2 !=
826 addx c_3,%g0,c_3
827 umul a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3);
828 addcc c_1,t_1,c_1
829 rd %y,t_2 !=
830 addxcc c_2,t_2,c_2
831 addx c_3,%g0,c_3
832 umul a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3);
833 addcc c_1,t_1,c_1 !=
834 rd %y,t_2
835 addxcc c_2,t_2,c_2
836 addx c_3,%g0,c_3
837 umul a_7,b_2,t_1 !=!mul_add_c(a[7],b[2],c1,c2,c3);
838 addcc c_1,t_1,c_1
839 rd %y,t_2
840 addxcc c_2,t_2,c_2
841 addx c_3,%g0,c_3 !=
842 st c_1,rp(9) !r[9]=c1;
843
844 umul a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1);
845 addcc c_2,t_1,c_2
846 rd %y,t_2 !=
847 addxcc c_3,t_2,c_3
848 addx %g0,%g0,c_1
849 umul a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1);
850 addcc c_2,t_1,c_2 !=
851 rd %y,t_2
852 addxcc c_3,t_2,c_3
853 addx c_1,%g0,c_1
854 umul a_5,b_5,t_1 !=!mul_add_c(a[5],b[5],c2,c3,c1);
855 addcc c_2,t_1,c_2
856 rd %y,t_2
857 addxcc c_3,t_2,c_3
858 addx c_1,%g0,c_1 !=
859 umul a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1);
860 addcc c_2,t_1,c_2
861 rd %y,t_2
862 addxcc c_3,t_2,c_3 !=
863 addx c_1,%g0,c_1
864 umul a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1);
865 addcc c_2,t_1,c_2
866 rd %y,t_2 !=
867 addxcc c_3,t_2,c_3
868 addx c_1,%g0,c_1
869 st c_2,rp(10) !r[10]=c2;
870
871 umul a_4,b_7,t_1 !=!mul_add_c(a[4],b[7],c3,c1,c2);
872 addcc c_3,t_1,c_3
873 rd %y,t_2
874 addxcc c_1,t_2,c_1
875 addx %g0,%g0,c_2 !=
876 umul a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2);
877 addcc c_3,t_1,c_3
878 rd %y,t_2
879 addxcc c_1,t_2,c_1 !=
880 addx c_2,%g0,c_2
881 umul a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2);
882 addcc c_3,t_1,c_3
883 rd %y,t_2 !=
884 addxcc c_1,t_2,c_1
885 addx c_2,%g0,c_2
886 umul a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2);
887 addcc c_3,t_1,c_3 !=
888 rd %y,t_2
889 addxcc c_1,t_2,c_1
890 st c_3,rp(11) !r[11]=c3;
891 addx c_2,%g0,c_2 !=
892
893 umul a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3);
894 addcc c_1,t_1,c_1
895 rd %y,t_2
896 addxcc c_2,t_2,c_2 !=
897 addx %g0,%g0,c_3
898 umul a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3);
899 addcc c_1,t_1,c_1
900 rd %y,t_2 !=
901 addxcc c_2,t_2,c_2
902 addx c_3,%g0,c_3
903 umul a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3);
904 addcc c_1,t_1,c_1 !=
905 rd %y,t_2
906 addxcc c_2,t_2,c_2
907 st c_1,rp(12) !r[12]=c1;
908 addx c_3,%g0,c_3 !=
909
910 umul a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1);
911 addcc c_2,t_1,c_2
912 rd %y,t_2
913 addxcc c_3,t_2,c_3 !=
914 addx %g0,%g0,c_1
915 umul a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1);
916 addcc c_2,t_1,c_2
917 rd %y,t_2 !=
918 addxcc c_3,t_2,c_3
919 addx c_1,%g0,c_1
920 st c_2,rp(13) !r[13]=c2;
921
922 umul a_7,b_7,t_1 !=!mul_add_c(a[7],b[7],c3,c1,c2);
923 addcc c_3,t_1,c_3
924 rd %y,t_2
925 addxcc c_1,t_2,c_1
926 nop !=
927 st c_3,rp(14) !r[14]=c3;
928 st c_1,rp(15) !r[15]=c1;
929
930 ret
931 restore %g0,%g0,%o0
932
933.type bn_mul_comba8,#function
934.size bn_mul_comba8,(.-bn_mul_comba8)
935
936.align 32
937
938.global bn_mul_comba4
939/*
940 * void bn_mul_comba4(r,a,b)
941 * BN_ULONG *r,*a,*b;
942 */
943bn_mul_comba4:
944 save %sp,FRAME_SIZE,%sp
945 ld ap(0),a_0
946 ld bp(0),b_0
947 umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3);
948 ld bp(1),b_1
949 rd %y,c_2
950 st c_1,rp(0) !r[0]=c1;
951
952 umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1);
953 ld ap(1),a_1
954 addcc c_2,t_1,c_2
955 rd %y,t_2 !=
956 addxcc %g0,t_2,c_3
957 addx %g0,%g0,c_1
958 ld ap(2),a_2
959 umul a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
960 addcc c_2,t_1,c_2
961 rd %y,t_2
962 addxcc c_3,t_2,c_3
963 addx c_1,%g0,c_1 !=
964 st c_2,rp(1) !r[1]=c2;
965
966 umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
967 addcc c_3,t_1,c_3
968 rd %y,t_2 !=
969 addxcc c_1,t_2,c_1
970 addx %g0,%g0,c_2
971 ld bp(2),b_2
972 umul a_1,b_1,t_1 !=!mul_add_c(a[1],b[1],c3,c1,c2);
973 addcc c_3,t_1,c_3
974 rd %y,t_2
975 addxcc c_1,t_2,c_1
976 addx c_2,%g0,c_2 !=
977 ld bp(3),b_3
978 umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
979 addcc c_3,t_1,c_3
980 rd %y,t_2 !=
981 addxcc c_1,t_2,c_1
982 addx c_2,%g0,c_2
983 st c_3,rp(2) !r[2]=c3;
984
985 umul a_0,b_3,t_1 !=!mul_add_c(a[0],b[3],c1,c2,c3);
986 addcc c_1,t_1,c_1
987 rd %y,t_2
988 addxcc c_2,t_2,c_2
989 addx %g0,%g0,c_3 !=
990 umul a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3);
991 addcc c_1,t_1,c_1
992 rd %y,t_2
993 addxcc c_2,t_2,c_2 !=
994 addx c_3,%g0,c_3
995 ld ap(3),a_3
996 umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
997 addcc c_1,t_1,c_1 !=
998 rd %y,t_2
999 addxcc c_2,t_2,c_2
1000 addx c_3,%g0,c_3
1001 umul a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);
1002 addcc c_1,t_1,c_1
1003 rd %y,t_2
1004 addxcc c_2,t_2,c_2
1005 addx c_3,%g0,c_3 !=
1006 st c_1,rp(3) !r[3]=c1;
1007
1008 umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
1009 addcc c_2,t_1,c_2
1010 rd %y,t_2 !=
1011 addxcc c_3,t_2,c_3
1012 addx %g0,%g0,c_1
1013 umul a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1);
1014 addcc c_2,t_1,c_2 !=
1015 rd %y,t_2
1016 addxcc c_3,t_2,c_3
1017 addx c_1,%g0,c_1
1018 umul a_1,b_3,t_1 !=!mul_add_c(a[1],b[3],c2,c3,c1);
1019 addcc c_2,t_1,c_2
1020 rd %y,t_2
1021 addxcc c_3,t_2,c_3
1022 addx c_1,%g0,c_1 !=
1023 st c_2,rp(4) !r[4]=c2;
1024
1025 umul a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
1026 addcc c_3,t_1,c_3
1027 rd %y,t_2 !=
1028 addxcc c_1,t_2,c_1
1029 addx %g0,%g0,c_2
1030 umul a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
1031 addcc c_3,t_1,c_3 !=
1032 rd %y,t_2
1033 addxcc c_1,t_2,c_1
1034 st c_3,rp(5) !r[5]=c3;
1035 addx c_2,%g0,c_2 !=
1036
1037 umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
1038 addcc c_1,t_1,c_1
1039 rd %y,t_2
1040 addxcc c_2,t_2,c_2 !=
1041 st c_1,rp(6) !r[6]=c1;
1042 st c_2,rp(7) !r[7]=c2;
1043
1044 ret
1045 restore %g0,%g0,%o0
1046
1047.type bn_mul_comba4,#function
1048.size bn_mul_comba4,(.-bn_mul_comba4)
1049
1050.align 32
1051
1052.global bn_sqr_comba8
1053bn_sqr_comba8:
1054 save %sp,FRAME_SIZE,%sp
1055 ld ap(0),a_0
1056 ld ap(1),a_1
1057 umul a_0,a_0,c_1 !=!sqr_add_c(a,0,c1,c2,c3);
1058 rd %y,c_2
1059 st c_1,rp(0) !r[0]=c1;
1060
1061 ld ap(2),a_2
1062 umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
1063 addcc c_2,t_1,c_2
1064 rd %y,t_2
1065 addxcc %g0,t_2,c_3
1066 addx %g0,%g0,c_1 !=
1067 addcc c_2,t_1,c_2
1068 addxcc c_3,t_2,c_3
1069 st c_2,rp(1) !r[1]=c2;
1070 addx c_1,%g0,c_1 !=
1071
1072 umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1073 addcc c_3,t_1,c_3
1074 rd %y,t_2
1075 addxcc c_1,t_2,c_1 !=
1076 addx %g0,%g0,c_2
1077 addcc c_3,t_1,c_3
1078 addxcc c_1,t_2,c_1
1079 addx c_2,%g0,c_2 !=
1080 ld ap(3),a_3
1081 umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1082 addcc c_3,t_1,c_3
1083 rd %y,t_2 !=
1084 addxcc c_1,t_2,c_1
1085 addx c_2,%g0,c_2
1086 st c_3,rp(2) !r[2]=c3;
1087
1088 umul a_0,a_3,t_1 !=!sqr_add_c2(a,3,0,c1,c2,c3);
1089 addcc c_1,t_1,c_1
1090 rd %y,t_2
1091 addxcc c_2,t_2,c_2
1092 addx %g0,%g0,c_3 !=
1093 addcc c_1,t_1,c_1
1094 addxcc c_2,t_2,c_2
1095 ld ap(4),a_4
1096 addx c_3,%g0,c_3 !=
1097 umul a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1098 addcc c_1,t_1,c_1
1099 rd %y,t_2
1100 addxcc c_2,t_2,c_2 !=
1101 addx c_3,%g0,c_3
1102 addcc c_1,t_1,c_1
1103 addxcc c_2,t_2,c_2
1104 addx c_3,%g0,c_3 !=
1105 st c_1,rp(3) !r[3]=c1;
1106
1107 umul a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1);
1108 addcc c_2,t_1,c_2
1109 rd %y,t_2 !=
1110 addxcc c_3,t_2,c_3
1111 addx %g0,%g0,c_1
1112 addcc c_2,t_1,c_2
1113 addxcc c_3,t_2,c_3 !=
1114 addx c_1,%g0,c_1
1115 umul a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1116 addcc c_2,t_1,c_2
1117 rd %y,t_2 !=
1118 addxcc c_3,t_2,c_3
1119 addx c_1,%g0,c_1
1120 addcc c_2,t_1,c_2
1121 addxcc c_3,t_2,c_3 !=
1122 addx c_1,%g0,c_1
1123 ld ap(5),a_5
1124 umul a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1125 addcc c_2,t_1,c_2 !=
1126 rd %y,t_2
1127 addxcc c_3,t_2,c_3
1128 st c_2,rp(4) !r[4]=c2;
1129 addx c_1,%g0,c_1 !=
1130
1131 umul a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2);
1132 addcc c_3,t_1,c_3
1133 rd %y,t_2
1134 addxcc c_1,t_2,c_1 !=
1135 addx %g0,%g0,c_2
1136 addcc c_3,t_1,c_3
1137 addxcc c_1,t_2,c_1
1138 addx c_2,%g0,c_2 !=
1139 umul a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2);
1140 addcc c_3,t_1,c_3
1141 rd %y,t_2
1142 addxcc c_1,t_2,c_1 !=
1143 addx c_2,%g0,c_2
1144 addcc c_3,t_1,c_3
1145 addxcc c_1,t_2,c_1
1146 addx c_2,%g0,c_2 !=
1147 ld ap(6),a_6
1148 umul a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
1149 addcc c_3,t_1,c_3
1150 rd %y,t_2 !=
1151 addxcc c_1,t_2,c_1
1152 addx c_2,%g0,c_2
1153 addcc c_3,t_1,c_3
1154 addxcc c_1,t_2,c_1 !=
1155 addx c_2,%g0,c_2
1156 st c_3,rp(5) !r[5]=c3;
1157
1158 umul a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3);
1159 addcc c_1,t_1,c_1 !=
1160 rd %y,t_2
1161 addxcc c_2,t_2,c_2
1162 addx %g0,%g0,c_3
1163 addcc c_1,t_1,c_1 !=
1164 addxcc c_2,t_2,c_2
1165 addx c_3,%g0,c_3
1166 umul a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3);
1167 addcc c_1,t_1,c_1 !=
1168 rd %y,t_2
1169 addxcc c_2,t_2,c_2
1170 addx c_3,%g0,c_3
1171 addcc c_1,t_1,c_1 !=
1172 addxcc c_2,t_2,c_2
1173 addx c_3,%g0,c_3
1174 umul a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3);
1175 addcc c_1,t_1,c_1 !=
1176 rd %y,t_2
1177 addxcc c_2,t_2,c_2
1178 addx c_3,%g0,c_3
1179 addcc c_1,t_1,c_1 !=
1180 addxcc c_2,t_2,c_2
1181 addx c_3,%g0,c_3
1182 ld ap(7),a_7
1183 umul a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3);
1184 addcc c_1,t_1,c_1
1185 rd %y,t_2
1186 addxcc c_2,t_2,c_2
1187 addx c_3,%g0,c_3 !=
1188 st c_1,rp(6) !r[6]=c1;
1189
1190 umul a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1);
1191 addcc c_2,t_1,c_2
1192 rd %y,t_2 !=
1193 addxcc c_3,t_2,c_3
1194 addx %g0,%g0,c_1
1195 addcc c_2,t_1,c_2
1196 addxcc c_3,t_2,c_3 !=
1197 addx c_1,%g0,c_1
1198 umul a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1);
1199 addcc c_2,t_1,c_2
1200 rd %y,t_2 !=
1201 addxcc c_3,t_2,c_3
1202 addx c_1,%g0,c_1
1203 addcc c_2,t_1,c_2
1204 addxcc c_3,t_2,c_3 !=
1205 addx c_1,%g0,c_1
1206 umul a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1);
1207 addcc c_2,t_1,c_2
1208 rd %y,t_2 !=
1209 addxcc c_3,t_2,c_3
1210 addx c_1,%g0,c_1
1211 addcc c_2,t_1,c_2
1212 addxcc c_3,t_2,c_3 !=
1213 addx c_1,%g0,c_1
1214 umul a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1);
1215 addcc c_2,t_1,c_2
1216 rd %y,t_2 !=
1217 addxcc c_3,t_2,c_3
1218 addx c_1,%g0,c_1
1219 addcc c_2,t_1,c_2
1220 addxcc c_3,t_2,c_3 !=
1221 addx c_1,%g0,c_1
1222 st c_2,rp(7) !r[7]=c2;
1223
1224 umul a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2);
1225 addcc c_3,t_1,c_3 !=
1226 rd %y,t_2
1227 addxcc c_1,t_2,c_1
1228 addx %g0,%g0,c_2
1229 addcc c_3,t_1,c_3 !=
1230 addxcc c_1,t_2,c_1
1231 addx c_2,%g0,c_2
1232 umul a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2);
1233 addcc c_3,t_1,c_3 !=
1234 rd %y,t_2
1235 addxcc c_1,t_2,c_1
1236 addx c_2,%g0,c_2
1237 addcc c_3,t_1,c_3 !=
1238 addxcc c_1,t_2,c_1
1239 addx c_2,%g0,c_2
1240 umul a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2);
1241 addcc c_3,t_1,c_3 !=
1242 rd %y,t_2
1243 addxcc c_1,t_2,c_1
1244 addx c_2,%g0,c_2
1245 addcc c_3,t_1,c_3 !=
1246 addxcc c_1,t_2,c_1
1247 addx c_2,%g0,c_2
1248 umul a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2);
1249 addcc c_3,t_1,c_3 !=
1250 rd %y,t_2
1251 addxcc c_1,t_2,c_1
1252 st c_3,rp(8) !r[8]=c3;
1253 addx c_2,%g0,c_2 !=
1254
1255 umul a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3);
1256 addcc c_1,t_1,c_1
1257 rd %y,t_2
1258 addxcc c_2,t_2,c_2 !=
1259 addx %g0,%g0,c_3
1260 addcc c_1,t_1,c_1
1261 addxcc c_2,t_2,c_2
1262 addx c_3,%g0,c_3 !=
1263 umul a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3);
1264 addcc c_1,t_1,c_1
1265 rd %y,t_2
1266 addxcc c_2,t_2,c_2 !=
1267 addx c_3,%g0,c_3
1268 addcc c_1,t_1,c_1
1269 addxcc c_2,t_2,c_2
1270 addx c_3,%g0,c_3 !=
1271 umul a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3);
1272 addcc c_1,t_1,c_1
1273 rd %y,t_2
1274 addxcc c_2,t_2,c_2 !=
1275 addx c_3,%g0,c_3
1276 addcc c_1,t_1,c_1
1277 addxcc c_2,t_2,c_2
1278 addx c_3,%g0,c_3 !=
1279 st c_1,rp(9) !r[9]=c1;
1280
1281 umul a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1);
1282 addcc c_2,t_1,c_2
1283 rd %y,t_2 !=
1284 addxcc c_3,t_2,c_3
1285 addx %g0,%g0,c_1
1286 addcc c_2,t_1,c_2
1287 addxcc c_3,t_2,c_3 !=
1288 addx c_1,%g0,c_1
1289 umul a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1);
1290 addcc c_2,t_1,c_2
1291 rd %y,t_2 !=
1292 addxcc c_3,t_2,c_3
1293 addx c_1,%g0,c_1
1294 addcc c_2,t_1,c_2
1295 addxcc c_3,t_2,c_3 !=
1296 addx c_1,%g0,c_1
1297 umul a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1);
1298 addcc c_2,t_1,c_2
1299 rd %y,t_2 !=
1300 addxcc c_3,t_2,c_3
1301 addx c_1,%g0,c_1
1302 st c_2,rp(10) !r[10]=c2;
1303
1304 umul a_4,a_7,t_1 !=!sqr_add_c2(a,7,4,c3,c1,c2);
1305 addcc c_3,t_1,c_3
1306 rd %y,t_2
1307 addxcc c_1,t_2,c_1
1308 addx %g0,%g0,c_2 !=
1309 addcc c_3,t_1,c_3
1310 addxcc c_1,t_2,c_1
1311 addx c_2,%g0,c_2
1312 umul a_5,a_6,t_1 !=!sqr_add_c2(a,6,5,c3,c1,c2);
1313 addcc c_3,t_1,c_3
1314 rd %y,t_2
1315 addxcc c_1,t_2,c_1
1316 addx c_2,%g0,c_2 !=
1317 addcc c_3,t_1,c_3
1318 addxcc c_1,t_2,c_1
1319 st c_3,rp(11) !r[11]=c3;
1320 addx c_2,%g0,c_2 !=
1321
1322 umul a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3);
1323 addcc c_1,t_1,c_1
1324 rd %y,t_2
1325 addxcc c_2,t_2,c_2 !=
1326 addx %g0,%g0,c_3
1327 addcc c_1,t_1,c_1
1328 addxcc c_2,t_2,c_2
1329 addx c_3,%g0,c_3 !=
1330 umul a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3);
1331 addcc c_1,t_1,c_1
1332 rd %y,t_2
1333 addxcc c_2,t_2,c_2 !=
1334 addx c_3,%g0,c_3
1335 st c_1,rp(12) !r[12]=c1;
1336
1337 umul a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1);
1338 addcc c_2,t_1,c_2 !=
1339 rd %y,t_2
1340 addxcc c_3,t_2,c_3
1341 addx %g0,%g0,c_1
1342 addcc c_2,t_1,c_2 !=
1343 addxcc c_3,t_2,c_3
1344 st c_2,rp(13) !r[13]=c2;
1345 addx c_1,%g0,c_1 !=
1346
1347 umul a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2);
1348 addcc c_3,t_1,c_3
1349 rd %y,t_2
1350 addxcc c_1,t_2,c_1 !=
1351 st c_3,rp(14) !r[14]=c3;
1352 st c_1,rp(15) !r[15]=c1;
1353
1354 ret
1355 restore %g0,%g0,%o0
1356
1357.type bn_sqr_comba8,#function
1358.size bn_sqr_comba8,(.-bn_sqr_comba8)
1359
1360.align 32
1361
1362.global bn_sqr_comba4
1363/*
1364 * void bn_sqr_comba4(r,a)
1365 * BN_ULONG *r,*a;
1366 */
1367bn_sqr_comba4:
1368 save %sp,FRAME_SIZE,%sp
1369 ld ap(0),a_0
1370 umul a_0,a_0,c_1 !sqr_add_c(a,0,c1,c2,c3);
1371 ld ap(1),a_1 !=
1372 rd %y,c_2
1373 st c_1,rp(0) !r[0]=c1;
1374
1375 ld ap(2),a_2
1376 umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
1377 addcc c_2,t_1,c_2
1378 rd %y,t_2
1379 addxcc %g0,t_2,c_3
1380 addx %g0,%g0,c_1 !=
1381 addcc c_2,t_1,c_2
1382 addxcc c_3,t_2,c_3
1383 addx c_1,%g0,c_1 !=
1384 st c_2,rp(1) !r[1]=c2;
1385
1386 umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1387 addcc c_3,t_1,c_3
1388 rd %y,t_2 !=
1389 addxcc c_1,t_2,c_1
1390 addx %g0,%g0,c_2
1391 addcc c_3,t_1,c_3
1392 addxcc c_1,t_2,c_1 !=
1393 addx c_2,%g0,c_2
1394 ld ap(3),a_3
1395 umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1396 addcc c_3,t_1,c_3 !=
1397 rd %y,t_2
1398 addxcc c_1,t_2,c_1
1399 st c_3,rp(2) !r[2]=c3;
1400 addx c_2,%g0,c_2 !=
1401
1402 umul a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
1403 addcc c_1,t_1,c_1
1404 rd %y,t_2
1405 addxcc c_2,t_2,c_2 !=
1406 addx %g0,%g0,c_3
1407 addcc c_1,t_1,c_1
1408 addxcc c_2,t_2,c_2
1409 addx c_3,%g0,c_3 !=
1410 umul a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1411 addcc c_1,t_1,c_1
1412 rd %y,t_2
1413 addxcc c_2,t_2,c_2 !=
1414 addx c_3,%g0,c_3
1415 addcc c_1,t_1,c_1
1416 addxcc c_2,t_2,c_2
1417 addx c_3,%g0,c_3 !=
1418 st c_1,rp(3) !r[3]=c1;
1419
1420 umul a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1421 addcc c_2,t_1,c_2
1422 rd %y,t_2 !=
1423 addxcc c_3,t_2,c_3
1424 addx %g0,%g0,c_1
1425 addcc c_2,t_1,c_2
1426 addxcc c_3,t_2,c_3 !=
1427 addx c_1,%g0,c_1
1428 umul a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1429 addcc c_2,t_1,c_2
1430 rd %y,t_2 !=
1431 addxcc c_3,t_2,c_3
1432 addx c_1,%g0,c_1
1433 st c_2,rp(4) !r[4]=c2;
1434
1435 umul a_2,a_3,t_1 !=!sqr_add_c2(a,3,2,c3,c1,c2);
1436 addcc c_3,t_1,c_3
1437 rd %y,t_2
1438 addxcc c_1,t_2,c_1
1439 addx %g0,%g0,c_2 !=
1440 addcc c_3,t_1,c_3
1441 addxcc c_1,t_2,c_1
1442 st c_3,rp(5) !r[5]=c3;
1443 addx c_2,%g0,c_2 !=
1444
1445 umul a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3);
1446 addcc c_1,t_1,c_1
1447 rd %y,t_2
1448 addxcc c_2,t_2,c_2 !=
1449 st c_1,rp(6) !r[6]=c1;
1450 st c_2,rp(7) !r[7]=c2;
1451
1452 ret
1453 restore %g0,%g0,%o0
1454
1455.type bn_sqr_comba4,#function
1456.size bn_sqr_comba4,(.-bn_sqr_comba4)
1457
1458.align 32
diff --git a/src/lib/libcrypto/bn/asm/sparcv8plus.S b/src/lib/libcrypto/bn/asm/sparcv8plus.S
deleted file mode 100644
index 608dbe1571..0000000000
--- a/src/lib/libcrypto/bn/asm/sparcv8plus.S
+++ /dev/null
@@ -1,1558 +0,0 @@
1.ident "sparcv8plus.s, Version 1.4"
2.ident "SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
3
4/*
5 * ====================================================================
6 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
7 * project.
8 *
9 * Rights for redistribution and usage in source and binary forms are
10 * granted according to the OpenSSL license. Warranty of any kind is
11 * disclaimed.
12 * ====================================================================
13 */
14
15/*
16 * This is my modest contribution to the OpenSSL project (see
17 * http://www.openssl.org/ for more information about it) and is
18 * a drop-in UltraSPARC ISA replacement for the crypto/bn/bn_asm.c
19 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
20 *
21 * Questions-n-answers.
22 *
23 * Q. How to compile?
24 * A. With SC4.x/SC5.x:
25 *
26 * cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
27 *
28 * and with gcc:
29 *
30 * gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
31 *
32 * or if the above fails (it does if you have gas installed):
33 *
34 * gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
35 *
36 * Quick-n-dirty way to fuse the module into the library.
37 * Provided that the library is already configured and built
38 * (in 0.9.2 case with no-asm option):
39 *
40 * # cd crypto/bn
41 * # cp /some/place/bn_asm.sparc.v8plus.S .
42 * # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
43 * # make
44 * # cd ../..
45 * # make; make test
46 *
47 * Quick-n-dirty way to get rid of it:
48 *
49 * # cd crypto/bn
50 * # touch bn_asm.c
51 * # make
52 * # cd ../..
53 * # make; make test
54 *
55 * Q. V8plus architecture? What kind of beast is that?
56 * A. Well, it's rather a programming model than an architecture...
57 * It's actually v9-compliant, i.e. *any* UltraSPARC CPU, under
58 * special conditions, namely when the kernel doesn't preserve the upper
59 * 32 bits of otherwise 64-bit registers during a context switch.
60 *
61 * Q. Why just UltraSPARC? What about SuperSPARC?
62 * A. The original release targeted UltraSPARC only. Now a SuperSPARC
63 * version is provided alongside. Both versions share the bn_*comba[48]
64 * implementations (see the comment later in the code for an explanation).
65 * But what's so special about this UltraSPARC implementation?
66 * Why didn't I let the compiler do the job? The trouble is that most
67 * of the available compilers (well, SC5.0 is the only exception) don't
68 * attempt to take advantage of UltraSPARC's 64-bitness under
69 * 32-bit kernels even though it's perfectly possible (see next
70 * question).
71 *
72 * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
73 * doesn't work?
74 * A. You can't address *all* registers as 64-bit wide:-( The catch is
75 * that you actually may rely upon %o0-%o5 and %g1-%g4 being fully
76 * preserved if you're in a leaf function, i.e. one that never calls
77 * any other functions. All functions in this module are leaf and
78 * 10 registers is a handful. And as a matter of fact the non-"comba"
79 * routines don't even require that much, so I could afford not to
80 * allocate a stack frame of their own for 'em:-)
81 *
82 * Q. What about 64-bit kernels?
83 * A. What about 'em? Just kidding:-) A pure 64-bit version is currently
84 * under evaluation and development...
85 *
86 * Q. What about shared libraries?
87 * A. What about 'em? Kidding again:-) The code does *not* contain any
88 * position dependencies and it's safe to include it in a
89 * shared library as is.
90 *
91 * Q. How much faster does it go?
92 * A. Do you have a good benchmark? In either case, below is what I
93 * experience with the crypto/bn/expspeed.c test program:
94 *
95 * v8plus module on U10/300MHz against bn_asm.c compiled with:
96 *
97 * cc-5.0 -xarch=v8plus -xO5 -xdepend +7-12%
98 * cc-4.2 -xarch=v8plus -xO5 -xdepend +25-35%
99 * egcs-1.1.2 -mcpu=ultrasparc -O3 +35-45%
100 *
101 * v8 module on SS10/60MHz against bn_asm.c compiled with:
102 *
103 * cc-5.0 -xarch=v8 -xO5 -xdepend +7-10%
104 * cc-4.2 -xarch=v8 -xO5 -xdepend +10%
105 * egcs-1.1.2 -mv8 -O3 +35-45%
106 *
107 * As you can see it's damn hard to beat the new Sun C compiler
108 * and it's first and foremost GNU C users who will appreciate this
109 * assembler implementation:-)
110 */
111
112/*
113 * Revision history.
114 *
115 * 1.0 - initial release;
116 * 1.1 - new loop unrolling model(*);
117 * - some more fine tuning;
118 * 1.2 - made gas friendly;
119 * - updates to documentation concerning v9;
120 * - new performance comparison matrix;
121 * 1.3 - fixed problem with /usr/ccs/lib/cpp;
122 * 1.4 - native V9 bn_*_comba[48] implementation (15% more efficient)
123 * resulting in slight overall performance kick;
124 * - some retunes;
125 * - support for GNU as added;
126 *
127 * (*) Originally unrolled loop looked like this:
128 * for (;;) {
129 * op(p+0); if (--n==0) break;
130 * op(p+1); if (--n==0) break;
131 * op(p+2); if (--n==0) break;
132 * op(p+3); if (--n==0) break;
133 * p+=4;
134 * }
135 * I unroll according to the following:
136 * while (n&~3) {
137 * op(p+0); op(p+1); op(p+2); op(p+3);
138 * p+=4; n-=4;
139 * }
140 * if (n) {
141 * op(p+0); if (--n==0) return;
142 * op(p+1); if (--n==0) return;
143 * op(p+2); return;
144 * }
145 */
146
147#if defined(__SUNPRO_C) && defined(__sparcv9)
148 /* They've said -xarch=v9 at command line */
149 .register %g2,#scratch
150 .register %g3,#scratch
151# define FRAME_SIZE -192
152#elif defined(__GNUC__) && defined(__arch64__)
153 /* They've said -m64 at command line */
154 .register %g2,#scratch
155 .register %g3,#scratch
156# define FRAME_SIZE -192
157#else
158# define FRAME_SIZE -96
159#endif
160/*
161 * GNU assembler can't stand stuw:-(
162 */
163#define stuw st
164
165.section ".text",#alloc,#execinstr
166.file "bn_asm.sparc.v8plus.S"
167
168.align 32
169
170.global bn_mul_add_words
171/*
172 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
173 * BN_ULONG *rp,*ap;
174 * int num;
175 * BN_ULONG w;
176 */
177bn_mul_add_words:
178 sra %o2,%g0,%o2 ! signx %o2
179 brgz,a %o2,.L_bn_mul_add_words_proceed
180 lduw [%o1],%g2
181 retl
182 clr %o0
183 nop
184 nop
185 nop
186
187.L_bn_mul_add_words_proceed:
188 srl %o3,%g0,%o3 ! clruw %o3
189 andcc %o2,-4,%g0
190 bz,pn %icc,.L_bn_mul_add_words_tail
191 clr %o5
192
193.L_bn_mul_add_words_loop: ! wow! 32 aligned!
194 lduw [%o0],%g1
195 lduw [%o1+4],%g3
196 mulx %o3,%g2,%g2
197 add %g1,%o5,%o4
198 nop
199 add %o4,%g2,%o4
200 stuw %o4,[%o0]
201 srlx %o4,32,%o5
202
203 lduw [%o0+4],%g1
204 lduw [%o1+8],%g2
205 mulx %o3,%g3,%g3
206 add %g1,%o5,%o4
207 dec 4,%o2
208 add %o4,%g3,%o4
209 stuw %o4,[%o0+4]
210 srlx %o4,32,%o5
211
212 lduw [%o0+8],%g1
213 lduw [%o1+12],%g3
214 mulx %o3,%g2,%g2
215 add %g1,%o5,%o4
216 inc 16,%o1
217 add %o4,%g2,%o4
218 stuw %o4,[%o0+8]
219 srlx %o4,32,%o5
220
221 lduw [%o0+12],%g1
222 mulx %o3,%g3,%g3
223 add %g1,%o5,%o4
224 inc 16,%o0
225 add %o4,%g3,%o4
226 andcc %o2,-4,%g0
227 stuw %o4,[%o0-4]
228 srlx %o4,32,%o5
229 bnz,a,pt %icc,.L_bn_mul_add_words_loop
230 lduw [%o1],%g2
231
232 brnz,a,pn %o2,.L_bn_mul_add_words_tail
233 lduw [%o1],%g2
234.L_bn_mul_add_words_return:
235 retl
236 mov %o5,%o0
237
238.L_bn_mul_add_words_tail:
239 lduw [%o0],%g1
240 mulx %o3,%g2,%g2
241 add %g1,%o5,%o4
242 dec %o2
243 add %o4,%g2,%o4
244 srlx %o4,32,%o5
245 brz,pt %o2,.L_bn_mul_add_words_return
246 stuw %o4,[%o0]
247
248 lduw [%o1+4],%g2
249 lduw [%o0+4],%g1
250 mulx %o3,%g2,%g2
251 add %g1,%o5,%o4
252 dec %o2
253 add %o4,%g2,%o4
254 srlx %o4,32,%o5
255 brz,pt %o2,.L_bn_mul_add_words_return
256 stuw %o4,[%o0+4]
257
258 lduw [%o1+8],%g2
259 lduw [%o0+8],%g1
260 mulx %o3,%g2,%g2
261 add %g1,%o5,%o4
262 add %o4,%g2,%o4
263 stuw %o4,[%o0+8]
264 retl
265 srlx %o4,32,%o0
266
267.type bn_mul_add_words,#function
268.size bn_mul_add_words,(.-bn_mul_add_words)
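Note that, unlike the v8 version, the loop above needs no carry-propagation instructions at all: with 32-bit inputs, a*w + r + carry is at most (2^32-1)^2 + 2*(2^32-1) = 2^64-1, so each update provably fits in a single 64-bit register. One iteration in C (helper name ours):

#include <stdint.h>

/* One step of the loop above; returns the next carry word (< 2^32). */
static uint64_t mul_add_step(uint32_t *rp, uint32_t a,
                             uint32_t w, uint64_t carry)
{
        uint64_t t = (uint64_t)a * w + *rp + carry; /* mulx + two adds */
        *rp = (uint32_t)t;                          /* stuw %o4,[%o0]  */
        return t >> 32;                             /* srlx %o4,32,%o5 */
}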
269
270.align 32
271
272.global bn_mul_words
273/*
274 * BN_ULONG bn_mul_words(rp,ap,num,w)
275 * BN_ULONG *rp,*ap;
276 * int num;
277 * BN_ULONG w;
278 */
279bn_mul_words:
280 sra %o2,%g0,%o2 ! signx %o2
281 brgz,a %o2,.L_bn_mul_words_proceed
282 lduw [%o1],%g2
283 retl
284 clr %o0
285 nop
286 nop
287 nop
288
289.L_bn_mul_words_proceed:
290 srl %o3,%g0,%o3 ! clruw %o3
291 andcc %o2,-4,%g0
292 bz,pn %icc,.L_bn_mul_words_tail
293 clr %o5
294
295.L_bn_mul_words_loop: ! wow! 32 aligned!
296 lduw [%o1+4],%g3
297 mulx %o3,%g2,%g2
298 add %g2,%o5,%o4
299 nop
300 stuw %o4,[%o0]
301 srlx %o4,32,%o5
302
303 lduw [%o1+8],%g2
304 mulx %o3,%g3,%g3
305 add %g3,%o5,%o4
306 dec 4,%o2
307 stuw %o4,[%o0+4]
308 srlx %o4,32,%o5
309
310 lduw [%o1+12],%g3
311 mulx %o3,%g2,%g2
312 add %g2,%o5,%o4
313 inc 16,%o1
314 stuw %o4,[%o0+8]
315 srlx %o4,32,%o5
316
317 mulx %o3,%g3,%g3
318 add %g3,%o5,%o4
319 inc 16,%o0
320 stuw %o4,[%o0-4]
321 srlx %o4,32,%o5
322 andcc %o2,-4,%g0
323 bnz,a,pt %icc,.L_bn_mul_words_loop
324 lduw [%o1],%g2
325 nop
326 nop
327
328 brnz,a,pn %o2,.L_bn_mul_words_tail
329 lduw [%o1],%g2
330.L_bn_mul_words_return:
331 retl
332 mov %o5,%o0
333
334.L_bn_mul_words_tail:
335 mulx %o3,%g2,%g2
336 add %g2,%o5,%o4
337 dec %o2
338 srlx %o4,32,%o5
339 brz,pt %o2,.L_bn_mul_words_return
340 stuw %o4,[%o0]
341
342 lduw [%o1+4],%g2
343 mulx %o3,%g2,%g2
344 add %g2,%o5,%o4
345 dec %o2
346 srlx %o4,32,%o5
347 brz,pt %o2,.L_bn_mul_words_return
348 stuw %o4,[%o0+4]
349
350 lduw [%o1+8],%g2
351 mulx %o3,%g2,%g2
352 add %g2,%o5,%o4
353 stuw %o4,[%o0+8]
354 retl
355 srlx %o4,32,%o0
356
357.type bn_mul_words,#function
358.size bn_mul_words,(.-bn_mul_words)
359
360.align 32
361.global bn_sqr_words
362/*
363 * void bn_sqr_words(r,a,n)
364 * BN_ULONG *r,*a;
365 * int n;
366 */
367bn_sqr_words:
368 sra %o2,%g0,%o2 ! signx %o2
369 brgz,a %o2,.L_bn_sqr_words_proceed
370 lduw [%o1],%g2
371 retl
372 clr %o0
373 nop
374 nop
375 nop
376
377.L_bn_sqr_words_proceed:
378 andcc %o2,-4,%g0
379 nop
380 bz,pn %icc,.L_bn_sqr_words_tail
381 nop
382
383.L_bn_sqr_words_loop: ! wow! 32 aligned!
384 lduw [%o1+4],%g3
385 mulx %g2,%g2,%o4
386 stuw %o4,[%o0]
387 srlx %o4,32,%o5
388 stuw %o5,[%o0+4]
389 nop
390
391 lduw [%o1+8],%g2
392 mulx %g3,%g3,%o4
393 dec 4,%o2
394 stuw %o4,[%o0+8]
395 srlx %o4,32,%o5
396 stuw %o5,[%o0+12]
397
398 lduw [%o1+12],%g3
399 mulx %g2,%g2,%o4
400 srlx %o4,32,%o5
401 stuw %o4,[%o0+16]
402 inc 16,%o1
403 stuw %o5,[%o0+20]
404
405 mulx %g3,%g3,%o4
406 inc 32,%o0
407 stuw %o4,[%o0-8]
408 srlx %o4,32,%o5
409 andcc %o2,-4,%g2
410 stuw %o5,[%o0-4]
411 bnz,a,pt %icc,.L_bn_sqr_words_loop
412 lduw [%o1],%g2
413 nop
414
415 brnz,a,pn %o2,.L_bn_sqr_words_tail
416 lduw [%o1],%g2
417.L_bn_sqr_words_return:
418 retl
419 clr %o0
420
421.L_bn_sqr_words_tail:
422 mulx %g2,%g2,%o4
423 dec %o2
424 stuw %o4,[%o0]
425 srlx %o4,32,%o5
426 brz,pt %o2,.L_bn_sqr_words_return
427 stuw %o5,[%o0+4]
428
429 lduw [%o1+4],%g2
430 mulx %g2,%g2,%o4
431 dec %o2
432 stuw %o4,[%o0+8]
433 srlx %o4,32,%o5
434 brz,pt %o2,.L_bn_sqr_words_return
435 stuw %o5,[%o0+12]
436
437 lduw [%o1+8],%g2
438 mulx %g2,%g2,%o4
439 srlx %o4,32,%o5
440 stuw %o4,[%o0+16]
441 stuw %o5,[%o0+20]
442 retl
443 clr %o0
444
445.type bn_sqr_words,#function
446.size bn_sqr_words,(.-bn_sqr_words)
447
448.align 32
449.global bn_div_words
450/*
451 * BN_ULONG bn_div_words(h,l,d)
452 * BN_ULONG h,l,d;
453 */
454bn_div_words:
455 sllx %o0,32,%o0
456 or %o0,%o1,%o0
457 udivx %o0,%o2,%o0
458 retl
459 srl %o0,%g0,%o0 ! clruw %o0
460
461.type bn_div_words,#function
462.size bn_div_words,(.-bn_div_words)
463
464.align 32
465
466.global bn_add_words
467/*
468 * BN_ULONG bn_add_words(rp,ap,bp,n)
469 * BN_ULONG *rp,*ap,*bp;
470 * int n;
471 */
472bn_add_words:
473 sra %o3,%g0,%o3 ! signx %o3
474 brgz,a %o3,.L_bn_add_words_proceed
475 lduw [%o1],%o4
476 retl
477 clr %o0
478
479.L_bn_add_words_proceed:
480 andcc %o3,-4,%g0
481 bz,pn %icc,.L_bn_add_words_tail
482 addcc %g0,0,%g0 ! clear carry flag
483
484.L_bn_add_words_loop: ! wow! 32 aligned!
485 dec 4,%o3
486 lduw [%o2],%o5
487 lduw [%o1+4],%g1
488 lduw [%o2+4],%g2
489 lduw [%o1+8],%g3
490 lduw [%o2+8],%g4
491 addccc %o5,%o4,%o5
492 stuw %o5,[%o0]
493
494 lduw [%o1+12],%o4
495 lduw [%o2+12],%o5
496 inc 16,%o1
497 addccc %g1,%g2,%g1
498 stuw %g1,[%o0+4]
499
500 inc 16,%o2
501 addccc %g3,%g4,%g3
502 stuw %g3,[%o0+8]
503
504 inc 16,%o0
505 addccc %o5,%o4,%o5
506 stuw %o5,[%o0-4]
507 and %o3,-4,%g1
508 brnz,a,pt %g1,.L_bn_add_words_loop
509 lduw [%o1],%o4
510
511 brnz,a,pn %o3,.L_bn_add_words_tail
512 lduw [%o1],%o4
513.L_bn_add_words_return:
514 clr %o0
515 retl
516 movcs %icc,1,%o0
517 nop
518
519.L_bn_add_words_tail:
520 lduw [%o2],%o5
521 dec %o3
522 addccc %o5,%o4,%o5
523 brz,pt %o3,.L_bn_add_words_return
524 stuw %o5,[%o0]
525
526 lduw [%o1+4],%o4
527 lduw [%o2+4],%o5
528 dec %o3
529 addccc %o5,%o4,%o5
530 brz,pt %o3,.L_bn_add_words_return
531 stuw %o5,[%o0+4]
532
533 lduw [%o1+8],%o4
534 lduw [%o2+8],%o5
535 addccc %o5,%o4,%o5
536 stuw %o5,[%o0+8]
537 clr %o0
538 retl
539 movcs %icc,1,%o0
540
541.type bn_add_words,#function
542.size bn_add_words,(.-bn_add_words)
543
544.global bn_sub_words
545/*
546 * BN_ULONG bn_sub_words(rp,ap,bp,n)
547 * BN_ULONG *rp,*ap,*bp;
548 * int n;
549 */
550bn_sub_words:
551 sra %o3,%g0,%o3 ! signx %o3
552 brgz,a %o3,.L_bn_sub_words_proceed
553 lduw [%o1],%o4
554 retl
555 clr %o0
556
557.L_bn_sub_words_proceed:
558 andcc %o3,-4,%g0
559 bz,pn %icc,.L_bn_sub_words_tail
560 addcc %g0,0,%g0 ! clear carry flag
561
562.L_bn_sub_words_loop: ! wow! 32 aligned!
563 dec 4,%o3
564 lduw [%o2],%o5
565 lduw [%o1+4],%g1
566 lduw [%o2+4],%g2
567 lduw [%o1+8],%g3
568 lduw [%o2+8],%g4
569 subccc %o4,%o5,%o5
570 stuw %o5,[%o0]
571
572 lduw [%o1+12],%o4
573 lduw [%o2+12],%o5
574 inc 16,%o1
575 subccc %g1,%g2,%g2
576 stuw %g2,[%o0+4]
577
578 inc 16,%o2
579 subccc %g3,%g4,%g4
580 stuw %g4,[%o0+8]
581
582 inc 16,%o0
583 subccc %o4,%o5,%o5
584 stuw %o5,[%o0-4]
585 and %o3,-4,%g1
586 brnz,a,pt %g1,.L_bn_sub_words_loop
587 lduw [%o1],%o4
588
589 brnz,a,pn %o3,.L_bn_sub_words_tail
590 lduw [%o1],%o4
591.L_bn_sub_words_return:
592 clr %o0
593 retl
594 movcs %icc,1,%o0
595 nop
596
597.L_bn_sub_words_tail: ! wow! 32 aligned!
598 lduw [%o2],%o5
599 dec %o3
600 subccc %o4,%o5,%o5
601 brz,pt %o3,.L_bn_sub_words_return
602 stuw %o5,[%o0]
603
604 lduw [%o1+4],%o4
605 lduw [%o2+4],%o5
606 dec %o3
607 subccc %o4,%o5,%o5
608 brz,pt %o3,.L_bn_sub_words_return
609 stuw %o5,[%o0+4]
610
611 lduw [%o1+8],%o4
612 lduw [%o2+8],%o5
613 subccc %o4,%o5,%o5
614 stuw %o5,[%o0+8]
615 clr %o0
616 retl
617 movcs %icc,1,%o0
618
619.type bn_sub_words,#function
620.size bn_sub_words,(.-bn_sub_words)
621
622/*
623 * Code below depends on the fact that upper parts of the %l0-%l7
624 * and %i0-%i7 are zeroed by kernel after context switch. In
625 * previous versions this comment stated that "the trouble is that
626 * it's not feasible to implement the mumbo-jumbo in less V9
627 * instructions:-(" which apparently isn't true thanks to the
628 * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
629 * results not from the shorter code, but from the elimination of
630 * multicycle non-pairable 'rd %y,%rd' instructions.
631 *
632 * Andy.
633 */
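Concretely: the comba code below keeps c1 and c2 packed in one 64-bit register (c_12) and, whenever a 64-bit add carries out, adds the preloaded constant t_2 = 2^32 into c_3 in the annulled delay slot. At each column boundary the low word is stored, c_12 is shifted down by 32, and c_3, being a multiple of 2^32 disjoint from the shifted value, is merged in with a plain or. A C sketch of one step and one boundary under those assumptions (function names ours):

#include <stdint.h>

/* c_12 holds c1 + (c2 << 32); c_3 counts 64-bit carry-outs,
 * pre-scaled by 2^32 so it can later be or'ed in directly. */
static void step(uint32_t a, uint32_t b, uint64_t *c_12, uint64_t *c_3)
{
        uint64_t t_1 = (uint64_t)a * b;      /* mulx: full 64-bit product */
        uint64_t sum = *c_12 + t_1;
        if (sum < t_1)                       /* bcs,a %xcc,.+8            */
                *c_3 += (uint64_t)1 << 32;   /* add   c_3,t_2,c_3         */
        *c_12 = sum;
}

/* Column boundary: emit r[n], then fold the carries into c_12. */
static void boundary(uint32_t *r, int n, uint64_t *c_12, uint64_t *c_3)
{
        r[n] = (uint32_t)*c_12;              /* stuw t_1,rp(n)            */
        *c_12 = (*c_12 >> 32) | *c_3;        /* srlx + or c_12,c_3,c_12   */
        *c_3 = 0;                            /* clr  c_3                  */
}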
634
635/*
636 * Here is register usage map for *all* routines below.
637 */
638#define t_1 %o0
639#define t_2 %o1
640#define c_12 %o2
641#define c_3 %o3
642
643#define ap(I) [%i1+4*I]
644#define bp(I) [%i2+4*I]
645#define rp(I) [%i0+4*I]
646
647#define a_0 %l0
648#define a_1 %l1
649#define a_2 %l2
650#define a_3 %l3
651#define a_4 %l4
652#define a_5 %l5
653#define a_6 %l6
654#define a_7 %l7
655
656#define b_0 %i3
657#define b_1 %i4
658#define b_2 %i5
659#define b_3 %o4
660#define b_4 %o5
661#define b_5 %o7
662#define b_6 %g1
663#define b_7 %g4
664
665.align 32
666.global bn_mul_comba8
667/*
668 * void bn_mul_comba8(r,a,b)
669 * BN_ULONG *r,*a,*b;
670 */
671bn_mul_comba8:
672 save %sp,FRAME_SIZE,%sp
673 mov 1,t_2
674 lduw ap(0),a_0
675 sllx t_2,32,t_2
676 lduw bp(0),b_0 !=
677 lduw bp(1),b_1
678 mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3);
679 srlx t_1,32,c_12
680 stuw t_1,rp(0) !=!r[0]=c1;
681
682 lduw ap(1),a_1
683 mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1);
684 addcc c_12,t_1,c_12
685 clr c_3 !=
686 bcs,a %xcc,.+8
687 add c_3,t_2,c_3
688 lduw ap(2),a_2
689 mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
690 addcc c_12,t_1,t_1
691 bcs,a %xcc,.+8
692 add c_3,t_2,c_3
693 srlx t_1,32,c_12 !=
694 stuw t_1,rp(1) !r[1]=c2;
695 or c_12,c_3,c_12
696
697 mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
698 addcc c_12,t_1,c_12 !=
699 clr c_3
700 bcs,a %xcc,.+8
701 add c_3,t_2,c_3
702 lduw bp(2),b_2 !=
703 mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
704 addcc c_12,t_1,c_12
705 bcs,a %xcc,.+8
706 add c_3,t_2,c_3 !=
707 lduw bp(3),b_3
708 mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
709 addcc c_12,t_1,t_1
710 bcs,a %xcc,.+8 !=
711 add c_3,t_2,c_3
712 srlx t_1,32,c_12
713 stuw t_1,rp(2) !r[2]=c3;
714 or c_12,c_3,c_12 !=
715
716 mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
717 addcc c_12,t_1,c_12
718 clr c_3
719 bcs,a %xcc,.+8 !=
720 add c_3,t_2,c_3
721 mulx a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3);
722 addcc c_12,t_1,c_12
723 bcs,a %xcc,.+8 !=
724 add c_3,t_2,c_3
725 lduw ap(3),a_3
726 mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
727 addcc c_12,t_1,c_12 !=
728 bcs,a %xcc,.+8
729 add c_3,t_2,c_3
730 lduw ap(4),a_4
731 mulx a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);!=
732 addcc c_12,t_1,t_1
733 bcs,a %xcc,.+8
734 add c_3,t_2,c_3
735 srlx t_1,32,c_12 !=
736 stuw t_1,rp(3) !r[3]=c1;
737 or c_12,c_3,c_12
738
739 mulx a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1);
740 addcc c_12,t_1,c_12 !=
741 clr c_3
742 bcs,a %xcc,.+8
743 add c_3,t_2,c_3
744 mulx a_3,b_1,t_1 !=!mul_add_c(a[3],b[1],c2,c3,c1);
745 addcc c_12,t_1,c_12
746 bcs,a %xcc,.+8
747 add c_3,t_2,c_3
748 mulx a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1);
749 addcc c_12,t_1,c_12
750 bcs,a %xcc,.+8
751 add c_3,t_2,c_3
752 lduw bp(4),b_4 !=
753 mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
754 addcc c_12,t_1,c_12
755 bcs,a %xcc,.+8
756 add c_3,t_2,c_3 !=
757 lduw bp(5),b_5
758 mulx a_0,b_4,t_1 !mul_add_c(a[0],b[4],c2,c3,c1);
759 addcc c_12,t_1,t_1
760 bcs,a %xcc,.+8 !=
761 add c_3,t_2,c_3
762 srlx t_1,32,c_12
763 stuw t_1,rp(4) !r[4]=c2;
764 or c_12,c_3,c_12 !=
765
766 mulx a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2);
767 addcc c_12,t_1,c_12
768 clr c_3
769 bcs,a %xcc,.+8 !=
770 add c_3,t_2,c_3
771 mulx a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2);
772 addcc c_12,t_1,c_12
773 bcs,a %xcc,.+8 !=
774 add c_3,t_2,c_3
775 mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
776 addcc c_12,t_1,c_12
777 bcs,a %xcc,.+8 !=
778 add c_3,t_2,c_3
779 mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
780 addcc c_12,t_1,c_12
781 bcs,a %xcc,.+8 !=
782 add c_3,t_2,c_3
783 lduw ap(5),a_5
784 mulx a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2);
785 addcc c_12,t_1,c_12 !=
786 bcs,a %xcc,.+8
787 add c_3,t_2,c_3
788 lduw ap(6),a_6
789 mulx a_5,b_0,t_1 !=!mul_add_c(a[5],b[0],c3,c1,c2);
790 addcc c_12,t_1,t_1
791 bcs,a %xcc,.+8
792 add c_3,t_2,c_3
793 srlx t_1,32,c_12 !=
794 stuw t_1,rp(5) !r[5]=c3;
795 or c_12,c_3,c_12
796
797 mulx a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3);
798 addcc c_12,t_1,c_12 !=
799 clr c_3
800 bcs,a %xcc,.+8
801 add c_3,t_2,c_3
802 mulx a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3);
803 addcc c_12,t_1,c_12
804 bcs,a %xcc,.+8
805 add c_3,t_2,c_3
806 mulx a_4,b_2,t_1 !=!mul_add_c(a[4],b[2],c1,c2,c3);
807 addcc c_12,t_1,c_12
808 bcs,a %xcc,.+8
809 add c_3,t_2,c_3
810 mulx a_3,b_3,t_1 !=!mul_add_c(a[3],b[3],c1,c2,c3);
811 addcc c_12,t_1,c_12
812 bcs,a %xcc,.+8
813 add c_3,t_2,c_3
814 mulx a_2,b_4,t_1 !=!mul_add_c(a[2],b[4],c1,c2,c3);
815 addcc c_12,t_1,c_12
816 bcs,a %xcc,.+8
817 add c_3,t_2,c_3
818 lduw bp(6),b_6 !=
819 mulx a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3);
820 addcc c_12,t_1,c_12
821 bcs,a %xcc,.+8
822 add c_3,t_2,c_3 !=
823 lduw bp(7),b_7
824 mulx a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3);
825 addcc c_12,t_1,t_1
826 bcs,a %xcc,.+8 !=
827 add c_3,t_2,c_3
828 srlx t_1,32,c_12
829 stuw t_1,rp(6) !r[6]=c1;
830 or c_12,c_3,c_12 !=
831
832 mulx a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1);
833 addcc c_12,t_1,c_12
834 clr c_3
835 bcs,a %xcc,.+8 !=
836 add c_3,t_2,c_3
837 mulx a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1);
838 addcc c_12,t_1,c_12
839 bcs,a %xcc,.+8 !=
840 add c_3,t_2,c_3
841 mulx a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1);
842 addcc c_12,t_1,c_12
843 bcs,a %xcc,.+8 !=
844 add c_3,t_2,c_3
845 mulx a_3,b_4,t_1 !mul_add_c(a[3],b[4],c2,c3,c1);
846 addcc c_12,t_1,c_12
847 bcs,a %xcc,.+8 !=
848 add c_3,t_2,c_3
849 mulx a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1);
850 addcc c_12,t_1,c_12
851 bcs,a %xcc,.+8 !=
852 add c_3,t_2,c_3
853 mulx a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1);
854 addcc c_12,t_1,c_12
855 bcs,a %xcc,.+8 !=
856 add c_3,t_2,c_3
857 lduw ap(7),a_7
858 mulx a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1);
859 addcc c_12,t_1,c_12
860 bcs,a %xcc,.+8
861 add c_3,t_2,c_3
862 mulx a_7,b_0,t_1 !=!mul_add_c(a[7],b[0],c2,c3,c1);
863 addcc c_12,t_1,t_1
864 bcs,a %xcc,.+8
865 add c_3,t_2,c_3
866 srlx t_1,32,c_12 !=
867 stuw t_1,rp(7) !r[7]=c2;
868 or c_12,c_3,c_12
869
870 mulx a_7,b_1,t_1 !=!mul_add_c(a[7],b[1],c3,c1,c2);
871 addcc c_12,t_1,c_12
872 clr c_3
873 bcs,a %xcc,.+8
874 add c_3,t_2,c_3 !=
875 mulx a_6,b_2,t_1 !mul_add_c(a[6],b[2],c3,c1,c2);
876 addcc c_12,t_1,c_12
877 bcs,a %xcc,.+8
878 add c_3,t_2,c_3 !=
879 mulx a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2);
880 addcc c_12,t_1,c_12
881 bcs,a %xcc,.+8
882 add c_3,t_2,c_3 !=
883 mulx a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2);
884 addcc c_12,t_1,c_12
885 bcs,a %xcc,.+8
886 add c_3,t_2,c_3 !=
887 mulx a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2);
888 addcc c_12,t_1,c_12
889 bcs,a %xcc,.+8
890 add c_3,t_2,c_3 !=
891 mulx a_2,b_6,t_1 !mul_add_c(a[2],b[6],c3,c1,c2);
892 addcc c_12,t_1,c_12
893 bcs,a %xcc,.+8
894 add c_3,t_2,c_3 !=
895 mulx a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2);
896 addcc c_12,t_1,t_1
897 bcs,a %xcc,.+8
898 add c_3,t_2,c_3 !=
899 srlx t_1,32,c_12
900 stuw t_1,rp(8) !r[8]=c3;
901 or c_12,c_3,c_12
902
903 mulx a_2,b_7,t_1 !=!mul_add_c(a[2],b[7],c1,c2,c3);
904 addcc c_12,t_1,c_12
905 clr c_3
906 bcs,a %xcc,.+8
907 add c_3,t_2,c_3 !=
908 mulx a_3,b_6,t_1 !mul_add_c(a[3],b[6],c1,c2,c3);
909 addcc c_12,t_1,c_12
910 bcs,a %xcc,.+8 !=
911 add c_3,t_2,c_3
912 mulx a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3);
913 addcc c_12,t_1,c_12
914 bcs,a %xcc,.+8 !=
915 add c_3,t_2,c_3
916 mulx a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3);
917 addcc c_12,t_1,c_12
918 bcs,a %xcc,.+8 !=
919 add c_3,t_2,c_3
920 mulx a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3);
921 addcc c_12,t_1,c_12
922 bcs,a %xcc,.+8 !=
923 add c_3,t_2,c_3
924 mulx a_7,b_2,t_1 !mul_add_c(a[7],b[2],c1,c2,c3);
925 addcc c_12,t_1,t_1
926 bcs,a %xcc,.+8 !=
927 add c_3,t_2,c_3
928 srlx t_1,32,c_12
929 stuw t_1,rp(9) !r[9]=c1;
930 or c_12,c_3,c_12 !=
931
932 mulx a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1);
933 addcc c_12,t_1,c_12
934 clr c_3
935 bcs,a %xcc,.+8 !=
936 add c_3,t_2,c_3
937 mulx a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1);
938 addcc c_12,t_1,c_12
939 bcs,a %xcc,.+8 !=
940 add c_3,t_2,c_3
941 mulx a_5,b_5,t_1 !mul_add_c(a[5],b[5],c2,c3,c1);
942 addcc c_12,t_1,c_12
943 bcs,a %xcc,.+8 !=
944 add c_3,t_2,c_3
945 mulx a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1);
946 addcc c_12,t_1,c_12
947 bcs,a %xcc,.+8 !=
948 add c_3,t_2,c_3
949 mulx a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1);
950 addcc c_12,t_1,t_1
951 bcs,a %xcc,.+8 !=
952 add c_3,t_2,c_3
953 srlx t_1,32,c_12
954 stuw t_1,rp(10) !r[10]=c2;
955 or c_12,c_3,c_12 !=
956
957 mulx a_4,b_7,t_1 !mul_add_c(a[4],b[7],c3,c1,c2);
958 addcc c_12,t_1,c_12
959 clr c_3
960 bcs,a %xcc,.+8 !=
961 add c_3,t_2,c_3
962 mulx a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2);
963 addcc c_12,t_1,c_12
964 bcs,a %xcc,.+8 !=
965 add c_3,t_2,c_3
966 mulx a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2);
967 addcc c_12,t_1,c_12
968 bcs,a %xcc,.+8 !=
969 add c_3,t_2,c_3
970 mulx a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2);
971 addcc c_12,t_1,t_1
972 bcs,a %xcc,.+8 !=
973 add c_3,t_2,c_3
974 srlx t_1,32,c_12
975 stuw t_1,rp(11) !r[11]=c3;
976 or c_12,c_3,c_12 !=
977
978 mulx a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3);
979 addcc c_12,t_1,c_12
980 clr c_3
981 bcs,a %xcc,.+8 !=
982 add c_3,t_2,c_3
983 mulx a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3);
984 addcc c_12,t_1,c_12
985 bcs,a %xcc,.+8 !=
986 add c_3,t_2,c_3
987 mulx a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3);
988 addcc c_12,t_1,t_1
989 bcs,a %xcc,.+8 !=
990 add c_3,t_2,c_3
991 srlx t_1,32,c_12
992 stuw t_1,rp(12) !r[12]=c1;
993 or c_12,c_3,c_12 !=
994
995 mulx a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1);
996 addcc c_12,t_1,c_12
997 clr c_3
998 bcs,a %xcc,.+8 !=
999 add c_3,t_2,c_3
1000 mulx a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1);
1001 addcc c_12,t_1,t_1
1002 bcs,a %xcc,.+8 !=
1003 add c_3,t_2,c_3
1004 srlx t_1,32,c_12
1005	stuw	t_1,rp(13)	!r[13]=c2;
1006 or c_12,c_3,c_12 !=
1007
1008 mulx a_7,b_7,t_1 !mul_add_c(a[7],b[7],c3,c1,c2);
1009 addcc c_12,t_1,t_1
1010 srlx t_1,32,c_12 !=
1011 stuw t_1,rp(14) !r[14]=c3;
1012 stuw c_12,rp(15) !r[15]=c1;
1013
1014 ret
1015 restore %g0,%g0,%o0 !=
1016
1017.type bn_mul_comba8,#function
1018.size bn_mul_comba8,(.-bn_mul_comba8)
1019
1020.align 32
1021
1022.global bn_mul_comba4
1023/*
1024 * void bn_mul_comba4(r,a,b)
1025 * BN_ULONG *r,*a,*b;
1026 */
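! A note on the carry idiom used throughout these comba routines: t_2 is
! preloaded with 1<<32, every mulx yields a full 64-bit product, and the
! annulled "bcs,a %xcc,.+8; add c_3,t_2,c_3" pair adds 2^32 into c_3 only
! when the 64-bit accumulation in c_12 carries out.  After each column the
! low 32 bits are stored, c_12 is shifted down and or-ed with c_3 to form
! the next column's starting value.  Roughly the semantics of one step (a
! sketch of the portable mul_add_c() from bn_asm.c, not generated code;
! uint32_t/uint64_t from <stdint.h>):
!
!	void mul_add_c(uint32_t a, uint32_t b,
!	    uint32_t *c0, uint32_t *c1, uint32_t *c2)
!	{
!		uint64_t t = (uint64_t)a * b + *c0;	/* cannot overflow */
!		*c0 = (uint32_t)t;
!		t = (t >> 32) + *c1;
!		*c1 = (uint32_t)t;
!		*c2 += (uint32_t)(t >> 32);	/* carry into third word */
!	}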
1027bn_mul_comba4:
1028 save %sp,FRAME_SIZE,%sp
1029 lduw ap(0),a_0
1030 mov 1,t_2
1031 lduw bp(0),b_0
1032 sllx t_2,32,t_2 !=
1033 lduw bp(1),b_1
1034 mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3);
1035 srlx t_1,32,c_12
1036 stuw t_1,rp(0) !=!r[0]=c1;
1037
1038 lduw ap(1),a_1
1039 mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1);
1040 addcc c_12,t_1,c_12
1041 clr c_3 !=
1042 bcs,a %xcc,.+8
1043 add c_3,t_2,c_3
1044 lduw ap(2),a_2
1045 mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
1046 addcc c_12,t_1,t_1
1047 bcs,a %xcc,.+8
1048 add c_3,t_2,c_3
1049 srlx t_1,32,c_12 !=
1050 stuw t_1,rp(1) !r[1]=c2;
1051 or c_12,c_3,c_12
1052
1053 mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
1054 addcc c_12,t_1,c_12 !=
1055 clr c_3
1056 bcs,a %xcc,.+8
1057 add c_3,t_2,c_3
1058 lduw bp(2),b_2 !=
1059 mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
1060 addcc c_12,t_1,c_12
1061 bcs,a %xcc,.+8
1062 add c_3,t_2,c_3 !=
1063 lduw bp(3),b_3
1064 mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
1065 addcc c_12,t_1,t_1
1066 bcs,a %xcc,.+8 !=
1067 add c_3,t_2,c_3
1068 srlx t_1,32,c_12
1069 stuw t_1,rp(2) !r[2]=c3;
1070 or c_12,c_3,c_12 !=
1071
1072 mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
1073 addcc c_12,t_1,c_12
1074 clr c_3
1075 bcs,a %xcc,.+8 !=
1076 add c_3,t_2,c_3
1077 mulx a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3);
1078 addcc c_12,t_1,c_12
1079 bcs,a %xcc,.+8 !=
1080 add c_3,t_2,c_3
1081 lduw ap(3),a_3
1082 mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
1083 addcc c_12,t_1,c_12 !=
1084 bcs,a %xcc,.+8
1085 add c_3,t_2,c_3
1086 mulx a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!=
1087 addcc c_12,t_1,t_1 !=
1088 bcs,a %xcc,.+8
1089 add c_3,t_2,c_3
1090 srlx t_1,32,c_12
1091 stuw t_1,rp(3) !=!r[3]=c1;
1092 or c_12,c_3,c_12
1093
1094 mulx a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
1095 addcc c_12,t_1,c_12
1096 clr c_3 !=
1097 bcs,a %xcc,.+8
1098 add c_3,t_2,c_3
1099 mulx a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1);
1100 addcc c_12,t_1,c_12 !=
1101 bcs,a %xcc,.+8
1102 add c_3,t_2,c_3
1103 mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
1104 addcc c_12,t_1,t_1 !=
1105 bcs,a %xcc,.+8
1106 add c_3,t_2,c_3
1107 srlx t_1,32,c_12
1108 stuw t_1,rp(4) !=!r[4]=c2;
1109 or c_12,c_3,c_12
1110
1111 mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
1112 addcc c_12,t_1,c_12
1113 clr c_3 !=
1114 bcs,a %xcc,.+8
1115 add c_3,t_2,c_3
1116 mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
1117 addcc c_12,t_1,t_1 !=
1118 bcs,a %xcc,.+8
1119 add c_3,t_2,c_3
1120 srlx t_1,32,c_12
1121 stuw t_1,rp(5) !=!r[5]=c3;
1122 or c_12,c_3,c_12
1123
1124 mulx a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
1125 addcc c_12,t_1,t_1
1126 srlx t_1,32,c_12 !=
1127 stuw t_1,rp(6) !r[6]=c1;
1128 stuw c_12,rp(7) !r[7]=c2;
1129
1130 ret
1131 restore %g0,%g0,%o0
1132
1133.type bn_mul_comba4,#function
1134.size bn_mul_comba4,(.-bn_mul_comba4)
1135
1136.align 32
1137
1138.global bn_sqr_comba8
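/*
 * void bn_sqr_comba8(r,a)
 * BN_ULONG *r,*a;
 *
 * Squaring reuses the comba skeleton above.  Each off-diagonal product
 * a[i]*a[j], i!=j, is accumulated twice (hence the doubled addcc in the
 * sqr_add_c2 sequences below), while the diagonal a[i]*a[i] terms are
 * added once.
 */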
1139bn_sqr_comba8:
1140 save %sp,FRAME_SIZE,%sp
1141 mov 1,t_2
1142 lduw ap(0),a_0
1143 sllx t_2,32,t_2
1144 lduw ap(1),a_1
1145 mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3);
1146 srlx t_1,32,c_12
1147 stuw t_1,rp(0) !r[0]=c1;
1148
1149 lduw ap(2),a_2
1150 mulx a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
1151 addcc c_12,t_1,c_12
1152 clr c_3
1153 bcs,a %xcc,.+8
1154 add c_3,t_2,c_3
1155 addcc c_12,t_1,t_1
1156 bcs,a %xcc,.+8
1157 add c_3,t_2,c_3
1158 srlx t_1,32,c_12
1159 stuw t_1,rp(1) !r[1]=c2;
1160 or c_12,c_3,c_12
1161
1162 mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1163 addcc c_12,t_1,c_12
1164 clr c_3
1165 bcs,a %xcc,.+8
1166 add c_3,t_2,c_3
1167 addcc c_12,t_1,c_12
1168 bcs,a %xcc,.+8
1169 add c_3,t_2,c_3
1170 lduw ap(3),a_3
1171 mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1172 addcc c_12,t_1,t_1
1173 bcs,a %xcc,.+8
1174 add c_3,t_2,c_3
1175 srlx t_1,32,c_12
1176 stuw t_1,rp(2) !r[2]=c3;
1177 or c_12,c_3,c_12
1178
1179 mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
1180 addcc c_12,t_1,c_12
1181 clr c_3
1182 bcs,a %xcc,.+8
1183 add c_3,t_2,c_3
1184 addcc c_12,t_1,c_12
1185 bcs,a %xcc,.+8
1186 add c_3,t_2,c_3
1187 lduw ap(4),a_4
1188 mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1189 addcc c_12,t_1,c_12
1190 bcs,a %xcc,.+8
1191 add c_3,t_2,c_3
1192 addcc c_12,t_1,t_1
1193 bcs,a %xcc,.+8
1194 add c_3,t_2,c_3
1195 srlx t_1,32,c_12
1196	stuw	t_1,rp(3)	!r[3]=c1;
1197 or c_12,c_3,c_12
1198
1199 mulx a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1);
1200 addcc c_12,t_1,c_12
1201 clr c_3
1202 bcs,a %xcc,.+8
1203 add c_3,t_2,c_3
1204 addcc c_12,t_1,c_12
1205 bcs,a %xcc,.+8
1206 add c_3,t_2,c_3
1207 mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1208 addcc c_12,t_1,c_12
1209 bcs,a %xcc,.+8
1210 add c_3,t_2,c_3
1211 addcc c_12,t_1,c_12
1212 bcs,a %xcc,.+8
1213 add c_3,t_2,c_3
1214 lduw ap(5),a_5
1215 mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1216 addcc c_12,t_1,t_1
1217 bcs,a %xcc,.+8
1218 add c_3,t_2,c_3
1219 srlx t_1,32,c_12
1220 stuw t_1,rp(4) !r[4]=c2;
1221 or c_12,c_3,c_12
1222
1223 mulx a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2);
1224 addcc c_12,t_1,c_12
1225 clr c_3
1226 bcs,a %xcc,.+8
1227 add c_3,t_2,c_3
1228 addcc c_12,t_1,c_12
1229 bcs,a %xcc,.+8
1230 add c_3,t_2,c_3
1231 mulx a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2);
1232 addcc c_12,t_1,c_12
1233 bcs,a %xcc,.+8
1234 add c_3,t_2,c_3
1235 addcc c_12,t_1,c_12
1236 bcs,a %xcc,.+8
1237 add c_3,t_2,c_3
1238 lduw ap(6),a_6
1239 mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
1240 addcc c_12,t_1,c_12
1241 bcs,a %xcc,.+8
1242 add c_3,t_2,c_3
1243 addcc c_12,t_1,t_1
1244 bcs,a %xcc,.+8
1245 add c_3,t_2,c_3
1246 srlx t_1,32,c_12
1247 stuw t_1,rp(5) !r[5]=c3;
1248 or c_12,c_3,c_12
1249
1250 mulx a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3);
1251 addcc c_12,t_1,c_12
1252 clr c_3
1253 bcs,a %xcc,.+8
1254 add c_3,t_2,c_3
1255 addcc c_12,t_1,c_12
1256 bcs,a %xcc,.+8
1257 add c_3,t_2,c_3
1258 mulx a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3);
1259 addcc c_12,t_1,c_12
1260 bcs,a %xcc,.+8
1261 add c_3,t_2,c_3
1262 addcc c_12,t_1,c_12
1263 bcs,a %xcc,.+8
1264 add c_3,t_2,c_3
1265 mulx a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3);
1266 addcc c_12,t_1,c_12
1267 bcs,a %xcc,.+8
1268 add c_3,t_2,c_3
1269 addcc c_12,t_1,c_12
1270 bcs,a %xcc,.+8
1271 add c_3,t_2,c_3
1272 lduw ap(7),a_7
1273 mulx a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3);
1274 addcc c_12,t_1,t_1
1275 bcs,a %xcc,.+8
1276 add c_3,t_2,c_3
1277 srlx t_1,32,c_12
1278 stuw t_1,rp(6) !r[6]=c1;
1279 or c_12,c_3,c_12
1280
1281 mulx a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1);
1282 addcc c_12,t_1,c_12
1283 clr c_3
1284 bcs,a %xcc,.+8
1285 add c_3,t_2,c_3
1286 addcc c_12,t_1,c_12
1287 bcs,a %xcc,.+8
1288 add c_3,t_2,c_3
1289 mulx a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1);
1290 addcc c_12,t_1,c_12
1291 bcs,a %xcc,.+8
1292 add c_3,t_2,c_3
1293 addcc c_12,t_1,c_12
1294 bcs,a %xcc,.+8
1295 add c_3,t_2,c_3
1296 mulx a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1);
1297 addcc c_12,t_1,c_12
1298 bcs,a %xcc,.+8
1299 add c_3,t_2,c_3
1300 addcc c_12,t_1,c_12
1301 bcs,a %xcc,.+8
1302 add c_3,t_2,c_3
1303 mulx a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1);
1304 addcc c_12,t_1,c_12
1305 bcs,a %xcc,.+8
1306 add c_3,t_2,c_3
1307 addcc c_12,t_1,t_1
1308 bcs,a %xcc,.+8
1309 add c_3,t_2,c_3
1310 srlx t_1,32,c_12
1311 stuw t_1,rp(7) !r[7]=c2;
1312 or c_12,c_3,c_12
1313
1314 mulx a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2);
1315 addcc c_12,t_1,c_12
1316 clr c_3
1317 bcs,a %xcc,.+8
1318 add c_3,t_2,c_3
1319 addcc c_12,t_1,c_12
1320 bcs,a %xcc,.+8
1321 add c_3,t_2,c_3
1322 mulx a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2);
1323 addcc c_12,t_1,c_12
1324 bcs,a %xcc,.+8
1325 add c_3,t_2,c_3
1326 addcc c_12,t_1,c_12
1327 bcs,a %xcc,.+8
1328 add c_3,t_2,c_3
1329 mulx a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2);
1330 addcc c_12,t_1,c_12
1331 bcs,a %xcc,.+8
1332 add c_3,t_2,c_3
1333 addcc c_12,t_1,c_12
1334 bcs,a %xcc,.+8
1335 add c_3,t_2,c_3
1336 mulx a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2);
1337 addcc c_12,t_1,t_1
1338 bcs,a %xcc,.+8
1339 add c_3,t_2,c_3
1340 srlx t_1,32,c_12
1341 stuw t_1,rp(8) !r[8]=c3;
1342 or c_12,c_3,c_12
1343
1344 mulx a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3);
1345 addcc c_12,t_1,c_12
1346 clr c_3
1347 bcs,a %xcc,.+8
1348 add c_3,t_2,c_3
1349 addcc c_12,t_1,c_12
1350 bcs,a %xcc,.+8
1351 add c_3,t_2,c_3
1352 mulx a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3);
1353 addcc c_12,t_1,c_12
1354 bcs,a %xcc,.+8
1355 add c_3,t_2,c_3
1356 addcc c_12,t_1,c_12
1357 bcs,a %xcc,.+8
1358 add c_3,t_2,c_3
1359 mulx a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3);
1360 addcc c_12,t_1,c_12
1361 bcs,a %xcc,.+8
1362 add c_3,t_2,c_3
1363 addcc c_12,t_1,t_1
1364 bcs,a %xcc,.+8
1365 add c_3,t_2,c_3
1366 srlx t_1,32,c_12
1367 stuw t_1,rp(9) !r[9]=c1;
1368 or c_12,c_3,c_12
1369
1370 mulx a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1);
1371 addcc c_12,t_1,c_12
1372 clr c_3
1373 bcs,a %xcc,.+8
1374 add c_3,t_2,c_3
1375 addcc c_12,t_1,c_12
1376 bcs,a %xcc,.+8
1377 add c_3,t_2,c_3
1378 mulx a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1);
1379 addcc c_12,t_1,c_12
1380 bcs,a %xcc,.+8
1381 add c_3,t_2,c_3
1382 addcc c_12,t_1,c_12
1383 bcs,a %xcc,.+8
1384 add c_3,t_2,c_3
1385 mulx a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1);
1386 addcc c_12,t_1,t_1
1387 bcs,a %xcc,.+8
1388 add c_3,t_2,c_3
1389 srlx t_1,32,c_12
1390 stuw t_1,rp(10) !r[10]=c2;
1391 or c_12,c_3,c_12
1392
1393 mulx a_4,a_7,t_1 !sqr_add_c2(a,7,4,c3,c1,c2);
1394 addcc c_12,t_1,c_12
1395 clr c_3
1396 bcs,a %xcc,.+8
1397 add c_3,t_2,c_3
1398 addcc c_12,t_1,c_12
1399 bcs,a %xcc,.+8
1400 add c_3,t_2,c_3
1401 mulx a_5,a_6,t_1 !sqr_add_c2(a,6,5,c3,c1,c2);
1402 addcc c_12,t_1,c_12
1403 bcs,a %xcc,.+8
1404 add c_3,t_2,c_3
1405 addcc c_12,t_1,t_1
1406 bcs,a %xcc,.+8
1407 add c_3,t_2,c_3
1408 srlx t_1,32,c_12
1409 stuw t_1,rp(11) !r[11]=c3;
1410 or c_12,c_3,c_12
1411
1412 mulx a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3);
1413 addcc c_12,t_1,c_12
1414 clr c_3
1415 bcs,a %xcc,.+8
1416 add c_3,t_2,c_3
1417 addcc c_12,t_1,c_12
1418 bcs,a %xcc,.+8
1419 add c_3,t_2,c_3
1420 mulx a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3);
1421 addcc c_12,t_1,t_1
1422 bcs,a %xcc,.+8
1423 add c_3,t_2,c_3
1424 srlx t_1,32,c_12
1425 stuw t_1,rp(12) !r[12]=c1;
1426 or c_12,c_3,c_12
1427
1428 mulx a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1);
1429 addcc c_12,t_1,c_12
1430 clr c_3
1431 bcs,a %xcc,.+8
1432 add c_3,t_2,c_3
1433 addcc c_12,t_1,t_1
1434 bcs,a %xcc,.+8
1435 add c_3,t_2,c_3
1436 srlx t_1,32,c_12
1437 stuw t_1,rp(13) !r[13]=c2;
1438 or c_12,c_3,c_12
1439
1440 mulx a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2);
1441 addcc c_12,t_1,t_1
1442 srlx t_1,32,c_12
1443 stuw t_1,rp(14) !r[14]=c3;
1444 stuw c_12,rp(15) !r[15]=c1;
1445
1446 ret
1447 restore %g0,%g0,%o0
1448
1449.type bn_sqr_comba8,#function
1450.size bn_sqr_comba8,(.-bn_sqr_comba8)
1451
1452.align 32
1453
1454.global bn_sqr_comba4
1455/*
1456 * void bn_sqr_comba4(r,a)
1457 * BN_ULONG *r,*a;
1458 */
1459bn_sqr_comba4:
1460 save %sp,FRAME_SIZE,%sp
1461 mov 1,t_2
1462 lduw ap(0),a_0
1463 sllx t_2,32,t_2
1464 lduw ap(1),a_1
1465 mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3);
1466 srlx t_1,32,c_12
1467 stuw t_1,rp(0) !r[0]=c1;
1468
1469 lduw ap(2),a_2
1470 mulx a_0,a_1,t_1 !sqr_add_c2(a,1,0,c2,c3,c1);
1471 addcc c_12,t_1,c_12
1472 clr c_3
1473 bcs,a %xcc,.+8
1474 add c_3,t_2,c_3
1475 addcc c_12,t_1,t_1
1476 bcs,a %xcc,.+8
1477 add c_3,t_2,c_3
1478 srlx t_1,32,c_12
1479 stuw t_1,rp(1) !r[1]=c2;
1480 or c_12,c_3,c_12
1481
1482 mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1483 addcc c_12,t_1,c_12
1484 clr c_3
1485 bcs,a %xcc,.+8
1486 add c_3,t_2,c_3
1487 addcc c_12,t_1,c_12
1488 bcs,a %xcc,.+8
1489 add c_3,t_2,c_3
1490 lduw ap(3),a_3
1491 mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1492 addcc c_12,t_1,t_1
1493 bcs,a %xcc,.+8
1494 add c_3,t_2,c_3
1495 srlx t_1,32,c_12
1496 stuw t_1,rp(2) !r[2]=c3;
1497 or c_12,c_3,c_12
1498
1499 mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
1500 addcc c_12,t_1,c_12
1501 clr c_3
1502 bcs,a %xcc,.+8
1503 add c_3,t_2,c_3
1504 addcc c_12,t_1,c_12
1505 bcs,a %xcc,.+8
1506 add c_3,t_2,c_3
1507 mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1508 addcc c_12,t_1,c_12
1509 bcs,a %xcc,.+8
1510 add c_3,t_2,c_3
1511 addcc c_12,t_1,t_1
1512 bcs,a %xcc,.+8
1513 add c_3,t_2,c_3
1514 srlx t_1,32,c_12
1515 stuw t_1,rp(3) !r[3]=c1;
1516 or c_12,c_3,c_12
1517
1518 mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1519 addcc c_12,t_1,c_12
1520 clr c_3
1521 bcs,a %xcc,.+8
1522 add c_3,t_2,c_3
1523 addcc c_12,t_1,c_12
1524 bcs,a %xcc,.+8
1525 add c_3,t_2,c_3
1526 mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1527 addcc c_12,t_1,t_1
1528 bcs,a %xcc,.+8
1529 add c_3,t_2,c_3
1530 srlx t_1,32,c_12
1531 stuw t_1,rp(4) !r[4]=c2;
1532 or c_12,c_3,c_12
1533
1534 mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
1535 addcc c_12,t_1,c_12
1536 clr c_3
1537 bcs,a %xcc,.+8
1538 add c_3,t_2,c_3
1539 addcc c_12,t_1,t_1
1540 bcs,a %xcc,.+8
1541 add c_3,t_2,c_3
1542 srlx t_1,32,c_12
1543 stuw t_1,rp(5) !r[5]=c3;
1544 or c_12,c_3,c_12
1545
1546 mulx a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3);
1547 addcc c_12,t_1,t_1
1548 srlx t_1,32,c_12
1549 stuw t_1,rp(6) !r[6]=c1;
1550 stuw c_12,rp(7) !r[7]=c2;
1551
1552 ret
1553 restore %g0,%g0,%o0
1554
1555.type bn_sqr_comba4,#function
1556.size bn_sqr_comba4,(.-bn_sqr_comba4)
1557
1558.align 32
diff --git a/src/lib/libcrypto/bn/asm/sparcv9-mont.pl b/src/lib/libcrypto/bn/asm/sparcv9-mont.pl
deleted file mode 100644
index b8fb1e8a25..0000000000
--- a/src/lib/libcrypto/bn/asm/sparcv9-mont.pl
+++ /dev/null
@@ -1,606 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# December 2005
11#
12# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
13# for undertaking this effort are multiple. First of all, UltraSPARC is
14# not the whole SPARCv9 universe, and other VIS-free implementations
15# deserve optimized code just as much. Secondly, the newly introduced
16# UltraSPARC T1, a.k.a. Niagara, has a shared FPU, so concurrent
17# FPU-intensive paths, such as sparcv9a-mont, would simply sink it. Yes,
18# T1 is equipped with several integrated RSA/DSA accelerator circuits
19# accessible through a kernel driver [only(*)], but having a decent
20# user-land software implementation is important too. Finally, there was
21# the desire to experiment with a dedicated squaring procedure. Yes,
22# this module implements one, because it was easiest to draft in
23# SPARCv9 instructions...
24
25# (*)	An engine accessing the driver in question is on my TODO list.
26#	For reference, the accelerator is estimated to give a 6 to 10 times
27#	improvement on single-threaded RSA sign. It should be noted
28#	that a 6-10x improvement coefficient does not actually mean
29#	something extraordinary in terms of absolute [single-threaded]
30#	performance, as the SPARCv9 instruction set is by all means the
31#	least suitable for high-performance crypto among 64-bit
32#	platforms. A 6-10x factor simply places T1 in the same performance
33#	domain as, say, AMD64 and IA-64. Improvement of RSA verify doesn't
34#	appear impressive at all, but it's the sign operation which is
35#	far more critical/interesting.
36
37# You might notice that the inner loops are modulo-scheduled:-) This has
38# essentially negligible impact on UltraSPARC performance; it's the
39# Fujitsu SPARC64 V users who should notice and hopefully appreciate
40# the advantage... Currently this module surpasses sparcv9a-mont.pl
41# by ~20% on UltraSPARC-III and later cores, but recall that the sparcv9a
42# module still has hidden potential [see the TODO list there], which is
43# estimated to be larger than 20%...
44
45# int bn_mul_mont(
46$rp="%i0"; # BN_ULONG *rp,
47$ap="%i1"; # const BN_ULONG *ap,
48$bp="%i2"; # const BN_ULONG *bp,
49$np="%i3"; # const BN_ULONG *np,
50$n0="%i4"; # const BN_ULONG *n0,
51$num="%i5"; # int num);
52
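# For orientation: .L1st/.Louter/.Linner below implement the usual
# word-serial Montgomery loop, with the ap[]*bp[i] and np[]*m passes
# fused.  A rough C sketch of one outer iteration follows (an
# illustrative reference with hypothetical names, not code this script
# emits; t[] is assumed zeroed before the first iteration, and the final
# conditional subtraction, .Lsub/.Lcopy, is left out):
#
#	#include <stdint.h>
#
#	static void mont_outer(uint32_t *t, const uint32_t *ap, uint32_t bi,
#	    const uint32_t *np, uint32_t n0, int num, uint32_t *carry)
#	{
#		uint64_t car0 = 0, car1 = 0;
#		uint32_t m = (uint32_t)((t[0] + (uint64_t)ap[0] * bi) * n0);
#		int j;
#
#		for (j = 0; j < num; j++) {
#			car0 += (uint64_t)ap[j] * bi + t[j];
#			car1 += (uint64_t)np[j] * m + (uint32_t)car0;
#			if (j > 0)
#				t[j - 1] = (uint32_t)car1; /* one-word shift */
#			car0 >>= 32;
#			car1 >>= 32;
#		}
#		car0 += car1 + *carry;		/* fold both carry chains */
#		t[num - 1] = (uint32_t)car0;
#		*carry = (uint32_t)(car0 >> 32);
#	}
#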
53$bits=32;
54for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
55if ($bits==64) { $bias=2047; $frame=192; }
56else { $bias=0; $frame=128; }
57
58$car0="%o0";
59$car1="%o1";
60$car2="%o2"; # 1 bit
61$acc0="%o3";
62$acc1="%o4";
63$mask="%g1"; # 32 bits, what a waste...
64$tmp0="%g4";
65$tmp1="%g5";
66
67$i="%l0";
68$j="%l1";
69$mul0="%l2";
70$mul1="%l3";
71$tp="%l4";
72$apj="%l5";
73$npj="%l6";
74$tpj="%l7";
75
76$fname="bn_mul_mont_int";
77
78$code=<<___;
79.section ".text",#alloc,#execinstr
80
81.global $fname
82.align 32
83$fname:
84 cmp %o5,4 ! 128 bits minimum
85 bge,pt %icc,.Lenter
86 sethi %hi(0xffffffff),$mask
87 retl
88 clr %o0
89.align 32
90.Lenter:
91 save %sp,-$frame,%sp
92 sll $num,2,$num ! num*=4
93 or $mask,%lo(0xffffffff),$mask
94 ld [$n0],$n0
95 cmp $ap,$bp
96 and $num,$mask,$num
97 ld [$bp],$mul0 ! bp[0]
98 nop
99
100 add %sp,$bias,%o7 ! real top of stack
101 ld [$ap],$car0 ! ap[0] ! redundant in squaring context
102 sub %o7,$num,%o7
103 ld [$ap+4],$apj ! ap[1]
104 and %o7,-1024,%o7
105 ld [$np],$car1 ! np[0]
106 sub %o7,$bias,%sp ! alloca
107 ld [$np+4],$npj ! np[1]
108 be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
109 mov 12,$j
110
111 mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
112 mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
113 and $car0,$mask,$acc0
114 add %sp,$bias+$frame,$tp
115 ld [$ap+8],$apj !prologue!
116
117 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
118 and $mul1,$mask,$mul1
119
120 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
121 mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
122 srlx $car0,32,$car0
123 add $acc0,$car1,$car1
124 ld [$np+8],$npj !prologue!
125 srlx $car1,32,$car1
126 mov $tmp0,$acc0 !prologue!
127
128.L1st:
129 mulx $apj,$mul0,$tmp0
130 mulx $npj,$mul1,$tmp1
131 add $acc0,$car0,$car0
132 ld [$ap+$j],$apj ! ap[j]
133 and $car0,$mask,$acc0
134 add $acc1,$car1,$car1
135 ld [$np+$j],$npj ! np[j]
136 srlx $car0,32,$car0
137 add $acc0,$car1,$car1
138 add $j,4,$j ! j++
139 mov $tmp0,$acc0
140 st $car1,[$tp]
141 cmp $j,$num
142 mov $tmp1,$acc1
143 srlx $car1,32,$car1
144 bl %icc,.L1st
145 add $tp,4,$tp ! tp++
146!.L1st
147
148 mulx $apj,$mul0,$tmp0 !epilogue!
149 mulx $npj,$mul1,$tmp1
150 add $acc0,$car0,$car0
151 and $car0,$mask,$acc0
152 add $acc1,$car1,$car1
153 srlx $car0,32,$car0
154 add $acc0,$car1,$car1
155 st $car1,[$tp]
156 srlx $car1,32,$car1
157
158 add $tmp0,$car0,$car0
159 and $car0,$mask,$acc0
160 add $tmp1,$car1,$car1
161 srlx $car0,32,$car0
162 add $acc0,$car1,$car1
163 st $car1,[$tp+4]
164 srlx $car1,32,$car1
165
166 add $car0,$car1,$car1
167 st $car1,[$tp+8]
168 srlx $car1,32,$car2
169
170 mov 4,$i ! i++
171 ld [$bp+4],$mul0 ! bp[1]
172.Louter:
173 add %sp,$bias+$frame,$tp
174 ld [$ap],$car0 ! ap[0]
175 ld [$ap+4],$apj ! ap[1]
176 ld [$np],$car1 ! np[0]
177 ld [$np+4],$npj ! np[1]
178 ld [$tp],$tmp1 ! tp[0]
179 ld [$tp+4],$tpj ! tp[1]
180 mov 12,$j
181
182 mulx $car0,$mul0,$car0
183 mulx $apj,$mul0,$tmp0 !prologue!
184 add $tmp1,$car0,$car0
185 ld [$ap+8],$apj !prologue!
186 and $car0,$mask,$acc0
187
188 mulx $n0,$acc0,$mul1
189 and $mul1,$mask,$mul1
190
191 mulx $car1,$mul1,$car1
192 mulx $npj,$mul1,$acc1 !prologue!
193 srlx $car0,32,$car0
194 add $acc0,$car1,$car1
195 ld [$np+8],$npj !prologue!
196 srlx $car1,32,$car1
197 mov $tmp0,$acc0 !prologue!
198
199.Linner:
200 mulx $apj,$mul0,$tmp0
201 mulx $npj,$mul1,$tmp1
202 add $tpj,$car0,$car0
203 ld [$ap+$j],$apj ! ap[j]
204 add $acc0,$car0,$car0
205 add $acc1,$car1,$car1
206 ld [$np+$j],$npj ! np[j]
207 and $car0,$mask,$acc0
208 ld [$tp+8],$tpj ! tp[j]
209 srlx $car0,32,$car0
210 add $acc0,$car1,$car1
211 add $j,4,$j ! j++
212 mov $tmp0,$acc0
213 st $car1,[$tp] ! tp[j-1]
214 srlx $car1,32,$car1
215 mov $tmp1,$acc1
216 cmp $j,$num
217 bl %icc,.Linner
218 add $tp,4,$tp ! tp++
219!.Linner
220
221 mulx $apj,$mul0,$tmp0 !epilogue!
222 mulx $npj,$mul1,$tmp1
223 add $tpj,$car0,$car0
224 add $acc0,$car0,$car0
225 ld [$tp+8],$tpj ! tp[j]
226 and $car0,$mask,$acc0
227 add $acc1,$car1,$car1
228 srlx $car0,32,$car0
229 add $acc0,$car1,$car1
230 st $car1,[$tp] ! tp[j-1]
231 srlx $car1,32,$car1
232
233 add $tpj,$car0,$car0
234 add $tmp0,$car0,$car0
235 and $car0,$mask,$acc0
236 add $tmp1,$car1,$car1
237 add $acc0,$car1,$car1
238 st $car1,[$tp+4] ! tp[j-1]
239 srlx $car0,32,$car0
240 add $i,4,$i ! i++
241 srlx $car1,32,$car1
242
243 add $car0,$car1,$car1
244 cmp $i,$num
245 add $car2,$car1,$car1
246 st $car1,[$tp+8]
247
248 srlx $car1,32,$car2
249 bl,a %icc,.Louter
250 ld [$bp+$i],$mul0 ! bp[i]
251!.Louter
252
253 add $tp,12,$tp
254
255.Ltail:
256 add $np,$num,$np
257 add $rp,$num,$rp
258 mov $tp,$ap
259 sub %g0,$num,%o7 ! k=-num
260 ba .Lsub
261 subcc %g0,%g0,%g0 ! clear %icc.c
262.align 16
263.Lsub:
264 ld [$tp+%o7],%o0
265 ld [$np+%o7],%o1
266 subccc %o0,%o1,%o1 ! tp[j]-np[j]
267 add $rp,%o7,$i
268 add %o7,4,%o7
269 brnz %o7,.Lsub
270 st %o1,[$i]
271 subc $car2,0,$car2 ! handle upmost overflow bit
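	! $car2 is now all ones if the subtraction borrowed (the
	! difference must be discarded) and zero otherwise: the
	! and/andn/or below is a branch-free select of the source
	! pointer for the .Lcopy loop.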
272 and $tp,$car2,$ap
273 andn $rp,$car2,$np
274 or $ap,$np,$ap
275 sub %g0,$num,%o7
276
277.Lcopy:
278 ld [$ap+%o7],%o0 ! copy or in-place refresh
279 st %g0,[$tp+%o7] ! zap tp
280 st %o0,[$rp+%o7]
281 add %o7,4,%o7
282 brnz %o7,.Lcopy
283 nop
284 mov 1,%i0
285 ret
286 restore
287___
288
289########
290######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
291######## code without the dedicated squaring procedure that follows.
292########
293$sbit="%i2"; # re-use $bp!
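#
# The squaring trick below: every cross product ap[0]*ap[j], j>0, must be
# doubled.  Rather than form 2*ap[0], which no longer fits in 32 bits,
# the code halves the ap[0]^2 term up front (srlx $car0,1, parking the
# displaced bit in $sbit) and then doubles the accumulated chain one
# 32-bit digit at a time (add $acc0,$acc0,$acc0), chaining the bit
# shifted out of each digit through $sbit into the next.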
294
295$code.=<<___;
296.align 32
297.Lbn_sqr_mont:
298 mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
299 mulx $apj,$mul0,$tmp0 !prologue!
300 and $car0,$mask,$acc0
301 add %sp,$bias+$frame,$tp
302 ld [$ap+8],$apj !prologue!
303
304 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
305 srlx $car0,32,$car0
306 and $mul1,$mask,$mul1
307
308 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
309 mulx $npj,$mul1,$acc1 !prologue!
310 and $car0,1,$sbit
311 ld [$np+8],$npj !prologue!
312 srlx $car0,1,$car0
313 add $acc0,$car1,$car1
314 srlx $car1,32,$car1
315 mov $tmp0,$acc0 !prologue!
316
317.Lsqr_1st:
318 mulx $apj,$mul0,$tmp0
319 mulx $npj,$mul1,$tmp1
320 add $acc0,$car0,$car0 ! ap[j]*a0+c0
321 add $acc1,$car1,$car1
322 ld [$ap+$j],$apj ! ap[j]
323 and $car0,$mask,$acc0
324 ld [$np+$j],$npj ! np[j]
325 srlx $car0,32,$car0
326 add $acc0,$acc0,$acc0
327 or $sbit,$acc0,$acc0
328 mov $tmp1,$acc1
329 srlx $acc0,32,$sbit
330 add $j,4,$j ! j++
331 and $acc0,$mask,$acc0
332 cmp $j,$num
333 add $acc0,$car1,$car1
334 st $car1,[$tp]
335 mov $tmp0,$acc0
336 srlx $car1,32,$car1
337 bl %icc,.Lsqr_1st
338 add $tp,4,$tp ! tp++
339!.Lsqr_1st
340
341 mulx $apj,$mul0,$tmp0 ! epilogue
342 mulx $npj,$mul1,$tmp1
343 add $acc0,$car0,$car0 ! ap[j]*a0+c0
344 add $acc1,$car1,$car1
345 and $car0,$mask,$acc0
346 srlx $car0,32,$car0
347 add $acc0,$acc0,$acc0
348 or $sbit,$acc0,$acc0
349 srlx $acc0,32,$sbit
350 and $acc0,$mask,$acc0
351 add $acc0,$car1,$car1
352 st $car1,[$tp]
353 srlx $car1,32,$car1
354
355 add $tmp0,$car0,$car0 ! ap[j]*a0+c0
356 add $tmp1,$car1,$car1
357 and $car0,$mask,$acc0
358 srlx $car0,32,$car0
359 add $acc0,$acc0,$acc0
360 or $sbit,$acc0,$acc0
361 srlx $acc0,32,$sbit
362 and $acc0,$mask,$acc0
363 add $acc0,$car1,$car1
364 st $car1,[$tp+4]
365 srlx $car1,32,$car1
366
367 add $car0,$car0,$car0
368 or $sbit,$car0,$car0
369 add $car0,$car1,$car1
370 st $car1,[$tp+8]
371 srlx $car1,32,$car2
372
373 ld [%sp+$bias+$frame],$tmp0 ! tp[0]
374 ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
375 ld [%sp+$bias+$frame+8],$tpj ! tp[2]
376 ld [$ap+4],$mul0 ! ap[1]
377 ld [$ap+8],$apj ! ap[2]
378 ld [$np],$car1 ! np[0]
379 ld [$np+4],$npj ! np[1]
380 mulx $n0,$tmp0,$mul1
381
382 mulx $mul0,$mul0,$car0
383 and $mul1,$mask,$mul1
384
385 mulx $car1,$mul1,$car1
386 mulx $npj,$mul1,$acc1
387 add $tmp0,$car1,$car1
388 and $car0,$mask,$acc0
389 ld [$np+8],$npj ! np[2]
390 srlx $car1,32,$car1
391 add $tmp1,$car1,$car1
392 srlx $car0,32,$car0
393 add $acc0,$car1,$car1
394 and $car0,1,$sbit
395 add $acc1,$car1,$car1
396 srlx $car0,1,$car0
397 mov 12,$j
398 st $car1,[%sp+$bias+$frame] ! tp[0]=
399 srlx $car1,32,$car1
400 add %sp,$bias+$frame+4,$tp
401
402.Lsqr_2nd:
403 mulx $apj,$mul0,$acc0
404 mulx $npj,$mul1,$acc1
405 add $acc0,$car0,$car0
406 add $tpj,$car1,$car1
407 ld [$ap+$j],$apj ! ap[j]
408 and $car0,$mask,$acc0
409 ld [$np+$j],$npj ! np[j]
410 srlx $car0,32,$car0
411 add $acc1,$car1,$car1
412 ld [$tp+8],$tpj ! tp[j]
413 add $acc0,$acc0,$acc0
414 add $j,4,$j ! j++
415 or $sbit,$acc0,$acc0
416 srlx $acc0,32,$sbit
417 and $acc0,$mask,$acc0
418 cmp $j,$num
419 add $acc0,$car1,$car1
420 st $car1,[$tp] ! tp[j-1]
421 srlx $car1,32,$car1
422 bl %icc,.Lsqr_2nd
423 add $tp,4,$tp ! tp++
424!.Lsqr_2nd
425
426 mulx $apj,$mul0,$acc0
427 mulx $npj,$mul1,$acc1
428 add $acc0,$car0,$car0
429 add $tpj,$car1,$car1
430 and $car0,$mask,$acc0
431 srlx $car0,32,$car0
432 add $acc1,$car1,$car1
433 add $acc0,$acc0,$acc0
434 or $sbit,$acc0,$acc0
435 srlx $acc0,32,$sbit
436 and $acc0,$mask,$acc0
437 add $acc0,$car1,$car1
438 st $car1,[$tp] ! tp[j-1]
439 srlx $car1,32,$car1
440
441 add $car0,$car0,$car0
442 or $sbit,$car0,$car0
443 add $car0,$car1,$car1
444 add $car2,$car1,$car1
445 st $car1,[$tp+4]
446 srlx $car1,32,$car2
447
448 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
449 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
450 ld [$ap+8],$mul0 ! ap[2]
451 ld [$np],$car1 ! np[0]
452 ld [$np+4],$npj ! np[1]
453 mulx $n0,$tmp1,$mul1
454 and $mul1,$mask,$mul1
455 mov 8,$i
456
457 mulx $mul0,$mul0,$car0
458 mulx $car1,$mul1,$car1
459 and $car0,$mask,$acc0
460 add $tmp1,$car1,$car1
461 srlx $car0,32,$car0
462 add %sp,$bias+$frame,$tp
463 srlx $car1,32,$car1
464 and $car0,1,$sbit
465 srlx $car0,1,$car0
466 mov 4,$j
467
468.Lsqr_outer:
469.Lsqr_inner1:
470 mulx $npj,$mul1,$acc1
471 add $tpj,$car1,$car1
472 add $j,4,$j
473 ld [$tp+8],$tpj
474 cmp $j,$i
475 add $acc1,$car1,$car1
476 ld [$np+$j],$npj
477 st $car1,[$tp]
478 srlx $car1,32,$car1
479 bl %icc,.Lsqr_inner1
480 add $tp,4,$tp
481!.Lsqr_inner1
482
483 add $j,4,$j
484 ld [$ap+$j],$apj ! ap[j]
485 mulx $npj,$mul1,$acc1
486 add $tpj,$car1,$car1
487 ld [$np+$j],$npj ! np[j]
488 add $acc0,$car1,$car1
489 ld [$tp+8],$tpj ! tp[j]
490 add $acc1,$car1,$car1
491 st $car1,[$tp]
492 srlx $car1,32,$car1
493
494 add $j,4,$j
495 cmp $j,$num
496 be,pn %icc,.Lsqr_no_inner2
497 add $tp,4,$tp
498
499.Lsqr_inner2:
500 mulx $apj,$mul0,$acc0
501 mulx $npj,$mul1,$acc1
502 add $tpj,$car1,$car1
503 add $acc0,$car0,$car0
504 ld [$ap+$j],$apj ! ap[j]
505 and $car0,$mask,$acc0
506 ld [$np+$j],$npj ! np[j]
507 srlx $car0,32,$car0
508 add $acc0,$acc0,$acc0
509 ld [$tp+8],$tpj ! tp[j]
510 or $sbit,$acc0,$acc0
511 add $j,4,$j ! j++
512 srlx $acc0,32,$sbit
513 and $acc0,$mask,$acc0
514 cmp $j,$num
515 add $acc0,$car1,$car1
516 add $acc1,$car1,$car1
517 st $car1,[$tp] ! tp[j-1]
518 srlx $car1,32,$car1
519 bl %icc,.Lsqr_inner2
520 add $tp,4,$tp ! tp++
521
522.Lsqr_no_inner2:
523 mulx $apj,$mul0,$acc0
524 mulx $npj,$mul1,$acc1
525 add $tpj,$car1,$car1
526 add $acc0,$car0,$car0
527 and $car0,$mask,$acc0
528 srlx $car0,32,$car0
529 add $acc0,$acc0,$acc0
530 or $sbit,$acc0,$acc0
531 srlx $acc0,32,$sbit
532 and $acc0,$mask,$acc0
533 add $acc0,$car1,$car1
534 add $acc1,$car1,$car1
535 st $car1,[$tp] ! tp[j-1]
536 srlx $car1,32,$car1
537
538 add $car0,$car0,$car0
539 or $sbit,$car0,$car0
540 add $car0,$car1,$car1
541 add $car2,$car1,$car1
542 st $car1,[$tp+4]
543 srlx $car1,32,$car2
544
545 add $i,4,$i ! i++
546 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
547 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
548 ld [$ap+$i],$mul0 ! ap[j]
549 ld [$np],$car1 ! np[0]
550 ld [$np+4],$npj ! np[1]
551 mulx $n0,$tmp1,$mul1
552 and $mul1,$mask,$mul1
553 add $i,4,$tmp0
554
555 mulx $mul0,$mul0,$car0
556 mulx $car1,$mul1,$car1
557 and $car0,$mask,$acc0
558 add $tmp1,$car1,$car1
559 srlx $car0,32,$car0
560 add %sp,$bias+$frame,$tp
561 srlx $car1,32,$car1
562 and $car0,1,$sbit
563 srlx $car0,1,$car0
564
565 cmp $tmp0,$num ! i<num-1
566 bl %icc,.Lsqr_outer
567 mov 4,$j
568
569.Lsqr_last:
570 mulx $npj,$mul1,$acc1
571 add $tpj,$car1,$car1
572 add $j,4,$j
573 ld [$tp+8],$tpj
574 cmp $j,$i
575 add $acc1,$car1,$car1
576 ld [$np+$j],$npj
577 st $car1,[$tp]
578 srlx $car1,32,$car1
579 bl %icc,.Lsqr_last
580 add $tp,4,$tp
581!.Lsqr_last
582
583 mulx $npj,$mul1,$acc1
584 add $tpj,$car1,$car1
585 add $acc0,$car1,$car1
586 add $acc1,$car1,$car1
587 st $car1,[$tp]
588 srlx $car1,32,$car1
589
590 add $car0,$car0,$car0 ! recover $car0
591 or $sbit,$car0,$car0
592 add $car0,$car1,$car1
593 add $car2,$car1,$car1
594 st $car1,[$tp+4]
595 srlx $car1,32,$car2
596
597 ba .Ltail
598 add $tp,8,$tp
599.type $fname,#function
600.size $fname,(.-$fname)
601.asciz	"Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
602.align 32
603___
604$code =~ s/\`([^\`]*)\`/eval($1)/gem;
605print $code;
606close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl b/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl
deleted file mode 100755
index a14205f2f0..0000000000
--- a/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl
+++ /dev/null
@@ -1,882 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# October 2005
11#
12# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
13# Because unlike integer multiplier, which simply stalls whole CPU,
14# FPU is fully pipelined and can effectively emit 48 bit partial
15# product every cycle. Why not blended SPARC v9? One can argue that
16# making this module dependent on UltraSPARC VIS extension limits its
17# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
18# implementations from compatibility matrix. But the rest, whole Sun
19# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
20# VIS extension instructions used in this module. This is considered
21# good enough to not care about HAL SPARC64 users [if any] who have
22# integer-only pure SPARCv9 module to "fall down" to.
23
24# USI&II cores currently exhibit a uniform 2x improvement [over the pre-
25# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
26# performance improves by a few percent for shorter keys and worsens by
27# a few percent for longer keys. This is because the USIII integer
28# multiplier is >3x faster than the USI&II one, which is harder to match
29# [but see the TODO list below]. It should also be noted that SPARC64 V
30# features out-of-order execution, which *might* mean that its integer
31# multiplier is pipelined, which in turn *might* be impossible to
32# match... As an additional note, SPARC64 V implements an FP multiply-add
33# instruction, which is perfectly usable in this context... In other
34# words, as far as Fujitsu SPARC64 V goes, talk to the author:-)
35
36# The implementation implies the following "non-natural" limitations on
37# input arguments:
38# - num may not be less than 4;
39# - num has to be even;
40# Failure to meet either condition has no fatal effects: the routine
41# simply returns 0 ("unsupported input value") and no speed is gained.
42
43# TODO:
44# - modulo-schedule the inner loop for better performance (on an in-order
45#   execution core such as UltraSPARC this should result in a further
46#   noticeable(!) improvement);
47# - dedicated squaring procedure[?];
48
49######################################################################
50# November 2006
51#
52# Modulo-scheduled inner loops make it possible to interleave floating
53# point and integer instructions and minimize Read-After-Write penalties.
54# This results in a *further* 20-50% performance improvement [depending
55# on key length, more for longer keys] on USI&II cores and 30-80% on
56# USIII&IV.
57
58$fname="bn_mul_mont_fpu";
59$bits=32;
60for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
61
62if ($bits==64) {
63 $bias=2047;
64 $frame=192;
65} else {
66 $bias=0;
67 $frame=128; # 96 rounded up to largest known cache-line
68}
69$locals=64;
70
71# In order to provide for 32-/64-bit ABI duality, I keep integers wider
72# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
73# exclusively for pointers, indexes and other small values...
74# int bn_mul_mont(
75$rp="%i0"; # BN_ULONG *rp,
76$ap="%i1"; # const BN_ULONG *ap,
77$bp="%i2"; # const BN_ULONG *bp,
78$np="%i3"; # const BN_ULONG *np,
79$n0="%i4"; # const BN_ULONG *n0,
80$num="%i5"; # int num);
81
82$tp="%l0"; # t[num]
83$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
84$ap_h="%l2"; # to these four vectors as double-precision FP values.
85$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
86$np_h="%l4"; # loop and L1-cache aliasing is minimized...
87$i="%l5";
88$j="%l6";
89$mask="%l7"; # 16-bit mask, 0xffff
90
91$n0="%g4"; # reassigned(!) to "64-bit" register
92$carry="%i4"; # %i4 reused(!) for a carry bit
93
94# FP register naming chart
95#
96# ..HILO
97# dcba
98# --------
99# LOa
100# LOb
101# LOc
102# LOd
103# HIa
104# HIb
105# HIc
106# HId
107# ..a
108# ..b
109$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
110$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
111$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
112$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
113
114$dota="%f24"; $dotb="%f26";
115
116$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
117$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
118$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
119$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
120
121$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
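#
# How the FPU path works: each 32-bit limb of bp[i] and of the n0 product
# is split into four 16-bit digits (the ldda/%asi loads below), converted
# with fxtod, and multiplied as doubles; 16x16-bit partial products and
# their short sums stay exact within the 53-bit mantissa.  After fdtox
# the integer pipe reassembles 64-bit limbs from the four partial sums.
# A rough C sketch of that gather step (illustrative only, hypothetical
# names):
#
#	#include <stdint.h>
#
#	static uint64_t gather16(uint64_t p[4], uint64_t *carry)
#	{
#		p[1] += p[0] >> 16;	/* ripple 16-bit carries upward */
#		p[2] += p[1] >> 16;
#		p[3] += p[2] >> 16;
#		*carry = p[3] >> 16;	/* the "34-bit carry" below */
#		return (p[0] & 0xffff) | ((p[1] & 0xffff) << 16) |
#		    ((p[2] & 0xffff) << 32) | (p[3] << 48);
#	}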
122
123$code=<<___;
124.section ".text",#alloc,#execinstr
125
126.global $fname
127.align 32
128$fname:
129 save %sp,-$frame-$locals,%sp
130
131 cmp $num,4
132 bl,a,pn %icc,.Lret
133 clr %i0
134 andcc $num,1,%g0 ! $num has to be even...
135 bnz,a,pn %icc,.Lret
136 clr %i0 ! signal "unsupported input value"
137
138 srl $num,1,$num
139 sethi %hi(0xffff),$mask
140 ld [%i4+0],$n0 ! $n0 reassigned, remember?
141 or $mask,%lo(0xffff),$mask
142 ld [%i4+4],%o0
143 sllx %o0,32,%o0
144 or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
145
146 sll $num,3,$num ! num*=8
147
148 add %sp,$bias,%o0 ! real top of stack
149 sll $num,2,%o1
150 add %o1,$num,%o1 ! %o1=num*5
151 sub %o0,%o1,%o0
152 and %o0,-2048,%o0 ! optimize TLB utilization
153 sub %o0,$bias,%sp ! alloca(5*num*8)
154
155 rd %asi,%o7 ! save %asi
156 add %sp,$bias+$frame+$locals,$tp
157 add $tp,$num,$ap_l
158 add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
159 add $ap_l,$num,$ap_h
160 add $ap_h,$num,$np_l
161 add $np_l,$num,$np_h
162
163 wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
164
165 add $rp,$num,$rp ! readjust input pointers to point
166 add $ap,$num,$ap ! at the ends too...
167 add $bp,$num,$bp
168 add $np,$num,$np
169
170 stx %o7,[%sp+$bias+$frame+48] ! save %asi
171
172 sub %g0,$num,$i ! i=-num
173 sub %g0,$num,$j ! j=-num
174
175 add $ap,$j,%o3
176 add $bp,$i,%o4
177
178 ld [%o3+4],%g1 ! bp[0]
179 ld [%o3+0],%o0
180 ld [%o4+4],%g5 ! ap[0]
181 sllx %g1,32,%g1
182 ld [%o4+0],%o1
183 sllx %g5,32,%g5
184 or %g1,%o0,%o0
185 or %g5,%o1,%o1
186
187 add $np,$j,%o5
188
189 mulx %o1,%o0,%o0 ! ap[0]*bp[0]
190 mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
191 stx %o0,[%sp+$bias+$frame+0]
192
193 ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
194 fzeros $alo
195 ld [%o3+4],$ahi_
196 fzeros $ahi
197 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
198 fzeros $nlo
199 ld [%o5+4],$nhi_
200 fzeros $nhi
201
202 ! transfer b[i] to FPU as 4x16-bit values
203 ldda [%o4+2]%asi,$ba
204 fxtod $alo,$alo
205 ldda [%o4+0]%asi,$bb
206 fxtod $ahi,$ahi
207 ldda [%o4+6]%asi,$bc
208 fxtod $nlo,$nlo
209 ldda [%o4+4]%asi,$bd
210 fxtod $nhi,$nhi
211
212 ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
213 ldda [%sp+$bias+$frame+6]%asi,$na
214 fxtod $ba,$ba
215 ldda [%sp+$bias+$frame+4]%asi,$nb
216 fxtod $bb,$bb
217 ldda [%sp+$bias+$frame+2]%asi,$nc
218 fxtod $bc,$bc
219 ldda [%sp+$bias+$frame+0]%asi,$nd
220 fxtod $bd,$bd
221
222 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
223 fxtod $na,$na
224 std $ahi,[$ap_h+$j]
225 fxtod $nb,$nb
226 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
227 fxtod $nc,$nc
228 std $nhi,[$np_h+$j]
229 fxtod $nd,$nd
230
231 fmuld $alo,$ba,$aloa
232 fmuld $nlo,$na,$nloa
233 fmuld $alo,$bb,$alob
234 fmuld $nlo,$nb,$nlob
235 fmuld $alo,$bc,$aloc
236 faddd $aloa,$nloa,$nloa
237 fmuld $nlo,$nc,$nloc
238 fmuld $alo,$bd,$alod
239 faddd $alob,$nlob,$nlob
240 fmuld $nlo,$nd,$nlod
241 fmuld $ahi,$ba,$ahia
242 faddd $aloc,$nloc,$nloc
243 fmuld $nhi,$na,$nhia
244 fmuld $ahi,$bb,$ahib
245 faddd $alod,$nlod,$nlod
246 fmuld $nhi,$nb,$nhib
247 fmuld $ahi,$bc,$ahic
248 faddd $ahia,$nhia,$nhia
249 fmuld $nhi,$nc,$nhic
250 fmuld $ahi,$bd,$ahid
251 faddd $ahib,$nhib,$nhib
252 fmuld $nhi,$nd,$nhid
253
254 faddd $ahic,$nhic,$dota ! $nhic
255 faddd $ahid,$nhid,$dotb ! $nhid
256
257 faddd $nloc,$nhia,$nloc
258 faddd $nlod,$nhib,$nlod
259
260 fdtox $nloa,$nloa
261 fdtox $nlob,$nlob
262 fdtox $nloc,$nloc
263 fdtox $nlod,$nlod
264
265 std $nloa,[%sp+$bias+$frame+0]
266 add $j,8,$j
267 std $nlob,[%sp+$bias+$frame+8]
268 add $ap,$j,%o4
269 std $nloc,[%sp+$bias+$frame+16]
270 add $np,$j,%o5
271 std $nlod,[%sp+$bias+$frame+24]
272
273 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
274 fzeros $alo
275 ld [%o4+4],$ahi_
276 fzeros $ahi
277 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
278 fzeros $nlo
279 ld [%o5+4],$nhi_
280 fzeros $nhi
281
282 fxtod $alo,$alo
283 fxtod $ahi,$ahi
284 fxtod $nlo,$nlo
285 fxtod $nhi,$nhi
286
287 ldx [%sp+$bias+$frame+0],%o0
288 fmuld $alo,$ba,$aloa
289 ldx [%sp+$bias+$frame+8],%o1
290 fmuld $nlo,$na,$nloa
291 ldx [%sp+$bias+$frame+16],%o2
292 fmuld $alo,$bb,$alob
293 ldx [%sp+$bias+$frame+24],%o3
294 fmuld $nlo,$nb,$nlob
295
296 srlx %o0,16,%o7
297 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
298 fmuld $alo,$bc,$aloc
299 add %o7,%o1,%o1
300 std $ahi,[$ap_h+$j]
301 faddd $aloa,$nloa,$nloa
302 fmuld $nlo,$nc,$nloc
303 srlx %o1,16,%o7
304 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
305 fmuld $alo,$bd,$alod
306 add %o7,%o2,%o2
307 std $nhi,[$np_h+$j]
308 faddd $alob,$nlob,$nlob
309 fmuld $nlo,$nd,$nlod
310 srlx %o2,16,%o7
311 fmuld $ahi,$ba,$ahia
312 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
313 faddd $aloc,$nloc,$nloc
314 fmuld $nhi,$na,$nhia
315 !and %o0,$mask,%o0
316 !and %o1,$mask,%o1
317 !and %o2,$mask,%o2
318 !sllx %o1,16,%o1
319 !sllx %o2,32,%o2
320 !sllx %o3,48,%o7
321 !or %o1,%o0,%o0
322 !or %o2,%o0,%o0
323 !or %o7,%o0,%o0 ! 64-bit result
324 srlx %o3,16,%g1 ! 34-bit carry
325 fmuld $ahi,$bb,$ahib
326
327 faddd $alod,$nlod,$nlod
328 fmuld $nhi,$nb,$nhib
329 fmuld $ahi,$bc,$ahic
330 faddd $ahia,$nhia,$nhia
331 fmuld $nhi,$nc,$nhic
332 fmuld $ahi,$bd,$ahid
333 faddd $ahib,$nhib,$nhib
334 fmuld $nhi,$nd,$nhid
335
336 faddd $dota,$nloa,$nloa
337 faddd $dotb,$nlob,$nlob
338 faddd $ahic,$nhic,$dota ! $nhic
339 faddd $ahid,$nhid,$dotb ! $nhid
340
341 faddd $nloc,$nhia,$nloc
342 faddd $nlod,$nhib,$nlod
343
344 fdtox $nloa,$nloa
345 fdtox $nlob,$nlob
346 fdtox $nloc,$nloc
347 fdtox $nlod,$nlod
348
349 std $nloa,[%sp+$bias+$frame+0]
350 std $nlob,[%sp+$bias+$frame+8]
351 addcc $j,8,$j
352 std $nloc,[%sp+$bias+$frame+16]
353 bz,pn %icc,.L1stskip
354 std $nlod,[%sp+$bias+$frame+24]
355
356.align 32 ! incidentally already aligned !
357.L1st:
358 add $ap,$j,%o4
359 add $np,$j,%o5
360 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
361 fzeros $alo
362 ld [%o4+4],$ahi_
363 fzeros $ahi
364 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
365 fzeros $nlo
366 ld [%o5+4],$nhi_
367 fzeros $nhi
368
369 fxtod $alo,$alo
370 fxtod $ahi,$ahi
371 fxtod $nlo,$nlo
372 fxtod $nhi,$nhi
373
374 ldx [%sp+$bias+$frame+0],%o0
375 fmuld $alo,$ba,$aloa
376 ldx [%sp+$bias+$frame+8],%o1
377 fmuld $nlo,$na,$nloa
378 ldx [%sp+$bias+$frame+16],%o2
379 fmuld $alo,$bb,$alob
380 ldx [%sp+$bias+$frame+24],%o3
381 fmuld $nlo,$nb,$nlob
382
383 srlx %o0,16,%o7
384 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
385 fmuld $alo,$bc,$aloc
386 add %o7,%o1,%o1
387 std $ahi,[$ap_h+$j]
388 faddd $aloa,$nloa,$nloa
389 fmuld $nlo,$nc,$nloc
390 srlx %o1,16,%o7
391 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
392 fmuld $alo,$bd,$alod
393 add %o7,%o2,%o2
394 std $nhi,[$np_h+$j]
395 faddd $alob,$nlob,$nlob
396 fmuld $nlo,$nd,$nlod
397 srlx %o2,16,%o7
398 fmuld $ahi,$ba,$ahia
399 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
400 and %o0,$mask,%o0
401 faddd $aloc,$nloc,$nloc
402 fmuld $nhi,$na,$nhia
403 and %o1,$mask,%o1
404 and %o2,$mask,%o2
405 fmuld $ahi,$bb,$ahib
406 sllx %o1,16,%o1
407 faddd $alod,$nlod,$nlod
408 fmuld $nhi,$nb,$nhib
409 sllx %o2,32,%o2
410 fmuld $ahi,$bc,$ahic
411 sllx %o3,48,%o7
412 or %o1,%o0,%o0
413 faddd $ahia,$nhia,$nhia
414 fmuld $nhi,$nc,$nhic
415 or %o2,%o0,%o0
416 fmuld $ahi,$bd,$ahid
417 or %o7,%o0,%o0 ! 64-bit result
418 faddd $ahib,$nhib,$nhib
419 fmuld $nhi,$nd,$nhid
420 addcc %g1,%o0,%o0
421 faddd $dota,$nloa,$nloa
422 srlx %o3,16,%g1 ! 34-bit carry
423 faddd $dotb,$nlob,$nlob
424 bcs,a %xcc,.+8
425 add %g1,1,%g1
426
427 stx %o0,[$tp] ! tp[j-1]=
428
429 faddd $ahic,$nhic,$dota ! $nhic
430 faddd $ahid,$nhid,$dotb ! $nhid
431
432 faddd $nloc,$nhia,$nloc
433 faddd $nlod,$nhib,$nlod
434
435 fdtox $nloa,$nloa
436 fdtox $nlob,$nlob
437 fdtox $nloc,$nloc
438 fdtox $nlod,$nlod
439
440 std $nloa,[%sp+$bias+$frame+0]
441 std $nlob,[%sp+$bias+$frame+8]
442 std $nloc,[%sp+$bias+$frame+16]
443 std $nlod,[%sp+$bias+$frame+24]
444
445 addcc $j,8,$j
446 bnz,pt %icc,.L1st
447 add $tp,8,$tp
448
449.L1stskip:
450 fdtox $dota,$dota
451 fdtox $dotb,$dotb
452
453 ldx [%sp+$bias+$frame+0],%o0
454 ldx [%sp+$bias+$frame+8],%o1
455 ldx [%sp+$bias+$frame+16],%o2
456 ldx [%sp+$bias+$frame+24],%o3
457
458 srlx %o0,16,%o7
459 std $dota,[%sp+$bias+$frame+32]
460 add %o7,%o1,%o1
461 std $dotb,[%sp+$bias+$frame+40]
462 srlx %o1,16,%o7
463 add %o7,%o2,%o2
464 srlx %o2,16,%o7
465 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
466 and %o0,$mask,%o0
467 and %o1,$mask,%o1
468 and %o2,$mask,%o2
469 sllx %o1,16,%o1
470 sllx %o2,32,%o2
471 sllx %o3,48,%o7
472 or %o1,%o0,%o0
473 or %o2,%o0,%o0
474 or %o7,%o0,%o0 ! 64-bit result
475 ldx [%sp+$bias+$frame+32],%o4
476 addcc %g1,%o0,%o0
477 ldx [%sp+$bias+$frame+40],%o5
478 srlx %o3,16,%g1 ! 34-bit carry
479 bcs,a %xcc,.+8
480 add %g1,1,%g1
481
482 stx %o0,[$tp] ! tp[j-1]=
483 add $tp,8,$tp
484
485 srlx %o4,16,%o7
486 add %o7,%o5,%o5
487 and %o4,$mask,%o4
488 sllx %o5,16,%o7
489 or %o7,%o4,%o4
490 addcc %g1,%o4,%o4
491 srlx %o5,48,%g1
492 bcs,a %xcc,.+8
493 add %g1,1,%g1
494
495 mov %g1,$carry
496 stx %o4,[$tp] ! tp[num-1]=
497
498 ba .Louter
499 add $i,8,$i
500.align 32
501.Louter:
502 sub %g0,$num,$j ! j=-num
503 add %sp,$bias+$frame+$locals,$tp
504
505 add $ap,$j,%o3
506 add $bp,$i,%o4
507
508 ld [%o3+4],%g1 ! bp[i]
509 ld [%o3+0],%o0
510 ld [%o4+4],%g5 ! ap[0]
511 sllx %g1,32,%g1
512 ld [%o4+0],%o1
513 sllx %g5,32,%g5
514 or %g1,%o0,%o0
515 or %g5,%o1,%o1
516
517 ldx [$tp],%o2 ! tp[0]
518 mulx %o1,%o0,%o0
519 addcc %o2,%o0,%o0
520 mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
521 stx %o0,[%sp+$bias+$frame+0]
522
523 ! transfer b[i] to FPU as 4x16-bit values
524 ldda [%o4+2]%asi,$ba
525 ldda [%o4+0]%asi,$bb
526 ldda [%o4+6]%asi,$bc
527 ldda [%o4+4]%asi,$bd
528
529 ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
530 ldda [%sp+$bias+$frame+6]%asi,$na
531 fxtod $ba,$ba
532 ldda [%sp+$bias+$frame+4]%asi,$nb
533 fxtod $bb,$bb
534 ldda [%sp+$bias+$frame+2]%asi,$nc
535 fxtod $bc,$bc
536 ldda [%sp+$bias+$frame+0]%asi,$nd
537 fxtod $bd,$bd
538 ldd [$ap_l+$j],$alo ! load a[j] in double format
539 fxtod $na,$na
540 ldd [$ap_h+$j],$ahi
541 fxtod $nb,$nb
542 ldd [$np_l+$j],$nlo ! load n[j] in double format
543 fxtod $nc,$nc
544 ldd [$np_h+$j],$nhi
545 fxtod $nd,$nd
546
547 fmuld $alo,$ba,$aloa
548 fmuld $nlo,$na,$nloa
549 fmuld $alo,$bb,$alob
550 fmuld $nlo,$nb,$nlob
551 fmuld $alo,$bc,$aloc
552 faddd $aloa,$nloa,$nloa
553 fmuld $nlo,$nc,$nloc
554 fmuld $alo,$bd,$alod
555 faddd $alob,$nlob,$nlob
556 fmuld $nlo,$nd,$nlod
557 fmuld $ahi,$ba,$ahia
558 faddd $aloc,$nloc,$nloc
559 fmuld $nhi,$na,$nhia
560 fmuld $ahi,$bb,$ahib
561 faddd $alod,$nlod,$nlod
562 fmuld $nhi,$nb,$nhib
563 fmuld $ahi,$bc,$ahic
564 faddd $ahia,$nhia,$nhia
565 fmuld $nhi,$nc,$nhic
566 fmuld $ahi,$bd,$ahid
567 faddd $ahib,$nhib,$nhib
568 fmuld $nhi,$nd,$nhid
569
570 faddd $ahic,$nhic,$dota ! $nhic
571 faddd $ahid,$nhid,$dotb ! $nhid
572
573 faddd $nloc,$nhia,$nloc
574 faddd $nlod,$nhib,$nlod
575
576 fdtox $nloa,$nloa
577 fdtox $nlob,$nlob
578 fdtox $nloc,$nloc
579 fdtox $nlod,$nlod
580
581 std $nloa,[%sp+$bias+$frame+0]
582 std $nlob,[%sp+$bias+$frame+8]
583 std $nloc,[%sp+$bias+$frame+16]
584 add $j,8,$j
585 std $nlod,[%sp+$bias+$frame+24]
586
587 ldd [$ap_l+$j],$alo ! load a[j] in double format
588 ldd [$ap_h+$j],$ahi
589 ldd [$np_l+$j],$nlo ! load n[j] in double format
590 ldd [$np_h+$j],$nhi
591
592 fmuld $alo,$ba,$aloa
593 fmuld $nlo,$na,$nloa
594 fmuld $alo,$bb,$alob
595 fmuld $nlo,$nb,$nlob
596 fmuld $alo,$bc,$aloc
597 ldx [%sp+$bias+$frame+0],%o0
598 faddd $aloa,$nloa,$nloa
599 fmuld $nlo,$nc,$nloc
600 ldx [%sp+$bias+$frame+8],%o1
601 fmuld $alo,$bd,$alod
602 ldx [%sp+$bias+$frame+16],%o2
603 faddd $alob,$nlob,$nlob
604 fmuld $nlo,$nd,$nlod
605 ldx [%sp+$bias+$frame+24],%o3
606 fmuld $ahi,$ba,$ahia
607
608 srlx %o0,16,%o7
609 faddd $aloc,$nloc,$nloc
610 fmuld $nhi,$na,$nhia
611 add %o7,%o1,%o1
612 fmuld $ahi,$bb,$ahib
613 srlx %o1,16,%o7
614 faddd $alod,$nlod,$nlod
615 fmuld $nhi,$nb,$nhib
616 add %o7,%o2,%o2
617 fmuld $ahi,$bc,$ahic
618 srlx %o2,16,%o7
619 faddd $ahia,$nhia,$nhia
620 fmuld $nhi,$nc,$nhic
621 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
622 ! why?
623 and %o0,$mask,%o0
624 fmuld $ahi,$bd,$ahid
625 and %o1,$mask,%o1
626 and %o2,$mask,%o2
627 faddd $ahib,$nhib,$nhib
628 fmuld $nhi,$nd,$nhid
629 sllx %o1,16,%o1
630 faddd $dota,$nloa,$nloa
631 sllx %o2,32,%o2
632 faddd $dotb,$nlob,$nlob
633 sllx %o3,48,%o7
634 or %o1,%o0,%o0
635 faddd $ahic,$nhic,$dota ! $nhic
636 or %o2,%o0,%o0
637 faddd $ahid,$nhid,$dotb ! $nhid
638 or %o7,%o0,%o0 ! 64-bit result
639 ldx [$tp],%o7
640 faddd $nloc,$nhia,$nloc
641 addcc %o7,%o0,%o0
642 ! end-of-why?
643 faddd $nlod,$nhib,$nlod
644 srlx %o3,16,%g1 ! 34-bit carry
645 fdtox $nloa,$nloa
646 bcs,a %xcc,.+8
647 add %g1,1,%g1
648
649 fdtox $nlob,$nlob
650 fdtox $nloc,$nloc
651 fdtox $nlod,$nlod
652
653 std $nloa,[%sp+$bias+$frame+0]
654 std $nlob,[%sp+$bias+$frame+8]
655 addcc $j,8,$j
656 std $nloc,[%sp+$bias+$frame+16]
657 bz,pn %icc,.Linnerskip
658 std $nlod,[%sp+$bias+$frame+24]
659
660 ba .Linner
661 nop
662.align 32
663.Linner:
664 ldd [$ap_l+$j],$alo ! load a[j] in double format
665 ldd [$ap_h+$j],$ahi
666 ldd [$np_l+$j],$nlo ! load n[j] in double format
667 ldd [$np_h+$j],$nhi
668
669 fmuld $alo,$ba,$aloa
670 fmuld $nlo,$na,$nloa
671 fmuld $alo,$bb,$alob
672 fmuld $nlo,$nb,$nlob
673 fmuld $alo,$bc,$aloc
674 ldx [%sp+$bias+$frame+0],%o0
675 faddd $aloa,$nloa,$nloa
676 fmuld $nlo,$nc,$nloc
677 ldx [%sp+$bias+$frame+8],%o1
678 fmuld $alo,$bd,$alod
679 ldx [%sp+$bias+$frame+16],%o2
680 faddd $alob,$nlob,$nlob
681 fmuld $nlo,$nd,$nlod
682 ldx [%sp+$bias+$frame+24],%o3
683 fmuld $ahi,$ba,$ahia
684
685 srlx %o0,16,%o7
686 faddd $aloc,$nloc,$nloc
687 fmuld $nhi,$na,$nhia
688 add %o7,%o1,%o1
689 fmuld $ahi,$bb,$ahib
690 srlx %o1,16,%o7
691 faddd $alod,$nlod,$nlod
692 fmuld $nhi,$nb,$nhib
693 add %o7,%o2,%o2
694 fmuld $ahi,$bc,$ahic
695 srlx %o2,16,%o7
696 faddd $ahia,$nhia,$nhia
697 fmuld $nhi,$nc,$nhic
698 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
699 and %o0,$mask,%o0
700 fmuld $ahi,$bd,$ahid
701 and %o1,$mask,%o1
702 and %o2,$mask,%o2
703 faddd $ahib,$nhib,$nhib
704 fmuld $nhi,$nd,$nhid
705 sllx %o1,16,%o1
706 faddd $dota,$nloa,$nloa
707 sllx %o2,32,%o2
708 faddd $dotb,$nlob,$nlob
709 sllx %o3,48,%o7
710 or %o1,%o0,%o0
711 faddd $ahic,$nhic,$dota ! $nhic
712 or %o2,%o0,%o0
713 faddd $ahid,$nhid,$dotb ! $nhid
714 or %o7,%o0,%o0 ! 64-bit result
715 faddd $nloc,$nhia,$nloc
716 addcc %g1,%o0,%o0
717 ldx [$tp+8],%o7 ! tp[j]
718 faddd $nlod,$nhib,$nlod
719 srlx %o3,16,%g1 ! 34-bit carry
720 fdtox $nloa,$nloa
721 bcs,a %xcc,.+8
722 add %g1,1,%g1
723 fdtox $nlob,$nlob
724 addcc %o7,%o0,%o0
725 fdtox $nloc,$nloc
726 bcs,a %xcc,.+8
727 add %g1,1,%g1
728
729 stx %o0,[$tp] ! tp[j-1]
730 fdtox $nlod,$nlod
731
732 std $nloa,[%sp+$bias+$frame+0]
733 std $nlob,[%sp+$bias+$frame+8]
734 std $nloc,[%sp+$bias+$frame+16]
735 addcc $j,8,$j
736 std $nlod,[%sp+$bias+$frame+24]
737 bnz,pt %icc,.Linner
738 add $tp,8,$tp
739
740.Linnerskip:
741 fdtox $dota,$dota
742 fdtox $dotb,$dotb
743
744 ldx [%sp+$bias+$frame+0],%o0
745 ldx [%sp+$bias+$frame+8],%o1
746 ldx [%sp+$bias+$frame+16],%o2
747 ldx [%sp+$bias+$frame+24],%o3
748
749 srlx %o0,16,%o7
750 std $dota,[%sp+$bias+$frame+32]
751 add %o7,%o1,%o1
752 std $dotb,[%sp+$bias+$frame+40]
753 srlx %o1,16,%o7
754 add %o7,%o2,%o2
755 srlx %o2,16,%o7
756 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
757 and %o0,$mask,%o0
758 and %o1,$mask,%o1
759 and %o2,$mask,%o2
760 sllx %o1,16,%o1
761 sllx %o2,32,%o2
762 sllx %o3,48,%o7
763 or %o1,%o0,%o0
764 or %o2,%o0,%o0
765 ldx [%sp+$bias+$frame+32],%o4
766 or %o7,%o0,%o0 ! 64-bit result
767 ldx [%sp+$bias+$frame+40],%o5
768 addcc %g1,%o0,%o0
769 ldx [$tp+8],%o7 ! tp[j]
770 srlx %o3,16,%g1 ! 34-bit carry
771 bcs,a %xcc,.+8
772 add %g1,1,%g1
773
774 addcc %o7,%o0,%o0
775 bcs,a %xcc,.+8
776 add %g1,1,%g1
777
778 stx %o0,[$tp] ! tp[j-1]
779 add $tp,8,$tp
780
781 srlx %o4,16,%o7
782 add %o7,%o5,%o5
783 and %o4,$mask,%o4
784 sllx %o5,16,%o7
785 or %o7,%o4,%o4
786 addcc %g1,%o4,%o4
787 srlx %o5,48,%g1
788 bcs,a %xcc,.+8
789 add %g1,1,%g1
790
791 addcc $carry,%o4,%o4
792 stx %o4,[$tp] ! tp[num-1]
793 mov %g1,$carry
794 bcs,a %xcc,.+8
795 add $carry,1,$carry
796
797 addcc $i,8,$i
798 bnz %icc,.Louter
799 nop
800
801 add $tp,8,$tp ! adjust tp to point at the end
802 orn %g0,%g0,%g4
803 sub %g0,$num,%o7 ! n=-num
804 ba .Lsub
805 subcc %g0,%g0,%g0 ! clear %icc.c
806
807.align 32
808.Lsub:
809 ldx [$tp+%o7],%o0
810 add $np,%o7,%g1
811 ld [%g1+0],%o2
812 ld [%g1+4],%o3
813 srlx %o0,32,%o1
814 subccc %o0,%o2,%o2
815 add $rp,%o7,%g1
816 subccc %o1,%o3,%o3
817 st %o2,[%g1+0]
818 add %o7,8,%o7
819 brnz,pt %o7,.Lsub
820 st %o3,[%g1+4]
821 subc $carry,0,%g4
822 sub %g0,$num,%o7 ! n=-num
823 ba .Lcopy
824 nop
825
826.align 32
827.Lcopy:
828 ldx [$tp+%o7],%o0
829 add $rp,%o7,%g1
830 ld [%g1+0],%o2
831 ld [%g1+4],%o3
832 stx %g0,[$tp+%o7]
833 and %o0,%g4,%o0
834 srlx %o0,32,%o1
835 andn %o2,%g4,%o2
836 andn %o3,%g4,%o3
837 or %o2,%o0,%o0
838 or %o3,%o1,%o1
839 st %o0,[%g1+0]
840 add %o7,8,%o7
841 brnz,pt %o7,.Lcopy
842 st %o1,[%g1+4]
843 sub %g0,$num,%o7 ! n=-num
844
845.Lzap:
846 stx %g0,[$ap_l+%o7]
847 stx %g0,[$ap_h+%o7]
848 stx %g0,[$np_l+%o7]
849 stx %g0,[$np_h+%o7]
850 add %o7,8,%o7
851 brnz,pt %o7,.Lzap
852 nop
853
854 ldx [%sp+$bias+$frame+48],%o7
855 wr %g0,%o7,%asi ! restore %asi
856
857 mov 1,%i0
858.Lret:
859 ret
860 restore
861.type $fname,#function
862.size $fname,(.-$fname)
863.asciz	"Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
864.align 32
865___
866
867$code =~ s/\`([^\`]*)\`/eval($1)/gem;
868
869# The substitution below makes it possible to compile without demanding
870# VIS extensions on the command line, e.g. -xarch=v9 vs. -xarch=v9a. I
871# dare to do this because VIS capability is detected at run-time now
872# and this routine is not called on a CPU not capable of executing it. Do
873# note that fzeros is not the only VIS dependency! Another dependency
874# is implicit and is just _a_ numerical value loaded into the %asi
875# register, which the assembler can't recognize as VIS-specific...
876$code =~ s/fzeros\s+%f([0-9]+)/
877 sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
878 /gem;
879
880print $code;
881# flush
882close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/via-mont.pl b/src/lib/libcrypto/bn/asm/via-mont.pl
deleted file mode 100644
index c046a514c8..0000000000
--- a/src/lib/libcrypto/bn/asm/via-mont.pl
+++ /dev/null
@@ -1,242 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# Wrapper around 'rep montmul', a VIA-specific instruction accessing the
11# PadLock Montgomery Multiplier. The wrapper is designed as a drop-in
12# replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9].
13#
14# Below are interleaved outputs from 'openssl speed rsa dsa' for 4
15# different software configurations on a 1.5GHz VIA Esther processor.
16# Lines marked with "software integer" denote performance of the hand-
17# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2"
18# refers to the hand-coded SSE2 Montgomery multiplication procedure
19# found in OpenSSL 0.9.9. "Hardware VIA SDK" refers to the padlock_pmm
20# routine from Padlock SDK 2.0.1, available for download from VIA, which
21# naturally utilizes the magic 'repz montmul' instruction. And finally
22# "hardware this" refers to *this* implementation, which also uses
23# 'repz montmul'.
24# sign verify sign/s verify/s
25# rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer
26# rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2
27# rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK
28# rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this
29#
30# rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer
31# rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2
32# rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK
33# rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this
34#
35# rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer
36# rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2
37# rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK
38# rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this
39#
40# rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer
41# rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2
42# rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK
43# rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this
44#
45# dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer
46# dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2
47# dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK
48# dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this
49#
50# dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer
51# dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2
52# dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK
53# dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this
54#
55# dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer
56# dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2
57# dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK
58# dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this
59#
60# To give you some other reference point here is output for 2.4GHz P4
61# running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software
62# SSE2" in the terms above.
63#
64# rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0
65# rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0
66# rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9
67# rsa 4096 bits 0.109770s 0.002379s 9.1 420.3
68# dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1
69# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0
70# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1
71#
72# Conclusions:
73# - VIA SDK leaves a *lot* of room for improvement (which this
74# implementation successfully fills:-);
75# - 'rep montmul' gives up to a >3x performance improvement, depending
76#   on key length;
77# - in terms of absolute performance it delivers approximately as much
78# as modern out-of-order 32-bit cores [again, for longer keys].
79
80$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
81push(@INC,"${dir}","${dir}../../perlasm");
82require "x86asm.pl";
83
84&asm_init($ARGV[0],"via-mont.pl");
85
86# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
87$func="bn_mul_mont_padlock";
88
89$pad=16*1; # amount of reserved bytes on top of every vector
90
91# stack layout
92$mZeroPrime=&DWP(0,"esp"); # these are specified by VIA
93$A=&DWP(4,"esp");
94$B=&DWP(8,"esp");
95$T=&DWP(12,"esp");
96$M=&DWP(16,"esp");
97$scratch=&DWP(20,"esp");
98$rp=&DWP(24,"esp"); # these are mine
99$sp=&DWP(28,"esp");
100# &DWP(32,"esp") # 32 byte scratch area
101# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num]
102# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num]
103# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num]
104# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num]
105# Note that the SDK suggests unconditionally allocating 2K per vector. This
106# has quite an impact on performance. It naturally depends on key length,
107# but to give an example 1024 bit private RSA key operations suffer >30%
108# penalty. I allocate only as much as actually required...
109
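Read together, the offsets above amount to the control block handed to the hardware. The following C struct is a sketch of that block, with field roles inferred from the comments above (the names are illustrative; VIA's PadLock documentation is the authoritative reference):

	#include <stdint.h>

	struct padlock_mont_block {	/* lives at the 64-byte aligned esp */
		uint32_t  m_zero_prime;	/* -np[0]^-1 mod 2^32               */
		uint32_t *A;		/* padded copy of ap                */
		uint32_t *B;		/* padded copy of bp                */
		uint32_t *T;		/* padded temporary vector tp       */
		uint32_t *M;		/* padded copy of the modulus np    */
		uint32_t *scratch;	/* 32-byte scratch area             */
	};
	/* 'rep montmul' is then issued with ESI pointing at this block and
	 * ECX holding num*32, i.e. the operand size converted to bits. */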
110&function_begin($func);
111 &xor ("eax","eax");
112 &mov ("ecx",&wparam(5)); # num
113	# meet VIA's limitations for num [note that the specification
114	# expresses them in bits, while we work with a count of 32-bit words]
115 &test ("ecx",3);
116 &jnz (&label("leave")); # num % 4 != 0
117 &cmp ("ecx",8);
118 &jb (&label("leave")); # num < 8
119 &cmp ("ecx",1024);
120 &ja (&label("leave")); # num > 1024
121
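In C terms the three checks above are a simple operand filter; a minimal sketch (padlock_num_ok is an illustrative name):

	/* reject operands 'rep montmul' cannot handle; eax stays 0 and the
	 * caller falls back to the software path */
	static int
	padlock_num_ok(int num)
	{
		return num % 4 == 0 && num >= 8 && num <= 1024;
	}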
122 &pushf ();
123 &cld ();
124
125 &mov ("edi",&wparam(0)); # rp
126 &mov ("eax",&wparam(1)); # ap
127 &mov ("ebx",&wparam(2)); # bp
128 &mov ("edx",&wparam(3)); # np
129 &mov ("esi",&wparam(4)); # n0
130 &mov ("esi",&DWP(0,"esi")); # *n0
131
132 &lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes
133 &lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes
134 &neg ("ebp");
135 &add ("ebp","esp");
136 &and ("ebp",-64); # align to cache-line
137 &xchg ("ebp","esp"); # alloca
138
139 &mov ($rp,"edi"); # save rp
140 &mov ($sp,"ebp"); # save esp
141
142 &mov ($mZeroPrime,"esi");
143 &lea ("esi",&DWP(64,"esp")); # tp
144 &mov ($T,"esi");
145 &lea ("edi",&DWP(32,"esp")); # scratch area
146 &mov ($scratch,"edi");
147 &mov ("esi","eax");
148
149 &lea ("ebp",&DWP(-$pad,"ecx"));
150 &shr ("ebp",2); # restore original num value in ebp
151
152 &xor ("eax","eax");
153
154 &mov ("ecx","ebp");
155 &lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
156 &data_byte(0xf3,0xab); # rep stosl, bzero
157
158 &mov ("ecx","ebp");
159 &lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
160 &mov ($A,"edi");
161 &data_byte(0xf3,0xa5); # rep movsl, memcpy
162 &mov ("ecx",$pad/4);
163 &data_byte(0xf3,0xab); # rep stosl, bzero pad
164 # edi points at the end of padded ap copy...
165
166 &mov ("ecx","ebp");
167 &mov ("esi","ebx");
168 &mov ($B,"edi");
169 &data_byte(0xf3,0xa5); # rep movsl, memcpy
170 &mov ("ecx",$pad/4);
171 &data_byte(0xf3,0xab); # rep stosl, bzero pad
172 # edi points at the end of padded bp copy...
173
174 &mov ("ecx","ebp");
175 &mov ("esi","edx");
176 &mov ($M,"edi");
177 &data_byte(0xf3,0xa5); # rep movsl, memcpy
178 &mov ("ecx",$pad/4);
179 &data_byte(0xf3,0xab); # rep stosl, bzero pad
180 # edi points at the end of padded np copy...
181
182 # let magic happen...
183 &mov ("ecx","ebp");
184 &mov ("esi","esp");
185 &shl ("ecx",5); # convert word counter to bit counter
186 &align (4);
187 &data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
188
189 &mov ("ecx","ebp");
190 &lea ("esi",&DWP(64,"esp")); # tp
191 # edi still points at the end of padded np copy...
192 &neg ("ebp");
193 &lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind"
194 &mov ("edi",$rp); # restore rp
195 &xor ("edx","edx"); # i=0 and clear CF
196
197&set_label("sub",8);
198 &mov ("eax",&DWP(0,"esi","edx",4));
199 &sbb ("eax",&DWP(0,"ebp","edx",4));
200 &mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i]
201 &lea ("edx",&DWP(1,"edx")); # i++
202 &loop (&label("sub")); # doesn't affect CF!
203
204 &mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit
205 &sbb ("eax",0);
206 &and ("esi","eax");
207 &not ("eax");
208 &mov ("ebp","edi");
209 &and ("ebp","eax");
210 &or ("esi","ebp"); # tp=carry?tp:rp
211
212 &mov ("ecx","edx"); # num
213 &xor ("edx","edx"); # i=0
214
215&set_label("copy",8);
216 &mov ("eax",&DWP(0,"esi","edx",4));
217 &mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp
218 &mov (&DWP(0,"edi","edx",4),"eax");
219 &lea ("edx",&DWP(1,"edx")); # i++
220 &loop (&label("copy"));
221
222 &mov ("ebp",$sp);
223 &xor ("eax","eax");
224
225 &mov ("ecx",64/4);
226 &mov ("edi","esp"); # zap frame including scratch area
227 &data_byte(0xf3,0xab); # rep stosl, bzero
228
229 # zap copies of ap, bp and np
230 &lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
231 &lea ("ecx",&DWP(3*$pad/4,"edx","edx",2));
232 &data_byte(0xf3,0xab); # rep stosl, bzero
233
234 &mov ("esp","ebp");
235 &inc ("eax"); # signal "done"
236 &popf ();
237&set_label("leave");
238&function_end($func);
239
240&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
241
242&asm_finish();
diff --git a/src/lib/libcrypto/bn/asm/x86-gf2m.pl b/src/lib/libcrypto/bn/asm/x86-gf2m.pl
deleted file mode 100644
index 808a1e5969..0000000000
--- a/src/lib/libcrypto/bn/asm/x86-gf2m.pl
+++ /dev/null
@@ -1,313 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# May 2011
11#
12# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
13# in bn_gf2m.c. It's a kind of low-hanging mechanical port from C for
14# the time being... Except that it has three code paths: pure integer
15# code suitable for any x86 CPU, MMX code suitable for PIII and later
16# and PCLMULQDQ code suitable for Westmere and later. Improvement varies
17# from one benchmark and µ-arch to another. Below are interval values
18# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
19# code:
20#
21# PIII 16%-30%
22# P4 12%-12%
23# Opteron 18%-40%
24# Core2 19%-44%
25# Atom 38%-64%
26# Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX)
27# Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX)
28#
29# Note that the above improvement coefficients are not coefficients for
30# bn_GF2m_mul_2x2 itself. For example, the 120% ECDH improvement is the
31# result of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, the
32# benchmark is more and more dominated by other subroutines, most
33# notably by BN_GF2m_mod[_mul]_arr...
34
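For reference, the 3-bit-window technique that both code paths below implement looks like this in portable C. This is a sketch modeled on the generic bn_gf2m.c code this module ports, not part of the module itself (gf2m_mul_1x1 is an illustrative name):

	#include <stdint.h>

	/* 32x32 -> 64-bit carry-less (GF(2)[x]) multiply, 3 bits of b at a
	 * time. The two top bits of a are masked off so every table entry
	 * fits in a word; their contribution is xored back in at the end. */
	static uint64_t
	gf2m_mul_1x1(uint32_t a, uint32_t b)
	{
		uint64_t tab[8], r;
		uint32_t top2 = a >> 30;
		uint32_t a1 = a & 0x3fffffff, a2 = a1 << 1, a4 = a2 << 1;
		int i;

		tab[0] = 0;  tab[1] = a1;      tab[2] = a2;      tab[3] = a1 ^ a2;
		tab[4] = a4; tab[5] = a1 ^ a4; tab[6] = a2 ^ a4; tab[7] = a1 ^ a2 ^ a4;

		r = tab[b & 7];
		for (i = 3; i <= 30; i += 3)
			r ^= tab[(b >> i) & 7] << i;

		if (top2 & 1)
			r ^= (uint64_t)b << 30;	/* bit 30 of a */
		if (top2 & 2)
			r ^= (uint64_t)b << 31;	/* bit 31 of a */
		return r;
	}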
35$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
36push(@INC,"${dir}","${dir}../../perlasm");
37require "x86asm.pl";
38
39&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");
40
41$sse2=0;
42for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
43
44&external_label("OPENSSL_ia32cap_P") if ($sse2);
45
46$a="eax";
47$b="ebx";
48($a1,$a2,$a4)=("ecx","edx","ebp");
49
50$R="mm0";
51@T=("mm1","mm2");
52($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
53@i=("esi","edi");
54
55 if (!$x86only) {
56&function_begin_B("_mul_1x1_mmx");
57 &sub ("esp",32+4);
58 &mov ($a1,$a);
59 &lea ($a2,&DWP(0,$a,$a));
60 &and ($a1,0x3fffffff);
61 &lea ($a4,&DWP(0,$a2,$a2));
62 &mov (&DWP(0*4,"esp"),0);
63 &and ($a2,0x7fffffff);
64 &movd ($A,$a);
65 &movd ($B,$b);
66 &mov (&DWP(1*4,"esp"),$a1); # a1
67 &xor ($a1,$a2); # a1^a2
68 &pxor ($B31,$B31);
69 &pxor ($B30,$B30);
70 &mov (&DWP(2*4,"esp"),$a2); # a2
71 &xor ($a2,$a4); # a2^a4
72 &mov (&DWP(3*4,"esp"),$a1); # a1^a2
73 &pcmpgtd($B31,$A); # broadcast 31st bit
74 &paddd ($A,$A); # $A<<=1
75 &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
76 &mov (&DWP(4*4,"esp"),$a4); # a4
77 &xor ($a4,$a2); # a2=a4^a2^a4
78 &pand ($B31,$B);
79 &pcmpgtd($B30,$A); # broadcast 30th bit
80 &mov (&DWP(5*4,"esp"),$a1); # a1^a4
81 &xor ($a4,$a1); # a1^a2^a4
82 &psllq ($B31,31);
83 &pand ($B30,$B);
84 &mov (&DWP(6*4,"esp"),$a2); # a2^a4
85 &mov (@i[0],0x7);
86 &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
87 &mov ($a4,@i[0]);
88 &and (@i[0],$b);
89 &shr ($b,3);
90 &mov (@i[1],$a4);
91 &psllq ($B30,30);
92 &and (@i[1],$b);
93 &shr ($b,3);
94 &movd ($R,&DWP(0,"esp",@i[0],4));
95 &mov (@i[0],$a4);
96 &and (@i[0],$b);
97 &shr ($b,3);
98 for($n=1;$n<9;$n++) {
99 &movd (@T[1],&DWP(0,"esp",@i[1],4));
100 &mov (@i[1],$a4);
101 &psllq (@T[1],3*$n);
102 &and (@i[1],$b);
103 &shr ($b,3);
104 &pxor ($R,@T[1]);
105
106 push(@i,shift(@i)); push(@T,shift(@T));
107 }
108 &movd (@T[1],&DWP(0,"esp",@i[1],4));
109 &pxor ($R,$B30);
110 &psllq (@T[1],3*$n++);
111 &pxor ($R,@T[1]);
112
113 &movd (@T[0],&DWP(0,"esp",@i[0],4));
114 &pxor ($R,$B31);
115 &psllq (@T[0],3*$n);
116 &add ("esp",32+4);
117 &pxor ($R,@T[0]);
118 &ret ();
119&function_end_B("_mul_1x1_mmx");
120 }
121
122($lo,$hi)=("eax","edx");
123@T=("ecx","ebp");
124
125&function_begin_B("_mul_1x1_ialu");
126 &sub ("esp",32+4);
127 &mov ($a1,$a);
128 &lea ($a2,&DWP(0,$a,$a));
129 &lea ($a4,&DWP(0,"",$a,4));
130 &and ($a1,0x3fffffff);
131 &lea (@i[1],&DWP(0,$lo,$lo));
132 &sar ($lo,31); # broadcast 31st bit
133 &mov (&DWP(0*4,"esp"),0);
134 &and ($a2,0x7fffffff);
135 &mov (&DWP(1*4,"esp"),$a1); # a1
136 &xor ($a1,$a2); # a1^a2
137 &mov (&DWP(2*4,"esp"),$a2); # a2
138 &xor ($a2,$a4); # a2^a4
139 &mov (&DWP(3*4,"esp"),$a1); # a1^a2
140 &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
141 &mov (&DWP(4*4,"esp"),$a4); # a4
142 &xor ($a4,$a2); # a2=a4^a2^a4
143 &mov (&DWP(5*4,"esp"),$a1); # a1^a4
144 &xor ($a4,$a1); # a1^a2^a4
145	&sar	(@i[1],31);		# broadcast 30th bit
146 &and ($lo,$b);
147 &mov (&DWP(6*4,"esp"),$a2); # a2^a4
148 &and (@i[1],$b);
149 &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
150 &mov ($hi,$lo);
151 &shl ($lo,31);
152 &mov (@T[0],@i[1]);
153 &shr ($hi,1);
154
155 &mov (@i[0],0x7);
156 &shl (@i[1],30);
157 &and (@i[0],$b);
158 &shr (@T[0],2);
159 &xor ($lo,@i[1]);
160
161 &shr ($b,3);
162 &mov (@i[1],0x7); # 5-byte instruction!?
163 &and (@i[1],$b);
164 &shr ($b,3);
165 &xor ($hi,@T[0]);
166 &xor ($lo,&DWP(0,"esp",@i[0],4));
167 &mov (@i[0],0x7);
168 &and (@i[0],$b);
169 &shr ($b,3);
170 for($n=1;$n<9;$n++) {
171 &mov (@T[1],&DWP(0,"esp",@i[1],4));
172 &mov (@i[1],0x7);
173 &mov (@T[0],@T[1]);
174 &shl (@T[1],3*$n);
175 &and (@i[1],$b);
176 &shr (@T[0],32-3*$n);
177 &xor ($lo,@T[1]);
178 &shr ($b,3);
179 &xor ($hi,@T[0]);
180
181 push(@i,shift(@i)); push(@T,shift(@T));
182 }
183 &mov (@T[1],&DWP(0,"esp",@i[1],4));
184 &mov (@T[0],@T[1]);
185 &shl (@T[1],3*$n);
186 &mov (@i[1],&DWP(0,"esp",@i[0],4));
187 &shr (@T[0],32-3*$n); $n++;
188 &mov (@i[0],@i[1]);
189 &xor ($lo,@T[1]);
190 &shl (@i[1],3*$n);
191 &xor ($hi,@T[0]);
192 &shr (@i[0],32-3*$n);
193 &xor ($lo,@i[1]);
194 &xor ($hi,@i[0]);
195
196 &add ("esp",32+4);
197 &ret ();
198&function_end_B("_mul_1x1_ialu");
199
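bn_GF2m_mul_2x2 below combines three such 1x1 products Karatsuba-style: hi, lo, and a cross term derived from the product of the xored halves. As a portable C sketch (reusing the illustrative gf2m_mul_1x1 from above):

	/* (a1*x^32 + a0) * (b1*x^32 + b0) over GF(2)[x]; 128-bit result.
	 * Uses gf2m_mul_1x1() from the sketch earlier in this file. */
	static void
	gf2m_mul_2x2(uint64_t r[2], uint32_t a1, uint32_t a0,
	    uint32_t b1, uint32_t b0)
	{
		uint64_t hi  = gf2m_mul_1x1(a1, b1);
		uint64_t lo  = gf2m_mul_1x1(a0, b0);
		uint64_t mid = gf2m_mul_1x1(a1 ^ a0, b1 ^ b0) ^ hi ^ lo;

		r[0] = lo ^ (mid << 32);	/* low 64 bits  */
		r[1] = hi ^ (mid >> 32);	/* high 64 bits */
	}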
200# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
201&function_begin_B("bn_GF2m_mul_2x2");
202if (!$x86only) {
203 &picmeup("edx","OPENSSL_ia32cap_P");
204 &mov ("eax",&DWP(0,"edx"));
205 &mov ("edx",&DWP(4,"edx"));
206 &test ("eax",1<<23); # check MMX bit
207 &jz (&label("ialu"));
208if ($sse2) {
209 &test ("eax",1<<24); # check FXSR bit
210 &jz (&label("mmx"));
211 &test ("edx",1<<1); # check PCLMULQDQ bit
212 &jz (&label("mmx"));
213
214 &movups ("xmm0",&QWP(8,"esp"));
215 &shufps ("xmm0","xmm0",0b10110001);
216 &pclmulqdq ("xmm0","xmm0",1);
217 &mov ("eax",&DWP(4,"esp"));
218 &movups (&QWP(0,"eax"),"xmm0");
219 &ret ();
220
221&set_label("mmx",16);
222}
223 &push ("ebp");
224 &push ("ebx");
225 &push ("esi");
226 &push ("edi");
227 &mov ($a,&wparam(1));
228 &mov ($b,&wparam(3));
229 &call ("_mul_1x1_mmx"); # a1·b1
230 &movq ("mm7",$R);
231
232 &mov ($a,&wparam(2));
233 &mov ($b,&wparam(4));
234 &call ("_mul_1x1_mmx"); # a0·b0
235 &movq ("mm6",$R);
236
237 &mov ($a,&wparam(1));
238 &mov ($b,&wparam(3));
239 &xor ($a,&wparam(2));
240 &xor ($b,&wparam(4));
241 &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1)
242 &pxor ($R,"mm7");
243 &mov ($a,&wparam(0));
244 &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0
245
246 &movq ($A,$R);
247 &psllq ($R,32);
248 &pop ("edi");
249 &psrlq ($A,32);
250 &pop ("esi");
251 &pxor ($R,"mm6");
252 &pop ("ebx");
253 &pxor ($A,"mm7");
254 &movq (&QWP(0,$a),$R);
255 &pop ("ebp");
256 &movq (&QWP(8,$a),$A);
257 &emms ();
258 &ret ();
259&set_label("ialu",16);
260}
261 &push ("ebp");
262 &push ("ebx");
263 &push ("esi");
264 &push ("edi");
265 &stack_push(4+1);
266
267 &mov ($a,&wparam(1));
268 &mov ($b,&wparam(3));
269 &call ("_mul_1x1_ialu"); # a1·b1
270 &mov (&DWP(8,"esp"),$lo);
271 &mov (&DWP(12,"esp"),$hi);
272
273 &mov ($a,&wparam(2));
274 &mov ($b,&wparam(4));
275 &call ("_mul_1x1_ialu"); # a0·b0
276 &mov (&DWP(0,"esp"),$lo);
277 &mov (&DWP(4,"esp"),$hi);
278
279 &mov ($a,&wparam(1));
280 &mov ($b,&wparam(3));
281 &xor ($a,&wparam(2));
282 &xor ($b,&wparam(4));
283 &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1)
284
285 &mov ("ebp",&wparam(0));
286 @r=("ebx","ecx","edi","esi");
287 &mov (@r[0],&DWP(0,"esp"));
288 &mov (@r[1],&DWP(4,"esp"));
289 &mov (@r[2],&DWP(8,"esp"));
290 &mov (@r[3],&DWP(12,"esp"));
291
292 &xor ($lo,$hi);
293 &xor ($hi,@r[1]);
294 &xor ($lo,@r[0]);
295 &mov (&DWP(0,"ebp"),@r[0]);
296 &xor ($hi,@r[2]);
297 &mov (&DWP(12,"ebp"),@r[3]);
298 &xor ($lo,@r[3]);
299 &stack_pop(4+1);
300 &xor ($hi,@r[3]);
301 &pop ("edi");
302 &xor ($lo,$hi);
303 &pop ("esi");
304 &mov (&DWP(8,"ebp"),$hi);
305 &pop ("ebx");
306 &mov (&DWP(4,"ebp"),$lo);
307 &pop ("ebp");
308 &ret ();
309&function_end_B("bn_GF2m_mul_2x2");
310
311&asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
312
313&asm_finish();
diff --git a/src/lib/libcrypto/bn/asm/x86-mont.pl b/src/lib/libcrypto/bn/asm/x86-mont.pl
deleted file mode 100755
index e8f6b05084..0000000000
--- a/src/lib/libcrypto/bn/asm/x86-mont.pl
+++ /dev/null
@@ -1,593 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# October 2005
11#
12# This is "teaser" code, as it can be improved in several ways...
13# First of all, a non-SSE2 path should be implemented (yes, for now it
14# performs Montgomery multiplication/convolution only on SSE2-capable
15# CPUs such as P4; others fall back to the original code). Then the inner loop
16# can be unrolled and modulo-scheduled to improve ILP and possibly
17# moved to 128-bit XMM register bank (though it would require input
18# rearrangement and/or increase bus bandwidth utilization). Dedicated
19# squaring procedure should give further performance improvement...
20# Yet, even as a draft, the code improves the rsa512 *sign* benchmark by
21# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
22
23# December 2006
24#
25# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
26# Integer-only code [being equipped with dedicated squaring procedure]
27# gives ~40% on rsa512 sign benchmark...
28
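For orientation, both the SSE2 and integer paths below implement the same word-level algorithm. This is a portable C sketch of it (32-bit words, ap and bp assumed already reduced below np; mont_mul_ref is an illustrative name, not part of the module):

	#include <stdint.h>
	#include <string.h>

	/* rp = ap*bp*R^-1 mod np, R = 2^(32*num), n0 = -np[0]^-1 mod 2^32 */
	static void
	mont_mul_ref(uint32_t *rp, const uint32_t *ap, const uint32_t *bp,
	    const uint32_t *np, uint32_t n0, int num)
	{
		uint32_t tp[num + 2];	/* C99 VLA standing in for the alloca */
		uint32_t m, borrow = 0;
		uint64_t c;
		int i, j;

		memset(tp, 0, sizeof(tp));
		for (i = 0; i < num; i++) {
			for (c = 0, j = 0; j < num; j++) {  /* tp += ap*bp[i] */
				c += (uint64_t)ap[j] * bp[i] + tp[j];
				tp[j] = (uint32_t)c;
				c >>= 32;
			}
			c += tp[num];
			tp[num] = (uint32_t)c;
			tp[num + 1] = (uint32_t)(c >> 32);

			m = tp[0] * n0;		/* Montgomery factor */
			for (c = 0, j = 0; j < num; j++) {  /* (tp+=np*m)>>32 */
				c += (uint64_t)np[j] * m + tp[j];
				if (j > 0)
					tp[j - 1] = (uint32_t)c;
				c >>= 32;
			}
			c += tp[num];
			tp[num - 1] = (uint32_t)c;
			tp[num] = tp[num + 1] + (uint32_t)(c >> 32);
			tp[num + 1] = 0;
		}
		for (j = 0; j < num; j++) {	/* conditional final subtraction */
			uint64_t d = (uint64_t)tp[j] - np[j] - borrow;
			rp[j] = (uint32_t)d;
			borrow = (uint32_t)(d >> 32) & 1;
		}
		if (tp[num] < borrow)		/* tp < np: keep tp instead */
			memcpy(rp, tp, num * sizeof(uint32_t));
	}

The assembler keeps tp on the stack, interleaves the two inner passes word by word, and performs the final selection branch-free with masks rather than the if shown here.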
29$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
30push(@INC,"${dir}","${dir}../../perlasm");
31require "x86asm.pl";
32
33&asm_init($ARGV[0],$0);
34
35$sse2=0;
36for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
37
38&external_label("OPENSSL_ia32cap_P") if ($sse2);
39
40&function_begin("bn_mul_mont");
41
42$i="edx";
43$j="ecx";
44$ap="esi"; $tp="esi"; # overlapping variables!!!
45$rp="edi"; $bp="edi"; # overlapping variables!!!
46$np="ebp";
47$num="ebx";
48
49$_num=&DWP(4*0,"esp"); # stack top layout
50$_rp=&DWP(4*1,"esp");
51$_ap=&DWP(4*2,"esp");
52$_bp=&DWP(4*3,"esp");
53$_np=&DWP(4*4,"esp");
54$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
55$_sp=&DWP(4*6,"esp");
56$_bpend=&DWP(4*7,"esp");
57$frame=32; # size of above frame rounded up to 16n
58
59 &xor ("eax","eax");
60 &mov ("edi",&wparam(5)); # int num
61 &cmp ("edi",4);
62 &jl (&label("just_leave"));
63
64 &lea ("esi",&wparam(0)); # put aside pointer to argument block
65 &lea ("edx",&wparam(1)); # load ap
66 &mov ("ebp","esp"); # saved stack pointer!
67 &add ("edi",2); # extra two words on top of tp
68 &neg ("edi");
69 &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
70 &neg ("edi");
71
72	# minimize cache contention by arranging a 2K window between the stack
73	# pointer and the ap argument [np is also a position-sensitive vector,
74 # but it's assumed to be near ap, as it's allocated at ~same
75 # time].
76 &mov ("eax","esp");
77 &sub ("eax","edx");
78 &and ("eax",2047);
79 &sub ("esp","eax"); # this aligns sp and ap modulo 2048
80
81 &xor ("edx","esp");
82 &and ("edx",2048);
83 &xor ("edx",2048);
84 &sub ("esp","edx"); # this splits them apart modulo 4096
85
86 &and ("esp",-64); # align to cache line
87
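The three steps above, restated as pointer arithmetic in a small C sketch (split_from is an illustrative name):

	#include <stdint.h>

	/* align sp with ap modulo 2048, force the two into opposite 2K
	 * halves of a 4K span, then round down to a 64-byte cache line */
	static uintptr_t
	split_from(uintptr_t sp, uintptr_t ap)
	{
		sp -= (sp - ap) & 2047;			/* sp == ap (mod 2048) */
		sp -= ((sp ^ ap) & 2048) ^ 2048;	/* opposite 2K halves  */
		return sp & ~(uintptr_t)63;		/* cache-line align    */
	}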
88 ################################# load argument block...
89 &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
90 &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
91 &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
92 &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
93 &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
94 #&mov ("edi",&DWP(5*4,"esi"));# int num
95
96 &mov ("esi",&DWP(0,"esi")); # pull n0[0]
97 &mov ($_rp,"eax"); # ... save a copy of argument block
98 &mov ($_ap,"ebx");
99 &mov ($_bp,"ecx");
100 &mov ($_np,"edx");
101 &mov ($_n0,"esi");
102 &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
103 #&mov ($_num,$num); # redundant as $num is not reused
104 &mov ($_sp,"ebp"); # saved stack pointer!
105
106if($sse2) {
107$acc0="mm0"; # mmx register bank layout
108$acc1="mm1";
109$car0="mm2";
110$car1="mm3";
111$mul0="mm4";
112$mul1="mm5";
113$temp="mm6";
114$mask="mm7";
115
116 &picmeup("eax","OPENSSL_ia32cap_P");
117 &bt (&DWP(0,"eax"),26);
118 &jnc (&label("non_sse2"));
119
120 &mov ("eax",-1);
121 &movd ($mask,"eax"); # mask 32 lower bits
122
123 &mov ($ap,$_ap); # load input pointers
124 &mov ($bp,$_bp);
125 &mov ($np,$_np);
126
127 &xor ($i,$i); # i=0
128 &xor ($j,$j); # j=0
129
130 &movd ($mul0,&DWP(0,$bp)); # bp[0]
131 &movd ($mul1,&DWP(0,$ap)); # ap[0]
132 &movd ($car1,&DWP(0,$np)); # np[0]
133
134 &pmuludq($mul1,$mul0); # ap[0]*bp[0]
135 &movq ($car0,$mul1);
136 &movq ($acc0,$mul1); # I wish movd worked for
137 &pand ($acc0,$mask); # inter-register transfers
138
139 &pmuludq($mul1,$_n0q); # *=n0
140
141 &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
142 &paddq ($car1,$acc0);
143
144 &movd ($acc1,&DWP(4,$np)); # np[1]
145 &movd ($acc0,&DWP(4,$ap)); # ap[1]
146
147 &psrlq ($car0,32);
148 &psrlq ($car1,32);
149
150 &inc ($j); # j++
151&set_label("1st",16);
152 &pmuludq($acc0,$mul0); # ap[j]*bp[0]
153 &pmuludq($acc1,$mul1); # np[j]*m1
154 &paddq ($car0,$acc0); # +=c0
155 &paddq ($car1,$acc1); # +=c1
156
157 &movq ($acc0,$car0);
158 &pand ($acc0,$mask);
159 &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
160 &paddq ($car1,$acc0); # +=ap[j]*bp[0];
161 &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
162 &psrlq ($car0,32);
163 &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
164 &psrlq ($car1,32);
165
166 &lea ($j,&DWP(1,$j));
167 &cmp ($j,$num);
168 &jl (&label("1st"));
169
170 &pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
171 &pmuludq($acc1,$mul1); # np[num-1]*m1
172 &paddq ($car0,$acc0); # +=c0
173 &paddq ($car1,$acc1); # +=c1
174
175 &movq ($acc0,$car0);
176 &pand ($acc0,$mask);
177 &paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
178 &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
179
180 &psrlq ($car0,32);
181 &psrlq ($car1,32);
182
183 &paddq ($car1,$car0);
184 &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
185
186 &inc ($i); # i++
187&set_label("outer");
188 &xor ($j,$j); # j=0
189
190 &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
191 &movd ($mul1,&DWP(0,$ap)); # ap[0]
192 &movd ($temp,&DWP($frame,"esp")); # tp[0]
193 &movd ($car1,&DWP(0,$np)); # np[0]
194 &pmuludq($mul1,$mul0); # ap[0]*bp[i]
195
196 &paddq ($mul1,$temp); # +=tp[0]
197 &movq ($acc0,$mul1);
198 &movq ($car0,$mul1);
199 &pand ($acc0,$mask);
200
201 &pmuludq($mul1,$_n0q); # *=n0
202
203 &pmuludq($car1,$mul1);
204 &paddq ($car1,$acc0);
205
206 &movd ($temp,&DWP($frame+4,"esp")); # tp[1]
207 &movd ($acc1,&DWP(4,$np)); # np[1]
208 &movd ($acc0,&DWP(4,$ap)); # ap[1]
209
210 &psrlq ($car0,32);
211 &psrlq ($car1,32);
212 &paddq ($car0,$temp); # +=tp[1]
213
214 &inc ($j); # j++
215 &dec ($num);
216&set_label("inner");
217 &pmuludq($acc0,$mul0); # ap[j]*bp[i]
218 &pmuludq($acc1,$mul1); # np[j]*m1
219 &paddq ($car0,$acc0); # +=c0
220 &paddq ($car1,$acc1); # +=c1
221
222 &movq ($acc0,$car0);
223 &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
224 &pand ($acc0,$mask);
225 &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
226 &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
227 &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
228 &psrlq ($car0,32);
229 &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
230 &psrlq ($car1,32);
231 &paddq ($car0,$temp); # +=tp[j+1]
232
233 &dec ($num);
234 &lea ($j,&DWP(1,$j)); # j++
235 &jnz (&label("inner"));
236
237 &mov ($num,$j);
238 &pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
239 &pmuludq($acc1,$mul1); # np[num-1]*m1
240 &paddq ($car0,$acc0); # +=c0
241 &paddq ($car1,$acc1); # +=c1
242
243 &movq ($acc0,$car0);
244 &pand ($acc0,$mask);
245 &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
246 &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
247 &psrlq ($car0,32);
248 &psrlq ($car1,32);
249
250 &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
251 &paddq ($car1,$car0);
252 &paddq ($car1,$temp);
253 &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
254
255 &lea ($i,&DWP(1,$i)); # i++
256 &cmp ($i,$num);
257 &jle (&label("outer"));
258
259 &emms (); # done with mmx bank
260 &jmp (&label("common_tail"));
261
262&set_label("non_sse2",16);
263}
264
265if (0) {
266 &mov ("esp",$_sp);
267 &xor ("eax","eax"); # signal "not fast enough [yet]"
268 &jmp (&label("just_leave"));
269	# While the code below provides competitive performance for
270	# all key lengths on modern Intel cores, it's still more
271	# than 10% slower for 4096-bit keys elsewhere:-( "Competitive"
272 # means compared to the original integer-only assembler.
273 # 512-bit RSA sign is better by ~40%, but that's about all
274 # one can say about all CPUs...
275} else {
276$inp="esi"; # integer path uses these registers differently
277$word="edi";
278$carry="ebp";
279
280 &mov ($inp,$_ap);
281 &lea ($carry,&DWP(1,$num));
282 &mov ($word,$_bp);
283 &xor ($j,$j); # j=0
284 &mov ("edx",$inp);
285 &and ($carry,1); # see if num is even
286 &sub ("edx",$word); # see if ap==bp
287 &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
288 &or ($carry,"edx");
289 &mov ($word,&DWP(0,$word)); # bp[0]
290 &jz (&label("bn_sqr_mont"));
291 &mov ($_bpend,"eax");
292 &mov ("eax",&DWP(0,$inp));
293 &xor ("edx","edx");
294
295&set_label("mull",16);
296 &mov ($carry,"edx");
297 &mul ($word); # ap[j]*bp[0]
298 &add ($carry,"eax");
299 &lea ($j,&DWP(1,$j));
300 &adc ("edx",0);
301 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
302 &cmp ($j,$num);
303 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
304 &jl (&label("mull"));
305
306 &mov ($carry,"edx");
307 &mul ($word); # ap[num-1]*bp[0]
308 &mov ($word,$_n0);
309 &add ("eax",$carry);
310 &mov ($inp,$_np);
311 &adc ("edx",0);
312 &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
313
314 &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
315 &xor ($j,$j);
316 &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
317 &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
318
319 &mov ("eax",&DWP(0,$inp)); # np[0]
320 &mul ($word); # np[0]*m
321 &add ("eax",&DWP($frame,"esp")); # +=tp[0]
322 &mov ("eax",&DWP(4,$inp)); # np[1]
323 &adc ("edx",0);
324 &inc ($j);
325
326 &jmp (&label("2ndmadd"));
327
328&set_label("1stmadd",16);
329 &mov ($carry,"edx");
330 &mul ($word); # ap[j]*bp[i]
331 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
332 &lea ($j,&DWP(1,$j));
333 &adc ("edx",0);
334 &add ($carry,"eax");
335 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
336 &adc ("edx",0);
337 &cmp ($j,$num);
338 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
339 &jl (&label("1stmadd"));
340
341 &mov ($carry,"edx");
342 &mul ($word); # ap[num-1]*bp[i]
343 &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
344 &mov ($word,$_n0);
345 &adc ("edx",0);
346 &mov ($inp,$_np);
347 &add ($carry,"eax");
348 &adc ("edx",0);
349 &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
350
351 &xor ($j,$j);
352 &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
353 &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
354 &adc ($j,0);
355 &mov ("eax",&DWP(0,$inp)); # np[0]
356 &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
357 &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
358
359 &mul ($word); # np[0]*m
360 &add ("eax",&DWP($frame,"esp")); # +=tp[0]
361 &mov ("eax",&DWP(4,$inp)); # np[1]
362 &adc ("edx",0);
363 &mov ($j,1);
364
365&set_label("2ndmadd",16);
366 &mov ($carry,"edx");
367 &mul ($word); # np[j]*m
368 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
369 &lea ($j,&DWP(1,$j));
370 &adc ("edx",0);
371 &add ($carry,"eax");
372 &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
373 &adc ("edx",0);
374 &cmp ($j,$num);
375 &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
376 &jl (&label("2ndmadd"));
377
378 &mov ($carry,"edx");
379 &mul ($word); # np[j]*m
380 &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
381 &adc ("edx",0);
382 &add ($carry,"eax");
383 &adc ("edx",0);
384 &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
385
386 &xor ("eax","eax");
387 &mov ($j,$_bp); # &bp[i]
388 &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
389 &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
390 &lea ($j,&DWP(4,$j));
391 &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
392 &cmp ($j,$_bpend);
393 &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
394 &je (&label("common_tail"));
395
396 &mov ($word,&DWP(0,$j)); # bp[i+1]
397 &mov ($inp,$_ap);
398 &mov ($_bp,$j); # &bp[++i]
399 &xor ($j,$j);
400 &xor ("edx","edx");
401 &mov ("eax",&DWP(0,$inp));
402 &jmp (&label("1stmadd"));
403
404&set_label("bn_sqr_mont",16);
405$sbit=$num;
406 &mov ($_num,$num);
407 &mov ($_bp,$j); # i=0
408
409 &mov ("eax",$word); # ap[0]
410 &mul ($word); # ap[0]*ap[0]
411 &mov (&DWP($frame,"esp"),"eax"); # tp[0]=
412 &mov ($sbit,"edx");
413 &shr ("edx",1);
414 &and ($sbit,1);
415 &inc ($j);
416&set_label("sqr",16);
417 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
418 &mov ($carry,"edx");
419 &mul ($word); # ap[j]*ap[0]
420 &add ("eax",$carry);
421 &lea ($j,&DWP(1,$j));
422 &adc ("edx",0);
423 &lea ($carry,&DWP(0,$sbit,"eax",2));
424 &shr ("eax",31);
425 &cmp ($j,$_num);
426 &mov ($sbit,"eax");
427 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
428 &jl (&label("sqr"));
429
430 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
431 &mov ($carry,"edx");
432 &mul ($word); # ap[num-1]*ap[0]
433 &add ("eax",$carry);
434 &mov ($word,$_n0);
435 &adc ("edx",0);
436 &mov ($inp,$_np);
437 &lea ($carry,&DWP(0,$sbit,"eax",2));
438 &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
439 &shr ("eax",31);
440 &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
441
442 &lea ($carry,&DWP(0,"eax","edx",2));
443 &mov ("eax",&DWP(0,$inp)); # np[0]
444 &shr ("edx",31);
445 &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
446 &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
447
448 &mul ($word); # np[0]*m
449 &add ("eax",&DWP($frame,"esp")); # +=tp[0]
450 &mov ($num,$j);
451 &adc ("edx",0);
452 &mov ("eax",&DWP(4,$inp)); # np[1]
453 &mov ($j,1);
454
455&set_label("3rdmadd",16);
456 &mov ($carry,"edx");
457 &mul ($word); # np[j]*m
458 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
459 &adc ("edx",0);
460 &add ($carry,"eax");
461 &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
462 &adc ("edx",0);
463 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
464
465 &mov ($carry,"edx");
466 &mul ($word); # np[j+1]*m
467 &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
468 &lea ($j,&DWP(2,$j));
469 &adc ("edx",0);
470 &add ($carry,"eax");
471 &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
472 &adc ("edx",0);
473 &cmp ($j,$num);
474 &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
475 &jl (&label("3rdmadd"));
476
477 &mov ($carry,"edx");
478 &mul ($word); # np[j]*m
479 &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
480 &adc ("edx",0);
481 &add ($carry,"eax");
482 &adc ("edx",0);
483 &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
484
485 &mov ($j,$_bp); # i
486 &xor ("eax","eax");
487 &mov ($inp,$_ap);
488 &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
489 &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
490 &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
491 &cmp ($j,$num);
492 &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
493 &je (&label("common_tail"));
494
495 &mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
496 &lea ($j,&DWP(1,$j));
497 &mov ("eax",$word);
498 &mov ($_bp,$j); # ++i
499 &mul ($word); # ap[i]*ap[i]
500 &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
501 &adc ("edx",0);
502 &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
503 &xor ($carry,$carry);
504 &cmp ($j,$num);
505 &lea ($j,&DWP(1,$j));
506 &je (&label("sqrlast"));
507
508 &mov ($sbit,"edx"); # zaps $num
509 &shr ("edx",1);
510 &and ($sbit,1);
511&set_label("sqradd",16);
512 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
513 &mov ($carry,"edx");
514 &mul ($word); # ap[j]*ap[i]
515 &add ("eax",$carry);
516 &lea ($carry,&DWP(0,"eax","eax"));
517 &adc ("edx",0);
518 &shr ("eax",31);
519 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
520 &lea ($j,&DWP(1,$j));
521 &adc ("eax",0);
522 &add ($carry,$sbit);
523 &adc ("eax",0);
524 &cmp ($j,$_num);
525 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
526 &mov ($sbit,"eax");
527 &jle (&label("sqradd"));
528
529 &mov ($carry,"edx");
530 &add ("edx","edx");
531 &shr ($carry,31);
532 &add ("edx",$sbit);
533 &adc ($carry,0);
534&set_label("sqrlast");
535 &mov ($word,$_n0);
536 &mov ($inp,$_np);
537 &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
538
539 &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
540 &mov ("eax",&DWP(0,$inp)); # np[0]
541 &adc ($carry,0);
542 &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
543 &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
544
545 &mul ($word); # np[0]*m
546 &add ("eax",&DWP($frame,"esp")); # +=tp[0]
547 &lea ($num,&DWP(-1,$j));
548 &adc ("edx",0);
549 &mov ($j,1);
550 &mov ("eax",&DWP(4,$inp)); # np[1]
551
552 &jmp (&label("3rdmadd"));
553}
554
555&set_label("common_tail",16);
556 &mov ($np,$_np); # load modulus pointer
557 &mov ($rp,$_rp); # load result pointer
558 &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
559
560 &mov ("eax",&DWP(0,$tp)); # tp[0]
561 &mov ($j,$num); # j=num-1
562 &xor ($i,$i); # i=0 and clear CF!
563
564&set_label("sub",16);
565 &sbb ("eax",&DWP(0,$np,$i,4));
566 &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
567 &dec ($j); # doesn't affect CF!
568 &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
569 &lea ($i,&DWP(1,$i)); # i++
570 &jge (&label("sub"));
571
572 &sbb ("eax",0); # handle upmost overflow bit
573 &and ($tp,"eax");
574 &not ("eax");
575 &mov ($np,$rp);
576 &and ($np,"eax");
577 &or ($tp,$np); # tp=carry?tp:rp
578
579&set_label("copy",16); # copy or in-place refresh
580 &mov ("eax",&DWP(0,$tp,$num,4));
581 &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
582 &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
583 &dec ($num);
584 &jge (&label("copy"));
585
586 &mov ("esp",$_sp); # pull saved stack pointer
587 &mov ("eax",1);
588&set_label("just_leave");
589&function_end("bn_mul_mont");
590
591&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
592
593&asm_finish();
diff --git a/src/lib/libcrypto/bn/asm/x86.pl b/src/lib/libcrypto/bn/asm/x86.pl
deleted file mode 100644
index 1bc4f1bb27..0000000000
--- a/src/lib/libcrypto/bn/asm/x86.pl
+++ /dev/null
@@ -1,28 +0,0 @@
1#!/usr/local/bin/perl
2
3push(@INC,"perlasm","../../perlasm");
4require "x86asm.pl";
5
6require("x86/mul_add.pl");
7require("x86/mul.pl");
8require("x86/sqr.pl");
9require("x86/div.pl");
10require("x86/add.pl");
11require("x86/sub.pl");
12require("x86/comba.pl");
13
14&asm_init($ARGV[0],$0);
15
16&bn_mul_add_words("bn_mul_add_words");
17&bn_mul_words("bn_mul_words");
18&bn_sqr_words("bn_sqr_words");
19&bn_div_words("bn_div_words");
20&bn_add_words("bn_add_words");
21&bn_sub_words("bn_sub_words");
22&bn_mul_comba("bn_mul_comba8",8);
23&bn_mul_comba("bn_mul_comba4",4);
24&bn_sqr_comba("bn_sqr_comba8",8);
25&bn_sqr_comba("bn_sqr_comba4",4);
26
27&asm_finish();
28
diff --git a/src/lib/libcrypto/bn/asm/x86/add.pl b/src/lib/libcrypto/bn/asm/x86/add.pl
deleted file mode 100644
index 3bb0080922..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/add.pl
+++ /dev/null
@@ -1,76 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assembler
3
4sub bn_add_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9
10 &comment("");
11 $a="esi";
12 $b="edi";
13 $c="eax";
14 $r="ebx";
15 $tmp1="ecx";
16 $tmp2="edx";
17 $num="ebp";
18
19 &mov($r,&wparam(0)); # get r
20 &mov($a,&wparam(1)); # get a
21 &mov($b,&wparam(2)); # get b
22 &mov($num,&wparam(3)); # get num
23 &xor($c,$c); # clear carry
24 	&and($num,0xfffffff8);	# round num down to a multiple of 8
25
26 &jz(&label("aw_finish"));
27
28 &set_label("aw_loop",0);
29 for ($i=0; $i<8; $i++)
30 {
31 &comment("Round $i");
32
33 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
34 &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
35 &add($tmp1,$c);
36 &mov($c,0);
37 &adc($c,$c);
38 &add($tmp1,$tmp2);
39 &adc($c,0);
40 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
41 }
42
43 &comment("");
44 &add($a,32);
45 &add($b,32);
46 &add($r,32);
47 &sub($num,8);
48 &jnz(&label("aw_loop"));
49
50 &set_label("aw_finish",0);
51 &mov($num,&wparam(3)); # get num
52 &and($num,7);
53 &jz(&label("aw_end"));
54
55 for ($i=0; $i<7; $i++)
56 {
57 &comment("Tail Round $i");
58 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
59 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
60 &add($tmp1,$c);
61 &mov($c,0);
62 &adc($c,$c);
63 &add($tmp1,$tmp2);
64 &adc($c,0);
65 &dec($num) if ($i != 6);
66 	&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
67 &jz(&label("aw_end")) if ($i != 6);
68 }
69 &set_label("aw_end",0);
70
71# &mov("eax",$c); # $c is "eax"
72
73 &function_end($name);
74 }
75
761;
diff --git a/src/lib/libcrypto/bn/asm/x86/comba.pl b/src/lib/libcrypto/bn/asm/x86/comba.pl
deleted file mode 100644
index dc4ec97ff5..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/comba.pl
+++ /dev/null
@@ -1,277 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assembler
3
4sub mul_add_c
5 {
6 local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
7
8	# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
9	# words, and 1 to load the return value
10
11 &comment("mul a[$ai]*b[$bi]");
12
13 # "eax" and "edx" will always be pre-loaded.
14 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
15 # &mov("edx",&DWP($bi*4,$b,"",0));
16
17 &mul("edx");
18 &add($c0,"eax");
19	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
20 &mov("eax",&wparam(0)) if $pos > 0; # load r[]
21 ###
22 &adc($c1,"edx");
23	&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0;	# load next b
24	&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1;	# load next b
25 ###
26 &adc($c2,0);
27	# if pos > 1, this is the last iteration
28	&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0;	# save r[];
29	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;	# load next a
30 }
31
32sub sqr_add_c
33 {
34 local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
35
36	# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
37	# words, and 1 to load the return value
38
39 &comment("sqr a[$ai]*a[$bi]");
40
41 # "eax" and "edx" will always be pre-loaded.
42 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
43 # &mov("edx",&DWP($bi*4,$b,"",0));
44
45 if ($ai == $bi)
46 { &mul("eax");}
47 else
48 { &mul("edx");}
49 &add($c0,"eax");
50 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
51 ###
52 &adc($c1,"edx");
53 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
54 ###
55 &adc($c2,0);
56	# if pos > 1, this is the last iteration
57 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
58 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
59 }
60
61sub sqr_add_c2
62 {
63 local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
64
65	# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
66	# words, and 1 to load the return value
67
68 &comment("sqr a[$ai]*a[$bi]");
69
70 # "eax" and "edx" will always be pre-loaded.
71 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
72 # &mov("edx",&DWP($bi*4,$a,"",0));
73
74 if ($ai == $bi)
75 { &mul("eax");}
76 else
77 { &mul("edx");}
78 &add("eax","eax");
79 ###
80 &adc("edx","edx");
81 ###
82 &adc($c2,0);
83 &add($c0,"eax");
84 &adc($c1,"edx");
85 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
86 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
87 &adc($c2,0);
88 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
89 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
90 ###
91 }
92
93sub bn_mul_comba
94 {
95 local($name,$num)=@_;
96 local($a,$b,$c0,$c1,$c2);
97 local($i,$as,$ae,$bs,$be,$ai,$bi);
98 local($tot,$end);
99
100 &function_begin_B($name,"");
101
102 $c0="ebx";
103 $c1="ecx";
104 $c2="ebp";
105 $a="esi";
106 $b="edi";
107
108 $as=0;
109 $ae=0;
110 $bs=0;
111 $be=0;
112 $tot=$num+$num-1;
113
114 &push("esi");
115 &mov($a,&wparam(1));
116 &push("edi");
117 &mov($b,&wparam(2));
118 &push("ebp");
119 &push("ebx");
120
121 &xor($c0,$c0);
122 &mov("eax",&DWP(0,$a,"",0)); # load the first word
123 &xor($c1,$c1);
124 	&mov("edx",&DWP(0,$b,"",0));		# load the first word of b
125
126 for ($i=0; $i<$tot; $i++)
127 {
128 $ai=$as;
129 $bi=$bs;
130 $end=$be+1;
131
132 &comment("################## Calculate word $i");
133
134 for ($j=$bs; $j<$end; $j++)
135 {
136 &xor($c2,$c2) if ($j == $bs);
137 if (($j+1) == $end)
138 {
139 $v=1;
140 $v=2 if (($i+1) == $tot);
141 }
142 else
143 { $v=0; }
144 if (($j+1) != $end)
145 {
146 $na=($ai-1);
147 $nb=($bi+1);
148 }
149 else
150 {
151 $na=$as+($i < ($num-1));
152 $nb=$bs+($i >= ($num-1));
153 }
154#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
155 &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
156 if ($v)
157 {
158 &comment("saved r[$i]");
159 # &mov("eax",&wparam(0));
160 # &mov(&DWP($i*4,"eax","",0),$c0);
161 ($c0,$c1,$c2)=($c1,$c2,$c0);
162 }
163 $ai--;
164 $bi++;
165 }
166 $as++ if ($i < ($num-1));
167 $ae++ if ($i >= ($num-1));
168
169 $bs++ if ($i >= ($num-1));
170 $be++ if ($i < ($num-1));
171 }
172 &comment("save r[$i]");
173 # &mov("eax",&wparam(0));
174 &mov(&DWP($i*4,"eax","",0),$c0);
175
176 &pop("ebx");
177 &pop("ebp");
178 &pop("edi");
179 &pop("esi");
180 &ret();
181 &function_end_B($name);
182 }
183
184sub bn_sqr_comba
185 {
186 local($name,$num)=@_;
187 local($r,$a,$c0,$c1,$c2)=@_;
188 local($i,$as,$ae,$bs,$be,$ai,$bi);
189 local($b,$tot,$end,$half);
190
191 &function_begin_B($name,"");
192
193 $c0="ebx";
194 $c1="ecx";
195 $c2="ebp";
196 $a="esi";
197 $r="edi";
198
199 &push("esi");
200 &push("edi");
201 &push("ebp");
202 &push("ebx");
203 &mov($r,&wparam(0));
204 &mov($a,&wparam(1));
205 &xor($c0,$c0);
206 &xor($c1,$c1);
207 &mov("eax",&DWP(0,$a,"",0)); # load the first word
208
209 $as=0;
210 $ae=0;
211 $bs=0;
212 $be=0;
213 $tot=$num+$num-1;
214
215 for ($i=0; $i<$tot; $i++)
216 {
217 $ai=$as;
218 $bi=$bs;
219 $end=$be+1;
220
221 &comment("############### Calculate word $i");
222 for ($j=$bs; $j<$end; $j++)
223 {
224 &xor($c2,$c2) if ($j == $bs);
225 if (($ai-1) < ($bi+1))
226 {
227 $v=1;
228 $v=2 if ($i+1) == $tot;
229 }
230 else
231 { $v=0; }
232 if (!$v)
233 {
234 $na=$ai-1;
235 $nb=$bi+1;
236 }
237 else
238 {
239 $na=$as+($i < ($num-1));
240 $nb=$bs+($i >= ($num-1));
241 }
242 if ($ai == $bi)
243 {
244 &sqr_add_c($r,$a,$ai,$bi,
245 $c0,$c1,$c2,$v,$i,$na,$nb);
246 }
247 else
248 {
249 &sqr_add_c2($r,$a,$ai,$bi,
250 $c0,$c1,$c2,$v,$i,$na,$nb);
251 }
252 if ($v)
253 {
254 &comment("saved r[$i]");
255 #&mov(&DWP($i*4,$r,"",0),$c0);
256 ($c0,$c1,$c2)=($c1,$c2,$c0);
257 last;
258 }
259 $ai--;
260 $bi++;
261 }
262 $as++ if ($i < ($num-1));
263 $ae++ if ($i >= ($num-1));
264
265 $bs++ if ($i >= ($num-1));
266 $be++ if ($i < ($num-1));
267 }
268 &mov(&DWP($i*4,$r,"",0),$c0);
269 &pop("ebx");
270 &pop("ebp");
271 &pop("edi");
272 &pop("esi");
273 &ret();
274 &function_end_B($name);
275 }
276
2771;
diff --git a/src/lib/libcrypto/bn/asm/x86/div.pl b/src/lib/libcrypto/bn/asm/x86/div.pl
deleted file mode 100644
index e771eda82f..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/div.pl
+++ /dev/null
@@ -1,15 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assembler
3
4sub bn_div_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9 	&mov("edx",&wparam(0));	# h, high word of the dividend
10 	&mov("eax",&wparam(1));	# l, low word of the dividend
11 	&mov("ebx",&wparam(2));	# d, the divisor
12 &div("ebx");
13 &function_end($name);
14 }
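The routine is a thin wrapper around the div instruction; in C it amounts to the following sketch (32-bit words; like div itself this assumes h < d, so the quotient fits in one word):

	#include <stdint.h>

	/* quotient of the double word h:l divided by d */
	static uint32_t
	bn_div_words_ref(uint32_t h, uint32_t l, uint32_t d)
	{
		return (uint32_t)((((uint64_t)h << 32) | l) / d);
	}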
151;
diff --git a/src/lib/libcrypto/bn/asm/x86/mul.pl b/src/lib/libcrypto/bn/asm/x86/mul.pl
deleted file mode 100644
index 92b5542dac..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/mul.pl
+++ /dev/null
@@ -1,77 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assembler
3
4sub bn_mul_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9
10 &comment("");
11 $Low="eax";
12 $High="edx";
13 $a="ebx";
14 $w="ecx";
15 $r="edi";
16 $c="esi";
17 $num="ebp";
18
19 &xor($c,$c); # clear carry
20 	&mov($r,&wparam(0));	# get r
21 	&mov($a,&wparam(1));	# get a
22 	&mov($num,&wparam(2));	# get num
23 	&mov($w,&wparam(3));	# get w
24
25 	&and($num,0xfffffff8);	# round num down to a multiple of 8
26 &jz(&label("mw_finish"));
27
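The unrolled loop and the tail below both compute the following, shown as a portable C sketch (32-bit words; the _ref suffix marks it as illustrative):

	#include <stdint.h>

	/* rp[i] = low word of ap[i]*w + c; returns the final carry word */
	static uint32_t
	bn_mul_words_ref(uint32_t *rp, const uint32_t *ap, int num, uint32_t w)
	{
		uint64_t c = 0;
		int i;

		for (i = 0; i < num; i++) {
			c += (uint64_t)ap[i] * w;	/* L(t):H(t) at once */
			rp[i] = (uint32_t)c;		/* *r = L(t) */
			c >>= 32;			/* c  = H(t) */
		}
		return (uint32_t)c;
	}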
28 &set_label("mw_loop",0);
29 for ($i=0; $i<32; $i+=4)
30 {
31 &comment("Round $i");
32
33 &mov("eax",&DWP($i,$a,"",0)); # *a
34 &mul($w); # *a * w
35 &add("eax",$c); # L(t)+=c
36 # XXX
37
38 &adc("edx",0); # H(t)+=carry
39 &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
40
41 &mov($c,"edx"); # c= H(t);
42 }
43
44 &comment("");
45 &add($a,32);
46 &add($r,32);
47 &sub($num,8);
48 &jz(&label("mw_finish"));
49 &jmp(&label("mw_loop"));
50
51 &set_label("mw_finish",0);
52 &mov($num,&wparam(2)); # get num
53 &and($num,7);
54 &jnz(&label("mw_finish2"));
55 &jmp(&label("mw_end"));
56
57 &set_label("mw_finish2",1);
58 for ($i=0; $i<7; $i++)
59 {
60 &comment("Tail Round $i");
61 &mov("eax",&DWP($i*4,$a,"",0));# *a
62 &mul($w); # *a * w
63 &add("eax",$c); # L(t)+=c
64 # XXX
65 &adc("edx",0); # H(t)+=carry
66 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
67 &mov($c,"edx"); # c= H(t);
68 &dec($num) if ($i != 7-1);
69 &jz(&label("mw_end")) if ($i != 7-1);
70 }
71 &set_label("mw_end",0);
72 &mov("eax",$c);
73
74 &function_end($name);
75 }
76
771;
diff --git a/src/lib/libcrypto/bn/asm/x86/mul_add.pl b/src/lib/libcrypto/bn/asm/x86/mul_add.pl
deleted file mode 100644
index 9803dbdad0..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/mul_add.pl
+++ /dev/null
@@ -1,87 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assembler
3
4sub bn_mul_add_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9
10 &comment("");
11 $Low="eax";
12 $High="edx";
13 $a="ebx";
14 $w="ebp";
15 $r="edi";
16 $c="esi";
17
18 &xor($c,$c); # clear carry
19 	&mov($r,&wparam(0));	# get r
20
21 	&mov("ecx",&wparam(2));	# get num
22 	&mov($a,&wparam(1));	# get a
23
24 	&and("ecx",0xfffffff8);	# round num down to a multiple of 8
25 	&mov($w,&wparam(3));	# get w
26
27 	&push("ecx");		# reserve a stack slot for a tmp variable
28
29 &jz(&label("maw_finish"));
30
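Each round below folds ap[i]*w, the existing rp[i], and the running carry into one accumulation; as a portable C sketch (32-bit words, illustrative name; the sum ap[i]*w + rp[i] + c cannot overflow 64 bits, so one accumulator suffices):

	#include <stdint.h>

	/* rp[i] = low word of ap[i]*w + rp[i] + c; returns the final carry */
	static uint32_t
	bn_mul_add_words_ref(uint32_t *rp, const uint32_t *ap, int num,
	    uint32_t w)
	{
		uint64_t c = 0;
		int i;

		for (i = 0; i < num; i++) {
			c += (uint64_t)ap[i] * w + rp[i];
			rp[i] = (uint32_t)c;
			c >>= 32;
		}
		return (uint32_t)c;
	}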
31 &set_label("maw_loop",0);
32
33 &mov(&swtmp(0),"ecx"); #
34
35 for ($i=0; $i<32; $i+=4)
36 {
37 &comment("Round $i");
38
39 &mov("eax",&DWP($i,$a,"",0)); # *a
40 &mul($w); # *a * w
41 &add("eax",$c); # L(t)+= *r
42 &mov($c,&DWP($i,$r,"",0)); # L(t)+= *r
43 &adc("edx",0); # H(t)+=carry
44 &add("eax",$c); # L(t)+=c
45 &adc("edx",0); # H(t)+=carry
46 &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
47 &mov($c,"edx"); # c= H(t);
48 }
49
50 &comment("");
51 &mov("ecx",&swtmp(0)); #
52 &add($a,32);
53 &add($r,32);
54 &sub("ecx",8);
55 &jnz(&label("maw_loop"));
56
57 &set_label("maw_finish",0);
58 &mov("ecx",&wparam(2)); # get num
59 &and("ecx",7);
60 &jnz(&label("maw_finish2")); # helps branch prediction
61 &jmp(&label("maw_end"));
62
63 &set_label("maw_finish2",1);
64 for ($i=0; $i<7; $i++)
65 {
66 &comment("Tail Round $i");
67 &mov("eax",&DWP($i*4,$a,"",0));# *a
68 &mul($w); # *a * w
69 &add("eax",$c); # L(t)+=c
70 &mov($c,&DWP($i*4,$r,"",0)); # L(t)+= *r
71 &adc("edx",0); # H(t)+=carry
72 &add("eax",$c);
73 &adc("edx",0); # H(t)+=carry
74 &dec("ecx") if ($i != 7-1);
75 &mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t);
76 &mov($c,"edx"); # c= H(t);
77 &jz(&label("maw_end")) if ($i != 7-1);
78 }
79 &set_label("maw_end",0);
80 &mov("eax",$c);
81
82 	&pop("ecx");			# release the tmp variable slot
83
84 &function_end($name);
85 }
86
871;
diff --git a/src/lib/libcrypto/bn/asm/x86/sqr.pl b/src/lib/libcrypto/bn/asm/x86/sqr.pl
deleted file mode 100644
index 6cf75a76e2..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/sqr.pl
+++ /dev/null
@@ -1,60 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assembler
3
4sub bn_sqr_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9
10 &comment("");
11 $r="esi";
12 $a="edi";
13 $num="ebx";
14
15 	&mov($r,&wparam(0));	# get r
16 	&mov($a,&wparam(1));	# get a
17 	&mov($num,&wparam(2));	# get num
18
19 	&and($num,0xfffffff8);	# round num down to a multiple of 8
20 &jz(&label("sw_finish"));
21
22 &set_label("sw_loop",0);
23 for ($i=0; $i<32; $i+=4)
24 {
25 &comment("Round $i");
26 &mov("eax",&DWP($i,$a,"",0)); # *a
27 # XXX
28 &mul("eax"); # *a * *a
29 &mov(&DWP($i*2,$r,"",0),"eax"); #
30 &mov(&DWP($i*2+4,$r,"",0),"edx");#
31 }
32
33 &comment("");
34 &add($a,32);
35 &add($r,64);
36 &sub($num,8);
37 &jnz(&label("sw_loop"));
38
39 &set_label("sw_finish",0);
40 &mov($num,&wparam(2)); # get num
41 &and($num,7);
42 &jz(&label("sw_end"));
43
44 for ($i=0; $i<7; $i++)
45 {
46 &comment("Tail Round $i");
47 &mov("eax",&DWP($i*4,$a,"",0)); # *a
48 # XXX
49 &mul("eax"); # *a * *a
50 &mov(&DWP($i*8,$r,"",0),"eax"); #
51 &dec($num) if ($i != 7-1);
52 &mov(&DWP($i*8+4,$r,"",0),"edx");
53 &jz(&label("sw_end")) if ($i != 7-1);
54 }
55 &set_label("sw_end",0);
56
57 &function_end($name);
58 }
59
601;
diff --git a/src/lib/libcrypto/bn/asm/x86/sub.pl b/src/lib/libcrypto/bn/asm/x86/sub.pl
deleted file mode 100644
index 0c5364cce5..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/sub.pl
+++ /dev/null
@@ -1,76 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assembler
3
4sub bn_sub_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9
10 &comment("");
11 $a="esi";
12 $b="edi";
13 $c="eax";
14 $r="ebx";
15 $tmp1="ecx";
16 $tmp2="edx";
17 $num="ebp";
18
19 &mov($r,&wparam(0)); # get r
20 &mov($a,&wparam(1)); # get a
21 &mov($b,&wparam(2)); # get b
22 &mov($num,&wparam(3)); # get num
23 &xor($c,$c); # clear carry
24 	&and($num,0xfffffff8);	# round num down to a multiple of 8
25
26 &jz(&label("aw_finish"));
27
28 &set_label("aw_loop",0);
29 for ($i=0; $i<8; $i++)
30 {
31 &comment("Round $i");
32
33 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
34 &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
35 &sub($tmp1,$c);
36 &mov($c,0);
37 &adc($c,$c);
38 &sub($tmp1,$tmp2);
39 &adc($c,0);
40 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
41 }
42
43 &comment("");
44 &add($a,32);
45 &add($b,32);
46 &add($r,32);
47 &sub($num,8);
48 &jnz(&label("aw_loop"));
49
50 &set_label("aw_finish",0);
51 &mov($num,&wparam(3)); # get num
52 &and($num,7);
53 &jz(&label("aw_end"));
54
55 for ($i=0; $i<7; $i++)
56 {
57 &comment("Tail Round $i");
58 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
59 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
60 &sub($tmp1,$c);
61 &mov($c,0);
62 &adc($c,$c);
63 &sub($tmp1,$tmp2);
64 &adc($c,0);
65 &dec($num) if ($i != 6);
66 	&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
67 &jz(&label("aw_end")) if ($i != 6);
68 }
69 &set_label("aw_end",0);
70
71# &mov("eax",$c); # $c is "eax"
72
73 &function_end($name);
74 }
75
761;
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gcc.c b/src/lib/libcrypto/bn/asm/x86_64-gcc.c
deleted file mode 100644
index 9deffa71f1..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-gcc.c
+++ /dev/null
@@ -1,598 +0,0 @@
1/* $OpenBSD: x86_64-gcc.c,v 1.5 2015/02/25 15:39:49 bcook Exp $ */
2#include "../bn_lcl.h"
3#if !(defined(__GNUC__) && __GNUC__>=2)
4# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
5#else
6/*
7 * x86_64 BIGNUM accelerator version 0.1, December 2002.
8 *
9 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
10 * project.
11 *
12 * Rights for redistribution and usage in source and binary forms are
13 * granted according to the OpenSSL license. Warranty of any kind is
14 * disclaimed.
15 *
16 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
17 * versions, like 1.0...
18 * A. Well, that's because this code is basically a quick-n-dirty
19 * proof-of-concept hack. As you can see it's implemented with
20 * inline assembler, which means that you're bound to GCC and that
21 * there might be enough room for further improvement.
22 *
23 * Q. Why inline assembler?
24 * A. x86_64 features its own ABI which I'm not familiar with. This is
25 *    why I decided to let the compiler take care of subroutine
26 *    prologue/epilogue as well as register allocation. For reference,
27 *    Win64 implements a different ABI for AMD64 than Linux does.
28 *
29 * Q. How much faster does it get?
30 * A. 'apps/openssl speed rsa dsa' output with no-asm:
31 *
32 * sign verify sign/s verify/s
33 * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
34 * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
35 * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
36 * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
37 * sign verify sign/s verify/s
38 * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
39 * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
40 * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
41 *
42 * 'apps/openssl speed rsa dsa' output with this module:
43 *
44 * sign verify sign/s verify/s
45 * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
46 * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
47 * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
48 * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
49 * sign verify sign/s verify/s
50 * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
51 * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
52 * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
53 *
54 * For reference, the IA-32 assembler implementation performs
55 * very much like 64-bit code compiled with no-asm on the same
56 * machine.
57 */
58
59#define BN_ULONG unsigned long
60
61#undef mul
62#undef mul_add
63#undef sqr
64
65/*
66 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
67 * "g"(0) lets the compiler decide where it
68 * wants to keep the value of zero;
69 */
70#define mul_add(r,a,word,carry) do { \
71 BN_ULONG high,low; \
72 asm ("mulq %3" \
73 : "=a"(low),"=d"(high) \
74 : "a"(word),"m"(a) \
75 : "cc"); \
76 asm ("addq %2,%0; adcq %3,%1" \
77 : "+r"(carry),"+d"(high)\
78 : "a"(low),"g"(0) \
79 : "cc"); \
80 asm ("addq %2,%0; adcq %3,%1" \
81 : "+m"(r),"+d"(high) \
82 : "r"(carry),"g"(0) \
83 : "cc"); \
84 carry=high; \
85 } while (0)
86
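For readers without mulq at hand, the macro above is equivalent to the following portable formulation. This is a sketch assuming GCC's unsigned __int128 (which the original avoids relying on, hence the inline assembler) and the BN_ULONG defined earlier in this file:

	#define mul_add_portable(r,a,word,carry) do {			\
		unsigned __int128 t = (unsigned __int128)(a)*(word)	\
				    + (r) + (carry);			\
		(r) = (BN_ULONG)t;		/* low 64 bits */	\
		(carry) = (BN_ULONG)(t>>64);	/* high 64 bits */	\
		} while (0)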
87#define mul(r,a,word,carry) do { \
88 BN_ULONG high,low; \
89 asm ("mulq %3" \
90 : "=a"(low),"=d"(high) \
91 : "a"(word),"g"(a) \
92 : "cc"); \
93 asm ("addq %2,%0; adcq %3,%1" \
94 : "+r"(carry),"+d"(high)\
95 : "a"(low),"g"(0) \
96 : "cc"); \
97 (r)=carry, carry=high; \
98 } while (0)
99
100#define sqr(r0,r1,a) \
101 asm ("mulq %2" \
102 : "=a"(r0),"=d"(r1) \
103 : "a"(a) \
104 : "cc");
105
106BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
107 {
108 BN_ULONG c1=0;
109
110 if (num <= 0) return(c1);
111
112 while (num&~3)
113 {
114 mul_add(rp[0],ap[0],w,c1);
115 mul_add(rp[1],ap[1],w,c1);
116 mul_add(rp[2],ap[2],w,c1);
117 mul_add(rp[3],ap[3],w,c1);
118 ap+=4; rp+=4; num-=4;
119 }
120 if (num)
121 {
122 mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
123 mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
124 mul_add(rp[2],ap[2],w,c1); return c1;
125 }
126
127 return(c1);
128 }
129
130BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
131 {
132 BN_ULONG c1=0;
133
134 if (num <= 0) return(c1);
135
136 while (num&~3)
137 {
138 mul(rp[0],ap[0],w,c1);
139 mul(rp[1],ap[1],w,c1);
140 mul(rp[2],ap[2],w,c1);
141 mul(rp[3],ap[3],w,c1);
142 ap+=4; rp+=4; num-=4;
143 }
144 if (num)
145 {
146 mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
147 mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
148 mul(rp[2],ap[2],w,c1);
149 }
150 return(c1);
151 }
152
153void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
154 {
155 if (n <= 0) return;
156
157 while (n&~3)
158 {
159 sqr(r[0],r[1],a[0]);
160 sqr(r[2],r[3],a[1]);
161 sqr(r[4],r[5],a[2]);
162 sqr(r[6],r[7],a[3]);
163 a+=4; r+=8; n-=4;
164 }
165 if (n)
166 {
167 sqr(r[0],r[1],a[0]); if (--n == 0) return;
168 sqr(r[2],r[3],a[1]); if (--n == 0) return;
169 sqr(r[4],r[5],a[2]);
170 }
171 }
172
173BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
174{ BN_ULONG ret,waste;
175
176 asm ("divq %4"
177 : "=a"(ret),"=d"(waste)
178 : "a"(l),"d"(h),"g"(d)
179 : "cc");
180
181 return ret;
182}
183
184BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
185{ BN_ULONG ret=0,i=0;
186
187 if (n <= 0) return 0;
188
189 asm (
190 " subq %2,%2 \n"
191 ".p2align 4 \n"
192 "1: movq (%4,%2,8),%0 \n"
193 " adcq (%5,%2,8),%0 \n"
194 " movq %0,(%3,%2,8) \n"
195 " leaq 1(%2),%2 \n"
196 " loop 1b \n"
197 " sbbq %0,%0 \n"
198 : "=&a"(ret),"+c"(n),"=&r"(i)
199 : "r"(rp),"r"(ap),"r"(bp)
200 : "cc"
201 );
202
203 return ret&1;
204}
205
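The asm above is the classic adc loop, with the loop instruction chosen because it does not disturb CF between iterations. A portable C rendering for reference (a sketch with an illustrative name, using the BN_ULONG defined earlier in this file):

	BN_ULONG
	bn_add_words_ref(BN_ULONG *rp, const BN_ULONG *ap,
	    const BN_ULONG *bp, int n)
	{
		BN_ULONG c = 0;
		int i;

		for (i = 0; i < n; i++) {
			BN_ULONG t = ap[i] + c;
			c = (t < c);		/* carry out of ap[i]+c */
			t += bp[i];
			c += (t < bp[i]);	/* carry out of t+bp[i] */
			rp[i] = t;
		}
		return c;			/* 0 or 1, like ret&1 */
	}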
206#ifndef SIMICS
207BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
208{ BN_ULONG ret=0,i=0;
209
210 if (n <= 0) return 0;
211
212 asm (
213 " subq %2,%2 \n"
214 ".p2align 4 \n"
215 "1: movq (%4,%2,8),%0 \n"
216 " sbbq (%5,%2,8),%0 \n"
217 " movq %0,(%3,%2,8) \n"
218 " leaq 1(%2),%2 \n"
219 " loop 1b \n"
220 " sbbq %0,%0 \n"
221 : "=&a"(ret),"+c"(n),"=&r"(i)
222 : "r"(rp),"r"(ap),"r"(bp)
223 : "cc"
224 );
225
226 return ret&1;
227}
228#else
229/* Simics 1.4<7 has buggy sbbq:-( */
230#define BN_MASK2 0xffffffffffffffffL
231BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
232 {
233 BN_ULONG t1,t2;
234 int c=0;
235
236 if (n <= 0) return((BN_ULONG)0);
237
238 for (;;)
239 {
240 t1=a[0]; t2=b[0];
241 r[0]=(t1-t2-c)&BN_MASK2;
242 if (t1 != t2) c=(t1 < t2);
243 if (--n <= 0) break;
244
245 t1=a[1]; t2=b[1];
246 r[1]=(t1-t2-c)&BN_MASK2;
247 if (t1 != t2) c=(t1 < t2);
248 if (--n <= 0) break;
249
250 t1=a[2]; t2=b[2];
251 r[2]=(t1-t2-c)&BN_MASK2;
252 if (t1 != t2) c=(t1 < t2);
253 if (--n <= 0) break;
254
255 t1=a[3]; t2=b[3];
256 r[3]=(t1-t2-c)&BN_MASK2;
257 if (t1 != t2) c=(t1 < t2);
258 if (--n <= 0) break;
259
260 a+=4;
261 b+=4;
262 r+=4;
263 }
264 return(c);
265 }
266#endif
267
268/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
269/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
270/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
271/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
272
273/*
274 * Keep in mind that carrying into the high part of a multiplication
275 * result cannot overflow: the high word is at most 2^64-2, never all-ones.
276 */
277#if 0
278/* original macros are kept for reference purposes */
279#define mul_add_c(a,b,c0,c1,c2) do { \
280 BN_ULONG ta = (a), tb = (b); \
281 BN_ULONG lo, hi; \
282 BN_UMULT_LOHI(lo,hi,ta,tb); \
283 c0 += lo; hi += (c0<lo)?1:0; \
284 c1 += hi; c2 += (c1<hi)?1:0; \
285 } while(0)
286
287#define mul_add_c2(a,b,c0,c1,c2) do { \
288 BN_ULONG ta = (a), tb = (b); \
289 BN_ULONG lo, hi, tt; \
290 BN_UMULT_LOHI(lo,hi,ta,tb); \
291 c0 += lo; tt = hi+((c0<lo)?1:0); \
292 c1 += tt; c2 += (c1<tt)?1:0; \
293 c0 += lo; hi += (c0<lo)?1:0; \
294 c1 += hi; c2 += (c1<hi)?1:0; \
295 } while(0)
296
297#define sqr_add_c(a,i,c0,c1,c2) do { \
298 BN_ULONG ta = (a)[i]; \
299 BN_ULONG lo, hi; \
300 BN_UMULT_LOHI(lo,hi,ta,ta); \
301 c0 += lo; hi += (c0<lo)?1:0; \
302 c1 += hi; c2 += (c1<hi)?1:0; \
303 } while(0)
304#else
305#define mul_add_c(a,b,c0,c1,c2) do { \
306 BN_ULONG t1,t2; \
307 asm ("mulq %3" \
308 : "=a"(t1),"=d"(t2) \
309 : "a"(a),"m"(b) \
310 : "cc"); \
311 asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
312 : "+r"(c0),"+r"(c1),"+r"(c2) \
313 : "r"(t1),"r"(t2),"g"(0) \
314 : "cc"); \
315 } while (0)
316
317#define sqr_add_c(a,i,c0,c1,c2) do { \
318 BN_ULONG t1,t2; \
319 asm ("mulq %2" \
320 : "=a"(t1),"=d"(t2) \
321 : "a"(a[i]) \
322 : "cc"); \
323 asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
324 : "+r"(c0),"+r"(c1),"+r"(c2) \
325 : "r"(t1),"r"(t2),"g"(0) \
326 : "cc"); \
327 } while (0)
328
329#define mul_add_c2(a,b,c0,c1,c2) do { \
330 BN_ULONG t1,t2; \
331 asm ("mulq %3" \
332 : "=a"(t1),"=d"(t2) \
333 : "a"(a),"m"(b) \
334 : "cc"); \
335 asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
336 : "+r"(c0),"+r"(c1),"+r"(c2) \
337 : "r"(t1),"r"(t2),"g"(0) \
338 : "cc"); \
339 asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
340 : "+r"(c0),"+r"(c1),"+r"(c2) \
341 : "r"(t1),"r"(t2),"g"(0) \
342 : "cc"); \
343 } while (0)
344#endif
345
346#define sqr_add_c2(a,i,j,c0,c1,c2) \
347 mul_add_c2((a)[i],(a)[j],c0,c1,c2)
348
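/*
 * The comba routines below compute the product one output column at a
 * time: r[k] accumulates every a[i]*b[j] with i+j == k in a three-word
 * accumulator whose registers rotate between columns.  A rough C
 * equivalent of the unrolled code, for illustration only:
 *
 *	BN_ULONG c0 = 0, c1 = 0, c2 = 0;
 *	for (k = 0; k < 2*n - 1; k++) {
 *		for (i = 0; i <= k; i++)
 *			if (i < n && k - i < n)
 *				mul_add_c(a[i], b[k - i], c0, c1, c2);
 *		r[k] = c0; c0 = c1; c1 = c2; c2 = 0;
 *	}
 *	r[2*n - 1] = c0;
 */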
349void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
350 {
351 BN_ULONG c1,c2,c3;
352
353 c1=0;
354 c2=0;
355 c3=0;
356 mul_add_c(a[0],b[0],c1,c2,c3);
357 r[0]=c1;
358 c1=0;
359 mul_add_c(a[0],b[1],c2,c3,c1);
360 mul_add_c(a[1],b[0],c2,c3,c1);
361 r[1]=c2;
362 c2=0;
363 mul_add_c(a[2],b[0],c3,c1,c2);
364 mul_add_c(a[1],b[1],c3,c1,c2);
365 mul_add_c(a[0],b[2],c3,c1,c2);
366 r[2]=c3;
367 c3=0;
368 mul_add_c(a[0],b[3],c1,c2,c3);
369 mul_add_c(a[1],b[2],c1,c2,c3);
370 mul_add_c(a[2],b[1],c1,c2,c3);
371 mul_add_c(a[3],b[0],c1,c2,c3);
372 r[3]=c1;
373 c1=0;
374 mul_add_c(a[4],b[0],c2,c3,c1);
375 mul_add_c(a[3],b[1],c2,c3,c1);
376 mul_add_c(a[2],b[2],c2,c3,c1);
377 mul_add_c(a[1],b[3],c2,c3,c1);
378 mul_add_c(a[0],b[4],c2,c3,c1);
379 r[4]=c2;
380 c2=0;
381 mul_add_c(a[0],b[5],c3,c1,c2);
382 mul_add_c(a[1],b[4],c3,c1,c2);
383 mul_add_c(a[2],b[3],c3,c1,c2);
384 mul_add_c(a[3],b[2],c3,c1,c2);
385 mul_add_c(a[4],b[1],c3,c1,c2);
386 mul_add_c(a[5],b[0],c3,c1,c2);
387 r[5]=c3;
388 c3=0;
389 mul_add_c(a[6],b[0],c1,c2,c3);
390 mul_add_c(a[5],b[1],c1,c2,c3);
391 mul_add_c(a[4],b[2],c1,c2,c3);
392 mul_add_c(a[3],b[3],c1,c2,c3);
393 mul_add_c(a[2],b[4],c1,c2,c3);
394 mul_add_c(a[1],b[5],c1,c2,c3);
395 mul_add_c(a[0],b[6],c1,c2,c3);
396 r[6]=c1;
397 c1=0;
398 mul_add_c(a[0],b[7],c2,c3,c1);
399 mul_add_c(a[1],b[6],c2,c3,c1);
400 mul_add_c(a[2],b[5],c2,c3,c1);
401 mul_add_c(a[3],b[4],c2,c3,c1);
402 mul_add_c(a[4],b[3],c2,c3,c1);
403 mul_add_c(a[5],b[2],c2,c3,c1);
404 mul_add_c(a[6],b[1],c2,c3,c1);
405 mul_add_c(a[7],b[0],c2,c3,c1);
406 r[7]=c2;
407 c2=0;
408 mul_add_c(a[7],b[1],c3,c1,c2);
409 mul_add_c(a[6],b[2],c3,c1,c2);
410 mul_add_c(a[5],b[3],c3,c1,c2);
411 mul_add_c(a[4],b[4],c3,c1,c2);
412 mul_add_c(a[3],b[5],c3,c1,c2);
413 mul_add_c(a[2],b[6],c3,c1,c2);
414 mul_add_c(a[1],b[7],c3,c1,c2);
415 r[8]=c3;
416 c3=0;
417 mul_add_c(a[2],b[7],c1,c2,c3);
418 mul_add_c(a[3],b[6],c1,c2,c3);
419 mul_add_c(a[4],b[5],c1,c2,c3);
420 mul_add_c(a[5],b[4],c1,c2,c3);
421 mul_add_c(a[6],b[3],c1,c2,c3);
422 mul_add_c(a[7],b[2],c1,c2,c3);
423 r[9]=c1;
424 c1=0;
425 mul_add_c(a[7],b[3],c2,c3,c1);
426 mul_add_c(a[6],b[4],c2,c3,c1);
427 mul_add_c(a[5],b[5],c2,c3,c1);
428 mul_add_c(a[4],b[6],c2,c3,c1);
429 mul_add_c(a[3],b[7],c2,c3,c1);
430 r[10]=c2;
431 c2=0;
432 mul_add_c(a[4],b[7],c3,c1,c2);
433 mul_add_c(a[5],b[6],c3,c1,c2);
434 mul_add_c(a[6],b[5],c3,c1,c2);
435 mul_add_c(a[7],b[4],c3,c1,c2);
436 r[11]=c3;
437 c3=0;
438 mul_add_c(a[7],b[5],c1,c2,c3);
439 mul_add_c(a[6],b[6],c1,c2,c3);
440 mul_add_c(a[5],b[7],c1,c2,c3);
441 r[12]=c1;
442 c1=0;
443 mul_add_c(a[6],b[7],c2,c3,c1);
444 mul_add_c(a[7],b[6],c2,c3,c1);
445 r[13]=c2;
446 c2=0;
447 mul_add_c(a[7],b[7],c3,c1,c2);
448 r[14]=c3;
449 r[15]=c1;
450 }
451
452void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
453 {
454 BN_ULONG c1,c2,c3;
455
456 c1=0;
457 c2=0;
458 c3=0;
459 mul_add_c(a[0],b[0],c1,c2,c3);
460 r[0]=c1;
461 c1=0;
462 mul_add_c(a[0],b[1],c2,c3,c1);
463 mul_add_c(a[1],b[0],c2,c3,c1);
464 r[1]=c2;
465 c2=0;
466 mul_add_c(a[2],b[0],c3,c1,c2);
467 mul_add_c(a[1],b[1],c3,c1,c2);
468 mul_add_c(a[0],b[2],c3,c1,c2);
469 r[2]=c3;
470 c3=0;
471 mul_add_c(a[0],b[3],c1,c2,c3);
472 mul_add_c(a[1],b[2],c1,c2,c3);
473 mul_add_c(a[2],b[1],c1,c2,c3);
474 mul_add_c(a[3],b[0],c1,c2,c3);
475 r[3]=c1;
476 c1=0;
477 mul_add_c(a[3],b[1],c2,c3,c1);
478 mul_add_c(a[2],b[2],c2,c3,c1);
479 mul_add_c(a[1],b[3],c2,c3,c1);
480 r[4]=c2;
481 c2=0;
482 mul_add_c(a[2],b[3],c3,c1,c2);
483 mul_add_c(a[3],b[2],c3,c1,c2);
484 r[5]=c3;
485 c3=0;
486 mul_add_c(a[3],b[3],c1,c2,c3);
487 r[6]=c1;
488 r[7]=c2;
489 }
490
491void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
492 {
493 BN_ULONG c1,c2,c3;
494
495 c1=0;
496 c2=0;
497 c3=0;
498 sqr_add_c(a,0,c1,c2,c3);
499 r[0]=c1;
500 c1=0;
501 sqr_add_c2(a,1,0,c2,c3,c1);
502 r[1]=c2;
503 c2=0;
504 sqr_add_c(a,1,c3,c1,c2);
505 sqr_add_c2(a,2,0,c3,c1,c2);
506 r[2]=c3;
507 c3=0;
508 sqr_add_c2(a,3,0,c1,c2,c3);
509 sqr_add_c2(a,2,1,c1,c2,c3);
510 r[3]=c1;
511 c1=0;
512 sqr_add_c(a,2,c2,c3,c1);
513 sqr_add_c2(a,3,1,c2,c3,c1);
514 sqr_add_c2(a,4,0,c2,c3,c1);
515 r[4]=c2;
516 c2=0;
517 sqr_add_c2(a,5,0,c3,c1,c2);
518 sqr_add_c2(a,4,1,c3,c1,c2);
519 sqr_add_c2(a,3,2,c3,c1,c2);
520 r[5]=c3;
521 c3=0;
522 sqr_add_c(a,3,c1,c2,c3);
523 sqr_add_c2(a,4,2,c1,c2,c3);
524 sqr_add_c2(a,5,1,c1,c2,c3);
525 sqr_add_c2(a,6,0,c1,c2,c3);
526 r[6]=c1;
527 c1=0;
528 sqr_add_c2(a,7,0,c2,c3,c1);
529 sqr_add_c2(a,6,1,c2,c3,c1);
530 sqr_add_c2(a,5,2,c2,c3,c1);
531 sqr_add_c2(a,4,3,c2,c3,c1);
532 r[7]=c2;
533 c2=0;
534 sqr_add_c(a,4,c3,c1,c2);
535 sqr_add_c2(a,5,3,c3,c1,c2);
536 sqr_add_c2(a,6,2,c3,c1,c2);
537 sqr_add_c2(a,7,1,c3,c1,c2);
538 r[8]=c3;
539 c3=0;
540 sqr_add_c2(a,7,2,c1,c2,c3);
541 sqr_add_c2(a,6,3,c1,c2,c3);
542 sqr_add_c2(a,5,4,c1,c2,c3);
543 r[9]=c1;
544 c1=0;
545 sqr_add_c(a,5,c2,c3,c1);
546 sqr_add_c2(a,6,4,c2,c3,c1);
547 sqr_add_c2(a,7,3,c2,c3,c1);
548 r[10]=c2;
549 c2=0;
550 sqr_add_c2(a,7,4,c3,c1,c2);
551 sqr_add_c2(a,6,5,c3,c1,c2);
552 r[11]=c3;
553 c3=0;
554 sqr_add_c(a,6,c1,c2,c3);
555 sqr_add_c2(a,7,5,c1,c2,c3);
556 r[12]=c1;
557 c1=0;
558 sqr_add_c2(a,7,6,c2,c3,c1);
559 r[13]=c2;
560 c2=0;
561 sqr_add_c(a,7,c3,c1,c2);
562 r[14]=c3;
563 r[15]=c1;
564 }
565
566void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
567 {
568 BN_ULONG c1,c2,c3;
569
570 c1=0;
571 c2=0;
572 c3=0;
573 sqr_add_c(a,0,c1,c2,c3);
574 r[0]=c1;
575 c1=0;
576 sqr_add_c2(a,1,0,c2,c3,c1);
577 r[1]=c2;
578 c2=0;
579 sqr_add_c(a,1,c3,c1,c2);
580 sqr_add_c2(a,2,0,c3,c1,c2);
581 r[2]=c3;
582 c3=0;
583 sqr_add_c2(a,3,0,c1,c2,c3);
584 sqr_add_c2(a,2,1,c1,c2,c3);
585 r[3]=c1;
586 c1=0;
587 sqr_add_c(a,2,c2,c3,c1);
588 sqr_add_c2(a,3,1,c2,c3,c1);
589 r[4]=c2;
590 c2=0;
591 sqr_add_c2(a,3,2,c3,c1,c2);
592 r[5]=c3;
593 c3=0;
594 sqr_add_c(a,3,c1,c2,c3);
595 r[6]=c1;
596 r[7]=c2;
597 }
598#endif
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl b/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl
deleted file mode 100644
index 226c66c35e..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl
+++ /dev/null
@@ -1,390 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# May 2011
11#
12# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
13# in bn_gf2m.c. For the time being it's a rather mechanical,
14# low-hanging port from C... Except that it has two code paths: code
15# suitable for any x86_64 CPU and a PCLMULQDQ one suitable for
16# Westmere and later. Improvement varies from one benchmark and
17# µ-arch to another. The vanilla code path is at most 20% faster than
18# compiler-generated code [not very impressive], while the PCLMULQDQ
19# one is a whole 85%-160% better on 163- and 571-bit ECDH benchmarks
20# on Intel CPUs. Keep in mind that these figures are not for
21# bn_GF2m_mul_2x2 itself, as not all CPU time is burnt in it...
22
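# For reference, both code paths implement the same 2x2 Karatsuba-style
# identity over GF(2)[x], where addition is XOR and the middle term
# therefore needs no subtractions:
#
#	(a1*x^64 + a0)*(b1*x^64 + b0) =
#		a1*b1*x^128
#		+ ((a0+a1)*(b0+b1) + a1*b1 + a0*b0)*x^64
#		+ a0*b0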
23$flavour = shift;
24$output = shift;
25if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
26
27$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
28
29$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
30( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
31( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
32die "can't locate x86_64-xlate.pl";
33
34open OUT,"| \"$^X\" $xlate $flavour $output";
35*STDOUT=*OUT;
36
37($lo,$hi)=("%rax","%rdx"); $a=$lo;
38($i0,$i1)=("%rsi","%rdi");
39($t0,$t1)=("%rbx","%rcx");
40($b,$mask)=("%rbp","%r8");
41($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
42($R,$Tx)=("%xmm0","%xmm1");
43
44$code.=<<___;
45.text
46
47.type _mul_1x1,\@abi-omnipotent
48.align 16
49_mul_1x1:
50 sub \$128+8,%rsp
51 mov \$-1,$a1
52 lea ($a,$a),$i0
53 shr \$3,$a1
54 lea (,$a,4),$i1
55 and $a,$a1 # a1=a&0x1fffffffffffffff
56 lea (,$a,8),$a8
57 sar \$63,$a # broadcast 63rd bit
58 lea ($a1,$a1),$a2
59 sar \$63,$i0 # broadcast 62nd bit
60 lea (,$a1,4),$a4
61 and $b,$a
62	sar	\$63,$i1	# broadcast 61st bit
63 mov $a,$hi # $a is $lo
64 shl \$63,$lo
65 and $b,$i0
66 shr \$1,$hi
67 mov $i0,$t1
68 shl \$62,$i0
69 and $b,$i1
70 shr \$2,$t1
71 xor $i0,$lo
72 mov $i1,$t0
73 shl \$61,$i1
74 xor $t1,$hi
75 shr \$3,$t0
76 xor $i1,$lo
77 xor $t0,$hi
78
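	# The blocks above handle bits 63..61 of $a out of band: the
	# table built next holds u*$a1 for 4-bit u, which only fits in
	# 64 bits if $a1 is confined to its low 61 bits, hence the
	# 0x1fffffffffffffff mask.  Each of the three top bits, when
	# set, XORs a correspondingly shifted copy of $b into ($hi,$lo)
	# via the sar-broadcast masks.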
79 mov $a1,$a12
80 movq \$0,0(%rsp) # tab[0]=0
81 xor $a2,$a12 # a1^a2
82 mov $a1,8(%rsp) # tab[1]=a1
83 mov $a4,$a48
84 mov $a2,16(%rsp) # tab[2]=a2
85 xor $a8,$a48 # a4^a8
86 mov $a12,24(%rsp) # tab[3]=a1^a2
87
88 xor $a4,$a1
89 mov $a4,32(%rsp) # tab[4]=a4
90 xor $a4,$a2
91 mov $a1,40(%rsp) # tab[5]=a1^a4
92 xor $a4,$a12
93 mov $a2,48(%rsp) # tab[6]=a2^a4
94 xor $a48,$a1 # a1^a4^a4^a8=a1^a8
95 mov $a12,56(%rsp) # tab[7]=a1^a2^a4
96	xor	$a48,$a2	# a2^a4^a4^a8=a2^a8
97
98 mov $a8,64(%rsp) # tab[8]=a8
99 xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8
100 mov $a1,72(%rsp) # tab[9]=a1^a8
101 xor $a4,$a1 # a1^a8^a4
102 mov $a2,80(%rsp) # tab[10]=a2^a8
103 xor $a4,$a2 # a2^a8^a4
104 mov $a12,88(%rsp) # tab[11]=a1^a2^a8
105
106 xor $a4,$a12 # a1^a2^a8^a4
107 mov $a48,96(%rsp) # tab[12]=a4^a8
108 mov $mask,$i0
109 mov $a1,104(%rsp) # tab[13]=a1^a4^a8
110 and $b,$i0
111 mov $a2,112(%rsp) # tab[14]=a2^a4^a8
112 shr \$4,$b
113 mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8
114 mov $mask,$i1
115 and $b,$i1
116 shr \$4,$b
117
118 movq (%rsp,$i0,8),$R # half of calculations is done in SSE2
119 mov $mask,$i0
120 and $b,$i0
121 shr \$4,$b
122___
123 for ($n=1;$n<8;$n++) {
124 $code.=<<___;
125 mov (%rsp,$i1,8),$t1
126 mov $mask,$i1
127 mov $t1,$t0
128 shl \$`8*$n-4`,$t1
129 and $b,$i1
130 movq (%rsp,$i0,8),$Tx
131 shr \$`64-(8*$n-4)`,$t0
132 xor $t1,$lo
133 pslldq \$$n,$Tx
134 mov $mask,$i0
135 shr \$4,$b
136 xor $t0,$hi
137 and $b,$i0
138 shr \$4,$b
139 pxor $Tx,$R
140___
141 }
142$code.=<<___;
143 mov (%rsp,$i1,8),$t1
144 mov $t1,$t0
145 shl \$`8*$n-4`,$t1
146 movq $R,$i0
147 shr \$`64-(8*$n-4)`,$t0
148 xor $t1,$lo
149 psrldq \$8,$R
150 xor $t0,$hi
151 movq $R,$i1
152 xor $i0,$lo
153 xor $i1,$hi
154
155 add \$128+8,%rsp
156 ret
157.Lend_mul_1x1:
158.size _mul_1x1,.-_mul_1x1
159___
160
161($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order
162 ("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order
163
164$code.=<<___;
165.extern OPENSSL_ia32cap_P
166.globl bn_GF2m_mul_2x2
167.type bn_GF2m_mul_2x2,\@abi-omnipotent
168.align 16
169bn_GF2m_mul_2x2:
170 mov OPENSSL_ia32cap_P(%rip),%rax
171 bt \$33,%rax
172 jnc .Lvanilla_mul_2x2
173
174 movq $a1,%xmm0
175 movq $b1,%xmm1
176 movq $a0,%xmm2
177___
178$code.=<<___ if ($win64);
179 movq 40(%rsp),%xmm3
180___
181$code.=<<___ if (!$win64);
182 movq $b0,%xmm3
183___
184$code.=<<___;
185 movdqa %xmm0,%xmm4
186 movdqa %xmm1,%xmm5
187 pclmulqdq \$0,%xmm1,%xmm0 # a1·b1
188 pxor %xmm2,%xmm4
189 pxor %xmm3,%xmm5
190 pclmulqdq \$0,%xmm3,%xmm2 # a0·b0
191 pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
192 xorps %xmm0,%xmm4
193 xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1
194 movdqa %xmm4,%xmm5
195 pslldq \$8,%xmm4
196 psrldq \$8,%xmm5
197 pxor %xmm4,%xmm2
198 pxor %xmm5,%xmm0
199 movdqu %xmm2,0($rp)
200 movdqu %xmm0,16($rp)
201 ret
202
203.align 16
204.Lvanilla_mul_2x2:
205 lea -8*17(%rsp),%rsp
206___
207$code.=<<___ if ($win64);
208 mov `8*17+40`(%rsp),$b0
209 mov %rdi,8*15(%rsp)
210 mov %rsi,8*16(%rsp)
211___
212$code.=<<___;
213 mov %r14,8*10(%rsp)
214 mov %r13,8*11(%rsp)
215 mov %r12,8*12(%rsp)
216 mov %rbp,8*13(%rsp)
217 mov %rbx,8*14(%rsp)
218.Lbody_mul_2x2:
219 mov $rp,32(%rsp) # save the arguments
220 mov $a1,40(%rsp)
221 mov $a0,48(%rsp)
222 mov $b1,56(%rsp)
223 mov $b0,64(%rsp)
224
225 mov \$0xf,$mask
226 mov $a1,$a
227 mov $b1,$b
228 call _mul_1x1 # a1·b1
229 mov $lo,16(%rsp)
230 mov $hi,24(%rsp)
231
232 mov 48(%rsp),$a
233 mov 64(%rsp),$b
234 call _mul_1x1 # a0·b0
235 mov $lo,0(%rsp)
236 mov $hi,8(%rsp)
237
238 mov 40(%rsp),$a
239 mov 56(%rsp),$b
240 xor 48(%rsp),$a
241 xor 64(%rsp),$b
242 call _mul_1x1 # (a0+a1)·(b0+b1)
243___
244 @r=("%rbx","%rcx","%rdi","%rsi");
245$code.=<<___;
246 mov 0(%rsp),@r[0]
247 mov 8(%rsp),@r[1]
248 mov 16(%rsp),@r[2]
249 mov 24(%rsp),@r[3]
250 mov 32(%rsp),%rbp
251
252 xor $hi,$lo
253 xor @r[1],$hi
254 xor @r[0],$lo
255 mov @r[0],0(%rbp)
256 xor @r[2],$hi
257 mov @r[3],24(%rbp)
258 xor @r[3],$lo
259 xor @r[3],$hi
260 xor $hi,$lo
261 mov $hi,16(%rbp)
262 mov $lo,8(%rbp)
263
264 mov 8*10(%rsp),%r14
265 mov 8*11(%rsp),%r13
266 mov 8*12(%rsp),%r12
267 mov 8*13(%rsp),%rbp
268 mov 8*14(%rsp),%rbx
269___
270$code.=<<___ if ($win64);
271 mov 8*15(%rsp),%rdi
272 mov 8*16(%rsp),%rsi
273___
274$code.=<<___;
275 lea 8*17(%rsp),%rsp
276 ret
277.Lend_mul_2x2:
278.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
279.asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
280.align 16
281___
282
283# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
284# CONTEXT *context,DISPATCHER_CONTEXT *disp)
285if ($win64) {
286$rec="%rcx";
287$frame="%rdx";
288$context="%r8";
289$disp="%r9";
290
291$code.=<<___;
292.extern __imp_RtlVirtualUnwind
293
294.type se_handler,\@abi-omnipotent
295.align 16
296se_handler:
297 push %rsi
298 push %rdi
299 push %rbx
300 push %rbp
301 push %r12
302 push %r13
303 push %r14
304 push %r15
305 pushfq
306 sub \$64,%rsp
307
308 mov 152($context),%rax # pull context->Rsp
309 mov 248($context),%rbx # pull context->Rip
310
311 lea .Lbody_mul_2x2(%rip),%r10
312 cmp %r10,%rbx # context->Rip<"prologue" label
313 jb .Lin_prologue
314
315 mov 8*10(%rax),%r14 # mimic epilogue
316 mov 8*11(%rax),%r13
317 mov 8*12(%rax),%r12
318 mov 8*13(%rax),%rbp
319 mov 8*14(%rax),%rbx
320 mov 8*15(%rax),%rdi
321 mov 8*16(%rax),%rsi
322
323 mov %rbx,144($context) # restore context->Rbx
324 mov %rbp,160($context) # restore context->Rbp
325 mov %rsi,168($context) # restore context->Rsi
326 mov %rdi,176($context) # restore context->Rdi
327 mov %r12,216($context) # restore context->R12
328 mov %r13,224($context) # restore context->R13
329 mov %r14,232($context) # restore context->R14
330
331.Lin_prologue:
332 lea 8*17(%rax),%rax
333 mov %rax,152($context) # restore context->Rsp
334
335 mov 40($disp),%rdi # disp->ContextRecord
336 mov $context,%rsi # context
337 mov \$154,%ecx # sizeof(CONTEXT)
338 .long 0xa548f3fc # cld; rep movsq
339
340 mov $disp,%rsi
341 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
342 mov 8(%rsi),%rdx # arg2, disp->ImageBase
343 mov 0(%rsi),%r8 # arg3, disp->ControlPc
344 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
345 mov 40(%rsi),%r10 # disp->ContextRecord
346 lea 56(%rsi),%r11 # &disp->HandlerData
347 lea 24(%rsi),%r12 # &disp->EstablisherFrame
348 mov %r10,32(%rsp) # arg5
349 mov %r11,40(%rsp) # arg6
350 mov %r12,48(%rsp) # arg7
351 mov %rcx,56(%rsp) # arg8, (NULL)
352 call *__imp_RtlVirtualUnwind(%rip)
353
354 mov \$1,%eax # ExceptionContinueSearch
355 add \$64,%rsp
356 popfq
357 pop %r15
358 pop %r14
359 pop %r13
360 pop %r12
361 pop %rbp
362 pop %rbx
363 pop %rdi
364 pop %rsi
365 ret
366.size se_handler,.-se_handler
367
368.section .pdata
369.align 4
370 .rva _mul_1x1
371 .rva .Lend_mul_1x1
372 .rva .LSEH_info_1x1
373
374 .rva .Lvanilla_mul_2x2
375 .rva .Lend_mul_2x2
376 .rva .LSEH_info_2x2
377.section .xdata
378.align 8
379.LSEH_info_1x1:
380 .byte 0x01,0x07,0x02,0x00
381 .byte 0x07,0x01,0x11,0x00 # sub rsp,128+8
382.LSEH_info_2x2:
383 .byte 9,0,0,0
384 .rva se_handler
385___
386}
387
388$code =~ s/\`([^\`]*)\`/eval($1)/gem;
389print $code;
390close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont.pl b/src/lib/libcrypto/bn/asm/x86_64-mont.pl
deleted file mode 100755
index c35493e80a..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-mont.pl
+++ /dev/null
@@ -1,1504 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# October 2005.
11#
12# Montgomery multiplication routine for x86_64. While it gives a
13# modest 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs
14# more than twice as fast. The most common rsa1024 sign is improved by
15# a respectable 50%. It remains to be seen whether loop unrolling and
16# a dedicated squaring routine can provide further improvement...
17
18# July 2011.
19#
20# Add a dedicated squaring procedure. Performance improvement varies
21# from platform to platform, but on average it's ~5%/15%/25%/33%
22# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
23
24# August 2011.
25#
26# Unroll and modulo-schedule inner loops in such a manner that they
27# are "fallen through" for input lengths of 8, which is critical for
28# 1024-bit RSA *sign*. Average performance improvement in comparison
29# to the *initial* version of this module from 2005 is ~0%/30%/40%/45%
30# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
31
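# For orientation, the code below follows the classic word-by-word
# Montgomery multiplication; in rough pseudo-code (w = 2^64, num words
# per operand, n0 = -np[0]^-1 mod w):
#
#	tp[0..num] = 0;
#	for (i = 0; i < num; i++) {
#		m = (tp[0] + ap[0]*bp[i])*n0 mod w;
#		tp = (tp + ap*bp[i] + np*m)/w;	# division is exact
#	}
#	return tp < np ? tp : tp - np;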
32$flavour = shift;
33$output = shift;
34if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
35
36$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
37( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
38( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
39die "can't locate x86_64-xlate.pl";
40
41open OUT,"| \"$^X\" $xlate $flavour $output";
42*STDOUT=*OUT;
43
44# int bn_mul_mont(
45$rp="%rdi"; # BN_ULONG *rp,
46$ap="%rsi"; # const BN_ULONG *ap,
47$bp="%rdx"; # const BN_ULONG *bp,
48$np="%rcx"; # const BN_ULONG *np,
49$n0="%r8"; # const BN_ULONG *n0,
50$num="%r9"; # int num);
51$lo0="%r10";
52$hi0="%r11";
53$hi1="%r13";
54$i="%r14";
55$j="%r15";
56$m0="%rbx";
57$m1="%rbp";
58
59$code=<<___;
60.text
61
62.globl bn_mul_mont
63.type bn_mul_mont,\@function,6
64.align 16
65bn_mul_mont:
66 test \$3,${num}d
67 jnz .Lmul_enter
68 cmp \$8,${num}d
69 jb .Lmul_enter
70 cmp $ap,$bp
71 jne .Lmul4x_enter
72 jmp .Lsqr4x_enter
73
74.align 16
75.Lmul_enter:
76 push %rbx
77 push %rbp
78 push %r12
79 push %r13
80 push %r14
81 push %r15
82
83 mov ${num}d,${num}d
84 lea 2($num),%r10
85 mov %rsp,%r11
86 neg %r10
87 lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2))
88 and \$-1024,%rsp # minimize TLB usage
89
90 mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
91.Lmul_body:
92 mov $bp,%r12 # reassign $bp
93___
94 $bp="%r12";
95$code.=<<___;
96 mov ($n0),$n0 # pull n0[0] value
97 mov ($bp),$m0 # m0=bp[0]
98 mov ($ap),%rax
99
100 xor $i,$i # i=0
101 xor $j,$j # j=0
102
103 mov $n0,$m1
104 mulq $m0 # ap[0]*bp[0]
105 mov %rax,$lo0
106 mov ($np),%rax
107
108 imulq $lo0,$m1 # "tp[0]"*n0
109 mov %rdx,$hi0
110
111 mulq $m1 # np[0]*m1
112 add %rax,$lo0 # discarded
113 mov 8($ap),%rax
114 adc \$0,%rdx
115 mov %rdx,$hi1
116
117 lea 1($j),$j # j++
118 jmp .L1st_enter
119
120.align 16
121.L1st:
122 add %rax,$hi1
123 mov ($ap,$j,8),%rax
124 adc \$0,%rdx
125 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
126 mov $lo0,$hi0
127 adc \$0,%rdx
128 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
129 mov %rdx,$hi1
130
131.L1st_enter:
132 mulq $m0 # ap[j]*bp[0]
133 add %rax,$hi0
134 mov ($np,$j,8),%rax
135 adc \$0,%rdx
136 lea 1($j),$j # j++
137 mov %rdx,$lo0
138
139 mulq $m1 # np[j]*m1
140 cmp $num,$j
141 jl .L1st
142
143 add %rax,$hi1
144 mov ($ap),%rax # ap[0]
145 adc \$0,%rdx
146 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
147 adc \$0,%rdx
148 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
149 mov %rdx,$hi1
150 mov $lo0,$hi0
151
152 xor %rdx,%rdx
153 add $hi0,$hi1
154 adc \$0,%rdx
155 mov $hi1,-8(%rsp,$num,8)
156 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
157
158 lea 1($i),$i # i++
159 jmp .Louter
160.align 16
161.Louter:
162 mov ($bp,$i,8),$m0 # m0=bp[i]
163 xor $j,$j # j=0
164 mov $n0,$m1
165 mov (%rsp),$lo0
166 mulq $m0 # ap[0]*bp[i]
167 add %rax,$lo0 # ap[0]*bp[i]+tp[0]
168 mov ($np),%rax
169 adc \$0,%rdx
170
171 imulq $lo0,$m1 # tp[0]*n0
172 mov %rdx,$hi0
173
174 mulq $m1 # np[0]*m1
175 add %rax,$lo0 # discarded
176 mov 8($ap),%rax
177 adc \$0,%rdx
178 mov 8(%rsp),$lo0 # tp[1]
179 mov %rdx,$hi1
180
181 lea 1($j),$j # j++
182 jmp .Linner_enter
183
184.align 16
185.Linner:
186 add %rax,$hi1
187 mov ($ap,$j,8),%rax
188 adc \$0,%rdx
189 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
190 mov (%rsp,$j,8),$lo0
191 adc \$0,%rdx
192 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
193 mov %rdx,$hi1
194
195.Linner_enter:
196 mulq $m0 # ap[j]*bp[i]
197 add %rax,$hi0
198 mov ($np,$j,8),%rax
199 adc \$0,%rdx
200 add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
201 mov %rdx,$hi0
202 adc \$0,$hi0
203 lea 1($j),$j # j++
204
205 mulq $m1 # np[j]*m1
206 cmp $num,$j
207 jl .Linner
208
209 add %rax,$hi1
210 mov ($ap),%rax # ap[0]
211 adc \$0,%rdx
212 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
213 mov (%rsp,$j,8),$lo0
214 adc \$0,%rdx
215 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
216 mov %rdx,$hi1
217
218 xor %rdx,%rdx
219 add $hi0,$hi1
220 adc \$0,%rdx
221 add $lo0,$hi1 # pull upmost overflow bit
222 adc \$0,%rdx
223 mov $hi1,-8(%rsp,$num,8)
224 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
225
226 lea 1($i),$i # i++
227 cmp $num,$i
228 jl .Louter
229
230 xor $i,$i # i=0 and clear CF!
231 mov (%rsp),%rax # tp[0]
232 lea (%rsp),$ap # borrow ap for tp
233 mov $num,$j # j=num
234 jmp .Lsub
235.align 16
236.Lsub: sbb ($np,$i,8),%rax
237 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
238 mov 8($ap,$i,8),%rax # tp[i+1]
239 lea 1($i),$i # i++
240	dec	$j		# doesn't affect CF!
241 jnz .Lsub
242
243 sbb \$0,%rax # handle upmost overflow bit
244 xor $i,$i
245 and %rax,$ap
246 not %rax
247 mov $rp,$np
248 and %rax,$np
249 mov $num,$j # j=num
250 or $np,$ap # ap=borrow?tp:rp
251.align 16
252.Lcopy: # copy or in-place refresh
253 mov ($ap,$i,8),%rax
254 mov $i,(%rsp,$i,8) # zap temporary vector
255 mov %rax,($rp,$i,8) # rp[i]=tp[i]
256 lea 1($i),$i
257 sub \$1,$j
258 jnz .Lcopy
259
260 mov 8(%rsp,$num,8),%rsi # restore %rsp
261 mov \$1,%rax
262 mov (%rsi),%r15
263 mov 8(%rsi),%r14
264 mov 16(%rsi),%r13
265 mov 24(%rsi),%r12
266 mov 32(%rsi),%rbp
267 mov 40(%rsi),%rbx
268 lea 48(%rsi),%rsp
269.Lmul_epilogue:
270 ret
271.size bn_mul_mont,.-bn_mul_mont
272___
273{{{
274my @A=("%r10","%r11");
275my @N=("%r13","%rdi");
276$code.=<<___;
277.type bn_mul4x_mont,\@function,6
278.align 16
279bn_mul4x_mont:
280.Lmul4x_enter:
281 push %rbx
282 push %rbp
283 push %r12
284 push %r13
285 push %r14
286 push %r15
287
288 mov ${num}d,${num}d
289 lea 4($num),%r10
290 mov %rsp,%r11
291 neg %r10
292 lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4))
293 and \$-1024,%rsp # minimize TLB usage
294
295 mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
296.Lmul4x_body:
297 mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
298 mov %rdx,%r12 # reassign $bp
299___
300 $bp="%r12";
301$code.=<<___;
302 mov ($n0),$n0 # pull n0[0] value
303 mov ($bp),$m0 # m0=bp[0]
304 mov ($ap),%rax
305
306 xor $i,$i # i=0
307 xor $j,$j # j=0
308
309 mov $n0,$m1
310 mulq $m0 # ap[0]*bp[0]
311 mov %rax,$A[0]
312 mov ($np),%rax
313
314 imulq $A[0],$m1 # "tp[0]"*n0
315 mov %rdx,$A[1]
316
317 mulq $m1 # np[0]*m1
318 add %rax,$A[0] # discarded
319 mov 8($ap),%rax
320 adc \$0,%rdx
321 mov %rdx,$N[1]
322
323 mulq $m0
324 add %rax,$A[1]
325 mov 8($np),%rax
326 adc \$0,%rdx
327 mov %rdx,$A[0]
328
329 mulq $m1
330 add %rax,$N[1]
331 mov 16($ap),%rax
332 adc \$0,%rdx
333 add $A[1],$N[1]
334 lea 4($j),$j # j++
335 adc \$0,%rdx
336 mov $N[1],(%rsp)
337 mov %rdx,$N[0]
338 jmp .L1st4x
339.align 16
340.L1st4x:
341 mulq $m0 # ap[j]*bp[0]
342 add %rax,$A[0]
343 mov -16($np,$j,8),%rax
344 adc \$0,%rdx
345 mov %rdx,$A[1]
346
347 mulq $m1 # np[j]*m1
348 add %rax,$N[0]
349 mov -8($ap,$j,8),%rax
350 adc \$0,%rdx
351 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
352 adc \$0,%rdx
353 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
354 mov %rdx,$N[1]
355
356 mulq $m0 # ap[j]*bp[0]
357 add %rax,$A[1]
358 mov -8($np,$j,8),%rax
359 adc \$0,%rdx
360 mov %rdx,$A[0]
361
362 mulq $m1 # np[j]*m1
363 add %rax,$N[1]
364 mov ($ap,$j,8),%rax
365 adc \$0,%rdx
366 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
367 adc \$0,%rdx
368 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
369 mov %rdx,$N[0]
370
371 mulq $m0 # ap[j]*bp[0]
372 add %rax,$A[0]
373 mov ($np,$j,8),%rax
374 adc \$0,%rdx
375 mov %rdx,$A[1]
376
377 mulq $m1 # np[j]*m1
378 add %rax,$N[0]
379 mov 8($ap,$j,8),%rax
380 adc \$0,%rdx
381 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
382 adc \$0,%rdx
383 mov $N[0],-8(%rsp,$j,8) # tp[j-1]
384 mov %rdx,$N[1]
385
386 mulq $m0 # ap[j]*bp[0]
387 add %rax,$A[1]
388 mov 8($np,$j,8),%rax
389 adc \$0,%rdx
390 lea 4($j),$j # j++
391 mov %rdx,$A[0]
392
393 mulq $m1 # np[j]*m1
394 add %rax,$N[1]
395 mov -16($ap,$j,8),%rax
396 adc \$0,%rdx
397 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
398 adc \$0,%rdx
399 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
400 mov %rdx,$N[0]
401 cmp $num,$j
402 jl .L1st4x
403
404 mulq $m0 # ap[j]*bp[0]
405 add %rax,$A[0]
406 mov -16($np,$j,8),%rax
407 adc \$0,%rdx
408 mov %rdx,$A[1]
409
410 mulq $m1 # np[j]*m1
411 add %rax,$N[0]
412 mov -8($ap,$j,8),%rax
413 adc \$0,%rdx
414 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
415 adc \$0,%rdx
416 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
417 mov %rdx,$N[1]
418
419 mulq $m0 # ap[j]*bp[0]
420 add %rax,$A[1]
421 mov -8($np,$j,8),%rax
422 adc \$0,%rdx
423 mov %rdx,$A[0]
424
425 mulq $m1 # np[j]*m1
426 add %rax,$N[1]
427 mov ($ap),%rax # ap[0]
428 adc \$0,%rdx
429 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
430 adc \$0,%rdx
431 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
432 mov %rdx,$N[0]
433
434 xor $N[1],$N[1]
435 add $A[0],$N[0]
436 adc \$0,$N[1]
437 mov $N[0],-8(%rsp,$j,8)
438 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
439
440 lea 1($i),$i # i++
441.align 4
442.Louter4x:
443 mov ($bp,$i,8),$m0 # m0=bp[i]
444 xor $j,$j # j=0
445 mov (%rsp),$A[0]
446 mov $n0,$m1
447 mulq $m0 # ap[0]*bp[i]
448 add %rax,$A[0] # ap[0]*bp[i]+tp[0]
449 mov ($np),%rax
450 adc \$0,%rdx
451
452 imulq $A[0],$m1 # tp[0]*n0
453 mov %rdx,$A[1]
454
455 mulq $m1 # np[0]*m1
456 add %rax,$A[0] # "$N[0]", discarded
457 mov 8($ap),%rax
458 adc \$0,%rdx
459 mov %rdx,$N[1]
460
461 mulq $m0 # ap[j]*bp[i]
462 add %rax,$A[1]
463 mov 8($np),%rax
464 adc \$0,%rdx
465 add 8(%rsp),$A[1] # +tp[1]
466 adc \$0,%rdx
467 mov %rdx,$A[0]
468
469 mulq $m1 # np[j]*m1
470 add %rax,$N[1]
471 mov 16($ap),%rax
472 adc \$0,%rdx
473 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
474 lea 4($j),$j # j+=2
475 adc \$0,%rdx
476 mov $N[1],(%rsp) # tp[j-1]
477 mov %rdx,$N[0]
478 jmp .Linner4x
479.align 16
480.Linner4x:
481 mulq $m0 # ap[j]*bp[i]
482 add %rax,$A[0]
483 mov -16($np,$j,8),%rax
484 adc \$0,%rdx
485 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
486 adc \$0,%rdx
487 mov %rdx,$A[1]
488
489 mulq $m1 # np[j]*m1
490 add %rax,$N[0]
491 mov -8($ap,$j,8),%rax
492 adc \$0,%rdx
493 add $A[0],$N[0]
494 adc \$0,%rdx
495 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
496 mov %rdx,$N[1]
497
498 mulq $m0 # ap[j]*bp[i]
499 add %rax,$A[1]
500 mov -8($np,$j,8),%rax
501 adc \$0,%rdx
502 add -8(%rsp,$j,8),$A[1]
503 adc \$0,%rdx
504 mov %rdx,$A[0]
505
506 mulq $m1 # np[j]*m1
507 add %rax,$N[1]
508 mov ($ap,$j,8),%rax
509 adc \$0,%rdx
510 add $A[1],$N[1]
511 adc \$0,%rdx
512 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
513 mov %rdx,$N[0]
514
515 mulq $m0 # ap[j]*bp[i]
516 add %rax,$A[0]
517 mov ($np,$j,8),%rax
518 adc \$0,%rdx
519 add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
520 adc \$0,%rdx
521 mov %rdx,$A[1]
522
523 mulq $m1 # np[j]*m1
524 add %rax,$N[0]
525 mov 8($ap,$j,8),%rax
526 adc \$0,%rdx
527 add $A[0],$N[0]
528 adc \$0,%rdx
529 mov $N[0],-8(%rsp,$j,8) # tp[j-1]
530 mov %rdx,$N[1]
531
532 mulq $m0 # ap[j]*bp[i]
533 add %rax,$A[1]
534 mov 8($np,$j,8),%rax
535 adc \$0,%rdx
536 add 8(%rsp,$j,8),$A[1]
537 adc \$0,%rdx
538 lea 4($j),$j # j++
539 mov %rdx,$A[0]
540
541 mulq $m1 # np[j]*m1
542 add %rax,$N[1]
543 mov -16($ap,$j,8),%rax
544 adc \$0,%rdx
545 add $A[1],$N[1]
546 adc \$0,%rdx
547 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
548 mov %rdx,$N[0]
549 cmp $num,$j
550 jl .Linner4x
551
552 mulq $m0 # ap[j]*bp[i]
553 add %rax,$A[0]
554 mov -16($np,$j,8),%rax
555 adc \$0,%rdx
556 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
557 adc \$0,%rdx
558 mov %rdx,$A[1]
559
560 mulq $m1 # np[j]*m1
561 add %rax,$N[0]
562 mov -8($ap,$j,8),%rax
563 adc \$0,%rdx
564 add $A[0],$N[0]
565 adc \$0,%rdx
566 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
567 mov %rdx,$N[1]
568
569 mulq $m0 # ap[j]*bp[i]
570 add %rax,$A[1]
571 mov -8($np,$j,8),%rax
572 adc \$0,%rdx
573 add -8(%rsp,$j,8),$A[1]
574 adc \$0,%rdx
575 lea 1($i),$i # i++
576 mov %rdx,$A[0]
577
578 mulq $m1 # np[j]*m1
579 add %rax,$N[1]
580 mov ($ap),%rax # ap[0]
581 adc \$0,%rdx
582 add $A[1],$N[1]
583 adc \$0,%rdx
584 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
585 mov %rdx,$N[0]
586
587 xor $N[1],$N[1]
588 add $A[0],$N[0]
589 adc \$0,$N[1]
590 add (%rsp,$num,8),$N[0] # pull upmost overflow bit
591 adc \$0,$N[1]
592 mov $N[0],-8(%rsp,$j,8)
593 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
594
595 cmp $num,$i
596 jl .Louter4x
597___
598{
599my @ri=("%rax","%rdx",$m0,$m1);
600$code.=<<___;
601 mov 16(%rsp,$num,8),$rp # restore $rp
602 mov 0(%rsp),@ri[0] # tp[0]
603 pxor %xmm0,%xmm0
604 mov 8(%rsp),@ri[1] # tp[1]
605 shr \$2,$num # num/=4
606 lea (%rsp),$ap # borrow ap for tp
607 xor $i,$i # i=0 and clear CF!
608
609 sub 0($np),@ri[0]
610 mov 16($ap),@ri[2] # tp[2]
611 mov 24($ap),@ri[3] # tp[3]
612 sbb 8($np),@ri[1]
613 lea -1($num),$j # j=num/4-1
614 jmp .Lsub4x
615.align 16
616.Lsub4x:
617 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
618 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
619 sbb 16($np,$i,8),@ri[2]
620 mov 32($ap,$i,8),@ri[0] # tp[i+1]
621 mov 40($ap,$i,8),@ri[1]
622 sbb 24($np,$i,8),@ri[3]
623 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
624 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
625 sbb 32($np,$i,8),@ri[0]
626 mov 48($ap,$i,8),@ri[2]
627 mov 56($ap,$i,8),@ri[3]
628 sbb 40($np,$i,8),@ri[1]
629 lea 4($i),$i # i++
630	dec	$j		# doesn't affect CF!
631 jnz .Lsub4x
632
633 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
634 mov 32($ap,$i,8),@ri[0] # load overflow bit
635 sbb 16($np,$i,8),@ri[2]
636 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
637 sbb 24($np,$i,8),@ri[3]
638 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
639
640 sbb \$0,@ri[0] # handle upmost overflow bit
641 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
642 xor $i,$i # i=0
643 and @ri[0],$ap
644 not @ri[0]
645 mov $rp,$np
646 and @ri[0],$np
647 lea -1($num),$j
648 or $np,$ap # ap=borrow?tp:rp
649
650 movdqu ($ap),%xmm1
651 movdqa %xmm0,(%rsp)
652 movdqu %xmm1,($rp)
653 jmp .Lcopy4x
654.align 16
655.Lcopy4x: # copy or in-place refresh
656 movdqu 16($ap,$i),%xmm2
657 movdqu 32($ap,$i),%xmm1
658 movdqa %xmm0,16(%rsp,$i)
659 movdqu %xmm2,16($rp,$i)
660 movdqa %xmm0,32(%rsp,$i)
661 movdqu %xmm1,32($rp,$i)
662 lea 32($i),$i
663 dec $j
664 jnz .Lcopy4x
665
666 shl \$2,$num
667 movdqu 16($ap,$i),%xmm2
668 movdqa %xmm0,16(%rsp,$i)
669 movdqu %xmm2,16($rp,$i)
670___
671}
672$code.=<<___;
673 mov 8(%rsp,$num,8),%rsi # restore %rsp
674 mov \$1,%rax
675 mov (%rsi),%r15
676 mov 8(%rsi),%r14
677 mov 16(%rsi),%r13
678 mov 24(%rsi),%r12
679 mov 32(%rsi),%rbp
680 mov 40(%rsi),%rbx
681 lea 48(%rsi),%rsp
682.Lmul4x_epilogue:
683 ret
684.size bn_mul4x_mont,.-bn_mul4x_mont
685___
686}}}
687 {{{
688######################################################################
689# void bn_sqr4x_mont(
690my $rptr="%rdi"; # const BN_ULONG *rptr,
691my $aptr="%rsi"; # const BN_ULONG *aptr,
692my $bptr="%rdx"; # not used
693my $nptr="%rcx"; # const BN_ULONG *nptr,
694my $n0 ="%r8"; # const BN_ULONG *n0);
695my $num ="%r9"; # int num, has to be divisible by 4 and
696 # not less than 8
697
698my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
699my @A0=("%r10","%r11");
700my @A1=("%r12","%r13");
701my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
702
703$code.=<<___;
704.type bn_sqr4x_mont,\@function,6
705.align 16
706bn_sqr4x_mont:
707.Lsqr4x_enter:
708 push %rbx
709 push %rbp
710 push %r12
711 push %r13
712 push %r14
713 push %r15
714
715 shl \$3,${num}d # convert $num to bytes
716 xor %r10,%r10
717 mov %rsp,%r11 # put aside %rsp
718 sub $num,%r10 # -$num
719 mov ($n0),$n0 # *n0
720 lea -72(%rsp,%r10,2),%rsp # alloca(frame+2*$num)
721 and \$-1024,%rsp # minimize TLB usage
722 ##############################################################
723 # Stack layout
724 #
725 # +0 saved $num, used in reduction section
726 # +8 &t[2*$num], used in reduction section
727 # +32 saved $rptr
728 # +40 saved $nptr
729 # +48 saved *n0
730 # +56 saved %rsp
731 # +64 t[2*$num]
732 #
733 mov $rptr,32(%rsp) # save $rptr
734 mov $nptr,40(%rsp)
735 mov $n0, 48(%rsp)
736 mov %r11, 56(%rsp) # save original %rsp
737.Lsqr4x_body:
738 ##############################################################
739 # Squaring part:
740 #
741 # a) multiply-n-add everything but a[i]*a[i];
742 # b) shift result of a) by 1 to the left and accumulate
743 # a[i]*a[i] products;
744 #
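	# In other words: a[i]*a[j] occurs twice in the square for
	# every i != j, so the cross products are accumulated once,
	# the whole vector is then doubled with a 1-bit left shift,
	# and the a[i]*a[i] squares are folded in during that same
	# shift pass (.Lsqr4x_shift_n_add below).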
745 lea 32(%r10),$i # $i=-($num-32)
746 lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2]
747
748 mov $num,$j # $j=$num
749
750 # comments apply to $num==8 case
751 mov -32($aptr,$i),$a0 # a[0]
752 lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
753 mov -24($aptr,$i),%rax # a[1]
754 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
755 mov -16($aptr,$i),$ai # a[2]
756 mov %rax,$a1
757
758 mul $a0 # a[1]*a[0]
759 mov %rax,$A0[0] # a[1]*a[0]
760 mov $ai,%rax # a[2]
761 mov %rdx,$A0[1]
762 mov $A0[0],-24($tptr,$i) # t[1]
763
764 xor $A0[0],$A0[0]
765 mul $a0 # a[2]*a[0]
766 add %rax,$A0[1]
767 mov $ai,%rax
768 adc %rdx,$A0[0]
769 mov $A0[1],-16($tptr,$i) # t[2]
770
771 lea -16($i),$j # j=-16
772
773
774 mov 8($aptr,$j),$ai # a[3]
775 mul $a1 # a[2]*a[1]
776 mov %rax,$A1[0] # a[2]*a[1]+t[3]
777 mov $ai,%rax
778 mov %rdx,$A1[1]
779
780 xor $A0[1],$A0[1]
781 add $A1[0],$A0[0]
782 lea 16($j),$j
783 adc \$0,$A0[1]
784 mul $a0 # a[3]*a[0]
785 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
786 mov $ai,%rax
787 adc %rdx,$A0[1]
788 mov $A0[0],-8($tptr,$j) # t[3]
789 jmp .Lsqr4x_1st
790
791.align 16
792.Lsqr4x_1st:
793 mov ($aptr,$j),$ai # a[4]
794 xor $A1[0],$A1[0]
795 mul $a1 # a[3]*a[1]
796 add %rax,$A1[1] # a[3]*a[1]+t[4]
797 mov $ai,%rax
798 adc %rdx,$A1[0]
799
800 xor $A0[0],$A0[0]
801 add $A1[1],$A0[1]
802 adc \$0,$A0[0]
803 mul $a0 # a[4]*a[0]
804 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
805 mov $ai,%rax # a[3]
806 adc %rdx,$A0[0]
807 mov $A0[1],($tptr,$j) # t[4]
808
809
810 mov 8($aptr,$j),$ai # a[5]
811 xor $A1[1],$A1[1]
812 mul $a1 # a[4]*a[3]
813 add %rax,$A1[0] # a[4]*a[3]+t[5]
814 mov $ai,%rax
815 adc %rdx,$A1[1]
816
817 xor $A0[1],$A0[1]
818 add $A1[0],$A0[0]
819 adc \$0,$A0[1]
820 mul $a0 # a[5]*a[2]
821 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
822 mov $ai,%rax
823 adc %rdx,$A0[1]
824 mov $A0[0],8($tptr,$j) # t[5]
825
826 mov 16($aptr,$j),$ai # a[6]
827 xor $A1[0],$A1[0]
828 mul $a1 # a[5]*a[3]
829 add %rax,$A1[1] # a[5]*a[3]+t[6]
830 mov $ai,%rax
831 adc %rdx,$A1[0]
832
833 xor $A0[0],$A0[0]
834 add $A1[1],$A0[1]
835 adc \$0,$A0[0]
836 mul $a0 # a[6]*a[2]
837 add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6]
838 mov $ai,%rax # a[3]
839 adc %rdx,$A0[0]
840 mov $A0[1],16($tptr,$j) # t[6]
841
842
843 mov 24($aptr,$j),$ai # a[7]
844 xor $A1[1],$A1[1]
845 mul $a1 # a[6]*a[5]
846 add %rax,$A1[0] # a[6]*a[5]+t[7]
847 mov $ai,%rax
848 adc %rdx,$A1[1]
849
850 xor $A0[1],$A0[1]
851 add $A1[0],$A0[0]
852 lea 32($j),$j
853 adc \$0,$A0[1]
854 mul $a0 # a[7]*a[4]
855	add	%rax,$A0[0]	# a[7]*a[4]+a[6]*a[5]+t[7]
856 mov $ai,%rax
857 adc %rdx,$A0[1]
858 mov $A0[0],-8($tptr,$j) # t[7]
859
860 cmp \$0,$j
861 jne .Lsqr4x_1st
862
863 xor $A1[0],$A1[0]
864 add $A0[1],$A1[1]
865 adc \$0,$A1[0]
866 mul $a1 # a[7]*a[5]
867 add %rax,$A1[1]
868 adc %rdx,$A1[0]
869
870 mov $A1[1],($tptr) # t[8]
871 lea 16($i),$i
872 mov $A1[0],8($tptr) # t[9]
873 jmp .Lsqr4x_outer
874
875.align 16
876.Lsqr4x_outer: # comments apply to $num==6 case
877 mov -32($aptr,$i),$a0 # a[0]
878 lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
879 mov -24($aptr,$i),%rax # a[1]
880 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
881 mov -16($aptr,$i),$ai # a[2]
882 mov %rax,$a1
883
884 mov -24($tptr,$i),$A0[0] # t[1]
885 xor $A0[1],$A0[1]
886 mul $a0 # a[1]*a[0]
887 add %rax,$A0[0] # a[1]*a[0]+t[1]
888 mov $ai,%rax # a[2]
889 adc %rdx,$A0[1]
890 mov $A0[0],-24($tptr,$i) # t[1]
891
892 xor $A0[0],$A0[0]
893 add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2]
894 adc \$0,$A0[0]
895 mul $a0 # a[2]*a[0]
896 add %rax,$A0[1]
897 mov $ai,%rax
898 adc %rdx,$A0[0]
899 mov $A0[1],-16($tptr,$i) # t[2]
900
901 lea -16($i),$j # j=-16
902 xor $A1[0],$A1[0]
903
904
905 mov 8($aptr,$j),$ai # a[3]
906 xor $A1[1],$A1[1]
907 add 8($tptr,$j),$A1[0]
908 adc \$0,$A1[1]
909 mul $a1 # a[2]*a[1]
910 add %rax,$A1[0] # a[2]*a[1]+t[3]
911 mov $ai,%rax
912 adc %rdx,$A1[1]
913
914 xor $A0[1],$A0[1]
915 add $A1[0],$A0[0]
916 adc \$0,$A0[1]
917 mul $a0 # a[3]*a[0]
918 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
919 mov $ai,%rax
920 adc %rdx,$A0[1]
921 mov $A0[0],8($tptr,$j) # t[3]
922
923 lea 16($j),$j
924 jmp .Lsqr4x_inner
925
926.align 16
927.Lsqr4x_inner:
928 mov ($aptr,$j),$ai # a[4]
929 xor $A1[0],$A1[0]
930 add ($tptr,$j),$A1[1]
931 adc \$0,$A1[0]
932 mul $a1 # a[3]*a[1]
933 add %rax,$A1[1] # a[3]*a[1]+t[4]
934 mov $ai,%rax
935 adc %rdx,$A1[0]
936
937 xor $A0[0],$A0[0]
938 add $A1[1],$A0[1]
939 adc \$0,$A0[0]
940 mul $a0 # a[4]*a[0]
941 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
942 mov $ai,%rax # a[3]
943 adc %rdx,$A0[0]
944 mov $A0[1],($tptr,$j) # t[4]
945
946 mov 8($aptr,$j),$ai # a[5]
947 xor $A1[1],$A1[1]
948 add 8($tptr,$j),$A1[0]
949 adc \$0,$A1[1]
950 mul $a1 # a[4]*a[3]
951 add %rax,$A1[0] # a[4]*a[3]+t[5]
952 mov $ai,%rax
953 adc %rdx,$A1[1]
954
955 xor $A0[1],$A0[1]
956 add $A1[0],$A0[0]
957 lea 16($j),$j # j++
958 adc \$0,$A0[1]
959 mul $a0 # a[5]*a[2]
960 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
961 mov $ai,%rax
962 adc %rdx,$A0[1]
963 mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below
964
965 cmp \$0,$j
966 jne .Lsqr4x_inner
967
968 xor $A1[0],$A1[0]
969 add $A0[1],$A1[1]
970 adc \$0,$A1[0]
971 mul $a1 # a[5]*a[3]
972 add %rax,$A1[1]
973 adc %rdx,$A1[0]
974
975 mov $A1[1],($tptr) # t[6], "preloaded t[2]" below
976 mov $A1[0],8($tptr) # t[7], "preloaded t[3]" below
977
978 add \$16,$i
979 jnz .Lsqr4x_outer
980
981 # comments apply to $num==4 case
982 mov -32($aptr),$a0 # a[0]
983 lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
984 mov -24($aptr),%rax # a[1]
985 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
986 mov -16($aptr),$ai # a[2]
987 mov %rax,$a1
988
989 xor $A0[1],$A0[1]
990 mul $a0 # a[1]*a[0]
991 add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1]
992 mov $ai,%rax # a[2]
993 adc %rdx,$A0[1]
994 mov $A0[0],-24($tptr) # t[1]
995
996 xor $A0[0],$A0[0]
997 add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2]
998 adc \$0,$A0[0]
999 mul $a0 # a[2]*a[0]
1000 add %rax,$A0[1]
1001 mov $ai,%rax
1002 adc %rdx,$A0[0]
1003 mov $A0[1],-16($tptr) # t[2]
1004
1005 mov -8($aptr),$ai # a[3]
1006 mul $a1 # a[2]*a[1]
1007 add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3]
1008 mov $ai,%rax
1009 adc \$0,%rdx
1010
1011 xor $A0[1],$A0[1]
1012 add $A1[0],$A0[0]
1013 mov %rdx,$A1[1]
1014 adc \$0,$A0[1]
1015 mul $a0 # a[3]*a[0]
1016 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
1017 mov $ai,%rax
1018 adc %rdx,$A0[1]
1019 mov $A0[0],-8($tptr) # t[3]
1020
1021 xor $A1[0],$A1[0]
1022 add $A0[1],$A1[1]
1023 adc \$0,$A1[0]
1024 mul $a1 # a[3]*a[1]
1025 add %rax,$A1[1]
1026 mov -16($aptr),%rax # a[2]
1027 adc %rdx,$A1[0]
1028
1029 mov $A1[1],($tptr) # t[4]
1030 mov $A1[0],8($tptr) # t[5]
1031
1032 mul $ai # a[2]*a[3]
1033___
1034{
1035my ($shift,$carry)=($a0,$a1);
1036my @S=(@A1,$ai,$n0);
1037$code.=<<___;
1038 add \$16,$i
1039 xor $shift,$shift
1040 sub $num,$i # $i=16-$num
1041 xor $carry,$carry
1042
1043 add $A1[0],%rax # t[5]
1044 adc \$0,%rdx
1045 mov %rax,8($tptr) # t[5]
1046 mov %rdx,16($tptr) # t[6]
1047 mov $carry,24($tptr) # t[7]
1048
1049 mov -16($aptr,$i),%rax # a[0]
1050 lea 64(%rsp,$num,2),$tptr
1051 xor $A0[0],$A0[0] # t[0]
1052 mov -24($tptr,$i,2),$A0[1] # t[1]
1053
1054 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1055 shr \$63,$A0[0]
1056 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1057 shr \$63,$A0[1]
1058 or $A0[0],$S[1] # | t[2*i]>>63
1059 mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
1060 mov $A0[1],$shift # shift=t[2*i+1]>>63
1061 mul %rax # a[i]*a[i]
1062 neg $carry # mov $carry,cf
1063 mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
1064 adc %rax,$S[0]
1065 mov -8($aptr,$i),%rax # a[i+1] # prefetch
1066 mov $S[0],-32($tptr,$i,2)
1067 adc %rdx,$S[1]
1068
1069 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1070 mov $S[1],-24($tptr,$i,2)
1071 sbb $carry,$carry # mov cf,$carry
1072 shr \$63,$A0[0]
1073 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1074 shr \$63,$A0[1]
1075 or $A0[0],$S[3] # | t[2*i]>>63
1076 mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
1077 mov $A0[1],$shift # shift=t[2*i+1]>>63
1078 mul %rax # a[i]*a[i]
1079 neg $carry # mov $carry,cf
1080 mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
1081 adc %rax,$S[2]
1082 mov 0($aptr,$i),%rax # a[i+1] # prefetch
1083 mov $S[2],-16($tptr,$i,2)
1084 adc %rdx,$S[3]
1085 lea 16($i),$i
1086 mov $S[3],-40($tptr,$i,2)
1087 sbb $carry,$carry # mov cf,$carry
1088 jmp .Lsqr4x_shift_n_add
1089
1090.align 16
1091.Lsqr4x_shift_n_add:
1092 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1093 shr \$63,$A0[0]
1094 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1095 shr \$63,$A0[1]
1096 or $A0[0],$S[1] # | t[2*i]>>63
1097 mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
1098 mov $A0[1],$shift # shift=t[2*i+1]>>63
1099 mul %rax # a[i]*a[i]
1100 neg $carry # mov $carry,cf
1101 mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
1102 adc %rax,$S[0]
1103 mov -8($aptr,$i),%rax # a[i+1] # prefetch
1104 mov $S[0],-32($tptr,$i,2)
1105 adc %rdx,$S[1]
1106
1107 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1108 mov $S[1],-24($tptr,$i,2)
1109 sbb $carry,$carry # mov cf,$carry
1110 shr \$63,$A0[0]
1111 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1112 shr \$63,$A0[1]
1113 or $A0[0],$S[3] # | t[2*i]>>63
1114 mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
1115 mov $A0[1],$shift # shift=t[2*i+1]>>63
1116 mul %rax # a[i]*a[i]
1117 neg $carry # mov $carry,cf
1118 mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
1119 adc %rax,$S[2]
1120 mov 0($aptr,$i),%rax # a[i+1] # prefetch
1121 mov $S[2],-16($tptr,$i,2)
1122 adc %rdx,$S[3]
1123
1124 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1125 mov $S[3],-8($tptr,$i,2)
1126 sbb $carry,$carry # mov cf,$carry
1127 shr \$63,$A0[0]
1128 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1129 shr \$63,$A0[1]
1130 or $A0[0],$S[1] # | t[2*i]>>63
1131 mov 16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
1132 mov $A0[1],$shift # shift=t[2*i+1]>>63
1133 mul %rax # a[i]*a[i]
1134 neg $carry # mov $carry,cf
1135 mov 24($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
1136 adc %rax,$S[0]
1137 mov 8($aptr,$i),%rax # a[i+1] # prefetch
1138 mov $S[0],0($tptr,$i,2)
1139 adc %rdx,$S[1]
1140
1141 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1142 mov $S[1],8($tptr,$i,2)
1143 sbb $carry,$carry # mov cf,$carry
1144 shr \$63,$A0[0]
1145 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1146 shr \$63,$A0[1]
1147 or $A0[0],$S[3] # | t[2*i]>>63
1148 mov 32($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
1149 mov $A0[1],$shift # shift=t[2*i+1]>>63
1150 mul %rax # a[i]*a[i]
1151 neg $carry # mov $carry,cf
1152 mov 40($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
1153 adc %rax,$S[2]
1154 mov 16($aptr,$i),%rax # a[i+1] # prefetch
1155 mov $S[2],16($tptr,$i,2)
1156 adc %rdx,$S[3]
1157 mov $S[3],24($tptr,$i,2)
1158 sbb $carry,$carry # mov cf,$carry
1159 add \$32,$i
1160 jnz .Lsqr4x_shift_n_add
1161
1162 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1163 shr \$63,$A0[0]
1164 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1165 shr \$63,$A0[1]
1166 or $A0[0],$S[1] # | t[2*i]>>63
1167 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
1168 mov $A0[1],$shift # shift=t[2*i+1]>>63
1169 mul %rax # a[i]*a[i]
1170 neg $carry # mov $carry,cf
1171 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
1172 adc %rax,$S[0]
1173 mov -8($aptr),%rax # a[i+1] # prefetch
1174 mov $S[0],-32($tptr)
1175 adc %rdx,$S[1]
1176
1177 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
1178 mov $S[1],-24($tptr)
1179 sbb $carry,$carry # mov cf,$carry
1180 shr \$63,$A0[0]
1181 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1182 shr \$63,$A0[1]
1183 or $A0[0],$S[3] # | t[2*i]>>63
1184 mul %rax # a[i]*a[i]
1185 neg $carry # mov $carry,cf
1186 adc %rax,$S[2]
1187 adc %rdx,$S[3]
1188 mov $S[2],-16($tptr)
1189 mov $S[3],-8($tptr)
1190___
1191}
1192##############################################################
1193# Montgomery reduction part, "word-by-word" algorithm.
1194#
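# Roughly: each pass computes m = t[0]*n0 mod 2^64 for the current
# window and adds m*n[] at that offset, zeroing the bottom $num words
# of t[]; the upper half of t[2*$num] plus $topbit then holds the
# reduced value, which the post-condition below conditionally
# subtracts the modulus from.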
1195{
1196my ($topbit,$nptr)=("%rbp",$aptr);
1197my ($m0,$m1)=($a0,$a1);
1198my @Ni=("%rbx","%r9");
1199$code.=<<___;
1200 mov 40(%rsp),$nptr # restore $nptr
1201 mov 48(%rsp),$n0 # restore *n0
1202 xor $j,$j
1203 mov $num,0(%rsp) # save $num
1204 sub $num,$j # $j=-$num
1205 mov 64(%rsp),$A0[0] # t[0] # modsched #
1206 mov $n0,$m0 # # modsched #
1207 lea 64(%rsp,$num,2),%rax # end of t[] buffer
1208 lea 64(%rsp,$num),$tptr # end of t[] window
1209 mov %rax,8(%rsp) # save end of t[] buffer
1210 lea ($nptr,$num),$nptr # end of n[] buffer
1211 xor $topbit,$topbit # $topbit=0
1212
1213 mov 0($nptr,$j),%rax # n[0] # modsched #
1214 mov 8($nptr,$j),$Ni[1] # n[1] # modsched #
1215 imulq $A0[0],$m0 # m0=t[0]*n0 # modsched #
1216 mov %rax,$Ni[0] # # modsched #
1217 jmp .Lsqr4x_mont_outer
1218
1219.align 16
1220.Lsqr4x_mont_outer:
1221 xor $A0[1],$A0[1]
1222 mul $m0 # n[0]*m0
1223 add %rax,$A0[0] # n[0]*m0+t[0]
1224 mov $Ni[1],%rax
1225 adc %rdx,$A0[1]
1226 mov $n0,$m1
1227
1228 xor $A0[0],$A0[0]
1229 add 8($tptr,$j),$A0[1]
1230 adc \$0,$A0[0]
1231 mul $m0 # n[1]*m0
1232 add %rax,$A0[1] # n[1]*m0+t[1]
1233 mov $Ni[0],%rax
1234 adc %rdx,$A0[0]
1235
1236 imulq $A0[1],$m1
1237
1238 mov 16($nptr,$j),$Ni[0] # n[2]
1239 xor $A1[1],$A1[1]
1240 add $A0[1],$A1[0]
1241 adc \$0,$A1[1]
1242 mul $m1 # n[0]*m1
1243 add %rax,$A1[0] # n[0]*m1+"t[1]"
1244 mov $Ni[0],%rax
1245 adc %rdx,$A1[1]
1246 mov $A1[0],8($tptr,$j) # "t[1]"
1247
1248 xor $A0[1],$A0[1]
1249 add 16($tptr,$j),$A0[0]
1250 adc \$0,$A0[1]
1251 mul $m0 # n[2]*m0
1252 add %rax,$A0[0] # n[2]*m0+t[2]
1253 mov $Ni[1],%rax
1254 adc %rdx,$A0[1]
1255
1256 mov 24($nptr,$j),$Ni[1] # n[3]
1257 xor $A1[0],$A1[0]
1258 add $A0[0],$A1[1]
1259 adc \$0,$A1[0]
1260 mul $m1 # n[1]*m1
1261 add %rax,$A1[1] # n[1]*m1+"t[2]"
1262 mov $Ni[1],%rax
1263 adc %rdx,$A1[0]
1264 mov $A1[1],16($tptr,$j) # "t[2]"
1265
1266 xor $A0[0],$A0[0]
1267 add 24($tptr,$j),$A0[1]
1268 lea 32($j),$j
1269 adc \$0,$A0[0]
1270 mul $m0 # n[3]*m0
1271 add %rax,$A0[1] # n[3]*m0+t[3]
1272 mov $Ni[0],%rax
1273 adc %rdx,$A0[0]
1274 jmp .Lsqr4x_mont_inner
1275
1276.align 16
1277.Lsqr4x_mont_inner:
1278 mov ($nptr,$j),$Ni[0] # n[4]
1279 xor $A1[1],$A1[1]
1280 add $A0[1],$A1[0]
1281 adc \$0,$A1[1]
1282 mul $m1 # n[2]*m1
1283 add %rax,$A1[0] # n[2]*m1+"t[3]"
1284 mov $Ni[0],%rax
1285 adc %rdx,$A1[1]
1286 mov $A1[0],-8($tptr,$j) # "t[3]"
1287
1288 xor $A0[1],$A0[1]
1289 add ($tptr,$j),$A0[0]
1290 adc \$0,$A0[1]
1291 mul $m0 # n[4]*m0
1292 add %rax,$A0[0] # n[4]*m0+t[4]
1293 mov $Ni[1],%rax
1294 adc %rdx,$A0[1]
1295
1296 mov 8($nptr,$j),$Ni[1] # n[5]
1297 xor $A1[0],$A1[0]
1298 add $A0[0],$A1[1]
1299 adc \$0,$A1[0]
1300 mul $m1 # n[3]*m1
1301 add %rax,$A1[1] # n[3]*m1+"t[4]"
1302 mov $Ni[1],%rax
1303 adc %rdx,$A1[0]
1304 mov $A1[1],($tptr,$j) # "t[4]"
1305
1306 xor $A0[0],$A0[0]
1307 add 8($tptr,$j),$A0[1]
1308 adc \$0,$A0[0]
1309 mul $m0 # n[5]*m0
1310 add %rax,$A0[1] # n[5]*m0+t[5]
1311 mov $Ni[0],%rax
1312 adc %rdx,$A0[0]
1313
1314
1315 mov 16($nptr,$j),$Ni[0] # n[6]
1316 xor $A1[1],$A1[1]
1317 add $A0[1],$A1[0]
1318 adc \$0,$A1[1]
1319 mul $m1 # n[4]*m1
1320 add %rax,$A1[0] # n[4]*m1+"t[5]"
1321 mov $Ni[0],%rax
1322 adc %rdx,$A1[1]
1323 mov $A1[0],8($tptr,$j) # "t[5]"
1324
1325 xor $A0[1],$A0[1]
1326 add 16($tptr,$j),$A0[0]
1327 adc \$0,$A0[1]
1328 mul $m0 # n[6]*m0
1329 add %rax,$A0[0] # n[6]*m0+t[6]
1330 mov $Ni[1],%rax
1331 adc %rdx,$A0[1]
1332
1333 mov 24($nptr,$j),$Ni[1] # n[7]
1334 xor $A1[0],$A1[0]
1335 add $A0[0],$A1[1]
1336 adc \$0,$A1[0]
1337 mul $m1 # n[5]*m1
1338 add %rax,$A1[1] # n[5]*m1+"t[6]"
1339 mov $Ni[1],%rax
1340 adc %rdx,$A1[0]
1341 mov $A1[1],16($tptr,$j) # "t[6]"
1342
1343 xor $A0[0],$A0[0]
1344 add 24($tptr,$j),$A0[1]
1345 lea 32($j),$j
1346 adc \$0,$A0[0]
1347 mul $m0 # n[7]*m0
1348 add %rax,$A0[1] # n[7]*m0+t[7]
1349 mov $Ni[0],%rax
1350 adc %rdx,$A0[0]
1351 cmp \$0,$j
1352 jne .Lsqr4x_mont_inner
1353
1354 sub 0(%rsp),$j # $j=-$num # modsched #
1355 mov $n0,$m0 # # modsched #
1356
1357 xor $A1[1],$A1[1]
1358 add $A0[1],$A1[0]
1359 adc \$0,$A1[1]
1360 mul $m1 # n[6]*m1
1361 add %rax,$A1[0] # n[6]*m1+"t[7]"
1362 mov $Ni[1],%rax
1363 adc %rdx,$A1[1]
1364 mov $A1[0],-8($tptr) # "t[7]"
1365
1366 xor $A0[1],$A0[1]
1367 add ($tptr),$A0[0] # +t[8]
1368 adc \$0,$A0[1]
1369 mov 0($nptr,$j),$Ni[0] # n[0] # modsched #
1370 add $topbit,$A0[0]
1371 adc \$0,$A0[1]
1372
1373 imulq 16($tptr,$j),$m0 # m0=t[0]*n0 # modsched #
1374 xor $A1[0],$A1[0]
1375 mov 8($nptr,$j),$Ni[1] # n[1] # modsched #
1376 add $A0[0],$A1[1]
1377 mov 16($tptr,$j),$A0[0] # t[0] # modsched #
1378 adc \$0,$A1[0]
1379 mul $m1 # n[7]*m1
1380 add %rax,$A1[1] # n[7]*m1+"t[8]"
1381 mov $Ni[0],%rax # # modsched #
1382 adc %rdx,$A1[0]
1383 mov $A1[1],($tptr) # "t[8]"
1384
1385 xor $topbit,$topbit
1386 add 8($tptr),$A1[0] # +t[9]
1387 adc $topbit,$topbit
1388 add $A0[1],$A1[0]
1389 lea 16($tptr),$tptr # "t[$num]>>128"
1390 adc \$0,$topbit
1391 mov $A1[0],-8($tptr) # "t[9]"
1392 cmp 8(%rsp),$tptr # are we done?
1393 jb .Lsqr4x_mont_outer
1394
1395 mov 0(%rsp),$num # restore $num
1396 mov $topbit,($tptr) # save $topbit
1397___
1398}
1399##############################################################
1400# Post-condition, 4x unrolled copy from bn_mul_mont
1401#
1402{
1403my ($tptr,$nptr)=("%rbx",$aptr);
1404my @ri=("%rax","%rdx","%r10","%r11");
1405$code.=<<___;
1406 mov 64(%rsp,$num),@ri[0] # tp[0]
1407 lea 64(%rsp,$num),$tptr # upper half of t[2*$num] holds result
1408 mov 40(%rsp),$nptr # restore $nptr
1409 shr \$5,$num # num/4
1410 mov 8($tptr),@ri[1] # t[1]
1411 xor $i,$i # i=0 and clear CF!
1412
1413 mov 32(%rsp),$rptr # restore $rptr
1414 sub 0($nptr),@ri[0]
1415 mov 16($tptr),@ri[2] # t[2]
1416 mov 24($tptr),@ri[3] # t[3]
1417 sbb 8($nptr),@ri[1]
1418 lea -1($num),$j # j=num/4-1
1419 jmp .Lsqr4x_sub
1420.align 16
1421.Lsqr4x_sub:
1422 mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i]
1423 mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i]
1424 sbb 16($nptr,$i,8),@ri[2]
1425 mov 32($tptr,$i,8),@ri[0] # tp[i+1]
1426 mov 40($tptr,$i,8),@ri[1]
1427 sbb 24($nptr,$i,8),@ri[3]
1428 mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i]
1429 mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i]
1430 sbb 32($nptr,$i,8),@ri[0]
1431 mov 48($tptr,$i,8),@ri[2]
1432 mov 56($tptr,$i,8),@ri[3]
1433 sbb 40($nptr,$i,8),@ri[1]
1434 lea 4($i),$i # i++
1435 dec $j # doesn't affect CF!
1436 jnz .Lsqr4x_sub
1437
1438 mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i]
1439 mov 32($tptr,$i,8),@ri[0] # load overflow bit
1440 sbb 16($nptr,$i,8),@ri[2]
1441 mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i]
1442 sbb 24($nptr,$i,8),@ri[3]
1443 mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i]
1444
1445 sbb \$0,@ri[0] # handle upmost overflow bit
1446 mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i]
1447 xor $i,$i # i=0
1448 and @ri[0],$tptr
1449 not @ri[0]
1450 mov $rptr,$nptr
1451 and @ri[0],$nptr
1452 lea -1($num),$j
1453 or $nptr,$tptr # tp=borrow?tp:rp
1454
1455 pxor %xmm0,%xmm0
1456 lea 64(%rsp,$num,8),$nptr
1457 movdqu ($tptr),%xmm1
1458 lea ($nptr,$num,8),$nptr
1459 movdqa %xmm0,64(%rsp) # zap lower half of temporary vector
1460 movdqa %xmm0,($nptr) # zap upper half of temporary vector
1461 movdqu %xmm1,($rptr)
1462 jmp .Lsqr4x_copy
1463.align 16
1464.Lsqr4x_copy: # copy or in-place refresh
1465 movdqu 16($tptr,$i),%xmm2
1466 movdqu 32($tptr,$i),%xmm1
1467 movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector
1468 movdqa %xmm0,96(%rsp,$i) # zap lower half of temporary vector
1469 movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector
1470 movdqa %xmm0,32($nptr,$i) # zap upper half of temporary vector
1471 movdqu %xmm2,16($rptr,$i)
1472 movdqu %xmm1,32($rptr,$i)
1473 lea 32($i),$i
1474 dec $j
1475 jnz .Lsqr4x_copy
1476
1477 movdqu 16($tptr,$i),%xmm2
1478 movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector
1479 movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector
1480 movdqu %xmm2,16($rptr,$i)
1481___
1482}
1483$code.=<<___;
1484 mov 56(%rsp),%rsi # restore %rsp
1485 mov \$1,%rax
1486 mov 0(%rsi),%r15
1487 mov 8(%rsi),%r14
1488 mov 16(%rsi),%r13
1489 mov 24(%rsi),%r12
1490 mov 32(%rsi),%rbp
1491 mov 40(%rsi),%rbx
1492 lea 48(%rsi),%rsp
1493.Lsqr4x_epilogue:
1494 ret
1495.size bn_sqr4x_mont,.-bn_sqr4x_mont
1496___
1497}}}
1498$code.=<<___;
1499.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1500.align 16
1501___
1502
1503print $code;
1504close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont5.pl b/src/lib/libcrypto/bn/asm/x86_64-mont5.pl
deleted file mode 100755
index 9c88884d42..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-mont5.pl
+++ /dev/null
@@ -1,1071 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# August 2011.
11#
12# Companion to x86_64-mont.pl that optimizes cache-timing attack
13# countermeasures. The subroutines are produced by replacing bp[i]
14# references in their x86_64-mont.pl counterparts with cache-neutral
15# references to the powers table computed in BN_mod_exp_mont_consttime.
16# In addition, a subroutine that scatters elements of the powers table
17# is implemented, so that scattering/gathering can be tuned without
18# modifying bn_exp.c.
19
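# For reference: the gather code below is what makes the bp[i]
# references cache-neutral.  Every limb of the selected power is read
# from all of the cache lines it could live in (four movq loads), and
# the masks in %xmm4-%xmm7, set up from .Lmagic_masks, keep only the
# line that actually holds it, so the set of cache lines touched is
# independent of the (secret) index.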
20$flavour = shift;
21$output = shift;
22if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
23
24$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
25
26$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
27( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
28( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
29die "can't locate x86_64-xlate.pl";
30
31open OUT,"| \"$^X\" $xlate $flavour $output";
32*STDOUT=*OUT;
33
34# int bn_mul_mont_gather5(
35$rp="%rdi"; # BN_ULONG *rp,
36$ap="%rsi"; # const BN_ULONG *ap,
37$bp="%rdx"; # const BN_ULONG *bp,
38$np="%rcx"; # const BN_ULONG *np,
39$n0="%r8"; # const BN_ULONG *n0,
40$num="%r9"; # int num,
41 # int idx); # 0 to 2^5-1, "index" in $bp holding
42 # pre-computed powers of a', interlaced
43	 # in such a manner that b[0] is $bp[idx],
44 # b[1] is [2^5+idx], etc.
45$lo0="%r10";
46$hi0="%r11";
47$hi1="%r13";
48$i="%r14";
49$j="%r15";
50$m0="%rbx";
51$m1="%rbp";
52
53$code=<<___;
54.text
55
56.globl bn_mul_mont_gather5
57.type bn_mul_mont_gather5,\@function,6
58.align 64
59bn_mul_mont_gather5:
60 test \$3,${num}d
61 jnz .Lmul_enter
62 cmp \$8,${num}d
63 jb .Lmul_enter
64 jmp .Lmul4x_enter
65
66.align 16
67.Lmul_enter:
68 mov ${num}d,${num}d
69 mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
70 push %rbx
71 push %rbp
72 push %r12
73 push %r13
74 push %r14
75 push %r15
76___
77$code.=<<___ if ($win64);
78 lea -0x28(%rsp),%rsp
79 movaps %xmm6,(%rsp)
80 movaps %xmm7,0x10(%rsp)
81.Lmul_alloca:
82___
83$code.=<<___;
84 mov %rsp,%rax
85 lea 2($num),%r11
86 neg %r11
87 lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
88 and \$-1024,%rsp # minimize TLB usage
89
90 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
91.Lmul_body:
92 mov $bp,%r12 # reassign $bp
93___
94 $bp="%r12";
95 $STRIDE=2**5*8; # 5 is "window size"
96 $N=$STRIDE/4; # should match cache line size
97$code.=<<___;
98 mov %r10,%r11
99 shr \$`log($N/8)/log(2)`,%r10
100 and \$`$N/8-1`,%r11
101 not %r10
102 lea .Lmagic_masks(%rip),%rax
103 and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
104 lea 96($bp,%r11,8),$bp # pointer within 1st cache line
105 movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
106 movq 8(%rax,%r10,8),%xmm5 # cache line contains element
107 movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
108 movq 24(%rax,%r10,8),%xmm7
109
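#	The selection arithmetic above, spelled out: with window size 5
#	the table stride is 2^5*8 = 256 bytes, i.e. four 64-byte cache
#	lines per word index. A rough C model (a sketch only):
#
#		line = idx >> 3;		# which of 4 cache lines
#		word = idx & 7;			# which qword within line
#		mask[k] = (k == line) ? ~0 : 0;	# from .Lmagic_masks
#
#	Every movq below touches all four lines and the pand/por chain
#	keeps only the masked one, so the access pattern does not
#	depend on the secret index.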
110 movq `0*$STRIDE/4-96`($bp),%xmm0
111 movq `1*$STRIDE/4-96`($bp),%xmm1
112 pand %xmm4,%xmm0
113 movq `2*$STRIDE/4-96`($bp),%xmm2
114 pand %xmm5,%xmm1
115 movq `3*$STRIDE/4-96`($bp),%xmm3
116 pand %xmm6,%xmm2
117 por %xmm1,%xmm0
118 pand %xmm7,%xmm3
119 por %xmm2,%xmm0
120 lea $STRIDE($bp),$bp
121 por %xmm3,%xmm0
122
123 movq %xmm0,$m0 # m0=bp[0]
124
125 mov ($n0),$n0 # pull n0[0] value
126 mov ($ap),%rax
127
128 xor $i,$i # i=0
129 xor $j,$j # j=0
130
131 movq `0*$STRIDE/4-96`($bp),%xmm0
132 movq `1*$STRIDE/4-96`($bp),%xmm1
133 pand %xmm4,%xmm0
134 movq `2*$STRIDE/4-96`($bp),%xmm2
135 pand %xmm5,%xmm1
136
137 mov $n0,$m1
138 mulq $m0 # ap[0]*bp[0]
139 mov %rax,$lo0
140 mov ($np),%rax
141
142 movq `3*$STRIDE/4-96`($bp),%xmm3
143 pand %xmm6,%xmm2
144 por %xmm1,%xmm0
145 pand %xmm7,%xmm3
146
147 imulq $lo0,$m1 # "tp[0]"*n0
148 mov %rdx,$hi0
149
150 por %xmm2,%xmm0
151 lea $STRIDE($bp),$bp
152 por %xmm3,%xmm0
153
154 mulq $m1 # np[0]*m1
155 add %rax,$lo0 # discarded
156 mov 8($ap),%rax
157 adc \$0,%rdx
158 mov %rdx,$hi1
159
160 lea 1($j),$j # j++
161 jmp .L1st_enter
162
163.align 16
164.L1st:
165 add %rax,$hi1
166 mov ($ap,$j,8),%rax
167 adc \$0,%rdx
168 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
169 mov $lo0,$hi0
170 adc \$0,%rdx
171 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
172 mov %rdx,$hi1
173
174.L1st_enter:
175 mulq $m0 # ap[j]*bp[0]
176 add %rax,$hi0
177 mov ($np,$j,8),%rax
178 adc \$0,%rdx
179 lea 1($j),$j # j++
180 mov %rdx,$lo0
181
182 mulq $m1 # np[j]*m1
183 cmp $num,$j
184 jl .L1st
185
186 movq %xmm0,$m0 # bp[1]
187
188 add %rax,$hi1
189 mov ($ap),%rax # ap[0]
190 adc \$0,%rdx
191 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
192 adc \$0,%rdx
193 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
194 mov %rdx,$hi1
195 mov $lo0,$hi0
196
197 xor %rdx,%rdx
198 add $hi0,$hi1
199 adc \$0,%rdx
200 mov $hi1,-8(%rsp,$num,8)
201 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
202
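#	A rough C model of one outer iteration of this algorithm, using
#	a 128-bit t for the products (a sketch only, not bit-exact to
#	the register scheduling; u64, u128 and the tp[] naming are
#	shorthand here, not code from this file). The first pass above
#	is the same minus the tp[j] terms:
#
#		m1 = (u64)((tp[0] + (u128)ap[0]*m0) * n0);
#		c0 = c1 = 0;
#		for (j = 0; j < num; j++) {
#			t  = (u128)ap[j]*m0 + tp[j] + c0; c0 = t >> 64;
#			t  = (u128)np[j]*m1 + (u64)t + c1; c1 = t >> 64;
#			if (j) tp[j-1] = (u64)t;  # low word 0 for j==0
#		}
#		t = (u128)c0 + c1 + tp[num];	  # fold carries, overflow
#		tp[num-1] = (u64)t; tp[num] = t >> 64;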
203 lea 1($i),$i # i++
204 jmp .Louter
205.align 16
206.Louter:
207 xor $j,$j # j=0
208 mov $n0,$m1
209 mov (%rsp),$lo0
210
211 movq `0*$STRIDE/4-96`($bp),%xmm0
212 movq `1*$STRIDE/4-96`($bp),%xmm1
213 pand %xmm4,%xmm0
214 movq `2*$STRIDE/4-96`($bp),%xmm2
215 pand %xmm5,%xmm1
216
217 mulq $m0 # ap[0]*bp[i]
218 add %rax,$lo0 # ap[0]*bp[i]+tp[0]
219 mov ($np),%rax
220 adc \$0,%rdx
221
222 movq `3*$STRIDE/4-96`($bp),%xmm3
223 pand %xmm6,%xmm2
224 por %xmm1,%xmm0
225 pand %xmm7,%xmm3
226
227 imulq $lo0,$m1 # tp[0]*n0
228 mov %rdx,$hi0
229
230 por %xmm2,%xmm0
231 lea $STRIDE($bp),$bp
232 por %xmm3,%xmm0
233
234 mulq $m1 # np[0]*m1
235 add %rax,$lo0 # discarded
236 mov 8($ap),%rax
237 adc \$0,%rdx
238 mov 8(%rsp),$lo0 # tp[1]
239 mov %rdx,$hi1
240
241 lea 1($j),$j # j++
242 jmp .Linner_enter
243
244.align 16
245.Linner:
246 add %rax,$hi1
247 mov ($ap,$j,8),%rax
248 adc \$0,%rdx
249 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
250 mov (%rsp,$j,8),$lo0
251 adc \$0,%rdx
252 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
253 mov %rdx,$hi1
254
255.Linner_enter:
256 mulq $m0 # ap[j]*bp[i]
257 add %rax,$hi0
258 mov ($np,$j,8),%rax
259 adc \$0,%rdx
260 add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
261 mov %rdx,$hi0
262 adc \$0,$hi0
263 lea 1($j),$j # j++
264
265 mulq $m1 # np[j]*m1
266 cmp $num,$j
267 jl .Linner
268
269 movq %xmm0,$m0 # bp[i+1]
270
271 add %rax,$hi1
272 mov ($ap),%rax # ap[0]
273 adc \$0,%rdx
274 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
275 mov (%rsp,$j,8),$lo0
276 adc \$0,%rdx
277 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
278 mov %rdx,$hi1
279
280 xor %rdx,%rdx
281 add $hi0,$hi1
282 adc \$0,%rdx
283 add $lo0,$hi1 # pull upmost overflow bit
284 adc \$0,%rdx
285 mov $hi1,-8(%rsp,$num,8)
286 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
287
288 lea 1($i),$i # i++
289 cmp $num,$i
290 jl .Louter
291
292 xor $i,$i # i=0 and clear CF!
293 mov (%rsp),%rax # tp[0]
294 lea (%rsp),$ap # borrow ap for tp
295 mov $num,$j # j=num
296 jmp .Lsub
297.align 16
298.Lsub: sbb ($np,$i,8),%rax
299 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
300 mov 8($ap,$i,8),%rax # tp[i+1]
301 lea 1($i),$i # i++
302 dec $j # doesn't affect CF!
303 jnz .Lsub
304
305 sbb \$0,%rax # handle upmost overflow bit
306 xor $i,$i
307 and %rax,$ap
308 not %rax
309 mov $rp,$np
310 and %rax,$np
311 mov $num,$j # j=num
312 or $np,$ap # ap=borrow?tp:rp
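#	As a C sketch, the sbb/and/not/or sequence above is a branchless
#	select (u64 and the _p names are shorthand, not identifiers from
#	this file):
#
#		mask = 0 - (u64)borrow;
#		src  = (tp_p & mask) | (rp_p & ~mask);	# tp if borrowed
#
#	i.e. keep tp when tp - np underflowed, else use rp (which
#	already holds tp - np), with no branch on secret data.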
313.align 16
314.Lcopy: # copy or in-place refresh
315 mov ($ap,$i,8),%rax
316 mov $i,(%rsp,$i,8) # zap temporary vector
317 mov %rax,($rp,$i,8) # rp[i]=tp[i]
318 lea 1($i),$i
319 sub \$1,$j
320 jnz .Lcopy
321
322 mov 8(%rsp,$num,8),%rsi # restore %rsp
323 mov \$1,%rax
324___
325$code.=<<___ if ($win64);
326 movaps (%rsi),%xmm6
327 movaps 0x10(%rsi),%xmm7
328 lea 0x28(%rsi),%rsi
329___
330$code.=<<___;
331 mov (%rsi),%r15
332 mov 8(%rsi),%r14
333 mov 16(%rsi),%r13
334 mov 24(%rsi),%r12
335 mov 32(%rsi),%rbp
336 mov 40(%rsi),%rbx
337 lea 48(%rsi),%rsp
338.Lmul_epilogue:
339 ret
340.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
341___
342{{{
343my @A=("%r10","%r11");
344my @N=("%r13","%rdi");
345$code.=<<___;
346.type bn_mul4x_mont_gather5,\@function,6
347.align 16
348bn_mul4x_mont_gather5:
349.Lmul4x_enter:
350 mov ${num}d,${num}d
351 mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
352 push %rbx
353 push %rbp
354 push %r12
355 push %r13
356 push %r14
357 push %r15
358___
359$code.=<<___ if ($win64);
360 lea -0x28(%rsp),%rsp
361 movaps %xmm6,(%rsp)
362 movaps %xmm7,0x10(%rsp)
363.Lmul4x_alloca:
364___
365$code.=<<___;
366 mov %rsp,%rax
367 lea 4($num),%r11
368 neg %r11
369 lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4))
370 and \$-1024,%rsp # minimize TLB usage
371
372 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
373.Lmul4x_body:
374 mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
375 mov %rdx,%r12 # reassign $bp
376___
377 $bp="%r12";
378 $STRIDE=2**5*8; # 5 is "window size"
379 $N=$STRIDE/4; # should match cache line size
380$code.=<<___;
381 mov %r10,%r11
382 shr \$`log($N/8)/log(2)`,%r10
383 and \$`$N/8-1`,%r11
384 not %r10
385 lea .Lmagic_masks(%rip),%rax
386 and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
387 lea 96($bp,%r11,8),$bp # pointer within 1st cache line
388 movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
389 movq 8(%rax,%r10,8),%xmm5 # cache line contains element
390 movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
391 movq 24(%rax,%r10,8),%xmm7
392
393 movq `0*$STRIDE/4-96`($bp),%xmm0
394 movq `1*$STRIDE/4-96`($bp),%xmm1
395 pand %xmm4,%xmm0
396 movq `2*$STRIDE/4-96`($bp),%xmm2
397 pand %xmm5,%xmm1
398 movq `3*$STRIDE/4-96`($bp),%xmm3
399 pand %xmm6,%xmm2
400 por %xmm1,%xmm0
401 pand %xmm7,%xmm3
402 por %xmm2,%xmm0
403 lea $STRIDE($bp),$bp
404 por %xmm3,%xmm0
405
406 movq %xmm0,$m0 # m0=bp[0]
407 mov ($n0),$n0 # pull n0[0] value
408 mov ($ap),%rax
409
410 xor $i,$i # i=0
411 xor $j,$j # j=0
412
413 movq `0*$STRIDE/4-96`($bp),%xmm0
414 movq `1*$STRIDE/4-96`($bp),%xmm1
415 pand %xmm4,%xmm0
416 movq `2*$STRIDE/4-96`($bp),%xmm2
417 pand %xmm5,%xmm1
418
419 mov $n0,$m1
420 mulq $m0 # ap[0]*bp[0]
421 mov %rax,$A[0]
422 mov ($np),%rax
423
424 movq `3*$STRIDE/4-96`($bp),%xmm3
425 pand %xmm6,%xmm2
426 por %xmm1,%xmm0
427 pand %xmm7,%xmm3
428
429 imulq $A[0],$m1 # "tp[0]"*n0
430 mov %rdx,$A[1]
431
432 por %xmm2,%xmm0
433 lea $STRIDE($bp),$bp
434 por %xmm3,%xmm0
435
436 mulq $m1 # np[0]*m1
437 add %rax,$A[0] # discarded
438 mov 8($ap),%rax
439 adc \$0,%rdx
440 mov %rdx,$N[1]
441
442 mulq $m0
443 add %rax,$A[1]
444 mov 8($np),%rax
445 adc \$0,%rdx
446 mov %rdx,$A[0]
447
448 mulq $m1
449 add %rax,$N[1]
450 mov 16($ap),%rax
451 adc \$0,%rdx
452 add $A[1],$N[1]
453 lea 4($j),$j # j+=4
454 adc \$0,%rdx
455 mov $N[1],(%rsp)
456 mov %rdx,$N[0]
457 jmp .L1st4x
458.align 16
459.L1st4x:
460 mulq $m0 # ap[j]*bp[0]
461 add %rax,$A[0]
462 mov -16($np,$j,8),%rax
463 adc \$0,%rdx
464 mov %rdx,$A[1]
465
466 mulq $m1 # np[j]*m1
467 add %rax,$N[0]
468 mov -8($ap,$j,8),%rax
469 adc \$0,%rdx
470 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
471 adc \$0,%rdx
472 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
473 mov %rdx,$N[1]
474
475 mulq $m0 # ap[j]*bp[0]
476 add %rax,$A[1]
477 mov -8($np,$j,8),%rax
478 adc \$0,%rdx
479 mov %rdx,$A[0]
480
481 mulq $m1 # np[j]*m1
482 add %rax,$N[1]
483 mov ($ap,$j,8),%rax
484 adc \$0,%rdx
485 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
486 adc \$0,%rdx
487 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
488 mov %rdx,$N[0]
489
490 mulq $m0 # ap[j]*bp[0]
491 add %rax,$A[0]
492 mov ($np,$j,8),%rax
493 adc \$0,%rdx
494 mov %rdx,$A[1]
495
496 mulq $m1 # np[j]*m1
497 add %rax,$N[0]
498 mov 8($ap,$j,8),%rax
499 adc \$0,%rdx
500 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
501 adc \$0,%rdx
502 mov $N[0],-8(%rsp,$j,8) # tp[j-1]
503 mov %rdx,$N[1]
504
505 mulq $m0 # ap[j]*bp[0]
506 add %rax,$A[1]
507 mov 8($np,$j,8),%rax
508 adc \$0,%rdx
509 lea 4($j),$j # j+=4
510 mov %rdx,$A[0]
511
512 mulq $m1 # np[j]*m1
513 add %rax,$N[1]
514 mov -16($ap,$j,8),%rax
515 adc \$0,%rdx
516 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
517 adc \$0,%rdx
518 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
519 mov %rdx,$N[0]
520 cmp $num,$j
521 jl .L1st4x
522
523 mulq $m0 # ap[j]*bp[0]
524 add %rax,$A[0]
525 mov -16($np,$j,8),%rax
526 adc \$0,%rdx
527 mov %rdx,$A[1]
528
529 mulq $m1 # np[j]*m1
530 add %rax,$N[0]
531 mov -8($ap,$j,8),%rax
532 adc \$0,%rdx
533 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
534 adc \$0,%rdx
535 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
536 mov %rdx,$N[1]
537
538 mulq $m0 # ap[j]*bp[0]
539 add %rax,$A[1]
540 mov -8($np,$j,8),%rax
541 adc \$0,%rdx
542 mov %rdx,$A[0]
543
544 mulq $m1 # np[j]*m1
545 add %rax,$N[1]
546 mov ($ap),%rax # ap[0]
547 adc \$0,%rdx
548 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
549 adc \$0,%rdx
550 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
551 mov %rdx,$N[0]
552
553 movq %xmm0,$m0 # bp[1]
554
555 xor $N[1],$N[1]
556 add $A[0],$N[0]
557 adc \$0,$N[1]
558 mov $N[0],-8(%rsp,$j,8)
559 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
560
561 lea 1($i),$i # i++
562.align 4
563.Louter4x:
564 xor $j,$j # j=0
565 movq `0*$STRIDE/4-96`($bp),%xmm0
566 movq `1*$STRIDE/4-96`($bp),%xmm1
567 pand %xmm4,%xmm0
568 movq `2*$STRIDE/4-96`($bp),%xmm2
569 pand %xmm5,%xmm1
570
571 mov (%rsp),$A[0]
572 mov $n0,$m1
573 mulq $m0 # ap[0]*bp[i]
574 add %rax,$A[0] # ap[0]*bp[i]+tp[0]
575 mov ($np),%rax
576 adc \$0,%rdx
577
578 movq `3*$STRIDE/4-96`($bp),%xmm3
579 pand %xmm6,%xmm2
580 por %xmm1,%xmm0
581 pand %xmm7,%xmm3
582
583 imulq $A[0],$m1 # tp[0]*n0
584 mov %rdx,$A[1]
585
586 por %xmm2,%xmm0
587 lea $STRIDE($bp),$bp
588 por %xmm3,%xmm0
589
590 mulq $m1 # np[0]*m1
591 add %rax,$A[0] # "$N[0]", discarded
592 mov 8($ap),%rax
593 adc \$0,%rdx
594 mov %rdx,$N[1]
595
596 mulq $m0 # ap[j]*bp[i]
597 add %rax,$A[1]
598 mov 8($np),%rax
599 adc \$0,%rdx
600 add 8(%rsp),$A[1] # +tp[1]
601 adc \$0,%rdx
602 mov %rdx,$A[0]
603
604 mulq $m1 # np[j]*m1
605 add %rax,$N[1]
606 mov 16($ap),%rax
607 adc \$0,%rdx
608 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
609 lea 4($j),$j # j+=4
610 adc \$0,%rdx
611 mov %rdx,$N[0]
612 jmp .Linner4x
613.align 16
614.Linner4x:
615 mulq $m0 # ap[j]*bp[i]
616 add %rax,$A[0]
617 mov -16($np,$j,8),%rax
618 adc \$0,%rdx
619 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
620 adc \$0,%rdx
621 mov %rdx,$A[1]
622
623 mulq $m1 # np[j]*m1
624 add %rax,$N[0]
625 mov -8($ap,$j,8),%rax
626 adc \$0,%rdx
627 add $A[0],$N[0]
628 adc \$0,%rdx
629 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
630 mov %rdx,$N[1]
631
632 mulq $m0 # ap[j]*bp[i]
633 add %rax,$A[1]
634 mov -8($np,$j,8),%rax
635 adc \$0,%rdx
636 add -8(%rsp,$j,8),$A[1]
637 adc \$0,%rdx
638 mov %rdx,$A[0]
639
640 mulq $m1 # np[j]*m1
641 add %rax,$N[1]
642 mov ($ap,$j,8),%rax
643 adc \$0,%rdx
644 add $A[1],$N[1]
645 adc \$0,%rdx
646 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
647 mov %rdx,$N[0]
648
649 mulq $m0 # ap[j]*bp[i]
650 add %rax,$A[0]
651 mov ($np,$j,8),%rax
652 adc \$0,%rdx
653 add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
654 adc \$0,%rdx
655 mov %rdx,$A[1]
656
657 mulq $m1 # np[j]*m1
658 add %rax,$N[0]
659 mov 8($ap,$j,8),%rax
660 adc \$0,%rdx
661 add $A[0],$N[0]
662 adc \$0,%rdx
663 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
664 mov %rdx,$N[1]
665
666 mulq $m0 # ap[j]*bp[i]
667 add %rax,$A[1]
668 mov 8($np,$j,8),%rax
669 adc \$0,%rdx
670 add 8(%rsp,$j,8),$A[1]
671 adc \$0,%rdx
672 lea 4($j),$j # j+=4
673 mov %rdx,$A[0]
674
675 mulq $m1 # np[j]*m1
676 add %rax,$N[1]
677 mov -16($ap,$j,8),%rax
678 adc \$0,%rdx
679 add $A[1],$N[1]
680 adc \$0,%rdx
681 mov $N[0],-40(%rsp,$j,8) # tp[j-1]
682 mov %rdx,$N[0]
683 cmp $num,$j
684 jl .Linner4x
685
686 mulq $m0 # ap[j]*bp[i]
687 add %rax,$A[0]
688 mov -16($np,$j,8),%rax
689 adc \$0,%rdx
690 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
691 adc \$0,%rdx
692 mov %rdx,$A[1]
693
694 mulq $m1 # np[j]*m1
695 add %rax,$N[0]
696 mov -8($ap,$j,8),%rax
697 adc \$0,%rdx
698 add $A[0],$N[0]
699 adc \$0,%rdx
700 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
701 mov %rdx,$N[1]
702
703 mulq $m0 # ap[j]*bp[i]
704 add %rax,$A[1]
705 mov -8($np,$j,8),%rax
706 adc \$0,%rdx
707 add -8(%rsp,$j,8),$A[1]
708 adc \$0,%rdx
709 lea 1($i),$i # i++
710 mov %rdx,$A[0]
711
712 mulq $m1 # np[j]*m1
713 add %rax,$N[1]
714 mov ($ap),%rax # ap[0]
715 adc \$0,%rdx
716 add $A[1],$N[1]
717 adc \$0,%rdx
718 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
719 mov %rdx,$N[0]
720
721 movq %xmm0,$m0 # bp[i+1]
722 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
723
724 xor $N[1],$N[1]
725 add $A[0],$N[0]
726 adc \$0,$N[1]
727 add (%rsp,$num,8),$N[0] # pull upmost overflow bit
728 adc \$0,$N[1]
729 mov $N[0],-8(%rsp,$j,8)
730 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
731
732 cmp $num,$i
733 jl .Louter4x
734___
735{
736my @ri=("%rax","%rdx",$m0,$m1);
737$code.=<<___;
738 mov 16(%rsp,$num,8),$rp # restore $rp
739 mov 0(%rsp),@ri[0] # tp[0]
740 pxor %xmm0,%xmm0
741 mov 8(%rsp),@ri[1] # tp[1]
742 shr \$2,$num # num/=4
743 lea (%rsp),$ap # borrow ap for tp
744 xor $i,$i # i=0 and clear CF!
745
746 sub 0($np),@ri[0]
747 mov 16($ap),@ri[2] # tp[2]
748 mov 24($ap),@ri[3] # tp[3]
749 sbb 8($np),@ri[1]
750 lea -1($num),$j # j=num/4-1
751 jmp .Lsub4x
752.align 16
753.Lsub4x:
754 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
755 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
756 sbb 16($np,$i,8),@ri[2]
757 mov 32($ap,$i,8),@ri[0] # tp[i+1]
758 mov 40($ap,$i,8),@ri[1]
759 sbb 24($np,$i,8),@ri[3]
760 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
761 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
762 sbb 32($np,$i,8),@ri[0]
763 mov 48($ap,$i,8),@ri[2]
764 mov 56($ap,$i,8),@ri[3]
765 sbb 40($np,$i,8),@ri[1]
766 lea 4($i),$i # i+=4
767 dec $j # doesn't affect CF!
768 jnz .Lsub4x
769
770 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
771 mov 32($ap,$i,8),@ri[0] # load overflow bit
772 sbb 16($np,$i,8),@ri[2]
773 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
774 sbb 24($np,$i,8),@ri[3]
775 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
776
777 sbb \$0,@ri[0] # handle upmost overflow bit
778 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
779 xor $i,$i # i=0
780 and @ri[0],$ap
781 not @ri[0]
782 mov $rp,$np
783 and @ri[0],$np
784 lea -1($num),$j
785 or $np,$ap # ap=borrow?tp:rp
786
787 movdqu ($ap),%xmm1
788 movdqa %xmm0,(%rsp)
789 movdqu %xmm1,($rp)
790 jmp .Lcopy4x
791.align 16
792.Lcopy4x: # copy or in-place refresh
793 movdqu 16($ap,$i),%xmm2
794 movdqu 32($ap,$i),%xmm1
795 movdqa %xmm0,16(%rsp,$i)
796 movdqu %xmm2,16($rp,$i)
797 movdqa %xmm0,32(%rsp,$i)
798 movdqu %xmm1,32($rp,$i)
799 lea 32($i),$i
800 dec $j
801 jnz .Lcopy4x
802
803 shl \$2,$num
804 movdqu 16($ap,$i),%xmm2
805 movdqa %xmm0,16(%rsp,$i)
806 movdqu %xmm2,16($rp,$i)
807___
808}
809$code.=<<___;
810 mov 8(%rsp,$num,8),%rsi # restore %rsp
811 mov \$1,%rax
812___
813$code.=<<___ if ($win64);
814 movaps (%rsi),%xmm6
815 movaps 0x10(%rsi),%xmm7
816 lea 0x28(%rsi),%rsi
817___
818$code.=<<___;
819 mov (%rsi),%r15
820 mov 8(%rsi),%r14
821 mov 16(%rsi),%r13
822 mov 24(%rsi),%r12
823 mov 32(%rsi),%rbp
824 mov 40(%rsi),%rbx
825 lea 48(%rsi),%rsp
826.Lmul4x_epilogue:
827 ret
828.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
829___
830}}}
831
832{
833my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
834 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
835my $out=$inp;
836my $STRIDE=2**5*8;
837my $N=$STRIDE/4;
838
839$code.=<<___;
840.globl bn_scatter5
841.type bn_scatter5,\@abi-omnipotent
842.align 16
843bn_scatter5:
844 cmp \$0, $num
845 jz .Lscatter_epilogue
846 lea ($tbl,$idx,8),$tbl
847.Lscatter:
848 mov ($inp),%rax
849 lea 8($inp),$inp
850 mov %rax,($tbl)
851 lea 32*8($tbl),$tbl
852 sub \$1,$num
853 jnz .Lscatter
854.Lscatter_epilogue:
855 ret
856.size bn_scatter5,.-bn_scatter5
857
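#	As a C sketch, bn_scatter5 above is simply
#
#		for (j = 0; j < num; j++)
#			tbl[j*32 + idx] = inp[j];	# stride 32*8 bytes
#
#	laying word j of each operand 256 bytes apart, which is the
#	interlacing bn_gather5 below reads back cache-neutrally.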
858.globl bn_gather5
859.type bn_gather5,\@abi-omnipotent
860.align 16
861bn_gather5:
862___
863$code.=<<___ if ($win64);
864.LSEH_begin_bn_gather5:
865 # I can't trust assembler to use specific encoding:-(
866 .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp
867 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
868 .byte 0x0f,0x29,0x7c,0x24,0x10 #movaps %xmm7,0x10(%rsp)
869___
870$code.=<<___;
871 mov $idx,%r11
872 shr \$`log($N/8)/log(2)`,$idx
873 and \$`$N/8-1`,%r11
874 not $idx
875 lea .Lmagic_masks(%rip),%rax
876 and \$`2**5/($N/8)-1`,$idx # 5 is "window size"
877 lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line
878 movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which
879 movq 8(%rax,$idx,8),%xmm5 # cache line contains element
880 movq 16(%rax,$idx,8),%xmm6 # denoted by the idx argument
881 movq 24(%rax,$idx,8),%xmm7
882 jmp .Lgather
883.align 16
884.Lgather:
885 movq `0*$STRIDE/4-96`($tbl),%xmm0
886 movq `1*$STRIDE/4-96`($tbl),%xmm1
887 pand %xmm4,%xmm0
888 movq `2*$STRIDE/4-96`($tbl),%xmm2
889 pand %xmm5,%xmm1
890 movq `3*$STRIDE/4-96`($tbl),%xmm3
891 pand %xmm6,%xmm2
892 por %xmm1,%xmm0
893 pand %xmm7,%xmm3
894 por %xmm2,%xmm0
895 lea $STRIDE($tbl),$tbl
896 por %xmm3,%xmm0
897
898 movq %xmm0,($out) # out[i]=tbl[i*32+idx]
899 lea 8($out),$out
900 sub \$1,$num
901 jnz .Lgather
902___
903$code.=<<___ if ($win64);
904 movaps (%rsp),%xmm6
905 movaps 0x10(%rsp),%xmm7
906 lea 0x28(%rsp),%rsp
907___
908$code.=<<___;
909 ret
910.LSEH_end_bn_gather5:
911.size bn_gather5,.-bn_gather5
912___
913}
914$code.=<<___;
915.align 64
916.Lmagic_masks:
917 .long 0,0, 0,0, 0,0, -1,-1
918 .long 0,0, 0,0, 0,0, 0,0
919.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
920___
921
922# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
923# CONTEXT *context,DISPATCHER_CONTEXT *disp)
924if ($win64) {
925$rec="%rcx";
926$frame="%rdx";
927$context="%r8";
928$disp="%r9";
929
930$code.=<<___;
931.extern __imp_RtlVirtualUnwind
932.type mul_handler,\@abi-omnipotent
933.align 16
934mul_handler:
935 push %rsi
936 push %rdi
937 push %rbx
938 push %rbp
939 push %r12
940 push %r13
941 push %r14
942 push %r15
943 pushfq
944 sub \$64,%rsp
945
946 mov 120($context),%rax # pull context->Rax
947 mov 248($context),%rbx # pull context->Rip
948
949 mov 8($disp),%rsi # disp->ImageBase
950 mov 56($disp),%r11 # disp->HandlerData
951
952 mov 0(%r11),%r10d # HandlerData[0]
953 lea (%rsi,%r10),%r10 # end of prologue label
954 cmp %r10,%rbx # context->Rip<end of prologue label
955 jb .Lcommon_seh_tail
956
957 lea `40+48`(%rax),%rax
958
959 mov 4(%r11),%r10d # HandlerData[1]
960 lea (%rsi,%r10),%r10 # end of alloca label
961 cmp %r10,%rbx # context->Rip<end of alloca label
962 jb .Lcommon_seh_tail
963
964 mov 152($context),%rax # pull context->Rsp
965
966 mov 8(%r11),%r10d # HandlerData[2]
967 lea (%rsi,%r10),%r10 # epilogue label
968 cmp %r10,%rbx # context->Rip>=epilogue label
969 jae .Lcommon_seh_tail
970
971 mov 192($context),%r10 # pull $num
972 mov 8(%rax,%r10,8),%rax # pull saved stack pointer
973
974 movaps (%rax),%xmm0
975 movaps 16(%rax),%xmm1
976 lea `40+48`(%rax),%rax
977
978 mov -8(%rax),%rbx
979 mov -16(%rax),%rbp
980 mov -24(%rax),%r12
981 mov -32(%rax),%r13
982 mov -40(%rax),%r14
983 mov -48(%rax),%r15
984 mov %rbx,144($context) # restore context->Rbx
985 mov %rbp,160($context) # restore context->Rbp
986 mov %r12,216($context) # restore context->R12
987 mov %r13,224($context) # restore context->R13
988 mov %r14,232($context) # restore context->R14
989 mov %r15,240($context) # restore context->R15
990 movups %xmm0,512($context) # restore context->Xmm6
991 movups %xmm1,528($context) # restore context->Xmm7
992
993.Lcommon_seh_tail:
994 mov 8(%rax),%rdi
995 mov 16(%rax),%rsi
996 mov %rax,152($context) # restore context->Rsp
997 mov %rsi,168($context) # restore context->Rsi
998 mov %rdi,176($context) # restore context->Rdi
999
1000 mov 40($disp),%rdi # disp->ContextRecord
1001 mov $context,%rsi # context
1002 mov \$154,%ecx # sizeof(CONTEXT)
1003 .long 0xa548f3fc # cld; rep movsq
1004
1005 mov $disp,%rsi
1006 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1007 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1008 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1009 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1010 mov 40(%rsi),%r10 # disp->ContextRecord
1011 lea 56(%rsi),%r11 # &disp->HandlerData
1012 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1013 mov %r10,32(%rsp) # arg5
1014 mov %r11,40(%rsp) # arg6
1015 mov %r12,48(%rsp) # arg7
1016 mov %rcx,56(%rsp) # arg8, (NULL)
1017 call *__imp_RtlVirtualUnwind(%rip)
1018
1019 mov \$1,%eax # ExceptionContinueSearch
1020 add \$64,%rsp
1021 popfq
1022 pop %r15
1023 pop %r14
1024 pop %r13
1025 pop %r12
1026 pop %rbp
1027 pop %rbx
1028 pop %rdi
1029 pop %rsi
1030 ret
1031.size mul_handler,.-mul_handler
1032
1033.section .pdata
1034.align 4
1035 .rva .LSEH_begin_bn_mul_mont_gather5
1036 .rva .LSEH_end_bn_mul_mont_gather5
1037 .rva .LSEH_info_bn_mul_mont_gather5
1038
1039 .rva .LSEH_begin_bn_mul4x_mont_gather5
1040 .rva .LSEH_end_bn_mul4x_mont_gather5
1041 .rva .LSEH_info_bn_mul4x_mont_gather5
1042
1043 .rva .LSEH_begin_bn_gather5
1044 .rva .LSEH_end_bn_gather5
1045 .rva .LSEH_info_bn_gather5
1046
1047.section .xdata
1048.align 8
1049.LSEH_info_bn_mul_mont_gather5:
1050 .byte 9,0,0,0
1051 .rva mul_handler
1052 .rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[]
1053.align 8
1054.LSEH_info_bn_mul4x_mont_gather5:
1055 .byte 9,0,0,0
1056 .rva mul_handler
1057 .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
1058.align 8
1059.LSEH_info_bn_gather5:
1060 .byte 0x01,0x0d,0x05,0x00
1061 .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
1062 .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
1063 .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28
1064.align 8
1065___
1066}
1067
1068$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1069
1070print $code;
1071close STDOUT;
diff --git a/src/lib/libcrypto/bn/bn.h b/src/lib/libcrypto/bn/bn.h
deleted file mode 100644
index d93c9fc059..0000000000
--- a/src/lib/libcrypto/bn/bn.h
+++ /dev/null
@@ -1,821 +0,0 @@
1/* $OpenBSD: bn.h,v 1.26 2015/02/07 13:19:15 doug Exp $ */
2/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58/* ====================================================================
59 * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
60 *
61 * Redistribution and use in source and binary forms, with or without
62 * modification, are permitted provided that the following conditions
63 * are met:
64 *
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 *
68 * 2. Redistributions in binary form must reproduce the above copyright
69 * notice, this list of conditions and the following disclaimer in
70 * the documentation and/or other materials provided with the
71 * distribution.
72 *
73 * 3. All advertising materials mentioning features or use of this
74 * software must display the following acknowledgment:
75 * "This product includes software developed by the OpenSSL Project
76 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
77 *
78 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
79 * endorse or promote products derived from this software without
80 * prior written permission. For written permission, please contact
81 * openssl-core@openssl.org.
82 *
83 * 5. Products derived from this software may not be called "OpenSSL"
84 * nor may "OpenSSL" appear in their names without prior written
85 * permission of the OpenSSL Project.
86 *
87 * 6. Redistributions of any form whatsoever must retain the following
88 * acknowledgment:
89 * "This product includes software developed by the OpenSSL Project
90 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
91 *
92 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
93 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
95 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
96 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
97 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
98 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
99 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
101 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
103 * OF THE POSSIBILITY OF SUCH DAMAGE.
104 * ====================================================================
105 *
106 * This product includes cryptographic software written by Eric Young
107 * (eay@cryptsoft.com). This product includes software written by Tim
108 * Hudson (tjh@cryptsoft.com).
109 *
110 */
111/* ====================================================================
112 * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
113 *
114 * Portions of the attached software ("Contribution") are developed by
115 * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project.
116 *
117 * The Contribution is licensed pursuant to the Eric Young open source
118 * license provided above.
119 *
120 * The binary polynomial arithmetic software is originally written by
121 * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems Laboratories.
122 *
123 */
124
125#ifndef HEADER_BN_H
126#define HEADER_BN_H
127
128#include <stdio.h>
129#include <stdlib.h>
130
131#include <openssl/opensslconf.h>
132
133#include <openssl/ossl_typ.h>
134#include <openssl/crypto.h>
135
136#ifdef __cplusplus
137extern "C" {
138#endif
139
140/* These preprocessor symbols control various aspects of the bignum headers and
141 * library code. They're not defined by any "normal" configuration, as they are
142 * intended for development and testing purposes. NB: defining both can be
143 * useful for debugging application code as well as openssl itself.
144 *
145 * BN_DEBUG - turn on various debugging alterations to the bignum code
146 * BN_DEBUG_RAND - uses random poisoning of unused words to trip up
147 * mismanagement of bignum internals. You must also define BN_DEBUG.
148 */
149/* #define BN_DEBUG */
150/* #define BN_DEBUG_RAND */
151
152#ifndef OPENSSL_SMALL_FOOTPRINT
153#define BN_MUL_COMBA
154#define BN_SQR_COMBA
155#define BN_RECURSION
156#endif
157
158/* This next option uses the C library's (2 word)/(1 word) division function.
159 * If it is not defined, I use my C version (which is slower).
160 * The reason for this flag is that when the particular C compiler
161 * library routine is used, and the library is linked with a different
162 * compiler, the library is missing. This mostly happens when the
163 * library is built with gcc and then linked using normal cc. This would
164 * be a common occurrence because gcc normally produces code that is
165 * 2 times faster than system compilers for the big number stuff.
166 * For machines with only one compiler (or shared libraries), this should
167 * be on. Again this is only really a problem on machines that
168 * use "long long"s, are 32-bit, and are not using my assembler code. */
169/* #define BN_DIV2W */
170
171#ifdef _LP64
172#undef BN_LLONG
173#define BN_ULONG unsigned long
174#define BN_LONG long
175#define BN_BITS 128
176#define BN_BYTES 8
177#define BN_BITS2 64
178#define BN_BITS4 32
179#define BN_MASK2 (0xffffffffffffffffL)
180#define BN_MASK2l (0xffffffffL)
181#define BN_MASK2h (0xffffffff00000000L)
182#define BN_MASK2h1 (0xffffffff80000000L)
183#define BN_TBIT (0x8000000000000000L)
184#define BN_DEC_CONV (10000000000000000000UL)
185#define BN_DEC_FMT1 "%lu"
186#define BN_DEC_FMT2 "%019lu"
187#define BN_DEC_NUM 19
188#define BN_HEX_FMT1 "%lX"
189#define BN_HEX_FMT2 "%016lX"
190#else
191#define BN_ULLONG unsigned long long
192#define BN_LLONG
193#define BN_ULONG unsigned int
194#define BN_LONG int
195#define BN_BITS 64
196#define BN_BYTES 4
197#define BN_BITS2 32
198#define BN_BITS4 16
199#define BN_MASK (0xffffffffffffffffLL)
200#define BN_MASK2 (0xffffffffL)
201#define BN_MASK2l (0xffff)
202#define BN_MASK2h1 (0xffff8000L)
203#define BN_MASK2h (0xffff0000L)
204#define BN_TBIT (0x80000000L)
205#define BN_DEC_CONV (1000000000L)
206#define BN_DEC_FMT1 "%u"
207#define BN_DEC_FMT2 "%09u"
208#define BN_DEC_NUM 9
209#define BN_HEX_FMT1 "%X"
210#define BN_HEX_FMT2 "%08X"
211#endif
212
213#define BN_FLG_MALLOCED 0x01
214#define BN_FLG_STATIC_DATA 0x02
215#define BN_FLG_CONSTTIME 0x04 /* avoid leaking exponent information through timing,
216 * BN_mod_exp_mont() will call BN_mod_exp_mont_consttime,
217 * BN_div() will call BN_div_no_branch,
218 * BN_mod_inverse() will call BN_mod_inverse_no_branch.
219 */
220
221#ifndef OPENSSL_NO_DEPRECATED
222#define BN_FLG_EXP_CONSTTIME BN_FLG_CONSTTIME /* deprecated name for the flag */
223 /* avoid leaking exponent information through timings
224 * (BN_mod_exp_mont() will call BN_mod_exp_mont_consttime) */
225#endif
226
227#ifndef OPENSSL_NO_DEPRECATED
228#define BN_FLG_FREE 0x8000 /* used for debuging */
229#endif
230#define BN_set_flags(b,n) ((b)->flags|=(n))
231#define BN_get_flags(b,n) ((b)->flags&(n))
232
233/* get a clone of a BIGNUM with changed flags, for *temporary* use only
234 * (the two BIGNUMs cannot be used in parallel!) */
235#define BN_with_flags(dest,b,n) ((dest)->d=(b)->d, \
236 (dest)->top=(b)->top, \
237 (dest)->dmax=(b)->dmax, \
238 (dest)->neg=(b)->neg, \
239 (dest)->flags=(((dest)->flags & BN_FLG_MALLOCED) \
240 | ((b)->flags & ~BN_FLG_MALLOCED) \
241 | BN_FLG_STATIC_DATA \
242 | (n)))
243
244struct bignum_st {
245 BN_ULONG *d; /* Pointer to an array of 'BN_BITS2' bit chunks. */
246 int top; /* Index of last used d +1. */
247	/* The next two are internal bookkeeping fields for bn_expand. */
248 int dmax; /* Size of the d array. */
249 int neg; /* one if the number is negative */
250 int flags;
251};
252
253/* Used for montgomery multiplication */
254struct bn_mont_ctx_st {
255 int ri; /* number of bits in R */
256 BIGNUM RR; /* used to convert to montgomery form */
257 BIGNUM N; /* The modulus */
258 BIGNUM Ni; /* R*(1/R mod N) - N*Ni = 1
259 * (Ni is only stored for bignum algorithm) */
260 BN_ULONG n0[2];/* least significant word(s) of Ni;
261 (type changed with 0.9.9, was "BN_ULONG n0;" before) */
262 int flags;
263};
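/* (Montgomery reduction in these terms, as a sketch: with R = 2^ri and
 * n0[0] = -N^-1 mod 2^BN_BITS2, a value 0 <= T < R*N is reduced via
 *
 *	m = (T * Ni) mod R;	word-wise code only needs n0[0]
 *	t = (T + m*N) / R;	exact; t == T/R mod N, 0 <= t < 2*N
 *	if (t >= N) t -= N;
 *
 * which is the shape of what BN_mod_mul_montgomery() and the assembler
 * back ends implement word by word.) */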
264
265/* Used for reciprocal division/mod functions
266 * It cannot be shared between threads
267 */
268struct bn_recp_ctx_st {
269 BIGNUM N; /* the divisor */
270 BIGNUM Nr; /* the reciprocal */
271 int num_bits;
272 int shift;
273 int flags;
274};
275
276/* Used for slow "generation" functions. */
277struct bn_gencb_st {
278 unsigned int ver; /* To handle binary (in)compatibility */
279 void *arg; /* callback-specific data */
280 union {
281 /* if(ver==1) - handles old style callbacks */
282 void (*cb_1)(int, int, void *);
283 /* if(ver==2) - new callback style */
284 int (*cb_2)(int, int, BN_GENCB *);
285 } cb;
286};
287/* Wrapper function to make using BN_GENCB easier. */
288int BN_GENCB_call(BN_GENCB *cb, int a, int b);
289/* Macro to populate a BN_GENCB structure with an "old"-style callback */
290#define BN_GENCB_set_old(gencb, callback, cb_arg) { \
291 BN_GENCB *tmp_gencb = (gencb); \
292 tmp_gencb->ver = 1; \
293 tmp_gencb->arg = (cb_arg); \
294 tmp_gencb->cb.cb_1 = (callback); }
295/* Macro to populate a BN_GENCB structure with a "new"-style callback */
296#define BN_GENCB_set(gencb, callback, cb_arg) { \
297 BN_GENCB *tmp_gencb = (gencb); \
298 tmp_gencb->ver = 2; \
299 tmp_gencb->arg = (cb_arg); \
300 tmp_gencb->cb.cb_2 = (callback); }
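/* Example use (a sketch; the callback and variable names are
 * illustrative, not part of this API):
 *
 *	static int progress(int a, int b, BN_GENCB *cb)
 *	{
 *		fprintf(stderr, "phase %d, event %d\n", a, b);
 *		return 1;	nonzero means "keep going"
 *	}
 *
 *	BN_GENCB cb;
 *	BN_GENCB_set(&cb, progress, NULL);
 *	BN_generate_prime_ex(prime, 2048, 0, NULL, NULL, &cb);
 */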
301
302#define BN_prime_checks 0 /* default: select number of iterations
303 based on the size of the number */
304
305/* number of Miller-Rabin iterations for an error rate of less than 2^-80
306 * for random 'b'-bit input, b >= 100 (taken from table 4.4 in the Handbook
307 * of Applied Cryptography [Menezes, van Oorschot, Vanstone; CRC Press 1996];
308 * original paper: Damgaard, Landrock, Pomerance: Average case error estimates
309 * for the strong probable prime test. -- Math. Comp. 61 (1993) 177-194) */
310#define BN_prime_checks_for_size(b) ((b) >= 1300 ? 2 : \
311 (b) >= 850 ? 3 : \
312 (b) >= 650 ? 4 : \
313 (b) >= 550 ? 5 : \
314 (b) >= 450 ? 6 : \
315 (b) >= 400 ? 7 : \
316 (b) >= 350 ? 8 : \
317 (b) >= 300 ? 9 : \
318 (b) >= 250 ? 12 : \
319 (b) >= 200 ? 15 : \
320 (b) >= 150 ? 18 : \
321 /* b >= 100 */ 27)
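/* Example: BN_is_prime_ex(p, BN_prime_checks, ctx, NULL), declared below,
 * picks the iteration count from this table, e.g. 2 rounds for a
 * 1300-bit candidate but 27 rounds for a 100-bit one. */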
322
323#define BN_num_bytes(a) ((BN_num_bits(a)+7)/8)
324
325/* Note that BN_abs_is_word didn't work reliably for w == 0 until 0.9.8 */
326#define BN_abs_is_word(a,w) ((((a)->top == 1) && ((a)->d[0] == (BN_ULONG)(w))) || \
327 (((w) == 0) && ((a)->top == 0)))
328#define BN_is_zero(a) ((a)->top == 0)
329#define BN_is_one(a) (BN_abs_is_word((a),1) && !(a)->neg)
330#define BN_is_word(a,w) (BN_abs_is_word((a),(w)) && (!(w) || !(a)->neg))
331#define BN_is_odd(a) (((a)->top > 0) && ((a)->d[0] & 1))
332
333#define BN_one(a) (BN_set_word((a),1))
334#define BN_zero_ex(a) \
335 do { \
336 BIGNUM *_tmp_bn = (a); \
337 _tmp_bn->top = 0; \
338 _tmp_bn->neg = 0; \
339 } while(0)
340
341#ifdef OPENSSL_NO_DEPRECATED
342#define BN_zero(a) BN_zero_ex(a)
343#else
344#define BN_zero(a) (BN_set_word((a),0))
345#endif
346
347const BIGNUM *BN_value_one(void);
348char * BN_options(void);
349BN_CTX *BN_CTX_new(void);
350#ifndef OPENSSL_NO_DEPRECATED
351void BN_CTX_init(BN_CTX *c);
352#endif
353void BN_CTX_free(BN_CTX *c);
354void BN_CTX_start(BN_CTX *ctx);
355BIGNUM *BN_CTX_get(BN_CTX *ctx);
356void BN_CTX_end(BN_CTX *ctx);
357int BN_rand(BIGNUM *rnd, int bits, int top, int bottom);
358int BN_pseudo_rand(BIGNUM *rnd, int bits, int top, int bottom);
359int BN_rand_range(BIGNUM *rnd, const BIGNUM *range);
360int BN_pseudo_rand_range(BIGNUM *rnd, const BIGNUM *range);
361int BN_num_bits(const BIGNUM *a);
362int BN_num_bits_word(BN_ULONG);
363BIGNUM *BN_new(void);
364void BN_init(BIGNUM *);
365void BN_clear_free(BIGNUM *a);
366BIGNUM *BN_copy(BIGNUM *a, const BIGNUM *b);
367void BN_swap(BIGNUM *a, BIGNUM *b);
368BIGNUM *BN_bin2bn(const unsigned char *s, int len, BIGNUM *ret);
369int BN_bn2bin(const BIGNUM *a, unsigned char *to);
370BIGNUM *BN_mpi2bn(const unsigned char *s, int len, BIGNUM *ret);
371int BN_bn2mpi(const BIGNUM *a, unsigned char *to);
372int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
373int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
374int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
375int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
376int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
377int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx);
378/** BN_set_negative sets sign of a BIGNUM
379 * \param b pointer to the BIGNUM object
380 * \param n 0 if the BIGNUM b should be positive and a value != 0 otherwise
381 */
382void BN_set_negative(BIGNUM *b, int n);
383/** BN_is_negative returns 1 if the BIGNUM is negative
384 * \param a pointer to the BIGNUM object
385 * \return 1 if a < 0 and 0 otherwise
386 */
387#define BN_is_negative(a) ((a)->neg != 0)
388
389int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
390 BN_CTX *ctx);
391#define BN_mod(rem,m,d,ctx) BN_div(NULL,(rem),(m),(d),(ctx))
392int BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx);
393int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m, BN_CTX *ctx);
394int BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m);
395int BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m, BN_CTX *ctx);
396int BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m);
397int BN_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
398 const BIGNUM *m, BN_CTX *ctx);
399int BN_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx);
400int BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx);
401int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m);
402int BN_mod_lshift(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m, BN_CTX *ctx);
403int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m);
404
405BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w);
406BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w);
407int BN_mul_word(BIGNUM *a, BN_ULONG w);
408int BN_add_word(BIGNUM *a, BN_ULONG w);
409int BN_sub_word(BIGNUM *a, BN_ULONG w);
410int BN_set_word(BIGNUM *a, BN_ULONG w);
411BN_ULONG BN_get_word(const BIGNUM *a);
412
413int BN_cmp(const BIGNUM *a, const BIGNUM *b);
414void BN_free(BIGNUM *a);
415int BN_is_bit_set(const BIGNUM *a, int n);
416int BN_lshift(BIGNUM *r, const BIGNUM *a, int n);
417int BN_lshift1(BIGNUM *r, const BIGNUM *a);
418int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
419
420int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
421 const BIGNUM *m, BN_CTX *ctx);
422int BN_mod_exp_mont(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
423 const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
424int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
425 const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont);
426int BN_mod_exp_mont_word(BIGNUM *r, BN_ULONG a, const BIGNUM *p,
427 const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
428int BN_mod_exp2_mont(BIGNUM *r, const BIGNUM *a1, const BIGNUM *p1,
429 const BIGNUM *a2, const BIGNUM *p2, const BIGNUM *m,
430 BN_CTX *ctx, BN_MONT_CTX *m_ctx);
431int BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
432 const BIGNUM *m, BN_CTX *ctx);
433
434int BN_mask_bits(BIGNUM *a, int n);
435int BN_print_fp(FILE *fp, const BIGNUM *a);
436#ifdef HEADER_BIO_H
437int BN_print(BIO *fp, const BIGNUM *a);
438#else
439int BN_print(void *fp, const BIGNUM *a);
440#endif
441int BN_reciprocal(BIGNUM *r, const BIGNUM *m, int len, BN_CTX *ctx);
442int BN_rshift(BIGNUM *r, const BIGNUM *a, int n);
443int BN_rshift1(BIGNUM *r, const BIGNUM *a);
444void BN_clear(BIGNUM *a);
445BIGNUM *BN_dup(const BIGNUM *a);
446int BN_ucmp(const BIGNUM *a, const BIGNUM *b);
447int BN_set_bit(BIGNUM *a, int n);
448int BN_clear_bit(BIGNUM *a, int n);
449char * BN_bn2hex(const BIGNUM *a);
450char * BN_bn2dec(const BIGNUM *a);
451int BN_hex2bn(BIGNUM **a, const char *str);
452int BN_dec2bn(BIGNUM **a, const char *str);
453int BN_asc2bn(BIGNUM **a, const char *str);
454int BN_gcd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
455int BN_kronecker(const BIGNUM *a,const BIGNUM *b,BN_CTX *ctx); /* returns -2 for error */
456BIGNUM *BN_mod_inverse(BIGNUM *ret,
457 const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx);
458BIGNUM *BN_mod_sqrt(BIGNUM *ret,
459 const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx);
460
461void BN_consttime_swap(BN_ULONG swap, BIGNUM *a, BIGNUM *b, int nwords);
462
463/* Deprecated versions */
464#ifndef OPENSSL_NO_DEPRECATED
465BIGNUM *BN_generate_prime(BIGNUM *ret, int bits, int safe,
466 const BIGNUM *add, const BIGNUM *rem,
467 void (*callback)(int, int, void *), void *cb_arg);
468int BN_is_prime(const BIGNUM *p, int nchecks,
469 void (*callback)(int, int, void *),
470 BN_CTX *ctx, void *cb_arg);
471int BN_is_prime_fasttest(const BIGNUM *p, int nchecks,
472 void (*callback)(int, int, void *), BN_CTX *ctx, void *cb_arg,
473 int do_trial_division);
474#endif /* !defined(OPENSSL_NO_DEPRECATED) */
475
476/* Newer versions */
477int BN_generate_prime_ex(BIGNUM *ret, int bits, int safe, const BIGNUM *add,
478 const BIGNUM *rem, BN_GENCB *cb);
479int BN_is_prime_ex(const BIGNUM *p, int nchecks, BN_CTX *ctx, BN_GENCB *cb);
480int BN_is_prime_fasttest_ex(const BIGNUM *p, int nchecks, BN_CTX *ctx,
481 int do_trial_division, BN_GENCB *cb);
482
483int BN_X931_generate_Xpq(BIGNUM *Xp, BIGNUM *Xq, int nbits, BN_CTX *ctx);
484
485int BN_X931_derive_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
486 const BIGNUM *Xp, const BIGNUM *Xp1, const BIGNUM *Xp2,
487 const BIGNUM *e, BN_CTX *ctx, BN_GENCB *cb);
488int BN_X931_generate_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
489 BIGNUM *Xp1, BIGNUM *Xp2,
490 const BIGNUM *Xp,
491 const BIGNUM *e, BN_CTX *ctx,
492 BN_GENCB *cb);
493
494BN_MONT_CTX *BN_MONT_CTX_new(void );
495void BN_MONT_CTX_init(BN_MONT_CTX *ctx);
496int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
497 BN_MONT_CTX *mont, BN_CTX *ctx);
498#define BN_to_montgomery(r,a,mont,ctx) BN_mod_mul_montgomery(\
499 (r),(a),&((mont)->RR),(mont),(ctx))
500int BN_from_montgomery(BIGNUM *r, const BIGNUM *a,
501 BN_MONT_CTX *mont, BN_CTX *ctx);
502void BN_MONT_CTX_free(BN_MONT_CTX *mont);
503int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx);
504BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from);
505BN_MONT_CTX *BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, int lock,
506 const BIGNUM *mod, BN_CTX *ctx);
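/* Typical Montgomery usage (a sketch; error checks omitted and the
 * variable names are illustrative):
 *
 *	BN_MONT_CTX *mont = BN_MONT_CTX_new();
 *	BN_MONT_CTX_set(mont, m, ctx);			precompute for modulus m
 *	BN_to_montgomery(ar, a, mont, ctx);		ar = a*R mod m
 *	BN_to_montgomery(br, b, mont, ctx);		br = b*R mod m
 *	BN_mod_mul_montgomery(rr, ar, br, mont, ctx);	rr = a*b*R mod m
 *	BN_from_montgomery(r, rr, mont, ctx);		r = a*b mod m
 *	BN_MONT_CTX_free(mont);
 */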
507
508/* BN_BLINDING flags */
509#define BN_BLINDING_NO_UPDATE 0x00000001
510#define BN_BLINDING_NO_RECREATE 0x00000002
511
512BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod);
513void BN_BLINDING_free(BN_BLINDING *b);
514int BN_BLINDING_update(BN_BLINDING *b, BN_CTX *ctx);
515int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx);
516int BN_BLINDING_invert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx);
517int BN_BLINDING_convert_ex(BIGNUM *n, BIGNUM *r, BN_BLINDING *b, BN_CTX *);
518int BN_BLINDING_invert_ex(BIGNUM *n, const BIGNUM *r, BN_BLINDING *b, BN_CTX *);
519#ifndef OPENSSL_NO_DEPRECATED
520unsigned long BN_BLINDING_get_thread_id(const BN_BLINDING *);
521void BN_BLINDING_set_thread_id(BN_BLINDING *, unsigned long);
522#endif
523CRYPTO_THREADID *BN_BLINDING_thread_id(BN_BLINDING *);
524unsigned long BN_BLINDING_get_flags(const BN_BLINDING *);
525void BN_BLINDING_set_flags(BN_BLINDING *, unsigned long);
526BN_BLINDING *BN_BLINDING_create_param(BN_BLINDING *b,
527 const BIGNUM *e, BIGNUM *m, BN_CTX *ctx,
528 int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
529 const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx),
530 BN_MONT_CTX *m_ctx);
531
532#ifndef OPENSSL_NO_DEPRECATED
533void BN_set_params(int mul, int high, int low, int mont);
534int BN_get_params(int which); /* 0, mul, 1 high, 2 low, 3 mont */
535#endif
536
537void BN_RECP_CTX_init(BN_RECP_CTX *recp);
538BN_RECP_CTX *BN_RECP_CTX_new(void);
539void BN_RECP_CTX_free(BN_RECP_CTX *recp);
540int BN_RECP_CTX_set(BN_RECP_CTX *recp, const BIGNUM *rdiv, BN_CTX *ctx);
541int BN_mod_mul_reciprocal(BIGNUM *r, const BIGNUM *x, const BIGNUM *y,
542 BN_RECP_CTX *recp, BN_CTX *ctx);
543int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
544 const BIGNUM *m, BN_CTX *ctx);
545int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
546 BN_RECP_CTX *recp, BN_CTX *ctx);
547
548#ifndef OPENSSL_NO_EC2M
549
550/* Functions for arithmetic over binary polynomials represented by BIGNUMs.
551 *
552 * The BIGNUM::neg property of BIGNUMs representing binary polynomials is
553 * ignored.
554 *
555 * Note that input arguments are not const so that their bit arrays can
556 * be expanded to the appropriate size if needed.
557 */
558
559int BN_GF2m_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b); /*r = a + b*/
560#define BN_GF2m_sub(r, a, b) BN_GF2m_add(r, a, b)
561int BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p); /*r=a mod p*/
562int
563BN_GF2m_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
564 const BIGNUM *p, BN_CTX *ctx); /* r = (a * b) mod p */
565int
566BN_GF2m_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
567 BN_CTX *ctx); /* r = (a * a) mod p */
568int
569BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *b, const BIGNUM *p,
570 BN_CTX *ctx); /* r = (1 / b) mod p */
571int
572BN_GF2m_mod_div(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
573 const BIGNUM *p, BN_CTX *ctx); /* r = (a / b) mod p */
574int
575BN_GF2m_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
576 const BIGNUM *p, BN_CTX *ctx); /* r = (a ^ b) mod p */
577int
578BN_GF2m_mod_sqrt(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
579 BN_CTX *ctx); /* r = sqrt(a) mod p */
580int BN_GF2m_mod_solve_quad(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
581 BN_CTX *ctx); /* r^2 + r = a mod p */
582#define BN_GF2m_cmp(a, b) BN_ucmp((a), (b))
583/* Some functions allow for representation of the irreducible polynomials
584 * as an int[], say p. The irreducible f(t) is then of the form:
585 * t^p[0] + t^p[1] + ... + t^p[k]
586 * where m = p[0] > p[1] > ... > p[k] = 0.
587 */
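/* Example: the degree-163 pentanomial t^163 + t^7 + t^6 + t^3 + 1
 * (the NIST B-163/K-163 reduction polynomial) is represented as
 *
 *	static const int p163[] = { 163, 7, 6, 3, 0 };
 *
 * and BN_GF2m_arr2poly(p163, a), declared below, rebuilds the BIGNUM
 * form. */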
588int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[]);
589/* r = a mod p */
590int BN_GF2m_mod_mul_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
591 const int p[], BN_CTX *ctx); /* r = (a * b) mod p */
592int BN_GF2m_mod_sqr_arr(BIGNUM *r, const BIGNUM *a, const int p[],
593 BN_CTX *ctx); /* r = (a * a) mod p */
594int BN_GF2m_mod_inv_arr(BIGNUM *r, const BIGNUM *b, const int p[],
595 BN_CTX *ctx); /* r = (1 / b) mod p */
596int BN_GF2m_mod_div_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
597 const int p[], BN_CTX *ctx); /* r = (a / b) mod p */
598int BN_GF2m_mod_exp_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
599 const int p[], BN_CTX *ctx); /* r = (a ^ b) mod p */
600int BN_GF2m_mod_sqrt_arr(BIGNUM *r, const BIGNUM *a,
601 const int p[], BN_CTX *ctx); /* r = sqrt(a) mod p */
602int BN_GF2m_mod_solve_quad_arr(BIGNUM *r, const BIGNUM *a,
603 const int p[], BN_CTX *ctx); /* r^2 + r = a mod p */
604int BN_GF2m_poly2arr(const BIGNUM *a, int p[], int max);
605int BN_GF2m_arr2poly(const int p[], BIGNUM *a);
606
607#endif
608
609/* faster mod functions for the 'NIST primes'
610 * 0 <= a < p^2 */
611int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
612int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
613int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
614int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
615int BN_nist_mod_521(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
616
617const BIGNUM *BN_get0_nist_prime_192(void);
618const BIGNUM *BN_get0_nist_prime_224(void);
619const BIGNUM *BN_get0_nist_prime_256(void);
620const BIGNUM *BN_get0_nist_prime_384(void);
621const BIGNUM *BN_get0_nist_prime_521(void);
622
623/* library internal functions */
624
625#define bn_expand(a,bits) ((((((bits+BN_BITS2-1))/BN_BITS2)) <= (a)->dmax)?\
626 (a):bn_expand2((a),(bits+BN_BITS2-1)/BN_BITS2))
627#define bn_wexpand(a,words) (((words) <= (a)->dmax)?(a):bn_expand2((a),(words)))
628BIGNUM *bn_expand2(BIGNUM *a, int words);
629#ifndef OPENSSL_NO_DEPRECATED
630BIGNUM *bn_dup_expand(const BIGNUM *a, int words); /* unused */
631#endif
632
633/* Bignum consistency macros
634 * There is one "API" macro, bn_fix_top(), for stripping leading zeroes from
635 * bignum data after direct manipulations on the data. There is also an
636 * "internal" macro, bn_check_top(), for verifying that there are no leading
637 * zeroes. Unfortunately, some auditing is required due to the fact that
638 * bn_fix_top() has become an overabused duct-tape because bignum data is
639 * occasionally passed around in an inconsistent state. So the following
640 * changes have been made to sort this out:
641 * - bn_fix_top()s implementation has been moved to bn_correct_top()
642 * - if BN_DEBUG isn't defined, bn_fix_top() maps to bn_correct_top(), and
643 * bn_check_top() is as before.
644 * - if BN_DEBUG *is* defined;
645 * - bn_check_top() tries to pollute unused words even if the bignum 'top' is
646 * consistent. (ed: only if BN_DEBUG_RAND is defined)
647 * - bn_fix_top() maps to bn_check_top() rather than "fixing" anything.
648 * The idea is to have debug builds flag up inconsistent bignums when they
649 * occur. If that occurs in a bn_fix_top(), we examine the code in question; if
650 * the use of bn_fix_top() was appropriate (ie. it follows directly after code
651 * that manipulates the bignum) it is converted to bn_correct_top(), and if it
652 * was not appropriate, we convert it permanently to bn_check_top() and track
653 * down the cause of the bug. Eventually, no internal code should be using the
654 * bn_fix_top() macro. External applications and libraries should try this with
655 * their own code too, both in terms of building against the openssl headers
656 * with BN_DEBUG defined *and* linking with a version of OpenSSL built with it
657 * defined. This not only improves external code, it provides more test
658 * coverage for openssl's own code.
659 */
660
661#ifdef BN_DEBUG
662
663/* We only need assert() when debugging */
664#include <assert.h>
665
666#ifdef BN_DEBUG_RAND
667#define bn_pollute(a) \
668 do { \
669 const BIGNUM *_bnum1 = (a); \
670 if(_bnum1->top < _bnum1->dmax) { \
671 unsigned char _tmp_char; \
672 /* We cast away const without the compiler knowing, any \
673 * *genuinely* constant variables that aren't mutable \
674 * wouldn't be constructed with top!=dmax. */ \
675 BN_ULONG *_not_const; \
676 memcpy(&_not_const, &_bnum1->d, sizeof(BN_ULONG*)); \
677 arc4random_buf(&_tmp_char, 1); \
678 memset((unsigned char *)(_not_const + _bnum1->top), _tmp_char, \
679 (_bnum1->dmax - _bnum1->top) * sizeof(BN_ULONG)); \
680 } \
681 } while(0)
682#else
683#define bn_pollute(a)
684#endif
685
686#define bn_check_top(a) \
687 do { \
688 const BIGNUM *_bnum2 = (a); \
689 if (_bnum2 != NULL) { \
690 assert((_bnum2->top == 0) || \
691 (_bnum2->d[_bnum2->top - 1] != 0)); \
692 bn_pollute(_bnum2); \
693 } \
694 } while(0)
695
696#define bn_fix_top(a) bn_check_top(a)
697
698#define bn_check_size(bn, bits) bn_wcheck_size(bn, ((bits+BN_BITS2-1))/BN_BITS2)
699#define bn_wcheck_size(bn, words) \
700 do { \
701 const BIGNUM *_bnum2 = (bn); \
702 assert(words <= (_bnum2)->dmax && words >= (_bnum2)->top); \
703 } while(0)
704
705#else /* !BN_DEBUG */
706
707#define bn_pollute(a)
708#define bn_check_top(a)
709#define bn_fix_top(a) bn_correct_top(a)
710#define bn_check_size(bn, bits)
711#define bn_wcheck_size(bn, words)
712
713#endif
714
715#define bn_correct_top(a) \
716 { \
717 BN_ULONG *ftl; \
718 int tmp_top = (a)->top; \
719 if (tmp_top > 0) \
720 { \
721 for (ftl= &((a)->d[tmp_top-1]); tmp_top > 0; tmp_top--) \
722 if (*(ftl--)) break; \
723 (a)->top = tmp_top; \
724 } \
725 bn_pollute(a); \
726 }
727
728BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
729BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
730void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
731BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
732BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, int num);
733BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, int num);
734
735/* Primes from RFC 2409 */
736BIGNUM *get_rfc2409_prime_768(BIGNUM *bn);
737BIGNUM *get_rfc2409_prime_1024(BIGNUM *bn);
738
739/* Primes from RFC 3526 */
740BIGNUM *get_rfc3526_prime_1536(BIGNUM *bn);
741BIGNUM *get_rfc3526_prime_2048(BIGNUM *bn);
742BIGNUM *get_rfc3526_prime_3072(BIGNUM *bn);
743BIGNUM *get_rfc3526_prime_4096(BIGNUM *bn);
744BIGNUM *get_rfc3526_prime_6144(BIGNUM *bn);
745BIGNUM *get_rfc3526_prime_8192(BIGNUM *bn);
746
747int BN_bntest_rand(BIGNUM *rnd, int bits, int top, int bottom);
748
749/* BEGIN ERROR CODES */
750/* The following lines are auto generated by the script mkerr.pl. Any changes
751 * made after this point may be overwritten when the script is next run.
752 */
753void ERR_load_BN_strings(void);
754
755/* Error codes for the BN functions. */
756
757/* Function codes. */
758#define BN_F_BNRAND 127
759#define BN_F_BN_BLINDING_CONVERT_EX 100
760#define BN_F_BN_BLINDING_CREATE_PARAM 128
761#define BN_F_BN_BLINDING_INVERT_EX 101
762#define BN_F_BN_BLINDING_NEW 102
763#define BN_F_BN_BLINDING_UPDATE 103
764#define BN_F_BN_BN2DEC 104
765#define BN_F_BN_BN2HEX 105
766#define BN_F_BN_CTX_GET 116
767#define BN_F_BN_CTX_NEW 106
768#define BN_F_BN_CTX_START 129
769#define BN_F_BN_DIV 107
770#define BN_F_BN_DIV_NO_BRANCH 138
771#define BN_F_BN_DIV_RECP 130
772#define BN_F_BN_EXP 123
773#define BN_F_BN_EXPAND2 108
774#define BN_F_BN_EXPAND_INTERNAL 120
775#define BN_F_BN_GF2M_MOD 131
776#define BN_F_BN_GF2M_MOD_EXP 132
777#define BN_F_BN_GF2M_MOD_MUL 133
778#define BN_F_BN_GF2M_MOD_SOLVE_QUAD 134
779#define BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR 135
780#define BN_F_BN_GF2M_MOD_SQR 136
781#define BN_F_BN_GF2M_MOD_SQRT 137
782#define BN_F_BN_MOD_EXP2_MONT 118
783#define BN_F_BN_MOD_EXP_MONT 109
784#define BN_F_BN_MOD_EXP_MONT_CONSTTIME 124
785#define BN_F_BN_MOD_EXP_MONT_WORD 117
786#define BN_F_BN_MOD_EXP_RECP 125
787#define BN_F_BN_MOD_EXP_SIMPLE 126
788#define BN_F_BN_MOD_INVERSE 110
789#define BN_F_BN_MOD_INVERSE_NO_BRANCH 139
790#define BN_F_BN_MOD_LSHIFT_QUICK 119
791#define BN_F_BN_MOD_MUL_RECIPROCAL 111
792#define BN_F_BN_MOD_SQRT 121
793#define BN_F_BN_MPI2BN 112
794#define BN_F_BN_NEW 113
795#define BN_F_BN_RAND 114
796#define BN_F_BN_RAND_RANGE 122
797#define BN_F_BN_USUB 115
798
799/* Reason codes. */
800#define BN_R_ARG2_LT_ARG3 100
801#define BN_R_BAD_RECIPROCAL 101
802#define BN_R_BIGNUM_TOO_LONG 114
803#define BN_R_CALLED_WITH_EVEN_MODULUS 102
804#define BN_R_DIV_BY_ZERO 103
805#define BN_R_ENCODING_ERROR 104
806#define BN_R_EXPAND_ON_STATIC_BIGNUM_DATA 105
807#define BN_R_INPUT_NOT_REDUCED 110
808#define BN_R_INVALID_LENGTH 106
809#define BN_R_INVALID_RANGE 115
810#define BN_R_NOT_A_SQUARE 111
811#define BN_R_NOT_INITIALIZED 107
812#define BN_R_NO_INVERSE 108
813#define BN_R_NO_SOLUTION 116
814#define BN_R_P_IS_NOT_PRIME 112
815#define BN_R_TOO_MANY_ITERATIONS 113
816#define BN_R_TOO_MANY_TEMPORARY_VARIABLES 109
817
818#ifdef __cplusplus
819}
820#endif
821#endif
diff --git a/src/lib/libcrypto/bn/bn_add.c b/src/lib/libcrypto/bn/bn_add.c
deleted file mode 100644
index ebc9b9b56b..0000000000
--- a/src/lib/libcrypto/bn/bn_add.c
+++ /dev/null
@@ -1,313 +0,0 @@
1/* $OpenBSD: bn_add.c,v 1.10 2014/10/28 07:35:58 jsg Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60
61#include <openssl/err.h>
62
63#include "bn_lcl.h"
64
65/* r can == a or b */
66int
67BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
68{
69 const BIGNUM *tmp;
70 int a_neg = a->neg, ret;
71
72 bn_check_top(a);
73 bn_check_top(b);
74
75 /* a + b a+b
76 * a + -b a-b
77 * -a + b b-a
78 * -a + -b -(a+b)
79 */
80 if (a_neg ^ b->neg) {
81 /* only one is negative */
82		if (a_neg) {
83			tmp = a;
84			a = b;
85			b = tmp;
86		}
87
88 /* we are now a - b */
89
90 if (BN_ucmp(a, b) < 0) {
91 if (!BN_usub(r, b, a))
92 return (0);
93 r->neg = 1;
94 } else {
95 if (!BN_usub(r, a, b))
96 return (0);
97 r->neg = 0;
98 }
99 return (1);
100 }
101
102 ret = BN_uadd(r, a, b);
103 r->neg = a_neg;
104 bn_check_top(r);
105 return ret;
106}
107
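A minimal usage sketch for BN_add, with error handling trimmed; all the
calls are standard BN API, and the result may alias either operand:

	BIGNUM *a = BN_new(), *b = BN_new(), *r = BN_new();

	if (a != NULL && b != NULL && r != NULL &&
	    BN_set_word(a, 41) && BN_set_word(b, 1) &&
	    BN_add(r, a, b))
		; /* r now holds 42; BN_add(a, a, b) would also be valid */
	BN_free(a);
	BN_free(b);
	BN_free(r);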
108/* unsigned add of b to a */
109int
110BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
111{
112 int max, min, dif;
113 BN_ULONG *ap, *bp, *rp, carry, t1, t2;
114 const BIGNUM *tmp;
115
116 bn_check_top(a);
117 bn_check_top(b);
118
119 if (a->top < b->top) {
120 tmp = a;
121 a = b;
122 b = tmp;
123 }
124 max = a->top;
125 min = b->top;
126 dif = max - min;
127
128 if (bn_wexpand(r, max + 1) == NULL)
129 return 0;
130
131 r->top = max;
132
133 ap = a->d;
134 bp = b->d;
135 rp = r->d;
136
137 carry = bn_add_words(rp, ap, bp, min);
138 rp += min;
139 ap += min;
140 bp += min;
141
142 if (carry) {
143 while (dif) {
144 dif--;
145 t1 = *(ap++);
146 t2 = (t1 + 1) & BN_MASK2;
147 *(rp++) = t2;
148 if (t2) {
149 carry = 0;
150 break;
151 }
152 }
153 if (carry) {
154 /* carry != 0 => dif == 0 */
155 *rp = 1;
156 r->top++;
157 }
158 }
159 if (dif && rp != ap)
160 while (dif--)
161 /* copy remaining words if ap != rp */
162 *(rp++) = *(ap++);
163 r->neg = 0;
164 bn_check_top(r);
165 return 1;
166}
167
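An illustration of the carry ripple handled above, assuming
BN_BITS2 == 64: with a = { 0xFFFFFFFFFFFFFFFF, 0x1 } (a->top == 2) and
b = { 0x1 } (b->top == 1), bn_add_words over min == 1 words leaves
r[0] == 0 with carry == 1; the carry loop then turns a->d[1] == 0x1 into
r[1] == 0x2, clears the carry, and r->top remains 2.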
168/* unsigned subtraction of b from a, a must be larger than b. */
169int
170BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
171{
172 int max, min, dif;
173 BN_ULONG t1, t2, *ap, *bp, *rp;
174 int i, carry;
175
176 bn_check_top(a);
177 bn_check_top(b);
178
179 max = a->top;
180 min = b->top;
181 dif = max - min;
182
183	/* hmm... should not be happening */
184	if (dif < 0) {
185		BNerr(BN_F_BN_USUB, BN_R_ARG2_LT_ARG3);
186		return (0);
187	}
188
189 if (bn_wexpand(r, max) == NULL)
190 return (0);
191
192 ap = a->d;
193 bp = b->d;
194 rp = r->d;
195
196#if 1
197 carry = 0;
198 for (i = min; i != 0; i--) {
199		t1 = *(ap++);
200		t2 = *(bp++);
201 if (carry) {
202 carry = (t1 <= t2);
203 t1 = (t1 - t2 - 1)&BN_MASK2;
204 } else {
205 carry = (t1 < t2);
206 t1 = (t1 - t2)&BN_MASK2;
207 }
208 *(rp++) = t1&BN_MASK2;
209 }
210#else
211 carry = bn_sub_words(rp, ap, bp, min);
212 ap += min;
213 bp += min;
214 rp += min;
215#endif
216	/* subtracted */
217	if (carry) {
218 if (!dif)
219 /* error: a < b */
220 return 0;
221 while (dif) {
222 dif--;
223 t1 = *(ap++);
224 t2 = (t1 - 1)&BN_MASK2;
225 *(rp++) = t2;
226 if (t1)
227 break;
228 }
229 }
230#if 0
231 memcpy(rp, ap, sizeof(*rp)*(max - i));
232#else
233 if (rp != ap) {
234 for (;;) {
235 if (!dif--)
236 break;
237 rp[0] = ap[0];
238 if (!dif--)
239 break;
240 rp[1] = ap[1];
241 if (!dif--)
242 break;
243 rp[2] = ap[2];
244 if (!dif--)
245 break;
246 rp[3] = ap[3];
247 rp += 4;
248 ap += 4;
249 }
250 }
251#endif
252
253 r->top = max;
254 r->neg = 0;
255 bn_correct_top(r);
256 return (1);
257}
258
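A hypothetical caller-side guard for BN_usub, mirroring the BN_ucmp check
that BN_add and BN_sub perform before calling it:

	if (BN_ucmp(a, b) >= 0) {
		/* |a| >= |b| holds, so any failure here is a real error */
		if (!BN_usub(r, a, b))
			goto err;
	}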
259int
260BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
261{
262 int max;
263 int add = 0, neg = 0;
264 const BIGNUM *tmp;
265
266 bn_check_top(a);
267 bn_check_top(b);
268
269 /* a - b a-b
270 * a - -b a+b
271 * -a - b -(a+b)
272 * -a - -b b-a
273 */
274 if (a->neg) {
275 if (b->neg) {
276 tmp = a;
277 a = b;
278 b = tmp;
279 } else {
280 add = 1;
281 neg = 1;
282 }
283 } else {
284 if (b->neg) {
285 add = 1;
286 neg = 0;
287 }
288 }
289
290 if (add) {
291 if (!BN_uadd(r, a, b))
292 return (0);
293 r->neg = neg;
294 return (1);
295 }
296
297 /* We are actually doing a - b :-) */
298
299 max = (a->top > b->top) ? a->top : b->top;
300 if (bn_wexpand(r, max) == NULL)
301 return (0);
302 if (BN_ucmp(a, b) < 0) {
303 if (!BN_usub(r, b, a))
304 return (0);
305 r->neg = 1;
306 } else {
307 if (!BN_usub(r, a, b))
308 return (0);
309 r->neg = 0;
310 }
311 bn_check_top(r);
312 return (1);
313}
diff --git a/src/lib/libcrypto/bn/bn_asm.c b/src/lib/libcrypto/bn/bn_asm.c
deleted file mode 100644
index 49f0ba5d7b..0000000000
--- a/src/lib/libcrypto/bn/bn_asm.c
+++ /dev/null
@@ -1,1098 +0,0 @@
1/* $OpenBSD: bn_asm.c,v 1.14 2015/02/25 15:39:49 bcook Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#ifndef BN_DEBUG
60# undef NDEBUG /* avoid conflicting definitions */
61# define NDEBUG
62#endif
63
64#include <assert.h>
65#include <stdio.h>
66
67#include <openssl/opensslconf.h>
68
69#include "bn_lcl.h"
70
71#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
72
73BN_ULONG
74bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
75{
76 BN_ULONG c1 = 0;
77
78 assert(num >= 0);
79 if (num <= 0)
80 return (c1);
81
82#ifndef OPENSSL_SMALL_FOOTPRINT
83 while (num & ~3) {
84 mul_add(rp[0], ap[0], w, c1);
85 mul_add(rp[1], ap[1], w, c1);
86 mul_add(rp[2], ap[2], w, c1);
87 mul_add(rp[3], ap[3], w, c1);
88 ap += 4;
89 rp += 4;
90 num -= 4;
91 }
92#endif
93 while (num) {
94 mul_add(rp[0], ap[0], w, c1);
95 ap++;
96 rp++;
97 num--;
98 }
99
100 return (c1);
101}
102
103BN_ULONG
104bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
105{
106 BN_ULONG c1 = 0;
107
108 assert(num >= 0);
109 if (num <= 0)
110 return (c1);
111
112#ifndef OPENSSL_SMALL_FOOTPRINT
113 while (num & ~3) {
114 mul(rp[0], ap[0], w, c1);
115 mul(rp[1], ap[1], w, c1);
116 mul(rp[2], ap[2], w, c1);
117 mul(rp[3], ap[3], w, c1);
118 ap += 4;
119 rp += 4;
120 num -= 4;
121 }
122#endif
123 while (num) {
124 mul(rp[0], ap[0], w, c1);
125 ap++;
126 rp++;
127 num--;
128 }
129 return (c1);
130}
131
132void
133bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
134{
135 assert(n >= 0);
136 if (n <= 0)
137 return;
138
139#ifndef OPENSSL_SMALL_FOOTPRINT
140 while (n & ~3) {
141 sqr(r[0], r[1], a[0]);
142 sqr(r[2], r[3], a[1]);
143 sqr(r[4], r[5], a[2]);
144 sqr(r[6], r[7], a[3]);
145 a += 4;
146 r += 8;
147 n -= 4;
148 }
149#endif
150 while (n) {
151 sqr(r[0], r[1], a[0]);
152 a++;
153 r += 2;
154 n--;
155 }
156}
157
158#else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
159
160BN_ULONG
161bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
162{
163 BN_ULONG c = 0;
164 BN_ULONG bl, bh;
165
166 assert(num >= 0);
167 if (num <= 0)
168 return ((BN_ULONG)0);
169
170 bl = LBITS(w);
171 bh = HBITS(w);
172
173#ifndef OPENSSL_SMALL_FOOTPRINT
174 while (num & ~3) {
175 mul_add(rp[0], ap[0], bl, bh, c);
176 mul_add(rp[1], ap[1], bl, bh, c);
177 mul_add(rp[2], ap[2], bl, bh, c);
178 mul_add(rp[3], ap[3], bl, bh, c);
179 ap += 4;
180 rp += 4;
181 num -= 4;
182 }
183#endif
184 while (num) {
185 mul_add(rp[0], ap[0], bl, bh, c);
186 ap++;
187 rp++;
188 num--;
189 }
190 return (c);
191}
192
193BN_ULONG
194bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
195{
196 BN_ULONG carry = 0;
197 BN_ULONG bl, bh;
198
199 assert(num >= 0);
200 if (num <= 0)
201 return ((BN_ULONG)0);
202
203 bl = LBITS(w);
204 bh = HBITS(w);
205
206#ifndef OPENSSL_SMALL_FOOTPRINT
207 while (num & ~3) {
208 mul(rp[0], ap[0], bl, bh, carry);
209 mul(rp[1], ap[1], bl, bh, carry);
210 mul(rp[2], ap[2], bl, bh, carry);
211 mul(rp[3], ap[3], bl, bh, carry);
212 ap += 4;
213 rp += 4;
214 num -= 4;
215 }
216#endif
217 while (num) {
218 mul(rp[0], ap[0], bl, bh, carry);
219 ap++;
220 rp++;
221 num--;
222 }
223 return (carry);
224}
225
226void
227bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
228{
229 assert(n >= 0);
230 if (n <= 0)
231 return;
232
233#ifndef OPENSSL_SMALL_FOOTPRINT
234 while (n & ~3) {
235 sqr64(r[0], r[1], a[0]);
236 sqr64(r[2], r[3], a[1]);
237 sqr64(r[4], r[5], a[2]);
238 sqr64(r[6], r[7], a[3]);
239 a += 4;
240 r += 8;
241 n -= 4;
242 }
243#endif
244 while (n) {
245 sqr64(r[0], r[1], a[0]);
246 a++;
247 r += 2;
248 n--;
249 }
250}
251
252#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
253
254#if defined(BN_LLONG) && defined(BN_DIV2W)
255
256BN_ULONG
257bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
258{
259 return ((BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2)|l)/(BN_ULLONG)d));
260}
261
262#else
263
264/* Divide h,l by d and return the result. */
265/* I need to test this some more :-( */
266BN_ULONG
267bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
268{
269	BN_ULONG dh, dl, q, ret = 0, th, tl, t;
270 int i, count = 2;
271
272 if (d == 0)
273 return (BN_MASK2);
274
275 i = BN_num_bits_word(d);
276 assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
277
278 i = BN_BITS2 - i;
279 if (h >= d)
280 h -= d;
281
282 if (i) {
283 d <<= i;
284 h = (h << i) | (l >> (BN_BITS2 - i));
285 l <<= i;
286 }
287 dh = (d & BN_MASK2h) >> BN_BITS4;
288 dl = (d & BN_MASK2l);
289 for (;;) {
290 if ((h >> BN_BITS4) == dh)
291 q = BN_MASK2l;
292 else
293 q = h / dh;
294
295 th = q * dh;
296 tl = dl * q;
297 for (;;) {
298 t = h - th;
299 if ((t & BN_MASK2h) ||
300 ((tl) <= (
301 (t << BN_BITS4) |
302 ((l & BN_MASK2h) >> BN_BITS4))))
303 break;
304 q--;
305 th -= dh;
306 tl -= dl;
307 }
308 t = (tl >> BN_BITS4);
309 tl = (tl << BN_BITS4) & BN_MASK2h;
310 th += t;
311
312 if (l < tl)
313 th++;
314 l -= tl;
315 if (h < th) {
316 h += d;
317 q--;
318 }
319 h -= th;
320
321 if (--count == 0)
322 break;
323
324 ret = q << BN_BITS4;
325 h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
326 l = (l & BN_MASK2l) << BN_BITS4;
327 }
328 ret |= q;
329 return (ret);
330}
331#endif /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
332
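A worked example of the two-word-by-one-word division contract, assuming
BN_BITS2 == 32: bn_div_words(0x1, 0x0, 0x2) divides the 64-bit value
0x0000000100000000 by 2 and returns 0x80000000.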
333#ifdef BN_LLONG
334BN_ULONG
335bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
336{
337 BN_ULLONG ll = 0;
338
339 assert(n >= 0);
340 if (n <= 0)
341 return ((BN_ULONG)0);
342
343#ifndef OPENSSL_SMALL_FOOTPRINT
344 while (n & ~3) {
345 ll += (BN_ULLONG)a[0] + b[0];
346 r[0] = (BN_ULONG)ll & BN_MASK2;
347 ll >>= BN_BITS2;
348 ll += (BN_ULLONG)a[1] + b[1];
349 r[1] = (BN_ULONG)ll & BN_MASK2;
350 ll >>= BN_BITS2;
351 ll += (BN_ULLONG)a[2] + b[2];
352 r[2] = (BN_ULONG)ll & BN_MASK2;
353 ll >>= BN_BITS2;
354 ll += (BN_ULLONG)a[3] + b[3];
355 r[3] = (BN_ULONG)ll & BN_MASK2;
356 ll >>= BN_BITS2;
357 a += 4;
358 b += 4;
359 r += 4;
360 n -= 4;
361 }
362#endif
363 while (n) {
364 ll += (BN_ULLONG)a[0] + b[0];
365 r[0] = (BN_ULONG)ll & BN_MASK2;
366 ll >>= BN_BITS2;
367 a++;
368 b++;
369 r++;
370 n--;
371 }
372 return ((BN_ULONG)ll);
373}
374#else /* !BN_LLONG */
375BN_ULONG
376bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
377{
378 BN_ULONG c, l, t;
379
380 assert(n >= 0);
381 if (n <= 0)
382 return ((BN_ULONG)0);
383
384 c = 0;
385#ifndef OPENSSL_SMALL_FOOTPRINT
386 while (n & ~3) {
387 t = a[0];
388 t = (t + c) & BN_MASK2;
389 c = (t < c);
390 l = (t + b[0]) & BN_MASK2;
391 c += (l < t);
392 r[0] = l;
393 t = a[1];
394 t = (t + c) & BN_MASK2;
395 c = (t < c);
396 l = (t + b[1]) & BN_MASK2;
397 c += (l < t);
398 r[1] = l;
399 t = a[2];
400 t = (t + c) & BN_MASK2;
401 c = (t < c);
402 l = (t + b[2]) & BN_MASK2;
403 c += (l < t);
404 r[2] = l;
405 t = a[3];
406 t = (t + c) & BN_MASK2;
407 c = (t < c);
408 l = (t + b[3]) & BN_MASK2;
409 c += (l < t);
410 r[3] = l;
411 a += 4;
412 b += 4;
413 r += 4;
414 n -= 4;
415 }
416#endif
417 while (n) {
418 t = a[0];
419 t = (t + c) & BN_MASK2;
420 c = (t < c);
421 l = (t + b[0]) & BN_MASK2;
422 c += (l < t);
423 r[0] = l;
424 a++;
425 b++;
426 r++;
427 n--;
428 }
429 return ((BN_ULONG)c);
430}
431#endif /* !BN_LLONG */
432
433BN_ULONG
434bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
435{
436 BN_ULONG t1, t2;
437 int c = 0;
438
439 assert(n >= 0);
440 if (n <= 0)
441 return ((BN_ULONG)0);
442
443#ifndef OPENSSL_SMALL_FOOTPRINT
444 while (n&~3) {
445 t1 = a[0];
446 t2 = b[0];
447 r[0] = (t1 - t2 - c) & BN_MASK2;
448 if (t1 != t2)
449 c = (t1 < t2);
450 t1 = a[1];
451 t2 = b[1];
452 r[1] = (t1 - t2 - c) & BN_MASK2;
453 if (t1 != t2)
454 c = (t1 < t2);
455 t1 = a[2];
456 t2 = b[2];
457 r[2] = (t1 - t2 - c) & BN_MASK2;
458 if (t1 != t2)
459 c = (t1 < t2);
460 t1 = a[3];
461 t2 = b[3];
462 r[3] = (t1 - t2 - c) & BN_MASK2;
463 if (t1 != t2)
464 c = (t1 < t2);
465 a += 4;
466 b += 4;
467 r += 4;
468 n -= 4;
469 }
470#endif
471 while (n) {
472 t1 = a[0];
473 t2 = b[0];
474 r[0] = (t1 - t2 - c) & BN_MASK2;
475 if (t1 != t2)
476 c = (t1 < t2);
477 a++;
478 b++;
479 r++;
480 n--;
481 }
482 return (c);
483}
484
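A worked borrow example for bn_sub_words, assuming BN_BITS2 == 32:
subtracting b = { 0x1, 0x0 } from a = { 0x0, 0x1 } gives
r[0] = (0x0 - 0x1) & BN_MASK2 == 0xFFFFFFFF with c = 1 (t1 < t2); then
r[1] = (0x1 - 0x0 - 1) & BN_MASK2 == 0x0, and since t1 != t2 there, c is
rewritten to (0x1 < 0x0) == 0, so the final borrow returned is 0.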
485#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
486
487#undef bn_mul_comba8
488#undef bn_mul_comba4
489#undef bn_sqr_comba8
490#undef bn_sqr_comba4
491
492/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
493/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
494/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
495/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
496
497#ifdef BN_LLONG
498/*
499 * Keep in mind that additions to multiplication result can not
500 * overflow, because its high half cannot be all-ones.
501 */
502#define mul_add_c(a,b,c0,c1,c2) do { \
503 BN_ULONG hi; \
504 BN_ULLONG t = (BN_ULLONG)(a)*(b); \
505 t += c0; /* no carry */ \
506 c0 = (BN_ULONG)Lw(t); \
507 hi = (BN_ULONG)Hw(t); \
508 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
509 } while(0)
510
511#define mul_add_c2(a,b,c0,c1,c2) do { \
512 BN_ULONG hi; \
513 BN_ULLONG t = (BN_ULLONG)(a)*(b); \
514 BN_ULLONG tt = t+c0; /* no carry */ \
515 c0 = (BN_ULONG)Lw(tt); \
516 hi = (BN_ULONG)Hw(tt); \
517 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
518 t += c0; /* no carry */ \
519 c0 = (BN_ULONG)Lw(t); \
520 hi = (BN_ULONG)Hw(t); \
521 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
522 } while(0)
523
524#define sqr_add_c(a,i,c0,c1,c2) do { \
525 BN_ULONG hi; \
526 BN_ULLONG t = (BN_ULLONG)a[i]*a[i]; \
527 t += c0; /* no carry */ \
528 c0 = (BN_ULONG)Lw(t); \
529 hi = (BN_ULONG)Hw(t); \
530 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
531 } while(0)
532
533#define sqr_add_c2(a,i,j,c0,c1,c2) \
534 mul_add_c2((a)[i],(a)[j],c0,c1,c2)
535
536#elif defined(BN_UMULT_LOHI)
537/*
538 * Keep in mind that additions to hi can not overflow, because
539 * the high word of a multiplication result cannot be all-ones.
540 */
541#define mul_add_c(a,b,c0,c1,c2) do { \
542 BN_ULONG ta = (a), tb = (b); \
543 BN_ULONG lo, hi; \
544 BN_UMULT_LOHI(lo,hi,ta,tb); \
545 c0 += lo; hi += (c0<lo)?1:0; \
546 c1 += hi; c2 += (c1<hi)?1:0; \
547 } while(0)
548
549#define mul_add_c2(a,b,c0,c1,c2) do { \
550 BN_ULONG ta = (a), tb = (b); \
551 BN_ULONG lo, hi, tt; \
552 BN_UMULT_LOHI(lo,hi,ta,tb); \
553 c0 += lo; tt = hi+((c0<lo)?1:0); \
554 c1 += tt; c2 += (c1<tt)?1:0; \
555 c0 += lo; hi += (c0<lo)?1:0; \
556 c1 += hi; c2 += (c1<hi)?1:0; \
557 } while(0)
558
559#define sqr_add_c(a,i,c0,c1,c2) do { \
560 BN_ULONG ta = (a)[i]; \
561 BN_ULONG lo, hi; \
562 BN_UMULT_LOHI(lo,hi,ta,ta); \
563 c0 += lo; hi += (c0<lo)?1:0; \
564 c1 += hi; c2 += (c1<hi)?1:0; \
565 } while(0)
566
567#define sqr_add_c2(a,i,j,c0,c1,c2) \
568 mul_add_c2((a)[i],(a)[j],c0,c1,c2)
569
570#elif defined(BN_UMULT_HIGH)
571/*
572 * Keep in mind that additions to hi can not overflow, because
573 * the high word of a multiplication result cannot be all-ones.
574 */
575#define mul_add_c(a,b,c0,c1,c2) do { \
576 BN_ULONG ta = (a), tb = (b); \
577 BN_ULONG lo = ta * tb; \
578 BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
579 c0 += lo; hi += (c0<lo)?1:0; \
580 c1 += hi; c2 += (c1<hi)?1:0; \
581 } while(0)
582
583#define mul_add_c2(a,b,c0,c1,c2) do { \
584 BN_ULONG ta = (a), tb = (b), tt; \
585 BN_ULONG lo = ta * tb; \
586 BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
587 c0 += lo; tt = hi + ((c0<lo)?1:0); \
588 c1 += tt; c2 += (c1<tt)?1:0; \
589 c0 += lo; hi += (c0<lo)?1:0; \
590 c1 += hi; c2 += (c1<hi)?1:0; \
591 } while(0)
592
593#define sqr_add_c(a,i,c0,c1,c2) do { \
594 BN_ULONG ta = (a)[i]; \
595 BN_ULONG lo = ta * ta; \
596 BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \
597 c0 += lo; hi += (c0<lo)?1:0; \
598 c1 += hi; c2 += (c1<hi)?1:0; \
599 } while(0)
600
601#define sqr_add_c2(a,i,j,c0,c1,c2) \
602 mul_add_c2((a)[i],(a)[j],c0,c1,c2)
603
604#else /* !BN_LLONG */
605/*
606 * Keep in mind that additions to hi can not overflow, because
607 * the high word of a multiplication result cannot be all-ones.
608 */
609#define mul_add_c(a,b,c0,c1,c2) do { \
610 BN_ULONG lo = LBITS(a), hi = HBITS(a); \
611 BN_ULONG bl = LBITS(b), bh = HBITS(b); \
612 mul64(lo,hi,bl,bh); \
613 c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
614 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
615 } while(0)
616
617#define mul_add_c2(a,b,c0,c1,c2) do { \
618 BN_ULONG tt; \
619 BN_ULONG lo = LBITS(a), hi = HBITS(a); \
620 BN_ULONG bl = LBITS(b), bh = HBITS(b); \
621 mul64(lo,hi,bl,bh); \
622 tt = hi; \
623 c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \
624 c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \
625 c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
626 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
627 } while(0)
628
629#define sqr_add_c(a,i,c0,c1,c2) do { \
630 BN_ULONG lo, hi; \
631 sqr64(lo,hi,(a)[i]); \
632 c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
633 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
634 } while(0)
635
636#define sqr_add_c2(a,i,j,c0,c1,c2) \
637 mul_add_c2((a)[i],(a)[j],c0,c1,c2)
638#endif /* !BN_LLONG */
639
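A sketch of the three-word column accumulator the comba routines build
on, using unsigned __int128 purely as a hypothetical illustration of
mul_add_c for 64-bit words:

	static void
	toy_mul_add_c(unsigned long a, unsigned long b,
	    unsigned long *c0, unsigned long *c1, unsigned long *c2)
	{
		/* c += a*b across the three-word value (c2,c1,c0); adding
		 * *c0 cannot carry out of 128 bits because the high half
		 * of a full product is never all-ones. */
		unsigned __int128 t = (unsigned __int128)a * b + *c0;
		unsigned long hi = (unsigned long)(t >> 64);

		*c0 = (unsigned long)t;
		*c1 += hi;
		if (*c1 < hi)
			(*c2)++;
	}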
640void
641bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
642{
643 BN_ULONG c1, c2, c3;
644
645 c1 = 0;
646 c2 = 0;
647 c3 = 0;
648 mul_add_c(a[0], b[0], c1, c2, c3);
649 r[0] = c1;
650 c1 = 0;
651 mul_add_c(a[0], b[1], c2, c3, c1);
652 mul_add_c(a[1], b[0], c2, c3, c1);
653 r[1] = c2;
654 c2 = 0;
655 mul_add_c(a[2], b[0], c3, c1, c2);
656 mul_add_c(a[1], b[1], c3, c1, c2);
657 mul_add_c(a[0], b[2], c3, c1, c2);
658 r[2] = c3;
659 c3 = 0;
660 mul_add_c(a[0], b[3], c1, c2, c3);
661 mul_add_c(a[1], b[2], c1, c2, c3);
662 mul_add_c(a[2], b[1], c1, c2, c3);
663 mul_add_c(a[3], b[0], c1, c2, c3);
664 r[3] = c1;
665 c1 = 0;
666 mul_add_c(a[4], b[0], c2, c3, c1);
667 mul_add_c(a[3], b[1], c2, c3, c1);
668 mul_add_c(a[2], b[2], c2, c3, c1);
669 mul_add_c(a[1], b[3], c2, c3, c1);
670 mul_add_c(a[0], b[4], c2, c3, c1);
671 r[4] = c2;
672 c2 = 0;
673 mul_add_c(a[0], b[5], c3, c1, c2);
674 mul_add_c(a[1], b[4], c3, c1, c2);
675 mul_add_c(a[2], b[3], c3, c1, c2);
676 mul_add_c(a[3], b[2], c3, c1, c2);
677 mul_add_c(a[4], b[1], c3, c1, c2);
678 mul_add_c(a[5], b[0], c3, c1, c2);
679 r[5] = c3;
680 c3 = 0;
681 mul_add_c(a[6], b[0], c1, c2, c3);
682 mul_add_c(a[5], b[1], c1, c2, c3);
683 mul_add_c(a[4], b[2], c1, c2, c3);
684 mul_add_c(a[3], b[3], c1, c2, c3);
685 mul_add_c(a[2], b[4], c1, c2, c3);
686 mul_add_c(a[1], b[5], c1, c2, c3);
687 mul_add_c(a[0], b[6], c1, c2, c3);
688 r[6] = c1;
689 c1 = 0;
690 mul_add_c(a[0], b[7], c2, c3, c1);
691 mul_add_c(a[1], b[6], c2, c3, c1);
692 mul_add_c(a[2], b[5], c2, c3, c1);
693 mul_add_c(a[3], b[4], c2, c3, c1);
694 mul_add_c(a[4], b[3], c2, c3, c1);
695 mul_add_c(a[5], b[2], c2, c3, c1);
696 mul_add_c(a[6], b[1], c2, c3, c1);
697 mul_add_c(a[7], b[0], c2, c3, c1);
698 r[7] = c2;
699 c2 = 0;
700 mul_add_c(a[7], b[1], c3, c1, c2);
701 mul_add_c(a[6], b[2], c3, c1, c2);
702 mul_add_c(a[5], b[3], c3, c1, c2);
703 mul_add_c(a[4], b[4], c3, c1, c2);
704 mul_add_c(a[3], b[5], c3, c1, c2);
705 mul_add_c(a[2], b[6], c3, c1, c2);
706 mul_add_c(a[1], b[7], c3, c1, c2);
707 r[8] = c3;
708 c3 = 0;
709 mul_add_c(a[2], b[7], c1, c2, c3);
710 mul_add_c(a[3], b[6], c1, c2, c3);
711 mul_add_c(a[4], b[5], c1, c2, c3);
712 mul_add_c(a[5], b[4], c1, c2, c3);
713 mul_add_c(a[6], b[3], c1, c2, c3);
714 mul_add_c(a[7], b[2], c1, c2, c3);
715 r[9] = c1;
716 c1 = 0;
717 mul_add_c(a[7], b[3], c2, c3, c1);
718 mul_add_c(a[6], b[4], c2, c3, c1);
719 mul_add_c(a[5], b[5], c2, c3, c1);
720 mul_add_c(a[4], b[6], c2, c3, c1);
721 mul_add_c(a[3], b[7], c2, c3, c1);
722 r[10] = c2;
723 c2 = 0;
724 mul_add_c(a[4], b[7], c3, c1, c2);
725 mul_add_c(a[5], b[6], c3, c1, c2);
726 mul_add_c(a[6], b[5], c3, c1, c2);
727 mul_add_c(a[7], b[4], c3, c1, c2);
728 r[11] = c3;
729 c3 = 0;
730 mul_add_c(a[7], b[5], c1, c2, c3);
731 mul_add_c(a[6], b[6], c1, c2, c3);
732 mul_add_c(a[5], b[7], c1, c2, c3);
733 r[12] = c1;
734 c1 = 0;
735 mul_add_c(a[6], b[7], c2, c3, c1);
736 mul_add_c(a[7], b[6], c2, c3, c1);
737 r[13] = c2;
738 c2 = 0;
739 mul_add_c(a[7], b[7], c3, c1, c2);
740 r[14] = c3;
741 r[15] = c1;
742}
743
744void
745bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
746{
747 BN_ULONG c1, c2, c3;
748
749 c1 = 0;
750 c2 = 0;
751 c3 = 0;
752 mul_add_c(a[0], b[0], c1, c2, c3);
753 r[0] = c1;
754 c1 = 0;
755 mul_add_c(a[0], b[1], c2, c3, c1);
756 mul_add_c(a[1], b[0], c2, c3, c1);
757 r[1] = c2;
758 c2 = 0;
759 mul_add_c(a[2], b[0], c3, c1, c2);
760 mul_add_c(a[1], b[1], c3, c1, c2);
761 mul_add_c(a[0], b[2], c3, c1, c2);
762 r[2] = c3;
763 c3 = 0;
764 mul_add_c(a[0], b[3], c1, c2, c3);
765 mul_add_c(a[1], b[2], c1, c2, c3);
766 mul_add_c(a[2], b[1], c1, c2, c3);
767 mul_add_c(a[3], b[0], c1, c2, c3);
768 r[3] = c1;
769 c1 = 0;
770 mul_add_c(a[3], b[1], c2, c3, c1);
771 mul_add_c(a[2], b[2], c2, c3, c1);
772 mul_add_c(a[1], b[3], c2, c3, c1);
773 r[4] = c2;
774 c2 = 0;
775 mul_add_c(a[2], b[3], c3, c1, c2);
776 mul_add_c(a[3], b[2], c3, c1, c2);
777 r[5] = c3;
778 c3 = 0;
779 mul_add_c(a[3], b[3], c1, c2, c3);
780 r[6] = c1;
781 r[7] = c2;
782}
783
784void
785bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
786{
787 BN_ULONG c1, c2, c3;
788
789 c1 = 0;
790 c2 = 0;
791 c3 = 0;
792 sqr_add_c(a, 0, c1, c2, c3);
793 r[0] = c1;
794 c1 = 0;
795 sqr_add_c2(a, 1, 0, c2, c3, c1);
796 r[1] = c2;
797 c2 = 0;
798 sqr_add_c(a, 1, c3, c1, c2);
799 sqr_add_c2(a, 2, 0, c3, c1, c2);
800 r[2] = c3;
801 c3 = 0;
802 sqr_add_c2(a, 3, 0, c1, c2, c3);
803 sqr_add_c2(a, 2, 1, c1, c2, c3);
804 r[3] = c1;
805 c1 = 0;
806 sqr_add_c(a, 2, c2, c3, c1);
807 sqr_add_c2(a, 3, 1, c2, c3, c1);
808 sqr_add_c2(a, 4, 0, c2, c3, c1);
809 r[4] = c2;
810 c2 = 0;
811 sqr_add_c2(a, 5, 0, c3, c1, c2);
812 sqr_add_c2(a, 4, 1, c3, c1, c2);
813 sqr_add_c2(a, 3, 2, c3, c1, c2);
814 r[5] = c3;
815 c3 = 0;
816 sqr_add_c(a, 3, c1, c2, c3);
817 sqr_add_c2(a, 4, 2, c1, c2, c3);
818 sqr_add_c2(a, 5, 1, c1, c2, c3);
819 sqr_add_c2(a, 6, 0, c1, c2, c3);
820 r[6] = c1;
821 c1 = 0;
822 sqr_add_c2(a, 7, 0, c2, c3, c1);
823 sqr_add_c2(a, 6, 1, c2, c3, c1);
824 sqr_add_c2(a, 5, 2, c2, c3, c1);
825 sqr_add_c2(a, 4, 3, c2, c3, c1);
826 r[7] = c2;
827 c2 = 0;
828 sqr_add_c(a, 4, c3, c1, c2);
829 sqr_add_c2(a, 5, 3, c3, c1, c2);
830 sqr_add_c2(a, 6, 2, c3, c1, c2);
831 sqr_add_c2(a, 7, 1, c3, c1, c2);
832 r[8] = c3;
833 c3 = 0;
834 sqr_add_c2(a, 7, 2, c1, c2, c3);
835 sqr_add_c2(a, 6, 3, c1, c2, c3);
836 sqr_add_c2(a, 5, 4, c1, c2, c3);
837 r[9] = c1;
838 c1 = 0;
839 sqr_add_c(a, 5, c2, c3, c1);
840 sqr_add_c2(a, 6, 4, c2, c3, c1);
841 sqr_add_c2(a, 7, 3, c2, c3, c1);
842 r[10] = c2;
843 c2 = 0;
844 sqr_add_c2(a, 7, 4, c3, c1, c2);
845 sqr_add_c2(a, 6, 5, c3, c1, c2);
846 r[11] = c3;
847 c3 = 0;
848 sqr_add_c(a, 6, c1, c2, c3);
849 sqr_add_c2(a, 7, 5, c1, c2, c3);
850 r[12] = c1;
851 c1 = 0;
852 sqr_add_c2(a, 7, 6, c2, c3, c1);
853 r[13] = c2;
854 c2 = 0;
855 sqr_add_c(a, 7, c3, c1, c2);
856 r[14] = c3;
857 r[15] = c1;
858}
859
860void
861bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
862{
863 BN_ULONG c1, c2, c3;
864
865 c1 = 0;
866 c2 = 0;
867 c3 = 0;
868 sqr_add_c(a, 0, c1, c2, c3);
869 r[0] = c1;
870 c1 = 0;
871 sqr_add_c2(a, 1, 0, c2, c3, c1);
872 r[1] = c2;
873 c2 = 0;
874 sqr_add_c(a, 1, c3, c1, c2);
875 sqr_add_c2(a, 2, 0, c3, c1, c2);
876 r[2] = c3;
877 c3 = 0;
878 sqr_add_c2(a, 3, 0, c1, c2, c3);
879 sqr_add_c2(a, 2, 1, c1, c2, c3);
880 r[3] = c1;
881 c1 = 0;
882 sqr_add_c(a, 2, c2, c3, c1);
883 sqr_add_c2(a, 3, 1, c2, c3, c1);
884 r[4] = c2;
885 c2 = 0;
886 sqr_add_c2(a, 3, 2, c3, c1, c2);
887 r[5] = c3;
888 c3 = 0;
889 sqr_add_c(a, 3, c1, c2, c3);
890 r[6] = c1;
891 r[7] = c2;
892}
893
894#ifdef OPENSSL_NO_ASM
895#ifdef OPENSSL_BN_ASM_MONT
896/*
897 * This is essentially reference implementation, which may or may not
898 * result in performance improvement. E.g. on IA-32 this routine was
899 * observed to give 40% faster rsa1024 private key operations and 10%
900 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
901 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
902 * reference implementation, one to be used as starting point for
903 * platform-specific assembler. Mentioned numbers apply to compiler
904 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
905 * can vary not only from platform to platform, but even for compiler
906 * versions. Assembler vs. assembler improvement coefficients can
907 * [and are known to] differ and are to be documented elsewhere.
908 */
909int
910bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0p, int num)
911{
912 BN_ULONG c0, c1, ml, *tp, n0;
913#ifdef mul64
914 BN_ULONG mh;
915#endif
916 int i = 0, j;
917
918#if 0 /* template for platform-specific implementation */
919 if (ap == bp)
920 return bn_sqr_mont(rp, ap, np, n0p, num);
921#endif
922 tp = reallocarray(NULL, num + 2, sizeof(BN_ULONG));
923 if (tp == NULL)
924 return 0;
925
926 n0 = *n0p;
927
928 c0 = 0;
929 ml = bp[0];
930#ifdef mul64
931 mh = HBITS(ml);
932 ml = LBITS(ml);
933 for (j = 0; j < num; ++j)
934 mul(tp[j], ap[j], ml, mh, c0);
935#else
936 for (j = 0; j < num; ++j)
937 mul(tp[j], ap[j], ml, c0);
938#endif
939
940 tp[num] = c0;
941 tp[num + 1] = 0;
942 goto enter;
943
944 for (i = 0; i < num; i++) {
945 c0 = 0;
946 ml = bp[i];
947#ifdef mul64
948 mh = HBITS(ml);
949 ml = LBITS(ml);
950 for (j = 0; j < num; ++j)
951 mul_add(tp[j], ap[j], ml, mh, c0);
952#else
953 for (j = 0; j < num; ++j)
954 mul_add(tp[j], ap[j], ml, c0);
955#endif
956 c1 = (tp[num] + c0) & BN_MASK2;
957 tp[num] = c1;
958 tp[num + 1] = (c1 < c0 ? 1 : 0);
959enter:
960 c1 = tp[0];
961 ml = (c1 * n0) & BN_MASK2;
962 c0 = 0;
963#ifdef mul64
964 mh = HBITS(ml);
965 ml = LBITS(ml);
966 mul_add(c1, np[0], ml, mh, c0);
967#else
968 mul_add(c1, ml, np[0], c0);
969#endif
970 for (j = 1; j < num; j++) {
971 c1 = tp[j];
972#ifdef mul64
973 mul_add(c1, np[j], ml, mh, c0);
974#else
975 mul_add(c1, ml, np[j], c0);
976#endif
977 tp[j - 1] = c1 & BN_MASK2;
978 }
979 c1 = (tp[num] + c0) & BN_MASK2;
980 tp[num - 1] = c1;
981 tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
982 }
983
984 if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
985 c0 = bn_sub_words(rp, tp, np, num);
986 if (tp[num] != 0 || c0 == 0) {
987 goto out;
988 }
989 }
990 memcpy(rp, tp, num * sizeof(BN_ULONG));
991out:
992 explicit_bzero(tp, (num + 2) * sizeof(BN_ULONG));
993 free(tp);
994 return 1;
995}
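For reference, what the routine above computes: with
R = 2^(num * BN_BITS2) and n0 == -np^(-1) mod 2^BN_BITS2, the result is
rp = ap * bp * R^(-1) mod np, performing one word-serial Montgomery
reduction step per word of bp.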
996#else
997/*
998 * Return value of 0 indicates that multiplication/convolution was not
999 * performed to signal the caller to fall down to alternative/original
1000 * code-path.
1001 */
1002int
1003bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num)
1004{ return 0; }
1005#endif /* OPENSSL_BN_ASM_MONT */
1006#endif
1007
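A hypothetical caller-side pattern implied by the comment above:

	if (!bn_mul_mont(rp, ap, bp, np, n0, num)) {
		/* not performed: fall back to the generic
		 * multiply-then-reduce code path */
	}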
1008#else /* !BN_MUL_COMBA */
1009
1010/* hmm... is it faster just to do a multiply? */
1011#undef bn_sqr_comba4
1012void
1013bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
1014{
1015 BN_ULONG t[8];
1016 bn_sqr_normal(r, a, 4, t);
1017}
1018
1019#undef bn_sqr_comba8
1020void
1021bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
1022{
1023 BN_ULONG t[16];
1024 bn_sqr_normal(r, a, 8, t);
1025}
1026
1027void
1028bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1029{
1030 r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
1031 r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
1032 r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
1033 r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
1034}
1035
1036void
1037bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1038{
1039 r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
1040 r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
1041 r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
1042 r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
1043 r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
1044 r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
1045 r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
1046 r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
1047}
1048
1049#ifdef OPENSSL_NO_ASM
1050#ifdef OPENSSL_BN_ASM_MONT
1051int
1052bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1053 const BN_ULONG *np, const BN_ULONG *n0p, int num)
1054{
1055 BN_ULONG c0, c1, *tp, n0 = *n0p;
1056 int i = 0, j;
1057
1058	tp = calloc(num + 2, sizeof(BN_ULONG));
1059 if (tp == NULL)
1060 return 0;
1061
1062 for (i = 0; i < num; i++) {
1063 c0 = bn_mul_add_words(tp, ap, num, bp[i]);
1064 c1 = (tp[num] + c0) & BN_MASK2;
1065 tp[num] = c1;
1066 tp[num + 1] = (c1 < c0 ? 1 : 0);
1067
1068 c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
1069 c1 = (tp[num] + c0) & BN_MASK2;
1070 tp[num] = c1;
1071 tp[num + 1] += (c1 < c0 ? 1 : 0);
1072 for (j = 0; j <= num; j++)
1073 tp[j] = tp[j + 1];
1074 }
1075
1076 if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
1077 c0 = bn_sub_words(rp, tp, np, num);
1078 if (tp[num] != 0 || c0 == 0) {
1079 goto out;
1080 }
1081 }
1082 memcpy(rp, tp, num * sizeof(BN_ULONG));
1083out:
1084 explicit_bzero(tp, (num + 2) * sizeof(BN_ULONG));
1085 free(tp);
1086 return 1;
1087}
1088#else
1089int
1090bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1091 const BN_ULONG *np, const BN_ULONG *n0, int num)
1092{
1093 return 0;
1094}
1095#endif /* OPENSSL_BN_ASM_MONT */
1096#endif
1097
1098#endif /* !BN_MUL_COMBA */
diff --git a/src/lib/libcrypto/bn/bn_blind.c b/src/lib/libcrypto/bn/bn_blind.c
deleted file mode 100644
index c842f76c6f..0000000000
--- a/src/lib/libcrypto/bn/bn_blind.c
+++ /dev/null
@@ -1,388 +0,0 @@
1/* $OpenBSD: bn_blind.c,v 1.14 2014/07/12 16:03:36 miod Exp $ */
2/* ====================================================================
3 * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 * This product includes cryptographic software written by Eric Young
51 * (eay@cryptsoft.com). This product includes software written by Tim
52 * Hudson (tjh@cryptsoft.com).
53 *
54 */
55/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
56 * All rights reserved.
57 *
58 * This package is an SSL implementation written
59 * by Eric Young (eay@cryptsoft.com).
60 * The implementation was written so as to conform with Netscapes SSL.
61 *
62 * This library is free for commercial and non-commercial use as long as
63 * the following conditions are aheared to. The following conditions
64 * apply to all code found in this distribution, be it the RC4, RSA,
65 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
66 * included with this distribution is covered by the same copyright terms
67 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
68 *
69 * Copyright remains Eric Young's, and as such any Copyright notices in
70 * the code are not to be removed.
71 * If this package is used in a product, Eric Young should be given attribution
72 * as the author of the parts of the library used.
73 * This can be in the form of a textual message at program startup or
74 * in documentation (online or textual) provided with the package.
75 *
76 * Redistribution and use in source and binary forms, with or without
77 * modification, are permitted provided that the following conditions
78 * are met:
79 * 1. Redistributions of source code must retain the copyright
80 * notice, this list of conditions and the following disclaimer.
81 * 2. Redistributions in binary form must reproduce the above copyright
82 * notice, this list of conditions and the following disclaimer in the
83 * documentation and/or other materials provided with the distribution.
84 * 3. All advertising materials mentioning features or use of this software
85 * must display the following acknowledgement:
86 * "This product includes cryptographic software written by
87 * Eric Young (eay@cryptsoft.com)"
88 * The word 'cryptographic' can be left out if the rouines from the library
89 * being used are not cryptographic related :-).
90 * 4. If you include any Windows specific code (or a derivative thereof) from
91 * the apps directory (application code) you must include an acknowledgement:
92 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
93 *
94 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
95 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
96 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
97 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
98 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
99 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
100 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
101 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
102 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
103 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
104 * SUCH DAMAGE.
105 *
106 * The licence and distribution terms for any publically available version or
107 * derivative of this code cannot be changed. i.e. this code cannot simply be
108 * copied and put under another distribution licence
109 * [including the GNU Public Licence.]
110 */
111
112#include <stdio.h>
113
114#include <openssl/opensslconf.h>
115
116#include <openssl/err.h>
117
118#include "bn_lcl.h"
119
120#define BN_BLINDING_COUNTER 32
121
122struct bn_blinding_st {
123 BIGNUM *A;
124 BIGNUM *Ai;
125 BIGNUM *e;
126 BIGNUM *mod; /* just a reference */
127#ifndef OPENSSL_NO_DEPRECATED
128 unsigned long thread_id; /* added in OpenSSL 0.9.6j and 0.9.7b;
129 * used only by crypto/rsa/rsa_eay.c, rsa_lib.c */
130#endif
131 CRYPTO_THREADID tid;
132 int counter;
133 unsigned long flags;
134 BN_MONT_CTX *m_ctx;
135 int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
136 const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
137};
138
139BN_BLINDING *
140BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod)
141{
142 BN_BLINDING *ret = NULL;
143
144 bn_check_top(mod);
145
146 if ((ret = calloc(1, sizeof(BN_BLINDING))) == NULL) {
147 BNerr(BN_F_BN_BLINDING_NEW, ERR_R_MALLOC_FAILURE);
148 return (NULL);
149 }
150 if (A != NULL) {
151 if ((ret->A = BN_dup(A)) == NULL)
152 goto err;
153 }
154 if (Ai != NULL) {
155 if ((ret->Ai = BN_dup(Ai)) == NULL)
156 goto err;
157 }
158
159 /* save a copy of mod in the BN_BLINDING structure */
160 if ((ret->mod = BN_dup(mod)) == NULL)
161 goto err;
162 if (BN_get_flags(mod, BN_FLG_CONSTTIME) != 0)
163 BN_set_flags(ret->mod, BN_FLG_CONSTTIME);
164
165 /* Set the counter to the special value -1
166 * to indicate that this is never-used fresh blinding
167 * that does not need updating before first use. */
168 ret->counter = -1;
169 CRYPTO_THREADID_current(&ret->tid);
170 return (ret);
171
172err:
173 if (ret != NULL)
174 BN_BLINDING_free(ret);
175 return (NULL);
176}
177
178void
179BN_BLINDING_free(BN_BLINDING *r)
180{
181 if (r == NULL)
182 return;
183
184 BN_clear_free(r->A);
185 BN_clear_free(r->Ai);
186 BN_clear_free(r->e);
187 BN_clear_free(r->mod);
188 free(r);
189}
190
191int
192BN_BLINDING_update(BN_BLINDING *b, BN_CTX *ctx)
193{
194 int ret = 0;
195
196 if ((b->A == NULL) || (b->Ai == NULL)) {
197 BNerr(BN_F_BN_BLINDING_UPDATE, BN_R_NOT_INITIALIZED);
198 goto err;
199 }
200
201 if (b->counter == -1)
202 b->counter = 0;
203
204 if (++b->counter == BN_BLINDING_COUNTER && b->e != NULL &&
205 !(b->flags & BN_BLINDING_NO_RECREATE)) {
206 /* re-create blinding parameters */
207 if (!BN_BLINDING_create_param(b, NULL, NULL, ctx, NULL, NULL))
208 goto err;
209 } else if (!(b->flags & BN_BLINDING_NO_UPDATE)) {
210 if (!BN_mod_mul(b->A, b->A, b->A, b->mod, ctx))
211 goto err;
212 if (!BN_mod_mul(b->Ai, b->Ai, b->Ai, b->mod, ctx))
213 goto err;
214 }
215
216 ret = 1;
217
218err:
219 if (b->counter == BN_BLINDING_COUNTER)
220 b->counter = 0;
221 return (ret);
222}
223
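The update policy above, restated with its algebra: every
BN_BLINDING_COUNTER (32) uses, the pair is re-created when possible;
otherwise both factors are squared mod m, which preserves the invariant
A * Ai == 1 (mod m), since (A^2) * (Ai^2) == (A * Ai)^2 == 1 (mod m).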
224int
225BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx)
226{
227 return BN_BLINDING_convert_ex(n, NULL, b, ctx);
228}
229
230int
231BN_BLINDING_convert_ex(BIGNUM *n, BIGNUM *r, BN_BLINDING *b, BN_CTX *ctx)
232{
233 int ret = 1;
234
235 bn_check_top(n);
236
237 if ((b->A == NULL) || (b->Ai == NULL)) {
238 BNerr(BN_F_BN_BLINDING_CONVERT_EX, BN_R_NOT_INITIALIZED);
239 return (0);
240 }
241
242 if (b->counter == -1)
243 /* Fresh blinding, doesn't need updating. */
244 b->counter = 0;
245 else if (!BN_BLINDING_update(b, ctx))
246 return (0);
247
248 if (r != NULL) {
249 if (!BN_copy(r, b->Ai))
250 ret = 0;
251 }
252
253	if (!BN_mod_mul(n, n, b->A, b->mod, ctx))
254 ret = 0;
255
256 return ret;
257}
258
259int
260BN_BLINDING_invert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx)
261{
262 return BN_BLINDING_invert_ex(n, NULL, b, ctx);
263}
264
265int
266BN_BLINDING_invert_ex(BIGNUM *n, const BIGNUM *r, BN_BLINDING *b, BN_CTX *ctx)
267{
268 int ret;
269
270 bn_check_top(n);
271
272 if (r != NULL)
273 ret = BN_mod_mul(n, n, r, b->mod, ctx);
274 else {
275 if (b->Ai == NULL) {
276 BNerr(BN_F_BN_BLINDING_INVERT_EX, BN_R_NOT_INITIALIZED);
277 return (0);
278 }
279 ret = BN_mod_mul(n, n, b->Ai, b->mod, ctx);
280 }
281
282 bn_check_top(n);
283 return (ret);
284}
285
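The algebra behind convert/invert, for reference: with A random and
Ai == A^(-1) (mod m), BN_BLINDING_convert computes n' = n * A mod m and
BN_BLINDING_invert recovers n' * Ai == n (mod m), so the value
round-trips while the intervening computation only ever sees the
blinded n * A.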
286#ifndef OPENSSL_NO_DEPRECATED
287unsigned long
288BN_BLINDING_get_thread_id(const BN_BLINDING *b)
289{
290 return b->thread_id;
291}
292
293void
294BN_BLINDING_set_thread_id(BN_BLINDING *b, unsigned long n)
295{
296 b->thread_id = n;
297}
298#endif
299
300CRYPTO_THREADID *
301BN_BLINDING_thread_id(BN_BLINDING *b)
302{
303 return &b->tid;
304}
305
306unsigned long
307BN_BLINDING_get_flags(const BN_BLINDING *b)
308{
309 return b->flags;
310}
311
312void
313BN_BLINDING_set_flags(BN_BLINDING *b, unsigned long flags)
314{
315 b->flags = flags;
316}
317
318BN_BLINDING *
319BN_BLINDING_create_param(BN_BLINDING *b, const BIGNUM *e, BIGNUM *m,
320 BN_CTX *ctx, int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
321 const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx), BN_MONT_CTX *m_ctx)
322{
323 int retry_counter = 32;
324 BN_BLINDING *ret = NULL;
325
326 if (b == NULL)
327 ret = BN_BLINDING_new(NULL, NULL, m);
328 else
329 ret = b;
330
331 if (ret == NULL)
332 goto err;
333
334 if (ret->A == NULL && (ret->A = BN_new()) == NULL)
335 goto err;
336 if (ret->Ai == NULL && (ret->Ai = BN_new()) == NULL)
337 goto err;
338
339 if (e != NULL) {
340 BN_free(ret->e);
341 ret->e = BN_dup(e);
342 }
343 if (ret->e == NULL)
344 goto err;
345
346 if (bn_mod_exp != NULL)
347 ret->bn_mod_exp = bn_mod_exp;
348 if (m_ctx != NULL)
349 ret->m_ctx = m_ctx;
350
351 do {
352 if (!BN_rand_range(ret->A, ret->mod))
353 goto err;
354 if (BN_mod_inverse(ret->Ai, ret->A, ret->mod, ctx) == NULL) {
355 /* this should almost never happen for good RSA keys */
356 unsigned long error = ERR_peek_last_error();
357 if (ERR_GET_REASON(error) == BN_R_NO_INVERSE) {
358 if (retry_counter-- == 0) {
359 BNerr(BN_F_BN_BLINDING_CREATE_PARAM,
360 BN_R_TOO_MANY_ITERATIONS);
361 goto err;
362 }
363 ERR_clear_error();
364 } else
365 goto err;
366 } else
367 break;
368 } while (1);
369
370 if (ret->bn_mod_exp != NULL && ret->m_ctx != NULL) {
371 if (!ret->bn_mod_exp(ret->A, ret->A, ret->e, ret->mod,
372 ctx, ret->m_ctx))
373 goto err;
374 } else {
375 if (!BN_mod_exp(ret->A, ret->A, ret->e, ret->mod, ctx))
376 goto err;
377 }
378
379 return ret;
380
381err:
382 if (b == NULL && ret != NULL) {
383 BN_BLINDING_free(ret);
384 ret = NULL;
385 }
386
387 return ret;
388}
diff --git a/src/lib/libcrypto/bn/bn_const.c b/src/lib/libcrypto/bn/bn_const.c
deleted file mode 100644
index 4be9f4f791..0000000000
--- a/src/lib/libcrypto/bn/bn_const.c
+++ /dev/null
@@ -1,409 +0,0 @@
1/* $OpenBSD: bn_const.c,v 1.4 2014/06/12 15:49:28 deraadt Exp $ */
2/* Insert boilerplate */
3
4#include <openssl/bn.h>
5
6/* "First Oakley Default Group" from RFC2409, section 6.1.
7 *
8 * The prime is: 2^768 - 2^704 - 1 + 2^64 * { [2^638 pi] + 149686 }
9 *
10 * RFC2409 specifies a generator of 2.
11 * RFC2412 specifies a generator of 22.
12 */
13
14BIGNUM *
15get_rfc2409_prime_768(BIGNUM *bn)
16{
17 static const unsigned char RFC2409_PRIME_768[] = {
18 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC9, 0x0F, 0xDA, 0xA2,
19 0x21, 0x68, 0xC2, 0x34, 0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
20 0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74, 0x02, 0x0B, 0xBE, 0xA6,
21 0x3B, 0x13, 0x9B, 0x22, 0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
22 0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B, 0x30, 0x2B, 0x0A, 0x6D,
23 0xF2, 0x5F, 0x14, 0x37, 0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
24 0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6, 0xF4, 0x4C, 0x42, 0xE9,
25 0xA6, 0x3A, 0x36, 0x20, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
26 };
27 return BN_bin2bn(RFC2409_PRIME_768, sizeof(RFC2409_PRIME_768), bn);
28}
29
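A minimal usage sketch, with error handling trimmed; as with BN_bin2bn,
passing NULL makes the function allocate a fresh BIGNUM:

	BIGNUM *p = get_rfc2409_prime_768(NULL);

	if (p != NULL) {
		/* BN_num_bits(p) == 768 for this constant */
		BN_free(p);
	}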
30/* "Second Oakley Default Group" from RFC2409, section 6.2.
31 *
32 * The prime is: 2^1024 - 2^960 - 1 + 2^64 * { [2^894 pi] + 129093 }.
33 *
34 * RFC2409 specifies a generator of 2.
35 * RFC2412 specifies a generator of 22.
36 */
37
38BIGNUM *
39get_rfc2409_prime_1024(BIGNUM *bn)
40{
41 static const unsigned char RFC2409_PRIME_1024[] = {
42 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC9, 0x0F, 0xDA, 0xA2,
43 0x21, 0x68, 0xC2, 0x34, 0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
44 0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74, 0x02, 0x0B, 0xBE, 0xA6,
45 0x3B, 0x13, 0x9B, 0x22, 0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
46 0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B, 0x30, 0x2B, 0x0A, 0x6D,
47 0xF2, 0x5F, 0x14, 0x37, 0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
48 0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6, 0xF4, 0x4C, 0x42, 0xE9,
49 0xA6, 0x37, 0xED, 0x6B, 0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
50 0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5, 0xAE, 0x9F, 0x24, 0x11,
51 0x7C, 0x4B, 0x1F, 0xE6, 0x49, 0x28, 0x66, 0x51, 0xEC, 0xE6, 0x53, 0x81,
52 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
53 };
54 return BN_bin2bn(RFC2409_PRIME_1024, sizeof(RFC2409_PRIME_1024), bn);
55}
56
57/* "1536-bit MODP Group" from RFC3526, Section 2.
58 *
59 * The prime is: 2^1536 - 2^1472 - 1 + 2^64 * { [2^1406 pi] + 741804 }
60 *
61 * RFC3526 specifies a generator of 2.
62 * RFC2412 specifies a generator of 22.
63 */
64
65BIGNUM *
66get_rfc3526_prime_1536(BIGNUM *bn)
67{
68 static const unsigned char RFC3526_PRIME_1536[] = {
69 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC9, 0x0F, 0xDA, 0xA2,
70 0x21, 0x68, 0xC2, 0x34, 0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
71 0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74, 0x02, 0x0B, 0xBE, 0xA6,
72 0x3B, 0x13, 0x9B, 0x22, 0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
73 0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B, 0x30, 0x2B, 0x0A, 0x6D,
74 0xF2, 0x5F, 0x14, 0x37, 0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
75 0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6, 0xF4, 0x4C, 0x42, 0xE9,
76 0xA6, 0x37, 0xED, 0x6B, 0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
77 0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5, 0xAE, 0x9F, 0x24, 0x11,
78 0x7C, 0x4B, 0x1F, 0xE6, 0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
79 0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05, 0x98, 0xDA, 0x48, 0x36,
80 0x1C, 0x55, 0xD3, 0x9A, 0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
81 0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96, 0x1C, 0x62, 0xF3, 0x56,
82 0x20, 0x85, 0x52, 0xBB, 0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
83 0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04, 0xF1, 0x74, 0x6C, 0x08,
84 0xCA, 0x23, 0x73, 0x27, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
85 };
86 return BN_bin2bn(RFC3526_PRIME_1536, sizeof(RFC3526_PRIME_1536), bn);
87}
88
89/* "2048-bit MODP Group" from RFC3526, Section 3.
90 *
91 * The prime is: 2^2048 - 2^1984 - 1 + 2^64 * { [2^1918 pi] + 124476 }
92 *
93 * RFC3526 specifies a generator of 2.
94 */
95
96BIGNUM *
97get_rfc3526_prime_2048(BIGNUM *bn)
98{
99 static const unsigned char RFC3526_PRIME_2048[] = {
100 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC9, 0x0F, 0xDA, 0xA2,
101 0x21, 0x68, 0xC2, 0x34, 0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
102 0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74, 0x02, 0x0B, 0xBE, 0xA6,
103 0x3B, 0x13, 0x9B, 0x22, 0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
104 0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B, 0x30, 0x2B, 0x0A, 0x6D,
105 0xF2, 0x5F, 0x14, 0x37, 0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
106 0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6, 0xF4, 0x4C, 0x42, 0xE9,
107 0xA6, 0x37, 0xED, 0x6B, 0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
108 0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5, 0xAE, 0x9F, 0x24, 0x11,
109 0x7C, 0x4B, 0x1F, 0xE6, 0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
110 0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05, 0x98, 0xDA, 0x48, 0x36,
111 0x1C, 0x55, 0xD3, 0x9A, 0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
112 0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96, 0x1C, 0x62, 0xF3, 0x56,
113 0x20, 0x85, 0x52, 0xBB, 0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
114 0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04, 0xF1, 0x74, 0x6C, 0x08,
115 0xCA, 0x18, 0x21, 0x7C, 0x32, 0x90, 0x5E, 0x46, 0x2E, 0x36, 0xCE, 0x3B,
116 0xE3, 0x9E, 0x77, 0x2C, 0x18, 0x0E, 0x86, 0x03, 0x9B, 0x27, 0x83, 0xA2,
117 0xEC, 0x07, 0xA2, 0x8F, 0xB5, 0xC5, 0x5D, 0xF0, 0x6F, 0x4C, 0x52, 0xC9,
118 0xDE, 0x2B, 0xCB, 0xF6, 0x95, 0x58, 0x17, 0x18, 0x39, 0x95, 0x49, 0x7C,
119 0xEA, 0x95, 0x6A, 0xE5, 0x15, 0xD2, 0x26, 0x18, 0x98, 0xFA, 0x05, 0x10,
120 0x15, 0x72, 0x8E, 0x5A, 0x8A, 0xAC, 0xAA, 0x68, 0xFF, 0xFF, 0xFF, 0xFF,
121 0xFF, 0xFF, 0xFF, 0xFF,
122 };
123 return BN_bin2bn(RFC3526_PRIME_2048, sizeof(RFC3526_PRIME_2048), bn);
124}
125
126/* "3072-bit MODP Group" from RFC3526, Section 4.
127 *
128 * The prime is: 2^3072 - 2^3008 - 1 + 2^64 * { [2^2942 pi] + 1690314 }
129 *
130 * RFC3526 specifies a generator of 2.
131 */
132
133BIGNUM *
134get_rfc3526_prime_3072(BIGNUM *bn)
135{
136 static const unsigned char RFC3526_PRIME_3072[] = {
137 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC9, 0x0F, 0xDA, 0xA2,
138 0x21, 0x68, 0xC2, 0x34, 0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
139 0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74, 0x02, 0x0B, 0xBE, 0xA6,
140 0x3B, 0x13, 0x9B, 0x22, 0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
141 0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B, 0x30, 0x2B, 0x0A, 0x6D,
142 0xF2, 0x5F, 0x14, 0x37, 0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
143 0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6, 0xF4, 0x4C, 0x42, 0xE9,
144 0xA6, 0x37, 0xED, 0x6B, 0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
145 0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5, 0xAE, 0x9F, 0x24, 0x11,
146 0x7C, 0x4B, 0x1F, 0xE6, 0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
147 0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05, 0x98, 0xDA, 0x48, 0x36,
148 0x1C, 0x55, 0xD3, 0x9A, 0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
149 0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96, 0x1C, 0x62, 0xF3, 0x56,
150 0x20, 0x85, 0x52, 0xBB, 0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
151 0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04, 0xF1, 0x74, 0x6C, 0x08,
152 0xCA, 0x18, 0x21, 0x7C, 0x32, 0x90, 0x5E, 0x46, 0x2E, 0x36, 0xCE, 0x3B,
153 0xE3, 0x9E, 0x77, 0x2C, 0x18, 0x0E, 0x86, 0x03, 0x9B, 0x27, 0x83, 0xA2,
154 0xEC, 0x07, 0xA2, 0x8F, 0xB5, 0xC5, 0x5D, 0xF0, 0x6F, 0x4C, 0x52, 0xC9,
155 0xDE, 0x2B, 0xCB, 0xF6, 0x95, 0x58, 0x17, 0x18, 0x39, 0x95, 0x49, 0x7C,
156 0xEA, 0x95, 0x6A, 0xE5, 0x15, 0xD2, 0x26, 0x18, 0x98, 0xFA, 0x05, 0x10,
157 0x15, 0x72, 0x8E, 0x5A, 0x8A, 0xAA, 0xC4, 0x2D, 0xAD, 0x33, 0x17, 0x0D,
158 0x04, 0x50, 0x7A, 0x33, 0xA8, 0x55, 0x21, 0xAB, 0xDF, 0x1C, 0xBA, 0x64,
159 0xEC, 0xFB, 0x85, 0x04, 0x58, 0xDB, 0xEF, 0x0A, 0x8A, 0xEA, 0x71, 0x57,
160 0x5D, 0x06, 0x0C, 0x7D, 0xB3, 0x97, 0x0F, 0x85, 0xA6, 0xE1, 0xE4, 0xC7,
161 0xAB, 0xF5, 0xAE, 0x8C, 0xDB, 0x09, 0x33, 0xD7, 0x1E, 0x8C, 0x94, 0xE0,
162 0x4A, 0x25, 0x61, 0x9D, 0xCE, 0xE3, 0xD2, 0x26, 0x1A, 0xD2, 0xEE, 0x6B,
163 0xF1, 0x2F, 0xFA, 0x06, 0xD9, 0x8A, 0x08, 0x64, 0xD8, 0x76, 0x02, 0x73,
164 0x3E, 0xC8, 0x6A, 0x64, 0x52, 0x1F, 0x2B, 0x18, 0x17, 0x7B, 0x20, 0x0C,
165 0xBB, 0xE1, 0x17, 0x57, 0x7A, 0x61, 0x5D, 0x6C, 0x77, 0x09, 0x88, 0xC0,
166 0xBA, 0xD9, 0x46, 0xE2, 0x08, 0xE2, 0x4F, 0xA0, 0x74, 0xE5, 0xAB, 0x31,
167 0x43, 0xDB, 0x5B, 0xFC, 0xE0, 0xFD, 0x10, 0x8E, 0x4B, 0x82, 0xD1, 0x20,
168 0xA9, 0x3A, 0xD2, 0xCA, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
169 };
170 return BN_bin2bn(RFC3526_PRIME_3072, sizeof(RFC3526_PRIME_3072), bn);
171}
172
173/* "4096-bit MODP Group" from RFC3526, Section 5.
174 *
175 * The prime is: 2^4096 - 2^4032 - 1 + 2^64 * { [2^3966 pi] + 240904 }
176 *
177 * RFC3526 specifies a generator of 2.
178 */
179
180BIGNUM *
181get_rfc3526_prime_4096(BIGNUM *bn)
182{
183 static const unsigned char RFC3526_PRIME_4096[] = {
184 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC9, 0x0F, 0xDA, 0xA2,
185 0x21, 0x68, 0xC2, 0x34, 0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
186 0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74, 0x02, 0x0B, 0xBE, 0xA6,
187 0x3B, 0x13, 0x9B, 0x22, 0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
188 0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B, 0x30, 0x2B, 0x0A, 0x6D,
189 0xF2, 0x5F, 0x14, 0x37, 0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
190 0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6, 0xF4, 0x4C, 0x42, 0xE9,
191 0xA6, 0x37, 0xED, 0x6B, 0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
192 0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5, 0xAE, 0x9F, 0x24, 0x11,
193 0x7C, 0x4B, 0x1F, 0xE6, 0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
194 0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05, 0x98, 0xDA, 0x48, 0x36,
195 0x1C, 0x55, 0xD3, 0x9A, 0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
196 0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96, 0x1C, 0x62, 0xF3, 0x56,
197 0x20, 0x85, 0x52, 0xBB, 0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
198 0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04, 0xF1, 0x74, 0x6C, 0x08,
199 0xCA, 0x18, 0x21, 0x7C, 0x32, 0x90, 0x5E, 0x46, 0x2E, 0x36, 0xCE, 0x3B,
200 0xE3, 0x9E, 0x77, 0x2C, 0x18, 0x0E, 0x86, 0x03, 0x9B, 0x27, 0x83, 0xA2,
201 0xEC, 0x07, 0xA2, 0x8F, 0xB5, 0xC5, 0x5D, 0xF0, 0x6F, 0x4C, 0x52, 0xC9,
202 0xDE, 0x2B, 0xCB, 0xF6, 0x95, 0x58, 0x17, 0x18, 0x39, 0x95, 0x49, 0x7C,
203 0xEA, 0x95, 0x6A, 0xE5, 0x15, 0xD2, 0x26, 0x18, 0x98, 0xFA, 0x05, 0x10,
204 0x15, 0x72, 0x8E, 0x5A, 0x8A, 0xAA, 0xC4, 0x2D, 0xAD, 0x33, 0x17, 0x0D,
205 0x04, 0x50, 0x7A, 0x33, 0xA8, 0x55, 0x21, 0xAB, 0xDF, 0x1C, 0xBA, 0x64,
206 0xEC, 0xFB, 0x85, 0x04, 0x58, 0xDB, 0xEF, 0x0A, 0x8A, 0xEA, 0x71, 0x57,
207 0x5D, 0x06, 0x0C, 0x7D, 0xB3, 0x97, 0x0F, 0x85, 0xA6, 0xE1, 0xE4, 0xC7,
208 0xAB, 0xF5, 0xAE, 0x8C, 0xDB, 0x09, 0x33, 0xD7, 0x1E, 0x8C, 0x94, 0xE0,
209 0x4A, 0x25, 0x61, 0x9D, 0xCE, 0xE3, 0xD2, 0x26, 0x1A, 0xD2, 0xEE, 0x6B,
210 0xF1, 0x2F, 0xFA, 0x06, 0xD9, 0x8A, 0x08, 0x64, 0xD8, 0x76, 0x02, 0x73,
211 0x3E, 0xC8, 0x6A, 0x64, 0x52, 0x1F, 0x2B, 0x18, 0x17, 0x7B, 0x20, 0x0C,
212 0xBB, 0xE1, 0x17, 0x57, 0x7A, 0x61, 0x5D, 0x6C, 0x77, 0x09, 0x88, 0xC0,
213 0xBA, 0xD9, 0x46, 0xE2, 0x08, 0xE2, 0x4F, 0xA0, 0x74, 0xE5, 0xAB, 0x31,
214 0x43, 0xDB, 0x5B, 0xFC, 0xE0, 0xFD, 0x10, 0x8E, 0x4B, 0x82, 0xD1, 0x20,
215 0xA9, 0x21, 0x08, 0x01, 0x1A, 0x72, 0x3C, 0x12, 0xA7, 0x87, 0xE6, 0xD7,
216 0x88, 0x71, 0x9A, 0x10, 0xBD, 0xBA, 0x5B, 0x26, 0x99, 0xC3, 0x27, 0x18,
217 0x6A, 0xF4, 0xE2, 0x3C, 0x1A, 0x94, 0x68, 0x34, 0xB6, 0x15, 0x0B, 0xDA,
218 0x25, 0x83, 0xE9, 0xCA, 0x2A, 0xD4, 0x4C, 0xE8, 0xDB, 0xBB, 0xC2, 0xDB,
219 0x04, 0xDE, 0x8E, 0xF9, 0x2E, 0x8E, 0xFC, 0x14, 0x1F, 0xBE, 0xCA, 0xA6,
220 0x28, 0x7C, 0x59, 0x47, 0x4E, 0x6B, 0xC0, 0x5D, 0x99, 0xB2, 0x96, 0x4F,
221 0xA0, 0x90, 0xC3, 0xA2, 0x23, 0x3B, 0xA1, 0x86, 0x51, 0x5B, 0xE7, 0xED,
222 0x1F, 0x61, 0x29, 0x70, 0xCE, 0xE2, 0xD7, 0xAF, 0xB8, 0x1B, 0xDD, 0x76,
223 0x21, 0x70, 0x48, 0x1C, 0xD0, 0x06, 0x91, 0x27, 0xD5, 0xB0, 0x5A, 0xA9,
224 0x93, 0xB4, 0xEA, 0x98, 0x8D, 0x8F, 0xDD, 0xC1, 0x86, 0xFF, 0xB7, 0xDC,
225 0x90, 0xA6, 0xC0, 0x8F, 0x4D, 0xF4, 0x35, 0xC9, 0x34, 0x06, 0x31, 0x99,
226 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
227 };
228 return BN_bin2bn(RFC3526_PRIME_4096, sizeof(RFC3526_PRIME_4096), bn);
229}
230
231/* "6144-bit MODP Group" from RFC3526, Section 6.
232 *
233 * The prime is: 2^6144 - 2^6080 - 1 + 2^64 * { [2^6014 pi] + 929484 }
234 *
235 * RFC3526 specifies a generator of 2.
236 */
237
238BIGNUM *
239get_rfc3526_prime_6144(BIGNUM *bn)
240{
241 static const unsigned char RFC3526_PRIME_6144[] = {
242 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC9, 0x0F, 0xDA, 0xA2,
243 0x21, 0x68, 0xC2, 0x34, 0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
244 0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74, 0x02, 0x0B, 0xBE, 0xA6,
245 0x3B, 0x13, 0x9B, 0x22, 0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
246 0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B, 0x30, 0x2B, 0x0A, 0x6D,
247 0xF2, 0x5F, 0x14, 0x37, 0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
248 0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6, 0xF4, 0x4C, 0x42, 0xE9,
249 0xA6, 0x37, 0xED, 0x6B, 0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
250 0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5, 0xAE, 0x9F, 0x24, 0x11,
251 0x7C, 0x4B, 0x1F, 0xE6, 0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
252 0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05, 0x98, 0xDA, 0x48, 0x36,
253 0x1C, 0x55, 0xD3, 0x9A, 0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
254 0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96, 0x1C, 0x62, 0xF3, 0x56,
255 0x20, 0x85, 0x52, 0xBB, 0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
256 0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04, 0xF1, 0x74, 0x6C, 0x08,
257 0xCA, 0x18, 0x21, 0x7C, 0x32, 0x90, 0x5E, 0x46, 0x2E, 0x36, 0xCE, 0x3B,
258 0xE3, 0x9E, 0x77, 0x2C, 0x18, 0x0E, 0x86, 0x03, 0x9B, 0x27, 0x83, 0xA2,
259 0xEC, 0x07, 0xA2, 0x8F, 0xB5, 0xC5, 0x5D, 0xF0, 0x6F, 0x4C, 0x52, 0xC9,
260 0xDE, 0x2B, 0xCB, 0xF6, 0x95, 0x58, 0x17, 0x18, 0x39, 0x95, 0x49, 0x7C,
261 0xEA, 0x95, 0x6A, 0xE5, 0x15, 0xD2, 0x26, 0x18, 0x98, 0xFA, 0x05, 0x10,
262 0x15, 0x72, 0x8E, 0x5A, 0x8A, 0xAA, 0xC4, 0x2D, 0xAD, 0x33, 0x17, 0x0D,
263 0x04, 0x50, 0x7A, 0x33, 0xA8, 0x55, 0x21, 0xAB, 0xDF, 0x1C, 0xBA, 0x64,
264 0xEC, 0xFB, 0x85, 0x04, 0x58, 0xDB, 0xEF, 0x0A, 0x8A, 0xEA, 0x71, 0x57,
265 0x5D, 0x06, 0x0C, 0x7D, 0xB3, 0x97, 0x0F, 0x85, 0xA6, 0xE1, 0xE4, 0xC7,
266 0xAB, 0xF5, 0xAE, 0x8C, 0xDB, 0x09, 0x33, 0xD7, 0x1E, 0x8C, 0x94, 0xE0,
267 0x4A, 0x25, 0x61, 0x9D, 0xCE, 0xE3, 0xD2, 0x26, 0x1A, 0xD2, 0xEE, 0x6B,
268 0xF1, 0x2F, 0xFA, 0x06, 0xD9, 0x8A, 0x08, 0x64, 0xD8, 0x76, 0x02, 0x73,
269 0x3E, 0xC8, 0x6A, 0x64, 0x52, 0x1F, 0x2B, 0x18, 0x17, 0x7B, 0x20, 0x0C,
270 0xBB, 0xE1, 0x17, 0x57, 0x7A, 0x61, 0x5D, 0x6C, 0x77, 0x09, 0x88, 0xC0,
271 0xBA, 0xD9, 0x46, 0xE2, 0x08, 0xE2, 0x4F, 0xA0, 0x74, 0xE5, 0xAB, 0x31,
272 0x43, 0xDB, 0x5B, 0xFC, 0xE0, 0xFD, 0x10, 0x8E, 0x4B, 0x82, 0xD1, 0x20,
273 0xA9, 0x21, 0x08, 0x01, 0x1A, 0x72, 0x3C, 0x12, 0xA7, 0x87, 0xE6, 0xD7,
274 0x88, 0x71, 0x9A, 0x10, 0xBD, 0xBA, 0x5B, 0x26, 0x99, 0xC3, 0x27, 0x18,
275 0x6A, 0xF4, 0xE2, 0x3C, 0x1A, 0x94, 0x68, 0x34, 0xB6, 0x15, 0x0B, 0xDA,
276 0x25, 0x83, 0xE9, 0xCA, 0x2A, 0xD4, 0x4C, 0xE8, 0xDB, 0xBB, 0xC2, 0xDB,
277 0x04, 0xDE, 0x8E, 0xF9, 0x2E, 0x8E, 0xFC, 0x14, 0x1F, 0xBE, 0xCA, 0xA6,
278 0x28, 0x7C, 0x59, 0x47, 0x4E, 0x6B, 0xC0, 0x5D, 0x99, 0xB2, 0x96, 0x4F,
279 0xA0, 0x90, 0xC3, 0xA2, 0x23, 0x3B, 0xA1, 0x86, 0x51, 0x5B, 0xE7, 0xED,
280 0x1F, 0x61, 0x29, 0x70, 0xCE, 0xE2, 0xD7, 0xAF, 0xB8, 0x1B, 0xDD, 0x76,
281 0x21, 0x70, 0x48, 0x1C, 0xD0, 0x06, 0x91, 0x27, 0xD5, 0xB0, 0x5A, 0xA9,
282 0x93, 0xB4, 0xEA, 0x98, 0x8D, 0x8F, 0xDD, 0xC1, 0x86, 0xFF, 0xB7, 0xDC,
283 0x90, 0xA6, 0xC0, 0x8F, 0x4D, 0xF4, 0x35, 0xC9, 0x34, 0x02, 0x84, 0x92,
284 0x36, 0xC3, 0xFA, 0xB4, 0xD2, 0x7C, 0x70, 0x26, 0xC1, 0xD4, 0xDC, 0xB2,
285 0x60, 0x26, 0x46, 0xDE, 0xC9, 0x75, 0x1E, 0x76, 0x3D, 0xBA, 0x37, 0xBD,
286 0xF8, 0xFF, 0x94, 0x06, 0xAD, 0x9E, 0x53, 0x0E, 0xE5, 0xDB, 0x38, 0x2F,
287 0x41, 0x30, 0x01, 0xAE, 0xB0, 0x6A, 0x53, 0xED, 0x90, 0x27, 0xD8, 0x31,
288 0x17, 0x97, 0x27, 0xB0, 0x86, 0x5A, 0x89, 0x18, 0xDA, 0x3E, 0xDB, 0xEB,
289 0xCF, 0x9B, 0x14, 0xED, 0x44, 0xCE, 0x6C, 0xBA, 0xCE, 0xD4, 0xBB, 0x1B,
290 0xDB, 0x7F, 0x14, 0x47, 0xE6, 0xCC, 0x25, 0x4B, 0x33, 0x20, 0x51, 0x51,
291 0x2B, 0xD7, 0xAF, 0x42, 0x6F, 0xB8, 0xF4, 0x01, 0x37, 0x8C, 0xD2, 0xBF,
292 0x59, 0x83, 0xCA, 0x01, 0xC6, 0x4B, 0x92, 0xEC, 0xF0, 0x32, 0xEA, 0x15,
293 0xD1, 0x72, 0x1D, 0x03, 0xF4, 0x82, 0xD7, 0xCE, 0x6E, 0x74, 0xFE, 0xF6,
294 0xD5, 0x5E, 0x70, 0x2F, 0x46, 0x98, 0x0C, 0x82, 0xB5, 0xA8, 0x40, 0x31,
295 0x90, 0x0B, 0x1C, 0x9E, 0x59, 0xE7, 0xC9, 0x7F, 0xBE, 0xC7, 0xE8, 0xF3,
296 0x23, 0xA9, 0x7A, 0x7E, 0x36, 0xCC, 0x88, 0xBE, 0x0F, 0x1D, 0x45, 0xB7,
297 0xFF, 0x58, 0x5A, 0xC5, 0x4B, 0xD4, 0x07, 0xB2, 0x2B, 0x41, 0x54, 0xAA,
298 0xCC, 0x8F, 0x6D, 0x7E, 0xBF, 0x48, 0xE1, 0xD8, 0x14, 0xCC, 0x5E, 0xD2,
299 0x0F, 0x80, 0x37, 0xE0, 0xA7, 0x97, 0x15, 0xEE, 0xF2, 0x9B, 0xE3, 0x28,
300 0x06, 0xA1, 0xD5, 0x8B, 0xB7, 0xC5, 0xDA, 0x76, 0xF5, 0x50, 0xAA, 0x3D,
301 0x8A, 0x1F, 0xBF, 0xF0, 0xEB, 0x19, 0xCC, 0xB1, 0xA3, 0x13, 0xD5, 0x5C,
302 0xDA, 0x56, 0xC9, 0xEC, 0x2E, 0xF2, 0x96, 0x32, 0x38, 0x7F, 0xE8, 0xD7,
303 0x6E, 0x3C, 0x04, 0x68, 0x04, 0x3E, 0x8F, 0x66, 0x3F, 0x48, 0x60, 0xEE,
304 0x12, 0xBF, 0x2D, 0x5B, 0x0B, 0x74, 0x74, 0xD6, 0xE6, 0x94, 0xF9, 0x1E,
305 0x6D, 0xCC, 0x40, 0x24, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
306 };
307 return BN_bin2bn(RFC3526_PRIME_6144, sizeof(RFC3526_PRIME_6144), bn);
308}
309
310/* "8192-bit MODP Group" from RFC3526, Section 7.
311 *
312 * The prime is: 2^8192 - 2^8128 - 1 + 2^64 * { [2^8062 pi] + 4743158 }
313 *
314 * RFC3526 specifies a generator of 2.
315 */
316
317BIGNUM *
318get_rfc3526_prime_8192(BIGNUM *bn)
319{
320 static const unsigned char RFC3526_PRIME_8192[] = {
321 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC9, 0x0F, 0xDA, 0xA2,
322 0x21, 0x68, 0xC2, 0x34, 0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
323 0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74, 0x02, 0x0B, 0xBE, 0xA6,
324 0x3B, 0x13, 0x9B, 0x22, 0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
325 0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B, 0x30, 0x2B, 0x0A, 0x6D,
326 0xF2, 0x5F, 0x14, 0x37, 0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
327 0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6, 0xF4, 0x4C, 0x42, 0xE9,
328 0xA6, 0x37, 0xED, 0x6B, 0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
329 0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5, 0xAE, 0x9F, 0x24, 0x11,
330 0x7C, 0x4B, 0x1F, 0xE6, 0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
331 0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05, 0x98, 0xDA, 0x48, 0x36,
332 0x1C, 0x55, 0xD3, 0x9A, 0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
333 0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96, 0x1C, 0x62, 0xF3, 0x56,
334 0x20, 0x85, 0x52, 0xBB, 0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
335 0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04, 0xF1, 0x74, 0x6C, 0x08,
336 0xCA, 0x18, 0x21, 0x7C, 0x32, 0x90, 0x5E, 0x46, 0x2E, 0x36, 0xCE, 0x3B,
337 0xE3, 0x9E, 0x77, 0x2C, 0x18, 0x0E, 0x86, 0x03, 0x9B, 0x27, 0x83, 0xA2,
338 0xEC, 0x07, 0xA2, 0x8F, 0xB5, 0xC5, 0x5D, 0xF0, 0x6F, 0x4C, 0x52, 0xC9,
339 0xDE, 0x2B, 0xCB, 0xF6, 0x95, 0x58, 0x17, 0x18, 0x39, 0x95, 0x49, 0x7C,
340 0xEA, 0x95, 0x6A, 0xE5, 0x15, 0xD2, 0x26, 0x18, 0x98, 0xFA, 0x05, 0x10,
341 0x15, 0x72, 0x8E, 0x5A, 0x8A, 0xAA, 0xC4, 0x2D, 0xAD, 0x33, 0x17, 0x0D,
342 0x04, 0x50, 0x7A, 0x33, 0xA8, 0x55, 0x21, 0xAB, 0xDF, 0x1C, 0xBA, 0x64,
343 0xEC, 0xFB, 0x85, 0x04, 0x58, 0xDB, 0xEF, 0x0A, 0x8A, 0xEA, 0x71, 0x57,
344 0x5D, 0x06, 0x0C, 0x7D, 0xB3, 0x97, 0x0F, 0x85, 0xA6, 0xE1, 0xE4, 0xC7,
345 0xAB, 0xF5, 0xAE, 0x8C, 0xDB, 0x09, 0x33, 0xD7, 0x1E, 0x8C, 0x94, 0xE0,
346 0x4A, 0x25, 0x61, 0x9D, 0xCE, 0xE3, 0xD2, 0x26, 0x1A, 0xD2, 0xEE, 0x6B,
347 0xF1, 0x2F, 0xFA, 0x06, 0xD9, 0x8A, 0x08, 0x64, 0xD8, 0x76, 0x02, 0x73,
348 0x3E, 0xC8, 0x6A, 0x64, 0x52, 0x1F, 0x2B, 0x18, 0x17, 0x7B, 0x20, 0x0C,
349 0xBB, 0xE1, 0x17, 0x57, 0x7A, 0x61, 0x5D, 0x6C, 0x77, 0x09, 0x88, 0xC0,
350 0xBA, 0xD9, 0x46, 0xE2, 0x08, 0xE2, 0x4F, 0xA0, 0x74, 0xE5, 0xAB, 0x31,
351 0x43, 0xDB, 0x5B, 0xFC, 0xE0, 0xFD, 0x10, 0x8E, 0x4B, 0x82, 0xD1, 0x20,
352 0xA9, 0x21, 0x08, 0x01, 0x1A, 0x72, 0x3C, 0x12, 0xA7, 0x87, 0xE6, 0xD7,
353 0x88, 0x71, 0x9A, 0x10, 0xBD, 0xBA, 0x5B, 0x26, 0x99, 0xC3, 0x27, 0x18,
354 0x6A, 0xF4, 0xE2, 0x3C, 0x1A, 0x94, 0x68, 0x34, 0xB6, 0x15, 0x0B, 0xDA,
355 0x25, 0x83, 0xE9, 0xCA, 0x2A, 0xD4, 0x4C, 0xE8, 0xDB, 0xBB, 0xC2, 0xDB,
356 0x04, 0xDE, 0x8E, 0xF9, 0x2E, 0x8E, 0xFC, 0x14, 0x1F, 0xBE, 0xCA, 0xA6,
357 0x28, 0x7C, 0x59, 0x47, 0x4E, 0x6B, 0xC0, 0x5D, 0x99, 0xB2, 0x96, 0x4F,
358 0xA0, 0x90, 0xC3, 0xA2, 0x23, 0x3B, 0xA1, 0x86, 0x51, 0x5B, 0xE7, 0xED,
359 0x1F, 0x61, 0x29, 0x70, 0xCE, 0xE2, 0xD7, 0xAF, 0xB8, 0x1B, 0xDD, 0x76,
360 0x21, 0x70, 0x48, 0x1C, 0xD0, 0x06, 0x91, 0x27, 0xD5, 0xB0, 0x5A, 0xA9,
361 0x93, 0xB4, 0xEA, 0x98, 0x8D, 0x8F, 0xDD, 0xC1, 0x86, 0xFF, 0xB7, 0xDC,
362 0x90, 0xA6, 0xC0, 0x8F, 0x4D, 0xF4, 0x35, 0xC9, 0x34, 0x02, 0x84, 0x92,
363 0x36, 0xC3, 0xFA, 0xB4, 0xD2, 0x7C, 0x70, 0x26, 0xC1, 0xD4, 0xDC, 0xB2,
364 0x60, 0x26, 0x46, 0xDE, 0xC9, 0x75, 0x1E, 0x76, 0x3D, 0xBA, 0x37, 0xBD,
365 0xF8, 0xFF, 0x94, 0x06, 0xAD, 0x9E, 0x53, 0x0E, 0xE5, 0xDB, 0x38, 0x2F,
366 0x41, 0x30, 0x01, 0xAE, 0xB0, 0x6A, 0x53, 0xED, 0x90, 0x27, 0xD8, 0x31,
367 0x17, 0x97, 0x27, 0xB0, 0x86, 0x5A, 0x89, 0x18, 0xDA, 0x3E, 0xDB, 0xEB,
368 0xCF, 0x9B, 0x14, 0xED, 0x44, 0xCE, 0x6C, 0xBA, 0xCE, 0xD4, 0xBB, 0x1B,
369 0xDB, 0x7F, 0x14, 0x47, 0xE6, 0xCC, 0x25, 0x4B, 0x33, 0x20, 0x51, 0x51,
370 0x2B, 0xD7, 0xAF, 0x42, 0x6F, 0xB8, 0xF4, 0x01, 0x37, 0x8C, 0xD2, 0xBF,
371 0x59, 0x83, 0xCA, 0x01, 0xC6, 0x4B, 0x92, 0xEC, 0xF0, 0x32, 0xEA, 0x15,
372 0xD1, 0x72, 0x1D, 0x03, 0xF4, 0x82, 0xD7, 0xCE, 0x6E, 0x74, 0xFE, 0xF6,
373 0xD5, 0x5E, 0x70, 0x2F, 0x46, 0x98, 0x0C, 0x82, 0xB5, 0xA8, 0x40, 0x31,
374 0x90, 0x0B, 0x1C, 0x9E, 0x59, 0xE7, 0xC9, 0x7F, 0xBE, 0xC7, 0xE8, 0xF3,
375 0x23, 0xA9, 0x7A, 0x7E, 0x36, 0xCC, 0x88, 0xBE, 0x0F, 0x1D, 0x45, 0xB7,
376 0xFF, 0x58, 0x5A, 0xC5, 0x4B, 0xD4, 0x07, 0xB2, 0x2B, 0x41, 0x54, 0xAA,
377 0xCC, 0x8F, 0x6D, 0x7E, 0xBF, 0x48, 0xE1, 0xD8, 0x14, 0xCC, 0x5E, 0xD2,
378 0x0F, 0x80, 0x37, 0xE0, 0xA7, 0x97, 0x15, 0xEE, 0xF2, 0x9B, 0xE3, 0x28,
379 0x06, 0xA1, 0xD5, 0x8B, 0xB7, 0xC5, 0xDA, 0x76, 0xF5, 0x50, 0xAA, 0x3D,
380 0x8A, 0x1F, 0xBF, 0xF0, 0xEB, 0x19, 0xCC, 0xB1, 0xA3, 0x13, 0xD5, 0x5C,
381 0xDA, 0x56, 0xC9, 0xEC, 0x2E, 0xF2, 0x96, 0x32, 0x38, 0x7F, 0xE8, 0xD7,
382 0x6E, 0x3C, 0x04, 0x68, 0x04, 0x3E, 0x8F, 0x66, 0x3F, 0x48, 0x60, 0xEE,
383 0x12, 0xBF, 0x2D, 0x5B, 0x0B, 0x74, 0x74, 0xD6, 0xE6, 0x94, 0xF9, 0x1E,
384 0x6D, 0xBE, 0x11, 0x59, 0x74, 0xA3, 0x92, 0x6F, 0x12, 0xFE, 0xE5, 0xE4,
385 0x38, 0x77, 0x7C, 0xB6, 0xA9, 0x32, 0xDF, 0x8C, 0xD8, 0xBE, 0xC4, 0xD0,
386 0x73, 0xB9, 0x31, 0xBA, 0x3B, 0xC8, 0x32, 0xB6, 0x8D, 0x9D, 0xD3, 0x00,
387 0x74, 0x1F, 0xA7, 0xBF, 0x8A, 0xFC, 0x47, 0xED, 0x25, 0x76, 0xF6, 0x93,
388 0x6B, 0xA4, 0x24, 0x66, 0x3A, 0xAB, 0x63, 0x9C, 0x5A, 0xE4, 0xF5, 0x68,
389 0x34, 0x23, 0xB4, 0x74, 0x2B, 0xF1, 0xC9, 0x78, 0x23, 0x8F, 0x16, 0xCB,
390 0xE3, 0x9D, 0x65, 0x2D, 0xE3, 0xFD, 0xB8, 0xBE, 0xFC, 0x84, 0x8A, 0xD9,
391 0x22, 0x22, 0x2E, 0x04, 0xA4, 0x03, 0x7C, 0x07, 0x13, 0xEB, 0x57, 0xA8,
392 0x1A, 0x23, 0xF0, 0xC7, 0x34, 0x73, 0xFC, 0x64, 0x6C, 0xEA, 0x30, 0x6B,
393 0x4B, 0xCB, 0xC8, 0x86, 0x2F, 0x83, 0x85, 0xDD, 0xFA, 0x9D, 0x4B, 0x7F,
394 0xA2, 0xC0, 0x87, 0xE8, 0x79, 0x68, 0x33, 0x03, 0xED, 0x5B, 0xDD, 0x3A,
395 0x06, 0x2B, 0x3C, 0xF5, 0xB3, 0xA2, 0x78, 0xA6, 0x6D, 0x2A, 0x13, 0xF8,
396 0x3F, 0x44, 0xF8, 0x2D, 0xDF, 0x31, 0x0E, 0xE0, 0x74, 0xAB, 0x6A, 0x36,
397 0x45, 0x97, 0xE8, 0x99, 0xA0, 0x25, 0x5D, 0xC1, 0x64, 0xF3, 0x1C, 0xC5,
398 0x08, 0x46, 0x85, 0x1D, 0xF9, 0xAB, 0x48, 0x19, 0x5D, 0xED, 0x7E, 0xA1,
399 0xB1, 0xD5, 0x10, 0xBD, 0x7E, 0xE7, 0x4D, 0x73, 0xFA, 0xF3, 0x6B, 0xC3,
400 0x1E, 0xCF, 0xA2, 0x68, 0x35, 0x90, 0x46, 0xF4, 0xEB, 0x87, 0x9F, 0x92,
401 0x40, 0x09, 0x43, 0x8B, 0x48, 0x1C, 0x6C, 0xD7, 0x88, 0x9A, 0x00, 0x2E,
402 0xD5, 0xEE, 0x38, 0x2B, 0xC9, 0x19, 0x0D, 0xA6, 0xFC, 0x02, 0x6E, 0x47,
403 0x95, 0x58, 0xE4, 0x47, 0x56, 0x77, 0xE9, 0xAA, 0x9E, 0x30, 0x50, 0xE2,
404 0x76, 0x56, 0x94, 0xDF, 0xC8, 0x1F, 0x56, 0xE8, 0x80, 0xB9, 0x6E, 0x71,
405 0x60, 0xC9, 0x80, 0xDD, 0x98, 0xED, 0xD3, 0xDF, 0xFF, 0xFF, 0xFF, 0xFF,
406 0xFF, 0xFF, 0xFF, 0xFF,
407 };
408 return BN_bin2bn(RFC3526_PRIME_8192, sizeof(RFC3526_PRIME_8192), bn);
409}
diff --git a/src/lib/libcrypto/bn/bn_ctx.c b/src/lib/libcrypto/bn/bn_ctx.c
deleted file mode 100644
index eb2d6a43b3..0000000000
--- a/src/lib/libcrypto/bn/bn_ctx.c
+++ /dev/null
@@ -1,478 +0,0 @@
1/* $OpenBSD: bn_ctx.c,v 1.14 2015/02/10 09:50:12 miod Exp $ */
2/* Written by Ulf Moeller for the OpenSSL project. */
3/* ====================================================================
4 * Copyright (c) 1998-2004 The OpenSSL Project. All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * 3. All advertising materials mentioning features or use of this
19 * software must display the following acknowledgment:
20 * "This product includes software developed by the OpenSSL Project
21 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
22 *
23 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
24 * endorse or promote products derived from this software without
25 * prior written permission. For written permission, please contact
26 * openssl-core@openssl.org.
27 *
28 * 5. Products derived from this software may not be called "OpenSSL"
29 * nor may "OpenSSL" appear in their names without prior written
30 * permission of the OpenSSL Project.
31 *
32 * 6. Redistributions of any form whatsoever must retain the following
33 * acknowledgment:
34 * "This product includes software developed by the OpenSSL Project
35 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
38 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
40 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
41 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
43 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
44 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
45 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
46 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
47 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
48 * OF THE POSSIBILITY OF SUCH DAMAGE.
49 * ====================================================================
50 *
51 * This product includes cryptographic software written by Eric Young
52 * (eay@cryptsoft.com). This product includes software written by Tim
53 * Hudson (tjh@cryptsoft.com).
54 *
55 */
56
57#if !defined(BN_CTX_DEBUG) && !defined(BN_DEBUG)
58#ifndef NDEBUG
59#define NDEBUG
60#endif
61#endif
62
63#include <stdio.h>
64#include <string.h>
65
66#include <openssl/opensslconf.h>
67
68#include <openssl/err.h>
69
70#include "bn_lcl.h"
71
72/* TODO list
73 *
74 * 1. Check a bunch of "(words+1)" type hacks in various bignum functions and
75 * check they can be safely removed.
76 * - Check +1 and other ugliness in BN_from_montgomery()
77 *
78 * 2. Consider allowing a BN_new_ex() that, at least, lets you specify an
79 * appropriate 'block' size that will be honoured by bn_expand_internal() to
80 * prevent piddly little reallocations. OTOH, profiling bignum expansions in
81 * BN_CTX doesn't show this to be a big issue.
82 */
83
84/* How many bignums are in each "pool item". */
85#define BN_CTX_POOL_SIZE 16
86/* The stack frame info is resizable; set a first-time expansion size. */
87#define BN_CTX_START_FRAMES 32
88
89/***********/
90/* BN_POOL */
91/***********/
92
93/* A bundle of bignums that can be linked with other bundles */
94typedef struct bignum_pool_item {
95 /* The bignum values */
96 BIGNUM vals[BN_CTX_POOL_SIZE];
97 /* Linked-list admin */
98 struct bignum_pool_item *prev, *next;
99} BN_POOL_ITEM;
100
101/* A linked-list of bignums grouped in bundles */
102typedef struct bignum_pool {
103 /* Linked-list admin */
104 BN_POOL_ITEM *head, *current, *tail;
105 /* Stack depth and allocation size */
106 unsigned used, size;
107} BN_POOL;
108
109static void BN_POOL_init(BN_POOL *);
110static void BN_POOL_finish(BN_POOL *);
111#ifndef OPENSSL_NO_DEPRECATED
112static void BN_POOL_reset(BN_POOL *);
113#endif
114static BIGNUM * BN_POOL_get(BN_POOL *);
115static void BN_POOL_release(BN_POOL *, unsigned int);
116
117/************/
118/* BN_STACK */
119/************/
120
121/* A wrapper to manage the "stack frames" */
122typedef struct bignum_ctx_stack {
123 /* Array of indexes into the bignum stack */
124 unsigned int *indexes;
125 /* Number of stack frames, and the size of the allocated array */
126 unsigned int depth, size;
127} BN_STACK;
128
129static void BN_STACK_init(BN_STACK *);
130static void BN_STACK_finish(BN_STACK *);
131#ifndef OPENSSL_NO_DEPRECATED
132static void BN_STACK_reset(BN_STACK *);
133#endif
134static int BN_STACK_push(BN_STACK *, unsigned int);
135static unsigned int BN_STACK_pop(BN_STACK *);
136
137/**********/
138/* BN_CTX */
139/**********/
140
141/* The opaque BN_CTX type */
142struct bignum_ctx {
143 /* The bignum bundles */
144 BN_POOL pool;
145 /* The "stack frames", if you will */
146 BN_STACK stack;
147 /* The number of bignums currently assigned */
148 unsigned int used;
149 /* Depth of stack overflow */
150 int err_stack;
151 /* Block "gets" until an "end" (compatibility behaviour) */
152 int too_many;
153};
154
155/* Enable this to find BN_CTX bugs */
156#ifdef BN_CTX_DEBUG
157static const char *ctxdbg_cur = NULL;
158
159static void
160ctxdbg(BN_CTX *ctx)
161{
162 unsigned int bnidx = 0, fpidx = 0;
163 BN_POOL_ITEM *item = ctx->pool.head;
164 BN_STACK *stack = &ctx->stack;
165
166 fprintf(stderr, "(%08x): ", (unsigned int)ctx);
167 while (bnidx < ctx->used) {
168 fprintf(stderr, "%03x ",
169 item->vals[bnidx++ % BN_CTX_POOL_SIZE].dmax);
170 if (!(bnidx % BN_CTX_POOL_SIZE))
171 item = item->next;
172 }
173 fprintf(stderr, "\n");
174 bnidx = 0;
175 fprintf(stderr, " : ");
176 while (fpidx < stack->depth) {
177 while (bnidx++ < stack->indexes[fpidx])
178 fprintf(stderr, " ");
179 fprintf(stderr, "^^^ ");
180 bnidx++;
181 fpidx++;
182 }
183 fprintf(stderr, "\n");
184}
185#define CTXDBG_ENTRY(str, ctx) \
186 do { \
187 ctxdbg_cur = (str); \
188 fprintf(stderr, "Starting %s\n", ctxdbg_cur); \
189 ctxdbg(ctx); \
190 } while(0)
191
192#define CTXDBG_EXIT(ctx) \
193 do { \
194 fprintf(stderr, "Ending %s\n", ctxdbg_cur); \
195 ctxdbg(ctx); \
196 } while(0)
197
198#define CTXDBG_RET(ctx,ret)
199#else
200#define CTXDBG_ENTRY(str, ctx)
201#define CTXDBG_EXIT(ctx)
202#define CTXDBG_RET(ctx,ret)
203#endif
204
205/* This function is an evil legacy and should not be used. This implementation
206 * is WYSIWYG, though I've done my best. */
207#ifndef OPENSSL_NO_DEPRECATED
208void
209BN_CTX_init(BN_CTX *ctx)
210{
211 /* Assume the caller obtained the context via BN_CTX_new() and so is
212 * trying to reset it for use. Nothing else makes sense, least of all
213 * binary compatibility from a time when they could declare a static
214 * variable. */
215 BN_POOL_reset(&ctx->pool);
216 BN_STACK_reset(&ctx->stack);
217 ctx->used = 0;
218 ctx->err_stack = 0;
219 ctx->too_many = 0;
220}
221#endif
222
223BN_CTX *
224BN_CTX_new(void)
225{
226 BN_CTX *ret = malloc(sizeof(BN_CTX));
227 if (!ret) {
228 BNerr(BN_F_BN_CTX_NEW, ERR_R_MALLOC_FAILURE);
229 return NULL;
230 }
231
232 /* Initialise the structure */
233 BN_POOL_init(&ret->pool);
234 BN_STACK_init(&ret->stack);
235 ret->used = 0;
236 ret->err_stack = 0;
237 ret->too_many = 0;
238 return ret;
239}
240
241void
242BN_CTX_free(BN_CTX *ctx)
243{
244 if (ctx == NULL)
245 return;
246#ifdef BN_CTX_DEBUG
247 {
248 BN_POOL_ITEM *pool = ctx->pool.head;
249 fprintf(stderr, "BN_CTX_free, stack-size=%d, pool-bignums=%d\n",
250 ctx->stack.size, ctx->pool.size);
251 fprintf(stderr, "dmaxs: ");
252 while (pool) {
253 unsigned loop = 0;
254 while (loop < BN_CTX_POOL_SIZE)
255 fprintf(stderr, "%02x ",
256 pool->vals[loop++].dmax);
257 pool = pool->next;
258 }
259 fprintf(stderr, "\n");
260 }
261#endif
262 BN_STACK_finish(&ctx->stack);
263 BN_POOL_finish(&ctx->pool);
264 free(ctx);
265}
266
267void
268BN_CTX_start(BN_CTX *ctx)
269{
270 CTXDBG_ENTRY("BN_CTX_start", ctx);
271
272 /* If we're already overflowing ... */
273 if (ctx->err_stack || ctx->too_many)
274 ctx->err_stack++;
275 /* (Try to) get a new frame pointer */
276 else if (!BN_STACK_push(&ctx->stack, ctx->used)) {
277 BNerr(BN_F_BN_CTX_START, BN_R_TOO_MANY_TEMPORARY_VARIABLES);
278 ctx->err_stack++;
279 }
280 CTXDBG_EXIT(ctx);
281}
282
283void
284BN_CTX_end(BN_CTX *ctx)
285{
286 CTXDBG_ENTRY("BN_CTX_end", ctx);
287
288 if (ctx->err_stack)
289 ctx->err_stack--;
290 else {
291 unsigned int fp = BN_STACK_pop(&ctx->stack);
292 /* Does this stack frame have anything to release? */
293 if (fp < ctx->used)
294 BN_POOL_release(&ctx->pool, ctx->used - fp);
295 ctx->used = fp;
296 /* Unjam "too_many" in case "get" had failed */
297 ctx->too_many = 0;
298 }
299 CTXDBG_EXIT(ctx);
300}
301
302BIGNUM *
303BN_CTX_get(BN_CTX *ctx)
304{
305 BIGNUM *ret;
306
307 CTXDBG_ENTRY("BN_CTX_get", ctx);
308
309 if (ctx->err_stack || ctx->too_many)
310 return NULL;
311 if ((ret = BN_POOL_get(&ctx->pool)) == NULL) {
312 /* Setting too_many prevents repeated "get" attempts from
313 * cluttering the error stack. */
314 ctx->too_many = 1;
315 BNerr(BN_F_BN_CTX_GET, BN_R_TOO_MANY_TEMPORARY_VARIABLES);
316 return NULL;
317 }
318 /* OK, make sure the returned bignum is "zero" */
319 BN_zero(ret);
320 ctx->used++;
321 CTXDBG_RET(ctx, ret);
322 return ret;
323}
324
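The functions above form the intended calling pattern: BN_CTX_start() opens a frame, BN_CTX_get() hands out temporaries (once one get fails, every later get also returns NULL, so checking only the last one suffices), and BN_CTX_end() releases the whole frame at once. A minimal caller sketch using only this public API (sum_mod is an illustrative name):

#include <openssl/bn.h>

static int
sum_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
    BN_CTX *ctx)
{
	BIGNUM *t;
	int ret = 0;

	BN_CTX_start(ctx);
	if ((t = BN_CTX_get(ctx)) == NULL)	/* check only the last get */
		goto err;
	if (!BN_add(t, a, b))
		goto err;
	if (!BN_mod(r, t, m, ctx))		/* r = (a + b) mod m */
		goto err;
	ret = 1;

err:
	BN_CTX_end(ctx);
	return ret;
}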
325/************/
326/* BN_STACK */
327/************/
328
329static void
330BN_STACK_init(BN_STACK *st)
331{
332 st->indexes = NULL;
333 st->depth = st->size = 0;
334}
335
336static void
337BN_STACK_finish(BN_STACK *st)
338{
339 if (st->size)
340 free(st->indexes);
341}
342
343#ifndef OPENSSL_NO_DEPRECATED
344static void
345BN_STACK_reset(BN_STACK *st)
346{
347 st->depth = 0;
348}
349#endif
350
351static int
352BN_STACK_push(BN_STACK *st, unsigned int idx)
353{
354	if (st->depth == st->size) {
355		/* Need to expand */
356
357 unsigned int newsize = (st->size ?
358 (st->size * 3 / 2) : BN_CTX_START_FRAMES);
359 unsigned int *newitems = reallocarray(NULL,
360 newsize, sizeof(unsigned int));
361 if (!newitems)
362 return 0;
363 if (st->depth)
364 memcpy(newitems, st->indexes, st->depth *
365 sizeof(unsigned int));
366 if (st->size)
367 free(st->indexes);
368 st->indexes = newitems;
369 st->size = newsize;
370 }
371 st->indexes[(st->depth)++] = idx;
372 return 1;
373}
374
375static unsigned int
376BN_STACK_pop(BN_STACK *st)
377{
378 return st->indexes[--(st->depth)];
379}
380
381/***********/
382/* BN_POOL */
383/***********/
384
385static void
386BN_POOL_init(BN_POOL *p)
387{
388 p->head = p->current = p->tail = NULL;
389 p->used = p->size = 0;
390}
391
392static void
393BN_POOL_finish(BN_POOL *p)
394{
395 while (p->head) {
396 unsigned int loop = 0;
397 BIGNUM *bn = p->head->vals;
398 while (loop++ < BN_CTX_POOL_SIZE) {
399 if (bn->d)
400 BN_clear_free(bn);
401 bn++;
402 }
403 p->current = p->head->next;
404 free(p->head);
405 p->head = p->current;
406 }
407}
408
409#ifndef OPENSSL_NO_DEPRECATED
410static void
411BN_POOL_reset(BN_POOL *p)
412{
413 BN_POOL_ITEM *item = p->head;
414 while (item) {
415 unsigned int loop = 0;
416 BIGNUM *bn = item->vals;
417 while (loop++ < BN_CTX_POOL_SIZE) {
418 if (bn->d)
419 BN_clear(bn);
420 bn++;
421 }
422 item = item->next;
423 }
424 p->current = p->head;
425 p->used = 0;
426}
427#endif
428
429static BIGNUM *
430BN_POOL_get(BN_POOL *p)
431{
432 if (p->used == p->size) {
433 BIGNUM *bn;
434 unsigned int loop = 0;
435 BN_POOL_ITEM *item = malloc(sizeof(BN_POOL_ITEM));
436 if (!item)
437 return NULL;
438 /* Initialise the structure */
439 bn = item->vals;
440 while (loop++ < BN_CTX_POOL_SIZE)
441 BN_init(bn++);
442 item->prev = p->tail;
443 item->next = NULL;
444 /* Link it in */
445 if (!p->head)
446 p->head = p->current = p->tail = item;
447 else {
448 p->tail->next = item;
449 p->tail = item;
450 p->current = item;
451 }
452 p->size += BN_CTX_POOL_SIZE;
453 p->used++;
454 /* Return the first bignum from the new pool */
455 return item->vals;
456 }
457 if (!p->used)
458 p->current = p->head;
459 else if ((p->used % BN_CTX_POOL_SIZE) == 0)
460 p->current = p->current->next;
461 return p->current->vals + ((p->used++) % BN_CTX_POOL_SIZE);
462}
463
464static void
465BN_POOL_release(BN_POOL *p, unsigned int num)
466{
467 unsigned int offset = (p->used - 1) % BN_CTX_POOL_SIZE;
468
469 p->used -= num;
470 while (num--) {
471 bn_check_top(p->current->vals + offset);
472 if (!offset) {
473 offset = BN_CTX_POOL_SIZE - 1;
474 p->current = p->current->prev;
475 } else
476 offset--;
477 }
478}
diff --git a/src/lib/libcrypto/bn/bn_depr.c b/src/lib/libcrypto/bn/bn_depr.c
deleted file mode 100644
index dc5c2abee0..0000000000
--- a/src/lib/libcrypto/bn/bn_depr.c
+++ /dev/null
@@ -1,115 +0,0 @@
1/* $OpenBSD: bn_depr.c,v 1.7 2014/10/18 17:20:40 jsing Exp $ */
2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 * This product includes cryptographic software written by Eric Young
51 * (eay@cryptsoft.com). This product includes software written by Tim
52 * Hudson (tjh@cryptsoft.com).
53 *
54 */
55
56/* Support for deprecated functions goes here - static linkage will only slurp
57 * this code if applications are using them directly. */
58
59#include <stdio.h>
60#include <time.h>
61
62#include <openssl/opensslconf.h>
63
64#include "bn_lcl.h"
65
66#ifndef OPENSSL_NO_DEPRECATED
67BIGNUM *
68BN_generate_prime(BIGNUM *ret, int bits, int safe, const BIGNUM *add,
69 const BIGNUM *rem, void (*callback)(int, int, void *), void *cb_arg)
70{
71 BN_GENCB cb;
72 BIGNUM *rnd = NULL;
73 int found = 0;
74
75 BN_GENCB_set_old(&cb, callback, cb_arg);
76
77 if (ret == NULL) {
78 if ((rnd = BN_new()) == NULL)
79 goto err;
80 } else
81 rnd = ret;
82 if (!BN_generate_prime_ex(rnd, bits, safe, add, rem, &cb))
83 goto err;
84
85 /* we have a prime :-) */
86 found = 1;
87
88err:
89 if (!found && (ret == NULL) && (rnd != NULL))
90 BN_free(rnd);
91 return (found ? rnd : NULL);
92}
93
94int
95BN_is_prime(const BIGNUM *a, int checks, void (*callback)(int, int, void *),
96 BN_CTX *ctx_passed, void *cb_arg)
97{
98 BN_GENCB cb;
99
100 BN_GENCB_set_old(&cb, callback, cb_arg);
101 return BN_is_prime_ex(a, checks, ctx_passed, &cb);
102}
103
104int
105BN_is_prime_fasttest(const BIGNUM *a, int checks,
106 void (*callback)(int, int, void *), BN_CTX *ctx_passed, void *cb_arg,
107 int do_trial_division)
108{
109 BN_GENCB cb;
110
111 BN_GENCB_set_old(&cb, callback, cb_arg);
112 return BN_is_prime_fasttest_ex(a, checks, ctx_passed,
113 do_trial_division, &cb);
114}
115#endif
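Each wrapper above adapts the legacy bare-function callback with BN_GENCB_set_old() and forwards to the corresponding _ex function. New code should build the BN_GENCB with BN_GENCB_set() and call the _ex API directly; a hedged sketch of the modern replacement for BN_generate_prime() (new_cb and generate_prime_modern are illustrative names):

#include <openssl/bn.h>

static int
new_cb(int a, int b, BN_GENCB *cb)
{
	/* Return 0 to abort generation, 1 to continue. */
	return 1;
}

static BIGNUM *
generate_prime_modern(int bits)
{
	BN_GENCB cb;
	BIGNUM *rnd;

	BN_GENCB_set(&cb, new_cb, NULL);
	if ((rnd = BN_new()) == NULL)
		return NULL;
	if (!BN_generate_prime_ex(rnd, bits, 0, NULL, NULL, &cb)) {
		BN_free(rnd);
		return NULL;
	}
	return rnd;
}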
diff --git a/src/lib/libcrypto/bn/bn_div.c b/src/lib/libcrypto/bn/bn_div.c
deleted file mode 100644
index fefc53f9fa..0000000000
--- a/src/lib/libcrypto/bn/bn_div.c
+++ /dev/null
@@ -1,381 +0,0 @@
1/* $OpenBSD: bn_div.c,v 1.23 2015/02/09 15:49:22 jsing Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are adhered to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the routines from the library
36 * being used are not cryptographically related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publicly available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60
61#include <openssl/opensslconf.h>
62
63#include <openssl/bn.h>
64#include <openssl/err.h>
65
66#include "bn_lcl.h"
67
68#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) \
69 && !defined(BN_DIV3W)
70# if defined(__GNUC__) && __GNUC__>=2
71# if defined(__i386) || defined (__i386__)
72 /*
73 * There were two reasons for implementing this template:
74 * - GNU C generates a call to a function (__udivdi3 to be exact)
75 * in reply to ((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0 (I fail to
76 * understand why...);
77	 * - divl not only calculates the quotient, but also leaves the
78	 *   remainder in %edx, which we can definitely use here :-)
79 *
80 * <appro@fy.chalmers.se>
81 */
82#undef bn_div_words
83# define bn_div_words(n0,n1,d0) \
84 ({ asm volatile ( \
85 "divl %4" \
86 : "=a"(q), "=d"(rem) \
87 : "a"(n1), "d"(n0), "g"(d0) \
88 : "cc"); \
89 q; \
90 })
91# define REMAINDER_IS_ALREADY_CALCULATED
92# elif defined(__x86_64)
93 /*
94 * Same story here, but it's 128-bit by 64-bit division. Wow!
95 * <appro@fy.chalmers.se>
96 */
97# undef bn_div_words
98# define bn_div_words(n0,n1,d0) \
99 ({ asm volatile ( \
100 "divq %4" \
101 : "=a"(q), "=d"(rem) \
102 : "a"(n1), "d"(n0), "g"(d0) \
103 : "cc"); \
104 q; \
105 })
106# define REMAINDER_IS_ALREADY_CALCULATED
107# endif /* __<cpu> */
108# endif /* __GNUC__ */
109#endif /* OPENSSL_NO_ASM */
110
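On targets where neither template applies, bn_div_words() falls back to the portable C implementation (bn_asm.c in this tree); either way the contract is plain double-word-by-single-word division. A sketch of that contract for a 64-bit build, using the non-standard __int128 GCC extension purely as an illustration:

/* q = ((n0 << 64) | n1) / d0; well-defined when n0 < d0, so the
 * quotient fits in one 64-bit word.  The remainder would be n % d0. */
static unsigned long long
div_words_demo(unsigned long long n0, unsigned long long n1,
    unsigned long long d0)
{
	unsigned __int128 n = ((unsigned __int128)n0 << 64) | n1;

	return (unsigned long long)(n / d0);
}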
111
112/* BN_div computes dv := num / divisor, rounding towards
113 * zero, and sets up rm such that dv*divisor + rm = num holds.
114 * Thus:
115 * dv->neg == num->neg ^ divisor->neg (unless the result is zero)
116 * rm->neg == num->neg (unless the remainder is zero)
117 * If 'dv' or 'rm' is NULL, the respective value is not returned.
118 */
119int
120BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
121 BN_CTX *ctx)
122{
123 int norm_shift, i, loop;
124 BIGNUM *tmp, wnum, *snum, *sdiv, *res;
125 BN_ULONG *resp, *wnump;
126 BN_ULONG d0, d1;
127 int num_n, div_n;
128 int no_branch = 0;
129
130 /* Invalid zero-padding would have particularly bad consequences
131 * in the case of 'num', so don't just rely on bn_check_top() for this one
132 * (bn_check_top() works only for BN_DEBUG builds) */
133 if (num->top > 0 && num->d[num->top - 1] == 0) {
134 BNerr(BN_F_BN_DIV, BN_R_NOT_INITIALIZED);
135 return 0;
136 }
137
138 bn_check_top(num);
139
140 if ((BN_get_flags(num, BN_FLG_CONSTTIME) != 0) ||
141 (BN_get_flags(divisor, BN_FLG_CONSTTIME) != 0)) {
142 no_branch = 1;
143 }
144
145 bn_check_top(dv);
146 bn_check_top(rm);
147 /* bn_check_top(num); */ /* 'num' has been checked already */
148 bn_check_top(divisor);
149
150 if (BN_is_zero(divisor)) {
151 BNerr(BN_F_BN_DIV, BN_R_DIV_BY_ZERO);
152 return (0);
153 }
154
155 if (!no_branch && BN_ucmp(num, divisor) < 0) {
156 if (rm != NULL) {
157 if (BN_copy(rm, num) == NULL)
158 return (0);
159 }
160 if (dv != NULL)
161 BN_zero(dv);
162 return (1);
163 }
164
165 BN_CTX_start(ctx);
166 tmp = BN_CTX_get(ctx);
167 snum = BN_CTX_get(ctx);
168 sdiv = BN_CTX_get(ctx);
169 if (dv == NULL)
170 res = BN_CTX_get(ctx);
171 else
172 res = dv;
173 if (tmp == NULL || snum == NULL || sdiv == NULL || res == NULL)
174 goto err;
175
176 /* First we normalise the numbers */
177 norm_shift = BN_BITS2 - ((BN_num_bits(divisor)) % BN_BITS2);
178 if (!(BN_lshift(sdiv, divisor, norm_shift)))
179 goto err;
180 sdiv->neg = 0;
181 norm_shift += BN_BITS2;
182 if (!(BN_lshift(snum, num, norm_shift)))
183 goto err;
184 snum->neg = 0;
185
186 if (no_branch) {
187 /* Since we don't know whether snum is larger than sdiv,
188 * we pad snum with enough zeroes without changing its
189 * value.
190 */
191 if (snum->top <= sdiv->top + 1) {
192 if (bn_wexpand(snum, sdiv->top + 2) == NULL)
193 goto err;
194 for (i = snum->top; i < sdiv->top + 2; i++)
195 snum->d[i] = 0;
196 snum->top = sdiv->top + 2;
197 } else {
198 if (bn_wexpand(snum, snum->top + 1) == NULL)
199 goto err;
200 snum->d[snum->top] = 0;
201 snum->top ++;
202 }
203 }
204
205 div_n = sdiv->top;
206 num_n = snum->top;
207 loop = num_n - div_n;
208	/* Let's set up a 'window' into snum.
209 * This is the part that corresponds to the current
210 * 'area' being divided */
211 wnum.neg = 0;
212 wnum.d = &(snum->d[loop]);
213 wnum.top = div_n;
214 /* only needed when BN_ucmp messes up the values between top and max */
215 wnum.dmax = snum->dmax - loop; /* so we don't step out of bounds */
216 wnum.flags = snum->flags | BN_FLG_STATIC_DATA;
217
218 /* Get the top 2 words of sdiv */
219 /* div_n=sdiv->top; */
220 d0 = sdiv->d[div_n - 1];
221 d1 = (div_n == 1) ? 0 : sdiv->d[div_n - 2];
222
223 /* pointer to the 'top' of snum */
224 wnump = &(snum->d[num_n - 1]);
225
226	/* Set up 'res' */
227 res->neg = (num->neg ^ divisor->neg);
228 if (!bn_wexpand(res, (loop + 1)))
229 goto err;
230 res->top = loop - no_branch;
231 resp = &(res->d[loop - 1]);
232
233 /* space for temp */
234 if (!bn_wexpand(tmp, (div_n + 1)))
235 goto err;
236
237 if (!no_branch) {
238 if (BN_ucmp(&wnum, sdiv) >= 0) {
239 /* If BN_DEBUG_RAND is defined BN_ucmp changes (via
240 * bn_pollute) the const bignum arguments =>
241 * clean the values between top and max again */
242 bn_clear_top2max(&wnum);
243 bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n);
244 *resp = 1;
245 } else
246 res->top--;
247 }
248
249	/* if res->top == 0 then clear the neg value, otherwise decrease
250	 * the resp pointer */
251 if (res->top == 0)
252 res->neg = 0;
253 else
254 resp--;
255
256 for (i = 0; i < loop - 1; i++, wnump--, resp--) {
257 BN_ULONG q, l0;
258 /* the first part of the loop uses the top two words of
259 * snum and sdiv to calculate a BN_ULONG q such that
260 * | wnum - sdiv * q | < sdiv */
261#if defined(BN_DIV3W) && !defined(OPENSSL_NO_ASM)
262 BN_ULONG bn_div_3_words(BN_ULONG*, BN_ULONG, BN_ULONG);
263 q = bn_div_3_words(wnump, d1, d0);
264#else
265 BN_ULONG n0, n1, rem = 0;
266
267 n0 = wnump[0];
268 n1 = wnump[-1];
269 if (n0 == d0)
270 q = BN_MASK2;
271 else /* n0 < d0 */
272 {
273#ifdef BN_LLONG
274 BN_ULLONG t2;
275
276#if defined(BN_DIV2W) && !defined(bn_div_words)
277 q = (BN_ULONG)(((((BN_ULLONG)n0) << BN_BITS2)|n1)/d0);
278#else
279 q = bn_div_words(n0, n1, d0);
280#endif
281
282#ifndef REMAINDER_IS_ALREADY_CALCULATED
283 /*
284			 * rem doesn't have to be BN_ULLONG. At least we
285			 * know it's less than d0, isn't it?
286 */
287 rem = (n1 - q * d0) & BN_MASK2;
288#endif
289 t2 = (BN_ULLONG)d1*q;
290
291 for (;;) {
292 if (t2 <= ((((BN_ULLONG)rem) << BN_BITS2) |
293 wnump[-2]))
294 break;
295 q--;
296 rem += d0;
297 if (rem < d0) break; /* don't let rem overflow */
298 t2 -= d1;
299 }
300#else /* !BN_LLONG */
301 BN_ULONG t2l, t2h;
302
303 q = bn_div_words(n0, n1, d0);
304#ifndef REMAINDER_IS_ALREADY_CALCULATED
305 rem = (n1 - q*d0)&BN_MASK2;
306#endif
307
308#if defined(BN_UMULT_LOHI)
309 BN_UMULT_LOHI(t2l, t2h, d1, q);
310#elif defined(BN_UMULT_HIGH)
311 t2l = d1 * q;
312 t2h = BN_UMULT_HIGH(d1, q);
313#else
314 {
315 BN_ULONG ql, qh;
316 t2l = LBITS(d1);
317 t2h = HBITS(d1);
318 ql = LBITS(q);
319 qh = HBITS(q);
320 mul64(t2l, t2h, ql, qh); /* t2=(BN_ULLONG)d1*q; */
321 }
322#endif
323
324 for (;;) {
325 if ((t2h < rem) ||
326 ((t2h == rem) && (t2l <= wnump[-2])))
327 break;
328 q--;
329 rem += d0;
330 if (rem < d0)
331 break; /* don't let rem overflow */
332 if (t2l < d1)
333 t2h--;
334 t2l -= d1;
335 }
336#endif /* !BN_LLONG */
337 }
338#endif /* !BN_DIV3W */
339
340 l0 = bn_mul_words(tmp->d, sdiv->d, div_n, q);
341 tmp->d[div_n] = l0;
342 wnum.d--;
343		/* ignore top values of the bignums; just subtract the two
344		 * BN_ULONG arrays with bn_sub_words */
345 if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n + 1)) {
346 /* Note: As we have considered only the leading
347 * two BN_ULONGs in the calculation of q, sdiv * q
348 * might be greater than wnum (but then (q-1) * sdiv
349 * is less or equal than wnum)
350 */
351 q--;
352 if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n))
353 /* we can't have an overflow here (assuming
354 * that q != 0, but if q == 0 then tmp is
355 * zero anyway) */
356 (*wnump)++;
357 }
358 /* store part of the result */
359 *resp = q;
360 }
361 bn_correct_top(snum);
362 if (rm != NULL) {
363 /* Keep a copy of the neg flag in num because if rm==num
364 * BN_rshift() will overwrite it.
365 */
366 int neg = num->neg;
367 BN_rshift(rm, snum, norm_shift);
368 if (!BN_is_zero(rm))
369 rm->neg = neg;
370 bn_check_top(rm);
371 }
372 if (no_branch)
373 bn_correct_top(res);
374 BN_CTX_end(ctx);
375 return (1);
376
377err:
378 bn_check_top(rm);
379 BN_CTX_end(ctx);
380 return (0);
381}
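A sketch of a caller exercising the contract stated in the header comment (dv*divisor + rm == num, truncation toward zero); check_div is an illustrative name and assumes initialized inputs and a nonzero divisor:

#include <openssl/bn.h>

static int
check_div(const BIGNUM *num, const BIGNUM *divisor, BN_CTX *ctx)
{
	BIGNUM *dv, *rm, *t;
	int ok = 0;

	BN_CTX_start(ctx);
	dv = BN_CTX_get(ctx);
	rm = BN_CTX_get(ctx);
	if ((t = BN_CTX_get(ctx)) == NULL)
		goto err;
	if (!BN_div(dv, rm, num, divisor, ctx))
		goto err;
	if (!BN_mul(t, dv, divisor, ctx) || !BN_add(t, t, rm))
		goto err;
	ok = (BN_cmp(t, num) == 0);	/* dv*divisor + rm == num */

err:
	BN_CTX_end(ctx);
	return ok;
}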
diff --git a/src/lib/libcrypto/bn/bn_err.c b/src/lib/libcrypto/bn/bn_err.c
deleted file mode 100644
index 5a0f359d86..0000000000
--- a/src/lib/libcrypto/bn/bn_err.c
+++ /dev/null
@@ -1,150 +0,0 @@
1/* $OpenBSD: bn_err.c,v 1.12 2014/07/10 22:45:56 jsing Exp $ */
2/* ====================================================================
3 * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@OpenSSL.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 * This product includes cryptographic software written by Eric Young
51 * (eay@cryptsoft.com). This product includes software written by Tim
52 * Hudson (tjh@cryptsoft.com).
53 *
54 */
55
56/* NOTE: this file was auto generated by the mkerr.pl script: any changes
57 * made to it will be overwritten when the script next updates this file,
58 * only reason strings will be preserved.
59 */
60
61#include <stdio.h>
62
63#include <openssl/opensslconf.h>
64
65#include <openssl/err.h>
66#include <openssl/bn.h>
67
68/* BEGIN ERROR CODES */
69#ifndef OPENSSL_NO_ERR
70
71#define ERR_FUNC(func) ERR_PACK(ERR_LIB_BN,func,0)
72#define ERR_REASON(reason) ERR_PACK(ERR_LIB_BN,0,reason)
73
74static ERR_STRING_DATA BN_str_functs[]= {
75 {ERR_FUNC(BN_F_BNRAND), "BNRAND"},
76 {ERR_FUNC(BN_F_BN_BLINDING_CONVERT_EX), "BN_BLINDING_convert_ex"},
77 {ERR_FUNC(BN_F_BN_BLINDING_CREATE_PARAM), "BN_BLINDING_create_param"},
78 {ERR_FUNC(BN_F_BN_BLINDING_INVERT_EX), "BN_BLINDING_invert_ex"},
79 {ERR_FUNC(BN_F_BN_BLINDING_NEW), "BN_BLINDING_new"},
80 {ERR_FUNC(BN_F_BN_BLINDING_UPDATE), "BN_BLINDING_update"},
81 {ERR_FUNC(BN_F_BN_BN2DEC), "BN_bn2dec"},
82 {ERR_FUNC(BN_F_BN_BN2HEX), "BN_bn2hex"},
83 {ERR_FUNC(BN_F_BN_CTX_GET), "BN_CTX_get"},
84 {ERR_FUNC(BN_F_BN_CTX_NEW), "BN_CTX_new"},
85 {ERR_FUNC(BN_F_BN_CTX_START), "BN_CTX_start"},
86 {ERR_FUNC(BN_F_BN_DIV), "BN_div"},
87 {ERR_FUNC(BN_F_BN_DIV_NO_BRANCH), "BN_div_no_branch"},
88 {ERR_FUNC(BN_F_BN_DIV_RECP), "BN_div_recp"},
89 {ERR_FUNC(BN_F_BN_EXP), "BN_exp"},
90 {ERR_FUNC(BN_F_BN_EXPAND2), "bn_expand2"},
91 {ERR_FUNC(BN_F_BN_EXPAND_INTERNAL), "BN_EXPAND_INTERNAL"},
92 {ERR_FUNC(BN_F_BN_GF2M_MOD), "BN_GF2m_mod"},
93 {ERR_FUNC(BN_F_BN_GF2M_MOD_EXP), "BN_GF2m_mod_exp"},
94 {ERR_FUNC(BN_F_BN_GF2M_MOD_MUL), "BN_GF2m_mod_mul"},
95 {ERR_FUNC(BN_F_BN_GF2M_MOD_SOLVE_QUAD), "BN_GF2m_mod_solve_quad"},
96 {ERR_FUNC(BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR), "BN_GF2m_mod_solve_quad_arr"},
97 {ERR_FUNC(BN_F_BN_GF2M_MOD_SQR), "BN_GF2m_mod_sqr"},
98 {ERR_FUNC(BN_F_BN_GF2M_MOD_SQRT), "BN_GF2m_mod_sqrt"},
99 {ERR_FUNC(BN_F_BN_MOD_EXP2_MONT), "BN_mod_exp2_mont"},
100 {ERR_FUNC(BN_F_BN_MOD_EXP_MONT), "BN_mod_exp_mont"},
101 {ERR_FUNC(BN_F_BN_MOD_EXP_MONT_CONSTTIME), "BN_mod_exp_mont_consttime"},
102 {ERR_FUNC(BN_F_BN_MOD_EXP_MONT_WORD), "BN_mod_exp_mont_word"},
103 {ERR_FUNC(BN_F_BN_MOD_EXP_RECP), "BN_mod_exp_recp"},
104 {ERR_FUNC(BN_F_BN_MOD_EXP_SIMPLE), "BN_mod_exp_simple"},
105 {ERR_FUNC(BN_F_BN_MOD_INVERSE), "BN_mod_inverse"},
106 {ERR_FUNC(BN_F_BN_MOD_INVERSE_NO_BRANCH), "BN_mod_inverse_no_branch"},
107 {ERR_FUNC(BN_F_BN_MOD_LSHIFT_QUICK), "BN_mod_lshift_quick"},
108 {ERR_FUNC(BN_F_BN_MOD_MUL_RECIPROCAL), "BN_mod_mul_reciprocal"},
109 {ERR_FUNC(BN_F_BN_MOD_SQRT), "BN_mod_sqrt"},
110 {ERR_FUNC(BN_F_BN_MPI2BN), "BN_mpi2bn"},
111 {ERR_FUNC(BN_F_BN_NEW), "BN_new"},
112 {ERR_FUNC(BN_F_BN_RAND), "BN_rand"},
113 {ERR_FUNC(BN_F_BN_RAND_RANGE), "BN_rand_range"},
114 {ERR_FUNC(BN_F_BN_USUB), "BN_usub"},
115 {0, NULL}
116};
117
118static ERR_STRING_DATA BN_str_reasons[]= {
119 {ERR_REASON(BN_R_ARG2_LT_ARG3) , "arg2 lt arg3"},
120 {ERR_REASON(BN_R_BAD_RECIPROCAL) , "bad reciprocal"},
121 {ERR_REASON(BN_R_BIGNUM_TOO_LONG) , "bignum too long"},
122 {ERR_REASON(BN_R_CALLED_WITH_EVEN_MODULUS), "called with even modulus"},
123 {ERR_REASON(BN_R_DIV_BY_ZERO) , "div by zero"},
124 {ERR_REASON(BN_R_ENCODING_ERROR) , "encoding error"},
125 {ERR_REASON(BN_R_EXPAND_ON_STATIC_BIGNUM_DATA), "expand on static bignum data"},
126 {ERR_REASON(BN_R_INPUT_NOT_REDUCED) , "input not reduced"},
127 {ERR_REASON(BN_R_INVALID_LENGTH) , "invalid length"},
128 {ERR_REASON(BN_R_INVALID_RANGE) , "invalid range"},
129 {ERR_REASON(BN_R_NOT_A_SQUARE) , "not a square"},
130 {ERR_REASON(BN_R_NOT_INITIALIZED) , "not initialized"},
131 {ERR_REASON(BN_R_NO_INVERSE) , "no inverse"},
132 {ERR_REASON(BN_R_NO_SOLUTION) , "no solution"},
133 {ERR_REASON(BN_R_P_IS_NOT_PRIME) , "p is not prime"},
134 {ERR_REASON(BN_R_TOO_MANY_ITERATIONS) , "too many iterations"},
135 {ERR_REASON(BN_R_TOO_MANY_TEMPORARY_VARIABLES), "too many temporary variables"},
136 {0, NULL}
137};
138
139#endif
140
141void
142ERR_load_BN_strings(void)
143{
144#ifndef OPENSSL_NO_ERR
145 if (ERR_func_error_string(BN_str_functs[0].error) == NULL) {
146 ERR_load_strings(0, BN_str_functs);
147 ERR_load_strings(0, BN_str_reasons);
148 }
149#endif
150}
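Once these tables are registered, ERR_error_string() renders BN errors as text. A sketch, assuming ERR_load_BN_strings() (or the umbrella ERR_load_crypto_strings()) has already run; report_bn_error is an illustrative name:

#include <stdio.h>
#include <openssl/err.h>

static void
report_bn_error(void)
{
	unsigned long e = ERR_get_error();

	/* For example, a BN_div() by zero yields "... div by zero". */
	if (e != 0)
		fprintf(stderr, "%s\n", ERR_error_string(e, NULL));
}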
diff --git a/src/lib/libcrypto/bn/bn_exp.c b/src/lib/libcrypto/bn/bn_exp.c
deleted file mode 100644
index 4a28c2c605..0000000000
--- a/src/lib/libcrypto/bn/bn_exp.c
+++ /dev/null
@@ -1,1097 +0,0 @@
1/* $OpenBSD: bn_exp.c,v 1.22 2015/03/21 08:05:20 doug Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are adhered to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the routines from the library
36 * being used are not cryptographically related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publicly available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58/* ====================================================================
59 * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved.
60 *
61 * Redistribution and use in source and binary forms, with or without
62 * modification, are permitted provided that the following conditions
63 * are met:
64 *
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 *
68 * 2. Redistributions in binary form must reproduce the above copyright
69 * notice, this list of conditions and the following disclaimer in
70 * the documentation and/or other materials provided with the
71 * distribution.
72 *
73 * 3. All advertising materials mentioning features or use of this
74 * software must display the following acknowledgment:
75 * "This product includes software developed by the OpenSSL Project
76 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
77 *
78 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
79 * endorse or promote products derived from this software without
80 * prior written permission. For written permission, please contact
81 * openssl-core@openssl.org.
82 *
83 * 5. Products derived from this software may not be called "OpenSSL"
84 * nor may "OpenSSL" appear in their names without prior written
85 * permission of the OpenSSL Project.
86 *
87 * 6. Redistributions of any form whatsoever must retain the following
88 * acknowledgment:
89 * "This product includes software developed by the OpenSSL Project
90 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
91 *
92 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
93 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
95 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
96 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
97 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
98 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
99 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
101 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
103 * OF THE POSSIBILITY OF SUCH DAMAGE.
104 * ====================================================================
105 *
106 * This product includes cryptographic software written by Eric Young
107 * (eay@cryptsoft.com). This product includes software written by Tim
108 * Hudson (tjh@cryptsoft.com).
109 *
110 */
111
112#include <stdlib.h>
113#include <string.h>
114
115#include <openssl/err.h>
116
117#include "bn_lcl.h"
118
119/* maximum precomputation table size for *variable* sliding windows */
120#define TABLE_SIZE 32
121
122/* this one works - simple, but it works */
123int
124BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
125{
126 int i, bits, ret = 0;
127 BIGNUM *v, *rr;
128
129 if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
130 /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
131 BNerr(BN_F_BN_EXP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
132 return -1;
133 }
134
135 BN_CTX_start(ctx);
136 if ((r == a) || (r == p))
137 rr = BN_CTX_get(ctx);
138 else
139 rr = r;
140 v = BN_CTX_get(ctx);
141 if (rr == NULL || v == NULL)
142 goto err;
143
144 if (BN_copy(v, a) == NULL)
145 goto err;
146 bits = BN_num_bits(p);
147
148 if (BN_is_odd(p)) {
149 if (BN_copy(rr, a) == NULL)
150 goto err;
151 } else {
152 if (!BN_one(rr))
153 goto err;
154 }
155
156 for (i = 1; i < bits; i++) {
157 if (!BN_sqr(v, v, ctx))
158 goto err;
159 if (BN_is_bit_set(p, i)) {
160 if (!BN_mul(rr, rr, v, ctx))
161 goto err;
162 }
163 }
164 ret = 1;
165
166err:
167 if (r != rr && rr != NULL)
168 BN_copy(r, rr);
169 BN_CTX_end(ctx);
170 bn_check_top(r);
171 return (ret);
172}
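
The loop above is right-to-left binary exponentiation: v tracks a^(2^i) while rr accumulates the product over the set bits of p. A minimal standalone sketch of the same scheme, purely illustrative (uint64_t instead of BIGNUM, GCC/clang __builtin_clzll assumed, no overflow handling):

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch of the bit scan in BN_exp(), on machine words. */
static uint64_t
exp_u64(uint64_t a, uint64_t p)
{
	uint64_t v = a;			/* v == a^(2^i) at step i */
	uint64_t rr = (p & 1) ? a : 1;	/* bit 0 handled up front, as above */
	int i, bits = 64 - __builtin_clzll(p | 1);

	for (i = 1; i < bits; i++) {
		v *= v;			/* BN_sqr(v, v, ctx) */
		if ((p >> i) & 1)
			rr *= v;	/* BN_mul(rr, rr, v, ctx) */
	}
	return rr;
}

int
main(void)
{
	printf("%llu\n", (unsigned long long)exp_u64(3, 10));	/* 59049 */
	return 0;
}
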
173
174int
175BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
176 BN_CTX *ctx)
177{
178 int ret;
179
180 bn_check_top(a);
181 bn_check_top(p);
182 bn_check_top(m);
183
184 /* For even modulus m = 2^k*m_odd, it might make sense to compute
185 * a^p mod m_odd and a^p mod 2^k separately (with Montgomery
186 * exponentiation for the odd part), using appropriate exponent
187 * reductions, and combine the results using the CRT.
188 *
189 * For now, we use Montgomery only if the modulus is odd; otherwise,
190 * exponentiation using the reciprocal-based quick remaindering
191 * algorithm is used.
192 *
193 * (Timing obtained with expspeed.c [computations a^p mod m
194 * where a, p, m are of the same length: 256, 512, 1024, 2048,
195 * 4096, 8192 bits], compared to the running time of the
196 * standard algorithm:
197 *
198 * BN_mod_exp_mont 33 .. 40 % [AMD K6-2, Linux, debug configuration]
199 * 55 .. 77 % [UltraSparc processor, but
200 * debug-solaris-sparcv8-gcc conf.]
201 *
202 * BN_mod_exp_recp 50 .. 70 % [AMD K6-2, Linux, debug configuration]
203 * 62 .. 118 % [UltraSparc, debug-solaris-sparcv8-gcc]
204 *
205 * On the Sparc, BN_mod_exp_recp was faster than BN_mod_exp_mont
206 * at 2048 and more bits, but at 512 and 1024 bits, it was
207 * slower even than the standard algorithm!
208 *
209 * "Real" timings [linux-elf, solaris-sparcv9-gcc configurations]
210 * should be obtained when the new Montgomery reduction code
211 * has been integrated into OpenSSL.)
212 */
213
214#define MONT_MUL_MOD
215#define MONT_EXP_WORD
216#define RECP_MUL_MOD
217
218#ifdef MONT_MUL_MOD
219 /* I have finally been able to take out this pre-condition of
220 * the top bit being set. It was caused by an error in BN_div
221 * with negatives. There was also another problem when computing
222 * a^b%m with a >= m. eay 07-May-97 */
223/* if ((m->d[m->top-1]&BN_TBIT) && BN_is_odd(m)) */
224
225 if (BN_is_odd(m)) {
226# ifdef MONT_EXP_WORD
227 if (a->top == 1 && !a->neg &&
228 (BN_get_flags(p, BN_FLG_CONSTTIME) == 0)) {
229 BN_ULONG A = a->d[0];
230 ret = BN_mod_exp_mont_word(r, A, p, m, ctx, NULL);
231 } else
232# endif
233 ret = BN_mod_exp_mont(r, a, p, m, ctx, NULL);
234 } else
235#endif
236#ifdef RECP_MUL_MOD
237 {
238 ret = BN_mod_exp_recp(r, a, p, m, ctx);
239 }
240#else
241 {
242 ret = BN_mod_exp_simple(r, a, p, m, ctx);
243 }
244#endif
245
246 bn_check_top(r);
247 return (ret);
248}
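
BN_mod_exp() itself only dispatches: odd moduli go to the Montgomery routines (with a single-word fast path), even moduli to the reciprocal-based routine. A hedged usage sketch of the entry point, error handling abbreviated; the modulus 561 = 3*11*17 is a Carmichael number, so the expected output for 7^560 mod 561 is 1:

#include <stdio.h>
#include <stdlib.h>
#include <openssl/bn.h>

int
main(void)
{
	BN_CTX *ctx = BN_CTX_new();
	BIGNUM *r = BN_new(), *a = BN_new(), *p = BN_new(), *m = BN_new();
	char *s;

	if (ctx == NULL || r == NULL || a == NULL || p == NULL || m == NULL)
		return 1;
	BN_set_word(a, 7);	/* return values elided for brevity */
	BN_set_word(p, 560);
	BN_set_word(m, 561);
	if (!BN_mod_exp(r, a, p, m, ctx))
		return 1;
	s = BN_bn2dec(r);
	printf("7^560 mod 561 = %s\n", s);	/* prints 1 */
	free(s);	/* OPENSSL_free() in OpenSSL proper */
	BN_free(r); BN_free(a); BN_free(p); BN_free(m);
	BN_CTX_free(ctx);
	return 0;
}
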
249
250int
251BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
252 BN_CTX *ctx)
253{
254 int i, j, bits, ret = 0, wstart, wend, window, wvalue;
255 int start = 1;
256 BIGNUM *aa;
257 /* Table of variables obtained from 'ctx' */
258 BIGNUM *val[TABLE_SIZE];
259 BN_RECP_CTX recp;
260
261 if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
262 /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
263 BNerr(BN_F_BN_MOD_EXP_RECP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
264 return -1;
265 }
266
267 bits = BN_num_bits(p);
268
269 if (bits == 0) {
270 ret = BN_one(r);
271 return ret;
272 }
273
274 BN_CTX_start(ctx);
275 if ((aa = BN_CTX_get(ctx)) == NULL)
276 goto err;
277 if ((val[0] = BN_CTX_get(ctx)) == NULL)
278 goto err;
279
280 BN_RECP_CTX_init(&recp);
281 if (m->neg) {
282 /* ignore sign of 'm' */
283 if (!BN_copy(aa, m))
284 goto err;
285 aa->neg = 0;
286 if (BN_RECP_CTX_set(&recp, aa, ctx) <= 0)
287 goto err;
288 } else {
289 if (BN_RECP_CTX_set(&recp, m, ctx) <= 0)
290 goto err;
291 }
292
293 if (!BN_nnmod(val[0], a, m, ctx))
294 goto err; /* 1 */
295 if (BN_is_zero(val[0])) {
296 BN_zero(r);
297 ret = 1;
298 goto err;
299 }
300
301 window = BN_window_bits_for_exponent_size(bits);
302 if (window > 1) {
303 if (!BN_mod_mul_reciprocal(aa, val[0], val[0], &recp, ctx))
304 goto err; /* 2 */
305 j = 1 << (window - 1);
306 for (i = 1; i < j; i++) {
307 if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
308 !BN_mod_mul_reciprocal(val[i], val[i - 1],
309 aa, &recp, ctx))
310 goto err;
311 }
312 }
313
314 start = 1; /* This is used to avoid multiplication etc
315 * when there is only the value '1' in the
316 * buffer. */
317 wvalue = 0; /* The 'value' of the window */
318 wstart = bits - 1; /* The top bit of the window */
319 wend = 0; /* The bottom bit of the window */
320
321 if (!BN_one(r))
322 goto err;
323
324 for (;;) {
325 if (BN_is_bit_set(p, wstart) == 0) {
326 if (!start)
327 if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx))
328 goto err;
329 if (wstart == 0)
330 break;
331 wstart--;
332 continue;
333 }
334 /* We now have wstart on a 'set' bit; we now need to work out
335 * how big a window to do. To do this we need to scan
336 * forward until the last set bit before the end of the
337 * window */
338 j = wstart;
339 wvalue = 1;
340 wend = 0;
341 for (i = 1; i < window; i++) {
342 if (wstart - i < 0)
343 break;
344 if (BN_is_bit_set(p, wstart - i)) {
345 wvalue <<= (i - wend);
346 wvalue |= 1;
347 wend = i;
348 }
349 }
350
351 /* wend is the size of the current window */
352 j = wend + 1;
353 /* add the 'bytes above' */
354 if (!start)
355 for (i = 0; i < j; i++) {
356 if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx))
357 goto err;
358 }
359
360 /* wvalue will be an odd number < 2^window */
361 if (!BN_mod_mul_reciprocal(r, r, val[wvalue >> 1], &recp, ctx))
362 goto err;
363
364 /* move the 'window' down further */
365 wstart -= wend + 1;
366 wvalue = 0;
367 start = 0;
368 if (wstart < 0)
369 break;
370 }
371 ret = 1;
372
373err:
374 BN_CTX_end(ctx);
375 BN_RECP_CTX_free(&recp);
376 bn_check_top(r);
377 return (ret);
378}
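
A concrete trace of the sliding-window scan above: for p = 45 = 0b101101 with window = 3, the first window captures bits 5..3 (wvalue = 0b101 = 5) and the second captures bits 2..0 (again 5), so r becomes ((a^5)^(2^3)) * a^5 = a^45: two multiplies by val[wvalue >> 1] = val[2] = a^5 plus three squarings between the windows, versus one multiply for every set bit with plain square-and-multiply.
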
379
380int
381BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
382 BN_CTX *ctx, BN_MONT_CTX *in_mont)
383{
384 int i, j, bits, ret = 0, wstart, wend, window, wvalue;
385 int start = 1;
386 BIGNUM *d, *r;
387 const BIGNUM *aa;
388 /* Table of variables obtained from 'ctx' */
389 BIGNUM *val[TABLE_SIZE];
390 BN_MONT_CTX *mont = NULL;
391
392 if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
393 return BN_mod_exp_mont_consttime(rr, a, p, m, ctx, in_mont);
394 }
395
396 bn_check_top(a);
397 bn_check_top(p);
398 bn_check_top(m);
399
400 if (!BN_is_odd(m)) {
401 BNerr(BN_F_BN_MOD_EXP_MONT, BN_R_CALLED_WITH_EVEN_MODULUS);
402 return (0);
403 }
404 bits = BN_num_bits(p);
405 if (bits == 0) {
406 ret = BN_one(rr);
407 return ret;
408 }
409
410 BN_CTX_start(ctx);
411 if ((d = BN_CTX_get(ctx)) == NULL)
412 goto err;
413 if ((r = BN_CTX_get(ctx)) == NULL)
414 goto err;
415 if ((val[0] = BN_CTX_get(ctx)) == NULL)
416 goto err;
417
418 /* If this is not done, things will break in the Montgomery
419 * part */
420
421 if (in_mont != NULL)
422 mont = in_mont;
423 else {
424 if ((mont = BN_MONT_CTX_new()) == NULL)
425 goto err;
426 if (!BN_MONT_CTX_set(mont, m, ctx))
427 goto err;
428 }
429
430 if (a->neg || BN_ucmp(a, m) >= 0) {
431 if (!BN_nnmod(val[0], a, m, ctx))
432 goto err;
433 aa = val[0];
434 } else
435 aa = a;
436 if (BN_is_zero(aa)) {
437 BN_zero(rr);
438 ret = 1;
439 goto err;
440 }
441 if (!BN_to_montgomery(val[0], aa, mont, ctx))
442 goto err; /* 1 */
443
444 window = BN_window_bits_for_exponent_size(bits);
445 if (window > 1) {
446 if (!BN_mod_mul_montgomery(d, val[0], val[0], mont, ctx))
447 goto err; /* 2 */
448 j = 1 << (window - 1);
449 for (i = 1; i < j; i++) {
450 if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
451 !BN_mod_mul_montgomery(val[i], val[i - 1],
452 d, mont, ctx))
453 goto err;
454 }
455 }
456
457 start = 1; /* This is used to avoid multiplication etc
458 * when there is only the value '1' in the
459 * buffer. */
460 wvalue = 0; /* The 'value' of the window */
461 wstart = bits - 1; /* The top bit of the window */
462 wend = 0; /* The bottom bit of the window */
463
464 if (!BN_to_montgomery(r, BN_value_one(), mont, ctx))
465 goto err;
466 for (;;) {
467 if (BN_is_bit_set(p, wstart) == 0) {
468 if (!start) {
469 if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))
470 goto err;
471 }
472 if (wstart == 0)
473 break;
474 wstart--;
475 continue;
476 }
477 /* We now have wstart on a 'set' bit; we now need to work out
478 * how big a window to do. To do this we need to scan
479 * forward until the last set bit before the end of the
480 * window */
481 j = wstart;
482 wvalue = 1;
483 wend = 0;
484 for (i = 1; i < window; i++) {
485 if (wstart - i < 0)
486 break;
487 if (BN_is_bit_set(p, wstart - i)) {
488 wvalue <<= (i - wend);
489 wvalue |= 1;
490 wend = i;
491 }
492 }
493
494 /* wend is the size of the current window */
495 j = wend + 1;
496 /* add the 'bytes above' */
497 if (!start)
498 for (i = 0; i < j; i++) {
499 if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))
500 goto err;
501 }
502
503 /* wvalue will be an odd number < 2^window */
504 if (!BN_mod_mul_montgomery(r, r, val[wvalue >> 1], mont, ctx))
505 goto err;
506
507 /* move the 'window' down further */
508 wstart -= wend + 1;
509 wvalue = 0;
510 start = 0;
511 if (wstart < 0)
512 break;
513 }
514 if (!BN_from_montgomery(rr, r, mont, ctx))
515 goto err;
516 ret = 1;
517
518err:
519 if ((in_mont == NULL) && (mont != NULL))
520 BN_MONT_CTX_free(mont);
521 BN_CTX_end(ctx);
522 bn_check_top(rr);
523 return (ret);
524}
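
For readers new to the Montgomery arithmetic used throughout: with R = 2^(top*BN_BITS2), BN_to_montgomery() maps a to a*R mod m and BN_mod_mul_montgomery() computes x*y*R^(-1) mod m, so products of Montgomery-form values stay in Montgomery form without any per-step division by m. With toy numbers m = 97 and R = 256: a = 7 maps to 7*256 mod 97 = 46, and one Montgomery multiplication of 46 by itself gives 49*256 mod 97 = 31, the Montgomery form of 7^2; BN_from_montgomery() strips the factor R at the end, as the code above does for rr.
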
525
526
527/* BN_mod_exp_mont_consttime() stores the precomputed powers in a specific layout
528 * so that accessing any of these table values shows the same access pattern as far
529 * as cache lines are concerned. The following functions are used to transfer a BIGNUM
530 * from/to that table. */
531
532static int
533MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf,
534 int idx, int width)
535{
536 size_t i, j;
537
538 if (top > b->top)
539 top = b->top; /* this works because 'buf' is explicitly zeroed */
540 for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
541 buf[j] = ((unsigned char*)b->d)[i];
542 }
543
544 return 1;
545}
546
547static int
548MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx,
549 int width)
550{
551 size_t i, j;
552
553 if (bn_wexpand(b, top) == NULL)
554 return 0;
555
556 for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
557 ((unsigned char*)b->d)[i] = buf[j];
558 }
559
560 b->top = top;
561 bn_correct_top(b);
562 return 1;
563}
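
The stride is the point of this layout: byte i of table entry idx lands at buf[idx + i * width], so reading any one entry walks across every cache line of the table and the set of touched lines does not depend on the (secret) index. A toy model of the interleaving, with made-up WIDTH and BYTES constants far smaller than the real sizes:

#include <stdio.h>

#define WIDTH 8	/* number of interleaved table entries (numPowers) */
#define BYTES 4	/* bytes per entry, kept tiny for the printout */

static void
scatter(unsigned char *buf, int idx, const unsigned char *ent)
{
	int i;
	for (i = 0; i < BYTES; i++)
		buf[idx + i * WIDTH] = ent[i];	/* same stride as COPY_TO_PREBUF */
}

static void
gather(unsigned char *ent, const unsigned char *buf, int idx)
{
	int i;
	for (i = 0; i < BYTES; i++)
		ent[i] = buf[idx + i * WIDTH];	/* same stride as COPY_FROM_PREBUF */
}

int
main(void)
{
	unsigned char buf[WIDTH * BYTES] = {0};
	unsigned char in[BYTES] = {1, 2, 3, 4}, out[BYTES];

	scatter(buf, 5, in);
	gather(out, buf, 5);
	printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);	/* 1 2 3 4 */
	return 0;
}
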
564
565/* Given a pointer value, compute the next address that is a cache line multiple. */
566#define MOD_EXP_CTIME_ALIGN(x_) \
567 ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
568
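
The macro rounds x_ up to a cache-line boundary and always advances by at least one byte: with a 64-byte line (mask 0x3f), 0x1003 maps to 0x1040, and an already-aligned 0x1000 maps to 0x1040 as well, since the bias is then a full MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH. That unconditional advance is why the buffer allocated below is padded by a whole cache-line width.
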
569/* This variant of BN_mod_exp_mont() uses fixed windows and the special
570 * precomputation memory layout to limit data-dependency to a minimum
571 * to protect secret exponents (cf. the hyper-threading timing attacks
572 * pointed out by Colin Percival,
573 * http://www.daemonology.net/hyperthreading-considered-harmful/)
574 */
575int
576BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
577 const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
578{
579 int i, bits, ret = 0, window, wvalue;
580 int top;
581 BN_MONT_CTX *mont = NULL;
582 int numPowers;
583 unsigned char *powerbufFree = NULL;
584 int powerbufLen = 0;
585 unsigned char *powerbuf = NULL;
586 BIGNUM tmp, am;
587
588 bn_check_top(a);
589 bn_check_top(p);
590 bn_check_top(m);
591
592 top = m->top;
593
594 if (!(m->d[0] & 1)) {
595 BNerr(BN_F_BN_MOD_EXP_MONT_CONSTTIME,
596 BN_R_CALLED_WITH_EVEN_MODULUS);
597 return (0);
598 }
599 bits = BN_num_bits(p);
600 if (bits == 0) {
601 ret = BN_one(rr);
602 return ret;
603 }
604
605 BN_CTX_start(ctx);
606
607 /* Allocate a Montgomery context if it was not supplied by the caller.
608 * If this is not done, things will break in the Montgomery part.
609 */
610 if (in_mont != NULL)
611 mont = in_mont;
612 else {
613 if ((mont = BN_MONT_CTX_new()) == NULL)
614 goto err;
615 if (!BN_MONT_CTX_set(mont, m, ctx))
616 goto err;
617 }
618
619 /* Get the window size to use with size of p. */
620 window = BN_window_bits_for_ctime_exponent_size(bits);
621#if defined(OPENSSL_BN_ASM_MONT5)
622 if (window == 6 && bits <= 1024)
623 window = 5; /* ~5% improvement of 2048-bit RSA sign */
624#endif
625
626 /* Allocate a buffer large enough to hold all of the pre-computed
627 * powers of am, am itself and tmp.
628 */
629 numPowers = 1 << window;
630 powerbufLen = sizeof(m->d[0]) * (top * numPowers +
631 ((2*top) > numPowers ? (2*top) : numPowers));
632 if ((powerbufFree = malloc(powerbufLen +
633 MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
634 goto err;
635
636 powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
637 memset(powerbuf, 0, powerbufLen);
638
639 /* lay down tmp and am right after powers table */
640 tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0]) * top * numPowers);
641 am.d = tmp.d + top;
642 tmp.top = am.top = 0;
643 tmp.dmax = am.dmax = top;
644 tmp.neg = am.neg = 0;
645 tmp.flags = am.flags = BN_FLG_STATIC_DATA;
646
647 /* prepare a^0 in Montgomery domain */
648#if 1
649 if (!BN_to_montgomery(&tmp, BN_value_one(), mont, ctx))
650 goto err;
651#else
652 tmp.d[0] = (0 - m->d[0]) & BN_MASK2; /* 2^(top*BN_BITS2) - m */
653 for (i = 1; i < top; i++)
654 tmp.d[i] = (~m->d[i]) & BN_MASK2;
655 tmp.top = top;
656#endif
657
658 /* prepare a^1 in Montgomery domain */
659 if (a->neg || BN_ucmp(a, m) >= 0) {
660 if (!BN_mod(&am, a, m, ctx))
661 goto err;
662 if (!BN_to_montgomery(&am, &am, mont, ctx))
663 goto err;
664 } else if (!BN_to_montgomery(&am, a, mont, ctx))
665 goto err;
666
667#if defined(OPENSSL_BN_ASM_MONT5)
668 /* This optimization uses ideas from http://eprint.iacr.org/2011/239,
669 * specifically optimization of cache-timing attack countermeasures
670 * and pre-computation optimization. */
671
672 /* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
673 * 512-bit RSA is hardly relevant, we omit it to spare size... */
674 if (window == 5 && top > 1) {
675 void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
676 const void *table, const BN_ULONG *np,
677 const BN_ULONG *n0, int num, int power);
678 void bn_scatter5(const BN_ULONG *inp, size_t num,
679 void *table, size_t power);
680 void bn_gather5(BN_ULONG *out, size_t num,
681 void *table, size_t power);
682
683 BN_ULONG *np = mont->N.d, *n0 = mont->n0;
684
685 /* BN_to_montgomery can contaminate words above .top
686 * [in BN_DEBUG[_DEBUG] build]... */
687 for (i = am.top; i < top; i++)
688 am.d[i] = 0;
689 for (i = tmp.top; i < top; i++)
690 tmp.d[i] = 0;
691
692 bn_scatter5(tmp.d, top, powerbuf, 0);
693 bn_scatter5(am.d, am.top, powerbuf, 1);
694 bn_mul_mont(tmp.d, am.d, am.d, np, n0, top);
695 bn_scatter5(tmp.d, top, powerbuf, 2);
696
697#if 0
698 for (i = 3; i < 32; i++) {
699 /* Calculate a^i = a^(i-1) * a */
700 bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np,
701 n0, top, i - 1);
702 bn_scatter5(tmp.d, top, powerbuf, i);
703 }
704#else
705 /* same as above, but uses squaring for 1/2 of operations */
706 for (i = 4; i < 32; i*=2) {
707 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
708 bn_scatter5(tmp.d, top, powerbuf, i);
709 }
710 for (i = 3; i < 8; i += 2) {
711 int j;
712 bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np,
713 n0, top, i - 1);
714 bn_scatter5(tmp.d, top, powerbuf, i);
715 for (j = 2 * i; j < 32; j *= 2) {
716 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
717 bn_scatter5(tmp.d, top, powerbuf, j);
718 }
719 }
720 for (; i < 16; i += 2) {
721 bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np,
722 n0, top, i - 1);
723 bn_scatter5(tmp.d, top, powerbuf, i);
724 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
725 bn_scatter5(tmp.d, top, powerbuf, 2*i);
726 }
727 for (; i < 32; i += 2) {
728 bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np,
729 n0, top, i - 1);
730 bn_scatter5(tmp.d, top, powerbuf, i);
731 }
732#endif
733 bits--;
734 for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--)
735 wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
736 bn_gather5(tmp.d, top, powerbuf, wvalue);
737
738 /* Scan the exponent one window at a time starting from the most
739 * significant bits.
740 */
741 while (bits >= 0) {
742 for (wvalue = 0, i = 0; i < 5; i++, bits--)
743 wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
744
745 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
746 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
747 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
748 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
749 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
750 bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
751 }
752
753 tmp.top = top;
754 bn_correct_top(&tmp);
755 } else
756#endif
757 {
758 if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0,
759 numPowers))
760 goto err;
761 if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1,
762 numPowers))
763 goto err;
764
765 /* If the window size is greater than 1, then calculate
766 * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
767 * (even powers could instead be computed as (a^(i/2))^2
768 * to use the slight performance advantage of sqr over mul).
769 */
770 if (window > 1) {
771 if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx))
772 goto err;
773 if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf,
774 2, numPowers))
775 goto err;
776 for (i = 3; i < numPowers; i++) {
777 /* Calculate a^i = a^(i-1) * a */
778 if (!BN_mod_mul_montgomery(&tmp, &am, &tmp,
779 mont, ctx))
780 goto err;
781 if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top,
782 powerbuf, i, numPowers))
783 goto err;
784 }
785 }
786
787 bits--;
788 for (wvalue = 0, i = bits % window; i >= 0; i--, bits--)
789 wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
790 if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp, top, powerbuf,
791 wvalue, numPowers))
792 goto err;
793
794 /* Scan the exponent one window at a time starting from the most
795 * significant bits.
796 */
797 while (bits >= 0) {
798 wvalue = 0; /* The 'value' of the window */
799
800 /* Scan the window, squaring the result as we go */
801 for (i = 0; i < window; i++, bits--) {
802 if (!BN_mod_mul_montgomery(&tmp, &tmp, &tmp,
803 mont, ctx))
804 goto err;
805 wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
806 }
807
808 /* Fetch the appropriate pre-computed value from the pre-buf */
809 if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf,
810 wvalue, numPowers))
811 goto err;
812
813 /* Multiply the result into the intermediate result */
814 if (!BN_mod_mul_montgomery(&tmp, &tmp, &am, mont, ctx))
815 goto err;
816 }
817 }
818
819 /* Convert the final result from Montgomery to standard format */
820 if (!BN_from_montgomery(rr, &tmp, mont, ctx))
821 goto err;
822 ret = 1;
823
824err:
825 if ((in_mont == NULL) && (mont != NULL))
826 BN_MONT_CTX_free(mont);
827 if (powerbuf != NULL) {
828 OPENSSL_cleanse(powerbuf, powerbufLen);
829 free(powerbufFree);
830 }
831 BN_CTX_end(ctx);
832 return (ret);
833}
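
The non-assembly path above scans the exponent in fixed windows: exactly 'window' squarings and then one table multiply per window, whatever the bit values, which (together with the scatter/gather table layout) is what the constant-time property rests on. An illustrative reduction of that scan to machine words; it deliberately omits Montgomery form and the constant-time table fetch, so it shows only the window arithmetic:

#include <stdint.h>
#include <stdio.h>

/* Fixed 2-bit windows over a toy modulus, mirroring the scan in
 * BN_mod_exp_mont_consttime(): W squarings, then one table multiply. */
static uint64_t
modexp_fixed_window(uint64_t a, uint32_t p, uint64_t m)
{
	enum { W = 2 };
	uint64_t tab[1 << W], r;
	uint32_t wvalue;
	int bits, i, j;

	tab[0] = 1 % m;
	for (i = 1; i < (1 << W); i++)
		tab[i] = tab[i - 1] * a % m;	/* tab[i] = a^i mod m */

	bits = 32;
	while (bits > 0 && !((p >> (bits - 1)) & 1))
		bits--;
	/* round up to whole windows; leading zero bits are harmless, r == 1 */
	bits = (bits + W - 1) / W * W;

	r = 1;
	for (i = bits - W; i >= 0; i -= W) {
		for (j = 0; j < W; j++)
			r = r * r % m;		/* W squarings */
		wvalue = (p >> i) & ((1 << W) - 1);
		r = r * tab[wvalue] % m;	/* one table multiply */
	}
	return r;
}

int
main(void)
{
	printf("%llu\n",
	    (unsigned long long)modexp_fixed_window(7, 560, 561));	/* 1 */
	return 0;
}
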
834
835int
836BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p, const BIGNUM *m,
837 BN_CTX *ctx, BN_MONT_CTX *in_mont)
838{
839 BN_MONT_CTX *mont = NULL;
840 int b, bits, ret = 0;
841 int r_is_one;
842 BN_ULONG w, next_w;
843 BIGNUM *d, *r, *t;
844 BIGNUM *swap_tmp;
845
846#define BN_MOD_MUL_WORD(r, w, m) \
847 (BN_mul_word(r, (w)) && \
848 (/* BN_ucmp(r, (m)) < 0 ? 1 :*/ \
849 (BN_mod(t, r, m, ctx) && (swap_tmp = r, r = t, t = swap_tmp, 1))))
850 /* BN_MOD_MUL_WORD is only used with 'w' large,
851 * so the BN_ucmp test is probably more overhead
852 * than always using BN_mod (which uses BN_copy if
853 * a similar test returns true). */
854 /* We can use BN_mod and do not need BN_nnmod because our
855 * accumulator is never negative (the result of BN_mod does
856 * not depend on the sign of the modulus).
857 */
858#define BN_TO_MONTGOMERY_WORD(r, w, mont) \
859 (BN_set_word(r, (w)) && BN_to_montgomery(r, r, (mont), ctx))
860
861 if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
862 /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
863 BNerr(BN_F_BN_MOD_EXP_MONT_WORD,
864 ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
865 return -1;
866 }
867
868 bn_check_top(p);
869 bn_check_top(m);
870
871 if (!BN_is_odd(m)) {
872 BNerr(BN_F_BN_MOD_EXP_MONT_WORD, BN_R_CALLED_WITH_EVEN_MODULUS);
873 return (0);
874 }
875 if (m->top == 1)
876 a %= m->d[0]; /* make sure that 'a' is reduced */
877
878 bits = BN_num_bits(p);
879 if (bits == 0) {
880 ret = BN_one(rr);
881 return ret;
882 }
883 if (a == 0) {
884 BN_zero(rr);
885 ret = 1;
886 return ret;
887 }
888
889 BN_CTX_start(ctx);
890 if ((d = BN_CTX_get(ctx)) == NULL)
891 goto err;
892 if ((r = BN_CTX_get(ctx)) == NULL)
893 goto err;
894 if ((t = BN_CTX_get(ctx)) == NULL)
895 goto err;
896
897 if (in_mont != NULL)
898 mont = in_mont;
899 else {
900 if ((mont = BN_MONT_CTX_new()) == NULL)
901 goto err;
902 if (!BN_MONT_CTX_set(mont, m, ctx))
903 goto err;
904 }
905
906 r_is_one = 1; /* except for Montgomery factor */
907
908 /* bits-1 >= 0 */
909
910 /* The result is accumulated in the product r*w. */
911 w = a; /* bit 'bits-1' of 'p' is always set */
912 for (b = bits - 2; b >= 0; b--) {
913 /* First, square r*w. */
914 next_w = w * w;
915 if ((next_w / w) != w) /* overflow */
916 {
917 if (r_is_one) {
918 if (!BN_TO_MONTGOMERY_WORD(r, w, mont))
919 goto err;
920 r_is_one = 0;
921 } else {
922 if (!BN_MOD_MUL_WORD(r, w, m))
923 goto err;
924 }
925 next_w = 1;
926 }
927 w = next_w;
928 if (!r_is_one) {
929 if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))
930 goto err;
931 }
932
933 /* Second, multiply r*w by 'a' if exponent bit is set. */
934 if (BN_is_bit_set(p, b)) {
935 next_w = w * a;
936 if ((next_w / a) != w) /* overflow */
937 {
938 if (r_is_one) {
939 if (!BN_TO_MONTGOMERY_WORD(r, w, mont))
940 goto err;
941 r_is_one = 0;
942 } else {
943 if (!BN_MOD_MUL_WORD(r, w, m))
944 goto err;
945 }
946 next_w = a;
947 }
948 w = next_w;
949 }
950 }
951
952 /* Finally, set r:=r*w. */
953 if (w != 1) {
954 if (r_is_one) {
955 if (!BN_TO_MONTGOMERY_WORD(r, w, mont))
956 goto err;
957 r_is_one = 0;
958 } else {
959 if (!BN_MOD_MUL_WORD(r, w, m))
960 goto err;
961 }
962 }
963
964 if (r_is_one) /* can happen only if a == 1 */
965 {
966 if (!BN_one(rr))
967 goto err;
968 } else {
969 if (!BN_from_montgomery(rr, r, mont, ctx))
970 goto err;
971 }
972 ret = 1;
973
974err:
975 if ((in_mont == NULL) && (mont != NULL))
976 BN_MONT_CTX_free(mont);
977 BN_CTX_end(ctx);
978 bn_check_top(rr);
979 return (ret);
980}
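
The word-based loop keeps the result factored as r * w (mod m), where w is a single machine word: squaring the accumulator squares both factors, and w^2 is folded into the BIGNUM r only when it would no longer fit in a word, detected by the (next_w / w) != w test. For example, on a 64-bit word, w = 2^33 overflows on the first squaring: w * w wraps to 0 and 0 / 2^33 = 0 != 2^33, so w is absorbed into r and reset to 1.
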
981
982
983/* The old fallback, simple version :-) */
984int
985BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
986 BN_CTX *ctx)
987{
988 int i, j, bits, ret = 0, wstart, wend, window, wvalue;
989 int start = 1;
990 BIGNUM *d;
991 /* Table of variables obtained from 'ctx' */
992 BIGNUM *val[TABLE_SIZE];
993
994 if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
995 /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
996 BNerr(BN_F_BN_MOD_EXP_SIMPLE,
997 ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
998 return -1;
999 }
1000
1001 bits = BN_num_bits(p);
1002
1003 if (bits == 0) {
1004 ret = BN_one(r);
1005 return ret;
1006 }
1007
1008 BN_CTX_start(ctx);
1009 if ((d = BN_CTX_get(ctx)) == NULL)
1010 goto err;
1011 if ((val[0] = BN_CTX_get(ctx)) == NULL)
1012 goto err;
1013
1014 if (!BN_nnmod(val[0], a, m, ctx))
1015 goto err; /* 1 */
1016 if (BN_is_zero(val[0])) {
1017 BN_zero(r);
1018 ret = 1;
1019 goto err;
1020 }
1021
1022 window = BN_window_bits_for_exponent_size(bits);
1023 if (window > 1) {
1024 if (!BN_mod_mul(d, val[0], val[0], m, ctx))
1025 goto err; /* 2 */
1026 j = 1 << (window - 1);
1027 for (i = 1; i < j; i++) {
1028 if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
1029 !BN_mod_mul(val[i], val[i - 1], d, m, ctx))
1030 goto err;
1031 }
1032 }
1033
1034 start = 1; /* This is used to avoid multiplication etc
1035 * when there is only the value '1' in the
1036 * buffer. */
1037 wvalue = 0; /* The 'value' of the window */
1038 wstart = bits - 1; /* The top bit of the window */
1039 wend = 0; /* The bottom bit of the window */
1040
1041 if (!BN_one(r))
1042 goto err;
1043
1044 for (;;) {
1045 if (BN_is_bit_set(p, wstart) == 0) {
1046 if (!start)
1047 if (!BN_mod_mul(r, r, r, m, ctx))
1048 goto err;
1049 if (wstart == 0)
1050 break;
1051 wstart--;
1052 continue;
1053 }
1054 /* We now have wstart on a 'set' bit; we now need to work out
1055 * how big a window to do. To do this we need to scan
1056 * forward until the last set bit before the end of the
1057 * window */
1058 j = wstart;
1059 wvalue = 1;
1060 wend = 0;
1061 for (i = 1; i < window; i++) {
1062 if (wstart - i < 0)
1063 break;
1064 if (BN_is_bit_set(p, wstart - i)) {
1065 wvalue <<= (i - wend);
1066 wvalue |= 1;
1067 wend = i;
1068 }
1069 }
1070
1071 /* wend is the size of the current window */
1072 j = wend + 1;
1073 /* add the 'bytes above' */
1074 if (!start)
1075 for (i = 0; i < j; i++) {
1076 if (!BN_mod_mul(r, r, r, m, ctx))
1077 goto err;
1078 }
1079
1080 /* wvalue will be an odd number < 2^window */
1081 if (!BN_mod_mul(r, r, val[wvalue >> 1], m, ctx))
1082 goto err;
1083
1084 /* move the 'window' down further */
1085 wstart -= wend + 1;
1086 wvalue = 0;
1087 start = 0;
1088 if (wstart < 0)
1089 break;
1090 }
1091 ret = 1;
1092
1093err:
1094 BN_CTX_end(ctx);
1095 bn_check_top(r);
1096 return (ret);
1097}
diff --git a/src/lib/libcrypto/bn/bn_exp2.c b/src/lib/libcrypto/bn/bn_exp2.c
deleted file mode 100644
index 38bf467a38..0000000000
--- a/src/lib/libcrypto/bn/bn_exp2.c
+++ /dev/null
@@ -1,308 +0,0 @@
1/* $OpenBSD: bn_exp2.c,v 1.10 2015/02/09 15:49:22 jsing Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscape's SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are adhered to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the routines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publicly available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58/* ====================================================================
59 * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
60 *
61 * Redistribution and use in source and binary forms, with or without
62 * modification, are permitted provided that the following conditions
63 * are met:
64 *
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 *
68 * 2. Redistributions in binary form must reproduce the above copyright
69 * notice, this list of conditions and the following disclaimer in
70 * the documentation and/or other materials provided with the
71 * distribution.
72 *
73 * 3. All advertising materials mentioning features or use of this
74 * software must display the following acknowledgment:
75 * "This product includes software developed by the OpenSSL Project
76 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
77 *
78 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
79 * endorse or promote products derived from this software without
80 * prior written permission. For written permission, please contact
81 * openssl-core@openssl.org.
82 *
83 * 5. Products derived from this software may not be called "OpenSSL"
84 * nor may "OpenSSL" appear in their names without prior written
85 * permission of the OpenSSL Project.
86 *
87 * 6. Redistributions of any form whatsoever must retain the following
88 * acknowledgment:
89 * "This product includes software developed by the OpenSSL Project
90 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
91 *
92 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
93 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
95 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
96 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
97 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
98 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
99 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
101 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
103 * OF THE POSSIBILITY OF SUCH DAMAGE.
104 * ====================================================================
105 *
106 * This product includes cryptographic software written by Eric Young
107 * (eay@cryptsoft.com). This product includes software written by Tim
108 * Hudson (tjh@cryptsoft.com).
109 *
110 */
111
112#include <stdio.h>
113
114#include <openssl/err.h>
115
116#include "bn_lcl.h"
117
118#define TABLE_SIZE 32
119
120int
121BN_mod_exp2_mont(BIGNUM *rr, const BIGNUM *a1, const BIGNUM *p1,
122 const BIGNUM *a2, const BIGNUM *p2, const BIGNUM *m, BN_CTX *ctx,
123 BN_MONT_CTX *in_mont)
124{
125 int i, j, bits, b, bits1, bits2, ret = 0, wpos1, wpos2, window1, window2, wvalue1, wvalue2;
126 int r_is_one = 1;
127 BIGNUM *d, *r;
128 const BIGNUM *a_mod_m;
129 /* Tables of variables obtained from 'ctx' */
130 BIGNUM *val1[TABLE_SIZE], *val2[TABLE_SIZE];
131 BN_MONT_CTX *mont = NULL;
132
133 bn_check_top(a1);
134 bn_check_top(p1);
135 bn_check_top(a2);
136 bn_check_top(p2);
137 bn_check_top(m);
138
139 if (!(m->d[0] & 1)) {
140 BNerr(BN_F_BN_MOD_EXP2_MONT, BN_R_CALLED_WITH_EVEN_MODULUS);
141 return (0);
142 }
143 bits1 = BN_num_bits(p1);
144 bits2 = BN_num_bits(p2);
145 if ((bits1 == 0) && (bits2 == 0)) {
146 ret = BN_one(rr);
147 return ret;
148 }
149
150 bits = (bits1 > bits2) ? bits1 : bits2;
151
152 BN_CTX_start(ctx);
153 if ((d = BN_CTX_get(ctx)) == NULL)
154 goto err;
155 if ((r = BN_CTX_get(ctx)) == NULL)
156 goto err;
157 if ((val1[0] = BN_CTX_get(ctx)) == NULL)
158 goto err;
159 if ((val2[0] = BN_CTX_get(ctx)) == NULL)
160 goto err;
161
162 if (in_mont != NULL)
163 mont = in_mont;
164 else {
165 if ((mont = BN_MONT_CTX_new()) == NULL)
166 goto err;
167 if (!BN_MONT_CTX_set(mont, m, ctx))
168 goto err;
169 }
170
171 window1 = BN_window_bits_for_exponent_size(bits1);
172 window2 = BN_window_bits_for_exponent_size(bits2);
173
174 /*
175 * Build table for a1: val1[i] := a1^(2*i + 1) mod m for i = 0 .. 2^(window1-1)
176 */
177 if (a1->neg || BN_ucmp(a1, m) >= 0) {
178 if (!BN_mod(val1[0], a1, m, ctx))
179 goto err;
180 a_mod_m = val1[0];
181 } else
182 a_mod_m = a1;
183 if (BN_is_zero(a_mod_m)) {
184 BN_zero(rr);
185 ret = 1;
186 goto err;
187 }
188
189 if (!BN_to_montgomery(val1[0], a_mod_m, mont, ctx))
190 goto err;
191 if (window1 > 1) {
192 if (!BN_mod_mul_montgomery(d, val1[0], val1[0], mont, ctx))
193 goto err;
194
195 j = 1 << (window1 - 1);
196 for (i = 1; i < j; i++) {
197 if (((val1[i] = BN_CTX_get(ctx)) == NULL) ||
198 !BN_mod_mul_montgomery(val1[i], val1[i - 1],
199 d, mont, ctx))
200 goto err;
201 }
202 }
203
204
205 /*
206 * Build table for a2: val2[i] := a2^(2*i + 1) mod m for i = 0 .. 2^(window2-1)
207 */
208 if (a2->neg || BN_ucmp(a2, m) >= 0) {
209 if (!BN_mod(val2[0], a2, m, ctx))
210 goto err;
211 a_mod_m = val2[0];
212 } else
213 a_mod_m = a2;
214 if (BN_is_zero(a_mod_m)) {
215 BN_zero(rr);
216 ret = 1;
217 goto err;
218 }
219 if (!BN_to_montgomery(val2[0], a_mod_m, mont, ctx))
220 goto err;
221 if (window2 > 1) {
222 if (!BN_mod_mul_montgomery(d, val2[0], val2[0], mont, ctx))
223 goto err;
224
225 j = 1 << (window2 - 1);
226 for (i = 1; i < j; i++) {
227 if (((val2[i] = BN_CTX_get(ctx)) == NULL) ||
228 !BN_mod_mul_montgomery(val2[i], val2[i - 1],
229 d, mont, ctx))
230 goto err;
231 }
232 }
233
234
235 /* Now compute the power product, using independent windows. */
236 r_is_one = 1;
237 wvalue1 = 0; /* The 'value' of the first window */
238 wvalue2 = 0; /* The 'value' of the second window */
239 wpos1 = 0; /* If wvalue1 > 0, the bottom bit of the first window */
240 wpos2 = 0; /* If wvalue2 > 0, the bottom bit of the second window */
241
242 if (!BN_to_montgomery(r, BN_value_one(), mont, ctx))
243 goto err;
244 for (b = bits - 1; b >= 0; b--) {
245 if (!r_is_one) {
246 if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))
247 goto err;
248 }
249
250 if (!wvalue1)
251 if (BN_is_bit_set(p1, b)) {
252 /* consider bits b-window1+1 .. b for this window */
253 i = b - window1 + 1;
254 while (!BN_is_bit_set(p1, i)) /* works for i<0 */
255 i++;
256 wpos1 = i;
257 wvalue1 = 1;
258 for (i = b - 1; i >= wpos1; i--) {
259 wvalue1 <<= 1;
260 if (BN_is_bit_set(p1, i))
261 wvalue1++;
262 }
263 }
264
265 if (!wvalue2)
266 if (BN_is_bit_set(p2, b)) {
267 /* consider bits b-window2+1 .. b for this window */
268 i = b - window2 + 1;
269 while (!BN_is_bit_set(p2, i))
270 i++;
271 wpos2 = i;
272 wvalue2 = 1;
273 for (i = b - 1; i >= wpos2; i--) {
274 wvalue2 <<= 1;
275 if (BN_is_bit_set(p2, i))
276 wvalue2++;
277 }
278 }
279
280 if (wvalue1 && b == wpos1) {
281 /* wvalue1 is odd and < 2^window1 */
282 if (!BN_mod_mul_montgomery(r, r, val1[wvalue1 >> 1],
283 mont, ctx))
284 goto err;
285 wvalue1 = 0;
286 r_is_one = 0;
287 }
288
289 if (wvalue2 && b == wpos2) {
290 /* wvalue2 is odd and < 2^window2 */
291 if (!BN_mod_mul_montgomery(r, r, val2[wvalue2 >> 1],
292 mont, ctx))
293 goto err;
294 wvalue2 = 0;
295 r_is_one = 0;
296 }
297 }
298 if (!BN_from_montgomery(rr, r, mont, ctx))
299 goto err;
300 ret = 1;
301
302err:
303 if ((in_mont == NULL) && (mont != NULL))
304 BN_MONT_CTX_free(mont);
305 BN_CTX_end(ctx);
306 bn_check_top(rr);
307 return (ret);
308}
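
The structural win of BN_mod_exp2_mont() is the shared squaring chain: a1^p1 * a2^p2 costs one squaring per exponent bit instead of two. A word-sized illustration using 1-bit windows (the function above generalizes this to independent sliding windows and Montgomery arithmetic); inputs are assumed already reduced mod m:

#include <stdint.h>
#include <stdio.h>

/* Toy simultaneous exponentiation: one squaring serves both exponents. */
static uint64_t
modexp2(uint64_t a1, uint32_t p1, uint64_t a2, uint32_t p2, uint64_t m)
{
	uint64_t r = 1, a12 = a1 * a2 % m;	/* precomputed joint factor */
	int b, bits = 32;

	while (bits > 0 && !(((p1 | p2) >> (bits - 1)) & 1))
		bits--;
	for (b = bits - 1; b >= 0; b--) {
		r = r * r % m;			/* shared squaring */
		if (((p1 >> b) & 1) && ((p2 >> b) & 1))
			r = r * a12 % m;
		else if ((p1 >> b) & 1)
			r = r * a1 % m;
		else if ((p2 >> b) & 1)
			r = r * a2 % m;
	}
	return r;
}

int
main(void)
{
	/* 3^5 * 4^7 mod 97 = 243 * 16384 mod 97 */
	printf("%llu\n", (unsigned long long)modexp2(3, 5, 4, 7, 97));	/* 44 */
	return 0;
}
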
diff --git a/src/lib/libcrypto/bn/bn_gcd.c b/src/lib/libcrypto/bn/bn_gcd.c
deleted file mode 100644
index da9c29a8e5..0000000000
--- a/src/lib/libcrypto/bn/bn_gcd.c
+++ /dev/null
@@ -1,688 +0,0 @@
1/* $OpenBSD: bn_gcd.c,v 1.10 2015/02/09 15:49:22 jsing Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscape's SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are adhered to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the routines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publicly available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58/* ====================================================================
59 * Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved.
60 *
61 * Redistribution and use in source and binary forms, with or without
62 * modification, are permitted provided that the following conditions
63 * are met:
64 *
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 *
68 * 2. Redistributions in binary form must reproduce the above copyright
69 * notice, this list of conditions and the following disclaimer in
70 * the documentation and/or other materials provided with the
71 * distribution.
72 *
73 * 3. All advertising materials mentioning features or use of this
74 * software must display the following acknowledgment:
75 * "This product includes software developed by the OpenSSL Project
76 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
77 *
78 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
79 * endorse or promote products derived from this software without
80 * prior written permission. For written permission, please contact
81 * openssl-core@openssl.org.
82 *
83 * 5. Products derived from this software may not be called "OpenSSL"
84 * nor may "OpenSSL" appear in their names without prior written
85 * permission of the OpenSSL Project.
86 *
87 * 6. Redistributions of any form whatsoever must retain the following
88 * acknowledgment:
89 * "This product includes software developed by the OpenSSL Project
90 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
91 *
92 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
93 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
95 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
96 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
97 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
98 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
99 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
101 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
103 * OF THE POSSIBILITY OF SUCH DAMAGE.
104 * ====================================================================
105 *
106 * This product includes cryptographic software written by Eric Young
107 * (eay@cryptsoft.com). This product includes software written by Tim
108 * Hudson (tjh@cryptsoft.com).
109 *
110 */
111
112#include <openssl/err.h>
113
114#include "bn_lcl.h"
115
116static BIGNUM *euclid(BIGNUM *a, BIGNUM *b);
117
118int
119BN_gcd(BIGNUM *r, const BIGNUM *in_a, const BIGNUM *in_b, BN_CTX *ctx)
120{
121 BIGNUM *a, *b, *t;
122 int ret = 0;
123
124 bn_check_top(in_a);
125 bn_check_top(in_b);
126
127 BN_CTX_start(ctx);
128 if ((a = BN_CTX_get(ctx)) == NULL)
129 goto err;
130 if ((b = BN_CTX_get(ctx)) == NULL)
131 goto err;
132
133 if (BN_copy(a, in_a) == NULL)
134 goto err;
135 if (BN_copy(b, in_b) == NULL)
136 goto err;
137 a->neg = 0;
138 b->neg = 0;
139
140 if (BN_cmp(a, b) < 0) {
141 t = a;
142 a = b;
143 b = t;
144 }
145 t = euclid(a, b);
146 if (t == NULL)
147 goto err;
148
149 if (BN_copy(r, t) == NULL)
150 goto err;
151 ret = 1;
152
153err:
154 BN_CTX_end(ctx);
155 bn_check_top(r);
156 return (ret);
157}
158
159static BIGNUM *
160euclid(BIGNUM *a, BIGNUM *b)
161{
162 BIGNUM *t;
163 int shifts = 0;
164
165 bn_check_top(a);
166 bn_check_top(b);
167
168 /* 0 <= b <= a */
169 while (!BN_is_zero(b)) {
170 /* 0 < b <= a */
171
172 if (BN_is_odd(a)) {
173 if (BN_is_odd(b)) {
174 if (!BN_sub(a, a, b))
175 goto err;
176 if (!BN_rshift1(a, a))
177 goto err;
178 if (BN_cmp(a, b) < 0) {
179 t = a;
180 a = b;
181 b = t;
182 }
183 }
184 else /* a odd - b even */
185 {
186 if (!BN_rshift1(b, b))
187 goto err;
188 if (BN_cmp(a, b) < 0) {
189 t = a;
190 a = b;
191 b = t;
192 }
193 }
194 }
195 else /* a is even */
196 {
197 if (BN_is_odd(b)) {
198 if (!BN_rshift1(a, a))
199 goto err;
200 if (BN_cmp(a, b) < 0) {
201 t = a;
202 a = b;
203 b = t;
204 }
205 }
206 else /* a even - b even */
207 {
208 if (!BN_rshift1(a, a))
209 goto err;
210 if (!BN_rshift1(b, b))
211 goto err;
212 shifts++;
213 }
214 }
215 /* 0 <= b <= a */
216 }
217
218 if (shifts) {
219 if (!BN_lshift(a, a, shifts))
220 goto err;
221 }
222 bn_check_top(a);
223 return (a);
224
225err:
226 return (NULL);
227}
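
euclid() is the binary GCD: common factors of two are counted in 'shifts' and restored by the final left shift, a lone even side just drops its factor of two, and odd-odd pairs reduce via (a - b)/2. A minimal machine-word sketch of the same case analysis:

#include <stdio.h>

/* Illustrative binary GCD mirroring euclid() above (machine words only). */
static unsigned long
binary_gcd(unsigned long a, unsigned long b)
{
	int shifts = 0;

	if (a < b) { unsigned long t = a; a = b; b = t; }
	while (b != 0) {
		if ((a & 1) && (b & 1))
			a = (a - b) >> 1;		/* both odd */
		else if (a & 1)
			b >>= 1;			/* a odd, b even */
		else if (b & 1)
			a >>= 1;			/* a even, b odd */
		else {
			a >>= 1; b >>= 1; shifts++;	/* both even */
		}
		if (a < b) { unsigned long t = a; a = b; b = t; }
	}
	return a << shifts;	/* restore the shared power of two */
}

int
main(void)
{
	printf("%lu\n", binary_gcd(48, 18));	/* 6 */
	return 0;
}
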
228
229
230/* solves ax == 1 (mod n) */
231static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in, const BIGNUM *a,
232 const BIGNUM *n, BN_CTX *ctx);
233
234BIGNUM *
235BN_mod_inverse(BIGNUM *in, const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx)
236{
237 BIGNUM *A, *B, *X, *Y, *M, *D, *T, *R = NULL;
238 BIGNUM *ret = NULL;
239 int sign;
240
241 if ((BN_get_flags(a, BN_FLG_CONSTTIME) != 0) ||
242 (BN_get_flags(n, BN_FLG_CONSTTIME) != 0)) {
243 return BN_mod_inverse_no_branch(in, a, n, ctx);
244 }
245
246 bn_check_top(a);
247 bn_check_top(n);
248
249 BN_CTX_start(ctx);
250 if ((A = BN_CTX_get(ctx)) == NULL)
251 goto err;
252 if ((B = BN_CTX_get(ctx)) == NULL)
253 goto err;
254 if ((X = BN_CTX_get(ctx)) == NULL)
255 goto err;
256 if ((D = BN_CTX_get(ctx)) == NULL)
257 goto err;
258 if ((M = BN_CTX_get(ctx)) == NULL)
259 goto err;
260 if ((Y = BN_CTX_get(ctx)) == NULL)
261 goto err;
262 if ((T = BN_CTX_get(ctx)) == NULL)
263 goto err;
264
265 if (in == NULL)
266 R = BN_new();
267 else
268 R = in;
269 if (R == NULL)
270 goto err;
271
272 BN_one(X);
273 BN_zero(Y);
274 if (BN_copy(B, a) == NULL)
275 goto err;
276 if (BN_copy(A, n) == NULL)
277 goto err;
278 A->neg = 0;
279 if (B->neg || (BN_ucmp(B, A) >= 0)) {
280 if (!BN_nnmod(B, B, A, ctx))
281 goto err;
282 }
283 sign = -1;
284 /* From B = a mod |n|, A = |n| it follows that
285 *
286 * 0 <= B < A,
287 * -sign*X*a == B (mod |n|),
288 * sign*Y*a == A (mod |n|).
289 */
290
291 if (BN_is_odd(n) && (BN_num_bits(n) <= (BN_BITS <= 32 ? 450 : 2048))) {
292 /* Binary inversion algorithm; requires odd modulus.
293 * This is faster than the general algorithm if the modulus
294 * is sufficiently small (about 400 .. 500 bits on 32-bit
295 * sytems, but much more on 64-bit systems) */
296 int shift;
297
298 while (!BN_is_zero(B)) {
299 /*
300 * 0 < B < |n|,
301 * 0 < A <= |n|,
302 * (1) -sign*X*a == B (mod |n|),
303 * (2) sign*Y*a == A (mod |n|)
304 */
305
306 /* Now divide B by the maximum possible power of two in the integers,
307 * and divide X by the same value mod |n|.
308 * When we're done, (1) still holds. */
309 shift = 0;
310 while (!BN_is_bit_set(B, shift)) /* note that 0 < B */
311 {
312 shift++;
313
314 if (BN_is_odd(X)) {
315 if (!BN_uadd(X, X, n))
316 goto err;
317 }
318 /* now X is even, so we can easily divide it by two */
319 if (!BN_rshift1(X, X))
320 goto err;
321 }
322 if (shift > 0) {
323 if (!BN_rshift(B, B, shift))
324 goto err;
325 }
326
327
328 /* Same for A and Y. Afterwards, (2) still holds. */
329 shift = 0;
330 while (!BN_is_bit_set(A, shift)) /* note that 0 < A */
331 {
332 shift++;
333
334 if (BN_is_odd(Y)) {
335 if (!BN_uadd(Y, Y, n))
336 goto err;
337 }
338 /* now Y is even */
339 if (!BN_rshift1(Y, Y))
340 goto err;
341 }
342 if (shift > 0) {
343 if (!BN_rshift(A, A, shift))
344 goto err;
345 }
346
347
348 /* We still have (1) and (2).
349 * Both A and B are odd.
350 * The following computations ensure that
351 *
352 * 0 <= B < |n|,
353 * 0 < A < |n|,
354 * (1) -sign*X*a == B (mod |n|),
355 * (2) sign*Y*a == A (mod |n|),
356 *
357 * and that either A or B is even in the next iteration.
358 */
359 if (BN_ucmp(B, A) >= 0) {
360 /* -sign*(X + Y)*a == B - A (mod |n|) */
361 if (!BN_uadd(X, X, Y))
362 goto err;
363 /* NB: we could use BN_mod_add_quick(X, X, Y, n), but that
364 * actually makes the algorithm slower */
365 if (!BN_usub(B, B, A))
366 goto err;
367 } else {
368 /* sign*(X + Y)*a == A - B (mod |n|) */
369 if (!BN_uadd(Y, Y, X))
370 goto err;
371 /* as above, BN_mod_add_quick(Y, Y, X, n) would slow things down */
372 if (!BN_usub(A, A, B))
373 goto err;
374 }
375 }
376 } else {
377 /* general inversion algorithm */
378
379 while (!BN_is_zero(B)) {
380 BIGNUM *tmp;
381
382 /*
383 * 0 < B < A,
384 * (*) -sign*X*a == B (mod |n|),
385 * sign*Y*a == A (mod |n|)
386 */
387
388 /* (D, M) := (A/B, A%B) ... */
389 if (BN_num_bits(A) == BN_num_bits(B)) {
390 if (!BN_one(D))
391 goto err;
392 if (!BN_sub(M, A, B))
393 goto err;
394 } else if (BN_num_bits(A) == BN_num_bits(B) + 1) {
395 /* A/B is 1, 2, or 3 */
396 if (!BN_lshift1(T, B))
397 goto err;
398 if (BN_ucmp(A, T) < 0) {
399 /* A < 2*B, so D=1 */
400 if (!BN_one(D))
401 goto err;
402 if (!BN_sub(M, A, B))
403 goto err;
404 } else {
405 /* A >= 2*B, so D=2 or D=3 */
406 if (!BN_sub(M, A, T))
407 goto err;
408 if (!BN_add(D, T, B)) goto err; /* use D (:= 3*B) as temp */
409 if (BN_ucmp(A, D) < 0) {
410 /* A < 3*B, so D=2 */
411 if (!BN_set_word(D, 2))
412 goto err;
413 /* M (= A - 2*B) already has the correct value */
414 } else {
415 /* only D=3 remains */
416 if (!BN_set_word(D, 3))
417 goto err;
418 /* currently M = A - 2*B, but we need M = A - 3*B */
419 if (!BN_sub(M, M, B))
420 goto err;
421 }
422 }
423 } else {
424 if (!BN_div(D, M, A, B, ctx))
425 goto err;
426 }
427
428 /* Now
429 * A = D*B + M;
430 * thus we have
431 * (**) sign*Y*a == D*B + M (mod |n|).
432 */
433 tmp = A; /* keep the BIGNUM object, the value does not matter */
434
435 /* (A, B) := (B, A mod B) ... */
436 A = B;
437 B = M;
438 /* ... so we have 0 <= B < A again */
439
440 /* Since the former M is now B and the former B is now A,
441 * (**) translates into
442 * sign*Y*a == D*A + B (mod |n|),
443 * i.e.
444 * sign*Y*a - D*A == B (mod |n|).
445 * Similarly, (*) translates into
446 * -sign*X*a == A (mod |n|).
447 *
448 * Thus,
449 * sign*Y*a + D*sign*X*a == B (mod |n|),
450 * i.e.
451 * sign*(Y + D*X)*a == B (mod |n|).
452 *
453 * So if we set (X, Y, sign) := (Y + D*X, X, -sign), we arrive back at
454 * -sign*X*a == B (mod |n|),
455 * sign*Y*a == A (mod |n|).
456 * Note that X and Y stay non-negative all the time.
457 */
458
459 /* most of the time D is very small, so we can optimize tmp := D*X+Y */
460 if (BN_is_one(D)) {
461 if (!BN_add(tmp, X, Y))
462 goto err;
463 } else {
464 if (BN_is_word(D, 2)) {
465 if (!BN_lshift1(tmp, X))
466 goto err;
467 } else if (BN_is_word(D, 4)) {
468 if (!BN_lshift(tmp, X, 2))
469 goto err;
470 } else if (D->top == 1) {
471 if (!BN_copy(tmp, X))
472 goto err;
473 if (!BN_mul_word(tmp, D->d[0]))
474 goto err;
475 } else {
476 if (!BN_mul(tmp, D, X, ctx))
477 goto err;
478 }
479 if (!BN_add(tmp, tmp, Y))
480 goto err;
481 }
482
483 M = Y; /* keep the BIGNUM object, the value does not matter */
484 Y = X;
485 X = tmp;
486 sign = -sign;
487 }
488 }
489
490 /*
491 * The while loop (Euclid's algorithm) ends when
492 * A == gcd(a,n);
493 * we have
494 * sign*Y*a == A (mod |n|),
495 * where Y is non-negative.
496 */
497
498 if (sign < 0) {
499 if (!BN_sub(Y, n, Y))
500 goto err;
501 }
502 /* Now Y*a == A (mod |n|). */
503
504 if (BN_is_one(A)) {
505 /* Y*a == 1 (mod |n|) */
506 if (!Y->neg && BN_ucmp(Y, n) < 0) {
507 if (!BN_copy(R, Y))
508 goto err;
509 } else {
510 if (!BN_nnmod(R, Y, n, ctx))
511 goto err;
512 }
513 } else {
514 BNerr(BN_F_BN_MOD_INVERSE, BN_R_NO_INVERSE);
515 goto err;
516 }
517 ret = R;
518
519err:
520 if ((ret == NULL) && (in == NULL))
521 BN_free(R);
522 BN_CTX_end(ctx);
523 bn_check_top(ret);
524 return (ret);
525}
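
A compact machine-word trace of the general branch's bookkeeping, i.e. the (X, Y, sign) update with X and Y kept non-negative and the final sign fix-up; purely illustrative, assuming a > 0 and inputs small enough that D * X never overflows:

#include <stdio.h>

/* Toy extended Euclid following the update scheme in BN_mod_inverse(). */
static long
mod_inverse(long a, long n)
{
	long A = n, B = a % n, X = 1, Y = 0, D, M, tmp;
	int sign = -1;

	/* Invariants: -sign*X*a == B (mod n), sign*Y*a == A (mod n). */
	while (B != 0) {
		D = A / B;
		M = A % B;
		A = B;		/* (A, B) := (B, A mod B) */
		B = M;
		tmp = D * X + Y;	/* the "tmp := D*X + Y" step */
		Y = X;
		X = tmp;
		sign = -sign;
	}
	if (A != 1)
		return -1;		/* no inverse: gcd(a, n) != 1 */
	if (sign < 0)
		Y = n - Y;		/* fix the sign, as the code above does */
	return Y % n;
}

int
main(void)
{
	printf("%ld\n", mod_inverse(7, 40));	/* 23: 7*23 == 161 == 4*40 + 1 */
	return 0;
}
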
526
527
528/* BN_mod_inverse_no_branch is a special version of BN_mod_inverse.
529 * It does not contain branches that may leak sensitive information.
530 */
531static BIGNUM *
532BN_mod_inverse_no_branch(BIGNUM *in, const BIGNUM *a, const BIGNUM *n,
533 BN_CTX *ctx)
534{
535 BIGNUM *A, *B, *X, *Y, *M, *D, *T, *R = NULL;
536 BIGNUM local_A, local_B;
537 BIGNUM *pA, *pB;
538 BIGNUM *ret = NULL;
539 int sign;
540
541 bn_check_top(a);
542 bn_check_top(n);
543
544 BN_CTX_start(ctx);
545 if ((A = BN_CTX_get(ctx)) == NULL)
546 goto err;
547 if ((B = BN_CTX_get(ctx)) == NULL)
548 goto err;
549 if ((X = BN_CTX_get(ctx)) == NULL)
550 goto err;
551 if ((D = BN_CTX_get(ctx)) == NULL)
552 goto err;
553 if ((M = BN_CTX_get(ctx)) == NULL)
554 goto err;
555 if ((Y = BN_CTX_get(ctx)) == NULL)
556 goto err;
557 if ((T = BN_CTX_get(ctx)) == NULL)
558 goto err;
559
560 if (in == NULL)
561 R = BN_new();
562 else
563 R = in;
564 if (R == NULL)
565 goto err;
566
567 BN_one(X);
568 BN_zero(Y);
569 if (BN_copy(B, a) == NULL)
570 goto err;
571 if (BN_copy(A, n) == NULL)
572 goto err;
573 A->neg = 0;
574
575 if (B->neg || (BN_ucmp(B, A) >= 0)) {
576 /* Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked,
577 * BN_div_no_branch will be called eventually.
578 */
579 pB = &local_B;
580 BN_with_flags(pB, B, BN_FLG_CONSTTIME);
581 if (!BN_nnmod(B, pB, A, ctx))
582 goto err;
583 }
584 sign = -1;
585 /* From B = a mod |n|, A = |n| it follows that
586 *
587 * 0 <= B < A,
588 * -sign*X*a == B (mod |n|),
589 * sign*Y*a == A (mod |n|).
590 */
591
592 while (!BN_is_zero(B)) {
593 BIGNUM *tmp;
594
595 /*
596 * 0 < B < A,
597 * (*) -sign*X*a == B (mod |n|),
598 * sign*Y*a == A (mod |n|)
599 */
600
601 /* Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked,
602 * BN_div_no_branch will be called eventually.
603 */
604 pA = &local_A;
605 BN_with_flags(pA, A, BN_FLG_CONSTTIME);
606
607 /* (D, M) := (A/B, A%B) ... */
608 if (!BN_div(D, M, pA, B, ctx))
609 goto err;
610
611 /* Now
612 * A = D*B + M;
613 * thus we have
614 * (**) sign*Y*a == D*B + M (mod |n|).
615 */
616 tmp = A; /* keep the BIGNUM object, the value does not matter */
617
618 /* (A, B) := (B, A mod B) ... */
619 A = B;
620 B = M;
621 /* ... so we have 0 <= B < A again */
622
623 /* Since the former M is now B and the former B is now A,
624 * (**) translates into
625 * sign*Y*a == D*A + B (mod |n|),
626 * i.e.
627 * sign*Y*a - D*A == B (mod |n|).
628 * Similarly, (*) translates into
629 * -sign*X*a == A (mod |n|).
630 *
631 * Thus,
632 * sign*Y*a + D*sign*X*a == B (mod |n|),
633 * i.e.
634 * sign*(Y + D*X)*a == B (mod |n|).
635 *
636 * So if we set (X, Y, sign) := (Y + D*X, X, -sign), we arrive back at
637 * -sign*X*a == B (mod |n|),
638 * sign*Y*a == A (mod |n|).
639 * Note that X and Y stay non-negative all the time.
640 */
641
642 if (!BN_mul(tmp, D, X, ctx))
643 goto err;
644 if (!BN_add(tmp, tmp, Y))
645 goto err;
646
647 M = Y; /* keep the BIGNUM object, the value does not matter */
648 Y = X;
649 X = tmp;
650 sign = -sign;
651 }
652
653 /*
654 * The while loop (Euclid's algorithm) ends when
655 * A == gcd(a,n);
656 * we have
657 * sign*Y*a == A (mod |n|),
658 * where Y is non-negative.
659 */
660
661 if (sign < 0) {
662 if (!BN_sub(Y, n, Y))
663 goto err;
664 }
665 /* Now Y*a == A (mod |n|). */
666
667 if (BN_is_one(A)) {
668 /* Y*a == 1 (mod |n|) */
669 if (!Y->neg && BN_ucmp(Y, n) < 0) {
670 if (!BN_copy(R, Y))
671 goto err;
672 } else {
673 if (!BN_nnmod(R, Y, n, ctx))
674 goto err;
675 }
676 } else {
677 BNerr(BN_F_BN_MOD_INVERSE_NO_BRANCH, BN_R_NO_INVERSE);
678 goto err;
679 }
680 ret = R;
681
682err:
683 if ((ret == NULL) && (in == NULL))
684 BN_free(R);
685 BN_CTX_end(ctx);
686 bn_check_top(ret);
687 return (ret);
688}
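
BN_mod_inverse() is expected to select this no-branch variant when either operand carries BN_FLG_CONSTTIME, matching the flag propagation seen in the loop above. A sketch of how a caller holding a secret value would request the hardened path (assuming the standard BN_set_flags() macro from bn.h):

	BN_set_flags(a, BN_FLG_CONSTTIME);	/* a is secret */
	if (BN_mod_inverse(r, a, n, ctx) == NULL)
		;				/* no inverse exists */
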
diff --git a/src/lib/libcrypto/bn/bn_gf2m.c b/src/lib/libcrypto/bn/bn_gf2m.c
deleted file mode 100644
index 40c1a94220..0000000000
--- a/src/lib/libcrypto/bn/bn_gf2m.c
+++ /dev/null
@@ -1,1320 +0,0 @@
1/* $OpenBSD: bn_gf2m.c,v 1.20 2015/06/11 15:55:28 jsing Exp $ */
2/* ====================================================================
3 * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
4 *
5 * The Elliptic Curve Public-Key Crypto Library (ECC Code) included
6 * herein is developed by SUN MICROSYSTEMS, INC., and is contributed
7 * to the OpenSSL project.
8 *
9 * The ECC Code is licensed pursuant to the OpenSSL open source
10 * license provided below.
11 *
12 * In addition, Sun covenants to all licensees who provide a reciprocal
13 * covenant with respect to their own patents if any, not to sue under
14 * current and future patent claims necessarily infringed by the making,
15 * using, practicing, selling, offering for sale and/or otherwise
16 * disposing of the ECC Code as delivered hereunder (or portions thereof),
17 * provided that such covenant shall not apply:
18 * 1) for code that a licensee deletes from the ECC Code;
19 * 2) separates from the ECC Code; or
20 * 3) for infringements caused by:
21 * i) the modification of the ECC Code or
22 * ii) the combination of the ECC Code with other software or
23 * devices where such combination causes the infringement.
24 *
25 * The software is originally written by Sheueling Chang Shantz and
26 * Douglas Stebila of Sun Microsystems Laboratories.
27 *
28 */
29
30/* NOTE: This file is licensed pursuant to the OpenSSL license below
31 * and may be modified; but after modifications, the above covenant
32 * may no longer apply! In such cases, the corresponding paragraph
33 * ["In addition, Sun covenants ... causes the infringement."] and
34 * this note can be edited out; but please keep the Sun copyright
35 * notice and attribution. */
36
37/* ====================================================================
38 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
39 *
40 * Redistribution and use in source and binary forms, with or without
41 * modification, are permitted provided that the following conditions
42 * are met:
43 *
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 *
47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in
49 * the documentation and/or other materials provided with the
50 * distribution.
51 *
52 * 3. All advertising materials mentioning features or use of this
53 * software must display the following acknowledgment:
54 * "This product includes software developed by the OpenSSL Project
55 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
56 *
57 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
58 * endorse or promote products derived from this software without
59 * prior written permission. For written permission, please contact
60 * openssl-core@openssl.org.
61 *
62 * 5. Products derived from this software may not be called "OpenSSL"
63 * nor may "OpenSSL" appear in their names without prior written
64 * permission of the OpenSSL Project.
65 *
66 * 6. Redistributions of any form whatsoever must retain the following
67 * acknowledgment:
68 * "This product includes software developed by the OpenSSL Project
69 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
70 *
71 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
72 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
73 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
74 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
75 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
76 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
77 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
78 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
79 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
80 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
81 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
82 * OF THE POSSIBILITY OF SUCH DAMAGE.
83 * ====================================================================
84 *
85 * This product includes cryptographic software written by Eric Young
86 * (eay@cryptsoft.com). This product includes software written by Tim
87 * Hudson (tjh@cryptsoft.com).
88 *
89 */
90
91#include <limits.h>
92#include <stdio.h>
93
94#include <openssl/opensslconf.h>
95
96#include <openssl/err.h>
97
98#include "bn_lcl.h"
99
100#ifndef OPENSSL_NO_EC2M
101
102/* Maximum number of iterations before BN_GF2m_mod_solve_quad_arr should fail. */
103#define MAX_ITERATIONS 50
104
105static const BN_ULONG SQR_tb[16] =
106 { 0, 1, 4, 5, 16, 17, 20, 21,
107 64, 65, 68, 69, 80, 81, 84, 85 };
108/* Platform-specific macros to accelerate squaring. */
109#ifdef _LP64
110#define SQR1(w) \
111 SQR_tb[(w) >> 60 & 0xF] << 56 | SQR_tb[(w) >> 56 & 0xF] << 48 | \
112 SQR_tb[(w) >> 52 & 0xF] << 40 | SQR_tb[(w) >> 48 & 0xF] << 32 | \
113 SQR_tb[(w) >> 44 & 0xF] << 24 | SQR_tb[(w) >> 40 & 0xF] << 16 | \
114 SQR_tb[(w) >> 36 & 0xF] << 8 | SQR_tb[(w) >> 32 & 0xF]
115#define SQR0(w) \
116 SQR_tb[(w) >> 28 & 0xF] << 56 | SQR_tb[(w) >> 24 & 0xF] << 48 | \
117 SQR_tb[(w) >> 20 & 0xF] << 40 | SQR_tb[(w) >> 16 & 0xF] << 32 | \
118 SQR_tb[(w) >> 12 & 0xF] << 24 | SQR_tb[(w) >> 8 & 0xF] << 16 | \
119 SQR_tb[(w) >> 4 & 0xF] << 8 | SQR_tb[(w) & 0xF]
120#else
121#define SQR1(w) \
122 SQR_tb[(w) >> 28 & 0xF] << 24 | SQR_tb[(w) >> 24 & 0xF] << 16 | \
123 SQR_tb[(w) >> 20 & 0xF] << 8 | SQR_tb[(w) >> 16 & 0xF]
124#define SQR0(w) \
125 SQR_tb[(w) >> 12 & 0xF] << 24 | SQR_tb[(w) >> 8 & 0xF] << 16 | \
126 SQR_tb[(w) >> 4 & 0xF] << 8 | SQR_tb[(w) & 0xF]
127#endif
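
Squaring over GF(2) simply spreads the coefficient bits apart: (sum_i a_i t^i)^2 = sum_i a_i t^(2i), because every cross term occurs twice and 1 + 1 = 0. SQR_tb[w] is therefore the 4-bit nibble w with its bits moved to the even positions, e.g. SQR_tb[5] == 17 (binary 101 -> 10001). A sketch that re-derives the table (hypothetical helper name):

	static unsigned int
	spread_nibble(unsigned int w)
	{
		unsigned int r = 0;
		int i;

		/* move bit i of w to bit 2*i of the result */
		for (i = 0; i < 4; i++)
			r |= ((w >> i) & 1) << (2 * i);
		return r;	/* spread_nibble(5) == 17 == SQR_tb[5] */
	}
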
128
129#if !defined(OPENSSL_BN_ASM_GF2m)
130/* Product of two polynomials a and b, each with degree < BN_BITS2 - 1;
131 * the result is a polynomial r with degree < 2 * BN_BITS2 - 1.
132 * The caller MUST ensure that the variables have the right amount
133 * of space allocated.
134 */
135static void
136bn_GF2m_mul_1x1(BN_ULONG *r1, BN_ULONG *r0, const BN_ULONG a, const BN_ULONG b)
137{
138#ifndef _LP64
139 BN_ULONG h, l, s;
140 BN_ULONG tab[8], top2b = a >> 30;
141 BN_ULONG a1, a2, a4;
142
143 a1 = a & (0x3FFFFFFF);
144 a2 = a1 << 1;
145 a4 = a2 << 1;
146
147 tab[0] = 0;
148 tab[1] = a1;
149 tab[2] = a2;
150 tab[3] = a1 ^ a2;
151 tab[4] = a4;
152 tab[5] = a1 ^ a4;
153 tab[6] = a2 ^ a4;
154 tab[7] = a1 ^ a2 ^ a4;
155
156 s = tab[b & 0x7];
157 l = s;
158 s = tab[b >> 3 & 0x7];
159 l ^= s << 3;
160 h = s >> 29;
161 s = tab[b >> 6 & 0x7];
162 l ^= s << 6;
163 h ^= s >> 26;
164 s = tab[b >> 9 & 0x7];
165 l ^= s << 9;
166 h ^= s >> 23;
167 s = tab[b >> 12 & 0x7];
168 l ^= s << 12;
169 h ^= s >> 20;
170 s = tab[b >> 15 & 0x7];
171 l ^= s << 15;
172 h ^= s >> 17;
173 s = tab[b >> 18 & 0x7];
174 l ^= s << 18;
175 h ^= s >> 14;
176 s = tab[b >> 21 & 0x7];
177 l ^= s << 21;
178 h ^= s >> 11;
179 s = tab[b >> 24 & 0x7];
180 l ^= s << 24;
181 h ^= s >> 8;
182 s = tab[b >> 27 & 0x7];
183 l ^= s << 27;
184 h ^= s >> 5;
185 s = tab[b >> 30];
186 l ^= s << 30;
187 h ^= s >> 2;
188
189 /* compensate for the top two bits of a */
190 if (top2b & 01) {
191 l ^= b << 30;
192 h ^= b >> 2;
193 }
194 if (top2b & 02) {
195 l ^= b << 31;
196 h ^= b >> 1;
197 }
198
199 *r1 = h;
200 *r0 = l;
201#else
202 BN_ULONG h, l, s;
203 BN_ULONG tab[16], top3b = a >> 61;
204 BN_ULONG a1, a2, a4, a8;
205
206 a1 = a & (0x1FFFFFFFFFFFFFFFULL);
207 a2 = a1 << 1;
208 a4 = a2 << 1;
209 a8 = a4 << 1;
210
211 tab[0] = 0;
212 tab[1] = a1;
213 tab[2] = a2;
214 tab[3] = a1 ^ a2;
215 tab[4] = a4;
216 tab[5] = a1 ^ a4;
217 tab[6] = a2 ^ a4;
218 tab[7] = a1 ^ a2 ^ a4;
219 tab[8] = a8;
220 tab[9] = a1 ^ a8;
221 tab[10] = a2 ^ a8;
222 tab[11] = a1 ^ a2 ^ a8;
223 tab[12] = a4 ^ a8;
224 tab[13] = a1 ^ a4 ^ a8;
225 tab[14] = a2 ^ a4 ^ a8;
226 tab[15] = a1 ^ a2 ^ a4 ^ a8;
227
228 s = tab[b & 0xF];
229 l = s;
230 s = tab[b >> 4 & 0xF];
231 l ^= s << 4;
232 h = s >> 60;
233 s = tab[b >> 8 & 0xF];
234 l ^= s << 8;
235 h ^= s >> 56;
236 s = tab[b >> 12 & 0xF];
237 l ^= s << 12;
238 h ^= s >> 52;
239 s = tab[b >> 16 & 0xF];
240 l ^= s << 16;
241 h ^= s >> 48;
242 s = tab[b >> 20 & 0xF];
243 l ^= s << 20;
244 h ^= s >> 44;
245 s = tab[b >> 24 & 0xF];
246 l ^= s << 24;
247 h ^= s >> 40;
248 s = tab[b >> 28 & 0xF];
249 l ^= s << 28;
250 h ^= s >> 36;
251 s = tab[b >> 32 & 0xF];
252 l ^= s << 32;
253 h ^= s >> 32;
254 s = tab[b >> 36 & 0xF];
255 l ^= s << 36;
256 h ^= s >> 28;
257 s = tab[b >> 40 & 0xF];
258 l ^= s << 40;
259 h ^= s >> 24;
260 s = tab[b >> 44 & 0xF];
261 l ^= s << 44;
262 h ^= s >> 20;
263 s = tab[b >> 48 & 0xF];
264 l ^= s << 48;
265 h ^= s >> 16;
266 s = tab[b >> 52 & 0xF];
267 l ^= s << 52;
268 h ^= s >> 12;
269 s = tab[b >> 56 & 0xF];
270 l ^= s << 56;
271 h ^= s >> 8;
272 s = tab[b >> 60];
273 l ^= s << 60;
274 h ^= s >> 4;
275
276 /* compensate for the top three bits of a */
277 if (top3b & 01) {
278 l ^= b << 61;
279 h ^= b >> 3;
280 }
281 if (top3b & 02) {
282 l ^= b << 62;
283 h ^= b >> 2;
284 }
285 if (top3b & 04) {
286 l ^= b << 63;
287 h ^= b >> 1;
288 }
289
290 *r1 = h;
291 *r0 = l;
292#endif
293}
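
The window method above is an unrolled form of the naive shift-and-XOR (carry-less) product: precomputing all 3-bit multiples of a reduces the 32-bit case to eleven table steps. For comparison, a naive reference version (a sketch with a hypothetical helper name; note that it tests every bit of b individually):

	#include <stdint.h>

	/*
	 * Naive carry-less 32x32 -> 64 bit product:
	 * XOR in a shifted copy of a for each set bit of b.
	 */
	static void
	clmul32_naive(uint32_t *r1, uint32_t *r0, uint32_t a, uint32_t b)
	{
		uint64_t acc = 0;
		int i;

		for (i = 0; i < 32; i++)
			if ((b >> i) & 1)
				acc ^= (uint64_t)a << i;	/* add a * t^i over GF(2) */
		*r1 = (uint32_t)(acc >> 32);
		*r0 = (uint32_t)acc;
	}
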
294
295/* Product of two polynomials a and b, each with degree < 2 * BN_BITS2 - 1;
 296 * the result is a polynomial r with degree < 4 * BN_BITS2 - 1.
297 * The caller MUST ensure that the variables have the right amount
298 * of space allocated.
299 */
300static void
301bn_GF2m_mul_2x2(BN_ULONG *r, const BN_ULONG a1, const BN_ULONG a0,
302 const BN_ULONG b1, const BN_ULONG b0)
303{
304 BN_ULONG m1, m0;
305
306 /* r[3] = h1, r[2] = h0; r[1] = l1; r[0] = l0 */
307 bn_GF2m_mul_1x1(r + 3, r + 2, a1, b1);
308 bn_GF2m_mul_1x1(r + 1, r, a0, b0);
309 bn_GF2m_mul_1x1(&m1, &m0, a0 ^ a1, b0 ^ b1);
310 /* Correction on m1 ^= l1 ^ h1; m0 ^= l0 ^ h0; */
311 r[2] ^= m1 ^ r[1] ^ r[3]; /* h0 ^= m1 ^ l1 ^ h1; */
312 r[1] = r[3] ^ r[2] ^ r[0] ^ m1 ^ m0; /* l1 ^= l0 ^ h0 ^ m0; */
313}
314#else
315void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1,
316 BN_ULONG b0);
317#endif
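
The three bn_GF2m_mul_1x1() calls above are one level of Karatsuba multiplication over GF(2). With X = t^BN_BITS2, h = a1*b1, l = a0*b0 and m = (a0 ^ a1)*(b0 ^ b1) (all products carry-less), the identity

	(a1*X + a0)*(b1*X + b0) = h*X^2 + (m ^ h ^ l)*X + l

holds, since addition in GF(2)[t] is XOR and the middle term a1*b0 + a0*b1 equals m + h + l in characteristic 2. That is exactly the two-line correction applied to r[2] and r[1], with no carry handling needed.
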
318
319/* Add polynomials a and b and store the result in r; r could be a or b,
 320 * and a and b could be equal. The result is the bitwise XOR of a and b.
321 */
322int
323BN_GF2m_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
324{
325 int i;
326 const BIGNUM *at, *bt;
327
328 bn_check_top(a);
329 bn_check_top(b);
330
331 if (a->top < b->top) {
332 at = b;
333 bt = a;
334 } else {
335 at = a;
336 bt = b;
337 }
338
339 if (bn_wexpand(r, at->top) == NULL)
340 return 0;
341
342 for (i = 0; i < bt->top; i++) {
343 r->d[i] = at->d[i] ^ bt->d[i];
344 }
345 for (; i < at->top; i++) {
346 r->d[i] = at->d[i];
347 }
348
349 r->top = at->top;
350 bn_correct_top(r);
351
352 return 1;
353}
354
355
356/* Some functions allow for representation of the irreducible polynomials
357 * as an int[], say p. The irreducible f(t) is then of the form:
358 * t^p[0] + t^p[1] + ... + t^p[k]
359 * where m = p[0] > p[1] > ... > p[k] = 0.
360 */
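
For example, the reduction polynomial of the NIST K-163 field, f(t) = t^163 + t^7 + t^6 + t^3 + 1, would be represented as follows (hypothetical constant name; the final 0 is the constant term at which BN_GF2m_mod_arr() stops, and the trailing -1 is the terminator expected by BN_GF2m_arr2poly()):

	static const int p_k163[] = { 163, 7, 6, 3, 0, -1 };
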
361
362
363/* Performs modular reduction of a and stores the result in r. r could be a. */
364int
365BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[])
366{
367 int j, k;
368 int n, dN, d0, d1;
369 BN_ULONG zz, *z;
370
371 bn_check_top(a);
372
373 if (!p[0]) {
374 /* reduction mod 1 => return 0 */
375 BN_zero(r);
376 return 1;
377 }
378
379 /* Since the algorithm does reduction in the r value, if a != r, copy
380 * the contents of a into r so we can do reduction in r.
381 */
382 if (a != r) {
383 if (!bn_wexpand(r, a->top))
384 return 0;
385 for (j = 0; j < a->top; j++) {
386 r->d[j] = a->d[j];
387 }
388 r->top = a->top;
389 }
390 z = r->d;
391
392 /* start reduction */
393 dN = p[0] / BN_BITS2;
394 for (j = r->top - 1; j > dN; ) {
395 zz = z[j];
396 if (z[j] == 0) {
397 j--;
398 continue;
399 }
400 z[j] = 0;
401
402 for (k = 1; p[k] != 0; k++) {
403 /* reducing component t^p[k] */
404 n = p[0] - p[k];
405 d0 = n % BN_BITS2;
406 d1 = BN_BITS2 - d0;
407 n /= BN_BITS2;
408 z[j - n] ^= (zz >> d0);
409 if (d0)
410 z[j - n - 1] ^= (zz << d1);
411 }
412
413 /* reducing component t^0 */
414 n = dN;
415 d0 = p[0] % BN_BITS2;
416 d1 = BN_BITS2 - d0;
417 z[j - n] ^= (zz >> d0);
418 if (d0)
419 z[j - n - 1] ^= (zz << d1);
420 }
421
422 /* final round of reduction */
423 while (j == dN) {
424
425 d0 = p[0] % BN_BITS2;
426 zz = z[dN] >> d0;
427 if (zz == 0)
428 break;
429 d1 = BN_BITS2 - d0;
430
431 /* clear up the top d1 bits */
432 if (d0)
433 z[dN] = (z[dN] << d1) >> d1;
434 else
435 z[dN] = 0;
436 z[0] ^= zz; /* reduction t^0 component */
437
438 for (k = 1; p[k] != 0; k++) {
439 BN_ULONG tmp_ulong;
440
441 /* reducing component t^p[k]*/
442 n = p[k] / BN_BITS2;
443 d0 = p[k] % BN_BITS2;
444 d1 = BN_BITS2 - d0;
445 z[n] ^= (zz << d0);
446 tmp_ulong = zz >> d1;
447 if (d0 && tmp_ulong)
448 z[n + 1] ^= tmp_ulong;
449 }
450
451
452 }
453
454 bn_correct_top(r);
455 return 1;
456}
457
458/* Performs modular reduction of a by p and stores the result in r. r could be a.
459 *
460 * This function calls down to the BN_GF2m_mod_arr implementation; this wrapper
461 * function is only provided for convenience; for best performance, use the
462 * BN_GF2m_mod_arr function.
463 */
464int
465BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p)
466{
467 int ret = 0;
468 int arr[6];
469
470 bn_check_top(a);
471 bn_check_top(p);
472 ret = BN_GF2m_poly2arr(p, arr, sizeof(arr) / sizeof(arr[0]));
473 if (!ret || ret > (int)(sizeof(arr) / sizeof(arr[0]))) {
474 BNerr(BN_F_BN_GF2M_MOD, BN_R_INVALID_LENGTH);
475 return 0;
476 }
477 ret = BN_GF2m_mod_arr(r, a, arr);
478 bn_check_top(r);
479 return ret;
480}
481
482
483/* Compute the product of two polynomials a and b, reduce modulo p, and store
484 * the result in r. r could be a or b; a could be b.
485 */
486int
487BN_GF2m_mod_mul_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const int p[],
488 BN_CTX *ctx)
489{
490 int zlen, i, j, k, ret = 0;
491 BIGNUM *s;
492 BN_ULONG x1, x0, y1, y0, zz[4];
493
494 bn_check_top(a);
495 bn_check_top(b);
496
497 if (a == b) {
498 return BN_GF2m_mod_sqr_arr(r, a, p, ctx);
499 }
500
501 BN_CTX_start(ctx);
502 if ((s = BN_CTX_get(ctx)) == NULL)
503 goto err;
504
505 zlen = a->top + b->top + 4;
506 if (!bn_wexpand(s, zlen))
507 goto err;
508 s->top = zlen;
509
510 for (i = 0; i < zlen; i++)
511 s->d[i] = 0;
512
513 for (j = 0; j < b->top; j += 2) {
514 y0 = b->d[j];
515 y1 = ((j + 1) == b->top) ? 0 : b->d[j + 1];
516 for (i = 0; i < a->top; i += 2) {
517 x0 = a->d[i];
518 x1 = ((i + 1) == a->top) ? 0 : a->d[i + 1];
519 bn_GF2m_mul_2x2(zz, x1, x0, y1, y0);
520 for (k = 0; k < 4; k++)
521 s->d[i + j + k] ^= zz[k];
522 }
523 }
524
525 bn_correct_top(s);
526 if (BN_GF2m_mod_arr(r, s, p))
527 ret = 1;
528 bn_check_top(r);
529
530err:
531 BN_CTX_end(ctx);
532 return ret;
533}
534
535/* Compute the product of two polynomials a and b, reduce modulo p, and store
536 * the result in r. r could be a or b; a could equal b.
537 *
538 * This function calls down to the BN_GF2m_mod_mul_arr implementation; this wrapper
539 * function is only provided for convenience; for best performance, use the
540 * BN_GF2m_mod_mul_arr function.
541 */
542int
543BN_GF2m_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *p,
544 BN_CTX *ctx)
545{
546 int ret = 0;
547 const int max = BN_num_bits(p) + 1;
548 int *arr = NULL;
549
550 bn_check_top(a);
551 bn_check_top(b);
552 bn_check_top(p);
553 if ((arr = reallocarray(NULL, max, sizeof(int))) == NULL)
554 goto err;
555 ret = BN_GF2m_poly2arr(p, arr, max);
556 if (!ret || ret > max) {
557 BNerr(BN_F_BN_GF2M_MOD_MUL, BN_R_INVALID_LENGTH);
558 goto err;
559 }
560 ret = BN_GF2m_mod_mul_arr(r, a, b, arr, ctx);
561 bn_check_top(r);
562
563err:
564 free(arr);
565 return ret;
566}
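
A small worked example over GF(2^4) with p = t^4 + t + 1 (0x13), using the standard BN word helpers (error handling elided). Here a = t^2 + t (0x6) and b = t^2 + t + 1 (0x7) happen to be mutual inverses, so the reduced product is 1:

	BN_CTX *ctx = BN_CTX_new();
	BIGNUM *p = BN_new(), *a = BN_new(), *b = BN_new(), *r = BN_new();

	BN_set_word(p, 0x13);	/* t^4 + t + 1, irreducible over GF(2) */
	BN_set_word(a, 0x6);	/* t^2 + t */
	BN_set_word(b, 0x7);	/* t^2 + t + 1 */
	BN_GF2m_mod_mul(r, a, b, p, ctx);
	/* a*b = t^4 + t == 1 (mod p), so BN_is_one(r) holds */
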
567
568
569/* Square a, reduce the result mod p, and store it in r. r could be a. */
570int
571BN_GF2m_mod_sqr_arr(BIGNUM *r, const BIGNUM *a, const int p[], BN_CTX *ctx)
572{
573 int i, ret = 0;
574 BIGNUM *s;
575
576 bn_check_top(a);
577 BN_CTX_start(ctx);
578 if ((s = BN_CTX_get(ctx)) == NULL)
579 goto err;
580 if (!bn_wexpand(s, 2 * a->top))
581 goto err;
582
583 for (i = a->top - 1; i >= 0; i--) {
584 s->d[2 * i + 1] = SQR1(a->d[i]);
585 s->d[2 * i] = SQR0(a->d[i]);
586 }
587
588 s->top = 2 * a->top;
589 bn_correct_top(s);
590 if (!BN_GF2m_mod_arr(r, s, p))
591 goto err;
592 bn_check_top(r);
593 ret = 1;
594
595err:
596 BN_CTX_end(ctx);
597 return ret;
598}
599
600/* Square a, reduce the result mod p, and store it in r. r could be a.
601 *
602 * This function calls down to the BN_GF2m_mod_sqr_arr implementation; this wrapper
603 * function is only provided for convenience; for best performance, use the
604 * BN_GF2m_mod_sqr_arr function.
605 */
606int
607BN_GF2m_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
608{
609 int ret = 0;
610 const int max = BN_num_bits(p) + 1;
611 int *arr = NULL;
612
613 bn_check_top(a);
614 bn_check_top(p);
615 if ((arr = reallocarray(NULL, max, sizeof(int))) == NULL)
616 goto err;
617 ret = BN_GF2m_poly2arr(p, arr, max);
618 if (!ret || ret > max) {
619 BNerr(BN_F_BN_GF2M_MOD_SQR, BN_R_INVALID_LENGTH);
620 goto err;
621 }
622 ret = BN_GF2m_mod_sqr_arr(r, a, arr, ctx);
623 bn_check_top(r);
624
625err:
626 free(arr);
627 return ret;
628}
629
630
631/* Invert a, reduce modulo p, and store the result in r. r could be a.
632 * Uses Modified Almost Inverse Algorithm (Algorithm 10) from
633 * Hankerson, D., Hernandez, J.L., and Menezes, A. "Software Implementation
634 * of Elliptic Curve Cryptography Over Binary Fields".
635 */
636int
637BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
638{
639 BIGNUM *b, *c = NULL, *u = NULL, *v = NULL, *tmp;
640 int ret = 0;
641
642 bn_check_top(a);
643 bn_check_top(p);
644
645 BN_CTX_start(ctx);
646
647 if ((b = BN_CTX_get(ctx)) == NULL)
648 goto err;
649 if ((c = BN_CTX_get(ctx)) == NULL)
650 goto err;
651 if ((u = BN_CTX_get(ctx)) == NULL)
652 goto err;
653 if ((v = BN_CTX_get(ctx)) == NULL)
654 goto err;
655
656 if (!BN_GF2m_mod(u, a, p))
657 goto err;
658 if (BN_is_zero(u))
659 goto err;
660
661 if (!BN_copy(v, p))
662 goto err;
663#if 0
664 if (!BN_one(b))
665 goto err;
666
667 while (1) {
668 while (!BN_is_odd(u)) {
669 if (BN_is_zero(u))
670 goto err;
671 if (!BN_rshift1(u, u))
672 goto err;
673 if (BN_is_odd(b)) {
674 if (!BN_GF2m_add(b, b, p))
675 goto err;
676 }
677 if (!BN_rshift1(b, b))
678 goto err;
679 }
680
681 if (BN_abs_is_word(u, 1))
682 break;
683
684 if (BN_num_bits(u) < BN_num_bits(v)) {
685 tmp = u;
686 u = v;
687 v = tmp;
688 tmp = b;
689 b = c;
690 c = tmp;
691 }
692
693 if (!BN_GF2m_add(u, u, v))
694 goto err;
695 if (!BN_GF2m_add(b, b, c))
696 goto err;
697 }
698#else
699 {
700 int i, ubits = BN_num_bits(u),
701 vbits = BN_num_bits(v), /* v is copy of p */
702 top = p->top;
703 BN_ULONG *udp, *bdp, *vdp, *cdp;
704
705 bn_wexpand(u, top);
706 udp = u->d;
707 for (i = u->top; i < top; i++)
708 udp[i] = 0;
709 u->top = top;
710 bn_wexpand(b, top);
711 bdp = b->d;
712 bdp[0] = 1;
713 for (i = 1; i < top; i++)
714 bdp[i] = 0;
715 b->top = top;
716 bn_wexpand(c, top);
717 cdp = c->d;
718 for (i = 0; i < top; i++)
719 cdp[i] = 0;
720 c->top = top;
721 vdp = v->d; /* It pays off to "cache" *->d pointers, because
722 * it allows optimizer to be more aggressive.
723 * But we don't have to "cache" p->d, because *p
724 * is declared 'const'... */
725 while (1) {
726 while (ubits && !(udp[0]&1)) {
727 BN_ULONG u0, u1, b0, b1, mask;
728
729 u0 = udp[0];
730 b0 = bdp[0];
731 mask = (BN_ULONG)0 - (b0 & 1);
732 b0 ^= p->d[0] & mask;
733 for (i = 0; i < top - 1; i++) {
734 u1 = udp[i + 1];
735 udp[i] = ((u0 >> 1) |
736 (u1 << (BN_BITS2 - 1))) & BN_MASK2;
737 u0 = u1;
738 b1 = bdp[i + 1] ^ (p->d[i + 1] & mask);
739 bdp[i] = ((b0 >> 1) |
740 (b1 << (BN_BITS2 - 1))) & BN_MASK2;
741 b0 = b1;
742 }
743 udp[i] = u0 >> 1;
744 bdp[i] = b0 >> 1;
745 ubits--;
746 }
747
748 if (ubits <= BN_BITS2) {
749 /* See if poly was reducible. */
750 if (udp[0] == 0)
751 goto err;
752 if (udp[0] == 1)
753 break;
754 }
755
756 if (ubits < vbits) {
757 i = ubits;
758 ubits = vbits;
759 vbits = i;
760 tmp = u;
761 u = v;
762 v = tmp;
763 tmp = b;
764 b = c;
765 c = tmp;
766 udp = vdp;
767 vdp = v->d;
768 bdp = cdp;
769 cdp = c->d;
770 }
771 for (i = 0; i < top; i++) {
772 udp[i] ^= vdp[i];
773 bdp[i] ^= cdp[i];
774 }
775 if (ubits == vbits) {
776 BN_ULONG ul;
777 int utop = (ubits - 1) / BN_BITS2;
778
779 while ((ul = udp[utop]) == 0 && utop)
780 utop--;
781 ubits = utop*BN_BITS2 + BN_num_bits_word(ul);
782 }
783 }
784 bn_correct_top(b);
785 }
786#endif
787
788 if (!BN_copy(r, b))
789 goto err;
790 bn_check_top(r);
791 ret = 1;
792
793err:
794#ifdef BN_DEBUG /* BN_CTX_end would complain about the expanded form */
795 bn_correct_top(c);
796 bn_correct_top(u);
797 bn_correct_top(v);
798#endif
799 BN_CTX_end(ctx);
800 return ret;
801}
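
Continuing the GF(2^4) sketch from BN_GF2m_mod_mul() above, a quick self-check that the computed inverse really inverts (illustrative only, error handling elided):

	BN_GF2m_mod_inv(r, a, p, ctx);		/* r = a^(-1) mod p */
	BN_GF2m_mod_mul(r, r, a, p, ctx);	/* r = a^(-1) * a mod p */
	/* BN_is_one(r) must now hold; for a = 0x6 the inverse is 0x7 */
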
802
803/* Invert xx, reduce modulo p, and store the result in r. r could be xx.
804 *
805 * This function calls down to the BN_GF2m_mod_inv implementation; this wrapper
806 * function is only provided for convenience; for best performance, use the
807 * BN_GF2m_mod_inv function.
808 */
809int
810BN_GF2m_mod_inv_arr(BIGNUM *r, const BIGNUM *xx, const int p[], BN_CTX *ctx)
811{
812 BIGNUM *field;
813 int ret = 0;
814
815 bn_check_top(xx);
816 BN_CTX_start(ctx);
817 if ((field = BN_CTX_get(ctx)) == NULL)
818 goto err;
819 if (!BN_GF2m_arr2poly(p, field))
820 goto err;
821
822 ret = BN_GF2m_mod_inv(r, xx, field, ctx);
823 bn_check_top(r);
824
825err:
826 BN_CTX_end(ctx);
827 return ret;
828}
829
830
831#ifndef OPENSSL_SUN_GF2M_DIV
832/* Divide y by x, reduce modulo p, and store the result in r. r could be x
833 * or y, x could equal y.
834 */
835int
836BN_GF2m_mod_div(BIGNUM *r, const BIGNUM *y, const BIGNUM *x, const BIGNUM *p,
837 BN_CTX *ctx)
838{
839 BIGNUM *xinv = NULL;
840 int ret = 0;
841
842 bn_check_top(y);
843 bn_check_top(x);
844 bn_check_top(p);
845
846 BN_CTX_start(ctx);
847 if ((xinv = BN_CTX_get(ctx)) == NULL)
848 goto err;
849
850 if (!BN_GF2m_mod_inv(xinv, x, p, ctx))
851 goto err;
852 if (!BN_GF2m_mod_mul(r, y, xinv, p, ctx))
853 goto err;
854 bn_check_top(r);
855 ret = 1;
856
857err:
858 BN_CTX_end(ctx);
859 return ret;
860}
861#else
862/* Divide y by x, reduce modulo p, and store the result in r. r could be x
863 * or y, x could equal y.
864 * Uses algorithm Modular_Division_GF(2^m) from
865 * Chang-Shantz, S. "From Euclid's GCD to Montgomery Multiplication to
866 * the Great Divide".
867 */
868int
869BN_GF2m_mod_div(BIGNUM *r, const BIGNUM *y, const BIGNUM *x, const BIGNUM *p,
870 BN_CTX *ctx)
871{
872 BIGNUM *a, *b, *u, *v;
873 int ret = 0;
874
875 bn_check_top(y);
876 bn_check_top(x);
877 bn_check_top(p);
878
879 BN_CTX_start(ctx);
880
881 if ((a = BN_CTX_get(ctx)) == NULL)
882 goto err;
883 if ((b = BN_CTX_get(ctx)) == NULL)
884 goto err;
885 if ((u = BN_CTX_get(ctx)) == NULL)
886 goto err;
887 if ((v = BN_CTX_get(ctx)) == NULL)
888 goto err;
889
890 /* reduce x and y mod p */
891 if (!BN_GF2m_mod(u, y, p))
892 goto err;
893 if (!BN_GF2m_mod(a, x, p))
894 goto err;
895 if (!BN_copy(b, p))
896 goto err;
897
898 while (!BN_is_odd(a)) {
899 if (!BN_rshift1(a, a))
900 goto err;
901 if (BN_is_odd(u))
902 if (!BN_GF2m_add(u, u, p))
903 goto err;
904 if (!BN_rshift1(u, u))
905 goto err;
906 }
907
908 do {
909 if (BN_GF2m_cmp(b, a) > 0) {
910 if (!BN_GF2m_add(b, b, a))
911 goto err;
912 if (!BN_GF2m_add(v, v, u))
913 goto err;
914 do {
915 if (!BN_rshift1(b, b))
916 goto err;
917 if (BN_is_odd(v))
918 if (!BN_GF2m_add(v, v, p))
919 goto err;
920 if (!BN_rshift1(v, v))
921 goto err;
922 } while (!BN_is_odd(b));
923 } else if (BN_abs_is_word(a, 1))
924 break;
925 else {
926 if (!BN_GF2m_add(a, a, b))
927 goto err;
928 if (!BN_GF2m_add(u, u, v))
929 goto err;
930 do {
931 if (!BN_rshift1(a, a))
932 goto err;
933 if (BN_is_odd(u))
934 if (!BN_GF2m_add(u, u, p))
935 goto err;
936 if (!BN_rshift1(u, u))
937 goto err;
938 } while (!BN_is_odd(a));
939 }
940 } while (1);
941
942 if (!BN_copy(r, u))
943 goto err;
944 bn_check_top(r);
945 ret = 1;
946
947err:
948 BN_CTX_end(ctx);
949 return ret;
950}
951#endif
952
953/* Divide yy by xx, reduce modulo p, and store the result in r. r could be xx
954 * or yy, xx could equal yy.
955 *
956 * This function calls down to the BN_GF2m_mod_div implementation; this wrapper
957 * function is only provided for convenience; for best performance, use the
958 * BN_GF2m_mod_div function.
959 */
960int
961BN_GF2m_mod_div_arr(BIGNUM *r, const BIGNUM *yy, const BIGNUM *xx,
962 const int p[], BN_CTX *ctx)
963{
964 BIGNUM *field;
965 int ret = 0;
966
967 bn_check_top(yy);
968 bn_check_top(xx);
969
970 BN_CTX_start(ctx);
971 if ((field = BN_CTX_get(ctx)) == NULL)
972 goto err;
973 if (!BN_GF2m_arr2poly(p, field))
974 goto err;
975
976 ret = BN_GF2m_mod_div(r, yy, xx, field, ctx);
977 bn_check_top(r);
978
979err:
980 BN_CTX_end(ctx);
981 return ret;
982}
983
984
985/* Compute the bth power of a, reduce modulo p, and store
986 * the result in r. r could be a.
987 * Uses simple square-and-multiply algorithm A.5.1 from IEEE P1363.
988 */
989int
990BN_GF2m_mod_exp_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const int p[],
991 BN_CTX *ctx)
992{
993 int ret = 0, i, n;
994 BIGNUM *u;
995
996 bn_check_top(a);
997 bn_check_top(b);
998
999 if (BN_is_zero(b))
1000 return (BN_one(r));
1001
1002 if (BN_abs_is_word(b, 1))
1003 return (BN_copy(r, a) != NULL);
1004
1005 BN_CTX_start(ctx);
1006 if ((u = BN_CTX_get(ctx)) == NULL)
1007 goto err;
1008
1009 if (!BN_GF2m_mod_arr(u, a, p))
1010 goto err;
1011
1012 n = BN_num_bits(b) - 1;
1013 for (i = n - 1; i >= 0; i--) {
1014 if (!BN_GF2m_mod_sqr_arr(u, u, p, ctx))
1015 goto err;
1016 if (BN_is_bit_set(b, i)) {
1017 if (!BN_GF2m_mod_mul_arr(u, u, a, p, ctx))
1018 goto err;
1019 }
1020 }
1021 if (!BN_copy(r, u))
1022 goto err;
1023 bn_check_top(r);
1024 ret = 1;
1025
1026err:
1027 BN_CTX_end(ctx);
1028 return ret;
1029}
1030
1031/* Compute the bth power of a, reduce modulo p, and store
1032 * the result in r. r could be a.
1033 *
1034 * This function calls down to the BN_GF2m_mod_exp_arr implementation; this wrapper
1035 * function is only provided for convenience; for best performance, use the
1036 * BN_GF2m_mod_exp_arr function.
1037 */
1038int
1039BN_GF2m_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *p,
1040 BN_CTX *ctx)
1041{
1042 int ret = 0;
1043 const int max = BN_num_bits(p) + 1;
1044 int *arr = NULL;
1045
1046 bn_check_top(a);
1047 bn_check_top(b);
1048 bn_check_top(p);
1049 if ((arr = reallocarray(NULL, max, sizeof(int))) == NULL)
1050 goto err;
1051 ret = BN_GF2m_poly2arr(p, arr, max);
1052 if (!ret || ret > max) {
1053 BNerr(BN_F_BN_GF2M_MOD_EXP, BN_R_INVALID_LENGTH);
1054 goto err;
1055 }
1056 ret = BN_GF2m_mod_exp_arr(r, a, b, arr, ctx);
1057 bn_check_top(r);
1058
1059err:
1060 free(arr);
1061 return ret;
1062}
1063
1064/* Compute the square root of a, reduce modulo p, and store
1065 * the result in r. r could be a.
1066 * Uses exponentiation as in algorithm A.4.1 from IEEE P1363.
1067 */
1068int
1069BN_GF2m_mod_sqrt_arr(BIGNUM *r, const BIGNUM *a, const int p[], BN_CTX *ctx)
1070{
1071 int ret = 0;
1072 BIGNUM *u;
1073
1074 bn_check_top(a);
1075
1076 if (!p[0]) {
1077 /* reduction mod 1 => return 0 */
1078 BN_zero(r);
1079 return 1;
1080 }
1081
1082 BN_CTX_start(ctx);
1083 if ((u = BN_CTX_get(ctx)) == NULL)
1084 goto err;
1085
1086 if (!BN_set_bit(u, p[0] - 1))
1087 goto err;
1088 ret = BN_GF2m_mod_exp_arr(r, a, u, p, ctx);
1089 bn_check_top(r);
1090
1091err:
1092 BN_CTX_end(ctx);
1093 return ret;
1094}
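
The exponent assembled here by BN_set_bit(u, p[0] - 1) is u = 2^(m-1), where m = p[0] is the field degree. Squaring is an automorphism of GF(2^m) and a^(2^m) = a for every field element, so

	(a^(2^(m-1)))^2 = a^(2^m) = a,

i.e. a^(2^(m-1)) is the unique square root of a.
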
1095
1096/* Compute the square root of a, reduce modulo p, and store
1097 * the result in r. r could be a.
1098 *
1099 * This function calls down to the BN_GF2m_mod_sqrt_arr implementation; this wrapper
1100 * function is only provided for convenience; for best performance, use the
1101 * BN_GF2m_mod_sqrt_arr function.
1102 */
1103int
1104BN_GF2m_mod_sqrt(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
1105{
1106 int ret = 0;
1107 const int max = BN_num_bits(p) + 1;
1108 int *arr = NULL;
1109 bn_check_top(a);
1110 bn_check_top(p);
1111 if ((arr = reallocarray(NULL, max, sizeof(int))) == NULL)
1112 goto err;
1113 ret = BN_GF2m_poly2arr(p, arr, max);
1114 if (!ret || ret > max) {
1115 BNerr(BN_F_BN_GF2M_MOD_SQRT, BN_R_INVALID_LENGTH);
1116 goto err;
1117 }
1118 ret = BN_GF2m_mod_sqrt_arr(r, a, arr, ctx);
1119 bn_check_top(r);
1120
1121err:
1122 free(arr);
1123 return ret;
1124}
1125
1126/* Find r such that r^2 + r = a mod p. r could be a. If no r exists returns 0.
1127 * Uses algorithms A.4.7 and A.4.6 from IEEE P1363.
1128 */
1129int
1130BN_GF2m_mod_solve_quad_arr(BIGNUM *r, const BIGNUM *a_, const int p[],
1131 BN_CTX *ctx)
1132{
1133 int ret = 0, count = 0, j;
1134 BIGNUM *a, *z, *rho, *w, *w2, *tmp;
1135
1136 bn_check_top(a_);
1137
1138 if (!p[0]) {
1139 /* reduction mod 1 => return 0 */
1140 BN_zero(r);
1141 return 1;
1142 }
1143
1144 BN_CTX_start(ctx);
1145 if ((a = BN_CTX_get(ctx)) == NULL)
1146 goto err;
1147 if ((z = BN_CTX_get(ctx)) == NULL)
1148 goto err;
1149 if ((w = BN_CTX_get(ctx)) == NULL)
1150 goto err;
1151
1152 if (!BN_GF2m_mod_arr(a, a_, p))
1153 goto err;
1154
1155 if (BN_is_zero(a)) {
1156 BN_zero(r);
1157 ret = 1;
1158 goto err;
1159 }
1160
1161 if (p[0] & 0x1) /* m is odd */
1162 {
1163 /* compute half-trace of a */
1164 if (!BN_copy(z, a))
1165 goto err;
1166 for (j = 1; j <= (p[0] - 1) / 2; j++) {
1167 if (!BN_GF2m_mod_sqr_arr(z, z, p, ctx))
1168 goto err;
1169 if (!BN_GF2m_mod_sqr_arr(z, z, p, ctx))
1170 goto err;
1171 if (!BN_GF2m_add(z, z, a))
1172 goto err;
1173 }
1174
1175 }
1176 else /* m is even */
1177 {
1178 if ((rho = BN_CTX_get(ctx)) == NULL)
1179 goto err;
1180 if ((w2 = BN_CTX_get(ctx)) == NULL)
1181 goto err;
1182 if ((tmp = BN_CTX_get(ctx)) == NULL)
1183 goto err;
1184 do {
1185 if (!BN_rand(rho, p[0], 0, 0))
1186 goto err;
1187 if (!BN_GF2m_mod_arr(rho, rho, p))
1188 goto err;
1189 BN_zero(z);
1190 if (!BN_copy(w, rho))
1191 goto err;
1192 for (j = 1; j <= p[0] - 1; j++) {
1193 if (!BN_GF2m_mod_sqr_arr(z, z, p, ctx))
1194 goto err;
1195 if (!BN_GF2m_mod_sqr_arr(w2, w, p, ctx))
1196 goto err;
1197 if (!BN_GF2m_mod_mul_arr(tmp, w2, a, p, ctx))
1198 goto err;
1199 if (!BN_GF2m_add(z, z, tmp))
1200 goto err;
1201 if (!BN_GF2m_add(w, w2, rho))
1202 goto err;
1203 }
1204 count++;
1205 } while (BN_is_zero(w) && (count < MAX_ITERATIONS));
1206 if (BN_is_zero(w)) {
1207 BNerr(BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR,
1208 BN_R_TOO_MANY_ITERATIONS);
1209 goto err;
1210 }
1211 }
1212
1213 if (!BN_GF2m_mod_sqr_arr(w, z, p, ctx))
1214 goto err;
1215 if (!BN_GF2m_add(w, z, w))
1216 goto err;
1217 if (BN_GF2m_cmp(w, a)) {
1218 BNerr(BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR, BN_R_NO_SOLUTION);
1219 goto err;
1220 }
1221
1222 if (!BN_copy(r, z))
1223 goto err;
1224 bn_check_top(r);
1225
1226 ret = 1;
1227
1228err:
1229 BN_CTX_end(ctx);
1230 return ret;
1231}
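
For odd m, the first branch above accumulates the half-trace z = a + a^4 + a^16 + ... = sum over i = 0..(m-1)/2 of a^(2^(2i)); each loop iteration contributes one term via two squarings and one addition. In characteristic 2 the sum telescopes to

	z^2 + z = Tr(a) + a,

where Tr(a) = a + a^2 + ... + a^(2^(m-1)) is the trace of a. Hence z solves r^2 + r = a precisely when Tr(a) = 0; the final recomputation of w = z^2 + z and comparison against a rejects the Tr(a) = 1 case with BN_R_NO_SOLUTION.
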
1232
1233/* Find r such that r^2 + r = a mod p. r could be a. If no r exists returns 0.
1234 *
1235 * This function calls down to the BN_GF2m_mod_solve_quad_arr implementation; this wrapper
1236 * function is only provided for convenience; for best performance, use the
1237 * BN_GF2m_mod_solve_quad_arr function.
1238 */
1239int
1240BN_GF2m_mod_solve_quad(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
1241{
1242 int ret = 0;
1243 const int max = BN_num_bits(p) + 1;
1244 int *arr = NULL;
1245
1246 bn_check_top(a);
1247 bn_check_top(p);
1248 if ((arr = reallocarray(NULL, max, sizeof(int))) == NULL)
1249 goto err;
1250 ret = BN_GF2m_poly2arr(p, arr, max);
1251 if (!ret || ret > max) {
1252 BNerr(BN_F_BN_GF2M_MOD_SOLVE_QUAD, BN_R_INVALID_LENGTH);
1253 goto err;
1254 }
1255 ret = BN_GF2m_mod_solve_quad_arr(r, a, arr, ctx);
1256 bn_check_top(r);
1257
1258err:
1259 free(arr);
1260 return ret;
1261}
1262
1263/* Convert the bit-string representation of a polynomial
1264 * ( \sum_{i=0}^n a_i * x^i) into an array of integers corresponding
1265 * to the bits with non-zero coefficient. Array is terminated with -1.
1266 * Up to max elements of the array will be filled. Return value is total
1267 * number of array elements that would be filled if array was large enough.
1268 */
1269int
1270BN_GF2m_poly2arr(const BIGNUM *a, int p[], int max)
1271{
1272 int i, j, k = 0;
1273 BN_ULONG mask;
1274
1275 if (BN_is_zero(a))
1276 return 0;
1277
1278 for (i = a->top - 1; i >= 0; i--) {
1279 if (!a->d[i])
1280 /* skip word if a->d[i] == 0 */
1281 continue;
1282 mask = BN_TBIT;
1283 for (j = BN_BITS2 - 1; j >= 0; j--) {
1284 if (a->d[i] & mask) {
1285 if (k < max)
1286 p[k] = BN_BITS2 * i + j;
1287 k++;
1288 }
1289 mask >>= 1;
1290 }
1291 }
1292
1293 if (k < max) {
1294 p[k] = -1;
1295 k++;
1296 }
1297
1298 return k;
1299}
1300
1301/* Convert the coefficient array representation of a polynomial to a
1302 * bit-string. The array must be terminated by -1.
1303 */
1304int
1305BN_GF2m_arr2poly(const int p[], BIGNUM *a)
1306{
1307 int i;
1308
1309 bn_check_top(a);
1310 BN_zero(a);
1311 for (i = 0; p[i] != -1; i++) {
1312 if (BN_set_bit(a, p[i]) == 0)
1313 return 0;
1314 }
1315 bn_check_top(a);
1316
1317 return 1;
1318}
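
A round-trip sketch for the two conversion routines (illustrative; the GF(2^4) polynomial t^4 + t + 1 again, error handling elided):

	int arr[6], n;
	BIGNUM *f = BN_new();

	BN_set_word(f, 0x13);			/* t^4 + t + 1 */
	n = BN_GF2m_poly2arr(f, arr, 6);	/* arr = { 4, 1, 0, -1 }, n == 4 */
	BN_GF2m_arr2poly(arr, f);		/* f is 0x13 again */
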
1319
1320#endif
diff --git a/src/lib/libcrypto/bn/bn_kron.c b/src/lib/libcrypto/bn/bn_kron.c
deleted file mode 100644
index 274da5d186..0000000000
--- a/src/lib/libcrypto/bn/bn_kron.c
+++ /dev/null
@@ -1,185 +0,0 @@
1/* $OpenBSD: bn_kron.c,v 1.6 2015/02/09 15:49:22 jsing Exp $ */
2/* ====================================================================
3 * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 * This product includes cryptographic software written by Eric Young
51 * (eay@cryptsoft.com). This product includes software written by Tim
52 * Hudson (tjh@cryptsoft.com).
53 *
54 */
55
56#include "bn_lcl.h"
57
58/* least significant word */
59#define BN_lsw(n) (((n)->top == 0) ? (BN_ULONG) 0 : (n)->d[0])
60
61/* Returns -2 for errors because both -1 and 0 are valid results. */
62int
63BN_kronecker(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
64{
65 int i;
66 int ret = -2; /* avoid 'uninitialized' warning */
67 int err = 0;
68 BIGNUM *A, *B, *tmp;
69
70 /* In 'tab', only odd-indexed entries are relevant:
71 * For any odd BIGNUM n,
72 * tab[BN_lsw(n) & 7]
73 * is $(-1)^{(n^2-1)/8}$ (using TeX notation).
74 * Note that the sign of n does not matter.
75 */
76 static const int tab[8] = {0, 1, 0, -1, 0, -1, 0, 1};
77
78 bn_check_top(a);
79 bn_check_top(b);
80
81 BN_CTX_start(ctx);
82 if ((A = BN_CTX_get(ctx)) == NULL)
83 goto end;
84 if ((B = BN_CTX_get(ctx)) == NULL)
85 goto end;
86
87 err = !BN_copy(A, a);
88 if (err)
89 goto end;
90 err = !BN_copy(B, b);
91 if (err)
92 goto end;
93
94 /*
 95 * Kronecker symbol, implemented according to Henri Cohen,
96 * "A Course in Computational Algebraic Number Theory"
97 * (algorithm 1.4.10).
98 */
99
100 /* Cohen's step 1: */
101
102 if (BN_is_zero(B)) {
103 ret = BN_abs_is_word(A, 1);
104 goto end;
105 }
106
107 /* Cohen's step 2: */
108
109 if (!BN_is_odd(A) && !BN_is_odd(B)) {
110 ret = 0;
111 goto end;
112 }
113
114 /* now B is non-zero */
115 i = 0;
116 while (!BN_is_bit_set(B, i))
117 i++;
118 err = !BN_rshift(B, B, i);
119 if (err)
120 goto end;
121 if (i & 1) {
122 /* i is odd */
123 /* (thus B was even, thus A must be odd!) */
124
125 /* set 'ret' to $(-1)^{(A^2-1)/8}$ */
126 ret = tab[BN_lsw(A) & 7];
127 } else {
128 /* i is even */
129 ret = 1;
130 }
131
132 if (B->neg) {
133 B->neg = 0;
134 if (A->neg)
135 ret = -ret;
136 }
137
138 /* now B is positive and odd, so what remains to be done is
139 * to compute the Jacobi symbol (A/B) and multiply it by 'ret' */
140
141 while (1) {
142 /* Cohen's step 3: */
143
144 /* B is positive and odd */
145
146 if (BN_is_zero(A)) {
147 ret = BN_is_one(B) ? ret : 0;
148 goto end;
149 }
150
151 /* now A is non-zero */
152 i = 0;
153 while (!BN_is_bit_set(A, i))
154 i++;
155 err = !BN_rshift(A, A, i);
156 if (err)
157 goto end;
158 if (i & 1) {
159 /* i is odd */
160 /* multiply 'ret' by $(-1)^{(B^2-1)/8}$ */
161 ret = ret * tab[BN_lsw(B) & 7];
162 }
163
164 /* Cohen's step 4: */
165 /* multiply 'ret' by $(-1)^{(A-1)(B-1)/4}$ */
166 if ((A->neg ? ~BN_lsw(A) : BN_lsw(A)) & BN_lsw(B) & 2)
167 ret = -ret;
168
169 /* (A, B) := (B mod |A|, |A|) */
170 err = !BN_nnmod(B, B, A, ctx);
171 if (err)
172 goto end;
173 tmp = A;
174 A = B;
175 B = tmp;
176 tmp->neg = 0;
177 }
178
179end:
180 BN_CTX_end(ctx);
181 if (err)
182 return -2;
183 else
184 return ret;
185}
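
A small worked check (illustrative, error handling elided): 15 = 3 * 5, so the Kronecker symbol (2/15) factors as (2/3)(2/5) = (-1)(-1) = 1, and the function agrees:

	BN_CTX *ctx = BN_CTX_new();
	BIGNUM *a = BN_new(), *b = BN_new();
	int k;

	BN_set_word(a, 2);
	BN_set_word(b, 15);
	k = BN_kronecker(a, b, ctx);	/* k == 1; -2 would signal an error */
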
diff --git a/src/lib/libcrypto/bn/bn_lcl.h b/src/lib/libcrypto/bn/bn_lcl.h
deleted file mode 100644
index a76ba4149f..0000000000
--- a/src/lib/libcrypto/bn/bn_lcl.h
+++ /dev/null
@@ -1,484 +0,0 @@
1/* $OpenBSD: bn_lcl.h,v 1.21 2014/10/28 07:35:58 jsg Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58/* ====================================================================
59 * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
60 *
61 * Redistribution and use in source and binary forms, with or without
62 * modification, are permitted provided that the following conditions
63 * are met:
64 *
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 *
68 * 2. Redistributions in binary form must reproduce the above copyright
69 * notice, this list of conditions and the following disclaimer in
70 * the documentation and/or other materials provided with the
71 * distribution.
72 *
73 * 3. All advertising materials mentioning features or use of this
74 * software must display the following acknowledgment:
75 * "This product includes software developed by the OpenSSL Project
76 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
77 *
78 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
79 * endorse or promote products derived from this software without
80 * prior written permission. For written permission, please contact
81 * openssl-core@openssl.org.
82 *
83 * 5. Products derived from this software may not be called "OpenSSL"
84 * nor may "OpenSSL" appear in their names without prior written
85 * permission of the OpenSSL Project.
86 *
87 * 6. Redistributions of any form whatsoever must retain the following
88 * acknowledgment:
89 * "This product includes software developed by the OpenSSL Project
90 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
91 *
92 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
93 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
95 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
96 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
97 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
98 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
99 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
101 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
103 * OF THE POSSIBILITY OF SUCH DAMAGE.
104 * ====================================================================
105 *
106 * This product includes cryptographic software written by Eric Young
107 * (eay@cryptsoft.com). This product includes software written by Tim
108 * Hudson (tjh@cryptsoft.com).
109 *
110 */
111
112#ifndef HEADER_BN_LCL_H
113#define HEADER_BN_LCL_H
114
115#include <openssl/opensslconf.h>
116
117#include <openssl/bn.h>
118
119#ifdef __cplusplus
120extern "C" {
121#endif
122
123
124/*
125 * BN_window_bits_for_exponent_size -- macro for sliding window mod_exp functions
126 *
127 *
128 * For window size 'w' (w >= 2) and a random 'b' bits exponent,
129 * the number of multiplications is a constant plus on average
130 *
131 * 2^(w-1) + (b-w)/(w+1);
132 *
133 * here 2^(w-1) is for precomputing the table (we actually need
134 * entries only for windows that have the lowest bit set), and
135 * (b-w)/(w+1) is an approximation for the expected number of
136 * w-bit windows, not counting the first one.
137 *
138 * Thus we should use
139 *
140 * w >= 6 if b > 671
141 * w = 5 if 671 > b > 239
142 * w = 4 if 239 > b > 79
143 * w = 3 if 79 > b > 23
144 * w <= 2 if 23 > b
145 *
146 * (with draws in between). Very small exponents are often selected
147 * with low Hamming weight, so we use w = 1 for b <= 23.
148 */
149#define BN_window_bits_for_exponent_size(b) \
150 ((b) > 671 ? 6 : \
151 (b) > 239 ? 5 : \
152 (b) > 79 ? 4 : \
153 (b) > 23 ? 3 : 1)
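
For example, a random 1024-bit exponent falls in the w = 6 band: roughly 2^5 + (1024 - 6)/7, about 32 + 145 = 177 multiplications, versus about 16 + 170 = 186 with w = 5, so the larger precomputed table pays for itself.
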
154
155
156/* BN_mod_exp_mont_consttime is based on the assumption that the
157 * L1 data cache line width of the target processor is at least
158 * the following value.
159 */
160#define MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH ( 64 )
161#define MOD_EXP_CTIME_MIN_CACHE_LINE_MASK (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - 1)
162
163/* Window sizes optimized for fixed window size modular exponentiation
164 * algorithm (BN_mod_exp_mont_consttime).
165 *
 166 * To achieve the security goals of BN_mod_exp_mont_consttime, the
167 * maximum size of the window must not exceed
168 * log_2(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH).
169 *
 170 * Window size thresholds are defined for cache line sizes of 32 and 64
 171 * bytes, for which log_2(32) = 5 and log_2(64) = 6 respectively. A
 172 * window size of 7 should only be used on processors with a cache line
 173 * size of 128 bytes or more.
174 */
175#if MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 64
176
177# define BN_window_bits_for_ctime_exponent_size(b) \
178 ((b) > 937 ? 6 : \
179 (b) > 306 ? 5 : \
180 (b) > 89 ? 4 : \
181 (b) > 22 ? 3 : 1)
182# define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (6)
183
184#elif MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 32
185
186# define BN_window_bits_for_ctime_exponent_size(b) \
187 ((b) > 306 ? 5 : \
188 (b) > 89 ? 4 : \
189 (b) > 22 ? 3 : 1)
190# define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (5)
191
192#endif
193
194
195/* Pentium pro 16,16,16,32,64 */
196/* Alpha 16,16,16,16,64 */
197#define BN_MULL_SIZE_NORMAL (16) /* 32 */
198#define BN_MUL_RECURSIVE_SIZE_NORMAL (16) /* 32 less than */
199#define BN_SQR_RECURSIVE_SIZE_NORMAL (16) /* 32 */
200#define BN_MUL_LOW_RECURSIVE_SIZE_NORMAL (32) /* 32 */
201#define BN_MONT_CTX_SET_SIZE_WORD (64) /* 32 */
202
203#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
204/*
205 * BN_UMULT_HIGH section.
206 *
207 * No, I'm not trying to overwhelm you when stating that the
208 * product of N-bit numbers is 2*N bits wide:-) No, I don't expect
209 * you to be impressed when I say that if the compiler doesn't
210 * support 2*N integer type, then you have to replace every N*N
211 * multiplication with 4 (N/2)*(N/2) accompanied by some shifts
212 * and additions which unavoidably results in severe performance
213 * penalties. Of course provided that the hardware is capable of
214 * producing 2*N result... That's when you normally start
215 * considering assembler implementation. However! It should be
216 * pointed out that some CPUs (most notably Alpha, PowerPC and
217 * upcoming IA-64 family:-) provide *separate* instruction
218 * calculating the upper half of the product placing the result
219 * into a general purpose register. Now *if* the compiler supports
220 * inline assembler, then it's not impossible to implement the
221 * "bignum" routines (and have the compiler optimize 'em)
222 * exhibiting "native" performance in C. That's what BN_UMULT_HIGH
223 * macro is about:-)
224 *
225 * <appro@fy.chalmers.se>
226 */
227# if defined(__alpha)
228# if defined(__GNUC__) && __GNUC__>=2
229# define BN_UMULT_HIGH(a,b) ({ \
230 BN_ULONG ret; \
231 asm ("umulh %1,%2,%0" \
232 : "=r"(ret) \
233 : "r"(a), "r"(b)); \
234 ret; })
235# endif /* compiler */
236# elif defined(_ARCH_PPC) && defined(_LP64)
237# if defined(__GNUC__) && __GNUC__>=2
238# define BN_UMULT_HIGH(a,b) ({ \
239 BN_ULONG ret; \
240 asm ("mulhdu %0,%1,%2" \
241 : "=r"(ret) \
242 : "r"(a), "r"(b)); \
243 ret; })
244# endif /* compiler */
245# elif defined(__x86_64) || defined(__x86_64__)
246# if defined(__GNUC__) && __GNUC__>=2
247# define BN_UMULT_HIGH(a,b) ({ \
248 BN_ULONG ret,discard; \
249 asm ("mulq %3" \
250 : "=a"(discard),"=d"(ret) \
251 : "a"(a), "g"(b) \
252 : "cc"); \
253 ret; })
254# define BN_UMULT_LOHI(low,high,a,b) \
255 asm ("mulq %3" \
256 : "=a"(low),"=d"(high) \
257 : "a"(a),"g"(b) \
258 : "cc");
259# endif
260# elif defined(__mips) && defined(_LP64)
261# if defined(__GNUC__) && __GNUC__>=2
262# if __GNUC__>=4 && __GNUC_MINOR__>=4 /* "h" constraint is no more since 4.4 */
263# define BN_UMULT_HIGH(a,b) (((__uint128_t)(a)*(b))>>64)
264# define BN_UMULT_LOHI(low,high,a,b) ({ \
265 __uint128_t ret=(__uint128_t)(a)*(b); \
266 (high)=ret>>64; (low)=ret; })
267# else
268# define BN_UMULT_HIGH(a,b) ({ \
269 BN_ULONG ret; \
270 asm ("dmultu %1,%2" \
271 : "=h"(ret) \
272 : "r"(a), "r"(b) : "l"); \
273 ret; })
274# define BN_UMULT_LOHI(low,high,a,b)\
275 asm ("dmultu %2,%3" \
276 : "=l"(low),"=h"(high) \
277 : "r"(a), "r"(b));
278# endif
279# endif
280# endif /* cpu */
281#endif /* OPENSSL_NO_ASM */
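
On 64-bit targets whose compiler provides a 128-bit integer type, the same effect needs no inline assembler at all; the MIPS branch above already takes this route for GCC >= 4.4. A generic sketch of that portable fallback (assuming __uint128_t is available and BN_ULONG is 64 bits):

	#define BN_UMULT_HIGH(a,b) \
		((BN_ULONG)(((__uint128_t)(a) * (b)) >> 64))
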
282
283/*************************************************************
284 * Using the long long type
285 */
286#define Lw(t) (((BN_ULONG)(t))&BN_MASK2)
287#define Hw(t) (((BN_ULONG)((t)>>BN_BITS2))&BN_MASK2)
288
289#ifdef BN_DEBUG_RAND
290#define bn_clear_top2max(a) \
291 { \
292 int ind = (a)->dmax - (a)->top; \
293 BN_ULONG *ftl = &(a)->d[(a)->top-1]; \
294 for (; ind != 0; ind--) \
295 *(++ftl) = 0x0; \
296 }
297#else
298#define bn_clear_top2max(a)
299#endif
300
301#ifdef BN_LLONG
302#define mul_add(r,a,w,c) { \
303 BN_ULLONG t; \
304 t=(BN_ULLONG)w * (a) + (r) + (c); \
305 (r)= Lw(t); \
306 (c)= Hw(t); \
307 }
308
309#define mul(r,a,w,c) { \
310 BN_ULLONG t; \
311 t=(BN_ULLONG)w * (a) + (c); \
312 (r)= Lw(t); \
313 (c)= Hw(t); \
314 }
315
316#define sqr(r0,r1,a) { \
317 BN_ULLONG t; \
318 t=(BN_ULLONG)(a)*(a); \
319 (r0)=Lw(t); \
320 (r1)=Hw(t); \
321 }
322
323#elif defined(BN_UMULT_LOHI)
324#define mul_add(r,a,w,c) { \
325 BN_ULONG high,low,ret,tmp=(a); \
326 ret = (r); \
327 BN_UMULT_LOHI(low,high,w,tmp); \
328 ret += (c); \
329 (c) = (ret<(c))?1:0; \
330 (c) += high; \
331 ret += low; \
332 (c) += (ret<low)?1:0; \
333 (r) = ret; \
334 }
335
336#define mul(r,a,w,c) { \
337 BN_ULONG high,low,ret,ta=(a); \
338 BN_UMULT_LOHI(low,high,w,ta); \
339 ret = low + (c); \
340 (c) = high; \
341 (c) += (ret<low)?1:0; \
342 (r) = ret; \
343 }
344
345#define sqr(r0,r1,a) { \
346 BN_ULONG tmp=(a); \
347 BN_UMULT_LOHI(r0,r1,tmp,tmp); \
348 }
349
350#elif defined(BN_UMULT_HIGH)
351#define mul_add(r,a,w,c) { \
352 BN_ULONG high,low,ret,tmp=(a); \
353 ret = (r); \
354 high= BN_UMULT_HIGH(w,tmp); \
355 ret += (c); \
356 low = (w) * tmp; \
357 (c) = (ret<(c))?1:0; \
358 (c) += high; \
359 ret += low; \
360 (c) += (ret<low)?1:0; \
361 (r) = ret; \
362 }
363
364#define mul(r,a,w,c) { \
365 BN_ULONG high,low,ret,ta=(a); \
366 low = (w) * ta; \
367 high= BN_UMULT_HIGH(w,ta); \
368 ret = low + (c); \
369 (c) = high; \
370 (c) += (ret<low)?1:0; \
371 (r) = ret; \
372 }
373
374#define sqr(r0,r1,a) { \
375 BN_ULONG tmp=(a); \
376 (r0) = tmp * tmp; \
377 (r1) = BN_UMULT_HIGH(tmp,tmp); \
378 }
379
380#else
381/*************************************************************
382 * No long long type
383 */
384
385#define LBITS(a) ((a)&BN_MASK2l)
386#define HBITS(a) (((a)>>BN_BITS4)&BN_MASK2l)
387#define L2HBITS(a) (((a)<<BN_BITS4)&BN_MASK2)
388
389#define mul64(l,h,bl,bh) \
390 { \
391 BN_ULONG m,m1,lt,ht; \
392 \
393 lt=l; \
394 ht=h; \
395 m =(bh)*(lt); \
396 lt=(bl)*(lt); \
397 m1=(bl)*(ht); \
398 ht =(bh)*(ht); \
399 m=(m+m1)&BN_MASK2; if (m < m1) ht+=L2HBITS((BN_ULONG)1); \
400 ht+=HBITS(m); \
401 m1=L2HBITS(m); \
402 lt=(lt+m1)&BN_MASK2; if (lt < m1) ht++; \
403 (l)=lt; \
404 (h)=ht; \
405 }
406
407#define sqr64(lo,ho,in) \
408 { \
409 BN_ULONG l,h,m; \
410 \
411 h=(in); \
412 l=LBITS(h); \
413 h=HBITS(h); \
414 m =(l)*(h); \
415 l*=l; \
416 h*=h; \
417 h+=(m&BN_MASK2h1)>>(BN_BITS4-1); \
418 m =(m&BN_MASK2l)<<(BN_BITS4+1); \
419 l=(l+m)&BN_MASK2; if (l < m) h++; \
420 (lo)=l; \
421 (ho)=h; \
422 }
423
424#define mul_add(r,a,bl,bh,c) { \
425 BN_ULONG l,h; \
426 \
427 h= (a); \
428 l=LBITS(h); \
429 h=HBITS(h); \
430 mul64(l,h,(bl),(bh)); \
431 \
432 /* non-multiply part */ \
433 l=(l+(c))&BN_MASK2; if (l < (c)) h++; \
434 (c)=(r); \
435 l=(l+(c))&BN_MASK2; if (l < (c)) h++; \
436 (c)=h&BN_MASK2; \
437 (r)=l; \
438 }
439
440#define mul(r,a,bl,bh,c) { \
441 BN_ULONG l,h; \
442 \
443 h= (a); \
444 l=LBITS(h); \
445 h=HBITS(h); \
446 mul64(l,h,(bl),(bh)); \
447 \
448 /* non-multiply part */ \
449 l+=(c); if ((l&BN_MASK2) < (c)) h++; \
450 (c)=h&BN_MASK2; \
451 (r)=l&BN_MASK2; \
452 }
453#endif /* !BN_LLONG */
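A minimal standalone sketch of what the BN_LLONG variant of mul_add() above computes, assuming a 32-bit BN_ULONG and a 64-bit unsigned long long; the names here are illustrative, not part of the library:

#include <stdint.h>
#include <stdio.h>

typedef uint32_t word;	/* stand-in for a 32-bit BN_ULONG */

/* Update r and c so that (c, r) == w * a + r_in + c_in, exactly the
 * Lw()/Hw() split performed by the BN_LLONG mul_add() above. */
static void
mul_add_demo(word *r, word a, word w, word *c)
{
	uint64_t t = (uint64_t)w * a + *r + *c;

	*r = (word)t;		/* Lw(t): low word */
	*c = (word)(t >> 32);	/* Hw(t): high word */
}

int
main(void)
{
	word r = 0x80000000, c = 1;

	mul_add_demo(&r, 0xffffffff, 0xffffffff, &c);
	printf("r=%08x c=%08x\n", r, c);	/* r=80000002 c=fffffffe */
	return 0;
}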
454
455void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb);
456void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
457void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
458void bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, int n, BN_ULONG *tmp);
459void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a);
460void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a);
461int bn_cmp_words(const BN_ULONG *a, const BN_ULONG *b, int n);
462int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b,
463 int cl, int dl);
464void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
465 int dna, int dnb, BN_ULONG *t);
466void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b,
467 int n, int tna, int tnb, BN_ULONG *t);
468void bn_sqr_recursive(BN_ULONG *r, const BN_ULONG *a, int n2, BN_ULONG *t);
469void bn_mul_low_normal(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n);
470void bn_mul_low_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
471 BN_ULONG *t);
472void bn_mul_high(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, BN_ULONG *l, int n2,
473 BN_ULONG *t);
474BN_ULONG bn_add_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
475 int cl, int dl);
476BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
477 int cl, int dl);
478int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num);
479
480#ifdef __cplusplus
481}
482#endif
483
484#endif
diff --git a/src/lib/libcrypto/bn/bn_lib.c b/src/lib/libcrypto/bn/bn_lib.c
deleted file mode 100644
index d0cb49cd1e..0000000000
--- a/src/lib/libcrypto/bn/bn_lib.c
+++ /dev/null
@@ -1,883 +0,0 @@
1/* $OpenBSD: bn_lib.c,v 1.33 2014/07/12 16:03:36 miod Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscape's SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are adhered to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the routines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publicly available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#ifndef BN_DEBUG
60# undef NDEBUG /* avoid conflicting definitions */
61# define NDEBUG
62#endif
63
64#include <assert.h>
65#include <limits.h>
66#include <stdio.h>
67#include <string.h>
68
69#include <openssl/opensslconf.h>
70
71#include <openssl/err.h>
72
73#include "bn_lcl.h"
74
75/* This stuff appears to be completely unused, so is deprecated */
76#ifndef OPENSSL_NO_DEPRECATED
77/* For a 32 bit machine
78 * 2 - 4 == 128
79 * 3 - 8 == 256
80 * 4 - 16 == 512
81 * 5 - 32 == 1024
82 * 6 - 64 == 2048
83 * 7 - 128 == 4096
84 * 8 - 256 == 8192
85 */
86static int bn_limit_bits = 0;
87static int bn_limit_num = 8; /* (1<<bn_limit_bits) */
88static int bn_limit_bits_low = 0;
89static int bn_limit_num_low = 8; /* (1<<bn_limit_bits_low) */
90static int bn_limit_bits_high = 0;
91static int bn_limit_num_high = 8; /* (1<<bn_limit_bits_high) */
92static int bn_limit_bits_mont = 0;
93static int bn_limit_num_mont = 8; /* (1<<bn_limit_bits_mont) */
94
95void
96BN_set_params(int mult, int high, int low, int mont)
97{
98 if (mult >= 0) {
99 if (mult > (int)(sizeof(int) * 8) - 1)
100 mult = sizeof(int) * 8 - 1;
101 bn_limit_bits = mult;
102 bn_limit_num = 1 << mult;
103 }
104 if (high >= 0) {
105 if (high > (int)(sizeof(int) * 8) - 1)
106 high = sizeof(int) * 8 - 1;
107 bn_limit_bits_high = high;
108 bn_limit_num_high = 1 << high;
109 }
110 if (low >= 0) {
111 if (low > (int)(sizeof(int) * 8) - 1)
112 low = sizeof(int) * 8 - 1;
113 bn_limit_bits_low = low;
114 bn_limit_num_low = 1 << low;
115 }
116 if (mont >= 0) {
117 if (mont > (int)(sizeof(int) * 8) - 1)
118 mont = sizeof(int) * 8 - 1;
119 bn_limit_bits_mont = mont;
120 bn_limit_num_mont = 1 << mont;
121 }
122}
123
124int
125BN_get_params(int which)
126{
127 if (which == 0)
128 return (bn_limit_bits);
129 else if (which == 1)
130 return (bn_limit_bits_high);
131 else if (which == 2)
132 return (bn_limit_bits_low);
133 else if (which == 3)
134 return (bn_limit_bits_mont);
135 else
136 return (0);
137}
138#endif
139
140const BIGNUM *
141BN_value_one(void)
142{
143 static const BN_ULONG data_one = 1L;
144 static const BIGNUM const_one = {
145 (BN_ULONG *)&data_one, 1, 1, 0, BN_FLG_STATIC_DATA
146 };
147
148 return (&const_one);
149}
150
151int
152BN_num_bits_word(BN_ULONG l)
153{
154 static const unsigned char bits[256] = {
155 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
156 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
157 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
158 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
159 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
160 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
161 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
162 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
163 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
164 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
165 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
166 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
167 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
168 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
169 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
170 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
171 };
172
173#ifdef _LP64
174 if (l & 0xffffffff00000000L) {
175 if (l & 0xffff000000000000L) {
176 if (l & 0xff00000000000000L) {
177 return (bits[(int)(l >> 56)] + 56);
178 } else
179 return (bits[(int)(l >> 48)] + 48);
180 } else {
181 if (l & 0x0000ff0000000000L) {
182 return (bits[(int)(l >> 40)] + 40);
183 } else
184 return (bits[(int)(l >> 32)] + 32);
185 }
186 } else
187#endif
188 {
189 if (l & 0xffff0000L) {
190 if (l & 0xff000000L)
191 return (bits[(int)(l >> 24L)] + 24);
192 else
193 return (bits[(int)(l >> 16L)] + 16);
194 } else {
195 if (l & 0xff00L)
196 return (bits[(int)(l >> 8)] + 8);
197 else
198 return (bits[(int)(l)]);
199 }
200 }
201}
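The same table-plus-byte-probe idea, collapsed into a self-contained sketch for a plain uint32_t; the bits[] table above is just the 8-bit bit-length function below, precomputed:

#include <stdint.h>

/* Bit length of an 8-bit value: the generating rule of the bits[]
 * table above, i.e. bits8(x) == floor(log2(x)) + 1 for x > 0. */
static int
bits8(uint8_t x)
{
	int n = 0;

	while (x != 0) {
		n++;
		x >>= 1;
	}
	return n;
}

/* Equivalent of BN_num_bits_word() for a 32-bit word: locate the
 * highest non-zero byte, then add its bit length to its offset. */
static int
num_bits_u32(uint32_t l)
{
	if (l & 0xffff0000)
		return (l & 0xff000000) ?
		    bits8(l >> 24) + 24 : bits8(l >> 16) + 16;
	if (l & 0xff00)
		return bits8(l >> 8) + 8;
	return bits8(l);
}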
202
203int
204BN_num_bits(const BIGNUM *a)
205{
206 int i = a->top - 1;
207
208 bn_check_top(a);
209
210 if (BN_is_zero(a))
211 return 0;
212 return ((i * BN_BITS2) + BN_num_bits_word(a->d[i]));
213}
214
215void
216BN_clear_free(BIGNUM *a)
217{
218 int i;
219
220 if (a == NULL)
221 return;
222 bn_check_top(a);
223 if (a->d != NULL && !(BN_get_flags(a, BN_FLG_STATIC_DATA))) {
224 OPENSSL_cleanse(a->d, a->dmax * sizeof(a->d[0]));
225 free(a->d);
226 }
227 i = BN_get_flags(a, BN_FLG_MALLOCED);
228 OPENSSL_cleanse(a, sizeof(BIGNUM));
229 if (i)
230 free(a);
231}
232
233void
234BN_free(BIGNUM *a)
235{
236 BN_clear_free(a);
237}
238
239void
240BN_init(BIGNUM *a)
241{
242 memset(a, 0, sizeof(BIGNUM));
243 bn_check_top(a);
244}
245
246BIGNUM *
247BN_new(void)
248{
249 BIGNUM *ret;
250
251 if ((ret = malloc(sizeof(BIGNUM))) == NULL) {
252 BNerr(BN_F_BN_NEW, ERR_R_MALLOC_FAILURE);
253 return (NULL);
254 }
255 ret->flags = BN_FLG_MALLOCED;
256 ret->top = 0;
257 ret->neg = 0;
258 ret->dmax = 0;
259 ret->d = NULL;
260 bn_check_top(ret);
261 return (ret);
262}
263
264/* This is used both by bn_expand2() and bn_dup_expand() */
265/* The caller MUST check that words > b->dmax before calling this */
266static BN_ULONG *
267bn_expand_internal(const BIGNUM *b, int words)
268{
269 BN_ULONG *A, *a = NULL;
270 const BN_ULONG *B;
271 int i;
272
273 bn_check_top(b);
274
275 if (words > (INT_MAX/(4*BN_BITS2))) {
276 BNerr(BN_F_BN_EXPAND_INTERNAL, BN_R_BIGNUM_TOO_LONG);
277 return NULL;
278 }
279 if (BN_get_flags(b, BN_FLG_STATIC_DATA)) {
280 BNerr(BN_F_BN_EXPAND_INTERNAL,
281 BN_R_EXPAND_ON_STATIC_BIGNUM_DATA);
282 return (NULL);
283 }
284 a = A = reallocarray(NULL, words, sizeof(BN_ULONG));
285 if (A == NULL) {
286 BNerr(BN_F_BN_EXPAND_INTERNAL, ERR_R_MALLOC_FAILURE);
287 return (NULL);
288 }
289#if 1
290 B = b->d;
291 /* Check if the previous number needs to be copied */
292 if (B != NULL) {
293 for (i = b->top >> 2; i > 0; i--, A += 4, B += 4) {
294 /*
295 * The fact that the loop is unrolled
296 * 4-wise is a tribute to Intel. It's
297 * the one that doesn't have enough
298 * registers to accommodate more data.
299 * I'd unroll it 8-wise otherwise:-)
300 *
301 * <appro@fy.chalmers.se>
302 */
303 BN_ULONG a0, a1, a2, a3;
304 a0 = B[0];
305 a1 = B[1];
306 a2 = B[2];
307 a3 = B[3];
308 A[0] = a0;
309 A[1] = a1;
310 A[2] = a2;
311 A[3] = a3;
312 }
313 switch (b->top & 3) {
314 case 3:
315 A[2] = B[2];
316 case 2:
317 A[1] = B[1];
318 case 1:
319 A[0] = B[0];
320 }
321 }
322
323#else
324 memset(A, 0, sizeof(BN_ULONG) * words);
325 memcpy(A, b->d, sizeof(b->d[0]) * b->top);
326#endif
327
328 return (a);
329}
330
331/* This is an internal function that can be used instead of bn_expand2()
332 * when a BIGNUM needs to be copied and expanded at the same time,
333 * rather than only having its data part expanded in place.
334 * Especially useful when needing to expand BIGNUMs that are declared
335 * 'const' and should therefore not be changed.
336 * The reason to use this instead of a BN_dup() followed by a bn_expand2()
337 * is memory allocation overhead. A BN_dup() followed by a bn_expand2()
338 * will allocate new memory for the BIGNUM data twice, and free it once,
339 * while bn_dup_expand() makes sure allocation is made only once.
340 */
341
342#ifndef OPENSSL_NO_DEPRECATED
343BIGNUM *
344bn_dup_expand(const BIGNUM *b, int words)
345{
346 BIGNUM *r = NULL;
347
348 bn_check_top(b);
349
350 /* This function does not work if
351 * words <= b->dmax && top < words
352 * because BN_dup() does not preserve 'dmax'!
353 * (But bn_dup_expand() is not used anywhere yet.)
354 */
355
356 if (words > b->dmax) {
357 BN_ULONG *a = bn_expand_internal(b, words);
358
359 if (a) {
360 r = BN_new();
361 if (r) {
362 r->top = b->top;
363 r->dmax = words;
364 r->neg = b->neg;
365 r->d = a;
366 } else {
367 /* r == NULL, BN_new failure */
368 free(a);
369 }
370 }
371 /* If a == NULL, there was an error in allocation in
372 bn_expand_internal(), and NULL should be returned */
373 } else {
374 r = BN_dup(b);
375 }
376
377 bn_check_top(r);
378 return r;
379}
380#endif
381
382/* This is an internal function that should not be used in applications.
383 * It ensures that 'b' has enough room for a 'words' word number;
384 * words of b->d above b->top are left uninitialised, as b->top alone defines validity.
385 * It is mostly used by the various BIGNUM routines. If there is an error,
386 * NULL is returned. If not, 'b' is returned. */
387
388BIGNUM *
389bn_expand2(BIGNUM *b, int words)
390{
391 bn_check_top(b);
392
393 if (words > b->dmax) {
394 BN_ULONG *a = bn_expand_internal(b, words);
395 if (!a)
396 return NULL;
397 if (b->d) {
398 OPENSSL_cleanse(b->d, b->dmax * sizeof(b->d[0]));
399 free(b->d);
400 }
401 b->d = a;
402 b->dmax = words;
403 }
404
405/* None of this should be necessary because of what b->top means! */
406#if 0
407 /* NB: bn_wexpand() calls this only if the BIGNUM really has to grow */
408 if (b->top < b->dmax) {
409 int i;
410 BN_ULONG *A = &(b->d[b->top]);
411 for (i = (b->dmax - b->top) >> 3; i > 0; i--, A += 8) {
412 A[0] = 0;
413 A[1] = 0;
414 A[2] = 0;
415 A[3] = 0;
416 A[4] = 0;
417 A[5] = 0;
418 A[6] = 0;
419 A[7] = 0;
420 }
421 for (i = (b->dmax - b->top)&7; i > 0; i--, A++)
422 A[0] = 0;
423 assert(A == &(b->d[b->dmax]));
424 }
425#endif
426 bn_check_top(b);
427 return b;
428}
429
430BIGNUM *
431BN_dup(const BIGNUM *a)
432{
433 BIGNUM *t;
434
435 if (a == NULL)
436 return NULL;
437 bn_check_top(a);
438
439 t = BN_new();
440 if (t == NULL)
441 return NULL;
442 if (!BN_copy(t, a)) {
443 BN_free(t);
444 return NULL;
445 }
446 bn_check_top(t);
447 return t;
448}
449
450BIGNUM *
451BN_copy(BIGNUM *a, const BIGNUM *b)
452{
453 int i;
454 BN_ULONG *A;
455 const BN_ULONG *B;
456
457 bn_check_top(b);
458
459 if (a == b)
460 return (a);
461 if (bn_wexpand(a, b->top) == NULL)
462 return (NULL);
463
464#if 1
465 A = a->d;
466 B = b->d;
467 for (i = b->top >> 2; i > 0; i--, A += 4, B += 4) {
468 BN_ULONG a0, a1, a2, a3;
469 a0 = B[0];
470 a1 = B[1];
471 a2 = B[2];
472 a3 = B[3];
473 A[0] = a0;
474 A[1] = a1;
475 A[2] = a2;
476 A[3] = a3;
477 }
478 switch (b->top & 3) {
479 case 3:
480 A[2] = B[2];
481 case 2:
482 A[1] = B[1];
483 case 1:
484 A[0] = B[0];
485 }
486#else
487 memcpy(a->d, b->d, sizeof(b->d[0]) * b->top);
488#endif
489
490 a->top = b->top;
491 a->neg = b->neg;
492 bn_check_top(a);
493 return (a);
494}
495
496void
497BN_swap(BIGNUM *a, BIGNUM *b)
498{
499 int flags_old_a, flags_old_b;
500 BN_ULONG *tmp_d;
501 int tmp_top, tmp_dmax, tmp_neg;
502
503 bn_check_top(a);
504 bn_check_top(b);
505
506 flags_old_a = a->flags;
507 flags_old_b = b->flags;
508
509 tmp_d = a->d;
510 tmp_top = a->top;
511 tmp_dmax = a->dmax;
512 tmp_neg = a->neg;
513
514 a->d = b->d;
515 a->top = b->top;
516 a->dmax = b->dmax;
517 a->neg = b->neg;
518
519 b->d = tmp_d;
520 b->top = tmp_top;
521 b->dmax = tmp_dmax;
522 b->neg = tmp_neg;
523
524 a->flags = (flags_old_a & BN_FLG_MALLOCED) |
525 (flags_old_b & BN_FLG_STATIC_DATA);
526 b->flags = (flags_old_b & BN_FLG_MALLOCED) |
527 (flags_old_a & BN_FLG_STATIC_DATA);
528 bn_check_top(a);
529 bn_check_top(b);
530}
531
532void
533BN_clear(BIGNUM *a)
534{
535 bn_check_top(a);
536 if (a->d != NULL)
537 memset(a->d, 0, a->dmax * sizeof(a->d[0]));
538 a->top = 0;
539 a->neg = 0;
540}
541
542BN_ULONG
543BN_get_word(const BIGNUM *a)
544{
545 if (a->top > 1)
546 return BN_MASK2;
547 else if (a->top == 1)
548 return a->d[0];
549 /* a->top == 0 */
550 return 0;
551}
552
553int
554BN_set_word(BIGNUM *a, BN_ULONG w)
555{
556 bn_check_top(a);
557 if (bn_expand(a, (int)sizeof(BN_ULONG) * 8) == NULL)
558 return (0);
559 a->neg = 0;
560 a->d[0] = w;
561 a->top = (w ? 1 : 0);
562 bn_check_top(a);
563 return (1);
564}
565
566BIGNUM *
567BN_bin2bn(const unsigned char *s, int len, BIGNUM *ret)
568{
569 unsigned int i, m;
570 unsigned int n;
571 BN_ULONG l;
572 BIGNUM *bn = NULL;
573
574 if (ret == NULL)
575 ret = bn = BN_new();
576 if (ret == NULL)
577 return (NULL);
578 bn_check_top(ret);
579 l = 0;
580 n = len;
581 if (n == 0) {
582 ret->top = 0;
583 return (ret);
584 }
585 i = ((n - 1) / BN_BYTES) + 1;
586 m = ((n - 1) % (BN_BYTES));
587 if (bn_wexpand(ret, (int)i) == NULL) {
588 BN_free(bn);
589 return NULL;
590 }
591 ret->top = i;
592 ret->neg = 0;
593 while (n--) {
594 l = (l << 8L) | *(s++);
595 if (m-- == 0) {
596 ret->d[--i] = l;
597 l = 0;
598 m = BN_BYTES - 1;
599 }
600 }
601 /* call bn_correct_top() since the input may have a leading zero
602 * byte (e.g. one added to avoid a set top bit reading as -ve) */
603 bn_correct_top(ret);
604 return (ret);
605}
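The (i, m, l) bookkeeping in BN_bin2bn() is easier to follow at a fixed word size; a standalone sketch assuming 4-byte words instead of the BN_BYTES abstraction (assumes len >= 1, since the zero-length case is handled separately above):

#include <stdint.h>

/* Decode len big-endian bytes into 32-bit words stored least
 * significant word first, mirroring the loop above: i counts words
 * down from the top, m counts the bytes left in the current word. */
static void
bin2words(const unsigned char *s, unsigned int len, uint32_t *d)
{
	unsigned int i = (len - 1) / 4 + 1;	/* number of words */
	unsigned int m = (len - 1) % 4;		/* top word holds m+1 bytes */
	uint32_t l = 0;

	while (len--) {
		l = (l << 8) | *s++;
		if (m-- == 0) {
			d[--i] = l;	/* word complete: store it */
			l = 0;
			m = 3;		/* full 4-byte words from here on */
		}
	}
}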
606
607/* ignore negative */
608int
609BN_bn2bin(const BIGNUM *a, unsigned char *to)
610{
611 int n, i;
612 BN_ULONG l;
613
614 bn_check_top(a);
615 n = i = BN_num_bytes(a);
616 while (i--) {
617 l = a->d[i / BN_BYTES];
618 *(to++) = (unsigned char)(l >> (8 * (i % BN_BYTES))) & 0xff;
619 }
620 return (n);
621}
622
623int
624BN_ucmp(const BIGNUM *a, const BIGNUM *b)
625{
626 int i;
627 BN_ULONG t1, t2, *ap, *bp;
628
629 bn_check_top(a);
630 bn_check_top(b);
631
632 i = a->top - b->top;
633 if (i != 0)
634 return (i);
635 ap = a->d;
636 bp = b->d;
637 for (i = a->top - 1; i >= 0; i--) {
638 t1 = ap[i];
639 t2 = bp[i];
640 if (t1 != t2)
641 return ((t1 > t2) ? 1 : -1);
642 }
643 return (0);
644}
645
646int
647BN_cmp(const BIGNUM *a, const BIGNUM *b)
648{
649 int i;
650 int gt, lt;
651 BN_ULONG t1, t2;
652
653 if ((a == NULL) || (b == NULL)) {
654 if (a != NULL)
655 return (-1);
656 else if (b != NULL)
657 return (1);
658 else
659 return (0);
660 }
661
662 bn_check_top(a);
663 bn_check_top(b);
664
665 if (a->neg != b->neg) {
666 if (a->neg)
667 return (-1);
668 else
669 return (1);
670 }
671 if (a->neg == 0) {
672 gt = 1;
673 lt = -1;
674 } else {
675 gt = -1;
676 lt = 1;
677 }
678
679 if (a->top > b->top)
680 return (gt);
681 if (a->top < b->top)
682 return (lt);
683 for (i = a->top - 1; i >= 0; i--) {
684 t1 = a->d[i];
685 t2 = b->d[i];
686 if (t1 > t2)
687 return (gt);
688 if (t1 < t2)
689 return (lt);
690 }
691 return (0);
692}
693
694int
695BN_set_bit(BIGNUM *a, int n)
696{
697 int i, j, k;
698
699 if (n < 0)
700 return 0;
701
702 i = n / BN_BITS2;
703 j = n % BN_BITS2;
704 if (a->top <= i) {
705 if (bn_wexpand(a, i + 1) == NULL)
706 return (0);
707 for (k = a->top; k < i + 1; k++)
708 a->d[k] = 0;
709 a->top = i + 1;
710 }
711
712 a->d[i] |= (((BN_ULONG)1) << j);
713 bn_check_top(a);
714 return (1);
715}
716
717int
718BN_clear_bit(BIGNUM *a, int n)
719{
720 int i, j;
721
722 bn_check_top(a);
723 if (n < 0)
724 return 0;
725
726 i = n / BN_BITS2;
727 j = n % BN_BITS2;
728 if (a->top <= i)
729 return (0);
730
731 a->d[i] &= (~(((BN_ULONG)1) << j));
732 bn_correct_top(a);
733 return (1);
734}
735
736int
737BN_is_bit_set(const BIGNUM *a, int n)
738{
739 int i, j;
740
741 bn_check_top(a);
742 if (n < 0)
743 return 0;
744 i = n / BN_BITS2;
745 j = n % BN_BITS2;
746 if (a->top <= i)
747 return 0;
748 return (int)(((a->d[i]) >> j) & ((BN_ULONG)1));
749}
750
751int
752BN_mask_bits(BIGNUM *a, int n)
753{
754 int b, w;
755
756 bn_check_top(a);
757 if (n < 0)
758 return 0;
759
760 w = n / BN_BITS2;
761 b = n % BN_BITS2;
762 if (w >= a->top)
763 return 0;
764 if (b == 0)
765 a->top = w;
766 else {
767 a->top = w + 1;
768 a->d[w] &= ~(BN_MASK2 << b);
769 }
770 bn_correct_top(a);
771 return (1);
772}
773
774void
775BN_set_negative(BIGNUM *a, int b)
776{
777 if (b && !BN_is_zero(a))
778 a->neg = 1;
779 else
780 a->neg = 0;
781}
782
783int
784bn_cmp_words(const BN_ULONG *a, const BN_ULONG *b, int n)
785{
786 int i;
787 BN_ULONG aa, bb;
788
789 aa = a[n - 1];
790 bb = b[n - 1];
791 if (aa != bb)
792 return ((aa > bb) ? 1 : -1);
793 for (i = n - 2; i >= 0; i--) {
794 aa = a[i];
795 bb = b[i];
796 if (aa != bb)
797 return ((aa > bb) ? 1 : -1);
798 }
799 return (0);
800}
801
802/* Here follows a specialised variant of bn_cmp_words(). It has the
803 property of performing the operation on arrays of different sizes.
804 The sizes of those arrays are expressed through cl, which is the
805 common length ( basically, min(len(a),len(b)) ), and dl, which is the
806 delta between the two lengths, calculated as len(a)-len(b).
807 All lengths are the number of BN_ULONGs... */
808
809int
810bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b, int cl, int dl)
811{
812 int n, i;
813
814 n = cl - 1;
815
816 if (dl < 0) {
817 for (i = dl; i < 0; i++) {
818 if (b[n - i] != 0)
819 return -1; /* a < b */
820 }
821 }
822 if (dl > 0) {
823 for (i = dl; i > 0; i--) {
824 if (a[n + i] != 0)
825 return 1; /* a > b */
826 }
827 }
828 return bn_cmp_words(a, b, cl);
829}
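A concrete instance of the cl/dl convention (illustrative values; assumes compilation inside libcrypto where bn_lcl.h and bn_cmp_part_words() are visible):

#include "bn_lcl.h"

/* a has 5 words, b has 3, so cl = min(5,3) = 3 and dl = 5 - 3 = 2. */
static int
cmp_example(void)
{
	BN_ULONG a[5] = { 1, 2, 3, 0, 7 };
	BN_ULONG b[3] = { 1, 2, 3 };

	/* n = cl - 1 = 2, so the dl > 0 loop probes a[4] then a[3];
	 * a[4] == 7 is non-zero, hence the result is 1 (a > b). */
	return bn_cmp_part_words(a, b, 3, 2);
}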
830
831/*
832 * Constant-time conditional swap of a and b.
833 * a and b are swapped if condition is not 0. The code assumes that at
834 * most one bit of condition is set. nwords is the number of words to
835 * swap; at least nwords must be allocated in both a and b, and no more
836 * than nwords may be used by either a or b. a and b cannot be the same.
837 */
838void
839BN_consttime_swap(BN_ULONG condition, BIGNUM *a, BIGNUM *b, int nwords)
840{
841 BN_ULONG t;
842 int i;
843
844 bn_wcheck_size(a, nwords);
845 bn_wcheck_size(b, nwords);
846
847 assert(a != b);
848 assert((condition & (condition - 1)) == 0);
849 assert(sizeof(BN_ULONG) >= sizeof(int));
850
851 condition = ((condition - 1) >> (BN_BITS2 - 1)) - 1;
852
853 t = (a->top ^ b->top) & condition;
854 a->top ^= t;
855 b->top ^= t;
856
857#define BN_CONSTTIME_SWAP(ind) \
858 do { \
859 t = (a->d[ind] ^ b->d[ind]) & condition; \
860 a->d[ind] ^= t; \
861 b->d[ind] ^= t; \
862 } while (0)
863
864
865 switch (nwords) {
866 default:
867 for (i = 10; i < nwords; i++)
868 BN_CONSTTIME_SWAP(i);
869 /* Fallthrough */
870 case 10: BN_CONSTTIME_SWAP(9); /* Fallthrough */
871 case 9: BN_CONSTTIME_SWAP(8); /* Fallthrough */
872 case 8: BN_CONSTTIME_SWAP(7); /* Fallthrough */
873 case 7: BN_CONSTTIME_SWAP(6); /* Fallthrough */
874 case 6: BN_CONSTTIME_SWAP(5); /* Fallthrough */
875 case 5: BN_CONSTTIME_SWAP(4); /* Fallthrough */
876 case 4: BN_CONSTTIME_SWAP(3); /* Fallthrough */
877 case 3: BN_CONSTTIME_SWAP(2); /* Fallthrough */
878 case 2: BN_CONSTTIME_SWAP(1); /* Fallthrough */
879 case 1:
880 BN_CONSTTIME_SWAP(0);
881 }
882#undef BN_CONSTTIME_SWAP
883}
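The mask trick above is worth isolating; a standalone sketch of the same branch-free conditional swap on plain 64-bit words (uint64_t standing in for BN_ULONG with BN_BITS2 == 64):

#include <stdint.h>

/* Swap a[0..n) and b[0..n) iff bit is 1, without branching on bit:
 * mask becomes all-ones for bit == 1 and all-zeros for bit == 0,
 * via the same ((condition - 1) >> (BN_BITS2 - 1)) - 1 transform. */
static void
consttime_swap(uint64_t bit, uint64_t *a, uint64_t *b, int n)
{
	uint64_t mask = ((bit - 1) >> 63) - 1;
	int i;

	for (i = 0; i < n; i++) {
		uint64_t t = (a[i] ^ b[i]) & mask;

		a[i] ^= t;
		b[i] ^= t;
	}
}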
diff --git a/src/lib/libcrypto/bn/bn_mod.c b/src/lib/libcrypto/bn/bn_mod.c
deleted file mode 100644
index 67bd3541b0..0000000000
--- a/src/lib/libcrypto/bn/bn_mod.c
+++ /dev/null
@@ -1,305 +0,0 @@
1/* $OpenBSD: bn_mod.c,v 1.9 2014/07/12 16:03:36 miod Exp $ */
2/* Includes code written by Lenka Fibikova <fibikova@exp-math.uni-essen.de>
3 * for the OpenSSL project. */
4/* ====================================================================
5 * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 *
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
17 * distribution.
18 *
19 * 3. All advertising materials mentioning features or use of this
20 * software must display the following acknowledgment:
21 * "This product includes software developed by the OpenSSL Project
22 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
23 *
24 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
25 * endorse or promote products derived from this software without
26 * prior written permission. For written permission, please contact
27 * openssl-core@openssl.org.
28 *
29 * 5. Products derived from this software may not be called "OpenSSL"
30 * nor may "OpenSSL" appear in their names without prior written
31 * permission of the OpenSSL Project.
32 *
33 * 6. Redistributions of any form whatsoever must retain the following
34 * acknowledgment:
35 * "This product includes software developed by the OpenSSL Project
36 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
37 *
38 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
39 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
41 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
42 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
44 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
45 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
46 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
47 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
48 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
49 * OF THE POSSIBILITY OF SUCH DAMAGE.
50 * ====================================================================
51 *
52 * This product includes cryptographic software written by Eric Young
53 * (eay@cryptsoft.com). This product includes software written by Tim
54 * Hudson (tjh@cryptsoft.com).
55 *
56 */
57/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
58 * All rights reserved.
59 *
60 * This package is an SSL implementation written
61 * by Eric Young (eay@cryptsoft.com).
62 * The implementation was written so as to conform with Netscape's SSL.
63 *
64 * This library is free for commercial and non-commercial use as long as
65 * the following conditions are adhered to. The following conditions
66 * apply to all code found in this distribution, be it the RC4, RSA,
67 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
68 * included with this distribution is covered by the same copyright terms
69 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
70 *
71 * Copyright remains Eric Young's, and as such any Copyright notices in
72 * the code are not to be removed.
73 * If this package is used in a product, Eric Young should be given attribution
74 * as the author of the parts of the library used.
75 * This can be in the form of a textual message at program startup or
76 * in documentation (online or textual) provided with the package.
77 *
78 * Redistribution and use in source and binary forms, with or without
79 * modification, are permitted provided that the following conditions
80 * are met:
81 * 1. Redistributions of source code must retain the copyright
82 * notice, this list of conditions and the following disclaimer.
83 * 2. Redistributions in binary form must reproduce the above copyright
84 * notice, this list of conditions and the following disclaimer in the
85 * documentation and/or other materials provided with the distribution.
86 * 3. All advertising materials mentioning features or use of this software
87 * must display the following acknowledgement:
88 * "This product includes cryptographic software written by
89 * Eric Young (eay@cryptsoft.com)"
90 * The word 'cryptographic' can be left out if the routines from the library
91 * being used are not cryptographic related :-).
92 * 4. If you include any Windows specific code (or a derivative thereof) from
93 * the apps directory (application code) you must include an acknowledgement:
94 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
95 *
96 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
97 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
98 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
99 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
100 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
101 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
102 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
103 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
104 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
105 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
106 * SUCH DAMAGE.
107 *
108 * The licence and distribution terms for any publicly available version or
109 * derivative of this code cannot be changed. i.e. this code cannot simply be
110 * copied and put under another distribution licence
111 * [including the GNU Public Licence.]
112 */
113
114#include <openssl/err.h>
115
116#include "bn_lcl.h"
117
118int
119BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx)
120{
121 /* like BN_mod, but returns non-negative remainder
122 * (i.e., 0 <= r < |d| always holds) */
123
124 if (!(BN_mod(r, m, d, ctx)))
125 return 0;
126 if (!r->neg)
127 return 1;
128 /* now -|d| < r < 0, so we have to set r := r + |d| */
129 return (d->neg ? BN_sub : BN_add)(r, r, d);
130}
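The effect of BN_nnmod() versus BN_mod() is clearest with a negative dividend; a small usage sketch against the public BN API (error handling elided for brevity):

#include <openssl/bn.h>

/* For m = -7, d = 3: BN_mod() yields -1 (truncating division), while
 * BN_nnmod() yields the canonical representative 2, since -7 = -3*3 + 2. */
static void
nnmod_demo(void)
{
	BN_CTX *ctx = BN_CTX_new();
	BIGNUM *r = BN_new(), *m = BN_new(), *d = BN_new();

	BN_set_word(m, 7);
	BN_set_negative(m, 1);		/* m = -7 */
	BN_set_word(d, 3);

	BN_nnmod(r, m, d, ctx);		/* r == 2 */

	BN_free(r);
	BN_free(m);
	BN_free(d);
	BN_CTX_free(ctx);
}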
131
132int
133BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
134 BN_CTX *ctx)
135{
136 if (!BN_add(r, a, b))
137 return 0;
138 return BN_nnmod(r, r, m, ctx);
139}
140
141/* BN_mod_add variant that may be used if both a and b are non-negative
142 * and less than m */
143int
144BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m)
145{
146 if (!BN_uadd(r, a, b))
147 return 0;
148 if (BN_ucmp(r, m) >= 0)
149 return BN_usub(r, r, m);
150 return 1;
151}
152
153int
154BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
155 BN_CTX *ctx)
156{
157 if (!BN_sub(r, a, b))
158 return 0;
159 return BN_nnmod(r, r, m, ctx);
160}
161
162/* BN_mod_sub variant that may be used if both a and b are non-negative
163 * and less than m */
164int
165BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m)
166{
167 if (!BN_sub(r, a, b))
168 return 0;
169 if (r->neg)
170 return BN_add(r, r, m);
171 return 1;
172}
173
174/* slow but works */
175int
176BN_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
177 BN_CTX *ctx)
178{
179 BIGNUM *t;
180 int ret = 0;
181
182 bn_check_top(a);
183 bn_check_top(b);
184 bn_check_top(m);
185
186 BN_CTX_start(ctx);
187 if ((t = BN_CTX_get(ctx)) == NULL)
188 goto err;
189 if (a == b) {
190 if (!BN_sqr(t, a, ctx))
191 goto err;
192 } else {
193 if (!BN_mul(t, a, b, ctx))
194 goto err;
195 }
196 if (!BN_nnmod(r, t, m, ctx))
197 goto err;
198 bn_check_top(r);
199 ret = 1;
200
201err:
202 BN_CTX_end(ctx);
203 return (ret);
204}
205
206int
207BN_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx)
208{
209 if (!BN_sqr(r, a, ctx))
210 return 0;
211 /* r->neg == 0, thus we don't need BN_nnmod */
212 return BN_mod(r, r, m, ctx);
213}
214
215int
216BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx)
217{
218 if (!BN_lshift1(r, a))
219 return 0;
220 bn_check_top(r);
221 return BN_nnmod(r, r, m, ctx);
222}
223
224/* BN_mod_lshift1 variant that may be used if a is non-negative
225 * and less than m */
226int
227BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m)
228{
229 if (!BN_lshift1(r, a))
230 return 0;
231 bn_check_top(r);
232 if (BN_cmp(r, m) >= 0)
233 return BN_sub(r, r, m);
234 return 1;
235}
236
237int
238BN_mod_lshift(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m, BN_CTX *ctx)
239{
240 BIGNUM *abs_m = NULL;
241 int ret;
242
243 if (!BN_nnmod(r, a, m, ctx))
244 return 0;
245
246 if (m->neg) {
247 abs_m = BN_dup(m);
248 if (abs_m == NULL)
249 return 0;
250 abs_m->neg = 0;
251 }
252
253 ret = BN_mod_lshift_quick(r, r, n, (abs_m ? abs_m : m));
254 bn_check_top(r);
255
256 BN_free(abs_m);
257 return ret;
258}
259
260/* BN_mod_lshift variant that may be used if a is non-negative
261 * and less than m */
262int
263BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m)
264{
265 if (r != a) {
266 if (BN_copy(r, a) == NULL)
267 return 0;
268 }
269
270 while (n > 0) {
271 int max_shift;
272
273 /* 0 < r < m */
274 max_shift = BN_num_bits(m) - BN_num_bits(r);
275 /* max_shift >= 0 */
276
277 if (max_shift < 0) {
278 BNerr(BN_F_BN_MOD_LSHIFT_QUICK, BN_R_INPUT_NOT_REDUCED);
279 return 0;
280 }
281
282 if (max_shift > n)
283 max_shift = n;
284
285 if (max_shift) {
286 if (!BN_lshift(r, r, max_shift))
287 return 0;
288 n -= max_shift;
289 } else {
290 if (!BN_lshift1(r, r))
291 return 0;
292 --n;
293 }
294
295 /* BN_num_bits(r) <= BN_num_bits(m) */
296
297 if (BN_cmp(r, m) >= 0) {
298 if (!BN_sub(r, r, m))
299 return 0;
300 }
301 }
302 bn_check_top(r);
303
304 return 1;
305}
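BN_mod_lshift_quick() above is repeated modular doubling, batched by as many bit positions as provably cannot overshoot; the same loop on machine words (an illustrative sketch, assuming the modulus fits in 63 bits so the shift cannot wrap):

#include <stdint.h>

/* (r << n) mod m for 0 <= r < m < 2^63: double and conditionally
 * reduce, one bit at a time; the BIGNUM version above merely batches
 * several doublings using BN_num_bits() before each reduction. */
static uint64_t
mod_lshift(uint64_t r, int n, uint64_t m)
{
	while (n-- > 0) {
		r <<= 1;
		if (r >= m)
			r -= m;
	}
	return r;
}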
diff --git a/src/lib/libcrypto/bn/bn_mont.c b/src/lib/libcrypto/bn/bn_mont.c
deleted file mode 100644
index 3eb9913a9e..0000000000
--- a/src/lib/libcrypto/bn/bn_mont.c
+++ /dev/null
@@ -1,538 +0,0 @@
1/* $OpenBSD: bn_mont.c,v 1.24 2015/02/09 15:49:22 jsing Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscape's SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are adhered to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the routines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publicly available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58/* ====================================================================
59 * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
60 *
61 * Redistribution and use in source and binary forms, with or without
62 * modification, are permitted provided that the following conditions
63 * are met:
64 *
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 *
68 * 2. Redistributions in binary form must reproduce the above copyright
69 * notice, this list of conditions and the following disclaimer in
70 * the documentation and/or other materials provided with the
71 * distribution.
72 *
73 * 3. All advertising materials mentioning features or use of this
74 * software must display the following acknowledgment:
75 * "This product includes software developed by the OpenSSL Project
76 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
77 *
78 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
79 * endorse or promote products derived from this software without
80 * prior written permission. For written permission, please contact
81 * openssl-core@openssl.org.
82 *
83 * 5. Products derived from this software may not be called "OpenSSL"
84 * nor may "OpenSSL" appear in their names without prior written
85 * permission of the OpenSSL Project.
86 *
87 * 6. Redistributions of any form whatsoever must retain the following
88 * acknowledgment:
89 * "This product includes software developed by the OpenSSL Project
90 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
91 *
92 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
93 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
95 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
96 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
97 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
98 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
99 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
101 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
103 * OF THE POSSIBILITY OF SUCH DAMAGE.
104 * ====================================================================
105 *
106 * This product includes cryptographic software written by Eric Young
107 * (eay@cryptsoft.com). This product includes software written by Tim
108 * Hudson (tjh@cryptsoft.com).
109 *
110 */
111
112/*
113 * Details about Montgomery multiplication algorithms can be found at
114 * http://security.ece.orst.edu/publications.html, e.g.
115 * http://security.ece.orst.edu/koc/papers/j37acmon.pdf and
116 * sections 3.8 and 4.2 in http://security.ece.orst.edu/koc/papers/r01rsasw.pdf
117 */
118
119#include <stdio.h>
120#include <stdint.h>
121
122#include "bn_lcl.h"
123
124#define MONT_WORD /* use the faster word-based algorithm */
125
126#ifdef MONT_WORD
127static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont);
128#endif
129
130int
131BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
132 BN_MONT_CTX *mont, BN_CTX *ctx)
133{
134 BIGNUM *tmp;
135 int ret = 0;
136#if defined(OPENSSL_BN_ASM_MONT) && defined(MONT_WORD)
137 int num = mont->N.top;
138
139 if (num > 1 && a->top == num && b->top == num) {
140 if (bn_wexpand(r, num) == NULL)
141 return (0);
142 if (bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num)) {
143 r->neg = a->neg ^ b->neg;
144 r->top = num;
145 bn_correct_top(r);
146 return (1);
147 }
148 }
149#endif
150
151 BN_CTX_start(ctx);
152 if ((tmp = BN_CTX_get(ctx)) == NULL)
153 goto err;
154
155 bn_check_top(tmp);
156 if (a == b) {
157 if (!BN_sqr(tmp, a, ctx))
158 goto err;
159 } else {
160 if (!BN_mul(tmp, a, b, ctx))
161 goto err;
162 }
163 /* reduce from aRR to aR */
164#ifdef MONT_WORD
165 if (!BN_from_montgomery_word(r, tmp, mont))
166 goto err;
167#else
168 if (!BN_from_montgomery(r, tmp, mont, ctx))
169 goto err;
170#endif
171 bn_check_top(r);
172 ret = 1;
173err:
174 BN_CTX_end(ctx);
175 return (ret);
176}
177
178#ifdef MONT_WORD
179static int
180BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
181{
182 BIGNUM *n;
183 BN_ULONG *ap, *np, *rp, n0, v, carry;
184 int nl, max, i;
185
186 n = &(mont->N);
187 nl = n->top;
188 if (nl == 0) {
189 ret->top = 0;
190 return (1);
191 }
192
193 max = (2 * nl); /* carry is stored separately */
194 if (bn_wexpand(r, max) == NULL)
195 return (0);
196
197 r->neg ^= n->neg;
198 np = n->d;
199 rp = r->d;
200
201 /* clear the top words of T */
202#if 1
203 for (i = r->top; i < max; i++) /* memset? XXX */
204 rp[i] = 0;
205#else
206 memset(&(rp[r->top]), 0, (max - r->top) * sizeof(BN_ULONG));
207#endif
208
209 r->top = max;
210 n0 = mont->n0[0];
211
212#ifdef BN_COUNT
213 fprintf(stderr, "word BN_from_montgomery_word %d * %d\n", nl, nl);
214#endif
215 for (carry = 0, i = 0; i < nl; i++, rp++) {
216 v = bn_mul_add_words(rp, np, nl, (rp[0] * n0) & BN_MASK2);
217 v = (v + carry + rp[nl]) & BN_MASK2;
218 carry |= (v != rp[nl]);
219 carry &= (v <= rp[nl]);
220 rp[nl] = v;
221 }
222
223 if (bn_wexpand(ret, nl) == NULL)
224 return (0);
225 ret->top = nl;
226 ret->neg = r->neg;
227
228 rp = ret->d;
229 ap = &(r->d[nl]);
230
231#define BRANCH_FREE 1
232#if BRANCH_FREE
233 {
234 BN_ULONG *nrp;
235 size_t m;
236
237 v = bn_sub_words(rp, ap, np, nl) - carry;
238 /* if the subtraction result is valid (no borrow), trick the
239 * unconditional copy below into performing an in-place
240 * "refresh" instead of an actual copy. */
241 m = (0 - (size_t)v);
242 nrp = (BN_ULONG *)(((uintptr_t)rp & ~m)|((uintptr_t)ap & m));
243
244 for (i = 0, nl -= 4; i < nl; i += 4) {
245 BN_ULONG t1, t2, t3, t4;
246
247 t1 = nrp[i + 0];
248 t2 = nrp[i + 1];
249 t3 = nrp[i + 2];
250 ap[i + 0] = 0;
251 t4 = nrp[i + 3];
252 ap[i + 1] = 0;
253 rp[i + 0] = t1;
254 ap[i + 2] = 0;
255 rp[i + 1] = t2;
256 ap[i + 3] = 0;
257 rp[i + 2] = t3;
258 rp[i + 3] = t4;
259 }
260 for (nl += 4; i < nl; i++)
261 rp[i] = nrp[i], ap[i] = 0;
262 }
263#else
264 if (bn_sub_words (rp, ap, np, nl) - carry)
265 memcpy(rp, ap, nl*sizeof(BN_ULONG));
266#endif
267 bn_correct_top(r);
268 bn_correct_top(ret);
269 bn_check_top(ret);
270
271 return (1);
272}
273#endif /* MONT_WORD */
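The per-word loop in BN_from_montgomery_word() is standard Montgomery reduction (REDC); a single-precision sketch for a one-word odd modulus (uint32_t words, illustrative names; the n < 2^31 bound keeps the 64-bit intermediate from overflowing, whereas the word-based loop above instead tracks that overflow in its separate 'carry' variable):

#include <stdint.h>

/* Given an odd modulus n < 2^31, t < n * 2^32, and
 * n0 = -n^{-1} mod 2^32 (the role played by mont->n0[0] above),
 * return t * 2^{-32} mod n.  Each iteration of the nl-word loop
 * above performs this step on one word of r. */
static uint32_t
redc1(uint64_t t, uint32_t n, uint32_t n0)
{
	uint32_t m = (uint32_t)t * n0;		  /* m = (t * n0) mod 2^32 */
	uint64_t u = (t + (uint64_t)m * n) >> 32; /* low word cancels */

	return u >= n ? (uint32_t)(u - n) : (uint32_t)u; /* final subtract */
}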
274
275int
276BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont, BN_CTX *ctx)
277{
278 int retn = 0;
279#ifdef MONT_WORD
280 BIGNUM *t;
281
282 BN_CTX_start(ctx);
283 if ((t = BN_CTX_get(ctx)) && BN_copy(t, a))
284 retn = BN_from_montgomery_word(ret, t, mont);
285 BN_CTX_end(ctx);
286#else /* !MONT_WORD */
287 BIGNUM *t1, *t2;
288
289 BN_CTX_start(ctx);
290 if ((t1 = BN_CTX_get(ctx)) == NULL)
291 goto err;
292 if ((t2 = BN_CTX_get(ctx)) == NULL)
293 goto err;
294
295 if (!BN_copy(t1, a))
296 goto err;
297 BN_mask_bits(t1, mont->ri);
298
299 if (!BN_mul(t2, t1, &mont->Ni, ctx))
300 goto err;
301 BN_mask_bits(t2, mont->ri);
302
303 if (!BN_mul(t1, t2, &mont->N, ctx))
304 goto err;
305 if (!BN_add(t2, a, t1))
306 goto err;
307 if (!BN_rshift(ret, t2, mont->ri))
308 goto err;
309
310 if (BN_ucmp(ret, &(mont->N)) >= 0) {
311 if (!BN_usub(ret, ret, &(mont->N)))
312 goto err;
313 }
314 retn = 1;
315 bn_check_top(ret);
316
317err:
318 BN_CTX_end(ctx);
319#endif /* MONT_WORD */
320 return (retn);
321}
322
323BN_MONT_CTX *
324BN_MONT_CTX_new(void)
325{
326 BN_MONT_CTX *ret;
327
328 if ((ret = malloc(sizeof(BN_MONT_CTX))) == NULL)
329 return (NULL);
330
331 BN_MONT_CTX_init(ret);
332 ret->flags = BN_FLG_MALLOCED;
333 return (ret);
334}
335
336void
337BN_MONT_CTX_init(BN_MONT_CTX *ctx)
338{
339 ctx->ri = 0;
340 BN_init(&(ctx->RR));
341 BN_init(&(ctx->N));
342 BN_init(&(ctx->Ni));
343 ctx->n0[0] = ctx->n0[1] = 0;
344 ctx->flags = 0;
345}
346
347void
348BN_MONT_CTX_free(BN_MONT_CTX *mont)
349{
350 if (mont == NULL)
351 return;
352
353 BN_clear_free(&(mont->RR));
354 BN_clear_free(&(mont->N));
355 BN_clear_free(&(mont->Ni));
356 if (mont->flags & BN_FLG_MALLOCED)
357 free(mont);
358}
359
360int
361BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
362{
363 int ret = 0;
364 BIGNUM *Ri, *R;
365
366 BN_CTX_start(ctx);
367 if ((Ri = BN_CTX_get(ctx)) == NULL)
368 goto err;
369 R = &(mont->RR); /* grab RR as a temp */
370 if (!BN_copy(&(mont->N), mod))
371 goto err; /* Set N */
372 mont->N.neg = 0;
373
374#ifdef MONT_WORD
375 {
376 BIGNUM tmod;
377 BN_ULONG buf[2];
378
379 BN_init(&tmod);
380 tmod.d = buf;
381 tmod.dmax = 2;
382 tmod.neg = 0;
383
384 mont->ri = (BN_num_bits(mod) +
385 (BN_BITS2 - 1)) / BN_BITS2 * BN_BITS2;
386
387#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
388 /* Only certain BN_BITS2<=32 platforms actually make use of
389 * n0[1], and we could use the #else case (with a shorter R
390 * value) for the others. However, currently only the assembler
391 * files do know which is which. */
392
393 BN_zero(R);
394 if (!(BN_set_bit(R, 2 * BN_BITS2)))
395 goto err;
396
397 tmod.top = 0;
398 if ((buf[0] = mod->d[0]))
399 tmod.top = 1;
400 if ((buf[1] = mod->top > 1 ? mod->d[1] : 0))
401 tmod.top = 2;
402
403 if ((BN_mod_inverse(Ri, R, &tmod, ctx)) == NULL)
404 goto err;
405 if (!BN_lshift(Ri, Ri, 2 * BN_BITS2))
406 goto err; /* R*Ri */
407 if (!BN_is_zero(Ri)) {
408 if (!BN_sub_word(Ri, 1))
409 goto err;
410 }
411 else /* if N mod word size == 1 */
412 {
413 if (bn_expand(Ri, (int)sizeof(BN_ULONG) * 2) == NULL)
414 goto err;
415 /* Ri-- (mod double word size) */
416 Ri->neg = 0;
417 Ri->d[0] = BN_MASK2;
418 Ri->d[1] = BN_MASK2;
419 Ri->top = 2;
420 }
421 if (!BN_div(Ri, NULL, Ri, &tmod, ctx))
422 goto err;
423 /* Ni = (R*Ri-1)/N,
424 * keep only a couple of the least significant words: */
425 mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
426 mont->n0[1] = (Ri->top > 1) ? Ri->d[1] : 0;
427#else
428 BN_zero(R);
429 if (!(BN_set_bit(R, BN_BITS2)))
430 goto err; /* R */
431
432 buf[0] = mod->d[0]; /* tmod = N mod word size */
433 buf[1] = 0;
434 tmod.top = buf[0] != 0 ? 1 : 0;
435 /* Ri = R^-1 mod N*/
436 if ((BN_mod_inverse(Ri, R, &tmod, ctx)) == NULL)
437 goto err;
438 if (!BN_lshift(Ri, Ri, BN_BITS2))
439 goto err; /* R*Ri */
440 if (!BN_is_zero(Ri)) {
441 if (!BN_sub_word(Ri, 1))
442 goto err;
443 }
444 else /* if N mod word size == 1 */
445 {
446 if (!BN_set_word(Ri, BN_MASK2))
447 goto err; /* Ri-- (mod word size) */
448 }
449 if (!BN_div(Ri, NULL, Ri, &tmod, ctx))
450 goto err;
451 /* Ni = (R*Ri-1)/N,
452 * keep only the least significant word: */
453 mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
454 mont->n0[1] = 0;
455#endif
456 }
457#else /* !MONT_WORD */
458 { /* bignum version */
459 mont->ri = BN_num_bits(&mont->N);
460 BN_zero(R);
461 if (!BN_set_bit(R, mont->ri))
462 goto err; /* R = 2^ri */
463 /* Ri = R^-1 mod N*/
464 if ((BN_mod_inverse(Ri, R, &mont->N, ctx)) == NULL)
465 goto err;
466 if (!BN_lshift(Ri, Ri, mont->ri))
467 goto err; /* R*Ri */
468 if (!BN_sub_word(Ri, 1))
469 goto err;
470 /* Ni = (R*Ri-1) / N */
471 if (!BN_div(&(mont->Ni), NULL, Ri, &mont->N, ctx))
472 goto err;
473 }
474#endif
475
476 /* setup RR for conversions */
477 BN_zero(&(mont->RR));
478 if (!BN_set_bit(&(mont->RR), mont->ri*2))
479 goto err;
480 if (!BN_mod(&(mont->RR), &(mont->RR), &(mont->N), ctx))
481 goto err;
482
483 ret = 1;
484
485err:
486 BN_CTX_end(ctx);
487 return ret;
488}
489
490BN_MONT_CTX *
491BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from)
492{
493 if (to == from)
494 return (to);
495
496 if (!BN_copy(&(to->RR), &(from->RR)))
497 return NULL;
498 if (!BN_copy(&(to->N), &(from->N)))
499 return NULL;
500 if (!BN_copy(&(to->Ni), &(from->Ni)))
501 return NULL;
502 to->ri = from->ri;
503 to->n0[0] = from->n0[0];
504 to->n0[1] = from->n0[1];
505 return (to);
506}
507
508BN_MONT_CTX *
509BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, int lock, const BIGNUM *mod,
510 BN_CTX *ctx)
511{
512 int got_write_lock = 0;
513 BN_MONT_CTX *ret;
514
515 CRYPTO_r_lock(lock);
516 if (!*pmont) {
517 CRYPTO_r_unlock(lock);
518 CRYPTO_w_lock(lock);
519 got_write_lock = 1;
520
521 if (!*pmont) {
522 ret = BN_MONT_CTX_new();
523 if (ret && !BN_MONT_CTX_set(ret, mod, ctx))
524 BN_MONT_CTX_free(ret);
525 else
526 *pmont = ret;
527 }
528 }
529
530 ret = *pmont;
531
532 if (got_write_lock)
533 CRYPTO_w_unlock(lock);
534 else
535 CRYPTO_r_unlock(lock);
536
537 return ret;
538}
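Putting the context to use: the intended calling pattern with the public API, converting into the Montgomery domain via RR, multiplying there, and converting back out (a usage sketch with error checks elided, not library code):

#include <openssl/bn.h>

/* r = a * b mod n, computed via Montgomery form. */
static void
mont_demo(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *n)
{
	BN_CTX *ctx = BN_CTX_new();
	BN_MONT_CTX *mont = BN_MONT_CTX_new();
	BIGNUM *am = BN_new(), *bm = BN_new();

	BN_MONT_CTX_set(mont, n, ctx);			/* precompute n0, RR */
	BN_to_montgomery(am, a, mont, ctx);		/* am = a*R mod n */
	BN_to_montgomery(bm, b, mont, ctx);		/* bm = b*R mod n */
	BN_mod_mul_montgomery(r, am, bm, mont, ctx);	/* r = a*b*R mod n */
	BN_from_montgomery(r, r, mont, ctx);		/* r = a*b mod n */

	BN_free(am);
	BN_free(bm);
	BN_MONT_CTX_free(mont);
	BN_CTX_free(ctx);
}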
diff --git a/src/lib/libcrypto/bn/bn_mpi.c b/src/lib/libcrypto/bn/bn_mpi.c
deleted file mode 100644
index cf4c7d8d24..0000000000
--- a/src/lib/libcrypto/bn/bn_mpi.c
+++ /dev/null
@@ -1,132 +0,0 @@
1/* $OpenBSD: bn_mpi.c,v 1.7 2014/07/11 08:44:48 jsing Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscape's SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are adhered to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the routines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publicly available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60
61#include <openssl/err.h>
62
63#include "bn_lcl.h"
64
65int
66BN_bn2mpi(const BIGNUM *a, unsigned char *d)
67{
68 int bits;
69 int num = 0;
70 int ext = 0;
71 long l;
72
73 bits = BN_num_bits(a);
74 num = (bits + 7) / 8;
75 if (bits > 0) {
76 ext = ((bits & 0x07) == 0);
77 }
78 if (d == NULL)
79 return (num + 4 + ext);
80
81 l = num + ext;
82 d[0] = (unsigned char)(l >> 24) & 0xff;
83 d[1] = (unsigned char)(l >> 16) & 0xff;
84 d[2] = (unsigned char)(l >> 8) & 0xff;
85 d[3] = (unsigned char)(l) & 0xff;
86 if (ext)
87 d[4] = 0;
88 num = BN_bn2bin(a, &(d[4 + ext]));
89 if (a->neg)
90 d[4] |= 0x80;
91 return (num + 4 + ext);
92}
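A worked instance of the encoding above: the mpi format is a 4-byte big-endian length, an optional 0x00 pad byte whenever the top magnitude bit would collide with the sign flag, then the magnitude (a usage sketch against the public API):

#include <openssl/bn.h>
#include <stdio.h>

/* Encode -128: bits = 8, num = 1, ext = 1 (since 8 % 8 == 0), so the
 * length field is 2 and a pad byte precedes the magnitude; the sign
 * then sets bit 7 of d[4], giving 00 00 00 02 80 80. */
static void
mpi_demo(void)
{
	BIGNUM *a = BN_new();
	unsigned char buf[16];
	int i, n;

	BN_set_word(a, 128);
	BN_set_negative(a, 1);
	n = BN_bn2mpi(a, buf);
	for (i = 0; i < n; i++)
		printf("%02x ", buf[i]);	/* 00 00 00 02 80 80 */
	printf("\n");
	BN_free(a);
}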
93
94BIGNUM *
95BN_mpi2bn(const unsigned char *d, int n, BIGNUM *a)
96{
97 long len;
98 int neg = 0;
99
100 if (n < 4) {
101 BNerr(BN_F_BN_MPI2BN, BN_R_INVALID_LENGTH);
102 return (NULL);
103 }
104 len = ((long)d[0] << 24) | ((long)d[1] << 16) | ((int)d[2] << 8) |
105 (int)d[3];
106 if ((len + 4) != n) {
107 BNerr(BN_F_BN_MPI2BN, BN_R_ENCODING_ERROR);
108 return (NULL);
109 }
110
111 if (a == NULL)
112 a = BN_new();
113 if (a == NULL)
114 return (NULL);
115
116 if (len == 0) {
117 a->neg = 0;
118 a->top = 0;
119 return (a);
120 }
121 d += 4;
122 if ((*d) & 0x80)
123 neg = 1;
124 if (BN_bin2bn(d, (int)len, a) == NULL)
125 return (NULL);
126 a->neg = neg;
127 if (neg) {
128 BN_clear_bit(a, BN_num_bits(a) - 1);
129 }
130 bn_check_top(a);
131 return (a);
132}
diff --git a/src/lib/libcrypto/bn/bn_mul.c b/src/lib/libcrypto/bn/bn_mul.c
deleted file mode 100644
index 7794d59707..0000000000
--- a/src/lib/libcrypto/bn/bn_mul.c
+++ /dev/null
@@ -1,1171 +0,0 @@
1/* $OpenBSD: bn_mul.c,v 1.20 2015/02/09 15:49:22 jsing Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscape's SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are adhered to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the routines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#ifndef BN_DEBUG
60# undef NDEBUG /* avoid conflicting definitions */
61# define NDEBUG
62#endif
63
64#include <assert.h>
65#include <stdio.h>
66#include <string.h>
67
68#include <openssl/opensslconf.h>
69
70#include "bn_lcl.h"
71
72#if defined(OPENSSL_NO_ASM) || !defined(OPENSSL_BN_ASM_PART_WORDS)
73/* Here follow specialised variants of bn_add_words() and
74 bn_sub_words(). They have the property of performing operations on
75 arrays of different sizes. The sizes of those arrays are expressed through
76 cl, which is the common length (basically, min(len(a),len(b))), and dl,
77 which is the delta between the two lengths, calculated as len(a)-len(b).
78 All lengths are the number of BN_ULONGs. For the operations that require
79 a result array as parameter, it must have the length cl+abs(dl).
80 These functions should probably end up in bn_asm.c as soon as there are
81 assembler counterparts for the systems that use assembler files. */
82
83BN_ULONG
84bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int cl,
85 int dl)
86{
87 BN_ULONG c, t;
88
89 assert(cl >= 0);
90 c = bn_sub_words(r, a, b, cl);
91
92 if (dl == 0)
93 return c;
94
95 r += cl;
96 a += cl;
97 b += cl;
98
99 if (dl < 0) {
100#ifdef BN_COUNT
101 fprintf(stderr,
102 " bn_sub_part_words %d + %d (dl < 0, c = %d)\n",
103 cl, dl, c);
104#endif
105 for (;;) {
106 t = b[0];
107 r[0] = (0 - t - c) & BN_MASK2;
108 if (t != 0)
109 c = 1;
110 if (++dl >= 0)
111 break;
112
113 t = b[1];
114 r[1] = (0 - t - c) & BN_MASK2;
115 if (t != 0)
116 c = 1;
117 if (++dl >= 0)
118 break;
119
120 t = b[2];
121 r[2] = (0 - t - c) & BN_MASK2;
122 if (t != 0)
123 c = 1;
124 if (++dl >= 0)
125 break;
126
127 t = b[3];
128 r[3] = (0 - t - c) & BN_MASK2;
129 if (t != 0)
130 c = 1;
131 if (++dl >= 0)
132 break;
133
134 b += 4;
135 r += 4;
136 }
137 } else {
138 int save_dl = dl;
139#ifdef BN_COUNT
140 fprintf(stderr,
141 " bn_sub_part_words %d + %d (dl > 0, c = %d)\n",
142 cl, dl, c);
143#endif
144 while (c) {
145 t = a[0];
146 r[0] = (t - c) & BN_MASK2;
147 if (t != 0)
148 c = 0;
149 if (--dl <= 0)
150 break;
151
152 t = a[1];
153 r[1] = (t - c) & BN_MASK2;
154 if (t != 0)
155 c = 0;
156 if (--dl <= 0)
157 break;
158
159 t = a[2];
160 r[2] = (t - c) & BN_MASK2;
161 if (t != 0)
162 c = 0;
163 if (--dl <= 0)
164 break;
165
166 t = a[3];
167 r[3] = (t - c) & BN_MASK2;
168 if (t != 0)
169 c = 0;
170 if (--dl <= 0)
171 break;
172
173 save_dl = dl;
174 a += 4;
175 r += 4;
176 }
177 if (dl > 0) {
178#ifdef BN_COUNT
179 fprintf(stderr,
180 " bn_sub_part_words %d + %d (dl > 0, c == 0)\n",
181 cl, dl);
182#endif
183 if (save_dl > dl) {
184 switch (save_dl - dl) {
185 case 1:
186 r[1] = a[1];
187 if (--dl <= 0)
188 break;
189 case 2:
190 r[2] = a[2];
191 if (--dl <= 0)
192 break;
193 case 3:
194 r[3] = a[3];
195 if (--dl <= 0)
196 break;
197 }
198 a += 4;
199 r += 4;
200 }
201 }
202 if (dl > 0) {
203#ifdef BN_COUNT
204 fprintf(stderr,
205 " bn_sub_part_words %d + %d (dl > 0, copy)\n",
206 cl, dl);
207#endif
208 for (;;) {
209 r[0] = a[0];
210 if (--dl <= 0)
211 break;
212 r[1] = a[1];
213 if (--dl <= 0)
214 break;
215 r[2] = a[2];
216 if (--dl <= 0)
217 break;
218 r[3] = a[3];
219 if (--dl <= 0)
220 break;
221
222 a += 4;
223 r += 4;
224 }
225 }
226 }
227 return c;
228}
229#endif
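/*
 * Editor's sketch (not in the original file): an unoptimised reference
 * for bn_sub_part_words() above, kept under #if 0. It computes the same
 * result as padding the shorter operand with zero words; the real code
 * merely unrolls this and stops propagating the borrow early.
 */
#if 0
static BN_ULONG
bn_sub_part_words_ref(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
    int cl, int dl)
{
	BN_ULONG borrow;
	int i;

	/* common part: len(a) = cl + max(dl, 0), len(b) = cl + max(-dl, 0) */
	borrow = bn_sub_words(r, a, b, cl);
	r += cl;
	a += cl;
	b += cl;
	if (dl > 0) {
		/* a is longer: only the borrow is subtracted from its tail */
		for (i = 0; i < dl; i++) {
			r[i] = (a[i] - borrow) & BN_MASK2;
			borrow = borrow && a[i] == 0;
		}
	} else {
		/* b is longer: subtract its tail (plus borrow) from zero */
		for (i = 0; i < -dl; i++) {
			r[i] = (0 - b[i] - borrow) & BN_MASK2;
			borrow = b[i] != 0 || borrow;
		}
	}
	return borrow;
}
#endif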
230
231BN_ULONG
232bn_add_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int cl,
233 int dl)
234{
235 BN_ULONG c, l, t;
236
237 assert(cl >= 0);
238 c = bn_add_words(r, a, b, cl);
239
240 if (dl == 0)
241 return c;
242
243 r += cl;
244 a += cl;
245 b += cl;
246
247 if (dl < 0) {
248 int save_dl = dl;
249#ifdef BN_COUNT
250 fprintf(stderr,
251 " bn_add_part_words %d + %d (dl < 0, c = %d)\n",
252 cl, dl, c);
253#endif
254 while (c) {
255 l = (c + b[0]) & BN_MASK2;
256 c = (l < c);
257 r[0] = l;
258 if (++dl >= 0)
259 break;
260
261 l = (c + b[1]) & BN_MASK2;
262 c = (l < c);
263 r[1] = l;
264 if (++dl >= 0)
265 break;
266
267 l = (c + b[2]) & BN_MASK2;
268 c = (l < c);
269 r[2] = l;
270 if (++dl >= 0)
271 break;
272
273 l = (c + b[3]) & BN_MASK2;
274 c = (l < c);
275 r[3] = l;
276 if (++dl >= 0)
277 break;
278
279 save_dl = dl;
280 b += 4;
281 r += 4;
282 }
283 if (dl < 0) {
284#ifdef BN_COUNT
285 fprintf(stderr,
286 " bn_add_part_words %d + %d (dl < 0, c == 0)\n",
287 cl, dl);
288#endif
289 if (save_dl < dl) {
290 switch (dl - save_dl) {
291 case 1:
292 r[1] = b[1];
293 if (++dl >= 0)
294 break;
295 case 2:
296 r[2] = b[2];
297 if (++dl >= 0)
298 break;
299 case 3:
300 r[3] = b[3];
301 if (++dl >= 0)
302 break;
303 }
304 b += 4;
305 r += 4;
306 }
307 }
308 if (dl < 0) {
309#ifdef BN_COUNT
310 fprintf(stderr,
311 " bn_add_part_words %d + %d (dl < 0, copy)\n",
312 cl, dl);
313#endif
314 for (;;) {
315 r[0] = b[0];
316 if (++dl >= 0)
317 break;
318 r[1] = b[1];
319 if (++dl >= 0)
320 break;
321 r[2] = b[2];
322 if (++dl >= 0)
323 break;
324 r[3] = b[3];
325 if (++dl >= 0)
326 break;
327
328 b += 4;
329 r += 4;
330 }
331 }
332 } else {
333 int save_dl = dl;
334#ifdef BN_COUNT
335 fprintf(stderr,
336 " bn_add_part_words %d + %d (dl > 0)\n", cl, dl);
337#endif
338 while (c) {
339 t = (a[0] + c) & BN_MASK2;
340 c = (t < c);
341 r[0] = t;
342 if (--dl <= 0)
343 break;
344
345 t = (a[1] + c) & BN_MASK2;
346 c = (t < c);
347 r[1] = t;
348 if (--dl <= 0)
349 break;
350
351 t = (a[2] + c) & BN_MASK2;
352 c = (t < c);
353 r[2] = t;
354 if (--dl <= 0)
355 break;
356
357 t = (a[3] + c) & BN_MASK2;
358 c = (t < c);
359 r[3] = t;
360 if (--dl <= 0)
361 break;
362
363 save_dl = dl;
364 a += 4;
365 r += 4;
366 }
367#ifdef BN_COUNT
368 fprintf(stderr,
369 " bn_add_part_words %d + %d (dl > 0, c == 0)\n", cl, dl);
370#endif
371 if (dl > 0) {
372 if (save_dl > dl) {
373 switch (save_dl - dl) {
374 case 1:
375 r[1] = a[1];
376 if (--dl <= 0)
377 break;
378 case 2:
379 r[2] = a[2];
380 if (--dl <= 0)
381 break;
382 case 3:
383 r[3] = a[3];
384 if (--dl <= 0)
385 break;
386 }
387 a += 4;
388 r += 4;
389 }
390 }
391 if (dl > 0) {
392#ifdef BN_COUNT
393 fprintf(stderr,
394 " bn_add_part_words %d + %d (dl > 0, copy)\n",
395 cl, dl);
396#endif
397 for (;;) {
398 r[0] = a[0];
399 if (--dl <= 0)
400 break;
401 r[1] = a[1];
402 if (--dl <= 0)
403 break;
404 r[2] = a[2];
405 if (--dl <= 0)
406 break;
407 r[3] = a[3];
408 if (--dl <= 0)
409 break;
410
411 a += 4;
412 r += 4;
413 }
414 }
415 }
416 return c;
417}
418
419#ifdef BN_RECURSION
420/* Karatsuba recursive multiplication algorithm
421 * (cf. Knuth, The Art of Computer Programming, Vol. 2) */
422
423/* r is 2*n2 words in size,
424 * a and b are both n2 words in size.
425 * n2 must be a power of 2.
426 * We multiply and return the result.
427 * t must be 2*n2 words in size
428 * We calculate
429 * a[0]*b[0]
430 * a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0])
431 * a[1]*b[1]
432 */
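/* Editor's note (not in the original file): with w = BN_BITS2, n = n2/2,
 * a = a[1]*2^(n*w) + a[0] and b = b[1]*2^(n*w) + b[0], the three
 * products above recombine as
 *
 *   a*b = a[1]b[1]*2^(2nw)
 *       + (a[0]b[0] + a[1]b[1] + (a[0]-a[1])(b[1]-b[0]))*2^(nw)
 *       + a[0]b[0]
 *
 * since (a[0]-a[1])(b[1]-b[0]) = a[0]b[1] + a[1]b[0] - a[0]b[0] - a[1]b[1].
 * That is Karatsuba's trick: three half-size multiplications instead
 * of four. */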
433/* dnX may be negative, but n2/2+dnX has to be positive */
434void
435bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2, int dna,
436 int dnb, BN_ULONG *t)
437{
438 int n = n2 / 2, c1, c2;
439 int tna = n + dna, tnb = n + dnb;
440 unsigned int neg, zero;
441 BN_ULONG ln, lo, *p;
442
443# ifdef BN_COUNT
444 fprintf(stderr, " bn_mul_recursive %d%+d * %d%+d\n",n2,dna,n2,dnb);
445# endif
446# ifdef BN_MUL_COMBA
447# if 0
448 if (n2 == 4) {
449 bn_mul_comba4(r, a, b);
450 return;
451 }
452# endif
453	/* Only call bn_mul_comba8 if n2 == 8 and the
454	 * two arrays are complete [steve]
455	 */
456 if (n2 == 8 && dna == 0 && dnb == 0) {
457 bn_mul_comba8(r, a, b);
458 return;
459 }
460# endif /* BN_MUL_COMBA */
461 /* Else do normal multiply */
462 if (n2 < BN_MUL_RECURSIVE_SIZE_NORMAL) {
463 bn_mul_normal(r, a, n2 + dna, b, n2 + dnb);
464 if ((dna + dnb) < 0)
465 memset(&r[2*n2 + dna + dnb], 0,
466 sizeof(BN_ULONG) * -(dna + dnb));
467 return;
468 }
469 /* r=(a[0]-a[1])*(b[1]-b[0]) */
470 c1 = bn_cmp_part_words(a, &(a[n]), tna, n - tna);
471	c2 = bn_cmp_part_words(&(b[n]), b, tnb, tnb - n);
472 zero = neg = 0;
473 switch (c1 * 3 + c2) {
474 case -4:
475 bn_sub_part_words(t, &(a[n]), a, tna, tna - n); /* - */
476 bn_sub_part_words(&(t[n]), b, &(b[n]), tnb, n - tnb); /* - */
477 break;
478 case -3:
479 zero = 1;
480 break;
481 case -2:
482 bn_sub_part_words(t, &(a[n]), a, tna, tna - n); /* - */
483 bn_sub_part_words(&(t[n]), &(b[n]), b, tnb, tnb - n); /* + */
484 neg = 1;
485 break;
486 case -1:
487 case 0:
488 case 1:
489 zero = 1;
490 break;
491 case 2:
492 bn_sub_part_words(t, a, &(a[n]), tna, n - tna); /* + */
493 bn_sub_part_words(&(t[n]), b, &(b[n]), tnb, n - tnb); /* - */
494 neg = 1;
495 break;
496 case 3:
497 zero = 1;
498 break;
499 case 4:
500 bn_sub_part_words(t, a, &(a[n]), tna, n - tna);
501 bn_sub_part_words(&(t[n]), &(b[n]), b, tnb, tnb - n);
502 break;
503 }
504
505# ifdef BN_MUL_COMBA
506 if (n == 4 && dna == 0 && dnb == 0) /* XXX: bn_mul_comba4 could take
507 extra args to do this well */
508 {
509 if (!zero)
510 bn_mul_comba4(&(t[n2]), t, &(t[n]));
511 else
512 memset(&(t[n2]), 0, 8 * sizeof(BN_ULONG));
513
514 bn_mul_comba4(r, a, b);
515 bn_mul_comba4(&(r[n2]), &(a[n]), &(b[n]));
516 } else if (n == 8 && dna == 0 && dnb == 0) /* XXX: bn_mul_comba8 could
517 take extra args to do this
518 well */
519 {
520 if (!zero)
521 bn_mul_comba8(&(t[n2]), t, &(t[n]));
522 else
523 memset(&(t[n2]), 0, 16 * sizeof(BN_ULONG));
524
525 bn_mul_comba8(r, a, b);
526 bn_mul_comba8(&(r[n2]), &(a[n]), &(b[n]));
527 } else
528# endif /* BN_MUL_COMBA */
529 {
530 p = &(t[n2 * 2]);
531 if (!zero)
532 bn_mul_recursive(&(t[n2]), t, &(t[n]), n, 0, 0, p);
533 else
534 memset(&(t[n2]), 0, n2 * sizeof(BN_ULONG));
535 bn_mul_recursive(r, a, b, n, 0, 0, p);
536 bn_mul_recursive(&(r[n2]), &(a[n]), &(b[n]), n, dna, dnb, p);
537 }
538
539	/* t[32] holds (a[0]-a[1])*(b[1]-b[0]), neg is the sign
540	 * r[10] holds (a[0]*b[0])
541	 * r[32] holds (a[1]*b[1])
542 */
543
544 c1 = (int)(bn_add_words(t, r, &(r[n2]), n2));
545
546 if (neg) /* if t[32] is negative */
547 {
548 c1 -= (int)(bn_sub_words(&(t[n2]), t, &(t[n2]), n2));
549 } else {
550 /* Might have a carry */
551 c1 += (int)(bn_add_words(&(t[n2]), &(t[n2]), t, n2));
552 }
553
554 /* t[32] holds (a[0]-a[1])*(b[1]-b[0])+(a[0]*b[0])+(a[1]*b[1])
555 * r[10] holds (a[0]*b[0])
556	 * r[32] holds (a[1]*b[1])
557 * c1 holds the carry bits
558 */
559 c1 += (int)(bn_add_words(&(r[n]), &(r[n]), &(t[n2]), n2));
560 if (c1) {
561 p = &(r[n + n2]);
562 lo= *p;
563 ln = (lo + c1) & BN_MASK2;
564 *p = ln;
565
566		/* The overflow will stop before we overwrite
567		 * words we should not modify */
568 if (ln < (BN_ULONG)c1) {
569 do {
570 p++;
571 lo= *p;
572 ln = (lo + 1) & BN_MASK2;
573 *p = ln;
574 } while (ln == 0);
575 }
576 }
577}
578
579/* n+tn is the word length.
580 * t needs to be n*4 in size, as does r */
581/* tnX must not be negative and must be less than n */
582void
583bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n, int tna,
584 int tnb, BN_ULONG *t)
585{
586 int i, j, n2 = n * 2;
587 int c1, c2, neg;
588 BN_ULONG ln, lo, *p;
589
590# ifdef BN_COUNT
591 fprintf(stderr, " bn_mul_part_recursive (%d%+d) * (%d%+d)\n",
592 n, tna, n, tnb);
593# endif
594 if (n < 8) {
595 bn_mul_normal(r, a, n + tna, b, n + tnb);
596 return;
597 }
598
599 /* r=(a[0]-a[1])*(b[1]-b[0]) */
600 c1 = bn_cmp_part_words(a, &(a[n]), tna, n - tna);
601 c2 = bn_cmp_part_words(&(b[n]), b, tnb, tnb - n);
602 neg = 0;
603 switch (c1 * 3 + c2) {
604 case -4:
605 bn_sub_part_words(t, &(a[n]), a, tna, tna - n); /* - */
606 bn_sub_part_words(&(t[n]), b, &(b[n]), tnb, n - tnb); /* - */
607 break;
608 case -3:
609 /* break; */
610 case -2:
611 bn_sub_part_words(t, &(a[n]), a, tna, tna - n); /* - */
612 bn_sub_part_words(&(t[n]), &(b[n]), b, tnb, tnb - n); /* + */
613 neg = 1;
614 break;
615 case -1:
616 case 0:
617 case 1:
618 /* break; */
619 case 2:
620 bn_sub_part_words(t, a, &(a[n]), tna, n - tna); /* + */
621 bn_sub_part_words(&(t[n]), b, &(b[n]), tnb, n - tnb); /* - */
622 neg = 1;
623 break;
624 case 3:
625 /* break; */
626 case 4:
627 bn_sub_part_words(t, a, &(a[n]), tna, n - tna);
628 bn_sub_part_words(&(t[n]), &(b[n]), b, tnb, tnb - n);
629 break;
630 }
631 /* The zero case isn't yet implemented here. The speedup
632 would probably be negligible. */
633# if 0
634 if (n == 4) {
635 bn_mul_comba4(&(t[n2]), t, &(t[n]));
636 bn_mul_comba4(r, a, b);
637 bn_mul_normal(&(r[n2]), &(a[n]), tn, &(b[n]), tn);
638 memset(&(r[n2 + tn * 2]), 0, sizeof(BN_ULONG) * (n2 - tn * 2));
639 } else
640# endif
641 if (n == 8) {
642 bn_mul_comba8(&(t[n2]), t, &(t[n]));
643 bn_mul_comba8(r, a, b);
644 bn_mul_normal(&(r[n2]), &(a[n]), tna, &(b[n]), tnb);
645 memset(&(r[n2 + tna + tnb]), 0,
646 sizeof(BN_ULONG) * (n2 - tna - tnb));
647 } else {
648 p = &(t[n2*2]);
649 bn_mul_recursive(&(t[n2]), t, &(t[n]), n, 0, 0, p);
650 bn_mul_recursive(r, a, b, n, 0, 0, p);
651 i = n / 2;
652 /* If there is only a bottom half to the number,
653 * just do it */
654 if (tna > tnb)
655 j = tna - i;
656 else
657 j = tnb - i;
658 if (j == 0) {
659 bn_mul_recursive(&(r[n2]), &(a[n]), &(b[n]),
660 i, tna - i, tnb - i, p);
661 memset(&(r[n2 + i * 2]), 0,
662 sizeof(BN_ULONG) * (n2 - i * 2));
663 }
664 else if (j > 0) /* eg, n == 16, i == 8 and tn == 11 */
665 {
666 bn_mul_part_recursive(&(r[n2]), &(a[n]), &(b[n]),
667 i, tna - i, tnb - i, p);
668 memset(&(r[n2 + tna + tnb]), 0,
669 sizeof(BN_ULONG) * (n2 - tna - tnb));
670 }
671 else /* (j < 0) eg, n == 16, i == 8 and tn == 5 */
672 {
673 memset(&(r[n2]), 0, sizeof(BN_ULONG) * n2);
674 if (tna < BN_MUL_RECURSIVE_SIZE_NORMAL &&
675 tnb < BN_MUL_RECURSIVE_SIZE_NORMAL) {
676 bn_mul_normal(&(r[n2]), &(a[n]), tna,
677 &(b[n]), tnb);
678 } else {
679 for (;;) {
680 i /= 2;
681					/* these simplified conditions work
682					 * exclusively because the difference
683					 * between tna and tnb is 1 or 0 */
684 if (i < tna || i < tnb) {
685 bn_mul_part_recursive(&(r[n2]),
686 &(a[n]), &(b[n]), i,
687 tna - i, tnb - i, p);
688 break;
689 } else if (i == tna || i == tnb) {
690 bn_mul_recursive(&(r[n2]),
691 &(a[n]), &(b[n]), i,
692 tna - i, tnb - i, p);
693 break;
694 }
695 }
696 }
697 }
698 }
699
700	/* t[32] holds (a[0]-a[1])*(b[1]-b[0]), neg is the sign
701	 * r[10] holds (a[0]*b[0])
702	 * r[32] holds (a[1]*b[1])
703 */
704
705 c1 = (int)(bn_add_words(t, r,&(r[n2]), n2));
706
707 if (neg) /* if t[32] is negative */
708 {
709 c1 -= (int)(bn_sub_words(&(t[n2]), t,&(t[n2]), n2));
710 } else {
711 /* Might have a carry */
712 c1 += (int)(bn_add_words(&(t[n2]), &(t[n2]), t, n2));
713 }
714
715 /* t[32] holds (a[0]-a[1])*(b[1]-b[0])+(a[0]*b[0])+(a[1]*b[1])
716 * r[10] holds (a[0]*b[0])
717	 * r[32] holds (a[1]*b[1])
718 * c1 holds the carry bits
719 */
720 c1 += (int)(bn_add_words(&(r[n]), &(r[n]), &(t[n2]), n2));
721 if (c1) {
722 p = &(r[n + n2]);
723 lo= *p;
724 ln = (lo + c1)&BN_MASK2;
725 *p = ln;
726
727		/* The overflow will stop before we overwrite
728		 * words we should not modify */
729 if (ln < (BN_ULONG)c1) {
730 do {
731 p++;
732 lo= *p;
733 ln = (lo + 1) & BN_MASK2;
734 *p = ln;
735 } while (ln == 0);
736 }
737 }
738}
739
740/* a and b must be the same size, which is n2.
741 * r needs to be n2 words and t needs to be n2*2 words
742 */
743void
744bn_mul_low_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2, BN_ULONG *t)
745{
746 int n = n2 / 2;
747
748# ifdef BN_COUNT
749 fprintf(stderr, " bn_mul_low_recursive %d * %d\n",n2,n2);
750# endif
751
752 bn_mul_recursive(r, a, b, n, 0, 0, &(t[0]));
753 if (n >= BN_MUL_LOW_RECURSIVE_SIZE_NORMAL) {
754 bn_mul_low_recursive(&(t[0]), &(a[0]), &(b[n]), n, &(t[n2]));
755 bn_add_words(&(r[n]), &(r[n]), &(t[0]), n);
756 bn_mul_low_recursive(&(t[0]), &(a[n]), &(b[0]), n, &(t[n2]));
757 bn_add_words(&(r[n]), &(r[n]), &(t[0]), n);
758 } else {
759 bn_mul_low_normal(&(t[0]), &(a[0]), &(b[n]), n);
760 bn_mul_low_normal(&(t[n]), &(a[n]), &(b[0]), n);
761 bn_add_words(&(r[n]), &(r[n]), &(t[0]), n);
762 bn_add_words(&(r[n]), &(r[n]), &(t[n]), n);
763 }
764}
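/* Editor's note (not in the original file): writing a = a1*B + a0 and
 * b = b1*B + b0 with B = 2^(n*BN_BITS2), the low n2 words satisfy
 *
 *   a*b mod B^2 = a0*b0 + ((a0*b1 + a1*b0) mod B)*B
 *
 * so the full product a1*b1 is never needed, which is why the function
 * above gets away with one full and two low-half multiplications. */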
765
766/* a and b must be the same size, which is n2.
767 * r needs to be n2 words and t needs to be n2*3 words.
768 * l holds the low words of the product
769 * (al*bl) and may be NULL.
770 */
771void
772bn_mul_high(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, BN_ULONG *l, int n2,
773 BN_ULONG *t)
774{
775 int i, n;
776 int c1, c2;
777 int neg, oneg, zero;
778 BN_ULONG ll, lc, *lp, *mp;
779
780# ifdef BN_COUNT
781 fprintf(stderr, " bn_mul_high %d * %d\n",n2,n2);
782# endif
783 n = n2 / 2;
784
785 /* Calculate (al-ah)*(bh-bl) */
786 neg = zero = 0;
787 c1 = bn_cmp_words(&(a[0]), &(a[n]), n);
788 c2 = bn_cmp_words(&(b[n]), &(b[0]), n);
789 switch (c1 * 3 + c2) {
790 case -4:
791 bn_sub_words(&(r[0]), &(a[n]), &(a[0]), n);
792 bn_sub_words(&(r[n]), &(b[0]), &(b[n]), n);
793 break;
794 case -3:
795 zero = 1;
796 break;
797 case -2:
798 bn_sub_words(&(r[0]), &(a[n]), &(a[0]), n);
799 bn_sub_words(&(r[n]), &(b[n]), &(b[0]), n);
800 neg = 1;
801 break;
802 case -1:
803 case 0:
804 case 1:
805 zero = 1;
806 break;
807 case 2:
808 bn_sub_words(&(r[0]), &(a[0]), &(a[n]), n);
809 bn_sub_words(&(r[n]), &(b[0]), &(b[n]), n);
810 neg = 1;
811 break;
812 case 3:
813 zero = 1;
814 break;
815 case 4:
816 bn_sub_words(&(r[0]), &(a[0]), &(a[n]), n);
817 bn_sub_words(&(r[n]), &(b[n]), &(b[0]), n);
818 break;
819 }
820
821 oneg = neg;
822 /* t[10] = (a[0]-a[1])*(b[1]-b[0]) */
823 /* r[10] = (a[1]*b[1]) */
824# ifdef BN_MUL_COMBA
825 if (n == 8) {
826 bn_mul_comba8(&(t[0]), &(r[0]), &(r[n]));
827 bn_mul_comba8(r, &(a[n]), &(b[n]));
828 } else
829# endif
830 {
831 bn_mul_recursive(&(t[0]), &(r[0]), &(r[n]), n, 0, 0, &(t[n2]));
832 bn_mul_recursive(r, &(a[n]), &(b[n]), n, 0, 0, &(t[n2]));
833 }
834
835 /* s0 == low(al*bl)
836 * s1 == low(ah*bh)+low((al-ah)*(bh-bl))+low(al*bl)+high(al*bl)
837 * We know s0 and s1 so the only unknown is high(al*bl)
838 * high(al*bl) == s1 - low(ah*bh+s0+(al-ah)*(bh-bl))
839 * high(al*bl) == s1 - (r[0]+l[0]+t[0])
840 */
841 if (l != NULL) {
842 lp = &(t[n2 + n]);
843 c1 = (int)(bn_add_words(lp, &(r[0]), &(l[0]), n));
844 } else {
845 c1 = 0;
846 lp = &(r[0]);
847 }
848
849 if (neg)
850 neg = (int)(bn_sub_words(&(t[n2]), lp, &(t[0]), n));
851 else {
852 bn_add_words(&(t[n2]), lp, &(t[0]), n);
853 neg = 0;
854 }
855
856 if (l != NULL) {
857 bn_sub_words(&(t[n2 + n]), &(l[n]), &(t[n2]), n);
858 } else {
859 lp = &(t[n2 + n]);
860 mp = &(t[n2]);
861 for (i = 0; i < n; i++)
862 lp[i] = ((~mp[i]) + 1) & BN_MASK2;
863 }
864
865 /* s[0] = low(al*bl)
866 * t[3] = high(al*bl)
867 * t[10] = (a[0]-a[1])*(b[1]-b[0]) neg is the sign
868 * r[10] = (a[1]*b[1])
869 */
870 /* R[10] = al*bl
871 * R[21] = al*bl + ah*bh + (a[0]-a[1])*(b[1]-b[0])
872 * R[32] = ah*bh
873 */
874 /* R[1]=t[3]+l[0]+r[0](+-)t[0] (have carry/borrow)
875 * R[2]=r[0]+t[3]+r[1](+-)t[1] (have carry/borrow)
876 * R[3]=r[1]+(carry/borrow)
877 */
878 if (l != NULL) {
879 lp = &(t[n2]);
880 c1 = (int)(bn_add_words(lp, &(t[n2 + n]), &(l[0]), n));
881 } else {
882 lp = &(t[n2 + n]);
883 c1 = 0;
884 }
885 c1 += (int)(bn_add_words(&(t[n2]), lp, &(r[0]), n));
886 if (oneg)
887 c1 -= (int)(bn_sub_words(&(t[n2]), &(t[n2]), &(t[0]), n));
888 else
889 c1 += (int)(bn_add_words(&(t[n2]), &(t[n2]), &(t[0]), n));
890
891 c2 = (int)(bn_add_words(&(r[0]), &(r[0]), &(t[n2 + n]), n));
892 c2 += (int)(bn_add_words(&(r[0]), &(r[0]), &(r[n]), n));
893 if (oneg)
894 c2 -= (int)(bn_sub_words(&(r[0]), &(r[0]), &(t[n]), n));
895 else
896 c2 += (int)(bn_add_words(&(r[0]), &(r[0]), &(t[n]), n));
897
898 if (c1 != 0) /* Add starting at r[0], could be +ve or -ve */
899 {
900 i = 0;
901 if (c1 > 0) {
902 lc = c1;
903 do {
904 ll = (r[i] + lc) & BN_MASK2;
905 r[i++] = ll;
906 lc = (lc > ll);
907 } while (lc);
908 } else {
909 lc = -c1;
910 do {
911 ll = r[i];
912 r[i++] = (ll - lc) & BN_MASK2;
913 lc = (lc > ll);
914 } while (lc);
915 }
916 }
917 if (c2 != 0) /* Add starting at r[1] */
918 {
919 i = n;
920 if (c2 > 0) {
921 lc = c2;
922 do {
923 ll = (r[i] + lc) & BN_MASK2;
924 r[i++] = ll;
925 lc = (lc > ll);
926 } while (lc);
927 } else {
928 lc = -c2;
929 do {
930 ll = r[i];
931 r[i++] = (ll - lc) & BN_MASK2;
932 lc = (lc > ll);
933 } while (lc);
934 }
935 }
936}
937#endif /* BN_RECURSION */
938
939int
940BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
941{
942 int ret = 0;
943 int top, al, bl;
944 BIGNUM *rr;
945#if defined(BN_MUL_COMBA) || defined(BN_RECURSION)
946 int i;
947#endif
948#ifdef BN_RECURSION
949 BIGNUM *t = NULL;
950 int j = 0, k;
951#endif
952
953#ifdef BN_COUNT
954 fprintf(stderr, "BN_mul %d * %d\n",a->top,b->top);
955#endif
956
957 bn_check_top(a);
958 bn_check_top(b);
959 bn_check_top(r);
960
961 al = a->top;
962 bl = b->top;
963
964 if ((al == 0) || (bl == 0)) {
965 BN_zero(r);
966 return (1);
967 }
968 top = al + bl;
969
970 BN_CTX_start(ctx);
971 if ((r == a) || (r == b)) {
972 if ((rr = BN_CTX_get(ctx)) == NULL)
973 goto err;
974 } else
975 rr = r;
976 rr->neg = a->neg ^ b->neg;
977
978#if defined(BN_MUL_COMBA) || defined(BN_RECURSION)
979 i = al - bl;
980#endif
981#ifdef BN_MUL_COMBA
982 if (i == 0) {
983# if 0
984 if (al == 4) {
985 if (bn_wexpand(rr, 8) == NULL)
986 goto err;
987 rr->top = 8;
988 bn_mul_comba4(rr->d, a->d, b->d);
989 goto end;
990 }
991# endif
992 if (al == 8) {
993 if (bn_wexpand(rr, 16) == NULL)
994 goto err;
995 rr->top = 16;
996 bn_mul_comba8(rr->d, a->d, b->d);
997 goto end;
998 }
999 }
1000#endif /* BN_MUL_COMBA */
1001#ifdef BN_RECURSION
1002 if ((al >= BN_MULL_SIZE_NORMAL) && (bl >= BN_MULL_SIZE_NORMAL)) {
1003 if (i >= -1 && i <= 1) {
1004			/* Find the largest power of two lower than or
1005			   equal to the longer of the two numbers */
1006 if (i >= 0) {
1007 j = BN_num_bits_word((BN_ULONG)al);
1008 }
1009 if (i == -1) {
1010 j = BN_num_bits_word((BN_ULONG)bl);
1011 }
1012 j = 1 << (j - 1);
1013 assert(j <= al || j <= bl);
1014 k = j + j;
1015 if ((t = BN_CTX_get(ctx)) == NULL)
1016 goto err;
1017 if (al > j || bl > j) {
1018 if (bn_wexpand(t, k * 4) == NULL)
1019 goto err;
1020 if (bn_wexpand(rr, k * 4) == NULL)
1021 goto err;
1022 bn_mul_part_recursive(rr->d, a->d, b->d,
1023 j, al - j, bl - j, t->d);
1024 }
1025 else /* al <= j || bl <= j */
1026 {
1027 if (bn_wexpand(t, k * 2) == NULL)
1028 goto err;
1029 if (bn_wexpand(rr, k * 2) == NULL)
1030 goto err;
1031 bn_mul_recursive(rr->d, a->d, b->d,
1032 j, al - j, bl - j, t->d);
1033 }
1034 rr->top = top;
1035 goto end;
1036 }
1037#if 0
1038 if (i == 1 && !BN_get_flags(b, BN_FLG_STATIC_DATA)) {
1039 BIGNUM *tmp_bn = (BIGNUM *)b;
1040 if (bn_wexpand(tmp_bn, al) == NULL)
1041 goto err;
1042 tmp_bn->d[bl] = 0;
1043 bl++;
1044 i--;
1045 } else if (i == -1 && !BN_get_flags(a, BN_FLG_STATIC_DATA)) {
1046 BIGNUM *tmp_bn = (BIGNUM *)a;
1047 if (bn_wexpand(tmp_bn, bl) == NULL)
1048 goto err;
1049 tmp_bn->d[al] = 0;
1050 al++;
1051 i++;
1052 }
1053 if (i == 0) {
1054 /* symmetric and > 4 */
1055 /* 16 or larger */
1056 j = BN_num_bits_word((BN_ULONG)al);
1057 j = 1 << (j - 1);
1058 k = j + j;
1059 if ((t = BN_CTX_get(ctx)) == NULL)
1060 goto err;
1061 if (al == j) /* exact multiple */
1062 {
1063 if (bn_wexpand(t, k * 2) == NULL)
1064 goto err;
1065 if (bn_wexpand(rr, k * 2) == NULL)
1066 goto err;
1067 bn_mul_recursive(rr->d, a->d, b->d, al, t->d);
1068 } else {
1069 if (bn_wexpand(t, k * 4) == NULL)
1070 goto err;
1071 if (bn_wexpand(rr, k * 4) == NULL)
1072 goto err;
1073 bn_mul_part_recursive(rr->d, a->d, b->d,
1074 al - j, j, t->d);
1075 }
1076 rr->top = top;
1077 goto end;
1078 }
1079#endif
1080 }
1081#endif /* BN_RECURSION */
1082 if (bn_wexpand(rr, top) == NULL)
1083 goto err;
1084 rr->top = top;
1085 bn_mul_normal(rr->d, a->d, al, b->d, bl);
1086
1087#if defined(BN_MUL_COMBA) || defined(BN_RECURSION)
1088end:
1089#endif
1090 bn_correct_top(rr);
1091 if (r != rr)
1092 BN_copy(r, rr);
1093 ret = 1;
1094err:
1095 bn_check_top(r);
1096 BN_CTX_end(ctx);
1097 return (ret);
1098}
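/*
 * Editor's usage sketch (not in the original file), using only the
 * public bn.h API:
 */
#if 0
static int
bn_mul_example(void)
{
	BN_CTX *ctx;
	BIGNUM *a, *b, *r;
	int ret = 0;

	ctx = BN_CTX_new();
	a = BN_new();
	b = BN_new();
	r = BN_new();
	if (ctx == NULL || a == NULL || b == NULL || r == NULL)
		goto done;
	if (!BN_set_word(a, 0xdeadbeefUL) || !BN_set_word(b, 0xcafeUL))
		goto done;
	ret = BN_mul(r, a, b, ctx);	/* r = a * b, returns 1 on success */
done:
	BN_free(a);
	BN_free(b);
	BN_free(r);
	BN_CTX_free(ctx);
	return ret;
}
#endif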
1099
1100void
1101bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb)
1102{
1103 BN_ULONG *rr;
1104
1105#ifdef BN_COUNT
1106 fprintf(stderr, " bn_mul_normal %d * %d\n", na, nb);
1107#endif
1108
1109 if (na < nb) {
1110 int itmp;
1111 BN_ULONG *ltmp;
1112
1113 itmp = na;
1114 na = nb;
1115 nb = itmp;
1116 ltmp = a;
1117 a = b;
1118 b = ltmp;
1119
1120 }
1121 rr = &(r[na]);
1122 if (nb <= 0) {
1123 (void)bn_mul_words(r, a, na, 0);
1124 return;
1125 } else
1126 rr[0] = bn_mul_words(r, a, na, b[0]);
1127
1128 for (;;) {
1129 if (--nb <= 0)
1130 return;
1131 rr[1] = bn_mul_add_words(&(r[1]), a, na, b[1]);
1132 if (--nb <= 0)
1133 return;
1134 rr[2] = bn_mul_add_words(&(r[2]), a, na, b[2]);
1135 if (--nb <= 0)
1136 return;
1137 rr[3] = bn_mul_add_words(&(r[3]), a, na, b[3]);
1138 if (--nb <= 0)
1139 return;
1140 rr[4] = bn_mul_add_words(&(r[4]), a, na, b[4]);
1141 rr += 4;
1142 r += 4;
1143 b += 4;
1144 }
1145}
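/*
 * Editor's sketch (not in the original file): the loop above is an
 * unrolled version of the classic schoolbook product. A compact
 * reference, assuming r has room for na+nb words:
 */
#if 0
static void
bn_mul_normal_ref(BN_ULONG *r, const BN_ULONG *a, int na,
    const BN_ULONG *b, int nb)
{
	int i;

	memset(r, 0, (na + nb) * sizeof(BN_ULONG));
	/* add a*b[i], shifted by i words, into the running result */
	for (i = 0; i < nb; i++)
		r[na + i] = bn_mul_add_words(&r[i], a, na, b[i]);
}
#endif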
1146
1147void
1148bn_mul_low_normal(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1149{
1150#ifdef BN_COUNT
1151 fprintf(stderr, " bn_mul_low_normal %d * %d\n", n, n);
1152#endif
1153 bn_mul_words(r, a, n, b[0]);
1154
1155 for (;;) {
1156 if (--n <= 0)
1157 return;
1158 bn_mul_add_words(&(r[1]), a, n, b[1]);
1159 if (--n <= 0)
1160 return;
1161 bn_mul_add_words(&(r[2]), a, n, b[2]);
1162 if (--n <= 0)
1163 return;
1164 bn_mul_add_words(&(r[3]), a, n, b[3]);
1165 if (--n <= 0)
1166 return;
1167 bn_mul_add_words(&(r[4]), a, n, b[4]);
1168 r += 4;
1169 b += 4;
1170 }
1171}
diff --git a/src/lib/libcrypto/bn/bn_nist.c b/src/lib/libcrypto/bn/bn_nist.c
deleted file mode 100644
index 693d6f1ed3..0000000000
--- a/src/lib/libcrypto/bn/bn_nist.c
+++ /dev/null
@@ -1,1270 +0,0 @@
1/* $OpenBSD: bn_nist.c,v 1.15 2014/10/28 07:35:58 jsg Exp $ */
2/*
3 * Written by Nils Larsch for the OpenSSL project
4 */
5/* ====================================================================
6 * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 *
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
18 * distribution.
19 *
20 * 3. All advertising materials mentioning features or use of this
21 * software must display the following acknowledgment:
22 * "This product includes software developed by the OpenSSL Project
23 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
24 *
25 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
26 * endorse or promote products derived from this software without
27 * prior written permission. For written permission, please contact
28 * openssl-core@openssl.org.
29 *
30 * 5. Products derived from this software may not be called "OpenSSL"
31 * nor may "OpenSSL" appear in their names without prior written
32 * permission of the OpenSSL Project.
33 *
34 * 6. Redistributions of any form whatsoever must retain the following
35 * acknowledgment:
36 * "This product includes software developed by the OpenSSL Project
37 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
38 *
39 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
40 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
41 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
42 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
43 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
45 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
46 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
48 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
49 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
50 * OF THE POSSIBILITY OF SUCH DAMAGE.
51 * ====================================================================
52 *
53 * This product includes cryptographic software written by Eric Young
54 * (eay@cryptsoft.com). This product includes software written by Tim
55 * Hudson (tjh@cryptsoft.com).
56 *
57 */
58
59#include <machine/endian.h>
60
61#include <stdint.h>
62
63#include "bn_lcl.h"
64
65#define BN_NIST_192_TOP (192+BN_BITS2-1)/BN_BITS2
66#define BN_NIST_224_TOP (224+BN_BITS2-1)/BN_BITS2
67#define BN_NIST_256_TOP (256+BN_BITS2-1)/BN_BITS2
68#define BN_NIST_384_TOP (384+BN_BITS2-1)/BN_BITS2
69#define BN_NIST_521_TOP (521+BN_BITS2-1)/BN_BITS2
70
71/* pre-computed tables are "carry-less" values of modulus*(i+1) */
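/* Editor's note (not in the original file): row i of each table below
 * holds (i+1)*modulus with anything at or above the field width
 * dropped, so a reduction step that ends with carry k can be fixed up
 * by a single bn_sub_words() against row k-1. */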
72#if BN_BITS2 == 64
73static const BN_ULONG _nist_p_192[][BN_NIST_192_TOP] = {
74 {0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFEULL, 0xFFFFFFFFFFFFFFFFULL},
75 {0xFFFFFFFFFFFFFFFEULL, 0xFFFFFFFFFFFFFFFDULL, 0xFFFFFFFFFFFFFFFFULL},
76 {0xFFFFFFFFFFFFFFFDULL, 0xFFFFFFFFFFFFFFFCULL, 0xFFFFFFFFFFFFFFFFULL}
77};
78static const BN_ULONG _nist_p_192_sqr[] = {
79 0x0000000000000001ULL, 0x0000000000000002ULL, 0x0000000000000001ULL,
80 0xFFFFFFFFFFFFFFFEULL, 0xFFFFFFFFFFFFFFFDULL, 0xFFFFFFFFFFFFFFFFULL
81};
82static const BN_ULONG _nist_p_224[][BN_NIST_224_TOP] = {
83 {
84 0x0000000000000001ULL, 0xFFFFFFFF00000000ULL,
85 0xFFFFFFFFFFFFFFFFULL, 0x00000000FFFFFFFFULL
86 },
87 {
88 0x0000000000000002ULL, 0xFFFFFFFE00000000ULL,
89 0xFFFFFFFFFFFFFFFFULL, 0x00000001FFFFFFFFULL
90 } /* this one is "carry-full" */
91};
92static const BN_ULONG _nist_p_224_sqr[] = {
93 0x0000000000000001ULL, 0xFFFFFFFE00000000ULL,
94 0xFFFFFFFFFFFFFFFFULL, 0x0000000200000000ULL,
95 0x0000000000000000ULL, 0xFFFFFFFFFFFFFFFEULL,
96 0xFFFFFFFFFFFFFFFFULL
97};
98static const BN_ULONG _nist_p_256[][BN_NIST_256_TOP] = {
99 {
100 0xFFFFFFFFFFFFFFFFULL, 0x00000000FFFFFFFFULL,
101 0x0000000000000000ULL, 0xFFFFFFFF00000001ULL
102 },
103 {
104 0xFFFFFFFFFFFFFFFEULL, 0x00000001FFFFFFFFULL,
105 0x0000000000000000ULL, 0xFFFFFFFE00000002ULL
106 },
107 {
108 0xFFFFFFFFFFFFFFFDULL, 0x00000002FFFFFFFFULL,
109 0x0000000000000000ULL, 0xFFFFFFFD00000003ULL
110 },
111 {
112 0xFFFFFFFFFFFFFFFCULL, 0x00000003FFFFFFFFULL,
113 0x0000000000000000ULL, 0xFFFFFFFC00000004ULL
114 },
115 {
116 0xFFFFFFFFFFFFFFFBULL, 0x00000004FFFFFFFFULL,
117 0x0000000000000000ULL, 0xFFFFFFFB00000005ULL
118 },
119};
120static const BN_ULONG _nist_p_256_sqr[] = {
121 0x0000000000000001ULL, 0xFFFFFFFE00000000ULL,
122 0xFFFFFFFFFFFFFFFFULL, 0x00000001FFFFFFFEULL,
123 0x00000001FFFFFFFEULL, 0x00000001FFFFFFFEULL,
124 0xFFFFFFFE00000001ULL, 0xFFFFFFFE00000002ULL
125};
126static const BN_ULONG _nist_p_384[][BN_NIST_384_TOP] = {
127 {
128 0x00000000FFFFFFFFULL, 0xFFFFFFFF00000000ULL,
129 0xFFFFFFFFFFFFFFFEULL, 0xFFFFFFFFFFFFFFFFULL,
130 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL
131 },
132 {
133 0x00000001FFFFFFFEULL, 0xFFFFFFFE00000000ULL,
134 0xFFFFFFFFFFFFFFFDULL, 0xFFFFFFFFFFFFFFFFULL,
135 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL
136 },
137 {
138 0x00000002FFFFFFFDULL, 0xFFFFFFFD00000000ULL,
139 0xFFFFFFFFFFFFFFFCULL, 0xFFFFFFFFFFFFFFFFULL,
140 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL
141 },
142 {
143 0x00000003FFFFFFFCULL, 0xFFFFFFFC00000000ULL,
144 0xFFFFFFFFFFFFFFFBULL, 0xFFFFFFFFFFFFFFFFULL,
145 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL
146 },
147 {
148 0x00000004FFFFFFFBULL, 0xFFFFFFFB00000000ULL,
149 0xFFFFFFFFFFFFFFFAULL, 0xFFFFFFFFFFFFFFFFULL,
150 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL
151 },
152};
153static const BN_ULONG _nist_p_384_sqr[] = {
154 0xFFFFFFFE00000001ULL, 0x0000000200000000ULL, 0xFFFFFFFE00000000ULL,
155 0x0000000200000000ULL, 0x0000000000000001ULL, 0x0000000000000000ULL,
156 0x00000001FFFFFFFEULL, 0xFFFFFFFE00000000ULL, 0xFFFFFFFFFFFFFFFDULL,
157 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL
158};
159static const BN_ULONG _nist_p_521[] = {
160 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL,
161 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL,
162 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0x00000000000001FFULL
163};
164static const BN_ULONG _nist_p_521_sqr[] = {
165 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
166 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
167 0x0000000000000000ULL, 0x0000000000000000ULL, 0xFFFFFFFFFFFFFC00ULL,
168 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL,
169 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL,
170 0xFFFFFFFFFFFFFFFFULL, 0x000000000003FFFFULL
171};
172#elif BN_BITS2 == 32
173static const BN_ULONG _nist_p_192[][BN_NIST_192_TOP] = {
174 {
175 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0xFFFFFFFF,
176 0xFFFFFFFF, 0xFFFFFFFF
177 },
178 {
179 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFD, 0xFFFFFFFF,
180 0xFFFFFFFF, 0xFFFFFFFF
181 },
182 {
183 0xFFFFFFFD, 0xFFFFFFFF, 0xFFFFFFFC, 0xFFFFFFFF,
184 0xFFFFFFFF, 0xFFFFFFFF
185 }
186};
187static const BN_ULONG _nist_p_192_sqr[] = {
188 0x00000001, 0x00000000, 0x00000002, 0x00000000, 0x00000001, 0x00000000,
189 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFD, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
190};
191static const BN_ULONG _nist_p_224[][BN_NIST_224_TOP] = {
192 {
193 0x00000001, 0x00000000, 0x00000000, 0xFFFFFFFF,
194 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
195 },
196 {
197 0x00000002, 0x00000000, 0x00000000, 0xFFFFFFFE,
198 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
199 }
200};
201static const BN_ULONG _nist_p_224_sqr[] = {
202 0x00000001, 0x00000000, 0x00000000, 0xFFFFFFFE,
203 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000002,
204 0x00000000, 0x00000000, 0xFFFFFFFE, 0xFFFFFFFF,
205 0xFFFFFFFF, 0xFFFFFFFF
206};
207static const BN_ULONG _nist_p_256[][BN_NIST_256_TOP] = {
208 {
209 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000,
210 0x00000000, 0x00000000, 0x00000001, 0xFFFFFFFF
211 },
212 {
213 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000001,
214 0x00000000, 0x00000000, 0x00000002, 0xFFFFFFFE
215 },
216 {
217 0xFFFFFFFD, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000002,
218 0x00000000, 0x00000000, 0x00000003, 0xFFFFFFFD
219 },
220 {
221 0xFFFFFFFC, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000003,
222 0x00000000, 0x00000000, 0x00000004, 0xFFFFFFFC
223 },
224 {
225 0xFFFFFFFB, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000004,
226 0x00000000, 0x00000000, 0x00000005, 0xFFFFFFFB
227 },
228};
229static const BN_ULONG _nist_p_256_sqr[] = {
230 0x00000001, 0x00000000, 0x00000000, 0xFFFFFFFE,
231 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000001,
232 0xFFFFFFFE, 0x00000001, 0xFFFFFFFE, 0x00000001,
233 0x00000001, 0xFFFFFFFE, 0x00000002, 0xFFFFFFFE
234};
235static const BN_ULONG _nist_p_384[][BN_NIST_384_TOP] = {
236 {
237 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF,
238 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
239 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
240 },
241 {
242 0xFFFFFFFE, 0x00000001, 0x00000000, 0xFFFFFFFE,
243 0xFFFFFFFD, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
244 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
245 },
246 {
247 0xFFFFFFFD, 0x00000002, 0x00000000, 0xFFFFFFFD,
248 0xFFFFFFFC, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
249 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
250 },
251 {
252 0xFFFFFFFC, 0x00000003, 0x00000000, 0xFFFFFFFC,
253 0xFFFFFFFB, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
254 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
255 },
256 {
257 0xFFFFFFFB, 0x00000004, 0x00000000, 0xFFFFFFFB,
258 0xFFFFFFFA, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
259 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
260 },
261};
262static const BN_ULONG _nist_p_384_sqr[] = {
263 0x00000001, 0xFFFFFFFE, 0x00000000, 0x00000002, 0x00000000, 0xFFFFFFFE,
264 0x00000000, 0x00000002, 0x00000001, 0x00000000, 0x00000000, 0x00000000,
265 0xFFFFFFFE, 0x00000001, 0x00000000, 0xFFFFFFFE, 0xFFFFFFFD, 0xFFFFFFFF,
266 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
267};
268static const BN_ULONG _nist_p_521[] = {
269 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
270 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
271 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
272 0xFFFFFFFF, 0x000001FF
273};
274static const BN_ULONG _nist_p_521_sqr[] = {
275 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
276 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
277 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFC00, 0xFFFFFFFF,
278 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
279 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
280 0xFFFFFFFF, 0xFFFFFFFF, 0x0003FFFF
281};
282#else
283#error "unsupported BN_BITS2"
284#endif
285
286static const BIGNUM _bignum_nist_p_192 = {
287 (BN_ULONG *)_nist_p_192[0],
288 BN_NIST_192_TOP,
289 BN_NIST_192_TOP,
290 0,
291 BN_FLG_STATIC_DATA
292};
293
294static const BIGNUM _bignum_nist_p_224 = {
295 (BN_ULONG *)_nist_p_224[0],
296 BN_NIST_224_TOP,
297 BN_NIST_224_TOP,
298 0,
299 BN_FLG_STATIC_DATA
300};
301
302static const BIGNUM _bignum_nist_p_256 = {
303 (BN_ULONG *)_nist_p_256[0],
304 BN_NIST_256_TOP,
305 BN_NIST_256_TOP,
306 0,
307 BN_FLG_STATIC_DATA
308};
309
310static const BIGNUM _bignum_nist_p_384 = {
311 (BN_ULONG *)_nist_p_384[0],
312 BN_NIST_384_TOP,
313 BN_NIST_384_TOP,
314 0,
315 BN_FLG_STATIC_DATA
316};
317
318static const BIGNUM _bignum_nist_p_521 = {
319 (BN_ULONG *)_nist_p_521,
320 BN_NIST_521_TOP,
321 BN_NIST_521_TOP,
322 0,
323 BN_FLG_STATIC_DATA
324};
325
326
327const BIGNUM *
328BN_get0_nist_prime_192(void)
329{
330 return &_bignum_nist_p_192;
331}
332
333const BIGNUM *
334BN_get0_nist_prime_224(void)
335{
336 return &_bignum_nist_p_224;
337}
338
339const BIGNUM *
340BN_get0_nist_prime_256(void)
341{
342 return &_bignum_nist_p_256;
343}
344
345const BIGNUM *
346BN_get0_nist_prime_384(void)
347{
348 return &_bignum_nist_p_384;
349}
350
351const BIGNUM *
352BN_get0_nist_prime_521(void)
353{
354 return &_bignum_nist_p_521;
355}
356
357static void
358nist_cp_bn_0(BN_ULONG *dst, const BN_ULONG *src, int top, int max)
359{
360 int i;
361
362#ifdef BN_DEBUG
363 OPENSSL_assert(top <= max);
364#endif
365 for (i = 0; i < top; i++)
366 dst[i] = src[i];
367 for (; i < max; i++)
368 dst[i] = 0;
369}
370
371static void nist_cp_bn(BN_ULONG *dst, const BN_ULONG *src, int top)
372{
373 int i;
374
375 for (i = 0; i < top; i++)
376 dst[i] = src[i];
377}
378
379#if BN_BITS2 == 64
380#define bn_cp_64(to, n, from, m) (to)[n] = (m>=0)?((from)[m]):0;
381#define bn_64_set_0(to, n) (to)[n] = (BN_ULONG)0;
382/*
383 * The two following macros are implemented under the assumption that they
384 * are called in a sequence with *ascending* n, i.e. in the order they appear.
385 */
386#define bn_cp_32_naked(to, n, from, m) (((n)&1)?(to[(n)/2]|=((m)&1)?(from[(m)/2]&BN_MASK2h):(from[(m)/2]<<32))\
387 :(to[(n)/2] =((m)&1)?(from[(m)/2]>>32):(from[(m)/2]&BN_MASK2l)))
388#define bn_32_set_0(to, n) (((n)&1)?(to[(n)/2]&=BN_MASK2l):(to[(n)/2]=0));
389#define bn_cp_32(to,n,from,m) ((m)>=0)?bn_cp_32_naked(to,n,from,m):bn_32_set_0(to,n)
390# if BYTE_ORDER == LITTLE_ENDIAN
391# if defined(_LP64)
392# define NIST_INT64 long
393# else
394# define NIST_INT64 long long
395# endif
396# endif
397#else
398#define bn_cp_64(to, n, from, m) \
399 { \
400 bn_cp_32(to, (n)*2, from, (m)*2); \
401 bn_cp_32(to, (n)*2+1, from, (m)*2+1); \
402 }
403#define bn_64_set_0(to, n) \
404 { \
405 bn_32_set_0(to, (n)*2); \
406 bn_32_set_0(to, (n)*2+1); \
407 }
408#define bn_cp_32(to, n, from, m) (to)[n] = (m>=0)?((from)[m]):0;
409#define bn_32_set_0(to, n) (to)[n] = (BN_ULONG)0;
410# if defined(BN_LLONG)
411# define NIST_INT64 long long
412# endif
413#endif /* BN_BITS2 != 64 */
414
415#define nist_set_192(to, from, a1, a2, a3) \
416 { \
417 bn_cp_64(to, 0, from, (a3) - 3) \
418 bn_cp_64(to, 1, from, (a2) - 3) \
419 bn_cp_64(to, 2, from, (a1) - 3) \
420 }
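/* Editor's note (not in the original file): for p192 = 2^192 - 2^64 - 1
 * and an input split into 64-bit words A5..A0, the NIST fast-reduction
 * identity is
 *
 *   A mod p192 = (A2,A1,A0) + (0,A3,A3) + (A4,A4,0) + (A5,A5,A5)  (mod p192)
 *
 * r_d below starts out holding (A2,A1,A0), and the three nist_set_192()
 * invocations build the remaining addends from the high half in buf. */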
421
422int
423BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, BN_CTX *ctx)
424{
425 int top = a->top, i;
426 int carry;
427 BN_ULONG *r_d, *a_d = a->d;
428 union {
429 BN_ULONG bn[BN_NIST_192_TOP];
430 unsigned int ui[BN_NIST_192_TOP *
431 sizeof(BN_ULONG) / sizeof(unsigned int)];
432 } buf;
433 BN_ULONG c_d[BN_NIST_192_TOP], *res;
434 uintptr_t mask;
435 static const BIGNUM _bignum_nist_p_192_sqr = {
436 (BN_ULONG *)_nist_p_192_sqr,
437 sizeof(_nist_p_192_sqr) / sizeof(_nist_p_192_sqr[0]),
438 sizeof(_nist_p_192_sqr) / sizeof(_nist_p_192_sqr[0]),
439 0,
440 BN_FLG_STATIC_DATA
441 };
442
443 field = &_bignum_nist_p_192; /* just to make sure */
444
445 if (BN_is_negative(a) || BN_ucmp(a, &_bignum_nist_p_192_sqr) >= 0)
446 return BN_nnmod(r, a, field, ctx);
447
448 i = BN_ucmp(field, a);
449 if (i == 0) {
450 BN_zero(r);
451 return 1;
452 } else if (i > 0)
453 return (r == a) ? 1 : (BN_copy(r , a) != NULL);
454
455 if (r != a) {
456 if (!bn_wexpand(r, BN_NIST_192_TOP))
457 return 0;
458 r_d = r->d;
459 nist_cp_bn(r_d, a_d, BN_NIST_192_TOP);
460 } else
461 r_d = a_d;
462
463 nist_cp_bn_0(buf.bn, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP,
464 BN_NIST_192_TOP);
465
466#if defined(NIST_INT64)
467 {
468 NIST_INT64 acc; /* accumulator */
469 unsigned int *rp = (unsigned int *)r_d;
470 const unsigned int *bp = (const unsigned int *)buf.ui;
471
472 acc = rp[0];
473 acc += bp[3 * 2 - 6];
474 acc += bp[5 * 2 - 6];
475 rp[0] = (unsigned int)acc;
476 acc >>= 32;
477
478 acc += rp[1];
479 acc += bp[3 * 2 - 5];
480 acc += bp[5 * 2 - 5];
481 rp[1] = (unsigned int)acc;
482 acc >>= 32;
483
484 acc += rp[2];
485 acc += bp[3 * 2 - 6];
486 acc += bp[4 * 2 - 6];
487 acc += bp[5 * 2 - 6];
488 rp[2] = (unsigned int)acc;
489 acc >>= 32;
490
491 acc += rp[3];
492 acc += bp[3 * 2 - 5];
493 acc += bp[4 * 2 - 5];
494 acc += bp[5 * 2 - 5];
495 rp[3] = (unsigned int)acc;
496 acc >>= 32;
497
498 acc += rp[4];
499 acc += bp[4 * 2 - 6];
500 acc += bp[5 * 2 - 6];
501 rp[4] = (unsigned int)acc;
502 acc >>= 32;
503
504 acc += rp[5];
505 acc += bp[4 * 2 - 5];
506 acc += bp[5 * 2 - 5];
507 rp[5] = (unsigned int)acc;
508
509 carry = (int)(acc >> 32);
510 }
511#else
512 {
513 BN_ULONG t_d[BN_NIST_192_TOP];
514
515 nist_set_192(t_d, buf.bn, 0, 3, 3);
516 carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
517 nist_set_192(t_d, buf.bn, 4, 4, 0);
518 carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
519		nist_set_192(t_d, buf.bn, 5, 5, 5);
520 carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
521 }
522#endif
523 if (carry > 0)
524 carry = (int)bn_sub_words(r_d, r_d, _nist_p_192[carry - 1],
525 BN_NIST_192_TOP);
526 else
527 carry = 1;
528
529	/*
530	 * We need 'if (carry == 0 || result >= modulus) result -= modulus;'.
531	 * As comparison implies subtraction, we can write
532	 * 'tmp = result - modulus; if (!carry || !borrow) result = tmp;'
533	 * which is what happens below, just without an explicit if. :-)
534	 */
535 mask = 0 - (uintptr_t)bn_sub_words(c_d, r_d, _nist_p_192[0],
536 BN_NIST_192_TOP);
537 mask &= 0 - (uintptr_t)carry;
538 res = c_d;
539 res = (BN_ULONG *)(((uintptr_t)res & ~mask) | ((uintptr_t)r_d & mask));
540 nist_cp_bn(r_d, res, BN_NIST_192_TOP);
541 r->top = BN_NIST_192_TOP;
542 bn_correct_top(r);
543
544 return 1;
545}
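/*
 * Editor's sketch (not in the original file): the mask arithmetic at
 * the end of the function is a branchless select between two word
 * arrays. In isolation, assuming uintptr_t round-trips data pointers:
 */
#if 0
static BN_ULONG *
bn_select_ptr(BN_ULONG *take_if_zero, BN_ULONG *take_if_set, uintptr_t cond)
{
	uintptr_t mask = 0 - (uintptr_t)(cond != 0);	/* all-ones iff cond */

	return (BN_ULONG *)(((uintptr_t)take_if_set & mask) |
	    ((uintptr_t)take_if_zero & ~mask));
}
#endif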
546
547typedef BN_ULONG (*bn_addsub_f)(BN_ULONG *, const BN_ULONG *,
548 const BN_ULONG *, int);
549
550#define nist_set_224(to, from, a1, a2, a3, a4, a5, a6, a7) \
551 { \
552 bn_cp_32(to, 0, from, (a7) - 7) \
553 bn_cp_32(to, 1, from, (a6) - 7) \
554 bn_cp_32(to, 2, from, (a5) - 7) \
555 bn_cp_32(to, 3, from, (a4) - 7) \
556 bn_cp_32(to, 4, from, (a3) - 7) \
557 bn_cp_32(to, 5, from, (a2) - 7) \
558 bn_cp_32(to, 6, from, (a1) - 7) \
559 }
560
561int
562BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, BN_CTX *ctx)
563{
564 int top = a->top, i;
565 int carry;
566 BN_ULONG *r_d, *a_d = a->d;
567 union {
568 BN_ULONG bn[BN_NIST_224_TOP];
569 unsigned int ui[BN_NIST_224_TOP *
570 sizeof(BN_ULONG) / sizeof(unsigned int)];
571 } buf;
572 BN_ULONG c_d[BN_NIST_224_TOP], *res;
573 uintptr_t mask;
574 union {
575 bn_addsub_f f;
576 uintptr_t p;
577 } u;
578 static const BIGNUM _bignum_nist_p_224_sqr = {
579 (BN_ULONG *)_nist_p_224_sqr,
580 sizeof(_nist_p_224_sqr) / sizeof(_nist_p_224_sqr[0]),
581 sizeof(_nist_p_224_sqr) / sizeof(_nist_p_224_sqr[0]),
582 0,
583 BN_FLG_STATIC_DATA
584 };
585
586 field = &_bignum_nist_p_224; /* just to make sure */
587
588 if (BN_is_negative(a) || BN_ucmp(a, &_bignum_nist_p_224_sqr) >= 0)
589 return BN_nnmod(r, a, field, ctx);
590
591 i = BN_ucmp(field, a);
592 if (i == 0) {
593 BN_zero(r);
594 return 1;
595 } else if (i > 0)
596 return (r == a) ? 1 : (BN_copy(r, a) != NULL);
597
598 if (r != a) {
599 if (!bn_wexpand(r, BN_NIST_224_TOP))
600 return 0;
601 r_d = r->d;
602 nist_cp_bn(r_d, a_d, BN_NIST_224_TOP);
603 } else
604 r_d = a_d;
605
606#if BN_BITS2==64
607 /* copy upper 256 bits of 448 bit number ... */
608 nist_cp_bn_0(c_d, a_d + (BN_NIST_224_TOP - 1),
609 top - (BN_NIST_224_TOP - 1), BN_NIST_224_TOP);
610 /* ... and right shift by 32 to obtain upper 224 bits */
611 nist_set_224(buf.bn, c_d, 14, 13, 12, 11, 10, 9, 8);
612 /* truncate lower part to 224 bits too */
613 r_d[BN_NIST_224_TOP - 1] &= BN_MASK2l;
614#else
615 nist_cp_bn_0(buf.bn, a_d + BN_NIST_224_TOP,
616 top - BN_NIST_224_TOP, BN_NIST_224_TOP);
617#endif
618
619#if defined(NIST_INT64) && BN_BITS2!=64
620 {
621 NIST_INT64 acc; /* accumulator */
622 unsigned int *rp = (unsigned int *)r_d;
623 const unsigned int *bp = (const unsigned int *)buf.ui;
624
625 acc = rp[0];
626 acc -= bp[7 - 7];
627 acc -= bp[11 - 7];
628 rp[0] = (unsigned int)acc;
629 acc >>= 32;
630
631 acc += rp[1];
632 acc -= bp[8 - 7];
633 acc -= bp[12 - 7];
634 rp[1] = (unsigned int)acc;
635 acc >>= 32;
636
637 acc += rp[2];
638 acc -= bp[9 - 7];
639 acc -= bp[13 - 7];
640 rp[2] = (unsigned int)acc;
641 acc >>= 32;
642
643 acc += rp[3];
644 acc += bp[7 - 7];
645 acc += bp[11 - 7];
646 acc -= bp[10 - 7];
647 rp[3] = (unsigned int)acc;
648 acc >>= 32;
649
650 acc += rp[4];
651 acc += bp[8 - 7];
652 acc += bp[12 - 7];
653 acc -= bp[11 - 7];
654 rp[4] = (unsigned int)acc;
655 acc >>= 32;
656
657 acc += rp[5];
658 acc += bp[9 - 7];
659 acc += bp[13 - 7];
660 acc -= bp[12 - 7];
661 rp[5] = (unsigned int)acc;
662 acc >>= 32;
663
664 acc += rp[6];
665 acc += bp[10 - 7];
666 acc -= bp[13 - 7];
667 rp[6] = (unsigned int)acc;
668
669 carry = (int)(acc >> 32);
670# if BN_BITS2==64
671 rp[7] = carry;
672# endif
673 }
674#else
675 {
676 BN_ULONG t_d[BN_NIST_224_TOP];
677
678 nist_set_224(t_d, buf.bn, 10, 9, 8, 7, 0, 0, 0);
679 carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP);
680 nist_set_224(t_d, buf.bn, 0, 13, 12, 11, 0, 0, 0);
681 carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP);
682 nist_set_224(t_d, buf.bn, 13, 12, 11, 10, 9, 8, 7);
683 carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP);
684 nist_set_224(t_d, buf.bn, 0, 0, 0, 0, 13, 12, 11);
685 carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP);
686
687#if BN_BITS2==64
688 carry = (int)(r_d[BN_NIST_224_TOP - 1] >> 32);
689#endif
690 }
691#endif
692 u.f = bn_sub_words;
693 if (carry > 0) {
694 carry = (int)bn_sub_words(r_d, r_d, _nist_p_224[carry - 1],
695 BN_NIST_224_TOP);
696#if BN_BITS2==64
697 carry = (int)(~(r_d[BN_NIST_224_TOP - 1] >> 32)) & 1;
698#endif
699 } else if (carry < 0) {
700		/* The logic is a bit more complicated in this case:
701		 * if bn_add_words yields no carry, then the result
702		 * has to be adjusted by unconditionally *adding*
703		 * the modulus. But if it does, then the result has
704		 * to be compared to the modulus and conditionally
705		 * adjusted by *subtracting* the latter. */
706 carry = (int)bn_add_words(r_d, r_d, _nist_p_224[-carry - 1],
707 BN_NIST_224_TOP);
708 mask = 0 - (uintptr_t)carry;
709 u.p = ((uintptr_t)bn_sub_words & mask) |
710 ((uintptr_t)bn_add_words & ~mask);
711 } else
712 carry = 1;
713
714	/* otherwise it's effectively the same as in BN_nist_mod_192... */
715 mask = 0 - (uintptr_t)(*u.f)(c_d, r_d, _nist_p_224[0], BN_NIST_224_TOP);
716 mask &= 0 - (uintptr_t)carry;
717 res = c_d;
718 res = (BN_ULONG *)(((uintptr_t)res & ~mask) | ((uintptr_t)r_d & mask));
719 nist_cp_bn(r_d, res, BN_NIST_224_TOP);
720 r->top = BN_NIST_224_TOP;
721 bn_correct_top(r);
722
723 return 1;
724}
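/* Editor's note (not in the original file): the bn_addsub_f union used
 * above extends the same mask trick from data pointers to function
 * pointers: when carry < 0 the final fix-up must conditionally *add*
 * the modulus rather than subtract it, so the routine itself is
 * selected without a data-dependent branch. */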
725
726#define nist_set_256(to, from, a1, a2, a3, a4, a5, a6, a7, a8) \
727 { \
728 bn_cp_32(to, 0, from, (a8) - 8) \
729 bn_cp_32(to, 1, from, (a7) - 8) \
730 bn_cp_32(to, 2, from, (a6) - 8) \
731 bn_cp_32(to, 3, from, (a5) - 8) \
732 bn_cp_32(to, 4, from, (a4) - 8) \
733 bn_cp_32(to, 5, from, (a3) - 8) \
734 bn_cp_32(to, 6, from, (a2) - 8) \
735 bn_cp_32(to, 7, from, (a1) - 8) \
736 }
737
738int
739BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, BN_CTX *ctx)
740{
741 int i, top = a->top;
742 int carry = 0;
743 BN_ULONG *a_d = a->d, *r_d;
744 union {
745 BN_ULONG bn[BN_NIST_256_TOP];
746 unsigned int ui[BN_NIST_256_TOP *
747 sizeof(BN_ULONG) / sizeof(unsigned int)];
748 } buf;
749 BN_ULONG c_d[BN_NIST_256_TOP], *res;
750 uintptr_t mask;
751 union {
752 bn_addsub_f f;
753 uintptr_t p;
754 } u;
755 static const BIGNUM _bignum_nist_p_256_sqr = {
756 (BN_ULONG *)_nist_p_256_sqr,
757 sizeof(_nist_p_256_sqr) / sizeof(_nist_p_256_sqr[0]),
758 sizeof(_nist_p_256_sqr) / sizeof(_nist_p_256_sqr[0]),
759 0,
760 BN_FLG_STATIC_DATA
761 };
762
763 field = &_bignum_nist_p_256; /* just to make sure */
764
765 if (BN_is_negative(a) || BN_ucmp(a, &_bignum_nist_p_256_sqr) >= 0)
766 return BN_nnmod(r, a, field, ctx);
767
768 i = BN_ucmp(field, a);
769 if (i == 0) {
770 BN_zero(r);
771 return 1;
772 } else if (i > 0)
773 return (r == a) ? 1 : (BN_copy(r, a) != NULL);
774
775 if (r != a) {
776 if (!bn_wexpand(r, BN_NIST_256_TOP))
777 return 0;
778 r_d = r->d;
779 nist_cp_bn(r_d, a_d, BN_NIST_256_TOP);
780 } else
781 r_d = a_d;
782
783 nist_cp_bn_0(buf.bn, a_d + BN_NIST_256_TOP,
784 top - BN_NIST_256_TOP, BN_NIST_256_TOP);
785
786#if defined(NIST_INT64)
787 {
788 NIST_INT64 acc; /* accumulator */
789 unsigned int *rp = (unsigned int *)r_d;
790 const unsigned int *bp = (const unsigned int *)buf.ui;
791
792 acc = rp[0];
793 acc += bp[8 - 8];
794 acc += bp[9 - 8];
795 acc -= bp[11 - 8];
796 acc -= bp[12 - 8];
797 acc -= bp[13 - 8];
798 acc -= bp[14 - 8];
799 rp[0] = (unsigned int)acc;
800 acc >>= 32;
801
802 acc += rp[1];
803 acc += bp[9 - 8];
804 acc += bp[10 - 8];
805 acc -= bp[12 - 8];
806 acc -= bp[13 - 8];
807 acc -= bp[14 - 8];
808 acc -= bp[15 - 8];
809 rp[1] = (unsigned int)acc;
810 acc >>= 32;
811
812 acc += rp[2];
813 acc += bp[10 - 8];
814 acc += bp[11 - 8];
815 acc -= bp[13 - 8];
816 acc -= bp[14 - 8];
817 acc -= bp[15 - 8];
818 rp[2] = (unsigned int)acc;
819 acc >>= 32;
820
821 acc += rp[3];
822 acc += bp[11 - 8];
823 acc += bp[11 - 8];
824 acc += bp[12 - 8];
825 acc += bp[12 - 8];
826 acc += bp[13 - 8];
827 acc -= bp[15 - 8];
828 acc -= bp[8 - 8];
829 acc -= bp[9 - 8];
830 rp[3] = (unsigned int)acc;
831 acc >>= 32;
832
833 acc += rp[4];
834 acc += bp[12 - 8];
835 acc += bp[12 - 8];
836 acc += bp[13 - 8];
837 acc += bp[13 - 8];
838 acc += bp[14 - 8];
839 acc -= bp[9 - 8];
840 acc -= bp[10 - 8];
841 rp[4] = (unsigned int)acc;
842 acc >>= 32;
843
844 acc += rp[5];
845 acc += bp[13 - 8];
846 acc += bp[13 - 8];
847 acc += bp[14 - 8];
848 acc += bp[14 - 8];
849 acc += bp[15 - 8];
850 acc -= bp[10 - 8];
851 acc -= bp[11 - 8];
852 rp[5] = (unsigned int)acc;
853 acc >>= 32;
854
855 acc += rp[6];
856 acc += bp[14 - 8];
857 acc += bp[14 - 8];
858 acc += bp[15 - 8];
859 acc += bp[15 - 8];
860 acc += bp[14 - 8];
861 acc += bp[13 - 8];
862 acc -= bp[8 - 8];
863 acc -= bp[9 - 8];
864 rp[6] = (unsigned int)acc;
865 acc >>= 32;
866
867 acc += rp[7];
868 acc += bp[15 - 8];
869 acc += bp[15 - 8];
870 acc += bp[15 - 8];
871 acc += bp[8 - 8];
872 acc -= bp[10 - 8];
873 acc -= bp[11 - 8];
874 acc -= bp[12 - 8];
875 acc -= bp[13 - 8];
876 rp[7] = (unsigned int)acc;
877
878 carry = (int)(acc >> 32);
879 }
880#else
881 {
882 BN_ULONG t_d[BN_NIST_256_TOP];
883
884 /*S1*/
885 nist_set_256(t_d, buf.bn, 15, 14, 13, 12, 11, 0, 0, 0);
886 /*S2*/
887 nist_set_256(c_d, buf.bn, 0, 15, 14, 13, 12, 0, 0, 0);
888 carry = (int)bn_add_words(t_d, t_d, c_d, BN_NIST_256_TOP);
889 /* left shift */
890 {
891 BN_ULONG *ap, t, c;
892 ap = t_d;
893 c = 0;
894 for (i = BN_NIST_256_TOP; i != 0; --i) {
895 t = *ap;
896 *(ap++) = ((t << 1) | c) & BN_MASK2;
897 c = (t & BN_TBIT) ? 1 : 0;
898 }
899 carry <<= 1;
900 carry |= c;
901 }
902 carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
903 /*S3*/
904 nist_set_256(t_d, buf.bn, 15, 14, 0, 0, 0, 10, 9, 8);
905 carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
906 /*S4*/
907 nist_set_256(t_d, buf.bn, 8, 13, 15, 14, 13, 11, 10, 9);
908 carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
909 /*D1*/
910 nist_set_256(t_d, buf.bn, 10, 8, 0, 0, 0, 13, 12, 11);
911 carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
912 /*D2*/
913 nist_set_256(t_d, buf.bn, 11, 9, 0, 0, 15, 14, 13, 12);
914 carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
915 /*D3*/
916 nist_set_256(t_d, buf.bn, 12, 0, 10, 9, 8, 15, 14, 13);
917 carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
918 /*D4*/
919 nist_set_256(t_d, buf.bn, 13, 0, 11, 10, 9, 0, 15, 14);
920 carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
921
922 }
923#endif
924 /* see BN_nist_mod_224 for explanation */
925 u.f = bn_sub_words;
926 if (carry > 0)
927 carry = (int)bn_sub_words(r_d, r_d, _nist_p_256[carry - 1],
928 BN_NIST_256_TOP);
929 else if (carry < 0) {
930 carry = (int)bn_add_words(r_d, r_d, _nist_p_256[-carry - 1],
931 BN_NIST_256_TOP);
932 mask = 0 - (uintptr_t)carry;
933 u.p = ((uintptr_t)bn_sub_words & mask) |
934 ((uintptr_t)bn_add_words & ~mask);
935 } else
936 carry = 1;
937
938 mask = 0 - (uintptr_t)(*u.f)(c_d, r_d, _nist_p_256[0], BN_NIST_256_TOP);
939 mask &= 0 - (uintptr_t)carry;
940 res = c_d;
941 res = (BN_ULONG *)(((uintptr_t)res & ~mask) | ((uintptr_t)r_d & mask));
942 nist_cp_bn(r_d, res, BN_NIST_256_TOP);
943 r->top = BN_NIST_256_TOP;
944 bn_correct_top(r);
945
946 return 1;
947}
948
949#define nist_set_384(to,from,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12) \
950 { \
951 bn_cp_32(to, 0, from, (a12) - 12) \
952 bn_cp_32(to, 1, from, (a11) - 12) \
953 bn_cp_32(to, 2, from, (a10) - 12) \
954 bn_cp_32(to, 3, from, (a9) - 12) \
955 bn_cp_32(to, 4, from, (a8) - 12) \
956 bn_cp_32(to, 5, from, (a7) - 12) \
957 bn_cp_32(to, 6, from, (a6) - 12) \
958 bn_cp_32(to, 7, from, (a5) - 12) \
959 bn_cp_32(to, 8, from, (a4) - 12) \
960 bn_cp_32(to, 9, from, (a3) - 12) \
961 bn_cp_32(to, 10, from, (a2) - 12) \
962 bn_cp_32(to, 11, from, (a1) - 12) \
963 }
964
965int
966BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, BN_CTX *ctx)
967{
968 int i, top = a->top;
969 int carry = 0;
970 BN_ULONG *r_d, *a_d = a->d;
971 union {
972 BN_ULONG bn[BN_NIST_384_TOP];
973 unsigned int ui[BN_NIST_384_TOP *
974 sizeof(BN_ULONG) / sizeof(unsigned int)];
975 } buf;
976 BN_ULONG c_d[BN_NIST_384_TOP], *res;
977 uintptr_t mask;
978 union {
979 bn_addsub_f f;
980 uintptr_t p;
981 } u;
982 static const BIGNUM _bignum_nist_p_384_sqr = {
983 (BN_ULONG *)_nist_p_384_sqr,
984 sizeof(_nist_p_384_sqr) / sizeof(_nist_p_384_sqr[0]),
985 sizeof(_nist_p_384_sqr) / sizeof(_nist_p_384_sqr[0]),
986 0,
987 BN_FLG_STATIC_DATA
988 };
989
990 field = &_bignum_nist_p_384; /* just to make sure */
991
992 if (BN_is_negative(a) || BN_ucmp(a, &_bignum_nist_p_384_sqr) >= 0)
993 return BN_nnmod(r, a, field, ctx);
994
995 i = BN_ucmp(field, a);
996 if (i == 0) {
997 BN_zero(r);
998 return 1;
999 } else if (i > 0)
1000 return (r == a) ? 1 : (BN_copy(r, a) != NULL);
1001
1002 if (r != a) {
1003 if (!bn_wexpand(r, BN_NIST_384_TOP))
1004 return 0;
1005 r_d = r->d;
1006 nist_cp_bn(r_d, a_d, BN_NIST_384_TOP);
1007 } else
1008 r_d = a_d;
1009
1010 nist_cp_bn_0(buf.bn, a_d + BN_NIST_384_TOP,
1011 top - BN_NIST_384_TOP, BN_NIST_384_TOP);
1012
1013#if defined(NIST_INT64)
1014 {
1015 NIST_INT64 acc; /* accumulator */
1016 unsigned int *rp = (unsigned int *)r_d;
1017 const unsigned int *bp = (const unsigned int *)buf.ui;
1018
1019 acc = rp[0];
1020 acc += bp[12 - 12];
1021 acc += bp[21 - 12];
1022 acc += bp[20 - 12];
1023 acc -= bp[23 - 12];
1024 rp[0] = (unsigned int)acc;
1025 acc >>= 32;
1026
1027 acc += rp[1];
1028 acc += bp[13 - 12];
1029 acc += bp[22 - 12];
1030 acc += bp[23 - 12];
1031 acc -= bp[12 - 12];
1032 acc -= bp[20 - 12];
1033 rp[1] = (unsigned int)acc;
1034 acc >>= 32;
1035
1036 acc += rp[2];
1037 acc += bp[14 - 12];
1038 acc += bp[23 - 12];
1039 acc -= bp[13 - 12];
1040 acc -= bp[21 - 12];
1041 rp[2] = (unsigned int)acc;
1042 acc >>= 32;
1043
1044 acc += rp[3];
1045 acc += bp[15 - 12];
1046 acc += bp[12 - 12];
1047 acc += bp[20 - 12];
1048 acc += bp[21 - 12];
1049 acc -= bp[14 - 12];
1050 acc -= bp[22 - 12];
1051 acc -= bp[23 - 12];
1052 rp[3] = (unsigned int)acc;
1053 acc >>= 32;
1054
1055 acc += rp[4];
1056 acc += bp[21 - 12];
1057 acc += bp[21 - 12];
1058 acc += bp[16 - 12];
1059 acc += bp[13 - 12];
1060 acc += bp[12 - 12];
1061 acc += bp[20 - 12];
1062 acc += bp[22 - 12];
1063 acc -= bp[15 - 12];
1064 acc -= bp[23 - 12];
1065 acc -= bp[23 - 12];
1066 rp[4] = (unsigned int)acc;
1067 acc >>= 32;
1068
1069 acc += rp[5];
1070 acc += bp[22 - 12];
1071 acc += bp[22 - 12];
1072 acc += bp[17 - 12];
1073 acc += bp[14 - 12];
1074 acc += bp[13 - 12];
1075 acc += bp[21 - 12];
1076 acc += bp[23 - 12];
1077 acc -= bp[16 - 12];
1078 rp[5] = (unsigned int)acc;
1079 acc >>= 32;
1080
1081 acc += rp[6];
1082 acc += bp[23 - 12];
1083 acc += bp[23 - 12];
1084 acc += bp[18 - 12];
1085 acc += bp[15 - 12];
1086 acc += bp[14 - 12];
1087 acc += bp[22 - 12];
1088 acc -= bp[17 - 12];
1089 rp[6] = (unsigned int)acc;
1090 acc >>= 32;
1091
1092 acc += rp[7];
1093 acc += bp[19 - 12];
1094 acc += bp[16 - 12];
1095 acc += bp[15 - 12];
1096 acc += bp[23 - 12];
1097 acc -= bp[18 - 12];
1098 rp[7] = (unsigned int)acc;
1099 acc >>= 32;
1100
1101 acc += rp[8];
1102 acc += bp[20 - 12];
1103 acc += bp[17 - 12];
1104 acc += bp[16 - 12];
1105 acc -= bp[19 - 12];
1106 rp[8] = (unsigned int)acc;
1107 acc >>= 32;
1108
1109 acc += rp[9];
1110 acc += bp[21 - 12];
1111 acc += bp[18 - 12];
1112 acc += bp[17 - 12];
1113 acc -= bp[20 - 12];
1114 rp[9] = (unsigned int)acc;
1115 acc >>= 32;
1116
1117 acc += rp[10];
1118 acc += bp[22 - 12];
1119 acc += bp[19 - 12];
1120 acc += bp[18 - 12];
1121 acc -= bp[21 - 12];
1122 rp[10] = (unsigned int)acc;
1123 acc >>= 32;
1124
1125 acc += rp[11];
1126 acc += bp[23 - 12];
1127 acc += bp[20 - 12];
1128 acc += bp[19 - 12];
1129 acc -= bp[22 - 12];
1130 rp[11] = (unsigned int)acc;
1131
1132 carry = (int)(acc >> 32);
1133 }
1134#else
1135 {
1136 BN_ULONG t_d[BN_NIST_384_TOP];
1137
1138 /*S1*/
1139 nist_set_256(t_d, buf.bn, 0, 0, 0, 0, 0, 23 - 4, 22 - 4,
1140 21 - 4);
1141 /* left shift */
1142 {
1143 BN_ULONG *ap, t, c;
1144 ap = t_d;
1145 c = 0;
1146 for (i = 3; i != 0; --i) {
1147				t = *ap;
1148				*(ap++) = ((t << 1) | c) & BN_MASK2;
1149 c = (t & BN_TBIT) ? 1 : 0;
1150 }
1151 *ap = c;
1152 }
1153 carry = (int)bn_add_words(r_d + (128 / BN_BITS2),
1154 r_d + (128 / BN_BITS2), t_d, BN_NIST_256_TOP);
1155		/*S2*/
1156 carry += (int)bn_add_words(r_d, r_d, buf.bn, BN_NIST_384_TOP);
1157 /*S3*/
1158 nist_set_384(t_d, buf.bn, 20, 19, 18, 17, 16, 15, 14, 13, 12,
1159 23, 22, 21);
1160 carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
1161 /*S4*/
1162 nist_set_384(t_d, buf.bn, 19, 18, 17, 16, 15, 14, 13, 12, 20,
1163 0, 23, 0);
1164 carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
1165 /*S5*/
1166		nist_set_384(t_d, buf.bn, 0, 0, 0, 0, 23, 22, 21, 20, 0, 0, 0, 0);
1167 carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
1168 /*S6*/
1169		nist_set_384(t_d, buf.bn, 0, 0, 0, 0, 0, 0, 23, 22, 21, 0, 0, 20);
1170 carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
1171 /*D1*/
1172 nist_set_384(t_d, buf.bn, 22, 21, 20, 19, 18, 17, 16, 15, 14,
1173 13, 12, 23);
1174 carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
1175 /*D2*/
1176		nist_set_384(t_d, buf.bn, 0, 0, 0, 0, 0, 0, 0, 23, 22, 21, 20, 0);
1177 carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
1178 /*D3*/
1179		nist_set_384(t_d, buf.bn, 0, 0, 0, 0, 0, 0, 0, 23, 23, 0, 0, 0);
1180 carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
1181
1182 }
1183#endif
1184 /* see BN_nist_mod_224 for explanation */
1185 u.f = bn_sub_words;
1186 if (carry > 0)
1187 carry = (int)bn_sub_words(r_d, r_d, _nist_p_384[carry - 1],
1188 BN_NIST_384_TOP);
1189 else if (carry < 0) {
1190 carry = (int)bn_add_words(r_d, r_d, _nist_p_384[-carry - 1],
1191 BN_NIST_384_TOP);
1192 mask = 0 - (uintptr_t)carry;
1193 u.p = ((uintptr_t)bn_sub_words & mask) |
1194 ((uintptr_t)bn_add_words & ~mask);
1195 } else
1196 carry = 1;
1197
1198 mask = 0 - (uintptr_t)(*u.f)(c_d, r_d, _nist_p_384[0], BN_NIST_384_TOP);
1199 mask &= 0 - (uintptr_t)carry;
1200 res = c_d;
1201 res = (BN_ULONG *)(((uintptr_t)res & ~mask) | ((uintptr_t)r_d & mask));
1202 nist_cp_bn(r_d, res, BN_NIST_384_TOP);
1203 r->top = BN_NIST_384_TOP;
1204 bn_correct_top(r);
1205
1206 return 1;
1207}
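
A note on the carry fixup above (shared with BN_nist_mod_224/256): the final
subtraction of p is computed unconditionally into c_d, and the borrow it
returns is stretched into an all-ones/all-zero mask that selects either c_d
or r_d without branching on secret data. A minimal sketch of that select
idiom on plain arrays; ct_select and the other names are hypothetical, not
the library's API:

#include <stdint.h>
#include <stdio.h>

/* Pick a[i] when borrow == 1 and b[i] when borrow == 0, branch-free. */
static void
ct_select(uint64_t *out, const uint64_t *a, const uint64_t *b,
    uint64_t borrow, int n)
{
	uint64_t mask = 0 - borrow;	/* all ones iff borrow == 1 */
	int i;

	for (i = 0; i < n; i++)
		out[i] = (a[i] & mask) | (b[i] & ~mask);
}

int
main(void)
{
	uint64_t r[2] = { 5, 0 }, c[2] = { 2, 0 }, out[2];

	ct_select(out, r, c, 1, 2);	/* borrow set: keep r */
	printf("%llu\n", (unsigned long long)out[0]);	/* prints 5 */
	return 0;
}
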
1208
1209#define BN_NIST_521_RSHIFT (521%BN_BITS2)
1210#define BN_NIST_521_LSHIFT (BN_BITS2-BN_NIST_521_RSHIFT)
1211#define BN_NIST_521_TOP_MASK ((BN_ULONG)BN_MASK2>>BN_NIST_521_LSHIFT)
1212
1213int
1214BN_nist_mod_521(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, BN_CTX *ctx)
1215{
1216 int top = a->top, i;
1217 BN_ULONG *r_d, *a_d = a->d, t_d[BN_NIST_521_TOP], val, tmp, *res;
1218 uintptr_t mask;
1219 static const BIGNUM _bignum_nist_p_521_sqr = {
1220 (BN_ULONG *)_nist_p_521_sqr,
1221 sizeof(_nist_p_521_sqr) / sizeof(_nist_p_521_sqr[0]),
1222 sizeof(_nist_p_521_sqr) / sizeof(_nist_p_521_sqr[0]),
1223 0,
1224 BN_FLG_STATIC_DATA
1225 };
1226
1227 field = &_bignum_nist_p_521; /* just to make sure */
1228
1229 if (BN_is_negative(a) || BN_ucmp(a, &_bignum_nist_p_521_sqr) >= 0)
1230 return BN_nnmod(r, a, field, ctx);
1231
1232 i = BN_ucmp(field, a);
1233 if (i == 0) {
1234 BN_zero(r);
1235 return 1;
1236 } else if (i > 0)
1237 return (r == a) ? 1 : (BN_copy(r, a) != NULL);
1238
1239 if (r != a) {
1240 if (!bn_wexpand(r, BN_NIST_521_TOP))
1241 return 0;
1242 r_d = r->d;
1243 nist_cp_bn(r_d, a_d, BN_NIST_521_TOP);
1244 } else
1245 r_d = a_d;
1246
1247 /* upper 521 bits, copy ... */
1248 nist_cp_bn_0(t_d, a_d + (BN_NIST_521_TOP - 1),
1249 top - (BN_NIST_521_TOP - 1), BN_NIST_521_TOP);
1250 /* ... and right shift */
1251 for (val = t_d[0], i = 0; i < BN_NIST_521_TOP - 1; i++) {
1252 tmp = val >> BN_NIST_521_RSHIFT;
1253 val = t_d[i + 1];
1254 t_d[i] = (tmp | val << BN_NIST_521_LSHIFT) & BN_MASK2;
1255 }
1256 t_d[i] = val >> BN_NIST_521_RSHIFT;
1257 /* lower 521 bits */
1258 r_d[i] &= BN_NIST_521_TOP_MASK;
1259
1260 bn_add_words(r_d, r_d, t_d, BN_NIST_521_TOP);
1261 mask = 0 - (uintptr_t)bn_sub_words(t_d, r_d, _nist_p_521,
1262 BN_NIST_521_TOP);
1263 res = t_d;
1264 res = (BN_ULONG *)(((uintptr_t)res & ~mask) | ((uintptr_t)r_d & mask));
1265 nist_cp_bn(r_d, res, BN_NIST_521_TOP);
1266 r->top = BN_NIST_521_TOP;
1267 bn_correct_top(r);
1268
1269 return 1;
1270}
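
The 521-bit reduction works because p = 2^521 - 1 is a Mersenne prime:
2^521 == 1 (mod p), so splitting a at bit 521 and adding the two halves
reduces it, with one conditional subtraction at the end. A scaled-down
sketch of the same fold for p = 2^61 - 1 on a single 64-bit word (names
are hypothetical):

#include <stdint.h>
#include <stdio.h>

#define MERSENNE_BITS	61
#define MERSENNE_P	(((uint64_t)1 << MERSENNE_BITS) - 1)

static uint64_t
mersenne_reduce(uint64_t x)
{
	/* fold: low 61 bits plus the bits above position 61 */
	x = (x & MERSENNE_P) + (x >> MERSENNE_BITS);
	/* the sum is below 2p, so one conditional subtraction suffices */
	if (x >= MERSENNE_P)
		x -= MERSENNE_P;
	return x;
}

int
main(void)
{
	uint64_t x = ~(uint64_t)0;	/* 2^64 - 1 */

	/* 2^64 == 8 (mod 2^61 - 1), so this prints 7 */
	printf("%llu\n", (unsigned long long)mersenne_reduce(x));
	return 0;
}
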
diff --git a/src/lib/libcrypto/bn/bn_prime.c b/src/lib/libcrypto/bn/bn_prime.c
deleted file mode 100644
index 02780d32e6..0000000000
--- a/src/lib/libcrypto/bn/bn_prime.c
+++ /dev/null
@@ -1,518 +0,0 @@
1/* $OpenBSD: bn_prime.c,v 1.13 2015/02/09 15:49:22 jsing Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58/* ====================================================================
59 * Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved.
60 *
61 * Redistribution and use in source and binary forms, with or without
62 * modification, are permitted provided that the following conditions
63 * are met:
64 *
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 *
68 * 2. Redistributions in binary form must reproduce the above copyright
69 * notice, this list of conditions and the following disclaimer in
70 * the documentation and/or other materials provided with the
71 * distribution.
72 *
73 * 3. All advertising materials mentioning features or use of this
74 * software must display the following acknowledgment:
75 * "This product includes software developed by the OpenSSL Project
76 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
77 *
78 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
79 * endorse or promote products derived from this software without
80 * prior written permission. For written permission, please contact
81 * openssl-core@openssl.org.
82 *
83 * 5. Products derived from this software may not be called "OpenSSL"
84 * nor may "OpenSSL" appear in their names without prior written
85 * permission of the OpenSSL Project.
86 *
87 * 6. Redistributions of any form whatsoever must retain the following
88 * acknowledgment:
89 * "This product includes software developed by the OpenSSL Project
90 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
91 *
92 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
93 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
95 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
96 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
97 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
98 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
99 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
101 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
103 * OF THE POSSIBILITY OF SUCH DAMAGE.
104 * ====================================================================
105 *
106 * This product includes cryptographic software written by Eric Young
107 * (eay@cryptsoft.com). This product includes software written by Tim
108 * Hudson (tjh@cryptsoft.com).
109 *
110 */
111
112#include <stdio.h>
113#include <time.h>
114
115#include "bn_lcl.h"
116
117/* NB: these functions have been "upgraded", the deprecated versions (which are
118 * compatibility wrappers using these functions) are in bn_depr.c.
119 * - Geoff
120 */
121
122/* The quick sieve algorithm approach to weeding out composites is
123 * Philip Zimmermann's, as implemented in PGP. I have had a read of
124 * his comments and implemented my own version.
125 */
126#include "bn_prime.h"
127
128static int witness(BIGNUM *w, const BIGNUM *a, const BIGNUM *a1,
129 const BIGNUM *a1_odd, int k, BN_CTX *ctx, BN_MONT_CTX *mont);
130static int probable_prime(BIGNUM *rnd, int bits);
131static int probable_prime_dh(BIGNUM *rnd, int bits,
132 const BIGNUM *add, const BIGNUM *rem, BN_CTX *ctx);
133static int probable_prime_dh_safe(BIGNUM *rnd, int bits,
134 const BIGNUM *add, const BIGNUM *rem, BN_CTX *ctx);
135
136int
137BN_GENCB_call(BN_GENCB *cb, int a, int b)
138{
139 /* No callback means continue */
140 if (!cb)
141 return 1;
142 switch (cb->ver) {
143 case 1:
144 /* Deprecated-style callbacks */
145 if (!cb->cb.cb_1)
146 return 1;
147 cb->cb.cb_1(a, b, cb->arg);
148 return 1;
149 case 2:
150 /* New-style callbacks */
151 return cb->cb.cb_2(a, b, cb);
152 default:
153 break;
154 }
155 /* Unrecognised callback type */
156 return 0;
157}
158
159int
160BN_generate_prime_ex(BIGNUM *ret, int bits, int safe, const BIGNUM *add,
161 const BIGNUM *rem, BN_GENCB *cb)
162{
163 BIGNUM *t;
164 int found = 0;
165 int i, j, c1 = 0;
166 BN_CTX *ctx;
167 int checks = BN_prime_checks_for_size(bits);
168
169 ctx = BN_CTX_new();
170 if (ctx == NULL)
171 goto err;
172 BN_CTX_start(ctx);
173 if ((t = BN_CTX_get(ctx)) == NULL)
174 goto err;
175loop:
176 /* make a random number and set the top and bottom bits */
177 if (add == NULL) {
178 if (!probable_prime(ret, bits))
179 goto err;
180 } else {
181 if (safe) {
182 if (!probable_prime_dh_safe(ret, bits, add, rem, ctx))
183 goto err;
184 } else {
185 if (!probable_prime_dh(ret, bits, add, rem, ctx))
186 goto err;
187 }
188 }
189 /* if (BN_mod_word(ret,(BN_ULONG)3) == 1) goto loop; */
190 if (!BN_GENCB_call(cb, 0, c1++))
191 /* aborted */
192 goto err;
193
194 if (!safe) {
195 i = BN_is_prime_fasttest_ex(ret, checks, ctx, 0, cb);
196 if (i == -1)
197 goto err;
198 if (i == 0)
199 goto loop;
200 } else {
201 /* for "safe prime" generation,
202 * check that (p-1)/2 is prime.
203		 * Since a prime is odd, we just
204		 * need to divide by 2 */
205 if (!BN_rshift1(t, ret))
206 goto err;
207
208 for (i = 0; i < checks; i++) {
209 j = BN_is_prime_fasttest_ex(ret, 1, ctx, 0, cb);
210 if (j == -1)
211 goto err;
212 if (j == 0)
213 goto loop;
214
215 j = BN_is_prime_fasttest_ex(t, 1, ctx, 0, cb);
216 if (j == -1)
217 goto err;
218 if (j == 0)
219 goto loop;
220
221 if (!BN_GENCB_call(cb, 2, c1 - 1))
222 goto err;
223 /* We have a safe prime test pass */
224 }
225 }
226 /* we have a prime :-) */
227 found = 1;
228
229err:
230 if (ctx != NULL) {
231 BN_CTX_end(ctx);
232 BN_CTX_free(ctx);
233 }
234 bn_check_top(ret);
235 return found;
236}
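
For reference, a minimal sketch of how a caller might drive
BN_generate_prime_ex() with a new-style BN_GENCB callback; it assumes the
BN_GENCB_set macro from bn.h and does only minimal error handling:

#include <stdio.h>
#include <openssl/bn.h>

static int
progress(int a, int b, BN_GENCB *cb)
{
	/* a: phase (0 = new candidate, 1 = MR round, 2 = safe-prime pass) */
	fprintf(stderr, "phase %d, iteration %d\n", a, b);
	return 1;	/* returning 0 aborts the search */
}

int
main(void)
{
	BIGNUM *p = BN_new();
	BN_GENCB cb;

	if (p == NULL)
		return 1;
	BN_GENCB_set(&cb, progress, NULL);
	if (!BN_generate_prime_ex(p, 512, 0, NULL, NULL, &cb))
		return 1;
	BN_print_fp(stdout, p);
	printf("\n");
	BN_free(p);
	return 0;
}
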
237
238int
239BN_is_prime_ex(const BIGNUM *a, int checks, BN_CTX *ctx_passed, BN_GENCB *cb)
240{
241 return BN_is_prime_fasttest_ex(a, checks, ctx_passed, 0, cb);
242}
243
244int
245BN_is_prime_fasttest_ex(const BIGNUM *a, int checks, BN_CTX *ctx_passed,
246 int do_trial_division, BN_GENCB *cb)
247{
248 int i, j, ret = -1;
249 int k;
250 BN_CTX *ctx = NULL;
251 BIGNUM *A1, *A1_odd, *check; /* taken from ctx */
252 BN_MONT_CTX *mont = NULL;
253 const BIGNUM *A = NULL;
254
255 if (BN_cmp(a, BN_value_one()) <= 0)
256 return 0;
257
258 if (checks == BN_prime_checks)
259 checks = BN_prime_checks_for_size(BN_num_bits(a));
260
261 /* first look for small factors */
262 if (!BN_is_odd(a))
263 /* a is even => a is prime if and only if a == 2 */
264 return BN_is_word(a, 2);
265 if (do_trial_division) {
266 for (i = 1; i < NUMPRIMES; i++)
267 if (BN_mod_word(a, primes[i]) == 0)
268 return 0;
269 if (!BN_GENCB_call(cb, 1, -1))
270 goto err;
271 }
272
273 if (ctx_passed != NULL)
274 ctx = ctx_passed;
275 else if ((ctx = BN_CTX_new()) == NULL)
276 goto err;
277 BN_CTX_start(ctx);
278
279 /* A := abs(a) */
280 if (a->neg) {
281 BIGNUM *t;
282 if ((t = BN_CTX_get(ctx)) == NULL)
283 goto err;
284 BN_copy(t, a);
285 t->neg = 0;
286 A = t;
287 } else
288 A = a;
289 if ((A1 = BN_CTX_get(ctx)) == NULL)
290 goto err;
291 if ((A1_odd = BN_CTX_get(ctx)) == NULL)
292 goto err;
293 if ((check = BN_CTX_get(ctx)) == NULL)
294 goto err;
295
296 /* compute A1 := A - 1 */
297 if (!BN_copy(A1, A))
298 goto err;
299 if (!BN_sub_word(A1, 1))
300 goto err;
301 if (BN_is_zero(A1)) {
302 ret = 0;
303 goto err;
304 }
305
306 /* write A1 as A1_odd * 2^k */
307 k = 1;
308 while (!BN_is_bit_set(A1, k))
309 k++;
310 if (!BN_rshift(A1_odd, A1, k))
311 goto err;
312
313 /* Montgomery setup for computations mod A */
314 mont = BN_MONT_CTX_new();
315 if (mont == NULL)
316 goto err;
317 if (!BN_MONT_CTX_set(mont, A, ctx))
318 goto err;
319
320 for (i = 0; i < checks; i++) {
321 if (!BN_pseudo_rand_range(check, A1))
322 goto err;
323 if (!BN_add_word(check, 1))
324 goto err;
325 /* now 1 <= check < A */
326
327 j = witness(check, A, A1, A1_odd, k, ctx, mont);
328 if (j == -1)
329 goto err;
330 if (j) {
331 ret = 0;
332 goto err;
333 }
334 if (!BN_GENCB_call(cb, 1, i))
335 goto err;
336 }
337 ret = 1;
338
339err:
340 if (ctx != NULL) {
341 BN_CTX_end(ctx);
342 if (ctx_passed == NULL)
343 BN_CTX_free(ctx);
344 }
345 BN_MONT_CTX_free(mont);
346
347 return (ret);
348}
349
350static int
351witness(BIGNUM *w, const BIGNUM *a, const BIGNUM *a1, const BIGNUM *a1_odd,
352 int k, BN_CTX *ctx, BN_MONT_CTX *mont)
353{
354 if (!BN_mod_exp_mont(w, w, a1_odd, a, ctx, mont))
355 /* w := w^a1_odd mod a */
356 return -1;
357 if (BN_is_one(w))
358 return 0; /* probably prime */
359 if (BN_cmp(w, a1) == 0)
360 return 0; /* w == -1 (mod a), 'a' is probably prime */
361 while (--k) {
362 if (!BN_mod_mul(w, w, w, a, ctx)) /* w := w^2 mod a */
363 return -1;
364 if (BN_is_one(w))
365 return 1; /* 'a' is composite, otherwise a previous 'w' would
366 * have been == -1 (mod 'a') */
367 if (BN_cmp(w, a1) == 0)
368 return 0; /* w == -1 (mod a), 'a' is probably prime */
369 }
370 /* If we get here, 'w' is the (a-1)/2-th power of the original 'w',
371 * and it is neither -1 nor +1 -- so 'a' cannot be prime */
372 bn_check_top(w);
373 return 1;
374}
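
witness() is one round of Miller-Rabin: write a - 1 = a1_odd * 2^k, raise
w to a1_odd, then square up to k - 1 times looking for -1. A self-contained
sketch of the same test on 32-bit integers (small_witness and mulmod are
hypothetical names, not the library's API):

#include <stdint.h>
#include <stdio.h>

static uint32_t
mulmod(uint32_t a, uint32_t b, uint32_t n)
{
	return (uint32_t)(((uint64_t)a * b) % n);
}

/* Returns 1 when w proves n composite; n must be odd and greater than 2. */
static int
small_witness(uint32_t w, uint32_t n)
{
	uint32_t n1 = n - 1, odd = n1, x;
	int k = 0;

	while ((odd & 1) == 0) {	/* n - 1 = odd * 2^k */
		odd >>= 1;
		k++;
	}
	/* x = w^odd mod n by square-and-multiply */
	for (x = 1, w %= n; odd != 0; odd >>= 1) {
		if (odd & 1)
			x = mulmod(x, w, n);
		w = mulmod(w, w, n);
	}
	if (x == 1 || x == n1)
		return 0;	/* probably prime for this witness */
	while (--k) {
		x = mulmod(x, x, n);
		if (x == n1)
			return 0;
	}
	return 1;	/* composite */
}

int
main(void)
{
	/* 561 is a Carmichael number; 1000003 is prime: prints "1 0" */
	printf("%d %d\n", small_witness(2, 561), small_witness(2, 1000003));
	return 0;
}
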
375
376static int
377probable_prime(BIGNUM *rnd, int bits)
378{
379 int i;
380 prime_t mods[NUMPRIMES];
381 BN_ULONG delta, maxdelta;
382
383again:
384 if (!BN_rand(rnd, bits, 1, 1))
385 return (0);
386	/* we now have a random number 'rnd' to test. */
387 for (i = 1; i < NUMPRIMES; i++)
388 mods[i] = (prime_t)BN_mod_word(rnd, (BN_ULONG)primes[i]);
389 maxdelta = BN_MASK2 - primes[NUMPRIMES - 1];
390 delta = 0;
391loop:
392 for (i = 1; i < NUMPRIMES; i++) {
393		/* check that rnd + delta is not divisible by any small prime,
394		 * and that gcd(rnd + delta - 1, primes) == 1 (except for 2) */
395 if (((mods[i] + delta) % primes[i]) <= 1) {
396 delta += 2;
397 if (delta > maxdelta)
398 goto again;
399 goto loop;
400 }
401 }
402 if (!BN_add_word(rnd, delta))
403 return (0);
404 bn_check_top(rnd);
405 return (1);
406}
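
The point of probable_prime() is that the expensive BN_mod_word() calls
happen once per random candidate; stepping by delta += 2 afterwards only
touches the cached residues. A sketch of the same sieve on a uint64_t
candidate (sieve_candidate and small_primes are hypothetical, the prime
list is truncated for illustration, and the maxdelta re-randomisation
guard of the original is omitted):

#include <stdint.h>
#include <stdio.h>

static const uint32_t small_primes[] = { 3, 5, 7, 11, 13, 17, 19, 23 };
#define NSMALL	(sizeof(small_primes) / sizeof(small_primes[0]))

static uint64_t
sieve_candidate(uint64_t rnd)
{
	uint32_t mods[NSMALL];
	uint64_t delta = 0;
	size_t i;

	rnd |= 1;			/* candidates must be odd */
	for (i = 0; i < NSMALL; i++)	/* long division once per prime */
		mods[i] = (uint32_t)(rnd % small_primes[i]);
again:
	for (i = 0; i < NSMALL; i++) {
		/* reject when rnd + delta is divisible by a small prime
		 * (== 0) or rnd + delta - 1 is (== 1) */
		if ((mods[i] + delta) % small_primes[i] <= 1) {
			delta += 2;
			goto again;
		}
	}
	return rnd + delta;
}

int
main(void)
{
	printf("%llu\n", (unsigned long long)sieve_candidate(1000000));
	return 0;
}
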
407
408static int
409probable_prime_dh(BIGNUM *rnd, int bits, const BIGNUM *add, const BIGNUM *rem,
410 BN_CTX *ctx)
411{
412 int i, ret = 0;
413 BIGNUM *t1;
414
415 BN_CTX_start(ctx);
416 if ((t1 = BN_CTX_get(ctx)) == NULL)
417 goto err;
418
419 if (!BN_rand(rnd, bits, 0, 1))
420 goto err;
421
422 /* we need ((rnd-rem) % add) == 0 */
423
424 if (!BN_mod(t1, rnd, add, ctx))
425 goto err;
426 if (!BN_sub(rnd, rnd, t1))
427 goto err;
428 if (rem == NULL) {
429 if (!BN_add_word(rnd, 1))
430 goto err;
431 } else {
432 if (!BN_add(rnd, rnd, rem))
433 goto err;
434 }
435
436	/* we now have a random number 'rnd' to test. */
437
438loop:
439 for (i = 1; i < NUMPRIMES; i++) {
440		/* check that rnd is not divisible by any small prime
441		 * and that rnd mod p != 1 */
441 if (BN_mod_word(rnd, (BN_ULONG)primes[i]) <= 1) {
442 if (!BN_add(rnd, rnd, add))
443 goto err;
444 goto loop;
445 }
446 }
447 ret = 1;
448
449err:
450 BN_CTX_end(ctx);
451 bn_check_top(rnd);
452 return (ret);
453}
454
455static int
456probable_prime_dh_safe(BIGNUM *p, int bits, const BIGNUM *padd,
457 const BIGNUM *rem, BN_CTX *ctx)
458{
459 int i, ret = 0;
460 BIGNUM *t1, *qadd, *q;
461
462 bits--;
463 BN_CTX_start(ctx);
464 if ((t1 = BN_CTX_get(ctx)) == NULL)
465 goto err;
466 if ((q = BN_CTX_get(ctx)) == NULL)
467 goto err;
468 if ((qadd = BN_CTX_get(ctx)) == NULL)
469 goto err;
470
471 if (!BN_rshift1(qadd, padd))
472 goto err;
473
474 if (!BN_rand(q, bits, 0, 1))
475 goto err;
476
477	/* we need ((q - rem/2) % qadd) == 0 */
478	if (!BN_mod(t1, q, qadd, ctx))
479 goto err;
480 if (!BN_sub(q, q, t1))
481 goto err;
482 if (rem == NULL) {
483 if (!BN_add_word(q, 1))
484 goto err;
485 } else {
486 if (!BN_rshift1(t1, rem))
487 goto err;
488 if (!BN_add(q, q, t1))
489 goto err;
490 }
491
492	/* we now have a random number 'q'; form p = 2*q + 1 to test */
493 if (!BN_lshift1(p, q))
494 goto err;
495 if (!BN_add_word(p, 1))
496 goto err;
497
498loop:
499 for (i = 1; i < NUMPRIMES; i++) {
500		/* check that neither p nor q is divisible by a small
501		 * prime; since p = 2q + 1, this also ensures
502		 * gcd(p-1, primes) == 1 (except for 2) */
503 if ((BN_mod_word(p, (BN_ULONG)primes[i]) == 0) ||
504 (BN_mod_word(q, (BN_ULONG)primes[i]) == 0)) {
505 if (!BN_add(p, p, padd))
506 goto err;
507 if (!BN_add(q, q, qadd))
508 goto err;
509 goto loop;
510 }
511 }
512 ret = 1;
513
514err:
515 BN_CTX_end(ctx);
516 bn_check_top(p);
517 return (ret);
518}
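
probable_prime_dh_safe() keeps the relation p = 2q + 1 invariant while
stepping both candidates, because padd = 2 * qadd. A minimal machine-integer
sketch of that lock-step walk (all names hypothetical; only small factors
are sieved, as in the original):

#include <stdint.h>
#include <stdio.h>

static const uint32_t small_primes[] = { 3, 5, 7, 11, 13, 17, 19, 23 };
#define NSMALL	(sizeof(small_primes) / sizeof(small_primes[0]))

int
main(void)
{
	uint64_t q = 1000001, qadd = 6, p, padd = 2 * qadd;
	size_t i;

	q |= 1;				/* q must be odd */
	p = 2 * q + 1;			/* invariant maintained below */
again:
	for (i = 0; i < NSMALL; i++) {
		if (p % small_primes[i] == 0 || q % small_primes[i] == 0) {
			p += padd;	/* step both in lock-step */
			q += qadd;
			goto again;
		}
	}
	printf("q = %llu, p = %llu\n",
	    (unsigned long long)q, (unsigned long long)p);
	return 0;
}
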
diff --git a/src/lib/libcrypto/bn/bn_prime.h b/src/lib/libcrypto/bn/bn_prime.h
deleted file mode 100644
index 3102d8eb41..0000000000
--- a/src/lib/libcrypto/bn/bn_prime.h
+++ /dev/null
@@ -1,319 +0,0 @@
1/* $OpenBSD: bn_prime.h,v 1.6 2014/06/12 15:49:28 deraadt Exp $ */
2/* Auto generated by bn_prime.pl */
3/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
4 * All rights reserved.
5 *
6 * This package is an SSL implementation written
7 * by Eric Young (eay@cryptsoft.com).
8 * The implementation was written so as to conform with Netscapes SSL.
9 *
10 * This library is free for commercial and non-commercial use as long as
11 * the following conditions are aheared to. The following conditions
12 * apply to all code found in this distribution, be it the RC4, RSA,
13 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
14 * included with this distribution is covered by the same copyright terms
15 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
16 *
17 * Copyright remains Eric Young's, and as such any Copyright notices in
18 * the code are not to be removed.
19 * If this package is used in a product, Eric Young should be given attribution
20 * as the author of the parts of the library used.
21 * This can be in the form of a textual message at program startup or
22 * in documentation (online or textual) provided with the package.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 * 1. Redistributions of source code must retain the copyright
28 * notice, this list of conditions and the following disclaimer.
29 * 2. Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in the
31 * documentation and/or other materials provided with the distribution.
32 * 3. All advertising materials mentioning features or use of this software
33 * must display the following acknowledgement:
34 * "This product includes cryptographic software written by
35 * Eric Young (eay@cryptsoft.com)"
36 * The word 'cryptographic' can be left out if the rouines from the library
37 * being used are not cryptographic related :-).
38 * 4. If you include any Windows specific code (or a derivative thereof) from
39 * the apps directory (application code) you must include an acknowledgement:
40 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
41 *
42 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 *
54 * The licence and distribution terms for any publically available version or
55 * derivative of this code cannot be changed. i.e. this code cannot simply be
56 * copied and put under another distribution licence
57 * [including the GNU Public Licence.]
58 */
59
60#define NUMPRIMES 2048
61typedef unsigned short prime_t;
62static const prime_t primes[NUMPRIMES] = {
63 2, 3, 5, 7, 11, 13, 17, 19,
64 23, 29, 31, 37, 41, 43, 47, 53,
65 59, 61, 67, 71, 73, 79, 83, 89,
66 97, 101, 103, 107, 109, 113, 127, 131,
67 137, 139, 149, 151, 157, 163, 167, 173,
68 179, 181, 191, 193, 197, 199, 211, 223,
69 227, 229, 233, 239, 241, 251, 257, 263,
70 269, 271, 277, 281, 283, 293, 307, 311,
71 313, 317, 331, 337, 347, 349, 353, 359,
72 367, 373, 379, 383, 389, 397, 401, 409,
73 419, 421, 431, 433, 439, 443, 449, 457,
74 461, 463, 467, 479, 487, 491, 499, 503,
75 509, 521, 523, 541, 547, 557, 563, 569,
76 571, 577, 587, 593, 599, 601, 607, 613,
77 617, 619, 631, 641, 643, 647, 653, 659,
78 661, 673, 677, 683, 691, 701, 709, 719,
79 727, 733, 739, 743, 751, 757, 761, 769,
80 773, 787, 797, 809, 811, 821, 823, 827,
81 829, 839, 853, 857, 859, 863, 877, 881,
82 883, 887, 907, 911, 919, 929, 937, 941,
83 947, 953, 967, 971, 977, 983, 991, 997,
84 1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049,
85 1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097,
86 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163,
87 1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223,
88 1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283,
89 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
90 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423,
91 1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459,
92 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511,
93 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571,
94 1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619,
95 1621, 1627, 1637, 1657, 1663, 1667, 1669, 1693,
96 1697, 1699, 1709, 1721, 1723, 1733, 1741, 1747,
97 1753, 1759, 1777, 1783, 1787, 1789, 1801, 1811,
98 1823, 1831, 1847, 1861, 1867, 1871, 1873, 1877,
99 1879, 1889, 1901, 1907, 1913, 1931, 1933, 1949,
100 1951, 1973, 1979, 1987, 1993, 1997, 1999, 2003,
101 2011, 2017, 2027, 2029, 2039, 2053, 2063, 2069,
102 2081, 2083, 2087, 2089, 2099, 2111, 2113, 2129,
103 2131, 2137, 2141, 2143, 2153, 2161, 2179, 2203,
104 2207, 2213, 2221, 2237, 2239, 2243, 2251, 2267,
105 2269, 2273, 2281, 2287, 2293, 2297, 2309, 2311,
106 2333, 2339, 2341, 2347, 2351, 2357, 2371, 2377,
107 2381, 2383, 2389, 2393, 2399, 2411, 2417, 2423,
108 2437, 2441, 2447, 2459, 2467, 2473, 2477, 2503,
109 2521, 2531, 2539, 2543, 2549, 2551, 2557, 2579,
110 2591, 2593, 2609, 2617, 2621, 2633, 2647, 2657,
111 2659, 2663, 2671, 2677, 2683, 2687, 2689, 2693,
112 2699, 2707, 2711, 2713, 2719, 2729, 2731, 2741,
113 2749, 2753, 2767, 2777, 2789, 2791, 2797, 2801,
114 2803, 2819, 2833, 2837, 2843, 2851, 2857, 2861,
115 2879, 2887, 2897, 2903, 2909, 2917, 2927, 2939,
116 2953, 2957, 2963, 2969, 2971, 2999, 3001, 3011,
117 3019, 3023, 3037, 3041, 3049, 3061, 3067, 3079,
118 3083, 3089, 3109, 3119, 3121, 3137, 3163, 3167,
119 3169, 3181, 3187, 3191, 3203, 3209, 3217, 3221,
120 3229, 3251, 3253, 3257, 3259, 3271, 3299, 3301,
121 3307, 3313, 3319, 3323, 3329, 3331, 3343, 3347,
122 3359, 3361, 3371, 3373, 3389, 3391, 3407, 3413,
123 3433, 3449, 3457, 3461, 3463, 3467, 3469, 3491,
124 3499, 3511, 3517, 3527, 3529, 3533, 3539, 3541,
125 3547, 3557, 3559, 3571, 3581, 3583, 3593, 3607,
126 3613, 3617, 3623, 3631, 3637, 3643, 3659, 3671,
127 3673, 3677, 3691, 3697, 3701, 3709, 3719, 3727,
128 3733, 3739, 3761, 3767, 3769, 3779, 3793, 3797,
129 3803, 3821, 3823, 3833, 3847, 3851, 3853, 3863,
130 3877, 3881, 3889, 3907, 3911, 3917, 3919, 3923,
131 3929, 3931, 3943, 3947, 3967, 3989, 4001, 4003,
132 4007, 4013, 4019, 4021, 4027, 4049, 4051, 4057,
133 4073, 4079, 4091, 4093, 4099, 4111, 4127, 4129,
134 4133, 4139, 4153, 4157, 4159, 4177, 4201, 4211,
135 4217, 4219, 4229, 4231, 4241, 4243, 4253, 4259,
136 4261, 4271, 4273, 4283, 4289, 4297, 4327, 4337,
137 4339, 4349, 4357, 4363, 4373, 4391, 4397, 4409,
138 4421, 4423, 4441, 4447, 4451, 4457, 4463, 4481,
139 4483, 4493, 4507, 4513, 4517, 4519, 4523, 4547,
140 4549, 4561, 4567, 4583, 4591, 4597, 4603, 4621,
141 4637, 4639, 4643, 4649, 4651, 4657, 4663, 4673,
142 4679, 4691, 4703, 4721, 4723, 4729, 4733, 4751,
143 4759, 4783, 4787, 4789, 4793, 4799, 4801, 4813,
144 4817, 4831, 4861, 4871, 4877, 4889, 4903, 4909,
145 4919, 4931, 4933, 4937, 4943, 4951, 4957, 4967,
146 4969, 4973, 4987, 4993, 4999, 5003, 5009, 5011,
147 5021, 5023, 5039, 5051, 5059, 5077, 5081, 5087,
148 5099, 5101, 5107, 5113, 5119, 5147, 5153, 5167,
149 5171, 5179, 5189, 5197, 5209, 5227, 5231, 5233,
150 5237, 5261, 5273, 5279, 5281, 5297, 5303, 5309,
151 5323, 5333, 5347, 5351, 5381, 5387, 5393, 5399,
152 5407, 5413, 5417, 5419, 5431, 5437, 5441, 5443,
153 5449, 5471, 5477, 5479, 5483, 5501, 5503, 5507,
154 5519, 5521, 5527, 5531, 5557, 5563, 5569, 5573,
155 5581, 5591, 5623, 5639, 5641, 5647, 5651, 5653,
156 5657, 5659, 5669, 5683, 5689, 5693, 5701, 5711,
157 5717, 5737, 5741, 5743, 5749, 5779, 5783, 5791,
158 5801, 5807, 5813, 5821, 5827, 5839, 5843, 5849,
159 5851, 5857, 5861, 5867, 5869, 5879, 5881, 5897,
160 5903, 5923, 5927, 5939, 5953, 5981, 5987, 6007,
161 6011, 6029, 6037, 6043, 6047, 6053, 6067, 6073,
162 6079, 6089, 6091, 6101, 6113, 6121, 6131, 6133,
163 6143, 6151, 6163, 6173, 6197, 6199, 6203, 6211,
164 6217, 6221, 6229, 6247, 6257, 6263, 6269, 6271,
165 6277, 6287, 6299, 6301, 6311, 6317, 6323, 6329,
166 6337, 6343, 6353, 6359, 6361, 6367, 6373, 6379,
167 6389, 6397, 6421, 6427, 6449, 6451, 6469, 6473,
168 6481, 6491, 6521, 6529, 6547, 6551, 6553, 6563,
169 6569, 6571, 6577, 6581, 6599, 6607, 6619, 6637,
170 6653, 6659, 6661, 6673, 6679, 6689, 6691, 6701,
171 6703, 6709, 6719, 6733, 6737, 6761, 6763, 6779,
172 6781, 6791, 6793, 6803, 6823, 6827, 6829, 6833,
173 6841, 6857, 6863, 6869, 6871, 6883, 6899, 6907,
174 6911, 6917, 6947, 6949, 6959, 6961, 6967, 6971,
175 6977, 6983, 6991, 6997, 7001, 7013, 7019, 7027,
176 7039, 7043, 7057, 7069, 7079, 7103, 7109, 7121,
177 7127, 7129, 7151, 7159, 7177, 7187, 7193, 7207,
178 7211, 7213, 7219, 7229, 7237, 7243, 7247, 7253,
179 7283, 7297, 7307, 7309, 7321, 7331, 7333, 7349,
180 7351, 7369, 7393, 7411, 7417, 7433, 7451, 7457,
181 7459, 7477, 7481, 7487, 7489, 7499, 7507, 7517,
182 7523, 7529, 7537, 7541, 7547, 7549, 7559, 7561,
183 7573, 7577, 7583, 7589, 7591, 7603, 7607, 7621,
184 7639, 7643, 7649, 7669, 7673, 7681, 7687, 7691,
185 7699, 7703, 7717, 7723, 7727, 7741, 7753, 7757,
186 7759, 7789, 7793, 7817, 7823, 7829, 7841, 7853,
187 7867, 7873, 7877, 7879, 7883, 7901, 7907, 7919,
188 7927, 7933, 7937, 7949, 7951, 7963, 7993, 8009,
189 8011, 8017, 8039, 8053, 8059, 8069, 8081, 8087,
190 8089, 8093, 8101, 8111, 8117, 8123, 8147, 8161,
191 8167, 8171, 8179, 8191, 8209, 8219, 8221, 8231,
192 8233, 8237, 8243, 8263, 8269, 8273, 8287, 8291,
193 8293, 8297, 8311, 8317, 8329, 8353, 8363, 8369,
194 8377, 8387, 8389, 8419, 8423, 8429, 8431, 8443,
195 8447, 8461, 8467, 8501, 8513, 8521, 8527, 8537,
196 8539, 8543, 8563, 8573, 8581, 8597, 8599, 8609,
197 8623, 8627, 8629, 8641, 8647, 8663, 8669, 8677,
198 8681, 8689, 8693, 8699, 8707, 8713, 8719, 8731,
199 8737, 8741, 8747, 8753, 8761, 8779, 8783, 8803,
200 8807, 8819, 8821, 8831, 8837, 8839, 8849, 8861,
201 8863, 8867, 8887, 8893, 8923, 8929, 8933, 8941,
202 8951, 8963, 8969, 8971, 8999, 9001, 9007, 9011,
203 9013, 9029, 9041, 9043, 9049, 9059, 9067, 9091,
204 9103, 9109, 9127, 9133, 9137, 9151, 9157, 9161,
205 9173, 9181, 9187, 9199, 9203, 9209, 9221, 9227,
206 9239, 9241, 9257, 9277, 9281, 9283, 9293, 9311,
207 9319, 9323, 9337, 9341, 9343, 9349, 9371, 9377,
208 9391, 9397, 9403, 9413, 9419, 9421, 9431, 9433,
209 9437, 9439, 9461, 9463, 9467, 9473, 9479, 9491,
210 9497, 9511, 9521, 9533, 9539, 9547, 9551, 9587,
211 9601, 9613, 9619, 9623, 9629, 9631, 9643, 9649,
212 9661, 9677, 9679, 9689, 9697, 9719, 9721, 9733,
213 9739, 9743, 9749, 9767, 9769, 9781, 9787, 9791,
214 9803, 9811, 9817, 9829, 9833, 9839, 9851, 9857,
215 9859, 9871, 9883, 9887, 9901, 9907, 9923, 9929,
216 9931, 9941, 9949, 9967, 9973, 10007, 10009, 10037,
217 10039, 10061, 10067, 10069, 10079, 10091, 10093, 10099,
218 10103, 10111, 10133, 10139, 10141, 10151, 10159, 10163,
219 10169, 10177, 10181, 10193, 10211, 10223, 10243, 10247,
220 10253, 10259, 10267, 10271, 10273, 10289, 10301, 10303,
221 10313, 10321, 10331, 10333, 10337, 10343, 10357, 10369,
222 10391, 10399, 10427, 10429, 10433, 10453, 10457, 10459,
223 10463, 10477, 10487, 10499, 10501, 10513, 10529, 10531,
224 10559, 10567, 10589, 10597, 10601, 10607, 10613, 10627,
225 10631, 10639, 10651, 10657, 10663, 10667, 10687, 10691,
226 10709, 10711, 10723, 10729, 10733, 10739, 10753, 10771,
227 10781, 10789, 10799, 10831, 10837, 10847, 10853, 10859,
228 10861, 10867, 10883, 10889, 10891, 10903, 10909, 10937,
229 10939, 10949, 10957, 10973, 10979, 10987, 10993, 11003,
230 11027, 11047, 11057, 11059, 11069, 11071, 11083, 11087,
231 11093, 11113, 11117, 11119, 11131, 11149, 11159, 11161,
232 11171, 11173, 11177, 11197, 11213, 11239, 11243, 11251,
233 11257, 11261, 11273, 11279, 11287, 11299, 11311, 11317,
234 11321, 11329, 11351, 11353, 11369, 11383, 11393, 11399,
235 11411, 11423, 11437, 11443, 11447, 11467, 11471, 11483,
236 11489, 11491, 11497, 11503, 11519, 11527, 11549, 11551,
237 11579, 11587, 11593, 11597, 11617, 11621, 11633, 11657,
238 11677, 11681, 11689, 11699, 11701, 11717, 11719, 11731,
239 11743, 11777, 11779, 11783, 11789, 11801, 11807, 11813,
240 11821, 11827, 11831, 11833, 11839, 11863, 11867, 11887,
241 11897, 11903, 11909, 11923, 11927, 11933, 11939, 11941,
242 11953, 11959, 11969, 11971, 11981, 11987, 12007, 12011,
243 12037, 12041, 12043, 12049, 12071, 12073, 12097, 12101,
244 12107, 12109, 12113, 12119, 12143, 12149, 12157, 12161,
245 12163, 12197, 12203, 12211, 12227, 12239, 12241, 12251,
246 12253, 12263, 12269, 12277, 12281, 12289, 12301, 12323,
247 12329, 12343, 12347, 12373, 12377, 12379, 12391, 12401,
248 12409, 12413, 12421, 12433, 12437, 12451, 12457, 12473,
249 12479, 12487, 12491, 12497, 12503, 12511, 12517, 12527,
250 12539, 12541, 12547, 12553, 12569, 12577, 12583, 12589,
251 12601, 12611, 12613, 12619, 12637, 12641, 12647, 12653,
252 12659, 12671, 12689, 12697, 12703, 12713, 12721, 12739,
253 12743, 12757, 12763, 12781, 12791, 12799, 12809, 12821,
254 12823, 12829, 12841, 12853, 12889, 12893, 12899, 12907,
255 12911, 12917, 12919, 12923, 12941, 12953, 12959, 12967,
256 12973, 12979, 12983, 13001, 13003, 13007, 13009, 13033,
257 13037, 13043, 13049, 13063, 13093, 13099, 13103, 13109,
258 13121, 13127, 13147, 13151, 13159, 13163, 13171, 13177,
259 13183, 13187, 13217, 13219, 13229, 13241, 13249, 13259,
260 13267, 13291, 13297, 13309, 13313, 13327, 13331, 13337,
261 13339, 13367, 13381, 13397, 13399, 13411, 13417, 13421,
262 13441, 13451, 13457, 13463, 13469, 13477, 13487, 13499,
263 13513, 13523, 13537, 13553, 13567, 13577, 13591, 13597,
264 13613, 13619, 13627, 13633, 13649, 13669, 13679, 13681,
265 13687, 13691, 13693, 13697, 13709, 13711, 13721, 13723,
266 13729, 13751, 13757, 13759, 13763, 13781, 13789, 13799,
267 13807, 13829, 13831, 13841, 13859, 13873, 13877, 13879,
268 13883, 13901, 13903, 13907, 13913, 13921, 13931, 13933,
269 13963, 13967, 13997, 13999, 14009, 14011, 14029, 14033,
270 14051, 14057, 14071, 14081, 14083, 14087, 14107, 14143,
271 14149, 14153, 14159, 14173, 14177, 14197, 14207, 14221,
272 14243, 14249, 14251, 14281, 14293, 14303, 14321, 14323,
273 14327, 14341, 14347, 14369, 14387, 14389, 14401, 14407,
274 14411, 14419, 14423, 14431, 14437, 14447, 14449, 14461,
275 14479, 14489, 14503, 14519, 14533, 14537, 14543, 14549,
276 14551, 14557, 14561, 14563, 14591, 14593, 14621, 14627,
277 14629, 14633, 14639, 14653, 14657, 14669, 14683, 14699,
278 14713, 14717, 14723, 14731, 14737, 14741, 14747, 14753,
279 14759, 14767, 14771, 14779, 14783, 14797, 14813, 14821,
280 14827, 14831, 14843, 14851, 14867, 14869, 14879, 14887,
281 14891, 14897, 14923, 14929, 14939, 14947, 14951, 14957,
282 14969, 14983, 15013, 15017, 15031, 15053, 15061, 15073,
283 15077, 15083, 15091, 15101, 15107, 15121, 15131, 15137,
284 15139, 15149, 15161, 15173, 15187, 15193, 15199, 15217,
285 15227, 15233, 15241, 15259, 15263, 15269, 15271, 15277,
286 15287, 15289, 15299, 15307, 15313, 15319, 15329, 15331,
287 15349, 15359, 15361, 15373, 15377, 15383, 15391, 15401,
288 15413, 15427, 15439, 15443, 15451, 15461, 15467, 15473,
289 15493, 15497, 15511, 15527, 15541, 15551, 15559, 15569,
290 15581, 15583, 15601, 15607, 15619, 15629, 15641, 15643,
291 15647, 15649, 15661, 15667, 15671, 15679, 15683, 15727,
292 15731, 15733, 15737, 15739, 15749, 15761, 15767, 15773,
293 15787, 15791, 15797, 15803, 15809, 15817, 15823, 15859,
294 15877, 15881, 15887, 15889, 15901, 15907, 15913, 15919,
295 15923, 15937, 15959, 15971, 15973, 15991, 16001, 16007,
296 16033, 16057, 16061, 16063, 16067, 16069, 16073, 16087,
297 16091, 16097, 16103, 16111, 16127, 16139, 16141, 16183,
298 16187, 16189, 16193, 16217, 16223, 16229, 16231, 16249,
299 16253, 16267, 16273, 16301, 16319, 16333, 16339, 16349,
300 16361, 16363, 16369, 16381, 16411, 16417, 16421, 16427,
301 16433, 16447, 16451, 16453, 16477, 16481, 16487, 16493,
302 16519, 16529, 16547, 16553, 16561, 16567, 16573, 16603,
303 16607, 16619, 16631, 16633, 16649, 16651, 16657, 16661,
304 16673, 16691, 16693, 16699, 16703, 16729, 16741, 16747,
305 16759, 16763, 16787, 16811, 16823, 16829, 16831, 16843,
306 16871, 16879, 16883, 16889, 16901, 16903, 16921, 16927,
307 16931, 16937, 16943, 16963, 16979, 16981, 16987, 16993,
308 17011, 17021, 17027, 17029, 17033, 17041, 17047, 17053,
309 17077, 17093, 17099, 17107, 17117, 17123, 17137, 17159,
310 17167, 17183, 17189, 17191, 17203, 17207, 17209, 17231,
311 17239, 17257, 17291, 17293, 17299, 17317, 17321, 17327,
312 17333, 17341, 17351, 17359, 17377, 17383, 17387, 17389,
313 17393, 17401, 17417, 17419, 17431, 17443, 17449, 17467,
314 17471, 17477, 17483, 17489, 17491, 17497, 17509, 17519,
315 17539, 17551, 17569, 17573, 17579, 17581, 17597, 17599,
316 17609, 17623, 17627, 17657, 17659, 17669, 17681, 17683,
317 17707, 17713, 17729, 17737, 17747, 17749, 17761, 17783,
318 17789, 17791, 17807, 17827, 17837, 17839, 17851, 17863,
319};
diff --git a/src/lib/libcrypto/bn/bn_prime.pl b/src/lib/libcrypto/bn/bn_prime.pl
deleted file mode 100644
index eb73f0bfa6..0000000000
--- a/src/lib/libcrypto/bn/bn_prime.pl
+++ /dev/null
@@ -1,103 +0,0 @@
1#!/usr/local/bin/perl
2# bn_prime.pl
3
4$num=2048;
5$num=$ARGV[0] if ($#ARGV >= 0);
6
7push(@primes,2);
8$p=1;
9loop: while ($#primes < $num-1)
10 {
11 $p+=2;
12 $s=int(sqrt($p));
13
14 for ($i=0; defined($primes[$i]) && $primes[$i]<=$s; $i++)
15 {
16 next loop if (($p%$primes[$i]) == 0);
17 }
18 push(@primes,$p);
19 }
20
21# print <<"EOF";
22# /* Auto generated by bn_prime.pl */
23# /* Copyright (C) 1995-1997 Eric Young (eay\@mincom.oz.au).
24# * All rights reserved.
25# * Copyright remains Eric Young's, and as such any Copyright notices in
26# * the code are not to be removed.
27# * See the COPYRIGHT file in the SSLeay distribution for more details.
28# */
29#
30# EOF
31
32print <<\EOF;
33/* Auto generated by bn_prime.pl */
34/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
35 * All rights reserved.
36 *
37 * This package is an SSL implementation written
38 * by Eric Young (eay@cryptsoft.com).
39 * The implementation was written so as to conform with Netscapes SSL.
40 *
41 * This library is free for commercial and non-commercial use as long as
42 * the following conditions are aheared to. The following conditions
43 * apply to all code found in this distribution, be it the RC4, RSA,
44 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
45 * included with this distribution is covered by the same copyright terms
46 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
47 *
48 * Copyright remains Eric Young's, and as such any Copyright notices in
49 * the code are not to be removed.
50 * If this package is used in a product, Eric Young should be given attribution
51 * as the author of the parts of the library used.
52 * This can be in the form of a textual message at program startup or
53 * in documentation (online or textual) provided with the package.
54 *
55 * Redistribution and use in source and binary forms, with or without
56 * modification, are permitted provided that the following conditions
57 * are met:
58 * 1. Redistributions of source code must retain the copyright
59 * notice, this list of conditions and the following disclaimer.
60 * 2. Redistributions in binary form must reproduce the above copyright
61 * notice, this list of conditions and the following disclaimer in the
62 * documentation and/or other materials provided with the distribution.
63 * 3. All advertising materials mentioning features or use of this software
64 * must display the following acknowledgement:
65 * "This product includes cryptographic software written by
66 * Eric Young (eay@cryptsoft.com)"
67 * The word 'cryptographic' can be left out if the rouines from the library
68 * being used are not cryptographic related :-).
69 * 4. If you include any Windows specific code (or a derivative thereof) from
70 * the apps directory (application code) you must include an acknowledgement:
71 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
72 *
73 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
74 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
75 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
76 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
77 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
78 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
79 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
80 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
81 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
82 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
83 * SUCH DAMAGE.
84 *
85 * The licence and distribution terms for any publically available version or
86 * derivative of this code cannot be changed. i.e. this code cannot simply be
87 * copied and put under another distribution licence
88 * [including the GNU Public Licence.]
89 */
90
91EOF
92
93printf "#define NUMPRIMES %d\n",$num;
94printf "typedef unsigned short prime_t;\n";
95print "static const prime_t primes[NUMPRIMES]=\n{\n\t";
96for ($i=0; $i <= $#primes; $i++)
97 {
98 printf("\n\t") if (($i%8) == 0) && ($i != 0);
99 printf("%4d,",$primes[$i]);
100 }
101print "\n};\n";
102
103
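
bn_prime.pl's generator is plain trial division: a candidate is prime when
no previously found prime up to its square root divides it. An equivalent
sketch in C, under the assumption that 100 primes suffice for illustration:

#include <stdio.h>

#define NPRIMES	100

int
main(void)
{
	unsigned int primes[NPRIMES] = { 2 };
	unsigned int n = 1, p = 1, i;

	while (n < NPRIMES) {
		p += 2;			/* odd candidates only */
		for (i = 0; i < n && primes[i] * primes[i] <= p; i++)
			if (p % primes[i] == 0)
				break;	/* composite */
		if (i == n || primes[i] * primes[i] > p)
			primes[n++] = p;
	}
	/* print eight to a row, like the generated header */
	for (i = 0; i < n; i++)
		printf("%4u,%s", primes[i], (i % 8 == 7) ? "\n" : " ");
	printf("\n");
	return 0;
}
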
diff --git a/src/lib/libcrypto/bn/bn_print.c b/src/lib/libcrypto/bn/bn_print.c
deleted file mode 100644
index 4920705a5b..0000000000
--- a/src/lib/libcrypto/bn/bn_print.c
+++ /dev/null
@@ -1,393 +0,0 @@
1/* $OpenBSD: bn_print.c,v 1.23 2014/07/12 16:03:36 miod Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <ctype.h>
60#include <stdio.h>
61
62#include <openssl/opensslconf.h>
63
64#include <openssl/bio.h>
65#include <openssl/buffer.h>
66#include <openssl/err.h>
67
68#include "bn_lcl.h"
69
70static const char Hex[] = "0123456789ABCDEF";
71
72/* Must 'free' the returned data */
73char *
74BN_bn2hex(const BIGNUM *a)
75{
76 int i, j, v, z = 0;
77 char *buf;
78 char *p;
79
80 buf = malloc(a->top * BN_BYTES * 2 + 2);
81 if (buf == NULL) {
82 BNerr(BN_F_BN_BN2HEX, ERR_R_MALLOC_FAILURE);
83 goto err;
84 }
85 p = buf;
86 if (a->neg)
87 *(p++) = '-';
88 if (BN_is_zero(a))
89 *(p++) = '0';
90 for (i = a->top - 1; i >=0; i--) {
91 for (j = BN_BITS2 - 8; j >= 0; j -= 8) {
92 /* strip leading zeros */
93 v = ((int)(a->d[i] >> (long)j)) & 0xff;
94 if (z || (v != 0)) {
95 *(p++) = Hex[v >> 4];
96 *(p++) = Hex[v & 0x0f];
97 z = 1;
98 }
99 }
100 }
101 *p = '\0';
102
103err:
104 return (buf);
105}
106
107/* Must 'free' the returned data */
108char *
109BN_bn2dec(const BIGNUM *a)
110{
111 int i = 0, num, ok = 0;
112 char *buf = NULL;
113 char *p;
114 BIGNUM *t = NULL;
115 BN_ULONG *bn_data = NULL, *lp;
116
117	/* get an upper bound for the length of the decimal integer:
118	 * num <= (BN_num_bits(a) + 1) * log10(2)
119	 *     <= 3 * BN_num_bits(a) * 0.1001 + log10(2) + 1  (rounding error)
120	 *     <= BN_num_bits(a)/10 + BN_num_bits(a)/1000 + 1 + 1
121	 */
122 i = BN_num_bits(a) * 3;
123 num = (i / 10 + i / 1000 + 1) + 1;
124 bn_data = reallocarray(NULL, num / BN_DEC_NUM + 1, sizeof(BN_ULONG));
125 buf = malloc(num + 3);
126 if ((buf == NULL) || (bn_data == NULL)) {
127 BNerr(BN_F_BN_BN2DEC, ERR_R_MALLOC_FAILURE);
128 goto err;
129 }
130 if ((t = BN_dup(a)) == NULL)
131 goto err;
132
133#define BUF_REMAIN (num + 3 - (size_t)(p - buf))
134 p = buf;
135 lp = bn_data;
136 if (BN_is_zero(t)) {
137 *(p++) = '0';
138 *(p++) = '\0';
139 } else {
140 if (BN_is_negative(t))
141 *p++ = '-';
142
143 i = 0;
144 while (!BN_is_zero(t)) {
145 *lp = BN_div_word(t, BN_DEC_CONV);
146 lp++;
147 }
148 lp--;
149 /* We now have a series of blocks, BN_DEC_NUM chars
150 * in length, where the last one needs truncation.
151 * The blocks need to be reversed in order. */
152 snprintf(p, BUF_REMAIN, BN_DEC_FMT1, *lp);
153 while (*p)
154 p++;
155 while (lp != bn_data) {
156 lp--;
157 snprintf(p, BUF_REMAIN, BN_DEC_FMT2, *lp);
158 while (*p)
159 p++;
160 }
161 }
162 ok = 1;
163
164err:
165 free(bn_data);
166 BN_free(t);
167 if (!ok && buf) {
168 free(buf);
169 buf = NULL;
170 }
171
172 return (buf);
173}
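
BN_bn2dec() peels the number into base-BN_DEC_CONV blocks by repeated
division and then prints the blocks in reverse, zero-padding every block
except the most significant. A sketch of the same two-format strategy on
a 64-bit value (print_dec and DEC_CONV are hypothetical names):

#include <stdint.h>
#include <stdio.h>

#define DEC_CONV	1000000000u	/* 10^9 fits in 32 bits */

static void
print_dec(uint64_t v)
{
	uint32_t blocks[4];		/* 2^64 - 1 needs three blocks */
	int n = 0;

	do {
		blocks[n++] = (uint32_t)(v % DEC_CONV);
		v /= DEC_CONV;
	} while (v != 0);
	printf("%u", blocks[--n]);	/* leading block: no padding */
	while (n > 0)
		printf("%09u", blocks[--n]);	/* inner blocks: padded */
	printf("\n");
}

int
main(void)
{
	print_dec(18446744073709551615ULL);	/* 2^64 - 1 */
	return 0;
}
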
174
175int
176BN_hex2bn(BIGNUM **bn, const char *a)
177{
178 BIGNUM *ret = NULL;
179 BN_ULONG l = 0;
180	int neg = 0, h, m, i, j, k, c;
181 int num;
182
183 if ((a == NULL) || (*a == '\0'))
184 return (0);
185
186 if (*a == '-') {
187 neg = 1;
188 a++;
189 }
190
191 for (i = 0; isxdigit((unsigned char)a[i]); i++)
192 ;
193
194 num = i + neg;
195 if (bn == NULL)
196 return (num);
197
198 /* a is the start of the hex digits, and it is 'i' long */
199 if (*bn == NULL) {
200 if ((ret = BN_new()) == NULL)
201 return (0);
202 } else {
203		ret = *bn;
204 BN_zero(ret);
205 }
206
207	/* i is the number of hex digits */
208 if (bn_expand(ret, i * 4) == NULL)
209 goto err;
210
211 j = i; /* least significant 'hex' */
212 m = 0;
213 h = 0;
214 while (j > 0) {
215		m = ((BN_BYTES * 2) <= j) ? (BN_BYTES * 2) : j;
216 l = 0;
217 for (;;) {
218 c = a[j - m];
219 if ((c >= '0') && (c <= '9'))
220 k = c - '0';
221 else if ((c >= 'a') && (c <= 'f'))
222 k = c - 'a' + 10;
223 else if ((c >= 'A') && (c <= 'F'))
224 k = c - 'A' + 10;
225 else
226 k = 0; /* paranoia */
227 l = (l << 4) | k;
228
229 if (--m <= 0) {
230 ret->d[h++] = l;
231 break;
232 }
233 }
234 j -= (BN_BYTES * 2);
235 }
236 ret->top = h;
237 bn_correct_top(ret);
238 ret->neg = neg;
239
240 *bn = ret;
241 bn_check_top(ret);
242 return (num);
243
244err:
245 if (*bn == NULL)
246 BN_free(ret);
247 return (0);
248}
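
BN_hex2bn() consumes hex digits from the right, BN_BYTES * 2 at a time, so
each chunk fills exactly one word, least significant word first. A sketch of
that chunking on plain uint64_t words (hex2words and hexval are hypothetical
names):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int
hexval(int c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	if (c >= 'a' && c <= 'f')
		return c - 'a' + 10;
	if (c >= 'A' && c <= 'F')
		return c - 'A' + 10;
	return 0;	/* paranoia, as in the original */
}

static int
hex2words(uint64_t *d, int nwords, const char *s)
{
	int j = (int)strlen(s), h = 0, m;
	uint64_t l;

	while (j > 0 && h < nwords) {
		m = j < 16 ? j : 16;	/* up to 16 hex digits per word */
		for (l = 0; m > 0; m--)
			l = (l << 4) | (uint64_t)hexval(s[j - m]);
		d[h++] = l;		/* least significant word first */
		j -= 16;
	}
	return h;	/* number of words written */
}

int
main(void)
{
	uint64_t d[2];
	int n = hex2words(d, 2, "1fffffffffffffffff");

	printf("top %d: %llx %llx\n", n,
	    (unsigned long long)d[1], (unsigned long long)d[0]);
	return 0;
}
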
249
250int
251BN_dec2bn(BIGNUM **bn, const char *a)
252{
253 BIGNUM *ret = NULL;
254 BN_ULONG l = 0;
255 int neg = 0, i, j;
256 int num;
257
258 if ((a == NULL) || (*a == '\0'))
259 return (0);
260 if (*a == '-') {
261 neg = 1;
262 a++;
263 }
264
265 for (i = 0; isdigit((unsigned char)a[i]); i++)
266 ;
267
268 num = i + neg;
269 if (bn == NULL)
270 return (num);
271
272 /* a is the start of the digits, and it is 'i' long.
273 * We chop it into BN_DEC_NUM digits at a time */
274 if (*bn == NULL) {
275 if ((ret = BN_new()) == NULL)
276 return (0);
277 } else {
278 ret = *bn;
279 BN_zero(ret);
280 }
281
282	/* i is the number of digits; a bit of an over-expansion */
283 if (bn_expand(ret, i * 4) == NULL)
284 goto err;
285
286 j = BN_DEC_NUM - (i % BN_DEC_NUM);
287 if (j == BN_DEC_NUM)
288 j = 0;
289 l = 0;
290 while (*a) {
291 l *= 10;
292 l += *a - '0';
293 a++;
294 if (++j == BN_DEC_NUM) {
295 BN_mul_word(ret, BN_DEC_CONV);
296 BN_add_word(ret, l);
297 l = 0;
298 j = 0;
299 }
300 }
301 ret->neg = neg;
302
303 bn_correct_top(ret);
304 *bn = ret;
305 bn_check_top(ret);
306 return (num);
307
308err:
309 if (*bn == NULL)
310 BN_free(ret);
311 return (0);
312}
313
314int
315BN_asc2bn(BIGNUM **bn, const char *a)
316{
317 const char *p = a;
318 if (*p == '-')
319 p++;
320
321 if (p[0] == '0' && (p[1] == 'X' || p[1] == 'x')) {
322 if (!BN_hex2bn(bn, p + 2))
323 return 0;
324 } else {
325 if (!BN_dec2bn(bn, p))
326 return 0;
327 }
328 if (*a == '-')
329 (*bn)->neg = 1;
330 return 1;
331}
332
333#ifndef OPENSSL_NO_BIO
334int
335BN_print_fp(FILE *fp, const BIGNUM *a)
336{
337 BIO *b;
338 int ret;
339
340 if ((b = BIO_new(BIO_s_file())) == NULL)
341 return (0);
342 BIO_set_fp(b, fp, BIO_NOCLOSE);
343 ret = BN_print(b, a);
344 BIO_free(b);
345 return (ret);
346}
347
348int
349BN_print(BIO *bp, const BIGNUM *a)
350{
351 int i, j, v, z = 0;
352 int ret = 0;
353
354 if ((a->neg) && (BIO_write(bp, "-", 1) != 1))
355 goto end;
356 if (BN_is_zero(a) && (BIO_write(bp, "0", 1) != 1))
357 goto end;
358 for (i = a->top - 1; i >= 0; i--) {
359 for (j = BN_BITS2 - 4; j >= 0; j -= 4) {
360 /* strip leading zeros */
361 v = ((int)(a->d[i] >> (long)j)) & 0x0f;
362 if (z || (v != 0)) {
363 if (BIO_write(bp, &(Hex[v]), 1) != 1)
364 goto end;
365 z = 1;
366 }
367 }
368 }
369 ret = 1;
370
371end:
372 return (ret);
373}
374#endif
375
376char *
377BN_options(void)
378{
379 static int init = 0;
380 static char data[16];
381
382 if (!init) {
383 init++;
384#ifdef BN_LLONG
385		snprintf(data, sizeof data, "bn(%d,%d)",
386 (int)sizeof(BN_ULLONG) * 8, (int)sizeof(BN_ULONG) * 8);
387#else
388		snprintf(data, sizeof data, "bn(%d,%d)",
389 (int)sizeof(BN_ULONG) * 8, (int)sizeof(BN_ULONG) * 8);
390#endif
391 }
392 return (data);
393}
diff --git a/src/lib/libcrypto/bn/bn_rand.c b/src/lib/libcrypto/bn/bn_rand.c
deleted file mode 100644
index ac5c5eb308..0000000000
--- a/src/lib/libcrypto/bn/bn_rand.c
+++ /dev/null
@@ -1,290 +0,0 @@
1/* $OpenBSD: bn_rand.c,v 1.17 2015/02/19 06:10:29 jsing Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publicly available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58/* ====================================================================
59 * Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved.
60 *
61 * Redistribution and use in source and binary forms, with or without
62 * modification, are permitted provided that the following conditions
63 * are met:
64 *
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 *
68 * 2. Redistributions in binary form must reproduce the above copyright
69 * notice, this list of conditions and the following disclaimer in
70 * the documentation and/or other materials provided with the
71 * distribution.
72 *
73 * 3. All advertising materials mentioning features or use of this
74 * software must display the following acknowledgment:
75 * "This product includes software developed by the OpenSSL Project
76 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
77 *
78 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
79 * endorse or promote products derived from this software without
80 * prior written permission. For written permission, please contact
81 * openssl-core@openssl.org.
82 *
83 * 5. Products derived from this software may not be called "OpenSSL"
84 * nor may "OpenSSL" appear in their names without prior written
85 * permission of the OpenSSL Project.
86 *
87 * 6. Redistributions of any form whatsoever must retain the following
88 * acknowledgment:
89 * "This product includes software developed by the OpenSSL Project
90 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
91 *
92 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
93 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
95 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
96 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
97 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
98 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
99 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
101 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
103 * OF THE POSSIBILITY OF SUCH DAMAGE.
104 * ====================================================================
105 *
106 * This product includes cryptographic software written by Eric Young
107 * (eay@cryptsoft.com). This product includes software written by Tim
108 * Hudson (tjh@cryptsoft.com).
109 *
110 */
111
112#include <stdio.h>
113#include <stdlib.h>
114#include <time.h>
115
116#include <openssl/err.h>
117
118#include "bn_lcl.h"
119
120static int
121bnrand(int pseudorand, BIGNUM *rnd, int bits, int top, int bottom)
122{
123 unsigned char *buf = NULL;
124 int ret = 0, bit, bytes, mask;
125
126 if (rnd == NULL) {
127 BNerr(BN_F_BNRAND, ERR_R_PASSED_NULL_PARAMETER);
128 return (0);
129 }
130
131 if (bits == 0) {
132 BN_zero(rnd);
133 return (1);
134 }
135
136 bytes = (bits + 7) / 8;
137 bit = (bits - 1) % 8;
138 mask = 0xff << (bit + 1);
139
140 buf = malloc(bytes);
141 if (buf == NULL) {
142 BNerr(BN_F_BNRAND, ERR_R_MALLOC_FAILURE);
143 goto err;
144 }
145
146 /* make a random number and set the top and bottom bits */
147 arc4random_buf(buf, bytes);
148
149#if 1
150 if (pseudorand == 2) {
151 /* generate patterns that are more likely to trigger BN
152 library bugs */
153 int i;
154 unsigned char c;
155
156 for (i = 0; i < bytes; i++) {
157 arc4random_buf(&c, 1);
158 if (c >= 128 && i > 0)
159 buf[i] = buf[i - 1];
160 else if (c < 42)
161 buf[i] = 0;
162 else if (c < 84)
163 buf[i] = 255;
164 }
165 }
166#endif
167
168 if (top != -1) {
169 if (top) {
170 if (bit == 0) {
171 buf[0] = 1;
172 buf[1] |= 0x80;
173 } else {
174 buf[0] |= (3 << (bit - 1));
175 }
176 } else {
177 buf[0] |= (1 << bit);
178 }
179 }
180 buf[0] &= ~mask;
181 if (bottom) /* set bottom bit if requested */
182 buf[bytes - 1] |= 1;
183 if (BN_bin2bn(buf, bytes, rnd) == NULL)
184 goto err;
185 ret = 1;
186
187err:
188 if (buf != NULL) {
189 OPENSSL_cleanse(buf, bytes);
190 free(buf);
191 }
192 bn_check_top(rnd);
193 return (ret);
194}
195
196int
197BN_rand(BIGNUM *rnd, int bits, int top, int bottom)
198{
199 return bnrand(0, rnd, bits, top, bottom);
200}
201
202int
203BN_pseudo_rand(BIGNUM *rnd, int bits, int top, int bottom)
204{
205 return bnrand(1, rnd, bits, top, bottom);
206}
207
208#if 1
209int
210BN_bntest_rand(BIGNUM *rnd, int bits, int top, int bottom)
211{
212 return bnrand(2, rnd, bits, top, bottom);
213}
214#endif
215
216
217/* random number r: 0 <= r < range */
218static int
219bn_rand_range(int pseudo, BIGNUM *r, const BIGNUM *range)
220{
221 int (*bn_rand)(BIGNUM *, int, int, int) = pseudo ? BN_pseudo_rand : BN_rand;
222 int n;
223 int count = 100;
224
225 if (range->neg || BN_is_zero(range)) {
226 BNerr(BN_F_BN_RAND_RANGE, BN_R_INVALID_RANGE);
227 return 0;
228 }
229
230 n = BN_num_bits(range); /* n > 0 */
231
232 /* BN_is_bit_set(range, n - 1) always holds */
233
234 if (n == 1)
235 BN_zero(r);
236 else if (!BN_is_bit_set(range, n - 2) && !BN_is_bit_set(range, n - 3)) {
237 /* range = 100..._2,
238 * so 3*range (= 11..._2) is exactly one bit longer than range */
239 do {
240 if (!bn_rand(r, n + 1, -1, 0))
241 return 0;
242 /* If r < 3*range, use r := r MOD range
243 * (which is either r, r - range, or r - 2*range).
244 * Otherwise, iterate once more.
245 * Since 3*range = 11..._2, each iteration succeeds with
246 * probability >= .75. */
247 if (BN_cmp(r, range) >= 0) {
248 if (!BN_sub(r, r, range))
249 return 0;
250 if (BN_cmp(r, range) >= 0)
251 if (!BN_sub(r, r, range))
252 return 0;
253 }
254
255 if (!--count) {
256 BNerr(BN_F_BN_RAND_RANGE,
257 BN_R_TOO_MANY_ITERATIONS);
258 return 0;
259 }
260
261 } while (BN_cmp(r, range) >= 0);
262 } else {
263 do {
264 /* range = 11..._2 or range = 101..._2 */
265 if (!bn_rand(r, n, -1, 0))
266 return 0;
267
268 if (!--count) {
269 BNerr(BN_F_BN_RAND_RANGE,
270 BN_R_TOO_MANY_ITERATIONS);
271 return 0;
272 }
273 } while (BN_cmp(r, range) >= 0);
274 }
275
276 bn_check_top(r);
277 return 1;
278}
279
280int
281BN_rand_range(BIGNUM *r, const BIGNUM *range)
282{
283 return bn_rand_range(0, r, range);
284}
285
286int
287BN_pseudo_rand_range(BIGNUM *r, const BIGNUM *range)
288{
289 return bn_rand_range(1, r, range);
290}
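
Aside on bn_rand_range() above: when range has top bits 100..._2, the code
draws n + 1 bits and folds the result back with at most two subtractions,
succeeding with probability >= 3/4 per draw. A word-sized sketch of the same
loop -- rand_range_toy() is a hypothetical helper, not libcrypto code; it
assumes range <= 0x3fffffff so that n + 1 bits fit in a uint32_t, and uses
the gcc/clang __builtin_clz() in place of BN_num_bits():

    #include <stdint.h>
    #include <stdlib.h>    /* arc4random() on OpenBSD */

    static uint32_t
    rand_range_toy(uint32_t range)    /* returns r with 0 <= r < range */
    {
        int n = 32 - __builtin_clz(range);    /* BN_num_bits(range) */
        uint32_t r;

        for (;;) {
            /* draw n + 1 random bits, one more than range has */
            r = arc4random() & ((1U << (n + 1)) - 1);
            if (r >= range)    /* reducing r mod range takes at most */
                r -= range;    /* two subtractions, since */
            if (r >= range)    /* r < 2^(n+1) <= 4 * range */
                r -= range;
            if (r < range)
                return (r);
            /* r was >= 3 * range: iterate, as the BIGNUM code does */
        }
    }
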
diff --git a/src/lib/libcrypto/bn/bn_recp.c b/src/lib/libcrypto/bn/bn_recp.c
deleted file mode 100644
index b0bd0aa4df..0000000000
--- a/src/lib/libcrypto/bn/bn_recp.c
+++ /dev/null
@@ -1,263 +0,0 @@
1/* $OpenBSD: bn_recp.c,v 1.13 2015/04/29 00:11:12 doug Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscape's SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are adhered to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the routines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publicly available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60
61#include <openssl/err.h>
62
63#include "bn_lcl.h"
64
65void
66BN_RECP_CTX_init(BN_RECP_CTX *recp)
67{
68 BN_init(&(recp->N));
69 BN_init(&(recp->Nr));
70 recp->num_bits = 0;
71 recp->flags = 0;
72}
73
74BN_RECP_CTX *
75BN_RECP_CTX_new(void)
76{
77 BN_RECP_CTX *ret;
78
79 if ((ret = malloc(sizeof(BN_RECP_CTX))) == NULL)
80 return (NULL);
81
82 BN_RECP_CTX_init(ret);
83 ret->flags = BN_FLG_MALLOCED;
84 return (ret);
85}
86
87void
88BN_RECP_CTX_free(BN_RECP_CTX *recp)
89{
90 if (recp == NULL)
91 return;
92
93 BN_free(&(recp->N));
94 BN_free(&(recp->Nr));
95 if (recp->flags & BN_FLG_MALLOCED)
96 free(recp);
97}
98
99int
100BN_RECP_CTX_set(BN_RECP_CTX *recp, const BIGNUM *d, BN_CTX *ctx)
101{
102 if (!BN_copy(&(recp->N), d))
103 return 0;
104 BN_zero(&(recp->Nr));
105 recp->num_bits = BN_num_bits(d);
106 recp->shift = 0;
107 return (1);
108}
109
110int
111BN_mod_mul_reciprocal(BIGNUM *r, const BIGNUM *x, const BIGNUM *y,
112 BN_RECP_CTX *recp, BN_CTX *ctx)
113{
114 int ret = 0;
115 BIGNUM *a;
116 const BIGNUM *ca;
117
118 BN_CTX_start(ctx);
119 if ((a = BN_CTX_get(ctx)) == NULL)
120 goto err;
121 if (y != NULL) {
122 if (x == y) {
123 if (!BN_sqr(a, x, ctx))
124 goto err;
125 } else {
126 if (!BN_mul(a, x, y, ctx))
127 goto err;
128 }
129 ca = a;
130 } else
131 ca = x; /* Just do the mod */
132
133 ret = BN_div_recp(NULL, r, ca, recp, ctx);
134
135err:
136 BN_CTX_end(ctx);
137 bn_check_top(r);
138 return (ret);
139}
140
141int
142BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, BN_RECP_CTX *recp,
143 BN_CTX *ctx)
144{
145 int i, j, ret = 0;
146 BIGNUM *a, *b, *d, *r;
147
148 BN_CTX_start(ctx);
149 a = BN_CTX_get(ctx);
150 b = BN_CTX_get(ctx);
151 if (dv != NULL)
152 d = dv;
153 else
154 d = BN_CTX_get(ctx);
155 if (rem != NULL)
156 r = rem;
157 else
158 r = BN_CTX_get(ctx);
159 if (a == NULL || b == NULL || d == NULL || r == NULL)
160 goto err;
161
162 if (BN_ucmp(m, &(recp->N)) < 0) {
163 BN_zero(d);
164 if (!BN_copy(r, m)) {
165 BN_CTX_end(ctx);
166 return 0;
167 }
168 BN_CTX_end(ctx);
169 return (1);
170 }
171
172 /* We want the remainder.
173 * Given input of ABCDEF / ab
174 * we need to multiply ABCDEF by 3 digits of the reciprocal of ab.
175 *
176 */
177
178 /* i := max(BN_num_bits(m), 2*BN_num_bits(N)) */
179 i = BN_num_bits(m);
180 j = recp->num_bits << 1;
181 if (j > i)
182 i = j;
183
184 /* Nr := round(2^i / N) */
185 if (i != recp->shift)
186 recp->shift = BN_reciprocal(&(recp->Nr), &(recp->N), i, ctx);
187
188 /* BN_reciprocal returns i, or -1 for an error */
189 if (recp->shift == -1)
190 goto err;
191
192 /* d := |round(round(m / 2^BN_num_bits(N)) * recp->Nr / 2^(i - BN_num_bits(N)))|
193 * = |round(round(m / 2^BN_num_bits(N)) * round(2^i / N) / 2^(i - BN_num_bits(N)))|
194 * <= |(m / 2^BN_num_bits(N)) * (2^i / N) * (2^BN_num_bits(N) / 2^i)|
195 * = |m/N|
196 */
197 if (!BN_rshift(a, m, recp->num_bits))
198 goto err;
199 if (!BN_mul(b, a, &(recp->Nr), ctx))
200 goto err;
201 if (!BN_rshift(d, b, i - recp->num_bits))
202 goto err;
203 d->neg = 0;
204
205 if (!BN_mul(b, &(recp->N), d, ctx))
206 goto err;
207 if (!BN_usub(r, m, b))
208 goto err;
209 r->neg = 0;
210
211#if 1
212 j = 0;
213 while (BN_ucmp(r, &(recp->N)) >= 0) {
214 if (j++ > 2) {
215 BNerr(BN_F_BN_DIV_RECP, BN_R_BAD_RECIPROCAL);
216 goto err;
217 }
218 if (!BN_usub(r, r, &(recp->N)))
219 goto err;
220 if (!BN_add_word(d, 1))
221 goto err;
222 }
223#endif
224
225 r->neg = BN_is_zero(r) ? 0 : m->neg;
226 d->neg = m->neg^recp->N.neg;
227 ret = 1;
228
229err:
230 BN_CTX_end(ctx);
231 bn_check_top(dv);
232 bn_check_top(rem);
233 return (ret);
234}
235
236/* len is the expected size of the result
237 * We actually calculate with an extra word of precision, so
238 * we can do faster division if the remainder is not required.
239 */
240/* r := 2^len / m */
241int
242BN_reciprocal(BIGNUM *r, const BIGNUM *m, int len, BN_CTX *ctx)
243{
244 int ret = -1;
245 BIGNUM *t;
246
247 BN_CTX_start(ctx);
248 if ((t = BN_CTX_get(ctx)) == NULL)
249 goto err;
250
251 if (!BN_set_bit(t, len))
252 goto err;
253
254 if (!BN_div(r, NULL, t, m, ctx))
255 goto err;
256
257 ret = len;
258
259err:
260 bn_check_top(r);
261 BN_CTX_end(ctx);
262 return (ret);
263}
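
Aside: BN_div_recp() above is classical reciprocal division, easier to see on
machine words. A self-contained sketch with made-up numbers -- compute
Nr = 2^i / N once (cf. BN_reciprocal()), estimate the quotient as
d = ((m >> k) * Nr) >> (i - k), then correct with at most a couple of
subtractions, mirroring the #if 1 loop above:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t N = 1000003;          /* divisor; BN_num_bits(N) == 20 */
        uint64_t m = 123456789012ULL;  /* dividend */
        int k = 20;                    /* recp->num_bits */
        int i = 2 * k;                 /* the shift chosen in BN_div_recp() */
        uint64_t Nr = (1ULL << i) / N; /* the reciprocal */
        uint64_t d = ((m >> k) * Nr) >> (i - k);  /* quotient estimate */
        uint64_t r = m - d * N;

        while (r >= N) {    /* correction loop; runs exactly once here */
            r -= N;
            d++;
        }
        printf("%llu = %llu * %llu + %llu\n", (unsigned long long)m,
            (unsigned long long)d, (unsigned long long)N,
            (unsigned long long)r);
        return (0);
    }
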
diff --git a/src/lib/libcrypto/bn/bn_shift.c b/src/lib/libcrypto/bn/bn_shift.c
deleted file mode 100644
index 0e8211e3d6..0000000000
--- a/src/lib/libcrypto/bn/bn_shift.c
+++ /dev/null
@@ -1,218 +0,0 @@
1/* $OpenBSD: bn_shift.c,v 1.13 2014/10/28 07:35:58 jsg Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscape's SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are adhered to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the routines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publicly available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60#include <string.h>
61
62#include "bn_lcl.h"
63
64int
65BN_lshift1(BIGNUM *r, const BIGNUM *a)
66{
67 BN_ULONG *ap, *rp, t, c;
68 int i;
69
70 bn_check_top(r);
71 bn_check_top(a);
72
73 if (r != a) {
74 r->neg = a->neg;
75 if (bn_wexpand(r, a->top + 1) == NULL)
76 return (0);
77 r->top = a->top;
78 } else {
79 if (bn_wexpand(r, a->top + 1) == NULL)
80 return (0);
81 }
82 ap = a->d;
83 rp = r->d;
84 c = 0;
85 for (i = 0; i < a->top; i++) {
86 t= *(ap++);
87 *(rp++) = ((t << 1) | c) & BN_MASK2;
88 c = (t & BN_TBIT) ? 1 : 0;
89 }
90 if (c) {
91 *rp = 1;
92 r->top++;
93 }
94 bn_check_top(r);
95 return (1);
96}
97
98int
99BN_rshift1(BIGNUM *r, const BIGNUM *a)
100{
101 BN_ULONG *ap, *rp, t, c;
102 int i, j;
103
104 bn_check_top(r);
105 bn_check_top(a);
106
107 if (BN_is_zero(a)) {
108 BN_zero(r);
109 return (1);
110 }
111 i = a->top;
112 ap = a->d;
113 j = i - (ap[i - 1]==1);
114 if (a != r) {
115 if (bn_wexpand(r, j) == NULL)
116 return (0);
117 r->neg = a->neg;
118 }
119 rp = r->d;
120 t = ap[--i];
121 c = (t & 1) ? BN_TBIT : 0;
122 if (t >>= 1)
123 rp[i] = t;
124 while (i > 0) {
125 t = ap[--i];
126 rp[i] = ((t >> 1) & BN_MASK2) | c;
127 c = (t & 1) ? BN_TBIT : 0;
128 }
129 r->top = j;
130 bn_check_top(r);
131 return (1);
132}
133
134int
135BN_lshift(BIGNUM *r, const BIGNUM *a, int n)
136{
137 int i, nw, lb, rb;
138 BN_ULONG *t, *f;
139 BN_ULONG l;
140
141 bn_check_top(r);
142 bn_check_top(a);
143
144 r->neg = a->neg;
145 nw = n / BN_BITS2;
146 if (bn_wexpand(r, a->top + nw + 1) == NULL)
147 return (0);
148 lb = n % BN_BITS2;
149 rb = BN_BITS2 - lb;
150 f = a->d;
151 t = r->d;
152 t[a->top + nw] = 0;
153 if (lb == 0)
154 for (i = a->top - 1; i >= 0; i--)
155 t[nw + i] = f[i];
156 else
157 for (i = a->top - 1; i >= 0; i--) {
158 l = f[i];
159 t[nw + i + 1] |= (l >> rb) & BN_MASK2;
160 t[nw + i] = (l << lb) & BN_MASK2;
161 }
162 memset(t, 0, nw * sizeof(t[0]));
163/* for (i=0; i<nw; i++)
164 t[i]=0;*/
165 r->top = a->top + nw + 1;
166 bn_correct_top(r);
167 bn_check_top(r);
168 return (1);
169}
170
171int
172BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
173{
174 int i, j, nw, lb, rb;
175 BN_ULONG *t, *f;
176 BN_ULONG l, tmp;
177
178 bn_check_top(r);
179 bn_check_top(a);
180
181 nw = n / BN_BITS2;
182 rb = n % BN_BITS2;
183 lb = BN_BITS2 - rb;
184 if (nw >= a->top || a->top == 0) {
185 BN_zero(r);
186 return (1);
187 }
188 i = (BN_num_bits(a) - n + (BN_BITS2 - 1)) / BN_BITS2;
189 if (r != a) {
190 r->neg = a->neg;
191 if (bn_wexpand(r, i) == NULL)
192 return (0);
193 } else {
194 if (n == 0)
195 return 1; /* or the copying loop will go berserk */
196 }
197
198 f = &(a->d[nw]);
199 t = r->d;
200 j = a->top - nw;
201 r->top = i;
202
203 if (rb == 0) {
204 for (i = j; i != 0; i--)
205 *(t++) = *(f++);
206 } else {
207 l = *(f++);
208 for (i = j - 1; i != 0; i--) {
209 tmp = (l >> rb) & BN_MASK2;
210 l = *(f++);
211 *(t++) = (tmp|(l << lb)) & BN_MASK2;
212 }
213 if ((l = (l >> rb) & BN_MASK2))
214 *(t) = l;
215 }
216 bn_check_top(r);
217 return (1);
218}
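
Aside: the lb/rb pair in BN_lshift() and BN_rshift() above is the standard
multi-word shift idiom: each word keeps n of its own bits and picks up the
spill-over from its neighbour. A stripped-down sketch for 0 < n < 64 with the
word size fixed at 64 -- words_lshift() is a hypothetical helper, not
libcrypto code:

    #include <stdint.h>

    /*
     * In-place left shift of a little-endian word array by n bits,
     * 0 < n < 64; returns the bits shifted out of the top word.
     */
    static uint64_t
    words_lshift(uint64_t *w, int top, int n)
    {
        int lb = n;         /* n % BN_BITS2 in the real code */
        int rb = 64 - lb;   /* BN_BITS2 - lb */
        uint64_t carry = 0, l;
        int i;

        for (i = 0; i < top; i++) {
            l = w[i];
            w[i] = (l << lb) | carry;  /* low part stays in this word */
            carry = l >> rb;           /* high rb bits spill upward */
        }
        return (carry);
    }
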
diff --git a/src/lib/libcrypto/bn/bn_sqr.c b/src/lib/libcrypto/bn/bn_sqr.c
deleted file mode 100644
index a0dce6ea81..0000000000
--- a/src/lib/libcrypto/bn/bn_sqr.c
+++ /dev/null
@@ -1,286 +0,0 @@
1/* $OpenBSD: bn_sqr.c,v 1.12 2015/02/09 15:49:22 jsing Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscape's SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are adhered to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the routines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publicly available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60#include <string.h>
61
62#include "bn_lcl.h"
63
64/* r must not be a */
65/* I've just gone over this and it is now %20 faster on x86 - eay - 27 Jun 96 */
66int
67BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx)
68{
69 int max, al;
70 int ret = 0;
71 BIGNUM *tmp, *rr;
72
73#ifdef BN_COUNT
74 fprintf(stderr, "BN_sqr %d * %d\n", a->top, a->top);
75#endif
76 bn_check_top(a);
77
78 al = a->top;
79 if (al <= 0) {
80 r->top = 0;
81 r->neg = 0;
82 return 1;
83 }
84
85 BN_CTX_start(ctx);
86 rr = (a != r) ? r : BN_CTX_get(ctx);
87 tmp = BN_CTX_get(ctx);
88 if (rr == NULL || tmp == NULL)
89 goto err;
90
91 max = 2 * al; /* Non-zero (from above) */
92 if (bn_wexpand(rr, max) == NULL)
93 goto err;
94
95 if (al == 4) {
96#ifndef BN_SQR_COMBA
97 BN_ULONG t[8];
98 bn_sqr_normal(rr->d, a->d, 4, t);
99#else
100 bn_sqr_comba4(rr->d, a->d);
101#endif
102 } else if (al == 8) {
103#ifndef BN_SQR_COMBA
104 BN_ULONG t[16];
105 bn_sqr_normal(rr->d, a->d, 8, t);
106#else
107 bn_sqr_comba8(rr->d, a->d);
108#endif
109 } else {
110#if defined(BN_RECURSION)
111 if (al < BN_SQR_RECURSIVE_SIZE_NORMAL) {
112 BN_ULONG t[BN_SQR_RECURSIVE_SIZE_NORMAL*2];
113 bn_sqr_normal(rr->d, a->d, al, t);
114 } else {
115 int j, k;
116
117 j = BN_num_bits_word((BN_ULONG)al);
118 j = 1 << (j - 1);
119 k = j + j;
120 if (al == j) {
121 if (bn_wexpand(tmp, k * 2) == NULL)
122 goto err;
123 bn_sqr_recursive(rr->d, a->d, al, tmp->d);
124 } else {
125 if (bn_wexpand(tmp, max) == NULL)
126 goto err;
127 bn_sqr_normal(rr->d, a->d, al, tmp->d);
128 }
129 }
130#else
131 if (bn_wexpand(tmp, max) == NULL)
132 goto err;
133 bn_sqr_normal(rr->d, a->d, al, tmp->d);
134#endif
135 }
136
137 rr->neg = 0;
138 /* If the most-significant half of the top word of 'a' is zero, then
139 * the square of 'a' will fit in max-1 words. */
140 if (a->d[al - 1] == (a->d[al - 1] & BN_MASK2l))
141 rr->top = max - 1;
142 else
143 rr->top = max;
144 if (rr != r)
145 BN_copy(r, rr);
146 ret = 1;
147
148err:
149 bn_check_top(rr);
150 bn_check_top(tmp);
151 BN_CTX_end(ctx);
152 return (ret);
153}
154
155/* tmp must have 2*n words */
156void
157bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, int n, BN_ULONG *tmp)
158{
159 int i, j, max;
160 const BN_ULONG *ap;
161 BN_ULONG *rp;
162
163 max = n * 2;
164 ap = a;
165 rp = r;
166 rp[0] = rp[max - 1] = 0;
167 rp++;
168 j = n;
169
170 if (--j > 0) {
171 ap++;
172 rp[j] = bn_mul_words(rp, ap, j, ap[-1]);
173 rp += 2;
174 }
175
176 for (i = n - 2; i > 0; i--) {
177 j--;
178 ap++;
179 rp[j] = bn_mul_add_words(rp, ap, j, ap[-1]);
180 rp += 2;
181 }
182
183 bn_add_words(r, r, r, max);
184
185 /* There will not be a carry */
186
187 bn_sqr_words(tmp, a, n);
188
189 bn_add_words(r, r, tmp, max);
190}
191
192#ifdef BN_RECURSION
193/* r is 2*n words in size,
194 * a and b are both n words in size. (There's not actually a 'b' here ...)
195 * n must be a power of 2.
196 * We multiply and return the result.
197 * t must be 2*n words in size
198 * We calculate
199 * a[0]*b[0]
200 * a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0])
201 * a[1]*b[1]
202 */
203void
204bn_sqr_recursive(BN_ULONG *r, const BN_ULONG *a, int n2, BN_ULONG *t)
205{
206 int n = n2 / 2;
207 int zero, c1;
208 BN_ULONG ln, lo, *p;
209
210#ifdef BN_COUNT
211 fprintf(stderr, " bn_sqr_recursive %d * %d\n", n2, n2);
212#endif
213 if (n2 == 4) {
214#ifndef BN_SQR_COMBA
215 bn_sqr_normal(r, a, 4, t);
216#else
217 bn_sqr_comba4(r, a);
218#endif
219 return;
220 } else if (n2 == 8) {
221#ifndef BN_SQR_COMBA
222 bn_sqr_normal(r, a, 8, t);
223#else
224 bn_sqr_comba8(r, a);
225#endif
226 return;
227 }
228 if (n2 < BN_SQR_RECURSIVE_SIZE_NORMAL) {
229 bn_sqr_normal(r, a, n2, t);
230 return;
231 }
232 /* r=(a[0]-a[1])*(a[1]-a[0]) */
233 c1 = bn_cmp_words(a, &(a[n]), n);
234 zero = 0;
235 if (c1 > 0)
236 bn_sub_words(t, a, &(a[n]), n);
237 else if (c1 < 0)
238 bn_sub_words(t, &(a[n]), a, n);
239 else
240 zero = 1;
241
242 /* The result will always be negative unless it is zero */
243 p = &(t[n2*2]);
244
245 if (!zero)
246 bn_sqr_recursive(&(t[n2]), t, n, p);
247 else
248 memset(&(t[n2]), 0, n2 * sizeof(BN_ULONG));
249 bn_sqr_recursive(r, a, n, p);
250 bn_sqr_recursive(&(r[n2]), &(a[n]), n, p);
251
252 /* t[32] holds (a[0]-a[1])*(a[1]-a[0]); it is negative or zero
253 * r[10] holds (a[0]*a[0])
254 * r[32] holds (a[1]*a[1])
255 */
256
257 c1 = (int)(bn_add_words(t, r, &(r[n2]), n2));
258
259 /* t[32] is negative */
260 c1 -= (int)(bn_sub_words(&(t[n2]), t, &(t[n2]), n2));
261
262 /* t[32] holds (a[0]-a[1])*(a[1]-a[0])+(a[0]*a[0])+(a[1]*a[1])
263 * r[10] holds (a[0]*a[0])
264 * r[32] holds (a[1]*a[1])
265 * c1 holds the carry bits
266 */
267 c1 += (int)(bn_add_words(&(r[n]), &(r[n]), &(t[n2]), n2));
268 if (c1) {
269 p = &(r[n + n2]);
270 lo= *p;
271 ln = (lo + c1) & BN_MASK2;
272 *p = ln;
273
274 /* The overflow will stop before we overwrite
275 * words we should not overwrite */
276 if (ln < (BN_ULONG)c1) {
277 do {
278 p++;
279 lo= *p;
280 ln = (lo + 1) & BN_MASK2;
281 *p = ln;
282 } while (ln == 0);
283 }
284 }
285}
286#endif
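
Aside: bn_sqr_recursive() above is the squaring form of Karatsuba. With
a = a0 + a1*B it computes only the three squares a0^2, a1^2 and (a0 - a1)^2,
because the cross term satisfies 2*a0*a1 = a0^2 + a1^2 - (a0 - a1)^2. A
one-line check of that identity on machine words (toy values, no overflow;
not libcrypto code):

    #include <assert.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint64_t a0 = 0xdead, a1 = 0xbeef;         /* the halves of 'a' */
        uint64_t d = a0 > a1 ? a0 - a1 : a1 - a0;  /* |a0 - a1| */

        /* the middle coefficient, from the three squares actually formed */
        assert(a0 * a0 + a1 * a1 - d * d == 2 * a0 * a1);
        return (0);
    }
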
diff --git a/src/lib/libcrypto/bn/bn_sqrt.c b/src/lib/libcrypto/bn/bn_sqrt.c
deleted file mode 100644
index f94fa41094..0000000000
--- a/src/lib/libcrypto/bn/bn_sqrt.c
+++ /dev/null
@@ -1,405 +0,0 @@
1/* $OpenBSD: bn_sqrt.c,v 1.6 2015/02/09 15:49:22 jsing Exp $ */
2/* Written by Lenka Fibikova <fibikova@exp-math.uni-essen.de>
3 * and Bodo Moeller for the OpenSSL project. */
4/* ====================================================================
5 * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 *
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
17 * distribution.
18 *
19 * 3. All advertising materials mentioning features or use of this
20 * software must display the following acknowledgment:
21 * "This product includes software developed by the OpenSSL Project
22 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
23 *
24 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
25 * endorse or promote products derived from this software without
26 * prior written permission. For written permission, please contact
27 * openssl-core@openssl.org.
28 *
29 * 5. Products derived from this software may not be called "OpenSSL"
30 * nor may "OpenSSL" appear in their names without prior written
31 * permission of the OpenSSL Project.
32 *
33 * 6. Redistributions of any form whatsoever must retain the following
34 * acknowledgment:
35 * "This product includes software developed by the OpenSSL Project
36 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
37 *
38 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
39 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
41 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
42 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
44 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
45 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
46 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
47 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
48 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
49 * OF THE POSSIBILITY OF SUCH DAMAGE.
50 * ====================================================================
51 *
52 * This product includes cryptographic software written by Eric Young
53 * (eay@cryptsoft.com). This product includes software written by Tim
54 * Hudson (tjh@cryptsoft.com).
55 *
56 */
57
58#include <openssl/err.h>
59
60#include "bn_lcl.h"
61
62BIGNUM *
63BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
64/* Returns 'ret' such that
65 * ret^2 == a (mod p),
66 * using the Tonelli/Shanks algorithm (cf. Henri Cohen, "A Course
67 * in Computational Algebraic Number Theory", algorithm 1.5.1).
68 * 'p' must be prime!
69 */
70{
71 BIGNUM *ret = in;
72 int err = 1;
73 int r;
74 BIGNUM *A, *b, *q, *t, *x, *y;
75 int e, i, j;
76
77 if (!BN_is_odd(p) || BN_abs_is_word(p, 1)) {
78 if (BN_abs_is_word(p, 2)) {
79 if (ret == NULL)
80 ret = BN_new();
81 if (ret == NULL)
82 goto end;
83 if (!BN_set_word(ret, BN_is_bit_set(a, 0))) {
84 if (ret != in)
85 BN_free(ret);
86 return NULL;
87 }
88 bn_check_top(ret);
89 return ret;
90 }
91
92 BNerr(BN_F_BN_MOD_SQRT, BN_R_P_IS_NOT_PRIME);
93 return (NULL);
94 }
95
96 if (BN_is_zero(a) || BN_is_one(a)) {
97 if (ret == NULL)
98 ret = BN_new();
99 if (ret == NULL)
100 goto end;
101 if (!BN_set_word(ret, BN_is_one(a))) {
102 if (ret != in)
103 BN_free(ret);
104 return NULL;
105 }
106 bn_check_top(ret);
107 return ret;
108 }
109
110 BN_CTX_start(ctx);
111 if ((A = BN_CTX_get(ctx)) == NULL)
112 goto end;
113 if ((b = BN_CTX_get(ctx)) == NULL)
114 goto end;
115 if ((q = BN_CTX_get(ctx)) == NULL)
116 goto end;
117 if ((t = BN_CTX_get(ctx)) == NULL)
118 goto end;
119 if ((x = BN_CTX_get(ctx)) == NULL)
120 goto end;
121 if ((y = BN_CTX_get(ctx)) == NULL)
122 goto end;
123
124 if (ret == NULL)
125 ret = BN_new();
126 if (ret == NULL)
127 goto end;
128
129 /* A = a mod p */
130 if (!BN_nnmod(A, a, p, ctx))
131 goto end;
132
133 /* now write |p| - 1 as 2^e*q where q is odd */
134 e = 1;
135 while (!BN_is_bit_set(p, e))
136 e++;
137 /* we'll set q later (if needed) */
138
139 if (e == 1) {
140 /* The easy case: (|p|-1)/2 is odd, so 2 has an inverse
141 * modulo (|p|-1)/2, and square roots can be computed
142 * directly by modular exponentiation.
143 * We have
144 * 2 * (|p|+1)/4 == 1 (mod (|p|-1)/2),
145 * so we can use exponent (|p|+1)/4, i.e. (|p|-3)/4 + 1.
146 */
147 if (!BN_rshift(q, p, 2))
148 goto end;
149 q->neg = 0;
150 if (!BN_add_word(q, 1))
151 goto end;
152 if (!BN_mod_exp(ret, A, q, p, ctx))
153 goto end;
154 err = 0;
155 goto vrfy;
156 }
157
158 if (e == 2) {
159 /* |p| == 5 (mod 8)
160 *
161 * In this case 2 is always a non-square since
162 * Legendre(2,p) = (-1)^((p^2-1)/8) for any odd prime.
163 * So if a really is a square, then 2*a is a non-square.
164 * Thus for
165 * b := (2*a)^((|p|-5)/8),
166 * i := (2*a)*b^2
167 * we have
168 * i^2 = (2*a)^((1 + (|p|-5)/4)*2)
169 * = (2*a)^((p-1)/2)
170 * = -1;
171 * so if we set
172 * x := a*b*(i-1),
173 * then
174 * x^2 = a^2 * b^2 * (i^2 - 2*i + 1)
175 * = a^2 * b^2 * (-2*i)
176 * = a*(-i)*(2*a*b^2)
177 * = a*(-i)*i
178 * = a.
179 *
180 * (This is due to A.O.L. Atkin,
181 * <URL: http://listserv.nodak.edu/scripts/wa.exe?A2=ind9211&L=nmbrthry&O=T&P=562>,
182 * November 1992.)
183 */
184
185 /* t := 2*a */
186 if (!BN_mod_lshift1_quick(t, A, p))
187 goto end;
188
189 /* b := (2*a)^((|p|-5)/8) */
190 if (!BN_rshift(q, p, 3))
191 goto end;
192 q->neg = 0;
193 if (!BN_mod_exp(b, t, q, p, ctx))
194 goto end;
195
196 /* y := b^2 */
197 if (!BN_mod_sqr(y, b, p, ctx))
198 goto end;
199
200 /* t := (2*a)*b^2 - 1*/
201 if (!BN_mod_mul(t, t, y, p, ctx))
202 goto end;
203 if (!BN_sub_word(t, 1))
204 goto end;
205
206 /* x = a*b*t */
207 if (!BN_mod_mul(x, A, b, p, ctx))
208 goto end;
209 if (!BN_mod_mul(x, x, t, p, ctx))
210 goto end;
211
212 if (!BN_copy(ret, x))
213 goto end;
214 err = 0;
215 goto vrfy;
216 }
217
218 /* e > 2, so we really have to use the Tonelli/Shanks algorithm.
219 * First, find some y that is not a square. */
220 if (!BN_copy(q, p)) goto end; /* use 'q' as temp */
221 q->neg = 0;
222 i = 2;
223 do {
224 /* For efficiency, try small numbers first;
225 * if this fails, try random numbers.
226 */
227 if (i < 22) {
228 if (!BN_set_word(y, i))
229 goto end;
230 } else {
231 if (!BN_pseudo_rand(y, BN_num_bits(p), 0, 0))
232 goto end;
233 if (BN_ucmp(y, p) >= 0) {
234 if (!(p->neg ? BN_add : BN_sub)(y, y, p))
235 goto end;
236 }
237 /* now 0 <= y < |p| */
238 if (BN_is_zero(y))
239 if (!BN_set_word(y, i))
240 goto end;
241 }
242
243 r = BN_kronecker(y, q, ctx); /* here 'q' is |p| */
244 if (r < -1)
245 goto end;
246 if (r == 0) {
247 /* y divides p, so p is not prime */
248 BNerr(BN_F_BN_MOD_SQRT, BN_R_P_IS_NOT_PRIME);
249 goto end;
250 }
251 }
252 while (r == 1 && ++i < 82);
253
254 if (r != -1) {
255 /* Many rounds and still no non-square -- this is more likely
256 * a bug than just bad luck.
257 * Even if p is not prime, we should have found some y
258 * such that r == -1.
259 */
260 BNerr(BN_F_BN_MOD_SQRT, BN_R_TOO_MANY_ITERATIONS);
261 goto end;
262 }
263
264 /* Here's our actual 'q': */
265 if (!BN_rshift(q, q, e))
266 goto end;
267
268 /* Now that we have some non-square, we can find an element
269 * of order 2^e by computing its q'th power. */
270 if (!BN_mod_exp(y, y, q, p, ctx))
271 goto end;
272 if (BN_is_one(y)) {
273 BNerr(BN_F_BN_MOD_SQRT, BN_R_P_IS_NOT_PRIME);
274 goto end;
275 }
276
277 /* Now we know that (if p is indeed prime) there is an integer
278 * k, 0 <= k < 2^e, such that
279 *
280 * a^q * y^k == 1 (mod p).
281 *
282 * As a^q is a square and y is not, k must be even.
283 * q+1 is even, too, so there is an element
284 *
285 * X := a^((q+1)/2) * y^(k/2),
286 *
287 * and it satisfies
288 *
289 * X^2 = a^q * a * y^k
290 * = a,
291 *
292 * so it is the square root that we are looking for.
293 */
294
295 /* t := (q-1)/2 (note that q is odd) */
296 if (!BN_rshift1(t, q))
297 goto end;
298
299 /* x := a^((q-1)/2) */
300 if (BN_is_zero(t)) /* special case: p = 2^e + 1 */
301 {
302 if (!BN_nnmod(t, A, p, ctx))
303 goto end;
304 if (BN_is_zero(t)) {
305 /* special case: a == 0 (mod p) */
306 BN_zero(ret);
307 err = 0;
308 goto end;
309 } else if (!BN_one(x))
310 goto end;
311 } else {
312 if (!BN_mod_exp(x, A, t, p, ctx))
313 goto end;
314 if (BN_is_zero(x)) {
315 /* special case: a == 0 (mod p) */
316 BN_zero(ret);
317 err = 0;
318 goto end;
319 }
320 }
321
322 /* b := a*x^2 (= a^q) */
323 if (!BN_mod_sqr(b, x, p, ctx))
324 goto end;
325 if (!BN_mod_mul(b, b, A, p, ctx))
326 goto end;
327
328 /* x := a*x (= a^((q+1)/2)) */
329 if (!BN_mod_mul(x, x, A, p, ctx))
330 goto end;
331
332 while (1) {
333 /* Now b is a^q * y^k for some even k (0 <= k < 2^E
334 * where E refers to the original value of e, which we
335 * don't keep in a variable), and x is a^((q+1)/2) * y^(k/2).
336 *
337 * We have a*b = x^2,
338 * y^2^(e-1) = -1,
339 * b^2^(e-1) = 1.
340 */
341
342 if (BN_is_one(b)) {
343 if (!BN_copy(ret, x))
344 goto end;
345 err = 0;
346 goto vrfy;
347 }
348
349
350 /* find smallest i such that b^(2^i) = 1 */
351 i = 1;
352 if (!BN_mod_sqr(t, b, p, ctx))
353 goto end;
354 while (!BN_is_one(t)) {
355 i++;
356 if (i == e) {
357 BNerr(BN_F_BN_MOD_SQRT, BN_R_NOT_A_SQUARE);
358 goto end;
359 }
360 if (!BN_mod_mul(t, t, t, p, ctx))
361 goto end;
362 }
363
364
365 /* t := y^2^(e - i - 1) */
366 if (!BN_copy(t, y))
367 goto end;
368 for (j = e - i - 1; j > 0; j--) {
369 if (!BN_mod_sqr(t, t, p, ctx))
370 goto end;
371 }
372 if (!BN_mod_mul(y, t, t, p, ctx))
373 goto end;
374 if (!BN_mod_mul(x, x, t, p, ctx))
375 goto end;
376 if (!BN_mod_mul(b, b, y, p, ctx))
377 goto end;
378 e = i;
379 }
380
381vrfy:
382 if (!err) {
383 /* verify the result -- the input might not have been a square
384 * (test added in 0.9.8) */
385
386 if (!BN_mod_sqr(x, ret, p, ctx))
387 err = 1;
388
389 if (!err && 0 != BN_cmp(x, A)) {
390 BNerr(BN_F_BN_MOD_SQRT, BN_R_NOT_A_SQUARE);
391 err = 1;
392 }
393 }
394
395end:
396 if (err) {
397 if (ret != NULL && ret != in) {
398 BN_clear_free(ret);
399 }
400 ret = NULL;
401 }
402 BN_CTX_end(ctx);
403 bn_check_top(ret);
404 return ret;
405}
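
Aside: in the e == 1 branch above (p == 3 (mod 4)) the whole algorithm
collapses to one exponentiation, ret = a^((p+1)/4) mod p. A sketch checking
that on small integers -- modexp() is a naive square-and-multiply stand-in
for BN_mod_exp(), and 1000003 is assumed prime (it is, with 1000003 % 4 == 3):

    #include <assert.h>
    #include <stdint.h>

    static uint64_t
    modexp(uint64_t b, uint64_t e, uint64_t m)    /* b^e mod m, m < 2^32 */
    {
        uint64_t r = 1;

        for (b %= m; e != 0; e >>= 1) {
            if (e & 1)
                r = r * b % m;
            b = b * b % m;
        }
        return (r);
    }

    int
    main(void)
    {
        uint64_t p = 1000003;    /* prime with p % 4 == 3, i.e. e == 1 */
        uint64_t a = 12345;
        uint64_t s = modexp(a * a % p, (p + 1) / 4, p);

        assert(s == a || s == p - a);    /* sqrt of a^2 is a or -a mod p */
        return (0);
    }
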
diff --git a/src/lib/libcrypto/bn/bn_word.c b/src/lib/libcrypto/bn/bn_word.c
deleted file mode 100644
index c4c6754c37..0000000000
--- a/src/lib/libcrypto/bn/bn_word.c
+++ /dev/null
@@ -1,233 +0,0 @@
1/* $OpenBSD: bn_word.c,v 1.12 2014/07/11 08:44:48 jsing Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscape's SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are adhered to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the routines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publicly available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60
61#include "bn_lcl.h"
62
63BN_ULONG
64BN_mod_word(const BIGNUM *a, BN_ULONG w)
65{
66#ifndef BN_LLONG
67 BN_ULONG ret = 0;
68#else
69 BN_ULLONG ret = 0;
70#endif
71 int i;
72
73 if (w == 0)
74 return (BN_ULONG) - 1;
75
76 bn_check_top(a);
77 w &= BN_MASK2;
78 for (i = a->top - 1; i >= 0; i--) {
79#ifndef BN_LLONG
80 ret = ((ret << BN_BITS4) | ((a->d[i] >> BN_BITS4) &
81 BN_MASK2l)) % w;
82 ret = ((ret << BN_BITS4) | (a->d[i] & BN_MASK2l)) % w;
83#else
84 ret = (BN_ULLONG)(((ret << (BN_ULLONG)BN_BITS2) |
85 a->d[i]) % (BN_ULLONG)w);
86#endif
87 }
88 return ((BN_ULONG)ret);
89}
90
91BN_ULONG
92BN_div_word(BIGNUM *a, BN_ULONG w)
93{
94 BN_ULONG ret = 0;
95 int i, j;
96
97 bn_check_top(a);
98 w &= BN_MASK2;
99
100 if (!w)
101 /* actually this is an error (division by zero) */
102 return (BN_ULONG) - 1;
103 if (a->top == 0)
104 return 0;
105
106 /* normalize input (so bn_div_words doesn't complain) */
107 j = BN_BITS2 - BN_num_bits_word(w);
108 w <<= j;
109 if (!BN_lshift(a, a, j))
110 return (BN_ULONG) - 1;
111
112 for (i = a->top - 1; i >= 0; i--) {
113 BN_ULONG l, d;
114
115 l = a->d[i];
116 d = bn_div_words(ret, l, w);
117 ret = (l - ((d*w)&BN_MASK2))&BN_MASK2;
118 a->d[i] = d;
119 }
120 if ((a->top > 0) && (a->d[a->top - 1] == 0))
121 a->top--;
122 ret >>= j;
123 bn_check_top(a);
124 return (ret);
125}
126
127int
128BN_add_word(BIGNUM *a, BN_ULONG w)
129{
130 BN_ULONG l;
131 int i;
132
133 bn_check_top(a);
134 w &= BN_MASK2;
135
136 /* degenerate case: w is zero */
137 if (!w)
138 return 1;
139 /* degenerate case: a is zero */
140 if (BN_is_zero(a))
141 return BN_set_word(a, w);
142 /* handle 'a' when negative */
143 if (a->neg) {
144 a->neg = 0;
145 i = BN_sub_word(a, w);
146 if (!BN_is_zero(a))
147 a->neg=!(a->neg);
148 return (i);
149 }
150 for (i = 0; w != 0 && i < a->top; i++) {
151 a->d[i] = l = (a->d[i] + w) & BN_MASK2;
152 w = (w > l) ? 1 : 0;
153 }
154 if (w && i == a->top) {
155 if (bn_wexpand(a, a->top + 1) == NULL)
156 return 0;
157 a->top++;
158 a->d[i] = w;
159 }
160 bn_check_top(a);
161 return (1);
162}
163
164int
165BN_sub_word(BIGNUM *a, BN_ULONG w)
166{
167 int i;
168
169 bn_check_top(a);
170 w &= BN_MASK2;
171
172 /* degenerate case: w is zero */
173 if (!w)
174 return 1;
175 /* degenerate case: a is zero */
176 if (BN_is_zero(a)) {
177 i = BN_set_word(a, w);
178 if (i != 0)
179 BN_set_negative(a, 1);
180 return i;
181 }
182 /* handle 'a' when negative */
183 if (a->neg) {
184 a->neg = 0;
185 i = BN_add_word(a, w);
186 a->neg = 1;
187 return (i);
188 }
189
190 if ((a->top == 1) && (a->d[0] < w)) {
191 a->d[0] = w - a->d[0];
192 a->neg = 1;
193 return (1);
194 }
195 i = 0;
196 for (;;) {
197 if (a->d[i] >= w) {
198 a->d[i] -= w;
199 break;
200 } else {
201 a->d[i] = (a->d[i] - w) & BN_MASK2;
202 i++;
203 w = 1;
204 }
205 }
206 if ((a->d[i] == 0) && (i == (a->top - 1)))
207 a->top--;
208 bn_check_top(a);
209 return (1);
210}
211
212int
213BN_mul_word(BIGNUM *a, BN_ULONG w)
214{
215 BN_ULONG ll;
216
217 bn_check_top(a);
218 w &= BN_MASK2;
219 if (a->top) {
220 if (w == 0)
221 BN_zero(a);
222 else {
223 ll = bn_mul_words(a->d, a->d, a->top, w);
224 if (ll) {
225 if (bn_wexpand(a, a->top + 1) == NULL)
226 return (0);
227 a->d[a->top++] = ll;
228 }
229 }
230 }
231 bn_check_top(a);
232 return (1);
233}
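
Aside: the !BN_LLONG branch of BN_mod_word() above folds each word into the
running remainder as two half-words, so no double-width intermediate is ever
needed. The same trick on 64-bit words as a standalone sketch -- words_mod()
is a hypothetical helper, and like the half-word method itself it is only
correct when w fits in a half word (w <= 0xffffffff):

    #include <assert.h>
    #include <stdint.h>

    static uint64_t
    words_mod(const uint64_t *d, int top, uint64_t w)
    {
        uint64_t ret = 0;
        int i;

        for (i = top - 1; i >= 0; i--) {    /* most significant word first */
            ret = ((ret << 32) | (d[i] >> 32)) % w;
            ret = ((ret << 32) | (d[i] & 0xffffffffULL)) % w;
        }
        return (ret);
    }

    int
    main(void)
    {
        uint64_t d[2] = { 0, 1 };    /* d[1] is the high word: value 2^64 */

        assert(words_mod(d, 2, 10) == 6);    /* 2^64 = ...551616 */
        return (0);
    }
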
diff --git a/src/lib/libcrypto/bn/bn_x931p.c b/src/lib/libcrypto/bn/bn_x931p.c
deleted file mode 100644
index 1948bc8e71..0000000000
--- a/src/lib/libcrypto/bn/bn_x931p.c
+++ /dev/null
@@ -1,279 +0,0 @@
1/* $OpenBSD: bn_x931p.c,v 1.8 2015/04/29 00:11:12 doug Exp $ */
2/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
3 * project 2005.
4 */
5/* ====================================================================
6 * Copyright (c) 2005 The OpenSSL Project. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 *
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
18 * distribution.
19 *
20 * 3. All advertising materials mentioning features or use of this
21 * software must display the following acknowledgment:
22 * "This product includes software developed by the OpenSSL Project
23 * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
24 *
25 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
26 * endorse or promote products derived from this software without
27 * prior written permission. For written permission, please contact
28 * licensing@OpenSSL.org.
29 *
30 * 5. Products derived from this software may not be called "OpenSSL"
31 * nor may "OpenSSL" appear in their names without prior written
32 * permission of the OpenSSL Project.
33 *
34 * 6. Redistributions of any form whatsoever must retain the following
35 * acknowledgment:
36 * "This product includes software developed by the OpenSSL Project
37 * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
38 *
39 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
40 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
41 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
42 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
43 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
45 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
46 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
48 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
49 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
50 * OF THE POSSIBILITY OF SUCH DAMAGE.
51 * ====================================================================
52 *
53 * This product includes cryptographic software written by Eric Young
54 * (eay@cryptsoft.com). This product includes software written by Tim
55 * Hudson (tjh@cryptsoft.com).
56 *
57 */
58
59#include <stdio.h>
60#include <openssl/bn.h>
61
62/* X9.31 routines for prime derivation */
63
64/* X9.31 prime derivation. This is used to generate the primes pi
65 * (p1, p2, q1, q2) from a parameter Xpi by checking successive odd
66 * integers.
67 */
68
69static int
70bn_x931_derive_pi(BIGNUM *pi, const BIGNUM *Xpi, BN_CTX *ctx, BN_GENCB *cb)
71{
72 int i = 0;
73
74 if (!BN_copy(pi, Xpi))
75 return 0;
76 if (!BN_is_odd(pi) && !BN_add_word(pi, 1))
77 return 0;
78 for (;;) {
79 i++;
80 BN_GENCB_call(cb, 0, i);
81 /* NB 27 MR rounds are specified in X9.31 */
82 if (BN_is_prime_fasttest_ex(pi, 27, ctx, 1, cb))
83 break;
84 if (!BN_add_word(pi, 2))
85 return 0;
86 }
87 BN_GENCB_call(cb, 2, i);
88 return 1;
89}
90
91/* This is the main X9.31 prime derivation function. From parameters
92 * Xp1, Xp2 and Xp derive the prime p. If the parameters p1 or p2 are
93 * not NULL they will be returned too: this is needed for testing.
94 */
95
96int
97BN_X931_derive_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2, const BIGNUM *Xp,
98 const BIGNUM *Xp1, const BIGNUM *Xp2, const BIGNUM *e, BN_CTX *ctx,
99 BN_GENCB *cb)
100{
101 int ret = 0;
102
103 BIGNUM *t, *p1p2, *pm1;
104
105 /* Only odd e supported */
106 if (!BN_is_odd(e))
107 return 0;
108
109 BN_CTX_start(ctx);
110 if (p1 == NULL) {
111 if ((p1 = BN_CTX_get(ctx)) == NULL)
112 goto err;
113 }
114 if (p2 == NULL) {
115 if ((p2 = BN_CTX_get(ctx)) == NULL)
116 goto err;
117 }
118
119 if ((t = BN_CTX_get(ctx)) == NULL)
120 goto err;
121 if ((p1p2 = BN_CTX_get(ctx)) == NULL)
122 goto err;
123 if ((pm1 = BN_CTX_get(ctx)) == NULL)
124 goto err;
125
126 if (!bn_x931_derive_pi(p1, Xp1, ctx, cb))
127 goto err;
128
129 if (!bn_x931_derive_pi(p2, Xp2, ctx, cb))
130 goto err;
131
132 if (!BN_mul(p1p2, p1, p2, ctx))
133 goto err;
134
135 /* First set p to value of Rp */
136
137 if (!BN_mod_inverse(p, p2, p1, ctx))
138 goto err;
139
140 if (!BN_mul(p, p, p2, ctx))
141 goto err;
142
143 if (!BN_mod_inverse(t, p1, p2, ctx))
144 goto err;
145
146 if (!BN_mul(t, t, p1, ctx))
147 goto err;
148
149 if (!BN_sub(p, p, t))
150 goto err;
151
152 if (p->neg && !BN_add(p, p, p1p2))
153 goto err;
154
155 /* p now equals Rp */
156
157 if (!BN_mod_sub(p, p, Xp, p1p2, ctx))
158 goto err;
159
160 if (!BN_add(p, p, Xp))
161 goto err;
162
163 /* p now equals Yp0 */
164
165 for (;;) {
166 int i = 1;
167 BN_GENCB_call(cb, 0, i++);
168 if (!BN_copy(pm1, p))
169 goto err;
170 if (!BN_sub_word(pm1, 1))
171 goto err;
172 if (!BN_gcd(t, pm1, e, ctx))
173 goto err;
174 if (BN_is_one(t)
175 /* X9.31 specifies 8 MR and 1 Lucas test, or any prime test
176 * offering similar or better guarantees; 50 MR is considerably
177 * better.
178 */
179 && BN_is_prime_fasttest_ex(p, 50, ctx, 1, cb))
180 break;
181 if (!BN_add(p, p, p1p2))
182 goto err;
183 }
184
185 BN_GENCB_call(cb, 3, 0);
186
187 ret = 1;
188
189err:
190
191 BN_CTX_end(ctx);
192
193 return ret;
194}
195
196 /* Generate a pair of parameters Xp, Xq for X9.31 prime generation.
197 * Note: the nbits parameter is the sum of the number of bits in both.
198 */
199
200int
201BN_X931_generate_Xpq(BIGNUM *Xp, BIGNUM *Xq, int nbits, BN_CTX *ctx)
202{
203 BIGNUM *t;
204 int i;
205 int ret = 0;
206
207 /* Number of bits for each prime is of the form
208 * 512+128s for s = 0, 1, ...
209 */
210 if ((nbits < 1024) || (nbits & 0xff))
211 return 0;
212 nbits >>= 1;
213 /* The random value Xp must be between sqrt(2) * 2^(nbits-1) and
214 * 2^nbits - 1. By setting the top two bits we ensure that the lower
215 * bound is exceeded.
216 */
217 if (!BN_rand(Xp, nbits, 1, 0))
218 return 0;
219
220 BN_CTX_start(ctx);
221 if ((t = BN_CTX_get(ctx)) == NULL)
222 goto err;
223
224 for (i = 0; i < 1000; i++) {
225 if (!BN_rand(Xq, nbits, 1, 0))
226 goto err;
227 /* Check that |Xp - Xq| > 2^(nbits - 100) */
228 BN_sub(t, Xp, Xq);
229 if (BN_num_bits(t) > (nbits - 100))
230 break;
231 }
232
233 if (i < 1000)
234 ret = 1;
235
236err:
237 BN_CTX_end(ctx);
238
239 return ret;
240}
241
242 /* Generate primes using the X9.31 algorithm. Of the values p, p1, p2, Xp1
243 * and Xp2 only 'p' needs to be non-NULL. If any of the others are not NULL
244 * the relevant parameter will be stored in it.
245 *
246 * Because |Xp - Xq| > 2^(nbits - 100) must be satisfied, Xp and Xq
247 * are generated using the previous function and supplied as input.
248 */
249
250int
251BN_X931_generate_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2, BIGNUM *Xp1,
252 BIGNUM *Xp2, const BIGNUM *Xp, const BIGNUM *e, BN_CTX *ctx, BN_GENCB *cb)
253{
254 int ret = 0;
255
256 BN_CTX_start(ctx);
257 if (Xp1 == NULL) {
258 if ((Xp1 = BN_CTX_get(ctx)) == NULL)
259 goto error;
260 }
261 if (Xp2 == NULL) {
262 if ((Xp2 = BN_CTX_get(ctx)) == NULL)
263 goto error;
264 }
265
266 if (!BN_rand(Xp1, 101, 0, 0))
267 goto error;
268 if (!BN_rand(Xp2, 101, 0, 0))
269 goto error;
270 if (!BN_X931_derive_prime_ex(p, p1, p2, Xp, Xp1, Xp2, e, ctx, cb))
271 goto error;
272
273 ret = 1;
274
275error:
276 BN_CTX_end(ctx);
277
278 return ret;
279}
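
Aside: the Rp built in BN_X931_derive_prime_ex() above is the CRT element
with Rp == 1 (mod p1) and Rp == -1 (mod p2); stepping candidates by p1*p2
from Yp0 then keeps p - 1 divisible by p1 and p + 1 divisible by p2, as
X9.31 requires. A toy check of those congruences -- inv_mod() is a naive
hypothetical helper and the primes are illustrative stand-ins:

    #include <assert.h>
    #include <stdint.h>

    static int64_t
    inv_mod(int64_t a, int64_t m)    /* naive search; fine for a toy check */
    {
        int64_t x;

        for (x = 1; x < m; x++)
            if (a % m * x % m == 1)
                return (x);
        return (0);
    }

    int
    main(void)
    {
        int64_t p1 = 11, p2 = 7;    /* stand-ins for the derived primes */
        /* Rp = (p2^-1 mod p1)*p2 - (p1^-1 mod p2)*p1, as computed above */
        int64_t rp = inv_mod(p2, p1) * p2 - inv_mod(p1, p2) * p1;

        if (rp < 0)
            rp += p1 * p2;           /* the p->neg correction in the diff */
        assert(rp % p1 == 1);        /* Rp ==  1 (mod p1) */
        assert((rp + 1) % p2 == 0);  /* Rp == -1 (mod p2) */
        return (0);
    }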