summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/bn
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/lib/libcrypto/bn/asm/alpha-mont.pl321
-rw-r--r--src/lib/libcrypto/bn/asm/armv4-gf2m.pl278
-rw-r--r--src/lib/libcrypto/bn/asm/armv4-mont.pl204
-rw-r--r--src/lib/libcrypto/bn/asm/bn-586.pl774
-rw-r--r--src/lib/libcrypto/bn/asm/co-586.pl287
-rw-r--r--src/lib/libcrypto/bn/asm/ia64-mont.pl851
-rw-r--r--src/lib/libcrypto/bn/asm/ia64.S1555
-rw-r--r--src/lib/libcrypto/bn/asm/mips-mont.pl426
-rw-r--r--src/lib/libcrypto/bn/asm/mips.pl2585
-rw-r--r--src/lib/libcrypto/bn/asm/modexp512-x86_64.pl1496
-rw-r--r--src/lib/libcrypto/bn/asm/pa-risc2.s1618
-rw-r--r--src/lib/libcrypto/bn/asm/pa-risc2W.s1605
-rw-r--r--src/lib/libcrypto/bn/asm/parisc-mont.pl993
-rw-r--r--src/lib/libcrypto/bn/asm/ppc-mont.pl334
-rw-r--r--src/lib/libcrypto/bn/asm/ppc.pl1998
-rw-r--r--src/lib/libcrypto/bn/asm/ppc64-mont.pl1088
-rw-r--r--src/lib/libcrypto/bn/asm/s390x-gf2m.pl221
-rw-r--r--src/lib/libcrypto/bn/asm/s390x-mont.pl277
-rwxr-xr-xsrc/lib/libcrypto/bn/asm/s390x.S678
-rw-r--r--src/lib/libcrypto/bn/asm/sparcv8.S1458
-rw-r--r--src/lib/libcrypto/bn/asm/sparcv8plus.S1558
-rw-r--r--src/lib/libcrypto/bn/asm/sparcv9-mont.pl606
-rwxr-xr-xsrc/lib/libcrypto/bn/asm/sparcv9a-mont.pl882
-rw-r--r--src/lib/libcrypto/bn/asm/via-mont.pl242
-rw-r--r--src/lib/libcrypto/bn/asm/x86-gf2m.pl313
-rwxr-xr-xsrc/lib/libcrypto/bn/asm/x86-mont.pl593
-rw-r--r--src/lib/libcrypto/bn/asm/x86.pl28
-rw-r--r--src/lib/libcrypto/bn/asm/x86/add.pl76
-rw-r--r--src/lib/libcrypto/bn/asm/x86/comba.pl277
-rw-r--r--src/lib/libcrypto/bn/asm/x86/div.pl15
-rw-r--r--src/lib/libcrypto/bn/asm/x86/mul.pl77
-rw-r--r--src/lib/libcrypto/bn/asm/x86/mul_add.pl87
-rw-r--r--src/lib/libcrypto/bn/asm/x86/sqr.pl60
-rw-r--r--src/lib/libcrypto/bn/asm/x86/sub.pl76
-rw-r--r--src/lib/libcrypto/bn/asm/x86_64-gcc.c606
-rw-r--r--src/lib/libcrypto/bn/asm/x86_64-gf2m.pl389
-rwxr-xr-xsrc/lib/libcrypto/bn/asm/x86_64-mont.pl1680
-rwxr-xr-xsrc/lib/libcrypto/bn/asm/x86_64-mont5.pl1070
-rw-r--r--src/lib/libcrypto/bn/bn.h891
-rw-r--r--src/lib/libcrypto/bn/bn_add.c313
-rw-r--r--src/lib/libcrypto/bn/bn_asm.c1030
-rw-r--r--src/lib/libcrypto/bn/bn_blind.c385
-rw-r--r--src/lib/libcrypto/bn/bn_const.c402
-rw-r--r--src/lib/libcrypto/bn/bn_ctx.c454
-rw-r--r--src/lib/libcrypto/bn/bn_depr.c112
-rw-r--r--src/lib/libcrypto/bn/bn_div.c446
-rw-r--r--src/lib/libcrypto/bn/bn_err.c150
-rw-r--r--src/lib/libcrypto/bn/bn_exp.c1097
-rw-r--r--src/lib/libcrypto/bn/bn_exp2.c312
-rw-r--r--src/lib/libcrypto/bn/bn_gcd.c654
-rw-r--r--src/lib/libcrypto/bn/bn_gf2m.c1113
-rw-r--r--src/lib/libcrypto/bn/bn_kron.c184
-rw-r--r--src/lib/libcrypto/bn/bn_lcl.h508
-rw-r--r--src/lib/libcrypto/bn/bn_lib.c826
-rw-r--r--src/lib/libcrypto/bn/bn_mod.c301
-rw-r--r--src/lib/libcrypto/bn/bn_mont.c509
-rw-r--r--src/lib/libcrypto/bn/bn_mpi.c130
-rw-r--r--src/lib/libcrypto/bn/bn_mul.c1166
-rw-r--r--src/lib/libcrypto/bn/bn_nist.c1102
-rw-r--r--src/lib/libcrypto/bn/bn_prime.c494
-rw-r--r--src/lib/libcrypto/bn/bn_prime.h327
-rw-r--r--src/lib/libcrypto/bn/bn_prime.pl119
-rw-r--r--src/lib/libcrypto/bn/bn_print.c378
-rw-r--r--src/lib/libcrypto/bn/bn_rand.c305
-rw-r--r--src/lib/libcrypto/bn/bn_recp.c234
-rw-r--r--src/lib/libcrypto/bn/bn_shift.c223
-rw-r--r--src/lib/libcrypto/bn/bn_sqr.c294
-rw-r--r--src/lib/libcrypto/bn/bn_sqrt.c393
-rw-r--r--src/lib/libcrypto/bn/bn_word.c238
-rw-r--r--src/lib/libcrypto/bn/bn_x931p.c272
70 files changed, 0 insertions, 43044 deletions
diff --git a/src/lib/libcrypto/bn/asm/alpha-mont.pl b/src/lib/libcrypto/bn/asm/alpha-mont.pl
deleted file mode 100644
index 03596e2014..0000000000
--- a/src/lib/libcrypto/bn/asm/alpha-mont.pl
+++ /dev/null
@@ -1,321 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# On 21264 RSA sign performance improves by 70/35/20/15 percent for
11# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
12# instructed to '-tune host' code with in-line assembler. Other
13# benchmarks improve by 15-20%. To anchor it to something else, the
14# code provides approximately the same performance per GHz as AMD64.
15# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
16# difference.
17
18# int bn_mul_mont(
19$rp="a0"; # BN_ULONG *rp,
20$ap="a1"; # const BN_ULONG *ap,
21$bp="a2"; # const BN_ULONG *bp,
22$np="a3"; # const BN_ULONG *np,
23$n0="a4"; # const BN_ULONG *n0,
24$num="a5"; # int num);
25
26$lo0="t0";
27$hi0="t1";
28$lo1="t2";
29$hi1="t3";
30$aj="t4";
31$bi="t5";
32$nj="t6";
33$tp="t7";
34$alo="t8";
35$ahi="t9";
36$nlo="t10";
37$nhi="t11";
38$tj="t12";
39$i="s3";
40$j="s4";
41$m1="s5";
42
43$code=<<___;
44#ifdef __linux__
45#include <asm/regdef.h>
46#else
47#include <asm.h>
48#include <regdef.h>
49#endif
50
51.text
52
53.set noat
54.set noreorder
55
56.globl bn_mul_mont
57.align 5
58.ent bn_mul_mont
59bn_mul_mont:
60 lda sp,-48(sp)
61 stq ra,0(sp)
62 stq s3,8(sp)
63 stq s4,16(sp)
64 stq s5,24(sp)
65 stq fp,32(sp)
66 mov sp,fp
67 .mask 0x0400f000,-48
68 .frame fp,48,ra
69 .prologue 0
70
71 .align 4
72 .set reorder
73 sextl $num,$num
74 mov 0,v0
75 cmplt $num,4,AT
76 bne AT,.Lexit
77
78 ldq $hi0,0($ap) # ap[0]
79 s8addq $num,16,AT
80 ldq $aj,8($ap)
81 subq sp,AT,sp
82 ldq $bi,0($bp) # bp[0]
83 lda AT,-4096(zero) # mov -4096,AT
84 ldq $n0,0($n0)
85 and sp,AT,sp
86
87 mulq $hi0,$bi,$lo0
88 ldq $hi1,0($np) # np[0]
89 umulh $hi0,$bi,$hi0
90 ldq $nj,8($np)
91
92 mulq $lo0,$n0,$m1
93
94 mulq $hi1,$m1,$lo1
95 umulh $hi1,$m1,$hi1
96
97 addq $lo1,$lo0,$lo1
98 cmpult $lo1,$lo0,AT
99 addq $hi1,AT,$hi1
100
101 mulq $aj,$bi,$alo
102 mov 2,$j
103 umulh $aj,$bi,$ahi
104 mov sp,$tp
105
106 mulq $nj,$m1,$nlo
107 s8addq $j,$ap,$aj
108 umulh $nj,$m1,$nhi
109 s8addq $j,$np,$nj
110.align 4
111.L1st:
112 .set noreorder
113 ldq $aj,0($aj)
114 addl $j,1,$j
115 ldq $nj,0($nj)
116 lda $tp,8($tp)
117
118 addq $alo,$hi0,$lo0
119 mulq $aj,$bi,$alo
120 cmpult $lo0,$hi0,AT
121 addq $nlo,$hi1,$lo1
122
123 mulq $nj,$m1,$nlo
124 addq $ahi,AT,$hi0
125 cmpult $lo1,$hi1,v0
126 cmplt $j,$num,$tj
127
128 umulh $aj,$bi,$ahi
129 addq $nhi,v0,$hi1
130 addq $lo1,$lo0,$lo1
131 s8addq $j,$ap,$aj
132
133 umulh $nj,$m1,$nhi
134 cmpult $lo1,$lo0,v0
135 addq $hi1,v0,$hi1
136 s8addq $j,$np,$nj
137
138 stq $lo1,-8($tp)
139 nop
140 unop
141 bne $tj,.L1st
142 .set reorder
143
144 addq $alo,$hi0,$lo0
145 addq $nlo,$hi1,$lo1
146 cmpult $lo0,$hi0,AT
147 cmpult $lo1,$hi1,v0
148 addq $ahi,AT,$hi0
149 addq $nhi,v0,$hi1
150
151 addq $lo1,$lo0,$lo1
152 cmpult $lo1,$lo0,v0
153 addq $hi1,v0,$hi1
154
155 stq $lo1,0($tp)
156
157 addq $hi1,$hi0,$hi1
158 cmpult $hi1,$hi0,AT
159 stq $hi1,8($tp)
160 stq AT,16($tp)
161
162 mov 1,$i
163.align 4
164.Louter:
165 s8addq $i,$bp,$bi
166 ldq $hi0,0($ap)
167 ldq $aj,8($ap)
168 ldq $bi,0($bi)
169 ldq $hi1,0($np)
170 ldq $nj,8($np)
171 ldq $tj,0(sp)
172
173 mulq $hi0,$bi,$lo0
174 umulh $hi0,$bi,$hi0
175
176 addq $lo0,$tj,$lo0
177 cmpult $lo0,$tj,AT
178 addq $hi0,AT,$hi0
179
180 mulq $lo0,$n0,$m1
181
182 mulq $hi1,$m1,$lo1
183 umulh $hi1,$m1,$hi1
184
185 addq $lo1,$lo0,$lo1
186 cmpult $lo1,$lo0,AT
187 mov 2,$j
188 addq $hi1,AT,$hi1
189
190 mulq $aj,$bi,$alo
191 mov sp,$tp
192 umulh $aj,$bi,$ahi
193
194 mulq $nj,$m1,$nlo
195 s8addq $j,$ap,$aj
196 umulh $nj,$m1,$nhi
197.align 4
198.Linner:
199 .set noreorder
200 ldq $tj,8($tp) #L0
201 nop #U1
202 ldq $aj,0($aj) #L1
203 s8addq $j,$np,$nj #U0
204
205 ldq $nj,0($nj) #L0
206 nop #U1
207 addq $alo,$hi0,$lo0 #L1
208 lda $tp,8($tp)
209
210 mulq $aj,$bi,$alo #U1
211 cmpult $lo0,$hi0,AT #L0
212 addq $nlo,$hi1,$lo1 #L1
213 addl $j,1,$j
214
215 mulq $nj,$m1,$nlo #U1
216 addq $ahi,AT,$hi0 #L0
217 addq $lo0,$tj,$lo0 #L1
218 cmpult $lo1,$hi1,v0 #U0
219
220 umulh $aj,$bi,$ahi #U1
221 cmpult $lo0,$tj,AT #L0
222 addq $lo1,$lo0,$lo1 #L1
223 addq $nhi,v0,$hi1 #U0
224
225 umulh $nj,$m1,$nhi #U1
226 s8addq $j,$ap,$aj #L0
227 cmpult $lo1,$lo0,v0 #L1
228 cmplt $j,$num,$tj #U0 # borrow $tj
229
230 addq $hi0,AT,$hi0 #L0
231 addq $hi1,v0,$hi1 #U1
232 stq $lo1,-8($tp) #L1
233 bne $tj,.Linner #U0
234 .set reorder
235
236 ldq $tj,8($tp)
237 addq $alo,$hi0,$lo0
238 addq $nlo,$hi1,$lo1
239 cmpult $lo0,$hi0,AT
240 cmpult $lo1,$hi1,v0
241 addq $ahi,AT,$hi0
242 addq $nhi,v0,$hi1
243
244 addq $lo0,$tj,$lo0
245 cmpult $lo0,$tj,AT
246 addq $hi0,AT,$hi0
247
248 ldq $tj,16($tp)
249 addq $lo1,$lo0,$j
250 cmpult $j,$lo0,v0
251 addq $hi1,v0,$hi1
252
253 addq $hi1,$hi0,$lo1
254 stq $j,0($tp)
255 cmpult $lo1,$hi0,$hi1
256 addq $lo1,$tj,$lo1
257 cmpult $lo1,$tj,AT
258 addl $i,1,$i
259 addq $hi1,AT,$hi1
260 stq $lo1,8($tp)
261 cmplt $i,$num,$tj # borrow $tj
262 stq $hi1,16($tp)
263 bne $tj,.Louter
264
265 s8addq $num,sp,$tj # &tp[num]
266 mov $rp,$bp # put rp aside
267 mov sp,$tp
268 mov sp,$ap
269 mov 0,$hi0 # clear borrow bit
270
271.align 4
272.Lsub: ldq $lo0,0($tp)
273 ldq $lo1,0($np)
274 lda $tp,8($tp)
275 lda $np,8($np)
276 subq $lo0,$lo1,$lo1 # tp[i]-np[i]
277 cmpult $lo0,$lo1,AT
278 subq $lo1,$hi0,$lo0
279 cmpult $lo1,$lo0,$hi0
280 or $hi0,AT,$hi0
281 stq $lo0,0($rp)
282 cmpult $tp,$tj,v0
283 lda $rp,8($rp)
284 bne v0,.Lsub
285
286 subq $hi1,$hi0,$hi0 # handle upmost overflow bit
287 mov sp,$tp
288 mov $bp,$rp # restore rp
289
290 and sp,$hi0,$ap
291 bic $bp,$hi0,$bp
292 bis $bp,$ap,$ap # ap=borrow?tp:rp
293
294.align 4
295.Lcopy: ldq $aj,0($ap) # copy or in-place refresh
296 lda $tp,8($tp)
297 lda $rp,8($rp)
298 lda $ap,8($ap)
299 stq zero,-8($tp) # zap tp
300 cmpult $tp,$tj,AT
301 stq $aj,-8($rp)
302 bne AT,.Lcopy
303 mov 1,v0
304
305.Lexit:
306 .set noreorder
307 mov fp,sp
308 /*ldq ra,0(sp)*/
309 ldq s3,8(sp)
310 ldq s4,16(sp)
311 ldq s5,24(sp)
312 ldq fp,32(sp)
313 lda sp,48(sp)
314 ret (ra)
315.end bn_mul_mont
316.ascii "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
317.align 2
318___
319
320print $code;
321close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/armv4-gf2m.pl b/src/lib/libcrypto/bn/asm/armv4-gf2m.pl
deleted file mode 100644
index c52e0b75b5..0000000000
--- a/src/lib/libcrypto/bn/asm/armv4-gf2m.pl
+++ /dev/null
@@ -1,278 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# May 2011
11#
12# The module implements bn_GF2m_mul_2x2 polynomial multiplication
13# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
14# C for the time being... Except that it has two code paths: pure
15# integer code suitable for any ARMv4 and later CPU and NEON code
16# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
17# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
18# faster than compiler-generated code. For ECDH and ECDSA verify (but
19# not for ECDSA sign) it means 25%-45% improvement depending on key
20# length, more for longer keys. Even though NEON 1x1 multiplication
21# runs in even less cycles, ~30, improvement is measurable only on
22# longer keys. One has to optimize code elsewhere to get NEON glow...
23
24while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
25open STDOUT,">$output";
26
27sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
28sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
29sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
30
31$code=<<___;
32#include "arm_arch.h"
33
34.text
35.code 32
36
37#if __ARM_ARCH__>=7
38.fpu neon
39
40.type mul_1x1_neon,%function
41.align 5
42mul_1x1_neon:
43 vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a
44 vmull.p8 `&Q("d0")`,d16,d17 @ a·bb
45 vshl.u64 `&Dlo("q2")`,d16,#16
46 vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb
47 vshl.u64 `&Dlo("q3")`,d16,#24
48 vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb
49 vshr.u64 `&Dlo("q1")`,#8
50 vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb
51 vshl.u64 `&Dhi("q1")`,#24
52 veor d0,`&Dlo("q1")`
53 vshr.u64 `&Dlo("q2")`,#16
54 veor d0,`&Dhi("q1")`
55 vshl.u64 `&Dhi("q2")`,#16
56 veor d0,`&Dlo("q2")`
57 vshr.u64 `&Dlo("q3")`,#24
58 veor d0,`&Dhi("q2")`
59 vshl.u64 `&Dhi("q3")`,#8
60 veor d0,`&Dlo("q3")`
61 veor d0,`&Dhi("q3")`
62 bx lr
63.size mul_1x1_neon,.-mul_1x1_neon
64#endif
65___
66################
67# private interface to mul_1x1_ialu
68#
69$a="r1";
70$b="r0";
71
72($a0,$a1,$a2,$a12,$a4,$a14)=
73($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
74
75$mask="r12";
76
77$code.=<<___;
78.type mul_1x1_ialu,%function
79.align 5
80mul_1x1_ialu:
81 mov $a0,#0
82 bic $a1,$a,#3<<30 @ a1=a&0x3fffffff
83 str $a0,[sp,#0] @ tab[0]=0
84 add $a2,$a1,$a1 @ a2=a1<<1
85 str $a1,[sp,#4] @ tab[1]=a1
86 eor $a12,$a1,$a2 @ a1^a2
87 str $a2,[sp,#8] @ tab[2]=a2
88 mov $a4,$a1,lsl#2 @ a4=a1<<2
89 str $a12,[sp,#12] @ tab[3]=a1^a2
90 eor $a14,$a1,$a4 @ a1^a4
91 str $a4,[sp,#16] @ tab[4]=a4
92 eor $a0,$a2,$a4 @ a2^a4
93 str $a14,[sp,#20] @ tab[5]=a1^a4
94 eor $a12,$a12,$a4 @ a1^a2^a4
95 str $a0,[sp,#24] @ tab[6]=a2^a4
96 and $i0,$mask,$b,lsl#2
97 str $a12,[sp,#28] @ tab[7]=a1^a2^a4
98
99 and $i1,$mask,$b,lsr#1
100 ldr $lo,[sp,$i0] @ tab[b & 0x7]
101 and $i0,$mask,$b,lsr#4
102 ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7]
103 and $i1,$mask,$b,lsr#7
104 ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7]
105 eor $lo,$lo,$t1,lsl#3 @ stall
106 mov $hi,$t1,lsr#29
107 ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7]
108
109 and $i0,$mask,$b,lsr#10
110 eor $lo,$lo,$t0,lsl#6
111 eor $hi,$hi,$t0,lsr#26
112 ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7]
113
114 and $i1,$mask,$b,lsr#13
115 eor $lo,$lo,$t1,lsl#9
116 eor $hi,$hi,$t1,lsr#23
117 ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7]
118
119 and $i0,$mask,$b,lsr#16
120 eor $lo,$lo,$t0,lsl#12
121 eor $hi,$hi,$t0,lsr#20
122 ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7]
123
124 and $i1,$mask,$b,lsr#19
125 eor $lo,$lo,$t1,lsl#15
126 eor $hi,$hi,$t1,lsr#17
127 ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7]
128
129 and $i0,$mask,$b,lsr#22
130 eor $lo,$lo,$t0,lsl#18
131 eor $hi,$hi,$t0,lsr#14
132 ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7]
133
134 and $i1,$mask,$b,lsr#25
135 eor $lo,$lo,$t1,lsl#21
136 eor $hi,$hi,$t1,lsr#11
137 ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7]
138
139 tst $a,#1<<30
140 and $i0,$mask,$b,lsr#28
141 eor $lo,$lo,$t0,lsl#24
142 eor $hi,$hi,$t0,lsr#8
143 ldr $t0,[sp,$i0] @ tab[b >> 30 ]
144
145 eorne $lo,$lo,$b,lsl#30
146 eorne $hi,$hi,$b,lsr#2
147 tst $a,#1<<31
148 eor $lo,$lo,$t1,lsl#27
149 eor $hi,$hi,$t1,lsr#5
150 eorne $lo,$lo,$b,lsl#31
151 eorne $hi,$hi,$b,lsr#1
152 eor $lo,$lo,$t0,lsl#30
153 eor $hi,$hi,$t0,lsr#2
154
155 mov pc,lr
156.size mul_1x1_ialu,.-mul_1x1_ialu
157___
158################
159# void bn_GF2m_mul_2x2(BN_ULONG *r,
160# BN_ULONG a1,BN_ULONG a0,
161# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
162
163($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
164
165$code.=<<___;
166.global bn_GF2m_mul_2x2
167.type bn_GF2m_mul_2x2,%function
168.align 5
169bn_GF2m_mul_2x2:
170#if __ARM_ARCH__>=7
171 ldr r12,.LOPENSSL_armcap
172.Lpic: ldr r12,[pc,r12]
173 tst r12,#1
174 beq .Lialu
175
176 veor $A1,$A1
177 vmov.32 $B1,r3,r3 @ two copies of b1
178 vmov.32 ${A1}[0],r1 @ a1
179
180 veor $A0,$A0
181 vld1.32 ${B0}[],[sp,:32] @ two copies of b0
182 vmov.32 ${A0}[0],r2 @ a0
183 mov r12,lr
184
185 vmov d16,$A1
186 vmov d17,$B1
187 bl mul_1x1_neon @ a1·b1
188 vmov $A1B1,d0
189
190 vmov d16,$A0
191 vmov d17,$B0
192 bl mul_1x1_neon @ a0·b0
193 vmov $A0B0,d0
194
195 veor d16,$A0,$A1
196 veor d17,$B0,$B1
197 veor $A0,$A0B0,$A1B1
198 bl mul_1x1_neon @ (a0+a1)·(b0+b1)
199
200 veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1
201 vshl.u64 d1,d0,#32
202 vshr.u64 d0,d0,#32
203 veor $A0B0,d1
204 veor $A1B1,d0
205 vst1.32 {${A0B0}[0]},[r0,:32]!
206 vst1.32 {${A0B0}[1]},[r0,:32]!
207 vst1.32 {${A1B1}[0]},[r0,:32]!
208 vst1.32 {${A1B1}[1]},[r0,:32]
209 bx r12
210.align 4
211.Lialu:
212#endif
213___
214$ret="r10"; # reassigned 1st argument
215$code.=<<___;
216 stmdb sp!,{r4-r10,lr}
217 mov $ret,r0 @ reassign 1st argument
218 mov $b,r3 @ $b=b1
219 ldr r3,[sp,#32] @ load b0
220 mov $mask,#7<<2
221 sub sp,sp,#32 @ allocate tab[8]
222
223 bl mul_1x1_ialu @ a1·b1
224 str $lo,[$ret,#8]
225 str $hi,[$ret,#12]
226
227 eor $b,$b,r3 @ flip b0 and b1
228 eor $a,$a,r2 @ flip a0 and a1
229 eor r3,r3,$b
230 eor r2,r2,$a
231 eor $b,$b,r3
232 eor $a,$a,r2
233 bl mul_1x1_ialu @ a0·b0
234 str $lo,[$ret]
235 str $hi,[$ret,#4]
236
237 eor $a,$a,r2
238 eor $b,$b,r3
239 bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
240___
241@r=map("r$_",(6..9));
242$code.=<<___;
243 ldmia $ret,{@r[0]-@r[3]}
244 eor $lo,$lo,$hi
245 eor $hi,$hi,@r[1]
246 eor $lo,$lo,@r[0]
247 eor $hi,$hi,@r[2]
248 eor $lo,$lo,@r[3]
249 eor $hi,$hi,@r[3]
250 str $hi,[$ret,#8]
251 eor $lo,$lo,$hi
252 add sp,sp,#32 @ destroy tab[8]
253 str $lo,[$ret,#4]
254
255#if __ARM_ARCH__>=5
256 ldmia sp!,{r4-r10,pc}
257#else
258 ldmia sp!,{r4-r10,lr}
259 tst lr,#1
260 moveq pc,lr @ be binary compatible with V4, yet
261 bx lr @ interoperable with Thumb ISA:-)
262#endif
263.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
264#if __ARM_ARCH__>=7
265.align 5
266.LOPENSSL_armcap:
267.word OPENSSL_armcap_P-(.Lpic+8)
268#endif
269.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
270.align 5
271
272.comm OPENSSL_armcap_P,4,4
273___
274
275$code =~ s/\`([^\`]*)\`/eval $1/gem;
276$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
277print $code;
278close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/bn/asm/armv4-mont.pl b/src/lib/libcrypto/bn/asm/armv4-mont.pl
deleted file mode 100644
index f78a8b5f0f..0000000000
--- a/src/lib/libcrypto/bn/asm/armv4-mont.pl
+++ /dev/null
@@ -1,204 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# January 2007.
11
12# Montgomery multiplication for ARMv4.
13#
14# Performance improvement naturally varies among CPU implementations
15# and compilers. The code was observed to provide +65-35% improvement
16# [depending on key length, less for longer keys] on ARM920T, and
17# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
18# base and compiler generated code with in-lined umull and even umlal
19# instructions. The latter means that this code didn't really have an
20# "advantage" of utilizing some "secret" instruction.
21#
22# The code is interoperable with Thumb ISA and is rather compact, less
23# than 1/2KB. Windows CE port would be trivial, as it's exclusively
24# about decorations, ABI and instruction syntax are identical.
25
26while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
27open STDOUT,">$output";
28
29$num="r0"; # starts as num argument, but holds &tp[num-1]
30$ap="r1";
31$bp="r2"; $bi="r2"; $rp="r2";
32$np="r3";
33$tp="r4";
34$aj="r5";
35$nj="r6";
36$tj="r7";
37$n0="r8";
38########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
39$alo="r10"; # sl, gcc uses it to keep @GOT
40$ahi="r11"; # fp
41$nlo="r12"; # ip
42########### # r13 is stack pointer
43$nhi="r14"; # lr
44########### # r15 is program counter
45
46#### argument block layout relative to &tp[num-1], a.k.a. $num
47$_rp="$num,#12*4";
48# ap permanently resides in r1
49$_bp="$num,#13*4";
50# np permanently resides in r3
51$_n0="$num,#14*4";
52$_num="$num,#15*4"; $_bpend=$_num;
53
54$code=<<___;
55.text
56
57.global bn_mul_mont
58.type bn_mul_mont,%function
59
60.align 2
61bn_mul_mont:
62 stmdb sp!,{r0,r2} @ sp points at argument block
63 ldr $num,[sp,#3*4] @ load num
64 cmp $num,#2
65 movlt r0,#0
66 addlt sp,sp,#2*4
67 blt .Labrt
68
69 stmdb sp!,{r4-r12,lr} @ save 10 registers
70
71 mov $num,$num,lsl#2 @ rescale $num for byte count
72 sub sp,sp,$num @ alloca(4*num)
73 sub sp,sp,#4 @ +extra dword
74 sub $num,$num,#4 @ "num=num-1"
75 add $tp,$bp,$num @ &bp[num-1]
76
77 add $num,sp,$num @ $num to point at &tp[num-1]
78 ldr $n0,[$_n0] @ &n0
79 ldr $bi,[$bp] @ bp[0]
80 ldr $aj,[$ap],#4 @ ap[0],ap++
81 ldr $nj,[$np],#4 @ np[0],np++
82 ldr $n0,[$n0] @ *n0
83 str $tp,[$_bpend] @ save &bp[num]
84
85 umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
86 str $n0,[$_n0] @ save n0 value
87 mul $n0,$alo,$n0 @ "tp[0]"*n0
88 mov $nlo,#0
89 umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
90 mov $tp,sp
91
92.L1st:
93 ldr $aj,[$ap],#4 @ ap[j],ap++
94 mov $alo,$ahi
95 ldr $nj,[$np],#4 @ np[j],np++
96 mov $ahi,#0
97 umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
98 mov $nhi,#0
99 umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
100 adds $nlo,$nlo,$alo
101 str $nlo,[$tp],#4 @ tp[j-1]=,tp++
102 adc $nlo,$nhi,#0
103 cmp $tp,$num
104 bne .L1st
105
106 adds $nlo,$nlo,$ahi
107 ldr $tp,[$_bp] @ restore bp
108 mov $nhi,#0
109 ldr $n0,[$_n0] @ restore n0
110 adc $nhi,$nhi,#0
111 str $nlo,[$num] @ tp[num-1]=
112 str $nhi,[$num,#4] @ tp[num]=
113
114.Louter:
115 sub $tj,$num,sp @ "original" $num-1 value
116 sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
117 ldr $bi,[$tp,#4]! @ *(++bp)
118 sub $np,$np,$tj @ "rewind" np to &np[1]
119 ldr $aj,[$ap,#-4] @ ap[0]
120 ldr $alo,[sp] @ tp[0]
121 ldr $nj,[$np,#-4] @ np[0]
122 ldr $tj,[sp,#4] @ tp[1]
123
124 mov $ahi,#0
125 umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
126 str $tp,[$_bp] @ save bp
127 mul $n0,$alo,$n0
128 mov $nlo,#0
129 umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
130 mov $tp,sp
131
132.Linner:
133 ldr $aj,[$ap],#4 @ ap[j],ap++
134 adds $alo,$ahi,$tj @ +=tp[j]
135 ldr $nj,[$np],#4 @ np[j],np++
136 mov $ahi,#0
137 umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
138 mov $nhi,#0
139 umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
140 adc $ahi,$ahi,#0
141 ldr $tj,[$tp,#8] @ tp[j+1]
142 adds $nlo,$nlo,$alo
143 str $nlo,[$tp],#4 @ tp[j-1]=,tp++
144 adc $nlo,$nhi,#0
145 cmp $tp,$num
146 bne .Linner
147
148 adds $nlo,$nlo,$ahi
149 mov $nhi,#0
150 ldr $tp,[$_bp] @ restore bp
151 adc $nhi,$nhi,#0
152 ldr $n0,[$_n0] @ restore n0
153 adds $nlo,$nlo,$tj
154 ldr $tj,[$_bpend] @ restore &bp[num]
155 adc $nhi,$nhi,#0
156 str $nlo,[$num] @ tp[num-1]=
157 str $nhi,[$num,#4] @ tp[num]=
158
159 cmp $tp,$tj
160 bne .Louter
161
162 ldr $rp,[$_rp] @ pull rp
163 add $num,$num,#4 @ $num to point at &tp[num]
164 sub $aj,$num,sp @ "original" num value
165 mov $tp,sp @ "rewind" $tp
166 mov $ap,$tp @ "borrow" $ap
167 sub $np,$np,$aj @ "rewind" $np to &np[0]
168
169 subs $tj,$tj,$tj @ "clear" carry flag
170.Lsub: ldr $tj,[$tp],#4
171 ldr $nj,[$np],#4
172 sbcs $tj,$tj,$nj @ tp[j]-np[j]
173 str $tj,[$rp],#4 @ rp[j]=
174 teq $tp,$num @ preserve carry
175 bne .Lsub
176 sbcs $nhi,$nhi,#0 @ upmost carry
177 mov $tp,sp @ "rewind" $tp
178 sub $rp,$rp,$aj @ "rewind" $rp
179
180 and $ap,$tp,$nhi
181 bic $np,$rp,$nhi
182 orr $ap,$ap,$np @ ap=borrow?tp:rp
183
184.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
185 str sp,[$tp],#4 @ zap tp
186 str $tj,[$rp],#4
187 cmp $tp,$num
188 bne .Lcopy
189
190 add sp,$num,#4 @ skip over tp[num+1]
191 ldmia sp!,{r4-r12,lr} @ restore registers
192 add sp,sp,#2*4 @ skip over {r0,r2}
193 mov r0,#1
194.Labrt: tst lr,#1
195 moveq pc,lr @ be binary compatible with V4, yet
196 bx lr @ interoperable with Thumb ISA:-)
197.size bn_mul_mont,.-bn_mul_mont
198.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
199.align 2
200___
201
202$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
203print $code;
204close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/bn-586.pl b/src/lib/libcrypto/bn/asm/bn-586.pl
deleted file mode 100644
index 332ef3e91d..0000000000
--- a/src/lib/libcrypto/bn/asm/bn-586.pl
+++ /dev/null
@@ -1,774 +0,0 @@
1#!/usr/local/bin/perl
2
3$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
4push(@INC,"${dir}","${dir}../../perlasm");
5require "x86asm.pl";
6
7&asm_init($ARGV[0],$0);
8
9$sse2=0;
10for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
11
12&external_label("OPENSSL_ia32cap_P") if ($sse2);
13
14&bn_mul_add_words("bn_mul_add_words");
15&bn_mul_words("bn_mul_words");
16&bn_sqr_words("bn_sqr_words");
17&bn_div_words("bn_div_words");
18&bn_add_words("bn_add_words");
19&bn_sub_words("bn_sub_words");
20&bn_sub_part_words("bn_sub_part_words");
21
22&asm_finish();
23
24sub bn_mul_add_words
25 {
26 local($name)=@_;
27
28 &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
29
30 $r="eax";
31 $a="edx";
32 $c="ecx";
33
34 if ($sse2) {
35 &picmeup("eax","OPENSSL_ia32cap_P");
36 &bt(&DWP(0,"eax"),26);
37 &jnc(&label("maw_non_sse2"));
38
39 &mov($r,&wparam(0));
40 &mov($a,&wparam(1));
41 &mov($c,&wparam(2));
42 &movd("mm0",&wparam(3)); # mm0 = w
43 &pxor("mm1","mm1"); # mm1 = carry_in
44 &jmp(&label("maw_sse2_entry"));
45
46 &set_label("maw_sse2_unrolled",16);
47 &movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
48 &paddq("mm1","mm3"); # mm1 = carry_in + r[0]
49 &movd("mm2",&DWP(0,$a,"",0)); # mm2 = a[0]
50 &pmuludq("mm2","mm0"); # mm2 = w*a[0]
51 &movd("mm4",&DWP(4,$a,"",0)); # mm4 = a[1]
52 &pmuludq("mm4","mm0"); # mm4 = w*a[1]
53 &movd("mm6",&DWP(8,$a,"",0)); # mm6 = a[2]
54 &pmuludq("mm6","mm0"); # mm6 = w*a[2]
55 &movd("mm7",&DWP(12,$a,"",0)); # mm7 = a[3]
56 &pmuludq("mm7","mm0"); # mm7 = w*a[3]
57 &paddq("mm1","mm2"); # mm1 = carry_in + r[0] + w*a[0]
58 &movd("mm3",&DWP(4,$r,"",0)); # mm3 = r[1]
59 &paddq("mm3","mm4"); # mm3 = r[1] + w*a[1]
60 &movd("mm5",&DWP(8,$r,"",0)); # mm5 = r[2]
61 &paddq("mm5","mm6"); # mm5 = r[2] + w*a[2]
62 &movd("mm4",&DWP(12,$r,"",0)); # mm4 = r[3]
63 &paddq("mm7","mm4"); # mm7 = r[3] + w*a[3]
64 &movd(&DWP(0,$r,"",0),"mm1");
65 &movd("mm2",&DWP(16,$a,"",0)); # mm2 = a[4]
66 &pmuludq("mm2","mm0"); # mm2 = w*a[4]
67 &psrlq("mm1",32); # mm1 = carry0
68 &movd("mm4",&DWP(20,$a,"",0)); # mm4 = a[5]
69 &pmuludq("mm4","mm0"); # mm4 = w*a[5]
70 &paddq("mm1","mm3"); # mm1 = carry0 + r[1] + w*a[1]
71 &movd("mm6",&DWP(24,$a,"",0)); # mm6 = a[6]
72 &pmuludq("mm6","mm0"); # mm6 = w*a[6]
73 &movd(&DWP(4,$r,"",0),"mm1");
74 &psrlq("mm1",32); # mm1 = carry1
75 &movd("mm3",&DWP(28,$a,"",0)); # mm3 = a[7]
76 &add($a,32);
77 &pmuludq("mm3","mm0"); # mm3 = w*a[7]
78 &paddq("mm1","mm5"); # mm1 = carry1 + r[2] + w*a[2]
79 &movd("mm5",&DWP(16,$r,"",0)); # mm5 = r[4]
80 &paddq("mm2","mm5"); # mm2 = r[4] + w*a[4]
81 &movd(&DWP(8,$r,"",0),"mm1");
82 &psrlq("mm1",32); # mm1 = carry2
83 &paddq("mm1","mm7"); # mm1 = carry2 + r[3] + w*a[3]
84 &movd("mm5",&DWP(20,$r,"",0)); # mm5 = r[5]
85 &paddq("mm4","mm5"); # mm4 = r[5] + w*a[5]
86 &movd(&DWP(12,$r,"",0),"mm1");
87 &psrlq("mm1",32); # mm1 = carry3
88 &paddq("mm1","mm2"); # mm1 = carry3 + r[4] + w*a[4]
89 &movd("mm5",&DWP(24,$r,"",0)); # mm5 = r[6]
90 &paddq("mm6","mm5"); # mm6 = r[6] + w*a[6]
91 &movd(&DWP(16,$r,"",0),"mm1");
92 &psrlq("mm1",32); # mm1 = carry4
93 &paddq("mm1","mm4"); # mm1 = carry4 + r[5] + w*a[5]
94 &movd("mm5",&DWP(28,$r,"",0)); # mm5 = r[7]
95 &paddq("mm3","mm5"); # mm3 = r[7] + w*a[7]
96 &movd(&DWP(20,$r,"",0),"mm1");
97 &psrlq("mm1",32); # mm1 = carry5
98 &paddq("mm1","mm6"); # mm1 = carry5 + r[6] + w*a[6]
99 &movd(&DWP(24,$r,"",0),"mm1");
100 &psrlq("mm1",32); # mm1 = carry6
101 &paddq("mm1","mm3"); # mm1 = carry6 + r[7] + w*a[7]
102 &movd(&DWP(28,$r,"",0),"mm1");
103 &lea($r,&DWP(32,$r));
104 &psrlq("mm1",32); # mm1 = carry_out
105
106 &sub($c,8);
107 &jz(&label("maw_sse2_exit"));
108 &set_label("maw_sse2_entry");
109 &test($c,0xfffffff8);
110 &jnz(&label("maw_sse2_unrolled"));
111
112 &set_label("maw_sse2_loop",4);
113 &movd("mm2",&DWP(0,$a)); # mm2 = a[i]
114 &movd("mm3",&DWP(0,$r)); # mm3 = r[i]
115 &pmuludq("mm2","mm0"); # a[i] *= w
116 &lea($a,&DWP(4,$a));
117 &paddq("mm1","mm3"); # carry += r[i]
118 &paddq("mm1","mm2"); # carry += a[i]*w
119 &movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
120 &sub($c,1);
121 &psrlq("mm1",32); # carry = carry_high
122 &lea($r,&DWP(4,$r));
123 &jnz(&label("maw_sse2_loop"));
124 &set_label("maw_sse2_exit");
125 &movd("eax","mm1"); # c = carry_out
126 &emms();
127 &ret();
128
129 &set_label("maw_non_sse2",16);
130 }
131
132 # function_begin prologue
133 &push("ebp");
134 &push("ebx");
135 &push("esi");
136 &push("edi");
137
138 &comment("");
139 $Low="eax";
140 $High="edx";
141 $a="ebx";
142 $w="ebp";
143 $r="edi";
144 $c="esi";
145
146 &xor($c,$c); # clear carry
147 &mov($r,&wparam(0)); #
148
149 &mov("ecx",&wparam(2)); #
150 &mov($a,&wparam(1)); #
151
152 &and("ecx",0xfffffff8); # num / 8
153 &mov($w,&wparam(3)); #
154
155 &push("ecx"); # Up the stack for a tmp variable
156
157 &jz(&label("maw_finish"));
158
159 &set_label("maw_loop",16);
160
161 for ($i=0; $i<32; $i+=4)
162 {
163 &comment("Round $i");
164
165 &mov("eax",&DWP($i,$a)); # *a
166 &mul($w); # *a * w
167 &add("eax",$c); # L(t)+= c
168 &adc("edx",0); # H(t)+=carry
169 &add("eax",&DWP($i,$r)); # L(t)+= *r
170 &adc("edx",0); # H(t)+=carry
171 &mov(&DWP($i,$r),"eax"); # *r= L(t);
172 &mov($c,"edx"); # c= H(t);
173 }
174
175 &comment("");
176 &sub("ecx",8);
177 &lea($a,&DWP(32,$a));
178 &lea($r,&DWP(32,$r));
179 &jnz(&label("maw_loop"));
180
181 &set_label("maw_finish",0);
182 &mov("ecx",&wparam(2)); # get num
183 &and("ecx",7);
184 &jnz(&label("maw_finish2")); # helps branch prediction
185 &jmp(&label("maw_end"));
186
187 &set_label("maw_finish2",1);
188 for ($i=0; $i<7; $i++)
189 {
190 &comment("Tail Round $i");
191 &mov("eax",&DWP($i*4,$a)); # *a
192 &mul($w); # *a * w
193 &add("eax",$c); # L(t)+=c
194 &adc("edx",0); # H(t)+=carry
195 &add("eax",&DWP($i*4,$r)); # L(t)+= *r
196 &adc("edx",0); # H(t)+=carry
197 &dec("ecx") if ($i != 7-1);
198 &mov(&DWP($i*4,$r),"eax"); # *r= L(t);
199 &mov($c,"edx"); # c= H(t);
200 &jz(&label("maw_end")) if ($i != 7-1);
201 }
202 &set_label("maw_end",0);
203 &mov("eax",$c);
204
205 &pop("ecx"); # clear variable from
206
207 &function_end($name);
208 }
209
# bn_mul_words: emit 586 assembler for
#   BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
# which stores the low word of ap[i]*w + carry into rp[i] for i in [0,num)
# and returns the final carry in eax.  When $sse2 is set an SSE2 (pmuludq)
# path is emitted first, taken at run time only if OPENSSL_ia32cap_P
# reports SSE2 support (bit 26).
210sub bn_mul_words
211 {
212 local($name)=@_;
213
214 &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
215
216 $r="eax";
217 $a="edx";
218 $c="ecx";
219
220 if ($sse2) {
221 &picmeup("eax","OPENSSL_ia32cap_P");
222 &bt(&DWP(0,"eax"),26);
223 &jnc(&label("mw_non_sse2"));
224
225 &mov($r,&wparam(0));
226 &mov($a,&wparam(1));
227 &mov($c,&wparam(2));
228 &movd("mm0",&wparam(3)); # mm0 = w
229 &pxor("mm1","mm1"); # mm1 = carry = 0
230
231 &set_label("mw_sse2_loop",16);
232 &movd("mm2",&DWP(0,$a)); # mm2 = a[i]
233 &pmuludq("mm2","mm0"); # a[i] *= w
234 &lea($a,&DWP(4,$a));
235 &paddq("mm1","mm2"); # carry += a[i]*w
236 &movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
237 &sub($c,1);
238 &psrlq("mm1",32); # carry = carry_high
239 &lea($r,&DWP(4,$r));
240 &jnz(&label("mw_sse2_loop"));
241
242 &movd("eax","mm1"); # return carry
243 &emms();
244 &ret();
245 &set_label("mw_non_sse2",16);
246 }
247
248 # function_begin prologue
249 &push("ebp");
250 &push("ebx");
251 &push("esi");
252 &push("edi");
253
254 &comment("");
255 $Low="eax";
256 $High="edx";
257 $a="ebx";
258 $w="ecx";
259 $r="edi";
260 $c="esi";
261 $num="ebp";
262
263 &xor($c,$c); # clear carry
264 &mov($r,&wparam(0)); #
265 &mov($a,&wparam(1)); #
266 &mov($num,&wparam(2)); #
267 &mov($w,&wparam(3)); #
268
# num &= ~7: word count for the 8-way unrolled main loop below.
269 &and($num,0xfffffff8); # num / 8
270 &jz(&label("mw_finish"));
271
272 &set_label("mw_loop",0);
# 8 words per pass; $i is a BYTE offset (0,4,...,28) into ap/rp.
273 for ($i=0; $i<32; $i+=4)
274 {
275 &comment("Round $i");
276
277 &mov("eax",&DWP($i,$a,"",0)); # *a
278 &mul($w); # *a * w
279 &add("eax",$c); # L(t)+=c
280 # XXX
281
282 &adc("edx",0); # H(t)+=carry
283 &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
284
285 &mov($c,"edx"); # c= H(t);
286 }
287
288 &comment("");
289 &add($a,32);
290 &add($r,32);
291 &sub($num,8);
292 &jz(&label("mw_finish"));
293 &jmp(&label("mw_loop"));
294
# Tail: handle the remaining num%7..1 words one at a time.
295 &set_label("mw_finish",0);
296 &mov($num,&wparam(2)); # get num
297 &and($num,7);
298 &jnz(&label("mw_finish2"));
299 &jmp(&label("mw_end"));
300
301 &set_label("mw_finish2",1);
302 for ($i=0; $i<7; $i++)
303 {
304 &comment("Tail Round $i");
305 &mov("eax",&DWP($i*4,$a,"",0));# *a
306 &mul($w); # *a * w
307 &add("eax",$c); # L(t)+=c
308 # XXX
309 &adc("edx",0); # H(t)+=carry
310 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
311 &mov($c,"edx"); # c= H(t);
# After the 7th word the count must be zero, so no test is emitted there.
312 &dec($num) if ($i != 7-1);
313 &jz(&label("mw_end")) if ($i != 7-1);
314 }
315 &set_label("mw_end",0);
316 &mov("eax",$c);
317
318 &function_end($name);
319 }
320
# bn_sqr_words: emit 586 assembler for
#   void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num)
# which stores the double-width square ap[i]*ap[i] into rp[2*i] (low) and
# rp[2*i+1] (high) for i in [0,num).  No carry propagates between words,
# so there is no return value.  When $sse2 is set a pmuludq path is
# emitted first, gated on OPENSSL_ia32cap_P bit 26 at run time.
321sub bn_sqr_words
322 {
323 local($name)=@_;
324
325 &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
326
327 $r="eax";
328 $a="edx";
329 $c="ecx";
330
331 if ($sse2) {
332 &picmeup("eax","OPENSSL_ia32cap_P");
333 &bt(&DWP(0,"eax"),26);
334 &jnc(&label("sqr_non_sse2"));
335
336 &mov($r,&wparam(0));
337 &mov($a,&wparam(1));
338 &mov($c,&wparam(2));
339
340 &set_label("sqr_sse2_loop",16);
341 &movd("mm0",&DWP(0,$a)); # mm0 = a[i]
342 &pmuludq("mm0","mm0"); # a[i] *= a[i]
343 &lea($a,&DWP(4,$a)); # a++
344 &movq(&QWP(0,$r),"mm0"); # r[i] = a[i]*a[i]
345 &sub($c,1);
346 &lea($r,&DWP(8,$r)); # r += 2
347 &jnz(&label("sqr_sse2_loop"));
348
349 &emms();
350 &ret();
351 &set_label("sqr_non_sse2",16);
352 }
353
354 # function_begin prologue
355 &push("ebp");
356 &push("ebx");
357 &push("esi");
358 &push("edi");
359
360 &comment("");
361 $r="esi";
362 $a="edi";
363 $num="ebx";
364
365 &mov($r,&wparam(0)); #
366 &mov($a,&wparam(1)); #
367 &mov($num,&wparam(2)); #
368
# num &= ~7: word count for the 8-way unrolled main loop.
369 &and($num,0xfffffff8); # num / 8
370 &jz(&label("sw_finish"));
371
372 &set_label("sw_loop",0);
# 8 words per pass; $i is a BYTE offset into ap, 2*$i into rp.
373 for ($i=0; $i<32; $i+=4)
374 {
375 &comment("Round $i");
376 &mov("eax",&DWP($i,$a,"",0)); # *a
377 # XXX
378 &mul("eax"); # *a * *a
379 &mov(&DWP($i*2,$r,"",0),"eax"); #
380 &mov(&DWP($i*2+4,$r,"",0),"edx");#
381 }
382
383 &comment("");
384 &add($a,32);
385 &add($r,64);
386 &sub($num,8);
387 &jnz(&label("sw_loop"));
388
# Tail: square the remaining num%8 words individually.
389 &set_label("sw_finish",0);
390 &mov($num,&wparam(2)); # get num
391 &and($num,7);
392 &jz(&label("sw_end"));
393
394 for ($i=0; $i<7; $i++)
395 {
396 &comment("Tail Round $i");
397 &mov("eax",&DWP($i*4,$a,"",0)); # *a
398 # XXX
399 &mul("eax"); # *a * *a
400 &mov(&DWP($i*8,$r,"",0),"eax"); #
401 &dec($num) if ($i != 7-1);
402 &mov(&DWP($i*8+4,$r,"",0),"edx");
403 &jz(&label("sw_end")) if ($i != 7-1);
404 }
405 &set_label("sw_end",0);
406
407 &function_end($name);
408 }
409
# bn_div_words: emit 586 assembler for
#   BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
# which divides the 64-bit value h:l (edx:eax) by d and returns the
# 32-bit quotient in eax (DIV also leaves the remainder in edx).
# NOTE(review): DIV faults (#DE) if d is zero or the quotient exceeds
# 32 bits; callers are presumably required to guarantee h < d — verify
# against the BN layer's usage.
410sub bn_div_words
411 {
412 local($name)=@_;
413
414 &function_begin_B($name,"");
415 &mov("edx",&wparam(0)); #
416 &mov("eax",&wparam(1)); #
417 &mov("ecx",&wparam(2)); #
418 &div("ecx");
419 &ret();
420 &function_end_B($name);
421 }
422
# bn_add_words: emit 586 assembler for
#   BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, int num)
# which computes rp[i] = ap[i] + bp[i] + carry for i in [0,num) and
# returns the final carry (0 or 1) in eax.  The carry is kept in a
# register ($c) rather than in the flags because the loop-control
# instructions between rounds would clobber CF.
423sub bn_add_words
424 {
425 &function_begin($name,"");
426
427 &comment("");
428 $a="esi";
429 $b="edi";
430 $c="eax";
431 $r="ebx";
432 $tmp1="ecx";
433 $tmp2="edx";
434 $num="ebp";
435
436 &mov($r,&wparam(0)); # get r
437 &mov($a,&wparam(1)); # get a
438 &mov($b,&wparam(2)); # get b
439 &mov($num,&wparam(3)); # get num
440 &xor($c,$c); # clear carry
441 &and($num,0xfffffff8); # num / 8
442
443 &jz(&label("aw_finish"));
444
445 &set_label("aw_loop",0);
# 8 fully unrolled word additions per pass.
446 for ($i=0; $i<8; $i++)
447 {
448 &comment("Round $i");
449
450 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
451 &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
# tmp1 = a[i] + c; capture the carry-out into $c (0/1) via adc.
452 &add($tmp1,$c);
453 &mov($c,0);
454 &adc($c,$c);
# tmp1 += b[i]; fold any second carry-out into $c as well.
455 &add($tmp1,$tmp2);
456 &adc($c,0);
457 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
458 }
459
460 &comment("");
461 &add($a,32);
462 &add($b,32);
463 &add($r,32);
464 &sub($num,8);
465 &jnz(&label("aw_loop"));
466
# Tail: the remaining num%8 words, one at a time.
467 &set_label("aw_finish",0);
468 &mov($num,&wparam(3)); # get num
469 &and($num,7);
470 &jz(&label("aw_end"));
471
472 for ($i=0; $i<7; $i++)
473 {
474 &comment("Tail Round $i");
475 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
476 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
477 &add($tmp1,$c);
478 &mov($c,0);
479 &adc($c,$c);
480 &add($tmp1,$tmp2);
481 &adc($c,0);
482 &dec($num) if ($i != 6);
483 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
484 &jz(&label("aw_end")) if ($i != 6);
485 }
486 &set_label("aw_end",0);
487
488# &mov("eax",$c); # $c is "eax"
489
490 &function_end($name);
491 }
494
# bn_sub_words: emit 586 assembler for
#   BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, int num)
# which computes rp[i] = ap[i] - bp[i] - borrow for i in [0,num) and
# returns the final borrow (0 or 1) in eax.  Structurally identical to
# bn_add_words with SUB in place of ADD; the borrow is materialized in
# $c because the loop-control code between rounds clobbers CF.
495sub bn_sub_words
496 {
497 local($name)=@_;
498
499 &function_begin($name,"");
500
501 &comment("");
502 $a="esi";
503 $b="edi";
504 $c="eax";
505 $r="ebx";
506 $tmp1="ecx";
507 $tmp2="edx";
508 $num="ebp";
509
510 &mov($r,&wparam(0)); # get r
511 &mov($a,&wparam(1)); # get a
512 &mov($b,&wparam(2)); # get b
513 &mov($num,&wparam(3)); # get num
514 &xor($c,$c); # clear carry
515 &and($num,0xfffffff8); # num / 8
516
517 &jz(&label("aw_finish"));
518
519 &set_label("aw_loop",0);
# 8 fully unrolled word subtractions per pass.
520 for ($i=0; $i<8; $i++)
521 {
522 &comment("Round $i");
523
524 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
525 &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
# tmp1 = a[i] - borrow; capture the borrow-out into $c (0/1).
526 &sub($tmp1,$c);
527 &mov($c,0);
528 &adc($c,$c);
# tmp1 -= b[i]; fold any second borrow-out into $c as well.
529 &sub($tmp1,$tmp2);
530 &adc($c,0);
531 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
532 }
533
534 &comment("");
535 &add($a,32);
536 &add($b,32);
537 &add($r,32);
538 &sub($num,8);
539 &jnz(&label("aw_loop"));
540
# Tail: the remaining num%8 words, one at a time.
541 &set_label("aw_finish",0);
542 &mov($num,&wparam(3)); # get num
543 &and($num,7);
544 &jz(&label("aw_end"));
545
546 for ($i=0; $i<7; $i++)
547 {
548 &comment("Tail Round $i");
549 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
550 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
551 &sub($tmp1,$c);
552 &mov($c,0);
553 &adc($c,$c);
554 &sub($tmp1,$tmp2);
555 &adc($c,0);
556 &dec($num) if ($i != 6);
557 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
558 &jz(&label("aw_end")) if ($i != 6);
559 }
560 &set_label("aw_end",0);
561
562# &mov("eax",$c); # $c is "eax"
563
564 &function_end($name);
565 }
566
# bn_sub_part_words: emit 586 assembler for
#   BN_ULONG bn_sub_part_words(BN_ULONG *rp, const BN_ULONG *ap,
#                              const BN_ULONG *bp, int cl, int dl)
# First it subtracts cl common words exactly like bn_sub_words, then
# handles |dl| extra words where one operand is longer than the other:
#   dl < 0: b is longer — subtract the remaining b words from zero
#           (with borrow), i.e. r[i] = 0 - b[i] - borrow;
#   dl > 0: a is longer — propagate the borrow through the remaining
#           a words, with early exit via the pw_nc paths once the
#           borrow dies (remaining words are then a plain copy).
# Returns the final borrow (0 or 1) in eax.
567sub bn_sub_part_words
568 {
569 local($name)=@_;
570
571 &function_begin($name,"");
572
573 &comment("");
574 $a="esi";
575 $b="edi";
576 $c="eax";
577 $r="ebx";
578 $tmp1="ecx";
579 $tmp2="edx";
580 $num="ebp";
581
582 &mov($r,&wparam(0)); # get r
583 &mov($a,&wparam(1)); # get a
584 &mov($b,&wparam(2)); # get b
585 &mov($num,&wparam(3)); # get num
586 &xor($c,$c); # clear carry
587 &and($num,0xfffffff8); # num / 8
588
589 &jz(&label("aw_finish"));
590
# Common part: 8-way unrolled a[i]-b[i]-borrow, as in bn_sub_words.
591 &set_label("aw_loop",0);
592 for ($i=0; $i<8; $i++)
593 {
594 &comment("Round $i");
595
596 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
597 &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
598 &sub($tmp1,$c);
599 &mov($c,0);
600 &adc($c,$c);
601 &sub($tmp1,$tmp2);
602 &adc($c,0);
603 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
604 }
605
606 &comment("");
607 &add($a,32);
608 &add($b,32);
609 &add($r,32);
610 &sub($num,8);
611 &jnz(&label("aw_loop"));
612
613 &set_label("aw_finish",0);
614 &mov($num,&wparam(3)); # get num
615 &and($num,7);
616 &jz(&label("aw_end"));
617
# Tail of the common part: pointers are advanced explicitly here (unlike
# bn_sub_words) so that a/b/r already point past the common words when
# the dl-handling code below runs.
618 for ($i=0; $i<7; $i++)
619 {
620 &comment("Tail Round $i");
621 &mov($tmp1,&DWP(0,$a,"",0)); # *a
622 &mov($tmp2,&DWP(0,$b,"",0));# *b
623 &sub($tmp1,$c);
624 &mov($c,0);
625 &adc($c,$c);
626 &sub($tmp1,$tmp2);
627 &adc($c,0);
628 &mov(&DWP(0,$r,"",0),$tmp1); # *r
629 &add($a, 4);
630 &add($b, 4);
631 &add($r, 4);
632 &dec($num) if ($i != 6);
633 &jz(&label("aw_end")) if ($i != 6);
634 }
635 &set_label("aw_end",0);
636
637 &cmp(&wparam(4),0);
638 &je(&label("pw_end"));
639
640 &mov($num,&wparam(4)); # get dl
641 &cmp($num,0);
642 &je(&label("pw_end"));
643 &jge(&label("pw_pos"));
644
# dl < 0: b has -dl extra words; compute r[i] = 0 - b[i] - borrow.
645 &comment("pw_neg");
646 &mov($tmp2,0);
647 &sub($tmp2,$num);
648 &mov($num,$tmp2); # num = -dl
649 &and($num,0xfffffff8); # num / 8
650 &jz(&label("pw_neg_finish"));
651
652 &set_label("pw_neg_loop",0);
653 for ($i=0; $i<8; $i++)
654 {
655 &comment("dl<0 Round $i");
656
657 &mov($tmp1,0);
658 &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
659 &sub($tmp1,$c);
660 &mov($c,0);
661 &adc($c,$c);
662 &sub($tmp1,$tmp2);
663 &adc($c,0);
664 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
665 }
666
667 &comment("");
668 &add($b,32);
669 &add($r,32);
670 &sub($num,8);
671 &jnz(&label("pw_neg_loop"));
672
673 &set_label("pw_neg_finish",0);
674 &mov($tmp2,&wparam(4)); # get dl
675 &mov($num,0);
676 &sub($num,$tmp2);
677 &and($num,7);
678 &jz(&label("pw_end"));
679
680 for ($i=0; $i<7; $i++)
681 {
682 &comment("dl<0 Tail Round $i");
683 &mov($tmp1,0);
684 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
685 &sub($tmp1,$c);
686 &mov($c,0);
687 &adc($c,$c);
688 &sub($tmp1,$tmp2);
689 &adc($c,0);
690 &dec($num) if ($i != 6);
691 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
692 &jz(&label("pw_end")) if ($i != 6);
693 }
694
695 &jmp(&label("pw_end"));
696
# dl > 0: a has dl extra words; r[i] = a[i] - borrow.  As soon as a
# subtraction does not borrow (jnc), fall into the pw_nc copy paths:
# the remaining words are copied unchanged and the result borrow is 0.
697 &set_label("pw_pos",0);
698
699 &and($num,0xfffffff8); # num / 8
700 &jz(&label("pw_pos_finish"));
701
702 &set_label("pw_pos_loop",0);
703
704 for ($i=0; $i<8; $i++)
705 {
706 &comment("dl>0 Round $i");
707
708 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
709 &sub($tmp1,$c);
710 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
711 &jnc(&label("pw_nc".$i));
712 }
713
714 &comment("");
715 &add($a,32);
716 &add($r,32);
717 &sub($num,8);
718 &jnz(&label("pw_pos_loop"));
719
720 &set_label("pw_pos_finish",0);
721 &mov($num,&wparam(4)); # get dl
722 &and($num,7);
723 &jz(&label("pw_end"));
724
725 for ($i=0; $i<7; $i++)
726 {
727 &comment("dl>0 Tail Round $i");
728 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
729 &sub($tmp1,$c);
730 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
731 &jnc(&label("pw_tail_nc".$i));
732 &dec($num) if ($i != 6);
733 &jz(&label("pw_end")) if ($i != 6);
734 }
# All dl words still borrowed: final borrow is 1.
735 &mov($c,1);
736 &jmp(&label("pw_end"));
737
# No-carry continuation of pw_pos_loop: plain copy of the remaining
# words in the current 8-word group, entered at pw_nc$i mid-group.
738 &set_label("pw_nc_loop",0);
739 for ($i=0; $i<8; $i++)
740 {
741 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
742 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
743 &set_label("pw_nc".$i,0);
744 }
745
746 &comment("");
747 &add($a,32);
748 &add($r,32);
749 &sub($num,8);
750 &jnz(&label("pw_nc_loop"));
751
752 &mov($num,&wparam(4)); # get dl
753 &and($num,7);
754 &jz(&label("pw_nc_end"));
755
756 for ($i=0; $i<7; $i++)
757 {
758 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
759 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
760 &set_label("pw_tail_nc".$i,0);
761 &dec($num) if ($i != 6);
762 &jz(&label("pw_nc_end")) if ($i != 6);
763 }
764
765 &set_label("pw_nc_end",0);
766 &mov($c,0); # borrow died: return 0
767
768 &set_label("pw_end",0);
769
770# &mov("eax",$c); # $c is "eax"
771
772 &function_end($name);
773 }
774
diff --git a/src/lib/libcrypto/bn/asm/co-586.pl b/src/lib/libcrypto/bn/asm/co-586.pl
deleted file mode 100644
index 57101a6bd7..0000000000
--- a/src/lib/libcrypto/bn/asm/co-586.pl
+++ /dev/null
@@ -1,287 +0,0 @@
1#!/usr/local/bin/perl
# co-586.pl: perlasm generator for the 586 Comba multiplication and
# squaring primitives used by the BN library.  Emits fixed-size
# (4- and 8-word) fully unrolled bn_mul_comba / bn_sqr_comba routines.
2
# Locate our own directory and make the shared perlasm helpers loadable.
3$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
4push(@INC,"${dir}","${dir}../../perlasm");
5require "x86asm.pl";
6
# $ARGV[0] selects the output flavour (ELF/a.out/COFF/...).
7&asm_init($ARGV[0],$0);
8
9&bn_mul_comba("bn_mul_comba8",8);
10&bn_mul_comba("bn_mul_comba4",4);
11&bn_sqr_comba("bn_sqr_comba8",8);
12&bn_sqr_comba("bn_sqr_comba4",4);
13
14&asm_finish();
15
# mul_add_c: emit one Comba multiply-accumulate step
#   (c0,c1,c2) += a[ai]*b[bi]
# on the three-word accumulator, interleaving the loads of the next
# operand pair (a[na], b[nb]) or of the result pointer, as selected
# by $pos (see below), to hide load latency.
16sub mul_add_c
17 {
18 local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
19
20 # pos == -1 if eax and edx are pre-loaded, 0 to load from next
21 # words, and 1 if load return value
22
23 &comment("mul a[$ai]*b[$bi]");
24
25 # "eax" and "edx" will always be pre-loaded.
26 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
27 # &mov("edx",&DWP($bi*4,$b,"",0));
28
29 &mul("edx");
30 &add($c0,"eax");
31 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
32 &mov("eax",&wparam(0)) if $pos > 0; # load r[]
33 ###
34 &adc($c1,"edx");
35 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # load next b
36 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # load next b
37 ###
38 &adc($c2,0);
39 # if pos > 0, it means it is the last mul for this result word:
# store the finished c0 into r[$i].
40 &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
41 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next a
42 }
43
# sqr_add_c: Comba squaring step for a diagonal (or single) product:
#   (c0,c1,c2) += a[ai]*a[bi]
# counted exactly once.  Used when ai == bi (the MUL of eax by itself);
# the mirror-image off-diagonal products go through sqr_add_c2, which
# doubles them.  Next-operand loads are interleaved per $pos.
44sub sqr_add_c
45 {
46 local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
47
48 # pos == -1 if eax and edx are pre-loaded, 0 to load from next
49 # words, and 1 if load return value
50
51 &comment("sqr a[$ai]*a[$bi]");
52
53 # "eax" and "edx" will always be pre-loaded.
54 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
55 # &mov("edx",&DWP($bi*4,$b,"",0));
56
57 if ($ai == $bi)
58 { &mul("eax");}
59 else
60 { &mul("edx");}
61 &add($c0,"eax");
62 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
63 ###
64 &adc($c1,"edx");
65 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
66 ###
67 &adc($c2,0);
68 # if pos > 0, it means it is the last mul for this result word
69 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
70 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
71 }
72
# sqr_add_c2: Comba squaring step for an off-diagonal product:
#   (c0,c1,c2) += 2*a[ai]*a[bi]
# The doubling is done by adding the 64-bit product to itself
# (add eax,eax / adc edx,edx) before folding it into the accumulator;
# both additions can carry into c2, hence the two separate adc($c2,0).
73sub sqr_add_c2
74 {
75 local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
76
77 # pos == -1 if eax and edx are pre-loaded, 0 to load from next
78 # words, and 1 if load return value
79
80 &comment("sqr a[$ai]*a[$bi]");
81
82 # "eax" and "edx" will always be pre-loaded.
83 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
84 # &mov("edx",&DWP($bi*4,$a,"",0));
85
86 if ($ai == $bi)
87 { &mul("eax");}
88 else
89 { &mul("edx");}
# Double the product: edx:eax = 2*a[ai]*a[bi]; overflow goes to c2.
90 &add("eax","eax");
91 ###
92 &adc("edx","edx");
93 ###
94 &adc($c2,0);
95 &add($c0,"eax");
96 &adc($c1,"edx");
97 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
98 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
99 &adc($c2,0);
100 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
101 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
102 ###
103 }
104
# bn_mul_comba: emit a fully unrolled Comba (column-wise) multiplication
#   void bn_mul_comba{4,8}(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# for fixed $num-word operands.  Result words r[0..2*num-2] are computed
# column by column into the rotating three-word accumulator (c0,c1,c2);
# after each column is stored the accumulator registers rotate so the
# carry words line up for the next column.
105sub bn_mul_comba
106 {
107 local($name,$num)=@_;
108 local($a,$b,$c0,$c1,$c2);
109 local($i,$as,$ae,$bs,$be,$ai,$bi);
110 local($tot,$end);
111
112 &function_begin_B($name,"");
113
114 $c0="ebx";
115 $c1="ecx";
116 $c2="ebp";
117 $a="esi";
118 $b="edi";
119
# $as/$bs track the starting a-index/b-index of each column's diagonal;
# $ae/$be the amount consumed at the other end.  $tot = number of
# result words computed in the loop (the final word is stored after it).
120 $as=0;
121 $ae=0;
122 $bs=0;
123 $be=0;
124 $tot=$num+$num-1;
125
126 &push("esi");
127 &mov($a,&wparam(1));
128 &push("edi");
129 &mov($b,&wparam(2));
130 &push("ebp");
131 &push("ebx");
132
133 &xor($c0,$c0);
134 &mov("eax",&DWP(0,$a,"",0)); # load the first word
135 &xor($c1,$c1);
136 &mov("edx",&DWP(0,$b,"",0)); # load the first second
137
138 for ($i=0; $i<$tot; $i++)
139 {
140 $ai=$as;
141 $bi=$bs;
142 $end=$be+1;
143
144 &comment("################## Calculate word $i");
145
# Walk the anti-diagonal a[ai]*b[bi], ai decreasing / bi increasing.
# $v selects mul_add_c's mode: 0 = mid-column, 1 = last product of the
# column (store r[i]), 2 = last product of the whole multiplication.
146 for ($j=$bs; $j<$end; $j++)
147 {
148 &xor($c2,$c2) if ($j == $bs);
149 if (($j+1) == $end)
150 {
151 $v=1;
152 $v=2 if (($i+1) == $tot);
153 }
154 else
155 { $v=0; }
156 if (($j+1) != $end)
157 {
158 $na=($ai-1);
159 $nb=($bi+1);
160 }
161 else
162 {
163 $na=$as+($i < ($num-1));
164 $nb=$bs+($i >= ($num-1));
165 }
166#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
167 &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
168 if ($v)
169 {
170 &comment("saved r[$i]");
171 # &mov("eax",&wparam(0));
172 # &mov(&DWP($i*4,"eax","",0),$c0);
# Rotate the accumulator: c1 becomes the new low word.
173 ($c0,$c1,$c2)=($c1,$c2,$c0);
174 }
175 $ai--;
176 $bi++;
177 }
178 $as++ if ($i < ($num-1));
179 $ae++ if ($i >= ($num-1));
180
181 $bs++ if ($i >= ($num-1));
182 $be++ if ($i < ($num-1));
183 }
# Store the final (most significant) result word; eax holds r here.
184 &comment("save r[$i]");
185 # &mov("eax",&wparam(0));
186 &mov(&DWP($i*4,"eax","",0),$c0);
187
188 &pop("ebx");
189 &pop("ebp");
190 &pop("edi");
191 &pop("esi");
192 &ret();
193 &function_end_B($name);
194 }
195
# bn_sqr_comba: emit a fully unrolled Comba squaring
#   void bn_sqr_comba{4,8}(BN_ULONG *r, BN_ULONG *a)
# for a fixed $num-word operand.  Exploits symmetry: each column needs
# only its diagonal term (sqr_add_c) plus doubled off-diagonal terms
# (sqr_add_c2); the inner loop stops at the diagonal ("last" below).
196sub bn_sqr_comba
197 {
198 local($name,$num)=@_;
# NOTE(review): this second assignment from @_ aliases $r/$a/$c0/$c1/$c2
# to the arguments; all are overwritten below, so it is harmless but
# redundant.  Kept as-is (doc-only change).
199 local($r,$a,$c0,$c1,$c2)=@_;
200 local($i,$as,$ae,$bs,$be,$ai,$bi);
201 local($b,$tot,$end,$half);
202
203 &function_begin_B($name,"");
204
205 $c0="ebx";
206 $c1="ecx";
207 $c2="ebp";
208 $a="esi";
209 $r="edi";
210
211 &push("esi");
212 &push("edi");
213 &push("ebp");
214 &push("ebx");
215 &mov($r,&wparam(0));
216 &mov($a,&wparam(1));
217 &xor($c0,$c0);
218 &xor($c1,$c1);
219 &mov("eax",&DWP(0,$a,"",0)); # load the first word
220
221 $as=0;
222 $ae=0;
223 $bs=0;
224 $be=0;
225 $tot=$num+$num-1;
226
227 for ($i=0; $i<$tot; $i++)
228 {
229 $ai=$as;
230 $bi=$bs;
231 $end=$be+1;
232
233 &comment("############### Calculate word $i");
# Walk products a[ai]*a[bi] down the anti-diagonal until ai would
# cross bi; $v=1 marks the column's last product (store r[i]),
# $v=2 the last product of the whole squaring.
234 for ($j=$bs; $j<$end; $j++)
235 {
236 &xor($c2,$c2) if ($j == $bs);
237 if (($ai-1) < ($bi+1))
238 {
239 $v=1;
240 $v=2 if ($i+1) == $tot;
241 }
242 else
243 { $v=0; }
244 if (!$v)
245 {
246 $na=$ai-1;
247 $nb=$bi+1;
248 }
249 else
250 {
251 $na=$as+($i < ($num-1));
252 $nb=$bs+($i >= ($num-1));
253 }
254 if ($ai == $bi)
255 {
256 &sqr_add_c($r,$a,$ai,$bi,
257 $c0,$c1,$c2,$v,$i,$na,$nb);
258 }
259 else
260 {
261 &sqr_add_c2($r,$a,$ai,$bi,
262 $c0,$c1,$c2,$v,$i,$na,$nb);
263 }
264 if ($v)
265 {
266 &comment("saved r[$i]");
267 #&mov(&DWP($i*4,$r,"",0),$c0);
# Rotate the accumulator and stop: the remaining mirror-image
# products of this column were covered by the doubling.
268 ($c0,$c1,$c2)=($c1,$c2,$c0);
269 last;
270 }
271 $ai--;
272 $bi++;
273 }
274 $as++ if ($i < ($num-1));
275 $ae++ if ($i >= ($num-1));
276
277 $bs++ if ($i >= ($num-1));
278 $be++ if ($i < ($num-1));
279 }
# Store the final (most significant) result word.
280 &mov(&DWP($i*4,$r,"",0),$c0);
281 &pop("ebx");
282 &pop("ebp");
283 &pop("edi");
284 &pop("esi");
285 &ret();
286 &function_end_B($name);
287 }
diff --git a/src/lib/libcrypto/bn/asm/ia64-mont.pl b/src/lib/libcrypto/bn/asm/ia64-mont.pl
deleted file mode 100644
index e258658428..0000000000
--- a/src/lib/libcrypto/bn/asm/ia64-mont.pl
+++ /dev/null
@@ -1,851 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# January 2010
11#
12# "Teaser" Montgomery multiplication module for IA-64. There are
13# several possibilities for improvement:
14#
15# - modulo-scheduling outer loop would eliminate quite a number of
16# stalls after ldf8, xma and getf.sig outside inner loop and
17# improve shorter key performance;
18# - shorter vector support [with input vectors being fetched only
19# once] should be added;
20# - 2x unroll with help of n0[1] would make the code scalable on
21# "wider" IA-64, "wider" than Itanium 2 that is, which is not of
22# acute interest, because upcoming Tukwila's individual cores are
23# reportedly based on Itanium 2 design;
24# - dedicated squaring procedure(?);
25#
26# January 2010
27#
28# Shorter vector support is implemented by zero-padding ap and np
29# vectors up to 8 elements, or 512 bits. This means that 256-bit
30# inputs will be processed only 2 times faster than 512-bit inputs,
31# not 4 [as one would expect, because algorithm complexity is n^2].
32# The reason for padding is that inputs shorter than 512 bits won't
33# be processed faster anyway, because minimal critical path of the
34# core loop happens to match 512-bit timing. Either way, it resulted
35# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
36# 1024-bit one [in comparison to original version of *this* module].
37#
38# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
39# this module is:
40# sign verify sign/s verify/s
41# rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4
42# rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0
43# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
44# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
45# dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0
46# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
47# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
48#
49# ... and *without* (but still with ia64.S):
50#
51# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
52# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
53# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
54# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
55# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
56# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
57# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
58#
59# As it can be seen, RSA sign performance improves by 130-30%,
60# hereafter less for longer keys, while verify - by 74-13%.
61# DSA performance improves by 115-30%.
62
# Select the pointer-add mnemonic: HP-UX runs IA-64 in 32-bit-pointer
# (ILP32) mode by default, where addresses must be formed with addp4;
# a 64-bit build (+DD64 / -mlp64 compiler flags) uses plain add.
# NOTE(review): the pattern /[\+DD|\-mlp]64/ is a character CLASS, not
# alternation — it matches any of + D | - m l p followed by "64".  It
# happens to catch both intended flags, so behavior is fine; kept as-is.
63if ($^O eq "hpux") {
64    $ADDP="addp4";
65    for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
66} else { $ADDP="add"; }
67
68$code=<<___;
69.explicit
70.text
71
72// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
73// const BN_ULONG *bp,const BN_ULONG *np,
74// const BN_ULONG *n0p,int num);
75.align 64
76.global bn_mul_mont#
77.proc bn_mul_mont#
78bn_mul_mont:
79 .prologue
80 .body
81{ .mmi; cmp4.le p6,p7=2,r37;;
82(p6) cmp4.lt.unc p8,p9=8,r37
83 mov ret0=r0 };;
84{ .bbb;
85(p9) br.cond.dptk.many bn_mul_mont_8
86(p8) br.cond.dpnt.many bn_mul_mont_general
87(p7) br.ret.spnt.many b0 };;
88.endp bn_mul_mont#
89
90prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
91
92rptr=r8; aptr=r9; bptr=r14; nptr=r15;
93tptr=r16; // &tp[0]
94tp_1=r17; // &tp[-1]
95num=r18; len=r19; lc=r20;
96topbit=r21; // carry bit from tmp[num]
97
98n0=f6;
99m0=f7;
100bi=f8;
101
102.align 64
103.local bn_mul_mont_general#
104.proc bn_mul_mont_general#
105bn_mul_mont_general:
106 .prologue
107{ .mmi; .save ar.pfs,prevfs
108 alloc prevfs=ar.pfs,6,2,0,8
109 $ADDP aptr=0,in1
110 .save ar.lc,prevlc
111 mov prevlc=ar.lc }
112{ .mmi; .vframe prevsp
113 mov prevsp=sp
114 $ADDP bptr=0,in2
115 .save pr,prevpr
116 mov prevpr=pr };;
117
118 .body
119 .rotf alo[6],nlo[4],ahi[8],nhi[6]
120 .rotr a[3],n[3],t[2]
121
122{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
123 ldf8 alo[4]=[aptr],16 // ap[0]
124 $ADDP r30=8,in1 };;
125{ .mmi; ldf8 alo[3]=[r30],16 // ap[1]
126 ldf8 alo[2]=[aptr],16 // ap[2]
127 $ADDP in4=0,in4 };;
128{ .mmi; ldf8 alo[1]=[r30] // ap[3]
129 ldf8 n0=[in4] // n0
130 $ADDP rptr=0,in0 }
131{ .mmi; $ADDP nptr=0,in3
132 mov r31=16
133 zxt4 num=in5 };;
134{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
135 shladd len=num,3,r0
136 shladd r31=num,3,r31 };;
137{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
138 add lc=-5,num
139 sub r31=sp,r31 };;
140{ .mfb; and sp=-16,r31 // alloca
141 xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
142 nop.b 0 }
143{ .mfb; nop.m 0
144 xmpy.lu alo[4]=alo[4],bi
145 brp.loop.imp .L1st_ctop,.L1st_cend-16
146 };;
147{ .mfi; nop.m 0
148 xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
149 add tp_1=8,sp }
150{ .mfi; nop.m 0
151 xma.lu alo[3]=alo[3],bi,ahi[2]
152 mov pr.rot=0x20001f<<16
153 // ------^----- (p40) at first (p23)
154 // ----------^^ p[16:20]=1
155 };;
156{ .mfi; nop.m 0
157 xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
158 mov ar.lc=lc }
159{ .mfi; nop.m 0
160 fcvt.fxu.s1 nhi[1]=f0
161 mov ar.ec=8 };;
162
163.align 32
164.L1st_ctop:
165.pred.rel "mutex",p40,p42
166{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
167 (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
168 (p40) add n[2]=n[2],a[2] } // (p23) }
169{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
170 (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
171 (p42) add n[2]=n[2],a[2],1 };; // (p23)
172{ .mfi; (p21) getf.sig a[0]=alo[5]
173 (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
174 (p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
175{ .mfi; (p23) st8 [tp_1]=n[2],8
176 (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
177 (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
178{ .mmb; (p21) getf.sig n[0]=nlo[3]
179 (p16) nop.m 0
180 br.ctop.sptk .L1st_ctop };;
181.L1st_cend:
182
183{ .mmi; getf.sig a[0]=ahi[6] // (p24)
184 getf.sig n[0]=nhi[4]
185 add num=-1,num };; // num--
186{ .mmi; .pred.rel "mutex",p40,p42
187(p40) add n[0]=n[0],a[0]
188(p42) add n[0]=n[0],a[0],1
189 sub aptr=aptr,len };; // rewind
190{ .mmi; .pred.rel "mutex",p40,p42
191(p40) cmp.ltu p41,p39=n[0],a[0]
192(p42) cmp.leu p41,p39=n[0],a[0]
193 sub nptr=nptr,len };;
194{ .mmi; .pred.rel "mutex",p39,p41
195(p39) add topbit=r0,r0
196(p41) add topbit=r0,r0,1
197 nop.i 0 }
198{ .mmi; st8 [tp_1]=n[0]
199 add tptr=16,sp
200 add tp_1=8,sp };;
201
202.Louter:
203{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
204 ldf8 ahi[3]=[tptr] // tp[0]
205 add r30=8,aptr };;
206{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
207 ldf8 alo[3]=[r30],16 // ap[1]
208 add r31=8,nptr };;
209{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
210 xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
211 brp.loop.imp .Linner_ctop,.Linner_cend-16
212 }
213{ .mfb; ldf8 alo[1]=[r30] // ap[3]
214 xma.lu alo[4]=alo[4],bi,ahi[3]
215 clrrrb.pr };;
216{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
217 xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
218 nop.i 0 }
219{ .mfi; ldf8 nlo[1]=[r31] // np[1]
220 xma.lu alo[3]=alo[3],bi,ahi[2]
221 mov pr.rot=0x20101f<<16
222 // ------^----- (p40) at first (p23)
223 // --------^--- (p30) at first (p22)
224 // ----------^^ p[16:20]=1
225 };;
226{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted
227 xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
228 mov ar.lc=lc }
229{ .mfi;
230 fcvt.fxu.s1 nhi[1]=f0
231 mov ar.ec=8 };;
232
233// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
234// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
235// in latter case accounts for two-tick pipeline stall, which means
236// that its performance would be ~20% lower than optimal one. No
237// attempt was made to address this, because original Itanium is
238// hardly represented out in the wild...
239.align 32
240.Linner_ctop:
241.pred.rel "mutex",p40,p42
242.pred.rel "mutex",p30,p32
243{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
244 (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
245 (p40) add n[2]=n[2],a[2] } // (p23)
246{ .mfi; (p16) nop.m 0
247 (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
248 (p42) add n[2]=n[2],a[2],1 };; // (p23)
249{ .mfi; (p21) getf.sig a[0]=alo[5]
250 (p16) nop.f 0
251 (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
252{ .mfi; (p21) ld8 t[0]=[tptr],8
253 (p16) nop.f 0
254 (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
255{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
256 (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
257 (p30) add a[1]=a[1],t[1] } // (p22)
258{ .mfi; (p16) nop.m 0
259 (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
260 (p32) add a[1]=a[1],t[1],1 };; // (p22)
261{ .mmi; (p21) getf.sig n[0]=nlo[3]
262 (p16) nop.m 0
263 (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
264{ .mmb; (p23) st8 [tp_1]=n[2],8
265 (p32) cmp.leu p31,p29=a[1],t[1] // (p22)
266 br.ctop.sptk .Linner_ctop };;
267.Linner_cend:
268
269{ .mmi; getf.sig a[0]=ahi[6] // (p24)
270 getf.sig n[0]=nhi[4]
271 nop.i 0 };;
272
273{ .mmi; .pred.rel "mutex",p31,p33
274(p31) add a[0]=a[0],topbit
275(p33) add a[0]=a[0],topbit,1
276 mov topbit=r0 };;
277{ .mfi; .pred.rel "mutex",p31,p33
278(p31) cmp.ltu p32,p30=a[0],topbit
279(p33) cmp.leu p32,p30=a[0],topbit
280 }
281{ .mfi; .pred.rel "mutex",p40,p42
282(p40) add n[0]=n[0],a[0]
283(p42) add n[0]=n[0],a[0],1
284 };;
285{ .mmi; .pred.rel "mutex",p44,p46
286(p40) cmp.ltu p41,p39=n[0],a[0]
287(p42) cmp.leu p41,p39=n[0],a[0]
288(p32) add topbit=r0,r0,1 }
289
290{ .mmi; st8 [tp_1]=n[0],8
291 cmp4.ne p6,p0=1,num
292 sub aptr=aptr,len };; // rewind
293{ .mmi; sub nptr=nptr,len
294(p41) add topbit=r0,r0,1
295 add tptr=16,sp }
296{ .mmb; add tp_1=8,sp
297 add num=-1,num // num--
298(p6) br.cond.sptk.many .Louter };;
299
300{ .mbb; add lc=4,lc
301 brp.loop.imp .Lsub_ctop,.Lsub_cend-16
302 clrrrb.pr };;
303{ .mii; nop.m 0
304 mov pr.rot=0x10001<<16
305 // ------^---- (p33) at first (p17)
306 mov ar.lc=lc }
307{ .mii; nop.m 0
308 mov ar.ec=3
309 nop.i 0 };;
310
311.Lsub_ctop:
312.pred.rel "mutex",p33,p35
313{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
314 (p16) nop.f 0
315 (p33) sub n[1]=t[1],n[1] } // (p17)
316{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
317 (p16) nop.f 0
318 (p35) sub n[1]=t[1],n[1],1 };; // (p17)
319{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
320 (p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
321 (p18) nop.b 0 }
322{ .mib; (p18) nop.m 0
323 (p35) cmp.geu p34,p32=n[1],t[1] // (p17)
324 br.ctop.sptk .Lsub_ctop };;
325.Lsub_cend:
326
327{ .mmb; .pred.rel "mutex",p34,p36
328(p34) sub topbit=topbit,r0 // (p19)
329(p36) sub topbit=topbit,r0,1
330 brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
331 }
332{ .mmb; sub rptr=rptr,len // rewind
333 sub tptr=tptr,len
334 clrrrb.pr };;
335{ .mmi; and aptr=tptr,topbit
336 andcm bptr=rptr,topbit
337 mov pr.rot=1<<16 };;
338{ .mii; or nptr=aptr,bptr
339 mov ar.lc=lc
340 mov ar.ec=3 };;
341
342.Lcopy_ctop:
343{ .mmb; (p16) ld8 n[0]=[nptr],8
344 (p18) st8 [tptr]=r0,8
345 (p16) nop.b 0 }
346{ .mmb; (p16) nop.m 0
347 (p18) st8 [rptr]=n[2],8
348 br.ctop.sptk .Lcopy_ctop };;
349.Lcopy_cend:
350
351{ .mmi; mov ret0=1 // signal "handled"
352 rum 1<<5 // clear um.mfh
353 mov ar.lc=prevlc }
354{ .mib; .restore sp
355 mov sp=prevsp
356 mov pr=prevpr,0x1ffff
357 br.ret.sptk.many b0 };;
358.endp bn_mul_mont_general#
359
360a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
361n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
362t0=r15;
363
364ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
365ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
366
367.align 64
368.skip 48 // aligns loop body
369.local bn_mul_mont_8#
370.proc bn_mul_mont_8#
371bn_mul_mont_8:
372 .prologue
373{ .mmi; .save ar.pfs,prevfs
374 alloc prevfs=ar.pfs,6,2,0,8
375 .vframe prevsp
376 mov prevsp=sp
377 .save ar.lc,prevlc
378 mov prevlc=ar.lc }
379{ .mmi; add r17=-6*16,sp
380 add sp=-7*16,sp
381 .save pr,prevpr
382 mov prevpr=pr };;
383
384{ .mmi; .save.gf 0,0x10
385 stf.spill [sp]=f16,-16
386 .save.gf 0,0x20
387 stf.spill [r17]=f17,32
388 add r16=-5*16,prevsp};;
389{ .mmi; .save.gf 0,0x40
390 stf.spill [r16]=f18,32
391 .save.gf 0,0x80
392 stf.spill [r17]=f19,32
393 $ADDP aptr=0,in1 };;
394{ .mmi; .save.gf 0,0x100
395 stf.spill [r16]=f20,32
396 .save.gf 0,0x200
397 stf.spill [r17]=f21,32
398 $ADDP r29=8,in1 };;
399{ .mmi; .save.gf 0,0x400
400 stf.spill [r16]=f22
401 .save.gf 0,0x800
402 stf.spill [r17]=f23
403 $ADDP rptr=0,in0 };;
404
405 .body
406 .rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
407 .rotr t[8]
408
409// load input vectors padding them to 8 elements
410{ .mmi; ldf8 ai0=[aptr],16 // ap[0]
411 ldf8 ai1=[r29],16 // ap[1]
412 $ADDP bptr=0,in2 }
413{ .mmi; $ADDP r30=8,in2
414 $ADDP nptr=0,in3
415 $ADDP r31=8,in3 };;
416{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
417 ldf8 bj[6]=[r30],16 // bp[1]
418 cmp4.le p4,p5=3,in5 }
419{ .mmi; ldf8 ni0=[nptr],16 // np[0]
420 ldf8 ni1=[r31],16 // np[1]
421 cmp4.le p6,p7=4,in5 };;
422
423{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
424 (p5)fcvt.fxu ai2=f0
425 cmp4.le p8,p9=5,in5 }
426{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
427 (p7)fcvt.fxu ai3=f0
428 cmp4.le p10,p11=6,in5 }
429{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
430 (p5)fcvt.fxu bj[5]=f0
431 cmp4.le p12,p13=7,in5 }
432{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
433 (p7)fcvt.fxu bj[4]=f0
434 cmp4.le p14,p15=8,in5 }
435{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
436 (p5)fcvt.fxu ni2=f0
437 addp4 r28=-1,in5 }
438{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
439 (p7)fcvt.fxu ni3=f0
440 $ADDP in4=0,in4 };;
441
442{ .mfi; ldf8 n0=[in4]
443 fcvt.fxu tf[1]=f0
444 nop.i 0 }
445
446{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
447 (p9)fcvt.fxu ai4=f0
448 mov t[0]=r0 }
449{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
450 (p11)fcvt.fxu ai5=f0
451 mov t[1]=r0 }
452{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
453 (p9)fcvt.fxu bj[3]=f0
454 mov t[2]=r0 }
455{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
456 (p11)fcvt.fxu bj[2]=f0
457 mov t[3]=r0 }
458{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
459 (p9)fcvt.fxu ni4=f0
460 mov t[4]=r0 }
461{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
462 (p11)fcvt.fxu ni5=f0
463 mov t[5]=r0 };;
464
465{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
466 (p13)fcvt.fxu ai6=f0
467 mov t[6]=r0 }
468{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
469 (p15)fcvt.fxu ai7=f0
470 mov t[7]=r0 }
471{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
472 (p13)fcvt.fxu bj[1]=f0
473 mov ar.lc=r28 }
474{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
475 (p15)fcvt.fxu bj[0]=f0
476 mov ar.ec=1 }
477{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
478 (p13)fcvt.fxu ni6=f0
479 mov pr.rot=1<<16 }
480{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
481 (p15)fcvt.fxu ni7=f0
482 brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
483 };;
484
485// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
486// to measure with help of Interval Time Counter indicated that the
487// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
488// addressing the issue is problematic, because I don't have access
489// to platform-specific instruction-level profiler. On Itanium it
490// should run in 56*n ticks, because of higher xma latency...
491.Louter_8_ctop:
492 .pred.rel "mutex",p40,p42
493 .pred.rel "mutex",p48,p50
494{ .mfi; (p16) nop.m 0 // 0:
495 (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
496 (p40) add a3=a3,n3 } // (p17) a3+=n3
497{ .mfi; (p42) add a3=a3,n3,1
498 (p16) xma.lu alo[0]=ai0,bj[7],tf[1]
499 (p16) nop.i 0 };;
500{ .mii; (p17) getf.sig a7=alo[8] // 1:
501 (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
502 (p50) add t[6]=t[6],a3,1 };;
503{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
504 (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
505 (p40) cmp.ltu p43,p41=a3,n3 }
506{ .mfi; (p42) cmp.leu p43,p41=a3,n3
507 (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
508 (p16) nop.i 0 };;
509{ .mii; (p17) getf.sig n5=nlo[6] // 3:
510 (p48) cmp.ltu p51,p49=t[6],a3
511 (p50) cmp.leu p51,p49=t[6],a3 };;
512 .pred.rel "mutex",p41,p43
513 .pred.rel "mutex",p49,p51
514{ .mfi; (p16) nop.m 0 // 4:
515 (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
516 (p41) add a4=a4,n4 } // (p17) a4+=n4
517{ .mfi; (p43) add a4=a4,n4,1
518 (p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
519 (p16) nop.i 0 };;
520{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
521 (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
522 (p51) add t[5]=t[5],a4,1 };;
523{ .mfi; (p16) nop.m 0 // 6:
524 (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
525 (p41) cmp.ltu p42,p40=a4,n4 }
526{ .mfi; (p43) cmp.leu p42,p40=a4,n4
527 (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
528 (p16) nop.i 0 };;
529{ .mii; (p17) getf.sig n6=nlo[7] // 7:
530 (p49) cmp.ltu p50,p48=t[5],a4
531 (p51) cmp.leu p50,p48=t[5],a4 };;
532 .pred.rel "mutex",p40,p42
533 .pred.rel "mutex",p48,p50
534{ .mfi; (p16) nop.m 0 // 8:
535 (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
536 (p40) add a5=a5,n5 } // (p17) a5+=n5
537{ .mfi; (p42) add a5=a5,n5,1
538 (p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
539 (p16) nop.i 0 };;
540{ .mii; (p16) getf.sig a1=alo[1] // 9:
541 (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
542 (p50) add t[4]=t[4],a5,1 };;
543{ .mfi; (p16) nop.m 0 // 10:
544 (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
545 (p40) cmp.ltu p43,p41=a5,n5 }
546{ .mfi; (p42) cmp.leu p43,p41=a5,n5
547 (p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
548 (p16) nop.i 0 };;
549{ .mii; (p17) getf.sig n7=nlo[8] // 11:
550 (p48) cmp.ltu p51,p49=t[4],a5
551 (p50) cmp.leu p51,p49=t[4],a5 };;
552 .pred.rel "mutex",p41,p43
553 .pred.rel "mutex",p49,p51
554{ .mfi; (p17) getf.sig n8=nhi[8] // 12:
555 (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
556 (p41) add a6=a6,n6 } // (p17) a6+=n6
557{ .mfi; (p43) add a6=a6,n6,1
558 (p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
559 (p16) nop.i 0 };;
560{ .mii; (p16) getf.sig a2=alo[2] // 13:
561 (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
562 (p51) add t[3]=t[3],a6,1 };;
563{ .mfi; (p16) nop.m 0 // 14:
564 (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
565 (p41) cmp.ltu p42,p40=a6,n6 }
566{ .mfi; (p43) cmp.leu p42,p40=a6,n6
567 (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
568 (p16) nop.i 0 };;
569{ .mii; (p16) nop.m 0 // 15:
570 (p49) cmp.ltu p50,p48=t[3],a6
571 (p51) cmp.leu p50,p48=t[3],a6 };;
572 .pred.rel "mutex",p40,p42
573 .pred.rel "mutex",p48,p50
574{ .mfi; (p16) nop.m 0 // 16:
575 (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
576 (p40) add a7=a7,n7 } // (p17) a7+=n7
577{ .mfi; (p42) add a7=a7,n7,1
578 (p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
579 (p16) nop.i 0 };;
580{ .mii; (p16) getf.sig a3=alo[3] // 17:
581 (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
582 (p50) add t[2]=t[2],a7,1 };;
583{ .mfi; (p16) nop.m 0 // 18:
584 (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
585 (p40) cmp.ltu p43,p41=a7,n7 }
586{ .mfi; (p42) cmp.leu p43,p41=a7,n7
587 (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
588 (p16) nop.i 0 };;
589{ .mii; (p16) getf.sig n1=nlo[1] // 19:
590 (p48) cmp.ltu p51,p49=t[2],a7
591 (p50) cmp.leu p51,p49=t[2],a7 };;
592 .pred.rel "mutex",p41,p43
593 .pred.rel "mutex",p49,p51
594{ .mfi; (p16) nop.m 0 // 20:
595 (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
596 (p41) add a8=a8,n8 } // (p17) a8+=n8
597{ .mfi; (p43) add a8=a8,n8,1
598 (p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
599 (p16) nop.i 0 };;
600{ .mii; (p16) getf.sig a4=alo[4] // 21:
601 (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
602 (p51) add t[1]=t[1],a8,1 };;
603{ .mfi; (p16) nop.m 0 // 22:
604 (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
605 (p41) cmp.ltu p42,p40=a8,n8 }
606{ .mfi; (p43) cmp.leu p42,p40=a8,n8
607 (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
608 (p16) nop.i 0 };;
609{ .mii; (p16) getf.sig n2=nlo[2] // 23:
610 (p49) cmp.ltu p50,p48=t[1],a8
611 (p51) cmp.leu p50,p48=t[1],a8 };;
612{ .mfi; (p16) nop.m 0 // 24:
613 (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
614 (p16) add a1=a1,n1 } // (p16) a1+=n1
615{ .mfi; (p16) nop.m 0
616 (p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
617 (p17) mov t[0]=r0 };;
618{ .mii; (p16) getf.sig a5=alo[5] // 25:
619 (p16) add t0=t[7],a1 // (p16) t[7]+=a1
620 (p42) add t[0]=t[0],r0,1 };;
621{ .mfi; (p16) setf.sig tf[0]=t0 // 26:
622 (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
623 (p50) add t[0]=t[0],r0,1 }
624{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
625 (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
626 (p16) nop.i 0 };;
627{ .mii; (p16) getf.sig n3=nlo[3] // 27:
628 (p16) cmp.ltu.unc p50,p48=t0,a1
629 (p16) nop.i 0 };;
630 .pred.rel "mutex",p40,p42
631 .pred.rel "mutex",p48,p50
632{ .mfi; (p16) nop.m 0 // 28:
633 (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
634 (p40) add a2=a2,n2 } // (p16) a2+=n2
635{ .mfi; (p42) add a2=a2,n2,1
636 (p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
637 (p16) nop.i 0 };;
638{ .mii; (p16) getf.sig a6=alo[6] // 29:
639 (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
640 (p50) add t[6]=t[6],a2,1 };;
641{ .mfi; (p16) nop.m 0 // 30:
642 (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
643 (p40) cmp.ltu p41,p39=a2,n2 }
644{ .mfi; (p42) cmp.leu p41,p39=a2,n2
645 (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
646 (p16) nop.i 0 };;
647{ .mfi; (p16) getf.sig n4=nlo[4] // 31:
648 (p16) nop.f 0
649 (p48) cmp.ltu p49,p47=t[6],a2 }
650{ .mfb; (p50) cmp.leu p49,p47=t[6],a2
651 (p16) nop.f 0
652 br.ctop.sptk.many .Louter_8_ctop };;
653.Louter_8_cend:
654
655// above loop has to execute one more time, without (p16), which is
656// replaced with merged move of np[8] to GPR bank
657 .pred.rel "mutex",p40,p42
658 .pred.rel "mutex",p48,p50
659{ .mmi; (p0) getf.sig n1=ni0 // 0:
660 (p40) add a3=a3,n3 // (p17) a3+=n3
661 (p42) add a3=a3,n3,1 };;
662{ .mii; (p17) getf.sig a7=alo[8] // 1:
663 (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
664 (p50) add t[6]=t[6],a3,1 };;
665{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
666 (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
667 (p40) cmp.ltu p43,p41=a3,n3 }
668{ .mfi; (p42) cmp.leu p43,p41=a3,n3
669 (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
670 (p0) nop.i 0 };;
671{ .mii; (p17) getf.sig n5=nlo[6] // 3:
672 (p48) cmp.ltu p51,p49=t[6],a3
673 (p50) cmp.leu p51,p49=t[6],a3 };;
674 .pred.rel "mutex",p41,p43
675 .pred.rel "mutex",p49,p51
676{ .mmi; (p0) getf.sig n2=ni1 // 4:
677 (p41) add a4=a4,n4 // (p17) a4+=n4
678 (p43) add a4=a4,n4,1 };;
679{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
680 (p0) nop.f 0
681 (p51) add t[5]=t[5],a4,1 };;
682{ .mfi; (p0) getf.sig n3=ni2 // 6:
683 (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
684 (p41) cmp.ltu p42,p40=a4,n4 }
685{ .mfi; (p43) cmp.leu p42,p40=a4,n4
686 (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
687 (p0) nop.i 0 };;
688{ .mii; (p17) getf.sig n6=nlo[7] // 7:
689 (p49) cmp.ltu p50,p48=t[5],a4
690 (p51) cmp.leu p50,p48=t[5],a4 };;
691 .pred.rel "mutex",p40,p42
692 .pred.rel "mutex",p48,p50
693{ .mii; (p0) getf.sig n4=ni3 // 8:
694 (p40) add a5=a5,n5 // (p17) a5+=n5
695 (p42) add a5=a5,n5,1 };;
696{ .mii; (p0) nop.m 0 // 9:
697 (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
698 (p50) add t[4]=t[4],a5,1 };;
699{ .mii; (p0) nop.m 0 // 10:
700 (p40) cmp.ltu p43,p41=a5,n5
701 (p42) cmp.leu p43,p41=a5,n5 };;
702{ .mii; (p17) getf.sig n7=nlo[8] // 11:
703 (p48) cmp.ltu p51,p49=t[4],a5
704 (p50) cmp.leu p51,p49=t[4],a5 };;
705 .pred.rel "mutex",p41,p43
706 .pred.rel "mutex",p49,p51
707{ .mii; (p17) getf.sig n8=nhi[8] // 12:
708 (p41) add a6=a6,n6 // (p17) a6+=n6
709 (p43) add a6=a6,n6,1 };;
710{ .mii; (p0) getf.sig n5=ni4 // 13:
711 (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
712 (p51) add t[3]=t[3],a6,1 };;
713{ .mii; (p0) nop.m 0 // 14:
714 (p41) cmp.ltu p42,p40=a6,n6
715 (p43) cmp.leu p42,p40=a6,n6 };;
716{ .mii; (p0) getf.sig n6=ni5 // 15:
717 (p49) cmp.ltu p50,p48=t[3],a6
718 (p51) cmp.leu p50,p48=t[3],a6 };;
719 .pred.rel "mutex",p40,p42
720 .pred.rel "mutex",p48,p50
721{ .mii; (p0) nop.m 0 // 16:
722 (p40) add a7=a7,n7 // (p17) a7+=n7
723 (p42) add a7=a7,n7,1 };;
724{ .mii; (p0) nop.m 0 // 17:
725 (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
726 (p50) add t[2]=t[2],a7,1 };;
727{ .mii; (p0) nop.m 0 // 18:
728 (p40) cmp.ltu p43,p41=a7,n7
729 (p42) cmp.leu p43,p41=a7,n7 };;
730{ .mii; (p0) getf.sig n7=ni6 // 19:
731 (p48) cmp.ltu p51,p49=t[2],a7
732 (p50) cmp.leu p51,p49=t[2],a7 };;
733 .pred.rel "mutex",p41,p43
734 .pred.rel "mutex",p49,p51
735{ .mii; (p0) nop.m 0 // 20:
736 (p41) add a8=a8,n8 // (p17) a8+=n8
737 (p43) add a8=a8,n8,1 };;
738{ .mmi; (p0) nop.m 0 // 21:
739 (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
740 (p51) add t[1]=t[1],a8,1 }
741{ .mmi; (p17) mov t[0]=r0
742 (p41) cmp.ltu p42,p40=a8,n8
743 (p43) cmp.leu p42,p40=a8,n8 };;
744{ .mmi; (p0) getf.sig n8=ni7 // 22:
745 (p49) cmp.ltu p50,p48=t[1],a8
746 (p51) cmp.leu p50,p48=t[1],a8 }
747{ .mmi; (p42) add t[0]=t[0],r0,1
748 (p0) add r16=-7*16,prevsp
749 (p0) add r17=-6*16,prevsp };;
750
751// subtract np[8] from carrybit|tmp[8]
752// carrybit|tmp[8] layout upon exit from above loop is:
753// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
754{ .mmi; (p50)add t[0]=t[0],r0,1
755 add r18=-5*16,prevsp
756 sub n1=t0,n1 };;
757{ .mmi; cmp.gtu p34,p32=n1,t0;;
758 .pred.rel "mutex",p32,p34
759 (p32)sub n2=t[7],n2
760 (p34)sub n2=t[7],n2,1 };;
761{ .mii; (p32)cmp.gtu p35,p33=n2,t[7]
762 (p34)cmp.geu p35,p33=n2,t[7];;
763 .pred.rel "mutex",p33,p35
764 (p33)sub n3=t[6],n3 }
765{ .mmi; (p35)sub n3=t[6],n3,1;;
766 (p33)cmp.gtu p34,p32=n3,t[6]
767 (p35)cmp.geu p34,p32=n3,t[6] };;
768 .pred.rel "mutex",p32,p34
769{ .mii; (p32)sub n4=t[5],n4
770 (p34)sub n4=t[5],n4,1;;
771 (p32)cmp.gtu p35,p33=n4,t[5] }
772{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
773 .pred.rel "mutex",p33,p35
774 (p33)sub n5=t[4],n5
775 (p35)sub n5=t[4],n5,1 };;
776{ .mii; (p33)cmp.gtu p34,p32=n5,t[4]
777 (p35)cmp.geu p34,p32=n5,t[4];;
778 .pred.rel "mutex",p32,p34
779 (p32)sub n6=t[3],n6 }
780{ .mmi; (p34)sub n6=t[3],n6,1;;
781 (p32)cmp.gtu p35,p33=n6,t[3]
782 (p34)cmp.geu p35,p33=n6,t[3] };;
783 .pred.rel "mutex",p33,p35
784{ .mii; (p33)sub n7=t[2],n7
785 (p35)sub n7=t[2],n7,1;;
786 (p33)cmp.gtu p34,p32=n7,t[2] }
787{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
788 .pred.rel "mutex",p32,p34
789 (p32)sub n8=t[1],n8
790 (p34)sub n8=t[1],n8,1 };;
791{ .mii; (p32)cmp.gtu p35,p33=n8,t[1]
792 (p34)cmp.geu p35,p33=n8,t[1];;
793 .pred.rel "mutex",p33,p35
794 (p33)sub a8=t[0],r0 }
795{ .mmi; (p35)sub a8=t[0],r0,1;;
796 (p33)cmp.gtu p34,p32=a8,t[0]
797 (p35)cmp.geu p34,p32=a8,t[0] };;
798
799// save the result, either tmp[num] or tmp[num]-np[num]
800 .pred.rel "mutex",p32,p34
801{ .mmi; (p32)st8 [rptr]=n1,8
802 (p34)st8 [rptr]=t0,8
803 add r19=-4*16,prevsp};;
804{ .mmb; (p32)st8 [rptr]=n2,8
805 (p34)st8 [rptr]=t[7],8
806 (p5)br.cond.dpnt.few .Ldone };;
807{ .mmb; (p32)st8 [rptr]=n3,8
808 (p34)st8 [rptr]=t[6],8
809 (p7)br.cond.dpnt.few .Ldone };;
810{ .mmb; (p32)st8 [rptr]=n4,8
811 (p34)st8 [rptr]=t[5],8
812 (p9)br.cond.dpnt.few .Ldone };;
813{ .mmb; (p32)st8 [rptr]=n5,8
814 (p34)st8 [rptr]=t[4],8
815 (p11)br.cond.dpnt.few .Ldone };;
816{ .mmb; (p32)st8 [rptr]=n6,8
817 (p34)st8 [rptr]=t[3],8
818 (p13)br.cond.dpnt.few .Ldone };;
819{ .mmb; (p32)st8 [rptr]=n7,8
820 (p34)st8 [rptr]=t[2],8
821 (p15)br.cond.dpnt.few .Ldone };;
822{ .mmb; (p32)st8 [rptr]=n8,8
823 (p34)st8 [rptr]=t[1],8
824 nop.b 0 };;
825.Ldone: // epilogue
826{ .mmi; ldf.fill f16=[r16],64
827 ldf.fill f17=[r17],64
828 nop.i 0 }
829{ .mmi; ldf.fill f18=[r18],64
830 ldf.fill f19=[r19],64
831 mov pr=prevpr,0x1ffff };;
832{ .mmi; ldf.fill f20=[r16]
833 ldf.fill f21=[r17]
834 mov ar.lc=prevlc }
835{ .mmi; ldf.fill f22=[r18]
836 ldf.fill f23=[r19]
837 mov ret0=1 } // signal "handled"
838{ .mib; rum 1<<5
839 .restore sp
840 mov sp=prevsp
841 br.ret.sptk.many b0 };;
842.endp bn_mul_mont_8#
843
844.type copyright#,\@object
845copyright:
846stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
847___
848
849$output=shift and open STDOUT,">$output";
850print $code;
851close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/ia64.S b/src/lib/libcrypto/bn/asm/ia64.S
deleted file mode 100644
index 951abc53ea..0000000000
--- a/src/lib/libcrypto/bn/asm/ia64.S
+++ /dev/null
@@ -1,1555 +0,0 @@
1.explicit
2.text
3.ident "ia64.S, Version 2.1"
4.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
5
6//
7// ====================================================================
8// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9// project.
10//
11// Rights for redistribution and usage in source and binary forms are
12// granted according to the OpenSSL license. Warranty of any kind is
13// disclaimed.
14// ====================================================================
15//
16// Version 2.x is Itanium2 re-tune. Few words about how Itanium2 is
17// different from Itanium to this module viewpoint. Most notably, is it
18// "wider" than Itanium? Can you experience loop scalability as
19// discussed in commentary sections? Not really:-( Itanium2 has 6
20// integer ALU ports, i.e. it's 2 ports wider, but it's not enough to
21// spin twice as fast, as I need 8 IALU ports. Amount of floating point
22// ports is the same, i.e. 2, while I need 4. In other words, to this
23// module Itanium2 remains effectively as "wide" as Itanium. Yet it's
24// essentially different in respect to this module, and a re-tune was
25// required. Well, because some instruction latencies have changed. Most
26// noticeably those intensively used:
27//
28// Itanium Itanium2
29// ldf8 9 6 L2 hit
30// ld8 2 1 L1 hit
31// getf 2 5
32// xma[->getf] 7[+1] 4[+0]
33// add[->st8] 1[+1] 1[+0]
34//
35// What does it mean? You might ratiocinate that the original code
36// should run just faster... Because sum of latencies is smaller...
37// Wrong! Note that getf latency increased. This means that if a loop is
38// scheduled for lower latency (as they were), then it will suffer from
39// stall condition and the code will therefore turn anti-scalable, e.g.
40// original bn_mul_words spun at 5*n or 2.5 times slower than expected
41// on Itanium2! What to do? Reschedule loops for Itanium2? But then
42// Itanium would exhibit anti-scalability. So I've chosen to reschedule
43// for worst latency for every instruction aiming for best *all-round*
44// performance.
45
46// Q. How much faster does it get?
47// A. Here is the output from 'openssl speed rsa dsa' for vanilla
48// 0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
49// Linux 7.1 2.96-81):
50//
51// sign verify sign/s verify/s
52// rsa 512 bits 0.0036s 0.0003s 275.3 2999.2
53// rsa 1024 bits 0.0203s 0.0011s 49.3 894.1
54// rsa 2048 bits 0.1331s 0.0040s 7.5 250.9
55// rsa 4096 bits 0.9270s 0.0147s 1.1 68.1
56// sign verify sign/s verify/s
57// dsa 512 bits 0.0035s 0.0043s 288.3 234.8
58// dsa 1024 bits 0.0111s 0.0135s 90.0 74.2
59//
60// And here is similar output but for this assembler
61// implementation:-)
62//
63// sign verify sign/s verify/s
64// rsa 512 bits 0.0021s 0.0001s 549.4 9638.5
65// rsa 1024 bits 0.0055s 0.0002s 183.8 4481.1
66// rsa 2048 bits 0.0244s 0.0006s 41.4 1726.3
67// rsa 4096 bits 0.1295s 0.0018s 7.7 561.5
68// sign verify sign/s verify/s
69// dsa 512 bits 0.0012s 0.0013s 891.9 756.6
70// dsa 1024 bits 0.0023s 0.0028s 440.4 376.2
71//
72// Yes, you may argue that it's not fair comparison as it's
73// possible to craft the C implementation with BN_UMULT_HIGH
74// inline assembler macro. But of course! Here is the output
75// with the macro:
76//
77// sign verify sign/s verify/s
78// rsa 512 bits 0.0020s 0.0002s 495.0 6561.0
79// rsa 1024 bits 0.0086s 0.0004s 116.2 2235.7
80// rsa 2048 bits 0.0519s 0.0015s 19.3 667.3
81// rsa 4096 bits 0.3464s 0.0053s 2.9 187.7
82// sign verify sign/s verify/s
83// dsa 512 bits 0.0016s 0.0020s 613.1 510.5
84// dsa 1024 bits 0.0045s 0.0054s 221.0 183.9
85//
86// My code is still way faster, huh:-) And I believe that even
87// higher performance can be achieved. Note that as keys get
88// longer, performance gain is larger. Why? According to the
89// profiler there is another player in the field, namely
90// BN_from_montgomery consuming larger and larger portion of CPU
91// time as keysize decreases. I therefore consider putting effort
92// to assembler implementation of the following routine:
93//
94// void bn_mul_add_mont (BN_ULONG *rp,BN_ULONG *np,int nl,BN_ULONG n0)
95// {
96// int i,j;
97// BN_ULONG v;
98//
99// for (i=0; i<nl; i++)
100// {
101// v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
102// nrp++;
103// rp++;
104// if (((nrp[-1]+=v)&BN_MASK2) < v)
105// for (j=0; ((++nrp[j])&BN_MASK2) == 0; j++) ;
106// }
107// }
108//
109// It might as well be beneficial to implement even combaX
110// variants, as it appears as it can literally unleash the
111// performance (see comment section to bn_mul_comba8 below).
112//
113// And finally for your reference the output for 0.9.6a compiled
114// with SGIcc version 0.01.0-12 (keep in mind that for the moment
115// of this writing it's not possible to convince SGIcc to use
116// BN_UMULT_HIGH inline assembler macro, yet the code is fast,
117// i.e. for a compiler generated one:-):
118//
119// sign verify sign/s verify/s
120// rsa 512 bits 0.0022s 0.0002s 452.7 5894.3
121// rsa 1024 bits 0.0097s 0.0005s 102.7 2002.9
122// rsa 2048 bits 0.0578s 0.0017s 17.3 600.2
123// rsa 4096 bits 0.3838s 0.0061s 2.6 164.5
124// sign verify sign/s verify/s
125// dsa 512 bits 0.0018s 0.0022s 547.3 459.6
126// dsa 1024 bits 0.0051s 0.0062s 196.6 161.3
127//
128// Oh! Benchmarks were performed on 733MHz Lion-class Itanium
129// system running Redhat Linux 7.1 (very special thanks to Ray
130// McCaffity of Williams Communications for providing an account).
131//
132// Q. What's the heck with 'rum 1<<5' at the end of every function?
133// A. Well, by clearing the "upper FP registers written" bit of the
134// User Mask I want to excuse the kernel from preserving upper
135// (f32-f128) FP register bank over process context switch, thus
136// minimizing bus bandwidth consumption during the switch (i.e.
137// after PKI operation completes and the program is off doing
138// something else like bulk symmetric encryption). Having said
139// this, I also want to point out that it might be good idea
140// to compile the whole toolkit (as well as majority of the
141// programs for that matter) with -mfixed-range=f32-f127 command
142// line option. No, it doesn't prevent the compiler from writing
143// to upper bank, but at least discourages to do so. If you don't
144// like the idea you have the option to compile the module with
145// -Drum=nop.m in command line.
146//
147
148#if defined(_HPUX_SOURCE) && !defined(_LP64)
149#define ADDP addp4
150#else
151#define ADDP add
152#endif
153
154#if 1
155//
156// bn_[add|sub]_words routines.
157//
158// Loops are spinning in 2*(n+5) ticks on Itanium (provided that the
159// data reside in L1 cache, i.e. 2 ticks away). It's possible to
160// compress the epilogue and get down to 2*n+6, but at the cost of
161// scalability (the neat feature of this implementation is that it
162// shall automagically spin in n+5 on "wider" IA-64 implementations:-)
163// I consider that the epilogue is short enough as it is to trade tiny
164// performance loss on Itanium for scalability.
165//
166// BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
167//
168.global bn_add_words#
169.proc bn_add_words#
170.align 64
171.skip 32 // makes the loop body aligned at 64-byte boundary
172bn_add_words:
173 .prologue
174 .save ar.pfs,r2
175{ .mii; alloc r2=ar.pfs,4,12,0,16
176 cmp4.le p6,p0=r35,r0 };;
177{ .mfb; mov r8=r0 // return value
178(p6) br.ret.spnt.many b0 };;
179
180{ .mib; sub r10=r35,r0,1
181 .save ar.lc,r3
182 mov r3=ar.lc
183 brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16
184 }
185{ .mib; ADDP r14=0,r32 // rp
186 .save pr,r9
187 mov r9=pr };;
188 .body
189{ .mii; ADDP r15=0,r33 // ap
190 mov ar.lc=r10
191 mov ar.ec=6 }
192{ .mib; ADDP r16=0,r34 // bp
193 mov pr.rot=1<<16 };;
194
195.L_bn_add_words_ctop:
196{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
197 (p18) add r39=r37,r34
198 (p19) cmp.ltu.unc p56,p0=r40,r38 }
199{ .mfb; (p0) nop.m 0x0
200 (p0) nop.f 0x0
201 (p0) nop.b 0x0 }
202{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++)
203 (p58) cmp.eq.or p57,p0=-1,r41 // (p20)
204 (p58) add r41=1,r41 } // (p20)
205{ .mfb; (p21) st8 [r14]=r42,8 // *(rp++)=r
206 (p0) nop.f 0x0
207 br.ctop.sptk .L_bn_add_words_ctop };;
208.L_bn_add_words_cend:
209
210{ .mii;
211(p59) add r8=1,r8 // return value
212 mov pr=r9,0x1ffff
213 mov ar.lc=r3 }
214{ .mbb; nop.b 0x0
215 br.ret.sptk.many b0 };;
216.endp bn_add_words#
217
218//
219// BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
220//
221.global bn_sub_words#
222.proc bn_sub_words#
223.align 64
224.skip 32 // makes the loop body aligned at 64-byte boundary
225bn_sub_words:
226 .prologue
227 .save ar.pfs,r2
228{ .mii; alloc r2=ar.pfs,4,12,0,16
229 cmp4.le p6,p0=r35,r0 };;
230{ .mfb; mov r8=r0 // return value
231(p6) br.ret.spnt.many b0 };;
232
233{ .mib; sub r10=r35,r0,1
234 .save ar.lc,r3
235 mov r3=ar.lc
236 brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
237 }
238{ .mib; ADDP r14=0,r32 // rp
239 .save pr,r9
240 mov r9=pr };;
241 .body
242{ .mii; ADDP r15=0,r33 // ap
243 mov ar.lc=r10
244 mov ar.ec=6 }
245{ .mib; ADDP r16=0,r34 // bp
246 mov pr.rot=1<<16 };;
247
248.L_bn_sub_words_ctop:
249{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
250 (p18) sub r39=r37,r34
251 (p19) cmp.gtu.unc p56,p0=r40,r38 }
252{ .mfb; (p0) nop.m 0x0
253 (p0) nop.f 0x0
254 (p0) nop.b 0x0 }
255{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++)
256 (p58) cmp.eq.or p57,p0=0,r41 // (p20)
257 (p58) add r41=-1,r41 } // (p20)
258{ .mbb; (p21) st8 [r14]=r42,8 // *(rp++)=r
259 (p0) nop.b 0x0
260 br.ctop.sptk .L_bn_sub_words_ctop };;
261.L_bn_sub_words_cend:
262
263{ .mii;
264(p59) add r8=1,r8 // return value
265 mov pr=r9,0x1ffff
266 mov ar.lc=r3 }
267{ .mbb; nop.b 0x0
268 br.ret.sptk.many b0 };;
269.endp bn_sub_words#
270#endif
271
272#if 0
273#define XMA_TEMPTATION
274#endif
275
276#if 1
277//
278// BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
279//
280.global bn_mul_words#
281.proc bn_mul_words#
282.align 64
283.skip 32 // makes the loop body aligned at 64-byte boundary
284bn_mul_words:
285 .prologue
286 .save ar.pfs,r2
287#ifdef XMA_TEMPTATION
288{ .mfi; alloc r2=ar.pfs,4,0,0,0 };;
289#else
290{ .mfi; alloc r2=ar.pfs,4,12,0,16 };;
291#endif
292{ .mib; mov r8=r0 // return value
293 cmp4.le p6,p0=r34,r0
294(p6) br.ret.spnt.many b0 };;
295
296{ .mii; sub r10=r34,r0,1
297 .save ar.lc,r3
298 mov r3=ar.lc
299 .save pr,r9
300 mov r9=pr };;
301
302 .body
303{ .mib; setf.sig f8=r35 // w
304 mov pr.rot=0x800001<<16
305 // ------^----- serves as (p50) at first (p27)
306 brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
307 }
308
309#ifndef XMA_TEMPTATION
310
311{ .mmi; ADDP r14=0,r32 // rp
312 ADDP r15=0,r33 // ap
313 mov ar.lc=r10 }
314{ .mmi; mov r40=0 // serves as r35 at first (p27)
315 mov ar.ec=13 };;
316
317// This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
318// L2 cache (i.e. 9 ticks away) as floating point load/store instructions
319// bypass L1 cache and L2 latency is actually best-case scenario for
320// ldf8. The loop is not scalable and shall run in 2*(n+12) even on
321// "wider" IA-64 implementations. It's a trade-off here. n+24 loop
322// would give us ~5% in *overall* performance improvement on "wider"
323// IA-64, but would hurt Itanium for about same because of longer
324// epilogue. As it's a matter of few percents in either case I've
325// chosen to trade the scalability for development time (you can see
326// this very instruction sequence in bn_mul_add_words loop which in
327// turn is scalable).
328.L_bn_mul_words_ctop:
329{ .mfi; (p25) getf.sig r36=f52 // low
330 (p21) xmpy.lu f48=f37,f8
331 (p28) cmp.ltu p54,p50=r41,r39 }
332{ .mfi; (p16) ldf8 f32=[r15],8
333 (p21) xmpy.hu f40=f37,f8
334 (p0) nop.i 0x0 };;
335{ .mii; (p25) getf.sig r32=f44 // high
336 .pred.rel "mutex",p50,p54
337 (p50) add r40=r38,r35 // (p27)
338 (p54) add r40=r38,r35,1 } // (p27)
339{ .mfb; (p28) st8 [r14]=r41,8
340 (p0) nop.f 0x0
341 br.ctop.sptk .L_bn_mul_words_ctop };;
342.L_bn_mul_words_cend:
343
344{ .mii; nop.m 0x0
345.pred.rel "mutex",p51,p55
346(p51) add r8=r36,r0
347(p55) add r8=r36,r0,1 }
348{ .mfb; nop.m 0x0
349 nop.f 0x0
350 nop.b 0x0 }
351
352#else // XMA_TEMPTATION
353
354 setf.sig f37=r0 // serves as carry at (p18) tick
355 mov ar.lc=r10
356 mov ar.ec=5;;
357
358// Most of you examining this code very likely wonder why in the name
359// of Intel the following loop is commented out? Indeed, it looks so
360// neat that you find it hard to believe that it's something wrong
361// with it, right? The catch is that every iteration depends on the
362// result from previous one and the latter isn't available instantly.
363// The loop therefore spins at the latency of xma minus 1, or in other
364// words at 6*(n+4) ticks:-( Compare to the "production" loop above
365// that runs in 2*(n+11) where the low latency problem is worked around
366// by moving the dependency to one-tick latent interger ALU. Note that
367// "distance" between ldf8 and xma is not latency of ldf8, but the
368// *difference* between xma and ldf8 latencies.
369.L_bn_mul_words_ctop:
370{ .mfi; (p16) ldf8 f32=[r33],8
371 (p18) xma.hu f38=f34,f8,f39 }
372{ .mfb; (p20) stf8 [r32]=f37,8
373 (p18) xma.lu f35=f34,f8,f39
374 br.ctop.sptk .L_bn_mul_words_ctop };;
375.L_bn_mul_words_cend:
376
377 getf.sig r8=f41 // the return value
378
379#endif // XMA_TEMPTATION
380
381{ .mii; nop.m 0x0
382 mov pr=r9,0x1ffff
383 mov ar.lc=r3 }
384{ .mfb; rum 1<<5 // clear um.mfh
385 nop.f 0x0
386 br.ret.sptk.many b0 };;
387.endp bn_mul_words#
388#endif
389
390#if 1
391//
392// BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
393//
394.global bn_mul_add_words#
395.proc bn_mul_add_words#
396.align 64
397.skip 48 // makes the loop body aligned at 64-byte boundary
398bn_mul_add_words:
399 .prologue
400 .save ar.pfs,r2
401{ .mmi; alloc r2=ar.pfs,4,4,0,8
402 cmp4.le p6,p0=r34,r0
403 .save ar.lc,r3
404 mov r3=ar.lc };;
405{ .mib; mov r8=r0 // return value
406 sub r10=r34,r0,1
407(p6) br.ret.spnt.many b0 };;
408
409{ .mib; setf.sig f8=r35 // w
410 .save pr,r9
411 mov r9=pr
412 brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
413 }
414 .body
415{ .mmi; ADDP r14=0,r32 // rp
416 ADDP r15=0,r33 // ap
417 mov ar.lc=r10 }
418{ .mii; ADDP r16=0,r32 // rp copy
419 mov pr.rot=0x2001<<16
420 // ------^----- serves as (p40) at first (p27)
421 mov ar.ec=11 };;
422
423// This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
424// Itanium 2. Yes, unlike previous versions it scales:-) Previous
425// version was peforming *all* additions in IALU and was starving
426// for those even on Itanium 2. In this version one addition is
427// moved to FPU and is folded with multiplication. This is at cost
428// of propogating the result from previous call to this subroutine
429// to L2 cache... In other words negligible even for shorter keys.
430// *Overall* performance improvement [over previous version] varies
431// from 11 to 22 percent depending on key length.
432.L_bn_mul_add_words_ctop:
433.pred.rel "mutex",p40,p42
434{ .mfi; (p23) getf.sig r36=f45 // low
435 (p20) xma.lu f42=f36,f8,f50 // low
436 (p40) add r39=r39,r35 } // (p27)
437{ .mfi; (p16) ldf8 f32=[r15],8 // *(ap++)
438 (p20) xma.hu f36=f36,f8,f50 // high
439 (p42) add r39=r39,r35,1 };; // (p27)
440{ .mmi; (p24) getf.sig r32=f40 // high
441 (p16) ldf8 f46=[r16],8 // *(rp1++)
442 (p40) cmp.ltu p41,p39=r39,r35 } // (p27)
443{ .mib; (p26) st8 [r14]=r39,8 // *(rp2++)
444 (p42) cmp.leu p41,p39=r39,r35 // (p27)
445 br.ctop.sptk .L_bn_mul_add_words_ctop};;
446.L_bn_mul_add_words_cend:
447
448{ .mmi; .pred.rel "mutex",p40,p42
449(p40) add r8=r35,r0
450(p42) add r8=r35,r0,1
451 mov pr=r9,0x1ffff }
452{ .mib; rum 1<<5 // clear um.mfh
453 mov ar.lc=r3
454 br.ret.sptk.many b0 };;
455.endp bn_mul_add_words#
456#endif
457
458#if 1
459//
460// void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
461//
462.global bn_sqr_words#
463.proc bn_sqr_words#
464.align 64
465.skip 32 // makes the loop body aligned at 64-byte boundary
466bn_sqr_words:
467 .prologue
468 .save ar.pfs,r2
469{ .mii; alloc r2=ar.pfs,3,0,0,0
470 sxt4 r34=r34 };;
471{ .mii; cmp.le p6,p0=r34,r0
472 mov r8=r0 } // return value
473{ .mfb; ADDP r32=0,r32
474 nop.f 0x0
475(p6) br.ret.spnt.many b0 };;
476
477{ .mii; sub r10=r34,r0,1
478 .save ar.lc,r3
479 mov r3=ar.lc
480 .save pr,r9
481 mov r9=pr };;
482
483 .body
484{ .mib; ADDP r33=0,r33
485 mov pr.rot=1<<16
486 brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
487 }
488{ .mii; add r34=8,r32
489 mov ar.lc=r10
490 mov ar.ec=18 };;
491
492// 2*(n+17) on Itanium, (n+17) on "wider" IA-64 implementations. It's
493// possible to compress the epilogue (I'm getting tired to write this
494// comment over and over) and get down to 2*n+16 at the cost of
495// scalability. The decision will very likely be reconsidered after the
496// benchmark program is profiled. I.e. if perfomance gain on Itanium
497// will appear larger than loss on "wider" IA-64, then the loop should
498// be explicitely split and the epilogue compressed.
499.L_bn_sqr_words_ctop:
500{ .mfi; (p16) ldf8 f32=[r33],8
501 (p25) xmpy.lu f42=f41,f41
502 (p0) nop.i 0x0 }
503{ .mib; (p33) stf8 [r32]=f50,16
504 (p0) nop.i 0x0
505 (p0) nop.b 0x0 }
506{ .mfi; (p0) nop.m 0x0
507 (p25) xmpy.hu f52=f41,f41
508 (p0) nop.i 0x0 }
509{ .mib; (p33) stf8 [r34]=f60,16
510 (p0) nop.i 0x0
511 br.ctop.sptk .L_bn_sqr_words_ctop };;
512.L_bn_sqr_words_cend:
513
514{ .mii; nop.m 0x0
515 mov pr=r9,0x1ffff
516 mov ar.lc=r3 }
517{ .mfb; rum 1<<5 // clear um.mfh
518 nop.f 0x0
519 br.ret.sptk.many b0 };;
520.endp bn_sqr_words#
521#endif
522
523#if 1
524// Apparently we win nothing by implementing special bn_sqr_comba8.
525// Yes, it is possible to reduce the number of multiplications by
526// almost factor of two, but then the amount of additions would
527// increase by factor of two (as we would have to perform those
528// otherwise performed by xma ourselves). Normally we would trade
529// anyway as multiplications are way more expensive, but not this
530// time... Multiplication kernel is fully pipelined and as we drain
531// one 128-bit multiplication result per clock cycle multiplications
532// are effectively as inexpensive as additions. Special implementation
533// might become of interest for "wider" IA-64 implementation as you'll
534// be able to get through the multiplication phase faster (there won't
535// be any stall issues as discussed in the commentary section below and
536// you therefore will be able to employ all 4 FP units)... But these
537// Itanium days it's simply too hard to justify the effort so I just
538// drop down to bn_mul_comba8 code:-)
539//
540// void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
541//
542.global bn_sqr_comba8#
543.proc bn_sqr_comba8#
544.align 64
545bn_sqr_comba8:
546 .prologue
547 .save ar.pfs,r2
548#if defined(_HPUX_SOURCE) && !defined(_LP64)
549{ .mii; alloc r2=ar.pfs,2,1,0,0
550 addp4 r33=0,r33
551 addp4 r32=0,r32 };;
552{ .mii;
553#else
554{ .mii; alloc r2=ar.pfs,2,1,0,0
555#endif
556 mov r34=r33
557 add r14=8,r33 };;
558 .body
559{ .mii; add r17=8,r34
560 add r15=16,r33
561 add r18=16,r34 }
562{ .mfb; add r16=24,r33
563 br .L_cheat_entry_point8 };;
564.endp bn_sqr_comba8#
565#endif
566
567#if 1
568// I've estimated this routine to run in ~120 ticks, but in reality
569// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
570// cycles consumed for instructions fetch? Or did I misinterpret some
571// clause in Itanium µ-architecture manual? Comments are welcomed and
572// highly appreciated.
573//
574// On Itanium 2 it takes ~190 ticks. This is because of stalls on
575// result from getf.sig. I do nothing about it at this point for
576// reasons depicted below.
577//
578// However! It should be noted that even 160 ticks is darn good result
579// as it's over 10 (yes, ten, spelled as t-e-n) times faster than the
580// C version (compiled with gcc with inline assembler). I really
581// kicked compiler's butt here, didn't I? Yeah! This brings us to the
582// following statement. It's damn shame that this routine isn't called
583// very often nowadays! According to the profiler most CPU time is
584// consumed by bn_mul_add_words called from BN_from_montgomery. In
585// order to estimate what we're missing, I've compared the performance
586// of this routine against "traditional" implementation, i.e. against
587// following routine:
588//
589// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
590// { r[ 8]=bn_mul_words( &(r[0]),a,8,b[0]);
591// r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
592// r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
593// r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
594// r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
595// r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
596// r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
597// r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
598// }
599//
600// The one below is over 8 times faster than the one above:-( Even
601// more reasons to "combafy" bn_mul_add_mont...
602//
603// And yes, this routine really made me wish there were an optimizing
604// assembler! It also feels like it deserves a dedication.
605//
606// To my wife for being there and to my kids...
607//
608// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
609//
610#define carry1 r14
611#define carry2 r15
612#define carry3 r34
// 8x8-limb Comba multiplication.  All 64 partial products are formed
// on the FP side (xma.lu/xma.hu deliver the low/high 64 bits of a
// 64x64+64 multiply-add), then moved to integer registers with
// getf.sig where the column sums are completed.  carry1/carry2 count
// the cmp.ltu carry-outs of the column currently being finished;
// carry3 stages the carry count for a following column and is folded
// into carry2 at the (p7)/(p8) pairs further down.
613.global bn_mul_comba8#
614.proc bn_mul_comba8#
615.align 64
616bn_mul_comba8:
617 .prologue
618 .save ar.pfs,r2
619#if defined(_HPUX_SOURCE) && !defined(_LP64)
// HP-UX ILP32: swizzle the three 32-bit pointer args (r32=r, r33=a, r34=b).
620{ .mii; alloc r2=ar.pfs,3,0,0,0
621 addp4 r33=0,r33
622 addp4 r34=0,r34 };;
623{ .mii; addp4 r32=0,r32
624#else
625{ .mii; alloc r2=ar.pfs,3,0,0,0
626#endif
627 add r14=8,r33
628 add r17=8,r34 }
629 .body
630{ .mii; add r15=16,r33
631 add r18=16,r34
632 add r16=24,r33 }
// bn_sqr_comba8 tail-branches here with r34=r33, i.e. b==a.
633.L_cheat_entry_point8:
634{ .mmi; add r19=24,r34
635
636 ldf8 f32=[r33],32 };;
637
// Load the eight limbs of b into f120-f127 and of a into f32-f39.
638{ .mmi; ldf8 f120=[r34],32
639 ldf8 f121=[r17],32 }
640{ .mmi; ldf8 f122=[r18],32
641 ldf8 f123=[r19],32 };;
642{ .mmi; ldf8 f124=[r34]
643 ldf8 f125=[r17] }
644{ .mmi; ldf8 f126=[r18]
645 ldf8 f127=[r19] }
646
647{ .mmi; ldf8 f33=[r14],32
648 ldf8 f34=[r15],32 }
649{ .mmi; ldf8 f35=[r16],32;;
650 ldf8 f36=[r33] }
651{ .mmi; ldf8 f37=[r14]
652 ldf8 f38=[r15] }
653{ .mfi; ldf8 f39=[r16]
// -------\ Entering multiplier's heaven /-------
// ------------\ /------------
// -----------------\ /-----------------
// ----------------------\/----------------------
658 xma.hu f41=f32,f120,f0 }
659{ .mfi; xma.lu f40=f32,f120,f0 };; // (*)
660{ .mfi; xma.hu f51=f32,f121,f0 }
661{ .mfi; xma.lu f50=f32,f121,f0 };;
662{ .mfi; xma.hu f61=f32,f122,f0 }
663{ .mfi; xma.lu f60=f32,f122,f0 };;
664{ .mfi; xma.hu f71=f32,f123,f0 }
665{ .mfi; xma.lu f70=f32,f123,f0 };;
666{ .mfi; xma.hu f81=f32,f124,f0 }
667{ .mfi; xma.lu f80=f32,f124,f0 };;
668{ .mfi; xma.hu f91=f32,f125,f0 }
669{ .mfi; xma.lu f90=f32,f125,f0 };;
670{ .mfi; xma.hu f101=f32,f126,f0 }
671{ .mfi; xma.lu f100=f32,f126,f0 };;
672{ .mfi; xma.hu f111=f32,f127,f0 }
673{ .mfi; xma.lu f110=f32,f127,f0 };;//
// (*) You can argue that splitting at every second bundle would
// prevent "wider" IA-64 implementations from achieving the peak
// performance. Well, not really... The catch is that if you
// intend to keep 4 FP units busy by splitting at every fourth
// bundle and thus perform these 16 multiplications in 4 ticks,
// the first bundle *below* would stall because the result from
// the first xma bundle *above* won't be available for another 3
// ticks (if not more, being an optimist, I assume that "wider"
// implementation will have same latency:-). This stall will hold
// you back and the performance would be as if every second bundle
// were split *anyway*...
// From here each rung of the ladder retires one result word:
// getf.sig drains a finished 64-bit product into an integer
// register, adds fold it into the current column, cmp.ltu plus a
// predicated add counts the carry-out, and st8 stores the completed
// word to r[] (r32 and r33=r32+8 alternate, each advancing by 16).
685{ .mfi; getf.sig r16=f40
686 xma.hu f42=f33,f120,f41
687 add r33=8,r32 }
688{ .mfi; xma.lu f41=f33,f120,f41 };;
689{ .mfi; getf.sig r24=f50
690 xma.hu f52=f33,f121,f51 }
691{ .mfi; xma.lu f51=f33,f121,f51 };;
692{ .mfi; st8 [r32]=r16,16
693 xma.hu f62=f33,f122,f61 }
694{ .mfi; xma.lu f61=f33,f122,f61 };;
695{ .mfi; xma.hu f72=f33,f123,f71 }
696{ .mfi; xma.lu f71=f33,f123,f71 };;
697{ .mfi; xma.hu f82=f33,f124,f81 }
698{ .mfi; xma.lu f81=f33,f124,f81 };;
699{ .mfi; xma.hu f92=f33,f125,f91 }
700{ .mfi; xma.lu f91=f33,f125,f91 };;
701{ .mfi; xma.hu f102=f33,f126,f101 }
702{ .mfi; xma.lu f101=f33,f126,f101 };;
703{ .mfi; xma.hu f112=f33,f127,f111 }
704{ .mfi; xma.lu f111=f33,f127,f111 };;//
//-------------------------------------------------//
706{ .mfi; getf.sig r25=f41
707 xma.hu f43=f34,f120,f42 }
708{ .mfi; xma.lu f42=f34,f120,f42 };;
709{ .mfi; getf.sig r16=f60
710 xma.hu f53=f34,f121,f52 }
711{ .mfi; xma.lu f52=f34,f121,f52 };;
712{ .mfi; getf.sig r17=f51
713 xma.hu f63=f34,f122,f62
714 add r25=r25,r24 }
715{ .mfi; xma.lu f62=f34,f122,f62
716 mov carry1=0 };;
717{ .mfi; cmp.ltu p6,p0=r25,r24
718 xma.hu f73=f34,f123,f72 }
719{ .mfi; xma.lu f72=f34,f123,f72 };;
720{ .mfi; st8 [r33]=r25,16
721 xma.hu f83=f34,f124,f82
722(p6) add carry1=1,carry1 }
723{ .mfi; xma.lu f82=f34,f124,f82 };;
724{ .mfi; xma.hu f93=f34,f125,f92 }
725{ .mfi; xma.lu f92=f34,f125,f92 };;
726{ .mfi; xma.hu f103=f34,f126,f102 }
727{ .mfi; xma.lu f102=f34,f126,f102 };;
728{ .mfi; xma.hu f113=f34,f127,f112 }
729{ .mfi; xma.lu f112=f34,f127,f112 };;//
//-------------------------------------------------//
731{ .mfi; getf.sig r18=f42
732 xma.hu f44=f35,f120,f43
733 add r17=r17,r16 }
734{ .mfi; xma.lu f43=f35,f120,f43 };;
735{ .mfi; getf.sig r24=f70
736 xma.hu f54=f35,f121,f53 }
737{ .mfi; mov carry2=0
738 xma.lu f53=f35,f121,f53 };;
739{ .mfi; getf.sig r25=f61
740 xma.hu f64=f35,f122,f63
741 cmp.ltu p7,p0=r17,r16 }
742{ .mfi; add r18=r18,r17
743 xma.lu f63=f35,f122,f63 };;
744{ .mfi; getf.sig r26=f52
745 xma.hu f74=f35,f123,f73
746(p7) add carry2=1,carry2 }
747{ .mfi; cmp.ltu p7,p0=r18,r17
748 xma.lu f73=f35,f123,f73
749 add r18=r18,carry1 };;
750{ .mfi;
751 xma.hu f84=f35,f124,f83
752(p7) add carry2=1,carry2 }
753{ .mfi; cmp.ltu p7,p0=r18,carry1
754 xma.lu f83=f35,f124,f83 };;
755{ .mfi; st8 [r32]=r18,16
756 xma.hu f94=f35,f125,f93
757(p7) add carry2=1,carry2 }
758{ .mfi; xma.lu f93=f35,f125,f93 };;
759{ .mfi; xma.hu f104=f35,f126,f103 }
760{ .mfi; xma.lu f103=f35,f126,f103 };;
761{ .mfi; xma.hu f114=f35,f127,f113 }
762{ .mfi; mov carry1=0
763 xma.lu f113=f35,f127,f113
764 add r25=r25,r24 };;//
//-------------------------------------------------//
766{ .mfi; getf.sig r27=f43
767 xma.hu f45=f36,f120,f44
768 cmp.ltu p6,p0=r25,r24 }
769{ .mfi; xma.lu f44=f36,f120,f44
770 add r26=r26,r25 };;
771{ .mfi; getf.sig r16=f80
772 xma.hu f55=f36,f121,f54
773(p6) add carry1=1,carry1 }
774{ .mfi; xma.lu f54=f36,f121,f54 };;
775{ .mfi; getf.sig r17=f71
776 xma.hu f65=f36,f122,f64
777 cmp.ltu p6,p0=r26,r25 }
778{ .mfi; xma.lu f64=f36,f122,f64
779 add r27=r27,r26 };;
780{ .mfi; getf.sig r18=f62
781 xma.hu f75=f36,f123,f74
782(p6) add carry1=1,carry1 }
783{ .mfi; cmp.ltu p6,p0=r27,r26
784 xma.lu f74=f36,f123,f74
785 add r27=r27,carry2 };;
786{ .mfi; getf.sig r19=f53
787 xma.hu f85=f36,f124,f84
788(p6) add carry1=1,carry1 }
789{ .mfi; xma.lu f84=f36,f124,f84
790 cmp.ltu p6,p0=r27,carry2 };;
791{ .mfi; st8 [r33]=r27,16
792 xma.hu f95=f36,f125,f94
793(p6) add carry1=1,carry1 }
794{ .mfi; xma.lu f94=f36,f125,f94 };;
795{ .mfi; xma.hu f105=f36,f126,f104 }
796{ .mfi; mov carry2=0
797 xma.lu f104=f36,f126,f104
798 add r17=r17,r16 };;
799{ .mfi; xma.hu f115=f36,f127,f114
800 cmp.ltu p7,p0=r17,r16 }
801{ .mfi; xma.lu f114=f36,f127,f114
802 add r18=r18,r17 };;//
//-------------------------------------------------//
804{ .mfi; getf.sig r20=f44
805 xma.hu f46=f37,f120,f45
806(p7) add carry2=1,carry2 }
807{ .mfi; cmp.ltu p7,p0=r18,r17
808 xma.lu f45=f37,f120,f45
809 add r19=r19,r18 };;
810{ .mfi; getf.sig r24=f90
811 xma.hu f56=f37,f121,f55 }
812{ .mfi; xma.lu f55=f37,f121,f55 };;
813{ .mfi; getf.sig r25=f81
814 xma.hu f66=f37,f122,f65
815(p7) add carry2=1,carry2 }
816{ .mfi; cmp.ltu p7,p0=r19,r18
817 xma.lu f65=f37,f122,f65
818 add r20=r20,r19 };;
819{ .mfi; getf.sig r26=f72
820 xma.hu f76=f37,f123,f75
821(p7) add carry2=1,carry2 }
822{ .mfi; cmp.ltu p7,p0=r20,r19
823 xma.lu f75=f37,f123,f75
824 add r20=r20,carry1 };;
825{ .mfi; getf.sig r27=f63
826 xma.hu f86=f37,f124,f85
827(p7) add carry2=1,carry2 }
828{ .mfi; xma.lu f85=f37,f124,f85
829 cmp.ltu p7,p0=r20,carry1 };;
830{ .mfi; getf.sig r28=f54
831 xma.hu f96=f37,f125,f95
832(p7) add carry2=1,carry2 }
833{ .mfi; st8 [r32]=r20,16
834 xma.lu f95=f37,f125,f95 };;
835{ .mfi; xma.hu f106=f37,f126,f105 }
836{ .mfi; mov carry1=0
837 xma.lu f105=f37,f126,f105
838 add r25=r25,r24 };;
839{ .mfi; xma.hu f116=f37,f127,f115
840 cmp.ltu p6,p0=r25,r24 }
841{ .mfi; xma.lu f115=f37,f127,f115
842 add r26=r26,r25 };;//
//-------------------------------------------------//
844{ .mfi; getf.sig r29=f45
845 xma.hu f47=f38,f120,f46
846(p6) add carry1=1,carry1 }
847{ .mfi; cmp.ltu p6,p0=r26,r25
848 xma.lu f46=f38,f120,f46
849 add r27=r27,r26 };;
850{ .mfi; getf.sig r16=f100
851 xma.hu f57=f38,f121,f56
852(p6) add carry1=1,carry1 }
853{ .mfi; cmp.ltu p6,p0=r27,r26
854 xma.lu f56=f38,f121,f56
855 add r28=r28,r27 };;
856{ .mfi; getf.sig r17=f91
857 xma.hu f67=f38,f122,f66
858(p6) add carry1=1,carry1 }
859{ .mfi; cmp.ltu p6,p0=r28,r27
860 xma.lu f66=f38,f122,f66
861 add r29=r29,r28 };;
862{ .mfi; getf.sig r18=f82
863 xma.hu f77=f38,f123,f76
864(p6) add carry1=1,carry1 }
865{ .mfi; cmp.ltu p6,p0=r29,r28
866 xma.lu f76=f38,f123,f76
867 add r29=r29,carry2 };;
868{ .mfi; getf.sig r19=f73
869 xma.hu f87=f38,f124,f86
870(p6) add carry1=1,carry1 }
871{ .mfi; xma.lu f86=f38,f124,f86
872 cmp.ltu p6,p0=r29,carry2 };;
873{ .mfi; getf.sig r20=f64
874 xma.hu f97=f38,f125,f96
875(p6) add carry1=1,carry1 }
876{ .mfi; st8 [r33]=r29,16
877 xma.lu f96=f38,f125,f96 };;
878{ .mfi; getf.sig r21=f55
879 xma.hu f107=f38,f126,f106 }
880{ .mfi; mov carry2=0
881 xma.lu f106=f38,f126,f106
882 add r17=r17,r16 };;
883{ .mfi; xma.hu f117=f38,f127,f116
884 cmp.ltu p7,p0=r17,r16 }
885{ .mfi; xma.lu f116=f38,f127,f116
886 add r18=r18,r17 };;//
//-------------------------------------------------//
888{ .mfi; getf.sig r22=f46
889 xma.hu f48=f39,f120,f47
890(p7) add carry2=1,carry2 }
891{ .mfi; cmp.ltu p7,p0=r18,r17
892 xma.lu f47=f39,f120,f47
893 add r19=r19,r18 };;
894{ .mfi; getf.sig r24=f110
895 xma.hu f58=f39,f121,f57
896(p7) add carry2=1,carry2 }
897{ .mfi; cmp.ltu p7,p0=r19,r18
898 xma.lu f57=f39,f121,f57
899 add r20=r20,r19 };;
900{ .mfi; getf.sig r25=f101
901 xma.hu f68=f39,f122,f67
902(p7) add carry2=1,carry2 }
903{ .mfi; cmp.ltu p7,p0=r20,r19
904 xma.lu f67=f39,f122,f67
905 add r21=r21,r20 };;
906{ .mfi; getf.sig r26=f92
907 xma.hu f78=f39,f123,f77
908(p7) add carry2=1,carry2 }
909{ .mfi; cmp.ltu p7,p0=r21,r20
910 xma.lu f77=f39,f123,f77
911 add r22=r22,r21 };;
912{ .mfi; getf.sig r27=f83
913 xma.hu f88=f39,f124,f87
914(p7) add carry2=1,carry2 }
915{ .mfi; cmp.ltu p7,p0=r22,r21
916 xma.lu f87=f39,f124,f87
917 add r22=r22,carry1 };;
918{ .mfi; getf.sig r28=f74
919 xma.hu f98=f39,f125,f97
920(p7) add carry2=1,carry2 }
921{ .mfi; xma.lu f97=f39,f125,f97
922 cmp.ltu p7,p0=r22,carry1 };;
923{ .mfi; getf.sig r29=f65
924 xma.hu f108=f39,f126,f107
925(p7) add carry2=1,carry2 }
926{ .mfi; st8 [r32]=r22,16
927 xma.lu f107=f39,f126,f107 };;
928{ .mfi; getf.sig r30=f56
929 xma.hu f118=f39,f127,f117 }
930{ .mfi; xma.lu f117=f39,f127,f117 };;//
//-------------------------------------------------//
// Leaving multiplier's heaven... Quite a ride, huh?
// All 64 partial products now exist in f40-f118; what remains is the
// pure integer phase: drain them column by column, propagate carries,
// and store the high half of the result (r[8]..r[15]).
934{ .mii; getf.sig r31=f47
935 add r25=r25,r24
936 mov carry1=0 };;
937{ .mii; getf.sig r16=f111
938 cmp.ltu p6,p0=r25,r24
939 add r26=r26,r25 };;
940{ .mfb; getf.sig r17=f102 }
941{ .mii;
942(p6) add carry1=1,carry1
943 cmp.ltu p6,p0=r26,r25
944 add r27=r27,r26 };;
945{ .mfb; nop.m 0x0 }
946{ .mii;
947(p6) add carry1=1,carry1
948 cmp.ltu p6,p0=r27,r26
949 add r28=r28,r27 };;
950{ .mii; getf.sig r18=f93
951 add r17=r17,r16
952 mov carry3=0 }
953{ .mii;
954(p6) add carry1=1,carry1
955 cmp.ltu p6,p0=r28,r27
956 add r29=r29,r28 };;
957{ .mii; getf.sig r19=f84
958 cmp.ltu p7,p0=r17,r16 }
959{ .mii;
960(p6) add carry1=1,carry1
961 cmp.ltu p6,p0=r29,r28
962 add r30=r30,r29 };;
963{ .mii; getf.sig r20=f75
964 add r18=r18,r17 }
965{ .mii;
966(p6) add carry1=1,carry1
967 cmp.ltu p6,p0=r30,r29
968 add r31=r31,r30 };;
969{ .mfb; getf.sig r21=f66 }
970{ .mii; (p7) add carry3=1,carry3
971 cmp.ltu p7,p0=r18,r17
972 add r19=r19,r18 }
973{ .mfb; nop.m 0x0 }
974{ .mii;
975(p6) add carry1=1,carry1
976 cmp.ltu p6,p0=r31,r30
977 add r31=r31,carry2 };;
978{ .mfb; getf.sig r22=f57 }
979{ .mii; (p7) add carry3=1,carry3
980 cmp.ltu p7,p0=r19,r18
981 add r20=r20,r19 }
982{ .mfb; nop.m 0x0 }
983{ .mii;
984(p6) add carry1=1,carry1
985 cmp.ltu p6,p0=r31,carry2 };;
986{ .mfb; getf.sig r23=f48 }
987{ .mii; (p7) add carry3=1,carry3
988 cmp.ltu p7,p0=r20,r19
989 add r21=r21,r20 }
990{ .mii;
991(p6) add carry1=1,carry1 }
992{ .mfb; st8 [r33]=r31,16 };;
993
994{ .mfb; getf.sig r24=f112 }
995{ .mii; (p7) add carry3=1,carry3
996 cmp.ltu p7,p0=r21,r20
997 add r22=r22,r21 };;
998{ .mfb; getf.sig r25=f103 }
999{ .mii; (p7) add carry3=1,carry3
1000 cmp.ltu p7,p0=r22,r21
1001 add r23=r23,r22 };;
1002{ .mfb; getf.sig r26=f94 }
1003{ .mii; (p7) add carry3=1,carry3
1004 cmp.ltu p7,p0=r23,r22
1005 add r23=r23,carry1 };;
1006{ .mfb; getf.sig r27=f85 }
1007{ .mii; (p7) add carry3=1,carry3
1008 cmp.ltu p7,p8=r23,carry1};;
// carry3 (staged while this word was summed) becomes carry2 for the
// next column, +1 if the carry1 addition itself overflowed (p7/p8).
1009{ .mii; getf.sig r28=f76
1010 add r25=r25,r24
1011 mov carry1=0 }
1012{ .mii; st8 [r32]=r23,16
1013 (p7) add carry2=1,carry3
1014 (p8) add carry2=0,carry3 };;
1015
1016{ .mfb; nop.m 0x0 }
1017{ .mii; getf.sig r29=f67
1018 cmp.ltu p6,p0=r25,r24
1019 add r26=r26,r25 };;
1020{ .mfb; getf.sig r30=f58 }
1021{ .mii;
1022(p6) add carry1=1,carry1
1023 cmp.ltu p6,p0=r26,r25
1024 add r27=r27,r26 };;
1025{ .mfb; getf.sig r16=f113 }
1026{ .mii;
1027(p6) add carry1=1,carry1
1028 cmp.ltu p6,p0=r27,r26
1029 add r28=r28,r27 };;
1030{ .mfb; getf.sig r17=f104 }
1031{ .mii;
1032(p6) add carry1=1,carry1
1033 cmp.ltu p6,p0=r28,r27
1034 add r29=r29,r28 };;
1035{ .mfb; getf.sig r18=f95 }
1036{ .mii;
1037(p6) add carry1=1,carry1
1038 cmp.ltu p6,p0=r29,r28
1039 add r30=r30,r29 };;
1040{ .mii; getf.sig r19=f86
1041 add r17=r17,r16
1042 mov carry3=0 }
1043{ .mii;
1044(p6) add carry1=1,carry1
1045 cmp.ltu p6,p0=r30,r29
1046 add r30=r30,carry2 };;
1047{ .mii; getf.sig r20=f77
1048 cmp.ltu p7,p0=r17,r16
1049 add r18=r18,r17 }
1050{ .mii;
1051(p6) add carry1=1,carry1
1052 cmp.ltu p6,p0=r30,carry2 };;
1053{ .mfb; getf.sig r21=f68 }
1054{ .mii; st8 [r33]=r30,16
1055(p6) add carry1=1,carry1 };;
1056
1057{ .mfb; getf.sig r24=f114 }
1058{ .mii; (p7) add carry3=1,carry3
1059 cmp.ltu p7,p0=r18,r17
1060 add r19=r19,r18 };;
1061{ .mfb; getf.sig r25=f105 }
1062{ .mii; (p7) add carry3=1,carry3
1063 cmp.ltu p7,p0=r19,r18
1064 add r20=r20,r19 };;
1065{ .mfb; getf.sig r26=f96 }
1066{ .mii; (p7) add carry3=1,carry3
1067 cmp.ltu p7,p0=r20,r19
1068 add r21=r21,r20 };;
1069{ .mfb; getf.sig r27=f87 }
1070{ .mii; (p7) add carry3=1,carry3
1071 cmp.ltu p7,p0=r21,r20
1072 add r21=r21,carry1 };;
1073{ .mib; getf.sig r28=f78
1074 add r25=r25,r24 }
1075{ .mib; (p7) add carry3=1,carry3
1076 cmp.ltu p7,p8=r21,carry1};;
1077{ .mii; st8 [r32]=r21,16
1078 (p7) add carry2=1,carry3
1079 (p8) add carry2=0,carry3 }
1080
1081{ .mii; mov carry1=0
1082 cmp.ltu p6,p0=r25,r24
1083 add r26=r26,r25 };;
1084{ .mfb; getf.sig r16=f115 }
1085{ .mii;
1086(p6) add carry1=1,carry1
1087 cmp.ltu p6,p0=r26,r25
1088 add r27=r27,r26 };;
1089{ .mfb; getf.sig r17=f106 }
1090{ .mii;
1091(p6) add carry1=1,carry1
1092 cmp.ltu p6,p0=r27,r26
1093 add r28=r28,r27 };;
1094{ .mfb; getf.sig r18=f97 }
1095{ .mii;
1096(p6) add carry1=1,carry1
1097 cmp.ltu p6,p0=r28,r27
1098 add r28=r28,carry2 };;
1099{ .mib; getf.sig r19=f88
1100 add r17=r17,r16 }
1101{ .mib;
1102(p6) add carry1=1,carry1
1103 cmp.ltu p6,p0=r28,carry2 };;
1104{ .mii; st8 [r33]=r28,16
1105(p6) add carry1=1,carry1 }
1106
1107{ .mii; mov carry2=0
1108 cmp.ltu p7,p0=r17,r16
1109 add r18=r18,r17 };;
1110{ .mfb; getf.sig r24=f116 }
1111{ .mii; (p7) add carry2=1,carry2
1112 cmp.ltu p7,p0=r18,r17
1113 add r19=r19,r18 };;
1114{ .mfb; getf.sig r25=f107 }
1115{ .mii; (p7) add carry2=1,carry2
1116 cmp.ltu p7,p0=r19,r18
1117 add r19=r19,carry1 };;
1118{ .mfb; getf.sig r26=f98 }
1119{ .mii; (p7) add carry2=1,carry2
1120 cmp.ltu p7,p0=r19,carry1};;
1121{ .mii; st8 [r32]=r19,16
1122 (p7) add carry2=1,carry2 }
1123
1124{ .mfb; add r25=r25,r24 };;
1125
1126{ .mfb; getf.sig r16=f117 }
1127{ .mii; mov carry1=0
1128 cmp.ltu p6,p0=r25,r24
1129 add r26=r26,r25 };;
1130{ .mfb; getf.sig r17=f108 }
1131{ .mii;
1132(p6) add carry1=1,carry1
1133 cmp.ltu p6,p0=r26,r25
1134 add r26=r26,carry2 };;
1135{ .mfb; nop.m 0x0 }
1136{ .mii;
1137(p6) add carry1=1,carry1
1138 cmp.ltu p6,p0=r26,carry2 };;
1139{ .mii; st8 [r33]=r26,16
1140(p6) add carry1=1,carry1 }
1141
1142{ .mfb; add r17=r17,r16 };;
1143{ .mfb; getf.sig r24=f118 }
1144{ .mii; mov carry2=0
1145 cmp.ltu p7,p0=r17,r16
1146 add r17=r17,carry1 };;
1147{ .mii; (p7) add carry2=1,carry2
1148 cmp.ltu p7,p0=r17,carry1};;
1149{ .mii; st8 [r32]=r17
1150 (p7) add carry2=1,carry2 };;
// Top word r[15]: highest partial product plus the final carry.
1151{ .mfb; add r24=r24,carry2 };;
1152{ .mib; st8 [r33]=r24 }
1153
1154{ .mib; rum 1<<5 // clear um.mfh
1155 br.ret.sptk.many b0 };;
1156.endp bn_mul_comba8#
1157#undef carry3
1158#undef carry2
1159#undef carry1
1160#endif
1161
1162#if 1
1163// It's possible to make it faster (see comment to bn_sqr_comba8), but
1164// I reckon it doesn't worth the effort. Basically because the routine
1165// (actually both of them) practically never called... So I just play
1166// same trick as with bn_sqr_comba8.
1167//
1168// void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
1169//
// Same "cheat" as bn_sqr_comba8: set b := a (mov r34=r33), build the
// limb pointers bn_mul_comba4 expects, and branch into its body, so
// bn_sqr_comba4(r,a) == bn_mul_comba4(r,a,a).
1170.global bn_sqr_comba4#
1171.proc bn_sqr_comba4#
1172.align 64
1173bn_sqr_comba4:
1174 .prologue
1175 .save ar.pfs,r2
1176#if defined(_HPUX_SOURCE) && !defined(_LP64)
// HP-UX ILP32: swizzle the two 32-bit pointer arguments (r32=r, r33=a).
1177{ .mii; alloc r2=ar.pfs,2,1,0,0
1178 addp4 r32=0,r32
1179 addp4 r33=0,r33 };;
1180{ .mii;
1181#else
1182{ .mii; alloc r2=ar.pfs,2,1,0,0
1183#endif
1184 mov r34=r33
1185 add r14=8,r33 };;
1186 .body
1187{ .mii; add r17=8,r34
1188 add r15=16,r33
1189 add r18=16,r34 }
1190{ .mfb; add r16=24,r33
1191 br .L_cheat_entry_point4 };;
1192.endp bn_sqr_comba4#
1193#endif
1194
1195#if 1
1196// Runs in ~115 cycles and ~4.5 times faster than C. Well, whatever...
1197//
1198// void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1199//
1200#define carry1 r14
1201#define carry2 r15
// 4x4-limb Comba multiplication: the same FP-multiply / integer-carry
// scheme as bn_mul_comba8 above, just with 16 partial products
// (xma.lu/xma.hu form them, getf.sig drains them, cmp.ltu plus
// predicated adds into carry1/carry2 propagate the carries, st8
// stores each finished word of r[0..7]).
1202.global bn_mul_comba4#
1203.proc bn_mul_comba4#
1204.align 64
1205bn_mul_comba4:
1206 .prologue
1207 .save ar.pfs,r2
1208#if defined(_HPUX_SOURCE) && !defined(_LP64)
// HP-UX ILP32: swizzle the three 32-bit pointer args (r32=r, r33=a, r34=b).
1209{ .mii; alloc r2=ar.pfs,3,0,0,0
1210 addp4 r33=0,r33
1211 addp4 r34=0,r34 };;
1212{ .mii; addp4 r32=0,r32
1213#else
1214{ .mii; alloc r2=ar.pfs,3,0,0,0
1215#endif
1216 add r14=8,r33
1217 add r17=8,r34 }
1218 .body
1219{ .mii; add r15=16,r33
1220 add r18=16,r34
1221 add r16=24,r33 };;
// bn_sqr_comba4 tail-branches here with r34=r33, i.e. b==a.
1222.L_cheat_entry_point4:
1223{ .mmi; add r19=24,r34
1224
1225 ldf8 f32=[r33] }
1226
// Load the four limbs of b into f120-f123 and of a into f32-f35.
1227{ .mmi; ldf8 f120=[r34]
1228 ldf8 f121=[r17] };;
1229{ .mmi; ldf8 f122=[r18]
1230 ldf8 f123=[r19] }
1231
1232{ .mmi; ldf8 f33=[r14]
1233 ldf8 f34=[r15] }
1234{ .mfi; ldf8 f35=[r16]
1235
1236 xma.hu f41=f32,f120,f0 }
1237{ .mfi; xma.lu f40=f32,f120,f0 };;
1238{ .mfi; xma.hu f51=f32,f121,f0 }
1239{ .mfi; xma.lu f50=f32,f121,f0 };;
1240{ .mfi; xma.hu f61=f32,f122,f0 }
1241{ .mfi; xma.lu f60=f32,f122,f0 };;
1242{ .mfi; xma.hu f71=f32,f123,f0 }
1243{ .mfi; xma.lu f70=f32,f123,f0 };;//
// Major stall takes place here, and 3 more places below. Result from
// first xma is not available for another 3 ticks.
1246{ .mfi; getf.sig r16=f40
1247 xma.hu f42=f33,f120,f41
1248 add r33=8,r32 }
1249{ .mfi; xma.lu f41=f33,f120,f41 };;
1250{ .mfi; getf.sig r24=f50
1251 xma.hu f52=f33,f121,f51 }
1252{ .mfi; xma.lu f51=f33,f121,f51 };;
1253{ .mfi; st8 [r32]=r16,16
1254 xma.hu f62=f33,f122,f61 }
1255{ .mfi; xma.lu f61=f33,f122,f61 };;
1256{ .mfi; xma.hu f72=f33,f123,f71 }
1257{ .mfi; xma.lu f71=f33,f123,f71 };;//
//-------------------------------------------------//
1259{ .mfi; getf.sig r25=f41
1260 xma.hu f43=f34,f120,f42 }
1261{ .mfi; xma.lu f42=f34,f120,f42 };;
1262{ .mfi; getf.sig r16=f60
1263 xma.hu f53=f34,f121,f52 }
1264{ .mfi; xma.lu f52=f34,f121,f52 };;
1265{ .mfi; getf.sig r17=f51
1266 xma.hu f63=f34,f122,f62
1267 add r25=r25,r24 }
1268{ .mfi; mov carry1=0
1269 xma.lu f62=f34,f122,f62 };;
1270{ .mfi; st8 [r33]=r25,16
1271 xma.hu f73=f34,f123,f72
1272 cmp.ltu p6,p0=r25,r24 }
1273{ .mfi; xma.lu f72=f34,f123,f72 };;//
//-------------------------------------------------//
1275{ .mfi; getf.sig r18=f42
1276 xma.hu f44=f35,f120,f43
1277(p6) add carry1=1,carry1 }
1278{ .mfi; add r17=r17,r16
1279 xma.lu f43=f35,f120,f43
1280 mov carry2=0 };;
1281{ .mfi; getf.sig r24=f70
1282 xma.hu f54=f35,f121,f53
1283 cmp.ltu p7,p0=r17,r16 }
1284{ .mfi; xma.lu f53=f35,f121,f53 };;
1285{ .mfi; getf.sig r25=f61
1286 xma.hu f64=f35,f122,f63
1287 add r18=r18,r17 }
1288{ .mfi; xma.lu f63=f35,f122,f63
1289(p7) add carry2=1,carry2 };;
1290{ .mfi; getf.sig r26=f52
1291 xma.hu f74=f35,f123,f73
1292 cmp.ltu p7,p0=r18,r17 }
1293{ .mfi; xma.lu f73=f35,f123,f73
1294 add r18=r18,carry1 };;
//-------------------------------------------------//
1296{ .mii; st8 [r32]=r18,16
1297(p7) add carry2=1,carry2
1298 cmp.ltu p7,p0=r18,carry1 };;
1299
// All products are now in flight; the remainder of the routine is the
// pure integer drain-and-carry phase for r[3]..r[7].
1300{ .mfi; getf.sig r27=f43 // last major stall
1301(p7) add carry2=1,carry2 };;
1302{ .mii; getf.sig r16=f71
1303 add r25=r25,r24
1304 mov carry1=0 };;
1305{ .mii; getf.sig r17=f62
1306 cmp.ltu p6,p0=r25,r24
1307 add r26=r26,r25 };;
1308{ .mii;
1309(p6) add carry1=1,carry1
1310 cmp.ltu p6,p0=r26,r25
1311 add r27=r27,r26 };;
1312{ .mii;
1313(p6) add carry1=1,carry1
1314 cmp.ltu p6,p0=r27,r26
1315 add r27=r27,carry2 };;
1316{ .mii; getf.sig r18=f53
1317(p6) add carry1=1,carry1
1318 cmp.ltu p6,p0=r27,carry2 };;
1319{ .mfi; st8 [r33]=r27,16
1320(p6) add carry1=1,carry1 }
1321
1322{ .mii; getf.sig r19=f44
1323 add r17=r17,r16
1324 mov carry2=0 };;
1325{ .mii; getf.sig r24=f72
1326 cmp.ltu p7,p0=r17,r16
1327 add r18=r18,r17 };;
1328{ .mii; (p7) add carry2=1,carry2
1329 cmp.ltu p7,p0=r18,r17
1330 add r19=r19,r18 };;
1331{ .mii; (p7) add carry2=1,carry2
1332 cmp.ltu p7,p0=r19,r18
1333 add r19=r19,carry1 };;
1334{ .mii; getf.sig r25=f63
1335 (p7) add carry2=1,carry2
1336 cmp.ltu p7,p0=r19,carry1};;
1337{ .mii; st8 [r32]=r19,16
1338 (p7) add carry2=1,carry2 }
1339
1340{ .mii; getf.sig r26=f54
1341 add r25=r25,r24
1342 mov carry1=0 };;
1343{ .mii; getf.sig r16=f73
1344 cmp.ltu p6,p0=r25,r24
1345 add r26=r26,r25 };;
1346{ .mii;
1347(p6) add carry1=1,carry1
1348 cmp.ltu p6,p0=r26,r25
1349 add r26=r26,carry2 };;
1350{ .mii; getf.sig r17=f64
1351(p6) add carry1=1,carry1
1352 cmp.ltu p6,p0=r26,carry2 };;
1353{ .mii; st8 [r33]=r26,16
1354(p6) add carry1=1,carry1 }
1355
1356{ .mii; getf.sig r24=f74
1357 add r17=r17,r16
1358 mov carry2=0 };;
1359{ .mii; cmp.ltu p7,p0=r17,r16
1360 add r17=r17,carry1 };;
1361
1362{ .mii; (p7) add carry2=1,carry2
1363 cmp.ltu p7,p0=r17,carry1};;
1364{ .mii; st8 [r32]=r17,16
1365 (p7) add carry2=1,carry2 };;
1366
// Top word r[7]: highest partial product plus the final carry.
1367{ .mii; add r24=r24,carry2 };;
1368{ .mii; st8 [r33]=r24 }
1369
1370{ .mib; rum 1<<5 // clear um.mfh
1371 br.ret.sptk.many b0 };;
1372.endp bn_mul_comba4#
1373#undef carry2
1374#undef carry1
1375#endif
1376
1377#if 1
1378//
1379// BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
1380//
1381// In the nutshell it's a port of my MIPS III/IV implementation.
1382//
// bn_div_words(h, l, d): returns in r8 the 64-bit quotient of the
// 128-bit value h:l divided by d, assembled from two 32-bit halves,
// each produced by the FP reciprocal-approximation kernel
// .L_udiv64_32_b6 below.  The 64-bit remainder is additionally left
// in r9.  If d == 0 the routine returns -1; if the quotient would not
// fit in 64 bits (high bits survive the normalizing shift of h)
// abort() is called.
1383#define AT r14
1384#define H r16
1385#define HH r20
1386#define L r17
1387#define D r18
1388#define DH r22
1389#define I r21
1390
1391#if 0
// Some preprocessors (most notably HP-UX) appear to be allergic to
// macros enclosed to parenthesis [as these three were].
1394#define cont p16
1395#define break p0 // p20
1396#define equ p24
1397#else
1398cont=p16
1399break=p0
1400equ=p24
1401#endif
1402
1403.global abort#
1404.global bn_div_words#
1405.proc bn_div_words#
1406.align 64
1407bn_div_words:
1408 .prologue
1409 .save ar.pfs,r2
1410{ .mii; alloc r2=ar.pfs,3,5,0,8
1411 .save b0,r3
1412 mov r3=b0
1413 .save pr,r10
1414 mov r10=pr };;
// d == 0: bail out with r8 = -1.
1415{ .mmb; cmp.eq p6,p0=r34,r0
1416 mov r8=-1
1417(p6) br.ret.spnt.many b0 };;
1418
1419 .body
1420{ .mii; mov H=r32 // save h
1421 mov ar.ec=0 // don't rotate at exit
1422 mov pr.rot=0 }
1423{ .mii; mov L=r33 // save l
1424 mov r36=r0 };;
1425
// Normalization: a rotating-register br.wtop loop that doubles d and
// counts iterations until d's top (sign) bit is set; the count lands
// in I and h:l is shifted left by the same amount below.
.L_divw_shift: // -vv- note signed comparison
1427{ .mfi; (p0) cmp.lt p16,p0=r0,r34 // d
1428 (p0) shladd r33=r34,1,r0 }
1429{ .mfb; (p0) add r35=1,r36
1430 (p0) nop.f 0x0
1431(p16) br.wtop.dpnt .L_divw_shift };;
1432
1433{ .mii; mov D=r34
1434 shr.u DH=r34,32
1435 sub r35=64,r36 };;
1436{ .mii; setf.sig f7=DH
1437 shr.u AT=H,r35
1438 mov I=r36 };;
1439{ .mib; cmp.ne p6,p0=r0,AT
1440 shl H=H,r36
1441(p6) br.call.spnt.clr b0=abort };; // overflow, die...
1442
1443{ .mfi; fcvt.xuf.s1 f7=f7
1444 shr.u AT=L,r35 };;
1445{ .mii; shl L=L,r36
1446 or H=H,AT };;
1447
1448{ .mii; nop.m 0x0
1449 cmp.leu p6,p0=D,H;;
1450(p6) sub H=H,D }
1451
1452{ .mlx; setf.sig f14=D
1453 movl AT=0xffffffff };;
///////////////////////////////////////////////////////////
// First 32-bit quotient digit.  If HH == DH the true digit is
// 0xffffffff (p6 path); otherwise call the FP division kernel.
1455{ .mii; setf.sig f6=H
1456 shr.u HH=H,32;;
1457 cmp.eq p6,p7=HH,DH };;
1458{ .mfb;
1459(p6) setf.sig f8=AT
1460(p7) fcvt.xuf.s1 f6=f6
1461(p7) br.call.sptk b6=.L_udiv64_32_b6 };;
1462
1463{ .mfi; getf.sig r33=f8 // q
1464 xmpy.lu f9=f8,f14 }
1465{ .mfi; xmpy.hu f10=f8,f14
1466 shrp H=H,L,32 };;
1467
1468{ .mmi; getf.sig r35=f9 // tl
1469 getf.sig r31=f10 };; // th
1470
// Correction loop: step the trial quotient down (r32=r33-1 rotates
// into r33 via br.wtop) while q*D still exceeds the partial remainder.
.L_divw_1st_iter:
1472{ .mii; (p0) add r32=-1,r33
1473 (p0) cmp.eq equ,cont=HH,r31 };;
1474{ .mii; (p0) cmp.ltu p8,p0=r35,D
1475 (p0) sub r34=r35,D
1476 (equ) cmp.leu break,cont=r35,H };;
1477{ .mib; (cont) cmp.leu cont,break=HH,r31
1478 (p8) add r31=-1,r31
1479(cont) br.wtop.spnt .L_divw_1st_iter };;
///////////////////////////////////////////////////////////
// Place the first digit in the high half of the result.
1481{ .mii; sub H=H,r35
1482 shl r8=r33,32
1483 shl L=L,32 };;
///////////////////////////////////////////////////////////
// Second 32-bit quotient digit: same sequence as above.
1485{ .mii; setf.sig f6=H
1486 shr.u HH=H,32;;
1487 cmp.eq p6,p7=HH,DH };;
1488{ .mfb;
1489(p6) setf.sig f8=AT
1490(p7) fcvt.xuf.s1 f6=f6
1491(p7) br.call.sptk b6=.L_udiv64_32_b6 };;
1492
1493{ .mfi; getf.sig r33=f8 // q
1494 xmpy.lu f9=f8,f14 }
1495{ .mfi; xmpy.hu f10=f8,f14
1496 shrp H=H,L,32 };;
1497
1498{ .mmi; getf.sig r35=f9 // tl
1499 getf.sig r31=f10 };; // th
1500
.L_divw_2nd_iter:
1502{ .mii; (p0) add r32=-1,r33
1503 (p0) cmp.eq equ,cont=HH,r31 };;
1504{ .mii; (p0) cmp.ltu p8,p0=r35,D
1505 (p0) sub r34=r35,D
1506 (equ) cmp.leu break,cont=r35,H };;
1507{ .mib; (cont) cmp.leu cont,break=HH,r31
1508 (p8) add r31=-1,r31
1509(cont) br.wtop.spnt .L_divw_2nd_iter };;
///////////////////////////////////////////////////////////
// Combine the two digits, restore ar.pfs/pr, and return.
1511{ .mii; sub H=H,r35
1512 or r8=r8,r33
1513 mov ar.pfs=r2 };;
1514{ .mii; shr.u r9=H,I // remainder if anybody wants it
1515 mov pr=r10,0x1ffff }
1516{ .mfb; br.ret.sptk.many b0 };;
1517
// Unsigned 64 by 32 (well, by 64 for the moment) bit integer division
// procedure.
//
// inputs: f6 = (double)a, f7 = (double)b
// output: f8 = (int)(a/b)
// clobbered: f8,f9,f10,f11,pred
pred=p15
// One can argue that this snippet is copyrighted to Intel
// Corporation, as it's essentially identical to one of those
// found in "Divide, Square Root and Remainder" section at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
// Yes, I admit that the referred code was used as template,
// but after I realized that there hardly is any other instruction
// sequence which would perform this operation. I mean I figure that
// any independent attempt to implement high-performance division
// will result in code virtually identical to the Intel code. It
// should be noted though that below division kernel is 1 cycle
// faster than Intel one (note commented splits:-), not to mention
// original prologue (rather lack of one) and epilogue.
1537.align 32
1538.skip 16
// Newton-Raphson refinement of frcpa's initial 1/b approximation,
// converging the quotient to full integer precision before truncation.
.L_udiv64_32_b6:
1540 frcpa.s1 f8,pred=f6,f7;; // [0] y0 = 1 / b
1541
1542(pred) fnma.s1 f9=f7,f8,f1 // [5] e0 = 1 - b * y0
1543(pred) fmpy.s1 f10=f6,f8;; // [5] q0 = a * y0
1544(pred) fmpy.s1 f11=f9,f9 // [10] e1 = e0 * e0
1545(pred) fma.s1 f10=f9,f10,f10;; // [10] q1 = q0 + e0 * q0
1546(pred) fma.s1 f8=f9,f8,f8 //;; // [15] y1 = y0 + e0 * y0
1547(pred) fma.s1 f9=f11,f10,f10;; // [15] q2 = q1 + e1 * q1
1548(pred) fma.s1 f8=f11,f8,f8 //;; // [20] y2 = y1 + e1 * y1
1549(pred) fnma.s1 f10=f7,f9,f6;; // [20] r2 = a - b * q2
1550(pred) fma.s1 f8=f10,f8,f9;; // [25] q3 = q2 + r2 * y2
1551
1552 fcvt.fxu.trunc.s1 f8=f8 // [30] q = trunc(q3)
1553 br.ret.sptk.many b6;;
1554.endp bn_div_words#
1555#endif
diff --git a/src/lib/libcrypto/bn/asm/mips-mont.pl b/src/lib/libcrypto/bn/asm/mips-mont.pl
deleted file mode 100644
index b944a12b8e..0000000000
--- a/src/lib/libcrypto/bn/asm/mips-mont.pl
+++ /dev/null
@@ -1,426 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# This module doesn't present direct interest for OpenSSL, because it
11# doesn't provide better performance for longer keys, at least not on
12# in-order-execution cores. While 512-bit RSA sign operations can be
13# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
14# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
15# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
16# verify:-( All comparisons are against bn_mul_mont-free assembler.
17# The module might be of interest to embedded system developers, as
18# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
19# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
20# code.
21
22######################################################################
23# There is a number of MIPS ABI in use, O32 and N32/64 are most
24# widely used. Then there is a new contender: NUBI. It appears that if
25# one picks the latter, it's possible to arrange code in ABI neutral
26# manner. Therefore let's stick to NUBI register layout:
27#
28($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
29($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
30($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
31($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
32#
33# The return value is placed in $a0. Following coding rules facilitate
34# interoperability:
35#
36# - never ever touch $tp, "thread pointer", former $gp;
37# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
38# old code];
39# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
40#
41# For reference here is register layout for N32/64 MIPS ABIs:
42#
43# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
44# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
45# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
46# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
47# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
48#
49$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
50
51if ($flavour =~ /64|n32/i) {
52 $PTR_ADD="dadd"; # incidentally works even on n32
53 $PTR_SUB="dsub"; # incidentally works even on n32
54 $REG_S="sd";
55 $REG_L="ld";
56 $SZREG=8;
57} else {
58 $PTR_ADD="add";
59 $PTR_SUB="sub";
60 $REG_S="sw";
61 $REG_L="lw";
62 $SZREG=4;
63}
64$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
65#
66# <appro@openssl.org>
67#
68######################################################################
69
70while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
71open STDOUT,">$output";
72
73if ($flavour =~ /64|n32/i) {
74 $LD="ld";
75 $ST="sd";
76 $MULTU="dmultu";
77 $ADDU="daddu";
78 $SUBU="dsubu";
79 $BNSZ=8;
80} else {
81 $LD="lw";
82 $ST="sw";
83 $MULTU="multu";
84 $ADDU="addu";
85 $SUBU="subu";
86 $BNSZ=4;
87}
88
89# int bn_mul_mont(
90$rp=$a0; # BN_ULONG *rp,
91$ap=$a1; # const BN_ULONG *ap,
92$bp=$a2; # const BN_ULONG *bp,
93$np=$a3; # const BN_ULONG *np,
94$n0=$a4; # const BN_ULONG *n0,
95$num=$a5; # int num);
96
97$lo0=$a6;
98$hi0=$a7;
99$lo1=$t1;
100$hi1=$t2;
101$aj=$s0;
102$bi=$s1;
103$nj=$s2;
104$tp=$s3;
105$alo=$s4;
106$ahi=$s5;
107$nlo=$s6;
108$nhi=$s7;
109$tj=$s8;
110$i=$s9;
111$j=$s10;
112$m1=$s11;
113
114$FRAMESIZE=14;
115
116$code=<<___;
117.text
118
119.set noat
120.set noreorder
121
122.align 5
123.globl bn_mul_mont
124.ent bn_mul_mont
125bn_mul_mont:
126___
127$code.=<<___ if ($flavour =~ /o32/i);
128 lw $n0,16($sp)
129 lw $num,20($sp)
130___
131$code.=<<___;
132 slt $at,$num,4
133 bnez $at,1f
134 li $t0,0
135 slt $at,$num,17 # on in-order CPU
136 bnezl $at,bn_mul_mont_internal
137 nop
1381: jr $ra
139 li $a0,0
140.end bn_mul_mont
141
142.align 5
143.ent bn_mul_mont_internal
144bn_mul_mont_internal:
145 .frame $fp,$FRAMESIZE*$SZREG,$ra
146 .mask 0x40000000|$SAVED_REGS_MASK,-$SZREG
147 $PTR_SUB $sp,$FRAMESIZE*$SZREG
148 $REG_S $fp,($FRAMESIZE-1)*$SZREG($sp)
149 $REG_S $s11,($FRAMESIZE-2)*$SZREG($sp)
150 $REG_S $s10,($FRAMESIZE-3)*$SZREG($sp)
151 $REG_S $s9,($FRAMESIZE-4)*$SZREG($sp)
152 $REG_S $s8,($FRAMESIZE-5)*$SZREG($sp)
153 $REG_S $s7,($FRAMESIZE-6)*$SZREG($sp)
154 $REG_S $s6,($FRAMESIZE-7)*$SZREG($sp)
155 $REG_S $s5,($FRAMESIZE-8)*$SZREG($sp)
156 $REG_S $s4,($FRAMESIZE-9)*$SZREG($sp)
157___
158$code.=<<___ if ($flavour =~ /nubi/i);
159 $REG_S $s3,($FRAMESIZE-10)*$SZREG($sp)
160 $REG_S $s2,($FRAMESIZE-11)*$SZREG($sp)
161 $REG_S $s1,($FRAMESIZE-12)*$SZREG($sp)
162 $REG_S $s0,($FRAMESIZE-13)*$SZREG($sp)
163___
164$code.=<<___;
165 move $fp,$sp
166
167 .set reorder
168 $LD $n0,0($n0)
169 $LD $bi,0($bp) # bp[0]
170 $LD $aj,0($ap) # ap[0]
171 $LD $nj,0($np) # np[0]
172
173 $PTR_SUB $sp,2*$BNSZ # place for two extra words
174 sll $num,`log($BNSZ)/log(2)`
175 li $at,-4096
176 $PTR_SUB $sp,$num
177 and $sp,$at
178
179 $MULTU $aj,$bi
180 $LD $alo,$BNSZ($ap)
181 $LD $nlo,$BNSZ($np)
182 mflo $lo0
183 mfhi $hi0
184 $MULTU $lo0,$n0
185 mflo $m1
186
187 $MULTU $alo,$bi
188 mflo $alo
189 mfhi $ahi
190
191 $MULTU $nj,$m1
192 mflo $lo1
193 mfhi $hi1
194 $MULTU $nlo,$m1
195 $ADDU $lo1,$lo0
196 sltu $at,$lo1,$lo0
197 $ADDU $hi1,$at
198 mflo $nlo
199 mfhi $nhi
200
201 move $tp,$sp
202 li $j,2*$BNSZ
203.align 4
204.L1st:
205 .set noreorder
206 $PTR_ADD $aj,$ap,$j
207 $PTR_ADD $nj,$np,$j
208 $LD $aj,($aj)
209 $LD $nj,($nj)
210
211 $MULTU $aj,$bi
212 $ADDU $lo0,$alo,$hi0
213 $ADDU $lo1,$nlo,$hi1
214 sltu $at,$lo0,$hi0
215 sltu $t0,$lo1,$hi1
216 $ADDU $hi0,$ahi,$at
217 $ADDU $hi1,$nhi,$t0
218 mflo $alo
219 mfhi $ahi
220
221 $ADDU $lo1,$lo0
222 sltu $at,$lo1,$lo0
223 $MULTU $nj,$m1
224 $ADDU $hi1,$at
225 addu $j,$BNSZ
226 $ST $lo1,($tp)
227 sltu $t0,$j,$num
228 mflo $nlo
229 mfhi $nhi
230
231 bnez $t0,.L1st
232 $PTR_ADD $tp,$BNSZ
233 .set reorder
234
235 $ADDU $lo0,$alo,$hi0
236 sltu $at,$lo0,$hi0
237 $ADDU $hi0,$ahi,$at
238
239 $ADDU $lo1,$nlo,$hi1
240 sltu $t0,$lo1,$hi1
241 $ADDU $hi1,$nhi,$t0
242 $ADDU $lo1,$lo0
243 sltu $at,$lo1,$lo0
244 $ADDU $hi1,$at
245
246 $ST $lo1,($tp)
247
248 $ADDU $hi1,$hi0
249 sltu $at,$hi1,$hi0
250 $ST $hi1,$BNSZ($tp)
251 $ST $at,2*$BNSZ($tp)
252
253 li $i,$BNSZ
254.align 4
255.Louter:
256 $PTR_ADD $bi,$bp,$i
257 $LD $bi,($bi)
258 $LD $aj,($ap)
259 $LD $alo,$BNSZ($ap)
260 $LD $tj,($sp)
261
262 $MULTU $aj,$bi
263 $LD $nj,($np)
264 $LD $nlo,$BNSZ($np)
265 mflo $lo0
266 mfhi $hi0
267 $ADDU $lo0,$tj
268 $MULTU $lo0,$n0
269 sltu $at,$lo0,$tj
270 $ADDU $hi0,$at
271 mflo $m1
272
273 $MULTU $alo,$bi
274 mflo $alo
275 mfhi $ahi
276
277 $MULTU $nj,$m1
278 mflo $lo1
279 mfhi $hi1
280
281 $MULTU $nlo,$m1
282 $ADDU $lo1,$lo0
283 sltu $at,$lo1,$lo0
284 $ADDU $hi1,$at
285 mflo $nlo
286 mfhi $nhi
287
288 move $tp,$sp
289 li $j,2*$BNSZ
290 $LD $tj,$BNSZ($tp)
291.align 4
292.Linner:
293 .set noreorder
294 $PTR_ADD $aj,$ap,$j
295 $PTR_ADD $nj,$np,$j
296 $LD $aj,($aj)
297 $LD $nj,($nj)
298
299 $MULTU $aj,$bi
300 $ADDU $lo0,$alo,$hi0
301 $ADDU $lo1,$nlo,$hi1
302 sltu $at,$lo0,$hi0
303 sltu $t0,$lo1,$hi1
304 $ADDU $hi0,$ahi,$at
305 $ADDU $hi1,$nhi,$t0
306 mflo $alo
307 mfhi $ahi
308
309 $ADDU $lo0,$tj
310 addu $j,$BNSZ
311 $MULTU $nj,$m1
312 sltu $at,$lo0,$tj
313 $ADDU $lo1,$lo0
314 $ADDU $hi0,$at
315 sltu $t0,$lo1,$lo0
316 $LD $tj,2*$BNSZ($tp)
317 $ADDU $hi1,$t0
318 sltu $at,$j,$num
319 mflo $nlo
320 mfhi $nhi
321 $ST $lo1,($tp)
322 bnez $at,.Linner
323 $PTR_ADD $tp,$BNSZ
324 .set reorder
325
326 $ADDU $lo0,$alo,$hi0
327 sltu $at,$lo0,$hi0
328 $ADDU $hi0,$ahi,$at
329 $ADDU $lo0,$tj
330 sltu $t0,$lo0,$tj
331 $ADDU $hi0,$t0
332
333 $LD $tj,2*$BNSZ($tp)
334 $ADDU $lo1,$nlo,$hi1
335 sltu $at,$lo1,$hi1
336 $ADDU $hi1,$nhi,$at
337 $ADDU $lo1,$lo0
338 sltu $t0,$lo1,$lo0
339 $ADDU $hi1,$t0
340 $ST $lo1,($tp)
341
342 $ADDU $lo1,$hi1,$hi0
343 sltu $hi1,$lo1,$hi0
344 $ADDU $lo1,$tj
345 sltu $at,$lo1,$tj
346 $ADDU $hi1,$at
347 $ST $lo1,$BNSZ($tp)
348 $ST $hi1,2*$BNSZ($tp)
349
350 addu $i,$BNSZ
351 sltu $t0,$i,$num
352 bnez $t0,.Louter
353
354 .set noreorder
355 $PTR_ADD $tj,$sp,$num # &tp[num]
356 move $tp,$sp
357 move $ap,$sp
358 li $hi0,0 # clear borrow bit
359
360.align 4
361.Lsub: $LD $lo0,($tp)
362 $LD $lo1,($np)
363 $PTR_ADD $tp,$BNSZ
364 $PTR_ADD $np,$BNSZ
365 $SUBU $lo1,$lo0,$lo1 # tp[i]-np[i]
366 sgtu $at,$lo1,$lo0
367 $SUBU $lo0,$lo1,$hi0
368 sgtu $hi0,$lo0,$lo1
369 $ST $lo0,($rp)
370 or $hi0,$at
371 sltu $at,$tp,$tj
372 bnez $at,.Lsub
373 $PTR_ADD $rp,$BNSZ
374
375 $SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit
376 move $tp,$sp
377 $PTR_SUB $rp,$num # restore rp
378 not $hi1,$hi0
379
380 and $ap,$hi0,$sp
381 and $bp,$hi1,$rp
382 or $ap,$ap,$bp # ap=borrow?tp:rp
383
384.align 4
385.Lcopy: $LD $aj,($ap)
386 $PTR_ADD $ap,$BNSZ
387 $ST $zero,($tp)
388 $PTR_ADD $tp,$BNSZ
389 sltu $at,$tp,$tj
390 $ST $aj,($rp)
391 bnez $at,.Lcopy
392 $PTR_ADD $rp,$BNSZ
393
394 li $a0,1
395 li $t0,1
396
397 .set noreorder
398 move $sp,$fp
399 $REG_L $fp,($FRAMESIZE-1)*$SZREG($sp)
400 $REG_L $s11,($FRAMESIZE-2)*$SZREG($sp)
401 $REG_L $s10,($FRAMESIZE-3)*$SZREG($sp)
402 $REG_L $s9,($FRAMESIZE-4)*$SZREG($sp)
403 $REG_L $s8,($FRAMESIZE-5)*$SZREG($sp)
404 $REG_L $s7,($FRAMESIZE-6)*$SZREG($sp)
405 $REG_L $s6,($FRAMESIZE-7)*$SZREG($sp)
406 $REG_L $s5,($FRAMESIZE-8)*$SZREG($sp)
407 $REG_L $s4,($FRAMESIZE-9)*$SZREG($sp)
408___
409$code.=<<___ if ($flavour =~ /nubi/i);
410 $REG_L $s3,($FRAMESIZE-10)*$SZREG($sp)
411 $REG_L $s2,($FRAMESIZE-11)*$SZREG($sp)
412 $REG_L $s1,($FRAMESIZE-12)*$SZREG($sp)
413 $REG_L $s0,($FRAMESIZE-13)*$SZREG($sp)
414___
415$code.=<<___;
416 jr $ra
417 $PTR_ADD $sp,$FRAMESIZE*$SZREG
418.end bn_mul_mont_internal
419.rdata
420.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
421___
422
423$code =~ s/\`([^\`]*)\`/eval $1/gem;
424
425print $code;
426close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/mips.pl b/src/lib/libcrypto/bn/asm/mips.pl
deleted file mode 100644
index c162a3ec23..0000000000
--- a/src/lib/libcrypto/bn/asm/mips.pl
+++ /dev/null
@@ -1,2585 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project.
6#
7# Rights for redistribution and usage in source and binary forms are
8# granted according to the OpenSSL license. Warranty of any kind is
9# disclaimed.
10# ====================================================================
11
12
13# July 1999
14#
15# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
16#
17# The module is designed to work with either of the "new" MIPS ABI(5),
18# namely N32 or N64, offered by IRIX 6.x. It's not ment to work under
19# IRIX 5.x not only because it doesn't support new ABIs but also
20# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
21# 64-bit instructions (daddu, dmultu, etc.) found below gonna only
22# cause illegal instruction exception:-(
23#
24# In addition the code depends on preprocessor flags set up by MIPSpro
25# compiler driver (either as or cc) and therefore (probably?) can't be
26# compiled by the GNU assembler. GNU C driver manages fine though...
27# I mean as long as -mmips-as is specified or is the default option,
28# because then it simply invokes /usr/bin/as which in turn takes
29# perfect care of the preprocessor definitions. Another neat feature
30# offered by the MIPSpro assembler is an optimization pass. This gave
31# me the opportunity to have the code looking more regular as all those
32# architecture dependent instruction rescheduling details were left to
33# the assembler. Cool, huh?
34#
35# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
36# goes way over 3 times faster!
37#
38# <appro@fy.chalmers.se>
39
40# October 2010
41#
42# Adapt the module even for 32-bit ABIs and other OSes. The former was
43# achieved by mechanical replacement of 64-bit arithmetic instructions
44# such as dmultu, daddu, etc. with their 32-bit counterparts and
45# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
46# >3x performance improvement naturally does not apply to 32-bit code
47# [because there is no instruction 32-bit compiler can't use], one
48# has to content with 40-85% improvement depending on benchmark and
49# key length, more for longer keys.
50
51$flavour = shift;
52while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
53open STDOUT,">$output";
54
55if ($flavour =~ /64|n32/i) {
56 $LD="ld";
57 $ST="sd";
58 $MULTU="dmultu";
59 $DIVU="ddivu";
60 $ADDU="daddu";
61 $SUBU="dsubu";
62 $SRL="dsrl";
63 $SLL="dsll";
64 $BNSZ=8;
65 $PTR_ADD="daddu";
66 $PTR_SUB="dsubu";
67 $SZREG=8;
68 $REG_S="sd";
69 $REG_L="ld";
70} else {
71 $LD="lw";
72 $ST="sw";
73 $MULTU="multu";
74 $DIVU="divu";
75 $ADDU="addu";
76 $SUBU="subu";
77 $SRL="srl";
78 $SLL="sll";
79 $BNSZ=4;
80 $PTR_ADD="addu";
81 $PTR_SUB="subu";
82 $SZREG=4;
83 $REG_S="sw";
84 $REG_L="lw";
85 $code=".set mips2\n";
86}
87
88# Below is N32/64 register layout used in the original module.
89#
90($zero,$at,$v0,$v1)=map("\$$_",(0..3));
91($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
92($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
93($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
94($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
95($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
96#
97# No special adaptation is required for O32. NUBI on the other hand
98# is treated by saving/restoring ($v1,$t0..$t3).
99
100$gp=$v1 if ($flavour =~ /nubi/i);
101
102$minus4=$v1;
103
104$code.=<<___;
105.rdata
106.asciiz "mips3.s, Version 1.2"
107.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
108
109.text
110.set noat
111
112.align 5
113.globl bn_mul_add_words
114.ent bn_mul_add_words
115bn_mul_add_words:
116 .set noreorder
117 bgtz $a2,bn_mul_add_words_internal
118 move $v0,$zero
119 jr $ra
120 move $a0,$v0
121.end bn_mul_add_words
122
123.align 5
124.ent bn_mul_add_words_internal
125bn_mul_add_words_internal:
126___
127$code.=<<___ if ($flavour =~ /nubi/i);
128 .frame $sp,6*$SZREG,$ra
129 .mask 0x8000f008,-$SZREG
130 .set noreorder
131 $PTR_SUB $sp,6*$SZREG
132 $REG_S $ra,5*$SZREG($sp)
133 $REG_S $t3,4*$SZREG($sp)
134 $REG_S $t2,3*$SZREG($sp)
135 $REG_S $t1,2*$SZREG($sp)
136 $REG_S $t0,1*$SZREG($sp)
137 $REG_S $gp,0*$SZREG($sp)
138___
139$code.=<<___;
140 .set reorder
141 li $minus4,-4
142 and $ta0,$a2,$minus4
143 $LD $t0,0($a1)
144 beqz $ta0,.L_bn_mul_add_words_tail
145
146.L_bn_mul_add_words_loop:
147 $MULTU $t0,$a3
148 $LD $t1,0($a0)
149 $LD $t2,$BNSZ($a1)
150 $LD $t3,$BNSZ($a0)
151 $LD $ta0,2*$BNSZ($a1)
152 $LD $ta1,2*$BNSZ($a0)
153 $ADDU $t1,$v0
154 sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit
155 # values", but it seems to work fine
156 # even on 64-bit registers.
157 mflo $at
158 mfhi $t0
159 $ADDU $t1,$at
160 $ADDU $v0,$t0
161 $MULTU $t2,$a3
162 sltu $at,$t1,$at
163 $ST $t1,0($a0)
164 $ADDU $v0,$at
165
166 $LD $ta2,3*$BNSZ($a1)
167 $LD $ta3,3*$BNSZ($a0)
168 $ADDU $t3,$v0
169 sltu $v0,$t3,$v0
170 mflo $at
171 mfhi $t2
172 $ADDU $t3,$at
173 $ADDU $v0,$t2
174 $MULTU $ta0,$a3
175 sltu $at,$t3,$at
176 $ST $t3,$BNSZ($a0)
177 $ADDU $v0,$at
178
179 subu $a2,4
180 $PTR_ADD $a0,4*$BNSZ
181 $PTR_ADD $a1,4*$BNSZ
182 $ADDU $ta1,$v0
183 sltu $v0,$ta1,$v0
184 mflo $at
185 mfhi $ta0
186 $ADDU $ta1,$at
187 $ADDU $v0,$ta0
188 $MULTU $ta2,$a3
189 sltu $at,$ta1,$at
190 $ST $ta1,-2*$BNSZ($a0)
191 $ADDU $v0,$at
192
193
194 and $ta0,$a2,$minus4
195 $ADDU $ta3,$v0
196 sltu $v0,$ta3,$v0
197 mflo $at
198 mfhi $ta2
199 $ADDU $ta3,$at
200 $ADDU $v0,$ta2
201 sltu $at,$ta3,$at
202 $ST $ta3,-$BNSZ($a0)
203 $ADDU $v0,$at
204 .set noreorder
205 bgtzl $ta0,.L_bn_mul_add_words_loop
206 $LD $t0,0($a1)
207
208 beqz $a2,.L_bn_mul_add_words_return
209 nop
210
211.L_bn_mul_add_words_tail:
212 .set reorder
213 $LD $t0,0($a1)
214 $MULTU $t0,$a3
215 $LD $t1,0($a0)
216 subu $a2,1
217 $ADDU $t1,$v0
218 sltu $v0,$t1,$v0
219 mflo $at
220 mfhi $t0
221 $ADDU $t1,$at
222 $ADDU $v0,$t0
223 sltu $at,$t1,$at
224 $ST $t1,0($a0)
225 $ADDU $v0,$at
226 beqz $a2,.L_bn_mul_add_words_return
227
228 $LD $t0,$BNSZ($a1)
229 $MULTU $t0,$a3
230 $LD $t1,$BNSZ($a0)
231 subu $a2,1
232 $ADDU $t1,$v0
233 sltu $v0,$t1,$v0
234 mflo $at
235 mfhi $t0
236 $ADDU $t1,$at
237 $ADDU $v0,$t0
238 sltu $at,$t1,$at
239 $ST $t1,$BNSZ($a0)
240 $ADDU $v0,$at
241 beqz $a2,.L_bn_mul_add_words_return
242
243 $LD $t0,2*$BNSZ($a1)
244 $MULTU $t0,$a3
245 $LD $t1,2*$BNSZ($a0)
246 $ADDU $t1,$v0
247 sltu $v0,$t1,$v0
248 mflo $at
249 mfhi $t0
250 $ADDU $t1,$at
251 $ADDU $v0,$t0
252 sltu $at,$t1,$at
253 $ST $t1,2*$BNSZ($a0)
254 $ADDU $v0,$at
255
256.L_bn_mul_add_words_return:
257 .set noreorder
258___
259$code.=<<___ if ($flavour =~ /nubi/i);
260 $REG_L $t3,4*$SZREG($sp)
261 $REG_L $t2,3*$SZREG($sp)
262 $REG_L $t1,2*$SZREG($sp)
263 $REG_L $t0,1*$SZREG($sp)
264 $REG_L $gp,0*$SZREG($sp)
265 $PTR_ADD $sp,6*$SZREG
266___
267$code.=<<___;
268 jr $ra
269 move $a0,$v0
270.end bn_mul_add_words_internal
271
272.align 5
273.globl bn_mul_words
274.ent bn_mul_words
275bn_mul_words:
276 .set noreorder
277 bgtz $a2,bn_mul_words_internal
278 move $v0,$zero
279 jr $ra
280 move $a0,$v0
281.end bn_mul_words
282
283.align 5
284.ent bn_mul_words_internal
285bn_mul_words_internal:
286___
287$code.=<<___ if ($flavour =~ /nubi/i);
288 .frame $sp,6*$SZREG,$ra
289 .mask 0x8000f008,-$SZREG
290 .set noreorder
291 $PTR_SUB $sp,6*$SZREG
292 $REG_S $ra,5*$SZREG($sp)
293 $REG_S $t3,4*$SZREG($sp)
294 $REG_S $t2,3*$SZREG($sp)
295 $REG_S $t1,2*$SZREG($sp)
296 $REG_S $t0,1*$SZREG($sp)
297 $REG_S $gp,0*$SZREG($sp)
298___
299$code.=<<___;
300 .set reorder
301 li $minus4,-4
302 and $ta0,$a2,$minus4
303 $LD $t0,0($a1)
304 beqz $ta0,.L_bn_mul_words_tail
305
306.L_bn_mul_words_loop:
307 $MULTU $t0,$a3
308 $LD $t2,$BNSZ($a1)
309 $LD $ta0,2*$BNSZ($a1)
310 $LD $ta2,3*$BNSZ($a1)
311 mflo $at
312 mfhi $t0
313 $ADDU $v0,$at
314 sltu $t1,$v0,$at
315 $MULTU $t2,$a3
316 $ST $v0,0($a0)
317 $ADDU $v0,$t1,$t0
318
319 subu $a2,4
320 $PTR_ADD $a0,4*$BNSZ
321 $PTR_ADD $a1,4*$BNSZ
322 mflo $at
323 mfhi $t2
324 $ADDU $v0,$at
325 sltu $t3,$v0,$at
326 $MULTU $ta0,$a3
327 $ST $v0,-3*$BNSZ($a0)
328 $ADDU $v0,$t3,$t2
329
330 mflo $at
331 mfhi $ta0
332 $ADDU $v0,$at
333 sltu $ta1,$v0,$at
334 $MULTU $ta2,$a3
335 $ST $v0,-2*$BNSZ($a0)
336 $ADDU $v0,$ta1,$ta0
337
338 and $ta0,$a2,$minus4
339 mflo $at
340 mfhi $ta2
341 $ADDU $v0,$at
342 sltu $ta3,$v0,$at
343 $ST $v0,-$BNSZ($a0)
344 $ADDU $v0,$ta3,$ta2
345 .set noreorder
346 bgtzl $ta0,.L_bn_mul_words_loop
347 $LD $t0,0($a1)
348
349 beqz $a2,.L_bn_mul_words_return
350 nop
351
352.L_bn_mul_words_tail:
353 .set reorder
354 $LD $t0,0($a1)
355 $MULTU $t0,$a3
356 subu $a2,1
357 mflo $at
358 mfhi $t0
359 $ADDU $v0,$at
360 sltu $t1,$v0,$at
361 $ST $v0,0($a0)
362 $ADDU $v0,$t1,$t0
363 beqz $a2,.L_bn_mul_words_return
364
365 $LD $t0,$BNSZ($a1)
366 $MULTU $t0,$a3
367 subu $a2,1
368 mflo $at
369 mfhi $t0
370 $ADDU $v0,$at
371 sltu $t1,$v0,$at
372 $ST $v0,$BNSZ($a0)
373 $ADDU $v0,$t1,$t0
374 beqz $a2,.L_bn_mul_words_return
375
376 $LD $t0,2*$BNSZ($a1)
377 $MULTU $t0,$a3
378 mflo $at
379 mfhi $t0
380 $ADDU $v0,$at
381 sltu $t1,$v0,$at
382 $ST $v0,2*$BNSZ($a0)
383 $ADDU $v0,$t1,$t0
384
385.L_bn_mul_words_return:
386 .set noreorder
387___
388$code.=<<___ if ($flavour =~ /nubi/i);
389 $REG_L $t3,4*$SZREG($sp)
390 $REG_L $t2,3*$SZREG($sp)
391 $REG_L $t1,2*$SZREG($sp)
392 $REG_L $t0,1*$SZREG($sp)
393 $REG_L $gp,0*$SZREG($sp)
394 $PTR_ADD $sp,6*$SZREG
395___
396$code.=<<___;
397 jr $ra
398 move $a0,$v0
399.end bn_mul_words_internal
400
401.align 5
402.globl bn_sqr_words
403.ent bn_sqr_words
404bn_sqr_words:
405 .set noreorder
406 bgtz $a2,bn_sqr_words_internal
407 move $v0,$zero
408 jr $ra
409 move $a0,$v0
410.end bn_sqr_words
411
412.align 5
413.ent bn_sqr_words_internal
414bn_sqr_words_internal:
415___
416$code.=<<___ if ($flavour =~ /nubi/i);
417 .frame $sp,6*$SZREG,$ra
418 .mask 0x8000f008,-$SZREG
419 .set noreorder
420 $PTR_SUB $sp,6*$SZREG
421 $REG_S $ra,5*$SZREG($sp)
422 $REG_S $t3,4*$SZREG($sp)
423 $REG_S $t2,3*$SZREG($sp)
424 $REG_S $t1,2*$SZREG($sp)
425 $REG_S $t0,1*$SZREG($sp)
426 $REG_S $gp,0*$SZREG($sp)
427___
428$code.=<<___;
429 .set reorder
430 li $minus4,-4
431 and $ta0,$a2,$minus4
432 $LD $t0,0($a1)
433 beqz $ta0,.L_bn_sqr_words_tail
434
435.L_bn_sqr_words_loop:
436 $MULTU $t0,$t0
437 $LD $t2,$BNSZ($a1)
438 $LD $ta0,2*$BNSZ($a1)
439 $LD $ta2,3*$BNSZ($a1)
440 mflo $t1
441 mfhi $t0
442 $ST $t1,0($a0)
443 $ST $t0,$BNSZ($a0)
444
445 $MULTU $t2,$t2
446 subu $a2,4
447 $PTR_ADD $a0,8*$BNSZ
448 $PTR_ADD $a1,4*$BNSZ
449 mflo $t3
450 mfhi $t2
451 $ST $t3,-6*$BNSZ($a0)
452 $ST $t2,-5*$BNSZ($a0)
453
454 $MULTU $ta0,$ta0
455 mflo $ta1
456 mfhi $ta0
457 $ST $ta1,-4*$BNSZ($a0)
458 $ST $ta0,-3*$BNSZ($a0)
459
460
461 $MULTU $ta2,$ta2
462 and $ta0,$a2,$minus4
463 mflo $ta3
464 mfhi $ta2
465 $ST $ta3,-2*$BNSZ($a0)
466 $ST $ta2,-$BNSZ($a0)
467
468 .set noreorder
469 bgtzl $ta0,.L_bn_sqr_words_loop
470 $LD $t0,0($a1)
471
472 beqz $a2,.L_bn_sqr_words_return
473 nop
474
475.L_bn_sqr_words_tail:
476 .set reorder
477 $LD $t0,0($a1)
478 $MULTU $t0,$t0
479 subu $a2,1
480 mflo $t1
481 mfhi $t0
482 $ST $t1,0($a0)
483 $ST $t0,$BNSZ($a0)
484 beqz $a2,.L_bn_sqr_words_return
485
486 $LD $t0,$BNSZ($a1)
487 $MULTU $t0,$t0
488 subu $a2,1
489 mflo $t1
490 mfhi $t0
491 $ST $t1,2*$BNSZ($a0)
492 $ST $t0,3*$BNSZ($a0)
493 beqz $a2,.L_bn_sqr_words_return
494
495 $LD $t0,2*$BNSZ($a1)
496 $MULTU $t0,$t0
497 mflo $t1
498 mfhi $t0
499 $ST $t1,4*$BNSZ($a0)
500 $ST $t0,5*$BNSZ($a0)
501
502.L_bn_sqr_words_return:
503 .set noreorder
504___
505$code.=<<___ if ($flavour =~ /nubi/i);
506 $REG_L $t3,4*$SZREG($sp)
507 $REG_L $t2,3*$SZREG($sp)
508 $REG_L $t1,2*$SZREG($sp)
509 $REG_L $t0,1*$SZREG($sp)
510 $REG_L $gp,0*$SZREG($sp)
511 $PTR_ADD $sp,6*$SZREG
512___
513$code.=<<___;
514 jr $ra
515 move $a0,$v0
516
517.end bn_sqr_words_internal
518
519.align 5
520.globl bn_add_words
521.ent bn_add_words
522bn_add_words:
523 .set noreorder
524 bgtz $a3,bn_add_words_internal
525 move $v0,$zero
526 jr $ra
527 move $a0,$v0
528.end bn_add_words
529
530.align 5
531.ent bn_add_words_internal
532bn_add_words_internal:
533___
534$code.=<<___ if ($flavour =~ /nubi/i);
535 .frame $sp,6*$SZREG,$ra
536 .mask 0x8000f008,-$SZREG
537 .set noreorder
538 $PTR_SUB $sp,6*$SZREG
539 $REG_S $ra,5*$SZREG($sp)
540 $REG_S $t3,4*$SZREG($sp)
541 $REG_S $t2,3*$SZREG($sp)
542 $REG_S $t1,2*$SZREG($sp)
543 $REG_S $t0,1*$SZREG($sp)
544 $REG_S $gp,0*$SZREG($sp)
545___
546$code.=<<___;
547 .set reorder
548 li $minus4,-4
549 and $at,$a3,$minus4
550 $LD $t0,0($a1)
551 beqz $at,.L_bn_add_words_tail
552
553.L_bn_add_words_loop:
554 $LD $ta0,0($a2)
555 subu $a3,4
556 $LD $t1,$BNSZ($a1)
557 and $at,$a3,$minus4
558 $LD $t2,2*$BNSZ($a1)
559 $PTR_ADD $a2,4*$BNSZ
560 $LD $t3,3*$BNSZ($a1)
561 $PTR_ADD $a0,4*$BNSZ
562 $LD $ta1,-3*$BNSZ($a2)
563 $PTR_ADD $a1,4*$BNSZ
564 $LD $ta2,-2*$BNSZ($a2)
565 $LD $ta3,-$BNSZ($a2)
566 $ADDU $ta0,$t0
567 sltu $t8,$ta0,$t0
568 $ADDU $t0,$ta0,$v0
569 sltu $v0,$t0,$ta0
570 $ST $t0,-4*$BNSZ($a0)
571 $ADDU $v0,$t8
572
573 $ADDU $ta1,$t1
574 sltu $t9,$ta1,$t1
575 $ADDU $t1,$ta1,$v0
576 sltu $v0,$t1,$ta1
577 $ST $t1,-3*$BNSZ($a0)
578 $ADDU $v0,$t9
579
580 $ADDU $ta2,$t2
581 sltu $t8,$ta2,$t2
582 $ADDU $t2,$ta2,$v0
583 sltu $v0,$t2,$ta2
584 $ST $t2,-2*$BNSZ($a0)
585 $ADDU $v0,$t8
586
587 $ADDU $ta3,$t3
588 sltu $t9,$ta3,$t3
589 $ADDU $t3,$ta3,$v0
590 sltu $v0,$t3,$ta3
591 $ST $t3,-$BNSZ($a0)
592 $ADDU $v0,$t9
593
594 .set noreorder
595 bgtzl $at,.L_bn_add_words_loop
596 $LD $t0,0($a1)
597
598 beqz $a3,.L_bn_add_words_return
599 nop
600
601.L_bn_add_words_tail:
602 .set reorder
603 $LD $t0,0($a1)
604 $LD $ta0,0($a2)
605 $ADDU $ta0,$t0
606 subu $a3,1
607 sltu $t8,$ta0,$t0
608 $ADDU $t0,$ta0,$v0
609 sltu $v0,$t0,$ta0
610 $ST $t0,0($a0)
611 $ADDU $v0,$t8
612 beqz $a3,.L_bn_add_words_return
613
614 $LD $t1,$BNSZ($a1)
615 $LD $ta1,$BNSZ($a2)
616 $ADDU $ta1,$t1
617 subu $a3,1
618 sltu $t9,$ta1,$t1
619 $ADDU $t1,$ta1,$v0
620 sltu $v0,$t1,$ta1
621 $ST $t1,$BNSZ($a0)
622 $ADDU $v0,$t9
623 beqz $a3,.L_bn_add_words_return
624
625 $LD $t2,2*$BNSZ($a1)
626 $LD $ta2,2*$BNSZ($a2)
627 $ADDU $ta2,$t2
628 sltu $t8,$ta2,$t2
629 $ADDU $t2,$ta2,$v0
630 sltu $v0,$t2,$ta2
631 $ST $t2,2*$BNSZ($a0)
632 $ADDU $v0,$t8
633
634.L_bn_add_words_return:
635 .set noreorder
636___
637$code.=<<___ if ($flavour =~ /nubi/i);
638 $REG_L $t3,4*$SZREG($sp)
639 $REG_L $t2,3*$SZREG($sp)
640 $REG_L $t1,2*$SZREG($sp)
641 $REG_L $t0,1*$SZREG($sp)
642 $REG_L $gp,0*$SZREG($sp)
643 $PTR_ADD $sp,6*$SZREG
644___
645$code.=<<___;
646 jr $ra
647 move $a0,$v0
648
649.end bn_add_words_internal
650
651.align 5
652.globl bn_sub_words
653.ent bn_sub_words
654bn_sub_words:
655 .set noreorder
656 bgtz $a3,bn_sub_words_internal
657 move $v0,$zero
658 jr $ra
659 move $a0,$zero
660.end bn_sub_words
661
662.align 5
663.ent bn_sub_words_internal
664bn_sub_words_internal:
665___
666$code.=<<___ if ($flavour =~ /nubi/i);
667 .frame $sp,6*$SZREG,$ra
668 .mask 0x8000f008,-$SZREG
669 .set noreorder
670 $PTR_SUB $sp,6*$SZREG
671 $REG_S $ra,5*$SZREG($sp)
672 $REG_S $t3,4*$SZREG($sp)
673 $REG_S $t2,3*$SZREG($sp)
674 $REG_S $t1,2*$SZREG($sp)
675 $REG_S $t0,1*$SZREG($sp)
676 $REG_S $gp,0*$SZREG($sp)
677___
678$code.=<<___;
679 .set reorder
680 li $minus4,-4
681 and $at,$a3,$minus4
682 $LD $t0,0($a1)
683 beqz $at,.L_bn_sub_words_tail
684
685.L_bn_sub_words_loop:
686 $LD $ta0,0($a2)
687 subu $a3,4
688 $LD $t1,$BNSZ($a1)
689 and $at,$a3,$minus4
690 $LD $t2,2*$BNSZ($a1)
691 $PTR_ADD $a2,4*$BNSZ
692 $LD $t3,3*$BNSZ($a1)
693 $PTR_ADD $a0,4*$BNSZ
694 $LD $ta1,-3*$BNSZ($a2)
695 $PTR_ADD $a1,4*$BNSZ
696 $LD $ta2,-2*$BNSZ($a2)
697 $LD $ta3,-$BNSZ($a2)
698 sltu $t8,$t0,$ta0
699 $SUBU $ta0,$t0,$ta0
700 $SUBU $t0,$ta0,$v0
701 sgtu $v0,$t0,$ta0
702 $ST $t0,-4*$BNSZ($a0)
703 $ADDU $v0,$t8
704
705 sltu $t9,$t1,$ta1
706 $SUBU $ta1,$t1,$ta1
707 $SUBU $t1,$ta1,$v0
708 sgtu $v0,$t1,$ta1
709 $ST $t1,-3*$BNSZ($a0)
710 $ADDU $v0,$t9
711
712
713 sltu $t8,$t2,$ta2
714 $SUBU $ta2,$t2,$ta2
715 $SUBU $t2,$ta2,$v0
716 sgtu $v0,$t2,$ta2
717 $ST $t2,-2*$BNSZ($a0)
718 $ADDU $v0,$t8
719
720 sltu $t9,$t3,$ta3
721 $SUBU $ta3,$t3,$ta3
722 $SUBU $t3,$ta3,$v0
723 sgtu $v0,$t3,$ta3
724 $ST $t3,-$BNSZ($a0)
725 $ADDU $v0,$t9
726
727 .set noreorder
728 bgtzl $at,.L_bn_sub_words_loop
729 $LD $t0,0($a1)
730
731 beqz $a3,.L_bn_sub_words_return
732 nop
733
734.L_bn_sub_words_tail:
735 .set reorder
736 $LD $t0,0($a1)
737 $LD $ta0,0($a2)
738 subu $a3,1
739 sltu $t8,$t0,$ta0
740 $SUBU $ta0,$t0,$ta0
741 $SUBU $t0,$ta0,$v0
742 sgtu $v0,$t0,$ta0
743 $ST $t0,0($a0)
744 $ADDU $v0,$t8
745 beqz $a3,.L_bn_sub_words_return
746
747 $LD $t1,$BNSZ($a1)
748 subu $a3,1
749 $LD $ta1,$BNSZ($a2)
750 sltu $t9,$t1,$ta1
751 $SUBU $ta1,$t1,$ta1
752 $SUBU $t1,$ta1,$v0
753 sgtu $v0,$t1,$ta1
754 $ST $t1,$BNSZ($a0)
755 $ADDU $v0,$t9
756 beqz $a3,.L_bn_sub_words_return
757
758 $LD $t2,2*$BNSZ($a1)
759 $LD $ta2,2*$BNSZ($a2)
760 sltu $t8,$t2,$ta2
761 $SUBU $ta2,$t2,$ta2
762 $SUBU $t2,$ta2,$v0
763 sgtu $v0,$t2,$ta2
764 $ST $t2,2*$BNSZ($a0)
765 $ADDU $v0,$t8
766
767.L_bn_sub_words_return:
768 .set noreorder
769___
770$code.=<<___ if ($flavour =~ /nubi/i);
771 $REG_L $t3,4*$SZREG($sp)
772 $REG_L $t2,3*$SZREG($sp)
773 $REG_L $t1,2*$SZREG($sp)
774 $REG_L $t0,1*$SZREG($sp)
775 $REG_L $gp,0*$SZREG($sp)
776 $PTR_ADD $sp,6*$SZREG
777___
778$code.=<<___;
779 jr $ra
780 move $a0,$v0
781.end bn_sub_words_internal
782
783.align 5
784.globl bn_div_3_words
785.ent bn_div_3_words
786bn_div_3_words:
787 .set noreorder
788 move $a3,$a0 # we know that bn_div_words does not
789 # touch $a3, $ta2, $ta3 and preserves $a2
790 # so that we can save two arguments
791 # and return address in registers
792 # instead of stack:-)
793
794 $LD $a0,($a3)
795 move $ta2,$a1
796 bne $a0,$a2,bn_div_3_words_internal
797 $LD $a1,-$BNSZ($a3)
798 li $v0,-1
799 jr $ra
800 move $a0,$v0
801.end bn_div_3_words
802
803.align 5
804.ent bn_div_3_words_internal
805bn_div_3_words_internal:
806___
807$code.=<<___ if ($flavour =~ /nubi/i);
808 .frame $sp,6*$SZREG,$ra
809 .mask 0x8000f008,-$SZREG
810 .set noreorder
811 $PTR_SUB $sp,6*$SZREG
812 $REG_S $ra,5*$SZREG($sp)
813 $REG_S $t3,4*$SZREG($sp)
814 $REG_S $t2,3*$SZREG($sp)
815 $REG_S $t1,2*$SZREG($sp)
816 $REG_S $t0,1*$SZREG($sp)
817 $REG_S $gp,0*$SZREG($sp)
818___
819$code.=<<___;
820 .set reorder
821 move $ta3,$ra
822 bal bn_div_words
823 move $ra,$ta3
824 $MULTU $ta2,$v0
825 $LD $t2,-2*$BNSZ($a3)
826 move $ta0,$zero
827 mfhi $t1
828 mflo $t0
829 sltu $t8,$t1,$a1
830.L_bn_div_3_words_inner_loop:
831 bnez $t8,.L_bn_div_3_words_inner_loop_done
832 sgeu $at,$t2,$t0
833 seq $t9,$t1,$a1
834 and $at,$t9
835 sltu $t3,$t0,$ta2
836 $ADDU $a1,$a2
837 $SUBU $t1,$t3
838 $SUBU $t0,$ta2
839 sltu $t8,$t1,$a1
840 sltu $ta0,$a1,$a2
841 or $t8,$ta0
842 .set noreorder
843 beqzl $at,.L_bn_div_3_words_inner_loop
844 $SUBU $v0,1
845 .set reorder
846.L_bn_div_3_words_inner_loop_done:
847 .set noreorder
848___
849$code.=<<___ if ($flavour =~ /nubi/i);
850 $REG_L $t3,4*$SZREG($sp)
851 $REG_L $t2,3*$SZREG($sp)
852 $REG_L $t1,2*$SZREG($sp)
853 $REG_L $t0,1*$SZREG($sp)
854 $REG_L $gp,0*$SZREG($sp)
855 $PTR_ADD $sp,6*$SZREG
856___
857$code.=<<___;
858 jr $ra
859 move $a0,$v0
860.end bn_div_3_words_internal
861
862.align 5
863.globl bn_div_words
864.ent bn_div_words
865bn_div_words:
866 .set noreorder
867 bnez $a2,bn_div_words_internal
868 li $v0,-1 # I would rather signal div-by-zero
869 # which can be done with 'break 7'
870 jr $ra
871 move $a0,$v0
872.end bn_div_words
873
874.align 5
875.ent bn_div_words_internal
876bn_div_words_internal:
877___
878$code.=<<___ if ($flavour =~ /nubi/i);
879 .frame $sp,6*$SZREG,$ra
880 .mask 0x8000f008,-$SZREG
881 .set noreorder
882 $PTR_SUB $sp,6*$SZREG
883 $REG_S $ra,5*$SZREG($sp)
884 $REG_S $t3,4*$SZREG($sp)
885 $REG_S $t2,3*$SZREG($sp)
886 $REG_S $t1,2*$SZREG($sp)
887 $REG_S $t0,1*$SZREG($sp)
888 $REG_S $gp,0*$SZREG($sp)
889___
# --- bn_div_words_internal (tail) ---------------------------------------
# Emits the body of the BN_ULONG division helper: first the divisor is
# normalized (shifted left until its top bit is set, shift count kept in
# $t9), then the two-word dividend ($a0:$a1) is divided by $a2 in two
# half-word digit steps.  Each step estimates a quotient digit with a
# hardware divide of the high parts and corrects it downward in a short
# loop — the classic long-division estimate/correct scheme.
# NOTE(review): the `bltz/$SLL/bgtz .-4` sequence and the `bnezl .+8`
# likely-branch idioms depend on exact instruction order and delay-slot
# placement; do not reorder.
890$code.=<<___;
891	move	$v1,$zero
892	bltz	$a2,.L_bn_div_words_body
893	move	$t9,$v1
894	$SLL	$a2,1
895	bgtz	$a2,.-4
896	addu	$t9,1
897
898	.set	reorder
899	negu	$t1,$t9
900	li	$t2,-1
901	$SLL	$t2,$t1
902	and	$t2,$a0
903	$SRL	$at,$a1,$t1
904	.set	noreorder
905	bnezl	$t2,.+8
906	break	6		# signal overflow
907	.set	reorder
908	$SLL	$a0,$t9
909	$SLL	$a1,$t9
910	or	$a0,$at
911___
# Scratch-register aliases used only by the division body below:
# $QT holds the current quotient-digit estimate, $HH the high half of the
# running remainder, $DH the high half of the (normalized) divisor.
912$QT=$ta0;
913$HH=$ta1;
914$DH=$v1;
915$code.=<<___;
916.L_bn_div_words_body:
917	$SRL	$DH,$a2,4*$BNSZ	# bits
918	sgeu	$at,$a0,$a2
919	.set	noreorder
920	bnezl	$at,.+8
921	$SUBU	$a0,$a2
922	.set	reorder
923
924	li	$QT,-1
925	$SRL	$HH,$a0,4*$BNSZ	# bits
926	$SRL	$QT,4*$BNSZ	# q=0xffffffff
927	beq	$DH,$HH,.L_bn_div_words_skip_div1
928	$DIVU	$zero,$a0,$DH
929	mflo	$QT
930.L_bn_div_words_skip_div1:
931	$MULTU	$a2,$QT
932	$SLL	$t3,$a0,4*$BNSZ	# bits
933	$SRL	$at,$a1,4*$BNSZ	# bits
934	or	$t3,$at
935	mflo	$t0
936	mfhi	$t1
937.L_bn_div_words_inner_loop1:
938	sltu	$t2,$t3,$t0
939	seq	$t8,$HH,$t1
940	sltu	$at,$HH,$t1
941	and	$t2,$t8
942	sltu	$v0,$t0,$a2
943	or	$at,$t2
944	.set	noreorder
945	beqz	$at,.L_bn_div_words_inner_loop1_done
946	$SUBU	$t1,$v0
947	$SUBU	$t0,$a2
948	b	.L_bn_div_words_inner_loop1
949	$SUBU	$QT,1
950	.set	reorder
951.L_bn_div_words_inner_loop1_done:
952
953	$SLL	$a1,4*$BNSZ	# bits
954	$SUBU	$a0,$t3,$t0
955	$SLL	$v0,$QT,4*$BNSZ	# bits
956
957	li	$QT,-1
958	$SRL	$HH,$a0,4*$BNSZ	# bits
959	$SRL	$QT,4*$BNSZ	# q=0xffffffff
960	beq	$DH,$HH,.L_bn_div_words_skip_div2
961	$DIVU	$zero,$a0,$DH
962	mflo	$QT
963.L_bn_div_words_skip_div2:
964	$MULTU	$a2,$QT
965	$SLL	$t3,$a0,4*$BNSZ	# bits
966	$SRL	$at,$a1,4*$BNSZ	# bits
967	or	$t3,$at
968	mflo	$t0
969	mfhi	$t1
970.L_bn_div_words_inner_loop2:
971	sltu	$t2,$t3,$t0
972	seq	$t8,$HH,$t1
973	sltu	$at,$HH,$t1
974	and	$t2,$t8
975	sltu	$v1,$t0,$a2
976	or	$at,$t2
977	.set	noreorder
978	beqz	$at,.L_bn_div_words_inner_loop2_done
979	$SUBU	$t1,$v1
980	$SUBU	$t0,$a2
981	b	.L_bn_div_words_inner_loop2
982	$SUBU	$QT,1
983	.set	reorder
984.L_bn_div_words_inner_loop2_done:
985
986	$SUBU	$a0,$t3,$t0
987	or	$v0,$QT
988	$SRL	$v1,$a0,$t9	# $v1 contains remainder if anybody wants it
989	$SRL	$a2,$t9	# restore $a2
990
991	.set	noreorder
992	move	$a1,$v1
993___
# nubi-ABI-only epilogue: restore the temporaries this ABI treats as
# callee-saved, then pop the 6-slot frame set up in the (off-screen)
# prologue.
994$code.=<<___ if ($flavour =~ /nubi/i);
995	$REG_L	$t3,4*$SZREG($sp)
996	$REG_L	$t2,3*$SZREG($sp)
997	$REG_L	$t1,2*$SZREG($sp)
998	$REG_L	$t0,1*$SZREG($sp)
999	$REG_L	$gp,0*$SZREG($sp)
1000	$PTR_ADD $sp,6*$SZREG
1001___
# Common return path: quotient is moved into the result register in the
# branch delay slot of `jr`.
1002$code.=<<___;
1003	jr	$ra
1004	move	$a0,$v0
1005.end	bn_div_words_internal
1006___
# The aliases above were local to the division routine.
1007undef $HH; undef $QT; undef $DH;
1008
# Register allocation for the comba multiply/square routines below.
# a[0..7] / b[0..7] are kept entirely in registers; $a1/$a2 (the input
# pointers) are recycled as the last limbs once all loads are done, and
# $t_1/$t_2 hold each partial product while $c_1..$c_3 form the rotating
# three-word carry accumulator of the comba scheme.
1009($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
1010($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);
1011
1012($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1);	# once we load a[7], no use for $a1
1013($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2);	# once we load b[7], no use for $a2
1014
1015($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
1016
# --- bn_mul_comba8 --------------------------------------------------------
# Emits bn_mul_comba8(r, a, b): full 8x8-limb comba multiplication with all
# 16 result limbs produced column by column.  Each column interleaves the
# next $MULTU issue with the carry propagation of the previous product so
# the multiplier latency is hidden; the $MULTU/mflo/mfhi ordering is
# therefore load-bearing and must not be rearranged.
1017$code.=<<___;
1018
1019.align	5
1020.globl	bn_mul_comba8
1021.ent	bn_mul_comba8
1022bn_mul_comba8:
1023	.set	noreorder
1024___
# nubi ABI prologue: 12-slot frame, saves $ra, $s0-$s5 and the $t0-$t3
# temporaries this ABI treats as callee-saved.
1025$code.=<<___ if ($flavour =~ /nubi/i);
1026	.frame	$sp,12*$SZREG,$ra
1027	.mask	0x803ff008,-$SZREG
1028	$PTR_SUB $sp,12*$SZREG
1029	$REG_S	$ra,11*$SZREG($sp)
1030	$REG_S	$s5,10*$SZREG($sp)
1031	$REG_S	$s4,9*$SZREG($sp)
1032	$REG_S	$s3,8*$SZREG($sp)
1033	$REG_S	$s2,7*$SZREG($sp)
1034	$REG_S	$s1,6*$SZREG($sp)
1035	$REG_S	$s0,5*$SZREG($sp)
1036	$REG_S	$t3,4*$SZREG($sp)
1037	$REG_S	$t2,3*$SZREG($sp)
1038	$REG_S	$t1,2*$SZREG($sp)
1039	$REG_S	$t0,1*$SZREG($sp)
1040	$REG_S	$gp,0*$SZREG($sp)
1041___
# Standard ABI prologue: only $s0-$s5 need saving.
1042$code.=<<___ if ($flavour !~ /nubi/i);
1043	.frame	$sp,6*$SZREG,$ra
1044	.mask	0x003f0000,-$SZREG
1045	$PTR_SUB $sp,6*$SZREG
1046	$REG_S	$s5,5*$SZREG($sp)
1047	$REG_S	$s4,4*$SZREG($sp)
1048	$REG_S	$s3,3*$SZREG($sp)
1049	$REG_S	$s2,2*$SZREG($sp)
1050	$REG_S	$s1,1*$SZREG($sp)
1051	$REG_S	$s0,0*$SZREG($sp)
1052___
# Main body: load all 16 input limbs (interleaved with the first two
# multiplies), then walk the 15 comba columns r[0]..r[15].
1053$code.=<<___;
1054
1055	.set	reorder
1056	$LD	$a_0,0($a1)	# If compiled with -mips3 option on
1057				# R5000 box assembler barks on this
1058				# 1ine with "should not have mult/div
1059				# as last instruction in bb (R10K
1060				# bug)" warning. If anybody out there
1061				# has a clue about how to circumvent
1062				# this do send me a note.
1063				# <appro\@fy.chalmers.se>
1064
1065	$LD	$b_0,0($a2)
1066	$LD	$a_1,$BNSZ($a1)
1067	$LD	$a_2,2*$BNSZ($a1)
1068	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
1069	$LD	$a_3,3*$BNSZ($a1)
1070	$LD	$b_1,$BNSZ($a2)
1071	$LD	$b_2,2*$BNSZ($a2)
1072	$LD	$b_3,3*$BNSZ($a2)
1073	mflo	$c_1
1074	mfhi	$c_2
1075
1076	$LD	$a_4,4*$BNSZ($a1)
1077	$LD	$a_5,5*$BNSZ($a1)
1078	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
1079	$LD	$a_6,6*$BNSZ($a1)
1080	$LD	$a_7,7*$BNSZ($a1)
1081	$LD	$b_4,4*$BNSZ($a2)
1082	$LD	$b_5,5*$BNSZ($a2)
1083	mflo	$t_1
1084	mfhi	$t_2
1085	$ADDU	$c_2,$t_1
1086	sltu	$at,$c_2,$t_1
1087	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
1088	$ADDU	$c_3,$t_2,$at
1089	$LD	$b_6,6*$BNSZ($a2)
1090	$LD	$b_7,7*$BNSZ($a2)
1091	$ST	$c_1,0($a0)	# r[0]=c1;
1092	mflo	$t_1
1093	mfhi	$t_2
1094	$ADDU	$c_2,$t_1
1095	sltu	$at,$c_2,$t_1
1096	$MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
1097	$ADDU	$t_2,$at
1098	$ADDU	$c_3,$t_2
1099	sltu	$c_1,$c_3,$t_2
1100	$ST	$c_2,$BNSZ($a0)	# r[1]=c2;
1101
1102	mflo	$t_1
1103	mfhi	$t_2
1104	$ADDU	$c_3,$t_1
1105	sltu	$at,$c_3,$t_1
1106	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
1107	$ADDU	$t_2,$at
1108	$ADDU	$c_1,$t_2
1109	mflo	$t_1
1110	mfhi	$t_2
1111	$ADDU	$c_3,$t_1
1112	sltu	$at,$c_3,$t_1
1113	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
1114	$ADDU	$t_2,$at
1115	$ADDU	$c_1,$t_2
1116	sltu	$c_2,$c_1,$t_2
1117	mflo	$t_1
1118	mfhi	$t_2
1119	$ADDU	$c_3,$t_1
1120	sltu	$at,$c_3,$t_1
1121	$MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
1122	$ADDU	$t_2,$at
1123	$ADDU	$c_1,$t_2
1124	sltu	$at,$c_1,$t_2
1125	$ADDU	$c_2,$at
1126	$ST	$c_3,2*$BNSZ($a0)	# r[2]=c3;
1127
1128	mflo	$t_1
1129	mfhi	$t_2
1130	$ADDU	$c_1,$t_1
1131	sltu	$at,$c_1,$t_1
1132	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
1133	$ADDU	$t_2,$at
1134	$ADDU	$c_2,$t_2
1135	sltu	$c_3,$c_2,$t_2
1136	mflo	$t_1
1137	mfhi	$t_2
1138	$ADDU	$c_1,$t_1
1139	sltu	$at,$c_1,$t_1
1140	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
1141	$ADDU	$t_2,$at
1142	$ADDU	$c_2,$t_2
1143	sltu	$at,$c_2,$t_2
1144	$ADDU	$c_3,$at
1145	mflo	$t_1
1146	mfhi	$t_2
1147	$ADDU	$c_1,$t_1
1148	sltu	$at,$c_1,$t_1
1149	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
1150	$ADDU	$t_2,$at
1151	$ADDU	$c_2,$t_2
1152	sltu	$at,$c_2,$t_2
1153	$ADDU	$c_3,$at
1154	mflo	$t_1
1155	mfhi	$t_2
1156	$ADDU	$c_1,$t_1
1157	sltu	$at,$c_1,$t_1
1158	$MULTU	$a_4,$b_0		# mul_add_c(a[4],b[0],c2,c3,c1);
1159	$ADDU	$t_2,$at
1160	$ADDU	$c_2,$t_2
1161	sltu	$at,$c_2,$t_2
1162	$ADDU	$c_3,$at
1163	$ST	$c_1,3*$BNSZ($a0)	# r[3]=c1;
1164
1165	mflo	$t_1
1166	mfhi	$t_2
1167	$ADDU	$c_2,$t_1
1168	sltu	$at,$c_2,$t_1
1169	$MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
1170	$ADDU	$t_2,$at
1171	$ADDU	$c_3,$t_2
1172	sltu	$c_1,$c_3,$t_2
1173	mflo	$t_1
1174	mfhi	$t_2
1175	$ADDU	$c_2,$t_1
1176	sltu	$at,$c_2,$t_1
1177	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
1178	$ADDU	$t_2,$at
1179	$ADDU	$c_3,$t_2
1180	sltu	$at,$c_3,$t_2
1181	$ADDU	$c_1,$at
1182	mflo	$t_1
1183	mfhi	$t_2
1184	$ADDU	$c_2,$t_1
1185	sltu	$at,$c_2,$t_1
1186	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
1187	$ADDU	$t_2,$at
1188	$ADDU	$c_3,$t_2
1189	sltu	$at,$c_3,$t_2
1190	$ADDU	$c_1,$at
1191	mflo	$t_1
1192	mfhi	$t_2
1193	$ADDU	$c_2,$t_1
1194	sltu	$at,$c_2,$t_1
1195	$MULTU	$a_0,$b_4		# mul_add_c(a[0],b[4],c2,c3,c1);
1196	$ADDU	$t_2,$at
1197	$ADDU	$c_3,$t_2
1198	sltu	$at,$c_3,$t_2
1199	$ADDU	$c_1,$at
1200	mflo	$t_1
1201	mfhi	$t_2
1202	$ADDU	$c_2,$t_1
1203	sltu	$at,$c_2,$t_1
1204	$MULTU	$a_0,$b_5		# mul_add_c(a[0],b[5],c3,c1,c2);
1205	$ADDU	$t_2,$at
1206	$ADDU	$c_3,$t_2
1207	sltu	$at,$c_3,$t_2
1208	$ADDU	$c_1,$at
1209	$ST	$c_2,4*$BNSZ($a0)	# r[4]=c2;
1210
1211	mflo	$t_1
1212	mfhi	$t_2
1213	$ADDU	$c_3,$t_1
1214	sltu	$at,$c_3,$t_1
1215	$MULTU	$a_1,$b_4		# mul_add_c(a[1],b[4],c3,c1,c2);
1216	$ADDU	$t_2,$at
1217	$ADDU	$c_1,$t_2
1218	sltu	$c_2,$c_1,$t_2
1219	mflo	$t_1
1220	mfhi	$t_2
1221	$ADDU	$c_3,$t_1
1222	sltu	$at,$c_3,$t_1
1223	$MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
1224	$ADDU	$t_2,$at
1225	$ADDU	$c_1,$t_2
1226	sltu	$at,$c_1,$t_2
1227	$ADDU	$c_2,$at
1228	mflo	$t_1
1229	mfhi	$t_2
1230	$ADDU	$c_3,$t_1
1231	sltu	$at,$c_3,$t_1
1232	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
1233	$ADDU	$t_2,$at
1234	$ADDU	$c_1,$t_2
1235	sltu	$at,$c_1,$t_2
1236	$ADDU	$c_2,$at
1237	mflo	$t_1
1238	mfhi	$t_2
1239	$ADDU	$c_3,$t_1
1240	sltu	$at,$c_3,$t_1
1241	$MULTU	$a_4,$b_1		# mul_add_c(a[4],b[1],c3,c1,c2);
1242	$ADDU	$t_2,$at
1243	$ADDU	$c_1,$t_2
1244	sltu	$at,$c_1,$t_2
1245	$ADDU	$c_2,$at
1246	mflo	$t_1
1247	mfhi	$t_2
1248	$ADDU	$c_3,$t_1
1249	sltu	$at,$c_3,$t_1
1250	$MULTU	$a_5,$b_0		# mul_add_c(a[5],b[0],c3,c1,c2);
1251	$ADDU	$t_2,$at
1252	$ADDU	$c_1,$t_2
1253	sltu	$at,$c_1,$t_2
1254	$ADDU	$c_2,$at
1255	mflo	$t_1
1256	mfhi	$t_2
1257	$ADDU	$c_3,$t_1
1258	sltu	$at,$c_3,$t_1
1259	$MULTU	$a_6,$b_0		# mul_add_c(a[6],b[0],c1,c2,c3);
1260	$ADDU	$t_2,$at
1261	$ADDU	$c_1,$t_2
1262	sltu	$at,$c_1,$t_2
1263	$ADDU	$c_2,$at
1264	$ST	$c_3,5*$BNSZ($a0)	# r[5]=c3;
1265
1266	mflo	$t_1
1267	mfhi	$t_2
1268	$ADDU	$c_1,$t_1
1269	sltu	$at,$c_1,$t_1
1270	$MULTU	$a_5,$b_1		# mul_add_c(a[5],b[1],c1,c2,c3);
1271	$ADDU	$t_2,$at
1272	$ADDU	$c_2,$t_2
1273	sltu	$c_3,$c_2,$t_2
1274	mflo	$t_1
1275	mfhi	$t_2
1276	$ADDU	$c_1,$t_1
1277	sltu	$at,$c_1,$t_1
1278	$MULTU	$a_4,$b_2		# mul_add_c(a[4],b[2],c1,c2,c3);
1279	$ADDU	$t_2,$at
1280	$ADDU	$c_2,$t_2
1281	sltu	$at,$c_2,$t_2
1282	$ADDU	$c_3,$at
1283	mflo	$t_1
1284	mfhi	$t_2
1285	$ADDU	$c_1,$t_1
1286	sltu	$at,$c_1,$t_1
1287	$MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
1288	$ADDU	$t_2,$at
1289	$ADDU	$c_2,$t_2
1290	sltu	$at,$c_2,$t_2
1291	$ADDU	$c_3,$at
1292	mflo	$t_1
1293	mfhi	$t_2
1294	$ADDU	$c_1,$t_1
1295	sltu	$at,$c_1,$t_1
1296	$MULTU	$a_2,$b_4		# mul_add_c(a[2],b[4],c1,c2,c3);
1297	$ADDU	$t_2,$at
1298	$ADDU	$c_2,$t_2
1299	sltu	$at,$c_2,$t_2
1300	$ADDU	$c_3,$at
1301	mflo	$t_1
1302	mfhi	$t_2
1303	$ADDU	$c_1,$t_1
1304	sltu	$at,$c_1,$t_1
1305	$MULTU	$a_1,$b_5		# mul_add_c(a[1],b[5],c1,c2,c3);
1306	$ADDU	$t_2,$at
1307	$ADDU	$c_2,$t_2
1308	sltu	$at,$c_2,$t_2
1309	$ADDU	$c_3,$at
1310	mflo	$t_1
1311	mfhi	$t_2
1312	$ADDU	$c_1,$t_1
1313	sltu	$at,$c_1,$t_1
1314	$MULTU	$a_0,$b_6		# mul_add_c(a[0],b[6],c1,c2,c3);
1315	$ADDU	$t_2,$at
1316	$ADDU	$c_2,$t_2
1317	sltu	$at,$c_2,$t_2
1318	$ADDU	$c_3,$at
1319	mflo	$t_1
1320	mfhi	$t_2
1321	$ADDU	$c_1,$t_1
1322	sltu	$at,$c_1,$t_1
1323	$MULTU	$a_0,$b_7		# mul_add_c(a[0],b[7],c2,c3,c1);
1324	$ADDU	$t_2,$at
1325	$ADDU	$c_2,$t_2
1326	sltu	$at,$c_2,$t_2
1327	$ADDU	$c_3,$at
1328	$ST	$c_1,6*$BNSZ($a0)	# r[6]=c1;
1329
1330	mflo	$t_1
1331	mfhi	$t_2
1332	$ADDU	$c_2,$t_1
1333	sltu	$at,$c_2,$t_1
1334	$MULTU	$a_1,$b_6		# mul_add_c(a[1],b[6],c2,c3,c1);
1335	$ADDU	$t_2,$at
1336	$ADDU	$c_3,$t_2
1337	sltu	$c_1,$c_3,$t_2
1338	mflo	$t_1
1339	mfhi	$t_2
1340	$ADDU	$c_2,$t_1
1341	sltu	$at,$c_2,$t_1
1342	$MULTU	$a_2,$b_5		# mul_add_c(a[2],b[5],c2,c3,c1);
1343	$ADDU	$t_2,$at
1344	$ADDU	$c_3,$t_2
1345	sltu	$at,$c_3,$t_2
1346	$ADDU	$c_1,$at
1347	mflo	$t_1
1348	mfhi	$t_2
1349	$ADDU	$c_2,$t_1
1350	sltu	$at,$c_2,$t_1
1351	$MULTU	$a_3,$b_4		# mul_add_c(a[3],b[4],c2,c3,c1);
1352	$ADDU	$t_2,$at
1353	$ADDU	$c_3,$t_2
1354	sltu	$at,$c_3,$t_2
1355	$ADDU	$c_1,$at
1356	mflo	$t_1
1357	mfhi	$t_2
1358	$ADDU	$c_2,$t_1
1359	sltu	$at,$c_2,$t_1
1360	$MULTU	$a_4,$b_3		# mul_add_c(a[4],b[3],c2,c3,c1);
1361	$ADDU	$t_2,$at
1362	$ADDU	$c_3,$t_2
1363	sltu	$at,$c_3,$t_2
1364	$ADDU	$c_1,$at
1365	mflo	$t_1
1366	mfhi	$t_2
1367	$ADDU	$c_2,$t_1
1368	sltu	$at,$c_2,$t_1
1369	$MULTU	$a_5,$b_2		# mul_add_c(a[5],b[2],c2,c3,c1);
1370	$ADDU	$t_2,$at
1371	$ADDU	$c_3,$t_2
1372	sltu	$at,$c_3,$t_2
1373	$ADDU	$c_1,$at
1374	mflo	$t_1
1375	mfhi	$t_2
1376	$ADDU	$c_2,$t_1
1377	sltu	$at,$c_2,$t_1
1378	$MULTU	$a_6,$b_1		# mul_add_c(a[6],b[1],c2,c3,c1);
1379	$ADDU	$t_2,$at
1380	$ADDU	$c_3,$t_2
1381	sltu	$at,$c_3,$t_2
1382	$ADDU	$c_1,$at
1383	mflo	$t_1
1384	mfhi	$t_2
1385	$ADDU	$c_2,$t_1
1386	sltu	$at,$c_2,$t_1
1387	$MULTU	$a_7,$b_0		# mul_add_c(a[7],b[0],c2,c3,c1);
1388	$ADDU	$t_2,$at
1389	$ADDU	$c_3,$t_2
1390	sltu	$at,$c_3,$t_2
1391	$ADDU	$c_1,$at
1392	mflo	$t_1
1393	mfhi	$t_2
1394	$ADDU	$c_2,$t_1
1395	sltu	$at,$c_2,$t_1
1396	$MULTU	$a_7,$b_1		# mul_add_c(a[7],b[1],c3,c1,c2);
1397	$ADDU	$t_2,$at
1398	$ADDU	$c_3,$t_2
1399	sltu	$at,$c_3,$t_2
1400	$ADDU	$c_1,$at
1401	$ST	$c_2,7*$BNSZ($a0)	# r[7]=c2;
1402
1403	mflo	$t_1
1404	mfhi	$t_2
1405	$ADDU	$c_3,$t_1
1406	sltu	$at,$c_3,$t_1
1407	$MULTU	$a_6,$b_2		# mul_add_c(a[6],b[2],c3,c1,c2);
1408	$ADDU	$t_2,$at
1409	$ADDU	$c_1,$t_2
1410	sltu	$c_2,$c_1,$t_2
1411	mflo	$t_1
1412	mfhi	$t_2
1413	$ADDU	$c_3,$t_1
1414	sltu	$at,$c_3,$t_1
1415	$MULTU	$a_5,$b_3		# mul_add_c(a[5],b[3],c3,c1,c2);
1416	$ADDU	$t_2,$at
1417	$ADDU	$c_1,$t_2
1418	sltu	$at,$c_1,$t_2
1419	$ADDU	$c_2,$at
1420	mflo	$t_1
1421	mfhi	$t_2
1422	$ADDU	$c_3,$t_1
1423	sltu	$at,$c_3,$t_1
1424	$MULTU	$a_4,$b_4		# mul_add_c(a[4],b[4],c3,c1,c2);
1425	$ADDU	$t_2,$at
1426	$ADDU	$c_1,$t_2
1427	sltu	$at,$c_1,$t_2
1428	$ADDU	$c_2,$at
1429	mflo	$t_1
1430	mfhi	$t_2
1431	$ADDU	$c_3,$t_1
1432	sltu	$at,$c_3,$t_1
1433	$MULTU	$a_3,$b_5		# mul_add_c(a[3],b[5],c3,c1,c2);
1434	$ADDU	$t_2,$at
1435	$ADDU	$c_1,$t_2
1436	sltu	$at,$c_1,$t_2
1437	$ADDU	$c_2,$at
1438	mflo	$t_1
1439	mfhi	$t_2
1440	$ADDU	$c_3,$t_1
1441	sltu	$at,$c_3,$t_1
1442	$MULTU	$a_2,$b_6		# mul_add_c(a[2],b[6],c3,c1,c2);
1443	$ADDU	$t_2,$at
1444	$ADDU	$c_1,$t_2
1445	sltu	$at,$c_1,$t_2
1446	$ADDU	$c_2,$at
1447	mflo	$t_1
1448	mfhi	$t_2
1449	$ADDU	$c_3,$t_1
1450	sltu	$at,$c_3,$t_1
1451	$MULTU	$a_1,$b_7		# mul_add_c(a[1],b[7],c3,c1,c2);
1452	$ADDU	$t_2,$at
1453	$ADDU	$c_1,$t_2
1454	sltu	$at,$c_1,$t_2
1455	$ADDU	$c_2,$at
1456	mflo	$t_1
1457	mfhi	$t_2
1458	$ADDU	$c_3,$t_1
1459	sltu	$at,$c_3,$t_1
1460	$MULTU	$a_2,$b_7		# mul_add_c(a[2],b[7],c1,c2,c3);
1461	$ADDU	$t_2,$at
1462	$ADDU	$c_1,$t_2
1463	sltu	$at,$c_1,$t_2
1464	$ADDU	$c_2,$at
1465	$ST	$c_3,8*$BNSZ($a0)	# r[8]=c3;
1466
1467	mflo	$t_1
1468	mfhi	$t_2
1469	$ADDU	$c_1,$t_1
1470	sltu	$at,$c_1,$t_1
1471	$MULTU	$a_3,$b_6		# mul_add_c(a[3],b[6],c1,c2,c3);
1472	$ADDU	$t_2,$at
1473	$ADDU	$c_2,$t_2
1474	sltu	$c_3,$c_2,$t_2
1475	mflo	$t_1
1476	mfhi	$t_2
1477	$ADDU	$c_1,$t_1
1478	sltu	$at,$c_1,$t_1
1479	$MULTU	$a_4,$b_5		# mul_add_c(a[4],b[5],c1,c2,c3);
1480	$ADDU	$t_2,$at
1481	$ADDU	$c_2,$t_2
1482	sltu	$at,$c_2,$t_2
1483	$ADDU	$c_3,$at
1484	mflo	$t_1
1485	mfhi	$t_2
1486	$ADDU	$c_1,$t_1
1487	sltu	$at,$c_1,$t_1
1488	$MULTU	$a_5,$b_4		# mul_add_c(a[5],b[4],c1,c2,c3);
1489	$ADDU	$t_2,$at
1490	$ADDU	$c_2,$t_2
1491	sltu	$at,$c_2,$t_2
1492	$ADDU	$c_3,$at
1493	mflo	$t_1
1494	mfhi	$t_2
1495	$ADDU	$c_1,$t_1
1496	sltu	$at,$c_1,$t_1
1497	$MULTU	$a_6,$b_3		# mul_add_c(a[6],b[3],c1,c2,c3);
1498	$ADDU	$t_2,$at
1499	$ADDU	$c_2,$t_2
1500	sltu	$at,$c_2,$t_2
1501	$ADDU	$c_3,$at
1502	mflo	$t_1
1503	mfhi	$t_2
1504	$ADDU	$c_1,$t_1
1505	sltu	$at,$c_1,$t_1
1506	$MULTU	$a_7,$b_2		# mul_add_c(a[7],b[2],c1,c2,c3);
1507	$ADDU	$t_2,$at
1508	$ADDU	$c_2,$t_2
1509	sltu	$at,$c_2,$t_2
1510	$ADDU	$c_3,$at
1511	mflo	$t_1
1512	mfhi	$t_2
1513	$ADDU	$c_1,$t_1
1514	sltu	$at,$c_1,$t_1
1515	$MULTU	$a_7,$b_3		# mul_add_c(a[7],b[3],c2,c3,c1);
1516	$ADDU	$t_2,$at
1517	$ADDU	$c_2,$t_2
1518	sltu	$at,$c_2,$t_2
1519	$ADDU	$c_3,$at
1520	$ST	$c_1,9*$BNSZ($a0)	# r[9]=c1;
1521
1522	mflo	$t_1
1523	mfhi	$t_2
1524	$ADDU	$c_2,$t_1
1525	sltu	$at,$c_2,$t_1
1526	$MULTU	$a_6,$b_4		# mul_add_c(a[6],b[4],c2,c3,c1);
1527	$ADDU	$t_2,$at
1528	$ADDU	$c_3,$t_2
1529	sltu	$c_1,$c_3,$t_2
1530	mflo	$t_1
1531	mfhi	$t_2
1532	$ADDU	$c_2,$t_1
1533	sltu	$at,$c_2,$t_1
1534	$MULTU	$a_5,$b_5		# mul_add_c(a[5],b[5],c2,c3,c1);
1535	$ADDU	$t_2,$at
1536	$ADDU	$c_3,$t_2
1537	sltu	$at,$c_3,$t_2
1538	$ADDU	$c_1,$at
1539	mflo	$t_1
1540	mfhi	$t_2
1541	$ADDU	$c_2,$t_1
1542	sltu	$at,$c_2,$t_1
1543	$MULTU	$a_4,$b_6		# mul_add_c(a[4],b[6],c2,c3,c1);
1544	$ADDU	$t_2,$at
1545	$ADDU	$c_3,$t_2
1546	sltu	$at,$c_3,$t_2
1547	$ADDU	$c_1,$at
1548	mflo	$t_1
1549	mfhi	$t_2
1550	$ADDU	$c_2,$t_1
1551	sltu	$at,$c_2,$t_1
1552	$MULTU	$a_3,$b_7		# mul_add_c(a[3],b[7],c2,c3,c1);
1553	$ADDU	$t_2,$at
1554	$ADDU	$c_3,$t_2
1555	sltu	$at,$c_3,$t_2
1556	$ADDU	$c_1,$at
1557	mflo	$t_1
1558	mfhi	$t_2
1559	$ADDU	$c_2,$t_1
1560	sltu	$at,$c_2,$t_1
1561	$MULTU	$a_4,$b_7		# mul_add_c(a[4],b[7],c3,c1,c2);
1562	$ADDU	$t_2,$at
1563	$ADDU	$c_3,$t_2
1564	sltu	$at,$c_3,$t_2
1565	$ADDU	$c_1,$at
1566	$ST	$c_2,10*$BNSZ($a0)	# r[10]=c2;
1567
1568	mflo	$t_1
1569	mfhi	$t_2
1570	$ADDU	$c_3,$t_1
1571	sltu	$at,$c_3,$t_1
1572	$MULTU	$a_5,$b_6		# mul_add_c(a[5],b[6],c3,c1,c2);
1573	$ADDU	$t_2,$at
1574	$ADDU	$c_1,$t_2
1575	sltu	$c_2,$c_1,$t_2
1576	mflo	$t_1
1577	mfhi	$t_2
1578	$ADDU	$c_3,$t_1
1579	sltu	$at,$c_3,$t_1
1580	$MULTU	$a_6,$b_5		# mul_add_c(a[6],b[5],c3,c1,c2);
1581	$ADDU	$t_2,$at
1582	$ADDU	$c_1,$t_2
1583	sltu	$at,$c_1,$t_2
1584	$ADDU	$c_2,$at
1585	mflo	$t_1
1586	mfhi	$t_2
1587	$ADDU	$c_3,$t_1
1588	sltu	$at,$c_3,$t_1
1589	$MULTU	$a_7,$b_4		# mul_add_c(a[7],b[4],c3,c1,c2);
1590	$ADDU	$t_2,$at
1591	$ADDU	$c_1,$t_2
1592	sltu	$at,$c_1,$t_2
1593	$ADDU	$c_2,$at
1594	mflo	$t_1
1595	mfhi	$t_2
1596	$ADDU	$c_3,$t_1
1597	sltu	$at,$c_3,$t_1
1598	$MULTU	$a_7,$b_5		# mul_add_c(a[7],b[5],c1,c2,c3);
1599	$ADDU	$t_2,$at
1600	$ADDU	$c_1,$t_2
1601	sltu	$at,$c_1,$t_2
1602	$ADDU	$c_2,$at
1603	$ST	$c_3,11*$BNSZ($a0)	# r[11]=c3;
1604
1605	mflo	$t_1
1606	mfhi	$t_2
1607	$ADDU	$c_1,$t_1
1608	sltu	$at,$c_1,$t_1
1609	$MULTU	$a_6,$b_6		# mul_add_c(a[6],b[6],c1,c2,c3);
1610	$ADDU	$t_2,$at
1611	$ADDU	$c_2,$t_2
1612	sltu	$c_3,$c_2,$t_2
1613	mflo	$t_1
1614	mfhi	$t_2
1615	$ADDU	$c_1,$t_1
1616	sltu	$at,$c_1,$t_1
1617	$MULTU	$a_5,$b_7		# mul_add_c(a[5],b[7],c1,c2,c3);
1618	$ADDU	$t_2,$at
1619	$ADDU	$c_2,$t_2
1620	sltu	$at,$c_2,$t_2
1621	$ADDU	$c_3,$at
1622	mflo	$t_1
1623	mfhi	$t_2
1624	$ADDU	$c_1,$t_1
1625	sltu	$at,$c_1,$t_1
1626	$MULTU	$a_6,$b_7		# mul_add_c(a[6],b[7],c2,c3,c1);
1627	$ADDU	$t_2,$at
1628	$ADDU	$c_2,$t_2
1629	sltu	$at,$c_2,$t_2
1630	$ADDU	$c_3,$at
1631	$ST	$c_1,12*$BNSZ($a0)	# r[12]=c1;
1632
1633	mflo	$t_1
1634	mfhi	$t_2
1635	$ADDU	$c_2,$t_1
1636	sltu	$at,$c_2,$t_1
1637	$MULTU	$a_7,$b_6		# mul_add_c(a[7],b[6],c2,c3,c1);
1638	$ADDU	$t_2,$at
1639	$ADDU	$c_3,$t_2
1640	sltu	$c_1,$c_3,$t_2
1641	mflo	$t_1
1642	mfhi	$t_2
1643	$ADDU	$c_2,$t_1
1644	sltu	$at,$c_2,$t_1
1645	$MULTU	$a_7,$b_7		# mul_add_c(a[7],b[7],c3,c1,c2);
1646	$ADDU	$t_2,$at
1647	$ADDU	$c_3,$t_2
1648	sltu	$at,$c_3,$t_2
1649	$ADDU	$c_1,$at
1650	$ST	$c_2,13*$BNSZ($a0)	# r[13]=c2;
1651
1652	mflo	$t_1
1653	mfhi	$t_2
1654	$ADDU	$c_3,$t_1
1655	sltu	$at,$c_3,$t_1
1656	$ADDU	$t_2,$at
1657	$ADDU	$c_1,$t_2
1658	$ST	$c_3,14*$BNSZ($a0)	# r[14]=c3;
1659	$ST	$c_1,15*$BNSZ($a0)	# r[15]=c1;
1660
1661	.set	noreorder
1662___
# nubi ABI epilogue: mirror image of the nubi prologue above; the stack
# pop rides in the jr delay slot.
1663$code.=<<___ if ($flavour =~ /nubi/i);
1664	$REG_L	$s5,10*$SZREG($sp)
1665	$REG_L	$s4,9*$SZREG($sp)
1666	$REG_L	$s3,8*$SZREG($sp)
1667	$REG_L	$s2,7*$SZREG($sp)
1668	$REG_L	$s1,6*$SZREG($sp)
1669	$REG_L	$s0,5*$SZREG($sp)
1670	$REG_L	$t3,4*$SZREG($sp)
1671	$REG_L	$t2,3*$SZREG($sp)
1672	$REG_L	$t1,2*$SZREG($sp)
1673	$REG_L	$t0,1*$SZREG($sp)
1674	$REG_L	$gp,0*$SZREG($sp)
1675	jr	$ra
1676	$PTR_ADD $sp,12*$SZREG
1677___
# Standard ABI epilogue: restore $s0-$s5 only.
1678$code.=<<___ if ($flavour !~ /nubi/i);
1679	$REG_L	$s5,5*$SZREG($sp)
1680	$REG_L	$s4,4*$SZREG($sp)
1681	$REG_L	$s3,3*$SZREG($sp)
1682	$REG_L	$s2,2*$SZREG($sp)
1683	$REG_L	$s1,1*$SZREG($sp)
1684	$REG_L	$s0,0*$SZREG($sp)
1685	jr	$ra
1686	$PTR_ADD $sp,6*$SZREG
1687___
# Close bn_mul_comba8 and open the bn_mul_comba4 entry point; the 4x4
# variant needs no $s-registers, so only nubi gets a frame (next chunk).
1688$code.=<<___;
1689.end	bn_mul_comba8
1690
1691.align	5
1692.globl	bn_mul_comba4
1693.ent	bn_mul_comba4
1694bn_mul_comba4:
1695___
# --- bn_mul_comba4 (body) -------------------------------------------------
# Emits bn_mul_comba4(r, a, b): 4x4-limb comba multiplication producing
# r[0..7].  Same column-by-column accumulate-with-carry pattern as
# bn_mul_comba8, with the next $MULTU issued before the previous product
# is drained via mflo/mfhi.  nubi ABI prologue first: save $ra, $t0-$t3
# and $gp in a 6-slot frame.
1696$code.=<<___ if ($flavour =~ /nubi/i);
1697	.frame	$sp,6*$SZREG,$ra
1698	.mask	0x8000f008,-$SZREG
1699	.set	noreorder
1700	$PTR_SUB $sp,6*$SZREG
1701	$REG_S	$ra,5*$SZREG($sp)
1702	$REG_S	$t3,4*$SZREG($sp)
1703	$REG_S	$t2,3*$SZREG($sp)
1704	$REG_S	$t1,2*$SZREG($sp)
1705	$REG_S	$t0,1*$SZREG($sp)
1706	$REG_S	$gp,0*$SZREG($sp)
1707___
# Main body: load the 8 input limbs and walk the 7 comba columns.
1708$code.=<<___;
1709	.set	reorder
1710	$LD	$a_0,0($a1)
1711	$LD	$b_0,0($a2)
1712	$LD	$a_1,$BNSZ($a1)
1713	$LD	$a_2,2*$BNSZ($a1)
1714	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
1715	$LD	$a_3,3*$BNSZ($a1)
1716	$LD	$b_1,$BNSZ($a2)
1717	$LD	$b_2,2*$BNSZ($a2)
1718	$LD	$b_3,3*$BNSZ($a2)
1719	mflo	$c_1
1720	mfhi	$c_2
1721	$ST	$c_1,0($a0)
1722
1723	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
1724	mflo	$t_1
1725	mfhi	$t_2
1726	$ADDU	$c_2,$t_1
1727	sltu	$at,$c_2,$t_1
1728	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
1729	$ADDU	$c_3,$t_2,$at
1730	mflo	$t_1
1731	mfhi	$t_2
1732	$ADDU	$c_2,$t_1
1733	sltu	$at,$c_2,$t_1
1734	$MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
1735	$ADDU	$t_2,$at
1736	$ADDU	$c_3,$t_2
1737	sltu	$c_1,$c_3,$t_2
1738	$ST	$c_2,$BNSZ($a0)
1739
1740	mflo	$t_1
1741	mfhi	$t_2
1742	$ADDU	$c_3,$t_1
1743	sltu	$at,$c_3,$t_1
1744	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
1745	$ADDU	$t_2,$at
1746	$ADDU	$c_1,$t_2
1747	mflo	$t_1
1748	mfhi	$t_2
1749	$ADDU	$c_3,$t_1
1750	sltu	$at,$c_3,$t_1
1751	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
1752	$ADDU	$t_2,$at
1753	$ADDU	$c_1,$t_2
1754	sltu	$c_2,$c_1,$t_2
1755	mflo	$t_1
1756	mfhi	$t_2
1757	$ADDU	$c_3,$t_1
1758	sltu	$at,$c_3,$t_1
1759	$MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
1760	$ADDU	$t_2,$at
1761	$ADDU	$c_1,$t_2
1762	sltu	$at,$c_1,$t_2
1763	$ADDU	$c_2,$at
1764	$ST	$c_3,2*$BNSZ($a0)
1765
1766	mflo	$t_1
1767	mfhi	$t_2
1768	$ADDU	$c_1,$t_1
1769	sltu	$at,$c_1,$t_1
1770	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
1771	$ADDU	$t_2,$at
1772	$ADDU	$c_2,$t_2
1773	sltu	$c_3,$c_2,$t_2
1774	mflo	$t_1
1775	mfhi	$t_2
1776	$ADDU	$c_1,$t_1
1777	sltu	$at,$c_1,$t_1
1778	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
1779	$ADDU	$t_2,$at
1780	$ADDU	$c_2,$t_2
1781	sltu	$at,$c_2,$t_2
1782	$ADDU	$c_3,$at
1783	mflo	$t_1
1784	mfhi	$t_2
1785	$ADDU	$c_1,$t_1
1786	sltu	$at,$c_1,$t_1
1787	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
1788	$ADDU	$t_2,$at
1789	$ADDU	$c_2,$t_2
1790	sltu	$at,$c_2,$t_2
1791	$ADDU	$c_3,$at
1792	mflo	$t_1
1793	mfhi	$t_2
1794	$ADDU	$c_1,$t_1
1795	sltu	$at,$c_1,$t_1
1796	$MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
1797	$ADDU	$t_2,$at
1798	$ADDU	$c_2,$t_2
1799	sltu	$at,$c_2,$t_2
1800	$ADDU	$c_3,$at
1801	$ST	$c_1,3*$BNSZ($a0)
1802
1803	mflo	$t_1
1804	mfhi	$t_2
1805	$ADDU	$c_2,$t_1
1806	sltu	$at,$c_2,$t_1
1807	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
1808	$ADDU	$t_2,$at
1809	$ADDU	$c_3,$t_2
1810	sltu	$c_1,$c_3,$t_2
1811	mflo	$t_1
1812	mfhi	$t_2
1813	$ADDU	$c_2,$t_1
1814	sltu	$at,$c_2,$t_1
1815	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
1816	$ADDU	$t_2,$at
1817	$ADDU	$c_3,$t_2
1818	sltu	$at,$c_3,$t_2
1819	$ADDU	$c_1,$at
1820	mflo	$t_1
1821	mfhi	$t_2
1822	$ADDU	$c_2,$t_1
1823	sltu	$at,$c_2,$t_1
1824	$MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
1825	$ADDU	$t_2,$at
1826	$ADDU	$c_3,$t_2
1827	sltu	$at,$c_3,$t_2
1828	$ADDU	$c_1,$at
1829	$ST	$c_2,4*$BNSZ($a0)
1830
1831	mflo	$t_1
1832	mfhi	$t_2
1833	$ADDU	$c_3,$t_1
1834	sltu	$at,$c_3,$t_1
1835	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
1836	$ADDU	$t_2,$at
1837	$ADDU	$c_1,$t_2
1838	sltu	$c_2,$c_1,$t_2
1839	mflo	$t_1
1840	mfhi	$t_2
1841	$ADDU	$c_3,$t_1
1842	sltu	$at,$c_3,$t_1
1843	$MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
1844	$ADDU	$t_2,$at
1845	$ADDU	$c_1,$t_2
1846	sltu	$at,$c_1,$t_2
1847	$ADDU	$c_2,$at
1848	$ST	$c_3,5*$BNSZ($a0)
1849
1850	mflo	$t_1
1851	mfhi	$t_2
1852	$ADDU	$c_1,$t_1
1853	sltu	$at,$c_1,$t_1
1854	$ADDU	$t_2,$at
1855	$ADDU	$c_2,$t_2
1856	$ST	$c_1,6*$BNSZ($a0)
1857	$ST	$c_2,7*$BNSZ($a0)
1858
1859	.set	noreorder
1860___
# nubi ABI epilogue: restore saved temporaries and pop the frame.
1861$code.=<<___ if ($flavour =~ /nubi/i);
1862	$REG_L	$t3,4*$SZREG($sp)
1863	$REG_L	$t2,3*$SZREG($sp)
1864	$REG_L	$t1,2*$SZREG($sp)
1865	$REG_L	$t0,1*$SZREG($sp)
1866	$REG_L	$gp,0*$SZREG($sp)
1867	$PTR_ADD $sp,6*$SZREG
1868___
# Common return.
1869$code.=<<___;
1870	jr	$ra
1871	nop
1872.end	bn_mul_comba4
1873___
1874
# Squaring reads only one input array, so the b-limb registers are free:
# reuse them for a[4..7] in the bn_sqr_comba* routines below.
1875($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
1876
1877$code.=<<___;
1878
1879.align 5
1880.globl bn_sqr_comba8
1881.ent bn_sqr_comba8
1882bn_sqr_comba8:
1883___
1884$code.=<<___ if ($flavour =~ /nubi/i);
1885 .frame $sp,6*$SZREG,$ra
1886 .mask 0x8000f008,-$SZREG
1887 .set noreorder
1888 $PTR_SUB $sp,6*$SZREG
1889 $REG_S $ra,5*$SZREG($sp)
1890 $REG_S $t3,4*$SZREG($sp)
1891 $REG_S $t2,3*$SZREG($sp)
1892 $REG_S $t1,2*$SZREG($sp)
1893 $REG_S $t0,1*$SZREG($sp)
1894 $REG_S $gp,0*$SZREG($sp)
1895___
1896$code.=<<___;
1897 .set reorder
1898 $LD $a_0,0($a1)
1899 $LD $a_1,$BNSZ($a1)
1900 $LD $a_2,2*$BNSZ($a1)
1901 $LD $a_3,3*$BNSZ($a1)
1902
1903 $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1904 $LD $a_4,4*$BNSZ($a1)
1905 $LD $a_5,5*$BNSZ($a1)
1906 $LD $a_6,6*$BNSZ($a1)
1907 $LD $a_7,7*$BNSZ($a1)
1908 mflo $c_1
1909 mfhi $c_2
1910 $ST $c_1,0($a0)
1911
1912 $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
1913 mflo $t_1
1914 mfhi $t_2
1915 slt $c_1,$t_2,$zero
1916 $SLL $t_2,1
1917 $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
1918 slt $a2,$t_1,$zero
1919 $ADDU $t_2,$a2
1920 $SLL $t_1,1
1921 $ADDU $c_2,$t_1
1922 sltu $at,$c_2,$t_1
1923 $ADDU $c_3,$t_2,$at
1924 $ST $c_2,$BNSZ($a0)
1925
1926 mflo $t_1
1927 mfhi $t_2
1928 slt $c_2,$t_2,$zero
1929 $SLL $t_2,1
1930 $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2);
1931 slt $a2,$t_1,$zero
1932 $ADDU $t_2,$a2
1933 $SLL $t_1,1
1934 $ADDU $c_3,$t_1
1935 sltu $at,$c_3,$t_1
1936 $ADDU $t_2,$at
1937 $ADDU $c_1,$t_2
1938 sltu $at,$c_1,$t_2
1939 $ADDU $c_2,$at
1940 mflo $t_1
1941 mfhi $t_2
1942 $ADDU $c_3,$t_1
1943 sltu $at,$c_3,$t_1
1944 $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
1945 $ADDU $t_2,$at
1946 $ADDU $c_1,$t_2
1947 sltu $at,$c_1,$t_2
1948 $ADDU $c_2,$at
1949 $ST $c_3,2*$BNSZ($a0)
1950
1951 mflo $t_1
1952 mfhi $t_2
1953 slt $c_3,$t_2,$zero
1954 $SLL $t_2,1
1955 $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3);
1956 slt $a2,$t_1,$zero
1957 $ADDU $t_2,$a2
1958 $SLL $t_1,1
1959 $ADDU $c_1,$t_1
1960 sltu $at,$c_1,$t_1
1961 $ADDU $t_2,$at
1962 $ADDU $c_2,$t_2
1963 sltu $at,$c_2,$t_2
1964 $ADDU $c_3,$at
1965 mflo $t_1
1966 mfhi $t_2
1967 slt $at,$t_2,$zero
1968 $ADDU $c_3,$at
1969 $MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1);
1970 $SLL $t_2,1
1971 slt $a2,$t_1,$zero
1972 $ADDU $t_2,$a2
1973 $SLL $t_1,1
1974 $ADDU $c_1,$t_1
1975 sltu $at,$c_1,$t_1
1976 $ADDU $t_2,$at
1977 $ADDU $c_2,$t_2
1978 sltu $at,$c_2,$t_2
1979 $ADDU $c_3,$at
1980 $ST $c_1,3*$BNSZ($a0)
1981
1982 mflo $t_1
1983 mfhi $t_2
1984 slt $c_1,$t_2,$zero
1985 $SLL $t_2,1
1986 $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1);
1987 slt $a2,$t_1,$zero
1988 $ADDU $t_2,$a2
1989 $SLL $t_1,1
1990 $ADDU $c_2,$t_1
1991 sltu $at,$c_2,$t_1
1992 $ADDU $t_2,$at
1993 $ADDU $c_3,$t_2
1994 sltu $at,$c_3,$t_2
1995 $ADDU $c_1,$at
1996 mflo $t_1
1997 mfhi $t_2
1998 slt $at,$t_2,$zero
1999 $ADDU $c_1,$at
2000 $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1);
2001 $SLL $t_2,1
2002 slt $a2,$t_1,$zero
2003 $ADDU $t_2,$a2
2004 $SLL $t_1,1
2005 $ADDU $c_2,$t_1
2006 sltu $at,$c_2,$t_1
2007 $ADDU $t_2,$at
2008 $ADDU $c_3,$t_2
2009 sltu $at,$c_3,$t_2
2010 $ADDU $c_1,$at
2011 mflo $t_1
2012 mfhi $t_2
2013 $ADDU $c_2,$t_1
2014 sltu $at,$c_2,$t_1
2015 $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2);
2016 $ADDU $t_2,$at
2017 $ADDU $c_3,$t_2
2018 sltu $at,$c_3,$t_2
2019 $ADDU $c_1,$at
2020 $ST $c_2,4*$BNSZ($a0)
2021
2022 mflo $t_1
2023 mfhi $t_2
2024 slt $c_2,$t_2,$zero
2025 $SLL $t_2,1
2026 $MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2);
2027 slt $a2,$t_1,$zero
2028 $ADDU $t_2,$a2
2029 $SLL $t_1,1
2030 $ADDU $c_3,$t_1
2031 sltu $at,$c_3,$t_1
2032 $ADDU $t_2,$at
2033 $ADDU $c_1,$t_2
2034 sltu $at,$c_1,$t_2
2035 $ADDU $c_2,$at
2036 mflo $t_1
2037 mfhi $t_2
2038 slt $at,$t_2,$zero
2039 $ADDU $c_2,$at
2040 $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2);
2041 $SLL $t_2,1
2042 slt $a2,$t_1,$zero
2043 $ADDU $t_2,$a2
2044 $SLL $t_1,1
2045 $ADDU $c_3,$t_1
2046 sltu $at,$c_3,$t_1
2047 $ADDU $t_2,$at
2048 $ADDU $c_1,$t_2
2049 sltu $at,$c_1,$t_2
2050 $ADDU $c_2,$at
2051 mflo $t_1
2052 mfhi $t_2
2053 slt $at,$t_2,$zero
2054 $MULTU $a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3);
2055 $ADDU $c_2,$at
2056 $SLL $t_2,1
2057 slt $a2,$t_1,$zero
2058 $ADDU $t_2,$a2
2059 $SLL $t_1,1
2060 $ADDU $c_3,$t_1
2061 sltu $at,$c_3,$t_1
2062 $ADDU $t_2,$at
2063 $ADDU $c_1,$t_2
2064 sltu $at,$c_1,$t_2
2065 $ADDU $c_2,$at
2066 $ST $c_3,5*$BNSZ($a0)
2067
2068 mflo $t_1
2069 mfhi $t_2
2070 slt $c_3,$t_2,$zero
2071 $SLL $t_2,1
2072 $MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3);
2073 slt $a2,$t_1,$zero
2074 $ADDU $t_2,$a2
2075 $SLL $t_1,1
2076 $ADDU $c_1,$t_1
2077 sltu $at,$c_1,$t_1
2078 $ADDU $t_2,$at
2079 $ADDU $c_2,$t_2
2080 sltu $at,$c_2,$t_2
2081 $ADDU $c_3,$at
2082 mflo $t_1
2083 mfhi $t_2
2084 slt $at,$t_2,$zero
2085 $ADDU $c_3,$at
2086 $MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3);
2087 $SLL $t_2,1
2088 slt $a2,$t_1,$zero
2089 $ADDU $t_2,$a2
2090 $SLL $t_1,1
2091 $ADDU $c_1,$t_1
2092 sltu $at,$c_1,$t_1
2093 $ADDU $t_2,$at
2094 $ADDU $c_2,$t_2
2095 sltu $at,$c_2,$t_2
2096 $ADDU $c_3,$at
2097 mflo $t_1
2098 mfhi $t_2
2099 slt $at,$t_2,$zero
2100 $ADDU $c_3,$at
2101 $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3);
2102 $SLL $t_2,1
2103 slt $a2,$t_1,$zero
2104 $ADDU $t_2,$a2
2105 $SLL $t_1,1
2106 $ADDU $c_1,$t_1
2107 sltu $at,$c_1,$t_1
2108 $ADDU $t_2,$at
2109 $ADDU $c_2,$t_2
2110 sltu $at,$c_2,$t_2
2111 $ADDU $c_3,$at
2112 mflo $t_1
2113 mfhi $t_2
2114 $ADDU $c_1,$t_1
2115 sltu $at,$c_1,$t_1
2116 $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1);
2117 $ADDU $t_2,$at
2118 $ADDU $c_2,$t_2
2119 sltu $at,$c_2,$t_2
2120 $ADDU $c_3,$at
2121 $ST $c_1,6*$BNSZ($a0)
2122
2123 mflo $t_1
2124 mfhi $t_2
2125 slt $c_1,$t_2,$zero
2126 $SLL $t_2,1
2127 $MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1);
2128 slt $a2,$t_1,$zero
2129 $ADDU $t_2,$a2
2130 $SLL $t_1,1
2131 $ADDU $c_2,$t_1
2132 sltu $at,$c_2,$t_1
2133 $ADDU $t_2,$at
2134 $ADDU $c_3,$t_2
2135 sltu $at,$c_3,$t_2
2136 $ADDU $c_1,$at
2137 mflo $t_1
2138 mfhi $t_2
2139 slt $at,$t_2,$zero
2140 $ADDU $c_1,$at
2141 $MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1);
2142 $SLL $t_2,1
2143 slt $a2,$t_1,$zero
2144 $ADDU $t_2,$a2
2145 $SLL $t_1,1
2146 $ADDU $c_2,$t_1
2147 sltu $at,$c_2,$t_1
2148 $ADDU $t_2,$at
2149 $ADDU $c_3,$t_2
2150 sltu $at,$c_3,$t_2
2151 $ADDU $c_1,$at
2152 mflo $t_1
2153 mfhi $t_2
2154 slt $at,$t_2,$zero
2155 $ADDU $c_1,$at
2156 $MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1);
2157 $SLL $t_2,1
2158 slt $a2,$t_1,$zero
2159 $ADDU $t_2,$a2
2160 $SLL $t_1,1
2161 $ADDU $c_2,$t_1
2162 sltu $at,$c_2,$t_1
2163 $ADDU $t_2,$at
2164 $ADDU $c_3,$t_2
2165 sltu $at,$c_3,$t_2
2166 $ADDU $c_1,$at
2167 mflo $t_1
2168 mfhi $t_2
2169 slt $at,$t_2,$zero
2170 $ADDU $c_1,$at
2171 $MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2);
2172 $SLL $t_2,1
2173 slt $a2,$t_1,$zero
2174 $ADDU $t_2,$a2
2175 $SLL $t_1,1
2176 $ADDU $c_2,$t_1
2177 sltu $at,$c_2,$t_1
2178 $ADDU $t_2,$at
2179 $ADDU $c_3,$t_2
2180 sltu $at,$c_3,$t_2
2181 $ADDU $c_1,$at
2182 $ST $c_2,7*$BNSZ($a0)
2183
2184 mflo $t_1
2185 mfhi $t_2
2186 slt $c_2,$t_2,$zero
2187 $SLL $t_2,1
2188 $MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2);
2189 slt $a2,$t_1,$zero
2190 $ADDU $t_2,$a2
2191 $SLL $t_1,1
2192 $ADDU $c_3,$t_1
2193 sltu $at,$c_3,$t_1
2194 $ADDU $t_2,$at
2195 $ADDU $c_1,$t_2
2196 sltu $at,$c_1,$t_2
2197 $ADDU $c_2,$at
2198 mflo $t_1
2199 mfhi $t_2
2200 slt $at,$t_2,$zero
2201 $ADDU $c_2,$at
2202 $MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2);
2203 $SLL $t_2,1
2204 slt $a2,$t_1,$zero
2205 $ADDU $t_2,$a2
2206 $SLL $t_1,1
2207 $ADDU $c_3,$t_1
2208 sltu $at,$c_3,$t_1
2209 $ADDU $t_2,$at
2210 $ADDU $c_1,$t_2
2211 sltu $at,$c_1,$t_2
2212 $ADDU $c_2,$at
2213 mflo $t_1
2214 mfhi $t_2
2215 slt $at,$t_2,$zero
2216 $ADDU $c_2,$at
2217 $MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2);
2218 $SLL $t_2,1
2219 slt $a2,$t_1,$zero
2220 $ADDU $t_2,$a2
2221 $SLL $t_1,1
2222 $ADDU $c_3,$t_1
2223 sltu $at,$c_3,$t_1
2224 $ADDU $t_2,$at
2225 $ADDU $c_1,$t_2
2226 sltu $at,$c_1,$t_2
2227 $ADDU $c_2,$at
2228 mflo $t_1
2229 mfhi $t_2
2230 $ADDU $c_3,$t_1
2231 sltu $at,$c_3,$t_1
2232 $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3);
2233 $ADDU $t_2,$at
2234 $ADDU $c_1,$t_2
2235 sltu $at,$c_1,$t_2
2236 $ADDU $c_2,$at
2237 $ST $c_3,8*$BNSZ($a0)
2238
2239 mflo $t_1
2240 mfhi $t_2
2241 slt $c_3,$t_2,$zero
2242 $SLL $t_2,1
2243 $MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3);
2244 slt $a2,$t_1,$zero
2245 $ADDU $t_2,$a2
2246 $SLL $t_1,1
2247 $ADDU $c_1,$t_1
2248 sltu $at,$c_1,$t_1
2249 $ADDU $t_2,$at
2250 $ADDU $c_2,$t_2
2251 sltu $at,$c_2,$t_2
2252 $ADDU $c_3,$at
2253 mflo $t_1
2254 mfhi $t_2
2255 slt $at,$t_2,$zero
2256 $ADDU $c_3,$at
2257 $MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3);
2258 $SLL $t_2,1
2259 slt $a2,$t_1,$zero
2260 $ADDU $t_2,$a2
2261 $SLL $t_1,1
2262 $ADDU $c_1,$t_1
2263 sltu $at,$c_1,$t_1
2264 $ADDU $t_2,$at
2265 $ADDU $c_2,$t_2
2266 sltu $at,$c_2,$t_2
2267 $ADDU $c_3,$at
2268 mflo $t_1
2269 mfhi $t_2
2270 slt $at,$t_2,$zero
2271 $ADDU $c_3,$at
2272 $MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1);
2273 $SLL $t_2,1
2274 slt $a2,$t_1,$zero
2275 $ADDU $t_2,$a2
2276 $SLL $t_1,1
2277 $ADDU $c_1,$t_1
2278 sltu $at,$c_1,$t_1
2279 $ADDU $t_2,$at
2280 $ADDU $c_2,$t_2
2281 sltu $at,$c_2,$t_2
2282 $ADDU $c_3,$at
2283 $ST $c_1,9*$BNSZ($a0)
2284
2285 mflo $t_1
2286 mfhi $t_2
2287 slt $c_1,$t_2,$zero
2288 $SLL $t_2,1
2289 $MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1);
2290 slt $a2,$t_1,$zero
2291 $ADDU $t_2,$a2
2292 $SLL $t_1,1
2293 $ADDU $c_2,$t_1
2294 sltu $at,$c_2,$t_1
2295 $ADDU $t_2,$at
2296 $ADDU $c_3,$t_2
2297 sltu $at,$c_3,$t_2
2298 $ADDU $c_1,$at
2299 mflo $t_1
2300 mfhi $t_2
2301 slt $at,$t_2,$zero
2302 $ADDU $c_1,$at
2303 $MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1);
2304 $SLL $t_2,1
2305 slt $a2,$t_1,$zero
2306 $ADDU $t_2,$a2
2307 $SLL $t_1,1
2308 $ADDU $c_2,$t_1
2309 sltu $at,$c_2,$t_1
2310 $ADDU $t_2,$at
2311 $ADDU $c_3,$t_2
2312 sltu $at,$c_3,$t_2
2313 $ADDU $c_1,$at
2314 mflo $t_1
2315 mfhi $t_2
2316 $ADDU $c_2,$t_1
2317 sltu $at,$c_2,$t_1
2318 $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2);
2319 $ADDU $t_2,$at
2320 $ADDU $c_3,$t_2
2321 sltu $at,$c_3,$t_2
2322 $ADDU $c_1,$at
2323 $ST $c_2,10*$BNSZ($a0)
2324
2325 mflo $t_1
2326 mfhi $t_2
2327 slt $c_2,$t_2,$zero
2328 $SLL $t_2,1
2329 $MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2);
2330 slt $a2,$t_1,$zero
2331 $ADDU $t_2,$a2
2332 $SLL $t_1,1
2333 $ADDU $c_3,$t_1
2334 sltu $at,$c_3,$t_1
2335 $ADDU $t_2,$at
2336 $ADDU $c_1,$t_2
2337 sltu $at,$c_1,$t_2
2338 $ADDU $c_2,$at
2339 mflo $t_1
2340 mfhi $t_2
2341 slt $at,$t_2,$zero
2342 $ADDU $c_2,$at
2343 $MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3);
2344 $SLL $t_2,1
2345 slt $a2,$t_1,$zero
2346 $ADDU $t_2,$a2
2347 $SLL $t_1,1
2348 $ADDU $c_3,$t_1
2349 sltu $at,$c_3,$t_1
2350 $ADDU $t_2,$at
2351 $ADDU $c_1,$t_2
2352 sltu $at,$c_1,$t_2
2353 $ADDU $c_2,$at
2354 $ST $c_3,11*$BNSZ($a0)
2355
2356 mflo $t_1
2357 mfhi $t_2
2358 slt $c_3,$t_2,$zero
2359 $SLL $t_2,1
2360 $MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3);
2361 slt $a2,$t_1,$zero
2362 $ADDU $t_2,$a2
2363 $SLL $t_1,1
2364 $ADDU $c_1,$t_1
2365 sltu $at,$c_1,$t_1
2366 $ADDU $t_2,$at
2367 $ADDU $c_2,$t_2
2368 sltu $at,$c_2,$t_2
2369 $ADDU $c_3,$at
2370 mflo $t_1
2371 mfhi $t_2
2372 $ADDU $c_1,$t_1
2373 sltu $at,$c_1,$t_1
2374 $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1);
2375 $ADDU $t_2,$at
2376 $ADDU $c_2,$t_2
2377 sltu $at,$c_2,$t_2
2378 $ADDU $c_3,$at
2379 $ST $c_1,12*$BNSZ($a0)
2380
2381 mflo $t_1
2382 mfhi $t_2
2383 slt $c_1,$t_2,$zero
2384 $SLL $t_2,1
2385 $MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2);
2386 slt $a2,$t_1,$zero
2387 $ADDU $t_2,$a2
2388 $SLL $t_1,1
2389 $ADDU $c_2,$t_1
2390 sltu $at,$c_2,$t_1
2391 $ADDU $t_2,$at
2392 $ADDU $c_3,$t_2
2393 sltu $at,$c_3,$t_2
2394 $ADDU $c_1,$at
2395 $ST $c_2,13*$BNSZ($a0)
2396
2397 mflo $t_1
2398 mfhi $t_2
2399 $ADDU $c_3,$t_1
2400 sltu $at,$c_3,$t_1
2401 $ADDU $t_2,$at
2402 $ADDU $c_1,$t_2
2403 $ST $c_3,14*$BNSZ($a0)
2404 $ST $c_1,15*$BNSZ($a0)
2405
2406 .set noreorder
2407___
# NUBI ABI only: restore the callee-saved temporaries and $gp saved in the
# prologue, then release the 6-register stack frame.  The heredoc text below
# is emitted verbatim into the generated assembly — do not reformat it.
2408$code.=<<___ if ($flavour =~ /nubi/i);
2409 $REG_L $t3,4*$SZREG($sp)
2410 $REG_L $t2,3*$SZREG($sp)
2411 $REG_L $t1,2*$SZREG($sp)
2412 $REG_L $t0,1*$SZREG($sp)
2413 $REG_L $gp,0*$SZREG($sp)
2414 $PTR_ADD $sp,6*$SZREG
2415___
# Return from bn_sqr_comba8 (nop fills the branch-delay slot), then open the
# entry point for bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a).
2416$code.=<<___;
2417 jr $ra
2418 nop
2419.end bn_sqr_comba8
2420
2421.align 5
2422.globl bn_sqr_comba4
2423.ent bn_sqr_comba4
2424bn_sqr_comba4:
2425___
# NUBI ABI only: bn_sqr_comba4 prologue.  Reserves a 6-register frame and
# saves $ra plus the caller's $t0-$t3 and $gp (mask 0x8000f008 covers exactly
# those registers).  Other ABIs need no frame here.
2426$code.=<<___ if ($flavour =~ /nubi/i);
2427 .frame $sp,6*$SZREG,$ra
2428 .mask 0x8000f008,-$SZREG
2429 .set noreorder
2430 $PTR_SUB $sp,6*$SZREG
2431 $REG_S $ra,5*$SZREG($sp)
2432 $REG_S $t3,4*$SZREG($sp)
2433 $REG_S $t2,3*$SZREG($sp)
2434 $REG_S $t1,2*$SZREG($sp)
2435 $REG_S $t0,1*$SZREG($sp)
2436 $REG_S $gp,0*$SZREG($sp)
2437___
# Body of bn_sqr_comba4: 4x4-word comba squaring.  a0 = r (result, 8 words),
# a1 = a (input, 4 words).  Off-diagonal products a[i]*a[j] (i!=j) occur twice
# in the square, so the mul_add_c2 steps double the 2-word product with the
# slt/$SLL sequence (slt captures the bit shifted out of the high word;
# sltu propagates carries between result columns c_1/c_2/c_3).
# Diagonal products a[i]*a[i] (mul_add_c) are added once, undoubled.
# Fix: the comment on the $MULTU $a_1,$a_2 step previously read
# "mul_add_c(a2[1],b[2],...)" — garbled, and wrong about the operation: the
# doubling sequence that follows makes this a mul_add_c2 step.
2438$code.=<<___;
2439 .set reorder
2440 $LD $a_0,0($a1)
2441 $LD $a_1,$BNSZ($a1)
2442 $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
2443 $LD $a_2,2*$BNSZ($a1)
2444 $LD $a_3,3*$BNSZ($a1)
2445 mflo $c_1
2446 mfhi $c_2
2447 $ST $c_1,0($a0)
2448
2449 $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
2450 mflo $t_1
2451 mfhi $t_2
2452 slt $c_1,$t_2,$zero
2453 $SLL $t_2,1
2454 $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
2455 slt $a2,$t_1,$zero
2456 $ADDU $t_2,$a2
2457 $SLL $t_1,1
2458 $ADDU $c_2,$t_1
2459 sltu $at,$c_2,$t_1
2460 $ADDU $c_3,$t_2,$at
2461 $ST $c_2,$BNSZ($a0)
2462
2463 mflo $t_1
2464 mfhi $t_2
2465 slt $c_2,$t_2,$zero
2466 $SLL $t_2,1
2467 $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2);
2468 slt $a2,$t_1,$zero
2469 $ADDU $t_2,$a2
2470 $SLL $t_1,1
2471 $ADDU $c_3,$t_1
2472 sltu $at,$c_3,$t_1
2473 $ADDU $t_2,$at
2474 $ADDU $c_1,$t_2
2475 sltu $at,$c_1,$t_2
2476 $ADDU $c_2,$at
2477 mflo $t_1
2478 mfhi $t_2
2479 $ADDU $c_3,$t_1
2480 sltu $at,$c_3,$t_1
2481 $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
2482 $ADDU $t_2,$at
2483 $ADDU $c_1,$t_2
2484 sltu $at,$c_1,$t_2
2485 $ADDU $c_2,$at
2486 $ST $c_3,2*$BNSZ($a0)
2487
2488 mflo $t_1
2489 mfhi $t_2
2490 slt $c_3,$t_2,$zero
2491 $SLL $t_2,1
2492 $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3);
2493 slt $a2,$t_1,$zero
2494 $ADDU $t_2,$a2
2495 $SLL $t_1,1
2496 $ADDU $c_1,$t_1
2497 sltu $at,$c_1,$t_1
2498 $ADDU $t_2,$at
2499 $ADDU $c_2,$t_2
2500 sltu $at,$c_2,$t_2
2501 $ADDU $c_3,$at
2502 mflo $t_1
2503 mfhi $t_2
2504 slt $at,$t_2,$zero
2505 $ADDU $c_3,$at
2506 $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1);
2507 $SLL $t_2,1
2508 slt $a2,$t_1,$zero
2509 $ADDU $t_2,$a2
2510 $SLL $t_1,1
2511 $ADDU $c_1,$t_1
2512 sltu $at,$c_1,$t_1
2513 $ADDU $t_2,$at
2514 $ADDU $c_2,$t_2
2515 sltu $at,$c_2,$t_2
2516 $ADDU $c_3,$at
2517 $ST $c_1,3*$BNSZ($a0)
2518
2519 mflo $t_1
2520 mfhi $t_2
2521 slt $c_1,$t_2,$zero
2522 $SLL $t_2,1
2523 $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1);
2524 slt $a2,$t_1,$zero
2525 $ADDU $t_2,$a2
2526 $SLL $t_1,1
2527 $ADDU $c_2,$t_1
2528 sltu $at,$c_2,$t_1
2529 $ADDU $t_2,$at
2530 $ADDU $c_3,$t_2
2531 sltu $at,$c_3,$t_2
2532 $ADDU $c_1,$at
2533 mflo $t_1
2534 mfhi $t_2
2535 $ADDU $c_2,$t_1
2536 sltu $at,$c_2,$t_1
2537 $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2);
2538 $ADDU $t_2,$at
2539 $ADDU $c_3,$t_2
2540 sltu $at,$c_3,$t_2
2541 $ADDU $c_1,$at
2542 $ST $c_2,4*$BNSZ($a0)
2543
2544 mflo $t_1
2545 mfhi $t_2
2546 slt $c_2,$t_2,$zero
2547 $SLL $t_2,1
2548 $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3);
2549 slt $a2,$t_1,$zero
2550 $ADDU $t_2,$a2
2551 $SLL $t_1,1
2552 $ADDU $c_3,$t_1
2553 sltu $at,$c_3,$t_1
2554 $ADDU $t_2,$at
2555 $ADDU $c_1,$t_2
2556 sltu $at,$c_1,$t_2
2557 $ADDU $c_2,$at
2558 $ST $c_3,5*$BNSZ($a0)
2559
2560 mflo $t_1
2561 mfhi $t_2
2562 $ADDU $c_1,$t_1
2563 sltu $at,$c_1,$t_1
2564 $ADDU $t_2,$at
2565 $ADDU $c_2,$t_2
2566 $ST $c_1,6*$BNSZ($a0)
2567 $ST $c_2,7*$BNSZ($a0)
2568
2569 .set noreorder
2570___
# NUBI ABI only: restore the registers saved by the bn_sqr_comba4 prologue
# and release the stack frame.
2571$code.=<<___ if ($flavour =~ /nubi/i);
2572 $REG_L $t3,4*$SZREG($sp)
2573 $REG_L $t2,3*$SZREG($sp)
2574 $REG_L $t1,2*$SZREG($sp)
2575 $REG_L $t0,1*$SZREG($sp)
2576 $REG_L $gp,0*$SZREG($sp)
2577 $PTR_ADD $sp,6*$SZREG
2578___
# Return from bn_sqr_comba4 and close the function.
2579$code.=<<___;
2580 jr $ra
2581 nop
2582.end bn_sqr_comba4
2583___
# Emit the accumulated assembly; output goes to stdout (possibly a pipe).
2584print $code;
2585close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl b/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl
deleted file mode 100644
index 54aeb01921..0000000000
--- a/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl
+++ /dev/null
@@ -1,1496 +0,0 @@
1#!/usr/bin/env perl
2#
3# Copyright (c) 2010-2011 Intel Corp.
4# Author: Vinodh.Gopal@intel.com
5# Jim Guilford
6# Erdinc.Ozturk@intel.com
7# Maxim.Perminov@intel.com
8#
9# More information about algorithm used can be found at:
10# http://www.cse.buffalo.edu/srds2009/escs2009_submission_Gopal.pdf
11#
12# ====================================================================
13# Copyright (c) 2011 The OpenSSL Project. All rights reserved.
14#
15# Redistribution and use in source and binary forms, with or without
16# modification, are permitted provided that the following conditions
17# are met:
18#
19# 1. Redistributions of source code must retain the above copyright
20# notice, this list of conditions and the following disclaimer.
21#
22# 2. Redistributions in binary form must reproduce the above copyright
23# notice, this list of conditions and the following disclaimer in
24# the documentation and/or other materials provided with the
25# distribution.
26#
27# 3. All advertising materials mentioning features or use of this
28# software must display the following acknowledgment:
29# "This product includes software developed by the OpenSSL Project
30# for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
31#
32# 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
33# endorse or promote products derived from this software without
34# prior written permission. For written permission, please contact
35# licensing@OpenSSL.org.
36#
37# 5. Products derived from this software may not be called "OpenSSL"
38# nor may "OpenSSL" appear in their names without prior written
39# permission of the OpenSSL Project.
40#
41# 6. Redistributions of any form whatsoever must retain the following
42# acknowledgment:
43# "This product includes software developed by the OpenSSL Project
44# for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
45#
46# THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
47# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
49# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
50# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
51# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
52# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
55# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
56# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
57# OF THE POSSIBILITY OF SUCH DAMAGE.
58# ====================================================================
59
# Command-line handling: first arg is the perlasm flavour (elf/macosx/...),
# second the output file; a single dotted arg is treated as the output name.
60$flavour = shift;
61$output = shift;
62if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
63
64my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
65
# Locate the x86_64-xlate.pl translator next to this script or in perlasm/.
66$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
67( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
68( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
69die "can't locate x86_64-xlate.pl";
70
# All generated code printed to STDOUT is piped through the translator.
71open STDOUT,"| $^X $xlate $flavour $output";
72
# NOTE(review): `use strict` follows the global assignments above, so
# $flavour/$output/$dir/$xlate stay package globals — matches upstream.
73use strict;
74my $code=".text\n\n";
75my $m=0;
76
77#
78# Define x512 macros
79#
80
81#MULSTEP_512_ADD MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src1, src2, add_src, tmp1, tmp2
82#
83# uses rax, rdx, and args
# Emit one 512-bit multiply-accumulate step with an extra memory addend:
# {rdx,X[7..0]} = X + $OP * mem512($SRC2) + mem512($ASRC); the low word is
# stored to $DST and the final carry (rdx) rotates into X[0].  Clobbers
# rax/rdx and $TMP.  Appends text to $code only; emitted assembly must stay
# byte-exact, so the heredocs are untouched.
84sub MULSTEP_512_ADD
85{
86 my ($x, $DST, $SRC2, $ASRC, $OP, $TMP)=@_;
87 my @X=@$x; # make a copy
88$code.=<<___;
89 mov (+8*0)($SRC2), %rax
90 mul $OP # rdx:rax = %OP * [0]
91 mov ($ASRC), $X[0]
92 add %rax, $X[0]
93 adc \$0, %rdx
94 mov $X[0], $DST
95___
# Words 1..7: each step folds in the previous step's high half via $TMP.
96for(my $i=1;$i<8;$i++) {
97$code.=<<___;
98 mov %rdx, $TMP
99
100 mov (+8*$i)($SRC2), %rax
101 mul $OP # rdx:rax = %OP * [$i]
102 mov (+8*$i)($ASRC), $X[$i]
103 add %rax, $X[$i]
104 adc \$0, %rdx
105 add $TMP, $X[$i]
106 adc \$0, %rdx
107___
108}
109$code.=<<___;
110 mov %rdx, $X[0]
111___
112}
113
114#MULSTEP_512 MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src2, src1_val, tmp
115#
116# uses rax, rdx, and args
# Like MULSTEP_512_ADD but without the extra memory addend:
# {rdx,X[7..0]} = X + $OP * mem512($SRC2); low word stored to $DST, final
# carry rotates into X[0].  Clobbers rax/rdx and $TMP.  Appends to $code.
117sub MULSTEP_512
118{
119 my ($x, $DST, $SRC2, $OP, $TMP)=@_;
120 my @X=@$x; # make a copy
121$code.=<<___;
122 mov (+8*0)($SRC2), %rax
123 mul $OP # rdx:rax = %OP * [0]
124 add %rax, $X[0]
125 adc \$0, %rdx
126 mov $X[0], $DST
127___
# Words 1..7: carry of the previous limb lives in $TMP between multiplies.
128for(my $i=1;$i<8;$i++) {
129$code.=<<___;
130 mov %rdx, $TMP
131
132 mov (+8*$i)($SRC2), %rax
133 mul $OP # rdx:rax = %OP * [$i]
134 add %rax, $X[$i]
135 adc \$0, %rdx
136 add $TMP, $X[$i]
137 adc \$0, %rdx
138___
139}
140$code.=<<___;
141 mov %rdx, $X[0]
142___
143}
144
145#
146# Swizzle Macros
147#
148
149# macro to copy data from flat space to swizzled table
150#MACRO swizzle pDst, pSrc, tmp1, tmp2
151# pDst and pSrc are modified
# Emit a loop copying 8 qwords from flat memory ($pSrc) into the swizzled
# table layout: each 64-bit word is split into four 16-bit chunks stored 64
# bytes apart ($pDst, +64, +128, +192).  Both pointers are advanced.  $m
# uniquifies the emitted loop label per expansion.
152sub swizzle
153{
154 my ($pDst, $pSrc, $cnt, $d0)=@_;
155$code.=<<___;
156 mov \$8, $cnt
157loop_$m:
158 mov ($pSrc), $d0
159 mov $d0#w, ($pDst)
160 shr \$16, $d0
161 mov $d0#w, (+64*1)($pDst)
162 shr \$16, $d0
163 mov $d0#w, (+64*2)($pDst)
164 shr \$16, $d0
165 mov $d0#w, (+64*3)($pDst)
166 lea 8($pSrc), $pSrc
167 lea 64*4($pDst), $pDst
168 dec $cnt
169 jnz loop_$m
170___
171
# Bump the label counter so the next expansion gets a fresh loop label.
172 $m++;
173}
174
175# macro to copy data from swizzled table to flat space
176#MACRO unswizzle pDst, pSrc, tmp*3
# Inverse of swizzle: emit a loop reassembling qwords from the 16-bit-
# striped table back into flat memory, two qwords per iteration (hence 4
# iterations for 8 qwords).  Words are rebuilt high-to-low: load word 3
# zero-extended, then shift left and merge words 2, 1, 0 into the low half.
177sub unswizzle
178{
179 my ($pDst, $pSrc, $cnt, $d0, $d1)=@_;
180$code.=<<___;
181 mov \$4, $cnt
182loop_$m:
183 movzxw (+64*3+256*0)($pSrc), $d0
184 movzxw (+64*3+256*1)($pSrc), $d1
185 shl \$16, $d0
186 shl \$16, $d1
187 mov (+64*2+256*0)($pSrc), $d0#w
188 mov (+64*2+256*1)($pSrc), $d1#w
189 shl \$16, $d0
190 shl \$16, $d1
191 mov (+64*1+256*0)($pSrc), $d0#w
192 mov (+64*1+256*1)($pSrc), $d1#w
193 shl \$16, $d0
194 shl \$16, $d1
195 mov (+64*0+256*0)($pSrc), $d0#w
196 mov (+64*0+256*1)($pSrc), $d1#w
197 mov $d0, (+8*0)($pDst)
198 mov $d1, (+8*1)($pDst)
199 lea 256*2($pSrc), $pSrc
200 lea 8*2($pDst), $pDst
201 sub \$1, $cnt
202 jnz loop_$m
203___
204
# Fresh loop label for the next expansion.
205 $m++;
206}
207
208#
209# Data Structures
210#
211
212# Reduce Data
213#
214#
215# Offset Value
216# 0C0 Carries
217# 0B8 X2[10]
218# 0B0 X2[9]
219# 0A8 X2[8]
220# 0A0 X2[7]
221# 098 X2[6]
222# 090 X2[5]
223# 088 X2[4]
224# 080 X2[3]
225# 078 X2[2]
226# 070 X2[1]
227# 068 X2[0]
228# 060 X1[12] P[10]
229# 058 X1[11] P[9] Z[8]
230# 050 X1[10] P[8] Z[7]
231# 048 X1[9] P[7] Z[6]
232# 040 X1[8] P[6] Z[5]
233# 038 X1[7] P[5] Z[4]
234# 030 X1[6] P[4] Z[3]
235# 028 X1[5] P[3] Z[2]
236# 020 X1[4] P[2] Z[1]
237# 018 X1[3] P[1] Z[0]
238# 010 X1[2] P[0] Y[2]
239# 008 X1[1] Q[1] Y[1]
240# 000 X1[0] Q[0] Y[0]
241
# Byte offsets within the Reduce Data scratch area (layout table above).
# X1/X2 overlap the Q/P and Y/Z views of the same memory.
242my $X1_offset = 0; # 13 qwords
243my $X2_offset = $X1_offset + 13*8; # 11 qwords
244my $Carries_offset = $X2_offset + 11*8; # 1 qword
245my $Q_offset = 0; # 2 qwords
246my $P_offset = $Q_offset + 2*8; # 11 qwords
247my $Y_offset = 0; # 3 qwords
248my $Z_offset = $Y_offset + 3*8; # 9 qwords
249
# Total size of the reduce scratch region: 25 qwords.
250my $Red_Data_Size = $Carries_offset + 1*8; # (25 qwords)
251
252#
253# Stack Frame
254#
255#
256# offset value
257# ... <old stack contents>
258# ...
259# 280 Garray
260
261# 278 tmp16[15]
262# ... ...
263# 200 tmp16[0]
264
265# 1F8 tmp[7]
266# ... ...
267# 1C0 tmp[0]
268
269# 1B8 GT[7]
270# ... ...
271# 180 GT[0]
272
273# 178 Reduce Data
274# ... ...
275# 0B8 Reduce Data
276# 0B0 reserved
277# 0A8 reserved
278# 0A0 reserved
279# 098 reserved
280# 090 reserved
281# 088 reduce result addr
282# 080 exp[8]
283
284# ...
285# 048 exp[1]
286# 040 exp[0]
287
288# 038 reserved
289# 030 loop_idx
290# 028 pg
291# 020 i
292# 018 pData ; arg 4
293# 010 pG ; arg 2
294# 008 pResult ; arg 1
295# 000 rsp ; stack pointer before subtract
296
# Stack-frame byte offsets (see the frame diagram above).  Each constant is
# defined relative to the previous one so the layout can be changed by
# editing a single size.  $mem_size is the total frame allocated in
# mod_exp_512 (garray holds 32 swizzled table entries).
297my $rsp_offset = 0;
298my $pResult_offset = 8*1 + $rsp_offset;
299my $pG_offset = 8*1 + $pResult_offset;
300my $pData_offset = 8*1 + $pG_offset;
301my $i_offset = 8*1 + $pData_offset;
302my $pg_offset = 8*1 + $i_offset;
303my $loop_idx_offset = 8*1 + $pg_offset;
304my $reserved1_offset = 8*1 + $loop_idx_offset;
305my $exp_offset = 8*1 + $reserved1_offset;
306my $red_result_addr_offset= 8*9 + $exp_offset;
307my $reserved2_offset = 8*1 + $red_result_addr_offset;
308my $Reduce_Data_offset = 8*5 + $reserved2_offset;
309my $GT_offset = $Red_Data_Size + $Reduce_Data_offset;
310my $tmp_offset = 8*8 + $GT_offset;
311my $tmp16_offset = 8*8 + $tmp_offset;
312my $garray_offset = 8*16 + $tmp16_offset;
313my $mem_size = 8*8*32 + $garray_offset;
314
315#
316# Offsets within Reduce Data
317#
318#
319# struct MODF_2FOLD_MONT_512_C1_DATA {
320# UINT64 t[8][8];
321# UINT64 m[8];
322# UINT64 m1[8]; /* 2^768 % m */
323# UINT64 m2[8]; /* 2^640 % m */
324# UINT64 k1[2]; /* (- 1/m) % 2^128 */
325# };
326
# Byte offsets into MODF_2FOLD_MONT_512_C1_DATA (struct layout above):
# T (overflow-correction table), modulus m, precomputed m1 = 2^768 % m,
# m2 = 2^640 % m, and k1 = (-1/m) mod 2^128.
327my $T = 0;
328my $M = 512; # = 8 * 8 * 8
329my $M1 = 576; # = 8 * 8 * 9 /* += 8 * 8 */
330my $M2 = 640; # = 8 * 8 * 10 /* += 8 * 8 */
331my $K1 = 704; # = 8 * 8 * 11 /* += 8 * 8 */
332
333#
334# FUNCTIONS
335#
336
337{{{
338#
339# MULADD_128x512 : Function to multiply 128-bits (2 qwords) by 512-bits (8 qwords)
340# and add 512-bits (8 qwords)
341# to get 640 bits (10 qwords)
342# Input: 128-bit mul source: [rdi+8*1], rbp
343# 512-bit mul source: [rsi+8*n]
344# 512-bit add source: r15, r14, ..., r9, r8
345# Output: r9, r8, r15, r14, r13, r12, r11, r10, [rcx+8*1], [rcx+8*0]
346# Clobbers all regs except: rcx, rsi, rdi
347$code.=<<___;
348.type MULADD_128x512,\@abi-omnipotent
349.align 16
350MULADD_128x512:
351___
# Two 512-bit MAC steps; the second rotates the register ring (9..15,8) so
# the carry word lands in r8, matching the output ordering documented above.
352 &MULSTEP_512([map("%r$_",(8..15))], "(+8*0)(%rcx)", "%rsi", "%rbp", "%rbx");
353$code.=<<___;
354 mov (+8*1)(%rdi), %rbp
355___
356 &MULSTEP_512([map("%r$_",(9..15,8))], "(+8*1)(%rcx)", "%rsi", "%rbp", "%rbx");
357$code.=<<___;
358 ret
359.size MULADD_128x512,.-MULADD_128x512
360___
361}}}
362
363{{{
364#MULADD_256x512 MACRO pDst, pA, pB, OP, TMP, X7, X6, X5, X4, X3, X2, X1, X0
365#
366# Inputs: pDst: Destination (768 bits, 12 qwords)
367# pA: Multiplicand (1024 bits, 16 qwords)
368# pB: Multiplicand (512 bits, 8 qwords)
369# Dst = Ah * B + Al
370# where Ah is (in qwords) A[15:12] (256 bits) and Al is A[7:0] (512 bits)
371# Results in X3 X2 X1 X0 X7 X6 X5 X4 Dst[3:0]
372# Uses registers: arguments, RAX, RDX
# Emit Dst = Ah*B + Al for the first fold of the reduction: four 512-bit MAC
# rows driven by A[12..15], the first adding Al via MULSTEP_512_ADD.  The X
# register ring (@$X) is rotated once per row by the caller-visible
# push/shift, i.e. @$X ends up rotated by 4 on return.
373sub MULADD_256x512
374{
375 my ($pDst, $pA, $pB, $OP, $TMP, $X)=@_;
376$code.=<<___;
377 mov (+8*12)($pA), $OP
378___
379 &MULSTEP_512_ADD($X, "(+8*0)($pDst)", $pB, $pA, $OP, $TMP);
380 push(@$X,shift(@$X));
381
382$code.=<<___;
383 mov (+8*13)($pA), $OP
384___
385 &MULSTEP_512($X, "(+8*1)($pDst)", $pB, $OP, $TMP);
386 push(@$X,shift(@$X));
387
388$code.=<<___;
389 mov (+8*14)($pA), $OP
390___
391 &MULSTEP_512($X, "(+8*2)($pDst)", $pB, $OP, $TMP);
392 push(@$X,shift(@$X));
393
394$code.=<<___;
395 mov (+8*15)($pA), $OP
396___
397 &MULSTEP_512($X, "(+8*3)($pDst)", $pB, $OP, $TMP);
398 push(@$X,shift(@$X));
399}
400
401#
402# mont_reduce(UINT64 *x, /* 1024 bits, 16 qwords */
403# UINT64 *m, /* 512 bits, 8 qwords */
404# MODF_2FOLD_MONT_512_C1_DATA *data,
405# UINT64 *r) /* 512 bits, 8 qwords */
406# Input: x (number to be reduced): tmp16 (Implicit)
407# m (modulus): [pM] (Implicit)
408# data (reduce data): [pData] (Implicit)
409# Output: r (result): Address in [red_res_addr]
410# result also in: r9, r8, r15, r14, r13, r12, r11, r10
411
412my @X=map("%r$_",(8..15));
413
414$code.=<<___;
415.type mont_reduce,\@abi-omnipotent
416.align 16
417mont_reduce:
418___
419
420my $STACK_DEPTH = 8;
421 #
422 # X1 = Xh * M1 + Xl
423$code.=<<___;
424 lea (+$Reduce_Data_offset+$X1_offset+$STACK_DEPTH)(%rsp), %rdi # pX1 (Dst) 769 bits, 13 qwords
425 mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rsi # pM1 (Bsrc) 512 bits, 8 qwords
426 add \$$M1, %rsi
427 lea (+$tmp16_offset+$STACK_DEPTH)(%rsp), %rcx # X (Asrc) 1024 bits, 16 qwords
428
429___
430
431 &MULADD_256x512("%rdi", "%rcx", "%rsi", "%rbp", "%rbx", \@X); # rotates @X 4 times
432 # results in r11, r10, r9, r8, r15, r14, r13, r12, X1[3:0]
433
434$code.=<<___;
435 xor %rax, %rax
436 # X1 += xl
437 add (+8*8)(%rcx), $X[4]
438 adc (+8*9)(%rcx), $X[5]
439 adc (+8*10)(%rcx), $X[6]
440 adc (+8*11)(%rcx), $X[7]
441 adc \$0, %rax
442 # X1 is now rax, r11-r8, r15-r12, tmp16[3:0]
443
444 #
445 # check for carry ;; carry stored in rax
446 mov $X[4], (+8*8)(%rdi) # rdi points to X1
447 mov $X[5], (+8*9)(%rdi)
448 mov $X[6], %rbp
449 mov $X[7], (+8*11)(%rdi)
450
451 mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
452
453 mov (+8*0)(%rdi), $X[4]
454 mov (+8*1)(%rdi), $X[5]
455 mov (+8*2)(%rdi), $X[6]
456 mov (+8*3)(%rdi), $X[7]
457
458 # X1 is now stored in: X1[11], rbp, X1[9:8], r15-r8
459 # rdi -> X1
460 # rsi -> M1
461
462 #
463 # X2 = Xh * M2 + Xl
464 # do first part (X2 = Xh * M2)
465 add \$8*10, %rdi # rdi -> pXh ; 128 bits, 2 qwords
466 # Xh is actually { [rdi+8*1], rbp }
467 add \$`$M2-$M1`, %rsi # rsi -> M2
468 lea (+$Reduce_Data_offset+$X2_offset+$STACK_DEPTH)(%rsp), %rcx # rcx -> pX2 ; 641 bits, 11 qwords
469___
470 unshift(@X,pop(@X)); unshift(@X,pop(@X));
471$code.=<<___;
472
473 call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8
474 # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
475 mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rax
476
477 # X2 += Xl
478 add (+8*8-8*10)(%rdi), $X[6] # (-8*10) is to adjust rdi -> Xh to Xl
479 adc (+8*9-8*10)(%rdi), $X[7]
480 mov $X[6], (+8*8)(%rcx)
481 mov $X[7], (+8*9)(%rcx)
482
483 adc %rax, %rax
484 mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
485
486 lea (+$Reduce_Data_offset+$Q_offset+$STACK_DEPTH)(%rsp), %rdi # rdi -> pQ ; 128 bits, 2 qwords
487 add \$`$K1-$M2`, %rsi # rsi -> pK1 ; 128 bits, 2 qwords
488
489 # MUL_128x128t128 rdi, rcx, rsi ; Q = X2 * K1 (bottom half)
490 # B1:B0 = rsi[1:0] = K1[1:0]
491 # A1:A0 = rcx[1:0] = X2[1:0]
492 # Result = rdi[1],rbp = Q[1],rbp
493 mov (%rsi), %r8 # B0
494 mov (+8*1)(%rsi), %rbx # B1
495
496 mov (%rcx), %rax # A0
497 mul %r8 # B0
498 mov %rax, %rbp
499 mov %rdx, %r9
500
501 mov (+8*1)(%rcx), %rax # A1
502 mul %r8 # B0
503 add %rax, %r9
504
505 mov (%rcx), %rax # A0
506 mul %rbx # B1
507 add %rax, %r9
508
509 mov %r9, (+8*1)(%rdi)
510 # end MUL_128x128t128
511
512 sub \$`$K1-$M`, %rsi
513
514 mov (%rcx), $X[6]
515 mov (+8*1)(%rcx), $X[7] # r9:r8 = X2[1:0]
516
517 call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8
518 # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
519
520 # load first half of m to rdx, rdi, rbx, rax
521 # moved this here for efficiency
522 mov (+8*0)(%rsi), %rax
523 mov (+8*1)(%rsi), %rbx
524 mov (+8*2)(%rsi), %rdi
525 mov (+8*3)(%rsi), %rdx
526
527 # continue with reduction
528 mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rbp
529
530 add (+8*8)(%rcx), $X[6]
531 adc (+8*9)(%rcx), $X[7]
532
533 #accumulate the final carry to rbp
534 adc %rbp, %rbp
535
536 # Add in overflow corrections: R = (X2>>128) += T[overflow]
537 # R = {r9, r8, r15, r14, ..., r10}
538 shl \$3, %rbp
539 mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rcx # rsi -> Data (and points to T)
540 add %rcx, %rbp # pT ; 512 bits, 8 qwords, spread out
541
542 # rsi will be used to generate a mask after the addition
543 xor %rsi, %rsi
544
545 add (+8*8*0)(%rbp), $X[0]
546 adc (+8*8*1)(%rbp), $X[1]
547 adc (+8*8*2)(%rbp), $X[2]
548 adc (+8*8*3)(%rbp), $X[3]
549 adc (+8*8*4)(%rbp), $X[4]
550 adc (+8*8*5)(%rbp), $X[5]
551 adc (+8*8*6)(%rbp), $X[6]
552 adc (+8*8*7)(%rbp), $X[7]
553
554 # if there is a carry: rsi = 0xFFFFFFFFFFFFFFFF
555 # if carry is clear: rsi = 0x0000000000000000
556 sbb \$0, %rsi
557
558 # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
559 and %rsi, %rax
560 and %rsi, %rbx
561 and %rsi, %rdi
562 and %rsi, %rdx
563
564 mov \$1, %rbp
565 sub %rax, $X[0]
566 sbb %rbx, $X[1]
567 sbb %rdi, $X[2]
568 sbb %rdx, $X[3]
569
570 # if there is a borrow: rbp = 0
571 # if there is no borrow: rbp = 1
572 # this is used to save the borrows in between the first half and the 2nd half of the subtraction of m
573 sbb \$0, %rbp
574
575 #load second half of m to rdx, rdi, rbx, rax
576
577 add \$$M, %rcx
578 mov (+8*4)(%rcx), %rax
579 mov (+8*5)(%rcx), %rbx
580 mov (+8*6)(%rcx), %rdi
581 mov (+8*7)(%rcx), %rdx
582
583 # use the rsi mask as before
584 # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
585 and %rsi, %rax
586 and %rsi, %rbx
587 and %rsi, %rdi
588 and %rsi, %rdx
589
590 # if rbp = 0, there was a borrow before, it is moved to the carry flag
591 # if rbp = 1, there was not a borrow before, carry flag is cleared
592 sub \$1, %rbp
593
594 sbb %rax, $X[4]
595 sbb %rbx, $X[5]
596 sbb %rdi, $X[6]
597 sbb %rdx, $X[7]
598
599 # write R back to memory
600
601 mov (+$red_result_addr_offset+$STACK_DEPTH)(%rsp), %rsi
602 mov $X[0], (+8*0)(%rsi)
603 mov $X[1], (+8*1)(%rsi)
604 mov $X[2], (+8*2)(%rsi)
605 mov $X[3], (+8*3)(%rsi)
606 mov $X[4], (+8*4)(%rsi)
607 mov $X[5], (+8*5)(%rsi)
608 mov $X[6], (+8*6)(%rsi)
609 mov $X[7], (+8*7)(%rsi)
610
611 ret
612.size mont_reduce,.-mont_reduce
613___
614}}}
615
616{{{
617#MUL_512x512 MACRO pDst, pA, pB, x7, x6, x5, x4, x3, x2, x1, x0, tmp*2
618#
619# Inputs: pDst: Destination (1024 bits, 16 qwords)
620# pA: Multiplicand (512 bits, 8 qwords)
621# pB: Multiplicand (512 bits, 8 qwords)
622# Uses registers rax, rdx, args
623# B operand in [pB] and also in x7...x0
# Emits a full 512x512 -> 1024-bit schoolbook multiply.  The B operand is
# assumed pre-loaded in the X register ring as well as in memory at $pB.
624sub MUL_512x512
625{
626 my ($pDst, $pA, $pB, $x, $OP, $TMP, $pDst_o)=@_;
# Split "base+offset" destination strings; NOTE(review): this `my` re-binds
# $pDst inside the same scope (warns under strict, works — matches upstream).
627 my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
628 my @X=@$x; # make a copy
629
630$code.=<<___;
631 mov (+8*0)($pA), $OP
632
633 mov $X[0], %rax
634 mul $OP # rdx:rax = %OP * [0]
635 mov %rax, (+$pDst_o+8*0)($pDst)
636 mov %rdx, $X[0]
___ is not emitted here; first row uses the in-register B copy directly.
637___
638for(my $i=1;$i<8;$i++) {
639$code.=<<___;
640 mov $X[$i], %rax
641 mul $OP # rdx:rax = %OP * [$i]
642 add %rax, $X[$i-1]
643 adc \$0, %rdx
644 mov %rdx, $X[$i]
645___
646}
647
# Rows 1..7 read B from memory and rotate the register ring each row.
648for(my $i=1;$i<8;$i++) {
649$code.=<<___;
650 mov (+8*$i)($pA), $OP
651___
652
653 &MULSTEP_512(\@X, "(+$pDst_o+8*$i)($pDst)", $pB, $OP, $TMP);
654 push(@X,shift(@X));
655}
656
657$code.=<<___;
658 mov $X[0], (+$pDst_o+8*8)($pDst)
659 mov $X[1], (+$pDst_o+8*9)($pDst)
660 mov $X[2], (+$pDst_o+8*10)($pDst)
661 mov $X[3], (+$pDst_o+8*11)($pDst)
662 mov $X[4], (+$pDst_o+8*12)($pDst)
663 mov $X[5], (+$pDst_o+8*13)($pDst)
664 mov $X[6], (+$pDst_o+8*14)($pDst)
665 mov $X[7], (+$pDst_o+8*15)($pDst)
666___
667}
668
669#
670# mont_mul_a3b : subroutine to compute (Src1 * Src2) % M (all 512-bits)
671# Input: src1: Address of source 1: rdi
672# src2: Address of source 2: rsi
673# Output: dst: Address of destination: [red_res_addr]
674# src2 and result also in: r9, r8, r15, r14, r13, r12, r11, r10
675# Temp: Clobbers [tmp16], all registers
676$code.=<<___;
677.type mont_mul_a3b,\@abi-omnipotent
678.align 16
679mont_mul_a3b:
680 #
681 # multiply tmp = src1 * src2
682 # For multiply: dst = rcx, src1 = rdi, src2 = rsi
683 # stack depth is extra 8 from call
684___
685 &MUL_512x512("%rsp+$tmp16_offset+8", "%rdi", "%rsi", [map("%r$_",(10..15,8..9))], "%rbp", "%rbx");
686$code.=<<___;
687 #
688 # Dst = tmp % m
689 # Call reduce(tmp, m, data, dst)
690
691 # tail recursion optimization: jmp to mont_reduce and return from there
692 jmp mont_reduce
693 # call mont_reduce
694 # ret
695.size mont_mul_a3b,.-mont_mul_a3b
696___
697}}}
698
699{{{
700#SQR_512 MACRO pDest, pA, x7, x6, x5, x4, x3, x2, x1, x0, tmp*4
701#
702# Input in memory [pA] and also in x7...x0
703# Uses all argument registers plus rax and rdx
704#
705# This version computes all of the off-diagonal terms into memory,
706# and then it adds in the diagonal terms
707
708sub SQR_512
709{
710 my ($pDst, $pA, $x, $A, $tmp, $x7, $x6, $pDst_o)=@_;
711 my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
712 my @X=@$x; # make a copy
713$code.=<<___;
714 # ------------------
715 # first pass 01...07
716 # ------------------
717 mov $X[0], $A
718
719 mov $X[1],%rax
720 mul $A
721 mov %rax, (+$pDst_o+8*1)($pDst)
722___
723for(my $i=2;$i<8;$i++) {
724$code.=<<___;
725 mov %rdx, $X[$i-2]
726 mov $X[$i],%rax
727 mul $A
728 add %rax, $X[$i-2]
729 adc \$0, %rdx
730___
731}
732$code.=<<___;
733 mov %rdx, $x7
734
735 mov $X[0], (+$pDst_o+8*2)($pDst)
736
737 # ------------------
738 # second pass 12...17
739 # ------------------
740
741 mov (+8*1)($pA), $A
742
743 mov (+8*2)($pA),%rax
744 mul $A
745 add %rax, $X[1]
746 adc \$0, %rdx
747 mov $X[1], (+$pDst_o+8*3)($pDst)
748
749 mov %rdx, $X[0]
750 mov (+8*3)($pA),%rax
751 mul $A
752 add %rax, $X[2]
753 adc \$0, %rdx
754 add $X[0], $X[2]
755 adc \$0, %rdx
756 mov $X[2], (+$pDst_o+8*4)($pDst)
757
758 mov %rdx, $X[0]
759 mov (+8*4)($pA),%rax
760 mul $A
761 add %rax, $X[3]
762 adc \$0, %rdx
763 add $X[0], $X[3]
764 adc \$0, %rdx
765
766 mov %rdx, $X[0]
767 mov (+8*5)($pA),%rax
768 mul $A
769 add %rax, $X[4]
770 adc \$0, %rdx
771 add $X[0], $X[4]
772 adc \$0, %rdx
773
774 mov %rdx, $X[0]
775 mov $X[6],%rax
776 mul $A
777 add %rax, $X[5]
778 adc \$0, %rdx
779 add $X[0], $X[5]
780 adc \$0, %rdx
781
782 mov %rdx, $X[0]
783 mov $X[7],%rax
784 mul $A
785 add %rax, $x7
786 adc \$0, %rdx
787 add $X[0], $x7
788 adc \$0, %rdx
789
790 mov %rdx, $X[1]
791
792 # ------------------
793 # third pass 23...27
794 # ------------------
795 mov (+8*2)($pA), $A
796
797 mov (+8*3)($pA),%rax
798 mul $A
799 add %rax, $X[3]
800 adc \$0, %rdx
801 mov $X[3], (+$pDst_o+8*5)($pDst)
802
803 mov %rdx, $X[0]
804 mov (+8*4)($pA),%rax
805 mul $A
806 add %rax, $X[4]
807 adc \$0, %rdx
808 add $X[0], $X[4]
809 adc \$0, %rdx
810 mov $X[4], (+$pDst_o+8*6)($pDst)
811
812 mov %rdx, $X[0]
813 mov (+8*5)($pA),%rax
814 mul $A
815 add %rax, $X[5]
816 adc \$0, %rdx
817 add $X[0], $X[5]
818 adc \$0, %rdx
819
820 mov %rdx, $X[0]
821 mov $X[6],%rax
822 mul $A
823 add %rax, $x7
824 adc \$0, %rdx
825 add $X[0], $x7
826 adc \$0, %rdx
827
828 mov %rdx, $X[0]
829 mov $X[7],%rax
830 mul $A
831 add %rax, $X[1]
832 adc \$0, %rdx
833 add $X[0], $X[1]
834 adc \$0, %rdx
835
836 mov %rdx, $X[2]
837
838 # ------------------
839 # fourth pass 34...37
840 # ------------------
841
842 mov (+8*3)($pA), $A
843
844 mov (+8*4)($pA),%rax
845 mul $A
846 add %rax, $X[5]
847 adc \$0, %rdx
848 mov $X[5], (+$pDst_o+8*7)($pDst)
849
850 mov %rdx, $X[0]
851 mov (+8*5)($pA),%rax
852 mul $A
853 add %rax, $x7
854 adc \$0, %rdx
855 add $X[0], $x7
856 adc \$0, %rdx
857 mov $x7, (+$pDst_o+8*8)($pDst)
858
859 mov %rdx, $X[0]
860 mov $X[6],%rax
861 mul $A
862 add %rax, $X[1]
863 adc \$0, %rdx
864 add $X[0], $X[1]
865 adc \$0, %rdx
866
867 mov %rdx, $X[0]
868 mov $X[7],%rax
869 mul $A
870 add %rax, $X[2]
871 adc \$0, %rdx
872 add $X[0], $X[2]
873 adc \$0, %rdx
874
875 mov %rdx, $X[5]
876
877 # ------------------
878 # fifth pass 45...47
879 # ------------------
880 mov (+8*4)($pA), $A
881
882 mov (+8*5)($pA),%rax
883 mul $A
884 add %rax, $X[1]
885 adc \$0, %rdx
886 mov $X[1], (+$pDst_o+8*9)($pDst)
887
888 mov %rdx, $X[0]
889 mov $X[6],%rax
890 mul $A
891 add %rax, $X[2]
892 adc \$0, %rdx
893 add $X[0], $X[2]
894 adc \$0, %rdx
895 mov $X[2], (+$pDst_o+8*10)($pDst)
896
897 mov %rdx, $X[0]
898 mov $X[7],%rax
899 mul $A
900 add %rax, $X[5]
901 adc \$0, %rdx
902 add $X[0], $X[5]
903 adc \$0, %rdx
904
905 mov %rdx, $X[1]
906
907 # ------------------
908 # sixth pass 56...57
909 # ------------------
910 mov (+8*5)($pA), $A
911
912 mov $X[6],%rax
913 mul $A
914 add %rax, $X[5]
915 adc \$0, %rdx
916 mov $X[5], (+$pDst_o+8*11)($pDst)
917
918 mov %rdx, $X[0]
919 mov $X[7],%rax
920 mul $A
921 add %rax, $X[1]
922 adc \$0, %rdx
923 add $X[0], $X[1]
924 adc \$0, %rdx
925 mov $X[1], (+$pDst_o+8*12)($pDst)
926
927 mov %rdx, $X[2]
928
929 # ------------------
930 # seventh pass 67
931 # ------------------
932 mov $X[6], $A
933
934 mov $X[7],%rax
935 mul $A
936 add %rax, $X[2]
937 adc \$0, %rdx
938 mov $X[2], (+$pDst_o+8*13)($pDst)
939
940 mov %rdx, (+$pDst_o+8*14)($pDst)
941
942 # start finalize (add in squares, and double off-terms)
943 mov (+$pDst_o+8*1)($pDst), $X[0]
944 mov (+$pDst_o+8*2)($pDst), $X[1]
945 mov (+$pDst_o+8*3)($pDst), $X[2]
946 mov (+$pDst_o+8*4)($pDst), $X[3]
947 mov (+$pDst_o+8*5)($pDst), $X[4]
948 mov (+$pDst_o+8*6)($pDst), $X[5]
949
950 mov (+8*3)($pA), %rax
951 mul %rax
952 mov %rax, $x6
953 mov %rdx, $X[6]
954
955 add $X[0], $X[0]
956 adc $X[1], $X[1]
957 adc $X[2], $X[2]
958 adc $X[3], $X[3]
959 adc $X[4], $X[4]
960 adc $X[5], $X[5]
961 adc \$0, $X[6]
962
963 mov (+8*0)($pA), %rax
964 mul %rax
965 mov %rax, (+$pDst_o+8*0)($pDst)
966 mov %rdx, $A
967
968 mov (+8*1)($pA), %rax
969 mul %rax
970
971 add $A, $X[0]
972 adc %rax, $X[1]
973 adc \$0, %rdx
974
975 mov %rdx, $A
976 mov $X[0], (+$pDst_o+8*1)($pDst)
977 mov $X[1], (+$pDst_o+8*2)($pDst)
978
979 mov (+8*2)($pA), %rax
980 mul %rax
981
982 add $A, $X[2]
983 adc %rax, $X[3]
984 adc \$0, %rdx
985
986 mov %rdx, $A
987
988 mov $X[2], (+$pDst_o+8*3)($pDst)
989 mov $X[3], (+$pDst_o+8*4)($pDst)
990
991 xor $tmp, $tmp
992 add $A, $X[4]
993 adc $x6, $X[5]
994 adc \$0, $tmp
995
996 mov $X[4], (+$pDst_o+8*5)($pDst)
997 mov $X[5], (+$pDst_o+8*6)($pDst)
998
999 # %%tmp has 0/1 in column 7
1000 # %%A6 has a full value in column 7
1001
1002 mov (+$pDst_o+8*7)($pDst), $X[0]
1003 mov (+$pDst_o+8*8)($pDst), $X[1]
1004 mov (+$pDst_o+8*9)($pDst), $X[2]
1005 mov (+$pDst_o+8*10)($pDst), $X[3]
1006 mov (+$pDst_o+8*11)($pDst), $X[4]
1007 mov (+$pDst_o+8*12)($pDst), $X[5]
1008 mov (+$pDst_o+8*13)($pDst), $x6
1009 mov (+$pDst_o+8*14)($pDst), $x7
1010
1011 mov $X[7], %rax
1012 mul %rax
1013 mov %rax, $X[7]
1014 mov %rdx, $A
1015
1016 add $X[0], $X[0]
1017 adc $X[1], $X[1]
1018 adc $X[2], $X[2]
1019 adc $X[3], $X[3]
1020 adc $X[4], $X[4]
1021 adc $X[5], $X[5]
1022 adc $x6, $x6
1023 adc $x7, $x7
1024 adc \$0, $A
1025
1026 add $tmp, $X[0]
1027
1028 mov (+8*4)($pA), %rax
1029 mul %rax
1030
1031 add $X[6], $X[0]
1032 adc %rax, $X[1]
1033 adc \$0, %rdx
1034
1035 mov %rdx, $tmp
1036
1037 mov $X[0], (+$pDst_o+8*7)($pDst)
1038 mov $X[1], (+$pDst_o+8*8)($pDst)
1039
1040 mov (+8*5)($pA), %rax
1041 mul %rax
1042
1043 add $tmp, $X[2]
1044 adc %rax, $X[3]
1045 adc \$0, %rdx
1046
1047 mov %rdx, $tmp
1048
1049 mov $X[2], (+$pDst_o+8*9)($pDst)
1050 mov $X[3], (+$pDst_o+8*10)($pDst)
1051
1052 mov (+8*6)($pA), %rax
1053 mul %rax
1054
1055 add $tmp, $X[4]
1056 adc %rax, $X[5]
1057 adc \$0, %rdx
1058
1059 mov $X[4], (+$pDst_o+8*11)($pDst)
1060 mov $X[5], (+$pDst_o+8*12)($pDst)
1061
1062 add %rdx, $x6
1063 adc $X[7], $x7
1064 adc \$0, $A
1065
1066 mov $x6, (+$pDst_o+8*13)($pDst)
1067 mov $x7, (+$pDst_o+8*14)($pDst)
1068 mov $A, (+$pDst_o+8*15)($pDst)
1069___
1070}
1071
1072#
1073# sqr_reduce: subroutine to compute Result = reduce(Result * Result)
1074#
1075# input and result also in: r9, r8, r15, r14, r13, r12, r11, r10
1076#
1077$code.=<<___;
1078.type sqr_reduce,\@abi-omnipotent
1079.align 16
1080sqr_reduce:
1081 mov (+$pResult_offset+8)(%rsp), %rcx
1082___
1083 &SQR_512("%rsp+$tmp16_offset+8", "%rcx", [map("%r$_",(10..15,8..9))], "%rbx", "%rbp", "%rsi", "%rdi");
1084$code.=<<___;
1085 # tail recursion optimization: jmp to mont_reduce and return from there
1086 jmp mont_reduce
1087 # call mont_reduce
1088 # ret
1089.size sqr_reduce,.-sqr_reduce
1090___
1091}}}
1092
1093#
1094# MAIN FUNCTION
1095#
1096
1097#mod_exp_512(UINT64 *result, /* 512 bits, 8 qwords */
1098# UINT64 *g, /* 512 bits, 8 qwords */
1099# UINT64 *exp, /* 512 bits, 8 qwords */
1100# struct mod_ctx_512 *data)
1101
1102# window size = 5
1103# table size = 2^5 = 32
1104#table_entries equ 32
1105#table_size equ table_entries * 8
1106$code.=<<___;
1107.globl mod_exp_512
1108.type mod_exp_512,\@function,4
1109mod_exp_512:
1110 push %rbp
1111 push %rbx
1112 push %r12
1113 push %r13
1114 push %r14
1115 push %r15
1116
1117 # adjust stack down and then align it with cache boundary
1118 mov %rsp, %r8
1119 sub \$$mem_size, %rsp
1120 and \$-64, %rsp
1121
1122 # store previous stack pointer and arguments
1123 mov %r8, (+$rsp_offset)(%rsp)
1124 mov %rdi, (+$pResult_offset)(%rsp)
1125 mov %rsi, (+$pG_offset)(%rsp)
1126 mov %rcx, (+$pData_offset)(%rsp)
1127.Lbody:
1128 # transform g into montgomery space
1129 # GT = reduce(g * C2) = reduce(g * (2^256))
1130 # reduce expects to have the input in [tmp16]
1131 pxor %xmm4, %xmm4
1132 movdqu (+16*0)(%rsi), %xmm0
1133 movdqu (+16*1)(%rsi), %xmm1
1134 movdqu (+16*2)(%rsi), %xmm2
1135 movdqu (+16*3)(%rsi), %xmm3
1136 movdqa %xmm4, (+$tmp16_offset+16*0)(%rsp)
1137 movdqa %xmm4, (+$tmp16_offset+16*1)(%rsp)
1138 movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp)
1139 movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp)
1140 movdqa %xmm0, (+$tmp16_offset+16*2)(%rsp)
1141 movdqa %xmm1, (+$tmp16_offset+16*3)(%rsp)
1142 movdqa %xmm2, (+$tmp16_offset+16*4)(%rsp)
1143 movdqa %xmm3, (+$tmp16_offset+16*5)(%rsp)
1144
1145 # load pExp before rdx gets blown away
1146 movdqu (+16*0)(%rdx), %xmm0
1147 movdqu (+16*1)(%rdx), %xmm1
1148 movdqu (+16*2)(%rdx), %xmm2
1149 movdqu (+16*3)(%rdx), %xmm3
1150
1151 lea (+$GT_offset)(%rsp), %rbx
1152 mov %rbx, (+$red_result_addr_offset)(%rsp)
1153 call mont_reduce
1154
1155 # Initialize tmp = C
1156 lea (+$tmp_offset)(%rsp), %rcx
1157 xor %rax, %rax
1158 mov %rax, (+8*0)(%rcx)
1159 mov %rax, (+8*1)(%rcx)
1160 mov %rax, (+8*3)(%rcx)
1161 mov %rax, (+8*4)(%rcx)
1162 mov %rax, (+8*5)(%rcx)
1163 mov %rax, (+8*6)(%rcx)
1164 mov %rax, (+8*7)(%rcx)
1165 mov %rax, (+$exp_offset+8*8)(%rsp)
1166 movq \$1, (+8*2)(%rcx)
1167
1168 lea (+$garray_offset)(%rsp), %rbp
1169 mov %rcx, %rsi # pTmp
1170 mov %rbp, %rdi # Garray[][0]
1171___
1172 
# Scatter the initial entry (tmp, held in %rcx) into Garray (%rdi) via
# swizzle(); then fill the remaining 31 table entries with successive
# Montgomery products tmp *= GT, swizzling each into place.
&swizzle("%rdi", "%rcx", "%rax", "%rbx");

1174 
1175 # for (rax = 31; rax != 0; rax--) {
1176 # tmp = reduce(tmp * G)
1177 # swizzle(pg, tmp);
1178 # pg += 2; }
$code.=<<___;
1180 mov \$31, %rax
1181 mov %rax, (+$i_offset)(%rsp)
1182 mov %rbp, (+$pg_offset)(%rsp)
1183 # rsi -> pTmp
1184 mov %rsi, (+$red_result_addr_offset)(%rsp)
1185 mov (+8*0)(%rsi), %r10
1186 mov (+8*1)(%rsi), %r11
1187 mov (+8*2)(%rsi), %r12
1188 mov (+8*3)(%rsi), %r13
1189 mov (+8*4)(%rsi), %r14
1190 mov (+8*5)(%rsi), %r15
1191 mov (+8*6)(%rsi), %r8
1192 mov (+8*7)(%rsi), %r9
1193 init_loop:
1194 lea (+$GT_offset)(%rsp), %rdi
1195 call mont_mul_a3b
1196 lea (+$tmp_offset)(%rsp), %rsi
1197 mov (+$pg_offset)(%rsp), %rbp
1198 add \$2, %rbp
1199 mov %rbp, (+$pg_offset)(%rsp)
1200 mov %rsi, %rcx # rcx = rsi = addr of tmp
1201 ___
1202 
&swizzle("%rbp", "%rcx", "%rax", "%rbx");
$code.=<<___;
1205 mov (+$i_offset)(%rsp), %rax
1206 sub \$1, %rax
1207 mov %rax, (+$i_offset)(%rsp)
1208 jne init_loop
1209 
1210 #
1211 # Copy exponent onto stack
1212 movdqa %xmm0, (+$exp_offset+16*0)(%rsp)
1213 movdqa %xmm1, (+$exp_offset+16*1)(%rsp)
1214 movdqa %xmm2, (+$exp_offset+16*2)(%rsp)
1215 movdqa %xmm3, (+$exp_offset+16*3)(%rsp)
1216 
1217 
1218 #
1219 # Do exponentiation
1220 # Initialize result to G[exp{511:507}]
1221 mov (+$exp_offset+62)(%rsp), %eax
1222 mov %rax, %rdx
1223 shr \$11, %rax
1224 and \$0x07FF, %edx
1225 mov %edx, (+$exp_offset+62)(%rsp)
1226 lea (+$garray_offset)(%rsp,%rax,2), %rsi
1227 mov (+$pResult_offset)(%rsp), %rdx
1228 ___
1229 
# Gather the top-window table entry (%rsi) into the result buffer (%rdx)
# with unswizzle(), giving the starting value for the square-and-multiply
# loop below.
&unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
1231 
1232 #
1233 # Loop variables
1234 # rcx = [loop_idx] = index: 510-5 to 0 by 5
$code.=<<___;
1236 movq \$505, (+$loop_idx_offset)(%rsp)
1237 
1238 mov (+$pResult_offset)(%rsp), %rcx
1239 mov %rcx, (+$red_result_addr_offset)(%rsp)
1240 mov (+8*0)(%rcx), %r10
1241 mov (+8*1)(%rcx), %r11
1242 mov (+8*2)(%rcx), %r12
1243 mov (+8*3)(%rcx), %r13
1244 mov (+8*4)(%rcx), %r14
1245 mov (+8*5)(%rcx), %r15
1246 mov (+8*6)(%rcx), %r8
1247 mov (+8*7)(%rcx), %r9
1248 jmp sqr_2
1249 
1250 main_loop_a3b:
1251 call sqr_reduce
1252 call sqr_reduce
1253 call sqr_reduce
1254 sqr_2:
1255 call sqr_reduce
1256 call sqr_reduce
1257 
1258 #
1259 # Do multiply, first look up proper value in Garray
1260 mov (+$loop_idx_offset)(%rsp), %rcx # bit index
1261 mov %rcx, %rax
1262 shr \$4, %rax # rax is word pointer
1263 mov (+$exp_offset)(%rsp,%rax,2), %edx
1264 and \$15, %rcx
1265 shrq %cl, %rdx
1266 and \$0x1F, %rdx
1267 
1268 lea (+$garray_offset)(%rsp,%rdx,2), %rsi
1269 lea (+$tmp_offset)(%rsp), %rdx
1270 mov %rdx, %rdi
1271 ___
1272 
# Gather the selected 5-bit-window table entry into tmp before the
# Montgomery multiply.
&unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
1274 # rdi = tmp = pG
1275 
1276 #
1277 # Call mod_mul_a1(pDst, pSrc1, pSrc2, pM, pData)
1278 # result result pG M Data
$code.=<<___;
1280 mov (+$pResult_offset)(%rsp), %rsi
1281 call mont_mul_a3b
1282 
1283 #
1284 # finish loop
1285 mov (+$loop_idx_offset)(%rsp), %rcx
1286 sub \$5, %rcx
1287 mov %rcx, (+$loop_idx_offset)(%rsp)
1288 jge main_loop_a3b
1289 
1290 #
1291 
1292 end_main_loop_a3b:
1293 # transform result out of Montgomery space
1294 # result = reduce(result)
1295 mov (+$pResult_offset)(%rsp), %rdx
1296 pxor %xmm4, %xmm4
1297 movdqu (+16*0)(%rdx), %xmm0
1298 movdqu (+16*1)(%rdx), %xmm1
1299 movdqu (+16*2)(%rdx), %xmm2
1300 movdqu (+16*3)(%rdx), %xmm3
1301 movdqa %xmm4, (+$tmp16_offset+16*4)(%rsp)
1302 movdqa %xmm4, (+$tmp16_offset+16*5)(%rsp)
1303 movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp)
1304 movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp)
1305 movdqa %xmm0, (+$tmp16_offset+16*0)(%rsp)
1306 movdqa %xmm1, (+$tmp16_offset+16*1)(%rsp)
1307 movdqa %xmm2, (+$tmp16_offset+16*2)(%rsp)
1308 movdqa %xmm3, (+$tmp16_offset+16*3)(%rsp)
1309 call mont_reduce
1310 
1311 # If result > m, subract m
1312 # load result into r15:r8
1313 mov (+$pResult_offset)(%rsp), %rax
1314 mov (+8*0)(%rax), %r8
1315 mov (+8*1)(%rax), %r9
1316 mov (+8*2)(%rax), %r10
1317 mov (+8*3)(%rax), %r11
1318 mov (+8*4)(%rax), %r12
1319 mov (+8*5)(%rax), %r13
1320 mov (+8*6)(%rax), %r14
1321 mov (+8*7)(%rax), %r15
1322 
1323 # subtract m
1324 mov (+$pData_offset)(%rsp), %rbx
1325 add \$$M, %rbx
1326 
1327 sub (+8*0)(%rbx), %r8
1328 sbb (+8*1)(%rbx), %r9
1329 sbb (+8*2)(%rbx), %r10
1330 sbb (+8*3)(%rbx), %r11
1331 sbb (+8*4)(%rbx), %r12
1332 sbb (+8*5)(%rbx), %r13
1333 sbb (+8*6)(%rbx), %r14
1334 sbb (+8*7)(%rbx), %r15
1335 
1336 # if Carry is clear, replace result with difference
1337 mov (+8*0)(%rax), %rsi
1338 mov (+8*1)(%rax), %rdi
1339 mov (+8*2)(%rax), %rcx
1340 mov (+8*3)(%rax), %rdx
1341 cmovnc %r8, %rsi
1342 cmovnc %r9, %rdi
1343 cmovnc %r10, %rcx
1344 cmovnc %r11, %rdx
1345 mov %rsi, (+8*0)(%rax)
1346 mov %rdi, (+8*1)(%rax)
1347 mov %rcx, (+8*2)(%rax)
1348 mov %rdx, (+8*3)(%rax)
1349 
1350 mov (+8*4)(%rax), %rsi
1351 mov (+8*5)(%rax), %rdi
1352 mov (+8*6)(%rax), %rcx
1353 mov (+8*7)(%rax), %rdx
1354 cmovnc %r12, %rsi
1355 cmovnc %r13, %rdi
1356 cmovnc %r14, %rcx
1357 cmovnc %r15, %rdx
1358 mov %rsi, (+8*4)(%rax)
1359 mov %rdi, (+8*5)(%rax)
1360 mov %rcx, (+8*6)(%rax)
1361 mov %rdx, (+8*7)(%rax)
1362 
1363 mov (+$rsp_offset)(%rsp), %rsi
1364 mov 0(%rsi),%r15
1365 mov 8(%rsi),%r14
1366 mov 16(%rsi),%r13
1367 mov 24(%rsi),%r12
1368 mov 32(%rsi),%rbx
1369 mov 40(%rsi),%rbp
1370 lea 48(%rsi),%rsp
1371 .Lepilogue:
1372 ret
1373 .size mod_exp_512, . - mod_exp_512
1374 ___
1375
1376 if ($win64) {
1377 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1378 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64-only structured-exception-handling unwind handler for mod_exp_512.
# If the fault RIP is before .Lbody or at/after .Lepilogue the frame is in
# its caller state; otherwise the caller's %rsp is fetched from
# $rsp_offset in the aligned frame and the six callee-saved registers are
# restored into the CONTEXT record before RtlVirtualUnwind is invoked.
my $rec="%rcx";
my $frame="%rdx";
my $context="%r8";
my $disp="%r9";
1383 
$code.=<<___;
1385 .extern __imp_RtlVirtualUnwind
1386 .type mod_exp_512_se_handler,\@abi-omnipotent
1387 .align 16
1388 mod_exp_512_se_handler:
1389 push %rsi
1390 push %rdi
1391 push %rbx
1392 push %rbp
1393 push %r12
1394 push %r13
1395 push %r14
1396 push %r15
1397 pushfq
1398 sub \$64,%rsp
1399 
1400 mov 120($context),%rax # pull context->Rax
1401 mov 248($context),%rbx # pull context->Rip
1402 
1403 lea .Lbody(%rip),%r10
1404 cmp %r10,%rbx # context->Rip<prologue label
1405 jb .Lin_prologue
1406 
1407 mov 152($context),%rax # pull context->Rsp
1408 
1409 lea .Lepilogue(%rip),%r10
1410 cmp %r10,%rbx # context->Rip>=epilogue label
1411 jae .Lin_prologue
1412 
1413 mov $rsp_offset(%rax),%rax # pull saved Rsp
1414 
1415 mov 32(%rax),%rbx
1416 mov 40(%rax),%rbp
1417 mov 24(%rax),%r12
1418 mov 16(%rax),%r13
1419 mov 8(%rax),%r14
1420 mov 0(%rax),%r15
1421 lea 48(%rax),%rax
1422 mov %rbx,144($context) # restore context->Rbx
1423 mov %rbp,160($context) # restore context->Rbp
1424 mov %r12,216($context) # restore context->R12
1425 mov %r13,224($context) # restore context->R13
1426 mov %r14,232($context) # restore context->R14
1427 mov %r15,240($context) # restore context->R15
1428 
1429 .Lin_prologue:
1430 mov 8(%rax),%rdi
1431 mov 16(%rax),%rsi
1432 mov %rax,152($context) # restore context->Rsp
1433 mov %rsi,168($context) # restore context->Rsi
1434 mov %rdi,176($context) # restore context->Rdi
1435 
1436 mov 40($disp),%rdi # disp->ContextRecord
1437 mov $context,%rsi # context
1438 mov \$154,%ecx # sizeof(CONTEXT)
1439 .long 0xa548f3fc # cld; rep movsq
1440 
1441 mov $disp,%rsi
1442 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1443 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1444 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1445 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1446 mov 40(%rsi),%r10 # disp->ContextRecord
1447 lea 56(%rsi),%r11 # &disp->HandlerData
1448 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1449 mov %r10,32(%rsp) # arg5
1450 mov %r11,40(%rsp) # arg6
1451 mov %r12,48(%rsp) # arg7
1452 mov %rcx,56(%rsp) # arg8, (NULL)
1453 call *__imp_RtlVirtualUnwind(%rip)
1454 
1455 mov \$1,%eax # ExceptionContinueSearch
1456 add \$64,%rsp
1457 popfq
1458 pop %r15
1459 pop %r14
1460 pop %r13
1461 pop %r12
1462 pop %rbp
1463 pop %rbx
1464 pop %rdi
1465 pop %rsi
1466 ret
1467 .size mod_exp_512_se_handler,.-mod_exp_512_se_handler
1468 
1469 .section .pdata
1470 .align 4
1471 .rva .LSEH_begin_mod_exp_512
1472 .rva .LSEH_end_mod_exp_512
1473 .rva .LSEH_info_mod_exp_512
1474 
1475 .section .xdata
1476 .align 8
1477 .LSEH_info_mod_exp_512:
1478 .byte 9,0,0,0
1479 .rva mod_exp_512_se_handler
1480 ___
1481 }
1482
# Map a 64-bit register name onto its byte/word/dword sub-register for a
# given conversion suffix ("b", "w" or "d").  Numbered registers
# (%r8..%r15) simply take the suffix (%r8 -> %r8b); legacy registers are
# rewritten textually (%rax -> %al / %ax / %eax).  Used by the "%reg#b"
# post-processing substitution at the end of the script.
sub reg_part {
    my ($reg, $conv) = @_;

    # numbered registers: append the suffix verbatim
    return $reg . $conv if ($reg =~ /%r[0-9]+/);

    if ($conv eq "b") {
        $reg =~ s/%[er]([^x]+)x?/%$1l/;
    } elsif ($conv eq "w") {
        $reg =~ s/%[er](.+)/%$1/;
    } elsif ($conv eq "d") {
        $reg =~ s/%[er](.+)/%e$1/;
    }
    return $reg;
}
1491
# Final text post-processing over the accumulated $code string:
#  - "%reg#b" / "%reg#w" / "%reg#d" tokens become the matching
#    byte/word/dword sub-register via reg_part()
#  - backtick-quoted spans are evaluated as Perl expressions
#  - "(+const-expr)" memory displacements are folded to plain integers
# The finished assembly is then written to stdout.
1492 $code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
1493 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1494 $code =~ s/(\(\+[^)]+\))/eval $1/gem;
1495 print $code;
1496 close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/pa-risc2.s b/src/lib/libcrypto/bn/asm/pa-risc2.s
deleted file mode 100644
index f3b16290eb..0000000000
--- a/src/lib/libcrypto/bn/asm/pa-risc2.s
+++ /dev/null
@@ -1,1618 +0,0 @@
1;
2; PA-RISC 2.0 implementation of bn_asm code, based on the
3; 64-bit version of the code. This code is effectively the
4; same as the 64-bit version except the register model is
5; slightly different given all values must be 32-bit between
6; function calls. Thus the 64-bit return values are returned
7 ; in %ret0 and %ret1 vs just %ret0 as is done in the 64-bit code
8;
9;
10; This code is approximately 2x faster than the C version
11; for RSA/DSA.
12;
13; See http://devresource.hp.com/ for more details on the PA-RISC
14; architecture. Also see the book "PA-RISC 2.0 Architecture"
15; by Gerry Kane for information on the instruction set architecture.
16;
17; Code written by Chris Ruemmler (with some help from the HP C
18; compiler).
19;
20; The code compiles with HP's assembler
21;
22
23 .level 2.0N
24 .space $TEXT$
25 .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
26
27;
28; Global Register definitions used for the routines.
29;
30; Some information about HP's runtime architecture for 32-bits.
31;
32; "Caller save" means the calling function must save the register
33; if it wants the register to be preserved.
34; "Callee save" means if a function uses the register, it must save
35; the value before using it.
36;
37; For the floating point registers
38;
39; "caller save" registers: fr4-fr11, fr22-fr31
40; "callee save" registers: fr12-fr21
41; "special" registers: fr0-fr3 (status and exception registers)
42;
43; For the integer registers
44; value zero : r0
45; "caller save" registers: r1,r19-r26
46; "callee save" registers: r3-r18
47; return register : r2 (rp)
48; return values ; r28,r29 (ret0,ret1)
49; Stack pointer ; r30 (sp)
50; millicode return ptr ; r31 (also a caller save register)
51
52
53;
54; Arguments to the routines
55;
56r_ptr .reg %r26
57a_ptr .reg %r25
58b_ptr .reg %r24
59num .reg %r24
60n .reg %r23
61
62;
63; Note that the "w" argument for bn_mul_add_words and bn_mul_words
64; is passed on the stack at a delta of -56 from the top of stack
65; as the routine is entered.
66;
67
68;
69; Globals used in some routines
70;
71
72top_overflow .reg %r23
73high_mask .reg %r22 ; value 0xffffffff80000000L
74
75
76;------------------------------------------------------------------------------
77;
78; bn_mul_add_words
79;
80;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
81; int num, BN_ULONG w)
82;
83; arg0 = r_ptr
84; arg1 = a_ptr
85; arg3 = num
86; -56(sp) = w
87;
88; Local register definitions
89;
90
91fm1 .reg %fr22
92fm .reg %fr23
93ht_temp .reg %fr24
94ht_temp_1 .reg %fr25
95lt_temp .reg %fr26
96lt_temp_1 .reg %fr27
97fm1_1 .reg %fr28
98fm_1 .reg %fr29
99
100fw_h .reg %fr7L
101fw_l .reg %fr7R
102fw .reg %fr7
103
104fht_0 .reg %fr8L
105flt_0 .reg %fr8R
106t_float_0 .reg %fr8
107
108fht_1 .reg %fr9L
109flt_1 .reg %fr9R
110t_float_1 .reg %fr9
111
112tmp_0 .reg %r31
113tmp_1 .reg %r21
114m_0 .reg %r20
115m_1 .reg %r19
116ht_0 .reg %r1
117ht_1 .reg %r3
118lt_0 .reg %r4
119lt_1 .reg %r5
120m1_0 .reg %r6
121m1_1 .reg %r7
122rp_val .reg %r8
123rp_val_1 .reg %r9
124
125bn_mul_add_words
126 .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
127 .proc
128 .callinfo frame=128
129 .entry
130 .align 64
131
132 STD %r3,0(%sp) ; save r3
133 STD %r4,8(%sp) ; save r4
134 NOP ; Needed to make the loop 16-byte aligned
135 NOP ; needed to make the loop 16-byte aligned
136
137 STD %r5,16(%sp) ; save r5
138 NOP
139 STD %r6,24(%sp) ; save r6
140 STD %r7,32(%sp) ; save r7
141
142 STD %r8,40(%sp) ; save r8
143 STD %r9,48(%sp) ; save r9
144 COPY %r0,%ret1 ; return 0 by default
145 DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
146
147 CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit
148 LDO 128(%sp),%sp ; bump stack
149
150 ;
151 ; The loop is unrolled twice, so if there is only 1 number
152 ; then go straight to the cleanup code.
153 ;
154 CMPIB,= 1,num,bn_mul_add_words_single_top
155 FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l)
156
157 ;
158 ; This loop is unrolled 2 times (64-byte aligned as well)
159 ;
160 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
161 ; two 32-bit multiplies can be issued per cycle.
162 ;
163bn_mul_add_words_unroll2
164
165 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
166 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
167 LDD 0(r_ptr),rp_val ; rp[0]
168 LDD 8(r_ptr),rp_val_1 ; rp[1]
169
170 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
171 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
172 FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
173 FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
174
175 XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
176 XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
177 FSTD fm,-8(%sp) ; -8(sp) = m[0]
178 FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
179
180 XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
181 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
182 FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp
183 FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1
184
185 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
186 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
187 FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp
188 FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1
189
190 LDD -8(%sp),m_0 ; m[0]
191 LDD -40(%sp),m_1 ; m[1]
192 LDD -16(%sp),m1_0 ; m1[0]
193 LDD -48(%sp),m1_1 ; m1[1]
194
195 LDD -24(%sp),ht_0 ; ht[0]
196 LDD -56(%sp),ht_1 ; ht[1]
197 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
198 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
199
200 LDD -32(%sp),lt_0
201 LDD -64(%sp),lt_1
202 CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0])
203 ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
204
205 CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1])
206 ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
207 EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32
208 DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32
209
210 EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32
211 DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32
212 ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
213 ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
214
215 ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
216 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
217 ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
218 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
219
220 ADD %ret1,lt_0,lt_0 ; lt[0] = lt[0] + c;
221 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
222 ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0]
223 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
224
225 LDO -2(num),num ; num = num - 2;
226 ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c);
227 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
228 STD lt_0,0(r_ptr) ; rp[0] = lt[0]
229
230 ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1]
231 ADD,DC ht_1,%r0,%ret1 ; ht[1]++
232 LDO 16(a_ptr),a_ptr ; a_ptr += 2
233
234 STD lt_1,8(r_ptr) ; rp[1] = lt[1]
235 CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
236 LDO 16(r_ptr),r_ptr ; r_ptr += 2
237
238 CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
239
240 ;
241 ; Top of loop aligned on 64-byte boundary
242 ;
243bn_mul_add_words_single_top
244 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
245 LDD 0(r_ptr),rp_val ; rp[0]
246 LDO 8(a_ptr),a_ptr ; a_ptr++
247 XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
248 FSTD fm1,-16(%sp) ; -16(sp) = m1
249 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
250 FSTD fm,-8(%sp) ; -8(sp) = m
251 XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
252 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
253 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
254 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
255
256 LDD -8(%sp),m_0
257 LDD -16(%sp),m1_0 ; m1 = temp1
258 ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
259 LDD -24(%sp),ht_0
260 LDD -32(%sp),lt_0
261
262 CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
263 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
264
265 EXTRD,U tmp_0,31,32,m_0 ; m>>32
266 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
267
268 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
269 ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1;
270 ADD,DC ht_0,%r0,ht_0 ; ht++
271 ADD %ret1,tmp_0,lt_0 ; lt = lt + c;
272 ADD,DC ht_0,%r0,ht_0 ; ht++
273 ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0]
274 ADD,DC ht_0,%r0,%ret1 ; ht++
275 STD lt_0,0(r_ptr) ; rp[0] = lt
276
277bn_mul_add_words_exit
278 .EXIT
279
280 EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
281 LDD -80(%sp),%r9 ; restore r9
282 LDD -88(%sp),%r8 ; restore r8
283 LDD -96(%sp),%r7 ; restore r7
284 LDD -104(%sp),%r6 ; restore r6
285 LDD -112(%sp),%r5 ; restore r5
286 LDD -120(%sp),%r4 ; restore r4
287 BVE (%rp)
288 LDD,MB -128(%sp),%r3 ; restore r3
289 .PROCEND ;in=23,24,25,26,29;out=28;
290
291;----------------------------------------------------------------------------
292;
293;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
294;
295; arg0 = rp
296; arg1 = ap
297; arg3 = num
298; w on stack at -56(sp)
299
300bn_mul_words
301 .proc
302 .callinfo frame=128
303 .entry
304 .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
305 .align 64
306
307 STD %r3,0(%sp) ; save r3
308 STD %r4,8(%sp) ; save r4
309 NOP
310 STD %r5,16(%sp) ; save r5
311
312 STD %r6,24(%sp) ; save r6
313 STD %r7,32(%sp) ; save r7
314 COPY %r0,%ret1 ; return 0 by default
315 DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
316
317 CMPIB,>= 0,num,bn_mul_words_exit
318 LDO 128(%sp),%sp ; bump stack
319
320 ;
321 ; See if only 1 word to do, thus just do cleanup
322 ;
323 CMPIB,= 1,num,bn_mul_words_single_top
324 FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l)
325
326 ;
327 ; This loop is unrolled 2 times (64-byte aligned as well)
328 ;
329 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
330 ; two 32-bit multiplies can be issued per cycle.
331 ;
332bn_mul_words_unroll2
333
334 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
335 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
336 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
337 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l
338
339 FSTD fm1,-16(%sp) ; -16(sp) = m1
340 FSTD fm1_1,-48(%sp) ; -48(sp) = m1
341 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
342 XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h
343
344 FSTD fm,-8(%sp) ; -8(sp) = m
345 FSTD fm_1,-40(%sp) ; -40(sp) = m
346 XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
347 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h
348
349 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
350 FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht
351 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
352 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
353
354 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
355 FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt
356 LDD -8(%sp),m_0
357 LDD -40(%sp),m_1
358
359 LDD -16(%sp),m1_0
360 LDD -48(%sp),m1_1
361 LDD -24(%sp),ht_0
362 LDD -56(%sp),ht_1
363
364 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1;
365 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1;
366 LDD -32(%sp),lt_0
367 LDD -64(%sp),lt_1
368
369 CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1)
370 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
371 CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1)
372 ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32)
373
374 EXTRD,U tmp_0,31,32,m_0 ; m>>32
375 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
376 EXTRD,U tmp_1,31,32,m_1 ; m>>32
377 DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32
378
379 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
380 ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32)
381 ADD lt_0,m1_0,lt_0 ; lt = lt+m1;
382 ADD,DC ht_0,%r0,ht_0 ; ht++
383
384 ADD lt_1,m1_1,lt_1 ; lt = lt+m1;
385 ADD,DC ht_1,%r0,ht_1 ; ht++
386 ADD %ret1,lt_0,lt_0 ; lt = lt + c (ret1);
387 ADD,DC ht_0,%r0,ht_0 ; ht++
388
389 ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0)
390 ADD,DC ht_1,%r0,ht_1 ; ht++
391 STD lt_0,0(r_ptr) ; rp[0] = lt
392 STD lt_1,8(r_ptr) ; rp[1] = lt
393
394 COPY ht_1,%ret1 ; carry = ht
395 LDO -2(num),num ; num = num - 2;
396 LDO 16(a_ptr),a_ptr ; ap += 2
397 CMPIB,<= 2,num,bn_mul_words_unroll2
398 LDO 16(r_ptr),r_ptr ; rp++
399
400 CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
401
402 ;
403 ; Top of loop aligned on 64-byte boundary
404 ;
405bn_mul_words_single_top
406 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
407
408 XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
409 FSTD fm1,-16(%sp) ; -16(sp) = m1
410 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
411 FSTD fm,-8(%sp) ; -8(sp) = m
412 XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
413 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
414 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
415 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
416
417 LDD -8(%sp),m_0
418 LDD -16(%sp),m1_0
419 ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
420 LDD -24(%sp),ht_0
421 LDD -32(%sp),lt_0
422
423 CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
424 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
425
426 EXTRD,U tmp_0,31,32,m_0 ; m>>32
427 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
428
429 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
430 ADD lt_0,m1_0,lt_0 ; lt= lt+m1;
431 ADD,DC ht_0,%r0,ht_0 ; ht++
432
433 ADD %ret1,lt_0,lt_0 ; lt = lt + c;
434 ADD,DC ht_0,%r0,ht_0 ; ht++
435
436 COPY ht_0,%ret1 ; copy carry
437 STD lt_0,0(r_ptr) ; rp[0] = lt
438
439bn_mul_words_exit
440 .EXIT
441 EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
442 LDD -96(%sp),%r7 ; restore r7
443 LDD -104(%sp),%r6 ; restore r6
444 LDD -112(%sp),%r5 ; restore r5
445 LDD -120(%sp),%r4 ; restore r4
446 BVE (%rp)
447 LDD,MB -128(%sp),%r3 ; restore r3
448 .PROCEND
449
450;----------------------------------------------------------------------------
451;
452;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
453;
454; arg0 = rp
455; arg1 = ap
456; arg2 = num
457;
458
459bn_sqr_words
460 .proc
461 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
462 .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
463 .entry
464 .align 64
465
466 STD %r3,0(%sp) ; save r3
467 STD %r4,8(%sp) ; save r4
468 NOP
469 STD %r5,16(%sp) ; save r5
470
471 CMPIB,>= 0,num,bn_sqr_words_exit
472 LDO 128(%sp),%sp ; bump stack
473
474 ;
475 ; If only 1, then go straight to cleanup
476 ;
477 CMPIB,= 1,num,bn_sqr_words_single_top
478 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
479
480 ;
481 ; This loop is unrolled 2 times (64-byte aligned as well)
482 ;
483
484bn_sqr_words_unroll2
485 FLDD 0(a_ptr),t_float_0 ; a[0]
486 FLDD 8(a_ptr),t_float_1 ; a[1]
487 XMPYU fht_0,flt_0,fm ; m[0]
488 XMPYU fht_1,flt_1,fm_1 ; m[1]
489
490 FSTD fm,-24(%sp) ; store m[0]
491 FSTD fm_1,-56(%sp) ; store m[1]
492 XMPYU flt_0,flt_0,lt_temp ; lt[0]
493 XMPYU flt_1,flt_1,lt_temp_1 ; lt[1]
494
495 FSTD lt_temp,-16(%sp) ; store lt[0]
496 FSTD lt_temp_1,-48(%sp) ; store lt[1]
497 XMPYU fht_0,fht_0,ht_temp ; ht[0]
498 XMPYU fht_1,fht_1,ht_temp_1 ; ht[1]
499
500 FSTD ht_temp,-8(%sp) ; store ht[0]
501 FSTD ht_temp_1,-40(%sp) ; store ht[1]
502 LDD -24(%sp),m_0
503 LDD -56(%sp),m_1
504
505 AND m_0,high_mask,tmp_0 ; m[0] & Mask
506 AND m_1,high_mask,tmp_1 ; m[1] & Mask
507 DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1
508 DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1
509
510 LDD -16(%sp),lt_0
511 LDD -48(%sp),lt_1
512 EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1
513 EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1
514
515 LDD -8(%sp),ht_0
516 LDD -40(%sp),ht_1
517 ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0
518 ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1
519
520 ADD lt_0,m_0,lt_0 ; lt = lt+m
521 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
522 STD lt_0,0(r_ptr) ; rp[0] = lt[0]
523 STD ht_0,8(r_ptr) ; rp[1] = ht[1]
524
525 ADD lt_1,m_1,lt_1 ; lt = lt+m
526 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
527 STD lt_1,16(r_ptr) ; rp[2] = lt[1]
528 STD ht_1,24(r_ptr) ; rp[3] = ht[1]
529
530 LDO -2(num),num ; num = num - 2;
531 LDO 16(a_ptr),a_ptr ; ap += 2
532 CMPIB,<= 2,num,bn_sqr_words_unroll2
533 LDO 32(r_ptr),r_ptr ; rp += 4
534
535 CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
536
537 ;
538 ; Top of loop aligned on 64-byte boundary
539 ;
540bn_sqr_words_single_top
541 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
542
543 XMPYU fht_0,flt_0,fm ; m
544 FSTD fm,-24(%sp) ; store m
545
546 XMPYU flt_0,flt_0,lt_temp ; lt
547 FSTD lt_temp,-16(%sp) ; store lt
548
549 XMPYU fht_0,fht_0,ht_temp ; ht
550 FSTD ht_temp,-8(%sp) ; store ht
551
552 LDD -24(%sp),m_0 ; load m
553 AND m_0,high_mask,tmp_0 ; m & Mask
554 DEPD,Z m_0,30,31,m_0 ; m << 32+1
555 LDD -16(%sp),lt_0 ; lt
556
557 LDD -8(%sp),ht_0 ; ht
558 EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1
559 ADD m_0,lt_0,lt_0 ; lt = lt+m
560 ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0
561 ADD,DC ht_0,%r0,ht_0 ; ht++
562
563 STD lt_0,0(r_ptr) ; rp[0] = lt
564 STD ht_0,8(r_ptr) ; rp[1] = ht
565
566bn_sqr_words_exit
567 .EXIT
568 LDD -112(%sp),%r5 ; restore r5
569 LDD -120(%sp),%r4 ; restore r4
570 BVE (%rp)
571 LDD,MB -128(%sp),%r3
572 .PROCEND ;in=23,24,25,26,29;out=28;
573
574
575;----------------------------------------------------------------------------
576;
577;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
578;
579; arg0 = rp
580; arg1 = ap
581; arg2 = bp
582; arg3 = n
583
584t .reg %r22
585b .reg %r21
586l .reg %r20
587
588bn_add_words
589 .proc
590 .entry
591 .callinfo
592 .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
593 .align 64
594
595 CMPIB,>= 0,n,bn_add_words_exit
596 COPY %r0,%ret1 ; return 0 by default
597
598 ;
599 ; If 2 or more numbers do the loop
600 ;
601 CMPIB,= 1,n,bn_add_words_single_top
602 NOP
603
604 ;
605 ; This loop is unrolled 2 times (64-byte aligned as well)
606 ;
607bn_add_words_unroll2
608 LDD 0(a_ptr),t
609 LDD 0(b_ptr),b
610 ADD t,%ret1,t ; t = t+c;
611 ADD,DC %r0,%r0,%ret1 ; set c to carry
612 ADD t,b,l ; l = t + b[0]
613 ADD,DC %ret1,%r0,%ret1 ; c+= carry
614 STD l,0(r_ptr)
615
616 LDD 8(a_ptr),t
617 LDD 8(b_ptr),b
618 ADD t,%ret1,t ; t = t+c;
619 ADD,DC %r0,%r0,%ret1 ; set c to carry
620 ADD t,b,l ; l = t + b[0]
621 ADD,DC %ret1,%r0,%ret1 ; c+= carry
622 STD l,8(r_ptr)
623
624 LDO -2(n),n
625 LDO 16(a_ptr),a_ptr
626 LDO 16(b_ptr),b_ptr
627
628 CMPIB,<= 2,n,bn_add_words_unroll2
629 LDO 16(r_ptr),r_ptr
630
631 CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
632
633bn_add_words_single_top
634 LDD 0(a_ptr),t
635 LDD 0(b_ptr),b
636
637 ADD t,%ret1,t ; t = t+c;
638 ADD,DC %r0,%r0,%ret1 ; set c to carry (could use CMPCLR??)
639 ADD t,b,l ; l = t + b[0]
640 ADD,DC %ret1,%r0,%ret1 ; c+= carry
641 STD l,0(r_ptr)
642
643bn_add_words_exit
644 .EXIT
645 BVE (%rp)
646 EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
647 .PROCEND ;in=23,24,25,26,29;out=28;
648
649;----------------------------------------------------------------------------
650;
651;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
652;
653; arg0 = rp
654; arg1 = ap
655; arg2 = bp
656; arg3 = n
657
658t1 .reg %r22
659t2 .reg %r21
660sub_tmp1 .reg %r20
661sub_tmp2 .reg %r19
662
663
664bn_sub_words
665 .proc
666 .callinfo
667 .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
668 .entry
669 .align 64
670
671 CMPIB,>= 0,n,bn_sub_words_exit
672 COPY %r0,%ret1 ; return 0 by default
673
674 ;
675 ; If 2 or more numbers do the loop
676 ;
677 CMPIB,= 1,n,bn_sub_words_single_top
678 NOP
679
680 ;
681 ; This loop is unrolled 2 times (64-byte aligned as well)
682 ;
683bn_sub_words_unroll2
684 LDD 0(a_ptr),t1
685 LDD 0(b_ptr),t2
686 SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
687 SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
688
689 CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
690 LDO 1(%r0),sub_tmp2
691
692 CMPCLR,*= t1,t2,%r0
693 COPY sub_tmp2,%ret1
694 STD sub_tmp1,0(r_ptr)
695
696 LDD 8(a_ptr),t1
697 LDD 8(b_ptr),t2
698 SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
699 SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
700 CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
701 LDO 1(%r0),sub_tmp2
702
703 CMPCLR,*= t1,t2,%r0
704 COPY sub_tmp2,%ret1
705 STD sub_tmp1,8(r_ptr)
706
707 LDO -2(n),n
708 LDO 16(a_ptr),a_ptr
709 LDO 16(b_ptr),b_ptr
710
711 CMPIB,<= 2,n,bn_sub_words_unroll2
712 LDO 16(r_ptr),r_ptr
713
714 CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
715
716bn_sub_words_single_top
717 LDD 0(a_ptr),t1
718 LDD 0(b_ptr),t2
719 SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
720 SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
721 CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
722 LDO 1(%r0),sub_tmp2
723
724 CMPCLR,*= t1,t2,%r0
725 COPY sub_tmp2,%ret1
726
727 STD sub_tmp1,0(r_ptr)
728
729bn_sub_words_exit
730 .EXIT
731 BVE (%rp)
732 EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
733 .PROCEND ;in=23,24,25,26,29;out=28;
734
735;------------------------------------------------------------------------------
736;
737; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
738;
739; arg0 = h
740; arg1 = l
741; arg2 = d
742;
743; This is mainly just output from the HP C compiler.
744;
745;------------------------------------------------------------------------------
746bn_div_words
747 .PROC
748 .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN
749 .IMPORT BN_num_bits_word,CODE
750 ;--- not PIC .IMPORT __iob,DATA
751 ;--- not PIC .IMPORT fprintf,CODE
752 .IMPORT abort,CODE
753 .IMPORT $$div2U,MILLICODE
754 .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
755 .ENTRY
756 STW %r2,-20(%r30) ;offset 0x8ec
757 STW,MA %r3,192(%r30) ;offset 0x8f0
758 STW %r4,-188(%r30) ;offset 0x8f4
759 DEPD %r5,31,32,%r6 ;offset 0x8f8
760 STD %r6,-184(%r30) ;offset 0x8fc
761 DEPD %r7,31,32,%r8 ;offset 0x900
762 STD %r8,-176(%r30) ;offset 0x904
763 STW %r9,-168(%r30) ;offset 0x908
764 LDD -248(%r30),%r3 ;offset 0x90c
765 COPY %r26,%r4 ;offset 0x910
766 COPY %r24,%r5 ;offset 0x914
767 DEPD %r25,31,32,%r4 ;offset 0x918
768 CMPB,*<> %r3,%r0,$0006000C ;offset 0x91c
769 DEPD %r23,31,32,%r5 ;offset 0x920
770 MOVIB,TR -1,%r29,$00060002 ;offset 0x924
771 EXTRD,U %r29,31,32,%r28 ;offset 0x928
772$0006002A
773 LDO -1(%r29),%r29 ;offset 0x92c
774 SUB %r23,%r7,%r23 ;offset 0x930
775$00060024
776 SUB %r4,%r31,%r25 ;offset 0x934
777 AND %r25,%r19,%r26 ;offset 0x938
778 CMPB,*<>,N %r0,%r26,$00060046 ;offset 0x93c
779 DEPD,Z %r25,31,32,%r20 ;offset 0x940
780 OR %r20,%r24,%r21 ;offset 0x944
781 CMPB,*<<,N %r21,%r23,$0006002A ;offset 0x948
782 SUB %r31,%r2,%r31 ;offset 0x94c
783$00060046
784$0006002E
785 DEPD,Z %r23,31,32,%r25 ;offset 0x950
786 EXTRD,U %r23,31,32,%r26 ;offset 0x954
787 AND %r25,%r19,%r24 ;offset 0x958
788 ADD,L %r31,%r26,%r31 ;offset 0x95c
789 CMPCLR,*>>= %r5,%r24,%r0 ;offset 0x960
790 LDO 1(%r31),%r31 ;offset 0x964
791$00060032
792 CMPB,*<<=,N %r31,%r4,$00060036 ;offset 0x968
793 LDO -1(%r29),%r29 ;offset 0x96c
794 ADD,L %r4,%r3,%r4 ;offset 0x970
795$00060036
796 ADDIB,=,N -1,%r8,$D0 ;offset 0x974
797 SUB %r5,%r24,%r28 ;offset 0x978
798$0006003A
799 SUB %r4,%r31,%r24 ;offset 0x97c
800 SHRPD %r24,%r28,32,%r4 ;offset 0x980
801 DEPD,Z %r29,31,32,%r9 ;offset 0x984
802 DEPD,Z %r28,31,32,%r5 ;offset 0x988
803$0006001C
804 EXTRD,U %r4,31,32,%r31 ;offset 0x98c
805 CMPB,*<>,N %r31,%r2,$00060020 ;offset 0x990
806 MOVB,TR %r6,%r29,$D1 ;offset 0x994
807 STD %r29,-152(%r30) ;offset 0x998
808$0006000C
809 EXTRD,U %r3,31,32,%r25 ;offset 0x99c
810 COPY %r3,%r26 ;offset 0x9a0
811 EXTRD,U %r3,31,32,%r9 ;offset 0x9a4
812 EXTRD,U %r4,31,32,%r8 ;offset 0x9a8
813 .CALL ARGW0=GR,ARGW1=GR,RTNVAL=GR ;in=25,26;out=28;
814 B,L BN_num_bits_word,%r2 ;offset 0x9ac
815 EXTRD,U %r5,31,32,%r7 ;offset 0x9b0
816 LDI 64,%r20 ;offset 0x9b4
817 DEPD %r7,31,32,%r5 ;offset 0x9b8
818 DEPD %r8,31,32,%r4 ;offset 0x9bc
819 DEPD %r9,31,32,%r3 ;offset 0x9c0
820 CMPB,= %r28,%r20,$00060012 ;offset 0x9c4
821 COPY %r28,%r24 ;offset 0x9c8
822 MTSARCM %r24 ;offset 0x9cc
823 DEPDI,Z -1,%sar,1,%r19 ;offset 0x9d0
824 CMPB,*>>,N %r4,%r19,$D2 ;offset 0x9d4
825$00060012
826 SUBI 64,%r24,%r31 ;offset 0x9d8
827 CMPCLR,*<< %r4,%r3,%r0 ;offset 0x9dc
828 SUB %r4,%r3,%r4 ;offset 0x9e0
829$00060016
830 CMPB,= %r31,%r0,$0006001A ;offset 0x9e4
831 COPY %r0,%r9 ;offset 0x9e8
832 MTSARCM %r31 ;offset 0x9ec
833 DEPD,Z %r3,%sar,64,%r3 ;offset 0x9f0
834 SUBI 64,%r31,%r26 ;offset 0x9f4
835 MTSAR %r26 ;offset 0x9f8
836 SHRPD %r4,%r5,%sar,%r4 ;offset 0x9fc
837 MTSARCM %r31 ;offset 0xa00
838 DEPD,Z %r5,%sar,64,%r5 ;offset 0xa04
839$0006001A
840 DEPDI,Z -1,31,32,%r19 ;offset 0xa08
841 AND %r3,%r19,%r29 ;offset 0xa0c
842 EXTRD,U %r29,31,32,%r2 ;offset 0xa10
843 DEPDI,Z -1,63,32,%r6 ;offset 0xa14
844 MOVIB,TR 2,%r8,$0006001C ;offset 0xa18
845 EXTRD,U %r3,63,32,%r7 ;offset 0xa1c
846$D2
847 ;--- not PIC ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20
848 ;--- not PIC LDIL LR'C$7,%r21 ;offset 0xa24
849 ;--- not PIC LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28
850 ;--- not PIC .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28;
851 ;--- not PIC B,L fprintf,%r2 ;offset 0xa2c
852 ;--- not PIC LDO RR'C$7(%r21),%r25 ;offset 0xa30
853 .CALL ;
854 B,L abort,%r2 ;offset 0xa34
855 NOP ;offset 0xa38
856 B $D3 ;offset 0xa3c
857 LDW -212(%r30),%r2 ;offset 0xa40
858$00060020
859 COPY %r4,%r26 ;offset 0xa44
860 EXTRD,U %r4,31,32,%r25 ;offset 0xa48
861 COPY %r2,%r24 ;offset 0xa4c
862 .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
863 B,L $$div2U,%r31 ;offset 0xa50
864 EXTRD,U %r2,31,32,%r23 ;offset 0xa54
865 DEPD %r28,31,32,%r29 ;offset 0xa58
866$00060022
867 STD %r29,-152(%r30) ;offset 0xa5c
868$D1
869 AND %r5,%r19,%r24 ;offset 0xa60
870 EXTRD,U %r24,31,32,%r24 ;offset 0xa64
871 STW %r2,-160(%r30) ;offset 0xa68
872 STW %r7,-128(%r30) ;offset 0xa6c
873 FLDD -152(%r30),%fr4 ;offset 0xa70
874 FLDD -152(%r30),%fr7 ;offset 0xa74
875 FLDW -160(%r30),%fr8L ;offset 0xa78
876 FLDW -128(%r30),%fr5L ;offset 0xa7c
877 XMPYU %fr8L,%fr7L,%fr10 ;offset 0xa80
878 FSTD %fr10,-136(%r30) ;offset 0xa84
879 XMPYU %fr8L,%fr7R,%fr22 ;offset 0xa88
880 FSTD %fr22,-144(%r30) ;offset 0xa8c
881 XMPYU %fr5L,%fr4L,%fr11 ;offset 0xa90
882 XMPYU %fr5L,%fr4R,%fr23 ;offset 0xa94
883 FSTD %fr11,-112(%r30) ;offset 0xa98
884 FSTD %fr23,-120(%r30) ;offset 0xa9c
885 LDD -136(%r30),%r28 ;offset 0xaa0
886 DEPD,Z %r28,31,32,%r31 ;offset 0xaa4
887 LDD -144(%r30),%r20 ;offset 0xaa8
888 ADD,L %r20,%r31,%r31 ;offset 0xaac
889 LDD -112(%r30),%r22 ;offset 0xab0
890 DEPD,Z %r22,31,32,%r22 ;offset 0xab4
891 LDD -120(%r30),%r21 ;offset 0xab8
892 B $00060024 ;offset 0xabc
893 ADD,L %r21,%r22,%r23 ;offset 0xac0
894$D0
895 OR %r9,%r29,%r29 ;offset 0xac4
896$00060040
897 EXTRD,U %r29,31,32,%r28 ;offset 0xac8
898$00060002
899$L2
900 LDW -212(%r30),%r2 ;offset 0xacc
901$D3
902 LDW -168(%r30),%r9 ;offset 0xad0
903 LDD -176(%r30),%r8 ;offset 0xad4
904 EXTRD,U %r8,31,32,%r7 ;offset 0xad8
905 LDD -184(%r30),%r6 ;offset 0xadc
906 EXTRD,U %r6,31,32,%r5 ;offset 0xae0
907 LDW -188(%r30),%r4 ;offset 0xae4
908 BVE (%r2) ;offset 0xae8
909 .EXIT
910 LDW,MB -192(%r30),%r3 ;offset 0xaec
911 .PROCEND ;in=23,25;out=28,29;fpin=105,107;
912
913
914
915
916;----------------------------------------------------------------------------
917;
918; Registers to hold 64-bit values to manipulate. The "L" part
919; of the register corresponds to the upper 32-bits, while the "R"
920; part corresponds to the lower 32-bits
921;
922; Note, that when using b6 and b7, the code must save these before
923; using them because they are callee save registers
924;
925;
926; Floating point registers to use to save values that
927; are manipulated. These don't collide with ftemp1-6 and
928; are all caller save registers
929;
930a0 .reg %fr22
931a0L .reg %fr22L
932a0R .reg %fr22R
933
934a1 .reg %fr23
935a1L .reg %fr23L
936a1R .reg %fr23R
937
938a2 .reg %fr24
939a2L .reg %fr24L
940a2R .reg %fr24R
941
942a3 .reg %fr25
943a3L .reg %fr25L
944a3R .reg %fr25R
945
946a4 .reg %fr26
947a4L .reg %fr26L
948a4R .reg %fr26R
949
950a5 .reg %fr27
951a5L .reg %fr27L
952a5R .reg %fr27R
953
954a6 .reg %fr28
955a6L .reg %fr28L
956a6R .reg %fr28R
957
958a7 .reg %fr29
959a7L .reg %fr29L
960a7R .reg %fr29R
961
962b0 .reg %fr30
963b0L .reg %fr30L
964b0R .reg %fr30R
965
966b1 .reg %fr31
967b1L .reg %fr31L
968b1R .reg %fr31R
969
970;
971; Temporary floating point variables, these are all caller save
972; registers
973;
974ftemp1 .reg %fr4
975ftemp2 .reg %fr5
976ftemp3 .reg %fr6
977ftemp4 .reg %fr7
978
979;
980; The B set of registers when used.
981;
982
983b2 .reg %fr8
984b2L .reg %fr8L
985b2R .reg %fr8R
986
987b3 .reg %fr9
988b3L .reg %fr9L
989b3R .reg %fr9R
990
991b4 .reg %fr10
992b4L .reg %fr10L
993b4R .reg %fr10R
994
995b5 .reg %fr11
996b5L .reg %fr11L
997b5R .reg %fr11R
998
999b6 .reg %fr12
1000b6L .reg %fr12L
1001b6R .reg %fr12R
1002
1003b7 .reg %fr13
1004b7L .reg %fr13L
1005b7R .reg %fr13R
1006
1007c1 .reg %r21 ; only reg
1008temp1 .reg %r20 ; only reg
1009temp2 .reg %r19 ; only reg
1010temp3 .reg %r31 ; only reg
1011
1012m1 .reg %r28
1013c2 .reg %r23
1014high_one .reg %r1
1015ht .reg %r6
1016lt .reg %r5
1017m .reg %r4
1018c3 .reg %r3
1019
; SQR_ADD_C A0L,A0R,C1,C2,C3: add a*a into the 192-bit accumulator
; (C3:C2:C1), where a is the 64-bit word whose 32-bit halves live in the
; FPU register halves A0L (high) and A0R (low).  The three 32x32->64
; partial products are formed with XMPYU and bounced through the stack
; (FPU->GR moves go via memory on PA-RISC).  The cross product
; m = A0L*A0R must be counted twice: the DEPD,Z/EXTRD,U pair below
; splits 2*m*2^32 into its low (m<<33) and high ((m & high_mask)>>31)
; 64-bit halves.  Relies on high_mask == 0xffffffff80000000, which each
; calling routine sets up before use.
1020SQR_ADD_C .macro A0L,A0R,C1,C2,C3
1021 XMPYU A0L,A0R,ftemp1 ; m
1022 FSTD ftemp1,-24(%sp) ; store m
1023
1024 XMPYU A0R,A0R,ftemp2 ; lt
1025 FSTD ftemp2,-16(%sp) ; store lt
1026
1027 XMPYU A0L,A0L,ftemp3 ; ht
1028 FSTD ftemp3,-8(%sp) ; store ht
1029
1030 LDD -24(%sp),m ; load m
1031 AND m,high_mask,temp2 ; m & Mask
1032 DEPD,Z m,30,31,temp3 ; m << 32+1
1033 LDD -16(%sp),lt ; lt
1034
1035 LDD -8(%sp),ht ; ht
1036 EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1
1037 ADD temp3,lt,lt ; lt += (m << 33), low half of 2*m*2^32
1038 ADD,L ht,temp1,ht ; ht += temp1, high half of 2*m*2^32
1039 ADD,DC ht,%r0,ht ; ht++ if the lt add carried
1040
1041 ADD C1,lt,C1 ; c1=c1+lt
1042 ADD,DC ht,%r0,ht ; ht++ if the c1 add carried
1043
1044 ADD C2,ht,C2 ; c2=c2+ht
1045 ADD,DC C3,%r0,C3 ; c3++ on carry out of c2
1046.endm
1047
; SQR_ADD_C2 A0L,A0R,A1L,A1R,C1,C2,C3: add 2*a*b into the 192-bit
; accumulator (C3:C2:C1) for two distinct 64-bit words a and b (their
; 32-bit halves in FPU register halves).  First forms the full 128-bit
; product a*b in ht:lt exactly like MUL_ADD_C, then doubles it with the
; explicit ht=ht+ht / lt=lt+lt carry chain; the ADD,DC,*NUV + LDO pair
; bumps C3 only when adding lt into C1 overflows past ht.
1048SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
1049 XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht
1050 FSTD ftemp1,-16(%sp) ;
1051 XMPYU A0R,A1L,ftemp2 ; m = bh*lt
1052 FSTD ftemp2,-8(%sp) ;
1053 XMPYU A0R,A1R,ftemp3 ; lt = bl*lt
1054 FSTD ftemp3,-32(%sp)
1055 XMPYU A0L,A1L,ftemp4 ; ht = bh*ht
1056 FSTD ftemp4,-24(%sp) ;
1057
1058 LDD -8(%sp),m ; r21 = m
1059 LDD -16(%sp),m1 ; r19 = m1
1060 ADD,L m,m1,m ; m+m1
1061
1062 DEPD,Z m,31,32,temp3 ; (m+m1<<32)
1063 LDD -24(%sp),ht ; r24 = ht
1064
1065 CMPCLR,*>>= m,m1,%r0 ; if (m < m1), the m+m1 add wrapped
1066 ADD,L ht,high_one,ht ; ht+=high_one
1067
1068 EXTRD,U m,31,32,temp1 ; m >> 32
1069 LDD -32(%sp),lt ; lt
1070 ADD,L ht,temp1,ht ; ht+= m>>32
1071 ADD lt,temp3,lt ; lt = lt+m1
1072 ADD,DC ht,%r0,ht ; ht++
1073
; double the 128-bit product ht:lt in place
1074 ADD ht,ht,ht ; ht=ht+ht;
1075 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1076
1077 ADD lt,lt,lt ; lt=lt+lt;
1078 ADD,DC ht,%r0,ht ; add in carry (ht++)
1079
1080 ADD C1,lt,C1 ; c1=c1+lt
1081 ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++)
1082 LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise
1083
1084 ADD C2,ht,C2 ; c2 = c2 + ht
1085 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1086.endm
1087
1088;
1089;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
1090; arg0 = r_ptr
1091; arg1 = a_ptr
1092;
1093
1094bn_sqr_comba8
1095 .PROC
1096 .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1097 .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1098 .ENTRY
1099 .align 64
1100
1101 STD %r3,0(%sp) ; save r3
1102 STD %r4,8(%sp) ; save r4
1103 STD %r5,16(%sp) ; save r5
1104 STD %r6,24(%sp) ; save r6
1105
1106 ;
1107 ; Zero out carries
1108 ;
1109 COPY %r0,c1
1110 COPY %r0,c2
1111 COPY %r0,c3
1112
1113 LDO 128(%sp),%sp ; bump stack
1114 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
1115 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1116
1117 ;
1118 ; Load up all of the values we are going to use
1119 ;
1120 FLDD 0(a_ptr),a0
1121 FLDD 8(a_ptr),a1
1122 FLDD 16(a_ptr),a2
1123 FLDD 24(a_ptr),a3
1124 FLDD 32(a_ptr),a4
1125 FLDD 40(a_ptr),a5
1126 FLDD 48(a_ptr),a6
1127 FLDD 56(a_ptr),a7
1128
1129 SQR_ADD_C a0L,a0R,c1,c2,c3
1130 STD c1,0(r_ptr) ; r[0] = c1;
1131 COPY %r0,c1
1132
1133 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
1134 STD c2,8(r_ptr) ; r[1] = c2;
1135 COPY %r0,c2
1136
1137 SQR_ADD_C a1L,a1R,c3,c1,c2
1138 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
1139 STD c3,16(r_ptr) ; r[2] = c3;
1140 COPY %r0,c3
1141
1142 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
1143 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
1144 STD c1,24(r_ptr) ; r[3] = c1;
1145 COPY %r0,c1
1146
1147 SQR_ADD_C a2L,a2R,c2,c3,c1
1148 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
1149 SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
1150 STD c2,32(r_ptr) ; r[4] = c2;
1151 COPY %r0,c2
1152
1153 SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
1154 SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
1155 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
1156 STD c3,40(r_ptr) ; r[5] = c3;
1157 COPY %r0,c3
1158
1159 SQR_ADD_C a3L,a3R,c1,c2,c3
1160 SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
1161 SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
1162 SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
1163 STD c1,48(r_ptr) ; r[6] = c1;
1164 COPY %r0,c1
1165
1166 SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
1167 SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
1168 SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
1169 SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
1170 STD c2,56(r_ptr) ; r[7] = c2;
1171 COPY %r0,c2
1172
1173 SQR_ADD_C a4L,a4R,c3,c1,c2
1174 SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
1175 SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
1176 SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
1177 STD c3,64(r_ptr) ; r[8] = c3;
1178 COPY %r0,c3
1179
1180 SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
1181 SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
1182 SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
1183 STD c1,72(r_ptr) ; r[9] = c1;
1184 COPY %r0,c1
1185
1186 SQR_ADD_C a5L,a5R,c2,c3,c1
1187 SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
1188 SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
1189 STD c2,80(r_ptr) ; r[10] = c2;
1190 COPY %r0,c2
1191
1192 SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
1193 SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
1194 STD c3,88(r_ptr) ; r[11] = c3;
1195 COPY %r0,c3
1196
1197 SQR_ADD_C a6L,a6R,c1,c2,c3
1198 SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
1199 STD c1,96(r_ptr) ; r[12] = c1;
1200 COPY %r0,c1
1201
1202 SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
1203 STD c2,104(r_ptr) ; r[13] = c2;
1204 COPY %r0,c2
1205
1206 SQR_ADD_C a7L,a7R,c3,c1,c2
1207 STD c3, 112(r_ptr) ; r[14] = c3
1208 STD c1, 120(r_ptr) ; r[15] = c1
1209
1210 .EXIT
1211 LDD -104(%sp),%r6 ; restore r6
1212 LDD -112(%sp),%r5 ; restore r5
1213 LDD -120(%sp),%r4 ; restore r4
1214 BVE (%rp)
1215 LDD,MB -128(%sp),%r3
1216
1217 .PROCEND
1218
1219;-----------------------------------------------------------------------------
1220;
1221;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
1222; arg0 = r_ptr
1223; arg1 = a_ptr
1224;
1225
1226bn_sqr_comba4
1227 .proc
1228 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1229 .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1230 .entry
1231 .align 64
1232 STD %r3,0(%sp) ; save r3
1233 STD %r4,8(%sp) ; save r4
1234 STD %r5,16(%sp) ; save r5
1235 STD %r6,24(%sp) ; save r6
1236
1237 ;
1238 ; Zero out carries
1239 ;
1240 COPY %r0,c1
1241 COPY %r0,c2
1242 COPY %r0,c3
1243
1244 LDO 128(%sp),%sp ; bump stack
1245 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
1246 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1247
1248 ;
1249 ; Load up all of the values we are going to use
1250 ;
1251 FLDD 0(a_ptr),a0
1252 FLDD 8(a_ptr),a1
1253 FLDD 16(a_ptr),a2
1254 FLDD 24(a_ptr),a3
1255 FLDD 32(a_ptr),a4
1256 FLDD 40(a_ptr),a5
1257 FLDD 48(a_ptr),a6
1258 FLDD 56(a_ptr),a7
1259
1260 SQR_ADD_C a0L,a0R,c1,c2,c3
1261
1262 STD c1,0(r_ptr) ; r[0] = c1;
1263 COPY %r0,c1
1264
1265 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
1266
1267 STD c2,8(r_ptr) ; r[1] = c2;
1268 COPY %r0,c2
1269
1270 SQR_ADD_C a1L,a1R,c3,c1,c2
1271 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
1272
1273 STD c3,16(r_ptr) ; r[2] = c3;
1274 COPY %r0,c3
1275
1276 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
1277 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
1278
1279 STD c1,24(r_ptr) ; r[3] = c1;
1280 COPY %r0,c1
1281
1282 SQR_ADD_C a2L,a2R,c2,c3,c1
1283 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
1284
1285 STD c2,32(r_ptr) ; r[4] = c2;
1286 COPY %r0,c2
1287
1288 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
1289 STD c3,40(r_ptr) ; r[5] = c3;
1290 COPY %r0,c3
1291
1292 SQR_ADD_C a3L,a3R,c1,c2,c3
1293 STD c1,48(r_ptr) ; r[6] = c1;
1294 STD c2,56(r_ptr) ; r[7] = c2;
1295
1296 .EXIT
1297 LDD -104(%sp),%r6 ; restore r6
1298 LDD -112(%sp),%r5 ; restore r5
1299 LDD -120(%sp),%r4 ; restore r4
1300 BVE (%rp)
1301 LDD,MB -128(%sp),%r3
1302
1303 .PROCEND
1304
1305
1306;---------------------------------------------------------------------------
1307
; MUL_ADD_C A0L,A0R,B0L,B0R,C1,C2,C3: add the full 128-bit product a*b
; into the 192-bit accumulator (C3:C2:C1).  a and b are 64-bit words
; whose 32-bit halves live in the named FPU register halves; the four
; 32x32->64 partial products are formed with XMPYU and bounced through
; the stack.  Relies on high_one == 1<<32, set by the calling routine.
1308MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
1309 XMPYU A0L,B0R,ftemp1 ; m1 = ah*bl
1310 FSTD ftemp1,-16(%sp) ;
1311 XMPYU A0R,B0L,ftemp2 ; m = al*bh
1312 FSTD ftemp2,-8(%sp) ;
1313 XMPYU A0R,B0R,ftemp3 ; lt = al*bl
1314 FSTD ftemp3,-32(%sp)
1315 XMPYU A0L,B0L,ftemp4 ; ht = ah*bh
1316 FSTD ftemp4,-24(%sp) ;
1317
1318 LDD -8(%sp),m ; r21 = m
1319 LDD -16(%sp),m1 ; r19 = m1
1320 ADD,L m,m1,m ; m+m1
1321
1322 DEPD,Z m,31,32,temp3 ; (m+m1<<32)
1323 LDD -24(%sp),ht ; r24 = ht
1324
1325 CMPCLR,*>>= m,m1,%r0 ; if (m < m1), the m+m1 add wrapped
1326 ADD,L ht,high_one,ht ; ht+=high_one
1327
1328 EXTRD,U m,31,32,temp1 ; m >> 32
1329 LDD -32(%sp),lt ; lt
1330 ADD,L ht,temp1,ht ; ht+= m>>32
1331 ADD lt,temp3,lt ; lt = lt+m1
1332 ADD,DC ht,%r0,ht ; ht++
1333
1334 ADD C1,lt,C1 ; c1=c1+lt
1335 ADD,DC ht,%r0,ht ; ht++ on carry out of c1
1336
1337 ADD C2,ht,C2 ; c2 = c2 + ht
1338 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1339.endm
1340
1341
1342;
1343;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1344; arg0 = r_ptr
1345; arg1 = a_ptr
1346; arg2 = b_ptr
1347;
1348
1349bn_mul_comba8
1350 .proc
1351 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1352 .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1353 .entry
1354 .align 64
1355
1356 STD %r3,0(%sp) ; save r3
1357 STD %r4,8(%sp) ; save r4
1358 STD %r5,16(%sp) ; save r5
1359 STD %r6,24(%sp) ; save r6
1360 FSTD %fr12,32(%sp) ; save r6
1361 FSTD %fr13,40(%sp) ; save r7
1362
1363 ;
1364 ; Zero out carries
1365 ;
1366 COPY %r0,c1
1367 COPY %r0,c2
1368 COPY %r0,c3
1369
1370 LDO 128(%sp),%sp ; bump stack
1371 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1372
1373 ;
1374 ; Load up all of the values we are going to use
1375 ;
1376 FLDD 0(a_ptr),a0
1377 FLDD 8(a_ptr),a1
1378 FLDD 16(a_ptr),a2
1379 FLDD 24(a_ptr),a3
1380 FLDD 32(a_ptr),a4
1381 FLDD 40(a_ptr),a5
1382 FLDD 48(a_ptr),a6
1383 FLDD 56(a_ptr),a7
1384
1385 FLDD 0(b_ptr),b0
1386 FLDD 8(b_ptr),b1
1387 FLDD 16(b_ptr),b2
1388 FLDD 24(b_ptr),b3
1389 FLDD 32(b_ptr),b4
1390 FLDD 40(b_ptr),b5
1391 FLDD 48(b_ptr),b6
1392 FLDD 56(b_ptr),b7
1393
1394 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
1395 STD c1,0(r_ptr)
1396 COPY %r0,c1
1397
1398 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
1399 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
1400 STD c2,8(r_ptr)
1401 COPY %r0,c2
1402
1403 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
1404 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
1405 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
1406 STD c3,16(r_ptr)
1407 COPY %r0,c3
1408
1409 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
1410 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
1411 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
1412 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
1413 STD c1,24(r_ptr)
1414 COPY %r0,c1
1415
1416 MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
1417 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
1418 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
1419 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
1420 MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
1421 STD c2,32(r_ptr)
1422 COPY %r0,c2
1423
1424 MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
1425 MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
1426 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
1427 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
1428 MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
1429 MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
1430 STD c3,40(r_ptr)
1431 COPY %r0,c3
1432
1433 MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
1434 MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
1435 MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
1436 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
1437 MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
1438 MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
1439 MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
1440 STD c1,48(r_ptr)
1441 COPY %r0,c1
1442
1443 MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
1444 MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
1445 MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
1446 MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
1447 MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
1448 MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
1449 MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
1450 MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
1451 STD c2,56(r_ptr)
1452 COPY %r0,c2
1453
1454 MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
1455 MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
1456 MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
1457 MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
1458 MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
1459 MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
1460 MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
1461 STD c3,64(r_ptr)
1462 COPY %r0,c3
1463
1464 MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
1465 MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
1466 MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
1467 MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
1468 MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
1469 MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
1470 STD c1,72(r_ptr)
1471 COPY %r0,c1
1472
1473 MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
1474 MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
1475 MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
1476 MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
1477 MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
1478 STD c2,80(r_ptr)
1479 COPY %r0,c2
1480
1481 MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
1482 MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
1483 MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
1484 MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
1485 STD c3,88(r_ptr)
1486 COPY %r0,c3
1487
1488 MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
1489 MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
1490 MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
1491 STD c1,96(r_ptr)
1492 COPY %r0,c1
1493
1494 MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
1495 MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
1496 STD c2,104(r_ptr)
1497 COPY %r0,c2
1498
1499 MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
1500 STD c3,112(r_ptr)
1501 STD c1,120(r_ptr)
1502
1503 .EXIT
1504 FLDD -88(%sp),%fr13
1505 FLDD -96(%sp),%fr12
1506 LDD -104(%sp),%r6 ; restore r6
1507 LDD -112(%sp),%r5 ; restore r5
1508 LDD -120(%sp),%r4 ; restore r4
1509 BVE (%rp)
1510 LDD,MB -128(%sp),%r3
1511
1512 .PROCEND
1513
1514;-----------------------------------------------------------------------------
1515;
1516;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1517; arg0 = r_ptr
1518; arg1 = a_ptr
1519; arg2 = b_ptr
1520;
1521
1522bn_mul_comba4
1523 .proc
1524 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1525 .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1526 .entry
1527 .align 64
1528
1529 STD %r3,0(%sp) ; save r3
1530 STD %r4,8(%sp) ; save r4
1531 STD %r5,16(%sp) ; save r5
1532 STD %r6,24(%sp) ; save r6
1533 FSTD %fr12,32(%sp) ; save r6
1534 FSTD %fr13,40(%sp) ; save r7
1535
1536 ;
1537 ; Zero out carries
1538 ;
1539 COPY %r0,c1
1540 COPY %r0,c2
1541 COPY %r0,c3
1542
1543 LDO 128(%sp),%sp ; bump stack
1544 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1545
1546 ;
1547 ; Load up all of the values we are going to use
1548 ;
1549 FLDD 0(a_ptr),a0
1550 FLDD 8(a_ptr),a1
1551 FLDD 16(a_ptr),a2
1552 FLDD 24(a_ptr),a3
1553
1554 FLDD 0(b_ptr),b0
1555 FLDD 8(b_ptr),b1
1556 FLDD 16(b_ptr),b2
1557 FLDD 24(b_ptr),b3
1558
1559 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
1560 STD c1,0(r_ptr)
1561 COPY %r0,c1
1562
1563 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
1564 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
1565 STD c2,8(r_ptr)
1566 COPY %r0,c2
1567
1568 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
1569 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
1570 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
1571 STD c3,16(r_ptr)
1572 COPY %r0,c3
1573
1574 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
1575 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
1576 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
1577 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
1578 STD c1,24(r_ptr)
1579 COPY %r0,c1
1580
1581 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
1582 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
1583 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
1584 STD c2,32(r_ptr)
1585 COPY %r0,c2
1586
1587 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
1588 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
1589 STD c3,40(r_ptr)
1590 COPY %r0,c3
1591
1592 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
1593 STD c1,48(r_ptr)
1594 STD c2,56(r_ptr)
1595
1596 .EXIT
1597 FLDD -88(%sp),%fr13
1598 FLDD -96(%sp),%fr12
1599 LDD -104(%sp),%r6 ; restore r6
1600 LDD -112(%sp),%r5 ; restore r5
1601 LDD -120(%sp),%r4 ; restore r4
1602 BVE (%rp)
1603 LDD,MB -128(%sp),%r3
1604
1605 .PROCEND
1606
1607
1608;--- not PIC .SPACE $TEXT$
1609;--- not PIC .SUBSPA $CODE$
1610;--- not PIC .SPACE $PRIVATE$,SORT=16
1611;--- not PIC .IMPORT $global$,DATA
1612;--- not PIC .SPACE $TEXT$
1613;--- not PIC .SUBSPA $CODE$
1614;--- not PIC .SUBSPA $LIT$,ACCESS=0x2c
1615;--- not PIC C$7
1616;--- not PIC .ALIGN 8
1617;--- not PIC .STRINGZ "Division would overflow (%d)\n"
1618 .END
diff --git a/src/lib/libcrypto/bn/asm/pa-risc2W.s b/src/lib/libcrypto/bn/asm/pa-risc2W.s
deleted file mode 100644
index a99545754d..0000000000
--- a/src/lib/libcrypto/bn/asm/pa-risc2W.s
+++ /dev/null
@@ -1,1605 +0,0 @@
1;
2; PA-RISC 64-bit implementation of bn_asm code
3;
4; This code is approximately 2x faster than the C version
5; for RSA/DSA.
6;
7; See http://devresource.hp.com/ for more details on the PA-RISC
8; architecture. Also see the book "PA-RISC 2.0 Architecture"
9; by Gerry Kane for information on the instruction set architecture.
10;
11; Code written by Chris Ruemmler (with some help from the HP C
12; compiler).
13;
14; The code compiles with HP's assembler
15;
16
17 .level 2.0W
18 .space $TEXT$
19 .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
20
;
; Global Register definitions used for the routines.
;
; Some information about HP's runtime architecture for 64-bits.
;
; "Caller save" means the calling function must save the register
; if it wants the register to be preserved.
; "Callee save" means if a function uses the register, it must save
; the value before using it.
;
; For the floating point registers
;
; "caller save" registers: fr4-fr11, fr22-fr31
; "callee save" registers: fr12-fr21
; "special" registers: fr0-fr3 (status and exception registers)
;
; For the integer registers
; value zero : r0
; "caller save" registers: r1,r19-r26
; "callee save" registers: r3-r18
; return register : r2 (rp)
; return values ; r28 (ret0,ret1)
; Stack pointer ; r30 (sp)
; global data pointer ; r27 (dp)
; argument pointer ; r29 (ap)
; millicode return ptr ; r31 (also a caller save register)
47

48
;
; Arguments to the routines.
; Note the deliberate aliasing: b_ptr/num both name %r24 and w/n both
; name %r23 — each routine uses whichever name fits its C prototype,
; never both at once.
;
52r_ptr .reg %r26
53a_ptr .reg %r25
54b_ptr .reg %r24
55num .reg %r24
56w .reg %r23
57n .reg %r23
58

59
;
; Globals used in some routines
;
63
64top_overflow .reg %r29 ; holds the constant 1 << 32 (set per routine)
65high_mask .reg %r22 ; value 0xffffffff80000000L
66
68;------------------------------------------------------------------------------
69;
70; bn_mul_add_words
71;
72;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
73; int num, BN_ULONG w)
74;
75; arg0 = r_ptr
76; arg1 = a_ptr
77; arg2 = num
78; arg3 = w
79;
80; Local register definitions
81;
82
; FPU scratch for the twice-unrolled loop: the *_1 variants belong to
; the second (odd-index) word of each iteration pair.
83fm1 .reg %fr22 ; partial product ht*w_l
84fm .reg %fr23 ; partial product lt*w_h
85ht_temp .reg %fr24 ; partial product ht*w_h
86ht_temp_1 .reg %fr25
87lt_temp .reg %fr26 ; partial product lt*w_l
88lt_temp_1 .reg %fr27
89fm1_1 .reg %fr28
90fm_1 .reg %fr29
91
; w held in the FPU, addressable as a whole or as 32-bit halves
92fw_h .reg %fr7L
93fw_l .reg %fr7R
94fw .reg %fr7
95
; current a[] word in the FPU: high half (L) and low half (R)
96fht_0 .reg %fr8L
97flt_0 .reg %fr8R
98t_float_0 .reg %fr8
99
100fht_1 .reg %fr9L
101flt_1 .reg %fr9R
102t_float_1 .reg %fr9
103
; integer-side temporaries (r3-r9 are callee-save and saved on entry)
104tmp_0 .reg %r31
105tmp_1 .reg %r21
106m_0 .reg %r20
107m_1 .reg %r19
108ht_0 .reg %r1
109ht_1 .reg %r3
110lt_0 .reg %r4
111lt_1 .reg %r5
112m1_0 .reg %r6
113m1_1 .reg %r7
114rp_val .reg %r8 ; existing r[] word being accumulated into
115rp_val_1 .reg %r9
116
; bn_mul_add_words: r[i] += a[i]*w for i in [0, num); returns the final
; carry word in %ret0.  Each 64x64 multiply is done as four 32x32->64
; XMPYU products on the FPU, recombined on the integer side.  The main
; loop is unrolled twice to keep both FPU multipliers busy; a one-word
; tail handles odd counts.  w is spilled to the stack so the FPU can
; load it (GR->FPU moves go via memory on PA-RISC).
117bn_mul_add_words
118 .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
119 .proc
120 .callinfo frame=128
121 .entry
122 .align 64
123
124 STD %r3,0(%sp) ; save r3
125 STD %r4,8(%sp) ; save r4
126 NOP ; Needed to make the loop 16-byte aligned
127 NOP ; Needed to make the loop 16-byte aligned
128
129 STD %r5,16(%sp) ; save r5
130 STD %r6,24(%sp) ; save r6
131 STD %r7,32(%sp) ; save r7
132 STD %r8,40(%sp) ; save r8
133
134 STD %r9,48(%sp) ; save r9
135 COPY %r0,%ret0 ; return 0 by default
136 DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
137 STD w,56(%sp) ; store w on stack (56-128 = -72 after bump)
138
139 CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit
140 LDO 128(%sp),%sp ; bump stack (delay slot, always runs)
141
142 ;
143 ; The loop is unrolled twice, so if there is only 1 number
144 ; then go straight to the cleanup code.
145 ;
146 CMPIB,= 1,num,bn_mul_add_words_single_top
147 FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l)
148
149 ;
150 ; This loop is unrolled 2 times (64-byte aligned as well)
151 ;
152 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
153 ; two 32-bit mutiplies can be issued per cycle.
154 ;
155bn_mul_add_words_unroll2
156
157 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
158 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
159 LDD 0(r_ptr),rp_val ; rp[0]
160 LDD 8(r_ptr),rp_val_1 ; rp[1]
161
162 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
163 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
164 FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
165 FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
166
167 XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
168 XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
169 FSTD fm,-8(%sp) ; -8(sp) = m[0]
170 FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
171
172 XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
173 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
174 FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp
175 FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1
176
177 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
178 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
179 FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp
180 FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1
181
; integer side: recombine the four partial products per word
182 LDD -8(%sp),m_0 ; m[0]
183 LDD -40(%sp),m_1 ; m[1]
184 LDD -16(%sp),m1_0 ; m1[0]
185 LDD -48(%sp),m1_1 ; m1[1]
186
187 LDD -24(%sp),ht_0 ; ht[0]
188 LDD -56(%sp),ht_1 ; ht[1]
189 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
190 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
191
192 LDD -32(%sp),lt_0
193 LDD -64(%sp),lt_1
194 CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0])
195 ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
196
197 CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1])
198 ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
199 EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32
200 DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32
201
202 EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32
203 DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32
204 ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
205 ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
206
207 ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
208 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
209 ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
210 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
211
; fold in the incoming carry and the existing r[] words
212 ADD %ret0,lt_0,lt_0 ; lt[0] = lt[0] + c;
213 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
214 ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0]
215 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
216
217 LDO -2(num),num ; num = num - 2;
218 ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c);
219 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
220 STD lt_0,0(r_ptr) ; rp[0] = lt[0]
221
222 ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1]
223 ADD,DC ht_1,%r0,%ret0 ; ht[1]++; carry out into %ret0
224 LDO 16(a_ptr),a_ptr ; a_ptr += 2
225
226 STD lt_1,8(r_ptr) ; rp[1] = lt[1]
227 CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
228 LDO 16(r_ptr),r_ptr ; r_ptr += 2 (delay slot, always runs)
229
230 CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
231
232 ;
233 ; Top of loop aligned on 64-byte boundary
234 ;
235bn_mul_add_words_single_top
236 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
237 LDD 0(r_ptr),rp_val ; rp[0]
238 LDO 8(a_ptr),a_ptr ; a_ptr++
239 XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
240 FSTD fm1,-16(%sp) ; -16(sp) = m1
241 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
242 FSTD fm,-8(%sp) ; -8(sp) = m
243 XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
244 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
245 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
246 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
247
248 LDD -8(%sp),m_0
249 LDD -16(%sp),m1_0 ; m1 = temp1
250 ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
251 LDD -24(%sp),ht_0
252 LDD -32(%sp),lt_0
253
254 CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
255 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
256
257 EXTRD,U tmp_0,31,32,m_0 ; m>>32
258 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
259
260 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
261 ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1;
262 ADD,DC ht_0,%r0,ht_0 ; ht++
263 ADD %ret0,tmp_0,lt_0 ; lt = lt + c;
264 ADD,DC ht_0,%r0,ht_0 ; ht++
265 ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0]
266 ADD,DC ht_0,%r0,%ret0 ; ht++; final carry into %ret0
267 STD lt_0,0(r_ptr) ; rp[0] = lt
268
; restore callee-save registers (offsets relative to the bumped sp)
; and return, popping the frame in the delay slot
269bn_mul_add_words_exit
270 .EXIT
271 LDD -80(%sp),%r9 ; restore r9
272 LDD -88(%sp),%r8 ; restore r8
273 LDD -96(%sp),%r7 ; restore r7
274 LDD -104(%sp),%r6 ; restore r6
275 LDD -112(%sp),%r5 ; restore r5
276 LDD -120(%sp),%r4 ; restore r4
277 BVE (%rp)
278 LDD,MB -128(%sp),%r3 ; restore r3
279 .PROCEND ;in=23,24,25,26,29;out=28;
280
281;----------------------------------------------------------------------------
282;
283;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
284;
285; arg0 = rp
286; arg1 = ap
287; arg2 = num
288; arg3 = w
289
; BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
;
; Computes rp[i] = ap[i] * w for i in [0, num) and returns the final
; carry word in %ret0.  Each 64x64 multiply is assembled from four
; 32x32->64 XMPYU products done in the FPU; partial products are
; bounced through the stack because there is no direct FPU->GR move.
; The main loop is unrolled twice so both pipelined multipliers of a
; PA-RISC 2.0 core stay busy.
bn_mul_words
	.proc
	.callinfo frame=128
	.entry
	.EXPORT	bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.align 64

	STD	%r3,0(%sp)	; save r3
	STD	%r4,8(%sp)	; save r4
	STD	%r5,16(%sp)	; save r5
	STD	%r6,24(%sp)	; save r6

	STD	%r7,32(%sp)	; save r7
	COPY	%r0,%ret0	; return 0 by default
	DEPDI,Z	1,31,1,top_overflow	; top_overflow = 1 << 32
	STD	w,56(%sp)	; w on stack (reloaded as -72(%sp) after the bump)

	CMPIB,>= 0,num,bn_mul_words_exit	; num <= 0: nothing to do
	LDO	128(%sp),%sp	; bump stack (delay slot, always executes)

	;
	; See if only 1 word to do, thus just do cleanup
	;
	CMPIB,=	1,num,bn_mul_words_single_top
	FLDD	-72(%sp),fw	; load up w into fp register fw (fw_h/fw_l)

	;
	; This loop is unrolled 2 times (64-byte aligned as well)
	;
	; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
	; two 32-bit multiplies can be issued per cycle.
	;
bn_mul_words_unroll2

	FLDD	0(a_ptr),t_float_0	; load up 64-bit value (fr8L) ht(L)/lt(R)
	FLDD	8(a_ptr),t_float_1	; load up 64-bit value (fr8L) ht(L)/lt(R)
	XMPYU	fht_0,fw_l,fm1	; m1[0] = fht_0*fw_l
	XMPYU	fht_1,fw_l,fm1_1	; m1[1] = ht*fw_l

	FSTD	fm1,-16(%sp)	; -16(sp) = m1
	FSTD	fm1_1,-48(%sp)	; -48(sp) = m1
	XMPYU	flt_0,fw_h,fm	; m = lt*fw_h
	XMPYU	flt_1,fw_h,fm_1	; m = lt*fw_h

	FSTD	fm,-8(%sp)	; -8(sp) = m
	FSTD	fm_1,-40(%sp)	; -40(sp) = m
	XMPYU	fht_0,fw_h,ht_temp	; ht_temp = fht_0*fw_h
	XMPYU	fht_1,fw_h,ht_temp_1	; ht_temp = ht*fw_h

	FSTD	ht_temp,-24(%sp)	; -24(sp) = ht
	FSTD	ht_temp_1,-56(%sp)	; -56(sp) = ht
	XMPYU	flt_0,fw_l,lt_temp	; lt_temp = lt*fw_l
	XMPYU	flt_1,fw_l,lt_temp_1	; lt_temp = lt*fw_l

	FSTD	lt_temp,-32(%sp)	; -32(sp) = lt
	FSTD	lt_temp_1,-64(%sp)	; -64(sp) = lt
	LDD	-8(%sp),m_0
	LDD	-40(%sp),m_1

	LDD	-16(%sp),m1_0
	LDD	-48(%sp),m1_1
	LDD	-24(%sp),ht_0
	LDD	-56(%sp),ht_1

	; Fold the two middle partial products together; a wrap of the
	; 64-bit sum means a carry into bit 64 of the full product, which
	; is bit 32 of ht -- hence the top_overflow correction below.
	ADD,L	m1_0,m_0,tmp_0	; tmp_0 = m + m1;
	ADD,L	m1_1,m_1,tmp_1	; tmp_1 = m + m1;
	LDD	-32(%sp),lt_0
	LDD	-64(%sp),lt_1

	CMPCLR,*>>= tmp_0,m1_0,%r0	; if (m < m1)
	ADD,L	ht_0,top_overflow,ht_0	; ht += (1<<32)
	CMPCLR,*>>= tmp_1,m1_1,%r0	; if (m < m1)
	ADD,L	ht_1,top_overflow,ht_1	; ht += (1<<32)

	EXTRD,U	tmp_0,31,32,m_0	; m>>32
	DEPD,Z	tmp_0,31,32,m1_0	; m1 = m<<32
	EXTRD,U	tmp_1,31,32,m_1	; m>>32
	DEPD,Z	tmp_1,31,32,m1_1	; m1 = m<<32

	ADD,L	ht_0,m_0,ht_0	; ht+= (m>>32)
	ADD,L	ht_1,m_1,ht_1	; ht+= (m>>32)
	ADD	lt_0,m1_0,lt_0	; lt = lt+m1;
	ADD,DC	ht_0,%r0,ht_0	; ht++

	ADD	lt_1,m1_1,lt_1	; lt = lt+m1;
	ADD,DC	ht_1,%r0,ht_1	; ht++
	ADD	%ret0,lt_0,lt_0	; lt = lt + c (ret0);
	ADD,DC	ht_0,%r0,ht_0	; ht++

	; The high half of product 0 carries directly into product 1.
	ADD	ht_0,lt_1,lt_1	; lt = lt + c (ht_0)
	ADD,DC	ht_1,%r0,ht_1	; ht++
	STD	lt_0,0(r_ptr)	; rp[0] = lt
	STD	lt_1,8(r_ptr)	; rp[1] = lt

	COPY	ht_1,%ret0	; carry = ht
	LDO	-2(num),num	; num = num - 2;
	LDO	16(a_ptr),a_ptr	; ap += 2
	CMPIB,<= 2,num,bn_mul_words_unroll2
	LDO	16(r_ptr),r_ptr	; rp++

	CMPIB,=,N 0,num,bn_mul_words_exit	; are we done?

	;
	; Top of loop aligned on 64-byte boundary
	; Single-word tail for odd num.
	;
bn_mul_words_single_top
	FLDD	0(a_ptr),t_float_0	; load up 64-bit value (fr8L) ht(L)/lt(R)

	XMPYU	fht_0,fw_l,fm1	; m1 = ht*fw_l
	FSTD	fm1,-16(%sp)	; -16(sp) = m1
	XMPYU	flt_0,fw_h,fm	; m = lt*fw_h
	FSTD	fm,-8(%sp)	; -8(sp) = m
	XMPYU	fht_0,fw_h,ht_temp	; ht_temp = ht*fw_h
	FSTD	ht_temp,-24(%sp)	; -24(sp) = ht
	XMPYU	flt_0,fw_l,lt_temp	; lt_temp = lt*fw_l
	FSTD	lt_temp,-32(%sp)	; -32(sp) = lt

	LDD	-8(%sp),m_0
	LDD	-16(%sp),m1_0
	ADD,L	m_0,m1_0,tmp_0	; tmp_0 = m + m1;
	LDD	-24(%sp),ht_0
	LDD	-32(%sp),lt_0

	CMPCLR,*>>= tmp_0,m1_0,%r0	; if (m < m1)
	ADD,L	ht_0,top_overflow,ht_0	; ht += (1<<32)

	EXTRD,U	tmp_0,31,32,m_0	; m>>32
	DEPD,Z	tmp_0,31,32,m1_0	; m1 = m<<32

	ADD,L	ht_0,m_0,ht_0	; ht+= (m>>32)
	ADD	lt_0,m1_0,lt_0	; lt= lt+m1;
	ADD,DC	ht_0,%r0,ht_0	; ht++

	ADD	%ret0,lt_0,lt_0	; lt = lt + c;
	ADD,DC	ht_0,%r0,ht_0	; ht++

	COPY	ht_0,%ret0	; copy carry
	STD	lt_0,0(r_ptr)	; rp[0] = lt

bn_mul_words_exit
	.EXIT
	; Offsets are relative to the bumped stack pointer:
	; -96 = save-slot 32, -104 = 24, -112 = 16, -120 = 8, -128 = 0.
	LDD	-96(%sp),%r7	; restore r7
	LDD	-104(%sp),%r6	; restore r6
	LDD	-112(%sp),%r5	; restore r5
	LDD	-120(%sp),%r4	; restore r4
	BVE	(%rp)
	LDD,MB	-128(%sp),%r3	; restore r3 and pop frame (delay slot)
	.PROCEND	;in=23,24,25,26,29;out=28;
438
439;----------------------------------------------------------------------------
440;
441;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
442;
443; arg0 = rp
444; arg1 = ap
445; arg2 = num
446;
447
; void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
;
; Squares each word: rp[2*i] = low64(ap[i]^2), rp[2*i+1] = high64(ap[i]^2).
; With a = ht*2^32 + lt, a^2 = ht^2*2^64 + 2*ht*lt*2^32 + lt^2, so only
; three XMPYU products are needed per word and the cross term m = ht*lt
; is doubled via the shift-by-33 / mask trickery below.
bn_sqr_words
	.proc
	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
	.EXPORT	bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.entry
	.align 64

	STD	%r3,0(%sp)	; save r3
	STD	%r4,8(%sp)	; save r4
	NOP
	STD	%r5,16(%sp)	; save r5

	CMPIB,>= 0,num,bn_sqr_words_exit	; num <= 0: nothing to do
	LDO	128(%sp),%sp	; bump stack (delay slot, always executes)

	;
	; If only 1, then go straight to cleanup
	;
	CMPIB,=	1,num,bn_sqr_words_single_top
	DEPDI,Z	-1,32,33,high_mask	; Create Mask 0xffffffff80000000L

	;
	; This loop is unrolled 2 times (64-byte aligned as well)
	;

bn_sqr_words_unroll2
	FLDD	0(a_ptr),t_float_0	; a[0]
	FLDD	8(a_ptr),t_float_1	; a[1]
	XMPYU	fht_0,flt_0,fm	; m[0] = ht*lt (cross term)
	XMPYU	fht_1,flt_1,fm_1	; m[1]

	FSTD	fm,-24(%sp)	; store m[0]
	FSTD	fm_1,-56(%sp)	; store m[1]
	XMPYU	flt_0,flt_0,lt_temp	; lt[0] = lt^2
	XMPYU	flt_1,flt_1,lt_temp_1	; lt[1]

	FSTD	lt_temp,-16(%sp)	; store lt[0]
	FSTD	lt_temp_1,-48(%sp)	; store lt[1]
	XMPYU	fht_0,fht_0,ht_temp	; ht[0] = ht^2
	XMPYU	fht_1,fht_1,ht_temp_1	; ht[1]

	FSTD	ht_temp,-8(%sp)	; store ht[0]
	FSTD	ht_temp_1,-40(%sp)	; store ht[1]
	LDD	-24(%sp),m_0
	LDD	-56(%sp),m_1

	; 2*m*2^32 split across the 128-bit result: low part is m<<33,
	; high part is the top 33 bits of m (recovered via the mask).
	AND	m_0,high_mask,tmp_0	; m[0] & Mask
	AND	m_1,high_mask,tmp_1	; m[1] & Mask
	DEPD,Z	m_0,30,31,m_0	; m[0] << 32+1
	DEPD,Z	m_1,30,31,m_1	; m[1] << 32+1

	LDD	-16(%sp),lt_0
	LDD	-48(%sp),lt_1
	EXTRD,U	tmp_0,32,33,tmp_0	; tmp_0 = m[0]&Mask >> 32-1
	EXTRD,U	tmp_1,32,33,tmp_1	; tmp_1 = m[1]&Mask >> 32-1

	LDD	-8(%sp),ht_0
	LDD	-40(%sp),ht_1
	ADD,L	ht_0,tmp_0,ht_0	; ht[0] += tmp_0
	ADD,L	ht_1,tmp_1,ht_1	; ht[1] += tmp_1

	ADD	lt_0,m_0,lt_0	; lt = lt+m
	ADD,DC	ht_0,%r0,ht_0	; ht[0]++
	STD	lt_0,0(r_ptr)	; rp[0] = lt[0]
	STD	ht_0,8(r_ptr)	; rp[1] = ht[0]

	ADD	lt_1,m_1,lt_1	; lt = lt+m
	ADD,DC	ht_1,%r0,ht_1	; ht[1]++
	STD	lt_1,16(r_ptr)	; rp[2] = lt[1]
	STD	ht_1,24(r_ptr)	; rp[3] = ht[1]

	LDO	-2(num),num	; num = num - 2;
	LDO	16(a_ptr),a_ptr	; ap += 2
	CMPIB,<= 2,num,bn_sqr_words_unroll2
	LDO	32(r_ptr),r_ptr	; rp += 4

	CMPIB,=,N 0,num,bn_sqr_words_exit	; are we done?

	;
	; Top of loop aligned on 64-byte boundary
	; Single-word tail for odd num.
	;
bn_sqr_words_single_top
	FLDD	0(a_ptr),t_float_0	; load up 64-bit value (fr8L) ht(L)/lt(R)

	XMPYU	fht_0,flt_0,fm	; m
	FSTD	fm,-24(%sp)	; store m

	XMPYU	flt_0,flt_0,lt_temp	; lt
	FSTD	lt_temp,-16(%sp)	; store lt

	XMPYU	fht_0,fht_0,ht_temp	; ht
	FSTD	ht_temp,-8(%sp)	; store ht

	LDD	-24(%sp),m_0	; load m
	AND	m_0,high_mask,tmp_0	; m & Mask
	DEPD,Z	m_0,30,31,m_0	; m << 32+1
	LDD	-16(%sp),lt_0	; lt

	LDD	-8(%sp),ht_0	; ht
	EXTRD,U	tmp_0,32,33,tmp_0	; tmp_0 = m&Mask >> 32-1
	ADD	m_0,lt_0,lt_0	; lt = lt+m
	ADD,L	ht_0,tmp_0,ht_0	; ht += tmp_0
	ADD,DC	ht_0,%r0,ht_0	; ht++

	STD	lt_0,0(r_ptr)	; rp[0] = lt
	STD	ht_0,8(r_ptr)	; rp[1] = ht

bn_sqr_words_exit
	.EXIT
	; -112/-120/-128 relative to the bumped sp = save slots 16/8/0.
	LDD	-112(%sp),%r5	; restore r5
	LDD	-120(%sp),%r4	; restore r4
	BVE	(%rp)
	LDD,MB	-128(%sp),%r3	; restore r3 and pop frame (delay slot)
	.PROCEND	;in=23,24,25,26,29;out=28;
562
563
564;----------------------------------------------------------------------------
565;
566;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
567;
568; arg0 = rp
569; arg1 = ap
570; arg2 = bp
571; arg3 = n
572
573t .reg %r22
574b .reg %r21
575l .reg %r20
576
; BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
;
; r[i] = a[i] + b[i] + carry, for i in [0, n); returns the final carry
; in %ret0.  Leaf routine: no frame, no callee-save registers used.
; Each word needs two additions (carry-in, then b[i]); ADD,DC after each
; accumulates at most one carry per step into %ret0.
bn_add_words
	.proc
	.entry
	.callinfo
	.EXPORT	bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.align 64

	CMPIB,>= 0,n,bn_add_words_exit	; n <= 0: return 0
	COPY	%r0,%ret0	; return 0 by default (delay slot)

	;
	; If 2 or more numbers do the loop
	;
	CMPIB,=	1,n,bn_add_words_single_top
	NOP

	;
	; This loop is unrolled 2 times (64-byte aligned as well)
	;
bn_add_words_unroll2
	LDD	0(a_ptr),t
	LDD	0(b_ptr),b
	ADD	t,%ret0,t	; t = t+c;
	ADD,DC	%r0,%r0,%ret0	; set c to carry
	ADD	t,b,l	; l = t + b[0]
	ADD,DC	%ret0,%r0,%ret0	; c+= carry
	STD	l,0(r_ptr)

	LDD	8(a_ptr),t
	LDD	8(b_ptr),b
	ADD	t,%ret0,t	; t = t+c;
	ADD,DC	%r0,%r0,%ret0	; set c to carry
	ADD	t,b,l	; l = t + b[1]
	ADD,DC	%ret0,%r0,%ret0	; c+= carry
	STD	l,8(r_ptr)

	LDO	-2(n),n
	LDO	16(a_ptr),a_ptr
	LDO	16(b_ptr),b_ptr

	CMPIB,<= 2,n,bn_add_words_unroll2
	LDO	16(r_ptr),r_ptr	; delay slot: advance r_ptr either way

	CMPIB,=,N 0,n,bn_add_words_exit	; are we done?

	; Single-word tail for odd n.
bn_add_words_single_top
	LDD	0(a_ptr),t
	LDD	0(b_ptr),b

	ADD	t,%ret0,t	; t = t+c;
	ADD,DC	%r0,%r0,%ret0	; set c to carry (could use CMPCLR??)
	ADD	t,b,l	; l = t + b[0]
	ADD,DC	%ret0,%r0,%ret0	; c+= carry
	STD	l,0(r_ptr)

bn_add_words_exit
	.EXIT
	BVE	(%rp)
	NOP
	.PROCEND	;in=23,24,25,26,29;out=28;
637
638;----------------------------------------------------------------------------
639;
640;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
641;
642; arg0 = rp
643; arg1 = ap
644; arg2 = bp
645; arg3 = n
646
647t1 .reg %r22
648t2 .reg %r21
649sub_tmp1 .reg %r20
650sub_tmp2 .reg %r19
651
652
; BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
;
; r[i] = a[i] - b[i] - borrow, for i in [0, n); returns the final borrow
; (0 or 1) in %ret0.  Leaf routine: no frame.  The borrow for the next
; word is computed explicitly: if a > b there is no new borrow (keep the
; incoming one only when a == b), otherwise borrow = 1.
bn_sub_words
	.proc
	.callinfo
	.EXPORT	bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.entry
	.align 64

	CMPIB,>= 0,n,bn_sub_words_exit	; n <= 0: return 0
	COPY	%r0,%ret0	; return 0 by default (delay slot)

	;
	; If 2 or more numbers do the loop
	;
	CMPIB,=	1,n,bn_sub_words_single_top
	NOP

	;
	; This loop is unrolled 2 times (64-byte aligned as well)
	;
bn_sub_words_unroll2
	LDD	0(a_ptr),t1
	LDD	0(b_ptr),t2
	SUB	t1,t2,sub_tmp1	; t3 = t1-t2;
	SUB	sub_tmp1,%ret0,sub_tmp1	; t3 = t3- c;

	; next borrow: 0 if t1 > t2, 1 if t1 < t2, unchanged if t1 == t2
	CMPCLR,*>> t1,t2,sub_tmp2	; clear if t1 > t2
	LDO	1(%r0),sub_tmp2

	CMPCLR,*= t1,t2,%r0	; keep old borrow when words are equal
	COPY	sub_tmp2,%ret0
	STD	sub_tmp1,0(r_ptr)

	LDD	8(a_ptr),t1
	LDD	8(b_ptr),t2
	SUB	t1,t2,sub_tmp1	; t3 = t1-t2;
	SUB	sub_tmp1,%ret0,sub_tmp1	; t3 = t3- c;
	CMPCLR,*>> t1,t2,sub_tmp2	; clear if t1 > t2
	LDO	1(%r0),sub_tmp2

	CMPCLR,*= t1,t2,%r0
	COPY	sub_tmp2,%ret0
	STD	sub_tmp1,8(r_ptr)

	LDO	-2(n),n
	LDO	16(a_ptr),a_ptr
	LDO	16(b_ptr),b_ptr

	CMPIB,<= 2,n,bn_sub_words_unroll2
	LDO	16(r_ptr),r_ptr	; delay slot: advance r_ptr either way

	CMPIB,=,N 0,n,bn_sub_words_exit	; are we done?

	; Single-word tail for odd n.
bn_sub_words_single_top
	LDD	0(a_ptr),t1
	LDD	0(b_ptr),t2
	SUB	t1,t2,sub_tmp1	; t3 = t1-t2;
	SUB	sub_tmp1,%ret0,sub_tmp1	; t3 = t3- c;
	CMPCLR,*>> t1,t2,sub_tmp2	; clear if t1 > t2
	LDO	1(%r0),sub_tmp2

	CMPCLR,*= t1,t2,%r0
	COPY	sub_tmp2,%ret0

	STD	sub_tmp1,0(r_ptr)

bn_sub_words_exit
	.EXIT
	BVE	(%rp)
	NOP
	.PROCEND	;in=23,24,25,26,29;out=28;
723
724;------------------------------------------------------------------------------
725;
726; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
727;
728; arg0 = h
729; arg1 = l
730; arg2 = d
731;
732; This is mainly just modified assembly from the compiler, thus the
733; lack of variable names.
734;
735;------------------------------------------------------------------------------
; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
;
; Returns the 64-bit quotient of the 128-bit value (h:l) divided by d.
; Derived from compiler output (hence the numeric local labels and lack
; of symbolic names).  Algorithm: classic two-step Knuth-style division
; with 32-bit digits -- normalize d so its top bit is set, then run two
; rounds (%r9 counts them) of estimate-and-correct using the $$div2U
; 32-bit divide millicode and XMPYU-based 32x32 multiplies.
; If d == 0 returns -1; if h >= 2^bits(d) (quotient would not fit)
; prints a message via fprintf(stderr-like __iob slot) and aborts.
bn_div_words
	.proc
	.callinfo CALLER,FRAME=272,ENTRY_GR=%r10,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
	.EXPORT	bn_div_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.IMPORT	BN_num_bits_word,CODE,NO_RELOCATION
	.IMPORT	__iob,DATA
	.IMPORT	fprintf,CODE,NO_RELOCATION
	.IMPORT	abort,CODE,NO_RELOCATION
	.IMPORT	$$div2U,MILLICODE
	.entry
	STD	%r2,-16(%r30)	; save return pointer (at -368 after frame bump)
	STD,MA	%r3,352(%r30)	; save r3 and allocate 352-byte frame
	STD	%r4,-344(%r30)
	STD	%r5,-336(%r30)
	STD	%r6,-328(%r30)
	STD	%r7,-320(%r30)
	STD	%r8,-312(%r30)
	STD	%r9,-304(%r30)
	STD	%r10,-296(%r30)

	STD	%r27,-288(%r30)	; save gp

	COPY	%r24,%r3	; save d
	COPY	%r26,%r4	; save h (high 64-bits)
	LDO	-1(%r0),%ret0	; return -1 by default

	CMPB,*=	%r0,%arg2,$D3	; if (d == 0)
	COPY	%r25,%r5	; save l (low 64-bits) (delay slot)

	; i = BN_num_bits_word(d)
	LDO	-48(%r30),%r29	; create ap
	.CALL	;in=26,29;out=28;
	B,L	BN_num_bits_word,%r2
	COPY	%r3,%r26	; delay slot: pass d
	LDD	-288(%r30),%r27	; restore gp
	LDI	64,%r21

	CMPB,=	%r21,%ret0,$00000012	;if (i == 64) (forward)
	COPY	%ret0,%r24	; i (delay slot)
	MTSARCM	%r24
	DEPDI,Z	-1,%sar,1,%r29	; %r29 = 1 << i
	CMPB,*<<,N %r29,%r4,bn_div_err_case	; if (h > 1<<i) (forward)

$00000012
	; Normalize: shift d (and h:l) left so d's top bit is set.
	SUBI	64,%r24,%r31	; i = 64 - i;
	CMPCLR,*<< %r4,%r3,%r0	; if (h >= d)
	SUB	%r4,%r3,%r4	; h -= d
	CMPB,=	%r31,%r0,$0000001A	; if (i)
	COPY	%r0,%r10	; ret = 0 (delay slot)
	MTSARCM	%r31	; i to shift
	DEPD,Z	%r3,%sar,64,%r3	; d <<= i;
	SUBI	64,%r31,%r19	; 64 - i; redundant
	MTSAR	%r19	; (64 -i) to shift
	SHRPD	%r4,%r5,%sar,%r4	; h = (h<<i)|(l>>(64-i))
	MTSARCM	%r31	; i to shift
	DEPD,Z	%r5,%sar,64,%r5	; l <<= i;

$0000001A
	DEPDI,Z	-1,31,32,%r19	; %r19 = 0xffffffff00000000 mask
	EXTRD,U	%r3,31,32,%r6	; dh = d >> 32
	EXTRD,U	%r3,63,32,%r8	; dl = d & 0xffffffff
	LDO	2(%r0),%r9	; two 32-bit quotient digits to produce
	STD	%r3,-280(%r30)	; "d" to stack

$0000001C
	; Estimate next 32-bit quotient digit q.
	DEPDI,Z	-1,63,32,%r29	; q = 0xffffffff if (h>>32) == dh
	EXTRD,U	%r4,31,32,%r31	; h >> 32
	CMPB,*=,N %r31,%r6,$D2	; if ((h>>32) != dh)(forward) div
	COPY	%r4,%r26
	EXTRD,U	%r4,31,32,%r25
	COPY	%r6,%r24
	.CALL	;in=23,24,25,26;out=20,21,22,28,29;	(MILLICALL)
	B,L	$$div2U,%r2	; q = h / dh (32-bit divide millicode)
	EXTRD,U	%r6,31,32,%r23	; delay slot
	DEPD	%r28,31,32,%r29
$D2
	; Compute q*d with four 32x32 XMPYU products, then correct q
	; downward while the remainder would go negative.
	STD	%r29,-272(%r30)	; q
	AND	%r5,%r19,%r24	; t & 0xffffffff00000000;
	EXTRD,U	%r24,31,32,%r24	; ???
	FLDD	-272(%r30),%fr7	; q
	FLDD	-280(%r30),%fr8	; d
	XMPYU	%fr8L,%fr7L,%fr10
	FSTD	%fr10,-256(%r30)
	XMPYU	%fr8L,%fr7R,%fr22
	FSTD	%fr22,-264(%r30)
	XMPYU	%fr8R,%fr7L,%fr11
	XMPYU	%fr8R,%fr7R,%fr23
	FSTD	%fr11,-232(%r30)
	FSTD	%fr23,-240(%r30)
	LDD	-256(%r30),%r28
	DEPD,Z	%r28,31,32,%r2
	LDD	-264(%r30),%r20
	ADD,L	%r20,%r2,%r31
	LDD	-232(%r30),%r22
	DEPD,Z	%r22,31,32,%r22
	LDD	-240(%r30),%r21
	B	$00000024	; enter loop
	ADD,L	%r21,%r22,%r23	; delay slot

$0000002A
	LDO	-1(%r29),%r29	; q--
	SUB	%r23,%r8,%r23
$00000024
	SUB	%r4,%r31,%r25
	AND	%r25,%r19,%r26
	CMPB,*<>,N %r0,%r26,$00000046	; (forward)
	DEPD,Z	%r25,31,32,%r20
	OR	%r20,%r24,%r21
	CMPB,*<<,N %r21,%r23,$0000002A	;(backward)
	SUB	%r31,%r6,%r31
;-------------Break path---------------------

$00000046
	; Subtract q*d from (h:l); one final q-- correction if it
	; overshoots (th > h).
	DEPD,Z	%r23,31,32,%r25	;tl
	EXTRD,U	%r23,31,32,%r26	;t
	AND	%r25,%r19,%r24	;tl = (tl<<32)&0xffffffff00000000L
	ADD,L	%r31,%r26,%r31	;th += t;
	CMPCLR,*>>= %r5,%r24,%r0	;if (l<tl)
	LDO	1(%r31),%r31	; th++;
	CMPB,*<<=,N %r31,%r4,$00000036	;if (n < th) (forward)
	LDO	-1(%r29),%r29	;q--;
	ADD,L	%r4,%r3,%r4	;h += d;
$00000036
	ADDIB,=,N -1,%r9,$D1	;if (--count == 0) break (forward)
	SUB	%r5,%r24,%r28	; l -= tl;
	SUB	%r4,%r31,%r24	; h -= th;
	SHRPD	%r24,%r28,32,%r4	; h = ((h<<32)|(l>>32));
	DEPD,Z	%r29,31,32,%r10	; ret = q<<32
	b	$0000001C	; second digit
	DEPD,Z	%r28,31,32,%r5	; l = l << 32 (delay slot)

$D1
	OR	%r10,%r29,%r28	; ret |= q
$D3
	LDD	-368(%r30),%r2	; restore return pointer (-352-16)
$D0
	LDD	-296(%r30),%r10
	LDD	-304(%r30),%r9
	LDD	-312(%r30),%r8
	LDD	-320(%r30),%r7
	LDD	-328(%r30),%r6
	LDD	-336(%r30),%r5
	LDD	-344(%r30),%r4
	BVE	(%r2)
	.EXIT
	LDD,MB	-352(%r30),%r3	; restore r3 and pop frame (delay slot)

bn_div_err_case
	; Division would overflow: fprintf the C$4 message, then abort.
	MFIA	%r6	; PC-relative addressing of the format string
	ADDIL	L'bn_div_words-bn_div_err_case,%r6,%r1
	LDO	R'bn_div_words-bn_div_err_case(%r1),%r6
	ADDIL	LT'__iob,%r27,%r1
	LDD	RT'__iob(%r1),%r26
	ADDIL	L'C$4-bn_div_words,%r6,%r1
	LDO	R'C$4-bn_div_words(%r1),%r25
	LDO	64(%r26),%r26
	.CALL	;in=24,25,26,29;out=28;
	B,L	fprintf,%r2
	LDO	-48(%r30),%r29
	LDD	-288(%r30),%r27	; restore gp after call
	.CALL	;in=29;
	B,L	abort,%r2
	LDO	-48(%r30),%r29
	LDD	-288(%r30),%r27	; not reached: abort() does not return
	B	$D0
	LDD	-368(%r30),%r2
	.PROCEND	;in=24,25,26,29;out=28;
902
903;----------------------------------------------------------------------------
904;
905; Registers to hold 64-bit values to manipulate. The "L" part
906; of the register corresponds to the upper 32-bits, while the "R"
907; part corresponds to the lower 32-bits
908;
909; Note, that when using b6 and b7, the code must save these before
910; using them because they are callee save registers
911;
912;
913; Floating point registers to use to save values that
914; are manipulated. These don't collide with ftemp1-6 and
915; are all caller save registers
916;
917a0 .reg %fr22
918a0L .reg %fr22L
919a0R .reg %fr22R
920
921a1 .reg %fr23
922a1L .reg %fr23L
923a1R .reg %fr23R
924
925a2 .reg %fr24
926a2L .reg %fr24L
927a2R .reg %fr24R
928
929a3 .reg %fr25
930a3L .reg %fr25L
931a3R .reg %fr25R
932
933a4 .reg %fr26
934a4L .reg %fr26L
935a4R .reg %fr26R
936
937a5 .reg %fr27
938a5L .reg %fr27L
939a5R .reg %fr27R
940
941a6 .reg %fr28
942a6L .reg %fr28L
943a6R .reg %fr28R
944
945a7 .reg %fr29
946a7L .reg %fr29L
947a7R .reg %fr29R
948
949b0 .reg %fr30
950b0L .reg %fr30L
951b0R .reg %fr30R
952
953b1 .reg %fr31
954b1L .reg %fr31L
955b1R .reg %fr31R
956
957;
958; Temporary floating point variables, these are all caller save
959; registers
960;
961ftemp1 .reg %fr4
962ftemp2 .reg %fr5
963ftemp3 .reg %fr6
964ftemp4 .reg %fr7
965
966;
967; The B set of registers when used.
968;
969
970b2 .reg %fr8
971b2L .reg %fr8L
972b2R .reg %fr8R
973
974b3 .reg %fr9
975b3L .reg %fr9L
976b3R .reg %fr9R
977
978b4 .reg %fr10
979b4L .reg %fr10L
980b4R .reg %fr10R
981
982b5 .reg %fr11
983b5L .reg %fr11L
984b5R .reg %fr11R
985
986b6 .reg %fr12
987b6L .reg %fr12L
988b6R .reg %fr12R
989
990b7 .reg %fr13
991b7L .reg %fr13L
992b7R .reg %fr13R
993
994c1 .reg %r21 ; only reg
995temp1 .reg %r20 ; only reg
996temp2 .reg %r19 ; only reg
997temp3 .reg %r31 ; only reg
998
999m1 .reg %r28
1000c2 .reg %r23
1001high_one .reg %r1
1002ht .reg %r6
1003lt .reg %r5
1004m .reg %r4
1005c3 .reg %r3
1006
; SQR_ADD_C A0L,A0R,C1,C2,C3
;
; Comba square step for a diagonal term: computes a^2 for the 64-bit
; word a = A0L<<32 | A0R (the L/R halves of one FP register) and adds
; the 128-bit square into the running column accumulator (C3:C2:C1).
; Uses high_mask (0xffffffff80000000) and the -8..-24(%sp) scratch
; slots; see bn_sqr_words for the same doubled-cross-term trick.
SQR_ADD_C .macro A0L,A0R,C1,C2,C3
	XMPYU	A0L,A0R,ftemp1	; m = ht*lt (cross term)
	FSTD	ftemp1,-24(%sp)	; store m

	XMPYU	A0R,A0R,ftemp2	; lt = lt^2
	FSTD	ftemp2,-16(%sp)	; store lt

	XMPYU	A0L,A0L,ftemp3	; ht = ht^2
	FSTD	ftemp3,-8(%sp)	; store ht

	LDD	-24(%sp),m	; load m
	AND	m,high_mask,temp2	; m & Mask
	DEPD,Z	m,30,31,temp3	; m << 32+1 (2*m*2^32, low half)
	LDD	-16(%sp),lt	; lt

	LDD	-8(%sp),ht	; ht
	EXTRD,U	temp2,32,33,temp1	; temp1 = m&Mask >> 32-1 (2*m*2^32, high half)
	ADD	temp3,lt,lt	; lt = lt+m
	ADD,L	ht,temp1,ht	; ht += temp1
	ADD,DC	ht,%r0,ht	; ht++

	ADD	C1,lt,C1	; c1=c1+lt
	ADD,DC	ht,%r0,ht	; ht++ (absorb carry out of c1)

	ADD	C2,ht,C2	; c2=c2+ht
	ADD,DC	C3,%r0,C3	; c3++
.endm
1034
; SQR_ADD_C2 A0L,A0R,A1L,A1R,C1,C2,C3
;
; Comba square step for an off-diagonal term: computes 2*a0*a1 for the
; 64-bit words a0 = A0L<<32|A0R and a1 = A1L<<32|A1R and adds the
; (up to 129-bit) result into the accumulator (C3:C2:C1).  The product
; is formed first, then doubled; the ADD,DC,*NUV / LDO pair propagates
; the doubling carry into C3.
SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
	XMPYU	A0L,A1R,ftemp1	; m1 = a0h*a1l
	FSTD	ftemp1,-16(%sp)	;
	XMPYU	A0R,A1L,ftemp2	; m = a0l*a1h
	FSTD	ftemp2,-8(%sp)	;
	XMPYU	A0R,A1R,ftemp3	; lt = a0l*a1l
	FSTD	ftemp3,-32(%sp)
	XMPYU	A0L,A1L,ftemp4	; ht = a0h*a1h
	FSTD	ftemp4,-24(%sp)	;

	LDD	-8(%sp),m	; m
	LDD	-16(%sp),m1	; m1
	ADD,L	m,m1,m	; m+m1

	DEPD,Z	m,31,32,temp3	; (m+m1)<<32
	LDD	-24(%sp),ht	; ht

	CMPCLR,*>>= m,m1,%r0	; if (m < m1) the sum wrapped:
	ADD,L	ht,high_one,ht	; ht+=high_one (carry into bit 64)

	EXTRD,U	m,31,32,temp1	; m >> 32
	LDD	-32(%sp),lt	; lt
	ADD,L	ht,temp1,ht	; ht+= m>>32
	ADD	lt,temp3,lt	; lt = lt+m1
	ADD,DC	ht,%r0,ht	; ht++

	; Double the product (a0*a1 appears twice in the square).
	ADD	ht,ht,ht	; ht=ht+ht;
	ADD,DC	C3,%r0,C3	; add in carry (c3++)

	ADD	lt,lt,lt	; lt=lt+lt;
	ADD,DC	ht,%r0,ht	; add in carry (ht++)

	ADD	C1,lt,C1	; c1=c1+lt
	ADD,DC,*NUV ht,%r0,ht	; add in carry (ht++)
	LDO	1(C3),C3	; bump c3 if overflow,nullify otherwise

	ADD	C2,ht,C2	; c2 = c2 + ht
	ADD,DC	C3,%r0,C3	; add in carry (c3++)
.endm
1074
1075;
1076;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
1077; arg0 = r_ptr
1078; arg1 = a_ptr
1079;
1080
; void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
;
; Squares the 8-word (512-bit) array a[0..7] into the 16-word result
; r[0..15] using the comba (column-wise) method: for each result column
; k, every product a[i]*a[j] with i+j == k is accumulated into the
; rotating three-word carry chain (c1,c2,c3) via SQR_ADD_C (diagonal
; terms) and SQR_ADD_C2 (doubled off-diagonal terms).
bn_sqr_comba8
	.PROC
	.CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
	.EXPORT	bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.ENTRY
	.align 64

	STD	%r3,0(%sp)	; save r3
	STD	%r4,8(%sp)	; save r4
	STD	%r5,16(%sp)	; save r5
	STD	%r6,24(%sp)	; save r6

	;
	; Zero out carries
	;
	COPY	%r0,c1
	COPY	%r0,c2
	COPY	%r0,c3

	LDO	128(%sp),%sp	; bump stack
	DEPDI,Z	-1,32,33,high_mask	; Create Mask 0xffffffff80000000L
	DEPDI,Z	1,31,1,high_one	; Create Value 1 << 32

	;
	; Load up all of the values we are going to use
	;
	FLDD	0(a_ptr),a0
	FLDD	8(a_ptr),a1
	FLDD	16(a_ptr),a2
	FLDD	24(a_ptr),a3
	FLDD	32(a_ptr),a4
	FLDD	40(a_ptr),a5
	FLDD	48(a_ptr),a6
	FLDD	56(a_ptr),a7

	; Column k: all a[i]*a[j] with i+j == k; after the store, the
	; carry registers rotate (c1,c2,c3) -> (c2,c3,c1) for column k+1.
	SQR_ADD_C a0L,a0R,c1,c2,c3
	STD	c1,0(r_ptr)	; r[0] = c1;
	COPY	%r0,c1

	SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
	STD	c2,8(r_ptr)	; r[1] = c2;
	COPY	%r0,c2

	SQR_ADD_C a1L,a1R,c3,c1,c2
	SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
	STD	c3,16(r_ptr)	; r[2] = c3;
	COPY	%r0,c3

	SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
	SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
	STD	c1,24(r_ptr)	; r[3] = c1;
	COPY	%r0,c1

	SQR_ADD_C a2L,a2R,c2,c3,c1
	SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
	SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
	STD	c2,32(r_ptr)	; r[4] = c2;
	COPY	%r0,c2

	SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
	SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
	SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
	STD	c3,40(r_ptr)	; r[5] = c3;
	COPY	%r0,c3

	SQR_ADD_C a3L,a3R,c1,c2,c3
	SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
	SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
	SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
	STD	c1,48(r_ptr)	; r[6] = c1;
	COPY	%r0,c1

	SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
	SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
	SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
	SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
	STD	c2,56(r_ptr)	; r[7] = c2;
	COPY	%r0,c2

	SQR_ADD_C a4L,a4R,c3,c1,c2
	SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
	SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
	SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
	STD	c3,64(r_ptr)	; r[8] = c3;
	COPY	%r0,c3

	SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
	SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
	SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
	STD	c1,72(r_ptr)	; r[9] = c1;
	COPY	%r0,c1

	SQR_ADD_C a5L,a5R,c2,c3,c1
	SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
	SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
	STD	c2,80(r_ptr)	; r[10] = c2;
	COPY	%r0,c2

	SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
	SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
	STD	c3,88(r_ptr)	; r[11] = c3;
	COPY	%r0,c3

	SQR_ADD_C a6L,a6R,c1,c2,c3
	SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
	STD	c1,96(r_ptr)	; r[12] = c1;
	COPY	%r0,c1

	SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
	STD	c2,104(r_ptr)	; r[13] = c2;
	COPY	%r0,c2

	SQR_ADD_C a7L,a7R,c3,c1,c2
	STD	c3, 112(r_ptr)	; r[14] = c3
	STD	c1, 120(r_ptr)	; r[15] = c1

	.EXIT
	LDD	-104(%sp),%r6	; restore r6
	LDD	-112(%sp),%r5	; restore r5
	LDD	-120(%sp),%r4	; restore r4
	BVE	(%rp)
	LDD,MB	-128(%sp),%r3	; restore r3 and pop frame (delay slot)

	.PROCEND
1205
1206;-----------------------------------------------------------------------------
1207;
1208;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
1209; arg0 = r_ptr
1210; arg1 = a_ptr
1211;
1212
; void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
;
; Squares the 4-word (256-bit) array a[0..3] into the 8-word result
; r[0..7] using the comba (column-wise) method; see bn_sqr_comba8 for
; the column/carry-rotation scheme.
;
; Fix: the original also loaded a[4]..a[7] (FLDD 32..56(a_ptr)) into
; a4-a7, reading 32 bytes past the end of the 4-word input array.
; Those registers are never referenced in this routine, so the dead
; out-of-bounds loads have been removed.
bn_sqr_comba4
	.proc
	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
	.EXPORT	bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.entry
	.align 64
	STD	%r3,0(%sp)	; save r3
	STD	%r4,8(%sp)	; save r4
	STD	%r5,16(%sp)	; save r5
	STD	%r6,24(%sp)	; save r6

	;
	; Zero out carries
	;
	COPY	%r0,c1
	COPY	%r0,c2
	COPY	%r0,c3

	LDO	128(%sp),%sp	; bump stack
	DEPDI,Z	-1,32,33,high_mask	; Create Mask 0xffffffff80000000L
	DEPDI,Z	1,31,1,high_one	; Create Value 1 << 32

	;
	; Load up the four input words (only a0-a3 are used below)
	;
	FLDD	0(a_ptr),a0
	FLDD	8(a_ptr),a1
	FLDD	16(a_ptr),a2
	FLDD	24(a_ptr),a3

	SQR_ADD_C a0L,a0R,c1,c2,c3

	STD	c1,0(r_ptr)	; r[0] = c1;
	COPY	%r0,c1

	SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1

	STD	c2,8(r_ptr)	; r[1] = c2;
	COPY	%r0,c2

	SQR_ADD_C a1L,a1R,c3,c1,c2
	SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2

	STD	c3,16(r_ptr)	; r[2] = c3;
	COPY	%r0,c3

	SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
	SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3

	STD	c1,24(r_ptr)	; r[3] = c1;
	COPY	%r0,c1

	SQR_ADD_C a2L,a2R,c2,c3,c1
	SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1

	STD	c2,32(r_ptr)	; r[4] = c2;
	COPY	%r0,c2

	SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
	STD	c3,40(r_ptr)	; r[5] = c3;
	COPY	%r0,c3

	SQR_ADD_C a3L,a3R,c1,c2,c3
	STD	c1,48(r_ptr)	; r[6] = c1;
	STD	c2,56(r_ptr)	; r[7] = c2;

	.EXIT
	LDD	-104(%sp),%r6	; restore r6
	LDD	-112(%sp),%r5	; restore r5
	LDD	-120(%sp),%r4	; restore r4
	BVE	(%rp)
	LDD,MB	-128(%sp),%r3	; restore r3 and pop frame (delay slot)

	.PROCEND
1291
1292
1293;---------------------------------------------------------------------------
1294
; MUL_ADD_C A0L,A0R,B0L,B0R,C1,C2,C3
;
; Comba multiply step: computes the 128-bit product of the 64-bit words
; a = A0L<<32|A0R and b = B0L<<32|B0R and adds it into the rotating
; three-word column accumulator (C3:C2:C1).  Same partial-product
; recombination as SQR_ADD_C2 but without the doubling.
MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
	XMPYU	A0L,B0R,ftemp1	; m1 = ah*bl
	FSTD	ftemp1,-16(%sp)	;
	XMPYU	A0R,B0L,ftemp2	; m = al*bh
	FSTD	ftemp2,-8(%sp)	;
	XMPYU	A0R,B0R,ftemp3	; lt = al*bl
	FSTD	ftemp3,-32(%sp)
	XMPYU	A0L,B0L,ftemp4	; ht = ah*bh
	FSTD	ftemp4,-24(%sp)	;

	LDD	-8(%sp),m	; m
	LDD	-16(%sp),m1	; m1
	ADD,L	m,m1,m	; m+m1

	DEPD,Z	m,31,32,temp3	; (m+m1)<<32
	LDD	-24(%sp),ht	; ht

	CMPCLR,*>>= m,m1,%r0	; if (m < m1) the sum wrapped:
	ADD,L	ht,high_one,ht	; ht+=high_one (carry into bit 64)

	EXTRD,U	m,31,32,temp1	; m >> 32
	LDD	-32(%sp),lt	; lt
	ADD,L	ht,temp1,ht	; ht+= m>>32
	ADD	lt,temp3,lt	; lt = lt+m1
	ADD,DC	ht,%r0,ht	; ht++

	ADD	C1,lt,C1	; c1=c1+lt
	ADD,DC	ht,%r0,ht	; ht++ (absorb carry out of c1)

	ADD	C2,ht,C2	; c2 = c2 + ht
	ADD,DC	C3,%r0,C3	; add in carry (c3++)
.endm
1327
1328
1329;
1330;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1331; arg0 = r_ptr
1332; arg1 = a_ptr
1333; arg2 = b_ptr
1334;
1335
; void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
;
; Multiplies the 8-word arrays a[0..7] and b[0..7] into the 16-word
; result r[0..15] using the comba method: for each result column k,
; every product a[i]*b[j] with i+j == k is accumulated via MUL_ADD_C
; into the rotating three-word carry chain (c1,c2,c3).
; b6/b7 live in callee-save FP registers %fr12/%fr13, hence the FSTD
; save / FLDD restore pairs.
bn_mul_comba8
	.proc
	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
	.EXPORT	bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.entry
	.align 64

	STD	%r3,0(%sp)	; save r3
	STD	%r4,8(%sp)	; save r4
	STD	%r5,16(%sp)	; save r5
	STD	%r6,24(%sp)	; save r6
	FSTD	%fr12,32(%sp)	; save fr12 (callee-save, holds b6)
	FSTD	%fr13,40(%sp)	; save fr13 (callee-save, holds b7)

	;
	; Zero out carries
	;
	COPY	%r0,c1
	COPY	%r0,c2
	COPY	%r0,c3

	LDO	128(%sp),%sp	; bump stack
	DEPDI,Z	1,31,1,high_one	; Create Value 1 << 32

	;
	; Load up all of the values we are going to use
	;
	FLDD	0(a_ptr),a0
	FLDD	8(a_ptr),a1
	FLDD	16(a_ptr),a2
	FLDD	24(a_ptr),a3
	FLDD	32(a_ptr),a4
	FLDD	40(a_ptr),a5
	FLDD	48(a_ptr),a6
	FLDD	56(a_ptr),a7

	FLDD	0(b_ptr),b0
	FLDD	8(b_ptr),b1
	FLDD	16(b_ptr),b2
	FLDD	24(b_ptr),b3
	FLDD	32(b_ptr),b4
	FLDD	40(b_ptr),b5
	FLDD	48(b_ptr),b6
	FLDD	56(b_ptr),b7

	; Column k: all a[i]*b[j] with i+j == k; after each store the
	; carry registers rotate (c1,c2,c3) -> (c2,c3,c1).
	MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
	STD	c1,0(r_ptr)
	COPY	%r0,c1

	MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
	MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
	STD	c2,8(r_ptr)
	COPY	%r0,c2

	MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
	MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
	MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
	STD	c3,16(r_ptr)
	COPY	%r0,c3

	MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
	MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
	MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
	MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
	STD	c1,24(r_ptr)
	COPY	%r0,c1

	MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
	MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
	MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
	MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
	MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
	STD	c2,32(r_ptr)
	COPY	%r0,c2

	MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
	MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
	MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
	MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
	MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
	MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
	STD	c3,40(r_ptr)
	COPY	%r0,c3

	MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
	MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
	MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
	MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
	MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
	MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
	MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
	STD	c1,48(r_ptr)
	COPY	%r0,c1

	MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
	MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
	MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
	MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
	MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
	MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
	MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
	MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
	STD	c2,56(r_ptr)
	COPY	%r0,c2

	MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
	MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
	MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
	MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
	MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
	MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
	MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
	STD	c3,64(r_ptr)
	COPY	%r0,c3

	MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
	MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
	MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
	MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
	MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
	MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
	STD	c1,72(r_ptr)
	COPY	%r0,c1

	MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
	MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
	MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
	MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
	MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
	STD	c2,80(r_ptr)
	COPY	%r0,c2

	MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
	MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
	MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
	MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
	STD	c3,88(r_ptr)
	COPY	%r0,c3

	MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
	MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
	MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
	STD	c1,96(r_ptr)
	COPY	%r0,c1

	MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
	MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
	STD	c2,104(r_ptr)
	COPY	%r0,c2

	MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
	STD	c3,112(r_ptr)
	STD	c1,120(r_ptr)

	.EXIT
	FLDD	-88(%sp),%fr13	; restore fr13
	FLDD	-96(%sp),%fr12	; restore fr12
	LDD	-104(%sp),%r6	; restore r6
	LDD	-112(%sp),%r5	; restore r5
	LDD	-120(%sp),%r4	; restore r4
	BVE	(%rp)
	LDD,MB	-128(%sp),%r3	; restore r3 and pop frame (delay slot)

	.PROCEND
1500
1501;-----------------------------------------------------------------------------
1502;
1503;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1504; arg0 = r_ptr
1505; arg1 = a_ptr
1506; arg2 = b_ptr
1507;
1508
; void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
;
; Multiplies the 4-word arrays a[0..3] and b[0..3] into the 8-word
; result r[0..7] using the comba method; see bn_mul_comba8 for the
; column/carry-rotation scheme.  %fr12/%fr13 (b6/b7 aliases) are not
; used here, but are saved/restored to keep the prologue/epilogue
; identical to bn_mul_comba8.
bn_mul_comba4
	.proc
	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
	.EXPORT	bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
	.entry
	.align 64

	STD	%r3,0(%sp)	; save r3
	STD	%r4,8(%sp)	; save r4
	STD	%r5,16(%sp)	; save r5
	STD	%r6,24(%sp)	; save r6
	FSTD	%fr12,32(%sp)	; save fr12 (callee-save)
	FSTD	%fr13,40(%sp)	; save fr13 (callee-save)

	;
	; Zero out carries
	;
	COPY	%r0,c1
	COPY	%r0,c2
	COPY	%r0,c3

	LDO	128(%sp),%sp	; bump stack
	DEPDI,Z	1,31,1,high_one	; Create Value 1 << 32

	;
	; Load up all of the values we are going to use
	;
	FLDD	0(a_ptr),a0
	FLDD	8(a_ptr),a1
	FLDD	16(a_ptr),a2
	FLDD	24(a_ptr),a3

	FLDD	0(b_ptr),b0
	FLDD	8(b_ptr),b1
	FLDD	16(b_ptr),b2
	FLDD	24(b_ptr),b3

	MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
	STD	c1,0(r_ptr)
	COPY	%r0,c1

	MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
	MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
	STD	c2,8(r_ptr)
	COPY	%r0,c2

	MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
	MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
	MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
	STD	c3,16(r_ptr)
	COPY	%r0,c3

	MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
	MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
	MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
	MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
	STD	c1,24(r_ptr)
	COPY	%r0,c1

	MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
	MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
	MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
	STD	c2,32(r_ptr)
	COPY	%r0,c2

	MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
	MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
	STD	c3,40(r_ptr)
	COPY	%r0,c3

	MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
	STD	c1,48(r_ptr)
	STD	c2,56(r_ptr)

	.EXIT
	FLDD	-88(%sp),%fr13	; restore fr13
	FLDD	-96(%sp),%fr12	; restore fr12
	LDD	-104(%sp),%r6	; restore r6
	LDD	-112(%sp),%r5	; restore r5
	LDD	-120(%sp),%r4	; restore r4
	BVE	(%rp)
	LDD,MB	-128(%sp),%r3	; restore r3 and pop frame (delay slot)

	.PROCEND
1593
1594
1595 .SPACE $TEXT$
1596 .SUBSPA $CODE$
1597 .SPACE $PRIVATE$,SORT=16
1598 .IMPORT $global$,DATA
1599 .SPACE $TEXT$
1600 .SUBSPA $CODE$
1601 .SUBSPA $LIT$,ACCESS=0x2c
1602C$4
1603 .ALIGN 8
1604 .STRINGZ "Division would overflow (%d)\n"
1605 .END
diff --git a/src/lib/libcrypto/bn/asm/parisc-mont.pl b/src/lib/libcrypto/bn/asm/parisc-mont.pl
deleted file mode 100644
index 4a766a87fb..0000000000
--- a/src/lib/libcrypto/bn/asm/parisc-mont.pl
+++ /dev/null
@@ -1,993 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# On PA-7100LC this module performs ~90-50% better, less for longer
11# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
12# that compiler utilized xmpyu instruction to perform 32x32=64-bit
13# multiplication, which in turn means that "baseline" performance was
14# optimal in respect to instruction set capabilities. Fair comparison
15# with vendor compiler is problematic, because OpenSSL doesn't define
16# BN_LLONG [presumably] for historical reasons, which drives compiler
17# toward 4 times 16x16=32-bit multiplicatons [plus complementary
18# shifts and additions] instead. This means that you should observe
19# several times improvement over code generated by vendor compiler
20# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
21# improvement coefficient was never collected on PA-7100LC, or any
22# other 1.1 CPU, because I don't have access to such machine with
23# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
24# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
25# of ~5x on PA-8600.
26#
27# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
28# reportedly ~2x faster than vendor compiler generated code [according
29# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
30# this implementation is actually 32-bit one, in the sense that it
31# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
32# 64-bit BN_LONGs... How do they interoperate then? No problem. This
33# module picks halves of 64-bit values in reverse order and pretends
34# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
35# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
36# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
37# i.e. there is no "wider" multiplication like on most other 64-bit
38# platforms. This means that even being effectively 32-bit, this
39# implementation performs "64-bit" computational task in same amount
40# of arithmetic operations, most notably multiplications. It requires
41# more memory references, most notably to tp[num], but this doesn't
42# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
43# 2.0 code path, provides virtually same performance as pa-risc2[W].s:
44# it's ~10% better for shortest key length and ~10% worse for longest
45# one.
46#
47# In case it wasn't clear. The module has two distinct code paths:
48# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
49# additions and 64-bit integer loads, not to mention specific
50# instruction scheduling. In 64-bit build naturally only 2.0 code path
51# is assembled. In 32-bit application context both code paths are
52# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
53# is taken automatically. Also, in 32-bit build the module imposes
54# couple of limitations: vector lengths has to be even and vector
55# addresses has to be 64-bit aligned. Normally neither is a problem:
56# most common key lengths are even and vectors are commonly malloc-ed,
57# which ensures alignment.
58#
59# Special thanks to polarhome.com for providing HP-UX account on
60# PA-RISC 1.1 machine, and to correspondent who chose to remain
61# anonymous for testing the code on PA-RISC 2.0 machine.
62
63$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
64
65$flavour = shift;
66$output = shift;
67
68open STDOUT,">$output";
69
70if ($flavour =~ /64/) {
71 $LEVEL ="2.0W";
72 $SIZE_T =8;
73 $FRAME_MARKER =80;
74 $SAVED_RP =16;
75 $PUSH ="std";
76 $PUSHMA ="std,ma";
77 $POP ="ldd";
78 $POPMB ="ldd,mb";
79 $BN_SZ =$SIZE_T;
80} else {
81 $LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
82 $SIZE_T =4;
83 $FRAME_MARKER =48;
84 $SAVED_RP =20;
85 $PUSH ="stw";
86 $PUSHMA ="stwm";
87 $POP ="ldw";
88 $POPMB ="ldwm";
89 $BN_SZ =$SIZE_T;
90 if (open CONF,"<${dir}../../opensslconf.h") {
91 while(<CONF>) {
92 if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
93 $BN_SZ=8;
94 $LEVEL="2.0";
95 last;
96 }
97 }
98 close CONF;
99 }
100}
101
102$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker
103 # [+ argument transfer]
104$LOCALS=$FRAME-$FRAME_MARKER;
105$FRAME+=32; # local variables
106
107$tp="%r31";
108$ti1="%r29";
109$ti0="%r28";
110
111$rp="%r26";
112$ap="%r25";
113$bp="%r24";
114$np="%r23";
115$n0="%r22"; # passed through stack in 32-bit
116$num="%r21"; # passed through stack in 32-bit
117$idx="%r20";
118$arrsz="%r19";
119
120$nm1="%r7";
121$nm0="%r6";
122$ab1="%r5";
123$ab0="%r4";
124
125$fp="%r3";
126$hi1="%r2";
127$hi0="%r1";
128
129$xfer=$n0; # accomodates [-16..15] offset in fld[dw]s
130
131$fm0="%fr4"; $fti=$fm0;
132$fbi="%fr5L";
133$fn0="%fr5R";
134$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
135$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";
136
137$code=<<___;
138 .LEVEL $LEVEL
139 .SPACE \$TEXT\$
140 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
141
142 .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
143 .ALIGN 64
144bn_mul_mont
145 .PROC
146 .CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
147 .ENTRY
148 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
149 $PUSHMA %r3,$FRAME(%sp)
150 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
151 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
152 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
153 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
154 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
155 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
156 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
157 ldo -$FRAME(%sp),$fp
158___
159$code.=<<___ if ($SIZE_T==4);
160 ldw `-$FRAME_MARKER-4`($fp),$n0
161 ldw `-$FRAME_MARKER-8`($fp),$num
162 nop
163 nop ; alignment
164___
165$code.=<<___ if ($BN_SZ==4);
166 comiclr,<= 6,$num,%r0 ; are vectors long enough?
167 b L\$abort
168 ldi 0,%r28 ; signal "unhandled"
169 add,ev %r0,$num,$num ; is $num even?
170 b L\$abort
171 nop
172 or $ap,$np,$ti1
173 extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned?
174 b L\$abort
175 nop
176 nop ; alignment
177 nop
178
179 fldws 0($n0),${fn0}
180 fldws,ma 4($bp),${fbi} ; bp[0]
181___
182$code.=<<___ if ($BN_SZ==8);
183 comib,> 3,$num,L\$abort ; are vectors long enough?
184 ldi 0,%r28 ; signal "unhandled"
185 addl $num,$num,$num ; I operate on 32-bit values
186
187 fldws 4($n0),${fn0} ; only low part of n0
188 fldws 4($bp),${fbi} ; bp[0] in flipped word order
189___
190$code.=<<___;
191 fldds 0($ap),${fai} ; ap[0,1]
192 fldds 0($np),${fni} ; np[0,1]
193
194 sh2addl $num,%r0,$arrsz
195 ldi 31,$hi0
196 ldo 36($arrsz),$hi1 ; space for tp[num+1]
197 andcm $hi1,$hi0,$hi1 ; align
198 addl $hi1,%sp,%sp
199 $PUSH $fp,-$SIZE_T(%sp)
200
201 ldo `$LOCALS+16`($fp),$xfer
202 ldo `$LOCALS+32+4`($fp),$tp
203
204 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0]
205 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0]
206 xmpyu ${fn0},${fab0}R,${fm0}
207
208 addl $arrsz,$ap,$ap ; point at the end
209 addl $arrsz,$np,$np
210 subi 0,$arrsz,$idx ; j=0
211 ldo 8($idx),$idx ; j++++
212
213 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
214 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
215 fstds ${fab0},-16($xfer)
216 fstds ${fnm0},-8($xfer)
217 fstds ${fab1},0($xfer)
218 fstds ${fnm1},8($xfer)
219 flddx $idx($ap),${fai} ; ap[2,3]
220 flddx $idx($np),${fni} ; np[2,3]
221___
222$code.=<<___ if ($BN_SZ==4);
223 mtctl $hi0,%cr11 ; $hi0 still holds 31
224 extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0
225 b L\$parisc11
226 nop
227___
228$code.=<<___; # PA-RISC 2.0 code-path
229 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
230 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
231 ldd -16($xfer),$ab0
232 fstds ${fab0},-16($xfer)
233
234 extrd,u $ab0,31,32,$hi0
235 extrd,u $ab0,63,32,$ab0
236 ldd -8($xfer),$nm0
237 fstds ${fnm0},-8($xfer)
238 ldo 8($idx),$idx ; j++++
239 addl $ab0,$nm0,$nm0 ; low part is discarded
240 extrd,u $nm0,31,32,$hi1
241
242L\$1st
243 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
244 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
245 ldd 0($xfer),$ab1
246 fstds ${fab1},0($xfer)
247 addl $hi0,$ab1,$ab1
248 extrd,u $ab1,31,32,$hi0
249 ldd 8($xfer),$nm1
250 fstds ${fnm1},8($xfer)
251 extrd,u $ab1,63,32,$ab1
252 addl $hi1,$nm1,$nm1
253 flddx $idx($ap),${fai} ; ap[j,j+1]
254 flddx $idx($np),${fni} ; np[j,j+1]
255 addl $ab1,$nm1,$nm1
256 extrd,u $nm1,31,32,$hi1
257
258 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
259 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
260 ldd -16($xfer),$ab0
261 fstds ${fab0},-16($xfer)
262 addl $hi0,$ab0,$ab0
263 extrd,u $ab0,31,32,$hi0
264 ldd -8($xfer),$nm0
265 fstds ${fnm0},-8($xfer)
266 extrd,u $ab0,63,32,$ab0
267 addl $hi1,$nm0,$nm0
268 stw $nm1,-4($tp) ; tp[j-1]
269 addl $ab0,$nm0,$nm0
270 stw,ma $nm0,8($tp) ; tp[j-1]
271 addib,<> 8,$idx,L\$1st ; j++++
272 extrd,u $nm0,31,32,$hi1
273
274 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
275 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
276 ldd 0($xfer),$ab1
277 fstds ${fab1},0($xfer)
278 addl $hi0,$ab1,$ab1
279 extrd,u $ab1,31,32,$hi0
280 ldd 8($xfer),$nm1
281 fstds ${fnm1},8($xfer)
282 extrd,u $ab1,63,32,$ab1
283 addl $hi1,$nm1,$nm1
284 ldd -16($xfer),$ab0
285 addl $ab1,$nm1,$nm1
286 ldd -8($xfer),$nm0
287 extrd,u $nm1,31,32,$hi1
288
289 addl $hi0,$ab0,$ab0
290 extrd,u $ab0,31,32,$hi0
291 stw $nm1,-4($tp) ; tp[j-1]
292 extrd,u $ab0,63,32,$ab0
293 addl $hi1,$nm0,$nm0
294 ldd 0($xfer),$ab1
295 addl $ab0,$nm0,$nm0
296 ldd,mb 8($xfer),$nm1
297 extrd,u $nm0,31,32,$hi1
298 stw,ma $nm0,8($tp) ; tp[j-1]
299
300 ldo -1($num),$num ; i--
301 subi 0,$arrsz,$idx ; j=0
302___
303$code.=<<___ if ($BN_SZ==4);
304 fldws,ma 4($bp),${fbi} ; bp[1]
305___
306$code.=<<___ if ($BN_SZ==8);
307 fldws 0($bp),${fbi} ; bp[1] in flipped word order
308___
309$code.=<<___;
310 flddx $idx($ap),${fai} ; ap[0,1]
311 flddx $idx($np),${fni} ; np[0,1]
312 fldws 8($xfer),${fti}R ; tp[0]
313 addl $hi0,$ab1,$ab1
314 extrd,u $ab1,31,32,$hi0
315 extrd,u $ab1,63,32,$ab1
316 ldo 8($idx),$idx ; j++++
317 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
318 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
319 addl $hi1,$nm1,$nm1
320 addl $ab1,$nm1,$nm1
321 extrd,u $nm1,31,32,$hi1
322 fstws,mb ${fab0}L,-8($xfer) ; save high part
323 stw $nm1,-4($tp) ; tp[j-1]
324
325 fcpy,sgl %fr0,${fti}L ; zero high part
326 fcpy,sgl %fr0,${fab0}L
327 addl $hi1,$hi0,$hi0
328 extrd,u $hi0,31,32,$hi1
329 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
330 fcnvxf,dbl,dbl ${fab0},${fab0}
331 stw $hi0,0($tp)
332 stw $hi1,4($tp)
333
334 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
335 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
336 xmpyu ${fn0},${fab0}R,${fm0}
337 ldo `$LOCALS+32+4`($fp),$tp
338L\$outer
339 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
340 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
341 fstds ${fab0},-16($xfer) ; 33-bit value
342 fstds ${fnm0},-8($xfer)
343 flddx $idx($ap),${fai} ; ap[2]
344 flddx $idx($np),${fni} ; np[2]
345 ldo 8($idx),$idx ; j++++
346 ldd -16($xfer),$ab0 ; 33-bit value
347 ldd -8($xfer),$nm0
348 ldw 0($xfer),$hi0 ; high part
349
350 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
351 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
352 extrd,u $ab0,31,32,$ti0 ; carry bit
353 extrd,u $ab0,63,32,$ab0
354 fstds ${fab1},0($xfer)
355 addl $ti0,$hi0,$hi0 ; account carry bit
356 fstds ${fnm1},8($xfer)
357 addl $ab0,$nm0,$nm0 ; low part is discarded
358 ldw 0($tp),$ti1 ; tp[1]
359 extrd,u $nm0,31,32,$hi1
360 fstds ${fab0},-16($xfer)
361 fstds ${fnm0},-8($xfer)
362
363L\$inner
364 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
365 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
366 ldd 0($xfer),$ab1
367 fstds ${fab1},0($xfer)
368 addl $hi0,$ti1,$ti1
369 addl $ti1,$ab1,$ab1
370 ldd 8($xfer),$nm1
371 fstds ${fnm1},8($xfer)
372 extrd,u $ab1,31,32,$hi0
373 extrd,u $ab1,63,32,$ab1
374 flddx $idx($ap),${fai} ; ap[j,j+1]
375 flddx $idx($np),${fni} ; np[j,j+1]
376 addl $hi1,$nm1,$nm1
377 addl $ab1,$nm1,$nm1
378 ldw 4($tp),$ti0 ; tp[j]
379 stw $nm1,-4($tp) ; tp[j-1]
380
381 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
382 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
383 ldd -16($xfer),$ab0
384 fstds ${fab0},-16($xfer)
385 addl $hi0,$ti0,$ti0
386 addl $ti0,$ab0,$ab0
387 ldd -8($xfer),$nm0
388 fstds ${fnm0},-8($xfer)
389 extrd,u $ab0,31,32,$hi0
390 extrd,u $nm1,31,32,$hi1
391 ldw 8($tp),$ti1 ; tp[j]
392 extrd,u $ab0,63,32,$ab0
393 addl $hi1,$nm0,$nm0
394 addl $ab0,$nm0,$nm0
395 stw,ma $nm0,8($tp) ; tp[j-1]
396 addib,<> 8,$idx,L\$inner ; j++++
397 extrd,u $nm0,31,32,$hi1
398
399 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
400 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
401 ldd 0($xfer),$ab1
402 fstds ${fab1},0($xfer)
403 addl $hi0,$ti1,$ti1
404 addl $ti1,$ab1,$ab1
405 ldd 8($xfer),$nm1
406 fstds ${fnm1},8($xfer)
407 extrd,u $ab1,31,32,$hi0
408 extrd,u $ab1,63,32,$ab1
409 ldw 4($tp),$ti0 ; tp[j]
410 addl $hi1,$nm1,$nm1
411 addl $ab1,$nm1,$nm1
412 ldd -16($xfer),$ab0
413 ldd -8($xfer),$nm0
414 extrd,u $nm1,31,32,$hi1
415
416 addl $hi0,$ab0,$ab0
417 addl $ti0,$ab0,$ab0
418 stw $nm1,-4($tp) ; tp[j-1]
419 extrd,u $ab0,31,32,$hi0
420 ldw 8($tp),$ti1 ; tp[j]
421 extrd,u $ab0,63,32,$ab0
422 addl $hi1,$nm0,$nm0
423 ldd 0($xfer),$ab1
424 addl $ab0,$nm0,$nm0
425 ldd,mb 8($xfer),$nm1
426 extrd,u $nm0,31,32,$hi1
427 stw,ma $nm0,8($tp) ; tp[j-1]
428
429 addib,= -1,$num,L\$outerdone ; i--
430 subi 0,$arrsz,$idx ; j=0
431___
432$code.=<<___ if ($BN_SZ==4);
433 fldws,ma 4($bp),${fbi} ; bp[i]
434___
435$code.=<<___ if ($BN_SZ==8);
436 ldi 12,$ti0 ; bp[i] in flipped word order
437 addl,ev %r0,$num,$num
438 ldi -4,$ti0
439 addl $ti0,$bp,$bp
440 fldws 0($bp),${fbi}
441___
442$code.=<<___;
443 flddx $idx($ap),${fai} ; ap[0]
444 addl $hi0,$ab1,$ab1
445 flddx $idx($np),${fni} ; np[0]
446 fldws 8($xfer),${fti}R ; tp[0]
447 addl $ti1,$ab1,$ab1
448 extrd,u $ab1,31,32,$hi0
449 extrd,u $ab1,63,32,$ab1
450
451 ldo 8($idx),$idx ; j++++
452 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
453 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
454 ldw 4($tp),$ti0 ; tp[j]
455
456 addl $hi1,$nm1,$nm1
457 fstws,mb ${fab0}L,-8($xfer) ; save high part
458 addl $ab1,$nm1,$nm1
459 extrd,u $nm1,31,32,$hi1
460 fcpy,sgl %fr0,${fti}L ; zero high part
461 fcpy,sgl %fr0,${fab0}L
462 stw $nm1,-4($tp) ; tp[j-1]
463
464 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
465 fcnvxf,dbl,dbl ${fab0},${fab0}
466 addl $hi1,$hi0,$hi0
467 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
468 addl $ti0,$hi0,$hi0
469 extrd,u $hi0,31,32,$hi1
470 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
471 stw $hi0,0($tp)
472 stw $hi1,4($tp)
473 xmpyu ${fn0},${fab0}R,${fm0}
474
475 b L\$outer
476 ldo `$LOCALS+32+4`($fp),$tp
477
478L\$outerdone
479 addl $hi0,$ab1,$ab1
480 addl $ti1,$ab1,$ab1
481 extrd,u $ab1,31,32,$hi0
482 extrd,u $ab1,63,32,$ab1
483
484 ldw 4($tp),$ti0 ; tp[j]
485
486 addl $hi1,$nm1,$nm1
487 addl $ab1,$nm1,$nm1
488 extrd,u $nm1,31,32,$hi1
489 stw $nm1,-4($tp) ; tp[j-1]
490
491 addl $hi1,$hi0,$hi0
492 addl $ti0,$hi0,$hi0
493 extrd,u $hi0,31,32,$hi1
494 stw $hi0,0($tp)
495 stw $hi1,4($tp)
496
497 ldo `$LOCALS+32`($fp),$tp
498 sub %r0,%r0,%r0 ; clear borrow
499___
500$code.=<<___ if ($BN_SZ==4);
501 ldws,ma 4($tp),$ti0
502 extru,= $rp,31,3,%r0 ; is rp 64-bit aligned?
503 b L\$sub_pa11
504 addl $tp,$arrsz,$tp
505L\$sub
506 ldwx $idx($np),$hi0
507 subb $ti0,$hi0,$hi1
508 ldwx $idx($tp),$ti0
509 addib,<> 4,$idx,L\$sub
510 stws,ma $hi1,4($rp)
511
512 subb $ti0,%r0,$hi1
513 ldo -4($tp),$tp
514___
515$code.=<<___ if ($BN_SZ==8);
516 ldd,ma 8($tp),$ti0
517L\$sub
518 ldd $idx($np),$hi0
519 shrpd $ti0,$ti0,32,$ti0 ; flip word order
520 std $ti0,-8($tp) ; save flipped value
521 sub,db $ti0,$hi0,$hi1
522 ldd,ma 8($tp),$ti0
523 addib,<> 8,$idx,L\$sub
524 std,ma $hi1,8($rp)
525
526 extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
527 sub,db $ti0,%r0,$hi1
528 ldo -8($tp),$tp
529___
530$code.=<<___;
531 and $tp,$hi1,$ap
532 andcm $rp,$hi1,$bp
533 or $ap,$bp,$np
534
535 sub $rp,$arrsz,$rp ; rewind rp
536 subi 0,$arrsz,$idx
537 ldo `$LOCALS+32`($fp),$tp
538L\$copy
539 ldd $idx($np),$hi0
540 std,ma %r0,8($tp)
541 addib,<> 8,$idx,.-8 ; L\$copy
542 std,ma $hi0,8($rp)
543___
544
545if ($BN_SZ==4) { # PA-RISC 1.1 code-path
546$ablo=$ab0;
547$abhi=$ab1;
548$nmlo0=$nm0;
549$nmhi0=$nm1;
550$nmlo1="%r9";
551$nmhi1="%r8";
552
553$code.=<<___;
554 b L\$done
555 nop
556
557 .ALIGN 8
558L\$parisc11
559 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
560 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
561 ldw -12($xfer),$ablo
562 ldw -16($xfer),$hi0
563 ldw -4($xfer),$nmlo0
564 ldw -8($xfer),$nmhi0
565 fstds ${fab0},-16($xfer)
566 fstds ${fnm0},-8($xfer)
567
568 ldo 8($idx),$idx ; j++++
569 add $ablo,$nmlo0,$nmlo0 ; discarded
570 addc %r0,$nmhi0,$hi1
571 ldw 4($xfer),$ablo
572 ldw 0($xfer),$abhi
573 nop
574
575L\$1st_pa11
576 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
577 flddx $idx($ap),${fai} ; ap[j,j+1]
578 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
579 flddx $idx($np),${fni} ; np[j,j+1]
580 add $hi0,$ablo,$ablo
581 ldw 12($xfer),$nmlo1
582 addc %r0,$abhi,$hi0
583 ldw 8($xfer),$nmhi1
584 add $ablo,$nmlo1,$nmlo1
585 fstds ${fab1},0($xfer)
586 addc %r0,$nmhi1,$nmhi1
587 fstds ${fnm1},8($xfer)
588 add $hi1,$nmlo1,$nmlo1
589 ldw -12($xfer),$ablo
590 addc %r0,$nmhi1,$hi1
591 ldw -16($xfer),$abhi
592
593 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
594 ldw -4($xfer),$nmlo0
595 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
596 ldw -8($xfer),$nmhi0
597 add $hi0,$ablo,$ablo
598 stw $nmlo1,-4($tp) ; tp[j-1]
599 addc %r0,$abhi,$hi0
600 fstds ${fab0},-16($xfer)
601 add $ablo,$nmlo0,$nmlo0
602 fstds ${fnm0},-8($xfer)
603 addc %r0,$nmhi0,$nmhi0
604 ldw 0($xfer),$abhi
605 add $hi1,$nmlo0,$nmlo0
606 ldw 4($xfer),$ablo
607 stws,ma $nmlo0,8($tp) ; tp[j-1]
608 addib,<> 8,$idx,L\$1st_pa11 ; j++++
609 addc %r0,$nmhi0,$hi1
610
611 ldw 8($xfer),$nmhi1
612 ldw 12($xfer),$nmlo1
613 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
614 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
615 add $hi0,$ablo,$ablo
616 fstds ${fab1},0($xfer)
617 addc %r0,$abhi,$hi0
618 fstds ${fnm1},8($xfer)
619 add $ablo,$nmlo1,$nmlo1
620 ldw -16($xfer),$abhi
621 addc %r0,$nmhi1,$nmhi1
622 ldw -12($xfer),$ablo
623 add $hi1,$nmlo1,$nmlo1
624 ldw -8($xfer),$nmhi0
625 addc %r0,$nmhi1,$hi1
626 ldw -4($xfer),$nmlo0
627
628 add $hi0,$ablo,$ablo
629 stw $nmlo1,-4($tp) ; tp[j-1]
630 addc %r0,$abhi,$hi0
631 ldw 0($xfer),$abhi
632 add $ablo,$nmlo0,$nmlo0
633 ldw 4($xfer),$ablo
634 addc %r0,$nmhi0,$nmhi0
635 ldws,mb 8($xfer),$nmhi1
636 add $hi1,$nmlo0,$nmlo0
637 ldw 4($xfer),$nmlo1
638 addc %r0,$nmhi0,$hi1
639 stws,ma $nmlo0,8($tp) ; tp[j-1]
640
641 ldo -1($num),$num ; i--
642 subi 0,$arrsz,$idx ; j=0
643
644 fldws,ma 4($bp),${fbi} ; bp[1]
645 flddx $idx($ap),${fai} ; ap[0,1]
646 flddx $idx($np),${fni} ; np[0,1]
647 fldws 8($xfer),${fti}R ; tp[0]
648 add $hi0,$ablo,$ablo
649 addc %r0,$abhi,$hi0
650 ldo 8($idx),$idx ; j++++
651 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
652 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
653 add $hi1,$nmlo1,$nmlo1
654 addc %r0,$nmhi1,$nmhi1
655 add $ablo,$nmlo1,$nmlo1
656 addc %r0,$nmhi1,$hi1
657 fstws,mb ${fab0}L,-8($xfer) ; save high part
658 stw $nmlo1,-4($tp) ; tp[j-1]
659
660 fcpy,sgl %fr0,${fti}L ; zero high part
661 fcpy,sgl %fr0,${fab0}L
662 add $hi1,$hi0,$hi0
663 addc %r0,%r0,$hi1
664 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
665 fcnvxf,dbl,dbl ${fab0},${fab0}
666 stw $hi0,0($tp)
667 stw $hi1,4($tp)
668
669 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
670 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
671 xmpyu ${fn0},${fab0}R,${fm0}
672 ldo `$LOCALS+32+4`($fp),$tp
673L\$outer_pa11
674 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
675 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
676 fstds ${fab0},-16($xfer) ; 33-bit value
677 fstds ${fnm0},-8($xfer)
678 flddx $idx($ap),${fai} ; ap[2,3]
679 flddx $idx($np),${fni} ; np[2,3]
680 ldw -16($xfer),$abhi ; carry bit actually
681 ldo 8($idx),$idx ; j++++
682 ldw -12($xfer),$ablo
683 ldw -8($xfer),$nmhi0
684 ldw -4($xfer),$nmlo0
685 ldw 0($xfer),$hi0 ; high part
686
687 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
688 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
689 fstds ${fab1},0($xfer)
690 addl $abhi,$hi0,$hi0 ; account carry bit
691 fstds ${fnm1},8($xfer)
692 add $ablo,$nmlo0,$nmlo0 ; discarded
693 ldw 0($tp),$ti1 ; tp[1]
694 addc %r0,$nmhi0,$hi1
695 fstds ${fab0},-16($xfer)
696 fstds ${fnm0},-8($xfer)
697 ldw 4($xfer),$ablo
698 ldw 0($xfer),$abhi
699
700L\$inner_pa11
701 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
702 flddx $idx($ap),${fai} ; ap[j,j+1]
703 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
704 flddx $idx($np),${fni} ; np[j,j+1]
705 add $hi0,$ablo,$ablo
706 ldw 4($tp),$ti0 ; tp[j]
707 addc %r0,$abhi,$abhi
708 ldw 12($xfer),$nmlo1
709 add $ti1,$ablo,$ablo
710 ldw 8($xfer),$nmhi1
711 addc %r0,$abhi,$hi0
712 fstds ${fab1},0($xfer)
713 add $ablo,$nmlo1,$nmlo1
714 fstds ${fnm1},8($xfer)
715 addc %r0,$nmhi1,$nmhi1
716 ldw -12($xfer),$ablo
717 add $hi1,$nmlo1,$nmlo1
718 ldw -16($xfer),$abhi
719 addc %r0,$nmhi1,$hi1
720
721 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
722 ldw 8($tp),$ti1 ; tp[j]
723 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
724 ldw -4($xfer),$nmlo0
725 add $hi0,$ablo,$ablo
726 ldw -8($xfer),$nmhi0
727 addc %r0,$abhi,$abhi
728 stw $nmlo1,-4($tp) ; tp[j-1]
729 add $ti0,$ablo,$ablo
730 fstds ${fab0},-16($xfer)
731 addc %r0,$abhi,$hi0
732 fstds ${fnm0},-8($xfer)
733 add $ablo,$nmlo0,$nmlo0
734 ldw 4($xfer),$ablo
735 addc %r0,$nmhi0,$nmhi0
736 ldw 0($xfer),$abhi
737 add $hi1,$nmlo0,$nmlo0
738 stws,ma $nmlo0,8($tp) ; tp[j-1]
739 addib,<> 8,$idx,L\$inner_pa11 ; j++++
740 addc %r0,$nmhi0,$hi1
741
742 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
743 ldw 12($xfer),$nmlo1
744 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
745 ldw 8($xfer),$nmhi1
746 add $hi0,$ablo,$ablo
747 ldw 4($tp),$ti0 ; tp[j]
748 addc %r0,$abhi,$abhi
749 fstds ${fab1},0($xfer)
750 add $ti1,$ablo,$ablo
751 fstds ${fnm1},8($xfer)
752 addc %r0,$abhi,$hi0
753 ldw -16($xfer),$abhi
754 add $ablo,$nmlo1,$nmlo1
755 ldw -12($xfer),$ablo
756 addc %r0,$nmhi1,$nmhi1
757 ldw -8($xfer),$nmhi0
758 add $hi1,$nmlo1,$nmlo1
759 ldw -4($xfer),$nmlo0
760 addc %r0,$nmhi1,$hi1
761
762 add $hi0,$ablo,$ablo
763 stw $nmlo1,-4($tp) ; tp[j-1]
764 addc %r0,$abhi,$abhi
765 add $ti0,$ablo,$ablo
766 ldw 8($tp),$ti1 ; tp[j]
767 addc %r0,$abhi,$hi0
768 ldw 0($xfer),$abhi
769 add $ablo,$nmlo0,$nmlo0
770 ldw 4($xfer),$ablo
771 addc %r0,$nmhi0,$nmhi0
772 ldws,mb 8($xfer),$nmhi1
773 add $hi1,$nmlo0,$nmlo0
774 ldw 4($xfer),$nmlo1
775 addc %r0,$nmhi0,$hi1
776 stws,ma $nmlo0,8($tp) ; tp[j-1]
777
778 addib,= -1,$num,L\$outerdone_pa11; i--
779 subi 0,$arrsz,$idx ; j=0
780
781 fldws,ma 4($bp),${fbi} ; bp[i]
782 flddx $idx($ap),${fai} ; ap[0]
783 add $hi0,$ablo,$ablo
784 addc %r0,$abhi,$abhi
785 flddx $idx($np),${fni} ; np[0]
786 fldws 8($xfer),${fti}R ; tp[0]
787 add $ti1,$ablo,$ablo
788 addc %r0,$abhi,$hi0
789
790 ldo 8($idx),$idx ; j++++
791 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
792 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
793 ldw 4($tp),$ti0 ; tp[j]
794
795 add $hi1,$nmlo1,$nmlo1
796 addc %r0,$nmhi1,$nmhi1
797 fstws,mb ${fab0}L,-8($xfer) ; save high part
798 add $ablo,$nmlo1,$nmlo1
799 addc %r0,$nmhi1,$hi1
800 fcpy,sgl %fr0,${fti}L ; zero high part
801 fcpy,sgl %fr0,${fab0}L
802 stw $nmlo1,-4($tp) ; tp[j-1]
803
804 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
805 fcnvxf,dbl,dbl ${fab0},${fab0}
806 add $hi1,$hi0,$hi0
807 addc %r0,%r0,$hi1
808 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
809 add $ti0,$hi0,$hi0
810 addc %r0,$hi1,$hi1
811 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
812 stw $hi0,0($tp)
813 stw $hi1,4($tp)
814 xmpyu ${fn0},${fab0}R,${fm0}
815
816 b L\$outer_pa11
817 ldo `$LOCALS+32+4`($fp),$tp
818
819L\$outerdone_pa11
820 add $hi0,$ablo,$ablo
821 addc %r0,$abhi,$abhi
822 add $ti1,$ablo,$ablo
823 addc %r0,$abhi,$hi0
824
825 ldw 4($tp),$ti0 ; tp[j]
826
827 add $hi1,$nmlo1,$nmlo1
828 addc %r0,$nmhi1,$nmhi1
829 add $ablo,$nmlo1,$nmlo1
830 addc %r0,$nmhi1,$hi1
831 stw $nmlo1,-4($tp) ; tp[j-1]
832
833 add $hi1,$hi0,$hi0
834 addc %r0,%r0,$hi1
835 add $ti0,$hi0,$hi0
836 addc %r0,$hi1,$hi1
837 stw $hi0,0($tp)
838 stw $hi1,4($tp)
839
840 ldo `$LOCALS+32+4`($fp),$tp
841 sub %r0,%r0,%r0 ; clear borrow
842 ldw -4($tp),$ti0
843 addl $tp,$arrsz,$tp
844L\$sub_pa11
845 ldwx $idx($np),$hi0
846 subb $ti0,$hi0,$hi1
847 ldwx $idx($tp),$ti0
848 addib,<> 4,$idx,L\$sub_pa11
849 stws,ma $hi1,4($rp)
850
851 subb $ti0,%r0,$hi1
852 ldo -4($tp),$tp
853 and $tp,$hi1,$ap
854 andcm $rp,$hi1,$bp
855 or $ap,$bp,$np
856
857 sub $rp,$arrsz,$rp ; rewind rp
858 subi 0,$arrsz,$idx
859 ldo `$LOCALS+32`($fp),$tp
860L\$copy_pa11
861 ldwx $idx($np),$hi0
862 stws,ma %r0,4($tp)
863 addib,<> 4,$idx,L\$copy_pa11
864 stws,ma $hi0,4($rp)
865
866 nop ; alignment
867L\$done
868___
869}
870
871$code.=<<___;
872 ldi 1,%r28 ; signal "handled"
873 ldo $FRAME($fp),%sp ; destroy tp[num+1]
874
875 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
876 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
877 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
878 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
879 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
880 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
881 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
882 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
883L\$abort
884 bv (%r2)
885 .EXIT
886 $POPMB -$FRAME(%sp),%r3
887 .PROCEND
888 .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
889___
890
891# Explicitly encode PA-RISC 2.0 instructions used in this module, so
892# that it can be compiled with .LEVEL 1.0. It should be noted that I
893# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
894# directive...
895
896my $ldd = sub {
897 my ($mod,$args) = @_;
898 my $orig = "ldd$mod\t$args";
899
900 if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
901 { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
902 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
903 }
904 elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
905 { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
906 $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
907 $opcode|=(1<<5) if ($mod =~ /^,m/);
908 $opcode|=(1<<13) if ($mod =~ /^,mb/);
909 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
910 }
911 else { "\t".$orig; }
912};
913
914my $std = sub {
915 my ($mod,$args) = @_;
916 my $orig = "std$mod\t$args";
917
918 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6
919 { my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
920 $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset
921 $opcode|=(1<<5) if ($mod =~ /^,m/);
922 $opcode|=(1<<13) if ($mod =~ /^,mb/);
923 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
924 }
925 else { "\t".$orig; }
926};
927
928my $extrd = sub {
929 my ($mod,$args) = @_;
930 my $orig = "extrd$mod\t$args";
931
932 # I only have ",u" completer, it's implicitly encoded...
933 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
934 { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
935 my $len=32-$3;
936 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
937 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
938 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
939 }
940 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
941 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
942 my $len=32-$2;
943 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
944 $opcode |= (1<<13) if ($mod =~ /,\**=/);
945 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
946 }
947 else { "\t".$orig; }
948};
949
950my $shrpd = sub {
951 my ($mod,$args) = @_;
952 my $orig = "shrpd$mod\t$args";
953
954 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
955 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
956 my $cpos=63-$3;
957 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
958 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
959 }
960 else { "\t".$orig; }
961};
962
963my $sub = sub {
964 my ($mod,$args) = @_;
965 my $orig = "sub$mod\t$args";
966
967 if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
968 my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
969 $opcode|=(1<<10); # e1
970 $opcode|=(1<<8); # e2
971 $opcode|=(1<<5); # d
972 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
973 }
974 else { "\t".$orig; }
975};
976
977sub assemble {
978 my ($mnemonic,$mod,$args)=@_;
979 my $opcode = eval("\$$mnemonic");
980
981 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
982}
983
984foreach (split("\n",$code)) {
985 s/\`([^\`]*)\`/eval $1/ge;
986 # flip word order in 64-bit mode...
987 s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
988 # assemble 2.0 instructions in 32-bit mode...
989 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
990
991 print $_,"\n";
992}
993close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/ppc-mont.pl b/src/lib/libcrypto/bn/asm/ppc-mont.pl
deleted file mode 100644
index f9b6992ccc..0000000000
--- a/src/lib/libcrypto/bn/asm/ppc-mont.pl
+++ /dev/null
@@ -1,334 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# April 2006
11
12# "Teaser" Montgomery multiplication module for PowerPC. It's possible
13# to gain a bit more by modulo-scheduling outer loop, then dedicated
14# squaring procedure should give further 20% and code can be adapted
15# for 32-bit application running on 64-bit CPU. As for the latter.
16# It won't be able to achieve "native" 64-bit performance, because in
17# 32-bit application context every addc instruction will have to be
18# expanded as addc, twice right shift by 32 and finally adde, etc.
19# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
20# for 64-bit application running on PPC970/G5 is:
21#
22# 512-bit +65%
23# 1024-bit +35%
24# 2048-bit +18%
25# 4096-bit +4%
26
# Select word size, ABI red-zone size and load/store/multiply mnemonics from
# the requested flavour ("...32" or "...64"); the rest of the script is
# written in terms of these $LD/$ST/$UMULL/... variables so one source
# serves both 32- and 64-bit PowerPC.
27$flavour = shift;
28
29if ($flavour =~ /32/) {
30 $BITS= 32;
31 $BNSZ= $BITS/8;
32 $SIZE_T=4;
# Red zone below the stack pointer guaranteed by the 32-bit ABI.
33 $RZONE= 224;
34
35 $LD= "lwz"; # load
36 $LDU= "lwzu"; # load and update
37 $LDX= "lwzx"; # load indexed
38 $ST= "stw"; # store
39 $STU= "stwu"; # store and update
40 $STX= "stwx"; # store indexed
41 $STUX= "stwux"; # store indexed and update
42 $UMULL= "mullw"; # unsigned multiply low
43 $UMULH= "mulhwu"; # unsigned multiply high
44 $UCMP= "cmplw"; # unsigned compare
45 $SHRI= "srwi"; # unsigned shift right by immediate
46 $PUSH= $ST;
47 $POP= $LD;
48} elsif ($flavour =~ /64/) {
49 $BITS= 64;
50 $BNSZ= $BITS/8;
51 $SIZE_T=8;
52 $RZONE= 288;
53
54 # same as above, but 64-bit mnemonics...
55 $LD= "ld"; # load
56 $LDU= "ldu"; # load and update
57 $LDX= "ldx"; # load indexed
58 $ST= "std"; # store
59 $STU= "stdu"; # store and update
60 $STX= "stdx"; # store indexed
61 $STUX= "stdux"; # store indexed and update
62 $UMULL= "mulld"; # unsigned multiply low
63 $UMULH= "mulhdu"; # unsigned multiply high
64 $UCMP= "cmpld"; # unsigned compare
65 $SHRI= "srdi"; # unsigned shift right by immediate
66 $PUSH= $ST;
67 $POP= $LD;
68} else { die "nonsense $flavour"; }
69
# Stack frame: linkage area plus the ABI red zone; $LOCALS is the offset of
# the temporary vector tp[] within the frame.
70$FRAME=8*$SIZE_T+$RZONE;
71$LOCALS=8*$SIZE_T;
72
# Locate the ppc-xlate.pl translator (next to this script or in perlasm/)
# and pipe all of our output through it so platform-specific syntax
# (AIX vs. Linux vs. OS X) is handled there.
73$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
74( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
75( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
76die "can't locate ppc-xlate.pl";
77
78open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
79
# Register allocation for bn_mul_mont_int.  Volatile registers carry the
# function arguments; r20-r31 (non-volatile, saved in the prologue) hold the
# loop state of the Montgomery multiplication.
80$sp="r1";
81$toc="r2";
# r3 doubles as the rp argument on entry and the overflow word later.
82$rp="r3"; $ovf="r3";
83$ap="r4";
84$bp="r5";
85$np="r6";
86$n0="r7";
87$num="r8";
88$rp="r9"; # $rp is reassigned
89$aj="r10";
90$nj="r11";
91$tj="r12";
92# non-volatile registers
93$i="r20";
94$j="r21";
95$tp="r22";
96$m0="r23";
97$m1="r24";
98$lo0="r25";
99$hi0="r26";
100$lo1="r27";
101$hi1="r28";
102$alo="r29";
103$ahi="r30";
104$nlo="r31";
105#
106$nhi="r0";
# Function entry: bail out (return 0, meaning "not handled") for fewer than
# 4 limbs.  The heredoc body is emitted verbatim (after backtick expansion),
# so no comments may be added inside it.
108$code=<<___;
109.machine "any"
110.text
111
112.globl .bn_mul_mont_int
113.align 4
114.bn_mul_mont_int:
115 cmpwi $num,4
116 mr $rp,r3 ; $rp is reassigned
117 li r3,0
118 bltlr
119___
# 32-bit only: for keys of 32 limbs or more the generic C path is at least
# as fast, so return 0 and let the caller fall back.
120$code.=<<___ if ($BNSZ==4);
121 cmpwi $num,32 ; longer key performance is not better
122 bgelr
123___
# Main body: allocate a page-aligned stack frame, save r20-r31, then run the
# classic Montgomery word-by-word multiply -- L1st computes the first outer
# iteration, Louter/Linner accumulate bp[i]*ap + m1*np into tp[], and the
# Lsub/Lcopy epilogue performs the conditional final subtraction and zaps
# tp[].  Heredoc contents are emitted verbatim; do not edit in place.
124$code.=<<___;
125 slwi $num,$num,`log($BNSZ)/log(2)`
126 li $tj,-4096
127 addi $ovf,$num,$FRAME
128 subf $ovf,$ovf,$sp ; $sp-$ovf
129 and $ovf,$ovf,$tj ; minimize TLB usage
130 subf $ovf,$sp,$ovf ; $ovf-$sp
131 mr $tj,$sp
132 srwi $num,$num,`log($BNSZ)/log(2)`
133 $STUX $sp,$sp,$ovf
134
135 $PUSH r20,`-12*$SIZE_T`($tj)
136 $PUSH r21,`-11*$SIZE_T`($tj)
137 $PUSH r22,`-10*$SIZE_T`($tj)
138 $PUSH r23,`-9*$SIZE_T`($tj)
139 $PUSH r24,`-8*$SIZE_T`($tj)
140 $PUSH r25,`-7*$SIZE_T`($tj)
141 $PUSH r26,`-6*$SIZE_T`($tj)
142 $PUSH r27,`-5*$SIZE_T`($tj)
143 $PUSH r28,`-4*$SIZE_T`($tj)
144 $PUSH r29,`-3*$SIZE_T`($tj)
145 $PUSH r30,`-2*$SIZE_T`($tj)
146 $PUSH r31,`-1*$SIZE_T`($tj)
147
148 $LD $n0,0($n0) ; pull n0[0] value
149 addi $num,$num,-2 ; adjust $num for counter register
150
151 $LD $m0,0($bp) ; m0=bp[0]
152 $LD $aj,0($ap) ; ap[0]
153 addi $tp,$sp,$LOCALS
154 $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
155 $UMULH $hi0,$aj,$m0
156
157 $LD $aj,$BNSZ($ap) ; ap[1]
158 $LD $nj,0($np) ; np[0]
159
160 $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0
161
162 $UMULL $alo,$aj,$m0 ; ap[1]*bp[0]
163 $UMULH $ahi,$aj,$m0
164
165 $UMULL $lo1,$nj,$m1 ; np[0]*m1
166 $UMULH $hi1,$nj,$m1
167 $LD $nj,$BNSZ($np) ; np[1]
168 addc $lo1,$lo1,$lo0
169 addze $hi1,$hi1
170
171 $UMULL $nlo,$nj,$m1 ; np[1]*m1
172 $UMULH $nhi,$nj,$m1
173
174 mtctr $num
175 li $j,`2*$BNSZ`
176.align 4
177L1st:
178 $LDX $aj,$ap,$j ; ap[j]
179 addc $lo0,$alo,$hi0
180 $LDX $nj,$np,$j ; np[j]
181 addze $hi0,$ahi
182 $UMULL $alo,$aj,$m0 ; ap[j]*bp[0]
183 addc $lo1,$nlo,$hi1
184 $UMULH $ahi,$aj,$m0
185 addze $hi1,$nhi
186 $UMULL $nlo,$nj,$m1 ; np[j]*m1
187 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
188 $UMULH $nhi,$nj,$m1
189 addze $hi1,$hi1
190 $ST $lo1,0($tp) ; tp[j-1]
191
192 addi $j,$j,$BNSZ ; j++
193 addi $tp,$tp,$BNSZ ; tp++
194 bdnz- L1st
195;L1st
196 addc $lo0,$alo,$hi0
197 addze $hi0,$ahi
198
199 addc $lo1,$nlo,$hi1
200 addze $hi1,$nhi
201 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
202 addze $hi1,$hi1
203 $ST $lo1,0($tp) ; tp[j-1]
204
205 li $ovf,0
206 addc $hi1,$hi1,$hi0
207 addze $ovf,$ovf ; upmost overflow bit
208 $ST $hi1,$BNSZ($tp)
209
210 li $i,$BNSZ
211.align 4
212Louter:
213 $LDX $m0,$bp,$i ; m0=bp[i]
214 $LD $aj,0($ap) ; ap[0]
215 addi $tp,$sp,$LOCALS
216 $LD $tj,$LOCALS($sp); tp[0]
217 $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
218 $UMULH $hi0,$aj,$m0
219 $LD $aj,$BNSZ($ap) ; ap[1]
220 $LD $nj,0($np) ; np[0]
221 addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0]
222 $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
223 addze $hi0,$hi0
224 $UMULL $m1,$lo0,$n0 ; tp[0]*n0
225 $UMULH $ahi,$aj,$m0
226 $UMULL $lo1,$nj,$m1 ; np[0]*m1
227 $UMULH $hi1,$nj,$m1
228 $LD $nj,$BNSZ($np) ; np[1]
229 addc $lo1,$lo1,$lo0
230 $UMULL $nlo,$nj,$m1 ; np[1]*m1
231 addze $hi1,$hi1
232 $UMULH $nhi,$nj,$m1
233
234 mtctr $num
235 li $j,`2*$BNSZ`
236.align 4
237Linner:
238 $LDX $aj,$ap,$j ; ap[j]
239 addc $lo0,$alo,$hi0
240 $LD $tj,$BNSZ($tp) ; tp[j]
241 addze $hi0,$ahi
242 $LDX $nj,$np,$j ; np[j]
243 addc $lo1,$nlo,$hi1
244 $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
245 addze $hi1,$nhi
246 $UMULH $ahi,$aj,$m0
247 addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
248 $UMULL $nlo,$nj,$m1 ; np[j]*m1
249 addze $hi0,$hi0
250 $UMULH $nhi,$nj,$m1
251 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
252 addi $j,$j,$BNSZ ; j++
253 addze $hi1,$hi1
254 $ST $lo1,0($tp) ; tp[j-1]
255 addi $tp,$tp,$BNSZ ; tp++
256 bdnz- Linner
257;Linner
258 $LD $tj,$BNSZ($tp) ; tp[j]
259 addc $lo0,$alo,$hi0
260 addze $hi0,$ahi
261 addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
262 addze $hi0,$hi0
263
264 addc $lo1,$nlo,$hi1
265 addze $hi1,$nhi
266 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
267 addze $hi1,$hi1
268 $ST $lo1,0($tp) ; tp[j-1]
269
270 addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA]
271 li $ovf,0
272 adde $hi1,$hi1,$hi0
273 addze $ovf,$ovf
274 $ST $hi1,$BNSZ($tp)
275;
276 slwi $tj,$num,`log($BNSZ)/log(2)`
277 $UCMP $i,$tj
278 addi $i,$i,$BNSZ
279 ble- Louter
280
281 addi $num,$num,2 ; restore $num
282 subfc $j,$j,$j ; j=0 and "clear" XER[CA]
283 addi $tp,$sp,$LOCALS
284 mtctr $num
285
286.align 4
287Lsub: $LDX $tj,$tp,$j
288 $LDX $nj,$np,$j
289 subfe $aj,$nj,$tj ; tp[j]-np[j]
290 $STX $aj,$rp,$j
291 addi $j,$j,$BNSZ
292 bdnz- Lsub
293
294 li $j,0
295 mtctr $num
296 subfe $ovf,$j,$ovf ; handle upmost overflow bit
297 and $ap,$tp,$ovf
298 andc $np,$rp,$ovf
299 or $ap,$ap,$np ; ap=borrow?tp:rp
300
301.align 4
302Lcopy: ; copy or in-place refresh
303 $LDX $tj,$ap,$j
304 $STX $tj,$rp,$j
305 $STX $j,$tp,$j ; zap at once
306 addi $j,$j,$BNSZ
307 bdnz- Lcopy
308
309 $POP $tj,0($sp)
310 li r3,1
311 $POP r20,`-12*$SIZE_T`($tj)
312 $POP r21,`-11*$SIZE_T`($tj)
313 $POP r22,`-10*$SIZE_T`($tj)
314 $POP r23,`-9*$SIZE_T`($tj)
315 $POP r24,`-8*$SIZE_T`($tj)
316 $POP r25,`-7*$SIZE_T`($tj)
317 $POP r26,`-6*$SIZE_T`($tj)
318 $POP r27,`-5*$SIZE_T`($tj)
319 $POP r28,`-4*$SIZE_T`($tj)
320 $POP r29,`-3*$SIZE_T`($tj)
321 $POP r30,`-2*$SIZE_T`($tj)
322 $POP r31,`-1*$SIZE_T`($tj)
323 mr $sp,$tj
324 blr
325 .long 0
326 .byte 0,12,4,0,0x80,12,6,0
327 .long 0
328
329.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
330___
331
# Expand backtick-quoted Perl expressions (e.g. `-12*$SIZE_T`) and hand the
# finished assembly to the ppc-xlate.pl pipe opened on STDOUT above.
332$code =~ s/\`([^\`]*)\`/eval $1/gem;
333print $code;
334close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/ppc.pl b/src/lib/libcrypto/bn/asm/ppc.pl
deleted file mode 100644
index 1249ce2299..0000000000
--- a/src/lib/libcrypto/bn/asm/ppc.pl
+++ /dev/null
@@ -1,1998 +0,0 @@
1#!/usr/bin/env perl
2#
3# Implemented as a Perl wrapper as we want to support several different
4# architectures with single file. We pick up the target based on the
5# file name we are asked to generate.
6#
7# It should be noted though that this perl code is nothing like
8# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
9# as pre-processor to cover for platform differences in name decoration,
10# linker tables, 32-/64-bit instruction sets...
11#
12# As you might know there're several PowerPC ABI in use. Most notably
13# Linux and AIX use different 32-bit ABIs. Good news are that these ABIs
14# are similar enough to implement leaf(!) functions, which would be ABI
15# neutral. And that's what you find here: ABI neutral leaf functions.
16# In case you wonder what that is...
17#
18# AIX performance
19#
20# MEASUREMENTS WITH cc ON a 200 MhZ PowerPC 604e.
21#
22# The following is the performance of 32-bit compiler
23# generated code:
24#
25# OpenSSL 0.9.6c 21 dec 2001
26# built on: Tue Jun 11 11:06:51 EDT 2002
27# options:bn(64,32) ...
28#compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
29# sign verify sign/s verify/s
30#rsa 512 bits 0.0098s 0.0009s 102.0 1170.6
31#rsa 1024 bits 0.0507s 0.0026s 19.7 387.5
32#rsa 2048 bits 0.3036s 0.0085s 3.3 117.1
33#rsa 4096 bits 2.0040s 0.0299s 0.5 33.4
34#dsa 512 bits 0.0087s 0.0106s 114.3 94.5
35#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
36#
37# Same bechmark with this assembler code:
38#
39#rsa 512 bits 0.0056s 0.0005s 178.6 2049.2
40#rsa 1024 bits 0.0283s 0.0015s 35.3 674.1
41#rsa 2048 bits 0.1744s 0.0050s 5.7 201.2
42#rsa 4096 bits 1.1644s 0.0179s 0.9 55.7
43#dsa 512 bits 0.0052s 0.0062s 191.6 162.0
44#dsa 1024 bits 0.0149s 0.0180s 67.0 55.5
45#
46# Number of operations increases by at almost 75%
47#
48# Here are performance numbers for 64-bit compiler
49# generated code:
50#
51# OpenSSL 0.9.6g [engine] 9 Aug 2002
52# built on: Fri Apr 18 16:59:20 EDT 2003
53# options:bn(64,64) ...
54# compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
55# sign verify sign/s verify/s
56#rsa 512 bits 0.0028s 0.0003s 357.1 3844.4
57#rsa 1024 bits 0.0148s 0.0008s 67.5 1239.7
58#rsa 2048 bits 0.0963s 0.0028s 10.4 353.0
59#rsa 4096 bits 0.6538s 0.0102s 1.5 98.1
60#dsa 512 bits 0.0026s 0.0032s 382.5 313.7
61#dsa 1024 bits 0.0081s 0.0099s 122.8 100.6
62#
63# Same benchmark with this assembler code:
64#
65#rsa 512 bits 0.0020s 0.0002s 510.4 6273.7
66#rsa 1024 bits 0.0088s 0.0005s 114.1 2128.3
67#rsa 2048 bits 0.0540s 0.0016s 18.5 622.5
68#rsa 4096 bits 0.3700s 0.0058s 2.7 171.0
69#dsa 512 bits 0.0016s 0.0020s 610.7 507.1
70#dsa 1024 bits 0.0047s 0.0058s 212.5 173.2
71#
72# Again, performance increases by at about 75%
73#
74# Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
75# OpenSSL 0.9.7c 30 Sep 2003
76#
77# Original code.
78#
79#rsa 512 bits 0.0011s 0.0001s 906.1 11012.5
80#rsa 1024 bits 0.0060s 0.0003s 166.6 3363.1
81#rsa 2048 bits 0.0370s 0.0010s 27.1 982.4
82#rsa 4096 bits 0.2426s 0.0036s 4.1 280.4
83#dsa 512 bits 0.0010s 0.0012s 1038.1 841.5
84#dsa 1024 bits 0.0030s 0.0037s 329.6 269.7
85#dsa 2048 bits 0.0101s 0.0127s 98.9 78.6
86#
87# Same benchmark with this assembler code:
88#
89#rsa 512 bits 0.0007s 0.0001s 1416.2 16645.9
90#rsa 1024 bits 0.0036s 0.0002s 274.4 5380.6
91#rsa 2048 bits 0.0222s 0.0006s 45.1 1589.5
92#rsa 4096 bits 0.1469s 0.0022s 6.8 449.6
93#dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2
94#dsa 1024 bits 0.0018s 0.0023s 545.0 442.2
95#dsa 2048 bits 0.0061s 0.0075s 163.5 132.8
96#
97# Performance increase of ~60%
98#
99# If you have comments or suggestions to improve code send
100# me a note at schari@us.ibm.com
101#
102
103$flavour = shift;
104
105if ($flavour =~ /32/) {
106 $BITS= 32;
107 $BNSZ= $BITS/8;
108 $ISA= "\"ppc\"";
109
110 $LD= "lwz"; # load
111 $LDU= "lwzu"; # load and update
112 $ST= "stw"; # store
113 $STU= "stwu"; # store and update
114 $UMULL= "mullw"; # unsigned multiply low
115 $UMULH= "mulhwu"; # unsigned multiply high
116 $UDIV= "divwu"; # unsigned divide
117 $UCMPI= "cmplwi"; # unsigned compare with immediate
118 $UCMP= "cmplw"; # unsigned compare
119 $CNTLZ= "cntlzw"; # count leading zeros
120 $SHL= "slw"; # shift left
121 $SHR= "srw"; # unsigned shift right
122 $SHRI= "srwi"; # unsigned shift right by immediate
123 $SHLI= "slwi"; # shift left by immediate
124 $CLRU= "clrlwi"; # clear upper bits
125 $INSR= "insrwi"; # insert right
126 $ROTL= "rotlwi"; # rotate left by immediate
127 $TR= "tw"; # conditional trap
128} elsif ($flavour =~ /64/) {
129 $BITS= 64;
130 $BNSZ= $BITS/8;
131 $ISA= "\"ppc64\"";
132
133 # same as above, but 64-bit mnemonics...
134 $LD= "ld"; # load
135 $LDU= "ldu"; # load and update
136 $ST= "std"; # store
137 $STU= "stdu"; # store and update
138 $UMULL= "mulld"; # unsigned multiply low
139 $UMULH= "mulhdu"; # unsigned multiply high
140 $UDIV= "divdu"; # unsigned divide
141 $UCMPI= "cmpldi"; # unsigned compare with immediate
142 $UCMP= "cmpld"; # unsigned compare
143 $CNTLZ= "cntlzd"; # count leading zeros
144 $SHL= "sld"; # shift left
145 $SHR= "srd"; # unsigned shift right
146 $SHRI= "srdi"; # unsigned shift right by immediate
147 $SHLI= "sldi"; # shift left by immediate
148 $CLRU= "clrldi"; # clear upper bits
149 $INSR= "insrdi"; # insert right
150 $ROTL= "rotldi"; # rotate left by immediate
151 $TR= "td"; # conditional trap
152} else { die "nonsense $flavour"; }
153
154$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
155( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
156( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
157die "can't locate ppc-xlate.pl";
158
159open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
160
161$data=<<EOF;
162#--------------------------------------------------------------------
163#
164#
165#
166#
167# File: ppc32.s
168#
169# Created by: Suresh Chari
170# IBM Thomas J. Watson Research Library
171# Hawthorne, NY
172#
173#
174# Description: Optimized assembly routines for OpenSSL crypto
175# on the 32 bitPowerPC platform.
176#
177#
178# Version History
179#
180# 2. Fixed bn_add,bn_sub and bn_div_words, added comments,
181# cleaned up code. Also made a single version which can
182# be used for both the AIX and Linux compilers. See NOTE
183# below.
184# 12/05/03 Suresh Chari
185# (with lots of help from) Andy Polyakov
186##
187# 1. Initial version 10/20/02 Suresh Chari
188#
189#
190# The following file works for the xlc,cc
191# and gcc compilers.
192#
193# NOTE: To get the file to link correctly with the gcc compiler
194# you have to change the names of the routines and remove
195# the first .(dot) character. This should automatically
196# be done in the build process.
197#
198# Hand optimized assembly code for the following routines
199#
200# bn_sqr_comba4
201# bn_sqr_comba8
202# bn_mul_comba4
203# bn_mul_comba8
204# bn_sub_words
205# bn_add_words
206# bn_div_words
207# bn_sqr_words
208# bn_mul_words
209# bn_mul_add_words
210#
211# NOTE: It is possible to optimize this code more for
212# specific PowerPC or Power architectures. On the Northstar
213# architecture the optimizations in this file do
214# NOT provide much improvement.
215#
216# If you have comments or suggestions to improve code send
217# me a note at schari\@us.ibm.com
218#
219#--------------------------------------------------------------------------
220#
221# Defines to be used in the assembly code.
222#
223#.set r0,0 # we use it as storage for value of 0
224#.set SP,1 # preserved
225#.set RTOC,2 # preserved
226#.set r3,3 # 1st argument/return value
227#.set r4,4 # 2nd argument/volatile register
228#.set r5,5 # 3rd argument/volatile register
229#.set r6,6 # ...
230#.set r7,7
231#.set r8,8
232#.set r9,9
233#.set r10,10
234#.set r11,11
235#.set r12,12
236#.set r13,13 # not used, nor any other "below" it...
237
238# Declare function names to be global
239# NOTE: For gcc these names MUST be changed to remove
240# the first . i.e. for example change ".bn_sqr_comba4"
241# to "bn_sqr_comba4". This should be automatically done
242# in the build.
243
244 .globl .bn_sqr_comba4
245 .globl .bn_sqr_comba8
246 .globl .bn_mul_comba4
247 .globl .bn_mul_comba8
248 .globl .bn_sub_words
249 .globl .bn_add_words
250 .globl .bn_div_words
251 .globl .bn_sqr_words
252 .globl .bn_mul_words
253 .globl .bn_mul_add_words
254
255# .text section
256
257 .machine "any"
258
259#
260# NOTE: The following label name should be changed to
261# "bn_sqr_comba4" i.e. remove the first dot
262# for the gcc compiler. This should be automatically
263# done in the build
264#
265
266.align 4
267.bn_sqr_comba4:
268#
269# Optimized version of bn_sqr_comba4.
270#
271# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
272# r3 contains r
273# r4 contains a
274#
275# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
276#
277# r5,r6 are the two BN_ULONGs being multiplied.
278# r7,r8 are the results of the 32x32 giving 64 bit multiply.
279# r9,r10, r11 are the equivalents of c1,c2, c3.
280# Here's the assembly
281#
282#
283 xor r0,r0,r0 # set r0 = 0. Used in the addze
284 # instructions below
285
286 #sqr_add_c(a,0,c1,c2,c3)
287 $LD r5,`0*$BNSZ`(r4)
288 $UMULL r9,r5,r5
289 $UMULH r10,r5,r5 #in first iteration. No need
290 #to add since c1=c2=c3=0.
291 # Note c3(r11) is NOT set to 0
292 # but will be.
293
294 $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
295 # sqr_add_c2(a,1,0,c2,c3,c1);
296 $LD r6,`1*$BNSZ`(r4)
297 $UMULL r7,r5,r6
298 $UMULH r8,r5,r6
299
300 addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8)
301 adde r8,r8,r8
302 addze r9,r0 # catch carry if any.
303 # r9= r0(=0) and carry
304
305 addc r10,r7,r10 # now add to temp result.
306 addze r11,r8 # r8 added to r11 which is 0
307 addze r9,r9
308
309 $ST r10,`1*$BNSZ`(r3) #r[1]=c2;
310 #sqr_add_c(a,1,c3,c1,c2)
311 $UMULL r7,r6,r6
312 $UMULH r8,r6,r6
313 addc r11,r7,r11
314 adde r9,r8,r9
315 addze r10,r0
316 #sqr_add_c2(a,2,0,c3,c1,c2)
317 $LD r6,`2*$BNSZ`(r4)
318 $UMULL r7,r5,r6
319 $UMULH r8,r5,r6
320
321 addc r7,r7,r7
322 adde r8,r8,r8
323 addze r10,r10
324
325 addc r11,r7,r11
326 adde r9,r8,r9
327 addze r10,r10
328 $ST r11,`2*$BNSZ`(r3) #r[2]=c3
329 #sqr_add_c2(a,3,0,c1,c2,c3);
330 $LD r6,`3*$BNSZ`(r4)
331 $UMULL r7,r5,r6
332 $UMULH r8,r5,r6
333 addc r7,r7,r7
334 adde r8,r8,r8
335 addze r11,r0
336
337 addc r9,r7,r9
338 adde r10,r8,r10
339 addze r11,r11
340 #sqr_add_c2(a,2,1,c1,c2,c3);
341 $LD r5,`1*$BNSZ`(r4)
342 $LD r6,`2*$BNSZ`(r4)
343 $UMULL r7,r5,r6
344 $UMULH r8,r5,r6
345
346 addc r7,r7,r7
347 adde r8,r8,r8
348 addze r11,r11
349 addc r9,r7,r9
350 adde r10,r8,r10
351 addze r11,r11
352 $ST r9,`3*$BNSZ`(r3) #r[3]=c1
353 #sqr_add_c(a,2,c2,c3,c1);
354 $UMULL r7,r6,r6
355 $UMULH r8,r6,r6
356 addc r10,r7,r10
357 adde r11,r8,r11
358 addze r9,r0
359 #sqr_add_c2(a,3,1,c2,c3,c1);
360 $LD r6,`3*$BNSZ`(r4)
361 $UMULL r7,r5,r6
362 $UMULH r8,r5,r6
363 addc r7,r7,r7
364 adde r8,r8,r8
365 addze r9,r9
366
367 addc r10,r7,r10
368 adde r11,r8,r11
369 addze r9,r9
370 $ST r10,`4*$BNSZ`(r3) #r[4]=c2
371 #sqr_add_c2(a,3,2,c3,c1,c2);
372 $LD r5,`2*$BNSZ`(r4)
373 $UMULL r7,r5,r6
374 $UMULH r8,r5,r6
375 addc r7,r7,r7
376 adde r8,r8,r8
377 addze r10,r0
378
379 addc r11,r7,r11
380 adde r9,r8,r9
381 addze r10,r10
382 $ST r11,`5*$BNSZ`(r3) #r[5] = c3
383 #sqr_add_c(a,3,c1,c2,c3);
384 $UMULL r7,r6,r6
385 $UMULH r8,r6,r6
386 addc r9,r7,r9
387 adde r10,r8,r10
388
389 $ST r9,`6*$BNSZ`(r3) #r[6]=c1
390 $ST r10,`7*$BNSZ`(r3) #r[7]=c2
391 blr
392 .long 0
393 .byte 0,12,0x14,0,0,0,2,0
394 .long 0
395
396#
397# NOTE: The following label name should be changed to
398# "bn_sqr_comba8" i.e. remove the first dot
399# for the gcc compiler. This should be automatically
400# done in the build
401#
402
403.align 4
404.bn_sqr_comba8:
405#
406# This is an optimized version of the bn_sqr_comba8 routine.
407# Tightly uses the adde instruction
408#
409#
410# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
411# r3 contains r
412# r4 contains a
413#
414# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
415#
416# r5,r6 are the two BN_ULONGs being multiplied.
417# r7,r8 are the results of the 32x32 giving 64 bit multiply.
418# r9,r10, r11 are the equivalents of c1,c2, c3.
419#
420# Possible optimization of loading all 8 longs of a into registers
421# doesnt provide any speedup
422#
423
424 xor r0,r0,r0 #set r0 = 0.Used in addze
425 #instructions below.
426
427 #sqr_add_c(a,0,c1,c2,c3);
428 $LD r5,`0*$BNSZ`(r4)
429 $UMULL r9,r5,r5 #1st iteration: no carries.
430 $UMULH r10,r5,r5
431 $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
432 #sqr_add_c2(a,1,0,c2,c3,c1);
433 $LD r6,`1*$BNSZ`(r4)
434 $UMULL r7,r5,r6
435 $UMULH r8,r5,r6
436
437 addc r10,r7,r10 #add the two register number
438 adde r11,r8,r0 # (r8,r7) to the three register
439 addze r9,r0 # number (r9,r11,r10).NOTE:r0=0
440
441 addc r10,r7,r10 #add the two register number
442 adde r11,r8,r11 # (r8,r7) to the three register
443 addze r9,r9 # number (r9,r11,r10).
444
445 $ST r10,`1*$BNSZ`(r3) # r[1]=c2
446
447 #sqr_add_c(a,1,c3,c1,c2);
448 $UMULL r7,r6,r6
449 $UMULH r8,r6,r6
450 addc r11,r7,r11
451 adde r9,r8,r9
452 addze r10,r0
453 #sqr_add_c2(a,2,0,c3,c1,c2);
454 $LD r6,`2*$BNSZ`(r4)
455 $UMULL r7,r5,r6
456 $UMULH r8,r5,r6
457
458 addc r11,r7,r11
459 adde r9,r8,r9
460 addze r10,r10
461
462 addc r11,r7,r11
463 adde r9,r8,r9
464 addze r10,r10
465
466 $ST r11,`2*$BNSZ`(r3) #r[2]=c3
467 #sqr_add_c2(a,3,0,c1,c2,c3);
468 $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0].
469 $UMULL r7,r5,r6
470 $UMULH r8,r5,r6
471
472 addc r9,r7,r9
473 adde r10,r8,r10
474 addze r11,r0
475
476 addc r9,r7,r9
477 adde r10,r8,r10
478 addze r11,r11
479 #sqr_add_c2(a,2,1,c1,c2,c3);
480 $LD r5,`1*$BNSZ`(r4)
481 $LD r6,`2*$BNSZ`(r4)
482 $UMULL r7,r5,r6
483 $UMULH r8,r5,r6
484
485 addc r9,r7,r9
486 adde r10,r8,r10
487 addze r11,r11
488
489 addc r9,r7,r9
490 adde r10,r8,r10
491 addze r11,r11
492
493 $ST r9,`3*$BNSZ`(r3) #r[3]=c1;
494 #sqr_add_c(a,2,c2,c3,c1);
495 $UMULL r7,r6,r6
496 $UMULH r8,r6,r6
497
498 addc r10,r7,r10
499 adde r11,r8,r11
500 addze r9,r0
501 #sqr_add_c2(a,3,1,c2,c3,c1);
502 $LD r6,`3*$BNSZ`(r4)
503 $UMULL r7,r5,r6
504 $UMULH r8,r5,r6
505
506 addc r10,r7,r10
507 adde r11,r8,r11
508 addze r9,r9
509
510 addc r10,r7,r10
511 adde r11,r8,r11
512 addze r9,r9
513 #sqr_add_c2(a,4,0,c2,c3,c1);
514 $LD r5,`0*$BNSZ`(r4)
515 $LD r6,`4*$BNSZ`(r4)
516 $UMULL r7,r5,r6
517 $UMULH r8,r5,r6
518
519 addc r10,r7,r10
520 adde r11,r8,r11
521 addze r9,r9
522
523 addc r10,r7,r10
524 adde r11,r8,r11
525 addze r9,r9
526 $ST r10,`4*$BNSZ`(r3) #r[4]=c2;
527 #sqr_add_c2(a,5,0,c3,c1,c2);
528 $LD r6,`5*$BNSZ`(r4)
529 $UMULL r7,r5,r6
530 $UMULH r8,r5,r6
531
532 addc r11,r7,r11
533 adde r9,r8,r9
534 addze r10,r0
535
536 addc r11,r7,r11
537 adde r9,r8,r9
538 addze r10,r10
539 #sqr_add_c2(a,4,1,c3,c1,c2);
540 $LD r5,`1*$BNSZ`(r4)
541 $LD r6,`4*$BNSZ`(r4)
542 $UMULL r7,r5,r6
543 $UMULH r8,r5,r6
544
545 addc r11,r7,r11
546 adde r9,r8,r9
547 addze r10,r10
548
549 addc r11,r7,r11
550 adde r9,r8,r9
551 addze r10,r10
552 #sqr_add_c2(a,3,2,c3,c1,c2);
553 $LD r5,`2*$BNSZ`(r4)
554 $LD r6,`3*$BNSZ`(r4)
555 $UMULL r7,r5,r6
556 $UMULH r8,r5,r6
557
558 addc r11,r7,r11
559 adde r9,r8,r9
560 addze r10,r10
561
562 addc r11,r7,r11
563 adde r9,r8,r9
564 addze r10,r10
565 $ST r11,`5*$BNSZ`(r3) #r[5]=c3;
566 #sqr_add_c(a,3,c1,c2,c3);
567 $UMULL r7,r6,r6
568 $UMULH r8,r6,r6
569 addc r9,r7,r9
570 adde r10,r8,r10
571 addze r11,r0
572 #sqr_add_c2(a,4,2,c1,c2,c3);
573 $LD r6,`4*$BNSZ`(r4)
574 $UMULL r7,r5,r6
575 $UMULH r8,r5,r6
576
577 addc r9,r7,r9
578 adde r10,r8,r10
579 addze r11,r11
580
581 addc r9,r7,r9
582 adde r10,r8,r10
583 addze r11,r11
584 #sqr_add_c2(a,5,1,c1,c2,c3);
585 $LD r5,`1*$BNSZ`(r4)
586 $LD r6,`5*$BNSZ`(r4)
587 $UMULL r7,r5,r6
588 $UMULH r8,r5,r6
589
590 addc r9,r7,r9
591 adde r10,r8,r10
592 addze r11,r11
593
594 addc r9,r7,r9
595 adde r10,r8,r10
596 addze r11,r11
597 #sqr_add_c2(a,6,0,c1,c2,c3);
598 $LD r5,`0*$BNSZ`(r4)
599 $LD r6,`6*$BNSZ`(r4)
600 $UMULL r7,r5,r6
601 $UMULH r8,r5,r6
602 addc r9,r7,r9
603 adde r10,r8,r10
604 addze r11,r11
605 addc r9,r7,r9
606 adde r10,r8,r10
607 addze r11,r11
608 $ST r9,`6*$BNSZ`(r3) #r[6]=c1;
609 #sqr_add_c2(a,7,0,c2,c3,c1);
610 $LD r6,`7*$BNSZ`(r4)
611 $UMULL r7,r5,r6
612 $UMULH r8,r5,r6
613
614 addc r10,r7,r10
615 adde r11,r8,r11
616 addze r9,r0
617 addc r10,r7,r10
618 adde r11,r8,r11
619 addze r9,r9
620 #sqr_add_c2(a,6,1,c2,c3,c1);
621 $LD r5,`1*$BNSZ`(r4)
622 $LD r6,`6*$BNSZ`(r4)
623 $UMULL r7,r5,r6
624 $UMULH r8,r5,r6
625
626 addc r10,r7,r10
627 adde r11,r8,r11
628 addze r9,r9
629 addc r10,r7,r10
630 adde r11,r8,r11
631 addze r9,r9
632 #sqr_add_c2(a,5,2,c2,c3,c1);
633 $LD r5,`2*$BNSZ`(r4)
634 $LD r6,`5*$BNSZ`(r4)
635 $UMULL r7,r5,r6
636 $UMULH r8,r5,r6
637 addc r10,r7,r10
638 adde r11,r8,r11
639 addze r9,r9
640 addc r10,r7,r10
641 adde r11,r8,r11
642 addze r9,r9
643 #sqr_add_c2(a,4,3,c2,c3,c1);
644 $LD r5,`3*$BNSZ`(r4)
645 $LD r6,`4*$BNSZ`(r4)
646 $UMULL r7,r5,r6
647 $UMULH r8,r5,r6
648
649 addc r10,r7,r10
650 adde r11,r8,r11
651 addze r9,r9
652 addc r10,r7,r10
653 adde r11,r8,r11
654 addze r9,r9
655 $ST r10,`7*$BNSZ`(r3) #r[7]=c2;
656 #sqr_add_c(a,4,c3,c1,c2);
657 $UMULL r7,r6,r6
658 $UMULH r8,r6,r6
659 addc r11,r7,r11
660 adde r9,r8,r9
661 addze r10,r0
662 #sqr_add_c2(a,5,3,c3,c1,c2);
663 $LD r6,`5*$BNSZ`(r4)
664 $UMULL r7,r5,r6
665 $UMULH r8,r5,r6
666 addc r11,r7,r11
667 adde r9,r8,r9
668 addze r10,r10
669 addc r11,r7,r11
670 adde r9,r8,r9
671 addze r10,r10
672 #sqr_add_c2(a,6,2,c3,c1,c2);
673 $LD r5,`2*$BNSZ`(r4)
674 $LD r6,`6*$BNSZ`(r4)
675 $UMULL r7,r5,r6
676 $UMULH r8,r5,r6
677 addc r11,r7,r11
678 adde r9,r8,r9
679 addze r10,r10
680
681 addc r11,r7,r11
682 adde r9,r8,r9
683 addze r10,r10
684 #sqr_add_c2(a,7,1,c3,c1,c2);
685 $LD r5,`1*$BNSZ`(r4)
686 $LD r6,`7*$BNSZ`(r4)
687 $UMULL r7,r5,r6
688 $UMULH r8,r5,r6
689 addc r11,r7,r11
690 adde r9,r8,r9
691 addze r10,r10
692 addc r11,r7,r11
693 adde r9,r8,r9
694 addze r10,r10
695 $ST r11,`8*$BNSZ`(r3) #r[8]=c3;
696 #sqr_add_c2(a,7,2,c1,c2,c3);
697 $LD r5,`2*$BNSZ`(r4)
698 $UMULL r7,r5,r6
699 $UMULH r8,r5,r6
700
701 addc r9,r7,r9
702 adde r10,r8,r10
703 addze r11,r0
704 addc r9,r7,r9
705 adde r10,r8,r10
706 addze r11,r11
707 #sqr_add_c2(a,6,3,c1,c2,c3);
708 $LD r5,`3*$BNSZ`(r4)
709 $LD r6,`6*$BNSZ`(r4)
710 $UMULL r7,r5,r6
711 $UMULH r8,r5,r6
712 addc r9,r7,r9
713 adde r10,r8,r10
714 addze r11,r11
715 addc r9,r7,r9
716 adde r10,r8,r10
717 addze r11,r11
718 #sqr_add_c2(a,5,4,c1,c2,c3);
719 $LD r5,`4*$BNSZ`(r4)
720 $LD r6,`5*$BNSZ`(r4)
721 $UMULL r7,r5,r6
722 $UMULH r8,r5,r6
723 addc r9,r7,r9
724 adde r10,r8,r10
725 addze r11,r11
726 addc r9,r7,r9
727 adde r10,r8,r10
728 addze r11,r11
729 $ST r9,`9*$BNSZ`(r3) #r[9]=c1;
730 #sqr_add_c(a,5,c2,c3,c1);
731 $UMULL r7,r6,r6
732 $UMULH r8,r6,r6
733 addc r10,r7,r10
734 adde r11,r8,r11
735 addze r9,r0
736 #sqr_add_c2(a,6,4,c2,c3,c1);
737 $LD r6,`6*$BNSZ`(r4)
738 $UMULL r7,r5,r6
739 $UMULH r8,r5,r6
740 addc r10,r7,r10
741 adde r11,r8,r11
742 addze r9,r9
743 addc r10,r7,r10
744 adde r11,r8,r11
745 addze r9,r9
746 #sqr_add_c2(a,7,3,c2,c3,c1);
747 $LD r5,`3*$BNSZ`(r4)
748 $LD r6,`7*$BNSZ`(r4)
749 $UMULL r7,r5,r6
750 $UMULH r8,r5,r6
751 addc r10,r7,r10
752 adde r11,r8,r11
753 addze r9,r9
754 addc r10,r7,r10
755 adde r11,r8,r11
756 addze r9,r9
757 $ST r10,`10*$BNSZ`(r3) #r[10]=c2;
758 #sqr_add_c2(a,7,4,c3,c1,c2);
759 $LD r5,`4*$BNSZ`(r4)
760 $UMULL r7,r5,r6
761 $UMULH r8,r5,r6
762 addc r11,r7,r11
763 adde r9,r8,r9
764 addze r10,r0
765 addc r11,r7,r11
766 adde r9,r8,r9
767 addze r10,r10
768 #sqr_add_c2(a,6,5,c3,c1,c2);
769 $LD r5,`5*$BNSZ`(r4)
770 $LD r6,`6*$BNSZ`(r4)
771 $UMULL r7,r5,r6
772 $UMULH r8,r5,r6
773 addc r11,r7,r11
774 adde r9,r8,r9
775 addze r10,r10
776 addc r11,r7,r11
777 adde r9,r8,r9
778 addze r10,r10
779 $ST r11,`11*$BNSZ`(r3) #r[11]=c3;
780 #sqr_add_c(a,6,c1,c2,c3);
781 $UMULL r7,r6,r6
782 $UMULH r8,r6,r6
783 addc r9,r7,r9
784 adde r10,r8,r10
785 addze r11,r0
786 #sqr_add_c2(a,7,5,c1,c2,c3)
787 $LD r6,`7*$BNSZ`(r4)
788 $UMULL r7,r5,r6
789 $UMULH r8,r5,r6
790 addc r9,r7,r9
791 adde r10,r8,r10
792 addze r11,r11
793 addc r9,r7,r9
794 adde r10,r8,r10
795 addze r11,r11
796 $ST r9,`12*$BNSZ`(r3) #r[12]=c1;
797
798 #sqr_add_c2(a,7,6,c2,c3,c1)
799 $LD r5,`6*$BNSZ`(r4)
800 $UMULL r7,r5,r6
801 $UMULH r8,r5,r6
802 addc r10,r7,r10
803 adde r11,r8,r11
804 addze r9,r0
805 addc r10,r7,r10
806 adde r11,r8,r11
807 addze r9,r9
808 $ST r10,`13*$BNSZ`(r3) #r[13]=c2;
809 #sqr_add_c(a,7,c3,c1,c2);
810 $UMULL r7,r6,r6
811 $UMULH r8,r6,r6
812 addc r11,r7,r11
813 adde r9,r8,r9
814 $ST r11,`14*$BNSZ`(r3) #r[14]=c3;
815 $ST r9, `15*$BNSZ`(r3) #r[15]=c1;
816
817
818 blr
819 .long 0
820 .byte 0,12,0x14,0,0,0,2,0
821 .long 0
822
823#
824# NOTE: The following label name should be changed to
825# "bn_mul_comba4" i.e. remove the first dot
826# for the gcc compiler. This should be automatically
827# done in the build
828#
829
830.align 4
831.bn_mul_comba4:
832#
833# This is an optimized version of the bn_mul_comba4 routine.
834#
835# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
836# r3 contains r
837# r4 contains a
838# r5 contains b
839# r6, r7 are the 2 BN_ULONGs being multiplied.
840# r8, r9 are the results of the 32x32 giving 64 multiply.
841# r10, r11, r12 are the equivalents of c1, c2, and c3.
842#
843 xor r0,r0,r0 #r0=0. Used in addze below.
844 #mul_add_c(a[0],b[0],c1,c2,c3);
845 $LD r6,`0*$BNSZ`(r4)
846 $LD r7,`0*$BNSZ`(r5)
847 $UMULL r10,r6,r7
848 $UMULH r11,r6,r7
849 $ST r10,`0*$BNSZ`(r3) #r[0]=c1
850 #mul_add_c(a[0],b[1],c2,c3,c1);
851 $LD r7,`1*$BNSZ`(r5)
852 $UMULL r8,r6,r7
853 $UMULH r9,r6,r7
854 addc r11,r8,r11
855 adde r12,r9,r0
856 addze r10,r0
857 #mul_add_c(a[1],b[0],c2,c3,c1);
858 $LD r6, `1*$BNSZ`(r4)
859 $LD r7, `0*$BNSZ`(r5)
860 $UMULL r8,r6,r7
861 $UMULH r9,r6,r7
862 addc r11,r8,r11
863 adde r12,r9,r12
864 addze r10,r10
865 $ST r11,`1*$BNSZ`(r3) #r[1]=c2
866 #mul_add_c(a[2],b[0],c3,c1,c2);
867 $LD r6,`2*$BNSZ`(r4)
868 $UMULL r8,r6,r7
869 $UMULH r9,r6,r7
870 addc r12,r8,r12
871 adde r10,r9,r10
872 addze r11,r0
873 #mul_add_c(a[1],b[1],c3,c1,c2);
874 $LD r6,`1*$BNSZ`(r4)
875 $LD r7,`1*$BNSZ`(r5)
876 $UMULL r8,r6,r7
877 $UMULH r9,r6,r7
878 addc r12,r8,r12
879 adde r10,r9,r10
880 addze r11,r11
881 #mul_add_c(a[0],b[2],c3,c1,c2);
882 $LD r6,`0*$BNSZ`(r4)
883 $LD r7,`2*$BNSZ`(r5)
884 $UMULL r8,r6,r7
885 $UMULH r9,r6,r7
886 addc r12,r8,r12
887 adde r10,r9,r10
888 addze r11,r11
889 $ST r12,`2*$BNSZ`(r3) #r[2]=c3
890 #mul_add_c(a[0],b[3],c1,c2,c3);
891 $LD r7,`3*$BNSZ`(r5)
892 $UMULL r8,r6,r7
893 $UMULH r9,r6,r7
894 addc r10,r8,r10
895 adde r11,r9,r11
896 addze r12,r0
897 #mul_add_c(a[1],b[2],c1,c2,c3);
898 $LD r6,`1*$BNSZ`(r4)
899 $LD r7,`2*$BNSZ`(r5)
900 $UMULL r8,r6,r7
901 $UMULH r9,r6,r7
902 addc r10,r8,r10
903 adde r11,r9,r11
904 addze r12,r12
905 #mul_add_c(a[2],b[1],c1,c2,c3);
906 $LD r6,`2*$BNSZ`(r4)
907 $LD r7,`1*$BNSZ`(r5)
908 $UMULL r8,r6,r7
909 $UMULH r9,r6,r7
910 addc r10,r8,r10
911 adde r11,r9,r11
912 addze r12,r12
913 #mul_add_c(a[3],b[0],c1,c2,c3);
914 $LD r6,`3*$BNSZ`(r4)
915 $LD r7,`0*$BNSZ`(r5)
916 $UMULL r8,r6,r7
917 $UMULH r9,r6,r7
918 addc r10,r8,r10
919 adde r11,r9,r11
920 addze r12,r12
921 $ST r10,`3*$BNSZ`(r3) #r[3]=c1
922 #mul_add_c(a[3],b[1],c2,c3,c1);
923 $LD r7,`1*$BNSZ`(r5)
924 $UMULL r8,r6,r7
925 $UMULH r9,r6,r7
926 addc r11,r8,r11
927 adde r12,r9,r12
928 addze r10,r0
929 #mul_add_c(a[2],b[2],c2,c3,c1);
930 $LD r6,`2*$BNSZ`(r4)
931 $LD r7,`2*$BNSZ`(r5)
932 $UMULL r8,r6,r7
933 $UMULH r9,r6,r7
934 addc r11,r8,r11
935 adde r12,r9,r12
936 addze r10,r10
937 #mul_add_c(a[1],b[3],c2,c3,c1);
938 $LD r6,`1*$BNSZ`(r4)
939 $LD r7,`3*$BNSZ`(r5)
940 $UMULL r8,r6,r7
941 $UMULH r9,r6,r7
942 addc r11,r8,r11
943 adde r12,r9,r12
944 addze r10,r10
945 $ST r11,`4*$BNSZ`(r3) #r[4]=c2
946 #mul_add_c(a[2],b[3],c3,c1,c2);
947 $LD r6,`2*$BNSZ`(r4)
948 $UMULL r8,r6,r7
949 $UMULH r9,r6,r7
950 addc r12,r8,r12
951 adde r10,r9,r10
952 addze r11,r0
953 #mul_add_c(a[3],b[2],c3,c1,c2);
954 $LD r6,`3*$BNSZ`(r4)
955 $LD r7,`2*$BNSZ`(r5)
956 $UMULL r8,r6,r7
957 $UMULH r9,r6,r7
958 addc r12,r8,r12
959 adde r10,r9,r10
960 addze r11,r11
961 $ST r12,`5*$BNSZ`(r3) #r[5]=c3
962 #mul_add_c(a[3],b[3],c1,c2,c3);
963 $LD r7,`3*$BNSZ`(r5)
964 $UMULL r8,r6,r7
965 $UMULH r9,r6,r7
966 addc r10,r8,r10
967 adde r11,r9,r11
968
969 $ST r10,`6*$BNSZ`(r3) #r[6]=c1
970 $ST r11,`7*$BNSZ`(r3) #r[7]=c2
971 blr
972 .long 0
973 .byte 0,12,0x14,0,0,0,3,0
974 .long 0
975
976#
977# NOTE: The following label name should be changed to
978# "bn_mul_comba8" i.e. remove the first dot
979# for the gcc compiler. This should be automatically
980# done in the build
981#
982
 983	.align	4
 984.bn_mul_comba8:
 985#
 986# Optimized version of the bn_mul_comba8 routine.
 987#
 988# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 989# r3 contains r
 990# r4 contains a
 991# r5 contains b
 992# r6, r7 are the 2 BN_ULONGs being multiplied.
 993# r8, r9 are the results of the 32x32 giving 64 multiply.
 994# r10, r11, r12 are the equivalents of c1, c2, and c3.
 995# Column-wise (comba) 8x8-word multiply; r gets 16 result words.
 996	xor	r0,r0,r0		#r0=0. Used in addze below.
 997
 998	#mul_add_c(a[0],b[0],c1,c2,c3);
 999	$LD	r6,`0*$BNSZ`(r4)	#a[0]
1000	$LD	r7,`0*$BNSZ`(r5)	#b[0]
1001	$UMULL	r10,r6,r7
1002	$UMULH	r11,r6,r7
1003	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1;
1004	#mul_add_c(a[0],b[1],c2,c3,c1);
1005	$LD	r7,`1*$BNSZ`(r5)	# r6 still holds a[0]
1006	$UMULL	r8,r6,r7
1007	$UMULH	r9,r6,r7
1008	addc	r11,r11,r8
1009	addze	r12,r9			# since we didn't set r12 to zero before.
1010	addze	r10,r0
1011	#mul_add_c(a[1],b[0],c2,c3,c1);
1012	$LD	r6, `1*$BNSZ`(r4)
1013	$LD	r7, `0*$BNSZ`(r5)
1014	$UMULL	r8,r6,r7
1015	$UMULH	r9,r6,r7
1016	addc	r11,r11,r8
1017	adde	r12,r12,r9
1018	addze	r10,r10
1019	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2;
1020	#mul_add_c(a[2],b[0],c3,c1,c2);
1021	$LD	r6,`2*$BNSZ`(r4)	# r7 still holds b[0]
1022	$UMULL	r8,r6,r7
1023	$UMULH	r9,r6,r7
1024	addc	r12,r12,r8
1025	adde	r10,r10,r9
1026	addze	r11,r0
1027	#mul_add_c(a[1],b[1],c3,c1,c2);
1028	$LD	r6,`1*$BNSZ`(r4)
1029	$LD	r7,`1*$BNSZ`(r5)
1030	$UMULL	r8,r6,r7
1031	$UMULH	r9,r6,r7
1032	addc	r12,r12,r8
1033	adde	r10,r10,r9
1034	addze	r11,r11
1035	#mul_add_c(a[0],b[2],c3,c1,c2);
1036	$LD	r6,`0*$BNSZ`(r4)
1037	$LD	r7,`2*$BNSZ`(r5)
1038	$UMULL	r8,r6,r7
1039	$UMULH	r9,r6,r7
1040	addc	r12,r12,r8
1041	adde	r10,r10,r9
1042	addze	r11,r11
1043	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3;
1044	#mul_add_c(a[0],b[3],c1,c2,c3);
1045	$LD	r7,`3*$BNSZ`(r5)	# r6 still holds a[0]
1046	$UMULL	r8,r6,r7
1047	$UMULH	r9,r6,r7
1048	addc	r10,r10,r8
1049	adde	r11,r11,r9
1050	addze	r12,r0
1051	#mul_add_c(a[1],b[2],c1,c2,c3);
1052	$LD	r6,`1*$BNSZ`(r4)
1053	$LD	r7,`2*$BNSZ`(r5)
1054	$UMULL	r8,r6,r7
1055	$UMULH	r9,r6,r7
1056	addc	r10,r10,r8
1057	adde	r11,r11,r9
1058	addze	r12,r12
1059		
1060	#mul_add_c(a[2],b[1],c1,c2,c3);
1061	$LD	r6,`2*$BNSZ`(r4)
1062	$LD	r7,`1*$BNSZ`(r5)
1063	$UMULL	r8,r6,r7
1064	$UMULH	r9,r6,r7
1065	addc	r10,r10,r8
1066	adde	r11,r11,r9
1067	addze	r12,r12
1068	#mul_add_c(a[3],b[0],c1,c2,c3);
1069	$LD	r6,`3*$BNSZ`(r4)
1070	$LD	r7,`0*$BNSZ`(r5)
1071	$UMULL	r8,r6,r7
1072	$UMULH	r9,r6,r7
1073	addc	r10,r10,r8
1074	adde	r11,r11,r9
1075	addze	r12,r12
1076	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1;
1077	#mul_add_c(a[4],b[0],c2,c3,c1);
1078	$LD	r6,`4*$BNSZ`(r4)	# r7 still holds b[0]
1079	$UMULL	r8,r6,r7
1080	$UMULH	r9,r6,r7
1081	addc	r11,r11,r8
1082	adde	r12,r12,r9
1083	addze	r10,r0
1084	#mul_add_c(a[3],b[1],c2,c3,c1);
1085	$LD	r6,`3*$BNSZ`(r4)
1086	$LD	r7,`1*$BNSZ`(r5)
1087	$UMULL	r8,r6,r7
1088	$UMULH	r9,r6,r7
1089	addc	r11,r11,r8
1090	adde	r12,r12,r9
1091	addze	r10,r10
1092	#mul_add_c(a[2],b[2],c2,c3,c1);
1093	$LD	r6,`2*$BNSZ`(r4)
1094	$LD	r7,`2*$BNSZ`(r5)
1095	$UMULL	r8,r6,r7
1096	$UMULH	r9,r6,r7
1097	addc	r11,r11,r8
1098	adde	r12,r12,r9
1099	addze	r10,r10
1100	#mul_add_c(a[1],b[3],c2,c3,c1);
1101	$LD	r6,`1*$BNSZ`(r4)
1102	$LD	r7,`3*$BNSZ`(r5)
1103	$UMULL	r8,r6,r7
1104	$UMULH	r9,r6,r7
1105	addc	r11,r11,r8
1106	adde	r12,r12,r9
1107	addze	r10,r10
1108	#mul_add_c(a[0],b[4],c2,c3,c1);
1109	$LD	r6,`0*$BNSZ`(r4)
1110	$LD	r7,`4*$BNSZ`(r5)
1111	$UMULL	r8,r6,r7
1112	$UMULH	r9,r6,r7
1113	addc	r11,r11,r8
1114	adde	r12,r12,r9
1115	addze	r10,r10
1116	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2;
1117	#mul_add_c(a[0],b[5],c3,c1,c2);
1118	$LD	r7,`5*$BNSZ`(r5)	# r6 still holds a[0]
1119	$UMULL	r8,r6,r7
1120	$UMULH	r9,r6,r7
1121	addc	r12,r12,r8
1122	adde	r10,r10,r9
1123	addze	r11,r0
1124	#mul_add_c(a[1],b[4],c3,c1,c2);
1125	$LD	r6,`1*$BNSZ`(r4)		
1126	$LD	r7,`4*$BNSZ`(r5)
1127	$UMULL	r8,r6,r7
1128	$UMULH	r9,r6,r7
1129	addc	r12,r12,r8
1130	adde	r10,r10,r9
1131	addze	r11,r11
1132	#mul_add_c(a[2],b[3],c3,c1,c2);
1133	$LD	r6,`2*$BNSZ`(r4)
1134	$LD	r7,`3*$BNSZ`(r5)
1135	$UMULL	r8,r6,r7
1136	$UMULH	r9,r6,r7
1137	addc	r12,r12,r8
1138	adde	r10,r10,r9
1139	addze	r11,r11
1140	#mul_add_c(a[3],b[2],c3,c1,c2);
1141	$LD	r6,`3*$BNSZ`(r4)
1142	$LD	r7,`2*$BNSZ`(r5)
1143	$UMULL	r8,r6,r7
1144	$UMULH	r9,r6,r7
1145	addc	r12,r12,r8
1146	adde	r10,r10,r9
1147	addze	r11,r11
1148	#mul_add_c(a[4],b[1],c3,c1,c2);
1149	$LD	r6,`4*$BNSZ`(r4)
1150	$LD	r7,`1*$BNSZ`(r5)
1151	$UMULL	r8,r6,r7
1152	$UMULH	r9,r6,r7
1153	addc	r12,r12,r8
1154	adde	r10,r10,r9
1155	addze	r11,r11
1156	#mul_add_c(a[5],b[0],c3,c1,c2);
1157	$LD	r6,`5*$BNSZ`(r4)
1158	$LD	r7,`0*$BNSZ`(r5)
1159	$UMULL	r8,r6,r7
1160	$UMULH	r9,r6,r7
1161	addc	r12,r12,r8
1162	adde	r10,r10,r9
1163	addze	r11,r11
1164	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3;
1165	#mul_add_c(a[6],b[0],c1,c2,c3);
1166	$LD	r6,`6*$BNSZ`(r4)	# r7 still holds b[0]
1167	$UMULL	r8,r6,r7
1168	$UMULH	r9,r6,r7
1169	addc	r10,r10,r8
1170	adde	r11,r11,r9
1171	addze	r12,r0
1172	#mul_add_c(a[5],b[1],c1,c2,c3);
1173	$LD	r6,`5*$BNSZ`(r4)
1174	$LD	r7,`1*$BNSZ`(r5)
1175	$UMULL	r8,r6,r7
1176	$UMULH	r9,r6,r7
1177	addc	r10,r10,r8
1178	adde	r11,r11,r9
1179	addze	r12,r12
1180	#mul_add_c(a[4],b[2],c1,c2,c3);
1181	$LD	r6,`4*$BNSZ`(r4)
1182	$LD	r7,`2*$BNSZ`(r5)
1183	$UMULL	r8,r6,r7
1184	$UMULH	r9,r6,r7
1185	addc	r10,r10,r8
1186	adde	r11,r11,r9
1187	addze	r12,r12
1188	#mul_add_c(a[3],b[3],c1,c2,c3);
1189	$LD	r6,`3*$BNSZ`(r4)
1190	$LD	r7,`3*$BNSZ`(r5)
1191	$UMULL	r8,r6,r7
1192	$UMULH	r9,r6,r7
1193	addc	r10,r10,r8
1194	adde	r11,r11,r9
1195	addze	r12,r12
1196	#mul_add_c(a[2],b[4],c1,c2,c3);
1197	$LD	r6,`2*$BNSZ`(r4)
1198	$LD	r7,`4*$BNSZ`(r5)
1199	$UMULL	r8,r6,r7
1200	$UMULH	r9,r6,r7
1201	addc	r10,r10,r8
1202	adde	r11,r11,r9
1203	addze	r12,r12
1204	#mul_add_c(a[1],b[5],c1,c2,c3);
1205	$LD	r6,`1*$BNSZ`(r4)
1206	$LD	r7,`5*$BNSZ`(r5)
1207	$UMULL	r8,r6,r7
1208	$UMULH	r9,r6,r7
1209	addc	r10,r10,r8
1210	adde	r11,r11,r9
1211	addze	r12,r12
1212	#mul_add_c(a[0],b[6],c1,c2,c3);
1213	$LD	r6,`0*$BNSZ`(r4)
1214	$LD	r7,`6*$BNSZ`(r5)
1215	$UMULL	r8,r6,r7
1216	$UMULH	r9,r6,r7
1217	addc	r10,r10,r8
1218	adde	r11,r11,r9
1219	addze	r12,r12
1220	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1;
1221	#mul_add_c(a[0],b[7],c2,c3,c1);
1222	$LD	r7,`7*$BNSZ`(r5)	# r6 still holds a[0]
1223	$UMULL	r8,r6,r7
1224	$UMULH	r9,r6,r7
1225	addc	r11,r11,r8
1226	adde	r12,r12,r9
1227	addze	r10,r0
1228	#mul_add_c(a[1],b[6],c2,c3,c1);
1229	$LD	r6,`1*$BNSZ`(r4)
1230	$LD	r7,`6*$BNSZ`(r5)
1231	$UMULL	r8,r6,r7
1232	$UMULH	r9,r6,r7
1233	addc	r11,r11,r8
1234	adde	r12,r12,r9
1235	addze	r10,r10
1236	#mul_add_c(a[2],b[5],c2,c3,c1);
1237	$LD	r6,`2*$BNSZ`(r4)
1238	$LD	r7,`5*$BNSZ`(r5)
1239	$UMULL	r8,r6,r7
1240	$UMULH	r9,r6,r7
1241	addc	r11,r11,r8
1242	adde	r12,r12,r9
1243	addze	r10,r10
1244	#mul_add_c(a[3],b[4],c2,c3,c1);
1245	$LD	r6,`3*$BNSZ`(r4)
1246	$LD	r7,`4*$BNSZ`(r5)
1247	$UMULL	r8,r6,r7
1248	$UMULH	r9,r6,r7
1249	addc	r11,r11,r8
1250	adde	r12,r12,r9
1251	addze	r10,r10
1252	#mul_add_c(a[4],b[3],c2,c3,c1);
1253	$LD	r6,`4*$BNSZ`(r4)
1254	$LD	r7,`3*$BNSZ`(r5)
1255	$UMULL	r8,r6,r7
1256	$UMULH	r9,r6,r7
1257	addc	r11,r11,r8
1258	adde	r12,r12,r9
1259	addze	r10,r10
1260	#mul_add_c(a[5],b[2],c2,c3,c1);
1261	$LD	r6,`5*$BNSZ`(r4)
1262	$LD	r7,`2*$BNSZ`(r5)
1263	$UMULL	r8,r6,r7
1264	$UMULH	r9,r6,r7
1265	addc	r11,r11,r8
1266	adde	r12,r12,r9
1267	addze	r10,r10
1268	#mul_add_c(a[6],b[1],c2,c3,c1);
1269	$LD	r6,`6*$BNSZ`(r4)
1270	$LD	r7,`1*$BNSZ`(r5)
1271	$UMULL	r8,r6,r7
1272	$UMULH	r9,r6,r7
1273	addc	r11,r11,r8
1274	adde	r12,r12,r9
1275	addze	r10,r10
1276	#mul_add_c(a[7],b[0],c2,c3,c1);
1277	$LD	r6,`7*$BNSZ`(r4)
1278	$LD	r7,`0*$BNSZ`(r5)
1279	$UMULL	r8,r6,r7
1280	$UMULH	r9,r6,r7
1281	addc	r11,r11,r8
1282	adde	r12,r12,r9
1283	addze	r10,r10
1284	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2;
1285	#mul_add_c(a[7],b[1],c3,c1,c2);
1286	$LD	r7,`1*$BNSZ`(r5)	# r6 still holds a[7]
1287	$UMULL	r8,r6,r7
1288	$UMULH	r9,r6,r7
1289	addc	r12,r12,r8
1290	adde	r10,r10,r9
1291	addze	r11,r0
1292	#mul_add_c(a[6],b[2],c3,c1,c2);
1293	$LD	r6,`6*$BNSZ`(r4)
1294	$LD	r7,`2*$BNSZ`(r5)
1295	$UMULL	r8,r6,r7
1296	$UMULH	r9,r6,r7
1297	addc	r12,r12,r8
1298	adde	r10,r10,r9
1299	addze	r11,r11
1300	#mul_add_c(a[5],b[3],c3,c1,c2);
1301	$LD	r6,`5*$BNSZ`(r4)
1302	$LD	r7,`3*$BNSZ`(r5)
1303	$UMULL	r8,r6,r7
1304	$UMULH	r9,r6,r7
1305	addc	r12,r12,r8
1306	adde	r10,r10,r9
1307	addze	r11,r11
1308	#mul_add_c(a[4],b[4],c3,c1,c2);
1309	$LD	r6,`4*$BNSZ`(r4)
1310	$LD	r7,`4*$BNSZ`(r5)
1311	$UMULL	r8,r6,r7
1312	$UMULH	r9,r6,r7
1313	addc	r12,r12,r8
1314	adde	r10,r10,r9
1315	addze	r11,r11
1316	#mul_add_c(a[3],b[5],c3,c1,c2);
1317	$LD	r6,`3*$BNSZ`(r4)
1318	$LD	r7,`5*$BNSZ`(r5)
1319	$UMULL	r8,r6,r7
1320	$UMULH	r9,r6,r7
1321	addc	r12,r12,r8
1322	adde	r10,r10,r9
1323	addze	r11,r11
1324	#mul_add_c(a[2],b[6],c3,c1,c2);
1325	$LD	r6,`2*$BNSZ`(r4)
1326	$LD	r7,`6*$BNSZ`(r5)
1327	$UMULL	r8,r6,r7
1328	$UMULH	r9,r6,r7
1329	addc	r12,r12,r8
1330	adde	r10,r10,r9
1331	addze	r11,r11
1332	#mul_add_c(a[1],b[7],c3,c1,c2);
1333	$LD	r6,`1*$BNSZ`(r4)
1334	$LD	r7,`7*$BNSZ`(r5)
1335	$UMULL	r8,r6,r7
1336	$UMULH	r9,r6,r7
1337	addc	r12,r12,r8
1338	adde	r10,r10,r9
1339	addze	r11,r11
1340	$ST	r12,`8*$BNSZ`(r3)	#r[8]=c3;
1341	#mul_add_c(a[2],b[7],c1,c2,c3);
1342	$LD	r6,`2*$BNSZ`(r4)	# r7 still holds b[7]
1343	$UMULL	r8,r6,r7
1344	$UMULH	r9,r6,r7
1345	addc	r10,r10,r8
1346	adde	r11,r11,r9
1347	addze	r12,r0
1348	#mul_add_c(a[3],b[6],c1,c2,c3);
1349	$LD	r6,`3*$BNSZ`(r4)
1350	$LD	r7,`6*$BNSZ`(r5)
1351	$UMULL	r8,r6,r7
1352	$UMULH	r9,r6,r7
1353	addc	r10,r10,r8
1354	adde	r11,r11,r9
1355	addze	r12,r12
1356	#mul_add_c(a[4],b[5],c1,c2,c3);
1357	$LD	r6,`4*$BNSZ`(r4)
1358	$LD	r7,`5*$BNSZ`(r5)
1359	$UMULL	r8,r6,r7
1360	$UMULH	r9,r6,r7
1361	addc	r10,r10,r8
1362	adde	r11,r11,r9
1363	addze	r12,r12
1364	#mul_add_c(a[5],b[4],c1,c2,c3);
1365	$LD	r6,`5*$BNSZ`(r4)
1366	$LD	r7,`4*$BNSZ`(r5)
1367	$UMULL	r8,r6,r7
1368	$UMULH	r9,r6,r7
1369	addc	r10,r10,r8
1370	adde	r11,r11,r9
1371	addze	r12,r12
1372	#mul_add_c(a[6],b[3],c1,c2,c3);
1373	$LD	r6,`6*$BNSZ`(r4)
1374	$LD	r7,`3*$BNSZ`(r5)
1375	$UMULL	r8,r6,r7
1376	$UMULH	r9,r6,r7
1377	addc	r10,r10,r8
1378	adde	r11,r11,r9
1379	addze	r12,r12
1380	#mul_add_c(a[7],b[2],c1,c2,c3);
1381	$LD	r6,`7*$BNSZ`(r4)
1382	$LD	r7,`2*$BNSZ`(r5)
1383	$UMULL	r8,r6,r7
1384	$UMULH	r9,r6,r7
1385	addc	r10,r10,r8
1386	adde	r11,r11,r9
1387	addze	r12,r12
1388	$ST	r10,`9*$BNSZ`(r3)	#r[9]=c1;
1389	#mul_add_c(a[7],b[3],c2,c3,c1);
1390	$LD	r7,`3*$BNSZ`(r5)	# r6 still holds a[7]
1391	$UMULL	r8,r6,r7
1392	$UMULH	r9,r6,r7
1393	addc	r11,r11,r8
1394	adde	r12,r12,r9
1395	addze	r10,r0
1396	#mul_add_c(a[6],b[4],c2,c3,c1);
1397	$LD	r6,`6*$BNSZ`(r4)
1398	$LD	r7,`4*$BNSZ`(r5)
1399	$UMULL	r8,r6,r7
1400	$UMULH	r9,r6,r7
1401	addc	r11,r11,r8
1402	adde	r12,r12,r9
1403	addze	r10,r10
1404	#mul_add_c(a[5],b[5],c2,c3,c1);
1405	$LD	r6,`5*$BNSZ`(r4)
1406	$LD	r7,`5*$BNSZ`(r5)
1407	$UMULL	r8,r6,r7
1408	$UMULH	r9,r6,r7
1409	addc	r11,r11,r8
1410	adde	r12,r12,r9
1411	addze	r10,r10
1412	#mul_add_c(a[4],b[6],c2,c3,c1);
1413	$LD	r6,`4*$BNSZ`(r4)
1414	$LD	r7,`6*$BNSZ`(r5)
1415	$UMULL	r8,r6,r7
1416	$UMULH	r9,r6,r7
1417	addc	r11,r11,r8
1418	adde	r12,r12,r9
1419	addze	r10,r10
1420	#mul_add_c(a[3],b[7],c2,c3,c1);
1421	$LD	r6,`3*$BNSZ`(r4)
1422	$LD	r7,`7*$BNSZ`(r5)
1423	$UMULL	r8,r6,r7
1424	$UMULH	r9,r6,r7
1425	addc	r11,r11,r8
1426	adde	r12,r12,r9
1427	addze	r10,r10
1428	$ST	r11,`10*$BNSZ`(r3)	#r[10]=c2;
1429	#mul_add_c(a[4],b[7],c3,c1,c2);
1430	$LD	r6,`4*$BNSZ`(r4)	# r7 still holds b[7]
1431	$UMULL	r8,r6,r7
1432	$UMULH	r9,r6,r7
1433	addc	r12,r12,r8
1434	adde	r10,r10,r9
1435	addze	r11,r0
1436	#mul_add_c(a[5],b[6],c3,c1,c2);
1437	$LD	r6,`5*$BNSZ`(r4)
1438	$LD	r7,`6*$BNSZ`(r5)
1439	$UMULL	r8,r6,r7
1440	$UMULH	r9,r6,r7
1441	addc	r12,r12,r8
1442	adde	r10,r10,r9
1443	addze	r11,r11
1444	#mul_add_c(a[6],b[5],c3,c1,c2);
1445	$LD	r6,`6*$BNSZ`(r4)
1446	$LD	r7,`5*$BNSZ`(r5)
1447	$UMULL	r8,r6,r7
1448	$UMULH	r9,r6,r7
1449	addc	r12,r12,r8
1450	adde	r10,r10,r9
1451	addze	r11,r11
1452	#mul_add_c(a[7],b[4],c3,c1,c2);
1453	$LD	r6,`7*$BNSZ`(r4)
1454	$LD	r7,`4*$BNSZ`(r5)
1455	$UMULL	r8,r6,r7
1456	$UMULH	r9,r6,r7
1457	addc	r12,r12,r8
1458	adde	r10,r10,r9
1459	addze	r11,r11
1460	$ST	r12,`11*$BNSZ`(r3)	#r[11]=c3;
1461	#mul_add_c(a[7],b[5],c1,c2,c3);
1462	$LD	r7,`5*$BNSZ`(r5)	# r6 still holds a[7]
1463	$UMULL	r8,r6,r7
1464	$UMULH	r9,r6,r7
1465	addc	r10,r10,r8
1466	adde	r11,r11,r9
1467	addze	r12,r0
1468	#mul_add_c(a[6],b[6],c1,c2,c3);
1469	$LD	r6,`6*$BNSZ`(r4)
1470	$LD	r7,`6*$BNSZ`(r5)
1471	$UMULL	r8,r6,r7
1472	$UMULH	r9,r6,r7
1473	addc	r10,r10,r8
1474	adde	r11,r11,r9
1475	addze	r12,r12
1476	#mul_add_c(a[5],b[7],c1,c2,c3);
1477	$LD	r6,`5*$BNSZ`(r4)
1478	$LD	r7,`7*$BNSZ`(r5)
1479	$UMULL	r8,r6,r7
1480	$UMULH	r9,r6,r7
1481	addc	r10,r10,r8
1482	adde	r11,r11,r9
1483	addze	r12,r12
1484	$ST	r10,`12*$BNSZ`(r3)	#r[12]=c1;
1485	#mul_add_c(a[6],b[7],c2,c3,c1);
1486	$LD	r6,`6*$BNSZ`(r4)	# r7 still holds b[7]
1487	$UMULL	r8,r6,r7
1488	$UMULH	r9,r6,r7
1489	addc	r11,r11,r8
1490	adde	r12,r12,r9
1491	addze	r10,r0
1492	#mul_add_c(a[7],b[6],c2,c3,c1);
1493	$LD	r6,`7*$BNSZ`(r4)
1494	$LD	r7,`6*$BNSZ`(r5)
1495	$UMULL	r8,r6,r7
1496	$UMULH	r9,r6,r7
1497	addc	r11,r11,r8
1498	adde	r12,r12,r9
1499	addze	r10,r10
1500	$ST	r11,`13*$BNSZ`(r3)	#r[13]=c2;
1501	#mul_add_c(a[7],b[7],c3,c1,c2);
1502	$LD	r7,`7*$BNSZ`(r5)	# r6 still holds a[7]
1503	$UMULL	r8,r6,r7
1504	$UMULH	r9,r6,r7
1505	addc	r12,r12,r8
1506	adde	r10,r10,r9
1507	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
1508	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
1509	blr
1510	.long	0
1511	.byte	0,12,0x14,0,0,0,3,0
1512	.long	0
1513
1514#
1515# NOTE: The following label name should be changed to
1516# "bn_sub_words" i.e. remove the first dot
1517# for the gcc compiler. This should be automatically
1518# done in the build
1519#
1520#
1521.align	4
1522.bn_sub_words:
1523#
1524#	Handcoded version of bn_sub_words
1525#
1526#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1527#	Returns the final borrow: 1 if a < b, else 0.
1528#	r3 = r
1529#	r4 = a
1530#	r5 = b
1531#	r6 = n
1532#
1533#	Note:	No loop unrolling done since this is not a performance
1534#               critical loop.
1535
1536	xor	r0,r0,r0	#set r0 = 0
1537#
1538#	check for r6 = 0 AND set carry bit.
1539#
1540	subfc.	r7,r0,r6        # If r6 is 0 then result is 0.
1541				# if r6 > 0 then result !=0
1542				# In either case carry bit is set.
1543	beq	Lppcasm_sub_adios
1544	addi	r4,r4,-$BNSZ	# pre-decrement so the update-form loads/stores below start at element 0
1545	addi	r3,r3,-$BNSZ
1546	addi	r5,r5,-$BNSZ
1547	mtctr	r6
1548Lppcasm_sub_mainloop:	
1549	$LDU	r7,$BNSZ(r4)
1550	$LDU	r8,$BNSZ(r5)
1551	subfe	r6,r8,r7	# r6 = r7+carry bit + onescomplement(r8)
1552				# if carry = 1 this is r7-r8. Else it
1553				# is r7-r8 -1 as we need.
1554	$STU	r6,$BNSZ(r3)	# store difference word, advance r
1555	bdnz-	Lppcasm_sub_mainloop
1556Lppcasm_sub_adios:	
1557	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
1558	andi.	r3,r3,1         # keep only last bit.
1559	blr
1560	.long	0
1561	.byte	0,12,0x14,0,0,0,4,0
1562	.long	0
1563
1564#
1565# NOTE: The following label name should be changed to
1566# "bn_add_words" i.e. remove the first dot
1567# for the gcc compiler. This should be automatically
1568# done in the build
1569#
1570
1571.align	4
1572.bn_add_words:
1573#
1574#	Handcoded version of bn_add_words
1575#
1576#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1577#	Returns the final carry out of the n-word addition (0 or 1).
1578#	r3 = r
1579#	r4 = a
1580#	r5 = b
1581#	r6 = n
1582#
1583#	Note:	No loop unrolling done since this is not a performance
1584#               critical loop.
1585
1586	xor	r0,r0,r0	#r0 = 0
1587#
1588#	check for r6 = 0. Is this needed?
1589#
1590	addic.	r6,r6,0		#test r6 and clear carry bit.
1591	beq	Lppcasm_add_adios
1592	addi	r4,r4,-$BNSZ	# pre-decrement so the update-form loads/stores below start at element 0
1593	addi	r3,r3,-$BNSZ
1594	addi	r5,r5,-$BNSZ
1595	mtctr	r6
1596Lppcasm_add_mainloop:	
1597	$LDU	r7,$BNSZ(r4)
1598	$LDU	r8,$BNSZ(r5)
1599	adde	r8,r7,r8	# r8 = a[i] + b[i] + carry from previous word
1600	$STU	r8,$BNSZ(r3)
1601	bdnz-	Lppcasm_add_mainloop
1602Lppcasm_add_adios:	
1603	addze	r3,r0		#return carry bit.
1604	blr
1605	.long	0
1606	.byte	0,12,0x14,0,0,0,4,0
1607	.long	0
1608
1609#
1610# NOTE: The following label name should be changed to
1611# "bn_div_words" i.e. remove the first dot
1612# for the gcc compiler. This should be automatically
1613# done in the build
1614#
1615
1616.align	4
1617.bn_div_words:
1618#
1619#	This is a cleaned up version of code generated by
1620#	the AIX compiler. The only optimization is to use
1621#	the PPC instruction to count leading zeros instead
1622#	of call to num_bits_word. Since this was compiled
1623#	only at level -O2 we can possibly squeeze it more?
1624#
1625#	r3 = h
1626#	r4 = l
1627#	r5 = d	; computes the quotient of the double-word h:l by d,
1628#	          built from two $BITS/2-bit halves over two outer-loop passes.
1629	$UCMPI	0,r5,0			# compare r5 and 0
1630	bne	Lppcasm_div1		# proceed if d!=0
1631	li	r3,-1			# d=0 return -1
1632	blr
1633Lppcasm_div1:
1634	xor	r0,r0,r0		#r0=0
1635	li	r8,$BITS
1636	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
1637	beq	Lppcasm_div2		#proceed if no leading zeros
1638	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
1639	$SHR.	r9,r3,r8		#are there any bits above r8'th?
1640	$TR	16,r9,r0		#if there're, signal to dump core...
1641Lppcasm_div2:
1642	$UCMP	0,r3,r5			#h>=d?
1643	blt	Lppcasm_div3		#goto Lppcasm_div3 if not
1644	subf	r3,r5,r3		#h-=d ; 
1645Lppcasm_div3:				#r7 = BN_BITS2-i. so r7=i
1646	cmpi	0,0,r7,0		# is (i == 0)?
1647	beq	Lppcasm_div4
1648	$SHL	r3,r3,r7		# h = (h<< i)
1649	$SHR	r8,r4,r8		# r8 = (l >> BN_BITS2 -i)
1650	$SHL	r5,r5,r7		# d<<=i
1651	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
1652	$SHL	r4,r4,r7		# l <<=i
1653Lppcasm_div4:
1654	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
1655					# dl will be computed when needed
1656					# as it saves registers.
1657	li	r6,2			#r6=2	; two passes: one per quotient half
1658	mtctr	r6			#counter will be in count.
1659Lppcasm_divouterloop: 
1660	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
1661	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
1662					# compute here for innerloop.
1663	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
1664	bne	Lppcasm_div5		# goto Lppcasm_div5 if not
1665
1666	li	r8,-1
1667	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l 
1668	b	Lppcasm_div6
1669Lppcasm_div5:
1670	$UDIV	r8,r3,r9		#q = h/dh
1671Lppcasm_div6:
1672	$UMULL	r12,r9,r8		#th = q*dh
1673	$CLRU	r10,r5,`$BITS/2`	#r10=dl
1674	$UMULL	r6,r8,r10		#tl = q*dl
1675	
1676Lppcasm_divinnerloop:			# correct q downward while the estimate is too large
1677	subf	r10,r12,r3		#t = h -th
1678	$SHRI	r7,r10,`$BITS/2`	#r7= (t &BN_MASK2H), sort of...
1679	addic.	r7,r7,0			#test if r7 == 0. used below.
1680					# now want to compute
1681					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
1682					# the following 2 instructions do that
1683	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
1684	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
1685	$UCMP	cr1,r6,r7		# compare (tl <= r7)
1686	bne	Lppcasm_divinnerexit
1687	ble	cr1,Lppcasm_divinnerexit
1688	addi	r8,r8,-1		#q--
1689	subf	r12,r9,r12		#th -=dh
1690	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
1691	subf	r6,r10,r6		#tl -=dl
1692	b	Lppcasm_divinnerloop
1693Lppcasm_divinnerexit:
1694	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
1695	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
1696	$UCMP	cr1,r4,r11		# compare l and tl
1697	add	r12,r12,r10		# th+=t
1698	bge	cr1,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
1699	addi	r12,r12,1		# th++
1700Lppcasm_div7:
1701	subf	r11,r11,r4		#r11=l-tl
1702	$UCMP	cr1,r3,r12		#compare h and th
1703	bge	cr1,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
1704	addi	r8,r8,-1		# q--
1705	add	r3,r5,r3		# h+=d
1706Lppcasm_div8:
1707	subf	r12,r12,r3		#r12 = h-th
1708	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
1709					# want to compute
1710					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
1711					# the following 2 instructions will do this.
1712	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
1713	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
1714	bdz	Lppcasm_div9		#if (count==0) break ;
1715	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4
1716	b	Lppcasm_divouterloop
1717Lppcasm_div9:
1718	or	r3,r8,r0		# r3 = (high q half from pass 1) | low q half
1719	blr
1720	.long	0
1721	.byte	0,12,0x14,0,0,0,3,0
1722	.long	0
1723
1724#
1725# NOTE: The following label name should be changed to
1726# "bn_sqr_words" i.e. remove the first dot
1727# for the gcc compiler. This should be automatically
1728# done in the build
1729#
1730.align	4
1731.bn_sqr_words:
1732#
1733#	Optimized version of bn_sqr_words
1734#
1735#	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
1736#	r receives 2*n words: the low and high halves of each a[i]^2.
1737#	r3 = r
1738#	r4 = a
1739#	r5 = n
1740#
1741#	r6 = a[i].
1742#	r7,r8 = product.
1743#
1744#	No unrolling done here. Not performance critical.
1745
1746	addic.	r5,r5,0			#test r5.
1747	beq	Lppcasm_sqr_adios
1748	addi	r4,r4,-$BNSZ		# pre-decrement for the update-form loads/stores below
1749	addi	r3,r3,-$BNSZ
1750	mtctr	r5
1751Lppcasm_sqr_mainloop:	
1752	#sqr(r[0],r[1],a[0]);
1753	$LDU	r6,$BNSZ(r4)
1754	$UMULL	r7,r6,r6
1755	$UMULH	r8,r6,r6
1756	$STU	r7,$BNSZ(r3)		# low word of a[i]^2
1757	$STU	r8,$BNSZ(r3)		# high word of a[i]^2
1758	bdnz-	Lppcasm_sqr_mainloop
1759Lppcasm_sqr_adios:	
1760	blr
1761	.long	0
1762	.byte	0,12,0x14,0,0,0,3,0
1763	.long	0
1764
1765#
1766# NOTE: The following label name should be changed to
1767# "bn_mul_words" i.e. remove the first dot
1768# for the gcc compiler. This should be automatically
1769# done in the build
1770#
1771
1772.align	4
1773.bn_mul_words:
1774#
1775# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1776# Computes rp[i] = ap[i]*w for num words; returns the final carry word in r3.
1777# r3 = rp
1778# r4 = ap
1779# r5 = num
1780# r6 = w
1781	xor	r0,r0,r0	#r0 = 0
1782	xor	r12,r12,r12	# used for carry
1783	rlwinm.	r7,r5,30,2,31	# num >> 2
1784	beq	Lppcasm_mw_REM
1785	mtctr	r7
1786Lppcasm_mw_LOOP:	
1787	#mul(rp[0],ap[0],w,c1);
1788	$LD	r8,`0*$BNSZ`(r4)
1789	$UMULL	r9,r6,r8
1790	$UMULH  r10,r6,r8
1791	addc	r9,r9,r12
1792	#addze	r10,r10		#carry is NOT ignored.
1793				#will be taken care of
1794				#in second spin below
1795				#using adde.
1796	$ST	r9,`0*$BNSZ`(r3)
1797	#mul(rp[1],ap[1],w,c1);
1798	$LD	r8,`1*$BNSZ`(r4)	
1799	$UMULL	r11,r6,r8
1800	$UMULH  r12,r6,r8
1801	adde	r11,r11,r10
1802	#addze	r12,r12
1803	$ST	r11,`1*$BNSZ`(r3)
1804	#mul(rp[2],ap[2],w,c1);
1805	$LD	r8,`2*$BNSZ`(r4)
1806	$UMULL	r9,r6,r8
1807	$UMULH	r10,r6,r8
1808	adde	r9,r9,r12
1809	#addze	r10,r10
1810	$ST	r9,`2*$BNSZ`(r3)
1811	#mul_add(rp[3],ap[3],w,c1);
1812	$LD	r8,`3*$BNSZ`(r4)
1813	$UMULL	r11,r6,r8
1814	$UMULH	r12,r6,r8
1815	adde	r11,r11,r10
1816	addze	r12,r12		#this spin we collect carry into
1817				#r12
1818	$ST	r11,`3*$BNSZ`(r3)
1819	
1820	addi	r3,r3,`4*$BNSZ`
1821	addi	r4,r4,`4*$BNSZ`
1822	bdnz-	Lppcasm_mw_LOOP
1823
1824Lppcasm_mw_REM:			# handle the remaining num % 4 words, one at a time
1825	andi.	r5,r5,0x3
1826	beq	Lppcasm_mw_OVER
1827	#mul(rp[0],ap[0],w,c1);
1828	$LD	r8,`0*$BNSZ`(r4)
1829	$UMULL	r9,r6,r8
1830	$UMULH	r10,r6,r8
1831	addc	r9,r9,r12
1832	addze	r10,r10
1833	$ST	r9,`0*$BNSZ`(r3)
1834	addi	r12,r10,0	# carry for the next word
1835	
1836	addi	r5,r5,-1
1837	cmpli	0,0,r5,0
1838	beq	Lppcasm_mw_OVER
1839	
1840	
1841	#mul(rp[1],ap[1],w,c1);
1842	$LD	r8,`1*$BNSZ`(r4)	
1843	$UMULL	r9,r6,r8
1844	$UMULH	r10,r6,r8
1845	addc	r9,r9,r12
1846	addze	r10,r10
1847	$ST	r9,`1*$BNSZ`(r3)
1848	addi	r12,r10,0	# carry for the next word
1849	
1850	addi	r5,r5,-1
1851	cmpli	0,0,r5,0
1852	beq	Lppcasm_mw_OVER
1853	
1854	#mul_add(rp[2],ap[2],w,c1);
1855	$LD	r8,`2*$BNSZ`(r4)
1856	$UMULL	r9,r6,r8
1857	$UMULH	r10,r6,r8
1858	addc	r9,r9,r12
1859	addze	r10,r10
1860	$ST	r9,`2*$BNSZ`(r3)
1861	addi	r12,r10,0
1862		
1863Lppcasm_mw_OVER:	
1864	addi	r3,r12,0	# return the accumulated carry word
1865	blr
1866	.long	0
1867	.byte	0,12,0x14,0,0,0,4,0
1868	.long	0
1869
1870#
1871# NOTE: The following label name should be changed to
1872# "bn_mul_add_words" i.e. remove the first dot
1873# for the gcc compiler. This should be automatically
1874# done in the build
1875#
1876
1877.align	4
1878.bn_mul_add_words:
1879#
1880# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1881# Computes rp[i] += ap[i]*w for num words; returns the final carry word in r3.
1882# r3 = rp
1883# r4 = ap
1884# r5 = num
1885# r6 = w
1886#
1887# empirical evidence suggests that unrolled version performs best!!
1888#
1889	xor	r0,r0,r0		#r0 = 0
1890	xor	r12,r12,r12  		#r12 = 0 . used for carry		
1891	rlwinm.	r7,r5,30,2,31		# num >> 2
1892	beq	Lppcasm_maw_leftover	# if (num < 4) go LPPCASM_maw_leftover
1893	mtctr	r7
1894Lppcasm_maw_mainloop:	
1895	#mul_add(rp[0],ap[0],w,c1);
1896	$LD	r8,`0*$BNSZ`(r4)
1897	$LD	r11,`0*$BNSZ`(r3)
1898	$UMULL	r9,r6,r8
1899	$UMULH	r10,r6,r8
1900	addc	r9,r9,r12	#r12 is carry.
1901	addze	r10,r10
1902	addc	r9,r9,r11
1903	#addze	r10,r10
1904				#the above instruction addze
1905				#is NOT needed. Carry will NOT
1906				#be ignored. It's not affected
1907				#by multiply and will be collected
1908				#in the next spin
1909	$ST	r9,`0*$BNSZ`(r3)
1910	
1911	#mul_add(rp[1],ap[1],w,c1);
1912	$LD	r8,`1*$BNSZ`(r4)	
1913	$LD	r9,`1*$BNSZ`(r3)
1914	$UMULL	r11,r6,r8
1915	$UMULH	r12,r6,r8
1916	adde	r11,r11,r10	#r10 is carry.
1917	addze	r12,r12
1918	addc	r11,r11,r9
1919	#addze	r12,r12
1920	$ST	r11,`1*$BNSZ`(r3)
1921	
1922	#mul_add(rp[2],ap[2],w,c1);
1923	$LD	r8,`2*$BNSZ`(r4)
1924	$UMULL	r9,r6,r8
1925	$LD	r11,`2*$BNSZ`(r3)
1926	$UMULH	r10,r6,r8
1927	adde	r9,r9,r12
1928	addze	r10,r10
1929	addc	r9,r9,r11
1930	#addze	r10,r10
1931	$ST	r9,`2*$BNSZ`(r3)
1932	
1933	#mul_add(rp[3],ap[3],w,c1);
1934	$LD	r8,`3*$BNSZ`(r4)
1935	$UMULL	r11,r6,r8
1936	$LD	r9,`3*$BNSZ`(r3)
1937	$UMULH	r12,r6,r8
1938	adde	r11,r11,r10
1939	addze	r12,r12
1940	addc	r11,r11,r9
1941	addze	r12,r12
1942	$ST	r11,`3*$BNSZ`(r3)
1943	addi	r3,r3,`4*$BNSZ`
1944	addi	r4,r4,`4*$BNSZ`
1945	bdnz-	Lppcasm_maw_mainloop
1946	
1947Lppcasm_maw_leftover:		# handle the remaining num % 4 words, one at a time
1948	andi.	r5,r5,0x3
1949	beq	Lppcasm_maw_adios
1950	addi	r3,r3,-$BNSZ	# pre-decrement for the update-form loads below
1951	addi	r4,r4,-$BNSZ
1952	#mul_add(rp[0],ap[0],w,c1);
1953	mtctr	r5
1954	$LDU	r8,$BNSZ(r4)
1955	$UMULL	r9,r6,r8
1956	$UMULH	r10,r6,r8
1957	$LDU	r11,$BNSZ(r3)
1958	addc	r9,r9,r11
1959	addze	r10,r10
1960	addc	r9,r9,r12
1961	addze	r12,r10		# fold both carries into r12 for the next word
1962	$ST	r9,0(r3)
1963	
1964	bdz	Lppcasm_maw_adios
1965	#mul_add(rp[1],ap[1],w,c1);
1966	$LDU	r8,$BNSZ(r4)	
1967	$UMULL	r9,r6,r8
1968	$UMULH	r10,r6,r8
1969	$LDU	r11,$BNSZ(r3)
1970	addc	r9,r9,r11
1971	addze	r10,r10
1972	addc	r9,r9,r12
1973	addze	r12,r10
1974	$ST	r9,0(r3)
1975	
1976	bdz	Lppcasm_maw_adios
1977	#mul_add(rp[2],ap[2],w,c1);
1978	$LDU	r8,$BNSZ(r4)
1979	$UMULL	r9,r6,r8
1980	$UMULH	r10,r6,r8
1981	$LDU	r11,$BNSZ(r3)
1982	addc	r9,r9,r11
1983	addze	r10,r10
1984	addc	r9,r9,r12
1985	addze	r12,r10
1986	$ST	r9,0(r3)
1987		
1988Lppcasm_maw_adios:	
1989	addi	r3,r12,0	# return the accumulated carry word
1990	blr
1991	.long	0
1992	.byte	0,12,0x14,0,0,0,4,0
1993	.long	0
1994 .align 4
1995EOF
1996$data =~ s/\`([^\`]*)\`/eval $1/gem;	# evaluate `...` constant expressions (offsets, shift counts) at generation time
1997print $data;				# emit the generated assembly to stdout
1998close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/ppc64-mont.pl b/src/lib/libcrypto/bn/asm/ppc64-mont.pl
deleted file mode 100644
index a14e769ad0..0000000000
--- a/src/lib/libcrypto/bn/asm/ppc64-mont.pl
+++ /dev/null
@@ -1,1088 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# December 2007
11
12# The reason for undertaken effort is basically following. Even though
13# Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI
14# performance was observed to be less than impressive, essentially as
15# fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
16# Well, it's not surprising that IBM had to make some sacrifices to
17# boost the clock frequency that much, but no overall improvement?
18# Having observed how much difference did switching to FPU make on
19# UltraSPARC, playing same stunt on Power 6 appeared appropriate...
20# Unfortunately the resulting performance improvement is not as
21# impressive, ~30%, and in absolute terms is still very far from what
22# one would expect from 4.7GHz CPU. There is a chance that I'm doing
23# something wrong, but in the lack of assembler level micro-profiling
24# data or at least decent platform guide I can't tell... Or better
25# results might be achieved with VMX... Anyway, this module provides
26# *worse* performance on other PowerPC implementations, ~40-15% slower
27# on PPC970 depending on key length and ~40% slower on Power 5 for all
28# key lengths. As it's obviously inappropriate as "best all-round"
29# alternative, it has to be complemented with run-time CPU family
30# detection. Oh! It should also be noted that unlike other PowerPC
31# implementation IALU ppc-mont.pl module performs *suboptimaly* on
32# >=1024-bit key lengths on Power 6. It should also be noted that
33# *everything* said so far applies to 64-bit builds! As far as 32-bit
34# application executed on 64-bit CPU goes, this module is likely to
35# become preferred choice, because it's easy to adapt it for such
36# case and *is* faster than 32-bit ppc-mont.pl on *all* processors.
37
38# February 2008
39
40# Micro-profiling assisted optimization results in ~15% improvement
41# over original ppc64-mont.pl version, or overall ~50% improvement
42# over ppc.pl module on Power 6. If compared to ppc-mont.pl on same
43# Power 6 CPU, this module is 5-150% faster depending on key length,
44# [hereafter] more for longer keys. But if compared to ppc-mont.pl
45# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
46# in absolute terms, but it's apparently the way Power 6 is...
47
48# December 2009
49
50# Adapted for 32-bit build this module delivers 25-120%, yes, more
51# than *twice* for longer keys, performance improvement over 32-bit
52# ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes
53# even 64-bit integer operations and the trouble is that most PPC
54# operating systems don't preserve upper halves of general purpose
55# registers upon 32-bit signal delivery. They do preserve them upon
56# context switch, but not signalling:-( This means that asynchronous
57# signals have to be blocked upon entry to this subroutine. Signal
58# masking (and of course complementary unmasking) has quite an impact
59# on performance, naturally larger for shorter keys. It's so severe
60# that 512-bit key performance can be as low as 1/3 of expected one.
61# This is why this routine can be engaged for longer key operations
62# only on these OSes, see crypto/ppccap.c for further details. MacOS X
63# is an exception from this and doesn't require signal masking, and
64# that's where above improvement coefficients were collected. For
65# others alternative would be to break dependence on upper halves of
66# GPRs by sticking to 32-bit integer operations...
67
68$flavour = shift;
69
70if ($flavour =~ /32/) {
71 $SIZE_T=4;
72 $RZONE= 224;
73 $fname= "bn_mul_mont_fpu64";
74
75 $STUX= "stwux"; # store indexed and update
76 $PUSH= "stw";
77 $POP= "lwz";
78} elsif ($flavour =~ /64/) {
79 $SIZE_T=8;
80 $RZONE= 288;
81 $fname= "bn_mul_mont_fpu64";
82
83 # same as above, but 64-bit mnemonics...
84 $STUX= "stdux"; # store indexed and update
85 $PUSH= "std";
86 $POP= "ld";
87} else { die "nonsense $flavour"; }
88
89$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
90( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
91( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
92die "can't locate ppc-xlate.pl";
93
94open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
95
96$FRAME=64; # padded frame header
97$TRANSFER=16*8;
98
99$carry="r0";
100$sp="r1";
101$toc="r2";
102$rp="r3"; $ovf="r3";
103$ap="r4";
104$bp="r5";
105$np="r6";
106$n0="r7";
107$num="r8";
108$rp="r9"; # $rp is reassigned
109$tp="r10";
110$j="r11";
111$i="r12";
112# non-volatile registers
113$nap_d="r22"; # interleaved ap and np in double format
114$a0="r23"; # ap[0]
115$t0="r24"; # temporary registers
116$t1="r25";
117$t2="r26";
118$t3="r27";
119$t4="r28";
120$t5="r29";
121$t6="r30";
122$t7="r31";
123
124# PPC offers enough register bank capacity to unroll inner loops twice
125#
126# ..A3A2A1A0
127# dcba
128# -----------
129# A0a
130# A0b
131# A0c
132# A0d
133# A1a
134# A1b
135# A1c
136# A1d
137# A2a
138# A2b
139# A2c
140# A2d
141# A3a
142# A3b
143# A3c
144# A3d
145# ..a
146# ..b
147#
148$ba="f0"; $bb="f1"; $bc="f2"; $bd="f3";
149$na="f4"; $nb="f5"; $nc="f6"; $nd="f7";
150$dota="f8"; $dotb="f9";
151$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13";
152$N0="f20"; $N1="f21"; $N2="f22"; $N3="f23";
153$T0a="f24"; $T0b="f25";
154$T1a="f26"; $T1b="f27";
155$T2a="f28"; $T2b="f29";
156$T3a="f30"; $T3b="f31";
157
158# sp----------->+-------------------------------+
159# | saved sp |
160# +-------------------------------+
161# . .
162# +64 +-------------------------------+
163# | 16 gpr<->fpr transfer zone |
164# . .
165# . .
166# +16*8 +-------------------------------+
167# | __int64 tmp[-1] |
168# +-------------------------------+
169# | __int64 tmp[num] |
170# . .
171# . .
172# . .
173# +(num+1)*8 +-------------------------------+
174# | padding to 64 byte boundary |
175# . .
176# +X +-------------------------------+
177# | double nap_d[4*num] |
178# . .
179# . .
180# . .
181# +-------------------------------+
182# . .
183# -12*size_t +-------------------------------+
184# | 10 saved gpr, r22-r31 |
185# . .
186# . .
187# -12*8 +-------------------------------+
188# | 12 saved fpr, f20-f31 |
189# . .
190# . .
191# +-------------------------------+
192
193$code=<<___;
194.machine "any"
195.text
196
197.globl .$fname
198.align 5
199.$fname:
200 cmpwi $num,`3*8/$SIZE_T`
201 mr $rp,r3 ; $rp is reassigned
202 li r3,0 ; possible "not handled" return code
203 bltlr-
204 andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even"
205 bnelr-
206
207 slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG)
208 li $i,-4096
209 slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
210 add $tp,$tp,$num ; place for tp[num+1]
211 addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
212 subf $tp,$tp,$sp ; $sp-$tp
213 and $tp,$tp,$i ; minimize TLB usage
214 subf $tp,$sp,$tp ; $tp-$sp
215 mr $i,$sp
216 $STUX $sp,$sp,$tp ; alloca
217
218 $PUSH r22,`-12*8-10*$SIZE_T`($i)
219 $PUSH r23,`-12*8-9*$SIZE_T`($i)
220 $PUSH r24,`-12*8-8*$SIZE_T`($i)
221 $PUSH r25,`-12*8-7*$SIZE_T`($i)
222 $PUSH r26,`-12*8-6*$SIZE_T`($i)
223 $PUSH r27,`-12*8-5*$SIZE_T`($i)
224 $PUSH r28,`-12*8-4*$SIZE_T`($i)
225 $PUSH r29,`-12*8-3*$SIZE_T`($i)
226 $PUSH r30,`-12*8-2*$SIZE_T`($i)
227 $PUSH r31,`-12*8-1*$SIZE_T`($i)
228 stfd f20,`-12*8`($i)
229 stfd f21,`-11*8`($i)
230 stfd f22,`-10*8`($i)
231 stfd f23,`-9*8`($i)
232 stfd f24,`-8*8`($i)
233 stfd f25,`-7*8`($i)
234 stfd f26,`-6*8`($i)
235 stfd f27,`-5*8`($i)
236 stfd f28,`-4*8`($i)
237 stfd f29,`-3*8`($i)
238 stfd f30,`-2*8`($i)
239 stfd f31,`-1*8`($i)
240___
241$code.=<<___ if ($SIZE_T==8);
242 ld $a0,0($ap) ; pull ap[0] value
243 ld $n0,0($n0) ; pull n0[0] value
244 ld $t3,0($bp) ; bp[0]
245___
246$code.=<<___ if ($SIZE_T==4);
247 mr $t1,$n0
248 lwz $a0,0($ap) ; pull ap[0,1] value
249 lwz $t0,4($ap)
250 lwz $n0,0($t1) ; pull n0[0,1] value
251 lwz $t1,4($t1)
252 lwz $t3,0($bp) ; bp[0,1]
253 lwz $t2,4($bp)
254 insrdi $a0,$t0,32,0
255 insrdi $n0,$t1,32,0
256 insrdi $t3,$t2,32,0
257___
258$code.=<<___;
259 addi $tp,$sp,`$FRAME+$TRANSFER+8+64`
260 li $i,-64
261 add $nap_d,$tp,$num
262 and $nap_d,$nap_d,$i ; align to 64 bytes
263
264 mulld $t7,$a0,$t3 ; ap[0]*bp[0]
265 ; nap_d is off by 1, because it's used with stfdu/lfdu
266 addi $nap_d,$nap_d,-8
267 srwi $j,$num,`3+1` ; counter register, num/2
268 mulld $t7,$t7,$n0 ; tp[0]*n0
269 addi $j,$j,-1
270 addi $tp,$sp,`$FRAME+$TRANSFER-8`
271 li $carry,0
272 mtctr $j
273
274 ; transfer bp[0] to FPU as 4x16-bit values
275 extrdi $t0,$t3,16,48
276 extrdi $t1,$t3,16,32
277 extrdi $t2,$t3,16,16
278 extrdi $t3,$t3,16,0
279 std $t0,`$FRAME+0`($sp)
280 std $t1,`$FRAME+8`($sp)
281 std $t2,`$FRAME+16`($sp)
282 std $t3,`$FRAME+24`($sp)
283 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
284 extrdi $t4,$t7,16,48
285 extrdi $t5,$t7,16,32
286 extrdi $t6,$t7,16,16
287 extrdi $t7,$t7,16,0
288 std $t4,`$FRAME+32`($sp)
289 std $t5,`$FRAME+40`($sp)
290 std $t6,`$FRAME+48`($sp)
291 std $t7,`$FRAME+56`($sp)
292___
293$code.=<<___ if ($SIZE_T==8);
294 lwz $t0,4($ap) ; load a[j] as 32-bit word pair
295 lwz $t1,0($ap)
296 lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
297 lwz $t3,8($ap)
298 lwz $t4,4($np) ; load n[j] as 32-bit word pair
299 lwz $t5,0($np)
300 lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
301 lwz $t7,8($np)
302___
303$code.=<<___ if ($SIZE_T==4);
304 lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
305 lwz $t1,4($ap)
306 lwz $t2,8($ap)
307 lwz $t3,12($ap)
308 lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
309 lwz $t5,4($np)
310 lwz $t6,8($np)
311 lwz $t7,12($np)
312___
313$code.=<<___;
314 lfd $ba,`$FRAME+0`($sp)
315 lfd $bb,`$FRAME+8`($sp)
316 lfd $bc,`$FRAME+16`($sp)
317 lfd $bd,`$FRAME+24`($sp)
318 lfd $na,`$FRAME+32`($sp)
319 lfd $nb,`$FRAME+40`($sp)
320 lfd $nc,`$FRAME+48`($sp)
321 lfd $nd,`$FRAME+56`($sp)
322 std $t0,`$FRAME+64`($sp)
323 std $t1,`$FRAME+72`($sp)
324 std $t2,`$FRAME+80`($sp)
325 std $t3,`$FRAME+88`($sp)
326 std $t4,`$FRAME+96`($sp)
327 std $t5,`$FRAME+104`($sp)
328 std $t6,`$FRAME+112`($sp)
329 std $t7,`$FRAME+120`($sp)
330 fcfid $ba,$ba
331 fcfid $bb,$bb
332 fcfid $bc,$bc
333 fcfid $bd,$bd
334 fcfid $na,$na
335 fcfid $nb,$nb
336 fcfid $nc,$nc
337 fcfid $nd,$nd
338
339 lfd $A0,`$FRAME+64`($sp)
340 lfd $A1,`$FRAME+72`($sp)
341 lfd $A2,`$FRAME+80`($sp)
342 lfd $A3,`$FRAME+88`($sp)
343 lfd $N0,`$FRAME+96`($sp)
344 lfd $N1,`$FRAME+104`($sp)
345 lfd $N2,`$FRAME+112`($sp)
346 lfd $N3,`$FRAME+120`($sp)
347 fcfid $A0,$A0
348 fcfid $A1,$A1
349 fcfid $A2,$A2
350 fcfid $A3,$A3
351 fcfid $N0,$N0
352 fcfid $N1,$N1
353 fcfid $N2,$N2
354 fcfid $N3,$N3
355 addi $ap,$ap,16
356 addi $np,$np,16
357
358 fmul $T1a,$A1,$ba
359 fmul $T1b,$A1,$bb
360 stfd $A0,8($nap_d) ; save a[j] in double format
361 stfd $A1,16($nap_d)
362 fmul $T2a,$A2,$ba
363 fmul $T2b,$A2,$bb
364 stfd $A2,24($nap_d) ; save a[j+1] in double format
365 stfd $A3,32($nap_d)
366 fmul $T3a,$A3,$ba
367 fmul $T3b,$A3,$bb
368 stfd $N0,40($nap_d) ; save n[j] in double format
369 stfd $N1,48($nap_d)
370 fmul $T0a,$A0,$ba
371 fmul $T0b,$A0,$bb
372 stfd $N2,56($nap_d) ; save n[j+1] in double format
373 stfdu $N3,64($nap_d)
374
375 fmadd $T1a,$A0,$bc,$T1a
376 fmadd $T1b,$A0,$bd,$T1b
377 fmadd $T2a,$A1,$bc,$T2a
378 fmadd $T2b,$A1,$bd,$T2b
379 fmadd $T3a,$A2,$bc,$T3a
380 fmadd $T3b,$A2,$bd,$T3b
381 fmul $dota,$A3,$bc
382 fmul $dotb,$A3,$bd
383
384 fmadd $T1a,$N1,$na,$T1a
385 fmadd $T1b,$N1,$nb,$T1b
386 fmadd $T2a,$N2,$na,$T2a
387 fmadd $T2b,$N2,$nb,$T2b
388 fmadd $T3a,$N3,$na,$T3a
389 fmadd $T3b,$N3,$nb,$T3b
390 fmadd $T0a,$N0,$na,$T0a
391 fmadd $T0b,$N0,$nb,$T0b
392
393 fmadd $T1a,$N0,$nc,$T1a
394 fmadd $T1b,$N0,$nd,$T1b
395 fmadd $T2a,$N1,$nc,$T2a
396 fmadd $T2b,$N1,$nd,$T2b
397 fmadd $T3a,$N2,$nc,$T3a
398 fmadd $T3b,$N2,$nd,$T3b
399 fmadd $dota,$N3,$nc,$dota
400 fmadd $dotb,$N3,$nd,$dotb
401
402 fctid $T0a,$T0a
403 fctid $T0b,$T0b
404 fctid $T1a,$T1a
405 fctid $T1b,$T1b
406 fctid $T2a,$T2a
407 fctid $T2b,$T2b
408 fctid $T3a,$T3a
409 fctid $T3b,$T3b
410
411 stfd $T0a,`$FRAME+0`($sp)
412 stfd $T0b,`$FRAME+8`($sp)
413 stfd $T1a,`$FRAME+16`($sp)
414 stfd $T1b,`$FRAME+24`($sp)
415 stfd $T2a,`$FRAME+32`($sp)
416 stfd $T2b,`$FRAME+40`($sp)
417 stfd $T3a,`$FRAME+48`($sp)
418 stfd $T3b,`$FRAME+56`($sp)
419
420.align 5
421L1st:
422___
423$code.=<<___ if ($SIZE_T==8);
424 lwz $t0,4($ap) ; load a[j] as 32-bit word pair
425 lwz $t1,0($ap)
426 lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
427 lwz $t3,8($ap)
428 lwz $t4,4($np) ; load n[j] as 32-bit word pair
429 lwz $t5,0($np)
430 lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
431 lwz $t7,8($np)
432___
433$code.=<<___ if ($SIZE_T==4);
434 lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
435 lwz $t1,4($ap)
436 lwz $t2,8($ap)
437 lwz $t3,12($ap)
438 lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
439 lwz $t5,4($np)
440 lwz $t6,8($np)
441 lwz $t7,12($np)
442___
443$code.=<<___;
444 std $t0,`$FRAME+64`($sp)
445 std $t1,`$FRAME+72`($sp)
446 std $t2,`$FRAME+80`($sp)
447 std $t3,`$FRAME+88`($sp)
448 std $t4,`$FRAME+96`($sp)
449 std $t5,`$FRAME+104`($sp)
450 std $t6,`$FRAME+112`($sp)
451 std $t7,`$FRAME+120`($sp)
452 ld $t0,`$FRAME+0`($sp)
453 ld $t1,`$FRAME+8`($sp)
454 ld $t2,`$FRAME+16`($sp)
455 ld $t3,`$FRAME+24`($sp)
456 ld $t4,`$FRAME+32`($sp)
457 ld $t5,`$FRAME+40`($sp)
458 ld $t6,`$FRAME+48`($sp)
459 ld $t7,`$FRAME+56`($sp)
460 lfd $A0,`$FRAME+64`($sp)
461 lfd $A1,`$FRAME+72`($sp)
462 lfd $A2,`$FRAME+80`($sp)
463 lfd $A3,`$FRAME+88`($sp)
464 lfd $N0,`$FRAME+96`($sp)
465 lfd $N1,`$FRAME+104`($sp)
466 lfd $N2,`$FRAME+112`($sp)
467 lfd $N3,`$FRAME+120`($sp)
468 fcfid $A0,$A0
469 fcfid $A1,$A1
470 fcfid $A2,$A2
471 fcfid $A3,$A3
472 fcfid $N0,$N0
473 fcfid $N1,$N1
474 fcfid $N2,$N2
475 fcfid $N3,$N3
476 addi $ap,$ap,16
477 addi $np,$np,16
478
479 fmul $T1a,$A1,$ba
480 fmul $T1b,$A1,$bb
481 fmul $T2a,$A2,$ba
482 fmul $T2b,$A2,$bb
483 stfd $A0,8($nap_d) ; save a[j] in double format
484 stfd $A1,16($nap_d)
485 fmul $T3a,$A3,$ba
486 fmul $T3b,$A3,$bb
487 fmadd $T0a,$A0,$ba,$dota
488 fmadd $T0b,$A0,$bb,$dotb
489 stfd $A2,24($nap_d) ; save a[j+1] in double format
490 stfd $A3,32($nap_d)
491
492 fmadd $T1a,$A0,$bc,$T1a
493 fmadd $T1b,$A0,$bd,$T1b
494 fmadd $T2a,$A1,$bc,$T2a
495 fmadd $T2b,$A1,$bd,$T2b
496 stfd $N0,40($nap_d) ; save n[j] in double format
497 stfd $N1,48($nap_d)
498 fmadd $T3a,$A2,$bc,$T3a
499 fmadd $T3b,$A2,$bd,$T3b
500 add $t0,$t0,$carry ; can not overflow
501 fmul $dota,$A3,$bc
502 fmul $dotb,$A3,$bd
503 stfd $N2,56($nap_d) ; save n[j+1] in double format
504 stfdu $N3,64($nap_d)
505 srdi $carry,$t0,16
506 add $t1,$t1,$carry
507 srdi $carry,$t1,16
508
509 fmadd $T1a,$N1,$na,$T1a
510 fmadd $T1b,$N1,$nb,$T1b
511 insrdi $t0,$t1,16,32
512 fmadd $T2a,$N2,$na,$T2a
513 fmadd $T2b,$N2,$nb,$T2b
514 add $t2,$t2,$carry
515 fmadd $T3a,$N3,$na,$T3a
516 fmadd $T3b,$N3,$nb,$T3b
517 srdi $carry,$t2,16
518 fmadd $T0a,$N0,$na,$T0a
519 fmadd $T0b,$N0,$nb,$T0b
520 insrdi $t0,$t2,16,16
521 add $t3,$t3,$carry
522 srdi $carry,$t3,16
523
524 fmadd $T1a,$N0,$nc,$T1a
525 fmadd $T1b,$N0,$nd,$T1b
526 insrdi $t0,$t3,16,0 ; 0..63 bits
527 fmadd $T2a,$N1,$nc,$T2a
528 fmadd $T2b,$N1,$nd,$T2b
529 add $t4,$t4,$carry
530 fmadd $T3a,$N2,$nc,$T3a
531 fmadd $T3b,$N2,$nd,$T3b
532 srdi $carry,$t4,16
533 fmadd $dota,$N3,$nc,$dota
534 fmadd $dotb,$N3,$nd,$dotb
535 add $t5,$t5,$carry
536 srdi $carry,$t5,16
537 insrdi $t4,$t5,16,32
538
539 fctid $T0a,$T0a
540 fctid $T0b,$T0b
541 add $t6,$t6,$carry
542 fctid $T1a,$T1a
543 fctid $T1b,$T1b
544 srdi $carry,$t6,16
545 fctid $T2a,$T2a
546 fctid $T2b,$T2b
547 insrdi $t4,$t6,16,16
548 fctid $T3a,$T3a
549 fctid $T3b,$T3b
550 add $t7,$t7,$carry
551 insrdi $t4,$t7,16,0 ; 64..127 bits
552 srdi $carry,$t7,16 ; upper 33 bits
553
554 stfd $T0a,`$FRAME+0`($sp)
555 stfd $T0b,`$FRAME+8`($sp)
556 stfd $T1a,`$FRAME+16`($sp)
557 stfd $T1b,`$FRAME+24`($sp)
558 stfd $T2a,`$FRAME+32`($sp)
559 stfd $T2b,`$FRAME+40`($sp)
560 stfd $T3a,`$FRAME+48`($sp)
561 stfd $T3b,`$FRAME+56`($sp)
562 std $t0,8($tp) ; tp[j-1]
563 stdu $t4,16($tp) ; tp[j]
564 bdnz- L1st
565
566 fctid $dota,$dota
567 fctid $dotb,$dotb
568
569 ld $t0,`$FRAME+0`($sp)
570 ld $t1,`$FRAME+8`($sp)
571 ld $t2,`$FRAME+16`($sp)
572 ld $t3,`$FRAME+24`($sp)
573 ld $t4,`$FRAME+32`($sp)
574 ld $t5,`$FRAME+40`($sp)
575 ld $t6,`$FRAME+48`($sp)
576 ld $t7,`$FRAME+56`($sp)
577 stfd $dota,`$FRAME+64`($sp)
578 stfd $dotb,`$FRAME+72`($sp)
579
580 add $t0,$t0,$carry ; can not overflow
581 srdi $carry,$t0,16
582 add $t1,$t1,$carry
583 srdi $carry,$t1,16
584 insrdi $t0,$t1,16,32
585 add $t2,$t2,$carry
586 srdi $carry,$t2,16
587 insrdi $t0,$t2,16,16
588 add $t3,$t3,$carry
589 srdi $carry,$t3,16
590 insrdi $t0,$t3,16,0 ; 0..63 bits
591 add $t4,$t4,$carry
592 srdi $carry,$t4,16
593 add $t5,$t5,$carry
594 srdi $carry,$t5,16
595 insrdi $t4,$t5,16,32
596 add $t6,$t6,$carry
597 srdi $carry,$t6,16
598 insrdi $t4,$t6,16,16
599 add $t7,$t7,$carry
600 insrdi $t4,$t7,16,0 ; 64..127 bits
601 srdi $carry,$t7,16 ; upper 33 bits
602 ld $t6,`$FRAME+64`($sp)
603 ld $t7,`$FRAME+72`($sp)
604
605 std $t0,8($tp) ; tp[j-1]
606 stdu $t4,16($tp) ; tp[j]
607
608 add $t6,$t6,$carry ; can not overflow
609 srdi $carry,$t6,16
610 add $t7,$t7,$carry
611 insrdi $t6,$t7,48,0
612 srdi $ovf,$t7,48
613 std $t6,8($tp) ; tp[num-1]
614
615 slwi $t7,$num,2
616 subf $nap_d,$t7,$nap_d ; rewind pointer
617
618 li $i,8 ; i=1
619.align 5
620Louter:
621___
622$code.=<<___ if ($SIZE_T==8);
623 ldx $t3,$bp,$i ; bp[i]
624___
625$code.=<<___ if ($SIZE_T==4);
626 add $t0,$bp,$i
627 lwz $t3,0($t0) ; bp[i,i+1]
628 lwz $t0,4($t0)
629 insrdi $t3,$t0,32,0
630___
631$code.=<<___;
632 ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
633 mulld $t7,$a0,$t3 ; ap[0]*bp[i]
634
635 addi $tp,$sp,`$FRAME+$TRANSFER`
636 add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0]
637 li $carry,0
638 mulld $t7,$t7,$n0 ; tp[0]*n0
639 mtctr $j
640
641 ; transfer bp[i] to FPU as 4x16-bit values
642 extrdi $t0,$t3,16,48
643 extrdi $t1,$t3,16,32
644 extrdi $t2,$t3,16,16
645 extrdi $t3,$t3,16,0
646 std $t0,`$FRAME+0`($sp)
647 std $t1,`$FRAME+8`($sp)
648 std $t2,`$FRAME+16`($sp)
649 std $t3,`$FRAME+24`($sp)
650 ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
651 extrdi $t4,$t7,16,48
652 extrdi $t5,$t7,16,32
653 extrdi $t6,$t7,16,16
654 extrdi $t7,$t7,16,0
655 std $t4,`$FRAME+32`($sp)
656 std $t5,`$FRAME+40`($sp)
657 std $t6,`$FRAME+48`($sp)
658 std $t7,`$FRAME+56`($sp)
659
660 lfd $A0,8($nap_d) ; load a[j] in double format
661 lfd $A1,16($nap_d)
662 lfd $A2,24($nap_d) ; load a[j+1] in double format
663 lfd $A3,32($nap_d)
664 lfd $N0,40($nap_d) ; load n[j] in double format
665 lfd $N1,48($nap_d)
666 lfd $N2,56($nap_d) ; load n[j+1] in double format
667 lfdu $N3,64($nap_d)
668
669 lfd $ba,`$FRAME+0`($sp)
670 lfd $bb,`$FRAME+8`($sp)
671 lfd $bc,`$FRAME+16`($sp)
672 lfd $bd,`$FRAME+24`($sp)
673 lfd $na,`$FRAME+32`($sp)
674 lfd $nb,`$FRAME+40`($sp)
675 lfd $nc,`$FRAME+48`($sp)
676 lfd $nd,`$FRAME+56`($sp)
677
678 fcfid $ba,$ba
679 fcfid $bb,$bb
680 fcfid $bc,$bc
681 fcfid $bd,$bd
682 fcfid $na,$na
683 fcfid $nb,$nb
684 fcfid $nc,$nc
685 fcfid $nd,$nd
686
687 fmul $T1a,$A1,$ba
688 fmul $T1b,$A1,$bb
689 fmul $T2a,$A2,$ba
690 fmul $T2b,$A2,$bb
691 fmul $T3a,$A3,$ba
692 fmul $T3b,$A3,$bb
693 fmul $T0a,$A0,$ba
694 fmul $T0b,$A0,$bb
695
696 fmadd $T1a,$A0,$bc,$T1a
697 fmadd $T1b,$A0,$bd,$T1b
698 fmadd $T2a,$A1,$bc,$T2a
699 fmadd $T2b,$A1,$bd,$T2b
700 fmadd $T3a,$A2,$bc,$T3a
701 fmadd $T3b,$A2,$bd,$T3b
702 fmul $dota,$A3,$bc
703 fmul $dotb,$A3,$bd
704
705 fmadd $T1a,$N1,$na,$T1a
706 fmadd $T1b,$N1,$nb,$T1b
707 lfd $A0,8($nap_d) ; load a[j] in double format
708 lfd $A1,16($nap_d)
709 fmadd $T2a,$N2,$na,$T2a
710 fmadd $T2b,$N2,$nb,$T2b
711 lfd $A2,24($nap_d) ; load a[j+1] in double format
712 lfd $A3,32($nap_d)
713 fmadd $T3a,$N3,$na,$T3a
714 fmadd $T3b,$N3,$nb,$T3b
715 fmadd $T0a,$N0,$na,$T0a
716 fmadd $T0b,$N0,$nb,$T0b
717
718 fmadd $T1a,$N0,$nc,$T1a
719 fmadd $T1b,$N0,$nd,$T1b
720 fmadd $T2a,$N1,$nc,$T2a
721 fmadd $T2b,$N1,$nd,$T2b
722 fmadd $T3a,$N2,$nc,$T3a
723 fmadd $T3b,$N2,$nd,$T3b
724 fmadd $dota,$N3,$nc,$dota
725 fmadd $dotb,$N3,$nd,$dotb
726
727 fctid $T0a,$T0a
728 fctid $T0b,$T0b
729 fctid $T1a,$T1a
730 fctid $T1b,$T1b
731 fctid $T2a,$T2a
732 fctid $T2b,$T2b
733 fctid $T3a,$T3a
734 fctid $T3b,$T3b
735
736 stfd $T0a,`$FRAME+0`($sp)
737 stfd $T0b,`$FRAME+8`($sp)
738 stfd $T1a,`$FRAME+16`($sp)
739 stfd $T1b,`$FRAME+24`($sp)
740 stfd $T2a,`$FRAME+32`($sp)
741 stfd $T2b,`$FRAME+40`($sp)
742 stfd $T3a,`$FRAME+48`($sp)
743 stfd $T3b,`$FRAME+56`($sp)
744
745.align 5
746Linner:
747 fmul $T1a,$A1,$ba
748 fmul $T1b,$A1,$bb
749 fmul $T2a,$A2,$ba
750 fmul $T2b,$A2,$bb
751 lfd $N0,40($nap_d) ; load n[j] in double format
752 lfd $N1,48($nap_d)
753 fmul $T3a,$A3,$ba
754 fmul $T3b,$A3,$bb
755 fmadd $T0a,$A0,$ba,$dota
756 fmadd $T0b,$A0,$bb,$dotb
757 lfd $N2,56($nap_d) ; load n[j+1] in double format
758 lfdu $N3,64($nap_d)
759
760 fmadd $T1a,$A0,$bc,$T1a
761 fmadd $T1b,$A0,$bd,$T1b
762 fmadd $T2a,$A1,$bc,$T2a
763 fmadd $T2b,$A1,$bd,$T2b
764 lfd $A0,8($nap_d) ; load a[j] in double format
765 lfd $A1,16($nap_d)
766 fmadd $T3a,$A2,$bc,$T3a
767 fmadd $T3b,$A2,$bd,$T3b
768 fmul $dota,$A3,$bc
769 fmul $dotb,$A3,$bd
770 lfd $A2,24($nap_d) ; load a[j+1] in double format
771 lfd $A3,32($nap_d)
772
773 fmadd $T1a,$N1,$na,$T1a
774 fmadd $T1b,$N1,$nb,$T1b
775 ld $t0,`$FRAME+0`($sp)
776 ld $t1,`$FRAME+8`($sp)
777 fmadd $T2a,$N2,$na,$T2a
778 fmadd $T2b,$N2,$nb,$T2b
779 ld $t2,`$FRAME+16`($sp)
780 ld $t3,`$FRAME+24`($sp)
781 fmadd $T3a,$N3,$na,$T3a
782 fmadd $T3b,$N3,$nb,$T3b
783 add $t0,$t0,$carry ; can not overflow
784 ld $t4,`$FRAME+32`($sp)
785 ld $t5,`$FRAME+40`($sp)
786 fmadd $T0a,$N0,$na,$T0a
787 fmadd $T0b,$N0,$nb,$T0b
788 srdi $carry,$t0,16
789 add $t1,$t1,$carry
790 srdi $carry,$t1,16
791 ld $t6,`$FRAME+48`($sp)
792 ld $t7,`$FRAME+56`($sp)
793
794 fmadd $T1a,$N0,$nc,$T1a
795 fmadd $T1b,$N0,$nd,$T1b
796 insrdi $t0,$t1,16,32
797 ld $t1,8($tp) ; tp[j]
798 fmadd $T2a,$N1,$nc,$T2a
799 fmadd $T2b,$N1,$nd,$T2b
800 add $t2,$t2,$carry
801 fmadd $T3a,$N2,$nc,$T3a
802 fmadd $T3b,$N2,$nd,$T3b
803 srdi $carry,$t2,16
804 insrdi $t0,$t2,16,16
805 fmadd $dota,$N3,$nc,$dota
806 fmadd $dotb,$N3,$nd,$dotb
807 add $t3,$t3,$carry
808 ldu $t2,16($tp) ; tp[j+1]
809 srdi $carry,$t3,16
810 insrdi $t0,$t3,16,0 ; 0..63 bits
811 add $t4,$t4,$carry
812
813 fctid $T0a,$T0a
814 fctid $T0b,$T0b
815 srdi $carry,$t4,16
816 fctid $T1a,$T1a
817 fctid $T1b,$T1b
818 add $t5,$t5,$carry
819 fctid $T2a,$T2a
820 fctid $T2b,$T2b
821 srdi $carry,$t5,16
822 insrdi $t4,$t5,16,32
823 fctid $T3a,$T3a
824 fctid $T3b,$T3b
825 add $t6,$t6,$carry
826 srdi $carry,$t6,16
827 insrdi $t4,$t6,16,16
828
829 stfd $T0a,`$FRAME+0`($sp)
830 stfd $T0b,`$FRAME+8`($sp)
831 add $t7,$t7,$carry
832 addc $t3,$t0,$t1
833___
834$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
835 extrdi $t0,$t0,32,0
836 extrdi $t1,$t1,32,0
837 adde $t0,$t0,$t1
838___
839$code.=<<___;
840 stfd $T1a,`$FRAME+16`($sp)
841 stfd $T1b,`$FRAME+24`($sp)
842 insrdi $t4,$t7,16,0 ; 64..127 bits
843 srdi $carry,$t7,16 ; upper 33 bits
844 stfd $T2a,`$FRAME+32`($sp)
845 stfd $T2b,`$FRAME+40`($sp)
846 adde $t5,$t4,$t2
847___
848$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
849 extrdi $t4,$t4,32,0
850 extrdi $t2,$t2,32,0
851 adde $t4,$t4,$t2
852___
853$code.=<<___;
854 stfd $T3a,`$FRAME+48`($sp)
855 stfd $T3b,`$FRAME+56`($sp)
856 addze $carry,$carry
857 std $t3,-16($tp) ; tp[j-1]
858 std $t5,-8($tp) ; tp[j]
859 bdnz- Linner
860
861 fctid $dota,$dota
862 fctid $dotb,$dotb
863 ld $t0,`$FRAME+0`($sp)
864 ld $t1,`$FRAME+8`($sp)
865 ld $t2,`$FRAME+16`($sp)
866 ld $t3,`$FRAME+24`($sp)
867 ld $t4,`$FRAME+32`($sp)
868 ld $t5,`$FRAME+40`($sp)
869 ld $t6,`$FRAME+48`($sp)
870 ld $t7,`$FRAME+56`($sp)
871 stfd $dota,`$FRAME+64`($sp)
872 stfd $dotb,`$FRAME+72`($sp)
873
874 add $t0,$t0,$carry ; can not overflow
875 srdi $carry,$t0,16
876 add $t1,$t1,$carry
877 srdi $carry,$t1,16
878 insrdi $t0,$t1,16,32
879 add $t2,$t2,$carry
880 ld $t1,8($tp) ; tp[j]
881 srdi $carry,$t2,16
882 insrdi $t0,$t2,16,16
883 add $t3,$t3,$carry
884 ldu $t2,16($tp) ; tp[j+1]
885 srdi $carry,$t3,16
886 insrdi $t0,$t3,16,0 ; 0..63 bits
887 add $t4,$t4,$carry
888 srdi $carry,$t4,16
889 add $t5,$t5,$carry
890 srdi $carry,$t5,16
891 insrdi $t4,$t5,16,32
892 add $t6,$t6,$carry
893 srdi $carry,$t6,16
894 insrdi $t4,$t6,16,16
895 add $t7,$t7,$carry
896 insrdi $t4,$t7,16,0 ; 64..127 bits
897 srdi $carry,$t7,16 ; upper 33 bits
898 ld $t6,`$FRAME+64`($sp)
899 ld $t7,`$FRAME+72`($sp)
900
901 addc $t3,$t0,$t1
902___
903$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
904 extrdi $t0,$t0,32,0
905 extrdi $t1,$t1,32,0
906 adde $t0,$t0,$t1
907___
908$code.=<<___;
909 adde $t5,$t4,$t2
910___
911$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
912 extrdi $t4,$t4,32,0
913 extrdi $t2,$t2,32,0
914 adde $t4,$t4,$t2
915___
916$code.=<<___;
917 addze $carry,$carry
918
919 std $t3,-16($tp) ; tp[j-1]
920 std $t5,-8($tp) ; tp[j]
921
922 add $carry,$carry,$ovf ; comsume upmost overflow
923 add $t6,$t6,$carry ; can not overflow
924 srdi $carry,$t6,16
925 add $t7,$t7,$carry
926 insrdi $t6,$t7,48,0
927 srdi $ovf,$t7,48
928 std $t6,0($tp) ; tp[num-1]
929
930 slwi $t7,$num,2
931 addi $i,$i,8
932 subf $nap_d,$t7,$nap_d ; rewind pointer
933 cmpw $i,$num
934 blt- Louter
935___
936
937$code.=<<___ if ($SIZE_T==8);
938 subf $np,$num,$np ; rewind np
939 addi $j,$j,1 ; restore counter
940 subfc $i,$i,$i ; j=0 and "clear" XER[CA]
941 addi $tp,$sp,`$FRAME+$TRANSFER+8`
942 addi $t4,$sp,`$FRAME+$TRANSFER+16`
943 addi $t5,$np,8
944 addi $t6,$rp,8
945 mtctr $j
946
947.align 4
948Lsub: ldx $t0,$tp,$i
949 ldx $t1,$np,$i
950 ldx $t2,$t4,$i
951 ldx $t3,$t5,$i
952 subfe $t0,$t1,$t0 ; tp[j]-np[j]
953 subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1]
954 stdx $t0,$rp,$i
955 stdx $t2,$t6,$i
956 addi $i,$i,16
957 bdnz- Lsub
958
959 li $i,0
960 subfe $ovf,$i,$ovf ; handle upmost overflow bit
961 and $ap,$tp,$ovf
962 andc $np,$rp,$ovf
963 or $ap,$ap,$np ; ap=borrow?tp:rp
964 addi $t7,$ap,8
965 mtctr $j
966
967.align 4
968Lcopy: ; copy or in-place refresh
969 ldx $t0,$ap,$i
970 ldx $t1,$t7,$i
971 std $i,8($nap_d) ; zap nap_d
972 std $i,16($nap_d)
973 std $i,24($nap_d)
974 std $i,32($nap_d)
975 std $i,40($nap_d)
976 std $i,48($nap_d)
977 std $i,56($nap_d)
978 stdu $i,64($nap_d)
979 stdx $t0,$rp,$i
980 stdx $t1,$t6,$i
981 stdx $i,$tp,$i ; zap tp at once
982 stdx $i,$t4,$i
983 addi $i,$i,16
984 bdnz- Lcopy
985___
986$code.=<<___ if ($SIZE_T==4);
987 subf $np,$num,$np ; rewind np
988 addi $j,$j,1 ; restore counter
989 subfc $i,$i,$i ; j=0 and "clear" XER[CA]
990 addi $tp,$sp,`$FRAME+$TRANSFER`
991 addi $np,$np,-4
992 addi $rp,$rp,-4
993 addi $ap,$sp,`$FRAME+$TRANSFER+4`
994 mtctr $j
995
996.align 4
997Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order
998 ldu $t2,16($tp)
999 lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order
1000 lwz $t5,8($np)
1001 lwz $t6,12($np)
1002 lwzu $t7,16($np)
1003 extrdi $t1,$t0,32,0
1004 extrdi $t3,$t2,32,0
1005 subfe $t4,$t4,$t0 ; tp[j]-np[j]
1006 stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order
1007 subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1]
1008 stw $t1,8($ap)
1009 subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2]
1010 stw $t2,12($ap)
1011 subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3]
1012 stwu $t3,16($ap)
1013 stw $t4,4($rp)
1014 stw $t5,8($rp)
1015 stw $t6,12($rp)
1016 stwu $t7,16($rp)
1017 bdnz- Lsub
1018
1019 li $i,0
1020 subfe $ovf,$i,$ovf ; handle upmost overflow bit
1021 addi $tp,$sp,`$FRAME+$TRANSFER+4`
1022 subf $rp,$num,$rp ; rewind rp
1023 and $ap,$tp,$ovf
1024 andc $np,$rp,$ovf
1025 or $ap,$ap,$np ; ap=borrow?tp:rp
1026 addi $tp,$sp,`$FRAME+$TRANSFER`
1027 mtctr $j
1028
1029.align 4
1030Lcopy: ; copy or in-place refresh
1031 lwz $t0,4($ap)
1032 lwz $t1,8($ap)
1033 lwz $t2,12($ap)
1034 lwzu $t3,16($ap)
1035 std $i,8($nap_d) ; zap nap_d
1036 std $i,16($nap_d)
1037 std $i,24($nap_d)
1038 std $i,32($nap_d)
1039 std $i,40($nap_d)
1040 std $i,48($nap_d)
1041 std $i,56($nap_d)
1042 stdu $i,64($nap_d)
1043 stw $t0,4($rp)
1044 stw $t1,8($rp)
1045 stw $t2,12($rp)
1046 stwu $t3,16($rp)
1047 std $i,8($tp) ; zap tp at once
1048 stdu $i,16($tp)
1049 bdnz- Lcopy
1050___
1051
1052$code.=<<___;
1053 $POP $i,0($sp)
1054 li r3,1 ; signal "handled"
1055 $POP r22,`-12*8-10*$SIZE_T`($i)
1056 $POP r23,`-12*8-9*$SIZE_T`($i)
1057 $POP r24,`-12*8-8*$SIZE_T`($i)
1058 $POP r25,`-12*8-7*$SIZE_T`($i)
1059 $POP r26,`-12*8-6*$SIZE_T`($i)
1060 $POP r27,`-12*8-5*$SIZE_T`($i)
1061 $POP r28,`-12*8-4*$SIZE_T`($i)
1062 $POP r29,`-12*8-3*$SIZE_T`($i)
1063 $POP r30,`-12*8-2*$SIZE_T`($i)
1064 $POP r31,`-12*8-1*$SIZE_T`($i)
1065 lfd f20,`-12*8`($i)
1066 lfd f21,`-11*8`($i)
1067 lfd f22,`-10*8`($i)
1068 lfd f23,`-9*8`($i)
1069 lfd f24,`-8*8`($i)
1070 lfd f25,`-7*8`($i)
1071 lfd f26,`-6*8`($i)
1072 lfd f27,`-5*8`($i)
1073 lfd f28,`-4*8`($i)
1074 lfd f29,`-3*8`($i)
1075 lfd f30,`-2*8`($i)
1076 lfd f31,`-1*8`($i)
1077 mr $sp,$i
1078 blr
1079 .long 0
1080 .byte 0,12,4,0,0x8c,10,6,0
1081 .long 0
1082
1083.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
1084___
1085
1086$code =~ s/\`([^\`]*)\`/eval $1/gem;
1087print $code;
1088close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/s390x-gf2m.pl b/src/lib/libcrypto/bn/asm/s390x-gf2m.pl
deleted file mode 100644
index cd9f13eca2..0000000000
--- a/src/lib/libcrypto/bn/asm/s390x-gf2m.pl
+++ /dev/null
@@ -1,221 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# May 2011
11#
12# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
13# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
14# the time being... gcc 4.3 appeared to generate poor code, therefore
15# the effort. And indeed, the module delivers 55%-90%(*) improvement
16# on haviest ECDSA verify and ECDH benchmarks for 163- and 571-bit
17# key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196.
18# This is for 64-bit build. In 32-bit "highgprs" case improvement is
19# even higher, for example on z990 it was measured 80%-150%. ECDSA
20# sign is modest 9%-12% faster. Keep in mind that these coefficients
21# are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is
22# burnt in it...
23#
24# (*) gcc 4.1 was observed to deliver better results than gcc 4.3,
25# so that improvement coefficients can vary from one specific
26# setup to another.
27
28$flavour = shift;
29
30if ($flavour =~ /3[12]/) {
31 $SIZE_T=4;
32 $g="";
33} else {
34 $SIZE_T=8;
35 $g="g";
36}
37
38while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
39open STDOUT,">$output";
40
41$stdframe=16*$SIZE_T+4*8;
42
43$rp="%r2";
44$a1="%r3";
45$a0="%r4";
46$b1="%r5";
47$b0="%r6";
48
49$ra="%r14";
50$sp="%r15";
51
52@T=("%r0","%r1");
53@i=("%r12","%r13");
54
55($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
56($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;
57
58$code.=<<___;
59.text
60
61.type _mul_1x1,\@function
62.align 16
63_mul_1x1:
64 lgr $a1,$a
65 sllg $a2,$a,1
66 sllg $a4,$a,2
67 sllg $a8,$a,3
68
69 srag $lo,$a1,63 # broadcast 63rd bit
70 nihh $a1,0x1fff
71 srag @i[0],$a2,63 # broadcast 62nd bit
72 nihh $a2,0x3fff
73 srag @i[1],$a4,63 # broadcast 61st bit
74 nihh $a4,0x7fff
75 ngr $lo,$b
76 ngr @i[0],$b
77 ngr @i[1],$b
78
79 lghi @T[0],0
80 lgr $a12,$a1
81 stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0
82 xgr $a12,$a2
83 stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1
84 lgr $a48,$a4
85 stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2
86 xgr $a48,$a8
87 stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2
88 xgr $a1,$a4
89
90 stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4
91 xgr $a2,$a4
92 stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4
93 xgr $a12,$a4
94 stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4
95 xgr $a1,$a48
96 stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4
97 xgr $a2,$a48
98
99 stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8
100 xgr $a12,$a48
101 stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8
102 xgr $a1,$a4
103 stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8
104 xgr $a2,$a4
105 stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8
106
107 xgr $a12,$a4
108 stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8
109 srlg $hi,$lo,1
110 stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8
111 sllg $lo,$lo,63
112 stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8
113 srlg @T[0],@i[0],2
114 stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8
115
116 lghi $mask,`0xf<<3`
117 sllg $a1,@i[0],62
118 sllg @i[0],$b,3
119 srlg @T[1],@i[1],3
120 ngr @i[0],$mask
121 sllg $a2,@i[1],61
122 srlg @i[1],$b,4-3
123 xgr $hi,@T[0]
124 ngr @i[1],$mask
125 xgr $lo,$a1
126 xgr $hi,@T[1]
127 xgr $lo,$a2
128
129 xg $lo,$stdframe(@i[0],$sp)
130 srlg @i[0],$b,8-3
131 ngr @i[0],$mask
132___
133for($n=1;$n<14;$n++) {
134$code.=<<___;
135 lg @T[1],$stdframe(@i[1],$sp)
136 srlg @i[1],$b,`($n+2)*4`-3
137 sllg @T[0],@T[1],`$n*4`
138 ngr @i[1],$mask
139 srlg @T[1],@T[1],`64-$n*4`
140 xgr $lo,@T[0]
141 xgr $hi,@T[1]
142___
143 push(@i,shift(@i)); push(@T,shift(@T));
144}
145$code.=<<___;
146 lg @T[1],$stdframe(@i[1],$sp)
147 sllg @T[0],@T[1],`$n*4`
148 srlg @T[1],@T[1],`64-$n*4`
149 xgr $lo,@T[0]
150 xgr $hi,@T[1]
151
152 lg @T[0],$stdframe(@i[0],$sp)
153 sllg @T[1],@T[0],`($n+1)*4`
154 srlg @T[0],@T[0],`64-($n+1)*4`
155 xgr $lo,@T[1]
156 xgr $hi,@T[0]
157
158 br $ra
159.size _mul_1x1,.-_mul_1x1
160
161.globl bn_GF2m_mul_2x2
162.type bn_GF2m_mul_2x2,\@function
163.align 16
164bn_GF2m_mul_2x2:
165 stm${g} %r3,%r15,3*$SIZE_T($sp)
166
167 lghi %r1,-$stdframe-128
168 la %r0,0($sp)
169 la $sp,0(%r1,$sp) # alloca
170 st${g} %r0,0($sp) # back chain
171___
172if ($SIZE_T==8) {
173my @r=map("%r$_",(6..9));
174$code.=<<___;
175 bras $ra,_mul_1x1 # a1·b1
176 stmg $lo,$hi,16($rp)
177
178 lg $a,`$stdframe+128+4*$SIZE_T`($sp)
179 lg $b,`$stdframe+128+6*$SIZE_T`($sp)
180 bras $ra,_mul_1x1 # a0·b0
181 stmg $lo,$hi,0($rp)
182
183 lg $a,`$stdframe+128+3*$SIZE_T`($sp)
184 lg $b,`$stdframe+128+5*$SIZE_T`($sp)
185 xg $a,`$stdframe+128+4*$SIZE_T`($sp)
186 xg $b,`$stdframe+128+6*$SIZE_T`($sp)
187 bras $ra,_mul_1x1 # (a0+a1)·(b0+b1)
188 lmg @r[0],@r[3],0($rp)
189
190 xgr $lo,$hi
191 xgr $hi,@r[1]
192 xgr $lo,@r[0]
193 xgr $hi,@r[2]
194 xgr $lo,@r[3]
195 xgr $hi,@r[3]
196 xgr $lo,$hi
197 stg $hi,16($rp)
198 stg $lo,8($rp)
199___
200} else {
201$code.=<<___;
202 sllg %r3,%r3,32
203 sllg %r5,%r5,32
204 or %r3,%r4
205 or %r5,%r6
206 bras $ra,_mul_1x1
207 rllg $lo,$lo,32
208 rllg $hi,$hi,32
209 stmg $lo,$hi,0($rp)
210___
211}
212$code.=<<___;
213 lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
214 br $ra
215.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
216.string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
217___
218
219$code =~ s/\`([^\`]*)\`/eval($1)/gem;
220print $code;
221close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/s390x-mont.pl b/src/lib/libcrypto/bn/asm/s390x-mont.pl
deleted file mode 100644
index 9fd64e81ee..0000000000
--- a/src/lib/libcrypto/bn/asm/s390x-mont.pl
+++ /dev/null
@@ -1,277 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# April 2007.
11#
12# Performance improvement over vanilla C code varies from 85% to 45%
13# depending on key length and benchmark. Unfortunately in this context
14# these are not very impressive results [for code that utilizes "wide"
15# 64x64=128-bit multiplication, which is not commonly available to C
16# programmers], at least hand-coded bn_asm.c replacement is known to
17# provide 30-40% better results for longest keys. Well, on a second
18# thought it's not very surprising, because z-CPUs are single-issue
19# and _strictly_ in-order execution, while bn_mul_mont is more or less
20# dependent on CPU ability to pipe-line instructions and have several
21# of them "in-flight" at the same time. I mean while other methods,
22# for example Karatsuba, aim to minimize amount of multiplications at
23# the cost of other operations increase, bn_mul_mont aim to neatly
24# "overlap" multiplications and the other operations [and on most
25# platforms even minimize the amount of the other operations, in
26# particular references to memory]. But it's possible to improve this
27# module performance by implementing dedicated squaring code-path and
28# possibly by unrolling loops...
29
30# January 2009.
31#
32# Reschedule to minimize/avoid Address Generation Interlock hazard,
33# make inner loops counter-based.
34
35# November 2010.
36#
37# Adapt for -m31 build. If kernel supports what's called "highgprs"
38# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
39# instructions and achieve "64-bit" performance even in 31-bit legacy
40# application context. The feature is not specific to any particular
41# processor, as long as it's "z-CPU". Latter implies that the code
42# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
43# is achieved by swapping words after 64-bit loads, follow _dswap-s.
44# On z990 it was measured to perform 2.6-2.2 times better than
45# compiler-generated code, less for longer keys...
46
47$flavour = shift;
48
49if ($flavour =~ /3[12]/) {
50 $SIZE_T=4;
51 $g="";
52} else {
53 $SIZE_T=8;
54 $g="g";
55}
56
57while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
58open STDOUT,">$output";
59
60$stdframe=16*$SIZE_T+4*8;
61
62$mn0="%r0";
63$num="%r1";
64
65# int bn_mul_mont(
66$rp="%r2"; # BN_ULONG *rp,
67$ap="%r3"; # const BN_ULONG *ap,
68$bp="%r4"; # const BN_ULONG *bp,
69$np="%r5"; # const BN_ULONG *np,
70$n0="%r6"; # const BN_ULONG *n0,
71#$num="160(%r15)" # int num);
72
73$bi="%r2"; # zaps rp
74$j="%r7";
75
76$ahi="%r8";
77$alo="%r9";
78$nhi="%r10";
79$nlo="%r11";
80$AHI="%r12";
81$NHI="%r13";
82$count="%r14";
83$sp="%r15";
84
85$code.=<<___;
86.text
87.globl bn_mul_mont
88.type bn_mul_mont,\@function
89bn_mul_mont:
90 lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num
91 sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes
92 la $bp,0($num,$bp)
93
94 st${g} %r2,2*$SIZE_T($sp)
95
96 cghi $num,16 #
97 lghi %r2,0 #
98 blr %r14 # if($num<16) return 0;
99___
100$code.=<<___ if ($flavour =~ /3[12]/);
101 tmll $num,4
102 bnzr %r14 # if ($num&1) return 0;
103___
104$code.=<<___ if ($flavour !~ /3[12]/);
105 cghi $num,96 #
106 bhr %r14 # if($num>96) return 0;
107___
108$code.=<<___;
109 stm${g} %r3,%r15,3*$SIZE_T($sp)
110
111 lghi $rp,-$stdframe-8 # leave room for carry bit
112 lcgr $j,$num # -$num
113 lgr %r0,$sp
114 la $rp,0($rp,$sp)
115 la $sp,0($j,$rp) # alloca
116 st${g} %r0,0($sp) # back chain
117
118 sra $num,3 # restore $num
119 la $bp,0($j,$bp) # restore $bp
120 ahi $num,-1 # adjust $num for inner loop
121 lg $n0,0($n0) # pull n0
122 _dswap $n0
123
124 lg $bi,0($bp)
125 _dswap $bi
126 lg $alo,0($ap)
127 _dswap $alo
128 mlgr $ahi,$bi # ap[0]*bp[0]
129 lgr $AHI,$ahi
130
131 lgr $mn0,$alo # "tp[0]"*n0
132 msgr $mn0,$n0
133
134 lg $nlo,0($np) #
135 _dswap $nlo
136 mlgr $nhi,$mn0 # np[0]*m1
137 algr $nlo,$alo # +="tp[0]"
138 lghi $NHI,0
139 alcgr $NHI,$nhi
140
141 la $j,8(%r0) # j=1
142 lr $count,$num
143
144.align 16
145.L1st:
146 lg $alo,0($j,$ap)
147 _dswap $alo
148 mlgr $ahi,$bi # ap[j]*bp[0]
149 algr $alo,$AHI
150 lghi $AHI,0
151 alcgr $AHI,$ahi
152
153 lg $nlo,0($j,$np)
154 _dswap $nlo
155 mlgr $nhi,$mn0 # np[j]*m1
156 algr $nlo,$NHI
157 lghi $NHI,0
158 alcgr $nhi,$NHI # +="tp[j]"
159 algr $nlo,$alo
160 alcgr $NHI,$nhi
161
162 stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
163 la $j,8($j) # j++
164 brct $count,.L1st
165
166 algr $NHI,$AHI
167 lghi $AHI,0
168 alcgr $AHI,$AHI # upmost overflow bit
169 stg $NHI,$stdframe-8($j,$sp)
170 stg $AHI,$stdframe($j,$sp)
171 la $bp,8($bp) # bp++
172
173.Louter:
174 lg $bi,0($bp) # bp[i]
175 _dswap $bi
176 lg $alo,0($ap)
177 _dswap $alo
178 mlgr $ahi,$bi # ap[0]*bp[i]
179 alg $alo,$stdframe($sp) # +=tp[0]
180 lghi $AHI,0
181 alcgr $AHI,$ahi
182
183 lgr $mn0,$alo
184 msgr $mn0,$n0 # tp[0]*n0
185
186 lg $nlo,0($np) # np[0]
187 _dswap $nlo
188 mlgr $nhi,$mn0 # np[0]*m1
189 algr $nlo,$alo # +="tp[0]"
190 lghi $NHI,0
191 alcgr $NHI,$nhi
192
193 la $j,8(%r0) # j=1
194 lr $count,$num
195
196.align 16
197.Linner:
198 lg $alo,0($j,$ap)
199 _dswap $alo
200 mlgr $ahi,$bi # ap[j]*bp[i]
201 algr $alo,$AHI
202 lghi $AHI,0
203 alcgr $ahi,$AHI
204 alg $alo,$stdframe($j,$sp)# +=tp[j]
205 alcgr $AHI,$ahi
206
207 lg $nlo,0($j,$np)
208 _dswap $nlo
209 mlgr $nhi,$mn0 # np[j]*m1
210 algr $nlo,$NHI
211 lghi $NHI,0
212 alcgr $nhi,$NHI
213 algr $nlo,$alo # +="tp[j]"
214 alcgr $NHI,$nhi
215
216 stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
217 la $j,8($j) # j++
218 brct $count,.Linner
219
220 algr $NHI,$AHI
221 lghi $AHI,0
222 alcgr $AHI,$AHI
223 alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
224 lghi $ahi,0
225 alcgr $AHI,$ahi # new upmost overflow bit
226 stg $NHI,$stdframe-8($j,$sp)
227 stg $AHI,$stdframe($j,$sp)
228
229 la $bp,8($bp) # bp++
230 cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num]
231 jne .Louter
232
233 l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp
234 la $ap,$stdframe($sp)
235 ahi $num,1 # restore $num, incidentally clears "borrow"
236
237 la $j,0(%r0)
238 lr $count,$num
239.Lsub: lg $alo,0($j,$ap)
240 lg $nlo,0($j,$np)
241 _dswap $nlo
242 slbgr $alo,$nlo
243 stg $alo,0($j,$rp)
244 la $j,8($j)
245 brct $count,.Lsub
246 lghi $ahi,0
247 slbgr $AHI,$ahi # handle upmost carry
248
249 ngr $ap,$AHI
250 lghi $np,-1
251 xgr $np,$AHI
252 ngr $np,$rp
253 ogr $ap,$np # ap=borrow?tp:rp
254
255 la $j,0(%r0)
256 lgr $count,$num
257.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
258 _dswap $alo
259 stg $j,$stdframe($j,$sp) # zap tp
260 stg $alo,0($j,$rp)
261 la $j,8($j)
262 brct $count,.Lcopy
263
264 la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
265 lm${g} %r6,%r15,0(%r1)
266 lghi %r2,1 # signal "processed"
267 br %r14
268.size bn_mul_mont,.-bn_mul_mont
269.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
270___
271
272foreach (split("\n",$code)) {
273 s/\`([^\`]*)\`/eval $1/ge;
274 s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
275 print $_,"\n";
276}
277close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/s390x.S b/src/lib/libcrypto/bn/asm/s390x.S
deleted file mode 100755
index 43fcb79bc0..0000000000
--- a/src/lib/libcrypto/bn/asm/s390x.S
+++ /dev/null
@@ -1,678 +0,0 @@
1.ident "s390x.S, version 1.1"
2// ====================================================================
3// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
4// project.
5//
6// Rights for redistribution and usage in source and binary forms are
7// granted according to the OpenSSL license. Warranty of any kind is
8// disclaimed.
9// ====================================================================
10
11.text
12
13#define zero %r0
14
15// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
16.globl bn_mul_add_words
17.type bn_mul_add_words,@function
18.align 4
19bn_mul_add_words:
20 lghi zero,0 // zero = 0
21 la %r1,0(%r2) // put rp aside
22 lghi %r2,0 // i=0;
23 ltgfr %r4,%r4
24 bler %r14 // if (len<=0) return 0;
25
26 stmg %r6,%r10,48(%r15)
27 lghi %r10,3
28 lghi %r8,0 // carry = 0
29 nr %r10,%r4 // len%4
30 sra %r4,2 // cnt=len/4
31 jz .Loop1_madd // carry is incidentally cleared if branch taken
32 algr zero,zero // clear carry
33
34.Loop4_madd:
35 lg %r7,0(%r2,%r3) // ap[i]
36 mlgr %r6,%r5 // *=w
37 alcgr %r7,%r8 // +=carry
38 alcgr %r6,zero
39 alg %r7,0(%r2,%r1) // +=rp[i]
40 stg %r7,0(%r2,%r1) // rp[i]=
41
42 lg %r9,8(%r2,%r3)
43 mlgr %r8,%r5
44 alcgr %r9,%r6
45 alcgr %r8,zero
46 alg %r9,8(%r2,%r1)
47 stg %r9,8(%r2,%r1)
48
49 lg %r7,16(%r2,%r3)
50 mlgr %r6,%r5
51 alcgr %r7,%r8
52 alcgr %r6,zero
53 alg %r7,16(%r2,%r1)
54 stg %r7,16(%r2,%r1)
55
56 lg %r9,24(%r2,%r3)
57 mlgr %r8,%r5
58 alcgr %r9,%r6
59 alcgr %r8,zero
60 alg %r9,24(%r2,%r1)
61 stg %r9,24(%r2,%r1)
62
63 la %r2,32(%r2) // i+=4
64 brct %r4,.Loop4_madd
65
66 la %r10,1(%r10) // see if len%4 is zero ...
67 brct %r10,.Loop1_madd // without touching condition code:-)
68
69.Lend_madd:
70 alcgr %r8,zero // collect carry bit
71 lgr %r2,%r8
72 lmg %r6,%r10,48(%r15)
73 br %r14
74
75.Loop1_madd:
76 lg %r7,0(%r2,%r3) // ap[i]
77 mlgr %r6,%r5 // *=w
78 alcgr %r7,%r8 // +=carry
79 alcgr %r6,zero
80 alg %r7,0(%r2,%r1) // +=rp[i]
81 stg %r7,0(%r2,%r1) // rp[i]=
82
83 lgr %r8,%r6
84 la %r2,8(%r2) // i++
85 brct %r10,.Loop1_madd
86
87 j .Lend_madd
88.size bn_mul_add_words,.-bn_mul_add_words
89
90// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
91.globl bn_mul_words
92.type bn_mul_words,@function
93.align 4
94bn_mul_words:
95 lghi zero,0 // zero = 0
96 la %r1,0(%r2) // put rp aside
97 lghi %r2,0 // i=0;
98 ltgfr %r4,%r4
99 bler %r14 // if (len<=0) return 0;
100
101 stmg %r6,%r10,48(%r15)
102 lghi %r10,3
103 lghi %r8,0 // carry = 0
104 nr %r10,%r4 // len%4
105 sra %r4,2 // cnt=len/4
106 jz .Loop1_mul // carry is incidentally cleared if branch taken
107 algr zero,zero // clear carry
108
109.Loop4_mul:
110 lg %r7,0(%r2,%r3) // ap[i]
111 mlgr %r6,%r5 // *=w
112 alcgr %r7,%r8 // +=carry
113 stg %r7,0(%r2,%r1) // rp[i]=
114
115 lg %r9,8(%r2,%r3)
116 mlgr %r8,%r5
117 alcgr %r9,%r6
118 stg %r9,8(%r2,%r1)
119
120 lg %r7,16(%r2,%r3)
121 mlgr %r6,%r5
122 alcgr %r7,%r8
123 stg %r7,16(%r2,%r1)
124
125 lg %r9,24(%r2,%r3)
126 mlgr %r8,%r5
127 alcgr %r9,%r6
128 stg %r9,24(%r2,%r1)
129
130 la %r2,32(%r2) // i+=4
131 brct %r4,.Loop4_mul
132
133 la %r10,1(%r10) // see if len%4 is zero ...
134 brct %r10,.Loop1_mul // without touching condition code:-)
135
136.Lend_mul:
137 alcgr %r8,zero // collect carry bit
138 lgr %r2,%r8
139 lmg %r6,%r10,48(%r15)
140 br %r14
141
142.Loop1_mul:
143 lg %r7,0(%r2,%r3) // ap[i]
144 mlgr %r6,%r5 // *=w
145 alcgr %r7,%r8 // +=carry
146 stg %r7,0(%r2,%r1) // rp[i]=
147
148 lgr %r8,%r6
149 la %r2,8(%r2) // i++
150 brct %r10,.Loop1_mul
151
152 j .Lend_mul
153.size bn_mul_words,.-bn_mul_words
154
155// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r2,int r4)
156.globl bn_sqr_words
157.type bn_sqr_words,@function
158.align 4
159bn_sqr_words:
160 ltgfr %r4,%r4
161 bler %r14
162
163 stmg %r6,%r7,48(%r15)
164 srag %r1,%r4,2 // cnt=len/4
165 jz .Loop1_sqr
166
167.Loop4_sqr:
168 lg %r7,0(%r3)
169 mlgr %r6,%r7
170 stg %r7,0(%r2)
171 stg %r6,8(%r2)
172
173 lg %r7,8(%r3)
174 mlgr %r6,%r7
175 stg %r7,16(%r2)
176 stg %r6,24(%r2)
177
178 lg %r7,16(%r3)
179 mlgr %r6,%r7
180 stg %r7,32(%r2)
181 stg %r6,40(%r2)
182
183 lg %r7,24(%r3)
184 mlgr %r6,%r7
185 stg %r7,48(%r2)
186 stg %r6,56(%r2)
187
188 la %r3,32(%r3)
189 la %r2,64(%r2)
190 brct %r1,.Loop4_sqr
191
192 lghi %r1,3
193 nr %r4,%r1 // cnt=len%4
194 jz .Lend_sqr
195
196.Loop1_sqr:
197 lg %r7,0(%r3)
198 mlgr %r6,%r7
199 stg %r7,0(%r2)
200 stg %r6,8(%r2)
201
202 la %r3,8(%r3)
203 la %r2,16(%r2)
204 brct %r4,.Loop1_sqr
205
206.Lend_sqr:
207 lmg %r6,%r7,48(%r15)
208 br %r14
209.size bn_sqr_words,.-bn_sqr_words
210
211// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
212.globl bn_div_words
213.type bn_div_words,@function
214.align 4
215bn_div_words:
216 dlgr %r2,%r4
217 lgr %r2,%r3
218 br %r14
219.size bn_div_words,.-bn_div_words
220
221// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
222.globl bn_add_words
223.type bn_add_words,@function
224.align 4
225bn_add_words:
226 la %r1,0(%r2) // put rp aside
227 lghi %r2,0 // i=0
228 ltgfr %r5,%r5
229 bler %r14 // if (len<=0) return 0;
230
231 stg %r6,48(%r15)
232 lghi %r6,3
233 nr %r6,%r5 // len%4
234 sra %r5,2 // len/4, use sra because it sets condition code
235 jz .Loop1_add // carry is incidentally cleared if branch taken
236 algr %r2,%r2 // clear carry
237
238.Loop4_add:
239 lg %r0,0(%r2,%r3)
240 alcg %r0,0(%r2,%r4)
241 stg %r0,0(%r2,%r1)
242 lg %r0,8(%r2,%r3)
243 alcg %r0,8(%r2,%r4)
244 stg %r0,8(%r2,%r1)
245 lg %r0,16(%r2,%r3)
246 alcg %r0,16(%r2,%r4)
247 stg %r0,16(%r2,%r1)
248 lg %r0,24(%r2,%r3)
249 alcg %r0,24(%r2,%r4)
250 stg %r0,24(%r2,%r1)
251
252 la %r2,32(%r2) // i+=4
253 brct %r5,.Loop4_add
254
255 la %r6,1(%r6) // see if len%4 is zero ...
256 brct %r6,.Loop1_add // without touching condition code:-)
257
258.Lexit_add:
259 lghi %r2,0
260 alcgr %r2,%r2
261 lg %r6,48(%r15)
262 br %r14
263
264.Loop1_add:
265 lg %r0,0(%r2,%r3)
266 alcg %r0,0(%r2,%r4)
267 stg %r0,0(%r2,%r1)
268
269 la %r2,8(%r2) // i++
270 brct %r6,.Loop1_add
271
272 j .Lexit_add
273.size bn_add_words,.-bn_add_words
274
275// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
276.globl bn_sub_words
277.type bn_sub_words,@function
278.align 4
279bn_sub_words:
280 la %r1,0(%r2) // put rp aside
281 lghi %r2,0 // i=0
282 ltgfr %r5,%r5
283 bler %r14 // if (len<=0) return 0;
284
285 stg %r6,48(%r15)
286 lghi %r6,3
287 nr %r6,%r5 // len%4
288 sra %r5,2 // len/4, use sra because it sets condition code
289 jnz .Loop4_sub // borrow is incidentally cleared if branch taken
290 slgr %r2,%r2 // clear borrow
291
292.Loop1_sub:
293 lg %r0,0(%r2,%r3)
294 slbg %r0,0(%r2,%r4)
295 stg %r0,0(%r2,%r1)
296
297 la %r2,8(%r2) // i++
298 brct %r6,.Loop1_sub
299 j .Lexit_sub
300
301.Loop4_sub:
302 lg %r0,0(%r2,%r3)
303 slbg %r0,0(%r2,%r4)
304 stg %r0,0(%r2,%r1)
305 lg %r0,8(%r2,%r3)
306 slbg %r0,8(%r2,%r4)
307 stg %r0,8(%r2,%r1)
308 lg %r0,16(%r2,%r3)
309 slbg %r0,16(%r2,%r4)
310 stg %r0,16(%r2,%r1)
311 lg %r0,24(%r2,%r3)
312 slbg %r0,24(%r2,%r4)
313 stg %r0,24(%r2,%r1)
314
315 la %r2,32(%r2) // i+=4
316 brct %r5,.Loop4_sub
317
318 la %r6,1(%r6) // see if len%4 is zero ...
319 brct %r6,.Loop1_sub // without touching condition code:-)
320
321.Lexit_sub:
322 lghi %r2,0
323 slbgr %r2,%r2
324 lcgr %r2,%r2
325 lg %r6,48(%r15)
326 br %r14
327.size bn_sub_words,.-bn_sub_words
328
329#define c1 %r1
330#define c2 %r5
331#define c3 %r8
332
333#define mul_add_c(ai,bi,c1,c2,c3) \
334 lg %r7,ai*8(%r3); \
335 mlg %r6,bi*8(%r4); \
336 algr c1,%r7; \
337 alcgr c2,%r6; \
338 alcgr c3,zero
339
340// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
341.globl bn_mul_comba8
342.type bn_mul_comba8,@function
343.align 4
344bn_mul_comba8:
345 stmg %r6,%r8,48(%r15)
346
347 lghi c1,0
348 lghi c2,0
349 lghi c3,0
350 lghi zero,0
351
352 mul_add_c(0,0,c1,c2,c3);
353 stg c1,0*8(%r2)
354 lghi c1,0
355
356 mul_add_c(0,1,c2,c3,c1);
357 mul_add_c(1,0,c2,c3,c1);
358 stg c2,1*8(%r2)
359 lghi c2,0
360
361 mul_add_c(2,0,c3,c1,c2);
362 mul_add_c(1,1,c3,c1,c2);
363 mul_add_c(0,2,c3,c1,c2);
364 stg c3,2*8(%r2)
365 lghi c3,0
366
367 mul_add_c(0,3,c1,c2,c3);
368 mul_add_c(1,2,c1,c2,c3);
369 mul_add_c(2,1,c1,c2,c3);
370 mul_add_c(3,0,c1,c2,c3);
371 stg c1,3*8(%r2)
372 lghi c1,0
373
374 mul_add_c(4,0,c2,c3,c1);
375 mul_add_c(3,1,c2,c3,c1);
376 mul_add_c(2,2,c2,c3,c1);
377 mul_add_c(1,3,c2,c3,c1);
378 mul_add_c(0,4,c2,c3,c1);
379 stg c2,4*8(%r2)
380 lghi c2,0
381
382 mul_add_c(0,5,c3,c1,c2);
383 mul_add_c(1,4,c3,c1,c2);
384 mul_add_c(2,3,c3,c1,c2);
385 mul_add_c(3,2,c3,c1,c2);
386 mul_add_c(4,1,c3,c1,c2);
387 mul_add_c(5,0,c3,c1,c2);
388 stg c3,5*8(%r2)
389 lghi c3,0
390
391 mul_add_c(6,0,c1,c2,c3);
392 mul_add_c(5,1,c1,c2,c3);
393 mul_add_c(4,2,c1,c2,c3);
394 mul_add_c(3,3,c1,c2,c3);
395 mul_add_c(2,4,c1,c2,c3);
396 mul_add_c(1,5,c1,c2,c3);
397 mul_add_c(0,6,c1,c2,c3);
398 stg c1,6*8(%r2)
399 lghi c1,0
400
401 mul_add_c(0,7,c2,c3,c1);
402 mul_add_c(1,6,c2,c3,c1);
403 mul_add_c(2,5,c2,c3,c1);
404 mul_add_c(3,4,c2,c3,c1);
405 mul_add_c(4,3,c2,c3,c1);
406 mul_add_c(5,2,c2,c3,c1);
407 mul_add_c(6,1,c2,c3,c1);
408 mul_add_c(7,0,c2,c3,c1);
409 stg c2,7*8(%r2)
410 lghi c2,0
411
412 mul_add_c(7,1,c3,c1,c2);
413 mul_add_c(6,2,c3,c1,c2);
414 mul_add_c(5,3,c3,c1,c2);
415 mul_add_c(4,4,c3,c1,c2);
416 mul_add_c(3,5,c3,c1,c2);
417 mul_add_c(2,6,c3,c1,c2);
418 mul_add_c(1,7,c3,c1,c2);
419 stg c3,8*8(%r2)
420 lghi c3,0
421
422 mul_add_c(2,7,c1,c2,c3);
423 mul_add_c(3,6,c1,c2,c3);
424 mul_add_c(4,5,c1,c2,c3);
425 mul_add_c(5,4,c1,c2,c3);
426 mul_add_c(6,3,c1,c2,c3);
427 mul_add_c(7,2,c1,c2,c3);
428 stg c1,9*8(%r2)
429 lghi c1,0
430
431 mul_add_c(7,3,c2,c3,c1);
432 mul_add_c(6,4,c2,c3,c1);
433 mul_add_c(5,5,c2,c3,c1);
434 mul_add_c(4,6,c2,c3,c1);
435 mul_add_c(3,7,c2,c3,c1);
436 stg c2,10*8(%r2)
437 lghi c2,0
438
439 mul_add_c(4,7,c3,c1,c2);
440 mul_add_c(5,6,c3,c1,c2);
441 mul_add_c(6,5,c3,c1,c2);
442 mul_add_c(7,4,c3,c1,c2);
443 stg c3,11*8(%r2)
444 lghi c3,0
445
446 mul_add_c(7,5,c1,c2,c3);
447 mul_add_c(6,6,c1,c2,c3);
448 mul_add_c(5,7,c1,c2,c3);
449 stg c1,12*8(%r2)
450 lghi c1,0
451
452
453 mul_add_c(6,7,c2,c3,c1);
454 mul_add_c(7,6,c2,c3,c1);
455 stg c2,13*8(%r2)
456 lghi c2,0
457
458 mul_add_c(7,7,c3,c1,c2);
459 stg c3,14*8(%r2)
460 stg c1,15*8(%r2)
461
462 lmg %r6,%r8,48(%r15)
463 br %r14
464.size bn_mul_comba8,.-bn_mul_comba8
465
466// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
467.globl bn_mul_comba4
468.type bn_mul_comba4,@function
469.align 4
470bn_mul_comba4:
471 stmg %r6,%r8,48(%r15)
472
473 lghi c1,0
474 lghi c2,0
475 lghi c3,0
476 lghi zero,0
477
478 mul_add_c(0,0,c1,c2,c3);
479 stg c1,0*8(%r3)
480 lghi c1,0
481
482 mul_add_c(0,1,c2,c3,c1);
483 mul_add_c(1,0,c2,c3,c1);
484 stg c2,1*8(%r2)
485 lghi c2,0
486
487 mul_add_c(2,0,c3,c1,c2);
488 mul_add_c(1,1,c3,c1,c2);
489 mul_add_c(0,2,c3,c1,c2);
490 stg c3,2*8(%r2)
491 lghi c3,0
492
493 mul_add_c(0,3,c1,c2,c3);
494 mul_add_c(1,2,c1,c2,c3);
495 mul_add_c(2,1,c1,c2,c3);
496 mul_add_c(3,0,c1,c2,c3);
497 stg c1,3*8(%r2)
498 lghi c1,0
499
500 mul_add_c(3,1,c2,c3,c1);
501 mul_add_c(2,2,c2,c3,c1);
502 mul_add_c(1,3,c2,c3,c1);
503 stg c2,4*8(%r2)
504 lghi c2,0
505
506 mul_add_c(2,3,c3,c1,c2);
507 mul_add_c(3,2,c3,c1,c2);
508 stg c3,5*8(%r2)
509 lghi c3,0
510
511 mul_add_c(3,3,c1,c2,c3);
512 stg c1,6*8(%r2)
513 stg c2,7*8(%r2)
514
515 stmg %r6,%r8,48(%r15)
516 br %r14
517.size bn_mul_comba4,.-bn_mul_comba4
518
519#define sqr_add_c(ai,c1,c2,c3) \
520 lg %r7,ai*8(%r3); \
521 mlgr %r6,%r7; \
522 algr c1,%r7; \
523 alcgr c2,%r6; \
524 alcgr c3,zero
525
526#define sqr_add_c2(ai,aj,c1,c2,c3) \
527 lg %r7,ai*8(%r3); \
528 mlg %r6,aj*8(%r3); \
529 algr c1,%r7; \
530 alcgr c2,%r6; \
531 alcgr c3,zero; \
532 algr c1,%r7; \
533 alcgr c2,%r6; \
534 alcgr c3,zero
535
536// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
537.globl bn_sqr_comba8
538.type bn_sqr_comba8,@function
539.align 4
540bn_sqr_comba8:
541 stmg %r6,%r8,48(%r15)
542
543 lghi c1,0
544 lghi c2,0
545 lghi c3,0
546 lghi zero,0
547
548 sqr_add_c(0,c1,c2,c3);
549 stg c1,0*8(%r2)
550 lghi c1,0
551
552 sqr_add_c2(1,0,c2,c3,c1);
553 stg c2,1*8(%r2)
554 lghi c2,0
555
556 sqr_add_c(1,c3,c1,c2);
557 sqr_add_c2(2,0,c3,c1,c2);
558 stg c3,2*8(%r2)
559 lghi c3,0
560
561 sqr_add_c2(3,0,c1,c2,c3);
562 sqr_add_c2(2,1,c1,c2,c3);
563 stg c1,3*8(%r2)
564 lghi c1,0
565
566 sqr_add_c(2,c2,c3,c1);
567 sqr_add_c2(3,1,c2,c3,c1);
568 sqr_add_c2(4,0,c2,c3,c1);
569 stg c2,4*8(%r2)
570 lghi c2,0
571
572 sqr_add_c2(5,0,c3,c1,c2);
573 sqr_add_c2(4,1,c3,c1,c2);
574 sqr_add_c2(3,2,c3,c1,c2);
575 stg c3,5*8(%r2)
576 lghi c3,0
577
578 sqr_add_c(3,c1,c2,c3);
579 sqr_add_c2(4,2,c1,c2,c3);
580 sqr_add_c2(5,1,c1,c2,c3);
581 sqr_add_c2(6,0,c1,c2,c3);
582 stg c1,6*8(%r2)
583 lghi c1,0
584
585 sqr_add_c2(7,0,c2,c3,c1);
586 sqr_add_c2(6,1,c2,c3,c1);
587 sqr_add_c2(5,2,c2,c3,c1);
588 sqr_add_c2(4,3,c2,c3,c1);
589 stg c2,7*8(%r2)
590 lghi c2,0
591
592 sqr_add_c(4,c3,c1,c2);
593 sqr_add_c2(5,3,c3,c1,c2);
594 sqr_add_c2(6,2,c3,c1,c2);
595 sqr_add_c2(7,1,c3,c1,c2);
596 stg c3,8*8(%r2)
597 lghi c3,0
598
599 sqr_add_c2(7,2,c1,c2,c3);
600 sqr_add_c2(6,3,c1,c2,c3);
601 sqr_add_c2(5,4,c1,c2,c3);
602 stg c1,9*8(%r2)
603 lghi c1,0
604
605 sqr_add_c(5,c2,c3,c1);
606 sqr_add_c2(6,4,c2,c3,c1);
607 sqr_add_c2(7,3,c2,c3,c1);
608 stg c2,10*8(%r2)
609 lghi c2,0
610
611 sqr_add_c2(7,4,c3,c1,c2);
612 sqr_add_c2(6,5,c3,c1,c2);
613 stg c3,11*8(%r2)
614 lghi c3,0
615
616 sqr_add_c(6,c1,c2,c3);
617 sqr_add_c2(7,5,c1,c2,c3);
618 stg c1,12*8(%r2)
619 lghi c1,0
620
621 sqr_add_c2(7,6,c2,c3,c1);
622 stg c2,13*8(%r2)
623 lghi c2,0
624
625 sqr_add_c(7,c3,c1,c2);
626 stg c3,14*8(%r2)
627 stg c1,15*8(%r2)
628
629 lmg %r6,%r8,48(%r15)
630 br %r14
631.size bn_sqr_comba8,.-bn_sqr_comba8
632
633// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
634.globl bn_sqr_comba4
635.type bn_sqr_comba4,@function
636.align 4
637bn_sqr_comba4:
638 stmg %r6,%r8,48(%r15)
639
640 lghi c1,0
641 lghi c2,0
642 lghi c3,0
643 lghi zero,0
644
645 sqr_add_c(0,c1,c2,c3);
646 stg c1,0*8(%r2)
647 lghi c1,0
648
649 sqr_add_c2(1,0,c2,c3,c1);
650 stg c2,1*8(%r2)
651 lghi c2,0
652
653 sqr_add_c(1,c3,c1,c2);
654 sqr_add_c2(2,0,c3,c1,c2);
655 stg c3,2*8(%r2)
656 lghi c3,0
657
658 sqr_add_c2(3,0,c1,c2,c3);
659 sqr_add_c2(2,1,c1,c2,c3);
660 stg c1,3*8(%r2)
661 lghi c1,0
662
663 sqr_add_c(2,c2,c3,c1);
664 sqr_add_c2(3,1,c2,c3,c1);
665 stg c2,4*8(%r2)
666 lghi c2,0
667
668 sqr_add_c2(3,2,c3,c1,c2);
669 stg c3,5*8(%r2)
670 lghi c3,0
671
672 sqr_add_c(3,c1,c2,c3);
673 stg c1,6*8(%r2)
674 stg c2,7*8(%r2)
675
676 lmg %r6,%r8,48(%r15)
677 br %r14
678.size bn_sqr_comba4,.-bn_sqr_comba4
diff --git a/src/lib/libcrypto/bn/asm/sparcv8.S b/src/lib/libcrypto/bn/asm/sparcv8.S
deleted file mode 100644
index 88c5dc480a..0000000000
--- a/src/lib/libcrypto/bn/asm/sparcv8.S
+++ /dev/null
@@ -1,1458 +0,0 @@
1.ident "sparcv8.s, Version 1.4"
2.ident "SPARC v8 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
3
4/*
5 * ====================================================================
6 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
7 * project.
8 *
9 * Rights for redistribution and usage in source and binary forms are
10 * granted according to the OpenSSL license. Warranty of any kind is
11 * disclaimed.
12 * ====================================================================
13 */
14
15/*
16 * This is my modest contributon to OpenSSL project (see
17 * http://www.openssl.org/ for more information about it) and is
18 * a drop-in SuperSPARC ISA replacement for crypto/bn/bn_asm.c
19 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
20 *
21 * See bn_asm.sparc.v8plus.S for more details.
22 */
23
24/*
25 * Revision history.
26 *
27 * 1.1 - new loop unrolling model(*);
28 * 1.2 - made gas friendly;
29 * 1.3 - fixed problem with /usr/ccs/lib/cpp;
30 * 1.4 - some retunes;
31 *
32 * (*) see bn_asm.sparc.v8plus.S for details
33 */
34
35.section ".text",#alloc,#execinstr
36.file "bn_asm.sparc.v8.S"
37
38.align 32
39
40.global bn_mul_add_words
41/*
42 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
43 * BN_ULONG *rp,*ap;
44 * int num;
45 * BN_ULONG w;
46 */
47bn_mul_add_words:
48 cmp %o2,0
49 bg,a .L_bn_mul_add_words_proceed
50 ld [%o1],%g2
51 retl
52 clr %o0
53
54.L_bn_mul_add_words_proceed:
55 andcc %o2,-4,%g0
56 bz .L_bn_mul_add_words_tail
57 clr %o5
58
59.L_bn_mul_add_words_loop:
60 ld [%o0],%o4
61 ld [%o1+4],%g3
62 umul %o3,%g2,%g2
63 rd %y,%g1
64 addcc %o4,%o5,%o4
65 addx %g1,0,%g1
66 addcc %o4,%g2,%o4
67 st %o4,[%o0]
68 addx %g1,0,%o5
69
70 ld [%o0+4],%o4
71 ld [%o1+8],%g2
72 umul %o3,%g3,%g3
73 dec 4,%o2
74 rd %y,%g1
75 addcc %o4,%o5,%o4
76 addx %g1,0,%g1
77 addcc %o4,%g3,%o4
78 st %o4,[%o0+4]
79 addx %g1,0,%o5
80
81 ld [%o0+8],%o4
82 ld [%o1+12],%g3
83 umul %o3,%g2,%g2
84 inc 16,%o1
85 rd %y,%g1
86 addcc %o4,%o5,%o4
87 addx %g1,0,%g1
88 addcc %o4,%g2,%o4
89 st %o4,[%o0+8]
90 addx %g1,0,%o5
91
92 ld [%o0+12],%o4
93 umul %o3,%g3,%g3
94 inc 16,%o0
95 rd %y,%g1
96 addcc %o4,%o5,%o4
97 addx %g1,0,%g1
98 addcc %o4,%g3,%o4
99 st %o4,[%o0-4]
100 addx %g1,0,%o5
101 andcc %o2,-4,%g0
102 bnz,a .L_bn_mul_add_words_loop
103 ld [%o1],%g2
104
105 tst %o2
106 bnz,a .L_bn_mul_add_words_tail
107 ld [%o1],%g2
108.L_bn_mul_add_words_return:
109 retl
110 mov %o5,%o0
111 nop
112
113.L_bn_mul_add_words_tail:
114 ld [%o0],%o4
115 umul %o3,%g2,%g2
116 addcc %o4,%o5,%o4
117 rd %y,%g1
118 addx %g1,0,%g1
119 addcc %o4,%g2,%o4
120 addx %g1,0,%o5
121 deccc %o2
122 bz .L_bn_mul_add_words_return
123 st %o4,[%o0]
124
125 ld [%o1+4],%g2
126 ld [%o0+4],%o4
127 umul %o3,%g2,%g2
128 rd %y,%g1
129 addcc %o4,%o5,%o4
130 addx %g1,0,%g1
131 addcc %o4,%g2,%o4
132 addx %g1,0,%o5
133 deccc %o2
134 bz .L_bn_mul_add_words_return
135 st %o4,[%o0+4]
136
137 ld [%o1+8],%g2
138 ld [%o0+8],%o4
139 umul %o3,%g2,%g2
140 rd %y,%g1
141 addcc %o4,%o5,%o4
142 addx %g1,0,%g1
143 addcc %o4,%g2,%o4
144 st %o4,[%o0+8]
145 retl
146 addx %g1,0,%o0
147
148.type bn_mul_add_words,#function
149.size bn_mul_add_words,(.-bn_mul_add_words)
150
151.align 32
152
153.global bn_mul_words
154/*
155 * BN_ULONG bn_mul_words(rp,ap,num,w)
156 * BN_ULONG *rp,*ap;
157 * int num;
158 * BN_ULONG w;
159 */
160bn_mul_words:
161 cmp %o2,0
162 bg,a .L_bn_mul_words_proceeed
163 ld [%o1],%g2
164 retl
165 clr %o0
166
167.L_bn_mul_words_proceeed:
168 andcc %o2,-4,%g0
169 bz .L_bn_mul_words_tail
170 clr %o5
171
172.L_bn_mul_words_loop:
173 ld [%o1+4],%g3
174 umul %o3,%g2,%g2
175 addcc %g2,%o5,%g2
176 rd %y,%g1
177 addx %g1,0,%o5
178 st %g2,[%o0]
179
180 ld [%o1+8],%g2
181 umul %o3,%g3,%g3
182 addcc %g3,%o5,%g3
183 rd %y,%g1
184 dec 4,%o2
185 addx %g1,0,%o5
186 st %g3,[%o0+4]
187
188 ld [%o1+12],%g3
189 umul %o3,%g2,%g2
190 addcc %g2,%o5,%g2
191 rd %y,%g1
192 inc 16,%o1
193 st %g2,[%o0+8]
194 addx %g1,0,%o5
195
196 umul %o3,%g3,%g3
197 addcc %g3,%o5,%g3
198 rd %y,%g1
199 inc 16,%o0
200 addx %g1,0,%o5
201 st %g3,[%o0-4]
202 andcc %o2,-4,%g0
203 nop
204 bnz,a .L_bn_mul_words_loop
205 ld [%o1],%g2
206
207 tst %o2
208 bnz,a .L_bn_mul_words_tail
209 ld [%o1],%g2
210.L_bn_mul_words_return:
211 retl
212 mov %o5,%o0
213 nop
214
215.L_bn_mul_words_tail:
216 umul %o3,%g2,%g2
217 addcc %g2,%o5,%g2
218 rd %y,%g1
219 addx %g1,0,%o5
220 deccc %o2
221 bz .L_bn_mul_words_return
222 st %g2,[%o0]
223 nop
224
225 ld [%o1+4],%g2
226 umul %o3,%g2,%g2
227 addcc %g2,%o5,%g2
228 rd %y,%g1
229 addx %g1,0,%o5
230 deccc %o2
231 bz .L_bn_mul_words_return
232 st %g2,[%o0+4]
233
234 ld [%o1+8],%g2
235 umul %o3,%g2,%g2
236 addcc %g2,%o5,%g2
237 rd %y,%g1
238 st %g2,[%o0+8]
239 retl
240 addx %g1,0,%o0
241
242.type bn_mul_words,#function
243.size bn_mul_words,(.-bn_mul_words)
244
245.align 32
246.global bn_sqr_words
247/*
248 * void bn_sqr_words(r,a,n)
249 * BN_ULONG *r,*a;
250 * int n;
251 */
252bn_sqr_words:
253 cmp %o2,0
254 bg,a .L_bn_sqr_words_proceeed
255 ld [%o1],%g2
256 retl
257 clr %o0
258
259.L_bn_sqr_words_proceeed:
260 andcc %o2,-4,%g0
261 bz .L_bn_sqr_words_tail
262 clr %o5
263
264.L_bn_sqr_words_loop:
265 ld [%o1+4],%g3
266 umul %g2,%g2,%o4
267 st %o4,[%o0]
268 rd %y,%o5
269 st %o5,[%o0+4]
270
271 ld [%o1+8],%g2
272 umul %g3,%g3,%o4
273 dec 4,%o2
274 st %o4,[%o0+8]
275 rd %y,%o5
276 st %o5,[%o0+12]
277 nop
278
279 ld [%o1+12],%g3
280 umul %g2,%g2,%o4
281 st %o4,[%o0+16]
282 rd %y,%o5
283 inc 16,%o1
284 st %o5,[%o0+20]
285
286 umul %g3,%g3,%o4
287 inc 32,%o0
288 st %o4,[%o0-8]
289 rd %y,%o5
290 st %o5,[%o0-4]
291 andcc %o2,-4,%g2
292 bnz,a .L_bn_sqr_words_loop
293 ld [%o1],%g2
294
295 tst %o2
296 nop
297 bnz,a .L_bn_sqr_words_tail
298 ld [%o1],%g2
299.L_bn_sqr_words_return:
300 retl
301 clr %o0
302
303.L_bn_sqr_words_tail:
304 umul %g2,%g2,%o4
305 st %o4,[%o0]
306 deccc %o2
307 rd %y,%o5
308 bz .L_bn_sqr_words_return
309 st %o5,[%o0+4]
310
311 ld [%o1+4],%g2
312 umul %g2,%g2,%o4
313 st %o4,[%o0+8]
314 deccc %o2
315 rd %y,%o5
316 nop
317 bz .L_bn_sqr_words_return
318 st %o5,[%o0+12]
319
320 ld [%o1+8],%g2
321 umul %g2,%g2,%o4
322 st %o4,[%o0+16]
323 rd %y,%o5
324 st %o5,[%o0+20]
325 retl
326 clr %o0
327
328.type bn_sqr_words,#function
329.size bn_sqr_words,(.-bn_sqr_words)
330
331.align 32
332
333.global bn_div_words
334/*
335 * BN_ULONG bn_div_words(h,l,d)
336 * BN_ULONG h,l,d;
337 */
338bn_div_words:
339 wr %o0,%y
340 udiv %o1,%o2,%o0
341 retl
342 nop
343
344.type bn_div_words,#function
345.size bn_div_words,(.-bn_div_words)
346
347.align 32
348
349.global bn_add_words
350/*
351 * BN_ULONG bn_add_words(rp,ap,bp,n)
352 * BN_ULONG *rp,*ap,*bp;
353 * int n;
354 */
355bn_add_words:
356 cmp %o3,0
357 bg,a .L_bn_add_words_proceed
358 ld [%o1],%o4
359 retl
360 clr %o0
361
362.L_bn_add_words_proceed:
363 andcc %o3,-4,%g0
364 bz .L_bn_add_words_tail
365 clr %g1
366 ba .L_bn_add_words_warn_loop
367 addcc %g0,0,%g0 ! clear carry flag
368
369.L_bn_add_words_loop:
370 ld [%o1],%o4
371.L_bn_add_words_warn_loop:
372 ld [%o2],%o5
373 ld [%o1+4],%g3
374 ld [%o2+4],%g4
375 dec 4,%o3
376 addxcc %o5,%o4,%o5
377 st %o5,[%o0]
378
379 ld [%o1+8],%o4
380 ld [%o2+8],%o5
381 inc 16,%o1
382 addxcc %g3,%g4,%g3
383 st %g3,[%o0+4]
384
385 ld [%o1-4],%g3
386 ld [%o2+12],%g4
387 inc 16,%o2
388 addxcc %o5,%o4,%o5
389 st %o5,[%o0+8]
390
391 inc 16,%o0
392 addxcc %g3,%g4,%g3
393 st %g3,[%o0-4]
394 addx %g0,0,%g1
395 andcc %o3,-4,%g0
396 bnz,a .L_bn_add_words_loop
397 addcc %g1,-1,%g0
398
399 tst %o3
400 bnz,a .L_bn_add_words_tail
401 ld [%o1],%o4
402.L_bn_add_words_return:
403 retl
404 mov %g1,%o0
405
406.L_bn_add_words_tail:
407 addcc %g1,-1,%g0
408 ld [%o2],%o5
409 addxcc %o5,%o4,%o5
410 addx %g0,0,%g1
411 deccc %o3
412 bz .L_bn_add_words_return
413 st %o5,[%o0]
414
415 ld [%o1+4],%o4
416 addcc %g1,-1,%g0
417 ld [%o2+4],%o5
418 addxcc %o5,%o4,%o5
419 addx %g0,0,%g1
420 deccc %o3
421 bz .L_bn_add_words_return
422 st %o5,[%o0+4]
423
424 ld [%o1+8],%o4
425 addcc %g1,-1,%g0
426 ld [%o2+8],%o5
427 addxcc %o5,%o4,%o5
428 st %o5,[%o0+8]
429 retl
430 addx %g0,0,%o0
431
432.type bn_add_words,#function
433.size bn_add_words,(.-bn_add_words)
434
435.align 32
436
437.global bn_sub_words
438/*
439 * BN_ULONG bn_sub_words(rp,ap,bp,n)
440 * BN_ULONG *rp,*ap,*bp;
441 * int n;
442 */
443bn_sub_words:
444 cmp %o3,0
445 bg,a .L_bn_sub_words_proceed
446 ld [%o1],%o4
447 retl
448 clr %o0
449
450.L_bn_sub_words_proceed:
451 andcc %o3,-4,%g0
452 bz .L_bn_sub_words_tail
453 clr %g1
454 ba .L_bn_sub_words_warm_loop
455 addcc %g0,0,%g0 ! clear carry flag
456
457.L_bn_sub_words_loop:
458 ld [%o1],%o4
459.L_bn_sub_words_warm_loop:
460 ld [%o2],%o5
461 ld [%o1+4],%g3
462 ld [%o2+4],%g4
463 dec 4,%o3
464 subxcc %o4,%o5,%o5
465 st %o5,[%o0]
466
467 ld [%o1+8],%o4
468 ld [%o2+8],%o5
469 inc 16,%o1
470 subxcc %g3,%g4,%g4
471 st %g4,[%o0+4]
472
473 ld [%o1-4],%g3
474 ld [%o2+12],%g4
475 inc 16,%o2
476 subxcc %o4,%o5,%o5
477 st %o5,[%o0+8]
478
479 inc 16,%o0
480 subxcc %g3,%g4,%g4
481 st %g4,[%o0-4]
482 addx %g0,0,%g1
483 andcc %o3,-4,%g0
484 bnz,a .L_bn_sub_words_loop
485 addcc %g1,-1,%g0
486
487 tst %o3
488 nop
489 bnz,a .L_bn_sub_words_tail
490 ld [%o1],%o4
491.L_bn_sub_words_return:
492 retl
493 mov %g1,%o0
494
495.L_bn_sub_words_tail:
496 addcc %g1,-1,%g0
497 ld [%o2],%o5
498 subxcc %o4,%o5,%o5
499 addx %g0,0,%g1
500 deccc %o3
501 bz .L_bn_sub_words_return
502 st %o5,[%o0]
503 nop
504
505 ld [%o1+4],%o4
506 addcc %g1,-1,%g0
507 ld [%o2+4],%o5
508 subxcc %o4,%o5,%o5
509 addx %g0,0,%g1
510 deccc %o3
511 bz .L_bn_sub_words_return
512 st %o5,[%o0+4]
513
514 ld [%o1+8],%o4
515 addcc %g1,-1,%g0
516 ld [%o2+8],%o5
517 subxcc %o4,%o5,%o5
518 st %o5,[%o0+8]
519 retl
520 addx %g0,0,%o0
521
522.type bn_sub_words,#function
523.size bn_sub_words,(.-bn_sub_words)
524
525#define FRAME_SIZE -96
526
527/*
528 * Here is register usage map for *all* routines below.
529 */
530#define t_1 %o0
531#define t_2 %o1
532#define c_1 %o2
533#define c_2 %o3
534#define c_3 %o4
535
536#define ap(I) [%i1+4*I]
537#define bp(I) [%i2+4*I]
538#define rp(I) [%i0+4*I]
539
540#define a_0 %l0
541#define a_1 %l1
542#define a_2 %l2
543#define a_3 %l3
544#define a_4 %l4
545#define a_5 %l5
546#define a_6 %l6
547#define a_7 %l7
548
549#define b_0 %i3
550#define b_1 %i4
551#define b_2 %i5
552#define b_3 %o5
553#define b_4 %g1
554#define b_5 %g2
555#define b_6 %g3
556#define b_7 %g4
557
558.align 32
559.global bn_mul_comba8
560/*
561 * void bn_mul_comba8(r,a,b)
562 * BN_ULONG *r,*a,*b;
563 */
564bn_mul_comba8:
565 save %sp,FRAME_SIZE,%sp
566 ld ap(0),a_0
567 ld bp(0),b_0
568 umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3);
569 ld bp(1),b_1
570 rd %y,c_2
571 st c_1,rp(0) !r[0]=c1;
572
573 umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1);
574 ld ap(1),a_1
575 addcc c_2,t_1,c_2
576 rd %y,t_2
577 addxcc %g0,t_2,c_3 !=
578 addx %g0,%g0,c_1
579 ld ap(2),a_2
580 umul a_1,b_0,t_1 !mul_add_c(a[1],b[0],c2,c3,c1);
581 addcc c_2,t_1,c_2 !=
582 rd %y,t_2
583 addxcc c_3,t_2,c_3
584 st c_2,rp(1) !r[1]=c2;
585 addx c_1,%g0,c_1 !=
586
587 umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
588 addcc c_3,t_1,c_3
589 rd %y,t_2
590 addxcc c_1,t_2,c_1 !=
591 addx %g0,%g0,c_2
592 ld bp(2),b_2
593 umul a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
594 addcc c_3,t_1,c_3 !=
595 rd %y,t_2
596 addxcc c_1,t_2,c_1
597 ld bp(3),b_3
598 addx c_2,%g0,c_2 !=
599 umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
600 addcc c_3,t_1,c_3
601 rd %y,t_2
602 addxcc c_1,t_2,c_1 !=
603 addx c_2,%g0,c_2
604 st c_3,rp(2) !r[2]=c3;
605
606 umul a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
607 addcc c_1,t_1,c_1 !=
608 rd %y,t_2
609 addxcc c_2,t_2,c_2
610 addx %g0,%g0,c_3
611 umul a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3);
612 addcc c_1,t_1,c_1
613 rd %y,t_2
614 addxcc c_2,t_2,c_2
615 addx c_3,%g0,c_3 !=
616 ld ap(3),a_3
617 umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
618 addcc c_1,t_1,c_1
619 rd %y,t_2 !=
620 addxcc c_2,t_2,c_2
621 addx c_3,%g0,c_3
622 ld ap(4),a_4
623 umul a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!=
624 addcc c_1,t_1,c_1
625 rd %y,t_2
626 addxcc c_2,t_2,c_2
627 addx c_3,%g0,c_3 !=
628 st c_1,rp(3) !r[3]=c1;
629
630 umul a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1);
631 addcc c_2,t_1,c_2
632 rd %y,t_2 !=
633 addxcc c_3,t_2,c_3
634 addx %g0,%g0,c_1
635 umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
636 addcc c_2,t_1,c_2 !=
637 rd %y,t_2
638 addxcc c_3,t_2,c_3
639 addx c_1,%g0,c_1
640 umul a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1);
641 addcc c_2,t_1,c_2
642 rd %y,t_2
643 addxcc c_3,t_2,c_3
644 addx c_1,%g0,c_1 !=
645 ld bp(4),b_4
646 umul a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
647 addcc c_2,t_1,c_2
648 rd %y,t_2 !=
649 addxcc c_3,t_2,c_3
650 addx c_1,%g0,c_1
651 ld bp(5),b_5
652 umul a_0,b_4,t_1 !=!mul_add_c(a[0],b[4],c2,c3,c1);
653 addcc c_2,t_1,c_2
654 rd %y,t_2
655 addxcc c_3,t_2,c_3
656 addx c_1,%g0,c_1 !=
657 st c_2,rp(4) !r[4]=c2;
658
659 umul a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2);
660 addcc c_3,t_1,c_3
661 rd %y,t_2 !=
662 addxcc c_1,t_2,c_1
663 addx %g0,%g0,c_2
664 umul a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2);
665 addcc c_3,t_1,c_3 !=
666 rd %y,t_2
667 addxcc c_1,t_2,c_1
668 addx c_2,%g0,c_2
669 umul a_2,b_3,t_1 !=!mul_add_c(a[2],b[3],c3,c1,c2);
670 addcc c_3,t_1,c_3
671 rd %y,t_2
672 addxcc c_1,t_2,c_1
673 addx c_2,%g0,c_2 !=
674 umul a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
675 addcc c_3,t_1,c_3
676 rd %y,t_2
677 addxcc c_1,t_2,c_1 !=
678 addx c_2,%g0,c_2
679 ld ap(5),a_5
680 umul a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2);
681 addcc c_3,t_1,c_3 !=
682 rd %y,t_2
683 addxcc c_1,t_2,c_1
684 ld ap(6),a_6
685 addx c_2,%g0,c_2 !=
686 umul a_5,b_0,t_1 !mul_add_c(a[5],b[0],c3,c1,c2);
687 addcc c_3,t_1,c_3
688 rd %y,t_2
689 addxcc c_1,t_2,c_1 !=
690 addx c_2,%g0,c_2
691 st c_3,rp(5) !r[5]=c3;
692
693 umul a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3);
694 addcc c_1,t_1,c_1 !=
695 rd %y,t_2
696 addxcc c_2,t_2,c_2
697 addx %g0,%g0,c_3
698 umul a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3);
699 addcc c_1,t_1,c_1
700 rd %y,t_2
701 addxcc c_2,t_2,c_2
702 addx c_3,%g0,c_3 !=
703 umul a_4,b_2,t_1 !mul_add_c(a[4],b[2],c1,c2,c3);
704 addcc c_1,t_1,c_1
705 rd %y,t_2
706 addxcc c_2,t_2,c_2 !=
707 addx c_3,%g0,c_3
708 umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
709 addcc c_1,t_1,c_1
710 rd %y,t_2 !=
711 addxcc c_2,t_2,c_2
712 addx c_3,%g0,c_3
713 umul a_2,b_4,t_1 !mul_add_c(a[2],b[4],c1,c2,c3);
714 addcc c_1,t_1,c_1 !=
715 rd %y,t_2
716 addxcc c_2,t_2,c_2
717 ld bp(6),b_6
718 addx c_3,%g0,c_3 !=
719 umul a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3);
720 addcc c_1,t_1,c_1
721 rd %y,t_2
722 addxcc c_2,t_2,c_2 !=
723 addx c_3,%g0,c_3
724 ld bp(7),b_7
725 umul a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3);
726 addcc c_1,t_1,c_1 !=
727 rd %y,t_2
728 addxcc c_2,t_2,c_2
729 st c_1,rp(6) !r[6]=c1;
730 addx c_3,%g0,c_3 !=
731
732 umul a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1);
733 addcc c_2,t_1,c_2
734 rd %y,t_2
735 addxcc c_3,t_2,c_3 !=
736 addx %g0,%g0,c_1
737 umul a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1);
738 addcc c_2,t_1,c_2
739 rd %y,t_2 !=
740 addxcc c_3,t_2,c_3
741 addx c_1,%g0,c_1
742 umul a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1);
743 addcc c_2,t_1,c_2 !=
744 rd %y,t_2
745 addxcc c_3,t_2,c_3
746 addx c_1,%g0,c_1
747 umul a_3,b_4,t_1 !=!mul_add_c(a[3],b[4],c2,c3,c1);
748 addcc c_2,t_1,c_2
749 rd %y,t_2
750 addxcc c_3,t_2,c_3
751 addx c_1,%g0,c_1 !=
752 umul a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1);
753 addcc c_2,t_1,c_2
754 rd %y,t_2
755 addxcc c_3,t_2,c_3 !=
756 addx c_1,%g0,c_1
757 umul a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1);
758 addcc c_2,t_1,c_2
759 rd %y,t_2 !=
760 addxcc c_3,t_2,c_3
761 addx c_1,%g0,c_1
762 ld ap(7),a_7
763 umul a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1);
764 addcc c_2,t_1,c_2
765 rd %y,t_2
766 addxcc c_3,t_2,c_3
767 addx c_1,%g0,c_1 !=
768 umul a_7,b_0,t_1 !mul_add_c(a[7],b[0],c2,c3,c1);
769 addcc c_2,t_1,c_2
770 rd %y,t_2
771 addxcc c_3,t_2,c_3 !=
772 addx c_1,%g0,c_1
773 st c_2,rp(7) !r[7]=c2;
774
775 umul a_7,b_1,t_1 !mul_add_c(a[7],b[1],c3,c1,c2);
776 addcc c_3,t_1,c_3 !=
777 rd %y,t_2
778 addxcc c_1,t_2,c_1
779 addx %g0,%g0,c_2
780 umul a_6,b_2,t_1 !=!mul_add_c(a[6],b[2],c3,c1,c2);
781 addcc c_3,t_1,c_3
782 rd %y,t_2
783 addxcc c_1,t_2,c_1
784 addx c_2,%g0,c_2 !=
785 umul a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2);
786 addcc c_3,t_1,c_3
787 rd %y,t_2
788 addxcc c_1,t_2,c_1 !=
789 addx c_2,%g0,c_2
790 umul a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2);
791 addcc c_3,t_1,c_3
792 rd %y,t_2 !=
793 addxcc c_1,t_2,c_1
794 addx c_2,%g0,c_2
795 umul a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2);
796 addcc c_3,t_1,c_3 !=
797 rd %y,t_2
798 addxcc c_1,t_2,c_1
799 addx c_2,%g0,c_2
800 umul a_2,b_6,t_1 !=!mul_add_c(a[2],b[6],c3,c1,c2);
801 addcc c_3,t_1,c_3
802 rd %y,t_2
803 addxcc c_1,t_2,c_1
804 addx c_2,%g0,c_2 !=
805 umul a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2);
806 addcc c_3,t_1,c_3
807 rd %y,t_2
808 addxcc c_1,t_2,c_1 !
809 addx c_2,%g0,c_2
810 st c_3,rp(8) !r[8]=c3;
811
812 umul a_2,b_7,t_1 !mul_add_c(a[2],b[7],c1,c2,c3);
813 addcc c_1,t_1,c_1 !=
814 rd %y,t_2
815 addxcc c_2,t_2,c_2
816 addx %g0,%g0,c_3
817 umul a_3,b_6,t_1 !=!mul_add_c(a[3],b[6],c1,c2,c3);
818 addcc c_1,t_1,c_1
819 rd %y,t_2
820 addxcc c_2,t_2,c_2
821 addx c_3,%g0,c_3 !=
822 umul a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3);
823 addcc c_1,t_1,c_1
824 rd %y,t_2
825 addxcc c_2,t_2,c_2 !=
826 addx c_3,%g0,c_3
827 umul a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3);
828 addcc c_1,t_1,c_1
829 rd %y,t_2 !=
830 addxcc c_2,t_2,c_2
831 addx c_3,%g0,c_3
832 umul a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3);
833 addcc c_1,t_1,c_1 !=
834 rd %y,t_2
835 addxcc c_2,t_2,c_2
836 addx c_3,%g0,c_3
837 umul a_7,b_2,t_1 !=!mul_add_c(a[7],b[2],c1,c2,c3);
838 addcc c_1,t_1,c_1
839 rd %y,t_2
840 addxcc c_2,t_2,c_2
841 addx c_3,%g0,c_3 !=
842 st c_1,rp(9) !r[9]=c1;
843
844 umul a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1);
845 addcc c_2,t_1,c_2
846 rd %y,t_2 !=
847 addxcc c_3,t_2,c_3
848 addx %g0,%g0,c_1
849 umul a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1);
850 addcc c_2,t_1,c_2 !=
851 rd %y,t_2
852 addxcc c_3,t_2,c_3
853 addx c_1,%g0,c_1
854 umul a_5,b_5,t_1 !=!mul_add_c(a[5],b[5],c2,c3,c1);
855 addcc c_2,t_1,c_2
856 rd %y,t_2
857 addxcc c_3,t_2,c_3
858 addx c_1,%g0,c_1 !=
859 umul a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1);
860 addcc c_2,t_1,c_2
861 rd %y,t_2
862 addxcc c_3,t_2,c_3 !=
863 addx c_1,%g0,c_1
864 umul a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1);
865 addcc c_2,t_1,c_2
866 rd %y,t_2 !=
867 addxcc c_3,t_2,c_3
868 addx c_1,%g0,c_1
869 st c_2,rp(10) !r[10]=c2;
870
871 umul a_4,b_7,t_1 !=!mul_add_c(a[4],b[7],c3,c1,c2);
872 addcc c_3,t_1,c_3
873 rd %y,t_2
874 addxcc c_1,t_2,c_1
875 addx %g0,%g0,c_2 !=
876 umul a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2);
877 addcc c_3,t_1,c_3
878 rd %y,t_2
879 addxcc c_1,t_2,c_1 !=
880 addx c_2,%g0,c_2
881 umul a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2);
882 addcc c_3,t_1,c_3
883 rd %y,t_2 !=
884 addxcc c_1,t_2,c_1
885 addx c_2,%g0,c_2
886 umul a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2);
887 addcc c_3,t_1,c_3 !=
888 rd %y,t_2
889 addxcc c_1,t_2,c_1
890 st c_3,rp(11) !r[11]=c3;
891 addx c_2,%g0,c_2 !=
892
893 umul a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3);
894 addcc c_1,t_1,c_1
895 rd %y,t_2
896 addxcc c_2,t_2,c_2 !=
897 addx %g0,%g0,c_3
898 umul a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3);
899 addcc c_1,t_1,c_1
900 rd %y,t_2 !=
901 addxcc c_2,t_2,c_2
902 addx c_3,%g0,c_3
903 umul a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3);
904 addcc c_1,t_1,c_1 !=
905 rd %y,t_2
906 addxcc c_2,t_2,c_2
907 st c_1,rp(12) !r[12]=c1;
908 addx c_3,%g0,c_3 !=
909
910 umul a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1);
911 addcc c_2,t_1,c_2
912 rd %y,t_2
913 addxcc c_3,t_2,c_3 !=
914 addx %g0,%g0,c_1
915 umul a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1);
916 addcc c_2,t_1,c_2
917 rd %y,t_2 !=
918 addxcc c_3,t_2,c_3
919 addx c_1,%g0,c_1
920 st c_2,rp(13) !r[13]=c2;
921
922 umul a_7,b_7,t_1 !=!mul_add_c(a[7],b[7],c3,c1,c2);
923 addcc c_3,t_1,c_3
924 rd %y,t_2
925 addxcc c_1,t_2,c_1
926 nop !=
927 st c_3,rp(14) !r[14]=c3;
928 st c_1,rp(15) !r[15]=c1;
929
930 ret
931 restore %g0,%g0,%o0
932
933.type bn_mul_comba8,#function
934.size bn_mul_comba8,(.-bn_mul_comba8)
935
936.align 32
937
938.global bn_mul_comba4
939/*
940 * void bn_mul_comba4(r,a,b)
941 * BN_ULONG *r,*a,*b;
942 */
943bn_mul_comba4:
944 save %sp,FRAME_SIZE,%sp
945 ld ap(0),a_0
946 ld bp(0),b_0
947 umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3);
948 ld bp(1),b_1
949 rd %y,c_2
950 st c_1,rp(0) !r[0]=c1;
951
952 umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1);
953 ld ap(1),a_1
954 addcc c_2,t_1,c_2
955 rd %y,t_2 !=
956 addxcc %g0,t_2,c_3
957 addx %g0,%g0,c_1
958 ld ap(2),a_2
959 umul a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
960 addcc c_2,t_1,c_2
961 rd %y,t_2
962 addxcc c_3,t_2,c_3
963 addx c_1,%g0,c_1 !=
964 st c_2,rp(1) !r[1]=c2;
965
966 umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
967 addcc c_3,t_1,c_3
968 rd %y,t_2 !=
969 addxcc c_1,t_2,c_1
970 addx %g0,%g0,c_2
971 ld bp(2),b_2
972 umul a_1,b_1,t_1 !=!mul_add_c(a[1],b[1],c3,c1,c2);
973 addcc c_3,t_1,c_3
974 rd %y,t_2
975 addxcc c_1,t_2,c_1
976 addx c_2,%g0,c_2 !=
977 ld bp(3),b_3
978 umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
979 addcc c_3,t_1,c_3
980 rd %y,t_2 !=
981 addxcc c_1,t_2,c_1
982 addx c_2,%g0,c_2
983 st c_3,rp(2) !r[2]=c3;
984
985 umul a_0,b_3,t_1 !=!mul_add_c(a[0],b[3],c1,c2,c3);
986 addcc c_1,t_1,c_1
987 rd %y,t_2
988 addxcc c_2,t_2,c_2
989 addx %g0,%g0,c_3 !=
990 umul a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3);
991 addcc c_1,t_1,c_1
992 rd %y,t_2
993 addxcc c_2,t_2,c_2 !=
994 addx c_3,%g0,c_3
995 ld ap(3),a_3
996 umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
997 addcc c_1,t_1,c_1 !=
998 rd %y,t_2
999 addxcc c_2,t_2,c_2
1000 addx c_3,%g0,c_3
1001 umul a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);
1002 addcc c_1,t_1,c_1
1003 rd %y,t_2
1004 addxcc c_2,t_2,c_2
1005 addx c_3,%g0,c_3 !=
1006 st c_1,rp(3) !r[3]=c1;
1007
1008 umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
1009 addcc c_2,t_1,c_2
1010 rd %y,t_2 !=
1011 addxcc c_3,t_2,c_3
1012 addx %g0,%g0,c_1
1013 umul a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1);
1014 addcc c_2,t_1,c_2 !=
1015 rd %y,t_2
1016 addxcc c_3,t_2,c_3
1017 addx c_1,%g0,c_1
1018 umul a_1,b_3,t_1 !=!mul_add_c(a[1],b[3],c2,c3,c1);
1019 addcc c_2,t_1,c_2
1020 rd %y,t_2
1021 addxcc c_3,t_2,c_3
1022 addx c_1,%g0,c_1 !=
1023 st c_2,rp(4) !r[4]=c2;
1024
1025 umul a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
1026 addcc c_3,t_1,c_3
1027 rd %y,t_2 !=
1028 addxcc c_1,t_2,c_1
1029 addx %g0,%g0,c_2
1030 umul a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
1031 addcc c_3,t_1,c_3 !=
1032 rd %y,t_2
1033 addxcc c_1,t_2,c_1
1034 st c_3,rp(5) !r[5]=c3;
1035 addx c_2,%g0,c_2 !=
1036
1037 umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
1038 addcc c_1,t_1,c_1
1039 rd %y,t_2
1040 addxcc c_2,t_2,c_2 !=
1041 st c_1,rp(6) !r[6]=c1;
1042 st c_2,rp(7) !r[7]=c2;
1043
1044 ret
1045 restore %g0,%g0,%o0
1046
1047.type bn_mul_comba4,#function
1048.size bn_mul_comba4,(.-bn_mul_comba4)
1049
1050.align 32
1051
1052.global bn_sqr_comba8
1053bn_sqr_comba8:
1054 save %sp,FRAME_SIZE,%sp
1055 ld ap(0),a_0
1056 ld ap(1),a_1
1057 umul a_0,a_0,c_1 !=!sqr_add_c(a,0,c1,c2,c3);
1058 rd %y,c_2
1059 st c_1,rp(0) !r[0]=c1;
1060
1061 ld ap(2),a_2
1062 umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
1063 addcc c_2,t_1,c_2
1064 rd %y,t_2
1065 addxcc %g0,t_2,c_3
1066 addx %g0,%g0,c_1 !=
1067 addcc c_2,t_1,c_2
1068 addxcc c_3,t_2,c_3
1069 st c_2,rp(1) !r[1]=c2;
1070 addx c_1,%g0,c_1 !=
1071
1072 umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1073 addcc c_3,t_1,c_3
1074 rd %y,t_2
1075 addxcc c_1,t_2,c_1 !=
1076 addx %g0,%g0,c_2
1077 addcc c_3,t_1,c_3
1078 addxcc c_1,t_2,c_1
1079 addx c_2,%g0,c_2 !=
1080 ld ap(3),a_3
1081 umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1082 addcc c_3,t_1,c_3
1083 rd %y,t_2 !=
1084 addxcc c_1,t_2,c_1
1085 addx c_2,%g0,c_2
1086 st c_3,rp(2) !r[2]=c3;
1087
1088 umul a_0,a_3,t_1 !=!sqr_add_c2(a,3,0,c1,c2,c3);
1089 addcc c_1,t_1,c_1
1090 rd %y,t_2
1091 addxcc c_2,t_2,c_2
1092 addx %g0,%g0,c_3 !=
1093 addcc c_1,t_1,c_1
1094 addxcc c_2,t_2,c_2
1095 ld ap(4),a_4
1096 addx c_3,%g0,c_3 !=
1097 umul a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1098 addcc c_1,t_1,c_1
1099 rd %y,t_2
1100 addxcc c_2,t_2,c_2 !=
1101 addx c_3,%g0,c_3
1102 addcc c_1,t_1,c_1
1103 addxcc c_2,t_2,c_2
1104 addx c_3,%g0,c_3 !=
1105 st c_1,rp(3) !r[3]=c1;
1106
1107 umul a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1);
1108 addcc c_2,t_1,c_2
1109 rd %y,t_2 !=
1110 addxcc c_3,t_2,c_3
1111 addx %g0,%g0,c_1
1112 addcc c_2,t_1,c_2
1113 addxcc c_3,t_2,c_3 !=
1114 addx c_1,%g0,c_1
1115 umul a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1116 addcc c_2,t_1,c_2
1117 rd %y,t_2 !=
1118 addxcc c_3,t_2,c_3
1119 addx c_1,%g0,c_1
1120 addcc c_2,t_1,c_2
1121 addxcc c_3,t_2,c_3 !=
1122 addx c_1,%g0,c_1
1123 ld ap(5),a_5
1124 umul a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1125 addcc c_2,t_1,c_2 !=
1126 rd %y,t_2
1127 addxcc c_3,t_2,c_3
1128 st c_2,rp(4) !r[4]=c2;
1129 addx c_1,%g0,c_1 !=
1130
1131 umul a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2);
1132 addcc c_3,t_1,c_3
1133 rd %y,t_2
1134 addxcc c_1,t_2,c_1 !=
1135 addx %g0,%g0,c_2
1136 addcc c_3,t_1,c_3
1137 addxcc c_1,t_2,c_1
1138 addx c_2,%g0,c_2 !=
1139 umul a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2);
1140 addcc c_3,t_1,c_3
1141 rd %y,t_2
1142 addxcc c_1,t_2,c_1 !=
1143 addx c_2,%g0,c_2
1144 addcc c_3,t_1,c_3
1145 addxcc c_1,t_2,c_1
1146 addx c_2,%g0,c_2 !=
1147 ld ap(6),a_6
1148 umul a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
1149 addcc c_3,t_1,c_3
1150 rd %y,t_2 !=
1151 addxcc c_1,t_2,c_1
1152 addx c_2,%g0,c_2
1153 addcc c_3,t_1,c_3
1154 addxcc c_1,t_2,c_1 !=
1155 addx c_2,%g0,c_2
1156 st c_3,rp(5) !r[5]=c3;
1157
1158 umul a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3);
1159 addcc c_1,t_1,c_1 !=
1160 rd %y,t_2
1161 addxcc c_2,t_2,c_2
1162 addx %g0,%g0,c_3
1163 addcc c_1,t_1,c_1 !=
1164 addxcc c_2,t_2,c_2
1165 addx c_3,%g0,c_3
1166 umul a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3);
1167 addcc c_1,t_1,c_1 !=
1168 rd %y,t_2
1169 addxcc c_2,t_2,c_2
1170 addx c_3,%g0,c_3
1171 addcc c_1,t_1,c_1 !=
1172 addxcc c_2,t_2,c_2
1173 addx c_3,%g0,c_3
1174 umul a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3);
1175 addcc c_1,t_1,c_1 !=
1176 rd %y,t_2
1177 addxcc c_2,t_2,c_2
1178 addx c_3,%g0,c_3
1179 addcc c_1,t_1,c_1 !=
1180 addxcc c_2,t_2,c_2
1181 addx c_3,%g0,c_3
1182 ld ap(7),a_7
1183 umul a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3);
1184 addcc c_1,t_1,c_1
1185 rd %y,t_2
1186 addxcc c_2,t_2,c_2
1187 addx c_3,%g0,c_3 !=
1188 st c_1,rp(6) !r[6]=c1;
1189
1190 umul a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1);
1191 addcc c_2,t_1,c_2
1192 rd %y,t_2 !=
1193 addxcc c_3,t_2,c_3
1194 addx %g0,%g0,c_1
1195 addcc c_2,t_1,c_2
1196 addxcc c_3,t_2,c_3 !=
1197 addx c_1,%g0,c_1
1198 umul a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1);
1199 addcc c_2,t_1,c_2
1200 rd %y,t_2 !=
1201 addxcc c_3,t_2,c_3
1202 addx c_1,%g0,c_1
1203 addcc c_2,t_1,c_2
1204 addxcc c_3,t_2,c_3 !=
1205 addx c_1,%g0,c_1
1206 umul a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1);
1207 addcc c_2,t_1,c_2
1208 rd %y,t_2 !=
1209 addxcc c_3,t_2,c_3
1210 addx c_1,%g0,c_1
1211 addcc c_2,t_1,c_2
1212 addxcc c_3,t_2,c_3 !=
1213 addx c_1,%g0,c_1
1214 umul a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1);
1215 addcc c_2,t_1,c_2
1216 rd %y,t_2 !=
1217 addxcc c_3,t_2,c_3
1218 addx c_1,%g0,c_1
1219 addcc c_2,t_1,c_2
1220 addxcc c_3,t_2,c_3 !=
1221 addx c_1,%g0,c_1
1222 st c_2,rp(7) !r[7]=c2;
1223
1224 umul a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2);
1225 addcc c_3,t_1,c_3 !=
1226 rd %y,t_2
1227 addxcc c_1,t_2,c_1
1228 addx %g0,%g0,c_2
1229 addcc c_3,t_1,c_3 !=
1230 addxcc c_1,t_2,c_1
1231 addx c_2,%g0,c_2
1232 umul a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2);
1233 addcc c_3,t_1,c_3 !=
1234 rd %y,t_2
1235 addxcc c_1,t_2,c_1
1236 addx c_2,%g0,c_2
1237 addcc c_3,t_1,c_3 !=
1238 addxcc c_1,t_2,c_1
1239 addx c_2,%g0,c_2
1240 umul a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2);
1241 addcc c_3,t_1,c_3 !=
1242 rd %y,t_2
1243 addxcc c_1,t_2,c_1
1244 addx c_2,%g0,c_2
1245 addcc c_3,t_1,c_3 !=
1246 addxcc c_1,t_2,c_1
1247 addx c_2,%g0,c_2
1248 umul a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2);
1249 addcc c_3,t_1,c_3 !=
1250 rd %y,t_2
1251 addxcc c_1,t_2,c_1
1252 st c_3,rp(8) !r[8]=c3;
1253 addx c_2,%g0,c_2 !=
1254
1255 umul a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3);
1256 addcc c_1,t_1,c_1
1257 rd %y,t_2
1258 addxcc c_2,t_2,c_2 !=
1259 addx %g0,%g0,c_3
1260 addcc c_1,t_1,c_1
1261 addxcc c_2,t_2,c_2
1262 addx c_3,%g0,c_3 !=
1263 umul a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3);
1264 addcc c_1,t_1,c_1
1265 rd %y,t_2
1266 addxcc c_2,t_2,c_2 !=
1267 addx c_3,%g0,c_3
1268 addcc c_1,t_1,c_1
1269 addxcc c_2,t_2,c_2
1270 addx c_3,%g0,c_3 !=
1271 umul a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3);
1272 addcc c_1,t_1,c_1
1273 rd %y,t_2
1274 addxcc c_2,t_2,c_2 !=
1275 addx c_3,%g0,c_3
1276 addcc c_1,t_1,c_1
1277 addxcc c_2,t_2,c_2
1278 addx c_3,%g0,c_3 !=
1279 st c_1,rp(9) !r[9]=c1;
1280
1281 umul a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1);
1282 addcc c_2,t_1,c_2
1283 rd %y,t_2 !=
1284 addxcc c_3,t_2,c_3
1285 addx %g0,%g0,c_1
1286 addcc c_2,t_1,c_2
1287 addxcc c_3,t_2,c_3 !=
1288 addx c_1,%g0,c_1
1289 umul a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1);
1290 addcc c_2,t_1,c_2
1291 rd %y,t_2 !=
1292 addxcc c_3,t_2,c_3
1293 addx c_1,%g0,c_1
1294 addcc c_2,t_1,c_2
1295 addxcc c_3,t_2,c_3 !=
1296 addx c_1,%g0,c_1
1297 umul a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1);
1298 addcc c_2,t_1,c_2
1299 rd %y,t_2 !=
1300 addxcc c_3,t_2,c_3
1301 addx c_1,%g0,c_1
1302 st c_2,rp(10) !r[10]=c2;
1303
1304 umul a_4,a_7,t_1 !=!sqr_add_c2(a,7,4,c3,c1,c2);
1305 addcc c_3,t_1,c_3
1306 rd %y,t_2
1307 addxcc c_1,t_2,c_1
1308 addx %g0,%g0,c_2 !=
1309 addcc c_3,t_1,c_3
1310 addxcc c_1,t_2,c_1
1311 addx c_2,%g0,c_2
1312 umul a_5,a_6,t_1 !=!sqr_add_c2(a,6,5,c3,c1,c2);
1313 addcc c_3,t_1,c_3
1314 rd %y,t_2
1315 addxcc c_1,t_2,c_1
1316 addx c_2,%g0,c_2 !=
1317 addcc c_3,t_1,c_3
1318 addxcc c_1,t_2,c_1
1319 st c_3,rp(11) !r[11]=c3;
1320 addx c_2,%g0,c_2 !=
1321
1322 umul a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3);
1323 addcc c_1,t_1,c_1
1324 rd %y,t_2
1325 addxcc c_2,t_2,c_2 !=
1326 addx %g0,%g0,c_3
1327 addcc c_1,t_1,c_1
1328 addxcc c_2,t_2,c_2
1329 addx c_3,%g0,c_3 !=
1330 umul a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3);
1331 addcc c_1,t_1,c_1
1332 rd %y,t_2
1333 addxcc c_2,t_2,c_2 !=
1334 addx c_3,%g0,c_3
1335 st c_1,rp(12) !r[12]=c1;
1336
1337 umul a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1);
1338 addcc c_2,t_1,c_2 !=
1339 rd %y,t_2
1340 addxcc c_3,t_2,c_3
1341 addx %g0,%g0,c_1
1342 addcc c_2,t_1,c_2 !=
1343 addxcc c_3,t_2,c_3
1344 st c_2,rp(13) !r[13]=c2;
1345 addx c_1,%g0,c_1 !=
1346
1347 umul a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2);
1348 addcc c_3,t_1,c_3
1349 rd %y,t_2
1350 addxcc c_1,t_2,c_1 !=
1351 st c_3,rp(14) !r[14]=c3;
1352 st c_1,rp(15) !r[15]=c1;
1353
1354 ret
1355 restore %g0,%g0,%o0
1356
1357.type bn_sqr_comba8,#function
1358.size bn_sqr_comba8,(.-bn_sqr_comba8)
1359
1360.align 32
1361
1362.global bn_sqr_comba4
1363/*
1364 * void bn_sqr_comba4(r,a)
1365 * BN_ULONG *r,*a;
1366 */
1367bn_sqr_comba4:
1368 save %sp,FRAME_SIZE,%sp
1369 ld ap(0),a_0
1370 umul a_0,a_0,c_1 !sqr_add_c(a,0,c1,c2,c3);
1371 ld ap(1),a_1 !=
1372 rd %y,c_2
1373 st c_1,rp(0) !r[0]=c1;
1374
1375 ld ap(2),a_2
1376 umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
1377 addcc c_2,t_1,c_2
1378 rd %y,t_2
1379 addxcc %g0,t_2,c_3
1380 addx %g0,%g0,c_1 !=
1381 addcc c_2,t_1,c_2
1382 addxcc c_3,t_2,c_3
1383 addx c_1,%g0,c_1 !=
1384 st c_2,rp(1) !r[1]=c2;
1385
1386 umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1387 addcc c_3,t_1,c_3
1388 rd %y,t_2 !=
1389 addxcc c_1,t_2,c_1
1390 addx %g0,%g0,c_2
1391 addcc c_3,t_1,c_3
1392 addxcc c_1,t_2,c_1 !=
1393 addx c_2,%g0,c_2
1394 ld ap(3),a_3
1395 umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1396 addcc c_3,t_1,c_3 !=
1397 rd %y,t_2
1398 addxcc c_1,t_2,c_1
1399 st c_3,rp(2) !r[2]=c3;
1400 addx c_2,%g0,c_2 !=
1401
1402 umul a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
1403 addcc c_1,t_1,c_1
1404 rd %y,t_2
1405 addxcc c_2,t_2,c_2 !=
1406 addx %g0,%g0,c_3
1407 addcc c_1,t_1,c_1
1408 addxcc c_2,t_2,c_2
1409 addx c_3,%g0,c_3 !=
1410 umul a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1411 addcc c_1,t_1,c_1
1412 rd %y,t_2
1413 addxcc c_2,t_2,c_2 !=
1414 addx c_3,%g0,c_3
1415 addcc c_1,t_1,c_1
1416 addxcc c_2,t_2,c_2
1417 addx c_3,%g0,c_3 !=
1418 st c_1,rp(3) !r[3]=c1;
1419
1420 umul a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1421 addcc c_2,t_1,c_2
1422 rd %y,t_2 !=
1423 addxcc c_3,t_2,c_3
1424 addx %g0,%g0,c_1
1425 addcc c_2,t_1,c_2
1426 addxcc c_3,t_2,c_3 !=
1427 addx c_1,%g0,c_1
1428 umul a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1429 addcc c_2,t_1,c_2
1430 rd %y,t_2 !=
1431 addxcc c_3,t_2,c_3
1432 addx c_1,%g0,c_1
1433 st c_2,rp(4) !r[4]=c2;
1434
1435 umul a_2,a_3,t_1 !=!sqr_add_c2(a,3,2,c3,c1,c2);
1436 addcc c_3,t_1,c_3
1437 rd %y,t_2
1438 addxcc c_1,t_2,c_1
1439 addx %g0,%g0,c_2 !=
1440 addcc c_3,t_1,c_3
1441 addxcc c_1,t_2,c_1
1442 st c_3,rp(5) !r[5]=c3;
1443 addx c_2,%g0,c_2 !=
1444
1445 umul a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3);
1446 addcc c_1,t_1,c_1
1447 rd %y,t_2
1448 addxcc c_2,t_2,c_2 !=
1449 st c_1,rp(6) !r[6]=c1;
1450 st c_2,rp(7) !r[7]=c2;
1451
1452 ret
1453 restore %g0,%g0,%o0
1454
1455.type bn_sqr_comba4,#function
1456.size bn_sqr_comba4,(.-bn_sqr_comba4)
1457
1458.align 32
diff --git a/src/lib/libcrypto/bn/asm/sparcv8plus.S b/src/lib/libcrypto/bn/asm/sparcv8plus.S
deleted file mode 100644
index 63de1860f2..0000000000
--- a/src/lib/libcrypto/bn/asm/sparcv8plus.S
+++ /dev/null
@@ -1,1558 +0,0 @@
1.ident "sparcv8plus.s, Version 1.4"
2.ident "SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
3
4/*
5 * ====================================================================
6 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
7 * project.
8 *
9 * Rights for redistribution and usage in source and binary forms are
10 * granted according to the OpenSSL license. Warranty of any kind is
11 * disclaimed.
12 * ====================================================================
13 */
14
15/*
 16 * This is my modest contribution to the OpenSSL project (see
17 * http://www.openssl.org/ for more information about it) and is
18 * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c
19 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
20 *
21 * Questions-n-answers.
22 *
23 * Q. How to compile?
24 * A. With SC4.x/SC5.x:
25 *
26 * cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
27 *
28 * and with gcc:
29 *
30 * gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
31 *
32 * or if above fails (it does if you have gas installed):
33 *
34 * gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
35 *
36 * Quick-n-dirty way to fuse the module into the library.
37 * Provided that the library is already configured and built
38 * (in 0.9.2 case with no-asm option):
39 *
40 * # cd crypto/bn
41 * # cp /some/place/bn_asm.sparc.v8plus.S .
42 * # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
43 * # make
44 * # cd ../..
45 * # make; make test
46 *
47 * Quick-n-dirty way to get rid of it:
48 *
49 * # cd crypto/bn
50 * # touch bn_asm.c
51 * # make
52 * # cd ../..
53 * # make; make test
54 *
 55 * Q. V8plus architecture? What kind of beast is that?
56 * A. Well, it's rather a programming model than an architecture...
57 * It's actually v9-compliant, i.e. *any* UltraSPARC, CPU under
58 * special conditions, namely when kernel doesn't preserve upper
59 * 32 bits of otherwise 64-bit registers during a context switch.
60 *
61 * Q. Why just UltraSPARC? What about SuperSPARC?
62 * A. Original release did target UltraSPARC only. Now SuperSPARC
63 * version is provided along. Both version share bn_*comba[48]
64 * implementations (see comment later in code for explanation).
65 * But what's so special about this UltraSPARC implementation?
66 * Why didn't I let compiler do the job? Trouble is that most of
67 * available compilers (well, SC5.0 is the only exception) don't
68 * attempt to take advantage of UltraSPARC's 64-bitness under
69 * 32-bit kernels even though it's perfectly possible (see next
70 * question).
71 *
72 * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
73 * doesn't work?
 74 * A. You can't address *all* registers as 64-bit wide:-( The catch is
75 * that you actually may rely upon %o0-%o5 and %g1-%g4 being fully
76 * preserved if you're in a leaf function, i.e. such never calling
77 * any other functions. All functions in this module are leaf and
78 * 10 registers is a handful. And as a matter of fact none-"comba"
79 * routines don't require even that much and I could even afford to
80 * not allocate own stack frame for 'em:-)
81 *
82 * Q. What about 64-bit kernels?
83 * A. What about 'em? Just kidding:-) Pure 64-bit version is currently
84 * under evaluation and development...
85 *
86 * Q. What about shared libraries?
87 * A. What about 'em? Kidding again:-) Code does *not* contain any
88 * code position dependencies and it's safe to include it into
89 * shared library as is.
90 *
91 * Q. How much faster does it go?
92 * A. Do you have a good benchmark? In either case below is what I
93 * experience with crypto/bn/expspeed.c test program:
94 *
95 * v8plus module on U10/300MHz against bn_asm.c compiled with:
96 *
97 * cc-5.0 -xarch=v8plus -xO5 -xdepend +7-12%
98 * cc-4.2 -xarch=v8plus -xO5 -xdepend +25-35%
99 * egcs-1.1.2 -mcpu=ultrasparc -O3 +35-45%
100 *
101 * v8 module on SS10/60MHz against bn_asm.c compiled with:
102 *
103 * cc-5.0 -xarch=v8 -xO5 -xdepend +7-10%
104 * cc-4.2 -xarch=v8 -xO5 -xdepend +10%
105 * egcs-1.1.2 -mv8 -O3 +35-45%
106 *
107 * As you can see it's damn hard to beat the new Sun C compiler
108 * and it's in first place GNU C users who will appreciate this
109 * assembler implementation:-)
110 */
111
112/*
113 * Revision history.
114 *
115 * 1.0 - initial release;
116 * 1.1 - new loop unrolling model(*);
117 * - some more fine tuning;
118 * 1.2 - made gas friendly;
119 * - updates to documentation concerning v9;
120 * - new performance comparison matrix;
121 * 1.3 - fixed problem with /usr/ccs/lib/cpp;
122 * 1.4 - native V9 bn_*_comba[48] implementation (15% more efficient)
123 * resulting in slight overall performance kick;
124 * - some retunes;
125 * - support for GNU as added;
126 *
127 * (*) Originally unrolled loop looked like this:
128 * for (;;) {
129 * op(p+0); if (--n==0) break;
130 * op(p+1); if (--n==0) break;
131 * op(p+2); if (--n==0) break;
132 * op(p+3); if (--n==0) break;
133 * p+=4;
134 * }
135 * I unroll according to following:
136 * while (n&~3) {
137 * op(p+0); op(p+1); op(p+2); op(p+3);
138 * p+=4; n=-4;
139 * }
 140 * if (n) {
 141 * op(p+0); if (--n==0) return;
 142 * op(p+1); if (--n==0) return;
 143 * op(p+2); return;
 144 * }
145 */
146
147#if defined(__SUNPRO_C) && defined(__sparcv9)
148 /* They've said -xarch=v9 at command line */
149 .register %g2,#scratch
150 .register %g3,#scratch
151# define FRAME_SIZE -192
152#elif defined(__GNUC__) && defined(__arch64__)
153 /* They've said -m64 at command line */
154 .register %g2,#scratch
155 .register %g3,#scratch
156# define FRAME_SIZE -192
157#else
158# define FRAME_SIZE -96
159#endif
160/*
161 * GNU assembler can't stand stuw:-(
162 */
163#define stuw st
164
165.section ".text",#alloc,#execinstr
166.file "bn_asm.sparc.v8plus.S"
167
168.align 32
169
170.global bn_mul_add_words
171/*
172 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
173 * BN_ULONG *rp,*ap;
174 * int num;
175 * BN_ULONG w;
176 */
! For i in [0,num): rp[i] = low32(w*ap[i] + rp[i] + carry); returns the
! final 32-bit carry in %o0.  num is sign-extended first and num <= 0
! returns 0 without touching memory.  mulx forms the full 64-bit product
! (V9); "srlx %o4,32,%o5" peels the carry off each 64-bit partial sum.
! SPARC delay slots: the instruction after each branch/retl executes
! with it; the ",a" (annul) forms skip that slot when not taken, so the
! "lduw" after "brgz,a"/"bnz,a" is the first load of the taken path.
177bn_mul_add_words:
178 sra %o2,%g0,%o2 ! signx %o2
179 brgz,a %o2,.L_bn_mul_add_words_proceed
180 lduw [%o1],%g2
181 retl
182 clr %o0
183 nop
184 nop
185 nop
186
187.L_bn_mul_add_words_proceed:
188 srl %o3,%g0,%o3 ! clruw %o3
189 andcc %o2,-4,%g0
190 bz,pn %icc,.L_bn_mul_add_words_tail
191 clr %o5
192
! Main loop: four result words per iteration; rp/ap advance by 16
! bytes and the running carry stays in %o5 between iterations.
193.L_bn_mul_add_words_loop: ! wow! 32 aligned!
194 lduw [%o0],%g1
195 lduw [%o1+4],%g3
196 mulx %o3,%g2,%g2
197 add %g1,%o5,%o4
198 nop
199 add %o4,%g2,%o4
200 stuw %o4,[%o0]
201 srlx %o4,32,%o5
202
203 lduw [%o0+4],%g1
204 lduw [%o1+8],%g2
205 mulx %o3,%g3,%g3
206 add %g1,%o5,%o4
207 dec 4,%o2
208 add %o4,%g3,%o4
209 stuw %o4,[%o0+4]
210 srlx %o4,32,%o5
211
212 lduw [%o0+8],%g1
213 lduw [%o1+12],%g3
214 mulx %o3,%g2,%g2
215 add %g1,%o5,%o4
216 inc 16,%o1
217 add %o4,%g2,%o4
218 stuw %o4,[%o0+8]
219 srlx %o4,32,%o5
220
221 lduw [%o0+12],%g1
222 mulx %o3,%g3,%g3
223 add %g1,%o5,%o4
224 inc 16,%o0
225 add %o4,%g3,%o4
226 andcc %o2,-4,%g0
227 stuw %o4,[%o0-4]
228 srlx %o4,32,%o5
229 bnz,a,pt %icc,.L_bn_mul_add_words_loop
230 lduw [%o1],%g2
231
232 brnz,a,pn %o2,.L_bn_mul_add_words_tail
233 lduw [%o1],%g2
234.L_bn_mul_add_words_return:
235 retl
236 mov %o5,%o0
237
! Tail: the remaining 1-3 words, one at a time; each "brz,pt" exits
! with the store still executing in its delay slot.
238.L_bn_mul_add_words_tail:
239 lduw [%o0],%g1
240 mulx %o3,%g2,%g2
241 add %g1,%o5,%o4
242 dec %o2
243 add %o4,%g2,%o4
244 srlx %o4,32,%o5
245 brz,pt %o2,.L_bn_mul_add_words_return
246 stuw %o4,[%o0]
247
248 lduw [%o1+4],%g2
249 lduw [%o0+4],%g1
250 mulx %o3,%g2,%g2
251 add %g1,%o5,%o4
252 dec %o2
253 add %o4,%g2,%o4
254 srlx %o4,32,%o5
255 brz,pt %o2,.L_bn_mul_add_words_return
256 stuw %o4,[%o0+4]
257
258 lduw [%o1+8],%g2
259 lduw [%o0+8],%g1
260 mulx %o3,%g2,%g2
261 add %g1,%o5,%o4
262 add %o4,%g2,%o4
263 stuw %o4,[%o0+8]
264 retl
265 srlx %o4,32,%o0
266
267.type bn_mul_add_words,#function
268.size bn_mul_add_words,(.-bn_mul_add_words)
269
270.align 32
271
272.global bn_mul_words
273/*
274 * BN_ULONG bn_mul_words(rp,ap,num,w)
275 * BN_ULONG *rp,*ap;
276 * int num;
277 * BN_ULONG w;
278 */
! Like bn_mul_add_words but without adding the existing rp[i]:
! rp[i] = low32(w*ap[i] + carry); returns the final carry in %o0.
! num <= 0 returns 0.  Same structure: 4x unrolled main loop plus a
! 0-3 word tail, carry carried across iterations in %o5.
279bn_mul_words:
280 sra %o2,%g0,%o2 ! signx %o2
281 brgz,a %o2,.L_bn_mul_words_proceeed
282 lduw [%o1],%g2
283 retl
284 clr %o0
285 nop
286 nop
287 nop
288
289.L_bn_mul_words_proceeed:
290 srl %o3,%g0,%o3 ! clruw %o3
291 andcc %o2,-4,%g0
292 bz,pn %icc,.L_bn_mul_words_tail
293 clr %o5
294
! Main loop: four words per iteration; loads of ap alternate between
! %g2 and %g3 so the next operand is fetched ahead of its use.
295.L_bn_mul_words_loop: ! wow! 32 aligned!
296 lduw [%o1+4],%g3
297 mulx %o3,%g2,%g2
298 add %g2,%o5,%o4
299 nop
300 stuw %o4,[%o0]
301 srlx %o4,32,%o5
302
303 lduw [%o1+8],%g2
304 mulx %o3,%g3,%g3
305 add %g3,%o5,%o4
306 dec 4,%o2
307 stuw %o4,[%o0+4]
308 srlx %o4,32,%o5
309
310 lduw [%o1+12],%g3
311 mulx %o3,%g2,%g2
312 add %g2,%o5,%o4
313 inc 16,%o1
314 stuw %o4,[%o0+8]
315 srlx %o4,32,%o5
316
317 mulx %o3,%g3,%g3
318 add %g3,%o5,%o4
319 inc 16,%o0
320 stuw %o4,[%o0-4]
321 srlx %o4,32,%o5
322 andcc %o2,-4,%g0
323 bnz,a,pt %icc,.L_bn_mul_words_loop
324 lduw [%o1],%g2
325 nop
326 nop
327
328 brnz,a,pn %o2,.L_bn_mul_words_tail
329 lduw [%o1],%g2
330.L_bn_mul_words_return:
331 retl
332 mov %o5,%o0
333
! Tail: remaining 1-3 words.
334.L_bn_mul_words_tail:
335 mulx %o3,%g2,%g2
336 add %g2,%o5,%o4
337 dec %o2
338 srlx %o4,32,%o5
339 brz,pt %o2,.L_bn_mul_words_return
340 stuw %o4,[%o0]
341
342 lduw [%o1+4],%g2
343 mulx %o3,%g2,%g2
344 add %g2,%o5,%o4
345 dec %o2
346 srlx %o4,32,%o5
347 brz,pt %o2,.L_bn_mul_words_return
348 stuw %o4,[%o0+4]
349
350 lduw [%o1+8],%g2
351 mulx %o3,%g2,%g2
352 add %g2,%o5,%o4
353 stuw %o4,[%o0+8]
354 retl
355 srlx %o4,32,%o0
356
357.type bn_mul_words,#function
358.size bn_mul_words,(.-bn_mul_words)
359
360.align 32
361.global bn_sqr_words
362/*
363 * void bn_sqr_words(r,a,n)
364 * BN_ULONG *r,*a;
365 * int n;
366 */
! Writes the 64-bit square of each input word as two 32-bit words:
! r[2i] = low32(a[i]^2), r[2i+1] = high32(a[i]^2).  No carries cross
! word pairs, so there is no carry register here.  r advances twice as
! fast as a (inc 32 vs inc 16).  Returns 0 (clr %o0 in the delay slot).
367bn_sqr_words:
368 sra %o2,%g0,%o2 ! signx %o2
369 brgz,a %o2,.L_bn_sqr_words_proceeed
370 lduw [%o1],%g2
371 retl
372 clr %o0
373 nop
374 nop
375 nop
376
377.L_bn_sqr_words_proceeed:
378 andcc %o2,-4,%g0
379 nop
380 bz,pn %icc,.L_bn_sqr_words_tail
381 nop
382
! Main loop: four squares (eight output words) per iteration.
383.L_bn_sqr_words_loop: ! wow! 32 aligned!
384 lduw [%o1+4],%g3
385 mulx %g2,%g2,%o4
386 stuw %o4,[%o0]
387 srlx %o4,32,%o5
388 stuw %o5,[%o0+4]
389 nop
390
391 lduw [%o1+8],%g2
392 mulx %g3,%g3,%o4
393 dec 4,%o2
394 stuw %o4,[%o0+8]
395 srlx %o4,32,%o5
396 stuw %o5,[%o0+12]
397
398 lduw [%o1+12],%g3
399 mulx %g2,%g2,%o4
400 srlx %o4,32,%o5
401 stuw %o4,[%o0+16]
402 inc 16,%o1
403 stuw %o5,[%o0+20]
404
405 mulx %g3,%g3,%o4
406 inc 32,%o0
407 stuw %o4,[%o0-8]
408 srlx %o4,32,%o5
409 andcc %o2,-4,%g2
410 stuw %o5,[%o0-4]
411 bnz,a,pt %icc,.L_bn_sqr_words_loop
412 lduw [%o1],%g2
413 nop
414
415 brnz,a,pn %o2,.L_bn_sqr_words_tail
416 lduw [%o1],%g2
417.L_bn_sqr_words_return:
418 retl
419 clr %o0
420
! Tail: remaining 1-3 squares.
421.L_bn_sqr_words_tail:
422 mulx %g2,%g2,%o4
423 dec %o2
424 stuw %o4,[%o0]
425 srlx %o4,32,%o5
426 brz,pt %o2,.L_bn_sqr_words_return
427 stuw %o5,[%o0+4]
428
429 lduw [%o1+4],%g2
430 mulx %g2,%g2,%o4
431 dec %o2
432 stuw %o4,[%o0+8]
433 srlx %o4,32,%o5
434 brz,pt %o2,.L_bn_sqr_words_return
435 stuw %o5,[%o0+12]
436
437 lduw [%o1+8],%g2
438 mulx %g2,%g2,%o4
439 srlx %o4,32,%o5
440 stuw %o4,[%o0+16]
441 stuw %o5,[%o0+20]
442 retl
443 clr %o0
444
445.type bn_sqr_words,#function
446.size bn_sqr_words,(.-bn_sqr_words)
447
448.align 32
449.global bn_div_words
450/*
451 * BN_ULONG bn_div_words(h,l,d)
452 * BN_ULONG h,l,d;
453 */
! Returns the 32-bit quotient of the 64-bit value (h:l) divided by d:
! the two halves are packed into one register (sllx/or) and divided
! with the V9 unsigned 64-bit udivx; the result is truncated to its
! low 32 bits ("srl ...,%g0" = clruw) in the return delay slot.
454bn_div_words:
455 sllx %o0,32,%o0
456 or %o0,%o1,%o0
457 udivx %o0,%o2,%o0
458 retl
459 srl %o0,%g0,%o0 ! clruw %o0
460
461.type bn_div_words,#function
462.size bn_div_words,(.-bn_div_words)
463
464.align 32
465
466.global bn_add_words
467/*
468 * BN_ULONG bn_add_words(rp,ap,bp,n)
469 * BN_ULONG *rp,*ap,*bp;
470 * int n;
471 */
! rp[i] = ap[i] + bp[i] with carry chained through the integer
! condition codes: addccc both consumes and produces the carry flag,
! so no non-flag instruction may clobber %icc inside the chain.
! Returns the final carry (0/1) -- "clr %o0; movcs %icc,1,%o0".
! n <= 0 returns 0.  4x unrolled loop plus 0-3 word tail.
472bn_add_words:
473 sra %o3,%g0,%o3 ! signx %o3
474 brgz,a %o3,.L_bn_add_words_proceed
475 lduw [%o1],%o4
476 retl
477 clr %o0
478
479.L_bn_add_words_proceed:
480 andcc %o3,-4,%g0
481 bz,pn %icc,.L_bn_add_words_tail
482 addcc %g0,0,%g0 ! clear carry flag
483
! Main loop: all four pairs are loaded up front, then added with
! addccc so the carry flag threads through the whole group.
484.L_bn_add_words_loop: ! wow! 32 aligned!
485 dec 4,%o3
486 lduw [%o2],%o5
487 lduw [%o1+4],%g1
488 lduw [%o2+4],%g2
489 lduw [%o1+8],%g3
490 lduw [%o2+8],%g4
491 addccc %o5,%o4,%o5
492 stuw %o5,[%o0]
493
494 lduw [%o1+12],%o4
495 lduw [%o2+12],%o5
496 inc 16,%o1
497 addccc %g1,%g2,%g1
498 stuw %g1,[%o0+4]
499
500 inc 16,%o2
501 addccc %g3,%g4,%g3
502 stuw %g3,[%o0+8]
503
504 inc 16,%o0
505 addccc %o5,%o4,%o5
506 stuw %o5,[%o0-4]
507 and %o3,-4,%g1
508 brnz,a,pt %g1,.L_bn_add_words_loop
509 lduw [%o1],%o4
510
511 brnz,a,pn %o3,.L_bn_add_words_tail
512 lduw [%o1],%o4
513.L_bn_add_words_return:
514 clr %o0
515 retl
516 movcs %icc,1,%o0
517 nop
518
! Tail: remaining 1-3 word pairs.
519.L_bn_add_words_tail:
520 lduw [%o2],%o5
521 dec %o3
522 addccc %o5,%o4,%o5
523 brz,pt %o3,.L_bn_add_words_return
524 stuw %o5,[%o0]
525
526 lduw [%o1+4],%o4
527 lduw [%o2+4],%o5
528 dec %o3
529 addccc %o5,%o4,%o5
530 brz,pt %o3,.L_bn_add_words_return
531 stuw %o5,[%o0+4]
532
533 lduw [%o1+8],%o4
534 lduw [%o2+8],%o5
535 addccc %o5,%o4,%o5
536 stuw %o5,[%o0+8]
537 clr %o0
538 retl
539 movcs %icc,1,%o0
540
541.type bn_add_words,#function
542.size bn_add_words,(.-bn_add_words)
543
544.global bn_sub_words
545/*
546 * BN_ULONG bn_sub_words(rp,ap,bp,n)
547 * BN_ULONG *rp,*ap,*bp;
548 * int n;
549 */
! Mirror image of bn_add_words: rp[i] = ap[i] - bp[i] with the borrow
! chained through %icc by subccc; returns the final borrow (0/1) via
! "clr %o0; movcs %icc,1,%o0".  n <= 0 returns 0.  4x unrolled loop
! plus 0-3 word tail; no flag-clobbering instruction inside the chain.
550bn_sub_words:
551 sra %o3,%g0,%o3 ! signx %o3
552 brgz,a %o3,.L_bn_sub_words_proceed
553 lduw [%o1],%o4
554 retl
555 clr %o0
556
557.L_bn_sub_words_proceed:
558 andcc %o3,-4,%g0
559 bz,pn %icc,.L_bn_sub_words_tail
560 addcc %g0,0,%g0 ! clear carry flag
561
! Main loop: four pairs loaded up front, subtracted with subccc.
562.L_bn_sub_words_loop: ! wow! 32 aligned!
563 dec 4,%o3
564 lduw [%o2],%o5
565 lduw [%o1+4],%g1
566 lduw [%o2+4],%g2
567 lduw [%o1+8],%g3
568 lduw [%o2+8],%g4
569 subccc %o4,%o5,%o5
570 stuw %o5,[%o0]
571
572 lduw [%o1+12],%o4
573 lduw [%o2+12],%o5
574 inc 16,%o1
575 subccc %g1,%g2,%g2
576 stuw %g2,[%o0+4]
577
578 inc 16,%o2
579 subccc %g3,%g4,%g4
580 stuw %g4,[%o0+8]
581
582 inc 16,%o0
583 subccc %o4,%o5,%o5
584 stuw %o5,[%o0-4]
585 and %o3,-4,%g1
586 brnz,a,pt %g1,.L_bn_sub_words_loop
587 lduw [%o1],%o4
588
589 brnz,a,pn %o3,.L_bn_sub_words_tail
590 lduw [%o1],%o4
591.L_bn_sub_words_return:
592 clr %o0
593 retl
594 movcs %icc,1,%o0
595 nop
596
! Tail: remaining 1-3 word pairs.
597.L_bn_sub_words_tail: ! wow! 32 aligned!
598 lduw [%o2],%o5
599 dec %o3
600 subccc %o4,%o5,%o5
601 brz,pt %o3,.L_bn_sub_words_return
602 stuw %o5,[%o0]
603
604 lduw [%o1+4],%o4
605 lduw [%o2+4],%o5
606 dec %o3
607 subccc %o4,%o5,%o5
608 brz,pt %o3,.L_bn_sub_words_return
609 stuw %o5,[%o0+4]
610
611 lduw [%o1+8],%o4
612 lduw [%o2+8],%o5
613 subccc %o4,%o5,%o5
614 stuw %o5,[%o0+8]
615 clr %o0
616 retl
617 movcs %icc,1,%o0
618
619.type bn_sub_words,#function
620.size bn_sub_words,(.-bn_sub_words)
621
622/*
623 * Code below depends on the fact that upper parts of the %l0-%l7
624 * and %i0-%i7 are zeroed by kernel after context switch. In
625 * previous versions this comment stated that "the trouble is that
626 * it's not feasible to implement the mumbo-jumbo in less V9
627 * instructions:-(" which apparently isn't true thanks to
628 * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
629 * results not from the shorter code, but from elimination of
630 * multicycle none-pairable 'rd %y,%rd' instructions.
631 *
632 * Andy.
633 */
634
635/*
636 * Here is register usage map for *all* routines below.
637 */
638#define t_1 %o0
639#define t_2 %o1
640#define c_12 %o2
641#define c_3 %o3
642
643#define ap(I) [%i1+4*I]
644#define bp(I) [%i2+4*I]
645#define rp(I) [%i0+4*I]
646
647#define a_0 %l0
648#define a_1 %l1
649#define a_2 %l2
650#define a_3 %l3
651#define a_4 %l4
652#define a_5 %l5
653#define a_6 %l6
654#define a_7 %l7
655
656#define b_0 %i3
657#define b_1 %i4
658#define b_2 %i5
659#define b_3 %o4
660#define b_4 %o5
661#define b_5 %o7
662#define b_6 %g1
663#define b_7 %g4
664
665.align 32
666.global bn_mul_comba8
667/*
668 * void bn_mul_comba8(r,a,b)
669 * BN_ULONG *r,*a,*b;
670 */
! Fully unrolled 8x8-word comba multiplication: r[0..15] = a[0..7] *
! b[0..7], computed one result column at a time.  Accumulator scheme
! (uses the 64-bit width of %o registers, see the comment block above):
!   c_12 = low 64 bits of the running column sum;
!   c_3  = accumulated 2^32 carry-outs, added in units of t_2 = 1<<32
!          (set up by "mov 1 ; sllx 32") via the annulled-branch idiom
!              bcs,a %xcc,.+8
!              add   c_3,t_2,c_3
!          which executes the add only when addcc carried out.
! At the end of each column: "srlx t_1,32,c_12" keeps the high half,
! "stuw t_1,rp(i)" stores the low 32 bits, and "or c_12,c_3,c_12"
! folds the collected carries into the next column.  lduw of a[i]/b[i]
! is interleaved into the arithmetic well before first use.
! The "!=" markers are the author's original pairing annotations.
671bn_mul_comba8:
672 save %sp,FRAME_SIZE,%sp
673 mov 1,t_2
674 lduw ap(0),a_0
675 sllx t_2,32,t_2
676 lduw bp(0),b_0 !=
677 lduw bp(1),b_1
678 mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3);
679 srlx t_1,32,c_12
680 stuw t_1,rp(0) !=!r[0]=c1;
681
682 lduw ap(1),a_1
683 mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1);
684 addcc c_12,t_1,c_12
685 clr c_3 !=
686 bcs,a %xcc,.+8
687 add c_3,t_2,c_3
688 lduw ap(2),a_2
689 mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
690 addcc c_12,t_1,t_1
691 bcs,a %xcc,.+8
692 add c_3,t_2,c_3
693 srlx t_1,32,c_12 !=
694 stuw t_1,rp(1) !r[1]=c2;
695 or c_12,c_3,c_12
696
697 mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
698 addcc c_12,t_1,c_12 !=
699 clr c_3
700 bcs,a %xcc,.+8
701 add c_3,t_2,c_3
702 lduw bp(2),b_2 !=
703 mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
704 addcc c_12,t_1,c_12
705 bcs,a %xcc,.+8
706 add c_3,t_2,c_3 !=
707 lduw bp(3),b_3
708 mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
709 addcc c_12,t_1,t_1
710 bcs,a %xcc,.+8 !=
711 add c_3,t_2,c_3
712 srlx t_1,32,c_12
713 stuw t_1,rp(2) !r[2]=c3;
714 or c_12,c_3,c_12 !=
715
716 mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
717 addcc c_12,t_1,c_12
718 clr c_3
719 bcs,a %xcc,.+8 !=
720 add c_3,t_2,c_3
721 mulx a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3);
722 addcc c_12,t_1,c_12
723 bcs,a %xcc,.+8 !=
724 add c_3,t_2,c_3
725 lduw ap(3),a_3
726 mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
727 addcc c_12,t_1,c_12 !=
728 bcs,a %xcc,.+8
729 add c_3,t_2,c_3
730 lduw ap(4),a_4
731 mulx a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);!=
732 addcc c_12,t_1,t_1
733 bcs,a %xcc,.+8
734 add c_3,t_2,c_3
735 srlx t_1,32,c_12 !=
736 stuw t_1,rp(3) !r[3]=c1;
737 or c_12,c_3,c_12
738
739 mulx a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1);
740 addcc c_12,t_1,c_12 !=
741 clr c_3
742 bcs,a %xcc,.+8
743 add c_3,t_2,c_3
744 mulx a_3,b_1,t_1 !=!mul_add_c(a[3],b[1],c2,c3,c1);
745 addcc c_12,t_1,c_12
746 bcs,a %xcc,.+8
747 add c_3,t_2,c_3
748 mulx a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1);
749 addcc c_12,t_1,c_12
750 bcs,a %xcc,.+8
751 add c_3,t_2,c_3
752 lduw bp(4),b_4 !=
753 mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
754 addcc c_12,t_1,c_12
755 bcs,a %xcc,.+8
756 add c_3,t_2,c_3 !=
757 lduw bp(5),b_5
758 mulx a_0,b_4,t_1 !mul_add_c(a[0],b[4],c2,c3,c1);
759 addcc c_12,t_1,t_1
760 bcs,a %xcc,.+8 !=
761 add c_3,t_2,c_3
762 srlx t_1,32,c_12
763 stuw t_1,rp(4) !r[4]=c2;
764 or c_12,c_3,c_12 !=
765
766 mulx a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2);
767 addcc c_12,t_1,c_12
768 clr c_3
769 bcs,a %xcc,.+8 !=
770 add c_3,t_2,c_3
771 mulx a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2);
772 addcc c_12,t_1,c_12
773 bcs,a %xcc,.+8 !=
774 add c_3,t_2,c_3
775 mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
776 addcc c_12,t_1,c_12
777 bcs,a %xcc,.+8 !=
778 add c_3,t_2,c_3
779 mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
780 addcc c_12,t_1,c_12
781 bcs,a %xcc,.+8 !=
782 add c_3,t_2,c_3
783 lduw ap(5),a_5
784 mulx a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2);
785 addcc c_12,t_1,c_12 !=
786 bcs,a %xcc,.+8
787 add c_3,t_2,c_3
788 lduw ap(6),a_6
789 mulx a_5,b_0,t_1 !=!mul_add_c(a[5],b[0],c3,c1,c2);
790 addcc c_12,t_1,t_1
791 bcs,a %xcc,.+8
792 add c_3,t_2,c_3
793 srlx t_1,32,c_12 !=
794 stuw t_1,rp(5) !r[5]=c3;
795 or c_12,c_3,c_12
796
797 mulx a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3);
798 addcc c_12,t_1,c_12 !=
799 clr c_3
800 bcs,a %xcc,.+8
801 add c_3,t_2,c_3
802 mulx a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3);
803 addcc c_12,t_1,c_12
804 bcs,a %xcc,.+8
805 add c_3,t_2,c_3
806 mulx a_4,b_2,t_1 !=!mul_add_c(a[4],b[2],c1,c2,c3);
807 addcc c_12,t_1,c_12
808 bcs,a %xcc,.+8
809 add c_3,t_2,c_3
810 mulx a_3,b_3,t_1 !=!mul_add_c(a[3],b[3],c1,c2,c3);
811 addcc c_12,t_1,c_12
812 bcs,a %xcc,.+8
813 add c_3,t_2,c_3
814 mulx a_2,b_4,t_1 !=!mul_add_c(a[2],b[4],c1,c2,c3);
815 addcc c_12,t_1,c_12
816 bcs,a %xcc,.+8
817 add c_3,t_2,c_3
818 lduw bp(6),b_6 !=
819 mulx a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3);
820 addcc c_12,t_1,c_12
821 bcs,a %xcc,.+8
822 add c_3,t_2,c_3 !=
823 lduw bp(7),b_7
824 mulx a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3);
825 addcc c_12,t_1,t_1
826 bcs,a %xcc,.+8 !=
827 add c_3,t_2,c_3
828 srlx t_1,32,c_12
829 stuw t_1,rp(6) !r[6]=c1;
830 or c_12,c_3,c_12 !=
831
832 mulx a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1);
833 addcc c_12,t_1,c_12
834 clr c_3
835 bcs,a %xcc,.+8 !=
836 add c_3,t_2,c_3
837 mulx a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1);
838 addcc c_12,t_1,c_12
839 bcs,a %xcc,.+8 !=
840 add c_3,t_2,c_3
841 mulx a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1);
842 addcc c_12,t_1,c_12
843 bcs,a %xcc,.+8 !=
844 add c_3,t_2,c_3
845 mulx a_3,b_4,t_1 !mul_add_c(a[3],b[4],c2,c3,c1);
846 addcc c_12,t_1,c_12
847 bcs,a %xcc,.+8 !=
848 add c_3,t_2,c_3
849 mulx a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1);
850 addcc c_12,t_1,c_12
851 bcs,a %xcc,.+8 !=
852 add c_3,t_2,c_3
853 mulx a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1);
854 addcc c_12,t_1,c_12
855 bcs,a %xcc,.+8 !=
856 add c_3,t_2,c_3
857 lduw ap(7),a_7
858 mulx a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1);
859 addcc c_12,t_1,c_12
860 bcs,a %xcc,.+8
861 add c_3,t_2,c_3
862 mulx a_7,b_0,t_1 !=!mul_add_c(a[7],b[0],c2,c3,c1);
863 addcc c_12,t_1,t_1
864 bcs,a %xcc,.+8
865 add c_3,t_2,c_3
866 srlx t_1,32,c_12 !=
867 stuw t_1,rp(7) !r[7]=c2;
868 or c_12,c_3,c_12
869
870 mulx a_7,b_1,t_1 !=!mul_add_c(a[7],b[1],c3,c1,c2);
871 addcc c_12,t_1,c_12
872 clr c_3
873 bcs,a %xcc,.+8
874 add c_3,t_2,c_3 !=
875 mulx a_6,b_2,t_1 !mul_add_c(a[6],b[2],c3,c1,c2);
876 addcc c_12,t_1,c_12
877 bcs,a %xcc,.+8
878 add c_3,t_2,c_3 !=
879 mulx a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2);
880 addcc c_12,t_1,c_12
881 bcs,a %xcc,.+8
882 add c_3,t_2,c_3 !=
883 mulx a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2);
884 addcc c_12,t_1,c_12
885 bcs,a %xcc,.+8
886 add c_3,t_2,c_3 !=
887 mulx a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2);
888 addcc c_12,t_1,c_12
889 bcs,a %xcc,.+8
890 add c_3,t_2,c_3 !=
891 mulx a_2,b_6,t_1 !mul_add_c(a[2],b[6],c3,c1,c2);
892 addcc c_12,t_1,c_12
893 bcs,a %xcc,.+8
894 add c_3,t_2,c_3 !=
895 mulx a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2);
896 addcc c_12,t_1,t_1
897 bcs,a %xcc,.+8
898 add c_3,t_2,c_3 !=
899 srlx t_1,32,c_12
900 stuw t_1,rp(8) !r[8]=c3;
901 or c_12,c_3,c_12
902
903 mulx a_2,b_7,t_1 !=!mul_add_c(a[2],b[7],c1,c2,c3);
904 addcc c_12,t_1,c_12
905 clr c_3
906 bcs,a %xcc,.+8
907 add c_3,t_2,c_3 !=
908 mulx a_3,b_6,t_1 !mul_add_c(a[3],b[6],c1,c2,c3);
909 addcc c_12,t_1,c_12
910 bcs,a %xcc,.+8 !=
911 add c_3,t_2,c_3
912 mulx a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3);
913 addcc c_12,t_1,c_12
914 bcs,a %xcc,.+8 !=
915 add c_3,t_2,c_3
916 mulx a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3);
917 addcc c_12,t_1,c_12
918 bcs,a %xcc,.+8 !=
919 add c_3,t_2,c_3
920 mulx a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3);
921 addcc c_12,t_1,c_12
922 bcs,a %xcc,.+8 !=
923 add c_3,t_2,c_3
924 mulx a_7,b_2,t_1 !mul_add_c(a[7],b[2],c1,c2,c3);
925 addcc c_12,t_1,t_1
926 bcs,a %xcc,.+8 !=
927 add c_3,t_2,c_3
928 srlx t_1,32,c_12
929 stuw t_1,rp(9) !r[9]=c1;
930 or c_12,c_3,c_12 !=
931
932 mulx a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1);
933 addcc c_12,t_1,c_12
934 clr c_3
935 bcs,a %xcc,.+8 !=
936 add c_3,t_2,c_3
937 mulx a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1);
938 addcc c_12,t_1,c_12
939 bcs,a %xcc,.+8 !=
940 add c_3,t_2,c_3
941 mulx a_5,b_5,t_1 !mul_add_c(a[5],b[5],c2,c3,c1);
942 addcc c_12,t_1,c_12
943 bcs,a %xcc,.+8 !=
944 add c_3,t_2,c_3
945 mulx a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1);
946 addcc c_12,t_1,c_12
947 bcs,a %xcc,.+8 !=
948 add c_3,t_2,c_3
949 mulx a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1);
950 addcc c_12,t_1,t_1
951 bcs,a %xcc,.+8 !=
952 add c_3,t_2,c_3
953 srlx t_1,32,c_12
954 stuw t_1,rp(10) !r[10]=c2;
955 or c_12,c_3,c_12 !=
956
957 mulx a_4,b_7,t_1 !mul_add_c(a[4],b[7],c3,c1,c2);
958 addcc c_12,t_1,c_12
959 clr c_3
960 bcs,a %xcc,.+8 !=
961 add c_3,t_2,c_3
962 mulx a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2);
963 addcc c_12,t_1,c_12
964 bcs,a %xcc,.+8 !=
965 add c_3,t_2,c_3
966 mulx a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2);
967 addcc c_12,t_1,c_12
968 bcs,a %xcc,.+8 !=
969 add c_3,t_2,c_3
970 mulx a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2);
971 addcc c_12,t_1,t_1
972 bcs,a %xcc,.+8 !=
973 add c_3,t_2,c_3
974 srlx t_1,32,c_12
975 stuw t_1,rp(11) !r[11]=c3;
976 or c_12,c_3,c_12 !=
977
978 mulx a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3);
979 addcc c_12,t_1,c_12
980 clr c_3
981 bcs,a %xcc,.+8 !=
982 add c_3,t_2,c_3
983 mulx a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3);
984 addcc c_12,t_1,c_12
985 bcs,a %xcc,.+8 !=
986 add c_3,t_2,c_3
987 mulx a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3);
988 addcc c_12,t_1,t_1
989 bcs,a %xcc,.+8 !=
990 add c_3,t_2,c_3
991 srlx t_1,32,c_12
992 stuw t_1,rp(12) !r[12]=c1;
993 or c_12,c_3,c_12 !=
994
995 mulx a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1);
996 addcc c_12,t_1,c_12
997 clr c_3
998 bcs,a %xcc,.+8 !=
999 add c_3,t_2,c_3
1000 mulx a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1);
1001 addcc c_12,t_1,t_1
1002 bcs,a %xcc,.+8 !=
1003 add c_3,t_2,c_3
1004 srlx t_1,32,c_12
! Plain "st" below is identical to stuw: "#define stuw st" earlier in
! this file (GNU as does not accept the stuw mnemonic).
1005 st t_1,rp(13) !r[13]=c2;
1006 or c_12,c_3,c_12 !=
1007
1008 mulx a_7,b_7,t_1 !mul_add_c(a[7],b[7],c3,c1,c2);
1009 addcc c_12,t_1,t_1
1010 srlx t_1,32,c_12 !=
1011 stuw t_1,rp(14) !r[14]=c3;
1012 stuw c_12,rp(15) !r[15]=c1;
1013
1014 ret
1015 restore %g0,%g0,%o0 !=
1016
1017.type bn_mul_comba8,#function
1018.size bn_mul_comba8,(.-bn_mul_comba8)
1019
1020.align 32
1021
1022.global bn_mul_comba4
1023/*
1024 * void bn_mul_comba4(r,a,b)
1025 * BN_ULONG *r,*a,*b;
1026 */
! 4x4-word comba multiplication: r[0..7] = a[0..3] * b[0..3].
! Same column-at-a-time accumulator scheme as bn_mul_comba8 above:
! c_12 holds the 64-bit running column sum, c_3 collects 2^32
! carry-outs via "bcs,a %xcc,.+8 ; add c_3,t_2,c_3" with t_2 = 1<<32,
! and each column ends with srlx/stuw/or to store the low word and
! roll the high word plus carries into the next column.
1027bn_mul_comba4:
1028 save %sp,FRAME_SIZE,%sp
1029 lduw ap(0),a_0
1030 mov 1,t_2
1031 lduw bp(0),b_0
1032 sllx t_2,32,t_2 !=
1033 lduw bp(1),b_1
1034 mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3);
1035 srlx t_1,32,c_12
1036 stuw t_1,rp(0) !=!r[0]=c1;
1037
1038 lduw ap(1),a_1
1039 mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1);
1040 addcc c_12,t_1,c_12
1041 clr c_3 !=
1042 bcs,a %xcc,.+8
1043 add c_3,t_2,c_3
1044 lduw ap(2),a_2
1045 mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
1046 addcc c_12,t_1,t_1
1047 bcs,a %xcc,.+8
1048 add c_3,t_2,c_3
1049 srlx t_1,32,c_12 !=
1050 stuw t_1,rp(1) !r[1]=c2;
1051 or c_12,c_3,c_12
1052
1053 mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
1054 addcc c_12,t_1,c_12 !=
1055 clr c_3
1056 bcs,a %xcc,.+8
1057 add c_3,t_2,c_3
1058 lduw bp(2),b_2 !=
1059 mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
1060 addcc c_12,t_1,c_12
1061 bcs,a %xcc,.+8
1062 add c_3,t_2,c_3 !=
1063 lduw bp(3),b_3
1064 mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
1065 addcc c_12,t_1,t_1
1066 bcs,a %xcc,.+8 !=
1067 add c_3,t_2,c_3
1068 srlx t_1,32,c_12
1069 stuw t_1,rp(2) !r[2]=c3;
1070 or c_12,c_3,c_12 !=
1071
1072 mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
1073 addcc c_12,t_1,c_12
1074 clr c_3
1075 bcs,a %xcc,.+8 !=
1076 add c_3,t_2,c_3
1077 mulx a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3);
1078 addcc c_12,t_1,c_12
1079 bcs,a %xcc,.+8 !=
1080 add c_3,t_2,c_3
1081 lduw ap(3),a_3
1082 mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
1083 addcc c_12,t_1,c_12 !=
1084 bcs,a %xcc,.+8
1085 add c_3,t_2,c_3
1086 mulx a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!=
1087 addcc c_12,t_1,t_1 !=
1088 bcs,a %xcc,.+8
1089 add c_3,t_2,c_3
1090 srlx t_1,32,c_12
1091 stuw t_1,rp(3) !=!r[3]=c1;
1092 or c_12,c_3,c_12
1093
1094 mulx a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
1095 addcc c_12,t_1,c_12
1096 clr c_3 !=
1097 bcs,a %xcc,.+8
1098 add c_3,t_2,c_3
1099 mulx a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1);
1100 addcc c_12,t_1,c_12 !=
1101 bcs,a %xcc,.+8
1102 add c_3,t_2,c_3
1103 mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
1104 addcc c_12,t_1,t_1 !=
1105 bcs,a %xcc,.+8
1106 add c_3,t_2,c_3
1107 srlx t_1,32,c_12
1108 stuw t_1,rp(4) !=!r[4]=c2;
1109 or c_12,c_3,c_12
1110
1111 mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
1112 addcc c_12,t_1,c_12
1113 clr c_3 !=
1114 bcs,a %xcc,.+8
1115 add c_3,t_2,c_3
1116 mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
1117 addcc c_12,t_1,t_1 !=
1118 bcs,a %xcc,.+8
1119 add c_3,t_2,c_3
1120 srlx t_1,32,c_12
1121 stuw t_1,rp(5) !=!r[5]=c3;
1122 or c_12,c_3,c_12
1123
1124 mulx a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
1125 addcc c_12,t_1,t_1
1126 srlx t_1,32,c_12 !=
1127 stuw t_1,rp(6) !r[6]=c1;
1128 stuw c_12,rp(7) !r[7]=c2;
1129
1130 ret
1131 restore %g0,%g0,%o0
1132
1133.type bn_mul_comba4,#function
1134.size bn_mul_comba4,(.-bn_mul_comba4)
1135
1136.align 32
1137
1138.global bn_sqr_comba8
1139bn_sqr_comba8:
1140 save %sp,FRAME_SIZE,%sp
1141 mov 1,t_2
1142 lduw ap(0),a_0
1143 sllx t_2,32,t_2
1144 lduw ap(1),a_1
1145 mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3);
1146 srlx t_1,32,c_12
1147 stuw t_1,rp(0) !r[0]=c1;
1148
1149 lduw ap(2),a_2
1150 mulx a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
1151 addcc c_12,t_1,c_12
1152 clr c_3
1153 bcs,a %xcc,.+8
1154 add c_3,t_2,c_3
1155 addcc c_12,t_1,t_1
1156 bcs,a %xcc,.+8
1157 add c_3,t_2,c_3
1158 srlx t_1,32,c_12
1159 stuw t_1,rp(1) !r[1]=c2;
1160 or c_12,c_3,c_12
1161
1162 mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1163 addcc c_12,t_1,c_12
1164 clr c_3
1165 bcs,a %xcc,.+8
1166 add c_3,t_2,c_3
1167 addcc c_12,t_1,c_12
1168 bcs,a %xcc,.+8
1169 add c_3,t_2,c_3
1170 lduw ap(3),a_3
1171 mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1172 addcc c_12,t_1,t_1
1173 bcs,a %xcc,.+8
1174 add c_3,t_2,c_3
1175 srlx t_1,32,c_12
1176 stuw t_1,rp(2) !r[2]=c3;
1177 or c_12,c_3,c_12
1178
1179 mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
1180 addcc c_12,t_1,c_12
1181 clr c_3
1182 bcs,a %xcc,.+8
1183 add c_3,t_2,c_3
1184 addcc c_12,t_1,c_12
1185 bcs,a %xcc,.+8
1186 add c_3,t_2,c_3
1187 lduw ap(4),a_4
1188 mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1189 addcc c_12,t_1,c_12
1190 bcs,a %xcc,.+8
1191 add c_3,t_2,c_3
1192 addcc c_12,t_1,t_1
1193 bcs,a %xcc,.+8
1194 add c_3,t_2,c_3
1195 srlx t_1,32,c_12
1196 st t_1,rp(3) !r[3]=c1;
1197 or c_12,c_3,c_12
1198
1199 mulx a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1);
1200 addcc c_12,t_1,c_12
1201 clr c_3
1202 bcs,a %xcc,.+8
1203 add c_3,t_2,c_3
1204 addcc c_12,t_1,c_12
1205 bcs,a %xcc,.+8
1206 add c_3,t_2,c_3
1207 mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1208 addcc c_12,t_1,c_12
1209 bcs,a %xcc,.+8
1210 add c_3,t_2,c_3
1211 addcc c_12,t_1,c_12
1212 bcs,a %xcc,.+8
1213 add c_3,t_2,c_3
1214 lduw ap(5),a_5
1215 mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1216 addcc c_12,t_1,t_1
1217 bcs,a %xcc,.+8
1218 add c_3,t_2,c_3
1219 srlx t_1,32,c_12
1220 stuw t_1,rp(4) !r[4]=c2;
1221 or c_12,c_3,c_12
1222
1223 mulx a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2);
1224 addcc c_12,t_1,c_12
1225 clr c_3
1226 bcs,a %xcc,.+8
1227 add c_3,t_2,c_3
1228 addcc c_12,t_1,c_12
1229 bcs,a %xcc,.+8
1230 add c_3,t_2,c_3
1231 mulx a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2);
1232 addcc c_12,t_1,c_12
1233 bcs,a %xcc,.+8
1234 add c_3,t_2,c_3
1235 addcc c_12,t_1,c_12
1236 bcs,a %xcc,.+8
1237 add c_3,t_2,c_3
1238 lduw ap(6),a_6
1239 mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
1240 addcc c_12,t_1,c_12
1241 bcs,a %xcc,.+8
1242 add c_3,t_2,c_3
1243 addcc c_12,t_1,t_1
1244 bcs,a %xcc,.+8
1245 add c_3,t_2,c_3
1246 srlx t_1,32,c_12
1247 stuw t_1,rp(5) !r[5]=c3;
1248 or c_12,c_3,c_12
1249
1250 mulx a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3);
1251 addcc c_12,t_1,c_12
1252 clr c_3
1253 bcs,a %xcc,.+8
1254 add c_3,t_2,c_3
1255 addcc c_12,t_1,c_12
1256 bcs,a %xcc,.+8
1257 add c_3,t_2,c_3
1258 mulx a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3);
1259 addcc c_12,t_1,c_12
1260 bcs,a %xcc,.+8
1261 add c_3,t_2,c_3
1262 addcc c_12,t_1,c_12
1263 bcs,a %xcc,.+8
1264 add c_3,t_2,c_3
1265 mulx a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3);
1266 addcc c_12,t_1,c_12
1267 bcs,a %xcc,.+8
1268 add c_3,t_2,c_3
1269 addcc c_12,t_1,c_12
1270 bcs,a %xcc,.+8
1271 add c_3,t_2,c_3
1272 lduw ap(7),a_7
1273 mulx a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3);
1274 addcc c_12,t_1,t_1
1275 bcs,a %xcc,.+8
1276 add c_3,t_2,c_3
1277 srlx t_1,32,c_12
1278 stuw t_1,rp(6) !r[6]=c1;
1279 or c_12,c_3,c_12
1280
1281 mulx a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1);
1282 addcc c_12,t_1,c_12
1283 clr c_3
1284 bcs,a %xcc,.+8
1285 add c_3,t_2,c_3
1286 addcc c_12,t_1,c_12
1287 bcs,a %xcc,.+8
1288 add c_3,t_2,c_3
1289 mulx a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1);
1290 addcc c_12,t_1,c_12
1291 bcs,a %xcc,.+8
1292 add c_3,t_2,c_3
1293 addcc c_12,t_1,c_12
1294 bcs,a %xcc,.+8
1295 add c_3,t_2,c_3
1296 mulx a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1);
1297 addcc c_12,t_1,c_12
1298 bcs,a %xcc,.+8
1299 add c_3,t_2,c_3
1300 addcc c_12,t_1,c_12
1301 bcs,a %xcc,.+8
1302 add c_3,t_2,c_3
1303 mulx a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1);
1304 addcc c_12,t_1,c_12
1305 bcs,a %xcc,.+8
1306 add c_3,t_2,c_3
1307 addcc c_12,t_1,t_1
1308 bcs,a %xcc,.+8
1309 add c_3,t_2,c_3
1310 srlx t_1,32,c_12
1311 stuw t_1,rp(7) !r[7]=c2;
1312 or c_12,c_3,c_12
1313
1314 mulx a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2);
1315 addcc c_12,t_1,c_12
1316 clr c_3
1317 bcs,a %xcc,.+8
1318 add c_3,t_2,c_3
1319 addcc c_12,t_1,c_12
1320 bcs,a %xcc,.+8
1321 add c_3,t_2,c_3
1322 mulx a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2);
1323 addcc c_12,t_1,c_12
1324 bcs,a %xcc,.+8
1325 add c_3,t_2,c_3
1326 addcc c_12,t_1,c_12
1327 bcs,a %xcc,.+8
1328 add c_3,t_2,c_3
1329 mulx a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2);
1330 addcc c_12,t_1,c_12
1331 bcs,a %xcc,.+8
1332 add c_3,t_2,c_3
1333 addcc c_12,t_1,c_12
1334 bcs,a %xcc,.+8
1335 add c_3,t_2,c_3
1336 mulx a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2);
1337 addcc c_12,t_1,t_1
1338 bcs,a %xcc,.+8
1339 add c_3,t_2,c_3
1340 srlx t_1,32,c_12
1341 stuw t_1,rp(8) !r[8]=c3;
1342 or c_12,c_3,c_12
1343
1344 mulx a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3);
1345 addcc c_12,t_1,c_12
1346 clr c_3
1347 bcs,a %xcc,.+8
1348 add c_3,t_2,c_3
1349 addcc c_12,t_1,c_12
1350 bcs,a %xcc,.+8
1351 add c_3,t_2,c_3
1352 mulx a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3);
1353 addcc c_12,t_1,c_12
1354 bcs,a %xcc,.+8
1355 add c_3,t_2,c_3
1356 addcc c_12,t_1,c_12
1357 bcs,a %xcc,.+8
1358 add c_3,t_2,c_3
1359 mulx a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3);
1360 addcc c_12,t_1,c_12
1361 bcs,a %xcc,.+8
1362 add c_3,t_2,c_3
1363 addcc c_12,t_1,t_1
1364 bcs,a %xcc,.+8
1365 add c_3,t_2,c_3
1366 srlx t_1,32,c_12
1367 stuw t_1,rp(9) !r[9]=c1;
1368 or c_12,c_3,c_12
1369
1370 mulx a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1);
1371 addcc c_12,t_1,c_12
1372 clr c_3
1373 bcs,a %xcc,.+8
1374 add c_3,t_2,c_3
1375 addcc c_12,t_1,c_12
1376 bcs,a %xcc,.+8
1377 add c_3,t_2,c_3
1378 mulx a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1);
1379 addcc c_12,t_1,c_12
1380 bcs,a %xcc,.+8
1381 add c_3,t_2,c_3
1382 addcc c_12,t_1,c_12
1383 bcs,a %xcc,.+8
1384 add c_3,t_2,c_3
1385 mulx a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1);
1386 addcc c_12,t_1,t_1
1387 bcs,a %xcc,.+8
1388 add c_3,t_2,c_3
1389 srlx t_1,32,c_12
1390 stuw t_1,rp(10) !r[10]=c2;
1391 or c_12,c_3,c_12
1392
1393 mulx a_4,a_7,t_1 !sqr_add_c2(a,7,4,c3,c1,c2);
1394 addcc c_12,t_1,c_12
1395 clr c_3
1396 bcs,a %xcc,.+8
1397 add c_3,t_2,c_3
1398 addcc c_12,t_1,c_12
1399 bcs,a %xcc,.+8
1400 add c_3,t_2,c_3
1401 mulx a_5,a_6,t_1 !sqr_add_c2(a,6,5,c3,c1,c2);
1402 addcc c_12,t_1,c_12
1403 bcs,a %xcc,.+8
1404 add c_3,t_2,c_3
1405 addcc c_12,t_1,t_1
1406 bcs,a %xcc,.+8
1407 add c_3,t_2,c_3
1408 srlx t_1,32,c_12
1409 stuw t_1,rp(11) !r[11]=c3;
1410 or c_12,c_3,c_12
1411
1412 mulx a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3);
1413 addcc c_12,t_1,c_12
1414 clr c_3
1415 bcs,a %xcc,.+8
1416 add c_3,t_2,c_3
1417 addcc c_12,t_1,c_12
1418 bcs,a %xcc,.+8
1419 add c_3,t_2,c_3
1420 mulx a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3);
1421 addcc c_12,t_1,t_1
1422 bcs,a %xcc,.+8
1423 add c_3,t_2,c_3
1424 srlx t_1,32,c_12
1425 stuw t_1,rp(12) !r[12]=c1;
1426 or c_12,c_3,c_12
1427
1428 mulx a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1);
1429 addcc c_12,t_1,c_12
1430 clr c_3
1431 bcs,a %xcc,.+8
1432 add c_3,t_2,c_3
1433 addcc c_12,t_1,t_1
1434 bcs,a %xcc,.+8
1435 add c_3,t_2,c_3
1436 srlx t_1,32,c_12
1437 stuw t_1,rp(13) !r[13]=c2;
1438 or c_12,c_3,c_12
1439
1440 mulx a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2);
1441 addcc c_12,t_1,t_1
1442 srlx t_1,32,c_12
1443 stuw t_1,rp(14) !r[14]=c3;
1444 stuw c_12,rp(15) !r[15]=c1;
1445
1446 ret
1447 restore %g0,%g0,%o0
1448
1449.type bn_sqr_comba8,#function
1450.size bn_sqr_comba8,(.-bn_sqr_comba8)
1451
1452.align 32
1453
1454.global bn_sqr_comba4
1455/*
1456 * void bn_sqr_comba4(r,a)
1457 * BN_ULONG *r,*a;
1458 */
1459bn_sqr_comba4:
1460 save %sp,FRAME_SIZE,%sp
1461 mov 1,t_2
1462 lduw ap(0),a_0
1463 sllx t_2,32,t_2
1464 lduw ap(1),a_1
1465 mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3);
1466 srlx t_1,32,c_12
1467 stuw t_1,rp(0) !r[0]=c1;
1468
1469 lduw ap(2),a_2
1470 mulx a_0,a_1,t_1 !sqr_add_c2(a,1,0,c2,c3,c1);
1471 addcc c_12,t_1,c_12
1472 clr c_3
1473 bcs,a %xcc,.+8
1474 add c_3,t_2,c_3
1475 addcc c_12,t_1,t_1
1476 bcs,a %xcc,.+8
1477 add c_3,t_2,c_3
1478 srlx t_1,32,c_12
1479 stuw t_1,rp(1) !r[1]=c2;
1480 or c_12,c_3,c_12
1481
1482 mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1483 addcc c_12,t_1,c_12
1484 clr c_3
1485 bcs,a %xcc,.+8
1486 add c_3,t_2,c_3
1487 addcc c_12,t_1,c_12
1488 bcs,a %xcc,.+8
1489 add c_3,t_2,c_3
1490 lduw ap(3),a_3
1491 mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1492 addcc c_12,t_1,t_1
1493 bcs,a %xcc,.+8
1494 add c_3,t_2,c_3
1495 srlx t_1,32,c_12
1496 stuw t_1,rp(2) !r[2]=c3;
1497 or c_12,c_3,c_12
1498
1499 mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
1500 addcc c_12,t_1,c_12
1501 clr c_3
1502 bcs,a %xcc,.+8
1503 add c_3,t_2,c_3
1504 addcc c_12,t_1,c_12
1505 bcs,a %xcc,.+8
1506 add c_3,t_2,c_3
1507 mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1508 addcc c_12,t_1,c_12
1509 bcs,a %xcc,.+8
1510 add c_3,t_2,c_3
1511 addcc c_12,t_1,t_1
1512 bcs,a %xcc,.+8
1513 add c_3,t_2,c_3
1514 srlx t_1,32,c_12
1515 stuw t_1,rp(3) !r[3]=c1;
1516 or c_12,c_3,c_12
1517
1518 mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1519 addcc c_12,t_1,c_12
1520 clr c_3
1521 bcs,a %xcc,.+8
1522 add c_3,t_2,c_3
1523 addcc c_12,t_1,c_12
1524 bcs,a %xcc,.+8
1525 add c_3,t_2,c_3
1526 mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1527 addcc c_12,t_1,t_1
1528 bcs,a %xcc,.+8
1529 add c_3,t_2,c_3
1530 srlx t_1,32,c_12
1531 stuw t_1,rp(4) !r[4]=c2;
1532 or c_12,c_3,c_12
1533
1534 mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
1535 addcc c_12,t_1,c_12
1536 clr c_3
1537 bcs,a %xcc,.+8
1538 add c_3,t_2,c_3
1539 addcc c_12,t_1,t_1
1540 bcs,a %xcc,.+8
1541 add c_3,t_2,c_3
1542 srlx t_1,32,c_12
1543 stuw t_1,rp(5) !r[5]=c3;
1544 or c_12,c_3,c_12
1545
1546 mulx a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3);
1547 addcc c_12,t_1,t_1
1548 srlx t_1,32,c_12
1549 stuw t_1,rp(6) !r[6]=c1;
1550 stuw c_12,rp(7) !r[7]=c2;
1551
1552 ret
1553 restore %g0,%g0,%o0
1554
1555.type bn_sqr_comba4,#function
1556.size bn_sqr_comba4,(.-bn_sqr_comba4)
1557
1558.align 32
diff --git a/src/lib/libcrypto/bn/asm/sparcv9-mont.pl b/src/lib/libcrypto/bn/asm/sparcv9-mont.pl
deleted file mode 100644
index b8fb1e8a25..0000000000
--- a/src/lib/libcrypto/bn/asm/sparcv9-mont.pl
+++ /dev/null
@@ -1,606 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# December 2005
11#
12# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
13# for undertaken effort are multiple. First of all, UltraSPARC is not
14# the whole SPARCv9 universe and other VIS-free implementations deserve
15# optimized code as much. Secondly, newly introduced UltraSPARC T1,
16# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive pathes,
17# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
18# several integrated RSA/DSA accelerator circuits accessible through
19# kernel driver [only(*)], but having decent user-land software
20# implementation is important too. Finally, reasons like desire to
21# experiment with dedicated squaring procedure. Yes, this module
22# implements one, because it was easiest to draft it in SPARCv9
23# instructions...
24
25# (*) Engine accessing the driver in question is on my TODO list.
26# For reference, acceleator is estimated to give 6 to 10 times
27# improvement on single-threaded RSA sign. It should be noted
28# that 6-10x improvement coefficient does not actually mean
29# something extraordinary in terms of absolute [single-threaded]
30# performance, as SPARCv9 instruction set is by all means least
31# suitable for high performance crypto among other 64 bit
32# platforms. 6-10x factor simply places T1 in same performance
33# domain as say AMD64 and IA-64. Improvement of RSA verify don't
34# appear impressive at all, but it's the sign operation which is
35# far more critical/interesting.
36
37# You might notice that inner loops are modulo-scheduled:-) This has
38# essentially negligible impact on UltraSPARC performance, it's
39# Fujitsu SPARC64 V users who should notice and hopefully appreciate
40# the advantage... Currently this module surpasses sparcv9a-mont.pl
41# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
42# module still have hidden potential [see TODO list there], which is
43# estimated to be larger than 20%...
44
45# int bn_mul_mont(
46$rp="%i0"; # BN_ULONG *rp,
47$ap="%i1"; # const BN_ULONG *ap,
48$bp="%i2"; # const BN_ULONG *bp,
49$np="%i3"; # const BN_ULONG *np,
50$n0="%i4"; # const BN_ULONG *n0,
51$num="%i5"; # int num);
52
53$bits=32;
54for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
55if ($bits==64) { $bias=2047; $frame=192; }
56else { $bias=0; $frame=128; }
57
58$car0="%o0";
59$car1="%o1";
60$car2="%o2"; # 1 bit
61$acc0="%o3";
62$acc1="%o4";
63$mask="%g1"; # 32 bits, what a waste...
64$tmp0="%g4";
65$tmp1="%g5";
66
67$i="%l0";
68$j="%l1";
69$mul0="%l2";
70$mul1="%l3";
71$tp="%l4";
72$apj="%l5";
73$npj="%l6";
74$tpj="%l7";
75
76$fname="bn_mul_mont_int";
77
78$code=<<___;
79.section ".text",#alloc,#execinstr
80
81.global $fname
82.align 32
83$fname:
84 cmp %o5,4 ! 128 bits minimum
85 bge,pt %icc,.Lenter
86 sethi %hi(0xffffffff),$mask
87 retl
88 clr %o0
89.align 32
90.Lenter:
91 save %sp,-$frame,%sp
92 sll $num,2,$num ! num*=4
93 or $mask,%lo(0xffffffff),$mask
94 ld [$n0],$n0
95 cmp $ap,$bp
96 and $num,$mask,$num
97 ld [$bp],$mul0 ! bp[0]
98 nop
99
100 add %sp,$bias,%o7 ! real top of stack
101 ld [$ap],$car0 ! ap[0] ! redundant in squaring context
102 sub %o7,$num,%o7
103 ld [$ap+4],$apj ! ap[1]
104 and %o7,-1024,%o7
105 ld [$np],$car1 ! np[0]
106 sub %o7,$bias,%sp ! alloca
107 ld [$np+4],$npj ! np[1]
108 be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
109 mov 12,$j
110
111 mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
112 mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
113 and $car0,$mask,$acc0
114 add %sp,$bias+$frame,$tp
115 ld [$ap+8],$apj !prologue!
116
117 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
118 and $mul1,$mask,$mul1
119
120 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
121 mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
122 srlx $car0,32,$car0
123 add $acc0,$car1,$car1
124 ld [$np+8],$npj !prologue!
125 srlx $car1,32,$car1
126 mov $tmp0,$acc0 !prologue!
127
128.L1st:
129 mulx $apj,$mul0,$tmp0
130 mulx $npj,$mul1,$tmp1
131 add $acc0,$car0,$car0
132 ld [$ap+$j],$apj ! ap[j]
133 and $car0,$mask,$acc0
134 add $acc1,$car1,$car1
135 ld [$np+$j],$npj ! np[j]
136 srlx $car0,32,$car0
137 add $acc0,$car1,$car1
138 add $j,4,$j ! j++
139 mov $tmp0,$acc0
140 st $car1,[$tp]
141 cmp $j,$num
142 mov $tmp1,$acc1
143 srlx $car1,32,$car1
144 bl %icc,.L1st
145 add $tp,4,$tp ! tp++
146!.L1st
147
148 mulx $apj,$mul0,$tmp0 !epilogue!
149 mulx $npj,$mul1,$tmp1
150 add $acc0,$car0,$car0
151 and $car0,$mask,$acc0
152 add $acc1,$car1,$car1
153 srlx $car0,32,$car0
154 add $acc0,$car1,$car1
155 st $car1,[$tp]
156 srlx $car1,32,$car1
157
158 add $tmp0,$car0,$car0
159 and $car0,$mask,$acc0
160 add $tmp1,$car1,$car1
161 srlx $car0,32,$car0
162 add $acc0,$car1,$car1
163 st $car1,[$tp+4]
164 srlx $car1,32,$car1
165
166 add $car0,$car1,$car1
167 st $car1,[$tp+8]
168 srlx $car1,32,$car2
169
170 mov 4,$i ! i++
171 ld [$bp+4],$mul0 ! bp[1]
172.Louter:
173 add %sp,$bias+$frame,$tp
174 ld [$ap],$car0 ! ap[0]
175 ld [$ap+4],$apj ! ap[1]
176 ld [$np],$car1 ! np[0]
177 ld [$np+4],$npj ! np[1]
178 ld [$tp],$tmp1 ! tp[0]
179 ld [$tp+4],$tpj ! tp[1]
180 mov 12,$j
181
182 mulx $car0,$mul0,$car0
183 mulx $apj,$mul0,$tmp0 !prologue!
184 add $tmp1,$car0,$car0
185 ld [$ap+8],$apj !prologue!
186 and $car0,$mask,$acc0
187
188 mulx $n0,$acc0,$mul1
189 and $mul1,$mask,$mul1
190
191 mulx $car1,$mul1,$car1
192 mulx $npj,$mul1,$acc1 !prologue!
193 srlx $car0,32,$car0
194 add $acc0,$car1,$car1
195 ld [$np+8],$npj !prologue!
196 srlx $car1,32,$car1
197 mov $tmp0,$acc0 !prologue!
198
199.Linner:
200 mulx $apj,$mul0,$tmp0
201 mulx $npj,$mul1,$tmp1
202 add $tpj,$car0,$car0
203 ld [$ap+$j],$apj ! ap[j]
204 add $acc0,$car0,$car0
205 add $acc1,$car1,$car1
206 ld [$np+$j],$npj ! np[j]
207 and $car0,$mask,$acc0
208 ld [$tp+8],$tpj ! tp[j]
209 srlx $car0,32,$car0
210 add $acc0,$car1,$car1
211 add $j,4,$j ! j++
212 mov $tmp0,$acc0
213 st $car1,[$tp] ! tp[j-1]
214 srlx $car1,32,$car1
215 mov $tmp1,$acc1
216 cmp $j,$num
217 bl %icc,.Linner
218 add $tp,4,$tp ! tp++
219!.Linner
220
221 mulx $apj,$mul0,$tmp0 !epilogue!
222 mulx $npj,$mul1,$tmp1
223 add $tpj,$car0,$car0
224 add $acc0,$car0,$car0
225 ld [$tp+8],$tpj ! tp[j]
226 and $car0,$mask,$acc0
227 add $acc1,$car1,$car1
228 srlx $car0,32,$car0
229 add $acc0,$car1,$car1
230 st $car1,[$tp] ! tp[j-1]
231 srlx $car1,32,$car1
232
233 add $tpj,$car0,$car0
234 add $tmp0,$car0,$car0
235 and $car0,$mask,$acc0
236 add $tmp1,$car1,$car1
237 add $acc0,$car1,$car1
238 st $car1,[$tp+4] ! tp[j-1]
239 srlx $car0,32,$car0
240 add $i,4,$i ! i++
241 srlx $car1,32,$car1
242
243 add $car0,$car1,$car1
244 cmp $i,$num
245 add $car2,$car1,$car1
246 st $car1,[$tp+8]
247
248 srlx $car1,32,$car2
249 bl,a %icc,.Louter
250 ld [$bp+$i],$mul0 ! bp[i]
251!.Louter
252
253 add $tp,12,$tp
254
255.Ltail:
256 add $np,$num,$np
257 add $rp,$num,$rp
258 mov $tp,$ap
259 sub %g0,$num,%o7 ! k=-num
260 ba .Lsub
261 subcc %g0,%g0,%g0 ! clear %icc.c
262.align 16
263.Lsub:
264 ld [$tp+%o7],%o0
265 ld [$np+%o7],%o1
266 subccc %o0,%o1,%o1 ! tp[j]-np[j]
267 add $rp,%o7,$i
268 add %o7,4,%o7
269 brnz %o7,.Lsub
270 st %o1,[$i]
271 subc $car2,0,$car2 ! handle upmost overflow bit
272 and $tp,$car2,$ap
273 andn $rp,$car2,$np
274 or $ap,$np,$ap
275 sub %g0,$num,%o7
276
277.Lcopy:
278 ld [$ap+%o7],%o0 ! copy or in-place refresh
279 st %g0,[$tp+%o7] ! zap tp
280 st %o0,[$rp+%o7]
281 add %o7,4,%o7
282 brnz %o7,.Lcopy
283 nop
284 mov 1,%i0
285 ret
286 restore
287___
288
289########
290######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
291######## code without following dedicated squaring procedure.
292########
293$sbit="%i2"; # re-use $bp!
294
295$code.=<<___;
296.align 32
297.Lbn_sqr_mont:
298 mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
299 mulx $apj,$mul0,$tmp0 !prologue!
300 and $car0,$mask,$acc0
301 add %sp,$bias+$frame,$tp
302 ld [$ap+8],$apj !prologue!
303
304 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
305 srlx $car0,32,$car0
306 and $mul1,$mask,$mul1
307
308 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
309 mulx $npj,$mul1,$acc1 !prologue!
310 and $car0,1,$sbit
311 ld [$np+8],$npj !prologue!
312 srlx $car0,1,$car0
313 add $acc0,$car1,$car1
314 srlx $car1,32,$car1
315 mov $tmp0,$acc0 !prologue!
316
317.Lsqr_1st:
318 mulx $apj,$mul0,$tmp0
319 mulx $npj,$mul1,$tmp1
320 add $acc0,$car0,$car0 ! ap[j]*a0+c0
321 add $acc1,$car1,$car1
322 ld [$ap+$j],$apj ! ap[j]
323 and $car0,$mask,$acc0
324 ld [$np+$j],$npj ! np[j]
325 srlx $car0,32,$car0
326 add $acc0,$acc0,$acc0
327 or $sbit,$acc0,$acc0
328 mov $tmp1,$acc1
329 srlx $acc0,32,$sbit
330 add $j,4,$j ! j++
331 and $acc0,$mask,$acc0
332 cmp $j,$num
333 add $acc0,$car1,$car1
334 st $car1,[$tp]
335 mov $tmp0,$acc0
336 srlx $car1,32,$car1
337 bl %icc,.Lsqr_1st
338 add $tp,4,$tp ! tp++
339!.Lsqr_1st
340
341 mulx $apj,$mul0,$tmp0 ! epilogue
342 mulx $npj,$mul1,$tmp1
343 add $acc0,$car0,$car0 ! ap[j]*a0+c0
344 add $acc1,$car1,$car1
345 and $car0,$mask,$acc0
346 srlx $car0,32,$car0
347 add $acc0,$acc0,$acc0
348 or $sbit,$acc0,$acc0
349 srlx $acc0,32,$sbit
350 and $acc0,$mask,$acc0
351 add $acc0,$car1,$car1
352 st $car1,[$tp]
353 srlx $car1,32,$car1
354
355 add $tmp0,$car0,$car0 ! ap[j]*a0+c0
356 add $tmp1,$car1,$car1
357 and $car0,$mask,$acc0
358 srlx $car0,32,$car0
359 add $acc0,$acc0,$acc0
360 or $sbit,$acc0,$acc0
361 srlx $acc0,32,$sbit
362 and $acc0,$mask,$acc0
363 add $acc0,$car1,$car1
364 st $car1,[$tp+4]
365 srlx $car1,32,$car1
366
367 add $car0,$car0,$car0
368 or $sbit,$car0,$car0
369 add $car0,$car1,$car1
370 st $car1,[$tp+8]
371 srlx $car1,32,$car2
372
373 ld [%sp+$bias+$frame],$tmp0 ! tp[0]
374 ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
375 ld [%sp+$bias+$frame+8],$tpj ! tp[2]
376 ld [$ap+4],$mul0 ! ap[1]
377 ld [$ap+8],$apj ! ap[2]
378 ld [$np],$car1 ! np[0]
379 ld [$np+4],$npj ! np[1]
380 mulx $n0,$tmp0,$mul1
381
382 mulx $mul0,$mul0,$car0
383 and $mul1,$mask,$mul1
384
385 mulx $car1,$mul1,$car1
386 mulx $npj,$mul1,$acc1
387 add $tmp0,$car1,$car1
388 and $car0,$mask,$acc0
389 ld [$np+8],$npj ! np[2]
390 srlx $car1,32,$car1
391 add $tmp1,$car1,$car1
392 srlx $car0,32,$car0
393 add $acc0,$car1,$car1
394 and $car0,1,$sbit
395 add $acc1,$car1,$car1
396 srlx $car0,1,$car0
397 mov 12,$j
398 st $car1,[%sp+$bias+$frame] ! tp[0]=
399 srlx $car1,32,$car1
400 add %sp,$bias+$frame+4,$tp
401
402.Lsqr_2nd:
403 mulx $apj,$mul0,$acc0
404 mulx $npj,$mul1,$acc1
405 add $acc0,$car0,$car0
406 add $tpj,$car1,$car1
407 ld [$ap+$j],$apj ! ap[j]
408 and $car0,$mask,$acc0
409 ld [$np+$j],$npj ! np[j]
410 srlx $car0,32,$car0
411 add $acc1,$car1,$car1
412 ld [$tp+8],$tpj ! tp[j]
413 add $acc0,$acc0,$acc0
414 add $j,4,$j ! j++
415 or $sbit,$acc0,$acc0
416 srlx $acc0,32,$sbit
417 and $acc0,$mask,$acc0
418 cmp $j,$num
419 add $acc0,$car1,$car1
420 st $car1,[$tp] ! tp[j-1]
421 srlx $car1,32,$car1
422 bl %icc,.Lsqr_2nd
423 add $tp,4,$tp ! tp++
424!.Lsqr_2nd
425
426 mulx $apj,$mul0,$acc0
427 mulx $npj,$mul1,$acc1
428 add $acc0,$car0,$car0
429 add $tpj,$car1,$car1
430 and $car0,$mask,$acc0
431 srlx $car0,32,$car0
432 add $acc1,$car1,$car1
433 add $acc0,$acc0,$acc0
434 or $sbit,$acc0,$acc0
435 srlx $acc0,32,$sbit
436 and $acc0,$mask,$acc0
437 add $acc0,$car1,$car1
438 st $car1,[$tp] ! tp[j-1]
439 srlx $car1,32,$car1
440
441 add $car0,$car0,$car0
442 or $sbit,$car0,$car0
443 add $car0,$car1,$car1
444 add $car2,$car1,$car1
445 st $car1,[$tp+4]
446 srlx $car1,32,$car2
447
448 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
449 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
450 ld [$ap+8],$mul0 ! ap[2]
451 ld [$np],$car1 ! np[0]
452 ld [$np+4],$npj ! np[1]
453 mulx $n0,$tmp1,$mul1
454 and $mul1,$mask,$mul1
455 mov 8,$i
456
457 mulx $mul0,$mul0,$car0
458 mulx $car1,$mul1,$car1
459 and $car0,$mask,$acc0
460 add $tmp1,$car1,$car1
461 srlx $car0,32,$car0
462 add %sp,$bias+$frame,$tp
463 srlx $car1,32,$car1
464 and $car0,1,$sbit
465 srlx $car0,1,$car0
466 mov 4,$j
467
468.Lsqr_outer:
469.Lsqr_inner1:
470 mulx $npj,$mul1,$acc1
471 add $tpj,$car1,$car1
472 add $j,4,$j
473 ld [$tp+8],$tpj
474 cmp $j,$i
475 add $acc1,$car1,$car1
476 ld [$np+$j],$npj
477 st $car1,[$tp]
478 srlx $car1,32,$car1
479 bl %icc,.Lsqr_inner1
480 add $tp,4,$tp
481!.Lsqr_inner1
482
483 add $j,4,$j
484 ld [$ap+$j],$apj ! ap[j]
485 mulx $npj,$mul1,$acc1
486 add $tpj,$car1,$car1
487 ld [$np+$j],$npj ! np[j]
488 add $acc0,$car1,$car1
489 ld [$tp+8],$tpj ! tp[j]
490 add $acc1,$car1,$car1
491 st $car1,[$tp]
492 srlx $car1,32,$car1
493
494 add $j,4,$j
495 cmp $j,$num
496 be,pn %icc,.Lsqr_no_inner2
497 add $tp,4,$tp
498
499.Lsqr_inner2:
500 mulx $apj,$mul0,$acc0
501 mulx $npj,$mul1,$acc1
502 add $tpj,$car1,$car1
503 add $acc0,$car0,$car0
504 ld [$ap+$j],$apj ! ap[j]
505 and $car0,$mask,$acc0
506 ld [$np+$j],$npj ! np[j]
507 srlx $car0,32,$car0
508 add $acc0,$acc0,$acc0
509 ld [$tp+8],$tpj ! tp[j]
510 or $sbit,$acc0,$acc0
511 add $j,4,$j ! j++
512 srlx $acc0,32,$sbit
513 and $acc0,$mask,$acc0
514 cmp $j,$num
515 add $acc0,$car1,$car1
516 add $acc1,$car1,$car1
517 st $car1,[$tp] ! tp[j-1]
518 srlx $car1,32,$car1
519 bl %icc,.Lsqr_inner2
520 add $tp,4,$tp ! tp++
521
522.Lsqr_no_inner2:
523 mulx $apj,$mul0,$acc0
524 mulx $npj,$mul1,$acc1
525 add $tpj,$car1,$car1
526 add $acc0,$car0,$car0
527 and $car0,$mask,$acc0
528 srlx $car0,32,$car0
529 add $acc0,$acc0,$acc0
530 or $sbit,$acc0,$acc0
531 srlx $acc0,32,$sbit
532 and $acc0,$mask,$acc0
533 add $acc0,$car1,$car1
534 add $acc1,$car1,$car1
535 st $car1,[$tp] ! tp[j-1]
536 srlx $car1,32,$car1
537
538 add $car0,$car0,$car0
539 or $sbit,$car0,$car0
540 add $car0,$car1,$car1
541 add $car2,$car1,$car1
542 st $car1,[$tp+4]
543 srlx $car1,32,$car2
544
545 add $i,4,$i ! i++
546 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
547 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
548 ld [$ap+$i],$mul0 ! ap[j]
549 ld [$np],$car1 ! np[0]
550 ld [$np+4],$npj ! np[1]
551 mulx $n0,$tmp1,$mul1
552 and $mul1,$mask,$mul1
553 add $i,4,$tmp0
554
555 mulx $mul0,$mul0,$car0
556 mulx $car1,$mul1,$car1
557 and $car0,$mask,$acc0
558 add $tmp1,$car1,$car1
559 srlx $car0,32,$car0
560 add %sp,$bias+$frame,$tp
561 srlx $car1,32,$car1
562 and $car0,1,$sbit
563 srlx $car0,1,$car0
564
565 cmp $tmp0,$num ! i<num-1
566 bl %icc,.Lsqr_outer
567 mov 4,$j
568
569.Lsqr_last:
570 mulx $npj,$mul1,$acc1
571 add $tpj,$car1,$car1
572 add $j,4,$j
573 ld [$tp+8],$tpj
574 cmp $j,$i
575 add $acc1,$car1,$car1
576 ld [$np+$j],$npj
577 st $car1,[$tp]
578 srlx $car1,32,$car1
579 bl %icc,.Lsqr_last
580 add $tp,4,$tp
581!.Lsqr_last
582
583 mulx $npj,$mul1,$acc1
584 add $tpj,$car1,$car1
585 add $acc0,$car1,$car1
586 add $acc1,$car1,$car1
587 st $car1,[$tp]
588 srlx $car1,32,$car1
589
590 add $car0,$car0,$car0 ! recover $car0
591 or $sbit,$car0,$car0
592 add $car0,$car1,$car1
593 add $car2,$car1,$car1
594 st $car1,[$tp+4]
595 srlx $car1,32,$car2
596
597 ba .Ltail
598 add $tp,8,$tp
599.type $fname,#function
600.size $fname,(.-$fname)
601.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
602.align 32
603___
604$code =~ s/\`([^\`]*)\`/eval($1)/gem;
605print $code;
606close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl b/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl
deleted file mode 100755
index a14205f2f0..0000000000
--- a/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl
+++ /dev/null
@@ -1,882 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# October 2005
11#
12# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
13# Because unlike integer multiplier, which simply stalls whole CPU,
14# FPU is fully pipelined and can effectively emit 48 bit partial
15# product every cycle. Why not blended SPARC v9? One can argue that
16# making this module dependent on UltraSPARC VIS extension limits its
17# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
18# implementations from compatibility matrix. But the rest, whole Sun
19# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
20# VIS extension instructions used in this module. This is considered
21# good enough to not care about HAL SPARC64 users [if any] who have
22# integer-only pure SPARCv9 module to "fall down" to.
23
24# USI&II cores currently exhibit uniform 2x improvement [over pre-
25# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
26# performance improves few percents for shorter keys and worsens few
27# percents for longer keys. This is because USIII integer multiplier
28# is >3x faster than USI&II one, which is harder to match [but see
29# TODO list below]. It should also be noted that SPARC64 V features
30# out-of-order execution, which *might* mean that integer multiplier
31# is pipelined, which in turn *might* be impossible to match... On
32# additional note, SPARC64 V implements FP Multiply-Add instruction,
33# which is perfectly usable in this context... In other words, as far
34# as Fujitsu SPARC64 V goes, talk to the author:-)
35
36# The implementation implies following "non-natural" limitations on
37# input arguments:
38# - num may not be less than 4;
39# - num has to be even;
40# Failure to meet either condition has no fatal effects, simply
41# doesn't give any performance gain.
42
43# TODO:
44# - modulo-schedule inner loop for better performance (on in-order
45# execution core such as UltraSPARC this shall result in further
46# noticeable(!) improvement);
47# - dedicated squaring procedure[?];
48
49######################################################################
50# November 2006
51#
52# Modulo-scheduled inner loops allow to interleave floating point and
53# integer instructions and minimize Read-After-Write penalties. This
54# results in *further* 20-50% perfromance improvement [depending on
55# key length, more for longer keys] on USI&II cores and 30-80% - on
56# USIII&IV.
57
58$fname="bn_mul_mont_fpu";
59$bits=32;
60for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
61
62if ($bits==64) {
63 $bias=2047;
64 $frame=192;
65} else {
66 $bias=0;
67 $frame=128; # 96 rounded up to largest known cache-line
68}
69$locals=64;
70
71# In order to provide for 32-/64-bit ABI duality, I keep integers wider
72# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
73# exclusively for pointers, indexes and other small values...
74# int bn_mul_mont(
75$rp="%i0"; # BN_ULONG *rp,
76$ap="%i1"; # const BN_ULONG *ap,
77$bp="%i2"; # const BN_ULONG *bp,
78$np="%i3"; # const BN_ULONG *np,
79$n0="%i4"; # const BN_ULONG *n0,
80$num="%i5"; # int num);
81
82$tp="%l0"; # t[num]
83$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
84$ap_h="%l2"; # to these four vectors as double-precision FP values.
85$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
86$np_h="%l4"; # loop and L1-cache aliasing is minimized...
87$i="%l5";
88$j="%l6";
89$mask="%l7"; # 16-bit mask, 0xffff
90
91$n0="%g4"; # reassigned(!) to "64-bit" register
92$carry="%i4"; # %i4 reused(!) for a carry bit
93
94# FP register naming chart
95#
96# ..HILO
97# dcba
98# --------
99# LOa
100# LOb
101# LOc
102# LOd
103# HIa
104# HIb
105# HIc
106# HId
107# ..a
108# ..b
109$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
110$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
111$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
112$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
113
114$dota="%f24"; $dotb="%f26";
115
116$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
117$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
118$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
119$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
120
121$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
122
123$code=<<___;
124.section ".text",#alloc,#execinstr
125
126.global $fname
127.align 32
128$fname:
129 save %sp,-$frame-$locals,%sp
130
131 cmp $num,4
132 bl,a,pn %icc,.Lret
133 clr %i0
134 andcc $num,1,%g0 ! $num has to be even...
135 bnz,a,pn %icc,.Lret
136 clr %i0 ! signal "unsupported input value"
137
138 srl $num,1,$num
139 sethi %hi(0xffff),$mask
140 ld [%i4+0],$n0 ! $n0 reassigned, remember?
141 or $mask,%lo(0xffff),$mask
142 ld [%i4+4],%o0
143 sllx %o0,32,%o0
144 or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
145
146 sll $num,3,$num ! num*=8
147
148 add %sp,$bias,%o0 ! real top of stack
149 sll $num,2,%o1
150 add %o1,$num,%o1 ! %o1=num*5
151 sub %o0,%o1,%o0
152 and %o0,-2048,%o0 ! optimize TLB utilization
153 sub %o0,$bias,%sp ! alloca(5*num*8)
154
155 rd %asi,%o7 ! save %asi
156 add %sp,$bias+$frame+$locals,$tp
157 add $tp,$num,$ap_l
158 add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
159 add $ap_l,$num,$ap_h
160 add $ap_h,$num,$np_l
161 add $np_l,$num,$np_h
162
163 wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
164
165 add $rp,$num,$rp ! readjust input pointers to point
166 add $ap,$num,$ap ! at the ends too...
167 add $bp,$num,$bp
168 add $np,$num,$np
169
170 stx %o7,[%sp+$bias+$frame+48] ! save %asi
171
172 sub %g0,$num,$i ! i=-num
173 sub %g0,$num,$j ! j=-num
174
175 add $ap,$j,%o3
176 add $bp,$i,%o4
177
178 ld [%o3+4],%g1 ! bp[0]
179 ld [%o3+0],%o0
180 ld [%o4+4],%g5 ! ap[0]
181 sllx %g1,32,%g1
182 ld [%o4+0],%o1
183 sllx %g5,32,%g5
184 or %g1,%o0,%o0
185 or %g5,%o1,%o1
186
187 add $np,$j,%o5
188
189 mulx %o1,%o0,%o0 ! ap[0]*bp[0]
190 mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
191 stx %o0,[%sp+$bias+$frame+0]
192
193 ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
194 fzeros $alo
195 ld [%o3+4],$ahi_
196 fzeros $ahi
197 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
198 fzeros $nlo
199 ld [%o5+4],$nhi_
200 fzeros $nhi
201
202 ! transfer b[i] to FPU as 4x16-bit values
203 ldda [%o4+2]%asi,$ba
204 fxtod $alo,$alo
205 ldda [%o4+0]%asi,$bb
206 fxtod $ahi,$ahi
207 ldda [%o4+6]%asi,$bc
208 fxtod $nlo,$nlo
209 ldda [%o4+4]%asi,$bd
210 fxtod $nhi,$nhi
211
212 ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
213 ldda [%sp+$bias+$frame+6]%asi,$na
214 fxtod $ba,$ba
215 ldda [%sp+$bias+$frame+4]%asi,$nb
216 fxtod $bb,$bb
217 ldda [%sp+$bias+$frame+2]%asi,$nc
218 fxtod $bc,$bc
219 ldda [%sp+$bias+$frame+0]%asi,$nd
220 fxtod $bd,$bd
221
222 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
223 fxtod $na,$na
224 std $ahi,[$ap_h+$j]
225 fxtod $nb,$nb
226 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
227 fxtod $nc,$nc
228 std $nhi,[$np_h+$j]
229 fxtod $nd,$nd
230
231 fmuld $alo,$ba,$aloa
232 fmuld $nlo,$na,$nloa
233 fmuld $alo,$bb,$alob
234 fmuld $nlo,$nb,$nlob
235 fmuld $alo,$bc,$aloc
236 faddd $aloa,$nloa,$nloa
237 fmuld $nlo,$nc,$nloc
238 fmuld $alo,$bd,$alod
239 faddd $alob,$nlob,$nlob
240 fmuld $nlo,$nd,$nlod
241 fmuld $ahi,$ba,$ahia
242 faddd $aloc,$nloc,$nloc
243 fmuld $nhi,$na,$nhia
244 fmuld $ahi,$bb,$ahib
245 faddd $alod,$nlod,$nlod
246 fmuld $nhi,$nb,$nhib
247 fmuld $ahi,$bc,$ahic
248 faddd $ahia,$nhia,$nhia
249 fmuld $nhi,$nc,$nhic
250 fmuld $ahi,$bd,$ahid
251 faddd $ahib,$nhib,$nhib
252 fmuld $nhi,$nd,$nhid
253
254 faddd $ahic,$nhic,$dota ! $nhic
255 faddd $ahid,$nhid,$dotb ! $nhid
256
257 faddd $nloc,$nhia,$nloc
258 faddd $nlod,$nhib,$nlod
259
260 fdtox $nloa,$nloa
261 fdtox $nlob,$nlob
262 fdtox $nloc,$nloc
263 fdtox $nlod,$nlod
264
265 std $nloa,[%sp+$bias+$frame+0]
266 add $j,8,$j
267 std $nlob,[%sp+$bias+$frame+8]
268 add $ap,$j,%o4
269 std $nloc,[%sp+$bias+$frame+16]
270 add $np,$j,%o5
271 std $nlod,[%sp+$bias+$frame+24]
272
273 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
274 fzeros $alo
275 ld [%o4+4],$ahi_
276 fzeros $ahi
277 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
278 fzeros $nlo
279 ld [%o5+4],$nhi_
280 fzeros $nhi
281
282 fxtod $alo,$alo
283 fxtod $ahi,$ahi
284 fxtod $nlo,$nlo
285 fxtod $nhi,$nhi
286
287 ldx [%sp+$bias+$frame+0],%o0
288 fmuld $alo,$ba,$aloa
289 ldx [%sp+$bias+$frame+8],%o1
290 fmuld $nlo,$na,$nloa
291 ldx [%sp+$bias+$frame+16],%o2
292 fmuld $alo,$bb,$alob
293 ldx [%sp+$bias+$frame+24],%o3
294 fmuld $nlo,$nb,$nlob
295
296 srlx %o0,16,%o7
297 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
298 fmuld $alo,$bc,$aloc
299 add %o7,%o1,%o1
300 std $ahi,[$ap_h+$j]
301 faddd $aloa,$nloa,$nloa
302 fmuld $nlo,$nc,$nloc
303 srlx %o1,16,%o7
304 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
305 fmuld $alo,$bd,$alod
306 add %o7,%o2,%o2
307 std $nhi,[$np_h+$j]
308 faddd $alob,$nlob,$nlob
309 fmuld $nlo,$nd,$nlod
310 srlx %o2,16,%o7
311 fmuld $ahi,$ba,$ahia
312 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
313 faddd $aloc,$nloc,$nloc
314 fmuld $nhi,$na,$nhia
315 !and %o0,$mask,%o0
316 !and %o1,$mask,%o1
317 !and %o2,$mask,%o2
318 !sllx %o1,16,%o1
319 !sllx %o2,32,%o2
320 !sllx %o3,48,%o7
321 !or %o1,%o0,%o0
322 !or %o2,%o0,%o0
323 !or %o7,%o0,%o0 ! 64-bit result
324 srlx %o3,16,%g1 ! 34-bit carry
325 fmuld $ahi,$bb,$ahib
326
327 faddd $alod,$nlod,$nlod
328 fmuld $nhi,$nb,$nhib
329 fmuld $ahi,$bc,$ahic
330 faddd $ahia,$nhia,$nhia
331 fmuld $nhi,$nc,$nhic
332 fmuld $ahi,$bd,$ahid
333 faddd $ahib,$nhib,$nhib
334 fmuld $nhi,$nd,$nhid
335
336 faddd $dota,$nloa,$nloa
337 faddd $dotb,$nlob,$nlob
338 faddd $ahic,$nhic,$dota ! $nhic
339 faddd $ahid,$nhid,$dotb ! $nhid
340
341 faddd $nloc,$nhia,$nloc
342 faddd $nlod,$nhib,$nlod
343
344 fdtox $nloa,$nloa
345 fdtox $nlob,$nlob
346 fdtox $nloc,$nloc
347 fdtox $nlod,$nlod
348
349 std $nloa,[%sp+$bias+$frame+0]
350 std $nlob,[%sp+$bias+$frame+8]
351 addcc $j,8,$j
352 std $nloc,[%sp+$bias+$frame+16]
353 bz,pn %icc,.L1stskip
354 std $nlod,[%sp+$bias+$frame+24]
355
356.align 32 ! incidentally already aligned !
357.L1st:
358 add $ap,$j,%o4
359 add $np,$j,%o5
360 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
361 fzeros $alo
362 ld [%o4+4],$ahi_
363 fzeros $ahi
364 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
365 fzeros $nlo
366 ld [%o5+4],$nhi_
367 fzeros $nhi
368
369 fxtod $alo,$alo
370 fxtod $ahi,$ahi
371 fxtod $nlo,$nlo
372 fxtod $nhi,$nhi
373
374 ldx [%sp+$bias+$frame+0],%o0
375 fmuld $alo,$ba,$aloa
376 ldx [%sp+$bias+$frame+8],%o1
377 fmuld $nlo,$na,$nloa
378 ldx [%sp+$bias+$frame+16],%o2
379 fmuld $alo,$bb,$alob
380 ldx [%sp+$bias+$frame+24],%o3
381 fmuld $nlo,$nb,$nlob
382
383 srlx %o0,16,%o7
384 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
385 fmuld $alo,$bc,$aloc
386 add %o7,%o1,%o1
387 std $ahi,[$ap_h+$j]
388 faddd $aloa,$nloa,$nloa
389 fmuld $nlo,$nc,$nloc
390 srlx %o1,16,%o7
391 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
392 fmuld $alo,$bd,$alod
393 add %o7,%o2,%o2
394 std $nhi,[$np_h+$j]
395 faddd $alob,$nlob,$nlob
396 fmuld $nlo,$nd,$nlod
397 srlx %o2,16,%o7
398 fmuld $ahi,$ba,$ahia
399 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
400 and %o0,$mask,%o0
401 faddd $aloc,$nloc,$nloc
402 fmuld $nhi,$na,$nhia
403 and %o1,$mask,%o1
404 and %o2,$mask,%o2
405 fmuld $ahi,$bb,$ahib
406 sllx %o1,16,%o1
407 faddd $alod,$nlod,$nlod
408 fmuld $nhi,$nb,$nhib
409 sllx %o2,32,%o2
410 fmuld $ahi,$bc,$ahic
411 sllx %o3,48,%o7
412 or %o1,%o0,%o0
413 faddd $ahia,$nhia,$nhia
414 fmuld $nhi,$nc,$nhic
415 or %o2,%o0,%o0
416 fmuld $ahi,$bd,$ahid
417 or %o7,%o0,%o0 ! 64-bit result
418 faddd $ahib,$nhib,$nhib
419 fmuld $nhi,$nd,$nhid
420 addcc %g1,%o0,%o0
421 faddd $dota,$nloa,$nloa
422 srlx %o3,16,%g1 ! 34-bit carry
423 faddd $dotb,$nlob,$nlob
424 bcs,a %xcc,.+8
425 add %g1,1,%g1
426
427 stx %o0,[$tp] ! tp[j-1]=
428
429 faddd $ahic,$nhic,$dota ! $nhic
430 faddd $ahid,$nhid,$dotb ! $nhid
431
432 faddd $nloc,$nhia,$nloc
433 faddd $nlod,$nhib,$nlod
434
435 fdtox $nloa,$nloa
436 fdtox $nlob,$nlob
437 fdtox $nloc,$nloc
438 fdtox $nlod,$nlod
439
440 std $nloa,[%sp+$bias+$frame+0]
441 std $nlob,[%sp+$bias+$frame+8]
442 std $nloc,[%sp+$bias+$frame+16]
443 std $nlod,[%sp+$bias+$frame+24]
444
445 addcc $j,8,$j
446 bnz,pt %icc,.L1st
447 add $tp,8,$tp
448
449.L1stskip:
450 fdtox $dota,$dota
451 fdtox $dotb,$dotb
452
453 ldx [%sp+$bias+$frame+0],%o0
454 ldx [%sp+$bias+$frame+8],%o1
455 ldx [%sp+$bias+$frame+16],%o2
456 ldx [%sp+$bias+$frame+24],%o3
457
458 srlx %o0,16,%o7
459 std $dota,[%sp+$bias+$frame+32]
460 add %o7,%o1,%o1
461 std $dotb,[%sp+$bias+$frame+40]
462 srlx %o1,16,%o7
463 add %o7,%o2,%o2
464 srlx %o2,16,%o7
465 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
466 and %o0,$mask,%o0
467 and %o1,$mask,%o1
468 and %o2,$mask,%o2
469 sllx %o1,16,%o1
470 sllx %o2,32,%o2
471 sllx %o3,48,%o7
472 or %o1,%o0,%o0
473 or %o2,%o0,%o0
474 or %o7,%o0,%o0 ! 64-bit result
475 ldx [%sp+$bias+$frame+32],%o4
476 addcc %g1,%o0,%o0
477 ldx [%sp+$bias+$frame+40],%o5
478 srlx %o3,16,%g1 ! 34-bit carry
479 bcs,a %xcc,.+8
480 add %g1,1,%g1
481
482 stx %o0,[$tp] ! tp[j-1]=
483 add $tp,8,$tp
484
485 srlx %o4,16,%o7
486 add %o7,%o5,%o5
487 and %o4,$mask,%o4
488 sllx %o5,16,%o7
489 or %o7,%o4,%o4
490 addcc %g1,%o4,%o4
491 srlx %o5,48,%g1
492 bcs,a %xcc,.+8
493 add %g1,1,%g1
494
495 mov %g1,$carry
496 stx %o4,[$tp] ! tp[num-1]=
497
498 ba .Louter
499 add $i,8,$i
500.align 32
501.Louter:
502 sub %g0,$num,$j ! j=-num
503 add %sp,$bias+$frame+$locals,$tp
504
505 add $ap,$j,%o3
506 add $bp,$i,%o4
507
508 ld [%o3+4],%g1 ! bp[i]
509 ld [%o3+0],%o0
510 ld [%o4+4],%g5 ! ap[0]
511 sllx %g1,32,%g1
512 ld [%o4+0],%o1
513 sllx %g5,32,%g5
514 or %g1,%o0,%o0
515 or %g5,%o1,%o1
516
517 ldx [$tp],%o2 ! tp[0]
518 mulx %o1,%o0,%o0
519 addcc %o2,%o0,%o0
520 mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
521 stx %o0,[%sp+$bias+$frame+0]
522
523 ! transfer b[i] to FPU as 4x16-bit values
524 ldda [%o4+2]%asi,$ba
525 ldda [%o4+0]%asi,$bb
526 ldda [%o4+6]%asi,$bc
527 ldda [%o4+4]%asi,$bd
528
529 ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
530 ldda [%sp+$bias+$frame+6]%asi,$na
531 fxtod $ba,$ba
532 ldda [%sp+$bias+$frame+4]%asi,$nb
533 fxtod $bb,$bb
534 ldda [%sp+$bias+$frame+2]%asi,$nc
535 fxtod $bc,$bc
536 ldda [%sp+$bias+$frame+0]%asi,$nd
537 fxtod $bd,$bd
538 ldd [$ap_l+$j],$alo ! load a[j] in double format
539 fxtod $na,$na
540 ldd [$ap_h+$j],$ahi
541 fxtod $nb,$nb
542 ldd [$np_l+$j],$nlo ! load n[j] in double format
543 fxtod $nc,$nc
544 ldd [$np_h+$j],$nhi
545 fxtod $nd,$nd
546
547 fmuld $alo,$ba,$aloa
548 fmuld $nlo,$na,$nloa
549 fmuld $alo,$bb,$alob
550 fmuld $nlo,$nb,$nlob
551 fmuld $alo,$bc,$aloc
552 faddd $aloa,$nloa,$nloa
553 fmuld $nlo,$nc,$nloc
554 fmuld $alo,$bd,$alod
555 faddd $alob,$nlob,$nlob
556 fmuld $nlo,$nd,$nlod
557 fmuld $ahi,$ba,$ahia
558 faddd $aloc,$nloc,$nloc
559 fmuld $nhi,$na,$nhia
560 fmuld $ahi,$bb,$ahib
561 faddd $alod,$nlod,$nlod
562 fmuld $nhi,$nb,$nhib
563 fmuld $ahi,$bc,$ahic
564 faddd $ahia,$nhia,$nhia
565 fmuld $nhi,$nc,$nhic
566 fmuld $ahi,$bd,$ahid
567 faddd $ahib,$nhib,$nhib
568 fmuld $nhi,$nd,$nhid
569
570 faddd $ahic,$nhic,$dota ! $nhic
571 faddd $ahid,$nhid,$dotb ! $nhid
572
573 faddd $nloc,$nhia,$nloc
574 faddd $nlod,$nhib,$nlod
575
576 fdtox $nloa,$nloa
577 fdtox $nlob,$nlob
578 fdtox $nloc,$nloc
579 fdtox $nlod,$nlod
580
581 std $nloa,[%sp+$bias+$frame+0]
582 std $nlob,[%sp+$bias+$frame+8]
583 std $nloc,[%sp+$bias+$frame+16]
584 add $j,8,$j
585 std $nlod,[%sp+$bias+$frame+24]
586
587 ldd [$ap_l+$j],$alo ! load a[j] in double format
588 ldd [$ap_h+$j],$ahi
589 ldd [$np_l+$j],$nlo ! load n[j] in double format
590 ldd [$np_h+$j],$nhi
591
592 fmuld $alo,$ba,$aloa
593 fmuld $nlo,$na,$nloa
594 fmuld $alo,$bb,$alob
595 fmuld $nlo,$nb,$nlob
596 fmuld $alo,$bc,$aloc
597 ldx [%sp+$bias+$frame+0],%o0
598 faddd $aloa,$nloa,$nloa
599 fmuld $nlo,$nc,$nloc
600 ldx [%sp+$bias+$frame+8],%o1
601 fmuld $alo,$bd,$alod
602 ldx [%sp+$bias+$frame+16],%o2
603 faddd $alob,$nlob,$nlob
604 fmuld $nlo,$nd,$nlod
605 ldx [%sp+$bias+$frame+24],%o3
606 fmuld $ahi,$ba,$ahia
607
608 srlx %o0,16,%o7
609 faddd $aloc,$nloc,$nloc
610 fmuld $nhi,$na,$nhia
611 add %o7,%o1,%o1
612 fmuld $ahi,$bb,$ahib
613 srlx %o1,16,%o7
614 faddd $alod,$nlod,$nlod
615 fmuld $nhi,$nb,$nhib
616 add %o7,%o2,%o2
617 fmuld $ahi,$bc,$ahic
618 srlx %o2,16,%o7
619 faddd $ahia,$nhia,$nhia
620 fmuld $nhi,$nc,$nhic
621 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
622 ! why?
623 and %o0,$mask,%o0
624 fmuld $ahi,$bd,$ahid
625 and %o1,$mask,%o1
626 and %o2,$mask,%o2
627 faddd $ahib,$nhib,$nhib
628 fmuld $nhi,$nd,$nhid
629 sllx %o1,16,%o1
630 faddd $dota,$nloa,$nloa
631 sllx %o2,32,%o2
632 faddd $dotb,$nlob,$nlob
633 sllx %o3,48,%o7
634 or %o1,%o0,%o0
635 faddd $ahic,$nhic,$dota ! $nhic
636 or %o2,%o0,%o0
637 faddd $ahid,$nhid,$dotb ! $nhid
638 or %o7,%o0,%o0 ! 64-bit result
639 ldx [$tp],%o7
640 faddd $nloc,$nhia,$nloc
641 addcc %o7,%o0,%o0
642 ! end-of-why?
643 faddd $nlod,$nhib,$nlod
644 srlx %o3,16,%g1 ! 34-bit carry
645 fdtox $nloa,$nloa
646 bcs,a %xcc,.+8
647 add %g1,1,%g1
648
649 fdtox $nlob,$nlob
650 fdtox $nloc,$nloc
651 fdtox $nlod,$nlod
652
653 std $nloa,[%sp+$bias+$frame+0]
654 std $nlob,[%sp+$bias+$frame+8]
655 addcc $j,8,$j
656 std $nloc,[%sp+$bias+$frame+16]
657 bz,pn %icc,.Linnerskip
658 std $nlod,[%sp+$bias+$frame+24]
659
660 ba .Linner
661 nop
662.align 32
663.Linner:
664 ldd [$ap_l+$j],$alo ! load a[j] in double format
665 ldd [$ap_h+$j],$ahi
666 ldd [$np_l+$j],$nlo ! load n[j] in double format
667 ldd [$np_h+$j],$nhi
668
669 fmuld $alo,$ba,$aloa
670 fmuld $nlo,$na,$nloa
671 fmuld $alo,$bb,$alob
672 fmuld $nlo,$nb,$nlob
673 fmuld $alo,$bc,$aloc
674 ldx [%sp+$bias+$frame+0],%o0
675 faddd $aloa,$nloa,$nloa
676 fmuld $nlo,$nc,$nloc
677 ldx [%sp+$bias+$frame+8],%o1
678 fmuld $alo,$bd,$alod
679 ldx [%sp+$bias+$frame+16],%o2
680 faddd $alob,$nlob,$nlob
681 fmuld $nlo,$nd,$nlod
682 ldx [%sp+$bias+$frame+24],%o3
683 fmuld $ahi,$ba,$ahia
684
685 srlx %o0,16,%o7
686 faddd $aloc,$nloc,$nloc
687 fmuld $nhi,$na,$nhia
688 add %o7,%o1,%o1
689 fmuld $ahi,$bb,$ahib
690 srlx %o1,16,%o7
691 faddd $alod,$nlod,$nlod
692 fmuld $nhi,$nb,$nhib
693 add %o7,%o2,%o2
694 fmuld $ahi,$bc,$ahic
695 srlx %o2,16,%o7
696 faddd $ahia,$nhia,$nhia
697 fmuld $nhi,$nc,$nhic
698 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
699 and %o0,$mask,%o0
700 fmuld $ahi,$bd,$ahid
701 and %o1,$mask,%o1
702 and %o2,$mask,%o2
703 faddd $ahib,$nhib,$nhib
704 fmuld $nhi,$nd,$nhid
705 sllx %o1,16,%o1
706 faddd $dota,$nloa,$nloa
707 sllx %o2,32,%o2
708 faddd $dotb,$nlob,$nlob
709 sllx %o3,48,%o7
710 or %o1,%o0,%o0
711 faddd $ahic,$nhic,$dota ! $nhic
712 or %o2,%o0,%o0
713 faddd $ahid,$nhid,$dotb ! $nhid
714 or %o7,%o0,%o0 ! 64-bit result
715 faddd $nloc,$nhia,$nloc
716 addcc %g1,%o0,%o0
717 ldx [$tp+8],%o7 ! tp[j]
718 faddd $nlod,$nhib,$nlod
719 srlx %o3,16,%g1 ! 34-bit carry
720 fdtox $nloa,$nloa
721 bcs,a %xcc,.+8
722 add %g1,1,%g1
723 fdtox $nlob,$nlob
724 addcc %o7,%o0,%o0
725 fdtox $nloc,$nloc
726 bcs,a %xcc,.+8
727 add %g1,1,%g1
728
729 stx %o0,[$tp] ! tp[j-1]
730 fdtox $nlod,$nlod
731
732 std $nloa,[%sp+$bias+$frame+0]
733 std $nlob,[%sp+$bias+$frame+8]
734 std $nloc,[%sp+$bias+$frame+16]
735 addcc $j,8,$j
736 std $nlod,[%sp+$bias+$frame+24]
737 bnz,pt %icc,.Linner
738 add $tp,8,$tp
739
740.Linnerskip:
741 fdtox $dota,$dota
742 fdtox $dotb,$dotb
743
744 ldx [%sp+$bias+$frame+0],%o0
745 ldx [%sp+$bias+$frame+8],%o1
746 ldx [%sp+$bias+$frame+16],%o2
747 ldx [%sp+$bias+$frame+24],%o3
748
749 srlx %o0,16,%o7
750 std $dota,[%sp+$bias+$frame+32]
751 add %o7,%o1,%o1
752 std $dotb,[%sp+$bias+$frame+40]
753 srlx %o1,16,%o7
754 add %o7,%o2,%o2
755 srlx %o2,16,%o7
756 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
757 and %o0,$mask,%o0
758 and %o1,$mask,%o1
759 and %o2,$mask,%o2
760 sllx %o1,16,%o1
761 sllx %o2,32,%o2
762 sllx %o3,48,%o7
763 or %o1,%o0,%o0
764 or %o2,%o0,%o0
765 ldx [%sp+$bias+$frame+32],%o4
766 or %o7,%o0,%o0 ! 64-bit result
767 ldx [%sp+$bias+$frame+40],%o5
768 addcc %g1,%o0,%o0
769 ldx [$tp+8],%o7 ! tp[j]
770 srlx %o3,16,%g1 ! 34-bit carry
771 bcs,a %xcc,.+8
772 add %g1,1,%g1
773
774 addcc %o7,%o0,%o0
775 bcs,a %xcc,.+8
776 add %g1,1,%g1
777
778 stx %o0,[$tp] ! tp[j-1]
779 add $tp,8,$tp
780
781 srlx %o4,16,%o7
782 add %o7,%o5,%o5
783 and %o4,$mask,%o4
784 sllx %o5,16,%o7
785 or %o7,%o4,%o4
786 addcc %g1,%o4,%o4
787 srlx %o5,48,%g1
788 bcs,a %xcc,.+8
789 add %g1,1,%g1
790
791 addcc $carry,%o4,%o4
792 stx %o4,[$tp] ! tp[num-1]
793 mov %g1,$carry
794 bcs,a %xcc,.+8
795 add $carry,1,$carry
796
797 addcc $i,8,$i
798 bnz %icc,.Louter
799 nop
800
801 add $tp,8,$tp ! adjust tp to point at the end
802 orn %g0,%g0,%g4
803 sub %g0,$num,%o7 ! n=-num
804 ba .Lsub
805 subcc %g0,%g0,%g0 ! clear %icc.c
806
807.align 32
808.Lsub:
809 ldx [$tp+%o7],%o0
810 add $np,%o7,%g1
811 ld [%g1+0],%o2
812 ld [%g1+4],%o3
813 srlx %o0,32,%o1
814 subccc %o0,%o2,%o2
815 add $rp,%o7,%g1
816 subccc %o1,%o3,%o3
817 st %o2,[%g1+0]
818 add %o7,8,%o7
819 brnz,pt %o7,.Lsub
820 st %o3,[%g1+4]
821 subc $carry,0,%g4
822 sub %g0,$num,%o7 ! n=-num
823 ba .Lcopy
824 nop
825
826.align 32
827.Lcopy:
828 ldx [$tp+%o7],%o0
829 add $rp,%o7,%g1
830 ld [%g1+0],%o2
831 ld [%g1+4],%o3
832 stx %g0,[$tp+%o7]
833 and %o0,%g4,%o0
834 srlx %o0,32,%o1
835 andn %o2,%g4,%o2
836 andn %o3,%g4,%o3
837 or %o2,%o0,%o0
838 or %o3,%o1,%o1
839 st %o0,[%g1+0]
840 add %o7,8,%o7
841 brnz,pt %o7,.Lcopy
842 st %o1,[%g1+4]
843 sub %g0,$num,%o7 ! n=-num
844
845.Lzap:
846 stx %g0,[$ap_l+%o7]
847 stx %g0,[$ap_h+%o7]
848 stx %g0,[$np_l+%o7]
849 stx %g0,[$np_h+%o7]
850 add %o7,8,%o7
851 brnz,pt %o7,.Lzap
852 nop
853
854 ldx [%sp+$bias+$frame+48],%o7
855 wr %g0,%o7,%asi ! restore %asi
856
857 mov 1,%i0
858.Lret:
859 ret
860 restore
861.type $fname,#function
862.size $fname,(.-$fname)
863.asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
864.align 32
865___
866
867$code =~ s/\`([^\`]*)\`/eval($1)/gem;
868
869# Below substitution makes it possible to compile without demanding
 870 # VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
871# dare to do this, because VIS capability is detected at run-time now
872# and this routine is not called on CPU not capable to execute it. Do
873# note that fzeros is not the only VIS dependency! Another dependency
874# is implicit and is just _a_ numerical value loaded to %asi register,
875# which assembler can't recognize as VIS specific...
876$code =~ s/fzeros\s+%f([0-9]+)/
877 sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
878 /gem;
879
880print $code;
881# flush
882close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/via-mont.pl b/src/lib/libcrypto/bn/asm/via-mont.pl
deleted file mode 100644
index c046a514c8..0000000000
--- a/src/lib/libcrypto/bn/asm/via-mont.pl
+++ /dev/null
@@ -1,242 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# Wrapper around 'rep montmul', VIA-specific instruction accessing
11# PadLock Montgomery Multiplier. The wrapper is designed as drop-in
12# replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9].
13#
14# Below are interleaved outputs from 'openssl speed rsa dsa' for 4
15# different software configurations on 1.5GHz VIA Esther processor.
16# Lines marked with "software integer" denote performance of hand-
17# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2"
18# refers to hand-coded SSE2 Montgomery multiplication procedure found
 19# in OpenSSL 0.9.9. "Hardware VIA SDK" refers to padlock_pmm routine from
20# Padlock SDK 2.0.1 available for download from VIA, which naturally
21# utilizes the magic 'repz montmul' instruction. And finally "hardware
22# this" refers to *this* implementation which also uses 'repz montmul'
23#
24# sign verify sign/s verify/s
25# rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer
26# rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2
27# rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK
28# rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this
29#
30# rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer
31# rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2
32# rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK
33# rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this
34#
35# rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer
36# rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2
37# rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK
38# rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this
39#
40# rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer
41# rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2
42# rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK
43# rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this
44#
45# dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer
46# dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2
47# dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK
48# dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this
49#
50# dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer
51# dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2
52# dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK
53# dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this
54#
55# dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer
56# dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2
57# dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK
58# dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this
59#
60# To give you some other reference point here is output for 2.4GHz P4
61# running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software
62# SSE2" in above terms.
63#
64# rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0
65# rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0
66# rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9
67# rsa 4096 bits 0.109770s 0.002379s 9.1 420.3
68# dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1
69# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0
70# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1
71#
72# Conclusions:
73# - VIA SDK leaves a *lot* of room for improvement (which this
74# implementation successfully fills:-);
75# - 'rep montmul' gives up to >3x performance improvement depending on
76# key length;
77# - in terms of absolute performance it delivers approximately as much
78# as modern out-of-order 32-bit cores [again, for longer keys].
79
80$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
81push(@INC,"${dir}","${dir}../../perlasm");
82require "x86asm.pl";
83
84&asm_init($ARGV[0],"via-mont.pl");
85
86# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
87$func="bn_mul_mont_padlock";
88
89$pad=16*1; # amount of reserved bytes on top of every vector
90
91# stack layout
92$mZeroPrime=&DWP(0,"esp"); # these are specified by VIA
93$A=&DWP(4,"esp");
94$B=&DWP(8,"esp");
95$T=&DWP(12,"esp");
96$M=&DWP(16,"esp");
97$scratch=&DWP(20,"esp");
98$rp=&DWP(24,"esp"); # these are mine
99$sp=&DWP(28,"esp");
100# &DWP(32,"esp") # 32 byte scratch area
101# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num]
102# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num]
103# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num]
104# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num]
105# Note that SDK suggests to unconditionally allocate 2K per vector. This
106# has quite an impact on performance. It naturally depends on key length,
107# but to give an example 1024 bit private RSA key operations suffer >30%
108# penalty. I allocate only as much as actually required...
109
110&function_begin($func);
111 &xor ("eax","eax");
112 &mov ("ecx",&wparam(5)); # num
113 # meet VIA's limitations for num [note that the specification
114 # expresses them in bits, while we work with amount of 32-bit words]
115 &test ("ecx",3);
116 &jnz (&label("leave")); # num % 4 != 0
117 &cmp ("ecx",8);
118 &jb (&label("leave")); # num < 8
119 &cmp ("ecx",1024);
120 &ja (&label("leave")); # num > 1024
121
122 &pushf ();
123 &cld ();
124
125 &mov ("edi",&wparam(0)); # rp
126 &mov ("eax",&wparam(1)); # ap
127 &mov ("ebx",&wparam(2)); # bp
128 &mov ("edx",&wparam(3)); # np
129 &mov ("esi",&wparam(4)); # n0
130 &mov ("esi",&DWP(0,"esi")); # *n0
131
132 &lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes
133 &lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes
134 &neg ("ebp");
135 &add ("ebp","esp");
136 &and ("ebp",-64); # align to cache-line
137 &xchg ("ebp","esp"); # alloca
138
139 &mov ($rp,"edi"); # save rp
140 &mov ($sp,"ebp"); # save esp
141
142 &mov ($mZeroPrime,"esi");
143 &lea ("esi",&DWP(64,"esp")); # tp
144 &mov ($T,"esi");
145 &lea ("edi",&DWP(32,"esp")); # scratch area
146 &mov ($scratch,"edi");
147 &mov ("esi","eax");
148
149 &lea ("ebp",&DWP(-$pad,"ecx"));
150 &shr ("ebp",2); # restore original num value in ebp
151
152 &xor ("eax","eax");
153
154 &mov ("ecx","ebp");
155 &lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
156 &data_byte(0xf3,0xab); # rep stosl, bzero
157
158 &mov ("ecx","ebp");
159 &lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
160 &mov ($A,"edi");
161 &data_byte(0xf3,0xa5); # rep movsl, memcpy
162 &mov ("ecx",$pad/4);
163 &data_byte(0xf3,0xab); # rep stosl, bzero pad
164 # edi points at the end of padded ap copy...
165
166 &mov ("ecx","ebp");
167 &mov ("esi","ebx");
168 &mov ($B,"edi");
169 &data_byte(0xf3,0xa5); # rep movsl, memcpy
170 &mov ("ecx",$pad/4);
171 &data_byte(0xf3,0xab); # rep stosl, bzero pad
172 # edi points at the end of padded bp copy...
173
174 &mov ("ecx","ebp");
175 &mov ("esi","edx");
176 &mov ($M,"edi");
177 &data_byte(0xf3,0xa5); # rep movsl, memcpy
178 &mov ("ecx",$pad/4);
179 &data_byte(0xf3,0xab); # rep stosl, bzero pad
180 # edi points at the end of padded np copy...
181
182 # let magic happen...
183 &mov ("ecx","ebp");
184 &mov ("esi","esp");
185 &shl ("ecx",5); # convert word counter to bit counter
186 &align (4);
187 &data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
188
189 &mov ("ecx","ebp");
190 &lea ("esi",&DWP(64,"esp")); # tp
191 # edi still points at the end of padded np copy...
192 &neg ("ebp");
193 &lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind"
194 &mov ("edi",$rp); # restore rp
195 &xor ("edx","edx"); # i=0 and clear CF
196
197&set_label("sub",8);
198 &mov ("eax",&DWP(0,"esi","edx",4));
199 &sbb ("eax",&DWP(0,"ebp","edx",4));
200 &mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i]
201 &lea ("edx",&DWP(1,"edx")); # i++
202 &loop (&label("sub")); # doesn't affect CF!
203
204 &mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit
205 &sbb ("eax",0);
206 &and ("esi","eax");
207 &not ("eax");
208 &mov ("ebp","edi");
209 &and ("ebp","eax");
210 &or ("esi","ebp"); # tp=carry?tp:rp
211
212 &mov ("ecx","edx"); # num
213 &xor ("edx","edx"); # i=0
214
215&set_label("copy",8);
216 &mov ("eax",&DWP(0,"esi","edx",4));
217 &mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp
218 &mov (&DWP(0,"edi","edx",4),"eax");
219 &lea ("edx",&DWP(1,"edx")); # i++
220 &loop (&label("copy"));
221
222 &mov ("ebp",$sp);
223 &xor ("eax","eax");
224
225 &mov ("ecx",64/4);
226 &mov ("edi","esp"); # zap frame including scratch area
227 &data_byte(0xf3,0xab); # rep stosl, bzero
228
229 # zap copies of ap, bp and np
230 &lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
231 &lea ("ecx",&DWP(3*$pad/4,"edx","edx",2));
232 &data_byte(0xf3,0xab); # rep stosl, bzero
233
234 &mov ("esp","ebp");
235 &inc ("eax"); # signal "done"
236 &popf ();
237&set_label("leave");
238&function_end($func);
239
240&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
241
242&asm_finish();
diff --git a/src/lib/libcrypto/bn/asm/x86-gf2m.pl b/src/lib/libcrypto/bn/asm/x86-gf2m.pl
deleted file mode 100644
index 808a1e5969..0000000000
--- a/src/lib/libcrypto/bn/asm/x86-gf2m.pl
+++ /dev/null
@@ -1,313 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# May 2011
11#
12# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
13# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
14# the time being... Except that it has three code paths: pure integer
15# code suitable for any x86 CPU, MMX code suitable for PIII and later
16# and PCLMULQDQ suitable for Westmere and later. Improvement varies
17# from one benchmark and µ-arch to another. Below are interval values
18# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
19# code:
20#
21# PIII 16%-30%
22# P4 12%-12%
23# Opteron 18%-40%
24# Core2 19%-44%
25# Atom 38%-64%
26# Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX)
27# Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX)
28#
29# Note that above improvement coefficients are not coefficients for
30# bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result
31# of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark
32# is more and more dominated by other subroutines, most notably by
33# BN_GF2m_mod[_mul]_arr...
34
35$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
36push(@INC,"${dir}","${dir}../../perlasm");
37require "x86asm.pl";
38
39&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");
40
41$sse2=0;
42for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
43
44&external_label("OPENSSL_ia32cap_P") if ($sse2);
45
46$a="eax";
47$b="ebx";
48($a1,$a2,$a4)=("ecx","edx","ebp");
49
50$R="mm0";
51@T=("mm1","mm2");
52($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
53@i=("esi","edi");
54
55 if (!$x86only) {
56&function_begin_B("_mul_1x1_mmx");
57 &sub ("esp",32+4);
58 &mov ($a1,$a);
59 &lea ($a2,&DWP(0,$a,$a));
60 &and ($a1,0x3fffffff);
61 &lea ($a4,&DWP(0,$a2,$a2));
62 &mov (&DWP(0*4,"esp"),0);
63 &and ($a2,0x7fffffff);
64 &movd ($A,$a);
65 &movd ($B,$b);
66 &mov (&DWP(1*4,"esp"),$a1); # a1
67 &xor ($a1,$a2); # a1^a2
68 &pxor ($B31,$B31);
69 &pxor ($B30,$B30);
70 &mov (&DWP(2*4,"esp"),$a2); # a2
71 &xor ($a2,$a4); # a2^a4
72 &mov (&DWP(3*4,"esp"),$a1); # a1^a2
73 &pcmpgtd($B31,$A); # broadcast 31st bit
74 &paddd ($A,$A); # $A<<=1
75 &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
76 &mov (&DWP(4*4,"esp"),$a4); # a4
77 &xor ($a4,$a2); # a2=a4^a2^a4
78 &pand ($B31,$B);
79 &pcmpgtd($B30,$A); # broadcast 30th bit
80 &mov (&DWP(5*4,"esp"),$a1); # a1^a4
81 &xor ($a4,$a1); # a1^a2^a4
82 &psllq ($B31,31);
83 &pand ($B30,$B);
84 &mov (&DWP(6*4,"esp"),$a2); # a2^a4
85 &mov (@i[0],0x7);
86 &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
87 &mov ($a4,@i[0]);
88 &and (@i[0],$b);
89 &shr ($b,3);
90 &mov (@i[1],$a4);
91 &psllq ($B30,30);
92 &and (@i[1],$b);
93 &shr ($b,3);
94 &movd ($R,&DWP(0,"esp",@i[0],4));
95 &mov (@i[0],$a4);
96 &and (@i[0],$b);
97 &shr ($b,3);
98 for($n=1;$n<9;$n++) {
99 &movd (@T[1],&DWP(0,"esp",@i[1],4));
100 &mov (@i[1],$a4);
101 &psllq (@T[1],3*$n);
102 &and (@i[1],$b);
103 &shr ($b,3);
104 &pxor ($R,@T[1]);
105
106 push(@i,shift(@i)); push(@T,shift(@T));
107 }
108 &movd (@T[1],&DWP(0,"esp",@i[1],4));
109 &pxor ($R,$B30);
110 &psllq (@T[1],3*$n++);
111 &pxor ($R,@T[1]);
112
113 &movd (@T[0],&DWP(0,"esp",@i[0],4));
114 &pxor ($R,$B31);
115 &psllq (@T[0],3*$n);
116 &add ("esp",32+4);
117 &pxor ($R,@T[0]);
118 &ret ();
119&function_end_B("_mul_1x1_mmx");
120 }
121
122($lo,$hi)=("eax","edx");
123@T=("ecx","ebp");
124
125&function_begin_B("_mul_1x1_ialu");
126 &sub ("esp",32+4);
127 &mov ($a1,$a);
128 &lea ($a2,&DWP(0,$a,$a));
129 &lea ($a4,&DWP(0,"",$a,4));
130 &and ($a1,0x3fffffff);
131 &lea (@i[1],&DWP(0,$lo,$lo));
132 &sar ($lo,31); # broadcast 31st bit
133 &mov (&DWP(0*4,"esp"),0);
134 &and ($a2,0x7fffffff);
135 &mov (&DWP(1*4,"esp"),$a1); # a1
136 &xor ($a1,$a2); # a1^a2
137 &mov (&DWP(2*4,"esp"),$a2); # a2
138 &xor ($a2,$a4); # a2^a4
139 &mov (&DWP(3*4,"esp"),$a1); # a1^a2
140 &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
141 &mov (&DWP(4*4,"esp"),$a4); # a4
142 &xor ($a4,$a2); # a2=a4^a2^a4
143 &mov (&DWP(5*4,"esp"),$a1); # a1^a4
144 &xor ($a4,$a1); # a1^a2^a4
 145 	&sar	(@i[1],31);	# broadcast 30th bit
146 &and ($lo,$b);
147 &mov (&DWP(6*4,"esp"),$a2); # a2^a4
148 &and (@i[1],$b);
149 &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
150 &mov ($hi,$lo);
151 &shl ($lo,31);
152 &mov (@T[0],@i[1]);
153 &shr ($hi,1);
154
155 &mov (@i[0],0x7);
156 &shl (@i[1],30);
157 &and (@i[0],$b);
158 &shr (@T[0],2);
159 &xor ($lo,@i[1]);
160
161 &shr ($b,3);
162 &mov (@i[1],0x7); # 5-byte instruction!?
163 &and (@i[1],$b);
164 &shr ($b,3);
165 &xor ($hi,@T[0]);
166 &xor ($lo,&DWP(0,"esp",@i[0],4));
167 &mov (@i[0],0x7);
168 &and (@i[0],$b);
169 &shr ($b,3);
170 for($n=1;$n<9;$n++) {
171 &mov (@T[1],&DWP(0,"esp",@i[1],4));
172 &mov (@i[1],0x7);
173 &mov (@T[0],@T[1]);
174 &shl (@T[1],3*$n);
175 &and (@i[1],$b);
176 &shr (@T[0],32-3*$n);
177 &xor ($lo,@T[1]);
178 &shr ($b,3);
179 &xor ($hi,@T[0]);
180
181 push(@i,shift(@i)); push(@T,shift(@T));
182 }
183 &mov (@T[1],&DWP(0,"esp",@i[1],4));
184 &mov (@T[0],@T[1]);
185 &shl (@T[1],3*$n);
186 &mov (@i[1],&DWP(0,"esp",@i[0],4));
187 &shr (@T[0],32-3*$n); $n++;
188 &mov (@i[0],@i[1]);
189 &xor ($lo,@T[1]);
190 &shl (@i[1],3*$n);
191 &xor ($hi,@T[0]);
192 &shr (@i[0],32-3*$n);
193 &xor ($lo,@i[1]);
194 &xor ($hi,@i[0]);
195
196 &add ("esp",32+4);
197 &ret ();
198&function_end_B("_mul_1x1_ialu");
199
200# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
201&function_begin_B("bn_GF2m_mul_2x2");
202if (!$x86only) {
203 &picmeup("edx","OPENSSL_ia32cap_P");
204 &mov ("eax",&DWP(0,"edx"));
205 &mov ("edx",&DWP(4,"edx"));
206 &test ("eax",1<<23); # check MMX bit
207 &jz (&label("ialu"));
208if ($sse2) {
209 &test ("eax",1<<24); # check FXSR bit
210 &jz (&label("mmx"));
211 &test ("edx",1<<1); # check PCLMULQDQ bit
212 &jz (&label("mmx"));
213
214 &movups ("xmm0",&QWP(8,"esp"));
215 &shufps ("xmm0","xmm0",0b10110001);
216 &pclmulqdq ("xmm0","xmm0",1);
217 &mov ("eax",&DWP(4,"esp"));
218 &movups (&QWP(0,"eax"),"xmm0");
219 &ret ();
220
221&set_label("mmx",16);
222}
223 &push ("ebp");
224 &push ("ebx");
225 &push ("esi");
226 &push ("edi");
227 &mov ($a,&wparam(1));
228 &mov ($b,&wparam(3));
229 &call ("_mul_1x1_mmx"); # a1·b1
230 &movq ("mm7",$R);
231
232 &mov ($a,&wparam(2));
233 &mov ($b,&wparam(4));
234 &call ("_mul_1x1_mmx"); # a0·b0
235 &movq ("mm6",$R);
236
237 &mov ($a,&wparam(1));
238 &mov ($b,&wparam(3));
239 &xor ($a,&wparam(2));
240 &xor ($b,&wparam(4));
241 &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1)
242 &pxor ($R,"mm7");
243 &mov ($a,&wparam(0));
244 &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0
245
246 &movq ($A,$R);
247 &psllq ($R,32);
248 &pop ("edi");
249 &psrlq ($A,32);
250 &pop ("esi");
251 &pxor ($R,"mm6");
252 &pop ("ebx");
253 &pxor ($A,"mm7");
254 &movq (&QWP(0,$a),$R);
255 &pop ("ebp");
256 &movq (&QWP(8,$a),$A);
257 &emms ();
258 &ret ();
259&set_label("ialu",16);
260}
261 &push ("ebp");
262 &push ("ebx");
263 &push ("esi");
264 &push ("edi");
265 &stack_push(4+1);
266
267 &mov ($a,&wparam(1));
268 &mov ($b,&wparam(3));
269 &call ("_mul_1x1_ialu"); # a1·b1
270 &mov (&DWP(8,"esp"),$lo);
271 &mov (&DWP(12,"esp"),$hi);
272
273 &mov ($a,&wparam(2));
274 &mov ($b,&wparam(4));
275 &call ("_mul_1x1_ialu"); # a0·b0
276 &mov (&DWP(0,"esp"),$lo);
277 &mov (&DWP(4,"esp"),$hi);
278
279 &mov ($a,&wparam(1));
280 &mov ($b,&wparam(3));
281 &xor ($a,&wparam(2));
282 &xor ($b,&wparam(4));
283 &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1)
284
285 &mov ("ebp",&wparam(0));
286 @r=("ebx","ecx","edi","esi");
287 &mov (@r[0],&DWP(0,"esp"));
288 &mov (@r[1],&DWP(4,"esp"));
289 &mov (@r[2],&DWP(8,"esp"));
290 &mov (@r[3],&DWP(12,"esp"));
291
292 &xor ($lo,$hi);
293 &xor ($hi,@r[1]);
294 &xor ($lo,@r[0]);
295 &mov (&DWP(0,"ebp"),@r[0]);
296 &xor ($hi,@r[2]);
297 &mov (&DWP(12,"ebp"),@r[3]);
298 &xor ($lo,@r[3]);
299 &stack_pop(4+1);
300 &xor ($hi,@r[3]);
301 &pop ("edi");
302 &xor ($lo,$hi);
303 &pop ("esi");
304 &mov (&DWP(8,"ebp"),$hi);
305 &pop ("ebx");
306 &mov (&DWP(4,"ebp"),$lo);
307 &pop ("ebp");
308 &ret ();
309&function_end_B("bn_GF2m_mul_2x2");
310
311&asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
312
313&asm_finish();
diff --git a/src/lib/libcrypto/bn/asm/x86-mont.pl b/src/lib/libcrypto/bn/asm/x86-mont.pl
deleted file mode 100755
index e8f6b05084..0000000000
--- a/src/lib/libcrypto/bn/asm/x86-mont.pl
+++ /dev/null
@@ -1,593 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# October 2005
11#
12# This is a "teaser" code, as it can be improved in several ways...
13# First of all non-SSE2 path should be implemented (yes, for now it
14# performs Montgomery multiplication/convolution only on SSE2-capable
15# CPUs such as P4, others fall down to original code). Then inner loop
16# can be unrolled and modulo-scheduled to improve ILP and possibly
17# moved to 128-bit XMM register bank (though it would require input
18# rearrangement and/or increase bus bandwidth utilization). Dedicated
19# squaring procedure should give further performance improvement...
20# Yet, for being draft, the code improves rsa512 *sign* benchmark by
21# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
22
23# December 2006
24#
25# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
26# Integer-only code [being equipped with dedicated squaring procedure]
27# gives ~40% on rsa512 sign benchmark...
28
29$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
30push(@INC,"${dir}","${dir}../../perlasm");
31require "x86asm.pl";
32
33&asm_init($ARGV[0],$0);
34
35$sse2=0;
36for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
37
38&external_label("OPENSSL_ia32cap_P") if ($sse2);
39
40&function_begin("bn_mul_mont");
41
42$i="edx";
43$j="ecx";
44$ap="esi"; $tp="esi"; # overlapping variables!!!
45$rp="edi"; $bp="edi"; # overlapping variables!!!
46$np="ebp";
47$num="ebx";
48
49$_num=&DWP(4*0,"esp"); # stack top layout
50$_rp=&DWP(4*1,"esp");
51$_ap=&DWP(4*2,"esp");
52$_bp=&DWP(4*3,"esp");
53$_np=&DWP(4*4,"esp");
54$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
55$_sp=&DWP(4*6,"esp");
56$_bpend=&DWP(4*7,"esp");
57$frame=32; # size of above frame rounded up to 16n
58
59 &xor ("eax","eax");
60 &mov ("edi",&wparam(5)); # int num
61 &cmp ("edi",4);
62 &jl (&label("just_leave"));
63
64 &lea ("esi",&wparam(0)); # put aside pointer to argument block
65 &lea ("edx",&wparam(1)); # load ap
66 &mov ("ebp","esp"); # saved stack pointer!
67 &add ("edi",2); # extra two words on top of tp
68 &neg ("edi");
69 &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
70 &neg ("edi");
71
72 # minimize cache contention by arraning 2K window between stack
73 # pointer and ap argument [np is also position sensitive vector,
74 # but it's assumed to be near ap, as it's allocated at ~same
75 # time].
76 &mov ("eax","esp");
77 &sub ("eax","edx");
78 &and ("eax",2047);
79 &sub ("esp","eax"); # this aligns sp and ap modulo 2048
80
81 &xor ("edx","esp");
82 &and ("edx",2048);
83 &xor ("edx",2048);
84 &sub ("esp","edx"); # this splits them apart modulo 4096
85
86 &and ("esp",-64); # align to cache line
87
88 ################################# load argument block...
89 &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
90 &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
91 &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
92 &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
93 &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
94 #&mov ("edi",&DWP(5*4,"esi"));# int num
95
96 &mov ("esi",&DWP(0,"esi")); # pull n0[0]
97 &mov ($_rp,"eax"); # ... save a copy of argument block
98 &mov ($_ap,"ebx");
99 &mov ($_bp,"ecx");
100 &mov ($_np,"edx");
101 &mov ($_n0,"esi");
102 &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
103 #&mov ($_num,$num); # redundant as $num is not reused
104 &mov ($_sp,"ebp"); # saved stack pointer!
105
106if($sse2) {
107$acc0="mm0"; # mmx register bank layout
108$acc1="mm1";
109$car0="mm2";
110$car1="mm3";
111$mul0="mm4";
112$mul1="mm5";
113$temp="mm6";
114$mask="mm7";
115
116 &picmeup("eax","OPENSSL_ia32cap_P");
117 &bt (&DWP(0,"eax"),26);
118 &jnc (&label("non_sse2"));
119
120 &mov ("eax",-1);
121 &movd ($mask,"eax"); # mask 32 lower bits
122
123 &mov ($ap,$_ap); # load input pointers
124 &mov ($bp,$_bp);
125 &mov ($np,$_np);
126
127 &xor ($i,$i); # i=0
128 &xor ($j,$j); # j=0
129
130 &movd ($mul0,&DWP(0,$bp)); # bp[0]
131 &movd ($mul1,&DWP(0,$ap)); # ap[0]
132 &movd ($car1,&DWP(0,$np)); # np[0]
133
134 &pmuludq($mul1,$mul0); # ap[0]*bp[0]
135 &movq ($car0,$mul1);
136 &movq ($acc0,$mul1); # I wish movd worked for
137 &pand ($acc0,$mask); # inter-register transfers
138
139 &pmuludq($mul1,$_n0q); # *=n0
140
141 &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
142 &paddq ($car1,$acc0);
143
144 &movd ($acc1,&DWP(4,$np)); # np[1]
145 &movd ($acc0,&DWP(4,$ap)); # ap[1]
146
147 &psrlq ($car0,32);
148 &psrlq ($car1,32);
149
150 &inc ($j); # j++
151&set_label("1st",16);
152 &pmuludq($acc0,$mul0); # ap[j]*bp[0]
153 &pmuludq($acc1,$mul1); # np[j]*m1
154 &paddq ($car0,$acc0); # +=c0
155 &paddq ($car1,$acc1); # +=c1
156
157 &movq ($acc0,$car0);
158 &pand ($acc0,$mask);
159 &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
160 &paddq ($car1,$acc0); # +=ap[j]*bp[0];
161 &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
162 &psrlq ($car0,32);
163 &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
164 &psrlq ($car1,32);
165
166 &lea ($j,&DWP(1,$j));
167 &cmp ($j,$num);
168 &jl (&label("1st"));
169
170 &pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
171 &pmuludq($acc1,$mul1); # np[num-1]*m1
172 &paddq ($car0,$acc0); # +=c0
173 &paddq ($car1,$acc1); # +=c1
174
175 &movq ($acc0,$car0);
176 &pand ($acc0,$mask);
177 &paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
178 &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
179
180 &psrlq ($car0,32);
181 &psrlq ($car1,32);
182
183 &paddq ($car1,$car0);
184 &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
185
186 &inc ($i); # i++
187&set_label("outer");
188 &xor ($j,$j); # j=0
189
190 &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
191 &movd ($mul1,&DWP(0,$ap)); # ap[0]
192 &movd ($temp,&DWP($frame,"esp")); # tp[0]
193 &movd ($car1,&DWP(0,$np)); # np[0]
194 &pmuludq($mul1,$mul0); # ap[0]*bp[i]
195
196 &paddq ($mul1,$temp); # +=tp[0]
197 &movq ($acc0,$mul1);
198 &movq ($car0,$mul1);
199 &pand ($acc0,$mask);
200
201 &pmuludq($mul1,$_n0q); # *=n0
202
203 &pmuludq($car1,$mul1);
204 &paddq ($car1,$acc0);
205
206 &movd ($temp,&DWP($frame+4,"esp")); # tp[1]
207 &movd ($acc1,&DWP(4,$np)); # np[1]
208 &movd ($acc0,&DWP(4,$ap)); # ap[1]
209
210 &psrlq ($car0,32);
211 &psrlq ($car1,32);
212 &paddq ($car0,$temp); # +=tp[1]
213
214 &inc ($j); # j++
215 &dec ($num);
216&set_label("inner");
217 &pmuludq($acc0,$mul0); # ap[j]*bp[i]
218 &pmuludq($acc1,$mul1); # np[j]*m1
219 &paddq ($car0,$acc0); # +=c0
220 &paddq ($car1,$acc1); # +=c1
221
222 &movq ($acc0,$car0);
223 &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
224 &pand ($acc0,$mask);
225 &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
226 &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
227 &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
228 &psrlq ($car0,32);
229 &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
230 &psrlq ($car1,32);
231 &paddq ($car0,$temp); # +=tp[j+1]
232
233 &dec ($num);
234 &lea ($j,&DWP(1,$j)); # j++
235 &jnz (&label("inner"));
236
237 &mov ($num,$j);
238 &pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
239 &pmuludq($acc1,$mul1); # np[num-1]*m1
240 &paddq ($car0,$acc0); # +=c0
241 &paddq ($car1,$acc1); # +=c1
242
243 &movq ($acc0,$car0);
244 &pand ($acc0,$mask);
245 &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
246 &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
247 &psrlq ($car0,32);
248 &psrlq ($car1,32);
249
250 &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
251 &paddq ($car1,$car0);
252 &paddq ($car1,$temp);
253 &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
254
255 &lea ($i,&DWP(1,$i)); # i++
256 &cmp ($i,$num);
257 &jle (&label("outer"));
258
259 &emms (); # done with mmx bank
260 &jmp (&label("common_tail"));
261
262&set_label("non_sse2",16);
263}
264
265if (0) {
266 &mov ("esp",$_sp);
267 &xor ("eax","eax"); # signal "not fast enough [yet]"
268 &jmp (&label("just_leave"));
269 # While the below code provides competitive performance for
270 # all key lengthes on modern Intel cores, it's still more
271 # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
272 # means compared to the original integer-only assembler.
273 # 512-bit RSA sign is better by ~40%, but that's about all
274 # one can say about all CPUs...
275} else {
276$inp="esi"; # integer path uses these registers differently
277$word="edi";
278$carry="ebp";
279
280 &mov ($inp,$_ap);
281 &lea ($carry,&DWP(1,$num));
282 &mov ($word,$_bp);
283 &xor ($j,$j); # j=0
284 &mov ("edx",$inp);
285 &and ($carry,1); # see if num is even
286 &sub ("edx",$word); # see if ap==bp
287 &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
288 &or ($carry,"edx");
289 &mov ($word,&DWP(0,$word)); # bp[0]
290 &jz (&label("bn_sqr_mont"));
291 &mov ($_bpend,"eax");
292 &mov ("eax",&DWP(0,$inp));
293 &xor ("edx","edx");
294
295&set_label("mull",16);
296 &mov ($carry,"edx");
297 &mul ($word); # ap[j]*bp[0]
298 &add ($carry,"eax");
299 &lea ($j,&DWP(1,$j));
300 &adc ("edx",0);
301 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
302 &cmp ($j,$num);
303 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
304 &jl (&label("mull"));
305
306 &mov ($carry,"edx");
307 &mul ($word); # ap[num-1]*bp[0]
308 &mov ($word,$_n0);
309 &add ("eax",$carry);
310 &mov ($inp,$_np);
311 &adc ("edx",0);
312 &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
313
314 &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
315 &xor ($j,$j);
316 &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
317 &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
318
319 &mov ("eax",&DWP(0,$inp)); # np[0]
320 &mul ($word); # np[0]*m
321 &add ("eax",&DWP($frame,"esp")); # +=tp[0]
322 &mov ("eax",&DWP(4,$inp)); # np[1]
323 &adc ("edx",0);
324 &inc ($j);
325
326 &jmp (&label("2ndmadd"));
327
328&set_label("1stmadd",16);
329 &mov ($carry,"edx");
330 &mul ($word); # ap[j]*bp[i]
331 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
332 &lea ($j,&DWP(1,$j));
333 &adc ("edx",0);
334 &add ($carry,"eax");
335 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
336 &adc ("edx",0);
337 &cmp ($j,$num);
338 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
339 &jl (&label("1stmadd"));
340
341 &mov ($carry,"edx");
342 &mul ($word); # ap[num-1]*bp[i]
343 &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
344 &mov ($word,$_n0);
345 &adc ("edx",0);
346 &mov ($inp,$_np);
347 &add ($carry,"eax");
348 &adc ("edx",0);
349 &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
350
351 &xor ($j,$j);
352 &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
353 &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
354 &adc ($j,0);
355 &mov ("eax",&DWP(0,$inp)); # np[0]
356 &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
357 &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
358
359 &mul ($word); # np[0]*m
360 &add ("eax",&DWP($frame,"esp")); # +=tp[0]
361 &mov ("eax",&DWP(4,$inp)); # np[1]
362 &adc ("edx",0);
363 &mov ($j,1);
364
365&set_label("2ndmadd",16);
366 &mov ($carry,"edx");
367 &mul ($word); # np[j]*m
368 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
369 &lea ($j,&DWP(1,$j));
370 &adc ("edx",0);
371 &add ($carry,"eax");
372 &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
373 &adc ("edx",0);
374 &cmp ($j,$num);
375 &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
376 &jl (&label("2ndmadd"));
377
378 &mov ($carry,"edx");
379 &mul ($word); # np[j]*m
380 &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
381 &adc ("edx",0);
382 &add ($carry,"eax");
383 &adc ("edx",0);
384 &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
385
386 &xor ("eax","eax");
387 &mov ($j,$_bp); # &bp[i]
388 &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
389 &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
390 &lea ($j,&DWP(4,$j));
391 &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
392 &cmp ($j,$_bpend);
393 &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
394 &je (&label("common_tail"));
395
396 &mov ($word,&DWP(0,$j)); # bp[i+1]
397 &mov ($inp,$_ap);
398 &mov ($_bp,$j); # &bp[++i]
399 &xor ($j,$j);
400 &xor ("edx","edx");
401 &mov ("eax",&DWP(0,$inp));
402 &jmp (&label("1stmadd"));
403
404&set_label("bn_sqr_mont",16);
405$sbit=$num;
406 &mov ($_num,$num);
407 &mov ($_bp,$j); # i=0
408
409 &mov ("eax",$word); # ap[0]
410 &mul ($word); # ap[0]*ap[0]
411 &mov (&DWP($frame,"esp"),"eax"); # tp[0]=
412 &mov ($sbit,"edx");
413 &shr ("edx",1);
414 &and ($sbit,1);
415 &inc ($j);
416&set_label("sqr",16);
417 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
418 &mov ($carry,"edx");
419 &mul ($word); # ap[j]*ap[0]
420 &add ("eax",$carry);
421 &lea ($j,&DWP(1,$j));
422 &adc ("edx",0);
423 &lea ($carry,&DWP(0,$sbit,"eax",2));
424 &shr ("eax",31);
425 &cmp ($j,$_num);
426 &mov ($sbit,"eax");
427 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
428 &jl (&label("sqr"));
429
430 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
431 &mov ($carry,"edx");
432 &mul ($word); # ap[num-1]*ap[0]
433 &add ("eax",$carry);
434 &mov ($word,$_n0);
435 &adc ("edx",0);
436 &mov ($inp,$_np);
437 &lea ($carry,&DWP(0,$sbit,"eax",2));
438 &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
439 &shr ("eax",31);
440 &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
441
442 &lea ($carry,&DWP(0,"eax","edx",2));
443 &mov ("eax",&DWP(0,$inp)); # np[0]
444 &shr ("edx",31);
445 &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
446 &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
447
448 &mul ($word); # np[0]*m
449 &add ("eax",&DWP($frame,"esp")); # +=tp[0]
450 &mov ($num,$j);
451 &adc ("edx",0);
452 &mov ("eax",&DWP(4,$inp)); # np[1]
453 &mov ($j,1);
454
455&set_label("3rdmadd",16);
456 &mov ($carry,"edx");
457 &mul ($word); # np[j]*m
458 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
459 &adc ("edx",0);
460 &add ($carry,"eax");
461 &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
462 &adc ("edx",0);
463 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
464
465 &mov ($carry,"edx");
466 &mul ($word); # np[j+1]*m
467 &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
468 &lea ($j,&DWP(2,$j));
469 &adc ("edx",0);
470 &add ($carry,"eax");
471 &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
472 &adc ("edx",0);
473 &cmp ($j,$num);
474 &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
475 &jl (&label("3rdmadd"));
476
477 &mov ($carry,"edx");
478 &mul ($word); # np[j]*m
479 &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
480 &adc ("edx",0);
481 &add ($carry,"eax");
482 &adc ("edx",0);
483 &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
484
485 &mov ($j,$_bp); # i
486 &xor ("eax","eax");
487 &mov ($inp,$_ap);
488 &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
489 &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
490 &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
491 &cmp ($j,$num);
492 &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
493 &je (&label("common_tail"));
494
495 &mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
496 &lea ($j,&DWP(1,$j));
497 &mov ("eax",$word);
498 &mov ($_bp,$j); # ++i
499 &mul ($word); # ap[i]*ap[i]
500 &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
501 &adc ("edx",0);
502 &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
503 &xor ($carry,$carry);
504 &cmp ($j,$num);
505 &lea ($j,&DWP(1,$j));
506 &je (&label("sqrlast"));
507
508 &mov ($sbit,"edx"); # zaps $num
509 &shr ("edx",1);
510 &and ($sbit,1);
511&set_label("sqradd",16);
512 &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
513 &mov ($carry,"edx");
514 &mul ($word); # ap[j]*ap[i]
515 &add ("eax",$carry);
516 &lea ($carry,&DWP(0,"eax","eax"));
517 &adc ("edx",0);
518 &shr ("eax",31);
519 &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
520 &lea ($j,&DWP(1,$j));
521 &adc ("eax",0);
522 &add ($carry,$sbit);
523 &adc ("eax",0);
524 &cmp ($j,$_num);
525 &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
526 &mov ($sbit,"eax");
527 &jle (&label("sqradd"));
528
529 &mov ($carry,"edx");
530 &add ("edx","edx");
531 &shr ($carry,31);
532 &add ("edx",$sbit);
533 &adc ($carry,0);
534&set_label("sqrlast");
535 &mov ($word,$_n0);
536 &mov ($inp,$_np);
537 &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
538
539 &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
540 &mov ("eax",&DWP(0,$inp)); # np[0]
541 &adc ($carry,0);
542 &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
543 &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
544
545 &mul ($word); # np[0]*m
546 &add ("eax",&DWP($frame,"esp")); # +=tp[0]
547 &lea ($num,&DWP(-1,$j));
548 &adc ("edx",0);
549 &mov ($j,1);
550 &mov ("eax",&DWP(4,$inp)); # np[1]
551
552 &jmp (&label("3rdmadd"));
553}
554
555&set_label("common_tail",16);
556 &mov ($np,$_np); # load modulus pointer
557 &mov ($rp,$_rp); # load result pointer
558 &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
559
560 &mov ("eax",&DWP(0,$tp)); # tp[0]
561 &mov ($j,$num); # j=num-1
562 &xor ($i,$i); # i=0 and clear CF!
563
564&set_label("sub",16);
565 &sbb ("eax",&DWP(0,$np,$i,4));
566 &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
567 &dec ($j); # doesn't affect CF!
568 &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
569 &lea ($i,&DWP(1,$i)); # i++
570 &jge (&label("sub"));
571
572 &sbb ("eax",0); # handle upmost overflow bit
573 &and ($tp,"eax");
574 &not ("eax");
575 &mov ($np,$rp);
576 &and ($np,"eax");
577 &or ($tp,$np); # tp=carry?tp:rp
578
579&set_label("copy",16); # copy or in-place refresh
580 &mov ("eax",&DWP(0,$tp,$num,4));
581 &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
582 &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
583 &dec ($num);
584 &jge (&label("copy"));
585
586 &mov ("esp",$_sp); # pull saved stack pointer
587 &mov ("eax",1);
588&set_label("just_leave");
589&function_end("bn_mul_mont");
590
591&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
592
593&asm_finish();
diff --git a/src/lib/libcrypto/bn/asm/x86.pl b/src/lib/libcrypto/bn/asm/x86.pl
deleted file mode 100644
index 1bc4f1bb27..0000000000
--- a/src/lib/libcrypto/bn/asm/x86.pl
+++ /dev/null
@@ -1,28 +0,0 @@
1#!/usr/local/bin/perl
2
3push(@INC,"perlasm","../../perlasm");
4require "x86asm.pl";
5
6require("x86/mul_add.pl");
7require("x86/mul.pl");
8require("x86/sqr.pl");
9require("x86/div.pl");
10require("x86/add.pl");
11require("x86/sub.pl");
12require("x86/comba.pl");
13
14&asm_init($ARGV[0],$0);
15
16&bn_mul_add_words("bn_mul_add_words");
17&bn_mul_words("bn_mul_words");
18&bn_sqr_words("bn_sqr_words");
19&bn_div_words("bn_div_words");
20&bn_add_words("bn_add_words");
21&bn_sub_words("bn_sub_words");
22&bn_mul_comba("bn_mul_comba8",8);
23&bn_mul_comba("bn_mul_comba4",4);
24&bn_sqr_comba("bn_sqr_comba8",8);
25&bn_sqr_comba("bn_sqr_comba4",4);
26
27&asm_finish();
28
diff --git a/src/lib/libcrypto/bn/asm/x86/add.pl b/src/lib/libcrypto/bn/asm/x86/add.pl
deleted file mode 100644
index 0b5cf583e3..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/add.pl
+++ /dev/null
@@ -1,76 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub bn_add_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9
10 &comment("");
11 $a="esi";
12 $b="edi";
13 $c="eax";
14 $r="ebx";
15 $tmp1="ecx";
16 $tmp2="edx";
17 $num="ebp";
18
19 &mov($r,&wparam(0)); # get r
20 &mov($a,&wparam(1)); # get a
21 &mov($b,&wparam(2)); # get b
22 &mov($num,&wparam(3)); # get num
23 &xor($c,$c); # clear carry
24 &and($num,0xfffffff8); # num / 8
25
26 &jz(&label("aw_finish"));
27
28 &set_label("aw_loop",0);
29 for ($i=0; $i<8; $i++)
30 {
31 &comment("Round $i");
32
33 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
34 &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
35 &add($tmp1,$c);
36 &mov($c,0);
37 &adc($c,$c);
38 &add($tmp1,$tmp2);
39 &adc($c,0);
40 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
41 }
42
43 &comment("");
44 &add($a,32);
45 &add($b,32);
46 &add($r,32);
47 &sub($num,8);
48 &jnz(&label("aw_loop"));
49
50 &set_label("aw_finish",0);
51 &mov($num,&wparam(3)); # get num
52 &and($num,7);
53 &jz(&label("aw_end"));
54
55 for ($i=0; $i<7; $i++)
56 {
57 &comment("Tail Round $i");
58 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
59 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
60 &add($tmp1,$c);
61 &mov($c,0);
62 &adc($c,$c);
63 &add($tmp1,$tmp2);
64 &adc($c,0);
65 &dec($num) if ($i != 6);
66 &mov(&DWP($i*4,$r,"",0),$tmp1); # *a
67 &jz(&label("aw_end")) if ($i != 6);
68 }
69 &set_label("aw_end",0);
70
71# &mov("eax",$c); # $c is "eax"
72
73 &function_end($name);
74 }
75
761;
diff --git a/src/lib/libcrypto/bn/asm/x86/comba.pl b/src/lib/libcrypto/bn/asm/x86/comba.pl
deleted file mode 100644
index 2291253629..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/comba.pl
+++ /dev/null
@@ -1,277 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub mul_add_c
5 {
6 local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
7
8 # pos == -1 if eax and edx are pre-loaded, 0 to load from next
9 # words, and 1 if load return value
10
11 &comment("mul a[$ai]*b[$bi]");
12
13 # "eax" and "edx" will always be pre-loaded.
14 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
15 # &mov("edx",&DWP($bi*4,$b,"",0));
16
17 &mul("edx");
18 &add($c0,"eax");
19 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a
20 &mov("eax",&wparam(0)) if $pos > 0; # load r[]
21 ###
22 &adc($c1,"edx");
23 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b
24 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
25 ###
26 &adc($c2,0);
27 # is pos > 1, it means it is the last loop
28 &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
29 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
30 }
31
32sub sqr_add_c
33 {
34 local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
35
36 # pos == -1 if eax and edx are pre-loaded, 0 to load from next
37 # words, and 1 if load return value
38
39 &comment("sqr a[$ai]*a[$bi]");
40
41 # "eax" and "edx" will always be pre-loaded.
42 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
43 # &mov("edx",&DWP($bi*4,$b,"",0));
44
45 if ($ai == $bi)
46 { &mul("eax");}
47 else
48 { &mul("edx");}
49 &add($c0,"eax");
50 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
51 ###
52 &adc($c1,"edx");
53 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
54 ###
55 &adc($c2,0);
56 # is pos > 1, it means it is the last loop
57 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
58 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
59 }
60
61sub sqr_add_c2
62 {
63 local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
64
65 # pos == -1 if eax and edx are pre-loaded, 0 to load from next
66 # words, and 1 if load return value
67
68 &comment("sqr a[$ai]*a[$bi]");
69
70 # "eax" and "edx" will always be pre-loaded.
71 # &mov("eax",&DWP($ai*4,$a,"",0)) ;
72 # &mov("edx",&DWP($bi*4,$a,"",0));
73
74 if ($ai == $bi)
75 { &mul("eax");}
76 else
77 { &mul("edx");}
78 &add("eax","eax");
79 ###
80 &adc("edx","edx");
81 ###
82 &adc($c2,0);
83 &add($c0,"eax");
84 &adc($c1,"edx");
85 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
86 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
87 &adc($c2,0);
88 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
89 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
90 ###
91 }
92
93sub bn_mul_comba
94 {
95 local($name,$num)=@_;
96 local($a,$b,$c0,$c1,$c2);
97 local($i,$as,$ae,$bs,$be,$ai,$bi);
98 local($tot,$end);
99
100 &function_begin_B($name,"");
101
102 $c0="ebx";
103 $c1="ecx";
104 $c2="ebp";
105 $a="esi";
106 $b="edi";
107
108 $as=0;
109 $ae=0;
110 $bs=0;
111 $be=0;
112 $tot=$num+$num-1;
113
114 &push("esi");
115 &mov($a,&wparam(1));
116 &push("edi");
117 &mov($b,&wparam(2));
118 &push("ebp");
119 &push("ebx");
120
121 &xor($c0,$c0);
122 &mov("eax",&DWP(0,$a,"",0)); # load the first word
123 &xor($c1,$c1);
124 &mov("edx",&DWP(0,$b,"",0)); # load the first second
125
126 for ($i=0; $i<$tot; $i++)
127 {
128 $ai=$as;
129 $bi=$bs;
130 $end=$be+1;
131
132 &comment("################## Calculate word $i");
133
134 for ($j=$bs; $j<$end; $j++)
135 {
136 &xor($c2,$c2) if ($j == $bs);
137 if (($j+1) == $end)
138 {
139 $v=1;
140 $v=2 if (($i+1) == $tot);
141 }
142 else
143 { $v=0; }
144 if (($j+1) != $end)
145 {
146 $na=($ai-1);
147 $nb=($bi+1);
148 }
149 else
150 {
151 $na=$as+($i < ($num-1));
152 $nb=$bs+($i >= ($num-1));
153 }
154#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
155 &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
156 if ($v)
157 {
158 &comment("saved r[$i]");
159 # &mov("eax",&wparam(0));
160 # &mov(&DWP($i*4,"eax","",0),$c0);
161 ($c0,$c1,$c2)=($c1,$c2,$c0);
162 }
163 $ai--;
164 $bi++;
165 }
166 $as++ if ($i < ($num-1));
167 $ae++ if ($i >= ($num-1));
168
169 $bs++ if ($i >= ($num-1));
170 $be++ if ($i < ($num-1));
171 }
172 &comment("save r[$i]");
173 # &mov("eax",&wparam(0));
174 &mov(&DWP($i*4,"eax","",0),$c0);
175
176 &pop("ebx");
177 &pop("ebp");
178 &pop("edi");
179 &pop("esi");
180 &ret();
181 &function_end_B($name);
182 }
183
184sub bn_sqr_comba
185 {
186 local($name,$num)=@_;
187 local($r,$a,$c0,$c1,$c2)=@_;
188 local($i,$as,$ae,$bs,$be,$ai,$bi);
189 local($b,$tot,$end,$half);
190
191 &function_begin_B($name,"");
192
193 $c0="ebx";
194 $c1="ecx";
195 $c2="ebp";
196 $a="esi";
197 $r="edi";
198
199 &push("esi");
200 &push("edi");
201 &push("ebp");
202 &push("ebx");
203 &mov($r,&wparam(0));
204 &mov($a,&wparam(1));
205 &xor($c0,$c0);
206 &xor($c1,$c1);
207 &mov("eax",&DWP(0,$a,"",0)); # load the first word
208
209 $as=0;
210 $ae=0;
211 $bs=0;
212 $be=0;
213 $tot=$num+$num-1;
214
215 for ($i=0; $i<$tot; $i++)
216 {
217 $ai=$as;
218 $bi=$bs;
219 $end=$be+1;
220
221 &comment("############### Calculate word $i");
222 for ($j=$bs; $j<$end; $j++)
223 {
224 &xor($c2,$c2) if ($j == $bs);
225 if (($ai-1) < ($bi+1))
226 {
227 $v=1;
228 $v=2 if ($i+1) == $tot;
229 }
230 else
231 { $v=0; }
232 if (!$v)
233 {
234 $na=$ai-1;
235 $nb=$bi+1;
236 }
237 else
238 {
239 $na=$as+($i < ($num-1));
240 $nb=$bs+($i >= ($num-1));
241 }
242 if ($ai == $bi)
243 {
244 &sqr_add_c($r,$a,$ai,$bi,
245 $c0,$c1,$c2,$v,$i,$na,$nb);
246 }
247 else
248 {
249 &sqr_add_c2($r,$a,$ai,$bi,
250 $c0,$c1,$c2,$v,$i,$na,$nb);
251 }
252 if ($v)
253 {
254 &comment("saved r[$i]");
255 #&mov(&DWP($i*4,$r,"",0),$c0);
256 ($c0,$c1,$c2)=($c1,$c2,$c0);
257 last;
258 }
259 $ai--;
260 $bi++;
261 }
262 $as++ if ($i < ($num-1));
263 $ae++ if ($i >= ($num-1));
264
265 $bs++ if ($i >= ($num-1));
266 $be++ if ($i < ($num-1));
267 }
268 &mov(&DWP($i*4,$r,"",0),$c0);
269 &pop("ebx");
270 &pop("ebp");
271 &pop("edi");
272 &pop("esi");
273 &ret();
274 &function_end_B($name);
275 }
276
2771;
diff --git a/src/lib/libcrypto/bn/asm/x86/div.pl b/src/lib/libcrypto/bn/asm/x86/div.pl
deleted file mode 100644
index 0e90152caa..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/div.pl
+++ /dev/null
@@ -1,15 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub bn_div_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9 &mov("edx",&wparam(0)); #
10 &mov("eax",&wparam(1)); #
11 &mov("ebx",&wparam(2)); #
12 &div("ebx");
13 &function_end($name);
14 }
151;
diff --git a/src/lib/libcrypto/bn/asm/x86/mul.pl b/src/lib/libcrypto/bn/asm/x86/mul.pl
deleted file mode 100644
index 674cb9b055..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/mul.pl
+++ /dev/null
@@ -1,77 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub bn_mul_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9
10 &comment("");
11 $Low="eax";
12 $High="edx";
13 $a="ebx";
14 $w="ecx";
15 $r="edi";
16 $c="esi";
17 $num="ebp";
18
19 &xor($c,$c); # clear carry
20 &mov($r,&wparam(0)); #
21 &mov($a,&wparam(1)); #
22 &mov($num,&wparam(2)); #
23 &mov($w,&wparam(3)); #
24
25 &and($num,0xfffffff8); # num / 8
26 &jz(&label("mw_finish"));
27
28 &set_label("mw_loop",0);
29 for ($i=0; $i<32; $i+=4)
30 {
31 &comment("Round $i");
32
33 &mov("eax",&DWP($i,$a,"",0)); # *a
34 &mul($w); # *a * w
35 &add("eax",$c); # L(t)+=c
36 # XXX
37
38 &adc("edx",0); # H(t)+=carry
39 &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
40
41 &mov($c,"edx"); # c= H(t);
42 }
43
44 &comment("");
45 &add($a,32);
46 &add($r,32);
47 &sub($num,8);
48 &jz(&label("mw_finish"));
49 &jmp(&label("mw_loop"));
50
51 &set_label("mw_finish",0);
52 &mov($num,&wparam(2)); # get num
53 &and($num,7);
54 &jnz(&label("mw_finish2"));
55 &jmp(&label("mw_end"));
56
57 &set_label("mw_finish2",1);
58 for ($i=0; $i<7; $i++)
59 {
60 &comment("Tail Round $i");
61 &mov("eax",&DWP($i*4,$a,"",0));# *a
62 &mul($w); # *a * w
63 &add("eax",$c); # L(t)+=c
64 # XXX
65 &adc("edx",0); # H(t)+=carry
66 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
67 &mov($c,"edx"); # c= H(t);
68 &dec($num) if ($i != 7-1);
69 &jz(&label("mw_end")) if ($i != 7-1);
70 }
71 &set_label("mw_end",0);
72 &mov("eax",$c);
73
74 &function_end($name);
75 }
76
771;
diff --git a/src/lib/libcrypto/bn/asm/x86/mul_add.pl b/src/lib/libcrypto/bn/asm/x86/mul_add.pl
deleted file mode 100644
index 61830d3a90..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/mul_add.pl
+++ /dev/null
@@ -1,87 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub bn_mul_add_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9
10 &comment("");
11 $Low="eax";
12 $High="edx";
13 $a="ebx";
14 $w="ebp";
15 $r="edi";
16 $c="esi";
17
18 &xor($c,$c); # clear carry
19 &mov($r,&wparam(0)); #
20
21 &mov("ecx",&wparam(2)); #
22 &mov($a,&wparam(1)); #
23
24 &and("ecx",0xfffffff8); # num / 8
25 &mov($w,&wparam(3)); #
26
27 &push("ecx"); # Up the stack for a tmp variable
28
29 &jz(&label("maw_finish"));
30
31 &set_label("maw_loop",0);
32
33 &mov(&swtmp(0),"ecx"); #
34
35 for ($i=0; $i<32; $i+=4)
36 {
37 &comment("Round $i");
38
39 &mov("eax",&DWP($i,$a,"",0)); # *a
40 &mul($w); # *a * w
41 &add("eax",$c); # L(t)+= *r
42 &mov($c,&DWP($i,$r,"",0)); # L(t)+= *r
43 &adc("edx",0); # H(t)+=carry
44 &add("eax",$c); # L(t)+=c
45 &adc("edx",0); # H(t)+=carry
46 &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
47 &mov($c,"edx"); # c= H(t);
48 }
49
50 &comment("");
51 &mov("ecx",&swtmp(0)); #
52 &add($a,32);
53 &add($r,32);
54 &sub("ecx",8);
55 &jnz(&label("maw_loop"));
56
57 &set_label("maw_finish",0);
58 &mov("ecx",&wparam(2)); # get num
59 &and("ecx",7);
60 &jnz(&label("maw_finish2")); # helps branch prediction
61 &jmp(&label("maw_end"));
62
63 &set_label("maw_finish2",1);
64 for ($i=0; $i<7; $i++)
65 {
66 &comment("Tail Round $i");
67 &mov("eax",&DWP($i*4,$a,"",0));# *a
68 &mul($w); # *a * w
69 &add("eax",$c); # L(t)+=c
70 &mov($c,&DWP($i*4,$r,"",0)); # L(t)+= *r
71 &adc("edx",0); # H(t)+=carry
72 &add("eax",$c);
73 &adc("edx",0); # H(t)+=carry
74 &dec("ecx") if ($i != 7-1);
75 &mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t);
76 &mov($c,"edx"); # c= H(t);
77 &jz(&label("maw_end")) if ($i != 7-1);
78 }
79 &set_label("maw_end",0);
80 &mov("eax",$c);
81
82 &pop("ecx"); # clear variable from
83
84 &function_end($name);
85 }
86
871;
diff --git a/src/lib/libcrypto/bn/asm/x86/sqr.pl b/src/lib/libcrypto/bn/asm/x86/sqr.pl
deleted file mode 100644
index 1f90993cf6..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/sqr.pl
+++ /dev/null
@@ -1,60 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub bn_sqr_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9
10 &comment("");
11 $r="esi";
12 $a="edi";
13 $num="ebx";
14
15 &mov($r,&wparam(0)); #
16 &mov($a,&wparam(1)); #
17 &mov($num,&wparam(2)); #
18
19 &and($num,0xfffffff8); # num / 8
20 &jz(&label("sw_finish"));
21
22 &set_label("sw_loop",0);
23 for ($i=0; $i<32; $i+=4)
24 {
25 &comment("Round $i");
26 &mov("eax",&DWP($i,$a,"",0)); # *a
27 # XXX
28 &mul("eax"); # *a * *a
29 &mov(&DWP($i*2,$r,"",0),"eax"); #
30 &mov(&DWP($i*2+4,$r,"",0),"edx");#
31 }
32
33 &comment("");
34 &add($a,32);
35 &add($r,64);
36 &sub($num,8);
37 &jnz(&label("sw_loop"));
38
39 &set_label("sw_finish",0);
40 &mov($num,&wparam(2)); # get num
41 &and($num,7);
42 &jz(&label("sw_end"));
43
44 for ($i=0; $i<7; $i++)
45 {
46 &comment("Tail Round $i");
47 &mov("eax",&DWP($i*4,$a,"",0)); # *a
48 # XXX
49 &mul("eax"); # *a * *a
50 &mov(&DWP($i*8,$r,"",0),"eax"); #
51 &dec($num) if ($i != 7-1);
52 &mov(&DWP($i*8+4,$r,"",0),"edx");
53 &jz(&label("sw_end")) if ($i != 7-1);
54 }
55 &set_label("sw_end",0);
56
57 &function_end($name);
58 }
59
601;
diff --git a/src/lib/libcrypto/bn/asm/x86/sub.pl b/src/lib/libcrypto/bn/asm/x86/sub.pl
deleted file mode 100644
index 837b0e1b07..0000000000
--- a/src/lib/libcrypto/bn/asm/x86/sub.pl
+++ /dev/null
@@ -1,76 +0,0 @@
1#!/usr/local/bin/perl
2# x86 assember
3
4sub bn_sub_words
5 {
6 local($name)=@_;
7
8 &function_begin($name,"");
9
10 &comment("");
11 $a="esi";
12 $b="edi";
13 $c="eax";
14 $r="ebx";
15 $tmp1="ecx";
16 $tmp2="edx";
17 $num="ebp";
18
19 &mov($r,&wparam(0)); # get r
20 &mov($a,&wparam(1)); # get a
21 &mov($b,&wparam(2)); # get b
22 &mov($num,&wparam(3)); # get num
23 &xor($c,$c); # clear carry
24 &and($num,0xfffffff8); # num / 8
25
26 &jz(&label("aw_finish"));
27
28 &set_label("aw_loop",0);
29 for ($i=0; $i<8; $i++)
30 {
31 &comment("Round $i");
32
33 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
34 &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
35 &sub($tmp1,$c);
36 &mov($c,0);
37 &adc($c,$c);
38 &sub($tmp1,$tmp2);
39 &adc($c,0);
40 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
41 }
42
43 &comment("");
44 &add($a,32);
45 &add($b,32);
46 &add($r,32);
47 &sub($num,8);
48 &jnz(&label("aw_loop"));
49
50 &set_label("aw_finish",0);
51 &mov($num,&wparam(3)); # get num
52 &and($num,7);
53 &jz(&label("aw_end"));
54
55 for ($i=0; $i<7; $i++)
56 {
57 &comment("Tail Round $i");
58 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
59 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
60 &sub($tmp1,$c);
61 &mov($c,0);
62 &adc($c,$c);
63 &sub($tmp1,$tmp2);
64 &adc($c,0);
65 &dec($num) if ($i != 6);
66 &mov(&DWP($i*4,$r,"",0),$tmp1); # *a
67 &jz(&label("aw_end")) if ($i != 6);
68 }
69 &set_label("aw_end",0);
70
71# &mov("eax",$c); # $c is "eax"
72
73 &function_end($name);
74 }
75
761;
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gcc.c b/src/lib/libcrypto/bn/asm/x86_64-gcc.c
deleted file mode 100644
index acb0b40118..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-gcc.c
+++ /dev/null
@@ -1,606 +0,0 @@
1#include "../bn_lcl.h"
2#if !(defined(__GNUC__) && __GNUC__>=2)
3# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
4#else
5/*
6 * x86_64 BIGNUM accelerator version 0.1, December 2002.
7 *
8 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9 * project.
10 *
11 * Rights for redistribution and usage in source and binary forms are
12 * granted according to the OpenSSL license. Warranty of any kind is
13 * disclaimed.
14 *
15 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
16 * versions, like 1.0...
17 * A. Well, that's because this code is basically a quick-n-dirty
18 * proof-of-concept hack. As you can see it's implemented with
19 * inline assembler, which means that you're bound to GCC and that
20 * there might be enough room for further improvement.
21 *
22 * Q. Why inline assembler?
23 * A. x86_64 features own ABI which I'm not familiar with. This is
24 * why I decided to let the compiler take care of subroutine
25 * prologue/epilogue as well as register allocation. For reference.
26 * Win64 implements different ABI for AMD64, different from Linux.
27 *
28 * Q. How much faster does it get?
29 * A. 'apps/openssl speed rsa dsa' output with no-asm:
30 *
31 * sign verify sign/s verify/s
32 * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
33 * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
34 * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
35 * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
36 * sign verify sign/s verify/s
37 * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
38 * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
39 * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
40 *
41 * 'apps/openssl speed rsa dsa' output with this module:
42 *
43 * sign verify sign/s verify/s
44 * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
45 * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
46 * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
47 * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
48 * sign verify sign/s verify/s
49 * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
50 * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
51 * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
52 *
53 * For the reference. IA-32 assembler implementation performs
54 * very much like 64-bit code compiled with no-asm on the same
55 * machine.
56 */
57
58#ifdef _WIN64
59#define BN_ULONG unsigned long long
60#else
61#define BN_ULONG unsigned long
62#endif
63
64#undef mul
65#undef mul_add
66#undef sqr
67
68/*
69 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
70 * "g"(0) let the compiler to decide where does it
71 * want to keep the value of zero;
72 */
73#define mul_add(r,a,word,carry) do { \
74 register BN_ULONG high,low; \
75 asm ("mulq %3" \
76 : "=a"(low),"=d"(high) \
77 : "a"(word),"m"(a) \
78 : "cc"); \
79 asm ("addq %2,%0; adcq %3,%1" \
80 : "+r"(carry),"+d"(high)\
81 : "a"(low),"g"(0) \
82 : "cc"); \
83 asm ("addq %2,%0; adcq %3,%1" \
84 : "+m"(r),"+d"(high) \
85 : "r"(carry),"g"(0) \
86 : "cc"); \
87 carry=high; \
88 } while (0)
89
90#define mul(r,a,word,carry) do { \
91 register BN_ULONG high,low; \
92 asm ("mulq %3" \
93 : "=a"(low),"=d"(high) \
94 : "a"(word),"g"(a) \
95 : "cc"); \
96 asm ("addq %2,%0; adcq %3,%1" \
97 : "+r"(carry),"+d"(high)\
98 : "a"(low),"g"(0) \
99 : "cc"); \
100 (r)=carry, carry=high; \
101 } while (0)
102
103#define sqr(r0,r1,a) \
104 asm ("mulq %2" \
105 : "=a"(r0),"=d"(r1) \
106 : "a"(a) \
107 : "cc");
108
109BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
110 {
111 BN_ULONG c1=0;
112
113 if (num <= 0) return(c1);
114
115 while (num&~3)
116 {
117 mul_add(rp[0],ap[0],w,c1);
118 mul_add(rp[1],ap[1],w,c1);
119 mul_add(rp[2],ap[2],w,c1);
120 mul_add(rp[3],ap[3],w,c1);
121 ap+=4; rp+=4; num-=4;
122 }
123 if (num)
124 {
125 mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
126 mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
127 mul_add(rp[2],ap[2],w,c1); return c1;
128 }
129
130 return(c1);
131 }
132
133BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
134 {
135 BN_ULONG c1=0;
136
137 if (num <= 0) return(c1);
138
139 while (num&~3)
140 {
141 mul(rp[0],ap[0],w,c1);
142 mul(rp[1],ap[1],w,c1);
143 mul(rp[2],ap[2],w,c1);
144 mul(rp[3],ap[3],w,c1);
145 ap+=4; rp+=4; num-=4;
146 }
147 if (num)
148 {
149 mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
150 mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
151 mul(rp[2],ap[2],w,c1);
152 }
153 return(c1);
154 }
155
156void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
157 {
158 if (n <= 0) return;
159
160 while (n&~3)
161 {
162 sqr(r[0],r[1],a[0]);
163 sqr(r[2],r[3],a[1]);
164 sqr(r[4],r[5],a[2]);
165 sqr(r[6],r[7],a[3]);
166 a+=4; r+=8; n-=4;
167 }
168 if (n)
169 {
170 sqr(r[0],r[1],a[0]); if (--n == 0) return;
171 sqr(r[2],r[3],a[1]); if (--n == 0) return;
172 sqr(r[4],r[5],a[2]);
173 }
174 }
175
176BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
177{ BN_ULONG ret,waste;
178
179 asm ("divq %4"
180 : "=a"(ret),"=d"(waste)
181 : "a"(l),"d"(h),"g"(d)
182 : "cc");
183
184 return ret;
185}
186
187BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
188{ BN_ULONG ret=0,i=0;
189
190 if (n <= 0) return 0;
191
192 asm (
193 " subq %2,%2 \n"
194 ".p2align 4 \n"
195 "1: movq (%4,%2,8),%0 \n"
196 " adcq (%5,%2,8),%0 \n"
197 " movq %0,(%3,%2,8) \n"
198 " leaq 1(%2),%2 \n"
199 " loop 1b \n"
200 " sbbq %0,%0 \n"
201 : "=&a"(ret),"+c"(n),"=&r"(i)
202 : "r"(rp),"r"(ap),"r"(bp)
203 : "cc"
204 );
205
206 return ret&1;
207}
208
209#ifndef SIMICS
210BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
211{ BN_ULONG ret=0,i=0;
212
213 if (n <= 0) return 0;
214
215 asm (
216 " subq %2,%2 \n"
217 ".p2align 4 \n"
218 "1: movq (%4,%2,8),%0 \n"
219 " sbbq (%5,%2,8),%0 \n"
220 " movq %0,(%3,%2,8) \n"
221 " leaq 1(%2),%2 \n"
222 " loop 1b \n"
223 " sbbq %0,%0 \n"
224 : "=&a"(ret),"+c"(n),"=&r"(i)
225 : "r"(rp),"r"(ap),"r"(bp)
226 : "cc"
227 );
228
229 return ret&1;
230}
231#else
232/* Simics 1.4<7 has buggy sbbq:-( */
233#define BN_MASK2 0xffffffffffffffffL
234BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
235 {
236 BN_ULONG t1,t2;
237 int c=0;
238
239 if (n <= 0) return((BN_ULONG)0);
240
241 for (;;)
242 {
243 t1=a[0]; t2=b[0];
244 r[0]=(t1-t2-c)&BN_MASK2;
245 if (t1 != t2) c=(t1 < t2);
246 if (--n <= 0) break;
247
248 t1=a[1]; t2=b[1];
249 r[1]=(t1-t2-c)&BN_MASK2;
250 if (t1 != t2) c=(t1 < t2);
251 if (--n <= 0) break;
252
253 t1=a[2]; t2=b[2];
254 r[2]=(t1-t2-c)&BN_MASK2;
255 if (t1 != t2) c=(t1 < t2);
256 if (--n <= 0) break;
257
258 t1=a[3]; t2=b[3];
259 r[3]=(t1-t2-c)&BN_MASK2;
260 if (t1 != t2) c=(t1 < t2);
261 if (--n <= 0) break;
262
263 a+=4;
264 b+=4;
265 r+=4;
266 }
267 return(c);
268 }
269#endif
270
271/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
272/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
273/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
274/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
275
276#if 0
277/* original macros are kept for reference purposes */
278#define mul_add_c(a,b,c0,c1,c2) { \
279 BN_ULONG ta=(a),tb=(b); \
280 t1 = ta * tb; \
281 t2 = BN_UMULT_HIGH(ta,tb); \
282 c0 += t1; t2 += (c0<t1)?1:0; \
283 c1 += t2; c2 += (c1<t2)?1:0; \
284 }
285
286#define mul_add_c2(a,b,c0,c1,c2) { \
287 BN_ULONG ta=(a),tb=(b),t0; \
288 t1 = BN_UMULT_HIGH(ta,tb); \
289 t0 = ta * tb; \
290 t2 = t1+t1; c2 += (t2<t1)?1:0; \
291 t1 = t0+t0; t2 += (t1<t0)?1:0; \
292 c0 += t1; t2 += (c0<t1)?1:0; \
293 c1 += t2; c2 += (c1<t2)?1:0; \
294 }
295#else
296#define mul_add_c(a,b,c0,c1,c2) do { \
297 asm ("mulq %3" \
298 : "=a"(t1),"=d"(t2) \
299 : "a"(a),"m"(b) \
300 : "cc"); \
301 asm ("addq %2,%0; adcq %3,%1" \
302 : "+r"(c0),"+d"(t2) \
303 : "a"(t1),"g"(0) \
304 : "cc"); \
305 asm ("addq %2,%0; adcq %3,%1" \
306 : "+r"(c1),"+r"(c2) \
307 : "d"(t2),"g"(0) \
308 : "cc"); \
309 } while (0)
310
311#define sqr_add_c(a,i,c0,c1,c2) do { \
312 asm ("mulq %2" \
313 : "=a"(t1),"=d"(t2) \
314 : "a"(a[i]) \
315 : "cc"); \
316 asm ("addq %2,%0; adcq %3,%1" \
317 : "+r"(c0),"+d"(t2) \
318 : "a"(t1),"g"(0) \
319 : "cc"); \
320 asm ("addq %2,%0; adcq %3,%1" \
321 : "+r"(c1),"+r"(c2) \
322 : "d"(t2),"g"(0) \
323 : "cc"); \
324 } while (0)
325
326#define mul_add_c2(a,b,c0,c1,c2) do { \
327 asm ("mulq %3" \
328 : "=a"(t1),"=d"(t2) \
329 : "a"(a),"m"(b) \
330 : "cc"); \
331 asm ("addq %0,%0; adcq %2,%1" \
332 : "+d"(t2),"+r"(c2) \
333 : "g"(0) \
334 : "cc"); \
335 asm ("addq %0,%0; adcq %2,%1" \
336 : "+a"(t1),"+d"(t2) \
337 : "g"(0) \
338 : "cc"); \
339 asm ("addq %2,%0; adcq %3,%1" \
340 : "+r"(c0),"+d"(t2) \
341 : "a"(t1),"g"(0) \
342 : "cc"); \
343 asm ("addq %2,%0; adcq %3,%1" \
344 : "+r"(c1),"+r"(c2) \
345 : "d"(t2),"g"(0) \
346 : "cc"); \
347 } while (0)
348#endif
349
350#define sqr_add_c2(a,i,j,c0,c1,c2) \
351 mul_add_c2((a)[i],(a)[j],c0,c1,c2)
352
353void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
354 {
355 BN_ULONG t1,t2;
356 BN_ULONG c1,c2,c3;
357
358 c1=0;
359 c2=0;
360 c3=0;
361 mul_add_c(a[0],b[0],c1,c2,c3);
362 r[0]=c1;
363 c1=0;
364 mul_add_c(a[0],b[1],c2,c3,c1);
365 mul_add_c(a[1],b[0],c2,c3,c1);
366 r[1]=c2;
367 c2=0;
368 mul_add_c(a[2],b[0],c3,c1,c2);
369 mul_add_c(a[1],b[1],c3,c1,c2);
370 mul_add_c(a[0],b[2],c3,c1,c2);
371 r[2]=c3;
372 c3=0;
373 mul_add_c(a[0],b[3],c1,c2,c3);
374 mul_add_c(a[1],b[2],c1,c2,c3);
375 mul_add_c(a[2],b[1],c1,c2,c3);
376 mul_add_c(a[3],b[0],c1,c2,c3);
377 r[3]=c1;
378 c1=0;
379 mul_add_c(a[4],b[0],c2,c3,c1);
380 mul_add_c(a[3],b[1],c2,c3,c1);
381 mul_add_c(a[2],b[2],c2,c3,c1);
382 mul_add_c(a[1],b[3],c2,c3,c1);
383 mul_add_c(a[0],b[4],c2,c3,c1);
384 r[4]=c2;
385 c2=0;
386 mul_add_c(a[0],b[5],c3,c1,c2);
387 mul_add_c(a[1],b[4],c3,c1,c2);
388 mul_add_c(a[2],b[3],c3,c1,c2);
389 mul_add_c(a[3],b[2],c3,c1,c2);
390 mul_add_c(a[4],b[1],c3,c1,c2);
391 mul_add_c(a[5],b[0],c3,c1,c2);
392 r[5]=c3;
393 c3=0;
394 mul_add_c(a[6],b[0],c1,c2,c3);
395 mul_add_c(a[5],b[1],c1,c2,c3);
396 mul_add_c(a[4],b[2],c1,c2,c3);
397 mul_add_c(a[3],b[3],c1,c2,c3);
398 mul_add_c(a[2],b[4],c1,c2,c3);
399 mul_add_c(a[1],b[5],c1,c2,c3);
400 mul_add_c(a[0],b[6],c1,c2,c3);
401 r[6]=c1;
402 c1=0;
403 mul_add_c(a[0],b[7],c2,c3,c1);
404 mul_add_c(a[1],b[6],c2,c3,c1);
405 mul_add_c(a[2],b[5],c2,c3,c1);
406 mul_add_c(a[3],b[4],c2,c3,c1);
407 mul_add_c(a[4],b[3],c2,c3,c1);
408 mul_add_c(a[5],b[2],c2,c3,c1);
409 mul_add_c(a[6],b[1],c2,c3,c1);
410 mul_add_c(a[7],b[0],c2,c3,c1);
411 r[7]=c2;
412 c2=0;
413 mul_add_c(a[7],b[1],c3,c1,c2);
414 mul_add_c(a[6],b[2],c3,c1,c2);
415 mul_add_c(a[5],b[3],c3,c1,c2);
416 mul_add_c(a[4],b[4],c3,c1,c2);
417 mul_add_c(a[3],b[5],c3,c1,c2);
418 mul_add_c(a[2],b[6],c3,c1,c2);
419 mul_add_c(a[1],b[7],c3,c1,c2);
420 r[8]=c3;
421 c3=0;
422 mul_add_c(a[2],b[7],c1,c2,c3);
423 mul_add_c(a[3],b[6],c1,c2,c3);
424 mul_add_c(a[4],b[5],c1,c2,c3);
425 mul_add_c(a[5],b[4],c1,c2,c3);
426 mul_add_c(a[6],b[3],c1,c2,c3);
427 mul_add_c(a[7],b[2],c1,c2,c3);
428 r[9]=c1;
429 c1=0;
430 mul_add_c(a[7],b[3],c2,c3,c1);
431 mul_add_c(a[6],b[4],c2,c3,c1);
432 mul_add_c(a[5],b[5],c2,c3,c1);
433 mul_add_c(a[4],b[6],c2,c3,c1);
434 mul_add_c(a[3],b[7],c2,c3,c1);
435 r[10]=c2;
436 c2=0;
437 mul_add_c(a[4],b[7],c3,c1,c2);
438 mul_add_c(a[5],b[6],c3,c1,c2);
439 mul_add_c(a[6],b[5],c3,c1,c2);
440 mul_add_c(a[7],b[4],c3,c1,c2);
441 r[11]=c3;
442 c3=0;
443 mul_add_c(a[7],b[5],c1,c2,c3);
444 mul_add_c(a[6],b[6],c1,c2,c3);
445 mul_add_c(a[5],b[7],c1,c2,c3);
446 r[12]=c1;
447 c1=0;
448 mul_add_c(a[6],b[7],c2,c3,c1);
449 mul_add_c(a[7],b[6],c2,c3,c1);
450 r[13]=c2;
451 c2=0;
452 mul_add_c(a[7],b[7],c3,c1,c2);
453 r[14]=c3;
454 r[15]=c1;
455 }
456
457void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
458 {
459 BN_ULONG t1,t2;
460 BN_ULONG c1,c2,c3;
461
462 c1=0;
463 c2=0;
464 c3=0;
465 mul_add_c(a[0],b[0],c1,c2,c3);
466 r[0]=c1;
467 c1=0;
468 mul_add_c(a[0],b[1],c2,c3,c1);
469 mul_add_c(a[1],b[0],c2,c3,c1);
470 r[1]=c2;
471 c2=0;
472 mul_add_c(a[2],b[0],c3,c1,c2);
473 mul_add_c(a[1],b[1],c3,c1,c2);
474 mul_add_c(a[0],b[2],c3,c1,c2);
475 r[2]=c3;
476 c3=0;
477 mul_add_c(a[0],b[3],c1,c2,c3);
478 mul_add_c(a[1],b[2],c1,c2,c3);
479 mul_add_c(a[2],b[1],c1,c2,c3);
480 mul_add_c(a[3],b[0],c1,c2,c3);
481 r[3]=c1;
482 c1=0;
483 mul_add_c(a[3],b[1],c2,c3,c1);
484 mul_add_c(a[2],b[2],c2,c3,c1);
485 mul_add_c(a[1],b[3],c2,c3,c1);
486 r[4]=c2;
487 c2=0;
488 mul_add_c(a[2],b[3],c3,c1,c2);
489 mul_add_c(a[3],b[2],c3,c1,c2);
490 r[5]=c3;
491 c3=0;
492 mul_add_c(a[3],b[3],c1,c2,c3);
493 r[6]=c1;
494 r[7]=c2;
495 }
496
497void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
498 {
499 BN_ULONG t1,t2;
500 BN_ULONG c1,c2,c3;
501
502 c1=0;
503 c2=0;
504 c3=0;
505 sqr_add_c(a,0,c1,c2,c3);
506 r[0]=c1;
507 c1=0;
508 sqr_add_c2(a,1,0,c2,c3,c1);
509 r[1]=c2;
510 c2=0;
511 sqr_add_c(a,1,c3,c1,c2);
512 sqr_add_c2(a,2,0,c3,c1,c2);
513 r[2]=c3;
514 c3=0;
515 sqr_add_c2(a,3,0,c1,c2,c3);
516 sqr_add_c2(a,2,1,c1,c2,c3);
517 r[3]=c1;
518 c1=0;
519 sqr_add_c(a,2,c2,c3,c1);
520 sqr_add_c2(a,3,1,c2,c3,c1);
521 sqr_add_c2(a,4,0,c2,c3,c1);
522 r[4]=c2;
523 c2=0;
524 sqr_add_c2(a,5,0,c3,c1,c2);
525 sqr_add_c2(a,4,1,c3,c1,c2);
526 sqr_add_c2(a,3,2,c3,c1,c2);
527 r[5]=c3;
528 c3=0;
529 sqr_add_c(a,3,c1,c2,c3);
530 sqr_add_c2(a,4,2,c1,c2,c3);
531 sqr_add_c2(a,5,1,c1,c2,c3);
532 sqr_add_c2(a,6,0,c1,c2,c3);
533 r[6]=c1;
534 c1=0;
535 sqr_add_c2(a,7,0,c2,c3,c1);
536 sqr_add_c2(a,6,1,c2,c3,c1);
537 sqr_add_c2(a,5,2,c2,c3,c1);
538 sqr_add_c2(a,4,3,c2,c3,c1);
539 r[7]=c2;
540 c2=0;
541 sqr_add_c(a,4,c3,c1,c2);
542 sqr_add_c2(a,5,3,c3,c1,c2);
543 sqr_add_c2(a,6,2,c3,c1,c2);
544 sqr_add_c2(a,7,1,c3,c1,c2);
545 r[8]=c3;
546 c3=0;
547 sqr_add_c2(a,7,2,c1,c2,c3);
548 sqr_add_c2(a,6,3,c1,c2,c3);
549 sqr_add_c2(a,5,4,c1,c2,c3);
550 r[9]=c1;
551 c1=0;
552 sqr_add_c(a,5,c2,c3,c1);
553 sqr_add_c2(a,6,4,c2,c3,c1);
554 sqr_add_c2(a,7,3,c2,c3,c1);
555 r[10]=c2;
556 c2=0;
557 sqr_add_c2(a,7,4,c3,c1,c2);
558 sqr_add_c2(a,6,5,c3,c1,c2);
559 r[11]=c3;
560 c3=0;
561 sqr_add_c(a,6,c1,c2,c3);
562 sqr_add_c2(a,7,5,c1,c2,c3);
563 r[12]=c1;
564 c1=0;
565 sqr_add_c2(a,7,6,c2,c3,c1);
566 r[13]=c2;
567 c2=0;
568 sqr_add_c(a,7,c3,c1,c2);
569 r[14]=c3;
570 r[15]=c1;
571 }
572
573void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
574 {
575 BN_ULONG t1,t2;
576 BN_ULONG c1,c2,c3;
577
578 c1=0;
579 c2=0;
580 c3=0;
581 sqr_add_c(a,0,c1,c2,c3);
582 r[0]=c1;
583 c1=0;
584 sqr_add_c2(a,1,0,c2,c3,c1);
585 r[1]=c2;
586 c2=0;
587 sqr_add_c(a,1,c3,c1,c2);
588 sqr_add_c2(a,2,0,c3,c1,c2);
589 r[2]=c3;
590 c3=0;
591 sqr_add_c2(a,3,0,c1,c2,c3);
592 sqr_add_c2(a,2,1,c1,c2,c3);
593 r[3]=c1;
594 c1=0;
595 sqr_add_c(a,2,c2,c3,c1);
596 sqr_add_c2(a,3,1,c2,c3,c1);
597 r[4]=c2;
598 c2=0;
599 sqr_add_c2(a,3,2,c3,c1,c2);
600 r[5]=c3;
601 c3=0;
602 sqr_add_c(a,3,c1,c2,c3);
603 r[6]=c1;
604 r[7]=c2;
605 }
606#endif
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl b/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl
deleted file mode 100644
index 1658acbbdd..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl
+++ /dev/null
@@ -1,389 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# May 2011
11#
12# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
13# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
14# the time being... Except that it has two code paths: code suitable
15# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
16# later. Improvement varies from one benchmark and µ-arch to another.
17# Vanilla code path is at most 20% faster than compiler-generated code
18# [not very impressive], while PCLMULQDQ - whole 85%-160% better on
19# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
20# these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not
21# all CPU time is burnt in it...
22
23$flavour = shift;
24$output = shift;
25if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
26
27$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
28
29$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
30( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
31( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
32die "can't locate x86_64-xlate.pl";
33
34open STDOUT,"| $^X $xlate $flavour $output";
35
36($lo,$hi)=("%rax","%rdx"); $a=$lo;
37($i0,$i1)=("%rsi","%rdi");
38($t0,$t1)=("%rbx","%rcx");
39($b,$mask)=("%rbp","%r8");
40($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
41($R,$Tx)=("%xmm0","%xmm1");
42
43$code.=<<___;
44.text
45
46.type _mul_1x1,\@abi-omnipotent
47.align 16
48_mul_1x1:
49 sub \$128+8,%rsp
50 mov \$-1,$a1
51 lea ($a,$a),$i0
52 shr \$3,$a1
53 lea (,$a,4),$i1
54 and $a,$a1 # a1=a&0x1fffffffffffffff
55 lea (,$a,8),$a8
56 sar \$63,$a # broadcast 63rd bit
57 lea ($a1,$a1),$a2
58 sar \$63,$i0 # broadcast 62nd bit
59 lea (,$a1,4),$a4
60 and $b,$a
61 sar \$63,$i1 # boardcast 61st bit
62 mov $a,$hi # $a is $lo
63 shl \$63,$lo
64 and $b,$i0
65 shr \$1,$hi
66 mov $i0,$t1
67 shl \$62,$i0
68 and $b,$i1
69 shr \$2,$t1
70 xor $i0,$lo
71 mov $i1,$t0
72 shl \$61,$i1
73 xor $t1,$hi
74 shr \$3,$t0
75 xor $i1,$lo
76 xor $t0,$hi
77
78 mov $a1,$a12
79 movq \$0,0(%rsp) # tab[0]=0
80 xor $a2,$a12 # a1^a2
81 mov $a1,8(%rsp) # tab[1]=a1
82 mov $a4,$a48
83 mov $a2,16(%rsp) # tab[2]=a2
84 xor $a8,$a48 # a4^a8
85 mov $a12,24(%rsp) # tab[3]=a1^a2
86
87 xor $a4,$a1
88 mov $a4,32(%rsp) # tab[4]=a4
89 xor $a4,$a2
90 mov $a1,40(%rsp) # tab[5]=a1^a4
91 xor $a4,$a12
92 mov $a2,48(%rsp) # tab[6]=a2^a4
93 xor $a48,$a1 # a1^a4^a4^a8=a1^a8
94 mov $a12,56(%rsp) # tab[7]=a1^a2^a4
95 xor $a48,$a2 # a2^a4^a4^a8=a1^a8
96
97 mov $a8,64(%rsp) # tab[8]=a8
98 xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8
99 mov $a1,72(%rsp) # tab[9]=a1^a8
100 xor $a4,$a1 # a1^a8^a4
101 mov $a2,80(%rsp) # tab[10]=a2^a8
102 xor $a4,$a2 # a2^a8^a4
103 mov $a12,88(%rsp) # tab[11]=a1^a2^a8
104
105 xor $a4,$a12 # a1^a2^a8^a4
106 mov $a48,96(%rsp) # tab[12]=a4^a8
107 mov $mask,$i0
108 mov $a1,104(%rsp) # tab[13]=a1^a4^a8
109 and $b,$i0
110 mov $a2,112(%rsp) # tab[14]=a2^a4^a8
111 shr \$4,$b
112 mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8
113 mov $mask,$i1
114 and $b,$i1
115 shr \$4,$b
116
117 movq (%rsp,$i0,8),$R # half of calculations is done in SSE2
118 mov $mask,$i0
119 and $b,$i0
120 shr \$4,$b
121___
122 for ($n=1;$n<8;$n++) {
123 $code.=<<___;
124 mov (%rsp,$i1,8),$t1
125 mov $mask,$i1
126 mov $t1,$t0
127 shl \$`8*$n-4`,$t1
128 and $b,$i1
129 movq (%rsp,$i0,8),$Tx
130 shr \$`64-(8*$n-4)`,$t0
131 xor $t1,$lo
132 pslldq \$$n,$Tx
133 mov $mask,$i0
134 shr \$4,$b
135 xor $t0,$hi
136 and $b,$i0
137 shr \$4,$b
138 pxor $Tx,$R
139___
140 }
141$code.=<<___;
142 mov (%rsp,$i1,8),$t1
143 mov $t1,$t0
144 shl \$`8*$n-4`,$t1
145 movq $R,$i0
146 shr \$`64-(8*$n-4)`,$t0
147 xor $t1,$lo
148 psrldq \$8,$R
149 xor $t0,$hi
150 movq $R,$i1
151 xor $i0,$lo
152 xor $i1,$hi
153
154 add \$128+8,%rsp
155 ret
156.Lend_mul_1x1:
157.size _mul_1x1,.-_mul_1x1
158___
159
160($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order
161 ("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order
162
163$code.=<<___;
164.extern OPENSSL_ia32cap_P
165.globl bn_GF2m_mul_2x2
166.type bn_GF2m_mul_2x2,\@abi-omnipotent
167.align 16
168bn_GF2m_mul_2x2:
169 mov OPENSSL_ia32cap_P(%rip),%rax
170 bt \$33,%rax
171 jnc .Lvanilla_mul_2x2
172
173 movq $a1,%xmm0
174 movq $b1,%xmm1
175 movq $a0,%xmm2
176___
177$code.=<<___ if ($win64);
178 movq 40(%rsp),%xmm3
179___
180$code.=<<___ if (!$win64);
181 movq $b0,%xmm3
182___
183$code.=<<___;
184 movdqa %xmm0,%xmm4
185 movdqa %xmm1,%xmm5
186 pclmulqdq \$0,%xmm1,%xmm0 # a1·b1
187 pxor %xmm2,%xmm4
188 pxor %xmm3,%xmm5
189 pclmulqdq \$0,%xmm3,%xmm2 # a0·b0
190 pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
191 xorps %xmm0,%xmm4
192 xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1
193 movdqa %xmm4,%xmm5
194 pslldq \$8,%xmm4
195 psrldq \$8,%xmm5
196 pxor %xmm4,%xmm2
197 pxor %xmm5,%xmm0
198 movdqu %xmm2,0($rp)
199 movdqu %xmm0,16($rp)
200 ret
201
202.align 16
203.Lvanilla_mul_2x2:
204 lea -8*17(%rsp),%rsp
205___
206$code.=<<___ if ($win64);
207 mov `8*17+40`(%rsp),$b0
208 mov %rdi,8*15(%rsp)
209 mov %rsi,8*16(%rsp)
210___
211$code.=<<___;
212 mov %r14,8*10(%rsp)
213 mov %r13,8*11(%rsp)
214 mov %r12,8*12(%rsp)
215 mov %rbp,8*13(%rsp)
216 mov %rbx,8*14(%rsp)
217.Lbody_mul_2x2:
218 mov $rp,32(%rsp) # save the arguments
219 mov $a1,40(%rsp)
220 mov $a0,48(%rsp)
221 mov $b1,56(%rsp)
222 mov $b0,64(%rsp)
223
224 mov \$0xf,$mask
225 mov $a1,$a
226 mov $b1,$b
227 call _mul_1x1 # a1·b1
228 mov $lo,16(%rsp)
229 mov $hi,24(%rsp)
230
231 mov 48(%rsp),$a
232 mov 64(%rsp),$b
233 call _mul_1x1 # a0·b0
234 mov $lo,0(%rsp)
235 mov $hi,8(%rsp)
236
237 mov 40(%rsp),$a
238 mov 56(%rsp),$b
239 xor 48(%rsp),$a
240 xor 64(%rsp),$b
241 call _mul_1x1 # (a0+a1)·(b0+b1)
242___
243 @r=("%rbx","%rcx","%rdi","%rsi");
244$code.=<<___;
245 mov 0(%rsp),@r[0]
246 mov 8(%rsp),@r[1]
247 mov 16(%rsp),@r[2]
248 mov 24(%rsp),@r[3]
249 mov 32(%rsp),%rbp
250
251 xor $hi,$lo
252 xor @r[1],$hi
253 xor @r[0],$lo
254 mov @r[0],0(%rbp)
255 xor @r[2],$hi
256 mov @r[3],24(%rbp)
257 xor @r[3],$lo
258 xor @r[3],$hi
259 xor $hi,$lo
260 mov $hi,16(%rbp)
261 mov $lo,8(%rbp)
262
263 mov 8*10(%rsp),%r14
264 mov 8*11(%rsp),%r13
265 mov 8*12(%rsp),%r12
266 mov 8*13(%rsp),%rbp
267 mov 8*14(%rsp),%rbx
268___
269$code.=<<___ if ($win64);
270 mov 8*15(%rsp),%rdi
271 mov 8*16(%rsp),%rsi
272___
273$code.=<<___;
274 lea 8*17(%rsp),%rsp
275 ret
276.Lend_mul_2x2:
277.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
278.asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
279.align 16
280___
281
282# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
283# CONTEXT *context,DISPATCHER_CONTEXT *disp)
284if ($win64) {
285$rec="%rcx";
286$frame="%rdx";
287$context="%r8";
288$disp="%r9";
289
290$code.=<<___;
291.extern __imp_RtlVirtualUnwind
292
293.type se_handler,\@abi-omnipotent
294.align 16
295se_handler:
296 push %rsi
297 push %rdi
298 push %rbx
299 push %rbp
300 push %r12
301 push %r13
302 push %r14
303 push %r15
304 pushfq
305 sub \$64,%rsp
306
307 mov 152($context),%rax # pull context->Rsp
308 mov 248($context),%rbx # pull context->Rip
309
310 lea .Lbody_mul_2x2(%rip),%r10
311 cmp %r10,%rbx # context->Rip<"prologue" label
312 jb .Lin_prologue
313
314 mov 8*10(%rax),%r14 # mimic epilogue
315 mov 8*11(%rax),%r13
316 mov 8*12(%rax),%r12
317 mov 8*13(%rax),%rbp
318 mov 8*14(%rax),%rbx
319 mov 8*15(%rax),%rdi
320 mov 8*16(%rax),%rsi
321
322 mov %rbx,144($context) # restore context->Rbx
323 mov %rbp,160($context) # restore context->Rbp
324 mov %rsi,168($context) # restore context->Rsi
325 mov %rdi,176($context) # restore context->Rdi
326 mov %r12,216($context) # restore context->R12
327 mov %r13,224($context) # restore context->R13
328 mov %r14,232($context) # restore context->R14
329
330.Lin_prologue:
331 lea 8*17(%rax),%rax
332 mov %rax,152($context) # restore context->Rsp
333
334 mov 40($disp),%rdi # disp->ContextRecord
335 mov $context,%rsi # context
336 mov \$154,%ecx # sizeof(CONTEXT)
337 .long 0xa548f3fc # cld; rep movsq
338
339 mov $disp,%rsi
340 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
341 mov 8(%rsi),%rdx # arg2, disp->ImageBase
342 mov 0(%rsi),%r8 # arg3, disp->ControlPc
343 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
344 mov 40(%rsi),%r10 # disp->ContextRecord
345 lea 56(%rsi),%r11 # &disp->HandlerData
346 lea 24(%rsi),%r12 # &disp->EstablisherFrame
347 mov %r10,32(%rsp) # arg5
348 mov %r11,40(%rsp) # arg6
349 mov %r12,48(%rsp) # arg7
350 mov %rcx,56(%rsp) # arg8, (NULL)
351 call *__imp_RtlVirtualUnwind(%rip)
352
353 mov \$1,%eax # ExceptionContinueSearch
354 add \$64,%rsp
355 popfq
356 pop %r15
357 pop %r14
358 pop %r13
359 pop %r12
360 pop %rbp
361 pop %rbx
362 pop %rdi
363 pop %rsi
364 ret
365.size se_handler,.-se_handler
366
367.section .pdata
368.align 4
369 .rva _mul_1x1
370 .rva .Lend_mul_1x1
371 .rva .LSEH_info_1x1
372
373 .rva .Lvanilla_mul_2x2
374 .rva .Lend_mul_2x2
375 .rva .LSEH_info_2x2
376.section .xdata
377.align 8
378.LSEH_info_1x1:
379 .byte 0x01,0x07,0x02,0x00
380 .byte 0x07,0x01,0x11,0x00 # sub rsp,128+8
381.LSEH_info_2x2:
382 .byte 9,0,0,0
383 .rva se_handler
384___
385}
386
387$code =~ s/\`([^\`]*)\`/eval($1)/gem;
388print $code;
389close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont.pl b/src/lib/libcrypto/bn/asm/x86_64-mont.pl
deleted file mode 100755
index 5d79b35e1c..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-mont.pl
+++ /dev/null
@@ -1,1680 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# October 2005.
11#
12# Montgomery multiplication routine for x86_64. While it gives modest
13# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
14# than twice, >2x, as fast. Most common rsa1024 sign is improved by
15# respectful 50%. It remains to be seen if loop unrolling and
16# dedicated squaring routine can provide further improvement...
17
18# July 2011.
19#
20# Add dedicated squaring procedure. Performance improvement varies
21# from platform to platform, but in average it's ~5%/15%/25%/33%
22# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
23
24# August 2011.
25#
26# Unroll and modulo-schedule inner loops in such manner that they
27# are "fallen through" for input lengths of 8, which is critical for
28# 1024-bit RSA *sign*. Average performance improvement in comparison
29# to *initial* version of this module from 2005 is ~0%/30%/40%/45%
30# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
31
32$flavour = shift;
33$output = shift;
34if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
35
36$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
37
38$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
39( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
40( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
41die "can't locate x86_64-xlate.pl";
42
43open STDOUT,"| $^X $xlate $flavour $output";
44
45# int bn_mul_mont(
46$rp="%rdi"; # BN_ULONG *rp,
47$ap="%rsi"; # const BN_ULONG *ap,
48$bp="%rdx"; # const BN_ULONG *bp,
49$np="%rcx"; # const BN_ULONG *np,
50$n0="%r8"; # const BN_ULONG *n0,
51$num="%r9"; # int num);
52$lo0="%r10";
53$hi0="%r11";
54$hi1="%r13";
55$i="%r14";
56$j="%r15";
57$m0="%rbx";
58$m1="%rbp";
59
60$code=<<___;
61.text
62
63.globl bn_mul_mont
64.type bn_mul_mont,\@function,6
65.align 16
66bn_mul_mont:
67 test \$3,${num}d
68 jnz .Lmul_enter
69 cmp \$8,${num}d
70 jb .Lmul_enter
71 cmp $ap,$bp
72 jne .Lmul4x_enter
73 jmp .Lsqr4x_enter
74
75.align 16
76.Lmul_enter:
77 push %rbx
78 push %rbp
79 push %r12
80 push %r13
81 push %r14
82 push %r15
83
84 mov ${num}d,${num}d
85 lea 2($num),%r10
86 mov %rsp,%r11
87 neg %r10
88 lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2))
89 and \$-1024,%rsp # minimize TLB usage
90
91 mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
92.Lmul_body:
93 mov $bp,%r12 # reassign $bp
94___
95 $bp="%r12";
96$code.=<<___;
97 mov ($n0),$n0 # pull n0[0] value
98 mov ($bp),$m0 # m0=bp[0]
99 mov ($ap),%rax
100
101 xor $i,$i # i=0
102 xor $j,$j # j=0
103
104 mov $n0,$m1
105 mulq $m0 # ap[0]*bp[0]
106 mov %rax,$lo0
107 mov ($np),%rax
108
109 imulq $lo0,$m1 # "tp[0]"*n0
110 mov %rdx,$hi0
111
112 mulq $m1 # np[0]*m1
113 add %rax,$lo0 # discarded
114 mov 8($ap),%rax
115 adc \$0,%rdx
116 mov %rdx,$hi1
117
118 lea 1($j),$j # j++
119 jmp .L1st_enter
120
121.align 16
122.L1st:
123 add %rax,$hi1
124 mov ($ap,$j,8),%rax
125 adc \$0,%rdx
126 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
127 mov $lo0,$hi0
128 adc \$0,%rdx
129 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
130 mov %rdx,$hi1
131
132.L1st_enter:
133 mulq $m0 # ap[j]*bp[0]
134 add %rax,$hi0
135 mov ($np,$j,8),%rax
136 adc \$0,%rdx
137 lea 1($j),$j # j++
138 mov %rdx,$lo0
139
140 mulq $m1 # np[j]*m1
141 cmp $num,$j
142 jne .L1st
143
144 add %rax,$hi1
145 mov ($ap),%rax # ap[0]
146 adc \$0,%rdx
147 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
148 adc \$0,%rdx
149 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
150 mov %rdx,$hi1
151 mov $lo0,$hi0
152
153 xor %rdx,%rdx
154 add $hi0,$hi1
155 adc \$0,%rdx
156 mov $hi1,-8(%rsp,$num,8)
157 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
158
159 lea 1($i),$i # i++
160 jmp .Louter
161.align 16
162.Louter:
163 mov ($bp,$i,8),$m0 # m0=bp[i]
164 xor $j,$j # j=0
165 mov $n0,$m1
166 mov (%rsp),$lo0
167 mulq $m0 # ap[0]*bp[i]
168 add %rax,$lo0 # ap[0]*bp[i]+tp[0]
169 mov ($np),%rax
170 adc \$0,%rdx
171
172 imulq $lo0,$m1 # tp[0]*n0
173 mov %rdx,$hi0
174
175 mulq $m1 # np[0]*m1
176 add %rax,$lo0 # discarded
177 mov 8($ap),%rax
178 adc \$0,%rdx
179 mov 8(%rsp),$lo0 # tp[1]
180 mov %rdx,$hi1
181
182 lea 1($j),$j # j++
183 jmp .Linner_enter
184
185.align 16
186.Linner:
187 add %rax,$hi1
188 mov ($ap,$j,8),%rax
189 adc \$0,%rdx
190 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
191 mov (%rsp,$j,8),$lo0
192 adc \$0,%rdx
193 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
194 mov %rdx,$hi1
195
196.Linner_enter:
197 mulq $m0 # ap[j]*bp[i]
198 add %rax,$hi0
199 mov ($np,$j,8),%rax
200 adc \$0,%rdx
201 add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
202 mov %rdx,$hi0
203 adc \$0,$hi0
204 lea 1($j),$j # j++
205
206 mulq $m1 # np[j]*m1
207 cmp $num,$j
208 jne .Linner
209
210 add %rax,$hi1
211 mov ($ap),%rax # ap[0]
212 adc \$0,%rdx
213 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
214 mov (%rsp,$j,8),$lo0
215 adc \$0,%rdx
216 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
217 mov %rdx,$hi1
218
219 xor %rdx,%rdx
220 add $hi0,$hi1
221 adc \$0,%rdx
222 add $lo0,$hi1 # pull upmost overflow bit
223 adc \$0,%rdx
224 mov $hi1,-8(%rsp,$num,8)
225 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
226
227 lea 1($i),$i # i++
228 cmp $num,$i
229 jl .Louter
230
231 xor $i,$i # i=0 and clear CF!
232 mov (%rsp),%rax # tp[0]
233 lea (%rsp),$ap # borrow ap for tp
234 mov $num,$j # j=num
235 jmp .Lsub
236.align 16
237.Lsub: sbb ($np,$i,8),%rax
238 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
239 mov 8($ap,$i,8),%rax # tp[i+1]
240 lea 1($i),$i # i++
241 dec $j # doesnn't affect CF!
242 jnz .Lsub
243
244 sbb \$0,%rax # handle upmost overflow bit
245 xor $i,$i
246 and %rax,$ap
247 not %rax
248 mov $rp,$np
249 and %rax,$np
250 mov $num,$j # j=num
251 or $np,$ap # ap=borrow?tp:rp
252.align 16
253.Lcopy: # copy or in-place refresh
254 mov ($ap,$i,8),%rax
255 mov $i,(%rsp,$i,8) # zap temporary vector
256 mov %rax,($rp,$i,8) # rp[i]=tp[i]
257 lea 1($i),$i
258 sub \$1,$j
259 jnz .Lcopy
260
261 mov 8(%rsp,$num,8),%rsi # restore %rsp
262 mov \$1,%rax
263 mov (%rsi),%r15
264 mov 8(%rsi),%r14
265 mov 16(%rsi),%r13
266 mov 24(%rsi),%r12
267 mov 32(%rsi),%rbp
268 mov 40(%rsi),%rbx
269 lea 48(%rsi),%rsp
270.Lmul_epilogue:
271 ret
272.size bn_mul_mont,.-bn_mul_mont
273___
274{{{
275my @A=("%r10","%r11");
276my @N=("%r13","%rdi");
277$code.=<<___;
278.type bn_mul4x_mont,\@function,6
279.align 16
280bn_mul4x_mont:
281.Lmul4x_enter:
282 push %rbx
283 push %rbp
284 push %r12
285 push %r13
286 push %r14
287 push %r15
288
289 mov ${num}d,${num}d
290 lea 4($num),%r10
291 mov %rsp,%r11
292 neg %r10
293 lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4))
294 and \$-1024,%rsp # minimize TLB usage
295
296 mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
297.Lmul4x_body:
298 mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
299 mov %rdx,%r12 # reassign $bp
300___
301 $bp="%r12";
302$code.=<<___;
303 mov ($n0),$n0 # pull n0[0] value
304 mov ($bp),$m0 # m0=bp[0]
305 mov ($ap),%rax
306
307 xor $i,$i # i=0
308 xor $j,$j # j=0
309
310 mov $n0,$m1
311 mulq $m0 # ap[0]*bp[0]
312 mov %rax,$A[0]
313 mov ($np),%rax
314
315 imulq $A[0],$m1 # "tp[0]"*n0
316 mov %rdx,$A[1]
317
318 mulq $m1 # np[0]*m1
319 add %rax,$A[0] # discarded
320 mov 8($ap),%rax
321 adc \$0,%rdx
322 mov %rdx,$N[1]
323
324 mulq $m0
325 add %rax,$A[1]
326 mov 8($np),%rax
327 adc \$0,%rdx
328 mov %rdx,$A[0]
329
330 mulq $m1
331 add %rax,$N[1]
332 mov 16($ap),%rax
333 adc \$0,%rdx
334 add $A[1],$N[1]
335 lea 4($j),$j # j++
336 adc \$0,%rdx
337 mov $N[1],(%rsp)
338 mov %rdx,$N[0]
339 jmp .L1st4x
340.align 16
341.L1st4x:
342 mulq $m0 # ap[j]*bp[0]
343 add %rax,$A[0]
344 mov -16($np,$j,8),%rax
345 adc \$0,%rdx
346 mov %rdx,$A[1]
347
348 mulq $m1 # np[j]*m1
349 add %rax,$N[0]
350 mov -8($ap,$j,8),%rax
351 adc \$0,%rdx
352 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
353 adc \$0,%rdx
354 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
355 mov %rdx,$N[1]
356
357 mulq $m0 # ap[j]*bp[0]
358 add %rax,$A[1]
359 mov -8($np,$j,8),%rax
360 adc \$0,%rdx
361 mov %rdx,$A[0]
362
363 mulq $m1 # np[j]*m1
364 add %rax,$N[1]
365 mov ($ap,$j,8),%rax
366 adc \$0,%rdx
367 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
368 adc \$0,%rdx
369 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
370 mov %rdx,$N[0]
371
372 mulq $m0 # ap[j]*bp[0]
373 add %rax,$A[0]
374 mov ($np,$j,8),%rax
375 adc \$0,%rdx
376 mov %rdx,$A[1]
377
378 mulq $m1 # np[j]*m1
379 add %rax,$N[0]
380 mov 8($ap,$j,8),%rax
381 adc \$0,%rdx
382 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
383 adc \$0,%rdx
384 mov $N[0],-8(%rsp,$j,8) # tp[j-1]
385 mov %rdx,$N[1]
386
387 mulq $m0 # ap[j]*bp[0]
388 add %rax,$A[1]
389 mov 8($np,$j,8),%rax
390 adc \$0,%rdx
391 lea 4($j),$j # j++
392 mov %rdx,$A[0]
393
394 mulq $m1 # np[j]*m1
395 add %rax,$N[1]
396 mov -16($ap,$j,8),%rax
397 adc \$0,%rdx
398 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
399 adc \$0,%rdx
400 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
401 mov %rdx,$N[0]
402 cmp $num,$j
403 jl .L1st4x
404
405 mulq $m0 # ap[j]*bp[0]
406 add %rax,$A[0]
407 mov -16($np,$j,8),%rax
408 adc \$0,%rdx
409 mov %rdx,$A[1]
410
411 mulq $m1 # np[j]*m1
412 add %rax,$N[0]
413 mov -8($ap,$j,8),%rax
414 adc \$0,%rdx
415 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
416 adc \$0,%rdx
417 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
418 mov %rdx,$N[1]
419
420 mulq $m0 # ap[j]*bp[0]
421 add %rax,$A[1]
422 mov -8($np,$j,8),%rax
423 adc \$0,%rdx
424 mov %rdx,$A[0]
425
426 mulq $m1 # np[j]*m1
427 add %rax,$N[1]
428 mov ($ap),%rax # ap[0]
429 adc \$0,%rdx
430 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
431 adc \$0,%rdx
432 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
433 mov %rdx,$N[0]
434
435 xor $N[1],$N[1]
436 add $A[0],$N[0]
437 adc \$0,$N[1]
438 mov $N[0],-8(%rsp,$j,8)
439 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
440
441 lea 1($i),$i # i++
442.align 4
443.Louter4x:
444 mov ($bp,$i,8),$m0 # m0=bp[i]
445 xor $j,$j # j=0
446 mov (%rsp),$A[0]
447 mov $n0,$m1
448 mulq $m0 # ap[0]*bp[i]
449 add %rax,$A[0] # ap[0]*bp[i]+tp[0]
450 mov ($np),%rax
451 adc \$0,%rdx
452
453 imulq $A[0],$m1 # tp[0]*n0
454 mov %rdx,$A[1]
455
456 mulq $m1 # np[0]*m1
457 add %rax,$A[0] # "$N[0]", discarded
458 mov 8($ap),%rax
459 adc \$0,%rdx
460 mov %rdx,$N[1]
461
462 mulq $m0 # ap[j]*bp[i]
463 add %rax,$A[1]
464 mov 8($np),%rax
465 adc \$0,%rdx
466 add 8(%rsp),$A[1] # +tp[1]
467 adc \$0,%rdx
468 mov %rdx,$A[0]
469
470 mulq $m1 # np[j]*m1
471 add %rax,$N[1]
472 mov 16($ap),%rax
473 adc \$0,%rdx
474 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
475 lea 4($j),$j # j+=2
476 adc \$0,%rdx
477 mov $N[1],(%rsp) # tp[j-1]
478 mov %rdx,$N[0]
479 jmp .Linner4x
480.align 16
481.Linner4x:
482 mulq $m0 # ap[j]*bp[i]
483 add %rax,$A[0]
484 mov -16($np,$j,8),%rax
485 adc \$0,%rdx
486 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
487 adc \$0,%rdx
488 mov %rdx,$A[1]
489
490 mulq $m1 # np[j]*m1
491 add %rax,$N[0]
492 mov -8($ap,$j,8),%rax
493 adc \$0,%rdx
494 add $A[0],$N[0]
495 adc \$0,%rdx
496 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
497 mov %rdx,$N[1]
498
499 mulq $m0 # ap[j]*bp[i]
500 add %rax,$A[1]
501 mov -8($np,$j,8),%rax
502 adc \$0,%rdx
503 add -8(%rsp,$j,8),$A[1]
504 adc \$0,%rdx
505 mov %rdx,$A[0]
506
507 mulq $m1 # np[j]*m1
508 add %rax,$N[1]
509 mov ($ap,$j,8),%rax
510 adc \$0,%rdx
511 add $A[1],$N[1]
512 adc \$0,%rdx
513 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
514 mov %rdx,$N[0]
515
516 mulq $m0 # ap[j]*bp[i]
517 add %rax,$A[0]
518 mov ($np,$j,8),%rax
519 adc \$0,%rdx
520 add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
521 adc \$0,%rdx
522 mov %rdx,$A[1]
523
524 mulq $m1 # np[j]*m1
525 add %rax,$N[0]
526 mov 8($ap,$j,8),%rax
527 adc \$0,%rdx
528 add $A[0],$N[0]
529 adc \$0,%rdx
530 mov $N[0],-8(%rsp,$j,8) # tp[j-1]
531 mov %rdx,$N[1]
532
533 mulq $m0 # ap[j]*bp[i]
534 add %rax,$A[1]
535 mov 8($np,$j,8),%rax
536 adc \$0,%rdx
537 add 8(%rsp,$j,8),$A[1]
538 adc \$0,%rdx
539 lea 4($j),$j # j++
540 mov %rdx,$A[0]
541
542 mulq $m1 # np[j]*m1
543 add %rax,$N[1]
544 mov -16($ap,$j,8),%rax
545 adc \$0,%rdx
546 add $A[1],$N[1]
547 adc \$0,%rdx
548 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
549 mov %rdx,$N[0]
550 cmp $num,$j
551 jl .Linner4x
552
553 mulq $m0 # ap[j]*bp[i]
554 add %rax,$A[0]
555 mov -16($np,$j,8),%rax
556 adc \$0,%rdx
557 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
558 adc \$0,%rdx
559 mov %rdx,$A[1]
560
561 mulq $m1 # np[j]*m1
562 add %rax,$N[0]
563 mov -8($ap,$j,8),%rax
564 adc \$0,%rdx
565 add $A[0],$N[0]
566 adc \$0,%rdx
567 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
568 mov %rdx,$N[1]
569
570 mulq $m0 # ap[j]*bp[i]
571 add %rax,$A[1]
572 mov -8($np,$j,8),%rax
573 adc \$0,%rdx
574 add -8(%rsp,$j,8),$A[1]
575 adc \$0,%rdx
576 lea 1($i),$i # i++
577 mov %rdx,$A[0]
578
579 mulq $m1 # np[j]*m1
580 add %rax,$N[1]
581 mov ($ap),%rax # ap[0]
582 adc \$0,%rdx
583 add $A[1],$N[1]
584 adc \$0,%rdx
585 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
586 mov %rdx,$N[0]
587
588 xor $N[1],$N[1]
589 add $A[0],$N[0]
590 adc \$0,$N[1]
591 add (%rsp,$num,8),$N[0] # pull upmost overflow bit
592 adc \$0,$N[1]
593 mov $N[0],-8(%rsp,$j,8)
594 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
595
596 cmp $num,$i
597 jl .Louter4x
598___
599{
600my @ri=("%rax","%rdx",$m0,$m1);
601$code.=<<___;
602 mov 16(%rsp,$num,8),$rp # restore $rp
603 mov 0(%rsp),@ri[0] # tp[0]
604 pxor %xmm0,%xmm0
605 mov 8(%rsp),@ri[1] # tp[1]
606 shr \$2,$num # num/=4
607 lea (%rsp),$ap # borrow ap for tp
608 xor $i,$i # i=0 and clear CF!
609
610 sub 0($np),@ri[0]
611 mov 16($ap),@ri[2] # tp[2]
612 mov 24($ap),@ri[3] # tp[3]
613 sbb 8($np),@ri[1]
614 lea -1($num),$j # j=num/4-1
615 jmp .Lsub4x
616.align 16
617.Lsub4x:
618 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
619 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
620 sbb 16($np,$i,8),@ri[2]
621 mov 32($ap,$i,8),@ri[0] # tp[i+1]
622 mov 40($ap,$i,8),@ri[1]
623 sbb 24($np,$i,8),@ri[3]
624 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
625 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
626 sbb 32($np,$i,8),@ri[0]
627 mov 48($ap,$i,8),@ri[2]
628 mov 56($ap,$i,8),@ri[3]
629 sbb 40($np,$i,8),@ri[1]
630 lea 4($i),$i # i++
631 dec $j # doesnn't affect CF!
632 jnz .Lsub4x
633
634 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
635 mov 32($ap,$i,8),@ri[0] # load overflow bit
636 sbb 16($np,$i,8),@ri[2]
637 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
638 sbb 24($np,$i,8),@ri[3]
639 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
640
641 sbb \$0,@ri[0] # handle upmost overflow bit
642 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
643 xor $i,$i # i=0
644 and @ri[0],$ap
645 not @ri[0]
646 mov $rp,$np
647 and @ri[0],$np
648 lea -1($num),$j
649 or $np,$ap # ap=borrow?tp:rp
650
651 movdqu ($ap),%xmm1
652 movdqa %xmm0,(%rsp)
653 movdqu %xmm1,($rp)
654 jmp .Lcopy4x
655.align 16
656.Lcopy4x: # copy or in-place refresh
657 movdqu 16($ap,$i),%xmm2
658 movdqu 32($ap,$i),%xmm1
659 movdqa %xmm0,16(%rsp,$i)
660 movdqu %xmm2,16($rp,$i)
661 movdqa %xmm0,32(%rsp,$i)
662 movdqu %xmm1,32($rp,$i)
663 lea 32($i),$i
664 dec $j
665 jnz .Lcopy4x
666
667 shl \$2,$num
668 movdqu 16($ap,$i),%xmm2
669 movdqa %xmm0,16(%rsp,$i)
670 movdqu %xmm2,16($rp,$i)
671___
672}
673$code.=<<___;
674 mov 8(%rsp,$num,8),%rsi # restore %rsp
675 mov \$1,%rax
676 mov (%rsi),%r15
677 mov 8(%rsi),%r14
678 mov 16(%rsi),%r13
679 mov 24(%rsi),%r12
680 mov 32(%rsi),%rbp
681 mov 40(%rsi),%rbx
682 lea 48(%rsi),%rsp
683.Lmul4x_epilogue:
684 ret
685.size bn_mul4x_mont,.-bn_mul4x_mont
686___
687}}}
688 {{{
689######################################################################
690# void bn_sqr4x_mont(
691my $rptr="%rdi"; # const BN_ULONG *rptr,
692my $aptr="%rsi"; # const BN_ULONG *aptr,
693my $bptr="%rdx"; # not used
694my $nptr="%rcx"; # const BN_ULONG *nptr,
695my $n0 ="%r8"; # const BN_ULONG *n0);
696my $num ="%r9"; # int num, has to be divisible by 4 and
697 # not less than 8
698
699my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
700my @A0=("%r10","%r11");
701my @A1=("%r12","%r13");
702my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
703
704$code.=<<___;
705.type bn_sqr4x_mont,\@function,6
706.align 16
707bn_sqr4x_mont:
708.Lsqr4x_enter:
709 push %rbx
710 push %rbp
711 push %r12
712 push %r13
713 push %r14
714 push %r15
715
716 shl \$3,${num}d # convert $num to bytes
717 xor %r10,%r10
718 mov %rsp,%r11 # put aside %rsp
719 sub $num,%r10 # -$num
720 mov ($n0),$n0 # *n0
721 lea -72(%rsp,%r10,2),%rsp # alloca(frame+2*$num)
722 and \$-1024,%rsp # minimize TLB usage
723 ##############################################################
724 # Stack layout
725 #
726 # +0 saved $num, used in reduction section
727 # +8 &t[2*$num], used in reduction section
728 # +32 saved $rptr
729 # +40 saved $nptr
730 # +48 saved *n0
731 # +56 saved %rsp
732 # +64 t[2*$num]
733 #
734 mov $rptr,32(%rsp) # save $rptr
735 mov $nptr,40(%rsp)
736 mov $n0, 48(%rsp)
737 mov %r11, 56(%rsp) # save original %rsp
738.Lsqr4x_body:
739 ##############################################################
740 # Squaring part:
741 #
742 # a) multiply-n-add everything but a[i]*a[i];
743 # b) shift result of a) by 1 to the left and accumulate
744 # a[i]*a[i] products;
745 #
746 lea 32(%r10),$i # $i=-($num-32)
747 lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2]
748
749 mov $num,$j # $j=$num
750
751 # comments apply to $num==8 case
752 mov -32($aptr,$i),$a0 # a[0]
753 lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
754 mov -24($aptr,$i),%rax # a[1]
755 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
756 mov -16($aptr,$i),$ai # a[2]
757 mov %rax,$a1
758
759 mul $a0 # a[1]*a[0]
760 mov %rax,$A0[0] # a[1]*a[0]
761 mov $ai,%rax # a[2]
762 mov %rdx,$A0[1]
763 mov $A0[0],-24($tptr,$i) # t[1]
764
765 xor $A0[0],$A0[0]
766 mul $a0 # a[2]*a[0]
767 add %rax,$A0[1]
768 mov $ai,%rax
769 adc %rdx,$A0[0]
770 mov $A0[1],-16($tptr,$i) # t[2]
771
772 lea -16($i),$j # j=-16
773
774
775 mov 8($aptr,$j),$ai # a[3]
776 mul $a1 # a[2]*a[1]
777 mov %rax,$A1[0] # a[2]*a[1]+t[3]
778 mov $ai,%rax
779 mov %rdx,$A1[1]
780
781 xor $A0[1],$A0[1]
782 add $A1[0],$A0[0]
783 lea 16($j),$j
784 adc \$0,$A0[1]
785 mul $a0 # a[3]*a[0]
786 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
787 mov $ai,%rax
788 adc %rdx,$A0[1]
789 mov $A0[0],-8($tptr,$j) # t[3]
790 jmp .Lsqr4x_1st
791
792.align 16
793.Lsqr4x_1st:
794 mov ($aptr,$j),$ai # a[4]
795 xor $A1[0],$A1[0]
796 mul $a1 # a[3]*a[1]
797 add %rax,$A1[1] # a[3]*a[1]+t[4]
798 mov $ai,%rax
799 adc %rdx,$A1[0]
800
801 xor $A0[0],$A0[0]
802 add $A1[1],$A0[1]
803 adc \$0,$A0[0]
804 mul $a0 # a[4]*a[0]
805 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
806 mov $ai,%rax # a[3]
807 adc %rdx,$A0[0]
808 mov $A0[1],($tptr,$j) # t[4]
809
810
811 mov 8($aptr,$j),$ai # a[5]
812 xor $A1[1],$A1[1]
813 mul $a1 # a[4]*a[3]
814 add %rax,$A1[0] # a[4]*a[3]+t[5]
815 mov $ai,%rax
816 adc %rdx,$A1[1]
817
818 xor $A0[1],$A0[1]
819 add $A1[0],$A0[0]
820 adc \$0,$A0[1]
821 mul $a0 # a[5]*a[2]
822 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
823 mov $ai,%rax
824 adc %rdx,$A0[1]
825 mov $A0[0],8($tptr,$j) # t[5]
826
827 mov 16($aptr,$j),$ai # a[6]
828 xor $A1[0],$A1[0]
829 mul $a1 # a[5]*a[3]
830 add %rax,$A1[1] # a[5]*a[3]+t[6]
831 mov $ai,%rax
832 adc %rdx,$A1[0]
833
834 xor $A0[0],$A0[0]
835 add $A1[1],$A0[1]
836 adc \$0,$A0[0]
837 mul $a0 # a[6]*a[2]
838 add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6]
839 mov $ai,%rax # a[3]
840 adc %rdx,$A0[0]
841 mov $A0[1],16($tptr,$j) # t[6]
842
843
844 mov 24($aptr,$j),$ai # a[7]
845 xor $A1[1],$A1[1]
846 mul $a1 # a[6]*a[5]
847 add %rax,$A1[0] # a[6]*a[5]+t[7]
848 mov $ai,%rax
849 adc %rdx,$A1[1]
850
851 xor $A0[1],$A0[1]
852 add $A1[0],$A0[0]
853 lea 32($j),$j
854 adc \$0,$A0[1]
855 mul $a0 # a[7]*a[4]
856 add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6]
857 mov $ai,%rax
858 adc %rdx,$A0[1]
859 mov $A0[0],-8($tptr,$j) # t[7]
860
861 cmp \$0,$j
862 jne .Lsqr4x_1st
863
864 xor $A1[0],$A1[0]
865 add $A0[1],$A1[1]
866 adc \$0,$A1[0]
867 mul $a1 # a[7]*a[5]
868 add %rax,$A1[1]
869 adc %rdx,$A1[0]
870
871 mov $A1[1],($tptr) # t[8]
872 lea 16($i),$i
873 mov $A1[0],8($tptr) # t[9]
874 jmp .Lsqr4x_outer
875
876.align 16
877.Lsqr4x_outer: # comments apply to $num==6 case
878 mov -32($aptr,$i),$a0 # a[0]
879 lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
880 mov -24($aptr,$i),%rax # a[1]
881 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
882 mov -16($aptr,$i),$ai # a[2]
883 mov %rax,$a1
884
885 mov -24($tptr,$i),$A0[0] # t[1]
886 xor $A0[1],$A0[1]
887 mul $a0 # a[1]*a[0]
888 add %rax,$A0[0] # a[1]*a[0]+t[1]
889 mov $ai,%rax # a[2]
890 adc %rdx,$A0[1]
891 mov $A0[0],-24($tptr,$i) # t[1]
892
893 xor $A0[0],$A0[0]
894 add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2]
895 adc \$0,$A0[0]
896 mul $a0 # a[2]*a[0]
897 add %rax,$A0[1]
898 mov $ai,%rax
899 adc %rdx,$A0[0]
900 mov $A0[1],-16($tptr,$i) # t[2]
901
902 lea -16($i),$j # j=-16
903 xor $A1[0],$A1[0]
904
905
906 mov 8($aptr,$j),$ai # a[3]
907 xor $A1[1],$A1[1]
908 add 8($tptr,$j),$A1[0]
909 adc \$0,$A1[1]
910 mul $a1 # a[2]*a[1]
911 add %rax,$A1[0] # a[2]*a[1]+t[3]
912 mov $ai,%rax
913 adc %rdx,$A1[1]
914
915 xor $A0[1],$A0[1]
916 add $A1[0],$A0[0]
917 adc \$0,$A0[1]
918 mul $a0 # a[3]*a[0]
919 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
920 mov $ai,%rax
921 adc %rdx,$A0[1]
922 mov $A0[0],8($tptr,$j) # t[3]
923
924 lea 16($j),$j
925 jmp .Lsqr4x_inner
926
927.align 16
928.Lsqr4x_inner:
929 mov ($aptr,$j),$ai # a[4]
930 xor $A1[0],$A1[0]
931 add ($tptr,$j),$A1[1]
932 adc \$0,$A1[0]
933 mul $a1 # a[3]*a[1]
934 add %rax,$A1[1] # a[3]*a[1]+t[4]
935 mov $ai,%rax
936 adc %rdx,$A1[0]
937
938 xor $A0[0],$A0[0]
939 add $A1[1],$A0[1]
940 adc \$0,$A0[0]
941 mul $a0 # a[4]*a[0]
942 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
943 mov $ai,%rax # a[3]
944 adc %rdx,$A0[0]
945 mov $A0[1],($tptr,$j) # t[4]
946
947 mov 8($aptr,$j),$ai # a[5]
948 xor $A1[1],$A1[1]
949 add 8($tptr,$j),$A1[0]
950 adc \$0,$A1[1]
951 mul $a1 # a[4]*a[3]
952 add %rax,$A1[0] # a[4]*a[3]+t[5]
953 mov $ai,%rax
954 adc %rdx,$A1[1]
955
956 xor $A0[1],$A0[1]
957 add $A1[0],$A0[0]
958 lea 16($j),$j # j++
959 adc \$0,$A0[1]
960 mul $a0 # a[5]*a[2]
961 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
962 mov $ai,%rax
963 adc %rdx,$A0[1]
964 mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below
965
966 cmp \$0,$j
967 jne .Lsqr4x_inner
968
969 xor $A1[0],$A1[0]
970 add $A0[1],$A1[1]
971 adc \$0,$A1[0]
972 mul $a1 # a[5]*a[3]
973 add %rax,$A1[1]
974 adc %rdx,$A1[0]
975
976 mov $A1[1],($tptr) # t[6], "preloaded t[2]" below
977 mov $A1[0],8($tptr) # t[7], "preloaded t[3]" below
978
979 add \$16,$i
980 jnz .Lsqr4x_outer
981
982 # comments apply to $num==4 case
983 mov -32($aptr),$a0 # a[0]
984 lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
985 mov -24($aptr),%rax # a[1]
986 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
987 mov -16($aptr),$ai # a[2]
988 mov %rax,$a1
989
990 xor $A0[1],$A0[1]
991 mul $a0 # a[1]*a[0]
992 add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1]
993 mov $ai,%rax # a[2]
994 adc %rdx,$A0[1]
995 mov $A0[0],-24($tptr) # t[1]
996
997 xor $A0[0],$A0[0]
998 add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2]
999 adc \$0,$A0[0]
1000 mul $a0 # a[2]*a[0]
1001 add %rax,$A0[1]
1002 mov $ai,%rax
1003 adc %rdx,$A0[0]
1004 mov $A0[1],-16($tptr) # t[2]
1005
1006 mov -8($aptr),$ai # a[3]
1007 mul $a1 # a[2]*a[1]
1008 add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3]
1009 mov $ai,%rax
1010 adc \$0,%rdx
1011
1012 xor $A0[1],$A0[1]
1013 add $A1[0],$A0[0]
1014 mov %rdx,$A1[1]
1015 adc \$0,$A0[1]
1016 mul $a0 # a[3]*a[0]
1017 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
1018 mov $ai,%rax
1019 adc %rdx,$A0[1]
1020 mov $A0[0],-8($tptr) # t[3]
1021
1022 xor $A1[0],$A1[0]
1023 add $A0[1],$A1[1]
1024 adc \$0,$A1[0]
1025 mul $a1 # a[3]*a[1]
1026 add %rax,$A1[1]
1027 mov -16($aptr),%rax # a[2]
1028 adc %rdx,$A1[0]
1029
1030 mov $A1[1],($tptr) # t[4]
1031 mov $A1[0],8($tptr) # t[5]
1032
1033 mul $ai # a[2]*a[3]
1034___
1035{
1036my ($shift,$carry)=($a0,$a1);
1037my @S=(@A1,$ai,$n0);
1038$code.=<<___;
1039 add \$16,$i
1040 xor $shift,$shift
1041 sub $num,$i # $i=16-$num
1042 xor $carry,$carry
1043
1044 add $A1[0],%rax # t[5]
1045 adc \$0,%rdx
1046 mov %rax,8($tptr) # t[5]
1047 mov %rdx,16($tptr) # t[6]
1048 mov $carry,24($tptr) # t[7]
1049
1050 mov -16($aptr,$i),%rax # a[0]
1051 lea 64(%rsp,$num,2),$tptr
1052 xor $A0[0],$A0[0] # t[0]
1053 mov -24($tptr,$i,2),$A0[1] # t[1]
1054
1055 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1056 shr \$63,$A0[0]
1057 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1058 shr \$63,$A0[1]
1059 or $A0[0],$S[1] # | t[2*i]>>63
1060 mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
1061 mov $A0[1],$shift # shift=t[2*i+1]>>63
1062 mul %rax # a[i]*a[i]
1063 neg $carry # mov $carry,cf
1064 mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
1065 adc %rax,$S[0]
1066 mov -8($aptr,$i),%rax # a[i+1] # prefetch
1067 mov $S[0],-32($tptr,$i,2)
1068 adc %rdx,$S[1]
1069
1070 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1071 mov $S[1],-24($tptr,$i,2)
1072 sbb $carry,$carry # mov cf,$carry
1073 shr \$63,$A0[0]
1074 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1075 shr \$63,$A0[1]
1076 or $A0[0],$S[3] # | t[2*i]>>63
1077 mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
1078 mov $A0[1],$shift # shift=t[2*i+1]>>63
1079 mul %rax # a[i]*a[i]
1080 neg $carry # mov $carry,cf
1081 mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
1082 adc %rax,$S[2]
1083 mov 0($aptr,$i),%rax # a[i+1] # prefetch
1084 mov $S[2],-16($tptr,$i,2)
1085 adc %rdx,$S[3]
1086 lea 16($i),$i
1087 mov $S[3],-40($tptr,$i,2)
1088 sbb $carry,$carry # mov cf,$carry
1089 jmp .Lsqr4x_shift_n_add
1090
1091.align 16
1092.Lsqr4x_shift_n_add:
1093 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1094 shr \$63,$A0[0]
1095 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1096 shr \$63,$A0[1]
1097 or $A0[0],$S[1] # | t[2*i]>>63
1098 mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
1099 mov $A0[1],$shift # shift=t[2*i+1]>>63
1100 mul %rax # a[i]*a[i]
1101 neg $carry # mov $carry,cf
1102 mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
1103 adc %rax,$S[0]
1104 mov -8($aptr,$i),%rax # a[i+1] # prefetch
1105 mov $S[0],-32($tptr,$i,2)
1106 adc %rdx,$S[1]
1107
1108 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1109 mov $S[1],-24($tptr,$i,2)
1110 sbb $carry,$carry # mov cf,$carry
1111 shr \$63,$A0[0]
1112 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1113 shr \$63,$A0[1]
1114 or $A0[0],$S[3] # | t[2*i]>>63
1115 mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
1116 mov $A0[1],$shift # shift=t[2*i+1]>>63
1117 mul %rax # a[i]*a[i]
1118 neg $carry # mov $carry,cf
1119 mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
1120 adc %rax,$S[2]
1121 mov 0($aptr,$i),%rax # a[i+1] # prefetch
1122 mov $S[2],-16($tptr,$i,2)
1123 adc %rdx,$S[3]
1124
1125 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1126 mov $S[3],-8($tptr,$i,2)
1127 sbb $carry,$carry # mov cf,$carry
1128 shr \$63,$A0[0]
1129 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1130 shr \$63,$A0[1]
1131 or $A0[0],$S[1] # | t[2*i]>>63
1132 mov 16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
1133 mov $A0[1],$shift # shift=t[2*i+1]>>63
1134 mul %rax # a[i]*a[i]
1135 neg $carry # mov $carry,cf
1136 mov 24($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
1137 adc %rax,$S[0]
1138 mov 8($aptr,$i),%rax # a[i+1] # prefetch
1139 mov $S[0],0($tptr,$i,2)
1140 adc %rdx,$S[1]
1141
1142 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1143 mov $S[1],8($tptr,$i,2)
1144 sbb $carry,$carry # mov cf,$carry
1145 shr \$63,$A0[0]
1146 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1147 shr \$63,$A0[1]
1148 or $A0[0],$S[3] # | t[2*i]>>63
1149 mov 32($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
1150 mov $A0[1],$shift # shift=t[2*i+1]>>63
1151 mul %rax # a[i]*a[i]
1152 neg $carry # mov $carry,cf
1153 mov 40($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
1154 adc %rax,$S[2]
1155 mov 16($aptr,$i),%rax # a[i+1] # prefetch
1156 mov $S[2],16($tptr,$i,2)
1157 adc %rdx,$S[3]
1158 mov $S[3],24($tptr,$i,2)
1159 sbb $carry,$carry # mov cf,$carry
1160 add \$32,$i
1161 jnz .Lsqr4x_shift_n_add
1162
1163 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1164 shr \$63,$A0[0]
1165 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1166 shr \$63,$A0[1]
1167 or $A0[0],$S[1] # | t[2*i]>>63
1168 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
1169 mov $A0[1],$shift # shift=t[2*i+1]>>63
1170 mul %rax # a[i]*a[i]
1171 neg $carry # mov $carry,cf
1172 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
1173 adc %rax,$S[0]
1174 mov -8($aptr),%rax # a[i+1] # prefetch
1175 mov $S[0],-32($tptr)
1176 adc %rdx,$S[1]
1177
1178 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
1179 mov $S[1],-24($tptr)
1180 sbb $carry,$carry # mov cf,$carry
1181 shr \$63,$A0[0]
1182 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1183 shr \$63,$A0[1]
1184 or $A0[0],$S[3] # | t[2*i]>>63
1185 mul %rax # a[i]*a[i]
1186 neg $carry # mov $carry,cf
1187 adc %rax,$S[2]
1188 adc %rdx,$S[3]
1189 mov $S[2],-16($tptr)
1190 mov $S[3],-8($tptr)
1191___
1192}
1193##############################################################
1194# Montgomery reduction part, "word-by-word" algorithm.
1195#
1196{
1197my ($topbit,$nptr)=("%rbp",$aptr);
1198my ($m0,$m1)=($a0,$a1);
1199my @Ni=("%rbx","%r9");
1200$code.=<<___;
1201 mov 40(%rsp),$nptr # restore $nptr
1202 mov 48(%rsp),$n0 # restore *n0
1203 xor $j,$j
1204 mov $num,0(%rsp) # save $num
1205 sub $num,$j # $j=-$num
1206 mov 64(%rsp),$A0[0] # t[0] # modsched #
1207 mov $n0,$m0 # # modsched #
1208 lea 64(%rsp,$num,2),%rax # end of t[] buffer
1209 lea 64(%rsp,$num),$tptr # end of t[] window
1210 mov %rax,8(%rsp) # save end of t[] buffer
1211 lea ($nptr,$num),$nptr # end of n[] buffer
1212 xor $topbit,$topbit # $topbit=0
1213
1214 mov 0($nptr,$j),%rax # n[0] # modsched #
1215 mov 8($nptr,$j),$Ni[1] # n[1] # modsched #
1216 imulq $A0[0],$m0 # m0=t[0]*n0 # modsched #
1217 mov %rax,$Ni[0] # # modsched #
1218 jmp .Lsqr4x_mont_outer
1219
1220.align 16
1221.Lsqr4x_mont_outer:
1222 xor $A0[1],$A0[1]
1223 mul $m0 # n[0]*m0
1224 add %rax,$A0[0] # n[0]*m0+t[0]
1225 mov $Ni[1],%rax
1226 adc %rdx,$A0[1]
1227 mov $n0,$m1
1228
1229 xor $A0[0],$A0[0]
1230 add 8($tptr,$j),$A0[1]
1231 adc \$0,$A0[0]
1232 mul $m0 # n[1]*m0
1233 add %rax,$A0[1] # n[1]*m0+t[1]
1234 mov $Ni[0],%rax
1235 adc %rdx,$A0[0]
1236
1237 imulq $A0[1],$m1
1238
1239 mov 16($nptr,$j),$Ni[0] # n[2]
1240 xor $A1[1],$A1[1]
1241 add $A0[1],$A1[0]
1242 adc \$0,$A1[1]
1243 mul $m1 # n[0]*m1
1244 add %rax,$A1[0] # n[0]*m1+"t[1]"
1245 mov $Ni[0],%rax
1246 adc %rdx,$A1[1]
1247 mov $A1[0],8($tptr,$j) # "t[1]"
1248
1249 xor $A0[1],$A0[1]
1250 add 16($tptr,$j),$A0[0]
1251 adc \$0,$A0[1]
1252 mul $m0 # n[2]*m0
1253 add %rax,$A0[0] # n[2]*m0+t[2]
1254 mov $Ni[1],%rax
1255 adc %rdx,$A0[1]
1256
1257 mov 24($nptr,$j),$Ni[1] # n[3]
1258 xor $A1[0],$A1[0]
1259 add $A0[0],$A1[1]
1260 adc \$0,$A1[0]
1261 mul $m1 # n[1]*m1
1262 add %rax,$A1[1] # n[1]*m1+"t[2]"
1263 mov $Ni[1],%rax
1264 adc %rdx,$A1[0]
1265 mov $A1[1],16($tptr,$j) # "t[2]"
1266
1267 xor $A0[0],$A0[0]
1268 add 24($tptr,$j),$A0[1]
1269 lea 32($j),$j
1270 adc \$0,$A0[0]
1271 mul $m0 # n[3]*m0
1272 add %rax,$A0[1] # n[3]*m0+t[3]
1273 mov $Ni[0],%rax
1274 adc %rdx,$A0[0]
1275 jmp .Lsqr4x_mont_inner
1276
1277.align 16
1278.Lsqr4x_mont_inner:
1279 mov ($nptr,$j),$Ni[0] # n[4]
1280 xor $A1[1],$A1[1]
1281 add $A0[1],$A1[0]
1282 adc \$0,$A1[1]
1283 mul $m1 # n[2]*m1
1284 add %rax,$A1[0] # n[2]*m1+"t[3]"
1285 mov $Ni[0],%rax
1286 adc %rdx,$A1[1]
1287 mov $A1[0],-8($tptr,$j) # "t[3]"
1288
1289 xor $A0[1],$A0[1]
1290 add ($tptr,$j),$A0[0]
1291 adc \$0,$A0[1]
1292 mul $m0 # n[4]*m0
1293 add %rax,$A0[0] # n[4]*m0+t[4]
1294 mov $Ni[1],%rax
1295 adc %rdx,$A0[1]
1296
1297 mov 8($nptr,$j),$Ni[1] # n[5]
1298 xor $A1[0],$A1[0]
1299 add $A0[0],$A1[1]
1300 adc \$0,$A1[0]
1301 mul $m1 # n[3]*m1
1302 add %rax,$A1[1] # n[3]*m1+"t[4]"
1303 mov $Ni[1],%rax
1304 adc %rdx,$A1[0]
1305 mov $A1[1],($tptr,$j) # "t[4]"
1306
1307 xor $A0[0],$A0[0]
1308 add 8($tptr,$j),$A0[1]
1309 adc \$0,$A0[0]
1310 mul $m0 # n[5]*m0
1311 add %rax,$A0[1] # n[5]*m0+t[5]
1312 mov $Ni[0],%rax
1313 adc %rdx,$A0[0]
1314
1315
1316 mov 16($nptr,$j),$Ni[0] # n[6]
1317 xor $A1[1],$A1[1]
1318 add $A0[1],$A1[0]
1319 adc \$0,$A1[1]
1320 mul $m1 # n[4]*m1
1321 add %rax,$A1[0] # n[4]*m1+"t[5]"
1322 mov $Ni[0],%rax
1323 adc %rdx,$A1[1]
1324 mov $A1[0],8($tptr,$j) # "t[5]"
1325
1326 xor $A0[1],$A0[1]
1327 add 16($tptr,$j),$A0[0]
1328 adc \$0,$A0[1]
1329 mul $m0 # n[6]*m0
1330 add %rax,$A0[0] # n[6]*m0+t[6]
1331 mov $Ni[1],%rax
1332 adc %rdx,$A0[1]
1333
1334 mov 24($nptr,$j),$Ni[1] # n[7]
1335 xor $A1[0],$A1[0]
1336 add $A0[0],$A1[1]
1337 adc \$0,$A1[0]
1338 mul $m1 # n[5]*m1
1339 add %rax,$A1[1] # n[5]*m1+"t[6]"
1340 mov $Ni[1],%rax
1341 adc %rdx,$A1[0]
1342 mov $A1[1],16($tptr,$j) # "t[6]"
1343
1344 xor $A0[0],$A0[0]
1345 add 24($tptr,$j),$A0[1]
1346 lea 32($j),$j
1347 adc \$0,$A0[0]
1348 mul $m0 # n[7]*m0
1349 add %rax,$A0[1] # n[7]*m0+t[7]
1350 mov $Ni[0],%rax
1351 adc %rdx,$A0[0]
1352 cmp \$0,$j
1353 jne .Lsqr4x_mont_inner
1354
1355 sub 0(%rsp),$j # $j=-$num # modsched #
1356 mov $n0,$m0 # # modsched #
1357
1358 xor $A1[1],$A1[1]
1359 add $A0[1],$A1[0]
1360 adc \$0,$A1[1]
1361 mul $m1 # n[6]*m1
1362 add %rax,$A1[0] # n[6]*m1+"t[7]"
1363 mov $Ni[1],%rax
1364 adc %rdx,$A1[1]
1365 mov $A1[0],-8($tptr) # "t[7]"
1366
1367 xor $A0[1],$A0[1]
1368 add ($tptr),$A0[0] # +t[8]
1369 adc \$0,$A0[1]
1370 mov 0($nptr,$j),$Ni[0] # n[0] # modsched #
1371 add $topbit,$A0[0]
1372 adc \$0,$A0[1]
1373
1374 imulq 16($tptr,$j),$m0 # m0=t[0]*n0 # modsched #
1375 xor $A1[0],$A1[0]
1376 mov 8($nptr,$j),$Ni[1] # n[1] # modsched #
1377 add $A0[0],$A1[1]
1378 mov 16($tptr,$j),$A0[0] # t[0] # modsched #
1379 adc \$0,$A1[0]
1380 mul $m1 # n[7]*m1
1381 add %rax,$A1[1] # n[7]*m1+"t[8]"
1382 mov $Ni[0],%rax # # modsched #
1383 adc %rdx,$A1[0]
1384 mov $A1[1],($tptr) # "t[8]"
1385
1386 xor $topbit,$topbit
1387 add 8($tptr),$A1[0] # +t[9]
1388 adc $topbit,$topbit
1389 add $A0[1],$A1[0]
1390 lea 16($tptr),$tptr # "t[$num]>>128"
1391 adc \$0,$topbit
1392 mov $A1[0],-8($tptr) # "t[9]"
1393 cmp 8(%rsp),$tptr # are we done?
1394 jb .Lsqr4x_mont_outer
1395
1396 mov 0(%rsp),$num # restore $num
1397 mov $topbit,($tptr) # save $topbit
1398___
1399}
1400##############################################################
1401# Post-condition, 4x unrolled copy from bn_mul_mont
1402#
1403{
1404my ($tptr,$nptr)=("%rbx",$aptr);
1405my @ri=("%rax","%rdx","%r10","%r11");
1406$code.=<<___;
1407 mov 64(%rsp,$num),@ri[0] # tp[0]
1408 lea 64(%rsp,$num),$tptr # upper half of t[2*$num] holds result
1409 mov 40(%rsp),$nptr # restore $nptr
1410 shr \$5,$num # num/4
1411 mov 8($tptr),@ri[1] # t[1]
1412 xor $i,$i # i=0 and clear CF!
1413
1414 mov 32(%rsp),$rptr # restore $rptr
1415 sub 0($nptr),@ri[0]
1416 mov 16($tptr),@ri[2] # t[2]
1417 mov 24($tptr),@ri[3] # t[3]
1418 sbb 8($nptr),@ri[1]
1419 lea -1($num),$j # j=num/4-1
1420 jmp .Lsqr4x_sub
1421.align 16
1422.Lsqr4x_sub:
1423 mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i]
1424 mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i]
1425 sbb 16($nptr,$i,8),@ri[2]
1426 mov 32($tptr,$i,8),@ri[0] # tp[i+1]
1427 mov 40($tptr,$i,8),@ri[1]
1428 sbb 24($nptr,$i,8),@ri[3]
1429 mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i]
1430 mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i]
1431 sbb 32($nptr,$i,8),@ri[0]
1432 mov 48($tptr,$i,8),@ri[2]
1433 mov 56($tptr,$i,8),@ri[3]
1434 sbb 40($nptr,$i,8),@ri[1]
1435 lea 4($i),$i # i++
1436 dec $j # doesn't affect CF!
1437 jnz .Lsqr4x_sub
1438
1439 mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i]
1440 mov 32($tptr,$i,8),@ri[0] # load overflow bit
1441 sbb 16($nptr,$i,8),@ri[2]
1442 mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i]
1443 sbb 24($nptr,$i,8),@ri[3]
1444 mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i]
1445
1446 sbb \$0,@ri[0] # handle upmost overflow bit
1447 mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i]
1448 xor $i,$i # i=0
1449 and @ri[0],$tptr
1450 not @ri[0]
1451 mov $rptr,$nptr
1452 and @ri[0],$nptr
1453 lea -1($num),$j
1454 or $nptr,$tptr # tp=borrow?tp:rp
1455
1456 pxor %xmm0,%xmm0
1457 lea 64(%rsp,$num,8),$nptr
1458 movdqu ($tptr),%xmm1
1459 lea ($nptr,$num,8),$nptr
1460 movdqa %xmm0,64(%rsp) # zap lower half of temporary vector
1461 movdqa %xmm0,($nptr) # zap upper half of temporary vector
1462 movdqu %xmm1,($rptr)
1463 jmp .Lsqr4x_copy
1464.align 16
1465.Lsqr4x_copy: # copy or in-place refresh
1466 movdqu 16($tptr,$i),%xmm2
1467 movdqu 32($tptr,$i),%xmm1
1468 movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector
1469 movdqa %xmm0,96(%rsp,$i) # zap lower half of temporary vector
1470 movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector
1471 movdqa %xmm0,32($nptr,$i) # zap upper half of temporary vector
1472 movdqu %xmm2,16($rptr,$i)
1473 movdqu %xmm1,32($rptr,$i)
1474 lea 32($i),$i
1475 dec $j
1476 jnz .Lsqr4x_copy
1477
1478 movdqu 16($tptr,$i),%xmm2
1479 movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector
1480 movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector
1481 movdqu %xmm2,16($rptr,$i)
1482___
1483}
1484$code.=<<___;
1485 mov 56(%rsp),%rsi # restore %rsp
1486 mov \$1,%rax
1487 mov 0(%rsi),%r15
1488 mov 8(%rsi),%r14
1489 mov 16(%rsi),%r13
1490 mov 24(%rsi),%r12
1491 mov 32(%rsi),%rbp
1492 mov 40(%rsi),%rbx
1493 lea 48(%rsi),%rsp
1494.Lsqr4x_epilogue:
1495 ret
1496.size bn_sqr4x_mont,.-bn_sqr4x_mont
1497___
1498}}}
1499$code.=<<___;
1500.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1501.align 16
1502___
1503
1504# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1505# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1506if ($win64) {
1507$rec="%rcx";
1508$frame="%rdx";
1509$context="%r8";
1510$disp="%r9";
1511
1512$code.=<<___;
1513.extern __imp_RtlVirtualUnwind
1514.type mul_handler,\@abi-omnipotent
1515.align 16
1516mul_handler:
1517 push %rsi
1518 push %rdi
1519 push %rbx
1520 push %rbp
1521 push %r12
1522 push %r13
1523 push %r14
1524 push %r15
1525 pushfq
1526 sub \$64,%rsp
1527
1528 mov 120($context),%rax # pull context->Rax
1529 mov 248($context),%rbx # pull context->Rip
1530
1531 mov 8($disp),%rsi # disp->ImageBase
1532 mov 56($disp),%r11 # disp->HandlerData
1533
1534 mov 0(%r11),%r10d # HandlerData[0]
1535 lea (%rsi,%r10),%r10 # end of prologue label
1536 cmp %r10,%rbx # context->Rip<end of prologue label
1537 jb .Lcommon_seh_tail
1538
1539 mov 152($context),%rax # pull context->Rsp
1540
1541 mov 4(%r11),%r10d # HandlerData[1]
1542 lea (%rsi,%r10),%r10 # epilogue label
1543 cmp %r10,%rbx # context->Rip>=epilogue label
1544 jae .Lcommon_seh_tail
1545
1546 mov 192($context),%r10 # pull $num
1547 mov 8(%rax,%r10,8),%rax # pull saved stack pointer
1548 lea 48(%rax),%rax
1549
1550 mov -8(%rax),%rbx
1551 mov -16(%rax),%rbp
1552 mov -24(%rax),%r12
1553 mov -32(%rax),%r13
1554 mov -40(%rax),%r14
1555 mov -48(%rax),%r15
1556 mov %rbx,144($context) # restore context->Rbx
1557 mov %rbp,160($context) # restore context->Rbp
1558 mov %r12,216($context) # restore context->R12
1559 mov %r13,224($context) # restore context->R13
1560 mov %r14,232($context) # restore context->R14
1561 mov %r15,240($context) # restore context->R15
1562
1563 jmp .Lcommon_seh_tail
1564.size mul_handler,.-mul_handler
1565
1566.type sqr_handler,\@abi-omnipotent
1567.align 16
1568sqr_handler:
1569 push %rsi
1570 push %rdi
1571 push %rbx
1572 push %rbp
1573 push %r12
1574 push %r13
1575 push %r14
1576 push %r15
1577 pushfq
1578 sub \$64,%rsp
1579
1580 mov 120($context),%rax # pull context->Rax
1581 mov 248($context),%rbx # pull context->Rip
1582
1583 lea .Lsqr4x_body(%rip),%r10
1584 cmp %r10,%rbx # context->Rip<.Lsqr_body
1585 jb .Lcommon_seh_tail
1586
1587 mov 152($context),%rax # pull context->Rsp
1588
1589 lea .Lsqr4x_epilogue(%rip),%r10
1590 cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
1591 jae .Lcommon_seh_tail
1592
1593 mov 56(%rax),%rax # pull saved stack pointer
1594 lea 48(%rax),%rax
1595
1596 mov -8(%rax),%rbx
1597 mov -16(%rax),%rbp
1598 mov -24(%rax),%r12
1599 mov -32(%rax),%r13
1600 mov -40(%rax),%r14
1601 mov -48(%rax),%r15
1602 mov %rbx,144($context) # restore context->Rbx
1603 mov %rbp,160($context) # restore context->Rbp
1604 mov %r12,216($context) # restore context->R12
1605 mov %r13,224($context) # restore context->R13
1606 mov %r14,232($context) # restore context->R14
1607 mov %r15,240($context) # restore context->R15
1608
1609.Lcommon_seh_tail:
1610 mov 8(%rax),%rdi
1611 mov 16(%rax),%rsi
1612 mov %rax,152($context) # restore context->Rsp
1613 mov %rsi,168($context) # restore context->Rsi
1614 mov %rdi,176($context) # restore context->Rdi
1615
1616 mov 40($disp),%rdi # disp->ContextRecord
1617 mov $context,%rsi # context
1618 mov \$154,%ecx # sizeof(CONTEXT)
1619 .long 0xa548f3fc # cld; rep movsq
1620
1621 mov $disp,%rsi
1622 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1623 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1624 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1625 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1626 mov 40(%rsi),%r10 # disp->ContextRecord
1627 lea 56(%rsi),%r11 # &disp->HandlerData
1628 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1629 mov %r10,32(%rsp) # arg5
1630 mov %r11,40(%rsp) # arg6
1631 mov %r12,48(%rsp) # arg7
1632 mov %rcx,56(%rsp) # arg8, (NULL)
1633 call *__imp_RtlVirtualUnwind(%rip)
1634
1635 mov \$1,%eax # ExceptionContinueSearch
1636 add \$64,%rsp
1637 popfq
1638 pop %r15
1639 pop %r14
1640 pop %r13
1641 pop %r12
1642 pop %rbp
1643 pop %rbx
1644 pop %rdi
1645 pop %rsi
1646 ret
1647.size sqr_handler,.-sqr_handler
1648
1649.section .pdata
1650.align 4
1651 .rva .LSEH_begin_bn_mul_mont
1652 .rva .LSEH_end_bn_mul_mont
1653 .rva .LSEH_info_bn_mul_mont
1654
1655 .rva .LSEH_begin_bn_mul4x_mont
1656 .rva .LSEH_end_bn_mul4x_mont
1657 .rva .LSEH_info_bn_mul4x_mont
1658
1659 .rva .LSEH_begin_bn_sqr4x_mont
1660 .rva .LSEH_end_bn_sqr4x_mont
1661 .rva .LSEH_info_bn_sqr4x_mont
1662
1663.section .xdata
1664.align 8
1665.LSEH_info_bn_mul_mont:
1666 .byte 9,0,0,0
1667 .rva mul_handler
1668 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
1669.LSEH_info_bn_mul4x_mont:
1670 .byte 9,0,0,0
1671 .rva mul_handler
1672 .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
1673.LSEH_info_bn_sqr4x_mont:
1674 .byte 9,0,0,0
1675 .rva sqr_handler
1676___
1677}
1678
1679print $code;
1680close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont5.pl b/src/lib/libcrypto/bn/asm/x86_64-mont5.pl
deleted file mode 100755
index 057cda28aa..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-mont5.pl
+++ /dev/null
@@ -1,1070 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# August 2011.
11#
12# Companion to x86_64-mont.pl that optimizes cache-timing attack
13# countermeasures. The subroutines are produced by replacing bp[i]
14# references in their x86_64-mont.pl counterparts with cache-neutral
15# references to powers table computed in BN_mod_exp_mont_consttime.
16# In addition subroutine that scatters elements of the powers table
17# is implemented, so that scatter-/gathering can be tuned without
18# bn_exp.c modifications.
19
20$flavour = shift;
21$output = shift;
22if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
23
24$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
25
26$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
27( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
28( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
29die "can't locate x86_64-xlate.pl";
30
31open STDOUT,"| $^X $xlate $flavour $output";
32
33# int bn_mul_mont_gather5(
34$rp="%rdi"; # BN_ULONG *rp,
35$ap="%rsi"; # const BN_ULONG *ap,
36$bp="%rdx"; # const BN_ULONG *bp,
37$np="%rcx"; # const BN_ULONG *np,
38$n0="%r8"; # const BN_ULONG *n0,
39$num="%r9"; # int num,
40 # int idx); # 0 to 2^5-1, "index" in $bp holding
41 # pre-computed powers of a', interlaced
42 # in such manner that b[0] is $bp[idx],
43 # b[1] is [2^5+idx], etc.
44$lo0="%r10";
45$hi0="%r11";
46$hi1="%r13";
47$i="%r14";
48$j="%r15";
49$m0="%rbx";
50$m1="%rbp";
51
52$code=<<___;
53.text
54
55.globl bn_mul_mont_gather5
56.type bn_mul_mont_gather5,\@function,6
57.align 64
58bn_mul_mont_gather5:
59 test \$3,${num}d
60 jnz .Lmul_enter
61 cmp \$8,${num}d
62 jb .Lmul_enter
63 jmp .Lmul4x_enter
64
65.align 16
66.Lmul_enter:
67 mov ${num}d,${num}d
68 mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
69 push %rbx
70 push %rbp
71 push %r12
72 push %r13
73 push %r14
74 push %r15
75___
76$code.=<<___ if ($win64);
77 lea -0x28(%rsp),%rsp
78 movaps %xmm6,(%rsp)
79 movaps %xmm7,0x10(%rsp)
80.Lmul_alloca:
81___
82$code.=<<___;
83 mov %rsp,%rax
84 lea 2($num),%r11
85 neg %r11
86 lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
87 and \$-1024,%rsp # minimize TLB usage
88
89 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
90.Lmul_body:
91 mov $bp,%r12 # reassign $bp
92___
93 $bp="%r12";
94 $STRIDE=2**5*8; # 5 is "window size"
95 $N=$STRIDE/4; # should match cache line size
96$code.=<<___;
97 mov %r10,%r11
98 shr \$`log($N/8)/log(2)`,%r10
99 and \$`$N/8-1`,%r11
100 not %r10
101 lea .Lmagic_masks(%rip),%rax
102 and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
103 lea 96($bp,%r11,8),$bp # pointer within 1st cache line
104 movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
105 movq 8(%rax,%r10,8),%xmm5 # cache line contains element
106 movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
107 movq 24(%rax,%r10,8),%xmm7
108
109 movq `0*$STRIDE/4-96`($bp),%xmm0
110 movq `1*$STRIDE/4-96`($bp),%xmm1
111 pand %xmm4,%xmm0
112 movq `2*$STRIDE/4-96`($bp),%xmm2
113 pand %xmm5,%xmm1
114 movq `3*$STRIDE/4-96`($bp),%xmm3
115 pand %xmm6,%xmm2
116 por %xmm1,%xmm0
117 pand %xmm7,%xmm3
118 por %xmm2,%xmm0
119 lea $STRIDE($bp),$bp
120 por %xmm3,%xmm0
121
122 movq %xmm0,$m0 # m0=bp[0]
123
124 mov ($n0),$n0 # pull n0[0] value
125 mov ($ap),%rax
126
127 xor $i,$i # i=0
128 xor $j,$j # j=0
129
130 movq `0*$STRIDE/4-96`($bp),%xmm0
131 movq `1*$STRIDE/4-96`($bp),%xmm1
132 pand %xmm4,%xmm0
133 movq `2*$STRIDE/4-96`($bp),%xmm2
134 pand %xmm5,%xmm1
135
136 mov $n0,$m1
137 mulq $m0 # ap[0]*bp[0]
138 mov %rax,$lo0
139 mov ($np),%rax
140
141 movq `3*$STRIDE/4-96`($bp),%xmm3
142 pand %xmm6,%xmm2
143 por %xmm1,%xmm0
144 pand %xmm7,%xmm3
145
146 imulq $lo0,$m1 # "tp[0]"*n0
147 mov %rdx,$hi0
148
149 por %xmm2,%xmm0
150 lea $STRIDE($bp),$bp
151 por %xmm3,%xmm0
152
153 mulq $m1 # np[0]*m1
154 add %rax,$lo0 # discarded
155 mov 8($ap),%rax
156 adc \$0,%rdx
157 mov %rdx,$hi1
158
159 lea 1($j),$j # j++
160 jmp .L1st_enter
161
162.align 16
163.L1st:
164 add %rax,$hi1
165 mov ($ap,$j,8),%rax
166 adc \$0,%rdx
167 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
168 mov $lo0,$hi0
169 adc \$0,%rdx
170 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
171 mov %rdx,$hi1
172
173.L1st_enter:
174 mulq $m0 # ap[j]*bp[0]
175 add %rax,$hi0
176 mov ($np,$j,8),%rax
177 adc \$0,%rdx
178 lea 1($j),$j # j++
179 mov %rdx,$lo0
180
181 mulq $m1 # np[j]*m1
182 cmp $num,$j
183 jne .L1st
184
185 movq %xmm0,$m0 # bp[1]
186
187 add %rax,$hi1
188 mov ($ap),%rax # ap[0]
189 adc \$0,%rdx
190 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
191 adc \$0,%rdx
192 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
193 mov %rdx,$hi1
194 mov $lo0,$hi0
195
196 xor %rdx,%rdx
197 add $hi0,$hi1
198 adc \$0,%rdx
199 mov $hi1,-8(%rsp,$num,8)
200 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
201
202 lea 1($i),$i # i++
203 jmp .Louter
204.align 16
205.Louter:
206 xor $j,$j # j=0
207 mov $n0,$m1
208 mov (%rsp),$lo0
209
210 movq `0*$STRIDE/4-96`($bp),%xmm0
211 movq `1*$STRIDE/4-96`($bp),%xmm1
212 pand %xmm4,%xmm0
213 movq `2*$STRIDE/4-96`($bp),%xmm2
214 pand %xmm5,%xmm1
215
216 mulq $m0 # ap[0]*bp[i]
217 add %rax,$lo0 # ap[0]*bp[i]+tp[0]
218 mov ($np),%rax
219 adc \$0,%rdx
220
221 movq `3*$STRIDE/4-96`($bp),%xmm3
222 pand %xmm6,%xmm2
223 por %xmm1,%xmm0
224 pand %xmm7,%xmm3
225
226 imulq $lo0,$m1 # tp[0]*n0
227 mov %rdx,$hi0
228
229 por %xmm2,%xmm0
230 lea $STRIDE($bp),$bp
231 por %xmm3,%xmm0
232
233 mulq $m1 # np[0]*m1
234 add %rax,$lo0 # discarded
235 mov 8($ap),%rax
236 adc \$0,%rdx
237 mov 8(%rsp),$lo0 # tp[1]
238 mov %rdx,$hi1
239
240 lea 1($j),$j # j++
241 jmp .Linner_enter
242
243.align 16
244.Linner:
245 add %rax,$hi1
246 mov ($ap,$j,8),%rax
247 adc \$0,%rdx
248 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
249 mov (%rsp,$j,8),$lo0
250 adc \$0,%rdx
251 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
252 mov %rdx,$hi1
253
254.Linner_enter:
255 mulq $m0 # ap[j]*bp[i]
256 add %rax,$hi0
257 mov ($np,$j,8),%rax
258 adc \$0,%rdx
259 add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
260 mov %rdx,$hi0
261 adc \$0,$hi0
262 lea 1($j),$j # j++
263
264 mulq $m1 # np[j]*m1
265 cmp $num,$j
266 jne .Linner
267
268 movq %xmm0,$m0 # bp[i+1]
269
270 add %rax,$hi1
271 mov ($ap),%rax # ap[0]
272 adc \$0,%rdx
273 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
274 mov (%rsp,$j,8),$lo0
275 adc \$0,%rdx
276 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
277 mov %rdx,$hi1
278
279 xor %rdx,%rdx
280 add $hi0,$hi1
281 adc \$0,%rdx
282 add $lo0,$hi1 # pull upmost overflow bit
283 adc \$0,%rdx
284 mov $hi1,-8(%rsp,$num,8)
285 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
286
287 lea 1($i),$i # i++
288 cmp $num,$i
289 jl .Louter
290
291 xor $i,$i # i=0 and clear CF!
292 mov (%rsp),%rax # tp[0]
293 lea (%rsp),$ap # borrow ap for tp
294 mov $num,$j # j=num
295 jmp .Lsub
296.align 16
297.Lsub: sbb ($np,$i,8),%rax
298 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
299 mov 8($ap,$i,8),%rax # tp[i+1]
300 lea 1($i),$i # i++
301 dec $j # doesnn't affect CF!
302 jnz .Lsub
303
304 sbb \$0,%rax # handle upmost overflow bit
305 xor $i,$i
306 and %rax,$ap
307 not %rax
308 mov $rp,$np
309 and %rax,$np
310 mov $num,$j # j=num
311 or $np,$ap # ap=borrow?tp:rp
312.align 16
313.Lcopy: # copy or in-place refresh
314 mov ($ap,$i,8),%rax
315 mov $i,(%rsp,$i,8) # zap temporary vector
316 mov %rax,($rp,$i,8) # rp[i]=tp[i]
317 lea 1($i),$i
318 sub \$1,$j
319 jnz .Lcopy
320
321 mov 8(%rsp,$num,8),%rsi # restore %rsp
322 mov \$1,%rax
323___
324$code.=<<___ if ($win64);
325 movaps (%rsi),%xmm6
326 movaps 0x10(%rsi),%xmm7
327 lea 0x28(%rsi),%rsi
328___
329$code.=<<___;
330 mov (%rsi),%r15
331 mov 8(%rsi),%r14
332 mov 16(%rsi),%r13
333 mov 24(%rsi),%r12
334 mov 32(%rsi),%rbp
335 mov 40(%rsi),%rbx
336 lea 48(%rsi),%rsp
337.Lmul_epilogue:
338 ret
339.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
340___
341{{{
342my @A=("%r10","%r11");
343my @N=("%r13","%rdi");
344$code.=<<___;
345.type bn_mul4x_mont_gather5,\@function,6
346.align 16
347bn_mul4x_mont_gather5:
348.Lmul4x_enter:
349 mov ${num}d,${num}d
350 mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
351 push %rbx
352 push %rbp
353 push %r12
354 push %r13
355 push %r14
356 push %r15
357___
358$code.=<<___ if ($win64);
359 lea -0x28(%rsp),%rsp
360 movaps %xmm6,(%rsp)
361 movaps %xmm7,0x10(%rsp)
362.Lmul4x_alloca:
363___
364$code.=<<___;
365 mov %rsp,%rax
366 lea 4($num),%r11
367 neg %r11
368 lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4))
369 and \$-1024,%rsp # minimize TLB usage
370
371 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
372.Lmul4x_body:
373 mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
374 mov %rdx,%r12 # reassign $bp
375___
376 $bp="%r12";
377 $STRIDE=2**5*8; # 5 is "window size"
378 $N=$STRIDE/4; # should match cache line size
379$code.=<<___;
380 mov %r10,%r11
381 shr \$`log($N/8)/log(2)`,%r10
382 and \$`$N/8-1`,%r11
383 not %r10
384 lea .Lmagic_masks(%rip),%rax
385 and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
386 lea 96($bp,%r11,8),$bp # pointer within 1st cache line
387 movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
388 movq 8(%rax,%r10,8),%xmm5 # cache line contains element
389 movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
390 movq 24(%rax,%r10,8),%xmm7
391
392 movq `0*$STRIDE/4-96`($bp),%xmm0
393 movq `1*$STRIDE/4-96`($bp),%xmm1
394 pand %xmm4,%xmm0
395 movq `2*$STRIDE/4-96`($bp),%xmm2
396 pand %xmm5,%xmm1
397 movq `3*$STRIDE/4-96`($bp),%xmm3
398 pand %xmm6,%xmm2
399 por %xmm1,%xmm0
400 pand %xmm7,%xmm3
401 por %xmm2,%xmm0
402 lea $STRIDE($bp),$bp
403 por %xmm3,%xmm0
404
405 movq %xmm0,$m0 # m0=bp[0]
406 mov ($n0),$n0 # pull n0[0] value
407 mov ($ap),%rax
408
409 xor $i,$i # i=0
410 xor $j,$j # j=0
411
412 movq `0*$STRIDE/4-96`($bp),%xmm0
413 movq `1*$STRIDE/4-96`($bp),%xmm1
414 pand %xmm4,%xmm0
415 movq `2*$STRIDE/4-96`($bp),%xmm2
416 pand %xmm5,%xmm1
417
418 mov $n0,$m1
419 mulq $m0 # ap[0]*bp[0]
420 mov %rax,$A[0]
421 mov ($np),%rax
422
423 movq `3*$STRIDE/4-96`($bp),%xmm3
424 pand %xmm6,%xmm2
425 por %xmm1,%xmm0
426 pand %xmm7,%xmm3
427
428 imulq $A[0],$m1 # "tp[0]"*n0
429 mov %rdx,$A[1]
430
431 por %xmm2,%xmm0
432 lea $STRIDE($bp),$bp
433 por %xmm3,%xmm0
434
435 mulq $m1 # np[0]*m1
436 add %rax,$A[0] # discarded
437 mov 8($ap),%rax
438 adc \$0,%rdx
439 mov %rdx,$N[1]
440
441 mulq $m0
442 add %rax,$A[1]
443 mov 8($np),%rax
444 adc \$0,%rdx
445 mov %rdx,$A[0]
446
447 mulq $m1
448 add %rax,$N[1]
449 mov 16($ap),%rax
450 adc \$0,%rdx
451 add $A[1],$N[1]
452 lea 4($j),$j # j++
453 adc \$0,%rdx
454 mov $N[1],(%rsp)
455 mov %rdx,$N[0]
456 jmp .L1st4x
457.align 16
458.L1st4x:
459 mulq $m0 # ap[j]*bp[0]
460 add %rax,$A[0]
461 mov -16($np,$j,8),%rax
462 adc \$0,%rdx
463 mov %rdx,$A[1]
464
465 mulq $m1 # np[j]*m1
466 add %rax,$N[0]
467 mov -8($ap,$j,8),%rax
468 adc \$0,%rdx
469 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
470 adc \$0,%rdx
471 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
472 mov %rdx,$N[1]
473
474 mulq $m0 # ap[j]*bp[0]
475 add %rax,$A[1]
476 mov -8($np,$j,8),%rax
477 adc \$0,%rdx
478 mov %rdx,$A[0]
479
480 mulq $m1 # np[j]*m1
481 add %rax,$N[1]
482 mov ($ap,$j,8),%rax
483 adc \$0,%rdx
484 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
485 adc \$0,%rdx
486 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
487 mov %rdx,$N[0]
488
489 mulq $m0 # ap[j]*bp[0]
490 add %rax,$A[0]
491 mov ($np,$j,8),%rax
492 adc \$0,%rdx
493 mov %rdx,$A[1]
494
495 mulq $m1 # np[j]*m1
496 add %rax,$N[0]
497 mov 8($ap,$j,8),%rax
498 adc \$0,%rdx
499 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
500 adc \$0,%rdx
501 mov $N[0],-8(%rsp,$j,8) # tp[j-1]
502 mov %rdx,$N[1]
503
504 mulq $m0 # ap[j]*bp[0]
505 add %rax,$A[1]
506 mov 8($np,$j,8),%rax
507 adc \$0,%rdx
508 lea 4($j),$j # j++
509 mov %rdx,$A[0]
510
511 mulq $m1 # np[j]*m1
512 add %rax,$N[1]
513 mov -16($ap,$j,8),%rax
514 adc \$0,%rdx
515 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
516 adc \$0,%rdx
517 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
518 mov %rdx,$N[0]
519 cmp $num,$j
520 jl .L1st4x
521
522 mulq $m0 # ap[j]*bp[0]
523 add %rax,$A[0]
524 mov -16($np,$j,8),%rax
525 adc \$0,%rdx
526 mov %rdx,$A[1]
527
528 mulq $m1 # np[j]*m1
529 add %rax,$N[0]
530 mov -8($ap,$j,8),%rax
531 adc \$0,%rdx
532 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
533 adc \$0,%rdx
534 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
535 mov %rdx,$N[1]
536
537 mulq $m0 # ap[j]*bp[0]
538 add %rax,$A[1]
539 mov -8($np,$j,8),%rax
540 adc \$0,%rdx
541 mov %rdx,$A[0]
542
543 mulq $m1 # np[j]*m1
544 add %rax,$N[1]
545 mov ($ap),%rax # ap[0]
546 adc \$0,%rdx
547 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
548 adc \$0,%rdx
549 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
550 mov %rdx,$N[0]
551
552 movq %xmm0,$m0 # bp[1]
553
554 xor $N[1],$N[1]
555 add $A[0],$N[0]
556 adc \$0,$N[1]
557 mov $N[0],-8(%rsp,$j,8)
558 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
559
560 lea 1($i),$i # i++
561.align 4
562.Louter4x:
563 xor $j,$j # j=0
564 movq `0*$STRIDE/4-96`($bp),%xmm0
565 movq `1*$STRIDE/4-96`($bp),%xmm1
566 pand %xmm4,%xmm0
567 movq `2*$STRIDE/4-96`($bp),%xmm2
568 pand %xmm5,%xmm1
569
570 mov (%rsp),$A[0]
571 mov $n0,$m1
572 mulq $m0 # ap[0]*bp[i]
573 add %rax,$A[0] # ap[0]*bp[i]+tp[0]
574 mov ($np),%rax
575 adc \$0,%rdx
576
577 movq `3*$STRIDE/4-96`($bp),%xmm3
578 pand %xmm6,%xmm2
579 por %xmm1,%xmm0
580 pand %xmm7,%xmm3
581
582 imulq $A[0],$m1 # tp[0]*n0
583 mov %rdx,$A[1]
584
585 por %xmm2,%xmm0
586 lea $STRIDE($bp),$bp
587 por %xmm3,%xmm0
588
589 mulq $m1 # np[0]*m1
590 add %rax,$A[0] # "$N[0]", discarded
591 mov 8($ap),%rax
592 adc \$0,%rdx
593 mov %rdx,$N[1]
594
595 mulq $m0 # ap[j]*bp[i]
596 add %rax,$A[1]
597 mov 8($np),%rax
598 adc \$0,%rdx
599 add 8(%rsp),$A[1] # +tp[1]
600 adc \$0,%rdx
601 mov %rdx,$A[0]
602
603 mulq $m1 # np[j]*m1
604 add %rax,$N[1]
605 mov 16($ap),%rax
606 adc \$0,%rdx
607 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
608 lea 4($j),$j # j+=2
609 adc \$0,%rdx
610 mov %rdx,$N[0]
611 jmp .Linner4x
612.align 16
613.Linner4x:
614 mulq $m0 # ap[j]*bp[i]
615 add %rax,$A[0]
616 mov -16($np,$j,8),%rax
617 adc \$0,%rdx
618 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
619 adc \$0,%rdx
620 mov %rdx,$A[1]
621
622 mulq $m1 # np[j]*m1
623 add %rax,$N[0]
624 mov -8($ap,$j,8),%rax
625 adc \$0,%rdx
626 add $A[0],$N[0]
627 adc \$0,%rdx
628 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
629 mov %rdx,$N[1]
630
631 mulq $m0 # ap[j]*bp[i]
632 add %rax,$A[1]
633 mov -8($np,$j,8),%rax
634 adc \$0,%rdx
635 add -8(%rsp,$j,8),$A[1]
636 adc \$0,%rdx
637 mov %rdx,$A[0]
638
639 mulq $m1 # np[j]*m1
640 add %rax,$N[1]
641 mov ($ap,$j,8),%rax
642 adc \$0,%rdx
643 add $A[1],$N[1]
644 adc \$0,%rdx
645 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
646 mov %rdx,$N[0]
647
648 mulq $m0 # ap[j]*bp[i]
649 add %rax,$A[0]
650 mov ($np,$j,8),%rax
651 adc \$0,%rdx
652 add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
653 adc \$0,%rdx
654 mov %rdx,$A[1]
655
656 mulq $m1 # np[j]*m1
657 add %rax,$N[0]
658 mov 8($ap,$j,8),%rax
659 adc \$0,%rdx
660 add $A[0],$N[0]
661 adc \$0,%rdx
662 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
663 mov %rdx,$N[1]
664
665 mulq $m0 # ap[j]*bp[i]
666 add %rax,$A[1]
667 mov 8($np,$j,8),%rax
668 adc \$0,%rdx
669 add 8(%rsp,$j,8),$A[1]
670 adc \$0,%rdx
671 lea 4($j),$j # j++
672 mov %rdx,$A[0]
673
674 mulq $m1 # np[j]*m1
675 add %rax,$N[1]
676 mov -16($ap,$j,8),%rax
677 adc \$0,%rdx
678 add $A[1],$N[1]
679 adc \$0,%rdx
680 mov $N[0],-40(%rsp,$j,8) # tp[j-1]
681 mov %rdx,$N[0]
682 cmp $num,$j
683 jl .Linner4x
684
685 mulq $m0 # ap[j]*bp[i]
686 add %rax,$A[0]
687 mov -16($np,$j,8),%rax
688 adc \$0,%rdx
689 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
690 adc \$0,%rdx
691 mov %rdx,$A[1]
692
693 mulq $m1 # np[j]*m1
694 add %rax,$N[0]
695 mov -8($ap,$j,8),%rax
696 adc \$0,%rdx
697 add $A[0],$N[0]
698 adc \$0,%rdx
699 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
700 mov %rdx,$N[1]
701
702 mulq $m0 # ap[j]*bp[i]
703 add %rax,$A[1]
704 mov -8($np,$j,8),%rax
705 adc \$0,%rdx
706 add -8(%rsp,$j,8),$A[1]
707 adc \$0,%rdx
708 lea 1($i),$i # i++
709 mov %rdx,$A[0]
710
711 mulq $m1 # np[j]*m1
712 add %rax,$N[1]
713 mov ($ap),%rax # ap[0]
714 adc \$0,%rdx
715 add $A[1],$N[1]
716 adc \$0,%rdx
717 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
718 mov %rdx,$N[0]
719
720 movq %xmm0,$m0 # bp[i+1]
721 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
722
723 xor $N[1],$N[1]
724 add $A[0],$N[0]
725 adc \$0,$N[1]
726 add (%rsp,$num,8),$N[0] # pull upmost overflow bit
727 adc \$0,$N[1]
728 mov $N[0],-8(%rsp,$j,8)
729 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
730
731 cmp $num,$i
732 jl .Louter4x
733___
734{
735my @ri=("%rax","%rdx",$m0,$m1);
736$code.=<<___;
737 mov 16(%rsp,$num,8),$rp # restore $rp
738 mov 0(%rsp),@ri[0] # tp[0]
739 pxor %xmm0,%xmm0
740 mov 8(%rsp),@ri[1] # tp[1]
741 shr \$2,$num # num/=4
742 lea (%rsp),$ap # borrow ap for tp
743 xor $i,$i # i=0 and clear CF!
744
745 sub 0($np),@ri[0]
746 mov 16($ap),@ri[2] # tp[2]
747 mov 24($ap),@ri[3] # tp[3]
748 sbb 8($np),@ri[1]
749 lea -1($num),$j # j=num/4-1
750 jmp .Lsub4x
751.align 16
752.Lsub4x:
753 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
754 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
755 sbb 16($np,$i,8),@ri[2]
756 mov 32($ap,$i,8),@ri[0] # tp[i+1]
757 mov 40($ap,$i,8),@ri[1]
758 sbb 24($np,$i,8),@ri[3]
759 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
760 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
761 sbb 32($np,$i,8),@ri[0]
762 mov 48($ap,$i,8),@ri[2]
763 mov 56($ap,$i,8),@ri[3]
764 sbb 40($np,$i,8),@ri[1]
765 lea 4($i),$i # i++
766 dec $j # doesnn't affect CF!
767 jnz .Lsub4x
768
769 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
770 mov 32($ap,$i,8),@ri[0] # load overflow bit
771 sbb 16($np,$i,8),@ri[2]
772 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
773 sbb 24($np,$i,8),@ri[3]
774 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
775
776 sbb \$0,@ri[0] # handle upmost overflow bit
777 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
778 xor $i,$i # i=0
779 and @ri[0],$ap
780 not @ri[0]
781 mov $rp,$np
782 and @ri[0],$np
783 lea -1($num),$j
784 or $np,$ap # ap=borrow?tp:rp
785
786 movdqu ($ap),%xmm1
787 movdqa %xmm0,(%rsp)
788 movdqu %xmm1,($rp)
789 jmp .Lcopy4x
790.align 16
791.Lcopy4x: # copy or in-place refresh
792 movdqu 16($ap,$i),%xmm2
793 movdqu 32($ap,$i),%xmm1
794 movdqa %xmm0,16(%rsp,$i)
795 movdqu %xmm2,16($rp,$i)
796 movdqa %xmm0,32(%rsp,$i)
797 movdqu %xmm1,32($rp,$i)
798 lea 32($i),$i
799 dec $j
800 jnz .Lcopy4x
801
802 shl \$2,$num
803 movdqu 16($ap,$i),%xmm2
804 movdqa %xmm0,16(%rsp,$i)
805 movdqu %xmm2,16($rp,$i)
806___
807}
808$code.=<<___;
809 mov 8(%rsp,$num,8),%rsi # restore %rsp
810 mov \$1,%rax
811___
812$code.=<<___ if ($win64);
813 movaps (%rsi),%xmm6
814 movaps 0x10(%rsi),%xmm7
815 lea 0x28(%rsi),%rsi
816___
817$code.=<<___;
818 mov (%rsi),%r15
819 mov 8(%rsi),%r14
820 mov 16(%rsi),%r13
821 mov 24(%rsi),%r12
822 mov 32(%rsi),%rbp
823 mov 40(%rsi),%rbx
824 lea 48(%rsi),%rsp
825.Lmul4x_epilogue:
826 ret
827.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
828___
829}}}
830
831{
832my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
833 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
834my $out=$inp;
835my $STRIDE=2**5*8;
836my $N=$STRIDE/4;
837
838$code.=<<___;
839.globl bn_scatter5
840.type bn_scatter5,\@abi-omnipotent
841.align 16
842bn_scatter5:
843 cmp \$0, $num
844 jz .Lscatter_epilogue
845 lea ($tbl,$idx,8),$tbl
846.Lscatter:
847 mov ($inp),%rax
848 lea 8($inp),$inp
849 mov %rax,($tbl)
850 lea 32*8($tbl),$tbl
851 sub \$1,$num
852 jnz .Lscatter
853.Lscatter_epilogue:
854 ret
855.size bn_scatter5,.-bn_scatter5
856
857.globl bn_gather5
858.type bn_gather5,\@abi-omnipotent
859.align 16
860bn_gather5:
861___
862$code.=<<___ if ($win64);
863.LSEH_begin_bn_gather5:
864 # I can't trust assembler to use specific encoding:-(
865 .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp
866 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
867 .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
868___
869$code.=<<___;
870 mov $idx,%r11
871 shr \$`log($N/8)/log(2)`,$idx
872 and \$`$N/8-1`,%r11
873 not $idx
874 lea .Lmagic_masks(%rip),%rax
875 and \$`2**5/($N/8)-1`,$idx # 5 is "window size"
876 lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line
877 movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which
878 movq 8(%rax,$idx,8),%xmm5 # cache line contains element
879 movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument
880 movq 24(%rax,$idx,8),%xmm7
881 jmp .Lgather
882.align 16
883.Lgather:
884 movq `0*$STRIDE/4-96`($tbl),%xmm0
885 movq `1*$STRIDE/4-96`($tbl),%xmm1
886 pand %xmm4,%xmm0
887 movq `2*$STRIDE/4-96`($tbl),%xmm2
888 pand %xmm5,%xmm1
889 movq `3*$STRIDE/4-96`($tbl),%xmm3
890 pand %xmm6,%xmm2
891 por %xmm1,%xmm0
892 pand %xmm7,%xmm3
893 por %xmm2,%xmm0
894 lea $STRIDE($tbl),$tbl
895 por %xmm3,%xmm0
896
897 movq %xmm0,($out) # m0=bp[0]
898 lea 8($out),$out
899 sub \$1,$num
900 jnz .Lgather
901___
902$code.=<<___ if ($win64);
903 movaps %xmm6,(%rsp)
904 movaps %xmm7,0x10(%rsp)
905 lea 0x28(%rsp),%rsp
906___
907$code.=<<___;
908 ret
909.LSEH_end_bn_gather5:
910.size bn_gather5,.-bn_gather5
911___
912}
913$code.=<<___;
914.align 64
915.Lmagic_masks:
916 .long 0,0, 0,0, 0,0, -1,-1
917 .long 0,0, 0,0, 0,0, 0,0
918.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
919___
920
921# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
922# CONTEXT *context,DISPATCHER_CONTEXT *disp)
923if ($win64) {
924$rec="%rcx";
925$frame="%rdx";
926$context="%r8";
927$disp="%r9";
928
929$code.=<<___;
930.extern __imp_RtlVirtualUnwind
931.type mul_handler,\@abi-omnipotent
932.align 16
933mul_handler:
934 push %rsi
935 push %rdi
936 push %rbx
937 push %rbp
938 push %r12
939 push %r13
940 push %r14
941 push %r15
942 pushfq
943 sub \$64,%rsp
944
945 mov 120($context),%rax # pull context->Rax
946 mov 248($context),%rbx # pull context->Rip
947
948 mov 8($disp),%rsi # disp->ImageBase
949 mov 56($disp),%r11 # disp->HandlerData
950
951 mov 0(%r11),%r10d # HandlerData[0]
952 lea (%rsi,%r10),%r10 # end of prologue label
953 cmp %r10,%rbx # context->Rip<end of prologue label
954 jb .Lcommon_seh_tail
955
956 lea `40+48`(%rax),%rax
957
958 mov 4(%r11),%r10d # HandlerData[1]
959 lea (%rsi,%r10),%r10 # end of alloca label
960 cmp %r10,%rbx # context->Rip<end of alloca label
961 jb .Lcommon_seh_tail
962
963 mov 152($context),%rax # pull context->Rsp
964
965 mov 8(%r11),%r10d # HandlerData[2]
966 lea (%rsi,%r10),%r10 # epilogue label
967 cmp %r10,%rbx # context->Rip>=epilogue label
968 jae .Lcommon_seh_tail
969
970 mov 192($context),%r10 # pull $num
971 mov 8(%rax,%r10,8),%rax # pull saved stack pointer
972
973 movaps (%rax),%xmm0
974 movaps 16(%rax),%xmm1
975 lea `40+48`(%rax),%rax
976
977 mov -8(%rax),%rbx
978 mov -16(%rax),%rbp
979 mov -24(%rax),%r12
980 mov -32(%rax),%r13
981 mov -40(%rax),%r14
982 mov -48(%rax),%r15
983 mov %rbx,144($context) # restore context->Rbx
984 mov %rbp,160($context) # restore context->Rbp
985 mov %r12,216($context) # restore context->R12
986 mov %r13,224($context) # restore context->R13
987 mov %r14,232($context) # restore context->R14
988 mov %r15,240($context) # restore context->R15
989 movups %xmm0,512($context) # restore context->Xmm6
990 movups %xmm1,528($context) # restore context->Xmm7
991
992.Lcommon_seh_tail:
993 mov 8(%rax),%rdi
994 mov 16(%rax),%rsi
995 mov %rax,152($context) # restore context->Rsp
996 mov %rsi,168($context) # restore context->Rsi
997 mov %rdi,176($context) # restore context->Rdi
998
999 mov 40($disp),%rdi # disp->ContextRecord
1000 mov $context,%rsi # context
1001 mov \$154,%ecx # sizeof(CONTEXT)
1002 .long 0xa548f3fc # cld; rep movsq
1003
1004 mov $disp,%rsi
1005 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1006 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1007 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1008 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1009 mov 40(%rsi),%r10 # disp->ContextRecord
1010 lea 56(%rsi),%r11 # &disp->HandlerData
1011 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1012 mov %r10,32(%rsp) # arg5
1013 mov %r11,40(%rsp) # arg6
1014 mov %r12,48(%rsp) # arg7
1015 mov %rcx,56(%rsp) # arg8, (NULL)
1016 call *__imp_RtlVirtualUnwind(%rip)
1017
1018 mov \$1,%eax # ExceptionContinueSearch
1019 add \$64,%rsp
1020 popfq
1021 pop %r15
1022 pop %r14
1023 pop %r13
1024 pop %r12
1025 pop %rbp
1026 pop %rbx
1027 pop %rdi
1028 pop %rsi
1029 ret
1030.size mul_handler,.-mul_handler
1031
1032.section .pdata
1033.align 4
1034 .rva .LSEH_begin_bn_mul_mont_gather5
1035 .rva .LSEH_end_bn_mul_mont_gather5
1036 .rva .LSEH_info_bn_mul_mont_gather5
1037
1038 .rva .LSEH_begin_bn_mul4x_mont_gather5
1039 .rva .LSEH_end_bn_mul4x_mont_gather5
1040 .rva .LSEH_info_bn_mul4x_mont_gather5
1041
1042 .rva .LSEH_begin_bn_gather5
1043 .rva .LSEH_end_bn_gather5
1044 .rva .LSEH_info_bn_gather5
1045
1046.section .xdata
1047.align 8
1048.LSEH_info_bn_mul_mont_gather5:
1049 .byte 9,0,0,0
1050 .rva mul_handler
1051 .rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[]
1052.align 8
1053.LSEH_info_bn_mul4x_mont_gather5:
1054 .byte 9,0,0,0
1055 .rva mul_handler
1056 .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
1057.align 8
1058.LSEH_info_bn_gather5:
1059 .byte 0x01,0x0d,0x05,0x00
1060 .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
1061 .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
1062 .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28
1063.align 8
1064___
1065}
1066
1067$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1068
1069print $code;
1070close STDOUT;
diff --git a/src/lib/libcrypto/bn/bn.h b/src/lib/libcrypto/bn/bn.h
deleted file mode 100644
index f34248ec4f..0000000000
--- a/src/lib/libcrypto/bn/bn.h
+++ /dev/null
@@ -1,891 +0,0 @@
1/* crypto/bn/bn.h */
2/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58/* ====================================================================
59 * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
60 *
61 * Redistribution and use in source and binary forms, with or without
62 * modification, are permitted provided that the following conditions
63 * are met:
64 *
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 *
68 * 2. Redistributions in binary form must reproduce the above copyright
69 * notice, this list of conditions and the following disclaimer in
70 * the documentation and/or other materials provided with the
71 * distribution.
72 *
73 * 3. All advertising materials mentioning features or use of this
74 * software must display the following acknowledgment:
75 * "This product includes software developed by the OpenSSL Project
76 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
77 *
78 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
79 * endorse or promote products derived from this software without
80 * prior written permission. For written permission, please contact
81 * openssl-core@openssl.org.
82 *
83 * 5. Products derived from this software may not be called "OpenSSL"
84 * nor may "OpenSSL" appear in their names without prior written
85 * permission of the OpenSSL Project.
86 *
87 * 6. Redistributions of any form whatsoever must retain the following
88 * acknowledgment:
89 * "This product includes software developed by the OpenSSL Project
90 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
91 *
92 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
93 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
95 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
96 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
97 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
98 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
99 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
101 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
103 * OF THE POSSIBILITY OF SUCH DAMAGE.
104 * ====================================================================
105 *
106 * This product includes cryptographic software written by Eric Young
107 * (eay@cryptsoft.com). This product includes software written by Tim
108 * Hudson (tjh@cryptsoft.com).
109 *
110 */
111/* ====================================================================
112 * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
113 *
114 * Portions of the attached software ("Contribution") are developed by
115 * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project.
116 *
117 * The Contribution is licensed pursuant to the Eric Young open source
118 * license provided above.
119 *
120 * The binary polynomial arithmetic software is originally written by
121 * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems Laboratories.
122 *
123 */
124
125#ifndef HEADER_BN_H
126#define HEADER_BN_H
127
128#include <openssl/e_os2.h>
129#ifndef OPENSSL_NO_FP_API
130#include <stdio.h> /* FILE */
131#endif
132#include <openssl/ossl_typ.h>
133#include <openssl/crypto.h>
134
135#ifdef __cplusplus
136extern "C" {
137#endif
138
139/* These preprocessor symbols control various aspects of the bignum headers and
140 * library code. They're not defined by any "normal" configuration, as they are
141 * intended for development and testing purposes. NB: defining all three can be
142 * useful for debugging application code as well as openssl itself.
143 *
144 * BN_DEBUG - turn on various debugging alterations to the bignum code
145 * BN_DEBUG_RAND - uses random poisoning of unused words to trip up
146 * mismanagement of bignum internals. You must also define BN_DEBUG.
147 */
148/* #define BN_DEBUG */
149/* #define BN_DEBUG_RAND */
150
151#ifndef OPENSSL_SMALL_FOOTPRINT
152#define BN_MUL_COMBA
153#define BN_SQR_COMBA
154#define BN_RECURSION
155#endif
156
157/* This next option uses the C libraries (2 word)/(1 word) function.
158 * If it is not defined, I use my C version (which is slower).
159 * The reason for this flag is that when the particular C compiler
160 * library routine is used, and the library is linked with a different
161 * compiler, the library is missing. This mostly happens when the
162 * library is built with gcc and then linked using normal cc. This would
163 * be a common occurrence because gcc normally produces code that is
164 * 2 times faster than system compilers for the big number stuff.
165 * For machines with only one compiler (or shared libraries), this should
166 * be on. Again this is only really a problem on machines
167 * using "long long's", are 32bit, and are not using my assembler code. */
168#if defined(OPENSSL_SYS_MSDOS) || defined(OPENSSL_SYS_WINDOWS) || \
169 defined(OPENSSL_SYS_WIN32) || defined(linux)
170# ifndef BN_DIV2W
171# define BN_DIV2W
172# endif
173#endif
174
175/* assuming long is 64bit - this is the DEC Alpha
176 * unsigned long long is only 64 bits :-(, don't define
177 * BN_LLONG for the DEC Alpha */
178#ifdef SIXTY_FOUR_BIT_LONG
179#define BN_ULLONG unsigned long long
180#define BN_ULONG unsigned long
181#define BN_LONG long
182#define BN_BITS 128
183#define BN_BYTES 8
184#define BN_BITS2 64
185#define BN_BITS4 32
186#define BN_MASK (0xffffffffffffffffffffffffffffffffLL)
187#define BN_MASK2 (0xffffffffffffffffL)
188#define BN_MASK2l (0xffffffffL)
189#define BN_MASK2h (0xffffffff00000000L)
190#define BN_MASK2h1 (0xffffffff80000000L)
191#define BN_TBIT (0x8000000000000000L)
192#define BN_DEC_CONV (10000000000000000000UL)
193#define BN_DEC_FMT1 "%lu"
194#define BN_DEC_FMT2 "%019lu"
195#define BN_DEC_NUM 19
196#define BN_HEX_FMT1 "%lX"
197#define BN_HEX_FMT2 "%016lX"
198#endif
199
200/* This is where the long long data type is 64 bits, but long is 32.
201 * For machines where there are 64bit registers, this is the mode to use.
202 * IRIX, on R4000 and above should use this mode, along with the relevant
203 * assembler code :-). Do NOT define BN_LLONG.
204 */
205#ifdef SIXTY_FOUR_BIT
206#undef BN_LLONG
207#undef BN_ULLONG
208#define BN_ULONG unsigned long long
209#define BN_LONG long long
210#define BN_BITS 128
211#define BN_BYTES 8
212#define BN_BITS2 64
213#define BN_BITS4 32
214#define BN_MASK2 (0xffffffffffffffffLL)
215#define BN_MASK2l (0xffffffffL)
216#define BN_MASK2h (0xffffffff00000000LL)
217#define BN_MASK2h1 (0xffffffff80000000LL)
218#define BN_TBIT (0x8000000000000000LL)
219#define BN_DEC_CONV (10000000000000000000ULL)
220#define BN_DEC_FMT1 "%llu"
221#define BN_DEC_FMT2 "%019llu"
222#define BN_DEC_NUM 19
223#define BN_HEX_FMT1 "%llX"
224#define BN_HEX_FMT2 "%016llX"
225#endif
226
227#ifdef THIRTY_TWO_BIT
228#ifdef BN_LLONG
229# if defined(_WIN32) && !defined(__GNUC__)
230# define BN_ULLONG unsigned __int64
231# define BN_MASK (0xffffffffffffffffI64)
232# else
233# define BN_ULLONG unsigned long long
234# define BN_MASK (0xffffffffffffffffLL)
235# endif
236#endif
237#define BN_ULONG unsigned int
238#define BN_LONG int
239#define BN_BITS 64
240#define BN_BYTES 4
241#define BN_BITS2 32
242#define BN_BITS4 16
243#define BN_MASK2 (0xffffffffL)
244#define BN_MASK2l (0xffff)
245#define BN_MASK2h1 (0xffff8000L)
246#define BN_MASK2h (0xffff0000L)
247#define BN_TBIT (0x80000000L)
248#define BN_DEC_CONV (1000000000L)
249#define BN_DEC_FMT1 "%u"
250#define BN_DEC_FMT2 "%09u"
251#define BN_DEC_NUM 9
252#define BN_HEX_FMT1 "%X"
253#define BN_HEX_FMT2 "%08X"
254#endif
255
256/* 2011-02-22 SMS.
257 * In various places, a size_t variable or a type cast to size_t was
258 * used to perform integer-only operations on pointers. This failed on
259 * VMS with 64-bit pointers (CC /POINTER_SIZE = 64) because size_t is
260 * still only 32 bits. What's needed in these cases is an integer type
261 * with the same size as a pointer, which size_t is not certain to be.
262 * The only fix here is VMS-specific.
263 */
264#if defined(OPENSSL_SYS_VMS)
265# if __INITIAL_POINTER_SIZE == 64
266# define PTR_SIZE_INT long long
267# else /* __INITIAL_POINTER_SIZE == 64 */
268# define PTR_SIZE_INT int
269# endif /* __INITIAL_POINTER_SIZE == 64 [else] */
270#else /* defined(OPENSSL_SYS_VMS) */
271# define PTR_SIZE_INT size_t
272#endif /* defined(OPENSSL_SYS_VMS) [else] */
273
274#define BN_DEFAULT_BITS 1280
275
276#define BN_FLG_MALLOCED 0x01
277#define BN_FLG_STATIC_DATA 0x02
278#define BN_FLG_CONSTTIME 0x04 /* avoid leaking exponent information through timing,
279 * BN_mod_exp_mont() will call BN_mod_exp_mont_consttime,
280 * BN_div() will call BN_div_no_branch,
281 * BN_mod_inverse() will call BN_mod_inverse_no_branch.
282 */
283
284#ifndef OPENSSL_NO_DEPRECATED
285#define BN_FLG_EXP_CONSTTIME BN_FLG_CONSTTIME /* deprecated name for the flag */
286 /* avoid leaking exponent information through timings
287 * (BN_mod_exp_mont() will call BN_mod_exp_mont_consttime) */
288#endif
289
290#ifndef OPENSSL_NO_DEPRECATED
291#define BN_FLG_FREE 0x8000 /* used for debuging */
292#endif
293#define BN_set_flags(b,n) ((b)->flags|=(n))
294#define BN_get_flags(b,n) ((b)->flags&(n))
295
296/* get a clone of a BIGNUM with changed flags, for *temporary* use only
297 * (the two BIGNUMs cannot be used in parallel!) */
/* Shallow-copy: dest shares b's digit array (d/top/dmax/neg are copied by
 * value, the words themselves are NOT duplicated).  dest keeps only its own
 * BN_FLG_MALLOCED bit, inherits every other flag from b, is force-marked
 * BN_FLG_STATIC_DATA (so the shared words are never freed through dest),
 * and additionally gets the caller-supplied flags 'n'. */
298#define BN_with_flags(dest,b,n) ((dest)->d=(b)->d, \
299 (dest)->top=(b)->top, \
300 (dest)->dmax=(b)->dmax, \
301 (dest)->neg=(b)->neg, \
302 (dest)->flags=(((dest)->flags & BN_FLG_MALLOCED) \
303 | ((b)->flags & ~BN_FLG_MALLOCED) \
304 | BN_FLG_STATIC_DATA \
305 | (n)))
306
307/* Already declared in ossl_typ.h */
308#if 0
309typedef struct bignum_st BIGNUM;
310/* Used for temp variables (declaration hidden in bn_lcl.h) */
311typedef struct bignum_ctx BN_CTX;
312typedef struct bn_blinding_st BN_BLINDING;
313typedef struct bn_mont_ctx_st BN_MONT_CTX;
314typedef struct bn_recp_ctx_st BN_RECP_CTX;
315typedef struct bn_gencb_st BN_GENCB;
316#endif
317
318struct bignum_st
319 {
320 BN_ULONG *d; /* Pointer to an array of 'BN_BITS2' bit chunks. */
321 int top; /* Index of last used d +1. */
322 /* The next are internal book keeping for bn_expand. */
323 int dmax; /* Size of the d array. */
324 int neg; /* one if the number is negative */
325 int flags;
326 };
327
328/* Used for montgomery multiplication */
329struct bn_mont_ctx_st
330 {
331 int ri; /* number of bits in R */
332 BIGNUM RR; /* used to convert to montgomery form */
333 BIGNUM N; /* The modulus */
334 BIGNUM Ni; /* R*(1/R mod N) - N*Ni = 1
335 * (Ni is only stored for bignum algorithm) */
336 BN_ULONG n0[2];/* least significant word(s) of Ni;
337 (type changed with 0.9.9, was "BN_ULONG n0;" before) */
338 int flags;
339 };
340
341/* Used for reciprocal division/mod functions
342 * It cannot be shared between threads
343 */
344struct bn_recp_ctx_st
345 {
346 BIGNUM N; /* the divisor */
347 BIGNUM Nr; /* the reciprocal */
348 int num_bits;
349 int shift;
350 int flags;
351 };
352
353/* Used for slow "generation" functions. */
354struct bn_gencb_st
355 {
356 unsigned int ver; /* To handle binary (in)compatibility */
357 void *arg; /* callback-specific data */
358 union
359 {
360 /* if(ver==1) - handles old style callbacks */
361 void (*cb_1)(int, int, void *);
362 /* if(ver==2) - new callback style */
363 int (*cb_2)(int, int, BN_GENCB *);
364 } cb;
365 };
366/* Wrapper function to make using BN_GENCB easier. */
367int BN_GENCB_call(BN_GENCB *cb, int a, int b);
368/* Macro to populate a BN_GENCB structure with an "old"-style callback */
369#define BN_GENCB_set_old(gencb, callback, cb_arg) { \
370 BN_GENCB *tmp_gencb = (gencb); \
371 tmp_gencb->ver = 1; \
372 tmp_gencb->arg = (cb_arg); \
373 tmp_gencb->cb.cb_1 = (callback); }
374/* Macro to populate a BN_GENCB structure with a "new"-style callback */
375#define BN_GENCB_set(gencb, callback, cb_arg) { \
376 BN_GENCB *tmp_gencb = (gencb); \
377 tmp_gencb->ver = 2; \
378 tmp_gencb->arg = (cb_arg); \
379 tmp_gencb->cb.cb_2 = (callback); }
380
381#define BN_prime_checks 0 /* default: select number of iterations
382 based on the size of the number */
383
384/* number of Miller-Rabin iterations for an error rate of less than 2^-80
385 * for random 'b'-bit input, b >= 100 (taken from table 4.4 in the Handbook
386 * of Applied Cryptography [Menezes, van Oorschot, Vanstone; CRC Press 1996];
387 * original paper: Damgaard, Landrock, Pomerance: Average case error estimates
388 * for the strong probable prime test. -- Math. Comp. 61 (1993) 177-194) */
389#define BN_prime_checks_for_size(b) ((b) >= 1300 ? 2 : \
390 (b) >= 850 ? 3 : \
391 (b) >= 650 ? 4 : \
392 (b) >= 550 ? 5 : \
393 (b) >= 450 ? 6 : \
394 (b) >= 400 ? 7 : \
395 (b) >= 350 ? 8 : \
396 (b) >= 300 ? 9 : \
397 (b) >= 250 ? 12 : \
398 (b) >= 200 ? 15 : \
399 (b) >= 150 ? 18 : \
400 /* b >= 100 */ 27)
401
402#define BN_num_bytes(a) ((BN_num_bits(a)+7)/8)
403
404/* Note that BN_abs_is_word didn't work reliably for w == 0 until 0.9.8 */
405#define BN_abs_is_word(a,w) ((((a)->top == 1) && ((a)->d[0] == (BN_ULONG)(w))) || \
406 (((w) == 0) && ((a)->top == 0)))
407#define BN_is_zero(a) ((a)->top == 0)
408#define BN_is_one(a) (BN_abs_is_word((a),1) && !(a)->neg)
409#define BN_is_word(a,w) (BN_abs_is_word((a),(w)) && (!(w) || !(a)->neg))
410#define BN_is_odd(a) (((a)->top > 0) && ((a)->d[0] & 1))
411
412#define BN_one(a) (BN_set_word((a),1))
/* Set 'a' to the value zero without touching its allocated word array:
 * a BIGNUM with top == 0 represents 0 (cf. BN_is_zero above), and the
 * sign is cleared as well.  Cannot fail, unlike the BN_set_word() form. */
413#define BN_zero_ex(a) \
414 do { \
415 BIGNUM *_tmp_bn = (a); \
416 _tmp_bn->top = 0; \
417 _tmp_bn->neg = 0; \
418 } while(0)
419#ifdef OPENSSL_NO_DEPRECATED
420#define BN_zero(a) BN_zero_ex(a)
421#else
422#define BN_zero(a) (BN_set_word((a),0))
423#endif
424
425const BIGNUM *BN_value_one(void);
426char * BN_options(void);
427BN_CTX *BN_CTX_new(void);
428#ifndef OPENSSL_NO_DEPRECATED
429void BN_CTX_init(BN_CTX *c);
430#endif
431void BN_CTX_free(BN_CTX *c);
432void BN_CTX_start(BN_CTX *ctx);
433BIGNUM *BN_CTX_get(BN_CTX *ctx);
434void BN_CTX_end(BN_CTX *ctx);
435int BN_rand(BIGNUM *rnd, int bits, int top,int bottom);
436int BN_pseudo_rand(BIGNUM *rnd, int bits, int top,int bottom);
437int BN_rand_range(BIGNUM *rnd, const BIGNUM *range);
438int BN_pseudo_rand_range(BIGNUM *rnd, const BIGNUM *range);
439int BN_num_bits(const BIGNUM *a);
440int BN_num_bits_word(BN_ULONG);
441BIGNUM *BN_new(void);
442void BN_init(BIGNUM *);
443void BN_clear_free(BIGNUM *a);
444BIGNUM *BN_copy(BIGNUM *a, const BIGNUM *b);
445void BN_swap(BIGNUM *a, BIGNUM *b);
446BIGNUM *BN_bin2bn(const unsigned char *s,int len,BIGNUM *ret);
447int BN_bn2bin(const BIGNUM *a, unsigned char *to);
448BIGNUM *BN_mpi2bn(const unsigned char *s,int len,BIGNUM *ret);
449int BN_bn2mpi(const BIGNUM *a, unsigned char *to);
450int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
451int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
452int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
453int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
454int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
455int BN_sqr(BIGNUM *r, const BIGNUM *a,BN_CTX *ctx);
456/** BN_set_negative sets sign of a BIGNUM
457 * \param b pointer to the BIGNUM object
458 * \param n 0 if the BIGNUM b should be positive and a value != 0 otherwise
459 */
460void BN_set_negative(BIGNUM *b, int n);
461/** BN_is_negative returns 1 if the BIGNUM is negative
462 * \param a pointer to the BIGNUM object
463 * \return 1 if a < 0 and 0 otherwise
464 */
465#define BN_is_negative(a) ((a)->neg != 0)
466
467int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
468 BN_CTX *ctx);
469#define BN_mod(rem,m,d,ctx) BN_div(NULL,(rem),(m),(d),(ctx))
470int BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx);
471int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m, BN_CTX *ctx);
472int BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m);
473int BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m, BN_CTX *ctx);
474int BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m);
475int BN_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
476 const BIGNUM *m, BN_CTX *ctx);
477int BN_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx);
478int BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx);
479int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m);
480int BN_mod_lshift(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m, BN_CTX *ctx);
481int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m);
482
483BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w);
484BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w);
485int BN_mul_word(BIGNUM *a, BN_ULONG w);
486int BN_add_word(BIGNUM *a, BN_ULONG w);
487int BN_sub_word(BIGNUM *a, BN_ULONG w);
488int BN_set_word(BIGNUM *a, BN_ULONG w);
489BN_ULONG BN_get_word(const BIGNUM *a);
490
491int BN_cmp(const BIGNUM *a, const BIGNUM *b);
492void BN_free(BIGNUM *a);
493int BN_is_bit_set(const BIGNUM *a, int n);
494int BN_lshift(BIGNUM *r, const BIGNUM *a, int n);
495int BN_lshift1(BIGNUM *r, const BIGNUM *a);
496int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,BN_CTX *ctx);
497
498int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
499 const BIGNUM *m,BN_CTX *ctx);
500int BN_mod_exp_mont(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
501 const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
502int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
503 const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont);
504int BN_mod_exp_mont_word(BIGNUM *r, BN_ULONG a, const BIGNUM *p,
505 const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
506int BN_mod_exp2_mont(BIGNUM *r, const BIGNUM *a1, const BIGNUM *p1,
507 const BIGNUM *a2, const BIGNUM *p2,const BIGNUM *m,
508 BN_CTX *ctx,BN_MONT_CTX *m_ctx);
509int BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
510 const BIGNUM *m,BN_CTX *ctx);
511
512int BN_mask_bits(BIGNUM *a,int n);
513#ifndef OPENSSL_NO_FP_API
514int BN_print_fp(FILE *fp, const BIGNUM *a);
515#endif
516#ifdef HEADER_BIO_H
517int BN_print(BIO *fp, const BIGNUM *a);
518#else
519int BN_print(void *fp, const BIGNUM *a);
520#endif
521int BN_reciprocal(BIGNUM *r, const BIGNUM *m, int len, BN_CTX *ctx);
522int BN_rshift(BIGNUM *r, const BIGNUM *a, int n);
523int BN_rshift1(BIGNUM *r, const BIGNUM *a);
524void BN_clear(BIGNUM *a);
525BIGNUM *BN_dup(const BIGNUM *a);
526int BN_ucmp(const BIGNUM *a, const BIGNUM *b);
527int BN_set_bit(BIGNUM *a, int n);
528int BN_clear_bit(BIGNUM *a, int n);
529char * BN_bn2hex(const BIGNUM *a);
530char * BN_bn2dec(const BIGNUM *a);
531int BN_hex2bn(BIGNUM **a, const char *str);
532int BN_dec2bn(BIGNUM **a, const char *str);
533int BN_asc2bn(BIGNUM **a, const char *str);
534int BN_gcd(BIGNUM *r,const BIGNUM *a,const BIGNUM *b,BN_CTX *ctx);
535int BN_kronecker(const BIGNUM *a,const BIGNUM *b,BN_CTX *ctx); /* returns -2 for error */
536BIGNUM *BN_mod_inverse(BIGNUM *ret,
537 const BIGNUM *a, const BIGNUM *n,BN_CTX *ctx);
538BIGNUM *BN_mod_sqrt(BIGNUM *ret,
539 const BIGNUM *a, const BIGNUM *n,BN_CTX *ctx);
540
541/* Deprecated versions */
542#ifndef OPENSSL_NO_DEPRECATED
543BIGNUM *BN_generate_prime(BIGNUM *ret,int bits,int safe,
544 const BIGNUM *add, const BIGNUM *rem,
545 void (*callback)(int,int,void *),void *cb_arg);
546int BN_is_prime(const BIGNUM *p,int nchecks,
547 void (*callback)(int,int,void *),
548 BN_CTX *ctx,void *cb_arg);
549int BN_is_prime_fasttest(const BIGNUM *p,int nchecks,
550 void (*callback)(int,int,void *),BN_CTX *ctx,void *cb_arg,
551 int do_trial_division);
552#endif /* !defined(OPENSSL_NO_DEPRECATED) */
553
554/* Newer versions */
555int BN_generate_prime_ex(BIGNUM *ret,int bits,int safe, const BIGNUM *add,
556 const BIGNUM *rem, BN_GENCB *cb);
557int BN_is_prime_ex(const BIGNUM *p,int nchecks, BN_CTX *ctx, BN_GENCB *cb);
558int BN_is_prime_fasttest_ex(const BIGNUM *p,int nchecks, BN_CTX *ctx,
559 int do_trial_division, BN_GENCB *cb);
560
561int BN_X931_generate_Xpq(BIGNUM *Xp, BIGNUM *Xq, int nbits, BN_CTX *ctx);
562
563int BN_X931_derive_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
564 const BIGNUM *Xp, const BIGNUM *Xp1, const BIGNUM *Xp2,
565 const BIGNUM *e, BN_CTX *ctx, BN_GENCB *cb);
566int BN_X931_generate_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
567 BIGNUM *Xp1, BIGNUM *Xp2,
568 const BIGNUM *Xp,
569 const BIGNUM *e, BN_CTX *ctx,
570 BN_GENCB *cb);
571
572BN_MONT_CTX *BN_MONT_CTX_new(void );
573void BN_MONT_CTX_init(BN_MONT_CTX *ctx);
574int BN_mod_mul_montgomery(BIGNUM *r,const BIGNUM *a,const BIGNUM *b,
575 BN_MONT_CTX *mont, BN_CTX *ctx);
576#define BN_to_montgomery(r,a,mont,ctx) BN_mod_mul_montgomery(\
577 (r),(a),&((mont)->RR),(mont),(ctx))
578int BN_from_montgomery(BIGNUM *r,const BIGNUM *a,
579 BN_MONT_CTX *mont, BN_CTX *ctx);
580void BN_MONT_CTX_free(BN_MONT_CTX *mont);
581int BN_MONT_CTX_set(BN_MONT_CTX *mont,const BIGNUM *mod,BN_CTX *ctx);
582BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to,BN_MONT_CTX *from);
583BN_MONT_CTX *BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, int lock,
584 const BIGNUM *mod, BN_CTX *ctx);
585
586/* BN_BLINDING flags */
587#define BN_BLINDING_NO_UPDATE 0x00000001
588#define BN_BLINDING_NO_RECREATE 0x00000002
589
590BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod);
591void BN_BLINDING_free(BN_BLINDING *b);
592int BN_BLINDING_update(BN_BLINDING *b,BN_CTX *ctx);
593int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx);
594int BN_BLINDING_invert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx);
595int BN_BLINDING_convert_ex(BIGNUM *n, BIGNUM *r, BN_BLINDING *b, BN_CTX *);
596int BN_BLINDING_invert_ex(BIGNUM *n, const BIGNUM *r, BN_BLINDING *b, BN_CTX *);
597#ifndef OPENSSL_NO_DEPRECATED
598unsigned long BN_BLINDING_get_thread_id(const BN_BLINDING *);
599void BN_BLINDING_set_thread_id(BN_BLINDING *, unsigned long);
600#endif
601CRYPTO_THREADID *BN_BLINDING_thread_id(BN_BLINDING *);
602unsigned long BN_BLINDING_get_flags(const BN_BLINDING *);
603void BN_BLINDING_set_flags(BN_BLINDING *, unsigned long);
604BN_BLINDING *BN_BLINDING_create_param(BN_BLINDING *b,
605 const BIGNUM *e, BIGNUM *m, BN_CTX *ctx,
606 int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
607 const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx),
608 BN_MONT_CTX *m_ctx);
609
610#ifndef OPENSSL_NO_DEPRECATED
611void BN_set_params(int mul,int high,int low,int mont);
612int BN_get_params(int which); /* 0, mul, 1 high, 2 low, 3 mont */
613#endif
614
615void BN_RECP_CTX_init(BN_RECP_CTX *recp);
616BN_RECP_CTX *BN_RECP_CTX_new(void);
617void BN_RECP_CTX_free(BN_RECP_CTX *recp);
618int BN_RECP_CTX_set(BN_RECP_CTX *recp,const BIGNUM *rdiv,BN_CTX *ctx);
619int BN_mod_mul_reciprocal(BIGNUM *r, const BIGNUM *x, const BIGNUM *y,
620 BN_RECP_CTX *recp,BN_CTX *ctx);
621int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
622 const BIGNUM *m, BN_CTX *ctx);
623int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
624 BN_RECP_CTX *recp, BN_CTX *ctx);
625
626#ifndef OPENSSL_NO_EC2M
627
628/* Functions for arithmetic over binary polynomials represented by BIGNUMs.
629 *
630 * The BIGNUM::neg property of BIGNUMs representing binary polynomials is
631 * ignored.
632 *
633 * Note that input arguments are not const so that their bit arrays can
634 * be expanded to the appropriate size if needed.
635 */
636
637int BN_GF2m_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b); /*r = a + b*/
638#define BN_GF2m_sub(r, a, b) BN_GF2m_add(r, a, b)
639int BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p); /*r=a mod p*/
640int BN_GF2m_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
641 const BIGNUM *p, BN_CTX *ctx); /* r = (a * b) mod p */
642int BN_GF2m_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
643 BN_CTX *ctx); /* r = (a * a) mod p */
644int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *b, const BIGNUM *p,
645 BN_CTX *ctx); /* r = (1 / b) mod p */
646int BN_GF2m_mod_div(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
647 const BIGNUM *p, BN_CTX *ctx); /* r = (a / b) mod p */
648int BN_GF2m_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
649 const BIGNUM *p, BN_CTX *ctx); /* r = (a ^ b) mod p */
650int BN_GF2m_mod_sqrt(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
651 BN_CTX *ctx); /* r = sqrt(a) mod p */
652int BN_GF2m_mod_solve_quad(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
653 BN_CTX *ctx); /* r^2 + r = a mod p */
654#define BN_GF2m_cmp(a, b) BN_ucmp((a), (b))
655/* Some functions allow for representation of the irreducible polynomials
656 * as an unsigned int[], say p. The irreducible f(t) is then of the form:
657 * t^p[0] + t^p[1] + ... + t^p[k]
658 * where m = p[0] > p[1] > ... > p[k] = 0.
659 */
660int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[]);
661 /* r = a mod p */
662int BN_GF2m_mod_mul_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
663 const int p[], BN_CTX *ctx); /* r = (a * b) mod p */
664int BN_GF2m_mod_sqr_arr(BIGNUM *r, const BIGNUM *a, const int p[],
665 BN_CTX *ctx); /* r = (a * a) mod p */
666int BN_GF2m_mod_inv_arr(BIGNUM *r, const BIGNUM *b, const int p[],
667 BN_CTX *ctx); /* r = (1 / b) mod p */
668int BN_GF2m_mod_div_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
669 const int p[], BN_CTX *ctx); /* r = (a / b) mod p */
670int BN_GF2m_mod_exp_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
671 const int p[], BN_CTX *ctx); /* r = (a ^ b) mod p */
672int BN_GF2m_mod_sqrt_arr(BIGNUM *r, const BIGNUM *a,
673 const int p[], BN_CTX *ctx); /* r = sqrt(a) mod p */
674int BN_GF2m_mod_solve_quad_arr(BIGNUM *r, const BIGNUM *a,
675 const int p[], BN_CTX *ctx); /* r^2 + r = a mod p */
676int BN_GF2m_poly2arr(const BIGNUM *a, int p[], int max);
677int BN_GF2m_arr2poly(const int p[], BIGNUM *a);
678
679#endif
680
681/* faster mod functions for the 'NIST primes'
682 * 0 <= a < p^2 */
683int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
684int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
685int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
686int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
687int BN_nist_mod_521(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
688
689const BIGNUM *BN_get0_nist_prime_192(void);
690const BIGNUM *BN_get0_nist_prime_224(void);
691const BIGNUM *BN_get0_nist_prime_256(void);
692const BIGNUM *BN_get0_nist_prime_384(void);
693const BIGNUM *BN_get0_nist_prime_521(void);
694
695/* library internal functions */
696
697#define bn_expand(a,bits) ((((((bits+BN_BITS2-1))/BN_BITS2)) <= (a)->dmax)?\
698 (a):bn_expand2((a),(bits+BN_BITS2-1)/BN_BITS2))
699#define bn_wexpand(a,words) (((words) <= (a)->dmax)?(a):bn_expand2((a),(words)))
700BIGNUM *bn_expand2(BIGNUM *a, int words);
701#ifndef OPENSSL_NO_DEPRECATED
702BIGNUM *bn_dup_expand(const BIGNUM *a, int words); /* unused */
703#endif
704
705/* Bignum consistency macros
706 * There is one "API" macro, bn_fix_top(), for stripping leading zeroes from
707 * bignum data after direct manipulations on the data. There is also an
708 * "internal" macro, bn_check_top(), for verifying that there are no leading
709 * zeroes. Unfortunately, some auditing is required due to the fact that
710 * bn_fix_top() has become an overabused duct-tape because bignum data is
711 * occasionally passed around in an inconsistent state. So the following
712 * changes have been made to sort this out;
713 * - bn_fix_top()s implementation has been moved to bn_correct_top()
714 * - if BN_DEBUG isn't defined, bn_fix_top() maps to bn_correct_top(), and
715 * bn_check_top() is as before.
716 * - if BN_DEBUG *is* defined;
717 * - bn_check_top() tries to pollute unused words even if the bignum 'top' is
718 * consistent. (ed: only if BN_DEBUG_RAND is defined)
719 * - bn_fix_top() maps to bn_check_top() rather than "fixing" anything.
720 * The idea is to have debug builds flag up inconsistent bignums when they
721 * occur. If that occurs in a bn_fix_top(), we examine the code in question; if
722 * the use of bn_fix_top() was appropriate (ie. it follows directly after code
723 * that manipulates the bignum) it is converted to bn_correct_top(), and if it
724 * was not appropriate, we convert it permanently to bn_check_top() and track
725 * down the cause of the bug. Eventually, no internal code should be using the
726 * bn_fix_top() macro. External applications and libraries should try this with
727 * their own code too, both in terms of building against the openssl headers
728 * with BN_DEBUG defined *and* linking with a version of OpenSSL built with it
729 * defined. This not only improves external code, it provides more test
730 * coverage for openssl's own code.
731 */
732
#ifdef BN_DEBUG

/* We only need assert() when debugging */
#include <assert.h>

#ifdef BN_DEBUG_RAND
/* To avoid "make update" cvs wars due to BN_DEBUG, use some tricks */
#ifndef RAND_pseudo_bytes
int RAND_pseudo_bytes(unsigned char *buf,int num);
#define BN_DEBUG_TRIX
#endif
/* Overwrite the unused words (top..dmax-1) of |a| with a pseudo-random
 * byte so that any code relying on stale data beyond 'top' misbehaves
 * visibly in debug builds. */
#define bn_pollute(a) \
	do { \
		const BIGNUM *_bnum1 = (a); \
		if(_bnum1->top < _bnum1->dmax) { \
			unsigned char _tmp_char; \
			/* We cast away const without the compiler knowing, any \
			 * *genuinely* constant variables that aren't mutable \
			 * wouldn't be constructed with top!=dmax. */ \
			BN_ULONG *_not_const; \
			memcpy(&_not_const, &_bnum1->d, sizeof(BN_ULONG*)); \
			RAND_pseudo_bytes(&_tmp_char, 1); \
			memset((unsigned char *)(_not_const + _bnum1->top), _tmp_char, \
				(_bnum1->dmax - _bnum1->top) * sizeof(BN_ULONG)); \
		} \
	} while(0)
#ifdef BN_DEBUG_TRIX
#undef RAND_pseudo_bytes
#endif
#else
#define bn_pollute(a)
#endif
/* Debug build: assert that |a| has no leading zero word (i.e. either
 * top == 0 or the most significant word is nonzero), then pollute the
 * unused words.  A NULL argument is tolerated. */
#define bn_check_top(a) \
	do { \
		const BIGNUM *_bnum2 = (a); \
		if (_bnum2 != NULL) { \
			assert((_bnum2->top == 0) || \
				(_bnum2->d[_bnum2->top - 1] != 0)); \
			bn_pollute(_bnum2); \
		} \
	} while(0)

/* In debug builds bn_fix_top() deliberately only *checks* instead of
 * fixing, so misuse shows up as an assertion (see block comment above). */
#define bn_fix_top(a)		bn_check_top(a)

#else /* !BN_DEBUG */

#define bn_pollute(a)
#define bn_check_top(a)
#define bn_fix_top(a)		bn_correct_top(a)

#endif
784
/* Strip leading zero words from |a| after direct manipulation of a->d,
 * recomputing a->top; in BN_DEBUG builds bn_pollute() then scribbles over
 * the now-unused words.  Wrapped in do { } while (0) so that the macro is
 * a single statement: previously `if (x) bn_correct_top(a); else ...`
 * would not compile (the bare block plus the caller's ';' left an empty
 * statement before the else). */
#define bn_correct_top(a) \
	do { \
	BN_ULONG *ftl; \
	int tmp_top = (a)->top; \
	if (tmp_top > 0) \
		{ \
		/* Scan down from the most significant word to the first \
		 * nonzero one; tmp_top ends up as the corrected count. */ \
		for (ftl= &((a)->d[tmp_top-1]); tmp_top > 0; tmp_top--) \
			if (*(ftl--)) break; \
		(a)->top = tmp_top; \
		} \
	bn_pollute(a); \
	} while (0)
797
798BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
799BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
800void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
801BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
802BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int num);
803BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int num);
804
805/* Primes from RFC 2409 */
806BIGNUM *get_rfc2409_prime_768(BIGNUM *bn);
807BIGNUM *get_rfc2409_prime_1024(BIGNUM *bn);
808
809/* Primes from RFC 3526 */
810BIGNUM *get_rfc3526_prime_1536(BIGNUM *bn);
811BIGNUM *get_rfc3526_prime_2048(BIGNUM *bn);
812BIGNUM *get_rfc3526_prime_3072(BIGNUM *bn);
813BIGNUM *get_rfc3526_prime_4096(BIGNUM *bn);
814BIGNUM *get_rfc3526_prime_6144(BIGNUM *bn);
815BIGNUM *get_rfc3526_prime_8192(BIGNUM *bn);
816
817int BN_bntest_rand(BIGNUM *rnd, int bits, int top,int bottom);
818
819/* BEGIN ERROR CODES */
820/* The following lines are auto generated by the script mkerr.pl. Any changes
821 * made after this point may be overwritten when the script is next run.
822 */
823void ERR_load_BN_strings(void);
824
825/* Error codes for the BN functions. */
826
827/* Function codes. */
828#define BN_F_BNRAND 127
829#define BN_F_BN_BLINDING_CONVERT_EX 100
830#define BN_F_BN_BLINDING_CREATE_PARAM 128
831#define BN_F_BN_BLINDING_INVERT_EX 101
832#define BN_F_BN_BLINDING_NEW 102
833#define BN_F_BN_BLINDING_UPDATE 103
834#define BN_F_BN_BN2DEC 104
835#define BN_F_BN_BN2HEX 105
836#define BN_F_BN_CTX_GET 116
837#define BN_F_BN_CTX_NEW 106
838#define BN_F_BN_CTX_START 129
839#define BN_F_BN_DIV 107
840#define BN_F_BN_DIV_NO_BRANCH 138
841#define BN_F_BN_DIV_RECP 130
842#define BN_F_BN_EXP 123
843#define BN_F_BN_EXPAND2 108
844#define BN_F_BN_EXPAND_INTERNAL 120
845#define BN_F_BN_GF2M_MOD 131
846#define BN_F_BN_GF2M_MOD_EXP 132
847#define BN_F_BN_GF2M_MOD_MUL 133
848#define BN_F_BN_GF2M_MOD_SOLVE_QUAD 134
849#define BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR 135
850#define BN_F_BN_GF2M_MOD_SQR 136
851#define BN_F_BN_GF2M_MOD_SQRT 137
852#define BN_F_BN_MOD_EXP2_MONT 118
853#define BN_F_BN_MOD_EXP_MONT 109
854#define BN_F_BN_MOD_EXP_MONT_CONSTTIME 124
855#define BN_F_BN_MOD_EXP_MONT_WORD 117
856#define BN_F_BN_MOD_EXP_RECP 125
857#define BN_F_BN_MOD_EXP_SIMPLE 126
858#define BN_F_BN_MOD_INVERSE 110
859#define BN_F_BN_MOD_INVERSE_NO_BRANCH 139
860#define BN_F_BN_MOD_LSHIFT_QUICK 119
861#define BN_F_BN_MOD_MUL_RECIPROCAL 111
862#define BN_F_BN_MOD_SQRT 121
863#define BN_F_BN_MPI2BN 112
864#define BN_F_BN_NEW 113
865#define BN_F_BN_RAND 114
866#define BN_F_BN_RAND_RANGE 122
867#define BN_F_BN_USUB 115
868
869/* Reason codes. */
870#define BN_R_ARG2_LT_ARG3 100
871#define BN_R_BAD_RECIPROCAL 101
872#define BN_R_BIGNUM_TOO_LONG 114
873#define BN_R_CALLED_WITH_EVEN_MODULUS 102
874#define BN_R_DIV_BY_ZERO 103
875#define BN_R_ENCODING_ERROR 104
876#define BN_R_EXPAND_ON_STATIC_BIGNUM_DATA 105
877#define BN_R_INPUT_NOT_REDUCED 110
878#define BN_R_INVALID_LENGTH 106
879#define BN_R_INVALID_RANGE 115
880#define BN_R_NOT_A_SQUARE 111
881#define BN_R_NOT_INITIALIZED 107
882#define BN_R_NO_INVERSE 108
883#define BN_R_NO_SOLUTION 116
884#define BN_R_P_IS_NOT_PRIME 112
885#define BN_R_TOO_MANY_ITERATIONS 113
886#define BN_R_TOO_MANY_TEMPORARY_VARIABLES 109
887
888#ifdef __cplusplus
889}
890#endif
891#endif
diff --git a/src/lib/libcrypto/bn/bn_add.c b/src/lib/libcrypto/bn/bn_add.c
deleted file mode 100644
index 9405163706..0000000000
--- a/src/lib/libcrypto/bn/bn_add.c
+++ /dev/null
@@ -1,313 +0,0 @@
1/* crypto/bn/bn_add.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60#include "cryptlib.h"
61#include "bn_lcl.h"
62
63/* r can == a or b */
64int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
65 {
66 const BIGNUM *tmp;
67 int a_neg = a->neg, ret;
68
69 bn_check_top(a);
70 bn_check_top(b);
71
72 /* a + b a+b
73 * a + -b a-b
74 * -a + b b-a
75 * -a + -b -(a+b)
76 */
77 if (a_neg ^ b->neg)
78 {
79 /* only one is negative */
80 if (a_neg)
81 { tmp=a; a=b; b=tmp; }
82
83 /* we are now a - b */
84
85 if (BN_ucmp(a,b) < 0)
86 {
87 if (!BN_usub(r,b,a)) return(0);
88 r->neg=1;
89 }
90 else
91 {
92 if (!BN_usub(r,a,b)) return(0);
93 r->neg=0;
94 }
95 return(1);
96 }
97
98 ret = BN_uadd(r,a,b);
99 r->neg = a_neg;
100 bn_check_top(r);
101 return ret;
102 }
103
104/* unsigned add of b to a */
105int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
106 {
107 int max,min,dif;
108 BN_ULONG *ap,*bp,*rp,carry,t1,t2;
109 const BIGNUM *tmp;
110
111 bn_check_top(a);
112 bn_check_top(b);
113
114 if (a->top < b->top)
115 { tmp=a; a=b; b=tmp; }
116 max = a->top;
117 min = b->top;
118 dif = max - min;
119
120 if (bn_wexpand(r,max+1) == NULL)
121 return 0;
122
123 r->top=max;
124
125
126 ap=a->d;
127 bp=b->d;
128 rp=r->d;
129
130 carry=bn_add_words(rp,ap,bp,min);
131 rp+=min;
132 ap+=min;
133 bp+=min;
134
135 if (carry)
136 {
137 while (dif)
138 {
139 dif--;
140 t1 = *(ap++);
141 t2 = (t1+1) & BN_MASK2;
142 *(rp++) = t2;
143 if (t2)
144 {
145 carry=0;
146 break;
147 }
148 }
149 if (carry)
150 {
151 /* carry != 0 => dif == 0 */
152 *rp = 1;
153 r->top++;
154 }
155 }
156 if (dif && rp != ap)
157 while (dif--)
158 /* copy remaining words if ap != rp */
159 *(rp++) = *(ap++);
160 r->neg = 0;
161 bn_check_top(r);
162 return 1;
163 }
164
/* unsigned subtraction of b from a, a must be larger than b. */
/* Computes r = |a| - |b| word-by-word with explicit borrow propagation.
 * Returns 1 on success, 0 on error (including |a| < |b|). */
int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
	{
	int max,min,dif;
	register BN_ULONG t1,t2,*ap,*bp,*rp;
	int i,carry;
#if defined(IRIX_CC_BUG) && !defined(LINT)
	/* Dead store used solely to work around an IRIX compiler bug. */
	int dummy;
#endif

	bn_check_top(a);
	bn_check_top(b);

	max = a->top;
	min = b->top;
	dif = max - min;

	if (dif < 0)	/* hmm... should not be happening */
		{
		BNerr(BN_F_BN_USUB,BN_R_ARG2_LT_ARG3);
		return(0);
		}

	if (bn_wexpand(r,max) == NULL) return(0);

	ap=a->d;
	bp=b->d;
	rp=r->d;

#if 1
	/* Subtract the overlapping low words; 'carry' is the borrow.
	 * When a borrow is pending, an extra 1 is subtracted and the new
	 * borrow condition is t1 <= t2 instead of t1 < t2. */
	carry=0;
	for (i = min; i != 0; i--)
		{
		t1= *(ap++);
		t2= *(bp++);
		if (carry)
			{
			carry=(t1 <= t2);
			t1=(t1-t2-1)&BN_MASK2;
			}
		else
			{
			carry=(t1 < t2);
			t1=(t1-t2)&BN_MASK2;
			}
#if defined(IRIX_CC_BUG) && !defined(LINT)
		dummy=t1;
#endif
		*(rp++)=t1&BN_MASK2;
		}
#else
	/* Alternative (disabled): use the assembler-backed primitive. */
	carry=bn_sub_words(rp,ap,bp,min);
	ap+=min;
	bp+=min;
	rp+=min;
#endif
	if (carry) /* subtracted */
		{
		/* Propagate the borrow through a's remaining high words. */
		if (!dif)
			/* error: a < b */
			return 0;
		while (dif)
			{
			dif--;
			t1 = *(ap++);
			t2 = (t1-1)&BN_MASK2;
			*(rp++) = t2;
			/* Borrow stops at the first word that was nonzero. */
			if (t1)
				break;
			}
		}
#if 0
	memcpy(rp,ap,sizeof(*rp)*(max-i));
#else
	/* Copy the remaining high words of a, 4-at-a-time, unless r
	 * aliases a (in which case they are already in place). */
	if (rp != ap)
		{
		for (;;)
			{
			if (!dif--) break;
			rp[0]=ap[0];
			if (!dif--) break;
			rp[1]=ap[1];
			if (!dif--) break;
			rp[2]=ap[2];
			if (!dif--) break;
			rp[3]=ap[3];
			rp+=4;
			ap+=4;
			}
		}
#endif

	r->top=max;
	r->neg=0;
	/* The result may have leading zero words; strip them. */
	bn_correct_top(r);
	return(1);
	}
262
263int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
264 {
265 int max;
266 int add=0,neg=0;
267 const BIGNUM *tmp;
268
269 bn_check_top(a);
270 bn_check_top(b);
271
272 /* a - b a-b
273 * a - -b a+b
274 * -a - b -(a+b)
275 * -a - -b b-a
276 */
277 if (a->neg)
278 {
279 if (b->neg)
280 { tmp=a; a=b; b=tmp; }
281 else
282 { add=1; neg=1; }
283 }
284 else
285 {
286 if (b->neg) { add=1; neg=0; }
287 }
288
289 if (add)
290 {
291 if (!BN_uadd(r,a,b)) return(0);
292 r->neg=neg;
293 return(1);
294 }
295
296 /* We are actually doing a - b :-) */
297
298 max=(a->top > b->top)?a->top:b->top;
299 if (bn_wexpand(r,max) == NULL) return(0);
300 if (BN_ucmp(a,b) < 0)
301 {
302 if (!BN_usub(r,b,a)) return(0);
303 r->neg=1;
304 }
305 else
306 {
307 if (!BN_usub(r,a,b)) return(0);
308 r->neg=0;
309 }
310 bn_check_top(r);
311 return(1);
312 }
313
diff --git a/src/lib/libcrypto/bn/bn_asm.c b/src/lib/libcrypto/bn/bn_asm.c
deleted file mode 100644
index c43c91cc09..0000000000
--- a/src/lib/libcrypto/bn/bn_asm.c
+++ /dev/null
@@ -1,1030 +0,0 @@
1/* crypto/bn/bn_asm.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#ifndef BN_DEBUG
60# undef NDEBUG /* avoid conflicting definitions */
61# define NDEBUG
62#endif
63
64#include <stdio.h>
65#include <assert.h>
66#include "cryptlib.h"
67#include "bn_lcl.h"
68
69#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
70
71BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
72 {
73 BN_ULONG c1=0;
74
75 assert(num >= 0);
76 if (num <= 0) return(c1);
77
78#ifndef OPENSSL_SMALL_FOOTPRINT
79 while (num&~3)
80 {
81 mul_add(rp[0],ap[0],w,c1);
82 mul_add(rp[1],ap[1],w,c1);
83 mul_add(rp[2],ap[2],w,c1);
84 mul_add(rp[3],ap[3],w,c1);
85 ap+=4; rp+=4; num-=4;
86 }
87#endif
88 while (num)
89 {
90 mul_add(rp[0],ap[0],w,c1);
91 ap++; rp++; num--;
92 }
93
94 return(c1);
95 }
96
97BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
98 {
99 BN_ULONG c1=0;
100
101 assert(num >= 0);
102 if (num <= 0) return(c1);
103
104#ifndef OPENSSL_SMALL_FOOTPRINT
105 while (num&~3)
106 {
107 mul(rp[0],ap[0],w,c1);
108 mul(rp[1],ap[1],w,c1);
109 mul(rp[2],ap[2],w,c1);
110 mul(rp[3],ap[3],w,c1);
111 ap+=4; rp+=4; num-=4;
112 }
113#endif
114 while (num)
115 {
116 mul(rp[0],ap[0],w,c1);
117 ap++; rp++; num--;
118 }
119 return(c1);
120 }
121
122void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
123 {
124 assert(n >= 0);
125 if (n <= 0) return;
126
127#ifndef OPENSSL_SMALL_FOOTPRINT
128 while (n&~3)
129 {
130 sqr(r[0],r[1],a[0]);
131 sqr(r[2],r[3],a[1]);
132 sqr(r[4],r[5],a[2]);
133 sqr(r[6],r[7],a[3]);
134 a+=4; r+=8; n-=4;
135 }
136#endif
137 while (n)
138 {
139 sqr(r[0],r[1],a[0]);
140 a++; r+=2; n--;
141 }
142 }
143
144#else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
145
146BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
147 {
148 BN_ULONG c=0;
149 BN_ULONG bl,bh;
150
151 assert(num >= 0);
152 if (num <= 0) return((BN_ULONG)0);
153
154 bl=LBITS(w);
155 bh=HBITS(w);
156
157#ifndef OPENSSL_SMALL_FOOTPRINT
158 while (num&~3)
159 {
160 mul_add(rp[0],ap[0],bl,bh,c);
161 mul_add(rp[1],ap[1],bl,bh,c);
162 mul_add(rp[2],ap[2],bl,bh,c);
163 mul_add(rp[3],ap[3],bl,bh,c);
164 ap+=4; rp+=4; num-=4;
165 }
166#endif
167 while (num)
168 {
169 mul_add(rp[0],ap[0],bl,bh,c);
170 ap++; rp++; num--;
171 }
172 return(c);
173 }
174
175BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
176 {
177 BN_ULONG carry=0;
178 BN_ULONG bl,bh;
179
180 assert(num >= 0);
181 if (num <= 0) return((BN_ULONG)0);
182
183 bl=LBITS(w);
184 bh=HBITS(w);
185
186#ifndef OPENSSL_SMALL_FOOTPRINT
187 while (num&~3)
188 {
189 mul(rp[0],ap[0],bl,bh,carry);
190 mul(rp[1],ap[1],bl,bh,carry);
191 mul(rp[2],ap[2],bl,bh,carry);
192 mul(rp[3],ap[3],bl,bh,carry);
193 ap+=4; rp+=4; num-=4;
194 }
195#endif
196 while (num)
197 {
198 mul(rp[0],ap[0],bl,bh,carry);
199 ap++; rp++; num--;
200 }
201 return(carry);
202 }
203
204void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
205 {
206 assert(n >= 0);
207 if (n <= 0) return;
208
209#ifndef OPENSSL_SMALL_FOOTPRINT
210 while (n&~3)
211 {
212 sqr64(r[0],r[1],a[0]);
213 sqr64(r[2],r[3],a[1]);
214 sqr64(r[4],r[5],a[2]);
215 sqr64(r[6],r[7],a[3]);
216 a+=4; r+=8; n-=4;
217 }
218#endif
219 while (n)
220 {
221 sqr64(r[0],r[1],a[0]);
222 a++; r+=2; n--;
223 }
224 }
225
226#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
227
228#if defined(BN_LLONG) && defined(BN_DIV2W)
229
230BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
231 {
232 return((BN_ULONG)(((((BN_ULLONG)h)<<BN_BITS2)|l)/(BN_ULLONG)d));
233 }
234
235#else
236
/* Divide h,l by d and return the result. */
/* I need to test this some more :-( */
/* Portable two-word / one-word division: the divisor is normalized so
 * its top bit is set, then the quotient is computed as two half-word
 * digits (count==2 passes), each estimated from the high half of the
 * divisor and corrected downward — the classic schoolbook/Knuth
 * Algorithm-D shape on half-word "digits". */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
	{
	BN_ULONG dh,dl,q,ret=0,th,tl,t;
	int i,count=2;

	/* Division by zero: saturate rather than trap. */
	if (d == 0) return(BN_MASK2);

	i=BN_num_bits_word(d);
	assert((i == BN_BITS2) || (h <= (BN_ULONG)1<<i));

	/* i = number of bits needed to normalize d (shift top bit to MSB). */
	i=BN_BITS2-i;
	if (h >= d) h-=d;

	if (i)
		{
		/* Normalize divisor and dividend together by the same shift. */
		d<<=i;
		h=(h<<i)|(l>>(BN_BITS2-i));
		l<<=i;
		}
	/* Split normalized divisor into half-word digits dh:dl. */
	dh=(d&BN_MASK2h)>>BN_BITS4;
	dl=(d&BN_MASK2l);
	for (;;)
		{
		/* Estimate the next quotient digit from the high halves;
		 * saturate when the estimate would overflow a half word. */
		if ((h>>BN_BITS4) == dh)
			q=BN_MASK2l;
		else
			q=h/dh;

		th=q*dh;
		tl=dl*q;
		for (;;)
			{
			/* Correction loop: decrement q while q*d exceeds the
			 * remaining dividend (at most a couple of iterations). */
			t=h-th;
			if ((t&BN_MASK2h) ||
				((tl) <= (
					(t<<BN_BITS4)|
					((l&BN_MASK2h)>>BN_BITS4))))
				break;
			q--;
			th-=dh;
			tl-=dl;
			}
		/* Subtract q*d (split as th:tl realigned) from h:l. */
		t=(tl>>BN_BITS4);
		tl=(tl<<BN_BITS4)&BN_MASK2h;
		th+=t;

		if (l < tl) th++;
		l-=tl;
		if (h < th)
			{
			h+=d;
			q--;
			}
		h-=th;

		if (--count == 0) break;

		/* First digit done: store it and shift in the next half word. */
		ret=q<<BN_BITS4;
		h=((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2;
		l=(l&BN_MASK2l)<<BN_BITS4;
		}
	ret|=q;
	return(ret);
	}
303#endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */
304
305#ifdef BN_LLONG
306BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
307 {
308 BN_ULLONG ll=0;
309
310 assert(n >= 0);
311 if (n <= 0) return((BN_ULONG)0);
312
313#ifndef OPENSSL_SMALL_FOOTPRINT
314 while (n&~3)
315 {
316 ll+=(BN_ULLONG)a[0]+b[0];
317 r[0]=(BN_ULONG)ll&BN_MASK2;
318 ll>>=BN_BITS2;
319 ll+=(BN_ULLONG)a[1]+b[1];
320 r[1]=(BN_ULONG)ll&BN_MASK2;
321 ll>>=BN_BITS2;
322 ll+=(BN_ULLONG)a[2]+b[2];
323 r[2]=(BN_ULONG)ll&BN_MASK2;
324 ll>>=BN_BITS2;
325 ll+=(BN_ULLONG)a[3]+b[3];
326 r[3]=(BN_ULONG)ll&BN_MASK2;
327 ll>>=BN_BITS2;
328 a+=4; b+=4; r+=4; n-=4;
329 }
330#endif
331 while (n)
332 {
333 ll+=(BN_ULLONG)a[0]+b[0];
334 r[0]=(BN_ULONG)ll&BN_MASK2;
335 ll>>=BN_BITS2;
336 a++; b++; r++; n--;
337 }
338 return((BN_ULONG)ll);
339 }
340#else /* !BN_LLONG */
/* r[i] = a[i] + b[i] for n words, without a double-width type.  The
 * carry is detected via wraparound comparisons: if x + y (mod 2^w) is
 * smaller than x, the addition overflowed.  Returns the final carry. */
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
	{
	BN_ULONG c,l,t;

	assert(n >= 0);
	if (n <= 0) return((BN_ULONG)0);

	c=0;
#ifndef OPENSSL_SMALL_FOOTPRINT
	/* Unrolled four words per iteration.  Each word: first add the
	 * incoming carry to a[i] (overflow => t < c), then add b[i]
	 * (overflow => l < t); the two tests together yield the carry out. */
	while (n&~3)
		{
		t=a[0];
		t=(t+c)&BN_MASK2;
		c=(t < c);
		l=(t+b[0])&BN_MASK2;
		c+=(l < t);
		r[0]=l;
		t=a[1];
		t=(t+c)&BN_MASK2;
		c=(t < c);
		l=(t+b[1])&BN_MASK2;
		c+=(l < t);
		r[1]=l;
		t=a[2];
		t=(t+c)&BN_MASK2;
		c=(t < c);
		l=(t+b[2])&BN_MASK2;
		c+=(l < t);
		r[2]=l;
		t=a[3];
		t=(t+c)&BN_MASK2;
		c=(t < c);
		l=(t+b[3])&BN_MASK2;
		c+=(l < t);
		r[3]=l;
		a+=4; b+=4; r+=4; n-=4;
		}
#endif
	/* Remaining 0-3 words, one at a time. */
	while(n)
		{
		t=a[0];
		t=(t+c)&BN_MASK2;
		c=(t < c);
		l=(t+b[0])&BN_MASK2;
		c+=(l < t);
		r[0]=l;
		a++; b++; r++; n--;
		}
	return((BN_ULONG)c);
	}
391#endif /* !BN_LLONG */
392
393BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
394 {
395 BN_ULONG t1,t2;
396 int c=0;
397
398 assert(n >= 0);
399 if (n <= 0) return((BN_ULONG)0);
400
401#ifndef OPENSSL_SMALL_FOOTPRINT
402 while (n&~3)
403 {
404 t1=a[0]; t2=b[0];
405 r[0]=(t1-t2-c)&BN_MASK2;
406 if (t1 != t2) c=(t1 < t2);
407 t1=a[1]; t2=b[1];
408 r[1]=(t1-t2-c)&BN_MASK2;
409 if (t1 != t2) c=(t1 < t2);
410 t1=a[2]; t2=b[2];
411 r[2]=(t1-t2-c)&BN_MASK2;
412 if (t1 != t2) c=(t1 < t2);
413 t1=a[3]; t2=b[3];
414 r[3]=(t1-t2-c)&BN_MASK2;
415 if (t1 != t2) c=(t1 < t2);
416 a+=4; b+=4; r+=4; n-=4;
417 }
418#endif
419 while (n)
420 {
421 t1=a[0]; t2=b[0];
422 r[0]=(t1-t2-c)&BN_MASK2;
423 if (t1 != t2) c=(t1 < t2);
424 a++; b++; r++; n--;
425 }
426 return(c);
427 }
428
429#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
430
#undef bn_mul_comba8
#undef bn_mul_comba4
#undef bn_sqr_comba8
#undef bn_sqr_comba4

/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */

/* Four implementations of the same accumulator macros, chosen by what
 * the platform provides for word multiplication (see bn_lcl.h). */

#ifdef BN_LLONG
/* Variant 1: a native double-width type (BN_ULLONG) exists. */
#define mul_add_c(a,b,c0,c1,c2) \
	t=(BN_ULLONG)a*b; \
	t1=(BN_ULONG)Lw(t); \
	t2=(BN_ULONG)Hw(t); \
	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#define mul_add_c2(a,b,c0,c1,c2) \
	t=(BN_ULLONG)a*b; \
	tt=(t+t)&BN_MASK; \
	if (tt < t) c2++; \
	t1=(BN_ULONG)Lw(tt); \
	t2=(BN_ULONG)Hw(tt); \
	c0=(c0+t1)&BN_MASK2; \
	if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#define sqr_add_c(a,i,c0,c1,c2) \
	t=(BN_ULLONG)a[i]*a[i]; \
	t1=(BN_ULONG)Lw(t); \
	t2=(BN_ULONG)Hw(t); \
	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#elif defined(BN_UMULT_LOHI)
/* Variant 2: a lo/hi word-multiply intrinsic is available. */

#define mul_add_c(a,b,c0,c1,c2)	{	\
	BN_ULONG ta=(a),tb=(b);		\
	BN_UMULT_LOHI(t1,t2,ta,tb);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define mul_add_c2(a,b,c0,c1,c2)	{	\
	BN_ULONG ta=(a),tb=(b),t0;	\
	BN_UMULT_LOHI(t0,t1,ta,tb);	\
	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define sqr_add_c(a,i,c0,c1,c2)	{	\
	BN_ULONG ta=(a)[i];		\
	BN_UMULT_LOHI(t1,t2,ta,ta);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define sqr_add_c2(a,i,j,c0,c1,c2)	\
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#elif defined(BN_UMULT_HIGH)
/* Variant 3: only a high-word multiply intrinsic; the low word comes
 * from an ordinary truncating multiply. */

#define mul_add_c(a,b,c0,c1,c2)	{	\
	BN_ULONG ta=(a),tb=(b);		\
	t1 = ta * tb;			\
	t2 = BN_UMULT_HIGH(ta,tb);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define mul_add_c2(a,b,c0,c1,c2)	{	\
	BN_ULONG ta=(a),tb=(b),t0;	\
	t1 = BN_UMULT_HIGH(ta,tb);	\
	t0 = ta * tb;			\
	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define sqr_add_c(a,i,c0,c1,c2)	{	\
	BN_ULONG ta=(a)[i];		\
	t1 = ta * ta;			\
	t2 = BN_UMULT_HIGH(ta,ta);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define sqr_add_c2(a,i,j,c0,c1,c2)	\
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#else /* !BN_LLONG */
/* Variant 4: pure half-word fallback via LBITS/HBITS/mul64/sqr64. */
#define mul_add_c(a,b,c0,c1,c2) \
	t1=LBITS(a); t2=HBITS(a); \
	bl=LBITS(b); bh=HBITS(b); \
	mul64(t1,t2,bl,bh); \
	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#define mul_add_c2(a,b,c0,c1,c2) \
	t1=LBITS(a); t2=HBITS(a); \
	bl=LBITS(b); bh=HBITS(b); \
	mul64(t1,t2,bl,bh); \
	if (t2 & BN_TBIT) c2++; \
	t2=(t2+t2)&BN_MASK2; \
	if (t1 & BN_TBIT) t2++; \
	t1=(t1+t1)&BN_MASK2; \
	c0=(c0+t1)&BN_MASK2; \
	if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#define sqr_add_c(a,i,c0,c1,c2) \
	sqr64(t1,t2,(a)[i]); \
	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
#endif /* !BN_LLONG */
556
/* Comba (column-wise) 8x8-word multiplication: r[0..15] = a[0..7] * b[0..7].
 * Column k of the result is the sum of all a[i]*b[j] with i+j == k,
 * accumulated into a rotating three-word accumulator (c1,c2,c3); after
 * each column the lowest accumulator word is stored into r[k] and reused
 * as the new top word for the next column. */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
	{
#ifdef BN_LLONG
	BN_ULLONG t;
#else
	BN_ULONG bl,bh;
#endif
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	/* column 0 */
	mul_add_c(a[0],b[0],c1,c2,c3);
	r[0]=c1;
	c1=0;
	/* column 1 */
	mul_add_c(a[0],b[1],c2,c3,c1);
	mul_add_c(a[1],b[0],c2,c3,c1);
	r[1]=c2;
	c2=0;
	/* column 2 */
	mul_add_c(a[2],b[0],c3,c1,c2);
	mul_add_c(a[1],b[1],c3,c1,c2);
	mul_add_c(a[0],b[2],c3,c1,c2);
	r[2]=c3;
	c3=0;
	/* column 3 */
	mul_add_c(a[0],b[3],c1,c2,c3);
	mul_add_c(a[1],b[2],c1,c2,c3);
	mul_add_c(a[2],b[1],c1,c2,c3);
	mul_add_c(a[3],b[0],c1,c2,c3);
	r[3]=c1;
	c1=0;
	/* column 4 */
	mul_add_c(a[4],b[0],c2,c3,c1);
	mul_add_c(a[3],b[1],c2,c3,c1);
	mul_add_c(a[2],b[2],c2,c3,c1);
	mul_add_c(a[1],b[3],c2,c3,c1);
	mul_add_c(a[0],b[4],c2,c3,c1);
	r[4]=c2;
	c2=0;
	/* column 5 */
	mul_add_c(a[0],b[5],c3,c1,c2);
	mul_add_c(a[1],b[4],c3,c1,c2);
	mul_add_c(a[2],b[3],c3,c1,c2);
	mul_add_c(a[3],b[2],c3,c1,c2);
	mul_add_c(a[4],b[1],c3,c1,c2);
	mul_add_c(a[5],b[0],c3,c1,c2);
	r[5]=c3;
	c3=0;
	/* column 6 */
	mul_add_c(a[6],b[0],c1,c2,c3);
	mul_add_c(a[5],b[1],c1,c2,c3);
	mul_add_c(a[4],b[2],c1,c2,c3);
	mul_add_c(a[3],b[3],c1,c2,c3);
	mul_add_c(a[2],b[4],c1,c2,c3);
	mul_add_c(a[1],b[5],c1,c2,c3);
	mul_add_c(a[0],b[6],c1,c2,c3);
	r[6]=c1;
	c1=0;
	/* column 7 */
	mul_add_c(a[0],b[7],c2,c3,c1);
	mul_add_c(a[1],b[6],c2,c3,c1);
	mul_add_c(a[2],b[5],c2,c3,c1);
	mul_add_c(a[3],b[4],c2,c3,c1);
	mul_add_c(a[4],b[3],c2,c3,c1);
	mul_add_c(a[5],b[2],c2,c3,c1);
	mul_add_c(a[6],b[1],c2,c3,c1);
	mul_add_c(a[7],b[0],c2,c3,c1);
	r[7]=c2;
	c2=0;
	/* column 8 */
	mul_add_c(a[7],b[1],c3,c1,c2);
	mul_add_c(a[6],b[2],c3,c1,c2);
	mul_add_c(a[5],b[3],c3,c1,c2);
	mul_add_c(a[4],b[4],c3,c1,c2);
	mul_add_c(a[3],b[5],c3,c1,c2);
	mul_add_c(a[2],b[6],c3,c1,c2);
	mul_add_c(a[1],b[7],c3,c1,c2);
	r[8]=c3;
	c3=0;
	/* column 9 */
	mul_add_c(a[2],b[7],c1,c2,c3);
	mul_add_c(a[3],b[6],c1,c2,c3);
	mul_add_c(a[4],b[5],c1,c2,c3);
	mul_add_c(a[5],b[4],c1,c2,c3);
	mul_add_c(a[6],b[3],c1,c2,c3);
	mul_add_c(a[7],b[2],c1,c2,c3);
	r[9]=c1;
	c1=0;
	/* column 10 */
	mul_add_c(a[7],b[3],c2,c3,c1);
	mul_add_c(a[6],b[4],c2,c3,c1);
	mul_add_c(a[5],b[5],c2,c3,c1);
	mul_add_c(a[4],b[6],c2,c3,c1);
	mul_add_c(a[3],b[7],c2,c3,c1);
	r[10]=c2;
	c2=0;
	/* column 11 */
	mul_add_c(a[4],b[7],c3,c1,c2);
	mul_add_c(a[5],b[6],c3,c1,c2);
	mul_add_c(a[6],b[5],c3,c1,c2);
	mul_add_c(a[7],b[4],c3,c1,c2);
	r[11]=c3;
	c3=0;
	/* column 12 */
	mul_add_c(a[7],b[5],c1,c2,c3);
	mul_add_c(a[6],b[6],c1,c2,c3);
	mul_add_c(a[5],b[7],c1,c2,c3);
	r[12]=c1;
	c1=0;
	/* column 13 */
	mul_add_c(a[6],b[7],c2,c3,c1);
	mul_add_c(a[7],b[6],c2,c3,c1);
	r[13]=c2;
	c2=0;
	/* column 14; the leftover top word becomes r[15]. */
	mul_add_c(a[7],b[7],c3,c1,c2);
	r[14]=c3;
	r[15]=c1;
	}
665
/* Comba (column-wise) 4x4-word multiplication: r[0..7] = a[0..3] * b[0..3].
 * Same rotating three-word accumulator scheme as bn_mul_comba8. */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
	{
#ifdef BN_LLONG
	BN_ULLONG t;
#else
	BN_ULONG bl,bh;
#endif
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	/* column 0 */
	mul_add_c(a[0],b[0],c1,c2,c3);
	r[0]=c1;
	c1=0;
	/* column 1 */
	mul_add_c(a[0],b[1],c2,c3,c1);
	mul_add_c(a[1],b[0],c2,c3,c1);
	r[1]=c2;
	c2=0;
	/* column 2 */
	mul_add_c(a[2],b[0],c3,c1,c2);
	mul_add_c(a[1],b[1],c3,c1,c2);
	mul_add_c(a[0],b[2],c3,c1,c2);
	r[2]=c3;
	c3=0;
	/* column 3 */
	mul_add_c(a[0],b[3],c1,c2,c3);
	mul_add_c(a[1],b[2],c1,c2,c3);
	mul_add_c(a[2],b[1],c1,c2,c3);
	mul_add_c(a[3],b[0],c1,c2,c3);
	r[3]=c1;
	c1=0;
	/* column 4 */
	mul_add_c(a[3],b[1],c2,c3,c1);
	mul_add_c(a[2],b[2],c2,c3,c1);
	mul_add_c(a[1],b[3],c2,c3,c1);
	r[4]=c2;
	c2=0;
	/* column 5 */
	mul_add_c(a[2],b[3],c3,c1,c2);
	mul_add_c(a[3],b[2],c3,c1,c2);
	r[5]=c3;
	c3=0;
	/* column 6; the leftover top word becomes r[7]. */
	mul_add_c(a[3],b[3],c1,c2,c3);
	r[6]=c1;
	r[7]=c2;
	}
710
/* Comba squaring of 8 words: r[0..15] = a[0..7]^2.  Off-diagonal
 * products a[i]*a[j] (i != j) appear twice in the square, so they are
 * added doubled via sqr_add_c2(); diagonal squares use sqr_add_c(). */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
	{
#ifdef BN_LLONG
	BN_ULLONG t,tt;
#else
	BN_ULONG bl,bh;
#endif
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	/* column 0 */
	sqr_add_c(a,0,c1,c2,c3);
	r[0]=c1;
	c1=0;
	/* column 1 */
	sqr_add_c2(a,1,0,c2,c3,c1);
	r[1]=c2;
	c2=0;
	/* column 2 */
	sqr_add_c(a,1,c3,c1,c2);
	sqr_add_c2(a,2,0,c3,c1,c2);
	r[2]=c3;
	c3=0;
	/* column 3 */
	sqr_add_c2(a,3,0,c1,c2,c3);
	sqr_add_c2(a,2,1,c1,c2,c3);
	r[3]=c1;
	c1=0;
	/* column 4 */
	sqr_add_c(a,2,c2,c3,c1);
	sqr_add_c2(a,3,1,c2,c3,c1);
	sqr_add_c2(a,4,0,c2,c3,c1);
	r[4]=c2;
	c2=0;
	/* column 5 */
	sqr_add_c2(a,5,0,c3,c1,c2);
	sqr_add_c2(a,4,1,c3,c1,c2);
	sqr_add_c2(a,3,2,c3,c1,c2);
	r[5]=c3;
	c3=0;
	/* column 6 */
	sqr_add_c(a,3,c1,c2,c3);
	sqr_add_c2(a,4,2,c1,c2,c3);
	sqr_add_c2(a,5,1,c1,c2,c3);
	sqr_add_c2(a,6,0,c1,c2,c3);
	r[6]=c1;
	c1=0;
	/* column 7 */
	sqr_add_c2(a,7,0,c2,c3,c1);
	sqr_add_c2(a,6,1,c2,c3,c1);
	sqr_add_c2(a,5,2,c2,c3,c1);
	sqr_add_c2(a,4,3,c2,c3,c1);
	r[7]=c2;
	c2=0;
	/* column 8 */
	sqr_add_c(a,4,c3,c1,c2);
	sqr_add_c2(a,5,3,c3,c1,c2);
	sqr_add_c2(a,6,2,c3,c1,c2);
	sqr_add_c2(a,7,1,c3,c1,c2);
	r[8]=c3;
	c3=0;
	/* column 9 */
	sqr_add_c2(a,7,2,c1,c2,c3);
	sqr_add_c2(a,6,3,c1,c2,c3);
	sqr_add_c2(a,5,4,c1,c2,c3);
	r[9]=c1;
	c1=0;
	/* column 10 */
	sqr_add_c(a,5,c2,c3,c1);
	sqr_add_c2(a,6,4,c2,c3,c1);
	sqr_add_c2(a,7,3,c2,c3,c1);
	r[10]=c2;
	c2=0;
	/* column 11 */
	sqr_add_c2(a,7,4,c3,c1,c2);
	sqr_add_c2(a,6,5,c3,c1,c2);
	r[11]=c3;
	c3=0;
	/* column 12 */
	sqr_add_c(a,6,c1,c2,c3);
	sqr_add_c2(a,7,5,c1,c2,c3);
	r[12]=c1;
	c1=0;
	/* column 13 */
	sqr_add_c2(a,7,6,c2,c3,c1);
	r[13]=c2;
	c2=0;
	/* column 14; the leftover top word becomes r[15]. */
	sqr_add_c(a,7,c3,c1,c2);
	r[14]=c3;
	r[15]=c1;
	}
791
/*
 * bn_sqr_comba4 - Comba squaring of a 4-word bignum:
 * r[0..7] = a[0..3]^2.
 *
 * Same column/carry-rotation scheme as bn_sqr_comba8 above: sqr_add_c
 * adds the square term for index i, sqr_add_c2 the (doubled) cross term
 * for a pair i>j.  Statement order is significant.
 */
792void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
793	{
794#ifdef BN_LLONG
795	BN_ULLONG t,tt;
796#else
797	BN_ULONG bl,bh;
798#endif
799	BN_ULONG t1,t2;
800	BN_ULONG c1,c2,c3;
801
802	c1=0;
803	c2=0;
804	c3=0;
805	sqr_add_c(a,0,c1,c2,c3);
806	r[0]=c1;
807	c1=0;
808	sqr_add_c2(a,1,0,c2,c3,c1);
809	r[1]=c2;
810	c2=0;
811	sqr_add_c(a,1,c3,c1,c2);
812	sqr_add_c2(a,2,0,c3,c1,c2);
813	r[2]=c3;
814	c3=0;
815	sqr_add_c2(a,3,0,c1,c2,c3);
816	sqr_add_c2(a,2,1,c1,c2,c3);
817	r[3]=c1;
818	c1=0;
819	sqr_add_c(a,2,c2,c3,c1);
820	sqr_add_c2(a,3,1,c2,c3,c1);
821	r[4]=c2;
822	c2=0;
823	sqr_add_c2(a,3,2,c3,c1,c2);
824	r[5]=c3;
825	c3=0;
826	sqr_add_c(a,3,c1,c2,c3);
827	r[6]=c1;
828	r[7]=c2;
829	}
830
831#ifdef OPENSSL_NO_ASM
832#ifdef OPENSSL_BN_ASM_MONT
833#include <alloca.h>
834/*
835 * This is essentially reference implementation, which may or may not
836 * result in performance improvement. E.g. on IA-32 this routine was
837 * observed to give 40% faster rsa1024 private key operations and 10%
838 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
839 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
840 * reference implementation, one to be used as starting point for
841 * platform-specific assembler. Mentioned numbers apply to compiler
842 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
843 * can vary not only from platform to platform, but even for compiler
844 * versions. Assembler vs. assembler improvement coefficients can
845 * [and are known to] differ and are to be documented elsewhere.
846 */
/*
 * bn_mul_mont - reference Montgomery multiplication (see the comment
 * block above): rp = ap * bp * R^-1 mod np, num words each, where
 * *n0p is the precomputed -np^-1 mod 2^BN_BITS2 Montgomery constant.
 * Always returns 1 (result computed).
 *
 * tp is a (num+2)-word scratch area on the stack (alloca); vp is a
 * volatile alias of it, presumably so the final zeroing of the scratch
 * (which may hold secret intermediate values) cannot be optimized away
 * — NOTE(review): confirm that was the intent.
 */
847int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num)
848	{
849	BN_ULONG c0,c1,ml,*tp,n0;
850#ifdef mul64
851	BN_ULONG mh;
852#endif
853	volatile BN_ULONG *vp;
854	int i=0,j;
855
856#if 0 /* template for platform-specific implementation */
857	if (ap==bp)	return bn_sqr_mont(rp,ap,np,n0p,num);
858#endif
859	vp = tp = alloca((num+2)*sizeof(BN_ULONG));
860
861	n0 = *n0p;
862
	/* Peeled first outer round (i==0): plain multiply tp = ap*bp[0],
	 * then jump into the loop at the reduction step. */
863	c0 = 0;
864	ml = bp[0];
865#ifdef mul64
866	mh = HBITS(ml);
867	ml = LBITS(ml);
868	for (j=0;j<num;++j)
869		mul(tp[j],ap[j],ml,mh,c0);
870#else
871	for (j=0;j<num;++j)
872		mul(tp[j],ap[j],ml,c0);
873#endif
874
875	tp[num]   = c0;
876	tp[num+1] = 0;
877	goto enter;
878
879	for(i=0;i<num;i++)
880		{
		/* Multiply-accumulate round i: tp += ap*bp[i]. */
881		c0 = 0;
882		ml = bp[i];
883#ifdef mul64
884		mh = HBITS(ml);
885		ml = LBITS(ml);
886		for (j=0;j<num;++j)
887			mul_add(tp[j],ap[j],ml,mh,c0);
888#else
889		for (j=0;j<num;++j)
890			mul_add(tp[j],ap[j],ml,c0);
891#endif
892		c1 = (tp[num] + c0)&BN_MASK2;
893		tp[num]   = c1;
894		tp[num+1] = (c1<c0?1:0);
895	enter:
		/* Montgomery reduction step: add ml*np so tp[0] becomes 0,
		 * then shift tp down one word (done via tp[j-1]=... below). */
896		c1  = tp[0];
897		ml = (c1*n0)&BN_MASK2;
898		c0 = 0;
899#ifdef mul64
900		mh = HBITS(ml);
901		ml = LBITS(ml);
902		mul_add(c1,np[0],ml,mh,c0);
903#else
904		mul_add(c1,ml,np[0],c0);
905#endif
906		for(j=1;j<num;j++)
907			{
908			c1 = tp[j];
909#ifdef mul64
910			mul_add(c1,np[j],ml,mh,c0);
911#else
912			mul_add(c1,ml,np[j],c0);
913#endif
914			tp[j-1] = c1&BN_MASK2;
915			}
916		c1        = (tp[num] + c0)&BN_MASK2;
917		tp[num-1] = c1;
918		tp[num]   = tp[num+1] + (c1<c0?1:0);
919		}
920
	/* Final conditional subtraction of the modulus, then wipe the
	 * scratch area through the volatile alias. */
921	if (tp[num]!=0 || tp[num-1]>=np[num-1])
922		{
923		c0 = bn_sub_words(rp,tp,np,num);
924		if (tp[num]!=0 || c0==0)
925			{
926			for(i=0;i<num+2;i++)	vp[i] = 0;
927			return 1;
928			}
929		}
930	for(i=0;i<num;i++)	rp[i] = tp[i],	vp[i] = 0;
931	vp[num]   = 0;
932	vp[num+1] = 0;
933	return 1;
934	}
935#else
936/*
937 * Return value of 0 indicates that multiplication/convolution was not
938 * performed to signal the caller to fall down to alternative/original
939 * code-path.
940 */
/* Stub used when OPENSSL_BN_ASM_MONT is not defined: returning 0 tells
 * the caller to fall back to the generic code path (see comment above). */
941int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
942{	return 0;	}
943#endif /* OPENSSL_BN_ASM_MONT */
944#endif
945
946#else /* !BN_MUL_COMBA */
947
948/* hmm... is it faster just to do a multiply? */
949#undef bn_sqr_comba4
/* Non-comba fallback: r[0..7] = a[0..3]^2 via the generic schoolbook
 * squaring routine, with t as its 8-word scratch area. */
950void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
951	{
952	BN_ULONG t[8];
953	bn_sqr_normal(r,a,4,t);
954	}
955
/* Non-comba fallback: r[0..15] = a[0..7]^2, 16-word scratch. */
956#undef bn_sqr_comba8
957void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
958	{
959	BN_ULONG t[16];
960	bn_sqr_normal(r,a,8,t);
961	}
962
/* Non-comba fallback: r[0..7] = a[0..3]*b[0..3] as one bn_mul_words
 * pass for b[0] followed by multiply-accumulate passes for b[1..3];
 * each call returns the carry-out word stored at r[4+i]. */
963void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
964	{
965	r[4]=bn_mul_words(    &(r[0]),a,4,b[0]);
966	r[5]=bn_mul_add_words(&(r[1]),a,4,b[1]);
967	r[6]=bn_mul_add_words(&(r[2]),a,4,b[2]);
968	r[7]=bn_mul_add_words(&(r[3]),a,4,b[3]);
969	}
970
/* Non-comba fallback: r[0..15] = a[0..7]*b[0..7], same scheme as
 * bn_mul_comba4 above with eight passes. */
971void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
972	{
973	r[ 8]=bn_mul_words(    &(r[0]),a,8,b[0]);
974	r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
975	r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
976	r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
977	r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
978	r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
979	r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
980	r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
981	}
982
983#ifdef OPENSSL_NO_ASM
984#ifdef OPENSSL_BN_ASM_MONT
985#include <alloca.h>
/*
 * bn_mul_mont (word-routine variant) - Montgomery multiplication built
 * on bn_mul_add_words instead of the per-word macros: rp = ap*bp*R^-1
 * mod np.  Always returns 1.  Scratch tp (num+2 words, alloca'd) is
 * wiped through the volatile alias vp before returning, presumably so
 * the wipe of secret intermediates is not optimized away.
 */
986int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num)
987	{
988	BN_ULONG c0,c1,*tp,n0=*n0p;
989	volatile BN_ULONG *vp;
990	int i=0,j;
991
992	vp = tp = alloca((num+2)*sizeof(BN_ULONG));
993
994	for(i=0;i<=num;i++)	tp[i]=0;
995
996	for(i=0;i<num;i++)
997		{
		/* tp += ap*bp[i], propagating the carry into tp[num..num+1]. */
998		c0 = bn_mul_add_words(tp,ap,num,bp[i]);
999		c1 = (tp[num] + c0)&BN_MASK2;
1000		tp[num]   = c1;
1001		tp[num+1] = (c1<c0?1:0);
1002
		/* Reduction: add (tp[0]*n0 mod 2^w)*np so tp[0] becomes 0,
		 * then shift tp down one word. */
1003		c0 = bn_mul_add_words(tp,np,num,tp[0]*n0);
1004		c1 = (tp[num] + c0)&BN_MASK2;
1005		tp[num]   = c1;
1006		tp[num+1] += (c1<c0?1:0);
1007		for(j=0;j<=num;j++)	tp[j]=tp[j+1];
1008		}
1009
	/* Final conditional subtraction of np, then wipe the scratch. */
1010	if (tp[num]!=0 || tp[num-1]>=np[num-1])
1011		{
1012		c0 = bn_sub_words(rp,tp,np,num);
1013		if (tp[num]!=0 || c0==0)
1014			{
1015			for(i=0;i<num+2;i++)	vp[i] = 0;
1016			return 1;
1017			}
1018		}
1019	for(i=0;i<num;i++)	rp[i] = tp[i],	vp[i] = 0;
1020	vp[num]   = 0;
1021	vp[num+1] = 0;
1022	return 1;
1023	}
1024#else
/* Stub when OPENSSL_BN_ASM_MONT is not defined: 0 = "not performed",
 * caller falls back to the generic code path. */
1025int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
1026{	return 0;	}
1027#endif /* OPENSSL_BN_ASM_MONT */
1028#endif
1029
1030#endif /* !BN_MUL_COMBA */
diff --git a/src/lib/libcrypto/bn/bn_blind.c b/src/lib/libcrypto/bn/bn_blind.c
deleted file mode 100644
index 9ed8bc2b40..0000000000
--- a/src/lib/libcrypto/bn/bn_blind.c
+++ /dev/null
@@ -1,385 +0,0 @@
1/* crypto/bn/bn_blind.c */
2/* ====================================================================
3 * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 * This product includes cryptographic software written by Eric Young
51 * (eay@cryptsoft.com). This product includes software written by Tim
52 * Hudson (tjh@cryptsoft.com).
53 *
54 */
55/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
56 * All rights reserved.
57 *
58 * This package is an SSL implementation written
59 * by Eric Young (eay@cryptsoft.com).
60 * The implementation was written so as to conform with Netscapes SSL.
61 *
62 * This library is free for commercial and non-commercial use as long as
63 * the following conditions are aheared to. The following conditions
64 * apply to all code found in this distribution, be it the RC4, RSA,
65 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
66 * included with this distribution is covered by the same copyright terms
67 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
68 *
69 * Copyright remains Eric Young's, and as such any Copyright notices in
70 * the code are not to be removed.
71 * If this package is used in a product, Eric Young should be given attribution
72 * as the author of the parts of the library used.
73 * This can be in the form of a textual message at program startup or
74 * in documentation (online or textual) provided with the package.
75 *
76 * Redistribution and use in source and binary forms, with or without
77 * modification, are permitted provided that the following conditions
78 * are met:
79 * 1. Redistributions of source code must retain the copyright
80 * notice, this list of conditions and the following disclaimer.
81 * 2. Redistributions in binary form must reproduce the above copyright
82 * notice, this list of conditions and the following disclaimer in the
83 * documentation and/or other materials provided with the distribution.
84 * 3. All advertising materials mentioning features or use of this software
85 * must display the following acknowledgement:
86 * "This product includes cryptographic software written by
87 * Eric Young (eay@cryptsoft.com)"
88 * The word 'cryptographic' can be left out if the rouines from the library
89 * being used are not cryptographic related :-).
90 * 4. If you include any Windows specific code (or a derivative thereof) from
91 * the apps directory (application code) you must include an acknowledgement:
92 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
93 *
94 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
95 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
96 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
97 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
98 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
99 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
100 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
101 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
102 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
103 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
104 * SUCH DAMAGE.
105 *
106 * The licence and distribution terms for any publically available version or
107 * derivative of this code cannot be changed. i.e. this code cannot simply be
108 * copied and put under another distribution licence
109 * [including the GNU Public Licence.]
110 */
111
112#include <stdio.h>
113#include "cryptlib.h"
114#include "bn_lcl.h"
115
116#define BN_BLINDING_COUNTER 32
117
/* State for RSA-style base blinding.  A holds the blinding factor, Ai
 * its modular inverse mod `mod`; counter tracks uses until the
 * parameters are refreshed (see BN_BLINDING_update/create_param). */
118struct bn_blinding_st
119	{
120	BIGNUM *A;
121	BIGNUM *Ai;
122	BIGNUM *e;	/* public exponent used when re-creating A (create_param) */
123	BIGNUM *mod; /* just a reference */
124#ifndef OPENSSL_NO_DEPRECATED
125	unsigned long thread_id; /* added in OpenSSL 0.9.6j and 0.9.7b;
126				  * used only by crypto/rsa/rsa_eay.c, rsa_lib.c */
127#endif
128	CRYPTO_THREADID tid;	/* owning thread (set in BN_BLINDING_new) */
129	int counter;	/* -1 = fresh/never used; wraps at BN_BLINDING_COUNTER */
130	unsigned long flags;	/* BN_BLINDING_NO_UPDATE / BN_BLINDING_NO_RECREATE */
131	BN_MONT_CTX *m_ctx;	/* optional Montgomery context for bn_mod_exp */
132	int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
133			  const BIGNUM *m, BN_CTX *ctx,
134			  BN_MONT_CTX *m_ctx);	/* optional modexp override */
135	};
136
/*
 * BN_BLINDING_new - allocate a blinding structure.  A and Ai (each may
 * be NULL) are duplicated; mod is always duplicated, carrying over its
 * BN_FLG_CONSTTIME flag.  Returns the new structure or NULL on error
 * (with everything partially allocated freed via BN_BLINDING_free).
 */
137BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod)
138	{
139	BN_BLINDING *ret=NULL;
140
141	bn_check_top(mod);
142
143	if ((ret=(BN_BLINDING *)OPENSSL_malloc(sizeof(BN_BLINDING))) == NULL)
144		{
145		BNerr(BN_F_BN_BLINDING_NEW,ERR_R_MALLOC_FAILURE);
146		return(NULL);
147		}
148	memset(ret,0,sizeof(BN_BLINDING));
149	if (A != NULL)
150		{
151		if ((ret->A  = BN_dup(A))  == NULL) goto err;
152		}
153	if (Ai != NULL)
154		{
155		if ((ret->Ai = BN_dup(Ai)) == NULL) goto err;
156		}
157
158	/* save a copy of mod in the BN_BLINDING structure */
159	if ((ret->mod = BN_dup(mod)) == NULL) goto err;
160	if (BN_get_flags(mod, BN_FLG_CONSTTIME) != 0)
161		BN_set_flags(ret->mod, BN_FLG_CONSTTIME);
162
163	/* Set the counter to the special value -1
164	 * to indicate that this is never-used fresh blinding
165	 * that does not need updating before first use. */
166	ret->counter = -1;
167	CRYPTO_THREADID_current(&ret->tid);
168	return(ret);
169err:
170	if (ret != NULL) BN_BLINDING_free(ret);
171	return(NULL);
172	}
173
/* BN_BLINDING_free - release a blinding structure and all owned BIGNUMs.
 * NULL-safe.  (m_ctx is not freed: it is a caller-owned reference.) */
174void BN_BLINDING_free(BN_BLINDING *r)
175	{
176	if(r == NULL)
177		return;
178
179	if (r->A  != NULL) BN_free(r->A );
180	if (r->Ai != NULL) BN_free(r->Ai);
181	if (r->e  != NULL) BN_free(r->e );
182	if (r->mod != NULL) BN_free(r->mod);
183	OPENSSL_free(r);
184	}
185
/*
 * BN_BLINDING_update - advance the blinding state for the next use.
 * Normally squares A and Ai mod `mod`; every BN_BLINDING_COUNTER (32)
 * uses, re-creates fresh parameters instead (when b->e is set and
 * BN_BLINDING_NO_RECREATE is not in flags).  Returns 1 on success,
 * 0 on error (uninitialized A/Ai or a failed BN operation).
 */
186int BN_BLINDING_update(BN_BLINDING *b, BN_CTX *ctx)
187	{
188	int ret=0;
189
190	if ((b->A == NULL) || (b->Ai == NULL))
191		{
192		BNerr(BN_F_BN_BLINDING_UPDATE,BN_R_NOT_INITIALIZED);
193		goto err;
194		}
195
196	if (b->counter == -1)
197		b->counter = 0;
198
199	if (++b->counter == BN_BLINDING_COUNTER && b->e != NULL &&
200		!(b->flags & BN_BLINDING_NO_RECREATE))
201		{
202		/* re-create blinding parameters */
203		if (!BN_BLINDING_create_param(b, NULL, NULL, ctx, NULL, NULL))
204			goto err;
205		}
206	else if (!(b->flags & BN_BLINDING_NO_UPDATE))
207		{
208		if (!BN_mod_mul(b->A,b->A,b->A,b->mod,ctx)) goto err;
209		if (!BN_mod_mul(b->Ai,b->Ai,b->Ai,b->mod,ctx)) goto err;
210		}
211
212	ret=1;
213err:
	/* Reset the counter even on the error path so a failed re-create
	 * is retried only after another full cycle. */
214	if (b->counter == BN_BLINDING_COUNTER)
215		b->counter = 0;
216	return(ret);
217	}
218
/* BN_BLINDING_convert - blind n in place (n = n*A mod mod); thin
 * wrapper over BN_BLINDING_convert_ex with no inverse output. */
219int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx)
220	{
221	return BN_BLINDING_convert_ex(n, NULL, b, ctx);
222	}
223
/*
 * BN_BLINDING_convert_ex - blind n in place: n = n * A mod mod.
 * If r is non-NULL it receives a copy of Ai (the inverse to use later
 * in BN_BLINDING_invert_ex).  Updates the blinding state first unless
 * it is fresh (counter == -1).  Returns 1 on success, 0 on error.
 */
224int BN_BLINDING_convert_ex(BIGNUM *n, BIGNUM *r, BN_BLINDING *b, BN_CTX *ctx)
225	{
226	int ret = 1;
227
228	bn_check_top(n);
229
230	if ((b->A == NULL) || (b->Ai == NULL))
231		{
232		BNerr(BN_F_BN_BLINDING_CONVERT_EX,BN_R_NOT_INITIALIZED);
233		return(0);
234		}
235
236	if (b->counter == -1)
237		/* Fresh blinding, doesn't need updating. */
238		b->counter = 0;
239	else if (!BN_BLINDING_update(b,ctx))
240		return(0);
241
242	if (r != NULL)
243		{
244		if (!BN_copy(r, b->Ai)) ret=0;
245		}
246
247	if (!BN_mod_mul(n,n,b->A,b->mod,ctx)) ret=0;
248
249	return ret;
250	}
251
/* BN_BLINDING_invert - unblind n in place using the stored inverse Ai;
 * thin wrapper over BN_BLINDING_invert_ex. */
252int BN_BLINDING_invert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx)
253	{
254	return BN_BLINDING_invert_ex(n, NULL, b, ctx);
255	}
256
/*
 * BN_BLINDING_invert_ex - unblind n in place: n = n * r mod mod, where
 * r is the inverse captured at convert time, or b->Ai when r is NULL.
 * Returns the BN_mod_mul result (1 success, 0 failure), or 0 if no
 * inverse is available.
 */
257int BN_BLINDING_invert_ex(BIGNUM *n, const BIGNUM *r, BN_BLINDING *b, BN_CTX *ctx)
258	{
259	int ret;
260
261	bn_check_top(n);
262
263	if (r != NULL)
264		ret = BN_mod_mul(n, n, r, b->mod, ctx);
265	else
266		{
267		if (b->Ai == NULL)
268			{
269			BNerr(BN_F_BN_BLINDING_INVERT_EX,BN_R_NOT_INITIALIZED);
270			return(0);
271			}
272		ret = BN_mod_mul(n, n, b->Ai, b->mod, ctx);
273		}
274
275	bn_check_top(n);
276	return(ret);
277	}
278
279#ifndef OPENSSL_NO_DEPRECATED
/* Deprecated numeric thread-id accessors (see struct comment);
 * superseded by the CRYPTO_THREADID in b->tid. */
280unsigned long BN_BLINDING_get_thread_id(const BN_BLINDING *b)
281	{
282	return b->thread_id;
283	}
284
285void BN_BLINDING_set_thread_id(BN_BLINDING *b, unsigned long n)
286	{
287	b->thread_id = n;
288	}
289#endif
290
/* BN_BLINDING_thread_id - expose the owning-thread id slot so callers
 * can read or compare it. */
291CRYPTO_THREADID *BN_BLINDING_thread_id(BN_BLINDING *b)
292	{
293	return &b->tid;
294	}
295
/* BN_BLINDING_get_flags - return the BN_BLINDING_NO_* flag word. */
296unsigned long BN_BLINDING_get_flags(const BN_BLINDING *b)
297	{
298	return b->flags;
299	}
300
/* BN_BLINDING_set_flags - replace the flag word wholesale. */
301void BN_BLINDING_set_flags(BN_BLINDING *b, unsigned long flags)
302	{
303	b->flags = flags;
304	}
305
/*
 * BN_BLINDING_create_param - create (b == NULL) or refresh (b != NULL)
 * blinding parameters: picks a random A in [0, mod), computes its
 * inverse Ai, then raises A to the public exponent e so that blinding
 * with A and unblinding with Ai cancel around an RSA private operation.
 * e, bn_mod_exp and m_ctx each override the stored value when non-NULL.
 * Retries up to 32 times when A happens to be non-invertible.
 * Returns the (possibly new) structure, or NULL on error — in which
 * case a structure allocated here is freed, but a caller-supplied b
 * is left for the caller to free.
 */
306BN_BLINDING *BN_BLINDING_create_param(BN_BLINDING *b,
307	const BIGNUM *e, BIGNUM *m, BN_CTX *ctx,
308	int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
309			  const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx),
310	BN_MONT_CTX *m_ctx)
311{
312	int    retry_counter = 32;
313	BN_BLINDING *ret = NULL;
314
315	if (b == NULL)
316		ret = BN_BLINDING_new(NULL, NULL, m);
317	else
318		ret = b;
319
320	if (ret == NULL)
321		goto err;
322
323	if (ret->A  == NULL && (ret->A  = BN_new()) == NULL)
324		goto err;
325	if (ret->Ai == NULL && (ret->Ai = BN_new()) == NULL)
326		goto err;
327
328	if (e != NULL)
329		{
330		if (ret->e != NULL)
331			BN_free(ret->e);
332		ret->e = BN_dup(e);
333		}
334	if (ret->e == NULL)
335		goto err;
336
337	if (bn_mod_exp != NULL)
338		ret->bn_mod_exp = bn_mod_exp;
339	if (m_ctx != NULL)
340		ret->m_ctx = m_ctx;
341
342	do {
343		if (!BN_rand_range(ret->A, ret->mod)) goto err;
344		if (BN_mod_inverse(ret->Ai, ret->A, ret->mod, ctx) == NULL)
345			{
346			/* this should almost never happen for good RSA keys */
347			unsigned long error = ERR_peek_last_error();
348			if (ERR_GET_REASON(error) == BN_R_NO_INVERSE)
349				{
350				if (retry_counter-- == 0)
351				{
352					BNerr(BN_F_BN_BLINDING_CREATE_PARAM,
353						BN_R_TOO_MANY_ITERATIONS);
354					goto err;
355				}
356				ERR_clear_error();
357				}
358			else
359				goto err;
360			}
361		else
362			break;
363	} while (1);
364
	/* A = A^e mod m, via the caller-provided modexp when available. */
365	if (ret->bn_mod_exp != NULL && ret->m_ctx != NULL)
366		{
367		if (!ret->bn_mod_exp(ret->A, ret->A, ret->e, ret->mod, ctx, ret->m_ctx))
368			goto err;
369		}
370	else
371		{
372		if (!BN_mod_exp(ret->A, ret->A, ret->e, ret->mod, ctx))
373			goto err;
374		}
375
376	return ret;
377err:
378	if (b == NULL && ret != NULL)
379		{
380		BN_BLINDING_free(ret);
381		ret = NULL;
382		}
383
384	return ret;
385}
diff --git a/src/lib/libcrypto/bn/bn_const.c b/src/lib/libcrypto/bn/bn_const.c
deleted file mode 100644
index eb60a25b3c..0000000000
--- a/src/lib/libcrypto/bn/bn_const.c
+++ /dev/null
@@ -1,402 +0,0 @@
1/* crypto/bn/knownprimes.c */
2/* Insert boilerplate */
3
4#include "bn.h"
5
6/* "First Oakley Default Group" from RFC2409, section 6.1.
7 *
8 * The prime is: 2^768 - 2 ^704 - 1 + 2^64 * { [2^638 pi] + 149686 }
9 *
10 * RFC2409 specifies a generator of 2.
11 * RFC2412 specifies a generator of 22.
12 */
13
/* Return the RFC 2409 768-bit Oakley prime (see comment above) as a
 * BIGNUM; BN_bin2bn fills bn, allocating a fresh one when bn is NULL. */
14BIGNUM *get_rfc2409_prime_768(BIGNUM *bn)
15	{
16	static const unsigned char RFC2409_PRIME_768[]={
17		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
18		0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
19		0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
20		0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
21		0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
22		0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
23		0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
24		0xA6,0x3A,0x36,0x20,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
25		};
26	return BN_bin2bn(RFC2409_PRIME_768,sizeof(RFC2409_PRIME_768),bn);
27	}
28
29/* "Second Oakley Default Group" from RFC2409, section 6.2.
30 *
31 * The prime is: 2^1024 - 2^960 - 1 + 2^64 * { [2^894 pi] + 129093 }.
32 *
33 * RFC2409 specifies a generator of 2.
34 * RFC2412 specifies a generator of 22.
35 */
36
/* Return the RFC 2409 1024-bit Oakley prime (see comment above);
 * BN_bin2bn fills bn, allocating a fresh one when bn is NULL. */
37BIGNUM *get_rfc2409_prime_1024(BIGNUM *bn)
38	{
39	static const unsigned char RFC2409_PRIME_1024[]={
40		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
41		0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
42		0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
43		0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
44		0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
45		0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
46		0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
47		0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
48		0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
49		0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE6,0x53,0x81,
50		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
51		};
52	return BN_bin2bn(RFC2409_PRIME_1024,sizeof(RFC2409_PRIME_1024),bn);
53	}
54
55/* "1536-bit MODP Group" from RFC3526, Section 2.
56 *
57 * The prime is: 2^1536 - 2^1472 - 1 + 2^64 * { [2^1406 pi] + 741804 }
58 *
59 * RFC3526 specifies a generator of 2.
60 * RFC2412 specifies a generator of 22.
61 */
62
/* Return the RFC 3526 1536-bit MODP group prime (see comment above);
 * BN_bin2bn fills bn, allocating a fresh one when bn is NULL. */
63BIGNUM *get_rfc3526_prime_1536(BIGNUM *bn)
64	{
65	static const unsigned char RFC3526_PRIME_1536[]={
66		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
67		0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
68		0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
69		0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
70		0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
71		0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
72		0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
73		0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
74		0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
75		0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE4,0x5B,0x3D,
76		0xC2,0x00,0x7C,0xB8,0xA1,0x63,0xBF,0x05,0x98,0xDA,0x48,0x36,
77		0x1C,0x55,0xD3,0x9A,0x69,0x16,0x3F,0xA8,0xFD,0x24,0xCF,0x5F,
78		0x83,0x65,0x5D,0x23,0xDC,0xA3,0xAD,0x96,0x1C,0x62,0xF3,0x56,
79		0x20,0x85,0x52,0xBB,0x9E,0xD5,0x29,0x07,0x70,0x96,0x96,0x6D,
80		0x67,0x0C,0x35,0x4E,0x4A,0xBC,0x98,0x04,0xF1,0x74,0x6C,0x08,
81		0xCA,0x23,0x73,0x27,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
82		};
83	return BN_bin2bn(RFC3526_PRIME_1536,sizeof(RFC3526_PRIME_1536),bn);
84	}
85
86/* "2048-bit MODP Group" from RFC3526, Section 3.
87 *
88 * The prime is: 2^2048 - 2^1984 - 1 + 2^64 * { [2^1918 pi] + 124476 }
89 *
90 * RFC3526 specifies a generator of 2.
91 */
92
/* Return the RFC 3526 2048-bit MODP group prime (see comment above);
 * BN_bin2bn fills bn, allocating a fresh one when bn is NULL. */
93BIGNUM *get_rfc3526_prime_2048(BIGNUM *bn)
94	{
95	static const unsigned char RFC3526_PRIME_2048[]={
96		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
97		0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
98		0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
99		0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
100		0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
101		0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
102		0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
103		0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
104		0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
105		0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE4,0x5B,0x3D,
106		0xC2,0x00,0x7C,0xB8,0xA1,0x63,0xBF,0x05,0x98,0xDA,0x48,0x36,
107		0x1C,0x55,0xD3,0x9A,0x69,0x16,0x3F,0xA8,0xFD,0x24,0xCF,0x5F,
108		0x83,0x65,0x5D,0x23,0xDC,0xA3,0xAD,0x96,0x1C,0x62,0xF3,0x56,
109		0x20,0x85,0x52,0xBB,0x9E,0xD5,0x29,0x07,0x70,0x96,0x96,0x6D,
110		0x67,0x0C,0x35,0x4E,0x4A,0xBC,0x98,0x04,0xF1,0x74,0x6C,0x08,
111		0xCA,0x18,0x21,0x7C,0x32,0x90,0x5E,0x46,0x2E,0x36,0xCE,0x3B,
112		0xE3,0x9E,0x77,0x2C,0x18,0x0E,0x86,0x03,0x9B,0x27,0x83,0xA2,
113		0xEC,0x07,0xA2,0x8F,0xB5,0xC5,0x5D,0xF0,0x6F,0x4C,0x52,0xC9,
114		0xDE,0x2B,0xCB,0xF6,0x95,0x58,0x17,0x18,0x39,0x95,0x49,0x7C,
115		0xEA,0x95,0x6A,0xE5,0x15,0xD2,0x26,0x18,0x98,0xFA,0x05,0x10,
116		0x15,0x72,0x8E,0x5A,0x8A,0xAC,0xAA,0x68,0xFF,0xFF,0xFF,0xFF,
117		0xFF,0xFF,0xFF,0xFF,
118		};
119	return BN_bin2bn(RFC3526_PRIME_2048,sizeof(RFC3526_PRIME_2048),bn);
120	}
121
122/* "3072-bit MODP Group" from RFC3526, Section 4.
123 *
124 * The prime is: 2^3072 - 2^3008 - 1 + 2^64 * { [2^2942 pi] + 1690314 }
125 *
126 * RFC3526 specifies a generator of 2.
127 */
128
/* Return the RFC 3526 3072-bit MODP group prime (see comment above);
 * BN_bin2bn fills bn, allocating a fresh one when bn is NULL. */
129BIGNUM *get_rfc3526_prime_3072(BIGNUM *bn)
130	{
131	static const unsigned char RFC3526_PRIME_3072[]={
132		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
133		0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
134		0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
135		0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
136		0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
137		0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
138		0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
139		0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
140		0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
141		0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE4,0x5B,0x3D,
142		0xC2,0x00,0x7C,0xB8,0xA1,0x63,0xBF,0x05,0x98,0xDA,0x48,0x36,
143		0x1C,0x55,0xD3,0x9A,0x69,0x16,0x3F,0xA8,0xFD,0x24,0xCF,0x5F,
144		0x83,0x65,0x5D,0x23,0xDC,0xA3,0xAD,0x96,0x1C,0x62,0xF3,0x56,
145		0x20,0x85,0x52,0xBB,0x9E,0xD5,0x29,0x07,0x70,0x96,0x96,0x6D,
146		0x67,0x0C,0x35,0x4E,0x4A,0xBC,0x98,0x04,0xF1,0x74,0x6C,0x08,
147		0xCA,0x18,0x21,0x7C,0x32,0x90,0x5E,0x46,0x2E,0x36,0xCE,0x3B,
148		0xE3,0x9E,0x77,0x2C,0x18,0x0E,0x86,0x03,0x9B,0x27,0x83,0xA2,
149		0xEC,0x07,0xA2,0x8F,0xB5,0xC5,0x5D,0xF0,0x6F,0x4C,0x52,0xC9,
150		0xDE,0x2B,0xCB,0xF6,0x95,0x58,0x17,0x18,0x39,0x95,0x49,0x7C,
151		0xEA,0x95,0x6A,0xE5,0x15,0xD2,0x26,0x18,0x98,0xFA,0x05,0x10,
152		0x15,0x72,0x8E,0x5A,0x8A,0xAA,0xC4,0x2D,0xAD,0x33,0x17,0x0D,
153		0x04,0x50,0x7A,0x33,0xA8,0x55,0x21,0xAB,0xDF,0x1C,0xBA,0x64,
154		0xEC,0xFB,0x85,0x04,0x58,0xDB,0xEF,0x0A,0x8A,0xEA,0x71,0x57,
155		0x5D,0x06,0x0C,0x7D,0xB3,0x97,0x0F,0x85,0xA6,0xE1,0xE4,0xC7,
156		0xAB,0xF5,0xAE,0x8C,0xDB,0x09,0x33,0xD7,0x1E,0x8C,0x94,0xE0,
157		0x4A,0x25,0x61,0x9D,0xCE,0xE3,0xD2,0x26,0x1A,0xD2,0xEE,0x6B,
158		0xF1,0x2F,0xFA,0x06,0xD9,0x8A,0x08,0x64,0xD8,0x76,0x02,0x73,
159		0x3E,0xC8,0x6A,0x64,0x52,0x1F,0x2B,0x18,0x17,0x7B,0x20,0x0C,
160		0xBB,0xE1,0x17,0x57,0x7A,0x61,0x5D,0x6C,0x77,0x09,0x88,0xC0,
161		0xBA,0xD9,0x46,0xE2,0x08,0xE2,0x4F,0xA0,0x74,0xE5,0xAB,0x31,
162		0x43,0xDB,0x5B,0xFC,0xE0,0xFD,0x10,0x8E,0x4B,0x82,0xD1,0x20,
163		0xA9,0x3A,0xD2,0xCA,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
164		};
165	return BN_bin2bn(RFC3526_PRIME_3072,sizeof(RFC3526_PRIME_3072),bn);
166	}
167
168/* "4096-bit MODP Group" from RFC3526, Section 5.
169 *
170 * The prime is: 2^4096 - 2^4032 - 1 + 2^64 * { [2^3966 pi] + 240904 }
171 *
172 * RFC3526 specifies a generator of 2.
173 */
174
/* Return the RFC 3526 4096-bit MODP group prime (see comment above);
 * BN_bin2bn fills bn, allocating a fresh one when bn is NULL. */
175BIGNUM *get_rfc3526_prime_4096(BIGNUM *bn)
176	{
177	static const unsigned char RFC3526_PRIME_4096[]={
178		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
179		0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
180		0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
181		0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
182		0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
183		0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
184		0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
185		0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
186		0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
187		0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE4,0x5B,0x3D,
188		0xC2,0x00,0x7C,0xB8,0xA1,0x63,0xBF,0x05,0x98,0xDA,0x48,0x36,
189		0x1C,0x55,0xD3,0x9A,0x69,0x16,0x3F,0xA8,0xFD,0x24,0xCF,0x5F,
190		0x83,0x65,0x5D,0x23,0xDC,0xA3,0xAD,0x96,0x1C,0x62,0xF3,0x56,
191		0x20,0x85,0x52,0xBB,0x9E,0xD5,0x29,0x07,0x70,0x96,0x96,0x6D,
192		0x67,0x0C,0x35,0x4E,0x4A,0xBC,0x98,0x04,0xF1,0x74,0x6C,0x08,
193		0xCA,0x18,0x21,0x7C,0x32,0x90,0x5E,0x46,0x2E,0x36,0xCE,0x3B,
194		0xE3,0x9E,0x77,0x2C,0x18,0x0E,0x86,0x03,0x9B,0x27,0x83,0xA2,
195		0xEC,0x07,0xA2,0x8F,0xB5,0xC5,0x5D,0xF0,0x6F,0x4C,0x52,0xC9,
196		0xDE,0x2B,0xCB,0xF6,0x95,0x58,0x17,0x18,0x39,0x95,0x49,0x7C,
197		0xEA,0x95,0x6A,0xE5,0x15,0xD2,0x26,0x18,0x98,0xFA,0x05,0x10,
198		0x15,0x72,0x8E,0x5A,0x8A,0xAA,0xC4,0x2D,0xAD,0x33,0x17,0x0D,
199		0x04,0x50,0x7A,0x33,0xA8,0x55,0x21,0xAB,0xDF,0x1C,0xBA,0x64,
200		0xEC,0xFB,0x85,0x04,0x58,0xDB,0xEF,0x0A,0x8A,0xEA,0x71,0x57,
201		0x5D,0x06,0x0C,0x7D,0xB3,0x97,0x0F,0x85,0xA6,0xE1,0xE4,0xC7,
202		0xAB,0xF5,0xAE,0x8C,0xDB,0x09,0x33,0xD7,0x1E,0x8C,0x94,0xE0,
203		0x4A,0x25,0x61,0x9D,0xCE,0xE3,0xD2,0x26,0x1A,0xD2,0xEE,0x6B,
204		0xF1,0x2F,0xFA,0x06,0xD9,0x8A,0x08,0x64,0xD8,0x76,0x02,0x73,
205		0x3E,0xC8,0x6A,0x64,0x52,0x1F,0x2B,0x18,0x17,0x7B,0x20,0x0C,
206		0xBB,0xE1,0x17,0x57,0x7A,0x61,0x5D,0x6C,0x77,0x09,0x88,0xC0,
207		0xBA,0xD9,0x46,0xE2,0x08,0xE2,0x4F,0xA0,0x74,0xE5,0xAB,0x31,
208		0x43,0xDB,0x5B,0xFC,0xE0,0xFD,0x10,0x8E,0x4B,0x82,0xD1,0x20,
209		0xA9,0x21,0x08,0x01,0x1A,0x72,0x3C,0x12,0xA7,0x87,0xE6,0xD7,
210		0x88,0x71,0x9A,0x10,0xBD,0xBA,0x5B,0x26,0x99,0xC3,0x27,0x18,
211		0x6A,0xF4,0xE2,0x3C,0x1A,0x94,0x68,0x34,0xB6,0x15,0x0B,0xDA,
212		0x25,0x83,0xE9,0xCA,0x2A,0xD4,0x4C,0xE8,0xDB,0xBB,0xC2,0xDB,
213		0x04,0xDE,0x8E,0xF9,0x2E,0x8E,0xFC,0x14,0x1F,0xBE,0xCA,0xA6,
214		0x28,0x7C,0x59,0x47,0x4E,0x6B,0xC0,0x5D,0x99,0xB2,0x96,0x4F,
215		0xA0,0x90,0xC3,0xA2,0x23,0x3B,0xA1,0x86,0x51,0x5B,0xE7,0xED,
216		0x1F,0x61,0x29,0x70,0xCE,0xE2,0xD7,0xAF,0xB8,0x1B,0xDD,0x76,
217		0x21,0x70,0x48,0x1C,0xD0,0x06,0x91,0x27,0xD5,0xB0,0x5A,0xA9,
218		0x93,0xB4,0xEA,0x98,0x8D,0x8F,0xDD,0xC1,0x86,0xFF,0xB7,0xDC,
219		0x90,0xA6,0xC0,0x8F,0x4D,0xF4,0x35,0xC9,0x34,0x06,0x31,0x99,
220		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
221		};
222	return BN_bin2bn(RFC3526_PRIME_4096,sizeof(RFC3526_PRIME_4096),bn);
223	}
224
225/* "6144-bit MODP Group" from RFC3526, Section 6.
226 *
227 * The prime is: 2^6144 - 2^6080 - 1 + 2^64 * { [2^6014 pi] + 929484 }
228 *
229 * RFC3526 specifies a generator of 2.
230 */
231
232BIGNUM *get_rfc3526_prime_6144(BIGNUM *bn)
233	{
	/* Big-endian (most significant byte first) encoding of the
	 * 6144-bit MODP prime from RFC 3526, Section 6. */
234	static const unsigned char RFC3526_PRIME_6144[]={
235	0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
236	0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
237	0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
238	0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
239	0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
240	0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
241	0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
242	0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
243	0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
244	0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE4,0x5B,0x3D,
245	0xC2,0x00,0x7C,0xB8,0xA1,0x63,0xBF,0x05,0x98,0xDA,0x48,0x36,
246	0x1C,0x55,0xD3,0x9A,0x69,0x16,0x3F,0xA8,0xFD,0x24,0xCF,0x5F,
247	0x83,0x65,0x5D,0x23,0xDC,0xA3,0xAD,0x96,0x1C,0x62,0xF3,0x56,
248	0x20,0x85,0x52,0xBB,0x9E,0xD5,0x29,0x07,0x70,0x96,0x96,0x6D,
249	0x67,0x0C,0x35,0x4E,0x4A,0xBC,0x98,0x04,0xF1,0x74,0x6C,0x08,
250	0xCA,0x18,0x21,0x7C,0x32,0x90,0x5E,0x46,0x2E,0x36,0xCE,0x3B,
251	0xE3,0x9E,0x77,0x2C,0x18,0x0E,0x86,0x03,0x9B,0x27,0x83,0xA2,
252	0xEC,0x07,0xA2,0x8F,0xB5,0xC5,0x5D,0xF0,0x6F,0x4C,0x52,0xC9,
253	0xDE,0x2B,0xCB,0xF6,0x95,0x58,0x17,0x18,0x39,0x95,0x49,0x7C,
254	0xEA,0x95,0x6A,0xE5,0x15,0xD2,0x26,0x18,0x98,0xFA,0x05,0x10,
255	0x15,0x72,0x8E,0x5A,0x8A,0xAA,0xC4,0x2D,0xAD,0x33,0x17,0x0D,
256	0x04,0x50,0x7A,0x33,0xA8,0x55,0x21,0xAB,0xDF,0x1C,0xBA,0x64,
257	0xEC,0xFB,0x85,0x04,0x58,0xDB,0xEF,0x0A,0x8A,0xEA,0x71,0x57,
258	0x5D,0x06,0x0C,0x7D,0xB3,0x97,0x0F,0x85,0xA6,0xE1,0xE4,0xC7,
259	0xAB,0xF5,0xAE,0x8C,0xDB,0x09,0x33,0xD7,0x1E,0x8C,0x94,0xE0,
260	0x4A,0x25,0x61,0x9D,0xCE,0xE3,0xD2,0x26,0x1A,0xD2,0xEE,0x6B,
261	0xF1,0x2F,0xFA,0x06,0xD9,0x8A,0x08,0x64,0xD8,0x76,0x02,0x73,
262	0x3E,0xC8,0x6A,0x64,0x52,0x1F,0x2B,0x18,0x17,0x7B,0x20,0x0C,
263	0xBB,0xE1,0x17,0x57,0x7A,0x61,0x5D,0x6C,0x77,0x09,0x88,0xC0,
264	0xBA,0xD9,0x46,0xE2,0x08,0xE2,0x4F,0xA0,0x74,0xE5,0xAB,0x31,
265	0x43,0xDB,0x5B,0xFC,0xE0,0xFD,0x10,0x8E,0x4B,0x82,0xD1,0x20,
266	0xA9,0x21,0x08,0x01,0x1A,0x72,0x3C,0x12,0xA7,0x87,0xE6,0xD7,
267	0x88,0x71,0x9A,0x10,0xBD,0xBA,0x5B,0x26,0x99,0xC3,0x27,0x18,
268	0x6A,0xF4,0xE2,0x3C,0x1A,0x94,0x68,0x34,0xB6,0x15,0x0B,0xDA,
269	0x25,0x83,0xE9,0xCA,0x2A,0xD4,0x4C,0xE8,0xDB,0xBB,0xC2,0xDB,
270	0x04,0xDE,0x8E,0xF9,0x2E,0x8E,0xFC,0x14,0x1F,0xBE,0xCA,0xA6,
271	0x28,0x7C,0x59,0x47,0x4E,0x6B,0xC0,0x5D,0x99,0xB2,0x96,0x4F,
272	0xA0,0x90,0xC3,0xA2,0x23,0x3B,0xA1,0x86,0x51,0x5B,0xE7,0xED,
273	0x1F,0x61,0x29,0x70,0xCE,0xE2,0xD7,0xAF,0xB8,0x1B,0xDD,0x76,
274	0x21,0x70,0x48,0x1C,0xD0,0x06,0x91,0x27,0xD5,0xB0,0x5A,0xA9,
275	0x93,0xB4,0xEA,0x98,0x8D,0x8F,0xDD,0xC1,0x86,0xFF,0xB7,0xDC,
276	0x90,0xA6,0xC0,0x8F,0x4D,0xF4,0x35,0xC9,0x34,0x02,0x84,0x92,
277	0x36,0xC3,0xFA,0xB4,0xD2,0x7C,0x70,0x26,0xC1,0xD4,0xDC,0xB2,
278	0x60,0x26,0x46,0xDE,0xC9,0x75,0x1E,0x76,0x3D,0xBA,0x37,0xBD,
279	0xF8,0xFF,0x94,0x06,0xAD,0x9E,0x53,0x0E,0xE5,0xDB,0x38,0x2F,
280	0x41,0x30,0x01,0xAE,0xB0,0x6A,0x53,0xED,0x90,0x27,0xD8,0x31,
281	0x17,0x97,0x27,0xB0,0x86,0x5A,0x89,0x18,0xDA,0x3E,0xDB,0xEB,
282	0xCF,0x9B,0x14,0xED,0x44,0xCE,0x6C,0xBA,0xCE,0xD4,0xBB,0x1B,
283	0xDB,0x7F,0x14,0x47,0xE6,0xCC,0x25,0x4B,0x33,0x20,0x51,0x51,
284	0x2B,0xD7,0xAF,0x42,0x6F,0xB8,0xF4,0x01,0x37,0x8C,0xD2,0xBF,
285	0x59,0x83,0xCA,0x01,0xC6,0x4B,0x92,0xEC,0xF0,0x32,0xEA,0x15,
286	0xD1,0x72,0x1D,0x03,0xF4,0x82,0xD7,0xCE,0x6E,0x74,0xFE,0xF6,
287	0xD5,0x5E,0x70,0x2F,0x46,0x98,0x0C,0x82,0xB5,0xA8,0x40,0x31,
288	0x90,0x0B,0x1C,0x9E,0x59,0xE7,0xC9,0x7F,0xBE,0xC7,0xE8,0xF3,
289	0x23,0xA9,0x7A,0x7E,0x36,0xCC,0x88,0xBE,0x0F,0x1D,0x45,0xB7,
290	0xFF,0x58,0x5A,0xC5,0x4B,0xD4,0x07,0xB2,0x2B,0x41,0x54,0xAA,
291	0xCC,0x8F,0x6D,0x7E,0xBF,0x48,0xE1,0xD8,0x14,0xCC,0x5E,0xD2,
292	0x0F,0x80,0x37,0xE0,0xA7,0x97,0x15,0xEE,0xF2,0x9B,0xE3,0x28,
293	0x06,0xA1,0xD5,0x8B,0xB7,0xC5,0xDA,0x76,0xF5,0x50,0xAA,0x3D,
294	0x8A,0x1F,0xBF,0xF0,0xEB,0x19,0xCC,0xB1,0xA3,0x13,0xD5,0x5C,
295	0xDA,0x56,0xC9,0xEC,0x2E,0xF2,0x96,0x32,0x38,0x7F,0xE8,0xD7,
296	0x6E,0x3C,0x04,0x68,0x04,0x3E,0x8F,0x66,0x3F,0x48,0x60,0xEE,
297	0x12,0xBF,0x2D,0x5B,0x0B,0x74,0x74,0xD6,0xE6,0x94,0xF9,0x1E,
298	0x6D,0xCC,0x40,0x24,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
299	};
	/* Decode into 'bn' via BN_bin2bn; returns the resulting BIGNUM
	 * or NULL on failure. */
300	return BN_bin2bn(RFC3526_PRIME_6144,sizeof(RFC3526_PRIME_6144),bn);
301	}
302
303/* "8192-bit MODP Group" from RFC3526, Section 7.
304 *
305 * The prime is: 2^8192 - 2^8128 - 1 + 2^64 * { [2^8062 pi] + 4743158 }
306 *
307 * RFC3526 specifies a generator of 2.
308 */
309
310BIGNUM *get_rfc3526_prime_8192(BIGNUM *bn)
311	{
	/* Big-endian (most significant byte first) encoding of the
	 * 8192-bit MODP prime from RFC 3526, Section 7. */
312	static const unsigned char RFC3526_PRIME_8192[]={
313	0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
314	0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
315	0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
316	0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
317	0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
318	0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
319	0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
320	0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
321	0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
322	0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE4,0x5B,0x3D,
323	0xC2,0x00,0x7C,0xB8,0xA1,0x63,0xBF,0x05,0x98,0xDA,0x48,0x36,
324	0x1C,0x55,0xD3,0x9A,0x69,0x16,0x3F,0xA8,0xFD,0x24,0xCF,0x5F,
325	0x83,0x65,0x5D,0x23,0xDC,0xA3,0xAD,0x96,0x1C,0x62,0xF3,0x56,
326	0x20,0x85,0x52,0xBB,0x9E,0xD5,0x29,0x07,0x70,0x96,0x96,0x6D,
327	0x67,0x0C,0x35,0x4E,0x4A,0xBC,0x98,0x04,0xF1,0x74,0x6C,0x08,
328	0xCA,0x18,0x21,0x7C,0x32,0x90,0x5E,0x46,0x2E,0x36,0xCE,0x3B,
329	0xE3,0x9E,0x77,0x2C,0x18,0x0E,0x86,0x03,0x9B,0x27,0x83,0xA2,
330	0xEC,0x07,0xA2,0x8F,0xB5,0xC5,0x5D,0xF0,0x6F,0x4C,0x52,0xC9,
331	0xDE,0x2B,0xCB,0xF6,0x95,0x58,0x17,0x18,0x39,0x95,0x49,0x7C,
332	0xEA,0x95,0x6A,0xE5,0x15,0xD2,0x26,0x18,0x98,0xFA,0x05,0x10,
333	0x15,0x72,0x8E,0x5A,0x8A,0xAA,0xC4,0x2D,0xAD,0x33,0x17,0x0D,
334	0x04,0x50,0x7A,0x33,0xA8,0x55,0x21,0xAB,0xDF,0x1C,0xBA,0x64,
335	0xEC,0xFB,0x85,0x04,0x58,0xDB,0xEF,0x0A,0x8A,0xEA,0x71,0x57,
336	0x5D,0x06,0x0C,0x7D,0xB3,0x97,0x0F,0x85,0xA6,0xE1,0xE4,0xC7,
337	0xAB,0xF5,0xAE,0x8C,0xDB,0x09,0x33,0xD7,0x1E,0x8C,0x94,0xE0,
338	0x4A,0x25,0x61,0x9D,0xCE,0xE3,0xD2,0x26,0x1A,0xD2,0xEE,0x6B,
339	0xF1,0x2F,0xFA,0x06,0xD9,0x8A,0x08,0x64,0xD8,0x76,0x02,0x73,
340	0x3E,0xC8,0x6A,0x64,0x52,0x1F,0x2B,0x18,0x17,0x7B,0x20,0x0C,
341	0xBB,0xE1,0x17,0x57,0x7A,0x61,0x5D,0x6C,0x77,0x09,0x88,0xC0,
342	0xBA,0xD9,0x46,0xE2,0x08,0xE2,0x4F,0xA0,0x74,0xE5,0xAB,0x31,
343	0x43,0xDB,0x5B,0xFC,0xE0,0xFD,0x10,0x8E,0x4B,0x82,0xD1,0x20,
344	0xA9,0x21,0x08,0x01,0x1A,0x72,0x3C,0x12,0xA7,0x87,0xE6,0xD7,
345	0x88,0x71,0x9A,0x10,0xBD,0xBA,0x5B,0x26,0x99,0xC3,0x27,0x18,
346	0x6A,0xF4,0xE2,0x3C,0x1A,0x94,0x68,0x34,0xB6,0x15,0x0B,0xDA,
347	0x25,0x83,0xE9,0xCA,0x2A,0xD4,0x4C,0xE8,0xDB,0xBB,0xC2,0xDB,
348	0x04,0xDE,0x8E,0xF9,0x2E,0x8E,0xFC,0x14,0x1F,0xBE,0xCA,0xA6,
349	0x28,0x7C,0x59,0x47,0x4E,0x6B,0xC0,0x5D,0x99,0xB2,0x96,0x4F,
350	0xA0,0x90,0xC3,0xA2,0x23,0x3B,0xA1,0x86,0x51,0x5B,0xE7,0xED,
351	0x1F,0x61,0x29,0x70,0xCE,0xE2,0xD7,0xAF,0xB8,0x1B,0xDD,0x76,
352	0x21,0x70,0x48,0x1C,0xD0,0x06,0x91,0x27,0xD5,0xB0,0x5A,0xA9,
353	0x93,0xB4,0xEA,0x98,0x8D,0x8F,0xDD,0xC1,0x86,0xFF,0xB7,0xDC,
354	0x90,0xA6,0xC0,0x8F,0x4D,0xF4,0x35,0xC9,0x34,0x02,0x84,0x92,
355	0x36,0xC3,0xFA,0xB4,0xD2,0x7C,0x70,0x26,0xC1,0xD4,0xDC,0xB2,
356	0x60,0x26,0x46,0xDE,0xC9,0x75,0x1E,0x76,0x3D,0xBA,0x37,0xBD,
357	0xF8,0xFF,0x94,0x06,0xAD,0x9E,0x53,0x0E,0xE5,0xDB,0x38,0x2F,
358	0x41,0x30,0x01,0xAE,0xB0,0x6A,0x53,0xED,0x90,0x27,0xD8,0x31,
359	0x17,0x97,0x27,0xB0,0x86,0x5A,0x89,0x18,0xDA,0x3E,0xDB,0xEB,
360	0xCF,0x9B,0x14,0xED,0x44,0xCE,0x6C,0xBA,0xCE,0xD4,0xBB,0x1B,
361	0xDB,0x7F,0x14,0x47,0xE6,0xCC,0x25,0x4B,0x33,0x20,0x51,0x51,
362	0x2B,0xD7,0xAF,0x42,0x6F,0xB8,0xF4,0x01,0x37,0x8C,0xD2,0xBF,
363	0x59,0x83,0xCA,0x01,0xC6,0x4B,0x92,0xEC,0xF0,0x32,0xEA,0x15,
364	0xD1,0x72,0x1D,0x03,0xF4,0x82,0xD7,0xCE,0x6E,0x74,0xFE,0xF6,
365	0xD5,0x5E,0x70,0x2F,0x46,0x98,0x0C,0x82,0xB5,0xA8,0x40,0x31,
366	0x90,0x0B,0x1C,0x9E,0x59,0xE7,0xC9,0x7F,0xBE,0xC7,0xE8,0xF3,
367	0x23,0xA9,0x7A,0x7E,0x36,0xCC,0x88,0xBE,0x0F,0x1D,0x45,0xB7,
368	0xFF,0x58,0x5A,0xC5,0x4B,0xD4,0x07,0xB2,0x2B,0x41,0x54,0xAA,
369	0xCC,0x8F,0x6D,0x7E,0xBF,0x48,0xE1,0xD8,0x14,0xCC,0x5E,0xD2,
370	0x0F,0x80,0x37,0xE0,0xA7,0x97,0x15,0xEE,0xF2,0x9B,0xE3,0x28,
371	0x06,0xA1,0xD5,0x8B,0xB7,0xC5,0xDA,0x76,0xF5,0x50,0xAA,0x3D,
372	0x8A,0x1F,0xBF,0xF0,0xEB,0x19,0xCC,0xB1,0xA3,0x13,0xD5,0x5C,
373	0xDA,0x56,0xC9,0xEC,0x2E,0xF2,0x96,0x32,0x38,0x7F,0xE8,0xD7,
374	0x6E,0x3C,0x04,0x68,0x04,0x3E,0x8F,0x66,0x3F,0x48,0x60,0xEE,
375	0x12,0xBF,0x2D,0x5B,0x0B,0x74,0x74,0xD6,0xE6,0x94,0xF9,0x1E,
376	0x6D,0xBE,0x11,0x59,0x74,0xA3,0x92,0x6F,0x12,0xFE,0xE5,0xE4,
377	0x38,0x77,0x7C,0xB6,0xA9,0x32,0xDF,0x8C,0xD8,0xBE,0xC4,0xD0,
378	0x73,0xB9,0x31,0xBA,0x3B,0xC8,0x32,0xB6,0x8D,0x9D,0xD3,0x00,
379	0x74,0x1F,0xA7,0xBF,0x8A,0xFC,0x47,0xED,0x25,0x76,0xF6,0x93,
380	0x6B,0xA4,0x24,0x66,0x3A,0xAB,0x63,0x9C,0x5A,0xE4,0xF5,0x68,
381	0x34,0x23,0xB4,0x74,0x2B,0xF1,0xC9,0x78,0x23,0x8F,0x16,0xCB,
382	0xE3,0x9D,0x65,0x2D,0xE3,0xFD,0xB8,0xBE,0xFC,0x84,0x8A,0xD9,
383	0x22,0x22,0x2E,0x04,0xA4,0x03,0x7C,0x07,0x13,0xEB,0x57,0xA8,
384	0x1A,0x23,0xF0,0xC7,0x34,0x73,0xFC,0x64,0x6C,0xEA,0x30,0x6B,
385	0x4B,0xCB,0xC8,0x86,0x2F,0x83,0x85,0xDD,0xFA,0x9D,0x4B,0x7F,
386	0xA2,0xC0,0x87,0xE8,0x79,0x68,0x33,0x03,0xED,0x5B,0xDD,0x3A,
387	0x06,0x2B,0x3C,0xF5,0xB3,0xA2,0x78,0xA6,0x6D,0x2A,0x13,0xF8,
388	0x3F,0x44,0xF8,0x2D,0xDF,0x31,0x0E,0xE0,0x74,0xAB,0x6A,0x36,
389	0x45,0x97,0xE8,0x99,0xA0,0x25,0x5D,0xC1,0x64,0xF3,0x1C,0xC5,
390	0x08,0x46,0x85,0x1D,0xF9,0xAB,0x48,0x19,0x5D,0xED,0x7E,0xA1,
391	0xB1,0xD5,0x10,0xBD,0x7E,0xE7,0x4D,0x73,0xFA,0xF3,0x6B,0xC3,
392	0x1E,0xCF,0xA2,0x68,0x35,0x90,0x46,0xF4,0xEB,0x87,0x9F,0x92,
393	0x40,0x09,0x43,0x8B,0x48,0x1C,0x6C,0xD7,0x88,0x9A,0x00,0x2E,
394	0xD5,0xEE,0x38,0x2B,0xC9,0x19,0x0D,0xA6,0xFC,0x02,0x6E,0x47,
395	0x95,0x58,0xE4,0x47,0x56,0x77,0xE9,0xAA,0x9E,0x30,0x50,0xE2,
396	0x76,0x56,0x94,0xDF,0xC8,0x1F,0x56,0xE8,0x80,0xB9,0x6E,0x71,
397	0x60,0xC9,0x80,0xDD,0x98,0xED,0xD3,0xDF,0xFF,0xFF,0xFF,0xFF,
398	0xFF,0xFF,0xFF,0xFF,
399	};
	/* Decode into 'bn' via BN_bin2bn; returns the resulting BIGNUM
	 * or NULL on failure. */
400	return BN_bin2bn(RFC3526_PRIME_8192,sizeof(RFC3526_PRIME_8192),bn);
401	}
402
diff --git a/src/lib/libcrypto/bn/bn_ctx.c b/src/lib/libcrypto/bn/bn_ctx.c
deleted file mode 100644
index 3f2256f675..0000000000
--- a/src/lib/libcrypto/bn/bn_ctx.c
+++ /dev/null
@@ -1,454 +0,0 @@
1/* crypto/bn/bn_ctx.c */
2/* Written by Ulf Moeller for the OpenSSL project. */
3/* ====================================================================
4 * Copyright (c) 1998-2004 The OpenSSL Project. All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * 3. All advertising materials mentioning features or use of this
19 * software must display the following acknowledgment:
20 * "This product includes software developed by the OpenSSL Project
21 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
22 *
23 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
24 * endorse or promote products derived from this software without
25 * prior written permission. For written permission, please contact
26 * openssl-core@openssl.org.
27 *
28 * 5. Products derived from this software may not be called "OpenSSL"
29 * nor may "OpenSSL" appear in their names without prior written
30 * permission of the OpenSSL Project.
31 *
32 * 6. Redistributions of any form whatsoever must retain the following
33 * acknowledgment:
34 * "This product includes software developed by the OpenSSL Project
35 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
38 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
40 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
41 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
43 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
44 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
45 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
46 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
47 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
48 * OF THE POSSIBILITY OF SUCH DAMAGE.
49 * ====================================================================
50 *
51 * This product includes cryptographic software written by Eric Young
52 * (eay@cryptsoft.com). This product includes software written by Tim
53 * Hudson (tjh@cryptsoft.com).
54 *
55 */
56
57#if !defined(BN_CTX_DEBUG) && !defined(BN_DEBUG)
58#ifndef NDEBUG
59#define NDEBUG
60#endif
61#endif
62
63#include <stdio.h>
64#include <assert.h>
65
66#include "cryptlib.h"
67#include "bn_lcl.h"
68
69/* TODO list
70 *
71 * 1. Check a bunch of "(words+1)" type hacks in various bignum functions and
72 * check they can be safely removed.
73 * - Check +1 and other ugliness in BN_from_montgomery()
74 *
75 * 2. Consider allowing a BN_new_ex() that, at least, lets you specify an
76 * appropriate 'block' size that will be honoured by bn_expand_internal() to
77 * prevent piddly little reallocations. OTOH, profiling bignum expansions in
78 * BN_CTX doesn't show this to be a big issue.
79 */
80
81/* How many bignums are in each "pool item"; */
82#define BN_CTX_POOL_SIZE 16
83/* The stack frame info is resizing, set a first-time expansion size; */
84#define BN_CTX_START_FRAMES 32
85
86/***********/
87/* BN_POOL */
88/***********/
89
90/* A bundle of bignums that can be linked with other bundles */
91typedef struct bignum_pool_item
92 {
93 /* The bignum values */
94 BIGNUM vals[BN_CTX_POOL_SIZE];
95 /* Linked-list admin */
96 struct bignum_pool_item *prev, *next;
97 } BN_POOL_ITEM;
98/* A linked-list of bignums grouped in bundles */
99typedef struct bignum_pool
100 {
101 /* Linked-list admin */
102 BN_POOL_ITEM *head, *current, *tail;
103 /* Stack depth and allocation size */
104 unsigned used, size;
105 } BN_POOL;
106static void BN_POOL_init(BN_POOL *);
107static void BN_POOL_finish(BN_POOL *);
108#ifndef OPENSSL_NO_DEPRECATED
109static void BN_POOL_reset(BN_POOL *);
110#endif
111static BIGNUM * BN_POOL_get(BN_POOL *);
112static void BN_POOL_release(BN_POOL *, unsigned int);
113
114/************/
115/* BN_STACK */
116/************/
117
118/* A wrapper to manage the "stack frames" */
119typedef struct bignum_ctx_stack
120 {
121 /* Array of indexes into the bignum stack */
122 unsigned int *indexes;
123 /* Number of stack frames, and the size of the allocated array */
124 unsigned int depth, size;
125 } BN_STACK;
126static void BN_STACK_init(BN_STACK *);
127static void BN_STACK_finish(BN_STACK *);
128#ifndef OPENSSL_NO_DEPRECATED
129static void BN_STACK_reset(BN_STACK *);
130#endif
131static int BN_STACK_push(BN_STACK *, unsigned int);
132static unsigned int BN_STACK_pop(BN_STACK *);
133
134/**********/
135/* BN_CTX */
136/**********/
137
138/* The opaque BN_CTX type */
139struct bignum_ctx
140 {
141 /* The bignum bundles */
142 BN_POOL pool;
143 /* The "stack frames", if you will */
144 BN_STACK stack;
145 /* The number of bignums currently assigned */
146 unsigned int used;
147 /* Depth of stack overflow */
148 int err_stack;
149 /* Block "gets" until an "end" (compatibility behaviour) */
150 int too_many;
151 };
152
153/* Enable this to find BN_CTX bugs */
154#ifdef BN_CTX_DEBUG
155static const char *ctxdbg_cur = NULL;
156static void ctxdbg(BN_CTX *ctx)
	/* Debug-only (BN_CTX_DEBUG) dump of a context: first line prints
	 * the dmax of every bignum currently handed out, second line marks
	 * the recorded stack-frame boundaries with "^^^". */
157	{
158	unsigned int bnidx = 0, fpidx = 0;
159	BN_POOL_ITEM *item = ctx->pool.head;
160	BN_STACK *stack = &ctx->stack;
161	fprintf(stderr,"(%08x): ", (unsigned int)ctx);
162	while(bnidx < ctx->used)
163		{
164		fprintf(stderr,"%03x ", item->vals[bnidx++ % BN_CTX_POOL_SIZE].dmax);
		/* Advance to the next pool item at every item boundary. */
165		if(!(bnidx % BN_CTX_POOL_SIZE))
166			item = item->next;
167		}
168	fprintf(stderr,"\n");
169	bnidx = 0;
170	fprintf(stderr," : ");
171	while(fpidx < stack->depth)
172		{
		/* Pad with spaces until the next frame index, then mark it. */
173		while(bnidx++ < stack->indexes[fpidx])
174			fprintf(stderr," ");
175		fprintf(stderr,"^^^ ");
176		bnidx++;
177		fpidx++;
178		}
179	fprintf(stderr,"\n");
180	}
181#define CTXDBG_ENTRY(str, ctx) do { \
182 ctxdbg_cur = (str); \
183 fprintf(stderr,"Starting %s\n", ctxdbg_cur); \
184 ctxdbg(ctx); \
185 } while(0)
186#define CTXDBG_EXIT(ctx) do { \
187 fprintf(stderr,"Ending %s\n", ctxdbg_cur); \
188 ctxdbg(ctx); \
189 } while(0)
190#define CTXDBG_RET(ctx,ret)
191#else
192#define CTXDBG_ENTRY(str, ctx)
193#define CTXDBG_EXIT(ctx)
194#define CTXDBG_RET(ctx,ret)
195#endif
196
197/* This function is an evil legacy and should not be used. This implementation
198 * is WYSIWYG, though I've done my best. */
199#ifndef OPENSSL_NO_DEPRECATED
200void BN_CTX_init(BN_CTX *ctx)
201 {
202 /* Assume the caller obtained the context via BN_CTX_new() and so is
203 * trying to reset it for use. Nothing else makes sense, least of all
204 * binary compatibility from a time when they could declare a static
205 * variable. */
206 BN_POOL_reset(&ctx->pool);
207 BN_STACK_reset(&ctx->stack);
208 ctx->used = 0;
209 ctx->err_stack = 0;
210 ctx->too_many = 0;
211 }
212#endif
213
214BN_CTX *BN_CTX_new(void)
215 {
216 BN_CTX *ret = OPENSSL_malloc(sizeof(BN_CTX));
217 if(!ret)
218 {
219 BNerr(BN_F_BN_CTX_NEW,ERR_R_MALLOC_FAILURE);
220 return NULL;
221 }
222 /* Initialise the structure */
223 BN_POOL_init(&ret->pool);
224 BN_STACK_init(&ret->stack);
225 ret->used = 0;
226 ret->err_stack = 0;
227 ret->too_many = 0;
228 return ret;
229 }
230
231void BN_CTX_free(BN_CTX *ctx)
232 {
233 if (ctx == NULL)
234 return;
235#ifdef BN_CTX_DEBUG
236 {
237 BN_POOL_ITEM *pool = ctx->pool.head;
238 fprintf(stderr,"BN_CTX_free, stack-size=%d, pool-bignums=%d\n",
239 ctx->stack.size, ctx->pool.size);
240 fprintf(stderr,"dmaxs: ");
241 while(pool) {
242 unsigned loop = 0;
243 while(loop < BN_CTX_POOL_SIZE)
244 fprintf(stderr,"%02x ", pool->vals[loop++].dmax);
245 pool = pool->next;
246 }
247 fprintf(stderr,"\n");
248 }
249#endif
250 BN_STACK_finish(&ctx->stack);
251 BN_POOL_finish(&ctx->pool);
252 OPENSSL_free(ctx);
253 }
254
255void BN_CTX_start(BN_CTX *ctx)
256 {
257 CTXDBG_ENTRY("BN_CTX_start", ctx);
258 /* If we're already overflowing ... */
259 if(ctx->err_stack || ctx->too_many)
260 ctx->err_stack++;
261 /* (Try to) get a new frame pointer */
262 else if(!BN_STACK_push(&ctx->stack, ctx->used))
263 {
264 BNerr(BN_F_BN_CTX_START,BN_R_TOO_MANY_TEMPORARY_VARIABLES);
265 ctx->err_stack++;
266 }
267 CTXDBG_EXIT(ctx);
268 }
269
270void BN_CTX_end(BN_CTX *ctx)
271 {
272 CTXDBG_ENTRY("BN_CTX_end", ctx);
273 if(ctx->err_stack)
274 ctx->err_stack--;
275 else
276 {
277 unsigned int fp = BN_STACK_pop(&ctx->stack);
278 /* Does this stack frame have anything to release? */
279 if(fp < ctx->used)
280 BN_POOL_release(&ctx->pool, ctx->used - fp);
281 ctx->used = fp;
282 /* Unjam "too_many" in case "get" had failed */
283 ctx->too_many = 0;
284 }
285 CTXDBG_EXIT(ctx);
286 }
287
288BIGNUM *BN_CTX_get(BN_CTX *ctx)
289 {
290 BIGNUM *ret;
291 CTXDBG_ENTRY("BN_CTX_get", ctx);
292 if(ctx->err_stack || ctx->too_many) return NULL;
293 if((ret = BN_POOL_get(&ctx->pool)) == NULL)
294 {
295 /* Setting too_many prevents repeated "get" attempts from
296 * cluttering the error stack. */
297 ctx->too_many = 1;
298 BNerr(BN_F_BN_CTX_GET,BN_R_TOO_MANY_TEMPORARY_VARIABLES);
299 return NULL;
300 }
301 /* OK, make sure the returned bignum is "zero" */
302 BN_zero(ret);
303 ctx->used++;
304 CTXDBG_RET(ctx, ret);
305 return ret;
306 }
307
308/************/
309/* BN_STACK */
310/************/
311
312static void BN_STACK_init(BN_STACK *st)
313 {
314 st->indexes = NULL;
315 st->depth = st->size = 0;
316 }
317
318static void BN_STACK_finish(BN_STACK *st)
319 {
320 if(st->size) OPENSSL_free(st->indexes);
321 }
322
323#ifndef OPENSSL_NO_DEPRECATED
324static void BN_STACK_reset(BN_STACK *st)
325 {
326 st->depth = 0;
327 }
328#endif
329
330static int BN_STACK_push(BN_STACK *st, unsigned int idx)
331 {
332 if(st->depth == st->size)
333 /* Need to expand */
334 {
335 unsigned int newsize = (st->size ?
336 (st->size * 3 / 2) : BN_CTX_START_FRAMES);
337 unsigned int *newitems = OPENSSL_malloc(newsize *
338 sizeof(unsigned int));
339 if(!newitems) return 0;
340 if(st->depth)
341 memcpy(newitems, st->indexes, st->depth *
342 sizeof(unsigned int));
343 if(st->size) OPENSSL_free(st->indexes);
344 st->indexes = newitems;
345 st->size = newsize;
346 }
347 st->indexes[(st->depth)++] = idx;
348 return 1;
349 }
350
351static unsigned int BN_STACK_pop(BN_STACK *st)
352 {
353 return st->indexes[--(st->depth)];
354 }
355
356/***********/
357/* BN_POOL */
358/***********/
359
360static void BN_POOL_init(BN_POOL *p)
361 {
362 p->head = p->current = p->tail = NULL;
363 p->used = p->size = 0;
364 }
365
366static void BN_POOL_finish(BN_POOL *p)
367 {
368 while(p->head)
369 {
370 unsigned int loop = 0;
371 BIGNUM *bn = p->head->vals;
372 while(loop++ < BN_CTX_POOL_SIZE)
373 {
374 if(bn->d) BN_clear_free(bn);
375 bn++;
376 }
377 p->current = p->head->next;
378 OPENSSL_free(p->head);
379 p->head = p->current;
380 }
381 }
382
383#ifndef OPENSSL_NO_DEPRECATED
384static void BN_POOL_reset(BN_POOL *p)
385 {
386 BN_POOL_ITEM *item = p->head;
387 while(item)
388 {
389 unsigned int loop = 0;
390 BIGNUM *bn = item->vals;
391 while(loop++ < BN_CTX_POOL_SIZE)
392 {
393 if(bn->d) BN_clear(bn);
394 bn++;
395 }
396 item = item->next;
397 }
398 p->current = p->head;
399 p->used = 0;
400 }
401#endif
402
403static BIGNUM *BN_POOL_get(BN_POOL *p)
404 {
405 if(p->used == p->size)
406 {
407 BIGNUM *bn;
408 unsigned int loop = 0;
409 BN_POOL_ITEM *item = OPENSSL_malloc(sizeof(BN_POOL_ITEM));
410 if(!item) return NULL;
411 /* Initialise the structure */
412 bn = item->vals;
413 while(loop++ < BN_CTX_POOL_SIZE)
414 BN_init(bn++);
415 item->prev = p->tail;
416 item->next = NULL;
417 /* Link it in */
418 if(!p->head)
419 p->head = p->current = p->tail = item;
420 else
421 {
422 p->tail->next = item;
423 p->tail = item;
424 p->current = item;
425 }
426 p->size += BN_CTX_POOL_SIZE;
427 p->used++;
428 /* Return the first bignum from the new pool */
429 return item->vals;
430 }
431 if(!p->used)
432 p->current = p->head;
433 else if((p->used % BN_CTX_POOL_SIZE) == 0)
434 p->current = p->current->next;
435 return p->current->vals + ((p->used++) % BN_CTX_POOL_SIZE);
436 }
437
438static void BN_POOL_release(BN_POOL *p, unsigned int num)
439 {
440 unsigned int offset = (p->used - 1) % BN_CTX_POOL_SIZE;
441 p->used -= num;
442 while(num--)
443 {
444 bn_check_top(p->current->vals + offset);
445 if(!offset)
446 {
447 offset = BN_CTX_POOL_SIZE - 1;
448 p->current = p->current->prev;
449 }
450 else
451 offset--;
452 }
453 }
454
diff --git a/src/lib/libcrypto/bn/bn_depr.c b/src/lib/libcrypto/bn/bn_depr.c
deleted file mode 100644
index 27535e4fca..0000000000
--- a/src/lib/libcrypto/bn/bn_depr.c
+++ /dev/null
@@ -1,112 +0,0 @@
1/* crypto/bn/bn_depr.c */
2/* ====================================================================
3 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 * This product includes cryptographic software written by Eric Young
51 * (eay@cryptsoft.com). This product includes software written by Tim
52 * Hudson (tjh@cryptsoft.com).
53 *
54 */
55
56/* Support for deprecated functions goes here - static linkage will only slurp
57 * this code if applications are using them directly. */
58
59#include <stdio.h>
60#include <time.h>
61#include "cryptlib.h"
62#include "bn_lcl.h"
63#include <openssl/rand.h>
64
65static void *dummy=&dummy;
66
67#ifndef OPENSSL_NO_DEPRECATED
68BIGNUM *BN_generate_prime(BIGNUM *ret, int bits, int safe,
69 const BIGNUM *add, const BIGNUM *rem,
70 void (*callback)(int,int,void *), void *cb_arg)
71 {
72 BN_GENCB cb;
73 BIGNUM *rnd=NULL;
74 int found = 0;
75
76 BN_GENCB_set_old(&cb, callback, cb_arg);
77
78 if (ret == NULL)
79 {
80 if ((rnd=BN_new()) == NULL) goto err;
81 }
82 else
83 rnd=ret;
84 if(!BN_generate_prime_ex(rnd, bits, safe, add, rem, &cb))
85 goto err;
86
87 /* we have a prime :-) */
88 found = 1;
89err:
90 if (!found && (ret == NULL) && (rnd != NULL)) BN_free(rnd);
91 return(found ? rnd : NULL);
92 }
93
94int BN_is_prime(const BIGNUM *a, int checks, void (*callback)(int,int,void *),
95 BN_CTX *ctx_passed, void *cb_arg)
96 {
97 BN_GENCB cb;
98 BN_GENCB_set_old(&cb, callback, cb_arg);
99 return BN_is_prime_ex(a, checks, ctx_passed, &cb);
100 }
101
102int BN_is_prime_fasttest(const BIGNUM *a, int checks,
103 void (*callback)(int,int,void *),
104 BN_CTX *ctx_passed, void *cb_arg,
105 int do_trial_division)
106 {
107 BN_GENCB cb;
108 BN_GENCB_set_old(&cb, callback, cb_arg);
109 return BN_is_prime_fasttest_ex(a, checks, ctx_passed,
110 do_trial_division, &cb);
111 }
112#endif
diff --git a/src/lib/libcrypto/bn/bn_div.c b/src/lib/libcrypto/bn/bn_div.c
deleted file mode 100644
index 52b3304293..0000000000
--- a/src/lib/libcrypto/bn/bn_div.c
+++ /dev/null
@@ -1,446 +0,0 @@
1/* crypto/bn/bn_div.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60#include <openssl/bn.h>
61#include "cryptlib.h"
62#include "bn_lcl.h"
63
64
65/* The old slow way */
66#if 0
/* Disabled reference implementation (this function sits inside "#if 0"):
 * schoolbook binary long division producing one quotient bit per loop
 * iteration. Superseded by the word-at-a-time BN_div() in the "#else"
 * branch; kept only as documentation of the algorithm. */
int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
	   BN_CTX *ctx)
	{
	int i,nm,nd;
	int ret = 0;
	BIGNUM *D;

	bn_check_top(m);
	bn_check_top(d);
	if (BN_is_zero(d))
		{
		BNerr(BN_F_BN_DIV,BN_R_DIV_BY_ZERO);
		return(0);
		}

	/* m < d: quotient 0, remainder m. */
	if (BN_ucmp(m,d) < 0)
		{
		if (rem != NULL)
			{ if (BN_copy(rem,m) == NULL) return(0); }
		if (dv != NULL) BN_zero(dv);
		return(1);
		}

	BN_CTX_start(ctx);
	D = BN_CTX_get(ctx);
	if (dv == NULL) dv = BN_CTX_get(ctx);
	if (rem == NULL) rem = BN_CTX_get(ctx);
	if (D == NULL || dv == NULL || rem == NULL)
		goto end;

	nd=BN_num_bits(d);
	nm=BN_num_bits(m);
	if (BN_copy(D,d) == NULL) goto end;
	if (BN_copy(rem,m) == NULL) goto end;

	/* The next 2 are needed so we can do a dv->d[0]|=1 later
	 * since BN_lshift1 will only work once there is a value :-) */
	BN_zero(dv);
	if(bn_wexpand(dv,1) == NULL) goto end;
	dv->top=1;

	/* Align D's top bit with m's, then shift D back down one bit per
	 * iteration, setting a quotient bit whenever rem >= D. */
	if (!BN_lshift(D,D,nm-nd)) goto end;
	for (i=nm-nd; i>=0; i--)
		{
		if (!BN_lshift1(dv,dv)) goto end;
		if (BN_ucmp(rem,D) >= 0)
			{
			dv->d[0]|=1;
			if (!BN_usub(rem,rem,D)) goto end;
			}
/* CAN IMPROVE (and have now :=) */
		if (!BN_rshift1(D,D)) goto end;
		}
	/* Sign fix-up: remainder takes m's sign (unless zero), quotient
	 * takes the XOR of the operand signs. */
	rem->neg=BN_is_zero(rem)?0:m->neg;
	dv->neg=m->neg^d->neg;
	ret = 1;
end:
	BN_CTX_end(ctx);
	return(ret);
	}
127
128#else
129
#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) \
    && !defined(PEDANTIC) && !defined(BN_DIV3W)
# if defined(__GNUC__) && __GNUC__>=2
#  if defined(__i386) || defined (__i386__)
    /*
     * There were two reasons for implementing this template:
     * - GNU C generates a call to a function (__udivdi3 to be exact)
     *   in reply to ((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0 (I fail to
     *   understand why...);
     * - divl doesn't only calculate quotient, but also leaves
     *   remainder in %edx which we can definitely use here:-)
     *
     *					<appro@fy.chalmers.se>
     */
/* NOTE(review): this macro is not self-contained — the asm outputs are
 * bound to the caller's locals 'q' and 'rem' ("=a"(q), "=d"(rem)), so
 * callers must declare both; BN_div() below does. The 32-bit form does
 * a 64/32-bit divide via "divl". */
#  define bn_div_words(n0,n1,d0)		\
	({  asm volatile (			\
		"divl	%4"			\
		: "=a"(q), "=d"(rem)		\
		: "a"(n1), "d"(n0), "g"(d0)	\
		: "cc");			\
	    q;					\
	})
#  define REMAINDER_IS_ALREADY_CALCULATED
#  elif defined(__x86_64) && defined(SIXTY_FOUR_BIT_LONG)
    /*
     * Same story here, but it's 128-bit by 64-bit division. Wow!
     *					<appro@fy.chalmers.se>
     */
/* Same caller-locals contract as the 32-bit variant above. */
#  define bn_div_words(n0,n1,d0)		\
	({  asm volatile (			\
		"divq	%4"			\
		: "=a"(q), "=d"(rem)		\
		: "a"(n1), "d"(n0), "g"(d0)	\
		: "cc");			\
	    q;					\
	})
#  define REMAINDER_IS_ALREADY_CALCULATED
#  endif /* __<cpu> */
# endif /* __GNUC__ */
#endif /* OPENSSL_NO_ASM */
170
171
172/* BN_div computes dv := num / divisor, rounding towards
173 * zero, and sets up rm such that dv*divisor + rm = num holds.
174 * Thus:
175 * dv->neg == num->neg ^ divisor->neg (unless the result is zero)
176 * rm->neg == num->neg (unless the remainder is zero)
177 * If 'dv' or 'rm' is NULL, the respective value is not returned.
178 */
179int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
180 BN_CTX *ctx)
181 {
182 int norm_shift,i,loop;
183 BIGNUM *tmp,wnum,*snum,*sdiv,*res;
184 BN_ULONG *resp,*wnump;
185 BN_ULONG d0,d1;
186 int num_n,div_n;
187 int no_branch=0;
188
189 /* Invalid zero-padding would have particularly bad consequences
190 * in the case of 'num', so don't just rely on bn_check_top() for this one
191 * (bn_check_top() works only for BN_DEBUG builds) */
192 if (num->top > 0 && num->d[num->top - 1] == 0)
193 {
194 BNerr(BN_F_BN_DIV,BN_R_NOT_INITIALIZED);
195 return 0;
196 }
197
198 bn_check_top(num);
199
200 if ((BN_get_flags(num, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(divisor, BN_FLG_CONSTTIME) != 0))
201 {
202 no_branch=1;
203 }
204
205 bn_check_top(dv);
206 bn_check_top(rm);
207 /* bn_check_top(num); */ /* 'num' has been checked already */
208 bn_check_top(divisor);
209
210 if (BN_is_zero(divisor))
211 {
212 BNerr(BN_F_BN_DIV,BN_R_DIV_BY_ZERO);
213 return(0);
214 }
215
216 if (!no_branch && BN_ucmp(num,divisor) < 0)
217 {
218 if (rm != NULL)
219 { if (BN_copy(rm,num) == NULL) return(0); }
220 if (dv != NULL) BN_zero(dv);
221 return(1);
222 }
223
224 BN_CTX_start(ctx);
225 tmp=BN_CTX_get(ctx);
226 snum=BN_CTX_get(ctx);
227 sdiv=BN_CTX_get(ctx);
228 if (dv == NULL)
229 res=BN_CTX_get(ctx);
230 else res=dv;
231 if (sdiv == NULL || res == NULL || tmp == NULL || snum == NULL)
232 goto err;
233
234 /* First we normalise the numbers */
235 norm_shift=BN_BITS2-((BN_num_bits(divisor))%BN_BITS2);
236 if (!(BN_lshift(sdiv,divisor,norm_shift))) goto err;
237 sdiv->neg=0;
238 norm_shift+=BN_BITS2;
239 if (!(BN_lshift(snum,num,norm_shift))) goto err;
240 snum->neg=0;
241
242 if (no_branch)
243 {
244 /* Since we don't know whether snum is larger than sdiv,
245 * we pad snum with enough zeroes without changing its
246 * value.
247 */
248 if (snum->top <= sdiv->top+1)
249 {
250 if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err;
251 for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0;
252 snum->top = sdiv->top + 2;
253 }
254 else
255 {
256 if (bn_wexpand(snum, snum->top + 1) == NULL) goto err;
257 snum->d[snum->top] = 0;
258 snum->top ++;
259 }
260 }
261
262 div_n=sdiv->top;
263 num_n=snum->top;
264 loop=num_n-div_n;
265 /* Lets setup a 'window' into snum
266 * This is the part that corresponds to the current
267 * 'area' being divided */
268 wnum.neg = 0;
269 wnum.d = &(snum->d[loop]);
270 wnum.top = div_n;
271 /* only needed when BN_ucmp messes up the values between top and max */
272 wnum.dmax = snum->dmax - loop; /* so we don't step out of bounds */
273
274 /* Get the top 2 words of sdiv */
275 /* div_n=sdiv->top; */
276 d0=sdiv->d[div_n-1];
277 d1=(div_n == 1)?0:sdiv->d[div_n-2];
278
279 /* pointer to the 'top' of snum */
280 wnump= &(snum->d[num_n-1]);
281
282 /* Setup to 'res' */
283 res->neg= (num->neg^divisor->neg);
284 if (!bn_wexpand(res,(loop+1))) goto err;
285 res->top=loop-no_branch;
286 resp= &(res->d[loop-1]);
287
288 /* space for temp */
289 if (!bn_wexpand(tmp,(div_n+1))) goto err;
290
291 if (!no_branch)
292 {
293 if (BN_ucmp(&wnum,sdiv) >= 0)
294 {
295 /* If BN_DEBUG_RAND is defined BN_ucmp changes (via
296 * bn_pollute) the const bignum arguments =>
297 * clean the values between top and max again */
298 bn_clear_top2max(&wnum);
299 bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n);
300 *resp=1;
301 }
302 else
303 res->top--;
304 }
305
306 /* if res->top == 0 then clear the neg value otherwise decrease
307 * the resp pointer */
308 if (res->top == 0)
309 res->neg = 0;
310 else
311 resp--;
312
313 for (i=0; i<loop-1; i++, wnump--, resp--)
314 {
315 BN_ULONG q,l0;
316 /* the first part of the loop uses the top two words of
317 * snum and sdiv to calculate a BN_ULONG q such that
318 * | wnum - sdiv * q | < sdiv */
319#if defined(BN_DIV3W) && !defined(OPENSSL_NO_ASM)
320 BN_ULONG bn_div_3_words(BN_ULONG*,BN_ULONG,BN_ULONG);
321 q=bn_div_3_words(wnump,d1,d0);
322#else
323 BN_ULONG n0,n1,rem=0;
324
325 n0=wnump[0];
326 n1=wnump[-1];
327 if (n0 == d0)
328 q=BN_MASK2;
329 else /* n0 < d0 */
330 {
331#ifdef BN_LLONG
332 BN_ULLONG t2;
333
334#if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(bn_div_words)
335 q=(BN_ULONG)(((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0);
336#else
337 q=bn_div_words(n0,n1,d0);
338#ifdef BN_DEBUG_LEVITTE
339 fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
340X) -> 0x%08X\n",
341 n0, n1, d0, q);
342#endif
343#endif
344
345#ifndef REMAINDER_IS_ALREADY_CALCULATED
346 /*
347 * rem doesn't have to be BN_ULLONG. The least we
348 * know it's less that d0, isn't it?
349 */
350 rem=(n1-q*d0)&BN_MASK2;
351#endif
352 t2=(BN_ULLONG)d1*q;
353
354 for (;;)
355 {
356 if (t2 <= ((((BN_ULLONG)rem)<<BN_BITS2)|wnump[-2]))
357 break;
358 q--;
359 rem += d0;
360 if (rem < d0) break; /* don't let rem overflow */
361 t2 -= d1;
362 }
363#else /* !BN_LLONG */
364 BN_ULONG t2l,t2h;
365
366 q=bn_div_words(n0,n1,d0);
367#ifdef BN_DEBUG_LEVITTE
368 fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
369X) -> 0x%08X\n",
370 n0, n1, d0, q);
371#endif
372#ifndef REMAINDER_IS_ALREADY_CALCULATED
373 rem=(n1-q*d0)&BN_MASK2;
374#endif
375
376#if defined(BN_UMULT_LOHI)
377 BN_UMULT_LOHI(t2l,t2h,d1,q);
378#elif defined(BN_UMULT_HIGH)
379 t2l = d1 * q;
380 t2h = BN_UMULT_HIGH(d1,q);
381#else
382 {
383 BN_ULONG ql, qh;
384 t2l=LBITS(d1); t2h=HBITS(d1);
385 ql =LBITS(q); qh =HBITS(q);
386 mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
387 }
388#endif
389
390 for (;;)
391 {
392 if ((t2h < rem) ||
393 ((t2h == rem) && (t2l <= wnump[-2])))
394 break;
395 q--;
396 rem += d0;
397 if (rem < d0) break; /* don't let rem overflow */
398 if (t2l < d1) t2h--; t2l -= d1;
399 }
400#endif /* !BN_LLONG */
401 }
402#endif /* !BN_DIV3W */
403
404 l0=bn_mul_words(tmp->d,sdiv->d,div_n,q);
405 tmp->d[div_n]=l0;
406 wnum.d--;
407 /* ingore top values of the bignums just sub the two
408 * BN_ULONG arrays with bn_sub_words */
409 if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n+1))
410 {
411 /* Note: As we have considered only the leading
412 * two BN_ULONGs in the calculation of q, sdiv * q
413 * might be greater than wnum (but then (q-1) * sdiv
414 * is less or equal than wnum)
415 */
416 q--;
417 if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n))
418 /* we can't have an overflow here (assuming
419 * that q != 0, but if q == 0 then tmp is
420 * zero anyway) */
421 (*wnump)++;
422 }
423 /* store part of the result */
424 *resp = q;
425 }
426 bn_correct_top(snum);
427 if (rm != NULL)
428 {
429 /* Keep a copy of the neg flag in num because if rm==num
430 * BN_rshift() will overwrite it.
431 */
432 int neg = num->neg;
433 BN_rshift(rm,snum,norm_shift);
434 if (!BN_is_zero(rm))
435 rm->neg = neg;
436 bn_check_top(rm);
437 }
438 if (no_branch) bn_correct_top(res);
439 BN_CTX_end(ctx);
440 return(1);
441err:
442 bn_check_top(rm);
443 BN_CTX_end(ctx);
444 return(0);
445 }
446#endif
diff --git a/src/lib/libcrypto/bn/bn_err.c b/src/lib/libcrypto/bn/bn_err.c
deleted file mode 100644
index cfe2eb94a0..0000000000
--- a/src/lib/libcrypto/bn/bn_err.c
+++ /dev/null
@@ -1,150 +0,0 @@
1/* crypto/bn/bn_err.c */
2/* ====================================================================
3 * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@OpenSSL.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 * This product includes cryptographic software written by Eric Young
51 * (eay@cryptsoft.com). This product includes software written by Tim
52 * Hudson (tjh@cryptsoft.com).
53 *
54 */
55
56/* NOTE: this file was auto generated by the mkerr.pl script: any changes
57 * made to it will be overwritten when the script next updates this file,
58 * only reason strings will be preserved.
59 */
60
61#include <stdio.h>
62#include <openssl/err.h>
63#include <openssl/bn.h>
64
65/* BEGIN ERROR CODES */
66#ifndef OPENSSL_NO_ERR
67
/* Pack a BN function code / reason code into an ERR library error code. */
#define ERR_FUNC(func) ERR_PACK(ERR_LIB_BN,func,0)
#define ERR_REASON(reason) ERR_PACK(ERR_LIB_BN,0,reason)

/* BN_F_* function codes -> human-readable function names.
 * Auto-generated by mkerr.pl (see the note at the top of this file);
 * terminated by a {0,NULL} sentinel. */
static ERR_STRING_DATA BN_str_functs[]=
	{
{ERR_FUNC(BN_F_BNRAND),	"BNRAND"},
{ERR_FUNC(BN_F_BN_BLINDING_CONVERT_EX),	"BN_BLINDING_convert_ex"},
{ERR_FUNC(BN_F_BN_BLINDING_CREATE_PARAM),	"BN_BLINDING_create_param"},
{ERR_FUNC(BN_F_BN_BLINDING_INVERT_EX),	"BN_BLINDING_invert_ex"},
{ERR_FUNC(BN_F_BN_BLINDING_NEW),	"BN_BLINDING_new"},
{ERR_FUNC(BN_F_BN_BLINDING_UPDATE),	"BN_BLINDING_update"},
{ERR_FUNC(BN_F_BN_BN2DEC),	"BN_bn2dec"},
{ERR_FUNC(BN_F_BN_BN2HEX),	"BN_bn2hex"},
{ERR_FUNC(BN_F_BN_CTX_GET),	"BN_CTX_get"},
{ERR_FUNC(BN_F_BN_CTX_NEW),	"BN_CTX_new"},
{ERR_FUNC(BN_F_BN_CTX_START),	"BN_CTX_start"},
{ERR_FUNC(BN_F_BN_DIV),	"BN_div"},
{ERR_FUNC(BN_F_BN_DIV_NO_BRANCH),	"BN_div_no_branch"},
{ERR_FUNC(BN_F_BN_DIV_RECP),	"BN_div_recp"},
{ERR_FUNC(BN_F_BN_EXP),	"BN_exp"},
{ERR_FUNC(BN_F_BN_EXPAND2),	"bn_expand2"},
{ERR_FUNC(BN_F_BN_EXPAND_INTERNAL),	"BN_EXPAND_INTERNAL"},
{ERR_FUNC(BN_F_BN_GF2M_MOD),	"BN_GF2m_mod"},
{ERR_FUNC(BN_F_BN_GF2M_MOD_EXP),	"BN_GF2m_mod_exp"},
{ERR_FUNC(BN_F_BN_GF2M_MOD_MUL),	"BN_GF2m_mod_mul"},
{ERR_FUNC(BN_F_BN_GF2M_MOD_SOLVE_QUAD),	"BN_GF2m_mod_solve_quad"},
{ERR_FUNC(BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR),	"BN_GF2m_mod_solve_quad_arr"},
{ERR_FUNC(BN_F_BN_GF2M_MOD_SQR),	"BN_GF2m_mod_sqr"},
{ERR_FUNC(BN_F_BN_GF2M_MOD_SQRT),	"BN_GF2m_mod_sqrt"},
{ERR_FUNC(BN_F_BN_MOD_EXP2_MONT),	"BN_mod_exp2_mont"},
{ERR_FUNC(BN_F_BN_MOD_EXP_MONT),	"BN_mod_exp_mont"},
{ERR_FUNC(BN_F_BN_MOD_EXP_MONT_CONSTTIME),	"BN_mod_exp_mont_consttime"},
{ERR_FUNC(BN_F_BN_MOD_EXP_MONT_WORD),	"BN_mod_exp_mont_word"},
{ERR_FUNC(BN_F_BN_MOD_EXP_RECP),	"BN_mod_exp_recp"},
{ERR_FUNC(BN_F_BN_MOD_EXP_SIMPLE),	"BN_mod_exp_simple"},
{ERR_FUNC(BN_F_BN_MOD_INVERSE),	"BN_mod_inverse"},
{ERR_FUNC(BN_F_BN_MOD_INVERSE_NO_BRANCH),	"BN_mod_inverse_no_branch"},
{ERR_FUNC(BN_F_BN_MOD_LSHIFT_QUICK),	"BN_mod_lshift_quick"},
{ERR_FUNC(BN_F_BN_MOD_MUL_RECIPROCAL),	"BN_mod_mul_reciprocal"},
{ERR_FUNC(BN_F_BN_MOD_SQRT),	"BN_mod_sqrt"},
{ERR_FUNC(BN_F_BN_MPI2BN),	"BN_mpi2bn"},
{ERR_FUNC(BN_F_BN_NEW),	"BN_new"},
{ERR_FUNC(BN_F_BN_RAND),	"BN_rand"},
{ERR_FUNC(BN_F_BN_RAND_RANGE),	"BN_rand_range"},
{ERR_FUNC(BN_F_BN_USUB),	"BN_usub"},
{0,NULL}
	};
115
/* BN_R_* reason codes -> human-readable reason strings.
 * Auto-generated by mkerr.pl (reason strings are preserved across
 * regeneration); terminated by a {0,NULL} sentinel. */
static ERR_STRING_DATA BN_str_reasons[]=
	{
{ERR_REASON(BN_R_ARG2_LT_ARG3)           ,"arg2 lt arg3"},
{ERR_REASON(BN_R_BAD_RECIPROCAL)         ,"bad reciprocal"},
{ERR_REASON(BN_R_BIGNUM_TOO_LONG)        ,"bignum too long"},
{ERR_REASON(BN_R_CALLED_WITH_EVEN_MODULUS),"called with even modulus"},
{ERR_REASON(BN_R_DIV_BY_ZERO)            ,"div by zero"},
{ERR_REASON(BN_R_ENCODING_ERROR)         ,"encoding error"},
{ERR_REASON(BN_R_EXPAND_ON_STATIC_BIGNUM_DATA),"expand on static bignum data"},
{ERR_REASON(BN_R_INPUT_NOT_REDUCED)      ,"input not reduced"},
{ERR_REASON(BN_R_INVALID_LENGTH)         ,"invalid length"},
{ERR_REASON(BN_R_INVALID_RANGE)          ,"invalid range"},
{ERR_REASON(BN_R_NOT_A_SQUARE)           ,"not a square"},
{ERR_REASON(BN_R_NOT_INITIALIZED)        ,"not initialized"},
{ERR_REASON(BN_R_NO_INVERSE)             ,"no inverse"},
{ERR_REASON(BN_R_NO_SOLUTION)            ,"no solution"},
{ERR_REASON(BN_R_P_IS_NOT_PRIME)         ,"p is not prime"},
{ERR_REASON(BN_R_TOO_MANY_ITERATIONS)    ,"too many iterations"},
{ERR_REASON(BN_R_TOO_MANY_TEMPORARY_VARIABLES),"too many temporary variables"},
{0,NULL}
	};
137
138#endif
139
/* Register the BN function/reason string tables with the ERR library.
 * The lookup guard makes the registration idempotent (skipped if the
 * first function code already resolves). Compiled to a no-op under
 * OPENSSL_NO_ERR. */
void ERR_load_BN_strings(void)
	{
#ifndef OPENSSL_NO_ERR

	if (ERR_func_error_string(BN_str_functs[0].error) == NULL)
		{
		ERR_load_strings(0,BN_str_functs);
		ERR_load_strings(0,BN_str_reasons);
		}
#endif
	}
diff --git a/src/lib/libcrypto/bn/bn_exp.c b/src/lib/libcrypto/bn/bn_exp.c
deleted file mode 100644
index 2abf6fd678..0000000000
--- a/src/lib/libcrypto/bn/bn_exp.c
+++ /dev/null
@@ -1,1097 +0,0 @@
1/* crypto/bn/bn_exp.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58/* ====================================================================
59 * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved.
60 *
61 * Redistribution and use in source and binary forms, with or without
62 * modification, are permitted provided that the following conditions
63 * are met:
64 *
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 *
68 * 2. Redistributions in binary form must reproduce the above copyright
69 * notice, this list of conditions and the following disclaimer in
70 * the documentation and/or other materials provided with the
71 * distribution.
72 *
73 * 3. All advertising materials mentioning features or use of this
74 * software must display the following acknowledgment:
75 * "This product includes software developed by the OpenSSL Project
76 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
77 *
78 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
79 * endorse or promote products derived from this software without
80 * prior written permission. For written permission, please contact
81 * openssl-core@openssl.org.
82 *
83 * 5. Products derived from this software may not be called "OpenSSL"
84 * nor may "OpenSSL" appear in their names without prior written
85 * permission of the OpenSSL Project.
86 *
87 * 6. Redistributions of any form whatsoever must retain the following
88 * acknowledgment:
89 * "This product includes software developed by the OpenSSL Project
90 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
91 *
92 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
93 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
95 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
96 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
97 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
98 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
99 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
101 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
103 * OF THE POSSIBILITY OF SUCH DAMAGE.
104 * ====================================================================
105 *
106 * This product includes cryptographic software written by Eric Young
107 * (eay@cryptsoft.com). This product includes software written by Tim
108 * Hudson (tjh@cryptsoft.com).
109 *
110 */
111
112
113#include "cryptlib.h"
114#include "bn_lcl.h"
115
116#include <stdlib.h>
117#ifdef _WIN32
118# include <malloc.h>
119# ifndef alloca
120# define alloca _alloca
121# endif
122#elif defined(__GNUC__)
123# ifndef alloca
124# define alloca(s) __builtin_alloca((s))
125# endif
126#endif
127
128/* maximum precomputation table size for *variable* sliding windows */
129#define TABLE_SIZE 32
130
131/* this one works - simple but works */
132int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
133 {
134 int i,bits,ret=0;
135 BIGNUM *v,*rr;
136
137 if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
138 {
139 /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
140 BNerr(BN_F_BN_EXP,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
141 return -1;
142 }
143
144 BN_CTX_start(ctx);
145 if ((r == a) || (r == p))
146 rr = BN_CTX_get(ctx);
147 else
148 rr = r;
149 v = BN_CTX_get(ctx);
150 if (rr == NULL || v == NULL) goto err;
151
152 if (BN_copy(v,a) == NULL) goto err;
153 bits=BN_num_bits(p);
154
155 if (BN_is_odd(p))
156 { if (BN_copy(rr,a) == NULL) goto err; }
157 else { if (!BN_one(rr)) goto err; }
158
159 for (i=1; i<bits; i++)
160 {
161 if (!BN_sqr(v,v,ctx)) goto err;
162 if (BN_is_bit_set(p,i))
163 {
164 if (!BN_mul(rr,rr,v,ctx)) goto err;
165 }
166 }
167 ret=1;
168err:
169 if (r != rr) BN_copy(r,rr);
170 BN_CTX_end(ctx);
171 bn_check_top(r);
172 return(ret);
173 }
174
175
/* BN_mod_exp() - compute r = a^p mod m.
 * Dispatch (per the compile-time switches defined below):
 *   - odd m, single-word non-negative a, no CONSTTIME flag on p:
 *     BN_mod_exp_mont_word();
 *   - odd m otherwise: BN_mod_exp_mont();
 *   - even m: BN_mod_exp_recp() (reciprocal-based remaindering).
 * Returns 1 on success, 0 on error. */
int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
	       BN_CTX *ctx)
	{
	int ret;

	bn_check_top(a);
	bn_check_top(p);
	bn_check_top(m);

	/* For even modulus  m = 2^k*m_odd,  it might make sense to compute
	 * a^p mod m_odd  and  a^p mod 2^k  separately (with Montgomery
	 * exponentiation for the odd part), using appropriate exponent
	 * reductions, and combine the results using the CRT.
	 *
	 * For now, we use Montgomery only if the modulus is odd; otherwise,
	 * exponentiation using the reciprocal-based quick remaindering
	 * algorithm is used.
	 *
	 * (Timing obtained with expspeed.c [computations  a^p mod m
	 * where  a, p, m  are of the same length: 256, 512, 1024, 2048,
	 * 4096, 8192 bits], compared to the running time of the
	 * standard algorithm:
	 *
	 *   BN_mod_exp_mont   33 .. 40 %  [AMD K6-2, Linux, debug configuration]
	 *                     55 .. 77 %  [UltraSparc processor, but
	 *                                  debug-solaris-sparcv8-gcc conf.]
	 *
	 *   BN_mod_exp_recp   50 .. 70 %  [AMD K6-2, Linux, debug configuration]
	 *                     62 .. 118 % [UltraSparc, debug-solaris-sparcv8-gcc]
	 *
	 * On the Sparc, BN_mod_exp_recp was faster than BN_mod_exp_mont
	 * at 2048 and more bits, but at 512 and 1024 bits, it was
	 * slower even than the standard algorithm!
	 *
	 * "Real" timings [linux-elf, solaris-sparcv9-gcc configurations]
	 * should be obtained when the new Montgomery reduction code
	 * has been integrated into OpenSSL.)
	 */

#define MONT_MUL_MOD
#define MONT_EXP_WORD
#define RECP_MUL_MOD

#ifdef MONT_MUL_MOD
	/* I have finally been able to take out this pre-condition of
	 * the top bit being set.  It was caused by an error in BN_div
	 * with negatives.  There was also another problem when for a^b%m
	 * a >= m.  eay 07-May-97 */
/*	if ((m->d[m->top-1]&BN_TBIT) && BN_is_odd(m)) */

	if (BN_is_odd(m))
		{
# ifdef MONT_EXP_WORD
		if (a->top == 1 && !a->neg && (BN_get_flags(p, BN_FLG_CONSTTIME) == 0))
			{
			BN_ULONG A = a->d[0];
			ret=BN_mod_exp_mont_word(r,A,p,m,ctx,NULL);
			}
		else
# endif
			ret=BN_mod_exp_mont(r,a,p,m,ctx,NULL);
		}
	else
#endif
#ifdef RECP_MUL_MOD
		{ ret=BN_mod_exp_recp(r,a,p,m,ctx); }
#else
		{ ret=BN_mod_exp_simple(r,a,p,m,ctx); }
#endif

	bn_check_top(r);
	return(ret);
	}
249
250
251int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
252 const BIGNUM *m, BN_CTX *ctx)
253 {
254 int i,j,bits,ret=0,wstart,wend,window,wvalue;
255 int start=1;
256 BIGNUM *aa;
257 /* Table of variables obtained from 'ctx' */
258 BIGNUM *val[TABLE_SIZE];
259 BN_RECP_CTX recp;
260
261 if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
262 {
263 /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
264 BNerr(BN_F_BN_MOD_EXP_RECP,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
265 return -1;
266 }
267
268 bits=BN_num_bits(p);
269
270 if (bits == 0)
271 {
272 ret = BN_one(r);
273 return ret;
274 }
275
276 BN_CTX_start(ctx);
277 aa = BN_CTX_get(ctx);
278 val[0] = BN_CTX_get(ctx);
279 if(!aa || !val[0]) goto err;
280
281 BN_RECP_CTX_init(&recp);
282 if (m->neg)
283 {
284 /* ignore sign of 'm' */
285 if (!BN_copy(aa, m)) goto err;
286 aa->neg = 0;
287 if (BN_RECP_CTX_set(&recp,aa,ctx) <= 0) goto err;
288 }
289 else
290 {
291 if (BN_RECP_CTX_set(&recp,m,ctx) <= 0) goto err;
292 }
293
294 if (!BN_nnmod(val[0],a,m,ctx)) goto err; /* 1 */
295 if (BN_is_zero(val[0]))
296 {
297 BN_zero(r);
298 ret = 1;
299 goto err;
300 }
301
302 window = BN_window_bits_for_exponent_size(bits);
303 if (window > 1)
304 {
305 if (!BN_mod_mul_reciprocal(aa,val[0],val[0],&recp,ctx))
306 goto err; /* 2 */
307 j=1<<(window-1);
308 for (i=1; i<j; i++)
309 {
310 if(((val[i] = BN_CTX_get(ctx)) == NULL) ||
311 !BN_mod_mul_reciprocal(val[i],val[i-1],
312 aa,&recp,ctx))
313 goto err;
314 }
315 }
316
317 start=1; /* This is used to avoid multiplication etc
318 * when there is only the value '1' in the
319 * buffer. */
320 wvalue=0; /* The 'value' of the window */
321 wstart=bits-1; /* The top bit of the window */
322 wend=0; /* The bottom bit of the window */
323
324 if (!BN_one(r)) goto err;
325
326 for (;;)
327 {
328 if (BN_is_bit_set(p,wstart) == 0)
329 {
330 if (!start)
331 if (!BN_mod_mul_reciprocal(r,r,r,&recp,ctx))
332 goto err;
333 if (wstart == 0) break;
334 wstart--;
335 continue;
336 }
337 /* We now have wstart on a 'set' bit, we now need to work out
338 * how bit a window to do. To do this we need to scan
339 * forward until the last set bit before the end of the
340 * window */
341 j=wstart;
342 wvalue=1;
343 wend=0;
344 for (i=1; i<window; i++)
345 {
346 if (wstart-i < 0) break;
347 if (BN_is_bit_set(p,wstart-i))
348 {
349 wvalue<<=(i-wend);
350 wvalue|=1;
351 wend=i;
352 }
353 }
354
355 /* wend is the size of the current window */
356 j=wend+1;
357 /* add the 'bytes above' */
358 if (!start)
359 for (i=0; i<j; i++)
360 {
361 if (!BN_mod_mul_reciprocal(r,r,r,&recp,ctx))
362 goto err;
363 }
364
365 /* wvalue will be an odd number < 2^window */
366 if (!BN_mod_mul_reciprocal(r,r,val[wvalue>>1],&recp,ctx))
367 goto err;
368
369 /* move the 'window' down further */
370 wstart-=wend+1;
371 wvalue=0;
372 start=0;
373 if (wstart < 0) break;
374 }
375 ret=1;
376err:
377 BN_CTX_end(ctx);
378 BN_RECP_CTX_free(&recp);
379 bn_check_top(r);
380 return(ret);
381 }
382
383
/*
 * BN_mod_exp_mont() - compute rr = a^p mod m using Montgomery
 * multiplication with a sliding-window scan of the exponent.
 *
 * rr      - result
 * a, p, m - base, exponent and modulus; m must be odd
 * ctx     - scratch context
 * in_mont - optional precomputed Montgomery context for m (NULL means
 *           a temporary one is created here and freed before return)
 *
 * Returns 1 on success, 0 on error.  If 'p' carries BN_FLG_CONSTTIME,
 * the work is delegated to BN_mod_exp_mont_consttime().
 */
int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
		    const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
	{
	int i,j,bits,ret=0,wstart,wend,window,wvalue;
	int start=1;
	BIGNUM *d,*r;
	const BIGNUM *aa;
	/* Table of variables obtained from 'ctx' */
	BIGNUM *val[TABLE_SIZE];
	BN_MONT_CTX *mont=NULL;

	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
		{
		return BN_mod_exp_mont_consttime(rr, a, p, m, ctx, in_mont);
		}

	bn_check_top(a);
	bn_check_top(p);
	bn_check_top(m);

	/* Montgomery reduction requires an odd modulus. */
	if (!BN_is_odd(m))
		{
		BNerr(BN_F_BN_MOD_EXP_MONT,BN_R_CALLED_WITH_EVEN_MODULUS);
		return(0);
		}
	bits=BN_num_bits(p);
	if (bits == 0)
		{
		/* x^0 = 1 */
		ret = BN_one(rr);
		return ret;
		}

	BN_CTX_start(ctx);
	d = BN_CTX_get(ctx);
	r = BN_CTX_get(ctx);
	val[0] = BN_CTX_get(ctx);
	if (!d || !r || !val[0]) goto err;

	/* If this is not done, things will break in the montgomery
	 * part */

	if (in_mont != NULL)
		mont=in_mont;
	else
		{
		if ((mont=BN_MONT_CTX_new()) == NULL) goto err;
		if (!BN_MONT_CTX_set(mont,m,ctx)) goto err;
		}

	/* Reduce the base into [0, m) before entering the Montgomery
	 * domain. */
	if (a->neg || BN_ucmp(a,m) >= 0)
		{
		if (!BN_nnmod(val[0],a,m,ctx))
			goto err;
		aa= val[0];
		}
	else
		aa=a;
	if (BN_is_zero(aa))
		{
		/* 0^p = 0 for p > 0 */
		BN_zero(rr);
		ret = 1;
		goto err;
		}
	if (!BN_to_montgomery(val[0],aa,mont,ctx)) goto err; /* 1 */

	/* Precompute the odd powers a^1, a^3, ..., a^(2^window - 1),
	 * all kept in the Montgomery domain. */
	window = BN_window_bits_for_exponent_size(bits);
	if (window > 1)
		{
		if (!BN_mod_mul_montgomery(d,val[0],val[0],mont,ctx)) goto err; /* 2 */
		j=1<<(window-1);
		for (i=1; i<j; i++)
			{
			if(((val[i] = BN_CTX_get(ctx)) == NULL) ||
					!BN_mod_mul_montgomery(val[i],val[i-1],
						d,mont,ctx))
				goto err;
			}
		}

	start=1;	/* This is used to avoid multiplication etc
			 * when there is only the value '1' in the
			 * buffer. */
	wvalue=0;	/* The 'value' of the window */
	wstart=bits-1;	/* The top bit of the window */
	wend=0;		/* The bottom bit of the window */

	/* Accumulator starts as 1 in the Montgomery domain. */
	if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
	for (;;)
		{
		if (BN_is_bit_set(p,wstart) == 0)
			{
			/* Zero bit: square only (skipped while the
			 * accumulator is still 1). */
			if (!start)
				{
				if (!BN_mod_mul_montgomery(r,r,r,mont,ctx))
					goto err;
				}
			if (wstart == 0) break;
			wstart--;
			continue;
			}
		/* We now have wstart on a 'set' bit, we now need to work out
		 * how big a window to do.  To do this we need to scan
		 * forward until the last set bit before the end of the
		 * window */
		j=wstart;
		wvalue=1;
		wend=0;
		for (i=1; i<window; i++)
			{
			if (wstart-i < 0) break;
			if (BN_is_bit_set(p,wstart-i))
				{
				wvalue<<=(i-wend);
				wvalue|=1;
				wend=i;
				}
			}

		/* wend is the size of the current window */
		j=wend+1;
		/* add the 'bytes above' */
		if (!start)
			for (i=0; i<j; i++)
				{
				if (!BN_mod_mul_montgomery(r,r,r,mont,ctx))
					goto err;
				}

		/* wvalue will be an odd number < 2^window */
		if (!BN_mod_mul_montgomery(r,r,val[wvalue>>1],mont,ctx))
			goto err;

		/* move the 'window' down further */
		wstart-=wend+1;
		wvalue=0;
		start=0;
		if (wstart < 0) break;
		}
	/* Convert the accumulator back out of the Montgomery domain. */
	if (!BN_from_montgomery(rr,r,mont,ctx)) goto err;
	ret=1;
err:
	if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont);
	BN_CTX_end(ctx);
	bn_check_top(rr);
	return(ret);
	}
530
531
532/* BN_mod_exp_mont_consttime() stores the precomputed powers in a specific layout
533 * so that accessing any of these table values shows the same access pattern as far
534 * as cache lines are concerned. The following functions are used to transfer a BIGNUM
535 * from/to that table. */
536
537static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf, int idx, int width)
538 {
539 size_t i, j;
540
541 if (top > b->top)
542 top = b->top; /* this works because 'buf' is explicitly zeroed */
543 for (i = 0, j=idx; i < top * sizeof b->d[0]; i++, j+=width)
544 {
545 buf[j] = ((unsigned char*)b->d)[i];
546 }
547
548 return 1;
549 }
550
551static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, int width)
552 {
553 size_t i, j;
554
555 if (bn_wexpand(b, top) == NULL)
556 return 0;
557
558 for (i=0, j=idx; i < top * sizeof b->d[0]; i++, j+=width)
559 {
560 ((unsigned char*)b->d)[i] = buf[j];
561 }
562
563 b->top = top;
564 bn_correct_top(b);
565 return 1;
566 }
567
/* Given a pointer value, compute the next address that is a cache line
 * multiple: rounds x_ up to the next MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH
 * boundary (always advances, by between 1 and a full line width, so the
 * caller must over-allocate by MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH). */
#define MOD_EXP_CTIME_ALIGN(x_) \
	((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
571
572/* This variant of BN_mod_exp_mont() uses fixed windows and the special
573 * precomputation memory layout to limit data-dependency to a minimum
574 * to protect secret exponents (cf. the hyper-threading timing attacks
575 * pointed out by Colin Percival,
576 * http://www.daemonology.net/hyperthreading-considered-harmful/)
577 */
/*
 * BN_mod_exp_mont_consttime() - compute rr = a^p mod m (m odd) with
 * fixed windows and a cache-line-interleaved precomputation table so
 * that the memory access pattern does not depend on secret exponent
 * bits.  Returns 1 on success, 0 on error.
 */
int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
		    const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
	{
	int i,bits,ret=0,window,wvalue;
	int top;
	BN_MONT_CTX *mont=NULL;

	int numPowers;
	unsigned char *powerbufFree=NULL;
	int powerbufLen = 0;
	unsigned char *powerbuf=NULL;
	BIGNUM tmp, am;	/* scratch BIGNUMs whose digit storage lives inside powerbuf */

	bn_check_top(a);
	bn_check_top(p);
	bn_check_top(m);

	top = m->top;

	/* Montgomery reduction requires an odd modulus. */
	if (!(m->d[0] & 1))
		{
		BNerr(BN_F_BN_MOD_EXP_MONT_CONSTTIME,BN_R_CALLED_WITH_EVEN_MODULUS);
		return(0);
		}
	bits=BN_num_bits(p);
	if (bits == 0)
		{
		/* x^0 = 1 */
		ret = BN_one(rr);
		return ret;
		}

	BN_CTX_start(ctx);

	/* Allocate a montgomery context if it was not supplied by the caller.
	 * If this is not done, things will break in the montgomery part.
	 */
	if (in_mont != NULL)
		mont=in_mont;
	else
		{
		if ((mont=BN_MONT_CTX_new()) == NULL) goto err;
		if (!BN_MONT_CTX_set(mont,m,ctx)) goto err;
		}

	/* Get the window size to use with size of p. */
	window = BN_window_bits_for_ctime_exponent_size(bits);
#if defined(OPENSSL_BN_ASM_MONT5)
	if (window==6 && bits<=1024) window=5;	/* ~5% improvement of 2048-bit RSA sign */
#endif

	/* Allocate a buffer large enough to hold all of the pre-computed
	 * powers of am, am itself and tmp.
	 */
	numPowers = 1 << window;
	powerbufLen = sizeof(m->d[0])*(top*numPowers +
				((2*top)>numPowers?(2*top):numPowers));
#ifdef alloca
	if (powerbufLen < 3072)
		powerbufFree = alloca(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);
	else
#endif
	if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
		goto err;

	powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
	memset(powerbuf, 0, powerbufLen);

#ifdef alloca
	/* stack memory must not reach OPENSSL_free() in the cleanup path */
	if (powerbufLen < 3072)
		powerbufFree = NULL;
#endif

	/* lay down tmp and am right after powers table */
	tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0])*top*numPowers);
	am.d = tmp.d + top;
	tmp.top = am.top = 0;
	tmp.dmax = am.dmax = top;
	tmp.neg = am.neg = 0;
	/* flag keeps the BN code from freeing/reallocating these digits */
	tmp.flags = am.flags = BN_FLG_STATIC_DATA;

	/* prepare a^0 in Montgomery domain */
#if 1
	if (!BN_to_montgomery(&tmp,BN_value_one(),mont,ctx)) goto err;
#else
	tmp.d[0] = (0-m->d[0])&BN_MASK2;	/* 2^(top*BN_BITS2) - m */
	for (i=1;i<top;i++)
		tmp.d[i] = (~m->d[i])&BN_MASK2;
	tmp.top = top;
#endif

	/* prepare a^1 in Montgomery domain (reduce the base first if it
	 * is negative or not already < m) */
	if (a->neg || BN_ucmp(a,m) >= 0)
		{
		if (!BN_mod(&am,a,m,ctx)) goto err;
		if (!BN_to_montgomery(&am,&am,mont,ctx)) goto err;
		}
	else	if (!BN_to_montgomery(&am,a,mont,ctx)) goto err;

#if defined(OPENSSL_BN_ASM_MONT5)
	/* This optimization uses ideas from http://eprint.iacr.org/2011/239,
	 * specifically optimization of cache-timing attack countermeasures
	 * and pre-computation optimization. */

	/* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
	 * 512-bit RSA is hardly relevant, we omit it to spare size... */
	if (window==5)
		{
		/* Assembler helpers: scatter/gather move table entries with a
		 * fixed access pattern independent of the (secret) index. */
		void bn_mul_mont_gather5(BN_ULONG *rp,const BN_ULONG *ap,
				const void *table,const BN_ULONG *np,
				const BN_ULONG *n0,int num,int power);
		void bn_scatter5(const BN_ULONG *inp,size_t num,
				void *table,size_t power);
		void bn_gather5(BN_ULONG *out,size_t num,
				void *table,size_t power);

		BN_ULONG *np=mont->N.d, *n0=mont->n0;

		/* BN_to_montgomery can contaminate words above .top
		 * [in BN_DEBUG[_DEBUG] build]... */
		for (i=am.top; i<top; i++)	am.d[i]=0;
		for (i=tmp.top; i<top; i++)	tmp.d[i]=0;

		/* table slots 0..2 hold a^0, a^1, a^2 */
		bn_scatter5(tmp.d,top,powerbuf,0);
		bn_scatter5(am.d,am.top,powerbuf,1);
		bn_mul_mont(tmp.d,am.d,am.d,np,n0,top);
		bn_scatter5(tmp.d,top,powerbuf,2);

#if 0
		for (i=3; i<32; i++)
			{
			/* Calculate a^i = a^(i-1) * a */
			bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
			bn_scatter5(tmp.d,top,powerbuf,i);
			}
#else
		/* same as above, but uses squaring for 1/2 of operations */
		for (i=4; i<32; i*=2)
			{
			bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
			bn_scatter5(tmp.d,top,powerbuf,i);
			}
		for (i=3; i<8; i+=2)
			{
			int j;
			bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
			bn_scatter5(tmp.d,top,powerbuf,i);
			for (j=2*i; j<32; j*=2)
				{
				bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
				bn_scatter5(tmp.d,top,powerbuf,j);
				}
			}
		for (; i<16; i+=2)
			{
			bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
			bn_scatter5(tmp.d,top,powerbuf,i);
			bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
			bn_scatter5(tmp.d,top,powerbuf,2*i);
			}
		for (; i<32; i+=2)
			{
			bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
			bn_scatter5(tmp.d,top,powerbuf,i);
			}
#endif
		/* Consume the topmost (bits % 5) + 1 bits first so the rest
		 * of the exponent divides evenly into 5-bit windows. */
		bits--;
		for (wvalue=0, i=bits%5; i>=0; i--,bits--)
			wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
		bn_gather5(tmp.d,top,powerbuf,wvalue);

		/* Scan the exponent one window at a time starting from the most
		 * significant bits.
		 */
		while (bits >= 0)
			{
			for (wvalue=0, i=0; i<5; i++,bits--)
				wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);

			/* five squarings, then one gather-multiply */
			bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
			bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
			bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
			bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
			bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
			bn_mul_mont_gather5(tmp.d,tmp.d,powerbuf,np,n0,top,wvalue);
			}

		tmp.top=top;
		bn_correct_top(&tmp);
		}
	else
#endif
		{
		/* Generic path: powers are byte-interleaved in powerbuf via
		 * the COPY_TO/FROM_PREBUF helpers so each lookup touches the
		 * same cache lines regardless of the window value. */
		if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers)) goto err;
		if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am,  top, powerbuf, 1, numPowers)) goto err;

		/* If the window size is greater than 1, then calculate
		 * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
		 * (even powers could instead be computed as (a^(i/2))^2
		 * to use the slight performance advantage of sqr over mul).
		 */
		if (window > 1)
			{
			if (!BN_mod_mul_montgomery(&tmp,&am,&am,mont,ctx)) goto err;
			if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2, numPowers)) goto err;
			for (i=3; i<numPowers; i++)
				{
				/* Calculate a^i = a^(i-1) * a */
				if (!BN_mod_mul_montgomery(&tmp,&am,&tmp,mont,ctx))
					goto err;
				if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i, numPowers)) goto err;
				}
			}

		/* Consume the topmost (bits % window) + 1 bits first so the
		 * remainder splits into fixed-size windows. */
		bits--;
		for (wvalue=0, i=bits%window; i>=0; i--,bits--)
			wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
		if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp,top,powerbuf,wvalue,numPowers)) goto err;

		/* Scan the exponent one window at a time starting from the most
		 * significant bits.
		 */
		while (bits >= 0)
			{
			wvalue=0; /* The 'value' of the window */

			/* Scan the window, squaring the result as we go */
			for (i=0; i<window; i++,bits--)
				{
				if (!BN_mod_mul_montgomery(&tmp,&tmp,&tmp,mont,ctx))	goto err;
				wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
				}

			/* Fetch the appropriate pre-computed value from the pre-buf */
			if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue, numPowers)) goto err;

			/* Multiply the result into the intermediate result */
			if (!BN_mod_mul_montgomery(&tmp,&tmp,&am,mont,ctx)) goto err;
			}
		}

	/* Convert the final result from montgomery to standard format */
	if (!BN_from_montgomery(rr,&tmp,mont,ctx)) goto err;
	ret=1;
err:
	if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont);
	if (powerbuf!=NULL)
		{
		/* wipe secret-dependent table contents before release */
		OPENSSL_cleanse(powerbuf,powerbufLen);
		if (powerbufFree) OPENSSL_free(powerbufFree);
		}
	BN_CTX_end(ctx);
	return(ret);
	}
831
/*
 * BN_mod_exp_mont_word() - compute rr = a^p mod m for a single-word
 * base 'a', with m odd.  The accumulator is the implicit product r*w,
 * where 'w' is a machine word; 'w' is folded into the BIGNUM 'r' only
 * when further word arithmetic would overflow.  Returns 1 on success,
 * 0 on error, -1 if BN_FLG_CONSTTIME was requested (unsupported here).
 */
int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p,
                         const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
	{
	BN_MONT_CTX *mont = NULL;
	int b, bits, ret=0;
	int r_is_one;
	BN_ULONG w, next_w;
	BIGNUM *d, *r, *t;
	BIGNUM *swap_tmp;
/* Multiply r by word w and reduce mod m; the reduced result ends up
 * back in 'r' via a pointer swap with scratch 't'. */
#define BN_MOD_MUL_WORD(r, w, m) \
		(BN_mul_word(r, (w)) && \
		(/* BN_ucmp(r, (m)) < 0 ? 1 :*/  \
			(BN_mod(t, r, m, ctx) && (swap_tmp = r, r = t, t = swap_tmp, 1))))
		/* BN_MOD_MUL_WORD is only used with 'w' large,
		 * so the BN_ucmp test is probably more overhead
		 * than always using BN_mod (which uses BN_copy if
		 * a similar test returns true). */
		/* We can use BN_mod and do not need BN_nnmod because our
		 * accumulator is never negative (the result of BN_mod does
		 * not depend on the sign of the modulus).
		 */
/* Load word w into r and take it into the Montgomery domain. */
#define BN_TO_MONTGOMERY_WORD(r, w, mont) \
		(BN_set_word(r, (w)) && BN_to_montgomery(r, r, (mont), ctx))

	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
		{
		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
		BNerr(BN_F_BN_MOD_EXP_MONT_WORD,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
		return -1;
		}

	bn_check_top(p);
	bn_check_top(m);

	/* Montgomery reduction requires an odd modulus. */
	if (!BN_is_odd(m))
		{
		BNerr(BN_F_BN_MOD_EXP_MONT_WORD,BN_R_CALLED_WITH_EVEN_MODULUS);
		return(0);
		}
	if (m->top == 1)
		a %= m->d[0]; /* make sure that 'a' is reduced */

	bits = BN_num_bits(p);
	if (bits == 0)
		{
		/* x^0 = 1 */
		ret = BN_one(rr);
		return ret;
		}
	if (a == 0)
		{
		/* 0^p = 0 for p > 0 */
		BN_zero(rr);
		ret = 1;
		return ret;
		}

	BN_CTX_start(ctx);
	d = BN_CTX_get(ctx);
	r = BN_CTX_get(ctx);
	t = BN_CTX_get(ctx);
	if (d == NULL || r == NULL || t == NULL) goto err;

	if (in_mont != NULL)
		mont=in_mont;
	else
		{
		if ((mont = BN_MONT_CTX_new()) == NULL) goto err;
		if (!BN_MONT_CTX_set(mont, m, ctx)) goto err;
		}

	r_is_one = 1; /* except for Montgomery factor */

	/* bits-1 >= 0 */

	/* The result is accumulated in the product r*w. */
	w = a; /* bit 'bits-1' of 'p' is always set */
	for (b = bits-2; b >= 0; b--)
		{
		/* First, square r*w. */
		next_w = w*w;
		if ((next_w/w) != w) /* overflow: fold w into r first */
			{
			if (r_is_one)
				{
				if (!BN_TO_MONTGOMERY_WORD(r, w, mont)) goto err;
				r_is_one = 0;
				}
			else
				{
				if (!BN_MOD_MUL_WORD(r, w, m)) goto err;
				}
			next_w = 1;
			}
		w = next_w;
		if (!r_is_one)
			{
			if (!BN_mod_mul_montgomery(r, r, r, mont, ctx)) goto err;
			}

		/* Second, multiply r*w by 'a' if exponent bit is set. */
		if (BN_is_bit_set(p, b))
			{
			next_w = w*a;
			if ((next_w/a) != w) /* overflow: fold w into r first */
				{
				if (r_is_one)
					{
					if (!BN_TO_MONTGOMERY_WORD(r, w, mont)) goto err;
					r_is_one = 0;
					}
				else
					{
					if (!BN_MOD_MUL_WORD(r, w, m)) goto err;
					}
				next_w = a;
				}
			w = next_w;
			}
		}

	/* Finally, set r:=r*w. */
	if (w != 1)
		{
		if (r_is_one)
			{
			if (!BN_TO_MONTGOMERY_WORD(r, w, mont)) goto err;
			r_is_one = 0;
			}
		else
			{
			if (!BN_MOD_MUL_WORD(r, w, m)) goto err;
			}
		}

	if (r_is_one) /* can happen only if a == 1*/
		{
		if (!BN_one(rr)) goto err;
		}
	else
		{
		/* r is in the Montgomery domain; convert it out. */
		if (!BN_from_montgomery(rr, r, mont, ctx)) goto err;
		}
	ret = 1;
err:
	if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont);
	BN_CTX_end(ctx);
	bn_check_top(rr);
	return(ret);
	}
980
981
982/* The old fallback, simple version :-) */
/* The old fallback, simple version :-) */
/*
 * BN_mod_exp_simple() - compute r = a^p mod m with plain BN_mod_mul()
 * (no Montgomery or reciprocal tricks), using a sliding-window scan of
 * the exponent.  Returns 1 on success, 0 on error, -1 if 'p' carries
 * BN_FLG_CONSTTIME (unsupported here).
 */
int BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
		      const BIGNUM *m, BN_CTX *ctx)
	{
	int i,j,bits,ret=0,wstart,wend,window,wvalue;
	int start=1;
	BIGNUM *d;
	/* Table of variables obtained from 'ctx' */
	BIGNUM *val[TABLE_SIZE];

	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
		{
		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
		BNerr(BN_F_BN_MOD_EXP_SIMPLE,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
		return -1;
		}

	bits=BN_num_bits(p);

	if (bits == 0)
		{
		/* x^0 = 1 */
		ret = BN_one(r);
		return ret;
		}

	BN_CTX_start(ctx);
	d = BN_CTX_get(ctx);
	val[0] = BN_CTX_get(ctx);
	if(!d || !val[0]) goto err;

	if (!BN_nnmod(val[0],a,m,ctx)) goto err;		/* 1 */
	if (BN_is_zero(val[0]))
		{
		/* 0^p = 0 for p > 0 */
		BN_zero(r);
		ret = 1;
		goto err;
		}

	/* Precompute the odd powers a^1, a^3, ..., a^(2^window - 1). */
	window = BN_window_bits_for_exponent_size(bits);
	if (window > 1)
		{
		if (!BN_mod_mul(d,val[0],val[0],m,ctx))
			goto err;				/* 2 */
		j=1<<(window-1);
		for (i=1; i<j; i++)
			{
			if(((val[i] = BN_CTX_get(ctx)) == NULL) ||
					!BN_mod_mul(val[i],val[i-1],d,m,ctx))
				goto err;
			}
		}

	start=1;	/* This is used to avoid multiplication etc
			 * when there is only the value '1' in the
			 * buffer. */
	wvalue=0;	/* The 'value' of the window */
	wstart=bits-1;	/* The top bit of the window */
	wend=0;		/* The bottom bit of the window */

	if (!BN_one(r)) goto err;

	/* Sliding-window exponentiation, scanning 'p' from the most
	 * significant bit downwards. */
	for (;;)
		{
		if (BN_is_bit_set(p,wstart) == 0)
			{
			/* Zero bit: square only (skipped while the
			 * accumulator is still 1). */
			if (!start)
				if (!BN_mod_mul(r,r,r,m,ctx))
					goto err;
			if (wstart == 0) break;
			wstart--;
			continue;
			}
		/* We now have wstart on a 'set' bit, we now need to work out
		 * how big a window to do.  To do this we need to scan
		 * forward until the last set bit before the end of the
		 * window */
		j=wstart;
		wvalue=1;
		wend=0;
		for (i=1; i<window; i++)
			{
			if (wstart-i < 0) break;
			if (BN_is_bit_set(p,wstart-i))
				{
				wvalue<<=(i-wend);
				wvalue|=1;
				wend=i;
				}
			}

		/* wend is the size of the current window */
		j=wend+1;
		/* add the 'bytes above' */
		if (!start)
			for (i=0; i<j; i++)
				{
				if (!BN_mod_mul(r,r,r,m,ctx))
					goto err;
				}

		/* wvalue will be an odd number < 2^window */
		if (!BN_mod_mul(r,r,val[wvalue>>1],m,ctx))
			goto err;

		/* move the 'window' down further */
		wstart-=wend+1;
		wvalue=0;
		start=0;
		if (wstart < 0) break;
		}
	ret=1;
err:
	BN_CTX_end(ctx);
	bn_check_top(r);
	return(ret);
	}
diff --git a/src/lib/libcrypto/bn/bn_exp2.c b/src/lib/libcrypto/bn/bn_exp2.c
deleted file mode 100644
index bd0c34b91b..0000000000
--- a/src/lib/libcrypto/bn/bn_exp2.c
+++ /dev/null
@@ -1,312 +0,0 @@
1/* crypto/bn/bn_exp2.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58/* ====================================================================
59 * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
60 *
61 * Redistribution and use in source and binary forms, with or without
62 * modification, are permitted provided that the following conditions
63 * are met:
64 *
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 *
68 * 2. Redistributions in binary form must reproduce the above copyright
69 * notice, this list of conditions and the following disclaimer in
70 * the documentation and/or other materials provided with the
71 * distribution.
72 *
73 * 3. All advertising materials mentioning features or use of this
74 * software must display the following acknowledgment:
75 * "This product includes software developed by the OpenSSL Project
76 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
77 *
78 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
79 * endorse or promote products derived from this software without
80 * prior written permission. For written permission, please contact
81 * openssl-core@openssl.org.
82 *
83 * 5. Products derived from this software may not be called "OpenSSL"
84 * nor may "OpenSSL" appear in their names without prior written
85 * permission of the OpenSSL Project.
86 *
87 * 6. Redistributions of any form whatsoever must retain the following
88 * acknowledgment:
89 * "This product includes software developed by the OpenSSL Project
90 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
91 *
92 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
93 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
95 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
96 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
97 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
98 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
99 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
101 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
103 * OF THE POSSIBILITY OF SUCH DAMAGE.
104 * ====================================================================
105 *
106 * This product includes cryptographic software written by Eric Young
107 * (eay@cryptsoft.com). This product includes software written by Tim
108 * Hudson (tjh@cryptsoft.com).
109 *
110 */
111
112#include <stdio.h>
113#include "cryptlib.h"
114#include "bn_lcl.h"
115
116#define TABLE_SIZE 32
117
/*
 * BN_mod_exp2_mont() - compute rr = a1^p1 * a2^p2 mod m (m odd) with
 * Montgomery multiplication.  Both exponents are scanned in a single
 * pass with independent sliding windows so the squarings are shared
 * between the two exponentiations.  Returns 1 on success, 0 on error.
 */
int BN_mod_exp2_mont(BIGNUM *rr, const BIGNUM *a1, const BIGNUM *p1,
	const BIGNUM *a2, const BIGNUM *p2, const BIGNUM *m,
	BN_CTX *ctx, BN_MONT_CTX *in_mont)
	{
	int i,j,bits,b,bits1,bits2,ret=0,wpos1,wpos2,window1,window2,wvalue1,wvalue2;
	int r_is_one=1;
	BIGNUM *d,*r;
	const BIGNUM *a_mod_m;
	/* Tables of variables obtained from 'ctx' */
	BIGNUM *val1[TABLE_SIZE], *val2[TABLE_SIZE];
	BN_MONT_CTX *mont=NULL;

	bn_check_top(a1);
	bn_check_top(p1);
	bn_check_top(a2);
	bn_check_top(p2);
	bn_check_top(m);

	/* Montgomery reduction requires an odd modulus. */
	if (!(m->d[0] & 1))
		{
		BNerr(BN_F_BN_MOD_EXP2_MONT,BN_R_CALLED_WITH_EVEN_MODULUS);
		return(0);
		}
	bits1=BN_num_bits(p1);
	bits2=BN_num_bits(p2);
	if ((bits1 == 0) && (bits2 == 0))
		{
		/* x^0 * y^0 = 1 */
		ret = BN_one(rr);
		return ret;
		}

	bits=(bits1 > bits2)?bits1:bits2;

	BN_CTX_start(ctx);
	d = BN_CTX_get(ctx);
	r = BN_CTX_get(ctx);
	val1[0] = BN_CTX_get(ctx);
	val2[0] = BN_CTX_get(ctx);
	if(!d || !r || !val1[0] || !val2[0]) goto err;

	if (in_mont != NULL)
		mont=in_mont;
	else
		{
		if ((mont=BN_MONT_CTX_new()) == NULL) goto err;
		if (!BN_MONT_CTX_set(mont,m,ctx)) goto err;
		}

	window1 = BN_window_bits_for_exponent_size(bits1);
	window2 = BN_window_bits_for_exponent_size(bits2);

	/*
	 * Build table for a1:   val1[i] := a1^(2*i + 1) mod m  for i = 0 .. 2^(window1-1)
	 */
	if (a1->neg || BN_ucmp(a1,m) >= 0)
		{
		if (!BN_mod(val1[0],a1,m,ctx))
			goto err;
		a_mod_m = val1[0];
		}
	else
		a_mod_m = a1;
	if (BN_is_zero(a_mod_m))
		{
		/* 0 * anything = 0 */
		BN_zero(rr);
		ret = 1;
		goto err;
		}

	if (!BN_to_montgomery(val1[0],a_mod_m,mont,ctx)) goto err;
	if (window1 > 1)
		{
		/* d = a1^2, then multiply it in repeatedly for the odd powers */
		if (!BN_mod_mul_montgomery(d,val1[0],val1[0],mont,ctx)) goto err;

		j=1<<(window1-1);
		for (i=1; i<j; i++)
			{
			if(((val1[i] = BN_CTX_get(ctx)) == NULL) ||
					!BN_mod_mul_montgomery(val1[i],val1[i-1],
						d,mont,ctx))
				goto err;
			}
		}


	/*
	 * Build table for a2:   val2[i] := a2^(2*i + 1) mod m  for i = 0 .. 2^(window2-1)
	 */
	if (a2->neg || BN_ucmp(a2,m) >= 0)
		{
		if (!BN_mod(val2[0],a2,m,ctx))
			goto err;
		a_mod_m = val2[0];
		}
	else
		a_mod_m = a2;
	if (BN_is_zero(a_mod_m))
		{
		/* anything * 0 = 0 */
		BN_zero(rr);
		ret = 1;
		goto err;
		}
	if (!BN_to_montgomery(val2[0],a_mod_m,mont,ctx)) goto err;
	if (window2 > 1)
		{
		/* d = a2^2, then multiply it in repeatedly for the odd powers */
		if (!BN_mod_mul_montgomery(d,val2[0],val2[0],mont,ctx)) goto err;

		j=1<<(window2-1);
		for (i=1; i<j; i++)
			{
			if(((val2[i] = BN_CTX_get(ctx)) == NULL) ||
					!BN_mod_mul_montgomery(val2[i],val2[i-1],
						d,mont,ctx))
				goto err;
			}
		}


	/* Now compute the power product, using independent windows. */
	r_is_one=1;
	wvalue1=0;	/* The 'value' of the first window */
	wvalue2=0;	/* The 'value' of the second window */
	wpos1=0;	/* If wvalue1 > 0, the bottom bit of the first window */
	wpos2=0;	/* If wvalue2 > 0, the bottom bit of the second window */

	/* Accumulator starts as 1 in the Montgomery domain; one shared
	 * squaring per bit serves both exponents. */
	if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
	for (b=bits-1; b>=0; b--)
		{
		if (!r_is_one)
			{
			if (!BN_mod_mul_montgomery(r,r,r,mont,ctx))
				goto err;
			}

		if (!wvalue1)
			if (BN_is_bit_set(p1, b))
				{
				/* consider bits b-window1+1 .. b for this window */
				i = b-window1+1;
				while (!BN_is_bit_set(p1, i)) /* works for i<0 */
					i++;
				wpos1 = i;
				wvalue1 = 1;
				for (i = b-1; i >= wpos1; i--)
					{
					wvalue1 <<= 1;
					if (BN_is_bit_set(p1, i))
						wvalue1++;
					}
				}

		if (!wvalue2)
			if (BN_is_bit_set(p2, b))
				{
				/* consider bits b-window2+1 .. b for this window */
				i = b-window2+1;
				while (!BN_is_bit_set(p2, i))
					i++;
				wpos2 = i;
				wvalue2 = 1;
				for (i = b-1; i >= wpos2; i--)
					{
					wvalue2 <<= 1;
					if (BN_is_bit_set(p2, i))
						wvalue2++;
					}
				}

		if (wvalue1 && b == wpos1)
			{
			/* wvalue1 is odd and < 2^window1 */
			if (!BN_mod_mul_montgomery(r,r,val1[wvalue1>>1],mont,ctx))
				goto err;
			wvalue1 = 0;
			r_is_one = 0;
			}

		if (wvalue2 && b == wpos2)
			{
			/* wvalue2 is odd and < 2^window2 */
			if (!BN_mod_mul_montgomery(r,r,val2[wvalue2>>1],mont,ctx))
				goto err;
			wvalue2 = 0;
			r_is_one = 0;
			}
		}
	/* Convert the accumulator back out of the Montgomery domain. */
	if (!BN_from_montgomery(rr,r,mont,ctx))
		goto err;
	ret=1;
err:
	if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont);
	BN_CTX_end(ctx);
	bn_check_top(rr);
	return(ret);
	}
diff --git a/src/lib/libcrypto/bn/bn_gcd.c b/src/lib/libcrypto/bn/bn_gcd.c
deleted file mode 100644
index 4a352119ba..0000000000
--- a/src/lib/libcrypto/bn/bn_gcd.c
+++ /dev/null
@@ -1,654 +0,0 @@
1/* crypto/bn/bn_gcd.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58/* ====================================================================
59 * Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved.
60 *
61 * Redistribution and use in source and binary forms, with or without
62 * modification, are permitted provided that the following conditions
63 * are met:
64 *
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 *
68 * 2. Redistributions in binary form must reproduce the above copyright
69 * notice, this list of conditions and the following disclaimer in
70 * the documentation and/or other materials provided with the
71 * distribution.
72 *
73 * 3. All advertising materials mentioning features or use of this
74 * software must display the following acknowledgment:
75 * "This product includes software developed by the OpenSSL Project
76 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
77 *
78 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
79 * endorse or promote products derived from this software without
80 * prior written permission. For written permission, please contact
81 * openssl-core@openssl.org.
82 *
83 * 5. Products derived from this software may not be called "OpenSSL"
84 * nor may "OpenSSL" appear in their names without prior written
85 * permission of the OpenSSL Project.
86 *
87 * 6. Redistributions of any form whatsoever must retain the following
88 * acknowledgment:
89 * "This product includes software developed by the OpenSSL Project
90 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
91 *
92 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
93 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
95 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
96 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
97 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
98 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
99 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
101 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
103 * OF THE POSSIBILITY OF SUCH DAMAGE.
104 * ====================================================================
105 *
106 * This product includes cryptographic software written by Eric Young
107 * (eay@cryptsoft.com). This product includes software written by Tim
108 * Hudson (tjh@cryptsoft.com).
109 *
110 */
111
112#include "cryptlib.h"
113#include "bn_lcl.h"
114
115static BIGNUM *euclid(BIGNUM *a, BIGNUM *b);
116
117int BN_gcd(BIGNUM *r, const BIGNUM *in_a, const BIGNUM *in_b, BN_CTX *ctx)
118 {
119 BIGNUM *a,*b,*t;
120 int ret=0;
121
122 bn_check_top(in_a);
123 bn_check_top(in_b);
124
125 BN_CTX_start(ctx);
126 a = BN_CTX_get(ctx);
127 b = BN_CTX_get(ctx);
128 if (a == NULL || b == NULL) goto err;
129
130 if (BN_copy(a,in_a) == NULL) goto err;
131 if (BN_copy(b,in_b) == NULL) goto err;
132 a->neg = 0;
133 b->neg = 0;
134
135 if (BN_cmp(a,b) < 0) { t=a; a=b; b=t; }
136 t=euclid(a,b);
137 if (t == NULL) goto err;
138
139 if (BN_copy(r,t) == NULL) goto err;
140 ret=1;
141err:
142 BN_CTX_end(ctx);
143 bn_check_top(r);
144 return(ret);
145 }
146
147static BIGNUM *euclid(BIGNUM *a, BIGNUM *b)
148 {
149 BIGNUM *t;
150 int shifts=0;
151
152 bn_check_top(a);
153 bn_check_top(b);
154
155 /* 0 <= b <= a */
156 while (!BN_is_zero(b))
157 {
158 /* 0 < b <= a */
159
160 if (BN_is_odd(a))
161 {
162 if (BN_is_odd(b))
163 {
164 if (!BN_sub(a,a,b)) goto err;
165 if (!BN_rshift1(a,a)) goto err;
166 if (BN_cmp(a,b) < 0)
167 { t=a; a=b; b=t; }
168 }
169 else /* a odd - b even */
170 {
171 if (!BN_rshift1(b,b)) goto err;
172 if (BN_cmp(a,b) < 0)
173 { t=a; a=b; b=t; }
174 }
175 }
176 else /* a is even */
177 {
178 if (BN_is_odd(b))
179 {
180 if (!BN_rshift1(a,a)) goto err;
181 if (BN_cmp(a,b) < 0)
182 { t=a; a=b; b=t; }
183 }
184 else /* a even - b even */
185 {
186 if (!BN_rshift1(a,a)) goto err;
187 if (!BN_rshift1(b,b)) goto err;
188 shifts++;
189 }
190 }
191 /* 0 <= b <= a */
192 }
193
194 if (shifts)
195 {
196 if (!BN_lshift(a,a,shifts)) goto err;
197 }
198 bn_check_top(a);
199 return(a);
200err:
201 return(NULL);
202 }
203
204
/* solves ax == 1 (mod n) */
static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in,
	const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx);

/* Compute the modular inverse of a mod |n|, i.e. X with a*X == 1 (mod |n|),
 * and place it in 'in' (or a freshly allocated BIGNUM if in == NULL).
 * Returns the result object on success, NULL on error (including when
 * gcd(a,n) != 1, in which case BN_R_NO_INVERSE is raised). */
BIGNUM *BN_mod_inverse(BIGNUM *in,
	const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx)
	{
	BIGNUM *A,*B,*X,*Y,*M,*D,*T,*R=NULL;
	BIGNUM *ret=NULL;
	int sign;

	/* If either input was flagged constant-time, defer to the variant
	 * that avoids input-dependent branches. */
	if ((BN_get_flags(a, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(n, BN_FLG_CONSTTIME) != 0))
		{
		return BN_mod_inverse_no_branch(in, a, n, ctx);
		}

	bn_check_top(a);
	bn_check_top(n);

	BN_CTX_start(ctx);
	A = BN_CTX_get(ctx);
	B = BN_CTX_get(ctx);
	X = BN_CTX_get(ctx);
	D = BN_CTX_get(ctx);
	M = BN_CTX_get(ctx);
	Y = BN_CTX_get(ctx);
	T = BN_CTX_get(ctx);
	/* BN_CTX_get returns NULL for all subsequent calls once one fails,
	 * so checking only the last one covers them all. */
	if (T == NULL) goto err;

	if (in == NULL)
		R=BN_new();
	else
		R=in;
	if (R == NULL) goto err;

	BN_one(X);
	BN_zero(Y);
	if (BN_copy(B,a) == NULL) goto err;
	if (BN_copy(A,n) == NULL) goto err;
	A->neg = 0;
	if (B->neg || (BN_ucmp(B, A) >= 0))
		{
		if (!BN_nnmod(B, B, A, ctx)) goto err;
		}
	sign = -1;
	/* From  B = a mod |n|,  A = |n|  it follows that
	 *
	 *      0 <= B < A,
	 *     -sign*X*a  ==  B   (mod |n|),
	 *      sign*Y*a  ==  A   (mod |n|).
	 */

	if (BN_is_odd(n) && (BN_num_bits(n) <= (BN_BITS <= 32 ? 450 : 2048)))
		{
		/* Binary inversion algorithm; requires odd modulus.
		 * This is faster than the general algorithm if the modulus
		 * is sufficiently small (about 400 .. 500 bits on 32-bit
		 * systems, but much more on 64-bit systems) */
		int shift;

		while (!BN_is_zero(B))
			{
			/*
			 *      0 < B < |n|,
			 *      0 < A <= |n|,
			 * (1) -sign*X*a  ==  B   (mod |n|),
			 * (2)  sign*Y*a  ==  A   (mod |n|)
			 */

			/* Now divide  B  by the maximum possible power of two in the integers,
			 * and divide  X  by the same value mod |n|.
			 * When we're done, (1) still holds. */
			shift = 0;
			while (!BN_is_bit_set(B, shift)) /* note that 0 < B */
				{
				shift++;

				if (BN_is_odd(X))
					{
					if (!BN_uadd(X, X, n)) goto err;
					}
				/* now X is even, so we can easily divide it by two */
				if (!BN_rshift1(X, X)) goto err;
				}
			if (shift > 0)
				{
				if (!BN_rshift(B, B, shift)) goto err;
				}

			/* Same for  A  and  Y.  Afterwards, (2) still holds. */
			shift = 0;
			while (!BN_is_bit_set(A, shift)) /* note that 0 < A */
				{
				shift++;

				if (BN_is_odd(Y))
					{
					if (!BN_uadd(Y, Y, n)) goto err;
					}
				/* now Y is even */
				if (!BN_rshift1(Y, Y)) goto err;
				}
			if (shift > 0)
				{
				if (!BN_rshift(A, A, shift)) goto err;
				}

			/* We still have (1) and (2).
			 * Both  A  and  B  are odd.
			 * The following computations ensure that
			 *
			 *     0 <= B < |n|,
			 *      0 < A < |n|,
			 * (1) -sign*X*a  ==  B   (mod |n|),
			 * (2)  sign*Y*a  ==  A   (mod |n|),
			 *
			 * and that either  A  or  B  is even in the next iteration.
			 */
			if (BN_ucmp(B, A) >= 0)
				{
				/* -sign*(X + Y)*a == B - A  (mod |n|) */
				if (!BN_uadd(X, X, Y)) goto err;
				/* NB: we could use BN_mod_add_quick(X, X, Y, n), but that
				 * actually makes the algorithm slower */
				if (!BN_usub(B, B, A)) goto err;
				}
			else
				{
				/*  sign*(X + Y)*a == A - B  (mod |n|) */
				if (!BN_uadd(Y, Y, X)) goto err;
				/* as above, BN_mod_add_quick(Y, Y, X, n) would slow things down */
				if (!BN_usub(A, A, B)) goto err;
				}
			}
		}
	else
		{
		/* general inversion algorithm */

		while (!BN_is_zero(B))
			{
			BIGNUM *tmp;

			/*
			 *      0 < B < A,
			 * (*) -sign*X*a  ==  B   (mod |n|),
			 *      sign*Y*a  ==  A   (mod |n|)
			 */

			/* (D, M) := (A/B, A%B) ...
			 * The two special cases below avoid a full BN_div when the
			 * quotient is known to be tiny (1, 2 or 3). */
			if (BN_num_bits(A) == BN_num_bits(B))
				{
				if (!BN_one(D)) goto err;
				if (!BN_sub(M,A,B)) goto err;
				}
			else if (BN_num_bits(A) == BN_num_bits(B) + 1)
				{
				/* A/B is 1, 2, or 3 */
				if (!BN_lshift1(T,B)) goto err;
				if (BN_ucmp(A,T) < 0)
					{
					/* A < 2*B, so D=1 */
					if (!BN_one(D)) goto err;
					if (!BN_sub(M,A,B)) goto err;
					}
				else
					{
					/* A >= 2*B, so D=2 or D=3 */
					if (!BN_sub(M,A,T)) goto err;
					if (!BN_add(D,T,B)) goto err; /* use D (:= 3*B) as temp */
					if (BN_ucmp(A,D) < 0)
						{
						/* A < 3*B, so D=2 */
						if (!BN_set_word(D,2)) goto err;
						/* M (= A - 2*B) already has the correct value */
						}
					else
						{
						/* only D=3 remains */
						if (!BN_set_word(D,3)) goto err;
						/* currently  M = A - 2*B,  but we need  M = A - 3*B */
						if (!BN_sub(M,M,B)) goto err;
						}
					}
				}
			else
				{
				if (!BN_div(D,M,A,B,ctx)) goto err;
				}

			/* Now
			 *      A = D*B + M;
			 * thus we have
			 * (**)  sign*Y*a  ==  D*B + M   (mod |n|).
			 */

			tmp=A; /* keep the BIGNUM object, the value does not matter */

			/* (A, B) := (B, A mod B) ... */
			A=B;
			B=M;
			/* ... so we have  0 <= B < A  again */

			/* Since the former  M  is now  B  and the former  B  is now  A,
			 * (**) translates into
			 *       sign*Y*a  ==  D*A + B    (mod |n|),
			 * i.e.
			 *       sign*Y*a - D*A  ==  B    (mod |n|).
			 * Similarly, (*) translates into
			 *      -sign*X*a  ==  A          (mod |n|).
			 *
			 * Thus,
			 *   sign*Y*a + D*sign*X*a  ==  B  (mod |n|),
			 * i.e.
			 *        sign*(Y + D*X)*a  ==  B  (mod |n|).
			 *
			 * So if we set  (X, Y, sign) := (Y + D*X, X, -sign),  we arrive back at
			 *      -sign*X*a  ==  B   (mod |n|),
			 *       sign*Y*a  ==  A   (mod |n|).
			 * Note that  X  and  Y  stay non-negative all the time.
			 */

			/* most of the time D is very small, so we can optimize tmp := D*X+Y */
			if (BN_is_one(D))
				{
				if (!BN_add(tmp,X,Y)) goto err;
				}
			else
				{
				if (BN_is_word(D,2))
					{
					if (!BN_lshift1(tmp,X)) goto err;
					}
				else if (BN_is_word(D,4))
					{
					if (!BN_lshift(tmp,X,2)) goto err;
					}
				else if (D->top == 1)
					{
					/* D fits in one word: use the cheap word multiply. */
					if (!BN_copy(tmp,X)) goto err;
					if (!BN_mul_word(tmp,D->d[0])) goto err;
					}
				else
					{
					if (!BN_mul(tmp,D,X,ctx)) goto err;
					}
				if (!BN_add(tmp,tmp,Y)) goto err;
				}

			M=Y; /* keep the BIGNUM object, the value does not matter */
			Y=X;
			X=tmp;
			sign = -sign;
			}
		}

	/*
	 * The while loop (Euclid's algorithm) ends when
	 *      A == gcd(a,n);
	 * we have
	 *       sign*Y*a  ==  A  (mod |n|),
	 * where  Y  is non-negative.
	 */

	if (sign < 0)
		{
		if (!BN_sub(Y,n,Y)) goto err;
		}
	/* Now  Y*a  ==  A  (mod |n|).  */

	if (BN_is_one(A))
		{
		/* Y*a == 1  (mod |n|) */
		if (!Y->neg && BN_ucmp(Y,n) < 0)
			{
			if (!BN_copy(R,Y)) goto err;
			}
		else
			{
			if (!BN_nnmod(R,Y,n,ctx)) goto err;
			}
		}
	else
		{
		/* gcd(a,n) != 1: no inverse exists. */
		BNerr(BN_F_BN_MOD_INVERSE,BN_R_NO_INVERSE);
		goto err;
		}
	ret=R;
err:
	/* Free R only if we allocated it here and are failing. */
	if ((ret == NULL) && (in == NULL)) BN_free(R);
	BN_CTX_end(ctx);
	bn_check_top(ret);
	return(ret);
	}
501
502
/* BN_mod_inverse_no_branch is a special version of BN_mod_inverse.
 * It does not contain branches that may leak sensitive information.
 * Same contract as BN_mod_inverse: returns the result object (in, or a
 * freshly allocated BIGNUM if in == NULL), or NULL on error. */
static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in,
	const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx)
	{
	BIGNUM *A,*B,*X,*Y,*M,*D,*T,*R=NULL;
	/* Stack shells used only to attach BN_FLG_CONSTTIME to existing
	 * values without copying them. */
	BIGNUM local_A, local_B;
	BIGNUM *pA, *pB;
	BIGNUM *ret=NULL;
	int sign;

	bn_check_top(a);
	bn_check_top(n);

	BN_CTX_start(ctx);
	A = BN_CTX_get(ctx);
	B = BN_CTX_get(ctx);
	X = BN_CTX_get(ctx);
	D = BN_CTX_get(ctx);
	M = BN_CTX_get(ctx);
	Y = BN_CTX_get(ctx);
	T = BN_CTX_get(ctx);
	/* Checking the last BN_CTX_get covers all earlier ones. */
	if (T == NULL) goto err;

	if (in == NULL)
		R=BN_new();
	else
		R=in;
	if (R == NULL) goto err;

	BN_one(X);
	BN_zero(Y);
	if (BN_copy(B,a) == NULL) goto err;
	if (BN_copy(A,n) == NULL) goto err;
	A->neg = 0;

	if (B->neg || (BN_ucmp(B, A) >= 0))
		{
		/* Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked,
		 * BN_div_no_branch will be called eventually.
		 */
		pB = &local_B;
		BN_with_flags(pB, B, BN_FLG_CONSTTIME);
		if (!BN_nnmod(B, pB, A, ctx)) goto err;
		}
	sign = -1;
	/* From  B = a mod |n|,  A = |n|  it follows that
	 *
	 *      0 <= B < A,
	 *     -sign*X*a  ==  B   (mod |n|),
	 *      sign*Y*a  ==  A   (mod |n|).
	 */

	while (!BN_is_zero(B))
		{
		BIGNUM *tmp;

		/*
		 *      0 < B < A,
		 * (*) -sign*X*a  ==  B   (mod |n|),
		 *      sign*Y*a  ==  A   (mod |n|)
		 */

		/* Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked,
		 * BN_div_no_branch will be called eventually.
		 */
		pA = &local_A;
		BN_with_flags(pA, A, BN_FLG_CONSTTIME);

		/* (D, M) := (A/B, A%B) ... */
		if (!BN_div(D,M,pA,B,ctx)) goto err;

		/* Now
		 *      A = D*B + M;
		 * thus we have
		 * (**)  sign*Y*a  ==  D*B + M   (mod |n|).
		 */

		tmp=A; /* keep the BIGNUM object, the value does not matter */

		/* (A, B) := (B, A mod B) ... */
		A=B;
		B=M;
		/* ... so we have  0 <= B < A  again */

		/* Since the former  M  is now  B  and the former  B  is now  A,
		 * (**) translates into
		 *       sign*Y*a  ==  D*A + B    (mod |n|),
		 * i.e.
		 *       sign*Y*a - D*A  ==  B    (mod |n|).
		 * Similarly, (*) translates into
		 *      -sign*X*a  ==  A          (mod |n|).
		 *
		 * Thus,
		 *   sign*Y*a + D*sign*X*a  ==  B  (mod |n|),
		 * i.e.
		 *        sign*(Y + D*X)*a  ==  B  (mod |n|).
		 *
		 * So if we set  (X, Y, sign) := (Y + D*X, X, -sign),  we arrive back at
		 *      -sign*X*a  ==  B   (mod |n|),
		 *       sign*Y*a  ==  A   (mod |n|).
		 * Note that  X  and  Y  stay non-negative all the time.
		 */

		/* No small-quotient shortcuts here: a plain multiply keeps the
		 * control flow independent of the data. */
		if (!BN_mul(tmp,D,X,ctx)) goto err;
		if (!BN_add(tmp,tmp,Y)) goto err;

		M=Y; /* keep the BIGNUM object, the value does not matter */
		Y=X;
		X=tmp;
		sign = -sign;
		}

	/*
	 * The while loop (Euclid's algorithm) ends when
	 *      A == gcd(a,n);
	 * we have
	 *       sign*Y*a  ==  A  (mod |n|),
	 * where  Y  is non-negative.
	 */

	if (sign < 0)
		{
		if (!BN_sub(Y,n,Y)) goto err;
		}
	/* Now  Y*a  ==  A  (mod |n|).  */

	if (BN_is_one(A))
		{
		/* Y*a == 1  (mod |n|) */
		if (!Y->neg && BN_ucmp(Y,n) < 0)
			{
			if (!BN_copy(R,Y)) goto err;
			}
		else
			{
			if (!BN_nnmod(R,Y,n,ctx)) goto err;
			}
		}
	else
		{
		/* gcd(a,n) != 1: no inverse exists. */
		BNerr(BN_F_BN_MOD_INVERSE_NO_BRANCH,BN_R_NO_INVERSE);
		goto err;
		}
	ret=R;
err:
	/* Free R only if we allocated it here and are failing. */
	if ((ret == NULL) && (in == NULL)) BN_free(R);
	BN_CTX_end(ctx);
	bn_check_top(ret);
	return(ret);
	}
diff --git a/src/lib/libcrypto/bn/bn_gf2m.c b/src/lib/libcrypto/bn/bn_gf2m.c
deleted file mode 100644
index 8a4dc20ad9..0000000000
--- a/src/lib/libcrypto/bn/bn_gf2m.c
+++ /dev/null
@@ -1,1113 +0,0 @@
1/* crypto/bn/bn_gf2m.c */
2/* ====================================================================
3 * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
4 *
5 * The Elliptic Curve Public-Key Crypto Library (ECC Code) included
6 * herein is developed by SUN MICROSYSTEMS, INC., and is contributed
7 * to the OpenSSL project.
8 *
9 * The ECC Code is licensed pursuant to the OpenSSL open source
10 * license provided below.
11 *
12 * In addition, Sun covenants to all licensees who provide a reciprocal
13 * covenant with respect to their own patents if any, not to sue under
14 * current and future patent claims necessarily infringed by the making,
15 * using, practicing, selling, offering for sale and/or otherwise
16 * disposing of the ECC Code as delivered hereunder (or portions thereof),
17 * provided that such covenant shall not apply:
18 * 1) for code that a licensee deletes from the ECC Code;
19 * 2) separates from the ECC Code; or
20 * 3) for infringements caused by:
21 * i) the modification of the ECC Code or
22 * ii) the combination of the ECC Code with other software or
23 * devices where such combination causes the infringement.
24 *
25 * The software is originally written by Sheueling Chang Shantz and
26 * Douglas Stebila of Sun Microsystems Laboratories.
27 *
28 */
29
30/* NOTE: This file is licensed pursuant to the OpenSSL license below
31 * and may be modified; but after modifications, the above covenant
32 * may no longer apply! In such cases, the corresponding paragraph
33 * ["In addition, Sun covenants ... causes the infringement."] and
34 * this note can be edited out; but please keep the Sun copyright
35 * notice and attribution. */
36
37/* ====================================================================
38 * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
39 *
40 * Redistribution and use in source and binary forms, with or without
41 * modification, are permitted provided that the following conditions
42 * are met:
43 *
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 *
47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in
49 * the documentation and/or other materials provided with the
50 * distribution.
51 *
52 * 3. All advertising materials mentioning features or use of this
53 * software must display the following acknowledgment:
54 * "This product includes software developed by the OpenSSL Project
55 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
56 *
57 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
58 * endorse or promote products derived from this software without
59 * prior written permission. For written permission, please contact
60 * openssl-core@openssl.org.
61 *
62 * 5. Products derived from this software may not be called "OpenSSL"
63 * nor may "OpenSSL" appear in their names without prior written
64 * permission of the OpenSSL Project.
65 *
66 * 6. Redistributions of any form whatsoever must retain the following
67 * acknowledgment:
68 * "This product includes software developed by the OpenSSL Project
69 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
70 *
71 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
72 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
73 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
74 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
75 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
76 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
77 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
78 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
79 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
80 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
81 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
82 * OF THE POSSIBILITY OF SUCH DAMAGE.
83 * ====================================================================
84 *
85 * This product includes cryptographic software written by Eric Young
86 * (eay@cryptsoft.com). This product includes software written by Tim
87 * Hudson (tjh@cryptsoft.com).
88 *
89 */
90
91#include <assert.h>
92#include <limits.h>
93#include <stdio.h>
94#include "cryptlib.h"
95#include "bn_lcl.h"
96
97#ifndef OPENSSL_NO_EC2M
98
/* Maximum number of iterations before BN_GF2m_mod_solve_quad_arr should fail. */
#define MAX_ITERATIONS 50

/* Squaring table for 4-bit polynomials over GF(2): squaring a polynomial
 * simply interleaves a zero bit between each pair of input bits, so the
 * square of the 4-bit value i is the 8-bit value SQR_tb[i]. */
static const BN_ULONG SQR_tb[16] =
	{ 0, 1, 4, 5, 16, 17, 20, 21,
	64, 65, 68, 69, 80, 81, 84, 85 };
/* Platform-specific macros to accelerate squaring:
 * SQR1(w) squares the upper half of word w, SQR0(w) the lower half,
 * each producing one full output word via four-bit table lookups. */
#if defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG)
#define SQR1(w) \
    SQR_tb[(w) >> 60 & 0xF] << 56 | SQR_tb[(w) >> 56 & 0xF] << 48 | \
    SQR_tb[(w) >> 52 & 0xF] << 40 | SQR_tb[(w) >> 48 & 0xF] << 32 | \
    SQR_tb[(w) >> 44 & 0xF] << 24 | SQR_tb[(w) >> 40 & 0xF] << 16 | \
    SQR_tb[(w) >> 36 & 0xF] <<  8 | SQR_tb[(w) >> 32 & 0xF]
#define SQR0(w) \
    SQR_tb[(w) >> 28 & 0xF] << 56 | SQR_tb[(w) >> 24 & 0xF] << 48 | \
    SQR_tb[(w) >> 20 & 0xF] << 40 | SQR_tb[(w) >> 16 & 0xF] << 32 | \
    SQR_tb[(w) >> 12 & 0xF] << 24 | SQR_tb[(w) >>  8 & 0xF] << 16 | \
    SQR_tb[(w) >>  4 & 0xF] <<  8 | SQR_tb[(w)       & 0xF]
#endif
#ifdef THIRTY_TWO_BIT
#define SQR1(w) \
    SQR_tb[(w) >> 28 & 0xF] << 24 | SQR_tb[(w) >> 24 & 0xF] << 16 | \
    SQR_tb[(w) >> 20 & 0xF] <<  8 | SQR_tb[(w) >> 16 & 0xF]
#define SQR0(w) \
    SQR_tb[(w) >> 12 & 0xF] << 24 | SQR_tb[(w) >>  8 & 0xF] << 16 | \
    SQR_tb[(w) >>  4 & 0xF] <<  8 | SQR_tb[(w)       & 0xF]
#endif
126
127#if !defined(OPENSSL_BN_ASM_GF2m)
/* Product of two polynomials a, b each with degree < BN_BITS2 - 1,
 * result is a polynomial r with degree < 2 * BN_BITS - 1
 * The caller MUST ensure that the variables have the right amount
 * of space allocated.
 *
 * Carry-less (GF(2)[x]) multiplication using a 3-bit-window table of
 * multiples of a; the top two bits of a are masked off first and
 * compensated at the end so the table entries never overflow a word. */
#ifdef THIRTY_TWO_BIT
static void bn_GF2m_mul_1x1(BN_ULONG *r1, BN_ULONG *r0, const BN_ULONG a, const BN_ULONG b)
	{
	register BN_ULONG h, l, s;
	BN_ULONG tab[8], top2b = a >> 30;
	register BN_ULONG a1, a2, a4;

	/* a1/a2/a4 are a (top two bits cleared) times x^0, x^1, x^2. */
	a1 = a & (0x3FFFFFFF); a2 = a1 << 1; a4 = a2 << 1;

	/* tab[i] = (polynomial i) * a1, for every 3-bit polynomial i. */
	tab[0] =  0; tab[1] = a1;    tab[2] = a2;    tab[3] = a1^a2;
	tab[4] = a4; tab[5] = a1^a4; tab[6] = a2^a4; tab[7] = a1^a2^a4;

	/* Scan b three bits at a time, accumulating shifted table entries
	 * into the 64-bit result (h:l). */
	s = tab[b       & 0x7]; l  = s;
	s = tab[b >>  3 & 0x7]; l ^= s <<  3; h  = s >> 29;
	s = tab[b >>  6 & 0x7]; l ^= s <<  6; h ^= s >> 26;
	s = tab[b >>  9 & 0x7]; l ^= s <<  9; h ^= s >> 23;
	s = tab[b >> 12 & 0x7]; l ^= s << 12; h ^= s >> 20;
	s = tab[b >> 15 & 0x7]; l ^= s << 15; h ^= s >> 17;
	s = tab[b >> 18 & 0x7]; l ^= s << 18; h ^= s >> 14;
	s = tab[b >> 21 & 0x7]; l ^= s << 21; h ^= s >> 11;
	s = tab[b >> 24 & 0x7]; l ^= s << 24; h ^= s >>  8;
	s = tab[b >> 27 & 0x7]; l ^= s << 27; h ^= s >>  5;
	s = tab[b >> 30      ]; l ^= s << 30; h ^= s >>  2;

	/* compensate for the top two bits of a */

	if (top2b & 01) { l ^= b << 30; h ^= b >> 2; }
	if (top2b & 02) { l ^= b << 31; h ^= b >> 1; }

	*r1 = h; *r0 = l;
	}
#endif
#if defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG)
/* 64-bit variant of bn_GF2m_mul_1x1: carry-less multiply of two word-size
 * polynomials via a 4-bit-window table; the top three bits of a are masked
 * off so table entries fit in one word, then compensated at the end.
 * Writes the 128-bit product into *r1 (high word) and *r0 (low word). */
static void bn_GF2m_mul_1x1(BN_ULONG *r1, BN_ULONG *r0, const BN_ULONG a, const BN_ULONG b)
	{
	register BN_ULONG h, l, s;
	BN_ULONG tab[16], top3b = a >> 61;
	register BN_ULONG a1, a2, a4, a8;

	/* a1/a2/a4/a8 are a (top three bits cleared) times x^0..x^3. */
	a1 = a & (0x1FFFFFFFFFFFFFFFULL); a2 = a1 << 1; a4 = a2 << 1; a8 = a4 << 1;

	/* tab[i] = (polynomial i) * a1, for every 4-bit polynomial i. */
	tab[ 0] = 0;     tab[ 1] = a1;       tab[ 2] = a2;       tab[ 3] = a1^a2;
	tab[ 4] = a4;    tab[ 5] = a1^a4;    tab[ 6] = a2^a4;    tab[ 7] = a1^a2^a4;
	tab[ 8] = a8;    tab[ 9] = a1^a8;    tab[10] = a2^a8;    tab[11] = a1^a2^a8;
	tab[12] = a4^a8; tab[13] = a1^a4^a8; tab[14] = a2^a4^a8; tab[15] = a1^a2^a4^a8;

	/* Scan b four bits at a time, accumulating into (h:l). */
	s = tab[b       & 0xF]; l  = s;
	s = tab[b >>  4 & 0xF]; l ^= s <<  4; h  = s >> 60;
	s = tab[b >>  8 & 0xF]; l ^= s <<  8; h ^= s >> 56;
	s = tab[b >> 12 & 0xF]; l ^= s << 12; h ^= s >> 52;
	s = tab[b >> 16 & 0xF]; l ^= s << 16; h ^= s >> 48;
	s = tab[b >> 20 & 0xF]; l ^= s << 20; h ^= s >> 44;
	s = tab[b >> 24 & 0xF]; l ^= s << 24; h ^= s >> 40;
	s = tab[b >> 28 & 0xF]; l ^= s << 28; h ^= s >> 36;
	s = tab[b >> 32 & 0xF]; l ^= s << 32; h ^= s >> 32;
	s = tab[b >> 36 & 0xF]; l ^= s << 36; h ^= s >> 28;
	s = tab[b >> 40 & 0xF]; l ^= s << 40; h ^= s >> 24;
	s = tab[b >> 44 & 0xF]; l ^= s << 44; h ^= s >> 20;
	s = tab[b >> 48 & 0xF]; l ^= s << 48; h ^= s >> 16;
	s = tab[b >> 52 & 0xF]; l ^= s << 52; h ^= s >> 12;
	s = tab[b >> 56 & 0xF]; l ^= s << 56; h ^= s >>  8;
	s = tab[b >> 60      ]; l ^= s << 60; h ^= s >>  4;

	/* compensate for the top three bits of a */

	if (top3b & 01) { l ^= b << 61; h ^= b >> 3; }
	if (top3b & 02) { l ^= b << 62; h ^= b >> 2; }
	if (top3b & 04) { l ^= b << 63; h ^= b >> 1; }

	*r1 = h; *r0 = l;
	}
#endif
205
/* Product of two polynomials a, b each with degree < 2 * BN_BITS2 - 1,
 * result is a polynomial r with degree < 4 * BN_BITS2 - 1
 * The caller MUST ensure that the variables have the right amount
 * of space allocated.
 *
 * Karatsuba over GF(2): three 1x1 carry-less multiplications instead of
 * four, followed by the XOR correction of the two middle words. */
static void bn_GF2m_mul_2x2(BN_ULONG *r, const BN_ULONG a1, const BN_ULONG a0, const BN_ULONG b1, const BN_ULONG b0)
	{
	BN_ULONG m1, m0;
	/* r[3] = h1, r[2] = h0; r[1] = l1; r[0] = l0 */
	bn_GF2m_mul_1x1(r+3, r+2, a1, b1);
	bn_GF2m_mul_1x1(r+1, r, a0, b0);
	bn_GF2m_mul_1x1(&m1, &m0, a0 ^ a1, b0 ^ b1);
	/* Correction on m1 ^= l1 ^ h1; m0 ^= l0 ^ h0; */
	r[2] ^= m1 ^ r[1] ^ r[3];  /* h0 ^= m1 ^ l1 ^ h1; */
	r[1] = r[3] ^ r[2] ^ r[0] ^ m1 ^ m0;  /* l1 ^= l0 ^ h0 ^ m0; */
	}
222#else
223void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
224#endif
225
226/* Add polynomials a and b and store result in r; r could be a or b, a and b
227 * could be equal; r is the bitwise XOR of a and b.
228 */
229int BN_GF2m_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
230 {
231 int i;
232 const BIGNUM *at, *bt;
233
234 bn_check_top(a);
235 bn_check_top(b);
236
237 if (a->top < b->top) { at = b; bt = a; }
238 else { at = a; bt = b; }
239
240 if(bn_wexpand(r, at->top) == NULL)
241 return 0;
242
243 for (i = 0; i < bt->top; i++)
244 {
245 r->d[i] = at->d[i] ^ bt->d[i];
246 }
247 for (; i < at->top; i++)
248 {
249 r->d[i] = at->d[i];
250 }
251
252 r->top = at->top;
253 bn_correct_top(r);
254
255 return 1;
256 }
257
258
259/* Some functions allow for representation of the irreducible polynomials
260 * as an int[], say p. The irreducible f(t) is then of the form:
261 * t^p[0] + t^p[1] + ... + t^p[k]
262 * where m = p[0] > p[1] > ... > p[k] = 0.
263 */
264
265
266/* Performs modular reduction of a and store result in r. r could be a. */
267int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[])
268 {
269 int j, k;
270 int n, dN, d0, d1;
271 BN_ULONG zz, *z;
272
273 bn_check_top(a);
274
275 if (!p[0])
276 {
277 /* reduction mod 1 => return 0 */
278 BN_zero(r);
279 return 1;
280 }
281
282 /* Since the algorithm does reduction in the r value, if a != r, copy
283 * the contents of a into r so we can do reduction in r.
284 */
285 if (a != r)
286 {
287 if (!bn_wexpand(r, a->top)) return 0;
288 for (j = 0; j < a->top; j++)
289 {
290 r->d[j] = a->d[j];
291 }
292 r->top = a->top;
293 }
294 z = r->d;
295
296 /* start reduction */
297 dN = p[0] / BN_BITS2;
298 for (j = r->top - 1; j > dN;)
299 {
300 zz = z[j];
301 if (z[j] == 0) { j--; continue; }
302 z[j] = 0;
303
304 for (k = 1; p[k] != 0; k++)
305 {
306 /* reducing component t^p[k] */
307 n = p[0] - p[k];
308 d0 = n % BN_BITS2; d1 = BN_BITS2 - d0;
309 n /= BN_BITS2;
310 z[j-n] ^= (zz>>d0);
311 if (d0) z[j-n-1] ^= (zz<<d1);
312 }
313
314 /* reducing component t^0 */
315 n = dN;
316 d0 = p[0] % BN_BITS2;
317 d1 = BN_BITS2 - d0;
318 z[j-n] ^= (zz >> d0);
319 if (d0) z[j-n-1] ^= (zz << d1);
320 }
321
322 /* final round of reduction */
323 while (j == dN)
324 {
325
326 d0 = p[0] % BN_BITS2;
327 zz = z[dN] >> d0;
328 if (zz == 0) break;
329 d1 = BN_BITS2 - d0;
330
331 /* clear up the top d1 bits */
332 if (d0)
333 z[dN] = (z[dN] << d1) >> d1;
334 else
335 z[dN] = 0;
336 z[0] ^= zz; /* reduction t^0 component */
337
338 for (k = 1; p[k] != 0; k++)
339 {
340 BN_ULONG tmp_ulong;
341
342 /* reducing component t^p[k]*/
343 n = p[k] / BN_BITS2;
344 d0 = p[k] % BN_BITS2;
345 d1 = BN_BITS2 - d0;
346 z[n] ^= (zz << d0);
347 tmp_ulong = zz >> d1;
348 if (d0 && tmp_ulong)
349 z[n+1] ^= tmp_ulong;
350 }
351
352
353 }
354
355 bn_correct_top(r);
356 return 1;
357 }
358
359/* Performs modular reduction of a by p and store result in r. r could be a.
360 *
361 * This function calls down to the BN_GF2m_mod_arr implementation; this wrapper
362 * function is only provided for convenience; for best performance, use the
363 * BN_GF2m_mod_arr function.
364 */
365int BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p)
366 {
367 int ret = 0;
368 int arr[6];
369 bn_check_top(a);
370 bn_check_top(p);
371 ret = BN_GF2m_poly2arr(p, arr, sizeof(arr)/sizeof(arr[0]));
372 if (!ret || ret > (int)(sizeof(arr)/sizeof(arr[0])))
373 {
374 BNerr(BN_F_BN_GF2M_MOD,BN_R_INVALID_LENGTH);
375 return 0;
376 }
377 ret = BN_GF2m_mod_arr(r, a, arr);
378 bn_check_top(r);
379 return ret;
380 }
381
382
/* Compute the product of two polynomials a and b, reduce modulo p, and store
 * the result in r.  r could be a or b; a could be b.
 * p[] is the sparse exponent array of the irreducible polynomial
 * (see BN_GF2m_poly2arr).  Returns 1 on success, 0 on error. */
int BN_GF2m_mod_mul_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const int p[], BN_CTX *ctx)
	{
	int zlen, i, j, k, ret = 0;
	BIGNUM *s;
	BN_ULONG x1, x0, y1, y0, zz[4];

	bn_check_top(a);
	bn_check_top(b);

	/* Squaring has a much cheaper dedicated routine. */
	if (a == b)
		{
		return BN_GF2m_mod_sqr_arr(r, a, p, ctx);
		}

	BN_CTX_start(ctx);
	if ((s = BN_CTX_get(ctx)) == NULL) goto err;

	/* Scratch product: room for a*b plus the 2x2 block overhang. */
	zlen = a->top + b->top + 4;
	if (!bn_wexpand(s, zlen)) goto err;
	s->top = zlen;

	for (i = 0; i < zlen; i++) s->d[i] = 0;

	/* Schoolbook multiplication over GF(2), two words at a time:
	 * each 2x2 carry-less block product is XORed into the accumulator
	 * at its word offset i+j.  Odd tops are padded with a zero word. */
	for (j = 0; j < b->top; j += 2)
		{
		y0 = b->d[j];
		y1 = ((j+1) == b->top) ? 0 : b->d[j+1];
		for (i = 0; i < a->top; i += 2)
			{
			x0 = a->d[i];
			x1 = ((i+1) == a->top) ? 0 : a->d[i+1];
			bn_GF2m_mul_2x2(zz, x1, x0, y1, y0);
			for (k = 0; k < 4; k++) s->d[i+j+k] ^= zz[k];
			}
		}

	bn_correct_top(s);
	/* Reduce the double-length product modulo p into r. */
	if (BN_GF2m_mod_arr(r, s, p))
		ret = 1;
	bn_check_top(r);

err:
	BN_CTX_end(ctx);
	return ret;
	}
431
432/* Compute the product of two polynomials a and b, reduce modulo p, and store
433 * the result in r. r could be a or b; a could equal b.
434 *
435 * This function calls down to the BN_GF2m_mod_mul_arr implementation; this wrapper
436 * function is only provided for convenience; for best performance, use the
437 * BN_GF2m_mod_mul_arr function.
438 */
439int BN_GF2m_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *p, BN_CTX *ctx)
440 {
441 int ret = 0;
442 const int max = BN_num_bits(p) + 1;
443 int *arr=NULL;
444 bn_check_top(a);
445 bn_check_top(b);
446 bn_check_top(p);
447 if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
448 ret = BN_GF2m_poly2arr(p, arr, max);
449 if (!ret || ret > max)
450 {
451 BNerr(BN_F_BN_GF2M_MOD_MUL,BN_R_INVALID_LENGTH);
452 goto err;
453 }
454 ret = BN_GF2m_mod_mul_arr(r, a, b, arr, ctx);
455 bn_check_top(r);
456err:
457 if (arr) OPENSSL_free(arr);
458 return ret;
459 }
460
461
/* Square a, reduce the result mod p, and store it in r. r could be a. */
463int BN_GF2m_mod_sqr_arr(BIGNUM *r, const BIGNUM *a, const int p[], BN_CTX *ctx)
464 {
465 int i, ret = 0;
466 BIGNUM *s;
467
468 bn_check_top(a);
469 BN_CTX_start(ctx);
470 if ((s = BN_CTX_get(ctx)) == NULL) return 0;
471 if (!bn_wexpand(s, 2 * a->top)) goto err;
472
473 for (i = a->top - 1; i >= 0; i--)
474 {
475 s->d[2*i+1] = SQR1(a->d[i]);
476 s->d[2*i ] = SQR0(a->d[i]);
477 }
478
479 s->top = 2 * a->top;
480 bn_correct_top(s);
481 if (!BN_GF2m_mod_arr(r, s, p)) goto err;
482 bn_check_top(r);
483 ret = 1;
484err:
485 BN_CTX_end(ctx);
486 return ret;
487 }
488
/* Square a, reduce the result mod p, and store it in r. r could be a.
490 *
491 * This function calls down to the BN_GF2m_mod_sqr_arr implementation; this wrapper
492 * function is only provided for convenience; for best performance, use the
493 * BN_GF2m_mod_sqr_arr function.
494 */
495int BN_GF2m_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
496 {
497 int ret = 0;
498 const int max = BN_num_bits(p) + 1;
499 int *arr=NULL;
500
501 bn_check_top(a);
502 bn_check_top(p);
503 if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
504 ret = BN_GF2m_poly2arr(p, arr, max);
505 if (!ret || ret > max)
506 {
507 BNerr(BN_F_BN_GF2M_MOD_SQR,BN_R_INVALID_LENGTH);
508 goto err;
509 }
510 ret = BN_GF2m_mod_sqr_arr(r, a, arr, ctx);
511 bn_check_top(r);
512err:
513 if (arr) OPENSSL_free(arr);
514 return ret;
515 }
516
517
518/* Invert a, reduce modulo p, and store the result in r. r could be a.
519 * Uses Modified Almost Inverse Algorithm (Algorithm 10) from
520 * Hankerson, D., Hernandez, J.L., and Menezes, A. "Software Implementation
521 * of Elliptic Curve Cryptography Over Binary Fields".
522 */
523int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
524 {
525 BIGNUM *b, *c = NULL, *u = NULL, *v = NULL, *tmp;
526 int ret = 0;
527
528 bn_check_top(a);
529 bn_check_top(p);
530
531 BN_CTX_start(ctx);
532
533 if ((b = BN_CTX_get(ctx))==NULL) goto err;
534 if ((c = BN_CTX_get(ctx))==NULL) goto err;
535 if ((u = BN_CTX_get(ctx))==NULL) goto err;
536 if ((v = BN_CTX_get(ctx))==NULL) goto err;
537
538 if (!BN_GF2m_mod(u, a, p)) goto err;
539 if (BN_is_zero(u)) goto err;
540
541 if (!BN_copy(v, p)) goto err;
542#if 0
543 if (!BN_one(b)) goto err;
544
545 while (1)
546 {
547 while (!BN_is_odd(u))
548 {
549 if (BN_is_zero(u)) goto err;
550 if (!BN_rshift1(u, u)) goto err;
551 if (BN_is_odd(b))
552 {
553 if (!BN_GF2m_add(b, b, p)) goto err;
554 }
555 if (!BN_rshift1(b, b)) goto err;
556 }
557
558 if (BN_abs_is_word(u, 1)) break;
559
560 if (BN_num_bits(u) < BN_num_bits(v))
561 {
562 tmp = u; u = v; v = tmp;
563 tmp = b; b = c; c = tmp;
564 }
565
566 if (!BN_GF2m_add(u, u, v)) goto err;
567 if (!BN_GF2m_add(b, b, c)) goto err;
568 }
569#else
570 {
571 int i, ubits = BN_num_bits(u),
572 vbits = BN_num_bits(v), /* v is copy of p */
573 top = p->top;
574 BN_ULONG *udp,*bdp,*vdp,*cdp;
575
576 bn_wexpand(u,top); udp = u->d;
577 for (i=u->top;i<top;i++) udp[i] = 0;
578 u->top = top;
579 bn_wexpand(b,top); bdp = b->d;
580 bdp[0] = 1;
581 for (i=1;i<top;i++) bdp[i] = 0;
582 b->top = top;
583 bn_wexpand(c,top); cdp = c->d;
584 for (i=0;i<top;i++) cdp[i] = 0;
585 c->top = top;
586 vdp = v->d; /* It pays off to "cache" *->d pointers, because
587 * it allows optimizer to be more aggressive.
588 * But we don't have to "cache" p->d, because *p
589 * is declared 'const'... */
590 while (1)
591 {
592 while (ubits && !(udp[0]&1))
593 {
594 BN_ULONG u0,u1,b0,b1,mask;
595
596 u0 = udp[0];
597 b0 = bdp[0];
598 mask = (BN_ULONG)0-(b0&1);
599 b0 ^= p->d[0]&mask;
600 for (i=0;i<top-1;i++)
601 {
602 u1 = udp[i+1];
603 udp[i] = ((u0>>1)|(u1<<(BN_BITS2-1)))&BN_MASK2;
604 u0 = u1;
605 b1 = bdp[i+1]^(p->d[i+1]&mask);
606 bdp[i] = ((b0>>1)|(b1<<(BN_BITS2-1)))&BN_MASK2;
607 b0 = b1;
608 }
609 udp[i] = u0>>1;
610 bdp[i] = b0>>1;
611 ubits--;
612 }
613
614 if (ubits<=BN_BITS2 && udp[0]==1) break;
615
616 if (ubits<vbits)
617 {
618 i = ubits; ubits = vbits; vbits = i;
619 tmp = u; u = v; v = tmp;
620 tmp = b; b = c; c = tmp;
621 udp = vdp; vdp = v->d;
622 bdp = cdp; cdp = c->d;
623 }
624 for(i=0;i<top;i++)
625 {
626 udp[i] ^= vdp[i];
627 bdp[i] ^= cdp[i];
628 }
629 if (ubits==vbits)
630 {
631 BN_ULONG ul;
632 int utop = (ubits-1)/BN_BITS2;
633
634 while ((ul=udp[utop])==0 && utop) utop--;
635 ubits = utop*BN_BITS2 + BN_num_bits_word(ul);
636 }
637 }
638 bn_correct_top(b);
639 }
640#endif
641
642 if (!BN_copy(r, b)) goto err;
643 bn_check_top(r);
644 ret = 1;
645
646err:
647#ifdef BN_DEBUG /* BN_CTX_end would complain about the expanded form */
648 bn_correct_top(c);
649 bn_correct_top(u);
650 bn_correct_top(v);
651#endif
652 BN_CTX_end(ctx);
653 return ret;
654 }
655
656/* Invert xx, reduce modulo p, and store the result in r. r could be xx.
657 *
658 * This function calls down to the BN_GF2m_mod_inv implementation; this wrapper
659 * function is only provided for convenience; for best performance, use the
660 * BN_GF2m_mod_inv function.
661 */
662int BN_GF2m_mod_inv_arr(BIGNUM *r, const BIGNUM *xx, const int p[], BN_CTX *ctx)
663 {
664 BIGNUM *field;
665 int ret = 0;
666
667 bn_check_top(xx);
668 BN_CTX_start(ctx);
669 if ((field = BN_CTX_get(ctx)) == NULL) goto err;
670 if (!BN_GF2m_arr2poly(p, field)) goto err;
671
672 ret = BN_GF2m_mod_inv(r, xx, field, ctx);
673 bn_check_top(r);
674
675err:
676 BN_CTX_end(ctx);
677 return ret;
678 }
679
680
681#ifndef OPENSSL_SUN_GF2M_DIV
682/* Divide y by x, reduce modulo p, and store the result in r. r could be x
683 * or y, x could equal y.
684 */
685int BN_GF2m_mod_div(BIGNUM *r, const BIGNUM *y, const BIGNUM *x, const BIGNUM *p, BN_CTX *ctx)
686 {
687 BIGNUM *xinv = NULL;
688 int ret = 0;
689
690 bn_check_top(y);
691 bn_check_top(x);
692 bn_check_top(p);
693
694 BN_CTX_start(ctx);
695 xinv = BN_CTX_get(ctx);
696 if (xinv == NULL) goto err;
697
698 if (!BN_GF2m_mod_inv(xinv, x, p, ctx)) goto err;
699 if (!BN_GF2m_mod_mul(r, y, xinv, p, ctx)) goto err;
700 bn_check_top(r);
701 ret = 1;
702
703err:
704 BN_CTX_end(ctx);
705 return ret;
706 }
707#else
708/* Divide y by x, reduce modulo p, and store the result in r. r could be x
709 * or y, x could equal y.
710 * Uses algorithm Modular_Division_GF(2^m) from
711 * Chang-Shantz, S. "From Euclid's GCD to Montgomery Multiplication to
712 * the Great Divide".
713 */
int BN_GF2m_mod_div(BIGNUM *r, const BIGNUM *y, const BIGNUM *x, const BIGNUM *p, BN_CTX *ctx)
	{
	BIGNUM *a, *b, *u, *v;
	int ret = 0;

	bn_check_top(y);
	bn_check_top(x);
	bn_check_top(p);

	BN_CTX_start(ctx);

	/* Once one BN_CTX_get fails, all later calls return NULL too,
	 * so checking the last one covers all four. */
	a = BN_CTX_get(ctx);
	b = BN_CTX_get(ctx);
	u = BN_CTX_get(ctx);
	v = BN_CTX_get(ctx);
	if (v == NULL) goto err;

	/* reduce x and y mod p */
	if (!BN_GF2m_mod(u, y, p)) goto err;
	if (!BN_GF2m_mod(a, x, p)) goto err;
	if (!BN_copy(b, p)) goto err;

	/* Make a odd: each halving of a is mirrored on u, adding p
	 * first when u is odd so the division stays exact mod p. */
	while (!BN_is_odd(a))
		{
		if (!BN_rshift1(a, a)) goto err;
		if (BN_is_odd(u)) if (!BN_GF2m_add(u, u, p)) goto err;
		if (!BN_rshift1(u, u)) goto err;
		}

	/* Binary-GCD-style loop of the Chang-Shantz algorithm cited
	 * above; terminates when a reaches 1, at which point u holds
	 * the quotient y/x mod p. */
	do
		{
		if (BN_GF2m_cmp(b, a) > 0)
			{
			if (!BN_GF2m_add(b, b, a)) goto err;
			if (!BN_GF2m_add(v, v, u)) goto err;
			do
				{
				if (!BN_rshift1(b, b)) goto err;
				if (BN_is_odd(v)) if (!BN_GF2m_add(v, v, p)) goto err;
				if (!BN_rshift1(v, v)) goto err;
				} while (!BN_is_odd(b));
			}
		else if (BN_abs_is_word(a, 1))
			break;
		else
			{
			if (!BN_GF2m_add(a, a, b)) goto err;
			if (!BN_GF2m_add(u, u, v)) goto err;
			do
				{
				if (!BN_rshift1(a, a)) goto err;
				if (BN_is_odd(u)) if (!BN_GF2m_add(u, u, p)) goto err;
				if (!BN_rshift1(u, u)) goto err;
				} while (!BN_is_odd(a));
			}
		} while (1);

	if (!BN_copy(r, u)) goto err;
	bn_check_top(r);
	ret = 1;

err:
	BN_CTX_end(ctx);
	return ret;
	}
779#endif
780
781/* Divide yy by xx, reduce modulo p, and store the result in r. r could be xx
782 * or yy, xx could equal yy.
783 *
784 * This function calls down to the BN_GF2m_mod_div implementation; this wrapper
785 * function is only provided for convenience; for best performance, use the
786 * BN_GF2m_mod_div function.
787 */
788int BN_GF2m_mod_div_arr(BIGNUM *r, const BIGNUM *yy, const BIGNUM *xx, const int p[], BN_CTX *ctx)
789 {
790 BIGNUM *field;
791 int ret = 0;
792
793 bn_check_top(yy);
794 bn_check_top(xx);
795
796 BN_CTX_start(ctx);
797 if ((field = BN_CTX_get(ctx)) == NULL) goto err;
798 if (!BN_GF2m_arr2poly(p, field)) goto err;
799
800 ret = BN_GF2m_mod_div(r, yy, xx, field, ctx);
801 bn_check_top(r);
802
803err:
804 BN_CTX_end(ctx);
805 return ret;
806 }
807
808
809/* Compute the bth power of a, reduce modulo p, and store
810 * the result in r. r could be a.
811 * Uses simple square-and-multiply algorithm A.5.1 from IEEE P1363.
812 */
int BN_GF2m_mod_exp_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const int p[], BN_CTX *ctx)
	{
	int ret = 0, i, n;
	BIGNUM *u;

	bn_check_top(a);
	bn_check_top(b);

	/* a^0 = 1 */
	if (BN_is_zero(b))
		return(BN_one(r));

	/* a^1 = a */
	if (BN_abs_is_word(b, 1))
		return (BN_copy(r, a) != NULL);

	BN_CTX_start(ctx);
	if ((u = BN_CTX_get(ctx)) == NULL) goto err;

	/* u = a mod p; seed for the square-and-multiply chain. */
	if (!BN_GF2m_mod_arr(u, a, p)) goto err;

	/* Left-to-right square-and-multiply over the bits of b,
	 * starting just below the most significant set bit (u already
	 * accounts for that bit). */
	n = BN_num_bits(b) - 1;
	for (i = n - 1; i >= 0; i--)
		{
		if (!BN_GF2m_mod_sqr_arr(u, u, p, ctx)) goto err;
		if (BN_is_bit_set(b, i))
			{
			if (!BN_GF2m_mod_mul_arr(u, u, a, p, ctx)) goto err;
			}
		}
	if (!BN_copy(r, u)) goto err;
	bn_check_top(r);
	ret = 1;
err:
	BN_CTX_end(ctx);
	return ret;
	}
848
849/* Compute the bth power of a, reduce modulo p, and store
850 * the result in r. r could be a.
851 *
852 * This function calls down to the BN_GF2m_mod_exp_arr implementation; this wrapper
853 * function is only provided for convenience; for best performance, use the
854 * BN_GF2m_mod_exp_arr function.
855 */
856int BN_GF2m_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *p, BN_CTX *ctx)
857 {
858 int ret = 0;
859 const int max = BN_num_bits(p) + 1;
860 int *arr=NULL;
861 bn_check_top(a);
862 bn_check_top(b);
863 bn_check_top(p);
864 if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
865 ret = BN_GF2m_poly2arr(p, arr, max);
866 if (!ret || ret > max)
867 {
868 BNerr(BN_F_BN_GF2M_MOD_EXP,BN_R_INVALID_LENGTH);
869 goto err;
870 }
871 ret = BN_GF2m_mod_exp_arr(r, a, b, arr, ctx);
872 bn_check_top(r);
873err:
874 if (arr) OPENSSL_free(arr);
875 return ret;
876 }
877
878/* Compute the square root of a, reduce modulo p, and store
879 * the result in r. r could be a.
880 * Uses exponentiation as in algorithm A.4.1 from IEEE P1363.
881 */
882int BN_GF2m_mod_sqrt_arr(BIGNUM *r, const BIGNUM *a, const int p[], BN_CTX *ctx)
883 {
884 int ret = 0;
885 BIGNUM *u;
886
887 bn_check_top(a);
888
889 if (!p[0])
890 {
891 /* reduction mod 1 => return 0 */
892 BN_zero(r);
893 return 1;
894 }
895
896 BN_CTX_start(ctx);
897 if ((u = BN_CTX_get(ctx)) == NULL) goto err;
898
899 if (!BN_set_bit(u, p[0] - 1)) goto err;
900 ret = BN_GF2m_mod_exp_arr(r, a, u, p, ctx);
901 bn_check_top(r);
902
903err:
904 BN_CTX_end(ctx);
905 return ret;
906 }
907
908/* Compute the square root of a, reduce modulo p, and store
909 * the result in r. r could be a.
910 *
911 * This function calls down to the BN_GF2m_mod_sqrt_arr implementation; this wrapper
912 * function is only provided for convenience; for best performance, use the
913 * BN_GF2m_mod_sqrt_arr function.
914 */
915int BN_GF2m_mod_sqrt(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
916 {
917 int ret = 0;
918 const int max = BN_num_bits(p) + 1;
919 int *arr=NULL;
920 bn_check_top(a);
921 bn_check_top(p);
922 if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
923 ret = BN_GF2m_poly2arr(p, arr, max);
924 if (!ret || ret > max)
925 {
926 BNerr(BN_F_BN_GF2M_MOD_SQRT,BN_R_INVALID_LENGTH);
927 goto err;
928 }
929 ret = BN_GF2m_mod_sqrt_arr(r, a, arr, ctx);
930 bn_check_top(r);
931err:
932 if (arr) OPENSSL_free(arr);
933 return ret;
934 }
935
936/* Find r such that r^2 + r = a mod p. r could be a. If no r exists returns 0.
937 * Uses algorithms A.4.7 and A.4.6 from IEEE P1363.
938 */
int BN_GF2m_mod_solve_quad_arr(BIGNUM *r, const BIGNUM *a_, const int p[], BN_CTX *ctx)
	{
	int ret = 0, count = 0, j;
	BIGNUM *a, *z, *rho, *w, *w2, *tmp;

	bn_check_top(a_);

	if (!p[0])
		{
		/* reduction mod 1 => return 0 */
		BN_zero(r);
		return 1;
		}

	BN_CTX_start(ctx);
	/* Only the last BN_CTX_get needs checking: once one fails,
	 * the rest return NULL as well. */
	a = BN_CTX_get(ctx);
	z = BN_CTX_get(ctx);
	w = BN_CTX_get(ctx);
	if (w == NULL) goto err;

	if (!BN_GF2m_mod_arr(a, a_, p)) goto err;

	/* a == 0: r = 0 solves r^2 + r = 0 trivially. */
	if (BN_is_zero(a))
		{
		BN_zero(r);
		ret = 1;
		goto err;
		}

	if (p[0] & 0x1) /* m is odd */
		{
		/* compute half-trace of a: z = sum of a^(2^(2j)),
		 * j = 0 .. (m-1)/2 (algorithm A.4.6). */
		if (!BN_copy(z, a)) goto err;
		for (j = 1; j <= (p[0] - 1) / 2; j++)
			{
			if (!BN_GF2m_mod_sqr_arr(z, z, p, ctx)) goto err;
			if (!BN_GF2m_mod_sqr_arr(z, z, p, ctx)) goto err;
			if (!BN_GF2m_add(z, z, a)) goto err;
			}

		}
	else /* m is even */
		{
		rho = BN_CTX_get(ctx);
		w2 = BN_CTX_get(ctx);
		tmp = BN_CTX_get(ctx);
		if (tmp == NULL) goto err;
		/* Randomized search (algorithm A.4.7): pick random rho
		 * until the associated w is non-zero, giving a valid z;
		 * bounded by MAX_ITERATIONS attempts. */
		do
			{
			if (!BN_rand(rho, p[0], 0, 0)) goto err;
			if (!BN_GF2m_mod_arr(rho, rho, p)) goto err;
			BN_zero(z);
			if (!BN_copy(w, rho)) goto err;
			for (j = 1; j <= p[0] - 1; j++)
				{
				if (!BN_GF2m_mod_sqr_arr(z, z, p, ctx)) goto err;
				if (!BN_GF2m_mod_sqr_arr(w2, w, p, ctx)) goto err;
				if (!BN_GF2m_mod_mul_arr(tmp, w2, a, p, ctx)) goto err;
				if (!BN_GF2m_add(z, z, tmp)) goto err;
				if (!BN_GF2m_add(w, w2, rho)) goto err;
				}
			count++;
			} while (BN_is_zero(w) && (count < MAX_ITERATIONS));
		if (BN_is_zero(w))
			{
			BNerr(BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR,BN_R_TOO_MANY_ITERATIONS);
			goto err;
			}
		}

	/* Verify the candidate: z^2 + z must equal a, otherwise the
	 * quadratic has no solution (trace(a) != 0). */
	if (!BN_GF2m_mod_sqr_arr(w, z, p, ctx)) goto err;
	if (!BN_GF2m_add(w, z, w)) goto err;
	if (BN_GF2m_cmp(w, a))
		{
		BNerr(BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR, BN_R_NO_SOLUTION);
		goto err;
		}

	if (!BN_copy(r, z)) goto err;
	bn_check_top(r);

	ret = 1;

err:
	BN_CTX_end(ctx);
	return ret;
	}
1026
1027/* Find r such that r^2 + r = a mod p. r could be a. If no r exists returns 0.
1028 *
1029 * This function calls down to the BN_GF2m_mod_solve_quad_arr implementation; this wrapper
1030 * function is only provided for convenience; for best performance, use the
1031 * BN_GF2m_mod_solve_quad_arr function.
1032 */
1033int BN_GF2m_mod_solve_quad(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
1034 {
1035 int ret = 0;
1036 const int max = BN_num_bits(p) + 1;
1037 int *arr=NULL;
1038 bn_check_top(a);
1039 bn_check_top(p);
1040 if ((arr = (int *)OPENSSL_malloc(sizeof(int) *
1041 max)) == NULL) goto err;
1042 ret = BN_GF2m_poly2arr(p, arr, max);
1043 if (!ret || ret > max)
1044 {
1045 BNerr(BN_F_BN_GF2M_MOD_SOLVE_QUAD,BN_R_INVALID_LENGTH);
1046 goto err;
1047 }
1048 ret = BN_GF2m_mod_solve_quad_arr(r, a, arr, ctx);
1049 bn_check_top(r);
1050err:
1051 if (arr) OPENSSL_free(arr);
1052 return ret;
1053 }
1054
1055/* Convert the bit-string representation of a polynomial
1056 * ( \sum_{i=0}^n a_i * x^i) into an array of integers corresponding
1057 * to the bits with non-zero coefficient. Array is terminated with -1.
1058 * Up to max elements of the array will be filled. Return value is total
1059 * number of array elements that would be filled if array was large enough.
1060 */
1061int BN_GF2m_poly2arr(const BIGNUM *a, int p[], int max)
1062 {
1063 int i, j, k = 0;
1064 BN_ULONG mask;
1065
1066 if (BN_is_zero(a))
1067 return 0;
1068
1069 for (i = a->top - 1; i >= 0; i--)
1070 {
1071 if (!a->d[i])
1072 /* skip word if a->d[i] == 0 */
1073 continue;
1074 mask = BN_TBIT;
1075 for (j = BN_BITS2 - 1; j >= 0; j--)
1076 {
1077 if (a->d[i] & mask)
1078 {
1079 if (k < max) p[k] = BN_BITS2 * i + j;
1080 k++;
1081 }
1082 mask >>= 1;
1083 }
1084 }
1085
1086 if (k < max) {
1087 p[k] = -1;
1088 k++;
1089 }
1090
1091 return k;
1092 }
1093
1094/* Convert the coefficient array representation of a polynomial to a
1095 * bit-string. The array must be terminated by -1.
1096 */
1097int BN_GF2m_arr2poly(const int p[], BIGNUM *a)
1098 {
1099 int i;
1100
1101 bn_check_top(a);
1102 BN_zero(a);
1103 for (i = 0; p[i] != -1; i++)
1104 {
1105 if (BN_set_bit(a, p[i]) == 0)
1106 return 0;
1107 }
1108 bn_check_top(a);
1109
1110 return 1;
1111 }
1112
1113#endif
diff --git a/src/lib/libcrypto/bn/bn_kron.c b/src/lib/libcrypto/bn/bn_kron.c
deleted file mode 100644
index 740359b752..0000000000
--- a/src/lib/libcrypto/bn/bn_kron.c
+++ /dev/null
@@ -1,184 +0,0 @@
1/* crypto/bn/bn_kron.c */
2/* ====================================================================
3 * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 * This product includes cryptographic software written by Eric Young
51 * (eay@cryptsoft.com). This product includes software written by Tim
52 * Hudson (tjh@cryptsoft.com).
53 *
54 */
55
56#include "cryptlib.h"
57#include "bn_lcl.h"
58
59/* least significant word */
60#define BN_lsw(n) (((n)->top == 0) ? (BN_ULONG) 0 : (n)->d[0])
61
62/* Returns -2 for errors because both -1 and 0 are valid results. */
int BN_kronecker(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
	{
	int i;
	int ret = -2; /* avoid 'uninitialized' warning */
	int err = 0;
	BIGNUM *A, *B, *tmp;
	/* In 'tab', only odd-indexed entries are relevant:
	 * For any odd BIGNUM n,
	 *     tab[BN_lsw(n) & 7]
	 * is $(-1)^{(n^2-1)/8}$ (using TeX notation).
	 * Note that the sign of n does not matter.
	 */
	static const int tab[8] = {0, 1, 0, -1, 0, -1, 0, 1};

	bn_check_top(a);
	bn_check_top(b);

	BN_CTX_start(ctx);
	A = BN_CTX_get(ctx);
	B = BN_CTX_get(ctx);
	if (B == NULL) goto end;

	/* Work on copies; a and b are const. */
	err = !BN_copy(A, a);
	if (err) goto end;
	err = !BN_copy(B, b);
	if (err) goto end;

	/*
	 * Kronecker symbol, implemented according to Henri Cohen,
	 * "A Course in Computational Algebraic Number Theory"
	 * (algorithm 1.4.10).
	 */

	/* Cohen's step 1: */

	if (BN_is_zero(B))
		{
		ret = BN_abs_is_word(A, 1);
		goto end;
		}

	/* Cohen's step 2: */

	if (!BN_is_odd(A) && !BN_is_odd(B))
		{
		ret = 0;
		goto end;
		}

	/* now B is non-zero */
	i = 0;
	while (!BN_is_bit_set(B, i))
		i++;
	/* strip the factor 2^i out of B */
	err = !BN_rshift(B, B, i);
	if (err) goto end;
	if (i & 1)
		{
		/* i is odd */
		/* (thus B was even, thus A must be odd!) */

		/* set 'ret' to $(-1)^{(A^2-1)/8}$ */
		ret = tab[BN_lsw(A) & 7];
		}
	else
		{
		/* i is even */
		ret = 1;
		}

	/* (A/-B) differs from (A/B) only when A is negative. */
	if (B->neg)
		{
		B->neg = 0;
		if (A->neg)
			ret = -ret;
		}

	/* now B is positive and odd, so what remains to be done is
	 * to compute the Jacobi symbol (A/B) and multiply it by 'ret' */

	while (1)
		{
		/* Cohen's step 3: */

		/* B is positive and odd */

		if (BN_is_zero(A))
			{
			/* (0/B) is 1 iff B == 1, else 0 */
			ret = BN_is_one(B) ? ret : 0;
			goto end;
			}

		/* now A is non-zero */
		i = 0;
		while (!BN_is_bit_set(A, i))
			i++;
		/* strip the factor 2^i out of A */
		err = !BN_rshift(A, A, i);
		if (err) goto end;
		if (i & 1)
			{
			/* i is odd */
			/* multiply 'ret' by  $(-1)^{(B^2-1)/8}$ */
			ret = ret * tab[BN_lsw(B) & 7];
			}

		/* Cohen's step 4: */
		/* multiply 'ret' by  $(-1)^{(A-1)(B-1)/4}$ */
		if ((A->neg ? ~BN_lsw(A) : BN_lsw(A)) & BN_lsw(B) & 2)
			ret = -ret;

		/* (A, B) := (B mod |A|, |A|) */
		err = !BN_nnmod(B, B, A, ctx);
		if (err) goto end;
		tmp = A; A = B; B = tmp;
		tmp->neg = 0;
		}
end:
	BN_CTX_end(ctx);
	if (err)
		return -2;
	else
		return ret;
	}
diff --git a/src/lib/libcrypto/bn/bn_lcl.h b/src/lib/libcrypto/bn/bn_lcl.h
deleted file mode 100644
index eecfd8cc99..0000000000
--- a/src/lib/libcrypto/bn/bn_lcl.h
+++ /dev/null
@@ -1,508 +0,0 @@
1/* crypto/bn/bn_lcl.h */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58/* ====================================================================
59 * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
60 *
61 * Redistribution and use in source and binary forms, with or without
62 * modification, are permitted provided that the following conditions
63 * are met:
64 *
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 *
68 * 2. Redistributions in binary form must reproduce the above copyright
69 * notice, this list of conditions and the following disclaimer in
70 * the documentation and/or other materials provided with the
71 * distribution.
72 *
73 * 3. All advertising materials mentioning features or use of this
74 * software must display the following acknowledgment:
75 * "This product includes software developed by the OpenSSL Project
76 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
77 *
78 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
79 * endorse or promote products derived from this software without
80 * prior written permission. For written permission, please contact
81 * openssl-core@openssl.org.
82 *
83 * 5. Products derived from this software may not be called "OpenSSL"
84 * nor may "OpenSSL" appear in their names without prior written
85 * permission of the OpenSSL Project.
86 *
87 * 6. Redistributions of any form whatsoever must retain the following
88 * acknowledgment:
89 * "This product includes software developed by the OpenSSL Project
90 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
91 *
92 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
93 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
95 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
96 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
97 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
98 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
99 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
101 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
103 * OF THE POSSIBILITY OF SUCH DAMAGE.
104 * ====================================================================
105 *
106 * This product includes cryptographic software written by Eric Young
107 * (eay@cryptsoft.com). This product includes software written by Tim
108 * Hudson (tjh@cryptsoft.com).
109 *
110 */
111
112#ifndef HEADER_BN_LCL_H
113#define HEADER_BN_LCL_H
114
115#include <openssl/bn.h>
116
117#ifdef __cplusplus
118extern "C" {
119#endif
120
121
122/*
123 * BN_window_bits_for_exponent_size -- macro for sliding window mod_exp functions
124 *
125 *
126 * For window size 'w' (w >= 2) and a random 'b' bits exponent,
127 * the number of multiplications is a constant plus on average
128 *
129 * 2^(w-1) + (b-w)/(w+1);
130 *
131 * here 2^(w-1) is for precomputing the table (we actually need
132 * entries only for windows that have the lowest bit set), and
133 * (b-w)/(w+1) is an approximation for the expected number of
134 * w-bit windows, not counting the first one.
135 *
136 * Thus we should use
137 *
138 * w >= 6 if b > 671
139 * w = 5 if 671 > b > 239
140 * w = 4 if 239 > b > 79
141 * w = 3 if 79 > b > 23
142 * w <= 2 if 23 > b
143 *
144 * (with draws in between). Very small exponents are often selected
145 * with low Hamming weight, so we use w = 1 for b <= 23.
146 */
147#if 1
148#define BN_window_bits_for_exponent_size(b) \
149 ((b) > 671 ? 6 : \
150 (b) > 239 ? 5 : \
151 (b) > 79 ? 4 : \
152 (b) > 23 ? 3 : 1)
153#else
154/* Old SSLeay/OpenSSL table.
155 * Maximum window size was 5, so this table differs for b==1024;
156 * but it coincides for other interesting values (b==160, b==512).
157 */
158#define BN_window_bits_for_exponent_size(b) \
159 ((b) > 255 ? 5 : \
160 (b) > 127 ? 4 : \
161 (b) > 17 ? 3 : 1)
162#endif
163
164
165
166/* BN_mod_exp_mont_consttime is based on the assumption that the
167 * L1 data cache line width of the target processor is at least
168 * the following value.
169 */
170#define MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH ( 64 )
171#define MOD_EXP_CTIME_MIN_CACHE_LINE_MASK (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - 1)
172
173/* Window sizes optimized for fixed window size modular exponentiation
174 * algorithm (BN_mod_exp_mont_consttime).
175 *
176 * To achieve the security goals of BN_mod_exp_mont_consttime, the
177 * maximum size of the window must not exceed
178 * log_2(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH).
179 *
180 * Window size thresholds are defined for cache line sizes of 32 and 64,
181 * cache line sizes where log_2(32)=5 and log_2(64)=6 respectively. A
182 * window size of 7 should only be used on processors that have a 128
183 * byte or greater cache line size.
184 */
185#if MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 64
186
187# define BN_window_bits_for_ctime_exponent_size(b) \
188 ((b) > 937 ? 6 : \
189 (b) > 306 ? 5 : \
190 (b) > 89 ? 4 : \
191 (b) > 22 ? 3 : 1)
192# define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (6)
193
194#elif MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 32
195
196# define BN_window_bits_for_ctime_exponent_size(b) \
197 ((b) > 306 ? 5 : \
198 (b) > 89 ? 4 : \
199 (b) > 22 ? 3 : 1)
200# define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (5)
201
202#endif
203
204
205/* Pentium pro 16,16,16,32,64 */
206/* Alpha 16,16,16,16.64 */
207#define BN_MULL_SIZE_NORMAL (16) /* 32 */
208#define BN_MUL_RECURSIVE_SIZE_NORMAL (16) /* 32 less than */
209#define BN_SQR_RECURSIVE_SIZE_NORMAL (16) /* 32 */
210#define BN_MUL_LOW_RECURSIVE_SIZE_NORMAL (32) /* 32 */
211#define BN_MONT_CTX_SET_SIZE_WORD (64) /* 32 */
212
213#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) && !defined(PEDANTIC)
214/*
215 * BN_UMULT_HIGH section.
216 *
217 * No, I'm not trying to overwhelm you when stating that the
218 * product of N-bit numbers is 2*N bits wide:-) No, I don't expect
219 * you to be impressed when I say that if the compiler doesn't
220 * support 2*N integer type, then you have to replace every N*N
221 * multiplication with 4 (N/2)*(N/2) accompanied by some shifts
222 * and additions which unavoidably results in severe performance
223 * penalties. Of course provided that the hardware is capable of
224 * producing 2*N result... That's when you normally start
225 * considering assembler implementation. However! It should be
226 * pointed out that some CPUs (most notably Alpha, PowerPC and
227 * upcoming IA-64 family:-) provide *separate* instruction
228 * calculating the upper half of the product placing the result
229 * into a general purpose register. Now *if* the compiler supports
230 * inline assembler, then it's not impossible to implement the
231 * "bignum" routines (and have the compiler optimize 'em)
232 * exhibiting "native" performance in C. That's what BN_UMULT_HIGH
233 * macro is about:-)
234 *
235 * <appro@fy.chalmers.se>
236 */
237# if defined(__alpha) && (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
238# if defined(__DECC)
239# include <c_asm.h>
240# define BN_UMULT_HIGH(a,b) (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
241# elif defined(__GNUC__) && __GNUC__>=2
242# define BN_UMULT_HIGH(a,b) ({ \
243 register BN_ULONG ret; \
244 asm ("umulh %1,%2,%0" \
245 : "=r"(ret) \
246 : "r"(a), "r"(b)); \
247 ret; })
248# endif /* compiler */
249# elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
250# if defined(__GNUC__) && __GNUC__>=2
251# define BN_UMULT_HIGH(a,b) ({ \
252 register BN_ULONG ret; \
253 asm ("mulhdu %0,%1,%2" \
254 : "=r"(ret) \
255 : "r"(a), "r"(b)); \
256 ret; })
257# endif /* compiler */
258# elif (defined(__x86_64) || defined(__x86_64__)) && \
259 (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
260# if defined(__GNUC__) && __GNUC__>=2
261# define BN_UMULT_HIGH(a,b) ({ \
262 register BN_ULONG ret,discard; \
263 asm ("mulq %3" \
264 : "=a"(discard),"=d"(ret) \
265 : "a"(a), "g"(b) \
266 : "cc"); \
267 ret; })
268# define BN_UMULT_LOHI(low,high,a,b) \
269 asm ("mulq %3" \
270 : "=a"(low),"=d"(high) \
271 : "a"(a),"g"(b) \
272 : "cc");
273# endif
274# elif (defined(_M_AMD64) || defined(_M_X64)) && defined(SIXTY_FOUR_BIT)
275# if defined(_MSC_VER) && _MSC_VER>=1400
276 unsigned __int64 __umulh (unsigned __int64 a,unsigned __int64 b);
277 unsigned __int64 _umul128 (unsigned __int64 a,unsigned __int64 b,
278 unsigned __int64 *h);
279# pragma intrinsic(__umulh,_umul128)
280# define BN_UMULT_HIGH(a,b) __umulh((a),(b))
281# define BN_UMULT_LOHI(low,high,a,b) ((low)=_umul128((a),(b),&(high)))
282# endif
283# elif defined(__mips) && (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG))
284# if defined(__GNUC__) && __GNUC__>=2
285# define BN_UMULT_HIGH(a,b) ({ \
286 register BN_ULONG ret; \
287 asm ("dmultu %1,%2" \
288 : "=h"(ret) \
289 : "r"(a), "r"(b) : "l"); \
290 ret; })
291# define BN_UMULT_LOHI(low,high,a,b) \
292 asm ("dmultu %2,%3" \
293 : "=l"(low),"=h"(high) \
294 : "r"(a), "r"(b));
295# endif
296# endif /* cpu */
297#endif /* OPENSSL_NO_ASM */
298
299/*************************************************************
300 * Using the long long type
301 */
302#define Lw(t) (((BN_ULONG)(t))&BN_MASK2)
303#define Hw(t) (((BN_ULONG)((t)>>BN_BITS2))&BN_MASK2)
304
305#ifdef BN_DEBUG_RAND
306#define bn_clear_top2max(a) \
307 { \
308 int ind = (a)->dmax - (a)->top; \
309 BN_ULONG *ftl = &(a)->d[(a)->top-1]; \
310 for (; ind != 0; ind--) \
311 *(++ftl) = 0x0; \
312 }
313#else
314#define bn_clear_top2max(a)
315#endif
316
317#ifdef BN_LLONG
318#define mul_add(r,a,w,c) { \
319 BN_ULLONG t; \
320 t=(BN_ULLONG)w * (a) + (r) + (c); \
321 (r)= Lw(t); \
322 (c)= Hw(t); \
323 }
324
325#define mul(r,a,w,c) { \
326 BN_ULLONG t; \
327 t=(BN_ULLONG)w * (a) + (c); \
328 (r)= Lw(t); \
329 (c)= Hw(t); \
330 }
331
332#define sqr(r0,r1,a) { \
333 BN_ULLONG t; \
334 t=(BN_ULLONG)(a)*(a); \
335 (r0)=Lw(t); \
336 (r1)=Hw(t); \
337 }
338
339#elif defined(BN_UMULT_LOHI)
340#define mul_add(r,a,w,c) { \
341 BN_ULONG high,low,ret,tmp=(a); \
342 ret = (r); \
343 BN_UMULT_LOHI(low,high,w,tmp); \
344 ret += (c); \
345 (c) = (ret<(c))?1:0; \
346 (c) += high; \
347 ret += low; \
348 (c) += (ret<low)?1:0; \
349 (r) = ret; \
350 }
351
352#define mul(r,a,w,c) { \
353 BN_ULONG high,low,ret,ta=(a); \
354 BN_UMULT_LOHI(low,high,w,ta); \
355 ret = low + (c); \
356 (c) = high; \
357 (c) += (ret<low)?1:0; \
358 (r) = ret; \
359 }
360
361#define sqr(r0,r1,a) { \
362 BN_ULONG tmp=(a); \
363 BN_UMULT_LOHI(r0,r1,tmp,tmp); \
364 }
365
366#elif defined(BN_UMULT_HIGH)
367#define mul_add(r,a,w,c) { \
368 BN_ULONG high,low,ret,tmp=(a); \
369 ret = (r); \
370 high= BN_UMULT_HIGH(w,tmp); \
371 ret += (c); \
372 low = (w) * tmp; \
373 (c) = (ret<(c))?1:0; \
374 (c) += high; \
375 ret += low; \
376 (c) += (ret<low)?1:0; \
377 (r) = ret; \
378 }
379
380#define mul(r,a,w,c) { \
381 BN_ULONG high,low,ret,ta=(a); \
382 low = (w) * ta; \
383 high= BN_UMULT_HIGH(w,ta); \
384 ret = low + (c); \
385 (c) = high; \
386 (c) += (ret<low)?1:0; \
387 (r) = ret; \
388 }
389
390#define sqr(r0,r1,a) { \
391 BN_ULONG tmp=(a); \
392 (r0) = tmp * tmp; \
393 (r1) = BN_UMULT_HIGH(tmp,tmp); \
394 }
395
396#else
397/*************************************************************
398 * No long long type
399 */
400
401#define LBITS(a) ((a)&BN_MASK2l)
402#define HBITS(a) (((a)>>BN_BITS4)&BN_MASK2l)
403#define L2HBITS(a) (((a)<<BN_BITS4)&BN_MASK2)
404
405#define LLBITS(a) ((a)&BN_MASKl)
406#define LHBITS(a) (((a)>>BN_BITS2)&BN_MASKl)
407#define LL2HBITS(a) ((BN_ULLONG)((a)&BN_MASKl)<<BN_BITS2)
408
409#define mul64(l,h,bl,bh) \
410 { \
411 BN_ULONG m,m1,lt,ht; \
412 \
413 lt=l; \
414 ht=h; \
415 m =(bh)*(lt); \
416 lt=(bl)*(lt); \
417 m1=(bl)*(ht); \
418 ht =(bh)*(ht); \
419 m=(m+m1)&BN_MASK2; if (m < m1) ht+=L2HBITS((BN_ULONG)1); \
420 ht+=HBITS(m); \
421 m1=L2HBITS(m); \
422 lt=(lt+m1)&BN_MASK2; if (lt < m1) ht++; \
423 (l)=lt; \
424 (h)=ht; \
425 }
426
427#define sqr64(lo,ho,in) \
428 { \
429 BN_ULONG l,h,m; \
430 \
431 h=(in); \
432 l=LBITS(h); \
433 h=HBITS(h); \
434 m =(l)*(h); \
435 l*=l; \
436 h*=h; \
437 h+=(m&BN_MASK2h1)>>(BN_BITS4-1); \
438 m =(m&BN_MASK2l)<<(BN_BITS4+1); \
439 l=(l+m)&BN_MASK2; if (l < m) h++; \
440 (lo)=l; \
441 (ho)=h; \
442 }
443
444#define mul_add(r,a,bl,bh,c) { \
445 BN_ULONG l,h; \
446 \
447 h= (a); \
448 l=LBITS(h); \
449 h=HBITS(h); \
450 mul64(l,h,(bl),(bh)); \
451 \
452 /* non-multiply part */ \
453 l=(l+(c))&BN_MASK2; if (l < (c)) h++; \
454 (c)=(r); \
455 l=(l+(c))&BN_MASK2; if (l < (c)) h++; \
456 (c)=h&BN_MASK2; \
457 (r)=l; \
458 }
459
460#define mul(r,a,bl,bh,c) { \
461 BN_ULONG l,h; \
462 \
463 h= (a); \
464 l=LBITS(h); \
465 h=HBITS(h); \
466 mul64(l,h,(bl),(bh)); \
467 \
468 /* non-multiply part */ \
469 l+=(c); if ((l&BN_MASK2) < (c)) h++; \
470 (c)=h&BN_MASK2; \
471 (r)=l&BN_MASK2; \
472 }
473#endif /* !BN_LLONG */
474
475#if defined(OPENSSL_DOING_MAKEDEPEND) && defined(OPENSSL_FIPS)
476#undef bn_div_words
477#endif
478
479void bn_mul_normal(BN_ULONG *r,BN_ULONG *a,int na,BN_ULONG *b,int nb);
480void bn_mul_comba8(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b);
481void bn_mul_comba4(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b);
482void bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, int n, BN_ULONG *tmp);
483void bn_sqr_comba8(BN_ULONG *r,const BN_ULONG *a);
484void bn_sqr_comba4(BN_ULONG *r,const BN_ULONG *a);
485int bn_cmp_words(const BN_ULONG *a,const BN_ULONG *b,int n);
486int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b,
487 int cl, int dl);
488void bn_mul_recursive(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,int n2,
489 int dna,int dnb,BN_ULONG *t);
490void bn_mul_part_recursive(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,
491 int n,int tna,int tnb,BN_ULONG *t);
492void bn_sqr_recursive(BN_ULONG *r,const BN_ULONG *a, int n2, BN_ULONG *t);
493void bn_mul_low_normal(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b, int n);
494void bn_mul_low_recursive(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,int n2,
495 BN_ULONG *t);
496void bn_mul_high(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,BN_ULONG *l,int n2,
497 BN_ULONG *t);
498BN_ULONG bn_add_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
499 int cl, int dl);
500BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
501 int cl, int dl);
502int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
503
504#ifdef __cplusplus
505}
506#endif
507
508#endif
diff --git a/src/lib/libcrypto/bn/bn_lib.c b/src/lib/libcrypto/bn/bn_lib.c
deleted file mode 100644
index 7a5676de69..0000000000
--- a/src/lib/libcrypto/bn/bn_lib.c
+++ /dev/null
@@ -1,826 +0,0 @@
1/* crypto/bn/bn_lib.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#ifndef BN_DEBUG
60# undef NDEBUG /* avoid conflicting definitions */
61# define NDEBUG
62#endif
63
64#include <assert.h>
65#include <limits.h>
66#include <stdio.h>
67#include "cryptlib.h"
68#include "bn_lcl.h"
69
70const char BN_version[]="Big Number" OPENSSL_VERSION_PTEXT;
71
72/* This stuff appears to be completely unused, so is deprecated */
73#ifndef OPENSSL_NO_DEPRECATED
74/* For a 32 bit machine
75 * 2 - 4 == 128
76 * 3 - 8 == 256
77 * 4 - 16 == 512
78 * 5 - 32 == 1024
79 * 6 - 64 == 2048
80 * 7 - 128 == 4096
81 * 8 - 256 == 8192
82 */
83static int bn_limit_bits=0;
84static int bn_limit_num=8; /* (1<<bn_limit_bits) */
85static int bn_limit_bits_low=0;
86static int bn_limit_num_low=8; /* (1<<bn_limit_bits_low) */
87static int bn_limit_bits_high=0;
88static int bn_limit_num_high=8; /* (1<<bn_limit_bits_high) */
89static int bn_limit_bits_mont=0;
90static int bn_limit_num_mont=8; /* (1<<bn_limit_bits_mont) */
91
92void BN_set_params(int mult, int high, int low, int mont)
93 {
94 if (mult >= 0)
95 {
96 if (mult > (int)(sizeof(int)*8)-1)
97 mult=sizeof(int)*8-1;
98 bn_limit_bits=mult;
99 bn_limit_num=1<<mult;
100 }
101 if (high >= 0)
102 {
103 if (high > (int)(sizeof(int)*8)-1)
104 high=sizeof(int)*8-1;
105 bn_limit_bits_high=high;
106 bn_limit_num_high=1<<high;
107 }
108 if (low >= 0)
109 {
110 if (low > (int)(sizeof(int)*8)-1)
111 low=sizeof(int)*8-1;
112 bn_limit_bits_low=low;
113 bn_limit_num_low=1<<low;
114 }
115 if (mont >= 0)
116 {
117 if (mont > (int)(sizeof(int)*8)-1)
118 mont=sizeof(int)*8-1;
119 bn_limit_bits_mont=mont;
120 bn_limit_num_mont=1<<mont;
121 }
122 }
123
124int BN_get_params(int which)
125 {
126 if (which == 0) return(bn_limit_bits);
127 else if (which == 1) return(bn_limit_bits_high);
128 else if (which == 2) return(bn_limit_bits_low);
129 else if (which == 3) return(bn_limit_bits_mont);
130 else return(0);
131 }
132#endif
133
134const BIGNUM *BN_value_one(void)
135 {
136 static const BN_ULONG data_one=1L;
137 static const BIGNUM const_one={(BN_ULONG *)&data_one,1,1,0,BN_FLG_STATIC_DATA};
138
139 return(&const_one);
140 }
141
142int BN_num_bits_word(BN_ULONG l)
143 {
144 static const unsigned char bits[256]={
145 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,
146 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
147 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
148 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
149 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
150 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
151 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
152 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
153 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
154 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
155 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
156 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
157 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
158 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
159 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
160 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
161 };
162
163#if defined(SIXTY_FOUR_BIT_LONG)
164 if (l & 0xffffffff00000000L)
165 {
166 if (l & 0xffff000000000000L)
167 {
168 if (l & 0xff00000000000000L)
169 {
170 return(bits[(int)(l>>56)]+56);
171 }
172 else return(bits[(int)(l>>48)]+48);
173 }
174 else
175 {
176 if (l & 0x0000ff0000000000L)
177 {
178 return(bits[(int)(l>>40)]+40);
179 }
180 else return(bits[(int)(l>>32)]+32);
181 }
182 }
183 else
184#else
185#ifdef SIXTY_FOUR_BIT
186 if (l & 0xffffffff00000000LL)
187 {
188 if (l & 0xffff000000000000LL)
189 {
190 if (l & 0xff00000000000000LL)
191 {
192 return(bits[(int)(l>>56)]+56);
193 }
194 else return(bits[(int)(l>>48)]+48);
195 }
196 else
197 {
198 if (l & 0x0000ff0000000000LL)
199 {
200 return(bits[(int)(l>>40)]+40);
201 }
202 else return(bits[(int)(l>>32)]+32);
203 }
204 }
205 else
206#endif
207#endif
208 {
209#if defined(THIRTY_TWO_BIT) || defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG)
210 if (l & 0xffff0000L)
211 {
212 if (l & 0xff000000L)
213 return(bits[(int)(l>>24L)]+24);
214 else return(bits[(int)(l>>16L)]+16);
215 }
216 else
217#endif
218 {
219#if defined(THIRTY_TWO_BIT) || defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG)
220 if (l & 0xff00L)
221 return(bits[(int)(l>>8)]+8);
222 else
223#endif
224 return(bits[(int)(l )] );
225 }
226 }
227 }
228
229int BN_num_bits(const BIGNUM *a)
230 {
231 int i = a->top - 1;
232 bn_check_top(a);
233
234 if (BN_is_zero(a)) return 0;
235 return ((i*BN_BITS2) + BN_num_bits_word(a->d[i]));
236 }
237
238void BN_clear_free(BIGNUM *a)
239 {
240 int i;
241
242 if (a == NULL) return;
243 bn_check_top(a);
244 if (a->d != NULL)
245 {
246 OPENSSL_cleanse(a->d,a->dmax*sizeof(a->d[0]));
247 if (!(BN_get_flags(a,BN_FLG_STATIC_DATA)))
248 OPENSSL_free(a->d);
249 }
250 i=BN_get_flags(a,BN_FLG_MALLOCED);
251 OPENSSL_cleanse(a,sizeof(BIGNUM));
252 if (i)
253 OPENSSL_free(a);
254 }
255
256void BN_free(BIGNUM *a)
257 {
258 if (a == NULL) return;
259 bn_check_top(a);
260 if ((a->d != NULL) && !(BN_get_flags(a,BN_FLG_STATIC_DATA)))
261 OPENSSL_free(a->d);
262 if (a->flags & BN_FLG_MALLOCED)
263 OPENSSL_free(a);
264 else
265 {
266#ifndef OPENSSL_NO_DEPRECATED
267 a->flags|=BN_FLG_FREE;
268#endif
269 a->d = NULL;
270 }
271 }
272
273void BN_init(BIGNUM *a)
274 {
275 memset(a,0,sizeof(BIGNUM));
276 bn_check_top(a);
277 }
278
279BIGNUM *BN_new(void)
280 {
281 BIGNUM *ret;
282
283 if ((ret=(BIGNUM *)OPENSSL_malloc(sizeof(BIGNUM))) == NULL)
284 {
285 BNerr(BN_F_BN_NEW,ERR_R_MALLOC_FAILURE);
286 return(NULL);
287 }
288 ret->flags=BN_FLG_MALLOCED;
289 ret->top=0;
290 ret->neg=0;
291 ret->dmax=0;
292 ret->d=NULL;
293 bn_check_top(ret);
294 return(ret);
295 }
296
297/* This is used both by bn_expand2() and bn_dup_expand() */
298/* The caller MUST check that words > b->dmax before calling this */
299static BN_ULONG *bn_expand_internal(const BIGNUM *b, int words)
300 {
301 BN_ULONG *A,*a = NULL;
302 const BN_ULONG *B;
303 int i;
304
305 bn_check_top(b);
306
307 if (words > (INT_MAX/(4*BN_BITS2)))
308 {
309 BNerr(BN_F_BN_EXPAND_INTERNAL,BN_R_BIGNUM_TOO_LONG);
310 return NULL;
311 }
312 if (BN_get_flags(b,BN_FLG_STATIC_DATA))
313 {
314 BNerr(BN_F_BN_EXPAND_INTERNAL,BN_R_EXPAND_ON_STATIC_BIGNUM_DATA);
315 return(NULL);
316 }
317 a=A=(BN_ULONG *)OPENSSL_malloc(sizeof(BN_ULONG)*words);
318 if (A == NULL)
319 {
320 BNerr(BN_F_BN_EXPAND_INTERNAL,ERR_R_MALLOC_FAILURE);
321 return(NULL);
322 }
323#if 1
324 B=b->d;
325 /* Check if the previous number needs to be copied */
326 if (B != NULL)
327 {
328 for (i=b->top>>2; i>0; i--,A+=4,B+=4)
329 {
330 /*
331 * The fact that the loop is unrolled
332 * 4-wise is a tribute to Intel. It's
333 * the one that doesn't have enough
334 * registers to accomodate more data.
335 * I'd unroll it 8-wise otherwise:-)
336 *
337 * <appro@fy.chalmers.se>
338 */
339 BN_ULONG a0,a1,a2,a3;
340 a0=B[0]; a1=B[1]; a2=B[2]; a3=B[3];
341 A[0]=a0; A[1]=a1; A[2]=a2; A[3]=a3;
342 }
343 switch (b->top&3)
344 {
345 case 3: A[2]=B[2];
346 case 2: A[1]=B[1];
347 case 1: A[0]=B[0];
348 case 0: /* workaround for ultrix cc: without 'case 0', the optimizer does
349 * the switch table by doing a=top&3; a--; goto jump_table[a];
350 * which fails for top== 0 */
351 ;
352 }
353 }
354
355#else
356 memset(A,0,sizeof(BN_ULONG)*words);
357 memcpy(A,b->d,sizeof(b->d[0])*b->top);
358#endif
359
360 return(a);
361 }
362
363/* This is an internal function that can be used instead of bn_expand2()
364 * when there is a need to copy BIGNUMs instead of only expanding the
365 * data part, while still expanding them.
366 * Especially useful when needing to expand BIGNUMs that are declared
367 * 'const' and should therefore not be changed.
368 * The reason to use this instead of a BN_dup() followed by a bn_expand2()
369 * is memory allocation overhead. A BN_dup() followed by a bn_expand2()
370 * will allocate new memory for the BIGNUM data twice, and free it once,
371 * while bn_dup_expand() makes sure allocation is made only once.
372 */
373
374#ifndef OPENSSL_NO_DEPRECATED
375BIGNUM *bn_dup_expand(const BIGNUM *b, int words)
376 {
377 BIGNUM *r = NULL;
378
379 bn_check_top(b);
380
381 /* This function does not work if
382 * words <= b->dmax && top < words
383 * because BN_dup() does not preserve 'dmax'!
384 * (But bn_dup_expand() is not used anywhere yet.)
385 */
386
387 if (words > b->dmax)
388 {
389 BN_ULONG *a = bn_expand_internal(b, words);
390
391 if (a)
392 {
393 r = BN_new();
394 if (r)
395 {
396 r->top = b->top;
397 r->dmax = words;
398 r->neg = b->neg;
399 r->d = a;
400 }
401 else
402 {
403 /* r == NULL, BN_new failure */
404 OPENSSL_free(a);
405 }
406 }
407 /* If a == NULL, there was an error in allocation in
408 bn_expand_internal(), and NULL should be returned */
409 }
410 else
411 {
412 r = BN_dup(b);
413 }
414
415 bn_check_top(r);
416 return r;
417 }
418#endif
419
420/* This is an internal function that should not be used in applications.
421 * It ensures that 'b' has enough room for a 'words' word number
422 * and initialises any unused part of b->d with leading zeros.
423 * It is mostly used by the various BIGNUM routines. If there is an error,
424 * NULL is returned. If not, 'b' is returned. */
425
426BIGNUM *bn_expand2(BIGNUM *b, int words)
427 {
428 bn_check_top(b);
429
430 if (words > b->dmax)
431 {
432 BN_ULONG *a = bn_expand_internal(b, words);
433 if(!a) return NULL;
434 if(b->d) OPENSSL_free(b->d);
435 b->d=a;
436 b->dmax=words;
437 }
438
439/* None of this should be necessary because of what b->top means! */
440#if 0
441 /* NB: bn_wexpand() calls this only if the BIGNUM really has to grow */
442 if (b->top < b->dmax)
443 {
444 int i;
445 BN_ULONG *A = &(b->d[b->top]);
446 for (i=(b->dmax - b->top)>>3; i>0; i--,A+=8)
447 {
448 A[0]=0; A[1]=0; A[2]=0; A[3]=0;
449 A[4]=0; A[5]=0; A[6]=0; A[7]=0;
450 }
451 for (i=(b->dmax - b->top)&7; i>0; i--,A++)
452 A[0]=0;
453 assert(A == &(b->d[b->dmax]));
454 }
455#endif
456 bn_check_top(b);
457 return b;
458 }
459
460BIGNUM *BN_dup(const BIGNUM *a)
461 {
462 BIGNUM *t;
463
464 if (a == NULL) return NULL;
465 bn_check_top(a);
466
467 t = BN_new();
468 if (t == NULL) return NULL;
469 if(!BN_copy(t, a))
470 {
471 BN_free(t);
472 return NULL;
473 }
474 bn_check_top(t);
475 return t;
476 }
477
478BIGNUM *BN_copy(BIGNUM *a, const BIGNUM *b)
479 {
480 int i;
481 BN_ULONG *A;
482 const BN_ULONG *B;
483
484 bn_check_top(b);
485
486 if (a == b) return(a);
487 if (bn_wexpand(a,b->top) == NULL) return(NULL);
488
489#if 1
490 A=a->d;
491 B=b->d;
492 for (i=b->top>>2; i>0; i--,A+=4,B+=4)
493 {
494 BN_ULONG a0,a1,a2,a3;
495 a0=B[0]; a1=B[1]; a2=B[2]; a3=B[3];
496 A[0]=a0; A[1]=a1; A[2]=a2; A[3]=a3;
497 }
498 switch (b->top&3)
499 {
500 case 3: A[2]=B[2];
501 case 2: A[1]=B[1];
502 case 1: A[0]=B[0];
503 case 0: ; /* ultrix cc workaround, see comments in bn_expand_internal */
504 }
505#else
506 memcpy(a->d,b->d,sizeof(b->d[0])*b->top);
507#endif
508
509 a->top=b->top;
510 a->neg=b->neg;
511 bn_check_top(a);
512 return(a);
513 }
514
515void BN_swap(BIGNUM *a, BIGNUM *b)
516 {
517 int flags_old_a, flags_old_b;
518 BN_ULONG *tmp_d;
519 int tmp_top, tmp_dmax, tmp_neg;
520
521 bn_check_top(a);
522 bn_check_top(b);
523
524 flags_old_a = a->flags;
525 flags_old_b = b->flags;
526
527 tmp_d = a->d;
528 tmp_top = a->top;
529 tmp_dmax = a->dmax;
530 tmp_neg = a->neg;
531
532 a->d = b->d;
533 a->top = b->top;
534 a->dmax = b->dmax;
535 a->neg = b->neg;
536
537 b->d = tmp_d;
538 b->top = tmp_top;
539 b->dmax = tmp_dmax;
540 b->neg = tmp_neg;
541
542 a->flags = (flags_old_a & BN_FLG_MALLOCED) | (flags_old_b & BN_FLG_STATIC_DATA);
543 b->flags = (flags_old_b & BN_FLG_MALLOCED) | (flags_old_a & BN_FLG_STATIC_DATA);
544 bn_check_top(a);
545 bn_check_top(b);
546 }
547
548void BN_clear(BIGNUM *a)
549 {
550 bn_check_top(a);
551 if (a->d != NULL)
552 memset(a->d,0,a->dmax*sizeof(a->d[0]));
553 a->top=0;
554 a->neg=0;
555 }
556
557BN_ULONG BN_get_word(const BIGNUM *a)
558 {
559 if (a->top > 1)
560 return BN_MASK2;
561 else if (a->top == 1)
562 return a->d[0];
563 /* a->top == 0 */
564 return 0;
565 }
566
567int BN_set_word(BIGNUM *a, BN_ULONG w)
568 {
569 bn_check_top(a);
570 if (bn_expand(a,(int)sizeof(BN_ULONG)*8) == NULL) return(0);
571 a->neg = 0;
572 a->d[0] = w;
573 a->top = (w ? 1 : 0);
574 bn_check_top(a);
575 return(1);
576 }
577
578BIGNUM *BN_bin2bn(const unsigned char *s, int len, BIGNUM *ret)
579 {
580 unsigned int i,m;
581 unsigned int n;
582 BN_ULONG l;
583 BIGNUM *bn = NULL;
584
585 if (ret == NULL)
586 ret = bn = BN_new();
587 if (ret == NULL) return(NULL);
588 bn_check_top(ret);
589 l=0;
590 n=len;
591 if (n == 0)
592 {
593 ret->top=0;
594 return(ret);
595 }
596 i=((n-1)/BN_BYTES)+1;
597 m=((n-1)%(BN_BYTES));
598 if (bn_wexpand(ret, (int)i) == NULL)
599 {
600 if (bn) BN_free(bn);
601 return NULL;
602 }
603 ret->top=i;
604 ret->neg=0;
605 while (n--)
606 {
607 l=(l<<8L)| *(s++);
608 if (m-- == 0)
609 {
610 ret->d[--i]=l;
611 l=0;
612 m=BN_BYTES-1;
613 }
614 }
615 /* need to call this due to clear byte at top if avoiding
616 * having the top bit set (-ve number) */
617 bn_correct_top(ret);
618 return(ret);
619 }
620
621/* ignore negative */
622int BN_bn2bin(const BIGNUM *a, unsigned char *to)
623 {
624 int n,i;
625 BN_ULONG l;
626
627 bn_check_top(a);
628 n=i=BN_num_bytes(a);
629 while (i--)
630 {
631 l=a->d[i/BN_BYTES];
632 *(to++)=(unsigned char)(l>>(8*(i%BN_BYTES)))&0xff;
633 }
634 return(n);
635 }
636
637int BN_ucmp(const BIGNUM *a, const BIGNUM *b)
638 {
639 int i;
640 BN_ULONG t1,t2,*ap,*bp;
641
642 bn_check_top(a);
643 bn_check_top(b);
644
645 i=a->top-b->top;
646 if (i != 0) return(i);
647 ap=a->d;
648 bp=b->d;
649 for (i=a->top-1; i>=0; i--)
650 {
651 t1= ap[i];
652 t2= bp[i];
653 if (t1 != t2)
654 return((t1 > t2) ? 1 : -1);
655 }
656 return(0);
657 }
658
659int BN_cmp(const BIGNUM *a, const BIGNUM *b)
660 {
661 int i;
662 int gt,lt;
663 BN_ULONG t1,t2;
664
665 if ((a == NULL) || (b == NULL))
666 {
667 if (a != NULL)
668 return(-1);
669 else if (b != NULL)
670 return(1);
671 else
672 return(0);
673 }
674
675 bn_check_top(a);
676 bn_check_top(b);
677
678 if (a->neg != b->neg)
679 {
680 if (a->neg)
681 return(-1);
682 else return(1);
683 }
684 if (a->neg == 0)
685 { gt=1; lt= -1; }
686 else { gt= -1; lt=1; }
687
688 if (a->top > b->top) return(gt);
689 if (a->top < b->top) return(lt);
690 for (i=a->top-1; i>=0; i--)
691 {
692 t1=a->d[i];
693 t2=b->d[i];
694 if (t1 > t2) return(gt);
695 if (t1 < t2) return(lt);
696 }
697 return(0);
698 }
699
700int BN_set_bit(BIGNUM *a, int n)
701 {
702 int i,j,k;
703
704 if (n < 0)
705 return 0;
706
707 i=n/BN_BITS2;
708 j=n%BN_BITS2;
709 if (a->top <= i)
710 {
711 if (bn_wexpand(a,i+1) == NULL) return(0);
712 for(k=a->top; k<i+1; k++)
713 a->d[k]=0;
714 a->top=i+1;
715 }
716
717 a->d[i]|=(((BN_ULONG)1)<<j);
718 bn_check_top(a);
719 return(1);
720 }
721
722int BN_clear_bit(BIGNUM *a, int n)
723 {
724 int i,j;
725
726 bn_check_top(a);
727 if (n < 0) return 0;
728
729 i=n/BN_BITS2;
730 j=n%BN_BITS2;
731 if (a->top <= i) return(0);
732
733 a->d[i]&=(~(((BN_ULONG)1)<<j));
734 bn_correct_top(a);
735 return(1);
736 }
737
738int BN_is_bit_set(const BIGNUM *a, int n)
739 {
740 int i,j;
741
742 bn_check_top(a);
743 if (n < 0) return 0;
744 i=n/BN_BITS2;
745 j=n%BN_BITS2;
746 if (a->top <= i) return 0;
747 return (int)(((a->d[i])>>j)&((BN_ULONG)1));
748 }
749
750int BN_mask_bits(BIGNUM *a, int n)
751 {
752 int b,w;
753
754 bn_check_top(a);
755 if (n < 0) return 0;
756
757 w=n/BN_BITS2;
758 b=n%BN_BITS2;
759 if (w >= a->top) return 0;
760 if (b == 0)
761 a->top=w;
762 else
763 {
764 a->top=w+1;
765 a->d[w]&= ~(BN_MASK2<<b);
766 }
767 bn_correct_top(a);
768 return(1);
769 }
770
771void BN_set_negative(BIGNUM *a, int b)
772 {
773 if (b && !BN_is_zero(a))
774 a->neg = 1;
775 else
776 a->neg = 0;
777 }
778
779int bn_cmp_words(const BN_ULONG *a, const BN_ULONG *b, int n)
780 {
781 int i;
782 BN_ULONG aa,bb;
783
784 aa=a[n-1];
785 bb=b[n-1];
786 if (aa != bb) return((aa > bb)?1:-1);
787 for (i=n-2; i>=0; i--)
788 {
789 aa=a[i];
790 bb=b[i];
791 if (aa != bb) return((aa > bb)?1:-1);
792 }
793 return(0);
794 }
795
796/* Here follows a specialised variants of bn_cmp_words(). It has the
797 property of performing the operation on arrays of different sizes.
798 The sizes of those arrays is expressed through cl, which is the
 799 common length ( basically, min(len(a),len(b)) ), and dl, which is the
800 delta between the two lengths, calculated as len(a)-len(b).
801 All lengths are the number of BN_ULONGs... */
802
803int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b,
804 int cl, int dl)
805 {
806 int n,i;
807 n = cl-1;
808
809 if (dl < 0)
810 {
811 for (i=dl; i<0; i++)
812 {
813 if (b[n-i] != 0)
814 return -1; /* a < b */
815 }
816 }
817 if (dl > 0)
818 {
819 for (i=dl; i>0; i--)
820 {
821 if (a[n+i] != 0)
822 return 1; /* a > b */
823 }
824 }
825 return bn_cmp_words(a,b,cl);
826 }
diff --git a/src/lib/libcrypto/bn/bn_mod.c b/src/lib/libcrypto/bn/bn_mod.c
deleted file mode 100644
index 77d6ddb91a..0000000000
--- a/src/lib/libcrypto/bn/bn_mod.c
+++ /dev/null
@@ -1,301 +0,0 @@
1/* crypto/bn/bn_mod.c */
2/* Includes code written by Lenka Fibikova <fibikova@exp-math.uni-essen.de>
3 * for the OpenSSL project. */
4/* ====================================================================
5 * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 *
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
17 * distribution.
18 *
19 * 3. All advertising materials mentioning features or use of this
20 * software must display the following acknowledgment:
21 * "This product includes software developed by the OpenSSL Project
22 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
23 *
24 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
25 * endorse or promote products derived from this software without
26 * prior written permission. For written permission, please contact
27 * openssl-core@openssl.org.
28 *
29 * 5. Products derived from this software may not be called "OpenSSL"
30 * nor may "OpenSSL" appear in their names without prior written
31 * permission of the OpenSSL Project.
32 *
33 * 6. Redistributions of any form whatsoever must retain the following
34 * acknowledgment:
35 * "This product includes software developed by the OpenSSL Project
36 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
37 *
38 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
39 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
41 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
42 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
44 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
45 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
46 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
47 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
48 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
49 * OF THE POSSIBILITY OF SUCH DAMAGE.
50 * ====================================================================
51 *
52 * This product includes cryptographic software written by Eric Young
53 * (eay@cryptsoft.com). This product includes software written by Tim
54 * Hudson (tjh@cryptsoft.com).
55 *
56 */
57/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
58 * All rights reserved.
59 *
60 * This package is an SSL implementation written
61 * by Eric Young (eay@cryptsoft.com).
62 * The implementation was written so as to conform with Netscapes SSL.
63 *
64 * This library is free for commercial and non-commercial use as long as
65 * the following conditions are aheared to. The following conditions
66 * apply to all code found in this distribution, be it the RC4, RSA,
67 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
68 * included with this distribution is covered by the same copyright terms
69 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
70 *
71 * Copyright remains Eric Young's, and as such any Copyright notices in
72 * the code are not to be removed.
73 * If this package is used in a product, Eric Young should be given attribution
74 * as the author of the parts of the library used.
75 * This can be in the form of a textual message at program startup or
76 * in documentation (online or textual) provided with the package.
77 *
78 * Redistribution and use in source and binary forms, with or without
79 * modification, are permitted provided that the following conditions
80 * are met:
81 * 1. Redistributions of source code must retain the copyright
82 * notice, this list of conditions and the following disclaimer.
83 * 2. Redistributions in binary form must reproduce the above copyright
84 * notice, this list of conditions and the following disclaimer in the
85 * documentation and/or other materials provided with the distribution.
86 * 3. All advertising materials mentioning features or use of this software
87 * must display the following acknowledgement:
88 * "This product includes cryptographic software written by
89 * Eric Young (eay@cryptsoft.com)"
90 * The word 'cryptographic' can be left out if the rouines from the library
91 * being used are not cryptographic related :-).
92 * 4. If you include any Windows specific code (or a derivative thereof) from
93 * the apps directory (application code) you must include an acknowledgement:
94 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
95 *
96 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
97 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
98 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
99 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
100 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
101 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
102 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
103 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
104 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
105 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
106 * SUCH DAMAGE.
107 *
108 * The licence and distribution terms for any publically available version or
109 * derivative of this code cannot be changed. i.e. this code cannot simply be
110 * copied and put under another distribution licence
111 * [including the GNU Public Licence.]
112 */
113
114#include "cryptlib.h"
115#include "bn_lcl.h"
116
117
#if 0 /* now just a #define */
/* Historical out-of-line BN_mod(): reduce m modulo d via BN_div(). */
int BN_mod(BIGNUM *rem, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx)
	{
	/* note that rem->neg == m->neg (unless the remainder is zero) */
	return (BN_div(NULL, rem, m, d, ctx));
	}
#endif
125
126
127int BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx)
128 {
129 /* like BN_mod, but returns non-negative remainder
130 * (i.e., 0 <= r < |d| always holds) */
131
132 if (!(BN_mod(r,m,d,ctx)))
133 return 0;
134 if (!r->neg)
135 return 1;
136 /* now -|d| < r < 0, so we have to set r := r + |d| */
137 return (d->neg ? BN_sub : BN_add)(r, r, d);
138}
139
140
141int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m, BN_CTX *ctx)
142 {
143 if (!BN_add(r, a, b)) return 0;
144 return BN_nnmod(r, r, m, ctx);
145 }
146
147
148/* BN_mod_add variant that may be used if both a and b are non-negative
149 * and less than m */
150int BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m)
151 {
152 if (!BN_uadd(r, a, b)) return 0;
153 if (BN_ucmp(r, m) >= 0)
154 return BN_usub(r, r, m);
155 return 1;
156 }
157
158
159int BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m, BN_CTX *ctx)
160 {
161 if (!BN_sub(r, a, b)) return 0;
162 return BN_nnmod(r, r, m, ctx);
163 }
164
165
166/* BN_mod_sub variant that may be used if both a and b are non-negative
167 * and less than m */
168int BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m)
169 {
170 if (!BN_sub(r, a, b)) return 0;
171 if (r->neg)
172 return BN_add(r, r, m);
173 return 1;
174 }
175
176
177/* slow but works */
178int BN_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
179 BN_CTX *ctx)
180 {
181 BIGNUM *t;
182 int ret=0;
183
184 bn_check_top(a);
185 bn_check_top(b);
186 bn_check_top(m);
187
188 BN_CTX_start(ctx);
189 if ((t = BN_CTX_get(ctx)) == NULL) goto err;
190 if (a == b)
191 { if (!BN_sqr(t,a,ctx)) goto err; }
192 else
193 { if (!BN_mul(t,a,b,ctx)) goto err; }
194 if (!BN_nnmod(r,t,m,ctx)) goto err;
195 bn_check_top(r);
196 ret=1;
197err:
198 BN_CTX_end(ctx);
199 return(ret);
200 }
201
202
203int BN_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx)
204 {
205 if (!BN_sqr(r, a, ctx)) return 0;
206 /* r->neg == 0, thus we don't need BN_nnmod */
207 return BN_mod(r, r, m, ctx);
208 }
209
210
211int BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx)
212 {
213 if (!BN_lshift1(r, a)) return 0;
214 bn_check_top(r);
215 return BN_nnmod(r, r, m, ctx);
216 }
217
218
219/* BN_mod_lshift1 variant that may be used if a is non-negative
220 * and less than m */
221int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m)
222 {
223 if (!BN_lshift1(r, a)) return 0;
224 bn_check_top(r);
225 if (BN_cmp(r, m) >= 0)
226 return BN_sub(r, r, m);
227 return 1;
228 }
229
230
231int BN_mod_lshift(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m, BN_CTX *ctx)
232 {
233 BIGNUM *abs_m = NULL;
234 int ret;
235
236 if (!BN_nnmod(r, a, m, ctx)) return 0;
237
238 if (m->neg)
239 {
240 abs_m = BN_dup(m);
241 if (abs_m == NULL) return 0;
242 abs_m->neg = 0;
243 }
244
245 ret = BN_mod_lshift_quick(r, r, n, (abs_m ? abs_m : m));
246 bn_check_top(r);
247
248 if (abs_m)
249 BN_free(abs_m);
250 return ret;
251 }
252
253
254/* BN_mod_lshift variant that may be used if a is non-negative
255 * and less than m */
256int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m)
257 {
258 if (r != a)
259 {
260 if (BN_copy(r, a) == NULL) return 0;
261 }
262
263 while (n > 0)
264 {
265 int max_shift;
266
267 /* 0 < r < m */
268 max_shift = BN_num_bits(m) - BN_num_bits(r);
269 /* max_shift >= 0 */
270
271 if (max_shift < 0)
272 {
273 BNerr(BN_F_BN_MOD_LSHIFT_QUICK, BN_R_INPUT_NOT_REDUCED);
274 return 0;
275 }
276
277 if (max_shift > n)
278 max_shift = n;
279
280 if (max_shift)
281 {
282 if (!BN_lshift(r, r, max_shift)) return 0;
283 n -= max_shift;
284 }
285 else
286 {
287 if (!BN_lshift1(r, r)) return 0;
288 --n;
289 }
290
291 /* BN_num_bits(r) <= BN_num_bits(m) */
292
293 if (BN_cmp(r, m) >= 0)
294 {
295 if (!BN_sub(r, r, m)) return 0;
296 }
297 }
298 bn_check_top(r);
299
300 return 1;
301 }
diff --git a/src/lib/libcrypto/bn/bn_mont.c b/src/lib/libcrypto/bn/bn_mont.c
deleted file mode 100644
index 427b5cf4df..0000000000
--- a/src/lib/libcrypto/bn/bn_mont.c
+++ /dev/null
@@ -1,509 +0,0 @@
1/* crypto/bn/bn_mont.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58/* ====================================================================
59 * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
60 *
61 * Redistribution and use in source and binary forms, with or without
62 * modification, are permitted provided that the following conditions
63 * are met:
64 *
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 *
68 * 2. Redistributions in binary form must reproduce the above copyright
69 * notice, this list of conditions and the following disclaimer in
70 * the documentation and/or other materials provided with the
71 * distribution.
72 *
73 * 3. All advertising materials mentioning features or use of this
74 * software must display the following acknowledgment:
75 * "This product includes software developed by the OpenSSL Project
76 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
77 *
78 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
79 * endorse or promote products derived from this software without
80 * prior written permission. For written permission, please contact
81 * openssl-core@openssl.org.
82 *
83 * 5. Products derived from this software may not be called "OpenSSL"
84 * nor may "OpenSSL" appear in their names without prior written
85 * permission of the OpenSSL Project.
86 *
87 * 6. Redistributions of any form whatsoever must retain the following
88 * acknowledgment:
89 * "This product includes software developed by the OpenSSL Project
90 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
91 *
92 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
93 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
95 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
96 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
97 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
98 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
99 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
101 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
103 * OF THE POSSIBILITY OF SUCH DAMAGE.
104 * ====================================================================
105 *
106 * This product includes cryptographic software written by Eric Young
107 * (eay@cryptsoft.com). This product includes software written by Tim
108 * Hudson (tjh@cryptsoft.com).
109 *
110 */
111
112/*
113 * Details about Montgomery multiplication algorithms can be found at
114 * http://security.ece.orst.edu/publications.html, e.g.
115 * http://security.ece.orst.edu/koc/papers/j37acmon.pdf and
116 * sections 3.8 and 4.2 in http://security.ece.orst.edu/koc/papers/r01rsasw.pdf
117 */
118
119#include <stdio.h>
120#include "cryptlib.h"
121#include "bn_lcl.h"
122
123#define MONT_WORD /* use the faster word-based algorithm */
124
125#ifdef MONT_WORD
126static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont);
127#endif
128
129int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
130 BN_MONT_CTX *mont, BN_CTX *ctx)
131 {
132 BIGNUM *tmp;
133 int ret=0;
134#if defined(OPENSSL_BN_ASM_MONT) && defined(MONT_WORD)
135 int num = mont->N.top;
136
137 if (num>1 && a->top==num && b->top==num)
138 {
139 if (bn_wexpand(r,num) == NULL) return(0);
140 if (bn_mul_mont(r->d,a->d,b->d,mont->N.d,mont->n0,num))
141 {
142 r->neg = a->neg^b->neg;
143 r->top = num;
144 bn_correct_top(r);
145 return(1);
146 }
147 }
148#endif
149
150 BN_CTX_start(ctx);
151 tmp = BN_CTX_get(ctx);
152 if (tmp == NULL) goto err;
153
154 bn_check_top(tmp);
155 if (a == b)
156 {
157 if (!BN_sqr(tmp,a,ctx)) goto err;
158 }
159 else
160 {
161 if (!BN_mul(tmp,a,b,ctx)) goto err;
162 }
163 /* reduce from aRR to aR */
164#ifdef MONT_WORD
165 if (!BN_from_montgomery_word(r,tmp,mont)) goto err;
166#else
167 if (!BN_from_montgomery(r,tmp,mont,ctx)) goto err;
168#endif
169 bn_check_top(r);
170 ret=1;
171err:
172 BN_CTX_end(ctx);
173 return(ret);
174 }
175
176#ifdef MONT_WORD
177static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
178 {
179 BIGNUM *n;
180 BN_ULONG *ap,*np,*rp,n0,v,carry;
181 int nl,max,i;
182
183 n= &(mont->N);
184 nl=n->top;
185 if (nl == 0) { ret->top=0; return(1); }
186
187 max=(2*nl); /* carry is stored separately */
188 if (bn_wexpand(r,max) == NULL) return(0);
189
190 r->neg^=n->neg;
191 np=n->d;
192 rp=r->d;
193
194 /* clear the top words of T */
195#if 1
196 for (i=r->top; i<max; i++) /* memset? XXX */
197 rp[i]=0;
198#else
199 memset(&(rp[r->top]),0,(max-r->top)*sizeof(BN_ULONG));
200#endif
201
202 r->top=max;
203 n0=mont->n0[0];
204
205#ifdef BN_COUNT
206 fprintf(stderr,"word BN_from_montgomery_word %d * %d\n",nl,nl);
207#endif
208 for (carry=0, i=0; i<nl; i++, rp++)
209 {
210#ifdef __TANDEM
211 {
212 long long t1;
213 long long t2;
214 long long t3;
215 t1 = rp[0] * (n0 & 0177777);
216 t2 = 037777600000l;
217 t2 = n0 & t2;
218 t3 = rp[0] & 0177777;
219 t2 = (t3 * t2) & BN_MASK2;
220 t1 = t1 + t2;
221 v=bn_mul_add_words(rp,np,nl,(BN_ULONG) t1);
222 }
223#else
224 v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
225#endif
226 v = (v+carry+rp[nl])&BN_MASK2;
227 carry |= (v != rp[nl]);
228 carry &= (v <= rp[nl]);
229 rp[nl]=v;
230 }
231
232 if (bn_wexpand(ret,nl) == NULL) return(0);
233 ret->top=nl;
234 ret->neg=r->neg;
235
236 rp=ret->d;
237 ap=&(r->d[nl]);
238
239#define BRANCH_FREE 1
240#if BRANCH_FREE
241 {
242 BN_ULONG *nrp;
243 size_t m;
244
245 v=bn_sub_words(rp,ap,np,nl)-carry;
246 /* if subtraction result is real, then
247 * trick unconditional memcpy below to perform in-place
248 * "refresh" instead of actual copy. */
249 m=(0-(size_t)v);
250 nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m)|((PTR_SIZE_INT)ap&m));
251
252 for (i=0,nl-=4; i<nl; i+=4)
253 {
254 BN_ULONG t1,t2,t3,t4;
255
256 t1=nrp[i+0];
257 t2=nrp[i+1];
258 t3=nrp[i+2]; ap[i+0]=0;
259 t4=nrp[i+3]; ap[i+1]=0;
260 rp[i+0]=t1; ap[i+2]=0;
261 rp[i+1]=t2; ap[i+3]=0;
262 rp[i+2]=t3;
263 rp[i+3]=t4;
264 }
265 for (nl+=4; i<nl; i++)
266 rp[i]=nrp[i], ap[i]=0;
267 }
268#else
269 if (bn_sub_words (rp,ap,np,nl)-carry)
270 memcpy(rp,ap,nl*sizeof(BN_ULONG));
271#endif
272 bn_correct_top(r);
273 bn_correct_top(ret);
274 bn_check_top(ret);
275
276 return(1);
277 }
278#endif /* MONT_WORD */
279
280int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
281 BN_CTX *ctx)
282 {
283 int retn=0;
284#ifdef MONT_WORD
285 BIGNUM *t;
286
287 BN_CTX_start(ctx);
288 if ((t = BN_CTX_get(ctx)) && BN_copy(t,a))
289 retn = BN_from_montgomery_word(ret,t,mont);
290 BN_CTX_end(ctx);
291#else /* !MONT_WORD */
292 BIGNUM *t1,*t2;
293
294 BN_CTX_start(ctx);
295 t1 = BN_CTX_get(ctx);
296 t2 = BN_CTX_get(ctx);
297 if (t1 == NULL || t2 == NULL) goto err;
298
299 if (!BN_copy(t1,a)) goto err;
300 BN_mask_bits(t1,mont->ri);
301
302 if (!BN_mul(t2,t1,&mont->Ni,ctx)) goto err;
303 BN_mask_bits(t2,mont->ri);
304
305 if (!BN_mul(t1,t2,&mont->N,ctx)) goto err;
306 if (!BN_add(t2,a,t1)) goto err;
307 if (!BN_rshift(ret,t2,mont->ri)) goto err;
308
309 if (BN_ucmp(ret, &(mont->N)) >= 0)
310 {
311 if (!BN_usub(ret,ret,&(mont->N))) goto err;
312 }
313 retn=1;
314 bn_check_top(ret);
315 err:
316 BN_CTX_end(ctx);
317#endif /* MONT_WORD */
318 return(retn);
319 }
320
321BN_MONT_CTX *BN_MONT_CTX_new(void)
322 {
323 BN_MONT_CTX *ret;
324
325 if ((ret=(BN_MONT_CTX *)OPENSSL_malloc(sizeof(BN_MONT_CTX))) == NULL)
326 return(NULL);
327
328 BN_MONT_CTX_init(ret);
329 ret->flags=BN_FLG_MALLOCED;
330 return(ret);
331 }
332
333void BN_MONT_CTX_init(BN_MONT_CTX *ctx)
334 {
335 ctx->ri=0;
336 BN_init(&(ctx->RR));
337 BN_init(&(ctx->N));
338 BN_init(&(ctx->Ni));
339 ctx->n0[0] = ctx->n0[1] = 0;
340 ctx->flags=0;
341 }
342
343void BN_MONT_CTX_free(BN_MONT_CTX *mont)
344 {
345 if(mont == NULL)
346 return;
347
348 BN_free(&(mont->RR));
349 BN_free(&(mont->N));
350 BN_free(&(mont->Ni));
351 if (mont->flags & BN_FLG_MALLOCED)
352 OPENSSL_free(mont);
353 }
354
355int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
356 {
357 int ret = 0;
358 BIGNUM *Ri,*R;
359
360 BN_CTX_start(ctx);
361 if((Ri = BN_CTX_get(ctx)) == NULL) goto err;
362 R= &(mont->RR); /* grab RR as a temp */
363 if (!BN_copy(&(mont->N),mod)) goto err; /* Set N */
364 mont->N.neg = 0;
365
366#ifdef MONT_WORD
367 {
368 BIGNUM tmod;
369 BN_ULONG buf[2];
370
371 BN_init(&tmod);
372 tmod.d=buf;
373 tmod.dmax=2;
374 tmod.neg=0;
375
376 mont->ri=(BN_num_bits(mod)+(BN_BITS2-1))/BN_BITS2*BN_BITS2;
377
378#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
379 /* Only certain BN_BITS2<=32 platforms actually make use of
380 * n0[1], and we could use the #else case (with a shorter R
381 * value) for the others. However, currently only the assembler
382 * files do know which is which. */
383
384 BN_zero(R);
385 if (!(BN_set_bit(R,2*BN_BITS2))) goto err;
386
387 tmod.top=0;
388 if ((buf[0] = mod->d[0])) tmod.top=1;
389 if ((buf[1] = mod->top>1 ? mod->d[1] : 0)) tmod.top=2;
390
391 if ((BN_mod_inverse(Ri,R,&tmod,ctx)) == NULL)
392 goto err;
393 if (!BN_lshift(Ri,Ri,2*BN_BITS2)) goto err; /* R*Ri */
394 if (!BN_is_zero(Ri))
395 {
396 if (!BN_sub_word(Ri,1)) goto err;
397 }
398 else /* if N mod word size == 1 */
399 {
400 if (bn_expand(Ri,(int)sizeof(BN_ULONG)*2) == NULL)
401 goto err;
402 /* Ri-- (mod double word size) */
403 Ri->neg=0;
404 Ri->d[0]=BN_MASK2;
405 Ri->d[1]=BN_MASK2;
406 Ri->top=2;
407 }
408 if (!BN_div(Ri,NULL,Ri,&tmod,ctx)) goto err;
409 /* Ni = (R*Ri-1)/N,
410 * keep only couple of least significant words: */
411 mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
412 mont->n0[1] = (Ri->top > 1) ? Ri->d[1] : 0;
413#else
414 BN_zero(R);
415 if (!(BN_set_bit(R,BN_BITS2))) goto err; /* R */
416
417 buf[0]=mod->d[0]; /* tmod = N mod word size */
418 buf[1]=0;
419 tmod.top = buf[0] != 0 ? 1 : 0;
420 /* Ri = R^-1 mod N*/
421 if ((BN_mod_inverse(Ri,R,&tmod,ctx)) == NULL)
422 goto err;
423 if (!BN_lshift(Ri,Ri,BN_BITS2)) goto err; /* R*Ri */
424 if (!BN_is_zero(Ri))
425 {
426 if (!BN_sub_word(Ri,1)) goto err;
427 }
428 else /* if N mod word size == 1 */
429 {
430 if (!BN_set_word(Ri,BN_MASK2)) goto err; /* Ri-- (mod word size) */
431 }
432 if (!BN_div(Ri,NULL,Ri,&tmod,ctx)) goto err;
433 /* Ni = (R*Ri-1)/N,
434 * keep only least significant word: */
435 mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
436 mont->n0[1] = 0;
437#endif
438 }
439#else /* !MONT_WORD */
440 { /* bignum version */
441 mont->ri=BN_num_bits(&mont->N);
442 BN_zero(R);
443 if (!BN_set_bit(R,mont->ri)) goto err; /* R = 2^ri */
444 /* Ri = R^-1 mod N*/
445 if ((BN_mod_inverse(Ri,R,&mont->N,ctx)) == NULL)
446 goto err;
447 if (!BN_lshift(Ri,Ri,mont->ri)) goto err; /* R*Ri */
448 if (!BN_sub_word(Ri,1)) goto err;
449 /* Ni = (R*Ri-1) / N */
450 if (!BN_div(&(mont->Ni),NULL,Ri,&mont->N,ctx)) goto err;
451 }
452#endif
453
454 /* setup RR for conversions */
455 BN_zero(&(mont->RR));
456 if (!BN_set_bit(&(mont->RR),mont->ri*2)) goto err;
457 if (!BN_mod(&(mont->RR),&(mont->RR),&(mont->N),ctx)) goto err;
458
459 ret = 1;
460err:
461 BN_CTX_end(ctx);
462 return ret;
463 }
464
465BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from)
466 {
467 if (to == from) return(to);
468
469 if (!BN_copy(&(to->RR),&(from->RR))) return NULL;
470 if (!BN_copy(&(to->N),&(from->N))) return NULL;
471 if (!BN_copy(&(to->Ni),&(from->Ni))) return NULL;
472 to->ri=from->ri;
473 to->n0[0]=from->n0[0];
474 to->n0[1]=from->n0[1];
475 return(to);
476 }
477
478BN_MONT_CTX *BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, int lock,
479 const BIGNUM *mod, BN_CTX *ctx)
480 {
481 int got_write_lock = 0;
482 BN_MONT_CTX *ret;
483
484 CRYPTO_r_lock(lock);
485 if (!*pmont)
486 {
487 CRYPTO_r_unlock(lock);
488 CRYPTO_w_lock(lock);
489 got_write_lock = 1;
490
491 if (!*pmont)
492 {
493 ret = BN_MONT_CTX_new();
494 if (ret && !BN_MONT_CTX_set(ret, mod, ctx))
495 BN_MONT_CTX_free(ret);
496 else
497 *pmont = ret;
498 }
499 }
500
501 ret = *pmont;
502
503 if (got_write_lock)
504 CRYPTO_w_unlock(lock);
505 else
506 CRYPTO_r_unlock(lock);
507
508 return ret;
509 }
diff --git a/src/lib/libcrypto/bn/bn_mpi.c b/src/lib/libcrypto/bn/bn_mpi.c
deleted file mode 100644
index a054d21aed..0000000000
--- a/src/lib/libcrypto/bn/bn_mpi.c
+++ /dev/null
@@ -1,130 +0,0 @@
1/* crypto/bn/bn_mpi.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60#include "cryptlib.h"
61#include "bn_lcl.h"
62
63int BN_bn2mpi(const BIGNUM *a, unsigned char *d)
64 {
65 int bits;
66 int num=0;
67 int ext=0;
68 long l;
69
70 bits=BN_num_bits(a);
71 num=(bits+7)/8;
72 if (bits > 0)
73 {
74 ext=((bits & 0x07) == 0);
75 }
76 if (d == NULL)
77 return(num+4+ext);
78
79 l=num+ext;
80 d[0]=(unsigned char)(l>>24)&0xff;
81 d[1]=(unsigned char)(l>>16)&0xff;
82 d[2]=(unsigned char)(l>> 8)&0xff;
83 d[3]=(unsigned char)(l )&0xff;
84 if (ext) d[4]=0;
85 num=BN_bn2bin(a,&(d[4+ext]));
86 if (a->neg)
87 d[4]|=0x80;
88 return(num+4+ext);
89 }
90
91BIGNUM *BN_mpi2bn(const unsigned char *d, int n, BIGNUM *a)
92 {
93 long len;
94 int neg=0;
95
96 if (n < 4)
97 {
98 BNerr(BN_F_BN_MPI2BN,BN_R_INVALID_LENGTH);
99 return(NULL);
100 }
101 len=((long)d[0]<<24)|((long)d[1]<<16)|((int)d[2]<<8)|(int)d[3];
102 if ((len+4) != n)
103 {
104 BNerr(BN_F_BN_MPI2BN,BN_R_ENCODING_ERROR);
105 return(NULL);
106 }
107
108 if (a == NULL) a=BN_new();
109 if (a == NULL) return(NULL);
110
111 if (len == 0)
112 {
113 a->neg=0;
114 a->top=0;
115 return(a);
116 }
117 d+=4;
118 if ((*d) & 0x80)
119 neg=1;
120 if (BN_bin2bn(d,(int)len,a) == NULL)
121 return(NULL);
122 a->neg=neg;
123 if (neg)
124 {
125 BN_clear_bit(a,BN_num_bits(a)-1);
126 }
127 bn_check_top(a);
128 return(a);
129 }
130
diff --git a/src/lib/libcrypto/bn/bn_mul.c b/src/lib/libcrypto/bn/bn_mul.c
deleted file mode 100644
index 12e5be80eb..0000000000
--- a/src/lib/libcrypto/bn/bn_mul.c
+++ /dev/null
@@ -1,1166 +0,0 @@
1/* crypto/bn/bn_mul.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#ifndef BN_DEBUG
60# undef NDEBUG /* avoid conflicting definitions */
61# define NDEBUG
62#endif
63
64#include <stdio.h>
65#include <assert.h>
66#include "cryptlib.h"
67#include "bn_lcl.h"
68
69#if defined(OPENSSL_NO_ASM) || !defined(OPENSSL_BN_ASM_PART_WORDS)
 70/* Here follow specialised variants of bn_add_words() and
 71   bn_sub_words(). They have the property of performing operations on
 72   arrays of different sizes. The sizes of those arrays are expressed through
73 cl, which is the common length ( basicall, min(len(a),len(b)) ), and dl,
74 which is the delta between the two lengths, calculated as len(a)-len(b).
75 All lengths are the number of BN_ULONGs... For the operations that require
76 a result array as parameter, it must have the length cl+abs(dl).
77 These functions should probably end up in bn_asm.c as soon as there are
78 assembler counterparts for the systems that use assembler files. */
79
/* r = a - b for arrays of different length.  cl is the common length
 * (min of the two); dl = len(a) - len(b).  r must hold cl+abs(dl) words.
 * Returns the final borrow (0 or 1). */
 80BN_ULONG bn_sub_part_words(BN_ULONG *r,
 81	const BN_ULONG *a, const BN_ULONG *b,
 82	int cl, int dl)
 83	{
 84	BN_ULONG c, t;
 85
 86	assert(cl >= 0);
 87	c = bn_sub_words(r, a, b, cl);
 88
 89	if (dl == 0)
 90		return c;
 91
 92	r += cl;
 93	a += cl;
 94	b += cl;
 95
	/* b is longer: remaining words of r are 0 - b[i] - borrow */
 96	if (dl < 0)
 97		{
 98#ifdef BN_COUNT
 99	fprintf(stderr, "  bn_sub_part_words %d + %d (dl < 0, c = %d)\n", cl, dl, c);
100#endif
101		for (;;)
102			{
103			t = b[0];
104			r[0] = (0-t-c)&BN_MASK2;
			/* any non-zero word keeps generating a borrow */
105			if (t != 0) c=1;
106			if (++dl >= 0) break;
107
108			t = b[1];
109			r[1] = (0-t-c)&BN_MASK2;
110			if (t != 0) c=1;
111			if (++dl >= 0) break;
112
113			t = b[2];
114			r[2] = (0-t-c)&BN_MASK2;
115			if (t != 0) c=1;
116			if (++dl >= 0) break;
117
118			t = b[3];
119			r[3] = (0-t-c)&BN_MASK2;
120			if (t != 0) c=1;
121			if (++dl >= 0) break;
122
123			b += 4;
124			r += 4;
125			}
126		}
	/* a is longer: propagate the borrow through a's extra words,
	 * then plain-copy whatever remains once the borrow dies */
127	else
128		{
129		int save_dl = dl;
130#ifdef BN_COUNT
131	fprintf(stderr, "  bn_sub_part_words %d + %d (dl > 0, c = %d)\n", cl, dl, c);
132#endif
133		while(c)
134			{
135			t = a[0];
136			r[0] = (t-c)&BN_MASK2;
137			if (t != 0) c=0;
138			if (--dl <= 0) break;
139
140			t = a[1];
141			r[1] = (t-c)&BN_MASK2;
142			if (t != 0) c=0;
143			if (--dl <= 0) break;
144
145			t = a[2];
146			r[2] = (t-c)&BN_MASK2;
147			if (t != 0) c=0;
148			if (--dl <= 0) break;
149
150			t = a[3];
151			r[3] = (t-c)&BN_MASK2;
152			if (t != 0) c=0;
153			if (--dl <= 0) break;
154
155			save_dl = dl;
156			a += 4;
157			r += 4;
158			}
159		if (dl > 0)
160			{
161#ifdef BN_COUNT
162			fprintf(stderr, "  bn_sub_part_words %d + %d (dl > 0, c == 0)\n", cl, dl);
163#endif
			/* borrow ended mid-group: save_dl - dl tells how many
			 * of the current 4-word group were already written;
			 * the case fall-throughs are deliberate and finish
			 * the group before the pointers are advanced */
164			if (save_dl > dl)
165				{
166				switch (save_dl - dl)
167					{
168				case 1:
169					r[1] = a[1];
170					if (--dl <= 0) break;
171				case 2:
172					r[2] = a[2];
173					if (--dl <= 0) break;
174				case 3:
175					r[3] = a[3];
176					if (--dl <= 0) break;
177					}
178				a += 4;
179				r += 4;
180				}
181			}
182		if (dl > 0)
183			{
184#ifdef BN_COUNT
185			fprintf(stderr, "  bn_sub_part_words %d + %d (dl > 0, copy)\n", cl, dl);
186#endif
			/* no borrow left: straight copy of a's remaining words */
187			for(;;)
188				{
189				r[0] = a[0];
190				if (--dl <= 0) break;
191				r[1] = a[1];
192				if (--dl <= 0) break;
193				r[2] = a[2];
194				if (--dl <= 0) break;
195				r[3] = a[3];
196				if (--dl <= 0) break;
197
198				a += 4;
199				r += 4;
200				}
201			}
202		}
203	return c;
204	}
205#endif
206
/* r = a + b for arrays of different length.  cl is the common length
 * (min of the two); dl = len(a) - len(b).  r must hold cl+abs(dl) words.
 * Returns the final carry (0 or 1). */
207BN_ULONG bn_add_part_words(BN_ULONG *r,
208	const BN_ULONG *a, const BN_ULONG *b,
209	int cl, int dl)
210	{
211	BN_ULONG c, l, t;
212
213	assert(cl >= 0);
214	c = bn_add_words(r, a, b, cl);
215
216	if (dl == 0)
217		return c;
218
219	r += cl;
220	a += cl;
221	b += cl;
222
	/* b is longer: propagate the carry through b's extra words,
	 * then plain-copy whatever remains once the carry dies */
223	if (dl < 0)
224		{
225		int save_dl = dl;
226#ifdef BN_COUNT
227		fprintf(stderr, "  bn_add_part_words %d + %d (dl < 0, c = %d)\n", cl, dl, c);
228#endif
229		while (c)
230			{
231			l=(c+b[0])&BN_MASK2;
232			c=(l < c);
233			r[0]=l;
234			if (++dl >= 0) break;
235
236			l=(c+b[1])&BN_MASK2;
237			c=(l < c);
238			r[1]=l;
239			if (++dl >= 0) break;
240
241			l=(c+b[2])&BN_MASK2;
242			c=(l < c);
243			r[2]=l;
244			if (++dl >= 0) break;
245
246			l=(c+b[3])&BN_MASK2;
247			c=(l < c);
248			r[3]=l;
249			if (++dl >= 0) break;
250
251			save_dl = dl;
252			b+=4;
253			r+=4;
254			}
255		if (dl < 0)
256			{
257#ifdef BN_COUNT
258			fprintf(stderr, "  bn_add_part_words %d + %d (dl < 0, c == 0)\n", cl, dl);
259#endif
			/* carry ended mid-group: dl - save_dl tells how many of
			 * the current 4-word group were already written; the
			 * case fall-throughs are deliberate and finish the
			 * group before the pointers are advanced */
260			if (save_dl < dl)
261				{
262				switch (dl - save_dl)
263					{
264				case 1:
265					r[1] = b[1];
266					if (++dl >= 0) break;
267				case 2:
268					r[2] = b[2];
269					if (++dl >= 0) break;
270				case 3:
271					r[3] = b[3];
272					if (++dl >= 0) break;
273					}
274				b += 4;
275				r += 4;
276				}
277			}
278		if (dl < 0)
279			{
280#ifdef BN_COUNT
281			fprintf(stderr, "  bn_add_part_words %d + %d (dl < 0, copy)\n", cl, dl);
282#endif
			/* no carry left: straight copy of b's remaining words */
283			for(;;)
284				{
285				r[0] = b[0];
286				if (++dl >= 0) break;
287				r[1] = b[1];
288				if (++dl >= 0) break;
289				r[2] = b[2];
290				if (++dl >= 0) break;
291				r[3] = b[3];
292				if (++dl >= 0) break;
293
294				b += 4;
295				r += 4;
296				}
297			}
298		}
	/* a is longer: same scheme, propagating the carry through a */
299	else
300		{
301		int save_dl = dl;
302#ifdef BN_COUNT
303		fprintf(stderr, "  bn_add_part_words %d + %d (dl > 0)\n", cl, dl);
304#endif
305		while (c)
306			{
307			t=(a[0]+c)&BN_MASK2;
308			c=(t < c);
309			r[0]=t;
310			if (--dl <= 0) break;
311
312			t=(a[1]+c)&BN_MASK2;
313			c=(t < c);
314			r[1]=t;
315			if (--dl <= 0) break;
316
317			t=(a[2]+c)&BN_MASK2;
318			c=(t < c);
319			r[2]=t;
320			if (--dl <= 0) break;
321
322			t=(a[3]+c)&BN_MASK2;
323			c=(t < c);
324			r[3]=t;
325			if (--dl <= 0) break;
326
327			save_dl = dl;
328			a+=4;
329			r+=4;
330			}
331#ifdef BN_COUNT
332		fprintf(stderr, "  bn_add_part_words %d + %d (dl > 0, c == 0)\n", cl, dl);
333#endif
334		if (dl > 0)
335			{
			/* finish the partially-processed 4-word group;
			 * fall-throughs deliberate (see dl < 0 branch) */
336			if (save_dl > dl)
337				{
338				switch (save_dl - dl)
339					{
340				case 1:
341					r[1] = a[1];
342					if (--dl <= 0) break;
343				case 2:
344					r[2] = a[2];
345					if (--dl <= 0) break;
346				case 3:
347					r[3] = a[3];
348					if (--dl <= 0) break;
349					}
350				a += 4;
351				r += 4;
352				}
353			}
354		if (dl > 0)
355			{
356#ifdef BN_COUNT
357			fprintf(stderr, "  bn_add_part_words %d + %d (dl > 0, copy)\n", cl, dl);
358#endif
			/* no carry left: straight copy of a's remaining words */
359			for(;;)
360				{
361				r[0] = a[0];
362				if (--dl <= 0) break;
363				r[1] = a[1];
364				if (--dl <= 0) break;
365				r[2] = a[2];
366				if (--dl <= 0) break;
367				r[3] = a[3];
368				if (--dl <= 0) break;
369
370				a += 4;
371				r += 4;
372				}
373			}
374		}
375	return c;
376	}
377
378#ifdef BN_RECURSION
379/* Karatsuba recursive multiplication algorithm
380 * (cf. Knuth, The Art of Computer Programming, Vol. 2) */
381
382/* r is 2*n2 words in size,
383 * a and b are both n2 words in size.
384 * n2 must be a power of 2.
385 * We multiply and return the result.
386 * t must be 2*n2 words in size
387 * We calculate
388 * a[0]*b[0]
389 * a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0])
390 * a[1]*b[1]
391 */
392/* dnX may not be positive, but n2/2+dnX has to be */
/* Karatsuba multiply: r (2*n2 words) = a * b, where a and b are n2+dna
 * and n2+dnb words respectively (dna/dnb <= 0, n2 a power of 2), using
 * t (2*n2 words) as scratch.  Splits each operand into halves of n=n2/2
 * words and combines three half-size products. */
393void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
394	int dna, int dnb, BN_ULONG *t)
395	{
396	int n=n2/2,c1,c2;
397	int tna=n+dna, tnb=n+dnb;
398	unsigned int neg,zero;
399	BN_ULONG ln,lo,*p;
400
401# ifdef BN_COUNT
402	fprintf(stderr," bn_mul_recursive %d%+d * %d%+d\n",n2,dna,n2,dnb);
403# endif
404# ifdef BN_MUL_COMBA
405#  if 0
406	if (n2 == 4)
407		{
408		bn_mul_comba4(r,a,b);
409		return;
410		}
411#  endif
412	/* Only call bn_mul_comba 8 if n2 == 8 and the
413	 * two arrays are complete [steve]
414	 */
415	if (n2 == 8 && dna == 0 && dnb == 0)
416		{
417		bn_mul_comba8(r,a,b);
418		return;
419		}
420# endif /* BN_MUL_COMBA */
421	/* Else do normal multiply */
422	if (n2 < BN_MUL_RECURSIVE_SIZE_NORMAL)
423		{
424		bn_mul_normal(r,a,n2+dna,b,n2+dnb);
		/* zero the top words bn_mul_normal did not produce */
425		if ((dna + dnb) < 0)
426			memset(&r[2*n2 + dna + dnb], 0,
427				sizeof(BN_ULONG) * -(dna + dnb));
428		return;
429		}
430	/* r=(a[0]-a[1])*(b[1]-b[0]) */
	/* c1/c2 are the signs of (al-ah) and (bh-bl); the switch picks
	 * subtraction orders so t holds |al-ah| and |bh-bl|, with `neg`
	 * recording whether the product of the differences is negative
	 * and `zero` whether either difference is zero */
431	c1=bn_cmp_part_words(a,&(a[n]),tna,n-tna);
432	c2=bn_cmp_part_words(&(b[n]),b,tnb,tnb-n);
433	zero=neg=0;
434	switch (c1*3+c2)
435		{
436	case -4:
437		bn_sub_part_words(t,      &(a[n]),a,      tna,tna-n); /* - */
438		bn_sub_part_words(&(t[n]),b,      &(b[n]),tnb,n-tnb); /* - */
439		break;
440	case -3:
441		zero=1;
442		break;
443	case -2:
444		bn_sub_part_words(t,      &(a[n]),a,      tna,tna-n); /* - */
445		bn_sub_part_words(&(t[n]),&(b[n]),b,      tnb,tnb-n); /* + */
446		neg=1;
447		break;
448	case -1:
449	case 0:
450	case 1:
451		zero=1;
452		break;
453	case 2:
454		bn_sub_part_words(t,      a,      &(a[n]),tna,n-tna); /* + */
455		bn_sub_part_words(&(t[n]),b,      &(b[n]),tnb,n-tnb); /* - */
456		neg=1;
457		break;
458	case 3:
459		zero=1;
460		break;
461	case 4:
462		bn_sub_part_words(t,      a,      &(a[n]),tna,n-tna);
463		bn_sub_part_words(&(t[n]),&(b[n]),b,      tnb,tnb-n);
464		break;
465		}
466
	/* three half-size products: t[n2..] = |al-ah|*|bh-bl|,
	 * r[0..] = al*bl, r[n2..] = ah*bh */
467# ifdef BN_MUL_COMBA
468	if (n == 4 && dna == 0 && dnb == 0) /* XXX: bn_mul_comba4 could take
469					       extra args to do this well */
470		{
471		if (!zero)
472			bn_mul_comba4(&(t[n2]),t,&(t[n]));
473		else
474			memset(&(t[n2]),0,8*sizeof(BN_ULONG));
475
476		bn_mul_comba4(r,a,b);
477		bn_mul_comba4(&(r[n2]),&(a[n]),&(b[n]));
478		}
479	else if (n == 8 && dna == 0 && dnb == 0) /* XXX: bn_mul_comba8 could
480						    take extra args to do this
481						    well */
482		{
483		if (!zero)
484			bn_mul_comba8(&(t[n2]),t,&(t[n]));
485		else
486			memset(&(t[n2]),0,16*sizeof(BN_ULONG));
487
488		bn_mul_comba8(r,a,b);
489		bn_mul_comba8(&(r[n2]),&(a[n]),&(b[n]));
490		}
491	else
492# endif /* BN_MUL_COMBA */
493		{
494		p= &(t[n2*2]);
495		if (!zero)
496			bn_mul_recursive(&(t[n2]),t,&(t[n]),n,0,0,p);
497		else
498			memset(&(t[n2]),0,n2*sizeof(BN_ULONG));
499		bn_mul_recursive(r,a,b,n,0,0,p);
500		bn_mul_recursive(&(r[n2]),&(a[n]),&(b[n]),n,dna,dnb,p);
501		}
502
503	/* t[32] holds (a[0]-a[1])*(b[1]-b[0]), c1 is the sign
504	 * r[10] holds (a[0]*b[0])
505	 * r[32] holds (b[1]*b[1])
506	 */
507
508	c1=(int)(bn_add_words(t,r,&(r[n2]),n2));
509
510	if (neg) /* if t[32] is negative */
511		{
512		c1-=(int)(bn_sub_words(&(t[n2]),t,&(t[n2]),n2));
513		}
514	else
515		{
516		/* Might have a carry */
517		c1+=(int)(bn_add_words(&(t[n2]),&(t[n2]),t,n2));
518		}
519
520	/* t[32] holds (a[0]-a[1])*(b[1]-b[0])+(a[0]*b[0])+(a[1]*b[1])
521	 * r[10] holds (a[0]*b[0])
522	 * r[32] holds (b[1]*b[1])
523	 * c1 holds the carry bits
524	 */
	/* add the middle term at offset n, then ripple any leftover
	 * carry c1 up through the high words of r */
525	c1+=(int)(bn_add_words(&(r[n]),&(r[n]),&(t[n2]),n2));
526	if (c1)
527		{
528		p= &(r[n+n2]);
529		lo= *p;
530		ln=(lo+c1)&BN_MASK2;
531		*p=ln;
532
533		/* The overflow will stop before we over write
534		 * words we should not overwrite */
535		if (ln < (BN_ULONG)c1)
536			{
537			do	{
538				p++;
539				lo= *p;
540				ln=(lo+1)&BN_MASK2;
541				*p=ln;
542				} while (ln == 0);
543			}
544		}
545	}
546
547/* n+tn is the word length
548 * t needs to be n*4 is size, as does r */
549/* tnX may not be negative but less than n */
/* Karatsuba multiply for operands of n+tna and n+tnb words (0 <= tnX < n):
 * r = a * b.  r and t must both be 4*n words.  Like bn_mul_recursive but
 * the operands extend PAST the power-of-2 boundary n instead of falling
 * short of it. */
550void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n,
551	int tna, int tnb, BN_ULONG *t)
552	{
553	int i,j,n2=n*2;
554	int c1,c2,neg;
555	BN_ULONG ln,lo,*p;
556
557# ifdef BN_COUNT
558	fprintf(stderr," bn_mul_part_recursive (%d%+d) * (%d%+d)\n",
559		n, tna, n, tnb);
560# endif
561	if (n < 8)
562		{
		/* too small for recursion: schoolbook multiply */
563		bn_mul_normal(r,a,n+tna,b,n+tnb);
564		return;
565		}
566
567	/* r=(a[0]-a[1])*(b[1]-b[0]) */
	/* sign analysis as in bn_mul_recursive; the zero case is not
	 * treated specially here (see comment below) */
568	c1=bn_cmp_part_words(a,&(a[n]),tna,n-tna);
569	c2=bn_cmp_part_words(&(b[n]),b,tnb,tnb-n);
570	neg=0;
571	switch (c1*3+c2)
572		{
573	case -4:
574		bn_sub_part_words(t,      &(a[n]),a,      tna,tna-n); /* - */
575		bn_sub_part_words(&(t[n]),b,      &(b[n]),tnb,n-tnb); /* - */
576		break;
577	case -3:
578		/* break; */
579	case -2:
580		bn_sub_part_words(t,      &(a[n]),a,      tna,tna-n); /* - */
581		bn_sub_part_words(&(t[n]),&(b[n]),b,      tnb,tnb-n); /* + */
582		neg=1;
583		break;
584	case -1:
585	case 0:
586	case 1:
587		/* break; */
588	case 2:
589		bn_sub_part_words(t,      a,      &(a[n]),tna,n-tna); /* + */
590		bn_sub_part_words(&(t[n]),b,      &(b[n]),tnb,n-tnb); /* - */
591		neg=1;
592		break;
593	case 3:
594		/* break; */
595	case 4:
596		bn_sub_part_words(t,      a,      &(a[n]),tna,n-tna);
597		bn_sub_part_words(&(t[n]),&(b[n]),b,      tnb,tnb-n);
598		break;
599		}
600	/* The zero case isn't yet implemented here. The speedup
601	   would probably be negligible. */
602# if 0
603	if (n == 4)
604		{
605		bn_mul_comba4(&(t[n2]),t,&(t[n]));
606		bn_mul_comba4(r,a,b);
607		bn_mul_normal(&(r[n2]),&(a[n]),tn,&(b[n]),tn);
608		memset(&(r[n2+tn*2]),0,sizeof(BN_ULONG)*(n2-tn*2));
609		}
610	else
611# endif
612	if (n == 8)
613		{
614		bn_mul_comba8(&(t[n2]),t,&(t[n]));
615		bn_mul_comba8(r,a,b);
		/* high product ah*bh is only tna*tnb words; zero the rest */
616		bn_mul_normal(&(r[n2]),&(a[n]),tna,&(b[n]),tnb);
617		memset(&(r[n2+tna+tnb]),0,sizeof(BN_ULONG)*(n2-tna-tnb));
618		}
619	else
620		{
621		p= &(t[n2*2]);
622		bn_mul_recursive(&(t[n2]),t,&(t[n]),n,0,0,p);
623		bn_mul_recursive(r,a,b,n,0,0,p);
624		i=n/2;
625		/* If there is only a bottom half to the number,
626		 * just do it */
627		if (tna > tnb)
628			j = tna - i;
629		else
630			j = tnb - i;
631		if (j == 0)
632			{
			/* top halves fit exactly in i words */
633			bn_mul_recursive(&(r[n2]),&(a[n]),&(b[n]),
634				i,tna-i,tnb-i,p);
635			memset(&(r[n2+i*2]),0,sizeof(BN_ULONG)*(n2-i*2));
636			}
637		else if (j > 0) /* eg, n == 16, i == 8 and tn == 11 */
638				{
639				bn_mul_part_recursive(&(r[n2]),&(a[n]),&(b[n]),
640					i,tna-i,tnb-i,p);
641				memset(&(r[n2+tna+tnb]),0,
642					sizeof(BN_ULONG)*(n2-tna-tnb));
643				}
644		else /* (j < 0) eg, n == 16, i == 8 and tn == 5 */
645			{
646			memset(&(r[n2]),0,sizeof(BN_ULONG)*n2);
647			if (tna < BN_MUL_RECURSIVE_SIZE_NORMAL
648				&& tnb < BN_MUL_RECURSIVE_SIZE_NORMAL)
649				{
650				bn_mul_normal(&(r[n2]),&(a[n]),tna,&(b[n]),tnb);
651				}
652			else
653				{
				/* shrink i until it brackets tna/tnb, then
				 * recurse with the matching variant */
654				for (;;)
655					{
656					i/=2;
657					/* these simplified conditions work
658					 * exclusively because difference
659					 * between tna and tnb is 1 or 0 */
660					if (i < tna || i < tnb)
661						{
662						bn_mul_part_recursive(&(r[n2]),
663							&(a[n]),&(b[n]),
664							i,tna-i,tnb-i,p);
665						break;
666						}
667					else if (i == tna || i == tnb)
668						{
669						bn_mul_recursive(&(r[n2]),
670							&(a[n]),&(b[n]),
671							i,tna-i,tnb-i,p);
672						break;
673						}
674					}
675				}
676			}
677		}
678
679	/* t[32] holds (a[0]-a[1])*(b[1]-b[0]), c1 is the sign
680	 * r[10] holds (a[0]*b[0])
681	 * r[32] holds (b[1]*b[1])
682	 */
683
684	c1=(int)(bn_add_words(t,r,&(r[n2]),n2));
685
686	if (neg) /* if t[32] is negative */
687		{
688		c1-=(int)(bn_sub_words(&(t[n2]),t,&(t[n2]),n2));
689		}
690	else
691		{
692		/* Might have a carry */
693		c1+=(int)(bn_add_words(&(t[n2]),&(t[n2]),t,n2));
694		}
695
696	/* t[32] holds (a[0]-a[1])*(b[1]-b[0])+(a[0]*b[0])+(a[1]*b[1])
697	 * r[10] holds (a[0]*b[0])
698	 * r[32] holds (b[1]*b[1])
699	 * c1 holds the carry bits
700	 */
	/* add the middle term at offset n, then ripple any leftover
	 * carry c1 up through the high words of r */
701	c1+=(int)(bn_add_words(&(r[n]),&(r[n]),&(t[n2]),n2));
702	if (c1)
703		{
704		p= &(r[n+n2]);
705		lo= *p;
706		ln=(lo+c1)&BN_MASK2;
707		*p=ln;
708
709		/* The overflow will stop before we over write
710		 * words we should not overwrite */
711		if (ln < (BN_ULONG)c1)
712			{
713			do	{
714				p++;
715				lo= *p;
716				ln=(lo+1)&BN_MASK2;
717				*p=ln;
718				} while (ln == 0);
719			}
720		}
721	}
722
723/* a and b must be the same size, which is n2.
724 * r needs to be n2 words and t needs to be n2*2
725 */
726void bn_mul_low_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
727 BN_ULONG *t)
728 {
729 int n=n2/2;
730
731# ifdef BN_COUNT
732 fprintf(stderr," bn_mul_low_recursive %d * %d\n",n2,n2);
733# endif
734
735 bn_mul_recursive(r,a,b,n,0,0,&(t[0]));
736 if (n >= BN_MUL_LOW_RECURSIVE_SIZE_NORMAL)
737 {
738 bn_mul_low_recursive(&(t[0]),&(a[0]),&(b[n]),n,&(t[n2]));
739 bn_add_words(&(r[n]),&(r[n]),&(t[0]),n);
740 bn_mul_low_recursive(&(t[0]),&(a[n]),&(b[0]),n,&(t[n2]));
741 bn_add_words(&(r[n]),&(r[n]),&(t[0]),n);
742 }
743 else
744 {
745 bn_mul_low_normal(&(t[0]),&(a[0]),&(b[n]),n);
746 bn_mul_low_normal(&(t[n]),&(a[n]),&(b[0]),n);
747 bn_add_words(&(r[n]),&(r[n]),&(t[0]),n);
748 bn_add_words(&(r[n]),&(r[n]),&(t[n]),n);
749 }
750 }
751
752/* a and b must be the same size, which is n2.
753 * r needs to be n2 words and t needs to be n2*2
754 * l is the low words of the output.
755 * t needs to be n2*3
756 */
/* Compute the high n2 words of a*b into r, given l = the low n2 words of
 * the same product (or l == NULL).  a and b are n2 words; t is 3*n2 words
 * of scratch.  Uses the Karatsuba identity to recover high(al*bl) from the
 * known low half instead of computing al*bl in full. */
757void bn_mul_high(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, BN_ULONG *l, int n2,
758	     BN_ULONG *t)
759	{
760	int i,n;
761	int c1,c2;
762	int neg,oneg,zero;
763	BN_ULONG ll,lc,*lp,*mp;
764
765# ifdef BN_COUNT
766	fprintf(stderr," bn_mul_high %d * %d\n",n2,n2);
767# endif
768	n=n2/2;
769
770	/* Calculate (al-ah)*(bh-bl) */
	/* as in bn_mul_recursive: pick subtraction orders so r temporarily
	 * holds the absolute differences, neg their combined sign */
771	neg=zero=0;
772	c1=bn_cmp_words(&(a[0]),&(a[n]),n);
773	c2=bn_cmp_words(&(b[n]),&(b[0]),n);
774	switch (c1*3+c2)
775		{
776	case -4:
777		bn_sub_words(&(r[0]),&(a[n]),&(a[0]),n);
778		bn_sub_words(&(r[n]),&(b[0]),&(b[n]),n);
779		break;
780	case -3:
781		zero=1;
782		break;
783	case -2:
784		bn_sub_words(&(r[0]),&(a[n]),&(a[0]),n);
785		bn_sub_words(&(r[n]),&(b[n]),&(b[0]),n);
786		neg=1;
787		break;
788	case -1:
789	case 0:
790	case 1:
791		zero=1;
792		break;
793	case 2:
794		bn_sub_words(&(r[0]),&(a[0]),&(a[n]),n);
795		bn_sub_words(&(r[n]),&(b[0]),&(b[n]),n);
796		neg=1;
797		break;
798	case 3:
799		zero=1;
800		break;
801	case 4:
802		bn_sub_words(&(r[0]),&(a[0]),&(a[n]),n);
803		bn_sub_words(&(r[n]),&(b[n]),&(b[0]),n);
804		break;
805		}
806
807	oneg=neg;
808	/* t[10] = (a[0]-a[1])*(b[1]-b[0]) */
809	/* r[10] = (a[1]*b[1]) */
810# ifdef BN_MUL_COMBA
811	if (n == 8)
812		{
813		bn_mul_comba8(&(t[0]),&(r[0]),&(r[n]));
814		bn_mul_comba8(r,&(a[n]),&(b[n]));
815		}
816	else
817# endif
818		{
819		bn_mul_recursive(&(t[0]),&(r[0]),&(r[n]),n,0,0,&(t[n2]));
820		bn_mul_recursive(r,&(a[n]),&(b[n]),n,0,0,&(t[n2]));
821		}
822
823	/* s0 == low(al*bl)
824	 * s1 == low(ah*bh)+low((al-ah)*(bh-bl))+low(al*bl)+high(al*bl)
825	 * We know s0 and s1 so the only unknown is high(al*bl)
826	 * high(al*bl) == s1 - low(ah*bh+s0+(al-ah)*(bh-bl))
827	 * high(al*bl) == s1 - (r[0]+l[0]+t[0])
828	 */
829	if (l != NULL)
830		{
831		lp= &(t[n2+n]);
832		c1=(int)(bn_add_words(lp,&(r[0]),&(l[0]),n));
833		}
834	else
835		{
836		c1=0;
837		lp= &(r[0]);
838		}
839
840	if (neg)
841		neg=(int)(bn_sub_words(&(t[n2]),lp,&(t[0]),n));
842	else
843		{
844		bn_add_words(&(t[n2]),lp,&(t[0]),n);
845		neg=0;
846		}
847
848	if (l != NULL)
849		{
849		bn_sub_words(&(t[n2+n]),&(l[n]),&(t[n2]),n);
851		}
852	else
853		{
		/* no low half supplied: negate t[n2..] in place (two's
		 * complement) into t[n2+n..] */
854		lp= &(t[n2+n]);
855		mp= &(t[n2]);
856		for (i=0; i<n; i++)
857			lp[i]=((~mp[i])+1)&BN_MASK2;
858		}
859
860	/* s[0] = low(al*bl)
861	 * t[3] = high(al*bl)
862	 * t[10] = (a[0]-a[1])*(b[1]-b[0]) neg is the sign
863	 * r[10] = (a[1]*b[1])
864	 */
865	/* R[10] = al*bl
866	 * R[21] = al*bl + ah*bh + (a[0]-a[1])*(b[1]-b[0])
867	 * R[32] = ah*bh
868	 */
869	/* R[1]=t[3]+l[0]+r[0](+-)t[0] (have carry/borrow)
870	 * R[2]=r[0]+t[3]+r[1](+-)t[1] (have carry/borrow)
871	 * R[3]=r[1]+(carry/borrow)
872	 */
873	if (l != NULL)
874		{
875		lp= &(t[n2]);
876		c1= (int)(bn_add_words(lp,&(t[n2+n]),&(l[0]),n));
877		}
878	else
879		{
880		lp= &(t[n2+n]);
881		c1=0;
882		}
883	c1+=(int)(bn_add_words(&(t[n2]),lp,  &(r[0]),n));
884	if (oneg)
885		c1-=(int)(bn_sub_words(&(t[n2]),&(t[n2]),&(t[0]),n));
886	else
887		c1+=(int)(bn_add_words(&(t[n2]),&(t[n2]),&(t[0]),n));
888
889	c2 =(int)(bn_add_words(&(r[0]),&(r[0]),&(t[n2+n]),n));
890	c2+=(int)(bn_add_words(&(r[0]),&(r[0]),&(r[n]),n));
891	if (oneg)
892		c2-=(int)(bn_sub_words(&(r[0]),&(r[0]),&(t[n]),n));
893	else
894		c2+=(int)(bn_add_words(&(r[0]),&(r[0]),&(t[n]),n));
895
	/* c1/c2 are signed carry totals for the two result halves;
	 * propagate each up from its offset, adding or subtracting */
896	if (c1 != 0) /* Add starting at r[0], could be +ve or -ve */
897		{
898		i=0;
899		if (c1 > 0)
900			{
901			lc=c1;
902			do	{
903				ll=(r[i]+lc)&BN_MASK2;
904				r[i++]=ll;
905				lc=(lc > ll);
906				} while (lc);
907			}
908		else
909			{
910			lc= -c1;
911			do	{
912				ll=r[i];
913				r[i++]=(ll-lc)&BN_MASK2;
914				lc=(lc > ll);
915				} while (lc);
916			}
917		}
918	if (c2 != 0) /* Add starting at r[1] */
919		{
920		i=n;
921		if (c2 > 0)
922			{
923			lc=c2;
924			do	{
925				ll=(r[i]+lc)&BN_MASK2;
926				r[i++]=ll;
927				lc=(lc > ll);
928				} while (lc);
929			}
930		else
931			{
932			lc= -c2;
933			do	{
934				ll=r[i];
935				r[i++]=(ll-lc)&BN_MASK2;
936				lc=(lc > ll);
937				} while (lc);
938			}
939		}
940	}
941#endif /* BN_RECURSION */
942
/* r = a * b.  Dispatches between Comba fixed-size multiply (8-word
 * operands), Karatsuba recursion (large, similar-sized operands) and
 * schoolbook bn_mul_normal(), using a temporary from ctx when r aliases
 * an operand.  Returns 1 on success, 0 on error. */
943int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
944	{
945	int ret=0;
946	int top,al,bl;
947	BIGNUM *rr;
948#if defined(BN_MUL_COMBA) || defined(BN_RECURSION)
949	int i;
950#endif
951#ifdef BN_RECURSION
952	BIGNUM *t=NULL;
953	int j=0,k;
954#endif
955
956#ifdef BN_COUNT
957	fprintf(stderr,"BN_mul %d * %d\n",a->top,b->top);
958#endif
959
960	bn_check_top(a);
961	bn_check_top(b);
962	bn_check_top(r);
963
964	al=a->top;
965	bl=b->top;
966
967	if ((al == 0) || (bl == 0))
968		{
969		BN_zero(r);
970		return(1);
971		}
972	top=al+bl;
973
974	BN_CTX_start(ctx);
	/* write into a temporary when r aliases an operand */
975	if ((r == a) || (r == b))
976		{
977		if ((rr = BN_CTX_get(ctx)) == NULL) goto err;
978		}
979	else
980		rr = r;
981	rr->neg=a->neg^b->neg;
982
983#if defined(BN_MUL_COMBA) || defined(BN_RECURSION)
984	i = al-bl;
985#endif
986#ifdef BN_MUL_COMBA
987	if (i == 0)
988		{
989# if 0
990		if (al == 4)
991			{
992			if (bn_wexpand(rr,8) == NULL) goto err;
993			rr->top=8;
994			bn_mul_comba4(rr->d,a->d,b->d);
995			goto end;
996			}
997# endif
998		if (al == 8)
999			{
1000			if (bn_wexpand(rr,16) == NULL) goto err;
1001			rr->top=16;
1002			bn_mul_comba8(rr->d,a->d,b->d);
1003			goto end;
1004			}
1005		}
1006#endif /* BN_MUL_COMBA */
#ifdef BN_RECURSION
	/* Karatsuba pays off only for large operands whose lengths differ
	 * by at most one word */
1008	if ((al >= BN_MULL_SIZE_NORMAL) && (bl >= BN_MULL_SIZE_NORMAL))
1009		{
1010		if (i >= -1 && i <= 1)
1011			{
1012			/* Find out the power of two lower or equal
1013			   to the longest of the two numbers */
1014			if (i >= 0)
1015				{
1016				j = BN_num_bits_word((BN_ULONG)al);
1017				}
1018			if (i == -1)
1019				{
1020				j = BN_num_bits_word((BN_ULONG)bl);
1021				}
1022			j = 1<<(j-1);
1023			assert(j <= al || j <= bl);
1024			k = j+j;
1025			t = BN_CTX_get(ctx);
1026			if (t == NULL)
1027				goto err;
1028			if (al > j || bl > j)
1029				{
				/* operands overhang the power of 2 */
1030				if (bn_wexpand(t,k*4) == NULL) goto err;
1031				if (bn_wexpand(rr,k*4) == NULL) goto err;
1032				bn_mul_part_recursive(rr->d,a->d,b->d,
1033					j,al-j,bl-j,t->d);
1034				}
1035			else	/* al <= j || bl <= j */
1036				{
1037				if (bn_wexpand(t,k*2) == NULL) goto err;
1038				if (bn_wexpand(rr,k*2) == NULL) goto err;
1039				bn_mul_recursive(rr->d,a->d,b->d,
1040					j,al-j,bl-j,t->d);
1041				}
1042			rr->top=top;
1043			goto end;
1044			}
1045#if 0
1046		if (i == 1 && !BN_get_flags(b,BN_FLG_STATIC_DATA))
1047			{
1048			BIGNUM *tmp_bn = (BIGNUM *)b;
1049			if (bn_wexpand(tmp_bn,al) == NULL) goto err;
1050			tmp_bn->d[bl]=0;
1051			bl++;
1052			i--;
1053			}
1054		else if (i == -1 && !BN_get_flags(a,BN_FLG_STATIC_DATA))
1055			{
1056			BIGNUM *tmp_bn = (BIGNUM *)a;
1057			if (bn_wexpand(tmp_bn,bl) == NULL) goto err;
1058			tmp_bn->d[al]=0;
1059			al++;
1060			i++;
1061			}
1062		if (i == 0)
1063			{
1064			/* symmetric and > 4 */
1065			/* 16 or larger */
1066			j=BN_num_bits_word((BN_ULONG)al);
1067			j=1<<(j-1);
1068			k=j+j;
1069			t = BN_CTX_get(ctx);
1070			if (al == j) /* exact multiple */
1071				{
1072				if (bn_wexpand(t,k*2) == NULL) goto err;
1073				if (bn_wexpand(rr,k*2) == NULL) goto err;
1074				bn_mul_recursive(rr->d,a->d,b->d,al,t->d);
1075				}
1076			else
1077				{
1078				if (bn_wexpand(t,k*4) == NULL) goto err;
1079				if (bn_wexpand(rr,k*4) == NULL) goto err;
1080				bn_mul_part_recursive(rr->d,a->d,b->d,al-j,j,t->d);
1081				}
1082			rr->top=top;
1083			goto end;
1084			}
1085#endif
1086		}
1087#endif /* BN_RECURSION */
	/* fallback: schoolbook multiply */
1088	if (bn_wexpand(rr,top) == NULL) goto err;
1089	rr->top=top;
1090	bn_mul_normal(rr->d,a->d,al,b->d,bl);
1091
1092#if defined(BN_MUL_COMBA) || defined(BN_RECURSION)
1093end:
1094#endif
	/* trim leading zero words, copy back if we used a temporary */
1095	bn_correct_top(rr);
1096	if (r != rr) BN_copy(r,rr);
1097	ret=1;
1098err:
1099	bn_check_top(r);
1100	BN_CTX_end(ctx);
1101	return(ret);
1102	}
1103
1104void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb)
1105 {
1106 BN_ULONG *rr;
1107
1108#ifdef BN_COUNT
1109 fprintf(stderr," bn_mul_normal %d * %d\n",na,nb);
1110#endif
1111
1112 if (na < nb)
1113 {
1114 int itmp;
1115 BN_ULONG *ltmp;
1116
1117 itmp=na; na=nb; nb=itmp;
1118 ltmp=a; a=b; b=ltmp;
1119
1120 }
1121 rr= &(r[na]);
1122 if (nb <= 0)
1123 {
1124 (void)bn_mul_words(r,a,na,0);
1125 return;
1126 }
1127 else
1128 rr[0]=bn_mul_words(r,a,na,b[0]);
1129
1130 for (;;)
1131 {
1132 if (--nb <= 0) return;
1133 rr[1]=bn_mul_add_words(&(r[1]),a,na,b[1]);
1134 if (--nb <= 0) return;
1135 rr[2]=bn_mul_add_words(&(r[2]),a,na,b[2]);
1136 if (--nb <= 0) return;
1137 rr[3]=bn_mul_add_words(&(r[3]),a,na,b[3]);
1138 if (--nb <= 0) return;
1139 rr[4]=bn_mul_add_words(&(r[4]),a,na,b[4]);
1140 rr+=4;
1141 r+=4;
1142 b+=4;
1143 }
1144 }
1145
1146void bn_mul_low_normal(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1147 {
1148#ifdef BN_COUNT
1149 fprintf(stderr," bn_mul_low_normal %d * %d\n",n,n);
1150#endif
1151 bn_mul_words(r,a,n,b[0]);
1152
1153 for (;;)
1154 {
1155 if (--n <= 0) return;
1156 bn_mul_add_words(&(r[1]),a,n,b[1]);
1157 if (--n <= 0) return;
1158 bn_mul_add_words(&(r[2]),a,n,b[2]);
1159 if (--n <= 0) return;
1160 bn_mul_add_words(&(r[3]),a,n,b[3]);
1161 if (--n <= 0) return;
1162 bn_mul_add_words(&(r[4]),a,n,b[4]);
1163 r+=4;
1164 b+=4;
1165 }
1166 }
diff --git a/src/lib/libcrypto/bn/bn_nist.c b/src/lib/libcrypto/bn/bn_nist.c
deleted file mode 100644
index 43caee4770..0000000000
--- a/src/lib/libcrypto/bn/bn_nist.c
+++ /dev/null
@@ -1,1102 +0,0 @@
1/* crypto/bn/bn_nist.c */
2/*
3 * Written by Nils Larsch for the OpenSSL project
4 */
5/* ====================================================================
6 * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 *
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
18 * distribution.
19 *
20 * 3. All advertising materials mentioning features or use of this
21 * software must display the following acknowledgment:
22 * "This product includes software developed by the OpenSSL Project
23 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
24 *
25 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
26 * endorse or promote products derived from this software without
27 * prior written permission. For written permission, please contact
28 * openssl-core@openssl.org.
29 *
30 * 5. Products derived from this software may not be called "OpenSSL"
31 * nor may "OpenSSL" appear in their names without prior written
32 * permission of the OpenSSL Project.
33 *
34 * 6. Redistributions of any form whatsoever must retain the following
35 * acknowledgment:
36 * "This product includes software developed by the OpenSSL Project
37 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
38 *
39 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
40 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
41 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
42 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
43 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
45 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
46 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
48 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
49 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
50 * OF THE POSSIBILITY OF SUCH DAMAGE.
51 * ====================================================================
52 *
53 * This product includes cryptographic software written by Eric Young
54 * (eay@cryptsoft.com). This product includes software written by Tim
55 * Hudson (tjh@cryptsoft.com).
56 *
57 */
58
59#include "bn_lcl.h"
60#include "cryptlib.h"
61
62
/* Word counts (in BN_ULONG units) needed to hold each NIST prime. */
#define BN_NIST_192_TOP	(192+BN_BITS2-1)/BN_BITS2
#define BN_NIST_224_TOP	(224+BN_BITS2-1)/BN_BITS2
#define BN_NIST_256_TOP	(256+BN_BITS2-1)/BN_BITS2
#define BN_NIST_384_TOP	(384+BN_BITS2-1)/BN_BITS2
#define BN_NIST_521_TOP	(521+BN_BITS2-1)/BN_BITS2

/* pre-computed tables are "carry-less" values of modulus*(i+1);
 * the reduction code subtracts/adds _nist_p_xxx[carry-1] to fold a
 * word-level carry of magnitude `carry` back into range.  The *_sqr
 * tables are presumably modulus^2 (NOTE(review): values not re-derived
 * here) and serve as the cut-off above which BN_nnmod() is used. */
#if BN_BITS2 == 64
static const BN_ULONG _nist_p_192[][BN_NIST_192_TOP] = {
	{0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFEULL,0xFFFFFFFFFFFFFFFFULL},
	{0xFFFFFFFFFFFFFFFEULL,0xFFFFFFFFFFFFFFFDULL,0xFFFFFFFFFFFFFFFFULL},
	{0xFFFFFFFFFFFFFFFDULL,0xFFFFFFFFFFFFFFFCULL,0xFFFFFFFFFFFFFFFFULL}
	};
static const BN_ULONG _nist_p_192_sqr[] = {
	0x0000000000000001ULL,0x0000000000000002ULL,0x0000000000000001ULL,
	0xFFFFFFFFFFFFFFFEULL,0xFFFFFFFFFFFFFFFDULL,0xFFFFFFFFFFFFFFFFULL
	};
static const BN_ULONG _nist_p_224[][BN_NIST_224_TOP] = {
	{0x0000000000000001ULL,0xFFFFFFFF00000000ULL,
	 0xFFFFFFFFFFFFFFFFULL,0x00000000FFFFFFFFULL},
	{0x0000000000000002ULL,0xFFFFFFFE00000000ULL,
	 0xFFFFFFFFFFFFFFFFULL,0x00000001FFFFFFFFULL} /* this one is "carry-full" */
	};
static const BN_ULONG _nist_p_224_sqr[] = {
	0x0000000000000001ULL,0xFFFFFFFE00000000ULL,
	0xFFFFFFFFFFFFFFFFULL,0x0000000200000000ULL,
	0x0000000000000000ULL,0xFFFFFFFFFFFFFFFEULL,
	0xFFFFFFFFFFFFFFFFULL
	};
static const BN_ULONG _nist_p_256[][BN_NIST_256_TOP] = {
	{0xFFFFFFFFFFFFFFFFULL,0x00000000FFFFFFFFULL,
	 0x0000000000000000ULL,0xFFFFFFFF00000001ULL},
	{0xFFFFFFFFFFFFFFFEULL,0x00000001FFFFFFFFULL,
	 0x0000000000000000ULL,0xFFFFFFFE00000002ULL},
	{0xFFFFFFFFFFFFFFFDULL,0x00000002FFFFFFFFULL,
	 0x0000000000000000ULL,0xFFFFFFFD00000003ULL},
	{0xFFFFFFFFFFFFFFFCULL,0x00000003FFFFFFFFULL,
	 0x0000000000000000ULL,0xFFFFFFFC00000004ULL},
	{0xFFFFFFFFFFFFFFFBULL,0x00000004FFFFFFFFULL,
	 0x0000000000000000ULL,0xFFFFFFFB00000005ULL},
	};
static const BN_ULONG _nist_p_256_sqr[] = {
	0x0000000000000001ULL,0xFFFFFFFE00000000ULL,
	0xFFFFFFFFFFFFFFFFULL,0x00000001FFFFFFFEULL,
	0x00000001FFFFFFFEULL,0x00000001FFFFFFFEULL,
	0xFFFFFFFE00000001ULL,0xFFFFFFFE00000002ULL
	};
static const BN_ULONG _nist_p_384[][BN_NIST_384_TOP] = {
	{0x00000000FFFFFFFFULL,0xFFFFFFFF00000000ULL,0xFFFFFFFFFFFFFFFEULL,
	 0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL},
	{0x00000001FFFFFFFEULL,0xFFFFFFFE00000000ULL,0xFFFFFFFFFFFFFFFDULL,
	 0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL},
	{0x00000002FFFFFFFDULL,0xFFFFFFFD00000000ULL,0xFFFFFFFFFFFFFFFCULL,
	 0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL},
	{0x00000003FFFFFFFCULL,0xFFFFFFFC00000000ULL,0xFFFFFFFFFFFFFFFBULL,
	 0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL},
	{0x00000004FFFFFFFBULL,0xFFFFFFFB00000000ULL,0xFFFFFFFFFFFFFFFAULL,
	 0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL},
	};
static const BN_ULONG _nist_p_384_sqr[] = {
	0xFFFFFFFE00000001ULL,0x0000000200000000ULL,0xFFFFFFFE00000000ULL,
	0x0000000200000000ULL,0x0000000000000001ULL,0x0000000000000000ULL,
	0x00000001FFFFFFFEULL,0xFFFFFFFE00000000ULL,0xFFFFFFFFFFFFFFFDULL,
	0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL
	};
static const BN_ULONG _nist_p_521[] =
	{0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,
	0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,
	0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,
	0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,
	0x00000000000001FFULL};
static const BN_ULONG _nist_p_521_sqr[] = {
	0x0000000000000001ULL,0x0000000000000000ULL,0x0000000000000000ULL,
	0x0000000000000000ULL,0x0000000000000000ULL,0x0000000000000000ULL,
	0x0000000000000000ULL,0x0000000000000000ULL,0xFFFFFFFFFFFFFC00ULL,
	0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,
	0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,
	0xFFFFFFFFFFFFFFFFULL,0x000000000003FFFFULL
	};
#elif BN_BITS2 == 32
/* 32-bit variants of the same tables, little-endian word order. */
static const BN_ULONG _nist_p_192[][BN_NIST_192_TOP] = {
	{0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFE,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
	{0xFFFFFFFE,0xFFFFFFFF,0xFFFFFFFD,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
	{0xFFFFFFFD,0xFFFFFFFF,0xFFFFFFFC,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF}
	};
static const BN_ULONG _nist_p_192_sqr[] = {
	0x00000001,0x00000000,0x00000002,0x00000000,0x00000001,0x00000000,
	0xFFFFFFFE,0xFFFFFFFF,0xFFFFFFFD,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF
	};
static const BN_ULONG _nist_p_224[][BN_NIST_224_TOP] = {
	{0x00000001,0x00000000,0x00000000,0xFFFFFFFF,
	 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
	{0x00000002,0x00000000,0x00000000,0xFFFFFFFE,
	 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF}
	};
static const BN_ULONG _nist_p_224_sqr[] = {
	0x00000001,0x00000000,0x00000000,0xFFFFFFFE,
	0xFFFFFFFF,0xFFFFFFFF,0x00000000,0x00000002,
	0x00000000,0x00000000,0xFFFFFFFE,0xFFFFFFFF,
	0xFFFFFFFF,0xFFFFFFFF
	};
static const BN_ULONG _nist_p_256[][BN_NIST_256_TOP] = {
	{0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0x00000000,
	 0x00000000,0x00000000,0x00000001,0xFFFFFFFF},
	{0xFFFFFFFE,0xFFFFFFFF,0xFFFFFFFF,0x00000001,
	 0x00000000,0x00000000,0x00000002,0xFFFFFFFE},
	{0xFFFFFFFD,0xFFFFFFFF,0xFFFFFFFF,0x00000002,
	 0x00000000,0x00000000,0x00000003,0xFFFFFFFD},
	{0xFFFFFFFC,0xFFFFFFFF,0xFFFFFFFF,0x00000003,
	 0x00000000,0x00000000,0x00000004,0xFFFFFFFC},
	{0xFFFFFFFB,0xFFFFFFFF,0xFFFFFFFF,0x00000004,
	 0x00000000,0x00000000,0x00000005,0xFFFFFFFB},
	};
static const BN_ULONG _nist_p_256_sqr[] = {
	0x00000001,0x00000000,0x00000000,0xFFFFFFFE,
	0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFE,0x00000001,
	0xFFFFFFFE,0x00000001,0xFFFFFFFE,0x00000001,
	0x00000001,0xFFFFFFFE,0x00000002,0xFFFFFFFE
	};
static const BN_ULONG _nist_p_384[][BN_NIST_384_TOP] = {
	{0xFFFFFFFF,0x00000000,0x00000000,0xFFFFFFFF,0xFFFFFFFE,0xFFFFFFFF,
	 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
	{0xFFFFFFFE,0x00000001,0x00000000,0xFFFFFFFE,0xFFFFFFFD,0xFFFFFFFF,
	 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
	{0xFFFFFFFD,0x00000002,0x00000000,0xFFFFFFFD,0xFFFFFFFC,0xFFFFFFFF,
	 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
	{0xFFFFFFFC,0x00000003,0x00000000,0xFFFFFFFC,0xFFFFFFFB,0xFFFFFFFF,
	 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
	{0xFFFFFFFB,0x00000004,0x00000000,0xFFFFFFFB,0xFFFFFFFA,0xFFFFFFFF,
	 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
	};
static const BN_ULONG _nist_p_384_sqr[] = {
	0x00000001,0xFFFFFFFE,0x00000000,0x00000002,0x00000000,0xFFFFFFFE,
	0x00000000,0x00000002,0x00000001,0x00000000,0x00000000,0x00000000,
	0xFFFFFFFE,0x00000001,0x00000000,0xFFFFFFFE,0xFFFFFFFD,0xFFFFFFFF,
	0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF
	};
static const BN_ULONG _nist_p_521[] = {0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
	0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
	0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
	0xFFFFFFFF,0x000001FF};
static const BN_ULONG _nist_p_521_sqr[] = {
	0x00000001,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,
	0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,
	0x00000000,0x00000000,0x00000000,0x00000000,0xFFFFFC00,0xFFFFFFFF,
	0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
	0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
	0xFFFFFFFF,0xFFFFFFFF,0x0003FFFF
	};
#else
#error "unsupported BN_BITS2"
#endif


/* Read-only BIGNUM wrappers around the first row of each table (the
 * modulus itself).  NOTE(review): initializer order assumed to follow
 * struct bignum_st {d, top, dmax, neg, flags} — confirm against bn.h.
 * BN_FLG_STATIC_DATA marks the word array as non-freeable. */
static const BIGNUM _bignum_nist_p_192 =
	{
	(BN_ULONG *)_nist_p_192[0],
	BN_NIST_192_TOP,
	BN_NIST_192_TOP,
	0,
	BN_FLG_STATIC_DATA
	};

static const BIGNUM _bignum_nist_p_224 =
	{
	(BN_ULONG *)_nist_p_224[0],
	BN_NIST_224_TOP,
	BN_NIST_224_TOP,
	0,
	BN_FLG_STATIC_DATA
	};

static const BIGNUM _bignum_nist_p_256 =
	{
	(BN_ULONG *)_nist_p_256[0],
	BN_NIST_256_TOP,
	BN_NIST_256_TOP,
	0,
	BN_FLG_STATIC_DATA
	};

static const BIGNUM _bignum_nist_p_384 =
	{
	(BN_ULONG *)_nist_p_384[0],
	BN_NIST_384_TOP,
	BN_NIST_384_TOP,
	0,
	BN_FLG_STATIC_DATA
	};

static const BIGNUM _bignum_nist_p_521 =
	{
	(BN_ULONG *)_nist_p_521,
	BN_NIST_521_TOP,
	BN_NIST_521_TOP,
	0,
	BN_FLG_STATIC_DATA
	};
261
262
263const BIGNUM *BN_get0_nist_prime_192(void)
264 {
265 return &_bignum_nist_p_192;
266 }
267
268const BIGNUM *BN_get0_nist_prime_224(void)
269 {
270 return &_bignum_nist_p_224;
271 }
272
273const BIGNUM *BN_get0_nist_prime_256(void)
274 {
275 return &_bignum_nist_p_256;
276 }
277
278const BIGNUM *BN_get0_nist_prime_384(void)
279 {
280 return &_bignum_nist_p_384;
281 }
282
283const BIGNUM *BN_get0_nist_prime_521(void)
284 {
285 return &_bignum_nist_p_521;
286 }
287
288
289static void nist_cp_bn_0(BN_ULONG *buf, BN_ULONG *a, int top, int max)
290 {
291 int i;
292 BN_ULONG *_tmp1 = (buf), *_tmp2 = (a);
293
294#ifdef BN_DEBUG
295 OPENSSL_assert(top <= max);
296#endif
297 for (i = (top); i != 0; i--)
298 *_tmp1++ = *_tmp2++;
299 for (i = (max) - (top); i != 0; i--)
300 *_tmp1++ = (BN_ULONG) 0;
301 }
302
303static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top)
304 {
305 int i;
306 BN_ULONG *_tmp1 = (buf), *_tmp2 = (a);
307 for (i = (top); i != 0; i--)
308 *_tmp1++ = *_tmp2++;
309 }
310
311#if BN_BITS2 == 64
312#define bn_cp_64(to, n, from, m) (to)[n] = (m>=0)?((from)[m]):0;
313#define bn_64_set_0(to, n) (to)[n] = (BN_ULONG)0;
314/*
315 * two following macros are implemented under assumption that they
316 * are called in a sequence with *ascending* n, i.e. as they are...
317 */
318#define bn_cp_32_naked(to, n, from, m) (((n)&1)?(to[(n)/2]|=((m)&1)?(from[(m)/2]&BN_MASK2h):(from[(m)/2]<<32))\
319 :(to[(n)/2] =((m)&1)?(from[(m)/2]>>32):(from[(m)/2]&BN_MASK2l)))
320#define bn_32_set_0(to, n) (((n)&1)?(to[(n)/2]&=BN_MASK2l):(to[(n)/2]=0));
321#define bn_cp_32(to,n,from,m) ((m)>=0)?bn_cp_32_naked(to,n,from,m):bn_32_set_0(to,n)
322# if defined(L_ENDIAN)
323# if defined(__arch64__)
324# define NIST_INT64 long
325# else
326# define NIST_INT64 long long
327# endif
328# endif
329#else
330#define bn_cp_64(to, n, from, m) \
331 { \
332 bn_cp_32(to, (n)*2, from, (m)*2); \
333 bn_cp_32(to, (n)*2+1, from, (m)*2+1); \
334 }
335#define bn_64_set_0(to, n) \
336 { \
337 bn_32_set_0(to, (n)*2); \
338 bn_32_set_0(to, (n)*2+1); \
339 }
340#define bn_cp_32(to, n, from, m) (to)[n] = (m>=0)?((from)[m]):0;
341#define bn_32_set_0(to, n) (to)[n] = (BN_ULONG)0;
342# if defined(_WIN32) && !defined(__GNUC__)
343# define NIST_INT64 __int64
344# elif defined(BN_LLONG)
345# define NIST_INT64 long long
346# endif
347#endif /* BN_BITS2 != 64 */
348
349#define nist_set_192(to, from, a1, a2, a3) \
350 { \
351 bn_cp_64(to, 0, from, (a3) - 3) \
352 bn_cp_64(to, 1, from, (a2) - 3) \
353 bn_cp_64(to, 2, from, (a1) - 3) \
354 }
355
/*
 * BN_nist_mod_192 - fast reduction r = a mod P-192.
 *
 * Fast path requires 0 <= a < p^2; anything else falls back to the
 * generic BN_nnmod().  The `field` argument is ignored and forced to
 * the built-in P-192 constant.  Returns 1 on success, 0 on error
 * (allocation/copy failure).
 */
int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
	BN_CTX *ctx)
	{
	int top = a->top, i;
	int carry;
	register BN_ULONG *r_d, *a_d = a->d;
	union {
		BN_ULONG bn[BN_NIST_192_TOP];
		unsigned int ui[BN_NIST_192_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
		} buf;
	BN_ULONG c_d[BN_NIST_192_TOP],
		*res;
	PTR_SIZE_INT mask;
	static const BIGNUM _bignum_nist_p_192_sqr = {
		(BN_ULONG *)_nist_p_192_sqr,
		sizeof(_nist_p_192_sqr)/sizeof(_nist_p_192_sqr[0]),
		sizeof(_nist_p_192_sqr)/sizeof(_nist_p_192_sqr[0]),
		0,BN_FLG_STATIC_DATA };

	field = &_bignum_nist_p_192; /* just to make sure */

	/* Out of the fast path's input range: use generic reduction. */
	if (BN_is_negative(a) || BN_ucmp(a,&_bignum_nist_p_192_sqr)>=0)
		return BN_nnmod(r, a, field, ctx);

	i = BN_ucmp(field, a);
	if (i == 0)
		{
		BN_zero(r);
		return 1;
		}
	else if (i > 0)
		/* a < p already: the reduction is a plain copy. */
		return (r == a) ? 1 : (BN_copy(r ,a) != NULL);

	if (r != a)
		{
		if (!bn_wexpand(r, BN_NIST_192_TOP))
			return 0;
		r_d = r->d;
		nist_cp_bn(r_d, a_d, BN_NIST_192_TOP);
		}
	else
		r_d = a_d;

	/* buf <- upper half of a (zero-padded); r_d keeps the low 192 bits. */
	nist_cp_bn_0(buf.bn, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP);

#if defined(NIST_INT64)
	{
	NIST_INT64 acc;	/* accumulator */
	unsigned int *rp=(unsigned int *)r_d;
	const unsigned int *bp=(const unsigned int *)buf.ui;

	/* Column-wise 32-bit accumulation of the folded upper words;
	 * bp[i*2-6] / bp[i*2-5] are the low/high 32-bit halves of
	 * 64-bit word i of a (i in 3..5).  Carries propagate in acc. */
	acc = rp[0];	acc += bp[3*2-6];
			acc += bp[5*2-6]; rp[0] = (unsigned int)acc; acc >>= 32;

	acc += rp[1];	acc += bp[3*2-5];
			acc += bp[5*2-5]; rp[1] = (unsigned int)acc; acc >>= 32;

	acc += rp[2];	acc += bp[3*2-6];
			acc += bp[4*2-6];
			acc += bp[5*2-6]; rp[2] = (unsigned int)acc; acc >>= 32;

	acc += rp[3];	acc += bp[3*2-5];
			acc += bp[4*2-5];
			acc += bp[5*2-5]; rp[3] = (unsigned int)acc; acc >>= 32;

	acc += rp[4];	acc += bp[4*2-6];
			acc += bp[5*2-6]; rp[4] = (unsigned int)acc; acc >>= 32;

	acc += rp[5];	acc += bp[4*2-5];
			acc += bp[5*2-5]; rp[5] = (unsigned int)acc;

	carry = (int)(acc>>32);
	}
#else
	{
	BN_ULONG t_d[BN_NIST_192_TOP];

	nist_set_192(t_d, buf.bn, 0, 3, 3);
	carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
	nist_set_192(t_d, buf.bn, 4, 4, 0);
	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
	/* no ';' needed here: nist_set_192 expands to a braced block */
	nist_set_192(t_d, buf.bn, 5, 5, 5)
	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
	}
#endif
	/* Fold the accumulated word carry back by subtracting carry*p. */
	if (carry > 0)
		carry = (int)bn_sub_words(r_d,r_d,_nist_p_192[carry-1],BN_NIST_192_TOP);
	else
		carry = 1;

	/*
	 * we need 'if (carry==0 || result>=modulus) result-=modulus;'
	 * as comparison implies subtraction, we can write
	 * 'tmp=result-modulus; if (!carry || !borrow) result=tmp;'
	 * this is what happens below, but without explicit if:-) —
	 * a branch-free (constant-time style) conditional select.
	 */
	mask = 0-(PTR_SIZE_INT)bn_sub_words(c_d,r_d,_nist_p_192[0],BN_NIST_192_TOP);
	mask &= 0-(PTR_SIZE_INT)carry;
	res = (BN_ULONG *)
	 (((PTR_SIZE_INT)c_d&~mask) | ((PTR_SIZE_INT)r_d&mask));
	nist_cp_bn(r_d, res, BN_NIST_192_TOP);
	r->top = BN_NIST_192_TOP;
	bn_correct_top(r);

	return 1;
	}
462
463typedef BN_ULONG (*bn_addsub_f)(BN_ULONG *,const BN_ULONG *,const BN_ULONG *,int);
464
465#define nist_set_224(to, from, a1, a2, a3, a4, a5, a6, a7) \
466 { \
467 bn_cp_32(to, 0, from, (a7) - 7) \
468 bn_cp_32(to, 1, from, (a6) - 7) \
469 bn_cp_32(to, 2, from, (a5) - 7) \
470 bn_cp_32(to, 3, from, (a4) - 7) \
471 bn_cp_32(to, 4, from, (a3) - 7) \
472 bn_cp_32(to, 5, from, (a2) - 7) \
473 bn_cp_32(to, 6, from, (a1) - 7) \
474 }
475
/*
 * BN_nist_mod_224 - fast reduction r = a mod P-224.
 *
 * Fast path requires 0 <= a < p^2; otherwise falls back to BN_nnmod().
 * `field` is ignored and forced to the built-in P-224 constant.
 * Returns 1 on success, 0 on error.  Here the folded terms are both
 * added and subtracted, so `carry` may go negative; u.f selects the
 * final fix-up primitive (add vs sub) accordingly.
 */
int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
	BN_CTX *ctx)
	{
	int top = a->top, i;
	int carry;
	BN_ULONG *r_d, *a_d = a->d;
	BN_ULONG buf[BN_NIST_224_TOP],
		c_d[BN_NIST_224_TOP],
		*res;
	PTR_SIZE_INT mask;
	union { bn_addsub_f f; PTR_SIZE_INT p; } u;
	static const BIGNUM _bignum_nist_p_224_sqr = {
		(BN_ULONG *)_nist_p_224_sqr,
		sizeof(_nist_p_224_sqr)/sizeof(_nist_p_224_sqr[0]),
		sizeof(_nist_p_224_sqr)/sizeof(_nist_p_224_sqr[0]),
		0,BN_FLG_STATIC_DATA };


	field = &_bignum_nist_p_224; /* just to make sure */

	if (BN_is_negative(a) || BN_ucmp(a,&_bignum_nist_p_224_sqr)>=0)
		return BN_nnmod(r, a, field, ctx);

	i = BN_ucmp(field, a);
	if (i == 0)
		{
		BN_zero(r);
		return 1;
		}
	else if (i > 0)
		/* a < p already: the reduction is a plain copy. */
		return (r == a)? 1 : (BN_copy(r ,a) != NULL);

	if (r != a)
		{
		if (!bn_wexpand(r, BN_NIST_224_TOP))
			return 0;
		r_d = r->d;
		nist_cp_bn(r_d, a_d, BN_NIST_224_TOP);
		}
	else
		r_d = a_d;

#if BN_BITS2==64
	/* copy upper 256 bits of 448 bit number ... */
	nist_cp_bn_0(c_d, a_d + (BN_NIST_224_TOP-1), top - (BN_NIST_224_TOP-1), BN_NIST_224_TOP);
	/* ... and right shift by 32 to obtain upper 224 bits */
	nist_set_224(buf, c_d, 14, 13, 12, 11, 10, 9, 8);
	/* truncate lower part to 224 bits too */
	r_d[BN_NIST_224_TOP-1] &= BN_MASK2l;
#else
	nist_cp_bn_0(buf, a_d + BN_NIST_224_TOP, top - BN_NIST_224_TOP, BN_NIST_224_TOP);
#endif

#if defined(NIST_INT64) && BN_BITS2!=64
	{
	NIST_INT64 acc;	/* accumulator */
	unsigned int *rp=(unsigned int *)r_d;
	const unsigned int *bp=(const unsigned int *)buf;

	/* Column-wise signed accumulation of the folded upper 32-bit
	 * words (bp[i-7] is 32-bit word i of a, i in 7..13). */
	acc = rp[0];	acc -= bp[7-7];
			acc -= bp[11-7]; rp[0] = (unsigned int)acc; acc >>= 32;

	acc += rp[1];	acc -= bp[8-7];
			acc -= bp[12-7]; rp[1] = (unsigned int)acc; acc >>= 32;

	acc += rp[2];	acc -= bp[9-7];
			acc -= bp[13-7]; rp[2] = (unsigned int)acc; acc >>= 32;

	acc += rp[3];	acc += bp[7-7];
			acc += bp[11-7];
			acc -= bp[10-7]; rp[3] = (unsigned int)acc; acc>>= 32;

	acc += rp[4];	acc += bp[8-7];
			acc += bp[12-7];
			acc -= bp[11-7]; rp[4] = (unsigned int)acc; acc >>= 32;

	acc += rp[5];	acc += bp[9-7];
			acc += bp[13-7];
			acc -= bp[12-7]; rp[5] = (unsigned int)acc; acc >>= 32;

	acc += rp[6];	acc += bp[10-7];
			acc -= bp[13-7]; rp[6] = (unsigned int)acc;

	carry = (int)(acc>>32);
# if BN_BITS2==64
	rp[7] = carry;
# endif
	}
#else
	{
	BN_ULONG t_d[BN_NIST_224_TOP];

	nist_set_224(t_d, buf, 10, 9, 8, 7, 0, 0, 0);
	carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP);
	nist_set_224(t_d, buf, 0, 13, 12, 11, 0, 0, 0);
	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP);
	nist_set_224(t_d, buf, 13, 12, 11, 10, 9, 8, 7);
	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP);
	nist_set_224(t_d, buf, 0, 0, 0, 0, 13, 12, 11);
	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP);

#if BN_BITS2==64
	/* On 64-bit words the true carry lives in bit 32 of the top word. */
	carry = (int)(r_d[BN_NIST_224_TOP-1]>>32);
#endif
	}
#endif
	u.f = bn_sub_words;
	if (carry > 0)
		{
		carry = (int)bn_sub_words(r_d,r_d,_nist_p_224[carry-1],BN_NIST_224_TOP);
#if BN_BITS2==64
		carry=(int)(~(r_d[BN_NIST_224_TOP-1]>>32))&1;
#endif
		}
	else if (carry < 0)
		{
		/* it's a bit more complicated logic in this case.
		 * if bn_add_words yields no carry, then result
		 * has to be adjusted by unconditionally *adding*
		 * the modulus. but if it does, then result has
		 * to be compared to the modulus and conditionally
		 * adjusted by *subtracting* the latter. */
		carry = (int)bn_add_words(r_d,r_d,_nist_p_224[-carry-1],BN_NIST_224_TOP);
		mask = 0-(PTR_SIZE_INT)carry;
		/* branch-free selection of add vs sub for the final step */
		u.p = ((PTR_SIZE_INT)bn_sub_words&mask) |
			((PTR_SIZE_INT)bn_add_words&~mask);
		}
	else
		carry = 1;

	/* otherwise it's effectively same as in BN_nist_mod_192... */
	mask = 0-(PTR_SIZE_INT)(*u.f)(c_d,r_d,_nist_p_224[0],BN_NIST_224_TOP);
	mask &= 0-(PTR_SIZE_INT)carry;
	res = (BN_ULONG *)(((PTR_SIZE_INT)c_d&~mask) |
		((PTR_SIZE_INT)r_d&mask));
	nist_cp_bn(r_d, res, BN_NIST_224_TOP);
	r->top = BN_NIST_224_TOP;
	bn_correct_top(r);

	return 1;
	}
617
618#define nist_set_256(to, from, a1, a2, a3, a4, a5, a6, a7, a8) \
619 { \
620 bn_cp_32(to, 0, from, (a8) - 8) \
621 bn_cp_32(to, 1, from, (a7) - 8) \
622 bn_cp_32(to, 2, from, (a6) - 8) \
623 bn_cp_32(to, 3, from, (a5) - 8) \
624 bn_cp_32(to, 4, from, (a4) - 8) \
625 bn_cp_32(to, 5, from, (a3) - 8) \
626 bn_cp_32(to, 6, from, (a2) - 8) \
627 bn_cp_32(to, 7, from, (a1) - 8) \
628 }
629
/*
 * BN_nist_mod_256 - fast reduction r = a mod P-256.
 *
 * Fast path requires 0 <= a < p^2; otherwise falls back to BN_nnmod().
 * `field` is ignored and forced to the built-in P-256 constant.
 * Returns 1 on success, 0 on error.  Carry may be negative; u.f picks
 * the final conditional fix-up primitive (see BN_nist_mod_224).
 */
int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
	BN_CTX *ctx)
	{
	int i, top = a->top;
	int carry = 0;
	register BN_ULONG *a_d = a->d, *r_d;
	union {
		BN_ULONG bn[BN_NIST_256_TOP];
		unsigned int ui[BN_NIST_256_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
		} buf;
	BN_ULONG c_d[BN_NIST_256_TOP],
		*res;
	PTR_SIZE_INT mask;
	union { bn_addsub_f f; PTR_SIZE_INT p; } u;
	static const BIGNUM _bignum_nist_p_256_sqr = {
		(BN_ULONG *)_nist_p_256_sqr,
		sizeof(_nist_p_256_sqr)/sizeof(_nist_p_256_sqr[0]),
		sizeof(_nist_p_256_sqr)/sizeof(_nist_p_256_sqr[0]),
		0,BN_FLG_STATIC_DATA };

	field = &_bignum_nist_p_256; /* just to make sure */

	if (BN_is_negative(a) || BN_ucmp(a,&_bignum_nist_p_256_sqr)>=0)
		return BN_nnmod(r, a, field, ctx);

	i = BN_ucmp(field, a);
	if (i == 0)
		{
		BN_zero(r);
		return 1;
		}
	else if (i > 0)
		/* a < p already: the reduction is a plain copy. */
		return (r == a)? 1 : (BN_copy(r ,a) != NULL);

	if (r != a)
		{
		if (!bn_wexpand(r, BN_NIST_256_TOP))
			return 0;
		r_d = r->d;
		nist_cp_bn(r_d, a_d, BN_NIST_256_TOP);
		}
	else
		r_d = a_d;

	/* buf <- upper half of a (zero-padded); r_d keeps the low 256 bits. */
	nist_cp_bn_0(buf.bn, a_d + BN_NIST_256_TOP, top - BN_NIST_256_TOP, BN_NIST_256_TOP);

#if defined(NIST_INT64)
	{
	NIST_INT64 acc;	/* accumulator */
	unsigned int *rp=(unsigned int *)r_d;
	const unsigned int *bp=(const unsigned int *)buf.ui;

	/* Column-wise signed accumulation of the folded terms
	 * (bp[i-8] is 32-bit word i of a, i in 8..15). */
	acc = rp[0];	acc += bp[8-8];
			acc += bp[9-8];
			acc -= bp[11-8];
			acc -= bp[12-8];
			acc -= bp[13-8];
			acc -= bp[14-8]; rp[0] = (unsigned int)acc; acc >>= 32;

	acc += rp[1];	acc += bp[9-8];
			acc += bp[10-8];
			acc -= bp[12-8];
			acc -= bp[13-8];
			acc -= bp[14-8];
			acc -= bp[15-8]; rp[1] = (unsigned int)acc; acc >>= 32;

	acc += rp[2];	acc += bp[10-8];
			acc += bp[11-8];
			acc -= bp[13-8];
			acc -= bp[14-8];
			acc -= bp[15-8]; rp[2] = (unsigned int)acc; acc >>= 32;

	acc += rp[3];	acc += bp[11-8];
			acc += bp[11-8];
			acc += bp[12-8];
			acc += bp[12-8];
			acc += bp[13-8];
			acc -= bp[15-8];
			acc -= bp[8-8];
			acc -= bp[9-8]; rp[3] = (unsigned int)acc; acc >>= 32;

	acc += rp[4];	acc += bp[12-8];
			acc += bp[12-8];
			acc += bp[13-8];
			acc += bp[13-8];
			acc += bp[14-8];
			acc -= bp[9-8];
			acc -= bp[10-8]; rp[4] = (unsigned int)acc; acc >>= 32;

	acc += rp[5];	acc += bp[13-8];
			acc += bp[13-8];
			acc += bp[14-8];
			acc += bp[14-8];
			acc += bp[15-8];
			acc -= bp[10-8];
			acc -= bp[11-8]; rp[5] = (unsigned int)acc; acc >>= 32;

	acc += rp[6];	acc += bp[14-8];
			acc += bp[14-8];
			acc += bp[15-8];
			acc += bp[15-8];
			acc += bp[14-8];
			acc += bp[13-8];
			acc -= bp[8-8];
			acc -= bp[9-8]; rp[6] = (unsigned int)acc; acc >>= 32;

	acc += rp[7];	acc += bp[15-8];
			acc += bp[15-8];
			acc += bp[15-8];
			acc += bp[8 -8];
			acc -= bp[10-8];
			acc -= bp[11-8];
			acc -= bp[12-8];
			acc -= bp[13-8]; rp[7] = (unsigned int)acc;

	carry = (int)(acc>>32);
	}
#else
	{
	BN_ULONG t_d[BN_NIST_256_TOP];

	/*S1*/
	nist_set_256(t_d, buf.bn, 15, 14, 13, 12, 11, 0, 0, 0);
	/*S2*/
	nist_set_256(c_d, buf.bn, 0, 15, 14, 13, 12, 0, 0, 0);
	carry = (int)bn_add_words(t_d, t_d, c_d, BN_NIST_256_TOP);
	/* left shift: (S1+S2)*2, carrying the shifted-out top bit */
	{
	register BN_ULONG *ap,t,c;
	ap = t_d;
	c=0;
	for (i = BN_NIST_256_TOP; i != 0; --i)
		{
		t= *ap;
		*(ap++)=((t<<1)|c)&BN_MASK2;
		c=(t & BN_TBIT)?1:0;
		}
	carry <<= 1;
	carry |= c;
	}
	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
	/*S3*/
	nist_set_256(t_d, buf.bn, 15, 14, 0, 0, 0, 10, 9, 8);
	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
	/*S4*/
	nist_set_256(t_d, buf.bn, 8, 13, 15, 14, 13, 11, 10, 9);
	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
	/*D1*/
	nist_set_256(t_d, buf.bn, 10, 8, 0, 0, 0, 13, 12, 11);
	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
	/*D2*/
	nist_set_256(t_d, buf.bn, 11, 9, 0, 0, 15, 14, 13, 12);
	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
	/*D3*/
	nist_set_256(t_d, buf.bn, 12, 0, 10, 9, 8, 15, 14, 13);
	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
	/*D4*/
	nist_set_256(t_d, buf.bn, 13, 0, 11, 10, 9, 0, 15, 14);
	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);

	}
#endif
	/* see BN_nist_mod_224 for explanation */
	u.f = bn_sub_words;
	if (carry > 0)
		carry = (int)bn_sub_words(r_d,r_d,_nist_p_256[carry-1],BN_NIST_256_TOP);
	else if (carry < 0)
		{
		carry = (int)bn_add_words(r_d,r_d,_nist_p_256[-carry-1],BN_NIST_256_TOP);
		mask = 0-(PTR_SIZE_INT)carry;
		u.p = ((PTR_SIZE_INT)bn_sub_words&mask) |
			((PTR_SIZE_INT)bn_add_words&~mask);
		}
	else
		carry = 1;

	/* branch-free final conditional subtraction of the modulus */
	mask = 0-(PTR_SIZE_INT)(*u.f)(c_d,r_d,_nist_p_256[0],BN_NIST_256_TOP);
	mask &= 0-(PTR_SIZE_INT)carry;
	res = (BN_ULONG *)(((PTR_SIZE_INT)c_d&~mask) |
		((PTR_SIZE_INT)r_d&mask));
	nist_cp_bn(r_d, res, BN_NIST_256_TOP);
	r->top = BN_NIST_256_TOP;
	bn_correct_top(r);

	return 1;
	}
816
817#define nist_set_384(to,from,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12) \
818 { \
819 bn_cp_32(to, 0, from, (a12) - 12) \
820 bn_cp_32(to, 1, from, (a11) - 12) \
821 bn_cp_32(to, 2, from, (a10) - 12) \
822 bn_cp_32(to, 3, from, (a9) - 12) \
823 bn_cp_32(to, 4, from, (a8) - 12) \
824 bn_cp_32(to, 5, from, (a7) - 12) \
825 bn_cp_32(to, 6, from, (a6) - 12) \
826 bn_cp_32(to, 7, from, (a5) - 12) \
827 bn_cp_32(to, 8, from, (a4) - 12) \
828 bn_cp_32(to, 9, from, (a3) - 12) \
829 bn_cp_32(to, 10, from, (a2) - 12) \
830 bn_cp_32(to, 11, from, (a1) - 12) \
831 }
832
/*
 * BN_nist_mod_384 - fast reduction r = a mod P-384.
 *
 * Fast path requires 0 <= a < p^2; otherwise falls back to BN_nnmod().
 * `field` is ignored and forced to the built-in P-384 constant.
 * Returns 1 on success, 0 on error.  Carry may be negative; u.f picks
 * the final conditional fix-up primitive (see BN_nist_mod_224).
 */
int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
	BN_CTX *ctx)
	{
	int i, top = a->top;
	int carry = 0;
	register BN_ULONG *r_d, *a_d = a->d;
	union {
		BN_ULONG bn[BN_NIST_384_TOP];
		unsigned int ui[BN_NIST_384_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
		} buf;
	BN_ULONG c_d[BN_NIST_384_TOP],
		*res;
	PTR_SIZE_INT mask;
	union { bn_addsub_f f; PTR_SIZE_INT p; } u;
	static const BIGNUM _bignum_nist_p_384_sqr = {
		(BN_ULONG *)_nist_p_384_sqr,
		sizeof(_nist_p_384_sqr)/sizeof(_nist_p_384_sqr[0]),
		sizeof(_nist_p_384_sqr)/sizeof(_nist_p_384_sqr[0]),
		0,BN_FLG_STATIC_DATA };


	field = &_bignum_nist_p_384; /* just to make sure */

	if (BN_is_negative(a) || BN_ucmp(a,&_bignum_nist_p_384_sqr)>=0)
		return BN_nnmod(r, a, field, ctx);

	i = BN_ucmp(field, a);
	if (i == 0)
		{
		BN_zero(r);
		return 1;
		}
	else if (i > 0)
		/* a < p already: the reduction is a plain copy. */
		return (r == a)? 1 : (BN_copy(r ,a) != NULL);

	if (r != a)
		{
		if (!bn_wexpand(r, BN_NIST_384_TOP))
			return 0;
		r_d = r->d;
		nist_cp_bn(r_d, a_d, BN_NIST_384_TOP);
		}
	else
		r_d = a_d;

	/* buf <- upper half of a (zero-padded); r_d keeps the low 384 bits. */
	nist_cp_bn_0(buf.bn, a_d + BN_NIST_384_TOP, top - BN_NIST_384_TOP, BN_NIST_384_TOP);

#if defined(NIST_INT64)
	{
	NIST_INT64 acc;	/* accumulator */
	unsigned int *rp=(unsigned int *)r_d;
	const unsigned int *bp=(const unsigned int *)buf.ui;

	/* Column-wise signed accumulation of the folded terms
	 * (bp[i-12] is 32-bit word i of a, i in 12..23). */
	acc = rp[0];	acc += bp[12-12];
			acc += bp[21-12];
			acc += bp[20-12];
			acc -= bp[23-12]; rp[0] = (unsigned int)acc; acc >>= 32;

	acc += rp[1];	acc += bp[13-12];
			acc += bp[22-12];
			acc += bp[23-12];
			acc -= bp[12-12];
			acc -= bp[20-12]; rp[1] = (unsigned int)acc; acc >>= 32;

	acc += rp[2];	acc += bp[14-12];
			acc += bp[23-12];
			acc -= bp[13-12];
			acc -= bp[21-12]; rp[2] = (unsigned int)acc; acc >>= 32;

	acc += rp[3];	acc += bp[15-12];
			acc += bp[12-12];
			acc += bp[20-12];
			acc += bp[21-12];
			acc -= bp[14-12];
			acc -= bp[22-12];
			acc -= bp[23-12]; rp[3] = (unsigned int)acc; acc >>= 32;

	acc += rp[4];	acc += bp[21-12];
			acc += bp[21-12];
			acc += bp[16-12];
			acc += bp[13-12];
			acc += bp[12-12];
			acc += bp[20-12];
			acc += bp[22-12];
			acc -= bp[15-12];
			acc -= bp[23-12];
			acc -= bp[23-12]; rp[4] = (unsigned int)acc; acc >>= 32;

	acc += rp[5];	acc += bp[22-12];
			acc += bp[22-12];
			acc += bp[17-12];
			acc += bp[14-12];
			acc += bp[13-12];
			acc += bp[21-12];
			acc += bp[23-12];
			acc -= bp[16-12]; rp[5] = (unsigned int)acc; acc >>= 32;

	acc += rp[6];	acc += bp[23-12];
			acc += bp[23-12];
			acc += bp[18-12];
			acc += bp[15-12];
			acc += bp[14-12];
			acc += bp[22-12];
			acc -= bp[17-12]; rp[6] = (unsigned int)acc; acc >>= 32;

	acc += rp[7];	acc += bp[19-12];
			acc += bp[16-12];
			acc += bp[15-12];
			acc += bp[23-12];
			acc -= bp[18-12]; rp[7] = (unsigned int)acc; acc >>= 32;

	acc += rp[8];	acc += bp[20-12];
			acc += bp[17-12];
			acc += bp[16-12];
			acc -= bp[19-12]; rp[8] = (unsigned int)acc; acc >>= 32;

	acc += rp[9];	acc += bp[21-12];
			acc += bp[18-12];
			acc += bp[17-12];
			acc -= bp[20-12]; rp[9] = (unsigned int)acc; acc >>= 32;

	acc += rp[10];	acc += bp[22-12];
			acc += bp[19-12];
			acc += bp[18-12];
			acc -= bp[21-12]; rp[10] = (unsigned int)acc; acc >>= 32;

	acc += rp[11];	acc += bp[23-12];
			acc += bp[20-12];
			acc += bp[19-12];
			acc -= bp[22-12]; rp[11] = (unsigned int)acc;

	carry = (int)(acc>>32);
	}
#else
	{
	BN_ULONG t_d[BN_NIST_384_TOP];

	/*S1*/
	nist_set_256(t_d, buf.bn, 0, 0, 0, 0, 0, 23-4, 22-4, 21-4);
	/* left shift: doubles the 96-bit S1 term in place */
	{
	register BN_ULONG *ap,t,c;
	ap = t_d;
	c=0;
	for (i = 3; i != 0; --i)
		{
		t= *ap;
		*(ap++)=((t<<1)|c)&BN_MASK2;
		c=(t & BN_TBIT)?1:0;
		}
	*ap=c;
	}
	/* S1 is added at 128-bit offset into r */
	carry = (int)bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2),
		t_d, BN_NIST_256_TOP);
	/*S2 */
	carry += (int)bn_add_words(r_d, r_d, buf.bn, BN_NIST_384_TOP);
	/*S3*/
	nist_set_384(t_d,buf.bn,20,19,18,17,16,15,14,13,12,23,22,21);
	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
	/*S4*/
	nist_set_384(t_d,buf.bn,19,18,17,16,15,14,13,12,20,0,23,0);
	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
	/*S5*/
	nist_set_384(t_d, buf.bn,0,0,0,0,23,22,21,20,0,0,0,0);
	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
	/*S6*/
	nist_set_384(t_d,buf.bn,0,0,0,0,0,0,23,22,21,0,0,20);
	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
	/*D1*/
	nist_set_384(t_d,buf.bn,22,21,20,19,18,17,16,15,14,13,12,23);
	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
	/*D2*/
	nist_set_384(t_d,buf.bn,0,0,0,0,0,0,0,23,22,21,20,0);
	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
	/*D3*/
	nist_set_384(t_d,buf.bn,0,0,0,0,0,0,0,23,23,0,0,0);
	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);

	}
#endif
	/* see BN_nist_mod_224 for explanation */
	u.f = bn_sub_words;
	if (carry > 0)
		carry = (int)bn_sub_words(r_d,r_d,_nist_p_384[carry-1],BN_NIST_384_TOP);
	else if (carry < 0)
		{
		carry = (int)bn_add_words(r_d,r_d,_nist_p_384[-carry-1],BN_NIST_384_TOP);
		mask = 0-(PTR_SIZE_INT)carry;
		u.p = ((PTR_SIZE_INT)bn_sub_words&mask) |
			((PTR_SIZE_INT)bn_add_words&~mask);
		}
	else
		carry = 1;

	/* branch-free final conditional subtraction of the modulus */
	mask = 0-(PTR_SIZE_INT)(*u.f)(c_d,r_d,_nist_p_384[0],BN_NIST_384_TOP);
	mask &= 0-(PTR_SIZE_INT)carry;
	res = (BN_ULONG *)(((PTR_SIZE_INT)c_d&~mask) |
		((PTR_SIZE_INT)r_d&mask));
	nist_cp_bn(r_d, res, BN_NIST_384_TOP);
	r->top = BN_NIST_384_TOP;
	bn_correct_top(r);

	return 1;
	}
1037
1038#define BN_NIST_521_RSHIFT (521%BN_BITS2)
1039#define BN_NIST_521_LSHIFT (BN_BITS2-BN_NIST_521_RSHIFT)
1040#define BN_NIST_521_TOP_MASK ((BN_ULONG)BN_MASK2>>BN_NIST_521_LSHIFT)
1041
/*
 * BN_nist_mod_521 - fast reduction r = a mod P-521.
 *
 * Since p = 2^521 - 1 (see _nist_p_521: all-ones up to bit 520),
 * a mod p is computed as (a low 521 bits) + (a >> 521), followed by
 * one conditional subtraction of p.  Fast path requires
 * 0 <= a < p^2; otherwise falls back to BN_nnmod().  `field` is
 * ignored and forced to the built-in constant.  Returns 1 on
 * success, 0 on error.
 */
int BN_nist_mod_521(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
	BN_CTX *ctx)
	{
	int top = a->top, i;
	BN_ULONG *r_d, *a_d = a->d,
		t_d[BN_NIST_521_TOP],
		val,tmp,*res;
	PTR_SIZE_INT mask;
	static const BIGNUM _bignum_nist_p_521_sqr = {
		(BN_ULONG *)_nist_p_521_sqr,
		sizeof(_nist_p_521_sqr)/sizeof(_nist_p_521_sqr[0]),
		sizeof(_nist_p_521_sqr)/sizeof(_nist_p_521_sqr[0]),
		0,BN_FLG_STATIC_DATA };

	field = &_bignum_nist_p_521; /* just to make sure */

	if (BN_is_negative(a) || BN_ucmp(a,&_bignum_nist_p_521_sqr)>=0)
		return BN_nnmod(r, a, field, ctx);

	i = BN_ucmp(field, a);
	if (i == 0)
		{
		BN_zero(r);
		return 1;
		}
	else if (i > 0)
		/* a < p already: the reduction is a plain copy. */
		return (r == a)? 1 : (BN_copy(r ,a) != NULL);

	if (r != a)
		{
		if (!bn_wexpand(r,BN_NIST_521_TOP))
			return 0;
		r_d = r->d;
		nist_cp_bn(r_d,a_d, BN_NIST_521_TOP);
		}
	else
		r_d = a_d;

	/* upper 521 bits, copy ... */
	nist_cp_bn_0(t_d,a_d + (BN_NIST_521_TOP-1), top - (BN_NIST_521_TOP-1),BN_NIST_521_TOP);
	/* ... and right shift (t_d <- a >> 521, word by word) */
	for (val=t_d[0],i=0; i<BN_NIST_521_TOP-1; i++)
		{
		tmp = val>>BN_NIST_521_RSHIFT;
		val = t_d[i+1];
		t_d[i] = (tmp | val<<BN_NIST_521_LSHIFT) & BN_MASK2;
		}
	t_d[i] = val>>BN_NIST_521_RSHIFT;
	/* lower 521 bits */
	r_d[i] &= BN_NIST_521_TOP_MASK;

	/* r = low + high; then branch-free conditional subtraction of p:
	 * if the subtraction borrows, keep r_d, else keep t_d. */
	bn_add_words(r_d,r_d,t_d,BN_NIST_521_TOP);
	mask = 0-(PTR_SIZE_INT)bn_sub_words(t_d,r_d,_nist_p_521,BN_NIST_521_TOP);
	res = (BN_ULONG *)(((PTR_SIZE_INT)t_d&~mask) |
		((PTR_SIZE_INT)r_d&mask));
	nist_cp_bn(r_d,res,BN_NIST_521_TOP);
	r->top = BN_NIST_521_TOP;
	bn_correct_top(r);

	return 1;
	}
diff --git a/src/lib/libcrypto/bn/bn_prime.c b/src/lib/libcrypto/bn/bn_prime.c
deleted file mode 100644
index 7b25979dd1..0000000000
--- a/src/lib/libcrypto/bn/bn_prime.c
+++ /dev/null
@@ -1,494 +0,0 @@
1/* crypto/bn/bn_prime.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58/* ====================================================================
59 * Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved.
60 *
61 * Redistribution and use in source and binary forms, with or without
62 * modification, are permitted provided that the following conditions
63 * are met:
64 *
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 *
68 * 2. Redistributions in binary form must reproduce the above copyright
69 * notice, this list of conditions and the following disclaimer in
70 * the documentation and/or other materials provided with the
71 * distribution.
72 *
73 * 3. All advertising materials mentioning features or use of this
74 * software must display the following acknowledgment:
75 * "This product includes software developed by the OpenSSL Project
76 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
77 *
78 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
79 * endorse or promote products derived from this software without
80 * prior written permission. For written permission, please contact
81 * openssl-core@openssl.org.
82 *
83 * 5. Products derived from this software may not be called "OpenSSL"
84 * nor may "OpenSSL" appear in their names without prior written
85 * permission of the OpenSSL Project.
86 *
87 * 6. Redistributions of any form whatsoever must retain the following
88 * acknowledgment:
89 * "This product includes software developed by the OpenSSL Project
90 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
91 *
92 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
93 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
95 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
96 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
97 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
98 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
99 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
101 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
103 * OF THE POSSIBILITY OF SUCH DAMAGE.
104 * ====================================================================
105 *
106 * This product includes cryptographic software written by Eric Young
107 * (eay@cryptsoft.com). This product includes software written by Tim
108 * Hudson (tjh@cryptsoft.com).
109 *
110 */
111
112#include <stdio.h>
113#include <time.h>
114#include "cryptlib.h"
115#include "bn_lcl.h"
116#include <openssl/rand.h>
117
118/* NB: these functions have been "upgraded", the deprecated versions (which are
119 * compatibility wrappers using these functions) are in bn_depr.c.
120 * - Geoff
121 */
122
123/* The quick sieve algorithm approach to weeding out primes is
124 * Philip Zimmermann's, as implemented in PGP. I have had a read of
125 * his comments and implemented my own version.
126 */
127#include "bn_prime.h"
128
129static int witness(BIGNUM *w, const BIGNUM *a, const BIGNUM *a1,
130 const BIGNUM *a1_odd, int k, BN_CTX *ctx, BN_MONT_CTX *mont);
131static int probable_prime(BIGNUM *rnd, int bits);
132static int probable_prime_dh(BIGNUM *rnd, int bits,
133 const BIGNUM *add, const BIGNUM *rem, BN_CTX *ctx);
134static int probable_prime_dh_safe(BIGNUM *rnd, int bits,
135 const BIGNUM *add, const BIGNUM *rem, BN_CTX *ctx);
136
137int BN_GENCB_call(BN_GENCB *cb, int a, int b)
138 {
139 /* No callback means continue */
140 if(!cb) return 1;
141 switch(cb->ver)
142 {
143 case 1:
144 /* Deprecated-style callbacks */
145 if(!cb->cb.cb_1)
146 return 1;
147 cb->cb.cb_1(a, b, cb->arg);
148 return 1;
149 case 2:
150 /* New-style callbacks */
151 return cb->cb.cb_2(a, b, cb);
152 default:
153 break;
154 }
155 /* Unrecognised callback type */
156 return 0;
157 }
158
159int BN_generate_prime_ex(BIGNUM *ret, int bits, int safe,
160 const BIGNUM *add, const BIGNUM *rem, BN_GENCB *cb)
161 {
162 BIGNUM *t;
163 int found=0;
164 int i,j,c1=0;
165 BN_CTX *ctx;
166 int checks = BN_prime_checks_for_size(bits);
167
168 ctx=BN_CTX_new();
169 if (ctx == NULL) goto err;
170 BN_CTX_start(ctx);
171 t = BN_CTX_get(ctx);
172 if(!t) goto err;
173loop:
174 /* make a random number and set the top and bottom bits */
175 if (add == NULL)
176 {
177 if (!probable_prime(ret,bits)) goto err;
178 }
179 else
180 {
181 if (safe)
182 {
183 if (!probable_prime_dh_safe(ret,bits,add,rem,ctx))
184 goto err;
185 }
186 else
187 {
188 if (!probable_prime_dh(ret,bits,add,rem,ctx))
189 goto err;
190 }
191 }
192 /* if (BN_mod_word(ret,(BN_ULONG)3) == 1) goto loop; */
193 if(!BN_GENCB_call(cb, 0, c1++))
194 /* aborted */
195 goto err;
196
197 if (!safe)
198 {
199 i=BN_is_prime_fasttest_ex(ret,checks,ctx,0,cb);
200 if (i == -1) goto err;
201 if (i == 0) goto loop;
202 }
203 else
204 {
205 /* for "safe prime" generation,
206 * check that (p-1)/2 is prime.
207 * Since a prime is odd, We just
208 * need to divide by 2 */
209 if (!BN_rshift1(t,ret)) goto err;
210
211 for (i=0; i<checks; i++)
212 {
213 j=BN_is_prime_fasttest_ex(ret,1,ctx,0,cb);
214 if (j == -1) goto err;
215 if (j == 0) goto loop;
216
217 j=BN_is_prime_fasttest_ex(t,1,ctx,0,cb);
218 if (j == -1) goto err;
219 if (j == 0) goto loop;
220
221 if(!BN_GENCB_call(cb, 2, c1-1))
222 goto err;
223 /* We have a safe prime test pass */
224 }
225 }
226 /* we have a prime :-) */
227 found = 1;
228err:
229 if (ctx != NULL)
230 {
231 BN_CTX_end(ctx);
232 BN_CTX_free(ctx);
233 }
234 bn_check_top(ret);
235 return found;
236 }
237
238int BN_is_prime_ex(const BIGNUM *a, int checks, BN_CTX *ctx_passed, BN_GENCB *cb)
239 {
240 return BN_is_prime_fasttest_ex(a, checks, ctx_passed, 0, cb);
241 }
242
243int BN_is_prime_fasttest_ex(const BIGNUM *a, int checks, BN_CTX *ctx_passed,
244		int do_trial_division, BN_GENCB *cb)
245	{
	/*
	 * Miller-Rabin probabilistic primality test, optionally preceded by
	 * trial division against the small-prime table.  Returns 1 if |a| is
	 * probably prime, 0 if it is composite, -1 on error.  'checks'
	 * rounds are performed (BN_prime_checks selects a count from the bit
	 * length of a); cb is invoked between rounds and may abort the test.
	 * If ctx_passed is NULL a temporary BN_CTX is allocated and freed.
	 */
246	int i, j, ret = -1;
247	int k;
248	BN_CTX *ctx = NULL;
249	BIGNUM *A1, *A1_odd, *check; /* taken from ctx */
250	BN_MONT_CTX *mont = NULL;
251	const BIGNUM *A = NULL;
252
	/* Values <= 1 are never prime. */
253	if (BN_cmp(a, BN_value_one()) <= 0)
254		return 0;
255
256	if (checks == BN_prime_checks)
257		checks = BN_prime_checks_for_size(BN_num_bits(a));
258
259	/* first look for small factors */
260	if (!BN_is_odd(a))
261		/* a is even => a is prime if and only if a == 2 */
262		return BN_is_word(a, 2);
263	if (do_trial_division)
264		{
265		for (i = 1; i < NUMPRIMES; i++)
266			if (BN_mod_word(a, primes[i]) == 0)
267				return 0;
268		if(!BN_GENCB_call(cb, 1, -1))
269			goto err;
270		}
271
	/* Use the caller's context if given, otherwise own one; the err:
	 * path only frees what we allocated ourselves. */
272	if (ctx_passed != NULL)
273		ctx = ctx_passed;
274	else
275		if ((ctx=BN_CTX_new()) == NULL)
276			goto err;
277	BN_CTX_start(ctx);
278
279	/* A := abs(a) */
280	if (a->neg)
281		{
282		BIGNUM *t;
283		if ((t = BN_CTX_get(ctx)) == NULL) goto err;
284		BN_copy(t, a);
285		t->neg = 0;
286		A = t;
287		}
288	else
289		A = a;
290	A1 = BN_CTX_get(ctx);
291	A1_odd = BN_CTX_get(ctx);
292	check = BN_CTX_get(ctx);
	/* BN_CTX_get() returns NULL once any get fails, so checking only
	 * the last one covers all three. */
293	if (check == NULL) goto err;
294
295	/* compute A1 := A - 1 */
296	if (!BN_copy(A1, A))
297		goto err;
298	if (!BN_sub_word(A1, 1))
299		goto err;
	/* A == 1: not prime. */
300	if (BN_is_zero(A1))
301		{
302		ret = 0;
303		goto err;
304		}
305
306	/* write  A1  as  A1_odd * 2^k */
	/* A1 is even here (A is odd and > 2), so k >= 1 always holds. */
307	k = 1;
308	while (!BN_is_bit_set(A1, k))
309		k++;
310	if (!BN_rshift(A1_odd, A1, k))
311		goto err;
312
313	/* Montgomery setup for computations mod A */
314	mont = BN_MONT_CTX_new();
315	if (mont == NULL)
316		goto err;
317	if (!BN_MONT_CTX_set(mont, A, ctx))
318		goto err;
319
	/* 'checks' Miller-Rabin rounds with random bases in [1, A-1]. */
320	for (i = 0; i < checks; i++)
321		{
322		if (!BN_pseudo_rand_range(check, A1))
323			goto err;
324		if (!BN_add_word(check, 1))
325			goto err;
326		/* now 1 <= check < A */
327
328		j = witness(check, A, A1, A1_odd, k, ctx, mont);
329		if (j == -1) goto err;
330		if (j)
331			{
			/* A witness was found: A is definitely composite. */
332			ret=0;
333			goto err;
334			}
335		if(!BN_GENCB_call(cb, 1, i))
336			goto err;
337		}
	/* No witness in 'checks' rounds: probably prime. */
338	ret=1;
339err:
340	if (ctx != NULL)
341		{
342		BN_CTX_end(ctx);
343		if (ctx_passed == NULL)
344			BN_CTX_free(ctx);
345		}
346	if (mont != NULL)
347		BN_MONT_CTX_free(mont);
348
349	return(ret);
350	}
351
352static int witness(BIGNUM *w, const BIGNUM *a, const BIGNUM *a1,
353 const BIGNUM *a1_odd, int k, BN_CTX *ctx, BN_MONT_CTX *mont)
354 {
355 if (!BN_mod_exp_mont(w, w, a1_odd, a, ctx, mont)) /* w := w^a1_odd mod a */
356 return -1;
357 if (BN_is_one(w))
358 return 0; /* probably prime */
359 if (BN_cmp(w, a1) == 0)
360 return 0; /* w == -1 (mod a), 'a' is probably prime */
361 while (--k)
362 {
363 if (!BN_mod_mul(w, w, w, a, ctx)) /* w := w^2 mod a */
364 return -1;
365 if (BN_is_one(w))
366 return 1; /* 'a' is composite, otherwise a previous 'w' would
367 * have been == -1 (mod 'a') */
368 if (BN_cmp(w, a1) == 0)
369 return 0; /* w == -1 (mod a), 'a' is probably prime */
370 }
371 /* If we get here, 'w' is the (a-1)/2-th power of the original 'w',
372 * and it is neither -1 nor +1 -- so 'a' cannot be prime */
373 bn_check_top(w);
374 return 1;
375 }
376
377static int probable_prime(BIGNUM *rnd, int bits)
378 {
379 int i;
380 prime_t mods[NUMPRIMES];
381 BN_ULONG delta,maxdelta;
382
383again:
384 if (!BN_rand(rnd,bits,1,1)) return(0);
385 /* we now have a random number 'rand' to test. */
386 for (i=1; i<NUMPRIMES; i++)
387 mods[i]=(prime_t)BN_mod_word(rnd,(BN_ULONG)primes[i]);
388 maxdelta=BN_MASK2 - primes[NUMPRIMES-1];
389 delta=0;
390 loop: for (i=1; i<NUMPRIMES; i++)
391 {
392 /* check that rnd is not a prime and also
393 * that gcd(rnd-1,primes) == 1 (except for 2) */
394 if (((mods[i]+delta)%primes[i]) <= 1)
395 {
396 delta+=2;
397 if (delta > maxdelta) goto again;
398 goto loop;
399 }
400 }
401 if (!BN_add_word(rnd,delta)) return(0);
402 bn_check_top(rnd);
403 return(1);
404 }
405
406static int probable_prime_dh(BIGNUM *rnd, int bits,
407 const BIGNUM *add, const BIGNUM *rem, BN_CTX *ctx)
408 {
409 int i,ret=0;
410 BIGNUM *t1;
411
412 BN_CTX_start(ctx);
413 if ((t1 = BN_CTX_get(ctx)) == NULL) goto err;
414
415 if (!BN_rand(rnd,bits,0,1)) goto err;
416
417 /* we need ((rnd-rem) % add) == 0 */
418
419 if (!BN_mod(t1,rnd,add,ctx)) goto err;
420 if (!BN_sub(rnd,rnd,t1)) goto err;
421 if (rem == NULL)
422 { if (!BN_add_word(rnd,1)) goto err; }
423 else
424 { if (!BN_add(rnd,rnd,rem)) goto err; }
425
426 /* we now have a random number 'rand' to test. */
427
428 loop: for (i=1; i<NUMPRIMES; i++)
429 {
430 /* check that rnd is a prime */
431 if (BN_mod_word(rnd,(BN_ULONG)primes[i]) <= 1)
432 {
433 if (!BN_add(rnd,rnd,add)) goto err;
434 goto loop;
435 }
436 }
437 ret=1;
438err:
439 BN_CTX_end(ctx);
440 bn_check_top(rnd);
441 return(ret);
442 }
443
444static int probable_prime_dh_safe(BIGNUM *p, int bits, const BIGNUM *padd,
445 const BIGNUM *rem, BN_CTX *ctx)
446 {
447 int i,ret=0;
448 BIGNUM *t1,*qadd,*q;
449
450 bits--;
451 BN_CTX_start(ctx);
452 t1 = BN_CTX_get(ctx);
453 q = BN_CTX_get(ctx);
454 qadd = BN_CTX_get(ctx);
455 if (qadd == NULL) goto err;
456
457 if (!BN_rshift1(qadd,padd)) goto err;
458
459 if (!BN_rand(q,bits,0,1)) goto err;
460
461 /* we need ((rnd-rem) % add) == 0 */
462 if (!BN_mod(t1,q,qadd,ctx)) goto err;
463 if (!BN_sub(q,q,t1)) goto err;
464 if (rem == NULL)
465 { if (!BN_add_word(q,1)) goto err; }
466 else
467 {
468 if (!BN_rshift1(t1,rem)) goto err;
469 if (!BN_add(q,q,t1)) goto err;
470 }
471
472 /* we now have a random number 'rand' to test. */
473 if (!BN_lshift1(p,q)) goto err;
474 if (!BN_add_word(p,1)) goto err;
475
476 loop: for (i=1; i<NUMPRIMES; i++)
477 {
478 /* check that p and q are prime */
479 /* check that for p and q
480 * gcd(p-1,primes) == 1 (except for 2) */
481 if ( (BN_mod_word(p,(BN_ULONG)primes[i]) == 0) ||
482 (BN_mod_word(q,(BN_ULONG)primes[i]) == 0))
483 {
484 if (!BN_add(p,p,padd)) goto err;
485 if (!BN_add(q,q,qadd)) goto err;
486 goto loop;
487 }
488 }
489 ret=1;
490err:
491 BN_CTX_end(ctx);
492 bn_check_top(p);
493 return(ret);
494 }
diff --git a/src/lib/libcrypto/bn/bn_prime.h b/src/lib/libcrypto/bn/bn_prime.h
deleted file mode 100644
index 51d2194feb..0000000000
--- a/src/lib/libcrypto/bn/bn_prime.h
+++ /dev/null
@@ -1,327 +0,0 @@
1/* Auto generated by bn_prime.pl */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
/* Size and element type of the small-prime sieve table below.  The full
 * table holds the first 2048 primes, whose values exceed 255 and so need
 * 16-bit entries; the EIGHT_BIT build keeps only the 54 primes below 256
 * so each entry fits in a byte. */
59#ifndef EIGHT_BIT
60#define NUMPRIMES 2048
61typedef unsigned short prime_t;
62#else
63#define NUMPRIMES 54
64typedef unsigned char prime_t;
65#endif
66static const prime_t primes[NUMPRIMES]=
67 {
68 2, 3, 5, 7, 11, 13, 17, 19,
69 23, 29, 31, 37, 41, 43, 47, 53,
70 59, 61, 67, 71, 73, 79, 83, 89,
71 97, 101, 103, 107, 109, 113, 127, 131,
72 137, 139, 149, 151, 157, 163, 167, 173,
73 179, 181, 191, 193, 197, 199, 211, 223,
74 227, 229, 233, 239, 241, 251,
75#ifndef EIGHT_BIT
76 257, 263,
77 269, 271, 277, 281, 283, 293, 307, 311,
78 313, 317, 331, 337, 347, 349, 353, 359,
79 367, 373, 379, 383, 389, 397, 401, 409,
80 419, 421, 431, 433, 439, 443, 449, 457,
81 461, 463, 467, 479, 487, 491, 499, 503,
82 509, 521, 523, 541, 547, 557, 563, 569,
83 571, 577, 587, 593, 599, 601, 607, 613,
84 617, 619, 631, 641, 643, 647, 653, 659,
85 661, 673, 677, 683, 691, 701, 709, 719,
86 727, 733, 739, 743, 751, 757, 761, 769,
87 773, 787, 797, 809, 811, 821, 823, 827,
88 829, 839, 853, 857, 859, 863, 877, 881,
89 883, 887, 907, 911, 919, 929, 937, 941,
90 947, 953, 967, 971, 977, 983, 991, 997,
91 1009,1013,1019,1021,1031,1033,1039,1049,
92 1051,1061,1063,1069,1087,1091,1093,1097,
93 1103,1109,1117,1123,1129,1151,1153,1163,
94 1171,1181,1187,1193,1201,1213,1217,1223,
95 1229,1231,1237,1249,1259,1277,1279,1283,
96 1289,1291,1297,1301,1303,1307,1319,1321,
97 1327,1361,1367,1373,1381,1399,1409,1423,
98 1427,1429,1433,1439,1447,1451,1453,1459,
99 1471,1481,1483,1487,1489,1493,1499,1511,
100 1523,1531,1543,1549,1553,1559,1567,1571,
101 1579,1583,1597,1601,1607,1609,1613,1619,
102 1621,1627,1637,1657,1663,1667,1669,1693,
103 1697,1699,1709,1721,1723,1733,1741,1747,
104 1753,1759,1777,1783,1787,1789,1801,1811,
105 1823,1831,1847,1861,1867,1871,1873,1877,
106 1879,1889,1901,1907,1913,1931,1933,1949,
107 1951,1973,1979,1987,1993,1997,1999,2003,
108 2011,2017,2027,2029,2039,2053,2063,2069,
109 2081,2083,2087,2089,2099,2111,2113,2129,
110 2131,2137,2141,2143,2153,2161,2179,2203,
111 2207,2213,2221,2237,2239,2243,2251,2267,
112 2269,2273,2281,2287,2293,2297,2309,2311,
113 2333,2339,2341,2347,2351,2357,2371,2377,
114 2381,2383,2389,2393,2399,2411,2417,2423,
115 2437,2441,2447,2459,2467,2473,2477,2503,
116 2521,2531,2539,2543,2549,2551,2557,2579,
117 2591,2593,2609,2617,2621,2633,2647,2657,
118 2659,2663,2671,2677,2683,2687,2689,2693,
119 2699,2707,2711,2713,2719,2729,2731,2741,
120 2749,2753,2767,2777,2789,2791,2797,2801,
121 2803,2819,2833,2837,2843,2851,2857,2861,
122 2879,2887,2897,2903,2909,2917,2927,2939,
123 2953,2957,2963,2969,2971,2999,3001,3011,
124 3019,3023,3037,3041,3049,3061,3067,3079,
125 3083,3089,3109,3119,3121,3137,3163,3167,
126 3169,3181,3187,3191,3203,3209,3217,3221,
127 3229,3251,3253,3257,3259,3271,3299,3301,
128 3307,3313,3319,3323,3329,3331,3343,3347,
129 3359,3361,3371,3373,3389,3391,3407,3413,
130 3433,3449,3457,3461,3463,3467,3469,3491,
131 3499,3511,3517,3527,3529,3533,3539,3541,
132 3547,3557,3559,3571,3581,3583,3593,3607,
133 3613,3617,3623,3631,3637,3643,3659,3671,
134 3673,3677,3691,3697,3701,3709,3719,3727,
135 3733,3739,3761,3767,3769,3779,3793,3797,
136 3803,3821,3823,3833,3847,3851,3853,3863,
137 3877,3881,3889,3907,3911,3917,3919,3923,
138 3929,3931,3943,3947,3967,3989,4001,4003,
139 4007,4013,4019,4021,4027,4049,4051,4057,
140 4073,4079,4091,4093,4099,4111,4127,4129,
141 4133,4139,4153,4157,4159,4177,4201,4211,
142 4217,4219,4229,4231,4241,4243,4253,4259,
143 4261,4271,4273,4283,4289,4297,4327,4337,
144 4339,4349,4357,4363,4373,4391,4397,4409,
145 4421,4423,4441,4447,4451,4457,4463,4481,
146 4483,4493,4507,4513,4517,4519,4523,4547,
147 4549,4561,4567,4583,4591,4597,4603,4621,
148 4637,4639,4643,4649,4651,4657,4663,4673,
149 4679,4691,4703,4721,4723,4729,4733,4751,
150 4759,4783,4787,4789,4793,4799,4801,4813,
151 4817,4831,4861,4871,4877,4889,4903,4909,
152 4919,4931,4933,4937,4943,4951,4957,4967,
153 4969,4973,4987,4993,4999,5003,5009,5011,
154 5021,5023,5039,5051,5059,5077,5081,5087,
155 5099,5101,5107,5113,5119,5147,5153,5167,
156 5171,5179,5189,5197,5209,5227,5231,5233,
157 5237,5261,5273,5279,5281,5297,5303,5309,
158 5323,5333,5347,5351,5381,5387,5393,5399,
159 5407,5413,5417,5419,5431,5437,5441,5443,
160 5449,5471,5477,5479,5483,5501,5503,5507,
161 5519,5521,5527,5531,5557,5563,5569,5573,
162 5581,5591,5623,5639,5641,5647,5651,5653,
163 5657,5659,5669,5683,5689,5693,5701,5711,
164 5717,5737,5741,5743,5749,5779,5783,5791,
165 5801,5807,5813,5821,5827,5839,5843,5849,
166 5851,5857,5861,5867,5869,5879,5881,5897,
167 5903,5923,5927,5939,5953,5981,5987,6007,
168 6011,6029,6037,6043,6047,6053,6067,6073,
169 6079,6089,6091,6101,6113,6121,6131,6133,
170 6143,6151,6163,6173,6197,6199,6203,6211,
171 6217,6221,6229,6247,6257,6263,6269,6271,
172 6277,6287,6299,6301,6311,6317,6323,6329,
173 6337,6343,6353,6359,6361,6367,6373,6379,
174 6389,6397,6421,6427,6449,6451,6469,6473,
175 6481,6491,6521,6529,6547,6551,6553,6563,
176 6569,6571,6577,6581,6599,6607,6619,6637,
177 6653,6659,6661,6673,6679,6689,6691,6701,
178 6703,6709,6719,6733,6737,6761,6763,6779,
179 6781,6791,6793,6803,6823,6827,6829,6833,
180 6841,6857,6863,6869,6871,6883,6899,6907,
181 6911,6917,6947,6949,6959,6961,6967,6971,
182 6977,6983,6991,6997,7001,7013,7019,7027,
183 7039,7043,7057,7069,7079,7103,7109,7121,
184 7127,7129,7151,7159,7177,7187,7193,7207,
185 7211,7213,7219,7229,7237,7243,7247,7253,
186 7283,7297,7307,7309,7321,7331,7333,7349,
187 7351,7369,7393,7411,7417,7433,7451,7457,
188 7459,7477,7481,7487,7489,7499,7507,7517,
189 7523,7529,7537,7541,7547,7549,7559,7561,
190 7573,7577,7583,7589,7591,7603,7607,7621,
191 7639,7643,7649,7669,7673,7681,7687,7691,
192 7699,7703,7717,7723,7727,7741,7753,7757,
193 7759,7789,7793,7817,7823,7829,7841,7853,
194 7867,7873,7877,7879,7883,7901,7907,7919,
195 7927,7933,7937,7949,7951,7963,7993,8009,
196 8011,8017,8039,8053,8059,8069,8081,8087,
197 8089,8093,8101,8111,8117,8123,8147,8161,
198 8167,8171,8179,8191,8209,8219,8221,8231,
199 8233,8237,8243,8263,8269,8273,8287,8291,
200 8293,8297,8311,8317,8329,8353,8363,8369,
201 8377,8387,8389,8419,8423,8429,8431,8443,
202 8447,8461,8467,8501,8513,8521,8527,8537,
203 8539,8543,8563,8573,8581,8597,8599,8609,
204 8623,8627,8629,8641,8647,8663,8669,8677,
205 8681,8689,8693,8699,8707,8713,8719,8731,
206 8737,8741,8747,8753,8761,8779,8783,8803,
207 8807,8819,8821,8831,8837,8839,8849,8861,
208 8863,8867,8887,8893,8923,8929,8933,8941,
209 8951,8963,8969,8971,8999,9001,9007,9011,
210 9013,9029,9041,9043,9049,9059,9067,9091,
211 9103,9109,9127,9133,9137,9151,9157,9161,
212 9173,9181,9187,9199,9203,9209,9221,9227,
213 9239,9241,9257,9277,9281,9283,9293,9311,
214 9319,9323,9337,9341,9343,9349,9371,9377,
215 9391,9397,9403,9413,9419,9421,9431,9433,
216 9437,9439,9461,9463,9467,9473,9479,9491,
217 9497,9511,9521,9533,9539,9547,9551,9587,
218 9601,9613,9619,9623,9629,9631,9643,9649,
219 9661,9677,9679,9689,9697,9719,9721,9733,
220 9739,9743,9749,9767,9769,9781,9787,9791,
221 9803,9811,9817,9829,9833,9839,9851,9857,
222 9859,9871,9883,9887,9901,9907,9923,9929,
223 9931,9941,9949,9967,9973,10007,10009,10037,
224 10039,10061,10067,10069,10079,10091,10093,10099,
225 10103,10111,10133,10139,10141,10151,10159,10163,
226 10169,10177,10181,10193,10211,10223,10243,10247,
227 10253,10259,10267,10271,10273,10289,10301,10303,
228 10313,10321,10331,10333,10337,10343,10357,10369,
229 10391,10399,10427,10429,10433,10453,10457,10459,
230 10463,10477,10487,10499,10501,10513,10529,10531,
231 10559,10567,10589,10597,10601,10607,10613,10627,
232 10631,10639,10651,10657,10663,10667,10687,10691,
233 10709,10711,10723,10729,10733,10739,10753,10771,
234 10781,10789,10799,10831,10837,10847,10853,10859,
235 10861,10867,10883,10889,10891,10903,10909,10937,
236 10939,10949,10957,10973,10979,10987,10993,11003,
237 11027,11047,11057,11059,11069,11071,11083,11087,
238 11093,11113,11117,11119,11131,11149,11159,11161,
239 11171,11173,11177,11197,11213,11239,11243,11251,
240 11257,11261,11273,11279,11287,11299,11311,11317,
241 11321,11329,11351,11353,11369,11383,11393,11399,
242 11411,11423,11437,11443,11447,11467,11471,11483,
243 11489,11491,11497,11503,11519,11527,11549,11551,
244 11579,11587,11593,11597,11617,11621,11633,11657,
245 11677,11681,11689,11699,11701,11717,11719,11731,
246 11743,11777,11779,11783,11789,11801,11807,11813,
247 11821,11827,11831,11833,11839,11863,11867,11887,
248 11897,11903,11909,11923,11927,11933,11939,11941,
249 11953,11959,11969,11971,11981,11987,12007,12011,
250 12037,12041,12043,12049,12071,12073,12097,12101,
251 12107,12109,12113,12119,12143,12149,12157,12161,
252 12163,12197,12203,12211,12227,12239,12241,12251,
253 12253,12263,12269,12277,12281,12289,12301,12323,
254 12329,12343,12347,12373,12377,12379,12391,12401,
255 12409,12413,12421,12433,12437,12451,12457,12473,
256 12479,12487,12491,12497,12503,12511,12517,12527,
257 12539,12541,12547,12553,12569,12577,12583,12589,
258 12601,12611,12613,12619,12637,12641,12647,12653,
259 12659,12671,12689,12697,12703,12713,12721,12739,
260 12743,12757,12763,12781,12791,12799,12809,12821,
261 12823,12829,12841,12853,12889,12893,12899,12907,
262 12911,12917,12919,12923,12941,12953,12959,12967,
263 12973,12979,12983,13001,13003,13007,13009,13033,
264 13037,13043,13049,13063,13093,13099,13103,13109,
265 13121,13127,13147,13151,13159,13163,13171,13177,
266 13183,13187,13217,13219,13229,13241,13249,13259,
267 13267,13291,13297,13309,13313,13327,13331,13337,
268 13339,13367,13381,13397,13399,13411,13417,13421,
269 13441,13451,13457,13463,13469,13477,13487,13499,
270 13513,13523,13537,13553,13567,13577,13591,13597,
271 13613,13619,13627,13633,13649,13669,13679,13681,
272 13687,13691,13693,13697,13709,13711,13721,13723,
273 13729,13751,13757,13759,13763,13781,13789,13799,
274 13807,13829,13831,13841,13859,13873,13877,13879,
275 13883,13901,13903,13907,13913,13921,13931,13933,
276 13963,13967,13997,13999,14009,14011,14029,14033,
277 14051,14057,14071,14081,14083,14087,14107,14143,
278 14149,14153,14159,14173,14177,14197,14207,14221,
279 14243,14249,14251,14281,14293,14303,14321,14323,
280 14327,14341,14347,14369,14387,14389,14401,14407,
281 14411,14419,14423,14431,14437,14447,14449,14461,
282 14479,14489,14503,14519,14533,14537,14543,14549,
283 14551,14557,14561,14563,14591,14593,14621,14627,
284 14629,14633,14639,14653,14657,14669,14683,14699,
285 14713,14717,14723,14731,14737,14741,14747,14753,
286 14759,14767,14771,14779,14783,14797,14813,14821,
287 14827,14831,14843,14851,14867,14869,14879,14887,
288 14891,14897,14923,14929,14939,14947,14951,14957,
289 14969,14983,15013,15017,15031,15053,15061,15073,
290 15077,15083,15091,15101,15107,15121,15131,15137,
291 15139,15149,15161,15173,15187,15193,15199,15217,
292 15227,15233,15241,15259,15263,15269,15271,15277,
293 15287,15289,15299,15307,15313,15319,15329,15331,
294 15349,15359,15361,15373,15377,15383,15391,15401,
295 15413,15427,15439,15443,15451,15461,15467,15473,
296 15493,15497,15511,15527,15541,15551,15559,15569,
297 15581,15583,15601,15607,15619,15629,15641,15643,
298 15647,15649,15661,15667,15671,15679,15683,15727,
299 15731,15733,15737,15739,15749,15761,15767,15773,
300 15787,15791,15797,15803,15809,15817,15823,15859,
301 15877,15881,15887,15889,15901,15907,15913,15919,
302 15923,15937,15959,15971,15973,15991,16001,16007,
303 16033,16057,16061,16063,16067,16069,16073,16087,
304 16091,16097,16103,16111,16127,16139,16141,16183,
305 16187,16189,16193,16217,16223,16229,16231,16249,
306 16253,16267,16273,16301,16319,16333,16339,16349,
307 16361,16363,16369,16381,16411,16417,16421,16427,
308 16433,16447,16451,16453,16477,16481,16487,16493,
309 16519,16529,16547,16553,16561,16567,16573,16603,
310 16607,16619,16631,16633,16649,16651,16657,16661,
311 16673,16691,16693,16699,16703,16729,16741,16747,
312 16759,16763,16787,16811,16823,16829,16831,16843,
313 16871,16879,16883,16889,16901,16903,16921,16927,
314 16931,16937,16943,16963,16979,16981,16987,16993,
315 17011,17021,17027,17029,17033,17041,17047,17053,
316 17077,17093,17099,17107,17117,17123,17137,17159,
317 17167,17183,17189,17191,17203,17207,17209,17231,
318 17239,17257,17291,17293,17299,17317,17321,17327,
319 17333,17341,17351,17359,17377,17383,17387,17389,
320 17393,17401,17417,17419,17431,17443,17449,17467,
321 17471,17477,17483,17489,17491,17497,17509,17519,
322 17539,17551,17569,17573,17579,17581,17597,17599,
323 17609,17623,17627,17657,17659,17669,17681,17683,
324 17707,17713,17729,17737,17747,17749,17761,17783,
325 17789,17791,17807,17827,17837,17839,17851,17863,
326#endif
327 };
diff --git a/src/lib/libcrypto/bn/bn_prime.pl b/src/lib/libcrypto/bn/bn_prime.pl
deleted file mode 100644
index 3fafb6f3e9..0000000000
--- a/src/lib/libcrypto/bn/bn_prime.pl
+++ /dev/null
@@ -1,119 +0,0 @@
1#!/usr/local/bin/perl
2# bn_prime.pl
3
4$num=2048;
5$num=$ARGV[0] if ($#ARGV >= 0);
6
7push(@primes,2);
8$p=1;
9loop: while ($#primes < $num-1)
10 {
11 $p+=2;
12 $s=int(sqrt($p));
13
14 for ($i=0; defined($primes[$i]) && $primes[$i]<=$s; $i++)
15 {
16 next loop if (($p%$primes[$i]) == 0);
17 }
18 push(@primes,$p);
19 }
20
21# print <<"EOF";
22# /* Auto generated by bn_prime.pl */
23# /* Copyright (C) 1995-1997 Eric Young (eay\@mincom.oz.au).
24# * All rights reserved.
25# * Copyright remains Eric Young's, and as such any Copyright notices in
26# * the code are not to be removed.
27# * See the COPYRIGHT file in the SSLeay distribution for more details.
28# */
29#
30# EOF
31
32print <<\EOF;
33/* Auto generated by bn_prime.pl */
34/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
35 * All rights reserved.
36 *
37 * This package is an SSL implementation written
38 * by Eric Young (eay@cryptsoft.com).
39 * The implementation was written so as to conform with Netscapes SSL.
40 *
41 * This library is free for commercial and non-commercial use as long as
42 * the following conditions are aheared to. The following conditions
43 * apply to all code found in this distribution, be it the RC4, RSA,
44 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
45 * included with this distribution is covered by the same copyright terms
46 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
47 *
48 * Copyright remains Eric Young's, and as such any Copyright notices in
49 * the code are not to be removed.
50 * If this package is used in a product, Eric Young should be given attribution
51 * as the author of the parts of the library used.
52 * This can be in the form of a textual message at program startup or
53 * in documentation (online or textual) provided with the package.
54 *
55 * Redistribution and use in source and binary forms, with or without
56 * modification, are permitted provided that the following conditions
57 * are met:
58 * 1. Redistributions of source code must retain the copyright
59 * notice, this list of conditions and the following disclaimer.
60 * 2. Redistributions in binary form must reproduce the above copyright
61 * notice, this list of conditions and the following disclaimer in the
62 * documentation and/or other materials provided with the distribution.
63 * 3. All advertising materials mentioning features or use of this software
64 * must display the following acknowledgement:
65 * "This product includes cryptographic software written by
66 * Eric Young (eay@cryptsoft.com)"
67 * The word 'cryptographic' can be left out if the rouines from the library
68 * being used are not cryptographic related :-).
69 * 4. If you include any Windows specific code (or a derivative thereof) from
70 * the apps directory (application code) you must include an acknowledgement:
71 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
72 *
73 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
74 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
75 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
76 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
77 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
78 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
79 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
80 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
81 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
82 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
83 * SUCH DAMAGE.
84 *
85 * The licence and distribution terms for any publically available version or
86 * derivative of this code cannot be changed. i.e. this code cannot simply be
87 * copied and put under another distribution licence
88 * [including the GNU Public Licence.]
89 */
90
91EOF
92
93for ($i=0; $i <= $#primes; $i++)
94 {
95 if ($primes[$i] > 256)
96 {
97 $eight=$i;
98 last;
99 }
100 }
101
102printf "#ifndef EIGHT_BIT\n";
103printf "#define NUMPRIMES %d\n",$num;
104printf "typedef unsigned short prime_t;\n";
105printf "#else\n";
106printf "#define NUMPRIMES %d\n",$eight;
107printf "typedef unsigned char prime_t;\n";
108printf "#endif\n";
109print "static const prime_t primes[NUMPRIMES]=\n\t{\n\t";
110$init=0;
111for ($i=0; $i <= $#primes; $i++)
112 {
113 printf "\n#ifndef EIGHT_BIT\n\t" if ($primes[$i] > 256) && !($init++);
114 printf("\n\t") if (($i%8) == 0) && ($i != 0);
115 printf("%4d,",$primes[$i]);
116 }
117print "\n#endif\n\t};\n";
118
119
diff --git a/src/lib/libcrypto/bn/bn_print.c b/src/lib/libcrypto/bn/bn_print.c
deleted file mode 100644
index 1743b6a7e2..0000000000
--- a/src/lib/libcrypto/bn/bn_print.c
+++ /dev/null
@@ -1,378 +0,0 @@
1/* crypto/bn/bn_print.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60#include <ctype.h>
61#include "cryptlib.h"
62#include <openssl/buffer.h>
63#include "bn_lcl.h"
64
65static const char Hex[]="0123456789ABCDEF";
66
67/* Must 'OPENSSL_free' the returned data */
68char *BN_bn2hex(const BIGNUM *a)
69 {
70 int i,j,v,z=0;
71 char *buf;
72 char *p;
73
74 buf=(char *)OPENSSL_malloc(a->top*BN_BYTES*2+2);
75 if (buf == NULL)
76 {
77 BNerr(BN_F_BN_BN2HEX,ERR_R_MALLOC_FAILURE);
78 goto err;
79 }
80 p=buf;
81 if (a->neg) *(p++)='-';
82 if (BN_is_zero(a)) *(p++)='0';
83 for (i=a->top-1; i >=0; i--)
84 {
85 for (j=BN_BITS2-8; j >= 0; j-=8)
86 {
87 /* strip leading zeros */
88 v=((int)(a->d[i]>>(long)j))&0xff;
89 if (z || (v != 0))
90 {
91 *(p++)=Hex[v>>4];
92 *(p++)=Hex[v&0x0f];
93 z=1;
94 }
95 }
96 }
97 *p='\0';
98err:
99 return(buf);
100 }
101
102/* Must 'OPENSSL_free' the returned data */
103char *BN_bn2dec(const BIGNUM *a)
104 {
105 int i=0,num, ok = 0;
106 char *buf=NULL;
107 char *p;
108 BIGNUM *t=NULL;
109 BN_ULONG *bn_data=NULL,*lp;
110
111 /* get an upper bound for the length of the decimal integer
112 * num <= (BN_num_bits(a) + 1) * log(2)
113 * <= 3 * BN_num_bits(a) * 0.1001 + log(2) + 1 (rounding error)
114 * <= BN_num_bits(a)/10 + BN_num_bits/1000 + 1 + 1
115 */
116 i=BN_num_bits(a)*3;
117 num=(i/10+i/1000+1)+1;
118 bn_data=(BN_ULONG *)OPENSSL_malloc((num/BN_DEC_NUM+1)*sizeof(BN_ULONG));
119 buf=(char *)OPENSSL_malloc(num+3);
120 if ((buf == NULL) || (bn_data == NULL))
121 {
122 BNerr(BN_F_BN_BN2DEC,ERR_R_MALLOC_FAILURE);
123 goto err;
124 }
125 if ((t=BN_dup(a)) == NULL) goto err;
126
127#define BUF_REMAIN (num+3 - (size_t)(p - buf))
128 p=buf;
129 lp=bn_data;
130 if (BN_is_zero(t))
131 {
132 *(p++)='0';
133 *(p++)='\0';
134 }
135 else
136 {
137 if (BN_is_negative(t))
138 *p++ = '-';
139
140 i=0;
141 while (!BN_is_zero(t))
142 {
143 *lp=BN_div_word(t,BN_DEC_CONV);
144 lp++;
145 }
146 lp--;
147 /* We now have a series of blocks, BN_DEC_NUM chars
148 * in length, where the last one needs truncation.
149 * The blocks need to be reversed in order. */
150 BIO_snprintf(p,BUF_REMAIN,BN_DEC_FMT1,*lp);
151 while (*p) p++;
152 while (lp != bn_data)
153 {
154 lp--;
155 BIO_snprintf(p,BUF_REMAIN,BN_DEC_FMT2,*lp);
156 while (*p) p++;
157 }
158 }
159 ok = 1;
160err:
161 if (bn_data != NULL) OPENSSL_free(bn_data);
162 if (t != NULL) BN_free(t);
163 if (!ok && buf)
164 {
165 OPENSSL_free(buf);
166 buf = NULL;
167 }
168
169 return(buf);
170 }
171
172int BN_hex2bn(BIGNUM **bn, const char *a)
173 {
174 BIGNUM *ret=NULL;
175 BN_ULONG l=0;
176 int neg=0,h,m,i,j,k,c;
177 int num;
178
179 if ((a == NULL) || (*a == '\0')) return(0);
180
181 if (*a == '-') { neg=1; a++; }
182
183 for (i=0; isxdigit((unsigned char) a[i]); i++)
184 ;
185
186 num=i+neg;
187 if (bn == NULL) return(num);
188
189 /* a is the start of the hex digits, and it is 'i' long */
190 if (*bn == NULL)
191 {
192 if ((ret=BN_new()) == NULL) return(0);
193 }
194 else
195 {
196 ret= *bn;
197 BN_zero(ret);
198 }
199
200 /* i is the number of hex digests; */
201 if (bn_expand(ret,i*4) == NULL) goto err;
202
203 j=i; /* least significant 'hex' */
204 m=0;
205 h=0;
206 while (j > 0)
207 {
208 m=((BN_BYTES*2) <= j)?(BN_BYTES*2):j;
209 l=0;
210 for (;;)
211 {
212 c=a[j-m];
213 if ((c >= '0') && (c <= '9')) k=c-'0';
214 else if ((c >= 'a') && (c <= 'f')) k=c-'a'+10;
215 else if ((c >= 'A') && (c <= 'F')) k=c-'A'+10;
216 else k=0; /* paranoia */
217 l=(l<<4)|k;
218
219 if (--m <= 0)
220 {
221 ret->d[h++]=l;
222 break;
223 }
224 }
225 j-=(BN_BYTES*2);
226 }
227 ret->top=h;
228 bn_correct_top(ret);
229 ret->neg=neg;
230
231 *bn=ret;
232 bn_check_top(ret);
233 return(num);
234err:
235 if (*bn == NULL) BN_free(ret);
236 return(0);
237 }
238
239int BN_dec2bn(BIGNUM **bn, const char *a)
240 {
241 BIGNUM *ret=NULL;
242 BN_ULONG l=0;
243 int neg=0,i,j;
244 int num;
245
246 if ((a == NULL) || (*a == '\0')) return(0);
247 if (*a == '-') { neg=1; a++; }
248
249 for (i=0; isdigit((unsigned char) a[i]); i++)
250 ;
251
252 num=i+neg;
253 if (bn == NULL) return(num);
254
255 /* a is the start of the digits, and it is 'i' long.
256 * We chop it into BN_DEC_NUM digits at a time */
257 if (*bn == NULL)
258 {
259 if ((ret=BN_new()) == NULL) return(0);
260 }
261 else
262 {
263 ret= *bn;
264 BN_zero(ret);
265 }
266
267 /* i is the number of digests, a bit of an over expand; */
268 if (bn_expand(ret,i*4) == NULL) goto err;
269
270 j=BN_DEC_NUM-(i%BN_DEC_NUM);
271 if (j == BN_DEC_NUM) j=0;
272 l=0;
273 while (*a)
274 {
275 l*=10;
276 l+= *a-'0';
277 a++;
278 if (++j == BN_DEC_NUM)
279 {
280 BN_mul_word(ret,BN_DEC_CONV);
281 BN_add_word(ret,l);
282 l=0;
283 j=0;
284 }
285 }
286 ret->neg=neg;
287
288 bn_correct_top(ret);
289 *bn=ret;
290 bn_check_top(ret);
291 return(num);
292err:
293 if (*bn == NULL) BN_free(ret);
294 return(0);
295 }
296
297int BN_asc2bn(BIGNUM **bn, const char *a)
298 {
299 const char *p = a;
300 if (*p == '-')
301 p++;
302
303 if (p[0] == '0' && (p[1] == 'X' || p[1] == 'x'))
304 {
305 if (!BN_hex2bn(bn, p + 2))
306 return 0;
307 }
308 else
309 {
310 if (!BN_dec2bn(bn, p))
311 return 0;
312 }
313 if (*a == '-')
314 (*bn)->neg = 1;
315 return 1;
316 }
317
318#ifndef OPENSSL_NO_BIO
319#ifndef OPENSSL_NO_FP_API
320int BN_print_fp(FILE *fp, const BIGNUM *a)
321 {
322 BIO *b;
323 int ret;
324
325 if ((b=BIO_new(BIO_s_file())) == NULL)
326 return(0);
327 BIO_set_fp(b,fp,BIO_NOCLOSE);
328 ret=BN_print(b,a);
329 BIO_free(b);
330 return(ret);
331 }
332#endif
333
334int BN_print(BIO *bp, const BIGNUM *a)
335 {
336 int i,j,v,z=0;
337 int ret=0;
338
339 if ((a->neg) && (BIO_write(bp,"-",1) != 1)) goto end;
340 if (BN_is_zero(a) && (BIO_write(bp,"0",1) != 1)) goto end;
341 for (i=a->top-1; i >=0; i--)
342 {
343 for (j=BN_BITS2-4; j >= 0; j-=4)
344 {
345 /* strip leading zeros */
346 v=((int)(a->d[i]>>(long)j))&0x0f;
347 if (z || (v != 0))
348 {
349 if (BIO_write(bp,&(Hex[v]),1) != 1)
350 goto end;
351 z=1;
352 }
353 }
354 }
355 ret=1;
356end:
357 return(ret);
358 }
359#endif
360
361char *BN_options(void)
362 {
363 static int init=0;
364 static char data[16];
365
366 if (!init)
367 {
368 init++;
369#ifdef BN_LLONG
370 BIO_snprintf(data,sizeof data,"bn(%d,%d)",
371 (int)sizeof(BN_ULLONG)*8,(int)sizeof(BN_ULONG)*8);
372#else
373 BIO_snprintf(data,sizeof data,"bn(%d,%d)",
374 (int)sizeof(BN_ULONG)*8,(int)sizeof(BN_ULONG)*8);
375#endif
376 }
377 return(data);
378 }
diff --git a/src/lib/libcrypto/bn/bn_rand.c b/src/lib/libcrypto/bn/bn_rand.c
deleted file mode 100644
index b376c28ff3..0000000000
--- a/src/lib/libcrypto/bn/bn_rand.c
+++ /dev/null
@@ -1,305 +0,0 @@
1/* crypto/bn/bn_rand.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58/* ====================================================================
59 * Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved.
60 *
61 * Redistribution and use in source and binary forms, with or without
62 * modification, are permitted provided that the following conditions
63 * are met:
64 *
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 *
68 * 2. Redistributions in binary form must reproduce the above copyright
69 * notice, this list of conditions and the following disclaimer in
70 * the documentation and/or other materials provided with the
71 * distribution.
72 *
73 * 3. All advertising materials mentioning features or use of this
74 * software must display the following acknowledgment:
75 * "This product includes software developed by the OpenSSL Project
76 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
77 *
78 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
79 * endorse or promote products derived from this software without
80 * prior written permission. For written permission, please contact
81 * openssl-core@openssl.org.
82 *
83 * 5. Products derived from this software may not be called "OpenSSL"
84 * nor may "OpenSSL" appear in their names without prior written
85 * permission of the OpenSSL Project.
86 *
87 * 6. Redistributions of any form whatsoever must retain the following
88 * acknowledgment:
89 * "This product includes software developed by the OpenSSL Project
90 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
91 *
92 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
93 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
95 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
96 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
97 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
98 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
99 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
101 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
103 * OF THE POSSIBILITY OF SUCH DAMAGE.
104 * ====================================================================
105 *
106 * This product includes cryptographic software written by Eric Young
107 * (eay@cryptsoft.com). This product includes software written by Tim
108 * Hudson (tjh@cryptsoft.com).
109 *
110 */
111
112#include <stdio.h>
113#include <time.h>
114#include "cryptlib.h"
115#include "bn_lcl.h"
116#include <openssl/rand.h>
117
118static int bnrand(int pseudorand, BIGNUM *rnd, int bits, int top, int bottom)
119 {
120 unsigned char *buf=NULL;
121 int ret=0,bit,bytes,mask;
122 time_t tim;
123
124 if (bits == 0)
125 {
126 BN_zero(rnd);
127 return 1;
128 }
129
130 bytes=(bits+7)/8;
131 bit=(bits-1)%8;
132 mask=0xff<<(bit+1);
133
134 buf=(unsigned char *)OPENSSL_malloc(bytes);
135 if (buf == NULL)
136 {
137 BNerr(BN_F_BNRAND,ERR_R_MALLOC_FAILURE);
138 goto err;
139 }
140
141 /* make a random number and set the top and bottom bits */
142 time(&tim);
143 RAND_add(&tim,sizeof(tim),0.0);
144
145 if (pseudorand)
146 {
147 if (RAND_pseudo_bytes(buf, bytes) == -1)
148 goto err;
149 }
150 else
151 {
152 if (RAND_bytes(buf, bytes) <= 0)
153 goto err;
154 }
155
156#if 1
157 if (pseudorand == 2)
158 {
159 /* generate patterns that are more likely to trigger BN
160 library bugs */
161 int i;
162 unsigned char c;
163
164 for (i = 0; i < bytes; i++)
165 {
166 RAND_pseudo_bytes(&c, 1);
167 if (c >= 128 && i > 0)
168 buf[i] = buf[i-1];
169 else if (c < 42)
170 buf[i] = 0;
171 else if (c < 84)
172 buf[i] = 255;
173 }
174 }
175#endif
176
177 if (top != -1)
178 {
179 if (top)
180 {
181 if (bit == 0)
182 {
183 buf[0]=1;
184 buf[1]|=0x80;
185 }
186 else
187 {
188 buf[0]|=(3<<(bit-1));
189 }
190 }
191 else
192 {
193 buf[0]|=(1<<bit);
194 }
195 }
196 buf[0] &= ~mask;
197 if (bottom) /* set bottom bit if requested */
198 buf[bytes-1]|=1;
199 if (!BN_bin2bn(buf,bytes,rnd)) goto err;
200 ret=1;
201err:
202 if (buf != NULL)
203 {
204 OPENSSL_cleanse(buf,bytes);
205 OPENSSL_free(buf);
206 }
207 bn_check_top(rnd);
208 return(ret);
209 }
210
211int BN_rand(BIGNUM *rnd, int bits, int top, int bottom)
212 {
213 return bnrand(0, rnd, bits, top, bottom);
214 }
215
216int BN_pseudo_rand(BIGNUM *rnd, int bits, int top, int bottom)
217 {
218 return bnrand(1, rnd, bits, top, bottom);
219 }
220
221#if 1
222int BN_bntest_rand(BIGNUM *rnd, int bits, int top, int bottom)
223 {
224 return bnrand(2, rnd, bits, top, bottom);
225 }
226#endif
227
228
229/* random number r: 0 <= r < range */
230static int bn_rand_range(int pseudo, BIGNUM *r, const BIGNUM *range)
231 {
232 int (*bn_rand)(BIGNUM *, int, int, int) = pseudo ? BN_pseudo_rand : BN_rand;
233 int n;
234 int count = 100;
235
236 if (range->neg || BN_is_zero(range))
237 {
238 BNerr(BN_F_BN_RAND_RANGE, BN_R_INVALID_RANGE);
239 return 0;
240 }
241
242 n = BN_num_bits(range); /* n > 0 */
243
244 /* BN_is_bit_set(range, n - 1) always holds */
245
246 if (n == 1)
247 BN_zero(r);
248 else if (!BN_is_bit_set(range, n - 2) && !BN_is_bit_set(range, n - 3))
249 {
250 /* range = 100..._2,
251 * so 3*range (= 11..._2) is exactly one bit longer than range */
252 do
253 {
254 if (!bn_rand(r, n + 1, -1, 0)) return 0;
255 /* If r < 3*range, use r := r MOD range
256 * (which is either r, r - range, or r - 2*range).
257 * Otherwise, iterate once more.
258 * Since 3*range = 11..._2, each iteration succeeds with
259 * probability >= .75. */
260 if (BN_cmp(r ,range) >= 0)
261 {
262 if (!BN_sub(r, r, range)) return 0;
263 if (BN_cmp(r, range) >= 0)
264 if (!BN_sub(r, r, range)) return 0;
265 }
266
267 if (!--count)
268 {
269 BNerr(BN_F_BN_RAND_RANGE, BN_R_TOO_MANY_ITERATIONS);
270 return 0;
271 }
272
273 }
274 while (BN_cmp(r, range) >= 0);
275 }
276 else
277 {
278 do
279 {
280 /* range = 11..._2 or range = 101..._2 */
281 if (!bn_rand(r, n, -1, 0)) return 0;
282
283 if (!--count)
284 {
285 BNerr(BN_F_BN_RAND_RANGE, BN_R_TOO_MANY_ITERATIONS);
286 return 0;
287 }
288 }
289 while (BN_cmp(r, range) >= 0);
290 }
291
292 bn_check_top(r);
293 return 1;
294 }
295
296
297int BN_rand_range(BIGNUM *r, const BIGNUM *range)
298 {
299 return bn_rand_range(0, r, range);
300 }
301
302int BN_pseudo_rand_range(BIGNUM *r, const BIGNUM *range)
303 {
304 return bn_rand_range(1, r, range);
305 }
diff --git a/src/lib/libcrypto/bn/bn_recp.c b/src/lib/libcrypto/bn/bn_recp.c
deleted file mode 100644
index 2e8efb8dae..0000000000
--- a/src/lib/libcrypto/bn/bn_recp.c
+++ /dev/null
@@ -1,234 +0,0 @@
1/* crypto/bn/bn_recp.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60#include "cryptlib.h"
61#include "bn_lcl.h"
62
63void BN_RECP_CTX_init(BN_RECP_CTX *recp)
64 {
65 BN_init(&(recp->N));
66 BN_init(&(recp->Nr));
67 recp->num_bits=0;
68 recp->flags=0;
69 }
70
71BN_RECP_CTX *BN_RECP_CTX_new(void)
72 {
73 BN_RECP_CTX *ret;
74
75 if ((ret=(BN_RECP_CTX *)OPENSSL_malloc(sizeof(BN_RECP_CTX))) == NULL)
76 return(NULL);
77
78 BN_RECP_CTX_init(ret);
79 ret->flags=BN_FLG_MALLOCED;
80 return(ret);
81 }
82
83void BN_RECP_CTX_free(BN_RECP_CTX *recp)
84 {
85 if(recp == NULL)
86 return;
87
88 BN_free(&(recp->N));
89 BN_free(&(recp->Nr));
90 if (recp->flags & BN_FLG_MALLOCED)
91 OPENSSL_free(recp);
92 }
93
94int BN_RECP_CTX_set(BN_RECP_CTX *recp, const BIGNUM *d, BN_CTX *ctx)
95 {
96 if (!BN_copy(&(recp->N),d)) return 0;
97 BN_zero(&(recp->Nr));
98 recp->num_bits=BN_num_bits(d);
99 recp->shift=0;
100 return(1);
101 }
102
103int BN_mod_mul_reciprocal(BIGNUM *r, const BIGNUM *x, const BIGNUM *y,
104 BN_RECP_CTX *recp, BN_CTX *ctx)
105 {
106 int ret=0;
107 BIGNUM *a;
108 const BIGNUM *ca;
109
110 BN_CTX_start(ctx);
111 if ((a = BN_CTX_get(ctx)) == NULL) goto err;
112 if (y != NULL)
113 {
114 if (x == y)
115 { if (!BN_sqr(a,x,ctx)) goto err; }
116 else
117 { if (!BN_mul(a,x,y,ctx)) goto err; }
118 ca = a;
119 }
120 else
121 ca=x; /* Just do the mod */
122
123 ret = BN_div_recp(NULL,r,ca,recp,ctx);
124err:
125 BN_CTX_end(ctx);
126 bn_check_top(r);
127 return(ret);
128 }
129
130int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
131 BN_RECP_CTX *recp, BN_CTX *ctx)
132 {
133 int i,j,ret=0;
134 BIGNUM *a,*b,*d,*r;
135
136 BN_CTX_start(ctx);
137 a=BN_CTX_get(ctx);
138 b=BN_CTX_get(ctx);
139 if (dv != NULL)
140 d=dv;
141 else
142 d=BN_CTX_get(ctx);
143 if (rem != NULL)
144 r=rem;
145 else
146 r=BN_CTX_get(ctx);
147 if (a == NULL || b == NULL || d == NULL || r == NULL) goto err;
148
149 if (BN_ucmp(m,&(recp->N)) < 0)
150 {
151 BN_zero(d);
152 if (!BN_copy(r,m)) return 0;
153 BN_CTX_end(ctx);
154 return(1);
155 }
156
157 /* We want the remainder
158 * Given input of ABCDEF / ab
159 * we need multiply ABCDEF by 3 digests of the reciprocal of ab
160 *
161 */
162
163 /* i := max(BN_num_bits(m), 2*BN_num_bits(N)) */
164 i=BN_num_bits(m);
165 j=recp->num_bits<<1;
166 if (j>i) i=j;
167
168 /* Nr := round(2^i / N) */
169 if (i != recp->shift)
170 recp->shift=BN_reciprocal(&(recp->Nr),&(recp->N),
171 i,ctx); /* BN_reciprocal returns i, or -1 for an error */
172 if (recp->shift == -1) goto err;
173
174 /* d := |round(round(m / 2^BN_num_bits(N)) * recp->Nr / 2^(i - BN_num_bits(N)))|
175 * = |round(round(m / 2^BN_num_bits(N)) * round(2^i / N) / 2^(i - BN_num_bits(N)))|
176 * <= |(m / 2^BN_num_bits(N)) * (2^i / N) * (2^BN_num_bits(N) / 2^i)|
177 * = |m/N|
178 */
179 if (!BN_rshift(a,m,recp->num_bits)) goto err;
180 if (!BN_mul(b,a,&(recp->Nr),ctx)) goto err;
181 if (!BN_rshift(d,b,i-recp->num_bits)) goto err;
182 d->neg=0;
183
184 if (!BN_mul(b,&(recp->N),d,ctx)) goto err;
185 if (!BN_usub(r,m,b)) goto err;
186 r->neg=0;
187
188#if 1
189 j=0;
190 while (BN_ucmp(r,&(recp->N)) >= 0)
191 {
192 if (j++ > 2)
193 {
194 BNerr(BN_F_BN_DIV_RECP,BN_R_BAD_RECIPROCAL);
195 goto err;
196 }
197 if (!BN_usub(r,r,&(recp->N))) goto err;
198 if (!BN_add_word(d,1)) goto err;
199 }
200#endif
201
202 r->neg=BN_is_zero(r)?0:m->neg;
203 d->neg=m->neg^recp->N.neg;
204 ret=1;
205err:
206 BN_CTX_end(ctx);
207 bn_check_top(dv);
208 bn_check_top(rem);
209 return(ret);
210 }
211
212/* len is the expected size of the result
213 * We actually calculate with an extra word of precision, so
214 * we can do faster division if the remainder is not required.
215 */
216/* r := 2^len / m */
217int BN_reciprocal(BIGNUM *r, const BIGNUM *m, int len, BN_CTX *ctx)
218 {
219 int ret= -1;
220 BIGNUM *t;
221
222 BN_CTX_start(ctx);
223 if((t = BN_CTX_get(ctx)) == NULL) goto err;
224
225 if (!BN_set_bit(t,len)) goto err;
226
227 if (!BN_div(r,NULL,t,m,ctx)) goto err;
228
229 ret=len;
230err:
231 bn_check_top(r);
232 BN_CTX_end(ctx);
233 return(ret);
234 }
diff --git a/src/lib/libcrypto/bn/bn_shift.c b/src/lib/libcrypto/bn/bn_shift.c
deleted file mode 100644
index a6fca2c424..0000000000
--- a/src/lib/libcrypto/bn/bn_shift.c
+++ /dev/null
@@ -1,223 +0,0 @@
1/* crypto/bn/bn_shift.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60#include "cryptlib.h"
61#include "bn_lcl.h"
62
63int BN_lshift1(BIGNUM *r, const BIGNUM *a)
64	{
	/*
	 * r := a * 2 (left shift by one bit).  r and a may alias.
	 * Returns 1 on success, 0 on allocation failure.
	 */
65	register BN_ULONG *ap,*rp,t,c;
66	int i;
67
68	bn_check_top(r);
69	bn_check_top(a);
70
71	if (r != a)
72		{
73		r->neg=a->neg;
		/* one extra word reserved in case the top bit carries out */
74		if (bn_wexpand(r,a->top+1) == NULL) return(0);
75		r->top=a->top;
76		}
77	else
78		{
79		if (bn_wexpand(r,a->top+1) == NULL) return(0);
80		}
81	ap=a->d;
82	rp=r->d;
83	c=0;
	/* shift word by word, propagating the carried-out top bit */
84	for (i=0; i<a->top; i++)
85		{
86		t= *(ap++);
87		*(rp++)=((t<<1)|c)&BN_MASK2;
88		c=(t & BN_TBIT)?1:0;
89		}
	/* a final carry grows the number by one word */
90	if (c)
91		{
92		*rp=1;
93		r->top++;
94		}
95	bn_check_top(r);
96	return(1);
97	}
98
99int BN_rshift1(BIGNUM *r, const BIGNUM *a)
100	{
	/*
	 * r := a / 2 (right shift by one bit, low bit discarded).
	 * r and a may alias.  Returns 1 on success, 0 on alloc failure.
	 */
101	BN_ULONG *ap,*rp,t,c;
102	int i,j;
103
104	bn_check_top(r);
105	bn_check_top(a);
106
107	if (BN_is_zero(a))
108		{
109		BN_zero(r);
110		return(1);
111		}
112	i = a->top;
113	ap= a->d;
	/* result loses a word only when the top word is exactly 1
	 * (its single bit is shifted out entirely) */
114	j = i-(ap[i-1]==1);
115	if (a != r)
116		{
117		if (bn_wexpand(r,j) == NULL) return(0);
118		r->neg=a->neg;
119		}
120	rp=r->d;
	/* handle the top word first; c carries its low bit downward */
121	t=ap[--i];
122	c=(t&1)?BN_TBIT:0;
123	if (t>>=1) rp[i]=t;
	/* remaining words, high to low, pulling in the bit from above */
124	while (i>0)
125		{
126		t=ap[--i];
127		rp[i]=((t>>1)&BN_MASK2)|c;
128		c=(t&1)?BN_TBIT:0;
129		}
130	r->top=j;
131	bn_check_top(r);
132	return(1);
133	}
134
135int BN_lshift(BIGNUM *r, const BIGNUM *a, int n)
136	{
	/*
	 * r := a << n bits.  nw is the whole-word part of the shift, lb
	 * the residual bit count.  Returns 1 on success, 0 on alloc
	 * failure.  NOTE(review): n is assumed non-negative; a negative n
	 * would make nw/lb nonsensical -- callers must guarantee n >= 0.
	 */
137	int i,nw,lb,rb;
138	BN_ULONG *t,*f;
139	BN_ULONG l;
140
141	bn_check_top(r);
142	bn_check_top(a);
143
144	r->neg=a->neg;
145	nw=n/BN_BITS2;
146	if (bn_wexpand(r,a->top+nw+1) == NULL) return(0);
147	lb=n%BN_BITS2;
148	rb=BN_BITS2-lb;
149	f=a->d;
150	t=r->d;
151	t[a->top+nw]=0;
	/* lb == 0 must be special-cased: the else branch would shift by
	 * rb == BN_BITS2, which is undefined behavior in C */
152	if (lb == 0)
153		for (i=a->top-1; i>=0; i--)
154			t[nw+i]=f[i];
155	else
		/* walk high to low so an in-place shift (r == a) is safe */
156		for (i=a->top-1; i>=0; i--)
157			{
158			l=f[i];
159			t[nw+i+1]|=(l>>rb)&BN_MASK2;
160			t[nw+i]=(l<<lb)&BN_MASK2;
161			}
	/* zero the nw low words vacated by the shift */
162	memset(t,0,nw*sizeof(t[0]));
163/*	for (i=0; i<nw; i++)
164		t[i]=0;*/
165	r->top=a->top+nw+1;
166	bn_correct_top(r);
167	bn_check_top(r);
168	return(1);
169	}
170
171int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
172	{
	/*
	 * r := a >> n bits.  Returns 1 on success, 0 on alloc failure.
	 * NOTE(review): n is assumed non-negative, as in BN_lshift.
	 */
173	int i,j,nw,lb,rb;
174	BN_ULONG *t,*f;
175	BN_ULONG l,tmp;
176
177	bn_check_top(r);
178	bn_check_top(a);
179
180	nw=n/BN_BITS2;
181	rb=n%BN_BITS2;
182	lb=BN_BITS2-rb;
	/* shifting out every word of a yields zero */
183	if (nw >= a->top || a->top == 0)
184		{
185		BN_zero(r);
186		return(1);
187		}
	/* i = number of words in the result, rounded up */
188	i = (BN_num_bits(a)-n+(BN_BITS2-1))/BN_BITS2;
189	if (r != a)
190		{
191		r->neg=a->neg;
192		if (bn_wexpand(r,i) == NULL) return(0);
193		}
194	else
195		{
196		if (n == 0)
197			return 1; /* or the copying loop will go berserk */
198		}
199
	/* f starts at the first word that survives the shift */
200	f= &(a->d[nw]);
201	t=r->d;
202	j=a->top-nw;
203	r->top=i;
204
	/* rb == 0 must be special-cased: the else branch would shift by
	 * lb == BN_BITS2, which is undefined behavior in C */
205	if (rb == 0)
206		{
207		for (i=j; i != 0; i--)
208			*(t++)= *(f++);
209		}
210	else
211		{
		/* combine the high bits of each word with the low bits of
		 * the next, walking low to high (safe in place) */
212		l= *(f++);
213		for (i=j-1; i != 0; i--)
214			{
215			tmp =(l>>rb)&BN_MASK2;
216			l= *(f++);
217			*(t++) =(tmp|(l<<lb))&BN_MASK2;
218			}
		/* the last word is written only if nonzero, matching the
		 * r->top = i computed above */
219		if ((l = (l>>rb)&BN_MASK2)) *(t) = l;
220		}
221	bn_check_top(r);
222	return(1);
223	}
diff --git a/src/lib/libcrypto/bn/bn_sqr.c b/src/lib/libcrypto/bn/bn_sqr.c
deleted file mode 100644
index 270d0cd348..0000000000
--- a/src/lib/libcrypto/bn/bn_sqr.c
+++ /dev/null
@@ -1,294 +0,0 @@
1/* crypto/bn/bn_sqr.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60#include "cryptlib.h"
61#include "bn_lcl.h"
62
63/* r must not be a */
64/* I've just gone over this and it is now %20 faster on x86 - eay - 27 Jun 96 */
65int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx)
66	{
	/*
	 * r := a^2.  Dispatches to comba/normal/recursive squaring by
	 * operand size.  r must not alias a at the word-array level, so
	 * when r == a the result is built in a ctx temporary (rr) and
	 * copied back at the end.  Returns 1 on success, 0 on error.
	 */
67	int max,al;
68	int ret = 0;
69	BIGNUM *tmp,*rr;
70
71#ifdef BN_COUNT
72	fprintf(stderr,"BN_sqr %d * %d\n",a->top,a->top);
73#endif
74	bn_check_top(a);
75
76	al=a->top;
77	if (al <= 0)
78		{
79		r->top=0;
80		return 1;
81		}
82
83	BN_CTX_start(ctx);
	/* if r aliases a, square into a scratch BIGNUM instead */
84	rr=(a != r) ? r : BN_CTX_get(ctx);
85	tmp=BN_CTX_get(ctx);
86	if (!rr || !tmp) goto err;
87
88	max = 2 * al; /* Non-zero (from above) */
89	if (bn_wexpand(rr,max) == NULL) goto err;
90
	/* fixed-size fast paths (4- and 8-word operands) */
91	if (al == 4)
92		{
93#ifndef BN_SQR_COMBA
94		BN_ULONG t[8];
95		bn_sqr_normal(rr->d,a->d,4,t);
96#else
97		bn_sqr_comba4(rr->d,a->d);
98#endif
99		}
100	else if (al == 8)
101		{
102#ifndef BN_SQR_COMBA
103		BN_ULONG t[16];
104		bn_sqr_normal(rr->d,a->d,8,t);
105#else
106		bn_sqr_comba8(rr->d,a->d);
107#endif
108		}
109	else
110		{
111#if defined(BN_RECURSION)
112		if (al < BN_SQR_RECURSIVE_SIZE_NORMAL)
113			{
114			BN_ULONG t[BN_SQR_RECURSIVE_SIZE_NORMAL*2];
115			bn_sqr_normal(rr->d,a->d,al,t);
116			}
117		else
118			{
119			int j,k;

			/* bn_sqr_recursive needs a power-of-two word
			 * count; j = largest power of two <= al */
121			j=BN_num_bits_word((BN_ULONG)al);
122			j=1<<(j-1);
123			k=j+j;
124			if (al == j)
125				{
126				if (bn_wexpand(tmp,k*2) == NULL) goto err;
127				bn_sqr_recursive(rr->d,a->d,al,tmp->d);
128				}
129			else
130				{
131				if (bn_wexpand(tmp,max) == NULL) goto err;
132				bn_sqr_normal(rr->d,a->d,al,tmp->d);
133				}
134			}
135#else
136		if (bn_wexpand(tmp,max) == NULL) goto err;
137		bn_sqr_normal(rr->d,a->d,al,tmp->d);
138#endif
139		}

	/* a square is never negative, regardless of a's sign */
141	rr->neg=0;
	/* If the most-significant half of the top word of 'a' is zero,
	 * then the square of 'a' will fit in at most max-1 words. */
144	if(a->d[al - 1] == (a->d[al - 1] & BN_MASK2l))
145		rr->top = max - 1;
146	else
147		rr->top = max;
148	if (rr != r) BN_copy(r,rr);
149	ret = 1;
150 err:
151	bn_check_top(rr);
152	bn_check_top(tmp);
153	BN_CTX_end(ctx);
154	return(ret);
155	}
156
157/* tmp must have 2*n words */
158void bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, int n, BN_ULONG *tmp)
159	{
	/*
	 * Schoolbook squaring: accumulate all cross-products a[i]*a[j]
	 * (i < j) into r, double them, then add the diagonal squares
	 * a[i]^2 computed into tmp.  r must be 2*n words, tmp 2*n words;
	 * r must not overlap a.
	 */
160	int i,j,max;
161	const BN_ULONG *ap;
162	BN_ULONG *rp;
163
164	max=n*2;
165	ap=a;
166	rp=r;
167	rp[0]=rp[max-1]=0;
168	rp++;
169	j=n;
170
	/* first row: a[0] times a[1..n-1] */
171	if (--j > 0)
172		{
173		ap++;
174		rp[j]=bn_mul_words(rp,ap,j,ap[-1]);
175		rp+=2;
176		}
177
	/* remaining rows: a[i] times a[i+1..n-1], added in place */
178	for (i=n-2; i>0; i--)
179		{
180		j--;
181		ap++;
182		rp[j]=bn_mul_add_words(rp,ap,j,ap[-1]);
183		rp+=2;
184		}
185
	/* double the cross-products: r := 2*r */
186	bn_add_words(r,r,r,max);
187
188	/* There will not be a carry */
189
	/* diagonal terms a[i]^2 into tmp, then fold into r */
190	bn_sqr_words(tmp,a,n);
191
192	bn_add_words(r,r,tmp,max);
193	}
194
195#ifdef BN_RECURSION
196/* r is 2*n words in size,
197 * a and b are both n words in size. (There's not actually a 'b' here ...)
198 * n must be a power of 2.
199 * We multiply and return the result.
200 * t must be 2*n words in size
201 * We calculate
202 * a[0]*b[0]
203 * a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0])
204 * a[1]*b[1]
205 */
206void bn_sqr_recursive(BN_ULONG *r, const BN_ULONG *a, int n2, BN_ULONG *t)
207	{
	/*
	 * Karatsuba-style recursive squaring.  a is n2 words (n2 a power
	 * of two), r receives 2*n2 words, t is scratch (4*n2 words are
	 * consumed across the recursion: n2*2 here plus the recursive
	 * tail at t[n2*2]).  Statement order below is carry-critical; do
	 * not reorder.
	 */
208	int n=n2/2;
209	int zero,c1;
210	BN_ULONG ln,lo,*p;
211
212#ifdef BN_COUNT
213	fprintf(stderr," bn_sqr_recursive %d * %d\n",n2,n2);
214#endif
	/* base cases: comba (or schoolbook) for 4 and 8 words */
215	if (n2 == 4)
216		{
217#ifndef BN_SQR_COMBA
218		bn_sqr_normal(r,a,4,t);
219#else
220		bn_sqr_comba4(r,a);
221#endif
222		return;
223		}
224	else if (n2 == 8)
225		{
226#ifndef BN_SQR_COMBA
227		bn_sqr_normal(r,a,8,t);
228#else
229		bn_sqr_comba8(r,a);
230#endif
231		return;
232		}
233	if (n2 < BN_SQR_RECURSIVE_SIZE_NORMAL)
234		{
235		bn_sqr_normal(r,a,n2,t);
236		return;
237		}
	/* t := |a[0] - a[1]| (halves of a); note (a0-a1)*(a1-a0) is the
	 * negated square of this difference */
238	/* r=(a[0]-a[1])*(a[1]-a[0]) */
239	c1=bn_cmp_words(a,&(a[n]),n);
240	zero=0;
241	if (c1 > 0)
242		bn_sub_words(t,a,&(a[n]),n);
243	else if (c1 < 0)
244		bn_sub_words(t,&(a[n]),a,n);
245	else
246		zero=1;
247
248	/* The result will always be negative unless it is zero */
249	p= &(t[n2*2]);
250
	/* t[n2..2*n2-1] := (a0-a1)^2, or zero if the halves are equal */
251	if (!zero)
252		bn_sqr_recursive(&(t[n2]),t,n,p);
253	else
254		memset(&(t[n2]),0,n2*sizeof(BN_ULONG));
	/* r[0..n2-1] := a0^2, r[n2..2*n2-1] := a1^2 */
255	bn_sqr_recursive(r,a,n,p);
256	bn_sqr_recursive(&(r[n2]),&(a[n]),n,p);
257
258	/* t[32] holds (a[0]-a[1])*(a[1]-a[0]), it is negative or zero
259	 * r[10] holds (a[0]*b[0])
260	 * r[32] holds (b[1]*b[1])
261	 */
262
263	c1=(int)(bn_add_words(t,r,&(r[n2]),n2));
264
265	/* t[32] is negative */
266	c1-=(int)(bn_sub_words(&(t[n2]),t,&(t[n2]),n2));
267
268	/* t[32] holds (a[0]-a[1])*(a[1]-a[0])+(a[0]*a[0])+(a[1]*a[1])
269	 * r[10] holds (a[0]*a[0])
270	 * r[32] holds (a[1]*a[1])
271	 * c1 holds the carry bits
272	 */
	/* fold the middle term into r at offset n, then ripple any
	 * remaining carry upward by hand */
273	c1+=(int)(bn_add_words(&(r[n]),&(r[n]),&(t[n2]),n2));
274	if (c1)
275		{
276		p= &(r[n+n2]);
277		lo= *p;
278		ln=(lo+c1)&BN_MASK2;
279		*p=ln;
280
281		/* The overflow will stop before we over write
282		 * words we should not overwrite */
283		if (ln < (BN_ULONG)c1)
284			{
285			do	{
286				p++;
287				lo= *p;
288				ln=(lo+1)&BN_MASK2;
289				*p=ln;
290				} while (ln == 0);
291			}
292		}
293	}
294#endif
diff --git a/src/lib/libcrypto/bn/bn_sqrt.c b/src/lib/libcrypto/bn/bn_sqrt.c
deleted file mode 100644
index 6beaf9e5e5..0000000000
--- a/src/lib/libcrypto/bn/bn_sqrt.c
+++ /dev/null
@@ -1,393 +0,0 @@
1/* crypto/bn/bn_sqrt.c */
2/* Written by Lenka Fibikova <fibikova@exp-math.uni-essen.de>
3 * and Bodo Moeller for the OpenSSL project. */
4/* ====================================================================
5 * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 *
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
17 * distribution.
18 *
19 * 3. All advertising materials mentioning features or use of this
20 * software must display the following acknowledgment:
21 * "This product includes software developed by the OpenSSL Project
22 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
23 *
24 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
25 * endorse or promote products derived from this software without
26 * prior written permission. For written permission, please contact
27 * openssl-core@openssl.org.
28 *
29 * 5. Products derived from this software may not be called "OpenSSL"
30 * nor may "OpenSSL" appear in their names without prior written
31 * permission of the OpenSSL Project.
32 *
33 * 6. Redistributions of any form whatsoever must retain the following
34 * acknowledgment:
35 * "This product includes software developed by the OpenSSL Project
36 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
37 *
38 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
39 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
41 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
42 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
44 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
45 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
46 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
47 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
48 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
49 * OF THE POSSIBILITY OF SUCH DAMAGE.
50 * ====================================================================
51 *
52 * This product includes cryptographic software written by Eric Young
53 * (eay@cryptsoft.com). This product includes software written by Tim
54 * Hudson (tjh@cryptsoft.com).
55 *
56 */
57
58#include "cryptlib.h"
59#include "bn_lcl.h"
60
61
62BIGNUM *BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
63/* Returns 'ret' such that
64 * ret^2 == a (mod p),
65 * using the Tonelli/Shanks algorithm (cf. Henri Cohen, "A Course
66 * in Algebraic Computational Number Theory", algorithm 1.5.1).
67 * 'p' must be prime!
68 */
/*
 * NOTE(review): termination of the Tonelli/Shanks loop below relies on
 * 'p' actually being prime, as the header comment says.  This family of
 * implementations was later shown (CVE-2022-0778) to be able to loop
 * forever on crafted non-prime moduli; callers must not pass untrusted
 * composite 'p' here -- verify against the upstream advisory.
 */
69	{
70	BIGNUM *ret = in;
71	int err = 1;
72	int r;
73	BIGNUM *A, *b, *q, *t, *x, *y;
74	int e, i, j;
75
	/* reject even or trivial moduli; p == 2 is handled specially:
	 * the square root of a mod 2 is just a's low bit */
76	if (!BN_is_odd(p) || BN_abs_is_word(p, 1))
77		{
78		if (BN_abs_is_word(p, 2))
79			{
80			if (ret == NULL)
81				ret = BN_new();
82			if (ret == NULL)
83				goto end;
84			if (!BN_set_word(ret, BN_is_bit_set(a, 0)))
85				{
86				if (ret != in)
87					BN_free(ret);
88				return NULL;
89				}
90			bn_check_top(ret);
91			return ret;
92			}
93
94		BNerr(BN_F_BN_MOD_SQRT, BN_R_P_IS_NOT_PRIME);
95		return(NULL);
96		}
97
	/* sqrt(0) = 0 and sqrt(1) = 1 need no computation */
98	if (BN_is_zero(a) || BN_is_one(a))
99		{
100		if (ret == NULL)
101			ret = BN_new();
102		if (ret == NULL)
103			goto end;
104		if (!BN_set_word(ret, BN_is_one(a)))
105			{
106			if (ret != in)
107				BN_free(ret);
108			return NULL;
109			}
110		bn_check_top(ret);
111		return ret;
112		}
113
114	BN_CTX_start(ctx);
115	A = BN_CTX_get(ctx);
116	b = BN_CTX_get(ctx);
117	q = BN_CTX_get(ctx);
118	t = BN_CTX_get(ctx);
119	x = BN_CTX_get(ctx);
120	y = BN_CTX_get(ctx);
	/* only the last BN_CTX_get needs checking: a failed get makes
	 * all subsequent gets fail too */
121	if (y == NULL) goto end;
122
123	if (ret == NULL)
124		ret = BN_new();
125	if (ret == NULL) goto end;
126
127	/* A = a mod p */
128	if (!BN_nnmod(A, a, p, ctx)) goto end;
129
130	/* now write |p| - 1 as 2^e*q where q is odd */
131	e = 1;
132	while (!BN_is_bit_set(p, e))
133		e++;
134	/* we'll set q later (if needed) */
135
136	if (e == 1)
137		{
138		/* The easy case: (|p|-1)/2 is odd, so 2 has an inverse
139		 * modulo (|p|-1)/2, and square roots can be computed
140		 * directly by modular exponentiation.
141		 * We have
142		 * 2 * (|p|+1)/4 == 1 (mod (|p|-1)/2),
143		 * so we can use exponent (|p|+1)/4, i.e. (|p|-3)/4 + 1.
144		 */
145		if (!BN_rshift(q, p, 2)) goto end;
146		q->neg = 0;
147		if (!BN_add_word(q, 1)) goto end;
148		if (!BN_mod_exp(ret, A, q, p, ctx)) goto end;
149		err = 0;
150		goto vrfy;
151		}
152
153	if (e == 2)
154		{
155		/* |p| == 5 (mod 8)
156		 *
157		 * In this case 2 is always a non-square since
158		 * Legendre(2,p) = (-1)^((p^2-1)/8) for any odd prime.
159		 * So if a really is a square, then 2*a is a non-square.
160		 * Thus for
161		 * b := (2*a)^((|p|-5)/8),
162		 * i := (2*a)*b^2
163		 * we have
164		 * i^2 = (2*a)^((1 + (|p|-5)/4)*2)
165		 * = (2*a)^((p-1)/2)
166		 * = -1;
167		 * so if we set
168		 * x := a*b*(i-1),
169		 * then
170		 * x^2 = a^2 * b^2 * (i^2 - 2*i + 1)
171		 * = a^2 * b^2 * (-2*i)
172		 * = a*(-i)*(2*a*b^2)
173		 * = a*(-i)*i
174		 * = a.
175		 *
176		 * (This is due to A.O.L. Atkin,
177		 * <URL: http://listserv.nodak.edu/scripts/wa.exe?A2=ind9211&L=nmbrthry&O=T&P=562>,
178		 * November 1992.)
179		 */
180
181		/* t := 2*a */
182		if (!BN_mod_lshift1_quick(t, A, p)) goto end;
183
184		/* b := (2*a)^((|p|-5)/8) */
185		if (!BN_rshift(q, p, 3)) goto end;
186		q->neg = 0;
187		if (!BN_mod_exp(b, t, q, p, ctx)) goto end;
188
189		/* y := b^2 */
190		if (!BN_mod_sqr(y, b, p, ctx)) goto end;
191
192		/* t := (2*a)*b^2 - 1*/
193		if (!BN_mod_mul(t, t, y, p, ctx)) goto end;
194		if (!BN_sub_word(t, 1)) goto end;
195
196		/* x = a*b*t */
197		if (!BN_mod_mul(x, A, b, p, ctx)) goto end;
198		if (!BN_mod_mul(x, x, t, p, ctx)) goto end;
199
200		if (!BN_copy(ret, x)) goto end;
201		err = 0;
202		goto vrfy;
203		}
204
205	/* e > 2, so we really have to use the Tonelli/Shanks algorithm.
206	 * First, find some y that is not a square. */
207	if (!BN_copy(q, p)) goto end; /* use 'q' as temp */
208	q->neg = 0;
209	i = 2;
210	do
211		{
212		/* For efficiency, try small numbers first;
213		 * if this fails, try random numbers.
214		 */
215		if (i < 22)
216			{
217			if (!BN_set_word(y, i)) goto end;
218			}
219		else
220			{
221			if (!BN_pseudo_rand(y, BN_num_bits(p), 0, 0)) goto end;
222			if (BN_ucmp(y, p) >= 0)
223				{
224				if (!(p->neg ? BN_add : BN_sub)(y, y, p)) goto end;
225				}
226			/* now 0 <= y < |p| */
227			if (BN_is_zero(y))
228				if (!BN_set_word(y, i)) goto end;
229			}
230
		/* Kronecker symbol: -1 means y is a non-square mod |p| */
231		r = BN_kronecker(y, q, ctx); /* here 'q' is |p| */
232		if (r < -1) goto end;
233		if (r == 0)
234			{
235			/* m divides p */
236			BNerr(BN_F_BN_MOD_SQRT, BN_R_P_IS_NOT_PRIME);
237			goto end;
238			}
239		}
240	while (r == 1 && ++i < 82);
241
242	if (r != -1)
243		{
244		/* Many rounds and still no non-square -- this is more likely
245		 * a bug than just bad luck.
246		 * Even if p is not prime, we should have found some y
247		 * such that r == -1.
248		 */
249		BNerr(BN_F_BN_MOD_SQRT, BN_R_TOO_MANY_ITERATIONS);
250		goto end;
251		}
252
253	/* Here's our actual 'q': */
254	if (!BN_rshift(q, q, e)) goto end;
255
256	/* Now that we have some non-square, we can find an element
257	 * of order 2^e by computing its q'th power. */
258	if (!BN_mod_exp(y, y, q, p, ctx)) goto end;
259	if (BN_is_one(y))
260		{
261		BNerr(BN_F_BN_MOD_SQRT, BN_R_P_IS_NOT_PRIME);
262		goto end;
263		}
264
265	/* Now we know that (if p is indeed prime) there is an integer
266	 * k, 0 <= k < 2^e, such that
267	 *
268	 * a^q * y^k == 1 (mod p).
269	 *
270	 * As a^q is a square and y is not, k must be even.
271	 * q+1 is even, too, so there is an element
272	 *
273	 * X := a^((q+1)/2) * y^(k/2),
274	 *
275	 * and it satisfies
276	 *
277	 * X^2 = a^q * a * y^k
278	 * = a,
279	 *
280	 * so it is the square root that we are looking for.
281	 */
282
283	/* t := (q-1)/2 (note that q is odd) */
284	if (!BN_rshift1(t, q)) goto end;
285
286	/* x := a^((q-1)/2) */
287	if (BN_is_zero(t)) /* special case: p = 2^e + 1 */
288		{
289		if (!BN_nnmod(t, A, p, ctx)) goto end;
290		if (BN_is_zero(t))
291			{
292			/* special case: a == 0 (mod p) */
293			BN_zero(ret);
294			err = 0;
295			goto end;
296			}
297		else
298			if (!BN_one(x)) goto end;
299		}
300	else
301		{
302		if (!BN_mod_exp(x, A, t, p, ctx)) goto end;
303		if (BN_is_zero(x))
304			{
305			/* special case: a == 0 (mod p) */
306			BN_zero(ret);
307			err = 0;
308			goto end;
309			}
310		}
311
312	/* b := a*x^2 (= a^q) */
313	if (!BN_mod_sqr(b, x, p, ctx)) goto end;
314	if (!BN_mod_mul(b, b, A, p, ctx)) goto end;
315
316	/* x := a*x (= a^((q+1)/2)) */
317	if (!BN_mod_mul(x, x, A, p, ctx)) goto end;
318
	/* main Tonelli/Shanks reduction loop: e strictly decreases each
	 * iteration (e = i < e) when p is prime -- see NOTE above about
	 * non-prime moduli */
319	while (1)
320		{
321		/* Now b is a^q * y^k for some even k (0 <= k < 2^E
322		 * where E refers to the original value of e, which we
323		 * don't keep in a variable), and x is a^((q+1)/2) * y^(k/2).
324		 *
325		 * We have a*b = x^2,
326		 * y^2^(e-1) = -1,
327		 * b^2^(e-1) = 1.
328		 */
329
330		if (BN_is_one(b))
331			{
332			if (!BN_copy(ret, x)) goto end;
333			err = 0;
334			goto vrfy;
335			}
336
337
338		/* find smallest i such that b^(2^i) = 1 */
339		i = 1;
340		if (!BN_mod_sqr(t, b, p, ctx)) goto end;
341		while (!BN_is_one(t))
342			{
343			i++;
344			if (i == e)
345				{
346				BNerr(BN_F_BN_MOD_SQRT, BN_R_NOT_A_SQUARE);
347				goto end;
348				}
349			if (!BN_mod_mul(t, t, t, p, ctx)) goto end;
350			}
351
352
353		/* t := y^2^(e - i - 1) */
354		if (!BN_copy(t, y)) goto end;
355		for (j = e - i - 1; j > 0; j--)
356			{
357			if (!BN_mod_sqr(t, t, p, ctx)) goto end;
358			}
359		if (!BN_mod_mul(y, t, t, p, ctx)) goto end;
360		if (!BN_mod_mul(x, x, t, p, ctx)) goto end;
361		if (!BN_mod_mul(b, b, y, p, ctx)) goto end;
362		e = i;
363		}
364
365 vrfy:
366	if (!err)
367		{
368		/* verify the result -- the input might have been not a square
369		 * (test added in 0.9.8) */
370
371		if (!BN_mod_sqr(x, ret, p, ctx))
372			err = 1;
373
374		if (!err && 0 != BN_cmp(x, A))
375			{
376			BNerr(BN_F_BN_MOD_SQRT, BN_R_NOT_A_SQUARE);
377			err = 1;
378			}
379		}
380
381 end:
	/* on error, free ret only if we allocated it here (in == NULL) */
382	if (err)
383		{
384		if (ret != NULL && ret != in)
385			{
386			BN_clear_free(ret);
387			}
388		ret = NULL;
389		}
390	BN_CTX_end(ctx);
391	bn_check_top(ret);
392	return ret;
393	}
diff --git a/src/lib/libcrypto/bn/bn_word.c b/src/lib/libcrypto/bn/bn_word.c
deleted file mode 100644
index de83a15b99..0000000000
--- a/src/lib/libcrypto/bn/bn_word.c
+++ /dev/null
@@ -1,238 +0,0 @@
1/* crypto/bn/bn_word.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60#include "cryptlib.h"
61#include "bn_lcl.h"
62
63BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w)
64	{
	/*
	 * Return a mod w, or (BN_ULONG)-1 if w == 0 (division by zero).
	 * NOTE(review): in the !BN_LLONG path, (ret<<BN_BITS4) can
	 * overflow BN_ULONG whenever w needs more than BN_BITS4 bits,
	 * producing a wrong remainder; later OpenSSL versions fix this by
	 * pre-normalizing w -- verify against upstream before relying on
	 * this path for large w.
	 */
65#ifndef BN_LLONG
66	BN_ULONG ret=0;
67#else
68	BN_ULLONG ret=0;
69#endif
70	int i;
71
72	if (w == 0)
73		return (BN_ULONG)-1;
74
75	bn_check_top(a);
76	w&=BN_MASK2;
	/* classic long division remainder, one word (or half-word pair)
	 * at a time from the most significant word down */
77	for (i=a->top-1; i>=0; i--)
78		{
79#ifndef BN_LLONG
80		ret=((ret<<BN_BITS4)|((a->d[i]>>BN_BITS4)&BN_MASK2l))%w;
81		ret=((ret<<BN_BITS4)|(a->d[i]&BN_MASK2l))%w;
82#else
83		ret=(BN_ULLONG)(((ret<<(BN_ULLONG)BN_BITS2)|a->d[i])%
84			(BN_ULLONG)w);
85#endif
86		}
87	return((BN_ULONG)ret);
88	}
89
90BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w)
91	{
	/*
	 * a := a / w (in place); returns the remainder, or (BN_ULONG)-1
	 * on error.  NOTE(review): the -1 error sentinel is also a valid
	 * remainder value for large w, so callers cannot reliably
	 * distinguish error from result -- a long-standing API wart.
	 */
92	BN_ULONG ret = 0;
93	int i, j;
94
95	bn_check_top(a);
96	w &= BN_MASK2;
97
98	if (!w)
99		/* actually this an error (division by zero) */
100		return (BN_ULONG)-1;
101	if (a->top == 0)
102		return 0;
103
	/* normalize input (so bn_div_words doesn't complain) */
105	j = BN_BITS2 - BN_num_bits_word(w);
106	w <<= j;
	/* scale a by the same factor; the remainder is un-scaled below */
107	if (!BN_lshift(a, a, j))
108		return (BN_ULONG)-1;
109
	/* word-by-word schoolbook division, high to low; ret carries the
	 * running remainder into the next word's division */
110	for (i=a->top-1; i>=0; i--)
111		{
112		BN_ULONG l,d;
113
114		l=a->d[i];
115		d=bn_div_words(ret,l,w);
116		ret=(l-((d*w)&BN_MASK2))&BN_MASK2;
117		a->d[i]=d;
118		}
119	if ((a->top > 0) && (a->d[a->top-1] == 0))
120		a->top--;
	/* undo the normalization shift on the remainder */
121	ret >>= j;
122	bn_check_top(a);
123	return(ret);
124	}
125
126int BN_add_word(BIGNUM *a, BN_ULONG w)
127	{
	/*
	 * a := a + w (in place, signed).  Returns 1 on success, 0 on
	 * allocation failure.  Negative a is handled by delegating to
	 * BN_sub_word on |a| and flipping the sign back.
	 */
128	BN_ULONG l;
129	int i;
130
131	bn_check_top(a);
132	w &= BN_MASK2;
133
134	/* degenerate case: w is zero */
135	if (!w) return 1;
136	/* degenerate case: a is zero */
137	if(BN_is_zero(a)) return BN_set_word(a, w);
138	/* handle 'a' when negative */
139	if (a->neg)
140		{
141		a->neg=0;
142		i=BN_sub_word(a,w);
		/* BN_sub_word set a = |a| - w; restore the sign unless the
		 * result is zero (zero must stay non-negative) */
143		if (!BN_is_zero(a))
144			a->neg=!(a->neg);
145		return(i);
146		}
	/* add w into the low word and ripple the carry upward */
147	for (i=0;w!=0 && i<a->top;i++)
148		{
149		a->d[i] = l = (a->d[i]+w)&BN_MASK2;
150		w = (w>l)?1:0;
151		}
	/* a carry out of the top word grows the number by one word */
152	if (w && i==a->top)
153		{
154		if (bn_wexpand(a,a->top+1) == NULL) return 0;
155		a->top++;
156		a->d[i]=w;
157		}
158	bn_check_top(a);
159	return(1);
160	}
161
162int BN_sub_word(BIGNUM *a, BN_ULONG w)
163	{
	/*
	 * a := a - w (in place, signed).  Returns 1 on success, 0 on
	 * failure (propagated from BN_set_word/BN_add_word).
	 */
164	int i;
165
166	bn_check_top(a);
167	w &= BN_MASK2;
168
169	/* degenerate case: w is zero */
170	if (!w) return 1;
171	/* degenerate case: a is zero */
172	if(BN_is_zero(a))
173		{
174		i = BN_set_word(a,w);
175		if (i != 0)
176			BN_set_negative(a, 1);
177		return i;
178		}
	/* handle 'a' when negative: -|a| - w == -(|a| + w) */
180	if (a->neg)
181		{
182		a->neg=0;
183		i=BN_add_word(a,w);
184		a->neg=1;
185		return(i);
186		}
187
	/* single-word a smaller than w: result flips sign */
188	if ((a->top == 1) && (a->d[0] < w))
189		{
190		a->d[0]=w-a->d[0];
191		a->neg=1;
192		return(1);
193		}
	/* multi-word case: subtract from the low word and ripple the
	 * borrow upward (w becomes the borrow, 1, after the first
	 * underflowing word) */
194	i=0;
195	for (;;)
196		{
197		if (a->d[i] >= w)
198			{
199			a->d[i]-=w;
200			break;
201			}
202		else
203			{
204			a->d[i]=(a->d[i]-w)&BN_MASK2;
205			i++;
206			w=1;
207			}
208		}
	/* shrink top if the highest word was zeroed */
209	if ((a->d[i] == 0) && (i == (a->top-1)))
210		a->top--;
211	bn_check_top(a);
212	return(1);
213	}
214
215int BN_mul_word(BIGNUM *a, BN_ULONG w)
216	{
	/*
	 * a := a * w (in place).  The sign of a is preserved; w == 0
	 * zeroes a.  Returns 1 on success, 0 on allocation failure.
	 */
217	BN_ULONG ll;
218
219	bn_check_top(a);
220	w&=BN_MASK2;
221	if (a->top)
222		{
223		if (w == 0)
224			BN_zero(a);
225		else
226			{
			/* bn_mul_words returns the word carried out of the
			 * top; a nonzero carry grows a by one word */
227			ll=bn_mul_words(a->d,a->d,a->top,w);
228			if (ll)
229				{
230				if (bn_wexpand(a,a->top+1) == NULL) return(0);
231				a->d[a->top++]=ll;
232				}
233			}
234		}
235	bn_check_top(a);
236	return(1);
237	}
238
diff --git a/src/lib/libcrypto/bn/bn_x931p.c b/src/lib/libcrypto/bn/bn_x931p.c
deleted file mode 100644
index 04c5c874ec..0000000000
--- a/src/lib/libcrypto/bn/bn_x931p.c
+++ /dev/null
@@ -1,272 +0,0 @@
1/* bn_x931p.c */
2/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
3 * project 2005.
4 */
5/* ====================================================================
6 * Copyright (c) 2005 The OpenSSL Project. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 *
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
18 * distribution.
19 *
20 * 3. All advertising materials mentioning features or use of this
21 * software must display the following acknowledgment:
22 * "This product includes software developed by the OpenSSL Project
23 * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
24 *
25 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
26 * endorse or promote products derived from this software without
27 * prior written permission. For written permission, please contact
28 * licensing@OpenSSL.org.
29 *
30 * 5. Products derived from this software may not be called "OpenSSL"
31 * nor may "OpenSSL" appear in their names without prior written
32 * permission of the OpenSSL Project.
33 *
34 * 6. Redistributions of any form whatsoever must retain the following
35 * acknowledgment:
36 * "This product includes software developed by the OpenSSL Project
37 * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
38 *
39 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
40 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
41 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
42 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
43 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
45 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
46 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
48 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
49 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
50 * OF THE POSSIBILITY OF SUCH DAMAGE.
51 * ====================================================================
52 *
53 * This product includes cryptographic software written by Eric Young
54 * (eay@cryptsoft.com). This product includes software written by Tim
55 * Hudson (tjh@cryptsoft.com).
56 *
57 */
58
59#include <stdio.h>
60#include <openssl/bn.h>
61
62/* X9.31 routines for prime derivation */
63
64/* X9.31 prime derivation. This is used to generate the primes pi
65 * (p1, p2, q1, q2) from a parameter Xpi by checking successive odd
66 * integers.
67 */
68
69static int bn_x931_derive_pi(BIGNUM *pi, const BIGNUM *Xpi, BN_CTX *ctx,
70 BN_GENCB *cb)
71 {
72 int i = 0;
73 if (!BN_copy(pi, Xpi))
74 return 0;
75 if (!BN_is_odd(pi) && !BN_add_word(pi, 1))
76 return 0;
77 for(;;)
78 {
79 i++;
80 BN_GENCB_call(cb, 0, i);
81 /* NB 27 MR is specificed in X9.31 */
82 if (BN_is_prime_fasttest_ex(pi, 27, ctx, 1, cb))
83 break;
84 if (!BN_add_word(pi, 2))
85 return 0;
86 }
87 BN_GENCB_call(cb, 2, i);
88 return 1;
89 }
90
91/* This is the main X9.31 prime derivation function. From parameters
92 * Xp1, Xp2 and Xp derive the prime p. If the parameters p1 or p2 are
93 * not NULL they will be returned too: this is needed for testing.
94 */
95
96int BN_X931_derive_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
97 const BIGNUM *Xp, const BIGNUM *Xp1, const BIGNUM *Xp2,
98 const BIGNUM *e, BN_CTX *ctx, BN_GENCB *cb)
99 {
100 int ret = 0;
101
102 BIGNUM *t, *p1p2, *pm1;
103
104 /* Only even e supported */
105 if (!BN_is_odd(e))
106 return 0;
107
108 BN_CTX_start(ctx);
109 if (!p1)
110 p1 = BN_CTX_get(ctx);
111
112 if (!p2)
113 p2 = BN_CTX_get(ctx);
114
115 t = BN_CTX_get(ctx);
116
117 p1p2 = BN_CTX_get(ctx);
118
119 pm1 = BN_CTX_get(ctx);
120
121 if (!bn_x931_derive_pi(p1, Xp1, ctx, cb))
122 goto err;
123
124 if (!bn_x931_derive_pi(p2, Xp2, ctx, cb))
125 goto err;
126
127 if (!BN_mul(p1p2, p1, p2, ctx))
128 goto err;
129
130 /* First set p to value of Rp */
131
132 if (!BN_mod_inverse(p, p2, p1, ctx))
133 goto err;
134
135 if (!BN_mul(p, p, p2, ctx))
136 goto err;
137
138 if (!BN_mod_inverse(t, p1, p2, ctx))
139 goto err;
140
141 if (!BN_mul(t, t, p1, ctx))
142 goto err;
143
144 if (!BN_sub(p, p, t))
145 goto err;
146
147 if (p->neg && !BN_add(p, p, p1p2))
148 goto err;
149
150 /* p now equals Rp */
151
152 if (!BN_mod_sub(p, p, Xp, p1p2, ctx))
153 goto err;
154
155 if (!BN_add(p, p, Xp))
156 goto err;
157
158 /* p now equals Yp0 */
159
160 for (;;)
161 {
162 int i = 1;
163 BN_GENCB_call(cb, 0, i++);
164 if (!BN_copy(pm1, p))
165 goto err;
166 if (!BN_sub_word(pm1, 1))
167 goto err;
168 if (!BN_gcd(t, pm1, e, ctx))
169 goto err;
170 if (BN_is_one(t)
171 /* X9.31 specifies 8 MR and 1 Lucas test or any prime test
172 * offering similar or better guarantees 50 MR is considerably
173 * better.
174 */
175 && BN_is_prime_fasttest_ex(p, 50, ctx, 1, cb))
176 break;
177 if (!BN_add(p, p, p1p2))
178 goto err;
179 }
180
181 BN_GENCB_call(cb, 3, 0);
182
183 ret = 1;
184
185 err:
186
187 BN_CTX_end(ctx);
188
189 return ret;
190 }
191
/* Generate a pair of parameters Xp, Xq for X9.31 prime generation.
 * Note: the nbits parameter is the sum of the number of bits in both.
 */
195
196int BN_X931_generate_Xpq(BIGNUM *Xp, BIGNUM *Xq, int nbits, BN_CTX *ctx)
197 {
198 BIGNUM *t;
199 int i;
200 /* Number of bits for each prime is of the form
201 * 512+128s for s = 0, 1, ...
202 */
203 if ((nbits < 1024) || (nbits & 0xff))
204 return 0;
205 nbits >>= 1;
206 /* The random value Xp must be between sqrt(2) * 2^(nbits-1) and
207 * 2^nbits - 1. By setting the top two bits we ensure that the lower
208 * bound is exceeded.
209 */
210 if (!BN_rand(Xp, nbits, 1, 0))
211 return 0;
212
213 BN_CTX_start(ctx);
214 t = BN_CTX_get(ctx);
215
216 for (i = 0; i < 1000; i++)
217 {
218 if (!BN_rand(Xq, nbits, 1, 0))
219 return 0;
220 /* Check that |Xp - Xq| > 2^(nbits - 100) */
221 BN_sub(t, Xp, Xq);
222 if (BN_num_bits(t) > (nbits - 100))
223 break;
224 }
225
226 BN_CTX_end(ctx);
227
228 if (i < 1000)
229 return 1;
230
231 return 0;
232
233 }
234
/* Generate primes using the X9.31 algorithm. Of the values p, p1, p2, Xp1
 * and Xp2 only 'p' needs to be non-NULL. If any of the others are not NULL
 * the relevant parameter will be stored in it.
 *
 * Because |Xp - Xq| > 2^(nbits - 100) must be satisfied, Xp and Xq are
 * generated using the previous function and supplied as input.
 */
242
243int BN_X931_generate_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
244 BIGNUM *Xp1, BIGNUM *Xp2,
245 const BIGNUM *Xp,
246 const BIGNUM *e, BN_CTX *ctx,
247 BN_GENCB *cb)
248 {
249 int ret = 0;
250
251 BN_CTX_start(ctx);
252 if (!Xp1)
253 Xp1 = BN_CTX_get(ctx);
254 if (!Xp2)
255 Xp2 = BN_CTX_get(ctx);
256
257 if (!BN_rand(Xp1, 101, 0, 0))
258 goto error;
259 if (!BN_rand(Xp2, 101, 0, 0))
260 goto error;
261 if (!BN_X931_derive_prime_ex(p, p1, p2, Xp, Xp1, Xp2, e, ctx, cb))
262 goto error;
263
264 ret = 1;
265
266 error:
267 BN_CTX_end(ctx);
268
269 return ret;
270
271 }
272