-rw-r--r--   src/lib/libcrypto/bn/asm/pa-risc2.s        1618
-rw-r--r--   src/lib/libcrypto/bn/asm/pa-risc2W.s       1605
-rw-r--r--   src/lib/libcrypto/bn/asm/sparcv8plus.S     1558
-rw-r--r--   src/lib/libcrypto/bn/asm/sparcv9-mont.pl    604
-rwxr-xr-x   src/lib/libcrypto/bn/asm/sparcv9a-mont.pl   880
-rw-r--r--   src/lib/libcrypto/bn/asm/via-mont.pl        242
-rw-r--r--   src/lib/libcrypto/cast/asm/cast-586.pl      177
-rw-r--r--   src/lib/libcrypto/des/asm/crypt586.pl       209
-rw-r--r--   src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl  608
-rw-r--r--   src/lib/libcrypto/sha/asm/sha1-thumb.pl     259
10 files changed, 0 insertions(+), 7760 deletions(-)
diff --git a/src/lib/libcrypto/bn/asm/pa-risc2.s b/src/lib/libcrypto/bn/asm/pa-risc2.s
deleted file mode 100644
index f3b16290eb..0000000000
--- a/src/lib/libcrypto/bn/asm/pa-risc2.s
+++ /dev/null
@@ -1,1618 +0,0 @@
1;
2; PA-RISC 2.0 implementation of bn_asm code, based on the
3; 64-bit version of the code. This code is effectively the
4; same as the 64-bit version except that the register model is
5; slightly different, given that all values must be 32-bit between
6; function calls. Thus the 64-bit return values are returned
7; in %ret0 and %ret1 rather than just %ret0 as in the 64-bit version.
8;
9;
10; This code is approximately 2x faster than the C version
11; for RSA/DSA.
12;
13; See http://devresource.hp.com/ for more details on the PA-RISC
14; architecture. Also see the book "PA-RISC 2.0 Architecture"
15; by Gerry Kane for information on the instruction set architecture.
16;
17; Code written by Chris Ruemmler (with some help from the HP C
18; compiler).
19;
20; The code compiles with HP's assembler
21;
22
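;
; For illustration, a minimal C-style sketch of the return convention
; described above (not part of the original source; hi/lo split only):
;
;   /* a 64-bit result is split across the two return registers */
;   ret0 = (uint32_t)(carry >> 32);   /* high half, %r28 */
;   ret1 = (uint32_t)carry;           /* low half,  %r29 */
;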
23 .level 2.0N
24 .space $TEXT$
25 .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
26
27;
28; Global Register definitions used for the routines.
29;
30; Some information about HP's runtime architecture for 32-bits.
31;
32; "Caller save" means the calling function must save the register
33; if it wants the register to be preserved.
34; "Callee save" means if a function uses the register, it must save
35; the value before using it.
36;
37; For the floating point registers
38;
39; "caller save" registers: fr4-fr11, fr22-fr31
40; "callee save" registers: fr12-fr21
41; "special" registers: fr0-fr3 (status and exception registers)
42;
43; For the integer registers
44; value zero : r0
45; "caller save" registers: r1,r19-r26
46; "callee save" registers: r3-r18
47; return register : r2 (rp)
48; return values : r28,r29 (ret0,ret1)
49; Stack pointer : r30 (sp)
50; millicode return ptr : r31 (also a caller save register)
51
52
53;
54; Arguments to the routines
55;
56r_ptr .reg %r26
57a_ptr .reg %r25
58b_ptr .reg %r24
59num .reg %r24
60n .reg %r23
61
62;
63; Note that the "w" argument for bn_mul_add_words and bn_mul_words
64; is passed on the stack at a delta of -56 from the top of stack
65; as the routine is entered.
66;
67
68;
69; Globals used in some routines
70;
71
72top_overflow .reg %r23
73high_mask .reg %r22 ; value 0xffffffff80000000L
74
75
76;------------------------------------------------------------------------------
77;
78; bn_mul_add_words
79;
80;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
81; int num, BN_ULONG w)
82;
83; arg0 = r_ptr
84; arg1 = a_ptr
85; arg2 = num
86; -56(sp) = w
87;
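;
; For reference, a C sketch of the semantics this routine implements
; (a minimal model, not from the original source; assumes a compiler
; with a 128-bit integer type):
;
;   BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
;   {
;       BN_ULONG c = 0;
;       while (num-- > 0) {
;           unsigned __int128 t = (unsigned __int128)*ap++ * w + *rp + c;
;           *rp++ = (BN_ULONG)t;       /* low 64 bits back into r[] */
;           c = (BN_ULONG)(t >> 64);   /* high 64 bits carry forward */
;       }
;       return c;
;   }
;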
88; Local register definitions
89;
90
91fm1 .reg %fr22
92fm .reg %fr23
93ht_temp .reg %fr24
94ht_temp_1 .reg %fr25
95lt_temp .reg %fr26
96lt_temp_1 .reg %fr27
97fm1_1 .reg %fr28
98fm_1 .reg %fr29
99
100fw_h .reg %fr7L
101fw_l .reg %fr7R
102fw .reg %fr7
103
104fht_0 .reg %fr8L
105flt_0 .reg %fr8R
106t_float_0 .reg %fr8
107
108fht_1 .reg %fr9L
109flt_1 .reg %fr9R
110t_float_1 .reg %fr9
111
112tmp_0 .reg %r31
113tmp_1 .reg %r21
114m_0 .reg %r20
115m_1 .reg %r19
116ht_0 .reg %r1
117ht_1 .reg %r3
118lt_0 .reg %r4
119lt_1 .reg %r5
120m1_0 .reg %r6
121m1_1 .reg %r7
122rp_val .reg %r8
123rp_val_1 .reg %r9
124
125bn_mul_add_words
126 .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
127 .proc
128 .callinfo frame=128
129 .entry
130 .align 64
131
132 STD %r3,0(%sp) ; save r3
133 STD %r4,8(%sp) ; save r4
134 NOP ; Needed to make the loop 16-byte aligned
135 NOP ; needed to make the loop 16-byte aligned
136
137 STD %r5,16(%sp) ; save r5
138 NOP
139 STD %r6,24(%sp) ; save r6
140 STD %r7,32(%sp) ; save r7
141
142 STD %r8,40(%sp) ; save r8
143 STD %r9,48(%sp) ; save r9
144 COPY %r0,%ret1 ; return 0 by default
145 DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
146
147 CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit
148 LDO 128(%sp),%sp ; bump stack
149
150 ;
151 ; The loop is unrolled twice, so if there is only 1 number
152 ; then go straight to the cleanup code.
153 ;
154 CMPIB,= 1,num,bn_mul_add_words_single_top
155 FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l)
156
157 ;
158 ; This loop is unrolled 2 times (64-byte aligned as well)
159 ;
160 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
161; two 32-bit multiplies can be issued per cycle.
162 ;
163bn_mul_add_words_unroll2
164
165 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
166 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
167 LDD 0(r_ptr),rp_val ; rp[0]
168 LDD 8(r_ptr),rp_val_1 ; rp[1]
169
170 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
171 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
172 FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
173 FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
174
175 XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
176 XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
177 FSTD fm,-8(%sp) ; -8(sp) = m[0]
178 FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
179
180 XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
181 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
182 FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp
183 FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1
184
185 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
186 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp_1 = lt*fw_l
187 FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp
188 FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1
189
190 LDD -8(%sp),m_0 ; m[0]
191 LDD -40(%sp),m_1 ; m[1]
192 LDD -16(%sp),m1_0 ; m1[0]
193 LDD -48(%sp),m1_1 ; m1[1]
194
195 LDD -24(%sp),ht_0 ; ht[0]
196 LDD -56(%sp),ht_1 ; ht[1]
197 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
198 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
199
200 LDD -32(%sp),lt_0
201 LDD -64(%sp),lt_1
202 CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0])
203 ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
204
205 CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1])
206 ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
207 EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32
208 DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32
209
210 EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32
211 DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32
212 ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
213 ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
214
215 ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
216 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
217 ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
218 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
219
220 ADD %ret1,lt_0,lt_0 ; lt[0] = lt[0] + c;
221 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
222 ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0]
223 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
224
225 LDO -2(num),num ; num = num - 2;
226 ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c);
227 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
228 STD lt_0,0(r_ptr) ; rp[0] = lt[0]
229
230 ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1]
231 ADD,DC ht_1,%r0,%ret1 ; ht[1]++
232 LDO 16(a_ptr),a_ptr ; a_ptr += 2
233
234 STD lt_1,8(r_ptr) ; rp[1] = lt[1]
235 CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
236 LDO 16(r_ptr),r_ptr ; r_ptr += 2
237
238 CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
239
240 ;
241 ; Top of loop aligned on 64-byte boundary
242 ;
243bn_mul_add_words_single_top
244 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
245 LDD 0(r_ptr),rp_val ; rp[0]
246 LDO 8(a_ptr),a_ptr ; a_ptr++
247 XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
248 FSTD fm1,-16(%sp) ; -16(sp) = m1
249 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
250 FSTD fm,-8(%sp) ; -8(sp) = m
251 XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
252 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
253 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
254 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
255
256 LDD -8(%sp),m_0
257 LDD -16(%sp),m1_0 ; m1 = temp1
258 ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
259 LDD -24(%sp),ht_0
260 LDD -32(%sp),lt_0
261
262 CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
263 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
264
265 EXTRD,U tmp_0,31,32,m_0 ; m>>32
266 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
267
268 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
269 ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1;
270 ADD,DC ht_0,%r0,ht_0 ; ht++
271 ADD %ret1,tmp_0,lt_0 ; lt = lt + c;
272 ADD,DC ht_0,%r0,ht_0 ; ht++
273 ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0]
274 ADD,DC ht_0,%r0,%ret1 ; ht++
275 STD lt_0,0(r_ptr) ; rp[0] = lt
276
277bn_mul_add_words_exit
278 .EXIT
279
280 EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
281 LDD -80(%sp),%r9 ; restore r9
282 LDD -88(%sp),%r8 ; restore r8
283 LDD -96(%sp),%r7 ; restore r7
284 LDD -104(%sp),%r6 ; restore r6
285 LDD -112(%sp),%r5 ; restore r5
286 LDD -120(%sp),%r4 ; restore r4
287 BVE (%rp)
288 LDD,MB -128(%sp),%r3 ; restore r3
289 .PROCEND ;in=23,24,25,26,29;out=28;
290
291;----------------------------------------------------------------------------
292;
293;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
294;
295; arg0 = rp
296; arg1 = ap
297; arg2 = num
298; w on stack at -56(sp)
299
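;
; C sketch of the semantics (a minimal model, not from the original
; source; assumes a 128-bit integer type):
;
;   BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
;   {
;       BN_ULONG c = 0;
;       while (num-- > 0) {
;           unsigned __int128 t = (unsigned __int128)*ap++ * w + c;
;           *rp++ = (BN_ULONG)t;       /* low 64 bits */
;           c = (BN_ULONG)(t >> 64);   /* carry into the next word */
;       }
;       return c;
;   }
;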
300bn_mul_words
301 .proc
302 .callinfo frame=128
303 .entry
304 .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
305 .align 64
306
307 STD %r3,0(%sp) ; save r3
308 STD %r4,8(%sp) ; save r4
309 NOP
310 STD %r5,16(%sp) ; save r5
311
312 STD %r6,24(%sp) ; save r6
313 STD %r7,32(%sp) ; save r7
314 COPY %r0,%ret1 ; return 0 by default
315 DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
316
317 CMPIB,>= 0,num,bn_mul_words_exit
318 LDO 128(%sp),%sp ; bump stack
319
320 ;
321 ; See if only 1 word to do, thus just do cleanup
322 ;
323 CMPIB,= 1,num,bn_mul_words_single_top
324 FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l)
325
326 ;
327 ; This loop is unrolled 2 times (64-byte aligned as well)
328 ;
329 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
330; two 32-bit multiplies can be issued per cycle.
331 ;
332bn_mul_words_unroll2
333
334 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
335 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
336 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
337 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l
338
339 FSTD fm1,-16(%sp) ; -16(sp) = m1
340 FSTD fm1_1,-48(%sp) ; -48(sp) = m1
341 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
342 XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h
343
344 FSTD fm,-8(%sp) ; -8(sp) = m
345 FSTD fm_1,-40(%sp) ; -40(sp) = m
346 XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
347 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h
348
349 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
350 FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht
351 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
352 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
353
354 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
355 FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt
356 LDD -8(%sp),m_0
357 LDD -40(%sp),m_1
358
359 LDD -16(%sp),m1_0
360 LDD -48(%sp),m1_1
361 LDD -24(%sp),ht_0
362 LDD -56(%sp),ht_1
363
364 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1;
365 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1;
366 LDD -32(%sp),lt_0
367 LDD -64(%sp),lt_1
368
369 CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1)
370 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
371 CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1)
372 ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32)
373
374 EXTRD,U tmp_0,31,32,m_0 ; m>>32
375 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
376 EXTRD,U tmp_1,31,32,m_1 ; m>>32
377 DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32
378
379 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
380 ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32)
381 ADD lt_0,m1_0,lt_0 ; lt = lt+m1;
382 ADD,DC ht_0,%r0,ht_0 ; ht++
383
384 ADD lt_1,m1_1,lt_1 ; lt = lt+m1;
385 ADD,DC ht_1,%r0,ht_1 ; ht++
386 ADD %ret1,lt_0,lt_0 ; lt = lt + c (ret1);
387 ADD,DC ht_0,%r0,ht_0 ; ht++
388
389 ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0)
390 ADD,DC ht_1,%r0,ht_1 ; ht++
391 STD lt_0,0(r_ptr) ; rp[0] = lt
392 STD lt_1,8(r_ptr) ; rp[1] = lt
393
394 COPY ht_1,%ret1 ; carry = ht
395 LDO -2(num),num ; num = num - 2;
396 LDO 16(a_ptr),a_ptr ; ap += 2
397 CMPIB,<= 2,num,bn_mul_words_unroll2
398 LDO 16(r_ptr),r_ptr ; rp += 2
399
400 CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
401
402 ;
403 ; Top of loop aligned on 64-byte boundary
404 ;
405bn_mul_words_single_top
406 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
407
408 XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
409 FSTD fm1,-16(%sp) ; -16(sp) = m1
410 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
411 FSTD fm,-8(%sp) ; -8(sp) = m
412 XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
413 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
414 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
415 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
416
417 LDD -8(%sp),m_0
418 LDD -16(%sp),m1_0
419 ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
420 LDD -24(%sp),ht_0
421 LDD -32(%sp),lt_0
422
423 CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
424 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
425
426 EXTRD,U tmp_0,31,32,m_0 ; m>>32
427 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
428
429 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
430 ADD lt_0,m1_0,lt_0 ; lt= lt+m1;
431 ADD,DC ht_0,%r0,ht_0 ; ht++
432
433 ADD %ret1,lt_0,lt_0 ; lt = lt + c;
434 ADD,DC ht_0,%r0,ht_0 ; ht++
435
436 COPY ht_0,%ret1 ; copy carry
437 STD lt_0,0(r_ptr) ; rp[0] = lt
438
439bn_mul_words_exit
440 .EXIT
441 EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
442 LDD -96(%sp),%r7 ; restore r7
443 LDD -104(%sp),%r6 ; restore r6
444 LDD -112(%sp),%r5 ; restore r5
445 LDD -120(%sp),%r4 ; restore r4
446 BVE (%rp)
447 LDD,MB -128(%sp),%r3 ; restore r3
448 .PROCEND
449
450;----------------------------------------------------------------------------
451;
452;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
453;
454; arg0 = rp
455; arg1 = ap
456; arg2 = num
457;
458
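;
; C sketch of the semantics (a minimal model, not from the original
; source): each input word yields two output words, the low and high
; halves of its square.
;
;   void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
;   {
;       while (num-- > 0) {
;           unsigned __int128 t = (unsigned __int128)*ap * *ap;
;           ap++;
;           *rp++ = (BN_ULONG)t;          /* low 64 bits of a[i]^2 */
;           *rp++ = (BN_ULONG)(t >> 64);  /* high 64 bits of a[i]^2 */
;       }
;   }
;
; The loop below exploits (h*2^32 + l)^2 = h^2*2^64 + 2*h*l*2^32 + l^2,
; so the cross term m = h*l enters the result shifted left by 33 bits;
; the DEPD,Z m,30,31 / EXTRD,U tmp,32,33 pair splits that shifted value
; into its low and high 64-bit parts.
;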
459bn_sqr_words
460 .proc
461 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
462 .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
463 .entry
464 .align 64
465
466 STD %r3,0(%sp) ; save r3
467 STD %r4,8(%sp) ; save r4
468 NOP
469 STD %r5,16(%sp) ; save r5
470
471 CMPIB,>= 0,num,bn_sqr_words_exit
472 LDO 128(%sp),%sp ; bump stack
473
474 ;
475; If only 1, then go straight to cleanup
476 ;
477 CMPIB,= 1,num,bn_sqr_words_single_top
478 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
479
480 ;
481 ; This loop is unrolled 2 times (64-byte aligned as well)
482 ;
483
484bn_sqr_words_unroll2
485 FLDD 0(a_ptr),t_float_0 ; a[0]
486 FLDD 8(a_ptr),t_float_1 ; a[1]
487 XMPYU fht_0,flt_0,fm ; m[0]
488 XMPYU fht_1,flt_1,fm_1 ; m[1]
489
490 FSTD fm,-24(%sp) ; store m[0]
491 FSTD fm_1,-56(%sp) ; store m[1]
492 XMPYU flt_0,flt_0,lt_temp ; lt[0]
493 XMPYU flt_1,flt_1,lt_temp_1 ; lt[1]
494
495 FSTD lt_temp,-16(%sp) ; store lt[0]
496 FSTD lt_temp_1,-48(%sp) ; store lt[1]
497 XMPYU fht_0,fht_0,ht_temp ; ht[0]
498 XMPYU fht_1,fht_1,ht_temp_1 ; ht[1]
499
500 FSTD ht_temp,-8(%sp) ; store ht[0]
501 FSTD ht_temp_1,-40(%sp) ; store ht[1]
502 LDD -24(%sp),m_0
503 LDD -56(%sp),m_1
504
505 AND m_0,high_mask,tmp_0 ; m[0] & Mask
506 AND m_1,high_mask,tmp_1 ; m[1] & Mask
507 DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1
508 DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1
509
510 LDD -16(%sp),lt_0
511 LDD -48(%sp),lt_1
512 EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1
513 EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1
514
515 LDD -8(%sp),ht_0
516 LDD -40(%sp),ht_1
517 ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0
518 ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1
519
520 ADD lt_0,m_0,lt_0 ; lt = lt+m
521 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
522 STD lt_0,0(r_ptr) ; rp[0] = lt[0]
523 STD ht_0,8(r_ptr) ; rp[1] = ht[0]
524
525 ADD lt_1,m_1,lt_1 ; lt = lt+m
526 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
527 STD lt_1,16(r_ptr) ; rp[2] = lt[1]
528 STD ht_1,24(r_ptr) ; rp[3] = ht[1]
529
530 LDO -2(num),num ; num = num - 2;
531 LDO 16(a_ptr),a_ptr ; ap += 2
532 CMPIB,<= 2,num,bn_sqr_words_unroll2
533 LDO 32(r_ptr),r_ptr ; rp += 4
534
535 CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
536
537 ;
538 ; Top of loop aligned on 64-byte boundary
539 ;
540bn_sqr_words_single_top
541 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
542
543 XMPYU fht_0,flt_0,fm ; m
544 FSTD fm,-24(%sp) ; store m
545
546 XMPYU flt_0,flt_0,lt_temp ; lt
547 FSTD lt_temp,-16(%sp) ; store lt
548
549 XMPYU fht_0,fht_0,ht_temp ; ht
550 FSTD ht_temp,-8(%sp) ; store ht
551
552 LDD -24(%sp),m_0 ; load m
553 AND m_0,high_mask,tmp_0 ; m & Mask
554 DEPD,Z m_0,30,31,m_0 ; m << 32+1
555 LDD -16(%sp),lt_0 ; lt
556
557 LDD -8(%sp),ht_0 ; ht
558 EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1
559 ADD m_0,lt_0,lt_0 ; lt = lt+m
560 ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0
561 ADD,DC ht_0,%r0,ht_0 ; ht++
562
563 STD lt_0,0(r_ptr) ; rp[0] = lt
564 STD ht_0,8(r_ptr) ; rp[1] = ht
565
566bn_sqr_words_exit
567 .EXIT
568 LDD -112(%sp),%r5 ; restore r5
569 LDD -120(%sp),%r4 ; restore r4
570 BVE (%rp)
571 LDD,MB -128(%sp),%r3
572 .PROCEND ;in=23,24,25,26,29;out=28;
573
574
575;----------------------------------------------------------------------------
576;
577;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
578;
579; arg0 = rp
580; arg1 = ap
581; arg2 = bp
582; arg3 = n
583
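;
; C sketch of the carry chain used below (a minimal model, not from
; the original source):
;
;   BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
;   {
;       BN_ULONG c = 0;                  /* carry, 0 or 1 */
;       while (n-- > 0) {
;           BN_ULONG t = *a++ + c;
;           c = (t < c);                 /* carry out of t + c */
;           BN_ULONG l = t + *b++;
;           c += (l < t);                /* carry out of t + b */
;           *r++ = l;
;       }
;       return c;
;   }
;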
584t .reg %r22
585b .reg %r21
586l .reg %r20
587
588bn_add_words
589 .proc
590 .entry
591 .callinfo
592 .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
593 .align 64
594
595 CMPIB,>= 0,n,bn_add_words_exit
596 COPY %r0,%ret1 ; return 0 by default
597
598 ;
599 ; If 2 or more numbers do the loop
600 ;
601 CMPIB,= 1,n,bn_add_words_single_top
602 NOP
603
604 ;
605 ; This loop is unrolled 2 times (64-byte aligned as well)
606 ;
607bn_add_words_unroll2
608 LDD 0(a_ptr),t
609 LDD 0(b_ptr),b
610 ADD t,%ret1,t ; t = t+c;
611 ADD,DC %r0,%r0,%ret1 ; set c to carry
612 ADD t,b,l ; l = t + b[0]
613 ADD,DC %ret1,%r0,%ret1 ; c+= carry
614 STD l,0(r_ptr)
615
616 LDD 8(a_ptr),t
617 LDD 8(b_ptr),b
618 ADD t,%ret1,t ; t = t+c;
619 ADD,DC %r0,%r0,%ret1 ; set c to carry
620 ADD t,b,l ; l = t + b[0]
621 ADD,DC %ret1,%r0,%ret1 ; c+= carry
622 STD l,8(r_ptr)
623
624 LDO -2(n),n
625 LDO 16(a_ptr),a_ptr
626 LDO 16(b_ptr),b_ptr
627
628 CMPIB,<= 2,n,bn_add_words_unroll2
629 LDO 16(r_ptr),r_ptr
630
631 CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
632
633bn_add_words_single_top
634 LDD 0(a_ptr),t
635 LDD 0(b_ptr),b
636
637 ADD t,%ret1,t ; t = t+c;
638 ADD,DC %r0,%r0,%ret1 ; set c to carry (could use CMPCLR??)
639 ADD t,b,l ; l = t + b[0]
640 ADD,DC %ret1,%r0,%ret1 ; c+= carry
641 STD l,0(r_ptr)
642
643bn_add_words_exit
644 .EXIT
645 BVE (%rp)
646 EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
647 .PROCEND ;in=23,24,25,26,29;out=28;
648
649;----------------------------------------------------------------------------
650;
651;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
652;
653; arg0 = rp
654; arg1 = ap
655; arg2 = bp
656; arg3 = n
657
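;
; C sketch of the borrow logic used below (a minimal model, not from
; the original source): when the words are equal the incoming borrow
; propagates unchanged, otherwise the new borrow is (t1 < t2).
;
;   BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
;   {
;       BN_ULONG c = 0;                  /* borrow, 0 or 1 */
;       while (n-- > 0) {
;           BN_ULONG t1 = *a++, t2 = *b++;
;           *r++ = t1 - t2 - c;
;           if (t1 != t2)
;               c = (t1 < t2);
;       }
;       return c;
;   }
;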
658t1 .reg %r22
659t2 .reg %r21
660sub_tmp1 .reg %r20
661sub_tmp2 .reg %r19
662
663
664bn_sub_words
665 .proc
666 .callinfo
667 .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
668 .entry
669 .align 64
670
671 CMPIB,>= 0,n,bn_sub_words_exit
672 COPY %r0,%ret1 ; return 0 by default
673
674 ;
675 ; If 2 or more numbers do the loop
676 ;
677 CMPIB,= 1,n,bn_sub_words_single_top
678 NOP
679
680 ;
681 ; This loop is unrolled 2 times (64-byte aligned as well)
682 ;
683bn_sub_words_unroll2
684 LDD 0(a_ptr),t1
685 LDD 0(b_ptr),t2
686 SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
687 SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
688
689 CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
690 LDO 1(%r0),sub_tmp2
691
692 CMPCLR,*= t1,t2,%r0
693 COPY sub_tmp2,%ret1
694 STD sub_tmp1,0(r_ptr)
695
696 LDD 8(a_ptr),t1
697 LDD 8(b_ptr),t2
698 SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
699 SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
700 CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
701 LDO 1(%r0),sub_tmp2
702
703 CMPCLR,*= t1,t2,%r0
704 COPY sub_tmp2,%ret1
705 STD sub_tmp1,8(r_ptr)
706
707 LDO -2(n),n
708 LDO 16(a_ptr),a_ptr
709 LDO 16(b_ptr),b_ptr
710
711 CMPIB,<= 2,n,bn_sub_words_unroll2
712 LDO 16(r_ptr),r_ptr
713
714 CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
715
716bn_sub_words_single_top
717 LDD 0(a_ptr),t1
718 LDD 0(b_ptr),t2
719 SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
720 SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
721 CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
722 LDO 1(%r0),sub_tmp2
723
724 CMPCLR,*= t1,t2,%r0
725 COPY sub_tmp2,%ret1
726
727 STD sub_tmp1,0(r_ptr)
728
729bn_sub_words_exit
730 .EXIT
731 BVE (%rp)
732 EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
733 .PROCEND ;in=23,24,25,26,29;out=28;
734
735;------------------------------------------------------------------------------
736;
737; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
738;
739; arg0 = h
740; arg1 = l
741; arg2 = d
742;
743; This is mainly just output from the HP C compiler.
744;
745;------------------------------------------------------------------------------
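;
; In C terms this computes the one-word quotient of a double-word
; dividend (a sketch, ignoring the overflow check; assumes a 128-bit
; integer type and h < d):
;
;   BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
;   {
;       return (BN_ULONG)((((unsigned __int128)h << 64) | l) / d);
;   }
;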
746bn_div_words
747 .PROC
748 .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN
749 .IMPORT BN_num_bits_word,CODE
750 ;--- not PIC .IMPORT __iob,DATA
751 ;--- not PIC .IMPORT fprintf,CODE
752 .IMPORT abort,CODE
753 .IMPORT $$div2U,MILLICODE
754 .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
755 .ENTRY
756 STW %r2,-20(%r30) ;offset 0x8ec
757 STW,MA %r3,192(%r30) ;offset 0x8f0
758 STW %r4,-188(%r30) ;offset 0x8f4
759 DEPD %r5,31,32,%r6 ;offset 0x8f8
760 STD %r6,-184(%r30) ;offset 0x8fc
761 DEPD %r7,31,32,%r8 ;offset 0x900
762 STD %r8,-176(%r30) ;offset 0x904
763 STW %r9,-168(%r30) ;offset 0x908
764 LDD -248(%r30),%r3 ;offset 0x90c
765 COPY %r26,%r4 ;offset 0x910
766 COPY %r24,%r5 ;offset 0x914
767 DEPD %r25,31,32,%r4 ;offset 0x918
768 CMPB,*<> %r3,%r0,$0006000C ;offset 0x91c
769 DEPD %r23,31,32,%r5 ;offset 0x920
770 MOVIB,TR -1,%r29,$00060002 ;offset 0x924
771 EXTRD,U %r29,31,32,%r28 ;offset 0x928
772$0006002A
773 LDO -1(%r29),%r29 ;offset 0x92c
774 SUB %r23,%r7,%r23 ;offset 0x930
775$00060024
776 SUB %r4,%r31,%r25 ;offset 0x934
777 AND %r25,%r19,%r26 ;offset 0x938
778 CMPB,*<>,N %r0,%r26,$00060046 ;offset 0x93c
779 DEPD,Z %r25,31,32,%r20 ;offset 0x940
780 OR %r20,%r24,%r21 ;offset 0x944
781 CMPB,*<<,N %r21,%r23,$0006002A ;offset 0x948
782 SUB %r31,%r2,%r31 ;offset 0x94c
783$00060046
784$0006002E
785 DEPD,Z %r23,31,32,%r25 ;offset 0x950
786 EXTRD,U %r23,31,32,%r26 ;offset 0x954
787 AND %r25,%r19,%r24 ;offset 0x958
788 ADD,L %r31,%r26,%r31 ;offset 0x95c
789 CMPCLR,*>>= %r5,%r24,%r0 ;offset 0x960
790 LDO 1(%r31),%r31 ;offset 0x964
791$00060032
792 CMPB,*<<=,N %r31,%r4,$00060036 ;offset 0x968
793 LDO -1(%r29),%r29 ;offset 0x96c
794 ADD,L %r4,%r3,%r4 ;offset 0x970
795$00060036
796 ADDIB,=,N -1,%r8,$D0 ;offset 0x974
797 SUB %r5,%r24,%r28 ;offset 0x978
798$0006003A
799 SUB %r4,%r31,%r24 ;offset 0x97c
800 SHRPD %r24,%r28,32,%r4 ;offset 0x980
801 DEPD,Z %r29,31,32,%r9 ;offset 0x984
802 DEPD,Z %r28,31,32,%r5 ;offset 0x988
803$0006001C
804 EXTRD,U %r4,31,32,%r31 ;offset 0x98c
805 CMPB,*<>,N %r31,%r2,$00060020 ;offset 0x990
806 MOVB,TR %r6,%r29,$D1 ;offset 0x994
807 STD %r29,-152(%r30) ;offset 0x998
808$0006000C
809 EXTRD,U %r3,31,32,%r25 ;offset 0x99c
810 COPY %r3,%r26 ;offset 0x9a0
811 EXTRD,U %r3,31,32,%r9 ;offset 0x9a4
812 EXTRD,U %r4,31,32,%r8 ;offset 0x9a8
813 .CALL ARGW0=GR,ARGW1=GR,RTNVAL=GR ;in=25,26;out=28;
814 B,L BN_num_bits_word,%r2 ;offset 0x9ac
815 EXTRD,U %r5,31,32,%r7 ;offset 0x9b0
816 LDI 64,%r20 ;offset 0x9b4
817 DEPD %r7,31,32,%r5 ;offset 0x9b8
818 DEPD %r8,31,32,%r4 ;offset 0x9bc
819 DEPD %r9,31,32,%r3 ;offset 0x9c0
820 CMPB,= %r28,%r20,$00060012 ;offset 0x9c4
821 COPY %r28,%r24 ;offset 0x9c8
822 MTSARCM %r24 ;offset 0x9cc
823 DEPDI,Z -1,%sar,1,%r19 ;offset 0x9d0
824 CMPB,*>>,N %r4,%r19,$D2 ;offset 0x9d4
825$00060012
826 SUBI 64,%r24,%r31 ;offset 0x9d8
827 CMPCLR,*<< %r4,%r3,%r0 ;offset 0x9dc
828 SUB %r4,%r3,%r4 ;offset 0x9e0
829$00060016
830 CMPB,= %r31,%r0,$0006001A ;offset 0x9e4
831 COPY %r0,%r9 ;offset 0x9e8
832 MTSARCM %r31 ;offset 0x9ec
833 DEPD,Z %r3,%sar,64,%r3 ;offset 0x9f0
834 SUBI 64,%r31,%r26 ;offset 0x9f4
835 MTSAR %r26 ;offset 0x9f8
836 SHRPD %r4,%r5,%sar,%r4 ;offset 0x9fc
837 MTSARCM %r31 ;offset 0xa00
838 DEPD,Z %r5,%sar,64,%r5 ;offset 0xa04
839$0006001A
840 DEPDI,Z -1,31,32,%r19 ;offset 0xa08
841 AND %r3,%r19,%r29 ;offset 0xa0c
842 EXTRD,U %r29,31,32,%r2 ;offset 0xa10
843 DEPDI,Z -1,63,32,%r6 ;offset 0xa14
844 MOVIB,TR 2,%r8,$0006001C ;offset 0xa18
845 EXTRD,U %r3,63,32,%r7 ;offset 0xa1c
846$D2
847 ;--- not PIC ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20
848 ;--- not PIC LDIL LR'C$7,%r21 ;offset 0xa24
849 ;--- not PIC LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28
850 ;--- not PIC .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28;
851 ;--- not PIC B,L fprintf,%r2 ;offset 0xa2c
852 ;--- not PIC LDO RR'C$7(%r21),%r25 ;offset 0xa30
853 .CALL ;
854 B,L abort,%r2 ;offset 0xa34
855 NOP ;offset 0xa38
856 B $D3 ;offset 0xa3c
857 LDW -212(%r30),%r2 ;offset 0xa40
858$00060020
859 COPY %r4,%r26 ;offset 0xa44
860 EXTRD,U %r4,31,32,%r25 ;offset 0xa48
861 COPY %r2,%r24 ;offset 0xa4c
862 .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
863 B,L $$div2U,%r31 ;offset 0xa50
864 EXTRD,U %r2,31,32,%r23 ;offset 0xa54
865 DEPD %r28,31,32,%r29 ;offset 0xa58
866$00060022
867 STD %r29,-152(%r30) ;offset 0xa5c
868$D1
869 AND %r5,%r19,%r24 ;offset 0xa60
870 EXTRD,U %r24,31,32,%r24 ;offset 0xa64
871 STW %r2,-160(%r30) ;offset 0xa68
872 STW %r7,-128(%r30) ;offset 0xa6c
873 FLDD -152(%r30),%fr4 ;offset 0xa70
874 FLDD -152(%r30),%fr7 ;offset 0xa74
875 FLDW -160(%r30),%fr8L ;offset 0xa78
876 FLDW -128(%r30),%fr5L ;offset 0xa7c
877 XMPYU %fr8L,%fr7L,%fr10 ;offset 0xa80
878 FSTD %fr10,-136(%r30) ;offset 0xa84
879 XMPYU %fr8L,%fr7R,%fr22 ;offset 0xa88
880 FSTD %fr22,-144(%r30) ;offset 0xa8c
881 XMPYU %fr5L,%fr4L,%fr11 ;offset 0xa90
882 XMPYU %fr5L,%fr4R,%fr23 ;offset 0xa94
883 FSTD %fr11,-112(%r30) ;offset 0xa98
884 FSTD %fr23,-120(%r30) ;offset 0xa9c
885 LDD -136(%r30),%r28 ;offset 0xaa0
886 DEPD,Z %r28,31,32,%r31 ;offset 0xaa4
887 LDD -144(%r30),%r20 ;offset 0xaa8
888 ADD,L %r20,%r31,%r31 ;offset 0xaac
889 LDD -112(%r30),%r22 ;offset 0xab0
890 DEPD,Z %r22,31,32,%r22 ;offset 0xab4
891 LDD -120(%r30),%r21 ;offset 0xab8
892 B $00060024 ;offset 0xabc
893 ADD,L %r21,%r22,%r23 ;offset 0xac0
894$D0
895 OR %r9,%r29,%r29 ;offset 0xac4
896$00060040
897 EXTRD,U %r29,31,32,%r28 ;offset 0xac8
898$00060002
899$L2
900 LDW -212(%r30),%r2 ;offset 0xacc
901$D3
902 LDW -168(%r30),%r9 ;offset 0xad0
903 LDD -176(%r30),%r8 ;offset 0xad4
904 EXTRD,U %r8,31,32,%r7 ;offset 0xad8
905 LDD -184(%r30),%r6 ;offset 0xadc
906 EXTRD,U %r6,31,32,%r5 ;offset 0xae0
907 LDW -188(%r30),%r4 ;offset 0xae4
908 BVE (%r2) ;offset 0xae8
909 .EXIT
910 LDW,MB -192(%r30),%r3 ;offset 0xaec
911 .PROCEND ;in=23,25;out=28,29;fpin=105,107;
912
913
914
915
916;----------------------------------------------------------------------------
917;
918; Registers to hold 64-bit values to manipulate. The "L" part
919; of the register corresponds to the upper 32-bits, while the "R"
920; part corresponds to the lower 32-bits
921;
922; Note, that when using b6 and b7, the code must save these before
923; using them because they are callee save registers
924;
925;
926; Floating point registers to use to save values that
927; are manipulated. These don't collide with ftemp1-6 and
928; are all caller save registers
929;
930a0 .reg %fr22
931a0L .reg %fr22L
932a0R .reg %fr22R
933
934a1 .reg %fr23
935a1L .reg %fr23L
936a1R .reg %fr23R
937
938a2 .reg %fr24
939a2L .reg %fr24L
940a2R .reg %fr24R
941
942a3 .reg %fr25
943a3L .reg %fr25L
944a3R .reg %fr25R
945
946a4 .reg %fr26
947a4L .reg %fr26L
948a4R .reg %fr26R
949
950a5 .reg %fr27
951a5L .reg %fr27L
952a5R .reg %fr27R
953
954a6 .reg %fr28
955a6L .reg %fr28L
956a6R .reg %fr28R
957
958a7 .reg %fr29
959a7L .reg %fr29L
960a7R .reg %fr29R
961
962b0 .reg %fr30
963b0L .reg %fr30L
964b0R .reg %fr30R
965
966b1 .reg %fr31
967b1L .reg %fr31L
968b1R .reg %fr31R
969
970;
971; Temporary floating point variables, these are all caller save
972; registers
973;
974ftemp1 .reg %fr4
975ftemp2 .reg %fr5
976ftemp3 .reg %fr6
977ftemp4 .reg %fr7
978
979;
980; The B set of registers when used.
981;
982
983b2 .reg %fr8
984b2L .reg %fr8L
985b2R .reg %fr8R
986
987b3 .reg %fr9
988b3L .reg %fr9L
989b3R .reg %fr9R
990
991b4 .reg %fr10
992b4L .reg %fr10L
993b4R .reg %fr10R
994
995b5 .reg %fr11
996b5L .reg %fr11L
997b5R .reg %fr11R
998
999b6 .reg %fr12
1000b6L .reg %fr12L
1001b6R .reg %fr12R
1002
1003b7 .reg %fr13
1004b7L .reg %fr13L
1005b7R .reg %fr13R
1006
1007c1 .reg %r21 ; only reg
1008temp1 .reg %r20 ; only reg
1009temp2 .reg %r19 ; only reg
1010temp3 .reg %r31 ; only reg
1011
1012m1 .reg %r28
1013c2 .reg %r23
1014high_one .reg %r1
1015ht .reg %r6
1016lt .reg %r5
1017m .reg %r4
1018c3 .reg %r3
1019
1020SQR_ADD_C .macro A0L,A0R,C1,C2,C3
1021 XMPYU A0L,A0R,ftemp1 ; m
1022 FSTD ftemp1,-24(%sp) ; store m
1023
1024 XMPYU A0R,A0R,ftemp2 ; lt
1025 FSTD ftemp2,-16(%sp) ; store lt
1026
1027 XMPYU A0L,A0L,ftemp3 ; ht
1028 FSTD ftemp3,-8(%sp) ; store ht
1029
1030 LDD -24(%sp),m ; load m
1031 AND m,high_mask,temp2 ; m & Mask
1032 DEPD,Z m,30,31,temp3 ; m << 32+1
1033 LDD -16(%sp),lt ; lt
1034
1035 LDD -8(%sp),ht ; ht
1036 EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1
1037 ADD temp3,lt,lt ; lt = lt+m
1038 ADD,L ht,temp1,ht ; ht += temp1
1039 ADD,DC ht,%r0,ht ; ht++
1040
1041 ADD C1,lt,C1 ; c1=c1+lt
1042 ADD,DC ht,%r0,ht ; ht++
1043
1044 ADD C2,ht,C2 ; c2=c2+ht
1045 ADD,DC C3,%r0,C3 ; c3++
1046.endm
1047
1048SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
1049 XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht
1050 FSTD ftemp1,-16(%sp) ;
1051 XMPYU A0R,A1L,ftemp2 ; m = bh*lt
1052 FSTD ftemp2,-8(%sp) ;
1053 XMPYU A0R,A1R,ftemp3 ; lt = bl*lt
1054 FSTD ftemp3,-32(%sp)
1055 XMPYU A0L,A1L,ftemp4 ; ht = bh*ht
1056 FSTD ftemp4,-24(%sp) ;
1057
1058 LDD -8(%sp),m ; r21 = m
1059 LDD -16(%sp),m1 ; r19 = m1
1060 ADD,L m,m1,m ; m+m1
1061
1062 DEPD,Z m,31,32,temp3 ; (m+m1<<32)
1063 LDD -24(%sp),ht ; r24 = ht
1064
1065 CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
1066 ADD,L ht,high_one,ht ; ht+=high_one
1067
1068 EXTRD,U m,31,32,temp1 ; m >> 32
1069 LDD -32(%sp),lt ; lt
1070 ADD,L ht,temp1,ht ; ht+= m>>32
1071 ADD lt,temp3,lt ; lt = lt+m1
1072 ADD,DC ht,%r0,ht ; ht++
1073
1074 ADD ht,ht,ht ; ht=ht+ht;
1075 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1076
1077 ADD lt,lt,lt ; lt=lt+lt;
1078 ADD,DC ht,%r0,ht ; add in carry (ht++)
1079
1080 ADD C1,lt,C1 ; c1=c1+lt
1081 ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++)
1082 LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise
1083
1084 ADD C2,ht,C2 ; c2 = c2 + ht
1085 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1086.endm
1087
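;
; In C terms the two macros accumulate into a three-word (192-bit)
; column counter, with C1 the low word (a sketch, not from the
; original source):
;
;   SQR_ADD_C(a, c1, c2, c3):     (c3:c2:c1) += (unsigned __int128)a * a;
;   SQR_ADD_C2(a, b, c1, c2, c3): (c3:c2:c1) += 2 * (unsigned __int128)a * b;
;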
1088;
1089;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
1090; arg0 = r_ptr
1091; arg1 = a_ptr
1092;
1093
1094bn_sqr_comba8
1095 .PROC
1096 .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1097 .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1098 .ENTRY
1099 .align 64
1100
1101 STD %r3,0(%sp) ; save r3
1102 STD %r4,8(%sp) ; save r4
1103 STD %r5,16(%sp) ; save r5
1104 STD %r6,24(%sp) ; save r6
1105
1106 ;
1107 ; Zero out carries
1108 ;
1109 COPY %r0,c1
1110 COPY %r0,c2
1111 COPY %r0,c3
1112
1113 LDO 128(%sp),%sp ; bump stack
1114 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
1115 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1116
1117 ;
1118 ; Load up all of the values we are going to use
1119 ;
1120 FLDD 0(a_ptr),a0
1121 FLDD 8(a_ptr),a1
1122 FLDD 16(a_ptr),a2
1123 FLDD 24(a_ptr),a3
1124 FLDD 32(a_ptr),a4
1125 FLDD 40(a_ptr),a5
1126 FLDD 48(a_ptr),a6
1127 FLDD 56(a_ptr),a7
1128
1129 SQR_ADD_C a0L,a0R,c1,c2,c3
1130 STD c1,0(r_ptr) ; r[0] = c1;
1131 COPY %r0,c1
1132
1133 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
1134 STD c2,8(r_ptr) ; r[1] = c2;
1135 COPY %r0,c2
1136
1137 SQR_ADD_C a1L,a1R,c3,c1,c2
1138 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
1139 STD c3,16(r_ptr) ; r[2] = c3;
1140 COPY %r0,c3
1141
1142 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
1143 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
1144 STD c1,24(r_ptr) ; r[3] = c1;
1145 COPY %r0,c1
1146
1147 SQR_ADD_C a2L,a2R,c2,c3,c1
1148 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
1149 SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
1150 STD c2,32(r_ptr) ; r[4] = c2;
1151 COPY %r0,c2
1152
1153 SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
1154 SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
1155 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
1156 STD c3,40(r_ptr) ; r[5] = c3;
1157 COPY %r0,c3
1158
1159 SQR_ADD_C a3L,a3R,c1,c2,c3
1160 SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
1161 SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
1162 SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
1163 STD c1,48(r_ptr) ; r[6] = c1;
1164 COPY %r0,c1
1165
1166 SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
1167 SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
1168 SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
1169 SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
1170 STD c2,56(r_ptr) ; r[7] = c2;
1171 COPY %r0,c2
1172
1173 SQR_ADD_C a4L,a4R,c3,c1,c2
1174 SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
1175 SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
1176 SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
1177 STD c3,64(r_ptr) ; r[8] = c3;
1178 COPY %r0,c3
1179
1180 SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
1181 SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
1182 SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
1183 STD c1,72(r_ptr) ; r[9] = c1;
1184 COPY %r0,c1
1185
1186 SQR_ADD_C a5L,a5R,c2,c3,c1
1187 SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
1188 SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
1189 STD c2,80(r_ptr) ; r[10] = c2;
1190 COPY %r0,c2
1191
1192 SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
1193 SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
1194 STD c3,88(r_ptr) ; r[11] = c3;
1195 COPY %r0,c3
1196
1197 SQR_ADD_C a6L,a6R,c1,c2,c3
1198 SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
1199 STD c1,96(r_ptr) ; r[12] = c1;
1200 COPY %r0,c1
1201
1202 SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
1203 STD c2,104(r_ptr) ; r[13] = c2;
1204 COPY %r0,c2
1205
1206 SQR_ADD_C a7L,a7R,c3,c1,c2
1207 STD c3, 112(r_ptr) ; r[14] = c3
1208 STD c1, 120(r_ptr) ; r[15] = c1
1209
1210 .EXIT
1211 LDD -104(%sp),%r6 ; restore r6
1212 LDD -112(%sp),%r5 ; restore r5
1213 LDD -120(%sp),%r4 ; restore r4
1214 BVE (%rp)
1215 LDD,MB -128(%sp),%r3
1216
1217 .PROCEND
1218
1219;-----------------------------------------------------------------------------
1220;
1221;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
1222; arg0 = r_ptr
1223; arg1 = a_ptr
1224;
1225
1226bn_sqr_comba4
1227 .proc
1228 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1229 .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1230 .entry
1231 .align 64
1232 STD %r3,0(%sp) ; save r3
1233 STD %r4,8(%sp) ; save r4
1234 STD %r5,16(%sp) ; save r5
1235 STD %r6,24(%sp) ; save r6
1236
1237 ;
1238 ; Zero out carries
1239 ;
1240 COPY %r0,c1
1241 COPY %r0,c2
1242 COPY %r0,c3
1243
1244 LDO 128(%sp),%sp ; bump stack
1245 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
1246 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1247
1248 ;
1249 ; Load up all of the values we are going to use
1250 ;
1251 FLDD 0(a_ptr),a0
1252 FLDD 8(a_ptr),a1
1253 FLDD 16(a_ptr),a2
1254 FLDD 24(a_ptr),a3
1255 FLDD 32(a_ptr),a4
1256 FLDD 40(a_ptr),a5
1257 FLDD 48(a_ptr),a6
1258 FLDD 56(a_ptr),a7
1259
1260 SQR_ADD_C a0L,a0R,c1,c2,c3
1261
1262 STD c1,0(r_ptr) ; r[0] = c1;
1263 COPY %r0,c1
1264
1265 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
1266
1267 STD c2,8(r_ptr) ; r[1] = c2;
1268 COPY %r0,c2
1269
1270 SQR_ADD_C a1L,a1R,c3,c1,c2
1271 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
1272
1273 STD c3,16(r_ptr) ; r[2] = c3;
1274 COPY %r0,c3
1275
1276 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
1277 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
1278
1279 STD c1,24(r_ptr) ; r[3] = c1;
1280 COPY %r0,c1
1281
1282 SQR_ADD_C a2L,a2R,c2,c3,c1
1283 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
1284
1285 STD c2,32(r_ptr) ; r[4] = c2;
1286 COPY %r0,c2
1287
1288 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
1289 STD c3,40(r_ptr) ; r[5] = c3;
1290 COPY %r0,c3
1291
1292 SQR_ADD_C a3L,a3R,c1,c2,c3
1293 STD c1,48(r_ptr) ; r[6] = c1;
1294 STD c2,56(r_ptr) ; r[7] = c2;
1295
1296 .EXIT
1297 LDD -104(%sp),%r6 ; restore r6
1298 LDD -112(%sp),%r5 ; restore r5
1299 LDD -120(%sp),%r4 ; restore r4
1300 BVE (%rp)
1301 LDD,MB -128(%sp),%r3
1302
1303 .PROCEND
1304
1305
1306;---------------------------------------------------------------------------
1307
1308MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
1309 XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht
1310 FSTD ftemp1,-16(%sp) ;
1311 XMPYU A0R,B0L,ftemp2 ; m = bh*lt
1312 FSTD ftemp2,-8(%sp) ;
1313 XMPYU A0R,B0R,ftemp3 ; lt = bl*lt
1314 FSTD ftemp3,-32(%sp)
1315 XMPYU A0L,B0L,ftemp4 ; ht = bh*ht
1316 FSTD ftemp4,-24(%sp) ;
1317
1318 LDD -8(%sp),m ; r21 = m
1319 LDD -16(%sp),m1 ; r19 = m1
1320 ADD,L m,m1,m ; m+m1
1321
1322 DEPD,Z m,31,32,temp3 ; (m+m1<<32)
1323 LDD -24(%sp),ht ; r24 = ht
1324
1325 CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
1326 ADD,L ht,high_one,ht ; ht+=high_one
1327
1328 EXTRD,U m,31,32,temp1 ; m >> 32
1329 LDD -32(%sp),lt ; lt
1330 ADD,L ht,temp1,ht ; ht+= m>>32
1331 ADD lt,temp3,lt ; lt = lt+m1
1332 ADD,DC ht,%r0,ht ; ht++
1333
1334 ADD C1,lt,C1 ; c1=c1+lt
1335 ADD,DC ht,%r0,ht ; ht++ (add in carry)
1336
1337 ADD C2,ht,C2 ; c2 = c2 + ht
1338 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1339.endm
1340
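;
; As with the squaring macros above, in C terms MUL_ADD_C accumulates
; one 128-bit partial product into the three-word column counter
; (a sketch, not from the original source):
;
;   MUL_ADD_C(a, b, c1, c2, c3):  (c3:c2:c1) += (unsigned __int128)a * b;
;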
1341
1342;
1343;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1344; arg0 = r_ptr
1345; arg1 = a_ptr
1346; arg2 = b_ptr
1347;
1348
1349bn_mul_comba8
1350 .proc
1351 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1352 .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1353 .entry
1354 .align 64
1355
1356 STD %r3,0(%sp) ; save r3
1357 STD %r4,8(%sp) ; save r4
1358 STD %r5,16(%sp) ; save r5
1359 STD %r6,24(%sp) ; save r6
1360 FSTD %fr12,32(%sp) ; save fr12
1361 FSTD %fr13,40(%sp) ; save fr13
1362
1363 ;
1364 ; Zero out carries
1365 ;
1366 COPY %r0,c1
1367 COPY %r0,c2
1368 COPY %r0,c3
1369
1370 LDO 128(%sp),%sp ; bump stack
1371 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1372
1373 ;
1374 ; Load up all of the values we are going to use
1375 ;
1376 FLDD 0(a_ptr),a0
1377 FLDD 8(a_ptr),a1
1378 FLDD 16(a_ptr),a2
1379 FLDD 24(a_ptr),a3
1380 FLDD 32(a_ptr),a4
1381 FLDD 40(a_ptr),a5
1382 FLDD 48(a_ptr),a6
1383 FLDD 56(a_ptr),a7
1384
1385 FLDD 0(b_ptr),b0
1386 FLDD 8(b_ptr),b1
1387 FLDD 16(b_ptr),b2
1388 FLDD 24(b_ptr),b3
1389 FLDD 32(b_ptr),b4
1390 FLDD 40(b_ptr),b5
1391 FLDD 48(b_ptr),b6
1392 FLDD 56(b_ptr),b7
1393
1394 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
1395 STD c1,0(r_ptr)
1396 COPY %r0,c1
1397
1398 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
1399 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
1400 STD c2,8(r_ptr)
1401 COPY %r0,c2
1402
1403 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
1404 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
1405 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
1406 STD c3,16(r_ptr)
1407 COPY %r0,c3
1408
1409 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
1410 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
1411 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
1412 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
1413 STD c1,24(r_ptr)
1414 COPY %r0,c1
1415
1416 MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
1417 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
1418 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
1419 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
1420 MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
1421 STD c2,32(r_ptr)
1422 COPY %r0,c2
1423
1424 MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
1425 MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
1426 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
1427 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
1428 MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
1429 MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
1430 STD c3,40(r_ptr)
1431 COPY %r0,c3
1432
1433 MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
1434 MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
1435 MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
1436 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
1437 MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
1438 MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
1439 MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
1440 STD c1,48(r_ptr)
1441 COPY %r0,c1
1442
1443 MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
1444 MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
1445 MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
1446 MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
1447 MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
1448 MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
1449 MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
1450 MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
1451 STD c2,56(r_ptr)
1452 COPY %r0,c2
1453
1454 MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
1455 MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
1456 MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
1457 MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
1458 MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
1459 MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
1460 MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
1461 STD c3,64(r_ptr)
1462 COPY %r0,c3
1463
1464 MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
1465 MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
1466 MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
1467 MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
1468 MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
1469 MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
1470 STD c1,72(r_ptr)
1471 COPY %r0,c1
1472
1473 MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
1474 MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
1475 MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
1476 MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
1477 MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
1478 STD c2,80(r_ptr)
1479 COPY %r0,c2
1480
1481 MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
1482 MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
1483 MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
1484 MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
1485 STD c3,88(r_ptr)
1486 COPY %r0,c3
1487
1488 MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
1489 MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
1490 MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
1491 STD c1,96(r_ptr)
1492 COPY %r0,c1
1493
1494 MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
1495 MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
1496 STD c2,104(r_ptr)
1497 COPY %r0,c2
1498
1499 MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
1500 STD c3,112(r_ptr)
1501 STD c1,120(r_ptr)
1502
1503 .EXIT
1504 FLDD -88(%sp),%fr13
1505 FLDD -96(%sp),%fr12
1506 LDD -104(%sp),%r6 ; restore r6
1507 LDD -112(%sp),%r5 ; restore r5
1508 LDD -120(%sp),%r4 ; restore r4
1509 BVE (%rp)
1510 LDD,MB -128(%sp),%r3
1511
1512 .PROCEND
1513
1514;-----------------------------------------------------------------------------
1515;
1516;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1517; arg0 = r_ptr
1518; arg1 = a_ptr
1519; arg2 = b_ptr
1520;
1521
1522bn_mul_comba4
1523 .proc
1524 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1525 .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1526 .entry
1527 .align 64
1528
1529 STD %r3,0(%sp) ; save r3
1530 STD %r4,8(%sp) ; save r4
1531 STD %r5,16(%sp) ; save r5
1532 STD %r6,24(%sp) ; save r6
1533 FSTD %fr12,32(%sp) ; save fr12
1534 FSTD %fr13,40(%sp) ; save fr13
1535
1536 ;
1537 ; Zero out carries
1538 ;
1539 COPY %r0,c1
1540 COPY %r0,c2
1541 COPY %r0,c3
1542
1543 LDO 128(%sp),%sp ; bump stack
1544 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1545
1546 ;
1547 ; Load up all of the values we are going to use
1548 ;
1549 FLDD 0(a_ptr),a0
1550 FLDD 8(a_ptr),a1
1551 FLDD 16(a_ptr),a2
1552 FLDD 24(a_ptr),a3
1553
1554 FLDD 0(b_ptr),b0
1555 FLDD 8(b_ptr),b1
1556 FLDD 16(b_ptr),b2
1557 FLDD 24(b_ptr),b3
1558
1559 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
1560 STD c1,0(r_ptr)
1561 COPY %r0,c1
1562
1563 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
1564 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
1565 STD c2,8(r_ptr)
1566 COPY %r0,c2
1567
1568 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
1569 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
1570 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
1571 STD c3,16(r_ptr)
1572 COPY %r0,c3
1573
1574 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
1575 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
1576 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
1577 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
1578 STD c1,24(r_ptr)
1579 COPY %r0,c1
1580
1581 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
1582 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
1583 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
1584 STD c2,32(r_ptr)
1585 COPY %r0,c2
1586
1587 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
1588 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
1589 STD c3,40(r_ptr)
1590 COPY %r0,c3
1591
1592 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
1593 STD c1,48(r_ptr)
1594 STD c2,56(r_ptr)
1595
1596 .EXIT
1597 FLDD -88(%sp),%fr13
1598 FLDD -96(%sp),%fr12
1599 LDD -104(%sp),%r6 ; restore r6
1600 LDD -112(%sp),%r5 ; restore r5
1601 LDD -120(%sp),%r4 ; restore r4
1602 BVE (%rp)
1603 LDD,MB -128(%sp),%r3
1604
1605 .PROCEND
1606
1607
1608;--- not PIC .SPACE $TEXT$
1609;--- not PIC .SUBSPA $CODE$
1610;--- not PIC .SPACE $PRIVATE$,SORT=16
1611;--- not PIC .IMPORT $global$,DATA
1612;--- not PIC .SPACE $TEXT$
1613;--- not PIC .SUBSPA $CODE$
1614;--- not PIC .SUBSPA $LIT$,ACCESS=0x2c
1615;--- not PIC C$7
1616;--- not PIC .ALIGN 8
1617;--- not PIC .STRINGZ "Division would overflow (%d)\n"
1618 .END
diff --git a/src/lib/libcrypto/bn/asm/pa-risc2W.s b/src/lib/libcrypto/bn/asm/pa-risc2W.s
deleted file mode 100644
index a91f3ea5af..0000000000
--- a/src/lib/libcrypto/bn/asm/pa-risc2W.s
+++ /dev/null
@@ -1,1605 +0,0 @@
1;
2; PA-RISC 64-bit implementation of bn_asm code
3;
4; This code is approximately 2x faster than the C version
5; for RSA/DSA.
6;
7; See http://devresource.hp.com/ for more details on the PA-RISC
8; architecture. Also see the book "PA-RISC 2.0 Architecture"
9; by Gerry Kane for information on the instruction set architecture.
10;
11; Code written by Chris Ruemmler (with some help from the HP C
12; compiler).
13;
14; The code compiles with HP's assembler
15;
16
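;
; Unlike the 32-bit version above, here all four arguments arrive in
; registers and the 64-bit carry fits in one return register, e.g. for
;
;   BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
;
; rp = %r26 (arg0), ap = %r25 (arg1), num = %r24 (arg2), w = %r23 (arg3),
; with the carry returned in %r28 (%ret0).
;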
17 .level 2.0W
18 .space $TEXT$
19 .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
20
21;
22; Global Register definitions used for the routines.
23;
24; Some information about HP's runtime architecture for 64-bits.
25;
26; "Caller save" means the calling function must save the register
27; if it wants the register to be preserved.
28; "Callee save" means if a function uses the register, it must save
29; the value before using it.
30;
31; For the floating point registers
32;
33; "caller save" registers: fr4-fr11, fr22-fr31
34; "callee save" registers: fr12-fr21
35; "special" registers: fr0-fr3 (status and exception registers)
36;
37; For the integer registers
38; value zero : r0
39; "caller save" registers: r1,r19-r26
40; "callee save" registers: r3-r18
41; return register : r2 (rp)
42; return values : r28 (ret0)
43; Stack pointer : r30 (sp)
44; global data pointer : r27 (dp)
45; argument pointer : r29 (ap)
46; millicode return ptr : r31 (also a caller save register)
47
48
49;
50; Arguments to the routines
51;
52r_ptr .reg %r26
53a_ptr .reg %r25
54b_ptr .reg %r24
55num .reg %r24
56w .reg %r23
57n .reg %r23
58
59
60;
61; Globals used in some routines
62;
63
64top_overflow .reg %r29
65high_mask .reg %r22 ; value 0xffffffff80000000L
66
67
68;------------------------------------------------------------------------------
69;
70; bn_mul_add_words
71;
72;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
73; int num, BN_ULONG w)
74;
75; arg0 = r_ptr
76; arg1 = a_ptr
77; arg2 = num
78; arg3 = w
79;
80; Local register definitions
81;
82
83fm1 .reg %fr22
84fm .reg %fr23
85ht_temp .reg %fr24
86ht_temp_1 .reg %fr25
87lt_temp .reg %fr26
88lt_temp_1 .reg %fr27
89fm1_1 .reg %fr28
90fm_1 .reg %fr29
91
92fw_h .reg %fr7L
93fw_l .reg %fr7R
94fw .reg %fr7
95
96fht_0 .reg %fr8L
97flt_0 .reg %fr8R
98t_float_0 .reg %fr8
99
100fht_1 .reg %fr9L
101flt_1 .reg %fr9R
102t_float_1 .reg %fr9
103
104tmp_0 .reg %r31
105tmp_1 .reg %r21
106m_0 .reg %r20
107m_1 .reg %r19
108ht_0 .reg %r1
109ht_1 .reg %r3
110lt_0 .reg %r4
111lt_1 .reg %r5
112m1_0 .reg %r6
113m1_1 .reg %r7
114rp_val .reg %r8
115rp_val_1 .reg %r9
116
117bn_mul_add_words
118 .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
119 .proc
120 .callinfo frame=128
121 .entry
122 .align 64
123
124 STD %r3,0(%sp) ; save r3
125 STD %r4,8(%sp) ; save r4
126 NOP ; Needed to make the loop 16-byte aligned
127 NOP ; Needed to make the loop 16-byte aligned
128
129 STD %r5,16(%sp) ; save r5
130 STD %r6,24(%sp) ; save r6
131 STD %r7,32(%sp) ; save r7
132 STD %r8,40(%sp) ; save r8
133
134 STD %r9,48(%sp) ; save r9
135 COPY %r0,%ret0 ; return 0 by default
136 DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
137 STD w,56(%sp) ; store w on stack
138
139 CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit
140 LDO 128(%sp),%sp ; bump stack
141
142 ;
143 ; The loop is unrolled twice, so if there is only 1 number
144 ; then go straight to the cleanup code.
145 ;
146 CMPIB,= 1,num,bn_mul_add_words_single_top
147 FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l)
148
149 ;
150 ; This loop is unrolled 2 times (64-byte aligned as well)
151 ;
152 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
153; two 32-bit multiplies can be issued per cycle.
154 ;
155bn_mul_add_words_unroll2
156
157 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
158 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
159 LDD 0(r_ptr),rp_val ; rp[0]
160 LDD 8(r_ptr),rp_val_1 ; rp[1]
161
162 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
163 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
164 FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
165 FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
166
167 XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
168 XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
169 FSTD fm,-8(%sp) ; -8(sp) = m[0]
170 FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
171
172 XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
173 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
174 FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp
175 FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1
176
177 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
178 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp_1 = lt*fw_l
179 FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp
180 FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1
181
182 LDD -8(%sp),m_0 ; m[0]
183 LDD -40(%sp),m_1 ; m[1]
184 LDD -16(%sp),m1_0 ; m1[0]
185 LDD -48(%sp),m1_1 ; m1[1]
186
187 LDD -24(%sp),ht_0 ; ht[0]
188 LDD -56(%sp),ht_1 ; ht[1]
189 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
190 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
191
192 LDD -32(%sp),lt_0
193 LDD -64(%sp),lt_1
194 CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0])
195 ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
196
197 CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1])
198 ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
199 EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32
200 DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32
201
202 EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32
203 DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32
204 ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
205 ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
206
207 ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
208 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
209 ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
210 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
211
212 ADD %ret0,lt_0,lt_0 ; lt[0] = lt[0] + c;
213 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
214 ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0]
215 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
216
217 LDO -2(num),num ; num = num - 2;
218 ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c);
219 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
220 STD lt_0,0(r_ptr) ; rp[0] = lt[0]
221
222 ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1]
223 ADD,DC ht_1,%r0,%ret0 ; ht[1]++
224 LDO 16(a_ptr),a_ptr ; a_ptr += 2
225
226 STD lt_1,8(r_ptr) ; rp[1] = lt[1]
227 CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
228 LDO 16(r_ptr),r_ptr ; r_ptr += 2
229
230 CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
231
232 ;
233 ; Top of loop aligned on 64-byte boundary
234 ;
235bn_mul_add_words_single_top
236 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
237 LDD 0(r_ptr),rp_val ; rp[0]
238 LDO 8(a_ptr),a_ptr ; a_ptr++
239 XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
240 FSTD fm1,-16(%sp) ; -16(sp) = m1
241 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
242 FSTD fm,-8(%sp) ; -8(sp) = m
243 XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
244 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
245 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
246 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
247
248 LDD -8(%sp),m_0
249 LDD -16(%sp),m1_0 ; m1 = temp1
250 ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
251 LDD -24(%sp),ht_0
252 LDD -32(%sp),lt_0
253
254 CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
255 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
256
257 EXTRD,U tmp_0,31,32,m_0 ; m>>32
258 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
259
260 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
261 ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1;
262 ADD,DC ht_0,%r0,ht_0 ; ht++
263 ADD %ret0,tmp_0,lt_0 ; lt = lt + c;
264 ADD,DC ht_0,%r0,ht_0 ; ht++
265 ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0]
266 ADD,DC ht_0,%r0,%ret0 ; ht++
267 STD lt_0,0(r_ptr) ; rp[0] = lt
268
269bn_mul_add_words_exit
270 .EXIT
271 LDD -80(%sp),%r9 ; restore r9
272 LDD -88(%sp),%r8 ; restore r8
273 LDD -96(%sp),%r7 ; restore r7
274 LDD -104(%sp),%r6 ; restore r6
275 LDD -112(%sp),%r5 ; restore r5
276 LDD -120(%sp),%r4 ; restore r4
277 BVE (%rp)
278 LDD,MB -128(%sp),%r3 ; restore r3
279 .PROCEND ;in=23,24,25,26,29;out=28;
280
281;----------------------------------------------------------------------------
282;
283;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
284;
285; arg0 = rp
286; arg1 = ap
287; arg2 = num
288; arg3 = w
289
290bn_mul_words
291 .proc
292 .callinfo frame=128
293 .entry
294 .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
295 .align 64
296
297 STD %r3,0(%sp) ; save r3
298 STD %r4,8(%sp) ; save r4
299 STD %r5,16(%sp) ; save r5
300 STD %r6,24(%sp) ; save r6
301
302 STD %r7,32(%sp) ; save r7
303 COPY %r0,%ret0 ; return 0 by default
304 DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
305 STD w,56(%sp) ; w on stack
306
307 CMPIB,>= 0,num,bn_mul_words_exit
308 LDO 128(%sp),%sp ; bump stack
309
310 ;
311 ; See if only 1 word to do, thus just do cleanup
312 ;
313 CMPIB,= 1,num,bn_mul_words_single_top
314 FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l)
315
316 ;
317 ; This loop is unrolled 2 times (64-byte aligned as well)
318 ;
319 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
320; two 32-bit multiplies can be issued per cycle.
321 ;
322bn_mul_words_unroll2
323
324 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
325 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
326 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
327 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l
328
329 FSTD fm1,-16(%sp) ; -16(sp) = m1
330 FSTD fm1_1,-48(%sp) ; -48(sp) = m1
331 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
332 XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h
333
334 FSTD fm,-8(%sp) ; -8(sp) = m
335 FSTD fm_1,-40(%sp) ; -40(sp) = m
336 XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
337 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h
338
339 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
340 FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht
341 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
342 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
343
344 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
345 FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt
346 LDD -8(%sp),m_0
347 LDD -40(%sp),m_1
348
349 LDD -16(%sp),m1_0
350 LDD -48(%sp),m1_1
351 LDD -24(%sp),ht_0
352 LDD -56(%sp),ht_1
353
354 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1;
355 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1;
356 LDD -32(%sp),lt_0
357 LDD -64(%sp),lt_1
358
359 CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1)
360 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
361 CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1)
362 ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32)
363
364 EXTRD,U tmp_0,31,32,m_0 ; m>>32
365 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
366 EXTRD,U tmp_1,31,32,m_1 ; m>>32
367 DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32
368
369 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
370 ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32)
371 ADD lt_0,m1_0,lt_0 ; lt = lt+m1;
372 ADD,DC ht_0,%r0,ht_0 ; ht++
373
374 ADD lt_1,m1_1,lt_1 ; lt = lt+m1;
375 ADD,DC ht_1,%r0,ht_1 ; ht++
376 ADD %ret0,lt_0,lt_0 ; lt = lt + c (ret0);
377 ADD,DC ht_0,%r0,ht_0 ; ht++
378
379 ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0)
380 ADD,DC ht_1,%r0,ht_1 ; ht++
381 STD lt_0,0(r_ptr) ; rp[0] = lt
382 STD lt_1,8(r_ptr) ; rp[1] = lt
383
384 COPY ht_1,%ret0 ; carry = ht
385 LDO -2(num),num ; num = num - 2;
386 LDO 16(a_ptr),a_ptr ; ap += 2
387 CMPIB,<= 2,num,bn_mul_words_unroll2
388 LDO 16(r_ptr),r_ptr ; rp += 2
389
390 CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
391
392 ;
393 ; Top of loop aligned on 64-byte boundary
394 ;
395bn_mul_words_single_top
396 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
397
398 XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
399 FSTD fm1,-16(%sp) ; -16(sp) = m1
400 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
401 FSTD fm,-8(%sp) ; -8(sp) = m
402 XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
403 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
404 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
405 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
406
407 LDD -8(%sp),m_0
408 LDD -16(%sp),m1_0
409 ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
410 LDD -24(%sp),ht_0
411 LDD -32(%sp),lt_0
412
413 CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
414 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
415
416 EXTRD,U tmp_0,31,32,m_0 ; m>>32
417 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
418
419 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
420 ADD lt_0,m1_0,lt_0 ; lt= lt+m1;
421 ADD,DC ht_0,%r0,ht_0 ; ht++
422
423 ADD %ret0,lt_0,lt_0 ; lt = lt + c;
424 ADD,DC ht_0,%r0,ht_0 ; ht++
425
426 COPY ht_0,%ret0 ; copy carry
427 STD lt_0,0(r_ptr) ; rp[0] = lt
428
429bn_mul_words_exit
430 .EXIT
431 LDD -96(%sp),%r7 ; restore r7
432 LDD -104(%sp),%r6 ; restore r6
433 LDD -112(%sp),%r5 ; restore r5
434 LDD -120(%sp),%r4 ; restore r4
435 BVE (%rp)
436 LDD,MB -128(%sp),%r3 ; restore r3
437 .PROCEND ;in=23,24,25,26,29;out=28;
438
439;----------------------------------------------------------------------------
440;
441;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
442;
443; arg0 = rp
444; arg1 = ap
445; arg2 = num
446;
447
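;
; A hedged C model of what this routine computes (our sketch, not the
; bn_asm.c text; assumes a 64-bit BN_ULONG and a compiler providing
; unsigned __int128):
;
;   #include <stdint.h>
;   typedef uint64_t BN_ULONG;
;
;   void bn_sqr_words_ref(BN_ULONG *rp, const BN_ULONG *ap, int num)
;   {
;       while (num-- > 0) {
;           unsigned __int128 t = (unsigned __int128)*ap * *ap;
;           *rp++ = (BN_ULONG)t;          /* low word first  */
;           *rp++ = (BN_ULONG)(t >> 64);  /* then high word  */
;           ap++;
;       }
;   }
;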
448bn_sqr_words
449 .proc
450 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
451 .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
452 .entry
453 .align 64
454
455 STD %r3,0(%sp) ; save r3
456 STD %r4,8(%sp) ; save r4
457 NOP
458 STD %r5,16(%sp) ; save r5
459
460 CMPIB,>= 0,num,bn_sqr_words_exit
461 LDO 128(%sp),%sp ; bump stack
462
463 ;
464 ; If only 1, then go straight to cleanup
465 ;
466 CMPIB,= 1,num,bn_sqr_words_single_top
467 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
468
469 ;
470 ; This loop is unrolled 2 times (64-byte aligned as well)
471 ;
472
473bn_sqr_words_unroll2
474 FLDD 0(a_ptr),t_float_0 ; a[0]
475 FLDD 8(a_ptr),t_float_1 ; a[1]
476 XMPYU fht_0,flt_0,fm ; m[0]
477 XMPYU fht_1,flt_1,fm_1 ; m[1]
478
479 FSTD fm,-24(%sp) ; store m[0]
480 FSTD fm_1,-56(%sp) ; store m[1]
481 XMPYU flt_0,flt_0,lt_temp ; lt[0]
482 XMPYU flt_1,flt_1,lt_temp_1 ; lt[1]
483
484 FSTD lt_temp,-16(%sp) ; store lt[0]
485 FSTD lt_temp_1,-48(%sp) ; store lt[1]
486 XMPYU fht_0,fht_0,ht_temp ; ht[0]
487 XMPYU fht_1,fht_1,ht_temp_1 ; ht[1]
488
489 FSTD ht_temp,-8(%sp) ; store ht[0]
490 FSTD ht_temp_1,-40(%sp) ; store ht[1]
491 LDD -24(%sp),m_0
492 LDD -56(%sp),m_1
493
494 AND m_0,high_mask,tmp_0 ; m[0] & Mask
495 AND m_1,high_mask,tmp_1 ; m[1] & Mask
496 DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1
497 DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1
498
499 LDD -16(%sp),lt_0
500 LDD -48(%sp),lt_1
501 EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1
502 EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1
503
504 LDD -8(%sp),ht_0
505 LDD -40(%sp),ht_1
506 ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0
507 ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1
508
509 ADD lt_0,m_0,lt_0 ; lt = lt+m
510 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
511 STD lt_0,0(r_ptr) ; rp[0] = lt[0]
512 STD ht_0,8(r_ptr) ; rp[1] = ht[0]
513
514 ADD lt_1,m_1,lt_1 ; lt = lt+m
515 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
516 STD lt_1,16(r_ptr) ; rp[2] = lt[1]
517 STD ht_1,24(r_ptr) ; rp[3] = ht[1]
518
519 LDO -2(num),num ; num = num - 2;
520 LDO 16(a_ptr),a_ptr ; ap += 2
521 CMPIB,<= 2,num,bn_sqr_words_unroll2
522 LDO 32(r_ptr),r_ptr ; rp += 4
523
524 CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
525
526 ;
527 ; Top of loop aligned on 64-byte boundary
528 ;
529bn_sqr_words_single_top
530 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
531
532 XMPYU fht_0,flt_0,fm ; m
533 FSTD fm,-24(%sp) ; store m
534
535 XMPYU flt_0,flt_0,lt_temp ; lt
536 FSTD lt_temp,-16(%sp) ; store lt
537
538 XMPYU fht_0,fht_0,ht_temp ; ht
539 FSTD ht_temp,-8(%sp) ; store ht
540
541 LDD -24(%sp),m_0 ; load m
542 AND m_0,high_mask,tmp_0 ; m & Mask
543 DEPD,Z m_0,30,31,m_0 ; m << 32+1
544 LDD -16(%sp),lt_0 ; lt
545
546 LDD -8(%sp),ht_0 ; ht
547 EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1
548 ADD m_0,lt_0,lt_0 ; lt = lt+m
549 ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0
550 ADD,DC ht_0,%r0,ht_0 ; ht++
551
552 STD lt_0,0(r_ptr) ; rp[0] = lt
553 STD ht_0,8(r_ptr) ; rp[1] = ht
554
555bn_sqr_words_exit
556 .EXIT
557 LDD -112(%sp),%r5 ; restore r5
558 LDD -120(%sp),%r4 ; restore r4
559 BVE (%rp)
560 LDD,MB -128(%sp),%r3
561 .PROCEND ;in=23,24,25,26,29;out=28;
562
563
564;----------------------------------------------------------------------------
565;
566;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
567;
568; arg0 = rp
569; arg1 = ap
570; arg2 = bp
571; arg3 = n
572
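;
; A hedged C model of the carry chain implemented below (our sketch;
; assumes a 64-bit BN_ULONG):
;
;   typedef uint64_t BN_ULONG;
;
;   BN_ULONG bn_add_words_ref(BN_ULONG *r, const BN_ULONG *a,
;                             const BN_ULONG *b, int n)
;   {
;       BN_ULONG c = 0;              /* carry lives in %ret0 */
;       for (int i = 0; i < n; i++) {
;           BN_ULONG t = a[i] + c;
;           c = (t < c);             /* ADD,DC: set c to carry */
;           r[i] = t + b[i];
;           c += (r[i] < t);         /* ADD,DC: c += carry     */
;       }
;       return c;
;   }
;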
573t .reg %r22
574b .reg %r21
575l .reg %r20
576
577bn_add_words
578 .proc
579 .entry
580 .callinfo
581 .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
582 .align 64
583
584 CMPIB,>= 0,n,bn_add_words_exit
585 COPY %r0,%ret0 ; return 0 by default
586
587 ;
588 ; If 2 or more words, do the loop
589 ;
590 CMPIB,= 1,n,bn_add_words_single_top
591 NOP
592
593 ;
594 ; This loop is unrolled 2 times (64-byte aligned as well)
595 ;
596bn_add_words_unroll2
597 LDD 0(a_ptr),t
598 LDD 0(b_ptr),b
599 ADD t,%ret0,t ; t = t+c;
600 ADD,DC %r0,%r0,%ret0 ; set c to carry
601 ADD t,b,l ; l = t + b[0]
602 ADD,DC %ret0,%r0,%ret0 ; c+= carry
603 STD l,0(r_ptr)
604
605 LDD 8(a_ptr),t
606 LDD 8(b_ptr),b
607 ADD t,%ret0,t ; t = t+c;
608 ADD,DC %r0,%r0,%ret0 ; set c to carry
609 ADD t,b,l ; l = t + b[1]
610 ADD,DC %ret0,%r0,%ret0 ; c+= carry
611 STD l,8(r_ptr)
612
613 LDO -2(n),n
614 LDO 16(a_ptr),a_ptr
615 LDO 16(b_ptr),b_ptr
616
617 CMPIB,<= 2,n,bn_add_words_unroll2
618 LDO 16(r_ptr),r_ptr
619
620 CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
621
622bn_add_words_single_top
623 LDD 0(a_ptr),t
624 LDD 0(b_ptr),b
625
626 ADD t,%ret0,t ; t = t+c;
627 ADD,DC %r0,%r0,%ret0 ; set c to carry (could use CMPCLR??)
628 ADD t,b,l ; l = t + b[0]
629 ADD,DC %ret0,%r0,%ret0 ; c+= carry
630 STD l,0(r_ptr)
631
632bn_add_words_exit
633 .EXIT
634 BVE (%rp)
635 NOP
636 .PROCEND ;in=23,24,25,26,29;out=28;
637
638;----------------------------------------------------------------------------
639;
640;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
641;
642; arg0 = rp
643; arg1 = ap
644; arg2 = bp
645; arg3 = n
646
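;
; A hedged C model of the borrow logic below (our sketch; BN_ULONG as
; in the sketch above). The borrow only changes when a[i] != b[i],
; which is what the CMPCLR,*= guard implements:
;
;   BN_ULONG bn_sub_words_ref(BN_ULONG *r, const BN_ULONG *a,
;                             const BN_ULONG *b, int n)
;   {
;       BN_ULONG c = 0;              /* borrow lives in %ret0 */
;       for (int i = 0; i < n; i++) {
;           r[i] = a[i] - b[i] - c;
;           if (a[i] != b[i])
;               c = (a[i] < b[i]);   /* else keep previous borrow */
;       }
;       return c;
;   }
;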
647t1 .reg %r22
648t2 .reg %r21
649sub_tmp1 .reg %r20
650sub_tmp2 .reg %r19
651
652
653bn_sub_words
654 .proc
655 .callinfo
656 .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
657 .entry
658 .align 64
659
660 CMPIB,>= 0,n,bn_sub_words_exit
661 COPY %r0,%ret0 ; return 0 by default
662
663 ;
664 ; If 2 or more words, do the loop
665 ;
666 CMPIB,= 1,n,bn_sub_words_single_top
667 NOP
668
669 ;
670 ; This loop is unrolled 2 times (64-byte aligned as well)
671 ;
672bn_sub_words_unroll2
673 LDD 0(a_ptr),t1
674 LDD 0(b_ptr),t2
675 SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
676 SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
677
678 CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
679 LDO 1(%r0),sub_tmp2
680
681 CMPCLR,*= t1,t2,%r0
682 COPY sub_tmp2,%ret0
683 STD sub_tmp1,0(r_ptr)
684
685 LDD 8(a_ptr),t1
686 LDD 8(b_ptr),t2
687 SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
688 SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
689 CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
690 LDO 1(%r0),sub_tmp2
691
692 CMPCLR,*= t1,t2,%r0
693 COPY sub_tmp2,%ret0
694 STD sub_tmp1,8(r_ptr)
695
696 LDO -2(n),n
697 LDO 16(a_ptr),a_ptr
698 LDO 16(b_ptr),b_ptr
699
700 CMPIB,<= 2,n,bn_sub_words_unroll2
701 LDO 16(r_ptr),r_ptr
702
703 CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
704
705bn_sub_words_single_top
706 LDD 0(a_ptr),t1
707 LDD 0(b_ptr),t2
708 SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
709 SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
710 CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
711 LDO 1(%r0),sub_tmp2
712
713 CMPCLR,*= t1,t2,%r0
714 COPY sub_tmp2,%ret0
715
716 STD sub_tmp1,0(r_ptr)
717
718bn_sub_words_exit
719 .EXIT
720 BVE (%rp)
721 NOP
722 .PROCEND ;in=23,24,25,26,29;out=28;
723
724;------------------------------------------------------------------------------
725;
726; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
727;
728; arg0 = h
729; arg1 = l
730; arg2 = d
731;
732; This is mainly just modified assembly from the compiler, thus the
733; lack of variable names.
734;
735;------------------------------------------------------------------------------
736bn_div_words
737 .proc
738 .callinfo CALLER,FRAME=272,ENTRY_GR=%r10,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
739 .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
740 .IMPORT BN_num_bits_word,CODE,NO_RELOCATION
741 .IMPORT __iob,DATA
742 .IMPORT fprintf,CODE,NO_RELOCATION
743 .IMPORT abort,CODE,NO_RELOCATION
744 .IMPORT $$div2U,MILLICODE
745 .entry
746 STD %r2,-16(%r30)
747 STD,MA %r3,352(%r30)
748 STD %r4,-344(%r30)
749 STD %r5,-336(%r30)
750 STD %r6,-328(%r30)
751 STD %r7,-320(%r30)
752 STD %r8,-312(%r30)
753 STD %r9,-304(%r30)
754 STD %r10,-296(%r30)
755
756 STD %r27,-288(%r30) ; save gp
757
758 COPY %r24,%r3 ; save d
759 COPY %r26,%r4 ; save h (high 64-bits)
760 LDO -1(%r0),%ret0 ; return -1 by default
761
762 CMPB,*= %r0,%arg2,$D3 ; if (d == 0)
763 COPY %r25,%r5 ; save l (low 64-bits)
764
765 LDO -48(%r30),%r29 ; create ap
766 .CALL ;in=26,29;out=28;
767 B,L BN_num_bits_word,%r2
768 COPY %r3,%r26
769 LDD -288(%r30),%r27 ; restore gp
770 LDI 64,%r21
771
772 CMPB,= %r21,%ret0,$00000012 ;if (i == 64) (forward)
773 COPY %ret0,%r24 ; i
774 MTSARCM %r24
775 DEPDI,Z -1,%sar,1,%r29
776 CMPB,*<<,N %r29,%r4,bn_div_err_case ; if (h > 1<<i) (forward)
777
778$00000012
779 SUBI 64,%r24,%r31 ; i = 64 - i;
780 CMPCLR,*<< %r4,%r3,%r0 ; if (h >= d)
781 SUB %r4,%r3,%r4 ; h -= d
782 CMPB,= %r31,%r0,$0000001A ; if (i)
783 COPY %r0,%r10 ; ret = 0
784 MTSARCM %r31 ; i to shift
785 DEPD,Z %r3,%sar,64,%r3 ; d <<= i;
786 SUBI 64,%r31,%r19 ; 64 - i; redundant
787 MTSAR %r19 ; (64 -i) to shift
788 SHRPD %r4,%r5,%sar,%r4 ; l>> (64-i)
789 MTSARCM %r31 ; i to shift
790 DEPD,Z %r5,%sar,64,%r5 ; l <<= i;
791
792$0000001A
793 DEPDI,Z -1,31,32,%r19
794 EXTRD,U %r3,31,32,%r6 ; dh = d >> 32
795 EXTRD,U %r3,63,32,%r8 ; dl = d & 0xffffffff
796 LDO 2(%r0),%r9
797 STD %r3,-280(%r30) ; "d" to stack
798
799$0000001C
800 DEPDI,Z -1,63,32,%r29 ;
801 EXTRD,U %r4,31,32,%r31 ; h >> 32
802 CMPB,*=,N %r31,%r6,$D2 ; if ((h>>32) != dh)(forward) div
803 COPY %r4,%r26
804 EXTRD,U %r4,31,32,%r25
805 COPY %r6,%r24
806 .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
807 B,L $$div2U,%r2
808 EXTRD,U %r6,31,32,%r23
809 DEPD %r28,31,32,%r29
810$D2
811 STD %r29,-272(%r30) ; q
812 AND %r5,%r19,%r24 ; t & 0xffffffff00000000;
813 EXTRD,U %r24,31,32,%r24 ; ???
814 FLDD -272(%r30),%fr7 ; q
815 FLDD -280(%r30),%fr8 ; d
816 XMPYU %fr8L,%fr7L,%fr10
817 FSTD %fr10,-256(%r30)
818 XMPYU %fr8L,%fr7R,%fr22
819 FSTD %fr22,-264(%r30)
820 XMPYU %fr8R,%fr7L,%fr11
821 XMPYU %fr8R,%fr7R,%fr23
822 FSTD %fr11,-232(%r30)
823 FSTD %fr23,-240(%r30)
824 LDD -256(%r30),%r28
825 DEPD,Z %r28,31,32,%r2
826 LDD -264(%r30),%r20
827 ADD,L %r20,%r2,%r31
828 LDD -232(%r30),%r22
829 DEPD,Z %r22,31,32,%r22
830 LDD -240(%r30),%r21
831 B $00000024 ; enter loop
832 ADD,L %r21,%r22,%r23
833
834$0000002A
835 LDO -1(%r29),%r29
836 SUB %r23,%r8,%r23
837$00000024
838 SUB %r4,%r31,%r25
839 AND %r25,%r19,%r26
840 CMPB,*<>,N %r0,%r26,$00000046 ; (forward)
841 DEPD,Z %r25,31,32,%r20
842 OR %r20,%r24,%r21
843 CMPB,*<<,N %r21,%r23,$0000002A ;(backward)
844 SUB %r31,%r6,%r31
845;-------------Break path---------------------
846
847$00000046
848 DEPD,Z %r23,31,32,%r25 ;tl
849 EXTRD,U %r23,31,32,%r26 ;t
850 AND %r25,%r19,%r24 ;tl = (tl<<32) & 0xffffffff00000000L
851 ADD,L %r31,%r26,%r31 ;th += t;
852 CMPCLR,*>>= %r5,%r24,%r0 ;if (l<tl)
853 LDO 1(%r31),%r31 ; th++;
854 CMPB,*<<=,N %r31,%r4,$00000036 ;if (h < th) (forward)
855 LDO -1(%r29),%r29 ;q--;
856 ADD,L %r4,%r3,%r4 ;h += d;
857$00000036
858 ADDIB,=,N -1,%r9,$D1 ;if (--count == 0) break (forward)
859 SUB %r5,%r24,%r28 ; l -= tl;
860 SUB %r4,%r31,%r24 ; h -= th;
861 SHRPD %r24,%r28,32,%r4 ; h = ((h<<32)|(l>>32));
862 DEPD,Z %r29,31,32,%r10 ; ret = q<<32
863 b $0000001C
864 DEPD,Z %r28,31,32,%r5 ; l = l << 32
865
866$D1
867 OR %r10,%r29,%r28 ; ret |= q
868$D3
869 LDD -368(%r30),%r2
870$D0
871 LDD -296(%r30),%r10
872 LDD -304(%r30),%r9
873 LDD -312(%r30),%r8
874 LDD -320(%r30),%r7
875 LDD -328(%r30),%r6
876 LDD -336(%r30),%r5
877 LDD -344(%r30),%r4
878 BVE (%r2)
879 .EXIT
880 LDD,MB -352(%r30),%r3
881
882bn_div_err_case
883 MFIA %r6
884 ADDIL L'bn_div_words-bn_div_err_case,%r6,%r1
885 LDO R'bn_div_words-bn_div_err_case(%r1),%r6
886 ADDIL LT'__iob,%r27,%r1
887 LDD RT'__iob(%r1),%r26
888 ADDIL L'C$4-bn_div_words,%r6,%r1
889 LDO R'C$4-bn_div_words(%r1),%r25
890 LDO 64(%r26),%r26
891 .CALL ;in=24,25,26,29;out=28;
892 B,L fprintf,%r2
893 LDO -48(%r30),%r29
894 LDD -288(%r30),%r27
895 .CALL ;in=29;
896 B,L abort,%r2
897 LDO -48(%r30),%r29
898 LDD -288(%r30),%r27
899 B $D0
900 LDD -368(%r30),%r2
901 .PROCEND ;in=24,25,26,29;out=28;
902
903;----------------------------------------------------------------------------
904;
905; Registers to hold 64-bit values to manipulate. The "L" part
906; of the register corresponds to the upper 32-bits, while the "R"
907; part corresponds to the lower 32-bits
908;
909 ; Note that when using b6 and b7, the code must save them before
910 ; use, because they are callee-save registers
911;
912;
913; Floating point registers to use to save values that
914; are manipulated. These don't collide with ftemp1-6 and
915; are all caller save registers
916;
917a0 .reg %fr22
918a0L .reg %fr22L
919a0R .reg %fr22R
920
921a1 .reg %fr23
922a1L .reg %fr23L
923a1R .reg %fr23R
924
925a2 .reg %fr24
926a2L .reg %fr24L
927a2R .reg %fr24R
928
929a3 .reg %fr25
930a3L .reg %fr25L
931a3R .reg %fr25R
932
933a4 .reg %fr26
934a4L .reg %fr26L
935a4R .reg %fr26R
936
937a5 .reg %fr27
938a5L .reg %fr27L
939a5R .reg %fr27R
940
941a6 .reg %fr28
942a6L .reg %fr28L
943a6R .reg %fr28R
944
945a7 .reg %fr29
946a7L .reg %fr29L
947a7R .reg %fr29R
948
949b0 .reg %fr30
950b0L .reg %fr30L
951b0R .reg %fr30R
952
953b1 .reg %fr31
954b1L .reg %fr31L
955b1R .reg %fr31R
956
957;
958; Temporary floating point variables, these are all caller save
959; registers
960;
961ftemp1 .reg %fr4
962ftemp2 .reg %fr5
963ftemp3 .reg %fr6
964ftemp4 .reg %fr7
965
966;
967; The B set of registers when used.
968;
969
970b2 .reg %fr8
971b2L .reg %fr8L
972b2R .reg %fr8R
973
974b3 .reg %fr9
975b3L .reg %fr9L
976b3R .reg %fr9R
977
978b4 .reg %fr10
979b4L .reg %fr10L
980b4R .reg %fr10R
981
982b5 .reg %fr11
983b5L .reg %fr11L
984b5R .reg %fr11R
985
986b6 .reg %fr12
987b6L .reg %fr12L
988b6R .reg %fr12R
989
990b7 .reg %fr13
991b7L .reg %fr13L
992b7R .reg %fr13R
993
994c1 .reg %r21 ; only reg
995temp1 .reg %r20 ; only reg
996temp2 .reg %r19 ; only reg
997temp3 .reg %r31 ; only reg
998
999m1 .reg %r28
1000c2 .reg %r23
1001high_one .reg %r1
1002ht .reg %r6
1003lt .reg %r5
1004m .reg %r4
1005c3 .reg %r3
1006
1007SQR_ADD_C .macro A0L,A0R,C1,C2,C3
1008 XMPYU A0L,A0R,ftemp1 ; m
1009 FSTD ftemp1,-24(%sp) ; store m
1010
1011 XMPYU A0R,A0R,ftemp2 ; lt
1012 FSTD ftemp2,-16(%sp) ; store lt
1013
1014 XMPYU A0L,A0L,ftemp3 ; ht
1015 FSTD ftemp3,-8(%sp) ; store ht
1016
1017 LDD -24(%sp),m ; load m
1018 AND m,high_mask,temp2 ; m & Mask
1019 DEPD,Z m,30,31,temp3 ; m << 32+1
1020 LDD -16(%sp),lt ; lt
1021
1022 LDD -8(%sp),ht ; ht
1023 EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1
1024 ADD temp3,lt,lt ; lt = lt+m
1025 ADD,L ht,temp1,ht ; ht += temp1
1026 ADD,DC ht,%r0,ht ; ht++
1027
1028 ADD C1,lt,C1 ; c1=c1+lt
1029 ADD,DC ht,%r0,ht ; ht++
1030
1031 ADD C2,ht,C2 ; c2=c2+ht
1032 ADD,DC C3,%r0,C3 ; c3++
1033.endm
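;
; A hedged C model of SQR_ADD_C (our sketch; assumes unsigned
; __int128): accumulate a*a into the running column sum c3:c2:c1.
;
;   static void sqr_add_c(uint64_t a, uint64_t *c1, uint64_t *c2,
;                         uint64_t *c3)
;   {
;       unsigned __int128 t = (unsigned __int128)a * a;
;       uint64_t lt = (uint64_t)t, ht = (uint64_t)(t >> 64);
;       *c1 += lt;
;       if (*c1 < lt) ht++;          /* ht <= 2^64-2, cannot wrap */
;       *c2 += ht;
;       if (*c2 < ht) (*c3)++;
;   }
;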
1034
1035SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
1036 XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht
1037 FSTD ftemp1,-16(%sp) ;
1038 XMPYU A0R,A1L,ftemp2 ; m = bh*lt
1039 FSTD ftemp2,-8(%sp) ;
1040 XMPYU A0R,A1R,ftemp3 ; lt = bl*lt
1041 FSTD ftemp3,-32(%sp)
1042 XMPYU A0L,A1L,ftemp4 ; ht = bh*ht
1043 FSTD ftemp4,-24(%sp) ;
1044
1045 LDD -8(%sp),m ; load m
1046 LDD -16(%sp),m1 ; load m1
1047 ADD,L m,m1,m ; m+m1
1048
1049 DEPD,Z m,31,32,temp3 ; (m+m1)<<32
1050 LDD -24(%sp),ht ; load ht
1051
1052 CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
1053 ADD,L ht,high_one,ht ; ht+=high_one
1054
1055 EXTRD,U m,31,32,temp1 ; m >> 32
1056 LDD -32(%sp),lt ; lt
1057 ADD,L ht,temp1,ht ; ht+= m>>32
1058 ADD lt,temp3,lt ; lt = lt+m1
1059 ADD,DC ht,%r0,ht ; ht++
1060
1061 ADD ht,ht,ht ; ht=ht+ht;
1062 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1063
1064 ADD lt,lt,lt ; lt=lt+lt;
1065 ADD,DC ht,%r0,ht ; add in carry (ht++)
1066
1067 ADD C1,lt,C1 ; c1=c1+lt
1068 ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++)
1069 LDO 1(C3),C3 ; bump c3 if overflow, nullify otherwise
1070
1071 ADD C2,ht,C2 ; c2 = c2 + ht
1072 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1073.endm
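;
; A hedged C model of SQR_ADD_C2 (our sketch; assumes unsigned
; __int128): accumulate the doubled cross product 2*a*b into c3:c2:c1.
;
;   static void sqr_add_c2(uint64_t a, uint64_t b, uint64_t *c1,
;                          uint64_t *c2, uint64_t *c3)
;   {
;       unsigned __int128 t = (unsigned __int128)a * b;
;       uint64_t lt = (uint64_t)t, ht = (uint64_t)(t >> 64);
;       uint64_t h2 = ht + ht;
;       if (h2 < ht) (*c3)++;        /* carry out of doubled high word */
;       uint64_t l2 = lt + lt;
;       if (l2 < lt) h2++;           /* h2 is even here, cannot wrap   */
;       *c1 += l2;
;       if (*c1 < l2) {              /* carry into the high word...    */
;           if (++h2 == 0)           /* ...which may itself wrap:      */
;               (*c3)++;             /* the ADD,DC,*NUV / LDO pair     */
;       }
;       *c2 += h2;
;       if (*c2 < h2) (*c3)++;
;   }
;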
1074
1075;
1076;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
1077; arg0 = r_ptr
1078; arg1 = a_ptr
1079;
1080
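;
; How the comba pattern below works: column k of the result collects
; every a[i]*a[j] with i+j == k, via SQR_ADD_C on the diagonal and
; SQR_ADD_C2 for the doubled off-diagonal terms, while c1/c2/c3
; rotate as a 192-bit column accumulator. A hedged C driver sketch
; (our names; uses the sqr_add_c/sqr_add_c2 models above):
;
;   void bn_sqr_comba8_ref(uint64_t *r, const uint64_t *a)
;   {
;       uint64_t c1 = 0, c2 = 0, c3 = 0;
;       for (int k = 0; k < 15; k++) {
;           for (int i = (k > 7 ? k - 7 : 0); i <= k / 2; i++) {
;               if (i == k - i)
;                   sqr_add_c(a[i], &c1, &c2, &c3);
;               else
;                   sqr_add_c2(a[k - i], a[i], &c1, &c2, &c3);
;           }
;           r[k] = c1;            /* emit column, rotate accumulator */
;           c1 = c2; c2 = c3; c3 = 0;
;       }
;       r[15] = c1;
;   }
;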
1081bn_sqr_comba8
1082 .PROC
1083 .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1084 .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1085 .ENTRY
1086 .align 64
1087
1088 STD %r3,0(%sp) ; save r3
1089 STD %r4,8(%sp) ; save r4
1090 STD %r5,16(%sp) ; save r5
1091 STD %r6,24(%sp) ; save r6
1092
1093 ;
1094 ; Zero out carries
1095 ;
1096 COPY %r0,c1
1097 COPY %r0,c2
1098 COPY %r0,c3
1099
1100 LDO 128(%sp),%sp ; bump stack
1101 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
1102 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1103
1104 ;
1105 ; Load up all of the values we are going to use
1106 ;
1107 FLDD 0(a_ptr),a0
1108 FLDD 8(a_ptr),a1
1109 FLDD 16(a_ptr),a2
1110 FLDD 24(a_ptr),a3
1111 FLDD 32(a_ptr),a4
1112 FLDD 40(a_ptr),a5
1113 FLDD 48(a_ptr),a6
1114 FLDD 56(a_ptr),a7
1115
1116 SQR_ADD_C a0L,a0R,c1,c2,c3
1117 STD c1,0(r_ptr) ; r[0] = c1;
1118 COPY %r0,c1
1119
1120 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
1121 STD c2,8(r_ptr) ; r[1] = c2;
1122 COPY %r0,c2
1123
1124 SQR_ADD_C a1L,a1R,c3,c1,c2
1125 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
1126 STD c3,16(r_ptr) ; r[2] = c3;
1127 COPY %r0,c3
1128
1129 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
1130 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
1131 STD c1,24(r_ptr) ; r[3] = c1;
1132 COPY %r0,c1
1133
1134 SQR_ADD_C a2L,a2R,c2,c3,c1
1135 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
1136 SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
1137 STD c2,32(r_ptr) ; r[4] = c2;
1138 COPY %r0,c2
1139
1140 SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
1141 SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
1142 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
1143 STD c3,40(r_ptr) ; r[5] = c3;
1144 COPY %r0,c3
1145
1146 SQR_ADD_C a3L,a3R,c1,c2,c3
1147 SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
1148 SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
1149 SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
1150 STD c1,48(r_ptr) ; r[6] = c1;
1151 COPY %r0,c1
1152
1153 SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
1154 SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
1155 SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
1156 SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
1157 STD c2,56(r_ptr) ; r[7] = c2;
1158 COPY %r0,c2
1159
1160 SQR_ADD_C a4L,a4R,c3,c1,c2
1161 SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
1162 SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
1163 SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
1164 STD c3,64(r_ptr) ; r[8] = c3;
1165 COPY %r0,c3
1166
1167 SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
1168 SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
1169 SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
1170 STD c1,72(r_ptr) ; r[9] = c1;
1171 COPY %r0,c1
1172
1173 SQR_ADD_C a5L,a5R,c2,c3,c1
1174 SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
1175 SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
1176 STD c2,80(r_ptr) ; r[10] = c2;
1177 COPY %r0,c2
1178
1179 SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
1180 SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
1181 STD c3,88(r_ptr) ; r[11] = c3;
1182 COPY %r0,c3
1183
1184 SQR_ADD_C a6L,a6R,c1,c2,c3
1185 SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
1186 STD c1,96(r_ptr) ; r[12] = c1;
1187 COPY %r0,c1
1188
1189 SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
1190 STD c2,104(r_ptr) ; r[13] = c2;
1191 COPY %r0,c2
1192
1193 SQR_ADD_C a7L,a7R,c3,c1,c2
1194 STD c3, 112(r_ptr) ; r[14] = c3
1195 STD c1, 120(r_ptr) ; r[15] = c1
1196
1197 .EXIT
1198 LDD -104(%sp),%r6 ; restore r6
1199 LDD -112(%sp),%r5 ; restore r5
1200 LDD -120(%sp),%r4 ; restore r4
1201 BVE (%rp)
1202 LDD,MB -128(%sp),%r3
1203
1204 .PROCEND
1205
1206;-----------------------------------------------------------------------------
1207;
1208;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
1209; arg0 = r_ptr
1210; arg1 = a_ptr
1211;
1212
1213bn_sqr_comba4
1214 .proc
1215 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1216 .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1217 .entry
1218 .align 64
1219 STD %r3,0(%sp) ; save r3
1220 STD %r4,8(%sp) ; save r4
1221 STD %r5,16(%sp) ; save r5
1222 STD %r6,24(%sp) ; save r6
1223
1224 ;
1225 ; Zero out carries
1226 ;
1227 COPY %r0,c1
1228 COPY %r0,c2
1229 COPY %r0,c3
1230
1231 LDO 128(%sp),%sp ; bump stack
1232 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
1233 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1234
1235 ;
1236 ; Load up all of the values we are going to use
1237 ;
1238 FLDD 0(a_ptr),a0
1239 FLDD 8(a_ptr),a1
1240 FLDD 16(a_ptr),a2
1241 FLDD 24(a_ptr),a3
1242 FLDD 32(a_ptr),a4 ; XXX a4-a7 are unused by comba4 and these
1243 FLDD 40(a_ptr),a5 ; four loads read past the end of the
1244 FLDD 48(a_ptr),a6 ; 4-word input array
1245 FLDD 56(a_ptr),a7
1246
1247 SQR_ADD_C a0L,a0R,c1,c2,c3
1248
1249 STD c1,0(r_ptr) ; r[0] = c1;
1250 COPY %r0,c1
1251
1252 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
1253
1254 STD c2,8(r_ptr) ; r[1] = c2;
1255 COPY %r0,c2
1256
1257 SQR_ADD_C a1L,a1R,c3,c1,c2
1258 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
1259
1260 STD c3,16(r_ptr) ; r[2] = c3;
1261 COPY %r0,c3
1262
1263 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
1264 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
1265
1266 STD c1,24(r_ptr) ; r[3] = c1;
1267 COPY %r0,c1
1268
1269 SQR_ADD_C a2L,a2R,c2,c3,c1
1270 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
1271
1272 STD c2,32(r_ptr) ; r[4] = c2;
1273 COPY %r0,c2
1274
1275 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
1276 STD c3,40(r_ptr) ; r[5] = c3;
1277 COPY %r0,c3
1278
1279 SQR_ADD_C a3L,a3R,c1,c2,c3
1280 STD c1,48(r_ptr) ; r[6] = c1;
1281 STD c2,56(r_ptr) ; r[7] = c2;
1282
1283 .EXIT
1284 LDD -104(%sp),%r6 ; restore r6
1285 LDD -112(%sp),%r5 ; restore r5
1286 LDD -120(%sp),%r4 ; restore r4
1287 BVE (%rp)
1288 LDD,MB -128(%sp),%r3
1289
1290 .PROCEND
1291
1292
1293;---------------------------------------------------------------------------
1294
1295MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
1296 XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht
1297 FSTD ftemp1,-16(%sp) ;
1298 XMPYU A0R,B0L,ftemp2 ; m = bh*lt
1299 FSTD ftemp2,-8(%sp) ;
1300 XMPYU A0R,B0R,ftemp3 ; lt = bl*lt
1301 FSTD ftemp3,-32(%sp)
1302 XMPYU A0L,B0L,ftemp4 ; ht = bh*ht
1303 FSTD ftemp4,-24(%sp) ;
1304
1305 LDD -8(%sp),m ; load m
1306 LDD -16(%sp),m1 ; load m1
1307 ADD,L m,m1,m ; m+m1
1308
1309 DEPD,Z m,31,32,temp3 ; (m+m1)<<32
1310 LDD -24(%sp),ht ; load ht
1311
1312 CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
1313 ADD,L ht,high_one,ht ; ht+=high_one
1314
1315 EXTRD,U m,31,32,temp1 ; m >> 32
1316 LDD -32(%sp),lt ; lt
1317 ADD,L ht,temp1,ht ; ht+= m>>32
1318 ADD lt,temp3,lt ; lt = lt+m1
1319 ADD,DC ht,%r0,ht ; ht++
1320
1321 ADD C1,lt,C1 ; c1=c1+lt
1322 ADD,DC ht,%r0,ht ; add in carry (ht++)
1323
1324 ADD C2,ht,C2 ; c2 = c2 + ht
1325 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1326.endm
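;
; A hedged C model of MUL_ADD_C (our sketch; assumes unsigned
; __int128): accumulate a*b into the running column sum c3:c2:c1.
;
;   static void mul_add_c(uint64_t a, uint64_t b, uint64_t *c1,
;                         uint64_t *c2, uint64_t *c3)
;   {
;       unsigned __int128 t = (unsigned __int128)a * b;
;       uint64_t lt = (uint64_t)t, ht = (uint64_t)(t >> 64);
;       *c1 += lt;
;       if (*c1 < lt) ht++;          /* ht <= 2^64-2, cannot wrap */
;       *c2 += ht;
;       if (*c2 < ht) (*c3)++;
;   }
;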
1327
1328
1329;
1330;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1331; arg0 = r_ptr
1332; arg1 = a_ptr
1333; arg2 = b_ptr
1334;
1335
1336bn_mul_comba8
1337 .proc
1338 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1339 .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1340 .entry
1341 .align 64
1342
1343 STD %r3,0(%sp) ; save r3
1344 STD %r4,8(%sp) ; save r4
1345 STD %r5,16(%sp) ; save r5
1346 STD %r6,24(%sp) ; save r6
1347 FSTD %fr12,32(%sp) ; save fr12
1348 FSTD %fr13,40(%sp) ; save fr13
1349
1350 ;
1351 ; Zero out carries
1352 ;
1353 COPY %r0,c1
1354 COPY %r0,c2
1355 COPY %r0,c3
1356
1357 LDO 128(%sp),%sp ; bump stack
1358 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1359
1360 ;
1361 ; Load up all of the values we are going to use
1362 ;
1363 FLDD 0(a_ptr),a0
1364 FLDD 8(a_ptr),a1
1365 FLDD 16(a_ptr),a2
1366 FLDD 24(a_ptr),a3
1367 FLDD 32(a_ptr),a4
1368 FLDD 40(a_ptr),a5
1369 FLDD 48(a_ptr),a6
1370 FLDD 56(a_ptr),a7
1371
1372 FLDD 0(b_ptr),b0
1373 FLDD 8(b_ptr),b1
1374 FLDD 16(b_ptr),b2
1375 FLDD 24(b_ptr),b3
1376 FLDD 32(b_ptr),b4
1377 FLDD 40(b_ptr),b5
1378 FLDD 48(b_ptr),b6
1379 FLDD 56(b_ptr),b7
1380
1381 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
1382 STD c1,0(r_ptr)
1383 COPY %r0,c1
1384
1385 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
1386 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
1387 STD c2,8(r_ptr)
1388 COPY %r0,c2
1389
1390 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
1391 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
1392 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
1393 STD c3,16(r_ptr)
1394 COPY %r0,c3
1395
1396 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
1397 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
1398 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
1399 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
1400 STD c1,24(r_ptr)
1401 COPY %r0,c1
1402
1403 MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
1404 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
1405 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
1406 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
1407 MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
1408 STD c2,32(r_ptr)
1409 COPY %r0,c2
1410
1411 MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
1412 MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
1413 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
1414 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
1415 MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
1416 MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
1417 STD c3,40(r_ptr)
1418 COPY %r0,c3
1419
1420 MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
1421 MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
1422 MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
1423 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
1424 MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
1425 MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
1426 MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
1427 STD c1,48(r_ptr)
1428 COPY %r0,c1
1429
1430 MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
1431 MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
1432 MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
1433 MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
1434 MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
1435 MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
1436 MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
1437 MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
1438 STD c2,56(r_ptr)
1439 COPY %r0,c2
1440
1441 MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
1442 MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
1443 MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
1444 MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
1445 MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
1446 MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
1447 MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
1448 STD c3,64(r_ptr)
1449 COPY %r0,c3
1450
1451 MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
1452 MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
1453 MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
1454 MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
1455 MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
1456 MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
1457 STD c1,72(r_ptr)
1458 COPY %r0,c1
1459
1460 MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
1461 MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
1462 MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
1463 MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
1464 MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
1465 STD c2,80(r_ptr)
1466 COPY %r0,c2
1467
1468 MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
1469 MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
1470 MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
1471 MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
1472 STD c3,88(r_ptr)
1473 COPY %r0,c3
1474
1475 MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
1476 MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
1477 MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
1478 STD c1,96(r_ptr)
1479 COPY %r0,c1
1480
1481 MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
1482 MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
1483 STD c2,104(r_ptr)
1484 COPY %r0,c2
1485
1486 MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
1487 STD c3,112(r_ptr)
1488 STD c1,120(r_ptr)
1489
1490 .EXIT
1491 FLDD -88(%sp),%fr13
1492 FLDD -96(%sp),%fr12
1493 LDD -104(%sp),%r6 ; restore r6
1494 LDD -112(%sp),%r5 ; restore r5
1495 LDD -120(%sp),%r4 ; restore r4
1496 BVE (%rp)
1497 LDD,MB -128(%sp),%r3
1498
1499 .PROCEND
1500
1501;-----------------------------------------------------------------------------
1502;
1503;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1504; arg0 = r_ptr
1505; arg1 = a_ptr
1506; arg2 = b_ptr
1507;
1508
1509bn_mul_comba4
1510 .proc
1511 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1512 .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1513 .entry
1514 .align 64
1515
1516 STD %r3,0(%sp) ; save r3
1517 STD %r4,8(%sp) ; save r4
1518 STD %r5,16(%sp) ; save r5
1519 STD %r6,24(%sp) ; save r6
1520 FSTD %fr12,32(%sp) ; save fr12
1521 FSTD %fr13,40(%sp) ; save fr13
1522
1523 ;
1524 ; Zero out carries
1525 ;
1526 COPY %r0,c1
1527 COPY %r0,c2
1528 COPY %r0,c3
1529
1530 LDO 128(%sp),%sp ; bump stack
1531 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1532
1533 ;
1534 ; Load up all of the values we are going to use
1535 ;
1536 FLDD 0(a_ptr),a0
1537 FLDD 8(a_ptr),a1
1538 FLDD 16(a_ptr),a2
1539 FLDD 24(a_ptr),a3
1540
1541 FLDD 0(b_ptr),b0
1542 FLDD 8(b_ptr),b1
1543 FLDD 16(b_ptr),b2
1544 FLDD 24(b_ptr),b3
1545
1546 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
1547 STD c1,0(r_ptr)
1548 COPY %r0,c1
1549
1550 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
1551 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
1552 STD c2,8(r_ptr)
1553 COPY %r0,c2
1554
1555 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
1556 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
1557 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
1558 STD c3,16(r_ptr)
1559 COPY %r0,c3
1560
1561 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
1562 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
1563 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
1564 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
1565 STD c1,24(r_ptr)
1566 COPY %r0,c1
1567
1568 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
1569 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
1570 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
1571 STD c2,32(r_ptr)
1572 COPY %r0,c2
1573
1574 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
1575 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
1576 STD c3,40(r_ptr)
1577 COPY %r0,c3
1578
1579 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
1580 STD c1,48(r_ptr)
1581 STD c2,56(r_ptr)
1582
1583 .EXIT
1584 FLDD -88(%sp),%fr13
1585 FLDD -96(%sp),%fr12
1586 LDD -104(%sp),%r6 ; restore r6
1587 LDD -112(%sp),%r5 ; restore r5
1588 LDD -120(%sp),%r4 ; restore r4
1589 BVE (%rp)
1590 LDD,MB -128(%sp),%r3
1591
1592 .PROCEND
1593
1594
1595 .SPACE $TEXT$
1596 .SUBSPA $CODE$
1597 .SPACE $PRIVATE$,SORT=16
1598 .IMPORT $global$,DATA
1599 .SPACE $TEXT$
1600 .SUBSPA $CODE$
1601 .SUBSPA $LIT$,ACCESS=0x2c
1602C$4
1603 .ALIGN 8
1604 .STRINGZ "Division would overflow (%d)\n"
1605 .END
diff --git a/src/lib/libcrypto/bn/asm/sparcv8plus.S b/src/lib/libcrypto/bn/asm/sparcv8plus.S
deleted file mode 100644
index 02ad6069c2..0000000000
--- a/src/lib/libcrypto/bn/asm/sparcv8plus.S
+++ /dev/null
@@ -1,1558 +0,0 @@
1.ident "sparcv8plus.s, Version 1.4"
2.ident "SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
3
4/*
5 * ====================================================================
6 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
7 * project.
8 *
9 * Rights for redistribution and usage in source and binary forms are
10 * granted according to the OpenSSL license. Warranty of any kind is
11 * disclaimed.
12 * ====================================================================
13 */
14
15/*
16 * This is my modest contribution to OpenSSL project (see
17 * http://www.openssl.org/ for more information about it) and is
18 * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c
19 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
20 *
21 * Questions-n-answers.
22 *
23 * Q. How to compile?
24 * A. With SC4.x/SC5.x:
25 *
26 * cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
27 *
28 * and with gcc:
29 *
30 * gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
31 *
32 * or if the above fails (it does if you have gas installed):
33 *
34 * gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
35 *
36 * Quick-n-dirty way to fuse the module into the library.
37 * Provided that the library is already configured and built
38 * (in 0.9.2 case with no-asm option):
39 *
40 * # cd crypto/bn
41 * # cp /some/place/bn_asm.sparc.v8plus.S .
42 * # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
43 * # make
44 * # cd ../..
45 * # make; make test
46 *
47 * Quick-n-dirty way to get rid of it:
48 *
49 * # cd crypto/bn
50 * # touch bn_asm.c
51 * # make
52 * # cd ../..
53 * # make; make test
54 *
55 * Q. V8plus architecture? What kind of beast is that?
56 * A. Well, it's rather a programming model than an architecture...
57 * It's actually v9-compliant, i.e. it runs on *any* UltraSPARC CPU
58 * under special conditions, namely when the kernel doesn't preserve
59 * the upper 32 bits of otherwise 64-bit registers during a context switch.
60 *
61 * Q. Why just UltraSPARC? What about SuperSPARC?
62 * A. The original release targeted UltraSPARC only. Now a SuperSPARC
63 * version is provided alongside. Both versions share the bn_*comba[48]
64 * implementations (see comment later in code for explanation).
65 * But what's so special about this UltraSPARC implementation?
66 * Why didn't I let the compiler do the job? The trouble is that most
67 * available compilers (well, SC5.0 is the only exception) don't
68 * attempt to take advantage of UltraSPARC's 64-bitness under
69 * 32-bit kernels even though it's perfectly possible (see next
70 * question).
71 *
72 * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
73 * doesn't work?
74 * A. You can't address *all* registers as 64-bit wide:-( The catch is
75 * that you actually may rely upon %o0-%o5 and %g1-%g4 being fully
76 * preserved if you're in a leaf function, i.e. one that never calls
77 * any other functions. All functions in this module are leaf and
78 * 10 registers is a handful. As a matter of fact, the non-"comba"
79 * routines don't even require that much, and I could afford not to
80 * allocate a stack frame of their own for 'em:-)
81 *
82 * Q. What about 64-bit kernels?
83 * A. What about 'em? Just kidding:-) Pure 64-bit version is currently
84 * under evaluation and development...
85 *
86 * Q. What about shared libraries?
87 * A. What about 'em? Kidding again:-) Code does *not* contain any
88 * code position dependencies and it's safe to include it into
89 * shared library as is.
90 *
91 * Q. How much faster does it go?
92 * A. Do you have a good benchmark? In either case below is what I
93 * experience with crypto/bn/expspeed.c test program:
94 *
95 * v8plus module on U10/300MHz against bn_asm.c compiled with:
96 *
97 * cc-5.0 -xarch=v8plus -xO5 -xdepend +7-12%
98 * cc-4.2 -xarch=v8plus -xO5 -xdepend +25-35%
99 * egcs-1.1.2 -mcpu=ultrasparc -O3 +35-45%
100 *
101 * v8 module on SS10/60MHz against bn_asm.c compiled with:
102 *
103 * cc-5.0 -xarch=v8 -xO5 -xdepend +7-10%
104 * cc-4.2 -xarch=v8 -xO5 -xdepend +10%
105 * egcs-1.1.2 -mv8 -O3 +35-45%
106 *
107 * As you can see, it's damn hard to beat the new Sun C compiler,
108 * and it's first and foremost GNU C users who will appreciate this
109 * assembler implementation:-)
110 */
111
112/*
113 * Revision history.
114 *
115 * 1.0 - initial release;
116 * 1.1 - new loop unrolling model(*);
117 * - some more fine tuning;
118 * 1.2 - made gas friendly;
119 * - updates to documentation concerning v9;
120 * - new performance comparison matrix;
121 * 1.3 - fixed problem with /usr/ccs/lib/cpp;
122 * 1.4 - native V9 bn_*_comba[48] implementation (15% more efficient)
123 * resulting in slight overall performance kick;
124 * - some retunes;
125 * - support for GNU as added;
126 *
127 * (*) Originally unrolled loop looked like this:
128 * for (;;) {
129 * op(p+0); if (--n==0) break;
130 * op(p+1); if (--n==0) break;
131 * op(p+2); if (--n==0) break;
132 * op(p+3); if (--n==0) break;
133 * p+=4;
134 * }
135 * I unroll according to the following:
136 * while (n&~3) {
137 * op(p+0); op(p+1); op(p+2); op(p+3);
138 * p+=4; n-=4;
139 * }
140 * if (n) {
141 * op(p+0); if (--n==0) return;
142 * op(p+1); if (--n==0) return;
143 * op(p+2); return;
144 * }
145 */
146
147#if defined(__SUNPRO_C) && defined(__sparcv9)
148 /* They've said -xarch=v9 at command line */
149 .register %g2,#scratch
150 .register %g3,#scratch
151# define FRAME_SIZE -192
152#elif defined(__GNUC__) && defined(__arch64__)
153 /* They've said -m64 at command line */
154 .register %g2,#scratch
155 .register %g3,#scratch
156# define FRAME_SIZE -192
157#else
158# define FRAME_SIZE -96
159#endif
160/*
161 * GNU assembler can't stand stuw:-(
162 */
163#define stuw st
164
165.section ".text",#alloc,#execinstr
166.file "bn_asm.sparc.v8plus.S"
167
168.align 32
169
170.global bn_mul_add_words
171/*
172 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
173 * BN_ULONG *rp,*ap;
174 * int num;
175 * BN_ULONG w;
176 */
177bn_mul_add_words:
178 sra %o2,%g0,%o2 ! signx %o2
179 brgz,a %o2,.L_bn_mul_add_words_proceed
180 lduw [%o1],%g2
181 retl
182 clr %o0
183 nop
184 nop
185 nop
186
187.L_bn_mul_add_words_proceed:
188 srl %o3,%g0,%o3 ! clruw %o3
189 andcc %o2,-4,%g0
190 bz,pn %icc,.L_bn_mul_add_words_tail
191 clr %o5
192
193.L_bn_mul_add_words_loop: ! wow! 32 aligned!
194 lduw [%o0],%g1
195 lduw [%o1+4],%g3
196 mulx %o3,%g2,%g2
197 add %g1,%o5,%o4
198 nop
199 add %o4,%g2,%o4
200 stuw %o4,[%o0]
201 srlx %o4,32,%o5
202
203 lduw [%o0+4],%g1
204 lduw [%o1+8],%g2
205 mulx %o3,%g3,%g3
206 add %g1,%o5,%o4
207 dec 4,%o2
208 add %o4,%g3,%o4
209 stuw %o4,[%o0+4]
210 srlx %o4,32,%o5
211
212 lduw [%o0+8],%g1
213 lduw [%o1+12],%g3
214 mulx %o3,%g2,%g2
215 add %g1,%o5,%o4
216 inc 16,%o1
217 add %o4,%g2,%o4
218 stuw %o4,[%o0+8]
219 srlx %o4,32,%o5
220
221 lduw [%o0+12],%g1
222 mulx %o3,%g3,%g3
223 add %g1,%o5,%o4
224 inc 16,%o0
225 add %o4,%g3,%o4
226 andcc %o2,-4,%g0
227 stuw %o4,[%o0-4]
228 srlx %o4,32,%o5
229 bnz,a,pt %icc,.L_bn_mul_add_words_loop
230 lduw [%o1],%g2
231
232 brnz,a,pn %o2,.L_bn_mul_add_words_tail
233 lduw [%o1],%g2
234.L_bn_mul_add_words_return:
235 retl
236 mov %o5,%o0
237
238.L_bn_mul_add_words_tail:
239 lduw [%o0],%g1
240 mulx %o3,%g2,%g2
241 add %g1,%o5,%o4
242 dec %o2
243 add %o4,%g2,%o4
244 srlx %o4,32,%o5
245 brz,pt %o2,.L_bn_mul_add_words_return
246 stuw %o4,[%o0]
247
248 lduw [%o1+4],%g2
249 lduw [%o0+4],%g1
250 mulx %o3,%g2,%g2
251 add %g1,%o5,%o4
252 dec %o2
253 add %o4,%g2,%o4
254 srlx %o4,32,%o5
255 brz,pt %o2,.L_bn_mul_add_words_return
256 stuw %o4,[%o0+4]
257
258 lduw [%o1+8],%g2
259 lduw [%o0+8],%g1
260 mulx %o3,%g2,%g2
261 add %g1,%o5,%o4
262 add %o4,%g2,%o4
263 stuw %o4,[%o0+8]
264 retl
265 srlx %o4,32,%o0
266
267.type bn_mul_add_words,#function
268.size bn_mul_add_words,(.-bn_mul_add_words)
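/*
 * For reference, a hedged C model of the routine above (our sketch,
 * not the original bn_asm.c text; BN_ULONG is 32 bits wide here, and
 * the mulx/srlx pair corresponds to the 64-bit t below):
 *
 *	typedef unsigned int BN_ULONG;
 *
 *	BN_ULONG bn_mul_add_words_ref(BN_ULONG *rp, const BN_ULONG *ap,
 *	                              int num, BN_ULONG w)
 *	{
 *		BN_ULONG c = 0;
 *		while (num-- > 0) {
 *			unsigned long long t =
 *			    (unsigned long long)w * *ap++ + *rp + c;
 *			*rp++ = (BN_ULONG)t;
 *			c = (BN_ULONG)(t >> 32);
 *		}
 *		return c;
 *	}
 */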
269
270.align 32
271
272.global bn_mul_words
273/*
274 * BN_ULONG bn_mul_words(rp,ap,num,w)
275 * BN_ULONG *rp,*ap;
276 * int num;
277 * BN_ULONG w;
278 */
279bn_mul_words:
280 sra %o2,%g0,%o2 ! signx %o2
281 brgz,a %o2,.L_bn_mul_words_proceed
282 lduw [%o1],%g2
283 retl
284 clr %o0
285 nop
286 nop
287 nop
288
289.L_bn_mul_words_proceed:
290 srl %o3,%g0,%o3 ! clruw %o3
291 andcc %o2,-4,%g0
292 bz,pn %icc,.L_bn_mul_words_tail
293 clr %o5
294
295.L_bn_mul_words_loop: ! wow! 32 aligned!
296 lduw [%o1+4],%g3
297 mulx %o3,%g2,%g2
298 add %g2,%o5,%o4
299 nop
300 stuw %o4,[%o0]
301 srlx %o4,32,%o5
302
303 lduw [%o1+8],%g2
304 mulx %o3,%g3,%g3
305 add %g3,%o5,%o4
306 dec 4,%o2
307 stuw %o4,[%o0+4]
308 srlx %o4,32,%o5
309
310 lduw [%o1+12],%g3
311 mulx %o3,%g2,%g2
312 add %g2,%o5,%o4
313 inc 16,%o1
314 stuw %o4,[%o0+8]
315 srlx %o4,32,%o5
316
317 mulx %o3,%g3,%g3
318 add %g3,%o5,%o4
319 inc 16,%o0
320 stuw %o4,[%o0-4]
321 srlx %o4,32,%o5
322 andcc %o2,-4,%g0
323 bnz,a,pt %icc,.L_bn_mul_words_loop
324 lduw [%o1],%g2
325 nop
326 nop
327
328 brnz,a,pn %o2,.L_bn_mul_words_tail
329 lduw [%o1],%g2
330.L_bn_mul_words_return:
331 retl
332 mov %o5,%o0
333
334.L_bn_mul_words_tail:
335 mulx %o3,%g2,%g2
336 add %g2,%o5,%o4
337 dec %o2
338 srlx %o4,32,%o5
339 brz,pt %o2,.L_bn_mul_words_return
340 stuw %o4,[%o0]
341
342 lduw [%o1+4],%g2
343 mulx %o3,%g2,%g2
344 add %g2,%o5,%o4
345 dec %o2
346 srlx %o4,32,%o5
347 brz,pt %o2,.L_bn_mul_words_return
348 stuw %o4,[%o0+4]
349
350 lduw [%o1+8],%g2
351 mulx %o3,%g2,%g2
352 add %g2,%o5,%o4
353 stuw %o4,[%o0+8]
354 retl
355 srlx %o4,32,%o0
356
357.type bn_mul_words,#function
358.size bn_mul_words,(.-bn_mul_words)
359
360.align 32
361.global bn_sqr_words
362/*
363 * void bn_sqr_words(r,a,n)
364 * BN_ULONG *r,*a;
365 * int n;
366 */
367bn_sqr_words:
368 sra %o2,%g0,%o2 ! signx %o2
369 brgz,a %o2,.L_bn_sqr_words_proceed
370 lduw [%o1],%g2
371 retl
372 clr %o0
373 nop
374 nop
375 nop
376
377.L_bn_sqr_words_proceed:
378 andcc %o2,-4,%g0
379 nop
380 bz,pn %icc,.L_bn_sqr_words_tail
381 nop
382
383.L_bn_sqr_words_loop: ! wow! 32 aligned!
384 lduw [%o1+4],%g3
385 mulx %g2,%g2,%o4
386 stuw %o4,[%o0]
387 srlx %o4,32,%o5
388 stuw %o5,[%o0+4]
389 nop
390
391 lduw [%o1+8],%g2
392 mulx %g3,%g3,%o4
393 dec 4,%o2
394 stuw %o4,[%o0+8]
395 srlx %o4,32,%o5
396 stuw %o5,[%o0+12]
397
398 lduw [%o1+12],%g3
399 mulx %g2,%g2,%o4
400 srlx %o4,32,%o5
401 stuw %o4,[%o0+16]
402 inc 16,%o1
403 stuw %o5,[%o0+20]
404
405 mulx %g3,%g3,%o4
406 inc 32,%o0
407 stuw %o4,[%o0-8]
408 srlx %o4,32,%o5
409 andcc %o2,-4,%g2
410 stuw %o5,[%o0-4]
411 bnz,a,pt %icc,.L_bn_sqr_words_loop
412 lduw [%o1],%g2
413 nop
414
415 brnz,a,pn %o2,.L_bn_sqr_words_tail
416 lduw [%o1],%g2
417.L_bn_sqr_words_return:
418 retl
419 clr %o0
420
421.L_bn_sqr_words_tail:
422 mulx %g2,%g2,%o4
423 dec %o2
424 stuw %o4,[%o0]
425 srlx %o4,32,%o5
426 brz,pt %o2,.L_bn_sqr_words_return
427 stuw %o5,[%o0+4]
428
429 lduw [%o1+4],%g2
430 mulx %g2,%g2,%o4
431 dec %o2
432 stuw %o4,[%o0+8]
433 srlx %o4,32,%o5
434 brz,pt %o2,.L_bn_sqr_words_return
435 stuw %o5,[%o0+12]
436
437 lduw [%o1+8],%g2
438 mulx %g2,%g2,%o4
439 srlx %o4,32,%o5
440 stuw %o4,[%o0+16]
441 stuw %o5,[%o0+20]
442 retl
443 clr %o0
444
445.type bn_sqr_words,#function
446.size bn_sqr_words,(.-bn_sqr_words)
447
448.align 32
449.global bn_div_words
450/*
451 * BN_ULONG bn_div_words(h,l,d)
452 * BN_ULONG h,l,d;
453 */
454bn_div_words:
455 sllx %o0,32,%o0
456 or %o0,%o1,%o0
457 udivx %o0,%o2,%o0
458 retl
459 srl %o0,%g0,%o0 ! clruw %o0
460
461.type bn_div_words,#function
462.size bn_div_words,(.-bn_div_words)
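/*
 * A hedged C equivalent of the three-instruction body above (our
 * sketch; BN_ULONG is 32 bits wide here):
 *
 *	BN_ULONG bn_div_words_ref(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 *	{
 *		return (BN_ULONG)((((unsigned long long)h << 32) | l) / d);
 *	}
 */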
463
464.align 32
465
466.global bn_add_words
467/*
468 * BN_ULONG bn_add_words(rp,ap,bp,n)
469 * BN_ULONG *rp,*ap,*bp;
470 * int n;
471 */
472bn_add_words:
473 sra %o3,%g0,%o3 ! signx %o3
474 brgz,a %o3,.L_bn_add_words_proceed
475 lduw [%o1],%o4
476 retl
477 clr %o0
478
479.L_bn_add_words_proceed:
480 andcc %o3,-4,%g0
481 bz,pn %icc,.L_bn_add_words_tail
482 addcc %g0,0,%g0 ! clear carry flag
483
484.L_bn_add_words_loop: ! wow! 32 aligned!
485 dec 4,%o3
486 lduw [%o2],%o5
487 lduw [%o1+4],%g1
488 lduw [%o2+4],%g2
489 lduw [%o1+8],%g3
490 lduw [%o2+8],%g4
491 addccc %o5,%o4,%o5
492 stuw %o5,[%o0]
493
494 lduw [%o1+12],%o4
495 lduw [%o2+12],%o5
496 inc 16,%o1
497 addccc %g1,%g2,%g1
498 stuw %g1,[%o0+4]
499
500 inc 16,%o2
501 addccc %g3,%g4,%g3
502 stuw %g3,[%o0+8]
503
504 inc 16,%o0
505 addccc %o5,%o4,%o5
506 stuw %o5,[%o0-4]
507 and %o3,-4,%g1
508 brnz,a,pt %g1,.L_bn_add_words_loop
509 lduw [%o1],%o4
510
511 brnz,a,pn %o3,.L_bn_add_words_tail
512 lduw [%o1],%o4
513.L_bn_add_words_return:
514 clr %o0
515 retl
516 movcs %icc,1,%o0
517 nop
518
519.L_bn_add_words_tail:
520 lduw [%o2],%o5
521 dec %o3
522 addccc %o5,%o4,%o5
523 brz,pt %o3,.L_bn_add_words_return
524 stuw %o5,[%o0]
525
526 lduw [%o1+4],%o4
527 lduw [%o2+4],%o5
528 dec %o3
529 addccc %o5,%o4,%o5
530 brz,pt %o3,.L_bn_add_words_return
531 stuw %o5,[%o0+4]
532
533 lduw [%o1+8],%o4
534 lduw [%o2+8],%o5
535 addccc %o5,%o4,%o5
536 stuw %o5,[%o0+8]
537 clr %o0
538 retl
539 movcs %icc,1,%o0
540
541.type bn_add_words,#function
542.size bn_add_words,(.-bn_add_words)
543
544.global bn_sub_words
545/*
546 * BN_ULONG bn_sub_words(rp,ap,bp,n)
547 * BN_ULONG *rp,*ap,*bp;
548 * int n;
549 */
550bn_sub_words:
551 sra %o3,%g0,%o3 ! signx %o3
552 brgz,a %o3,.L_bn_sub_words_proceed
553 lduw [%o1],%o4
554 retl
555 clr %o0
556
557.L_bn_sub_words_proceed:
558 andcc %o3,-4,%g0
559 bz,pn %icc,.L_bn_sub_words_tail
560 addcc %g0,0,%g0 ! clear carry flag
561
562.L_bn_sub_words_loop: ! wow! 32 aligned!
563 dec 4,%o3
564 lduw [%o2],%o5
565 lduw [%o1+4],%g1
566 lduw [%o2+4],%g2
567 lduw [%o1+8],%g3
568 lduw [%o2+8],%g4
569 subccc %o4,%o5,%o5
570 stuw %o5,[%o0]
571
572 lduw [%o1+12],%o4
573 lduw [%o2+12],%o5
574 inc 16,%o1
575 subccc %g1,%g2,%g2
576 stuw %g2,[%o0+4]
577
578 inc 16,%o2
579 subccc %g3,%g4,%g4
580 stuw %g4,[%o0+8]
581
582 inc 16,%o0
583 subccc %o4,%o5,%o5
584 stuw %o5,[%o0-4]
585 and %o3,-4,%g1
586 brnz,a,pt %g1,.L_bn_sub_words_loop
587 lduw [%o1],%o4
588
589 brnz,a,pn %o3,.L_bn_sub_words_tail
590 lduw [%o1],%o4
591.L_bn_sub_words_return:
592 clr %o0
593 retl
594 movcs %icc,1,%o0
595 nop
596
597.L_bn_sub_words_tail: ! wow! 32 aligned!
598 lduw [%o2],%o5
599 dec %o3
600 subccc %o4,%o5,%o5
601 brz,pt %o3,.L_bn_sub_words_return
602 stuw %o5,[%o0]
603
604 lduw [%o1+4],%o4
605 lduw [%o2+4],%o5
606 dec %o3
607 subccc %o4,%o5,%o5
608 brz,pt %o3,.L_bn_sub_words_return
609 stuw %o5,[%o0+4]
610
611 lduw [%o1+8],%o4
612 lduw [%o2+8],%o5
613 subccc %o4,%o5,%o5
614 stuw %o5,[%o0+8]
615 clr %o0
616 retl
617 movcs %icc,1,%o0
618
619.type bn_sub_words,#function
620.size bn_sub_words,(.-bn_sub_words)
621
622/*
623 * Code below depends on the fact that upper parts of the %l0-%l7
624 * and %i0-%i7 are zeroed by kernel after context switch. In
625 * previous versions this comment stated that "the trouble is that
626 * it's not feasible to implement the mumbo-jumbo in less V9
627 * instructions:-(" which apparently isn't true thanks to
628 * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
629 * results not from the shorter code, but from elimination of
630 * multicycle non-pairable 'rd %y,%rd' instructions.
631 *
632 * Andy.
633 */
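/*
 * A hedged C model of that carry idiom (our sketch): c_12 holds the
 * running 64-bit column sum; the 'bcs,a %xcc,.+8; add c_3,t_2,c_3'
 * pair corresponds to the if-branch below, banking each carry out of
 * bit 63 in c_3 as a 1<<32 increment to be OR-ed back in after the
 * final srlx t_1,32.
 *
 *	static unsigned long long
 *	mul_acc_step(unsigned long long c_12, unsigned int a,
 *	             unsigned int b, unsigned long long *c_3)
 *	{
 *		unsigned long long t_1 = (unsigned long long)a * b;
 *		c_12 += t_1;
 *		if (c_12 < t_1)
 *			*c_3 += 1ULL << 32;
 *		return c_12;
 *	}
 */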
634
635/*
636 * Here is register usage map for *all* routines below.
637 */
638#define t_1 %o0
639#define t_2 %o1
640#define c_12 %o2
641#define c_3 %o3
642
643#define ap(I) [%i1+4*I]
644#define bp(I) [%i2+4*I]
645#define rp(I) [%i0+4*I]
646
647#define a_0 %l0
648#define a_1 %l1
649#define a_2 %l2
650#define a_3 %l3
651#define a_4 %l4
652#define a_5 %l5
653#define a_6 %l6
654#define a_7 %l7
655
656#define b_0 %i3
657#define b_1 %i4
658#define b_2 %i5
659#define b_3 %o4
660#define b_4 %o5
661#define b_5 %o7
662#define b_6 %g1
663#define b_7 %g4
664
665.align 32
666.global bn_mul_comba8
667/*
668 * void bn_mul_comba8(r,a,b)
669 * BN_ULONG *r,*a,*b;
670 */
671bn_mul_comba8:
672 save %sp,FRAME_SIZE,%sp
673 mov 1,t_2
674 lduw ap(0),a_0
675 sllx t_2,32,t_2
676 lduw bp(0),b_0 !=
677 lduw bp(1),b_1
678 mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3);
679 srlx t_1,32,c_12
680 stuw t_1,rp(0) !=!r[0]=c1;
681
682 lduw ap(1),a_1
683 mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1);
684 addcc c_12,t_1,c_12
685 clr c_3 !=
686 bcs,a %xcc,.+8
687 add c_3,t_2,c_3
688 lduw ap(2),a_2
689 mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
690 addcc c_12,t_1,t_1
691 bcs,a %xcc,.+8
692 add c_3,t_2,c_3
693 srlx t_1,32,c_12 !=
694 stuw t_1,rp(1) !r[1]=c2;
695 or c_12,c_3,c_12
696
697 mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
698 addcc c_12,t_1,c_12 !=
699 clr c_3
700 bcs,a %xcc,.+8
701 add c_3,t_2,c_3
702 lduw bp(2),b_2 !=
703 mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
704 addcc c_12,t_1,c_12
705 bcs,a %xcc,.+8
706 add c_3,t_2,c_3 !=
707 lduw bp(3),b_3
708 mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
709 addcc c_12,t_1,t_1
710 bcs,a %xcc,.+8 !=
711 add c_3,t_2,c_3
712 srlx t_1,32,c_12
713 stuw t_1,rp(2) !r[2]=c3;
714 or c_12,c_3,c_12 !=
715
716 mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
717 addcc c_12,t_1,c_12
718 clr c_3
719 bcs,a %xcc,.+8 !=
720 add c_3,t_2,c_3
721 mulx a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3);
722 addcc c_12,t_1,c_12
723 bcs,a %xcc,.+8 !=
724 add c_3,t_2,c_3
725 lduw ap(3),a_3
726 mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
727 addcc c_12,t_1,c_12 !=
728 bcs,a %xcc,.+8
729 add c_3,t_2,c_3
730 lduw ap(4),a_4
731 mulx a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);!=
732 addcc c_12,t_1,t_1
733 bcs,a %xcc,.+8
734 add c_3,t_2,c_3
735 srlx t_1,32,c_12 !=
736 stuw t_1,rp(3) !r[3]=c1;
737 or c_12,c_3,c_12
738
739 mulx a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1);
740 addcc c_12,t_1,c_12 !=
741 clr c_3
742 bcs,a %xcc,.+8
743 add c_3,t_2,c_3
744 mulx a_3,b_1,t_1 !=!mul_add_c(a[3],b[1],c2,c3,c1);
745 addcc c_12,t_1,c_12
746 bcs,a %xcc,.+8
747 add c_3,t_2,c_3
748 mulx a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1);
749 addcc c_12,t_1,c_12
750 bcs,a %xcc,.+8
751 add c_3,t_2,c_3
752 lduw bp(4),b_4 !=
753 mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
754 addcc c_12,t_1,c_12
755 bcs,a %xcc,.+8
756 add c_3,t_2,c_3 !=
757 lduw bp(5),b_5
758 mulx a_0,b_4,t_1 !mul_add_c(a[0],b[4],c2,c3,c1);
759 addcc c_12,t_1,t_1
760 bcs,a %xcc,.+8 !=
761 add c_3,t_2,c_3
762 srlx t_1,32,c_12
763 stuw t_1,rp(4) !r[4]=c2;
764 or c_12,c_3,c_12 !=
765
766 mulx a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2);
767 addcc c_12,t_1,c_12
768 clr c_3
769 bcs,a %xcc,.+8 !=
770 add c_3,t_2,c_3
771 mulx a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2);
772 addcc c_12,t_1,c_12
773 bcs,a %xcc,.+8 !=
774 add c_3,t_2,c_3
775 mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
776 addcc c_12,t_1,c_12
777 bcs,a %xcc,.+8 !=
778 add c_3,t_2,c_3
779 mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
780 addcc c_12,t_1,c_12
781 bcs,a %xcc,.+8 !=
782 add c_3,t_2,c_3
783 lduw ap(5),a_5
784 mulx a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2);
785 addcc c_12,t_1,c_12 !=
786 bcs,a %xcc,.+8
787 add c_3,t_2,c_3
788 lduw ap(6),a_6
789 mulx a_5,b_0,t_1 !=!mul_add_c(a[5],b[0],c3,c1,c2);
790 addcc c_12,t_1,t_1
791 bcs,a %xcc,.+8
792 add c_3,t_2,c_3
793 srlx t_1,32,c_12 !=
794 stuw t_1,rp(5) !r[5]=c3;
795 or c_12,c_3,c_12
796
797 mulx a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3);
798 addcc c_12,t_1,c_12 !=
799 clr c_3
800 bcs,a %xcc,.+8
801 add c_3,t_2,c_3
802 mulx a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3);
803 addcc c_12,t_1,c_12
804 bcs,a %xcc,.+8
805 add c_3,t_2,c_3
806 mulx a_4,b_2,t_1 !=!mul_add_c(a[4],b[2],c1,c2,c3);
807 addcc c_12,t_1,c_12
808 bcs,a %xcc,.+8
809 add c_3,t_2,c_3
810 mulx a_3,b_3,t_1 !=!mul_add_c(a[3],b[3],c1,c2,c3);
811 addcc c_12,t_1,c_12
812 bcs,a %xcc,.+8
813 add c_3,t_2,c_3
814 mulx a_2,b_4,t_1 !=!mul_add_c(a[2],b[4],c1,c2,c3);
815 addcc c_12,t_1,c_12
816 bcs,a %xcc,.+8
817 add c_3,t_2,c_3
818 lduw bp(6),b_6 !=
819 mulx a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3);
820 addcc c_12,t_1,c_12
821 bcs,a %xcc,.+8
822 add c_3,t_2,c_3 !=
823 lduw bp(7),b_7
824 mulx a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3);
825 addcc c_12,t_1,t_1
826 bcs,a %xcc,.+8 !=
827 add c_3,t_2,c_3
828 srlx t_1,32,c_12
829 stuw t_1,rp(6) !r[6]=c1;
830 or c_12,c_3,c_12 !=
831
832 mulx a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1);
833 addcc c_12,t_1,c_12
834 clr c_3
835 bcs,a %xcc,.+8 !=
836 add c_3,t_2,c_3
837 mulx a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1);
838 addcc c_12,t_1,c_12
839 bcs,a %xcc,.+8 !=
840 add c_3,t_2,c_3
841 mulx a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1);
842 addcc c_12,t_1,c_12
843 bcs,a %xcc,.+8 !=
844 add c_3,t_2,c_3
845 mulx a_3,b_4,t_1 !mul_add_c(a[3],b[4],c2,c3,c1);
846 addcc c_12,t_1,c_12
847 bcs,a %xcc,.+8 !=
848 add c_3,t_2,c_3
849 mulx a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1);
850 addcc c_12,t_1,c_12
851 bcs,a %xcc,.+8 !=
852 add c_3,t_2,c_3
853 mulx a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1);
854 addcc c_12,t_1,c_12
855 bcs,a %xcc,.+8 !=
856 add c_3,t_2,c_3
857 lduw ap(7),a_7
858 mulx a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1);
859 addcc c_12,t_1,c_12
860 bcs,a %xcc,.+8
861 add c_3,t_2,c_3
862 mulx a_7,b_0,t_1 !=!mul_add_c(a[7],b[0],c2,c3,c1);
863 addcc c_12,t_1,t_1
864 bcs,a %xcc,.+8
865 add c_3,t_2,c_3
866 srlx t_1,32,c_12 !=
867 stuw t_1,rp(7) !r[7]=c2;
868 or c_12,c_3,c_12
869
870 mulx a_7,b_1,t_1 !=!mul_add_c(a[7],b[1],c3,c1,c2);
871 addcc c_12,t_1,c_12
872 clr c_3
873 bcs,a %xcc,.+8
874 add c_3,t_2,c_3 !=
875 mulx a_6,b_2,t_1 !mul_add_c(a[6],b[2],c3,c1,c2);
876 addcc c_12,t_1,c_12
877 bcs,a %xcc,.+8
878 add c_3,t_2,c_3 !=
879 mulx a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2);
880 addcc c_12,t_1,c_12
881 bcs,a %xcc,.+8
882 add c_3,t_2,c_3 !=
883 mulx a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2);
884 addcc c_12,t_1,c_12
885 bcs,a %xcc,.+8
886 add c_3,t_2,c_3 !=
887 mulx a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2);
888 addcc c_12,t_1,c_12
889 bcs,a %xcc,.+8
890 add c_3,t_2,c_3 !=
891 mulx a_2,b_6,t_1 !mul_add_c(a[2],b[6],c3,c1,c2);
892 addcc c_12,t_1,c_12
893 bcs,a %xcc,.+8
894 add c_3,t_2,c_3 !=
895 mulx a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2);
896 addcc c_12,t_1,t_1
897 bcs,a %xcc,.+8
898 add c_3,t_2,c_3 !=
899 srlx t_1,32,c_12
900 stuw t_1,rp(8) !r[8]=c3;
901 or c_12,c_3,c_12
902
903 mulx a_2,b_7,t_1 !=!mul_add_c(a[2],b[7],c1,c2,c3);
904 addcc c_12,t_1,c_12
905 clr c_3
906 bcs,a %xcc,.+8
907 add c_3,t_2,c_3 !=
908 mulx a_3,b_6,t_1 !mul_add_c(a[3],b[6],c1,c2,c3);
909 addcc c_12,t_1,c_12
910 bcs,a %xcc,.+8 !=
911 add c_3,t_2,c_3
912 mulx a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3);
913 addcc c_12,t_1,c_12
914 bcs,a %xcc,.+8 !=
915 add c_3,t_2,c_3
916 mulx a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3);
917 addcc c_12,t_1,c_12
918 bcs,a %xcc,.+8 !=
919 add c_3,t_2,c_3
920 mulx a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3);
921 addcc c_12,t_1,c_12
922 bcs,a %xcc,.+8 !=
923 add c_3,t_2,c_3
924 mulx a_7,b_2,t_1 !mul_add_c(a[7],b[2],c1,c2,c3);
925 addcc c_12,t_1,t_1
926 bcs,a %xcc,.+8 !=
927 add c_3,t_2,c_3
928 srlx t_1,32,c_12
929 stuw t_1,rp(9) !r[9]=c1;
930 or c_12,c_3,c_12 !=
931
932 mulx a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1);
933 addcc c_12,t_1,c_12
934 clr c_3
935 bcs,a %xcc,.+8 !=
936 add c_3,t_2,c_3
937 mulx a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1);
938 addcc c_12,t_1,c_12
939 bcs,a %xcc,.+8 !=
940 add c_3,t_2,c_3
941 mulx a_5,b_5,t_1 !mul_add_c(a[5],b[5],c2,c3,c1);
942 addcc c_12,t_1,c_12
943 bcs,a %xcc,.+8 !=
944 add c_3,t_2,c_3
945 mulx a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1);
946 addcc c_12,t_1,c_12
947 bcs,a %xcc,.+8 !=
948 add c_3,t_2,c_3
949 mulx a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1);
950 addcc c_12,t_1,t_1
951 bcs,a %xcc,.+8 !=
952 add c_3,t_2,c_3
953 srlx t_1,32,c_12
954 stuw t_1,rp(10) !r[10]=c2;
955 or c_12,c_3,c_12 !=
956
957 mulx a_4,b_7,t_1 !mul_add_c(a[4],b[7],c3,c1,c2);
958 addcc c_12,t_1,c_12
959 clr c_3
960 bcs,a %xcc,.+8 !=
961 add c_3,t_2,c_3
962 mulx a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2);
963 addcc c_12,t_1,c_12
964 bcs,a %xcc,.+8 !=
965 add c_3,t_2,c_3
966 mulx a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2);
967 addcc c_12,t_1,c_12
968 bcs,a %xcc,.+8 !=
969 add c_3,t_2,c_3
970 mulx a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2);
971 addcc c_12,t_1,t_1
972 bcs,a %xcc,.+8 !=
973 add c_3,t_2,c_3
974 srlx t_1,32,c_12
975 stuw t_1,rp(11) !r[11]=c3;
976 or c_12,c_3,c_12 !=
977
978 mulx a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3);
979 addcc c_12,t_1,c_12
980 clr c_3
981 bcs,a %xcc,.+8 !=
982 add c_3,t_2,c_3
983 mulx a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3);
984 addcc c_12,t_1,c_12
985 bcs,a %xcc,.+8 !=
986 add c_3,t_2,c_3
987 mulx a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3);
988 addcc c_12,t_1,t_1
989 bcs,a %xcc,.+8 !=
990 add c_3,t_2,c_3
991 srlx t_1,32,c_12
992 stuw t_1,rp(12) !r[12]=c1;
993 or c_12,c_3,c_12 !=
994
995 mulx a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1);
996 addcc c_12,t_1,c_12
997 clr c_3
998 bcs,a %xcc,.+8 !=
999 add c_3,t_2,c_3
1000 mulx a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1);
1001 addcc c_12,t_1,t_1
1002 bcs,a %xcc,.+8 !=
1003 add c_3,t_2,c_3
1004 srlx t_1,32,c_12
1005 stuw t_1,rp(13) !r[13]=c2;
1006 or c_12,c_3,c_12 !=
1007
1008 mulx a_7,b_7,t_1 !mul_add_c(a[7],b[7],c3,c1,c2);
1009 addcc c_12,t_1,t_1
1010 srlx t_1,32,c_12 !=
1011 stuw t_1,rp(14) !r[14]=c3;
1012 stuw c_12,rp(15) !r[15]=c1;
1013
1014 ret
1015 restore %g0,%g0,%o0 !=
1016
1017.type bn_mul_comba8,#function
1018.size bn_mul_comba8,(.-bn_mul_comba8)
1019
1020.align 32
1021
1022.global bn_mul_comba4
1023/*
1024 * void bn_mul_comba4(r,a,b)
1025 * BN_ULONG *r,*a,*b;
1026 */
1027bn_mul_comba4:
1028 save %sp,FRAME_SIZE,%sp
1029 lduw ap(0),a_0
1030 mov 1,t_2
1031 lduw bp(0),b_0
1032 sllx t_2,32,t_2 !=
1033 lduw bp(1),b_1
1034 mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3);
1035 srlx t_1,32,c_12
1036 stuw t_1,rp(0) !=!r[0]=c1;
1037
1038 lduw ap(1),a_1
1039 mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1);
1040 addcc c_12,t_1,c_12
1041 clr c_3 !=
1042 bcs,a %xcc,.+8
1043 add c_3,t_2,c_3
1044 lduw ap(2),a_2
1045 mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
1046 addcc c_12,t_1,t_1
1047 bcs,a %xcc,.+8
1048 add c_3,t_2,c_3
1049 srlx t_1,32,c_12 !=
1050 stuw t_1,rp(1) !r[1]=c2;
1051 or c_12,c_3,c_12
1052
1053 mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
1054 addcc c_12,t_1,c_12 !=
1055 clr c_3
1056 bcs,a %xcc,.+8
1057 add c_3,t_2,c_3
1058 lduw bp(2),b_2 !=
1059 mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
1060 addcc c_12,t_1,c_12
1061 bcs,a %xcc,.+8
1062 add c_3,t_2,c_3 !=
1063 lduw bp(3),b_3
1064 mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
1065 addcc c_12,t_1,t_1
1066 bcs,a %xcc,.+8 !=
1067 add c_3,t_2,c_3
1068 srlx t_1,32,c_12
1069 stuw t_1,rp(2) !r[2]=c3;
1070 or c_12,c_3,c_12 !=
1071
1072 mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
1073 addcc c_12,t_1,c_12
1074 clr c_3
1075 bcs,a %xcc,.+8 !=
1076 add c_3,t_2,c_3
1077 mulx a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3);
1078 addcc c_12,t_1,c_12
1079 bcs,a %xcc,.+8 !=
1080 add c_3,t_2,c_3
1081 lduw ap(3),a_3
1082 mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
1083 addcc c_12,t_1,c_12 !=
1084 bcs,a %xcc,.+8
1085 add c_3,t_2,c_3
1086 mulx a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!=
1087 addcc c_12,t_1,t_1 !=
1088 bcs,a %xcc,.+8
1089 add c_3,t_2,c_3
1090 srlx t_1,32,c_12
1091 stuw t_1,rp(3) !=!r[3]=c1;
1092 or c_12,c_3,c_12
1093
1094 mulx a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
1095 addcc c_12,t_1,c_12
1096 clr c_3 !=
1097 bcs,a %xcc,.+8
1098 add c_3,t_2,c_3
1099 mulx a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1);
1100 addcc c_12,t_1,c_12 !=
1101 bcs,a %xcc,.+8
1102 add c_3,t_2,c_3
1103 mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
1104 addcc c_12,t_1,t_1 !=
1105 bcs,a %xcc,.+8
1106 add c_3,t_2,c_3
1107 srlx t_1,32,c_12
1108 stuw t_1,rp(4) !=!r[4]=c2;
1109 or c_12,c_3,c_12
1110
1111 mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
1112 addcc c_12,t_1,c_12
1113 clr c_3 !=
1114 bcs,a %xcc,.+8
1115 add c_3,t_2,c_3
1116 mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
1117 addcc c_12,t_1,t_1 !=
1118 bcs,a %xcc,.+8
1119 add c_3,t_2,c_3
1120 srlx t_1,32,c_12
1121 stuw t_1,rp(5) !=!r[5]=c3;
1122 or c_12,c_3,c_12
1123
1124 mulx a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
1125 addcc c_12,t_1,t_1
1126 srlx t_1,32,c_12 !=
1127 stuw t_1,rp(6) !r[6]=c1;
1128 stuw c_12,rp(7) !r[7]=c2;
1129
1130 ret
1131 restore %g0,%g0,%o0
1132
1133.type bn_mul_comba4,#function
1134.size bn_mul_comba4,(.-bn_mul_comba4)
1135
1136.align 32
1137
1138.global bn_sqr_comba8
1139bn_sqr_comba8:
1140 save %sp,FRAME_SIZE,%sp
1141 mov 1,t_2
1142 lduw ap(0),a_0
1143 sllx t_2,32,t_2
1144 lduw ap(1),a_1
1145 mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3);
1146 srlx t_1,32,c_12
1147 stuw t_1,rp(0) !r[0]=c1;
1148
1149 lduw ap(2),a_2
1150 mulx a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
1151 addcc c_12,t_1,c_12
1152 clr c_3
1153 bcs,a %xcc,.+8
1154 add c_3,t_2,c_3
1155 addcc c_12,t_1,t_1
1156 bcs,a %xcc,.+8
1157 add c_3,t_2,c_3
1158 srlx t_1,32,c_12
1159 stuw t_1,rp(1) !r[1]=c2;
1160 or c_12,c_3,c_12
1161
1162 mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1163 addcc c_12,t_1,c_12
1164 clr c_3
1165 bcs,a %xcc,.+8
1166 add c_3,t_2,c_3
1167 addcc c_12,t_1,c_12
1168 bcs,a %xcc,.+8
1169 add c_3,t_2,c_3
1170 lduw ap(3),a_3
1171 mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1172 addcc c_12,t_1,t_1
1173 bcs,a %xcc,.+8
1174 add c_3,t_2,c_3
1175 srlx t_1,32,c_12
1176 stuw t_1,rp(2) !r[2]=c3;
1177 or c_12,c_3,c_12
1178
1179 mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
1180 addcc c_12,t_1,c_12
1181 clr c_3
1182 bcs,a %xcc,.+8
1183 add c_3,t_2,c_3
1184 addcc c_12,t_1,c_12
1185 bcs,a %xcc,.+8
1186 add c_3,t_2,c_3
1187 lduw ap(4),a_4
1188 mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1189 addcc c_12,t_1,c_12
1190 bcs,a %xcc,.+8
1191 add c_3,t_2,c_3
1192 addcc c_12,t_1,t_1
1193 bcs,a %xcc,.+8
1194 add c_3,t_2,c_3
1195 srlx t_1,32,c_12
1196 st t_1,rp(3) !r[3]=c1;
1197 or c_12,c_3,c_12
1198
1199 mulx a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1);
1200 addcc c_12,t_1,c_12
1201 clr c_3
1202 bcs,a %xcc,.+8
1203 add c_3,t_2,c_3
1204 addcc c_12,t_1,c_12
1205 bcs,a %xcc,.+8
1206 add c_3,t_2,c_3
1207 mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1208 addcc c_12,t_1,c_12
1209 bcs,a %xcc,.+8
1210 add c_3,t_2,c_3
1211 addcc c_12,t_1,c_12
1212 bcs,a %xcc,.+8
1213 add c_3,t_2,c_3
1214 lduw ap(5),a_5
1215 mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1216 addcc c_12,t_1,t_1
1217 bcs,a %xcc,.+8
1218 add c_3,t_2,c_3
1219 srlx t_1,32,c_12
1220 stuw t_1,rp(4) !r[4]=c2;
1221 or c_12,c_3,c_12
1222
1223 mulx a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2);
1224 addcc c_12,t_1,c_12
1225 clr c_3
1226 bcs,a %xcc,.+8
1227 add c_3,t_2,c_3
1228 addcc c_12,t_1,c_12
1229 bcs,a %xcc,.+8
1230 add c_3,t_2,c_3
1231 mulx a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2);
1232 addcc c_12,t_1,c_12
1233 bcs,a %xcc,.+8
1234 add c_3,t_2,c_3
1235 addcc c_12,t_1,c_12
1236 bcs,a %xcc,.+8
1237 add c_3,t_2,c_3
1238 lduw ap(6),a_6
1239 mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
1240 addcc c_12,t_1,c_12
1241 bcs,a %xcc,.+8
1242 add c_3,t_2,c_3
1243 addcc c_12,t_1,t_1
1244 bcs,a %xcc,.+8
1245 add c_3,t_2,c_3
1246 srlx t_1,32,c_12
1247 stuw t_1,rp(5) !r[5]=c3;
1248 or c_12,c_3,c_12
1249
1250 mulx a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3);
1251 addcc c_12,t_1,c_12
1252 clr c_3
1253 bcs,a %xcc,.+8
1254 add c_3,t_2,c_3
1255 addcc c_12,t_1,c_12
1256 bcs,a %xcc,.+8
1257 add c_3,t_2,c_3
1258 mulx a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3);
1259 addcc c_12,t_1,c_12
1260 bcs,a %xcc,.+8
1261 add c_3,t_2,c_3
1262 addcc c_12,t_1,c_12
1263 bcs,a %xcc,.+8
1264 add c_3,t_2,c_3
1265 mulx a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3);
1266 addcc c_12,t_1,c_12
1267 bcs,a %xcc,.+8
1268 add c_3,t_2,c_3
1269 addcc c_12,t_1,c_12
1270 bcs,a %xcc,.+8
1271 add c_3,t_2,c_3
1272 lduw ap(7),a_7
1273 mulx a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3);
1274 addcc c_12,t_1,t_1
1275 bcs,a %xcc,.+8
1276 add c_3,t_2,c_3
1277 srlx t_1,32,c_12
1278 stuw t_1,rp(6) !r[6]=c1;
1279 or c_12,c_3,c_12
1280
1281 mulx a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1);
1282 addcc c_12,t_1,c_12
1283 clr c_3
1284 bcs,a %xcc,.+8
1285 add c_3,t_2,c_3
1286 addcc c_12,t_1,c_12
1287 bcs,a %xcc,.+8
1288 add c_3,t_2,c_3
1289 mulx a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1);
1290 addcc c_12,t_1,c_12
1291 bcs,a %xcc,.+8
1292 add c_3,t_2,c_3
1293 addcc c_12,t_1,c_12
1294 bcs,a %xcc,.+8
1295 add c_3,t_2,c_3
1296 mulx a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1);
1297 addcc c_12,t_1,c_12
1298 bcs,a %xcc,.+8
1299 add c_3,t_2,c_3
1300 addcc c_12,t_1,c_12
1301 bcs,a %xcc,.+8
1302 add c_3,t_2,c_3
1303 mulx a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1);
1304 addcc c_12,t_1,c_12
1305 bcs,a %xcc,.+8
1306 add c_3,t_2,c_3
1307 addcc c_12,t_1,t_1
1308 bcs,a %xcc,.+8
1309 add c_3,t_2,c_3
1310 srlx t_1,32,c_12
1311 stuw t_1,rp(7) !r[7]=c2;
1312 or c_12,c_3,c_12
1313
1314 mulx a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2);
1315 addcc c_12,t_1,c_12
1316 clr c_3
1317 bcs,a %xcc,.+8
1318 add c_3,t_2,c_3
1319 addcc c_12,t_1,c_12
1320 bcs,a %xcc,.+8
1321 add c_3,t_2,c_3
1322 mulx a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2);
1323 addcc c_12,t_1,c_12
1324 bcs,a %xcc,.+8
1325 add c_3,t_2,c_3
1326 addcc c_12,t_1,c_12
1327 bcs,a %xcc,.+8
1328 add c_3,t_2,c_3
1329 mulx a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2);
1330 addcc c_12,t_1,c_12
1331 bcs,a %xcc,.+8
1332 add c_3,t_2,c_3
1333 addcc c_12,t_1,c_12
1334 bcs,a %xcc,.+8
1335 add c_3,t_2,c_3
1336 mulx a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2);
1337 addcc c_12,t_1,t_1
1338 bcs,a %xcc,.+8
1339 add c_3,t_2,c_3
1340 srlx t_1,32,c_12
1341 stuw t_1,rp(8) !r[8]=c3;
1342 or c_12,c_3,c_12
1343
1344 mulx a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3);
1345 addcc c_12,t_1,c_12
1346 clr c_3
1347 bcs,a %xcc,.+8
1348 add c_3,t_2,c_3
1349 addcc c_12,t_1,c_12
1350 bcs,a %xcc,.+8
1351 add c_3,t_2,c_3
1352 mulx a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3);
1353 addcc c_12,t_1,c_12
1354 bcs,a %xcc,.+8
1355 add c_3,t_2,c_3
1356 addcc c_12,t_1,c_12
1357 bcs,a %xcc,.+8
1358 add c_3,t_2,c_3
1359 mulx a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3);
1360 addcc c_12,t_1,c_12
1361 bcs,a %xcc,.+8
1362 add c_3,t_2,c_3
1363 addcc c_12,t_1,t_1
1364 bcs,a %xcc,.+8
1365 add c_3,t_2,c_3
1366 srlx t_1,32,c_12
1367 stuw t_1,rp(9) !r[9]=c1;
1368 or c_12,c_3,c_12
1369
1370 mulx a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1);
1371 addcc c_12,t_1,c_12
1372 clr c_3
1373 bcs,a %xcc,.+8
1374 add c_3,t_2,c_3
1375 addcc c_12,t_1,c_12
1376 bcs,a %xcc,.+8
1377 add c_3,t_2,c_3
1378 mulx a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1);
1379 addcc c_12,t_1,c_12
1380 bcs,a %xcc,.+8
1381 add c_3,t_2,c_3
1382 addcc c_12,t_1,c_12
1383 bcs,a %xcc,.+8
1384 add c_3,t_2,c_3
1385 mulx a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1);
1386 addcc c_12,t_1,t_1
1387 bcs,a %xcc,.+8
1388 add c_3,t_2,c_3
1389 srlx t_1,32,c_12
1390 stuw t_1,rp(10) !r[10]=c2;
1391 or c_12,c_3,c_12
1392
1393 mulx a_4,a_7,t_1 !sqr_add_c2(a,7,4,c3,c1,c2);
1394 addcc c_12,t_1,c_12
1395 clr c_3
1396 bcs,a %xcc,.+8
1397 add c_3,t_2,c_3
1398 addcc c_12,t_1,c_12
1399 bcs,a %xcc,.+8
1400 add c_3,t_2,c_3
1401 mulx a_5,a_6,t_1 !sqr_add_c2(a,6,5,c3,c1,c2);
1402 addcc c_12,t_1,c_12
1403 bcs,a %xcc,.+8
1404 add c_3,t_2,c_3
1405 addcc c_12,t_1,t_1
1406 bcs,a %xcc,.+8
1407 add c_3,t_2,c_3
1408 srlx t_1,32,c_12
1409 stuw t_1,rp(11) !r[11]=c3;
1410 or c_12,c_3,c_12
1411
1412 mulx a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3);
1413 addcc c_12,t_1,c_12
1414 clr c_3
1415 bcs,a %xcc,.+8
1416 add c_3,t_2,c_3
1417 addcc c_12,t_1,c_12
1418 bcs,a %xcc,.+8
1419 add c_3,t_2,c_3
1420 mulx a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3);
1421 addcc c_12,t_1,t_1
1422 bcs,a %xcc,.+8
1423 add c_3,t_2,c_3
1424 srlx t_1,32,c_12
1425 stuw t_1,rp(12) !r[12]=c1;
1426 or c_12,c_3,c_12
1427
1428 mulx a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1);
1429 addcc c_12,t_1,c_12
1430 clr c_3
1431 bcs,a %xcc,.+8
1432 add c_3,t_2,c_3
1433 addcc c_12,t_1,t_1
1434 bcs,a %xcc,.+8
1435 add c_3,t_2,c_3
1436 srlx t_1,32,c_12
1437 stuw t_1,rp(13) !r[13]=c2;
1438 or c_12,c_3,c_12
1439
1440 mulx a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2);
1441 addcc c_12,t_1,t_1
1442 srlx t_1,32,c_12
1443 stuw t_1,rp(14) !r[14]=c3;
1444 stuw c_12,rp(15) !r[15]=c1;
1445
1446 ret
1447 restore %g0,%g0,%o0
1448
1449.type bn_sqr_comba8,#function
1450.size bn_sqr_comba8,(.-bn_sqr_comba8)
1451
1452.align 32
1453
1454.global bn_sqr_comba4
1455/*
1456 * void bn_sqr_comba4(r,a)
1457 * BN_ULONG *r,*a;
1458 */
1459bn_sqr_comba4:
1460 save %sp,FRAME_SIZE,%sp
1461 mov 1,t_2
1462 lduw ap(0),a_0
1463 sllx t_2,32,t_2
1464 lduw ap(1),a_1
1465 mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3);
1466 srlx t_1,32,c_12
1467 stuw t_1,rp(0) !r[0]=c1;
1468
1469 lduw ap(2),a_2
1470 mulx a_0,a_1,t_1 !sqr_add_c2(a,1,0,c2,c3,c1);
1471 addcc c_12,t_1,c_12
1472 clr c_3
1473 bcs,a %xcc,.+8
1474 add c_3,t_2,c_3
1475 addcc c_12,t_1,t_1
1476 bcs,a %xcc,.+8
1477 add c_3,t_2,c_3
1478 srlx t_1,32,c_12
1479 stuw t_1,rp(1) !r[1]=c2;
1480 or c_12,c_3,c_12
1481
1482 mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1483 addcc c_12,t_1,c_12
1484 clr c_3
1485 bcs,a %xcc,.+8
1486 add c_3,t_2,c_3
1487 addcc c_12,t_1,c_12
1488 bcs,a %xcc,.+8
1489 add c_3,t_2,c_3
1490 lduw ap(3),a_3
1491 mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1492 addcc c_12,t_1,t_1
1493 bcs,a %xcc,.+8
1494 add c_3,t_2,c_3
1495 srlx t_1,32,c_12
1496 stuw t_1,rp(2) !r[2]=c3;
1497 or c_12,c_3,c_12
1498
1499 mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
1500 addcc c_12,t_1,c_12
1501 clr c_3
1502 bcs,a %xcc,.+8
1503 add c_3,t_2,c_3
1504 addcc c_12,t_1,c_12
1505 bcs,a %xcc,.+8
1506 add c_3,t_2,c_3
1507 mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1508 addcc c_12,t_1,c_12
1509 bcs,a %xcc,.+8
1510 add c_3,t_2,c_3
1511 addcc c_12,t_1,t_1
1512 bcs,a %xcc,.+8
1513 add c_3,t_2,c_3
1514 srlx t_1,32,c_12
1515 stuw t_1,rp(3) !r[3]=c1;
1516 or c_12,c_3,c_12
1517
1518 mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1519 addcc c_12,t_1,c_12
1520 clr c_3
1521 bcs,a %xcc,.+8
1522 add c_3,t_2,c_3
1523 addcc c_12,t_1,c_12
1524 bcs,a %xcc,.+8
1525 add c_3,t_2,c_3
1526 mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1527 addcc c_12,t_1,t_1
1528 bcs,a %xcc,.+8
1529 add c_3,t_2,c_3
1530 srlx t_1,32,c_12
1531 stuw t_1,rp(4) !r[4]=c2;
1532 or c_12,c_3,c_12
1533
1534 mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
1535 addcc c_12,t_1,c_12
1536 clr c_3
1537 bcs,a %xcc,.+8
1538 add c_3,t_2,c_3
1539 addcc c_12,t_1,t_1
1540 bcs,a %xcc,.+8
1541 add c_3,t_2,c_3
1542 srlx t_1,32,c_12
1543 stuw t_1,rp(5) !r[5]=c3;
1544 or c_12,c_3,c_12
1545
1546 mulx a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3);
1547 addcc c_12,t_1,t_1
1548 srlx t_1,32,c_12
1549 stuw t_1,rp(6) !r[6]=c1;
1550 stuw c_12,rp(7) !r[7]=c2;
1551
1552 ret
1553 restore %g0,%g0,%o0
1554
1555.type bn_sqr_comba4,#function
1556.size bn_sqr_comba4,(.-bn_sqr_comba4)
1557
1558.align 32
diff --git a/src/lib/libcrypto/bn/asm/sparcv9-mont.pl b/src/lib/libcrypto/bn/asm/sparcv9-mont.pl
deleted file mode 100644
index 610ec1a968..0000000000
--- a/src/lib/libcrypto/bn/asm/sparcv9-mont.pl
+++ /dev/null
@@ -1,604 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# December 2005
11#
12# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
13# for undertaking this effort are multiple. First of all, UltraSPARC is not
14# the whole SPARCv9 universe and other VIS-free implementations deserve
15# optimized code as much. Secondly, the newly introduced UltraSPARC T1,
16# a.k.a. Niagara, has a shared FPU, and concurrent FPU-intensive paths
17# such as sparcv9a-mont will simply sink it. Yes, T1 is equipped with
18# several integrated RSA/DSA accelerator circuits accessible through
19# a kernel driver [only(*)], but having a decent user-land software
20# implementation is important too. Finally, there was the desire to
21# experiment with a dedicated squaring procedure. Yes, this module
22# implements one, because it was easiest to draft it in SPARCv9
23# instructions...
24
25# (*) An Engine accessing the driver in question is on my TODO list.
26# For reference, the accelerator is estimated to give 6 to 10 times
27# improvement on single-threaded RSA sign. It should be noted
28# that the 6-10x improvement coefficient does not actually mean
29# anything extraordinary in terms of absolute [single-threaded]
30# performance, as the SPARCv9 instruction set is by all means the least
31# suitable for high-performance crypto among 64-bit
32# platforms. A 6-10x factor simply places T1 in the same performance
33# domain as, say, AMD64 and IA-64. The improvement on RSA verify doesn't
34# appear impressive at all, but it's the sign operation which is
35# far more critical/interesting.
36
37# You might notice that inner loops are modulo-scheduled:-) This has
38# essentially negligible impact on UltraSPARC performance, it's
39# Fujitsu SPARC64 V users who should notice and hopefully appreciate
40# the advantage... Currently this module surpasses sparcv9a-mont.pl
41# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
42# module still has hidden potential [see the TODO list there], which is
43# estimated to be larger than 20%...
44
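# ----------------------------------------------------------------------
# Editor's note (not part of the original module): a minimal reference
# for the arithmetic bn_mul_mont implements -- word-by-word Montgomery
# reduction with R = 2^(32*num) and n0 = -n^-1 mod 2^32, the same n0[0]
# the code below loads.  The sub name and the Math::BigInt plumbing are
# illustrative assumptions, not anything this module defines.
use Math::BigInt;
sub mont_mul_ref {
	my ($a, $b, $n, $n0, $num) = @_;	# Math::BigInt operands
	my $w = Math::BigInt->new(2)->bpow(32);	# one 32-bit word
	my $t = $a * $b;
	for (1 .. $num) {			# retire one word per pass
		my $m = (($t % $w) * $n0) % $w;	# m = t[0]*n0 mod 2^32
		$t = ($t + $m * $n) / $w;	# exact: the low word cancels
	}
	$t -= $n if $t >= $n;			# conditional final subtraction
	return $t;				# a*b*R^-1 mod n
}
# ----------------------------------------------------------------------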
45# int bn_mul_mont(
46$rp="%i0"; # BN_ULONG *rp,
47$ap="%i1"; # const BN_ULONG *ap,
48$bp="%i2"; # const BN_ULONG *bp,
49$np="%i3"; # const BN_ULONG *np,
50$n0="%i4"; # const BN_ULONG *n0,
51$num="%i5"; # int num);
52
53$bits=32;
54for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
55if ($bits==64) { $bias=2047; $frame=192; }
56else { $bias=0; $frame=128; }
57
58$car0="%o0";
59$car1="%o1";
60$car2="%o2"; # 1 bit
61$acc0="%o3";
62$acc1="%o4";
63$mask="%g1"; # 32 bits, what a waste...
64$tmp0="%g4";
65$tmp1="%g5";
66
67$i="%l0";
68$j="%l1";
69$mul0="%l2";
70$mul1="%l3";
71$tp="%l4";
72$apj="%l5";
73$npj="%l6";
74$tpj="%l7";
75
76$fname="bn_mul_mont_int";
77
78$code=<<___;
79.section ".text",#alloc,#execinstr
80
81.global $fname
82.align 32
83$fname:
84 cmp %o5,4 ! 128 bits minimum
85 bge,pt %icc,.Lenter
86 sethi %hi(0xffffffff),$mask
87 retl
88 clr %o0
89.align 32
90.Lenter:
91 save %sp,-$frame,%sp
92 sll $num,2,$num ! num*=4
93 or $mask,%lo(0xffffffff),$mask
94 ld [$n0],$n0
95 cmp $ap,$bp
96 and $num,$mask,$num
97 ld [$bp],$mul0 ! bp[0]
98 nop
99
100 add %sp,$bias,%o7 ! real top of stack
101 ld [$ap],$car0 ! ap[0] ! redundant in squaring context
102 sub %o7,$num,%o7
103 ld [$ap+4],$apj ! ap[1]
104 and %o7,-1024,%o7
105 ld [$np],$car1 ! np[0]
106 sub %o7,$bias,%sp ! alloca
107 ld [$np+4],$npj ! np[1]
108 be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
109 mov 12,$j
110
111 mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
112 mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
113 and $car0,$mask,$acc0
114 add %sp,$bias+$frame,$tp
115 ld [$ap+8],$apj !prologue!
116
117 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
118 and $mul1,$mask,$mul1
119
120 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
121 mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
122 srlx $car0,32,$car0
123 add $acc0,$car1,$car1
124 ld [$np+8],$npj !prologue!
125 srlx $car1,32,$car1
126 mov $tmp0,$acc0 !prologue!
127
128.L1st:
129 mulx $apj,$mul0,$tmp0
130 mulx $npj,$mul1,$tmp1
131 add $acc0,$car0,$car0
132 ld [$ap+$j],$apj ! ap[j]
133 and $car0,$mask,$acc0
134 add $acc1,$car1,$car1
135 ld [$np+$j],$npj ! np[j]
136 srlx $car0,32,$car0
137 add $acc0,$car1,$car1
138 add $j,4,$j ! j++
139 mov $tmp0,$acc0
140 st $car1,[$tp]
141 cmp $j,$num
142 mov $tmp1,$acc1
143 srlx $car1,32,$car1
144 bl %icc,.L1st
145 add $tp,4,$tp ! tp++
146!.L1st
147
148 mulx $apj,$mul0,$tmp0 !epilogue!
149 mulx $npj,$mul1,$tmp1
150 add $acc0,$car0,$car0
151 and $car0,$mask,$acc0
152 add $acc1,$car1,$car1
153 srlx $car0,32,$car0
154 add $acc0,$car1,$car1
155 st $car1,[$tp]
156 srlx $car1,32,$car1
157
158 add $tmp0,$car0,$car0
159 and $car0,$mask,$acc0
160 add $tmp1,$car1,$car1
161 srlx $car0,32,$car0
162 add $acc0,$car1,$car1
163 st $car1,[$tp+4]
164 srlx $car1,32,$car1
165
166 add $car0,$car1,$car1
167 st $car1,[$tp+8]
168 srlx $car1,32,$car2
169
170 mov 4,$i ! i++
171 ld [$bp+4],$mul0 ! bp[1]
172.Louter:
173 add %sp,$bias+$frame,$tp
174 ld [$ap],$car0 ! ap[0]
175 ld [$ap+4],$apj ! ap[1]
176 ld [$np],$car1 ! np[0]
177 ld [$np+4],$npj ! np[1]
178 ld [$tp],$tmp1 ! tp[0]
179 ld [$tp+4],$tpj ! tp[1]
180 mov 12,$j
181
182 mulx $car0,$mul0,$car0
183 mulx $apj,$mul0,$tmp0 !prologue!
184 add $tmp1,$car0,$car0
185 ld [$ap+8],$apj !prologue!
186 and $car0,$mask,$acc0
187
188 mulx $n0,$acc0,$mul1
189 and $mul1,$mask,$mul1
190
191 mulx $car1,$mul1,$car1
192 mulx $npj,$mul1,$acc1 !prologue!
193 srlx $car0,32,$car0
194 add $acc0,$car1,$car1
195 ld [$np+8],$npj !prologue!
196 srlx $car1,32,$car1
197 mov $tmp0,$acc0 !prologue!
198
199.Linner:
200 mulx $apj,$mul0,$tmp0
201 mulx $npj,$mul1,$tmp1
202 add $tpj,$car0,$car0
203 ld [$ap+$j],$apj ! ap[j]
204 add $acc0,$car0,$car0
205 add $acc1,$car1,$car1
206 ld [$np+$j],$npj ! np[j]
207 and $car0,$mask,$acc0
208 ld [$tp+8],$tpj ! tp[j]
209 srlx $car0,32,$car0
210 add $acc0,$car1,$car1
211 add $j,4,$j ! j++
212 mov $tmp0,$acc0
213 st $car1,[$tp] ! tp[j-1]
214 srlx $car1,32,$car1
215 mov $tmp1,$acc1
216 cmp $j,$num
217 bl %icc,.Linner
218 add $tp,4,$tp ! tp++
219!.Linner
220
221 mulx $apj,$mul0,$tmp0 !epilogue!
222 mulx $npj,$mul1,$tmp1
223 add $tpj,$car0,$car0
224 add $acc0,$car0,$car0
225 ld [$tp+8],$tpj ! tp[j]
226 and $car0,$mask,$acc0
227 add $acc1,$car1,$car1
228 srlx $car0,32,$car0
229 add $acc0,$car1,$car1
230 st $car1,[$tp] ! tp[j-1]
231 srlx $car1,32,$car1
232
233 add $tpj,$car0,$car0
234 add $tmp0,$car0,$car0
235 and $car0,$mask,$acc0
236 add $tmp1,$car1,$car1
237 add $acc0,$car1,$car1
238 st $car1,[$tp+4] ! tp[j-1]
239 srlx $car0,32,$car0
240 add $i,4,$i ! i++
241 srlx $car1,32,$car1
242
243 add $car0,$car1,$car1
244 cmp $i,$num
245 add $car2,$car1,$car1
246 st $car1,[$tp+8]
247
248 srlx $car1,32,$car2
249 bl,a %icc,.Louter
250 ld [$bp+$i],$mul0 ! bp[i]
251!.Louter
252
253 add $tp,12,$tp
254
255.Ltail:
256 add $np,$num,$np
257 add $rp,$num,$rp
258 mov $tp,$ap
259 sub %g0,$num,%o7 ! k=-num
260 ba .Lsub
261 subcc %g0,%g0,%g0 ! clear %icc.c
262.align 16
263.Lsub:
264 ld [$tp+%o7],%o0
265 ld [$np+%o7],%o1
266 subccc %o0,%o1,%o1 ! tp[j]-np[j]
267 add $rp,%o7,$i
268 add %o7,4,%o7
269 brnz %o7,.Lsub
270 st %o1,[$i]
271 subc $car2,0,$car2 ! handle upmost overflow bit
272 and $tp,$car2,$ap
273 andn $rp,$car2,$np
274 or $ap,$np,$ap
275 sub %g0,$num,%o7
276
277.Lcopy:
278 ld [$ap+%o7],%o0 ! copy or in-place refresh
279 st %g0,[$tp+%o7] ! zap tp
280 st %o0,[$rp+%o7]
281 add %o7,4,%o7
282 brnz %o7,.Lcopy
283 nop
284 mov 1,%i0
285 ret
286 restore
287___
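# Editor's note (annotation, not original text): the .Lsub/.Lcopy tail
# above first stores tp-np into rp unconditionally, then "subc
# $car2,0,$car2" turns the leftover borrow into a word mask: all-ones
# when tp < np, so tp itself is the result, and zero when rp already
# holds the reduced value.  The and/andn/or triple then selects the
# copy source without a data-dependent branch, roughly
# ap = (tp & mask) | (rp & ~mask).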
288
289########
290######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
291######## code lacking the dedicated squaring procedure that follows.
292########
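# Editor's note (hedged sketch, not part of the module): the saving
# comes from symmetry -- in a*a every cross product a[i]*a[j], i != j,
# occurs twice, so a dedicated path can compute it once and double it;
# the $sbit juggling below carries the bit shifted out by each doubling.
# A scalar column-by-column equivalent, with illustrative names only:
use List::Util qw(max);
use Math::BigInt;
sub sqr_columns_ref {
	my @a = @_;				# little-endian 32-bit words
	my $num = scalar @a;
	my $acc = Math::BigInt->bzero;
	my @r;
	for my $k (0 .. 2*$num-2) {		# one result word per column
		for my $i (max(0, $k-$num+1) .. int($k/2)) {
			my $p = Math::BigInt->new($a[$i])->bmul($a[$k-$i]);
			$p->blsft(1) if $i != $k-$i;	# double off-diagonal terms
			$acc->badd($p);
		}
		push @r, ($acc % 2**32)->numify;	# emit r[k]
		$acc->brsft(32);			# keep the carry
	}
	push @r, $acc->numify;			# r[2*num-1]
	return @r;
}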
293$sbit="%i2"; # re-use $bp!
294
295$code.=<<___;
296.align 32
297.Lbn_sqr_mont:
298 mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
299 mulx $apj,$mul0,$tmp0 !prologue!
300 and $car0,$mask,$acc0
301 add %sp,$bias+$frame,$tp
302 ld [$ap+8],$apj !prologue!
303
304 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
305 srlx $car0,32,$car0
306 and $mul1,$mask,$mul1
307
308 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
309 mulx $npj,$mul1,$acc1 !prologue!
310 and $car0,1,$sbit
311 ld [$np+8],$npj !prologue!
312 srlx $car0,1,$car0
313 add $acc0,$car1,$car1
314 srlx $car1,32,$car1
315 mov $tmp0,$acc0 !prologue!
316
317.Lsqr_1st:
318 mulx $apj,$mul0,$tmp0
319 mulx $npj,$mul1,$tmp1
320 add $acc0,$car0,$car0 ! ap[j]*a0+c0
321 add $acc1,$car1,$car1
322 ld [$ap+$j],$apj ! ap[j]
323 and $car0,$mask,$acc0
324 ld [$np+$j],$npj ! np[j]
325 srlx $car0,32,$car0
326 add $acc0,$acc0,$acc0
327 or $sbit,$acc0,$acc0
328 mov $tmp1,$acc1
329 srlx $acc0,32,$sbit
330 add $j,4,$j ! j++
331 and $acc0,$mask,$acc0
332 cmp $j,$num
333 add $acc0,$car1,$car1
334 st $car1,[$tp]
335 mov $tmp0,$acc0
336 srlx $car1,32,$car1
337 bl %icc,.Lsqr_1st
338 add $tp,4,$tp ! tp++
339!.Lsqr_1st
340
341 mulx $apj,$mul0,$tmp0 ! epilogue
342 mulx $npj,$mul1,$tmp1
343 add $acc0,$car0,$car0 ! ap[j]*a0+c0
344 add $acc1,$car1,$car1
345 and $car0,$mask,$acc0
346 srlx $car0,32,$car0
347 add $acc0,$acc0,$acc0
348 or $sbit,$acc0,$acc0
349 srlx $acc0,32,$sbit
350 and $acc0,$mask,$acc0
351 add $acc0,$car1,$car1
352 st $car1,[$tp]
353 srlx $car1,32,$car1
354
355 add $tmp0,$car0,$car0 ! ap[j]*a0+c0
356 add $tmp1,$car1,$car1
357 and $car0,$mask,$acc0
358 srlx $car0,32,$car0
359 add $acc0,$acc0,$acc0
360 or $sbit,$acc0,$acc0
361 srlx $acc0,32,$sbit
362 and $acc0,$mask,$acc0
363 add $acc0,$car1,$car1
364 st $car1,[$tp+4]
365 srlx $car1,32,$car1
366
367 add $car0,$car0,$car0
368 or $sbit,$car0,$car0
369 add $car0,$car1,$car1
370 st $car1,[$tp+8]
371 srlx $car1,32,$car2
372
373 ld [%sp+$bias+$frame],$tmp0 ! tp[0]
374 ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
375 ld [%sp+$bias+$frame+8],$tpj ! tp[2]
376 ld [$ap+4],$mul0 ! ap[1]
377 ld [$ap+8],$apj ! ap[2]
378 ld [$np],$car1 ! np[0]
379 ld [$np+4],$npj ! np[1]
380 mulx $n0,$tmp0,$mul1
381
382 mulx $mul0,$mul0,$car0
383 and $mul1,$mask,$mul1
384
385 mulx $car1,$mul1,$car1
386 mulx $npj,$mul1,$acc1
387 add $tmp0,$car1,$car1
388 and $car0,$mask,$acc0
389 ld [$np+8],$npj ! np[2]
390 srlx $car1,32,$car1
391 add $tmp1,$car1,$car1
392 srlx $car0,32,$car0
393 add $acc0,$car1,$car1
394 and $car0,1,$sbit
395 add $acc1,$car1,$car1
396 srlx $car0,1,$car0
397 mov 12,$j
398 st $car1,[%sp+$bias+$frame] ! tp[0]=
399 srlx $car1,32,$car1
400 add %sp,$bias+$frame+4,$tp
401
402.Lsqr_2nd:
403 mulx $apj,$mul0,$acc0
404 mulx $npj,$mul1,$acc1
405 add $acc0,$car0,$car0
406 add $tpj,$car1,$car1
407 ld [$ap+$j],$apj ! ap[j]
408 and $car0,$mask,$acc0
409 ld [$np+$j],$npj ! np[j]
410 srlx $car0,32,$car0
411 add $acc1,$car1,$car1
412 ld [$tp+8],$tpj ! tp[j]
413 add $acc0,$acc0,$acc0
414 add $j,4,$j ! j++
415 or $sbit,$acc0,$acc0
416 srlx $acc0,32,$sbit
417 and $acc0,$mask,$acc0
418 cmp $j,$num
419 add $acc0,$car1,$car1
420 st $car1,[$tp] ! tp[j-1]
421 srlx $car1,32,$car1
422 bl %icc,.Lsqr_2nd
423 add $tp,4,$tp ! tp++
424!.Lsqr_2nd
425
426 mulx $apj,$mul0,$acc0
427 mulx $npj,$mul1,$acc1
428 add $acc0,$car0,$car0
429 add $tpj,$car1,$car1
430 and $car0,$mask,$acc0
431 srlx $car0,32,$car0
432 add $acc1,$car1,$car1
433 add $acc0,$acc0,$acc0
434 or $sbit,$acc0,$acc0
435 srlx $acc0,32,$sbit
436 and $acc0,$mask,$acc0
437 add $acc0,$car1,$car1
438 st $car1,[$tp] ! tp[j-1]
439 srlx $car1,32,$car1
440
441 add $car0,$car0,$car0
442 or $sbit,$car0,$car0
443 add $car0,$car1,$car1
444 add $car2,$car1,$car1
445 st $car1,[$tp+4]
446 srlx $car1,32,$car2
447
448 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
449 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
450 ld [$ap+8],$mul0 ! ap[2]
451 ld [$np],$car1 ! np[0]
452 ld [$np+4],$npj ! np[1]
453 mulx $n0,$tmp1,$mul1
454 and $mul1,$mask,$mul1
455 mov 8,$i
456
457 mulx $mul0,$mul0,$car0
458 mulx $car1,$mul1,$car1
459 and $car0,$mask,$acc0
460 add $tmp1,$car1,$car1
461 srlx $car0,32,$car0
462 add %sp,$bias+$frame,$tp
463 srlx $car1,32,$car1
464 and $car0,1,$sbit
465 srlx $car0,1,$car0
466 mov 4,$j
467
468.Lsqr_outer:
469.Lsqr_inner1:
470 mulx $npj,$mul1,$acc1
471 add $tpj,$car1,$car1
472 add $j,4,$j
473 ld [$tp+8],$tpj
474 cmp $j,$i
475 add $acc1,$car1,$car1
476 ld [$np+$j],$npj
477 st $car1,[$tp]
478 srlx $car1,32,$car1
479 bl %icc,.Lsqr_inner1
480 add $tp,4,$tp
481!.Lsqr_inner1
482
483 add $j,4,$j
484 ld [$ap+$j],$apj ! ap[j]
485 mulx $npj,$mul1,$acc1
486 add $tpj,$car1,$car1
487 ld [$np+$j],$npj ! np[j]
488 add $acc0,$car1,$car1
489 ld [$tp+8],$tpj ! tp[j]
490 add $acc1,$car1,$car1
491 st $car1,[$tp]
492 srlx $car1,32,$car1
493
494 add $j,4,$j
495 cmp $j,$num
496 be,pn %icc,.Lsqr_no_inner2
497 add $tp,4,$tp
498
499.Lsqr_inner2:
500 mulx $apj,$mul0,$acc0
501 mulx $npj,$mul1,$acc1
502 add $tpj,$car1,$car1
503 add $acc0,$car0,$car0
504 ld [$ap+$j],$apj ! ap[j]
505 and $car0,$mask,$acc0
506 ld [$np+$j],$npj ! np[j]
507 srlx $car0,32,$car0
508 add $acc0,$acc0,$acc0
509 ld [$tp+8],$tpj ! tp[j]
510 or $sbit,$acc0,$acc0
511 add $j,4,$j ! j++
512 srlx $acc0,32,$sbit
513 and $acc0,$mask,$acc0
514 cmp $j,$num
515 add $acc0,$car1,$car1
516 add $acc1,$car1,$car1
517 st $car1,[$tp] ! tp[j-1]
518 srlx $car1,32,$car1
519 bl %icc,.Lsqr_inner2
520 add $tp,4,$tp ! tp++
521
522.Lsqr_no_inner2:
523 mulx $apj,$mul0,$acc0
524 mulx $npj,$mul1,$acc1
525 add $tpj,$car1,$car1
526 add $acc0,$car0,$car0
527 and $car0,$mask,$acc0
528 srlx $car0,32,$car0
529 add $acc0,$acc0,$acc0
530 or $sbit,$acc0,$acc0
531 srlx $acc0,32,$sbit
532 and $acc0,$mask,$acc0
533 add $acc0,$car1,$car1
534 add $acc1,$car1,$car1
535 st $car1,[$tp] ! tp[j-1]
536 srlx $car1,32,$car1
537
538 add $car0,$car0,$car0
539 or $sbit,$car0,$car0
540 add $car0,$car1,$car1
541 add $car2,$car1,$car1
542 st $car1,[$tp+4]
543 srlx $car1,32,$car2
544
545 add $i,4,$i ! i++
546 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
547 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
548 ld [$ap+$i],$mul0 ! ap[j]
549 ld [$np],$car1 ! np[0]
550 ld [$np+4],$npj ! np[1]
551 mulx $n0,$tmp1,$mul1
552 and $mul1,$mask,$mul1
553 add $i,4,$tmp0
554
555 mulx $mul0,$mul0,$car0
556 mulx $car1,$mul1,$car1
557 and $car0,$mask,$acc0
558 add $tmp1,$car1,$car1
559 srlx $car0,32,$car0
560 add %sp,$bias+$frame,$tp
561 srlx $car1,32,$car1
562 and $car0,1,$sbit
563 srlx $car0,1,$car0
564
565 cmp $tmp0,$num ! i<num-1
566 bl %icc,.Lsqr_outer
567 mov 4,$j
568
569.Lsqr_last:
570 mulx $npj,$mul1,$acc1
571 add $tpj,$car1,$car1
572 add $j,4,$j
573 ld [$tp+8],$tpj
574 cmp $j,$i
575 add $acc1,$car1,$car1
576 ld [$np+$j],$npj
577 st $car1,[$tp]
578 srlx $car1,32,$car1
579 bl %icc,.Lsqr_last
580 add $tp,4,$tp
581!.Lsqr_last
582
583 mulx $npj,$mul1,$acc1
584 add $tpj,$car1,$car1
585 add $acc0,$car1,$car1
586 add $acc1,$car1,$car1
587 st $car1,[$tp]
588 srlx $car1,32,$car1
589
590 add $car0,$car0,$car0 ! recover $car0
591 or $sbit,$car0,$car0
592 add $car0,$car1,$car1
593 add $car2,$car1,$car1
594 st $car1,[$tp+4]
595 srlx $car1,32,$car2
596
597 ba .Ltail
598 add $tp,8,$tp
599.type $fname,#function
600.size $fname,(.-$fname)
601___
602$code =~ s/\`([^\`]*)\`/eval($1)/gem;
603print $code;
604close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl b/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl
deleted file mode 100755
index 7bb1725a0e..0000000000
--- a/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl
+++ /dev/null
@@ -1,880 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# October 2005
11#
12# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
13# Because unlike the integer multiplier, which simply stalls the whole CPU,
14# the FPU is fully pipelined and can effectively emit a 48-bit partial
15# product every cycle. Why not blended SPARC v9? One can argue that
16# making this module dependent on UltraSPARC VIS extension limits its
17# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
18# implementations from the compatibility matrix. But the rest, the whole Sun
19# UltraSPARC family and Fujitsu's brand-new SPARC64 V, all support
20# VIS extension instructions used in this module. This is considered
21# good enough to not care about HAL SPARC64 users [if any] who have
22# the integer-only pure SPARCv9 module to "fall down" to.
23
24# USI&II cores currently exhibit uniform 2x improvement [over pre-
25# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
26# performance improves by a few percent for shorter keys and worsens by a
27# few percent for longer keys. This is because the USIII integer multiplier
28# is >3x faster than USI&II one, which is harder to match [but see
29# TODO list below]. It should also be noted that SPARC64 V features
30# out-of-order execution, which *might* mean that integer multiplier
31# is pipelined, which in turn *might* be impossible to match... On
32# additional note, SPARC64 V implements FP Multiply-Add instruction,
33# which is perfectly usable in this context... In other words, as far
34# as Fujitsu SPARC64 V goes, talk to the author:-)
35
36# The implementation implies the following "non-natural" limitations on
37# input arguments:
38# - num may not be less than 4;
39# - num has to be even;
40# Failure to meet either condition has no fatal effects; it simply
41# doesn't give any performance gain.
42
43# TODO:
44# - modulo-schedule inner loop for better performance (on in-order
45# execution core such as UltraSPARC this shall result in further
46# noticeable(!) improvement);
47# - dedicated squaring procedure[?];
48
49######################################################################
50# November 2006
51#
52# Modulo-scheduled inner loops make it possible to interleave floating point and
53# integer instructions and minimize Read-After-Write penalties. This
54# results in *further* 20-50% performance improvement [depending on
55# key length, more for longer keys] on USI&II cores and 30-80% - on
56# USIII&IV.
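# Editor's note (hedged aside, not original text): the 16-bit limbs are
# what keep the FP path exact.  An IEEE double carries a 53-bit
# mantissa, so each 16x16-bit partial product (at most 32 bits) can be
# accumulated across the four limb lanes by fmuld/faddd below without
# ever rounding; only the final fdtox/recombination step touches
# integer registers.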
57
58$fname="bn_mul_mont_fpu";
59$bits=32;
60for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
61
62if ($bits==64) {
63 $bias=2047;
64 $frame=192;
65} else {
66 $bias=0;
67 $frame=128; # 96 rounded up to largest known cache-line
68}
69$locals=64;
70
71# In order to provide for 32-/64-bit ABI duality, I keep integers wider
72# than 32 bits in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
73# exclusively for pointers, indexes and other small values...
74# int bn_mul_mont(
75$rp="%i0"; # BN_ULONG *rp,
76$ap="%i1"; # const BN_ULONG *ap,
77$bp="%i2"; # const BN_ULONG *bp,
78$np="%i3"; # const BN_ULONG *np,
79$n0="%i4"; # const BN_ULONG *n0,
80$num="%i5"; # int num);
81
82$tp="%l0"; # t[num]
83$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
84$ap_h="%l2"; # to these four vectors as double-precision FP values.
85$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
86$np_h="%l4"; # loop and L1-cache aliasing is minimized...
87$i="%l5";
88$j="%l6";
89$mask="%l7"; # 16-bit mask, 0xffff
90
91$n0="%g4"; # reassigned(!) to "64-bit" register
92$carry="%i4"; # %i4 reused(!) for a carry bit
93
94# FP register naming chart
95#
96# ..HILO
97# dcba
98# --------
99# LOa
100# LOb
101# LOc
102# LOd
103# HIa
104# HIb
105# HIc
106# HId
107# ..a
108# ..b
109$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
110$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
111$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
112$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
113
114$dota="%f24"; $dotb="%f26";
115
116$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
117$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
118$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
119$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
120
121$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
122
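# Editor's note (not part of the original module): a hedged scalar
# sketch of how the srlx/sllx/or sequences inside the loops below knit
# four staggered 16-bit column sums back into one 64-bit word plus the
# "34-bit carry" the comments mention.  Illustrative names; assumes a
# 64-bit perl.
sub recombine16 {
	my ($o0, $o1, $o2, $o3) = @_;	# column sums, 16 bits apart
	$o1 += $o0 >> 16;		# ripple inter-column carries
	$o2 += $o1 >> 16;
	$o3 += $o2 >> 16;
	my $word = ($o0 & 0xffff)
		 | (($o1 & 0xffff) << 16)
		 | (($o2 & 0xffff) << 32)
		 | (($o3 & 0xffff) << 48);
	return ($word, $o3 >> 16);	# (64-bit limb, carry out)
}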
123$code=<<___;
124.section ".text",#alloc,#execinstr
125
126.global $fname
127.align 32
128$fname:
129 save %sp,-$frame-$locals,%sp
130
131 cmp $num,4
132 bl,a,pn %icc,.Lret
133 clr %i0
134 andcc $num,1,%g0 ! $num has to be even...
135 bnz,a,pn %icc,.Lret
136 clr %i0 ! signal "unsupported input value"
137
138 srl $num,1,$num
139 sethi %hi(0xffff),$mask
140 ld [%i4+0],$n0 ! $n0 reassigned, remember?
141 or $mask,%lo(0xffff),$mask
142 ld [%i4+4],%o0
143 sllx %o0,32,%o0
144 or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
145
146 sll $num,3,$num ! num*=8
147
148 add %sp,$bias,%o0 ! real top of stack
149 sll $num,2,%o1
150 add %o1,$num,%o1 ! %o1=num*5
151 sub %o0,%o1,%o0
152 and %o0,-2048,%o0 ! optimize TLB utilization
153 sub %o0,$bias,%sp ! alloca(5*num*8)
154
155 rd %asi,%o7 ! save %asi
156 add %sp,$bias+$frame+$locals,$tp
157 add $tp,$num,$ap_l
158 add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
159 add $ap_l,$num,$ap_h
160 add $ap_h,$num,$np_l
161 add $np_l,$num,$np_h
162
163 wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
164
165 add $rp,$num,$rp ! readjust input pointers to point
166 add $ap,$num,$ap ! at the ends too...
167 add $bp,$num,$bp
168 add $np,$num,$np
169
170 stx %o7,[%sp+$bias+$frame+48] ! save %asi
171
172 sub %g0,$num,$i ! i=-num
173 sub %g0,$num,$j ! j=-num
174
175 add $ap,$j,%o3
176 add $bp,$i,%o4
177
178 ld [%o3+4],%g1 ! bp[0]
179 ld [%o3+0],%o0
180 ld [%o4+4],%g5 ! ap[0]
181 sllx %g1,32,%g1
182 ld [%o4+0],%o1
183 sllx %g5,32,%g5
184 or %g1,%o0,%o0
185 or %g5,%o1,%o1
186
187 add $np,$j,%o5
188
189 mulx %o1,%o0,%o0 ! ap[0]*bp[0]
190 mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
191 stx %o0,[%sp+$bias+$frame+0]
192
193 ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
194 fzeros $alo
195 ld [%o3+4],$ahi_
196 fzeros $ahi
197 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
198 fzeros $nlo
199 ld [%o5+4],$nhi_
200 fzeros $nhi
201
202 ! transfer b[i] to FPU as 4x16-bit values
203 ldda [%o4+2]%asi,$ba
204 fxtod $alo,$alo
205 ldda [%o4+0]%asi,$bb
206 fxtod $ahi,$ahi
207 ldda [%o4+6]%asi,$bc
208 fxtod $nlo,$nlo
209 ldda [%o4+4]%asi,$bd
210 fxtod $nhi,$nhi
211
212 ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
213 ldda [%sp+$bias+$frame+6]%asi,$na
214 fxtod $ba,$ba
215 ldda [%sp+$bias+$frame+4]%asi,$nb
216 fxtod $bb,$bb
217 ldda [%sp+$bias+$frame+2]%asi,$nc
218 fxtod $bc,$bc
219 ldda [%sp+$bias+$frame+0]%asi,$nd
220 fxtod $bd,$bd
221
222 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
223 fxtod $na,$na
224 std $ahi,[$ap_h+$j]
225 fxtod $nb,$nb
226 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
227 fxtod $nc,$nc
228 std $nhi,[$np_h+$j]
229 fxtod $nd,$nd
230
231 fmuld $alo,$ba,$aloa
232 fmuld $nlo,$na,$nloa
233 fmuld $alo,$bb,$alob
234 fmuld $nlo,$nb,$nlob
235 fmuld $alo,$bc,$aloc
236 faddd $aloa,$nloa,$nloa
237 fmuld $nlo,$nc,$nloc
238 fmuld $alo,$bd,$alod
239 faddd $alob,$nlob,$nlob
240 fmuld $nlo,$nd,$nlod
241 fmuld $ahi,$ba,$ahia
242 faddd $aloc,$nloc,$nloc
243 fmuld $nhi,$na,$nhia
244 fmuld $ahi,$bb,$ahib
245 faddd $alod,$nlod,$nlod
246 fmuld $nhi,$nb,$nhib
247 fmuld $ahi,$bc,$ahic
248 faddd $ahia,$nhia,$nhia
249 fmuld $nhi,$nc,$nhic
250 fmuld $ahi,$bd,$ahid
251 faddd $ahib,$nhib,$nhib
252 fmuld $nhi,$nd,$nhid
253
254 faddd $ahic,$nhic,$dota ! $nhic
255 faddd $ahid,$nhid,$dotb ! $nhid
256
257 faddd $nloc,$nhia,$nloc
258 faddd $nlod,$nhib,$nlod
259
260 fdtox $nloa,$nloa
261 fdtox $nlob,$nlob
262 fdtox $nloc,$nloc
263 fdtox $nlod,$nlod
264
265 std $nloa,[%sp+$bias+$frame+0]
266 add $j,8,$j
267 std $nlob,[%sp+$bias+$frame+8]
268 add $ap,$j,%o4
269 std $nloc,[%sp+$bias+$frame+16]
270 add $np,$j,%o5
271 std $nlod,[%sp+$bias+$frame+24]
272
273 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
274 fzeros $alo
275 ld [%o4+4],$ahi_
276 fzeros $ahi
277 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
278 fzeros $nlo
279 ld [%o5+4],$nhi_
280 fzeros $nhi
281
282 fxtod $alo,$alo
283 fxtod $ahi,$ahi
284 fxtod $nlo,$nlo
285 fxtod $nhi,$nhi
286
287 ldx [%sp+$bias+$frame+0],%o0
288 fmuld $alo,$ba,$aloa
289 ldx [%sp+$bias+$frame+8],%o1
290 fmuld $nlo,$na,$nloa
291 ldx [%sp+$bias+$frame+16],%o2
292 fmuld $alo,$bb,$alob
293 ldx [%sp+$bias+$frame+24],%o3
294 fmuld $nlo,$nb,$nlob
295
296 srlx %o0,16,%o7
297 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
298 fmuld $alo,$bc,$aloc
299 add %o7,%o1,%o1
300 std $ahi,[$ap_h+$j]
301 faddd $aloa,$nloa,$nloa
302 fmuld $nlo,$nc,$nloc
303 srlx %o1,16,%o7
304 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
305 fmuld $alo,$bd,$alod
306 add %o7,%o2,%o2
307 std $nhi,[$np_h+$j]
308 faddd $alob,$nlob,$nlob
309 fmuld $nlo,$nd,$nlod
310 srlx %o2,16,%o7
311 fmuld $ahi,$ba,$ahia
312 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
313 faddd $aloc,$nloc,$nloc
314 fmuld $nhi,$na,$nhia
315 !and %o0,$mask,%o0
316 !and %o1,$mask,%o1
317 !and %o2,$mask,%o2
318 !sllx %o1,16,%o1
319 !sllx %o2,32,%o2
320 !sllx %o3,48,%o7
321 !or %o1,%o0,%o0
322 !or %o2,%o0,%o0
323 !or %o7,%o0,%o0 ! 64-bit result
324 srlx %o3,16,%g1 ! 34-bit carry
325 fmuld $ahi,$bb,$ahib
326
327 faddd $alod,$nlod,$nlod
328 fmuld $nhi,$nb,$nhib
329 fmuld $ahi,$bc,$ahic
330 faddd $ahia,$nhia,$nhia
331 fmuld $nhi,$nc,$nhic
332 fmuld $ahi,$bd,$ahid
333 faddd $ahib,$nhib,$nhib
334 fmuld $nhi,$nd,$nhid
335
336 faddd $dota,$nloa,$nloa
337 faddd $dotb,$nlob,$nlob
338 faddd $ahic,$nhic,$dota ! $nhic
339 faddd $ahid,$nhid,$dotb ! $nhid
340
341 faddd $nloc,$nhia,$nloc
342 faddd $nlod,$nhib,$nlod
343
344 fdtox $nloa,$nloa
345 fdtox $nlob,$nlob
346 fdtox $nloc,$nloc
347 fdtox $nlod,$nlod
348
349 std $nloa,[%sp+$bias+$frame+0]
350 std $nlob,[%sp+$bias+$frame+8]
351 addcc $j,8,$j
352 std $nloc,[%sp+$bias+$frame+16]
353 bz,pn %icc,.L1stskip
354 std $nlod,[%sp+$bias+$frame+24]
355
356.align 32 ! incidentally already aligned !
357.L1st:
358 add $ap,$j,%o4
359 add $np,$j,%o5
360 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
361 fzeros $alo
362 ld [%o4+4],$ahi_
363 fzeros $ahi
364 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
365 fzeros $nlo
366 ld [%o5+4],$nhi_
367 fzeros $nhi
368
369 fxtod $alo,$alo
370 fxtod $ahi,$ahi
371 fxtod $nlo,$nlo
372 fxtod $nhi,$nhi
373
374 ldx [%sp+$bias+$frame+0],%o0
375 fmuld $alo,$ba,$aloa
376 ldx [%sp+$bias+$frame+8],%o1
377 fmuld $nlo,$na,$nloa
378 ldx [%sp+$bias+$frame+16],%o2
379 fmuld $alo,$bb,$alob
380 ldx [%sp+$bias+$frame+24],%o3
381 fmuld $nlo,$nb,$nlob
382
383 srlx %o0,16,%o7
384 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
385 fmuld $alo,$bc,$aloc
386 add %o7,%o1,%o1
387 std $ahi,[$ap_h+$j]
388 faddd $aloa,$nloa,$nloa
389 fmuld $nlo,$nc,$nloc
390 srlx %o1,16,%o7
391 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
392 fmuld $alo,$bd,$alod
393 add %o7,%o2,%o2
394 std $nhi,[$np_h+$j]
395 faddd $alob,$nlob,$nlob
396 fmuld $nlo,$nd,$nlod
397 srlx %o2,16,%o7
398 fmuld $ahi,$ba,$ahia
399 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
400 and %o0,$mask,%o0
401 faddd $aloc,$nloc,$nloc
402 fmuld $nhi,$na,$nhia
403 and %o1,$mask,%o1
404 and %o2,$mask,%o2
405 fmuld $ahi,$bb,$ahib
406 sllx %o1,16,%o1
407 faddd $alod,$nlod,$nlod
408 fmuld $nhi,$nb,$nhib
409 sllx %o2,32,%o2
410 fmuld $ahi,$bc,$ahic
411 sllx %o3,48,%o7
412 or %o1,%o0,%o0
413 faddd $ahia,$nhia,$nhia
414 fmuld $nhi,$nc,$nhic
415 or %o2,%o0,%o0
416 fmuld $ahi,$bd,$ahid
417 or %o7,%o0,%o0 ! 64-bit result
418 faddd $ahib,$nhib,$nhib
419 fmuld $nhi,$nd,$nhid
420 addcc %g1,%o0,%o0
421 faddd $dota,$nloa,$nloa
422 srlx %o3,16,%g1 ! 34-bit carry
423 faddd $dotb,$nlob,$nlob
424 bcs,a %xcc,.+8
425 add %g1,1,%g1
426
427 stx %o0,[$tp] ! tp[j-1]=
428
429 faddd $ahic,$nhic,$dota ! $nhic
430 faddd $ahid,$nhid,$dotb ! $nhid
431
432 faddd $nloc,$nhia,$nloc
433 faddd $nlod,$nhib,$nlod
434
435 fdtox $nloa,$nloa
436 fdtox $nlob,$nlob
437 fdtox $nloc,$nloc
438 fdtox $nlod,$nlod
439
440 std $nloa,[%sp+$bias+$frame+0]
441 std $nlob,[%sp+$bias+$frame+8]
442 std $nloc,[%sp+$bias+$frame+16]
443 std $nlod,[%sp+$bias+$frame+24]
444
445 addcc $j,8,$j
446 bnz,pt %icc,.L1st
447 add $tp,8,$tp
448
449.L1stskip:
450 fdtox $dota,$dota
451 fdtox $dotb,$dotb
452
453 ldx [%sp+$bias+$frame+0],%o0
454 ldx [%sp+$bias+$frame+8],%o1
455 ldx [%sp+$bias+$frame+16],%o2
456 ldx [%sp+$bias+$frame+24],%o3
457
458 srlx %o0,16,%o7
459 std $dota,[%sp+$bias+$frame+32]
460 add %o7,%o1,%o1
461 std $dotb,[%sp+$bias+$frame+40]
462 srlx %o1,16,%o7
463 add %o7,%o2,%o2
464 srlx %o2,16,%o7
465 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
466 and %o0,$mask,%o0
467 and %o1,$mask,%o1
468 and %o2,$mask,%o2
469 sllx %o1,16,%o1
470 sllx %o2,32,%o2
471 sllx %o3,48,%o7
472 or %o1,%o0,%o0
473 or %o2,%o0,%o0
474 or %o7,%o0,%o0 ! 64-bit result
475 ldx [%sp+$bias+$frame+32],%o4
476 addcc %g1,%o0,%o0
477 ldx [%sp+$bias+$frame+40],%o5
478 srlx %o3,16,%g1 ! 34-bit carry
479 bcs,a %xcc,.+8
480 add %g1,1,%g1
481
482 stx %o0,[$tp] ! tp[j-1]=
483 add $tp,8,$tp
484
485 srlx %o4,16,%o7
486 add %o7,%o5,%o5
487 and %o4,$mask,%o4
488 sllx %o5,16,%o7
489 or %o7,%o4,%o4
490 addcc %g1,%o4,%o4
491 srlx %o5,48,%g1
492 bcs,a %xcc,.+8
493 add %g1,1,%g1
494
495 mov %g1,$carry
496 stx %o4,[$tp] ! tp[num-1]=
497
498 ba .Louter
499 add $i,8,$i
500.align 32
501.Louter:
502 sub %g0,$num,$j ! j=-num
503 add %sp,$bias+$frame+$locals,$tp
504
505 add $ap,$j,%o3
506 add $bp,$i,%o4
507
508 ld [%o3+4],%g1 ! bp[i]
509 ld [%o3+0],%o0
510 ld [%o4+4],%g5 ! ap[0]
511 sllx %g1,32,%g1
512 ld [%o4+0],%o1
513 sllx %g5,32,%g5
514 or %g1,%o0,%o0
515 or %g5,%o1,%o1
516
517 ldx [$tp],%o2 ! tp[0]
518 mulx %o1,%o0,%o0
519 addcc %o2,%o0,%o0
520 mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
521 stx %o0,[%sp+$bias+$frame+0]
522
523 ! transfer b[i] to FPU as 4x16-bit values
524 ldda [%o4+2]%asi,$ba
525 ldda [%o4+0]%asi,$bb
526 ldda [%o4+6]%asi,$bc
527 ldda [%o4+4]%asi,$bd
528
529 ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
530 ldda [%sp+$bias+$frame+6]%asi,$na
531 fxtod $ba,$ba
532 ldda [%sp+$bias+$frame+4]%asi,$nb
533 fxtod $bb,$bb
534 ldda [%sp+$bias+$frame+2]%asi,$nc
535 fxtod $bc,$bc
536 ldda [%sp+$bias+$frame+0]%asi,$nd
537 fxtod $bd,$bd
538 ldd [$ap_l+$j],$alo ! load a[j] in double format
539 fxtod $na,$na
540 ldd [$ap_h+$j],$ahi
541 fxtod $nb,$nb
542 ldd [$np_l+$j],$nlo ! load n[j] in double format
543 fxtod $nc,$nc
544 ldd [$np_h+$j],$nhi
545 fxtod $nd,$nd
546
547 fmuld $alo,$ba,$aloa
548 fmuld $nlo,$na,$nloa
549 fmuld $alo,$bb,$alob
550 fmuld $nlo,$nb,$nlob
551 fmuld $alo,$bc,$aloc
552 faddd $aloa,$nloa,$nloa
553 fmuld $nlo,$nc,$nloc
554 fmuld $alo,$bd,$alod
555 faddd $alob,$nlob,$nlob
556 fmuld $nlo,$nd,$nlod
557 fmuld $ahi,$ba,$ahia
558 faddd $aloc,$nloc,$nloc
559 fmuld $nhi,$na,$nhia
560 fmuld $ahi,$bb,$ahib
561 faddd $alod,$nlod,$nlod
562 fmuld $nhi,$nb,$nhib
563 fmuld $ahi,$bc,$ahic
564 faddd $ahia,$nhia,$nhia
565 fmuld $nhi,$nc,$nhic
566 fmuld $ahi,$bd,$ahid
567 faddd $ahib,$nhib,$nhib
568 fmuld $nhi,$nd,$nhid
569
570 faddd $ahic,$nhic,$dota ! $nhic
571 faddd $ahid,$nhid,$dotb ! $nhid
572
573 faddd $nloc,$nhia,$nloc
574 faddd $nlod,$nhib,$nlod
575
576 fdtox $nloa,$nloa
577 fdtox $nlob,$nlob
578 fdtox $nloc,$nloc
579 fdtox $nlod,$nlod
580
581 std $nloa,[%sp+$bias+$frame+0]
582 std $nlob,[%sp+$bias+$frame+8]
583 std $nloc,[%sp+$bias+$frame+16]
584 add $j,8,$j
585 std $nlod,[%sp+$bias+$frame+24]
586
587 ldd [$ap_l+$j],$alo ! load a[j] in double format
588 ldd [$ap_h+$j],$ahi
589 ldd [$np_l+$j],$nlo ! load n[j] in double format
590 ldd [$np_h+$j],$nhi
591
592 fmuld $alo,$ba,$aloa
593 fmuld $nlo,$na,$nloa
594 fmuld $alo,$bb,$alob
595 fmuld $nlo,$nb,$nlob
596 fmuld $alo,$bc,$aloc
597 ldx [%sp+$bias+$frame+0],%o0
598 faddd $aloa,$nloa,$nloa
599 fmuld $nlo,$nc,$nloc
600 ldx [%sp+$bias+$frame+8],%o1
601 fmuld $alo,$bd,$alod
602 ldx [%sp+$bias+$frame+16],%o2
603 faddd $alob,$nlob,$nlob
604 fmuld $nlo,$nd,$nlod
605 ldx [%sp+$bias+$frame+24],%o3
606 fmuld $ahi,$ba,$ahia
607
608 srlx %o0,16,%o7
609 faddd $aloc,$nloc,$nloc
610 fmuld $nhi,$na,$nhia
611 add %o7,%o1,%o1
612 fmuld $ahi,$bb,$ahib
613 srlx %o1,16,%o7
614 faddd $alod,$nlod,$nlod
615 fmuld $nhi,$nb,$nhib
616 add %o7,%o2,%o2
617 fmuld $ahi,$bc,$ahic
618 srlx %o2,16,%o7
619 faddd $ahia,$nhia,$nhia
620 fmuld $nhi,$nc,$nhic
621 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
622 ! why?
623 and %o0,$mask,%o0
624 fmuld $ahi,$bd,$ahid
625 and %o1,$mask,%o1
626 and %o2,$mask,%o2
627 faddd $ahib,$nhib,$nhib
628 fmuld $nhi,$nd,$nhid
629 sllx %o1,16,%o1
630 faddd $dota,$nloa,$nloa
631 sllx %o2,32,%o2
632 faddd $dotb,$nlob,$nlob
633 sllx %o3,48,%o7
634 or %o1,%o0,%o0
635 faddd $ahic,$nhic,$dota ! $nhic
636 or %o2,%o0,%o0
637 faddd $ahid,$nhid,$dotb ! $nhid
638 or %o7,%o0,%o0 ! 64-bit result
639 ldx [$tp],%o7
640 faddd $nloc,$nhia,$nloc
641 addcc %o7,%o0,%o0
642 ! end-of-why?
643 faddd $nlod,$nhib,$nlod
644 srlx %o3,16,%g1 ! 34-bit carry
645 fdtox $nloa,$nloa
646 bcs,a %xcc,.+8
647 add %g1,1,%g1
648
649 fdtox $nlob,$nlob
650 fdtox $nloc,$nloc
651 fdtox $nlod,$nlod
652
653 std $nloa,[%sp+$bias+$frame+0]
654 std $nlob,[%sp+$bias+$frame+8]
655 addcc $j,8,$j
656 std $nloc,[%sp+$bias+$frame+16]
657 bz,pn %icc,.Linnerskip
658 std $nlod,[%sp+$bias+$frame+24]
659
660 ba .Linner
661 nop
662.align 32
663.Linner:
664 ldd [$ap_l+$j],$alo ! load a[j] in double format
665 ldd [$ap_h+$j],$ahi
666 ldd [$np_l+$j],$nlo ! load n[j] in double format
667 ldd [$np_h+$j],$nhi
668
669 fmuld $alo,$ba,$aloa
670 fmuld $nlo,$na,$nloa
671 fmuld $alo,$bb,$alob
672 fmuld $nlo,$nb,$nlob
673 fmuld $alo,$bc,$aloc
674 ldx [%sp+$bias+$frame+0],%o0
675 faddd $aloa,$nloa,$nloa
676 fmuld $nlo,$nc,$nloc
677 ldx [%sp+$bias+$frame+8],%o1
678 fmuld $alo,$bd,$alod
679 ldx [%sp+$bias+$frame+16],%o2
680 faddd $alob,$nlob,$nlob
681 fmuld $nlo,$nd,$nlod
682 ldx [%sp+$bias+$frame+24],%o3
683 fmuld $ahi,$ba,$ahia
684
685 srlx %o0,16,%o7
686 faddd $aloc,$nloc,$nloc
687 fmuld $nhi,$na,$nhia
688 add %o7,%o1,%o1
689 fmuld $ahi,$bb,$ahib
690 srlx %o1,16,%o7
691 faddd $alod,$nlod,$nlod
692 fmuld $nhi,$nb,$nhib
693 add %o7,%o2,%o2
694 fmuld $ahi,$bc,$ahic
695 srlx %o2,16,%o7
696 faddd $ahia,$nhia,$nhia
697 fmuld $nhi,$nc,$nhic
698 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
699 and %o0,$mask,%o0
700 fmuld $ahi,$bd,$ahid
701 and %o1,$mask,%o1
702 and %o2,$mask,%o2
703 faddd $ahib,$nhib,$nhib
704 fmuld $nhi,$nd,$nhid
705 sllx %o1,16,%o1
706 faddd $dota,$nloa,$nloa
707 sllx %o2,32,%o2
708 faddd $dotb,$nlob,$nlob
709 sllx %o3,48,%o7
710 or %o1,%o0,%o0
711 faddd $ahic,$nhic,$dota ! $nhic
712 or %o2,%o0,%o0
713 faddd $ahid,$nhid,$dotb ! $nhid
714 or %o7,%o0,%o0 ! 64-bit result
715 faddd $nloc,$nhia,$nloc
716 addcc %g1,%o0,%o0
717 ldx [$tp+8],%o7 ! tp[j]
718 faddd $nlod,$nhib,$nlod
719 srlx %o3,16,%g1 ! 34-bit carry
720 fdtox $nloa,$nloa
721 bcs,a %xcc,.+8
722 add %g1,1,%g1
723 fdtox $nlob,$nlob
724 addcc %o7,%o0,%o0
725 fdtox $nloc,$nloc
726 bcs,a %xcc,.+8
727 add %g1,1,%g1
728
729 stx %o0,[$tp] ! tp[j-1]
730 fdtox $nlod,$nlod
731
732 std $nloa,[%sp+$bias+$frame+0]
733 std $nlob,[%sp+$bias+$frame+8]
734 std $nloc,[%sp+$bias+$frame+16]
735 addcc $j,8,$j
736 std $nlod,[%sp+$bias+$frame+24]
737 bnz,pt %icc,.Linner
738 add $tp,8,$tp
739
740.Linnerskip:
741 fdtox $dota,$dota
742 fdtox $dotb,$dotb
743
744 ldx [%sp+$bias+$frame+0],%o0
745 ldx [%sp+$bias+$frame+8],%o1
746 ldx [%sp+$bias+$frame+16],%o2
747 ldx [%sp+$bias+$frame+24],%o3
748
749 srlx %o0,16,%o7
750 std $dota,[%sp+$bias+$frame+32]
751 add %o7,%o1,%o1
752 std $dotb,[%sp+$bias+$frame+40]
753 srlx %o1,16,%o7
754 add %o7,%o2,%o2
755 srlx %o2,16,%o7
756 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
757 and %o0,$mask,%o0
758 and %o1,$mask,%o1
759 and %o2,$mask,%o2
760 sllx %o1,16,%o1
761 sllx %o2,32,%o2
762 sllx %o3,48,%o7
763 or %o1,%o0,%o0
764 or %o2,%o0,%o0
765 ldx [%sp+$bias+$frame+32],%o4
766 or %o7,%o0,%o0 ! 64-bit result
767 ldx [%sp+$bias+$frame+40],%o5
768 addcc %g1,%o0,%o0
769 ldx [$tp+8],%o7 ! tp[j]
770 srlx %o3,16,%g1 ! 34-bit carry
771 bcs,a %xcc,.+8
772 add %g1,1,%g1
773
774 addcc %o7,%o0,%o0
775 bcs,a %xcc,.+8
776 add %g1,1,%g1
777
778 stx %o0,[$tp] ! tp[j-1]
779 add $tp,8,$tp
780
781 srlx %o4,16,%o7
782 add %o7,%o5,%o5
783 and %o4,$mask,%o4
784 sllx %o5,16,%o7
785 or %o7,%o4,%o4
786 addcc %g1,%o4,%o4
787 srlx %o5,48,%g1
788 bcs,a %xcc,.+8
789 add %g1,1,%g1
790
791 addcc $carry,%o4,%o4
792 stx %o4,[$tp] ! tp[num-1]
793 mov %g1,$carry
794 bcs,a %xcc,.+8
795 add $carry,1,$carry
796
797 addcc $i,8,$i
798 bnz %icc,.Louter
799 nop
800
801 add $tp,8,$tp ! adjust tp to point at the end
802 orn %g0,%g0,%g4
803 sub %g0,$num,%o7 ! n=-num
804 ba .Lsub
805 subcc %g0,%g0,%g0 ! clear %icc.c
806
807.align 32
808.Lsub:
809 ldx [$tp+%o7],%o0
810 add $np,%o7,%g1
811 ld [%g1+0],%o2
812 ld [%g1+4],%o3
813 srlx %o0,32,%o1
814 subccc %o0,%o2,%o2
815 add $rp,%o7,%g1
816 subccc %o1,%o3,%o3
817 st %o2,[%g1+0]
818 add %o7,8,%o7
819 brnz,pt %o7,.Lsub
820 st %o3,[%g1+4]
821 subc $carry,0,%g4
822 sub %g0,$num,%o7 ! n=-num
823 ba .Lcopy
824 nop
825
826.align 32
827.Lcopy:
828 ldx [$tp+%o7],%o0
829 add $rp,%o7,%g1
830 ld [%g1+0],%o2
831 ld [%g1+4],%o3
832 stx %g0,[$tp+%o7]
833 and %o0,%g4,%o0
834 srlx %o0,32,%o1
835 andn %o2,%g4,%o2
836 andn %o3,%g4,%o3
837 or %o2,%o0,%o0
838 or %o3,%o1,%o1
839 st %o0,[%g1+0]
840 add %o7,8,%o7
841 brnz,pt %o7,.Lcopy
842 st %o1,[%g1+4]
843 sub %g0,$num,%o7 ! n=-num
844
845.Lzap:
846 stx %g0,[$ap_l+%o7]
847 stx %g0,[$ap_h+%o7]
848 stx %g0,[$np_l+%o7]
849 stx %g0,[$np_h+%o7]
850 add %o7,8,%o7
851 brnz,pt %o7,.Lzap
852 nop
853
854 ldx [%sp+$bias+$frame+48],%o7
855 wr %g0,%o7,%asi ! restore %asi
856
857 mov 1,%i0
858.Lret:
859 ret
860 restore
861.type $fname,#function
862.size $fname,(.-$fname)
863___
864
865$code =~ s/\`([^\`]*)\`/eval($1)/gem;
866
867# The substitution below makes it possible to compile without demanding
868# VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
869# dare to do this, because VIS capability is detected at run-time now
870# and this routine is not called on a CPU that cannot execute it. Do
871# note that fzeros is not the only VIS dependency! Another dependency
872# is implicit and is just _a_ numerical value loaded into the %asi register,
873# which the assembler can't recognize as VIS-specific...
874$code =~ s/fzeros\s+%f([0-9]+)/
875 sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
876 /gem;
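# Editor's note (hedged): 0x81b00c20 is the instruction word for
# "fzeros %f0"; OR-ing the register number into bits 29:25 -- the SPARC
# rd field -- produces the encoding for any other destination register,
# which is exactly what the sprintf above emits.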
877
878print $code;
879# flush
880close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/via-mont.pl b/src/lib/libcrypto/bn/asm/via-mont.pl
deleted file mode 100644
index c046a514c8..0000000000
--- a/src/lib/libcrypto/bn/asm/via-mont.pl
+++ /dev/null
@@ -1,242 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# Wrapper around 'rep montmul', a VIA-specific instruction accessing the
11# PadLock Montgomery Multiplier. The wrapper is designed as a drop-in
12# replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9].
13#
14# Below are interleaved outputs from 'openssl speed rsa dsa' for 4
15# different software configurations on 1.5GHz VIA Esther processor.
16# Lines marked with "software integer" denote the performance of hand-
17# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2"
18# refers to the hand-coded SSE2 Montgomery multiplication procedure found
19# in OpenSSL 0.9.9. "Hardware VIA SDK" refers to the padlock_pmm routine from
20# Padlock SDK 2.0.1 available for download from VIA, which naturally
21# utilizes the magic 'repz montmul' instruction. And finally "hardware
22# this" refers to *this* implementation, which also uses 'repz montmul'.
23#
24# sign verify sign/s verify/s
25# rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer
26# rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2
27# rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK
28# rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this
29#
30# rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer
31# rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2
32# rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK
33# rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this
34#
35# rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer
36# rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2
37# rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK
38# rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this
39#
40# rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer
41# rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2
42# rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK
43# rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this
44#
45# dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer
46# dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2
47# dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK
48# dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this
49#
50# dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer
51# dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2
52# dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK
53# dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this
54#
55# dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer
56# dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2
57# dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK
58# dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this
59#
60# To give you some other reference point here is output for 2.4GHz P4
61# running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software
62# SSE2" in above terms.
63#
64# rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0
65# rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0
66# rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9
67# rsa 4096 bits 0.109770s 0.002379s 9.1 420.3
68# dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1
69# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0
70# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1
71#
72# Conclusions:
73# - VIA SDK leaves a *lot* of room for improvement (which this
74# implementation successfully fills:-);
75# - 'rep montmul' gives up to >3x performance improvement depending on
76# key length;
77# - in terms of absolute performance it delivers approximately as much
78# as modern out-of-order 32-bit cores [again, for longer keys].
79
80$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
81push(@INC,"${dir}","${dir}../../perlasm");
82require "x86asm.pl";
83
84&asm_init($ARGV[0],"via-mont.pl");
85
86# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
87$func="bn_mul_mont_padlock";
88
89$pad=16*1; # amount of reserved bytes on top of every vector
90
91# stack layout
92$mZeroPrime=&DWP(0,"esp"); # these are specified by VIA
93$A=&DWP(4,"esp");
94$B=&DWP(8,"esp");
95$T=&DWP(12,"esp");
96$M=&DWP(16,"esp");
97$scratch=&DWP(20,"esp");
98$rp=&DWP(24,"esp"); # these are mine
99$sp=&DWP(28,"esp");
100# &DWP(32,"esp") # 32 byte scratch area
101# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num]
102# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num]
103# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num]
104# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num]
105# Note that the SDK suggests unconditionally allocating 2K per vector. This
106# has quite an impact on performance. It naturally depends on key length,
107# but to give an example 1024 bit private RSA key operations suffer >30%
108# penalty. I allocate only as much as actually required...
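#
# Editor's note (hedged sketch, not original text): the allocation the
# code below performs, in scalar form; the helper name is illustrative.
sub padlock_frame_bytes {
	my ($num) = @_;			# modulus length in 32-bit words
	my $vec = 4*$num + $pad;	# one padded vector, in bytes
	return 64 + 4*$vec;		# scratch + tp/ap/bp/np copies
}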
109
110&function_begin($func);
111 &xor ("eax","eax");
112 &mov ("ecx",&wparam(5)); # num
113 # meet VIA's limitations for num [note that the specification
114 # expresses them in bits, while we work with amount of 32-bit words]
115 &test ("ecx",3);
116 &jnz (&label("leave")); # num % 4 != 0
117 &cmp ("ecx",8);
118 &jb (&label("leave")); # num < 8
119 &cmp ("ecx",1024);
120 &ja (&label("leave")); # num > 1024
121
122 &pushf ();
123 &cld ();
124
125 &mov ("edi",&wparam(0)); # rp
126 &mov ("eax",&wparam(1)); # ap
127 &mov ("ebx",&wparam(2)); # bp
128 &mov ("edx",&wparam(3)); # np
129 &mov ("esi",&wparam(4)); # n0
130 &mov ("esi",&DWP(0,"esi")); # *n0
131
132 &lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes
133 &lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes
134 &neg ("ebp");
135 &add ("ebp","esp");
136 &and ("ebp",-64); # align to cache-line
137 &xchg ("ebp","esp"); # alloca
138
139 &mov ($rp,"edi"); # save rp
140 &mov ($sp,"ebp"); # save esp
141
142 &mov ($mZeroPrime,"esi");
143 &lea ("esi",&DWP(64,"esp")); # tp
144 &mov ($T,"esi");
145 &lea ("edi",&DWP(32,"esp")); # scratch area
146 &mov ($scratch,"edi");
147 &mov ("esi","eax");
148
149 &lea ("ebp",&DWP(-$pad,"ecx"));
150 &shr ("ebp",2); # restore original num value in ebp
151
152 &xor ("eax","eax");
153
154 &mov ("ecx","ebp");
155 &lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
156 &data_byte(0xf3,0xab); # rep stosl, bzero
157
158 &mov ("ecx","ebp");
159 &lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
160 &mov ($A,"edi");
161 &data_byte(0xf3,0xa5); # rep movsl, memcpy
162 &mov ("ecx",$pad/4);
163 &data_byte(0xf3,0xab); # rep stosl, bzero pad
164 # edi points at the end of padded ap copy...
165
166 &mov ("ecx","ebp");
167 &mov ("esi","ebx");
168 &mov ($B,"edi");
169 &data_byte(0xf3,0xa5); # rep movsl, memcpy
170 &mov ("ecx",$pad/4);
171 &data_byte(0xf3,0xab); # rep stosl, bzero pad
172 # edi points at the end of padded bp copy...
173
174 &mov ("ecx","ebp");
175 &mov ("esi","edx");
176 &mov ($M,"edi");
177 &data_byte(0xf3,0xa5); # rep movsl, memcpy
178 &mov ("ecx",$pad/4);
179 &data_byte(0xf3,0xab); # rep stosl, bzero pad
180 # edi points at the end of padded np copy...
181
182 # let magic happen...
183 &mov ("ecx","ebp");
184 &mov ("esi","esp");
185 &shl ("ecx",5); # convert word counter to bit counter
186 &align (4);
187 &data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
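	# [hedged note, inferred from the setup above: %esi points at the
	#  parameter block (n0', A, B, T, M) at the base of the frame and
	#  %ecx holds the operand size in bits; the result is read back
	#  from the T vector below]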
188
189 &mov ("ecx","ebp");
190 &lea ("esi",&DWP(64,"esp")); # tp
191 # edi still points at the end of padded np copy...
192 &neg ("ebp");
193 &lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind"
194 &mov ("edi",$rp); # restore rp
195 &xor ("edx","edx"); # i=0 and clear CF
196
197&set_label("sub",8);
198 &mov ("eax",&DWP(0,"esi","edx",4));
199 &sbb ("eax",&DWP(0,"ebp","edx",4));
200 &mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i]
201 &lea ("edx",&DWP(1,"edx")); # i++
202 &loop (&label("sub")); # doesn't affect CF!
203
204 &mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit
205 &sbb ("eax",0);
206 &and ("esi","eax");
207 &not ("eax");
208 &mov ("ebp","edi");
209 &and ("ebp","eax");
210 &or ("esi","ebp"); # tp=carry?tp:rp
211
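	# The mask trick above is a branchless select: %eax is all-ones
	# when the subtraction borrowed (keep tp) and zero when it went
	# through (take rp).  A hedged sketch of the same idiom, with a
	# hypothetical helper name:
	sub cond_select { my ($mask,$tp,$rp)=@_; ($tp & $mask) | ($rp & ~$mask) }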
212 &mov ("ecx","edx"); # num
213 &xor ("edx","edx"); # i=0
214
215&set_label("copy",8);
216 &mov ("eax",&DWP(0,"esi","edx",4));
217 &mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp
218 &mov (&DWP(0,"edi","edx",4),"eax");
219 &lea ("edx",&DWP(1,"edx")); # i++
220 &loop (&label("copy"));
221
222 &mov ("ebp",$sp);
223 &xor ("eax","eax");
224
225 &mov ("ecx",64/4);
226 &mov ("edi","esp"); # zap frame including scratch area
227 &data_byte(0xf3,0xab); # rep stosl, bzero
228
229 # zap copies of ap, bp and np
230 &lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
231 &lea ("ecx",&DWP(3*$pad/4,"edx","edx",2));
232 &data_byte(0xf3,0xab); # rep stosl, bzero
233
234 &mov ("esp","ebp");
235 &inc ("eax"); # signal "done"
236 &popf ();
237&set_label("leave");
238&function_end($func);
239
240&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
241
242&asm_finish();
diff --git a/src/lib/libcrypto/cast/asm/cast-586.pl b/src/lib/libcrypto/cast/asm/cast-586.pl
deleted file mode 100644
index 7a0083ecb8..0000000000
--- a/src/lib/libcrypto/cast/asm/cast-586.pl
+++ /dev/null
@@ -1,177 +0,0 @@
1#!/usr/local/bin/perl
2
3# define for a Pentium Pro friendly version
4$ppro=1;
5
6$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
7push(@INC,"${dir}","${dir}../../perlasm");
8require "x86asm.pl";
9require "cbc.pl";
10
11&asm_init($ARGV[0],"cast-586.pl",$ARGV[$#ARGV] eq "386");
12
13$CAST_ROUNDS=16;
14$L="edi";
15$R="esi";
16$K="ebp";
17$tmp1="ecx";
18$tmp2="ebx";
19$tmp3="eax";
20$tmp4="edx";
21$S1="CAST_S_table0";
22$S2="CAST_S_table1";
23$S3="CAST_S_table2";
24$S4="CAST_S_table3";
25
26@F1=("add","xor","sub");
27@F2=("xor","sub","add");
28@F3=("sub","add","xor");
29
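# Hedged reference (illustrative only, hypothetical name) for the three
# CAST-128 round types that @F1..@F3 select; E_CAST below implements
# the same dataflow, and the byte-to-S-box mapping here mirrors its
# assembler sequence:
sub cast_f_ref {
	my ($R,$Km,$Kr,$S,$op1,$op2,$op3)=@_;	# $S = [\@S1,\@S2,\@S3,\@S4]
	my %op=(add=>sub { ($_[0]+$_[1]) & 0xffffffff },
		sub=>sub { ($_[0]-$_[1]) & 0xffffffff },
		xor=>sub {  $_[0] ^ $_[1] });
	my $t=$op{$op1}->($Km,$R);			# t = Km op1 R
	$t=(($t<<$Kr)|($t>>((32-$Kr)&31))) & 0xffffffff;# t <<<= Kr
	my @b=(($t>>8)&0xff, $t&0xff, ($t>>24)&0xff, ($t>>16)&0xff);
	return $op{$op1}->($op{$op3}->($op{$op2}->(
		$S->[0][$b[0]], $S->[1][$b[1]]), $S->[2][$b[2]]), $S->[3][$b[3]]);
}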
30&CAST_encrypt("CAST_encrypt",1);
31&CAST_encrypt("CAST_decrypt",0);
32&cbc("CAST_cbc_encrypt","CAST_encrypt","CAST_decrypt",1,4,5,3,-1,-1) unless $main'openbsd;
33
34&asm_finish();
35
36sub CAST_encrypt {
37 local($name,$enc)=@_;
38
39 local($win_ex)=<<"EOF";
40EXTERN _CAST_S_table0:DWORD
41EXTERN _CAST_S_table1:DWORD
42EXTERN _CAST_S_table2:DWORD
43EXTERN _CAST_S_table3:DWORD
44EOF
45 &main::external_label(
46 "CAST_S_table0",
47 "CAST_S_table1",
48 "CAST_S_table2",
49 "CAST_S_table3",
50 );
51
52 &function_begin_B($name,$win_ex);
53
54 &comment("");
55
56 &push("ebp");
57 &push("ebx");
58 &mov($tmp2,&wparam(0));
59 &mov($K,&wparam(1));
60 &push("esi");
61 &push("edi");
62
63 &comment("Load the 2 words");
64 &mov($L,&DWP(0,$tmp2,"",0));
65 &mov($R,&DWP(4,$tmp2,"",0));
66
67 &comment('Get short key flag');
68 &mov($tmp3,&DWP(128,$K,"",0));
69 if($enc) {
70 &push($tmp3);
71 } else {
72 &or($tmp3,$tmp3);
73 &jnz(&label('cast_dec_skip'));
74 }
75
76 &xor($tmp3, $tmp3);
77
78 # encrypting part
79
80 if ($enc) {
81 &E_CAST( 0,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
82 &E_CAST( 1,$S,$R,$L,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
83 &E_CAST( 2,$S,$L,$R,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
84 &E_CAST( 3,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
85 &E_CAST( 4,$S,$L,$R,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
86 &E_CAST( 5,$S,$R,$L,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
87 &E_CAST( 6,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
88 &E_CAST( 7,$S,$R,$L,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
89 &E_CAST( 8,$S,$L,$R,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
90 &E_CAST( 9,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
91 &E_CAST(10,$S,$L,$R,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
92 &E_CAST(11,$S,$R,$L,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
93 &comment('test short key flag');
94 &pop($tmp4);
95 &or($tmp4,$tmp4);
96 &jnz(&label('cast_enc_done'));
97 &E_CAST(12,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
98 &E_CAST(13,$S,$R,$L,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
99 &E_CAST(14,$S,$L,$R,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
100 &E_CAST(15,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
101 } else {
102 &E_CAST(15,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
103 &E_CAST(14,$S,$R,$L,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
104 &E_CAST(13,$S,$L,$R,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
105 &E_CAST(12,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
106 &set_label('cast_dec_skip');
107 &E_CAST(11,$S,$L,$R,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
108 &E_CAST(10,$S,$R,$L,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
109 &E_CAST( 9,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
110 &E_CAST( 8,$S,$R,$L,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
111 &E_CAST( 7,$S,$L,$R,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
112 &E_CAST( 6,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
113 &E_CAST( 5,$S,$L,$R,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
114 &E_CAST( 4,$S,$R,$L,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
115 &E_CAST( 3,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
116 &E_CAST( 2,$S,$R,$L,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4);
117 &E_CAST( 1,$S,$L,$R,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4);
118 &E_CAST( 0,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4);
119 }
120
121 &set_label('cast_enc_done') if $enc;
122# Why the nop? - Ben 17/1/99
123 &nop();
124 &mov($tmp3,&wparam(0));
125 &mov(&DWP(4,$tmp3,"",0),$L);
126 &mov(&DWP(0,$tmp3,"",0),$R);
127 &function_end($name);
128}
129
130sub E_CAST {
131 local($i,$S,$L,$R,$K,$OP1,$OP2,$OP3,$tmp1,$tmp2,$tmp3,$tmp4)=@_;
132 # Ri needs to have 16 pre added.
133
134 &comment("round $i");
135 &mov( $tmp4, &DWP($i*8,$K,"",1));
136
137 &mov( $tmp1, &DWP($i*8+4,$K,"",1));
138 &$OP1( $tmp4, $R);
139
140 &rotl( $tmp4, &LB($tmp1));
141
142 if ($ppro) {
143 &mov( $tmp2, $tmp4); # B
144 &xor( $tmp1, $tmp1);
145
146 &movb( &LB($tmp1), &HB($tmp4)); # A
147 &and( $tmp2, 0xff);
148
149 &shr( $tmp4, 16); #
150 &xor( $tmp3, $tmp3);
151 } else {
152 &mov( $tmp2, $tmp4); # B
153 &movb( &LB($tmp1), &HB($tmp4)); # A # BAD BAD BAD
154
155 &shr( $tmp4, 16); #
156 &and( $tmp2, 0xff);
157 }
158
159 &movb( &LB($tmp3), &HB($tmp4)); # C # BAD BAD BAD
160 &and( $tmp4, 0xff); # D
161
162 &mov( $tmp1, &DWP($S1,"",$tmp1,4));
163 &mov( $tmp2, &DWP($S2,"",$tmp2,4));
164
165 &$OP2( $tmp1, $tmp2);
166 &mov( $tmp2, &DWP($S3,"",$tmp3,4));
167
168 &$OP3( $tmp1, $tmp2);
169 &mov( $tmp2, &DWP($S4,"",$tmp4,4));
170
171 &$OP1( $tmp1, $tmp2);
172 # XXX
173
174 &xor( $L, $tmp1);
175 # XXX
176}
177
diff --git a/src/lib/libcrypto/des/asm/crypt586.pl b/src/lib/libcrypto/des/asm/crypt586.pl
deleted file mode 100644
index e36f7d44bd..0000000000
--- a/src/lib/libcrypto/des/asm/crypt586.pl
+++ /dev/null
@@ -1,209 +0,0 @@
1#!/usr/local/bin/perl
2#
3# The inner loop instruction sequence and the IP/FP modifications are from
4# Svend Olaf Mikkelsen <svolaf@inet.uni-c.dk>
5# I've added the stuff needed for crypt() but I've not worried about making
6# things perfect.
7#
8
9$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
10push(@INC,"${dir}","${dir}../../perlasm");
11require "x86asm.pl";
12
13&asm_init($ARGV[0],"crypt586.pl");
14
15$L="edi";
16$R="esi";
17
18&external_label("DES_SPtrans");
19&fcrypt_body("fcrypt_body");
20&asm_finish();
21
22sub fcrypt_body
23 {
24 local($name,$do_ip)=@_;
25
26 &function_begin($name);
27
28 &comment("");
29 &comment("Load the 2 words");
30 $trans="ebp";
31
32 &xor( $L, $L);
33 &xor( $R, $R);
34
35 # PIC-ification:-)
36 &picmeup("edx","DES_SPtrans");
37 #if ($cpp) { &picmeup("edx","DES_SPtrans"); }
38 #else { &lea("edx",&DWP("DES_SPtrans")); }
39 &push("edx"); # becomes &swtmp(1)
40 #
41 &mov($trans,&wparam(1)); # reloaded with DES_SPtrans in D_ENCRYPT
42
43	&push(&DWC(25));	# loop counter: crypt() applies DES 25 times
44
45 &set_label("start");
46 for ($i=0; $i<16; $i+=2)
47 {
48 &comment("");
49 &comment("Round $i");
50 &D_ENCRYPT($i,$L,$R,$i*2,$trans,"eax","ebx","ecx","edx");
51
52 &comment("");
53 &comment("Round ".sprintf("%d",$i+1));
54 &D_ENCRYPT($i+1,$R,$L,($i+1)*2,$trans,"eax","ebx","ecx","edx");
55 }
56 &mov("ebx", &swtmp(0));
57 &mov("eax", $L);
58 &dec("ebx");
59 &mov($L, $R);
60 &mov($R, "eax");
61 &mov(&swtmp(0), "ebx");
62 &jnz(&label("start"));
63
64 &comment("");
65 &comment("FP");
66 &mov("edx",&wparam(0));
67
68 &FP_new($R,$L,"eax",3);
69 &mov(&DWP(0,"edx","",0),"eax");
70 &mov(&DWP(4,"edx","",0),$L);
71
72 &add("esp",8); # remove variables
73
74 &function_end($name);
75 }
76
77sub D_ENCRYPT
78 {
79 local($r,$L,$R,$S,$trans,$u,$tmp1,$tmp2,$t)=@_;
80
81 &mov( $u, &wparam(2)); # 2
82 &mov( $t, $R);
83 &shr( $t, 16); # 1
84 &mov( $tmp2, &wparam(3)); # 2
85 &xor( $t, $R); # 1
86
87 &and( $u, $t); # 2
88 &and( $t, $tmp2); # 2
89
90 &mov( $tmp1, $u);
91 &shl( $tmp1, 16); # 1
92 &mov( $tmp2, $t);
93 &shl( $tmp2, 16); # 1
94 &xor( $u, $tmp1); # 2
95 &xor( $t, $tmp2); # 2
96 &mov( $tmp1, &DWP(&n2a($S*4),$trans,"",0)); # 2
97 &xor( $u, $tmp1);
98 &mov( $tmp2, &DWP(&n2a(($S+1)*4),$trans,"",0)); # 2
99 &xor( $u, $R);
100 &xor( $t, $R);
101 &xor( $t, $tmp2);
102
103 &and( $u, "0xfcfcfcfc" ); # 2
104 &xor( $tmp1, $tmp1); # 1
105 &and( $t, "0xcfcfcfcf" ); # 2
106 &xor( $tmp2, $tmp2);
107 &movb( &LB($tmp1), &LB($u) );
108 &movb( &LB($tmp2), &HB($u) );
109 &rotr( $t, 4 );
110 &mov( $trans, &swtmp(1));
111 &xor( $L, &DWP(" ",$trans,$tmp1,0));
112 &movb( &LB($tmp1), &LB($t) );
113 &xor( $L, &DWP("0x200",$trans,$tmp2,0));
114 &movb( &LB($tmp2), &HB($t) );
115 &shr( $u, 16);
116 &xor( $L, &DWP("0x100",$trans,$tmp1,0));
117 &movb( &LB($tmp1), &HB($u) );
118 &shr( $t, 16);
119 &xor( $L, &DWP("0x300",$trans,$tmp2,0));
120 &movb( &LB($tmp2), &HB($t) );
121 &and( $u, "0xff" );
122 &and( $t, "0xff" );
123 &mov( $tmp1, &DWP("0x600",$trans,$tmp1,0));
124 &xor( $L, $tmp1);
125 &mov( $tmp1, &DWP("0x700",$trans,$tmp2,0));
126 &xor( $L, $tmp1);
127 &mov( $tmp1, &DWP("0x400",$trans,$u,0));
128 &xor( $L, $tmp1);
129 &mov( $tmp1, &DWP("0x500",$trans,$t,0));
130 &xor( $L, $tmp1);
131 &mov( $trans, &wparam(1));
132 }
133
134sub n2a
135 {
136 sprintf("%d",$_[0]);
137 }
138
139# now has a side effect of rotating $a by $shift
140sub R_PERM_OP
141 {
142 local($a,$b,$tt,$shift,$mask,$last)=@_;
143
144 &rotl( $a, $shift ) if ($shift != 0);
145 &mov( $tt, $a );
146 &xor( $a, $b );
147 &and( $a, $mask );
148	if ($last eq $b)
149 {
150 &xor( $b, $a );
151 &xor( $tt, $a );
152 }
153 else
154 {
155 &xor( $tt, $a );
156 &xor( $b, $a );
157 }
158 &comment("");
159 }
160
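# R_PERM_OP above is a rotate-based variant of the classic DES PERM_OP
# bit-swap; for orientation, a hedged Perl reference (hypothetical name)
# of the plain, unrotated operation, which exchanges the bits selected
# by $m between $b and $a at offset $n:
sub perm_op_ref {
	my ($a,$b,$n,$m)=@_;
	my $t=(($a>>$n) ^ $b) & $m;	# bits that differ, at $b's offset
	return (($a ^ ($t<<$n)) & 0xffffffff, $b ^ $t);
}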
161sub IP_new
162 {
163 local($l,$r,$tt,$lr)=@_;
164
165 &R_PERM_OP($l,$r,$tt, 4,"0xf0f0f0f0",$l);
166 &R_PERM_OP($r,$tt,$l,20,"0xfff0000f",$l);
167 &R_PERM_OP($l,$tt,$r,14,"0x33333333",$r);
168 &R_PERM_OP($tt,$r,$l,22,"0x03fc03fc",$r);
169 &R_PERM_OP($l,$r,$tt, 9,"0xaaaaaaaa",$r);
170
171 if ($lr != 3)
172 {
173 if (($lr-3) < 0)
174 { &rotr($tt, 3-$lr); }
175 else { &rotl($tt, $lr-3); }
176 }
177 if ($lr != 2)
178 {
179 if (($lr-2) < 0)
180 { &rotr($r, 2-$lr); }
181 else { &rotl($r, $lr-2); }
182 }
183 }
184
185sub FP_new
186 {
187 local($l,$r,$tt,$lr)=@_;
188
189 if ($lr != 2)
190 {
191 if (($lr-2) < 0)
192 { &rotl($r, 2-$lr); }
193 else { &rotr($r, $lr-2); }
194 }
195 if ($lr != 3)
196 {
197 if (($lr-3) < 0)
198 { &rotl($l, 3-$lr); }
199 else { &rotr($l, $lr-3); }
200 }
201
202 &R_PERM_OP($l,$r,$tt, 0,"0xaaaaaaaa",$r);
203 &R_PERM_OP($tt,$r,$l,23,"0x03fc03fc",$r);
204 &R_PERM_OP($l,$r,$tt,10,"0x33333333",$l);
205 &R_PERM_OP($r,$tt,$l,18,"0xfff0000f",$l);
206 &R_PERM_OP($l,$tt,$r,12,"0xf0f0f0f0",$r);
207 &rotr($tt , 4);
208 }
209
diff --git a/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl b/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl
deleted file mode 100644
index 8e7674e9d2..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl
+++ /dev/null
@@ -1,608 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# January 2009
11#
12# Provided that UltraSPARC VIS instructions are pipe-lined(*) and
13# pairable(*) with IALU ones, offloading Xupdate to the UltraSPARC
14# Graphics Unit makes it possible to achieve higher instruction-level
15# parallelism, ILP, and thus higher performance. It should be
16# explicitly noted that ILP is the keyword, which means that this
17# code is unsuitable for cores like UltraSPARC-Tx. The idea is
18# not really novel; Sun has had a VIS-powered implementation for a while.
19# Unlike Sun's implementation this one can process multiple unaligned
20# input blocks, and as such works as a drop-in replacement for OpenSSL
21# sha1_block_data_order. Performance improvement was measured to be
22# 40% over pure IALU sha1-sparcv9.pl on UltraSPARC-IIi, but 12% on
23# UltraSPARC-III. See below for discussion...
24#
25# The module is not of direct interest to OpenSSL, because it doesn't
26# provide better performance on contemporary SPARCv9 CPUs,
27# UltraSPARC-Tx and SPARC64-V[II] to be specific. Those who feel they
28# absolutely must score on UltraSPARC-I-IV can simply replace
29# crypto/sha/asm/sha1-sparcv9.pl with this module.
30#
31# (*) "Pipe-lined" means that even if it takes several cycles to
32# complete, next instruction using same functional unit [but not
33# depending on the result of the current instruction] can start
34# execution without having to wait for the unit. "Pairable"
35# means that two [or more] independent instructions can be
36# issued at the very same time.
37
38$bits=32;
39for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
40if ($bits==64) { $bias=2047; $frame=192; }
41else { $bias=0; $frame=112; }
42
43$output=shift;
44open STDOUT,">$output";
45
46$ctx="%i0";
47$inp="%i1";
48$len="%i2";
49$tmp0="%i3";
50$tmp1="%i4";
51$tmp2="%i5";
52$tmp3="%g5";
53
54$base="%g1";
55$align="%g4";
56$Xfer="%o5";
57$nXfer=$tmp3;
58$Xi="%o7";
59
60$A="%l0";
61$B="%l1";
62$C="%l2";
63$D="%l3";
64$E="%l4";
65@V=($A,$B,$C,$D,$E);
66
67$Actx="%o0";
68$Bctx="%o1";
69$Cctx="%o2";
70$Dctx="%o3";
71$Ectx="%o4";
72
73$fmul="%f32";
74$VK_00_19="%f34";
75$VK_20_39="%f36";
76$VK_40_59="%f38";
77$VK_60_79="%f40";
78@VK=($VK_00_19,$VK_20_39,$VK_40_59,$VK_60_79);
79@X=("%f0", "%f1", "%f2", "%f3", "%f4", "%f5", "%f6", "%f7",
80 "%f8", "%f9","%f10","%f11","%f12","%f13","%f14","%f15","%f16");
81
82# This is reference 2x-parallelized VIS-powered Xupdate procedure. It
83# covers even K_NN_MM addition...
84sub Xupdate {
85my ($i)=@_;
86my $K=@VK[($i+16)/20];
87my $j=($i+16)%16;
88
89# [ provided that GSR.alignaddr_offset is 5, $mul contains
90# 0x100ULL<<32|0x100 value and K_NN_MM are pre-loaded to
91# chosen registers... ]
92$code.=<<___;
93 fxors @X[($j+13)%16],@X[$j],@X[$j] !-1/-1/-1:X[0]^=X[13]
94 fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
95 fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
96 fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
97 faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
98 fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
99 fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
100 ![fxors %f15,%f2,%f2]
101 for %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
102 ![fxors %f0,%f3,%f3] !10/17/12:X[0] dependency
103 fpadd32 $K,@X[$j],%f20
104 std %f20,[$Xfer+`4*$j`]
105___
106# The numbers delimited with slashes are the earliest possible dispatch
107# cycles for a given instruction, assuming 1-cycle latency for simple VIS
108# instructions (as on UltraSPARC-I&II), 3-cycle latency (as on
109# UltraSPARC-III&IV), and 2-cycle latency(*), respectively. Being
110# 2x-parallelized the procedure is "worth" 5, 8.5 or 6 ticks per SHA1
111# round. As [long as] FPU/VIS instructions are perfectly pairable with
112# IALU ones, the round timing is defined by the maximum between VIS
113# and IALU timings. The latter varies from round to round and averages
114# out at 6.25 ticks. This means that USI&II should operate at the IALU
115# rate, while USIII&IV at the VIS rate. This explains why the performance
116# improvement varies among processors: the pure IALU
117# sha1-sparcv9.pl module exhibits virtually uniform performance of
118# ~9.3 cycles per SHA1 round. Timings mentioned above are theoretical
119# lower limits. Real-life performance was measured to be 6.6 cycles
120# per SHA1 round on USIIi and 8.3 on USIII. The latter is lower than
121# half-round VIS timing, because there are 16 Xupdate-free rounds,
122# which "push down" average theoretical timing to 8 cycles...
123
124# (*) SPARC64-V[II] was originally believed to have 2 cycles VIS
125# latency. Well, it might have, but it doesn't have dedicated
126# VIS-unit. Instead, VIS instructions are executed by other
127# functional units, ones used here - by IALU. This doesn't
128# improve effective ILP...
129}
130
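# For reference, the scalar recurrence the procedure above vectorizes
# two-at-a-time, as a hedged Perl sketch (SHA-1 message schedule, with
# indices already reduced mod 16 as in the comments above; the name is
# hypothetical):
sub Xupdate_ref {
	my @X=@_;					# sixteen most recent words
	my $t=$X[0] ^ $X[2] ^ $X[8] ^ $X[13];
	return ((($t<<1)|($t>>31)) & 0xffffffff);	# next word, rotated left by 1
}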
131# The reference Xupdate procedure is then "strained" over *pairs* of
132# BODY_NN_MM and kind of modulo-scheduled with respect to X[n]^=X[n+13]
133# and K_NN_MM addition. It's "running" 15 rounds ahead, which leaves
134# plenty of room to amortize for read-after-write hazard, as well as
135# to fetch and align input for the next spin. The VIS instructions are
136# scheduled for latency of 2 cycles, because there are not enough IALU
137# instructions to schedule for latency of 3, while scheduling for 1
138# would give no gain on USI&II anyway.
139
140sub BODY_00_19 {
141my ($i,$a,$b,$c,$d,$e)=@_;
142my $j=$i&~1;
143my $k=($j+16+2)%16; # ahead reference
144my $l=($j+16-2)%16; # behind reference
145my $K=@VK[($j+16-2)/20];
146
147$j=($j+16)%16;
148
149$code.=<<___ if (!($i&1));
150 sll $a,5,$tmp0 !! $i
151 and $c,$b,$tmp3
152 ld [$Xfer+`4*($i%16)`],$Xi
153 fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
154 srl $a,27,$tmp1
155 add $tmp0,$e,$e
156 fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
157 sll $b,30,$tmp2
158 add $tmp1,$e,$e
159 andn $d,$b,$tmp1
160 add $Xi,$e,$e
161 fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
162 srl $b,2,$b
163 or $tmp1,$tmp3,$tmp1
164 or $tmp2,$b,$b
165 add $tmp1,$e,$e
166 faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
167___
168$code.=<<___ if ($i&1);
169 sll $a,5,$tmp0 !! $i
170 and $c,$b,$tmp3
171 ld [$Xfer+`4*($i%16)`],$Xi
172 fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
173 srl $a,27,$tmp1
174 add $tmp0,$e,$e
175 fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
176 sll $b,30,$tmp2
177 add $tmp1,$e,$e
178 fpadd32 $K,@X[$l],%f20 !
179 andn $d,$b,$tmp1
180 add $Xi,$e,$e
181 fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
182 srl $b,2,$b
183 or $tmp1,$tmp3,$tmp1
184 fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
185 or $tmp2,$b,$b
186 add $tmp1,$e,$e
187___
188$code.=<<___ if ($i&1 && $i>=2);
189 std %f20,[$Xfer+`4*$l`] !
190___
191}
192
193sub BODY_20_39 {
194my ($i,$a,$b,$c,$d,$e)=@_;
195my $j=$i&~1;
196my $k=($j+16+2)%16; # ahead reference
197my $l=($j+16-2)%16; # behind reference
198my $K=@VK[($j+16-2)/20];
199
200$j=($j+16)%16;
201
202$code.=<<___ if (!($i&1) && $i<64);
203 sll $a,5,$tmp0 !! $i
204 ld [$Xfer+`4*($i%16)`],$Xi
205 fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
206 srl $a,27,$tmp1
207 add $tmp0,$e,$e
208 fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
209 xor $c,$b,$tmp0
210 add $tmp1,$e,$e
211 sll $b,30,$tmp2
212 xor $d,$tmp0,$tmp1
213 fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
214 srl $b,2,$b
215 add $tmp1,$e,$e
216 or $tmp2,$b,$b
217 add $Xi,$e,$e
218 faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
219___
220$code.=<<___ if ($i&1 && $i<64);
221 sll $a,5,$tmp0 !! $i
222 ld [$Xfer+`4*($i%16)`],$Xi
223 fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
224 srl $a,27,$tmp1
225 add $tmp0,$e,$e
226 fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
227 xor $c,$b,$tmp0
228 add $tmp1,$e,$e
229 fpadd32 $K,@X[$l],%f20 !
230 sll $b,30,$tmp2
231 xor $d,$tmp0,$tmp1
232 fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
233 srl $b,2,$b
234 add $tmp1,$e,$e
235 fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
236 or $tmp2,$b,$b
237 add $Xi,$e,$e
238 std %f20,[$Xfer+`4*$l`] !
239___
240$code.=<<___ if ($i==64);
241 sll $a,5,$tmp0 !! $i
242 ld [$Xfer+`4*($i%16)`],$Xi
243 fpadd32 $K,@X[$l],%f20
244 srl $a,27,$tmp1
245 add $tmp0,$e,$e
246 xor $c,$b,$tmp0
247 add $tmp1,$e,$e
248 sll $b,30,$tmp2
249 xor $d,$tmp0,$tmp1
250 std %f20,[$Xfer+`4*$l`]
251 srl $b,2,$b
252 add $tmp1,$e,$e
253 or $tmp2,$b,$b
254 add $Xi,$e,$e
255___
256$code.=<<___ if ($i>64);
257 sll $a,5,$tmp0 !! $i
258 ld [$Xfer+`4*($i%16)`],$Xi
259 srl $a,27,$tmp1
260 add $tmp0,$e,$e
261 xor $c,$b,$tmp0
262 add $tmp1,$e,$e
263 sll $b,30,$tmp2
264 xor $d,$tmp0,$tmp1
265 srl $b,2,$b
266 add $tmp1,$e,$e
267 or $tmp2,$b,$b
268 add $Xi,$e,$e
269___
270}
271
272sub BODY_40_59 {
273my ($i,$a,$b,$c,$d,$e)=@_;
274my $j=$i&~1;
275my $k=($j+16+2)%16; # ahead reference
276my $l=($j+16-2)%16; # behind reference
277my $K=@VK[($j+16-2)/20];
278
279$j=($j+16)%16;
280
281$code.=<<___ if (!($i&1));
282 sll $a,5,$tmp0 !! $i
283 ld [$Xfer+`4*($i%16)`],$Xi
284 fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
285 srl $a,27,$tmp1
286 add $tmp0,$e,$e
287 fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
288 and $c,$b,$tmp0
289 add $tmp1,$e,$e
290 sll $b,30,$tmp2
291 or $c,$b,$tmp1
292 fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
293 srl $b,2,$b
294 and $d,$tmp1,$tmp1
295 add $Xi,$e,$e
296 or $tmp1,$tmp0,$tmp1
297 faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
298 or $tmp2,$b,$b
299 add $tmp1,$e,$e
300 fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
301___
302$code.=<<___ if ($i&1);
303 sll $a,5,$tmp0 !! $i
304 ld [$Xfer+`4*($i%16)`],$Xi
305 srl $a,27,$tmp1
306 add $tmp0,$e,$e
307 fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
308 and $c,$b,$tmp0
309 add $tmp1,$e,$e
310 fpadd32 $K,@X[$l],%f20 !
311 sll $b,30,$tmp2
312 or $c,$b,$tmp1
313 fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
314 srl $b,2,$b
315 and $d,$tmp1,$tmp1
316 fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
317 add $Xi,$e,$e
318 or $tmp1,$tmp0,$tmp1
319 or $tmp2,$b,$b
320 add $tmp1,$e,$e
321 std %f20,[$Xfer+`4*$l`] !
322___
323}
324
325# If there is more data to process, we pre-fetch the data for the
326# next iteration during the last ten rounds...
327sub BODY_70_79 {
328my ($i,$a,$b,$c,$d,$e)=@_;
329my $j=$i&~1;
330my $m=($i%8)*2;
331
332$j=($j+16)%16;
333
334$code.=<<___ if ($i==70);
335 sll $a,5,$tmp0 !! $i
336 ld [$Xfer+`4*($i%16)`],$Xi
337 srl $a,27,$tmp1
338 add $tmp0,$e,$e
339 ldd [$inp+64],@X[0]
340 xor $c,$b,$tmp0
341 add $tmp1,$e,$e
342 sll $b,30,$tmp2
343 xor $d,$tmp0,$tmp1
344 srl $b,2,$b
345 add $tmp1,$e,$e
346 or $tmp2,$b,$b
347 add $Xi,$e,$e
348
349 and $inp,-64,$nXfer
350 inc 64,$inp
351 and $nXfer,255,$nXfer
352 alignaddr %g0,$align,%g0
353 add $base,$nXfer,$nXfer
354___
355$code.=<<___ if ($i==71);
356 sll $a,5,$tmp0 !! $i
357 ld [$Xfer+`4*($i%16)`],$Xi
358 srl $a,27,$tmp1
359 add $tmp0,$e,$e
360 xor $c,$b,$tmp0
361 add $tmp1,$e,$e
362 sll $b,30,$tmp2
363 xor $d,$tmp0,$tmp1
364 srl $b,2,$b
365 add $tmp1,$e,$e
366 or $tmp2,$b,$b
367 add $Xi,$e,$e
368___
369$code.=<<___ if ($i>=72);
370 faligndata @X[$m],@X[$m+2],@X[$m]
371 sll $a,5,$tmp0 !! $i
372 ld [$Xfer+`4*($i%16)`],$Xi
373 srl $a,27,$tmp1
374 add $tmp0,$e,$e
375 xor $c,$b,$tmp0
376 add $tmp1,$e,$e
377 fpadd32 $VK_00_19,@X[$m],%f20
378 sll $b,30,$tmp2
379 xor $d,$tmp0,$tmp1
380 srl $b,2,$b
381 add $tmp1,$e,$e
382 or $tmp2,$b,$b
383 add $Xi,$e,$e
384___
385$code.=<<___ if ($i<77);
386 ldd [$inp+`8*($i+1-70)`],@X[2*($i+1-70)]
387___
388$code.=<<___ if ($i==77); # redundant if $inp was aligned
389 add $align,63,$tmp0
390 and $tmp0,-8,$tmp0
391 ldd [$inp+$tmp0],@X[16]
392___
393$code.=<<___ if ($i>=72);
394 std %f20,[$nXfer+`4*$m`]
395___
396}
397
398$code.=<<___;
399.section ".rodata",#alloc
400
401.align 64
402vis_const:
403.long 0x5a827999,0x5a827999 ! K_00_19
404.long 0x6ed9eba1,0x6ed9eba1 ! K_20_39
405.long 0x8f1bbcdc,0x8f1bbcdc ! K_40_59
406.long 0xca62c1d6,0xca62c1d6 ! K_60_79
407.long 0x00000100,0x00000100
408.align 64
409.type vis_const,#object
410.size vis_const,(.-vis_const)
411
412.section ".text",#alloc,#execinstr
413.globl sha1_block_data_order
414sha1_block_data_order:
415 save %sp,-$frame,%sp
416 add %fp,$bias-256,$base
417
418#ifdef __PIC__
419 sethi %hi(_GLOBAL_OFFSET_TABLE_-4), %o5
420 rd %pc, %o4
421 or %o5, %lo(_GLOBAL_OFFSET_TABLE_+4), %o5
422 add %o5, %o4, %o5
423 set vis_const, %o4
424 ldx [%o4+%o5], %o4
425#else
426 set vis_const, %o4
427#endif
428
429	ldd	[%o4+0],$VK_00_19	! vis_const address is in %o4
430	ldd	[%o4+8],$VK_20_39
431	ldd	[%o4+16],$VK_40_59
432	ldd	[%o4+24],$VK_60_79
433	ldd	[%o4+32],$fmul
434
435 ld [$ctx+0],$Actx
436 and $base,-256,$base
437 ld [$ctx+4],$Bctx
438 sub $base,$bias+$frame,%sp
439 ld [$ctx+8],$Cctx
440 and $inp,7,$align
441 ld [$ctx+12],$Dctx
442 and $inp,-8,$inp
443 ld [$ctx+16],$Ectx
444
445 ! X[16] is maintained in FP register bank
446 alignaddr %g0,$align,%g0
447 ldd [$inp+0],@X[0]
448 sub $inp,-64,$Xfer
449 ldd [$inp+8],@X[2]
450 and $Xfer,-64,$Xfer
451 ldd [$inp+16],@X[4]
452 and $Xfer,255,$Xfer
453 ldd [$inp+24],@X[6]
454 add $base,$Xfer,$Xfer
455 ldd [$inp+32],@X[8]
456 ldd [$inp+40],@X[10]
457 ldd [$inp+48],@X[12]
458 brz,pt $align,.Laligned
459 ldd [$inp+56],@X[14]
460
461 ldd [$inp+64],@X[16]
462 faligndata @X[0],@X[2],@X[0]
463 faligndata @X[2],@X[4],@X[2]
464 faligndata @X[4],@X[6],@X[4]
465 faligndata @X[6],@X[8],@X[6]
466 faligndata @X[8],@X[10],@X[8]
467 faligndata @X[10],@X[12],@X[10]
468 faligndata @X[12],@X[14],@X[12]
469 faligndata @X[14],@X[16],@X[14]
470
471.Laligned:
472 mov 5,$tmp0
473 dec 1,$len
474 alignaddr %g0,$tmp0,%g0
475 fpadd32 $VK_00_19,@X[0],%f16
476 fpadd32 $VK_00_19,@X[2],%f18
477 fpadd32 $VK_00_19,@X[4],%f20
478 fpadd32 $VK_00_19,@X[6],%f22
479 fpadd32 $VK_00_19,@X[8],%f24
480 fpadd32 $VK_00_19,@X[10],%f26
481 fpadd32 $VK_00_19,@X[12],%f28
482 fpadd32 $VK_00_19,@X[14],%f30
483 std %f16,[$Xfer+0]
484 mov $Actx,$A
485 std %f18,[$Xfer+8]
486 mov $Bctx,$B
487 std %f20,[$Xfer+16]
488 mov $Cctx,$C
489 std %f22,[$Xfer+24]
490 mov $Dctx,$D
491 std %f24,[$Xfer+32]
492 mov $Ectx,$E
493 std %f26,[$Xfer+40]
494 fxors @X[13],@X[0],@X[0]
495 std %f28,[$Xfer+48]
496 ba .Loop
497 std %f30,[$Xfer+56]
498.align 32
499.Loop:
500___
501for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
502for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
503for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
504for (;$i<70;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
505$code.=<<___;
506 tst $len
507 bz,pn `$bits==32?"%icc":"%xcc"`,.Ltail
508 nop
509___
510for (;$i<80;$i++) { &BODY_70_79($i,@V); unshift(@V,pop(@V)); }
511$code.=<<___;
512 add $A,$Actx,$Actx
513 add $B,$Bctx,$Bctx
514 add $C,$Cctx,$Cctx
515 add $D,$Dctx,$Dctx
516 add $E,$Ectx,$Ectx
517 mov 5,$tmp0
518 fxors @X[13],@X[0],@X[0]
519 mov $Actx,$A
520 mov $Bctx,$B
521 mov $Cctx,$C
522 mov $Dctx,$D
523 mov $Ectx,$E
524 alignaddr %g0,$tmp0,%g0
525 dec 1,$len
526 ba .Loop
527 mov $nXfer,$Xfer
528
529.align 32
530.Ltail:
531___
532for($i=70;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
533$code.=<<___;
534 add $A,$Actx,$Actx
535 add $B,$Bctx,$Bctx
536 add $C,$Cctx,$Cctx
537 add $D,$Dctx,$Dctx
538 add $E,$Ectx,$Ectx
539
540 st $Actx,[$ctx+0]
541 st $Bctx,[$ctx+4]
542 st $Cctx,[$ctx+8]
543 st $Dctx,[$ctx+12]
544 st $Ectx,[$ctx+16]
545
546 ret
547 restore
548.type sha1_block_data_order,#function
549.size sha1_block_data_order,(.-sha1_block_data_order)
550___
551
552# The purpose of these subroutines is to encode VIS instructions
553# explicitly, so that one can compile the module without specifying
554# VIS extensions on the compiler command line, e.g. -xarch=v9 vs.
555# -xarch=v9a. The idea is to preserve the option of producing a "universal"
556# binary, letting the programmer detect at run-time whether the CPU is VIS-capable.
557sub unvis {
558my ($mnemonic,$rs1,$rs2,$rd)=@_;
559my ($ref,$opf);
560my %visopf = ( "fmul8ulx16" => 0x037,
561 "faligndata" => 0x048,
562 "fpadd32" => 0x052,
563 "fxor" => 0x06c,
564 "fxors" => 0x06d );
565
566 $ref = "$mnemonic\t$rs1,$rs2,$rd";
567
568 if ($opf=$visopf{$mnemonic}) {
569 foreach ($rs1,$rs2,$rd) {
570 return $ref if (!/%f([0-9]{1,2})/);
571 $_=$1;
572 if ($1>=32) {
573 return $ref if ($1&1);
574 # re-encode for upper double register addressing
575 $_=($1|$1>>5)&31;
576 }
577 }
578
579 return sprintf ".word\t0x%08x !%s",
580 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
581 $ref;
582 } else {
583 return $ref;
584 }
585}
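# (A worked example, for illustration: given the %visopf table above,
#  unvis("fpadd32","%f16","%f18","%f20") returns
#  ".word\t0xa9b40a52 !fpadd32\t%f16,%f18,%f20".)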
586sub unalignaddr {
587my ($mnemonic,$rs1,$rs2,$rd)=@_;
588my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
589my $ref="$mnemonic\t$rs1,$rs2,$rd";
590
591 foreach ($rs1,$rs2,$rd) {
592 if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
593 else { return $ref; }
594 }
595 return sprintf ".word\t0x%08x !%s",
596 0x81b00300|$rd<<25|$rs1<<14|$rs2,
597 $ref;
598}
599
600$code =~ s/\`([^\`]*)\`/eval $1/gem;
601$code =~ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),(%f[0-9]{1,2}),(%f[0-9]{1,2})/
602 &unvis($1,$2,$3,$4)
603 /gem;
604$code =~ s/\b(alignaddr)\s+(%[goli][0-7]),(%[goli][0-7]),(%[goli][0-7])/
605 &unalignaddr($1,$2,$3,$4)
606 /gem;
607print $code;
608close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-thumb.pl b/src/lib/libcrypto/sha/asm/sha1-thumb.pl
deleted file mode 100644
index 553e9cedb5..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-thumb.pl
+++ /dev/null
@@ -1,259 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# sha1_block for Thumb.
11#
12# January 2007.
13#
14# The code is not of direct interest to OpenSSL because of its low
15# performance. Its purpose is to establish a _size_ benchmark. A pretty
16# useless one, I must say, because ARMv4 code that is 30% or 88 bytes
17# larger [available on demand] is almost _twice_ as fast. It should also
18# be noted that in-lining .Lcommon and .Lrotate improves performance
19# by over 40%, while code size increases by only 10% or 32 bytes. But
20# once again, the goal was to establish a _size_ benchmark, not performance.
21
22$output=shift;
23open STDOUT,">$output";
24
25$inline=0;
26#$cheat_on_binutils=1;
27
28$t0="r0";
29$t1="r1";
30$t2="r2";
31$a="r3";
32$b="r4";
33$c="r5";
34$d="r6";
35$e="r7";
36$K="r8"; # "upper" registers can be used in add/sub and mov insns
37$ctx="r9";
38$inp="r10";
39$len="r11";
40$Xi="r12";
41
42sub common {
43<<___;
44 sub $t0,#4
45 ldr $t1,[$t0]
46 add $e,$K @ E+=K_xx_xx
47 lsl $t2,$a,#5
48 add $t2,$e
49 lsr $e,$a,#27
50 add $t2,$e @ E+=ROR(A,27)
51 add $t2,$t1 @ E+=X[i]
52___
53}
54sub rotate {
55<<___;
56 mov $e,$d @ E=D
57 mov $d,$c @ D=C
58 lsl $c,$b,#30
59 lsr $b,$b,#2
60 orr $c,$b @ C=ROR(B,2)
61 mov $b,$a @ B=A
62 add $a,$t2,$t1 @ A=E+F_xx_xx(B,C,D)
63___
64}
65
66sub BODY_00_19 {
67$code.=$inline?&common():"\tbl .Lcommon\n";
68$code.=<<___;
69 mov $t1,$c
70 eor $t1,$d
71 and $t1,$b
72 eor $t1,$d @ F_00_19(B,C,D)
73___
74$code.=$inline?&rotate():"\tbl .Lrotate\n";
75}
76
77sub BODY_20_39 {
78$code.=$inline?&common():"\tbl .Lcommon\n";
79$code.=<<___;
80 mov $t1,$b
81 eor $t1,$c
82 eor $t1,$d @ F_20_39(B,C,D)
83___
84$code.=$inline?&rotate():"\tbl .Lrotate\n";
85}
86
87sub BODY_40_59 {
88$code.=$inline?&common():"\tbl .Lcommon\n";
89$code.=<<___;
90 mov $t1,$b
91 and $t1,$c
92 mov $e,$b
93 orr $e,$c
94 and $e,$d
95 orr $t1,$e @ F_40_59(B,C,D)
96___
97$code.=$inline?&rotate():"\tbl .Lrotate\n";
98}
99
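# Hedged Perl reference (illustrative only, hypothetical names) for the
# boolean functions the three BODY_* subs above compute, in the same
# formulations the generated Thumb code uses:
sub F_00_19_ref { my ($b,$c,$d)=@_; $d ^ ($b & ($c ^ $d)) }		# Ch
sub F_20_39_ref { my ($b,$c,$d)=@_; $b ^ $c ^ $d }			# Parity
sub F_40_59_ref { my ($b,$c,$d)=@_; ($b & $c) | (($b | $c) & $d) }	# Maj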
100$code=<<___;
101.text
102.code 16
103
104.global sha1_block_data_order
105.type sha1_block_data_order,%function
106
107.align 2
108sha1_block_data_order:
109___
110if ($cheat_on_binutils) {
111$code.=<<___;
112.code 32
113 add r3,pc,#1
114 bx r3 @ switch to Thumb ISA
115.code 16
116___
117}
118$code.=<<___;
119 push {r4-r7}
120 mov r3,r8
121 mov r4,r9
122 mov r5,r10
123 mov r6,r11
124 mov r7,r12
125 push {r3-r7,lr}
126 lsl r2,#6
127 mov $ctx,r0 @ save context
128 mov $inp,r1 @ save inp
129 mov $len,r2 @ save len
130 add $len,$inp @ $len to point at inp end
131
132.Lloop:
133 mov $Xi,sp
134 mov $t2,sp
135 sub $t2,#16*4 @ [3]
136.LXload:
137 ldrb $a,[$t1,#0] @ $t1 is r1 and holds inp
138 ldrb $b,[$t1,#1]
139 ldrb $c,[$t1,#2]
140 ldrb $d,[$t1,#3]
141 lsl $a,#24
142 lsl $b,#16
143 lsl $c,#8
144 orr $a,$b
145 orr $a,$c
146 orr $a,$d
147 add $t1,#4
148 push {$a}
149 cmp sp,$t2
150 bne .LXload @ [+14*16]
151
152 mov $inp,$t1 @ update $inp
153 sub $t2,#32*4
154 sub $t2,#32*4
155 mov $e,#31 @ [+4]
156.LXupdate:
157 ldr $a,[sp,#15*4]
158 ldr $b,[sp,#13*4]
159 ldr $c,[sp,#7*4]
160 ldr $d,[sp,#2*4]
161 eor $a,$b
162 eor $a,$c
163 eor $a,$d
164 ror $a,$e
165 push {$a}
166 cmp sp,$t2
167 bne .LXupdate @ [+(11+1)*64]
168
169 ldmia $t0!,{$a,$b,$c,$d,$e} @ $t0 is r0 and holds ctx
170 mov $t0,$Xi
171
172 ldr $t2,.LK_00_19
173 mov $t1,$t0
174 sub $t1,#20*4
175 mov $Xi,$t1
176 mov $K,$t2 @ [+7+4]
177.L_00_19:
178___
179 &BODY_00_19();
180$code.=<<___;
181 cmp $Xi,$t0
182 bne .L_00_19 @ [+(2+9+4+2+8+2)*20]
183
184 ldr $t2,.LK_20_39
185 mov $t1,$t0
186 sub $t1,#20*4
187 mov $Xi,$t1
188 mov $K,$t2 @ [+5]
189.L_20_39_or_60_79:
190___
191 &BODY_20_39();
192$code.=<<___;
193 cmp $Xi,$t0
194 bne .L_20_39_or_60_79 @ [+(2+9+3+2+8+2)*20*2]
195 cmp sp,$t0
196 beq .Ldone @ [+2]
197
198 ldr $t2,.LK_40_59
199 mov $t1,$t0
200 sub $t1,#20*4
201 mov $Xi,$t1
202 mov $K,$t2 @ [+5]
203.L_40_59:
204___
205 &BODY_40_59();
206$code.=<<___;
207 cmp $Xi,$t0
208 bne .L_40_59 @ [+(2+9+6+2+8+2)*20]
209
210 ldr $t2,.LK_60_79
211 mov $Xi,sp
212 mov $K,$t2
213 b .L_20_39_or_60_79 @ [+4]
214.Ldone:
215 mov $t0,$ctx
216 ldr $t1,[$t0,#0]
217 ldr $t2,[$t0,#4]
218 add $a,$t1
219 ldr $t1,[$t0,#8]
220 add $b,$t2
221 ldr $t2,[$t0,#12]
222 add $c,$t1
223 ldr $t1,[$t0,#16]
224 add $d,$t2
225 add $e,$t1
226 stmia $t0!,{$a,$b,$c,$d,$e} @ [+20]
227
228 add sp,#80*4 @ deallocate stack frame
229 mov $t0,$ctx @ restore ctx
230 mov $t1,$inp @ restore inp
231 cmp $t1,$len
232 beq .Lexit
233 b .Lloop @ [+6] total 3212 cycles
234.Lexit:
235 pop {r2-r7}
236 mov r8,r2
237 mov r9,r3
238 mov r10,r4
239 mov r11,r5
240 mov r12,r6
241 mov lr,r7
242 pop {r4-r7}
243 bx lr
244.align 2
245___
246$code.=".Lcommon:\n".&common()."\tmov pc,lr\n" if (!$inline);
247$code.=".Lrotate:\n".&rotate()."\tmov pc,lr\n" if (!$inline);
248$code.=<<___;
249.align 2
250.LK_00_19: .word 0x5a827999
251.LK_20_39: .word 0x6ed9eba1
252.LK_40_59: .word 0x8f1bbcdc
253.LK_60_79: .word 0xca62c1d6
254.size sha1_block_data_order,.-sha1_block_data_order
255.asciz "SHA1 block transform for Thumb, CRYPTOGAMS by <appro\@openssl.org>"
256___
257
258print $code;
259close STDOUT; # enforce flush