summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/bn/asm/pa-risc2.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/libcrypto/bn/asm/pa-risc2.s')
-rw-r--r--src/lib/libcrypto/bn/asm/pa-risc2.s2024
1 files changed, 1613 insertions, 411 deletions
diff --git a/src/lib/libcrypto/bn/asm/pa-risc2.s b/src/lib/libcrypto/bn/asm/pa-risc2.s
index c2725996a4..7239aa2c76 100644
--- a/src/lib/libcrypto/bn/asm/pa-risc2.s
+++ b/src/lib/libcrypto/bn/asm/pa-risc2.s
@@ -1,416 +1,1618 @@
1 .SPACE $PRIVATE$ 1;
2 .SUBSPA $DATA$,QUAD=1,ALIGN=8,ACCESS=31 2; PA-RISC 2.0 implementation of bn_asm code, based on the
3 .SUBSPA $BSS$,QUAD=1,ALIGN=8,ACCESS=31,ZERO,SORT=82 3; 64-bit version of the code. This code is effectively the
4 .SPACE $TEXT$ 4; same as the 64-bit version except the register model is
5 .SUBSPA $LIT$,QUAD=0,ALIGN=8,ACCESS=44 5; slightly different given all values must be 32-bit between
6 .SUBSPA $CODE$,QUAD=0,ALIGN=8,ACCESS=44,CODE_ONLY 6; function calls. Thus the 64-bit return values are returned
7 .IMPORT $global$,DATA 7; in %ret0 and %ret1 vs just %ret0 as is done in 64-bit
8 .IMPORT $$dyncall,MILLICODE 8;
9; gcc_compiled.: 9;
10 .SPACE $TEXT$ 10; This code is approximately 2x faster than the C version
11 .SUBSPA $CODE$ 11; for RSA/DSA.
12 12;
13 .align 4 13; See http://devresource.hp.com/ for more details on the PA-RISC
14 .EXPORT bn_mul_add_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR 14; architecture. Also see the book "PA-RISC 2.0 Architecture"
15; by Gerry Kane for information on the instruction set architecture.
16;
17; Code written by Chris Ruemmler (with some help from the HP C
18; compiler).
19;
20; The code compiles with HP's assembler
21;
22
23 .level 2.0N
24 .space $TEXT$
25 .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
26
27;
28; Global Register definitions used for the routines.
29;
30; Some information about HP's runtime architecture for 32-bits.
31;
32; "Caller save" means the calling function must save the register
33; if it wants the register to be preserved.
34; "Callee save" means if a function uses the register, it must save
35; the value before using it.
36;
37; For the floating point registers
38;
39; "caller save" registers: fr4-fr11, fr22-fr31
40; "callee save" registers: fr12-fr21
41; "special" registers: fr0-fr3 (status and exception registers)
42;
43; For the integer registers
44; value zero : r0
45; "caller save" registers: r1,r19-r26
46; "callee save" registers: r3-r18
47; return register : r2 (rp)
48; return values ; r28,r29 (ret0,ret1)
49; Stack pointer ; r30 (sp)
50; millicode return ptr ; r31 (also a caller save register)
51
52
53;
54; Arguments to the routines
55;
56r_ptr .reg %r26
57a_ptr .reg %r25
58b_ptr .reg %r24
59num .reg %r24
60n .reg %r23
61
62;
63; Note that the "w" argument for bn_mul_add_words and bn_mul_words
64; is passed on the stack at a delta of -56 from the top of stack
65; as the routine is entered.
66;
67
68;
69; Globals used in some routines
70;
71
72top_overflow .reg %r23
73high_mask .reg %r22 ; value 0xffffffff80000000L
74
75
76;------------------------------------------------------------------------------
77;
78; bn_mul_add_words
79;
80;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
81; int num, BN_ULONG w)
82;
83; arg0 = r_ptr
84; arg1 = a_ptr
85; arg2 = num
86; -56(sp) = w
87;
88; Local register definitions
89;
90
91fm1 .reg %fr22
92fm .reg %fr23
93ht_temp .reg %fr24
94ht_temp_1 .reg %fr25
95lt_temp .reg %fr26
96lt_temp_1 .reg %fr27
97fm1_1 .reg %fr28
98fm_1 .reg %fr29
99
100fw_h .reg %fr7L
101fw_l .reg %fr7R
102fw .reg %fr7
103
104fht_0 .reg %fr8L
105flt_0 .reg %fr8R
106t_float_0 .reg %fr8
107
108fht_1 .reg %fr9L
109flt_1 .reg %fr9R
110t_float_1 .reg %fr9
111
112tmp_0 .reg %r31
113tmp_1 .reg %r21
114m_0 .reg %r20
115m_1 .reg %r19
116ht_0 .reg %r1
117ht_1 .reg %r3
118lt_0 .reg %r4
119lt_1 .reg %r5
120m1_0 .reg %r6
121m1_1 .reg %r7
122rp_val .reg %r8
123rp_val_1 .reg %r9
124
15bn_mul_add_words 125bn_mul_add_words
16 .PROC 126 .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
17 .CALLINFO FRAME=64,CALLS,SAVE_RP,ENTRY_GR=4 127 .proc
18 .ENTRY 128 .callinfo frame=128
19 stw %r2,-20(0,%r30) 129 .entry
20 stwm %r4,64(0,%r30) 130 .align 64
21 copy %r24,%r31 131
22 stw %r3,-60(0,%r30) 132 STD %r3,0(%sp) ; save r3
23 ldi 0,%r20 133 STD %r4,8(%sp) ; save r4
24 ldo 12(%r26),%r2 134 NOP ; Needed to make the loop 16-byte aligned
25 stw %r23,-16(0,%r30) 135 NOP ; needed to make the loop 16-byte aligned
26 copy %r25,%r3 136
27 ldo 12(%r3),%r1 137 STD %r5,16(%sp) ; save r5
28 fldws -16(0,%r30),%fr8L 138 NOP
29L$0010 139 STD %r6,24(%sp) ; save r6
30 copy %r20,%r25 140 STD %r7,32(%sp) ; save r7
31 ldi 0,%r24 141
32 fldws 0(0,%r3),%fr9L 142 STD %r8,40(%sp) ; save r8
33 ldw 0(0,%r26),%r19 143 STD %r9,48(%sp) ; save r9
34 xmpyu %fr8L,%fr9L,%fr9 144 COPY %r0,%ret1 ; return 0 by default
35 fstds %fr9,-16(0,%r30) 145 DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
36 copy %r19,%r23 146
37 ldw -16(0,%r30),%r28 147 CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit
38 ldw -12(0,%r30),%r29 148 LDO 128(%sp),%sp ; bump stack
39 ldi 0,%r22 149
40 add %r23,%r29,%r29 150 ;
41 addc %r22,%r28,%r28 151 ; The loop is unrolled twice, so if there is only 1 number
42 add %r25,%r29,%r29 152 ; then go straight to the cleanup code.
43 addc %r24,%r28,%r28 153 ;
44 copy %r28,%r21 154 CMPIB,= 1,num,bn_mul_add_words_single_top
45 ldi 0,%r20 155 FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l)
46 copy %r21,%r20 156
47 addib,= -1,%r31,L$0011 157 ;
48 stw %r29,0(0,%r26) 158 ; This loop is unrolled 2 times (64-byte aligned as well)
49 copy %r20,%r25 159 ;
50 ldi 0,%r24 160 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
51 fldws -8(0,%r1),%fr9L 161 ; two 32-bit multiplies can be issued per cycle.
52 ldw -8(0,%r2),%r19 162 ;
53 xmpyu %fr8L,%fr9L,%fr9 163bn_mul_add_words_unroll2
54 fstds %fr9,-16(0,%r30) 164
55 copy %r19,%r23 165 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
56 ldw -16(0,%r30),%r28 166 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr9L) ht(L)/lt(R)
57 ldw -12(0,%r30),%r29 167 LDD 0(r_ptr),rp_val ; rp[0]
58 ldi 0,%r22 168 LDD 8(r_ptr),rp_val_1 ; rp[1]
59 add %r23,%r29,%r29 169
60 addc %r22,%r28,%r28 170 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
61 add %r25,%r29,%r29 171 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
62 addc %r24,%r28,%r28 172 FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
63 copy %r28,%r21 173 FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
64 ldi 0,%r20 174
65 copy %r21,%r20 175 XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
66 addib,= -1,%r31,L$0011 176 XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
67 stw %r29,-8(0,%r2) 177 FSTD fm,-8(%sp) ; -8(sp) = m[0]
68 copy %r20,%r25 178 FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
69 ldi 0,%r24 179
70 fldws -4(0,%r1),%fr9L 180 XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
71 ldw -4(0,%r2),%r19 181 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
72 xmpyu %fr8L,%fr9L,%fr9 182 FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp
73 fstds %fr9,-16(0,%r30) 183 FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1
74 copy %r19,%r23 184
75 ldw -16(0,%r30),%r28 185 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
76 ldw -12(0,%r30),%r29 186 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp_1 = flt_1*fw_l
77 ldi 0,%r22 187 FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp
78 add %r23,%r29,%r29 188 FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1
79 addc %r22,%r28,%r28 189
80 add %r25,%r29,%r29 190 LDD -8(%sp),m_0 ; m[0]
81 addc %r24,%r28,%r28 191 LDD -40(%sp),m_1 ; m[1]
82 copy %r28,%r21 192 LDD -16(%sp),m1_0 ; m1[0]
83 ldi 0,%r20 193 LDD -48(%sp),m1_1 ; m1[1]
84 copy %r21,%r20 194
85 addib,= -1,%r31,L$0011 195 LDD -24(%sp),ht_0 ; ht[0]
86 stw %r29,-4(0,%r2) 196 LDD -56(%sp),ht_1 ; ht[1]
87 copy %r20,%r25 197 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
88 ldi 0,%r24 198 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
89 fldws 0(0,%r1),%fr9L 199
90 ldw 0(0,%r2),%r19 200 LDD -32(%sp),lt_0
91 xmpyu %fr8L,%fr9L,%fr9 201 LDD -64(%sp),lt_1
92 fstds %fr9,-16(0,%r30) 202 CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0])
93 copy %r19,%r23 203 ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
94 ldw -16(0,%r30),%r28 204
95 ldw -12(0,%r30),%r29 205 CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1])
96 ldi 0,%r22 206 ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
97 add %r23,%r29,%r29 207 EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32
98 addc %r22,%r28,%r28 208 DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32
99 add %r25,%r29,%r29 209
100 addc %r24,%r28,%r28 210 EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32
101 copy %r28,%r21 211 DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32
102 ldi 0,%r20 212 ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
103 copy %r21,%r20 213 ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
104 addib,= -1,%r31,L$0011 214
105 stw %r29,0(0,%r2) 215 ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
106 ldo 16(%r1),%r1 216 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
107 ldo 16(%r3),%r3 217 ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
108 ldo 16(%r2),%r2 218 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
109 bl L$0010,0 219
110 ldo 16(%r26),%r26 220 ADD %ret1,lt_0,lt_0 ; lt[0] = lt[0] + c;
111L$0011 221 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
112 copy %r20,%r28 222 ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0]
113 ldw -84(0,%r30),%r2 223 ADD,DC ht_0,%r0,ht_0 ; ht[0]++
114 ldw -60(0,%r30),%r3 224
115 bv 0(%r2) 225 LDO -2(num),num ; num = num - 2;
116 ldwm -64(0,%r30),%r4 226 ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c);
117 .EXIT 227 ADD,DC ht_1,%r0,ht_1 ; ht[1]++
118 .PROCEND 228 STD lt_0,0(r_ptr) ; rp[0] = lt[0]
119 .align 4 229
120 .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR 230 ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1]
231 ADD,DC ht_1,%r0,%ret1 ; ht[1]++
232 LDO 16(a_ptr),a_ptr ; a_ptr += 2
233
234 STD lt_1,8(r_ptr) ; rp[1] = lt[1]
235 CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
236 LDO 16(r_ptr),r_ptr ; r_ptr += 2
237
238 CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
239
240 ;
241 ; Top of loop aligned on 64-byte boundary
242 ;
243bn_mul_add_words_single_top
244 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
245 LDD 0(r_ptr),rp_val ; rp[0]
246 LDO 8(a_ptr),a_ptr ; a_ptr++
247 XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
248 FSTD fm1,-16(%sp) ; -16(sp) = m1
249 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
250 FSTD fm,-8(%sp) ; -8(sp) = m
251 XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
252 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
253 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
254 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
255
256 LDD -8(%sp),m_0
257 LDD -16(%sp),m1_0 ; m1 = temp1
258 ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
259 LDD -24(%sp),ht_0
260 LDD -32(%sp),lt_0
261
262 CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
263 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
264
265 EXTRD,U tmp_0,31,32,m_0 ; m>>32
266 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
267
268 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
269 ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1;
270 ADD,DC ht_0,%r0,ht_0 ; ht++
271 ADD %ret1,tmp_0,lt_0 ; lt = lt + c;
272 ADD,DC ht_0,%r0,ht_0 ; ht++
273 ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0]
274 ADD,DC ht_0,%r0,%ret1 ; ht++
275 STD lt_0,0(r_ptr) ; rp[0] = lt
276
277bn_mul_add_words_exit
278 .EXIT
279
280 EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
281 LDD -80(%sp),%r9 ; restore r9
282 LDD -88(%sp),%r8 ; restore r8
283 LDD -96(%sp),%r7 ; restore r7
284 LDD -104(%sp),%r6 ; restore r6
285 LDD -112(%sp),%r5 ; restore r5
286 LDD -120(%sp),%r4 ; restore r4
287 BVE (%rp)
288 LDD,MB -128(%sp),%r3 ; restore r3
289 .PROCEND ;in=23,24,25,26,29;out=28;
290
291;----------------------------------------------------------------------------
292;
293;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
294;
295; arg0 = rp
296; arg1 = ap
297; arg2 = num
298; w on stack at -56(sp)
299
121bn_mul_words 300bn_mul_words
122 .PROC 301 .proc
123 .CALLINFO FRAME=64,CALLS,SAVE_RP,ENTRY_GR=3 302 .callinfo frame=128
124 .ENTRY 303 .entry
125 stw %r2,-20(0,%r30) 304 .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
126 copy %r25,%r2 305 .align 64
127 stwm %r4,64(0,%r30) 306
128 copy %r24,%r19 307 STD %r3,0(%sp) ; save r3
129 ldi 0,%r28 308 STD %r4,8(%sp) ; save r4
130 stw %r23,-16(0,%r30) 309 NOP
131 ldo 12(%r26),%r31 310 STD %r5,16(%sp) ; save r5
132 ldo 12(%r2),%r29 311
133 fldws -16(0,%r30),%fr8L 312 STD %r6,24(%sp) ; save r6
134L$0026 313 STD %r7,32(%sp) ; save r7
135 fldws 0(0,%r2),%fr9L 314 COPY %r0,%ret1 ; return 0 by default
136 xmpyu %fr8L,%fr9L,%fr9 315 DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
137 fstds %fr9,-16(0,%r30) 316
138 copy %r28,%r21 317 CMPIB,>= 0,num,bn_mul_words_exit
139 ldi 0,%r20 318 LDO 128(%sp),%sp ; bump stack
140 ldw -16(0,%r30),%r24 319
141 ldw -12(0,%r30),%r25 320 ;
142 add %r21,%r25,%r25 321 ; See if only 1 word to do, thus just do cleanup
143 addc %r20,%r24,%r24 322 ;
144 copy %r24,%r23 323 CMPIB,= 1,num,bn_mul_words_single_top
145 ldi 0,%r22 324 FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l)
146 copy %r23,%r28 325
147 addib,= -1,%r19,L$0027 326 ;
148 stw %r25,0(0,%r26) 327 ; This loop is unrolled 2 times (64-byte aligned as well)
149 fldws -8(0,%r29),%fr9L 328 ;
150 xmpyu %fr8L,%fr9L,%fr9 329 ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
151 fstds %fr9,-16(0,%r30) 330 ; two 32-bit multiplies can be issued per cycle.
152 copy %r28,%r21 331 ;
153 ldi 0,%r20 332bn_mul_words_unroll2
154 ldw -16(0,%r30),%r24 333
155 ldw -12(0,%r30),%r25 334 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
156 add %r21,%r25,%r25 335 FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
157 addc %r20,%r24,%r24 336 XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
158 copy %r24,%r23 337 XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l
159 ldi 0,%r22 338
160 copy %r23,%r28 339 FSTD fm1,-16(%sp) ; -16(sp) = m1
161 addib,= -1,%r19,L$0027 340 FSTD fm1_1,-48(%sp) ; -48(sp) = m1
162 stw %r25,-8(0,%r31) 341 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
163 fldws -4(0,%r29),%fr9L 342 XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h
164 xmpyu %fr8L,%fr9L,%fr9 343
165 fstds %fr9,-16(0,%r30) 344 FSTD fm,-8(%sp) ; -8(sp) = m
166 copy %r28,%r21 345 FSTD fm_1,-40(%sp) ; -40(sp) = m
167 ldi 0,%r20 346 XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
168 ldw -16(0,%r30),%r24 347 XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h
169 ldw -12(0,%r30),%r25 348
170 add %r21,%r25,%r25 349 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
171 addc %r20,%r24,%r24 350 FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht
172 copy %r24,%r23 351 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
173 ldi 0,%r22 352 XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
174 copy %r23,%r28 353
175 addib,= -1,%r19,L$0027 354 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
176 stw %r25,-4(0,%r31) 355 FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt
177 fldws 0(0,%r29),%fr9L 356 LDD -8(%sp),m_0
178 xmpyu %fr8L,%fr9L,%fr9 357 LDD -40(%sp),m_1
179 fstds %fr9,-16(0,%r30) 358
180 copy %r28,%r21 359 LDD -16(%sp),m1_0
181 ldi 0,%r20 360 LDD -48(%sp),m1_1
182 ldw -16(0,%r30),%r24 361 LDD -24(%sp),ht_0
183 ldw -12(0,%r30),%r25 362 LDD -56(%sp),ht_1
184 add %r21,%r25,%r25 363
185 addc %r20,%r24,%r24 364 ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1;
186 copy %r24,%r23 365 ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1;
187 ldi 0,%r22 366 LDD -32(%sp),lt_0
188 copy %r23,%r28 367 LDD -64(%sp),lt_1
189 addib,= -1,%r19,L$0027 368
190 stw %r25,0(0,%r31) 369 CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1)
191 ldo 16(%r29),%r29 370 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
192 ldo 16(%r2),%r2 371 CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1)
193 ldo 16(%r31),%r31 372 ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32)
194 bl L$0026,0 373
195 ldo 16(%r26),%r26 374 EXTRD,U tmp_0,31,32,m_0 ; m>>32
196L$0027 375 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
197 ldw -84(0,%r30),%r2 376 EXTRD,U tmp_1,31,32,m_1 ; m>>32
198 bv 0(%r2) 377 DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32
199 ldwm -64(0,%r30),%r4 378
200 .EXIT 379 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
201 .PROCEND 380 ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32)
202 .align 4 381 ADD lt_0,m1_0,lt_0 ; lt = lt+m1;
203 .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR 382 ADD,DC ht_0,%r0,ht_0 ; ht++
383
384 ADD lt_1,m1_1,lt_1 ; lt = lt+m1;
385 ADD,DC ht_1,%r0,ht_1 ; ht++
386 ADD %ret1,lt_0,lt_0 ; lt = lt + c (ret1);
387 ADD,DC ht_0,%r0,ht_0 ; ht++
388
389 ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0)
390 ADD,DC ht_1,%r0,ht_1 ; ht++
391 STD lt_0,0(r_ptr) ; rp[0] = lt
392 STD lt_1,8(r_ptr) ; rp[1] = lt
393
394 COPY ht_1,%ret1 ; carry = ht
395 LDO -2(num),num ; num = num - 2;
396 LDO 16(a_ptr),a_ptr ; ap += 2
397 CMPIB,<= 2,num,bn_mul_words_unroll2
398 LDO 16(r_ptr),r_ptr ; rp++
399
400 CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
401
402 ;
403 ; Top of loop aligned on 64-byte boundary
404 ;
405bn_mul_words_single_top
406 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
407
408 XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
409 FSTD fm1,-16(%sp) ; -16(sp) = m1
410 XMPYU flt_0,fw_h,fm ; m = lt*fw_h
411 FSTD fm,-8(%sp) ; -8(sp) = m
412 XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
413 FSTD ht_temp,-24(%sp) ; -24(sp) = ht
414 XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
415 FSTD lt_temp,-32(%sp) ; -32(sp) = lt
416
417 LDD -8(%sp),m_0
418 LDD -16(%sp),m1_0
419 ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
420 LDD -24(%sp),ht_0
421 LDD -32(%sp),lt_0
422
423 CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
424 ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
425
426 EXTRD,U tmp_0,31,32,m_0 ; m>>32
427 DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
428
429 ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
430 ADD lt_0,m1_0,lt_0 ; lt= lt+m1;
431 ADD,DC ht_0,%r0,ht_0 ; ht++
432
433 ADD %ret1,lt_0,lt_0 ; lt = lt + c;
434 ADD,DC ht_0,%r0,ht_0 ; ht++
435
436 COPY ht_0,%ret1 ; copy carry
437 STD lt_0,0(r_ptr) ; rp[0] = lt
438
439bn_mul_words_exit
440 .EXIT
441 EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
442 LDD -96(%sp),%r7 ; restore r7
443 LDD -104(%sp),%r6 ; restore r6
444 LDD -112(%sp),%r5 ; restore r5
445 LDD -120(%sp),%r4 ; restore r4
446 BVE (%rp)
447 LDD,MB -128(%sp),%r3 ; restore r3
448 .PROCEND
449
450;----------------------------------------------------------------------------
451;
452;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
453;
454; arg0 = rp
455; arg1 = ap
456; arg2 = num
457;
458
204bn_sqr_words 459bn_sqr_words
;
; Squares each of the num 64-bit words at a_ptr and stores every 128-bit
; result as two consecutive 64-bit words at r_ptr (low word first, then
; high word).
;
; Each input word is split into 32-bit halves ht (fr8L/fr9L) and lt
; (fr8R/fr9R).  With a = ht*2^32 + lt:
;     a*a = ht*ht*2^64 + 2*(ht*lt)*2^32 + lt*lt
; so the cross product m = ht*lt contributes (m << 33) to the low word and
; (m >> 31) to the high word, plus the carry out of the low-word add.
;
; r3-r5 are saved in the 128-byte frame and restored on exit.  The FP
; products are moved to the integer registers through scratch slots at
; -8..-64(%sp) inside that frame.
;
460 .proc
461 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
462 .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
463 .entry
464 .align 64
465
466 STD %r3,0(%sp) ; save r3
467 STD %r4,8(%sp) ; save r4
468 NOP
469 STD %r5,16(%sp) ; save r5
470
471 CMPIB,>= 0,num,bn_sqr_words_exit ; if (num <= 0) nothing to do
472 LDO 128(%sp),%sp ; bump stack (delay slot: done on both paths, exit undoes it)
473
474 ;
475 ; If only 1, then go straight to cleanup
476 ;
477 CMPIB,= 1,num,bn_sqr_words_single_top
478 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L (delay slot: set on both paths)
479
480 ;
481 ; This loop is unrolled 2 times (64-byte aligned as well)
482 ;
483
484bn_sqr_words_unroll2
485 FLDD 0(a_ptr),t_float_0 ; a[0]
486 FLDD 8(a_ptr),t_float_1 ; a[1]
487 XMPYU fht_0,flt_0,fm ; m[0] = ht[0]*lt[0] (cross product)
488 XMPYU fht_1,flt_1,fm_1 ; m[1] = ht[1]*lt[1] (cross product)
489
490 FSTD fm,-24(%sp) ; store m[0]
491 FSTD fm_1,-56(%sp) ; store m[1]
492 XMPYU flt_0,flt_0,lt_temp ; lt[0] = lt*lt of a[0]
493 XMPYU flt_1,flt_1,lt_temp_1 ; lt[1] = lt*lt of a[1]
494
495 FSTD lt_temp,-16(%sp) ; store lt[0]
496 FSTD lt_temp_1,-48(%sp) ; store lt[1]
497 XMPYU fht_0,fht_0,ht_temp ; ht[0] = ht*ht of a[0]
498 XMPYU fht_1,fht_1,ht_temp_1 ; ht[1] = ht*ht of a[1]
499
500 FSTD ht_temp,-8(%sp) ; store ht[0]
501 FSTD ht_temp_1,-40(%sp) ; store ht[1]
502 LDD -24(%sp),m_0 ; reload m[0] into an integer register
503 LDD -56(%sp),m_1 ; reload m[1] into an integer register
504
505 AND m_0,high_mask,tmp_0 ; m[0] & Mask (keep the upper 33 bits)
506 AND m_1,high_mask,tmp_1 ; m[1] & Mask (keep the upper 33 bits)
507 DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1 (low-word part of 2*m*2^32)
508 DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1 (low-word part of 2*m*2^32)
509
510 LDD -16(%sp),lt_0 ; reload lt[0]
511 LDD -48(%sp),lt_1 ; reload lt[1]
512 EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1 (high-word part)
513 EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1 (high-word part)
514
515 LDD -8(%sp),ht_0 ; reload ht[0]
516 LDD -40(%sp),ht_1 ; reload ht[1]
517 ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0
518 ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1
519
520 ADD lt_0,m_0,lt_0 ; lt[0] = lt[0]+m[0]
521 ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry from the add above
522 STD lt_0,0(r_ptr) ; rp[0] = lt[0]
523 STD ht_0,8(r_ptr) ; rp[1] = ht[0]
524
525 ADD lt_1,m_1,lt_1 ; lt[1] = lt[1]+m[1]
526 ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry from the add above
527 STD lt_1,16(r_ptr) ; rp[2] = lt[1]
528 STD ht_1,24(r_ptr) ; rp[3] = ht[1]
529
530 LDO -2(num),num ; num = num - 2;
531 LDO 16(a_ptr),a_ptr ; ap += 2
532 CMPIB,<= 2,num,bn_sqr_words_unroll2 ; loop while at least 2 words remain
533 LDO 32(r_ptr),r_ptr ; rp += 4 (delay slot)
534
535 CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
536
537 ;
538 ; Handles the single remaining word (num == 1, or an odd word left
539 ; over after the unrolled loop).  Aligned on 64-byte boundary.
540 ;
541bn_sqr_words_single_top
542 FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
543
544 XMPYU fht_0,flt_0,fm ; m = ht*lt
545 FSTD fm,-24(%sp) ; store m
546
547 XMPYU flt_0,flt_0,lt_temp ; lt = lt*lt
548 FSTD lt_temp,-16(%sp) ; store lt
549
550 XMPYU fht_0,fht_0,ht_temp ; ht = ht*ht
551 FSTD ht_temp,-8(%sp) ; store ht
552
553 LDD -24(%sp),m_0 ; load m
554 AND m_0,high_mask,tmp_0 ; m & Mask
555 DEPD,Z m_0,30,31,m_0 ; m << 32+1
556 LDD -16(%sp),lt_0 ; lt
557
558 LDD -8(%sp),ht_0 ; ht
559 EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1
560 ADD m_0,lt_0,lt_0 ; lt = lt+m
561 ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0 (ADD,L, so the carry from lt+m is still pending)
562 ADD,DC ht_0,%r0,ht_0 ; ht += carry from lt+m
563
564 STD lt_0,0(r_ptr) ; rp[0] = lt
565 STD ht_0,8(r_ptr) ; rp[1] = ht
566
567bn_sqr_words_exit
568 .EXIT
569 LDD -112(%sp),%r5 ; restore r5
570 LDD -120(%sp),%r4 ; restore r4
571 BVE (%rp) ; return to caller
572 LDD,MB -128(%sp),%r3 ; restore r3 and pop the 128-byte frame (delay slot)
573 .PROCEND ;in=23,24,25,26,29;out=28;
573
574
575;----------------------------------------------------------------------------
576;
577;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
578;
579; arg0 = rp
580; arg1 = ap
581; arg2 = bp
582; arg3 = n
583
584t .reg %r22
585b .reg %r21
586l .reg %r20
587
588bn_add_words
;
; Adds n 64-bit words: r[i] = a[i] + b[i] + c, where c is the carry
; carried from word to word in %ret1 (0 or 1, starting at 0).
; The final carry is returned; for the 32-bit calling convention its
; upper half is also copied into %ret0 at exit.
; If n <= 0, nothing is stored and 0 is returned.
; Uses only caller-save registers, so there is no frame and nothing is
; saved.
;
589 .proc
590 .entry
591 .callinfo
592 .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
593 .align 64
594
595 CMPIB,>= 0,n,bn_add_words_exit ; if (n <= 0) then exit
596 COPY %r0,%ret1 ; return 0 by default (delay slot: c = 0 on both paths)
597
598 ;
599 ; If 2 or more numbers do the loop
600 ;
601 CMPIB,= 1,n,bn_add_words_single_top ; exactly one word: skip the unrolled loop
602 NOP
603
604 ;
605 ; This loop is unrolled 2 times (64-byte aligned as well)
606 ;
607bn_add_words_unroll2
608 LDD 0(a_ptr),t ; t = a[0]
609 LDD 0(b_ptr),b ; b = b[0]
610 ADD t,%ret1,t ; t = t+c;
611 ADD,DC %r0,%r0,%ret1 ; set c to carry
612 ADD t,b,l ; l = t + b[0]
613 ADD,DC %ret1,%r0,%ret1 ; c+= carry
614 STD l,0(r_ptr) ; r[0] = l
615
616 LDD 8(a_ptr),t ; t = a[1]
617 LDD 8(b_ptr),b ; b = b[1]
618 ADD t,%ret1,t ; t = t+c;
619 ADD,DC %r0,%r0,%ret1 ; set c to carry
620 ADD t,b,l ; l = t + b[1]
621 ADD,DC %ret1,%r0,%ret1 ; c+= carry
622 STD l,8(r_ptr) ; r[1] = l
623
624 LDO -2(n),n ; n -= 2
625 LDO 16(a_ptr),a_ptr ; a_ptr += 2 words
626 LDO 16(b_ptr),b_ptr ; b_ptr += 2 words
627
628 CMPIB,<= 2,n,bn_add_words_unroll2 ; loop while at least 2 words remain
629 LDO 16(r_ptr),r_ptr ; r_ptr += 2 words (delay slot)
630
631 CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
632
; Single remaining word (n == 1, or an odd word left after the loop).
633bn_add_words_single_top
634 LDD 0(a_ptr),t ; t = a[0]
635 LDD 0(b_ptr),b ; b = b[0]
636
637 ADD t,%ret1,t ; t = t+c;
638 ADD,DC %r0,%r0,%ret1 ; set c to carry (could use CMPCLR??)
639 ADD t,b,l ; l = t + b[0]
640 ADD,DC %ret1,%r0,%ret1 ; c+= carry
641 STD l,0(r_ptr) ; r[0] = l
642
643bn_add_words_exit
644 .EXIT
645 BVE (%rp) ; return to caller
646 EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 (delay slot)
647 .PROCEND ;in=23,24,25,26,29;out=28;
648
649;----------------------------------------------------------------------------
650;
651;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
652;
653; arg0 = rp
654; arg1 = ap
655; arg2 = bp
656; arg3 = n
657
658t1 .reg %r22
659t2 .reg %r21
660sub_tmp1 .reg %r20
661sub_tmp2 .reg %r19
662
663
664bn_sub_words
;
; Subtracts n 64-bit words: r[i] = a[i] - b[i] - c, where c is the borrow
; carried from word to word in %ret1 (starting at 0).
; Borrow rule applied after each word (t1 = a[i], t2 = b[i]):
;     t1 >  t2 : c = 0
;     t1 <  t2 : c = 1
;     t1 == t2 : c is left unchanged
; The final borrow is returned; for the 32-bit calling convention its
; upper half is also copied into %ret0 at exit.
; If n <= 0, nothing is stored and 0 is returned.
;
665 .proc
666 .callinfo
667 .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
668 .entry
669 .align 64
670
671 CMPIB,>= 0,n,bn_sub_words_exit ; if (n <= 0) then exit
672 COPY %r0,%ret1 ; return 0 by default (delay slot: c = 0 on both paths)
673
674 ;
675 ; If 2 or more numbers do the loop
676 ;
677 CMPIB,= 1,n,bn_sub_words_single_top ; exactly one word: skip the unrolled loop
678 NOP
679
680 ;
681 ; This loop is unrolled 2 times (64-byte aligned as well)
682 ;
683bn_sub_words_unroll2
684 LDD 0(a_ptr),t1 ; t1 = a[0]
685 LDD 0(b_ptr),t2 ; t2 = b[0]
686 SUB t1,t2,sub_tmp1 ; sub_tmp1 = t1-t2;
687 SUB sub_tmp1,%ret1,sub_tmp1 ; sub_tmp1 = sub_tmp1 - c;
688
689 CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip next instruction if t1 > t2
690 LDO 1(%r0),sub_tmp2 ; otherwise (t1 <= t2) sub_tmp2 = 1
691
692 CMPCLR,*= t1,t2,%r0 ; if t1 == t2 skip the COPY, keeping the old borrow
693 COPY sub_tmp2,%ret1 ; c = sub_tmp2
694 STD sub_tmp1,0(r_ptr) ; r[0] = sub_tmp1
695
696 LDD 8(a_ptr),t1 ; t1 = a[1]
697 LDD 8(b_ptr),t2 ; t2 = b[1]
698 SUB t1,t2,sub_tmp1 ; sub_tmp1 = t1-t2;
699 SUB sub_tmp1,%ret1,sub_tmp1 ; sub_tmp1 = sub_tmp1 - c;
700 CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip next instruction if t1 > t2
701 LDO 1(%r0),sub_tmp2 ; otherwise (t1 <= t2) sub_tmp2 = 1
702
703 CMPCLR,*= t1,t2,%r0 ; if t1 == t2 skip the COPY, keeping the old borrow
704 COPY sub_tmp2,%ret1 ; c = sub_tmp2
705 STD sub_tmp1,8(r_ptr) ; r[1] = sub_tmp1
706
707 LDO -2(n),n ; n -= 2
708 LDO 16(a_ptr),a_ptr ; a_ptr += 2 words
709 LDO 16(b_ptr),b_ptr ; b_ptr += 2 words
710
711 CMPIB,<= 2,n,bn_sub_words_unroll2 ; loop while at least 2 words remain
712 LDO 16(r_ptr),r_ptr ; r_ptr += 2 words (delay slot)
713
714 CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
715
; Single remaining word (n == 1, or an odd word left after the loop).
716bn_sub_words_single_top
717 LDD 0(a_ptr),t1 ; t1 = a[0]
718 LDD 0(b_ptr),t2 ; t2 = b[0]
719 SUB t1,t2,sub_tmp1 ; sub_tmp1 = t1-t2;
720 SUB sub_tmp1,%ret1,sub_tmp1 ; sub_tmp1 = sub_tmp1 - c;
721 CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0; skip next instruction if t1 > t2
722 LDO 1(%r0),sub_tmp2 ; otherwise (t1 <= t2) sub_tmp2 = 1
723
724 CMPCLR,*= t1,t2,%r0 ; if t1 == t2 skip the COPY, keeping the old borrow
725 COPY sub_tmp2,%ret1 ; c = sub_tmp2
726
727 STD sub_tmp1,0(r_ptr) ; r[0] = sub_tmp1
728
729bn_sub_words_exit
730 .EXIT
731 BVE (%rp) ; return to caller
732 EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 (delay slot)
733 .PROCEND ;in=23,24,25,26,29;out=28;
734
735;------------------------------------------------------------------------------
736;
737; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
738;
739; arg0 = h
740; arg1 = l
741; arg2 = d
742;
743; This is mainly just output from the HP C compiler.
744;
745;------------------------------------------------------------------------------
746bn_div_words
205 .PROC 747 .PROC
206 .CALLINFO FRAME=0,NO_CALLS 748 .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN
207 .ENTRY 749 .IMPORT BN_num_bits_word,CODE
208 ldo 28(%r26),%r19 750 .IMPORT __iob,DATA
209 ldo 12(%r25),%r28 751 .IMPORT fprintf,CODE
210L$0042 752 .IMPORT abort,CODE
211 fldws 0(0,%r25),%fr8L 753 .IMPORT $$div2U,MILLICODE
212 fldws 0(0,%r25),%fr8R 754 .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
213 xmpyu %fr8L,%fr8R,%fr8 755 .ENTRY
214 fstds %fr8,-16(0,%r30) 756 STW %r2,-20(%r30) ;offset 0x8ec
215 ldw -16(0,%r30),%r22 757 STW,MA %r3,192(%r30) ;offset 0x8f0
216 ldw -12(0,%r30),%r23 758 STW %r4,-188(%r30) ;offset 0x8f4
217 stw %r23,0(0,%r26) 759 DEPD %r5,31,32,%r6 ;offset 0x8f8
218 copy %r22,%r21 760 STD %r6,-184(%r30) ;offset 0x8fc
219 ldi 0,%r20 761 DEPD %r7,31,32,%r8 ;offset 0x900
220 addib,= -1,%r24,L$0049 762 STD %r8,-176(%r30) ;offset 0x904
221 stw %r21,-24(0,%r19) 763 STW %r9,-168(%r30) ;offset 0x908
222 fldws -8(0,%r28),%fr8L 764 LDD -248(%r30),%r3 ;offset 0x90c
223 fldws -8(0,%r28),%fr8R 765 COPY %r26,%r4 ;offset 0x910
224 xmpyu %fr8L,%fr8R,%fr8 766 COPY %r24,%r5 ;offset 0x914
225 fstds %fr8,-16(0,%r30) 767 DEPD %r25,31,32,%r4 ;offset 0x918
226 ldw -16(0,%r30),%r22 768 CMPB,*<> %r3,%r0,$0006000C ;offset 0x91c
227 ldw -12(0,%r30),%r23 769 DEPD %r23,31,32,%r5 ;offset 0x920
228 stw %r23,-20(0,%r19) 770 MOVIB,TR -1,%r29,$00060002 ;offset 0x924
229 copy %r22,%r21 771 EXTRD,U %r29,31,32,%r28 ;offset 0x928
230 ldi 0,%r20 772$0006002A
231 addib,= -1,%r24,L$0049 773 LDO -1(%r29),%r29 ;offset 0x92c
232 stw %r21,-16(0,%r19) 774 SUB %r23,%r7,%r23 ;offset 0x930
233 fldws -4(0,%r28),%fr8L 775$00060024
234 fldws -4(0,%r28),%fr8R 776 SUB %r4,%r31,%r25 ;offset 0x934
235 xmpyu %fr8L,%fr8R,%fr8 777 AND %r25,%r19,%r26 ;offset 0x938
236 fstds %fr8,-16(0,%r30) 778 CMPB,*<>,N %r0,%r26,$00060046 ;offset 0x93c
237 ldw -16(0,%r30),%r22 779 DEPD,Z %r25,31,32,%r20 ;offset 0x940
238 ldw -12(0,%r30),%r23 780 OR %r20,%r24,%r21 ;offset 0x944
239 stw %r23,-12(0,%r19) 781 CMPB,*<<,N %r21,%r23,$0006002A ;offset 0x948
240 copy %r22,%r21 782 SUB %r31,%r2,%r31 ;offset 0x94c
241 ldi 0,%r20 783$00060046
242 addib,= -1,%r24,L$0049 784$0006002E
243 stw %r21,-8(0,%r19) 785 DEPD,Z %r23,31,32,%r25 ;offset 0x950
244 fldws 0(0,%r28),%fr8L 786 EXTRD,U %r23,31,32,%r26 ;offset 0x954
245 fldws 0(0,%r28),%fr8R 787 AND %r25,%r19,%r24 ;offset 0x958
246 xmpyu %fr8L,%fr8R,%fr8 788 ADD,L %r31,%r26,%r31 ;offset 0x95c
247 fstds %fr8,-16(0,%r30) 789 CMPCLR,*>>= %r5,%r24,%r0 ;offset 0x960
248 ldw -16(0,%r30),%r22 790 LDO 1(%r31),%r31 ;offset 0x964
249 ldw -12(0,%r30),%r23 791$00060032
250 stw %r23,-4(0,%r19) 792 CMPB,*<<=,N %r31,%r4,$00060036 ;offset 0x968
251 copy %r22,%r21 793 LDO -1(%r29),%r29 ;offset 0x96c
252 ldi 0,%r20 794 ADD,L %r4,%r3,%r4 ;offset 0x970
253 addib,= -1,%r24,L$0049 795$00060036
254 stw %r21,0(0,%r19) 796 ADDIB,=,N -1,%r8,$D0 ;offset 0x974
255 ldo 16(%r28),%r28 797 SUB %r5,%r24,%r28 ;offset 0x978
256 ldo 16(%r25),%r25 798$0006003A
257 ldo 32(%r19),%r19 799 SUB %r4,%r31,%r24 ;offset 0x97c
258 bl L$0042,0 800 SHRPD %r24,%r28,32,%r4 ;offset 0x980
259 ldo 32(%r26),%r26 801 DEPD,Z %r29,31,32,%r9 ;offset 0x984
260L$0049 802 DEPD,Z %r28,31,32,%r5 ;offset 0x988
261 bv,n 0(%r2) 803$0006001C
262 .EXIT 804 EXTRD,U %r4,31,32,%r31 ;offset 0x98c
263 .PROCEND 805 CMPB,*<>,N %r31,%r2,$00060020 ;offset 0x990
264 .IMPORT BN_num_bits_word,CODE 806 MOVB,TR %r6,%r29,$D1 ;offset 0x994
265 .IMPORT fprintf,CODE 807 STD %r29,-152(%r30) ;offset 0x998
266 .IMPORT __iob,DATA 808$0006000C
267 .SPACE $TEXT$ 809 EXTRD,U %r3,31,32,%r25 ;offset 0x99c
268 .SUBSPA $LIT$ 810 COPY %r3,%r26 ;offset 0x9a0
269 811 EXTRD,U %r3,31,32,%r9 ;offset 0x9a4
270 .align 4 812 EXTRD,U %r4,31,32,%r8 ;offset 0x9a8
271L$C0000 813 .CALL ARGW0=GR,ARGW1=GR,RTNVAL=GR ;in=25,26;out=28;
272 .STRING "Division would overflow (%d)\x0a\x00" 814 B,L BN_num_bits_word,%r2 ;offset 0x9ac
273 .IMPORT abort,CODE 815 EXTRD,U %r5,31,32,%r7 ;offset 0x9b0
274 .SPACE $TEXT$ 816 LDI 64,%r20 ;offset 0x9b4
275 .SUBSPA $CODE$ 817 DEPD %r7,31,32,%r5 ;offset 0x9b8
276 818 DEPD %r8,31,32,%r4 ;offset 0x9bc
277 .align 4 819 DEPD %r9,31,32,%r3 ;offset 0x9c0
278 .EXPORT bn_div64,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR 820 CMPB,= %r28,%r20,$00060012 ;offset 0x9c4
279bn_div64 821 COPY %r28,%r24 ;offset 0x9c8
822 MTSARCM %r24 ;offset 0x9cc
823 DEPDI,Z -1,%sar,1,%r19 ;offset 0x9d0
824 CMPB,*>>,N %r4,%r19,$D2 ;offset 0x9d4
825$00060012
826 SUBI 64,%r24,%r31 ;offset 0x9d8
827 CMPCLR,*<< %r4,%r3,%r0 ;offset 0x9dc
828 SUB %r4,%r3,%r4 ;offset 0x9e0
829$00060016
830 CMPB,= %r31,%r0,$0006001A ;offset 0x9e4
831 COPY %r0,%r9 ;offset 0x9e8
832 MTSARCM %r31 ;offset 0x9ec
833 DEPD,Z %r3,%sar,64,%r3 ;offset 0x9f0
834 SUBI 64,%r31,%r26 ;offset 0x9f4
835 MTSAR %r26 ;offset 0x9f8
836 SHRPD %r4,%r5,%sar,%r4 ;offset 0x9fc
837 MTSARCM %r31 ;offset 0xa00
838 DEPD,Z %r5,%sar,64,%r5 ;offset 0xa04
839$0006001A
840 DEPDI,Z -1,31,32,%r19 ;offset 0xa08
841 AND %r3,%r19,%r29 ;offset 0xa0c
842 EXTRD,U %r29,31,32,%r2 ;offset 0xa10
843 DEPDI,Z -1,63,32,%r6 ;offset 0xa14
844 MOVIB,TR 2,%r8,$0006001C ;offset 0xa18
845 EXTRD,U %r3,63,32,%r7 ;offset 0xa1c
846$D2
847 ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20
848 LDIL LR'C$7,%r21 ;offset 0xa24
849 LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28
850 .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28;
851 B,L fprintf,%r2 ;offset 0xa2c
852 LDO RR'C$7(%r21),%r25 ;offset 0xa30
853 .CALL ;
854 B,L abort,%r2 ;offset 0xa34
855 NOP ;offset 0xa38
856 B $D3 ;offset 0xa3c
857 LDW -212(%r30),%r2 ;offset 0xa40
858$00060020
859 COPY %r4,%r26 ;offset 0xa44
860 EXTRD,U %r4,31,32,%r25 ;offset 0xa48
861 COPY %r2,%r24 ;offset 0xa4c
862 .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
863 B,L $$div2U,%r31 ;offset 0xa50
864 EXTRD,U %r2,31,32,%r23 ;offset 0xa54
865 DEPD %r28,31,32,%r29 ;offset 0xa58
866$00060022
867 STD %r29,-152(%r30) ;offset 0xa5c
868$D1
869 AND %r5,%r19,%r24 ;offset 0xa60
870 EXTRD,U %r24,31,32,%r24 ;offset 0xa64
871 STW %r2,-160(%r30) ;offset 0xa68
872 STW %r7,-128(%r30) ;offset 0xa6c
873 FLDD -152(%r30),%fr4 ;offset 0xa70
874 FLDD -152(%r30),%fr7 ;offset 0xa74
875 FLDW -160(%r30),%fr8L ;offset 0xa78
876 FLDW -128(%r30),%fr5L ;offset 0xa7c
877 XMPYU %fr8L,%fr7L,%fr10 ;offset 0xa80
878 FSTD %fr10,-136(%r30) ;offset 0xa84
879 XMPYU %fr8L,%fr7R,%fr22 ;offset 0xa88
880 FSTD %fr22,-144(%r30) ;offset 0xa8c
881 XMPYU %fr5L,%fr4L,%fr11 ;offset 0xa90
882 XMPYU %fr5L,%fr4R,%fr23 ;offset 0xa94
883 FSTD %fr11,-112(%r30) ;offset 0xa98
884 FSTD %fr23,-120(%r30) ;offset 0xa9c
885 LDD -136(%r30),%r28 ;offset 0xaa0
886 DEPD,Z %r28,31,32,%r31 ;offset 0xaa4
887 LDD -144(%r30),%r20 ;offset 0xaa8
888 ADD,L %r20,%r31,%r31 ;offset 0xaac
889 LDD -112(%r30),%r22 ;offset 0xab0
890 DEPD,Z %r22,31,32,%r22 ;offset 0xab4
891 LDD -120(%r30),%r21 ;offset 0xab8
892 B $00060024 ;offset 0xabc
893 ADD,L %r21,%r22,%r23 ;offset 0xac0
894$D0
895 OR %r9,%r29,%r29 ;offset 0xac4
896$00060040
897 EXTRD,U %r29,31,32,%r28 ;offset 0xac8
898$00060002
899$L2
900 LDW -212(%r30),%r2 ;offset 0xacc
901$D3
902 LDW -168(%r30),%r9 ;offset 0xad0
903 LDD -176(%r30),%r8 ;offset 0xad4
904 EXTRD,U %r8,31,32,%r7 ;offset 0xad8
905 LDD -184(%r30),%r6 ;offset 0xadc
906 EXTRD,U %r6,31,32,%r5 ;offset 0xae0
907 LDW -188(%r30),%r4 ;offset 0xae4
908 BVE (%r2) ;offset 0xae8
909 .EXIT
910 LDW,MB -192(%r30),%r3 ;offset 0xaec
911 .PROCEND ;in=23,25;out=28,29;fpin=105,107;
912
913
914
915
916;----------------------------------------------------------------------------
917;
918; Registers to hold 64-bit values to manipulate. The "L" part
919; of the register corresponds to the upper 32-bits, while the "R"
920; part corresponds to the lower 32-bits
921;
922; Note, that when using b6 and b7, the code must save these before
923; using them because they are callee save registers
924;
925;
926; Floating point registers used to hold the values that
927; are manipulated. These don't collide with ftemp1-4 and
928; are all caller save registers
929;
; a0-a7 name %fr22-%fr29 and b0-b1 name %fr30-%fr31. The comba routines
; FLDD one 64-bit array word into each of these and pass the L/R halves
; to XMPYU through the *_ADD_C* macros.
930a0 .reg %fr22
931a0L .reg %fr22L
932a0R .reg %fr22R
933
934a1 .reg %fr23
935a1L .reg %fr23L
936a1R .reg %fr23R
937
938a2 .reg %fr24
939a2L .reg %fr24L
940a2R .reg %fr24R
941
942a3 .reg %fr25
943a3L .reg %fr25L
944a3R .reg %fr25R
945
946a4 .reg %fr26
947a4L .reg %fr26L
948a4R .reg %fr26R
949
950a5 .reg %fr27
951a5L .reg %fr27L
952a5R .reg %fr27R
953
954a6 .reg %fr28
955a6L .reg %fr28L
956a6R .reg %fr28R
957
958a7 .reg %fr29
959a7L .reg %fr29L
960a7R .reg %fr29R
961
962b0 .reg %fr30
963b0L .reg %fr30L
964b0R .reg %fr30R
965
966b1 .reg %fr31
967b1L .reg %fr31L
968b1R .reg %fr31R
969
970;
971; Temporary floating point variables, these are all caller save
972; registers. The *_ADD_C* macros write their XMPYU products here
973; before storing them to the frame.
974ftemp1 .reg %fr4
975ftemp2 .reg %fr5
976ftemp3 .reg %fr6
977ftemp4 .reg %fr7
978
979;
980; The remaining B registers (b2-b7); b0 and b1 are %fr30/%fr31 above.
981;
982
983b2 .reg %fr8
984b2L .reg %fr8L
985b2R .reg %fr8R
986
987b3 .reg %fr9
988b3L .reg %fr9L
989b3R .reg %fr9R
990
991b4 .reg %fr10
992b4L .reg %fr10L
993b4R .reg %fr10R
994
995b5 .reg %fr11
996b5L .reg %fr11L
997b5R .reg %fr11R
998
; b6/b7 name %fr12/%fr13. bn_mul_comba8 and bn_mul_comba4 FSTD both to
; the frame on entry and FLDD them back before returning.
999b6 .reg %fr12
1000b6L .reg %fr12L
1001b6R .reg %fr12R
1002
1003b7 .reg %fr13
1004b7L .reg %fr13L
1005b7R .reg %fr13R
1006
; General-register aliases used by the comba routines and their macros.
1007c1 .reg %r21 ; accumulator word, zeroed on entry by the comba routines
1008temp1 .reg %r20 ; scratch inside the *_ADD_C* macros
1009temp2 .reg %r19 ; scratch (used by SQR_ADD_C)
1010temp3 .reg %r31 ; scratch inside the *_ADD_C* macros
1011
1012m1 .reg %r28 ; cross product reloaded from the frame (SQR_ADD_C2, MUL_ADD_C)
1013c2 .reg %r23 ; accumulator word, zeroed on entry by the comba routines
1014high_one .reg %r1 ; set to 1 << 32 with DEPDI,Z by each comba routine
1015ht .reg %r6 ; high partial product; %r6 is saved/restored by the routines
1016lt .reg %r5 ; low partial product; %r5 is saved/restored by the routines
1017m .reg %r4 ; cross product; %r4 is saved/restored by the routines
1018c3 .reg %r3 ; accumulator word; %r3 is saved/restored by the routines
1019
;
; SQR_ADD_C A0L,A0R,C1,C2,C3
;   Adds the square of one 64-bit word, passed as its upper (A0L) and
;   lower (A0R) 32-bit halves, into the three accumulator words
;   C1 (lowest), C2 and C3.
;   Uses ftemp1-ftemp3, the frame slots at -24/-16/-8(%sp), and
;   overwrites m, lt, ht and temp1-temp3. high_mask must already hold
;   the mask built by the calling routine.
;
1020SQR_ADD_C .macro A0L,A0R,C1,C2,C3
1021 XMPYU A0L,A0R,ftemp1 ; m = A0L*A0R (cross product)
1022 FSTD ftemp1,-24(%sp) ; store m
1023
1024 XMPYU A0R,A0R,ftemp2 ; lt = A0R*A0R
1025 FSTD ftemp2,-16(%sp) ; store lt
1026
1027 XMPYU A0L,A0L,ftemp3 ; ht = A0L*A0L
1028 FSTD ftemp3,-8(%sp) ; store ht
1029
1030 LDD -24(%sp),m ; load m
1031 AND m,high_mask,temp2 ; temp2 = m & Mask (bits that shift into ht)
1032 DEPD,Z m,30,31,temp3 ; temp3 = m << 32+1 (2*m, low part)
1033 LDD -16(%sp),lt ; load lt
1034
1035 LDD -8(%sp),ht ; load ht
1036 EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1 (2*m, high part)
1037 ADD temp3,lt,lt ; lt = lt + temp3
1038 ADD,L ht,temp1,ht ; ht += temp1
1039 ADD,DC ht,%r0,ht ; ht += carry from the lt add
1040
1041 ADD C1,lt,C1 ; c1=c1+lt
1042 ADD,DC ht,%r0,ht ; ht += carry from the c1 add
1043
1044 ADD C2,ht,C2 ; c2=c2+ht
1045 ADD,DC C3,%r0,C3 ; c3 += carry from the c2 add
1046.endm
1047
;
; SQR_ADD_C2 A0L,A0R,A1L,A1R,C1,C2,C3
;   Adds twice the product of two 64-bit words (each passed as upper L
;   and lower R 32-bit halves) into the three accumulator words
;   C1 (lowest), C2 and C3.
;   Uses ftemp1-ftemp4, the frame slots at -32/-24/-16/-8(%sp), and
;   overwrites m, m1, lt, ht, temp1 and temp3. high_one must already
;   hold 1 << 32.
;
1048SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
1049 XMPYU A0L,A1R,ftemp1 ; m1 = A0L*A1R
1050 FSTD ftemp1,-16(%sp) ; store m1
1051 XMPYU A0R,A1L,ftemp2 ; m = A0R*A1L
1052 FSTD ftemp2,-8(%sp) ; store m
1053 XMPYU A0R,A1R,ftemp3 ; lt = A0R*A1R
1054 FSTD ftemp3,-32(%sp)
1055 XMPYU A0L,A1L,ftemp4 ; ht = A0L*A1L
1056 FSTD ftemp4,-24(%sp) ; store ht
1057
1058 LDD -8(%sp),m ; load m
1059 LDD -16(%sp),m1 ; load m1
1060 ADD,L m,m1,m ; m = m+m1
1061
1062 DEPD,Z m,31,32,temp3 ; temp3 = (m+m1) << 32
1063 LDD -24(%sp),ht ; load ht
1064
1065 CMPCLR,*>>= m,m1,%r0 ; if (m < m1), i.e. the m+m1 sum wrapped
1066 ADD,L ht,high_one,ht ; ht+=high_one
1067
1068 EXTRD,U m,31,32,temp1 ; temp1 = m >> 32
1069 LDD -32(%sp),lt ; load lt
1070 ADD,L ht,temp1,ht ; ht+= m>>32
1071 ADD lt,temp3,lt ; lt = lt + ((m+m1) << 32)
1072 ADD,DC ht,%r0,ht ; ht += carry from the lt add
1073
1074 ADD ht,ht,ht ; ht=ht+ht;
1075 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1076
1077 ADD lt,lt,lt ; lt=lt+lt;
1078 ADD,DC ht,%r0,ht ; add in carry (ht++)
1079
1080 ADD C1,lt,C1 ; c1=c1+lt
1081 ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++)
1082 LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise
1083
1084 ADD C2,ht,C2 ; c2 = c2 + ht
1085 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1086.endm
1087
1088;
1089;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
1090; arg0 = r_ptr
1091; arg1 = a_ptr
1092;
1093
1094bn_sqr_comba8
; Squares the eight 64-bit words at a_ptr and stores the sixteen result
; words r[0]..r[15] at r_ptr. Each output word is built by summing the
; a[i]*a[j] terms of one column into the rotating c1/c2/c3 accumulators.
; %r3-%r6 are saved in the 128-byte frame on entry and restored on exit.
280 .PROC 1095 .PROC
281 .CALLINFO FRAME=128,CALLS,SAVE_RP,ENTRY_GR=8 1096 .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
282 .ENTRY 1097 .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
283 stw %r2,-20(0,%r30) 1098 .ENTRY
284 stwm %r8,128(0,%r30) 1099 .align 64
285 stw %r7,-124(0,%r30) 1100
286 stw %r4,-112(0,%r30) 1101 STD %r3,0(%sp) ; save r3 (c3)
287 stw %r3,-108(0,%r30) 1102 STD %r4,8(%sp) ; save r4 (m)
288 copy %r26,%r3 1103 STD %r5,16(%sp) ; save r5 (lt)
289 copy %r25,%r4 1104 STD %r6,24(%sp) ; save r6 (ht)
290 stw %r6,-120(0,%r30) 1105
291 ldi 0,%r7 1106 ;
292 stw %r5,-116(0,%r30) 1107 ; Zero out carries
293 movb,<> %r24,%r5,L$0051 1108 ;
294 ldi 2,%r6 1109 COPY %r0,c1
295 bl L$0068,0 1110 COPY %r0,c2
296 ldi -1,%r28 1111 COPY %r0,c3
297L$0051 1112
298 .CALL ARGW0=GR 1113 LDO 128(%sp),%sp ; bump stack by the 128-byte frame
299 bl BN_num_bits_word,%r2 1114 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
300 copy %r5,%r26 1115 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
301 copy %r28,%r24 1116
302 ldi 32,%r19 1117 ;
303 comb,= %r19,%r24,L$0052 1118 ; Load a[0..7] into the FP registers a0-a7
304 subi 31,%r24,%r19 1119 ;
305 mtsar %r19 1120 FLDD 0(a_ptr),a0
306 zvdepi 1,32,%r19 1121 FLDD 8(a_ptr),a1
307 comb,>>= %r19,%r3,L$0052 1122 FLDD 16(a_ptr),a2
308 addil LR'__iob-$global$+32,%r27 1123 FLDD 24(a_ptr),a3
309 ldo RR'__iob-$global$+32(%r1),%r26 1124 FLDD 32(a_ptr),a4
310 ldil LR'L$C0000,%r25 1125 FLDD 40(a_ptr),a5
311 .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR 1126 FLDD 48(a_ptr),a6
312 bl fprintf,%r2 1127 FLDD 56(a_ptr),a7
313 ldo RR'L$C0000(%r25),%r25 1128
314 .CALL 1129 SQR_ADD_C a0L,a0R,c1,c2,c3 ; a[0]^2
315 bl abort,%r2 1130 STD c1,0(r_ptr) ; r[0] = c1;
316 nop 1131 COPY %r0,c1
317L$0052 1132
318 comb,>> %r5,%r3,L$0053 1133 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 ; 2*a[1]*a[0]
319 subi 32,%r24,%r24 1134 STD c2,8(r_ptr) ; r[1] = c2;
320 sub %r3,%r5,%r3 1135 COPY %r0,c2
321L$0053 1136
322 comib,= 0,%r24,L$0054 1137 SQR_ADD_C a1L,a1R,c3,c1,c2 ; a[1]^2
323 subi 31,%r24,%r19 1138 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 ; 2*a[2]*a[0]
324 mtsar %r19 1139 STD c3,16(r_ptr) ; r[2] = c3;
325 zvdep %r5,32,%r5 1140 COPY %r0,c3
326 zvdep %r3,32,%r21 1141
327 subi 32,%r24,%r20 1142 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 ; 2*a[3]*a[0]
328 mtsar %r20 1143 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 ; 2*a[2]*a[1]
329 vshd 0,%r4,%r20 1144 STD c1,24(r_ptr) ; r[3] = c1;
330 or %r21,%r20,%r3 1145 COPY %r0,c1
331 mtsar %r19 1146
332 zvdep %r4,32,%r4 1147 SQR_ADD_C a2L,a2R,c2,c3,c1 ; a[2]^2
333L$0054 1148 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 ; 2*a[3]*a[1]
334 extru %r5,15,16,%r23 1149 SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1 ; 2*a[4]*a[0]
335 extru %r5,31,16,%r28 1150 STD c2,32(r_ptr) ; r[4] = c2;
336L$0055 1151 COPY %r0,c2
337 extru %r3,15,16,%r19 1152
338 comb,<> %r23,%r19,L$0058 1153 SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2 ; 2*a[5]*a[0]
339 copy %r3,%r26 1154 SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2 ; 2*a[4]*a[1]
340 bl L$0059,0 1155 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 ; 2*a[3]*a[2]
341 zdepi -1,31,16,%r29 1156 STD c3,40(r_ptr) ; r[5] = c3;
342L$0058 1157 COPY %r0,c3
343 .IMPORT $$divU,MILLICODE 1158
344 bl $$divU,%r31 1159 SQR_ADD_C a3L,a3R,c1,c2,c3 ; a[3]^2
345 copy %r23,%r25 1160 SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3 ; 2*a[4]*a[2]
346L$0059 1161 SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3 ; 2*a[5]*a[1]
347 stw %r29,-16(0,%r30) 1162 SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3 ; 2*a[6]*a[0]
348 fldws -16(0,%r30),%fr10L 1163 STD c1,48(r_ptr) ; r[6] = c1;
349 stw %r28,-16(0,%r30) 1164 COPY %r0,c1
350 fldws -16(0,%r30),%fr10R 1165
351 stw %r23,-16(0,%r30) 1166 SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1 ; 2*a[7]*a[0]
352 xmpyu %fr10L,%fr10R,%fr8 1167 SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1 ; 2*a[6]*a[1]
353 fldws -16(0,%r30),%fr10R 1168 SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1 ; 2*a[5]*a[2]
354 fstws %fr8R,-16(0,%r30) 1169 SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1 ; 2*a[4]*a[3]
355 xmpyu %fr10L,%fr10R,%fr9 1170 STD c2,56(r_ptr) ; r[7] = c2;
356 ldw -16(0,%r30),%r8 1171 COPY %r0,c2
357 fstws %fr9R,-16(0,%r30) 1172
358 copy %r8,%r22 1173 SQR_ADD_C a4L,a4R,c3,c1,c2 ; a[4]^2
359 ldw -16(0,%r30),%r8 1174 SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2 ; 2*a[5]*a[3]
360 extru %r4,15,16,%r24 1175 SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2 ; 2*a[6]*a[2]
361 copy %r8,%r21 1176 SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2 ; 2*a[7]*a[1]
362L$0060 1177 STD c3,64(r_ptr) ; r[8] = c3;
363 sub %r3,%r21,%r20 1178 COPY %r0,c3
364 copy %r20,%r19 1179
365 depi 0,31,16,%r19 1180 SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3 ; 2*a[7]*a[2]
366 comib,<> 0,%r19,L$0061 1181 SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3 ; 2*a[6]*a[3]
367 zdep %r20,15,16,%r19 1182 SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3 ; 2*a[5]*a[4]
368 addl %r19,%r24,%r19 1183 STD c1,72(r_ptr) ; r[9] = c1;
369 comb,>>= %r19,%r22,L$0061 1184 COPY %r0,c1
370 sub %r22,%r28,%r22 1185
371 sub %r21,%r23,%r21 1186 SQR_ADD_C a5L,a5R,c2,c3,c1 ; a[5]^2
372 bl L$0060,0 1187 SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1 ; 2*a[6]*a[4]
373 ldo -1(%r29),%r29 1188 SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1 ; 2*a[7]*a[3]
374L$0061 1189 STD c2,80(r_ptr) ; r[10] = c2;
375 stw %r29,-16(0,%r30) 1190 COPY %r0,c2
376 fldws -16(0,%r30),%fr10L 1191
377 stw %r28,-16(0,%r30) 1192 SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2 ; 2*a[7]*a[4]
378 fldws -16(0,%r30),%fr10R 1193 SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2 ; 2*a[6]*a[5]
379 xmpyu %fr10L,%fr10R,%fr8 1194 STD c3,88(r_ptr) ; r[11] = c3;
380 fstws %fr8R,-16(0,%r30) 1195 COPY %r0,c3
381 ldw -16(0,%r30),%r8 1196
382 stw %r23,-16(0,%r30) 1197 SQR_ADD_C a6L,a6R,c1,c2,c3 ; a[6]^2
383 fldws -16(0,%r30),%fr10R 1198 SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3 ; 2*a[7]*a[5]
384 copy %r8,%r19 1199 STD c1,96(r_ptr) ; r[12] = c1;
385 xmpyu %fr10L,%fr10R,%fr8 1200 COPY %r0,c1
386 fstws %fr8R,-16(0,%r30) 1201
387 extru %r19,15,16,%r20 1202 SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1 ; 2*a[7]*a[6]
388 ldw -16(0,%r30),%r8 1203 STD c2,104(r_ptr) ; r[13] = c2;
389 zdep %r19,15,16,%r19 1204 COPY %r0,c2
390 addl %r8,%r20,%r20 1205
391 comclr,<<= %r19,%r4,0 1206 SQR_ADD_C a7L,a7R,c3,c1,c2 ; a[7]^2
392 addi 1,%r20,%r20 1207 STD c3, 112(r_ptr) ; r[14] = c3
393 comb,<<= %r20,%r3,L$0066 1208 STD c1, 120(r_ptr) ; r[15] = c1
394 sub %r4,%r19,%r4 1209
395 addl %r3,%r5,%r3 1210 .EXIT
396 ldo -1(%r29),%r29 1211 LDD -104(%sp),%r6 ; restore r6
397L$0066 1212 LDD -112(%sp),%r5 ; restore r5
398 addib,= -1,%r6,L$0056 1213 LDD -120(%sp),%r4 ; restore r4
399 sub %r3,%r20,%r3 1214 BVE (%rp) ; return to caller
400 zdep %r29,15,16,%r7 1215 LDD,MB -128(%sp),%r3 ; restore r3 and drop the 128-byte frame
401 shd %r3,%r4,16,%r3 1216
402 bl L$0055,0 1217 .PROCEND
403 zdep %r4,15,16,%r4 1218
404L$0056 1219;-----------------------------------------------------------------------------
405 or %r7,%r29,%r28 1220;
406L$0068 1221;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
407 ldw -148(0,%r30),%r2 1222; arg0 = r_ptr
408 ldw -124(0,%r30),%r7 1223; arg1 = a_ptr
409 ldw -120(0,%r30),%r6 1224;
410 ldw -116(0,%r30),%r5 1225
411 ldw -112(0,%r30),%r4 1226bn_sqr_comba4
; Squares the four 64-bit words at a_ptr and stores the eight result
; words r[0]..r[7] at r_ptr. Each output word is built by summing the
; a[i]*a[j] terms of one column into the rotating c1/c2/c3 accumulators.
; %r3-%r6 are saved in the 128-byte frame on entry and restored on exit.
412 ldw -108(0,%r30),%r3 1227 .proc
413 bv 0(%r2) 1228 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
414 ldwm -128(0,%r30),%r8 1229 .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
415 .EXIT 1230 .entry
416 .PROCEND 1231 .align 64
1232 STD %r3,0(%sp) ; save r3 (c3)
1233 STD %r4,8(%sp) ; save r4 (m)
1234 STD %r5,16(%sp) ; save r5 (lt)
1235 STD %r6,24(%sp) ; save r6 (ht)
1236
1237 ;
1238 ; Zero out carries
1239 ;
1240 COPY %r0,c1
1241 COPY %r0,c2
1242 COPY %r0,c3
1243
1244 LDO 128(%sp),%sp ; bump stack by the 128-byte frame
1245 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
1246 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1247
1248 ;
1249 ; Load up all of the values we are going to use.
 ; Only a[0..3] are loaded: this routine never references a4-a7,
 ; so loading 32..56(a_ptr) would read past the four-word input.
1250 ;
1251 FLDD 0(a_ptr),a0
1252 FLDD 8(a_ptr),a1
1253 FLDD 16(a_ptr),a2
1254 FLDD 24(a_ptr),a3
1259
1260 SQR_ADD_C a0L,a0R,c1,c2,c3 ; a[0]^2
1261
1262 STD c1,0(r_ptr) ; r[0] = c1;
1263 COPY %r0,c1
1264
1265 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 ; 2*a[1]*a[0]
1266
1267 STD c2,8(r_ptr) ; r[1] = c2;
1268 COPY %r0,c2
1269
1270 SQR_ADD_C a1L,a1R,c3,c1,c2 ; a[1]^2
1271 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 ; 2*a[2]*a[0]
1272
1273 STD c3,16(r_ptr) ; r[2] = c3;
1274 COPY %r0,c3
1275
1276 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 ; 2*a[3]*a[0]
1277 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 ; 2*a[2]*a[1]
1278
1279 STD c1,24(r_ptr) ; r[3] = c1;
1280 COPY %r0,c1
1281
1282 SQR_ADD_C a2L,a2R,c2,c3,c1 ; a[2]^2
1283 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 ; 2*a[3]*a[1]
1284
1285 STD c2,32(r_ptr) ; r[4] = c2;
1286 COPY %r0,c2
1287
1288 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 ; 2*a[3]*a[2]
1289 STD c3,40(r_ptr) ; r[5] = c3;
1290 COPY %r0,c3
1291
1292 SQR_ADD_C a3L,a3R,c1,c2,c3 ; a[3]^2
1293 STD c1,48(r_ptr) ; r[6] = c1;
1294 STD c2,56(r_ptr) ; r[7] = c2;
1295
1296 .EXIT
1297 LDD -104(%sp),%r6 ; restore r6
1298 LDD -112(%sp),%r5 ; restore r5
1299 LDD -120(%sp),%r4 ; restore r4
1300 BVE (%rp) ; return to caller
1301 LDD,MB -128(%sp),%r3 ; restore r3 and drop the 128-byte frame
1302
1303 .PROCEND
1304
1305
1306;---------------------------------------------------------------------------
1307
;
; MUL_ADD_C A0L,A0R,B0L,B0R,C1,C2,C3
;   Adds the product of two 64-bit words (each passed as upper L and
;   lower R 32-bit halves) into the three accumulator words
;   C1 (lowest), C2 and C3.
;   Uses ftemp1-ftemp4, the frame slots at -32/-24/-16/-8(%sp), and
;   overwrites m, m1, lt, ht, temp1 and temp3. high_one must already
;   hold 1 << 32.
;
1308MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
1309 XMPYU A0L,B0R,ftemp1 ; m1 = A0L*B0R
1310 FSTD ftemp1,-16(%sp) ; store m1
1311 XMPYU A0R,B0L,ftemp2 ; m = A0R*B0L
1312 FSTD ftemp2,-8(%sp) ; store m
1313 XMPYU A0R,B0R,ftemp3 ; lt = A0R*B0R
1314 FSTD ftemp3,-32(%sp)
1315 XMPYU A0L,B0L,ftemp4 ; ht = A0L*B0L
1316 FSTD ftemp4,-24(%sp) ; store ht
1317
1318 LDD -8(%sp),m ; load m
1319 LDD -16(%sp),m1 ; load m1
1320 ADD,L m,m1,m ; m = m+m1
1321
1322 DEPD,Z m,31,32,temp3 ; temp3 = (m+m1) << 32
1323 LDD -24(%sp),ht ; load ht
1324
1325 CMPCLR,*>>= m,m1,%r0 ; if (m < m1), i.e. the m+m1 sum wrapped
1326 ADD,L ht,high_one,ht ; ht+=high_one
1327
1328 EXTRD,U m,31,32,temp1 ; temp1 = m >> 32
1329 LDD -32(%sp),lt ; load lt
1330 ADD,L ht,temp1,ht ; ht+= m>>32
1331 ADD lt,temp3,lt ; lt = lt + ((m+m1) << 32)
1332 ADD,DC ht,%r0,ht ; ht += carry from the lt add
1333
1334 ADD C1,lt,C1 ; c1=c1+lt
1335 ADD,DC ht,%r0,ht ; add in carry (ht++)
1336
1337 ADD C2,ht,C2 ; c2 = c2 + ht
1338 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1339.endm
1340
1341
1342;
1343;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1344; arg0 = r_ptr
1345; arg1 = a_ptr
1346; arg2 = b_ptr
1347;
1348
1349bn_mul_comba8
; Multiplies the eight 64-bit words at a_ptr by the eight at b_ptr and
; stores the sixteen result words r[0]..r[15] at r_ptr. Each output word
; is built by summing the a[i]*b[j] terms of one column into the
; rotating c1/c2/c3 accumulators.
; %r3-%r6 and %fr12/%fr13 (b6/b7) are saved in the 128-byte frame on
; entry and restored on exit.
1350 .proc
1351 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1352 .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1353 .entry
1354 .align 64
1355
1356 STD %r3,0(%sp) ; save r3 (c3)
1357 STD %r4,8(%sp) ; save r4 (m)
1358 STD %r5,16(%sp) ; save r5 (lt)
1359 STD %r6,24(%sp) ; save r6 (ht)
1360 FSTD %fr12,32(%sp) ; save fr12 (b6)
1361 FSTD %fr13,40(%sp) ; save fr13 (b7)
1362
1363 ;
1364 ; Zero out carries
1365 ;
1366 COPY %r0,c1
1367 COPY %r0,c2
1368 COPY %r0,c3
1369
1370 LDO 128(%sp),%sp ; bump stack by the 128-byte frame
1371 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1372
1373 ;
1374 ; Load a[0..7] and b[0..7] into the FP registers
1375 ;
1376 FLDD 0(a_ptr),a0
1377 FLDD 8(a_ptr),a1
1378 FLDD 16(a_ptr),a2
1379 FLDD 24(a_ptr),a3
1380 FLDD 32(a_ptr),a4
1381 FLDD 40(a_ptr),a5
1382 FLDD 48(a_ptr),a6
1383 FLDD 56(a_ptr),a7
1384
1385 FLDD 0(b_ptr),b0
1386 FLDD 8(b_ptr),b1
1387 FLDD 16(b_ptr),b2
1388 FLDD 24(b_ptr),b3
1389 FLDD 32(b_ptr),b4
1390 FLDD 40(b_ptr),b5
1391 FLDD 48(b_ptr),b6
1392 FLDD 56(b_ptr),b7
1393
1394 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
1395 STD c1,0(r_ptr) ; r[0] = c1
1396 COPY %r0,c1
1397
1398 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
1399 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
1400 STD c2,8(r_ptr) ; r[1] = c2
1401 COPY %r0,c2
1402
1403 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
1404 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
1405 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
1406 STD c3,16(r_ptr) ; r[2] = c3
1407 COPY %r0,c3
1408
1409 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
1410 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
1411 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
1412 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
1413 STD c1,24(r_ptr) ; r[3] = c1
1414 COPY %r0,c1
1415
1416 MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
1417 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
1418 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
1419 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
1420 MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
1421 STD c2,32(r_ptr) ; r[4] = c2
1422 COPY %r0,c2
1423
1424 MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
1425 MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
1426 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
1427 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
1428 MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
1429 MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
1430 STD c3,40(r_ptr) ; r[5] = c3
1431 COPY %r0,c3
1432
1433 MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
1434 MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
1435 MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
1436 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
1437 MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
1438 MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
1439 MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
1440 STD c1,48(r_ptr) ; r[6] = c1
1441 COPY %r0,c1
1442
1443 MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
1444 MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
1445 MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
1446 MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
1447 MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
1448 MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
1449 MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
1450 MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
1451 STD c2,56(r_ptr) ; r[7] = c2
1452 COPY %r0,c2
1453
1454 MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
1455 MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
1456 MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
1457 MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
1458 MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
1459 MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
1460 MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
1461 STD c3,64(r_ptr) ; r[8] = c3
1462 COPY %r0,c3
1463
1464 MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
1465 MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
1466 MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
1467 MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
1468 MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
1469 MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
1470 STD c1,72(r_ptr) ; r[9] = c1
1471 COPY %r0,c1
1472
1473 MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
1474 MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
1475 MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
1476 MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
1477 MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
1478 STD c2,80(r_ptr) ; r[10] = c2
1479 COPY %r0,c2
1480
1481 MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
1482 MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
1483 MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
1484 MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
1485 STD c3,88(r_ptr) ; r[11] = c3
1486 COPY %r0,c3
1487
1488 MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
1489 MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
1490 MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
1491 STD c1,96(r_ptr) ; r[12] = c1
1492 COPY %r0,c1
1493
1494 MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
1495 MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
1496 STD c2,104(r_ptr) ; r[13] = c2
1497 COPY %r0,c2
1498
1499 MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
1500 STD c3,112(r_ptr) ; r[14] = c3
1501 STD c1,120(r_ptr) ; r[15] = c1
1502
1503 .EXIT
1504 FLDD -88(%sp),%fr13 ; restore fr13 (b7)
1505 FLDD -96(%sp),%fr12 ; restore fr12 (b6)
1506 LDD -104(%sp),%r6 ; restore r6
1507 LDD -112(%sp),%r5 ; restore r5
1508 LDD -120(%sp),%r4 ; restore r4
1509 BVE (%rp) ; return to caller
1510 LDD,MB -128(%sp),%r3 ; restore r3 and drop the 128-byte frame
1511
1512 .PROCEND
1513
1514;-----------------------------------------------------------------------------
1515;
1516;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1517; arg0 = r_ptr
1518; arg1 = a_ptr
1519; arg2 = b_ptr
1520;
1521
1522bn_mul_comba4
; Multiplies the four 64-bit words at a_ptr by the four at b_ptr and
; stores the eight result words r[0]..r[7] at r_ptr. Each output word
; is built by summing the a[i]*b[j] terms of one column into the
; rotating c1/c2/c3 accumulators.
; %r3-%r6 and %fr12/%fr13 are saved in the 128-byte frame on entry and
; restored on exit.
1523 .proc
1524 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1525 .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1526 .entry
1527 .align 64
1528
1529 STD %r3,0(%sp) ; save r3 (c3)
1530 STD %r4,8(%sp) ; save r4 (m)
1531 STD %r5,16(%sp) ; save r5 (lt)
1532 STD %r6,24(%sp) ; save r6 (ht)
1533 FSTD %fr12,32(%sp) ; save fr12 (b6; not otherwise used in this routine)
1534 FSTD %fr13,40(%sp) ; save fr13 (b7; not otherwise used in this routine)
1535
1536 ;
1537 ; Zero out carries
1538 ;
1539 COPY %r0,c1
1540 COPY %r0,c2
1541 COPY %r0,c3
1542
1543 LDO 128(%sp),%sp ; bump stack by the 128-byte frame
1544 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1545
1546 ;
1547 ; Load a[0..3] and b[0..3] into the FP registers
1548 ;
1549 FLDD 0(a_ptr),a0
1550 FLDD 8(a_ptr),a1
1551 FLDD 16(a_ptr),a2
1552 FLDD 24(a_ptr),a3
1553
1554 FLDD 0(b_ptr),b0
1555 FLDD 8(b_ptr),b1
1556 FLDD 16(b_ptr),b2
1557 FLDD 24(b_ptr),b3
1558
1559 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
1560 STD c1,0(r_ptr) ; r[0] = c1
1561 COPY %r0,c1
1562
1563 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
1564 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
1565 STD c2,8(r_ptr) ; r[1] = c2
1566 COPY %r0,c2
1567
1568 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
1569 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
1570 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
1571 STD c3,16(r_ptr) ; r[2] = c3
1572 COPY %r0,c3
1573
1574 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
1575 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
1576 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
1577 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
1578 STD c1,24(r_ptr) ; r[3] = c1
1579 COPY %r0,c1
1580
1581 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
1582 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
1583 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
1584 STD c2,32(r_ptr) ; r[4] = c2
1585 COPY %r0,c2
1586
1587 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
1588 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
1589 STD c3,40(r_ptr) ; r[5] = c3
1590 COPY %r0,c3
1591
1592 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
1593 STD c1,48(r_ptr) ; r[6] = c1
1594 STD c2,56(r_ptr) ; r[7] = c2
1595
1596 .EXIT
1597 FLDD -88(%sp),%fr13 ; restore fr13
1598 FLDD -96(%sp),%fr12 ; restore fr12
1599 LDD -104(%sp),%r6 ; restore r6
1600 LDD -112(%sp),%r5 ; restore r5
1601 LDD -120(%sp),%r4 ; restore r4
1602 BVE (%rp) ; return to caller
1603 LDD,MB -128(%sp),%r3 ; restore r3 and drop the 128-byte frame
1604
1605 .PROCEND
1606
1607
; Trailing space/subspace declarations, the $global$ import used by the
; __iob address computation at $D2, and the literal data for this file.
1608 .SPACE $TEXT$
1609 .SUBSPA $CODE$
1610 .SPACE $PRIVATE$,SORT=16
1611 .IMPORT $global$,DATA
1612 .SPACE $TEXT$
1613 .SUBSPA $CODE$
1614 .SUBSPA $LIT$,QUAD=0,ALIGN=8,ACCESS=0x2c,SORT=16
; C$7: format string whose address is loaded into %r25 for the fprintf
; call on the $D2 path, just before abort is called.
1615C$7
1616 .ALIGN 8
1617 .STRINGZ "Division would overflow (%d)\n"
1618 .END