diff options
Diffstat (limited to 'src/lib/libcrypto/bn/asm/pa-risc2.s')
-rw-r--r-- | src/lib/libcrypto/bn/asm/pa-risc2.s | 2024 |
1 files changed, 1613 insertions, 411 deletions
diff --git a/src/lib/libcrypto/bn/asm/pa-risc2.s b/src/lib/libcrypto/bn/asm/pa-risc2.s index c2725996a4..7239aa2c76 100644 --- a/src/lib/libcrypto/bn/asm/pa-risc2.s +++ b/src/lib/libcrypto/bn/asm/pa-risc2.s | |||
@@ -1,416 +1,1618 @@ | |||
1 | .SPACE $PRIVATE$ | 1 | ; |
2 | .SUBSPA $DATA$,QUAD=1,ALIGN=8,ACCESS=31 | 2 | ; PA-RISC 2.0 implementation of bn_asm code, based on the |
3 | .SUBSPA $BSS$,QUAD=1,ALIGN=8,ACCESS=31,ZERO,SORT=82 | 3 | ; 64-bit version of the code. This code is effectively the |
4 | .SPACE $TEXT$ | 4 | ; same as the 64-bit version except the register model is |
5 | .SUBSPA $LIT$,QUAD=0,ALIGN=8,ACCESS=44 | 5 | ; slightly different given all values must be 32-bit between |
6 | .SUBSPA $CODE$,QUAD=0,ALIGN=8,ACCESS=44,CODE_ONLY | 6 | ; function calls. Thus the 64-bit return values are returned |
7 | .IMPORT $global$,DATA | 7 | ; in %ret0 and %ret1 vs just %ret0 as is done in 64-bit |
8 | .IMPORT $$dyncall,MILLICODE | 8 | ; |
9 | ; gcc_compiled.: | 9 | ; |
10 | .SPACE $TEXT$ | 10 | ; This code is approximately 2x faster than the C version |
11 | .SUBSPA $CODE$ | 11 | ; for RSA/DSA. |
12 | 12 | ; | |
13 | .align 4 | 13 | ; See http://devresource.hp.com/ for more details on the PA-RISC |
14 | .EXPORT bn_mul_add_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR | 14 | ; architecture. Also see the book "PA-RISC 2.0 Architecture" |
15 | ; by Gerry Kane for information on the instruction set architecture. | ||
16 | ; | ||
17 | ; Code written by Chris Ruemmler (with some help from the HP C | ||
18 | ; compiler). | ||
19 | ; | ||
20 | ; The code compiles with HP's assembler | ||
21 | ; | ||
22 | |||
23 | .level 2.0N | ||
24 | .space $TEXT$ | ||
25 | .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY | ||
26 | |||
27 | ; | ||
28 | ; Global Register definitions used for the routines. | ||
29 | ; | ||
30 | ; Some information about HP's runtime architecture for 32-bits. | ||
31 | ; | ||
32 | ; "Caller save" means the calling function must save the register | ||
33 | ; if it wants the register to be preserved. | ||
34 | ; "Callee save" means if a function uses the register, it must save | ||
35 | ; the value before using it. | ||
36 | ; | ||
37 | ; For the floating point registers | ||
38 | ; | ||
39 | ; "caller save" registers: fr4-fr11, fr22-fr31 | ||
40 | ; "callee save" registers: fr12-fr21 | ||
41 | ; "special" registers: fr0-fr3 (status and exception registers) | ||
42 | ; | ||
43 | ; For the integer registers | ||
44 | ; value zero : r0 | ||
45 | ; "caller save" registers: r1,r19-r26 | ||
46 | ; "callee save" registers: r3-r18 | ||
47 | ; return register : r2 (rp) | ||
48 | ; return values ; r28,r29 (ret0,ret1) | ||
49 | ; Stack pointer ; r30 (sp) | ||
50 | ; millicode return ptr ; r31 (also a caller save register) | ||
51 | |||
52 | |||
53 | ; | ||
54 | ; Arguments to the routines | ||
55 | ; | ||
56 | r_ptr .reg %r26 | ||
57 | a_ptr .reg %r25 | ||
58 | b_ptr .reg %r24 | ||
59 | num .reg %r24 | ||
60 | n .reg %r23 | ||
61 | |||
62 | ; | ||
63 | ; Note that the "w" argument for bn_mul_add_words and bn_mul_words | ||
64 | ; is passed on the stack at a delta of -56 from the top of stack | ||
65 | ; as the routine is entered. | ||
66 | ; | ||
67 | |||
68 | ; | ||
69 | ; Globals used in some routines | ||
70 | ; | ||
71 | |||
72 | top_overflow .reg %r23 | ||
73 | high_mask .reg %r22 ; value 0xffffffff80000000L | ||
74 | |||
75 | |||
76 | ;------------------------------------------------------------------------------ | ||
77 | ; | ||
78 | ; bn_mul_add_words | ||
79 | ; | ||
80 | ;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr, | ||
81 | ; int num, BN_ULONG w) | ||
82 | ; | ||
83 | ; arg0 = r_ptr | ||
84 | ; arg1 = a_ptr | ||
85 | ; arg2 = num | |
86 | ; -56(sp) = w | ||
87 | ; | ||
88 | ; Local register definitions | ||
89 | ; | ||
90 | |||
91 | fm1 .reg %fr22 | ||
92 | fm .reg %fr23 | ||
93 | ht_temp .reg %fr24 | ||
94 | ht_temp_1 .reg %fr25 | ||
95 | lt_temp .reg %fr26 | ||
96 | lt_temp_1 .reg %fr27 | ||
97 | fm1_1 .reg %fr28 | ||
98 | fm_1 .reg %fr29 | ||
99 | |||
100 | fw_h .reg %fr7L | ||
101 | fw_l .reg %fr7R | ||
102 | fw .reg %fr7 | ||
103 | |||
104 | fht_0 .reg %fr8L | ||
105 | flt_0 .reg %fr8R | ||
106 | t_float_0 .reg %fr8 | ||
107 | |||
108 | fht_1 .reg %fr9L | ||
109 | flt_1 .reg %fr9R | ||
110 | t_float_1 .reg %fr9 | ||
111 | |||
112 | tmp_0 .reg %r31 | ||
113 | tmp_1 .reg %r21 | ||
114 | m_0 .reg %r20 | ||
115 | m_1 .reg %r19 | ||
116 | ht_0 .reg %r1 | ||
117 | ht_1 .reg %r3 | ||
118 | lt_0 .reg %r4 | ||
119 | lt_1 .reg %r5 | ||
120 | m1_0 .reg %r6 | ||
121 | m1_1 .reg %r7 | ||
122 | rp_val .reg %r8 | ||
123 | rp_val_1 .reg %r9 | ||
124 | |||
15 | bn_mul_add_words | 125 | bn_mul_add_words |
16 | .PROC | 126 | .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN |
17 | .CALLINFO FRAME=64,CALLS,SAVE_RP,ENTRY_GR=4 | 127 | .proc |
18 | .ENTRY | 128 | .callinfo frame=128 |
19 | stw %r2,-20(0,%r30) | 129 | .entry |
20 | stwm %r4,64(0,%r30) | 130 | .align 64 |
21 | copy %r24,%r31 | 131 | |
22 | stw %r3,-60(0,%r30) | 132 | STD %r3,0(%sp) ; save r3 |
23 | ldi 0,%r20 | 133 | STD %r4,8(%sp) ; save r4 |
24 | ldo 12(%r26),%r2 | 134 | NOP ; Needed to make the loop 16-byte aligned |
25 | stw %r23,-16(0,%r30) | 135 | NOP ; needed to make the loop 16-byte aligned |
26 | copy %r25,%r3 | 136 | |
27 | ldo 12(%r3),%r1 | 137 | STD %r5,16(%sp) ; save r5 |
28 | fldws -16(0,%r30),%fr8L | 138 | NOP |
29 | L$0010 | 139 | STD %r6,24(%sp) ; save r6 |
30 | copy %r20,%r25 | 140 | STD %r7,32(%sp) ; save r7 |
31 | ldi 0,%r24 | 141 | |
32 | fldws 0(0,%r3),%fr9L | 142 | STD %r8,40(%sp) ; save r8 |
33 | ldw 0(0,%r26),%r19 | 143 | STD %r9,48(%sp) ; save r9 |
34 | xmpyu %fr8L,%fr9L,%fr9 | 144 | COPY %r0,%ret1 ; return 0 by default |
35 | fstds %fr9,-16(0,%r30) | 145 | DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 |
36 | copy %r19,%r23 | 146 | |
37 | ldw -16(0,%r30),%r28 | 147 | CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit |
38 | ldw -12(0,%r30),%r29 | 148 | LDO 128(%sp),%sp ; bump stack |
39 | ldi 0,%r22 | 149 | |
40 | add %r23,%r29,%r29 | 150 | ; |
41 | addc %r22,%r28,%r28 | 151 | ; The loop is unrolled twice, so if there is only 1 number |
42 | add %r25,%r29,%r29 | 152 | ; then go straight to the cleanup code. |
43 | addc %r24,%r28,%r28 | 153 | ; |
44 | copy %r28,%r21 | 154 | CMPIB,= 1,num,bn_mul_add_words_single_top |
45 | ldi 0,%r20 | 155 | FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l) |
46 | copy %r21,%r20 | 156 | |
47 | addib,= -1,%r31,L$0011 | 157 | ; |
48 | stw %r29,0(0,%r26) | 158 | ; This loop is unrolled 2 times (64-byte aligned as well) |
49 | copy %r20,%r25 | 159 | ; |
50 | ldi 0,%r24 | 160 | ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus |
51 | fldws -8(0,%r1),%fr9L | 161 | ; two 32-bit multiplies can be issued per cycle. |
52 | ldw -8(0,%r2),%r19 | 162 | ; |
53 | xmpyu %fr8L,%fr9L,%fr9 | 163 | bn_mul_add_words_unroll2 |
54 | fstds %fr9,-16(0,%r30) | 164 | |
55 | copy %r19,%r23 | 165 | FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) |
56 | ldw -16(0,%r30),%r28 | 166 | FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr9) ht(L)/lt(R) |
57 | ldw -12(0,%r30),%r29 | 167 | LDD 0(r_ptr),rp_val ; rp[0] |
58 | ldi 0,%r22 | 168 | LDD 8(r_ptr),rp_val_1 ; rp[1] |
59 | add %r23,%r29,%r29 | 169 | |
60 | addc %r22,%r28,%r28 | 170 | XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l |
61 | add %r25,%r29,%r29 | 171 | XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l |
62 | addc %r24,%r28,%r28 | 172 | FSTD fm1,-16(%sp) ; -16(sp) = m1[0] |
63 | copy %r28,%r21 | 173 | FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1] |
64 | ldi 0,%r20 | 174 | |
65 | copy %r21,%r20 | 175 | XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h |
66 | addib,= -1,%r31,L$0011 | 176 | XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h |
67 | stw %r29,-8(0,%r2) | 177 | FSTD fm,-8(%sp) ; -8(sp) = m[0] |
68 | copy %r20,%r25 | 178 | FSTD fm_1,-40(%sp) ; -40(sp) = m[1] |
69 | ldi 0,%r24 | 179 | |
70 | fldws -4(0,%r1),%fr9L | 180 | XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h |
71 | ldw -4(0,%r2),%r19 | 181 | XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h |
72 | xmpyu %fr8L,%fr9L,%fr9 | 182 | FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp |
73 | fstds %fr9,-16(0,%r30) | 183 | FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1 |
74 | copy %r19,%r23 | 184 | |
75 | ldw -16(0,%r30),%r28 | 185 | XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l |
76 | ldw -12(0,%r30),%r29 | 186 | XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l |
77 | ldi 0,%r22 | 187 | FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp |
78 | add %r23,%r29,%r29 | 188 | FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1 |
79 | addc %r22,%r28,%r28 | 189 | |
80 | add %r25,%r29,%r29 | 190 | LDD -8(%sp),m_0 ; m[0] |
81 | addc %r24,%r28,%r28 | 191 | LDD -40(%sp),m_1 ; m[1] |
82 | copy %r28,%r21 | 192 | LDD -16(%sp),m1_0 ; m1[0] |
83 | ldi 0,%r20 | 193 | LDD -48(%sp),m1_1 ; m1[1] |
84 | copy %r21,%r20 | 194 | |
85 | addib,= -1,%r31,L$0011 | 195 | LDD -24(%sp),ht_0 ; ht[0] |
86 | stw %r29,-4(0,%r2) | 196 | LDD -56(%sp),ht_1 ; ht[1] |
87 | copy %r20,%r25 | 197 | ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0]; |
88 | ldi 0,%r24 | 198 | ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1]; |
89 | fldws 0(0,%r1),%fr9L | 199 | |
90 | ldw 0(0,%r2),%r19 | 200 | LDD -32(%sp),lt_0 |
91 | xmpyu %fr8L,%fr9L,%fr9 | 201 | LDD -64(%sp),lt_1 |
92 | fstds %fr9,-16(0,%r30) | 202 | CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0]) |
93 | copy %r19,%r23 | 203 | ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32) |
94 | ldw -16(0,%r30),%r28 | 204 | |
95 | ldw -12(0,%r30),%r29 | 205 | CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1]) |
96 | ldi 0,%r22 | 206 | ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32) |
97 | add %r23,%r29,%r29 | 207 | EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32 |
98 | addc %r22,%r28,%r28 | 208 | DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32 |
99 | add %r25,%r29,%r29 | 209 | |
100 | addc %r24,%r28,%r28 | 210 | EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32 |
101 | copy %r28,%r21 | 211 | DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32 |
102 | ldi 0,%r20 | 212 | ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32) |
103 | copy %r21,%r20 | 213 | ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32) |
104 | addib,= -1,%r31,L$0011 | 214 | |
105 | stw %r29,0(0,%r2) | 215 | ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0]; |
106 | ldo 16(%r1),%r1 | 216 | ADD,DC ht_0,%r0,ht_0 ; ht[0]++ |
107 | ldo 16(%r3),%r3 | 217 | ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1]; |
108 | ldo 16(%r2),%r2 | 218 | ADD,DC ht_1,%r0,ht_1 ; ht[1]++ |
109 | bl L$0010,0 | 219 | |
110 | ldo 16(%r26),%r26 | 220 | ADD %ret1,lt_0,lt_0 ; lt[0] = lt[0] + c; |
111 | L$0011 | 221 | ADD,DC ht_0,%r0,ht_0 ; ht[0]++ |
112 | copy %r20,%r28 | 222 | ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0] |
113 | ldw -84(0,%r30),%r2 | 223 | ADD,DC ht_0,%r0,ht_0 ; ht[0]++ |
114 | ldw -60(0,%r30),%r3 | 224 | |
115 | bv 0(%r2) | 225 | LDO -2(num),num ; num = num - 2; |
116 | ldwm -64(0,%r30),%r4 | 226 | ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c); |
117 | .EXIT | 227 | ADD,DC ht_1,%r0,ht_1 ; ht[1]++ |
118 | .PROCEND | 228 | STD lt_0,0(r_ptr) ; rp[0] = lt[0] |
119 | .align 4 | 229 | |
120 | .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR | 230 | ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1] |
231 | ADD,DC ht_1,%r0,%ret1 ; ht[1]++ | ||
232 | LDO 16(a_ptr),a_ptr ; a_ptr += 2 | ||
233 | |||
234 | STD lt_1,8(r_ptr) ; rp[1] = lt[1] | ||
235 | CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do | ||
236 | LDO 16(r_ptr),r_ptr ; r_ptr += 2 | ||
237 | |||
238 | CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one | ||
239 | |||
240 | ; | ||
241 | ; Top of loop aligned on 64-byte boundary | ||
242 | ; | ||
243 | bn_mul_add_words_single_top | ||
244 | FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) | ||
245 | LDD 0(r_ptr),rp_val ; rp[0] | ||
246 | LDO 8(a_ptr),a_ptr ; a_ptr++ | ||
247 | XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l | ||
248 | FSTD fm1,-16(%sp) ; -16(sp) = m1 | ||
249 | XMPYU flt_0,fw_h,fm ; m = lt*fw_h | ||
250 | FSTD fm,-8(%sp) ; -8(sp) = m | ||
251 | XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h | ||
252 | FSTD ht_temp,-24(%sp) ; -24(sp) = ht | ||
253 | XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l | ||
254 | FSTD lt_temp,-32(%sp) ; -32(sp) = lt | ||
255 | |||
256 | LDD -8(%sp),m_0 | ||
257 | LDD -16(%sp),m1_0 ; m1 = temp1 | ||
258 | ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; | ||
259 | LDD -24(%sp),ht_0 | ||
260 | LDD -32(%sp),lt_0 | ||
261 | |||
262 | CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) | ||
263 | ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) | ||
264 | |||
265 | EXTRD,U tmp_0,31,32,m_0 ; m>>32 | ||
266 | DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 | ||
267 | |||
268 | ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) | ||
269 | ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1; | ||
270 | ADD,DC ht_0,%r0,ht_0 ; ht++ | ||
271 | ADD %ret1,tmp_0,lt_0 ; lt = lt + c; | ||
272 | ADD,DC ht_0,%r0,ht_0 ; ht++ | ||
273 | ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0] | ||
274 | ADD,DC ht_0,%r0,%ret1 ; ht++ | ||
275 | STD lt_0,0(r_ptr) ; rp[0] = lt | ||
276 | |||
277 | bn_mul_add_words_exit | ||
278 | .EXIT | ||
279 | |||
280 | EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 | ||
281 | LDD -80(%sp),%r9 ; restore r9 | ||
282 | LDD -88(%sp),%r8 ; restore r8 | ||
283 | LDD -96(%sp),%r7 ; restore r7 | ||
284 | LDD -104(%sp),%r6 ; restore r6 | ||
285 | LDD -112(%sp),%r5 ; restore r5 | ||
286 | LDD -120(%sp),%r4 ; restore r4 | ||
287 | BVE (%rp) | ||
288 | LDD,MB -128(%sp),%r3 ; restore r3 | ||
289 | .PROCEND ;in=23,24,25,26,29;out=28; | ||
290 | |||
291 | ;---------------------------------------------------------------------------- | ||
292 | ; | ||
293 | ;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | ||
294 | ; | ||
295 | ; arg0 = rp | ||
296 | ; arg1 = ap | ||
297 | ; arg2 = num | |
298 | ; w on stack at -56(sp) | ||
299 | |||
121 | bn_mul_words | 300 | bn_mul_words |
122 | .PROC | 301 | .proc |
123 | .CALLINFO FRAME=64,CALLS,SAVE_RP,ENTRY_GR=3 | 302 | .callinfo frame=128 |
124 | .ENTRY | 303 | .entry |
125 | stw %r2,-20(0,%r30) | 304 | .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN |
126 | copy %r25,%r2 | 305 | .align 64 |
127 | stwm %r4,64(0,%r30) | 306 | |
128 | copy %r24,%r19 | 307 | STD %r3,0(%sp) ; save r3 |
129 | ldi 0,%r28 | 308 | STD %r4,8(%sp) ; save r4 |
130 | stw %r23,-16(0,%r30) | 309 | NOP |
131 | ldo 12(%r26),%r31 | 310 | STD %r5,16(%sp) ; save r5 |
132 | ldo 12(%r2),%r29 | 311 | |
133 | fldws -16(0,%r30),%fr8L | 312 | STD %r6,24(%sp) ; save r6 |
134 | L$0026 | 313 | STD %r7,32(%sp) ; save r7 |
135 | fldws 0(0,%r2),%fr9L | 314 | COPY %r0,%ret1 ; return 0 by default |
136 | xmpyu %fr8L,%fr9L,%fr9 | 315 | DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 |
137 | fstds %fr9,-16(0,%r30) | 316 | |
138 | copy %r28,%r21 | 317 | CMPIB,>= 0,num,bn_mul_words_exit |
139 | ldi 0,%r20 | 318 | LDO 128(%sp),%sp ; bump stack |
140 | ldw -16(0,%r30),%r24 | 319 | |
141 | ldw -12(0,%r30),%r25 | 320 | ; |
142 | add %r21,%r25,%r25 | 321 | ; See if only 1 word to do, thus just do cleanup |
143 | addc %r20,%r24,%r24 | 322 | ; |
144 | copy %r24,%r23 | 323 | CMPIB,= 1,num,bn_mul_words_single_top |
145 | ldi 0,%r22 | 324 | FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l) |
146 | copy %r23,%r28 | 325 | |
147 | addib,= -1,%r19,L$0027 | 326 | ; |
148 | stw %r25,0(0,%r26) | 327 | ; This loop is unrolled 2 times (64-byte aligned as well) |
149 | fldws -8(0,%r29),%fr9L | 328 | ; |
150 | xmpyu %fr8L,%fr9L,%fr9 | 329 | ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus |
151 | fstds %fr9,-16(0,%r30) | 330 | ; two 32-bit multiplies can be issued per cycle. |
152 | copy %r28,%r21 | 331 | ; |
153 | ldi 0,%r20 | 332 | bn_mul_words_unroll2 |
154 | ldw -16(0,%r30),%r24 | 333 | |
155 | ldw -12(0,%r30),%r25 | 334 | FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) |
156 | add %r21,%r25,%r25 | 335 | FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr9) ht(L)/lt(R) |
157 | addc %r20,%r24,%r24 | 336 | XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l |
158 | copy %r24,%r23 | 337 | XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l |
159 | ldi 0,%r22 | 338 | |
160 | copy %r23,%r28 | 339 | FSTD fm1,-16(%sp) ; -16(sp) = m1 |
161 | addib,= -1,%r19,L$0027 | 340 | FSTD fm1_1,-48(%sp) ; -48(sp) = m1 |
162 | stw %r25,-8(0,%r31) | 341 | XMPYU flt_0,fw_h,fm ; m = lt*fw_h |
163 | fldws -4(0,%r29),%fr9L | 342 | XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h |
164 | xmpyu %fr8L,%fr9L,%fr9 | 343 | |
165 | fstds %fr9,-16(0,%r30) | 344 | FSTD fm,-8(%sp) ; -8(sp) = m |
166 | copy %r28,%r21 | 345 | FSTD fm_1,-40(%sp) ; -40(sp) = m |
167 | ldi 0,%r20 | 346 | XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h |
168 | ldw -16(0,%r30),%r24 | 347 | XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h |
169 | ldw -12(0,%r30),%r25 | 348 | |
170 | add %r21,%r25,%r25 | 349 | FSTD ht_temp,-24(%sp) ; -24(sp) = ht |
171 | addc %r20,%r24,%r24 | 350 | FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht |
172 | copy %r24,%r23 | 351 | XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l |
173 | ldi 0,%r22 | 352 | XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l |
174 | copy %r23,%r28 | 353 | |
175 | addib,= -1,%r19,L$0027 | 354 | FSTD lt_temp,-32(%sp) ; -32(sp) = lt |
176 | stw %r25,-4(0,%r31) | 355 | FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt |
177 | fldws 0(0,%r29),%fr9L | 356 | LDD -8(%sp),m_0 |
178 | xmpyu %fr8L,%fr9L,%fr9 | 357 | LDD -40(%sp),m_1 |
179 | fstds %fr9,-16(0,%r30) | 358 | |
180 | copy %r28,%r21 | 359 | LDD -16(%sp),m1_0 |
181 | ldi 0,%r20 | 360 | LDD -48(%sp),m1_1 |
182 | ldw -16(0,%r30),%r24 | 361 | LDD -24(%sp),ht_0 |
183 | ldw -12(0,%r30),%r25 | 362 | LDD -56(%sp),ht_1 |
184 | add %r21,%r25,%r25 | 363 | |
185 | addc %r20,%r24,%r24 | 364 | ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1; |
186 | copy %r24,%r23 | 365 | ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1; |
187 | ldi 0,%r22 | 366 | LDD -32(%sp),lt_0 |
188 | copy %r23,%r28 | 367 | LDD -64(%sp),lt_1 |
189 | addib,= -1,%r19,L$0027 | 368 | |
190 | stw %r25,0(0,%r31) | 369 | CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1) |
191 | ldo 16(%r29),%r29 | 370 | ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) |
192 | ldo 16(%r2),%r2 | 371 | CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1) |
193 | ldo 16(%r31),%r31 | 372 | ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32) |
194 | bl L$0026,0 | 373 | |
195 | ldo 16(%r26),%r26 | 374 | EXTRD,U tmp_0,31,32,m_0 ; m>>32 |
196 | L$0027 | 375 | DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 |
197 | ldw -84(0,%r30),%r2 | 376 | EXTRD,U tmp_1,31,32,m_1 ; m>>32 |
198 | bv 0(%r2) | 377 | DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32 |
199 | ldwm -64(0,%r30),%r4 | 378 | |
200 | .EXIT | 379 | ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) |
201 | .PROCEND | 380 | ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32) |
202 | .align 4 | 381 | ADD lt_0,m1_0,lt_0 ; lt = lt+m1; |
203 | .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR | 382 | ADD,DC ht_0,%r0,ht_0 ; ht++ |
383 | |||
384 | ADD lt_1,m1_1,lt_1 ; lt = lt+m1; | ||
385 | ADD,DC ht_1,%r0,ht_1 ; ht++ | ||
386 | ADD %ret1,lt_0,lt_0 ; lt = lt + c (ret1); | ||
387 | ADD,DC ht_0,%r0,ht_0 ; ht++ | ||
388 | |||
389 | ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0) | ||
390 | ADD,DC ht_1,%r0,ht_1 ; ht++ | ||
391 | STD lt_0,0(r_ptr) ; rp[0] = lt | ||
392 | STD lt_1,8(r_ptr) ; rp[1] = lt | ||
393 | |||
394 | COPY ht_1,%ret1 ; carry = ht | ||
395 | LDO -2(num),num ; num = num - 2; | ||
396 | LDO 16(a_ptr),a_ptr ; ap += 2 | ||
397 | CMPIB,<= 2,num,bn_mul_words_unroll2 | ||
398 | LDO 16(r_ptr),r_ptr ; rp++ | ||
399 | |||
400 | CMPIB,=,N 0,num,bn_mul_words_exit ; are we done? | ||
401 | |||
402 | ; | ||
403 | ; Top of loop aligned on 64-byte boundary | ||
404 | ; | ||
405 | bn_mul_words_single_top | ||
406 | FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) | ||
407 | |||
408 | XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l | ||
409 | FSTD fm1,-16(%sp) ; -16(sp) = m1 | ||
410 | XMPYU flt_0,fw_h,fm ; m = lt*fw_h | ||
411 | FSTD fm,-8(%sp) ; -8(sp) = m | ||
412 | XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h | ||
413 | FSTD ht_temp,-24(%sp) ; -24(sp) = ht | ||
414 | XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l | ||
415 | FSTD lt_temp,-32(%sp) ; -32(sp) = lt | ||
416 | |||
417 | LDD -8(%sp),m_0 | ||
418 | LDD -16(%sp),m1_0 | ||
419 | ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; | ||
420 | LDD -24(%sp),ht_0 | ||
421 | LDD -32(%sp),lt_0 | ||
422 | |||
423 | CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) | ||
424 | ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) | ||
425 | |||
426 | EXTRD,U tmp_0,31,32,m_0 ; m>>32 | ||
427 | DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 | ||
428 | |||
429 | ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) | ||
430 | ADD lt_0,m1_0,lt_0 ; lt= lt+m1; | ||
431 | ADD,DC ht_0,%r0,ht_0 ; ht++ | ||
432 | |||
433 | ADD %ret1,lt_0,lt_0 ; lt = lt + c; | ||
434 | ADD,DC ht_0,%r0,ht_0 ; ht++ | ||
435 | |||
436 | COPY ht_0,%ret1 ; copy carry | ||
437 | STD lt_0,0(r_ptr) ; rp[0] = lt | ||
438 | |||
439 | bn_mul_words_exit | ||
440 | .EXIT | ||
441 | EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 | ||
442 | LDD -96(%sp),%r7 ; restore r7 | ||
443 | LDD -104(%sp),%r6 ; restore r6 | ||
444 | LDD -112(%sp),%r5 ; restore r5 | ||
445 | LDD -120(%sp),%r4 ; restore r4 | ||
446 | BVE (%rp) | ||
447 | LDD,MB -128(%sp),%r3 ; restore r3 | ||
448 | .PROCEND | ||
449 | |||
450 | ;---------------------------------------------------------------------------- | |
451 | ; | |
452 | ;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num) | |
453 | ; For each of the num 64-bit words ap[i], stores the 128-bit square as two 64-bit words: rp[2i] = low half, rp[2i+1] = high half. | |
454 | ; arg0 = rp | |
455 | ; arg1 = ap | |
456 | ; arg2 = num | |
457 | ; Each word is split into 32-bit halves ht:lt; with m = ht*lt the square is (ht*ht << 64) + (2*m << 32) + lt*lt. | |
458 | |||
204 | bn_sqr_words | 459 | bn_sqr_words |
460 | .proc | |
461 | .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | |
462 | .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | |
463 | .entry | |
464 | .align 64 | |
465 | |||
466 | STD %r3,0(%sp) ; save r3 (used as ht_1) | |
467 | STD %r4,8(%sp) ; save r4 (used as lt_0) | |
468 | NOP | |
469 | STD %r5,16(%sp) ; save r5 (used as lt_1) | |
470 | |||
471 | CMPIB,>= 0,num,bn_sqr_words_exit | |
472 | LDO 128(%sp),%sp ; bump stack by the 128-byte frame; sits in the branch delay slot, and the exit path restores at negative offsets from the bumped sp | |
473 | |||
474 | ; | |
475 | ; If only 1 word, then go straight to the single-word cleanup code | |
476 | ; | |
477 | CMPIB,= 1,num,bn_sqr_words_single_top | |
478 | DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L (top 33 bits set) | |
479 | |||
480 | ; | |
481 | ; This loop is unrolled 2 times (64-byte aligned as well) | |
482 | ; | |
483 | |||
484 | bn_sqr_words_unroll2 | |
485 | FLDD 0(a_ptr),t_float_0 ; a[0] -> fht_0:flt_0 | |
486 | FLDD 8(a_ptr),t_float_1 ; a[1] -> fht_1:flt_1 | |
487 | XMPYU fht_0,flt_0,fm ; m[0] = ht*lt of a[0] | |
488 | XMPYU fht_1,flt_1,fm_1 ; m[1] = ht*lt of a[1] | |
489 | |||
490 | FSTD fm,-24(%sp) ; store m[0] | |
491 | FSTD fm_1,-56(%sp) ; store m[1] | |
492 | XMPYU flt_0,flt_0,lt_temp ; lt[0] = lt*lt of a[0] | |
493 | XMPYU flt_1,flt_1,lt_temp_1 ; lt[1] = lt*lt of a[1] | |
494 | |||
495 | FSTD lt_temp,-16(%sp) ; store lt[0] | |
496 | FSTD lt_temp_1,-48(%sp) ; store lt[1] | |
497 | XMPYU fht_0,fht_0,ht_temp ; ht[0] = ht*ht of a[0] | |
498 | XMPYU fht_1,fht_1,ht_temp_1 ; ht[1] = ht*ht of a[1] | |
499 | |||
500 | FSTD ht_temp,-8(%sp) ; store ht[0] | |
501 | FSTD ht_temp_1,-40(%sp) ; store ht[1] | |
502 | LDD -24(%sp),m_0 | |
503 | LDD -56(%sp),m_1 | |
504 | |||
505 | AND m_0,high_mask,tmp_0 ; tmp_0 = m[0] & Mask (top 33 bits of m[0]) | |
506 | AND m_1,high_mask,tmp_1 ; tmp_1 = m[1] & Mask (top 33 bits of m[1]) | |
507 | DEPD,Z m_0,30,31,m_0 ; m[0] = (low 31 bits of m[0]) << 33, i.e. low word of 2*m[0] << 32 | |
508 | DEPD,Z m_1,30,31,m_1 ; m[1] = (low 31 bits of m[1]) << 33, i.e. low word of 2*m[1] << 32 | |
509 | |||
510 | LDD -16(%sp),lt_0 | |
511 | LDD -48(%sp),lt_1 | |
512 | EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = (m[0] & Mask) >> 31, i.e. high word of 2*m[0] << 32 | |
513 | EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = (m[1] & Mask) >> 31, i.e. high word of 2*m[1] << 32 | |
514 | |||
515 | LDD -8(%sp),ht_0 | |
516 | LDD -40(%sp),ht_1 | |
517 | ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0 | |
518 | ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1 | |
519 | |||
520 | ADD lt_0,m_0,lt_0 ; lt[0] = lt[0] + m[0] (sets carry) | |
521 | ADD,DC ht_0,%r0,ht_0 ; ht[0] += carry out of lt[0] + m[0] | |
522 | STD lt_0,0(r_ptr) ; rp[0] = lt[0] | |
523 | STD ht_0,8(r_ptr) ; rp[1] = ht[0] | |
524 | |||
525 | ADD lt_1,m_1,lt_1 ; lt[1] = lt[1] + m[1] (sets carry) | |
526 | ADD,DC ht_1,%r0,ht_1 ; ht[1] += carry out of lt[1] + m[1] | |
527 | STD lt_1,16(r_ptr) ; rp[2] = lt[1] | |
528 | STD ht_1,24(r_ptr) ; rp[3] = ht[1] | |
529 | |||
530 | LDO -2(num),num ; num = num - 2; | |
531 | LDO 16(a_ptr),a_ptr ; ap += 2 | |
532 | CMPIB,<= 2,num,bn_sqr_words_unroll2 | |
533 | LDO 32(r_ptr),r_ptr ; rp += 4 (two result words per input word) | |
534 | |||
535 | CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done? otherwise one word is left | |
536 | |||
537 | ; | |
538 | ; Top of loop aligned on 64-byte boundary | |
539 | ; | |
540 | bn_sqr_words_single_top | |
541 | FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8) ht(L)/lt(R) | |
542 | |||
543 | XMPYU fht_0,flt_0,fm ; m = ht*lt | |
544 | FSTD fm,-24(%sp) ; store m | |
545 | |||
546 | XMPYU flt_0,flt_0,lt_temp ; lt = lt*lt | |
547 | FSTD lt_temp,-16(%sp) ; store lt | |
548 | |||
549 | XMPYU fht_0,fht_0,ht_temp ; ht = ht*ht | |
550 | FSTD ht_temp,-8(%sp) ; store ht | |
551 | |||
552 | LDD -24(%sp),m_0 ; load m | |
553 | AND m_0,high_mask,tmp_0 ; tmp_0 = m & Mask (top 33 bits of m) | |
554 | DEPD,Z m_0,30,31,m_0 ; m = (low 31 bits of m) << 33 | |
555 | LDD -16(%sp),lt_0 ; lt | |
556 | |||
557 | LDD -8(%sp),ht_0 ; ht | |
558 | EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = (m & Mask) >> 31 | |
559 | ADD m_0,lt_0,lt_0 ; lt = lt+m (sets carry) | |
560 | ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0 (ADD,L is relied on to leave the carry from the ADD above untouched) | |
561 | ADD,DC ht_0,%r0,ht_0 ; ht += carry out of lt+m | |
562 | |||
563 | STD lt_0,0(r_ptr) ; rp[0] = lt | |
564 | STD ht_0,8(r_ptr) ; rp[1] = ht | |
565 | |||
566 | bn_sqr_words_exit | |
567 | .EXIT | |
568 | LDD -112(%sp),%r5 ; restore r5 (saved at 16 from the entry sp) | |
569 | LDD -120(%sp),%r4 ; restore r4 (saved at 8 from the entry sp) | |
570 | BVE (%rp) | |
571 | LDD,MB -128(%sp),%r3 | |
572 | .PROCEND ;in=23,24,25,26,29;out=28; | |
573 | |||
574 | |||
575 | ;---------------------------------------------------------------------------- | |
576 | ; | |
577 | ;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) | |
578 | ; Computes r[i] = a[i] + b[i] + carry over n 64-bit words; the final carry is built in %ret1 and returned split across %ret0/%ret1. | |
579 | ; arg0 = rp | |
580 | ; arg1 = ap | |
581 | ; arg2 = bp | |
582 | ; arg3 = n | |
583 | |||
584 | t .reg %r22 | |
585 | b .reg %r21 | |
586 | l .reg %r20 | |
587 | |||
588 | bn_add_words | |
589 | .proc | |
590 | .entry | |
591 | .callinfo | |
592 | .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | |
593 | .align 64 | |
594 | |||
595 | CMPIB,>= 0,n,bn_add_words_exit | |
596 | COPY %r0,%ret1 ; carry c = 0; this is also the return value when n <= 0 | |
597 | |||
598 | ; | |
599 | ; With exactly 1 word skip the unrolled loop; with 2 or more fall into it | |
600 | ; | |
601 | CMPIB,= 1,n,bn_add_words_single_top | |
602 | NOP | |
603 | |||
604 | ; | |
605 | ; This loop is unrolled 2 times (64-byte aligned as well) | |
606 | ; | |
607 | bn_add_words_unroll2 | |
608 | LDD 0(a_ptr),t | |
609 | LDD 0(b_ptr),b | |
610 | ADD t,%ret1,t ; t = a[0] + c | |
611 | ADD,DC %r0,%r0,%ret1 ; c = carry out of a[0] + c | |
612 | ADD t,b,l ; l = t + b[0] | |
613 | ADD,DC %ret1,%r0,%ret1 ; c += carry out of t + b[0] | |
614 | STD l,0(r_ptr) | |
615 | |||
616 | LDD 8(a_ptr),t | |
617 | LDD 8(b_ptr),b | |
618 | ADD t,%ret1,t ; t = a[1] + c | |
619 | ADD,DC %r0,%r0,%ret1 ; c = carry out of a[1] + c | |
620 | ADD t,b,l ; l = t + b[1] | |
621 | ADD,DC %ret1,%r0,%ret1 ; c += carry out of t + b[1] | |
622 | STD l,8(r_ptr) | |
623 | |||
624 | LDO -2(n),n | |
625 | LDO 16(a_ptr),a_ptr | |
626 | LDO 16(b_ptr),b_ptr | |
627 | |||
628 | CMPIB,<= 2,n,bn_add_words_unroll2 | |
629 | LDO 16(r_ptr),r_ptr | |
630 | |||
631 | CMPIB,=,N 0,n,bn_add_words_exit ; are we done? otherwise one word is left | |
632 | |||
633 | bn_add_words_single_top | |
634 | LDD 0(a_ptr),t | |
635 | LDD 0(b_ptr),b | |
636 | |||
637 | ADD t,%ret1,t ; t = a[0] + c | |
638 | ADD,DC %r0,%r0,%ret1 ; c = carry out of a[0] + c (could use CMPCLR??) | |
639 | ADD t,b,l ; l = t + b[0] | |
640 | ADD,DC %ret1,%r0,%ret1 ; c += carry out of t + b[0] | |
641 | STD l,0(r_ptr) | |
642 | |||
643 | bn_add_words_exit | |
644 | .EXIT | |
645 | BVE (%rp) | |
646 | EXTRD,U %ret1,31,32,%ret0 ; in the return's delay slot: ret0 = upper 32 bits of ret1, so for 32-bit the result is returned in ret0/ret1 | |
647 | .PROCEND ;in=23,24,25,26,29;out=28; | |
648 | |||
649 | ;---------------------------------------------------------------------------- | |
650 | ; Computes r[i] = a[i] - b[i] - borrow over n 64-bit words; the final borrow is kept in %ret1 and returned split across %ret0/%ret1. | |
651 | ;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) | |
652 | ; Borrow rule per word: if a[i] != b[i] the borrow becomes 1 when a[i] < b[i] and 0 otherwise; if a[i] == b[i] the incoming borrow is kept. | |
653 | ; arg0 = rp | |
654 | ; arg1 = ap | |
655 | ; arg2 = bp | |
656 | ; arg3 = n | |
657 | |||
658 | t1 .reg %r22 | |
659 | t2 .reg %r21 | |
660 | sub_tmp1 .reg %r20 | |
661 | sub_tmp2 .reg %r19 | |
662 | |||
663 | |||
664 | bn_sub_words | |
665 | .proc | |
666 | .callinfo | |
667 | .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | |
668 | .entry | |
669 | .align 64 | |
670 | |||
671 | CMPIB,>= 0,n,bn_sub_words_exit | |
672 | COPY %r0,%ret1 ; borrow c = 0; this is also the return value when n <= 0 | |
673 | |||
674 | ; | |
675 | ; With exactly 1 word skip the unrolled loop; with 2 or more fall into it | |
676 | ; | |
677 | CMPIB,= 1,n,bn_sub_words_single_top | |
678 | NOP | |
679 | |||
680 | ; | |
681 | ; This loop is unrolled 2 times (64-byte aligned as well) | |
682 | ; | |
683 | bn_sub_words_unroll2 | |
684 | LDD 0(a_ptr),t1 | |
685 | LDD 0(b_ptr),t2 | |
686 | SUB t1,t2,sub_tmp1 ; t3 = a[0] - b[0] | |
687 | SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3 - c (incoming borrow) | |
688 | |||
689 | CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0 and skip the next LDO if t1 > t2; otherwise the LDO sets it to 1 | |
690 | LDO 1(%r0),sub_tmp2 | |
691 | |||
692 | CMPCLR,*= t1,t2,%r0 | |
693 | COPY sub_tmp2,%ret1 | |
694 | STD sub_tmp1,0(r_ptr) | |
695 | |||
696 | LDD 8(a_ptr),t1 | |
697 | LDD 8(b_ptr),t2 | |
698 | SUB t1,t2,sub_tmp1 ; t3 = a[1] - b[1] | |
699 | SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3 - c (borrow from word 0) | |
700 | CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0 and skip the next LDO if t1 > t2; otherwise the LDO sets it to 1 | |
701 | LDO 1(%r0),sub_tmp2 | |
702 | |||
703 | CMPCLR,*= t1,t2,%r0 | |
704 | COPY sub_tmp2,%ret1 | |
705 | STD sub_tmp1,8(r_ptr) | |
706 | |||
707 | LDO -2(n),n | |
708 | LDO 16(a_ptr),a_ptr | |
709 | LDO 16(b_ptr),b_ptr | |
710 | |||
711 | CMPIB,<= 2,n,bn_sub_words_unroll2 | |
712 | LDO 16(r_ptr),r_ptr | |
713 | |||
714 | CMPIB,=,N 0,n,bn_sub_words_exit ; are we done? otherwise one word is left | |
715 | |||
716 | bn_sub_words_single_top | |
717 | LDD 0(a_ptr),t1 | |
718 | LDD 0(b_ptr),t2 | |
719 | SUB t1,t2,sub_tmp1 ; t3 = a[0] - b[0] | |
720 | SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3 - c (incoming borrow) | |
721 | CMPCLR,*>> t1,t2,sub_tmp2 ; sub_tmp2 = 0 and skip the next LDO if t1 > t2; otherwise the LDO sets it to 1 | |
722 | LDO 1(%r0),sub_tmp2 | |
723 | |||
724 | CMPCLR,*= t1,t2,%r0 | |
725 | COPY sub_tmp2,%ret1 | |
726 | |||
727 | STD sub_tmp1,0(r_ptr) | |
728 | |||
729 | bn_sub_words_exit | |
730 | .EXIT | |
731 | BVE (%rp) | |
732 | EXTRD,U %ret1,31,32,%ret0 ; in the return's delay slot: ret0 = upper 32 bits of ret1, so for 32-bit the result is returned in ret0/ret1 | |
733 | .PROCEND ;in=23,24,25,26,29;out=28; | |
734 | |||
735 | ;------------------------------------------------------------------------------ | ||
736 | ; | ||
737 | ; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d) | ||
738 | ; | ||
739 | ; arg0 = h | ||
740 | ; arg1 = l | ||
741 | ; arg2 = d | ||
742 | ; | ||
743 | ; This is mainly just output from the HP C compiler. | ||
744 | ; | ||
745 | ;------------------------------------------------------------------------------ | ||
746 | bn_div_words | ||
205 | .PROC | 747 | .PROC |
206 | .CALLINFO FRAME=0,NO_CALLS | 748 | .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN |
207 | .ENTRY | 749 | .IMPORT BN_num_bits_word,CODE |
208 | ldo 28(%r26),%r19 | 750 | .IMPORT __iob,DATA |
209 | ldo 12(%r25),%r28 | 751 | .IMPORT fprintf,CODE |
210 | L$0042 | 752 | .IMPORT abort,CODE |
211 | fldws 0(0,%r25),%fr8L | 753 | .IMPORT $$div2U,MILLICODE |
212 | fldws 0(0,%r25),%fr8R | 754 | .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE |
213 | xmpyu %fr8L,%fr8R,%fr8 | 755 | .ENTRY |
214 | fstds %fr8,-16(0,%r30) | 756 | STW %r2,-20(%r30) ;offset 0x8ec - save the return pointer |
215 | ldw -16(0,%r30),%r22 | 757 | STW,MA %r3,192(%r30) ;offset 0x8f0 - save r3, then advance sp by 192 |
216 | ldw -12(0,%r30),%r23 | 758 | STW %r4,-188(%r30) ;offset 0x8f4 - save r4 |
217 | stw %r23,0(0,%r26) | 759 | DEPD %r5,31,32,%r6 ;offset 0x8f8 - pack r5 into the upper word of r6 |
218 | copy %r22,%r21 | 760 | STD %r6,-184(%r30) ;offset 0x8fc - one doubleword store saves r5 and r6 |
219 | ldi 0,%r20 | 761 | DEPD %r7,31,32,%r8 ;offset 0x900 - pack r7 into the upper word of r8 |
220 | addib,= -1,%r24,L$0049 | 762 | STD %r8,-176(%r30) ;offset 0x904 - one doubleword store saves r7 and r8 |
221 | stw %r21,-24(0,%r19) | 763 | STW %r9,-168(%r30) ;offset 0x908 - save r9 |
222 | fldws -8(0,%r28),%fr8L | 764 | LDD -248(%r30),%r3 ;offset 0x90c - r3 = doubleword from the caller's frame (presumably d; confirm) |
223 | fldws -8(0,%r28),%fr8R | 765 | COPY %r26,%r4 ;offset 0x910 - r4 = r26 |
224 | xmpyu %fr8L,%fr8R,%fr8 | 766 | COPY %r24,%r5 ;offset 0x914 - r5 = r24 |
225 | fstds %fr8,-16(0,%r30) | 767 | DEPD %r25,31,32,%r4 ;offset 0x918 - r4 = r25:r26 (presumably the 64-bit h) |
226 | ldw -16(0,%r30),%r22 | 768 | CMPB,*<> %r3,%r0,$0006000C ;offset 0x91c - r3 != 0: go to the main path |
227 | ldw -12(0,%r30),%r23 | 769 | DEPD %r23,31,32,%r5 ;offset 0x920 - (delay slot) r5 = r23:r24 (presumably the 64-bit l) |
228 | stw %r23,-20(0,%r19) | 770 | MOVIB,TR -1,%r29,$00060002 ;offset 0x924 - r3 == 0: result = -1, branch to the exit |
229 | copy %r22,%r21 | 771 | EXTRD,U %r29,31,32,%r28 ;offset 0x928 - (delay slot) r28 = upper word of r29 |
230 | ldi 0,%r20 | 772 | $0006002A |
231 | addib,= -1,%r24,L$0049 | 773 | LDO -1(%r29),%r29 ;offset 0x92c - r29 -= 1 |
232 | stw %r21,-16(0,%r19) | 774 | SUB %r23,%r7,%r23 ;offset 0x930 - r23 -= r7 |
233 | fldws -4(0,%r28),%fr8L | 775 | $00060024 |
234 | fldws -4(0,%r28),%fr8R | 776 | SUB %r4,%r31,%r25 ;offset 0x934 |
235 | xmpyu %fr8L,%fr8R,%fr8 | 777 | AND %r25,%r19,%r26 ;offset 0x938 - keep only the upper word (r19 = 0xffffffff00000000) |
236 | fstds %fr8,-16(0,%r30) | 778 | CMPB,*<>,N %r0,%r26,$00060046 ;offset 0x93c - upper word non-zero: leave the correction loop |
237 | ldw -16(0,%r30),%r22 | 779 | DEPD,Z %r25,31,32,%r20 ;offset 0x940 |
238 | ldw -12(0,%r30),%r23 | 780 | OR %r20,%r24,%r21 ;offset 0x944 |
239 | stw %r23,-12(0,%r19) | 781 | CMPB,*<<,N %r21,%r23,$0006002A ;offset 0x948 - r21 < r23 (unsigned): loop back and correct again |
240 | copy %r22,%r21 | 782 | SUB %r31,%r2,%r31 ;offset 0x94c - (delay slot of the backward branch) r31 -= r2 |
241 | ldi 0,%r20 | 783 | $00060046 |
242 | addib,= -1,%r24,L$0049 | 784 | $0006002E |
243 | stw %r21,-8(0,%r19) | 785 | DEPD,Z %r23,31,32,%r25 ;offset 0x950 |
244 | fldws 0(0,%r28),%fr8L | 786 | EXTRD,U %r23,31,32,%r26 ;offset 0x954 |
245 | fldws 0(0,%r28),%fr8R | 787 | AND %r25,%r19,%r24 ;offset 0x958 |
246 | xmpyu %fr8L,%fr8R,%fr8 | 788 | ADD,L %r31,%r26,%r31 ;offset 0x95c |
247 | fstds %fr8,-16(0,%r30) | 789 | CMPCLR,*>>= %r5,%r24,%r0 ;offset 0x960 - skip the increment unless r5 < r24 |
248 | ldw -16(0,%r30),%r22 | 790 | LDO 1(%r31),%r31 ;offset 0x964 - r31 += 1 |
249 | ldw -12(0,%r30),%r23 | 791 | $00060032 |
250 | stw %r23,-4(0,%r19) | 792 | CMPB,*<<=,N %r31,%r4,$00060036 ;offset 0x968 |
251 | copy %r22,%r21 | 793 | LDO -1(%r29),%r29 ;offset 0x96c |
252 | ldi 0,%r20 | 794 | ADD,L %r4,%r3,%r4 ;offset 0x970 |
253 | addib,= -1,%r24,L$0049 | 795 | $00060036 |
254 | stw %r21,0(0,%r19) | 796 | ADDIB,=,N -1,%r8,$D0 ;offset 0x974 - r8 -= 1; leave the loop when it reaches 0 |
255 | ldo 16(%r28),%r28 | 797 | SUB %r5,%r24,%r28 ;offset 0x978 |
256 | ldo 16(%r25),%r25 | 798 | $0006003A |
257 | ldo 32(%r19),%r19 | 799 | SUB %r4,%r31,%r24 ;offset 0x97c |
258 | bl L$0042,0 | 800 | SHRPD %r24,%r28,32,%r4 ;offset 0x980 |
259 | ldo 32(%r26),%r26 | 801 | DEPD,Z %r29,31,32,%r9 ;offset 0x984 - r9 = r29 << 32 (ORed into the result at $D0) |
260 | L$0049 | 802 | DEPD,Z %r28,31,32,%r5 ;offset 0x988 |
261 | bv,n 0(%r2) | 803 | $0006001C |
262 | .EXIT | 804 | EXTRD,U %r4,31,32,%r31 ;offset 0x98c - r31 = upper word of r4 |
263 | .PROCEND | 805 | CMPB,*<>,N %r31,%r2,$00060020 ;offset 0x990 - differs from r2: go divide via $$div2U |
264 | .IMPORT BN_num_bits_word,CODE | 806 | MOVB,TR %r6,%r29,$D1 ;offset 0x994 - equal: r29 = r6 (0x00000000ffffffff), skip the divide |
265 | .IMPORT fprintf,CODE | 807 | STD %r29,-152(%r30) ;offset 0x998 - (delay slot) spill r29 for the FP loads at $D1 |
266 | .IMPORT __iob,DATA | 808 | $0006000C |
267 | .SPACE $TEXT$ | 809 | EXTRD,U %r3,31,32,%r25 ;offset 0x99c - r25 = upper word of r3 |
268 | .SUBSPA $LIT$ | 810 | COPY %r3,%r26 ;offset 0x9a0 - r26 = r3 |
269 | 811 | EXTRD,U %r3,31,32,%r9 ;offset 0x9a4 - stash the upper word of r3 in r9 across the call | |
270 | .align 4 | 812 | EXTRD,U %r4,31,32,%r8 ;offset 0x9a8 - stash the upper word of r4 in r8 across the call |
271 | L$C0000 | 813 | .CALL ARGW0=GR,ARGW1=GR,RTNVAL=GR ;in=25,26;out=28; |
272 | .STRING "Division would overflow (%d)\x0a\x00" | 814 | B,L BN_num_bits_word,%r2 ;offset 0x9ac - call BN_num_bits_word with r25:r26; result comes back in r28 |
273 | .IMPORT abort,CODE | 815 | EXTRD,U %r5,31,32,%r7 ;offset 0x9b0 - (delay slot) stash the upper word of r5 in r7 |
274 | .SPACE $TEXT$ | 816 | LDI 64,%r20 ;offset 0x9b4 |
275 | .SUBSPA $CODE$ | 817 | DEPD %r7,31,32,%r5 ;offset 0x9b8 - put the stashed upper word back into r5 |
276 | 818 | DEPD %r8,31,32,%r4 ;offset 0x9bc - put the stashed upper word back into r4 | |
277 | .align 4 | 819 | DEPD %r9,31,32,%r3 ;offset 0x9c0 - put the stashed upper word back into r3 |
278 | .EXPORT bn_div64,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR | 820 | CMPB,= %r28,%r20,$00060012 ;offset 0x9c4 - bit count == 64: skip the overflow check |
279 | bn_div64 | 821 | COPY %r28,%r24 ;offset 0x9c8 - (delay slot) r24 = bit count |
822 | MTSARCM %r24 ;offset 0x9cc | ||
823 | DEPDI,Z -1,%sar,1,%r19 ;offset 0x9d0 - r19 = a single set bit placed via %sar (presumably 1 << bit count; confirm) | ||
824 | CMPB,*>>,N %r4,%r19,$D2 ;offset 0x9d4 - r4 > r19 (unsigned): take the error path at $D2 | ||
825 | $00060012 | ||
826 | SUBI 64,%r24,%r31 ;offset 0x9d8 - r31 = 64 - bit count | ||
827 | CMPCLR,*<< %r4,%r3,%r0 ;offset 0x9dc - skip the subtract when r4 < r3 | ||
828 | SUB %r4,%r3,%r4 ;offset 0x9e0 - r4 -= r3 | ||
829 | $00060016 | ||
830 | CMPB,= %r31,%r0,$0006001A ;offset 0x9e4 - r31 == 0: skip the shifts below | ||
831 | COPY %r0,%r9 ;offset 0x9e8 - (delay slot) r9 = 0 | ||
832 | MTSARCM %r31 ;offset 0x9ec | ||
833 | DEPD,Z %r3,%sar,64,%r3 ;offset 0x9f0 - shift r3 left (presumably by r31 bits) | ||
834 | SUBI 64,%r31,%r26 ;offset 0x9f4 | ||
835 | MTSAR %r26 ;offset 0x9f8 | ||
836 | SHRPD %r4,%r5,%sar,%r4 ;offset 0x9fc - r4 = window of the pair r4:r5 selected by %sar | ||
837 | MTSARCM %r31 ;offset 0xa00 | ||
838 | DEPD,Z %r5,%sar,64,%r5 ;offset 0xa04 - shift r5 left (presumably by r31 bits) | ||
839 | $0006001A | ||
840 | DEPDI,Z -1,31,32,%r19 ;offset 0xa08 - r19 = 0xffffffff00000000 | ||
841 | AND %r3,%r19,%r29 ;offset 0xa0c | ||
842 | EXTRD,U %r29,31,32,%r2 ;offset 0xa10 - r2 = upper word of r3; rp is reloaded from the stack before returning | ||
843 | DEPDI,Z -1,63,32,%r6 ;offset 0xa14 - r6 = 0x00000000ffffffff | ||
844 | MOVIB,TR 2,%r8,$0006001C ;offset 0xa18 - r8 = 2 (loop counter), enter the loop | ||
845 | EXTRD,U %r3,63,32,%r7 ;offset 0xa1c - (delay slot) r7 = lower word of r3 | ||
846 | $D2 | ||
847 | ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20 - error path: start forming the address of __iob+32 | ||
848 | LDIL LR'C$7,%r21 ;offset 0xa24 | ||
849 | LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28 - r26 = &__iob + 32 (presumably stderr; confirm) | ||
850 | .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28; | ||
851 | B,L fprintf,%r2 ;offset 0xa2c - fprintf(r26, C$7, r24) | ||
852 | LDO RR'C$7(%r21),%r25 ;offset 0xa30 - (delay slot) r25 = address of the C$7 format string | ||
853 | .CALL ; | ||
854 | B,L abort,%r2 ;offset 0xa34 - abort() | ||
855 | NOP ;offset 0xa38 | ||
856 | B $D3 ;offset 0xa3c - reached only if abort returns: go to the epilogue | ||
857 | LDW -212(%r30),%r2 ;offset 0xa40 - (delay slot) reload the return pointer | ||
858 | $00060020 | ||
859 | COPY %r4,%r26 ;offset 0xa44 | ||
860 | EXTRD,U %r4,31,32,%r25 ;offset 0xa48 - r25:r26 = r4 | ||
861 | COPY %r2,%r24 ;offset 0xa4c | ||
862 | .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL) | ||
863 | B,L $$div2U,%r31 ;offset 0xa50 - millicode divide; inputs r23..r26, outputs listed in the .CALL above | ||
864 | EXTRD,U %r2,31,32,%r23 ;offset 0xa54 - (delay slot) r23:r24 = r2 | ||
865 | DEPD %r28,31,32,%r29 ;offset 0xa58 - r29 = r28:r29 | ||
866 | $00060022 | ||
867 | STD %r29,-152(%r30) ;offset 0xa5c - spill r29 so it can be loaded into FP registers | ||
868 | $D1 | ||
869 | AND %r5,%r19,%r24 ;offset 0xa60 | ||
870 | EXTRD,U %r24,31,32,%r24 ;offset 0xa64 - r24 = upper word of r5 | ||
871 | STW %r2,-160(%r30) ;offset 0xa68 - spill r2 for XMPYU | ||
872 | STW %r7,-128(%r30) ;offset 0xa6c - spill r7 for XMPYU | ||
873 | FLDD -152(%r30),%fr4 ;offset 0xa70 - fr4 = spilled r29 | ||
874 | FLDD -152(%r30),%fr7 ;offset 0xa74 - fr7 = spilled r29 | ||
875 | FLDW -160(%r30),%fr8L ;offset 0xa78 - fr8L = spilled r2 | ||
876 | FLDW -128(%r30),%fr5L ;offset 0xa7c - fr5L = spilled r7 | ||
877 | XMPYU %fr8L,%fr7L,%fr10 ;offset 0xa80 - r2 * upper word of r29 | ||
878 | FSTD %fr10,-136(%r30) ;offset 0xa84 | ||
879 | XMPYU %fr8L,%fr7R,%fr22 ;offset 0xa88 - r2 * lower word of r29 | ||
880 | FSTD %fr22,-144(%r30) ;offset 0xa8c | ||
881 | XMPYU %fr5L,%fr4L,%fr11 ;offset 0xa90 - r7 * upper word of r29 | ||
882 | XMPYU %fr5L,%fr4R,%fr23 ;offset 0xa94 - r7 * lower word of r29 | ||
883 | FSTD %fr11,-112(%r30) ;offset 0xa98 | ||
884 | FSTD %fr23,-120(%r30) ;offset 0xa9c | ||
885 | LDD -136(%r30),%r28 ;offset 0xaa0 | ||
886 | DEPD,Z %r28,31,32,%r31 ;offset 0xaa4 | ||
887 | LDD -144(%r30),%r20 ;offset 0xaa8 | ||
888 | ADD,L %r20,%r31,%r31 ;offset 0xaac - r31 = combined r2 * r29 product | ||
889 | LDD -112(%r30),%r22 ;offset 0xab0 | ||
890 | DEPD,Z %r22,31,32,%r22 ;offset 0xab4 | ||
891 | LDD -120(%r30),%r21 ;offset 0xab8 | ||
892 | B $00060024 ;offset 0xabc - enter the correction loop | ||
893 | ADD,L %r21,%r22,%r23 ;offset 0xac0 - (delay slot) r23 = combined r7 * r29 product | ||
894 | $D0 | ||
895 | OR %r9,%r29,%r29 ;offset 0xac4 - merge r9 into the result | ||
896 | $00060040 | ||
897 | EXTRD,U %r29,31,32,%r28 ;offset 0xac8 - r28 = upper word of the result | ||
898 | $00060002 | ||
899 | $L2 | ||
900 | LDW -212(%r30),%r2 ;offset 0xacc - reload the return pointer saved at entry | ||
901 | $D3 | ||
902 | LDW -168(%r30),%r9 ;offset 0xad0 - restore r9 | ||
903 | LDD -176(%r30),%r8 ;offset 0xad4 - reload the packed r7:r8 | ||
904 | EXTRD,U %r8,31,32,%r7 ;offset 0xad8 - unpack r7 from the upper word | ||
905 | LDD -184(%r30),%r6 ;offset 0xadc - reload the packed r5:r6 | ||
906 | EXTRD,U %r6,31,32,%r5 ;offset 0xae0 - unpack r5 from the upper word | ||
907 | LDW -188(%r30),%r4 ;offset 0xae4 - restore r4 | ||
908 | BVE (%r2) ;offset 0xae8 - return | ||
909 | .EXIT | ||
910 | LDW,MB -192(%r30),%r3 ;offset 0xaec - (delay slot) pop the 192-byte frame and restore r3 | ||
911 | .PROCEND ;in=23,25;out=28,29;fpin=105,107; | ||
912 | |||
913 | |||
914 | |||
915 | |||
916 | ;---------------------------------------------------------------------------- | ||
917 | ; | ||
918 | ; Registers to hold 64-bit values to manipulate. The "L" part | ||
919 | ; of the register corresponds to the upper 32-bits, while the "R" | ||
920 | ; part corresponds to the lower 32-bits | ||
921 | ; | ||
922 | ; Note, that when using b6 and b7, the code must save these before | ||
923 | ; using them because they are callee save registers | ||
924 | ; | ||
925 | ; | ||
926 | ; Floating point registers to use to save values that | ||
927 | ; are manipulated. These don't collide with ftemp1-6 and | ||
928 | ; are all caller save registers | ||
929 | ; | ||
930 | a0 .reg %fr22 | ||
931 | a0L .reg %fr22L | ||
932 | a0R .reg %fr22R | ||
933 | |||
934 | a1 .reg %fr23 | ||
935 | a1L .reg %fr23L | ||
936 | a1R .reg %fr23R | ||
937 | |||
938 | a2 .reg %fr24 | ||
939 | a2L .reg %fr24L | ||
940 | a2R .reg %fr24R | ||
941 | |||
942 | a3 .reg %fr25 | ||
943 | a3L .reg %fr25L | ||
944 | a3R .reg %fr25R | ||
945 | |||
946 | a4 .reg %fr26 | ||
947 | a4L .reg %fr26L | ||
948 | a4R .reg %fr26R | ||
949 | |||
950 | a5 .reg %fr27 | ||
951 | a5L .reg %fr27L | ||
952 | a5R .reg %fr27R | ||
953 | |||
954 | a6 .reg %fr28 | ||
955 | a6L .reg %fr28L | ||
956 | a6R .reg %fr28R | ||
957 | |||
958 | a7 .reg %fr29 | ||
959 | a7L .reg %fr29L | ||
960 | a7R .reg %fr29R | ||
961 | |||
962 | b0 .reg %fr30 | ||
963 | b0L .reg %fr30L | ||
964 | b0R .reg %fr30R | ||
965 | |||
966 | b1 .reg %fr31 | ||
967 | b1L .reg %fr31L | ||
968 | b1R .reg %fr31R | ||
969 | |||
970 | ; | ||
971 | ; Temporary floating point variables, these are all caller save | ||
972 | ; registers | ||
973 | ; | ||
974 | ftemp1 .reg %fr4 | ||
975 | ftemp2 .reg %fr5 | ||
976 | ftemp3 .reg %fr6 | ||
977 | ftemp4 .reg %fr7 | ||
978 | |||
979 | ; | ||
980 | ; The B set of registers when used. | ||
981 | ; | ||
982 | |||
983 | b2 .reg %fr8 | ||
984 | b2L .reg %fr8L | ||
985 | b2R .reg %fr8R | ||
986 | |||
987 | b3 .reg %fr9 | ||
988 | b3L .reg %fr9L | ||
989 | b3R .reg %fr9R | ||
990 | |||
991 | b4 .reg %fr10 | ||
992 | b4L .reg %fr10L | ||
993 | b4R .reg %fr10R | ||
994 | |||
995 | b5 .reg %fr11 | ||
996 | b5L .reg %fr11L | ||
997 | b5R .reg %fr11R | ||
998 | |||
999 | b6 .reg %fr12 | ||
1000 | b6L .reg %fr12L | ||
1001 | b6R .reg %fr12R | ||
1002 | |||
1003 | b7 .reg %fr13 | ||
1004 | b7L .reg %fr13L | ||
1005 | b7R .reg %fr13R | ||
1006 | |||
1007 | c1 .reg %r21 ; only reg | ||
1008 | temp1 .reg %r20 ; only reg | ||
1009 | temp2 .reg %r19 ; only reg | ||
1010 | temp3 .reg %r31 ; only reg | ||
1011 | |||
1012 | m1 .reg %r28 | ||
1013 | c2 .reg %r23 | ||
1014 | high_one .reg %r1 | ||
1015 | ht .reg %r6 | ||
1016 | lt .reg %r5 | ||
1017 | m .reg %r4 | ||
1018 | c3 .reg %r3 | ||
1019 | |||
; SQR_ADD_C: square one 64-bit word, given as its upper (A0L) and lower (A0R)
; 32-bit halves, and add the result into the accumulator C1 (low), C2, C3.
; The cross product m is doubled and shifted left by 32 while being split
; into temp3 (low doubleword part) and temp1 (high doubleword part).
; Uses ftemp1-3, the scratch slots -24..-8(%sp), m, lt, ht and temp1-3;
; high_mask must already hold 0xffffffff80000000.
1020 | SQR_ADD_C .macro A0L,A0R,C1,C2,C3 | ||
1021 | XMPYU A0L,A0R,ftemp1 ; m = upper * lower (cross product) | ||
1022 | FSTD ftemp1,-24(%sp) ; store m | ||
1023 | |||
1024 | XMPYU A0R,A0R,ftemp2 ; lt = lower * lower | ||
1025 | FSTD ftemp2,-16(%sp) ; store lt | ||
1026 | |||
1027 | XMPYU A0L,A0L,ftemp3 ; ht = upper * upper | ||
1028 | FSTD ftemp3,-8(%sp) ; store ht | ||
1029 | |||
1030 | LDD -24(%sp),m ; load m | ||
1031 | AND m,high_mask,temp2 ; temp2 = upper 33 bits of m | ||
1032 | DEPD,Z m,30,31,temp3 ; temp3 = low 31 bits of m, shifted left by 33 | ||
1033 | LDD -16(%sp),lt ; load lt | ||
1034 | |||
1035 | LDD -8(%sp),ht ; load ht | ||
1036 | EXTRD,U temp2,32,33,temp1 ; temp1 = upper 33 bits of m, shifted right by 31 | ||
1037 | ADD temp3,lt,lt ; lt += temp3 (low part of 2*m << 32) | ||
1038 | ADD,L ht,temp1,ht ; ht += temp1 (high part of 2*m << 32) | ||
1039 | ADD,DC ht,%r0,ht ; ht += carry from the lt addition | ||
1040 | |||
1041 | ADD C1,lt,C1 ; c1=c1+lt | ||
1042 | ADD,DC ht,%r0,ht ; ht += carry from the c1 addition | ||
1043 | |||
1044 | ADD C2,ht,C2 ; c2=c2+ht | ||
1045 | ADD,DC C3,%r0,C3 ; c3 += carry from the c2 addition | ||
1046 | .endm | ||
1047 | |||
; SQR_ADD_C2: form the 128-bit product of two 64-bit words (A0 and A1, each
; given as upper L and lower R halves), double it, and add the result into
; the accumulator C1 (low), C2, C3.
; Uses ftemp1-4, the scratch slots -32..-8(%sp), m, m1, lt, ht, temp1, temp3;
; high_one must already hold 1 << 32.
1048 | SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3 | ||
1049 | XMPYU A0L,A1R,ftemp1 ; m1 = upper(A0) * lower(A1) | ||
1050 | FSTD ftemp1,-16(%sp) ; | ||
1051 | XMPYU A0R,A1L,ftemp2 ; m = lower(A0) * upper(A1) | ||
1052 | FSTD ftemp2,-8(%sp) ; | ||
1053 | XMPYU A0R,A1R,ftemp3 ; lt = lower(A0) * lower(A1) | ||
1054 | FSTD ftemp3,-32(%sp) | ||
1055 | XMPYU A0L,A1L,ftemp4 ; ht = upper(A0) * upper(A1) | ||
1056 | FSTD ftemp4,-24(%sp) ; | ||
1057 | |||
1058 | LDD -8(%sp),m ; load m | ||
1059 | LDD -16(%sp),m1 ; load m1 | ||
1060 | ADD,L m,m1,m ; m+m1 | ||
1061 | |||
1062 | DEPD,Z m,31,32,temp3 ; temp3 = (m+m1) << 32 | ||
1063 | LDD -24(%sp),ht ; load ht | ||
1064 | |||
1065 | CMPCLR,*>>= m,m1,%r0 ; if (m < m1), i.e. m+m1 wrapped around | ||
1066 | ADD,L ht,high_one,ht ; ht+=high_one | ||
1067 | |||
1068 | EXTRD,U m,31,32,temp1 ; m >> 32 | ||
1069 | LDD -32(%sp),lt ; load lt | ||
1070 | ADD,L ht,temp1,ht ; ht+= m>>32 | ||
1071 | ADD lt,temp3,lt ; lt += (m+m1) << 32 | ||
1072 | ADD,DC ht,%r0,ht ; ht += carry from the lt addition | ||
1073 | |||
1074 | ADD ht,ht,ht ; ht=ht+ht; | ||
1075 | ADD,DC C3,%r0,C3 ; add in carry (c3++) | ||
1076 | |||
1077 | ADD lt,lt,lt ; lt=lt+lt; | ||
1078 | ADD,DC ht,%r0,ht ; add in carry (ht++) | ||
1079 | |||
1080 | ADD C1,lt,C1 ; c1=c1+lt | ||
1081 | ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++); next instruction is nullified unless this overflows | ||
1082 | LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise | ||
1083 | |||
1084 | ADD C2,ht,C2 ; c2 = c2 + ht | ||
1085 | ADD,DC C3,%r0,C3 ; add in carry (c3++) | ||
1086 | .endm | ||
1087 | |||
1088 | ; Comba squaring of an 8-word number: stores the 16 result words r[0]..r[15] | ||
1089 | ;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) | ||
1090 | ; arg0 = r_ptr (16 doublewords are stored, offsets 0..120) | ||
1091 | ; arg1 = a_ptr (8 doublewords are loaded, offsets 0..56) | ||
1092 | ; Relies on the r_ptr, a_ptr and high_mask aliases, which are defined elsewhere in the file | ||
1093 | |||
1094 | bn_sqr_comba8 | ||
280 | .PROC | 1095 | .PROC |
281 | .CALLINFO FRAME=128,CALLS,SAVE_RP,ENTRY_GR=8 | 1096 | .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE |
282 | .ENTRY | 1097 | .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN |
283 | stw %r2,-20(0,%r30) | 1098 | .ENTRY |
284 | stwm %r8,128(0,%r30) | 1099 | .align 64 |
285 | stw %r7,-124(0,%r30) | 1100 | |
286 | stw %r4,-112(0,%r30) | 1101 | STD %r3,0(%sp) ; save r3 (aliased as c3) |
287 | stw %r3,-108(0,%r30) | 1102 | STD %r4,8(%sp) ; save r4 (aliased as m) |
288 | copy %r26,%r3 | 1103 | STD %r5,16(%sp) ; save r5 (aliased as lt) |
289 | copy %r25,%r4 | 1104 | STD %r6,24(%sp) ; save r6 (aliased as ht) |
290 | stw %r6,-120(0,%r30) | 1105 | |
291 | ldi 0,%r7 | 1106 | ; |
292 | stw %r5,-116(0,%r30) | 1107 | ; Zero out carries; c1,c2,c3 rotate roles from one result word to the next |
293 | movb,<> %r24,%r5,L$0051 | 1108 | ; |
294 | ldi 2,%r6 | 1109 | COPY %r0,c1 |
295 | bl L$0068,0 | 1110 | COPY %r0,c2 |
296 | ldi -1,%r28 | 1111 | COPY %r0,c3 |
297 | L$0051 | 1112 | |
298 | .CALL ARGW0=GR | 1113 | LDO 128(%sp),%sp ; bump stack by 128; the macros use scratch slots at -32..-8(%sp) |
299 | bl BN_num_bits_word,%r2 | 1114 | DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L |
300 | copy %r5,%r26 | 1115 | DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 |
301 | copy %r28,%r24 | 1116 | |
302 | ldi 32,%r19 | 1117 | ; |
303 | comb,= %r19,%r24,L$0052 | 1118 | ; Load a[0]..a[7] into FP registers; the L/R halves feed XMPYU in the macros |
304 | subi 31,%r24,%r19 | 1119 | ; |
305 | mtsar %r19 | 1120 | FLDD 0(a_ptr),a0 |
306 | zvdepi 1,32,%r19 | 1121 | FLDD 8(a_ptr),a1 |
307 | comb,>>= %r19,%r3,L$0052 | 1122 | FLDD 16(a_ptr),a2 |
308 | addil LR'__iob-$global$+32,%r27 | 1123 | FLDD 24(a_ptr),a3 |
309 | ldo RR'__iob-$global$+32(%r1),%r26 | 1124 | FLDD 32(a_ptr),a4 |
310 | ldil LR'L$C0000,%r25 | 1125 | FLDD 40(a_ptr),a5 |
311 | .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR | 1126 | FLDD 48(a_ptr),a6 |
312 | bl fprintf,%r2 | 1127 | FLDD 56(a_ptr),a7 |
313 | ldo RR'L$C0000(%r25),%r25 | 1128 | |
314 | .CALL | 1129 | SQR_ADD_C a0L,a0R,c1,c2,c3 |
315 | bl abort,%r2 | 1130 | STD c1,0(r_ptr) ; r[0] = c1; |
316 | nop | 1131 | COPY %r0,c1 |
317 | L$0052 | 1132 | |
318 | comb,>> %r5,%r3,L$0053 | 1133 | SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 |
319 | subi 32,%r24,%r24 | 1134 | STD c2,8(r_ptr) ; r[1] = c2; |
320 | sub %r3,%r5,%r3 | 1135 | COPY %r0,c2 |
321 | L$0053 | 1136 | |
322 | comib,= 0,%r24,L$0054 | 1137 | SQR_ADD_C a1L,a1R,c3,c1,c2 |
323 | subi 31,%r24,%r19 | 1138 | SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 |
324 | mtsar %r19 | 1139 | STD c3,16(r_ptr) ; r[2] = c3; |
325 | zvdep %r5,32,%r5 | 1140 | COPY %r0,c3 |
326 | zvdep %r3,32,%r21 | 1141 | |
327 | subi 32,%r24,%r20 | 1142 | SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 |
328 | mtsar %r20 | 1143 | SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 |
329 | vshd 0,%r4,%r20 | 1144 | STD c1,24(r_ptr) ; r[3] = c1; |
330 | or %r21,%r20,%r3 | 1145 | COPY %r0,c1 |
331 | mtsar %r19 | 1146 | |
332 | zvdep %r4,32,%r4 | 1147 | SQR_ADD_C a2L,a2R,c2,c3,c1 |
333 | L$0054 | 1148 | SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 |
334 | extru %r5,15,16,%r23 | 1149 | SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1 |
335 | extru %r5,31,16,%r28 | 1150 | STD c2,32(r_ptr) ; r[4] = c2; |
336 | L$0055 | 1151 | COPY %r0,c2 |
337 | extru %r3,15,16,%r19 | 1152 | |
338 | comb,<> %r23,%r19,L$0058 | 1153 | SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2 |
339 | copy %r3,%r26 | 1154 | SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2 |
340 | bl L$0059,0 | 1155 | SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 |
341 | zdepi -1,31,16,%r29 | 1156 | STD c3,40(r_ptr) ; r[5] = c3; |
342 | L$0058 | 1157 | COPY %r0,c3 |
343 | .IMPORT $$divU,MILLICODE | 1158 | |
344 | bl $$divU,%r31 | 1159 | SQR_ADD_C a3L,a3R,c1,c2,c3 |
345 | copy %r23,%r25 | 1160 | SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3 |
346 | L$0059 | 1161 | SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3 |
347 | stw %r29,-16(0,%r30) | 1162 | SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3 |
348 | fldws -16(0,%r30),%fr10L | 1163 | STD c1,48(r_ptr) ; r[6] = c1; |
349 | stw %r28,-16(0,%r30) | 1164 | COPY %r0,c1 |
350 | fldws -16(0,%r30),%fr10R | 1165 | |
351 | stw %r23,-16(0,%r30) | 1166 | SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1 |
352 | xmpyu %fr10L,%fr10R,%fr8 | 1167 | SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1 |
353 | fldws -16(0,%r30),%fr10R | 1168 | SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1 |
354 | fstws %fr8R,-16(0,%r30) | 1169 | SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1 |
355 | xmpyu %fr10L,%fr10R,%fr9 | 1170 | STD c2,56(r_ptr) ; r[7] = c2; |
356 | ldw -16(0,%r30),%r8 | 1171 | COPY %r0,c2 |
357 | fstws %fr9R,-16(0,%r30) | 1172 | |
358 | copy %r8,%r22 | 1173 | SQR_ADD_C a4L,a4R,c3,c1,c2 |
359 | ldw -16(0,%r30),%r8 | 1174 | SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2 |
360 | extru %r4,15,16,%r24 | 1175 | SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2 |
361 | copy %r8,%r21 | 1176 | SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2 |
362 | L$0060 | 1177 | STD c3,64(r_ptr) ; r[8] = c3; |
363 | sub %r3,%r21,%r20 | 1178 | COPY %r0,c3 |
364 | copy %r20,%r19 | 1179 | |
365 | depi 0,31,16,%r19 | 1180 | SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3 |
366 | comib,<> 0,%r19,L$0061 | 1181 | SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3 |
367 | zdep %r20,15,16,%r19 | 1182 | SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3 |
368 | addl %r19,%r24,%r19 | 1183 | STD c1,72(r_ptr) ; r[9] = c1; |
369 | comb,>>= %r19,%r22,L$0061 | 1184 | COPY %r0,c1 |
370 | sub %r22,%r28,%r22 | 1185 | |
371 | sub %r21,%r23,%r21 | 1186 | SQR_ADD_C a5L,a5R,c2,c3,c1 |
372 | bl L$0060,0 | 1187 | SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1 |
373 | ldo -1(%r29),%r29 | 1188 | SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1 |
374 | L$0061 | 1189 | STD c2,80(r_ptr) ; r[10] = c2; |
375 | stw %r29,-16(0,%r30) | 1190 | COPY %r0,c2 |
376 | fldws -16(0,%r30),%fr10L | 1191 | |
377 | stw %r28,-16(0,%r30) | 1192 | SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2 |
378 | fldws -16(0,%r30),%fr10R | 1193 | SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2 |
379 | xmpyu %fr10L,%fr10R,%fr8 | 1194 | STD c3,88(r_ptr) ; r[11] = c3; |
380 | fstws %fr8R,-16(0,%r30) | 1195 | COPY %r0,c3 |
381 | ldw -16(0,%r30),%r8 | 1196 | |
382 | stw %r23,-16(0,%r30) | 1197 | SQR_ADD_C a6L,a6R,c1,c2,c3 |
383 | fldws -16(0,%r30),%fr10R | 1198 | SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3 |
384 | copy %r8,%r19 | 1199 | STD c1,96(r_ptr) ; r[12] = c1; |
385 | xmpyu %fr10L,%fr10R,%fr8 | 1200 | COPY %r0,c1 |
386 | fstws %fr8R,-16(0,%r30) | 1201 | |
387 | extru %r19,15,16,%r20 | 1202 | SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1 |
388 | ldw -16(0,%r30),%r8 | 1203 | STD c2,104(r_ptr) ; r[13] = c2; |
389 | zdep %r19,15,16,%r19 | 1204 | COPY %r0,c2 |
390 | addl %r8,%r20,%r20 | 1205 | |
391 | comclr,<<= %r19,%r4,0 | 1206 | SQR_ADD_C a7L,a7R,c3,c1,c2 |
392 | addi 1,%r20,%r20 | 1207 | STD c3, 112(r_ptr) ; r[14] = c3 |
393 | comb,<<= %r20,%r3,L$0066 | 1208 | STD c1, 120(r_ptr) ; r[15] = c1 |
394 | sub %r4,%r19,%r4 | 1209 | |
395 | addl %r3,%r5,%r3 | 1210 | .EXIT |
396 | ldo -1(%r29),%r29 | 1211 | LDD -104(%sp),%r6 ; restore r6 (saved at 24 before the 128-byte bump) |
397 | L$0066 | 1212 | LDD -112(%sp),%r5 ; restore r5 |
398 | addib,= -1,%r6,L$0056 | 1213 | LDD -120(%sp),%r4 ; restore r4 |
399 | sub %r3,%r20,%r3 | 1214 | BVE (%rp) |
400 | zdep %r29,15,16,%r7 | 1215 | LDD,MB -128(%sp),%r3 |
401 | shd %r3,%r4,16,%r3 | 1216 | |
402 | bl L$0055,0 | 1217 | .PROCEND |
403 | zdep %r4,15,16,%r4 | 1218 | |
404 | L$0056 | 1219 | ;----------------------------------------------------------------------------- |
405 | or %r7,%r29,%r28 | 1220 | ; Comba squaring of a 4-word number: stores the 8 result words r[0]..r[7] |
406 | L$0068 | 1221 | ;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) |
407 | ldw -148(0,%r30),%r2 | 1222 | ; arg0 = r_ptr (8 doublewords are stored, offsets 0..56) |
408 | ldw -124(0,%r30),%r7 | 1223 | ; arg1 = a_ptr (only the 4 doublewords at offsets 0..24 are read) |
409 | ldw -120(0,%r30),%r6 | 1224 | ; Relies on the r_ptr, a_ptr and high_mask aliases, which are defined elsewhere in the file |
410 | ldw -116(0,%r30),%r5 | 1225 | |
411 | ldw -112(0,%r30),%r4 | 1226 | bn_sqr_comba4 |
412 | ldw -108(0,%r30),%r3 | 1227 | .proc |
413 | bv 0(%r2) | 1228 | .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE |
414 | ldwm -128(0,%r30),%r8 | 1229 | .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN |
415 | .EXIT | 1230 | .entry |
416 | .PROCEND | 1231 | .align 64 |
1232 | STD %r3,0(%sp) ; save r3 (aliased as c3) | ||
1233 | STD %r4,8(%sp) ; save r4 (aliased as m) | ||
1234 | STD %r5,16(%sp) ; save r5 (aliased as lt) | ||
1235 | STD %r6,24(%sp) ; save r6 (aliased as ht) | ||
1236 | |||
1237 | ; | ||
1238 | ; Zero out carries; c1,c2,c3 rotate roles from one result word to the next | ||
1239 | ; | ||
1240 | COPY %r0,c1 | ||
1241 | COPY %r0,c2 | ||
1242 | COPY %r0,c3 | ||
1243 | |||
1244 | LDO 128(%sp),%sp ; bump stack by 128; the macros use scratch slots at -32..-8(%sp) | ||
1245 | DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L | ||
1246 | DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 | ||
1247 | |||
1248 | ; | ||
1249 | ; Load a[0]..a[3] only: the input has 4 words, so nothing past 24(a_ptr) may be read | ||
1250 | ; | ||
1251 | FLDD 0(a_ptr),a0 | ||
1252 | FLDD 8(a_ptr),a1 | ||
1253 | FLDD 16(a_ptr),a2 | ||
1254 | FLDD 24(a_ptr),a3 | ||
1259 | |||
1260 | SQR_ADD_C a0L,a0R,c1,c2,c3 | ||
1261 | |||
1262 | STD c1,0(r_ptr) ; r[0] = c1; | ||
1263 | COPY %r0,c1 | ||
1264 | |||
1265 | SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 | ||
1266 | |||
1267 | STD c2,8(r_ptr) ; r[1] = c2; | ||
1268 | COPY %r0,c2 | ||
1269 | |||
1270 | SQR_ADD_C a1L,a1R,c3,c1,c2 | ||
1271 | SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 | ||
1272 | |||
1273 | STD c3,16(r_ptr) ; r[2] = c3; | ||
1274 | COPY %r0,c3 | ||
1275 | |||
1276 | SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 | ||
1277 | SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 | ||
1278 | |||
1279 | STD c1,24(r_ptr) ; r[3] = c1; | ||
1280 | COPY %r0,c1 | ||
1281 | |||
1282 | SQR_ADD_C a2L,a2R,c2,c3,c1 | ||
1283 | SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 | ||
1284 | |||
1285 | STD c2,32(r_ptr) ; r[4] = c2; | ||
1286 | COPY %r0,c2 | ||
1287 | |||
1288 | SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 | ||
1289 | STD c3,40(r_ptr) ; r[5] = c3; | ||
1290 | COPY %r0,c3 | ||
1291 | |||
1292 | SQR_ADD_C a3L,a3R,c1,c2,c3 | ||
1293 | STD c1,48(r_ptr) ; r[6] = c1; | ||
1294 | STD c2,56(r_ptr) ; r[7] = c2; | ||
1295 | |||
1296 | .EXIT | ||
1297 | LDD -104(%sp),%r6 ; restore r6 (saved at 24 before the 128-byte bump) | ||
1298 | LDD -112(%sp),%r5 ; restore r5 | ||
1299 | LDD -120(%sp),%r4 ; restore r4 | ||
1300 | BVE (%rp) | ||
1301 | LDD,MB -128(%sp),%r3 ; (delay slot) pop the 128-byte frame and restore r3 | ||
1302 | |||
1303 | .PROCEND | ||
1304 | |||
1305 | |||
1306 | ;--------------------------------------------------------------------------- | ||
1307 | |||
; MUL_ADD_C: form the 128-bit product of two 64-bit words (A0 and B0, each
; given as upper L and lower R halves) and add it into the accumulator
; C1 (low), C2, C3.
; Uses ftemp1-4, the scratch slots -32..-8(%sp), m, m1, lt, ht, temp1, temp3;
; high_one must already hold 1 << 32.
1308 | MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3 | ||
1309 | XMPYU A0L,B0R,ftemp1 ; m1 = upper(A0) * lower(B0) | ||
1310 | FSTD ftemp1,-16(%sp) ; | ||
1311 | XMPYU A0R,B0L,ftemp2 ; m = lower(A0) * upper(B0) | ||
1312 | FSTD ftemp2,-8(%sp) ; | ||
1313 | XMPYU A0R,B0R,ftemp3 ; lt = lower(A0) * lower(B0) | ||
1314 | FSTD ftemp3,-32(%sp) | ||
1315 | XMPYU A0L,B0L,ftemp4 ; ht = upper(A0) * upper(B0) | ||
1316 | FSTD ftemp4,-24(%sp) ; | ||
1317 | |||
1318 | LDD -8(%sp),m ; load m | ||
1319 | LDD -16(%sp),m1 ; load m1 | ||
1320 | ADD,L m,m1,m ; m+m1 | ||
1321 | |||
1322 | DEPD,Z m,31,32,temp3 ; temp3 = (m+m1) << 32 | ||
1323 | LDD -24(%sp),ht ; load ht | ||
1324 | |||
1325 | CMPCLR,*>>= m,m1,%r0 ; if (m < m1), i.e. m+m1 wrapped around | ||
1326 | ADD,L ht,high_one,ht ; ht+=high_one | ||
1327 | |||
1328 | EXTRD,U m,31,32,temp1 ; m >> 32 | ||
1329 | LDD -32(%sp),lt ; load lt | ||
1330 | ADD,L ht,temp1,ht ; ht+= m>>32 | ||
1331 | ADD lt,temp3,lt ; lt += (m+m1) << 32 | ||
1332 | ADD,DC ht,%r0,ht ; ht += carry from the lt addition | ||
1333 | |||
1334 | ADD C1,lt,C1 ; c1=c1+lt | ||
1335 | ADD,DC ht,%r0,ht ; ht += carry from the c1 addition | ||
1336 | |||
1337 | ADD C2,ht,C2 ; c2 = c2 + ht | ||
1338 | ADD,DC C3,%r0,C3 ; add in carry (c3++) | ||
1339 | .endm | ||
1340 | |||
1341 | |||
1342 | ; Comba multiplication of two 8-word numbers: stores the 16 result words r[0]..r[15] | ||
1343 | ;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | ||
1344 | ; arg0 = r_ptr (16 doublewords are stored, offsets 0..120) | ||
1345 | ; arg1 = a_ptr (8 doublewords are loaded, offsets 0..56) | ||
1346 | ; arg2 = b_ptr (8 doublewords are loaded, offsets 0..56) | ||
1347 | ; b6/b7 live in %fr12/%fr13, which are saved on entry and reloaded before returning | ||
1348 | |||
1349 | bn_mul_comba8 | ||
1350 | .proc | ||
1351 | .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | ||
1352 | .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | ||
1353 | .entry | ||
1354 | .align 64 | ||
1355 | |||
1356 | STD %r3,0(%sp) ; save r3 (aliased as c3) | ||
1357 | STD %r4,8(%sp) ; save r4 (aliased as m) | ||
1358 | STD %r5,16(%sp) ; save r5 (aliased as lt) | ||
1359 | STD %r6,24(%sp) ; save r6 (aliased as ht) | ||
1360 | FSTD %fr12,32(%sp) ; save fr12 (aliased as b6) | ||
1361 | FSTD %fr13,40(%sp) ; save fr13 (aliased as b7) | ||
1362 | |||
1363 | ; | ||
1364 | ; Zero out carries; c1,c2,c3 rotate roles from one result word to the next | ||
1365 | ; | ||
1366 | COPY %r0,c1 | ||
1367 | COPY %r0,c2 | ||
1368 | COPY %r0,c3 | ||
1369 | |||
1370 | LDO 128(%sp),%sp ; bump stack by 128; MUL_ADD_C uses scratch slots at -32..-8(%sp) | ||
1371 | DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 | ||
1372 | |||
1373 | ; | ||
1374 | ; Load a[0]..a[7] and b[0]..b[7] into FP registers; the L/R halves feed XMPYU | ||
1375 | ; | ||
1376 | FLDD 0(a_ptr),a0 | ||
1377 | FLDD 8(a_ptr),a1 | ||
1378 | FLDD 16(a_ptr),a2 | ||
1379 | FLDD 24(a_ptr),a3 | ||
1380 | FLDD 32(a_ptr),a4 | ||
1381 | FLDD 40(a_ptr),a5 | ||
1382 | FLDD 48(a_ptr),a6 | ||
1383 | FLDD 56(a_ptr),a7 | ||
1384 | |||
1385 | FLDD 0(b_ptr),b0 | ||
1386 | FLDD 8(b_ptr),b1 | ||
1387 | FLDD 16(b_ptr),b2 | ||
1388 | FLDD 24(b_ptr),b3 | ||
1389 | FLDD 32(b_ptr),b4 | ||
1390 | FLDD 40(b_ptr),b5 | ||
1391 | FLDD 48(b_ptr),b6 | ||
1392 | FLDD 56(b_ptr),b7 | ||
1393 | |||
1394 | MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 | ||
1395 | STD c1,0(r_ptr) | ||
1396 | COPY %r0,c1 | ||
1397 | |||
1398 | MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 | ||
1399 | MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 | ||
1400 | STD c2,8(r_ptr) | ||
1401 | COPY %r0,c2 | ||
1402 | |||
1403 | MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 | ||
1404 | MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 | ||
1405 | MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 | ||
1406 | STD c3,16(r_ptr) | ||
1407 | COPY %r0,c3 | ||
1408 | |||
1409 | MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 | ||
1410 | MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 | ||
1411 | MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 | ||
1412 | MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 | ||
1413 | STD c1,24(r_ptr) | ||
1414 | COPY %r0,c1 | ||
1415 | |||
1416 | MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1 | ||
1417 | MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 | ||
1418 | MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 | ||
1419 | MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 | ||
1420 | MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1 | ||
1421 | STD c2,32(r_ptr) | ||
1422 | COPY %r0,c2 | ||
1423 | |||
1424 | MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2 | ||
1425 | MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2 | ||
1426 | MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 | ||
1427 | MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 | ||
1428 | MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2 | ||
1429 | MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2 | ||
1430 | STD c3,40(r_ptr) | ||
1431 | COPY %r0,c3 | ||
1432 | |||
1433 | MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3 | ||
1434 | MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3 | ||
1435 | MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3 | ||
1436 | MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 | ||
1437 | MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3 | ||
1438 | MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3 | ||
1439 | MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3 | ||
1440 | STD c1,48(r_ptr) | ||
1441 | COPY %r0,c1 | ||
1442 | |||
1443 | MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1 | ||
1444 | MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1 | ||
1445 | MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1 | ||
1446 | MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1 | ||
1447 | MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1 | ||
1448 | MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1 | ||
1449 | MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1 | ||
1450 | MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1 | ||
1451 | STD c2,56(r_ptr) | ||
1452 | COPY %r0,c2 | ||
1453 | |||
1454 | MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2 | ||
1455 | MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2 | ||
1456 | MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2 | ||
1457 | MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2 | ||
1458 | MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2 | ||
1459 | MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2 | ||
1460 | MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2 | ||
1461 | STD c3,64(r_ptr) | ||
1462 | COPY %r0,c3 | ||
1463 | |||
1464 | MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3 | ||
1465 | MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3 | ||
1466 | MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3 | ||
1467 | MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3 | ||
1468 | MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3 | ||
1469 | MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3 | ||
1470 | STD c1,72(r_ptr) | ||
1471 | COPY %r0,c1 | ||
1472 | |||
1473 | MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1 | ||
1474 | MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1 | ||
1475 | MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1 | ||
1476 | MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1 | ||
1477 | MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1 | ||
1478 | STD c2,80(r_ptr) | ||
1479 | COPY %r0,c2 | ||
1480 | |||
1481 | MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2 | ||
1482 | MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2 | ||
1483 | MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2 | ||
1484 | MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2 | ||
1485 | STD c3,88(r_ptr) | ||
1486 | COPY %r0,c3 | ||
1487 | |||
1488 | MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3 | ||
1489 | MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3 | ||
1490 | MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3 | ||
1491 | STD c1,96(r_ptr) | ||
1492 | COPY %r0,c1 | ||
1493 | |||
1494 | MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1 | ||
1495 | MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1 | ||
1496 | STD c2,104(r_ptr) | ||
1497 | COPY %r0,c2 | ||
1498 | |||
1499 | MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2 | ||
1500 | STD c3,112(r_ptr) | ||
1501 | STD c1,120(r_ptr) | ||
1502 | |||
1503 | .EXIT | ||
1504 | FLDD -88(%sp),%fr13 | ||
1505 | FLDD -96(%sp),%fr12 | ||
1506 | LDD -104(%sp),%r6 ; restore r6 (saved at 24 before the 128-byte bump) | ||
1507 | LDD -112(%sp),%r5 ; restore r5 | ||
1508 | LDD -120(%sp),%r4 ; restore r4 | ||
1509 | BVE (%rp) | ||
1510 | LDD,MB -128(%sp),%r3 | ||
1511 | |||
1512 | .PROCEND | ||
1513 | |||
1514 | ;----------------------------------------------------------------------------- | ||
1515 | ; Comba multiplication of two 4-word numbers: stores the 8 result words r[0]..r[7] | ||
1516 | ;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | ||
1517 | ; arg0 = r_ptr (8 doublewords are stored, offsets 0..56) | ||
1518 | ; arg1 = a_ptr (4 doublewords are loaded, offsets 0..24) | ||
1519 | ; arg2 = b_ptr (4 doublewords are loaded, offsets 0..24) | ||
1520 | ; %fr12/%fr13 are saved and reloaded here although this routine does not otherwise reference them | ||
1521 | |||
1522 | bn_mul_comba4 | ||
1523 | .proc | ||
1524 | .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | ||
1525 | .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | ||
1526 | .entry | ||
1527 | .align 64 | ||
1528 | |||
1529 | STD %r3,0(%sp) ; save r3 (aliased as c3) | ||
1530 | STD %r4,8(%sp) ; save r4 (aliased as m) | ||
1531 | STD %r5,16(%sp) ; save r5 (aliased as lt) | ||
1532 | STD %r6,24(%sp) ; save r6 (aliased as ht) | ||
1533 | FSTD %fr12,32(%sp) ; save fr12 | ||
1534 | FSTD %fr13,40(%sp) ; save fr13 | ||
1535 | |||
1536 | ; | ||
1537 | ; Zero out carries; c1,c2,c3 rotate roles from one result word to the next | ||
1538 | ; | ||
1539 | COPY %r0,c1 | ||
1540 | COPY %r0,c2 | ||
1541 | COPY %r0,c3 | ||
1542 | |||
1543 | LDO 128(%sp),%sp ; bump stack by 128; MUL_ADD_C uses scratch slots at -32..-8(%sp) | ||
1544 | DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 | ||
1545 | |||
1546 | ; | ||
1547 | ; Load a[0]..a[3] and b[0]..b[3] into FP registers; the L/R halves feed XMPYU | ||
1548 | ; | ||
1549 | FLDD 0(a_ptr),a0 | ||
1550 | FLDD 8(a_ptr),a1 | ||
1551 | FLDD 16(a_ptr),a2 | ||
1552 | FLDD 24(a_ptr),a3 | ||
1553 | |||
1554 | FLDD 0(b_ptr),b0 | ||
1555 | FLDD 8(b_ptr),b1 | ||
1556 | FLDD 16(b_ptr),b2 | ||
1557 | FLDD 24(b_ptr),b3 | ||
1558 | |||
1559 | MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 | ||
1560 | STD c1,0(r_ptr) | ||
1561 | COPY %r0,c1 | ||
1562 | |||
1563 | MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 | ||
1564 | MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 | ||
1565 | STD c2,8(r_ptr) | ||
1566 | COPY %r0,c2 | ||
1567 | |||
1568 | MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 | ||
1569 | MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 | ||
1570 | MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 | ||
1571 | STD c3,16(r_ptr) | ||
1572 | COPY %r0,c3 | ||
1573 | |||
1574 | MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 | ||
1575 | MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 | ||
1576 | MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 | ||
1577 | MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 | ||
1578 | STD c1,24(r_ptr) | ||
1579 | COPY %r0,c1 | ||
1580 | |||
1581 | MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 | ||
1582 | MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 | ||
1583 | MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 | ||
1584 | STD c2,32(r_ptr) | ||
1585 | COPY %r0,c2 | ||
1586 | |||
1587 | MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 | ||
1588 | MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 | ||
1589 | STD c3,40(r_ptr) | ||
1590 | COPY %r0,c3 | ||
1591 | |||
1592 | MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 | ||
1593 | STD c1,48(r_ptr) | ||
1594 | STD c2,56(r_ptr) | ||
1595 | |||
1596 | .EXIT | ||
1597 | FLDD -88(%sp),%fr13 | ||
1598 | FLDD -96(%sp),%fr12 | ||
1599 | LDD -104(%sp),%r6 ; restore r6 (saved at 24 before the 128-byte bump) | ||
1600 | LDD -112(%sp),%r5 ; restore r5 | ||
1601 | LDD -120(%sp),%r4 ; restore r4 | ||
1602 | BVE (%rp) | ||
1603 | LDD,MB -128(%sp),%r3 | ||
1604 | |||
1605 | .PROCEND | ||
1606 | |||
1607 | |||
1608 | .SPACE $TEXT$ | ||
1609 | .SUBSPA $CODE$ | ||
1610 | .SPACE $PRIVATE$,SORT=16 | ||
1611 | .IMPORT $global$,DATA | ||
1612 | .SPACE $TEXT$ | ||
1613 | .SUBSPA $CODE$ | ||
1614 | .SUBSPA $LIT$,QUAD=0,ALIGN=8,ACCESS=0x2c,SORT=16 | ||
1615 | C$7 | ||
1616 | .ALIGN 8 | ||
1617 | .STRINGZ "Division would overflow (%d)\n" | ||
1618 | .END | ||