Diffstat (limited to 'src/lib/libcrypto')
-rw-r--r--    src/lib/libcrypto/bn/asm/pa-risc2.s         1618
-rw-r--r--    src/lib/libcrypto/bn/asm/pa-risc2W.s        1605
-rw-r--r--    src/lib/libcrypto/bn/asm/sparcv8plus.S      1558
-rw-r--r--    src/lib/libcrypto/bn/asm/sparcv9-mont.pl     604
-rwxr-xr-x    src/lib/libcrypto/bn/asm/sparcv9a-mont.pl    880
-rw-r--r--    src/lib/libcrypto/bn/asm/via-mont.pl         242
-rw-r--r--    src/lib/libcrypto/cast/asm/cast-586.pl       177
-rw-r--r--    src/lib/libcrypto/des/asm/crypt586.pl        209
-rw-r--r--    src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl   608
-rw-r--r--    src/lib/libcrypto/sha/asm/sha1-thumb.pl      259
10 files changed, 0 insertions(+), 7760 deletions(-)
diff --git a/src/lib/libcrypto/bn/asm/pa-risc2.s b/src/lib/libcrypto/bn/asm/pa-risc2.s
deleted file mode 100644
index f3b16290eb..0000000000
--- a/src/lib/libcrypto/bn/asm/pa-risc2.s
+++ /dev/null
| @@ -1,1618 +0,0 @@ | |||
| 1 | ; | ||
| 2 | ; PA-RISC 2.0 implementation of bn_asm code, based on the | ||
| 3 | ; 64-bit version of the code. This code is effectively the | ||
| 4 | ; same as the 64-bit version except the register model is | ||
| 5 | ; slightly different given all values must be 32-bit between | ||
| 6 | ; function calls. Thus the 64-bit return values are returned | ||
| 7 | ; in %ret0 and %ret1 rather than just %ret0, as in the 64-bit version. | ||
| 8 | ; | ||
| 9 | ; | ||
| 10 | ; This code is approximately 2x faster than the C version | ||
| 11 | ; for RSA/DSA. | ||
| 12 | ; | ||
| 13 | ; See http://devresource.hp.com/ for more details on the PA-RISC | ||
| 14 | ; architecture. Also see the book "PA-RISC 2.0 Architecture" | ||
| 15 | ; by Gerry Kane for information on the instruction set architecture. | ||
| 16 | ; | ||
| 17 | ; Code written by Chris Ruemmler (with some help from the HP C | ||
| 18 | ; compiler). | ||
| 19 | ; | ||
| 20 | ; The code compiles with HP's assembler | ||
| 21 | ; | ||
| 22 | |||
| 23 | .level 2.0N | ||
| 24 | .space $TEXT$ | ||
| 25 | .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY | ||
| 26 | |||
| 27 | ; | ||
| 28 | ; Global Register definitions used for the routines. | ||
| 29 | ; | ||
| 30 | ; Some information about HP's runtime architecture for 32-bits. | ||
| 31 | ; | ||
| 32 | ; "Caller save" means the calling function must save the register | ||
| 33 | ; if it wants the register to be preserved. | ||
| 34 | ; "Callee save" means if a function uses the register, it must save | ||
| 35 | ; the value before using it. | ||
| 36 | ; | ||
| 37 | ; For the floating point registers | ||
| 38 | ; | ||
| 39 | ; "caller save" registers: fr4-fr11, fr22-fr31 | ||
| 40 | ; "callee save" registers: fr12-fr21 | ||
| 41 | ; "special" registers: fr0-fr3 (status and exception registers) | ||
| 42 | ; | ||
| 43 | ; For the integer registers | ||
| 44 | ; value zero : r0 | ||
| 45 | ; "caller save" registers: r1,r19-r26 | ||
| 46 | ; "callee save" registers: r3-r18 | ||
| 47 | ; return register : r2 (rp) | ||
| 48 | ; return values ; r28,r29 (ret0,ret1) | ||
| 49 | ; Stack pointer ; r30 (sp) | ||
| 50 | ; millicode return ptr ; r31 (also a caller save register) | ||
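;
; For illustration, the 64-bit carry that each routine below returns is
; split across the two return registers; a hypothetical C sketch (not
; part of the original source) of that split:
;
;   typedef unsigned int u32;
;   typedef unsigned long long u64;
;
;   static void split_return(u64 v, u32 *ret0, u32 *ret1)
;   {
;       *ret0 = (u32)(v >> 32);  /* high half -> %ret0 */
;       *ret1 = (u32)v;          /* low half  -> %ret1 */
;   }
;
; This is what the EXTRD,U %ret1,31,32,%ret0 instruction at each
; routine's exit computes.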
| 51 | |||
| 52 | |||
| 53 | ; | ||
| 54 | ; Arguments to the routines | ||
| 55 | ; | ||
| 56 | r_ptr .reg %r26 | ||
| 57 | a_ptr .reg %r25 | ||
| 58 | b_ptr .reg %r24 | ||
| 59 | num .reg %r24 | ||
| 60 | n .reg %r23 | ||
| 61 | |||
| 62 | ; | ||
| 63 | ; Note that the "w" argument for bn_mul_add_words and bn_mul_words | ||
| 64 | ; is passed on the stack at a delta of -56 from the top of stack | ||
| 65 | ; as the routine is entered. | ||
| 66 | ; | ||
| 67 | |||
| 68 | ; | ||
| 69 | ; Globals used in some routines | ||
| 70 | ; | ||
| 71 | |||
| 72 | top_overflow .reg %r23 | ||
| 73 | high_mask .reg %r22 ; value 0xffffffff80000000L | ||
| 74 | |||
| 75 | |||
| 76 | ;------------------------------------------------------------------------------ | ||
| 77 | ; | ||
| 78 | ; bn_mul_add_words | ||
| 79 | ; | ||
| 80 | ;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr, | ||
| 81 | ; int num, BN_ULONG w) | ||
| 82 | ; | ||
| 83 | ; arg0 = r_ptr | ||
| 84 | ; arg1 = a_ptr | ||
| 85 | ; arg3 = num | ||
| 86 | ; -56(sp) = w | ||
| 87 | ; | ||
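;
; For reference, a portable C sketch of what this routine computes
; (a hypothetical illustration; the 128-bit type is an assumption made
; for clarity, not something the assembly relies on):
;
;   typedef unsigned long long BN_ULONG;   /* 64-bit word */
;   typedef unsigned __int128 BN_ULLONG;   /* double word */
;
;   BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap,
;                             int num, BN_ULONG w)
;   {
;       BN_ULLONG c = 0;
;       while (num-- > 0) {
;           c += (BN_ULLONG)w * *ap++ + *rp;  /* rp[i] += w*ap[i] + carry */
;           *rp++ = (BN_ULONG)c;
;           c >>= 64;                         /* carry into next word */
;       }
;       return (BN_ULONG)c;                   /* final carry */
;   }
;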
| 88 | ; Local register definitions | ||
| 89 | ; | ||
| 90 | |||
| 91 | fm1 .reg %fr22 | ||
| 92 | fm .reg %fr23 | ||
| 93 | ht_temp .reg %fr24 | ||
| 94 | ht_temp_1 .reg %fr25 | ||
| 95 | lt_temp .reg %fr26 | ||
| 96 | lt_temp_1 .reg %fr27 | ||
| 97 | fm1_1 .reg %fr28 | ||
| 98 | fm_1 .reg %fr29 | ||
| 99 | |||
| 100 | fw_h .reg %fr7L | ||
| 101 | fw_l .reg %fr7R | ||
| 102 | fw .reg %fr7 | ||
| 103 | |||
| 104 | fht_0 .reg %fr8L | ||
| 105 | flt_0 .reg %fr8R | ||
| 106 | t_float_0 .reg %fr8 | ||
| 107 | |||
| 108 | fht_1 .reg %fr9L | ||
| 109 | flt_1 .reg %fr9R | ||
| 110 | t_float_1 .reg %fr9 | ||
| 111 | |||
| 112 | tmp_0 .reg %r31 | ||
| 113 | tmp_1 .reg %r21 | ||
| 114 | m_0 .reg %r20 | ||
| 115 | m_1 .reg %r19 | ||
| 116 | ht_0 .reg %r1 | ||
| 117 | ht_1 .reg %r3 | ||
| 118 | lt_0 .reg %r4 | ||
| 119 | lt_1 .reg %r5 | ||
| 120 | m1_0 .reg %r6 | ||
| 121 | m1_1 .reg %r7 | ||
| 122 | rp_val .reg %r8 | ||
| 123 | rp_val_1 .reg %r9 | ||
| 124 | |||
| 125 | bn_mul_add_words | ||
| 126 | .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN | ||
| 127 | .proc | ||
| 128 | .callinfo frame=128 | ||
| 129 | .entry | ||
| 130 | .align 64 | ||
| 131 | |||
| 132 | STD %r3,0(%sp) ; save r3 | ||
| 133 | STD %r4,8(%sp) ; save r4 | ||
| 134 | NOP ; Needed to make the loop 16-byte aligned | ||
| 135 | NOP ; needed to make the loop 16-byte aligned | ||
| 136 | |||
| 137 | STD %r5,16(%sp) ; save r5 | ||
| 138 | NOP | ||
| 139 | STD %r6,24(%sp) ; save r6 | ||
| 140 | STD %r7,32(%sp) ; save r7 | ||
| 141 | |||
| 142 | STD %r8,40(%sp) ; save r8 | ||
| 143 | STD %r9,48(%sp) ; save r9 | ||
| 144 | COPY %r0,%ret1 ; return 0 by default | ||
| 145 | DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 | ||
| 146 | |||
| 147 | CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit | ||
| 148 | LDO 128(%sp),%sp ; bump stack | ||
| 149 | |||
| 150 | ; | ||
| 151 | ; The loop is unrolled twice, so if there is only 1 number | ||
| 152 | ; then go straight to the cleanup code. | ||
| 153 | ; | ||
| 154 | CMPIB,= 1,num,bn_mul_add_words_single_top | ||
| 155 | FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l) | ||
| 156 | |||
| 157 | ; | ||
| 158 | ; This loop is unrolled 2 times (64-byte aligned as well) | ||
| 159 | ; | ||
| 160 | ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus | ||
| 161 | ; two 32-bit multiplies can be issued per cycle. | ||
| 162 | ; | ||
| 163 | bn_mul_add_words_unroll2 | ||
| 164 | |||
| 165 | FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) | ||
| 166 | FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr9) ht(L)/lt(R) | ||
| 167 | LDD 0(r_ptr),rp_val ; rp[0] | ||
| 168 | LDD 8(r_ptr),rp_val_1 ; rp[1] | ||
| 169 | |||
| 170 | XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l | ||
| 171 | XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l | ||
| 172 | FSTD fm1,-16(%sp) ; -16(sp) = m1[0] | ||
| 173 | FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1] | ||
| 174 | |||
| 175 | XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h | ||
| 176 | XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h | ||
| 177 | FSTD fm,-8(%sp) ; -8(sp) = m[0] | ||
| 178 | FSTD fm_1,-40(%sp) ; -40(sp) = m[1] | ||
| 179 | |||
| 180 | XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h | ||
| 181 | XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h | ||
| 182 | FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp | ||
| 183 | FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1 | ||
| 184 | |||
| 185 | XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l | ||
| 186 | XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp_1 = lt_1*fw_l | ||
| 187 | FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp | ||
| 188 | FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1 | ||
| 189 | |||
| 190 | LDD -8(%sp),m_0 ; m[0] | ||
| 191 | LDD -40(%sp),m_1 ; m[1] | ||
| 192 | LDD -16(%sp),m1_0 ; m1[0] | ||
| 193 | LDD -48(%sp),m1_1 ; m1[1] | ||
| 194 | |||
| 195 | LDD -24(%sp),ht_0 ; ht[0] | ||
| 196 | LDD -56(%sp),ht_1 ; ht[1] | ||
| 197 | ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0]; | ||
| 198 | ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1]; | ||
| 199 | |||
| 200 | LDD -32(%sp),lt_0 | ||
| 201 | LDD -64(%sp),lt_1 | ||
| 202 | CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0]) | ||
| 203 | ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32) | ||
| 204 | |||
| 205 | CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1]) | ||
| 206 | ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32) | ||
| 207 | EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32 | ||
| 208 | DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32 | ||
| 209 | |||
| 210 | EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32 | ||
| 211 | DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32 | ||
| 212 | ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32) | ||
| 213 | ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32) | ||
| 214 | |||
| 215 | ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0]; | ||
| 216 | ADD,DC ht_0,%r0,ht_0 ; ht[0]++ | ||
| 217 | ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1]; | ||
| 218 | ADD,DC ht_1,%r0,ht_1 ; ht[1]++ | ||
| 219 | |||
| 220 | ADD %ret1,lt_0,lt_0 ; lt[0] = lt[0] + c; | ||
| 221 | ADD,DC ht_0,%r0,ht_0 ; ht[0]++ | ||
| 222 | ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0] | ||
| 223 | ADD,DC ht_0,%r0,ht_0 ; ht[0]++ | ||
| 224 | |||
| 225 | LDO -2(num),num ; num = num - 2; | ||
| 226 | ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c); | ||
| 227 | ADD,DC ht_1,%r0,ht_1 ; ht[1]++ | ||
| 228 | STD lt_0,0(r_ptr) ; rp[0] = lt[0] | ||
| 229 | |||
| 230 | ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1] | ||
| 231 | ADD,DC ht_1,%r0,%ret1 ; ht[1]++ | ||
| 232 | LDO 16(a_ptr),a_ptr ; a_ptr += 2 | ||
| 233 | |||
| 234 | STD lt_1,8(r_ptr) ; rp[1] = lt[1] | ||
| 235 | CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do | ||
| 236 | LDO 16(r_ptr),r_ptr ; r_ptr += 2 | ||
| 237 | |||
| 238 | CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one | ||
| 239 | |||
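;
; The loop above assembles each 64x64 -> 128-bit product from four
; 32x32 XMPYU multiplies. A hypothetical C sketch of one such product
; (not from the original source; names mirror the comments above):
;
;   typedef unsigned long long u64;
;
;   static void mul_64x64(u64 a, u64 w, u64 *hi, u64 *lo)
;   {
;       u64 ht = (a >> 32) * (w >> 32);           /* XMPYU fht,fw_h */
;       u64 lt = (u64)(unsigned)a * (unsigned)w;  /* XMPYU flt,fw_l */
;       u64 m  = (u64)(unsigned)a * (w >> 32);    /* XMPYU flt,fw_h */
;       u64 m1 = (a >> 32) * (u64)(unsigned)w;    /* XMPYU fht,fw_l */
;       u64 t  = m + m1;
;       if (t < m1)                /* CMPCLR,*>>= tmp,m1,%r0   */
;           ht += 1ULL << 32;      /* ADD,L ht,top_overflow,ht */
;       ht += t >> 32;             /* ADD,L ht,(m>>32),ht      */
;       *lo = lt + (t << 32);      /* ADD lt,m1,lt             */
;       if (*lo < lt)              /* ADD,DC ht,%r0,ht         */
;           ht++;
;       *hi = ht;
;   }
;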
| 240 | ; | ||
| 241 | ; Top of loop aligned on 64-byte boundary | ||
| 242 | ; | ||
| 243 | bn_mul_add_words_single_top | ||
| 244 | FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) | ||
| 245 | LDD 0(r_ptr),rp_val ; rp[0] | ||
| 246 | LDO 8(a_ptr),a_ptr ; a_ptr++ | ||
| 247 | XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l | ||
| 248 | FSTD fm1,-16(%sp) ; -16(sp) = m1 | ||
| 249 | XMPYU flt_0,fw_h,fm ; m = lt*fw_h | ||
| 250 | FSTD fm,-8(%sp) ; -8(sp) = m | ||
| 251 | XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h | ||
| 252 | FSTD ht_temp,-24(%sp) ; -24(sp) = ht | ||
| 253 | XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l | ||
| 254 | FSTD lt_temp,-32(%sp) ; -32(sp) = lt | ||
| 255 | |||
| 256 | LDD -8(%sp),m_0 | ||
| 257 | LDD -16(%sp),m1_0 ; m1 = temp1 | ||
| 258 | ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; | ||
| 259 | LDD -24(%sp),ht_0 | ||
| 260 | LDD -32(%sp),lt_0 | ||
| 261 | |||
| 262 | CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) | ||
| 263 | ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) | ||
| 264 | |||
| 265 | EXTRD,U tmp_0,31,32,m_0 ; m>>32 | ||
| 266 | DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 | ||
| 267 | |||
| 268 | ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) | ||
| 269 | ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1; | ||
| 270 | ADD,DC ht_0,%r0,ht_0 ; ht++ | ||
| 271 | ADD %ret1,tmp_0,lt_0 ; lt = lt + c; | ||
| 272 | ADD,DC ht_0,%r0,ht_0 ; ht++ | ||
| 273 | ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0] | ||
| 274 | ADD,DC ht_0,%r0,%ret1 ; ht++ | ||
| 275 | STD lt_0,0(r_ptr) ; rp[0] = lt | ||
| 276 | |||
| 277 | bn_mul_add_words_exit | ||
| 278 | .EXIT | ||
| 279 | |||
| 280 | EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 | ||
| 281 | LDD -80(%sp),%r9 ; restore r9 | ||
| 282 | LDD -88(%sp),%r8 ; restore r8 | ||
| 283 | LDD -96(%sp),%r7 ; restore r7 | ||
| 284 | LDD -104(%sp),%r6 ; restore r6 | ||
| 285 | LDD -112(%sp),%r5 ; restore r5 | ||
| 286 | LDD -120(%sp),%r4 ; restore r4 | ||
| 287 | BVE (%rp) | ||
| 288 | LDD,MB -128(%sp),%r3 ; restore r3 | ||
| 289 | .PROCEND ;in=23,24,25,26,29;out=28; | ||
| 290 | |||
| 291 | ;---------------------------------------------------------------------------- | ||
| 292 | ; | ||
| 293 | ;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | ||
| 294 | ; | ||
| 295 | ; arg0 = rp | ||
| 296 | ; arg1 = ap | ||
| 297 | ; arg3 = num | ||
| 298 | ; w on stack at -56(sp) | ||
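;
; Reference C sketch (hypothetical; typedefs as in the sketch for
; bn_mul_add_words above):
;
;   BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap,
;                         int num, BN_ULONG w)
;   {
;       BN_ULLONG c = 0;
;       while (num-- > 0) {
;           c += (BN_ULLONG)w * *ap++;   /* rp[i] = low(w*ap[i] + c) */
;           *rp++ = (BN_ULONG)c;
;           c >>= 64;
;       }
;       return (BN_ULONG)c;
;   }
;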
| 299 | |||
| 300 | bn_mul_words | ||
| 301 | .proc | ||
| 302 | .callinfo frame=128 | ||
| 303 | .entry | ||
| 304 | .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | ||
| 305 | .align 64 | ||
| 306 | |||
| 307 | STD %r3,0(%sp) ; save r3 | ||
| 308 | STD %r4,8(%sp) ; save r4 | ||
| 309 | NOP | ||
| 310 | STD %r5,16(%sp) ; save r5 | ||
| 311 | |||
| 312 | STD %r6,24(%sp) ; save r6 | ||
| 313 | STD %r7,32(%sp) ; save r7 | ||
| 314 | COPY %r0,%ret1 ; return 0 by default | ||
| 315 | DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 | ||
| 316 | |||
| 317 | CMPIB,>= 0,num,bn_mul_words_exit | ||
| 318 | LDO 128(%sp),%sp ; bump stack | ||
| 319 | |||
| 320 | ; | ||
| 321 | ; See if only 1 word to do, thus just do cleanup | ||
| 322 | ; | ||
| 323 | CMPIB,= 1,num,bn_mul_words_single_top | ||
| 324 | FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l) | ||
| 325 | |||
| 326 | ; | ||
| 327 | ; This loop is unrolled 2 times (64-byte aligned as well) | ||
| 328 | ; | ||
| 329 | ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus | ||
| 330 | ; two 32-bit multiplies can be issued per cycle. | ||
| 331 | ; | ||
| 332 | bn_mul_words_unroll2 | ||
| 333 | |||
| 334 | FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) | ||
| 335 | FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr9) ht(L)/lt(R) | ||
| 336 | XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l | ||
| 337 | XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l | ||
| 338 | |||
| 339 | FSTD fm1,-16(%sp) ; -16(sp) = m1 | ||
| 340 | FSTD fm1_1,-48(%sp) ; -48(sp) = m1 | ||
| 341 | XMPYU flt_0,fw_h,fm ; m = lt*fw_h | ||
| 342 | XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h | ||
| 343 | |||
| 344 | FSTD fm,-8(%sp) ; -8(sp) = m | ||
| 345 | FSTD fm_1,-40(%sp) ; -40(sp) = m | ||
| 346 | XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h | ||
| 347 | XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h | ||
| 348 | |||
| 349 | FSTD ht_temp,-24(%sp) ; -24(sp) = ht | ||
| 350 | FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht | ||
| 351 | XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l | ||
| 352 | XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l | ||
| 353 | |||
| 354 | FSTD lt_temp,-32(%sp) ; -32(sp) = lt | ||
| 355 | FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt | ||
| 356 | LDD -8(%sp),m_0 | ||
| 357 | LDD -40(%sp),m_1 | ||
| 358 | |||
| 359 | LDD -16(%sp),m1_0 | ||
| 360 | LDD -48(%sp),m1_1 | ||
| 361 | LDD -24(%sp),ht_0 | ||
| 362 | LDD -56(%sp),ht_1 | ||
| 363 | |||
| 364 | ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1; | ||
| 365 | ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1; | ||
| 366 | LDD -32(%sp),lt_0 | ||
| 367 | LDD -64(%sp),lt_1 | ||
| 368 | |||
| 369 | CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1) | ||
| 370 | ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) | ||
| 371 | CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1) | ||
| 372 | ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32) | ||
| 373 | |||
| 374 | EXTRD,U tmp_0,31,32,m_0 ; m>>32 | ||
| 375 | DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 | ||
| 376 | EXTRD,U tmp_1,31,32,m_1 ; m>>32 | ||
| 377 | DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32 | ||
| 378 | |||
| 379 | ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) | ||
| 380 | ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32) | ||
| 381 | ADD lt_0,m1_0,lt_0 ; lt = lt+m1; | ||
| 382 | ADD,DC ht_0,%r0,ht_0 ; ht++ | ||
| 383 | |||
| 384 | ADD lt_1,m1_1,lt_1 ; lt = lt+m1; | ||
| 385 | ADD,DC ht_1,%r0,ht_1 ; ht++ | ||
| 386 | ADD %ret1,lt_0,lt_0 ; lt = lt + c (ret1); | ||
| 387 | ADD,DC ht_0,%r0,ht_0 ; ht++ | ||
| 388 | |||
| 389 | ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0) | ||
| 390 | ADD,DC ht_1,%r0,ht_1 ; ht++ | ||
| 391 | STD lt_0,0(r_ptr) ; rp[0] = lt | ||
| 392 | STD lt_1,8(r_ptr) ; rp[1] = lt | ||
| 393 | |||
| 394 | COPY ht_1,%ret1 ; carry = ht | ||
| 395 | LDO -2(num),num ; num = num - 2; | ||
| 396 | LDO 16(a_ptr),a_ptr ; ap += 2 | ||
| 397 | CMPIB,<= 2,num,bn_mul_words_unroll2 | ||
| 398 | LDO 16(r_ptr),r_ptr ; rp += 2 | ||
| 399 | |||
| 400 | CMPIB,=,N 0,num,bn_mul_words_exit ; are we done? | ||
| 401 | |||
| 402 | ; | ||
| 403 | ; Top of loop aligned on 64-byte boundary | ||
| 404 | ; | ||
| 405 | bn_mul_words_single_top | ||
| 406 | FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) | ||
| 407 | |||
| 408 | XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l | ||
| 409 | FSTD fm1,-16(%sp) ; -16(sp) = m1 | ||
| 410 | XMPYU flt_0,fw_h,fm ; m = lt*fw_h | ||
| 411 | FSTD fm,-8(%sp) ; -8(sp) = m | ||
| 412 | XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h | ||
| 413 | FSTD ht_temp,-24(%sp) ; -24(sp) = ht | ||
| 414 | XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l | ||
| 415 | FSTD lt_temp,-32(%sp) ; -32(sp) = lt | ||
| 416 | |||
| 417 | LDD -8(%sp),m_0 | ||
| 418 | LDD -16(%sp),m1_0 | ||
| 419 | ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; | ||
| 420 | LDD -24(%sp),ht_0 | ||
| 421 | LDD -32(%sp),lt_0 | ||
| 422 | |||
| 423 | CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) | ||
| 424 | ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) | ||
| 425 | |||
| 426 | EXTRD,U tmp_0,31,32,m_0 ; m>>32 | ||
| 427 | DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 | ||
| 428 | |||
| 429 | ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) | ||
| 430 | ADD lt_0,m1_0,lt_0 ; lt= lt+m1; | ||
| 431 | ADD,DC ht_0,%r0,ht_0 ; ht++ | ||
| 432 | |||
| 433 | ADD %ret1,lt_0,lt_0 ; lt = lt + c; | ||
| 434 | ADD,DC ht_0,%r0,ht_0 ; ht++ | ||
| 435 | |||
| 436 | COPY ht_0,%ret1 ; copy carry | ||
| 437 | STD lt_0,0(r_ptr) ; rp[0] = lt | ||
| 438 | |||
| 439 | bn_mul_words_exit | ||
| 440 | .EXIT | ||
| 441 | EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 | ||
| 442 | LDD -96(%sp),%r7 ; restore r7 | ||
| 443 | LDD -104(%sp),%r6 ; restore r6 | ||
| 444 | LDD -112(%sp),%r5 ; restore r5 | ||
| 445 | LDD -120(%sp),%r4 ; restore r4 | ||
| 446 | BVE (%rp) | ||
| 447 | LDD,MB -128(%sp),%r3 ; restore r3 | ||
| 448 | .PROCEND | ||
| 449 | |||
| 450 | ;---------------------------------------------------------------------------- | ||
| 451 | ; | ||
| 452 | ;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num) | ||
| 453 | ; | ||
| 454 | ; arg0 = rp | ||
| 455 | ; arg1 = ap | ||
| 456 | ; arg2 = num | ||
| 457 | ; | ||
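;
; Reference C sketch (hypothetical; note rp must hold 2*num words):
;
;   void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num)
;   {
;       while (num-- > 0) {
;           BN_ULLONG t = (BN_ULLONG)*ap * *ap;  /* ap[i]^2    */
;           *rp++ = (BN_ULONG)t;                 /* low word   */
;           *rp++ = (BN_ULONG)(t >> 64);         /* high word  */
;           ap++;
;       }
;   }
;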
| 458 | |||
| 459 | bn_sqr_words | ||
| 460 | .proc | ||
| 461 | .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | ||
| 462 | .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | ||
| 463 | .entry | ||
| 464 | .align 64 | ||
| 465 | |||
| 466 | STD %r3,0(%sp) ; save r3 | ||
| 467 | STD %r4,8(%sp) ; save r4 | ||
| 468 | NOP | ||
| 469 | STD %r5,16(%sp) ; save r5 | ||
| 470 | |||
| 471 | CMPIB,>= 0,num,bn_sqr_words_exit | ||
| 472 | LDO 128(%sp),%sp ; bump stack | ||
| 473 | |||
| 474 | ; | ||
| 475 | ; If only one word, go straight to the cleanup code | ||
| 476 | ; | ||
| 477 | CMPIB,= 1,num,bn_sqr_words_single_top | ||
| 478 | DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L | ||
| 479 | |||
| 480 | ; | ||
| 481 | ; This loop is unrolled 2 times (64-byte aligned as well) | ||
| 482 | ; | ||
| 483 | |||
| 484 | bn_sqr_words_unroll2 | ||
| 485 | FLDD 0(a_ptr),t_float_0 ; a[0] | ||
| 486 | FLDD 8(a_ptr),t_float_1 ; a[1] | ||
| 487 | XMPYU fht_0,flt_0,fm ; m[0] | ||
| 488 | XMPYU fht_1,flt_1,fm_1 ; m[1] | ||
| 489 | |||
| 490 | FSTD fm,-24(%sp) ; store m[0] | ||
| 491 | FSTD fm_1,-56(%sp) ; store m[1] | ||
| 492 | XMPYU flt_0,flt_0,lt_temp ; lt[0] | ||
| 493 | XMPYU flt_1,flt_1,lt_temp_1 ; lt[1] | ||
| 494 | |||
| 495 | FSTD lt_temp,-16(%sp) ; store lt[0] | ||
| 496 | FSTD lt_temp_1,-48(%sp) ; store lt[1] | ||
| 497 | XMPYU fht_0,fht_0,ht_temp ; ht[0] | ||
| 498 | XMPYU fht_1,fht_1,ht_temp_1 ; ht[1] | ||
| 499 | |||
| 500 | FSTD ht_temp,-8(%sp) ; store ht[0] | ||
| 501 | FSTD ht_temp_1,-40(%sp) ; store ht[1] | ||
| 502 | LDD -24(%sp),m_0 | ||
| 503 | LDD -56(%sp),m_1 | ||
| 504 | |||
| 505 | AND m_0,high_mask,tmp_0 ; m[0] & Mask | ||
| 506 | AND m_1,high_mask,tmp_1 ; m[1] & Mask | ||
| 507 | DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1 | ||
| 508 | DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1 | ||
| 509 | |||
| 510 | LDD -16(%sp),lt_0 | ||
| 511 | LDD -48(%sp),lt_1 | ||
| 512 | EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1 | ||
| 513 | EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1 | ||
| 514 | |||
| 515 | LDD -8(%sp),ht_0 | ||
| 516 | LDD -40(%sp),ht_1 | ||
| 517 | ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0 | ||
| 518 | ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1 | ||
| 519 | |||
| 520 | ADD lt_0,m_0,lt_0 ; lt = lt+m | ||
| 521 | ADD,DC ht_0,%r0,ht_0 ; ht[0]++ | ||
| 522 | STD lt_0,0(r_ptr) ; rp[0] = lt[0] | ||
| 523 | STD ht_0,8(r_ptr) ; rp[1] = ht[0] | ||
| 524 | |||
| 525 | ADD lt_1,m_1,lt_1 ; lt = lt+m | ||
| 526 | ADD,DC ht_1,%r0,ht_1 ; ht[1]++ | ||
| 527 | STD lt_1,16(r_ptr) ; rp[2] = lt[1] | ||
| 528 | STD ht_1,24(r_ptr) ; rp[3] = ht[1] | ||
| 529 | |||
| 530 | LDO -2(num),num ; num = num - 2; | ||
| 531 | LDO 16(a_ptr),a_ptr ; ap += 2 | ||
| 532 | CMPIB,<= 2,num,bn_sqr_words_unroll2 | ||
| 533 | LDO 32(r_ptr),r_ptr ; rp += 4 | ||
| 534 | |||
| 535 | CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done? | ||
| 536 | |||
| 537 | ; | ||
| 538 | ; Top of loop aligned on 64-byte boundary | ||
| 539 | ; | ||
| 540 | bn_sqr_words_single_top | ||
| 541 | FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) | ||
| 542 | |||
| 543 | XMPYU fht_0,flt_0,fm ; m | ||
| 544 | FSTD fm,-24(%sp) ; store m | ||
| 545 | |||
| 546 | XMPYU flt_0,flt_0,lt_temp ; lt | ||
| 547 | FSTD lt_temp,-16(%sp) ; store lt | ||
| 548 | |||
| 549 | XMPYU fht_0,fht_0,ht_temp ; ht | ||
| 550 | FSTD ht_temp,-8(%sp) ; store ht | ||
| 551 | |||
| 552 | LDD -24(%sp),m_0 ; load m | ||
| 553 | AND m_0,high_mask,tmp_0 ; m & Mask | ||
| 554 | DEPD,Z m_0,30,31,m_0 ; m << 32+1 | ||
| 555 | LDD -16(%sp),lt_0 ; lt | ||
| 556 | |||
| 557 | LDD -8(%sp),ht_0 ; ht | ||
| 558 | EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1 | ||
| 559 | ADD m_0,lt_0,lt_0 ; lt = lt+m | ||
| 560 | ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0 | ||
| 561 | ADD,DC ht_0,%r0,ht_0 ; ht++ | ||
| 562 | |||
| 563 | STD lt_0,0(r_ptr) ; rp[0] = lt | ||
| 564 | STD ht_0,8(r_ptr) ; rp[1] = ht | ||
| 565 | |||
| 566 | bn_sqr_words_exit | ||
| 567 | .EXIT | ||
| 568 | LDD -112(%sp),%r5 ; restore r5 | ||
| 569 | LDD -120(%sp),%r4 ; restore r4 | ||
| 570 | BVE (%rp) | ||
| 571 | LDD,MB -128(%sp),%r3 | ||
| 572 | .PROCEND ;in=23,24,25,26,29;out=28; | ||
| 573 | |||
| 574 | |||
| 575 | ;---------------------------------------------------------------------------- | ||
| 576 | ; | ||
| 577 | ;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) | ||
| 578 | ; | ||
| 579 | ; arg0 = rp | ||
| 580 | ; arg1 = ap | ||
| 581 | ; arg2 = bp | ||
| 582 | ; arg3 = n | ||
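;
; Reference C sketch (hypothetical), mirroring the two-step carry
; chain used in the loop below:
;
;   BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a,
;                         const BN_ULONG *b, int n)
;   {
;       BN_ULONG c = 0;                /* running carry, 0 or 1 */
;       while (n-- > 0) {
;           BN_ULONG t = *a++ + c;     /* t = a[i] + c          */
;           c = (t < c);               /* ADD,DC: carry out     */
;           BN_ULONG l = t + *b;       /* l = t + b[i]          */
;           c += (l < *b);             /* ADD,DC: carry out     */
;           b++;
;           *r++ = l;
;       }
;       return c;
;   }
;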
| 583 | |||
| 584 | t .reg %r22 | ||
| 585 | b .reg %r21 | ||
| 586 | l .reg %r20 | ||
| 587 | |||
| 588 | bn_add_words | ||
| 589 | .proc | ||
| 590 | .entry | ||
| 591 | .callinfo | ||
| 592 | .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | ||
| 593 | .align 64 | ||
| 594 | |||
| 595 | CMPIB,>= 0,n,bn_add_words_exit | ||
| 596 | COPY %r0,%ret1 ; return 0 by default | ||
| 597 | |||
| 598 | ; | ||
| 599 | ; If 2 or more words, do the loop | ||
| 600 | ; | ||
| 601 | CMPIB,= 1,n,bn_add_words_single_top | ||
| 602 | NOP | ||
| 603 | |||
| 604 | ; | ||
| 605 | ; This loop is unrolled 2 times (64-byte aligned as well) | ||
| 606 | ; | ||
| 607 | bn_add_words_unroll2 | ||
| 608 | LDD 0(a_ptr),t | ||
| 609 | LDD 0(b_ptr),b | ||
| 610 | ADD t,%ret1,t ; t = t+c; | ||
| 611 | ADD,DC %r0,%r0,%ret1 ; set c to carry | ||
| 612 | ADD t,b,l ; l = t + b[0] | ||
| 613 | ADD,DC %ret1,%r0,%ret1 ; c+= carry | ||
| 614 | STD l,0(r_ptr) | ||
| 615 | |||
| 616 | LDD 8(a_ptr),t | ||
| 617 | LDD 8(b_ptr),b | ||
| 618 | ADD t,%ret1,t ; t = t+c; | ||
| 619 | ADD,DC %r0,%r0,%ret1 ; set c to carry | ||
| 620 | ADD t,b,l ; l = t + b[0] | ||
| 621 | ADD,DC %ret1,%r0,%ret1 ; c+= carry | ||
| 622 | STD l,8(r_ptr) | ||
| 623 | |||
| 624 | LDO -2(n),n | ||
| 625 | LDO 16(a_ptr),a_ptr | ||
| 626 | LDO 16(b_ptr),b_ptr | ||
| 627 | |||
| 628 | CMPIB,<= 2,n,bn_add_words_unroll2 | ||
| 629 | LDO 16(r_ptr),r_ptr | ||
| 630 | |||
| 631 | CMPIB,=,N 0,n,bn_add_words_exit ; are we done? | ||
| 632 | |||
| 633 | bn_add_words_single_top | ||
| 634 | LDD 0(a_ptr),t | ||
| 635 | LDD 0(b_ptr),b | ||
| 636 | |||
| 637 | ADD t,%ret1,t ; t = t+c; | ||
| 638 | ADD,DC %r0,%r0,%ret1 ; set c to carry (could use CMPCLR??) | ||
| 639 | ADD t,b,l ; l = t + b[0] | ||
| 640 | ADD,DC %ret1,%r0,%ret1 ; c+= carry | ||
| 641 | STD l,0(r_ptr) | ||
| 642 | |||
| 643 | bn_add_words_exit | ||
| 644 | .EXIT | ||
| 645 | BVE (%rp) | ||
| 646 | EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 | ||
| 647 | .PROCEND ;in=23,24,25,26,29;out=28; | ||
| 648 | |||
| 649 | ;---------------------------------------------------------------------------- | ||
| 650 | ; | ||
| 651 | ;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) | ||
| 652 | ; | ||
| 653 | ; arg0 = rp | ||
| 654 | ; arg1 = ap | ||
| 655 | ; arg2 = bp | ||
| 656 | ; arg3 = n | ||
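;
; Reference C sketch (hypothetical). The CMPCLR pair in the loop
; below computes the borrow: it changes only when a[i] != b[i].
;
;   BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a,
;                         const BN_ULONG *b, int n)
;   {
;       BN_ULONG c = 0;                /* running borrow, 0 or 1 */
;       while (n-- > 0) {
;           BN_ULONG t1 = *a++, t2 = *b++;
;           *r++ = t1 - t2 - c;
;           if (t1 != t2)
;               c = (t1 < t2);         /* else borrow propagates */
;       }
;       return c;
;   }
;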
| 657 | |||
| 658 | t1 .reg %r22 | ||
| 659 | t2 .reg %r21 | ||
| 660 | sub_tmp1 .reg %r20 | ||
| 661 | sub_tmp2 .reg %r19 | ||
| 662 | |||
| 663 | |||
| 664 | bn_sub_words | ||
| 665 | .proc | ||
| 666 | .callinfo | ||
| 667 | .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | ||
| 668 | .entry | ||
| 669 | .align 64 | ||
| 670 | |||
| 671 | CMPIB,>= 0,n,bn_sub_words_exit | ||
| 672 | COPY %r0,%ret1 ; return 0 by default | ||
| 673 | |||
| 674 | ; | ||
| 675 | ; If 2 or more words, do the loop | ||
| 676 | ; | ||
| 677 | CMPIB,= 1,n,bn_sub_words_single_top | ||
| 678 | NOP | ||
| 679 | |||
| 680 | ; | ||
| 681 | ; This loop is unrolled 2 times (64-byte aligned as well) | ||
| 682 | ; | ||
| 683 | bn_sub_words_unroll2 | ||
| 684 | LDD 0(a_ptr),t1 | ||
| 685 | LDD 0(b_ptr),t2 | ||
| 686 | SUB t1,t2,sub_tmp1 ; t3 = t1-t2; | ||
| 687 | SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c; | ||
| 688 | |||
| 689 | CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 | ||
| 690 | LDO 1(%r0),sub_tmp2 | ||
| 691 | |||
| 692 | CMPCLR,*= t1,t2,%r0 | ||
| 693 | COPY sub_tmp2,%ret1 | ||
| 694 | STD sub_tmp1,0(r_ptr) | ||
| 695 | |||
| 696 | LDD 8(a_ptr),t1 | ||
| 697 | LDD 8(b_ptr),t2 | ||
| 698 | SUB t1,t2,sub_tmp1 ; t3 = t1-t2; | ||
| 699 | SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c; | ||
| 700 | CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 | ||
| 701 | LDO 1(%r0),sub_tmp2 | ||
| 702 | |||
| 703 | CMPCLR,*= t1,t2,%r0 | ||
| 704 | COPY sub_tmp2,%ret1 | ||
| 705 | STD sub_tmp1,8(r_ptr) | ||
| 706 | |||
| 707 | LDO -2(n),n | ||
| 708 | LDO 16(a_ptr),a_ptr | ||
| 709 | LDO 16(b_ptr),b_ptr | ||
| 710 | |||
| 711 | CMPIB,<= 2,n,bn_sub_words_unroll2 | ||
| 712 | LDO 16(r_ptr),r_ptr | ||
| 713 | |||
| 714 | CMPIB,=,N 0,n,bn_sub_words_exit ; are we done? | ||
| 715 | |||
| 716 | bn_sub_words_single_top | ||
| 717 | LDD 0(a_ptr),t1 | ||
| 718 | LDD 0(b_ptr),t2 | ||
| 719 | SUB t1,t2,sub_tmp1 ; t3 = t1-t2; | ||
| 720 | SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c; | ||
| 721 | CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 | ||
| 722 | LDO 1(%r0),sub_tmp2 | ||
| 723 | |||
| 724 | CMPCLR,*= t1,t2,%r0 | ||
| 725 | COPY sub_tmp2,%ret1 | ||
| 726 | |||
| 727 | STD sub_tmp1,0(r_ptr) | ||
| 728 | |||
| 729 | bn_sub_words_exit | ||
| 730 | .EXIT | ||
| 731 | BVE (%rp) | ||
| 732 | EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 | ||
| 733 | .PROCEND ;in=23,24,25,26,29;out=28; | ||
| 734 | |||
| 735 | ;------------------------------------------------------------------------------ | ||
| 736 | ; | ||
| 737 | ; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d) | ||
| 738 | ; | ||
| 739 | ; arg0 = h | ||
| 740 | ; arg1 = l | ||
| 741 | ; arg2 = d | ||
| 742 | ; | ||
| 743 | ; This is mainly just output from the HP C compiler. | ||
| 744 | ; | ||
| 745 | ;------------------------------------------------------------------------------ | ||
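;
; For reference, a compact C equivalent (hypothetical; it assumes the
; quotient fits in one word, i.e. h < d, and uses a 128-bit
; intermediate where the compiled code normalizes d and estimates in
; 32-bit steps):
;
;   BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
;   {
;       if (d == 0)                    /* early-out, as below */
;           return (BN_ULONG)-1;
;       /* the code below aborts if the quotient would overflow */
;       return (BN_ULONG)((((unsigned __int128)h << 64) | l) / d);
;   }
;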
| 746 | bn_div_words | ||
| 747 | .PROC | ||
| 748 | .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN | ||
| 749 | .IMPORT BN_num_bits_word,CODE | ||
| 750 | ;--- not PIC .IMPORT __iob,DATA | ||
| 751 | ;--- not PIC .IMPORT fprintf,CODE | ||
| 752 | .IMPORT abort,CODE | ||
| 753 | .IMPORT $$div2U,MILLICODE | ||
| 754 | .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE | ||
| 755 | .ENTRY | ||
| 756 | STW %r2,-20(%r30) ;offset 0x8ec | ||
| 757 | STW,MA %r3,192(%r30) ;offset 0x8f0 | ||
| 758 | STW %r4,-188(%r30) ;offset 0x8f4 | ||
| 759 | DEPD %r5,31,32,%r6 ;offset 0x8f8 | ||
| 760 | STD %r6,-184(%r30) ;offset 0x8fc | ||
| 761 | DEPD %r7,31,32,%r8 ;offset 0x900 | ||
| 762 | STD %r8,-176(%r30) ;offset 0x904 | ||
| 763 | STW %r9,-168(%r30) ;offset 0x908 | ||
| 764 | LDD -248(%r30),%r3 ;offset 0x90c | ||
| 765 | COPY %r26,%r4 ;offset 0x910 | ||
| 766 | COPY %r24,%r5 ;offset 0x914 | ||
| 767 | DEPD %r25,31,32,%r4 ;offset 0x918 | ||
| 768 | CMPB,*<> %r3,%r0,$0006000C ;offset 0x91c | ||
| 769 | DEPD %r23,31,32,%r5 ;offset 0x920 | ||
| 770 | MOVIB,TR -1,%r29,$00060002 ;offset 0x924 | ||
| 771 | EXTRD,U %r29,31,32,%r28 ;offset 0x928 | ||
| 772 | $0006002A | ||
| 773 | LDO -1(%r29),%r29 ;offset 0x92c | ||
| 774 | SUB %r23,%r7,%r23 ;offset 0x930 | ||
| 775 | $00060024 | ||
| 776 | SUB %r4,%r31,%r25 ;offset 0x934 | ||
| 777 | AND %r25,%r19,%r26 ;offset 0x938 | ||
| 778 | CMPB,*<>,N %r0,%r26,$00060046 ;offset 0x93c | ||
| 779 | DEPD,Z %r25,31,32,%r20 ;offset 0x940 | ||
| 780 | OR %r20,%r24,%r21 ;offset 0x944 | ||
| 781 | CMPB,*<<,N %r21,%r23,$0006002A ;offset 0x948 | ||
| 782 | SUB %r31,%r2,%r31 ;offset 0x94c | ||
| 783 | $00060046 | ||
| 784 | $0006002E | ||
| 785 | DEPD,Z %r23,31,32,%r25 ;offset 0x950 | ||
| 786 | EXTRD,U %r23,31,32,%r26 ;offset 0x954 | ||
| 787 | AND %r25,%r19,%r24 ;offset 0x958 | ||
| 788 | ADD,L %r31,%r26,%r31 ;offset 0x95c | ||
| 789 | CMPCLR,*>>= %r5,%r24,%r0 ;offset 0x960 | ||
| 790 | LDO 1(%r31),%r31 ;offset 0x964 | ||
| 791 | $00060032 | ||
| 792 | CMPB,*<<=,N %r31,%r4,$00060036 ;offset 0x968 | ||
| 793 | LDO -1(%r29),%r29 ;offset 0x96c | ||
| 794 | ADD,L %r4,%r3,%r4 ;offset 0x970 | ||
| 795 | $00060036 | ||
| 796 | ADDIB,=,N -1,%r8,$D0 ;offset 0x974 | ||
| 797 | SUB %r5,%r24,%r28 ;offset 0x978 | ||
| 798 | $0006003A | ||
| 799 | SUB %r4,%r31,%r24 ;offset 0x97c | ||
| 800 | SHRPD %r24,%r28,32,%r4 ;offset 0x980 | ||
| 801 | DEPD,Z %r29,31,32,%r9 ;offset 0x984 | ||
| 802 | DEPD,Z %r28,31,32,%r5 ;offset 0x988 | ||
| 803 | $0006001C | ||
| 804 | EXTRD,U %r4,31,32,%r31 ;offset 0x98c | ||
| 805 | CMPB,*<>,N %r31,%r2,$00060020 ;offset 0x990 | ||
| 806 | MOVB,TR %r6,%r29,$D1 ;offset 0x994 | ||
| 807 | STD %r29,-152(%r30) ;offset 0x998 | ||
| 808 | $0006000C | ||
| 809 | EXTRD,U %r3,31,32,%r25 ;offset 0x99c | ||
| 810 | COPY %r3,%r26 ;offset 0x9a0 | ||
| 811 | EXTRD,U %r3,31,32,%r9 ;offset 0x9a4 | ||
| 812 | EXTRD,U %r4,31,32,%r8 ;offset 0x9a8 | ||
| 813 | .CALL ARGW0=GR,ARGW1=GR,RTNVAL=GR ;in=25,26;out=28; | ||
| 814 | B,L BN_num_bits_word,%r2 ;offset 0x9ac | ||
| 815 | EXTRD,U %r5,31,32,%r7 ;offset 0x9b0 | ||
| 816 | LDI 64,%r20 ;offset 0x9b4 | ||
| 817 | DEPD %r7,31,32,%r5 ;offset 0x9b8 | ||
| 818 | DEPD %r8,31,32,%r4 ;offset 0x9bc | ||
| 819 | DEPD %r9,31,32,%r3 ;offset 0x9c0 | ||
| 820 | CMPB,= %r28,%r20,$00060012 ;offset 0x9c4 | ||
| 821 | COPY %r28,%r24 ;offset 0x9c8 | ||
| 822 | MTSARCM %r24 ;offset 0x9cc | ||
| 823 | DEPDI,Z -1,%sar,1,%r19 ;offset 0x9d0 | ||
| 824 | CMPB,*>>,N %r4,%r19,$D2 ;offset 0x9d4 | ||
| 825 | $00060012 | ||
| 826 | SUBI 64,%r24,%r31 ;offset 0x9d8 | ||
| 827 | CMPCLR,*<< %r4,%r3,%r0 ;offset 0x9dc | ||
| 828 | SUB %r4,%r3,%r4 ;offset 0x9e0 | ||
| 829 | $00060016 | ||
| 830 | CMPB,= %r31,%r0,$0006001A ;offset 0x9e4 | ||
| 831 | COPY %r0,%r9 ;offset 0x9e8 | ||
| 832 | MTSARCM %r31 ;offset 0x9ec | ||
| 833 | DEPD,Z %r3,%sar,64,%r3 ;offset 0x9f0 | ||
| 834 | SUBI 64,%r31,%r26 ;offset 0x9f4 | ||
| 835 | MTSAR %r26 ;offset 0x9f8 | ||
| 836 | SHRPD %r4,%r5,%sar,%r4 ;offset 0x9fc | ||
| 837 | MTSARCM %r31 ;offset 0xa00 | ||
| 838 | DEPD,Z %r5,%sar,64,%r5 ;offset 0xa04 | ||
| 839 | $0006001A | ||
| 840 | DEPDI,Z -1,31,32,%r19 ;offset 0xa08 | ||
| 841 | AND %r3,%r19,%r29 ;offset 0xa0c | ||
| 842 | EXTRD,U %r29,31,32,%r2 ;offset 0xa10 | ||
| 843 | DEPDI,Z -1,63,32,%r6 ;offset 0xa14 | ||
| 844 | MOVIB,TR 2,%r8,$0006001C ;offset 0xa18 | ||
| 845 | EXTRD,U %r3,63,32,%r7 ;offset 0xa1c | ||
| 846 | $D2 | ||
| 847 | ;--- not PIC ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20 | ||
| 848 | ;--- not PIC LDIL LR'C$7,%r21 ;offset 0xa24 | ||
| 849 | ;--- not PIC LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28 | ||
| 850 | ;--- not PIC .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28; | ||
| 851 | ;--- not PIC B,L fprintf,%r2 ;offset 0xa2c | ||
| 852 | ;--- not PIC LDO RR'C$7(%r21),%r25 ;offset 0xa30 | ||
| 853 | .CALL ; | ||
| 854 | B,L abort,%r2 ;offset 0xa34 | ||
| 855 | NOP ;offset 0xa38 | ||
| 856 | B $D3 ;offset 0xa3c | ||
| 857 | LDW -212(%r30),%r2 ;offset 0xa40 | ||
| 858 | $00060020 | ||
| 859 | COPY %r4,%r26 ;offset 0xa44 | ||
| 860 | EXTRD,U %r4,31,32,%r25 ;offset 0xa48 | ||
| 861 | COPY %r2,%r24 ;offset 0xa4c | ||
| 862 | .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL) | ||
| 863 | B,L $$div2U,%r31 ;offset 0xa50 | ||
| 864 | EXTRD,U %r2,31,32,%r23 ;offset 0xa54 | ||
| 865 | DEPD %r28,31,32,%r29 ;offset 0xa58 | ||
| 866 | $00060022 | ||
| 867 | STD %r29,-152(%r30) ;offset 0xa5c | ||
| 868 | $D1 | ||
| 869 | AND %r5,%r19,%r24 ;offset 0xa60 | ||
| 870 | EXTRD,U %r24,31,32,%r24 ;offset 0xa64 | ||
| 871 | STW %r2,-160(%r30) ;offset 0xa68 | ||
| 872 | STW %r7,-128(%r30) ;offset 0xa6c | ||
| 873 | FLDD -152(%r30),%fr4 ;offset 0xa70 | ||
| 874 | FLDD -152(%r30),%fr7 ;offset 0xa74 | ||
| 875 | FLDW -160(%r30),%fr8L ;offset 0xa78 | ||
| 876 | FLDW -128(%r30),%fr5L ;offset 0xa7c | ||
| 877 | XMPYU %fr8L,%fr7L,%fr10 ;offset 0xa80 | ||
| 878 | FSTD %fr10,-136(%r30) ;offset 0xa84 | ||
| 879 | XMPYU %fr8L,%fr7R,%fr22 ;offset 0xa88 | ||
| 880 | FSTD %fr22,-144(%r30) ;offset 0xa8c | ||
| 881 | XMPYU %fr5L,%fr4L,%fr11 ;offset 0xa90 | ||
| 882 | XMPYU %fr5L,%fr4R,%fr23 ;offset 0xa94 | ||
| 883 | FSTD %fr11,-112(%r30) ;offset 0xa98 | ||
| 884 | FSTD %fr23,-120(%r30) ;offset 0xa9c | ||
| 885 | LDD -136(%r30),%r28 ;offset 0xaa0 | ||
| 886 | DEPD,Z %r28,31,32,%r31 ;offset 0xaa4 | ||
| 887 | LDD -144(%r30),%r20 ;offset 0xaa8 | ||
| 888 | ADD,L %r20,%r31,%r31 ;offset 0xaac | ||
| 889 | LDD -112(%r30),%r22 ;offset 0xab0 | ||
| 890 | DEPD,Z %r22,31,32,%r22 ;offset 0xab4 | ||
| 891 | LDD -120(%r30),%r21 ;offset 0xab8 | ||
| 892 | B $00060024 ;offset 0xabc | ||
| 893 | ADD,L %r21,%r22,%r23 ;offset 0xac0 | ||
| 894 | $D0 | ||
| 895 | OR %r9,%r29,%r29 ;offset 0xac4 | ||
| 896 | $00060040 | ||
| 897 | EXTRD,U %r29,31,32,%r28 ;offset 0xac8 | ||
| 898 | $00060002 | ||
| 899 | $L2 | ||
| 900 | LDW -212(%r30),%r2 ;offset 0xacc | ||
| 901 | $D3 | ||
| 902 | LDW -168(%r30),%r9 ;offset 0xad0 | ||
| 903 | LDD -176(%r30),%r8 ;offset 0xad4 | ||
| 904 | EXTRD,U %r8,31,32,%r7 ;offset 0xad8 | ||
| 905 | LDD -184(%r30),%r6 ;offset 0xadc | ||
| 906 | EXTRD,U %r6,31,32,%r5 ;offset 0xae0 | ||
| 907 | LDW -188(%r30),%r4 ;offset 0xae4 | ||
| 908 | BVE (%r2) ;offset 0xae8 | ||
| 909 | .EXIT | ||
| 910 | LDW,MB -192(%r30),%r3 ;offset 0xaec | ||
| 911 | .PROCEND ;in=23,25;out=28,29;fpin=105,107; | ||
| 912 | |||
| 913 | |||
| 914 | |||
| 915 | |||
| 916 | ;---------------------------------------------------------------------------- | ||
| 917 | ; | ||
| 918 | ; Registers to hold 64-bit values to manipulate. The "L" part | ||
| 919 | ; of the register corresponds to the upper 32-bits, while the "R" | ||
| 920 | ; part corresponds to the lower 32-bits | ||
| 921 | ; | ||
| 922 | ; Note that when using b6 and b7, the code must save them before | ||
| 923 | ; use, because they are callee-save registers | ||
| 924 | ; | ||
| 925 | ; | ||
| 926 | ; Floating point registers to use to save values that | ||
| 927 | ; are manipulated. These don't collide with ftemp1-4 and | ||
| 928 | ; are all caller save registers | ||
| 929 | ; | ||
| 930 | a0 .reg %fr22 | ||
| 931 | a0L .reg %fr22L | ||
| 932 | a0R .reg %fr22R | ||
| 933 | |||
| 934 | a1 .reg %fr23 | ||
| 935 | a1L .reg %fr23L | ||
| 936 | a1R .reg %fr23R | ||
| 937 | |||
| 938 | a2 .reg %fr24 | ||
| 939 | a2L .reg %fr24L | ||
| 940 | a2R .reg %fr24R | ||
| 941 | |||
| 942 | a3 .reg %fr25 | ||
| 943 | a3L .reg %fr25L | ||
| 944 | a3R .reg %fr25R | ||
| 945 | |||
| 946 | a4 .reg %fr26 | ||
| 947 | a4L .reg %fr26L | ||
| 948 | a4R .reg %fr26R | ||
| 949 | |||
| 950 | a5 .reg %fr27 | ||
| 951 | a5L .reg %fr27L | ||
| 952 | a5R .reg %fr27R | ||
| 953 | |||
| 954 | a6 .reg %fr28 | ||
| 955 | a6L .reg %fr28L | ||
| 956 | a6R .reg %fr28R | ||
| 957 | |||
| 958 | a7 .reg %fr29 | ||
| 959 | a7L .reg %fr29L | ||
| 960 | a7R .reg %fr29R | ||
| 961 | |||
| 962 | b0 .reg %fr30 | ||
| 963 | b0L .reg %fr30L | ||
| 964 | b0R .reg %fr30R | ||
| 965 | |||
| 966 | b1 .reg %fr31 | ||
| 967 | b1L .reg %fr31L | ||
| 968 | b1R .reg %fr31R | ||
| 969 | |||
| 970 | ; | ||
| 971 | ; Temporary floating point variables, these are all caller save | ||
| 972 | ; registers | ||
| 973 | ; | ||
| 974 | ftemp1 .reg %fr4 | ||
| 975 | ftemp2 .reg %fr5 | ||
| 976 | ftemp3 .reg %fr6 | ||
| 977 | ftemp4 .reg %fr7 | ||
| 978 | |||
| 979 | ; | ||
| 980 | ; The B set of registers when used. | ||
| 981 | ; | ||
| 982 | |||
| 983 | b2 .reg %fr8 | ||
| 984 | b2L .reg %fr8L | ||
| 985 | b2R .reg %fr8R | ||
| 986 | |||
| 987 | b3 .reg %fr9 | ||
| 988 | b3L .reg %fr9L | ||
| 989 | b3R .reg %fr9R | ||
| 990 | |||
| 991 | b4 .reg %fr10 | ||
| 992 | b4L .reg %fr10L | ||
| 993 | b4R .reg %fr10R | ||
| 994 | |||
| 995 | b5 .reg %fr11 | ||
| 996 | b5L .reg %fr11L | ||
| 997 | b5R .reg %fr11R | ||
| 998 | |||
| 999 | b6 .reg %fr12 | ||
| 1000 | b6L .reg %fr12L | ||
| 1001 | b6R .reg %fr12R | ||
| 1002 | |||
| 1003 | b7 .reg %fr13 | ||
| 1004 | b7L .reg %fr13L | ||
| 1005 | b7R .reg %fr13R | ||
| 1006 | |||
| 1007 | c1 .reg %r21 ; only reg | ||
| 1008 | temp1 .reg %r20 ; only reg | ||
| 1009 | temp2 .reg %r19 ; only reg | ||
| 1010 | temp3 .reg %r31 ; only reg | ||
| 1011 | |||
| 1012 | m1 .reg %r28 | ||
| 1013 | c2 .reg %r23 | ||
| 1014 | high_one .reg %r1 | ||
| 1015 | ht .reg %r6 | ||
| 1016 | lt .reg %r5 | ||
| 1017 | m .reg %r4 | ||
| 1018 | c3 .reg %r3 | ||
| 1019 | |||
| 1020 | SQR_ADD_C .macro A0L,A0R,C1,C2,C3 | ||
| 1021 | XMPYU A0L,A0R,ftemp1 ; m | ||
| 1022 | FSTD ftemp1,-24(%sp) ; store m | ||
| 1023 | |||
| 1024 | XMPYU A0R,A0R,ftemp2 ; lt | ||
| 1025 | FSTD ftemp2,-16(%sp) ; store lt | ||
| 1026 | |||
| 1027 | XMPYU A0L,A0L,ftemp3 ; ht | ||
| 1028 | FSTD ftemp3,-8(%sp) ; store ht | ||
| 1029 | |||
| 1030 | LDD -24(%sp),m ; load m | ||
| 1031 | AND m,high_mask,temp2 ; m & Mask | ||
| 1032 | DEPD,Z m,30,31,temp3 ; m << 32+1 | ||
| 1033 | LDD -16(%sp),lt ; lt | ||
| 1034 | |||
| 1035 | LDD -8(%sp),ht ; ht | ||
| 1036 | EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1 | ||
| 1037 | ADD temp3,lt,lt ; lt = lt+m | ||
| 1038 | ADD,L ht,temp1,ht ; ht += temp1 | ||
| 1039 | ADD,DC ht,%r0,ht ; ht++ | ||
| 1040 | |||
| 1041 | ADD C1,lt,C1 ; c1=c1+lt | ||
| 1042 | ADD,DC ht,%r0,ht ; ht++ | ||
| 1043 | |||
| 1044 | ADD C2,ht,C2 ; c2=c2+ht | ||
| 1045 | ADD,DC C3,%r0,C3 ; c3++ | ||
| 1046 | .endm | ||
| 1047 | |||
| 1048 | SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3 | ||
| 1049 | XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht | ||
| 1050 | FSTD ftemp1,-16(%sp) ; | ||
| 1051 | XMPYU A0R,A1L,ftemp2 ; m = bh*lt | ||
| 1052 | FSTD ftemp2,-8(%sp) ; | ||
| 1053 | XMPYU A0R,A1R,ftemp3 ; lt = bl*lt | ||
| 1054 | FSTD ftemp3,-32(%sp) | ||
| 1055 | XMPYU A0L,A1L,ftemp4 ; ht = bh*ht | ||
| 1056 | FSTD ftemp4,-24(%sp) ; | ||
| 1057 | |||
| 1058 | LDD -8(%sp),m ; r21 = m | ||
| 1059 | LDD -16(%sp),m1 ; r19 = m1 | ||
| 1060 | ADD,L m,m1,m ; m+m1 | ||
| 1061 | |||
| 1062 | DEPD,Z m,31,32,temp3 ; (m+m1<<32) | ||
| 1063 | LDD -24(%sp),ht ; r24 = ht | ||
| 1064 | |||
| 1065 | CMPCLR,*>>= m,m1,%r0 ; if (m < m1) | ||
| 1066 | ADD,L ht,high_one,ht ; ht+=high_one | ||
| 1067 | |||
| 1068 | EXTRD,U m,31,32,temp1 ; m >> 32 | ||
| 1069 | LDD -32(%sp),lt ; lt | ||
| 1070 | ADD,L ht,temp1,ht ; ht+= m>>32 | ||
| 1071 | ADD lt,temp3,lt ; lt = lt+m1 | ||
| 1072 | ADD,DC ht,%r0,ht ; ht++ | ||
| 1073 | |||
| 1074 | ADD ht,ht,ht ; ht=ht+ht; | ||
| 1075 | ADD,DC C3,%r0,C3 ; add in carry (c3++) | ||
| 1076 | |||
| 1077 | ADD lt,lt,lt ; lt=lt+lt; | ||
| 1078 | ADD,DC ht,%r0,ht ; add in carry (ht++) | ||
| 1079 | |||
| 1080 | ADD C1,lt,C1 ; c1=c1+lt | ||
| 1081 | ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++) | ||
| 1082 | LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise | ||
| 1083 | |||
| 1084 | ADD C2,ht,C2 ; c2 = c2 + ht | ||
| 1085 | ADD,DC C3,%r0,C3 ; add in carry (c3++) | ||
| 1086 | .endm | ||
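;
; The two macros above mirror the comba helpers from the portable C
; code: SQR_ADD_C accumulates a*a into the three-word carry chain
; (C1,C2,C3), and SQR_ADD_C2 accumulates 2*a*b. A hypothetical C
; sketch of their semantics (the 128-bit type is for clarity only):
;
;   typedef unsigned long long BN_ULONG;
;
;   static void sqr_add_c(BN_ULONG a,
;                         BN_ULONG *c1, BN_ULONG *c2, BN_ULONG *c3)
;   {
;       unsigned __int128 t = (unsigned __int128)a * a;
;       BN_ULONG lt = (BN_ULONG)t, ht = (BN_ULONG)(t >> 64);
;       *c1 += lt;  ht += (*c1 < lt);     /* cannot overflow ht */
;       *c2 += ht;  *c3 += (*c2 < ht);
;   }
;
;   static void sqr_add_c2(BN_ULONG a, BN_ULONG b,
;                          BN_ULONG *c1, BN_ULONG *c2, BN_ULONG *c3)
;   {
;       unsigned __int128 t = (unsigned __int128)a * b;
;       BN_ULONG lt = (BN_ULONG)t, ht = (BN_ULONG)(t >> 64);
;       *c3 += ht >> 63;                  /* carry out of doubling */
;       ht = (ht << 1) | (lt >> 63);
;       lt <<= 1;
;       *c1 += lt;
;       if (*c1 < lt && ++ht == 0)        /* ADD,DC,*NUV / LDO 1(C3) */
;           ++*c3;
;       *c2 += ht;  *c3 += (*c2 < ht);
;   }
;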
| 1087 | |||
| 1088 | ; | ||
| 1089 | ;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) | ||
| 1090 | ; arg0 = r_ptr | ||
| 1091 | ; arg1 = a_ptr | ||
| 1092 | ; | ||
| 1093 | |||
| 1094 | bn_sqr_comba8 | ||
| 1095 | .PROC | ||
| 1096 | .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | ||
| 1097 | .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | ||
| 1098 | .ENTRY | ||
| 1099 | .align 64 | ||
| 1100 | |||
| 1101 | STD %r3,0(%sp) ; save r3 | ||
| 1102 | STD %r4,8(%sp) ; save r4 | ||
| 1103 | STD %r5,16(%sp) ; save r5 | ||
| 1104 | STD %r6,24(%sp) ; save r6 | ||
| 1105 | |||
| 1106 | ; | ||
| 1107 | ; Zero out carries | ||
| 1108 | ; | ||
| 1109 | COPY %r0,c1 | ||
| 1110 | COPY %r0,c2 | ||
| 1111 | COPY %r0,c3 | ||
| 1112 | |||
| 1113 | LDO 128(%sp),%sp ; bump stack | ||
| 1114 | DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L | ||
| 1115 | DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 | ||
| 1116 | |||
| 1117 | ; | ||
| 1118 | ; Load up all of the values we are going to use | ||
| 1119 | ; | ||
| 1120 | FLDD 0(a_ptr),a0 | ||
| 1121 | FLDD 8(a_ptr),a1 | ||
| 1122 | FLDD 16(a_ptr),a2 | ||
| 1123 | FLDD 24(a_ptr),a3 | ||
| 1124 | FLDD 32(a_ptr),a4 | ||
| 1125 | FLDD 40(a_ptr),a5 | ||
| 1126 | FLDD 48(a_ptr),a6 | ||
| 1127 | FLDD 56(a_ptr),a7 | ||
| 1128 | |||
| 1129 | SQR_ADD_C a0L,a0R,c1,c2,c3 | ||
| 1130 | STD c1,0(r_ptr) ; r[0] = c1; | ||
| 1131 | COPY %r0,c1 | ||
| 1132 | |||
| 1133 | SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 | ||
| 1134 | STD c2,8(r_ptr) ; r[1] = c2; | ||
| 1135 | COPY %r0,c2 | ||
| 1136 | |||
| 1137 | SQR_ADD_C a1L,a1R,c3,c1,c2 | ||
| 1138 | SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 | ||
| 1139 | STD c3,16(r_ptr) ; r[2] = c3; | ||
| 1140 | COPY %r0,c3 | ||
| 1141 | |||
| 1142 | SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 | ||
| 1143 | SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 | ||
| 1144 | STD c1,24(r_ptr) ; r[3] = c1; | ||
| 1145 | COPY %r0,c1 | ||
| 1146 | |||
| 1147 | SQR_ADD_C a2L,a2R,c2,c3,c1 | ||
| 1148 | SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 | ||
| 1149 | SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1 | ||
| 1150 | STD c2,32(r_ptr) ; r[4] = c2; | ||
| 1151 | COPY %r0,c2 | ||
| 1152 | |||
| 1153 | SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2 | ||
| 1154 | SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2 | ||
| 1155 | SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 | ||
| 1156 | STD c3,40(r_ptr) ; r[5] = c3; | ||
| 1157 | COPY %r0,c3 | ||
| 1158 | |||
| 1159 | SQR_ADD_C a3L,a3R,c1,c2,c3 | ||
| 1160 | SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3 | ||
| 1161 | SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3 | ||
| 1162 | SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3 | ||
| 1163 | STD c1,48(r_ptr) ; r[6] = c1; | ||
| 1164 | COPY %r0,c1 | ||
| 1165 | |||
| 1166 | SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1 | ||
| 1167 | SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1 | ||
| 1168 | SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1 | ||
| 1169 | SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1 | ||
| 1170 | STD c2,56(r_ptr) ; r[7] = c2; | ||
| 1171 | COPY %r0,c2 | ||
| 1172 | |||
| 1173 | SQR_ADD_C a4L,a4R,c3,c1,c2 | ||
| 1174 | SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2 | ||
| 1175 | SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2 | ||
| 1176 | SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2 | ||
| 1177 | STD c3,64(r_ptr) ; r[8] = c3; | ||
| 1178 | COPY %r0,c3 | ||
| 1179 | |||
| 1180 | SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3 | ||
| 1181 | SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3 | ||
| 1182 | SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3 | ||
| 1183 | STD c1,72(r_ptr) ; r[9] = c1; | ||
| 1184 | COPY %r0,c1 | ||
| 1185 | |||
| 1186 | SQR_ADD_C a5L,a5R,c2,c3,c1 | ||
| 1187 | SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1 | ||
| 1188 | SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1 | ||
| 1189 | STD c2,80(r_ptr) ; r[10] = c2; | ||
| 1190 | COPY %r0,c2 | ||
| 1191 | |||
| 1192 | SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2 | ||
| 1193 | SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2 | ||
| 1194 | STD c3,88(r_ptr) ; r[11] = c3; | ||
| 1195 | COPY %r0,c3 | ||
| 1196 | |||
| 1197 | SQR_ADD_C a6L,a6R,c1,c2,c3 | ||
| 1198 | SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3 | ||
| 1199 | STD c1,96(r_ptr) ; r[12] = c1; | ||
| 1200 | COPY %r0,c1 | ||
| 1201 | |||
| 1202 | SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1 | ||
| 1203 | STD c2,104(r_ptr) ; r[13] = c2; | ||
| 1204 | COPY %r0,c2 | ||
| 1205 | |||
| 1206 | SQR_ADD_C a7L,a7R,c3,c1,c2 | ||
| 1207 | STD c3, 112(r_ptr) ; r[14] = c3 | ||
| 1208 | STD c1, 120(r_ptr) ; r[15] = c1 | ||
| 1209 | |||
| 1210 | .EXIT | ||
| 1211 | LDD -104(%sp),%r6 ; restore r6 | ||
| 1212 | LDD -112(%sp),%r5 ; restore r5 | ||
| 1213 | LDD -120(%sp),%r4 ; restore r4 | ||
| 1214 | BVE (%rp) | ||
| 1215 | LDD,MB -128(%sp),%r3 | ||
| 1216 | |||
| 1217 | .PROCEND | ||
| 1218 | |||
| 1219 | ;----------------------------------------------------------------------------- | ||
| 1220 | ; | ||
| 1221 | ;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) | ||
| 1222 | ; arg0 = r_ptr | ||
| 1223 | ; arg1 = a_ptr | ||
| 1224 | ; | ||
| 1225 | |||
| 1226 | bn_sqr_comba4 | ||
| 1227 | .proc | ||
| 1228 | .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | ||
| 1229 | .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | ||
| 1230 | .entry | ||
| 1231 | .align 64 | ||
| 1232 | STD %r3,0(%sp) ; save r3 | ||
| 1233 | STD %r4,8(%sp) ; save r4 | ||
| 1234 | STD %r5,16(%sp) ; save r5 | ||
| 1235 | STD %r6,24(%sp) ; save r6 | ||
| 1236 | |||
| 1237 | ; | ||
| 1238 | ; Zero out carries | ||
| 1239 | ; | ||
| 1240 | COPY %r0,c1 | ||
| 1241 | COPY %r0,c2 | ||
| 1242 | COPY %r0,c3 | ||
| 1243 | |||
| 1244 | LDO 128(%sp),%sp ; bump stack | ||
| 1245 | DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L | ||
| 1246 | DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 | ||
| 1247 | |||
| 1248 | ; | ||
| 1249 | ; Load up all of the values we are going to use | ||
| 1250 | ; | ||
| 1251 | FLDD 0(a_ptr),a0 | ||
| 1252 | FLDD 8(a_ptr),a1 | ||
| 1253 | FLDD 16(a_ptr),a2 | ||
| 1254 | FLDD 24(a_ptr),a3 ; (a 4-word square uses only a[0]..a[3]) | ||
| 1259 | |||
| 1260 | SQR_ADD_C a0L,a0R,c1,c2,c3 | ||
| 1261 | |||
| 1262 | STD c1,0(r_ptr) ; r[0] = c1; | ||
| 1263 | COPY %r0,c1 | ||
| 1264 | |||
| 1265 | SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 | ||
| 1266 | |||
| 1267 | STD c2,8(r_ptr) ; r[1] = c2; | ||
| 1268 | COPY %r0,c2 | ||
| 1269 | |||
| 1270 | SQR_ADD_C a1L,a1R,c3,c1,c2 | ||
| 1271 | SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 | ||
| 1272 | |||
| 1273 | STD c3,16(r_ptr) ; r[2] = c3; | ||
| 1274 | COPY %r0,c3 | ||
| 1275 | |||
| 1276 | SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 | ||
| 1277 | SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 | ||
| 1278 | |||
| 1279 | STD c1,24(r_ptr) ; r[3] = c1; | ||
| 1280 | COPY %r0,c1 | ||
| 1281 | |||
| 1282 | SQR_ADD_C a2L,a2R,c2,c3,c1 | ||
| 1283 | SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 | ||
| 1284 | |||
| 1285 | STD c2,32(r_ptr) ; r[4] = c2; | ||
| 1286 | COPY %r0,c2 | ||
| 1287 | |||
| 1288 | SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 | ||
| 1289 | STD c3,40(r_ptr) ; r[5] = c3; | ||
| 1290 | COPY %r0,c3 | ||
| 1291 | |||
| 1292 | SQR_ADD_C a3L,a3R,c1,c2,c3 | ||
| 1293 | STD c1,48(r_ptr) ; r[6] = c1; | ||
| 1294 | STD c2,56(r_ptr) ; r[7] = c2; | ||
| 1295 | |||
| 1296 | .EXIT | ||
| 1297 | LDD -104(%sp),%r6 ; restore r6 | ||
| 1298 | LDD -112(%sp),%r5 ; restore r5 | ||
| 1299 | LDD -120(%sp),%r4 ; restore r4 | ||
| 1300 | BVE (%rp) | ||
| 1301 | LDD,MB -128(%sp),%r3 | ||
| 1302 | |||
| 1303 | .PROCEND | ||
| 1304 | |||
| 1305 | |||
| 1306 | ;--------------------------------------------------------------------------- | ||
| 1307 | |||
| 1308 | MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3 | ||
| 1309 | XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht | ||
| 1310 | FSTD ftemp1,-16(%sp) ; | ||
| 1311 | XMPYU A0R,B0L,ftemp2 ; m = bh*lt | ||
| 1312 | FSTD ftemp2,-8(%sp) ; | ||
| 1313 | XMPYU A0R,B0R,ftemp3 ; lt = bl*lt | ||
| 1314 | FSTD ftemp3,-32(%sp) | ||
| 1315 | XMPYU A0L,B0L,ftemp4 ; ht = bh*ht | ||
| 1316 | FSTD ftemp4,-24(%sp) ; | ||
| 1317 | |||
| 1318 | LDD -8(%sp),m ; r21 = m | ||
| 1319 | LDD -16(%sp),m1 ; r19 = m1 | ||
| 1320 | ADD,L m,m1,m ; m+m1 | ||
| 1321 | |||
| 1322 | DEPD,Z m,31,32,temp3 ; (m+m1<<32) | ||
| 1323 | LDD -24(%sp),ht ; r24 = ht | ||
| 1324 | |||
| 1325 | CMPCLR,*>>= m,m1,%r0 ; if (m < m1) | ||
| 1326 | ADD,L ht,high_one,ht ; ht+=high_one | ||
| 1327 | |||
| 1328 | EXTRD,U m,31,32,temp1 ; m >> 32 | ||
| 1329 | LDD -32(%sp),lt ; lt | ||
| 1330 | ADD,L ht,temp1,ht ; ht+= m>>32 | ||
| 1331 | ADD lt,temp3,lt ; lt = lt+m1 | ||
| 1332 | ADD,DC ht,%r0,ht ; ht++ | ||
| 1333 | |||
| 1334 | ADD C1,lt,C1 ; c1=c1+lt | ||
| 1335 | ADD,DC ht,%r0,ht ; add in carry (ht++) | ||
| 1336 | |||
| 1337 | ADD C2,ht,C2 ; c2 = c2 + ht | ||
| 1338 | ADD,DC C3,%r0,C3 ; add in carry (c3++) | ||
| 1339 | .endm | ||
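;
; MUL_ADD_C is the general (undoubled) form: (C1,C2,C3) += a*b.
; Hypothetical C sketch, with the typedefs used in the earlier
; sketches:
;
;   static void mul_add_c(BN_ULONG a, BN_ULONG b,
;                         BN_ULONG *c1, BN_ULONG *c2, BN_ULONG *c3)
;   {
;       unsigned __int128 t = (unsigned __int128)a * b;
;       BN_ULONG lt = (BN_ULONG)t, ht = (BN_ULONG)(t >> 64);
;       *c1 += lt;  ht += (*c1 < lt);
;       *c2 += ht;  *c3 += (*c2 < ht);
;   }
;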
| 1340 | |||
| 1341 | |||
| 1342 | ; | ||
| 1343 | ;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | ||
| 1344 | ; arg0 = r_ptr | ||
| 1345 | ; arg1 = a_ptr | ||
| 1346 | ; arg2 = b_ptr | ||
| 1347 | ; | ||
| 1348 | |||
| 1349 | bn_mul_comba8 | ||
| 1350 | .proc | ||
| 1351 | .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | ||
| 1352 | .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | ||
| 1353 | .entry | ||
| 1354 | .align 64 | ||
| 1355 | |||
| 1356 | STD %r3,0(%sp) ; save r3 | ||
| 1357 | STD %r4,8(%sp) ; save r4 | ||
| 1358 | STD %r5,16(%sp) ; save r5 | ||
| 1359 | STD %r6,24(%sp) ; save r6 | ||
| 1360 | FSTD %fr12,32(%sp) ; save fr12 | ||
| 1361 | FSTD %fr13,40(%sp) ; save fr13 | ||
| 1362 | |||
| 1363 | ; | ||
| 1364 | ; Zero out carries | ||
| 1365 | ; | ||
| 1366 | COPY %r0,c1 | ||
| 1367 | COPY %r0,c2 | ||
| 1368 | COPY %r0,c3 | ||
| 1369 | |||
| 1370 | LDO 128(%sp),%sp ; bump stack | ||
| 1371 | DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 | ||
| 1372 | |||
| 1373 | ; | ||
| 1374 | ; Load up all of the values we are going to use | ||
| 1375 | ; | ||
| 1376 | FLDD 0(a_ptr),a0 | ||
| 1377 | FLDD 8(a_ptr),a1 | ||
| 1378 | FLDD 16(a_ptr),a2 | ||
| 1379 | FLDD 24(a_ptr),a3 | ||
| 1380 | FLDD 32(a_ptr),a4 | ||
| 1381 | FLDD 40(a_ptr),a5 | ||
| 1382 | FLDD 48(a_ptr),a6 | ||
| 1383 | FLDD 56(a_ptr),a7 | ||
| 1384 | |||
| 1385 | FLDD 0(b_ptr),b0 | ||
| 1386 | FLDD 8(b_ptr),b1 | ||
| 1387 | FLDD 16(b_ptr),b2 | ||
| 1388 | FLDD 24(b_ptr),b3 | ||
| 1389 | FLDD 32(b_ptr),b4 | ||
| 1390 | FLDD 40(b_ptr),b5 | ||
| 1391 | FLDD 48(b_ptr),b6 | ||
| 1392 | FLDD 56(b_ptr),b7 | ||
| 1393 | |||
| 1394 | MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 | ||
| 1395 | STD c1,0(r_ptr) | ||
| 1396 | COPY %r0,c1 | ||
| 1397 | |||
| 1398 | MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 | ||
| 1399 | MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 | ||
| 1400 | STD c2,8(r_ptr) | ||
| 1401 | COPY %r0,c2 | ||
| 1402 | |||
| 1403 | MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 | ||
| 1404 | MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 | ||
| 1405 | MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 | ||
| 1406 | STD c3,16(r_ptr) | ||
| 1407 | COPY %r0,c3 | ||
| 1408 | |||
| 1409 | MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 | ||
| 1410 | MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 | ||
| 1411 | MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 | ||
| 1412 | MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 | ||
| 1413 | STD c1,24(r_ptr) | ||
| 1414 | COPY %r0,c1 | ||
| 1415 | |||
| 1416 | MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1 | ||
| 1417 | MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 | ||
| 1418 | MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 | ||
| 1419 | MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 | ||
| 1420 | MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1 | ||
| 1421 | STD c2,32(r_ptr) | ||
| 1422 | COPY %r0,c2 | ||
| 1423 | |||
| 1424 | MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2 | ||
| 1425 | MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2 | ||
| 1426 | MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 | ||
| 1427 | MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 | ||
| 1428 | MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2 | ||
| 1429 | MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2 | ||
| 1430 | STD c3,40(r_ptr) | ||
| 1431 | COPY %r0,c3 | ||
| 1432 | |||
| 1433 | MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3 | ||
| 1434 | MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3 | ||
| 1435 | MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3 | ||
| 1436 | MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 | ||
| 1437 | MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3 | ||
| 1438 | MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3 | ||
| 1439 | MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3 | ||
| 1440 | STD c1,48(r_ptr) | ||
| 1441 | COPY %r0,c1 | ||
| 1442 | |||
| 1443 | MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1 | ||
| 1444 | MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1 | ||
| 1445 | MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1 | ||
| 1446 | MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1 | ||
| 1447 | MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1 | ||
| 1448 | MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1 | ||
| 1449 | MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1 | ||
| 1450 | MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1 | ||
| 1451 | STD c2,56(r_ptr) | ||
| 1452 | COPY %r0,c2 | ||
| 1453 | |||
| 1454 | MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2 | ||
| 1455 | MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2 | ||
| 1456 | MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2 | ||
| 1457 | MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2 | ||
| 1458 | MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2 | ||
| 1459 | MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2 | ||
| 1460 | MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2 | ||
| 1461 | STD c3,64(r_ptr) | ||
| 1462 | COPY %r0,c3 | ||
| 1463 | |||
| 1464 | MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3 | ||
| 1465 | MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3 | ||
| 1466 | MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3 | ||
| 1467 | MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3 | ||
| 1468 | MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3 | ||
| 1469 | MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3 | ||
| 1470 | STD c1,72(r_ptr) | ||
| 1471 | COPY %r0,c1 | ||
| 1472 | |||
| 1473 | MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1 | ||
| 1474 | MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1 | ||
| 1475 | MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1 | ||
| 1476 | MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1 | ||
| 1477 | MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1 | ||
| 1478 | STD c2,80(r_ptr) | ||
| 1479 | COPY %r0,c2 | ||
| 1480 | |||
| 1481 | MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2 | ||
| 1482 | MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2 | ||
| 1483 | MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2 | ||
| 1484 | MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2 | ||
| 1485 | STD c3,88(r_ptr) | ||
| 1486 | COPY %r0,c3 | ||
| 1487 | |||
| 1488 | MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3 | ||
| 1489 | MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3 | ||
| 1490 | MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3 | ||
| 1491 | STD c1,96(r_ptr) | ||
| 1492 | COPY %r0,c1 | ||
| 1493 | |||
| 1494 | MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1 | ||
| 1495 | MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1 | ||
| 1496 | STD c2,104(r_ptr) | ||
| 1497 | COPY %r0,c2 | ||
| 1498 | |||
| 1499 | MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2 | ||
| 1500 | STD c3,112(r_ptr) | ||
| 1501 | STD c1,120(r_ptr) | ||
| 1502 | |||
| 1503 | .EXIT | ||
| 1504 | FLDD -88(%sp),%fr13 | ||
| 1505 | FLDD -96(%sp),%fr12 | ||
| 1506 | LDD -104(%sp),%r6 ; restore r6 | ||
| 1507 | LDD -112(%sp),%r5 ; restore r5 | ||
| 1508 | LDD -120(%sp),%r4 ; restore r4 | ||
| 1509 | BVE (%rp) | ||
| 1510 | LDD,MB -128(%sp),%r3 | ||
| 1511 | |||
| 1512 | .PROCEND | ||
| 1513 | |||
| 1514 | ;----------------------------------------------------------------------------- | ||
| 1515 | ; | ||
| 1516 | ;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | ||
| 1517 | ; arg0 = r_ptr | ||
| 1518 | ; arg1 = a_ptr | ||
| 1519 | ; arg2 = b_ptr | ||
| 1520 | ; | ||
| 1521 | |||
| 1522 | bn_mul_comba4 | ||
| 1523 | .proc | ||
| 1524 | .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | ||
| 1525 | .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | ||
| 1526 | .entry | ||
| 1527 | .align 64 | ||
| 1528 | |||
| 1529 | STD %r3,0(%sp) ; save r3 | ||
| 1530 | STD %r4,8(%sp) ; save r4 | ||
| 1531 | STD %r5,16(%sp) ; save r5 | ||
| 1532 | STD %r6,24(%sp) ; save r6 | ||
| 1533 | FSTD %fr12,32(%sp) ; save fr12 | ||
| 1534 | FSTD %fr13,40(%sp) ; save fr13 | ||
| 1535 | |||
| 1536 | ; | ||
| 1537 | ; Zero out carries | ||
| 1538 | ; | ||
| 1539 | COPY %r0,c1 | ||
| 1540 | COPY %r0,c2 | ||
| 1541 | COPY %r0,c3 | ||
| 1542 | |||
| 1543 | LDO 128(%sp),%sp ; bump stack | ||
| 1544 | DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 | ||
| 1545 | |||
| 1546 | ; | ||
| 1547 | ; Load up all of the values we are going to use | ||
| 1548 | ; | ||
| 1549 | FLDD 0(a_ptr),a0 | ||
| 1550 | FLDD 8(a_ptr),a1 | ||
| 1551 | FLDD 16(a_ptr),a2 | ||
| 1552 | FLDD 24(a_ptr),a3 | ||
| 1553 | |||
| 1554 | FLDD 0(b_ptr),b0 | ||
| 1555 | FLDD 8(b_ptr),b1 | ||
| 1556 | FLDD 16(b_ptr),b2 | ||
| 1557 | FLDD 24(b_ptr),b3 | ||
| 1558 | |||
| 1559 | MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 | ||
| 1560 | STD c1,0(r_ptr) | ||
| 1561 | COPY %r0,c1 | ||
| 1562 | |||
| 1563 | MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 | ||
| 1564 | MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 | ||
| 1565 | STD c2,8(r_ptr) | ||
| 1566 | COPY %r0,c2 | ||
| 1567 | |||
| 1568 | MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 | ||
| 1569 | MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 | ||
| 1570 | MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 | ||
| 1571 | STD c3,16(r_ptr) | ||
| 1572 | COPY %r0,c3 | ||
| 1573 | |||
| 1574 | MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 | ||
| 1575 | MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 | ||
| 1576 | MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 | ||
| 1577 | MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 | ||
| 1578 | STD c1,24(r_ptr) | ||
| 1579 | COPY %r0,c1 | ||
| 1580 | |||
| 1581 | MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 | ||
| 1582 | MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 | ||
| 1583 | MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 | ||
| 1584 | STD c2,32(r_ptr) | ||
| 1585 | COPY %r0,c2 | ||
| 1586 | |||
| 1587 | MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 | ||
| 1588 | MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 | ||
| 1589 | STD c3,40(r_ptr) | ||
| 1590 | COPY %r0,c3 | ||
| 1591 | |||
| 1592 | MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 | ||
| 1593 | STD c1,48(r_ptr) | ||
| 1594 | STD c2,56(r_ptr) | ||
| 1595 | |||
| 1596 | .EXIT | ||
| 1597 | FLDD -88(%sp),%fr13 | ||
| 1598 | FLDD -96(%sp),%fr12 | ||
| 1599 | LDD -104(%sp),%r6 ; restore r6 | ||
| 1600 | LDD -112(%sp),%r5 ; restore r5 | ||
| 1601 | LDD -120(%sp),%r4 ; restore r4 | ||
| 1602 | BVE (%rp) | ||
| 1603 | LDD,MB -128(%sp),%r3 | ||
| 1604 | |||
| 1605 | .PROCEND | ||
| 1606 | |||
| 1607 | |||
| 1608 | ;--- not PIC .SPACE $TEXT$ | ||
| 1609 | ;--- not PIC .SUBSPA $CODE$ | ||
| 1610 | ;--- not PIC .SPACE $PRIVATE$,SORT=16 | ||
| 1611 | ;--- not PIC .IMPORT $global$,DATA | ||
| 1612 | ;--- not PIC .SPACE $TEXT$ | ||
| 1613 | ;--- not PIC .SUBSPA $CODE$ | ||
| 1614 | ;--- not PIC .SUBSPA $LIT$,ACCESS=0x2c | ||
| 1615 | ;--- not PIC C$7 | ||
| 1616 | ;--- not PIC .ALIGN 8 | ||
| 1617 | ;--- not PIC .STRINGZ "Division would overflow (%d)\n" | ||
| 1618 | .END | ||
diff --git a/src/lib/libcrypto/bn/asm/pa-risc2W.s b/src/lib/libcrypto/bn/asm/pa-risc2W.s deleted file mode 100644 index a91f3ea5af..0000000000 --- a/src/lib/libcrypto/bn/asm/pa-risc2W.s +++ /dev/null | |||
| @@ -1,1605 +0,0 @@ | |||
| 1 | ; | ||
| 2 | ; PA-RISC 64-bit implementation of bn_asm code | ||
| 3 | ; | ||
| 4 | ; This code is approximately 2x faster than the C version | ||
| 5 | ; for RSA/DSA. | ||
| 6 | ; | ||
| 7 | ; See http://devresource.hp.com/ for more details on the PA-RISC | ||
| 8 | ; architecture. Also see the book "PA-RISC 2.0 Architecture" | ||
| 9 | ; by Gerry Kane for information on the instruction set architecture. | ||
| 10 | ; | ||
| 11 | ; Code written by Chris Ruemmler (with some help from the HP C | ||
| 12 | ; compiler). | ||
| 13 | ; | ||
| 14 | ; The code compiles with HP's assembler | ||
| 15 | ; | ||
| 16 | |||
| 17 | .level 2.0W | ||
| 18 | .space $TEXT$ | ||
| 19 | .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY | ||
| 20 | |||
| 21 | ; | ||
| 22 | ; Global Register definitions used for the routines. | ||
| 23 | ; | ||
| 24 | ; Some information about HP's runtime architecture for 64-bits. | ||
| 25 | ; | ||
| 26 | ; "Caller save" means the calling function must save the register | ||
| 27 | ; if it wants the register to be preserved. | ||
| 28 | ; "Callee save" means if a function uses the register, it must save | ||
| 29 | ; the value before using it. | ||
| 30 | ; | ||
| 31 | ; For the floating point registers | ||
| 32 | ; | ||
| 33 | ; "caller save" registers: fr4-fr11, fr22-fr31 | ||
| 34 | ; "callee save" registers: fr12-fr21 | ||
| 35 | ; "special" registers: fr0-fr3 (status and exception registers) | ||
| 36 | ; | ||
| 37 | ; For the integer registers | ||
| 38 | ; value zero : r0 | ||
| 39 | ; "caller save" registers: r1,r19-r26 | ||
| 40 | ; "callee save" registers: r3-r18 | ||
| 41 | ; return register : r2 (rp) | ||
| 42 | ; return value ; r28 (ret0) | ||
| 43 | ; Stack pointer ; r30 (sp) | ||
| 44 | ; global data pointer ; r27 (dp) | ||
| 45 | ; argument pointer ; r29 (ap) | ||
| 46 | ; millicode return ptr ; r31 (also a caller save register) | ||
| 47 | |||
| 48 | |||
| 49 | ; | ||
| 50 | ; Arguments to the routines | ||
| 51 | ; | ||
| 52 | r_ptr .reg %r26 | ||
| 53 | a_ptr .reg %r25 | ||
| 54 | b_ptr .reg %r24 | ||
| 55 | num .reg %r24 | ||
| 56 | w .reg %r23 | ||
| 57 | n .reg %r23 | ||
| 58 | |||
| 59 | |||
| 60 | ; | ||
| 61 | ; Globals used in some routines | ||
| 62 | ; | ||
| 63 | |||
| 64 | top_overflow .reg %r29 | ||
| 65 | high_mask .reg %r22 ; value 0xffffffff80000000L | ||
| 66 | |||
| 67 | |||
| 68 | ;------------------------------------------------------------------------------ | ||
| 69 | ; | ||
| 70 | ; bn_mul_add_words | ||
| 71 | ; | ||
| 72 | ;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr, | ||
| 73 | ; int num, BN_ULONG w) | ||
| 74 | ; | ||
| 75 | ; arg0 = r_ptr | ||
| 76 | ; arg1 = a_ptr | ||
| 77 | ; arg2 = num | ||
| 78 | ; arg3 = w | ||
| 79 | ; | ||
| 80 | ; Local register definitions | ||
| 81 | ; | ||
| 82 | |||
| 83 | fm1 .reg %fr22 | ||
| 84 | fm .reg %fr23 | ||
| 85 | ht_temp .reg %fr24 | ||
| 86 | ht_temp_1 .reg %fr25 | ||
| 87 | lt_temp .reg %fr26 | ||
| 88 | lt_temp_1 .reg %fr27 | ||
| 89 | fm1_1 .reg %fr28 | ||
| 90 | fm_1 .reg %fr29 | ||
| 91 | |||
| 92 | fw_h .reg %fr7L | ||
| 93 | fw_l .reg %fr7R | ||
| 94 | fw .reg %fr7 | ||
| 95 | |||
| 96 | fht_0 .reg %fr8L | ||
| 97 | flt_0 .reg %fr8R | ||
| 98 | t_float_0 .reg %fr8 | ||
| 99 | |||
| 100 | fht_1 .reg %fr9L | ||
| 101 | flt_1 .reg %fr9R | ||
| 102 | t_float_1 .reg %fr9 | ||
| 103 | |||
| 104 | tmp_0 .reg %r31 | ||
| 105 | tmp_1 .reg %r21 | ||
| 106 | m_0 .reg %r20 | ||
| 107 | m_1 .reg %r19 | ||
| 108 | ht_0 .reg %r1 | ||
| 109 | ht_1 .reg %r3 | ||
| 110 | lt_0 .reg %r4 | ||
| 111 | lt_1 .reg %r5 | ||
| 112 | m1_0 .reg %r6 | ||
| 113 | m1_1 .reg %r7 | ||
| 114 | rp_val .reg %r8 | ||
| 115 | rp_val_1 .reg %r9 | ||
| 116 | |||
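For reference, a minimal C model of what this removed routine computed; an
illustrative sketch only, assuming a 64-bit BN_ULONG and a compiler that
provides unsigned __int128 (neither is from this file):

	#include <stdint.h>

	typedef uint64_t BN_ULONG;

	BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap,
	    int num, BN_ULONG w)
	{
		BN_ULONG c = 0;

		while (num-- > 0) {
			/* 64x64 -> 128-bit multiply-accumulate with carry */
			unsigned __int128 t =
			    (unsigned __int128)*ap++ * w + *rp + c;

			*rp++ = (BN_ULONG)t;     /* low 64 bits back into r[] */
			c = (BN_ULONG)(t >> 64); /* high 64 bits carry forward */
		}
		return c;
	}

The assembly builds the same 128-bit products out of XMPYU, which multiplies
the 32-bit halves held in the L/R parts of a floating point register.
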
| 117 | bn_mul_add_words | ||
| 118 | .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN | ||
| 119 | .proc | ||
| 120 | .callinfo frame=128 | ||
| 121 | .entry | ||
| 122 | .align 64 | ||
| 123 | |||
| 124 | STD %r3,0(%sp) ; save r3 | ||
| 125 | STD %r4,8(%sp) ; save r4 | ||
| 126 | NOP ; Needed to make the loop 16-byte aligned | ||
| 127 | NOP ; Needed to make the loop 16-byte aligned | ||
| 128 | |||
| 129 | STD %r5,16(%sp) ; save r5 | ||
| 130 | STD %r6,24(%sp) ; save r6 | ||
| 131 | STD %r7,32(%sp) ; save r7 | ||
| 132 | STD %r8,40(%sp) ; save r8 | ||
| 133 | |||
| 134 | STD %r9,48(%sp) ; save r9 | ||
| 135 | COPY %r0,%ret0 ; return 0 by default | ||
| 136 | DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 | ||
| 137 | STD w,56(%sp) ; store w on stack | ||
| 138 | |||
| 139 | CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit | ||
| 140 | LDO 128(%sp),%sp ; bump stack | ||
| 141 | |||
| 142 | ; | ||
| 143 | ; The loop is unrolled twice, so if there is only 1 number | ||
| 144 | ; then go straight to the cleanup code. | ||
| 145 | ; | ||
| 146 | CMPIB,= 1,num,bn_mul_add_words_single_top | ||
| 147 | FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l) | ||
| 148 | |||
| 149 | ; | ||
| 150 | ; This loop is unrolled 2 times (64-byte aligned as well) | ||
| 151 | ; | ||
| 152 | ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus | ||
| 153 | ; two 32-bit multiplies can be issued per cycle. | ||
| 154 | ; | ||
| 155 | bn_mul_add_words_unroll2 | ||
| 156 | |||
| 157 | FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) | ||
| 158 | FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R) | ||
| 159 | LDD 0(r_ptr),rp_val ; rp[0] | ||
| 160 | LDD 8(r_ptr),rp_val_1 ; rp[1] | ||
| 161 | |||
| 162 | XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l | ||
| 163 | XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l | ||
| 164 | FSTD fm1,-16(%sp) ; -16(sp) = m1[0] | ||
| 165 | FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1] | ||
| 166 | |||
| 167 | XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h | ||
| 168 | XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h | ||
| 169 | FSTD fm,-8(%sp) ; -8(sp) = m[0] | ||
| 170 | FSTD fm_1,-40(%sp) ; -40(sp) = m[1] | ||
| 171 | |||
| 172 | XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h | ||
| 173 | XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h | ||
| 174 | FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp | ||
| 175 | FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1 | ||
| 176 | |||
| 177 | XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l | ||
| 178 | XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp_1 = lt*fw_l | ||
| 179 | FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp | ||
| 180 | FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1 | ||
| 181 | |||
| 182 | LDD -8(%sp),m_0 ; m[0] | ||
| 183 | LDD -40(%sp),m_1 ; m[1] | ||
| 184 | LDD -16(%sp),m1_0 ; m1[0] | ||
| 185 | LDD -48(%sp),m1_1 ; m1[1] | ||
| 186 | |||
| 187 | LDD -24(%sp),ht_0 ; ht[0] | ||
| 188 | LDD -56(%sp),ht_1 ; ht[1] | ||
| 189 | ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0]; | ||
| 190 | ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1]; | ||
| 191 | |||
| 192 | LDD -32(%sp),lt_0 | ||
| 193 | LDD -64(%sp),lt_1 | ||
| 194 | CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0]) | ||
| 195 | ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32) | ||
| 196 | |||
| 197 | CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1]) | ||
| 198 | ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32) | ||
| 199 | EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32 | ||
| 200 | DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32 | ||
| 201 | |||
| 202 | EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32 | ||
| 203 | DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32 | ||
| 204 | ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32) | ||
| 205 | ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32) | ||
| 206 | |||
| 207 | ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0]; | ||
| 208 | ADD,DC ht_0,%r0,ht_0 ; ht[0]++ | ||
| 209 | ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1]; | ||
| 210 | ADD,DC ht_1,%r0,ht_1 ; ht[1]++ | ||
| 211 | |||
| 212 | ADD %ret0,lt_0,lt_0 ; lt[0] = lt[0] + c; | ||
| 213 | ADD,DC ht_0,%r0,ht_0 ; ht[0]++ | ||
| 214 | ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0] | ||
| 215 | ADD,DC ht_0,%r0,ht_0 ; ht[0]++ | ||
| 216 | |||
| 217 | LDO -2(num),num ; num = num - 2; | ||
| 218 | ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c); | ||
| 219 | ADD,DC ht_1,%r0,ht_1 ; ht[1]++ | ||
| 220 | STD lt_0,0(r_ptr) ; rp[0] = lt[0] | ||
| 221 | |||
| 222 | ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1] | ||
| 223 | ADD,DC ht_1,%r0,%ret0 ; ht[1]++ | ||
| 224 | LDO 16(a_ptr),a_ptr ; a_ptr += 2 | ||
| 225 | |||
| 226 | STD lt_1,8(r_ptr) ; rp[1] = lt[1] | ||
| 227 | CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do | ||
| 228 | LDO 16(r_ptr),r_ptr ; r_ptr += 2 | ||
| 229 | |||
| 230 | CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one | ||
| 231 | |||
| 232 | ; | ||
| 233 | ; Top of loop aligned on 64-byte boundary | ||
| 234 | ; | ||
| 235 | bn_mul_add_words_single_top | ||
| 236 | FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) | ||
| 237 | LDD 0(r_ptr),rp_val ; rp[0] | ||
| 238 | LDO 8(a_ptr),a_ptr ; a_ptr++ | ||
| 239 | XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l | ||
| 240 | FSTD fm1,-16(%sp) ; -16(sp) = m1 | ||
| 241 | XMPYU flt_0,fw_h,fm ; m = lt*fw_h | ||
| 242 | FSTD fm,-8(%sp) ; -8(sp) = m | ||
| 243 | XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h | ||
| 244 | FSTD ht_temp,-24(%sp) ; -24(sp) = ht | ||
| 245 | XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l | ||
| 246 | FSTD lt_temp,-32(%sp) ; -32(sp) = lt | ||
| 247 | |||
| 248 | LDD -8(%sp),m_0 | ||
| 249 | LDD -16(%sp),m1_0 ; m1 = temp1 | ||
| 250 | ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; | ||
| 251 | LDD -24(%sp),ht_0 | ||
| 252 | LDD -32(%sp),lt_0 | ||
| 253 | |||
| 254 | CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) | ||
| 255 | ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) | ||
| 256 | |||
| 257 | EXTRD,U tmp_0,31,32,m_0 ; m>>32 | ||
| 258 | DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 | ||
| 259 | |||
| 260 | ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) | ||
| 261 | ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1; | ||
| 262 | ADD,DC ht_0,%r0,ht_0 ; ht++ | ||
| 263 | ADD %ret0,tmp_0,lt_0 ; lt = lt + c; | ||
| 264 | ADD,DC ht_0,%r0,ht_0 ; ht++ | ||
| 265 | ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0] | ||
| 266 | ADD,DC ht_0,%r0,%ret0 ; ht++ | ||
| 267 | STD lt_0,0(r_ptr) ; rp[0] = lt | ||
| 268 | |||
| 269 | bn_mul_add_words_exit | ||
| 270 | .EXIT | ||
| 271 | LDD -80(%sp),%r9 ; restore r9 | ||
| 272 | LDD -88(%sp),%r8 ; restore r8 | ||
| 273 | LDD -96(%sp),%r7 ; restore r7 | ||
| 274 | LDD -104(%sp),%r6 ; restore r6 | ||
| 275 | LDD -112(%sp),%r5 ; restore r5 | ||
| 276 | LDD -120(%sp),%r4 ; restore r4 | ||
| 277 | BVE (%rp) | ||
| 278 | LDD,MB -128(%sp),%r3 ; restore r3 | ||
| 279 | .PROCEND ;in=23,24,25,26,29;out=28; | ||
| 280 | |||
| 281 | ;---------------------------------------------------------------------------- | ||
| 282 | ; | ||
| 283 | ;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | ||
| 284 | ; | ||
| 285 | ; arg0 = rp | ||
| 286 | ; arg1 = ap | ||
| 287 | ; arg2 = num | ||
| 288 | ; arg3 = w | ||
| 289 | |||
| 290 | bn_mul_words | ||
| 291 | .proc | ||
| 292 | .callinfo frame=128 | ||
| 293 | .entry | ||
| 294 | .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | ||
| 295 | .align 64 | ||
| 296 | |||
| 297 | STD %r3,0(%sp) ; save r3 | ||
| 298 | STD %r4,8(%sp) ; save r4 | ||
| 299 | STD %r5,16(%sp) ; save r5 | ||
| 300 | STD %r6,24(%sp) ; save r6 | ||
| 301 | |||
| 302 | STD %r7,32(%sp) ; save r7 | ||
| 303 | COPY %r0,%ret0 ; return 0 by default | ||
| 304 | DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 | ||
| 305 | STD w,56(%sp) ; w on stack | ||
| 306 | |||
| 307 | CMPIB,>= 0,num,bn_mul_words_exit | ||
| 308 | LDO 128(%sp),%sp ; bump stack | ||
| 309 | |||
| 310 | ; | ||
| 311 | ; See if only 1 word to do, thus just do cleanup | ||
| 312 | ; | ||
| 313 | CMPIB,= 1,num,bn_mul_words_single_top | ||
| 314 | FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l) | ||
| 315 | |||
| 316 | ; | ||
| 317 | ; This loop is unrolled 2 times (64-byte aligned as well) | ||
| 318 | ; | ||
| 319 | ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus | ||
| 320 | ; two 32-bit multiplies can be issued per cycle. | ||
| 321 | ; | ||
| 322 | bn_mul_words_unroll2 | ||
| 323 | |||
| 324 | FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) | ||
| 325 | FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R) | ||
| 326 | XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l | ||
| 327 | XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l | ||
| 328 | |||
| 329 | FSTD fm1,-16(%sp) ; -16(sp) = m1 | ||
| 330 | FSTD fm1_1,-48(%sp) ; -48(sp) = m1 | ||
| 331 | XMPYU flt_0,fw_h,fm ; m = lt*fw_h | ||
| 332 | XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h | ||
| 333 | |||
| 334 | FSTD fm,-8(%sp) ; -8(sp) = m | ||
| 335 | FSTD fm_1,-40(%sp) ; -40(sp) = m | ||
| 336 | XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h | ||
| 337 | XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = ht*fw_h | ||
| 338 | |||
| 339 | FSTD ht_temp,-24(%sp) ; -24(sp) = ht | ||
| 340 | FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht | ||
| 341 | XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l | ||
| 342 | XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l | ||
| 343 | |||
| 344 | FSTD lt_temp,-32(%sp) ; -32(sp) = lt | ||
| 345 | FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt | ||
| 346 | LDD -8(%sp),m_0 | ||
| 347 | LDD -40(%sp),m_1 | ||
| 348 | |||
| 349 | LDD -16(%sp),m1_0 | ||
| 350 | LDD -48(%sp),m1_1 | ||
| 351 | LDD -24(%sp),ht_0 | ||
| 352 | LDD -56(%sp),ht_1 | ||
| 353 | |||
| 354 | ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1; | ||
| 355 | ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1; | ||
| 356 | LDD -32(%sp),lt_0 | ||
| 357 | LDD -64(%sp),lt_1 | ||
| 358 | |||
| 359 | CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1) | ||
| 360 | ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) | ||
| 361 | CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1) | ||
| 362 | ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32) | ||
| 363 | |||
| 364 | EXTRD,U tmp_0,31,32,m_0 ; m>>32 | ||
| 365 | DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 | ||
| 366 | EXTRD,U tmp_1,31,32,m_1 ; m>>32 | ||
| 367 | DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32 | ||
| 368 | |||
| 369 | ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) | ||
| 370 | ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32) | ||
| 371 | ADD lt_0,m1_0,lt_0 ; lt = lt+m1; | ||
| 372 | ADD,DC ht_0,%r0,ht_0 ; ht++ | ||
| 373 | |||
| 374 | ADD lt_1,m1_1,lt_1 ; lt = lt+m1; | ||
| 375 | ADD,DC ht_1,%r0,ht_1 ; ht++ | ||
| 376 | ADD %ret0,lt_0,lt_0 ; lt = lt + c (ret0); | ||
| 377 | ADD,DC ht_0,%r0,ht_0 ; ht++ | ||
| 378 | |||
| 379 | ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0) | ||
| 380 | ADD,DC ht_1,%r0,ht_1 ; ht++ | ||
| 381 | STD lt_0,0(r_ptr) ; rp[0] = lt | ||
| 382 | STD lt_1,8(r_ptr) ; rp[1] = lt | ||
| 383 | |||
| 384 | COPY ht_1,%ret0 ; carry = ht | ||
| 385 | LDO -2(num),num ; num = num - 2; | ||
| 386 | LDO 16(a_ptr),a_ptr ; ap += 2 | ||
| 387 | CMPIB,<= 2,num,bn_mul_words_unroll2 | ||
| 388 | LDO 16(r_ptr),r_ptr ; rp += 2 | ||
| 389 | |||
| 390 | CMPIB,=,N 0,num,bn_mul_words_exit ; are we done? | ||
| 391 | |||
| 392 | ; | ||
| 393 | ; Top of loop aligned on 64-byte boundary | ||
| 394 | ; | ||
| 395 | bn_mul_words_single_top | ||
| 396 | FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) | ||
| 397 | |||
| 398 | XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l | ||
| 399 | FSTD fm1,-16(%sp) ; -16(sp) = m1 | ||
| 400 | XMPYU flt_0,fw_h,fm ; m = lt*fw_h | ||
| 401 | FSTD fm,-8(%sp) ; -8(sp) = m | ||
| 402 | XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h | ||
| 403 | FSTD ht_temp,-24(%sp) ; -24(sp) = ht | ||
| 404 | XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l | ||
| 405 | FSTD lt_temp,-32(%sp) ; -32(sp) = lt | ||
| 406 | |||
| 407 | LDD -8(%sp),m_0 | ||
| 408 | LDD -16(%sp),m1_0 | ||
| 409 | ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; | ||
| 410 | LDD -24(%sp),ht_0 | ||
| 411 | LDD -32(%sp),lt_0 | ||
| 412 | |||
| 413 | CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) | ||
| 414 | ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) | ||
| 415 | |||
| 416 | EXTRD,U tmp_0,31,32,m_0 ; m>>32 | ||
| 417 | DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 | ||
| 418 | |||
| 419 | ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) | ||
| 420 | ADD lt_0,m1_0,lt_0 ; lt= lt+m1; | ||
| 421 | ADD,DC ht_0,%r0,ht_0 ; ht++ | ||
| 422 | |||
| 423 | ADD %ret0,lt_0,lt_0 ; lt = lt + c; | ||
| 424 | ADD,DC ht_0,%r0,ht_0 ; ht++ | ||
| 425 | |||
| 426 | COPY ht_0,%ret0 ; copy carry | ||
| 427 | STD lt_0,0(r_ptr) ; rp[0] = lt | ||
| 428 | |||
| 429 | bn_mul_words_exit | ||
| 430 | .EXIT | ||
| 431 | LDD -96(%sp),%r7 ; restore r7 | ||
| 432 | LDD -104(%sp),%r6 ; restore r6 | ||
| 433 | LDD -112(%sp),%r5 ; restore r5 | ||
| 434 | LDD -120(%sp),%r4 ; restore r4 | ||
| 435 | BVE (%rp) | ||
| 436 | LDD,MB -128(%sp),%r3 ; restore r3 | ||
| 437 | .PROCEND ;in=23,24,25,26,29;out=28; | ||
| 438 | |||
| 439 | ;---------------------------------------------------------------------------- | ||
| 440 | ; | ||
| 441 | ;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num) | ||
| 442 | ; | ||
| 443 | ; arg0 = rp | ||
| 444 | ; arg1 = ap | ||
| 445 | ; arg2 = num | ||
| 446 | ; | ||
| 447 | |||
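Each input word squares into two output words, so rp must have room for
2*num words; a sketch under the same assumptions as the models above:

	void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num)
	{
		while (num-- > 0) {
			unsigned __int128 t = (unsigned __int128)*ap * *ap;

			ap++;
			*rp++ = (BN_ULONG)t;         /* low half of ap[i]^2 */
			*rp++ = (BN_ULONG)(t >> 64); /* high half */
		}
	}
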
| 448 | bn_sqr_words | ||
| 449 | .proc | ||
| 450 | .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | ||
| 451 | .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | ||
| 452 | .entry | ||
| 453 | .align 64 | ||
| 454 | |||
| 455 | STD %r3,0(%sp) ; save r3 | ||
| 456 | STD %r4,8(%sp) ; save r4 | ||
| 457 | NOP | ||
| 458 | STD %r5,16(%sp) ; save r5 | ||
| 459 | |||
| 460 | CMPIB,>= 0,num,bn_sqr_words_exit | ||
| 461 | LDO 128(%sp),%sp ; bump stack | ||
| 462 | |||
| 463 | ; | ||
| 464 | ; If only 1, then go straight to cleanup | ||
| 465 | ; | ||
| 466 | CMPIB,= 1,num,bn_sqr_words_single_top | ||
| 467 | DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L | ||
| 468 | |||
| 469 | ; | ||
| 470 | ; This loop is unrolled 2 times (64-byte aligned as well) | ||
| 471 | ; | ||
| 472 | |||
| 473 | bn_sqr_words_unroll2 | ||
| 474 | FLDD 0(a_ptr),t_float_0 ; a[0] | ||
| 475 | FLDD 8(a_ptr),t_float_1 ; a[1] | ||
| 476 | XMPYU fht_0,flt_0,fm ; m[0] | ||
| 477 | XMPYU fht_1,flt_1,fm_1 ; m[1] | ||
| 478 | |||
| 479 | FSTD fm,-24(%sp) ; store m[0] | ||
| 480 | FSTD fm_1,-56(%sp) ; store m[1] | ||
| 481 | XMPYU flt_0,flt_0,lt_temp ; lt[0] | ||
| 482 | XMPYU flt_1,flt_1,lt_temp_1 ; lt[1] | ||
| 483 | |||
| 484 | FSTD lt_temp,-16(%sp) ; store lt[0] | ||
| 485 | FSTD lt_temp_1,-48(%sp) ; store lt[1] | ||
| 486 | XMPYU fht_0,fht_0,ht_temp ; ht[0] | ||
| 487 | XMPYU fht_1,fht_1,ht_temp_1 ; ht[1] | ||
| 488 | |||
| 489 | FSTD ht_temp,-8(%sp) ; store ht[0] | ||
| 490 | FSTD ht_temp_1,-40(%sp) ; store ht[1] | ||
| 491 | LDD -24(%sp),m_0 | ||
| 492 | LDD -56(%sp),m_1 | ||
| 493 | |||
| 494 | AND m_0,high_mask,tmp_0 ; m[0] & Mask | ||
| 495 | AND m_1,high_mask,tmp_1 ; m[1] & Mask | ||
| 496 | DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1 | ||
| 497 | DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1 | ||
| 498 | |||
| 499 | LDD -16(%sp),lt_0 | ||
| 500 | LDD -48(%sp),lt_1 | ||
| 501 | EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1 | ||
| 502 | EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1 | ||
| 503 | |||
| 504 | LDD -8(%sp),ht_0 | ||
| 505 | LDD -40(%sp),ht_1 | ||
| 506 | ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0 | ||
| 507 | ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1 | ||
| 508 | |||
| 509 | ADD lt_0,m_0,lt_0 ; lt = lt+m | ||
| 510 | ADD,DC ht_0,%r0,ht_0 ; ht[0]++ | ||
| 511 | STD lt_0,0(r_ptr) ; rp[0] = lt[0] | ||
| 512 | STD ht_0,8(r_ptr) ; rp[1] = ht[0] | ||
| 513 | |||
| 514 | ADD lt_1,m_1,lt_1 ; lt = lt+m | ||
| 515 | ADD,DC ht_1,%r0,ht_1 ; ht[1]++ | ||
| 516 | STD lt_1,16(r_ptr) ; rp[2] = lt[1] | ||
| 517 | STD ht_1,24(r_ptr) ; rp[3] = ht[1] | ||
| 518 | |||
| 519 | LDO -2(num),num ; num = num - 2; | ||
| 520 | LDO 16(a_ptr),a_ptr ; ap += 2 | ||
| 521 | CMPIB,<= 2,num,bn_sqr_words_unroll2 | ||
| 522 | LDO 32(r_ptr),r_ptr ; rp += 4 | ||
| 523 | |||
| 524 | CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done? | ||
| 525 | |||
| 526 | ; | ||
| 527 | ; Top of loop aligned on 64-byte boundary | ||
| 528 | ; | ||
| 529 | bn_sqr_words_single_top | ||
| 530 | FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) | ||
| 531 | |||
| 532 | XMPYU fht_0,flt_0,fm ; m | ||
| 533 | FSTD fm,-24(%sp) ; store m | ||
| 534 | |||
| 535 | XMPYU flt_0,flt_0,lt_temp ; lt | ||
| 536 | FSTD lt_temp,-16(%sp) ; store lt | ||
| 537 | |||
| 538 | XMPYU fht_0,fht_0,ht_temp ; ht | ||
| 539 | FSTD ht_temp,-8(%sp) ; store ht | ||
| 540 | |||
| 541 | LDD -24(%sp),m_0 ; load m | ||
| 542 | AND m_0,high_mask,tmp_0 ; m & Mask | ||
| 543 | DEPD,Z m_0,30,31,m_0 ; m << 32+1 | ||
| 544 | LDD -16(%sp),lt_0 ; lt | ||
| 545 | |||
| 546 | LDD -8(%sp),ht_0 ; ht | ||
| 547 | EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1 | ||
| 548 | ADD m_0,lt_0,lt_0 ; lt = lt+m | ||
| 549 | ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0 | ||
| 550 | ADD,DC ht_0,%r0,ht_0 ; ht++ | ||
| 551 | |||
| 552 | STD lt_0,0(r_ptr) ; rp[0] = lt | ||
| 553 | STD ht_0,8(r_ptr) ; rp[1] = ht | ||
| 554 | |||
| 555 | bn_sqr_words_exit | ||
| 556 | .EXIT | ||
| 557 | LDD -112(%sp),%r5 ; restore r5 | ||
| 558 | LDD -120(%sp),%r4 ; restore r4 | ||
| 559 | BVE (%rp) | ||
| 560 | LDD,MB -128(%sp),%r3 | ||
| 561 | .PROCEND ;in=23,24,25,26,29;out=28; | ||
| 562 | |||
| 563 | |||
| 564 | ;---------------------------------------------------------------------------- | ||
| 565 | ; | ||
| 566 | ;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) | ||
| 567 | ; | ||
| 568 | ; arg0 = rp | ||
| 569 | ; arg1 = ap | ||
| 570 | ; arg2 = bp | ||
| 571 | ; arg3 = n | ||
| 572 | |||
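A C model of the add-with-carry chain the unrolled loop below implements
(sketch only, BN_ULONG as above):

	BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a,
	    const BN_ULONG *b, int n)
	{
		BN_ULONG c = 0;

		while (n-- > 0) {
			BN_ULONG t = *a++ + c;
			BN_ULONG s;

			c = (t < c);    /* carry out of a[i] + c */
			s = t + *b++;
			c += (s < t);   /* carry out of t + b[i] */
			*r++ = s;
		}
		return c;               /* final carry, 0 or 1 */
	}
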
| 573 | t .reg %r22 | ||
| 574 | b .reg %r21 | ||
| 575 | l .reg %r20 | ||
| 576 | |||
| 577 | bn_add_words | ||
| 578 | .proc | ||
| 579 | .entry | ||
| 580 | .callinfo | ||
| 581 | .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | ||
| 582 | .align 64 | ||
| 583 | |||
| 584 | CMPIB,>= 0,n,bn_add_words_exit | ||
| 585 | COPY %r0,%ret0 ; return 0 by default | ||
| 586 | |||
| 587 | ; | ||
| 588 | ; If 2 or more numbers, do the loop | ||
| 589 | ; | ||
| 590 | CMPIB,= 1,n,bn_add_words_single_top | ||
| 591 | NOP | ||
| 592 | |||
| 593 | ; | ||
| 594 | ; This loop is unrolled 2 times (64-byte aligned as well) | ||
| 595 | ; | ||
| 596 | bn_add_words_unroll2 | ||
| 597 | LDD 0(a_ptr),t | ||
| 598 | LDD 0(b_ptr),b | ||
| 599 | ADD t,%ret0,t ; t = t+c; | ||
| 600 | ADD,DC %r0,%r0,%ret0 ; set c to carry | ||
| 601 | ADD t,b,l ; l = t + b[0] | ||
| 602 | ADD,DC %ret0,%r0,%ret0 ; c+= carry | ||
| 603 | STD l,0(r_ptr) | ||
| 604 | |||
| 605 | LDD 8(a_ptr),t | ||
| 606 | LDD 8(b_ptr),b | ||
| 607 | ADD t,%ret0,t ; t = t+c; | ||
| 608 | ADD,DC %r0,%r0,%ret0 ; set c to carry | ||
| 609 | ADD t,b,l ; l = t + b[1] | ||
| 610 | ADD,DC %ret0,%r0,%ret0 ; c+= carry | ||
| 611 | STD l,8(r_ptr) | ||
| 612 | |||
| 613 | LDO -2(n),n | ||
| 614 | LDO 16(a_ptr),a_ptr | ||
| 615 | LDO 16(b_ptr),b_ptr | ||
| 616 | |||
| 617 | CMPIB,<= 2,n,bn_add_words_unroll2 | ||
| 618 | LDO 16(r_ptr),r_ptr | ||
| 619 | |||
| 620 | CMPIB,=,N 0,n,bn_add_words_exit ; are we done? | ||
| 621 | |||
| 622 | bn_add_words_single_top | ||
| 623 | LDD 0(a_ptr),t | ||
| 624 | LDD 0(b_ptr),b | ||
| 625 | |||
| 626 | ADD t,%ret0,t ; t = t+c; | ||
| 627 | ADD,DC %r0,%r0,%ret0 ; set c to carry (could use CMPCLR??) | ||
| 628 | ADD t,b,l ; l = t + b[0] | ||
| 629 | ADD,DC %ret0,%r0,%ret0 ; c+= carry | ||
| 630 | STD l,0(r_ptr) | ||
| 631 | |||
| 632 | bn_add_words_exit | ||
| 633 | .EXIT | ||
| 634 | BVE (%rp) | ||
| 635 | NOP | ||
| 636 | .PROCEND ;in=23,24,25,26,29;out=28; | ||
| 637 | |||
| 638 | ;---------------------------------------------------------------------------- | ||
| 639 | ; | ||
| 640 | ;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) | ||
| 641 | ; | ||
| 642 | ; arg0 = rp | ||
| 643 | ; arg1 = ap | ||
| 644 | ; arg2 = bp | ||
| 645 | ; arg3 = n | ||
| 646 | |||
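The matching borrow chain; the CMPCLR pair in the loop below encodes the
same "borrow iff a[i] < b[i], or a[i] == b[i] with a borrow pending" rule
(sketch only):

	BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a,
	    const BN_ULONG *b, int n)
	{
		BN_ULONG c = 0;

		while (n-- > 0) {
			BN_ULONG t1 = *a++, t2 = *b++;

			*r++ = t1 - t2 - c;
			c = (t1 < t2) || (t1 == t2 && c);  /* borrow out */
		}
		return c;
	}
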
| 647 | t1 .reg %r22 | ||
| 648 | t2 .reg %r21 | ||
| 649 | sub_tmp1 .reg %r20 | ||
| 650 | sub_tmp2 .reg %r19 | ||
| 651 | |||
| 652 | |||
| 653 | bn_sub_words | ||
| 654 | .proc | ||
| 655 | .callinfo | ||
| 656 | .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | ||
| 657 | .entry | ||
| 658 | .align 64 | ||
| 659 | |||
| 660 | CMPIB,>= 0,n,bn_sub_words_exit | ||
| 661 | COPY %r0,%ret0 ; return 0 by default | ||
| 662 | |||
| 663 | ; | ||
| 664 | ; If 2 or more numbers, do the loop | ||
| 665 | ; | ||
| 666 | CMPIB,= 1,n,bn_sub_words_single_top | ||
| 667 | NOP | ||
| 668 | |||
| 669 | ; | ||
| 670 | ; This loop is unrolled 2 times (64-byte aligned as well) | ||
| 671 | ; | ||
| 672 | bn_sub_words_unroll2 | ||
| 673 | LDD 0(a_ptr),t1 | ||
| 674 | LDD 0(b_ptr),t2 | ||
| 675 | SUB t1,t2,sub_tmp1 ; t3 = t1-t2; | ||
| 676 | SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c; | ||
| 677 | |||
| 678 | CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 | ||
| 679 | LDO 1(%r0),sub_tmp2 | ||
| 680 | |||
| 681 | CMPCLR,*= t1,t2,%r0 | ||
| 682 | COPY sub_tmp2,%ret0 | ||
| 683 | STD sub_tmp1,0(r_ptr) | ||
| 684 | |||
| 685 | LDD 8(a_ptr),t1 | ||
| 686 | LDD 8(b_ptr),t2 | ||
| 687 | SUB t1,t2,sub_tmp1 ; t3 = t1-t2; | ||
| 688 | SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c; | ||
| 689 | CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 | ||
| 690 | LDO 1(%r0),sub_tmp2 | ||
| 691 | |||
| 692 | CMPCLR,*= t1,t2,%r0 | ||
| 693 | COPY sub_tmp2,%ret0 | ||
| 694 | STD sub_tmp1,8(r_ptr) | ||
| 695 | |||
| 696 | LDO -2(n),n | ||
| 697 | LDO 16(a_ptr),a_ptr | ||
| 698 | LDO 16(b_ptr),b_ptr | ||
| 699 | |||
| 700 | CMPIB,<= 2,n,bn_sub_words_unroll2 | ||
| 701 | LDO 16(r_ptr),r_ptr | ||
| 702 | |||
| 703 | CMPIB,=,N 0,n,bn_sub_words_exit ; are we done? | ||
| 704 | |||
| 705 | bn_sub_words_single_top | ||
| 706 | LDD 0(a_ptr),t1 | ||
| 707 | LDD 0(b_ptr),t2 | ||
| 708 | SUB t1,t2,sub_tmp1 ; t3 = t1-t2; | ||
| 709 | SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c; | ||
| 710 | CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 | ||
| 711 | LDO 1(%r0),sub_tmp2 | ||
| 712 | |||
| 713 | CMPCLR,*= t1,t2,%r0 | ||
| 714 | COPY sub_tmp2,%ret0 | ||
| 715 | |||
| 716 | STD sub_tmp1,0(r_ptr) | ||
| 717 | |||
| 718 | bn_sub_words_exit | ||
| 719 | .EXIT | ||
| 720 | BVE (%rp) | ||
| 721 | NOP | ||
| 722 | .PROCEND ;in=23,24,25,26,29;out=28; | ||
| 723 | |||
| 724 | ;------------------------------------------------------------------------------ | ||
| 725 | ; | ||
| 726 | ; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d) | ||
| 727 | ; | ||
| 728 | ; arg0 = h | ||
| 729 | ; arg1 = l | ||
| 730 | ; arg2 = d | ||
| 731 | ; | ||
| 732 | ; This is mainly just modified assembly from the compiler, thus the | ||
| 733 | ; lack of variable names. | ||
| 734 | ; | ||
| 735 | ;------------------------------------------------------------------------------ | ||
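The contract, stripped of the shift-and-subtract machinery below: divide
the 128-bit value h:l by d and return the 64-bit quotient (the BN callers
are expected to keep h < d so the quotient fits). With a compiler that has
unsigned __int128 this collapses to a sketch like:

	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
	{
		unsigned __int128 n = ((unsigned __int128)h << 64) | l;

		return (BN_ULONG)(n / d);       /* assumes h < d, d != 0 */
	}
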
| 736 | bn_div_words | ||
| 737 | .proc | ||
| 738 | .callinfo CALLER,FRAME=272,ENTRY_GR=%r10,SAVE_RP,ARGS_SAVED,ORDERING_AWARE | ||
| 739 | .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | ||
| 740 | .IMPORT BN_num_bits_word,CODE,NO_RELOCATION | ||
| 741 | .IMPORT __iob,DATA | ||
| 742 | .IMPORT fprintf,CODE,NO_RELOCATION | ||
| 743 | .IMPORT abort,CODE,NO_RELOCATION | ||
| 744 | .IMPORT $$div2U,MILLICODE | ||
| 745 | .entry | ||
| 746 | STD %r2,-16(%r30) | ||
| 747 | STD,MA %r3,352(%r30) | ||
| 748 | STD %r4,-344(%r30) | ||
| 749 | STD %r5,-336(%r30) | ||
| 750 | STD %r6,-328(%r30) | ||
| 751 | STD %r7,-320(%r30) | ||
| 752 | STD %r8,-312(%r30) | ||
| 753 | STD %r9,-304(%r30) | ||
| 754 | STD %r10,-296(%r30) | ||
| 755 | |||
| 756 | STD %r27,-288(%r30) ; save gp | ||
| 757 | |||
| 758 | COPY %r24,%r3 ; save d | ||
| 759 | COPY %r26,%r4 ; save h (high 64-bits) | ||
| 760 | LDO -1(%r0),%ret0 ; return -1 by default | ||
| 761 | |||
| 762 | CMPB,*= %r0,%arg2,$D3 ; if (d == 0) | ||
| 763 | COPY %r25,%r5 ; save l (low 64-bits) | ||
| 764 | |||
| 765 | LDO -48(%r30),%r29 ; create ap | ||
| 766 | .CALL ;in=26,29;out=28; | ||
| 767 | B,L BN_num_bits_word,%r2 | ||
| 768 | COPY %r3,%r26 | ||
| 769 | LDD -288(%r30),%r27 ; restore gp | ||
| 770 | LDI 64,%r21 | ||
| 771 | |||
| 772 | CMPB,= %r21,%ret0,$00000012 ;if (i == 64) (forward) | ||
| 773 | COPY %ret0,%r24 ; i | ||
| 774 | MTSARCM %r24 | ||
| 775 | DEPDI,Z -1,%sar,1,%r29 | ||
| 776 | CMPB,*<<,N %r29,%r4,bn_div_err_case ; if (h > 1<<i) (forward) | ||
| 777 | |||
| 778 | $00000012 | ||
| 779 | SUBI 64,%r24,%r31 ; i = 64 - i; | ||
| 780 | CMPCLR,*<< %r4,%r3,%r0 ; if (h >= d) | ||
| 781 | SUB %r4,%r3,%r4 ; h -= d | ||
| 782 | CMPB,= %r31,%r0,$0000001A ; if (i) | ||
| 783 | COPY %r0,%r10 ; ret = 0 | ||
| 784 | MTSARCM %r31 ; i to shift | ||
| 785 | DEPD,Z %r3,%sar,64,%r3 ; d <<= i; | ||
| 786 | SUBI 64,%r31,%r19 ; 64 - i; redundant | ||
| 787 | MTSAR %r19 ; (64 -i) to shift | ||
| 788 | SHRPD %r4,%r5,%sar,%r4 ; l>> (64-i) | ||
| 789 | MTSARCM %r31 ; i to shift | ||
| 790 | DEPD,Z %r5,%sar,64,%r5 ; l <<= i; | ||
| 791 | |||
| 792 | $0000001A | ||
| 793 | DEPDI,Z -1,31,32,%r19 | ||
| 794 | EXTRD,U %r3,31,32,%r6 ; dh = d >> 32 | ||
| 795 | EXTRD,U %r3,63,32,%r8 ; dl = d & 0xffffffff | ||
| 796 | LDO 2(%r0),%r9 | ||
| 797 | STD %r3,-280(%r30) ; "d" to stack | ||
| 798 | |||
| 799 | $0000001C | ||
| 800 | DEPDI,Z -1,63,32,%r29 ; | ||
| 801 | EXTRD,U %r4,31,32,%r31 ; h >> 32 | ||
| 802 | CMPB,*=,N %r31,%r6,$D2 ; if ((h>>32) != dh)(forward) div | ||
| 803 | COPY %r4,%r26 | ||
| 804 | EXTRD,U %r4,31,32,%r25 | ||
| 805 | COPY %r6,%r24 | ||
| 806 | .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL) | ||
| 807 | B,L $$div2U,%r2 | ||
| 808 | EXTRD,U %r6,31,32,%r23 | ||
| 809 | DEPD %r28,31,32,%r29 | ||
| 810 | $D2 | ||
| 811 | STD %r29,-272(%r30) ; q | ||
| 812 | AND %r5,%r19,%r24 ; t & 0xffffffff00000000; | ||
| 813 | EXTRD,U %r24,31,32,%r24 ; ??? | ||
| 814 | FLDD -272(%r30),%fr7 ; q | ||
| 815 | FLDD -280(%r30),%fr8 ; d | ||
| 816 | XMPYU %fr8L,%fr7L,%fr10 | ||
| 817 | FSTD %fr10,-256(%r30) | ||
| 818 | XMPYU %fr8L,%fr7R,%fr22 | ||
| 819 | FSTD %fr22,-264(%r30) | ||
| 820 | XMPYU %fr8R,%fr7L,%fr11 | ||
| 821 | XMPYU %fr8R,%fr7R,%fr23 | ||
| 822 | FSTD %fr11,-232(%r30) | ||
| 823 | FSTD %fr23,-240(%r30) | ||
| 824 | LDD -256(%r30),%r28 | ||
| 825 | DEPD,Z %r28,31,32,%r2 | ||
| 826 | LDD -264(%r30),%r20 | ||
| 827 | ADD,L %r20,%r2,%r31 | ||
| 828 | LDD -232(%r30),%r22 | ||
| 829 | DEPD,Z %r22,31,32,%r22 | ||
| 830 | LDD -240(%r30),%r21 | ||
| 831 | B $00000024 ; enter loop | ||
| 832 | ADD,L %r21,%r22,%r23 | ||
| 833 | |||
| 834 | $0000002A | ||
| 835 | LDO -1(%r29),%r29 | ||
| 836 | SUB %r23,%r8,%r23 | ||
| 837 | $00000024 | ||
| 838 | SUB %r4,%r31,%r25 | ||
| 839 | AND %r25,%r19,%r26 | ||
| 840 | CMPB,*<>,N %r0,%r26,$00000046 ; (forward) | ||
| 841 | DEPD,Z %r25,31,32,%r20 | ||
| 842 | OR %r20,%r24,%r21 | ||
| 843 | CMPB,*<<,N %r21,%r23,$0000002A ;(backward) | ||
| 844 | SUB %r31,%r6,%r31 | ||
| 845 | ;-------------Break path--------------------- | ||
| 846 | |||
| 847 | $00000046 | ||
| 848 | DEPD,Z %r23,31,32,%r25 ;tl | ||
| 849 | EXTRD,U %r23,31,32,%r26 ;t | ||
| 850 | AND %r25,%r19,%r24 ;tl = (tl<<32)&0xffffffff00000000L | ||
| 851 | ADD,L %r31,%r26,%r31 ;th += t; | ||
| 852 | CMPCLR,*>>= %r5,%r24,%r0 ;if (l<tl) | ||
| 853 | LDO 1(%r31),%r31 ; th++; | ||
| 854 | CMPB,*<<=,N %r31,%r4,$00000036 ;if (th <= h) (forward) | ||
| 855 | LDO -1(%r29),%r29 ;q--; | ||
| 856 | ADD,L %r4,%r3,%r4 ;h += d; | ||
| 857 | $00000036 | ||
| 858 | ADDIB,=,N -1,%r9,$D1 ;if (--count == 0) break (forward) | ||
| 859 | SUB %r5,%r24,%r28 ; l -= tl; | ||
| 860 | SUB %r4,%r31,%r24 ; h -= th; | ||
| 861 | SHRPD %r24,%r28,32,%r4 ; h = ((h<<32)|(l>>32)); | ||
| 862 | DEPD,Z %r29,31,32,%r10 ; ret = q<<32 | ||
| 863 | b $0000001C | ||
| 864 | DEPD,Z %r28,31,32,%r5 ; l = l << 32 | ||
| 865 | |||
| 866 | $D1 | ||
| 867 | OR %r10,%r29,%r28 ; ret |= q | ||
| 868 | $D3 | ||
| 869 | LDD -368(%r30),%r2 | ||
| 870 | $D0 | ||
| 871 | LDD -296(%r30),%r10 | ||
| 872 | LDD -304(%r30),%r9 | ||
| 873 | LDD -312(%r30),%r8 | ||
| 874 | LDD -320(%r30),%r7 | ||
| 875 | LDD -328(%r30),%r6 | ||
| 876 | LDD -336(%r30),%r5 | ||
| 877 | LDD -344(%r30),%r4 | ||
| 878 | BVE (%r2) | ||
| 879 | .EXIT | ||
| 880 | LDD,MB -352(%r30),%r3 | ||
| 881 | |||
| 882 | bn_div_err_case | ||
| 883 | MFIA %r6 | ||
| 884 | ADDIL L'bn_div_words-bn_div_err_case,%r6,%r1 | ||
| 885 | LDO R'bn_div_words-bn_div_err_case(%r1),%r6 | ||
| 886 | ADDIL LT'__iob,%r27,%r1 | ||
| 887 | LDD RT'__iob(%r1),%r26 | ||
| 888 | ADDIL L'C$4-bn_div_words,%r6,%r1 | ||
| 889 | LDO R'C$4-bn_div_words(%r1),%r25 | ||
| 890 | LDO 64(%r26),%r26 | ||
| 891 | .CALL ;in=24,25,26,29;out=28; | ||
| 892 | B,L fprintf,%r2 | ||
| 893 | LDO -48(%r30),%r29 | ||
| 894 | LDD -288(%r30),%r27 | ||
| 895 | .CALL ;in=29; | ||
| 896 | B,L abort,%r2 | ||
| 897 | LDO -48(%r30),%r29 | ||
| 898 | LDD -288(%r30),%r27 | ||
| 899 | B $D0 | ||
| 900 | LDD -368(%r30),%r2 | ||
| 901 | .PROCEND ;in=24,25,26,29;out=28; | ||
| 902 | |||
| 903 | ;---------------------------------------------------------------------------- | ||
| 904 | ; | ||
| 905 | ; Registers to hold 64-bit values to manipulate. The "L" part | ||
| 906 | ; of the register corresponds to the upper 32-bits, while the "R" | ||
| 907 | ; part corresponds to the lower 32-bits | ||
| 908 | ; | ||
| 909 | ; Note that when using b6 and b7, the code must save them before | ||
| 910 | ; using them because they are callee-save registers | ||
| 911 | ; | ||
| 912 | ; | ||
| 913 | ; Floating point registers to use to save values that | ||
| 914 | ; are manipulated. These don't collide with ftemp1-6 and | ||
| 915 | ; are all caller save registers | ||
| 916 | ; | ||
| 917 | a0 .reg %fr22 | ||
| 918 | a0L .reg %fr22L | ||
| 919 | a0R .reg %fr22R | ||
| 920 | |||
| 921 | a1 .reg %fr23 | ||
| 922 | a1L .reg %fr23L | ||
| 923 | a1R .reg %fr23R | ||
| 924 | |||
| 925 | a2 .reg %fr24 | ||
| 926 | a2L .reg %fr24L | ||
| 927 | a2R .reg %fr24R | ||
| 928 | |||
| 929 | a3 .reg %fr25 | ||
| 930 | a3L .reg %fr25L | ||
| 931 | a3R .reg %fr25R | ||
| 932 | |||
| 933 | a4 .reg %fr26 | ||
| 934 | a4L .reg %fr26L | ||
| 935 | a4R .reg %fr26R | ||
| 936 | |||
| 937 | a5 .reg %fr27 | ||
| 938 | a5L .reg %fr27L | ||
| 939 | a5R .reg %fr27R | ||
| 940 | |||
| 941 | a6 .reg %fr28 | ||
| 942 | a6L .reg %fr28L | ||
| 943 | a6R .reg %fr28R | ||
| 944 | |||
| 945 | a7 .reg %fr29 | ||
| 946 | a7L .reg %fr29L | ||
| 947 | a7R .reg %fr29R | ||
| 948 | |||
| 949 | b0 .reg %fr30 | ||
| 950 | b0L .reg %fr30L | ||
| 951 | b0R .reg %fr30R | ||
| 952 | |||
| 953 | b1 .reg %fr31 | ||
| 954 | b1L .reg %fr31L | ||
| 955 | b1R .reg %fr31R | ||
| 956 | |||
| 957 | ; | ||
| 958 | ; Temporary floating point variables; these are all caller-save | ||
| 959 | ; registers | ||
| 960 | ; | ||
| 961 | ftemp1 .reg %fr4 | ||
| 962 | ftemp2 .reg %fr5 | ||
| 963 | ftemp3 .reg %fr6 | ||
| 964 | ftemp4 .reg %fr7 | ||
| 965 | |||
| 966 | ; | ||
| 967 | ; The B set of registers when used. | ||
| 968 | ; | ||
| 969 | |||
| 970 | b2 .reg %fr8 | ||
| 971 | b2L .reg %fr8L | ||
| 972 | b2R .reg %fr8R | ||
| 973 | |||
| 974 | b3 .reg %fr9 | ||
| 975 | b3L .reg %fr9L | ||
| 976 | b3R .reg %fr9R | ||
| 977 | |||
| 978 | b4 .reg %fr10 | ||
| 979 | b4L .reg %fr10L | ||
| 980 | b4R .reg %fr10R | ||
| 981 | |||
| 982 | b5 .reg %fr11 | ||
| 983 | b5L .reg %fr11L | ||
| 984 | b5R .reg %fr11R | ||
| 985 | |||
| 986 | b6 .reg %fr12 | ||
| 987 | b6L .reg %fr12L | ||
| 988 | b6R .reg %fr12R | ||
| 989 | |||
| 990 | b7 .reg %fr13 | ||
| 991 | b7L .reg %fr13L | ||
| 992 | b7R .reg %fr13R | ||
| 993 | |||
| 994 | c1 .reg %r21 ; only reg | ||
| 995 | temp1 .reg %r20 ; only reg | ||
| 996 | temp2 .reg %r19 ; only reg | ||
| 997 | temp3 .reg %r31 ; only reg | ||
| 998 | |||
| 999 | m1 .reg %r28 | ||
| 1000 | c2 .reg %r23 | ||
| 1001 | high_one .reg %r1 | ||
| 1002 | ht .reg %r6 | ||
| 1003 | lt .reg %r5 | ||
| 1004 | m .reg %r4 | ||
| 1005 | c3 .reg %r3 | ||
| 1006 | |||
| 1007 | SQR_ADD_C .macro A0L,A0R,C1,C2,C3 | ||
| 1008 | XMPYU A0L,A0R,ftemp1 ; m | ||
| 1009 | FSTD ftemp1,-24(%sp) ; store m | ||
| 1010 | |||
| 1011 | XMPYU A0R,A0R,ftemp2 ; lt | ||
| 1012 | FSTD ftemp2,-16(%sp) ; store lt | ||
| 1013 | |||
| 1014 | XMPYU A0L,A0L,ftemp3 ; ht | ||
| 1015 | FSTD ftemp3,-8(%sp) ; store ht | ||
| 1016 | |||
| 1017 | LDD -24(%sp),m ; load m | ||
| 1018 | AND m,high_mask,temp2 ; m & Mask | ||
| 1019 | DEPD,Z m,30,31,temp3 ; m << 32+1 | ||
| 1020 | LDD -16(%sp),lt ; lt | ||
| 1021 | |||
| 1022 | LDD -8(%sp),ht ; ht | ||
| 1023 | EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1 | ||
| 1024 | ADD temp3,lt,lt ; lt = lt+m | ||
| 1025 | ADD,L ht,temp1,ht ; ht += temp1 | ||
| 1026 | ADD,DC ht,%r0,ht ; ht++ | ||
| 1027 | |||
| 1028 | ADD C1,lt,C1 ; c1=c1+lt | ||
| 1029 | ADD,DC ht,%r0,ht ; ht++ | ||
| 1030 | |||
| 1031 | ADD C2,ht,C2 ; c2=c2+ht | ||
| 1032 | ADD,DC C3,%r0,C3 ; c3++ | ||
| 1033 | .endm | ||
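What the macro computes, as a C model: add a*a into a 192-bit accumulator
spread over three 64-bit words (a sketch; pointer parameters stand in for
the macro's register operands):

	void sqr_add_c(BN_ULONG a, BN_ULONG *c1, BN_ULONG *c2, BN_ULONG *c3)
	{
		unsigned __int128 t = (unsigned __int128)a * a;
		BN_ULONG lo = (BN_ULONG)t, hi = (BN_ULONG)(t >> 64);

		*c1 += lo;
		if (*c1 < lo)
			hi++;           /* hi <= 2^64-2, cannot wrap */
		*c2 += hi;
		if (*c2 < hi)
			(*c3)++;
	}
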
| 1034 | |||
| 1035 | SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3 | ||
| 1036 | XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht | ||
| 1037 | FSTD ftemp1,-16(%sp) ; | ||
| 1038 | XMPYU A0R,A1L,ftemp2 ; m = bh*lt | ||
| 1039 | FSTD ftemp2,-8(%sp) ; | ||
| 1040 | XMPYU A0R,A1R,ftemp3 ; lt = bl*lt | ||
| 1041 | FSTD ftemp3,-32(%sp) | ||
| 1042 | XMPYU A0L,A1L,ftemp4 ; ht = bh*ht | ||
| 1043 | FSTD ftemp4,-24(%sp) ; | ||
| 1044 | |||
| 1045 | LDD -8(%sp),m ; r21 = m | ||
| 1046 | LDD -16(%sp),m1 ; r19 = m1 | ||
| 1047 | ADD,L m,m1,m ; m+m1 | ||
| 1048 | |||
| 1049 | DEPD,Z m,31,32,temp3 ; (m+m1<<32) | ||
| 1050 | LDD -24(%sp),ht ; r24 = ht | ||
| 1051 | |||
| 1052 | CMPCLR,*>>= m,m1,%r0 ; if (m < m1) | ||
| 1053 | ADD,L ht,high_one,ht ; ht+=high_one | ||
| 1054 | |||
| 1055 | EXTRD,U m,31,32,temp1 ; m >> 32 | ||
| 1056 | LDD -32(%sp),lt ; lt | ||
| 1057 | ADD,L ht,temp1,ht ; ht+= m>>32 | ||
| 1058 | ADD lt,temp3,lt ; lt = lt+m1 | ||
| 1059 | ADD,DC ht,%r0,ht ; ht++ | ||
| 1060 | |||
| 1061 | ADD ht,ht,ht ; ht=ht+ht; | ||
| 1062 | ADD,DC C3,%r0,C3 ; add in carry (c3++) | ||
| 1063 | |||
| 1064 | ADD lt,lt,lt ; lt=lt+lt; | ||
| 1065 | ADD,DC ht,%r0,ht ; add in carry (ht++) | ||
| 1066 | |||
| 1067 | ADD C1,lt,C1 ; c1=c1+lt | ||
| 1068 | ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++) | ||
| 1069 | LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise | ||
| 1070 | |||
| 1071 | ADD C2,ht,C2 ; c2 = c2 + ht | ||
| 1072 | ADD,DC C3,%r0,C3 ; add in carry (c3++) | ||
| 1073 | .endm | ||
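The C2 variant adds 2*a*b (an off-diagonal term, counted twice); the
doubling is where the extra carry handling above comes from (same sketch
conventions):

	void sqr_add_c2(BN_ULONG a, BN_ULONG b, BN_ULONG *c1, BN_ULONG *c2,
	    BN_ULONG *c3)
	{
		unsigned __int128 t = (unsigned __int128)a * b;
		BN_ULONG lo = (BN_ULONG)t, hi = (BN_ULONG)(t >> 64);

		if (hi & ((BN_ULONG)1 << 63))
			(*c3)++;        /* top bit of 2*t spills into c3 */
		hi = (hi << 1) | (lo >> 63);
		lo <<= 1;

		*c1 += lo;
		if (*c1 < lo) {
			hi++;
			if (hi == 0)    /* hi can wrap after doubling */
				(*c3)++;
		}
		*c2 += hi;
		if (*c2 < hi)
			(*c3)++;
	}
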
| 1074 | |||
| 1075 | ; | ||
| 1076 | ;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) | ||
| 1077 | ; arg0 = r_ptr | ||
| 1078 | ; arg1 = a_ptr | ||
| 1079 | ; | ||
| 1080 | |||
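Using the two models above, the straight-line code below is an unrolled
form of this reference loop (a sketch: result word k collects every
a[i]*a[k-i] product):

	void bn_sqr_comba8_ref(BN_ULONG *r, const BN_ULONG *a)
	{
		BN_ULONG c1 = 0, c2 = 0, c3 = 0;
		int i, k;

		for (k = 0; k < 15; k++) {
			for (i = (k < 8 ? 0 : k - 7); 2 * i <= k; i++) {
				if (2 * i == k)         /* diagonal term */
					sqr_add_c(a[i], &c1, &c2, &c3);
				else
					sqr_add_c2(a[i], a[k - i],
					    &c1, &c2, &c3);
			}
			r[k] = c1;      /* emit one word, rotate carries */
			c1 = c2; c2 = c3; c3 = 0;
		}
		r[15] = c1;
	}

The assembly avoids the value rotation by rotating which register plays
c1/c2/c3 across each group of macro calls.
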
| 1081 | bn_sqr_comba8 | ||
| 1082 | .PROC | ||
| 1083 | .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | ||
| 1084 | .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | ||
| 1085 | .ENTRY | ||
| 1086 | .align 64 | ||
| 1087 | |||
| 1088 | STD %r3,0(%sp) ; save r3 | ||
| 1089 | STD %r4,8(%sp) ; save r4 | ||
| 1090 | STD %r5,16(%sp) ; save r5 | ||
| 1091 | STD %r6,24(%sp) ; save r6 | ||
| 1092 | |||
| 1093 | ; | ||
| 1094 | ; Zero out carries | ||
| 1095 | ; | ||
| 1096 | COPY %r0,c1 | ||
| 1097 | COPY %r0,c2 | ||
| 1098 | COPY %r0,c3 | ||
| 1099 | |||
| 1100 | LDO 128(%sp),%sp ; bump stack | ||
| 1101 | DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L | ||
| 1102 | DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 | ||
| 1103 | |||
| 1104 | ; | ||
| 1105 | ; Load up all of the values we are going to use | ||
| 1106 | ; | ||
| 1107 | FLDD 0(a_ptr),a0 | ||
| 1108 | FLDD 8(a_ptr),a1 | ||
| 1109 | FLDD 16(a_ptr),a2 | ||
| 1110 | FLDD 24(a_ptr),a3 | ||
| 1111 | FLDD 32(a_ptr),a4 | ||
| 1112 | FLDD 40(a_ptr),a5 | ||
| 1113 | FLDD 48(a_ptr),a6 | ||
| 1114 | FLDD 56(a_ptr),a7 | ||
| 1115 | |||
| 1116 | SQR_ADD_C a0L,a0R,c1,c2,c3 | ||
| 1117 | STD c1,0(r_ptr) ; r[0] = c1; | ||
| 1118 | COPY %r0,c1 | ||
| 1119 | |||
| 1120 | SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 | ||
| 1121 | STD c2,8(r_ptr) ; r[1] = c2; | ||
| 1122 | COPY %r0,c2 | ||
| 1123 | |||
| 1124 | SQR_ADD_C a1L,a1R,c3,c1,c2 | ||
| 1125 | SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 | ||
| 1126 | STD c3,16(r_ptr) ; r[2] = c3; | ||
| 1127 | COPY %r0,c3 | ||
| 1128 | |||
| 1129 | SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 | ||
| 1130 | SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 | ||
| 1131 | STD c1,24(r_ptr) ; r[3] = c1; | ||
| 1132 | COPY %r0,c1 | ||
| 1133 | |||
| 1134 | SQR_ADD_C a2L,a2R,c2,c3,c1 | ||
| 1135 | SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 | ||
| 1136 | SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1 | ||
| 1137 | STD c2,32(r_ptr) ; r[4] = c2; | ||
| 1138 | COPY %r0,c2 | ||
| 1139 | |||
| 1140 | SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2 | ||
| 1141 | SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2 | ||
| 1142 | SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 | ||
| 1143 | STD c3,40(r_ptr) ; r[5] = c3; | ||
| 1144 | COPY %r0,c3 | ||
| 1145 | |||
| 1146 | SQR_ADD_C a3L,a3R,c1,c2,c3 | ||
| 1147 | SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3 | ||
| 1148 | SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3 | ||
| 1149 | SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3 | ||
| 1150 | STD c1,48(r_ptr) ; r[6] = c1; | ||
| 1151 | COPY %r0,c1 | ||
| 1152 | |||
| 1153 | SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1 | ||
| 1154 | SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1 | ||
| 1155 | SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1 | ||
| 1156 | SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1 | ||
| 1157 | STD c2,56(r_ptr) ; r[7] = c2; | ||
| 1158 | COPY %r0,c2 | ||
| 1159 | |||
| 1160 | SQR_ADD_C a4L,a4R,c3,c1,c2 | ||
| 1161 | SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2 | ||
| 1162 | SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2 | ||
| 1163 | SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2 | ||
| 1164 | STD c3,64(r_ptr) ; r[8] = c3; | ||
| 1165 | COPY %r0,c3 | ||
| 1166 | |||
| 1167 | SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3 | ||
| 1168 | SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3 | ||
| 1169 | SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3 | ||
| 1170 | STD c1,72(r_ptr) ; r[9] = c1; | ||
| 1171 | COPY %r0,c1 | ||
| 1172 | |||
| 1173 | SQR_ADD_C a5L,a5R,c2,c3,c1 | ||
| 1174 | SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1 | ||
| 1175 | SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1 | ||
| 1176 | STD c2,80(r_ptr) ; r[10] = c2; | ||
| 1177 | COPY %r0,c2 | ||
| 1178 | |||
| 1179 | SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2 | ||
| 1180 | SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2 | ||
| 1181 | STD c3,88(r_ptr) ; r[11] = c3; | ||
| 1182 | COPY %r0,c3 | ||
| 1183 | |||
| 1184 | SQR_ADD_C a6L,a6R,c1,c2,c3 | ||
| 1185 | SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3 | ||
| 1186 | STD c1,96(r_ptr) ; r[12] = c1; | ||
| 1187 | COPY %r0,c1 | ||
| 1188 | |||
| 1189 | SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1 | ||
| 1190 | STD c2,104(r_ptr) ; r[13] = c2; | ||
| 1191 | COPY %r0,c2 | ||
| 1192 | |||
| 1193 | SQR_ADD_C a7L,a7R,c3,c1,c2 | ||
| 1194 | STD c3, 112(r_ptr) ; r[14] = c3 | ||
| 1195 | STD c1, 120(r_ptr) ; r[15] = c1 | ||
| 1196 | |||
| 1197 | .EXIT | ||
| 1198 | LDD -104(%sp),%r6 ; restore r6 | ||
| 1199 | LDD -112(%sp),%r5 ; restore r5 | ||
| 1200 | LDD -120(%sp),%r4 ; restore r4 | ||
| 1201 | BVE (%rp) | ||
| 1202 | LDD,MB -128(%sp),%r3 | ||
| 1203 | |||
| 1204 | .PROCEND | ||
| 1205 | |||
| 1206 | ;----------------------------------------------------------------------------- | ||
| 1207 | ; | ||
| 1208 | ;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) | ||
| 1209 | ; arg0 = r_ptr | ||
| 1210 | ; arg1 = a_ptr | ||
| 1211 | ; | ||
| 1212 | |||
| 1213 | bn_sqr_comba4 | ||
| 1214 | .proc | ||
| 1215 | .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | ||
| 1216 | .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | ||
| 1217 | .entry | ||
| 1218 | .align 64 | ||
| 1219 | STD %r3,0(%sp) ; save r3 | ||
| 1220 | STD %r4,8(%sp) ; save r4 | ||
| 1221 | STD %r5,16(%sp) ; save r5 | ||
| 1222 | STD %r6,24(%sp) ; save r6 | ||
| 1223 | |||
| 1224 | ; | ||
| 1225 | ; Zero out carries | ||
| 1226 | ; | ||
| 1227 | COPY %r0,c1 | ||
| 1228 | COPY %r0,c2 | ||
| 1229 | COPY %r0,c3 | ||
| 1230 | |||
| 1231 | LDO 128(%sp),%sp ; bump stack | ||
| 1232 | DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L | ||
| 1233 | DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 | ||
| 1234 | |||
| 1235 | ; | ||
| 1236 | ; Load up all of the values we are going to use | ||
| 1237 | ; | ||
| 1238 | FLDD 0(a_ptr),a0 | ||
| 1239 | FLDD 8(a_ptr),a1 | ||
| 1240 | FLDD 16(a_ptr),a2 | ||
| 1241 | FLDD 24(a_ptr),a3 | ||
| 1246 | |||
| 1247 | SQR_ADD_C a0L,a0R,c1,c2,c3 | ||
| 1248 | |||
| 1249 | STD c1,0(r_ptr) ; r[0] = c1; | ||
| 1250 | COPY %r0,c1 | ||
| 1251 | |||
| 1252 | SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 | ||
| 1253 | |||
| 1254 | STD c2,8(r_ptr) ; r[1] = c2; | ||
| 1255 | COPY %r0,c2 | ||
| 1256 | |||
| 1257 | SQR_ADD_C a1L,a1R,c3,c1,c2 | ||
| 1258 | SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 | ||
| 1259 | |||
| 1260 | STD c3,16(r_ptr) ; r[2] = c3; | ||
| 1261 | COPY %r0,c3 | ||
| 1262 | |||
| 1263 | SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 | ||
| 1264 | SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 | ||
| 1265 | |||
| 1266 | STD c1,24(r_ptr) ; r[3] = c1; | ||
| 1267 | COPY %r0,c1 | ||
| 1268 | |||
| 1269 | SQR_ADD_C a2L,a2R,c2,c3,c1 | ||
| 1270 | SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 | ||
| 1271 | |||
| 1272 | STD c2,32(r_ptr) ; r[4] = c2; | ||
| 1273 | COPY %r0,c2 | ||
| 1274 | |||
| 1275 | SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 | ||
| 1276 | STD c3,40(r_ptr) ; r[5] = c3; | ||
| 1277 | COPY %r0,c3 | ||
| 1278 | |||
| 1279 | SQR_ADD_C a3L,a3R,c1,c2,c3 | ||
| 1280 | STD c1,48(r_ptr) ; r[6] = c1; | ||
| 1281 | STD c2,56(r_ptr) ; r[7] = c2; | ||
| 1282 | |||
| 1283 | .EXIT | ||
| 1284 | LDD -104(%sp),%r6 ; restore r6 | ||
| 1285 | LDD -112(%sp),%r5 ; restore r5 | ||
| 1286 | LDD -120(%sp),%r4 ; restore r4 | ||
| 1287 | BVE (%rp) | ||
| 1288 | LDD,MB -128(%sp),%r3 | ||
| 1289 | |||
| 1290 | .PROCEND | ||
| 1291 | |||
| 1292 | |||
| 1293 | ;--------------------------------------------------------------------------- | ||
| 1294 | |||
| 1295 | MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3 | ||
| 1296 | XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht | ||
| 1297 | FSTD ftemp1,-16(%sp) ; | ||
| 1298 | XMPYU A0R,B0L,ftemp2 ; m = bh*lt | ||
| 1299 | FSTD ftemp2,-8(%sp) ; | ||
| 1300 | XMPYU A0R,B0R,ftemp3 ; lt = bl*lt | ||
| 1301 | FSTD ftemp3,-32(%sp) | ||
| 1302 | XMPYU A0L,B0L,ftemp4 ; ht = bh*ht | ||
| 1303 | FSTD ftemp4,-24(%sp) ; | ||
| 1304 | |||
| 1305 | LDD -8(%sp),m ; r21 = m | ||
| 1306 | LDD -16(%sp),m1 ; r19 = m1 | ||
| 1307 | ADD,L m,m1,m ; m+m1 | ||
| 1308 | |||
| 1309 | DEPD,Z m,31,32,temp3 ; (m+m1<<32) | ||
| 1310 | LDD -24(%sp),ht ; r24 = ht | ||
| 1311 | |||
| 1312 | CMPCLR,*>>= m,m1,%r0 ; if (m < m1) | ||
| 1313 | ADD,L ht,high_one,ht ; ht+=high_one | ||
| 1314 | |||
| 1315 | EXTRD,U m,31,32,temp1 ; m >> 32 | ||
| 1316 | LDD -32(%sp),lt ; lt | ||
| 1317 | ADD,L ht,temp1,ht ; ht+= m>>32 | ||
| 1318 | ADD lt,temp3,lt ; lt = lt+m1 | ||
| 1319 | ADD,DC ht,%r0,ht ; ht++ | ||
| 1320 | |||
| 1321 | ADD C1,lt,C1 ; c1=c1+lt | ||
| 1322 | ADD,DC ht,%r0,ht ; ht++ | ||
| 1323 | |||
| 1324 | ADD C2,ht,C2 ; c2 = c2 + ht | ||
| 1325 | ADD,DC C3,%r0,C3 ; add in carry (c3++) | ||
| 1326 | .endm | ||
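The multiply counterpart of SQR_ADD_C: add a single a*b product into the
three-word accumulator (C model, same conventions as the squaring
sketches):

	void mul_add_c(BN_ULONG a, BN_ULONG b, BN_ULONG *c1, BN_ULONG *c2,
	    BN_ULONG *c3)
	{
		unsigned __int128 t = (unsigned __int128)a * b;
		BN_ULONG lo = (BN_ULONG)t, hi = (BN_ULONG)(t >> 64);

		*c1 += lo;
		if (*c1 < lo)
			hi++;           /* hi <= 2^64-2, cannot wrap */
		*c2 += hi;
		if (*c2 < hi)
			(*c3)++;
	}
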
| 1327 | |||
| 1328 | |||
| 1329 | ; | ||
| 1330 | ;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | ||
| 1331 | ; arg0 = r_ptr | ||
| 1332 | ; arg1 = a_ptr | ||
| 1333 | ; arg2 = b_ptr | ||
| 1334 | ; | ||
| 1335 | |||
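The unrolled schedule below follows this reference shape: result word k
accumulates every a[i]*b[j] with i+j == k (a sketch using the mul_add_c
model above):

	void bn_mul_comba8_ref(BN_ULONG *r, const BN_ULONG *a,
	    const BN_ULONG *b)
	{
		BN_ULONG c1 = 0, c2 = 0, c3 = 0;
		int i, k;

		for (k = 0; k < 15; k++) {
			for (i = (k < 8 ? 0 : k - 7); i <= k && i < 8; i++)
				mul_add_c(a[i], b[k - i], &c1, &c2, &c3);
			r[k] = c1;
			c1 = c2; c2 = c3; c3 = 0;
		}
		r[15] = c1;
	}

bn_mul_comba4 further below is the same schedule cut down to 4 input words
and 8 result words.
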
| 1336 | bn_mul_comba8 | ||
| 1337 | .proc | ||
| 1338 | .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | ||
| 1339 | .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | ||
| 1340 | .entry | ||
| 1341 | .align 64 | ||
| 1342 | |||
| 1343 | STD %r3,0(%sp) ; save r3 | ||
| 1344 | STD %r4,8(%sp) ; save r4 | ||
| 1345 | STD %r5,16(%sp) ; save r5 | ||
| 1346 | STD %r6,24(%sp) ; save r6 | ||
| 1347 | FSTD %fr12,32(%sp) ; save fr12 | ||
| 1348 | FSTD %fr13,40(%sp) ; save fr13 | ||
| 1349 | |||
| 1350 | ; | ||
| 1351 | ; Zero out carries | ||
| 1352 | ; | ||
| 1353 | COPY %r0,c1 | ||
| 1354 | COPY %r0,c2 | ||
| 1355 | COPY %r0,c3 | ||
| 1356 | |||
| 1357 | LDO 128(%sp),%sp ; bump stack | ||
| 1358 | DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 | ||
| 1359 | |||
| 1360 | ; | ||
| 1361 | ; Load up all of the values we are going to use | ||
| 1362 | ; | ||
| 1363 | FLDD 0(a_ptr),a0 | ||
| 1364 | FLDD 8(a_ptr),a1 | ||
| 1365 | FLDD 16(a_ptr),a2 | ||
| 1366 | FLDD 24(a_ptr),a3 | ||
| 1367 | FLDD 32(a_ptr),a4 | ||
| 1368 | FLDD 40(a_ptr),a5 | ||
| 1369 | FLDD 48(a_ptr),a6 | ||
| 1370 | FLDD 56(a_ptr),a7 | ||
| 1371 | |||
| 1372 | FLDD 0(b_ptr),b0 | ||
| 1373 | FLDD 8(b_ptr),b1 | ||
| 1374 | FLDD 16(b_ptr),b2 | ||
| 1375 | FLDD 24(b_ptr),b3 | ||
| 1376 | FLDD 32(b_ptr),b4 | ||
| 1377 | FLDD 40(b_ptr),b5 | ||
| 1378 | FLDD 48(b_ptr),b6 | ||
| 1379 | FLDD 56(b_ptr),b7 | ||
| 1380 | |||
| 1381 | MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 | ||
| 1382 | STD c1,0(r_ptr) | ||
| 1383 | COPY %r0,c1 | ||
| 1384 | |||
| 1385 | MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 | ||
| 1386 | MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 | ||
| 1387 | STD c2,8(r_ptr) | ||
| 1388 | COPY %r0,c2 | ||
| 1389 | |||
| 1390 | MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 | ||
| 1391 | MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 | ||
| 1392 | MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 | ||
| 1393 | STD c3,16(r_ptr) | ||
| 1394 | COPY %r0,c3 | ||
| 1395 | |||
| 1396 | MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 | ||
| 1397 | MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 | ||
| 1398 | MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 | ||
| 1399 | MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 | ||
| 1400 | STD c1,24(r_ptr) | ||
| 1401 | COPY %r0,c1 | ||
| 1402 | |||
| 1403 | MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1 | ||
| 1404 | MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 | ||
| 1405 | MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 | ||
| 1406 | MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 | ||
| 1407 | MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1 | ||
| 1408 | STD c2,32(r_ptr) | ||
| 1409 | COPY %r0,c2 | ||
| 1410 | |||
| 1411 | MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2 | ||
| 1412 | MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2 | ||
| 1413 | MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 | ||
| 1414 | MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 | ||
| 1415 | MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2 | ||
| 1416 | MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2 | ||
| 1417 | STD c3,40(r_ptr) | ||
| 1418 | COPY %r0,c3 | ||
| 1419 | |||
| 1420 | MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3 | ||
| 1421 | MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3 | ||
| 1422 | MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3 | ||
| 1423 | MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 | ||
| 1424 | MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3 | ||
| 1425 | MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3 | ||
| 1426 | MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3 | ||
| 1427 | STD c1,48(r_ptr) | ||
| 1428 | COPY %r0,c1 | ||
| 1429 | |||
| 1430 | MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1 | ||
| 1431 | MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1 | ||
| 1432 | MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1 | ||
| 1433 | MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1 | ||
| 1434 | MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1 | ||
| 1435 | MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1 | ||
| 1436 | MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1 | ||
| 1437 | MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1 | ||
| 1438 | STD c2,56(r_ptr) | ||
| 1439 | COPY %r0,c2 | ||
| 1440 | |||
| 1441 | MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2 | ||
| 1442 | MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2 | ||
| 1443 | MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2 | ||
| 1444 | MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2 | ||
| 1445 | MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2 | ||
| 1446 | MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2 | ||
| 1447 | MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2 | ||
| 1448 | STD c3,64(r_ptr) | ||
| 1449 | COPY %r0,c3 | ||
| 1450 | |||
| 1451 | MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3 | ||
| 1452 | MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3 | ||
| 1453 | MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3 | ||
| 1454 | MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3 | ||
| 1455 | MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3 | ||
| 1456 | MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3 | ||
| 1457 | STD c1,72(r_ptr) | ||
| 1458 | COPY %r0,c1 | ||
| 1459 | |||
| 1460 | MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1 | ||
| 1461 | MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1 | ||
| 1462 | MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1 | ||
| 1463 | MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1 | ||
| 1464 | MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1 | ||
| 1465 | STD c2,80(r_ptr) | ||
| 1466 | COPY %r0,c2 | ||
| 1467 | |||
| 1468 | MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2 | ||
| 1469 | MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2 | ||
| 1470 | MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2 | ||
| 1471 | MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2 | ||
| 1472 | STD c3,88(r_ptr) | ||
| 1473 | COPY %r0,c3 | ||
| 1474 | |||
| 1475 | MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3 | ||
| 1476 | MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3 | ||
| 1477 | MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3 | ||
| 1478 | STD c1,96(r_ptr) | ||
| 1479 | COPY %r0,c1 | ||
| 1480 | |||
| 1481 | MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1 | ||
| 1482 | MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1 | ||
| 1483 | STD c2,104(r_ptr) | ||
| 1484 | COPY %r0,c2 | ||
| 1485 | |||
| 1486 | MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2 | ||
| 1487 | STD c3,112(r_ptr) | ||
| 1488 | STD c1,120(r_ptr) | ||
| 1489 | |||
| 1490 | .EXIT | ||
| 1491 | FLDD -88(%sp),%fr13 | ||
| 1492 | FLDD -96(%sp),%fr12 | ||
| 1493 | LDD -104(%sp),%r6 ; restore r6 | ||
| 1494 | LDD -112(%sp),%r5 ; restore r5 | ||
| 1495 | LDD -120(%sp),%r4 ; restore r4 | ||
| 1496 | BVE (%rp) | ||
| 1497 | LDD,MB -128(%sp),%r3 | ||
| 1498 | |||
| 1499 | .PROCEND | ||
| 1500 | |||
| 1501 | ;----------------------------------------------------------------------------- | ||
| 1502 | ; | ||
| 1503 | ;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | ||
| 1504 | ; arg0 = r_ptr | ||
| 1505 | ; arg1 = a_ptr | ||
| 1506 | ; arg2 = b_ptr | ||
| 1507 | ; | ||
| 1508 | |||
| 1509 | bn_mul_comba4 | ||
| 1510 | .proc | ||
| 1511 | .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE | ||
| 1512 | .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN | ||
| 1513 | .entry | ||
| 1514 | .align 64 | ||
| 1515 | |||
| 1516 | STD %r3,0(%sp) ; save r3 | ||
| 1517 | STD %r4,8(%sp) ; save r4 | ||
| 1518 | STD %r5,16(%sp) ; save r5 | ||
| 1519 | STD %r6,24(%sp) ; save r6 | ||
| 1520 | FSTD %fr12,32(%sp) ; save fr12 | ||
| 1521 | FSTD %fr13,40(%sp) ; save fr13 | ||
| 1522 | |||
| 1523 | ; | ||
| 1524 | ; Zero out carries | ||
| 1525 | ; | ||
| 1526 | COPY %r0,c1 | ||
| 1527 | COPY %r0,c2 | ||
| 1528 | COPY %r0,c3 | ||
| 1529 | |||
| 1530 | LDO 128(%sp),%sp ; bump stack | ||
| 1531 | DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 | ||
| 1532 | |||
| 1533 | ; | ||
| 1534 | ; Load up all of the values we are going to use | ||
| 1535 | ; | ||
| 1536 | FLDD 0(a_ptr),a0 | ||
| 1537 | FLDD 8(a_ptr),a1 | ||
| 1538 | FLDD 16(a_ptr),a2 | ||
| 1539 | FLDD 24(a_ptr),a3 | ||
| 1540 | |||
| 1541 | FLDD 0(b_ptr),b0 | ||
| 1542 | FLDD 8(b_ptr),b1 | ||
| 1543 | FLDD 16(b_ptr),b2 | ||
| 1544 | FLDD 24(b_ptr),b3 | ||
| 1545 | |||
| 1546 | MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 | ||
| 1547 | STD c1,0(r_ptr) | ||
| 1548 | COPY %r0,c1 | ||
| 1549 | |||
| 1550 | MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 | ||
| 1551 | MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 | ||
| 1552 | STD c2,8(r_ptr) | ||
| 1553 | COPY %r0,c2 | ||
| 1554 | |||
| 1555 | MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 | ||
| 1556 | MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 | ||
| 1557 | MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 | ||
| 1558 | STD c3,16(r_ptr) | ||
| 1559 | COPY %r0,c3 | ||
| 1560 | |||
| 1561 | MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 | ||
| 1562 | MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 | ||
| 1563 | MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 | ||
| 1564 | MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 | ||
| 1565 | STD c1,24(r_ptr) | ||
| 1566 | COPY %r0,c1 | ||
| 1567 | |||
| 1568 | MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 | ||
| 1569 | MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 | ||
| 1570 | MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 | ||
| 1571 | STD c2,32(r_ptr) | ||
| 1572 | COPY %r0,c2 | ||
| 1573 | |||
| 1574 | MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 | ||
| 1575 | MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 | ||
| 1576 | STD c3,40(r_ptr) | ||
| 1577 | COPY %r0,c3 | ||
| 1578 | |||
| 1579 | MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 | ||
| 1580 | STD c1,48(r_ptr) | ||
| 1581 | STD c2,56(r_ptr) | ||
| 1582 | |||
| 1583 | .EXIT | ||
| 1584 | FLDD -88(%sp),%fr13 ; restore fr13 | ||
| 1585 | FLDD -96(%sp),%fr12 ; restore fr12 | ||
| 1586 | LDD -104(%sp),%r6 ; restore r6 | ||
| 1587 | LDD -112(%sp),%r5 ; restore r5 | ||
| 1588 | LDD -120(%sp),%r4 ; restore r4 | ||
| 1589 | BVE (%rp) | ||
| 1590 | LDD,MB -128(%sp),%r3 | ||
| 1591 | |||
| 1592 | .PROCEND | ||
| 1593 | |||
| 1594 | |||
| 1595 | .SPACE $TEXT$ | ||
| 1596 | .SUBSPA $CODE$ | ||
| 1597 | .SPACE $PRIVATE$,SORT=16 | ||
| 1598 | .IMPORT $global$,DATA | ||
| 1599 | .SPACE $TEXT$ | ||
| 1600 | .SUBSPA $CODE$ | ||
| 1601 | .SUBSPA $LIT$,ACCESS=0x2c | ||
| 1602 | C$4 | ||
| 1603 | .ALIGN 8 | ||
| 1604 | .STRINGZ "Division would overflow (%d)\n" | ||
| 1605 | .END | ||
diff --git a/src/lib/libcrypto/bn/asm/sparcv8plus.S b/src/lib/libcrypto/bn/asm/sparcv8plus.S deleted file mode 100644 index 02ad6069c2..0000000000 --- a/src/lib/libcrypto/bn/asm/sparcv8plus.S +++ /dev/null | |||
| @@ -1,1558 +0,0 @@ | |||
| 1 | .ident "sparcv8plus.s, Version 1.4" | ||
| 2 | .ident "SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" | ||
| 3 | |||
| 4 | /* | ||
| 5 | * ==================================================================== | ||
| 6 | * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 7 | * project. | ||
| 8 | * | ||
| 9 | * Rights for redistribution and usage in source and binary forms are | ||
| 10 | * granted according to the OpenSSL license. Warranty of any kind is | ||
| 11 | * disclaimed. | ||
| 12 | * ==================================================================== | ||
| 13 | */ | ||
| 14 | |||
| 15 | /* | ||
| 16 | * This is my modest contribution to OpenSSL project (see | ||
| 17 | * http://www.openssl.org/ for more information about it) and is | ||
| 18 | * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c | ||
| 19 | * module. For updates see http://fy.chalmers.se/~appro/hpe/. | ||
| 20 | * | ||
| 21 | * Questions-n-answers. | ||
| 22 | * | ||
| 23 | * Q. How to compile? | ||
| 24 | * A. With SC4.x/SC5.x: | ||
| 25 | * | ||
| 26 | * cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o | ||
| 27 | * | ||
| 28 | * and with gcc: | ||
| 29 | * | ||
| 30 | * gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o | ||
| 31 | * | ||
| 32 | * or if the above fails (it does if you have gas installed): | ||
| 33 | * | ||
| 34 | * gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o | ||
| 35 | * | ||
| 36 | * Quick-n-dirty way to fuse the module into the library. | ||
| 37 | * Provided that the library is already configured and built | ||
| 38 | * (in 0.9.2 case with no-asm option): | ||
| 39 | * | ||
| 40 | * # cd crypto/bn | ||
| 41 | * # cp /some/place/bn_asm.sparc.v8plus.S . | ||
| 42 | * # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o | ||
| 43 | * # make | ||
| 44 | * # cd ../.. | ||
| 45 | * # make; make test | ||
| 46 | * | ||
| 47 | * Quick-n-dirty way to get rid of it: | ||
| 48 | * | ||
| 49 | * # cd crypto/bn | ||
| 50 | * # touch bn_asm.c | ||
| 51 | * # make | ||
| 52 | * # cd ../.. | ||
| 53 | * # make; make test | ||
| 54 | * | ||
| 55 | * Q. V8plus architecture? What kind of beast is that? | ||
| 56 | * A. Well, it's more a programming model than an architecture... | ||
| 57 | * It's actually any v9-compliant CPU, i.e. *any* UltraSPARC, under | ||
| 58 | * special conditions, namely when the kernel doesn't preserve the | ||
| 59 | * upper 32 bits of otherwise 64-bit registers during a context switch. | ||
| 60 | * | ||
| 61 | * Q. Why just UltraSPARC? What about SuperSPARC? | ||
| 62 | * A. The original release targeted UltraSPARC only. A SuperSPARC | ||
| 63 | * version is now provided alongside. Both versions share the | ||
| 64 | * bn_*comba[48] implementations (see the comment later in the code). | ||
| 65 | * But what's so special about this UltraSPARC implementation? | ||
| 66 | * Why didn't I let compiler do the job? Trouble is that most of | ||
| 67 | * available compilers (well, SC5.0 is the only exception) don't | ||
| 68 | * attempt to take advantage of UltraSPARC's 64-bitness under | ||
| 69 | * 32-bit kernels even though it's perfectly possible (see next | ||
| 70 | * question). | ||
| 71 | * | ||
| 72 | * Q. 64-bit registers under 32-bit kernels? Didn't you just say it | ||
| 73 | * doesn't work? | ||
| 74 | * A. You can't address *all* registers as 64-bit wide:-( The catch is | ||
| 75 | * that you actually may rely upon %o0-%o5 and %g1-%g4 being fully | ||
| 76 | * preserved if you're in a leaf function, i.e. one that never calls | ||
| 77 | * any other function. All functions in this module are leaf and | ||
| 78 | * 10 registers is a handful. As a matter of fact the non-"comba" | ||
| 79 | * routines require even less than that and I could even afford | ||
| 80 | * not to allocate a stack frame of their own for 'em:-) | ||
| 81 | * | ||
| 82 | * Q. What about 64-bit kernels? | ||
| 83 | * A. What about 'em? Just kidding:-) Pure 64-bit version is currently | ||
| 84 | * under evaluation and development... | ||
| 85 | * | ||
| 86 | * Q. What about shared libraries? | ||
| 87 | * A. What about 'em? Kidding again:-) Code does *not* contain any | ||
| 88 | * code position dependencies and it's safe to include it into | ||
| 89 | * shared library as is. | ||
| 90 | * | ||
| 91 | * Q. How much faster does it go? | ||
| 92 | * A. Do you have a good benchmark? In either case below is what I | ||
| 93 | * experience with crypto/bn/expspeed.c test program: | ||
| 94 | * | ||
| 95 | * v8plus module on U10/300MHz against bn_asm.c compiled with: | ||
| 96 | * | ||
| 97 | * cc-5.0 -xarch=v8plus -xO5 -xdepend +7-12% | ||
| 98 | * cc-4.2 -xarch=v8plus -xO5 -xdepend +25-35% | ||
| 99 | * egcs-1.1.2 -mcpu=ultrasparc -O3 +35-45% | ||
| 100 | * | ||
| 101 | * v8 module on SS10/60MHz against bn_asm.c compiled with: | ||
| 102 | * | ||
| 103 | * cc-5.0 -xarch=v8 -xO5 -xdepend +7-10% | ||
| 104 | * cc-4.2 -xarch=v8 -xO5 -xdepend +10% | ||
| 105 | * egcs-1.1.2 -mv8 -O3 +35-45% | ||
| 106 | * | ||
| 107 | * As you can see it's damn hard to beat the new Sun C compiler, | ||
| 108 | * and it's GNU C users, first and foremost, who will appreciate | ||
| 109 | * this assembler implementation:-) | ||
| 110 | */ | ||
| 111 | |||
| 112 | /* | ||
| 113 | * Revision history. | ||
| 114 | * | ||
| 115 | * 1.0 - initial release; | ||
| 116 | * 1.1 - new loop unrolling model(*); | ||
| 117 | * - some more fine tuning; | ||
| 118 | * 1.2 - made gas friendly; | ||
| 119 | * - updates to documentation concerning v9; | ||
| 120 | * - new performance comparison matrix; | ||
| 121 | * 1.3 - fixed problem with /usr/ccs/lib/cpp; | ||
| 122 | * 1.4 - native V9 bn_*_comba[48] implementation (15% more efficient) | ||
| 123 | * resulting in slight overall performance kick; | ||
| 124 | * - some retunes; | ||
| 125 | * - support for GNU as added; | ||
| 126 | * | ||
| 127 | * (*) Originally unrolled loop looked like this: | ||
| 128 | * for (;;) { | ||
| 129 | * op(p+0); if (--n==0) break; | ||
| 130 | * op(p+1); if (--n==0) break; | ||
| 131 | * op(p+2); if (--n==0) break; | ||
| 132 | * op(p+3); if (--n==0) break; | ||
| 133 | * p+=4; | ||
| 134 | * } | ||
| 135 | * I unroll according to the following: | ||
| 136 | * while (n&~3) { | ||
| 137 | * op(p+0); op(p+1); op(p+2); op(p+3); | ||
| 138 | * p+=4; n-=4; | ||
| 139 | * } | ||
| 140 | * if (n) { | ||
| 141 | * op(p+0); if (--n==0) return; | ||
| 142 | * op(p+2); if (--n==0) return; | ||
| 143 | * op(p+3); return; | ||
| 144 | * } | ||
| 145 | */ | ||
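For concreteness, the corrected unrolling model above as a self-contained C routine; op() is an illustrative stand-in for the per-element work, and n is assumed non-negative, as the assembly's sign-extension guards ensure:

    static void op(unsigned *x) { ++*x; }  /* placeholder work */

    static void walk(unsigned *p, int n)
    {
        while (n & ~3) {        /* main body: four ops per pass */
            op(p + 0); op(p + 1); op(p + 2); op(p + 3);
            p += 4; n -= 4;
        }
        if (n) {                /* 1 to 3 leftover elements */
            op(p + 0); if (--n == 0) return;
            op(p + 1); if (--n == 0) return;
            op(p + 2);
        }
    }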
| 146 | |||
| 147 | #if defined(__SUNPRO_C) && defined(__sparcv9) | ||
| 148 | /* They've said -xarch=v9 at command line */ | ||
| 149 | .register %g2,#scratch | ||
| 150 | .register %g3,#scratch | ||
| 151 | # define FRAME_SIZE -192 | ||
| 152 | #elif defined(__GNUC__) && defined(__arch64__) | ||
| 153 | /* They've said -m64 at command line */ | ||
| 154 | .register %g2,#scratch | ||
| 155 | .register %g3,#scratch | ||
| 156 | # define FRAME_SIZE -192 | ||
| 157 | #else | ||
| 158 | # define FRAME_SIZE -96 | ||
| 159 | #endif | ||
| 160 | /* | ||
| 161 | * GNU assembler can't stand stuw:-( | ||
| 162 | */ | ||
| 163 | #define stuw st | ||
| 164 | |||
| 165 | .section ".text",#alloc,#execinstr | ||
| 166 | .file "bn_asm.sparc.v8plus.S" | ||
| 167 | |||
| 168 | .align 32 | ||
| 169 | |||
| 170 | .global bn_mul_add_words | ||
| 171 | /* | ||
| 172 | * BN_ULONG bn_mul_add_words(rp,ap,num,w) | ||
| 173 | * BN_ULONG *rp,*ap; | ||
| 174 | * int num; | ||
| 175 | * BN_ULONG w; | ||
| 176 | */ | ||
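A hedged C rendition of the contract this routine fulfills under the v8plus model, with 32-bit BN_ULONG and a 64-bit accumulator standing in for the mulx/srlx pattern (an illustrative sketch, not the original bn_asm.c code); bn_mul_words further down is the same loop minus the read of *rp:

    #include <stdint.h>

    static uint32_t bn_mul_add_words_ref(uint32_t *rp, const uint32_t *ap,
        int num, uint32_t w)
    {
        uint64_t carry = 0;

        while (num-- > 0) {
            carry += (uint64_t)*ap++ * w + *rp;
            *rp++ = (uint32_t)carry;    /* store the low 32 bits */
            carry >>= 32;               /* keep the high half */
        }
        return (uint32_t)carry;         /* final carry word */
    }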
| 177 | bn_mul_add_words: | ||
| 178 | sra %o2,%g0,%o2 ! signx %o2 | ||
| 179 | brgz,a %o2,.L_bn_mul_add_words_proceed | ||
| 180 | lduw [%o1],%g2 | ||
| 181 | retl | ||
| 182 | clr %o0 | ||
| 183 | nop | ||
| 184 | nop | ||
| 185 | nop | ||
| 186 | |||
| 187 | .L_bn_mul_add_words_proceed: | ||
| 188 | srl %o3,%g0,%o3 ! clruw %o3 | ||
| 189 | andcc %o2,-4,%g0 | ||
| 190 | bz,pn %icc,.L_bn_mul_add_words_tail | ||
| 191 | clr %o5 | ||
| 192 | |||
| 193 | .L_bn_mul_add_words_loop: ! wow! 32 aligned! | ||
| 194 | lduw [%o0],%g1 | ||
| 195 | lduw [%o1+4],%g3 | ||
| 196 | mulx %o3,%g2,%g2 | ||
| 197 | add %g1,%o5,%o4 | ||
| 198 | nop | ||
| 199 | add %o4,%g2,%o4 | ||
| 200 | stuw %o4,[%o0] | ||
| 201 | srlx %o4,32,%o5 | ||
| 202 | |||
| 203 | lduw [%o0+4],%g1 | ||
| 204 | lduw [%o1+8],%g2 | ||
| 205 | mulx %o3,%g3,%g3 | ||
| 206 | add %g1,%o5,%o4 | ||
| 207 | dec 4,%o2 | ||
| 208 | add %o4,%g3,%o4 | ||
| 209 | stuw %o4,[%o0+4] | ||
| 210 | srlx %o4,32,%o5 | ||
| 211 | |||
| 212 | lduw [%o0+8],%g1 | ||
| 213 | lduw [%o1+12],%g3 | ||
| 214 | mulx %o3,%g2,%g2 | ||
| 215 | add %g1,%o5,%o4 | ||
| 216 | inc 16,%o1 | ||
| 217 | add %o4,%g2,%o4 | ||
| 218 | stuw %o4,[%o0+8] | ||
| 219 | srlx %o4,32,%o5 | ||
| 220 | |||
| 221 | lduw [%o0+12],%g1 | ||
| 222 | mulx %o3,%g3,%g3 | ||
| 223 | add %g1,%o5,%o4 | ||
| 224 | inc 16,%o0 | ||
| 225 | add %o4,%g3,%o4 | ||
| 226 | andcc %o2,-4,%g0 | ||
| 227 | stuw %o4,[%o0-4] | ||
| 228 | srlx %o4,32,%o5 | ||
| 229 | bnz,a,pt %icc,.L_bn_mul_add_words_loop | ||
| 230 | lduw [%o1],%g2 | ||
| 231 | |||
| 232 | brnz,a,pn %o2,.L_bn_mul_add_words_tail | ||
| 233 | lduw [%o1],%g2 | ||
| 234 | .L_bn_mul_add_words_return: | ||
| 235 | retl | ||
| 236 | mov %o5,%o0 | ||
| 237 | |||
| 238 | .L_bn_mul_add_words_tail: | ||
| 239 | lduw [%o0],%g1 | ||
| 240 | mulx %o3,%g2,%g2 | ||
| 241 | add %g1,%o5,%o4 | ||
| 242 | dec %o2 | ||
| 243 | add %o4,%g2,%o4 | ||
| 244 | srlx %o4,32,%o5 | ||
| 245 | brz,pt %o2,.L_bn_mul_add_words_return | ||
| 246 | stuw %o4,[%o0] | ||
| 247 | |||
| 248 | lduw [%o1+4],%g2 | ||
| 249 | lduw [%o0+4],%g1 | ||
| 250 | mulx %o3,%g2,%g2 | ||
| 251 | add %g1,%o5,%o4 | ||
| 252 | dec %o2 | ||
| 253 | add %o4,%g2,%o4 | ||
| 254 | srlx %o4,32,%o5 | ||
| 255 | brz,pt %o2,.L_bn_mul_add_words_return | ||
| 256 | stuw %o4,[%o0+4] | ||
| 257 | |||
| 258 | lduw [%o1+8],%g2 | ||
| 259 | lduw [%o0+8],%g1 | ||
| 260 | mulx %o3,%g2,%g2 | ||
| 261 | add %g1,%o5,%o4 | ||
| 262 | add %o4,%g2,%o4 | ||
| 263 | stuw %o4,[%o0+8] | ||
| 264 | retl | ||
| 265 | srlx %o4,32,%o0 | ||
| 266 | |||
| 267 | .type bn_mul_add_words,#function | ||
| 268 | .size bn_mul_add_words,(.-bn_mul_add_words) | ||
| 269 | |||
| 270 | .align 32 | ||
| 271 | |||
| 272 | .global bn_mul_words | ||
| 273 | /* | ||
| 274 | * BN_ULONG bn_mul_words(rp,ap,num,w) | ||
| 275 | * BN_ULONG *rp,*ap; | ||
| 276 | * int num; | ||
| 277 | * BN_ULONG w; | ||
| 278 | */ | ||
| 279 | bn_mul_words: | ||
| 280 | sra %o2,%g0,%o2 ! signx %o2 | ||
| 281 | brgz,a %o2,.L_bn_mul_words_proceed | ||
| 282 | lduw [%o1],%g2 | ||
| 283 | retl | ||
| 284 | clr %o0 | ||
| 285 | nop | ||
| 286 | nop | ||
| 287 | nop | ||
| 288 | |||
| 289 | .L_bn_mul_words_proceed: | ||
| 290 | srl %o3,%g0,%o3 ! clruw %o3 | ||
| 291 | andcc %o2,-4,%g0 | ||
| 292 | bz,pn %icc,.L_bn_mul_words_tail | ||
| 293 | clr %o5 | ||
| 294 | |||
| 295 | .L_bn_mul_words_loop: ! wow! 32 aligned! | ||
| 296 | lduw [%o1+4],%g3 | ||
| 297 | mulx %o3,%g2,%g2 | ||
| 298 | add %g2,%o5,%o4 | ||
| 299 | nop | ||
| 300 | stuw %o4,[%o0] | ||
| 301 | srlx %o4,32,%o5 | ||
| 302 | |||
| 303 | lduw [%o1+8],%g2 | ||
| 304 | mulx %o3,%g3,%g3 | ||
| 305 | add %g3,%o5,%o4 | ||
| 306 | dec 4,%o2 | ||
| 307 | stuw %o4,[%o0+4] | ||
| 308 | srlx %o4,32,%o5 | ||
| 309 | |||
| 310 | lduw [%o1+12],%g3 | ||
| 311 | mulx %o3,%g2,%g2 | ||
| 312 | add %g2,%o5,%o4 | ||
| 313 | inc 16,%o1 | ||
| 314 | stuw %o4,[%o0+8] | ||
| 315 | srlx %o4,32,%o5 | ||
| 316 | |||
| 317 | mulx %o3,%g3,%g3 | ||
| 318 | add %g3,%o5,%o4 | ||
| 319 | inc 16,%o0 | ||
| 320 | stuw %o4,[%o0-4] | ||
| 321 | srlx %o4,32,%o5 | ||
| 322 | andcc %o2,-4,%g0 | ||
| 323 | bnz,a,pt %icc,.L_bn_mul_words_loop | ||
| 324 | lduw [%o1],%g2 | ||
| 325 | nop | ||
| 326 | nop | ||
| 327 | |||
| 328 | brnz,a,pn %o2,.L_bn_mul_words_tail | ||
| 329 | lduw [%o1],%g2 | ||
| 330 | .L_bn_mul_words_return: | ||
| 331 | retl | ||
| 332 | mov %o5,%o0 | ||
| 333 | |||
| 334 | .L_bn_mul_words_tail: | ||
| 335 | mulx %o3,%g2,%g2 | ||
| 336 | add %g2,%o5,%o4 | ||
| 337 | dec %o2 | ||
| 338 | srlx %o4,32,%o5 | ||
| 339 | brz,pt %o2,.L_bn_mul_words_return | ||
| 340 | stuw %o4,[%o0] | ||
| 341 | |||
| 342 | lduw [%o1+4],%g2 | ||
| 343 | mulx %o3,%g2,%g2 | ||
| 344 | add %g2,%o5,%o4 | ||
| 345 | dec %o2 | ||
| 346 | srlx %o4,32,%o5 | ||
| 347 | brz,pt %o2,.L_bn_mul_words_return | ||
| 348 | stuw %o4,[%o0+4] | ||
| 349 | |||
| 350 | lduw [%o1+8],%g2 | ||
| 351 | mulx %o3,%g2,%g2 | ||
| 352 | add %g2,%o5,%o4 | ||
| 353 | stuw %o4,[%o0+8] | ||
| 354 | retl | ||
| 355 | srlx %o4,32,%o0 | ||
| 356 | |||
| 357 | .type bn_mul_words,#function | ||
| 358 | .size bn_mul_words,(.-bn_mul_words) | ||
| 359 | |||
| 360 | .align 32 | ||
| 361 | .global bn_sqr_words | ||
| 362 | /* | ||
| 363 | * void bn_sqr_words(r,a,n) | ||
| 364 | * BN_ULONG *r,*a; | ||
| 365 | * int n; | ||
| 366 | */ | ||
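What the loop computes, as a hedged C sketch (illustrative names; each input word yields two output words holding its full 64-bit square, which is why the unrolled body advances %o0 twice as fast as %o1):

    #include <stdint.h>

    static void bn_sqr_words_ref(uint32_t *r, const uint32_t *a, int n)
    {
        while (n-- > 0) {
            uint64_t sq = (uint64_t)*a * *a;
            a++;
            *r++ = (uint32_t)sq;           /* low word of a[i]^2 */
            *r++ = (uint32_t)(sq >> 32);   /* high word of a[i]^2 */
        }
    }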
| 367 | bn_sqr_words: | ||
| 368 | sra %o2,%g0,%o2 ! signx %o2 | ||
| 369 | brgz,a %o2,.L_bn_sqr_words_proceed | ||
| 370 | lduw [%o1],%g2 | ||
| 371 | retl | ||
| 372 | clr %o0 | ||
| 373 | nop | ||
| 374 | nop | ||
| 375 | nop | ||
| 376 | |||
| 377 | .L_bn_sqr_words_proceed: | ||
| 378 | andcc %o2,-4,%g0 | ||
| 379 | nop | ||
| 380 | bz,pn %icc,.L_bn_sqr_words_tail | ||
| 381 | nop | ||
| 382 | |||
| 383 | .L_bn_sqr_words_loop: ! wow! 32 aligned! | ||
| 384 | lduw [%o1+4],%g3 | ||
| 385 | mulx %g2,%g2,%o4 | ||
| 386 | stuw %o4,[%o0] | ||
| 387 | srlx %o4,32,%o5 | ||
| 388 | stuw %o5,[%o0+4] | ||
| 389 | nop | ||
| 390 | |||
| 391 | lduw [%o1+8],%g2 | ||
| 392 | mulx %g3,%g3,%o4 | ||
| 393 | dec 4,%o2 | ||
| 394 | stuw %o4,[%o0+8] | ||
| 395 | srlx %o4,32,%o5 | ||
| 396 | stuw %o5,[%o0+12] | ||
| 397 | |||
| 398 | lduw [%o1+12],%g3 | ||
| 399 | mulx %g2,%g2,%o4 | ||
| 400 | srlx %o4,32,%o5 | ||
| 401 | stuw %o4,[%o0+16] | ||
| 402 | inc 16,%o1 | ||
| 403 | stuw %o5,[%o0+20] | ||
| 404 | |||
| 405 | mulx %g3,%g3,%o4 | ||
| 406 | inc 32,%o0 | ||
| 407 | stuw %o4,[%o0-8] | ||
| 408 | srlx %o4,32,%o5 | ||
| 409 | andcc %o2,-4,%g2 | ||
| 410 | stuw %o5,[%o0-4] | ||
| 411 | bnz,a,pt %icc,.L_bn_sqr_words_loop | ||
| 412 | lduw [%o1],%g2 | ||
| 413 | nop | ||
| 414 | |||
| 415 | brnz,a,pn %o2,.L_bn_sqr_words_tail | ||
| 416 | lduw [%o1],%g2 | ||
| 417 | .L_bn_sqr_words_return: | ||
| 418 | retl | ||
| 419 | clr %o0 | ||
| 420 | |||
| 421 | .L_bn_sqr_words_tail: | ||
| 422 | mulx %g2,%g2,%o4 | ||
| 423 | dec %o2 | ||
| 424 | stuw %o4,[%o0] | ||
| 425 | srlx %o4,32,%o5 | ||
| 426 | brz,pt %o2,.L_bn_sqr_words_return | ||
| 427 | stuw %o5,[%o0+4] | ||
| 428 | |||
| 429 | lduw [%o1+4],%g2 | ||
| 430 | mulx %g2,%g2,%o4 | ||
| 431 | dec %o2 | ||
| 432 | stuw %o4,[%o0+8] | ||
| 433 | srlx %o4,32,%o5 | ||
| 434 | brz,pt %o2,.L_bn_sqr_words_return | ||
| 435 | stuw %o5,[%o0+12] | ||
| 436 | |||
| 437 | lduw [%o1+8],%g2 | ||
| 438 | mulx %g2,%g2,%o4 | ||
| 439 | srlx %o4,32,%o5 | ||
| 440 | stuw %o4,[%o0+16] | ||
| 441 | stuw %o5,[%o0+20] | ||
| 442 | retl | ||
| 443 | clr %o0 | ||
| 444 | |||
| 445 | .type bn_sqr_words,#function | ||
| 446 | .size bn_sqr_words,(.-bn_sqr_words) | ||
| 447 | |||
| 448 | .align 32 | ||
| 449 | .global bn_div_words | ||
| 450 | /* | ||
| 451 | * BN_ULONG bn_div_words(h,l,d) | ||
| 452 | * BN_ULONG h,l,d; | ||
| 453 | */ | ||
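The sllx/udivx/srl triple below amounts to this one-liner (a hedged C sketch with an illustrative name; the caller is expected to arrange h < d so the 32-bit quotient is meaningful):

    #include <stdint.h>

    static uint32_t bn_div_words_ref(uint32_t h, uint32_t l, uint32_t d)
    {
        /* concatenate h:l, divide, keep the low 32 bits (the clruw) */
        return (uint32_t)((((uint64_t)h << 32) | l) / d);
    }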
| 454 | bn_div_words: | ||
| 455 | sllx %o0,32,%o0 | ||
| 456 | or %o0,%o1,%o0 | ||
| 457 | udivx %o0,%o2,%o0 | ||
| 458 | retl | ||
| 459 | srl %o0,%g0,%o0 ! clruw %o0 | ||
| 460 | |||
| 461 | .type bn_div_words,#function | ||
| 462 | .size bn_div_words,(.-bn_div_words) | ||
| 463 | |||
| 464 | .align 32 | ||
| 465 | |||
| 466 | .global bn_add_words | ||
| 467 | /* | ||
| 468 | * BN_ULONG bn_add_words(rp,ap,bp,n) | ||
| 469 | * BN_ULONG *rp,*ap,*bp; | ||
| 470 | * int n; | ||
| 471 | */ | ||
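The addccc chain implements a textbook ripple-carry addition; a hedged C equivalent (illustrative), with bn_sub_words below mirroring it using subtraction and a borrow:

    #include <stdint.h>

    static uint32_t bn_add_words_ref(uint32_t *rp, const uint32_t *ap,
        const uint32_t *bp, int n)
    {
        uint64_t t = 0;

        while (n-- > 0) {
            t = (uint64_t)*ap++ + *bp++ + (t >> 32);
            *rp++ = (uint32_t)t;       /* sum word */
        }
        return (uint32_t)(t >> 32);    /* carry out: 0 or 1 */
    }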
| 472 | bn_add_words: | ||
| 473 | sra %o3,%g0,%o3 ! signx %o3 | ||
| 474 | brgz,a %o3,.L_bn_add_words_proceed | ||
| 475 | lduw [%o1],%o4 | ||
| 476 | retl | ||
| 477 | clr %o0 | ||
| 478 | |||
| 479 | .L_bn_add_words_proceed: | ||
| 480 | andcc %o3,-4,%g0 | ||
| 481 | bz,pn %icc,.L_bn_add_words_tail | ||
| 482 | addcc %g0,0,%g0 ! clear carry flag | ||
| 483 | |||
| 484 | .L_bn_add_words_loop: ! wow! 32 aligned! | ||
| 485 | dec 4,%o3 | ||
| 486 | lduw [%o2],%o5 | ||
| 487 | lduw [%o1+4],%g1 | ||
| 488 | lduw [%o2+4],%g2 | ||
| 489 | lduw [%o1+8],%g3 | ||
| 490 | lduw [%o2+8],%g4 | ||
| 491 | addccc %o5,%o4,%o5 | ||
| 492 | stuw %o5,[%o0] | ||
| 493 | |||
| 494 | lduw [%o1+12],%o4 | ||
| 495 | lduw [%o2+12],%o5 | ||
| 496 | inc 16,%o1 | ||
| 497 | addccc %g1,%g2,%g1 | ||
| 498 | stuw %g1,[%o0+4] | ||
| 499 | |||
| 500 | inc 16,%o2 | ||
| 501 | addccc %g3,%g4,%g3 | ||
| 502 | stuw %g3,[%o0+8] | ||
| 503 | |||
| 504 | inc 16,%o0 | ||
| 505 | addccc %o5,%o4,%o5 | ||
| 506 | stuw %o5,[%o0-4] | ||
| 507 | and %o3,-4,%g1 | ||
| 508 | brnz,a,pt %g1,.L_bn_add_words_loop | ||
| 509 | lduw [%o1],%o4 | ||
| 510 | |||
| 511 | brnz,a,pn %o3,.L_bn_add_words_tail | ||
| 512 | lduw [%o1],%o4 | ||
| 513 | .L_bn_add_words_return: | ||
| 514 | clr %o0 | ||
| 515 | retl | ||
| 516 | movcs %icc,1,%o0 | ||
| 517 | nop | ||
| 518 | |||
| 519 | .L_bn_add_words_tail: | ||
| 520 | lduw [%o2],%o5 | ||
| 521 | dec %o3 | ||
| 522 | addccc %o5,%o4,%o5 | ||
| 523 | brz,pt %o3,.L_bn_add_words_return | ||
| 524 | stuw %o5,[%o0] | ||
| 525 | |||
| 526 | lduw [%o1+4],%o4 | ||
| 527 | lduw [%o2+4],%o5 | ||
| 528 | dec %o3 | ||
| 529 | addccc %o5,%o4,%o5 | ||
| 530 | brz,pt %o3,.L_bn_add_words_return | ||
| 531 | stuw %o5,[%o0+4] | ||
| 532 | |||
| 533 | lduw [%o1+8],%o4 | ||
| 534 | lduw [%o2+8],%o5 | ||
| 535 | addccc %o5,%o4,%o5 | ||
| 536 | stuw %o5,[%o0+8] | ||
| 537 | clr %o0 | ||
| 538 | retl | ||
| 539 | movcs %icc,1,%o0 | ||
| 540 | |||
| 541 | .type bn_add_words,#function | ||
| 542 | .size bn_add_words,(.-bn_add_words) | ||
| 543 | |||
| 544 | .global bn_sub_words | ||
| 545 | /* | ||
| 546 | * BN_ULONG bn_sub_words(rp,ap,bp,n) | ||
| 547 | * BN_ULONG *rp,*ap,*bp; | ||
| 548 | * int n; | ||
| 549 | */ | ||
| 550 | bn_sub_words: | ||
| 551 | sra %o3,%g0,%o3 ! signx %o3 | ||
| 552 | brgz,a %o3,.L_bn_sub_words_proceed | ||
| 553 | lduw [%o1],%o4 | ||
| 554 | retl | ||
| 555 | clr %o0 | ||
| 556 | |||
| 557 | .L_bn_sub_words_proceed: | ||
| 558 | andcc %o3,-4,%g0 | ||
| 559 | bz,pn %icc,.L_bn_sub_words_tail | ||
| 560 | addcc %g0,0,%g0 ! clear carry flag | ||
| 561 | |||
| 562 | .L_bn_sub_words_loop: ! wow! 32 aligned! | ||
| 563 | dec 4,%o3 | ||
| 564 | lduw [%o2],%o5 | ||
| 565 | lduw [%o1+4],%g1 | ||
| 566 | lduw [%o2+4],%g2 | ||
| 567 | lduw [%o1+8],%g3 | ||
| 568 | lduw [%o2+8],%g4 | ||
| 569 | subccc %o4,%o5,%o5 | ||
| 570 | stuw %o5,[%o0] | ||
| 571 | |||
| 572 | lduw [%o1+12],%o4 | ||
| 573 | lduw [%o2+12],%o5 | ||
| 574 | inc 16,%o1 | ||
| 575 | subccc %g1,%g2,%g2 | ||
| 576 | stuw %g2,[%o0+4] | ||
| 577 | |||
| 578 | inc 16,%o2 | ||
| 579 | subccc %g3,%g4,%g4 | ||
| 580 | stuw %g4,[%o0+8] | ||
| 581 | |||
| 582 | inc 16,%o0 | ||
| 583 | subccc %o4,%o5,%o5 | ||
| 584 | stuw %o5,[%o0-4] | ||
| 585 | and %o3,-4,%g1 | ||
| 586 | brnz,a,pt %g1,.L_bn_sub_words_loop | ||
| 587 | lduw [%o1],%o4 | ||
| 588 | |||
| 589 | brnz,a,pn %o3,.L_bn_sub_words_tail | ||
| 590 | lduw [%o1],%o4 | ||
| 591 | .L_bn_sub_words_return: | ||
| 592 | clr %o0 | ||
| 593 | retl | ||
| 594 | movcs %icc,1,%o0 | ||
| 595 | nop | ||
| 596 | |||
| 597 | .L_bn_sub_words_tail: ! wow! 32 aligned! | ||
| 598 | lduw [%o2],%o5 | ||
| 599 | dec %o3 | ||
| 600 | subccc %o4,%o5,%o5 | ||
| 601 | brz,pt %o3,.L_bn_sub_words_return | ||
| 602 | stuw %o5,[%o0] | ||
| 603 | |||
| 604 | lduw [%o1+4],%o4 | ||
| 605 | lduw [%o2+4],%o5 | ||
| 606 | dec %o3 | ||
| 607 | subccc %o4,%o5,%o5 | ||
| 608 | brz,pt %o3,.L_bn_sub_words_return | ||
| 609 | stuw %o5,[%o0+4] | ||
| 610 | |||
| 611 | lduw [%o1+8],%o4 | ||
| 612 | lduw [%o2+8],%o5 | ||
| 613 | subccc %o4,%o5,%o5 | ||
| 614 | stuw %o5,[%o0+8] | ||
| 615 | clr %o0 | ||
| 616 | retl | ||
| 617 | movcs %icc,1,%o0 | ||
| 618 | |||
| 619 | .type bn_sub_words,#function | ||
| 620 | .size bn_sub_words,(.-bn_sub_words) | ||
| 621 | |||
| 622 | /* | ||
| 623 | * The code below depends on the fact that the upper parts of %l0-%l7 | ||
| 624 | * and %i0-%i7 are zeroed by the kernel after a context switch. In | ||
| 625 | * previous versions this comment stated that "the trouble is that | ||
| 626 | * it's not feasible to implement the mumbo-jumbo in less V9 | ||
| 627 | * instructions:-(" which apparently isn't true thanks to | ||
| 628 | * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement | ||
| 629 | * results not from the shorter code, but from the elimination of | ||
| 630 | * multicycle non-pairable 'rd %y,%rd' instructions. | ||
| 631 | * | ||
| 632 | * Andy. | ||
| 633 | */ | ||
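In C terms, the recurring 'addcc ...; bcs,a %xcc,.+8; add c_3,t_2,c_3' sequence in the comba code below amounts to the following, where c_12 packs two 32-bit column words into one 64-bit register and t_2 permanently holds 1<<32 (the names mirror the register map that follows; the helper itself is illustrative):

    #include <stdint.h>

    static void add_and_count_carry(uint64_t t_1, uint64_t *c_12,
        uint64_t *c_3)
    {
        uint64_t sum = *c_12 + t_1;

        if (sum < t_1)                     /* 64-bit carry: the bcs case */
            *c_3 += (uint64_t)1 << 32;     /* t_2: a '1' in the c3 slot */
        *c_12 = sum;
    }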
| 634 | |||
| 635 | /* | ||
| 636 | * Here is register usage map for *all* routines below. | ||
| 637 | */ | ||
| 638 | #define t_1 %o0 | ||
| 639 | #define t_2 %o1 | ||
| 640 | #define c_12 %o2 | ||
| 641 | #define c_3 %o3 | ||
| 642 | |||
| 643 | #define ap(I) [%i1+4*I] | ||
| 644 | #define bp(I) [%i2+4*I] | ||
| 645 | #define rp(I) [%i0+4*I] | ||
| 646 | |||
| 647 | #define a_0 %l0 | ||
| 648 | #define a_1 %l1 | ||
| 649 | #define a_2 %l2 | ||
| 650 | #define a_3 %l3 | ||
| 651 | #define a_4 %l4 | ||
| 652 | #define a_5 %l5 | ||
| 653 | #define a_6 %l6 | ||
| 654 | #define a_7 %l7 | ||
| 655 | |||
| 656 | #define b_0 %i3 | ||
| 657 | #define b_1 %i4 | ||
| 658 | #define b_2 %i5 | ||
| 659 | #define b_3 %o4 | ||
| 660 | #define b_4 %o5 | ||
| 661 | #define b_5 %o7 | ||
| 662 | #define b_6 %g1 | ||
| 663 | #define b_7 %g4 | ||
| 664 | |||
| 665 | .align 32 | ||
| 666 | .global bn_mul_comba8 | ||
| 667 | /* | ||
| 668 | * void bn_mul_comba8(r,a,b) | ||
| 669 | * BN_ULONG *r,*a,*b; | ||
| 670 | */ | ||
| 671 | bn_mul_comba8: | ||
| 672 | save %sp,FRAME_SIZE,%sp | ||
| 673 | mov 1,t_2 | ||
| 674 | lduw ap(0),a_0 | ||
| 675 | sllx t_2,32,t_2 | ||
| 676 | lduw bp(0),b_0 != | ||
| 677 | lduw bp(1),b_1 | ||
| 678 | mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3); | ||
| 679 | srlx t_1,32,c_12 | ||
| 680 | stuw t_1,rp(0) !=!r[0]=c1; | ||
| 681 | |||
| 682 | lduw ap(1),a_1 | ||
| 683 | mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1); | ||
| 684 | addcc c_12,t_1,c_12 | ||
| 685 | clr c_3 != | ||
| 686 | bcs,a %xcc,.+8 | ||
| 687 | add c_3,t_2,c_3 | ||
| 688 | lduw ap(2),a_2 | ||
| 689 | mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1); | ||
| 690 | addcc c_12,t_1,t_1 | ||
| 691 | bcs,a %xcc,.+8 | ||
| 692 | add c_3,t_2,c_3 | ||
| 693 | srlx t_1,32,c_12 != | ||
| 694 | stuw t_1,rp(1) !r[1]=c2; | ||
| 695 | or c_12,c_3,c_12 | ||
| 696 | |||
| 697 | mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2); | ||
| 698 | addcc c_12,t_1,c_12 != | ||
| 699 | clr c_3 | ||
| 700 | bcs,a %xcc,.+8 | ||
| 701 | add c_3,t_2,c_3 | ||
| 702 | lduw bp(2),b_2 != | ||
| 703 | mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2); | ||
| 704 | addcc c_12,t_1,c_12 | ||
| 705 | bcs,a %xcc,.+8 | ||
| 706 | add c_3,t_2,c_3 != | ||
| 707 | lduw bp(3),b_3 | ||
| 708 | mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2); | ||
| 709 | addcc c_12,t_1,t_1 | ||
| 710 | bcs,a %xcc,.+8 != | ||
| 711 | add c_3,t_2,c_3 | ||
| 712 | srlx t_1,32,c_12 | ||
| 713 | stuw t_1,rp(2) !r[2]=c3; | ||
| 714 | or c_12,c_3,c_12 != | ||
| 715 | |||
| 716 | mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3); | ||
| 717 | addcc c_12,t_1,c_12 | ||
| 718 | clr c_3 | ||
| 719 | bcs,a %xcc,.+8 != | ||
| 720 | add c_3,t_2,c_3 | ||
| 721 | mulx a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3); | ||
| 722 | addcc c_12,t_1,c_12 | ||
| 723 | bcs,a %xcc,.+8 != | ||
| 724 | add c_3,t_2,c_3 | ||
| 725 | lduw ap(3),a_3 | ||
| 726 | mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3); | ||
| 727 | addcc c_12,t_1,c_12 != | ||
| 728 | bcs,a %xcc,.+8 | ||
| 729 | add c_3,t_2,c_3 | ||
| 730 | lduw ap(4),a_4 | ||
| 731 | mulx a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);!= | ||
| 732 | addcc c_12,t_1,t_1 | ||
| 733 | bcs,a %xcc,.+8 | ||
| 734 | add c_3,t_2,c_3 | ||
| 735 | srlx t_1,32,c_12 != | ||
| 736 | stuw t_1,rp(3) !r[3]=c1; | ||
| 737 | or c_12,c_3,c_12 | ||
| 738 | |||
| 739 | mulx a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1); | ||
| 740 | addcc c_12,t_1,c_12 != | ||
| 741 | clr c_3 | ||
| 742 | bcs,a %xcc,.+8 | ||
| 743 | add c_3,t_2,c_3 | ||
| 744 | mulx a_3,b_1,t_1 !=!mul_add_c(a[3],b[1],c2,c3,c1); | ||
| 745 | addcc c_12,t_1,c_12 | ||
| 746 | bcs,a %xcc,.+8 | ||
| 747 | add c_3,t_2,c_3 | ||
| 748 | mulx a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1); | ||
| 749 | addcc c_12,t_1,c_12 | ||
| 750 | bcs,a %xcc,.+8 | ||
| 751 | add c_3,t_2,c_3 | ||
| 752 | lduw bp(4),b_4 != | ||
| 753 | mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1); | ||
| 754 | addcc c_12,t_1,c_12 | ||
| 755 | bcs,a %xcc,.+8 | ||
| 756 | add c_3,t_2,c_3 != | ||
| 757 | lduw bp(5),b_5 | ||
| 758 | mulx a_0,b_4,t_1 !mul_add_c(a[0],b[4],c2,c3,c1); | ||
| 759 | addcc c_12,t_1,t_1 | ||
| 760 | bcs,a %xcc,.+8 != | ||
| 761 | add c_3,t_2,c_3 | ||
| 762 | srlx t_1,32,c_12 | ||
| 763 | stuw t_1,rp(4) !r[4]=c2; | ||
| 764 | or c_12,c_3,c_12 != | ||
| 765 | |||
| 766 | mulx a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2); | ||
| 767 | addcc c_12,t_1,c_12 | ||
| 768 | clr c_3 | ||
| 769 | bcs,a %xcc,.+8 != | ||
| 770 | add c_3,t_2,c_3 | ||
| 771 | mulx a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2); | ||
| 772 | addcc c_12,t_1,c_12 | ||
| 773 | bcs,a %xcc,.+8 != | ||
| 774 | add c_3,t_2,c_3 | ||
| 775 | mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2); | ||
| 776 | addcc c_12,t_1,c_12 | ||
| 777 | bcs,a %xcc,.+8 != | ||
| 778 | add c_3,t_2,c_3 | ||
| 779 | mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2); | ||
| 780 | addcc c_12,t_1,c_12 | ||
| 781 | bcs,a %xcc,.+8 != | ||
| 782 | add c_3,t_2,c_3 | ||
| 783 | lduw ap(5),a_5 | ||
| 784 | mulx a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2); | ||
| 785 | addcc c_12,t_1,c_12 != | ||
| 786 | bcs,a %xcc,.+8 | ||
| 787 | add c_3,t_2,c_3 | ||
| 788 | lduw ap(6),a_6 | ||
| 789 | mulx a_5,b_0,t_1 !=!mul_add_c(a[5],b[0],c3,c1,c2); | ||
| 790 | addcc c_12,t_1,t_1 | ||
| 791 | bcs,a %xcc,.+8 | ||
| 792 | add c_3,t_2,c_3 | ||
| 793 | srlx t_1,32,c_12 != | ||
| 794 | stuw t_1,rp(5) !r[5]=c3; | ||
| 795 | or c_12,c_3,c_12 | ||
| 796 | |||
| 797 | mulx a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3); | ||
| 798 | addcc c_12,t_1,c_12 != | ||
| 799 | clr c_3 | ||
| 800 | bcs,a %xcc,.+8 | ||
| 801 | add c_3,t_2,c_3 | ||
| 802 | mulx a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3); | ||
| 803 | addcc c_12,t_1,c_12 | ||
| 804 | bcs,a %xcc,.+8 | ||
| 805 | add c_3,t_2,c_3 | ||
| 806 | mulx a_4,b_2,t_1 !=!mul_add_c(a[4],b[2],c1,c2,c3); | ||
| 807 | addcc c_12,t_1,c_12 | ||
| 808 | bcs,a %xcc,.+8 | ||
| 809 | add c_3,t_2,c_3 | ||
| 810 | mulx a_3,b_3,t_1 !=!mul_add_c(a[3],b[3],c1,c2,c3); | ||
| 811 | addcc c_12,t_1,c_12 | ||
| 812 | bcs,a %xcc,.+8 | ||
| 813 | add c_3,t_2,c_3 | ||
| 814 | mulx a_2,b_4,t_1 !=!mul_add_c(a[2],b[4],c1,c2,c3); | ||
| 815 | addcc c_12,t_1,c_12 | ||
| 816 | bcs,a %xcc,.+8 | ||
| 817 | add c_3,t_2,c_3 | ||
| 818 | lduw bp(6),b_6 != | ||
| 819 | mulx a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3); | ||
| 820 | addcc c_12,t_1,c_12 | ||
| 821 | bcs,a %xcc,.+8 | ||
| 822 | add c_3,t_2,c_3 != | ||
| 823 | lduw bp(7),b_7 | ||
| 824 | mulx a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3); | ||
| 825 | addcc c_12,t_1,t_1 | ||
| 826 | bcs,a %xcc,.+8 != | ||
| 827 | add c_3,t_2,c_3 | ||
| 828 | srlx t_1,32,c_12 | ||
| 829 | stuw t_1,rp(6) !r[6]=c1; | ||
| 830 | or c_12,c_3,c_12 != | ||
| 831 | |||
| 832 | mulx a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1); | ||
| 833 | addcc c_12,t_1,c_12 | ||
| 834 | clr c_3 | ||
| 835 | bcs,a %xcc,.+8 != | ||
| 836 | add c_3,t_2,c_3 | ||
| 837 | mulx a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1); | ||
| 838 | addcc c_12,t_1,c_12 | ||
| 839 | bcs,a %xcc,.+8 != | ||
| 840 | add c_3,t_2,c_3 | ||
| 841 | mulx a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1); | ||
| 842 | addcc c_12,t_1,c_12 | ||
| 843 | bcs,a %xcc,.+8 != | ||
| 844 | add c_3,t_2,c_3 | ||
| 845 | mulx a_3,b_4,t_1 !mul_add_c(a[3],b[4],c2,c3,c1); | ||
| 846 | addcc c_12,t_1,c_12 | ||
| 847 | bcs,a %xcc,.+8 != | ||
| 848 | add c_3,t_2,c_3 | ||
| 849 | mulx a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1); | ||
| 850 | addcc c_12,t_1,c_12 | ||
| 851 | bcs,a %xcc,.+8 != | ||
| 852 | add c_3,t_2,c_3 | ||
| 853 | mulx a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1); | ||
| 854 | addcc c_12,t_1,c_12 | ||
| 855 | bcs,a %xcc,.+8 != | ||
| 856 | add c_3,t_2,c_3 | ||
| 857 | lduw ap(7),a_7 | ||
| 858 | mulx a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1); | ||
| 859 | addcc c_12,t_1,c_12 | ||
| 860 | bcs,a %xcc,.+8 | ||
| 861 | add c_3,t_2,c_3 | ||
| 862 | mulx a_7,b_0,t_1 !=!mul_add_c(a[7],b[0],c2,c3,c1); | ||
| 863 | addcc c_12,t_1,t_1 | ||
| 864 | bcs,a %xcc,.+8 | ||
| 865 | add c_3,t_2,c_3 | ||
| 866 | srlx t_1,32,c_12 != | ||
| 867 | stuw t_1,rp(7) !r[7]=c2; | ||
| 868 | or c_12,c_3,c_12 | ||
| 869 | |||
| 870 | mulx a_7,b_1,t_1 !=!mul_add_c(a[7],b[1],c3,c1,c2); | ||
| 871 | addcc c_12,t_1,c_12 | ||
| 872 | clr c_3 | ||
| 873 | bcs,a %xcc,.+8 | ||
| 874 | add c_3,t_2,c_3 != | ||
| 875 | mulx a_6,b_2,t_1 !mul_add_c(a[6],b[2],c3,c1,c2); | ||
| 876 | addcc c_12,t_1,c_12 | ||
| 877 | bcs,a %xcc,.+8 | ||
| 878 | add c_3,t_2,c_3 != | ||
| 879 | mulx a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2); | ||
| 880 | addcc c_12,t_1,c_12 | ||
| 881 | bcs,a %xcc,.+8 | ||
| 882 | add c_3,t_2,c_3 != | ||
| 883 | mulx a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2); | ||
| 884 | addcc c_12,t_1,c_12 | ||
| 885 | bcs,a %xcc,.+8 | ||
| 886 | add c_3,t_2,c_3 != | ||
| 887 | mulx a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2); | ||
| 888 | addcc c_12,t_1,c_12 | ||
| 889 | bcs,a %xcc,.+8 | ||
| 890 | add c_3,t_2,c_3 != | ||
| 891 | mulx a_2,b_6,t_1 !mul_add_c(a[2],b[6],c3,c1,c2); | ||
| 892 | addcc c_12,t_1,c_12 | ||
| 893 | bcs,a %xcc,.+8 | ||
| 894 | add c_3,t_2,c_3 != | ||
| 895 | mulx a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2); | ||
| 896 | addcc c_12,t_1,t_1 | ||
| 897 | bcs,a %xcc,.+8 | ||
| 898 | add c_3,t_2,c_3 != | ||
| 899 | srlx t_1,32,c_12 | ||
| 900 | stuw t_1,rp(8) !r[8]=c3; | ||
| 901 | or c_12,c_3,c_12 | ||
| 902 | |||
| 903 | mulx a_2,b_7,t_1 !=!mul_add_c(a[2],b[7],c1,c2,c3); | ||
| 904 | addcc c_12,t_1,c_12 | ||
| 905 | clr c_3 | ||
| 906 | bcs,a %xcc,.+8 | ||
| 907 | add c_3,t_2,c_3 != | ||
| 908 | mulx a_3,b_6,t_1 !mul_add_c(a[3],b[6],c1,c2,c3); | ||
| 909 | addcc c_12,t_1,c_12 | ||
| 910 | bcs,a %xcc,.+8 != | ||
| 911 | add c_3,t_2,c_3 | ||
| 912 | mulx a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3); | ||
| 913 | addcc c_12,t_1,c_12 | ||
| 914 | bcs,a %xcc,.+8 != | ||
| 915 | add c_3,t_2,c_3 | ||
| 916 | mulx a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3); | ||
| 917 | addcc c_12,t_1,c_12 | ||
| 918 | bcs,a %xcc,.+8 != | ||
| 919 | add c_3,t_2,c_3 | ||
| 920 | mulx a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3); | ||
| 921 | addcc c_12,t_1,c_12 | ||
| 922 | bcs,a %xcc,.+8 != | ||
| 923 | add c_3,t_2,c_3 | ||
| 924 | mulx a_7,b_2,t_1 !mul_add_c(a[7],b[2],c1,c2,c3); | ||
| 925 | addcc c_12,t_1,t_1 | ||
| 926 | bcs,a %xcc,.+8 != | ||
| 927 | add c_3,t_2,c_3 | ||
| 928 | srlx t_1,32,c_12 | ||
| 929 | stuw t_1,rp(9) !r[9]=c1; | ||
| 930 | or c_12,c_3,c_12 != | ||
| 931 | |||
| 932 | mulx a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1); | ||
| 933 | addcc c_12,t_1,c_12 | ||
| 934 | clr c_3 | ||
| 935 | bcs,a %xcc,.+8 != | ||
| 936 | add c_3,t_2,c_3 | ||
| 937 | mulx a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1); | ||
| 938 | addcc c_12,t_1,c_12 | ||
| 939 | bcs,a %xcc,.+8 != | ||
| 940 | add c_3,t_2,c_3 | ||
| 941 | mulx a_5,b_5,t_1 !mul_add_c(a[5],b[5],c2,c3,c1); | ||
| 942 | addcc c_12,t_1,c_12 | ||
| 943 | bcs,a %xcc,.+8 != | ||
| 944 | add c_3,t_2,c_3 | ||
| 945 | mulx a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1); | ||
| 946 | addcc c_12,t_1,c_12 | ||
| 947 | bcs,a %xcc,.+8 != | ||
| 948 | add c_3,t_2,c_3 | ||
| 949 | mulx a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1); | ||
| 950 | addcc c_12,t_1,t_1 | ||
| 951 | bcs,a %xcc,.+8 != | ||
| 952 | add c_3,t_2,c_3 | ||
| 953 | srlx t_1,32,c_12 | ||
| 954 | stuw t_1,rp(10) !r[10]=c2; | ||
| 955 | or c_12,c_3,c_12 != | ||
| 956 | |||
| 957 | mulx a_4,b_7,t_1 !mul_add_c(a[4],b[7],c3,c1,c2); | ||
| 958 | addcc c_12,t_1,c_12 | ||
| 959 | clr c_3 | ||
| 960 | bcs,a %xcc,.+8 != | ||
| 961 | add c_3,t_2,c_3 | ||
| 962 | mulx a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2); | ||
| 963 | addcc c_12,t_1,c_12 | ||
| 964 | bcs,a %xcc,.+8 != | ||
| 965 | add c_3,t_2,c_3 | ||
| 966 | mulx a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2); | ||
| 967 | addcc c_12,t_1,c_12 | ||
| 968 | bcs,a %xcc,.+8 != | ||
| 969 | add c_3,t_2,c_3 | ||
| 970 | mulx a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2); | ||
| 971 | addcc c_12,t_1,t_1 | ||
| 972 | bcs,a %xcc,.+8 != | ||
| 973 | add c_3,t_2,c_3 | ||
| 974 | srlx t_1,32,c_12 | ||
| 975 | stuw t_1,rp(11) !r[11]=c3; | ||
| 976 | or c_12,c_3,c_12 != | ||
| 977 | |||
| 978 | mulx a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3); | ||
| 979 | addcc c_12,t_1,c_12 | ||
| 980 | clr c_3 | ||
| 981 | bcs,a %xcc,.+8 != | ||
| 982 | add c_3,t_2,c_3 | ||
| 983 | mulx a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3); | ||
| 984 | addcc c_12,t_1,c_12 | ||
| 985 | bcs,a %xcc,.+8 != | ||
| 986 | add c_3,t_2,c_3 | ||
| 987 | mulx a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3); | ||
| 988 | addcc c_12,t_1,t_1 | ||
| 989 | bcs,a %xcc,.+8 != | ||
| 990 | add c_3,t_2,c_3 | ||
| 991 | srlx t_1,32,c_12 | ||
| 992 | stuw t_1,rp(12) !r[12]=c1; | ||
| 993 | or c_12,c_3,c_12 != | ||
| 994 | |||
| 995 | mulx a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1); | ||
| 996 | addcc c_12,t_1,c_12 | ||
| 997 | clr c_3 | ||
| 998 | bcs,a %xcc,.+8 != | ||
| 999 | add c_3,t_2,c_3 | ||
| 1000 | mulx a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1); | ||
| 1001 | addcc c_12,t_1,t_1 | ||
| 1002 | bcs,a %xcc,.+8 != | ||
| 1003 | add c_3,t_2,c_3 | ||
| 1004 | srlx t_1,32,c_12 | ||
| 1005 | stuw t_1,rp(13) !r[13]=c2; | ||
| 1006 | or c_12,c_3,c_12 != | ||
| 1007 | |||
| 1008 | mulx a_7,b_7,t_1 !mul_add_c(a[7],b[7],c3,c1,c2); | ||
| 1009 | addcc c_12,t_1,t_1 | ||
| 1010 | srlx t_1,32,c_12 != | ||
| 1011 | stuw t_1,rp(14) !r[14]=c3; | ||
| 1012 | stuw c_12,rp(15) !r[15]=c1; | ||
| 1013 | |||
| 1014 | ret | ||
| 1015 | restore %g0,%g0,%o0 != | ||
| 1016 | |||
| 1017 | .type bn_mul_comba8,#function | ||
| 1018 | .size bn_mul_comba8,(.-bn_mul_comba8) | ||
| 1019 | |||
| 1020 | .align 32 | ||
| 1021 | |||
| 1022 | .global bn_mul_comba4 | ||
| 1023 | /* | ||
| 1024 | * void bn_mul_comba4(r,a,b) | ||
| 1025 | * BN_ULONG *r,*a,*b; | ||
| 1026 | */ | ||
| 1027 | bn_mul_comba4: | ||
| 1028 | save %sp,FRAME_SIZE,%sp | ||
| 1029 | lduw ap(0),a_0 | ||
| 1030 | mov 1,t_2 | ||
| 1031 | lduw bp(0),b_0 | ||
| 1032 | sllx t_2,32,t_2 != | ||
| 1033 | lduw bp(1),b_1 | ||
| 1034 | mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3); | ||
| 1035 | srlx t_1,32,c_12 | ||
| 1036 | stuw t_1,rp(0) !=!r[0]=c1; | ||
| 1037 | |||
| 1038 | lduw ap(1),a_1 | ||
| 1039 | mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1); | ||
| 1040 | addcc c_12,t_1,c_12 | ||
| 1041 | clr c_3 != | ||
| 1042 | bcs,a %xcc,.+8 | ||
| 1043 | add c_3,t_2,c_3 | ||
| 1044 | lduw ap(2),a_2 | ||
| 1045 | mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1); | ||
| 1046 | addcc c_12,t_1,t_1 | ||
| 1047 | bcs,a %xcc,.+8 | ||
| 1048 | add c_3,t_2,c_3 | ||
| 1049 | srlx t_1,32,c_12 != | ||
| 1050 | stuw t_1,rp(1) !r[1]=c2; | ||
| 1051 | or c_12,c_3,c_12 | ||
| 1052 | |||
| 1053 | mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2); | ||
| 1054 | addcc c_12,t_1,c_12 != | ||
| 1055 | clr c_3 | ||
| 1056 | bcs,a %xcc,.+8 | ||
| 1057 | add c_3,t_2,c_3 | ||
| 1058 | lduw bp(2),b_2 != | ||
| 1059 | mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2); | ||
| 1060 | addcc c_12,t_1,c_12 | ||
| 1061 | bcs,a %xcc,.+8 | ||
| 1062 | add c_3,t_2,c_3 != | ||
| 1063 | lduw bp(3),b_3 | ||
| 1064 | mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2); | ||
| 1065 | addcc c_12,t_1,t_1 | ||
| 1066 | bcs,a %xcc,.+8 != | ||
| 1067 | add c_3,t_2,c_3 | ||
| 1068 | srlx t_1,32,c_12 | ||
| 1069 | stuw t_1,rp(2) !r[2]=c3; | ||
| 1070 | or c_12,c_3,c_12 != | ||
| 1071 | |||
| 1072 | mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3); | ||
| 1073 | addcc c_12,t_1,c_12 | ||
| 1074 | clr c_3 | ||
| 1075 | bcs,a %xcc,.+8 != | ||
| 1076 | add c_3,t_2,c_3 | ||
| 1077 | mulx a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3); | ||
| 1078 | addcc c_12,t_1,c_12 | ||
| 1079 | bcs,a %xcc,.+8 != | ||
| 1080 | add c_3,t_2,c_3 | ||
| 1081 | lduw ap(3),a_3 | ||
| 1082 | mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3); | ||
| 1083 | addcc c_12,t_1,c_12 != | ||
| 1084 | bcs,a %xcc,.+8 | ||
| 1085 | add c_3,t_2,c_3 | ||
| 1086 | mulx a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!= | ||
| 1087 | addcc c_12,t_1,t_1 != | ||
| 1088 | bcs,a %xcc,.+8 | ||
| 1089 | add c_3,t_2,c_3 | ||
| 1090 | srlx t_1,32,c_12 | ||
| 1091 | stuw t_1,rp(3) !=!r[3]=c1; | ||
| 1092 | or c_12,c_3,c_12 | ||
| 1093 | |||
| 1094 | mulx a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1); | ||
| 1095 | addcc c_12,t_1,c_12 | ||
| 1096 | clr c_3 != | ||
| 1097 | bcs,a %xcc,.+8 | ||
| 1098 | add c_3,t_2,c_3 | ||
| 1099 | mulx a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1); | ||
| 1100 | addcc c_12,t_1,c_12 != | ||
| 1101 | bcs,a %xcc,.+8 | ||
| 1102 | add c_3,t_2,c_3 | ||
| 1103 | mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1); | ||
| 1104 | addcc c_12,t_1,t_1 != | ||
| 1105 | bcs,a %xcc,.+8 | ||
| 1106 | add c_3,t_2,c_3 | ||
| 1107 | srlx t_1,32,c_12 | ||
| 1108 | stuw t_1,rp(4) !=!r[4]=c2; | ||
| 1109 | or c_12,c_3,c_12 | ||
| 1110 | |||
| 1111 | mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2); | ||
| 1112 | addcc c_12,t_1,c_12 | ||
| 1113 | clr c_3 != | ||
| 1114 | bcs,a %xcc,.+8 | ||
| 1115 | add c_3,t_2,c_3 | ||
| 1116 | mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2); | ||
| 1117 | addcc c_12,t_1,t_1 != | ||
| 1118 | bcs,a %xcc,.+8 | ||
| 1119 | add c_3,t_2,c_3 | ||
| 1120 | srlx t_1,32,c_12 | ||
| 1121 | stuw t_1,rp(5) !=!r[5]=c3; | ||
| 1122 | or c_12,c_3,c_12 | ||
| 1123 | |||
| 1124 | mulx a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3); | ||
| 1125 | addcc c_12,t_1,t_1 | ||
| 1126 | srlx t_1,32,c_12 != | ||
| 1127 | stuw t_1,rp(6) !r[6]=c1; | ||
| 1128 | stuw c_12,rp(7) !r[7]=c2; | ||
| 1129 | |||
| 1130 | ret | ||
| 1131 | restore %g0,%g0,%o0 | ||
| 1132 | |||
| 1133 | .type bn_mul_comba4,#function | ||
| 1134 | .size bn_mul_comba4,(.-bn_mul_comba4) | ||
| 1135 | |||
| 1136 | .align 32 | ||
| 1137 | |||
| 1138 | .global bn_sqr_comba8 | ||
| 1139 | bn_sqr_comba8: | ||
| 1140 | save %sp,FRAME_SIZE,%sp | ||
| 1141 | mov 1,t_2 | ||
| 1142 | lduw ap(0),a_0 | ||
| 1143 | sllx t_2,32,t_2 | ||
| 1144 | lduw ap(1),a_1 | ||
| 1145 | mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3); | ||
| 1146 | srlx t_1,32,c_12 | ||
| 1147 | stuw t_1,rp(0) !r[0]=c1; | ||
| 1148 | |||
| 1149 | lduw ap(2),a_2 | ||
| 1150 | mulx a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1); | ||
| 1151 | addcc c_12,t_1,c_12 | ||
| 1152 | clr c_3 | ||
| 1153 | bcs,a %xcc,.+8 | ||
| 1154 | add c_3,t_2,c_3 | ||
| 1155 | addcc c_12,t_1,t_1 | ||
| 1156 | bcs,a %xcc,.+8 | ||
| 1157 | add c_3,t_2,c_3 | ||
| 1158 | srlx t_1,32,c_12 | ||
| 1159 | stuw t_1,rp(1) !r[1]=c2; | ||
| 1160 | or c_12,c_3,c_12 | ||
| 1161 | |||
| 1162 | mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2); | ||
| 1163 | addcc c_12,t_1,c_12 | ||
| 1164 | clr c_3 | ||
| 1165 | bcs,a %xcc,.+8 | ||
| 1166 | add c_3,t_2,c_3 | ||
| 1167 | addcc c_12,t_1,c_12 | ||
| 1168 | bcs,a %xcc,.+8 | ||
| 1169 | add c_3,t_2,c_3 | ||
| 1170 | lduw ap(3),a_3 | ||
| 1171 | mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2); | ||
| 1172 | addcc c_12,t_1,t_1 | ||
| 1173 | bcs,a %xcc,.+8 | ||
| 1174 | add c_3,t_2,c_3 | ||
| 1175 | srlx t_1,32,c_12 | ||
| 1176 | stuw t_1,rp(2) !r[2]=c3; | ||
| 1177 | or c_12,c_3,c_12 | ||
| 1178 | |||
| 1179 | mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3); | ||
| 1180 | addcc c_12,t_1,c_12 | ||
| 1181 | clr c_3 | ||
| 1182 | bcs,a %xcc,.+8 | ||
| 1183 | add c_3,t_2,c_3 | ||
| 1184 | addcc c_12,t_1,c_12 | ||
| 1185 | bcs,a %xcc,.+8 | ||
| 1186 | add c_3,t_2,c_3 | ||
| 1187 | lduw ap(4),a_4 | ||
| 1188 | mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3); | ||
| 1189 | addcc c_12,t_1,c_12 | ||
| 1190 | bcs,a %xcc,.+8 | ||
| 1191 | add c_3,t_2,c_3 | ||
| 1192 | addcc c_12,t_1,t_1 | ||
| 1193 | bcs,a %xcc,.+8 | ||
| 1194 | add c_3,t_2,c_3 | ||
| 1195 | srlx t_1,32,c_12 | ||
| 1196 | stuw t_1,rp(3) !r[3]=c1; | ||
| 1197 | or c_12,c_3,c_12 | ||
| 1198 | |||
| 1199 | mulx a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1); | ||
| 1200 | addcc c_12,t_1,c_12 | ||
| 1201 | clr c_3 | ||
| 1202 | bcs,a %xcc,.+8 | ||
| 1203 | add c_3,t_2,c_3 | ||
| 1204 | addcc c_12,t_1,c_12 | ||
| 1205 | bcs,a %xcc,.+8 | ||
| 1206 | add c_3,t_2,c_3 | ||
| 1207 | mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1); | ||
| 1208 | addcc c_12,t_1,c_12 | ||
| 1209 | bcs,a %xcc,.+8 | ||
| 1210 | add c_3,t_2,c_3 | ||
| 1211 | addcc c_12,t_1,c_12 | ||
| 1212 | bcs,a %xcc,.+8 | ||
| 1213 | add c_3,t_2,c_3 | ||
| 1214 | lduw ap(5),a_5 | ||
| 1215 | mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1); | ||
| 1216 | addcc c_12,t_1,t_1 | ||
| 1217 | bcs,a %xcc,.+8 | ||
| 1218 | add c_3,t_2,c_3 | ||
| 1219 | srlx t_1,32,c_12 | ||
| 1220 | stuw t_1,rp(4) !r[4]=c2; | ||
| 1221 | or c_12,c_3,c_12 | ||
| 1222 | |||
| 1223 | mulx a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2); | ||
| 1224 | addcc c_12,t_1,c_12 | ||
| 1225 | clr c_3 | ||
| 1226 | bcs,a %xcc,.+8 | ||
| 1227 | add c_3,t_2,c_3 | ||
| 1228 | addcc c_12,t_1,c_12 | ||
| 1229 | bcs,a %xcc,.+8 | ||
| 1230 | add c_3,t_2,c_3 | ||
| 1231 | mulx a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2); | ||
| 1232 | addcc c_12,t_1,c_12 | ||
| 1233 | bcs,a %xcc,.+8 | ||
| 1234 | add c_3,t_2,c_3 | ||
| 1235 | addcc c_12,t_1,c_12 | ||
| 1236 | bcs,a %xcc,.+8 | ||
| 1237 | add c_3,t_2,c_3 | ||
| 1238 | lduw ap(6),a_6 | ||
| 1239 | mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2); | ||
| 1240 | addcc c_12,t_1,c_12 | ||
| 1241 | bcs,a %xcc,.+8 | ||
| 1242 | add c_3,t_2,c_3 | ||
| 1243 | addcc c_12,t_1,t_1 | ||
| 1244 | bcs,a %xcc,.+8 | ||
| 1245 | add c_3,t_2,c_3 | ||
| 1246 | srlx t_1,32,c_12 | ||
| 1247 | stuw t_1,rp(5) !r[5]=c3; | ||
| 1248 | or c_12,c_3,c_12 | ||
| 1249 | |||
| 1250 | mulx a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3); | ||
| 1251 | addcc c_12,t_1,c_12 | ||
| 1252 | clr c_3 | ||
| 1253 | bcs,a %xcc,.+8 | ||
| 1254 | add c_3,t_2,c_3 | ||
| 1255 | addcc c_12,t_1,c_12 | ||
| 1256 | bcs,a %xcc,.+8 | ||
| 1257 | add c_3,t_2,c_3 | ||
| 1258 | mulx a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3); | ||
| 1259 | addcc c_12,t_1,c_12 | ||
| 1260 | bcs,a %xcc,.+8 | ||
| 1261 | add c_3,t_2,c_3 | ||
| 1262 | addcc c_12,t_1,c_12 | ||
| 1263 | bcs,a %xcc,.+8 | ||
| 1264 | add c_3,t_2,c_3 | ||
| 1265 | mulx a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3); | ||
| 1266 | addcc c_12,t_1,c_12 | ||
| 1267 | bcs,a %xcc,.+8 | ||
| 1268 | add c_3,t_2,c_3 | ||
| 1269 | addcc c_12,t_1,c_12 | ||
| 1270 | bcs,a %xcc,.+8 | ||
| 1271 | add c_3,t_2,c_3 | ||
| 1272 | lduw ap(7),a_7 | ||
| 1273 | mulx a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3); | ||
| 1274 | addcc c_12,t_1,t_1 | ||
| 1275 | bcs,a %xcc,.+8 | ||
| 1276 | add c_3,t_2,c_3 | ||
| 1277 | srlx t_1,32,c_12 | ||
| 1278 | stuw t_1,rp(6) !r[6]=c1; | ||
| 1279 | or c_12,c_3,c_12 | ||
| 1280 | |||
| 1281 | mulx a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1); | ||
| 1282 | addcc c_12,t_1,c_12 | ||
| 1283 | clr c_3 | ||
| 1284 | bcs,a %xcc,.+8 | ||
| 1285 | add c_3,t_2,c_3 | ||
| 1286 | addcc c_12,t_1,c_12 | ||
| 1287 | bcs,a %xcc,.+8 | ||
| 1288 | add c_3,t_2,c_3 | ||
| 1289 | mulx a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1); | ||
| 1290 | addcc c_12,t_1,c_12 | ||
| 1291 | bcs,a %xcc,.+8 | ||
| 1292 | add c_3,t_2,c_3 | ||
| 1293 | addcc c_12,t_1,c_12 | ||
| 1294 | bcs,a %xcc,.+8 | ||
| 1295 | add c_3,t_2,c_3 | ||
| 1296 | mulx a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1); | ||
| 1297 | addcc c_12,t_1,c_12 | ||
| 1298 | bcs,a %xcc,.+8 | ||
| 1299 | add c_3,t_2,c_3 | ||
| 1300 | addcc c_12,t_1,c_12 | ||
| 1301 | bcs,a %xcc,.+8 | ||
| 1302 | add c_3,t_2,c_3 | ||
| 1303 | mulx a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1); | ||
| 1304 | addcc c_12,t_1,c_12 | ||
| 1305 | bcs,a %xcc,.+8 | ||
| 1306 | add c_3,t_2,c_3 | ||
| 1307 | addcc c_12,t_1,t_1 | ||
| 1308 | bcs,a %xcc,.+8 | ||
| 1309 | add c_3,t_2,c_3 | ||
| 1310 | srlx t_1,32,c_12 | ||
| 1311 | stuw t_1,rp(7) !r[7]=c2; | ||
| 1312 | or c_12,c_3,c_12 | ||
| 1313 | |||
| 1314 | mulx a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2); | ||
| 1315 | addcc c_12,t_1,c_12 | ||
| 1316 | clr c_3 | ||
| 1317 | bcs,a %xcc,.+8 | ||
| 1318 | add c_3,t_2,c_3 | ||
| 1319 | addcc c_12,t_1,c_12 | ||
| 1320 | bcs,a %xcc,.+8 | ||
| 1321 | add c_3,t_2,c_3 | ||
| 1322 | mulx a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2); | ||
| 1323 | addcc c_12,t_1,c_12 | ||
| 1324 | bcs,a %xcc,.+8 | ||
| 1325 | add c_3,t_2,c_3 | ||
| 1326 | addcc c_12,t_1,c_12 | ||
| 1327 | bcs,a %xcc,.+8 | ||
| 1328 | add c_3,t_2,c_3 | ||
| 1329 | mulx a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2); | ||
| 1330 | addcc c_12,t_1,c_12 | ||
| 1331 | bcs,a %xcc,.+8 | ||
| 1332 | add c_3,t_2,c_3 | ||
| 1333 | addcc c_12,t_1,c_12 | ||
| 1334 | bcs,a %xcc,.+8 | ||
| 1335 | add c_3,t_2,c_3 | ||
| 1336 | mulx a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2); | ||
| 1337 | addcc c_12,t_1,t_1 | ||
| 1338 | bcs,a %xcc,.+8 | ||
| 1339 | add c_3,t_2,c_3 | ||
| 1340 | srlx t_1,32,c_12 | ||
| 1341 | stuw t_1,rp(8) !r[8]=c3; | ||
| 1342 | or c_12,c_3,c_12 | ||
| 1343 | |||
| 1344 | mulx a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3); | ||
| 1345 | addcc c_12,t_1,c_12 | ||
| 1346 | clr c_3 | ||
| 1347 | bcs,a %xcc,.+8 | ||
| 1348 | add c_3,t_2,c_3 | ||
| 1349 | addcc c_12,t_1,c_12 | ||
| 1350 | bcs,a %xcc,.+8 | ||
| 1351 | add c_3,t_2,c_3 | ||
| 1352 | mulx a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3); | ||
| 1353 | addcc c_12,t_1,c_12 | ||
| 1354 | bcs,a %xcc,.+8 | ||
| 1355 | add c_3,t_2,c_3 | ||
| 1356 | addcc c_12,t_1,c_12 | ||
| 1357 | bcs,a %xcc,.+8 | ||
| 1358 | add c_3,t_2,c_3 | ||
| 1359 | mulx a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3); | ||
| 1360 | addcc c_12,t_1,c_12 | ||
| 1361 | bcs,a %xcc,.+8 | ||
| 1362 | add c_3,t_2,c_3 | ||
| 1363 | addcc c_12,t_1,t_1 | ||
| 1364 | bcs,a %xcc,.+8 | ||
| 1365 | add c_3,t_2,c_3 | ||
| 1366 | srlx t_1,32,c_12 | ||
| 1367 | stuw t_1,rp(9) !r[9]=c1; | ||
| 1368 | or c_12,c_3,c_12 | ||
| 1369 | |||
| 1370 | mulx a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1); | ||
| 1371 | addcc c_12,t_1,c_12 | ||
| 1372 | clr c_3 | ||
| 1373 | bcs,a %xcc,.+8 | ||
| 1374 | add c_3,t_2,c_3 | ||
| 1375 | addcc c_12,t_1,c_12 | ||
| 1376 | bcs,a %xcc,.+8 | ||
| 1377 | add c_3,t_2,c_3 | ||
| 1378 | mulx a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1); | ||
| 1379 | addcc c_12,t_1,c_12 | ||
| 1380 | bcs,a %xcc,.+8 | ||
| 1381 | add c_3,t_2,c_3 | ||
| 1382 | addcc c_12,t_1,c_12 | ||
| 1383 | bcs,a %xcc,.+8 | ||
| 1384 | add c_3,t_2,c_3 | ||
| 1385 | mulx a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1); | ||
| 1386 | addcc c_12,t_1,t_1 | ||
| 1387 | bcs,a %xcc,.+8 | ||
| 1388 | add c_3,t_2,c_3 | ||
| 1389 | srlx t_1,32,c_12 | ||
| 1390 | stuw t_1,rp(10) !r[10]=c2; | ||
| 1391 | or c_12,c_3,c_12 | ||
| 1392 | |||
| 1393 | mulx a_4,a_7,t_1 !sqr_add_c2(a,7,4,c3,c1,c2); | ||
| 1394 | addcc c_12,t_1,c_12 | ||
| 1395 | clr c_3 | ||
| 1396 | bcs,a %xcc,.+8 | ||
| 1397 | add c_3,t_2,c_3 | ||
| 1398 | addcc c_12,t_1,c_12 | ||
| 1399 | bcs,a %xcc,.+8 | ||
| 1400 | add c_3,t_2,c_3 | ||
| 1401 | mulx a_5,a_6,t_1 !sqr_add_c2(a,6,5,c3,c1,c2); | ||
| 1402 | addcc c_12,t_1,c_12 | ||
| 1403 | bcs,a %xcc,.+8 | ||
| 1404 | add c_3,t_2,c_3 | ||
| 1405 | addcc c_12,t_1,t_1 | ||
| 1406 | bcs,a %xcc,.+8 | ||
| 1407 | add c_3,t_2,c_3 | ||
| 1408 | srlx t_1,32,c_12 | ||
| 1409 | stuw t_1,rp(11) !r[11]=c3; | ||
| 1410 | or c_12,c_3,c_12 | ||
| 1411 | |||
| 1412 | mulx a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3); | ||
| 1413 | addcc c_12,t_1,c_12 | ||
| 1414 | clr c_3 | ||
| 1415 | bcs,a %xcc,.+8 | ||
| 1416 | add c_3,t_2,c_3 | ||
| 1417 | addcc c_12,t_1,c_12 | ||
| 1418 | bcs,a %xcc,.+8 | ||
| 1419 | add c_3,t_2,c_3 | ||
| 1420 | mulx a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3); | ||
| 1421 | addcc c_12,t_1,t_1 | ||
| 1422 | bcs,a %xcc,.+8 | ||
| 1423 | add c_3,t_2,c_3 | ||
| 1424 | srlx t_1,32,c_12 | ||
| 1425 | stuw t_1,rp(12) !r[12]=c1; | ||
| 1426 | or c_12,c_3,c_12 | ||
| 1427 | |||
| 1428 | mulx a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1); | ||
| 1429 | addcc c_12,t_1,c_12 | ||
| 1430 | clr c_3 | ||
| 1431 | bcs,a %xcc,.+8 | ||
| 1432 | add c_3,t_2,c_3 | ||
| 1433 | addcc c_12,t_1,t_1 | ||
| 1434 | bcs,a %xcc,.+8 | ||
| 1435 | add c_3,t_2,c_3 | ||
| 1436 | srlx t_1,32,c_12 | ||
| 1437 | stuw t_1,rp(13) !r[13]=c2; | ||
| 1438 | or c_12,c_3,c_12 | ||
| 1439 | |||
| 1440 | mulx a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2); | ||
| 1441 | addcc c_12,t_1,t_1 | ||
| 1442 | srlx t_1,32,c_12 | ||
| 1443 | stuw t_1,rp(14) !r[14]=c3; | ||
| 1444 | stuw c_12,rp(15) !r[15]=c1; | ||
| 1445 | |||
| 1446 | ret | ||
| 1447 | restore %g0,%g0,%o0 | ||
| 1448 | |||
| 1449 | .type bn_sqr_comba8,#function | ||
| 1450 | .size bn_sqr_comba8,(.-bn_sqr_comba8) | ||
| 1451 | |||
| 1452 | .align 32 | ||
| 1453 | |||
| 1454 | .global bn_sqr_comba4 | ||
| 1455 | /* | ||
| 1456 | * void bn_sqr_comba4(r,a) | ||
| 1457 | * BN_ULONG *r,*a; | ||
| 1458 | */ | ||
| 1459 | bn_sqr_comba4: | ||
| 1460 | save %sp,FRAME_SIZE,%sp | ||
| 1461 | mov 1,t_2 | ||
| 1462 | lduw ap(0),a_0 | ||
| 1463 | sllx t_2,32,t_2 | ||
| 1464 | lduw ap(1),a_1 | ||
| 1465 | mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3); | ||
| 1466 | srlx t_1,32,c_12 | ||
| 1467 | stuw t_1,rp(0) !r[0]=c1; | ||
| 1468 | |||
| 1469 | lduw ap(2),a_2 | ||
| 1470 | mulx a_0,a_1,t_1 !sqr_add_c2(a,1,0,c2,c3,c1); | ||
| 1471 | addcc c_12,t_1,c_12 | ||
| 1472 | clr c_3 | ||
| 1473 | bcs,a %xcc,.+8 | ||
| 1474 | add c_3,t_2,c_3 | ||
| 1475 | addcc c_12,t_1,t_1 | ||
| 1476 | bcs,a %xcc,.+8 | ||
| 1477 | add c_3,t_2,c_3 | ||
| 1478 | srlx t_1,32,c_12 | ||
| 1479 | stuw t_1,rp(1) !r[1]=c2; | ||
| 1480 | or c_12,c_3,c_12 | ||
| 1481 | |||
| 1482 | mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2); | ||
| 1483 | addcc c_12,t_1,c_12 | ||
| 1484 | clr c_3 | ||
| 1485 | bcs,a %xcc,.+8 | ||
| 1486 | add c_3,t_2,c_3 | ||
| 1487 | addcc c_12,t_1,c_12 | ||
| 1488 | bcs,a %xcc,.+8 | ||
| 1489 | add c_3,t_2,c_3 | ||
| 1490 | lduw ap(3),a_3 | ||
| 1491 | mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2); | ||
| 1492 | addcc c_12,t_1,t_1 | ||
| 1493 | bcs,a %xcc,.+8 | ||
| 1494 | add c_3,t_2,c_3 | ||
| 1495 | srlx t_1,32,c_12 | ||
| 1496 | stuw t_1,rp(2) !r[2]=c3; | ||
| 1497 | or c_12,c_3,c_12 | ||
| 1498 | |||
| 1499 | mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3); | ||
| 1500 | addcc c_12,t_1,c_12 | ||
| 1501 | clr c_3 | ||
| 1502 | bcs,a %xcc,.+8 | ||
| 1503 | add c_3,t_2,c_3 | ||
| 1504 | addcc c_12,t_1,c_12 | ||
| 1505 | bcs,a %xcc,.+8 | ||
| 1506 | add c_3,t_2,c_3 | ||
| 1507 | mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3); | ||
| 1508 | addcc c_12,t_1,c_12 | ||
| 1509 | bcs,a %xcc,.+8 | ||
| 1510 | add c_3,t_2,c_3 | ||
| 1511 | addcc c_12,t_1,t_1 | ||
| 1512 | bcs,a %xcc,.+8 | ||
| 1513 | add c_3,t_2,c_3 | ||
| 1514 | srlx t_1,32,c_12 | ||
| 1515 | stuw t_1,rp(3) !r[3]=c1; | ||
| 1516 | or c_12,c_3,c_12 | ||
| 1517 | |||
| 1518 | mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1); | ||
| 1519 | addcc c_12,t_1,c_12 | ||
| 1520 | clr c_3 | ||
| 1521 | bcs,a %xcc,.+8 | ||
| 1522 | add c_3,t_2,c_3 | ||
| 1523 | addcc c_12,t_1,c_12 | ||
| 1524 | bcs,a %xcc,.+8 | ||
| 1525 | add c_3,t_2,c_3 | ||
| 1526 | mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1); | ||
| 1527 | addcc c_12,t_1,t_1 | ||
| 1528 | bcs,a %xcc,.+8 | ||
| 1529 | add c_3,t_2,c_3 | ||
| 1530 | srlx t_1,32,c_12 | ||
| 1531 | stuw t_1,rp(4) !r[4]=c2; | ||
| 1532 | or c_12,c_3,c_12 | ||
| 1533 | |||
| 1534 | mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2); | ||
| 1535 | addcc c_12,t_1,c_12 | ||
| 1536 | clr c_3 | ||
| 1537 | bcs,a %xcc,.+8 | ||
| 1538 | add c_3,t_2,c_3 | ||
| 1539 | addcc c_12,t_1,t_1 | ||
| 1540 | bcs,a %xcc,.+8 | ||
| 1541 | add c_3,t_2,c_3 | ||
| 1542 | srlx t_1,32,c_12 | ||
| 1543 | stuw t_1,rp(5) !r[5]=c3; | ||
| 1544 | or c_12,c_3,c_12 | ||
| 1545 | |||
| 1546 | mulx a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3); | ||
| 1547 | addcc c_12,t_1,t_1 | ||
| 1548 | srlx t_1,32,c_12 | ||
| 1549 | stuw t_1,rp(6) !r[6]=c1; | ||
| 1550 | stuw c_12,rp(7) !r[7]=c2; | ||
| 1551 | |||
| 1552 | ret | ||
| 1553 | restore %g0,%g0,%o0 | ||
| 1554 | |||
| 1555 | .type bn_sqr_comba4,#function | ||
| 1556 | .size bn_sqr_comba4,(.-bn_sqr_comba4) | ||
| 1557 | |||
| 1558 | .align 32 | ||
diff --git a/src/lib/libcrypto/bn/asm/sparcv9-mont.pl b/src/lib/libcrypto/bn/asm/sparcv9-mont.pl deleted file mode 100644 index 610ec1a968..0000000000 --- a/src/lib/libcrypto/bn/asm/sparcv9-mont.pl +++ /dev/null | |||
| @@ -1,604 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # December 2005 | ||
| 11 | # | ||
| 12 | # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons | ||
| 13 | # for the undertaken effort are multiple. First of all, UltraSPARC is | ||
| 14 | # not the whole SPARCv9 universe and other VIS-free implementations | ||
| 15 | # deserve optimized code just as much. Secondly, the newly introduced | ||
| 16 | # UltraSPARC T1, a.k.a. Niagara, has a shared FPU, and concurrent | ||
| 17 | # FPU-intensive paths, such as sparcv9a-mont, would simply sink it. | ||
| 18 | # Yes, T1 is equipped with several integrated RSA/DSA accelerator | ||
| 19 | # circuits accessible through a kernel driver [only(*)], but having a | ||
| 20 | # decent user-land software implementation is important too. Finally, | ||
| 21 | # there was the desire to experiment with a dedicated squaring | ||
| 22 | # procedure. Yes, this module implements one, because it was easiest | ||
| 23 | # to draft in SPARCv9 instructions... | ||
| 24 | |||
| 25 | # (*) An Engine accessing the driver in question is on my TODO list. | ||
| 26 | # For reference, the accelerator is estimated to give 6 to 10 times | ||
| 27 | # improvement on single-threaded RSA sign. It should be noted | ||
| 28 | # that a 6-10x improvement coefficient does not actually mean | ||
| 29 | # something extraordinary in terms of absolute [single-threaded] | ||
| 30 | # performance, as the SPARCv9 instruction set is by all means the | ||
| 31 | # least suitable for high-performance crypto among 64-bit | ||
| 32 | # platforms. The 6-10x factor simply places T1 in the same performance | ||
| 33 | # domain as, say, AMD64 and IA-64. The improvement on RSA verify | ||
| 34 | # doesn't appear impressive at all, but it's the sign operation | ||
| 35 | # which is far more critical/interesting. | ||
| 36 | |||
| 37 | # You might notice that the inner loops are modulo-scheduled:-) This | ||
| 38 | # has essentially negligible impact on UltraSPARC performance; it's | ||
| 39 | # Fujitsu SPARC64 V users who should notice and hopefully appreciate | ||
| 40 | # the advantage... Currently this module surpasses sparcv9a-mont.pl | ||
| 41 | # by ~20% on UltraSPARC-III and later cores, but recall that the | ||
| 42 | # sparcv9a module still has hidden potential [see the TODO list | ||
| 43 | # there], which is estimated to be larger than 20%... | ||
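For orientation before the register map: the routine below is a word-serial Montgomery multiplication. A minimal C sketch of the same algorithm (CIOS-style; the real code fuses both inner loops into the single .L1st/.Linner loop, and all names here are illustrative):

    #include <stdint.h>
    #include <string.h>

    static void
    mont_mul_sketch(uint32_t *rp, const uint32_t *ap, const uint32_t *bp,
        const uint32_t *np, uint32_t n0, int num, uint32_t *tp)
    {
        /* tp: caller-provided scratch of num+2 words; n0 = -np[0]^-1 mod 2^32 */
        memset(tp, 0, (num + 2) * sizeof(uint32_t));

        for (int i = 0; i < num; i++) {
            uint64_t c = 0;
            for (int j = 0; j < num; j++) {          /* tp += ap * bp[i] */
                c += (uint64_t)tp[j] + (uint64_t)ap[j] * bp[i];
                tp[j] = (uint32_t)c;
                c >>= 32;
            }
            c += tp[num];
            tp[num] = (uint32_t)c;
            tp[num + 1] = (uint32_t)(c >> 32);

            uint32_t m = tp[0] * n0;                 /* mod 2^32 */
            c = (tp[0] + (uint64_t)np[0] * m) >> 32; /* low word cancels */
            for (int j = 1; j < num; j++) {          /* tp = (tp + np*m) / 2^32 */
                c += (uint64_t)tp[j] + (uint64_t)np[j] * m;
                tp[j - 1] = (uint32_t)c;
                c >>= 32;
            }
            c += tp[num];
            tp[num - 1] = (uint32_t)c;
            tp[num] = tp[num + 1] + (uint32_t)(c >> 32);
        }

        /* conditional final subtraction, cf. the .Lsub/.Lcopy tail below */
        uint64_t b = 0;
        for (int j = 0; j < num; j++) {
            b = (uint64_t)tp[j] - np[j] - b;
            rp[j] = (uint32_t)b;
            b = (b >> 32) & 1;                       /* borrow out */
        }
        if (tp[num] < b)                             /* tp < np: keep tp as-is */
            memcpy(rp, tp, num * sizeof(uint32_t));
    }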
| 44 | |||
| 45 | # int bn_mul_mont( | ||
| 46 | $rp="%i0"; # BN_ULONG *rp, | ||
| 47 | $ap="%i1"; # const BN_ULONG *ap, | ||
| 48 | $bp="%i2"; # const BN_ULONG *bp, | ||
| 49 | $np="%i3"; # const BN_ULONG *np, | ||
| 50 | $n0="%i4"; # const BN_ULONG *n0, | ||
| 51 | $num="%i5"; # int num); | ||
| 52 | |||
| 53 | $bits=32; | ||
| 54 | for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } | ||
| 55 | if ($bits==64) { $bias=2047; $frame=192; } | ||
| 56 | else { $bias=0; $frame=128; } | ||
| 57 | |||
| 58 | $car0="%o0"; | ||
| 59 | $car1="%o1"; | ||
| 60 | $car2="%o2"; # 1 bit | ||
| 61 | $acc0="%o3"; | ||
| 62 | $acc1="%o4"; | ||
| 63 | $mask="%g1"; # 32 bits, what a waste... | ||
| 64 | $tmp0="%g4"; | ||
| 65 | $tmp1="%g5"; | ||
| 66 | |||
| 67 | $i="%l0"; | ||
| 68 | $j="%l1"; | ||
| 69 | $mul0="%l2"; | ||
| 70 | $mul1="%l3"; | ||
| 71 | $tp="%l4"; | ||
| 72 | $apj="%l5"; | ||
| 73 | $npj="%l6"; | ||
| 74 | $tpj="%l7"; | ||
| 75 | |||
| 76 | $fname="bn_mul_mont_int"; | ||
| 77 | |||
| 78 | $code=<<___; | ||
| 79 | .section ".text",#alloc,#execinstr | ||
| 80 | |||
| 81 | .global $fname | ||
| 82 | .align 32 | ||
| 83 | $fname: | ||
| 84 | cmp %o5,4 ! 128 bits minimum | ||
| 85 | bge,pt %icc,.Lenter | ||
| 86 | sethi %hi(0xffffffff),$mask | ||
| 87 | retl | ||
| 88 | clr %o0 | ||
| 89 | .align 32 | ||
| 90 | .Lenter: | ||
| 91 | save %sp,-$frame,%sp | ||
| 92 | sll $num,2,$num ! num*=4 | ||
| 93 | or $mask,%lo(0xffffffff),$mask | ||
| 94 | ld [$n0],$n0 | ||
| 95 | cmp $ap,$bp | ||
| 96 | and $num,$mask,$num | ||
| 97 | ld [$bp],$mul0 ! bp[0] | ||
| 98 | nop | ||
| 99 | |||
| 100 | add %sp,$bias,%o7 ! real top of stack | ||
| 101 | ld [$ap],$car0 ! ap[0] ! redundant in squaring context | ||
| 102 | sub %o7,$num,%o7 | ||
| 103 | ld [$ap+4],$apj ! ap[1] | ||
| 104 | and %o7,-1024,%o7 | ||
| 105 | ld [$np],$car1 ! np[0] | ||
| 106 | sub %o7,$bias,%sp ! alloca | ||
| 107 | ld [$np+4],$npj ! np[1] | ||
| 108 | be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont | ||
| 109 | mov 12,$j | ||
| 110 | |||
| 111 | mulx $car0,$mul0,$car0 ! ap[0]*bp[0] | ||
| 112 | mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0] | ||
| 113 | and $car0,$mask,$acc0 | ||
| 114 | add %sp,$bias+$frame,$tp | ||
| 115 | ld [$ap+8],$apj !prologue! | ||
| 116 | |||
| 117 | mulx $n0,$acc0,$mul1 ! "t[0]"*n0 | ||
| 118 | and $mul1,$mask,$mul1 | ||
| 119 | |||
| 120 | mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 | ||
| 121 | mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0 | ||
| 122 | srlx $car0,32,$car0 | ||
| 123 | add $acc0,$car1,$car1 | ||
| 124 | ld [$np+8],$npj !prologue! | ||
| 125 | srlx $car1,32,$car1 | ||
| 126 | mov $tmp0,$acc0 !prologue! | ||
| 127 | |||
| 128 | .L1st: | ||
| 129 | mulx $apj,$mul0,$tmp0 | ||
| 130 | mulx $npj,$mul1,$tmp1 | ||
| 131 | add $acc0,$car0,$car0 | ||
| 132 | ld [$ap+$j],$apj ! ap[j] | ||
| 133 | and $car0,$mask,$acc0 | ||
| 134 | add $acc1,$car1,$car1 | ||
| 135 | ld [$np+$j],$npj ! np[j] | ||
| 136 | srlx $car0,32,$car0 | ||
| 137 | add $acc0,$car1,$car1 | ||
| 138 | add $j,4,$j ! j++ | ||
| 139 | mov $tmp0,$acc0 | ||
| 140 | st $car1,[$tp] | ||
| 141 | cmp $j,$num | ||
| 142 | mov $tmp1,$acc1 | ||
| 143 | srlx $car1,32,$car1 | ||
| 144 | bl %icc,.L1st | ||
| 145 | add $tp,4,$tp ! tp++ | ||
| 146 | !.L1st | ||
| 147 | |||
| 148 | mulx $apj,$mul0,$tmp0 !epilogue! | ||
| 149 | mulx $npj,$mul1,$tmp1 | ||
| 150 | add $acc0,$car0,$car0 | ||
| 151 | and $car0,$mask,$acc0 | ||
| 152 | add $acc1,$car1,$car1 | ||
| 153 | srlx $car0,32,$car0 | ||
| 154 | add $acc0,$car1,$car1 | ||
| 155 | st $car1,[$tp] | ||
| 156 | srlx $car1,32,$car1 | ||
| 157 | |||
| 158 | add $tmp0,$car0,$car0 | ||
| 159 | and $car0,$mask,$acc0 | ||
| 160 | add $tmp1,$car1,$car1 | ||
| 161 | srlx $car0,32,$car0 | ||
| 162 | add $acc0,$car1,$car1 | ||
| 163 | st $car1,[$tp+4] | ||
| 164 | srlx $car1,32,$car1 | ||
| 165 | |||
| 166 | add $car0,$car1,$car1 | ||
| 167 | st $car1,[$tp+8] | ||
| 168 | srlx $car1,32,$car2 | ||
| 169 | |||
| 170 | mov 4,$i ! i++ | ||
| 171 | ld [$bp+4],$mul0 ! bp[1] | ||
| 172 | .Louter: | ||
| 173 | add %sp,$bias+$frame,$tp | ||
| 174 | ld [$ap],$car0 ! ap[0] | ||
| 175 | ld [$ap+4],$apj ! ap[1] | ||
| 176 | ld [$np],$car1 ! np[0] | ||
| 177 | ld [$np+4],$npj ! np[1] | ||
| 178 | ld [$tp],$tmp1 ! tp[0] | ||
| 179 | ld [$tp+4],$tpj ! tp[1] | ||
| 180 | mov 12,$j | ||
| 181 | |||
| 182 | mulx $car0,$mul0,$car0 | ||
| 183 | mulx $apj,$mul0,$tmp0 !prologue! | ||
| 184 | add $tmp1,$car0,$car0 | ||
| 185 | ld [$ap+8],$apj !prologue! | ||
| 186 | and $car0,$mask,$acc0 | ||
| 187 | |||
| 188 | mulx $n0,$acc0,$mul1 | ||
| 189 | and $mul1,$mask,$mul1 | ||
| 190 | |||
| 191 | mulx $car1,$mul1,$car1 | ||
| 192 | mulx $npj,$mul1,$acc1 !prologue! | ||
| 193 | srlx $car0,32,$car0 | ||
| 194 | add $acc0,$car1,$car1 | ||
| 195 | ld [$np+8],$npj !prologue! | ||
| 196 | srlx $car1,32,$car1 | ||
| 197 | mov $tmp0,$acc0 !prologue! | ||
| 198 | |||
| 199 | .Linner: | ||
| 200 | mulx $apj,$mul0,$tmp0 | ||
| 201 | mulx $npj,$mul1,$tmp1 | ||
| 202 | add $tpj,$car0,$car0 | ||
| 203 | ld [$ap+$j],$apj ! ap[j] | ||
| 204 | add $acc0,$car0,$car0 | ||
| 205 | add $acc1,$car1,$car1 | ||
| 206 | ld [$np+$j],$npj ! np[j] | ||
| 207 | and $car0,$mask,$acc0 | ||
| 208 | ld [$tp+8],$tpj ! tp[j] | ||
| 209 | srlx $car0,32,$car0 | ||
| 210 | add $acc0,$car1,$car1 | ||
| 211 | add $j,4,$j ! j++ | ||
| 212 | mov $tmp0,$acc0 | ||
| 213 | st $car1,[$tp] ! tp[j-1] | ||
| 214 | srlx $car1,32,$car1 | ||
| 215 | mov $tmp1,$acc1 | ||
| 216 | cmp $j,$num | ||
| 217 | bl %icc,.Linner | ||
| 218 | add $tp,4,$tp ! tp++ | ||
| 219 | !.Linner | ||
| 220 | |||
| 221 | mulx $apj,$mul0,$tmp0 !epilogue! | ||
| 222 | mulx $npj,$mul1,$tmp1 | ||
| 223 | add $tpj,$car0,$car0 | ||
| 224 | add $acc0,$car0,$car0 | ||
| 225 | ld [$tp+8],$tpj ! tp[j] | ||
| 226 | and $car0,$mask,$acc0 | ||
| 227 | add $acc1,$car1,$car1 | ||
| 228 | srlx $car0,32,$car0 | ||
| 229 | add $acc0,$car1,$car1 | ||
| 230 | st $car1,[$tp] ! tp[j-1] | ||
| 231 | srlx $car1,32,$car1 | ||
| 232 | |||
| 233 | add $tpj,$car0,$car0 | ||
| 234 | add $tmp0,$car0,$car0 | ||
| 235 | and $car0,$mask,$acc0 | ||
| 236 | add $tmp1,$car1,$car1 | ||
| 237 | add $acc0,$car1,$car1 | ||
| 238 | st $car1,[$tp+4] ! tp[j-1] | ||
| 239 | srlx $car0,32,$car0 | ||
| 240 | add $i,4,$i ! i++ | ||
| 241 | srlx $car1,32,$car1 | ||
| 242 | |||
| 243 | add $car0,$car1,$car1 | ||
| 244 | cmp $i,$num | ||
| 245 | add $car2,$car1,$car1 | ||
| 246 | st $car1,[$tp+8] | ||
| 247 | |||
| 248 | srlx $car1,32,$car2 | ||
| 249 | bl,a %icc,.Louter | ||
| 250 | ld [$bp+$i],$mul0 ! bp[i] | ||
| 251 | !.Louter | ||
| 252 | |||
| 253 | add $tp,12,$tp | ||
| 254 | |||
| 255 | .Ltail: | ||
| 256 | add $np,$num,$np | ||
| 257 | add $rp,$num,$rp | ||
| 258 | mov $tp,$ap | ||
| 259 | sub %g0,$num,%o7 ! k=-num | ||
| 260 | ba .Lsub | ||
| 261 | subcc %g0,%g0,%g0 ! clear %icc.c | ||
| 262 | .align 16 | ||
| 263 | .Lsub: | ||
| 264 | ld [$tp+%o7],%o0 | ||
| 265 | ld [$np+%o7],%o1 | ||
| 266 | subccc %o0,%o1,%o1 ! tp[j]-np[j] | ||
| 267 | add $rp,%o7,$i | ||
| 268 | add %o7,4,%o7 | ||
| 269 | brnz %o7,.Lsub | ||
| 270 | st %o1,[$i] | ||
| 271 | subc $car2,0,$car2 ! handle upmost overflow bit | ||
| 272 | and $tp,$car2,$ap | ||
| 273 | andn $rp,$car2,$np | ||
| 274 | or $ap,$np,$ap | ||
| 275 | sub %g0,$num,%o7 | ||
| 276 | |||
| 277 | .Lcopy: | ||
| 278 | ld [$ap+%o7],%o0 ! copy or in-place refresh | ||
| 279 | st %g0,[$tp+%o7] ! zap tp | ||
| 280 | st %o0,[$rp+%o7] | ||
| 281 | add %o7,4,%o7 | ||
| 282 | brnz %o7,.Lcopy | ||
| 283 | nop | ||
| 284 | mov 1,%i0 | ||
| 285 | ret | ||
| 286 | restore | ||
| 287 | ___ | ||
| 288 | |||
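The .Lsub/.Lcopy tail above avoids a data-dependent branch when deciding whether the subtraction of np should stick: subc folds the final borrow into the saved top carry word, producing an all-ones mask exactly when tp turned out to be smaller than np, and and/andn/or then pick the source pointer. The same idiom in C (a sketch; the function name is mine):

    #include <stdint.h>

    /* mask is ~0 when the subtraction borrowed (result must be discarded,
     * read back from tp) and 0 when it did not (read from rp). */
    static const uint32_t *
    select_src(const uint32_t *tp, const uint32_t *rp, uintptr_t mask)
    {
        return (const uint32_t *)(((uintptr_t)tp & mask) |
            ((uintptr_t)rp & ~mask));
    }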
| 289 | ######## | ||
| 290 | ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over | ||
| 291 | ######## code without the dedicated squaring procedure that follows. | ||
| 292 | ######## | ||
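The squaring path below halves the multiplication count by computing each cross product ap[i]*ap[j] once and doubling it on the fly; the bit shifted out of each doubling is kept in $sbit and fed into the next one. That idiom, sketched in C (illustrative name):

    #include <stdint.h>

    /* Double a 32-bit partial result, carrying the bit shifted out of the
     * previous doubling in *sbit -- cf. the add/or/srlx/and sequence in
     * .Lsqr_1st below. */
    static uint32_t
    double_with_sbit(uint32_t acc, uint32_t *sbit)
    {
        uint64_t t = ((uint64_t)acc << 1) | *sbit;
        *sbit = (uint32_t)(t >> 32);  /* bit shifted out, 0 or 1 */
        return (uint32_t)t;           /* doubled low word */
    }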
| 293 | $sbit="%i2"; # re-use $bp! | ||
| 294 | |||
| 295 | $code.=<<___; | ||
| 296 | .align 32 | ||
| 297 | .Lbn_sqr_mont: | ||
| 298 | mulx $mul0,$mul0,$car0 ! ap[0]*ap[0] | ||
| 299 | mulx $apj,$mul0,$tmp0 !prologue! | ||
| 300 | and $car0,$mask,$acc0 | ||
| 301 | add %sp,$bias+$frame,$tp | ||
| 302 | ld [$ap+8],$apj !prologue! | ||
| 303 | |||
| 304 | mulx $n0,$acc0,$mul1 ! "t[0]"*n0 | ||
| 305 | srlx $car0,32,$car0 | ||
| 306 | and $mul1,$mask,$mul1 | ||
| 307 | |||
| 308 | mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 | ||
| 309 | mulx $npj,$mul1,$acc1 !prologue! | ||
| 310 | and $car0,1,$sbit | ||
| 311 | ld [$np+8],$npj !prologue! | ||
| 312 | srlx $car0,1,$car0 | ||
| 313 | add $acc0,$car1,$car1 | ||
| 314 | srlx $car1,32,$car1 | ||
| 315 | mov $tmp0,$acc0 !prologue! | ||
| 316 | |||
| 317 | .Lsqr_1st: | ||
| 318 | mulx $apj,$mul0,$tmp0 | ||
| 319 | mulx $npj,$mul1,$tmp1 | ||
| 320 | add $acc0,$car0,$car0 ! ap[j]*a0+c0 | ||
| 321 | add $acc1,$car1,$car1 | ||
| 322 | ld [$ap+$j],$apj ! ap[j] | ||
| 323 | and $car0,$mask,$acc0 | ||
| 324 | ld [$np+$j],$npj ! np[j] | ||
| 325 | srlx $car0,32,$car0 | ||
| 326 | add $acc0,$acc0,$acc0 | ||
| 327 | or $sbit,$acc0,$acc0 | ||
| 328 | mov $tmp1,$acc1 | ||
| 329 | srlx $acc0,32,$sbit | ||
| 330 | add $j,4,$j ! j++ | ||
| 331 | and $acc0,$mask,$acc0 | ||
| 332 | cmp $j,$num | ||
| 333 | add $acc0,$car1,$car1 | ||
| 334 | st $car1,[$tp] | ||
| 335 | mov $tmp0,$acc0 | ||
| 336 | srlx $car1,32,$car1 | ||
| 337 | bl %icc,.Lsqr_1st | ||
| 338 | add $tp,4,$tp ! tp++ | ||
| 339 | !.Lsqr_1st | ||
| 340 | |||
| 341 | mulx $apj,$mul0,$tmp0 ! epilogue | ||
| 342 | mulx $npj,$mul1,$tmp1 | ||
| 343 | add $acc0,$car0,$car0 ! ap[j]*a0+c0 | ||
| 344 | add $acc1,$car1,$car1 | ||
| 345 | and $car0,$mask,$acc0 | ||
| 346 | srlx $car0,32,$car0 | ||
| 347 | add $acc0,$acc0,$acc0 | ||
| 348 | or $sbit,$acc0,$acc0 | ||
| 349 | srlx $acc0,32,$sbit | ||
| 350 | and $acc0,$mask,$acc0 | ||
| 351 | add $acc0,$car1,$car1 | ||
| 352 | st $car1,[$tp] | ||
| 353 | srlx $car1,32,$car1 | ||
| 354 | |||
| 355 | add $tmp0,$car0,$car0 ! ap[j]*a0+c0 | ||
| 356 | add $tmp1,$car1,$car1 | ||
| 357 | and $car0,$mask,$acc0 | ||
| 358 | srlx $car0,32,$car0 | ||
| 359 | add $acc0,$acc0,$acc0 | ||
| 360 | or $sbit,$acc0,$acc0 | ||
| 361 | srlx $acc0,32,$sbit | ||
| 362 | and $acc0,$mask,$acc0 | ||
| 363 | add $acc0,$car1,$car1 | ||
| 364 | st $car1,[$tp+4] | ||
| 365 | srlx $car1,32,$car1 | ||
| 366 | |||
| 367 | add $car0,$car0,$car0 | ||
| 368 | or $sbit,$car0,$car0 | ||
| 369 | add $car0,$car1,$car1 | ||
| 370 | st $car1,[$tp+8] | ||
| 371 | srlx $car1,32,$car2 | ||
| 372 | |||
| 373 | ld [%sp+$bias+$frame],$tmp0 ! tp[0] | ||
| 374 | ld [%sp+$bias+$frame+4],$tmp1 ! tp[1] | ||
| 375 | ld [%sp+$bias+$frame+8],$tpj ! tp[2] | ||
| 376 | ld [$ap+4],$mul0 ! ap[1] | ||
| 377 | ld [$ap+8],$apj ! ap[2] | ||
| 378 | ld [$np],$car1 ! np[0] | ||
| 379 | ld [$np+4],$npj ! np[1] | ||
| 380 | mulx $n0,$tmp0,$mul1 | ||
| 381 | |||
| 382 | mulx $mul0,$mul0,$car0 | ||
| 383 | and $mul1,$mask,$mul1 | ||
| 384 | |||
| 385 | mulx $car1,$mul1,$car1 | ||
| 386 | mulx $npj,$mul1,$acc1 | ||
| 387 | add $tmp0,$car1,$car1 | ||
| 388 | and $car0,$mask,$acc0 | ||
| 389 | ld [$np+8],$npj ! np[2] | ||
| 390 | srlx $car1,32,$car1 | ||
| 391 | add $tmp1,$car1,$car1 | ||
| 392 | srlx $car0,32,$car0 | ||
| 393 | add $acc0,$car1,$car1 | ||
| 394 | and $car0,1,$sbit | ||
| 395 | add $acc1,$car1,$car1 | ||
| 396 | srlx $car0,1,$car0 | ||
| 397 | mov 12,$j | ||
| 398 | st $car1,[%sp+$bias+$frame] ! tp[0]= | ||
| 399 | srlx $car1,32,$car1 | ||
| 400 | add %sp,$bias+$frame+4,$tp | ||
| 401 | |||
| 402 | .Lsqr_2nd: | ||
| 403 | mulx $apj,$mul0,$acc0 | ||
| 404 | mulx $npj,$mul1,$acc1 | ||
| 405 | add $acc0,$car0,$car0 | ||
| 406 | add $tpj,$car1,$car1 | ||
| 407 | ld [$ap+$j],$apj ! ap[j] | ||
| 408 | and $car0,$mask,$acc0 | ||
| 409 | ld [$np+$j],$npj ! np[j] | ||
| 410 | srlx $car0,32,$car0 | ||
| 411 | add $acc1,$car1,$car1 | ||
| 412 | ld [$tp+8],$tpj ! tp[j] | ||
| 413 | add $acc0,$acc0,$acc0 | ||
| 414 | add $j,4,$j ! j++ | ||
| 415 | or $sbit,$acc0,$acc0 | ||
| 416 | srlx $acc0,32,$sbit | ||
| 417 | and $acc0,$mask,$acc0 | ||
| 418 | cmp $j,$num | ||
| 419 | add $acc0,$car1,$car1 | ||
| 420 | st $car1,[$tp] ! tp[j-1] | ||
| 421 | srlx $car1,32,$car1 | ||
| 422 | bl %icc,.Lsqr_2nd | ||
| 423 | add $tp,4,$tp ! tp++ | ||
| 424 | !.Lsqr_2nd | ||
| 425 | |||
| 426 | mulx $apj,$mul0,$acc0 | ||
| 427 | mulx $npj,$mul1,$acc1 | ||
| 428 | add $acc0,$car0,$car0 | ||
| 429 | add $tpj,$car1,$car1 | ||
| 430 | and $car0,$mask,$acc0 | ||
| 431 | srlx $car0,32,$car0 | ||
| 432 | add $acc1,$car1,$car1 | ||
| 433 | add $acc0,$acc0,$acc0 | ||
| 434 | or $sbit,$acc0,$acc0 | ||
| 435 | srlx $acc0,32,$sbit | ||
| 436 | and $acc0,$mask,$acc0 | ||
| 437 | add $acc0,$car1,$car1 | ||
| 438 | st $car1,[$tp] ! tp[j-1] | ||
| 439 | srlx $car1,32,$car1 | ||
| 440 | |||
| 441 | add $car0,$car0,$car0 | ||
| 442 | or $sbit,$car0,$car0 | ||
| 443 | add $car0,$car1,$car1 | ||
| 444 | add $car2,$car1,$car1 | ||
| 445 | st $car1,[$tp+4] | ||
| 446 | srlx $car1,32,$car2 | ||
| 447 | |||
| 448 | ld [%sp+$bias+$frame],$tmp1 ! tp[0] | ||
| 449 | ld [%sp+$bias+$frame+4],$tpj ! tp[1] | ||
| 450 | ld [$ap+8],$mul0 ! ap[2] | ||
| 451 | ld [$np],$car1 ! np[0] | ||
| 452 | ld [$np+4],$npj ! np[1] | ||
| 453 | mulx $n0,$tmp1,$mul1 | ||
| 454 | and $mul1,$mask,$mul1 | ||
| 455 | mov 8,$i | ||
| 456 | |||
| 457 | mulx $mul0,$mul0,$car0 | ||
| 458 | mulx $car1,$mul1,$car1 | ||
| 459 | and $car0,$mask,$acc0 | ||
| 460 | add $tmp1,$car1,$car1 | ||
| 461 | srlx $car0,32,$car0 | ||
| 462 | add %sp,$bias+$frame,$tp | ||
| 463 | srlx $car1,32,$car1 | ||
| 464 | and $car0,1,$sbit | ||
| 465 | srlx $car0,1,$car0 | ||
| 466 | mov 4,$j | ||
| 467 | |||
| 468 | .Lsqr_outer: | ||
| 469 | .Lsqr_inner1: | ||
| 470 | mulx $npj,$mul1,$acc1 | ||
| 471 | add $tpj,$car1,$car1 | ||
| 472 | add $j,4,$j | ||
| 473 | ld [$tp+8],$tpj | ||
| 474 | cmp $j,$i | ||
| 475 | add $acc1,$car1,$car1 | ||
| 476 | ld [$np+$j],$npj | ||
| 477 | st $car1,[$tp] | ||
| 478 | srlx $car1,32,$car1 | ||
| 479 | bl %icc,.Lsqr_inner1 | ||
| 480 | add $tp,4,$tp | ||
| 481 | !.Lsqr_inner1 | ||
| 482 | |||
| 483 | add $j,4,$j | ||
| 484 | ld [$ap+$j],$apj ! ap[j] | ||
| 485 | mulx $npj,$mul1,$acc1 | ||
| 486 | add $tpj,$car1,$car1 | ||
| 487 | ld [$np+$j],$npj ! np[j] | ||
| 488 | add $acc0,$car1,$car1 | ||
| 489 | ld [$tp+8],$tpj ! tp[j] | ||
| 490 | add $acc1,$car1,$car1 | ||
| 491 | st $car1,[$tp] | ||
| 492 | srlx $car1,32,$car1 | ||
| 493 | |||
| 494 | add $j,4,$j | ||
| 495 | cmp $j,$num | ||
| 496 | be,pn %icc,.Lsqr_no_inner2 | ||
| 497 | add $tp,4,$tp | ||
| 498 | |||
| 499 | .Lsqr_inner2: | ||
| 500 | mulx $apj,$mul0,$acc0 | ||
| 501 | mulx $npj,$mul1,$acc1 | ||
| 502 | add $tpj,$car1,$car1 | ||
| 503 | add $acc0,$car0,$car0 | ||
| 504 | ld [$ap+$j],$apj ! ap[j] | ||
| 505 | and $car0,$mask,$acc0 | ||
| 506 | ld [$np+$j],$npj ! np[j] | ||
| 507 | srlx $car0,32,$car0 | ||
| 508 | add $acc0,$acc0,$acc0 | ||
| 509 | ld [$tp+8],$tpj ! tp[j] | ||
| 510 | or $sbit,$acc0,$acc0 | ||
| 511 | add $j,4,$j ! j++ | ||
| 512 | srlx $acc0,32,$sbit | ||
| 513 | and $acc0,$mask,$acc0 | ||
| 514 | cmp $j,$num | ||
| 515 | add $acc0,$car1,$car1 | ||
| 516 | add $acc1,$car1,$car1 | ||
| 517 | st $car1,[$tp] ! tp[j-1] | ||
| 518 | srlx $car1,32,$car1 | ||
| 519 | bl %icc,.Lsqr_inner2 | ||
| 520 | add $tp,4,$tp ! tp++ | ||
| 521 | |||
| 522 | .Lsqr_no_inner2: | ||
| 523 | mulx $apj,$mul0,$acc0 | ||
| 524 | mulx $npj,$mul1,$acc1 | ||
| 525 | add $tpj,$car1,$car1 | ||
| 526 | add $acc0,$car0,$car0 | ||
| 527 | and $car0,$mask,$acc0 | ||
| 528 | srlx $car0,32,$car0 | ||
| 529 | add $acc0,$acc0,$acc0 | ||
| 530 | or $sbit,$acc0,$acc0 | ||
| 531 | srlx $acc0,32,$sbit | ||
| 532 | and $acc0,$mask,$acc0 | ||
| 533 | add $acc0,$car1,$car1 | ||
| 534 | add $acc1,$car1,$car1 | ||
| 535 | st $car1,[$tp] ! tp[j-1] | ||
| 536 | srlx $car1,32,$car1 | ||
| 537 | |||
| 538 | add $car0,$car0,$car0 | ||
| 539 | or $sbit,$car0,$car0 | ||
| 540 | add $car0,$car1,$car1 | ||
| 541 | add $car2,$car1,$car1 | ||
| 542 | st $car1,[$tp+4] | ||
| 543 | srlx $car1,32,$car2 | ||
| 544 | |||
| 545 | add $i,4,$i ! i++ | ||
| 546 | ld [%sp+$bias+$frame],$tmp1 ! tp[0] | ||
| 547 | ld [%sp+$bias+$frame+4],$tpj ! tp[1] | ||
| 548 | ld [$ap+$i],$mul0 ! ap[j] | ||
| 549 | ld [$np],$car1 ! np[0] | ||
| 550 | ld [$np+4],$npj ! np[1] | ||
| 551 | mulx $n0,$tmp1,$mul1 | ||
| 552 | and $mul1,$mask,$mul1 | ||
| 553 | add $i,4,$tmp0 | ||
| 554 | |||
| 555 | mulx $mul0,$mul0,$car0 | ||
| 556 | mulx $car1,$mul1,$car1 | ||
| 557 | and $car0,$mask,$acc0 | ||
| 558 | add $tmp1,$car1,$car1 | ||
| 559 | srlx $car0,32,$car0 | ||
| 560 | add %sp,$bias+$frame,$tp | ||
| 561 | srlx $car1,32,$car1 | ||
| 562 | and $car0,1,$sbit | ||
| 563 | srlx $car0,1,$car0 | ||
| 564 | |||
| 565 | cmp $tmp0,$num ! i<num-1 | ||
| 566 | bl %icc,.Lsqr_outer | ||
| 567 | mov 4,$j | ||
| 568 | |||
| 569 | .Lsqr_last: | ||
| 570 | mulx $npj,$mul1,$acc1 | ||
| 571 | add $tpj,$car1,$car1 | ||
| 572 | add $j,4,$j | ||
| 573 | ld [$tp+8],$tpj | ||
| 574 | cmp $j,$i | ||
| 575 | add $acc1,$car1,$car1 | ||
| 576 | ld [$np+$j],$npj | ||
| 577 | st $car1,[$tp] | ||
| 578 | srlx $car1,32,$car1 | ||
| 579 | bl %icc,.Lsqr_last | ||
| 580 | add $tp,4,$tp | ||
| 581 | !.Lsqr_last | ||
| 582 | |||
| 583 | mulx $npj,$mul1,$acc1 | ||
| 584 | add $tpj,$car1,$car1 | ||
| 585 | add $acc0,$car1,$car1 | ||
| 586 | add $acc1,$car1,$car1 | ||
| 587 | st $car1,[$tp] | ||
| 588 | srlx $car1,32,$car1 | ||
| 589 | |||
| 590 | add $car0,$car0,$car0 ! recover $car0 | ||
| 591 | or $sbit,$car0,$car0 | ||
| 592 | add $car0,$car1,$car1 | ||
| 593 | add $car2,$car1,$car1 | ||
| 594 | st $car1,[$tp+4] | ||
| 595 | srlx $car1,32,$car2 | ||
| 596 | |||
| 597 | ba .Ltail | ||
| 598 | add $tp,8,$tp | ||
| 599 | .type $fname,#function | ||
| 600 | .size $fname,(.-$fname) | ||
| 601 | ___ | ||
| 602 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
| 603 | print $code; | ||
| 604 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl b/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl deleted file mode 100755 index 7bb1725a0e..0000000000 --- a/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl +++ /dev/null | |||
| @@ -1,880 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # October 2005 | ||
| 11 | # | ||
| 12 | # "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU? | ||
| 13 | # Because unlike integer multiplier, which simply stalls whole CPU, | ||
| 14 | # FPU is fully pipelined and can effectively emit 48 bit partial | ||
| 15 | # product every cycle. Why not blended SPARC v9? One can argue that | ||
| 16 | # making this module dependent on UltraSPARC VIS extension limits its | ||
| 17 | # binary compatibility. Well yes, it does exclude SPARC64 prior-V(!) | ||
| 18 | # implementations from compatibility matrix. But the rest, whole Sun | ||
| 19 | # UltraSPARC family and brand new Fujitsu's SPARC64 V, all support | ||
| 20 | # VIS extension instructions used in this module. This is considered | ||
| 21 | # good enough to not care about HAL SPARC64 users [if any] who have | ||
| 22 | # integer-only pure SPARCv9 module to "fall down" to. | ||
| 23 | |||
| 24 | # USI&II cores currently exhibit uniform 2x improvement [over pre- | ||
| 25 | # bn_mul_mont codebase] for all key lengths and benchmarks. On USIII | ||
| 26 | # performance improves few percents for shorter keys and worsens few | ||
| 27 | # percents for longer keys. This is because USIII integer multiplier | ||
| 28 | # is >3x faster than USI&II one, which is harder to match [but see | ||
| 29 | # TODO list below]. It should also be noted that SPARC64 V features | ||
| 30 | # out-of-order execution, which *might* mean that integer multiplier | ||
| 31 | # is pipelined, which in turn *might* be impossible to match... On | ||
| 32 | # additional note, SPARC64 V implements FP Multiply-Add instruction, | ||
| 33 | # which is perfectly usable in this context... In other words, as far | ||
| 34 | # as Fujitsu SPARC64 V goes, talk to the author:-) | ||
| 35 | |||
| 36 | # The implementation imposes the following "non-natural" limitations | ||
| 37 | # on input arguments: | ||
| 38 | # - num may not be less than 4; | ||
| 39 | # - num has to be even; | ||
| 40 | # Failure to meet either condition has no fatal effects; the code | ||
| 41 | # simply doesn't give any performance gain. | ||
| 42 | |||
| 43 | # TODO: | ||
| 44 | # - modulo-schedule inner loop for better performance (on an in-order | ||
| 45 | # execution core such as UltraSPARC this shall result in a further | ||
| 46 | # noticeable(!) improvement); | ||
| 47 | # - dedicated squaring procedure[?]; | ||
| 48 | |||
| 49 | ###################################################################### | ||
| 50 | # November 2006 | ||
| 51 | # | ||
| 52 | # Modulo-scheduled inner loops make it possible to interleave floating | ||
| 53 | # point and integer instructions and minimize Read-After-Write | ||
| 54 | # penalties. This results in a *further* 20-50% performance improvement | ||
| 55 | # [depending on key length, more for longer keys] on USI&II cores and | ||
| 56 | # 30-80% on USIII&IV. | ||
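The arithmetic trick behind the FPU path: a double carries a 53-bit mantissa, so products of 16-bit digits (at most 32 bits) and short sums of them are exact. Each 32-bit word is split into two 16-bit digits, multiplied with fmuld and accumulated with faddd, all without rounding. A minimal C sketch of one such exact 32x32 multiply (the module of course keeps everything in FP registers and converts with fxtod/fdtox; this function is illustrative):

    #include <stdint.h>

    static uint64_t
    mul32_via_doubles(uint32_t a, uint32_t b)
    {
        double a_lo = (double)(a & 0xffff), a_hi = (double)(a >> 16);
        double b_lo = (double)(b & 0xffff), b_hi = (double)(b >> 16);

        /* each partial product is < 2^32, hence exact in a double */
        uint64_t ll = (uint64_t)(a_lo * b_lo);
        uint64_t lh = (uint64_t)(a_lo * b_hi);
        uint64_t hl = (uint64_t)(a_hi * b_lo);
        uint64_t hh = (uint64_t)(a_hi * b_hi);

        return ll + ((lh + hl) << 16) + (hh << 32);
    }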
| 57 | |||
| 58 | $fname="bn_mul_mont_fpu"; | ||
| 59 | $bits=32; | ||
| 60 | for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } | ||
| 61 | |||
| 62 | if ($bits==64) { | ||
| 63 | $bias=2047; | ||
| 64 | $frame=192; | ||
| 65 | } else { | ||
| 66 | $bias=0; | ||
| 67 | $frame=128; # 96 rounded up to largest known cache-line | ||
| 68 | } | ||
| 69 | $locals=64; | ||
| 70 | |||
| 71 | # In order to provide for 32-/64-bit ABI duality, I keep integers wider | ||
| 72 | # than 32 bits in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used | ||
| 73 | # exclusively for pointers, indexes and other small values... | ||
| 74 | # int bn_mul_mont( | ||
| 75 | $rp="%i0"; # BN_ULONG *rp, | ||
| 76 | $ap="%i1"; # const BN_ULONG *ap, | ||
| 77 | $bp="%i2"; # const BN_ULONG *bp, | ||
| 78 | $np="%i3"; # const BN_ULONG *np, | ||
| 79 | $n0="%i4"; # const BN_ULONG *n0, | ||
| 80 | $num="%i5"; # int num); | ||
| 81 | |||
| 82 | $tp="%l0"; # t[num] | ||
| 83 | $ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved | ||
| 84 | $ap_h="%l2"; # to these four vectors as double-precision FP values. | ||
| 85 | $np_l="%l3"; # This way a bunch of fxtods are eliminated in second | ||
| 86 | $np_h="%l4"; # loop and L1-cache aliasing is minimized... | ||
| 87 | $i="%l5"; | ||
| 88 | $j="%l6"; | ||
| 89 | $mask="%l7"; # 16-bit mask, 0xffff | ||
| 90 | |||
| 91 | $n0="%g4"; # reassigned(!) to "64-bit" register | ||
| 92 | $carry="%i4"; # %i4 reused(!) for a carry bit | ||
| 93 | |||
| 94 | # FP register naming chart | ||
| 95 | # | ||
| 96 | # ..HILO | ||
| 97 | # dcba | ||
| 98 | # -------- | ||
| 99 | # LOa | ||
| 100 | # LOb | ||
| 101 | # LOc | ||
| 102 | # LOd | ||
| 103 | # HIa | ||
| 104 | # HIb | ||
| 105 | # HIc | ||
| 106 | # HId | ||
| 107 | # ..a | ||
| 108 | # ..b | ||
| 109 | $ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6"; | ||
| 110 | $na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14"; | ||
| 111 | $alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19"; | ||
| 112 | $nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23"; | ||
| 113 | |||
| 114 | $dota="%f24"; $dotb="%f26"; | ||
| 115 | |||
| 116 | $aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38"; | ||
| 117 | $ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46"; | ||
| 118 | $nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54"; | ||
| 119 | $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62"; | ||
| 120 | |||
| 121 | $ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load | ||
| 122 | |||
| 123 | $code=<<___; | ||
| 124 | .section ".text",#alloc,#execinstr | ||
| 125 | |||
| 126 | .global $fname | ||
| 127 | .align 32 | ||
| 128 | $fname: | ||
| 129 | save %sp,-$frame-$locals,%sp | ||
| 130 | |||
| 131 | cmp $num,4 | ||
| 132 | bl,a,pn %icc,.Lret | ||
| 133 | clr %i0 | ||
| 134 | andcc $num,1,%g0 ! $num has to be even... | ||
| 135 | bnz,a,pn %icc,.Lret | ||
| 136 | clr %i0 ! signal "unsupported input value" | ||
| 137 | |||
| 138 | srl $num,1,$num | ||
| 139 | sethi %hi(0xffff),$mask | ||
| 140 | ld [%i4+0],$n0 ! $n0 reassigned, remember? | ||
| 141 | or $mask,%lo(0xffff),$mask | ||
| 142 | ld [%i4+4],%o0 | ||
| 143 | sllx %o0,32,%o0 | ||
| 144 | or %o0,$n0,$n0 ! $n0=n0[1].n0[0] | ||
| 145 | |||
| 146 | sll $num,3,$num ! num*=8 | ||
| 147 | |||
| 148 | add %sp,$bias,%o0 ! real top of stack | ||
| 149 | sll $num,2,%o1 | ||
| 150 | add %o1,$num,%o1 ! %o1=num*5 | ||
| 151 | sub %o0,%o1,%o0 | ||
| 152 | and %o0,-2048,%o0 ! optimize TLB utilization | ||
| 153 | sub %o0,$bias,%sp ! alloca(5*num*8) | ||
| 154 | |||
| 155 | rd %asi,%o7 ! save %asi | ||
| 156 | add %sp,$bias+$frame+$locals,$tp | ||
| 157 | add $tp,$num,$ap_l | ||
| 158 | add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends ! | ||
| 159 | add $ap_l,$num,$ap_h | ||
| 160 | add $ap_h,$num,$np_l | ||
| 161 | add $np_l,$num,$np_h | ||
| 162 | |||
| 163 | wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads | ||
| 164 | |||
| 165 | add $rp,$num,$rp ! readjust input pointers to point | ||
| 166 | add $ap,$num,$ap ! at the ends too... | ||
| 167 | add $bp,$num,$bp | ||
| 168 | add $np,$num,$np | ||
| 169 | |||
| 170 | stx %o7,[%sp+$bias+$frame+48] ! save %asi | ||
| 171 | |||
| 172 | sub %g0,$num,$i ! i=-num | ||
| 173 | sub %g0,$num,$j ! j=-num | ||
| 174 | |||
| 175 | add $ap,$j,%o3 | ||
| 176 | add $bp,$i,%o4 | ||
| 177 | |||
| 178 | ld [%o3+4],%g1 ! bp[0] | ||
| 179 | ld [%o3+0],%o0 | ||
| 180 | ld [%o4+4],%g5 ! ap[0] | ||
| 181 | sllx %g1,32,%g1 | ||
| 182 | ld [%o4+0],%o1 | ||
| 183 | sllx %g5,32,%g5 | ||
| 184 | or %g1,%o0,%o0 | ||
| 185 | or %g5,%o1,%o1 | ||
| 186 | |||
| 187 | add $np,$j,%o5 | ||
| 188 | |||
| 189 | mulx %o1,%o0,%o0 ! ap[0]*bp[0] | ||
| 190 | mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0 | ||
| 191 | stx %o0,[%sp+$bias+$frame+0] | ||
| 192 | |||
| 193 | ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words | ||
| 194 | fzeros $alo | ||
| 195 | ld [%o3+4],$ahi_ | ||
| 196 | fzeros $ahi | ||
| 197 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words | ||
| 198 | fzeros $nlo | ||
| 199 | ld [%o5+4],$nhi_ | ||
| 200 | fzeros $nhi | ||
| 201 | |||
| 202 | ! transfer b[i] to FPU as 4x16-bit values | ||
| 203 | ldda [%o4+2]%asi,$ba | ||
| 204 | fxtod $alo,$alo | ||
| 205 | ldda [%o4+0]%asi,$bb | ||
| 206 | fxtod $ahi,$ahi | ||
| 207 | ldda [%o4+6]%asi,$bc | ||
| 208 | fxtod $nlo,$nlo | ||
| 209 | ldda [%o4+4]%asi,$bd | ||
| 210 | fxtod $nhi,$nhi | ||
| 211 | |||
| 212 | ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values | ||
| 213 | ldda [%sp+$bias+$frame+6]%asi,$na | ||
| 214 | fxtod $ba,$ba | ||
| 215 | ldda [%sp+$bias+$frame+4]%asi,$nb | ||
| 216 | fxtod $bb,$bb | ||
| 217 | ldda [%sp+$bias+$frame+2]%asi,$nc | ||
| 218 | fxtod $bc,$bc | ||
| 219 | ldda [%sp+$bias+$frame+0]%asi,$nd | ||
| 220 | fxtod $bd,$bd | ||
| 221 | |||
| 222 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | ||
| 223 | fxtod $na,$na | ||
| 224 | std $ahi,[$ap_h+$j] | ||
| 225 | fxtod $nb,$nb | ||
| 226 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | ||
| 227 | fxtod $nc,$nc | ||
| 228 | std $nhi,[$np_h+$j] | ||
| 229 | fxtod $nd,$nd | ||
| 230 | |||
| 231 | fmuld $alo,$ba,$aloa | ||
| 232 | fmuld $nlo,$na,$nloa | ||
| 233 | fmuld $alo,$bb,$alob | ||
| 234 | fmuld $nlo,$nb,$nlob | ||
| 235 | fmuld $alo,$bc,$aloc | ||
| 236 | faddd $aloa,$nloa,$nloa | ||
| 237 | fmuld $nlo,$nc,$nloc | ||
| 238 | fmuld $alo,$bd,$alod | ||
| 239 | faddd $alob,$nlob,$nlob | ||
| 240 | fmuld $nlo,$nd,$nlod | ||
| 241 | fmuld $ahi,$ba,$ahia | ||
| 242 | faddd $aloc,$nloc,$nloc | ||
| 243 | fmuld $nhi,$na,$nhia | ||
| 244 | fmuld $ahi,$bb,$ahib | ||
| 245 | faddd $alod,$nlod,$nlod | ||
| 246 | fmuld $nhi,$nb,$nhib | ||
| 247 | fmuld $ahi,$bc,$ahic | ||
| 248 | faddd $ahia,$nhia,$nhia | ||
| 249 | fmuld $nhi,$nc,$nhic | ||
| 250 | fmuld $ahi,$bd,$ahid | ||
| 251 | faddd $ahib,$nhib,$nhib | ||
| 252 | fmuld $nhi,$nd,$nhid | ||
| 253 | |||
| 254 | faddd $ahic,$nhic,$dota ! $nhic | ||
| 255 | faddd $ahid,$nhid,$dotb ! $nhid | ||
| 256 | |||
| 257 | faddd $nloc,$nhia,$nloc | ||
| 258 | faddd $nlod,$nhib,$nlod | ||
| 259 | |||
| 260 | fdtox $nloa,$nloa | ||
| 261 | fdtox $nlob,$nlob | ||
| 262 | fdtox $nloc,$nloc | ||
| 263 | fdtox $nlod,$nlod | ||
| 264 | |||
| 265 | std $nloa,[%sp+$bias+$frame+0] | ||
| 266 | add $j,8,$j | ||
| 267 | std $nlob,[%sp+$bias+$frame+8] | ||
| 268 | add $ap,$j,%o4 | ||
| 269 | std $nloc,[%sp+$bias+$frame+16] | ||
| 270 | add $np,$j,%o5 | ||
| 271 | std $nlod,[%sp+$bias+$frame+24] | ||
| 272 | |||
| 273 | ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words | ||
| 274 | fzeros $alo | ||
| 275 | ld [%o4+4],$ahi_ | ||
| 276 | fzeros $ahi | ||
| 277 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words | ||
| 278 | fzeros $nlo | ||
| 279 | ld [%o5+4],$nhi_ | ||
| 280 | fzeros $nhi | ||
| 281 | |||
| 282 | fxtod $alo,$alo | ||
| 283 | fxtod $ahi,$ahi | ||
| 284 | fxtod $nlo,$nlo | ||
| 285 | fxtod $nhi,$nhi | ||
| 286 | |||
| 287 | ldx [%sp+$bias+$frame+0],%o0 | ||
| 288 | fmuld $alo,$ba,$aloa | ||
| 289 | ldx [%sp+$bias+$frame+8],%o1 | ||
| 290 | fmuld $nlo,$na,$nloa | ||
| 291 | ldx [%sp+$bias+$frame+16],%o2 | ||
| 292 | fmuld $alo,$bb,$alob | ||
| 293 | ldx [%sp+$bias+$frame+24],%o3 | ||
| 294 | fmuld $nlo,$nb,$nlob | ||
| 295 | |||
| 296 | srlx %o0,16,%o7 | ||
| 297 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | ||
| 298 | fmuld $alo,$bc,$aloc | ||
| 299 | add %o7,%o1,%o1 | ||
| 300 | std $ahi,[$ap_h+$j] | ||
| 301 | faddd $aloa,$nloa,$nloa | ||
| 302 | fmuld $nlo,$nc,$nloc | ||
| 303 | srlx %o1,16,%o7 | ||
| 304 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | ||
| 305 | fmuld $alo,$bd,$alod | ||
| 306 | add %o7,%o2,%o2 | ||
| 307 | std $nhi,[$np_h+$j] | ||
| 308 | faddd $alob,$nlob,$nlob | ||
| 309 | fmuld $nlo,$nd,$nlod | ||
| 310 | srlx %o2,16,%o7 | ||
| 311 | fmuld $ahi,$ba,$ahia | ||
| 312 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
| 313 | faddd $aloc,$nloc,$nloc | ||
| 314 | fmuld $nhi,$na,$nhia | ||
| 315 | !and %o0,$mask,%o0 | ||
| 316 | !and %o1,$mask,%o1 | ||
| 317 | !and %o2,$mask,%o2 | ||
| 318 | !sllx %o1,16,%o1 | ||
| 319 | !sllx %o2,32,%o2 | ||
| 320 | !sllx %o3,48,%o7 | ||
| 321 | !or %o1,%o0,%o0 | ||
| 322 | !or %o2,%o0,%o0 | ||
| 323 | !or %o7,%o0,%o0 ! 64-bit result | ||
| 324 | srlx %o3,16,%g1 ! 34-bit carry | ||
| 325 | fmuld $ahi,$bb,$ahib | ||
| 326 | |||
| 327 | faddd $alod,$nlod,$nlod | ||
| 328 | fmuld $nhi,$nb,$nhib | ||
| 329 | fmuld $ahi,$bc,$ahic | ||
| 330 | faddd $ahia,$nhia,$nhia | ||
| 331 | fmuld $nhi,$nc,$nhic | ||
| 332 | fmuld $ahi,$bd,$ahid | ||
| 333 | faddd $ahib,$nhib,$nhib | ||
| 334 | fmuld $nhi,$nd,$nhid | ||
| 335 | |||
| 336 | faddd $dota,$nloa,$nloa | ||
| 337 | faddd $dotb,$nlob,$nlob | ||
| 338 | faddd $ahic,$nhic,$dota ! $nhic | ||
| 339 | faddd $ahid,$nhid,$dotb ! $nhid | ||
| 340 | |||
| 341 | faddd $nloc,$nhia,$nloc | ||
| 342 | faddd $nlod,$nhib,$nlod | ||
| 343 | |||
| 344 | fdtox $nloa,$nloa | ||
| 345 | fdtox $nlob,$nlob | ||
| 346 | fdtox $nloc,$nloc | ||
| 347 | fdtox $nlod,$nlod | ||
| 348 | |||
| 349 | std $nloa,[%sp+$bias+$frame+0] | ||
| 350 | std $nlob,[%sp+$bias+$frame+8] | ||
| 351 | addcc $j,8,$j | ||
| 352 | std $nloc,[%sp+$bias+$frame+16] | ||
| 353 | bz,pn %icc,.L1stskip | ||
| 354 | std $nlod,[%sp+$bias+$frame+24] | ||
| 355 | |||
| 356 | .align 32 ! incidentally already aligned ! | ||
| 357 | .L1st: | ||
| 358 | add $ap,$j,%o4 | ||
| 359 | add $np,$j,%o5 | ||
| 360 | ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words | ||
| 361 | fzeros $alo | ||
| 362 | ld [%o4+4],$ahi_ | ||
| 363 | fzeros $ahi | ||
| 364 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words | ||
| 365 | fzeros $nlo | ||
| 366 | ld [%o5+4],$nhi_ | ||
| 367 | fzeros $nhi | ||
| 368 | |||
| 369 | fxtod $alo,$alo | ||
| 370 | fxtod $ahi,$ahi | ||
| 371 | fxtod $nlo,$nlo | ||
| 372 | fxtod $nhi,$nhi | ||
| 373 | |||
| 374 | ldx [%sp+$bias+$frame+0],%o0 | ||
| 375 | fmuld $alo,$ba,$aloa | ||
| 376 | ldx [%sp+$bias+$frame+8],%o1 | ||
| 377 | fmuld $nlo,$na,$nloa | ||
| 378 | ldx [%sp+$bias+$frame+16],%o2 | ||
| 379 | fmuld $alo,$bb,$alob | ||
| 380 | ldx [%sp+$bias+$frame+24],%o3 | ||
| 381 | fmuld $nlo,$nb,$nlob | ||
| 382 | |||
| 383 | srlx %o0,16,%o7 | ||
| 384 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | ||
| 385 | fmuld $alo,$bc,$aloc | ||
| 386 | add %o7,%o1,%o1 | ||
| 387 | std $ahi,[$ap_h+$j] | ||
| 388 | faddd $aloa,$nloa,$nloa | ||
| 389 | fmuld $nlo,$nc,$nloc | ||
| 390 | srlx %o1,16,%o7 | ||
| 391 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | ||
| 392 | fmuld $alo,$bd,$alod | ||
| 393 | add %o7,%o2,%o2 | ||
| 394 | std $nhi,[$np_h+$j] | ||
| 395 | faddd $alob,$nlob,$nlob | ||
| 396 | fmuld $nlo,$nd,$nlod | ||
| 397 | srlx %o2,16,%o7 | ||
| 398 | fmuld $ahi,$ba,$ahia | ||
| 399 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
| 400 | and %o0,$mask,%o0 | ||
| 401 | faddd $aloc,$nloc,$nloc | ||
| 402 | fmuld $nhi,$na,$nhia | ||
| 403 | and %o1,$mask,%o1 | ||
| 404 | and %o2,$mask,%o2 | ||
| 405 | fmuld $ahi,$bb,$ahib | ||
| 406 | sllx %o1,16,%o1 | ||
| 407 | faddd $alod,$nlod,$nlod | ||
| 408 | fmuld $nhi,$nb,$nhib | ||
| 409 | sllx %o2,32,%o2 | ||
| 410 | fmuld $ahi,$bc,$ahic | ||
| 411 | sllx %o3,48,%o7 | ||
| 412 | or %o1,%o0,%o0 | ||
| 413 | faddd $ahia,$nhia,$nhia | ||
| 414 | fmuld $nhi,$nc,$nhic | ||
| 415 | or %o2,%o0,%o0 | ||
| 416 | fmuld $ahi,$bd,$ahid | ||
| 417 | or %o7,%o0,%o0 ! 64-bit result | ||
| 418 | faddd $ahib,$nhib,$nhib | ||
| 419 | fmuld $nhi,$nd,$nhid | ||
| 420 | addcc %g1,%o0,%o0 | ||
| 421 | faddd $dota,$nloa,$nloa | ||
| 422 | srlx %o3,16,%g1 ! 34-bit carry | ||
| 423 | faddd $dotb,$nlob,$nlob | ||
| 424 | bcs,a %xcc,.+8 | ||
| 425 | add %g1,1,%g1 | ||
| 426 | |||
| 427 | stx %o0,[$tp] ! tp[j-1]= | ||
| 428 | |||
| 429 | faddd $ahic,$nhic,$dota ! $nhic | ||
| 430 | faddd $ahid,$nhid,$dotb ! $nhid | ||
| 431 | |||
| 432 | faddd $nloc,$nhia,$nloc | ||
| 433 | faddd $nlod,$nhib,$nlod | ||
| 434 | |||
| 435 | fdtox $nloa,$nloa | ||
| 436 | fdtox $nlob,$nlob | ||
| 437 | fdtox $nloc,$nloc | ||
| 438 | fdtox $nlod,$nlod | ||
| 439 | |||
| 440 | std $nloa,[%sp+$bias+$frame+0] | ||
| 441 | std $nlob,[%sp+$bias+$frame+8] | ||
| 442 | std $nloc,[%sp+$bias+$frame+16] | ||
| 443 | std $nlod,[%sp+$bias+$frame+24] | ||
| 444 | |||
| 445 | addcc $j,8,$j | ||
| 446 | bnz,pt %icc,.L1st | ||
| 447 | add $tp,8,$tp | ||
| 448 | |||
| 449 | .L1stskip: | ||
| 450 | fdtox $dota,$dota | ||
| 451 | fdtox $dotb,$dotb | ||
| 452 | |||
| 453 | ldx [%sp+$bias+$frame+0],%o0 | ||
| 454 | ldx [%sp+$bias+$frame+8],%o1 | ||
| 455 | ldx [%sp+$bias+$frame+16],%o2 | ||
| 456 | ldx [%sp+$bias+$frame+24],%o3 | ||
| 457 | |||
| 458 | srlx %o0,16,%o7 | ||
| 459 | std $dota,[%sp+$bias+$frame+32] | ||
| 460 | add %o7,%o1,%o1 | ||
| 461 | std $dotb,[%sp+$bias+$frame+40] | ||
| 462 | srlx %o1,16,%o7 | ||
| 463 | add %o7,%o2,%o2 | ||
| 464 | srlx %o2,16,%o7 | ||
| 465 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
| 466 | and %o0,$mask,%o0 | ||
| 467 | and %o1,$mask,%o1 | ||
| 468 | and %o2,$mask,%o2 | ||
| 469 | sllx %o1,16,%o1 | ||
| 470 | sllx %o2,32,%o2 | ||
| 471 | sllx %o3,48,%o7 | ||
| 472 | or %o1,%o0,%o0 | ||
| 473 | or %o2,%o0,%o0 | ||
| 474 | or %o7,%o0,%o0 ! 64-bit result | ||
| 475 | ldx [%sp+$bias+$frame+32],%o4 | ||
| 476 | addcc %g1,%o0,%o0 | ||
| 477 | ldx [%sp+$bias+$frame+40],%o5 | ||
| 478 | srlx %o3,16,%g1 ! 34-bit carry | ||
| 479 | bcs,a %xcc,.+8 | ||
| 480 | add %g1,1,%g1 | ||
| 481 | |||
| 482 | stx %o0,[$tp] ! tp[j-1]= | ||
| 483 | add $tp,8,$tp | ||
| 484 | |||
| 485 | srlx %o4,16,%o7 | ||
| 486 | add %o7,%o5,%o5 | ||
| 487 | and %o4,$mask,%o4 | ||
| 488 | sllx %o5,16,%o7 | ||
| 489 | or %o7,%o4,%o4 | ||
| 490 | addcc %g1,%o4,%o4 | ||
| 491 | srlx %o5,48,%g1 | ||
| 492 | bcs,a %xcc,.+8 | ||
| 493 | add %g1,1,%g1 | ||
| 494 | |||
| 495 | mov %g1,$carry | ||
| 496 | stx %o4,[$tp] ! tp[num-1]= | ||
| 497 | |||
| 498 | ba .Louter | ||
| 499 | add $i,8,$i | ||
| 500 | .align 32 | ||
| 501 | .Louter: | ||
| 502 | sub %g0,$num,$j ! j=-num | ||
| 503 | add %sp,$bias+$frame+$locals,$tp | ||
| 504 | |||
| 505 | add $ap,$j,%o3 | ||
| 506 | add $bp,$i,%o4 | ||
| 507 | |||
| 508 | ld [%o3+4],%g1 ! bp[i] | ||
| 509 | ld [%o3+0],%o0 | ||
| 510 | ld [%o4+4],%g5 ! ap[0] | ||
| 511 | sllx %g1,32,%g1 | ||
| 512 | ld [%o4+0],%o1 | ||
| 513 | sllx %g5,32,%g5 | ||
| 514 | or %g1,%o0,%o0 | ||
| 515 | or %g5,%o1,%o1 | ||
| 516 | |||
| 517 | ldx [$tp],%o2 ! tp[0] | ||
| 518 | mulx %o1,%o0,%o0 | ||
| 519 | addcc %o2,%o0,%o0 | ||
| 520 | mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0 | ||
| 521 | stx %o0,[%sp+$bias+$frame+0] | ||
| 522 | |||
| 523 | ! transfer b[i] to FPU as 4x16-bit values | ||
| 524 | ldda [%o4+2]%asi,$ba | ||
| 525 | ldda [%o4+0]%asi,$bb | ||
| 526 | ldda [%o4+6]%asi,$bc | ||
| 527 | ldda [%o4+4]%asi,$bd | ||
| 528 | |||
| 529 | ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values | ||
| 530 | ldda [%sp+$bias+$frame+6]%asi,$na | ||
| 531 | fxtod $ba,$ba | ||
| 532 | ldda [%sp+$bias+$frame+4]%asi,$nb | ||
| 533 | fxtod $bb,$bb | ||
| 534 | ldda [%sp+$bias+$frame+2]%asi,$nc | ||
| 535 | fxtod $bc,$bc | ||
| 536 | ldda [%sp+$bias+$frame+0]%asi,$nd | ||
| 537 | fxtod $bd,$bd | ||
| 538 | ldd [$ap_l+$j],$alo ! load a[j] in double format | ||
| 539 | fxtod $na,$na | ||
| 540 | ldd [$ap_h+$j],$ahi | ||
| 541 | fxtod $nb,$nb | ||
| 542 | ldd [$np_l+$j],$nlo ! load n[j] in double format | ||
| 543 | fxtod $nc,$nc | ||
| 544 | ldd [$np_h+$j],$nhi | ||
| 545 | fxtod $nd,$nd | ||
| 546 | |||
| 547 | fmuld $alo,$ba,$aloa | ||
| 548 | fmuld $nlo,$na,$nloa | ||
| 549 | fmuld $alo,$bb,$alob | ||
| 550 | fmuld $nlo,$nb,$nlob | ||
| 551 | fmuld $alo,$bc,$aloc | ||
| 552 | faddd $aloa,$nloa,$nloa | ||
| 553 | fmuld $nlo,$nc,$nloc | ||
| 554 | fmuld $alo,$bd,$alod | ||
| 555 | faddd $alob,$nlob,$nlob | ||
| 556 | fmuld $nlo,$nd,$nlod | ||
| 557 | fmuld $ahi,$ba,$ahia | ||
| 558 | faddd $aloc,$nloc,$nloc | ||
| 559 | fmuld $nhi,$na,$nhia | ||
| 560 | fmuld $ahi,$bb,$ahib | ||
| 561 | faddd $alod,$nlod,$nlod | ||
| 562 | fmuld $nhi,$nb,$nhib | ||
| 563 | fmuld $ahi,$bc,$ahic | ||
| 564 | faddd $ahia,$nhia,$nhia | ||
| 565 | fmuld $nhi,$nc,$nhic | ||
| 566 | fmuld $ahi,$bd,$ahid | ||
| 567 | faddd $ahib,$nhib,$nhib | ||
| 568 | fmuld $nhi,$nd,$nhid | ||
| 569 | |||
| 570 | faddd $ahic,$nhic,$dota ! $nhic | ||
| 571 | faddd $ahid,$nhid,$dotb ! $nhid | ||
| 572 | |||
| 573 | faddd $nloc,$nhia,$nloc | ||
| 574 | faddd $nlod,$nhib,$nlod | ||
| 575 | |||
| 576 | fdtox $nloa,$nloa | ||
| 577 | fdtox $nlob,$nlob | ||
| 578 | fdtox $nloc,$nloc | ||
| 579 | fdtox $nlod,$nlod | ||
| 580 | |||
| 581 | std $nloa,[%sp+$bias+$frame+0] | ||
| 582 | std $nlob,[%sp+$bias+$frame+8] | ||
| 583 | std $nloc,[%sp+$bias+$frame+16] | ||
| 584 | add $j,8,$j | ||
| 585 | std $nlod,[%sp+$bias+$frame+24] | ||
| 586 | |||
| 587 | ldd [$ap_l+$j],$alo ! load a[j] in double format | ||
| 588 | ldd [$ap_h+$j],$ahi | ||
| 589 | ldd [$np_l+$j],$nlo ! load n[j] in double format | ||
| 590 | ldd [$np_h+$j],$nhi | ||
| 591 | |||
| 592 | fmuld $alo,$ba,$aloa | ||
| 593 | fmuld $nlo,$na,$nloa | ||
| 594 | fmuld $alo,$bb,$alob | ||
| 595 | fmuld $nlo,$nb,$nlob | ||
| 596 | fmuld $alo,$bc,$aloc | ||
| 597 | ldx [%sp+$bias+$frame+0],%o0 | ||
| 598 | faddd $aloa,$nloa,$nloa | ||
| 599 | fmuld $nlo,$nc,$nloc | ||
| 600 | ldx [%sp+$bias+$frame+8],%o1 | ||
| 601 | fmuld $alo,$bd,$alod | ||
| 602 | ldx [%sp+$bias+$frame+16],%o2 | ||
| 603 | faddd $alob,$nlob,$nlob | ||
| 604 | fmuld $nlo,$nd,$nlod | ||
| 605 | ldx [%sp+$bias+$frame+24],%o3 | ||
| 606 | fmuld $ahi,$ba,$ahia | ||
| 607 | |||
| 608 | srlx %o0,16,%o7 | ||
| 609 | faddd $aloc,$nloc,$nloc | ||
| 610 | fmuld $nhi,$na,$nhia | ||
| 611 | add %o7,%o1,%o1 | ||
| 612 | fmuld $ahi,$bb,$ahib | ||
| 613 | srlx %o1,16,%o7 | ||
| 614 | faddd $alod,$nlod,$nlod | ||
| 615 | fmuld $nhi,$nb,$nhib | ||
| 616 | add %o7,%o2,%o2 | ||
| 617 | fmuld $ahi,$bc,$ahic | ||
| 618 | srlx %o2,16,%o7 | ||
| 619 | faddd $ahia,$nhia,$nhia | ||
| 620 | fmuld $nhi,$nc,$nhic | ||
| 621 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
| 622 | ! why? | ||
| 623 | and %o0,$mask,%o0 | ||
| 624 | fmuld $ahi,$bd,$ahid | ||
| 625 | and %o1,$mask,%o1 | ||
| 626 | and %o2,$mask,%o2 | ||
| 627 | faddd $ahib,$nhib,$nhib | ||
| 628 | fmuld $nhi,$nd,$nhid | ||
| 629 | sllx %o1,16,%o1 | ||
| 630 | faddd $dota,$nloa,$nloa | ||
| 631 | sllx %o2,32,%o2 | ||
| 632 | faddd $dotb,$nlob,$nlob | ||
| 633 | sllx %o3,48,%o7 | ||
| 634 | or %o1,%o0,%o0 | ||
| 635 | faddd $ahic,$nhic,$dota ! $nhic | ||
| 636 | or %o2,%o0,%o0 | ||
| 637 | faddd $ahid,$nhid,$dotb ! $nhid | ||
| 638 | or %o7,%o0,%o0 ! 64-bit result | ||
| 639 | ldx [$tp],%o7 | ||
| 640 | faddd $nloc,$nhia,$nloc | ||
| 641 | addcc %o7,%o0,%o0 | ||
| 642 | ! end-of-why? | ||
| 643 | faddd $nlod,$nhib,$nlod | ||
| 644 | srlx %o3,16,%g1 ! 34-bit carry | ||
| 645 | fdtox $nloa,$nloa | ||
| 646 | bcs,a %xcc,.+8 | ||
| 647 | add %g1,1,%g1 | ||
| 648 | |||
| 649 | fdtox $nlob,$nlob | ||
| 650 | fdtox $nloc,$nloc | ||
| 651 | fdtox $nlod,$nlod | ||
| 652 | |||
| 653 | std $nloa,[%sp+$bias+$frame+0] | ||
| 654 | std $nlob,[%sp+$bias+$frame+8] | ||
| 655 | addcc $j,8,$j | ||
| 656 | std $nloc,[%sp+$bias+$frame+16] | ||
| 657 | bz,pn %icc,.Linnerskip | ||
| 658 | std $nlod,[%sp+$bias+$frame+24] | ||
| 659 | |||
| 660 | ba .Linner | ||
| 661 | nop | ||
| 662 | .align 32 | ||
| 663 | .Linner: | ||
| 664 | ldd [$ap_l+$j],$alo ! load a[j] in double format | ||
| 665 | ldd [$ap_h+$j],$ahi | ||
| 666 | ldd [$np_l+$j],$nlo ! load n[j] in double format | ||
| 667 | ldd [$np_h+$j],$nhi | ||
| 668 | |||
| 669 | fmuld $alo,$ba,$aloa | ||
| 670 | fmuld $nlo,$na,$nloa | ||
| 671 | fmuld $alo,$bb,$alob | ||
| 672 | fmuld $nlo,$nb,$nlob | ||
| 673 | fmuld $alo,$bc,$aloc | ||
| 674 | ldx [%sp+$bias+$frame+0],%o0 | ||
| 675 | faddd $aloa,$nloa,$nloa | ||
| 676 | fmuld $nlo,$nc,$nloc | ||
| 677 | ldx [%sp+$bias+$frame+8],%o1 | ||
| 678 | fmuld $alo,$bd,$alod | ||
| 679 | ldx [%sp+$bias+$frame+16],%o2 | ||
| 680 | faddd $alob,$nlob,$nlob | ||
| 681 | fmuld $nlo,$nd,$nlod | ||
| 682 | ldx [%sp+$bias+$frame+24],%o3 | ||
| 683 | fmuld $ahi,$ba,$ahia | ||
| 684 | |||
| 685 | srlx %o0,16,%o7 | ||
| 686 | faddd $aloc,$nloc,$nloc | ||
| 687 | fmuld $nhi,$na,$nhia | ||
| 688 | add %o7,%o1,%o1 | ||
| 689 | fmuld $ahi,$bb,$ahib | ||
| 690 | srlx %o1,16,%o7 | ||
| 691 | faddd $alod,$nlod,$nlod | ||
| 692 | fmuld $nhi,$nb,$nhib | ||
| 693 | add %o7,%o2,%o2 | ||
| 694 | fmuld $ahi,$bc,$ahic | ||
| 695 | srlx %o2,16,%o7 | ||
| 696 | faddd $ahia,$nhia,$nhia | ||
| 697 | fmuld $nhi,$nc,$nhic | ||
| 698 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
| 699 | and %o0,$mask,%o0 | ||
| 700 | fmuld $ahi,$bd,$ahid | ||
| 701 | and %o1,$mask,%o1 | ||
| 702 | and %o2,$mask,%o2 | ||
| 703 | faddd $ahib,$nhib,$nhib | ||
| 704 | fmuld $nhi,$nd,$nhid | ||
| 705 | sllx %o1,16,%o1 | ||
| 706 | faddd $dota,$nloa,$nloa | ||
| 707 | sllx %o2,32,%o2 | ||
| 708 | faddd $dotb,$nlob,$nlob | ||
| 709 | sllx %o3,48,%o7 | ||
| 710 | or %o1,%o0,%o0 | ||
| 711 | faddd $ahic,$nhic,$dota ! $nhic | ||
| 712 | or %o2,%o0,%o0 | ||
| 713 | faddd $ahid,$nhid,$dotb ! $nhid | ||
| 714 | or %o7,%o0,%o0 ! 64-bit result | ||
| 715 | faddd $nloc,$nhia,$nloc | ||
| 716 | addcc %g1,%o0,%o0 | ||
| 717 | ldx [$tp+8],%o7 ! tp[j] | ||
| 718 | faddd $nlod,$nhib,$nlod | ||
| 719 | srlx %o3,16,%g1 ! 34-bit carry | ||
| 720 | fdtox $nloa,$nloa | ||
| 721 | bcs,a %xcc,.+8 | ||
| 722 | add %g1,1,%g1 | ||
| 723 | fdtox $nlob,$nlob | ||
| 724 | addcc %o7,%o0,%o0 | ||
| 725 | fdtox $nloc,$nloc | ||
| 726 | bcs,a %xcc,.+8 | ||
| 727 | add %g1,1,%g1 | ||
| 728 | |||
| 729 | stx %o0,[$tp] ! tp[j-1] | ||
| 730 | fdtox $nlod,$nlod | ||
| 731 | |||
| 732 | std $nloa,[%sp+$bias+$frame+0] | ||
| 733 | std $nlob,[%sp+$bias+$frame+8] | ||
| 734 | std $nloc,[%sp+$bias+$frame+16] | ||
| 735 | addcc $j,8,$j | ||
| 736 | std $nlod,[%sp+$bias+$frame+24] | ||
| 737 | bnz,pt %icc,.Linner | ||
| 738 | add $tp,8,$tp | ||
| 739 | |||
| 740 | .Linnerskip: | ||
| 741 | fdtox $dota,$dota | ||
| 742 | fdtox $dotb,$dotb | ||
| 743 | |||
| 744 | ldx [%sp+$bias+$frame+0],%o0 | ||
| 745 | ldx [%sp+$bias+$frame+8],%o1 | ||
| 746 | ldx [%sp+$bias+$frame+16],%o2 | ||
| 747 | ldx [%sp+$bias+$frame+24],%o3 | ||
| 748 | |||
| 749 | srlx %o0,16,%o7 | ||
| 750 | std $dota,[%sp+$bias+$frame+32] | ||
| 751 | add %o7,%o1,%o1 | ||
| 752 | std $dotb,[%sp+$bias+$frame+40] | ||
| 753 | srlx %o1,16,%o7 | ||
| 754 | add %o7,%o2,%o2 | ||
| 755 | srlx %o2,16,%o7 | ||
| 756 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
| 757 | and %o0,$mask,%o0 | ||
| 758 | and %o1,$mask,%o1 | ||
| 759 | and %o2,$mask,%o2 | ||
| 760 | sllx %o1,16,%o1 | ||
| 761 | sllx %o2,32,%o2 | ||
| 762 | sllx %o3,48,%o7 | ||
| 763 | or %o1,%o0,%o0 | ||
| 764 | or %o2,%o0,%o0 | ||
| 765 | ldx [%sp+$bias+$frame+32],%o4 | ||
| 766 | or %o7,%o0,%o0 ! 64-bit result | ||
| 767 | ldx [%sp+$bias+$frame+40],%o5 | ||
| 768 | addcc %g1,%o0,%o0 | ||
| 769 | ldx [$tp+8],%o7 ! tp[j] | ||
| 770 | srlx %o3,16,%g1 ! 34-bit carry | ||
| 771 | bcs,a %xcc,.+8 | ||
| 772 | add %g1,1,%g1 | ||
| 773 | |||
| 774 | addcc %o7,%o0,%o0 | ||
| 775 | bcs,a %xcc,.+8 | ||
| 776 | add %g1,1,%g1 | ||
| 777 | |||
| 778 | stx %o0,[$tp] ! tp[j-1] | ||
| 779 | add $tp,8,$tp | ||
| 780 | |||
| 781 | srlx %o4,16,%o7 | ||
| 782 | add %o7,%o5,%o5 | ||
| 783 | and %o4,$mask,%o4 | ||
| 784 | sllx %o5,16,%o7 | ||
| 785 | or %o7,%o4,%o4 | ||
| 786 | addcc %g1,%o4,%o4 | ||
| 787 | srlx %o5,48,%g1 | ||
| 788 | bcs,a %xcc,.+8 | ||
| 789 | add %g1,1,%g1 | ||
| 790 | |||
| 791 | addcc $carry,%o4,%o4 | ||
| 792 | stx %o4,[$tp] ! tp[num-1] | ||
| 793 | mov %g1,$carry | ||
| 794 | bcs,a %xcc,.+8 | ||
| 795 | add $carry,1,$carry | ||
| 796 | |||
| 797 | addcc $i,8,$i | ||
| 798 | bnz %icc,.Louter | ||
| 799 | nop | ||
| 800 | |||
| 801 | add $tp,8,$tp ! adjust tp to point at the end | ||
| 802 | orn %g0,%g0,%g4 | ||
| 803 | sub %g0,$num,%o7 ! n=-num | ||
| 804 | ba .Lsub | ||
| 805 | subcc %g0,%g0,%g0 ! clear %icc.c | ||
| 806 | |||
| 807 | .align 32 | ||
| 808 | .Lsub: | ||
| 809 | ldx [$tp+%o7],%o0 | ||
| 810 | add $np,%o7,%g1 | ||
| 811 | ld [%g1+0],%o2 | ||
| 812 | ld [%g1+4],%o3 | ||
| 813 | srlx %o0,32,%o1 | ||
| 814 | subccc %o0,%o2,%o2 | ||
| 815 | add $rp,%o7,%g1 | ||
| 816 | subccc %o1,%o3,%o3 | ||
| 817 | st %o2,[%g1+0] | ||
| 818 | add %o7,8,%o7 | ||
| 819 | brnz,pt %o7,.Lsub | ||
| 820 | st %o3,[%g1+4] | ||
| 821 | subc $carry,0,%g4 | ||
| 822 | sub %g0,$num,%o7 ! n=-num | ||
| 823 | ba .Lcopy | ||
| 824 | nop | ||
| 825 | |||
| 826 | .align 32 | ||
| 827 | .Lcopy: | ||
| 828 | ldx [$tp+%o7],%o0 | ||
| 829 | add $rp,%o7,%g1 | ||
| 830 | ld [%g1+0],%o2 | ||
| 831 | ld [%g1+4],%o3 | ||
| 832 | stx %g0,[$tp+%o7] | ||
| 833 | and %o0,%g4,%o0 | ||
| 834 | srlx %o0,32,%o1 | ||
| 835 | andn %o2,%g4,%o2 | ||
| 836 | andn %o3,%g4,%o3 | ||
| 837 | or %o2,%o0,%o0 | ||
| 838 | or %o3,%o1,%o1 | ||
| 839 | st %o0,[%g1+0] | ||
| 840 | add %o7,8,%o7 | ||
| 841 | brnz,pt %o7,.Lcopy | ||
| 842 | st %o1,[%g1+4] | ||
| 843 | sub %g0,$num,%o7 ! n=-num | ||
| 844 | |||
| 845 | .Lzap: | ||
| 846 | stx %g0,[$ap_l+%o7] | ||
| 847 | stx %g0,[$ap_h+%o7] | ||
| 848 | stx %g0,[$np_l+%o7] | ||
| 849 | stx %g0,[$np_h+%o7] | ||
| 850 | add %o7,8,%o7 | ||
| 851 | brnz,pt %o7,.Lzap | ||
| 852 | nop | ||
| 853 | |||
| 854 | ldx [%sp+$bias+$frame+48],%o7 | ||
| 855 | wr %g0,%o7,%asi ! restore %asi | ||
| 856 | |||
| 857 | mov 1,%i0 | ||
| 858 | .Lret: | ||
| 859 | ret | ||
| 860 | restore | ||
| 861 | .type $fname,#function | ||
| 862 | .size $fname,(.-$fname) | ||
| 863 | ___ | ||
| 864 | |||
| 865 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
| 866 | |||
| 867 | # The substitution below makes it possible to compile without demanding | ||
| 868 | # VIS extensions on the command line, e.g. -xarch=v9 vs. -xarch=v9a. I | ||
| 869 | # dare to do this because VIS capability is now detected at run-time | ||
| 870 | # and this routine is not called on a CPU incapable of executing it. Do | ||
| 871 | # note that fzeros is not the only VIS dependency! Another dependency | ||
| 872 | # is implicit: it is just _a_ numerical value loaded into the %asi | ||
| 873 | # register, which the assembler can't recognize as VIS-specific... | ||
| 874 | $code =~ s/fzeros\s+%f([0-9]+)/ | ||
| 875 | sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1) | ||
| 876 | /gem; | ||
| 877 | |||
| 878 | print $code; | ||
| 879 | # flush | ||
| 880 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/via-mont.pl b/src/lib/libcrypto/bn/asm/via-mont.pl deleted file mode 100644 index c046a514c8..0000000000 --- a/src/lib/libcrypto/bn/asm/via-mont.pl +++ /dev/null | |||
| @@ -1,242 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # Wrapper around 'rep montmul', a VIA-specific instruction accessing | ||
| 11 | # the PadLock Montgomery Multiplier. The wrapper is designed as a | ||
| 12 | # drop-in replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9]. | ||
| 13 | # | ||
| 14 | # Below are interleaved outputs from 'openssl speed rsa dsa' for 4 | ||
| 15 | # different software configurations on a 1.5GHz VIA Esther processor. | ||
| 16 | # Lines marked "software integer" denote the performance of the hand- | ||
| 17 | # coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2" | ||
| 18 | # refers to the hand-coded SSE2 Montgomery multiplication procedure | ||
| 19 | # found in OpenSSL 0.9.9. "Hardware VIA SDK" refers to the padlock_pmm | ||
| 20 | # routine from PadLock SDK 2.0.1, available from VIA, which naturally | ||
| 21 | # utilizes the magic 'repz montmul' instruction. And finally "hardware | ||
| 22 | # this" refers to *this* implementation, which also uses 'repz montmul'. | ||
| 23 | # | ||
| 24 | # sign verify sign/s verify/s | ||
| 25 | # rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer | ||
| 26 | # rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2 | ||
| 27 | # rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK | ||
| 28 | # rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this | ||
| 29 | # | ||
| 30 | # rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer | ||
| 31 | # rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2 | ||
| 32 | # rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK | ||
| 33 | # rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this | ||
| 34 | # | ||
| 35 | # rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer | ||
| 36 | # rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2 | ||
| 37 | # rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK | ||
| 38 | # rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this | ||
| 39 | # | ||
| 40 | # rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer | ||
| 41 | # rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2 | ||
| 42 | # rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK | ||
| 43 | # rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this | ||
| 44 | # | ||
| 45 | # dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer | ||
| 46 | # dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2 | ||
| 47 | # dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK | ||
| 48 | # dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this | ||
| 49 | # | ||
| 50 | # dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer | ||
| 51 | # dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2 | ||
| 52 | # dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK | ||
| 53 | # dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this | ||
| 54 | # | ||
| 55 | # dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer | ||
| 56 | # dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2 | ||
| 57 | # dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK | ||
| 58 | # dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this | ||
| 59 | # | ||
| 60 | # To give you some other reference point here is output for 2.4GHz P4 | ||
| 61 | # running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software | ||
| 62 | # SSE2" in above terms. | ||
| 63 | # | ||
| 64 | # rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0 | ||
| 65 | # rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0 | ||
| 66 | # rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9 | ||
| 67 | # rsa 4096 bits 0.109770s 0.002379s 9.1 420.3 | ||
| 68 | # dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1 | ||
| 69 | # dsa 1024 bits 0.001346s 0.001595s 742.7 627.0 | ||
| 70 | # dsa 2048 bits 0.004745s 0.005582s 210.7 179.1 | ||
| 71 | # | ||
| 72 | # Conclusions: | ||
| 73 | # - VIA SDK leaves a *lot* of room for improvement (which this | ||
| 74 | # implementation successfully fills:-); | ||
| 75 | # - 'rep montmul' gives up to >3x performance improvement depending on | ||
| 76 | # key length; | ||
| 77 | # - in terms of absolute performance it delivers approximately as much | ||
| 78 | # as modern out-of-order 32-bit cores [again, for longer keys]. | ||
| 79 | |||
| 80 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 81 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 82 | require "x86asm.pl"; | ||
| 83 | |||
| 84 | &asm_init($ARGV[0],"via-mont.pl"); | ||
| 85 | |||
| 86 | # int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); | ||
| 87 | $func="bn_mul_mont_padlock"; | ||
| 88 | |||
| 89 | $pad=16*1; # amount of reserved bytes on top of every vector | ||
| 90 | |||
| 91 | # stack layout | ||
| 92 | $mZeroPrime=&DWP(0,"esp"); # these are specified by VIA | ||
| 93 | $A=&DWP(4,"esp"); | ||
| 94 | $B=&DWP(8,"esp"); | ||
| 95 | $T=&DWP(12,"esp"); | ||
| 96 | $M=&DWP(16,"esp"); | ||
| 97 | $scratch=&DWP(20,"esp"); | ||
| 98 | $rp=&DWP(24,"esp"); # these are mine | ||
| 99 | $sp=&DWP(28,"esp"); | ||
| 100 | # &DWP(32,"esp") # 32 byte scratch area | ||
| 101 | # &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num] | ||
| 102 | # &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num] | ||
| 103 | # &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num] | ||
| 104 | # &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num] | ||
| 105 | # Note that the SDK suggests unconditionally allocating 2K per vector. | ||
| 106 | # This has quite an impact on performance; it naturally depends on key | ||
| 107 | # length, but to give an example, 1024-bit private RSA key operations | ||
| 108 | # suffer a >30% penalty. I allocate only as much as actually required... | ||
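An illustrative C view of the stack slots laid out above (the first six are the ones "specified by VIA", the last two are the wrapper's own; field names are assumed, and a 32-bit target is assumed so each slot is 4 bytes), plus the input checks the wrapper performs right below:

    #include <stdint.h>

    struct padlock_mont_frame {
        uint32_t  m_zero_prime; /* the *n0 word                     */
        uint32_t *a;            /* padded copy of ap                */
        uint32_t *b;            /* padded copy of bp                */
        uint32_t *t;            /* padded tp, receives the result   */
        uint32_t *m;            /* padded copy of np                */
        uint32_t *scratch;      /* 32-byte scratch area             */
        uint32_t *rp;           /* saved result pointer (wrapper's) */
        uint32_t *saved_sp;     /* saved stack pointer (wrapper's)  */
    };

    /* num must be a multiple of 4 and within 8..1024 32-bit words,
     * i.e. 256 to 32768 bits, as enforced at the top of the function. */
    static int
    padlock_num_ok(int num)
    {
        return (num % 4) == 0 && num >= 8 && num <= 1024;
    }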
| 109 | |||
| 110 | &function_begin($func); | ||
| 111 | &xor ("eax","eax"); | ||
| 112 | &mov ("ecx",&wparam(5)); # num | ||
| 113 | # meet VIA's limitations for num [note that the specification | ||
| 114 | # expresses them in bits, while we work with the number of 32-bit words] | ||
| 115 | &test ("ecx",3); | ||
| 116 | &jnz (&label("leave")); # num % 4 != 0 | ||
| 117 | &cmp ("ecx",8); | ||
| 118 | &jb (&label("leave")); # num < 8 | ||
| 119 | &cmp ("ecx",1024); | ||
| 120 | &ja (&label("leave")); # num > 1024 | ||
| 121 | |||
| 122 | &pushf (); | ||
| 123 | &cld (); | ||
| 124 | |||
| 125 | &mov ("edi",&wparam(0)); # rp | ||
| 126 | &mov ("eax",&wparam(1)); # ap | ||
| 127 | &mov ("ebx",&wparam(2)); # bp | ||
| 128 | &mov ("edx",&wparam(3)); # np | ||
| 129 | &mov ("esi",&wparam(4)); # n0 | ||
| 130 | &mov ("esi",&DWP(0,"esi")); # *n0 | ||
| 131 | |||
| 132 | &lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes | ||
| 133 | &lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes | ||
| 134 | &neg ("ebp"); | ||
| 135 | &add ("ebp","esp"); | ||
| 136 | &and ("ebp",-64); # align to cache-line | ||
| 137 | &xchg ("ebp","esp"); # alloca | ||
| 138 | |||
| 139 | &mov ($rp,"edi"); # save rp | ||
| 140 | &mov ($sp,"ebp"); # save esp | ||
| 141 | |||
| 142 | &mov ($mZeroPrime,"esi"); | ||
| 143 | &lea ("esi",&DWP(64,"esp")); # tp | ||
| 144 | &mov ($T,"esi"); | ||
| 145 | &lea ("edi",&DWP(32,"esp")); # scratch area | ||
| 146 | &mov ($scratch,"edi"); | ||
| 147 | &mov ("esi","eax"); | ||
| 148 | |||
| 149 | &lea ("ebp",&DWP(-$pad,"ecx")); | ||
| 150 | &shr ("ebp",2); # restore original num value in ebp | ||
| 151 | |||
| 152 | &xor ("eax","eax"); | ||
| 153 | |||
| 154 | &mov ("ecx","ebp"); | ||
| 155 | &lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch | ||
| 156 | &data_byte(0xf3,0xab); # rep stosl, bzero | ||
| 157 | |||
| 158 | &mov ("ecx","ebp"); | ||
| 159 | &lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy | ||
| 160 | &mov ($A,"edi"); | ||
| 161 | &data_byte(0xf3,0xa5); # rep movsl, memcpy | ||
| 162 | &mov ("ecx",$pad/4); | ||
| 163 | &data_byte(0xf3,0xab); # rep stosl, bzero pad | ||
| 164 | # edi points at the end of padded ap copy... | ||
| 165 | |||
| 166 | &mov ("ecx","ebp"); | ||
| 167 | &mov ("esi","ebx"); | ||
| 168 | &mov ($B,"edi"); | ||
| 169 | &data_byte(0xf3,0xa5); # rep movsl, memcpy | ||
| 170 | &mov ("ecx",$pad/4); | ||
| 171 | &data_byte(0xf3,0xab); # rep stosl, bzero pad | ||
| 172 | # edi points at the end of padded bp copy... | ||
| 173 | |||
| 174 | &mov ("ecx","ebp"); | ||
| 175 | &mov ("esi","edx"); | ||
| 176 | &mov ($M,"edi"); | ||
| 177 | &data_byte(0xf3,0xa5); # rep movsl, memcpy | ||
| 178 | &mov ("ecx",$pad/4); | ||
| 179 | &data_byte(0xf3,0xab); # rep stosl, bzero pad | ||
| 180 | # edi points at the end of padded np copy... | ||
| 181 | |||
| 182 | # let magic happen... | ||
| 183 | &mov ("ecx","ebp"); | ||
| 184 | &mov ("esi","esp"); | ||
| 185 | &shl ("ecx",5); # convert word counter to bit counter | ||
| 186 | &align (4); | ||
| 187 | &data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul | ||
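As a functional model of what 'rep montmul' computes, namely the Montgomery product t = a*b*R^-1 mod n with R = 2^(32*num), here is a Math::BigInt sketch (hypothetical helper; it models the mathematical result only, not the word-by-word reduction the hardware performs):

```perl
use Math::BigInt;

# $a, $b, $n are Math::BigInt objects; $n must be odd; $num is in words.
sub mont_mul_model {
    my ($a, $b, $n, $num) = @_;
    my $R    = Math::BigInt->new(1)->blsft(32 * $num);  # R = 2^(32*num)
    my $Rinv = $R->copy->bmodinv($n);                   # R^-1 mod n
    return $a->copy->bmul($b)->bmul($Rinv)->bmod($n);   # a*b*R^-1 mod n
}
```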
| 188 | |||
| 189 | &mov ("ecx","ebp"); | ||
| 190 | &lea ("esi",&DWP(64,"esp")); # tp | ||
| 191 | # edi still points at the end of padded np copy... | ||
| 192 | &neg ("ebp"); | ||
| 193 | &lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind" | ||
| 194 | &mov ("edi",$rp); # restore rp | ||
| 195 | &xor ("edx","edx"); # i=0 and clear CF | ||
| 196 | |||
| 197 | &set_label("sub",8); | ||
| 198 | &mov ("eax",&DWP(0,"esi","edx",4)); | ||
| 199 | &sbb ("eax",&DWP(0,"ebp","edx",4)); | ||
| 200 | &mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i] | ||
| 201 | &lea ("edx",&DWP(1,"edx")); # i++ | ||
| 202 | &loop (&label("sub")); # doesn't affect CF! | ||
| 203 | |||
| 204 | &mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit | ||
| 205 | &sbb ("eax",0); | ||
| 206 | &and ("esi","eax"); | ||
| 207 | ¬ ("eax"); | ||
| 208 | &mov ("ebp","edi"); | ||
| 209 | &and ("ebp","eax"); | ||
| 210 | &or ("esi","ebp"); # tp=carry?tp:rp | ||
| 211 | |||
| 212 | &mov ("ecx","edx"); # num | ||
| 213 | &xor ("edx","edx"); # i=0 | ||
| 214 | |||
| 215 | &set_label("copy",8); | ||
| 216 | &mov ("eax",&DWP(0,"esi","edx",4)); | ||
| 217 | &mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp | ||
| 218 | &mov (&DWP(0,"edi","edx",4),"eax"); | ||
| 219 | &lea ("edx",&DWP(1,"edx")); # i++ | ||
| 220 | &loop (&label("copy")); | ||
| 221 | |||
| 222 | &mov ("ebp",$sp); | ||
| 223 | &xor ("eax","eax"); | ||
| 224 | |||
| 225 | &mov ("ecx",64/4); | ||
| 226 | &mov ("edi","esp"); # zap frame including scratch area | ||
| 227 | &data_byte(0xf3,0xab); # rep stosl, bzero | ||
| 228 | |||
| 229 | # zap copies of ap, bp and np | ||
| 230 | &lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap | ||
| 231 | &lea ("ecx",&DWP(3*$pad/4,"edx","edx",2)); | ||
| 232 | &data_byte(0xf3,0xab); # rep stosl, bzero | ||
| 233 | |||
| 234 | &mov ("esp","ebp"); | ||
| 235 | &inc ("eax"); # signal "done" | ||
| 236 | &popf (); | ||
| 237 | &set_label("leave"); | ||
| 238 | &function_end($func); | ||
| 239 | |||
| 240 | &asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>"); | ||
| 241 | |||
| 242 | &asm_finish(); | ||
diff --git a/src/lib/libcrypto/cast/asm/cast-586.pl b/src/lib/libcrypto/cast/asm/cast-586.pl deleted file mode 100644 index 7a0083ecb8..0000000000 --- a/src/lib/libcrypto/cast/asm/cast-586.pl +++ /dev/null | |||
| @@ -1,177 +0,0 @@ | |||
| 1 | #!/usr/local/bin/perl | ||
| 2 | |||
| 3 | # define for pentium pro friendly version | ||
| 4 | $ppro=1; | ||
| 5 | |||
| 6 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 7 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 8 | require "x86asm.pl"; | ||
| 9 | require "cbc.pl"; | ||
| 10 | |||
| 11 | &asm_init($ARGV[0],"cast-586.pl",$ARGV[$#ARGV] eq "386"); | ||
| 12 | |||
| 13 | $CAST_ROUNDS=16; | ||
| 14 | $L="edi"; | ||
| 15 | $R="esi"; | ||
| 16 | $K="ebp"; | ||
| 17 | $tmp1="ecx"; | ||
| 18 | $tmp2="ebx"; | ||
| 19 | $tmp3="eax"; | ||
| 20 | $tmp4="edx"; | ||
| 21 | $S1="CAST_S_table0"; | ||
| 22 | $S2="CAST_S_table1"; | ||
| 23 | $S3="CAST_S_table2"; | ||
| 24 | $S4="CAST_S_table3"; | ||
| 25 | |||
| 26 | @F1=("add","xor","sub"); | ||
| 27 | @F2=("xor","sub","add"); | ||
| 28 | @F3=("sub","add","xor"); | ||
| 29 | |||
| 30 | &CAST_encrypt("CAST_encrypt",1); | ||
| 31 | &CAST_encrypt("CAST_decrypt",0); | ||
| 32 | &cbc("CAST_cbc_encrypt","CAST_encrypt","CAST_decrypt",1,4,5,3,-1,-1) unless $main'openbsd; | ||
| 33 | |||
| 34 | &asm_finish(); | ||
| 35 | |||
| 36 | sub CAST_encrypt { | ||
| 37 | local($name,$enc)=@_; | ||
| 38 | |||
| 39 | local($win_ex)=<<"EOF"; | ||
| 40 | EXTERN _CAST_S_table0:DWORD | ||
| 41 | EXTERN _CAST_S_table1:DWORD | ||
| 42 | EXTERN _CAST_S_table2:DWORD | ||
| 43 | EXTERN _CAST_S_table3:DWORD | ||
| 44 | EOF | ||
| 45 | &main::external_label( | ||
| 46 | "CAST_S_table0", | ||
| 47 | "CAST_S_table1", | ||
| 48 | "CAST_S_table2", | ||
| 49 | "CAST_S_table3", | ||
| 50 | ); | ||
| 51 | |||
| 52 | &function_begin_B($name,$win_ex); | ||
| 53 | |||
| 54 | &comment(""); | ||
| 55 | |||
| 56 | &push("ebp"); | ||
| 57 | &push("ebx"); | ||
| 58 | &mov($tmp2,&wparam(0)); | ||
| 59 | &mov($K,&wparam(1)); | ||
| 60 | &push("esi"); | ||
| 61 | &push("edi"); | ||
| 62 | |||
| 63 | &comment("Load the 2 words"); | ||
| 64 | &mov($L,&DWP(0,$tmp2,"",0)); | ||
| 65 | &mov($R,&DWP(4,$tmp2,"",0)); | ||
| 66 | |||
| 67 | &comment('Get short key flag'); | ||
| 68 | &mov($tmp3,&DWP(128,$K,"",0)); | ||
| 69 | if($enc) { | ||
| 70 | &push($tmp3); | ||
| 71 | } else { | ||
| 72 | &or($tmp3,$tmp3); | ||
| 73 | &jnz(&label('cast_dec_skip')); | ||
| 74 | } | ||
| 75 | |||
| 76 | &xor($tmp3, $tmp3); | ||
| 77 | |||
| 78 | # encrypting part | ||
| 79 | |||
| 80 | if ($enc) { | ||
| 81 | &E_CAST( 0,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 82 | &E_CAST( 1,$S,$R,$L,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 83 | &E_CAST( 2,$S,$L,$R,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 84 | &E_CAST( 3,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 85 | &E_CAST( 4,$S,$L,$R,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 86 | &E_CAST( 5,$S,$R,$L,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 87 | &E_CAST( 6,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 88 | &E_CAST( 7,$S,$R,$L,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 89 | &E_CAST( 8,$S,$L,$R,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 90 | &E_CAST( 9,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 91 | &E_CAST(10,$S,$L,$R,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 92 | &E_CAST(11,$S,$R,$L,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 93 | &comment('test short key flag'); | ||
| 94 | &pop($tmp4); | ||
| 95 | &or($tmp4,$tmp4); | ||
| 96 | &jnz(&label('cast_enc_done')); | ||
| 97 | &E_CAST(12,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 98 | &E_CAST(13,$S,$R,$L,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 99 | &E_CAST(14,$S,$L,$R,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 100 | &E_CAST(15,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 101 | } else { | ||
| 102 | &E_CAST(15,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 103 | &E_CAST(14,$S,$R,$L,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 104 | &E_CAST(13,$S,$L,$R,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 105 | &E_CAST(12,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 106 | &set_label('cast_dec_skip'); | ||
| 107 | &E_CAST(11,$S,$L,$R,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 108 | &E_CAST(10,$S,$R,$L,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 109 | &E_CAST( 9,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 110 | &E_CAST( 8,$S,$R,$L,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 111 | &E_CAST( 7,$S,$L,$R,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 112 | &E_CAST( 6,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 113 | &E_CAST( 5,$S,$L,$R,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 114 | &E_CAST( 4,$S,$R,$L,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 115 | &E_CAST( 3,$S,$L,$R,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 116 | &E_CAST( 2,$S,$R,$L,$K,@F3,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 117 | &E_CAST( 1,$S,$L,$R,$K,@F2,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 118 | &E_CAST( 0,$S,$R,$L,$K,@F1,$tmp1,$tmp2,$tmp3,$tmp4); | ||
| 119 | } | ||
| 120 | |||
| 121 | &set_label('cast_enc_done') if $enc; | ||
| 122 | # Why the nop? - Ben 17/1/99 | ||
| 123 | &nop(); | ||
| 124 | &mov($tmp3,&wparam(0)); | ||
| 125 | &mov(&DWP(4,$tmp3,"",0),$L); | ||
| 126 | &mov(&DWP(0,$tmp3,"",0),$R); | ||
| 127 | &function_end($name); | ||
| 128 | } | ||
| 129 | |||
| 130 | sub E_CAST { | ||
| 131 | local($i,$S,$L,$R,$K,$OP1,$OP2,$OP3,$tmp1,$tmp2,$tmp3,$tmp4)=@_; | ||
| 132 | # Ri needs to have 16 pre added. | ||
| 133 | |||
| 134 | &comment("round $i"); | ||
| 135 | &mov( $tmp4, &DWP($i*8,$K,"",1)); | ||
| 136 | |||
| 137 | &mov( $tmp1, &DWP($i*8+4,$K,"",1)); | ||
| 138 | &$OP1( $tmp4, $R); | ||
| 139 | |||
| 140 | &rotl( $tmp4, &LB($tmp1)); | ||
| 141 | |||
| 142 | if ($ppro) { | ||
| 143 | &mov( $tmp2, $tmp4); # B | ||
| 144 | &xor( $tmp1, $tmp1); | ||
| 145 | |||
| 146 | &movb( &LB($tmp1), &HB($tmp4)); # A | ||
| 147 | &and( $tmp2, 0xff); | ||
| 148 | |||
| 149 | &shr( $tmp4, 16); # | ||
| 150 | &xor( $tmp3, $tmp3); | ||
| 151 | } else { | ||
| 152 | &mov( $tmp2, $tmp4); # B | ||
| 153 | &movb( &LB($tmp1), &HB($tmp4)); # A # BAD BAD BAD | ||
| 154 | |||
| 155 | &shr( $tmp4, 16); # | ||
| 156 | &and( $tmp2, 0xff); | ||
| 157 | } | ||
| 158 | |||
| 159 | &movb( &LB($tmp3), &HB($tmp4)); # C # BAD BAD BAD | ||
| 160 | &and( $tmp4, 0xff); # D | ||
| 161 | |||
| 162 | &mov( $tmp1, &DWP($S1,"",$tmp1,4)); | ||
| 163 | &mov( $tmp2, &DWP($S2,"",$tmp2,4)); | ||
| 164 | |||
| 165 | &$OP2( $tmp1, $tmp2); | ||
| 166 | &mov( $tmp2, &DWP($S3,"",$tmp3,4)); | ||
| 167 | |||
| 168 | &$OP3( $tmp1, $tmp2); | ||
| 169 | &mov( $tmp2, &DWP($S4,"",$tmp4,4)); | ||
| 170 | |||
| 171 | &$OP1( $tmp1, $tmp2); | ||
| 172 | # XXX | ||
| 173 | |||
| 174 | &xor( $L, $tmp1); | ||
| 175 | # XXX | ||
| 176 | } | ||
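For reference, the @F1 sequence above (add, xor, sub, closed by a final add) is CAST-128's type-1 round function from RFC 2144; @F2 and @F3 rotate the same three operations. A plain-Perl sketch of the type-1 function, without the interleaved scheduling (hypothetical helper; the four standard S-boxes are passed as array refs):

```perl
sub cast_f1 {
    my ($R, $Km, $Kr, $s1, $s2, $s3, $s4) = @_;
    my $I = ($Km + $R) & 0xffffffff;                       # OP1: add
    $I = (($I << $Kr) | ($I >> (32 - $Kr))) & 0xffffffff
        if $Kr % 32;                                       # I = (Km+R) <<< Kr
    my ($a, $b, $c, $d) = ($I >> 24 & 0xff, $I >> 16 & 0xff,
                           $I >>  8 & 0xff, $I       & 0xff);
    my $f = $s1->[$a] ^ $s2->[$b];                         # OP2: xor
    $f = ($f - $s3->[$c]) & 0xffffffff;                    # OP3: sub
    return ($f + $s4->[$d]) & 0xffffffff;                  # OP1: add
}
```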
| 177 | |||
diff --git a/src/lib/libcrypto/des/asm/crypt586.pl b/src/lib/libcrypto/des/asm/crypt586.pl deleted file mode 100644 index e36f7d44bd..0000000000 --- a/src/lib/libcrypto/des/asm/crypt586.pl +++ /dev/null | |||
| @@ -1,209 +0,0 @@ | |||
| 1 | #!/usr/local/bin/perl | ||
| 2 | # | ||
| 3 | # The inner loop instruction sequence and the IP/FP modifications are from | ||
| 4 | # Svend Olaf Mikkelsen <svolaf@inet.uni-c.dk> | ||
| 5 | # I've added the stuff needed for crypt() but I've not worried about making | ||
| 6 | # things perfect. | ||
| 7 | # | ||
| 8 | |||
| 9 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 10 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 11 | require "x86asm.pl"; | ||
| 12 | |||
| 13 | &asm_init($ARGV[0],"crypt586.pl"); | ||
| 14 | |||
| 15 | $L="edi"; | ||
| 16 | $R="esi"; | ||
| 17 | |||
| 18 | &external_label("DES_SPtrans"); | ||
| 19 | &fcrypt_body("fcrypt_body"); | ||
| 20 | &asm_finish(); | ||
| 21 | |||
| 22 | sub fcrypt_body | ||
| 23 | { | ||
| 24 | local($name,$do_ip)=@_; | ||
| 25 | |||
| 26 | &function_begin($name); | ||
| 27 | |||
| 28 | &comment(""); | ||
| 29 | &comment("Load the 2 words"); | ||
| 30 | $trans="ebp"; | ||
| 31 | |||
| 32 | &xor( $L, $L); | ||
| 33 | &xor( $R, $R); | ||
| 34 | |||
| 35 | # PIC-ification:-) | ||
| 36 | &picmeup("edx","DES_SPtrans"); | ||
| 37 | #if ($cpp) { &picmeup("edx","DES_SPtrans"); } | ||
| 38 | #else { &lea("edx",&DWP("DES_SPtrans")); } | ||
| 39 | &push("edx"); # becomes &swtmp(1) | ||
| 40 | # | ||
| 41 | &mov($trans,&wparam(1)); # reloaded with DES_SPtrans in D_ENCRYPT | ||
| 42 | |||
| 43 | &push(&DWC(25)); # add a variable | ||
| 44 | |||
| 45 | &set_label("start"); | ||
| 46 | for ($i=0; $i<16; $i+=2) | ||
| 47 | { | ||
| 48 | &comment(""); | ||
| 49 | &comment("Round $i"); | ||
| 50 | &D_ENCRYPT($i,$L,$R,$i*2,$trans,"eax","ebx","ecx","edx"); | ||
| 51 | |||
| 52 | &comment(""); | ||
| 53 | &comment("Round ".sprintf("%d",$i+1)); | ||
| 54 | &D_ENCRYPT($i+1,$R,$L,($i+1)*2,$trans,"eax","ebx","ecx","edx"); | ||
| 55 | } | ||
| 56 | &mov("ebx", &swtmp(0)); | ||
| 57 | &mov("eax", $L); | ||
| 58 | &dec("ebx"); | ||
| 59 | &mov($L, $R); | ||
| 60 | &mov($R, "eax"); | ||
| 61 | &mov(&swtmp(0), "ebx"); | ||
| 62 | &jnz(&label("start")); | ||
| 63 | |||
| 64 | &comment(""); | ||
| 65 | &comment("FP"); | ||
| 66 | &mov("edx",&wparam(0)); | ||
| 67 | |||
| 68 | &FP_new($R,$L,"eax",3); | ||
| 69 | &mov(&DWP(0,"edx","",0),"eax"); | ||
| 70 | &mov(&DWP(4,"edx","",0),$L); | ||
| 71 | |||
| 72 | &add("esp",8); # remove variables | ||
| 73 | |||
| 74 | &function_end($name); | ||
| 75 | } | ||
| 76 | |||
| 77 | sub D_ENCRYPT | ||
| 78 | { | ||
| 79 | local($r,$L,$R,$S,$trans,$u,$tmp1,$tmp2,$t)=@_; | ||
| 80 | |||
| 81 | &mov( $u, &wparam(2)); # 2 | ||
| 82 | &mov( $t, $R); | ||
| 83 | &shr( $t, 16); # 1 | ||
| 84 | &mov( $tmp2, &wparam(3)); # 2 | ||
| 85 | &xor( $t, $R); # 1 | ||
| 86 | |||
| 87 | &and( $u, $t); # 2 | ||
| 88 | &and( $t, $tmp2); # 2 | ||
| 89 | |||
| 90 | &mov( $tmp1, $u); | ||
| 91 | &shl( $tmp1, 16); # 1 | ||
| 92 | &mov( $tmp2, $t); | ||
| 93 | &shl( $tmp2, 16); # 1 | ||
| 94 | &xor( $u, $tmp1); # 2 | ||
| 95 | &xor( $t, $tmp2); # 2 | ||
| 96 | &mov( $tmp1, &DWP(&n2a($S*4),$trans,"",0)); # 2 | ||
| 97 | &xor( $u, $tmp1); | ||
| 98 | &mov( $tmp2, &DWP(&n2a(($S+1)*4),$trans,"",0)); # 2 | ||
| 99 | &xor( $u, $R); | ||
| 100 | &xor( $t, $R); | ||
| 101 | &xor( $t, $tmp2); | ||
| 102 | |||
| 103 | &and( $u, "0xfcfcfcfc" ); # 2 | ||
| 104 | &xor( $tmp1, $tmp1); # 1 | ||
| 105 | &and( $t, "0xcfcfcfcf" ); # 2 | ||
| 106 | &xor( $tmp2, $tmp2); | ||
| 107 | &movb( &LB($tmp1), &LB($u) ); | ||
| 108 | &movb( &LB($tmp2), &HB($u) ); | ||
| 109 | &rotr( $t, 4 ); | ||
| 110 | &mov( $trans, &swtmp(1)); | ||
| 111 | &xor( $L, &DWP(" ",$trans,$tmp1,0)); | ||
| 112 | &movb( &LB($tmp1), &LB($t) ); | ||
| 113 | &xor( $L, &DWP("0x200",$trans,$tmp2,0)); | ||
| 114 | &movb( &LB($tmp2), &HB($t) ); | ||
| 115 | &shr( $u, 16); | ||
| 116 | &xor( $L, &DWP("0x100",$trans,$tmp1,0)); | ||
| 117 | &movb( &LB($tmp1), &HB($u) ); | ||
| 118 | &shr( $t, 16); | ||
| 119 | &xor( $L, &DWP("0x300",$trans,$tmp2,0)); | ||
| 120 | &movb( &LB($tmp2), &HB($t) ); | ||
| 121 | &and( $u, "0xff" ); | ||
| 122 | &and( $t, "0xff" ); | ||
| 123 | &mov( $tmp1, &DWP("0x600",$trans,$tmp1,0)); | ||
| 124 | &xor( $L, $tmp1); | ||
| 125 | &mov( $tmp1, &DWP("0x700",$trans,$tmp2,0)); | ||
| 126 | &xor( $L, $tmp1); | ||
| 127 | &mov( $tmp1, &DWP("0x400",$trans,$u,0)); | ||
| 128 | &xor( $L, $tmp1); | ||
| 129 | &mov( $tmp1, &DWP("0x500",$trans,$t,0)); | ||
| 130 | &xor( $L, $tmp1); | ||
| 131 | &mov( $trans, &wparam(1)); | ||
| 132 | } | ||
| 133 | |||
| 134 | sub n2a | ||
| 135 | { | ||
| 136 | sprintf("%d",$_[0]); | ||
| 137 | } | ||
| 138 | |||
| 139 | # now has a side effect of rotating $a by $shift | ||
| 140 | sub R_PERM_OP | ||
| 141 | { | ||
| 142 | local($a,$b,$tt,$shift,$mask,$last)=@_; | ||
| 143 | |||
| 144 | &rotl( $a, $shift ) if ($shift != 0); | ||
| 145 | &mov( $tt, $a ); | ||
| 146 | &xor( $a, $b ); | ||
| 147 | &and( $a, $mask ); | ||
| 148 | if ($notlast eq $b) | ||
| 149 | { | ||
| 150 | &xor( $b, $a ); | ||
| 151 | &xor( $tt, $a ); | ||
| 152 | } | ||
| 153 | else | ||
| 154 | { | ||
| 155 | &xor( $tt, $a ); | ||
| 156 | &xor( $b, $a ); | ||
| 157 | } | ||
| 158 | &comment(""); | ||
| 159 | } | ||
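R_PERM_OP is a register-scheduled form of the classic DES delta swap: exchange the bits of one word selected by $mask with the bits of another word $shift positions away. A minimal sketch of the underlying identity (the generator's variant rotates $a first, as the comment above notes):

```perl
sub perm_op {
    my ($a, $b, $shift, $mask) = @_;
    my $t = (($a >> $shift) ^ $b) & $mask;  # bit pairs that differ
    $b ^= $t;                               # swap them into $b ...
    $a ^= $t << $shift;                     # ... and out of $a
    return ($a & 0xffffffff, $b & 0xffffffff);
}
```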
| 160 | |||
| 161 | sub IP_new | ||
| 162 | { | ||
| 163 | local($l,$r,$tt,$lr)=@_; | ||
| 164 | |||
| 165 | &R_PERM_OP($l,$r,$tt, 4,"0xf0f0f0f0",$l); | ||
| 166 | &R_PERM_OP($r,$tt,$l,20,"0xfff0000f",$l); | ||
| 167 | &R_PERM_OP($l,$tt,$r,14,"0x33333333",$r); | ||
| 168 | &R_PERM_OP($tt,$r,$l,22,"0x03fc03fc",$r); | ||
| 169 | &R_PERM_OP($l,$r,$tt, 9,"0xaaaaaaaa",$r); | ||
| 170 | |||
| 171 | if ($lr != 3) | ||
| 172 | { | ||
| 173 | if (($lr-3) < 0) | ||
| 174 | { &rotr($tt, 3-$lr); } | ||
| 175 | else { &rotl($tt, $lr-3); } | ||
| 176 | } | ||
| 177 | if ($lr != 2) | ||
| 178 | { | ||
| 179 | if (($lr-2) < 0) | ||
| 180 | { &rotr($r, 2-$lr); } | ||
| 181 | else { &rotl($r, $lr-2); } | ||
| 182 | } | ||
| 183 | } | ||
| 184 | |||
| 185 | sub FP_new | ||
| 186 | { | ||
| 187 | local($l,$r,$tt,$lr)=@_; | ||
| 188 | |||
| 189 | if ($lr != 2) | ||
| 190 | { | ||
| 191 | if (($lr-2) < 0) | ||
| 192 | { &rotl($r, 2-$lr); } | ||
| 193 | else { &rotr($r, $lr-2); } | ||
| 194 | } | ||
| 195 | if ($lr != 3) | ||
| 196 | { | ||
| 197 | if (($lr-3) < 0) | ||
| 198 | { &rotl($l, 3-$lr); } | ||
| 199 | else { &rotr($l, $lr-3); } | ||
| 200 | } | ||
| 201 | |||
| 202 | &R_PERM_OP($l,$r,$tt, 0,"0xaaaaaaaa",$r); | ||
| 203 | &R_PERM_OP($tt,$r,$l,23,"0x03fc03fc",$r); | ||
| 204 | &R_PERM_OP($l,$r,$tt,10,"0x33333333",$l); | ||
| 205 | &R_PERM_OP($r,$tt,$l,18,"0xfff0000f",$l); | ||
| 206 | &R_PERM_OP($l,$tt,$r,12,"0xf0f0f0f0",$r); | ||
| 207 | &rotr($tt , 4); | ||
| 208 | } | ||
| 209 | |||
diff --git a/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl b/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl deleted file mode 100644 index 8e7674e9d2..0000000000 --- a/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl +++ /dev/null | |||
| @@ -1,608 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # January 2009 | ||
| 11 | # | ||
| 12 | # Provided that UltraSPARC VIS instructions are pipe-lined(*) and | ||
| 13 | # pairable(*) with IALU ones, offloading of Xupdate to the UltraSPARC | ||
| 14 | # Graphic Unit would make it possible to achieve higher instruction- | ||
| 15 | # level parallelism, ILP, and thus higher performance. It should be | ||
| 16 | # explicitly noted that ILP is the keyword, and it means that this | ||
| 17 | # code would be unsuitable for cores like UltraSPARC-Tx. The idea is | ||
| 18 | # not really novel; Sun has had a VIS-powered implementation for a while. | ||
| 19 | # Unlike Sun's implementation this one can process multiple unaligned | ||
| 20 | # input blocks, and as such works as drop-in replacement for OpenSSL | ||
| 21 | # sha1_block_data_order. Performance improvement was measured to be | ||
| 22 | # 40% over pure IALU sha1-sparcv9.pl on UltraSPARC-IIi, but 12% on | ||
| 23 | # UltraSPARC-III. See below for discussion... | ||
| 24 | # | ||
| 25 | # The module is not of direct interest to OpenSSL, because | ||
| 26 | # it doesn't provide better performance on contemporary SPARCv9 CPUs, | ||
| 27 | # UltraSPARC-Tx and SPARC64-V[II] to be specific. Those who feel they | ||
| 28 | # absolutely must score on UltraSPARC-I-IV can simply replace | ||
| 29 | # crypto/sha/asm/sha1-sparcv9.pl with this module. | ||
| 30 | # | ||
| 31 | # (*) "Pipe-lined" means that even if it takes several cycles to | ||
| 32 | # complete, next instruction using same functional unit [but not | ||
| 33 | # depending on the result of the current instruction] can start | ||
| 34 | # execution without having to wait for the unit. "Pairable" | ||
| 35 | # means that two [or more] independent instructions can be | ||
| 36 | # issued at the very same time. | ||
| 37 | |||
| 38 | $bits=32; | ||
| 39 | for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } | ||
| 40 | if ($bits==64) { $bias=2047; $frame=192; } | ||
| 41 | else { $bias=0; $frame=112; } | ||
| 42 | |||
| 43 | $output=shift; | ||
| 44 | open STDOUT,">$output"; | ||
| 45 | |||
| 46 | $ctx="%i0"; | ||
| 47 | $inp="%i1"; | ||
| 48 | $len="%i2"; | ||
| 49 | $tmp0="%i3"; | ||
| 50 | $tmp1="%i4"; | ||
| 51 | $tmp2="%i5"; | ||
| 52 | $tmp3="%g5"; | ||
| 53 | |||
| 54 | $base="%g1"; | ||
| 55 | $align="%g4"; | ||
| 56 | $Xfer="%o5"; | ||
| 57 | $nXfer=$tmp3; | ||
| 58 | $Xi="%o7"; | ||
| 59 | |||
| 60 | $A="%l0"; | ||
| 61 | $B="%l1"; | ||
| 62 | $C="%l2"; | ||
| 63 | $D="%l3"; | ||
| 64 | $E="%l4"; | ||
| 65 | @V=($A,$B,$C,$D,$E); | ||
| 66 | |||
| 67 | $Actx="%o0"; | ||
| 68 | $Bctx="%o1"; | ||
| 69 | $Cctx="%o2"; | ||
| 70 | $Dctx="%o3"; | ||
| 71 | $Ectx="%o4"; | ||
| 72 | |||
| 73 | $fmul="%f32"; | ||
| 74 | $VK_00_19="%f34"; | ||
| 75 | $VK_20_39="%f36"; | ||
| 76 | $VK_40_59="%f38"; | ||
| 77 | $VK_60_79="%f40"; | ||
| 78 | @VK=($VK_00_19,$VK_20_39,$VK_40_59,$VK_60_79); | ||
| 79 | @X=("%f0", "%f1", "%f2", "%f3", "%f4", "%f5", "%f6", "%f7", | ||
| 80 | "%f8", "%f9","%f10","%f11","%f12","%f13","%f14","%f15","%f16"); | ||
| 81 | |||
| 82 | # This is reference 2x-parallelized VIS-powered Xupdate procedure. It | ||
| 83 | # covers even K_NN_MM addition... | ||
| 84 | sub Xupdate { | ||
| 85 | my ($i)=@_; | ||
| 86 | my $K=@VK[($i+16)/20]; | ||
| 87 | my $j=($i+16)%16; | ||
| 88 | |||
| 89 | # [ provided that GSR.alignaddr_offset is 5, $fmul contains the | ||
| 90 | # value 0x100ULL<<32|0x100 and the K_NN_MM constants are pre-loaded | ||
| 91 | # into the chosen registers... ] | ||
| 92 | $code.=<<___; | ||
| 93 | fxors @X[($j+13)%16],@X[$j],@X[$j] !-1/-1/-1:X[0]^=X[13] | ||
| 94 | fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14] | ||
| 95 | fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9] | ||
| 96 | fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9] | ||
| 97 | faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24 | ||
| 98 | fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1 | ||
| 99 | fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1 | ||
| 100 | ![fxors %f15,%f2,%f2] | ||
| 101 | for %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp | ||
| 102 | ![fxors %f0,%f3,%f3] !10/17/12:X[0] dependency | ||
| 103 | fpadd32 $K,@X[$j],%f20 | ||
| 104 | std %f20,[$Xfer+`4*$j`] | ||
| 105 | ___ | ||
| 106 | # The slash-delimited numbers are the earliest possible dispatch | ||
| 107 | # cycles for a given instruction, assuming 1-cycle latency for simple | ||
| 108 | # VIS instructions, as on UltraSPARC-I&II, 3-cycle latency, as on | ||
| 109 | # UltraSPARC-III&IV, and 2-cycle latency(*), respectively. Being | ||
| 110 | # 2x-parallelized the procedure is "worth" 5, 8.5 or 6 ticks per SHA1 | ||
| 111 | # round. As [long as] FPU/VIS instructions are perfectly pairable with | ||
| 112 | # IALU ones, the round timing is defined by the maximum between VIS | ||
| 113 | # and IALU timings. The latter varies from round to round and averages | ||
| 114 | # out at 6.25 ticks. This means that USI&II should operate at IALU | ||
| 115 | # rate, while USIII&IV - at VIS rate. This explains why the performance | ||
| 116 | # improvement varies among processors, given that the pure IALU | ||
| 117 | # sha1-sparcv9.pl module exhibits virtually uniform performance of | ||
| 118 | # ~9.3 cycles per SHA1 round. The timings mentioned above are theoretical | ||
| 119 | # lower limits. Real-life performance was measured to be 6.6 cycles | ||
| 120 | # per SHA1 round on USIIi and 8.3 on USIII. The latter is lower than | ||
| 121 | # half-round VIS timing, because there are 16 Xupdate-free rounds, | ||
| 122 | # which "push down" average theoretical timing to 8 cycles... | ||
| 123 | |||
| 124 | # (*) SPARC64-V[II] was originally believed to have 2 cycles VIS | ||
| 125 | # latency. Well, it might have, but it doesn't have dedicated | ||
| 126 | # VIS-unit. Instead, VIS instructions are executed by other | ||
| 127 | # functional units, ones used here - by IALU. This doesn't | ||
| 128 | # improve effective ILP... | ||
| 129 | } | ||
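For orientation, the scalar recurrence being vectorized two lanes at a time is the standard SHA-1 message schedule over a 16-word circular buffer; a minimal Perl sketch (hypothetical helper):

```perl
# X[j] = ROL(X[j+13] ^ X[j+8] ^ X[j+2] ^ X[j], 1), indices mod 16
sub xupdate_scalar {
    my ($X, $j) = @_;    # $X: ref to 16 32-bit words, $j: round index % 16
    my $t = $X->[($j + 13) % 16] ^ $X->[($j + 8) % 16]
          ^ $X->[($j +  2) % 16] ^ $X->[$j];
    return $X->[$j] = (($t << 1) | ($t >> 31)) & 0xffffffff;
}
```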
| 130 | |||
| 131 | # The reference Xupdate procedure is then "strained" over *pairs* of | ||
| 132 | # BODY_NN_MM and kind of modulo-scheduled with respect to X[n]^=X[n+13] | ||
| 133 | # and K_NN_MM addition. It's "running" 15 rounds ahead, which leaves | ||
| 134 | # plenty of room to amortize the read-after-write hazard, as well as | ||
| 135 | # to fetch and align input for the next spin. The VIS instructions are | ||
| 136 | # scheduled for latency of 2 cycles, because there are not enough IALU | ||
| 137 | # instructions to schedule for latency of 3, while scheduling for 1 | ||
| 138 | # would give no gain on USI&II anyway. | ||
| 139 | |||
| 140 | sub BODY_00_19 { | ||
| 141 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 142 | my $j=$i&~1; | ||
| 143 | my $k=($j+16+2)%16; # ahead reference | ||
| 144 | my $l=($j+16-2)%16; # behind reference | ||
| 145 | my $K=@VK[($j+16-2)/20]; | ||
| 146 | |||
| 147 | $j=($j+16)%16; | ||
| 148 | |||
| 149 | $code.=<<___ if (!($i&1)); | ||
| 150 | sll $a,5,$tmp0 !! $i | ||
| 151 | and $c,$b,$tmp3 | ||
| 152 | ld [$Xfer+`4*($i%16)`],$Xi | ||
| 153 | fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14] | ||
| 154 | srl $a,27,$tmp1 | ||
| 155 | add $tmp0,$e,$e | ||
| 156 | fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9] | ||
| 157 | sll $b,30,$tmp2 | ||
| 158 | add $tmp1,$e,$e | ||
| 159 | andn $d,$b,$tmp1 | ||
| 160 | add $Xi,$e,$e | ||
| 161 | fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9] | ||
| 162 | srl $b,2,$b | ||
| 163 | or $tmp1,$tmp3,$tmp1 | ||
| 164 | or $tmp2,$b,$b | ||
| 165 | add $tmp1,$e,$e | ||
| 166 | faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24 | ||
| 167 | ___ | ||
| 168 | $code.=<<___ if ($i&1); | ||
| 169 | sll $a,5,$tmp0 !! $i | ||
| 170 | and $c,$b,$tmp3 | ||
| 171 | ld [$Xfer+`4*($i%16)`],$Xi | ||
| 172 | fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1 | ||
| 173 | srl $a,27,$tmp1 | ||
| 174 | add $tmp0,$e,$e | ||
| 175 | fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1 | ||
| 176 | sll $b,30,$tmp2 | ||
| 177 | add $tmp1,$e,$e | ||
| 178 | fpadd32 $K,@X[$l],%f20 ! | ||
| 179 | andn $d,$b,$tmp1 | ||
| 180 | add $Xi,$e,$e | ||
| 181 | fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13] | ||
| 182 | srl $b,2,$b | ||
| 183 | or $tmp1,$tmp3,$tmp1 | ||
| 184 | fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp | ||
| 185 | or $tmp2,$b,$b | ||
| 186 | add $tmp1,$e,$e | ||
| 187 | ___ | ||
| 188 | $code.=<<___ if ($i&1 && $i>=2); | ||
| 189 | std %f20,[$Xfer+`4*$l`] ! | ||
| 190 | ___ | ||
| 191 | } | ||
| 192 | |||
| 193 | sub BODY_20_39 { | ||
| 194 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 195 | my $j=$i&~1; | ||
| 196 | my $k=($j+16+2)%16; # ahead reference | ||
| 197 | my $l=($j+16-2)%16; # behind reference | ||
| 198 | my $K=@VK[($j+16-2)/20]; | ||
| 199 | |||
| 200 | $j=($j+16)%16; | ||
| 201 | |||
| 202 | $code.=<<___ if (!($i&1) && $i<64); | ||
| 203 | sll $a,5,$tmp0 !! $i | ||
| 204 | ld [$Xfer+`4*($i%16)`],$Xi | ||
| 205 | fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14] | ||
| 206 | srl $a,27,$tmp1 | ||
| 207 | add $tmp0,$e,$e | ||
| 208 | fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9] | ||
| 209 | xor $c,$b,$tmp0 | ||
| 210 | add $tmp1,$e,$e | ||
| 211 | sll $b,30,$tmp2 | ||
| 212 | xor $d,$tmp0,$tmp1 | ||
| 213 | fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9] | ||
| 214 | srl $b,2,$b | ||
| 215 | add $tmp1,$e,$e | ||
| 216 | or $tmp2,$b,$b | ||
| 217 | add $Xi,$e,$e | ||
| 218 | faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24 | ||
| 219 | ___ | ||
| 220 | $code.=<<___ if ($i&1 && $i<64); | ||
| 221 | sll $a,5,$tmp0 !! $i | ||
| 222 | ld [$Xfer+`4*($i%16)`],$Xi | ||
| 223 | fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1 | ||
| 224 | srl $a,27,$tmp1 | ||
| 225 | add $tmp0,$e,$e | ||
| 226 | fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1 | ||
| 227 | xor $c,$b,$tmp0 | ||
| 228 | add $tmp1,$e,$e | ||
| 229 | fpadd32 $K,@X[$l],%f20 ! | ||
| 230 | sll $b,30,$tmp2 | ||
| 231 | xor $d,$tmp0,$tmp1 | ||
| 232 | fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13] | ||
| 233 | srl $b,2,$b | ||
| 234 | add $tmp1,$e,$e | ||
| 235 | fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp | ||
| 236 | or $tmp2,$b,$b | ||
| 237 | add $Xi,$e,$e | ||
| 238 | std %f20,[$Xfer+`4*$l`] ! | ||
| 239 | ___ | ||
| 240 | $code.=<<___ if ($i==64); | ||
| 241 | sll $a,5,$tmp0 !! $i | ||
| 242 | ld [$Xfer+`4*($i%16)`],$Xi | ||
| 243 | fpadd32 $K,@X[$l],%f20 | ||
| 244 | srl $a,27,$tmp1 | ||
| 245 | add $tmp0,$e,$e | ||
| 246 | xor $c,$b,$tmp0 | ||
| 247 | add $tmp1,$e,$e | ||
| 248 | sll $b,30,$tmp2 | ||
| 249 | xor $d,$tmp0,$tmp1 | ||
| 250 | std %f20,[$Xfer+`4*$l`] | ||
| 251 | srl $b,2,$b | ||
| 252 | add $tmp1,$e,$e | ||
| 253 | or $tmp2,$b,$b | ||
| 254 | add $Xi,$e,$e | ||
| 255 | ___ | ||
| 256 | $code.=<<___ if ($i>64); | ||
| 257 | sll $a,5,$tmp0 !! $i | ||
| 258 | ld [$Xfer+`4*($i%16)`],$Xi | ||
| 259 | srl $a,27,$tmp1 | ||
| 260 | add $tmp0,$e,$e | ||
| 261 | xor $c,$b,$tmp0 | ||
| 262 | add $tmp1,$e,$e | ||
| 263 | sll $b,30,$tmp2 | ||
| 264 | xor $d,$tmp0,$tmp1 | ||
| 265 | srl $b,2,$b | ||
| 266 | add $tmp1,$e,$e | ||
| 267 | or $tmp2,$b,$b | ||
| 268 | add $Xi,$e,$e | ||
| 269 | ___ | ||
| 270 | } | ||
| 271 | |||
| 272 | sub BODY_40_59 { | ||
| 273 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 274 | my $j=$i&~1; | ||
| 275 | my $k=($j+16+2)%16; # ahead reference | ||
| 276 | my $l=($j+16-2)%16; # behind reference | ||
| 277 | my $K=@VK[($j+16-2)/20]; | ||
| 278 | |||
| 279 | $j=($j+16)%16; | ||
| 280 | |||
| 281 | $code.=<<___ if (!($i&1)); | ||
| 282 | sll $a,5,$tmp0 !! $i | ||
| 283 | ld [$Xfer+`4*($i%16)`],$Xi | ||
| 284 | fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14] | ||
| 285 | srl $a,27,$tmp1 | ||
| 286 | add $tmp0,$e,$e | ||
| 287 | fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9] | ||
| 288 | and $c,$b,$tmp0 | ||
| 289 | add $tmp1,$e,$e | ||
| 290 | sll $b,30,$tmp2 | ||
| 291 | or $c,$b,$tmp1 | ||
| 292 | fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9] | ||
| 293 | srl $b,2,$b | ||
| 294 | and $d,$tmp1,$tmp1 | ||
| 295 | add $Xi,$e,$e | ||
| 296 | or $tmp1,$tmp0,$tmp1 | ||
| 297 | faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24 | ||
| 298 | or $tmp2,$b,$b | ||
| 299 | add $tmp1,$e,$e | ||
| 300 | fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1 | ||
| 301 | ___ | ||
| 302 | $code.=<<___ if ($i&1); | ||
| 303 | sll $a,5,$tmp0 !! $i | ||
| 304 | ld [$Xfer+`4*($i%16)`],$Xi | ||
| 305 | srl $a,27,$tmp1 | ||
| 306 | add $tmp0,$e,$e | ||
| 307 | fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1 | ||
| 308 | and $c,$b,$tmp0 | ||
| 309 | add $tmp1,$e,$e | ||
| 310 | fpadd32 $K,@X[$l],%f20 ! | ||
| 311 | sll $b,30,$tmp2 | ||
| 312 | or $c,$b,$tmp1 | ||
| 313 | fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13] | ||
| 314 | srl $b,2,$b | ||
| 315 | and $d,$tmp1,$tmp1 | ||
| 316 | fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp | ||
| 317 | add $Xi,$e,$e | ||
| 318 | or $tmp1,$tmp0,$tmp1 | ||
| 319 | or $tmp2,$b,$b | ||
| 320 | add $tmp1,$e,$e | ||
| 321 | std %f20,[$Xfer+`4*$l`] ! | ||
| 322 | ___ | ||
| 323 | } | ||
| 324 | |||
| 325 | # If there is more data to process, then we pre-fetch the data for | ||
| 326 | # the next iteration in the last ten rounds... | ||
| 327 | sub BODY_70_79 { | ||
| 328 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
| 329 | my $j=$i&~1; | ||
| 330 | my $m=($i%8)*2; | ||
| 331 | |||
| 332 | $j=($j+16)%16; | ||
| 333 | |||
| 334 | $code.=<<___ if ($i==70); | ||
| 335 | sll $a,5,$tmp0 !! $i | ||
| 336 | ld [$Xfer+`4*($i%16)`],$Xi | ||
| 337 | srl $a,27,$tmp1 | ||
| 338 | add $tmp0,$e,$e | ||
| 339 | ldd [$inp+64],@X[0] | ||
| 340 | xor $c,$b,$tmp0 | ||
| 341 | add $tmp1,$e,$e | ||
| 342 | sll $b,30,$tmp2 | ||
| 343 | xor $d,$tmp0,$tmp1 | ||
| 344 | srl $b,2,$b | ||
| 345 | add $tmp1,$e,$e | ||
| 346 | or $tmp2,$b,$b | ||
| 347 | add $Xi,$e,$e | ||
| 348 | |||
| 349 | and $inp,-64,$nXfer | ||
| 350 | inc 64,$inp | ||
| 351 | and $nXfer,255,$nXfer | ||
| 352 | alignaddr %g0,$align,%g0 | ||
| 353 | add $base,$nXfer,$nXfer | ||
| 354 | ___ | ||
| 355 | $code.=<<___ if ($i==71); | ||
| 356 | sll $a,5,$tmp0 !! $i | ||
| 357 | ld [$Xfer+`4*($i%16)`],$Xi | ||
| 358 | srl $a,27,$tmp1 | ||
| 359 | add $tmp0,$e,$e | ||
| 360 | xor $c,$b,$tmp0 | ||
| 361 | add $tmp1,$e,$e | ||
| 362 | sll $b,30,$tmp2 | ||
| 363 | xor $d,$tmp0,$tmp1 | ||
| 364 | srl $b,2,$b | ||
| 365 | add $tmp1,$e,$e | ||
| 366 | or $tmp2,$b,$b | ||
| 367 | add $Xi,$e,$e | ||
| 368 | ___ | ||
| 369 | $code.=<<___ if ($i>=72); | ||
| 370 | faligndata @X[$m],@X[$m+2],@X[$m] | ||
| 371 | sll $a,5,$tmp0 !! $i | ||
| 372 | ld [$Xfer+`4*($i%16)`],$Xi | ||
| 373 | srl $a,27,$tmp1 | ||
| 374 | add $tmp0,$e,$e | ||
| 375 | xor $c,$b,$tmp0 | ||
| 376 | add $tmp1,$e,$e | ||
| 377 | fpadd32 $VK_00_19,@X[$m],%f20 | ||
| 378 | sll $b,30,$tmp2 | ||
| 379 | xor $d,$tmp0,$tmp1 | ||
| 380 | srl $b,2,$b | ||
| 381 | add $tmp1,$e,$e | ||
| 382 | or $tmp2,$b,$b | ||
| 383 | add $Xi,$e,$e | ||
| 384 | ___ | ||
| 385 | $code.=<<___ if ($i<77); | ||
| 386 | ldd [$inp+`8*($i+1-70)`],@X[2*($i+1-70)] | ||
| 387 | ___ | ||
| 388 | $code.=<<___ if ($i==77); # redundant if $inp was aligned | ||
| 389 | add $align,63,$tmp0 | ||
| 390 | and $tmp0,-8,$tmp0 | ||
| 391 | ldd [$inp+$tmp0],@X[16] | ||
| 392 | ___ | ||
| 393 | $code.=<<___ if ($i>=72); | ||
| 394 | std %f20,[$nXfer+`4*$m`] | ||
| 395 | ___ | ||
| 396 | } | ||
| 397 | |||
| 398 | $code.=<<___; | ||
| 399 | .section ".rodata",#alloc | ||
| 400 | |||
| 401 | .align 64 | ||
| 402 | vis_const: | ||
| 403 | .long 0x5a827999,0x5a827999 ! K_00_19 | ||
| 404 | .long 0x6ed9eba1,0x6ed9eba1 ! K_20_39 | ||
| 405 | .long 0x8f1bbcdc,0x8f1bbcdc ! K_40_59 | ||
| 406 | .long 0xca62c1d6,0xca62c1d6 ! K_60_79 | ||
| 407 | .long 0x00000100,0x00000100 | ||
| 408 | .align 64 | ||
| 409 | .type vis_const,#object | ||
| 410 | .size vis_const,(.-vis_const) | ||
| 411 | |||
| 412 | .section ".text",#alloc,#execinstr | ||
| 413 | .globl sha1_block_data_order | ||
| 414 | sha1_block_data_order: | ||
| 415 | save %sp,-$frame,%sp | ||
| 416 | add %fp,$bias-256,$base | ||
| 417 | |||
| 418 | #ifdef __PIC__ | ||
| 419 | sethi %hi(_GLOBAL_OFFSET_TABLE_-4), %o5 | ||
| 420 | rd %pc, %o4 | ||
| 421 | or %o5, %lo(_GLOBAL_OFFSET_TABLE_+4), %o5 | ||
| 422 | add %o5, %o4, %o5 | ||
| 423 | set vis_const, %o4 | ||
| 424 | ldx [%o4+%o5], %o4 | ||
| 425 | #else | ||
| 426 | set vis_const, %o4 | ||
| 427 | #endif | ||
| 428 | |||
| 429 | ldd [%o4+0],$VK_00_19 ! vis_const address left in %o4 above | ||
| 430 | ldd [%o4+8],$VK_20_39 | ||
| 431 | ldd [%o4+16],$VK_40_59 | ||
| 432 | ldd [%o4+24],$VK_60_79 | ||
| 433 | ldd [%o4+32],$fmul | ||
| 434 | |||
| 435 | ld [$ctx+0],$Actx | ||
| 436 | and $base,-256,$base | ||
| 437 | ld [$ctx+4],$Bctx | ||
| 438 | sub $base,$bias+$frame,%sp | ||
| 439 | ld [$ctx+8],$Cctx | ||
| 440 | and $inp,7,$align | ||
| 441 | ld [$ctx+12],$Dctx | ||
| 442 | and $inp,-8,$inp | ||
| 443 | ld [$ctx+16],$Ectx | ||
| 444 | |||
| 445 | ! X[16] is maintained in FP register bank | ||
| 446 | alignaddr %g0,$align,%g0 | ||
| 447 | ldd [$inp+0],@X[0] | ||
| 448 | sub $inp,-64,$Xfer | ||
| 449 | ldd [$inp+8],@X[2] | ||
| 450 | and $Xfer,-64,$Xfer | ||
| 451 | ldd [$inp+16],@X[4] | ||
| 452 | and $Xfer,255,$Xfer | ||
| 453 | ldd [$inp+24],@X[6] | ||
| 454 | add $base,$Xfer,$Xfer | ||
| 455 | ldd [$inp+32],@X[8] | ||
| 456 | ldd [$inp+40],@X[10] | ||
| 457 | ldd [$inp+48],@X[12] | ||
| 458 | brz,pt $align,.Laligned | ||
| 459 | ldd [$inp+56],@X[14] | ||
| 460 | |||
| 461 | ldd [$inp+64],@X[16] | ||
| 462 | faligndata @X[0],@X[2],@X[0] | ||
| 463 | faligndata @X[2],@X[4],@X[2] | ||
| 464 | faligndata @X[4],@X[6],@X[4] | ||
| 465 | faligndata @X[6],@X[8],@X[6] | ||
| 466 | faligndata @X[8],@X[10],@X[8] | ||
| 467 | faligndata @X[10],@X[12],@X[10] | ||
| 468 | faligndata @X[12],@X[14],@X[12] | ||
| 469 | faligndata @X[14],@X[16],@X[14] | ||
| 470 | |||
| 471 | .Laligned: | ||
| 472 | mov 5,$tmp0 | ||
| 473 | dec 1,$len | ||
| 474 | alignaddr %g0,$tmp0,%g0 | ||
| 475 | fpadd32 $VK_00_19,@X[0],%f16 | ||
| 476 | fpadd32 $VK_00_19,@X[2],%f18 | ||
| 477 | fpadd32 $VK_00_19,@X[4],%f20 | ||
| 478 | fpadd32 $VK_00_19,@X[6],%f22 | ||
| 479 | fpadd32 $VK_00_19,@X[8],%f24 | ||
| 480 | fpadd32 $VK_00_19,@X[10],%f26 | ||
| 481 | fpadd32 $VK_00_19,@X[12],%f28 | ||
| 482 | fpadd32 $VK_00_19,@X[14],%f30 | ||
| 483 | std %f16,[$Xfer+0] | ||
| 484 | mov $Actx,$A | ||
| 485 | std %f18,[$Xfer+8] | ||
| 486 | mov $Bctx,$B | ||
| 487 | std %f20,[$Xfer+16] | ||
| 488 | mov $Cctx,$C | ||
| 489 | std %f22,[$Xfer+24] | ||
| 490 | mov $Dctx,$D | ||
| 491 | std %f24,[$Xfer+32] | ||
| 492 | mov $Ectx,$E | ||
| 493 | std %f26,[$Xfer+40] | ||
| 494 | fxors @X[13],@X[0],@X[0] | ||
| 495 | std %f28,[$Xfer+48] | ||
| 496 | ba .Loop | ||
| 497 | std %f30,[$Xfer+56] | ||
| 498 | .align 32 | ||
| 499 | .Loop: | ||
| 500 | ___ | ||
| 501 | for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } | ||
| 502 | for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
| 503 | for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } | ||
| 504 | for (;$i<70;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
| 505 | $code.=<<___; | ||
| 506 | tst $len | ||
| 507 | bz,pn `$bits==32?"%icc":"%xcc"`,.Ltail | ||
| 508 | nop | ||
| 509 | ___ | ||
| 510 | for (;$i<80;$i++) { &BODY_70_79($i,@V); unshift(@V,pop(@V)); } | ||
| 511 | $code.=<<___; | ||
| 512 | add $A,$Actx,$Actx | ||
| 513 | add $B,$Bctx,$Bctx | ||
| 514 | add $C,$Cctx,$Cctx | ||
| 515 | add $D,$Dctx,$Dctx | ||
| 516 | add $E,$Ectx,$Ectx | ||
| 517 | mov 5,$tmp0 | ||
| 518 | fxors @X[13],@X[0],@X[0] | ||
| 519 | mov $Actx,$A | ||
| 520 | mov $Bctx,$B | ||
| 521 | mov $Cctx,$C | ||
| 522 | mov $Dctx,$D | ||
| 523 | mov $Ectx,$E | ||
| 524 | alignaddr %g0,$tmp0,%g0 | ||
| 525 | dec 1,$len | ||
| 526 | ba .Loop | ||
| 527 | mov $nXfer,$Xfer | ||
| 528 | |||
| 529 | .align 32 | ||
| 530 | .Ltail: | ||
| 531 | ___ | ||
| 532 | for($i=70;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } | ||
| 533 | $code.=<<___; | ||
| 534 | add $A,$Actx,$Actx | ||
| 535 | add $B,$Bctx,$Bctx | ||
| 536 | add $C,$Cctx,$Cctx | ||
| 537 | add $D,$Dctx,$Dctx | ||
| 538 | add $E,$Ectx,$Ectx | ||
| 539 | |||
| 540 | st $Actx,[$ctx+0] | ||
| 541 | st $Bctx,[$ctx+4] | ||
| 542 | st $Cctx,[$ctx+8] | ||
| 543 | st $Dctx,[$ctx+12] | ||
| 544 | st $Ectx,[$ctx+16] | ||
| 545 | |||
| 546 | ret | ||
| 547 | restore | ||
| 548 | .type sha1_block_data_order,#function | ||
| 549 | .size sha1_block_data_order,(.-sha1_block_data_order) | ||
| 550 | ___ | ||
| 551 | |||
| 552 | # The purpose of these subroutines is to explicitly encode VIS | ||
| 553 | # instructions, so that the module can be compiled without specifying | ||
| 554 | # VIS extensions on the compiler command line (-xarch=v9 vs. -xarch=v9a). | ||
| 555 | # The idea is to preserve the option of producing a "universal" binary | ||
| 556 | # and letting the programmer detect at run-time whether the CPU has VIS. | ||
| 557 | sub unvis { | ||
| 558 | my ($mnemonic,$rs1,$rs2,$rd)=@_; | ||
| 559 | my ($ref,$opf); | ||
| 560 | my %visopf = ( "fmul8ulx16" => 0x037, | ||
| 561 | "faligndata" => 0x048, | ||
| 562 | "fpadd32" => 0x052, | ||
| 563 | "fxor" => 0x06c, | ||
| 564 | "fxors" => 0x06d ); | ||
| 565 | |||
| 566 | $ref = "$mnemonic\t$rs1,$rs2,$rd"; | ||
| 567 | |||
| 568 | if ($opf=$visopf{$mnemonic}) { | ||
| 569 | foreach ($rs1,$rs2,$rd) { | ||
| 570 | return $ref if (!/%f([0-9]{1,2})/); | ||
| 571 | $_=$1; | ||
| 572 | if ($1>=32) { | ||
| 573 | return $ref if ($1&1); | ||
| 574 | # re-encode for upper double register addressing | ||
| 575 | $_=($1|$1>>5)&31; | ||
| 576 | } | ||
| 577 | } | ||
| 578 | |||
| 579 | return sprintf ".word\t0x%08x !%s", | ||
| 580 | 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, | ||
| 581 | $ref; | ||
| 582 | } else { | ||
| 583 | return $ref; | ||
| 584 | } | ||
| 585 | } | ||
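By way of illustration, calling the helper directly on a VIS-capable operand triple shows the hand-assembled encoding it produces (per the formula above, 0x81b00000|rd<<25|rs1<<14|opf<<5|rs2, with faligndata's opf being 0x048):

```perl
# prints: .word  0x89b00902 !faligndata  %f0,%f2,%f4
print unvis("faligndata", "%f0", "%f2", "%f4"), "\n";
```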
| 586 | sub unalignaddr { | ||
| 587 | my ($mnemonic,$rs1,$rs2,$rd)=@_; | ||
| 588 | my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); | ||
| 589 | my $ref="$mnemonic\t$rs1,$rs2,$rd"; | ||
| 590 | |||
| 591 | foreach ($rs1,$rs2,$rd) { | ||
| 592 | if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; } | ||
| 593 | else { return $ref; } | ||
| 594 | } | ||
| 595 | return sprintf ".word\t0x%08x !%s", | ||
| 596 | 0x81b00300|$rd<<25|$rs1<<14|$rs2, | ||
| 597 | $ref; | ||
| 598 | } | ||
| 599 | |||
| 600 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 601 | $code =~ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),(%f[0-9]{1,2}),(%f[0-9]{1,2})/ | ||
| 602 | &unvis($1,$2,$3,$4) | ||
| 603 | /gem; | ||
| 604 | $code =~ s/\b(alignaddr)\s+(%[goli][0-7]),(%[goli][0-7]),(%[goli][0-7])/ | ||
| 605 | &unalignaddr($1,$2,$3,$4) | ||
| 606 | /gem; | ||
| 607 | print $code; | ||
| 608 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/sha/asm/sha1-thumb.pl b/src/lib/libcrypto/sha/asm/sha1-thumb.pl deleted file mode 100644 index 553e9cedb5..0000000000 --- a/src/lib/libcrypto/sha/asm/sha1-thumb.pl +++ /dev/null | |||
| @@ -1,259 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # sha1_block for Thumb. | ||
| 11 | # | ||
| 12 | # January 2007. | ||
| 13 | # | ||
| 14 | # The code is not of direct interest to OpenSSL because of its low | ||
| 15 | # performance. Its purpose is to establish a _size_ benchmark. A pretty | ||
| 16 | # useless one, I must say, because the 30% (or 88 bytes) larger ARMv4 | ||
| 17 | # code [available on demand] is almost _twice_ as fast. It should also | ||
| 18 | # be noted that in-lining .Lcommon and .Lrotate improves performance by | ||
| 19 | # over 40%, while code size increases by only 10% or 32 bytes. But once | ||
| 20 | # again, the goal was to establish a _size_ benchmark, not performance. | ||
| 21 | |||
| 22 | $output=shift; | ||
| 23 | open STDOUT,">$output"; | ||
| 24 | |||
| 25 | $inline=0; | ||
| 26 | #$cheat_on_binutils=1; | ||
| 27 | |||
| 28 | $t0="r0"; | ||
| 29 | $t1="r1"; | ||
| 30 | $t2="r2"; | ||
| 31 | $a="r3"; | ||
| 32 | $b="r4"; | ||
| 33 | $c="r5"; | ||
| 34 | $d="r6"; | ||
| 35 | $e="r7"; | ||
| 36 | $K="r8"; # "upper" registers can be used in add/sub and mov insns | ||
| 37 | $ctx="r9"; | ||
| 38 | $inp="r10"; | ||
| 39 | $len="r11"; | ||
| 40 | $Xi="r12"; | ||
| 41 | |||
| 42 | sub common { | ||
| 43 | <<___; | ||
| 44 | sub $t0,#4 | ||
| 45 | ldr $t1,[$t0] | ||
| 46 | add $e,$K @ E+=K_xx_xx | ||
| 47 | lsl $t2,$a,#5 | ||
| 48 | add $t2,$e | ||
| 49 | lsr $e,$a,#27 | ||
| 50 | add $t2,$e @ E+=ROR(A,27) | ||
| 51 | add $t2,$t1 @ E+=X[i] | ||
| 52 | ___ | ||
| 53 | } | ||
| 54 | sub rotate { | ||
| 55 | <<___; | ||
| 56 | mov $e,$d @ E=D | ||
| 57 | mov $d,$c @ D=C | ||
| 58 | lsl $c,$b,#30 | ||
| 59 | lsr $b,$b,#2 | ||
| 60 | orr $c,$b @ C=ROR(B,2) | ||
| 61 | mov $b,$a @ B=A | ||
| 62 | add $a,$t2,$t1 @ A=E+F_xx_xx(B,C,D) | ||
| 63 | ___ | ||
| 64 | } | ||
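Taken together, common() and rotate() perform one scalar SHA-1 round. A compact Perl restatement (hypothetical helper; the round-function value $f and constant $k are passed in, as the BODY_* generators compute them between the two calls):

```perl
sub sha1_round {
    my ($a, $b, $c, $d, $e, $f, $k, $w) = @_;          # $f = F_xx_xx(B,C,D)
    my $t = (((($a << 5) | ($a >> 27)) & 0xffffffff)   # E += ROL(A,5)
             + $f + $e + $k + $w) & 0xffffffff;        # E += K, F, X[i]
    my $c2 = (($b << 30) | ($b >> 2)) & 0xffffffff;    # C = ROR(B,2)
    return ($t, $a, $c2, $c, $d);                      # new (A,B,C,D,E)
}
```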
| 65 | |||
| 66 | sub BODY_00_19 { | ||
| 67 | $code.=$inline?&common():"\tbl .Lcommon\n"; | ||
| 68 | $code.=<<___; | ||
| 69 | mov $t1,$c | ||
| 70 | eor $t1,$d | ||
| 71 | and $t1,$b | ||
| 72 | eor $t1,$d @ F_00_19(B,C,D) | ||
| 73 | ___ | ||
| 74 | $code.=$inline?&rotate():"\tbl .Lrotate\n"; | ||
| 75 | } | ||
| 76 | |||
| 77 | sub BODY_20_39 { | ||
| 78 | $code.=$inline?&common():"\tbl .Lcommon\n"; | ||
| 79 | $code.=<<___; | ||
| 80 | mov $t1,$b | ||
| 81 | eor $t1,$c | ||
| 82 | eor $t1,$d @ F_20_39(B,C,D) | ||
| 83 | ___ | ||
| 84 | $code.=$inline?&rotate():"\tbl .Lrotate\n"; | ||
| 85 | } | ||
| 86 | |||
| 87 | sub BODY_40_59 { | ||
| 88 | $code.=$inline?&common():"\tbl .Lcommon\n"; | ||
| 89 | $code.=<<___; | ||
| 90 | mov $t1,$b | ||
| 91 | and $t1,$c | ||
| 92 | mov $e,$b | ||
| 93 | orr $e,$c | ||
| 94 | and $e,$d | ||
| 95 | orr $t1,$e @ F_40_59(B,C,D) | ||
| 96 | ___ | ||
| 97 | $code.=$inline?&rotate():"\tbl .Lrotate\n"; | ||
| 98 | } | ||
| 99 | |||
| 100 | $code=<<___; | ||
| 101 | .text | ||
| 102 | .code 16 | ||
| 103 | |||
| 104 | .global sha1_block_data_order | ||
| 105 | .type sha1_block_data_order,%function | ||
| 106 | |||
| 107 | .align 2 | ||
| 108 | sha1_block_data_order: | ||
| 109 | ___ | ||
| 110 | if ($cheat_on_binutils) { | ||
| 111 | $code.=<<___; | ||
| 112 | .code 32 | ||
| 113 | add r3,pc,#1 | ||
| 114 | bx r3 @ switch to Thumb ISA | ||
| 115 | .code 16 | ||
| 116 | ___ | ||
| 117 | } | ||
| 118 | $code.=<<___; | ||
| 119 | push {r4-r7} | ||
| 120 | mov r3,r8 | ||
| 121 | mov r4,r9 | ||
| 122 | mov r5,r10 | ||
| 123 | mov r6,r11 | ||
| 124 | mov r7,r12 | ||
| 125 | push {r3-r7,lr} | ||
| 126 | lsl r2,#6 | ||
| 127 | mov $ctx,r0 @ save context | ||
| 128 | mov $inp,r1 @ save inp | ||
| 129 | mov $len,r2 @ save len | ||
| 130 | add $len,$inp @ $len to point at inp end | ||
| 131 | |||
| 132 | .Lloop: | ||
| 133 | mov $Xi,sp | ||
| 134 | mov $t2,sp | ||
| 135 | sub $t2,#16*4 @ [3] | ||
| 136 | .LXload: | ||
| 137 | ldrb $a,[$t1,#0] @ $t1 is r1 and holds inp | ||
| 138 | ldrb $b,[$t1,#1] | ||
| 139 | ldrb $c,[$t1,#2] | ||
| 140 | ldrb $d,[$t1,#3] | ||
| 141 | lsl $a,#24 | ||
| 142 | lsl $b,#16 | ||
| 143 | lsl $c,#8 | ||
| 144 | orr $a,$b | ||
| 145 | orr $a,$c | ||
| 146 | orr $a,$d | ||
| 147 | add $t1,#4 | ||
| 148 | push {$a} | ||
| 149 | cmp sp,$t2 | ||
| 150 | bne .LXload @ [+14*16] | ||
| 151 | |||
| 152 | mov $inp,$t1 @ update $inp | ||
| 153 | sub $t2,#32*4 | ||
| 154 | sub $t2,#32*4 | ||
| 155 | mov $e,#31 @ [+4] | ||
| 156 | .LXupdate: | ||
| 157 | ldr $a,[sp,#15*4] | ||
| 158 | ldr $b,[sp,#13*4] | ||
| 159 | ldr $c,[sp,#7*4] | ||
| 160 | ldr $d,[sp,#2*4] | ||
| 161 | eor $a,$b | ||
| 162 | eor $a,$c | ||
| 163 | eor $a,$d | ||
| 164 | ror $a,$e | ||
| 165 | push {$a} | ||
| 166 | cmp sp,$t2 | ||
| 167 | bne .LXupdate @ [+(11+1)*64] | ||
| 168 | |||
| 169 | ldmia $t0!,{$a,$b,$c,$d,$e} @ $t0 is r0 and holds ctx | ||
| 170 | mov $t0,$Xi | ||
| 171 | |||
| 172 | ldr $t2,.LK_00_19 | ||
| 173 | mov $t1,$t0 | ||
| 174 | sub $t1,#20*4 | ||
| 175 | mov $Xi,$t1 | ||
| 176 | mov $K,$t2 @ [+7+4] | ||
| 177 | .L_00_19: | ||
| 178 | ___ | ||
| 179 | &BODY_00_19(); | ||
| 180 | $code.=<<___; | ||
| 181 | cmp $Xi,$t0 | ||
| 182 | bne .L_00_19 @ [+(2+9+4+2+8+2)*20] | ||
| 183 | |||
| 184 | ldr $t2,.LK_20_39 | ||
| 185 | mov $t1,$t0 | ||
| 186 | sub $t1,#20*4 | ||
| 187 | mov $Xi,$t1 | ||
| 188 | mov $K,$t2 @ [+5] | ||
| 189 | .L_20_39_or_60_79: | ||
| 190 | ___ | ||
| 191 | &BODY_20_39(); | ||
| 192 | $code.=<<___; | ||
| 193 | cmp $Xi,$t0 | ||
| 194 | bne .L_20_39_or_60_79 @ [+(2+9+3+2+8+2)*20*2] | ||
| 195 | cmp sp,$t0 | ||
| 196 | beq .Ldone @ [+2] | ||
| 197 | |||
| 198 | ldr $t2,.LK_40_59 | ||
| 199 | mov $t1,$t0 | ||
| 200 | sub $t1,#20*4 | ||
| 201 | mov $Xi,$t1 | ||
| 202 | mov $K,$t2 @ [+5] | ||
| 203 | .L_40_59: | ||
| 204 | ___ | ||
| 205 | &BODY_40_59(); | ||
| 206 | $code.=<<___; | ||
| 207 | cmp $Xi,$t0 | ||
| 208 | bne .L_40_59 @ [+(2+9+6+2+8+2)*20] | ||
| 209 | |||
| 210 | ldr $t2,.LK_60_79 | ||
| 211 | mov $Xi,sp | ||
| 212 | mov $K,$t2 | ||
| 213 | b .L_20_39_or_60_79 @ [+4] | ||
| 214 | .Ldone: | ||
| 215 | mov $t0,$ctx | ||
| 216 | ldr $t1,[$t0,#0] | ||
| 217 | ldr $t2,[$t0,#4] | ||
| 218 | add $a,$t1 | ||
| 219 | ldr $t1,[$t0,#8] | ||
| 220 | add $b,$t2 | ||
| 221 | ldr $t2,[$t0,#12] | ||
| 222 | add $c,$t1 | ||
| 223 | ldr $t1,[$t0,#16] | ||
| 224 | add $d,$t2 | ||
| 225 | add $e,$t1 | ||
| 226 | stmia $t0!,{$a,$b,$c,$d,$e} @ [+20] | ||
| 227 | |||
| 228 | add sp,#80*4 @ deallocate stack frame | ||
| 229 | mov $t0,$ctx @ restore ctx | ||
| 230 | mov $t1,$inp @ restore inp | ||
| 231 | cmp $t1,$len | ||
| 232 | beq .Lexit | ||
| 233 | b .Lloop @ [+6] total 3212 cycles | ||
| 234 | .Lexit: | ||
| 235 | pop {r2-r7} | ||
| 236 | mov r8,r2 | ||
| 237 | mov r9,r3 | ||
| 238 | mov r10,r4 | ||
| 239 | mov r11,r5 | ||
| 240 | mov r12,r6 | ||
| 241 | mov lr,r7 | ||
| 242 | pop {r4-r7} | ||
| 243 | bx lr | ||
| 244 | .align 2 | ||
| 245 | ___ | ||
| 246 | $code.=".Lcommon:\n".&common()."\tmov pc,lr\n" if (!$inline); | ||
| 247 | $code.=".Lrotate:\n".&rotate()."\tmov pc,lr\n" if (!$inline); | ||
| 248 | $code.=<<___; | ||
| 249 | .align 2 | ||
| 250 | .LK_00_19: .word 0x5a827999 | ||
| 251 | .LK_20_39: .word 0x6ed9eba1 | ||
| 252 | .LK_40_59: .word 0x8f1bbcdc | ||
| 253 | .LK_60_79: .word 0xca62c1d6 | ||
| 254 | .size sha1_block_data_order,.-sha1_block_data_order | ||
| 255 | .asciz "SHA1 block transform for Thumb, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 256 | ___ | ||
| 257 | |||
| 258 | print $code; | ||
| 259 | close STDOUT; # enforce flush | ||
