diff options
| author | djm <> | 2009-04-06 06:30:10 +0000 |
|---|---|---|
| committer | djm <> | 2009-04-06 06:30:10 +0000 |
| commit | 2b6e09b39ef1d803b50ee024a06d1c250fde442d (patch) | |
| tree | f116109c359f26a2b149bbc752be39c16099bae1 /src/lib/libcrypto/bn | |
| parent | a0fdc9ec41594852f67ec77dfad9cb06bacc4186 (diff) | |
| download | openbsd-2b6e09b39ef1d803b50ee024a06d1c250fde442d.tar.gz openbsd-2b6e09b39ef1d803b50ee024a06d1c250fde442d.tar.bz2 openbsd-2b6e09b39ef1d803b50ee024a06d1c250fde442d.zip | |
import of OpenSSL 0.9.8k
Diffstat (limited to 'src/lib/libcrypto/bn')
| -rw-r--r-- | src/lib/libcrypto/bn/asm/alpha-mont.pl | 317 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/armv4-mont.pl | 200 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/ppc-mont.pl | 323 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/ppc64-mont.pl | 918 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/s390x-mont.pl | 225 | ||||
| -rwxr-xr-x | src/lib/libcrypto/bn/asm/s390x.S | 678 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/sparcv9-mont.pl | 606 | ||||
| -rwxr-xr-x | src/lib/libcrypto/bn/asm/sparcv9a-mont.pl | 882 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/via-mont.pl | 242 | ||||
| -rwxr-xr-x | src/lib/libcrypto/bn/asm/x86-mont.pl | 591 |
10 files changed, 4982 insertions, 0 deletions
diff --git a/src/lib/libcrypto/bn/asm/alpha-mont.pl b/src/lib/libcrypto/bn/asm/alpha-mont.pl new file mode 100644 index 0000000000..7a2cc3173b --- /dev/null +++ b/src/lib/libcrypto/bn/asm/alpha-mont.pl | |||
| @@ -0,0 +1,317 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # On 21264 RSA sign performance improves by 70/35/20/15 percent for | ||
| 11 | # 512/1024/2048/4096 bit key lengths. This is against vendor compiler | ||
| 12 | # instructed to '-tune host' code with in-line assembler. Other | ||
| 13 | # benchmarks improve by 15-20%. To anchor it to something else, the | ||
| 14 | # code provides approximately the same performance per GHz as AMD64. | ||
| 15 | # I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x | ||
| 16 | # difference. | ||
| 17 | |||
| 18 | # int bn_mul_mont( | ||
| 19 | $rp="a0"; # BN_ULONG *rp, | ||
| 20 | $ap="a1"; # const BN_ULONG *ap, | ||
| 21 | $bp="a2"; # const BN_ULONG *bp, | ||
| 22 | $np="a3"; # const BN_ULONG *np, | ||
| 23 | $n0="a4"; # const BN_ULONG *n0, | ||
| 24 | $num="a5"; # int num); | ||
| 25 | |||
| 26 | $lo0="t0"; | ||
| 27 | $hi0="t1"; | ||
| 28 | $lo1="t2"; | ||
| 29 | $hi1="t3"; | ||
| 30 | $aj="t4"; | ||
| 31 | $bi="t5"; | ||
| 32 | $nj="t6"; | ||
| 33 | $tp="t7"; | ||
| 34 | $alo="t8"; | ||
| 35 | $ahi="t9"; | ||
| 36 | $nlo="t10"; | ||
| 37 | $nhi="t11"; | ||
| 38 | $tj="t12"; | ||
| 39 | $i="s3"; | ||
| 40 | $j="s4"; | ||
| 41 | $m1="s5"; | ||
| 42 | |||
| 43 | $code=<<___; | ||
| 44 | #include <asm.h> | ||
| 45 | #include <regdef.h> | ||
| 46 | |||
| 47 | .text | ||
| 48 | |||
| 49 | .set noat | ||
| 50 | .set noreorder | ||
| 51 | |||
| 52 | .globl bn_mul_mont | ||
| 53 | .align 5 | ||
| 54 | .ent bn_mul_mont | ||
| 55 | bn_mul_mont: | ||
| 56 | lda sp,-40(sp) | ||
| 57 | stq ra,0(sp) | ||
| 58 | stq s3,8(sp) | ||
| 59 | stq s4,16(sp) | ||
| 60 | stq s5,24(sp) | ||
| 61 | stq fp,32(sp) | ||
| 62 | mov sp,fp | ||
| 63 | .mask 0x0400f000,-40 | ||
| 64 | .frame fp,40,ra | ||
| 65 | .prologue 0 | ||
| 66 | |||
| 67 | .align 4 | ||
| 68 | .set reorder | ||
| 69 | sextl $num,$num | ||
| 70 | mov 0,v0 | ||
| 71 | cmplt $num,4,AT | ||
| 72 | bne AT,.Lexit | ||
| 73 | |||
| 74 | ldq $hi0,0($ap) # ap[0] | ||
| 75 | s8addq $num,16,AT | ||
| 76 | ldq $aj,8($ap) | ||
| 77 | subq sp,AT,sp | ||
| 78 | ldq $bi,0($bp) # bp[0] | ||
| 79 | mov -4096,AT | ||
| 80 | ldq $n0,0($n0) | ||
| 81 | and sp,AT,sp | ||
| 82 | |||
| 83 | mulq $hi0,$bi,$lo0 | ||
| 84 | ldq $hi1,0($np) # np[0] | ||
| 85 | umulh $hi0,$bi,$hi0 | ||
| 86 | ldq $nj,8($np) | ||
| 87 | |||
| 88 | mulq $lo0,$n0,$m1 | ||
| 89 | |||
| 90 | mulq $hi1,$m1,$lo1 | ||
| 91 | umulh $hi1,$m1,$hi1 | ||
| 92 | |||
| 93 | addq $lo1,$lo0,$lo1 | ||
| 94 | cmpult $lo1,$lo0,AT | ||
| 95 | addq $hi1,AT,$hi1 | ||
| 96 | |||
| 97 | mulq $aj,$bi,$alo | ||
| 98 | mov 2,$j | ||
| 99 | umulh $aj,$bi,$ahi | ||
| 100 | mov sp,$tp | ||
| 101 | |||
| 102 | mulq $nj,$m1,$nlo | ||
| 103 | s8addq $j,$ap,$aj | ||
| 104 | umulh $nj,$m1,$nhi | ||
| 105 | s8addq $j,$np,$nj | ||
| 106 | .align 4 | ||
| 107 | .L1st: | ||
| 108 | .set noreorder | ||
| 109 | ldq $aj,($aj) | ||
| 110 | addl $j,1,$j | ||
| 111 | ldq $nj,($nj) | ||
| 112 | lda $tp,8($tp) | ||
| 113 | |||
| 114 | addq $alo,$hi0,$lo0 | ||
| 115 | mulq $aj,$bi,$alo | ||
| 116 | cmpult $lo0,$hi0,AT | ||
| 117 | addq $nlo,$hi1,$lo1 | ||
| 118 | |||
| 119 | mulq $nj,$m1,$nlo | ||
| 120 | addq $ahi,AT,$hi0 | ||
| 121 | cmpult $lo1,$hi1,v0 | ||
| 122 | cmplt $j,$num,$tj | ||
| 123 | |||
| 124 | umulh $aj,$bi,$ahi | ||
| 125 | addq $nhi,v0,$hi1 | ||
| 126 | addq $lo1,$lo0,$lo1 | ||
| 127 | s8addq $j,$ap,$aj | ||
| 128 | |||
| 129 | umulh $nj,$m1,$nhi | ||
| 130 | cmpult $lo1,$lo0,v0 | ||
| 131 | addq $hi1,v0,$hi1 | ||
| 132 | s8addq $j,$np,$nj | ||
| 133 | |||
| 134 | stq $lo1,-8($tp) | ||
| 135 | nop | ||
| 136 | unop | ||
| 137 | bne $tj,.L1st | ||
| 138 | .set reorder | ||
| 139 | |||
| 140 | addq $alo,$hi0,$lo0 | ||
| 141 | addq $nlo,$hi1,$lo1 | ||
| 142 | cmpult $lo0,$hi0,AT | ||
| 143 | cmpult $lo1,$hi1,v0 | ||
| 144 | addq $ahi,AT,$hi0 | ||
| 145 | addq $nhi,v0,$hi1 | ||
| 146 | |||
| 147 | addq $lo1,$lo0,$lo1 | ||
| 148 | cmpult $lo1,$lo0,v0 | ||
| 149 | addq $hi1,v0,$hi1 | ||
| 150 | |||
| 151 | stq $lo1,0($tp) | ||
| 152 | |||
| 153 | addq $hi1,$hi0,$hi1 | ||
| 154 | cmpult $hi1,$hi0,AT | ||
| 155 | stq $hi1,8($tp) | ||
| 156 | stq AT,16($tp) | ||
| 157 | |||
| 158 | mov 1,$i | ||
| 159 | .align 4 | ||
| 160 | .Louter: | ||
| 161 | s8addq $i,$bp,$bi | ||
| 162 | ldq $hi0,($ap) | ||
| 163 | ldq $aj,8($ap) | ||
| 164 | ldq $bi,($bi) | ||
| 165 | ldq $hi1,($np) | ||
| 166 | ldq $nj,8($np) | ||
| 167 | ldq $tj,(sp) | ||
| 168 | |||
| 169 | mulq $hi0,$bi,$lo0 | ||
| 170 | umulh $hi0,$bi,$hi0 | ||
| 171 | |||
| 172 | addq $lo0,$tj,$lo0 | ||
| 173 | cmpult $lo0,$tj,AT | ||
| 174 | addq $hi0,AT,$hi0 | ||
| 175 | |||
| 176 | mulq $lo0,$n0,$m1 | ||
| 177 | |||
| 178 | mulq $hi1,$m1,$lo1 | ||
| 179 | umulh $hi1,$m1,$hi1 | ||
| 180 | |||
| 181 | addq $lo1,$lo0,$lo1 | ||
| 182 | cmpult $lo1,$lo0,AT | ||
| 183 | mov 2,$j | ||
| 184 | addq $hi1,AT,$hi1 | ||
| 185 | |||
| 186 | mulq $aj,$bi,$alo | ||
| 187 | mov sp,$tp | ||
| 188 | umulh $aj,$bi,$ahi | ||
| 189 | |||
| 190 | mulq $nj,$m1,$nlo | ||
| 191 | s8addq $j,$ap,$aj | ||
| 192 | umulh $nj,$m1,$nhi | ||
| 193 | .align 4 | ||
| 194 | .Linner: | ||
| 195 | .set noreorder | ||
| 196 | ldq $tj,8($tp) #L0 | ||
| 197 | nop #U1 | ||
| 198 | ldq $aj,($aj) #L1 | ||
| 199 | s8addq $j,$np,$nj #U0 | ||
| 200 | |||
| 201 | ldq $nj,($nj) #L0 | ||
| 202 | nop #U1 | ||
| 203 | addq $alo,$hi0,$lo0 #L1 | ||
| 204 | lda $tp,8($tp) | ||
| 205 | |||
| 206 | mulq $aj,$bi,$alo #U1 | ||
| 207 | cmpult $lo0,$hi0,AT #L0 | ||
| 208 | addq $nlo,$hi1,$lo1 #L1 | ||
| 209 | addl $j,1,$j | ||
| 210 | |||
| 211 | mulq $nj,$m1,$nlo #U1 | ||
| 212 | addq $ahi,AT,$hi0 #L0 | ||
| 213 | addq $lo0,$tj,$lo0 #L1 | ||
| 214 | cmpult $lo1,$hi1,v0 #U0 | ||
| 215 | |||
| 216 | umulh $aj,$bi,$ahi #U1 | ||
| 217 | cmpult $lo0,$tj,AT #L0 | ||
| 218 | addq $lo1,$lo0,$lo1 #L1 | ||
| 219 | addq $nhi,v0,$hi1 #U0 | ||
| 220 | |||
| 221 | umulh $nj,$m1,$nhi #U1 | ||
| 222 | s8addq $j,$ap,$aj #L0 | ||
| 223 | cmpult $lo1,$lo0,v0 #L1 | ||
| 224 | cmplt $j,$num,$tj #U0 # borrow $tj | ||
| 225 | |||
| 226 | addq $hi0,AT,$hi0 #L0 | ||
| 227 | addq $hi1,v0,$hi1 #U1 | ||
| 228 | stq $lo1,-8($tp) #L1 | ||
| 229 | bne $tj,.Linner #U0 | ||
| 230 | .set reorder | ||
| 231 | |||
| 232 | ldq $tj,8($tp) | ||
| 233 | addq $alo,$hi0,$lo0 | ||
| 234 | addq $nlo,$hi1,$lo1 | ||
| 235 | cmpult $lo0,$hi0,AT | ||
| 236 | cmpult $lo1,$hi1,v0 | ||
| 237 | addq $ahi,AT,$hi0 | ||
| 238 | addq $nhi,v0,$hi1 | ||
| 239 | |||
| 240 | addq $lo0,$tj,$lo0 | ||
| 241 | cmpult $lo0,$tj,AT | ||
| 242 | addq $hi0,AT,$hi0 | ||
| 243 | |||
| 244 | ldq $tj,16($tp) | ||
| 245 | addq $lo1,$lo0,$j | ||
| 246 | cmpult $j,$lo0,v0 | ||
| 247 | addq $hi1,v0,$hi1 | ||
| 248 | |||
| 249 | addq $hi1,$hi0,$lo1 | ||
| 250 | stq $j,($tp) | ||
| 251 | cmpult $lo1,$hi0,$hi1 | ||
| 252 | addq $lo1,$tj,$lo1 | ||
| 253 | cmpult $lo1,$tj,AT | ||
| 254 | addl $i,1,$i | ||
| 255 | addq $hi1,AT,$hi1 | ||
| 256 | stq $lo1,8($tp) | ||
| 257 | cmplt $i,$num,$tj # borrow $tj | ||
| 258 | stq $hi1,16($tp) | ||
| 259 | bne $tj,.Louter | ||
| 260 | |||
| 261 | s8addq $num,sp,$tj # &tp[num] | ||
| 262 | mov $rp,$bp # put rp aside | ||
| 263 | mov sp,$tp | ||
| 264 | mov sp,$ap | ||
| 265 | mov 0,$hi0 # clear borrow bit | ||
| 266 | |||
| 267 | .align 4 | ||
| 268 | .Lsub: ldq $lo0,($tp) | ||
| 269 | ldq $lo1,($np) | ||
| 270 | lda $tp,8($tp) | ||
| 271 | lda $np,8($np) | ||
| 272 | subq $lo0,$lo1,$lo1 # tp[i]-np[i] | ||
| 273 | cmpult $lo0,$lo1,AT | ||
| 274 | subq $lo1,$hi0,$lo0 | ||
| 275 | cmpult $lo1,$lo0,$hi0 | ||
| 276 | or $hi0,AT,$hi0 | ||
| 277 | stq $lo0,($rp) | ||
| 278 | cmpult $tp,$tj,v0 | ||
| 279 | lda $rp,8($rp) | ||
| 280 | bne v0,.Lsub | ||
| 281 | |||
| 282 | subq $hi1,$hi0,$hi0 # handle upmost overflow bit | ||
| 283 | mov sp,$tp | ||
| 284 | mov $bp,$rp # restore rp | ||
| 285 | |||
| 286 | and sp,$hi0,$ap | ||
| 287 | bic $bp,$hi0,$bp | ||
| 288 | bis $bp,$ap,$ap # ap=borrow?tp:rp | ||
| 289 | |||
| 290 | .align 4 | ||
| 291 | .Lcopy: ldq $aj,($ap) # copy or in-place refresh | ||
| 292 | lda $tp,8($tp) | ||
| 293 | lda $rp,8($rp) | ||
| 294 | lda $ap,8($ap) | ||
| 295 | stq zero,-8($tp) # zap tp | ||
| 296 | cmpult $tp,$tj,AT | ||
| 297 | stq $aj,-8($rp) | ||
| 298 | bne AT,.Lcopy | ||
| 299 | mov 1,v0 | ||
| 300 | |||
| 301 | .Lexit: | ||
| 302 | .set noreorder | ||
| 303 | mov fp,sp | ||
| 304 | /*ldq ra,0(sp)*/ | ||
| 305 | ldq s3,8(sp) | ||
| 306 | ldq s4,16(sp) | ||
| 307 | ldq s5,24(sp) | ||
| 308 | ldq fp,32(sp) | ||
| 309 | lda sp,40(sp) | ||
| 310 | ret (ra) | ||
| 311 | .end bn_mul_mont | ||
| 312 | .rdata | ||
| 313 | .asciiz "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 314 | ___ | ||
| 315 | |||
| 316 | print $code; | ||
| 317 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/armv4-mont.pl b/src/lib/libcrypto/bn/asm/armv4-mont.pl new file mode 100644 index 0000000000..05d5dc1a48 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/armv4-mont.pl | |||
| @@ -0,0 +1,200 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # January 2007. | ||
| 11 | |||
| 12 | # Montgomery multiplication for ARMv4. | ||
| 13 | # | ||
| 14 | # Performance improvement naturally varies among CPU implementations | ||
| 15 | # and compilers. The code was observed to provide +65-35% improvement | ||
| 16 | # [depending on key length, less for longer keys] on ARM920T, and | ||
| 17 | # +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code | ||
| 18 | # base and compiler generated code with in-lined umull and even umlal | ||
| 19 | # instructions. The latter means that this code didn't really have an | ||
| 20 | # "advantage" of utilizing some "secret" instruction. | ||
| 21 | # | ||
| 22 | # The code is interoperable with Thumb ISA and is rather compact, less | ||
| 23 | # than 1/2KB. Windows CE port would be trivial, as it's exclusively | ||
| 24 | # about decorations, ABI and instruction syntax are identical. | ||
| 25 | |||
| 26 | $num="r0"; # starts as num argument, but holds &tp[num-1] | ||
| 27 | $ap="r1"; | ||
| 28 | $bp="r2"; $bi="r2"; $rp="r2"; | ||
| 29 | $np="r3"; | ||
| 30 | $tp="r4"; | ||
| 31 | $aj="r5"; | ||
| 32 | $nj="r6"; | ||
| 33 | $tj="r7"; | ||
| 34 | $n0="r8"; | ||
| 35 | ########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer | ||
| 36 | $alo="r10"; # sl, gcc uses it to keep @GOT | ||
| 37 | $ahi="r11"; # fp | ||
| 38 | $nlo="r12"; # ip | ||
| 39 | ########### # r13 is stack pointer | ||
| 40 | $nhi="r14"; # lr | ||
| 41 | ########### # r15 is program counter | ||
| 42 | |||
| 43 | #### argument block layout relative to &tp[num-1], a.k.a. $num | ||
| 44 | $_rp="$num,#12*4"; | ||
| 45 | # ap permanently resides in r1 | ||
| 46 | $_bp="$num,#13*4"; | ||
| 47 | # np permanently resides in r3 | ||
| 48 | $_n0="$num,#14*4"; | ||
| 49 | $_num="$num,#15*4"; $_bpend=$_num; | ||
| 50 | |||
| 51 | $code=<<___; | ||
| 52 | .text | ||
| 53 | |||
| 54 | .global bn_mul_mont | ||
| 55 | .type bn_mul_mont,%function | ||
| 56 | |||
| 57 | .align 2 | ||
| 58 | bn_mul_mont: | ||
| 59 | stmdb sp!,{r0,r2} @ sp points at argument block | ||
| 60 | ldr $num,[sp,#3*4] @ load num | ||
| 61 | cmp $num,#2 | ||
| 62 | movlt r0,#0 | ||
| 63 | addlt sp,sp,#2*4 | ||
| 64 | blt .Labrt | ||
| 65 | |||
| 66 | stmdb sp!,{r4-r12,lr} @ save 10 registers | ||
| 67 | |||
| 68 | mov $num,$num,lsl#2 @ rescale $num for byte count | ||
| 69 | sub sp,sp,$num @ alloca(4*num) | ||
| 70 | sub sp,sp,#4 @ +extra dword | ||
| 71 | sub $num,$num,#4 @ "num=num-1" | ||
| 72 | add $tp,$bp,$num @ &bp[num-1] | ||
| 73 | |||
| 74 | add $num,sp,$num @ $num to point at &tp[num-1] | ||
| 75 | ldr $n0,[$_n0] @ &n0 | ||
| 76 | ldr $bi,[$bp] @ bp[0] | ||
| 77 | ldr $aj,[$ap],#4 @ ap[0],ap++ | ||
| 78 | ldr $nj,[$np],#4 @ np[0],np++ | ||
| 79 | ldr $n0,[$n0] @ *n0 | ||
| 80 | str $tp,[$_bpend] @ save &bp[num] | ||
| 81 | |||
| 82 | umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0] | ||
| 83 | str $n0,[$_n0] @ save n0 value | ||
| 84 | mul $n0,$alo,$n0 @ "tp[0]"*n0 | ||
| 85 | mov $nlo,#0 | ||
| 86 | umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]" | ||
| 87 | mov $tp,sp | ||
| 88 | |||
| 89 | .L1st: | ||
| 90 | ldr $aj,[$ap],#4 @ ap[j],ap++ | ||
| 91 | mov $alo,$ahi | ||
| 92 | mov $ahi,#0 | ||
| 93 | umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] | ||
| 94 | ldr $nj,[$np],#4 @ np[j],np++ | ||
| 95 | mov $nhi,#0 | ||
| 96 | umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 | ||
| 97 | adds $nlo,$nlo,$alo | ||
| 98 | str $nlo,[$tp],#4 @ tp[j-1]=,tp++ | ||
| 99 | adc $nlo,$nhi,#0 | ||
| 100 | cmp $tp,$num | ||
| 101 | bne .L1st | ||
| 102 | |||
| 103 | adds $nlo,$nlo,$ahi | ||
| 104 | mov $nhi,#0 | ||
| 105 | adc $nhi,$nhi,#0 | ||
| 106 | ldr $tp,[$_bp] @ restore bp | ||
| 107 | str $nlo,[$num] @ tp[num-1]= | ||
| 108 | ldr $n0,[$_n0] @ restore n0 | ||
| 109 | str $nhi,[$num,#4] @ tp[num]= | ||
| 110 | |||
| 111 | .Louter: | ||
| 112 | sub $tj,$num,sp @ "original" $num-1 value | ||
| 113 | sub $ap,$ap,$tj @ "rewind" ap to &ap[1] | ||
| 114 | sub $np,$np,$tj @ "rewind" np to &np[1] | ||
| 115 | ldr $bi,[$tp,#4]! @ *(++bp) | ||
| 116 | ldr $aj,[$ap,#-4] @ ap[0] | ||
| 117 | ldr $nj,[$np,#-4] @ np[0] | ||
| 118 | ldr $alo,[sp] @ tp[0] | ||
| 119 | ldr $tj,[sp,#4] @ tp[1] | ||
| 120 | |||
| 121 | mov $ahi,#0 | ||
| 122 | umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0] | ||
| 123 | str $tp,[$_bp] @ save bp | ||
| 124 | mul $n0,$alo,$n0 | ||
| 125 | mov $nlo,#0 | ||
| 126 | umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]" | ||
| 127 | mov $tp,sp | ||
| 128 | |||
| 129 | .Linner: | ||
| 130 | ldr $aj,[$ap],#4 @ ap[j],ap++ | ||
| 131 | adds $alo,$ahi,$tj @ +=tp[j] | ||
| 132 | mov $ahi,#0 | ||
| 133 | umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] | ||
| 134 | ldr $nj,[$np],#4 @ np[j],np++ | ||
| 135 | mov $nhi,#0 | ||
| 136 | umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 | ||
| 137 | ldr $tj,[$tp,#8] @ tp[j+1] | ||
| 138 | adc $ahi,$ahi,#0 | ||
| 139 | adds $nlo,$nlo,$alo | ||
| 140 | str $nlo,[$tp],#4 @ tp[j-1]=,tp++ | ||
| 141 | adc $nlo,$nhi,#0 | ||
| 142 | cmp $tp,$num | ||
| 143 | bne .Linner | ||
| 144 | |||
| 145 | adds $nlo,$nlo,$ahi | ||
| 146 | mov $nhi,#0 | ||
| 147 | adc $nhi,$nhi,#0 | ||
| 148 | adds $nlo,$nlo,$tj | ||
| 149 | adc $nhi,$nhi,#0 | ||
| 150 | ldr $tp,[$_bp] @ restore bp | ||
| 151 | ldr $tj,[$_bpend] @ restore &bp[num] | ||
| 152 | str $nlo,[$num] @ tp[num-1]= | ||
| 153 | ldr $n0,[$_n0] @ restore n0 | ||
| 154 | str $nhi,[$num,#4] @ tp[num]= | ||
| 155 | |||
| 156 | cmp $tp,$tj | ||
| 157 | bne .Louter | ||
| 158 | |||
| 159 | ldr $rp,[$_rp] @ pull rp | ||
| 160 | add $num,$num,#4 @ $num to point at &tp[num] | ||
| 161 | sub $aj,$num,sp @ "original" num value | ||
| 162 | mov $tp,sp @ "rewind" $tp | ||
| 163 | mov $ap,$tp @ "borrow" $ap | ||
| 164 | sub $np,$np,$aj @ "rewind" $np to &np[0] | ||
| 165 | |||
| 166 | subs $tj,$tj,$tj @ "clear" carry flag | ||
| 167 | .Lsub: ldr $tj,[$tp],#4 | ||
| 168 | ldr $nj,[$np],#4 | ||
| 169 | sbcs $tj,$tj,$nj @ tp[j]-np[j] | ||
| 170 | str $tj,[$rp],#4 @ rp[j]= | ||
| 171 | teq $tp,$num @ preserve carry | ||
| 172 | bne .Lsub | ||
| 173 | sbcs $nhi,$nhi,#0 @ upmost carry | ||
| 174 | mov $tp,sp @ "rewind" $tp | ||
| 175 | sub $rp,$rp,$aj @ "rewind" $rp | ||
| 176 | |||
| 177 | and $ap,$tp,$nhi | ||
| 178 | bic $np,$rp,$nhi | ||
| 179 | orr $ap,$ap,$np @ ap=borrow?tp:rp | ||
| 180 | |||
| 181 | .Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh | ||
| 182 | str sp,[$tp],#4 @ zap tp | ||
| 183 | str $tj,[$rp],#4 | ||
| 184 | cmp $tp,$num | ||
| 185 | bne .Lcopy | ||
| 186 | |||
| 187 | add sp,$num,#4 @ skip over tp[num+1] | ||
| 188 | ldmia sp!,{r4-r12,lr} @ restore registers | ||
| 189 | add sp,sp,#2*4 @ skip over {r0,r2} | ||
| 190 | mov r0,#1 | ||
| 191 | .Labrt: tst lr,#1 | ||
| 192 | moveq pc,lr @ be binary compatible with V4, yet | ||
| 193 | bx lr @ interoperable with Thumb ISA:-) | ||
| 194 | .size bn_mul_mont,.-bn_mul_mont | ||
| 195 | .asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 196 | ___ | ||
| 197 | |||
| 198 | $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 | ||
| 199 | print $code; | ||
| 200 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/ppc-mont.pl b/src/lib/libcrypto/bn/asm/ppc-mont.pl new file mode 100644 index 0000000000..7849eae959 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/ppc-mont.pl | |||
| @@ -0,0 +1,323 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # April 2006 | ||
| 11 | |||
| 12 | # "Teaser" Montgomery multiplication module for PowerPC. It's possible | ||
| 13 | # to gain a bit more by modulo-scheduling outer loop, then dedicated | ||
| 14 | # squaring procedure should give further 20% and code can be adapted | ||
| 15 | # for 32-bit application running on 64-bit CPU. As for the latter, | ||
| 16 | # it won't be able to achieve "native" 64-bit performance, because in | ||
| 17 | # 32-bit application context every addc instruction will have to be | ||
| 18 | # expanded as addc, twice right shift by 32 and finally adde, etc. | ||
| 19 | # So far RSA *sign* performance improvement over pre-bn_mul_mont asm | ||
| 20 | # for 64-bit application running on PPC970/G5 is: | ||
| 21 | # | ||
| 22 | # 512-bit +65% | ||
| 23 | # 1024-bit +35% | ||
| 24 | # 2048-bit +18% | ||
| 25 | # 4096-bit +4% | ||
| 26 | |||
| 27 | $flavour = shift; | ||
| 28 | |||
| 29 | if ($flavour =~ /32/) { | ||
| 30 | $BITS= 32; | ||
| 31 | $BNSZ= $BITS/8; | ||
| 32 | $SIZE_T=4; | ||
| 33 | $RZONE= 224; | ||
| 34 | $FRAME= $SIZE_T*16; | ||
| 35 | |||
| 36 | $LD= "lwz"; # load | ||
| 37 | $LDU= "lwzu"; # load and update | ||
| 38 | $LDX= "lwzx"; # load indexed | ||
| 39 | $ST= "stw"; # store | ||
| 40 | $STU= "stwu"; # store and update | ||
| 41 | $STX= "stwx"; # store indexed | ||
| 42 | $STUX= "stwux"; # store indexed and update | ||
| 43 | $UMULL= "mullw"; # unsigned multiply low | ||
| 44 | $UMULH= "mulhwu"; # unsigned multiply high | ||
| 45 | $UCMP= "cmplw"; # unsigned compare | ||
| 46 | $SHRI= "srwi"; # unsigned shift right by immediate | ||
| 47 | $PUSH= $ST; | ||
| 48 | $POP= $LD; | ||
| 49 | } elsif ($flavour =~ /64/) { | ||
| 50 | $BITS= 64; | ||
| 51 | $BNSZ= $BITS/8; | ||
| 52 | $SIZE_T=8; | ||
| 53 | $RZONE= 288; | ||
| 54 | $FRAME= $SIZE_T*16; | ||
| 55 | |||
| 56 | # same as above, but 64-bit mnemonics... | ||
| 57 | $LD= "ld"; # load | ||
| 58 | $LDU= "ldu"; # load and update | ||
| 59 | $LDX= "ldx"; # load indexed | ||
| 60 | $ST= "std"; # store | ||
| 61 | $STU= "stdu"; # store and update | ||
| 62 | $STX= "stdx"; # store indexed | ||
| 63 | $STUX= "stdux"; # store indexed and update | ||
| 64 | $UMULL= "mulld"; # unsigned multiply low | ||
| 65 | $UMULH= "mulhdu"; # unsigned multiply high | ||
| 66 | $UCMP= "cmpld"; # unsigned compare | ||
| 67 | $SHRI= "srdi"; # unsigned shift right by immediate | ||
| 68 | $PUSH= $ST; | ||
| 69 | $POP= $LD; | ||
| 70 | } else { die "nonsense $flavour"; } | ||
| 71 | |||
| 72 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # leading directory of $0, including the trailing separator | ||
| 73 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or | ||
| 74 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or | ||
| 75 | die "can't locate ppc-xlate.pl"; | ||
| 76 | # pipe STDOUT through ppc-xlate.pl; low-precedence "or" makes die guard open() itself, not the command string | ||
| 77 | open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!"; | ||
| 78 | |||
| 79 | $sp="r1"; | ||
| 80 | $toc="r2"; | ||
| 81 | $rp="r3"; $ovf="r3"; | ||
| 82 | $ap="r4"; | ||
| 83 | $bp="r5"; | ||
| 84 | $np="r6"; | ||
| 85 | $n0="r7"; | ||
| 86 | $num="r8"; | ||
| 87 | $rp="r9"; # $rp is reassigned | ||
| 88 | $aj="r10"; | ||
| 89 | $nj="r11"; | ||
| 90 | $tj="r12"; | ||
| 91 | # non-volatile registers | ||
| 92 | $i="r14"; | ||
| 93 | $j="r15"; | ||
| 94 | $tp="r16"; | ||
| 95 | $m0="r17"; | ||
| 96 | $m1="r18"; | ||
| 97 | $lo0="r19"; | ||
| 98 | $hi0="r20"; | ||
| 99 | $lo1="r21"; | ||
| 100 | $hi1="r22"; | ||
| 101 | $alo="r23"; | ||
| 102 | $ahi="r24"; | ||
| 103 | $nlo="r25"; | ||
| 104 | # | ||
| 105 | $nhi="r0"; | ||
| 106 | |||
| 107 | $code=<<___; | ||
| 108 | .machine "any" | ||
| 109 | .text | ||
| 110 | |||
| 111 | .globl .bn_mul_mont | ||
| 112 | .align 4 | ||
| 113 | .bn_mul_mont: | ||
| 114 | cmpwi $num,4 | ||
| 115 | mr $rp,r3 ; $rp is reassigned | ||
| 116 | li r3,0 | ||
| 117 | bltlr | ||
| 118 | |||
| 119 | slwi $num,$num,`log($BNSZ)/log(2)` | ||
| 120 | li $tj,-4096 | ||
| 121 | addi $ovf,$num,`$FRAME+$RZONE` | ||
| 122 | subf $ovf,$ovf,$sp ; $sp-$ovf | ||
| 123 | and $ovf,$ovf,$tj ; minimize TLB usage | ||
| 124 | subf $ovf,$sp,$ovf ; $ovf-$sp | ||
| 125 | srwi $num,$num,`log($BNSZ)/log(2)` | ||
| 126 | $STUX $sp,$sp,$ovf | ||
| 127 | |||
| 128 | $PUSH r14,`4*$SIZE_T`($sp) | ||
| 129 | $PUSH r15,`5*$SIZE_T`($sp) | ||
| 130 | $PUSH r16,`6*$SIZE_T`($sp) | ||
| 131 | $PUSH r17,`7*$SIZE_T`($sp) | ||
| 132 | $PUSH r18,`8*$SIZE_T`($sp) | ||
| 133 | $PUSH r19,`9*$SIZE_T`($sp) | ||
| 134 | $PUSH r20,`10*$SIZE_T`($sp) | ||
| 135 | $PUSH r21,`11*$SIZE_T`($sp) | ||
| 136 | $PUSH r22,`12*$SIZE_T`($sp) | ||
| 137 | $PUSH r23,`13*$SIZE_T`($sp) | ||
| 138 | $PUSH r24,`14*$SIZE_T`($sp) | ||
| 139 | $PUSH r25,`15*$SIZE_T`($sp) | ||
| 140 | |||
| 141 | $LD $n0,0($n0) ; pull n0[0] value | ||
| 142 | addi $num,$num,-2 ; adjust $num for counter register | ||
| 143 | |||
| 144 | $LD $m0,0($bp) ; m0=bp[0] | ||
| 145 | $LD $aj,0($ap) ; ap[0] | ||
| 146 | addi $tp,$sp,$FRAME | ||
| 147 | $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] | ||
| 148 | $UMULH $hi0,$aj,$m0 | ||
| 149 | |||
| 150 | $LD $aj,$BNSZ($ap) ; ap[1] | ||
| 151 | $LD $nj,0($np) ; np[0] | ||
| 152 | |||
| 153 | $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0 | ||
| 154 | |||
| 155 | $UMULL $alo,$aj,$m0 ; ap[1]*bp[0] | ||
| 156 | $UMULH $ahi,$aj,$m0 | ||
| 157 | |||
| 158 | $UMULL $lo1,$nj,$m1 ; np[0]*m1 | ||
| 159 | $UMULH $hi1,$nj,$m1 | ||
| 160 | $LD $nj,$BNSZ($np) ; np[1] | ||
| 161 | addc $lo1,$lo1,$lo0 | ||
| 162 | addze $hi1,$hi1 | ||
| 163 | |||
| 164 | $UMULL $nlo,$nj,$m1 ; np[1]*m1 | ||
| 165 | $UMULH $nhi,$nj,$m1 | ||
| 166 | |||
| 167 | mtctr $num | ||
| 168 | li $j,`2*$BNSZ` | ||
| 169 | .align 4 | ||
| 170 | L1st: | ||
| 171 | $LDX $aj,$ap,$j ; ap[j] | ||
| 172 | addc $lo0,$alo,$hi0 | ||
| 173 | $LDX $nj,$np,$j ; np[j] | ||
| 174 | addze $hi0,$ahi | ||
| 175 | $UMULL $alo,$aj,$m0 ; ap[j]*bp[0] | ||
| 176 | addc $lo1,$nlo,$hi1 | ||
| 177 | $UMULH $ahi,$aj,$m0 | ||
| 178 | addze $hi1,$nhi | ||
| 179 | $UMULL $nlo,$nj,$m1 ; np[j]*m1 | ||
| 180 | addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] | ||
| 181 | $UMULH $nhi,$nj,$m1 | ||
| 182 | addze $hi1,$hi1 | ||
| 183 | $ST $lo1,0($tp) ; tp[j-1] | ||
| 184 | |||
| 185 | addi $j,$j,$BNSZ ; j++ | ||
| 186 | addi $tp,$tp,$BNSZ ; tp++ | ||
| 187 | bdnz- L1st | ||
| 188 | ;L1st | ||
| 189 | addc $lo0,$alo,$hi0 | ||
| 190 | addze $hi0,$ahi | ||
| 191 | |||
| 192 | addc $lo1,$nlo,$hi1 | ||
| 193 | addze $hi1,$nhi | ||
| 194 | addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] | ||
| 195 | addze $hi1,$hi1 | ||
| 196 | $ST $lo1,0($tp) ; tp[j-1] | ||
| 197 | |||
| 198 | li $ovf,0 | ||
| 199 | addc $hi1,$hi1,$hi0 | ||
| 200 | addze $ovf,$ovf ; upmost overflow bit | ||
| 201 | $ST $hi1,$BNSZ($tp) | ||
| 202 | |||
| 203 | li $i,$BNSZ | ||
| 204 | .align 4 | ||
| 205 | Louter: | ||
| 206 | $LDX $m0,$bp,$i ; m0=bp[i] | ||
| 207 | $LD $aj,0($ap) ; ap[0] | ||
| 208 | addi $tp,$sp,$FRAME | ||
| 209 | $LD $tj,$FRAME($sp) ; tp[0] | ||
| 210 | $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] | ||
| 211 | $UMULH $hi0,$aj,$m0 | ||
| 212 | $LD $aj,$BNSZ($ap) ; ap[1] | ||
| 213 | $LD $nj,0($np) ; np[0] | ||
| 214 | addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0] | ||
| 215 | $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] | ||
| 216 | addze $hi0,$hi0 | ||
| 217 | $UMULL $m1,$lo0,$n0 ; tp[0]*n0 | ||
| 218 | $UMULH $ahi,$aj,$m0 | ||
| 219 | $UMULL $lo1,$nj,$m1 ; np[0]*m1 | ||
| 220 | $UMULH $hi1,$nj,$m1 | ||
| 221 | $LD $nj,$BNSZ($np) ; np[1] | ||
| 222 | addc $lo1,$lo1,$lo0 | ||
| 223 | $UMULL $nlo,$nj,$m1 ; np[1]*m1 | ||
| 224 | addze $hi1,$hi1 | ||
| 225 | $UMULH $nhi,$nj,$m1 | ||
| 226 | |||
| 227 | mtctr $num | ||
| 228 | li $j,`2*$BNSZ` | ||
| 229 | .align 4 | ||
| 230 | Linner: | ||
| 231 | $LDX $aj,$ap,$j ; ap[j] | ||
| 232 | addc $lo0,$alo,$hi0 | ||
| 233 | $LD $tj,$BNSZ($tp) ; tp[j] | ||
| 234 | addze $hi0,$ahi | ||
| 235 | $LDX $nj,$np,$j ; np[j] | ||
| 236 | addc $lo1,$nlo,$hi1 | ||
| 237 | $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] | ||
| 238 | addze $hi1,$nhi | ||
| 239 | $UMULH $ahi,$aj,$m0 | ||
| 240 | addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] | ||
| 241 | $UMULL $nlo,$nj,$m1 ; np[j]*m1 | ||
| 242 | addze $hi0,$hi0 | ||
| 243 | $UMULH $nhi,$nj,$m1 | ||
| 244 | addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] | ||
| 245 | addi $j,$j,$BNSZ ; j++ | ||
| 246 | addze $hi1,$hi1 | ||
| 247 | $ST $lo1,0($tp) ; tp[j-1] | ||
| 248 | addi $tp,$tp,$BNSZ ; tp++ | ||
| 249 | bdnz- Linner | ||
| 250 | ;Linner | ||
| 251 | $LD $tj,$BNSZ($tp) ; tp[j] | ||
| 252 | addc $lo0,$alo,$hi0 | ||
| 253 | addze $hi0,$ahi | ||
| 254 | addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] | ||
| 255 | addze $hi0,$hi0 | ||
| 256 | |||
| 257 | addc $lo1,$nlo,$hi1 | ||
| 258 | addze $hi1,$nhi | ||
| 259 | addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] | ||
| 260 | addze $hi1,$hi1 | ||
| 261 | $ST $lo1,0($tp) ; tp[j-1] | ||
| 262 | |||
| 263 | addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA] | ||
| 264 | li $ovf,0 | ||
| 265 | adde $hi1,$hi1,$hi0 | ||
| 266 | addze $ovf,$ovf | ||
| 267 | $ST $hi1,$BNSZ($tp) | ||
| 268 | ; | ||
| 269 | slwi $tj,$num,`log($BNSZ)/log(2)` | ||
| 270 | $UCMP $i,$tj | ||
| 271 | addi $i,$i,$BNSZ | ||
| 272 | ble- Louter | ||
| 273 | |||
| 274 | addi $num,$num,2 ; restore $num | ||
| 275 | subfc $j,$j,$j ; j=0 and "clear" XER[CA] | ||
| 276 | addi $tp,$sp,$FRAME | ||
| 277 | mtctr $num | ||
| 278 | |||
| 279 | .align 4 | ||
| 280 | Lsub: $LDX $tj,$tp,$j | ||
| 281 | $LDX $nj,$np,$j | ||
| 282 | subfe $aj,$nj,$tj ; tp[j]-np[j] | ||
| 283 | $STX $aj,$rp,$j | ||
| 284 | addi $j,$j,$BNSZ | ||
| 285 | bdnz- Lsub | ||
| 286 | |||
| 287 | li $j,0 | ||
| 288 | mtctr $num | ||
| 289 | subfe $ovf,$j,$ovf ; handle upmost overflow bit | ||
| 290 | and $ap,$tp,$ovf | ||
| 291 | andc $np,$rp,$ovf | ||
| 292 | or $ap,$ap,$np ; ap=borrow?tp:rp | ||
| 293 | |||
| 294 | .align 4 | ||
| 295 | Lcopy: ; copy or in-place refresh | ||
| 296 | $LDX $tj,$ap,$j | ||
| 297 | $STX $tj,$rp,$j | ||
| 298 | $STX $j,$tp,$j ; zap at once | ||
| 299 | addi $j,$j,$BNSZ | ||
| 300 | bdnz- Lcopy | ||
| 301 | |||
| 302 | $POP r14,`4*$SIZE_T`($sp) | ||
| 303 | $POP r15,`5*$SIZE_T`($sp) | ||
| 304 | $POP r16,`6*$SIZE_T`($sp) | ||
| 305 | $POP r17,`7*$SIZE_T`($sp) | ||
| 306 | $POP r18,`8*$SIZE_T`($sp) | ||
| 307 | $POP r19,`9*$SIZE_T`($sp) | ||
| 308 | $POP r20,`10*$SIZE_T`($sp) | ||
| 309 | $POP r21,`11*$SIZE_T`($sp) | ||
| 310 | $POP r22,`12*$SIZE_T`($sp) | ||
| 311 | $POP r23,`13*$SIZE_T`($sp) | ||
| 312 | $POP r24,`14*$SIZE_T`($sp) | ||
| 313 | $POP r25,`15*$SIZE_T`($sp) | ||
| 314 | $POP $sp,0($sp) | ||
| 315 | li r3,1 | ||
| 316 | blr | ||
| 317 | .long 0 | ||
| 318 | .asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" | ||
| 319 | ___ | ||
| 320 | |||
| 321 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 322 | print $code; | ||
| 323 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/ppc64-mont.pl b/src/lib/libcrypto/bn/asm/ppc64-mont.pl new file mode 100644 index 0000000000..3449b35855 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/ppc64-mont.pl | |||
| @@ -0,0 +1,918 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # December 2007 | ||
| 11 | |||
| 12 | # The reason for undertaking this effort is basically the following. Even though | ||
| 13 | # Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI | ||
| 14 | # performance was observed to be less than impressive, essentially as | ||
| 15 | # fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope. | ||
| 16 | # Well, it's not surprising that IBM had to make some sacrifices to | ||
| 17 | # boost the clock frequency that much, but no overall improvement? | ||
| 18 | # Having observed how much difference switching to FPU made on | ||
| 19 | # UltraSPARC, playing same stunt on Power 6 appeared appropriate... | ||
| 20 | # Unfortunately the resulting performance improvement is not as | ||
| 21 | # impressive, ~30%, and in absolute terms is still very far from what | ||
| 22 | # one would expect from 4.7GHz CPU. There is a chance that I'm doing | ||
| 23 | # something wrong, but in the lack of assembler level micro-profiling | ||
| 24 | # data or at least decent platform guide I can't tell... Or better | ||
| 25 | # results might be achieved with VMX... Anyway, this module provides | ||
| 26 | # *worse* performance on other PowerPC implementations, ~40-15% slower | ||
| 27 | # on PPC970 depending on key length and ~40% slower on Power 5 for all | ||
| 28 | # key lengths. As it's obviously inappropriate as "best all-round" | ||
| 29 | # alternative, it has to be complemented with run-time CPU family | ||
| 30 | # detection. Oh! It should also be noted that unlike other PowerPC | ||
| 31 | # implementation IALU ppc-mont.pl module performs *suboptimally* on | ||
| 32 | # >=1024-bit key lengths on Power 6. It should also be noted that | ||
| 33 | # *everything* said so far applies to 64-bit builds! As far as 32-bit | ||
| 34 | # application executed on 64-bit CPU goes, this module is likely to | ||
| 35 | # become preferred choice, because it's easy to adapt it for such | ||
| 36 | # case and *is* faster than 32-bit ppc-mont.pl on *all* processors. | ||
| 37 | |||
| 38 | # February 2008 | ||
| 39 | |||
| 40 | # Micro-profiling assisted optimization results in ~15% improvement | ||
| 41 | # over original ppc64-mont.pl version, or overall ~50% improvement | ||
| 42 | # over ppc.pl module on Power 6. If compared to ppc-mont.pl on same | ||
| 43 | # Power 6 CPU, this module is 5-150% faster depending on key length, | ||
| 44 | # [hereafter] more for longer keys. But if compared to ppc-mont.pl | ||
| 45 | # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive | ||
| 46 | # in absolute terms, but it's apparently the way Power 6 is... | ||
| 47 | |||
| 48 | $flavour = shift; # first command-line argument; must contain "32" or "64" | ||
| 49 | |||
| 50 | if ($flavour =~ /32/) { | ||
| 51 | $SIZE_T=4; # bytes per saved-GPR slot in the stack frame | ||
| 52 | $RZONE= 224; # extra bytes added to the alloca size; presumably the ABI red zone -- TODO confirm | ||
| 53 | $FRAME= $SIZE_T*12+8*12; # 12 GPR-sized slots followed by 12 FPR slots of 8 bytes | ||
| 54 | $fname= "bn_mul_mont_ppc64"; # global symbol name emitted for this flavour | ||
| 55 | |||
| 56 | $STUX= "stwux"; # store indexed and update | ||
| 57 | $PUSH= "stw"; # mnemonic used to save a GPR | ||
| 58 | $POP= "lwz"; # mnemonic used to restore a GPR | ||
| 59 | die "not implemented yet"; # the 32-bit flavour always aborts here | ||
| 60 | } elsif ($flavour =~ /64/) { | ||
| 61 | $SIZE_T=8; # bytes per saved-GPR slot in the stack frame | ||
| 62 | $RZONE= 288; # extra bytes added to the alloca size; presumably the ABI red zone -- TODO confirm | ||
| 63 | $FRAME= $SIZE_T*12+8*12; # 12 GPR-sized slots followed by 12 FPR slots of 8 bytes | ||
| 64 | $fname= "bn_mul_mont"; # global symbol name emitted for this flavour | ||
| 65 | |||
| 66 | # same as above, but 64-bit mnemonics... | ||
| 67 | $STUX= "stdux"; # store indexed and update | ||
| 68 | $PUSH= "std"; # mnemonic used to save a GPR | ||
| 69 | $POP= "ld"; # mnemonic used to restore a GPR | ||
| 70 | } else { die "nonsense $flavour"; } # any other flavour string is rejected | ||
| 71 | |||
| 72 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # $dir = directory part of this script's path | ||
| 73 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or | ||
| 74 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or | ||
| 75 | die "can't locate ppc-xlate.pl"; | ||
| 76 | |||
| 77 | open(STDOUT,"| $^X $xlate $flavour ".shift) || die "can't call $xlate: $!"; # parenthesised: without it || applied to the always-true command string and a failed open went unnoticed | ||
| 78 | |||
| 79 | $FRAME=($FRAME+63)&~63; # round the fixed frame size up to a multiple of 64 bytes | ||
| 80 | $TRANSFER=16*8; # 16 slots of 8 bytes for the gpr<->fpr transfer zone | ||
| 81 | |||
| 82 | $carry="r0"; # carry propagated between 16-bit limbs when results are reassembled | ||
| 83 | $sp="r1"; | ||
| 84 | $toc="r2"; | ||
| 85 | $rp="r3"; $ovf="r3"; # r3 carries rp on entry; once rp is copied to r9 it is reused as $ovf and for the return code | ||
| 86 | $ap="r4"; | ||
| 87 | $bp="r5"; | ||
| 88 | $np="r6"; | ||
| 89 | $n0="r7"; # pointer to n0 on entry, replaced by the loaded n0[0] value | ||
| 90 | $num="r8"; # word count on entry, scaled to a byte count (num*8) in the prologue | ||
| 91 | $rp="r9"; # $rp is reassigned | ||
| 92 | $tp="r10"; # alloca size during the prologue, then pointer into tmp[] | ||
| 93 | $j="r11"; # loop trip count loaded into CTR | ||
| 94 | $i="r12"; # alignment mask during setup, then byte offset of the current bp word | ||
| 95 | # non-volatile registers | ||
| 96 | $nap_d="r14"; # interleaved ap and np in double format | ||
| 97 | $a0="r15"; # ap[0] | ||
| 98 | $t0="r16"; # temporary registers | ||
| 99 | $t1="r17"; | ||
| 100 | $t2="r18"; | ||
| 101 | $t3="r19"; | ||
| 102 | $t4="r20"; | ||
| 103 | $t5="r21"; | ||
| 104 | $t6="r22"; | ||
| 105 | $t7="r23"; | ||
| 106 | |||
| 107 | # PPC offers enough register bank capacity to unroll inner loops twice | ||
| 108 | # | ||
| 109 | # ..A3A2A1A0 | ||
| 110 | # dcba | ||
| 111 | # ----------- | ||
| 112 | # A0a | ||
| 113 | # A0b | ||
| 114 | # A0c | ||
| 115 | # A0d | ||
| 116 | # A1a | ||
| 117 | # A1b | ||
| 118 | # A1c | ||
| 119 | # A1d | ||
| 120 | # A2a | ||
| 121 | # A2b | ||
| 122 | # A2c | ||
| 123 | # A2d | ||
| 124 | # A3a | ||
| 125 | # A3b | ||
| 126 | # A3c | ||
| 127 | # A3d | ||
| 128 | # ..a | ||
| 129 | # ..b | ||
| 130 | # | ||
| 131 | $ba="f0"; $bb="f1"; $bc="f2"; $bd="f3"; # bp[i] split into four 16-bit pieces, converted to double | ||
| 132 | $na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; # tp[0]*n0 split into four 16-bit pieces, converted to double | ||
| 133 | $dota="f8"; $dotb="f9"; # top partial products carried into the next loop iteration | ||
| 134 | $A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; # a[j], a[j+1] as 32-bit halves in double format | ||
| 135 | $N0="f14"; $N1="f15"; $N2="f16"; $N3="f17"; # n[j], n[j+1] as 32-bit halves in double format | ||
| 136 | $T0a="f18"; $T0b="f19"; # T0..T3: accumulators for the partial-product columns | ||
| 137 | $T1a="f20"; $T1b="f21"; | ||
| 138 | $T2a="f22"; $T2b="f23"; | ||
| 139 | $T3a="f24"; $T3b="f25"; | ||
| 140 | |||
| 141 | # sp----------->+-------------------------------+ | ||
| 142 | # | saved sp | | ||
| 143 | # +-------------------------------+ | ||
| 144 | # | | | ||
| 145 | # +-------------------------------+ | ||
| 146 | # | 10 saved gpr, r14-r23 | | ||
| 147 | # . . | ||
| 148 | # . . | ||
| 149 | # +12*size_t +-------------------------------+ | ||
| 150 | # | 12 saved fpr, f14-f25 | | ||
| 151 | # . . | ||
| 152 | # . . | ||
| 153 | # +12*8 +-------------------------------+ | ||
| 154 | # | padding to 64 byte boundary | | ||
| 155 | # . . | ||
| 156 | # +X +-------------------------------+ | ||
| 157 | # | 16 gpr<->fpr transfer zone | | ||
| 158 | # . . | ||
| 159 | # . . | ||
| 160 | # +16*8 +-------------------------------+ | ||
| 161 | # | __int64 tmp[-1] | | ||
| 162 | # +-------------------------------+ | ||
| 163 | # | __int64 tmp[num] | | ||
| 164 | # . . | ||
| 165 | # . . | ||
| 166 | # . . | ||
| 167 | # +(num+1)*8 +-------------------------------+ | ||
| 168 | # | padding to 64 byte boundary | | ||
| 169 | # . . | ||
| 170 | # +X +-------------------------------+ | ||
| 171 | # | double nap_d[4*num] | | ||
| 172 | # . . | ||
| 173 | # . . | ||
| 174 | # . . | ||
| 175 | # +-------------------------------+ | ||
| 176 | |||
| 177 | $code=<<___; | ||
| 178 | .machine "any" | ||
| 179 | .text | ||
| 180 | |||
| 181 | .globl .$fname | ||
| 182 | .align 5 | ||
| 183 | .$fname: | ||
| 184 | cmpwi $num,4 | ||
| 185 | mr $rp,r3 ; $rp is reassigned | ||
| 186 | li r3,0 ; possible "not handled" return code | ||
| 187 | bltlr- | ||
| 188 | andi. r0,$num,1 ; $num has to be even | ||
| 189 | bnelr- | ||
| 190 | |||
| 191 | slwi $num,$num,3 ; num*=8 | ||
| 192 | li $i,-4096 | ||
| 193 | slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num | ||
| 194 | add $tp,$tp,$num ; place for tp[num+1] | ||
| 195 | addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE` | ||
| 196 | subf $tp,$tp,$sp ; $sp-$tp | ||
| 197 | and $tp,$tp,$i ; minimize TLB usage | ||
| 198 | subf $tp,$sp,$tp ; $tp-$sp | ||
| 199 | $STUX $sp,$sp,$tp ; alloca | ||
| 200 | |||
| 201 | $PUSH r14,`2*$SIZE_T`($sp) | ||
| 202 | $PUSH r15,`3*$SIZE_T`($sp) | ||
| 203 | $PUSH r16,`4*$SIZE_T`($sp) | ||
| 204 | $PUSH r17,`5*$SIZE_T`($sp) | ||
| 205 | $PUSH r18,`6*$SIZE_T`($sp) | ||
| 206 | $PUSH r19,`7*$SIZE_T`($sp) | ||
| 207 | $PUSH r20,`8*$SIZE_T`($sp) | ||
| 208 | $PUSH r21,`9*$SIZE_T`($sp) | ||
| 209 | $PUSH r22,`10*$SIZE_T`($sp) | ||
| 210 | $PUSH r23,`11*$SIZE_T`($sp) | ||
| 211 | stfd f14,`12*$SIZE_T+0`($sp) | ||
| 212 | stfd f15,`12*$SIZE_T+8`($sp) | ||
| 213 | stfd f16,`12*$SIZE_T+16`($sp) | ||
| 214 | stfd f17,`12*$SIZE_T+24`($sp) | ||
| 215 | stfd f18,`12*$SIZE_T+32`($sp) | ||
| 216 | stfd f19,`12*$SIZE_T+40`($sp) | ||
| 217 | stfd f20,`12*$SIZE_T+48`($sp) | ||
| 218 | stfd f21,`12*$SIZE_T+56`($sp) | ||
| 219 | stfd f22,`12*$SIZE_T+64`($sp) | ||
| 220 | stfd f23,`12*$SIZE_T+72`($sp) | ||
| 221 | stfd f24,`12*$SIZE_T+80`($sp) | ||
| 222 | stfd f25,`12*$SIZE_T+88`($sp) | ||
| 223 | |||
| 224 | ld $a0,0($ap) ; pull ap[0] value | ||
| 225 | ld $n0,0($n0) ; pull n0[0] value | ||
| 226 | ld $t3,0($bp) ; bp[0] | ||
| 227 | |||
| 228 | addi $tp,$sp,`$FRAME+$TRANSFER+8+64` | ||
| 229 | li $i,-64 | ||
| 230 | add $nap_d,$tp,$num | ||
| 231 | and $nap_d,$nap_d,$i ; align to 64 bytes | ||
| 232 | |||
| 233 | mulld $t7,$a0,$t3 ; ap[0]*bp[0] | ||
| 234 | ; nap_d is off by 1, because it's used with stfdu/lfdu | ||
| 235 | addi $nap_d,$nap_d,-8 | ||
| 236 | srwi $j,$num,`3+1` ; counter register, num/2 | ||
| 237 | mulld $t7,$t7,$n0 ; tp[0]*n0 | ||
| 238 | addi $j,$j,-1 | ||
| 239 | addi $tp,$sp,`$FRAME+$TRANSFER-8` | ||
| 240 | li $carry,0 | ||
| 241 | mtctr $j | ||
| 242 | |||
| 243 | ; transfer bp[0] to FPU as 4x16-bit values | ||
| 244 | extrdi $t0,$t3,16,48 | ||
| 245 | extrdi $t1,$t3,16,32 | ||
| 246 | extrdi $t2,$t3,16,16 | ||
| 247 | extrdi $t3,$t3,16,0 | ||
| 248 | std $t0,`$FRAME+0`($sp) | ||
| 249 | std $t1,`$FRAME+8`($sp) | ||
| 250 | std $t2,`$FRAME+16`($sp) | ||
| 251 | std $t3,`$FRAME+24`($sp) | ||
| 252 | ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values | ||
| 253 | extrdi $t4,$t7,16,48 | ||
| 254 | extrdi $t5,$t7,16,32 | ||
| 255 | extrdi $t6,$t7,16,16 | ||
| 256 | extrdi $t7,$t7,16,0 | ||
| 257 | std $t4,`$FRAME+32`($sp) | ||
| 258 | std $t5,`$FRAME+40`($sp) | ||
| 259 | std $t6,`$FRAME+48`($sp) | ||
| 260 | std $t7,`$FRAME+56`($sp) | ||
| 261 | lwz $t0,4($ap) ; load a[j] as 32-bit word pair | ||
| 262 | lwz $t1,0($ap) | ||
| 263 | lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair | ||
| 264 | lwz $t3,8($ap) | ||
| 265 | lwz $t4,4($np) ; load n[j] as 32-bit word pair | ||
| 266 | lwz $t5,0($np) | ||
| 267 | lwz $t6,12($np) ; load n[j+1] as 32-bit word pair | ||
| 268 | lwz $t7,8($np) | ||
| 269 | lfd $ba,`$FRAME+0`($sp) | ||
| 270 | lfd $bb,`$FRAME+8`($sp) | ||
| 271 | lfd $bc,`$FRAME+16`($sp) | ||
| 272 | lfd $bd,`$FRAME+24`($sp) | ||
| 273 | lfd $na,`$FRAME+32`($sp) | ||
| 274 | lfd $nb,`$FRAME+40`($sp) | ||
| 275 | lfd $nc,`$FRAME+48`($sp) | ||
| 276 | lfd $nd,`$FRAME+56`($sp) | ||
| 277 | std $t0,`$FRAME+64`($sp) | ||
| 278 | std $t1,`$FRAME+72`($sp) | ||
| 279 | std $t2,`$FRAME+80`($sp) | ||
| 280 | std $t3,`$FRAME+88`($sp) | ||
| 281 | std $t4,`$FRAME+96`($sp) | ||
| 282 | std $t5,`$FRAME+104`($sp) | ||
| 283 | std $t6,`$FRAME+112`($sp) | ||
| 284 | std $t7,`$FRAME+120`($sp) | ||
| 285 | fcfid $ba,$ba | ||
| 286 | fcfid $bb,$bb | ||
| 287 | fcfid $bc,$bc | ||
| 288 | fcfid $bd,$bd | ||
| 289 | fcfid $na,$na | ||
| 290 | fcfid $nb,$nb | ||
| 291 | fcfid $nc,$nc | ||
| 292 | fcfid $nd,$nd | ||
| 293 | |||
| 294 | lfd $A0,`$FRAME+64`($sp) | ||
| 295 | lfd $A1,`$FRAME+72`($sp) | ||
| 296 | lfd $A2,`$FRAME+80`($sp) | ||
| 297 | lfd $A3,`$FRAME+88`($sp) | ||
| 298 | lfd $N0,`$FRAME+96`($sp) | ||
| 299 | lfd $N1,`$FRAME+104`($sp) | ||
| 300 | lfd $N2,`$FRAME+112`($sp) | ||
| 301 | lfd $N3,`$FRAME+120`($sp) | ||
| 302 | fcfid $A0,$A0 | ||
| 303 | fcfid $A1,$A1 | ||
| 304 | fcfid $A2,$A2 | ||
| 305 | fcfid $A3,$A3 | ||
| 306 | fcfid $N0,$N0 | ||
| 307 | fcfid $N1,$N1 | ||
| 308 | fcfid $N2,$N2 | ||
| 309 | fcfid $N3,$N3 | ||
| 310 | addi $ap,$ap,16 | ||
| 311 | addi $np,$np,16 | ||
| 312 | |||
| 313 | fmul $T1a,$A1,$ba | ||
| 314 | fmul $T1b,$A1,$bb | ||
| 315 | stfd $A0,8($nap_d) ; save a[j] in double format | ||
| 316 | stfd $A1,16($nap_d) | ||
| 317 | fmul $T2a,$A2,$ba | ||
| 318 | fmul $T2b,$A2,$bb | ||
| 319 | stfd $A2,24($nap_d) ; save a[j+1] in double format | ||
| 320 | stfd $A3,32($nap_d) | ||
| 321 | fmul $T3a,$A3,$ba | ||
| 322 | fmul $T3b,$A3,$bb | ||
| 323 | stfd $N0,40($nap_d) ; save n[j] in double format | ||
| 324 | stfd $N1,48($nap_d) | ||
| 325 | fmul $T0a,$A0,$ba | ||
| 326 | fmul $T0b,$A0,$bb | ||
| 327 | stfd $N2,56($nap_d) ; save n[j+1] in double format | ||
| 328 | stfdu $N3,64($nap_d) | ||
| 329 | |||
| 330 | fmadd $T1a,$A0,$bc,$T1a | ||
| 331 | fmadd $T1b,$A0,$bd,$T1b | ||
| 332 | fmadd $T2a,$A1,$bc,$T2a | ||
| 333 | fmadd $T2b,$A1,$bd,$T2b | ||
| 334 | fmadd $T3a,$A2,$bc,$T3a | ||
| 335 | fmadd $T3b,$A2,$bd,$T3b | ||
| 336 | fmul $dota,$A3,$bc | ||
| 337 | fmul $dotb,$A3,$bd | ||
| 338 | |||
| 339 | fmadd $T1a,$N1,$na,$T1a | ||
| 340 | fmadd $T1b,$N1,$nb,$T1b | ||
| 341 | fmadd $T2a,$N2,$na,$T2a | ||
| 342 | fmadd $T2b,$N2,$nb,$T2b | ||
| 343 | fmadd $T3a,$N3,$na,$T3a | ||
| 344 | fmadd $T3b,$N3,$nb,$T3b | ||
| 345 | fmadd $T0a,$N0,$na,$T0a | ||
| 346 | fmadd $T0b,$N0,$nb,$T0b | ||
| 347 | |||
| 348 | fmadd $T1a,$N0,$nc,$T1a | ||
| 349 | fmadd $T1b,$N0,$nd,$T1b | ||
| 350 | fmadd $T2a,$N1,$nc,$T2a | ||
| 351 | fmadd $T2b,$N1,$nd,$T2b | ||
| 352 | fmadd $T3a,$N2,$nc,$T3a | ||
| 353 | fmadd $T3b,$N2,$nd,$T3b | ||
| 354 | fmadd $dota,$N3,$nc,$dota | ||
| 355 | fmadd $dotb,$N3,$nd,$dotb | ||
| 356 | |||
| 357 | fctid $T0a,$T0a | ||
| 358 | fctid $T0b,$T0b | ||
| 359 | fctid $T1a,$T1a | ||
| 360 | fctid $T1b,$T1b | ||
| 361 | fctid $T2a,$T2a | ||
| 362 | fctid $T2b,$T2b | ||
| 363 | fctid $T3a,$T3a | ||
| 364 | fctid $T3b,$T3b | ||
| 365 | |||
| 366 | stfd $T0a,`$FRAME+0`($sp) | ||
| 367 | stfd $T0b,`$FRAME+8`($sp) | ||
| 368 | stfd $T1a,`$FRAME+16`($sp) | ||
| 369 | stfd $T1b,`$FRAME+24`($sp) | ||
| 370 | stfd $T2a,`$FRAME+32`($sp) | ||
| 371 | stfd $T2b,`$FRAME+40`($sp) | ||
| 372 | stfd $T3a,`$FRAME+48`($sp) | ||
| 373 | stfd $T3b,`$FRAME+56`($sp) | ||
| 374 | |||
| 375 | .align 5 | ||
| 376 | L1st: | ||
| 377 | lwz $t0,4($ap) ; load a[j] as 32-bit word pair | ||
| 378 | lwz $t1,0($ap) | ||
| 379 | lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair | ||
| 380 | lwz $t3,8($ap) | ||
| 381 | lwz $t4,4($np) ; load n[j] as 32-bit word pair | ||
| 382 | lwz $t5,0($np) | ||
| 383 | lwz $t6,12($np) ; load n[j+1] as 32-bit word pair | ||
| 384 | lwz $t7,8($np) | ||
| 385 | std $t0,`$FRAME+64`($sp) | ||
| 386 | std $t1,`$FRAME+72`($sp) | ||
| 387 | std $t2,`$FRAME+80`($sp) | ||
| 388 | std $t3,`$FRAME+88`($sp) | ||
| 389 | std $t4,`$FRAME+96`($sp) | ||
| 390 | std $t5,`$FRAME+104`($sp) | ||
| 391 | std $t6,`$FRAME+112`($sp) | ||
| 392 | std $t7,`$FRAME+120`($sp) | ||
| 393 | ld $t0,`$FRAME+0`($sp) | ||
| 394 | ld $t1,`$FRAME+8`($sp) | ||
| 395 | ld $t2,`$FRAME+16`($sp) | ||
| 396 | ld $t3,`$FRAME+24`($sp) | ||
| 397 | ld $t4,`$FRAME+32`($sp) | ||
| 398 | ld $t5,`$FRAME+40`($sp) | ||
| 399 | ld $t6,`$FRAME+48`($sp) | ||
| 400 | ld $t7,`$FRAME+56`($sp) | ||
| 401 | lfd $A0,`$FRAME+64`($sp) | ||
| 402 | lfd $A1,`$FRAME+72`($sp) | ||
| 403 | lfd $A2,`$FRAME+80`($sp) | ||
| 404 | lfd $A3,`$FRAME+88`($sp) | ||
| 405 | lfd $N0,`$FRAME+96`($sp) | ||
| 406 | lfd $N1,`$FRAME+104`($sp) | ||
| 407 | lfd $N2,`$FRAME+112`($sp) | ||
| 408 | lfd $N3,`$FRAME+120`($sp) | ||
| 409 | fcfid $A0,$A0 | ||
| 410 | fcfid $A1,$A1 | ||
| 411 | fcfid $A2,$A2 | ||
| 412 | fcfid $A3,$A3 | ||
| 413 | fcfid $N0,$N0 | ||
| 414 | fcfid $N1,$N1 | ||
| 415 | fcfid $N2,$N2 | ||
| 416 | fcfid $N3,$N3 | ||
| 417 | addi $ap,$ap,16 | ||
| 418 | addi $np,$np,16 | ||
| 419 | |||
| 420 | fmul $T1a,$A1,$ba | ||
| 421 | fmul $T1b,$A1,$bb | ||
| 422 | fmul $T2a,$A2,$ba | ||
| 423 | fmul $T2b,$A2,$bb | ||
| 424 | stfd $A0,8($nap_d) ; save a[j] in double format | ||
| 425 | stfd $A1,16($nap_d) | ||
| 426 | fmul $T3a,$A3,$ba | ||
| 427 | fmul $T3b,$A3,$bb | ||
| 428 | fmadd $T0a,$A0,$ba,$dota | ||
| 429 | fmadd $T0b,$A0,$bb,$dotb | ||
| 430 | stfd $A2,24($nap_d) ; save a[j+1] in double format | ||
| 431 | stfd $A3,32($nap_d) | ||
| 432 | |||
| 433 | fmadd $T1a,$A0,$bc,$T1a | ||
| 434 | fmadd $T1b,$A0,$bd,$T1b | ||
| 435 | fmadd $T2a,$A1,$bc,$T2a | ||
| 436 | fmadd $T2b,$A1,$bd,$T2b | ||
| 437 | stfd $N0,40($nap_d) ; save n[j] in double format | ||
| 438 | stfd $N1,48($nap_d) | ||
| 439 | fmadd $T3a,$A2,$bc,$T3a | ||
| 440 | fmadd $T3b,$A2,$bd,$T3b | ||
| 441 | add $t0,$t0,$carry ; can not overflow | ||
| 442 | fmul $dota,$A3,$bc | ||
| 443 | fmul $dotb,$A3,$bd | ||
| 444 | stfd $N2,56($nap_d) ; save n[j+1] in double format | ||
| 445 | stfdu $N3,64($nap_d) | ||
| 446 | srdi $carry,$t0,16 | ||
| 447 | add $t1,$t1,$carry | ||
| 448 | srdi $carry,$t1,16 | ||
| 449 | |||
| 450 | fmadd $T1a,$N1,$na,$T1a | ||
| 451 | fmadd $T1b,$N1,$nb,$T1b | ||
| 452 | insrdi $t0,$t1,16,32 | ||
| 453 | fmadd $T2a,$N2,$na,$T2a | ||
| 454 | fmadd $T2b,$N2,$nb,$T2b | ||
| 455 | add $t2,$t2,$carry | ||
| 456 | fmadd $T3a,$N3,$na,$T3a | ||
| 457 | fmadd $T3b,$N3,$nb,$T3b | ||
| 458 | srdi $carry,$t2,16 | ||
| 459 | fmadd $T0a,$N0,$na,$T0a | ||
| 460 | fmadd $T0b,$N0,$nb,$T0b | ||
| 461 | insrdi $t0,$t2,16,16 | ||
| 462 | add $t3,$t3,$carry | ||
| 463 | srdi $carry,$t3,16 | ||
| 464 | |||
| 465 | fmadd $T1a,$N0,$nc,$T1a | ||
| 466 | fmadd $T1b,$N0,$nd,$T1b | ||
| 467 | insrdi $t0,$t3,16,0 ; 0..63 bits | ||
| 468 | fmadd $T2a,$N1,$nc,$T2a | ||
| 469 | fmadd $T2b,$N1,$nd,$T2b | ||
| 470 | add $t4,$t4,$carry | ||
| 471 | fmadd $T3a,$N2,$nc,$T3a | ||
| 472 | fmadd $T3b,$N2,$nd,$T3b | ||
| 473 | srdi $carry,$t4,16 | ||
| 474 | fmadd $dota,$N3,$nc,$dota | ||
| 475 | fmadd $dotb,$N3,$nd,$dotb | ||
| 476 | add $t5,$t5,$carry | ||
| 477 | srdi $carry,$t5,16 | ||
| 478 | insrdi $t4,$t5,16,32 | ||
| 479 | |||
| 480 | fctid $T0a,$T0a | ||
| 481 | fctid $T0b,$T0b | ||
| 482 | add $t6,$t6,$carry | ||
| 483 | fctid $T1a,$T1a | ||
| 484 | fctid $T1b,$T1b | ||
| 485 | srdi $carry,$t6,16 | ||
| 486 | fctid $T2a,$T2a | ||
| 487 | fctid $T2b,$T2b | ||
| 488 | insrdi $t4,$t6,16,16 | ||
| 489 | fctid $T3a,$T3a | ||
| 490 | fctid $T3b,$T3b | ||
| 491 | add $t7,$t7,$carry | ||
| 492 | insrdi $t4,$t7,16,0 ; 64..127 bits | ||
| 493 | srdi $carry,$t7,16 ; upper 33 bits | ||
| 494 | |||
| 495 | stfd $T0a,`$FRAME+0`($sp) | ||
| 496 | stfd $T0b,`$FRAME+8`($sp) | ||
| 497 | stfd $T1a,`$FRAME+16`($sp) | ||
| 498 | stfd $T1b,`$FRAME+24`($sp) | ||
| 499 | stfd $T2a,`$FRAME+32`($sp) | ||
| 500 | stfd $T2b,`$FRAME+40`($sp) | ||
| 501 | stfd $T3a,`$FRAME+48`($sp) | ||
| 502 | stfd $T3b,`$FRAME+56`($sp) | ||
| 503 | std $t0,8($tp) ; tp[j-1] | ||
| 504 | stdu $t4,16($tp) ; tp[j] | ||
| 505 | bdnz- L1st | ||
| 506 | |||
| 507 | fctid $dota,$dota | ||
| 508 | fctid $dotb,$dotb | ||
| 509 | |||
| 510 | ld $t0,`$FRAME+0`($sp) | ||
| 511 | ld $t1,`$FRAME+8`($sp) | ||
| 512 | ld $t2,`$FRAME+16`($sp) | ||
| 513 | ld $t3,`$FRAME+24`($sp) | ||
| 514 | ld $t4,`$FRAME+32`($sp) | ||
| 515 | ld $t5,`$FRAME+40`($sp) | ||
| 516 | ld $t6,`$FRAME+48`($sp) | ||
| 517 | ld $t7,`$FRAME+56`($sp) | ||
| 518 | stfd $dota,`$FRAME+64`($sp) | ||
| 519 | stfd $dotb,`$FRAME+72`($sp) | ||
| 520 | |||
| 521 | add $t0,$t0,$carry ; can not overflow | ||
| 522 | srdi $carry,$t0,16 | ||
| 523 | add $t1,$t1,$carry | ||
| 524 | srdi $carry,$t1,16 | ||
| 525 | insrdi $t0,$t1,16,32 | ||
| 526 | add $t2,$t2,$carry | ||
| 527 | srdi $carry,$t2,16 | ||
| 528 | insrdi $t0,$t2,16,16 | ||
| 529 | add $t3,$t3,$carry | ||
| 530 | srdi $carry,$t3,16 | ||
| 531 | insrdi $t0,$t3,16,0 ; 0..63 bits | ||
| 532 | add $t4,$t4,$carry | ||
| 533 | srdi $carry,$t4,16 | ||
| 534 | add $t5,$t5,$carry | ||
| 535 | srdi $carry,$t5,16 | ||
| 536 | insrdi $t4,$t5,16,32 | ||
| 537 | add $t6,$t6,$carry | ||
| 538 | srdi $carry,$t6,16 | ||
| 539 | insrdi $t4,$t6,16,16 | ||
| 540 | add $t7,$t7,$carry | ||
| 541 | insrdi $t4,$t7,16,0 ; 64..127 bits | ||
| 542 | srdi $carry,$t7,16 ; upper 33 bits | ||
| 543 | ld $t6,`$FRAME+64`($sp) | ||
| 544 | ld $t7,`$FRAME+72`($sp) | ||
| 545 | |||
| 546 | std $t0,8($tp) ; tp[j-1] | ||
| 547 | stdu $t4,16($tp) ; tp[j] | ||
| 548 | |||
| 549 | add $t6,$t6,$carry ; can not overflow | ||
| 550 | srdi $carry,$t6,16 | ||
| 551 | add $t7,$t7,$carry | ||
| 552 | insrdi $t6,$t7,48,0 | ||
| 553 | srdi $ovf,$t7,48 | ||
| 554 | std $t6,8($tp) ; tp[num-1] | ||
| 555 | |||
| 556 | slwi $t7,$num,2 | ||
| 557 | subf $nap_d,$t7,$nap_d ; rewind pointer | ||
| 558 | |||
| 559 | li $i,8 ; i=1 | ||
| 560 | .align 5 | ||
| 561 | Louter: | ||
| 562 | ldx $t3,$bp,$i ; bp[i] | ||
| 563 | ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] | ||
| 564 | mulld $t7,$a0,$t3 ; ap[0]*bp[i] | ||
| 565 | |||
| 566 | addi $tp,$sp,`$FRAME+$TRANSFER` | ||
| 567 | add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0] | ||
| 568 | li $carry,0 | ||
| 569 | mulld $t7,$t7,$n0 ; tp[0]*n0 | ||
| 570 | mtctr $j | ||
| 571 | |||
| 572 | ; transfer bp[i] to FPU as 4x16-bit values | ||
| 573 | extrdi $t0,$t3,16,48 | ||
| 574 | extrdi $t1,$t3,16,32 | ||
| 575 | extrdi $t2,$t3,16,16 | ||
| 576 | extrdi $t3,$t3,16,0 | ||
| 577 | std $t0,`$FRAME+0`($sp) | ||
| 578 | std $t1,`$FRAME+8`($sp) | ||
| 579 | std $t2,`$FRAME+16`($sp) | ||
| 580 | std $t3,`$FRAME+24`($sp) | ||
| 581 | ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values | ||
| 582 | extrdi $t4,$t7,16,48 | ||
| 583 | extrdi $t5,$t7,16,32 | ||
| 584 | extrdi $t6,$t7,16,16 | ||
| 585 | extrdi $t7,$t7,16,0 | ||
| 586 | std $t4,`$FRAME+32`($sp) | ||
| 587 | std $t5,`$FRAME+40`($sp) | ||
| 588 | std $t6,`$FRAME+48`($sp) | ||
| 589 | std $t7,`$FRAME+56`($sp) | ||
| 590 | |||
| 591 | lfd $A0,8($nap_d) ; load a[j] in double format | ||
| 592 | lfd $A1,16($nap_d) | ||
| 593 | lfd $A2,24($nap_d) ; load a[j+1] in double format | ||
| 594 | lfd $A3,32($nap_d) | ||
| 595 | lfd $N0,40($nap_d) ; load n[j] in double format | ||
| 596 | lfd $N1,48($nap_d) | ||
| 597 | lfd $N2,56($nap_d) ; load n[j+1] in double format | ||
| 598 | lfdu $N3,64($nap_d) | ||
| 599 | |||
| 600 | lfd $ba,`$FRAME+0`($sp) | ||
| 601 | lfd $bb,`$FRAME+8`($sp) | ||
| 602 | lfd $bc,`$FRAME+16`($sp) | ||
| 603 | lfd $bd,`$FRAME+24`($sp) | ||
| 604 | lfd $na,`$FRAME+32`($sp) | ||
| 605 | lfd $nb,`$FRAME+40`($sp) | ||
| 606 | lfd $nc,`$FRAME+48`($sp) | ||
| 607 | lfd $nd,`$FRAME+56`($sp) | ||
| 608 | |||
| 609 | fcfid $ba,$ba | ||
| 610 | fcfid $bb,$bb | ||
| 611 | fcfid $bc,$bc | ||
| 612 | fcfid $bd,$bd | ||
| 613 | fcfid $na,$na | ||
| 614 | fcfid $nb,$nb | ||
| 615 | fcfid $nc,$nc | ||
| 616 | fcfid $nd,$nd | ||
| 617 | |||
| 618 | fmul $T1a,$A1,$ba | ||
| 619 | fmul $T1b,$A1,$bb | ||
| 620 | fmul $T2a,$A2,$ba | ||
| 621 | fmul $T2b,$A2,$bb | ||
| 622 | fmul $T3a,$A3,$ba | ||
| 623 | fmul $T3b,$A3,$bb | ||
| 624 | fmul $T0a,$A0,$ba | ||
| 625 | fmul $T0b,$A0,$bb | ||
| 626 | |||
| 627 | fmadd $T1a,$A0,$bc,$T1a | ||
| 628 | fmadd $T1b,$A0,$bd,$T1b | ||
| 629 | fmadd $T2a,$A1,$bc,$T2a | ||
| 630 | fmadd $T2b,$A1,$bd,$T2b | ||
| 631 | fmadd $T3a,$A2,$bc,$T3a | ||
| 632 | fmadd $T3b,$A2,$bd,$T3b | ||
| 633 | fmul $dota,$A3,$bc | ||
| 634 | fmul $dotb,$A3,$bd | ||
| 635 | |||
| 636 | fmadd $T1a,$N1,$na,$T1a | ||
| 637 | fmadd $T1b,$N1,$nb,$T1b | ||
| 638 | lfd $A0,8($nap_d) ; load a[j] in double format | ||
| 639 | lfd $A1,16($nap_d) | ||
| 640 | fmadd $T2a,$N2,$na,$T2a | ||
| 641 | fmadd $T2b,$N2,$nb,$T2b | ||
| 642 | lfd $A2,24($nap_d) ; load a[j+1] in double format | ||
| 643 | lfd $A3,32($nap_d) | ||
| 644 | fmadd $T3a,$N3,$na,$T3a | ||
| 645 | fmadd $T3b,$N3,$nb,$T3b | ||
| 646 | fmadd $T0a,$N0,$na,$T0a | ||
| 647 | fmadd $T0b,$N0,$nb,$T0b | ||
| 648 | |||
| 649 | fmadd $T1a,$N0,$nc,$T1a | ||
| 650 | fmadd $T1b,$N0,$nd,$T1b | ||
| 651 | fmadd $T2a,$N1,$nc,$T2a | ||
| 652 | fmadd $T2b,$N1,$nd,$T2b | ||
| 653 | fmadd $T3a,$N2,$nc,$T3a | ||
| 654 | fmadd $T3b,$N2,$nd,$T3b | ||
| 655 | fmadd $dota,$N3,$nc,$dota | ||
| 656 | fmadd $dotb,$N3,$nd,$dotb | ||
| 657 | |||
| 658 | fctid $T0a,$T0a | ||
| 659 | fctid $T0b,$T0b | ||
| 660 | fctid $T1a,$T1a | ||
| 661 | fctid $T1b,$T1b | ||
| 662 | fctid $T2a,$T2a | ||
| 663 | fctid $T2b,$T2b | ||
| 664 | fctid $T3a,$T3a | ||
| 665 | fctid $T3b,$T3b | ||
| 666 | |||
| 667 | stfd $T0a,`$FRAME+0`($sp) | ||
| 668 | stfd $T0b,`$FRAME+8`($sp) | ||
| 669 | stfd $T1a,`$FRAME+16`($sp) | ||
| 670 | stfd $T1b,`$FRAME+24`($sp) | ||
| 671 | stfd $T2a,`$FRAME+32`($sp) | ||
| 672 | stfd $T2b,`$FRAME+40`($sp) | ||
| 673 | stfd $T3a,`$FRAME+48`($sp) | ||
| 674 | stfd $T3b,`$FRAME+56`($sp) | ||
| 675 | |||
| 676 | .align 5 | ||
| 677 | Linner: | ||
| 678 | fmul $T1a,$A1,$ba | ||
| 679 | fmul $T1b,$A1,$bb | ||
| 680 | fmul $T2a,$A2,$ba | ||
| 681 | fmul $T2b,$A2,$bb | ||
| 682 | lfd $N0,40($nap_d) ; load n[j] in double format | ||
| 683 | lfd $N1,48($nap_d) | ||
| 684 | fmul $T3a,$A3,$ba | ||
| 685 | fmul $T3b,$A3,$bb | ||
| 686 | fmadd $T0a,$A0,$ba,$dota | ||
| 687 | fmadd $T0b,$A0,$bb,$dotb | ||
| 688 | lfd $N2,56($nap_d) ; load n[j+1] in double format | ||
| 689 | lfdu $N3,64($nap_d) | ||
| 690 | |||
| 691 | fmadd $T1a,$A0,$bc,$T1a | ||
| 692 | fmadd $T1b,$A0,$bd,$T1b | ||
| 693 | fmadd $T2a,$A1,$bc,$T2a | ||
| 694 | fmadd $T2b,$A1,$bd,$T2b | ||
| 695 | lfd $A0,8($nap_d) ; load a[j] in double format | ||
| 696 | lfd $A1,16($nap_d) | ||
| 697 | fmadd $T3a,$A2,$bc,$T3a | ||
| 698 | fmadd $T3b,$A2,$bd,$T3b | ||
| 699 | fmul $dota,$A3,$bc | ||
| 700 | fmul $dotb,$A3,$bd | ||
| 701 | lfd $A2,24($nap_d) ; load a[j+1] in double format | ||
| 702 | lfd $A3,32($nap_d) | ||
| 703 | |||
| 704 | fmadd $T1a,$N1,$na,$T1a | ||
| 705 | fmadd $T1b,$N1,$nb,$T1b | ||
| 706 | ld $t0,`$FRAME+0`($sp) | ||
| 707 | ld $t1,`$FRAME+8`($sp) | ||
| 708 | fmadd $T2a,$N2,$na,$T2a | ||
| 709 | fmadd $T2b,$N2,$nb,$T2b | ||
| 710 | ld $t2,`$FRAME+16`($sp) | ||
| 711 | ld $t3,`$FRAME+24`($sp) | ||
| 712 | fmadd $T3a,$N3,$na,$T3a | ||
| 713 | fmadd $T3b,$N3,$nb,$T3b | ||
| 714 | add $t0,$t0,$carry ; can not overflow | ||
| 715 | ld $t4,`$FRAME+32`($sp) | ||
| 716 | ld $t5,`$FRAME+40`($sp) | ||
| 717 | fmadd $T0a,$N0,$na,$T0a | ||
| 718 | fmadd $T0b,$N0,$nb,$T0b | ||
| 719 | srdi $carry,$t0,16 | ||
| 720 | add $t1,$t1,$carry | ||
| 721 | srdi $carry,$t1,16 | ||
| 722 | ld $t6,`$FRAME+48`($sp) | ||
| 723 | ld $t7,`$FRAME+56`($sp) | ||
| 724 | |||
| 725 | fmadd $T1a,$N0,$nc,$T1a | ||
| 726 | fmadd $T1b,$N0,$nd,$T1b | ||
| 727 | insrdi $t0,$t1,16,32 | ||
| 728 | ld $t1,8($tp) ; tp[j] | ||
| 729 | fmadd $T2a,$N1,$nc,$T2a | ||
| 730 | fmadd $T2b,$N1,$nd,$T2b | ||
| 731 | add $t2,$t2,$carry | ||
| 732 | fmadd $T3a,$N2,$nc,$T3a | ||
| 733 | fmadd $T3b,$N2,$nd,$T3b | ||
| 734 | srdi $carry,$t2,16 | ||
| 735 | insrdi $t0,$t2,16,16 | ||
| 736 | fmadd $dota,$N3,$nc,$dota | ||
| 737 | fmadd $dotb,$N3,$nd,$dotb | ||
| 738 | add $t3,$t3,$carry | ||
| 739 | ldu $t2,16($tp) ; tp[j+1] | ||
| 740 | srdi $carry,$t3,16 | ||
| 741 | insrdi $t0,$t3,16,0 ; 0..63 bits | ||
| 742 | add $t4,$t4,$carry | ||
| 743 | |||
| 744 | fctid $T0a,$T0a | ||
| 745 | fctid $T0b,$T0b | ||
| 746 | srdi $carry,$t4,16 | ||
| 747 | fctid $T1a,$T1a | ||
| 748 | fctid $T1b,$T1b | ||
| 749 | add $t5,$t5,$carry | ||
| 750 | fctid $T2a,$T2a | ||
| 751 | fctid $T2b,$T2b | ||
| 752 | srdi $carry,$t5,16 | ||
| 753 | insrdi $t4,$t5,16,32 | ||
| 754 | fctid $T3a,$T3a | ||
| 755 | fctid $T3b,$T3b | ||
| 756 | add $t6,$t6,$carry | ||
| 757 | srdi $carry,$t6,16 | ||
| 758 | insrdi $t4,$t6,16,16 | ||
| 759 | |||
| 760 | stfd $T0a,`$FRAME+0`($sp) | ||
| 761 | stfd $T0b,`$FRAME+8`($sp) | ||
| 762 | add $t7,$t7,$carry | ||
| 763 | addc $t3,$t0,$t1 | ||
| 764 | stfd $T1a,`$FRAME+16`($sp) | ||
| 765 | stfd $T1b,`$FRAME+24`($sp) | ||
| 766 | insrdi $t4,$t7,16,0 ; 64..127 bits | ||
| 767 | srdi $carry,$t7,16 ; upper 33 bits | ||
| 768 | stfd $T2a,`$FRAME+32`($sp) | ||
| 769 | stfd $T2b,`$FRAME+40`($sp) | ||
| 770 | adde $t5,$t4,$t2 | ||
| 771 | stfd $T3a,`$FRAME+48`($sp) | ||
| 772 | stfd $T3b,`$FRAME+56`($sp) | ||
| 773 | addze $carry,$carry | ||
| 774 | std $t3,-16($tp) ; tp[j-1] | ||
| 775 | std $t5,-8($tp) ; tp[j] | ||
| 776 | bdnz- Linner | ||
| 777 | |||
| 778 | fctid $dota,$dota | ||
| 779 | fctid $dotb,$dotb | ||
| 780 | ld $t0,`$FRAME+0`($sp) | ||
| 781 | ld $t1,`$FRAME+8`($sp) | ||
| 782 | ld $t2,`$FRAME+16`($sp) | ||
| 783 | ld $t3,`$FRAME+24`($sp) | ||
| 784 | ld $t4,`$FRAME+32`($sp) | ||
| 785 | ld $t5,`$FRAME+40`($sp) | ||
| 786 | ld $t6,`$FRAME+48`($sp) | ||
| 787 | ld $t7,`$FRAME+56`($sp) | ||
| 788 | stfd $dota,`$FRAME+64`($sp) | ||
| 789 | stfd $dotb,`$FRAME+72`($sp) | ||
| 790 | |||
| 791 | add $t0,$t0,$carry ; can not overflow | ||
| 792 | srdi $carry,$t0,16 | ||
| 793 | add $t1,$t1,$carry | ||
| 794 | srdi $carry,$t1,16 | ||
| 795 | insrdi $t0,$t1,16,32 | ||
| 796 | add $t2,$t2,$carry | ||
| 797 | ld $t1,8($tp) ; tp[j] | ||
| 798 | srdi $carry,$t2,16 | ||
| 799 | insrdi $t0,$t2,16,16 | ||
| 800 | add $t3,$t3,$carry | ||
| 801 | ldu $t2,16($tp) ; tp[j+1] | ||
| 802 | srdi $carry,$t3,16 | ||
| 803 | insrdi $t0,$t3,16,0 ; 0..63 bits | ||
| 804 | add $t4,$t4,$carry | ||
| 805 | srdi $carry,$t4,16 | ||
| 806 | add $t5,$t5,$carry | ||
| 807 | srdi $carry,$t5,16 | ||
| 808 | insrdi $t4,$t5,16,32 | ||
| 809 | add $t6,$t6,$carry | ||
| 810 | srdi $carry,$t6,16 | ||
| 811 | insrdi $t4,$t6,16,16 | ||
| 812 | add $t7,$t7,$carry | ||
| 813 | insrdi $t4,$t7,16,0 ; 64..127 bits | ||
| 814 | srdi $carry,$t7,16 ; upper 33 bits | ||
| 815 | ld $t6,`$FRAME+64`($sp) | ||
| 816 | ld $t7,`$FRAME+72`($sp) | ||
| 817 | |||
| 818 | addc $t3,$t0,$t1 | ||
| 819 | adde $t5,$t4,$t2 | ||
| 820 | addze $carry,$carry | ||
| 821 | |||
| 822 | std $t3,-16($tp) ; tp[j-1] | ||
| 823 | std $t5,-8($tp) ; tp[j] | ||
| 824 | |||
| 825 | add $carry,$carry,$ovf ; comsume upmost overflow | ||
| 826 | add $t6,$t6,$carry ; can not overflow | ||
| 827 | srdi $carry,$t6,16 | ||
| 828 | add $t7,$t7,$carry | ||
| 829 | insrdi $t6,$t7,48,0 | ||
| 830 | srdi $ovf,$t7,48 | ||
| 831 | std $t6,0($tp) ; tp[num-1] | ||
| 832 | |||
| 833 | slwi $t7,$num,2 | ||
| 834 | addi $i,$i,8 | ||
| 835 | subf $nap_d,$t7,$nap_d ; rewind pointer | ||
| 836 | cmpw $i,$num | ||
| 837 | blt- Louter | ||
| 838 | |||
| 839 | subf $np,$num,$np ; rewind np | ||
| 840 | addi $j,$j,1 ; restore counter | ||
| 841 | subfc $i,$i,$i ; j=0 and "clear" XER[CA] | ||
| 842 | addi $tp,$sp,`$FRAME+$TRANSFER+8` | ||
| 843 | addi $t4,$sp,`$FRAME+$TRANSFER+16` | ||
| 844 | addi $t5,$np,8 | ||
| 845 | addi $t6,$rp,8 | ||
| 846 | mtctr $j | ||
| 847 | |||
| 848 | .align 4 | ||
| 849 | Lsub: ldx $t0,$tp,$i | ||
| 850 | ldx $t1,$np,$i | ||
| 851 | ldx $t2,$t4,$i | ||
| 852 | ldx $t3,$t5,$i | ||
| 853 | subfe $t0,$t1,$t0 ; tp[j]-np[j] | ||
| 854 | subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1] | ||
| 855 | stdx $t0,$rp,$i | ||
| 856 | stdx $t2,$t6,$i | ||
| 857 | addi $i,$i,16 | ||
| 858 | bdnz- Lsub | ||
| 859 | |||
| 860 | li $i,0 | ||
| 861 | subfe $ovf,$i,$ovf ; handle upmost overflow bit | ||
| 862 | and $ap,$tp,$ovf | ||
| 863 | andc $np,$rp,$ovf | ||
| 864 | or $ap,$ap,$np ; ap=borrow?tp:rp | ||
| 865 | addi $t7,$ap,8 | ||
| 866 | mtctr $j | ||
| 867 | |||
| 868 | .align 4 | ||
| 869 | Lcopy: ; copy or in-place refresh | ||
| 870 | ldx $t0,$ap,$i | ||
| 871 | ldx $t1,$t7,$i | ||
| 872 | std $i,8($nap_d) ; zap nap_d | ||
| 873 | std $i,16($nap_d) | ||
| 874 | std $i,24($nap_d) | ||
| 875 | std $i,32($nap_d) | ||
| 876 | std $i,40($nap_d) | ||
| 877 | std $i,48($nap_d) | ||
| 878 | std $i,56($nap_d) | ||
| 879 | stdu $i,64($nap_d) | ||
| 880 | stdx $t0,$rp,$i | ||
| 881 | stdx $t1,$t6,$i | ||
| 882 | stdx $i,$tp,$i ; zap tp at once | ||
| 883 | stdx $i,$t4,$i | ||
| 884 | addi $i,$i,16 | ||
| 885 | bdnz- Lcopy | ||
| 886 | |||
| 887 | $POP r14,`2*$SIZE_T`($sp) | ||
| 888 | $POP r15,`3*$SIZE_T`($sp) | ||
| 889 | $POP r16,`4*$SIZE_T`($sp) | ||
| 890 | $POP r17,`5*$SIZE_T`($sp) | ||
| 891 | $POP r18,`6*$SIZE_T`($sp) | ||
| 892 | $POP r19,`7*$SIZE_T`($sp) | ||
| 893 | $POP r20,`8*$SIZE_T`($sp) | ||
| 894 | $POP r21,`9*$SIZE_T`($sp) | ||
| 895 | $POP r22,`10*$SIZE_T`($sp) | ||
| 896 | $POP r23,`11*$SIZE_T`($sp) | ||
| 897 | lfd f14,`12*$SIZE_T+0`($sp) | ||
| 898 | lfd f15,`12*$SIZE_T+8`($sp) | ||
| 899 | lfd f16,`12*$SIZE_T+16`($sp) | ||
| 900 | lfd f17,`12*$SIZE_T+24`($sp) | ||
| 901 | lfd f18,`12*$SIZE_T+32`($sp) | ||
| 902 | lfd f19,`12*$SIZE_T+40`($sp) | ||
| 903 | lfd f20,`12*$SIZE_T+48`($sp) | ||
| 904 | lfd f21,`12*$SIZE_T+56`($sp) | ||
| 905 | lfd f22,`12*$SIZE_T+64`($sp) | ||
| 906 | lfd f23,`12*$SIZE_T+72`($sp) | ||
| 907 | lfd f24,`12*$SIZE_T+80`($sp) | ||
| 908 | lfd f25,`12*$SIZE_T+88`($sp) | ||
| 909 | $POP $sp,0($sp) | ||
| 910 | li r3,1 ; signal "handled" | ||
| 911 | blr | ||
| 912 | .long 0 | ||
| 913 | .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>" | ||
| 914 | ___ | ||
| 915 | |||
| 916 | $code =~ s/\`([^\`]*)\`/eval $1/gem; # evaluate every backquoted expression in $code and splice in its value | ||
| 917 | print $code; | ||
| 918 | close STDOUT or die "error closing STDOUT: $!"; # STDOUT is a pipe to $xlate: close fails on write errors or a non-zero translator exit | ||
diff --git a/src/lib/libcrypto/bn/asm/s390x-mont.pl b/src/lib/libcrypto/bn/asm/s390x-mont.pl new file mode 100644 index 0000000000..d23251033b --- /dev/null +++ b/src/lib/libcrypto/bn/asm/s390x-mont.pl | |||
| @@ -0,0 +1,225 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # April 2007. | ||
| 11 | # | ||
| 12 | # Performance improvement over vanilla C code varies from 85% to 45% | ||
| 13 | # depending on key length and benchmark. Unfortunately in this context | ||
| 14 | # these are not very impressive results [for code that utilizes "wide" | ||
| 15 | # 64x64=128-bit multiplication, which is not commonly available to C | ||
| 16 | # programmers], at least hand-coded bn_asm.c replacement is known to | ||
| 17 | # provide 30-40% better results for longest keys. Well, on second | ||
| 18 | # thought it's not very surprising, because z-CPUs are single-issue | ||
| 19 | # and _strictly_ in-order execution, while bn_mul_mont is more or less | ||
| 20 | # dependent on CPU ability to pipe-line instructions and have several | ||
| 21 | # of them "in-flight" at the same time. I mean while other methods, | ||
| 22 | # for example Karatsuba, aim to minimize the number of multiplications | ||
| 23 | # at the cost of more other operations, bn_mul_mont aims to neatly | ||
| 24 | # "overlap" multiplications and the other operations [and on most | ||
| 25 | # platforms even minimize the amount of the other operations, in | ||
| 26 | # particular references to memory]. But it's possible to improve this | ||
| 27 | # module performance by implementing dedicated squaring code-path and | ||
| 28 | # possibly by unrolling loops... | ||
| 29 | |||
| 30 | # January 2009. | ||
| 31 | # | ||
| 32 | # Reschedule to minimize/avoid Address Generation Interlock hazard, | ||
| 33 | # make inner loops counter-based. | ||
| 34 | |||
| 35 | $mn0="%r0"; # m1 = "tp[0]"*n0, the multiplier applied to np[] in each pass | ||
| 36 | $num="%r1"; # num, pulled from the stack; held in bytes during alloca, in words otherwise | ||
| 37 | |||
| 38 | # int bn_mul_mont( | ||
| 39 | $rp="%r2"; # BN_ULONG *rp, | ||
| 40 | $ap="%r3"; # const BN_ULONG *ap, | ||
| 41 | $bp="%r4"; # const BN_ULONG *bp, | ||
| 42 | $np="%r5"; # const BN_ULONG *np, | ||
| 43 | $n0="%r6"; # const BN_ULONG *n0, | ||
| 44 | #$num="160(%r15)" # int num); | ||
| 45 | |||
| 46 | $bi="%r2"; # zaps rp | ||
| 47 | $j="%r7"; # byte index j into ap/np/tp; also scratch for -$num during alloca | ||
| 48 | |||
| 49 | $ahi="%r8"; # high word of ap[j]*bp[i] (even half of the mlgr pair) | ||
| 50 | $alo="%r9"; # ap[j] on load, low word of ap[j]*bp[i] after mlgr | ||
| 51 | $nhi="%r10"; # high word of np[j]*m1 (even half of the mlgr pair) | ||
| 52 | $nlo="%r11"; # np[j] on load, low word of np[j]*m1 after mlgr | ||
| 53 | $AHI="%r12"; # carry word of the ap[]*bp[i] chain, later the upmost overflow bit | ||
| 54 | $NHI="%r13"; # carry word of the np[]*m1 chain | ||
| 55 | $count="%r14"; # brct loop counter; %r14 is saved by stmg and reloaded by lmg | ||
| 56 | $sp="%r15"; # stack pointer | ||
| 57 | |||
| 58 | $code.=<<___; | ||
| 59 | .text | ||
| 60 | .globl bn_mul_mont | ||
| 61 | .type bn_mul_mont,\@function | ||
| 62 | bn_mul_mont: | ||
| 63 | lgf $num,164($sp) # pull $num | ||
| 64 | sla $num,3 # $num to enumerate bytes | ||
| 65 | la $bp,0($num,$bp) | ||
| 66 | |||
| 67 | stg %r2,16($sp) | ||
| 68 | |||
| 69 | cghi $num,16 # | ||
| 70 | lghi %r2,0 # | ||
| 71 | blr %r14 # if($num<16) return 0; | ||
| 72 | cghi $num,128 # | ||
| 73 | bhr %r14 # if($num>128) return 0; | ||
| 74 | |||
| 75 | stmg %r3,%r15,24($sp) | ||
| 76 | |||
| 77 | lghi $rp,-160-8 # leave room for carry bit | ||
| 78 | lcgr $j,$num # -$num | ||
| 79 | lgr %r0,$sp | ||
| 80 | la $rp,0($rp,$sp) | ||
| 81 | la $sp,0($j,$rp) # alloca | ||
| 82 | stg %r0,0($sp) # back chain | ||
| 83 | |||
| 84 | sra $num,3 # restore $num | ||
| 85 | la $bp,0($j,$bp) # restore $bp | ||
| 86 | ahi $num,-1 # adjust $num for inner loop | ||
| 87 | lg $n0,0($n0) # pull n0 | ||
| 88 | |||
| 89 | lg $bi,0($bp) | ||
| 90 | lg $alo,0($ap) | ||
| 91 | mlgr $ahi,$bi # ap[0]*bp[0] | ||
| 92 | lgr $AHI,$ahi | ||
| 93 | |||
| 94 | lgr $mn0,$alo # "tp[0]"*n0 | ||
| 95 | msgr $mn0,$n0 | ||
| 96 | |||
| 97 | lg $nlo,0($np) # | ||
| 98 | mlgr $nhi,$mn0 # np[0]*m1 | ||
| 99 | algr $nlo,$alo # +="tp[0]" | ||
| 100 | lghi $NHI,0 | ||
| 101 | alcgr $NHI,$nhi | ||
| 102 | |||
| 103 | la $j,8(%r0) # j=1 | ||
| 104 | lr $count,$num | ||
| 105 | |||
| 106 | .align 16 | ||
| 107 | .L1st: | ||
| 108 | lg $alo,0($j,$ap) | ||
| 109 | mlgr $ahi,$bi # ap[j]*bp[0] | ||
| 110 | algr $alo,$AHI | ||
| 111 | lghi $AHI,0 | ||
| 112 | alcgr $AHI,$ahi | ||
| 113 | |||
| 114 | lg $nlo,0($j,$np) | ||
| 115 | mlgr $nhi,$mn0 # np[j]*m1 | ||
| 116 | algr $nlo,$NHI | ||
| 117 | lghi $NHI,0 | ||
| 118 | alcgr $nhi,$NHI # +="tp[j]" | ||
| 119 | algr $nlo,$alo | ||
| 120 | alcgr $NHI,$nhi | ||
| 121 | |||
| 122 | stg $nlo,160-8($j,$sp) # tp[j-1]= | ||
| 123 | la $j,8($j) # j++ | ||
| 124 | brct $count,.L1st | ||
| 125 | |||
| 126 | algr $NHI,$AHI | ||
| 127 | lghi $AHI,0 | ||
| 128 | alcgr $AHI,$AHI # upmost overflow bit | ||
| 129 | stg $NHI,160-8($j,$sp) | ||
| 130 | stg $AHI,160($j,$sp) | ||
| 131 | la $bp,8($bp) # bp++ | ||
| 132 | |||
| 133 | .Louter: | ||
| 134 | lg $bi,0($bp) # bp[i] | ||
| 135 | lg $alo,0($ap) | ||
| 136 | mlgr $ahi,$bi # ap[0]*bp[i] | ||
| 137 | alg $alo,160($sp) # +=tp[0] | ||
| 138 | lghi $AHI,0 | ||
| 139 | alcgr $AHI,$ahi | ||
| 140 | |||
| 141 | lgr $mn0,$alo | ||
| 142 | msgr $mn0,$n0 # tp[0]*n0 | ||
| 143 | |||
| 144 | lg $nlo,0($np) # np[0] | ||
| 145 | mlgr $nhi,$mn0 # np[0]*m1 | ||
| 146 | algr $nlo,$alo # +="tp[0]" | ||
| 147 | lghi $NHI,0 | ||
| 148 | alcgr $NHI,$nhi | ||
| 149 | |||
| 150 | la $j,8(%r0) # j=1 | ||
| 151 | lr $count,$num | ||
| 152 | |||
| 153 | .align 16 | ||
| 154 | .Linner: | ||
| 155 | lg $alo,0($j,$ap) | ||
| 156 | mlgr $ahi,$bi # ap[j]*bp[i] | ||
| 157 | algr $alo,$AHI | ||
| 158 | lghi $AHI,0 | ||
| 159 | alcgr $ahi,$AHI | ||
| 160 | alg $alo,160($j,$sp)# +=tp[j] | ||
| 161 | alcgr $AHI,$ahi | ||
| 162 | |||
| 163 | lg $nlo,0($j,$np) | ||
| 164 | mlgr $nhi,$mn0 # np[j]*m1 | ||
| 165 | algr $nlo,$NHI | ||
| 166 | lghi $NHI,0 | ||
| 167 | alcgr $nhi,$NHI | ||
| 168 | algr $nlo,$alo # +="tp[j]" | ||
| 169 | alcgr $NHI,$nhi | ||
| 170 | |||
| 171 | stg $nlo,160-8($j,$sp) # tp[j-1]= | ||
| 172 | la $j,8($j) # j++ | ||
| 173 | brct $count,.Linner | ||
| 174 | |||
| 175 | algr $NHI,$AHI | ||
| 176 | lghi $AHI,0 | ||
| 177 | alcgr $AHI,$AHI | ||
| 178 | alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit | ||
| 179 | lghi $ahi,0 | ||
| 180 | alcgr $AHI,$ahi # new upmost overflow bit | ||
| 181 | stg $NHI,160-8($j,$sp) | ||
| 182 | stg $AHI,160($j,$sp) | ||
| 183 | |||
| 184 | la $bp,8($bp) # bp++ | ||
| 185 | clg $bp,160+8+32($j,$sp) # compare to &bp[num] | ||
| 186 | jne .Louter | ||
| 187 | |||
| 188 | lg $rp,160+8+16($j,$sp) # reincarnate rp | ||
| 189 | la $ap,160($sp) | ||
| 190 | ahi $num,1 # restore $num, incidentally clears "borrow" | ||
| 191 | |||
| 192 | la $j,0(%r0) | ||
| 193 | lr $count,$num | ||
| 194 | .Lsub: lg $alo,0($j,$ap) | ||
| 195 | slbg $alo,0($j,$np) | ||
| 196 | stg $alo,0($j,$rp) | ||
| 197 | la $j,8($j) | ||
| 198 | brct $count,.Lsub | ||
| 199 | lghi $ahi,0 | ||
| 200 | slbgr $AHI,$ahi # handle upmost carry | ||
| 201 | |||
| 202 | ngr $ap,$AHI | ||
| 203 | lghi $np,-1 | ||
| 204 | xgr $np,$AHI | ||
| 205 | ngr $np,$rp | ||
| 206 | ogr $ap,$np # ap=borrow?tp:rp | ||
| 207 | |||
| 208 | la $j,0(%r0) | ||
| 209 | lgr $count,$num | ||
| 210 | .Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh | ||
| 211 | stg $j,160($j,$sp) # zap tp | ||
| 212 | stg $alo,0($j,$rp) | ||
| 213 | la $j,8($j) | ||
| 214 | brct $count,.Lcopy | ||
| 215 | |||
| 216 | la %r1,160+8+48($j,$sp) | ||
| 217 | lmg %r6,%r15,0(%r1) | ||
| 218 | lghi %r2,1 # signal "processed" | ||
| 219 | br %r14 | ||
| 220 | .size bn_mul_mont,.-bn_mul_mont | ||
| 221 | .string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 222 | ___ | ||
| 223 | |||
| 224 | print $code; | ||
| 225 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/s390x.S b/src/lib/libcrypto/bn/asm/s390x.S new file mode 100755 index 0000000000..8f45f5d513 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/s390x.S | |||
| @@ -0,0 +1,678 @@ | |||
| 1 | .ident "s390x.S, version 1.0" | ||
| 2 | // ==================================================================== | ||
| 3 | // Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 4 | // project. | ||
| 5 | // | ||
| 6 | // Rights for redistribution and usage in source and binary forms are | ||
| 7 | // granted according to the OpenSSL license. Warranty of any kind is | ||
| 8 | // disclaimed. | ||
| 9 | // ==================================================================== | ||
| 10 | |||
| 11 | .text | ||
| 12 | |||
| 13 | #define zero %r0 /* routines that use it first do "lghi zero,0"; it is then the addend for carry propagation (alcgr x,zero) */ | ||
| 14 | |||
| 15 | // BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5); rp[i]+=ap[i]*w for i<len, returns the final carry word | ||
| 16 | .globl bn_mul_add_words | ||
| 17 | .type bn_mul_add_words,@function | ||
| 18 | .align 4 | ||
| 19 | bn_mul_add_words: | ||
| 20 | lghi zero,0 // zero = 0 | ||
| 21 | la %r1,0(%r2) // put rp aside | ||
| 22 | lghi %r2,0 // i=0; (i is a byte offset; also the return value on early exit) | ||
| 23 | ltgfr %r4,%r4 // sign-extend the int len and test it | ||
| 24 | bler %r14 // if (len<=0) return 0; | ||
| 25 | |||
| 26 | stmg %r6,%r10,48(%r15) // save r6-r10, reloaded at .Lend_madd | ||
| 27 | lghi %r8,0 // carry = 0 | ||
| 28 | srag %r10,%r4,2 // cnt=len/4 | ||
| 29 | jz .Loop1_madd // fewer than 4 words: only the word-at-a-time loop | ||
| 30 | |||
| 31 | .Loop4_madd: | ||
| 32 | lg %r7,0(%r2,%r3) // ap[i] | ||
| 33 | mlgr %r6,%r5 // *=w | ||
| 34 | algr %r7,%r8 // +=carry | ||
| 35 | alcgr %r6,zero // fold carry-out into the high word | ||
| 36 | alg %r7,0(%r2,%r1) // +=rp[i] | ||
| 37 | alcgr %r6,zero // fold carry-out into the high word | ||
| 38 | stg %r7,0(%r2,%r1) // rp[i]= | ||
| 39 | |||
| 40 | lg %r9,8(%r2,%r3) // ap[i+1]; product goes to r8:r9, carry-in is r6 | ||
| 41 | mlgr %r8,%r5 | ||
| 42 | algr %r9,%r6 | ||
| 43 | alcgr %r8,zero | ||
| 44 | alg %r9,8(%r2,%r1) | ||
| 45 | alcgr %r8,zero | ||
| 46 | stg %r9,8(%r2,%r1) // rp[i+1]= | ||
| 47 | |||
| 48 | lg %r7,16(%r2,%r3) // ap[i+2]; product goes to r6:r7, carry-in is r8 | ||
| 49 | mlgr %r6,%r5 | ||
| 50 | algr %r7,%r8 | ||
| 51 | alcgr %r6,zero | ||
| 52 | alg %r7,16(%r2,%r1) | ||
| 53 | alcgr %r6,zero | ||
| 54 | stg %r7,16(%r2,%r1) // rp[i+2]= | ||
| 55 | |||
| 56 | lg %r9,24(%r2,%r3) // ap[i+3]; product goes to r8:r9, carry-in is r6 | ||
| 57 | mlgr %r8,%r5 | ||
| 58 | algr %r9,%r6 | ||
| 59 | alcgr %r8,zero | ||
| 60 | alg %r9,24(%r2,%r1) | ||
| 61 | alcgr %r8,zero | ||
| 62 | stg %r9,24(%r2,%r1) // rp[i+3]=, carry for the next round is left in r8 | ||
| 63 | |||
| 64 | la %r2,32(%r2) // i+=4 | ||
| 65 | brct %r10,.Loop4_madd | ||
| 66 | |||
| 67 | lghi %r10,3 | ||
| 68 | nr %r4,%r10 // cnt=len%4 | ||
| 69 | jz .Lend_madd | ||
| 70 | |||
| 71 | .Loop1_madd: | ||
| 72 | lg %r7,0(%r2,%r3) // ap[i] | ||
| 73 | mlgr %r6,%r5 // *=w | ||
| 74 | algr %r7,%r8 // +=carry | ||
| 75 | alcgr %r6,zero | ||
| 76 | alg %r7,0(%r2,%r1) // +=rp[i] | ||
| 77 | alcgr %r6,zero | ||
| 78 | stg %r7,0(%r2,%r1) // rp[i]= | ||
| 79 | |||
| 80 | lgr %r8,%r6 // high word becomes the next carry | ||
| 81 | la %r2,8(%r2) // i++ | ||
| 82 | brct %r4,.Loop1_madd | ||
| 83 | |||
| 84 | .Lend_madd: | ||
| 85 | lgr %r2,%r8 // return carry | ||
| 86 | lmg %r6,%r10,48(%r15) // restore r6-r10 | ||
| 87 | br %r14 | ||
| 88 | .size bn_mul_add_words,.-bn_mul_add_words | ||
| 89 | |||
| 90 | // BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5); rp[i]=ap[i]*w+carry for i<len, returns the final carry word | ||
| 91 | .globl bn_mul_words | ||
| 92 | .type bn_mul_words,@function | ||
| 93 | .align 4 | ||
| 94 | bn_mul_words: | ||
| 95 | lghi zero,0 // zero = 0 | ||
| 96 | la %r1,0(%r2) // put rp aside | ||
| 97 | lghi %r2,0 // i=0; (i is a byte offset; also the return value on early exit) | ||
| 98 | ltgfr %r4,%r4 // sign-extend the int len and test it | ||
| 99 | bler %r14 // if (len<=0) return 0; | ||
| 100 | |||
| 101 | stmg %r6,%r10,48(%r15) // save r6-r10, reloaded at .Lend_mul | ||
| 102 | lghi %r8,0 // carry = 0 | ||
| 103 | srag %r10,%r4,2 // cnt=len/4 | ||
| 104 | jz .Loop1_mul // fewer than 4 words: only the word-at-a-time loop | ||
| 105 | |||
| 106 | .Loop4_mul: | ||
| 107 | lg %r7,0(%r2,%r3) // ap[i] | ||
| 108 | mlgr %r6,%r5 // *=w | ||
| 109 | algr %r7,%r8 // +=carry | ||
| 110 | alcgr %r6,zero // fold carry-out into the high word | ||
| 111 | stg %r7,0(%r2,%r1) // rp[i]= | ||
| 112 | |||
| 113 | lg %r9,8(%r2,%r3) // ap[i+1]; product goes to r8:r9, carry-in is r6 | ||
| 114 | mlgr %r8,%r5 | ||
| 115 | algr %r9,%r6 | ||
| 116 | alcgr %r8,zero | ||
| 117 | stg %r9,8(%r2,%r1) // rp[i+1]= | ||
| 118 | |||
| 119 | lg %r7,16(%r2,%r3) // ap[i+2]; product goes to r6:r7, carry-in is r8 | ||
| 120 | mlgr %r6,%r5 | ||
| 121 | algr %r7,%r8 | ||
| 122 | alcgr %r6,zero | ||
| 123 | stg %r7,16(%r2,%r1) // rp[i+2]= | ||
| 124 | |||
| 125 | lg %r9,24(%r2,%r3) // ap[i+3]; product goes to r8:r9, carry-in is r6 | ||
| 126 | mlgr %r8,%r5 | ||
| 127 | algr %r9,%r6 | ||
| 128 | alcgr %r8,zero | ||
| 129 | stg %r9,24(%r2,%r1) // rp[i+3]=, carry for the next round is left in r8 | ||
| 130 | |||
| 131 | la %r2,32(%r2) // i+=4 | ||
| 132 | brct %r10,.Loop4_mul | ||
| 133 | |||
| 134 | lghi %r10,3 | ||
| 135 | nr %r4,%r10 // cnt=len%4 | ||
| 136 | jz .Lend_mul | ||
| 137 | |||
| 138 | .Loop1_mul: | ||
| 139 | lg %r7,0(%r2,%r3) // ap[i] | ||
| 140 | mlgr %r6,%r5 // *=w | ||
| 141 | algr %r7,%r8 // +=carry | ||
| 142 | alcgr %r6,zero | ||
| 143 | stg %r7,0(%r2,%r1) // rp[i]= | ||
| 144 | |||
| 145 | lgr %r8,%r6 // high word becomes the next carry | ||
| 146 | la %r2,8(%r2) // i++ | ||
| 147 | brct %r4,.Loop1_mul | ||
| 148 | |||
| 149 | .Lend_mul: | ||
| 150 | lgr %r2,%r8 // return carry | ||
| 151 | lmg %r6,%r10,48(%r15) // restore r6-r10 | ||
| 152 | br %r14 | ||
| 153 | .size bn_mul_words,.-bn_mul_words | ||
| 154 | |||
| 155 | // void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r3,int r4); stores the 128-bit square of each input word at %r3 as two words at %r2 (low word first) | ||
| 156 | .globl bn_sqr_words | ||
| 157 | .type bn_sqr_words,@function | ||
| 158 | .align 4 | ||
| 159 | bn_sqr_words: | ||
| 160 | ltgfr %r4,%r4 // sign-extend the int len and test it | ||
| 161 | bler %r14 // if (len<=0) return; | ||
| 162 | |||
| 163 | stmg %r6,%r7,48(%r15) // save r6-r7, reloaded at .Lend_sqr | ||
| 164 | srag %r1,%r4,2 // cnt=len/4 | ||
| 165 | jz .Loop1_sqr | ||
| 166 | |||
| 167 | .Loop4_sqr: | ||
| 168 | lg %r7,0(%r3) // input word 0 | ||
| 169 | mlgr %r6,%r7 // r6:r7 = r7*r7 | ||
| 170 | stg %r7,0(%r2) // low word | ||
| 171 | stg %r6,8(%r2) // high word | ||
| 172 | |||
| 173 | lg %r7,8(%r3) // input word 1 | ||
| 174 | mlgr %r6,%r7 | ||
| 175 | stg %r7,16(%r2) | ||
| 176 | stg %r6,24(%r2) | ||
| 177 | |||
| 178 | lg %r7,16(%r3) // input word 2 | ||
| 179 | mlgr %r6,%r7 | ||
| 180 | stg %r7,32(%r2) | ||
| 181 | stg %r6,40(%r2) | ||
| 182 | |||
| 183 | lg %r7,24(%r3) // input word 3 | ||
| 184 | mlgr %r6,%r7 | ||
| 185 | stg %r7,48(%r2) | ||
| 186 | stg %r6,56(%r2) | ||
| 187 | |||
| 188 | la %r3,32(%r3) // advance input by 4 words | ||
| 189 | la %r2,64(%r2) // advance output by 8 words | ||
| 190 | brct %r1,.Loop4_sqr | ||
| 191 | |||
| 192 | lghi %r1,3 | ||
| 193 | nr %r4,%r1 // cnt=len%4 | ||
| 194 | jz .Lend_sqr | ||
| 195 | |||
| 196 | .Loop1_sqr: | ||
| 197 | lg %r7,0(%r3) | ||
| 198 | mlgr %r6,%r7 // r6:r7 = r7*r7 | ||
| 199 | stg %r7,0(%r2) // low word | ||
| 200 | stg %r6,8(%r2) // high word | ||
| 201 | |||
| 202 | la %r3,8(%r3) // advance input by 1 word | ||
| 203 | la %r2,16(%r2) // advance output by 2 words | ||
| 204 | brct %r4,.Loop1_sqr | ||
| 205 | |||
| 206 | .Lend_sqr: | ||
| 207 | lmg %r6,%r7,48(%r15) // restore r6-r7 | ||
| 208 | br %r14 | ||
| 209 | .size bn_sqr_words,.-bn_sqr_words | ||
| 210 | |||
| 211 | // BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d); returns the quotient of the 128-bit value h:l divided by d | ||
| 212 | .globl bn_div_words | ||
| 213 | .type bn_div_words,@function | ||
| 214 | .align 4 | ||
| 215 | bn_div_words: | ||
| 216 | dlgr %r2,%r4 // divide r2:r3 (h:l) by d; remainder lands in r2, quotient in r3 | ||
| 217 | lgr %r2,%r3 // return the quotient | ||
| 218 | br %r14 | ||
| 219 | .size bn_div_words,.-bn_div_words | ||
| 220 | |||
| 221 | // BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5); rp[i]=ap[i]+bp[i] with carry for i<len, returns the final carry (0 or 1) | ||
| 222 | .globl bn_add_words | ||
| 223 | .type bn_add_words,@function | ||
| 224 | .align 4 | ||
| 225 | bn_add_words: | ||
| 226 | la %r1,0(%r2) // put rp aside | ||
| 227 | lghi %r2,0 // i=0 | ||
| 228 | ltgfr %r5,%r5 // sign-extend the int len and test it | ||
| 229 | bler %r14 // if (len<=0) return 0; | ||
| 230 | |||
| 231 | stg %r6,48(%r15) // save r6, reloaded at .Lexit_add | ||
| 232 | lghi %r6,3 | ||
| 233 | nr %r6,%r5 // len%4 | ||
| 234 | sra %r5,2 // len/4, use sra because it sets condition code | ||
| 235 | jz .Loop1_add // carry is incidentally cleared if branch taken | ||
| 236 | algr %r2,%r2 // clear carry | ||
| 237 | |||
| 238 | .Loop4_add: | ||
| 239 | lg %r0,0(%r2,%r3) // ap[i] | ||
| 240 | alcg %r0,0(%r2,%r4) // +bp[i]+carry | ||
| 241 | stg %r0,0(%r2,%r1) // rp[i]= | ||
| 242 | lg %r0,8(%r2,%r3) | ||
| 243 | alcg %r0,8(%r2,%r4) | ||
| 244 | stg %r0,8(%r2,%r1) | ||
| 245 | lg %r0,16(%r2,%r3) | ||
| 246 | alcg %r0,16(%r2,%r4) | ||
| 247 | stg %r0,16(%r2,%r1) | ||
| 248 | lg %r0,24(%r2,%r3) | ||
| 249 | alcg %r0,24(%r2,%r4) | ||
| 250 | stg %r0,24(%r2,%r1) | ||
| 251 | |||
| 252 | la %r2,32(%r2) // i+=4 | ||
| 253 | brct %r5,.Loop4_add | ||
| 254 | |||
| 255 | la %r6,1(%r6) // see if len%4 is zero ... | ||
| 256 | brct %r6,.Loop1_add // without touching condition code:-) | ||
| 257 | |||
| 258 | .Lexit_add: | ||
| 259 | lghi %r2,0 | ||
| 260 | alcgr %r2,%r2 // r2 = 0+0+carry, i.e. the carry out of the last word | ||
| 261 | lg %r6,48(%r15) // restore r6 | ||
| 262 | br %r14 | ||
| 263 | |||
| 264 | .Loop1_add: | ||
| 265 | lg %r0,0(%r2,%r3) // ap[i] | ||
| 266 | alcg %r0,0(%r2,%r4) // +bp[i]+carry | ||
| 267 | stg %r0,0(%r2,%r1) // rp[i]= | ||
| 268 | |||
| 269 | la %r2,8(%r2) // i++ | ||
| 270 | brct %r6,.Loop1_add | ||
| 271 | |||
| 272 | j .Lexit_add | ||
| 273 | .size bn_add_words,.-bn_add_words | ||
| 274 | |||
| 275 | // BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5); rp[i]=ap[i]-bp[i] with borrow for i<len, returns the final borrow (0 or 1) | ||
| 276 | .globl bn_sub_words | ||
| 277 | .type bn_sub_words,@function | ||
| 278 | .align 4 | ||
| 279 | bn_sub_words: | ||
| 280 | la %r1,0(%r2) // put rp aside | ||
| 281 | lghi %r2,0 // i=0 | ||
| 282 | ltgfr %r5,%r5 // sign-extend the int len and test it | ||
| 283 | bler %r14 // if (len<=0) return 0; | ||
| 284 | |||
| 285 | stg %r6,48(%r15) // save r6, reloaded at .Lexit_sub | ||
| 286 | lghi %r6,3 | ||
| 287 | nr %r6,%r5 // len%4 | ||
| 288 | sra %r5,2 // len/4, use sra because it sets condition code | ||
| 289 | jnz .Loop4_sub // borrow is incidentally cleared if branch taken | ||
| 290 | slgr %r2,%r2 // clear borrow | ||
| 291 | |||
| 292 | .Loop1_sub: | ||
| 293 | lg %r0,0(%r2,%r3) // ap[i] | ||
| 294 | slbg %r0,0(%r2,%r4) // -bp[i]-borrow | ||
| 295 | stg %r0,0(%r2,%r1) // rp[i]= | ||
| 296 | |||
| 297 | la %r2,8(%r2) // i++ | ||
| 298 | brct %r6,.Loop1_sub | ||
| 299 | j .Lexit_sub | ||
| 300 | |||
| 301 | .Loop4_sub: | ||
| 302 | lg %r0,0(%r2,%r3) // ap[i] | ||
| 303 | slbg %r0,0(%r2,%r4) // -bp[i]-borrow | ||
| 304 | stg %r0,0(%r2,%r1) // rp[i]= | ||
| 305 | lg %r0,8(%r2,%r3) | ||
| 306 | slbg %r0,8(%r2,%r4) | ||
| 307 | stg %r0,8(%r2,%r1) | ||
| 308 | lg %r0,16(%r2,%r3) | ||
| 309 | slbg %r0,16(%r2,%r4) | ||
| 310 | stg %r0,16(%r2,%r1) | ||
| 311 | lg %r0,24(%r2,%r3) | ||
| 312 | slbg %r0,24(%r2,%r4) | ||
| 313 | stg %r0,24(%r2,%r1) | ||
| 314 | |||
| 315 | la %r2,32(%r2) // i+=4 | ||
| 316 | brct %r5,.Loop4_sub | ||
| 317 | |||
| 318 | la %r6,1(%r6) // see if len%4 is zero ... | ||
| 319 | brct %r6,.Loop1_sub // without touching condition code:-) | ||
| 320 | |||
| 321 | .Lexit_sub: | ||
| 322 | lghi %r2,0 | ||
| 323 | slbgr %r2,%r2 // r2 = 0-0-borrow, i.e. 0 or -1 | ||
| 324 | lcgr %r2,%r2 // negate so the return value is 0 or 1 | ||
| 325 | lg %r6,48(%r15) // restore r6 | ||
| 326 | br %r14 | ||
| 327 | .size bn_sub_words,.-bn_sub_words | ||
| 328 | |||
| 329 | #define c1 %r1 /* c1,c2,c3: the three accumulator words rotated through by the comba routines */ | ||
| 330 | #define c2 %r5 | ||
| 331 | #define c3 %r8 | ||
| 332 | |||
| 333 | #define mul_add_c(ai,bi,c1,c2,c3) /* (c3,c2,c1) += a[ai]*b[bi], a at %r3, b at %r4; clobbers r6,r7 */ \ | ||
| 334 | lg %r7,ai*8(%r3); /* r7 = a[ai] */ \ | ||
| 335 | mlg %r6,bi*8(%r4); /* r6:r7 = r7*b[bi] */ \ | ||
| 336 | algr c1,%r7; /* add low word */ \ | ||
| 337 | alcgr c2,%r6; /* add high word plus carry */ \ | ||
| 338 | alcgr c3,zero /* propagate carry; zero must hold 0 */ | ||
| 339 | |||
| 340 | // void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4); r[0..15] = a[0..7]*b[0..7], computed one result column at a time | ||
| 341 | .globl bn_mul_comba8 | ||
| 342 | .type bn_mul_comba8,@function | ||
| 343 | .align 4 | ||
| 344 | bn_mul_comba8: | ||
| 345 | stmg %r6,%r8,48(%r15) // save r6-r8 (r6:r7 product pair, r8 = c3) | ||
| 346 | |||
| 347 | lghi c1,0 | ||
| 348 | lghi c2,0 | ||
| 349 | lghi c3,0 | ||
| 350 | lghi zero,0 | ||
| 351 | |||
| 352 | mul_add_c(0,0,c1,c2,c3); | ||
| 353 | stg c1,0*8(%r2) // r[0] | ||
| 354 | lghi c1,0 // freed accumulator becomes the top word of the next column | ||
| 355 | |||
| 356 | mul_add_c(0,1,c2,c3,c1); | ||
| 357 | mul_add_c(1,0,c2,c3,c1); | ||
| 358 | stg c2,1*8(%r2) // r[1] | ||
| 359 | lghi c2,0 | ||
| 360 | |||
| 361 | mul_add_c(2,0,c3,c1,c2); | ||
| 362 | mul_add_c(1,1,c3,c1,c2); | ||
| 363 | mul_add_c(0,2,c3,c1,c2); | ||
| 364 | stg c3,2*8(%r2) // r[2] | ||
| 365 | lghi c3,0 | ||
| 366 | |||
| 367 | mul_add_c(0,3,c1,c2,c3); | ||
| 368 | mul_add_c(1,2,c1,c2,c3); | ||
| 369 | mul_add_c(2,1,c1,c2,c3); | ||
| 370 | mul_add_c(3,0,c1,c2,c3); | ||
| 371 | stg c1,3*8(%r2) // r[3] | ||
| 372 | lghi c1,0 | ||
| 373 | |||
| 374 | mul_add_c(4,0,c2,c3,c1); | ||
| 375 | mul_add_c(3,1,c2,c3,c1); | ||
| 376 | mul_add_c(2,2,c2,c3,c1); | ||
| 377 | mul_add_c(1,3,c2,c3,c1); | ||
| 378 | mul_add_c(0,4,c2,c3,c1); | ||
| 379 | stg c2,4*8(%r2) // r[4] | ||
| 380 | lghi c2,0 | ||
| 381 | |||
| 382 | mul_add_c(0,5,c3,c1,c2); | ||
| 383 | mul_add_c(1,4,c3,c1,c2); | ||
| 384 | mul_add_c(2,3,c3,c1,c2); | ||
| 385 | mul_add_c(3,2,c3,c1,c2); | ||
| 386 | mul_add_c(4,1,c3,c1,c2); | ||
| 387 | mul_add_c(5,0,c3,c1,c2); | ||
| 388 | stg c3,5*8(%r2) // r[5] | ||
| 389 | lghi c3,0 | ||
| 390 | |||
| 391 | mul_add_c(6,0,c1,c2,c3); | ||
| 392 | mul_add_c(5,1,c1,c2,c3); | ||
| 393 | mul_add_c(4,2,c1,c2,c3); | ||
| 394 | mul_add_c(3,3,c1,c2,c3); | ||
| 395 | mul_add_c(2,4,c1,c2,c3); | ||
| 396 | mul_add_c(1,5,c1,c2,c3); | ||
| 397 | mul_add_c(0,6,c1,c2,c3); | ||
| 398 | stg c1,6*8(%r2) // r[6] | ||
| 399 | lghi c1,0 | ||
| 400 | |||
| 401 | mul_add_c(0,7,c2,c3,c1); | ||
| 402 | mul_add_c(1,6,c2,c3,c1); | ||
| 403 | mul_add_c(2,5,c2,c3,c1); | ||
| 404 | mul_add_c(3,4,c2,c3,c1); | ||
| 405 | mul_add_c(4,3,c2,c3,c1); | ||
| 406 | mul_add_c(5,2,c2,c3,c1); | ||
| 407 | mul_add_c(6,1,c2,c3,c1); | ||
| 408 | mul_add_c(7,0,c2,c3,c1); | ||
| 409 | stg c2,7*8(%r2) // r[7] | ||
| 410 | lghi c2,0 | ||
| 411 | |||
| 412 | mul_add_c(7,1,c3,c1,c2); | ||
| 413 | mul_add_c(6,2,c3,c1,c2); | ||
| 414 | mul_add_c(5,3,c3,c1,c2); | ||
| 415 | mul_add_c(4,4,c3,c1,c2); | ||
| 416 | mul_add_c(3,5,c3,c1,c2); | ||
| 417 | mul_add_c(2,6,c3,c1,c2); | ||
| 418 | mul_add_c(1,7,c3,c1,c2); | ||
| 419 | stg c3,8*8(%r2) // r[8] | ||
| 420 | lghi c3,0 | ||
| 421 | |||
| 422 | mul_add_c(2,7,c1,c2,c3); | ||
| 423 | mul_add_c(3,6,c1,c2,c3); | ||
| 424 | mul_add_c(4,5,c1,c2,c3); | ||
| 425 | mul_add_c(5,4,c1,c2,c3); | ||
| 426 | mul_add_c(6,3,c1,c2,c3); | ||
| 427 | mul_add_c(7,2,c1,c2,c3); | ||
| 428 | stg c1,9*8(%r2) // r[9] | ||
| 429 | lghi c1,0 | ||
| 430 | |||
| 431 | mul_add_c(7,3,c2,c3,c1); | ||
| 432 | mul_add_c(6,4,c2,c3,c1); | ||
| 433 | mul_add_c(5,5,c2,c3,c1); | ||
| 434 | mul_add_c(4,6,c2,c3,c1); | ||
| 435 | mul_add_c(3,7,c2,c3,c1); | ||
| 436 | stg c2,10*8(%r2) // r[10] | ||
| 437 | lghi c2,0 | ||
| 438 | |||
| 439 | mul_add_c(4,7,c3,c1,c2); | ||
| 440 | mul_add_c(5,6,c3,c1,c2); | ||
| 441 | mul_add_c(6,5,c3,c1,c2); | ||
| 442 | mul_add_c(7,4,c3,c1,c2); | ||
| 443 | stg c3,11*8(%r2) // r[11] | ||
| 444 | lghi c3,0 | ||
| 445 | |||
| 446 | mul_add_c(7,5,c1,c2,c3); | ||
| 447 | mul_add_c(6,6,c1,c2,c3); | ||
| 448 | mul_add_c(5,7,c1,c2,c3); | ||
| 449 | stg c1,12*8(%r2) // r[12] | ||
| 450 | lghi c1,0 | ||
| 451 | |||
| 452 | |||
| 453 | mul_add_c(6,7,c2,c3,c1); | ||
| 454 | mul_add_c(7,6,c2,c3,c1); | ||
| 455 | stg c2,13*8(%r2) // r[13] | ||
| 456 | lghi c2,0 | ||
| 457 | |||
| 458 | mul_add_c(7,7,c3,c1,c2); | ||
| 459 | stg c3,14*8(%r2) // r[14] | ||
| 460 | stg c1,15*8(%r2) // r[15] | ||
| 461 | |||
| 462 | lmg %r6,%r8,48(%r15) // restore r6-r8 | ||
| 463 | br %r14 | ||
| 464 | .size bn_mul_comba8,.-bn_mul_comba8 | ||
| 465 | |||
| 466 | // void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4); r[0..7] = a[0..3]*b[0..3], computed one result column at a time | ||
| 467 | .globl bn_mul_comba4 | ||
| 468 | .type bn_mul_comba4,@function | ||
| 469 | .align 4 | ||
| 470 | bn_mul_comba4: | ||
| 471 | stmg %r6,%r8,48(%r15) // save r6-r8 (r6:r7 product pair, r8 = c3) | ||
| 472 | |||
| 473 | lghi c1,0 | ||
| 474 | lghi c2,0 | ||
| 475 | lghi c3,0 | ||
| 476 | lghi zero,0 | ||
| 477 | |||
| 478 | mul_add_c(0,0,c1,c2,c3); | ||
| 479 | stg c1,0*8(%r2) // r[0]; must go through %r2 (r), not %r3 (a), or a[0] is overwritten and r[0] left unset | ||
| 480 | lghi c1,0 // freed accumulator becomes the top word of the next column | ||
| 481 | |||
| 482 | mul_add_c(0,1,c2,c3,c1); | ||
| 483 | mul_add_c(1,0,c2,c3,c1); | ||
| 484 | stg c2,1*8(%r2) // r[1] | ||
| 485 | lghi c2,0 | ||
| 486 | |||
| 487 | mul_add_c(2,0,c3,c1,c2); | ||
| 488 | mul_add_c(1,1,c3,c1,c2); | ||
| 489 | mul_add_c(0,2,c3,c1,c2); | ||
| 490 | stg c3,2*8(%r2) // r[2] | ||
| 491 | lghi c3,0 | ||
| 492 | |||
| 493 | mul_add_c(0,3,c1,c2,c3); | ||
| 494 | mul_add_c(1,2,c1,c2,c3); | ||
| 495 | mul_add_c(2,1,c1,c2,c3); | ||
| 496 | mul_add_c(3,0,c1,c2,c3); | ||
| 497 | stg c1,3*8(%r2) // r[3] | ||
| 498 | lghi c1,0 | ||
| 499 | |||
| 500 | mul_add_c(3,1,c2,c3,c1); | ||
| 501 | mul_add_c(2,2,c2,c3,c1); | ||
| 502 | mul_add_c(1,3,c2,c3,c1); | ||
| 503 | stg c2,4*8(%r2) // r[4] | ||
| 504 | lghi c2,0 | ||
| 505 | |||
| 506 | mul_add_c(2,3,c3,c1,c2); | ||
| 507 | mul_add_c(3,2,c3,c1,c2); | ||
| 508 | stg c3,5*8(%r2) // r[5] | ||
| 509 | lghi c3,0 | ||
| 510 | |||
| 511 | mul_add_c(3,3,c1,c2,c3); | ||
| 512 | stg c1,6*8(%r2) // r[6] | ||
| 513 | stg c2,7*8(%r2) // r[7] | ||
| 514 | |||
| 515 | lmg %r6,%r8,48(%r15) // restore r6-r8; a store here would leave the caller's r6-r8 clobbered | ||
| 516 | br %r14 | ||
| 517 | .size bn_mul_comba4,.-bn_mul_comba4 | ||
| 518 | |||
| 519 | #define sqr_add_c(ai,c1,c2,c3) /* (c3,c2,c1) += a[ai]^2, a at %r3; clobbers r6,r7 */ \ | ||
| 520 | lg %r7,ai*8(%r3); /* r7 = a[ai] */ \ | ||
| 521 | mlgr %r6,%r7; /* r6:r7 = r7*r7 */ \ | ||
| 522 | algr c1,%r7; /* add low word */ \ | ||
| 523 | alcgr c2,%r6; /* add high word plus carry */ \ | ||
| 524 | alcgr c3,zero /* propagate carry; zero must hold 0 */ | ||
| 525 | |||
| 526 | #define sqr_add_c2(ai,aj,c1,c2,c3) /* (c3,c2,c1) += 2*a[ai]*a[aj], a at %r3; clobbers r6,r7 */ \ | ||
| 527 | lg %r7,ai*8(%r3); /* r7 = a[ai] */ \ | ||
| 528 | mlg %r6,aj*8(%r3); /* r6:r7 = r7*a[aj] */ \ | ||
| 529 | algr c1,%r7; /* first addition of the product */ \ | ||
| 530 | alcgr c2,%r6; \ | ||
| 531 | alcgr c3,zero; \ | ||
| 532 | algr c1,%r7; /* second addition of the same product */ \ | ||
| 533 | alcgr c2,%r6; \ | ||
| 534 | alcgr c3,zero | ||
| 535 | |||
| 536 | // void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3); r[0..15] = a[0..7]^2, computed one result column at a time | ||
| 537 | .globl bn_sqr_comba8 | ||
| 538 | .type bn_sqr_comba8,@function | ||
| 539 | .align 4 | ||
| 540 | bn_sqr_comba8: | ||
| 541 | stmg %r6,%r8,48(%r15) // save r6-r8 (r6:r7 product pair, r8 = c3) | ||
| 542 | |||
| 543 | lghi c1,0 | ||
| 544 | lghi c2,0 | ||
| 545 | lghi c3,0 | ||
| 546 | lghi zero,0 | ||
| 547 | |||
| 548 | sqr_add_c(0,c1,c2,c3); | ||
| 549 | stg c1,0*8(%r2) // r[0] | ||
| 550 | lghi c1,0 // freed accumulator becomes the top word of the next column | ||
| 551 | |||
| 552 | sqr_add_c2(1,0,c2,c3,c1); | ||
| 553 | stg c2,1*8(%r2) // r[1] | ||
| 554 | lghi c2,0 | ||
| 555 | |||
| 556 | sqr_add_c(1,c3,c1,c2); | ||
| 557 | sqr_add_c2(2,0,c3,c1,c2); | ||
| 558 | stg c3,2*8(%r2) // r[2] | ||
| 559 | lghi c3,0 | ||
| 560 | |||
| 561 | sqr_add_c2(3,0,c1,c2,c3); | ||
| 562 | sqr_add_c2(2,1,c1,c2,c3); | ||
| 563 | stg c1,3*8(%r2) // r[3] | ||
| 564 | lghi c1,0 | ||
| 565 | |||
| 566 | sqr_add_c(2,c2,c3,c1); | ||
| 567 | sqr_add_c2(3,1,c2,c3,c1); | ||
| 568 | sqr_add_c2(4,0,c2,c3,c1); | ||
| 569 | stg c2,4*8(%r2) // r[4] | ||
| 570 | lghi c2,0 | ||
| 571 | |||
| 572 | sqr_add_c2(5,0,c3,c1,c2); | ||
| 573 | sqr_add_c2(4,1,c3,c1,c2); | ||
| 574 | sqr_add_c2(3,2,c3,c1,c2); | ||
| 575 | stg c3,5*8(%r2) // r[5] | ||
| 576 | lghi c3,0 | ||
| 577 | |||
| 578 | sqr_add_c(3,c1,c2,c3); | ||
| 579 | sqr_add_c2(4,2,c1,c2,c3); | ||
| 580 | sqr_add_c2(5,1,c1,c2,c3); | ||
| 581 | sqr_add_c2(6,0,c1,c2,c3); | ||
| 582 | stg c1,6*8(%r2) // r[6] | ||
| 583 | lghi c1,0 | ||
| 584 | |||
| 585 | sqr_add_c2(7,0,c2,c3,c1); | ||
| 586 | sqr_add_c2(6,1,c2,c3,c1); | ||
| 587 | sqr_add_c2(5,2,c2,c3,c1); | ||
| 588 | sqr_add_c2(4,3,c2,c3,c1); | ||
| 589 | stg c2,7*8(%r2) // r[7] | ||
| 590 | lghi c2,0 | ||
| 591 | |||
| 592 | sqr_add_c(4,c3,c1,c2); | ||
| 593 | sqr_add_c2(5,3,c3,c1,c2); | ||
| 594 | sqr_add_c2(6,2,c3,c1,c2); | ||
| 595 | sqr_add_c2(7,1,c3,c1,c2); | ||
| 596 | stg c3,8*8(%r2) // r[8] | ||
| 597 | lghi c3,0 | ||
| 598 | |||
| 599 | sqr_add_c2(7,2,c1,c2,c3); | ||
| 600 | sqr_add_c2(6,3,c1,c2,c3); | ||
| 601 | sqr_add_c2(5,4,c1,c2,c3); | ||
| 602 | stg c1,9*8(%r2) // r[9] | ||
| 603 | lghi c1,0 | ||
| 604 | |||
| 605 | sqr_add_c(5,c2,c3,c1); | ||
| 606 | sqr_add_c2(6,4,c2,c3,c1); | ||
| 607 | sqr_add_c2(7,3,c2,c3,c1); | ||
| 608 | stg c2,10*8(%r2) // r[10] | ||
| 609 | lghi c2,0 | ||
| 610 | |||
| 611 | sqr_add_c2(7,4,c3,c1,c2); | ||
| 612 | sqr_add_c2(6,5,c3,c1,c2); | ||
| 613 | stg c3,11*8(%r2) // r[11] | ||
| 614 | lghi c3,0 | ||
| 615 | |||
| 616 | sqr_add_c(6,c1,c2,c3); | ||
| 617 | sqr_add_c2(7,5,c1,c2,c3); | ||
| 618 | stg c1,12*8(%r2) // r[12] | ||
| 619 | lghi c1,0 | ||
| 620 | |||
| 621 | sqr_add_c2(7,6,c2,c3,c1); | ||
| 622 | stg c2,13*8(%r2) // r[13] | ||
| 623 | lghi c2,0 | ||
| 624 | |||
| 625 | sqr_add_c(7,c3,c1,c2); | ||
| 626 | stg c3,14*8(%r2) // r[14] | ||
| 627 | stg c1,15*8(%r2) // r[15] | ||
| 628 | |||
| 629 | lmg %r6,%r8,48(%r15) // restore r6-r8 | ||
| 630 | br %r14 | ||
| 631 | .size bn_sqr_comba8,.-bn_sqr_comba8 | ||
| 632 | |||
| 633 | // void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3); r[0..7] = a[0..3]^2, computed one result column at a time | ||
| 634 | .globl bn_sqr_comba4 | ||
| 635 | .type bn_sqr_comba4,@function | ||
| 636 | .align 4 | ||
| 637 | bn_sqr_comba4: | ||
| 638 | stmg %r6,%r8,48(%r15) // save r6-r8 (r6:r7 product pair, r8 = c3) | ||
| 639 | |||
| 640 | lghi c1,0 | ||
| 641 | lghi c2,0 | ||
| 642 | lghi c3,0 | ||
| 643 | lghi zero,0 | ||
| 644 | |||
| 645 | sqr_add_c(0,c1,c2,c3); | ||
| 646 | stg c1,0*8(%r2) // r[0] | ||
| 647 | lghi c1,0 // freed accumulator becomes the top word of the next column | ||
| 648 | |||
| 649 | sqr_add_c2(1,0,c2,c3,c1); | ||
| 650 | stg c2,1*8(%r2) // r[1] | ||
| 651 | lghi c2,0 | ||
| 652 | |||
| 653 | sqr_add_c(1,c3,c1,c2); | ||
| 654 | sqr_add_c2(2,0,c3,c1,c2); | ||
| 655 | stg c3,2*8(%r2) // r[2] | ||
| 656 | lghi c3,0 | ||
| 657 | |||
| 658 | sqr_add_c2(3,0,c1,c2,c3); | ||
| 659 | sqr_add_c2(2,1,c1,c2,c3); | ||
| 660 | stg c1,3*8(%r2) // r[3] | ||
| 661 | lghi c1,0 | ||
| 662 | |||
| 663 | sqr_add_c(2,c2,c3,c1); | ||
| 664 | sqr_add_c2(3,1,c2,c3,c1); | ||
| 665 | stg c2,4*8(%r2) // r[4] | ||
| 666 | lghi c2,0 | ||
| 667 | |||
| 668 | sqr_add_c2(3,2,c3,c1,c2); | ||
| 669 | stg c3,5*8(%r2) // r[5] | ||
| 670 | lghi c3,0 | ||
| 671 | |||
| 672 | sqr_add_c(3,c1,c2,c3); | ||
| 673 | stg c1,6*8(%r2) // r[6] | ||
| 674 | stg c2,7*8(%r2) // r[7] | ||
| 675 | |||
| 676 | lmg %r6,%r8,48(%r15) // restore r6-r8 | ||
| 677 | br %r14 | ||
| 678 | .size bn_sqr_comba4,.-bn_sqr_comba4 | ||
diff --git a/src/lib/libcrypto/bn/asm/sparcv9-mont.pl b/src/lib/libcrypto/bn/asm/sparcv9-mont.pl new file mode 100644 index 0000000000..b8fb1e8a25 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/sparcv9-mont.pl | |||
| @@ -0,0 +1,606 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # December 2005 | ||
| 11 | # | ||
| 12 | # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons | ||
| 13 | # for undertaken effort are multiple. First of all, UltraSPARC is not | ||
| 14 | # the whole SPARCv9 universe and other VIS-free implementations deserve | ||
| 15 | # optimized code as much. Secondly, newly introduced UltraSPARC T1, | ||
| 16 | # a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths, | ||
| 17 | # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with | ||
| 18 | # several integrated RSA/DSA accelerator circuits accessible through | ||
| 19 | # kernel driver [only(*)], but having decent user-land software | ||
| 20 | # implementation is important too. Finally, reasons like desire to | ||
| 21 | # experiment with dedicated squaring procedure. Yes, this module | ||
| 22 | # implements one, because it was easiest to draft it in SPARCv9 | ||
| 23 | # instructions... | ||
| 24 | |||
| 25 | # (*) Engine accessing the driver in question is on my TODO list. | ||
| 26 | # For reference, accelerator is estimated to give 6 to 10 times | ||
| 27 | # improvement on single-threaded RSA sign. It should be noted | ||
| 28 | # that 6-10x improvement coefficient does not actually mean | ||
| 29 | # something extraordinary in terms of absolute [single-threaded] | ||
| 30 | # performance, as SPARCv9 instruction set is by all means least | ||
| 31 | # suitable for high performance crypto among other 64 bit | ||
| 32 | # platforms. 6-10x factor simply places T1 in same performance | ||
| 33 | # domain as say AMD64 and IA-64. Improvement of RSA verify doesn't | ||
| 34 | # appear impressive at all, but it's the sign operation which is | ||
| 35 | # far more critical/interesting. | ||
| 36 | |||
| 37 | # You might notice that inner loops are modulo-scheduled:-) This has | ||
| 38 | # essentially negligible impact on UltraSPARC performance, it's | ||
| 39 | # Fujitsu SPARC64 V users who should notice and hopefully appreciate | ||
| 40 | # the advantage... Currently this module surpasses sparcv9a-mont.pl | ||
| 41 | # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a | ||
| 42 | # module still has hidden potential [see TODO list there], which is | ||
| 43 | # estimated to be larger than 20%... | ||
| 44 | |||
| 45 | # int bn_mul_mont( | ||
| 46 | $rp="%i0"; # BN_ULONG *rp, | ||
| 47 | $ap="%i1"; # const BN_ULONG *ap, | ||
| 48 | $bp="%i2"; # const BN_ULONG *bp, | ||
| 49 | $np="%i3"; # const BN_ULONG *np, | ||
| 50 | $n0="%i4"; # const BN_ULONG *n0, | ||
| 51 | $num="%i5"; # int num); | ||
| 52 | |||
| 53 | $bits=32; | ||
| 54 | for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } | ||
| 55 | if ($bits==64) { $bias=2047; $frame=192; } | ||
| 56 | else { $bias=0; $frame=128; } | ||
| 57 | |||
| 58 | $car0="%o0"; | ||
| 59 | $car1="%o1"; | ||
| 60 | $car2="%o2"; # 1 bit | ||
| 61 | $acc0="%o3"; | ||
| 62 | $acc1="%o4"; | ||
| 63 | $mask="%g1"; # 32 bits, what a waste... | ||
| 64 | $tmp0="%g4"; | ||
| 65 | $tmp1="%g5"; | ||
| 66 | |||
| 67 | $i="%l0"; | ||
| 68 | $j="%l1"; | ||
| 69 | $mul0="%l2"; | ||
| 70 | $mul1="%l3"; | ||
| 71 | $tp="%l4"; | ||
| 72 | $apj="%l5"; | ||
| 73 | $npj="%l6"; | ||
| 74 | $tpj="%l7"; | ||
| 75 | |||
| 76 | $fname="bn_mul_mont_int"; | ||
| 77 | |||
| 78 | $code=<<___; | ||
| 79 | .section ".text",#alloc,#execinstr | ||
| 80 | |||
| 81 | .global $fname | ||
| 82 | .align 32 | ||
| 83 | $fname: | ||
| 84 | cmp %o5,4 ! 128 bits minimum | ||
| 85 | bge,pt %icc,.Lenter ! num>=4: go do the work | ||
| 86 | sethi %hi(0xffffffff),$mask ! (delay slot) upper bits of the 32-bit mask | ||
| 87 | retl ! num<4: not handled by this routine | ||
| 88 | clr %o0 ! (delay slot) return 0 | ||
| 89 | .align 32 | ||
| 90 | .Lenter: | ||
| 91 | save %sp,-$frame,%sp | ||
| 92 | sll $num,2,$num ! num*=4 | ||
| 93 | or $mask,%lo(0xffffffff),$mask ! mask=0xffffffff | ||
| 94 | ld [$n0],$n0 ! replace the n0 pointer with the word it points at | ||
| 95 | cmp $ap,$bp ! ap==bp? condition is consumed by the "be" below | ||
| 96 | and $num,$mask,$num ! keep only the low 32 bits of num*4 | ||
| 97 | ld [$bp],$mul0 ! bp[0] | ||
| 98 | nop | ||
| 99 | |||
| 100 | add %sp,$bias,%o7 ! real top of stack | ||
| 101 | ld [$ap],$car0 ! ap[0] ! redundant in squaring context | ||
| 102 | sub %o7,$num,%o7 | ||
| 103 | ld [$ap+4],$apj ! ap[1] | ||
| 104 | and %o7,-1024,%o7 ! round down to a 1024-byte boundary | ||
| 105 | ld [$np],$car1 ! np[0] | ||
| 106 | sub %o7,$bias,%sp ! alloca | ||
| 107 | ld [$np+4],$npj ! np[1] | ||
| 108 | be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont ! ap==bp: take the squaring path | ||
| 109 | mov 12,$j ! (delay slot) j=3 words, as a byte offset | ||
| 110 | |||
| 111 | mulx $car0,$mul0,$car0 ! ap[0]*bp[0] | ||
| 112 | mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0] | ||
| 113 | and $car0,$mask,$acc0 | ||
| 114 | add %sp,$bias+$frame,$tp ! tp=start of the temporary vector above the frame | ||
| 115 | ld [$ap+8],$apj !prologue! | ||
| 116 | |||
| 117 | mulx $n0,$acc0,$mul1 ! "t[0]"*n0 | ||
| 118 | and $mul1,$mask,$mul1 ! truncate to 32 bits | ||
| 119 | |||
| 120 | mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 | ||
| 121 | mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0 | ||
| 122 | srlx $car0,32,$car0 ! keep the high half as carry | ||
| 123 | add $acc0,$car1,$car1 | ||
| 124 | ld [$np+8],$npj !prologue! | ||
| 125 | srlx $car1,32,$car1 ! low word is not stored, only the carry is kept | ||
| 126 | mov $tmp0,$acc0 !prologue! | ||
| 127 | |||
| 128 | .L1st: | ||
| 129 | mulx $apj,$mul0,$tmp0 | ||
| 130 | mulx $npj,$mul1,$tmp1 | ||
| 131 | add $acc0,$car0,$car0 | ||
| 132 | ld [$ap+$j],$apj ! ap[j] | ||
| 133 | and $car0,$mask,$acc0 | ||
| 134 | add $acc1,$car1,$car1 | ||
| 135 | ld [$np+$j],$npj ! np[j] | ||
| 136 | srlx $car0,32,$car0 | ||
| 137 | add $acc0,$car1,$car1 | ||
| 138 | add $j,4,$j ! j++ | ||
| 139 | mov $tmp0,$acc0 | ||
| 140 | st $car1,[$tp] | ||
| 141 | cmp $j,$num | ||
| 142 | mov $tmp1,$acc1 | ||
| 143 | srlx $car1,32,$car1 | ||
| 144 | bl %icc,.L1st | ||
| 145 | add $tp,4,$tp ! tp++ | ||
| 146 | !.L1st | ||
| 147 | |||
| 148 | mulx $apj,$mul0,$tmp0 !epilogue! | ||
| 149 | mulx $npj,$mul1,$tmp1 | ||
| 150 | add $acc0,$car0,$car0 | ||
| 151 | and $car0,$mask,$acc0 | ||
| 152 | add $acc1,$car1,$car1 | ||
| 153 | srlx $car0,32,$car0 | ||
| 154 | add $acc0,$car1,$car1 | ||
| 155 | st $car1,[$tp] | ||
| 156 | srlx $car1,32,$car1 | ||
| 157 | |||
| 158 | add $tmp0,$car0,$car0 | ||
| 159 | and $car0,$mask,$acc0 | ||
| 160 | add $tmp1,$car1,$car1 | ||
| 161 | srlx $car0,32,$car0 | ||
| 162 | add $acc0,$car1,$car1 | ||
| 163 | st $car1,[$tp+4] | ||
| 164 | srlx $car1,32,$car1 | ||
| 165 | |||
| 166 | add $car0,$car1,$car1 | ||
| 167 | st $car1,[$tp+8] | ||
| 168 | srlx $car1,32,$car2 | ||
| 169 | |||
| 170 | mov 4,$i ! i++ | ||
| 171 | ld [$bp+4],$mul0 ! bp[1] | ||
| 172 | .Louter: | ||
| 173 | add %sp,$bias+$frame,$tp | ||
| 174 | ld [$ap],$car0 ! ap[0] | ||
| 175 | ld [$ap+4],$apj ! ap[1] | ||
| 176 | ld [$np],$car1 ! np[0] | ||
| 177 | ld [$np+4],$npj ! np[1] | ||
| 178 | ld [$tp],$tmp1 ! tp[0] | ||
| 179 | ld [$tp+4],$tpj ! tp[1] | ||
| 180 | mov 12,$j | ||
| 181 | |||
| 182 | mulx $car0,$mul0,$car0 | ||
| 183 | mulx $apj,$mul0,$tmp0 !prologue! | ||
| 184 | add $tmp1,$car0,$car0 | ||
| 185 | ld [$ap+8],$apj !prologue! | ||
| 186 | and $car0,$mask,$acc0 | ||
| 187 | |||
| 188 | mulx $n0,$acc0,$mul1 | ||
| 189 | and $mul1,$mask,$mul1 | ||
| 190 | |||
| 191 | mulx $car1,$mul1,$car1 | ||
| 192 | mulx $npj,$mul1,$acc1 !prologue! | ||
| 193 | srlx $car0,32,$car0 | ||
| 194 | add $acc0,$car1,$car1 | ||
| 195 | ld [$np+8],$npj !prologue! | ||
| 196 | srlx $car1,32,$car1 | ||
| 197 | mov $tmp0,$acc0 !prologue! | ||
| 198 | |||
| 199 | .Linner: | ||
| 200 | mulx $apj,$mul0,$tmp0 | ||
| 201 | mulx $npj,$mul1,$tmp1 | ||
| 202 | add $tpj,$car0,$car0 | ||
| 203 | ld [$ap+$j],$apj ! ap[j] | ||
| 204 | add $acc0,$car0,$car0 | ||
| 205 | add $acc1,$car1,$car1 | ||
| 206 | ld [$np+$j],$npj ! np[j] | ||
| 207 | and $car0,$mask,$acc0 | ||
| 208 | ld [$tp+8],$tpj ! tp[j] | ||
| 209 | srlx $car0,32,$car0 | ||
| 210 | add $acc0,$car1,$car1 | ||
| 211 | add $j,4,$j ! j++ | ||
| 212 | mov $tmp0,$acc0 | ||
| 213 | st $car1,[$tp] ! tp[j-1] | ||
| 214 | srlx $car1,32,$car1 | ||
| 215 | mov $tmp1,$acc1 | ||
| 216 | cmp $j,$num | ||
| 217 | bl %icc,.Linner | ||
| 218 | add $tp,4,$tp ! tp++ | ||
| 219 | !.Linner | ||
| 220 | |||
| 221 | mulx $apj,$mul0,$tmp0 !epilogue! | ||
| 222 | mulx $npj,$mul1,$tmp1 | ||
| 223 | add $tpj,$car0,$car0 | ||
| 224 | add $acc0,$car0,$car0 | ||
| 225 | ld [$tp+8],$tpj ! tp[j] | ||
| 226 | and $car0,$mask,$acc0 | ||
| 227 | add $acc1,$car1,$car1 | ||
| 228 | srlx $car0,32,$car0 | ||
| 229 | add $acc0,$car1,$car1 | ||
| 230 | st $car1,[$tp] ! tp[j-1] | ||
| 231 | srlx $car1,32,$car1 | ||
| 232 | |||
| 233 | add $tpj,$car0,$car0 | ||
| 234 | add $tmp0,$car0,$car0 | ||
| 235 | and $car0,$mask,$acc0 | ||
| 236 | add $tmp1,$car1,$car1 | ||
| 237 | add $acc0,$car1,$car1 | ||
| 238 | st $car1,[$tp+4] ! tp[j-1] | ||
| 239 | srlx $car0,32,$car0 | ||
| 240 | add $i,4,$i ! i++ | ||
| 241 | srlx $car1,32,$car1 | ||
| 242 | |||
| 243 | add $car0,$car1,$car1 | ||
| 244 | cmp $i,$num | ||
| 245 | add $car2,$car1,$car1 | ||
| 246 | st $car1,[$tp+8] | ||
| 247 | |||
| 248 | srlx $car1,32,$car2 | ||
| 249 | bl,a %icc,.Louter | ||
| 250 | ld [$bp+$i],$mul0 ! bp[i] | ||
| 251 | !.Louter | ||
| 252 | |||
| 253 | add $tp,12,$tp | ||
| 254 | |||
| 255 | .Ltail: | ||
| 256 | add $np,$num,$np ! advance np by num bytes, past its last word | ||
| 257 | add $rp,$num,$rp ! advance rp by num bytes, past its last word | ||
| 258 | mov $tp,$ap ! ap=tp | ||
| 259 | sub %g0,$num,%o7 ! k=-num | ||
| 260 | ba .Lsub | ||
| 261 | subcc %g0,%g0,%g0 ! clear %icc.c | ||
| 262 | .align 16 | ||
| 263 | .Lsub: | ||
| 264 | ld [$tp+%o7],%o0 ! tp[j] | ||
| 265 | ld [$np+%o7],%o1 ! np[j] | ||
| 266 | subccc %o0,%o1,%o1 ! tp[j]-np[j] | ||
| 267 | add $rp,%o7,$i ! address of rp[j] | ||
| 268 | add %o7,4,%o7 ! k+=4 | ||
| 269 | brnz %o7,.Lsub ! loop until k reaches 0 | ||
| 270 | st %o1,[$i] ! (delay slot) rp[j]=tp[j]-np[j] | ||
| 271 | subc $car2,0,$car2 ! handle upmost overflow bit | ||
| 272 | and $tp,$car2,$ap | ||
| 273 | andn $rp,$car2,$np | ||
| 274 | or $ap,$np,$ap ! ap=tp if the mask is all ones, rp if it is zero | ||
| 275 | sub %g0,$num,%o7 ! k=-num | ||
| 276 | |||
| 277 | .Lcopy: | ||
| 278 | ld [$ap+%o7],%o0 ! copy or in-place refresh | ||
| 279 | st %g0,[$tp+%o7] ! zap tp | ||
| 280 | st %o0,[$rp+%o7] | ||
| 281 | add %o7,4,%o7 ! k+=4 | ||
| 282 | brnz %o7,.Lcopy ! loop until k reaches 0 | ||
| 283 | nop | ||
| 284 | mov 1,%i0 ! return 1 | ||
| 285 | ret | ||
| 286 | restore | ||
| 287 | ___ | ||
| 288 | |||
| 289 | ######## | ||
| 290 | ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over | ||
| 291 | ######## code without following dedicated squaring procedure. | ||
| 292 | ######## | ||
| 293 | $sbit="%i2"; # re-use $bp! | ||
| 294 | |||
| 295 | $code.=<<___; | ||
| 296 | .align 32 | ||
| 297 | .Lbn_sqr_mont: | ||
| 298 | mulx $mul0,$mul0,$car0 ! ap[0]*ap[0] | ||
| 299 | mulx $apj,$mul0,$tmp0 !prologue! | ||
| 300 | and $car0,$mask,$acc0 | ||
| 301 | add %sp,$bias+$frame,$tp | ||
| 302 | ld [$ap+8],$apj !prologue! | ||
| 303 | |||
| 304 | mulx $n0,$acc0,$mul1 ! "t[0]"*n0 | ||
| 305 | srlx $car0,32,$car0 | ||
| 306 | and $mul1,$mask,$mul1 | ||
| 307 | |||
| 308 | mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 | ||
| 309 | mulx $npj,$mul1,$acc1 !prologue! | ||
| 310 | and $car0,1,$sbit | ||
| 311 | ld [$np+8],$npj !prologue! | ||
| 312 | srlx $car0,1,$car0 | ||
| 313 | add $acc0,$car1,$car1 | ||
| 314 | srlx $car1,32,$car1 | ||
| 315 | mov $tmp0,$acc0 !prologue! | ||
| 316 | |||
| 317 | .Lsqr_1st: | ||
| 318 | mulx $apj,$mul0,$tmp0 | ||
| 319 | mulx $npj,$mul1,$tmp1 | ||
| 320 | add $acc0,$car0,$car0 ! ap[j]*a0+c0 | ||
| 321 | add $acc1,$car1,$car1 | ||
| 322 | ld [$ap+$j],$apj ! ap[j] | ||
| 323 | and $car0,$mask,$acc0 | ||
| 324 | ld [$np+$j],$npj ! np[j] | ||
| 325 | srlx $car0,32,$car0 | ||
| 326 | add $acc0,$acc0,$acc0 | ||
| 327 | or $sbit,$acc0,$acc0 | ||
| 328 | mov $tmp1,$acc1 | ||
| 329 | srlx $acc0,32,$sbit | ||
| 330 | add $j,4,$j ! j++ | ||
| 331 | and $acc0,$mask,$acc0 | ||
| 332 | cmp $j,$num | ||
| 333 | add $acc0,$car1,$car1 | ||
| 334 | st $car1,[$tp] | ||
| 335 | mov $tmp0,$acc0 | ||
| 336 | srlx $car1,32,$car1 | ||
| 337 | bl %icc,.Lsqr_1st | ||
| 338 | add $tp,4,$tp ! tp++ | ||
| 339 | !.Lsqr_1st | ||
| 340 | |||
| 341 | mulx $apj,$mul0,$tmp0 ! epilogue | ||
| 342 | mulx $npj,$mul1,$tmp1 | ||
| 343 | add $acc0,$car0,$car0 ! ap[j]*a0+c0 | ||
| 344 | add $acc1,$car1,$car1 | ||
| 345 | and $car0,$mask,$acc0 | ||
| 346 | srlx $car0,32,$car0 | ||
| 347 | add $acc0,$acc0,$acc0 | ||
| 348 | or $sbit,$acc0,$acc0 | ||
| 349 | srlx $acc0,32,$sbit | ||
| 350 | and $acc0,$mask,$acc0 | ||
| 351 | add $acc0,$car1,$car1 | ||
| 352 | st $car1,[$tp] | ||
| 353 | srlx $car1,32,$car1 | ||
| 354 | |||
| 355 | add $tmp0,$car0,$car0 ! ap[j]*a0+c0 | ||
| 356 | add $tmp1,$car1,$car1 | ||
| 357 | and $car0,$mask,$acc0 | ||
| 358 | srlx $car0,32,$car0 | ||
| 359 | add $acc0,$acc0,$acc0 | ||
| 360 | or $sbit,$acc0,$acc0 | ||
| 361 | srlx $acc0,32,$sbit | ||
| 362 | and $acc0,$mask,$acc0 | ||
| 363 | add $acc0,$car1,$car1 | ||
| 364 | st $car1,[$tp+4] | ||
| 365 | srlx $car1,32,$car1 | ||
| 366 | |||
| 367 | add $car0,$car0,$car0 | ||
| 368 | or $sbit,$car0,$car0 | ||
| 369 | add $car0,$car1,$car1 | ||
| 370 | st $car1,[$tp+8] | ||
| 371 | srlx $car1,32,$car2 | ||
| 372 | |||
| 373 | ld [%sp+$bias+$frame],$tmp0 ! tp[0] | ||
| 374 | ld [%sp+$bias+$frame+4],$tmp1 ! tp[1] | ||
| 375 | ld [%sp+$bias+$frame+8],$tpj ! tp[2] | ||
| 376 | ld [$ap+4],$mul0 ! ap[1] | ||
| 377 | ld [$ap+8],$apj ! ap[2] | ||
| 378 | ld [$np],$car1 ! np[0] | ||
| 379 | ld [$np+4],$npj ! np[1] | ||
| 380 | mulx $n0,$tmp0,$mul1 | ||
| 381 | |||
| 382 | mulx $mul0,$mul0,$car0 | ||
| 383 | and $mul1,$mask,$mul1 | ||
| 384 | |||
| 385 | mulx $car1,$mul1,$car1 | ||
| 386 | mulx $npj,$mul1,$acc1 | ||
| 387 | add $tmp0,$car1,$car1 | ||
| 388 | and $car0,$mask,$acc0 | ||
| 389 | ld [$np+8],$npj ! np[2] | ||
| 390 | srlx $car1,32,$car1 | ||
| 391 | add $tmp1,$car1,$car1 | ||
| 392 | srlx $car0,32,$car0 | ||
| 393 | add $acc0,$car1,$car1 | ||
| 394 | and $car0,1,$sbit | ||
| 395 | add $acc1,$car1,$car1 | ||
| 396 | srlx $car0,1,$car0 | ||
| 397 | mov 12,$j | ||
| 398 | st $car1,[%sp+$bias+$frame] ! tp[0]= | ||
| 399 | srlx $car1,32,$car1 | ||
| 400 | add %sp,$bias+$frame+4,$tp | ||
| 401 | |||
| 402 | .Lsqr_2nd: | ||
| 403 | mulx $apj,$mul0,$acc0 | ||
| 404 | mulx $npj,$mul1,$acc1 | ||
| 405 | add $acc0,$car0,$car0 | ||
| 406 | add $tpj,$car1,$car1 | ||
| 407 | ld [$ap+$j],$apj ! ap[j] | ||
| 408 | and $car0,$mask,$acc0 | ||
| 409 | ld [$np+$j],$npj ! np[j] | ||
| 410 | srlx $car0,32,$car0 | ||
| 411 | add $acc1,$car1,$car1 | ||
| 412 | ld [$tp+8],$tpj ! tp[j] | ||
| 413 | add $acc0,$acc0,$acc0 | ||
| 414 | add $j,4,$j ! j++ | ||
| 415 | or $sbit,$acc0,$acc0 | ||
| 416 | srlx $acc0,32,$sbit | ||
| 417 | and $acc0,$mask,$acc0 | ||
| 418 | cmp $j,$num | ||
| 419 | add $acc0,$car1,$car1 | ||
| 420 | st $car1,[$tp] ! tp[j-1] | ||
| 421 | srlx $car1,32,$car1 | ||
| 422 | bl %icc,.Lsqr_2nd | ||
| 423 | add $tp,4,$tp ! tp++ | ||
| 424 | !.Lsqr_2nd | ||
| 425 | |||
| 426 | mulx $apj,$mul0,$acc0 | ||
| 427 | mulx $npj,$mul1,$acc1 | ||
| 428 | add $acc0,$car0,$car0 | ||
| 429 | add $tpj,$car1,$car1 | ||
| 430 | and $car0,$mask,$acc0 | ||
| 431 | srlx $car0,32,$car0 | ||
| 432 | add $acc1,$car1,$car1 | ||
| 433 | add $acc0,$acc0,$acc0 | ||
| 434 | or $sbit,$acc0,$acc0 | ||
| 435 | srlx $acc0,32,$sbit | ||
| 436 | and $acc0,$mask,$acc0 | ||
| 437 | add $acc0,$car1,$car1 | ||
| 438 | st $car1,[$tp] ! tp[j-1] | ||
| 439 | srlx $car1,32,$car1 | ||
| 440 | |||
| 441 | add $car0,$car0,$car0 | ||
| 442 | or $sbit,$car0,$car0 | ||
| 443 | add $car0,$car1,$car1 | ||
| 444 | add $car2,$car1,$car1 | ||
| 445 | st $car1,[$tp+4] | ||
| 446 | srlx $car1,32,$car2 | ||
| 447 | |||
| 448 | ld [%sp+$bias+$frame],$tmp1 ! tp[0] | ||
| 449 | ld [%sp+$bias+$frame+4],$tpj ! tp[1] | ||
| 450 | ld [$ap+8],$mul0 ! ap[2] | ||
| 451 | ld [$np],$car1 ! np[0] | ||
| 452 | ld [$np+4],$npj ! np[1] | ||
| 453 | mulx $n0,$tmp1,$mul1 | ||
| 454 | and $mul1,$mask,$mul1 | ||
| 455 | mov 8,$i | ||
| 456 | |||
| 457 | mulx $mul0,$mul0,$car0 | ||
| 458 | mulx $car1,$mul1,$car1 | ||
| 459 | and $car0,$mask,$acc0 | ||
| 460 | add $tmp1,$car1,$car1 | ||
| 461 | srlx $car0,32,$car0 | ||
| 462 | add %sp,$bias+$frame,$tp | ||
| 463 | srlx $car1,32,$car1 | ||
| 464 | and $car0,1,$sbit | ||
| 465 | srlx $car0,1,$car0 | ||
| 466 | mov 4,$j | ||
| 467 | |||
| 468 | .Lsqr_outer: | ||
| 469 | .Lsqr_inner1: | ||
| 470 | mulx $npj,$mul1,$acc1 | ||
| 471 | add $tpj,$car1,$car1 | ||
| 472 | add $j,4,$j | ||
| 473 | ld [$tp+8],$tpj | ||
| 474 | cmp $j,$i | ||
| 475 | add $acc1,$car1,$car1 | ||
| 476 | ld [$np+$j],$npj | ||
| 477 | st $car1,[$tp] | ||
| 478 | srlx $car1,32,$car1 | ||
| 479 | bl %icc,.Lsqr_inner1 | ||
| 480 | add $tp,4,$tp | ||
| 481 | !.Lsqr_inner1 | ||
| 482 | |||
| 483 | add $j,4,$j | ||
| 484 | ld [$ap+$j],$apj ! ap[j] | ||
| 485 | mulx $npj,$mul1,$acc1 | ||
| 486 | add $tpj,$car1,$car1 | ||
| 487 | ld [$np+$j],$npj ! np[j] | ||
| 488 | add $acc0,$car1,$car1 | ||
| 489 | ld [$tp+8],$tpj ! tp[j] | ||
| 490 | add $acc1,$car1,$car1 | ||
| 491 | st $car1,[$tp] | ||
| 492 | srlx $car1,32,$car1 | ||
| 493 | |||
| 494 | add $j,4,$j | ||
| 495 | cmp $j,$num | ||
| 496 | be,pn %icc,.Lsqr_no_inner2 | ||
| 497 | add $tp,4,$tp | ||
| 498 | |||
| 499 | .Lsqr_inner2: | ||
| 500 | mulx $apj,$mul0,$acc0 | ||
| 501 | mulx $npj,$mul1,$acc1 | ||
| 502 | add $tpj,$car1,$car1 | ||
| 503 | add $acc0,$car0,$car0 | ||
| 504 | ld [$ap+$j],$apj ! ap[j] | ||
| 505 | and $car0,$mask,$acc0 | ||
| 506 | ld [$np+$j],$npj ! np[j] | ||
| 507 | srlx $car0,32,$car0 | ||
| 508 | add $acc0,$acc0,$acc0 | ||
| 509 | ld [$tp+8],$tpj ! tp[j] | ||
| 510 | or $sbit,$acc0,$acc0 | ||
| 511 | add $j,4,$j ! j++ | ||
| 512 | srlx $acc0,32,$sbit | ||
| 513 | and $acc0,$mask,$acc0 | ||
| 514 | cmp $j,$num | ||
| 515 | add $acc0,$car1,$car1 | ||
| 516 | add $acc1,$car1,$car1 | ||
| 517 | st $car1,[$tp] ! tp[j-1] | ||
| 518 | srlx $car1,32,$car1 | ||
| 519 | bl %icc,.Lsqr_inner2 | ||
| 520 | add $tp,4,$tp ! tp++ | ||
| 521 | |||
| 522 | .Lsqr_no_inner2: | ||
| 523 | mulx $apj,$mul0,$acc0 | ||
| 524 | mulx $npj,$mul1,$acc1 | ||
| 525 | add $tpj,$car1,$car1 | ||
| 526 | add $acc0,$car0,$car0 | ||
| 527 | and $car0,$mask,$acc0 | ||
| 528 | srlx $car0,32,$car0 | ||
| 529 | add $acc0,$acc0,$acc0 | ||
| 530 | or $sbit,$acc0,$acc0 | ||
| 531 | srlx $acc0,32,$sbit | ||
| 532 | and $acc0,$mask,$acc0 | ||
| 533 | add $acc0,$car1,$car1 | ||
| 534 | add $acc1,$car1,$car1 | ||
| 535 | st $car1,[$tp] ! tp[j-1] | ||
| 536 | srlx $car1,32,$car1 | ||
| 537 | |||
| 538 | add $car0,$car0,$car0 | ||
| 539 | or $sbit,$car0,$car0 | ||
| 540 | add $car0,$car1,$car1 | ||
| 541 | add $car2,$car1,$car1 | ||
| 542 | st $car1,[$tp+4] | ||
| 543 | srlx $car1,32,$car2 | ||
| 544 | |||
| 545 | add $i,4,$i ! i++ | ||
| 546 | ld [%sp+$bias+$frame],$tmp1 ! tp[0] | ||
| 547 | ld [%sp+$bias+$frame+4],$tpj ! tp[1] | ||
| 548 | ld [$ap+$i],$mul0 ! ap[j] | ||
| 549 | ld [$np],$car1 ! np[0] | ||
| 550 | ld [$np+4],$npj ! np[1] | ||
| 551 | mulx $n0,$tmp1,$mul1 | ||
| 552 | and $mul1,$mask,$mul1 | ||
| 553 | add $i,4,$tmp0 | ||
| 554 | |||
| 555 | mulx $mul0,$mul0,$car0 | ||
| 556 | mulx $car1,$mul1,$car1 | ||
| 557 | and $car0,$mask,$acc0 | ||
| 558 | add $tmp1,$car1,$car1 | ||
| 559 | srlx $car0,32,$car0 | ||
| 560 | add %sp,$bias+$frame,$tp | ||
| 561 | srlx $car1,32,$car1 | ||
| 562 | and $car0,1,$sbit | ||
| 563 | srlx $car0,1,$car0 | ||
| 564 | |||
| 565 | cmp $tmp0,$num ! i<num-1 | ||
| 566 | bl %icc,.Lsqr_outer | ||
| 567 | mov 4,$j | ||
| 568 | |||
| 569 | .Lsqr_last: | ||
| 570 | mulx $npj,$mul1,$acc1 | ||
| 571 | add $tpj,$car1,$car1 | ||
| 572 | add $j,4,$j | ||
| 573 | ld [$tp+8],$tpj | ||
| 574 | cmp $j,$i | ||
| 575 | add $acc1,$car1,$car1 | ||
| 576 | ld [$np+$j],$npj | ||
| 577 | st $car1,[$tp] | ||
| 578 | srlx $car1,32,$car1 | ||
| 579 | bl %icc,.Lsqr_last | ||
| 580 | add $tp,4,$tp | ||
| 581 | !.Lsqr_last | ||
| 582 | |||
| 583 | mulx $npj,$mul1,$acc1 | ||
| 584 | add $tpj,$car1,$car1 | ||
| 585 | add $acc0,$car1,$car1 | ||
| 586 | add $acc1,$car1,$car1 | ||
| 587 | st $car1,[$tp] | ||
| 588 | srlx $car1,32,$car1 | ||
| 589 | |||
| 590 | add $car0,$car0,$car0 ! recover $car0 | ||
| 591 | or $sbit,$car0,$car0 | ||
| 592 | add $car0,$car1,$car1 | ||
| 593 | add $car2,$car1,$car1 | ||
| 594 | st $car1,[$tp+4] | ||
| 595 | srlx $car1,32,$car2 | ||
| 596 | |||
| 597 | ba .Ltail | ||
| 598 | add $tp,8,$tp | ||
| 599 | .type $fname,#function | ||
| 600 | .size $fname,(.-$fname) | ||
| 601 | .asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 602 | .align 32 | ||
| 603 | ___ | ||
| 604 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; # replace every backtick-quoted expression in the template with its evaluated value | ||
| 605 | print $code; # write the generated assembly to STDOUT | ||
| 606 | close STDOUT; # close the output stream | ||
diff --git a/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl b/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl new file mode 100755 index 0000000000..a14205f2f0 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl | |||
| @@ -0,0 +1,882 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # October 2005 | ||
| 11 | # | ||
| 12 | # "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU? | ||
| 13 | # Because unlike integer multiplier, which simply stalls whole CPU, | ||
| 14 | # FPU is fully pipelined and can effectively emit 48 bit partial | ||
| 15 | # product every cycle. Why not blended SPARC v9? One can argue that | ||
| 16 | # making this module dependent on UltraSPARC VIS extension limits its | ||
| 17 | # binary compatibility. Well yes, it does exclude SPARC64 prior-V(!) | ||
| 18 | # implementations from compatibility matrix. But the rest, whole Sun | ||
| 19 | # UltraSPARC family and brand new Fujitsu's SPARC64 V, all support | ||
| 20 | # VIS extension instructions used in this module. This is considered | ||
| 21 | # good enough to not care about HAL SPARC64 users [if any] who have | ||
| 22 | # integer-only pure SPARCv9 module to "fall down" to. | ||
| 23 | |||
| 24 | # USI&II cores currently exhibit uniform 2x improvement [over pre- | ||
| 25 | # bn_mul_mont codebase] for all key lengths and benchmarks. On USIII | ||
| 26 | # performance improves few percents for shorter keys and worsens few | ||
| 27 | # percents for longer keys. This is because USIII integer multiplier | ||
| 28 | # is >3x faster than USI&II one, which is harder to match [but see | ||
| 29 | # TODO list below]. It should also be noted that SPARC64 V features | ||
| 30 | # out-of-order execution, which *might* mean that integer multiplier | ||
| 31 | # is pipelined, which in turn *might* be impossible to match... On an | ||
| 32 | # additional note, SPARC64 V implements FP Multiply-Add instruction, | ||
| 33 | # which is perfectly usable in this context... In other words, as far | ||
| 34 | # as Fujitsu SPARC64 V goes, talk to the author:-) | ||
| 35 | |||
| 36 | # The implementation implies following "non-natural" limitations on | ||
| 37 | # input arguments: | ||
| 38 | # - num may not be less than 4; | ||
| 39 | # - num has to be even; | ||
| 40 | # Failure to meet either condition has no fatal effects, simply | ||
| 41 | # doesn't give any performance gain. | ||
| 42 | |||
| 43 | # TODO: | ||
| 44 | # - modulo-schedule inner loop for better performance (on in-order | ||
| 45 | # execution core such as UltraSPARC this shall result in further | ||
| 46 | # noticeable(!) improvement); | ||
| 47 | # - dedicated squaring procedure[?]; | ||
| 48 | |||
| 49 | ###################################################################### | ||
| 50 | # November 2006 | ||
| 51 | # | ||
| 52 | # Modulo-scheduled inner loops allow to interleave floating point and | ||
| 53 | # integer instructions and minimize Read-After-Write penalties. This | ||
| 54 | # results in *further* 20-50% performance improvement [depending on | ||
| 55 | # key length, more for longer keys] on USI&II cores and 30-80% - on | ||
| 56 | # USIII&IV. | ||
| 57 | |||
| 58 | $fname="bn_mul_mont_fpu"; # global symbol emitted for the generated routine | ||
| 59 | $bits=32; # target ABI width; stays 32 unless a 64-bit flag is seen below | ||
| 60 | for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } # -m64 or -xarch=v9 on the command line selects 64-bit | ||
| 61 | |||
| 62 | if ($bits==64) { | ||
| 63 | $bias=2047; # added to %sp wherever the template addresses the stack | ||
| 64 | $frame=192; | ||
| 65 | } else { | ||
| 66 | $bias=0; | ||
| 67 | $frame=128; # 96 rounded up to largest known cache-line | ||
| 68 | } | ||
| 69 | $locals=64; # extra bytes reserved by "save"; the template uses %sp+bias+frame+0..48 as scratch | ||
| 70 | |||
| 71 | # In order to provide for 32-/64-bit ABI duality, I keep integers wider | ||
| 72 | # than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used | ||
| 73 | # exclusively for pointers, indexes and other small values... | ||
| 74 | # int bn_mul_mont( | ||
| 75 | $rp="%i0"; # BN_ULONG *rp, | ||
| 76 | $ap="%i1"; # const BN_ULONG *ap, | ||
| 77 | $bp="%i2"; # const BN_ULONG *bp, | ||
| 78 | $np="%i3"; # const BN_ULONG *np, | ||
| 79 | $n0="%i4"; # const BN_ULONG *n0, | ||
| 80 | $num="%i5"; # int num); | ||
| 81 | |||
| 82 | $tp="%l0"; # t[num] | ||
| 83 | $ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved | ||
| 84 | $ap_h="%l2"; # to these four vectors as double-precision FP values. | ||
| 85 | $np_l="%l3"; # This way a bunch of fxtods are eliminated in second | ||
| 86 | $np_h="%l4"; # loop and L1-cache aliasing is minimized... | ||
| 87 | $i="%l5"; # outer index; initialised to -num and used as a byte offset | ||
| 88 | $j="%l6"; # inner index; starts at -num and is stepped by 8 until it reaches zero | ||
| 89 | $mask="%l7"; # 16-bit mask, 0xffff | ||
| 90 | |||
| 91 | $n0="%g4"; # reassigned(!) to "64-bit" register | ||
| 92 | $carry="%i4"; # %i4 reused(!) for a carry bit | ||
| 93 | |||
| 94 | # FP register naming chart | ||
| 95 | # | ||
| 96 | # ..HILO | ||
| 97 | # dcba | ||
| 98 | # -------- | ||
| 99 | # LOa | ||
| 100 | # LOb | ||
| 101 | # LOc | ||
| 102 | # LOd | ||
| 103 | # HIa | ||
| 104 | # HIb | ||
| 105 | # HIc | ||
| 106 | # HId | ||
| 107 | # ..a | ||
| 108 | # ..b | ||
| 109 | $ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6"; # b[i] as four 16-bit pieces | ||
| 110 | $na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14"; # the ap[0]*b[0]*n0 word as four 16-bit pieces | ||
| 111 | $alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19"; # a[j] as two 32-bit words; the "_" register is the half that "ld" fills | ||
| 112 | $nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23"; # n[j] as two 32-bit words, same layout | ||
| 113 | |||
| 114 | $dota="%f24"; $dotb="%f26"; # top partial sums, added into the low sums on the following step | ||
| 115 | |||
| 116 | $aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38"; # products of alo with ba..bd | ||
| 117 | $ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46"; # products of ahi with ba..bd | ||
| 118 | $nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54"; # products of nlo with na..nd | ||
| 119 | $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62"; # products of nhi with na..nd | ||
| 120 | |||
| 121 | $ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load | ||
| 122 | |||
| 123 | $code=<<___; | ||
| 124 | .section ".text",#alloc,#execinstr | ||
| 125 | |||
| 126 | .global $fname | ||
| 127 | .align 32 | ||
| 128 | $fname: | ||
| 129 | save %sp,-$frame-$locals,%sp | ||
| 130 | |||
| 131 | cmp $num,4 | ||
| 132 | bl,a,pn %icc,.Lret | ||
| 133 | clr %i0 | ||
| 134 | andcc $num,1,%g0 ! $num has to be even... | ||
| 135 | bnz,a,pn %icc,.Lret | ||
| 136 | clr %i0 ! signal "unsupported input value" | ||
| 137 | |||
| 138 | srl $num,1,$num | ||
| 139 | sethi %hi(0xffff),$mask | ||
| 140 | ld [%i4+0],$n0 ! $n0 reassigned, remember? | ||
| 141 | or $mask,%lo(0xffff),$mask | ||
| 142 | ld [%i4+4],%o0 | ||
| 143 | sllx %o0,32,%o0 | ||
| 144 | or %o0,$n0,$n0 ! $n0=n0[1].n0[0] | ||
| 145 | |||
| 146 | sll $num,3,$num ! num*=8 | ||
| 147 | |||
| 148 | add %sp,$bias,%o0 ! real top of stack | ||
| 149 | sll $num,2,%o1 | ||
| 150 | add %o1,$num,%o1 ! %o1=num*5 | ||
| 151 | sub %o0,%o1,%o0 | ||
| 152 | and %o0,-2048,%o0 ! optimize TLB utilization | ||
| 153 | sub %o0,$bias,%sp ! alloca(5*num*8) | ||
| 154 | |||
| 155 | rd %asi,%o7 ! save %asi | ||
| 156 | add %sp,$bias+$frame+$locals,$tp | ||
| 157 | add $tp,$num,$ap_l | ||
| 158 | add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends ! | ||
| 159 | add $ap_l,$num,$ap_h | ||
| 160 | add $ap_h,$num,$np_l | ||
| 161 | add $np_l,$num,$np_h | ||
| 162 | |||
| 163 | wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads | ||
| 164 | |||
| 165 | add $rp,$num,$rp ! readjust input pointers to point | ||
| 166 | add $ap,$num,$ap ! at the ends too... | ||
| 167 | add $bp,$num,$bp | ||
| 168 | add $np,$num,$np | ||
| 169 | |||
| 170 | stx %o7,[%sp+$bias+$frame+48] ! save %asi | ||
| 171 | |||
| 172 | sub %g0,$num,$i ! i=-num | ||
| 173 | sub %g0,$num,$j ! j=-num | ||
| 174 | |||
| 175 | add $ap,$j,%o3 | ||
| 176 | add $bp,$i,%o4 | ||
| 177 | |||
| 178 | ld [%o3+4],%g1 ! bp[0] | ||
| 179 | ld [%o3+0],%o0 | ||
| 180 | ld [%o4+4],%g5 ! ap[0] | ||
| 181 | sllx %g1,32,%g1 | ||
| 182 | ld [%o4+0],%o1 | ||
| 183 | sllx %g5,32,%g5 | ||
| 184 | or %g1,%o0,%o0 | ||
| 185 | or %g5,%o1,%o1 | ||
| 186 | |||
| 187 | add $np,$j,%o5 | ||
| 188 | |||
| 189 | mulx %o1,%o0,%o0 ! ap[0]*bp[0] | ||
| 190 | mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0 | ||
| 191 | stx %o0,[%sp+$bias+$frame+0] | ||
| 192 | |||
| 193 | ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words | ||
| 194 | fzeros $alo | ||
| 195 | ld [%o3+4],$ahi_ | ||
| 196 | fzeros $ahi | ||
| 197 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words | ||
| 198 | fzeros $nlo | ||
| 199 | ld [%o5+4],$nhi_ | ||
| 200 | fzeros $nhi | ||
| 201 | |||
| 202 | ! transfer b[i] to FPU as 4x16-bit values | ||
| 203 | ldda [%o4+2]%asi,$ba | ||
| 204 | fxtod $alo,$alo | ||
| 205 | ldda [%o4+0]%asi,$bb | ||
| 206 | fxtod $ahi,$ahi | ||
| 207 | ldda [%o4+6]%asi,$bc | ||
| 208 | fxtod $nlo,$nlo | ||
| 209 | ldda [%o4+4]%asi,$bd | ||
| 210 | fxtod $nhi,$nhi | ||
| 211 | |||
| 212 | ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values | ||
| 213 | ldda [%sp+$bias+$frame+6]%asi,$na | ||
| 214 | fxtod $ba,$ba | ||
| 215 | ldda [%sp+$bias+$frame+4]%asi,$nb | ||
| 216 | fxtod $bb,$bb | ||
| 217 | ldda [%sp+$bias+$frame+2]%asi,$nc | ||
| 218 | fxtod $bc,$bc | ||
| 219 | ldda [%sp+$bias+$frame+0]%asi,$nd | ||
| 220 | fxtod $bd,$bd | ||
| 221 | |||
| 222 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | ||
| 223 | fxtod $na,$na | ||
| 224 | std $ahi,[$ap_h+$j] | ||
| 225 | fxtod $nb,$nb | ||
| 226 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | ||
| 227 | fxtod $nc,$nc | ||
| 228 | std $nhi,[$np_h+$j] | ||
| 229 | fxtod $nd,$nd | ||
| 230 | |||
| 231 | fmuld $alo,$ba,$aloa | ||
| 232 | fmuld $nlo,$na,$nloa | ||
| 233 | fmuld $alo,$bb,$alob | ||
| 234 | fmuld $nlo,$nb,$nlob | ||
| 235 | fmuld $alo,$bc,$aloc | ||
| 236 | faddd $aloa,$nloa,$nloa | ||
| 237 | fmuld $nlo,$nc,$nloc | ||
| 238 | fmuld $alo,$bd,$alod | ||
| 239 | faddd $alob,$nlob,$nlob | ||
| 240 | fmuld $nlo,$nd,$nlod | ||
| 241 | fmuld $ahi,$ba,$ahia | ||
| 242 | faddd $aloc,$nloc,$nloc | ||
| 243 | fmuld $nhi,$na,$nhia | ||
| 244 | fmuld $ahi,$bb,$ahib | ||
| 245 | faddd $alod,$nlod,$nlod | ||
| 246 | fmuld $nhi,$nb,$nhib | ||
| 247 | fmuld $ahi,$bc,$ahic | ||
| 248 | faddd $ahia,$nhia,$nhia | ||
| 249 | fmuld $nhi,$nc,$nhic | ||
| 250 | fmuld $ahi,$bd,$ahid | ||
| 251 | faddd $ahib,$nhib,$nhib | ||
| 252 | fmuld $nhi,$nd,$nhid | ||
| 253 | |||
| 254 | faddd $ahic,$nhic,$dota ! $nhic | ||
| 255 | faddd $ahid,$nhid,$dotb ! $nhid | ||
| 256 | |||
| 257 | faddd $nloc,$nhia,$nloc | ||
| 258 | faddd $nlod,$nhib,$nlod | ||
| 259 | |||
| 260 | fdtox $nloa,$nloa | ||
| 261 | fdtox $nlob,$nlob | ||
| 262 | fdtox $nloc,$nloc | ||
| 263 | fdtox $nlod,$nlod | ||
| 264 | |||
| 265 | std $nloa,[%sp+$bias+$frame+0] | ||
| 266 | add $j,8,$j | ||
| 267 | std $nlob,[%sp+$bias+$frame+8] | ||
| 268 | add $ap,$j,%o4 | ||
| 269 | std $nloc,[%sp+$bias+$frame+16] | ||
| 270 | add $np,$j,%o5 | ||
| 271 | std $nlod,[%sp+$bias+$frame+24] | ||
| 272 | |||
| 273 | ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words | ||
| 274 | fzeros $alo | ||
| 275 | ld [%o4+4],$ahi_ | ||
| 276 | fzeros $ahi | ||
| 277 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words | ||
| 278 | fzeros $nlo | ||
| 279 | ld [%o5+4],$nhi_ | ||
| 280 | fzeros $nhi | ||
| 281 | |||
| 282 | fxtod $alo,$alo | ||
| 283 | fxtod $ahi,$ahi | ||
| 284 | fxtod $nlo,$nlo | ||
| 285 | fxtod $nhi,$nhi | ||
| 286 | |||
| 287 | ldx [%sp+$bias+$frame+0],%o0 | ||
| 288 | fmuld $alo,$ba,$aloa | ||
| 289 | ldx [%sp+$bias+$frame+8],%o1 | ||
| 290 | fmuld $nlo,$na,$nloa | ||
| 291 | ldx [%sp+$bias+$frame+16],%o2 | ||
| 292 | fmuld $alo,$bb,$alob | ||
| 293 | ldx [%sp+$bias+$frame+24],%o3 | ||
| 294 | fmuld $nlo,$nb,$nlob | ||
| 295 | |||
| 296 | srlx %o0,16,%o7 | ||
| 297 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | ||
| 298 | fmuld $alo,$bc,$aloc | ||
| 299 | add %o7,%o1,%o1 | ||
| 300 | std $ahi,[$ap_h+$j] | ||
| 301 | faddd $aloa,$nloa,$nloa | ||
| 302 | fmuld $nlo,$nc,$nloc | ||
| 303 | srlx %o1,16,%o7 | ||
| 304 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | ||
| 305 | fmuld $alo,$bd,$alod | ||
| 306 | add %o7,%o2,%o2 | ||
| 307 | std $nhi,[$np_h+$j] | ||
| 308 | faddd $alob,$nlob,$nlob | ||
| 309 | fmuld $nlo,$nd,$nlod | ||
| 310 | srlx %o2,16,%o7 | ||
| 311 | fmuld $ahi,$ba,$ahia | ||
| 312 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
| 313 | faddd $aloc,$nloc,$nloc | ||
| 314 | fmuld $nhi,$na,$nhia | ||
| 315 | !and %o0,$mask,%o0 | ||
| 316 | !and %o1,$mask,%o1 | ||
| 317 | !and %o2,$mask,%o2 | ||
| 318 | !sllx %o1,16,%o1 | ||
| 319 | !sllx %o2,32,%o2 | ||
| 320 | !sllx %o3,48,%o7 | ||
| 321 | !or %o1,%o0,%o0 | ||
| 322 | !or %o2,%o0,%o0 | ||
| 323 | !or %o7,%o0,%o0 ! 64-bit result | ||
| 324 | srlx %o3,16,%g1 ! 34-bit carry | ||
| 325 | fmuld $ahi,$bb,$ahib | ||
| 326 | |||
| 327 | faddd $alod,$nlod,$nlod | ||
| 328 | fmuld $nhi,$nb,$nhib | ||
| 329 | fmuld $ahi,$bc,$ahic | ||
| 330 | faddd $ahia,$nhia,$nhia | ||
| 331 | fmuld $nhi,$nc,$nhic | ||
| 332 | fmuld $ahi,$bd,$ahid | ||
| 333 | faddd $ahib,$nhib,$nhib | ||
| 334 | fmuld $nhi,$nd,$nhid | ||
| 335 | |||
| 336 | faddd $dota,$nloa,$nloa | ||
| 337 | faddd $dotb,$nlob,$nlob | ||
| 338 | faddd $ahic,$nhic,$dota ! $nhic | ||
| 339 | faddd $ahid,$nhid,$dotb ! $nhid | ||
| 340 | |||
| 341 | faddd $nloc,$nhia,$nloc | ||
| 342 | faddd $nlod,$nhib,$nlod | ||
| 343 | |||
| 344 | fdtox $nloa,$nloa | ||
| 345 | fdtox $nlob,$nlob | ||
| 346 | fdtox $nloc,$nloc | ||
| 347 | fdtox $nlod,$nlod | ||
| 348 | |||
| 349 | std $nloa,[%sp+$bias+$frame+0] | ||
| 350 | std $nlob,[%sp+$bias+$frame+8] | ||
| 351 | addcc $j,8,$j | ||
| 352 | std $nloc,[%sp+$bias+$frame+16] | ||
| 353 | bz,pn %icc,.L1stskip | ||
| 354 | std $nlod,[%sp+$bias+$frame+24] | ||
| 355 | |||
| 356 | .align 32 ! incidentally already aligned ! | ||
| 357 | .L1st: | ||
| 358 | add $ap,$j,%o4 | ||
| 359 | add $np,$j,%o5 | ||
| 360 | ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words | ||
| 361 | fzeros $alo | ||
| 362 | ld [%o4+4],$ahi_ | ||
| 363 | fzeros $ahi | ||
| 364 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words | ||
| 365 | fzeros $nlo | ||
| 366 | ld [%o5+4],$nhi_ | ||
| 367 | fzeros $nhi | ||
| 368 | |||
| 369 | fxtod $alo,$alo | ||
| 370 | fxtod $ahi,$ahi | ||
| 371 | fxtod $nlo,$nlo | ||
| 372 | fxtod $nhi,$nhi | ||
| 373 | |||
| 374 | ldx [%sp+$bias+$frame+0],%o0 | ||
| 375 | fmuld $alo,$ba,$aloa | ||
| 376 | ldx [%sp+$bias+$frame+8],%o1 | ||
| 377 | fmuld $nlo,$na,$nloa | ||
| 378 | ldx [%sp+$bias+$frame+16],%o2 | ||
| 379 | fmuld $alo,$bb,$alob | ||
| 380 | ldx [%sp+$bias+$frame+24],%o3 | ||
| 381 | fmuld $nlo,$nb,$nlob | ||
| 382 | |||
| 383 | srlx %o0,16,%o7 | ||
| 384 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | ||
| 385 | fmuld $alo,$bc,$aloc | ||
| 386 | add %o7,%o1,%o1 | ||
| 387 | std $ahi,[$ap_h+$j] | ||
| 388 | faddd $aloa,$nloa,$nloa | ||
| 389 | fmuld $nlo,$nc,$nloc | ||
| 390 | srlx %o1,16,%o7 | ||
| 391 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | ||
| 392 | fmuld $alo,$bd,$alod | ||
| 393 | add %o7,%o2,%o2 | ||
| 394 | std $nhi,[$np_h+$j] | ||
| 395 | faddd $alob,$nlob,$nlob | ||
| 396 | fmuld $nlo,$nd,$nlod | ||
| 397 | srlx %o2,16,%o7 | ||
| 398 | fmuld $ahi,$ba,$ahia | ||
| 399 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
| 400 | and %o0,$mask,%o0 | ||
| 401 | faddd $aloc,$nloc,$nloc | ||
| 402 | fmuld $nhi,$na,$nhia | ||
| 403 | and %o1,$mask,%o1 | ||
| 404 | and %o2,$mask,%o2 | ||
| 405 | fmuld $ahi,$bb,$ahib | ||
| 406 | sllx %o1,16,%o1 | ||
| 407 | faddd $alod,$nlod,$nlod | ||
| 408 | fmuld $nhi,$nb,$nhib | ||
| 409 | sllx %o2,32,%o2 | ||
| 410 | fmuld $ahi,$bc,$ahic | ||
| 411 | sllx %o3,48,%o7 | ||
| 412 | or %o1,%o0,%o0 | ||
| 413 | faddd $ahia,$nhia,$nhia | ||
| 414 | fmuld $nhi,$nc,$nhic | ||
| 415 | or %o2,%o0,%o0 | ||
| 416 | fmuld $ahi,$bd,$ahid | ||
| 417 | or %o7,%o0,%o0 ! 64-bit result | ||
| 418 | faddd $ahib,$nhib,$nhib | ||
| 419 | fmuld $nhi,$nd,$nhid | ||
| 420 | addcc %g1,%o0,%o0 | ||
| 421 | faddd $dota,$nloa,$nloa | ||
| 422 | srlx %o3,16,%g1 ! 34-bit carry | ||
| 423 | faddd $dotb,$nlob,$nlob | ||
| 424 | bcs,a %xcc,.+8 | ||
| 425 | add %g1,1,%g1 | ||
| 426 | |||
| 427 | stx %o0,[$tp] ! tp[j-1]= | ||
| 428 | |||
| 429 | faddd $ahic,$nhic,$dota ! $nhic | ||
| 430 | faddd $ahid,$nhid,$dotb ! $nhid | ||
| 431 | |||
| 432 | faddd $nloc,$nhia,$nloc | ||
| 433 | faddd $nlod,$nhib,$nlod | ||
| 434 | |||
| 435 | fdtox $nloa,$nloa | ||
| 436 | fdtox $nlob,$nlob | ||
| 437 | fdtox $nloc,$nloc | ||
| 438 | fdtox $nlod,$nlod | ||
| 439 | |||
| 440 | std $nloa,[%sp+$bias+$frame+0] | ||
| 441 | std $nlob,[%sp+$bias+$frame+8] | ||
| 442 | std $nloc,[%sp+$bias+$frame+16] | ||
| 443 | std $nlod,[%sp+$bias+$frame+24] | ||
| 444 | |||
| 445 | addcc $j,8,$j | ||
| 446 | bnz,pt %icc,.L1st | ||
| 447 | add $tp,8,$tp | ||
| 448 | |||
| 449 | .L1stskip: | ||
| 450 | fdtox $dota,$dota | ||
| 451 | fdtox $dotb,$dotb | ||
| 452 | |||
| 453 | ldx [%sp+$bias+$frame+0],%o0 | ||
| 454 | ldx [%sp+$bias+$frame+8],%o1 | ||
| 455 | ldx [%sp+$bias+$frame+16],%o2 | ||
| 456 | ldx [%sp+$bias+$frame+24],%o3 | ||
| 457 | |||
| 458 | srlx %o0,16,%o7 | ||
| 459 | std $dota,[%sp+$bias+$frame+32] | ||
| 460 | add %o7,%o1,%o1 | ||
| 461 | std $dotb,[%sp+$bias+$frame+40] | ||
| 462 | srlx %o1,16,%o7 | ||
| 463 | add %o7,%o2,%o2 | ||
| 464 | srlx %o2,16,%o7 | ||
| 465 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
| 466 | and %o0,$mask,%o0 | ||
| 467 | and %o1,$mask,%o1 | ||
| 468 | and %o2,$mask,%o2 | ||
| 469 | sllx %o1,16,%o1 | ||
| 470 | sllx %o2,32,%o2 | ||
| 471 | sllx %o3,48,%o7 | ||
| 472 | or %o1,%o0,%o0 | ||
| 473 | or %o2,%o0,%o0 | ||
| 474 | or %o7,%o0,%o0 ! 64-bit result | ||
| 475 | ldx [%sp+$bias+$frame+32],%o4 | ||
| 476 | addcc %g1,%o0,%o0 | ||
| 477 | ldx [%sp+$bias+$frame+40],%o5 | ||
| 478 | srlx %o3,16,%g1 ! 34-bit carry | ||
| 479 | bcs,a %xcc,.+8 | ||
| 480 | add %g1,1,%g1 | ||
| 481 | |||
| 482 | stx %o0,[$tp] ! tp[j-1]= | ||
| 483 | add $tp,8,$tp | ||
| 484 | |||
| 485 | srlx %o4,16,%o7 | ||
| 486 | add %o7,%o5,%o5 | ||
| 487 | and %o4,$mask,%o4 | ||
| 488 | sllx %o5,16,%o7 | ||
| 489 | or %o7,%o4,%o4 | ||
| 490 | addcc %g1,%o4,%o4 | ||
| 491 | srlx %o5,48,%g1 | ||
| 492 | bcs,a %xcc,.+8 | ||
| 493 | add %g1,1,%g1 | ||
| 494 | |||
| 495 | mov %g1,$carry | ||
| 496 | stx %o4,[$tp] ! tp[num-1]= | ||
| 497 | |||
| 498 | ba .Louter | ||
| 499 | add $i,8,$i | ||
| 500 | .align 32 | ||
| 501 | .Louter: | ||
| 502 | sub %g0,$num,$j ! j=-num | ||
| 503 | add %sp,$bias+$frame+$locals,$tp | ||
| 504 | |||
| 505 | add $ap,$j,%o3 | ||
| 506 | add $bp,$i,%o4 | ||
| 507 | |||
| 508 | ld [%o3+4],%g1 ! ap[0] | ||
| 509 | ld [%o3+0],%o0 | ||
| 510 | ld [%o4+4],%g5 ! bp[i] | ||
| 511 | sllx %g1,32,%g1 | ||
| 512 | ld [%o4+0],%o1 | ||
| 513 | sllx %g5,32,%g5 | ||
| 514 | or %g1,%o0,%o0 | ||
| 515 | or %g5,%o1,%o1 | ||
| 516 | |||
| 517 | ldx [$tp],%o2 ! tp[0] | ||
| 518 | mulx %o1,%o0,%o0 | ||
| 519 | addcc %o2,%o0,%o0 | ||
| 520 | mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0 | ||
| 521 | stx %o0,[%sp+$bias+$frame+0] | ||
| 522 | |||
| 523 | ! transfer b[i] to FPU as 4x16-bit values | ||
| 524 | ldda [%o4+2]%asi,$ba | ||
| 525 | ldda [%o4+0]%asi,$bb | ||
| 526 | ldda [%o4+6]%asi,$bc | ||
| 527 | ldda [%o4+4]%asi,$bd | ||
| 528 | |||
| 529 | ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values | ||
| 530 | ldda [%sp+$bias+$frame+6]%asi,$na | ||
| 531 | fxtod $ba,$ba | ||
| 532 | ldda [%sp+$bias+$frame+4]%asi,$nb | ||
| 533 | fxtod $bb,$bb | ||
| 534 | ldda [%sp+$bias+$frame+2]%asi,$nc | ||
| 535 | fxtod $bc,$bc | ||
| 536 | ldda [%sp+$bias+$frame+0]%asi,$nd | ||
| 537 | fxtod $bd,$bd | ||
| 538 | ldd [$ap_l+$j],$alo ! load a[j] in double format | ||
| 539 | fxtod $na,$na | ||
| 540 | ldd [$ap_h+$j],$ahi | ||
| 541 | fxtod $nb,$nb | ||
| 542 | ldd [$np_l+$j],$nlo ! load n[j] in double format | ||
| 543 | fxtod $nc,$nc | ||
| 544 | ldd [$np_h+$j],$nhi | ||
| 545 | fxtod $nd,$nd | ||
| 546 | |||
| 547 | fmuld $alo,$ba,$aloa | ||
| 548 | fmuld $nlo,$na,$nloa | ||
| 549 | fmuld $alo,$bb,$alob | ||
| 550 | fmuld $nlo,$nb,$nlob | ||
| 551 | fmuld $alo,$bc,$aloc | ||
| 552 | faddd $aloa,$nloa,$nloa | ||
| 553 | fmuld $nlo,$nc,$nloc | ||
| 554 | fmuld $alo,$bd,$alod | ||
| 555 | faddd $alob,$nlob,$nlob | ||
| 556 | fmuld $nlo,$nd,$nlod | ||
| 557 | fmuld $ahi,$ba,$ahia | ||
| 558 | faddd $aloc,$nloc,$nloc | ||
| 559 | fmuld $nhi,$na,$nhia | ||
| 560 | fmuld $ahi,$bb,$ahib | ||
| 561 | faddd $alod,$nlod,$nlod | ||
| 562 | fmuld $nhi,$nb,$nhib | ||
| 563 | fmuld $ahi,$bc,$ahic | ||
| 564 | faddd $ahia,$nhia,$nhia | ||
| 565 | fmuld $nhi,$nc,$nhic | ||
| 566 | fmuld $ahi,$bd,$ahid | ||
| 567 | faddd $ahib,$nhib,$nhib | ||
| 568 | fmuld $nhi,$nd,$nhid | ||
| 569 | |||
| 570 | faddd $ahic,$nhic,$dota ! $nhic | ||
| 571 | faddd $ahid,$nhid,$dotb ! $nhid | ||
| 572 | |||
| 573 | faddd $nloc,$nhia,$nloc | ||
| 574 | faddd $nlod,$nhib,$nlod | ||
| 575 | |||
| 576 | fdtox $nloa,$nloa | ||
| 577 | fdtox $nlob,$nlob | ||
| 578 | fdtox $nloc,$nloc | ||
| 579 | fdtox $nlod,$nlod | ||
| 580 | |||
| 581 | std $nloa,[%sp+$bias+$frame+0] | ||
| 582 | std $nlob,[%sp+$bias+$frame+8] | ||
| 583 | std $nloc,[%sp+$bias+$frame+16] | ||
| 584 | add $j,8,$j | ||
| 585 | std $nlod,[%sp+$bias+$frame+24] | ||
| 586 | |||
| 587 | ldd [$ap_l+$j],$alo ! load a[j] in double format | ||
| 588 | ldd [$ap_h+$j],$ahi | ||
| 589 | ldd [$np_l+$j],$nlo ! load n[j] in double format | ||
| 590 | ldd [$np_h+$j],$nhi | ||
| 591 | |||
| 592 | fmuld $alo,$ba,$aloa | ||
| 593 | fmuld $nlo,$na,$nloa | ||
| 594 | fmuld $alo,$bb,$alob | ||
| 595 | fmuld $nlo,$nb,$nlob | ||
| 596 | fmuld $alo,$bc,$aloc | ||
| 597 | ldx [%sp+$bias+$frame+0],%o0 | ||
| 598 | faddd $aloa,$nloa,$nloa | ||
| 599 | fmuld $nlo,$nc,$nloc | ||
| 600 | ldx [%sp+$bias+$frame+8],%o1 | ||
| 601 | fmuld $alo,$bd,$alod | ||
| 602 | ldx [%sp+$bias+$frame+16],%o2 | ||
| 603 | faddd $alob,$nlob,$nlob | ||
| 604 | fmuld $nlo,$nd,$nlod | ||
| 605 | ldx [%sp+$bias+$frame+24],%o3 | ||
| 606 | fmuld $ahi,$ba,$ahia | ||
| 607 | |||
| 608 | srlx %o0,16,%o7 | ||
| 609 | faddd $aloc,$nloc,$nloc | ||
| 610 | fmuld $nhi,$na,$nhia | ||
| 611 | add %o7,%o1,%o1 | ||
| 612 | fmuld $ahi,$bb,$ahib | ||
| 613 | srlx %o1,16,%o7 | ||
| 614 | faddd $alod,$nlod,$nlod | ||
| 615 | fmuld $nhi,$nb,$nhib | ||
| 616 | add %o7,%o2,%o2 | ||
| 617 | fmuld $ahi,$bc,$ahic | ||
| 618 | srlx %o2,16,%o7 | ||
| 619 | faddd $ahia,$nhia,$nhia | ||
| 620 | fmuld $nhi,$nc,$nhic | ||
| 621 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
| 622 | ! why? | ||
| 623 | and %o0,$mask,%o0 | ||
| 624 | fmuld $ahi,$bd,$ahid | ||
| 625 | and %o1,$mask,%o1 | ||
| 626 | and %o2,$mask,%o2 | ||
| 627 | faddd $ahib,$nhib,$nhib | ||
| 628 | fmuld $nhi,$nd,$nhid | ||
| 629 | sllx %o1,16,%o1 | ||
| 630 | faddd $dota,$nloa,$nloa | ||
| 631 | sllx %o2,32,%o2 | ||
| 632 | faddd $dotb,$nlob,$nlob | ||
| 633 | sllx %o3,48,%o7 | ||
| 634 | or %o1,%o0,%o0 | ||
| 635 | faddd $ahic,$nhic,$dota ! $nhic | ||
| 636 | or %o2,%o0,%o0 | ||
| 637 | faddd $ahid,$nhid,$dotb ! $nhid | ||
| 638 | or %o7,%o0,%o0 ! 64-bit result | ||
| 639 | ldx [$tp],%o7 | ||
| 640 | faddd $nloc,$nhia,$nloc | ||
| 641 | addcc %o7,%o0,%o0 | ||
| 642 | ! end-of-why? | ||
| 643 | faddd $nlod,$nhib,$nlod | ||
| 644 | srlx %o3,16,%g1 ! 34-bit carry | ||
| 645 | fdtox $nloa,$nloa | ||
| 646 | bcs,a %xcc,.+8 | ||
| 647 | add %g1,1,%g1 | ||
| 648 | |||
| 649 | fdtox $nlob,$nlob | ||
| 650 | fdtox $nloc,$nloc | ||
| 651 | fdtox $nlod,$nlod | ||
| 652 | |||
| 653 | std $nloa,[%sp+$bias+$frame+0] | ||
| 654 | std $nlob,[%sp+$bias+$frame+8] | ||
| 655 | addcc $j,8,$j | ||
| 656 | std $nloc,[%sp+$bias+$frame+16] | ||
| 657 | bz,pn %icc,.Linnerskip | ||
| 658 | std $nlod,[%sp+$bias+$frame+24] | ||
| 659 | |||
| 660 | ba .Linner | ||
| 661 | nop | ||
| 662 | .align 32 | ||
| 663 | .Linner: | ||
| 664 | ldd [$ap_l+$j],$alo ! load a[j] in double format | ||
| 665 | ldd [$ap_h+$j],$ahi | ||
| 666 | ldd [$np_l+$j],$nlo ! load n[j] in double format | ||
| 667 | ldd [$np_h+$j],$nhi | ||
| 668 | |||
| 669 | fmuld $alo,$ba,$aloa | ||
| 670 | fmuld $nlo,$na,$nloa | ||
| 671 | fmuld $alo,$bb,$alob | ||
| 672 | fmuld $nlo,$nb,$nlob | ||
| 673 | fmuld $alo,$bc,$aloc | ||
| 674 | ldx [%sp+$bias+$frame+0],%o0 | ||
| 675 | faddd $aloa,$nloa,$nloa | ||
| 676 | fmuld $nlo,$nc,$nloc | ||
| 677 | ldx [%sp+$bias+$frame+8],%o1 | ||
| 678 | fmuld $alo,$bd,$alod | ||
| 679 | ldx [%sp+$bias+$frame+16],%o2 | ||
| 680 | faddd $alob,$nlob,$nlob | ||
| 681 | fmuld $nlo,$nd,$nlod | ||
| 682 | ldx [%sp+$bias+$frame+24],%o3 | ||
| 683 | fmuld $ahi,$ba,$ahia | ||
| 684 | |||
| 685 | srlx %o0,16,%o7 | ||
| 686 | faddd $aloc,$nloc,$nloc | ||
| 687 | fmuld $nhi,$na,$nhia | ||
| 688 | add %o7,%o1,%o1 | ||
| 689 | fmuld $ahi,$bb,$ahib | ||
| 690 | srlx %o1,16,%o7 | ||
| 691 | faddd $alod,$nlod,$nlod | ||
| 692 | fmuld $nhi,$nb,$nhib | ||
| 693 | add %o7,%o2,%o2 | ||
| 694 | fmuld $ahi,$bc,$ahic | ||
| 695 | srlx %o2,16,%o7 | ||
| 696 | faddd $ahia,$nhia,$nhia | ||
| 697 | fmuld $nhi,$nc,$nhic | ||
| 698 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
| 699 | and %o0,$mask,%o0 | ||
| 700 | fmuld $ahi,$bd,$ahid | ||
| 701 | and %o1,$mask,%o1 | ||
| 702 | and %o2,$mask,%o2 | ||
| 703 | faddd $ahib,$nhib,$nhib | ||
| 704 | fmuld $nhi,$nd,$nhid | ||
| 705 | sllx %o1,16,%o1 | ||
| 706 | faddd $dota,$nloa,$nloa | ||
| 707 | sllx %o2,32,%o2 | ||
| 708 | faddd $dotb,$nlob,$nlob | ||
| 709 | sllx %o3,48,%o7 | ||
| 710 | or %o1,%o0,%o0 | ||
| 711 | faddd $ahic,$nhic,$dota ! $nhic | ||
| 712 | or %o2,%o0,%o0 | ||
| 713 | faddd $ahid,$nhid,$dotb ! $nhid | ||
| 714 | or %o7,%o0,%o0 ! 64-bit result | ||
| 715 | faddd $nloc,$nhia,$nloc | ||
| 716 | addcc %g1,%o0,%o0 | ||
| 717 | ldx [$tp+8],%o7 ! tp[j] | ||
| 718 | faddd $nlod,$nhib,$nlod | ||
| 719 | srlx %o3,16,%g1 ! 34-bit carry | ||
| 720 | fdtox $nloa,$nloa | ||
| 721 | bcs,a %xcc,.+8 | ||
| 722 | add %g1,1,%g1 | ||
| 723 | fdtox $nlob,$nlob | ||
| 724 | addcc %o7,%o0,%o0 | ||
| 725 | fdtox $nloc,$nloc | ||
| 726 | bcs,a %xcc,.+8 | ||
| 727 | add %g1,1,%g1 | ||
| 728 | |||
| 729 | stx %o0,[$tp] ! tp[j-1] | ||
| 730 | fdtox $nlod,$nlod | ||
| 731 | |||
| 732 | std $nloa,[%sp+$bias+$frame+0] | ||
| 733 | std $nlob,[%sp+$bias+$frame+8] | ||
| 734 | std $nloc,[%sp+$bias+$frame+16] | ||
| 735 | addcc $j,8,$j | ||
| 736 | std $nlod,[%sp+$bias+$frame+24] | ||
| 737 | bnz,pt %icc,.Linner | ||
| 738 | add $tp,8,$tp | ||
| 739 | |||
| 740 | .Linnerskip: | ||
| 741 | fdtox $dota,$dota | ||
| 742 | fdtox $dotb,$dotb | ||
| 743 | |||
| 744 | ldx [%sp+$bias+$frame+0],%o0 | ||
| 745 | ldx [%sp+$bias+$frame+8],%o1 | ||
| 746 | ldx [%sp+$bias+$frame+16],%o2 | ||
| 747 | ldx [%sp+$bias+$frame+24],%o3 | ||
| 748 | |||
| 749 | srlx %o0,16,%o7 | ||
| 750 | std $dota,[%sp+$bias+$frame+32] | ||
| 751 | add %o7,%o1,%o1 | ||
| 752 | std $dotb,[%sp+$bias+$frame+40] | ||
| 753 | srlx %o1,16,%o7 | ||
| 754 | add %o7,%o2,%o2 | ||
| 755 | srlx %o2,16,%o7 | ||
| 756 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
| 757 | and %o0,$mask,%o0 | ||
| 758 | and %o1,$mask,%o1 | ||
| 759 | and %o2,$mask,%o2 | ||
| 760 | sllx %o1,16,%o1 | ||
| 761 | sllx %o2,32,%o2 | ||
| 762 | sllx %o3,48,%o7 | ||
| 763 | or %o1,%o0,%o0 | ||
| 764 | or %o2,%o0,%o0 | ||
| 765 | ldx [%sp+$bias+$frame+32],%o4 | ||
| 766 | or %o7,%o0,%o0 ! 64-bit result | ||
| 767 | ldx [%sp+$bias+$frame+40],%o5 | ||
| 768 | addcc %g1,%o0,%o0 | ||
| 769 | ldx [$tp+8],%o7 ! tp[j] | ||
| 770 | srlx %o3,16,%g1 ! 34-bit carry | ||
| 771 | bcs,a %xcc,.+8 | ||
| 772 | add %g1,1,%g1 | ||
| 773 | |||
| 774 | addcc %o7,%o0,%o0 | ||
| 775 | bcs,a %xcc,.+8 | ||
| 776 | add %g1,1,%g1 | ||
| 777 | |||
| 778 | stx %o0,[$tp] ! tp[j-1] | ||
| 779 | add $tp,8,$tp | ||
| 780 | |||
| 781 | srlx %o4,16,%o7 | ||
| 782 | add %o7,%o5,%o5 | ||
| 783 | and %o4,$mask,%o4 | ||
| 784 | sllx %o5,16,%o7 | ||
| 785 | or %o7,%o4,%o4 | ||
| 786 | addcc %g1,%o4,%o4 | ||
| 787 | srlx %o5,48,%g1 | ||
| 788 | bcs,a %xcc,.+8 | ||
| 789 | add %g1,1,%g1 | ||
| 790 | |||
| 791 | addcc $carry,%o4,%o4 | ||
| 792 | stx %o4,[$tp] ! tp[num-1] | ||
| 793 | mov %g1,$carry | ||
| 794 | bcs,a %xcc,.+8 | ||
| 795 | add $carry,1,$carry | ||
| 796 | |||
| 797 | addcc $i,8,$i | ||
| 798 | bnz %icc,.Louter | ||
| 799 | nop | ||
| 800 | |||
| 801 | add $tp,8,$tp ! adjust tp to point at the end | ||
| 802 | orn %g0,%g0,%g4 | ||
| 803 | sub %g0,$num,%o7 ! n=-num | ||
| 804 | ba .Lsub | ||
| 805 | subcc %g0,%g0,%g0 ! clear %icc.c | ||
| 806 | |||
| 807 | .align 32 | ||
| 808 | .Lsub: | ||
| 809 | ldx [$tp+%o7],%o0 | ||
| 810 | add $np,%o7,%g1 | ||
| 811 | ld [%g1+0],%o2 | ||
| 812 | ld [%g1+4],%o3 | ||
| 813 | srlx %o0,32,%o1 | ||
| 814 | subccc %o0,%o2,%o2 | ||
| 815 | add $rp,%o7,%g1 | ||
| 816 | subccc %o1,%o3,%o3 | ||
| 817 | st %o2,[%g1+0] | ||
| 818 | add %o7,8,%o7 | ||
| 819 | brnz,pt %o7,.Lsub | ||
| 820 | st %o3,[%g1+4] | ||
| 821 | subc $carry,0,%g4 | ||
| 822 | sub %g0,$num,%o7 ! n=-num | ||
| 823 | ba .Lcopy | ||
| 824 | nop | ||
| 825 | |||
| 826 | .align 32 | ||
| 827 | .Lcopy: | ||
| 828 | ldx [$tp+%o7],%o0 | ||
| 829 | add $rp,%o7,%g1 | ||
| 830 | ld [%g1+0],%o2 | ||
| 831 | ld [%g1+4],%o3 | ||
| 832 | stx %g0,[$tp+%o7] | ||
| 833 | and %o0,%g4,%o0 | ||
| 834 | srlx %o0,32,%o1 | ||
| 835 | andn %o2,%g4,%o2 | ||
| 836 | andn %o3,%g4,%o3 | ||
| 837 | or %o2,%o0,%o0 | ||
| 838 | or %o3,%o1,%o1 | ||
| 839 | st %o0,[%g1+0] | ||
| 840 | add %o7,8,%o7 | ||
| 841 | brnz,pt %o7,.Lcopy | ||
| 842 | st %o1,[%g1+4] | ||
| 843 | sub %g0,$num,%o7 ! n=-num | ||
| 844 | |||
| 845 | .Lzap: | ||
| 846 | stx %g0,[$ap_l+%o7] | ||
| 847 | stx %g0,[$ap_h+%o7] | ||
| 848 | stx %g0,[$np_l+%o7] | ||
| 849 | stx %g0,[$np_h+%o7] | ||
| 850 | add %o7,8,%o7 | ||
| 851 | brnz,pt %o7,.Lzap | ||
| 852 | nop | ||
| 853 | |||
| 854 | ldx [%sp+$bias+$frame+48],%o7 | ||
| 855 | wr %g0,%o7,%asi ! restore %asi | ||
| 856 | |||
| 857 | mov 1,%i0 | ||
| 858 | .Lret: | ||
| 859 | ret | ||
| 860 | restore | ||
| 861 | .type $fname,#function | ||
| 862 | .size $fname,(.-$fname) | ||
| 863 | .asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 864 | .align 32 | ||
| 865 | ___ | ||
| 866 | |||
| 867 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; # evaluate backtick-quoted expressions embedded in $code | ||
| 868 | |||
| 869 | # Below substitution makes it possible to compile without demanding | ||
| 870 | # VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I | ||
| 871 | # dare to do this, because VIS capability is detected at run-time now | ||
| 872 | # and this routine is not called on CPUs not capable of executing it. Do | ||
| 873 | # note that fzeros is not the only VIS dependency! Another dependency | ||
| 874 | # is implicit and is just _a_ numerical value loaded to %asi register, | ||
| 875 | # which assembler can't recognize as VIS specific... | ||
| 876 | $code =~ s/fzeros\s+%f([0-9]+)/ | ||
| 877 | sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1) | ||
| 878 | /gem; | ||
| 879 | |||
| 880 | print $code; # emit the generated assembly on STDOUT | ||
| 881 | # closing STDOUT flushes the output | ||
| 882 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/via-mont.pl b/src/lib/libcrypto/bn/asm/via-mont.pl new file mode 100644 index 0000000000..c046a514c8 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/via-mont.pl | |||
| @@ -0,0 +1,242 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # Wrapper around 'rep montmul', VIA-specific instruction accessing | ||
| 11 | # PadLock Montgomery Multiplier. The wrapper is designed as drop-in | ||
| 12 | # replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9]. | ||
| 13 | # | ||
| 14 | # Below are interleaved outputs from 'openssl speed rsa dsa' for 4 | ||
| 15 | # different software configurations on 1.5GHz VIA Esther processor. | ||
| 16 | # Lines marked with "software integer" denote performance of hand- | ||
| 17 | # coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2" | ||
| 18 | # refers to hand-coded SSE2 Montgomery multiplication procedure found | ||
| 19 | # in OpenSSL 0.9.9. "Hardware VIA SDK" refers to padlock_pmm routine from | ||
| 20 | # Padlock SDK 2.0.1 available for download from VIA, which naturally | ||
| 21 | # utilizes the magic 'repz montmul' instruction. And finally "hardware | ||
| 22 | # this" refers to *this* implementation which also uses 'repz montmul' | ||
| 23 | # | ||
| 24 | # sign verify sign/s verify/s | ||
| 25 | # rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer | ||
| 26 | # rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2 | ||
| 27 | # rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK | ||
| 28 | # rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this | ||
| 29 | # | ||
| 30 | # rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer | ||
| 31 | # rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2 | ||
| 32 | # rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK | ||
| 33 | # rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this | ||
| 34 | # | ||
| 35 | # rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer | ||
| 36 | # rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2 | ||
| 37 | # rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK | ||
| 38 | # rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this | ||
| 39 | # | ||
| 40 | # rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer | ||
| 41 | # rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2 | ||
| 42 | # rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK | ||
| 43 | # rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this | ||
| 44 | # | ||
| 45 | # dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer | ||
| 46 | # dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2 | ||
| 47 | # dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK | ||
| 48 | # dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this | ||
| 49 | # | ||
| 50 | # dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer | ||
| 51 | # dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2 | ||
| 52 | # dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK | ||
| 53 | # dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this | ||
| 54 | # | ||
| 55 | # dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer | ||
| 56 | # dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2 | ||
| 57 | # dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK | ||
| 58 | # dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this | ||
| 59 | # | ||
| 60 | # To give you some other reference point here is output for 2.4GHz P4 | ||
| 61 | # running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software | ||
| 62 | # SSE2" in above terms. | ||
| 63 | # | ||
| 64 | # rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0 | ||
| 65 | # rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0 | ||
| 66 | # rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9 | ||
| 67 | # rsa 4096 bits 0.109770s 0.002379s 9.1 420.3 | ||
| 68 | # dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1 | ||
| 69 | # dsa 1024 bits 0.001346s 0.001595s 742.7 627.0 | ||
| 70 | # dsa 2048 bits 0.004745s 0.005582s 210.7 179.1 | ||
| 71 | # | ||
| 72 | # Conclusions: | ||
| 73 | # - VIA SDK leaves a *lot* of room for improvement (which this | ||
| 74 | # implementation successfully fills:-); | ||
| 75 | # - 'rep montmul' gives up to >3x performance improvement depending on | ||
| 76 | # key length; | ||
| 77 | # - in terms of absolute performance it delivers approximately as much | ||
| 78 | # as modern out-of-order 32-bit cores [again, for longer keys]. | ||
| 79 | |||
| 80 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 81 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 82 | require "x86asm.pl"; | ||
| 83 | |||
| 84 | &asm_init($ARGV[0],"via-mont.pl"); | ||
| 85 | |||
| 86 | # int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); | ||
| 87 | $func="bn_mul_mont_padlock"; | ||
| 88 | |||
| 89 | $pad=16*1; # amount of reserved bytes on top of every vector | ||
| 90 | |||
| 91 | # stack layout | ||
| 92 | $mZeroPrime=&DWP(0,"esp"); # these are specified by VIA | ||
| 93 | $A=&DWP(4,"esp"); | ||
| 94 | $B=&DWP(8,"esp"); | ||
| 95 | $T=&DWP(12,"esp"); | ||
| 96 | $M=&DWP(16,"esp"); | ||
| 97 | $scratch=&DWP(20,"esp"); | ||
| 98 | $rp=&DWP(24,"esp"); # these are mine | ||
| 99 | $sp=&DWP(28,"esp"); | ||
| 100 | # &DWP(32,"esp") # 32 byte scratch area | ||
| 101 | # &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num] | ||
| 102 | # &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num] | ||
| 103 | # &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num] | ||
| 104 | # &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num] | ||
| 105 | # Note that SDK suggests to unconditionally allocate 2K per vector. This | ||
| 106 | # has quite an impact on performance. It naturally depends on key length, | ||
| 107 | # but to give an example 1024 bit private RSA key operations suffer >30% | ||
| 108 | # penalty. I allocate only as much as actually required... | ||
| 109 | |||
| 110 | &function_begin($func); # returns eax=1 when done, eax=0 when num is not supported | ||
| 111 | &xor ("eax","eax"); # default return value 0 ["not done"] | ||
| 112 | &mov ("ecx",&wparam(5)); # num | ||
| 113 | # meet VIA's limitations for num [note that the specification | ||
| 114 | # expresses them in bits, while we work with amount of 32-bit words] | ||
| 115 | &test ("ecx",3); | ||
| 116 | &jnz (&label("leave")); # num % 4 != 0 | ||
| 117 | &cmp ("ecx",8); | ||
| 118 | &jb (&label("leave")); # num < 8 | ||
| 119 | &cmp ("ecx",1024); | ||
| 120 | &ja (&label("leave")); # num > 1024 | ||
| 121 | |||
| 122 | &pushf (); # preserve caller's flags [restored by popf at the end] | ||
| 123 | &cld (); | ||
| 124 | |||
| 125 | &mov ("edi",&wparam(0)); # rp | ||
| 126 | &mov ("eax",&wparam(1)); # ap | ||
| 127 | &mov ("ebx",&wparam(2)); # bp | ||
| 128 | &mov ("edx",&wparam(3)); # np | ||
| 129 | &mov ("esi",&wparam(4)); # n0 | ||
| 130 | &mov ("esi",&DWP(0,"esi")); # *n0 | ||
| 131 | |||
| 132 | &lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes | ||
| 133 | &lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes | ||
| 134 | &neg ("ebp"); | ||
| 135 | &add ("ebp","esp"); | ||
| 136 | &and ("ebp",-64); # align to cache-line | ||
| 137 | &xchg ("ebp","esp"); # alloca | ||
| 138 | |||
| 139 | &mov ($rp,"edi"); # save rp | ||
| 140 | &mov ($sp,"ebp"); # save esp | ||
| 141 | |||
| 142 | &mov ($mZeroPrime,"esi"); | ||
| 143 | &lea ("esi",&DWP(64,"esp")); # tp | ||
| 144 | &mov ($T,"esi"); | ||
| 145 | &lea ("edi",&DWP(32,"esp")); # scratch area | ||
| 146 | &mov ($scratch,"edi"); | ||
| 147 | &mov ("esi","eax"); # esi=ap, source of the first copy below | ||
| 148 | |||
| 149 | &lea ("ebp",&DWP(-$pad,"ecx")); | ||
| 150 | &shr ("ebp",2); # restore original num value in ebp | ||
| 151 | |||
| 152 | &xor ("eax","eax"); # zero fill value for rep stosl | ||
| 153 | |||
| 154 | &mov ("ecx","ebp"); | ||
| 155 | &lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch | ||
| 156 | &data_byte(0xf3,0xab); # rep stosl, bzero | ||
| 157 | |||
| 158 | &mov ("ecx","ebp"); | ||
| 159 | &lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy | ||
| 160 | &mov ($A,"edi"); | ||
| 161 | &data_byte(0xf3,0xa5); # rep movsl, memcpy | ||
| 162 | &mov ("ecx",$pad/4); | ||
| 163 | &data_byte(0xf3,0xab); # rep stosl, bzero pad | ||
| 164 | # edi points at the end of padded ap copy... | ||
| 165 | |||
| 166 | &mov ("ecx","ebp"); | ||
| 167 | &mov ("esi","ebx"); | ||
| 168 | &mov ($B,"edi"); | ||
| 169 | &data_byte(0xf3,0xa5); # rep movsl, memcpy | ||
| 170 | &mov ("ecx",$pad/4); | ||
| 171 | &data_byte(0xf3,0xab); # rep stosl, bzero pad | ||
| 172 | # edi points at the end of padded bp copy... | ||
| 173 | |||
| 174 | &mov ("ecx","ebp"); | ||
| 175 | &mov ("esi","edx"); | ||
| 176 | &mov ($M,"edi"); | ||
| 177 | &data_byte(0xf3,0xa5); # rep movsl, memcpy | ||
| 178 | &mov ("ecx",$pad/4); | ||
| 179 | &data_byte(0xf3,0xab); # rep stosl, bzero pad | ||
| 180 | # edi points at the end of padded np copy... | ||
| 181 | |||
| 182 | # let magic happen... | ||
| 183 | &mov ("ecx","ebp"); | ||
| 184 | &mov ("esi","esp"); # esi=frame base, where the VIA-specified fields live | ||
| 185 | &shl ("ecx",5); # convert word counter to bit counter | ||
| 186 | &align (4); | ||
| 187 | &data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul | ||
| 188 | |||
| 189 | &mov ("ecx","ebp"); | ||
| 190 | &lea ("esi",&DWP(64,"esp")); # tp | ||
| 191 | # edi still points at the end of padded np copy... | ||
| 192 | &neg ("ebp"); | ||
| 193 | &lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind" | ||
| 194 | &mov ("edi",$rp); # restore rp | ||
| 195 | &xor ("edx","edx"); # i=0 and clear CF | ||
| 196 | |||
| 197 | &set_label("sub",8); | ||
| 198 | &mov ("eax",&DWP(0,"esi","edx",4)); | ||
| 199 | &sbb ("eax",&DWP(0,"ebp","edx",4)); | ||
| 200 | &mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i] | ||
| 201 | &lea ("edx",&DWP(1,"edx")); # i++ | ||
| 202 | &loop (&label("sub")); # doesn't affect CF! | ||
| 203 | |||
| 204 | &mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit | ||
| 205 | &sbb ("eax",0); | ||
| 206 | &and ("esi","eax"); | ||
| 207 | &not ("eax"); | ||
| 208 | &mov ("ebp","edi"); | ||
| 209 | &and ("ebp","eax"); | ||
| 210 | &or ("esi","ebp"); # tp=carry?tp:rp | ||
| 211 | |||
| 212 | &mov ("ecx","edx"); # num | ||
| 213 | &xor ("edx","edx"); # i=0 | ||
| 214 | |||
| 215 | &set_label("copy",8); | ||
| 216 | &mov ("eax",&DWP(0,"esi","edx",4)); | ||
| 217 | &mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp [overwritten with the loop counter] | ||
| 218 | &mov (&DWP(0,"edi","edx",4),"eax"); | ||
| 219 | &lea ("edx",&DWP(1,"edx")); # i++ | ||
| 220 | &loop (&label("copy")); | ||
| 221 | |||
| 222 | &mov ("ebp",$sp); | ||
| 223 | &xor ("eax","eax"); | ||
| 224 | |||
| 225 | &mov ("ecx",64/4); | ||
| 226 | &mov ("edi","esp"); # zap frame including scratch area | ||
| 227 | &data_byte(0xf3,0xab); # rep stosl, bzero | ||
| 228 | |||
| 229 | # zap copies of ap, bp and np | ||
| 230 | &lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap | ||
| 231 | &lea ("ecx",&DWP(3*$pad/4,"edx","edx",2)); # 3*(num+$pad/4) words, edx==num here | ||
| 232 | &data_byte(0xf3,0xab); # rep stosl, bzero | ||
| 233 | |||
| 234 | &mov ("esp","ebp"); | ||
| 235 | &inc ("eax"); # signal "done" | ||
| 236 | &popf (); | ||
| 237 | &set_label("leave"); | ||
| 238 | &function_end($func); | ||
| 239 | |||
| 240 | &asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>"); | ||
| 241 | |||
| 242 | &asm_finish(); | ||
diff --git a/src/lib/libcrypto/bn/asm/x86-mont.pl b/src/lib/libcrypto/bn/asm/x86-mont.pl new file mode 100755 index 0000000000..5cd3cd2ed5 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/x86-mont.pl | |||
| @@ -0,0 +1,591 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # October 2005 | ||
| 11 | # | ||
| 12 | # This is a "teaser" code, as it can be improved in several ways... | ||
| 13 | # First of all non-SSE2 path should be implemented (yes, for now it | ||
| 14 | # performs Montgomery multiplication/convolution only on SSE2-capable | ||
| 15 | # CPUs such as P4, others fall down to original code). Then inner loop | ||
| 16 | # can be unrolled and modulo-scheduled to improve ILP and possibly | ||
| 17 | # moved to 128-bit XMM register bank (though it would require input | ||
| 18 | # rearrangement and/or increase bus bandwidth utilization). Dedicated | ||
| 19 | # squaring procedure should give further performance improvement... | ||
| 20 | # Yet, for being draft, the code improves rsa512 *sign* benchmark by | ||
| 21 | # 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-) | ||
| 22 | |||
| 23 | # December 2006 | ||
| 24 | # | ||
| 25 | # Modulo-scheduling SSE2 loops results in further 15-20% improvement. | ||
| 26 | # Integer-only code [being equipped with dedicated squaring procedure] | ||
| 27 | # gives ~40% on rsa512 sign benchmark... | ||
| 28 | |||
| 29 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
| 30 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
| 31 | require "x86asm.pl"; | ||
| 32 | |||
| 33 | &asm_init($ARGV[0],$0); | ||
| 34 | |||
| 35 | $sse2=0; | ||
| 36 | for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | ||
| 37 | |||
| 38 | &external_label("OPENSSL_ia32cap_P") if ($sse2); | ||
| 39 | |||
| 40 | &function_begin("bn_mul_mont"); | ||
| 41 | |||
| 42 | $i="edx"; | ||
| 43 | $j="ecx"; | ||
| 44 | $ap="esi"; $tp="esi"; # overlapping variables!!! | ||
| 45 | $rp="edi"; $bp="edi"; # overlapping variables!!! | ||
| 46 | $np="ebp"; | ||
| 47 | $num="ebx"; | ||
| 48 | |||
| 49 | $_num=&DWP(4*0,"esp"); # stack top layout | ||
| 50 | $_rp=&DWP(4*1,"esp"); | ||
| 51 | $_ap=&DWP(4*2,"esp"); | ||
| 52 | $_bp=&DWP(4*3,"esp"); | ||
| 53 | $_np=&DWP(4*4,"esp"); | ||
| 54 | $_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp"); | ||
| 55 | $_sp=&DWP(4*6,"esp"); | ||
| 56 | $_bpend=&DWP(4*7,"esp"); | ||
| 57 | $frame=32; # size of above frame rounded up to 16n | ||
| 58 | |||
| 59 | &xor ("eax","eax"); | ||
| 60 | &mov ("edi",&wparam(5)); # int num | ||
| 61 | &cmp ("edi",4); | ||
| 62 | &jl (&label("just_leave")); | ||
| 63 | |||
| 64 | &lea ("esi",&wparam(0)); # put aside pointer to argument block | ||
| 65 | &lea ("edx",&wparam(1)); # load ap | ||
| 66 | &mov ("ebp","esp"); # saved stack pointer! | ||
| 67 | &add ("edi",2); # extra two words on top of tp | ||
| 68 | &neg ("edi"); | ||
| 69 | &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2)) | ||
| 70 | &neg ("edi"); | ||
| 71 | |||
| 72 | # minimize cache contention by arranging 2K window between stack | ||
| 73 | # pointer and ap argument [np is also position sensitive vector, | ||
| 74 | # but it's assumed to be near ap, as it's allocated at ~same | ||
| 75 | # time]. | ||
| 76 | &mov ("eax","esp"); | ||
| 77 | &sub ("eax","edx"); | ||
| 78 | &and ("eax",2047); | ||
| 79 | &sub ("esp","eax"); # this aligns sp and ap modulo 2048 | ||
| 80 | |||
| 81 | &xor ("edx","esp"); | ||
| 82 | &and ("edx",2048); | ||
| 83 | &xor ("edx",2048); | ||
| 84 | &sub ("esp","edx"); # this splits them apart modulo 4096 | ||
| 85 | |||
| 86 | &and ("esp",-64); # align to cache line | ||
| 87 | |||
| 88 | ################################# load argument block... | ||
| 89 | &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp | ||
| 90 | &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap | ||
| 91 | &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp | ||
| 92 | &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np | ||
| 93 | &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0 | ||
| 94 | #&mov ("edi",&DWP(5*4,"esi"));# int num | ||
| 95 | |||
| 96 | &mov ("esi",&DWP(0,"esi")); # pull n0[0] | ||
| 97 | &mov ($_rp,"eax"); # ... save a copy of argument block | ||
| 98 | &mov ($_ap,"ebx"); | ||
| 99 | &mov ($_bp,"ecx"); | ||
| 100 | &mov ($_np,"edx"); | ||
| 101 | &mov ($_n0,"esi"); | ||
| 102 | &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling | ||
| 103 | #&mov ($_num,$num); # redundant as $num is not reused | ||
| 104 | &mov ($_sp,"ebp"); # saved stack pointer! | ||
| 105 | |||
| 106 | if($sse2) { | ||
| 107 | $acc0="mm0"; # mmx register bank layout | ||
| 108 | $acc1="mm1"; | ||
| 109 | $car0="mm2"; | ||
| 110 | $car1="mm3"; | ||
| 111 | $mul0="mm4"; | ||
| 112 | $mul1="mm5"; | ||
| 113 | $temp="mm6"; | ||
| 114 | $mask="mm7"; | ||
| 115 | |||
| 116 | &picmeup("eax","OPENSSL_ia32cap_P"); | ||
| 117 | &bt (&DWP(0,"eax"),26); | ||
| 118 | &jnc (&label("non_sse2")); | ||
| 119 | |||
| 120 | &mov ("eax",-1); | ||
| 121 | &movd ($mask,"eax"); # mask 32 lower bits | ||
| 122 | |||
| 123 | &mov ($ap,$_ap); # load input pointers | ||
| 124 | &mov ($bp,$_bp); | ||
| 125 | &mov ($np,$_np); | ||
| 126 | |||
| 127 | &xor ($i,$i); # i=0 | ||
| 128 | &xor ($j,$j); # j=0 | ||
| 129 | |||
| 130 | &movd ($mul0,&DWP(0,$bp)); # bp[0] | ||
| 131 | &movd ($mul1,&DWP(0,$ap)); # ap[0] | ||
| 132 | &movd ($car1,&DWP(0,$np)); # np[0] | ||
| 133 | |||
| 134 | &pmuludq($mul1,$mul0); # ap[0]*bp[0] | ||
| 135 | &movq ($car0,$mul1); | ||
| 136 | &movq ($acc0,$mul1); # I wish movd worked for | ||
| 137 | &pand ($acc0,$mask); # inter-register transfers | ||
| 138 | |||
| 139 | &pmuludq($mul1,$_n0q); # *=n0 | ||
| 140 | |||
| 141 | &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0 | ||
| 142 | &paddq ($car1,$acc0); | ||
| 143 | |||
| 144 | &movd ($acc1,&DWP(4,$np)); # np[1] | ||
| 145 | &movd ($acc0,&DWP(4,$ap)); # ap[1] | ||
| 146 | |||
| 147 | &psrlq ($car0,32); | ||
| 148 | &psrlq ($car1,32); | ||
| 149 | |||
| 150 | &inc ($j); # j++ | ||
| 151 | &set_label("1st",16); | ||
| 152 | &pmuludq($acc0,$mul0); # ap[j]*bp[0] | ||
| 153 | &pmuludq($acc1,$mul1); # np[j]*m1 | ||
| 154 | &paddq ($car0,$acc0); # +=c0 | ||
| 155 | &paddq ($car1,$acc1); # +=c1 | ||
| 156 | |||
| 157 | &movq ($acc0,$car0); | ||
| 158 | &pand ($acc0,$mask); | ||
| 159 | &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] | ||
| 160 | &paddq ($car1,$acc0); # +=ap[j]*bp[0]; | ||
| 161 | &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] | ||
| 162 | &psrlq ($car0,32); | ||
| 163 | &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]= | ||
| 164 | &psrlq ($car1,32); | ||
| 165 | |||
| 166 | &lea ($j,&DWP(1,$j)); | ||
| 167 | &cmp ($j,$num); | ||
| 168 | &jl (&label("1st")); | ||
| 169 | |||
| 170 | &pmuludq($acc0,$mul0); # ap[num-1]*bp[0] | ||
| 171 | &pmuludq($acc1,$mul1); # np[num-1]*m1 | ||
| 172 | &paddq ($car0,$acc0); # +=c0 | ||
| 173 | &paddq ($car1,$acc1); # +=c1 | ||
| 174 | |||
| 175 | &movq ($acc0,$car0); | ||
| 176 | &pand ($acc0,$mask); | ||
| 177 | &paddq ($car1,$acc0); # +=ap[num-1]*bp[0]; | ||
| 178 | &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= | ||
| 179 | |||
| 180 | &psrlq ($car0,32); | ||
| 181 | &psrlq ($car1,32); | ||
| 182 | |||
| 183 | &paddq ($car1,$car0); | ||
| 184 | &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] | ||
| 185 | |||
| 186 | &inc ($i); # i++ | ||
| 187 | &set_label("outer"); | ||
| 188 | &xor ($j,$j); # j=0 | ||
| 189 | |||
| 190 | &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i] | ||
| 191 | &movd ($mul1,&DWP(0,$ap)); # ap[0] | ||
| 192 | &movd ($temp,&DWP($frame,"esp")); # tp[0] | ||
| 193 | &movd ($car1,&DWP(0,$np)); # np[0] | ||
| 194 | &pmuludq($mul1,$mul0); # ap[0]*bp[i] | ||
| 195 | |||
| 196 | &paddq ($mul1,$temp); # +=tp[0] | ||
| 197 | &movq ($acc0,$mul1); | ||
| 198 | &movq ($car0,$mul1); | ||
| 199 | &pand ($acc0,$mask); | ||
| 200 | |||
| 201 | &pmuludq($mul1,$_n0q); # *=n0 | ||
| 202 | |||
| 203 | &pmuludq($car1,$mul1); | ||
| 204 | &paddq ($car1,$acc0); | ||
| 205 | |||
| 206 | &movd ($temp,&DWP($frame+4,"esp")); # tp[1] | ||
| 207 | &movd ($acc1,&DWP(4,$np)); # np[1] | ||
| 208 | &movd ($acc0,&DWP(4,$ap)); # ap[1] | ||
| 209 | |||
| 210 | &psrlq ($car0,32); | ||
| 211 | &psrlq ($car1,32); | ||
| 212 | &paddq ($car0,$temp); # +=tp[1] | ||
| 213 | |||
| 214 | &inc ($j); # j++ | ||
| 215 | &dec ($num); | ||
| 216 | &set_label("inner"); | ||
| 217 | &pmuludq($acc0,$mul0); # ap[j]*bp[i] | ||
| 218 | &pmuludq($acc1,$mul1); # np[j]*m1 | ||
| 219 | &paddq ($car0,$acc0); # +=c0 | ||
| 220 | &paddq ($car1,$acc1); # +=c1 | ||
| 221 | |||
| 222 | &movq ($acc0,$car0); | ||
| 223 | &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1] | ||
| 224 | &pand ($acc0,$mask); | ||
| 225 | &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] | ||
| 226 | &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j] | ||
| 227 | &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] | ||
| 228 | &psrlq ($car0,32); | ||
| 229 | &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]= | ||
| 230 | &psrlq ($car1,32); | ||
| 231 | &paddq ($car0,$temp); # +=tp[j+1] | ||
| 232 | |||
| 233 | &dec ($num); | ||
| 234 | &lea ($j,&DWP(1,$j)); # j++ | ||
| 235 | &jnz (&label("inner")); | ||
| 236 | |||
| 237 | &mov ($num,$j); | ||
| 238 | &pmuludq($acc0,$mul0); # ap[num-1]*bp[i] | ||
| 239 | &pmuludq($acc1,$mul1); # np[num-1]*m1 | ||
| 240 | &paddq ($car0,$acc0); # +=c0 | ||
| 241 | &paddq ($car1,$acc1); # +=c1 | ||
| 242 | |||
| 243 | &movq ($acc0,$car0); | ||
| 244 | &pand ($acc0,$mask); | ||
| 245 | &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1] | ||
| 246 | &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= | ||
| 247 | &psrlq ($car0,32); | ||
| 248 | &psrlq ($car1,32); | ||
| 249 | |||
| 250 | &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num] | ||
| 251 | &paddq ($car1,$car0); | ||
| 252 | &paddq ($car1,$temp); | ||
| 253 | &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] | ||
| 254 | |||
| 255 | &lea ($i,&DWP(1,$i)); # i++ | ||
| 256 | &cmp ($i,$num); | ||
| 257 | &jle (&label("outer")); | ||
| 258 | |||
| 259 | &emms (); # done with mmx bank | ||
| 260 | &jmp (&label("common_tail")); | ||
| 261 | |||
| 262 | &set_label("non_sse2",16); | ||
| 263 | } | ||
| 264 | |||
| 265 | if (0) { | ||
| 266 | &mov ("esp",$_sp); | ||
| 267 | &xor ("eax","eax"); # signal "not fast enough [yet]" | ||
| 268 | &jmp (&label("just_leave")); | ||
| 269 | # While the code below provides competitive performance for | ||
| 270 | # all key lengths on modern Intel cores, it's still more | ||
| 271 | # than 10% slower for 4096-bit keys elsewhere:-( "Competitive" | ||
| 272 | # means compared to the original integer-only assembler. | ||
| 273 | # 512-bit RSA sign is better by ~40%, but that's about all | ||
| 274 | # one can say about all CPUs... | ||
| 275 | } else { | ||
| 276 | $inp="esi"; # integer path uses these registers differently | ||
| 277 | $word="edi"; | ||
| 278 | $carry="ebp"; | ||
| 279 | |||
| 280 | &mov ($inp,$_ap); | ||
| 281 | &lea ($carry,&DWP(1,$num)); | ||
| 282 | &mov ($word,$_bp); | ||
| 283 | &xor ($j,$j); # j=0 | ||
| 284 | &mov ("edx",$inp); | ||
| 285 | &and ($carry,1); # see if num is even | ||
| 286 | &sub ("edx",$word); # see if ap==bp | ||
| 287 | &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num] | ||
| 288 | &or ($carry,"edx"); | ||
| 289 | &mov ($word,&DWP(0,$word)); # bp[0] | ||
| 290 | &jz (&label("bn_sqr_mont")); | ||
| 291 | &mov ($_bpend,"eax"); | ||
| 292 | &mov ("eax",&DWP(0,$inp)); | ||
| 293 | &xor ("edx","edx"); | ||
| 294 | |||
| 295 | &set_label("mull",16); | ||
| 296 | &mov ($carry,"edx"); | ||
| 297 | &mul ($word); # ap[j]*bp[0] | ||
| 298 | &add ($carry,"eax"); | ||
| 299 | &lea ($j,&DWP(1,$j)); | ||
| 300 | &adc ("edx",0); | ||
| 301 | &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1] | ||
| 302 | &cmp ($j,$num); | ||
| 303 | &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= | ||
| 304 | &jl (&label("mull")); | ||
| 305 | |||
| 306 | &mov ($carry,"edx"); | ||
| 307 | &mul ($word); # ap[num-1]*bp[0] | ||
| 308 | &mov ($word,$_n0); | ||
| 309 | &add ("eax",$carry); | ||
| 310 | &mov ($inp,$_np); | ||
| 311 | &adc ("edx",0); | ||
| 312 | &imul ($word,&DWP($frame,"esp")); # n0*tp[0] | ||
| 313 | |||
| 314 | &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]= | ||
| 315 | &xor ($j,$j); | ||
| 316 | &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= | ||
| 317 | &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= | ||
| 318 | |||
| 319 | &mov ("eax",&DWP(0,$inp)); # np[0] | ||
| 320 | &mul ($word); # np[0]*m | ||
| 321 | &add ("eax",&DWP($frame,"esp")); # +=tp[0] | ||
| 322 | &mov ("eax",&DWP(4,$inp)); # np[1] | ||
| 323 | &adc ("edx",0); | ||
| 324 | &inc ($j); | ||
| 325 | |||
| 326 | &jmp (&label("2ndmadd")); | ||
| 327 | |||
| 328 | &set_label("1stmadd",16); | ||
| 329 | &mov ($carry,"edx"); | ||
| 330 | &mul ($word); # ap[j]*bp[i] | ||
| 331 | &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] | ||
| 332 | &lea ($j,&DWP(1,$j)); | ||
| 333 | &adc ("edx",0); | ||
| 334 | &add ($carry,"eax"); | ||
| 335 | &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1] | ||
| 336 | &adc ("edx",0); | ||
| 337 | &cmp ($j,$num); | ||
| 338 | &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= | ||
| 339 | &jl (&label("1stmadd")); | ||
| 340 | |||
| 341 | &mov ($carry,"edx"); | ||
| 342 | &mul ($word); # ap[num-1]*bp[i] | ||
| 343 | &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1] | ||
| 344 | &mov ($word,$_n0); | ||
| 345 | &adc ("edx",0); | ||
| 346 | &mov ($inp,$_np); | ||
| 347 | &add ($carry,"eax"); | ||
| 348 | &adc ("edx",0); | ||
| 349 | &imul ($word,&DWP($frame,"esp")); # n0*tp[0] | ||
| 350 | |||
| 351 | &xor ($j,$j); | ||
| 352 | &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] | ||
| 353 | &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]= | ||
| 354 | &adc ($j,0); | ||
| 355 | &mov ("eax",&DWP(0,$inp)); # np[0] | ||
| 356 | &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= | ||
| 357 | &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= | ||
| 358 | |||
| 359 | &mul ($word); # np[0]*m | ||
| 360 | &add ("eax",&DWP($frame,"esp")); # +=tp[0] | ||
| 361 | &mov ("eax",&DWP(4,$inp)); # np[1] | ||
| 362 | &adc ("edx",0); | ||
| 363 | &mov ($j,1); | ||
| 364 | |||
| 365 | &set_label("2ndmadd",16); | ||
| 366 | &mov ($carry,"edx"); | ||
| 367 | &mul ($word); # np[j]*m | ||
| 368 | &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] | ||
| 369 | &lea ($j,&DWP(1,$j)); | ||
| 370 | &adc ("edx",0); | ||
| 371 | &add ($carry,"eax"); | ||
| 372 | &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1] | ||
| 373 | &adc ("edx",0); | ||
| 374 | &cmp ($j,$num); | ||
| 375 | &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]= | ||
| 376 | &jl (&label("2ndmadd")); | ||
| 377 | |||
| 378 | &mov ($carry,"edx"); | ||
| 379 | &mul ($word); # np[j]*m | ||
| 380 | &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1] | ||
| 381 | &adc ("edx",0); | ||
| 382 | &add ($carry,"eax"); | ||
| 383 | &adc ("edx",0); | ||
| 384 | &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]= | ||
| 385 | |||
| 386 | &xor ("eax","eax"); | ||
| 387 | &mov ($j,$_bp); # &bp[i] | ||
| 388 | &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] | ||
| 389 | &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] | ||
| 390 | &lea ($j,&DWP(4,$j)); | ||
| 391 | &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= | ||
| 392 | &cmp ($j,$_bpend); | ||
| 393 | &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= | ||
| 394 | &je (&label("common_tail")); | ||
| 395 | |||
| 396 | &mov ($word,&DWP(0,$j)); # bp[i+1] | ||
| 397 | &mov ($inp,$_ap); | ||
| 398 | &mov ($_bp,$j); # &bp[++i] | ||
| 399 | &xor ($j,$j); | ||
| 400 | &xor ("edx","edx"); | ||
| 401 | &mov ("eax",&DWP(0,$inp)); | ||
| 402 | &jmp (&label("1stmadd")); | ||
| 403 | |||
| 404 | &set_label("bn_sqr_mont",16); | ||
| 405 | $sbit=$num; | ||
| 406 | &mov ($_num,$num); | ||
| 407 | &mov ($_bp,$j); # i=0 | ||
| 408 | |||
| 409 | &mov ("eax",$word); # ap[0] | ||
| 410 | &mul ($word); # ap[0]*ap[0] | ||
| 411 | &mov (&DWP($frame,"esp"),"eax"); # tp[0]= | ||
| 412 | &mov ($sbit,"edx"); | ||
| 413 | &shr ("edx",1); | ||
| 414 | &and ($sbit,1); | ||
| 415 | &inc ($j); | ||
| 416 | &set_label("sqr",16); | ||
| 417 | &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] | ||
| 418 | &mov ($carry,"edx"); | ||
| 419 | &mul ($word); # ap[j]*ap[0] | ||
| 420 | &add ("eax",$carry); | ||
| 421 | &lea ($j,&DWP(1,$j)); | ||
| 422 | &adc ("edx",0); | ||
| 423 | &lea ($carry,&DWP(0,$sbit,"eax",2)); | ||
| 424 | &shr ("eax",31); | ||
| 425 | &cmp ($j,$_num); | ||
| 426 | &mov ($sbit,"eax"); | ||
| 427 | &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= | ||
| 428 | &jl (&label("sqr")); | ||
| 429 | |||
| 430 | &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1] | ||
| 431 | &mov ($carry,"edx"); | ||
| 432 | &mul ($word); # ap[num-1]*ap[0] | ||
| 433 | &add ("eax",$carry); | ||
| 434 | &mov ($word,$_n0); | ||
| 435 | &adc ("edx",0); | ||
| 436 | &mov ($inp,$_np); | ||
| 437 | &lea ($carry,&DWP(0,$sbit,"eax",2)); | ||
| 438 | &imul ($word,&DWP($frame,"esp")); # n0*tp[0] | ||
| 439 | &shr ("eax",31); | ||
| 440 | &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]= | ||
| 441 | |||
| 442 | &lea ($carry,&DWP(0,"eax","edx",2)); | ||
| 443 | &mov ("eax",&DWP(0,$inp)); # np[0] | ||
| 444 | &shr ("edx",31); | ||
| 445 | &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]= | ||
| 446 | &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]= | ||
| 447 | |||
| 448 | &mul ($word); # np[0]*m | ||
| 449 | &add ("eax",&DWP($frame,"esp")); # +=tp[0] | ||
| 450 | &mov ($num,$j); | ||
| 451 | &adc ("edx",0); | ||
| 452 | &mov ("eax",&DWP(4,$inp)); # np[1] | ||
| 453 | &mov ($j,1); | ||
| 454 | |||
| 455 | &set_label("3rdmadd",16); | ||
| 456 | &mov ($carry,"edx"); | ||
| 457 | &mul ($word); # np[j]*m | ||
| 458 | &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] | ||
| 459 | &adc ("edx",0); | ||
| 460 | &add ($carry,"eax"); | ||
| 461 | &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1] | ||
| 462 | &adc ("edx",0); | ||
| 463 | &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]= | ||
| 464 | |||
| 465 | &mov ($carry,"edx"); | ||
| 466 | &mul ($word); # np[j+1]*m | ||
| 467 | &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1] | ||
| 468 | &lea ($j,&DWP(2,$j)); | ||
| 469 | &adc ("edx",0); | ||
| 470 | &add ($carry,"eax"); | ||
| 471 | &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2] | ||
| 472 | &adc ("edx",0); | ||
| 473 | &cmp ($j,$num); | ||
| 474 | &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]= | ||
| 475 | &jl (&label("3rdmadd")); | ||
| 476 | |||
| 477 | &mov ($carry,"edx"); | ||
| 478 | &mul ($word); # np[j]*m | ||
| 479 | &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1] | ||
| 480 | &adc ("edx",0); | ||
| 481 | &add ($carry,"eax"); | ||
| 482 | &adc ("edx",0); | ||
| 483 | &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]= | ||
| 484 | |||
| 485 | &mov ($j,$_bp); # i | ||
| 486 | &xor ("eax","eax"); | ||
| 487 | &mov ($inp,$_ap); | ||
| 488 | &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] | ||
| 489 | &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] | ||
| 490 | &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= | ||
| 491 | &cmp ($j,$num); | ||
| 492 | &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= | ||
| 493 | &je (&label("common_tail")); | ||
| 494 | |||
| 495 | &mov ($word,&DWP(4,$inp,$j,4)); # ap[i] | ||
| 496 | &lea ($j,&DWP(1,$j)); | ||
| 497 | &mov ("eax",$word); | ||
| 498 | &mov ($_bp,$j); # ++i | ||
| 499 | &mul ($word); # ap[i]*ap[i] | ||
| 500 | &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i] | ||
| 501 | &adc ("edx",0); | ||
| 502 | &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]= | ||
| 503 | &xor ($carry,$carry); | ||
| 504 | &cmp ($j,$num); | ||
| 505 | &lea ($j,&DWP(1,$j)); | ||
| 506 | &je (&label("sqrlast")); | ||
| 507 | |||
| 508 | &mov ($sbit,"edx"); # zaps $num | ||
| 509 | &shr ("edx",1); | ||
| 510 | &and ($sbit,1); | ||
| 511 | &set_label("sqradd",16); | ||
| 512 | &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] | ||
| 513 | &mov ($carry,"edx"); | ||
| 514 | &mul ($word); # ap[j]*ap[i] | ||
| 515 | &add ("eax",$carry); | ||
| 516 | &lea ($carry,&DWP(0,"eax","eax")); | ||
| 517 | &adc ("edx",0); | ||
| 518 | &shr ("eax",31); | ||
| 519 | &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] | ||
| 520 | &lea ($j,&DWP(1,$j)); | ||
| 521 | &adc ("eax",0); | ||
| 522 | &add ($carry,$sbit); | ||
| 523 | &adc ("eax",0); | ||
| 524 | &cmp ($j,$_num); | ||
| 525 | &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= | ||
| 526 | &mov ($sbit,"eax"); | ||
| 527 | &jle (&label("sqradd")); | ||
| 528 | |||
| 529 | &mov ($carry,"edx"); | ||
| 530 | &lea ("edx",&DWP(0,$sbit,"edx",2)); | ||
| 531 | &shr ($carry,31); | ||
| 532 | &set_label("sqrlast"); | ||
| 533 | &mov ($word,$_n0); | ||
| 534 | &mov ($inp,$_np); | ||
| 535 | &imul ($word,&DWP($frame,"esp")); # n0*tp[0] | ||
| 536 | |||
| 537 | &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num] | ||
| 538 | &mov ("eax",&DWP(0,$inp)); # np[0] | ||
| 539 | &adc ($carry,0); | ||
| 540 | &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]= | ||
| 541 | &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]= | ||
| 542 | |||
| 543 | &mul ($word); # np[0]*m | ||
| 544 | &add ("eax",&DWP($frame,"esp")); # +=tp[0] | ||
| 545 | &lea ($num,&DWP(-1,$j)); | ||
| 546 | &adc ("edx",0); | ||
| 547 | &mov ($j,1); | ||
| 548 | &mov ("eax",&DWP(4,$inp)); # np[1] | ||
| 549 | |||
| 550 | &jmp (&label("3rdmadd")); | ||
| 551 | } | ||
| 552 | |||
| 553 | &set_label("common_tail",16); | ||
| 554 | &mov ($np,$_np); # load modulus pointer | ||
| 555 | &mov ($rp,$_rp); # load result pointer | ||
| 556 | &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped] | ||
| 557 | |||
| 558 | &mov ("eax",&DWP(0,$tp)); # tp[0] | ||
| 559 | &mov ($j,$num); # j=num-1 | ||
| 560 | &xor ($i,$i); # i=0 and clear CF! | ||
| 561 | |||
| 562 | &set_label("sub",16); | ||
| 563 | &sbb ("eax",&DWP(0,$np,$i,4)); | ||
| 564 | &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i] | ||
| 565 | &dec ($j); # doesn't affect CF! | ||
| 566 | &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1] | ||
| 567 | &lea ($i,&DWP(1,$i)); # i++ | ||
| 568 | &jge (&label("sub")); | ||
| 569 | |||
| 570 | &sbb ("eax",0); # handle upmost overflow bit | ||
| 571 | &and ($tp,"eax"); | ||
| 572 | &not ("eax"); | ||
| 573 | &mov ($np,$rp); | ||
| 574 | &and ($np,"eax"); | ||
| 575 | &or ($tp,$np); # tp=carry?tp:rp | ||
| 576 | |||
| 577 | &set_label("copy",16); # copy or in-place refresh | ||
| 578 | &mov ("eax",&DWP(0,$tp,$num,4)); | ||
| 579 | &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i] | ||
| 580 | &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector | ||
| 581 | &dec ($num); | ||
| 582 | &jge (&label("copy")); | ||
| 583 | |||
| 584 | &mov ("esp",$_sp); # pull saved stack pointer | ||
| 585 | &mov ("eax",1); | ||
| 586 | &set_label("just_leave"); | ||
| 587 | &function_end("bn_mul_mont"); | ||
| 588 | |||
| 589 | &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>"); | ||
| 590 | |||
| 591 | &asm_finish(); | ||
