| author | djm <> | 2010-10-01 22:54:21 +0000 |
|---|---|---|
| committer | djm <> | 2010-10-01 22:54:21 +0000 |
| commit | 2ea67f4aa254b09ded62e6e14fc893bbe6381579 | |
| tree | bb3923b81f2ce34b1ad62684afdf1a94d904c185 /src/lib/libcrypto/bn | |
| parent | 6ddfb710ab14b10183ff3a6a32f643554c80065e | |
| parent | 829fd51d4f8dde4a7f3bf54754f3c1d1a502f5e2 | |
| download | openbsd-2ea67f4aa254b09ded62e6e14fc893bbe6381579.tar.gz, openbsd-2ea67f4aa254b09ded62e6e14fc893bbe6381579.tar.bz2, openbsd-2ea67f4aa254b09ded62e6e14fc893bbe6381579.zip | |
This commit was generated by cvs2git to track changes on a CVS vendor
branch.
Diffstat (limited to 'src/lib/libcrypto/bn')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | src/lib/libcrypto/bn/asm/alpha-mont.pl | 8 |
| -rw-r--r-- | src/lib/libcrypto/bn/asm/armv4-mont.pl | 1 |
| -rw-r--r-- | src/lib/libcrypto/bn/asm/ppc.pl | 233 |
| -rw-r--r-- | src/lib/libcrypto/bn/asm/x86_64-gcc.c | 29 |
| -rwxr-xr-x | src/lib/libcrypto/bn/asm/x86_64-mont.pl | 136 |
5 files changed, 218 insertions, 189 deletions
diff --git a/src/lib/libcrypto/bn/asm/alpha-mont.pl b/src/lib/libcrypto/bn/asm/alpha-mont.pl
index 7a2cc3173b..f7e0ca1646 100644
--- a/src/lib/libcrypto/bn/asm/alpha-mont.pl
+++ b/src/lib/libcrypto/bn/asm/alpha-mont.pl
| @@ -53,15 +53,15 @@ $code=<<___; | |||
| 53 | .align 5 | 53 | .align 5 |
| 54 | .ent bn_mul_mont | 54 | .ent bn_mul_mont |
| 55 | bn_mul_mont: | 55 | bn_mul_mont: |
| 56 | lda sp,-40(sp) | 56 | lda sp,-48(sp) |
| 57 | stq ra,0(sp) | 57 | stq ra,0(sp) |
| 58 | stq s3,8(sp) | 58 | stq s3,8(sp) |
| 59 | stq s4,16(sp) | 59 | stq s4,16(sp) |
| 60 | stq s5,24(sp) | 60 | stq s5,24(sp) |
| 61 | stq fp,32(sp) | 61 | stq fp,32(sp) |
| 62 | mov sp,fp | 62 | mov sp,fp |
| 63 | .mask 0x0400f000,-40 | 63 | .mask 0x0400f000,-48 |
| 64 | .frame fp,40,ra | 64 | .frame fp,48,ra |
| 65 | .prologue 0 | 65 | .prologue 0 |
| 66 | 66 | ||
| 67 | .align 4 | 67 | .align 4 |
| @@ -306,7 +306,7 @@ bn_mul_mont: | |||
| 306 | ldq s4,16(sp) | 306 | ldq s4,16(sp) |
| 307 | ldq s5,24(sp) | 307 | ldq s5,24(sp) |
| 308 | ldq fp,32(sp) | 308 | ldq fp,32(sp) |
| 309 | lda sp,40(sp) | 309 | lda sp,48(sp) |
| 310 | ret (ra) | 310 | ret (ra) |
| 311 | .end bn_mul_mont | 311 | .end bn_mul_mont |
| 312 | .rdata | 312 | .rdata |
diff --git a/src/lib/libcrypto/bn/asm/armv4-mont.pl b/src/lib/libcrypto/bn/asm/armv4-mont.pl
index 05d5dc1a48..14e0d2d1dd 100644
--- a/src/lib/libcrypto/bn/asm/armv4-mont.pl
+++ b/src/lib/libcrypto/bn/asm/armv4-mont.pl
| @@ -193,6 +193,7 @@ bn_mul_mont: | |||
| 193 | bx lr @ interoperable with Thumb ISA:-) | 193 | bx lr @ interoperable with Thumb ISA:-) |
| 194 | .size bn_mul_mont,.-bn_mul_mont | 194 | .size bn_mul_mont,.-bn_mul_mont |
| 195 | .asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" | 195 | .asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" |
| 196 | .align 2 | ||
| 196 | ___ | 197 | ___ |
| 197 | 198 | ||
| 198 | $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 | 199 | $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 |
diff --git a/src/lib/libcrypto/bn/asm/ppc.pl b/src/lib/libcrypto/bn/asm/ppc.pl
index 08e0053473..37c65d3511 100644
--- a/src/lib/libcrypto/bn/asm/ppc.pl
+++ b/src/lib/libcrypto/bn/asm/ppc.pl
| @@ -100,9 +100,9 @@ | |||
| 100 | # me a note at schari@us.ibm.com | 100 | # me a note at schari@us.ibm.com |
| 101 | # | 101 | # |
| 102 | 102 | ||
| 103 | $opf = shift; | 103 | $flavour = shift; |
| 104 | 104 | ||
| 105 | if ($opf =~ /32\.s/) { | 105 | if ($flavour =~ /32/) { |
| 106 | $BITS= 32; | 106 | $BITS= 32; |
| 107 | $BNSZ= $BITS/8; | 107 | $BNSZ= $BITS/8; |
| 108 | $ISA= "\"ppc\""; | 108 | $ISA= "\"ppc\""; |
| @@ -125,7 +125,7 @@ if ($opf =~ /32\.s/) { | |||
| 125 | $INSR= "insrwi"; # insert right | 125 | $INSR= "insrwi"; # insert right |
| 126 | $ROTL= "rotlwi"; # rotate left by immediate | 126 | $ROTL= "rotlwi"; # rotate left by immediate |
| 127 | $TR= "tw"; # conditional trap | 127 | $TR= "tw"; # conditional trap |
| 128 | } elsif ($opf =~ /64\.s/) { | 128 | } elsif ($flavour =~ /64/) { |
| 129 | $BITS= 64; | 129 | $BITS= 64; |
| 130 | $BNSZ= $BITS/8; | 130 | $BNSZ= $BITS/8; |
| 131 | $ISA= "\"ppc64\""; | 131 | $ISA= "\"ppc64\""; |
| @@ -149,93 +149,16 @@ if ($opf =~ /32\.s/) { | |||
| 149 | $INSR= "insrdi"; # insert right | 149 | $INSR= "insrdi"; # insert right |
| 150 | $ROTL= "rotldi"; # rotate left by immediate | 150 | $ROTL= "rotldi"; # rotate left by immediate |
| 151 | $TR= "td"; # conditional trap | 151 | $TR= "td"; # conditional trap |
| 152 | } else { die "nonsense $opf"; } | 152 | } else { die "nonsense $flavour"; } |
| 153 | 153 | ||
| 154 | ( defined shift || open STDOUT,">$opf" ) || die "can't open $opf: $!"; | 154 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| 155 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or | ||
| 156 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or | ||
| 157 | die "can't locate ppc-xlate.pl"; | ||
| 155 | 158 | ||
| 156 | # function entry points from the AIX code | 159 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; |
| 157 | # | ||
| 158 | # There are other, more elegant, ways to handle this. We (IBM) chose | ||
| 159 | # this approach as it plays well with scripts we run to 'namespace' | ||
| 160 | # OpenSSL .i.e. we add a prefix to all the public symbols so we can | ||
| 161 | # co-exist in the same process with other implementations of OpenSSL. | ||
| 162 | # 'cleverer' ways of doing these substitutions tend to hide data we | ||
| 163 | # need to be obvious. | ||
| 164 | # | ||
| 165 | my @items = ("bn_sqr_comba4", | ||
| 166 | "bn_sqr_comba8", | ||
| 167 | "bn_mul_comba4", | ||
| 168 | "bn_mul_comba8", | ||
| 169 | "bn_sub_words", | ||
| 170 | "bn_add_words", | ||
| 171 | "bn_div_words", | ||
| 172 | "bn_sqr_words", | ||
| 173 | "bn_mul_words", | ||
| 174 | "bn_mul_add_words"); | ||
| 175 | 160 | ||
| 176 | if ($opf =~ /linux/) { do_linux(); } | 161 | $data=<<EOF; |
| 177 | elsif ($opf =~ /aix/) { do_aix(); } | ||
| 178 | elsif ($opf =~ /osx/) { do_osx(); } | ||
| 179 | else { do_bsd(); } | ||
| 180 | |||
| 181 | sub do_linux { | ||
| 182 | $d=&data(); | ||
| 183 | |||
| 184 | if ($BITS==64) { | ||
| 185 | foreach $t (@items) { | ||
| 186 | $d =~ s/\.$t:/\ | ||
| 187 | \t.section\t".opd","aw"\ | ||
| 188 | \t.align\t3\ | ||
| 189 | \t.globl\t$t\ | ||
| 190 | $t:\ | ||
| 191 | \t.quad\t.$t,.TOC.\@tocbase,0\ | ||
| 192 | \t.size\t$t,24\ | ||
| 193 | \t.previous\n\ | ||
| 194 | \t.type\t.$t,\@function\ | ||
| 195 | \t.globl\t.$t\ | ||
| 196 | .$t:/g; | ||
| 197 | } | ||
| 198 | } | ||
| 199 | else { | ||
| 200 | foreach $t (@items) { | ||
| 201 | $d=~s/\.$t/$t/g; | ||
| 202 | } | ||
| 203 | } | ||
| 204 | # hide internal labels to avoid pollution of name table... | ||
| 205 | $d=~s/Lppcasm_/.Lppcasm_/gm; | ||
| 206 | print $d; | ||
| 207 | } | ||
| 208 | |||
| 209 | sub do_aix { | ||
| 210 | # AIX assembler is smart enough to please the linker without | ||
| 211 | # making us do something special... | ||
| 212 | print &data(); | ||
| 213 | } | ||
| 214 | |||
| 215 | # MacOSX 32 bit | ||
| 216 | sub do_osx { | ||
| 217 | $d=&data(); | ||
| 218 | # Change the bn symbol prefix from '.' to '_' | ||
| 219 | foreach $t (@items) { | ||
| 220 | $d=~s/\.$t/_$t/g; | ||
| 221 | } | ||
| 222 | # Change .machine to something OS X asm will accept | ||
| 223 | $d=~s/\.machine.*/.text/g; | ||
| 224 | $d=~s/\#/;/g; # change comment from '#' to ';' | ||
| 225 | print $d; | ||
| 226 | } | ||
| 227 | |||
| 228 | # BSD (Untested) | ||
| 229 | sub do_bsd { | ||
| 230 | $d=&data(); | ||
| 231 | foreach $t (@items) { | ||
| 232 | $d=~s/\.$t/_$t/g; | ||
| 233 | } | ||
| 234 | print $d; | ||
| 235 | } | ||
| 236 | |||
| 237 | sub data { | ||
| 238 | local($data)=<<EOF; | ||
| 239 | #-------------------------------------------------------------------- | 162 | #-------------------------------------------------------------------- |
| 240 | # | 163 | # |
| 241 | # | 164 | # |
| @@ -297,33 +220,20 @@ sub data { | |||
| 297 | # | 220 | # |
| 298 | # Defines to be used in the assembly code. | 221 | # Defines to be used in the assembly code. |
| 299 | # | 222 | # |
| 300 | .set r0,0 # we use it as storage for value of 0 | 223 | #.set r0,0 # we use it as storage for value of 0 |
| 301 | .set SP,1 # preserved | 224 | #.set SP,1 # preserved |
| 302 | .set RTOC,2 # preserved | 225 | #.set RTOC,2 # preserved |
| 303 | .set r3,3 # 1st argument/return value | 226 | #.set r3,3 # 1st argument/return value |
| 304 | .set r4,4 # 2nd argument/volatile register | 227 | #.set r4,4 # 2nd argument/volatile register |
| 305 | .set r5,5 # 3rd argument/volatile register | 228 | #.set r5,5 # 3rd argument/volatile register |
| 306 | .set r6,6 # ... | 229 | #.set r6,6 # ... |
| 307 | .set r7,7 | 230 | #.set r7,7 |
| 308 | .set r8,8 | 231 | #.set r8,8 |
| 309 | .set r9,9 | 232 | #.set r9,9 |
| 310 | .set r10,10 | 233 | #.set r10,10 |
| 311 | .set r11,11 | 234 | #.set r11,11 |
| 312 | .set r12,12 | 235 | #.set r12,12 |
| 313 | .set r13,13 # not used, nor any other "below" it... | 236 | #.set r13,13 # not used, nor any other "below" it... |
| 314 | |||
| 315 | .set BO_IF_NOT,4 | ||
| 316 | .set BO_IF,12 | ||
| 317 | .set BO_dCTR_NZERO,16 | ||
| 318 | .set BO_dCTR_ZERO,18 | ||
| 319 | .set BO_ALWAYS,20 | ||
| 320 | .set CR0_LT,0; | ||
| 321 | .set CR0_GT,1; | ||
| 322 | .set CR0_EQ,2 | ||
| 323 | .set CR1_FX,4; | ||
| 324 | .set CR1_FEX,5; | ||
| 325 | .set CR1_VX,6 | ||
| 326 | .set LR,8 | ||
| 327 | 237 | ||
| 328 | # Declare function names to be global | 238 | # Declare function names to be global |
| 329 | # NOTE: For gcc these names MUST be changed to remove | 239 | # NOTE: For gcc these names MUST be changed to remove |
| @@ -344,7 +254,7 @@ sub data { | |||
| 344 | 254 | ||
| 345 | # .text section | 255 | # .text section |
| 346 | 256 | ||
| 347 | .machine $ISA | 257 | .machine "any" |
| 348 | 258 | ||
| 349 | # | 259 | # |
| 350 | # NOTE: The following label name should be changed to | 260 | # NOTE: The following label name should be changed to |
| @@ -478,7 +388,7 @@ sub data { | |||
| 478 | 388 | ||
| 479 | $ST r9,`6*$BNSZ`(r3) #r[6]=c1 | 389 | $ST r9,`6*$BNSZ`(r3) #r[6]=c1 |
| 480 | $ST r10,`7*$BNSZ`(r3) #r[7]=c2 | 390 | $ST r10,`7*$BNSZ`(r3) #r[7]=c2 |
| 481 | bclr BO_ALWAYS,CR0_LT | 391 | blr |
| 482 | .long 0x00000000 | 392 | .long 0x00000000 |
| 483 | 393 | ||
| 484 | # | 394 | # |
| @@ -903,7 +813,7 @@ sub data { | |||
| 903 | $ST r9, `15*$BNSZ`(r3) #r[15]=c1; | 813 | $ST r9, `15*$BNSZ`(r3) #r[15]=c1; |
| 904 | 814 | ||
| 905 | 815 | ||
| 906 | bclr BO_ALWAYS,CR0_LT | 816 | blr |
| 907 | 817 | ||
| 908 | .long 0x00000000 | 818 | .long 0x00000000 |
| 909 | 819 | ||
| @@ -1055,7 +965,7 @@ sub data { | |||
| 1055 | 965 | ||
| 1056 | $ST r10,`6*$BNSZ`(r3) #r[6]=c1 | 966 | $ST r10,`6*$BNSZ`(r3) #r[6]=c1 |
| 1057 | $ST r11,`7*$BNSZ`(r3) #r[7]=c2 | 967 | $ST r11,`7*$BNSZ`(r3) #r[7]=c2 |
| 1058 | bclr BO_ALWAYS,CR0_LT | 968 | blr |
| 1059 | .long 0x00000000 | 969 | .long 0x00000000 |
| 1060 | 970 | ||
| 1061 | # | 971 | # |
| @@ -1591,7 +1501,7 @@ sub data { | |||
| 1591 | adde r10,r10,r9 | 1501 | adde r10,r10,r9 |
| 1592 | $ST r12,`14*$BNSZ`(r3) #r[14]=c3; | 1502 | $ST r12,`14*$BNSZ`(r3) #r[14]=c3; |
| 1593 | $ST r10,`15*$BNSZ`(r3) #r[15]=c1; | 1503 | $ST r10,`15*$BNSZ`(r3) #r[15]=c1; |
| 1594 | bclr BO_ALWAYS,CR0_LT | 1504 | blr |
| 1595 | .long 0x00000000 | 1505 | .long 0x00000000 |
| 1596 | 1506 | ||
| 1597 | # | 1507 | # |
| @@ -1623,7 +1533,7 @@ sub data { | |||
| 1623 | subfc. r7,r0,r6 # If r6 is 0 then result is 0. | 1533 | subfc. r7,r0,r6 # If r6 is 0 then result is 0. |
| 1624 | # if r6 > 0 then result !=0 | 1534 | # if r6 > 0 then result !=0 |
| 1625 | # In either case carry bit is set. | 1535 | # In either case carry bit is set. |
| 1626 | bc BO_IF,CR0_EQ,Lppcasm_sub_adios | 1536 | beq Lppcasm_sub_adios |
| 1627 | addi r4,r4,-$BNSZ | 1537 | addi r4,r4,-$BNSZ |
| 1628 | addi r3,r3,-$BNSZ | 1538 | addi r3,r3,-$BNSZ |
| 1629 | addi r5,r5,-$BNSZ | 1539 | addi r5,r5,-$BNSZ |
| @@ -1635,11 +1545,11 @@ Lppcasm_sub_mainloop: | |||
| 1635 | # if carry = 1 this is r7-r8. Else it | 1545 | # if carry = 1 this is r7-r8. Else it |
| 1636 | # is r7-r8 -1 as we need. | 1546 | # is r7-r8 -1 as we need. |
| 1637 | $STU r6,$BNSZ(r3) | 1547 | $STU r6,$BNSZ(r3) |
| 1638 | bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_sub_mainloop | 1548 | bdnz- Lppcasm_sub_mainloop |
| 1639 | Lppcasm_sub_adios: | 1549 | Lppcasm_sub_adios: |
| 1640 | subfze r3,r0 # if carry bit is set then r3 = 0 else -1 | 1550 | subfze r3,r0 # if carry bit is set then r3 = 0 else -1 |
| 1641 | andi. r3,r3,1 # keep only last bit. | 1551 | andi. r3,r3,1 # keep only last bit. |
| 1642 | bclr BO_ALWAYS,CR0_LT | 1552 | blr |
| 1643 | .long 0x00000000 | 1553 | .long 0x00000000 |
| 1644 | 1554 | ||
| 1645 | 1555 | ||
| @@ -1670,7 +1580,7 @@ Lppcasm_sub_adios: | |||
| 1670 | # check for r6 = 0. Is this needed? | 1580 | # check for r6 = 0. Is this needed? |
| 1671 | # | 1581 | # |
| 1672 | addic. r6,r6,0 #test r6 and clear carry bit. | 1582 | addic. r6,r6,0 #test r6 and clear carry bit. |
| 1673 | bc BO_IF,CR0_EQ,Lppcasm_add_adios | 1583 | beq Lppcasm_add_adios |
| 1674 | addi r4,r4,-$BNSZ | 1584 | addi r4,r4,-$BNSZ |
| 1675 | addi r3,r3,-$BNSZ | 1585 | addi r3,r3,-$BNSZ |
| 1676 | addi r5,r5,-$BNSZ | 1586 | addi r5,r5,-$BNSZ |
| @@ -1680,10 +1590,10 @@ Lppcasm_add_mainloop: | |||
| 1680 | $LDU r8,$BNSZ(r5) | 1590 | $LDU r8,$BNSZ(r5) |
| 1681 | adde r8,r7,r8 | 1591 | adde r8,r7,r8 |
| 1682 | $STU r8,$BNSZ(r3) | 1592 | $STU r8,$BNSZ(r3) |
| 1683 | bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_add_mainloop | 1593 | bdnz- Lppcasm_add_mainloop |
| 1684 | Lppcasm_add_adios: | 1594 | Lppcasm_add_adios: |
| 1685 | addze r3,r0 #return carry bit. | 1595 | addze r3,r0 #return carry bit. |
| 1686 | bclr BO_ALWAYS,CR0_LT | 1596 | blr |
| 1687 | .long 0x00000000 | 1597 | .long 0x00000000 |
| 1688 | 1598 | ||
| 1689 | # | 1599 | # |
| @@ -1707,24 +1617,24 @@ Lppcasm_add_adios: | |||
| 1707 | # r5 = d | 1617 | # r5 = d |
| 1708 | 1618 | ||
| 1709 | $UCMPI 0,r5,0 # compare r5 and 0 | 1619 | $UCMPI 0,r5,0 # compare r5 and 0 |
| 1710 | bc BO_IF_NOT,CR0_EQ,Lppcasm_div1 # proceed if d!=0 | 1620 | bne Lppcasm_div1 # proceed if d!=0 |
| 1711 | li r3,-1 # d=0 return -1 | 1621 | li r3,-1 # d=0 return -1 |
| 1712 | bclr BO_ALWAYS,CR0_LT | 1622 | blr |
| 1713 | Lppcasm_div1: | 1623 | Lppcasm_div1: |
| 1714 | xor r0,r0,r0 #r0=0 | 1624 | xor r0,r0,r0 #r0=0 |
| 1715 | li r8,$BITS | 1625 | li r8,$BITS |
| 1716 | $CNTLZ. r7,r5 #r7 = num leading 0s in d. | 1626 | $CNTLZ. r7,r5 #r7 = num leading 0s in d. |
| 1717 | bc BO_IF,CR0_EQ,Lppcasm_div2 #proceed if no leading zeros | 1627 | beq Lppcasm_div2 #proceed if no leading zeros |
| 1718 | subf r8,r7,r8 #r8 = BN_num_bits_word(d) | 1628 | subf r8,r7,r8 #r8 = BN_num_bits_word(d) |
| 1719 | $SHR. r9,r3,r8 #are there any bits above r8'th? | 1629 | $SHR. r9,r3,r8 #are there any bits above r8'th? |
| 1720 | $TR 16,r9,r0 #if there're, signal to dump core... | 1630 | $TR 16,r9,r0 #if there're, signal to dump core... |
| 1721 | Lppcasm_div2: | 1631 | Lppcasm_div2: |
| 1722 | $UCMP 0,r3,r5 #h>=d? | 1632 | $UCMP 0,r3,r5 #h>=d? |
| 1723 | bc BO_IF,CR0_LT,Lppcasm_div3 #goto Lppcasm_div3 if not | 1633 | blt Lppcasm_div3 #goto Lppcasm_div3 if not |
| 1724 | subf r3,r5,r3 #h-=d ; | 1634 | subf r3,r5,r3 #h-=d ; |
| 1725 | Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i | 1635 | Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i |
| 1726 | cmpi 0,0,r7,0 # is (i == 0)? | 1636 | cmpi 0,0,r7,0 # is (i == 0)? |
| 1727 | bc BO_IF,CR0_EQ,Lppcasm_div4 | 1637 | beq Lppcasm_div4 |
| 1728 | $SHL r3,r3,r7 # h = (h<< i) | 1638 | $SHL r3,r3,r7 # h = (h<< i) |
| 1729 | $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i) | 1639 | $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i) |
| 1730 | $SHL r5,r5,r7 # d<<=i | 1640 | $SHL r5,r5,r7 # d<<=i |
| @@ -1741,7 +1651,7 @@ Lppcasm_divouterloop: | |||
| 1741 | $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4 | 1651 | $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4 |
| 1742 | # compute here for innerloop. | 1652 | # compute here for innerloop. |
| 1743 | $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh | 1653 | $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh |
| 1744 | bc BO_IF_NOT,CR0_EQ,Lppcasm_div5 # goto Lppcasm_div5 if not | 1654 | bne Lppcasm_div5 # goto Lppcasm_div5 if not |
| 1745 | 1655 | ||
| 1746 | li r8,-1 | 1656 | li r8,-1 |
| 1747 | $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l | 1657 | $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l |
| @@ -1762,9 +1672,9 @@ Lppcasm_divinnerloop: | |||
| 1762 | # the following 2 instructions do that | 1672 | # the following 2 instructions do that |
| 1763 | $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4) | 1673 | $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4) |
| 1764 | or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4) | 1674 | or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4) |
| 1765 | $UCMP 1,r6,r7 # compare (tl <= r7) | 1675 | $UCMP cr1,r6,r7 # compare (tl <= r7) |
| 1766 | bc BO_IF_NOT,CR0_EQ,Lppcasm_divinnerexit | 1676 | bne Lppcasm_divinnerexit |
| 1767 | bc BO_IF_NOT,CR1_FEX,Lppcasm_divinnerexit | 1677 | ble cr1,Lppcasm_divinnerexit |
| 1768 | addi r8,r8,-1 #q-- | 1678 | addi r8,r8,-1 #q-- |
| 1769 | subf r12,r9,r12 #th -=dh | 1679 | subf r12,r9,r12 #th -=dh |
| 1770 | $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop. | 1680 | $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop. |
| @@ -1773,14 +1683,14 @@ Lppcasm_divinnerloop: | |||
| 1773 | Lppcasm_divinnerexit: | 1683 | Lppcasm_divinnerexit: |
| 1774 | $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4) | 1684 | $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4) |
| 1775 | $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h; | 1685 | $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h; |
| 1776 | $UCMP 1,r4,r11 # compare l and tl | 1686 | $UCMP cr1,r4,r11 # compare l and tl |
| 1777 | add r12,r12,r10 # th+=t | 1687 | add r12,r12,r10 # th+=t |
| 1778 | bc BO_IF_NOT,CR1_FX,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7 | 1688 | bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7 |
| 1779 | addi r12,r12,1 # th++ | 1689 | addi r12,r12,1 # th++ |
| 1780 | Lppcasm_div7: | 1690 | Lppcasm_div7: |
| 1781 | subf r11,r11,r4 #r11=l-tl | 1691 | subf r11,r11,r4 #r11=l-tl |
| 1782 | $UCMP 1,r3,r12 #compare h and th | 1692 | $UCMP cr1,r3,r12 #compare h and th |
| 1783 | bc BO_IF_NOT,CR1_FX,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8 | 1693 | bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8 |
| 1784 | addi r8,r8,-1 # q-- | 1694 | addi r8,r8,-1 # q-- |
| 1785 | add r3,r5,r3 # h+=d | 1695 | add r3,r5,r3 # h+=d |
| 1786 | Lppcasm_div8: | 1696 | Lppcasm_div8: |
| @@ -1791,12 +1701,12 @@ Lppcasm_div8: | |||
| 1791 | # the following 2 instructions will do this. | 1701 | # the following 2 instructions will do this. |
| 1792 | $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2. | 1702 | $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2. |
| 1793 | $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3 | 1703 | $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3 |
| 1794 | bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_div9#if (count==0) break ; | 1704 | bdz Lppcasm_div9 #if (count==0) break ; |
| 1795 | $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4 | 1705 | $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4 |
| 1796 | b Lppcasm_divouterloop | 1706 | b Lppcasm_divouterloop |
| 1797 | Lppcasm_div9: | 1707 | Lppcasm_div9: |
| 1798 | or r3,r8,r0 | 1708 | or r3,r8,r0 |
| 1799 | bclr BO_ALWAYS,CR0_LT | 1709 | blr |
| 1800 | .long 0x00000000 | 1710 | .long 0x00000000 |
| 1801 | 1711 | ||
| 1802 | # | 1712 | # |
| @@ -1822,7 +1732,7 @@ Lppcasm_div9: | |||
| 1822 | # No unrolling done here. Not performance critical. | 1732 | # No unrolling done here. Not performance critical. |
| 1823 | 1733 | ||
| 1824 | addic. r5,r5,0 #test r5. | 1734 | addic. r5,r5,0 #test r5. |
| 1825 | bc BO_IF,CR0_EQ,Lppcasm_sqr_adios | 1735 | beq Lppcasm_sqr_adios |
| 1826 | addi r4,r4,-$BNSZ | 1736 | addi r4,r4,-$BNSZ |
| 1827 | addi r3,r3,-$BNSZ | 1737 | addi r3,r3,-$BNSZ |
| 1828 | mtctr r5 | 1738 | mtctr r5 |
| @@ -1833,9 +1743,9 @@ Lppcasm_sqr_mainloop: | |||
| 1833 | $UMULH r8,r6,r6 | 1743 | $UMULH r8,r6,r6 |
| 1834 | $STU r7,$BNSZ(r3) | 1744 | $STU r7,$BNSZ(r3) |
| 1835 | $STU r8,$BNSZ(r3) | 1745 | $STU r8,$BNSZ(r3) |
| 1836 | bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_sqr_mainloop | 1746 | bdnz- Lppcasm_sqr_mainloop |
| 1837 | Lppcasm_sqr_adios: | 1747 | Lppcasm_sqr_adios: |
| 1838 | bclr BO_ALWAYS,CR0_LT | 1748 | blr |
| 1839 | .long 0x00000000 | 1749 | .long 0x00000000 |
| 1840 | 1750 | ||
| 1841 | 1751 | ||
| @@ -1858,7 +1768,7 @@ Lppcasm_sqr_adios: | |||
| 1858 | xor r0,r0,r0 | 1768 | xor r0,r0,r0 |
| 1859 | xor r12,r12,r12 # used for carry | 1769 | xor r12,r12,r12 # used for carry |
| 1860 | rlwinm. r7,r5,30,2,31 # num >> 2 | 1770 | rlwinm. r7,r5,30,2,31 # num >> 2 |
| 1861 | bc BO_IF,CR0_EQ,Lppcasm_mw_REM | 1771 | beq Lppcasm_mw_REM |
| 1862 | mtctr r7 | 1772 | mtctr r7 |
| 1863 | Lppcasm_mw_LOOP: | 1773 | Lppcasm_mw_LOOP: |
| 1864 | #mul(rp[0],ap[0],w,c1); | 1774 | #mul(rp[0],ap[0],w,c1); |
| @@ -1896,11 +1806,11 @@ Lppcasm_mw_LOOP: | |||
| 1896 | 1806 | ||
| 1897 | addi r3,r3,`4*$BNSZ` | 1807 | addi r3,r3,`4*$BNSZ` |
| 1898 | addi r4,r4,`4*$BNSZ` | 1808 | addi r4,r4,`4*$BNSZ` |
| 1899 | bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_mw_LOOP | 1809 | bdnz- Lppcasm_mw_LOOP |
| 1900 | 1810 | ||
| 1901 | Lppcasm_mw_REM: | 1811 | Lppcasm_mw_REM: |
| 1902 | andi. r5,r5,0x3 | 1812 | andi. r5,r5,0x3 |
| 1903 | bc BO_IF,CR0_EQ,Lppcasm_mw_OVER | 1813 | beq Lppcasm_mw_OVER |
| 1904 | #mul(rp[0],ap[0],w,c1); | 1814 | #mul(rp[0],ap[0],w,c1); |
| 1905 | $LD r8,`0*$BNSZ`(r4) | 1815 | $LD r8,`0*$BNSZ`(r4) |
| 1906 | $UMULL r9,r6,r8 | 1816 | $UMULL r9,r6,r8 |
| @@ -1912,7 +1822,7 @@ Lppcasm_mw_REM: | |||
| 1912 | 1822 | ||
| 1913 | addi r5,r5,-1 | 1823 | addi r5,r5,-1 |
| 1914 | cmpli 0,0,r5,0 | 1824 | cmpli 0,0,r5,0 |
| 1915 | bc BO_IF,CR0_EQ,Lppcasm_mw_OVER | 1825 | beq Lppcasm_mw_OVER |
| 1916 | 1826 | ||
| 1917 | 1827 | ||
| 1918 | #mul(rp[1],ap[1],w,c1); | 1828 | #mul(rp[1],ap[1],w,c1); |
| @@ -1926,7 +1836,7 @@ Lppcasm_mw_REM: | |||
| 1926 | 1836 | ||
| 1927 | addi r5,r5,-1 | 1837 | addi r5,r5,-1 |
| 1928 | cmpli 0,0,r5,0 | 1838 | cmpli 0,0,r5,0 |
| 1929 | bc BO_IF,CR0_EQ,Lppcasm_mw_OVER | 1839 | beq Lppcasm_mw_OVER |
| 1930 | 1840 | ||
| 1931 | #mul_add(rp[2],ap[2],w,c1); | 1841 | #mul_add(rp[2],ap[2],w,c1); |
| 1932 | $LD r8,`2*$BNSZ`(r4) | 1842 | $LD r8,`2*$BNSZ`(r4) |
| @@ -1939,7 +1849,7 @@ Lppcasm_mw_REM: | |||
| 1939 | 1849 | ||
| 1940 | Lppcasm_mw_OVER: | 1850 | Lppcasm_mw_OVER: |
| 1941 | addi r3,r12,0 | 1851 | addi r3,r12,0 |
| 1942 | bclr BO_ALWAYS,CR0_LT | 1852 | blr |
| 1943 | .long 0x00000000 | 1853 | .long 0x00000000 |
| 1944 | 1854 | ||
| 1945 | # | 1855 | # |
| @@ -1964,7 +1874,7 @@ Lppcasm_mw_OVER: | |||
| 1964 | xor r0,r0,r0 #r0 = 0 | 1874 | xor r0,r0,r0 #r0 = 0 |
| 1965 | xor r12,r12,r12 #r12 = 0 . used for carry | 1875 | xor r12,r12,r12 #r12 = 0 . used for carry |
| 1966 | rlwinm. r7,r5,30,2,31 # num >> 2 | 1876 | rlwinm. r7,r5,30,2,31 # num >> 2 |
| 1967 | bc BO_IF,CR0_EQ,Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover | 1877 | beq Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover |
| 1968 | mtctr r7 | 1878 | mtctr r7 |
| 1969 | Lppcasm_maw_mainloop: | 1879 | Lppcasm_maw_mainloop: |
| 1970 | #mul_add(rp[0],ap[0],w,c1); | 1880 | #mul_add(rp[0],ap[0],w,c1); |
| @@ -2017,11 +1927,11 @@ Lppcasm_maw_mainloop: | |||
| 2017 | $ST r11,`3*$BNSZ`(r3) | 1927 | $ST r11,`3*$BNSZ`(r3) |
| 2018 | addi r3,r3,`4*$BNSZ` | 1928 | addi r3,r3,`4*$BNSZ` |
| 2019 | addi r4,r4,`4*$BNSZ` | 1929 | addi r4,r4,`4*$BNSZ` |
| 2020 | bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_maw_mainloop | 1930 | bdnz- Lppcasm_maw_mainloop |
| 2021 | 1931 | ||
| 2022 | Lppcasm_maw_leftover: | 1932 | Lppcasm_maw_leftover: |
| 2023 | andi. r5,r5,0x3 | 1933 | andi. r5,r5,0x3 |
| 2024 | bc BO_IF,CR0_EQ,Lppcasm_maw_adios | 1934 | beq Lppcasm_maw_adios |
| 2025 | addi r3,r3,-$BNSZ | 1935 | addi r3,r3,-$BNSZ |
| 2026 | addi r4,r4,-$BNSZ | 1936 | addi r4,r4,-$BNSZ |
| 2027 | #mul_add(rp[0],ap[0],w,c1); | 1937 | #mul_add(rp[0],ap[0],w,c1); |
| @@ -2036,7 +1946,7 @@ Lppcasm_maw_leftover: | |||
| 2036 | addze r12,r10 | 1946 | addze r12,r10 |
| 2037 | $ST r9,0(r3) | 1947 | $ST r9,0(r3) |
| 2038 | 1948 | ||
| 2039 | bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios | 1949 | bdz Lppcasm_maw_adios |
| 2040 | #mul_add(rp[1],ap[1],w,c1); | 1950 | #mul_add(rp[1],ap[1],w,c1); |
| 2041 | $LDU r8,$BNSZ(r4) | 1951 | $LDU r8,$BNSZ(r4) |
| 2042 | $UMULL r9,r6,r8 | 1952 | $UMULL r9,r6,r8 |
| @@ -2048,7 +1958,7 @@ Lppcasm_maw_leftover: | |||
| 2048 | addze r12,r10 | 1958 | addze r12,r10 |
| 2049 | $ST r9,0(r3) | 1959 | $ST r9,0(r3) |
| 2050 | 1960 | ||
| 2051 | bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios | 1961 | bdz Lppcasm_maw_adios |
| 2052 | #mul_add(rp[2],ap[2],w,c1); | 1962 | #mul_add(rp[2],ap[2],w,c1); |
| 2053 | $LDU r8,$BNSZ(r4) | 1963 | $LDU r8,$BNSZ(r4) |
| 2054 | $UMULL r9,r6,r8 | 1964 | $UMULL r9,r6,r8 |
| @@ -2062,17 +1972,10 @@ Lppcasm_maw_leftover: | |||
| 2062 | 1972 | ||
| 2063 | Lppcasm_maw_adios: | 1973 | Lppcasm_maw_adios: |
| 2064 | addi r3,r12,0 | 1974 | addi r3,r12,0 |
| 2065 | bclr BO_ALWAYS,CR0_LT | 1975 | blr |
| 2066 | .long 0x00000000 | 1976 | .long 0x00000000 |
| 2067 | .align 4 | 1977 | .align 4 |
| 2068 | EOF | 1978 | EOF |
| 2069 | $data =~ s/\`([^\`]*)\`/eval $1/gem; | 1979 | $data =~ s/\`([^\`]*)\`/eval $1/gem; |
| 2070 | 1980 | print $data; | |
| 2071 | # if some assembler chokes on some simplified mnemonic, | 1981 | close STDOUT; |
| 2072 | # this is the spot to fix it up, e.g.: | ||
| 2073 | # GNU as doesn't seem to accept cmplw, 32-bit unsigned compare | ||
| 2074 | $data =~ s/^(\s*)cmplw(\s+)([^,]+),(.*)/$1cmpl$2$3,0,$4/gm; | ||
| 2075 | # assembler X doesn't accept li, load immediate value | ||
| 2076 | #$data =~ s/^(\s*)li(\s+)([^,]+),(.*)/$1addi$2$3,0,$4/gm; | ||
| 2077 | return($data); | ||
| 2078 | } | ||
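The ppc.pl change above has two parts: the per-OS output subroutines (do_linux/do_aix/do_osx/do_bsd) give way to piping the generated source through perlasm's ppc-xlate.pl, and every explicit `bc`/`bclr` branch encoding, together with the `.set BO_*`/`CR*_*` constants that backed it, is rewritten as the equivalent POWER simplified mnemonic (with condition-register operands such as `1` spelled `cr1`). The rewrite is mechanical; as a reading aid only — this filter is a sketch of the mapping, not part of the commit — the substitutions seen throughout the hunks above look like this:

```perl
#!/usr/bin/perl
# Illustrative filter reproducing the branch-mnemonic mapping seen in the
# ppc.pl hunks above.  Reads old-style assembly on stdin, prints the
# simplified-mnemonic form on stdout.  Not part of the commit.
use strict;
use warnings;

while (my $line = <STDIN>) {
    $line =~ s/\bbclr\s+BO_ALWAYS,CR0_LT\b/blr/;        # unconditional return
    $line =~ s/\bbc\s+BO_IF,CR0_EQ,/beq /;              # branch if cr0 EQ set
    $line =~ s/\bbc\s+BO_IF_NOT,CR0_EQ,/bne /;          # branch if cr0 EQ clear
    $line =~ s/\bbc\s+BO_IF,CR0_LT,/blt /;              # branch if cr0 LT set
    $line =~ s/\bbc\s+BO_IF_NOT,CR1_FX,/bge cr1,/;      # cr1 bit 4 (LT) clear
    $line =~ s/\bbc\s+BO_IF_NOT,CR1_FEX,/ble cr1,/;     # cr1 bit 5 (GT) clear
    $line =~ s/\bbc\s+BO_dCTR_NZERO,CR0_EQ,/bdnz- /;    # dec CTR, branch if CTR != 0
    $line =~ s/\bbc\s+BO_dCTR_ZERO,CR0_EQ,/bdz /;       # dec CTR, branch if CTR == 0
    print $line;
}
```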
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gcc.c b/src/lib/libcrypto/bn/asm/x86_64-gcc.c
index f13f52dd85..acb0b40118 100644
--- a/src/lib/libcrypto/bn/asm/x86_64-gcc.c
+++ b/src/lib/libcrypto/bn/asm/x86_64-gcc.c
| @@ -1,4 +1,5 @@ | |||
| 1 | #ifdef __SUNPRO_C | 1 | #include "../bn_lcl.h" |
| 2 | #if !(defined(__GNUC__) && __GNUC__>=2) | ||
| 2 | # include "../bn_asm.c" /* kind of dirty hack for Sun Studio */ | 3 | # include "../bn_asm.c" /* kind of dirty hack for Sun Studio */ |
| 3 | #else | 4 | #else |
| 4 | /* | 5 | /* |
| @@ -54,7 +55,15 @@ | |||
| 54 | * machine. | 55 | * machine. |
| 55 | */ | 56 | */ |
| 56 | 57 | ||
| 58 | #ifdef _WIN64 | ||
| 59 | #define BN_ULONG unsigned long long | ||
| 60 | #else | ||
| 57 | #define BN_ULONG unsigned long | 61 | #define BN_ULONG unsigned long |
| 62 | #endif | ||
| 63 | |||
| 64 | #undef mul | ||
| 65 | #undef mul_add | ||
| 66 | #undef sqr | ||
| 58 | 67 | ||
| 59 | /* | 68 | /* |
| 60 | * "m"(a), "+m"(r) is the way to favor DirectPath µ-code; | 69 | * "m"(a), "+m"(r) is the way to favor DirectPath µ-code; |
| @@ -97,7 +106,7 @@ | |||
| 97 | : "a"(a) \ | 106 | : "a"(a) \ |
| 98 | : "cc"); | 107 | : "cc"); |
| 99 | 108 | ||
| 100 | BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | 109 | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
| 101 | { | 110 | { |
| 102 | BN_ULONG c1=0; | 111 | BN_ULONG c1=0; |
| 103 | 112 | ||
| @@ -121,7 +130,7 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | |||
| 121 | return(c1); | 130 | return(c1); |
| 122 | } | 131 | } |
| 123 | 132 | ||
| 124 | BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | 133 | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
| 125 | { | 134 | { |
| 126 | BN_ULONG c1=0; | 135 | BN_ULONG c1=0; |
| 127 | 136 | ||
| @@ -144,7 +153,7 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | |||
| 144 | return(c1); | 153 | return(c1); |
| 145 | } | 154 | } |
| 146 | 155 | ||
| 147 | void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n) | 156 | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) |
| 148 | { | 157 | { |
| 149 | if (n <= 0) return; | 158 | if (n <= 0) return; |
| 150 | 159 | ||
| @@ -175,14 +184,14 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) | |||
| 175 | return ret; | 184 | return ret; |
| 176 | } | 185 | } |
| 177 | 186 | ||
| 178 | BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n) | 187 | BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n) |
| 179 | { BN_ULONG ret=0,i=0; | 188 | { BN_ULONG ret=0,i=0; |
| 180 | 189 | ||
| 181 | if (n <= 0) return 0; | 190 | if (n <= 0) return 0; |
| 182 | 191 | ||
| 183 | asm ( | 192 | asm ( |
| 184 | " subq %2,%2 \n" | 193 | " subq %2,%2 \n" |
| 185 | ".align 16 \n" | 194 | ".p2align 4 \n" |
| 186 | "1: movq (%4,%2,8),%0 \n" | 195 | "1: movq (%4,%2,8),%0 \n" |
| 187 | " adcq (%5,%2,8),%0 \n" | 196 | " adcq (%5,%2,8),%0 \n" |
| 188 | " movq %0,(%3,%2,8) \n" | 197 | " movq %0,(%3,%2,8) \n" |
| @@ -198,14 +207,14 @@ BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n) | |||
| 198 | } | 207 | } |
| 199 | 208 | ||
| 200 | #ifndef SIMICS | 209 | #ifndef SIMICS |
| 201 | BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n) | 210 | BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n) |
| 202 | { BN_ULONG ret=0,i=0; | 211 | { BN_ULONG ret=0,i=0; |
| 203 | 212 | ||
| 204 | if (n <= 0) return 0; | 213 | if (n <= 0) return 0; |
| 205 | 214 | ||
| 206 | asm ( | 215 | asm ( |
| 207 | " subq %2,%2 \n" | 216 | " subq %2,%2 \n" |
| 208 | ".align 16 \n" | 217 | ".p2align 4 \n" |
| 209 | "1: movq (%4,%2,8),%0 \n" | 218 | "1: movq (%4,%2,8),%0 \n" |
| 210 | " sbbq (%5,%2,8),%0 \n" | 219 | " sbbq (%5,%2,8),%0 \n" |
| 211 | " movq %0,(%3,%2,8) \n" | 220 | " movq %0,(%3,%2,8) \n" |
| @@ -485,7 +494,7 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | |||
| 485 | r[7]=c2; | 494 | r[7]=c2; |
| 486 | } | 495 | } |
| 487 | 496 | ||
| 488 | void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) | 497 | void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) |
| 489 | { | 498 | { |
| 490 | BN_ULONG t1,t2; | 499 | BN_ULONG t1,t2; |
| 491 | BN_ULONG c1,c2,c3; | 500 | BN_ULONG c1,c2,c3; |
| @@ -561,7 +570,7 @@ void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) | |||
| 561 | r[15]=c1; | 570 | r[15]=c1; |
| 562 | } | 571 | } |
| 563 | 572 | ||
| 564 | void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) | 573 | void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) |
| 565 | { | 574 | { |
| 566 | BN_ULONG t1,t2; | 575 | BN_ULONG t1,t2; |
| 567 | BN_ULONG c1,c2,c3; | 576 | BN_ULONG c1,c2,c3; |
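The x86_64-gcc.c hunks above only adjust the surrounding scaffolding — the compiler guard, the BN_ULONG definition on Win64, `const` qualifiers and `.p2align` — while the word-array routines keep their usual contract. For reference, here is a rough model of what `bn_mul_add_words(rp, ap, num, w)` is expected to compute (a sketch using `Math::BigInt` to stand in for 64-bit arithmetic, not OpenSSL code; `bn_mul_add_words_ref` is a made-up name):

```perl
#!/usr/bin/perl
# Reference model (sketch, not OpenSSL code) of the bn_mul_add_words contract:
#   for i in 0 .. num-1:  carry:rp[i] = rp[i] + ap[i]*w + carry
#   return the final carry word.
use strict;
use warnings;
use Math::BigInt;

my $MASK = Math::BigInt->new(2)->bpow(64)->bsub(1);     # 2^64 - 1, one 64-bit BN_ULONG

sub bn_mul_add_words_ref {
    my ($rp, $ap, $num, $w) = @_;                       # $rp/$ap: array refs of words
    my $carry = Math::BigInt->bzero();
    for my $i (0 .. $num - 1) {
        my $t = $w->copy()->bmul($ap->[$i])->badd($rp->[$i])->badd($carry);
        $rp->[$i] = $t->copy()->band($MASK);            # low 64 bits back into rp[]
        $carry    = $t->brsft(64);                      # high bits carry into the next word
    }
    return $carry;
}

# Tiny check: rp = {1, 0}, ap = {2^64-1, 0}, w = 2
# -> rp[0] = 2^64-1, the carry ripples into rp[1] = 1, and the final carry is 0.
my @rp = (1, 0);
my @ap = ($MASK->copy(), 0);
my $c  = bn_mul_add_words_ref(\@rp, \@ap, 2, Math::BigInt->new(2));
print "carry=$c rp0=$rp[0] rp1=$rp[1]\n";
```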
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont.pl b/src/lib/libcrypto/bn/asm/x86_64-mont.pl
index c43b69592a..3b7a6f243f 100755
--- a/src/lib/libcrypto/bn/asm/x86_64-mont.pl
+++ b/src/lib/libcrypto/bn/asm/x86_64-mont.pl
| @@ -15,14 +15,18 @@ | |||
| 15 | # respectful 50%. It remains to be seen if loop unrolling and | 15 | # respectful 50%. It remains to be seen if loop unrolling and |
| 16 | # dedicated squaring routine can provide further improvement... | 16 | # dedicated squaring routine can provide further improvement... |
| 17 | 17 | ||
| 18 | $output=shift; | 18 | $flavour = shift; |
| 19 | $output = shift; | ||
| 20 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
| 21 | |||
| 22 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 19 | 23 | ||
| 20 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | 24 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| 21 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | 25 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or |
| 22 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | 26 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or |
| 23 | die "can't locate x86_64-xlate.pl"; | 27 | die "can't locate x86_64-xlate.pl"; |
| 24 | 28 | ||
| 25 | open STDOUT,"| $^X $xlate $output"; | 29 | open STDOUT,"| $^X $xlate $flavour $output"; |
| 26 | 30 | ||
| 27 | # int bn_mul_mont( | 31 | # int bn_mul_mont( |
| 28 | $rp="%rdi"; # BN_ULONG *rp, | 32 | $rp="%rdi"; # BN_ULONG *rp, |
| @@ -55,13 +59,14 @@ bn_mul_mont: | |||
| 55 | push %r15 | 59 | push %r15 |
| 56 | 60 | ||
| 57 | mov ${num}d,${num}d | 61 | mov ${num}d,${num}d |
| 58 | lea 2($num),%rax | 62 | lea 2($num),%r10 |
| 59 | mov %rsp,%rbp | 63 | mov %rsp,%r11 |
| 60 | neg %rax | 64 | neg %r10 |
| 61 | lea (%rsp,%rax,8),%rsp # tp=alloca(8*(num+2)) | 65 | lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2)) |
| 62 | and \$-1024,%rsp # minimize TLB usage | 66 | and \$-1024,%rsp # minimize TLB usage |
| 63 | 67 | ||
| 64 | mov %rbp,8(%rsp,$num,8) # tp[num+1]=%rsp | 68 | mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp |
| 69 | .Lprologue: | ||
| 65 | mov %rdx,$bp # $bp reassigned, remember? | 70 | mov %rdx,$bp # $bp reassigned, remember? |
| 66 | 71 | ||
| 67 | mov ($n0),$n0 # pull n0[0] value | 72 | mov ($n0),$n0 # pull n0[0] value |
| @@ -197,18 +202,129 @@ bn_mul_mont: | |||
| 197 | dec $j | 202 | dec $j |
| 198 | jge .Lcopy | 203 | jge .Lcopy |
| 199 | 204 | ||
| 200 | mov 8(%rsp,$num,8),%rsp # restore %rsp | 205 | mov 8(%rsp,$num,8),%rsi # restore %rsp |
| 201 | mov \$1,%rax | 206 | mov \$1,%rax |
| 207 | mov (%rsi),%r15 | ||
| 208 | mov 8(%rsi),%r14 | ||
| 209 | mov 16(%rsi),%r13 | ||
| 210 | mov 24(%rsi),%r12 | ||
| 211 | mov 32(%rsi),%rbp | ||
| 212 | mov 40(%rsi),%rbx | ||
| 213 | lea 48(%rsi),%rsp | ||
| 214 | .Lepilogue: | ||
| 215 | ret | ||
| 216 | .size bn_mul_mont,.-bn_mul_mont | ||
| 217 | .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 218 | .align 16 | ||
| 219 | ___ | ||
| 220 | |||
| 221 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 222 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
| 223 | if ($win64) { | ||
| 224 | $rec="%rcx"; | ||
| 225 | $frame="%rdx"; | ||
| 226 | $context="%r8"; | ||
| 227 | $disp="%r9"; | ||
| 228 | |||
| 229 | $code.=<<___; | ||
| 230 | .extern __imp_RtlVirtualUnwind | ||
| 231 | .type se_handler,\@abi-omnipotent | ||
| 232 | .align 16 | ||
| 233 | se_handler: | ||
| 234 | push %rsi | ||
| 235 | push %rdi | ||
| 236 | push %rbx | ||
| 237 | push %rbp | ||
| 238 | push %r12 | ||
| 239 | push %r13 | ||
| 240 | push %r14 | ||
| 241 | push %r15 | ||
| 242 | pushfq | ||
| 243 | sub \$64,%rsp | ||
| 244 | |||
| 245 | mov 120($context),%rax # pull context->Rax | ||
| 246 | mov 248($context),%rbx # pull context->Rip | ||
| 247 | |||
| 248 | lea .Lprologue(%rip),%r10 | ||
| 249 | cmp %r10,%rbx # context->Rip<.Lprologue | ||
| 250 | jb .Lin_prologue | ||
| 251 | |||
| 252 | mov 152($context),%rax # pull context->Rsp | ||
| 253 | |||
| 254 | lea .Lepilogue(%rip),%r10 | ||
| 255 | cmp %r10,%rbx # context->Rip>=.Lepilogue | ||
| 256 | jae .Lin_prologue | ||
| 257 | |||
| 258 | mov 192($context),%r10 # pull $num | ||
| 259 | mov 8(%rax,%r10,8),%rax # pull saved stack pointer | ||
| 260 | lea 48(%rax),%rax | ||
| 261 | |||
| 262 | mov -8(%rax),%rbx | ||
| 263 | mov -16(%rax),%rbp | ||
| 264 | mov -24(%rax),%r12 | ||
| 265 | mov -32(%rax),%r13 | ||
| 266 | mov -40(%rax),%r14 | ||
| 267 | mov -48(%rax),%r15 | ||
| 268 | mov %rbx,144($context) # restore context->Rbx | ||
| 269 | mov %rbp,160($context) # restore context->Rbp | ||
| 270 | mov %r12,216($context) # restore context->R12 | ||
| 271 | mov %r13,224($context) # restore context->R13 | ||
| 272 | mov %r14,232($context) # restore context->R14 | ||
| 273 | mov %r15,240($context) # restore context->R15 | ||
| 274 | |||
| 275 | .Lin_prologue: | ||
| 276 | mov 8(%rax),%rdi | ||
| 277 | mov 16(%rax),%rsi | ||
| 278 | mov %rax,152($context) # restore context->Rsp | ||
| 279 | mov %rsi,168($context) # restore context->Rsi | ||
| 280 | mov %rdi,176($context) # restore context->Rdi | ||
| 281 | |||
| 282 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 283 | mov $context,%rsi # context | ||
| 284 | mov \$154,%ecx # sizeof(CONTEXT) | ||
| 285 | .long 0xa548f3fc # cld; rep movsq | ||
| 286 | |||
| 287 | mov $disp,%rsi | ||
| 288 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 289 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 290 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 291 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 292 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 293 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 294 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 295 | mov %r10,32(%rsp) # arg5 | ||
| 296 | mov %r11,40(%rsp) # arg6 | ||
| 297 | mov %r12,48(%rsp) # arg7 | ||
| 298 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 299 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 300 | |||
| 301 | mov \$1,%eax # ExceptionContinueSearch | ||
| 302 | add \$64,%rsp | ||
| 303 | popfq | ||
| 202 | pop %r15 | 304 | pop %r15 |
| 203 | pop %r14 | 305 | pop %r14 |
| 204 | pop %r13 | 306 | pop %r13 |
| 205 | pop %r12 | 307 | pop %r12 |
| 206 | pop %rbp | 308 | pop %rbp |
| 207 | pop %rbx | 309 | pop %rbx |
| 310 | pop %rdi | ||
| 311 | pop %rsi | ||
| 208 | ret | 312 | ret |
| 209 | .size bn_mul_mont,.-bn_mul_mont | 313 | .size se_handler,.-se_handler |
| 210 | .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | 314 | |
| 315 | .section .pdata | ||
| 316 | .align 4 | ||
| 317 | .rva .LSEH_begin_bn_mul_mont | ||
| 318 | .rva .LSEH_end_bn_mul_mont | ||
| 319 | .rva .LSEH_info_bn_mul_mont | ||
| 320 | |||
| 321 | .section .xdata | ||
| 322 | .align 8 | ||
| 323 | .LSEH_info_bn_mul_mont: | ||
| 324 | .byte 9,0,0,0 | ||
| 325 | .rva se_handler | ||
| 211 | ___ | 326 | ___ |
| 327 | } | ||
| 212 | 328 | ||
| 213 | print $code; | 329 | print $code; |
| 214 | close STDOUT; | 330 | close STDOUT; |
