summary refs log tree commit diff
path: root/src/lib/libcrypto/bn/asm
diff options
context:
space:
mode:
author djm <> 2010-10-01 22:54:21 +0000
committer djm <> 2010-10-01 22:54:21 +0000
commit 2ea67f4aa254b09ded62e6e14fc893bbe6381579 (patch)
tree bb3923b81f2ce34b1ad62684afdf1a94d904c185 /src/lib/libcrypto/bn/asm
parent 6ddfb710ab14b10183ff3a6a32f643554c80065e (diff)
parent 829fd51d4f8dde4a7f3bf54754f3c1d1a502f5e2 (diff)
download openbsd-2ea67f4aa254b09ded62e6e14fc893bbe6381579.tar.gz
openbsd-2ea67f4aa254b09ded62e6e14fc893bbe6381579.tar.bz2
openbsd-2ea67f4aa254b09ded62e6e14fc893bbe6381579.zip
This commit was generated by cvs2git to track changes on a CVS vendor
branch.
Diffstat (limited to 'src/lib/libcrypto/bn/asm')
-rw-r--r-- src/lib/libcrypto/bn/asm/alpha-mont.pl 8
-rw-r--r-- src/lib/libcrypto/bn/asm/armv4-mont.pl 1
-rw-r--r-- src/lib/libcrypto/bn/asm/ppc.pl 233
-rw-r--r-- src/lib/libcrypto/bn/asm/x86_64-gcc.c 29
-rwxr-xr-x src/lib/libcrypto/bn/asm/x86_64-mont.pl 136
5 files changed, 218 insertions, 189 deletions
diff --git a/src/lib/libcrypto/bn/asm/alpha-mont.pl b/src/lib/libcrypto/bn/asm/alpha-mont.pl
index 7a2cc3173b..f7e0ca1646 100644
--- a/src/lib/libcrypto/bn/asm/alpha-mont.pl
+++ b/src/lib/libcrypto/bn/asm/alpha-mont.pl
@@ -53,15 +53,15 @@ $code=<<___;
53.align 5 53.align 5
54.ent bn_mul_mont 54.ent bn_mul_mont
55bn_mul_mont: 55bn_mul_mont:
56 lda sp,-40(sp) 56 lda sp,-48(sp)
57 stq ra,0(sp) 57 stq ra,0(sp)
58 stq s3,8(sp) 58 stq s3,8(sp)
59 stq s4,16(sp) 59 stq s4,16(sp)
60 stq s5,24(sp) 60 stq s5,24(sp)
61 stq fp,32(sp) 61 stq fp,32(sp)
62 mov sp,fp 62 mov sp,fp
63 .mask 0x0400f000,-40 63 .mask 0x0400f000,-48
64 .frame fp,40,ra 64 .frame fp,48,ra
65 .prologue 0 65 .prologue 0
66 66
67 .align 4 67 .align 4
@@ -306,7 +306,7 @@ bn_mul_mont:
306 ldq s4,16(sp) 306 ldq s4,16(sp)
307 ldq s5,24(sp) 307 ldq s5,24(sp)
308 ldq fp,32(sp) 308 ldq fp,32(sp)
309 lda sp,40(sp) 309 lda sp,48(sp)
310 ret (ra) 310 ret (ra)
311.end bn_mul_mont 311.end bn_mul_mont
312.rdata 312.rdata
diff --git a/src/lib/libcrypto/bn/asm/armv4-mont.pl b/src/lib/libcrypto/bn/asm/armv4-mont.pl
index 05d5dc1a48..14e0d2d1dd 100644
--- a/src/lib/libcrypto/bn/asm/armv4-mont.pl
+++ b/src/lib/libcrypto/bn/asm/armv4-mont.pl
@@ -193,6 +193,7 @@ bn_mul_mont:
193 bx lr @ interoperable with Thumb ISA:-) 193 bx lr @ interoperable with Thumb ISA:-)
194.size bn_mul_mont,.-bn_mul_mont 194.size bn_mul_mont,.-bn_mul_mont
195.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" 195.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
196.align 2
196___ 197___
197 198
198$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 199$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
diff --git a/src/lib/libcrypto/bn/asm/ppc.pl b/src/lib/libcrypto/bn/asm/ppc.pl
index 08e0053473..37c65d3511 100644
--- a/src/lib/libcrypto/bn/asm/ppc.pl
+++ b/src/lib/libcrypto/bn/asm/ppc.pl
@@ -100,9 +100,9 @@
100# me a note at schari@us.ibm.com 100# me a note at schari@us.ibm.com
101# 101#
102 102
103$opf = shift; 103$flavour = shift;
104 104
105if ($opf =~ /32\.s/) { 105if ($flavour =~ /32/) {
106 $BITS= 32; 106 $BITS= 32;
107 $BNSZ= $BITS/8; 107 $BNSZ= $BITS/8;
108 $ISA= "\"ppc\""; 108 $ISA= "\"ppc\"";
@@ -125,7 +125,7 @@ if ($opf =~ /32\.s/) {
125 $INSR= "insrwi"; # insert right 125 $INSR= "insrwi"; # insert right
126 $ROTL= "rotlwi"; # rotate left by immediate 126 $ROTL= "rotlwi"; # rotate left by immediate
127 $TR= "tw"; # conditional trap 127 $TR= "tw"; # conditional trap
128} elsif ($opf =~ /64\.s/) { 128} elsif ($flavour =~ /64/) {
129 $BITS= 64; 129 $BITS= 64;
130 $BNSZ= $BITS/8; 130 $BNSZ= $BITS/8;
131 $ISA= "\"ppc64\""; 131 $ISA= "\"ppc64\"";
@@ -149,93 +149,16 @@ if ($opf =~ /32\.s/) {
149 $INSR= "insrdi"; # insert right 149 $INSR= "insrdi"; # insert right
150 $ROTL= "rotldi"; # rotate left by immediate 150 $ROTL= "rotldi"; # rotate left by immediate
151 $TR= "td"; # conditional trap 151 $TR= "td"; # conditional trap
152} else { die "nonsense $opf"; } 152} else { die "nonsense $flavour"; }
153 153
154( defined shift || open STDOUT,">$opf" ) || die "can't open $opf: $!"; 154$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
155( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
156( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
157die "can't locate ppc-xlate.pl";
155 158
156# function entry points from the AIX code 159open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
157#
158# There are other, more elegant, ways to handle this. We (IBM) chose
159# this approach as it plays well with scripts we run to 'namespace'
160# OpenSSL .i.e. we add a prefix to all the public symbols so we can
161# co-exist in the same process with other implementations of OpenSSL.
162# 'cleverer' ways of doing these substitutions tend to hide data we
163# need to be obvious.
164#
165my @items = ("bn_sqr_comba4",
166 "bn_sqr_comba8",
167 "bn_mul_comba4",
168 "bn_mul_comba8",
169 "bn_sub_words",
170 "bn_add_words",
171 "bn_div_words",
172 "bn_sqr_words",
173 "bn_mul_words",
174 "bn_mul_add_words");
175 160
176if ($opf =~ /linux/) { do_linux(); } 161$data=<<EOF;
177elsif ($opf =~ /aix/) { do_aix(); }
178elsif ($opf =~ /osx/) { do_osx(); }
179else { do_bsd(); }
180
181sub do_linux {
182 $d=&data();
183
184 if ($BITS==64) {
185 foreach $t (@items) {
186 $d =~ s/\.$t:/\
187\t.section\t".opd","aw"\
188\t.align\t3\
189\t.globl\t$t\
190$t:\
191\t.quad\t.$t,.TOC.\@tocbase,0\
192\t.size\t$t,24\
193\t.previous\n\
194\t.type\t.$t,\@function\
195\t.globl\t.$t\
196.$t:/g;
197 }
198 }
199 else {
200 foreach $t (@items) {
201 $d=~s/\.$t/$t/g;
202 }
203 }
204 # hide internal labels to avoid pollution of name table...
205 $d=~s/Lppcasm_/.Lppcasm_/gm;
206 print $d;
207}
208
209sub do_aix {
210 # AIX assembler is smart enough to please the linker without
211 # making us do something special...
212 print &data();
213}
214
215# MacOSX 32 bit
216sub do_osx {
217 $d=&data();
218 # Change the bn symbol prefix from '.' to '_'
219 foreach $t (@items) {
220 $d=~s/\.$t/_$t/g;
221 }
222 # Change .machine to something OS X asm will accept
223 $d=~s/\.machine.*/.text/g;
224 $d=~s/\#/;/g; # change comment from '#' to ';'
225 print $d;
226}
227
228# BSD (Untested)
229sub do_bsd {
230 $d=&data();
231 foreach $t (@items) {
232 $d=~s/\.$t/_$t/g;
233 }
234 print $d;
235}
236
237sub data {
238 local($data)=<<EOF;
239#-------------------------------------------------------------------- 162#--------------------------------------------------------------------
240# 163#
241# 164#
@@ -297,33 +220,20 @@ sub data {
297# 220#
298# Defines to be used in the assembly code. 221# Defines to be used in the assembly code.
299# 222#
300.set r0,0 # we use it as storage for value of 0 223#.set r0,0 # we use it as storage for value of 0
301.set SP,1 # preserved 224#.set SP,1 # preserved
302.set RTOC,2 # preserved 225#.set RTOC,2 # preserved
303.set r3,3 # 1st argument/return value 226#.set r3,3 # 1st argument/return value
304.set r4,4 # 2nd argument/volatile register 227#.set r4,4 # 2nd argument/volatile register
305.set r5,5 # 3rd argument/volatile register 228#.set r5,5 # 3rd argument/volatile register
306.set r6,6 # ... 229#.set r6,6 # ...
307.set r7,7 230#.set r7,7
308.set r8,8 231#.set r8,8
309.set r9,9 232#.set r9,9
310.set r10,10 233#.set r10,10
311.set r11,11 234#.set r11,11
312.set r12,12 235#.set r12,12
313.set r13,13 # not used, nor any other "below" it... 236#.set r13,13 # not used, nor any other "below" it...
314
315.set BO_IF_NOT,4
316.set BO_IF,12
317.set BO_dCTR_NZERO,16
318.set BO_dCTR_ZERO,18
319.set BO_ALWAYS,20
320.set CR0_LT,0;
321.set CR0_GT,1;
322.set CR0_EQ,2
323.set CR1_FX,4;
324.set CR1_FEX,5;
325.set CR1_VX,6
326.set LR,8
327 237
328# Declare function names to be global 238# Declare function names to be global
329# NOTE: For gcc these names MUST be changed to remove 239# NOTE: For gcc these names MUST be changed to remove
@@ -344,7 +254,7 @@ sub data {
344 254
345# .text section 255# .text section
346 256
347 .machine $ISA 257 .machine "any"
348 258
349# 259#
350# NOTE: The following label name should be changed to 260# NOTE: The following label name should be changed to
@@ -478,7 +388,7 @@ sub data {
478 388
479 $ST r9,`6*$BNSZ`(r3) #r[6]=c1 389 $ST r9,`6*$BNSZ`(r3) #r[6]=c1
480 $ST r10,`7*$BNSZ`(r3) #r[7]=c2 390 $ST r10,`7*$BNSZ`(r3) #r[7]=c2
481 bclr BO_ALWAYS,CR0_LT 391 blr
482 .long 0x00000000 392 .long 0x00000000
483 393
484# 394#
@@ -903,7 +813,7 @@ sub data {
903 $ST r9, `15*$BNSZ`(r3) #r[15]=c1; 813 $ST r9, `15*$BNSZ`(r3) #r[15]=c1;
904 814
905 815
906 bclr BO_ALWAYS,CR0_LT 816 blr
907 817
908 .long 0x00000000 818 .long 0x00000000
909 819
@@ -1055,7 +965,7 @@ sub data {
1055 965
1056 $ST r10,`6*$BNSZ`(r3) #r[6]=c1 966 $ST r10,`6*$BNSZ`(r3) #r[6]=c1
1057 $ST r11,`7*$BNSZ`(r3) #r[7]=c2 967 $ST r11,`7*$BNSZ`(r3) #r[7]=c2
1058 bclr BO_ALWAYS,CR0_LT 968 blr
1059 .long 0x00000000 969 .long 0x00000000
1060 970
1061# 971#
@@ -1591,7 +1501,7 @@ sub data {
1591 adde r10,r10,r9 1501 adde r10,r10,r9
1592 $ST r12,`14*$BNSZ`(r3) #r[14]=c3; 1502 $ST r12,`14*$BNSZ`(r3) #r[14]=c3;
1593 $ST r10,`15*$BNSZ`(r3) #r[15]=c1; 1503 $ST r10,`15*$BNSZ`(r3) #r[15]=c1;
1594 bclr BO_ALWAYS,CR0_LT 1504 blr
1595 .long 0x00000000 1505 .long 0x00000000
1596 1506
1597# 1507#
@@ -1623,7 +1533,7 @@ sub data {
1623 subfc. r7,r0,r6 # If r6 is 0 then result is 0. 1533 subfc. r7,r0,r6 # If r6 is 0 then result is 0.
1624 # if r6 > 0 then result !=0 1534 # if r6 > 0 then result !=0
1625 # In either case carry bit is set. 1535 # In either case carry bit is set.
1626 bc BO_IF,CR0_EQ,Lppcasm_sub_adios 1536 beq Lppcasm_sub_adios
1627 addi r4,r4,-$BNSZ 1537 addi r4,r4,-$BNSZ
1628 addi r3,r3,-$BNSZ 1538 addi r3,r3,-$BNSZ
1629 addi r5,r5,-$BNSZ 1539 addi r5,r5,-$BNSZ
@@ -1635,11 +1545,11 @@ Lppcasm_sub_mainloop:
1635 # if carry = 1 this is r7-r8. Else it 1545 # if carry = 1 this is r7-r8. Else it
1636 # is r7-r8 -1 as we need. 1546 # is r7-r8 -1 as we need.
1637 $STU r6,$BNSZ(r3) 1547 $STU r6,$BNSZ(r3)
1638 bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_sub_mainloop 1548 bdnz- Lppcasm_sub_mainloop
1639Lppcasm_sub_adios: 1549Lppcasm_sub_adios:
1640 subfze r3,r0 # if carry bit is set then r3 = 0 else -1 1550 subfze r3,r0 # if carry bit is set then r3 = 0 else -1
1641 andi. r3,r3,1 # keep only last bit. 1551 andi. r3,r3,1 # keep only last bit.
1642 bclr BO_ALWAYS,CR0_LT 1552 blr
1643 .long 0x00000000 1553 .long 0x00000000
1644 1554
1645 1555
@@ -1670,7 +1580,7 @@ Lppcasm_sub_adios:
1670# check for r6 = 0. Is this needed? 1580# check for r6 = 0. Is this needed?
1671# 1581#
1672 addic. r6,r6,0 #test r6 and clear carry bit. 1582 addic. r6,r6,0 #test r6 and clear carry bit.
1673 bc BO_IF,CR0_EQ,Lppcasm_add_adios 1583 beq Lppcasm_add_adios
1674 addi r4,r4,-$BNSZ 1584 addi r4,r4,-$BNSZ
1675 addi r3,r3,-$BNSZ 1585 addi r3,r3,-$BNSZ
1676 addi r5,r5,-$BNSZ 1586 addi r5,r5,-$BNSZ
@@ -1680,10 +1590,10 @@ Lppcasm_add_mainloop:
1680 $LDU r8,$BNSZ(r5) 1590 $LDU r8,$BNSZ(r5)
1681 adde r8,r7,r8 1591 adde r8,r7,r8
1682 $STU r8,$BNSZ(r3) 1592 $STU r8,$BNSZ(r3)
1683 bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_add_mainloop 1593 bdnz- Lppcasm_add_mainloop
1684Lppcasm_add_adios: 1594Lppcasm_add_adios:
1685 addze r3,r0 #return carry bit. 1595 addze r3,r0 #return carry bit.
1686 bclr BO_ALWAYS,CR0_LT 1596 blr
1687 .long 0x00000000 1597 .long 0x00000000
1688 1598
1689# 1599#
@@ -1707,24 +1617,24 @@ Lppcasm_add_adios:
1707# r5 = d 1617# r5 = d
1708 1618
1709 $UCMPI 0,r5,0 # compare r5 and 0 1619 $UCMPI 0,r5,0 # compare r5 and 0
1710 bc BO_IF_NOT,CR0_EQ,Lppcasm_div1 # proceed if d!=0 1620 bne Lppcasm_div1 # proceed if d!=0
1711 li r3,-1 # d=0 return -1 1621 li r3,-1 # d=0 return -1
1712 bclr BO_ALWAYS,CR0_LT 1622 blr
1713Lppcasm_div1: 1623Lppcasm_div1:
1714 xor r0,r0,r0 #r0=0 1624 xor r0,r0,r0 #r0=0
1715 li r8,$BITS 1625 li r8,$BITS
1716 $CNTLZ. r7,r5 #r7 = num leading 0s in d. 1626 $CNTLZ. r7,r5 #r7 = num leading 0s in d.
1717 bc BO_IF,CR0_EQ,Lppcasm_div2 #proceed if no leading zeros 1627 beq Lppcasm_div2 #proceed if no leading zeros
1718 subf r8,r7,r8 #r8 = BN_num_bits_word(d) 1628 subf r8,r7,r8 #r8 = BN_num_bits_word(d)
1719 $SHR. r9,r3,r8 #are there any bits above r8'th? 1629 $SHR. r9,r3,r8 #are there any bits above r8'th?
1720 $TR 16,r9,r0 #if there're, signal to dump core... 1630 $TR 16,r9,r0 #if there're, signal to dump core...
1721Lppcasm_div2: 1631Lppcasm_div2:
1722 $UCMP 0,r3,r5 #h>=d? 1632 $UCMP 0,r3,r5 #h>=d?
1723 bc BO_IF,CR0_LT,Lppcasm_div3 #goto Lppcasm_div3 if not 1633 blt Lppcasm_div3 #goto Lppcasm_div3 if not
1724 subf r3,r5,r3 #h-=d ; 1634 subf r3,r5,r3 #h-=d ;
1725Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i 1635Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i
1726 cmpi 0,0,r7,0 # is (i == 0)? 1636 cmpi 0,0,r7,0 # is (i == 0)?
1727 bc BO_IF,CR0_EQ,Lppcasm_div4 1637 beq Lppcasm_div4
1728 $SHL r3,r3,r7 # h = (h<< i) 1638 $SHL r3,r3,r7 # h = (h<< i)
1729 $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i) 1639 $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i)
1730 $SHL r5,r5,r7 # d<<=i 1640 $SHL r5,r5,r7 # d<<=i
@@ -1741,7 +1651,7 @@ Lppcasm_divouterloop:
1741 $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4 1651 $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4
1742 # compute here for innerloop. 1652 # compute here for innerloop.
1743 $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh 1653 $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh
1744 bc BO_IF_NOT,CR0_EQ,Lppcasm_div5 # goto Lppcasm_div5 if not 1654 bne Lppcasm_div5 # goto Lppcasm_div5 if not
1745 1655
1746 li r8,-1 1656 li r8,-1
1747 $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l 1657 $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
@@ -1762,9 +1672,9 @@ Lppcasm_divinnerloop:
1762 # the following 2 instructions do that 1672 # the following 2 instructions do that
1763 $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4) 1673 $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4)
1764 or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4) 1674 or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4)
1765 $UCMP 1,r6,r7 # compare (tl <= r7) 1675 $UCMP cr1,r6,r7 # compare (tl <= r7)
1766 bc BO_IF_NOT,CR0_EQ,Lppcasm_divinnerexit 1676 bne Lppcasm_divinnerexit
1767 bc BO_IF_NOT,CR1_FEX,Lppcasm_divinnerexit 1677 ble cr1,Lppcasm_divinnerexit
1768 addi r8,r8,-1 #q-- 1678 addi r8,r8,-1 #q--
1769 subf r12,r9,r12 #th -=dh 1679 subf r12,r9,r12 #th -=dh
1770 $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop. 1680 $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop.
@@ -1773,14 +1683,14 @@ Lppcasm_divinnerloop:
1773Lppcasm_divinnerexit: 1683Lppcasm_divinnerexit:
1774 $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4) 1684 $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4)
1775 $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h; 1685 $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h;
1776 $UCMP 1,r4,r11 # compare l and tl 1686 $UCMP cr1,r4,r11 # compare l and tl
1777 add r12,r12,r10 # th+=t 1687 add r12,r12,r10 # th+=t
1778 bc BO_IF_NOT,CR1_FX,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7 1688 bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7
1779 addi r12,r12,1 # th++ 1689 addi r12,r12,1 # th++
1780Lppcasm_div7: 1690Lppcasm_div7:
1781 subf r11,r11,r4 #r11=l-tl 1691 subf r11,r11,r4 #r11=l-tl
1782 $UCMP 1,r3,r12 #compare h and th 1692 $UCMP cr1,r3,r12 #compare h and th
1783 bc BO_IF_NOT,CR1_FX,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8 1693 bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8
1784 addi r8,r8,-1 # q-- 1694 addi r8,r8,-1 # q--
1785 add r3,r5,r3 # h+=d 1695 add r3,r5,r3 # h+=d
1786Lppcasm_div8: 1696Lppcasm_div8:
@@ -1791,12 +1701,12 @@ Lppcasm_div8:
1791 # the following 2 instructions will do this. 1701 # the following 2 instructions will do this.
1792 $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2. 1702 $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2.
1793 $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3 1703 $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3
1794 bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_div9#if (count==0) break ; 1704 bdz Lppcasm_div9 #if (count==0) break ;
1795 $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4 1705 $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4
1796 b Lppcasm_divouterloop 1706 b Lppcasm_divouterloop
1797Lppcasm_div9: 1707Lppcasm_div9:
1798 or r3,r8,r0 1708 or r3,r8,r0
1799 bclr BO_ALWAYS,CR0_LT 1709 blr
1800 .long 0x00000000 1710 .long 0x00000000
1801 1711
1802# 1712#
@@ -1822,7 +1732,7 @@ Lppcasm_div9:
1822# No unrolling done here. Not performance critical. 1732# No unrolling done here. Not performance critical.
1823 1733
1824 addic. r5,r5,0 #test r5. 1734 addic. r5,r5,0 #test r5.
1825 bc BO_IF,CR0_EQ,Lppcasm_sqr_adios 1735 beq Lppcasm_sqr_adios
1826 addi r4,r4,-$BNSZ 1736 addi r4,r4,-$BNSZ
1827 addi r3,r3,-$BNSZ 1737 addi r3,r3,-$BNSZ
1828 mtctr r5 1738 mtctr r5
@@ -1833,9 +1743,9 @@ Lppcasm_sqr_mainloop:
1833 $UMULH r8,r6,r6 1743 $UMULH r8,r6,r6
1834 $STU r7,$BNSZ(r3) 1744 $STU r7,$BNSZ(r3)
1835 $STU r8,$BNSZ(r3) 1745 $STU r8,$BNSZ(r3)
1836 bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_sqr_mainloop 1746 bdnz- Lppcasm_sqr_mainloop
1837Lppcasm_sqr_adios: 1747Lppcasm_sqr_adios:
1838 bclr BO_ALWAYS,CR0_LT 1748 blr
1839 .long 0x00000000 1749 .long 0x00000000
1840 1750
1841 1751
@@ -1858,7 +1768,7 @@ Lppcasm_sqr_adios:
1858 xor r0,r0,r0 1768 xor r0,r0,r0
1859 xor r12,r12,r12 # used for carry 1769 xor r12,r12,r12 # used for carry
1860 rlwinm. r7,r5,30,2,31 # num >> 2 1770 rlwinm. r7,r5,30,2,31 # num >> 2
1861 bc BO_IF,CR0_EQ,Lppcasm_mw_REM 1771 beq Lppcasm_mw_REM
1862 mtctr r7 1772 mtctr r7
1863Lppcasm_mw_LOOP: 1773Lppcasm_mw_LOOP:
1864 #mul(rp[0],ap[0],w,c1); 1774 #mul(rp[0],ap[0],w,c1);
@@ -1896,11 +1806,11 @@ Lppcasm_mw_LOOP:
1896 1806
1897 addi r3,r3,`4*$BNSZ` 1807 addi r3,r3,`4*$BNSZ`
1898 addi r4,r4,`4*$BNSZ` 1808 addi r4,r4,`4*$BNSZ`
1899 bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_mw_LOOP 1809 bdnz- Lppcasm_mw_LOOP
1900 1810
1901Lppcasm_mw_REM: 1811Lppcasm_mw_REM:
1902 andi. r5,r5,0x3 1812 andi. r5,r5,0x3
1903 bc BO_IF,CR0_EQ,Lppcasm_mw_OVER 1813 beq Lppcasm_mw_OVER
1904 #mul(rp[0],ap[0],w,c1); 1814 #mul(rp[0],ap[0],w,c1);
1905 $LD r8,`0*$BNSZ`(r4) 1815 $LD r8,`0*$BNSZ`(r4)
1906 $UMULL r9,r6,r8 1816 $UMULL r9,r6,r8
@@ -1912,7 +1822,7 @@ Lppcasm_mw_REM:
1912 1822
1913 addi r5,r5,-1 1823 addi r5,r5,-1
1914 cmpli 0,0,r5,0 1824 cmpli 0,0,r5,0
1915 bc BO_IF,CR0_EQ,Lppcasm_mw_OVER 1825 beq Lppcasm_mw_OVER
1916 1826
1917 1827
1918 #mul(rp[1],ap[1],w,c1); 1828 #mul(rp[1],ap[1],w,c1);
@@ -1926,7 +1836,7 @@ Lppcasm_mw_REM:
1926 1836
1927 addi r5,r5,-1 1837 addi r5,r5,-1
1928 cmpli 0,0,r5,0 1838 cmpli 0,0,r5,0
1929 bc BO_IF,CR0_EQ,Lppcasm_mw_OVER 1839 beq Lppcasm_mw_OVER
1930 1840
1931 #mul_add(rp[2],ap[2],w,c1); 1841 #mul_add(rp[2],ap[2],w,c1);
1932 $LD r8,`2*$BNSZ`(r4) 1842 $LD r8,`2*$BNSZ`(r4)
@@ -1939,7 +1849,7 @@ Lppcasm_mw_REM:
1939 1849
1940Lppcasm_mw_OVER: 1850Lppcasm_mw_OVER:
1941 addi r3,r12,0 1851 addi r3,r12,0
1942 bclr BO_ALWAYS,CR0_LT 1852 blr
1943 .long 0x00000000 1853 .long 0x00000000
1944 1854
1945# 1855#
@@ -1964,7 +1874,7 @@ Lppcasm_mw_OVER:
1964 xor r0,r0,r0 #r0 = 0 1874 xor r0,r0,r0 #r0 = 0
1965 xor r12,r12,r12 #r12 = 0 . used for carry 1875 xor r12,r12,r12 #r12 = 0 . used for carry
1966 rlwinm. r7,r5,30,2,31 # num >> 2 1876 rlwinm. r7,r5,30,2,31 # num >> 2
1967 bc BO_IF,CR0_EQ,Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover 1877 beq Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover
1968 mtctr r7 1878 mtctr r7
1969Lppcasm_maw_mainloop: 1879Lppcasm_maw_mainloop:
1970 #mul_add(rp[0],ap[0],w,c1); 1880 #mul_add(rp[0],ap[0],w,c1);
@@ -2017,11 +1927,11 @@ Lppcasm_maw_mainloop:
2017 $ST r11,`3*$BNSZ`(r3) 1927 $ST r11,`3*$BNSZ`(r3)
2018 addi r3,r3,`4*$BNSZ` 1928 addi r3,r3,`4*$BNSZ`
2019 addi r4,r4,`4*$BNSZ` 1929 addi r4,r4,`4*$BNSZ`
2020 bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_maw_mainloop 1930 bdnz- Lppcasm_maw_mainloop
2021 1931
2022Lppcasm_maw_leftover: 1932Lppcasm_maw_leftover:
2023 andi. r5,r5,0x3 1933 andi. r5,r5,0x3
2024 bc BO_IF,CR0_EQ,Lppcasm_maw_adios 1934 beq Lppcasm_maw_adios
2025 addi r3,r3,-$BNSZ 1935 addi r3,r3,-$BNSZ
2026 addi r4,r4,-$BNSZ 1936 addi r4,r4,-$BNSZ
2027 #mul_add(rp[0],ap[0],w,c1); 1937 #mul_add(rp[0],ap[0],w,c1);
@@ -2036,7 +1946,7 @@ Lppcasm_maw_leftover:
2036 addze r12,r10 1946 addze r12,r10
2037 $ST r9,0(r3) 1947 $ST r9,0(r3)
2038 1948
2039 bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios 1949 bdz Lppcasm_maw_adios
2040 #mul_add(rp[1],ap[1],w,c1); 1950 #mul_add(rp[1],ap[1],w,c1);
2041 $LDU r8,$BNSZ(r4) 1951 $LDU r8,$BNSZ(r4)
2042 $UMULL r9,r6,r8 1952 $UMULL r9,r6,r8
@@ -2048,7 +1958,7 @@ Lppcasm_maw_leftover:
2048 addze r12,r10 1958 addze r12,r10
2049 $ST r9,0(r3) 1959 $ST r9,0(r3)
2050 1960
2051 bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios 1961 bdz Lppcasm_maw_adios
2052 #mul_add(rp[2],ap[2],w,c1); 1962 #mul_add(rp[2],ap[2],w,c1);
2053 $LDU r8,$BNSZ(r4) 1963 $LDU r8,$BNSZ(r4)
2054 $UMULL r9,r6,r8 1964 $UMULL r9,r6,r8
@@ -2062,17 +1972,10 @@ Lppcasm_maw_leftover:
2062 1972
2063Lppcasm_maw_adios: 1973Lppcasm_maw_adios:
2064 addi r3,r12,0 1974 addi r3,r12,0
2065 bclr BO_ALWAYS,CR0_LT 1975 blr
2066 .long 0x00000000 1976 .long 0x00000000
2067 .align 4 1977 .align 4
2068EOF 1978EOF
2069 $data =~ s/\`([^\`]*)\`/eval $1/gem; 1979$data =~ s/\`([^\`]*)\`/eval $1/gem;
2070 1980print $data;
2071 # if some assembler chokes on some simplified mnemonic, 1981close STDOUT;
2072 # this is the spot to fix it up, e.g.:
2073 # GNU as doesn't seem to accept cmplw, 32-bit unsigned compare
2074 $data =~ s/^(\s*)cmplw(\s+)([^,]+),(.*)/$1cmpl$2$3,0,$4/gm;
2075 # assembler X doesn't accept li, load immediate value
2076 #$data =~ s/^(\s*)li(\s+)([^,]+),(.*)/$1addi$2$3,0,$4/gm;
2077 return($data);
2078}
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gcc.c b/src/lib/libcrypto/bn/asm/x86_64-gcc.c
index f13f52dd85..acb0b40118 100644
--- a/src/lib/libcrypto/bn/asm/x86_64-gcc.c
+++ b/src/lib/libcrypto/bn/asm/x86_64-gcc.c
@@ -1,4 +1,5 @@
1#ifdef __SUNPRO_C 1#include "../bn_lcl.h"
2#if !(defined(__GNUC__) && __GNUC__>=2)
2# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */ 3# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
3#else 4#else
4/* 5/*
@@ -54,7 +55,15 @@
54 * machine. 55 * machine.
55 */ 56 */
56 57
58#ifdef _WIN64
59#define BN_ULONG unsigned long long
60#else
57#define BN_ULONG unsigned long 61#define BN_ULONG unsigned long
62#endif
63
64#undef mul
65#undef mul_add
66#undef sqr
58 67
59/* 68/*
60 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code; 69 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
@@ -97,7 +106,7 @@
97 : "a"(a) \ 106 : "a"(a) \
98 : "cc"); 107 : "cc");
99 108
100BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) 109BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
101 { 110 {
102 BN_ULONG c1=0; 111 BN_ULONG c1=0;
103 112
@@ -121,7 +130,7 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
121 return(c1); 130 return(c1);
122 } 131 }
123 132
124BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) 133BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
125 { 134 {
126 BN_ULONG c1=0; 135 BN_ULONG c1=0;
127 136
@@ -144,7 +153,7 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
144 return(c1); 153 return(c1);
145 } 154 }
146 155
147void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n) 156void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
148 { 157 {
149 if (n <= 0) return; 158 if (n <= 0) return;
150 159
@@ -175,14 +184,14 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
175 return ret; 184 return ret;
176} 185}
177 186
178BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n) 187BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
179{ BN_ULONG ret=0,i=0; 188{ BN_ULONG ret=0,i=0;
180 189
181 if (n <= 0) return 0; 190 if (n <= 0) return 0;
182 191
183 asm ( 192 asm (
184 " subq %2,%2 \n" 193 " subq %2,%2 \n"
185 ".align 16 \n" 194 ".p2align 4 \n"
186 "1: movq (%4,%2,8),%0 \n" 195 "1: movq (%4,%2,8),%0 \n"
187 " adcq (%5,%2,8),%0 \n" 196 " adcq (%5,%2,8),%0 \n"
188 " movq %0,(%3,%2,8) \n" 197 " movq %0,(%3,%2,8) \n"
@@ -198,14 +207,14 @@ BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
198} 207}
199 208
200#ifndef SIMICS 209#ifndef SIMICS
201BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n) 210BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
202{ BN_ULONG ret=0,i=0; 211{ BN_ULONG ret=0,i=0;
203 212
204 if (n <= 0) return 0; 213 if (n <= 0) return 0;
205 214
206 asm ( 215 asm (
207 " subq %2,%2 \n" 216 " subq %2,%2 \n"
208 ".align 16 \n" 217 ".p2align 4 \n"
209 "1: movq (%4,%2,8),%0 \n" 218 "1: movq (%4,%2,8),%0 \n"
210 " sbbq (%5,%2,8),%0 \n" 219 " sbbq (%5,%2,8),%0 \n"
211 " movq %0,(%3,%2,8) \n" 220 " movq %0,(%3,%2,8) \n"
@@ -485,7 +494,7 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
485 r[7]=c2; 494 r[7]=c2;
486 } 495 }
487 496
488void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) 497void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
489 { 498 {
490 BN_ULONG t1,t2; 499 BN_ULONG t1,t2;
491 BN_ULONG c1,c2,c3; 500 BN_ULONG c1,c2,c3;
@@ -561,7 +570,7 @@ void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
561 r[15]=c1; 570 r[15]=c1;
562 } 571 }
563 572
564void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) 573void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
565 { 574 {
566 BN_ULONG t1,t2; 575 BN_ULONG t1,t2;
567 BN_ULONG c1,c2,c3; 576 BN_ULONG c1,c2,c3;
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont.pl b/src/lib/libcrypto/bn/asm/x86_64-mont.pl
index c43b69592a..3b7a6f243f 100755
--- a/src/lib/libcrypto/bn/asm/x86_64-mont.pl
+++ b/src/lib/libcrypto/bn/asm/x86_64-mont.pl
@@ -15,14 +15,18 @@
15# respectful 50%. It remains to be seen if loop unrolling and 15# respectful 50%. It remains to be seen if loop unrolling and
16# dedicated squaring routine can provide further improvement... 16# dedicated squaring routine can provide further improvement...
17 17
18$output=shift; 18$flavour = shift;
19$output = shift;
20if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
21
22$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
19 23
20$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 24$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
21( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 25( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
22( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 26( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
23die "can't locate x86_64-xlate.pl"; 27die "can't locate x86_64-xlate.pl";
24 28
25open STDOUT,"| $^X $xlate $output"; 29open STDOUT,"| $^X $xlate $flavour $output";
26 30
27# int bn_mul_mont( 31# int bn_mul_mont(
28$rp="%rdi"; # BN_ULONG *rp, 32$rp="%rdi"; # BN_ULONG *rp,
@@ -55,13 +59,14 @@ bn_mul_mont:
55 push %r15 59 push %r15
56 60
57 mov ${num}d,${num}d 61 mov ${num}d,${num}d
58 lea 2($num),%rax 62 lea 2($num),%r10
59 mov %rsp,%rbp 63 mov %rsp,%r11
60 neg %rax 64 neg %r10
61 lea (%rsp,%rax,8),%rsp # tp=alloca(8*(num+2)) 65 lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2))
62 and \$-1024,%rsp # minimize TLB usage 66 and \$-1024,%rsp # minimize TLB usage
63 67
64 mov %rbp,8(%rsp,$num,8) # tp[num+1]=%rsp 68 mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
69.Lprologue:
65 mov %rdx,$bp # $bp reassigned, remember? 70 mov %rdx,$bp # $bp reassigned, remember?
66 71
67 mov ($n0),$n0 # pull n0[0] value 72 mov ($n0),$n0 # pull n0[0] value
@@ -197,18 +202,129 @@ bn_mul_mont:
197 dec $j 202 dec $j
198 jge .Lcopy 203 jge .Lcopy
199 204
200 mov 8(%rsp,$num,8),%rsp # restore %rsp 205 mov 8(%rsp,$num,8),%rsi # restore %rsp
201 mov \$1,%rax 206 mov \$1,%rax
207 mov (%rsi),%r15
208 mov 8(%rsi),%r14
209 mov 16(%rsi),%r13
210 mov 24(%rsi),%r12
211 mov 32(%rsi),%rbp
212 mov 40(%rsi),%rbx
213 lea 48(%rsi),%rsp
214.Lepilogue:
215 ret
216.size bn_mul_mont,.-bn_mul_mont
217.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
218.align 16
219___
220
221# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
222# CONTEXT *context,DISPATCHER_CONTEXT *disp)
223if ($win64) {
224$rec="%rcx";
225$frame="%rdx";
226$context="%r8";
227$disp="%r9";
228
229$code.=<<___;
230.extern __imp_RtlVirtualUnwind
231.type se_handler,\@abi-omnipotent
232.align 16
233se_handler:
234 push %rsi
235 push %rdi
236 push %rbx
237 push %rbp
238 push %r12
239 push %r13
240 push %r14
241 push %r15
242 pushfq
243 sub \$64,%rsp
244
245 mov 120($context),%rax # pull context->Rax
246 mov 248($context),%rbx # pull context->Rip
247
248 lea .Lprologue(%rip),%r10
249 cmp %r10,%rbx # context->Rip<.Lprologue
250 jb .Lin_prologue
251
252 mov 152($context),%rax # pull context->Rsp
253
254 lea .Lepilogue(%rip),%r10
255 cmp %r10,%rbx # context->Rip>=.Lepilogue
256 jae .Lin_prologue
257
258 mov 192($context),%r10 # pull $num
259 mov 8(%rax,%r10,8),%rax # pull saved stack pointer
260 lea 48(%rax),%rax
261
262 mov -8(%rax),%rbx
263 mov -16(%rax),%rbp
264 mov -24(%rax),%r12
265 mov -32(%rax),%r13
266 mov -40(%rax),%r14
267 mov -48(%rax),%r15
268 mov %rbx,144($context) # restore context->Rbx
269 mov %rbp,160($context) # restore context->Rbp
270 mov %r12,216($context) # restore context->R12
271 mov %r13,224($context) # restore context->R13
272 mov %r14,232($context) # restore context->R14
273 mov %r15,240($context) # restore context->R15
274
275.Lin_prologue:
276 mov 8(%rax),%rdi
277 mov 16(%rax),%rsi
278 mov %rax,152($context) # restore context->Rsp
279 mov %rsi,168($context) # restore context->Rsi
280 mov %rdi,176($context) # restore context->Rdi
281
282 mov 40($disp),%rdi # disp->ContextRecord
283 mov $context,%rsi # context
284 mov \$154,%ecx # sizeof(CONTEXT)
285 .long 0xa548f3fc # cld; rep movsq
286
287 mov $disp,%rsi
288 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
289 mov 8(%rsi),%rdx # arg2, disp->ImageBase
290 mov 0(%rsi),%r8 # arg3, disp->ControlPc
291 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
292 mov 40(%rsi),%r10 # disp->ContextRecord
293 lea 56(%rsi),%r11 # &disp->HandlerData
294 lea 24(%rsi),%r12 # &disp->EstablisherFrame
295 mov %r10,32(%rsp) # arg5
296 mov %r11,40(%rsp) # arg6
297 mov %r12,48(%rsp) # arg7
298 mov %rcx,56(%rsp) # arg8, (NULL)
299 call *__imp_RtlVirtualUnwind(%rip)
300
301 mov \$1,%eax # ExceptionContinueSearch
302 add \$64,%rsp
303 popfq
202 pop %r15 304 pop %r15
203 pop %r14 305 pop %r14
204 pop %r13 306 pop %r13
205 pop %r12 307 pop %r12
206 pop %rbp 308 pop %rbp
207 pop %rbx 309 pop %rbx
310 pop %rdi
311 pop %rsi
208 ret 312 ret
209.size bn_mul_mont,.-bn_mul_mont 313.size se_handler,.-se_handler
210.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 314
315.section .pdata
316.align 4
317 .rva .LSEH_begin_bn_mul_mont
318 .rva .LSEH_end_bn_mul_mont
319 .rva .LSEH_info_bn_mul_mont
320
321.section .xdata
322.align 8
323.LSEH_info_bn_mul_mont:
324 .byte 9,0,0,0
325 .rva se_handler
211___ 326___
327}
212 328
213print $code; 329print $code;
214close STDOUT; 330close STDOUT;