diff options
Diffstat (limited to 'src/lib/libcrypto/bn/asm/s390x-mont.pl')
-rw-r--r-- | src/lib/libcrypto/bn/asm/s390x-mont.pl | 102 |
1 files changed, 77 insertions, 25 deletions
diff --git a/src/lib/libcrypto/bn/asm/s390x-mont.pl b/src/lib/libcrypto/bn/asm/s390x-mont.pl
index f61246f5b6..9fd64e81ee 100644
--- a/src/lib/libcrypto/bn/asm/s390x-mont.pl
+++ b/src/lib/libcrypto/bn/asm/s390x-mont.pl
@@ -32,6 +32,33 @@ | |||
32 | # Reschedule to minimize/avoid Address Generation Interlock hazard, | 32 | # Reschedule to minimize/avoid Address Generation Interlock hazard, |
33 | # make inner loops counter-based. | 33 | # make inner loops counter-based. |
34 | 34 | ||
35 | # November 2010. | ||
36 | # | ||
37 | # Adapt for -m31 build. If kernel supports what's called "highgprs" | ||
38 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
39 | # instructions and achieve "64-bit" performance even in 31-bit legacy | ||
40 | # application context. The feature is not specific to any particular | ||
41 | # processor, as long as it's "z-CPU". The latter implies that the code | ||
42 | # remains z/Architecture specific. Compatibility with 32-bit BN_ULONG | ||
43 | # is achieved by swapping words after 64-bit loads; see the _dswap-s. | ||
44 | # On z990 it was measured to perform 2.6-2.2 times better than | ||
45 | # compiler-generated code, less for longer keys... | ||
46 | |||
47 | $flavour = shift; | ||
48 | |||
49 | if ($flavour =~ /3[12]/) { | ||
50 | $SIZE_T=4; | ||
51 | $g=""; | ||
52 | } else { | ||
53 | $SIZE_T=8; | ||
54 | $g="g"; | ||
55 | } | ||
56 | |||
57 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
58 | open STDOUT,">$output"; | ||
59 | |||
60 | $stdframe=16*$SIZE_T+4*8; | ||
61 | |||
35 | $mn0="%r0"; | 62 | $mn0="%r0"; |
36 | $num="%r1"; | 63 | $num="%r1"; |
37 | 64 | ||
@@ -60,34 +87,44 @@ $code.=<<___; | |||
60 | .globl bn_mul_mont | 87 | .globl bn_mul_mont |
61 | .type bn_mul_mont,\@function | 88 | .type bn_mul_mont,\@function |
62 | bn_mul_mont: | 89 | bn_mul_mont: |
63 | lgf $num,164($sp) # pull $num | 90 | lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num |
64 | sla $num,3 # $num to enumerate bytes | 91 | sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes |
65 | la $bp,0($num,$bp) | 92 | la $bp,0($num,$bp) |
66 | 93 | ||
67 | stg %r2,16($sp) | 94 | st${g} %r2,2*$SIZE_T($sp) |
68 | 95 | ||
69 | cghi $num,16 # | 96 | cghi $num,16 # |
70 | lghi %r2,0 # | 97 | lghi %r2,0 # |
71 | blr %r14 # if($num<16) return 0; | 98 | blr %r14 # if($num<16) return 0; |
99 | ___ | ||
100 | $code.=<<___ if ($flavour =~ /3[12]/); | ||
101 | tmll $num,4 | ||
102 | bnzr %r14 # if ($num&1) return 0; | ||
103 | ___ | ||
104 | $code.=<<___ if ($flavour !~ /3[12]/); | ||
72 | cghi $num,96 # | 105 | cghi $num,96 # |
73 | bhr %r14 # if($num>96) return 0; | 106 | bhr %r14 # if($num>96) return 0; |
107 | ___ | ||
108 | $code.=<<___; | ||
109 | stm${g} %r3,%r15,3*$SIZE_T($sp) | ||
74 | 110 | ||
75 | stmg %r3,%r15,24($sp) | 111 | lghi $rp,-$stdframe-8 # leave room for carry bit |
76 | |||
77 | lghi $rp,-160-8 # leave room for carry bit | ||
78 | lcgr $j,$num # -$num | 112 | lcgr $j,$num # -$num |
79 | lgr %r0,$sp | 113 | lgr %r0,$sp |
80 | la $rp,0($rp,$sp) | 114 | la $rp,0($rp,$sp) |
81 | la $sp,0($j,$rp) # alloca | 115 | la $sp,0($j,$rp) # alloca |
82 | stg %r0,0($sp) # back chain | 116 | st${g} %r0,0($sp) # back chain |
83 | 117 | ||
84 | sra $num,3 # restore $num | 118 | sra $num,3 # restore $num |
85 | la $bp,0($j,$bp) # restore $bp | 119 | la $bp,0($j,$bp) # restore $bp |
86 | ahi $num,-1 # adjust $num for inner loop | 120 | ahi $num,-1 # adjust $num for inner loop |
87 | lg $n0,0($n0) # pull n0 | 121 | lg $n0,0($n0) # pull n0 |
122 | _dswap $n0 | ||
88 | 123 | ||
89 | lg $bi,0($bp) | 124 | lg $bi,0($bp) |
125 | _dswap $bi | ||
90 | lg $alo,0($ap) | 126 | lg $alo,0($ap) |
127 | _dswap $alo | ||
91 | mlgr $ahi,$bi # ap[0]*bp[0] | 128 | mlgr $ahi,$bi # ap[0]*bp[0] |
92 | lgr $AHI,$ahi | 129 | lgr $AHI,$ahi |
93 | 130 | ||
@@ -95,6 +132,7 @@ bn_mul_mont: | |||
95 | msgr $mn0,$n0 | 132 | msgr $mn0,$n0 |
96 | 133 | ||
97 | lg $nlo,0($np) # | 134 | lg $nlo,0($np) # |
135 | _dswap $nlo | ||
98 | mlgr $nhi,$mn0 # np[0]*m1 | 136 | mlgr $nhi,$mn0 # np[0]*m1 |
99 | algr $nlo,$alo # +="tp[0]" | 137 | algr $nlo,$alo # +="tp[0]" |
100 | lghi $NHI,0 | 138 | lghi $NHI,0 |
@@ -106,12 +144,14 @@ bn_mul_mont: | |||
106 | .align 16 | 144 | .align 16 |
107 | .L1st: | 145 | .L1st: |
108 | lg $alo,0($j,$ap) | 146 | lg $alo,0($j,$ap) |
147 | _dswap $alo | ||
109 | mlgr $ahi,$bi # ap[j]*bp[0] | 148 | mlgr $ahi,$bi # ap[j]*bp[0] |
110 | algr $alo,$AHI | 149 | algr $alo,$AHI |
111 | lghi $AHI,0 | 150 | lghi $AHI,0 |
112 | alcgr $AHI,$ahi | 151 | alcgr $AHI,$ahi |
113 | 152 | ||
114 | lg $nlo,0($j,$np) | 153 | lg $nlo,0($j,$np) |
154 | _dswap $nlo | ||
115 | mlgr $nhi,$mn0 # np[j]*m1 | 155 | mlgr $nhi,$mn0 # np[j]*m1 |
116 | algr $nlo,$NHI | 156 | algr $nlo,$NHI |
117 | lghi $NHI,0 | 157 | lghi $NHI,0 |
@@ -119,22 +159,24 @@ bn_mul_mont: | |||
119 | algr $nlo,$alo | 159 | algr $nlo,$alo |
120 | alcgr $NHI,$nhi | 160 | alcgr $NHI,$nhi |
121 | 161 | ||
122 | stg $nlo,160-8($j,$sp) # tp[j-1]= | 162 | stg $nlo,$stdframe-8($j,$sp) # tp[j-1]= |
123 | la $j,8($j) # j++ | 163 | la $j,8($j) # j++ |
124 | brct $count,.L1st | 164 | brct $count,.L1st |
125 | 165 | ||
126 | algr $NHI,$AHI | 166 | algr $NHI,$AHI |
127 | lghi $AHI,0 | 167 | lghi $AHI,0 |
128 | alcgr $AHI,$AHI # upmost overflow bit | 168 | alcgr $AHI,$AHI # upmost overflow bit |
129 | stg $NHI,160-8($j,$sp) | 169 | stg $NHI,$stdframe-8($j,$sp) |
130 | stg $AHI,160($j,$sp) | 170 | stg $AHI,$stdframe($j,$sp) |
131 | la $bp,8($bp) # bp++ | 171 | la $bp,8($bp) # bp++ |
132 | 172 | ||
133 | .Louter: | 173 | .Louter: |
134 | lg $bi,0($bp) # bp[i] | 174 | lg $bi,0($bp) # bp[i] |
175 | _dswap $bi | ||
135 | lg $alo,0($ap) | 176 | lg $alo,0($ap) |
177 | _dswap $alo | ||
136 | mlgr $ahi,$bi # ap[0]*bp[i] | 178 | mlgr $ahi,$bi # ap[0]*bp[i] |
137 | alg $alo,160($sp) # +=tp[0] | 179 | alg $alo,$stdframe($sp) # +=tp[0] |
138 | lghi $AHI,0 | 180 | lghi $AHI,0 |
139 | alcgr $AHI,$ahi | 181 | alcgr $AHI,$ahi |
140 | 182 | ||
@@ -142,6 +184,7 @@ bn_mul_mont: | |||
142 | msgr $mn0,$n0 # tp[0]*n0 | 184 | msgr $mn0,$n0 # tp[0]*n0 |
143 | 185 | ||
144 | lg $nlo,0($np) # np[0] | 186 | lg $nlo,0($np) # np[0] |
187 | _dswap $nlo | ||
145 | mlgr $nhi,$mn0 # np[0]*m1 | 188 | mlgr $nhi,$mn0 # np[0]*m1 |
146 | algr $nlo,$alo # +="tp[0]" | 189 | algr $nlo,$alo # +="tp[0]" |
147 | lghi $NHI,0 | 190 | lghi $NHI,0 |
@@ -153,14 +196,16 @@ bn_mul_mont: | |||
153 | .align 16 | 196 | .align 16 |
154 | .Linner: | 197 | .Linner: |
155 | lg $alo,0($j,$ap) | 198 | lg $alo,0($j,$ap) |
199 | _dswap $alo | ||
156 | mlgr $ahi,$bi # ap[j]*bp[i] | 200 | mlgr $ahi,$bi # ap[j]*bp[i] |
157 | algr $alo,$AHI | 201 | algr $alo,$AHI |
158 | lghi $AHI,0 | 202 | lghi $AHI,0 |
159 | alcgr $ahi,$AHI | 203 | alcgr $ahi,$AHI |
160 | alg $alo,160($j,$sp)# +=tp[j] | 204 | alg $alo,$stdframe($j,$sp)# +=tp[j] |
161 | alcgr $AHI,$ahi | 205 | alcgr $AHI,$ahi |
162 | 206 | ||
163 | lg $nlo,0($j,$np) | 207 | lg $nlo,0($j,$np) |
208 | _dswap $nlo | ||
164 | mlgr $nhi,$mn0 # np[j]*m1 | 209 | mlgr $nhi,$mn0 # np[j]*m1 |
165 | algr $nlo,$NHI | 210 | algr $nlo,$NHI |
166 | lghi $NHI,0 | 211 | lghi $NHI,0 |
@@ -168,31 +213,33 @@ bn_mul_mont: | |||
168 | algr $nlo,$alo # +="tp[j]" | 213 | algr $nlo,$alo # +="tp[j]" |
169 | alcgr $NHI,$nhi | 214 | alcgr $NHI,$nhi |
170 | 215 | ||
171 | stg $nlo,160-8($j,$sp) # tp[j-1]= | 216 | stg $nlo,$stdframe-8($j,$sp) # tp[j-1]= |
172 | la $j,8($j) # j++ | 217 | la $j,8($j) # j++ |
173 | brct $count,.Linner | 218 | brct $count,.Linner |
174 | 219 | ||
175 | algr $NHI,$AHI | 220 | algr $NHI,$AHI |
176 | lghi $AHI,0 | 221 | lghi $AHI,0 |
177 | alcgr $AHI,$AHI | 222 | alcgr $AHI,$AHI |
178 | alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit | 223 | alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit |
179 | lghi $ahi,0 | 224 | lghi $ahi,0 |
180 | alcgr $AHI,$ahi # new upmost overflow bit | 225 | alcgr $AHI,$ahi # new upmost overflow bit |
181 | stg $NHI,160-8($j,$sp) | 226 | stg $NHI,$stdframe-8($j,$sp) |
182 | stg $AHI,160($j,$sp) | 227 | stg $AHI,$stdframe($j,$sp) |
183 | 228 | ||
184 | la $bp,8($bp) # bp++ | 229 | la $bp,8($bp) # bp++ |
185 | clg $bp,160+8+32($j,$sp) # compare to &bp[num] | 230 | cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num] |
186 | jne .Louter | 231 | jne .Louter |
187 | 232 | ||
188 | lg $rp,160+8+16($j,$sp) # reincarnate rp | 233 | l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp |
189 | la $ap,160($sp) | 234 | la $ap,$stdframe($sp) |
190 | ahi $num,1 # restore $num, incidentally clears "borrow" | 235 | ahi $num,1 # restore $num, incidentally clears "borrow" |
191 | 236 | ||
192 | la $j,0(%r0) | 237 | la $j,0(%r0) |
193 | lr $count,$num | 238 | lr $count,$num |
194 | .Lsub: lg $alo,0($j,$ap) | 239 | .Lsub: lg $alo,0($j,$ap) |
195 | slbg $alo,0($j,$np) | 240 | lg $nlo,0($j,$np) |
241 | _dswap $nlo | ||
242 | slbgr $alo,$nlo | ||
196 | stg $alo,0($j,$rp) | 243 | stg $alo,0($j,$rp) |
197 | la $j,8($j) | 244 | la $j,8($j) |
198 | brct $count,.Lsub | 245 | brct $count,.Lsub |
@@ -207,19 +254,24 @@ bn_mul_mont: | |||
207 | 254 | ||
208 | la $j,0(%r0) | 255 | la $j,0(%r0) |
209 | lgr $count,$num | 256 | lgr $count,$num |
210 | .Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh | 257 | .Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh |
211 | stg $j,160($j,$sp) # zap tp | 258 | _dswap $alo |
259 | stg $j,$stdframe($j,$sp) # zap tp | ||
212 | stg $alo,0($j,$rp) | 260 | stg $alo,0($j,$rp) |
213 | la $j,8($j) | 261 | la $j,8($j) |
214 | brct $count,.Lcopy | 262 | brct $count,.Lcopy |
215 | 263 | ||
216 | la %r1,160+8+48($j,$sp) | 264 | la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp) |
217 | lmg %r6,%r15,0(%r1) | 265 | lm${g} %r6,%r15,0(%r1) |
218 | lghi %r2,1 # signal "processed" | 266 | lghi %r2,1 # signal "processed" |
219 | br %r14 | 267 | br %r14 |
220 | .size bn_mul_mont,.-bn_mul_mont | 268 | .size bn_mul_mont,.-bn_mul_mont |
221 | .string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" | 269 | .string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" |
222 | ___ | 270 | ___ |
223 | 271 | ||
224 | print $code; | 272 | foreach (split("\n",$code)) { |
273 | s/\`([^\`]*)\`/eval $1/ge; | ||
274 | s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e; | ||
275 | print $_,"\n"; | ||
276 | } | ||
225 | close STDOUT; | 277 | close STDOUT; |