author     djm <> 2009-04-06 06:30:10 +0000
committer  djm <> 2009-04-06 06:30:10 +0000
commit     f929570d17be2469dc7104fcdf26fdaddf3dbb65 (patch)
tree       d27deb705d08b9515fe0c6a5de67639235c5ad78 /src/lib/libcrypto/bn/asm
parent     8495770bca2f5a7c4d65351d78035a1cf89684f0 (diff)
parent     2b6e09b39ef1d803b50ee024a06d1c250fde442d (diff)
This commit was generated by cvs2git to track changes on a CVS vendor
branch.
Diffstat (limited to 'src/lib/libcrypto/bn/asm')
-rw-r--r--   src/lib/libcrypto/bn/asm/alpha-mont.pl     | 317
-rw-r--r--   src/lib/libcrypto/bn/asm/armv4-mont.pl     | 200
-rw-r--r--   src/lib/libcrypto/bn/asm/ppc-mont.pl       | 323
-rw-r--r--   src/lib/libcrypto/bn/asm/ppc64-mont.pl     | 918
-rw-r--r--   src/lib/libcrypto/bn/asm/s390x-mont.pl     | 225
-rwxr-xr-x   src/lib/libcrypto/bn/asm/s390x.S           | 678
-rw-r--r--   src/lib/libcrypto/bn/asm/sparcv9-mont.pl   | 606
-rwxr-xr-x   src/lib/libcrypto/bn/asm/sparcv9a-mont.pl  | 882
-rw-r--r--   src/lib/libcrypto/bn/asm/via-mont.pl       | 242
-rwxr-xr-x   src/lib/libcrypto/bn/asm/x86-mont.pl       | 591
10 files changed, 4982 insertions, 0 deletions
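
All ten files imported here generate per-architecture implementations of the same primitive, bn_mul_mont(), whose argument list (rp, ap, bp, np, n0, num) appears as a comment near the top of alpha-mont.pl below: it computes the Montgomery product rp = ap*bp*R^-1 mod np, with R = 2^(word_bits*num). As a reading aid, here is a minimal word-level sketch of that computation in Perl. It follows the same shape as the assembly (a multiply-accumulate pass, a reduction pass driven by n0, a trial subtraction and a final select), but it is illustration only: the function and variable names are mine, a 16-bit word is used so plain Perl arithmetic cannot overflow, and nothing about it is constant-time or otherwise production-ready.

use strict;
use warnings;

my $w = 16;                                 # illustrative word size; the real code uses the native word
my $M = (1 << $w) - 1;

# rp = ap*bp*R^-1 mod np, R = 2^($w*num); ap, bp, np are little-endian word arrays,
# n0 = -np->[0]^-1 mod 2^$w (a sketch of deriving it follows armv4-mont.pl below).
sub bn_mul_mont_ref {
    my ($ap, $bp, $np, $n0, $num) = @_;
    my @tp = (0) x ($num + 2);              # tp[0..num] plus the "upmost overflow" word
    for my $i (0 .. $num - 1) {             # outer loop over bp words (.Louter)
        my $c = 0;
        for my $j (0 .. $num - 1) {         # tp += ap * bp[i]
            my $v = $tp[$j] + $ap->[$j] * $bp->[$i] + $c;
            ($tp[$j], $c) = ($v & $M, $v >> $w);
        }
        my $v = $tp[$num] + $c;
        ($tp[$num], $tp[$num + 1]) = ($v & $M, $tp[$num + 1] + ($v >> $w));
        my $m1 = ($tp[0] * $n0) & $M;       # "tp[0]"*n0, the reduction multiplier m1
        $c = 0;
        for my $j (0 .. $num - 1) {         # tp += np * m1; afterwards tp[0] == 0 mod 2^w
            $v = $tp[$j] + $np->[$j] * $m1 + $c;
            ($tp[$j], $c) = ($v & $M, $v >> $w);
        }
        $v = $tp[$num] + $c;
        ($tp[$num], $tp[$num + 1]) = ($v & $M, $tp[$num + 1] + ($v >> $w));
        shift @tp; push @tp, 0;             # divide by 2^w: drop the now-zero low word
    }
    my @rp = @tp[0 .. $num - 1];            # trial subtraction rp = tp - np (.Lsub)
    my $borrow = 0;
    for my $j (0 .. $num - 1) {
        my $d = $rp[$j] - $np->[$j] - $borrow;
        ($rp[$j], $borrow) = ($d & $M, $d < 0 ? 1 : 0);
    }
    # keep the subtracted value unless there was a net borrow (.Lcopy selects likewise)
    return ($borrow - $tp[$num] > 0) ? [@tp[0 .. $num - 1]] : \@rp;
}

The per-architecture modules below implement the same flow with native-width words, interleaving the two passes so the running carries stay in registers.
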
diff --git a/src/lib/libcrypto/bn/asm/alpha-mont.pl b/src/lib/libcrypto/bn/asm/alpha-mont.pl
new file mode 100644
index 0000000000..7a2cc3173b
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/alpha-mont.pl
@@ -0,0 +1,317 @@
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # On 21264 RSA sign performance improves by 70/35/20/15 percent for | ||
11 | # 512/1024/2048/4096 bit key lengths. This is relative to code from the | ||
12 | # vendor compiler instructed to '-tune host', with in-line assembler. Other | ||
13 | # benchmarks improve by 15-20%. To anchor it to something else, the | ||
14 | # code provides approximately the same performance per GHz as AMD64. | ||
15 | # I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x | ||
16 | # difference. | ||
17 | |||
18 | # int bn_mul_mont( | ||
19 | $rp="a0"; # BN_ULONG *rp, | ||
20 | $ap="a1"; # const BN_ULONG *ap, | ||
21 | $bp="a2"; # const BN_ULONG *bp, | ||
22 | $np="a3"; # const BN_ULONG *np, | ||
23 | $n0="a4"; # const BN_ULONG *n0, | ||
24 | $num="a5"; # int num); | ||
25 | |||
26 | $lo0="t0"; | ||
27 | $hi0="t1"; | ||
28 | $lo1="t2"; | ||
29 | $hi1="t3"; | ||
30 | $aj="t4"; | ||
31 | $bi="t5"; | ||
32 | $nj="t6"; | ||
33 | $tp="t7"; | ||
34 | $alo="t8"; | ||
35 | $ahi="t9"; | ||
36 | $nlo="t10"; | ||
37 | $nhi="t11"; | ||
38 | $tj="t12"; | ||
39 | $i="s3"; | ||
40 | $j="s4"; | ||
41 | $m1="s5"; | ||
42 | |||
43 | $code=<<___; | ||
44 | #include <asm.h> | ||
45 | #include <regdef.h> | ||
46 | |||
47 | .text | ||
48 | |||
49 | .set noat | ||
50 | .set noreorder | ||
51 | |||
52 | .globl bn_mul_mont | ||
53 | .align 5 | ||
54 | .ent bn_mul_mont | ||
55 | bn_mul_mont: | ||
56 | lda sp,-40(sp) | ||
57 | stq ra,0(sp) | ||
58 | stq s3,8(sp) | ||
59 | stq s4,16(sp) | ||
60 | stq s5,24(sp) | ||
61 | stq fp,32(sp) | ||
62 | mov sp,fp | ||
63 | .mask 0x0400f000,-40 | ||
64 | .frame fp,40,ra | ||
65 | .prologue 0 | ||
66 | |||
67 | .align 4 | ||
68 | .set reorder | ||
69 | sextl $num,$num | ||
70 | mov 0,v0 | ||
71 | cmplt $num,4,AT | ||
72 | bne AT,.Lexit | ||
73 | |||
74 | ldq $hi0,0($ap) # ap[0] | ||
75 | s8addq $num,16,AT | ||
76 | ldq $aj,8($ap) | ||
77 | subq sp,AT,sp | ||
78 | ldq $bi,0($bp) # bp[0] | ||
79 | mov -4096,AT | ||
80 | ldq $n0,0($n0) | ||
81 | and sp,AT,sp | ||
82 | |||
83 | mulq $hi0,$bi,$lo0 | ||
84 | ldq $hi1,0($np) # np[0] | ||
85 | umulh $hi0,$bi,$hi0 | ||
86 | ldq $nj,8($np) | ||
87 | |||
88 | mulq $lo0,$n0,$m1 | ||
89 | |||
90 | mulq $hi1,$m1,$lo1 | ||
91 | umulh $hi1,$m1,$hi1 | ||
92 | |||
93 | addq $lo1,$lo0,$lo1 | ||
94 | cmpult $lo1,$lo0,AT | ||
95 | addq $hi1,AT,$hi1 | ||
96 | |||
97 | mulq $aj,$bi,$alo | ||
98 | mov 2,$j | ||
99 | umulh $aj,$bi,$ahi | ||
100 | mov sp,$tp | ||
101 | |||
102 | mulq $nj,$m1,$nlo | ||
103 | s8addq $j,$ap,$aj | ||
104 | umulh $nj,$m1,$nhi | ||
105 | s8addq $j,$np,$nj | ||
106 | .align 4 | ||
107 | .L1st: | ||
108 | .set noreorder | ||
109 | ldq $aj,($aj) | ||
110 | addl $j,1,$j | ||
111 | ldq $nj,($nj) | ||
112 | lda $tp,8($tp) | ||
113 | |||
114 | addq $alo,$hi0,$lo0 | ||
115 | mulq $aj,$bi,$alo | ||
116 | cmpult $lo0,$hi0,AT | ||
117 | addq $nlo,$hi1,$lo1 | ||
118 | |||
119 | mulq $nj,$m1,$nlo | ||
120 | addq $ahi,AT,$hi0 | ||
121 | cmpult $lo1,$hi1,v0 | ||
122 | cmplt $j,$num,$tj | ||
123 | |||
124 | umulh $aj,$bi,$ahi | ||
125 | addq $nhi,v0,$hi1 | ||
126 | addq $lo1,$lo0,$lo1 | ||
127 | s8addq $j,$ap,$aj | ||
128 | |||
129 | umulh $nj,$m1,$nhi | ||
130 | cmpult $lo1,$lo0,v0 | ||
131 | addq $hi1,v0,$hi1 | ||
132 | s8addq $j,$np,$nj | ||
133 | |||
134 | stq $lo1,-8($tp) | ||
135 | nop | ||
136 | unop | ||
137 | bne $tj,.L1st | ||
138 | .set reorder | ||
139 | |||
140 | addq $alo,$hi0,$lo0 | ||
141 | addq $nlo,$hi1,$lo1 | ||
142 | cmpult $lo0,$hi0,AT | ||
143 | cmpult $lo1,$hi1,v0 | ||
144 | addq $ahi,AT,$hi0 | ||
145 | addq $nhi,v0,$hi1 | ||
146 | |||
147 | addq $lo1,$lo0,$lo1 | ||
148 | cmpult $lo1,$lo0,v0 | ||
149 | addq $hi1,v0,$hi1 | ||
150 | |||
151 | stq $lo1,0($tp) | ||
152 | |||
153 | addq $hi1,$hi0,$hi1 | ||
154 | cmpult $hi1,$hi0,AT | ||
155 | stq $hi1,8($tp) | ||
156 | stq AT,16($tp) | ||
157 | |||
158 | mov 1,$i | ||
159 | .align 4 | ||
160 | .Louter: | ||
161 | s8addq $i,$bp,$bi | ||
162 | ldq $hi0,($ap) | ||
163 | ldq $aj,8($ap) | ||
164 | ldq $bi,($bi) | ||
165 | ldq $hi1,($np) | ||
166 | ldq $nj,8($np) | ||
167 | ldq $tj,(sp) | ||
168 | |||
169 | mulq $hi0,$bi,$lo0 | ||
170 | umulh $hi0,$bi,$hi0 | ||
171 | |||
172 | addq $lo0,$tj,$lo0 | ||
173 | cmpult $lo0,$tj,AT | ||
174 | addq $hi0,AT,$hi0 | ||
175 | |||
176 | mulq $lo0,$n0,$m1 | ||
177 | |||
178 | mulq $hi1,$m1,$lo1 | ||
179 | umulh $hi1,$m1,$hi1 | ||
180 | |||
181 | addq $lo1,$lo0,$lo1 | ||
182 | cmpult $lo1,$lo0,AT | ||
183 | mov 2,$j | ||
184 | addq $hi1,AT,$hi1 | ||
185 | |||
186 | mulq $aj,$bi,$alo | ||
187 | mov sp,$tp | ||
188 | umulh $aj,$bi,$ahi | ||
189 | |||
190 | mulq $nj,$m1,$nlo | ||
191 | s8addq $j,$ap,$aj | ||
192 | umulh $nj,$m1,$nhi | ||
193 | .align 4 | ||
194 | .Linner: | ||
195 | .set noreorder | ||
196 | ldq $tj,8($tp) #L0 | ||
197 | nop #U1 | ||
198 | ldq $aj,($aj) #L1 | ||
199 | s8addq $j,$np,$nj #U0 | ||
200 | |||
201 | ldq $nj,($nj) #L0 | ||
202 | nop #U1 | ||
203 | addq $alo,$hi0,$lo0 #L1 | ||
204 | lda $tp,8($tp) | ||
205 | |||
206 | mulq $aj,$bi,$alo #U1 | ||
207 | cmpult $lo0,$hi0,AT #L0 | ||
208 | addq $nlo,$hi1,$lo1 #L1 | ||
209 | addl $j,1,$j | ||
210 | |||
211 | mulq $nj,$m1,$nlo #U1 | ||
212 | addq $ahi,AT,$hi0 #L0 | ||
213 | addq $lo0,$tj,$lo0 #L1 | ||
214 | cmpult $lo1,$hi1,v0 #U0 | ||
215 | |||
216 | umulh $aj,$bi,$ahi #U1 | ||
217 | cmpult $lo0,$tj,AT #L0 | ||
218 | addq $lo1,$lo0,$lo1 #L1 | ||
219 | addq $nhi,v0,$hi1 #U0 | ||
220 | |||
221 | umulh $nj,$m1,$nhi #U1 | ||
222 | s8addq $j,$ap,$aj #L0 | ||
223 | cmpult $lo1,$lo0,v0 #L1 | ||
224 | cmplt $j,$num,$tj #U0 # borrow $tj | ||
225 | |||
226 | addq $hi0,AT,$hi0 #L0 | ||
227 | addq $hi1,v0,$hi1 #U1 | ||
228 | stq $lo1,-8($tp) #L1 | ||
229 | bne $tj,.Linner #U0 | ||
230 | .set reorder | ||
231 | |||
232 | ldq $tj,8($tp) | ||
233 | addq $alo,$hi0,$lo0 | ||
234 | addq $nlo,$hi1,$lo1 | ||
235 | cmpult $lo0,$hi0,AT | ||
236 | cmpult $lo1,$hi1,v0 | ||
237 | addq $ahi,AT,$hi0 | ||
238 | addq $nhi,v0,$hi1 | ||
239 | |||
240 | addq $lo0,$tj,$lo0 | ||
241 | cmpult $lo0,$tj,AT | ||
242 | addq $hi0,AT,$hi0 | ||
243 | |||
244 | ldq $tj,16($tp) | ||
245 | addq $lo1,$lo0,$j | ||
246 | cmpult $j,$lo0,v0 | ||
247 | addq $hi1,v0,$hi1 | ||
248 | |||
249 | addq $hi1,$hi0,$lo1 | ||
250 | stq $j,($tp) | ||
251 | cmpult $lo1,$hi0,$hi1 | ||
252 | addq $lo1,$tj,$lo1 | ||
253 | cmpult $lo1,$tj,AT | ||
254 | addl $i,1,$i | ||
255 | addq $hi1,AT,$hi1 | ||
256 | stq $lo1,8($tp) | ||
257 | cmplt $i,$num,$tj # borrow $tj | ||
258 | stq $hi1,16($tp) | ||
259 | bne $tj,.Louter | ||
260 | |||
261 | s8addq $num,sp,$tj # &tp[num] | ||
262 | mov $rp,$bp # put rp aside | ||
263 | mov sp,$tp | ||
264 | mov sp,$ap | ||
265 | mov 0,$hi0 # clear borrow bit | ||
266 | |||
267 | .align 4 | ||
268 | .Lsub: ldq $lo0,($tp) | ||
269 | ldq $lo1,($np) | ||
270 | lda $tp,8($tp) | ||
271 | lda $np,8($np) | ||
272 | subq $lo0,$lo1,$lo1 # tp[i]-np[i] | ||
273 | cmpult $lo0,$lo1,AT | ||
274 | subq $lo1,$hi0,$lo0 | ||
275 | cmpult $lo1,$lo0,$hi0 | ||
276 | or $hi0,AT,$hi0 | ||
277 | stq $lo0,($rp) | ||
278 | cmpult $tp,$tj,v0 | ||
279 | lda $rp,8($rp) | ||
280 | bne v0,.Lsub | ||
281 | |||
282 | subq $hi1,$hi0,$hi0 # handle upmost overflow bit | ||
283 | mov sp,$tp | ||
284 | mov $bp,$rp # restore rp | ||
285 | |||
286 | and sp,$hi0,$ap | ||
287 | bic $bp,$hi0,$bp | ||
288 | bis $bp,$ap,$ap # ap=borrow?tp:rp | ||
289 | |||
290 | .align 4 | ||
291 | .Lcopy: ldq $aj,($ap) # copy or in-place refresh | ||
292 | lda $tp,8($tp) | ||
293 | lda $rp,8($rp) | ||
294 | lda $ap,8($ap) | ||
295 | stq zero,-8($tp) # zap tp | ||
296 | cmpult $tp,$tj,AT | ||
297 | stq $aj,-8($rp) | ||
298 | bne AT,.Lcopy | ||
299 | mov 1,v0 | ||
300 | |||
301 | .Lexit: | ||
302 | .set noreorder | ||
303 | mov fp,sp | ||
304 | /*ldq ra,0(sp)*/ | ||
305 | ldq s3,8(sp) | ||
306 | ldq s4,16(sp) | ||
307 | ldq s5,24(sp) | ||
308 | ldq fp,32(sp) | ||
309 | lda sp,40(sp) | ||
310 | ret (ra) | ||
311 | .end bn_mul_mont | ||
312 | .rdata | ||
313 | .asciiz "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>" | ||
314 | ___ | ||
315 | |||
316 | print $code; | ||
317 | close STDOUT; | ||
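
The .Lsub/.Lcopy tail just above (and the matching code in every other module in this import: and/bic/bis here, and/bic/orr on ARM, and/andc/or on PowerPC) picks between the reduced and unreduced result without a data-dependent branch, as the "ap=borrow?tp:rp" comments indicate. A small Perl sketch of that mask select, reusing the hypothetical names from the sketch after the diffstat:

# After the trial subtraction @rp = @tp - @np, $borrow is 1 on a net borrow.
# Build an all-ones/all-zeros mask from it and merge, instead of branching.
my @res;
my $mask = 0 - ($borrow & 1);                                   # all ones if borrow, else 0
for my $j (0 .. $num - 1) {
    $res[$j] = (($tp[$j] & $mask) | ($rp[$j] & ~$mask)) & $M;   # borrow ? tp : rp
}
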
diff --git a/src/lib/libcrypto/bn/asm/armv4-mont.pl b/src/lib/libcrypto/bn/asm/armv4-mont.pl
new file mode 100644
index 0000000000..05d5dc1a48
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/armv4-mont.pl
@@ -0,0 +1,200 @@
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # January 2007. | ||
11 | |||
12 | # Montgomery multiplication for ARMv4. | ||
13 | # | ||
14 | # Performance improvement naturally varies among CPU implementations | ||
15 | # and compilers. The code was observed to provide +65-35% improvement | ||
16 | # [depending on key length, less for longer keys] on ARM920T, and | ||
17 | # +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code | ||
18 | # base and compiler generated code with in-lined umull and even umlal | ||
19 | # instructions. The latter means that this code didn't really have an | ||
20 | # "advantage" of utilizing some "secret" instruction. | ||
21 | # | ||
22 | # The code is interoperable with Thumb ISA and is rather compact, less | ||
23 | # than 1/2KB. Windows CE port would be trivial, as it's exclusively | ||
24 | # about decorations, ABI and instruction syntax are identical. | ||
25 | |||
26 | $num="r0"; # starts as num argument, but holds &tp[num-1] | ||
27 | $ap="r1"; | ||
28 | $bp="r2"; $bi="r2"; $rp="r2"; | ||
29 | $np="r3"; | ||
30 | $tp="r4"; | ||
31 | $aj="r5"; | ||
32 | $nj="r6"; | ||
33 | $tj="r7"; | ||
34 | $n0="r8"; | ||
35 | ########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer | ||
36 | $alo="r10"; # sl, gcc uses it to keep @GOT | ||
37 | $ahi="r11"; # fp | ||
38 | $nlo="r12"; # ip | ||
39 | ########### # r13 is stack pointer | ||
40 | $nhi="r14"; # lr | ||
41 | ########### # r15 is program counter | ||
42 | |||
43 | #### argument block layout relative to &tp[num-1], a.k.a. $num | ||
44 | $_rp="$num,#12*4"; | ||
45 | # ap permanently resides in r1 | ||
46 | $_bp="$num,#13*4"; | ||
47 | # np permanently resides in r3 | ||
48 | $_n0="$num,#14*4"; | ||
49 | $_num="$num,#15*4"; $_bpend=$_num; | ||
50 | |||
51 | $code=<<___; | ||
52 | .text | ||
53 | |||
54 | .global bn_mul_mont | ||
55 | .type bn_mul_mont,%function | ||
56 | |||
57 | .align 2 | ||
58 | bn_mul_mont: | ||
59 | stmdb sp!,{r0,r2} @ sp points at argument block | ||
60 | ldr $num,[sp,#3*4] @ load num | ||
61 | cmp $num,#2 | ||
62 | movlt r0,#0 | ||
63 | addlt sp,sp,#2*4 | ||
64 | blt .Labrt | ||
65 | |||
66 | stmdb sp!,{r4-r12,lr} @ save 10 registers | ||
67 | |||
68 | mov $num,$num,lsl#2 @ rescale $num for byte count | ||
69 | sub sp,sp,$num @ alloca(4*num) | ||
70 | sub sp,sp,#4 @ +extra dword | ||
71 | sub $num,$num,#4 @ "num=num-1" | ||
72 | add $tp,$bp,$num @ &bp[num-1] | ||
73 | |||
74 | add $num,sp,$num @ $num to point at &tp[num-1] | ||
75 | ldr $n0,[$_n0] @ &n0 | ||
76 | ldr $bi,[$bp] @ bp[0] | ||
77 | ldr $aj,[$ap],#4 @ ap[0],ap++ | ||
78 | ldr $nj,[$np],#4 @ np[0],np++ | ||
79 | ldr $n0,[$n0] @ *n0 | ||
80 | str $tp,[$_bpend] @ save &bp[num] | ||
81 | |||
82 | umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0] | ||
83 | str $n0,[$_n0] @ save n0 value | ||
84 | mul $n0,$alo,$n0 @ "tp[0]"*n0 | ||
85 | mov $nlo,#0 | ||
86 | umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]" | ||
87 | mov $tp,sp | ||
88 | |||
89 | .L1st: | ||
90 | ldr $aj,[$ap],#4 @ ap[j],ap++ | ||
91 | mov $alo,$ahi | ||
92 | mov $ahi,#0 | ||
93 | umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] | ||
94 | ldr $nj,[$np],#4 @ np[j],np++ | ||
95 | mov $nhi,#0 | ||
96 | umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 | ||
97 | adds $nlo,$nlo,$alo | ||
98 | str $nlo,[$tp],#4 @ tp[j-1]=,tp++ | ||
99 | adc $nlo,$nhi,#0 | ||
100 | cmp $tp,$num | ||
101 | bne .L1st | ||
102 | |||
103 | adds $nlo,$nlo,$ahi | ||
104 | mov $nhi,#0 | ||
105 | adc $nhi,$nhi,#0 | ||
106 | ldr $tp,[$_bp] @ restore bp | ||
107 | str $nlo,[$num] @ tp[num-1]= | ||
108 | ldr $n0,[$_n0] @ restore n0 | ||
109 | str $nhi,[$num,#4] @ tp[num]= | ||
110 | |||
111 | .Louter: | ||
112 | sub $tj,$num,sp @ "original" $num-1 value | ||
113 | sub $ap,$ap,$tj @ "rewind" ap to &ap[1] | ||
114 | sub $np,$np,$tj @ "rewind" np to &np[1] | ||
115 | ldr $bi,[$tp,#4]! @ *(++bp) | ||
116 | ldr $aj,[$ap,#-4] @ ap[0] | ||
117 | ldr $nj,[$np,#-4] @ np[0] | ||
118 | ldr $alo,[sp] @ tp[0] | ||
119 | ldr $tj,[sp,#4] @ tp[1] | ||
120 | |||
121 | mov $ahi,#0 | ||
122 | umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0] | ||
123 | str $tp,[$_bp] @ save bp | ||
124 | mul $n0,$alo,$n0 | ||
125 | mov $nlo,#0 | ||
126 | umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]" | ||
127 | mov $tp,sp | ||
128 | |||
129 | .Linner: | ||
130 | ldr $aj,[$ap],#4 @ ap[j],ap++ | ||
131 | adds $alo,$ahi,$tj @ +=tp[j] | ||
132 | mov $ahi,#0 | ||
133 | umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] | ||
134 | ldr $nj,[$np],#4 @ np[j],np++ | ||
135 | mov $nhi,#0 | ||
136 | umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 | ||
137 | ldr $tj,[$tp,#8] @ tp[j+1] | ||
138 | adc $ahi,$ahi,#0 | ||
139 | adds $nlo,$nlo,$alo | ||
140 | str $nlo,[$tp],#4 @ tp[j-1]=,tp++ | ||
141 | adc $nlo,$nhi,#0 | ||
142 | cmp $tp,$num | ||
143 | bne .Linner | ||
144 | |||
145 | adds $nlo,$nlo,$ahi | ||
146 | mov $nhi,#0 | ||
147 | adc $nhi,$nhi,#0 | ||
148 | adds $nlo,$nlo,$tj | ||
149 | adc $nhi,$nhi,#0 | ||
150 | ldr $tp,[$_bp] @ restore bp | ||
151 | ldr $tj,[$_bpend] @ restore &bp[num] | ||
152 | str $nlo,[$num] @ tp[num-1]= | ||
153 | ldr $n0,[$_n0] @ restore n0 | ||
154 | str $nhi,[$num,#4] @ tp[num]= | ||
155 | |||
156 | cmp $tp,$tj | ||
157 | bne .Louter | ||
158 | |||
159 | ldr $rp,[$_rp] @ pull rp | ||
160 | add $num,$num,#4 @ $num to point at &tp[num] | ||
161 | sub $aj,$num,sp @ "original" num value | ||
162 | mov $tp,sp @ "rewind" $tp | ||
163 | mov $ap,$tp @ "borrow" $ap | ||
164 | sub $np,$np,$aj @ "rewind" $np to &np[0] | ||
165 | |||
166 | subs $tj,$tj,$tj @ "clear" carry flag | ||
167 | .Lsub: ldr $tj,[$tp],#4 | ||
168 | ldr $nj,[$np],#4 | ||
169 | sbcs $tj,$tj,$nj @ tp[j]-np[j] | ||
170 | str $tj,[$rp],#4 @ rp[j]= | ||
171 | teq $tp,$num @ preserve carry | ||
172 | bne .Lsub | ||
173 | sbcs $nhi,$nhi,#0 @ upmost carry | ||
174 | mov $tp,sp @ "rewind" $tp | ||
175 | sub $rp,$rp,$aj @ "rewind" $rp | ||
176 | |||
177 | and $ap,$tp,$nhi | ||
178 | bic $np,$rp,$nhi | ||
179 | orr $ap,$ap,$np @ ap=borrow?tp:rp | ||
180 | |||
181 | .Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh | ||
182 | str sp,[$tp],#4 @ zap tp | ||
183 | str $tj,[$rp],#4 | ||
184 | cmp $tp,$num | ||
185 | bne .Lcopy | ||
186 | |||
187 | add sp,$num,#4 @ skip over tp[num+1] | ||
188 | ldmia sp!,{r4-r12,lr} @ restore registers | ||
189 | add sp,sp,#2*4 @ skip over {r0,r2} | ||
190 | mov r0,#1 | ||
191 | .Labrt: tst lr,#1 | ||
192 | moveq pc,lr @ be binary compatible with V4, yet | ||
193 | bx lr @ interoperable with Thumb ISA:-) | ||
194 | .size bn_mul_mont,.-bn_mul_mont | ||
195 | .asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" | ||
196 | ___ | ||
197 | |||
198 | $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 | ||
199 | print $code; | ||
200 | close STDOUT; | ||
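
Every module multiplies its running low word by the value loaded through the n0 argument to obtain the reduction multiplier ("mul $n0,$alo,$n0" above, "mulq $lo0,$n0,$m1" in alpha-mont.pl), which only works if that value is -np[0]^-1 mod 2^w. OpenSSL normally precomputes it once when it sets up its Montgomery context; purely for completeness, here is a sketch of that precomputation by Hensel lifting. The function name is mine, and Math::BigInt is used only to keep the arithmetic exact for any word size.

use Math::BigInt;

# n0 = -n^-1 mod 2^w for an odd word n; each Hensel/Newton step x <- x*(2 - n*x)
# doubles the number of correct low bits.
sub mont_n0 {
    my ($n, $w) = @_;                      # $n: low word of the odd modulus, $w: word size in bits
    my $R = Math::BigInt->new(1) << $w;    # 2^w
    $n = Math::BigInt->new($n);
    my $x = $n->copy;                      # n is its own inverse mod 8: 3 correct bits
    for (1 .. 6) {                         # 3 -> 6 -> 12 -> 24 -> 48 -> 96 -> 192 correct bits
        $x = ($x * (2 - $n * $x)) % $R;
    }
    return (-$x) % $R;                     # -n^-1 mod 2^w (bmod yields a non-negative result)
}
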
diff --git a/src/lib/libcrypto/bn/asm/ppc-mont.pl b/src/lib/libcrypto/bn/asm/ppc-mont.pl
new file mode 100644
index 0000000000..7849eae959
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/ppc-mont.pl
@@ -0,0 +1,323 @@
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # April 2006 | ||
11 | |||
12 | # "Teaser" Montgomery multiplication module for PowerPC. It's possible | ||
13 | # to gain a bit more by modulo-scheduling outer loop, then dedicated | ||
14 | # squaring procedure should give further 20% and code can be adapted | ||
15 | # for a 32-bit application running on a 64-bit CPU. As for the latter, | ||
16 | # it won't be able to achieve "native" 64-bit performance, because in | ||
17 | # 32-bit application context every addc instruction will have to be | ||
18 | # expanded as addc, twice right shift by 32 and finally adde, etc. | ||
19 | # So far RSA *sign* performance improvement over pre-bn_mul_mont asm | ||
20 | # for 64-bit application running on PPC970/G5 is: | ||
21 | # | ||
22 | # 512-bit +65% | ||
23 | # 1024-bit +35% | ||
24 | # 2048-bit +18% | ||
25 | # 4096-bit +4% | ||
26 | |||
27 | $flavour = shift; | ||
28 | |||
29 | if ($flavour =~ /32/) { | ||
30 | $BITS= 32; | ||
31 | $BNSZ= $BITS/8; | ||
32 | $SIZE_T=4; | ||
33 | $RZONE= 224; | ||
34 | $FRAME= $SIZE_T*16; | ||
35 | |||
36 | $LD= "lwz"; # load | ||
37 | $LDU= "lwzu"; # load and update | ||
38 | $LDX= "lwzx"; # load indexed | ||
39 | $ST= "stw"; # store | ||
40 | $STU= "stwu"; # store and update | ||
41 | $STX= "stwx"; # store indexed | ||
42 | $STUX= "stwux"; # store indexed and update | ||
43 | $UMULL= "mullw"; # unsigned multiply low | ||
44 | $UMULH= "mulhwu"; # unsigned multiply high | ||
45 | $UCMP= "cmplw"; # unsigned compare | ||
46 | $SHRI= "srwi"; # unsigned shift right by immediate | ||
47 | $PUSH= $ST; | ||
48 | $POP= $LD; | ||
49 | } elsif ($flavour =~ /64/) { | ||
50 | $BITS= 64; | ||
51 | $BNSZ= $BITS/8; | ||
52 | $SIZE_T=8; | ||
53 | $RZONE= 288; | ||
54 | $FRAME= $SIZE_T*16; | ||
55 | |||
56 | # same as above, but 64-bit mnemonics... | ||
57 | $LD= "ld"; # load | ||
58 | $LDU= "ldu"; # load and update | ||
59 | $LDX= "ldx"; # load indexed | ||
60 | $ST= "std"; # store | ||
61 | $STU= "stdu"; # store and update | ||
62 | $STX= "stdx"; # store indexed | ||
63 | $STUX= "stdux"; # store indexed and update | ||
64 | $UMULL= "mulld"; # unsigned multiply low | ||
65 | $UMULH= "mulhdu"; # unsigned multiply high | ||
66 | $UCMP= "cmpld"; # unsigned compare | ||
67 | $SHRI= "srdi"; # unsigned shift right by immediate | ||
68 | $PUSH= $ST; | ||
69 | $POP= $LD; | ||
70 | } else { die "nonsense $flavour"; } | ||
71 | |||
72 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
73 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or | ||
74 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or | ||
75 | die "can't locate ppc-xlate.pl"; | ||
76 | |||
77 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; | ||
78 | |||
79 | $sp="r1"; | ||
80 | $toc="r2"; | ||
81 | $rp="r3"; $ovf="r3"; | ||
82 | $ap="r4"; | ||
83 | $bp="r5"; | ||
84 | $np="r6"; | ||
85 | $n0="r7"; | ||
86 | $num="r8"; | ||
87 | $rp="r9"; # $rp is reassigned | ||
88 | $aj="r10"; | ||
89 | $nj="r11"; | ||
90 | $tj="r12"; | ||
91 | # non-volatile registers | ||
92 | $i="r14"; | ||
93 | $j="r15"; | ||
94 | $tp="r16"; | ||
95 | $m0="r17"; | ||
96 | $m1="r18"; | ||
97 | $lo0="r19"; | ||
98 | $hi0="r20"; | ||
99 | $lo1="r21"; | ||
100 | $hi1="r22"; | ||
101 | $alo="r23"; | ||
102 | $ahi="r24"; | ||
103 | $nlo="r25"; | ||
104 | # | ||
105 | $nhi="r0"; | ||
106 | |||
107 | $code=<<___; | ||
108 | .machine "any" | ||
109 | .text | ||
110 | |||
111 | .globl .bn_mul_mont | ||
112 | .align 4 | ||
113 | .bn_mul_mont: | ||
114 | cmpwi $num,4 | ||
115 | mr $rp,r3 ; $rp is reassigned | ||
116 | li r3,0 | ||
117 | bltlr | ||
118 | |||
119 | slwi $num,$num,`log($BNSZ)/log(2)` | ||
120 | li $tj,-4096 | ||
121 | addi $ovf,$num,`$FRAME+$RZONE` | ||
122 | subf $ovf,$ovf,$sp ; $sp-$ovf | ||
123 | and $ovf,$ovf,$tj ; minimize TLB usage | ||
124 | subf $ovf,$sp,$ovf ; $ovf-$sp | ||
125 | srwi $num,$num,`log($BNSZ)/log(2)` | ||
126 | $STUX $sp,$sp,$ovf | ||
127 | |||
128 | $PUSH r14,`4*$SIZE_T`($sp) | ||
129 | $PUSH r15,`5*$SIZE_T`($sp) | ||
130 | $PUSH r16,`6*$SIZE_T`($sp) | ||
131 | $PUSH r17,`7*$SIZE_T`($sp) | ||
132 | $PUSH r18,`8*$SIZE_T`($sp) | ||
133 | $PUSH r19,`9*$SIZE_T`($sp) | ||
134 | $PUSH r20,`10*$SIZE_T`($sp) | ||
135 | $PUSH r21,`11*$SIZE_T`($sp) | ||
136 | $PUSH r22,`12*$SIZE_T`($sp) | ||
137 | $PUSH r23,`13*$SIZE_T`($sp) | ||
138 | $PUSH r24,`14*$SIZE_T`($sp) | ||
139 | $PUSH r25,`15*$SIZE_T`($sp) | ||
140 | |||
141 | $LD $n0,0($n0) ; pull n0[0] value | ||
142 | addi $num,$num,-2 ; adjust $num for counter register | ||
143 | |||
144 | $LD $m0,0($bp) ; m0=bp[0] | ||
145 | $LD $aj,0($ap) ; ap[0] | ||
146 | addi $tp,$sp,$FRAME | ||
147 | $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] | ||
148 | $UMULH $hi0,$aj,$m0 | ||
149 | |||
150 | $LD $aj,$BNSZ($ap) ; ap[1] | ||
151 | $LD $nj,0($np) ; np[0] | ||
152 | |||
153 | $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0 | ||
154 | |||
155 | $UMULL $alo,$aj,$m0 ; ap[1]*bp[0] | ||
156 | $UMULH $ahi,$aj,$m0 | ||
157 | |||
158 | $UMULL $lo1,$nj,$m1 ; np[0]*m1 | ||
159 | $UMULH $hi1,$nj,$m1 | ||
160 | $LD $nj,$BNSZ($np) ; np[1] | ||
161 | addc $lo1,$lo1,$lo0 | ||
162 | addze $hi1,$hi1 | ||
163 | |||
164 | $UMULL $nlo,$nj,$m1 ; np[1]*m1 | ||
165 | $UMULH $nhi,$nj,$m1 | ||
166 | |||
167 | mtctr $num | ||
168 | li $j,`2*$BNSZ` | ||
169 | .align 4 | ||
170 | L1st: | ||
171 | $LDX $aj,$ap,$j ; ap[j] | ||
172 | addc $lo0,$alo,$hi0 | ||
173 | $LDX $nj,$np,$j ; np[j] | ||
174 | addze $hi0,$ahi | ||
175 | $UMULL $alo,$aj,$m0 ; ap[j]*bp[0] | ||
176 | addc $lo1,$nlo,$hi1 | ||
177 | $UMULH $ahi,$aj,$m0 | ||
178 | addze $hi1,$nhi | ||
179 | $UMULL $nlo,$nj,$m1 ; np[j]*m1 | ||
180 | addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] | ||
181 | $UMULH $nhi,$nj,$m1 | ||
182 | addze $hi1,$hi1 | ||
183 | $ST $lo1,0($tp) ; tp[j-1] | ||
184 | |||
185 | addi $j,$j,$BNSZ ; j++ | ||
186 | addi $tp,$tp,$BNSZ ; tp++ | ||
187 | bdnz- L1st | ||
188 | ;L1st | ||
189 | addc $lo0,$alo,$hi0 | ||
190 | addze $hi0,$ahi | ||
191 | |||
192 | addc $lo1,$nlo,$hi1 | ||
193 | addze $hi1,$nhi | ||
194 | addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] | ||
195 | addze $hi1,$hi1 | ||
196 | $ST $lo1,0($tp) ; tp[j-1] | ||
197 | |||
198 | li $ovf,0 | ||
199 | addc $hi1,$hi1,$hi0 | ||
200 | addze $ovf,$ovf ; upmost overflow bit | ||
201 | $ST $hi1,$BNSZ($tp) | ||
202 | |||
203 | li $i,$BNSZ | ||
204 | .align 4 | ||
205 | Louter: | ||
206 | $LDX $m0,$bp,$i ; m0=bp[i] | ||
207 | $LD $aj,0($ap) ; ap[0] | ||
208 | addi $tp,$sp,$FRAME | ||
209 | $LD $tj,$FRAME($sp) ; tp[0] | ||
210 | $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] | ||
211 | $UMULH $hi0,$aj,$m0 | ||
212 | $LD $aj,$BNSZ($ap) ; ap[1] | ||
213 | $LD $nj,0($np) ; np[0] | ||
214 | addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0] | ||
215 | $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] | ||
216 | addze $hi0,$hi0 | ||
217 | $UMULL $m1,$lo0,$n0 ; tp[0]*n0 | ||
218 | $UMULH $ahi,$aj,$m0 | ||
219 | $UMULL $lo1,$nj,$m1 ; np[0]*m1 | ||
220 | $UMULH $hi1,$nj,$m1 | ||
221 | $LD $nj,$BNSZ($np) ; np[1] | ||
222 | addc $lo1,$lo1,$lo0 | ||
223 | $UMULL $nlo,$nj,$m1 ; np[1]*m1 | ||
224 | addze $hi1,$hi1 | ||
225 | $UMULH $nhi,$nj,$m1 | ||
226 | |||
227 | mtctr $num | ||
228 | li $j,`2*$BNSZ` | ||
229 | .align 4 | ||
230 | Linner: | ||
231 | $LDX $aj,$ap,$j ; ap[j] | ||
232 | addc $lo0,$alo,$hi0 | ||
233 | $LD $tj,$BNSZ($tp) ; tp[j] | ||
234 | addze $hi0,$ahi | ||
235 | $LDX $nj,$np,$j ; np[j] | ||
236 | addc $lo1,$nlo,$hi1 | ||
237 | $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] | ||
238 | addze $hi1,$nhi | ||
239 | $UMULH $ahi,$aj,$m0 | ||
240 | addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] | ||
241 | $UMULL $nlo,$nj,$m1 ; np[j]*m1 | ||
242 | addze $hi0,$hi0 | ||
243 | $UMULH $nhi,$nj,$m1 | ||
244 | addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] | ||
245 | addi $j,$j,$BNSZ ; j++ | ||
246 | addze $hi1,$hi1 | ||
247 | $ST $lo1,0($tp) ; tp[j-1] | ||
248 | addi $tp,$tp,$BNSZ ; tp++ | ||
249 | bdnz- Linner | ||
250 | ;Linner | ||
251 | $LD $tj,$BNSZ($tp) ; tp[j] | ||
252 | addc $lo0,$alo,$hi0 | ||
253 | addze $hi0,$ahi | ||
254 | addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] | ||
255 | addze $hi0,$hi0 | ||
256 | |||
257 | addc $lo1,$nlo,$hi1 | ||
258 | addze $hi1,$nhi | ||
259 | addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] | ||
260 | addze $hi1,$hi1 | ||
261 | $ST $lo1,0($tp) ; tp[j-1] | ||
262 | |||
263 | addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA] | ||
264 | li $ovf,0 | ||
265 | adde $hi1,$hi1,$hi0 | ||
266 | addze $ovf,$ovf | ||
267 | $ST $hi1,$BNSZ($tp) | ||
268 | ; | ||
269 | slwi $tj,$num,`log($BNSZ)/log(2)` | ||
270 | $UCMP $i,$tj | ||
271 | addi $i,$i,$BNSZ | ||
272 | ble- Louter | ||
273 | |||
274 | addi $num,$num,2 ; restore $num | ||
275 | subfc $j,$j,$j ; j=0 and "clear" XER[CA] | ||
276 | addi $tp,$sp,$FRAME | ||
277 | mtctr $num | ||
278 | |||
279 | .align 4 | ||
280 | Lsub: $LDX $tj,$tp,$j | ||
281 | $LDX $nj,$np,$j | ||
282 | subfe $aj,$nj,$tj ; tp[j]-np[j] | ||
283 | $STX $aj,$rp,$j | ||
284 | addi $j,$j,$BNSZ | ||
285 | bdnz- Lsub | ||
286 | |||
287 | li $j,0 | ||
288 | mtctr $num | ||
289 | subfe $ovf,$j,$ovf ; handle upmost overflow bit | ||
290 | and $ap,$tp,$ovf | ||
291 | andc $np,$rp,$ovf | ||
292 | or $ap,$ap,$np ; ap=borrow?tp:rp | ||
293 | |||
294 | .align 4 | ||
295 | Lcopy: ; copy or in-place refresh | ||
296 | $LDX $tj,$ap,$j | ||
297 | $STX $tj,$rp,$j | ||
298 | $STX $j,$tp,$j ; zap at once | ||
299 | addi $j,$j,$BNSZ | ||
300 | bdnz- Lcopy | ||
301 | |||
302 | $POP r14,`4*$SIZE_T`($sp) | ||
303 | $POP r15,`5*$SIZE_T`($sp) | ||
304 | $POP r16,`6*$SIZE_T`($sp) | ||
305 | $POP r17,`7*$SIZE_T`($sp) | ||
306 | $POP r18,`8*$SIZE_T`($sp) | ||
307 | $POP r19,`9*$SIZE_T`($sp) | ||
308 | $POP r20,`10*$SIZE_T`($sp) | ||
309 | $POP r21,`11*$SIZE_T`($sp) | ||
310 | $POP r22,`12*$SIZE_T`($sp) | ||
311 | $POP r23,`13*$SIZE_T`($sp) | ||
312 | $POP r24,`14*$SIZE_T`($sp) | ||
313 | $POP r25,`15*$SIZE_T`($sp) | ||
314 | $POP $sp,0($sp) | ||
315 | li r3,1 | ||
316 | blr | ||
317 | .long 0 | ||
318 | .asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" | ||
319 | ___ | ||
320 | |||
321 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
322 | print $code; | ||
323 | close STDOUT; | ||
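
The header comment of ppc-mont.pl above remarks that a 32-bit application running on a 64-bit CPU cannot reach native 64-bit speed because every addc would have to be expanded into several 32-bit instructions. Roughly, one wide add-with-carry becomes two carry-propagating narrow adds; an illustrative Perl sketch (names mine):

# One 64-bit add-with-carry expressed with 32-bit halves, as a 32-bit build
# running on a 64-bit CPU would have to synthesize it (purely illustrative).
sub addc64_via_32 {
    my ($alo, $ahi, $blo, $bhi, $cin) = @_;   # operands as 32-bit halves, plus carry in
    my $lo = $alo + $blo + $cin;
    my $c  = $lo >> 32;  $lo &= 0xffffffff;   # carry out of the low halves
    my $hi = $ahi + $bhi + $c;
    my $cout = $hi >> 32; $hi &= 0xffffffff;  # carry out of the whole 64-bit add
    return ($lo, $hi, $cout);
}
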
diff --git a/src/lib/libcrypto/bn/asm/ppc64-mont.pl b/src/lib/libcrypto/bn/asm/ppc64-mont.pl
new file mode 100644
index 0000000000..3449b35855
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/ppc64-mont.pl
@@ -0,0 +1,918 @@
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # December 2007 | ||
11 | |||
12 | # The reason for undertaking this effort is basically the following. Even though | ||
13 | # Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI | ||
14 | # performance was observed to be less than impressive, essentially as | ||
15 | # fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope. | ||
16 | # Well, it's not surprising that IBM had to make some sacrifices to | ||
17 | # boost the clock frequency that much, but no overall improvement? | ||
18 | # Having observed how much difference switching to the FPU made on | ||
19 | # UltraSPARC, playing the same stunt on Power 6 appeared appropriate... | ||
20 | # Unfortunately the resulting performance improvement is not as | ||
21 | # impressive, ~30%, and in absolute terms is still very far from what | ||
22 | # one would expect from 4.7GHz CPU. There is a chance that I'm doing | ||
23 | # something wrong, but in the absence of assembler-level micro-profiling | ||
24 | # data or at least a decent platform guide I can't tell... Or better | ||
25 | # results might be achieved with VMX... Anyway, this module provides | ||
26 | # *worse* performance on other PowerPC implementations, ~40-15% slower | ||
27 | # on PPC970 depending on key length and ~40% slower on Power 5 for all | ||
28 | # key lengths. As it's obviously inappropriate as "best all-round" | ||
29 | # alternative, it has to be complemented with run-time CPU family | ||
30 | # detection. Oh! It should also be noted that unlike other PowerPC | ||
31 | # implementations the IALU ppc-mont.pl module performs *suboptimally* on | ||
32 | # >=1024-bit key lengths on Power 6. It should also be noted that | ||
33 | # *everything* said so far applies to 64-bit builds! As far as 32-bit | ||
34 | # application executed on 64-bit CPU goes, this module is likely to | ||
35 | # become preferred choice, because it's easy to adapt it for such | ||
36 | # case and *is* faster than 32-bit ppc-mont.pl on *all* processors. | ||
37 | |||
38 | # February 2008 | ||
39 | |||
40 | # Micro-profiling assisted optimization results in ~15% improvement | ||
41 | # over original ppc64-mont.pl version, or overall ~50% improvement | ||
42 | # over ppc.pl module on Power 6. If compared to ppc-mont.pl on same | ||
43 | # Power 6 CPU, this module is 5-150% faster depending on key length, | ||
44 | # [hereafter] more for longer keys. But if compared to ppc-mont.pl | ||
45 | # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive | ||
46 | # in absolute terms, but it's apparently the way Power 6 is... | ||
47 | |||
48 | $flavour = shift; | ||
49 | |||
50 | if ($flavour =~ /32/) { | ||
51 | $SIZE_T=4; | ||
52 | $RZONE= 224; | ||
53 | $FRAME= $SIZE_T*12+8*12; | ||
54 | $fname= "bn_mul_mont_ppc64"; | ||
55 | |||
56 | $STUX= "stwux"; # store indexed and update | ||
57 | $PUSH= "stw"; | ||
58 | $POP= "lwz"; | ||
59 | die "not implemented yet"; | ||
60 | } elsif ($flavour =~ /64/) { | ||
61 | $SIZE_T=8; | ||
62 | $RZONE= 288; | ||
63 | $FRAME= $SIZE_T*12+8*12; | ||
64 | $fname= "bn_mul_mont"; | ||
65 | |||
66 | # same as above, but 64-bit mnemonics... | ||
67 | $STUX= "stdux"; # store indexed and update | ||
68 | $PUSH= "std"; | ||
69 | $POP= "ld"; | ||
70 | } else { die "nonsense $flavour"; } | ||
71 | |||
72 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
73 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or | ||
74 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or | ||
75 | die "can't locate ppc-xlate.pl"; | ||
76 | |||
77 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; | ||
78 | |||
79 | $FRAME=($FRAME+63)&~63; | ||
80 | $TRANSFER=16*8; | ||
81 | |||
82 | $carry="r0"; | ||
83 | $sp="r1"; | ||
84 | $toc="r2"; | ||
85 | $rp="r3"; $ovf="r3"; | ||
86 | $ap="r4"; | ||
87 | $bp="r5"; | ||
88 | $np="r6"; | ||
89 | $n0="r7"; | ||
90 | $num="r8"; | ||
91 | $rp="r9"; # $rp is reassigned | ||
92 | $tp="r10"; | ||
93 | $j="r11"; | ||
94 | $i="r12"; | ||
95 | # non-volatile registers | ||
96 | $nap_d="r14"; # interleaved ap and np in double format | ||
97 | $a0="r15"; # ap[0] | ||
98 | $t0="r16"; # temporary registers | ||
99 | $t1="r17"; | ||
100 | $t2="r18"; | ||
101 | $t3="r19"; | ||
102 | $t4="r20"; | ||
103 | $t5="r21"; | ||
104 | $t6="r22"; | ||
105 | $t7="r23"; | ||
106 | |||
107 | # PPC offers enough register bank capacity to unroll inner loops twice | ||
108 | # | ||
109 | # ..A3A2A1A0 | ||
110 | # dcba | ||
111 | # ----------- | ||
112 | # A0a | ||
113 | # A0b | ||
114 | # A0c | ||
115 | # A0d | ||
116 | # A1a | ||
117 | # A1b | ||
118 | # A1c | ||
119 | # A1d | ||
120 | # A2a | ||
121 | # A2b | ||
122 | # A2c | ||
123 | # A2d | ||
124 | # A3a | ||
125 | # A3b | ||
126 | # A3c | ||
127 | # A3d | ||
128 | # ..a | ||
129 | # ..b | ||
130 | # | ||
131 | $ba="f0"; $bb="f1"; $bc="f2"; $bd="f3"; | ||
132 | $na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; | ||
133 | $dota="f8"; $dotb="f9"; | ||
134 | $A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; | ||
135 | $N0="f14"; $N1="f15"; $N2="f16"; $N3="f17"; | ||
136 | $T0a="f18"; $T0b="f19"; | ||
137 | $T1a="f20"; $T1b="f21"; | ||
138 | $T2a="f22"; $T2b="f23"; | ||
139 | $T3a="f24"; $T3b="f25"; | ||
140 | |||
141 | # sp----------->+-------------------------------+ | ||
142 | # | saved sp | | ||
143 | # +-------------------------------+ | ||
144 | # | | | ||
145 | # +-------------------------------+ | ||
146 | # | 10 saved gpr, r14-r23 | | ||
147 | # . . | ||
148 | # . . | ||
149 | # +12*size_t +-------------------------------+ | ||
150 | # | 12 saved fpr, f14-f25 | | ||
151 | # . . | ||
152 | # . . | ||
153 | # +12*8 +-------------------------------+ | ||
154 | # | padding to 64 byte boundary | | ||
155 | # . . | ||
156 | # +X +-------------------------------+ | ||
157 | # | 16 gpr<->fpr transfer zone | | ||
158 | # . . | ||
159 | # . . | ||
160 | # +16*8 +-------------------------------+ | ||
161 | # | __int64 tmp[-1] | | ||
162 | # +-------------------------------+ | ||
163 | # | __int64 tmp[num] | | ||
164 | # . . | ||
165 | # . . | ||
166 | # . . | ||
167 | # +(num+1)*8 +-------------------------------+ | ||
168 | # | padding to 64 byte boundary | | ||
169 | # . . | ||
170 | # +X +-------------------------------+ | ||
171 | # | double nap_d[4*num] | | ||
172 | # . . | ||
173 | # . . | ||
174 | # . . | ||
175 | # +-------------------------------+ | ||
176 | |||
177 | $code=<<___; | ||
178 | .machine "any" | ||
179 | .text | ||
180 | |||
181 | .globl .$fname | ||
182 | .align 5 | ||
183 | .$fname: | ||
184 | cmpwi $num,4 | ||
185 | mr $rp,r3 ; $rp is reassigned | ||
186 | li r3,0 ; possible "not handled" return code | ||
187 | bltlr- | ||
188 | andi. r0,$num,1 ; $num has to be even | ||
189 | bnelr- | ||
190 | |||
191 | slwi $num,$num,3 ; num*=8 | ||
192 | li $i,-4096 | ||
193 | slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num | ||
194 | add $tp,$tp,$num ; place for tp[num+1] | ||
195 | addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE` | ||
196 | subf $tp,$tp,$sp ; $sp-$tp | ||
197 | and $tp,$tp,$i ; minimize TLB usage | ||
198 | subf $tp,$sp,$tp ; $tp-$sp | ||
199 | $STUX $sp,$sp,$tp ; alloca | ||
200 | |||
201 | $PUSH r14,`2*$SIZE_T`($sp) | ||
202 | $PUSH r15,`3*$SIZE_T`($sp) | ||
203 | $PUSH r16,`4*$SIZE_T`($sp) | ||
204 | $PUSH r17,`5*$SIZE_T`($sp) | ||
205 | $PUSH r18,`6*$SIZE_T`($sp) | ||
206 | $PUSH r19,`7*$SIZE_T`($sp) | ||
207 | $PUSH r20,`8*$SIZE_T`($sp) | ||
208 | $PUSH r21,`9*$SIZE_T`($sp) | ||
209 | $PUSH r22,`10*$SIZE_T`($sp) | ||
210 | $PUSH r23,`11*$SIZE_T`($sp) | ||
211 | stfd f14,`12*$SIZE_T+0`($sp) | ||
212 | stfd f15,`12*$SIZE_T+8`($sp) | ||
213 | stfd f16,`12*$SIZE_T+16`($sp) | ||
214 | stfd f17,`12*$SIZE_T+24`($sp) | ||
215 | stfd f18,`12*$SIZE_T+32`($sp) | ||
216 | stfd f19,`12*$SIZE_T+40`($sp) | ||
217 | stfd f20,`12*$SIZE_T+48`($sp) | ||
218 | stfd f21,`12*$SIZE_T+56`($sp) | ||
219 | stfd f22,`12*$SIZE_T+64`($sp) | ||
220 | stfd f23,`12*$SIZE_T+72`($sp) | ||
221 | stfd f24,`12*$SIZE_T+80`($sp) | ||
222 | stfd f25,`12*$SIZE_T+88`($sp) | ||
223 | |||
224 | ld $a0,0($ap) ; pull ap[0] value | ||
225 | ld $n0,0($n0) ; pull n0[0] value | ||
226 | ld $t3,0($bp) ; bp[0] | ||
227 | |||
228 | addi $tp,$sp,`$FRAME+$TRANSFER+8+64` | ||
229 | li $i,-64 | ||
230 | add $nap_d,$tp,$num | ||
231 | and $nap_d,$nap_d,$i ; align to 64 bytes | ||
232 | |||
233 | mulld $t7,$a0,$t3 ; ap[0]*bp[0] | ||
234 | ; nap_d is off by 1, because it's used with stfdu/lfdu | ||
235 | addi $nap_d,$nap_d,-8 | ||
236 | srwi $j,$num,`3+1` ; counter register, num/2 | ||
237 | mulld $t7,$t7,$n0 ; tp[0]*n0 | ||
238 | addi $j,$j,-1 | ||
239 | addi $tp,$sp,`$FRAME+$TRANSFER-8` | ||
240 | li $carry,0 | ||
241 | mtctr $j | ||
242 | |||
243 | ; transfer bp[0] to FPU as 4x16-bit values | ||
244 | extrdi $t0,$t3,16,48 | ||
245 | extrdi $t1,$t3,16,32 | ||
246 | extrdi $t2,$t3,16,16 | ||
247 | extrdi $t3,$t3,16,0 | ||
248 | std $t0,`$FRAME+0`($sp) | ||
249 | std $t1,`$FRAME+8`($sp) | ||
250 | std $t2,`$FRAME+16`($sp) | ||
251 | std $t3,`$FRAME+24`($sp) | ||
252 | ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values | ||
253 | extrdi $t4,$t7,16,48 | ||
254 | extrdi $t5,$t7,16,32 | ||
255 | extrdi $t6,$t7,16,16 | ||
256 | extrdi $t7,$t7,16,0 | ||
257 | std $t4,`$FRAME+32`($sp) | ||
258 | std $t5,`$FRAME+40`($sp) | ||
259 | std $t6,`$FRAME+48`($sp) | ||
260 | std $t7,`$FRAME+56`($sp) | ||
261 | lwz $t0,4($ap) ; load a[j] as 32-bit word pair | ||
262 | lwz $t1,0($ap) | ||
263 | lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair | ||
264 | lwz $t3,8($ap) | ||
265 | lwz $t4,4($np) ; load n[j] as 32-bit word pair | ||
266 | lwz $t5,0($np) | ||
267 | lwz $t6,12($np) ; load n[j+1] as 32-bit word pair | ||
268 | lwz $t7,8($np) | ||
269 | lfd $ba,`$FRAME+0`($sp) | ||
270 | lfd $bb,`$FRAME+8`($sp) | ||
271 | lfd $bc,`$FRAME+16`($sp) | ||
272 | lfd $bd,`$FRAME+24`($sp) | ||
273 | lfd $na,`$FRAME+32`($sp) | ||
274 | lfd $nb,`$FRAME+40`($sp) | ||
275 | lfd $nc,`$FRAME+48`($sp) | ||
276 | lfd $nd,`$FRAME+56`($sp) | ||
277 | std $t0,`$FRAME+64`($sp) | ||
278 | std $t1,`$FRAME+72`($sp) | ||
279 | std $t2,`$FRAME+80`($sp) | ||
280 | std $t3,`$FRAME+88`($sp) | ||
281 | std $t4,`$FRAME+96`($sp) | ||
282 | std $t5,`$FRAME+104`($sp) | ||
283 | std $t6,`$FRAME+112`($sp) | ||
284 | std $t7,`$FRAME+120`($sp) | ||
285 | fcfid $ba,$ba | ||
286 | fcfid $bb,$bb | ||
287 | fcfid $bc,$bc | ||
288 | fcfid $bd,$bd | ||
289 | fcfid $na,$na | ||
290 | fcfid $nb,$nb | ||
291 | fcfid $nc,$nc | ||
292 | fcfid $nd,$nd | ||
293 | |||
294 | lfd $A0,`$FRAME+64`($sp) | ||
295 | lfd $A1,`$FRAME+72`($sp) | ||
296 | lfd $A2,`$FRAME+80`($sp) | ||
297 | lfd $A3,`$FRAME+88`($sp) | ||
298 | lfd $N0,`$FRAME+96`($sp) | ||
299 | lfd $N1,`$FRAME+104`($sp) | ||
300 | lfd $N2,`$FRAME+112`($sp) | ||
301 | lfd $N3,`$FRAME+120`($sp) | ||
302 | fcfid $A0,$A0 | ||
303 | fcfid $A1,$A1 | ||
304 | fcfid $A2,$A2 | ||
305 | fcfid $A3,$A3 | ||
306 | fcfid $N0,$N0 | ||
307 | fcfid $N1,$N1 | ||
308 | fcfid $N2,$N2 | ||
309 | fcfid $N3,$N3 | ||
310 | addi $ap,$ap,16 | ||
311 | addi $np,$np,16 | ||
312 | |||
313 | fmul $T1a,$A1,$ba | ||
314 | fmul $T1b,$A1,$bb | ||
315 | stfd $A0,8($nap_d) ; save a[j] in double format | ||
316 | stfd $A1,16($nap_d) | ||
317 | fmul $T2a,$A2,$ba | ||
318 | fmul $T2b,$A2,$bb | ||
319 | stfd $A2,24($nap_d) ; save a[j+1] in double format | ||
320 | stfd $A3,32($nap_d) | ||
321 | fmul $T3a,$A3,$ba | ||
322 | fmul $T3b,$A3,$bb | ||
323 | stfd $N0,40($nap_d) ; save n[j] in double format | ||
324 | stfd $N1,48($nap_d) | ||
325 | fmul $T0a,$A0,$ba | ||
326 | fmul $T0b,$A0,$bb | ||
327 | stfd $N2,56($nap_d) ; save n[j+1] in double format | ||
328 | stfdu $N3,64($nap_d) | ||
329 | |||
330 | fmadd $T1a,$A0,$bc,$T1a | ||
331 | fmadd $T1b,$A0,$bd,$T1b | ||
332 | fmadd $T2a,$A1,$bc,$T2a | ||
333 | fmadd $T2b,$A1,$bd,$T2b | ||
334 | fmadd $T3a,$A2,$bc,$T3a | ||
335 | fmadd $T3b,$A2,$bd,$T3b | ||
336 | fmul $dota,$A3,$bc | ||
337 | fmul $dotb,$A3,$bd | ||
338 | |||
339 | fmadd $T1a,$N1,$na,$T1a | ||
340 | fmadd $T1b,$N1,$nb,$T1b | ||
341 | fmadd $T2a,$N2,$na,$T2a | ||
342 | fmadd $T2b,$N2,$nb,$T2b | ||
343 | fmadd $T3a,$N3,$na,$T3a | ||
344 | fmadd $T3b,$N3,$nb,$T3b | ||
345 | fmadd $T0a,$N0,$na,$T0a | ||
346 | fmadd $T0b,$N0,$nb,$T0b | ||
347 | |||
348 | fmadd $T1a,$N0,$nc,$T1a | ||
349 | fmadd $T1b,$N0,$nd,$T1b | ||
350 | fmadd $T2a,$N1,$nc,$T2a | ||
351 | fmadd $T2b,$N1,$nd,$T2b | ||
352 | fmadd $T3a,$N2,$nc,$T3a | ||
353 | fmadd $T3b,$N2,$nd,$T3b | ||
354 | fmadd $dota,$N3,$nc,$dota | ||
355 | fmadd $dotb,$N3,$nd,$dotb | ||
356 | |||
357 | fctid $T0a,$T0a | ||
358 | fctid $T0b,$T0b | ||
359 | fctid $T1a,$T1a | ||
360 | fctid $T1b,$T1b | ||
361 | fctid $T2a,$T2a | ||
362 | fctid $T2b,$T2b | ||
363 | fctid $T3a,$T3a | ||
364 | fctid $T3b,$T3b | ||
365 | |||
366 | stfd $T0a,`$FRAME+0`($sp) | ||
367 | stfd $T0b,`$FRAME+8`($sp) | ||
368 | stfd $T1a,`$FRAME+16`($sp) | ||
369 | stfd $T1b,`$FRAME+24`($sp) | ||
370 | stfd $T2a,`$FRAME+32`($sp) | ||
371 | stfd $T2b,`$FRAME+40`($sp) | ||
372 | stfd $T3a,`$FRAME+48`($sp) | ||
373 | stfd $T3b,`$FRAME+56`($sp) | ||
374 | |||
375 | .align 5 | ||
376 | L1st: | ||
377 | lwz $t0,4($ap) ; load a[j] as 32-bit word pair | ||
378 | lwz $t1,0($ap) | ||
379 | lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair | ||
380 | lwz $t3,8($ap) | ||
381 | lwz $t4,4($np) ; load n[j] as 32-bit word pair | ||
382 | lwz $t5,0($np) | ||
383 | lwz $t6,12($np) ; load n[j+1] as 32-bit word pair | ||
384 | lwz $t7,8($np) | ||
385 | std $t0,`$FRAME+64`($sp) | ||
386 | std $t1,`$FRAME+72`($sp) | ||
387 | std $t2,`$FRAME+80`($sp) | ||
388 | std $t3,`$FRAME+88`($sp) | ||
389 | std $t4,`$FRAME+96`($sp) | ||
390 | std $t5,`$FRAME+104`($sp) | ||
391 | std $t6,`$FRAME+112`($sp) | ||
392 | std $t7,`$FRAME+120`($sp) | ||
393 | ld $t0,`$FRAME+0`($sp) | ||
394 | ld $t1,`$FRAME+8`($sp) | ||
395 | ld $t2,`$FRAME+16`($sp) | ||
396 | ld $t3,`$FRAME+24`($sp) | ||
397 | ld $t4,`$FRAME+32`($sp) | ||
398 | ld $t5,`$FRAME+40`($sp) | ||
399 | ld $t6,`$FRAME+48`($sp) | ||
400 | ld $t7,`$FRAME+56`($sp) | ||
401 | lfd $A0,`$FRAME+64`($sp) | ||
402 | lfd $A1,`$FRAME+72`($sp) | ||
403 | lfd $A2,`$FRAME+80`($sp) | ||
404 | lfd $A3,`$FRAME+88`($sp) | ||
405 | lfd $N0,`$FRAME+96`($sp) | ||
406 | lfd $N1,`$FRAME+104`($sp) | ||
407 | lfd $N2,`$FRAME+112`($sp) | ||
408 | lfd $N3,`$FRAME+120`($sp) | ||
409 | fcfid $A0,$A0 | ||
410 | fcfid $A1,$A1 | ||
411 | fcfid $A2,$A2 | ||
412 | fcfid $A3,$A3 | ||
413 | fcfid $N0,$N0 | ||
414 | fcfid $N1,$N1 | ||
415 | fcfid $N2,$N2 | ||
416 | fcfid $N3,$N3 | ||
417 | addi $ap,$ap,16 | ||
418 | addi $np,$np,16 | ||
419 | |||
420 | fmul $T1a,$A1,$ba | ||
421 | fmul $T1b,$A1,$bb | ||
422 | fmul $T2a,$A2,$ba | ||
423 | fmul $T2b,$A2,$bb | ||
424 | stfd $A0,8($nap_d) ; save a[j] in double format | ||
425 | stfd $A1,16($nap_d) | ||
426 | fmul $T3a,$A3,$ba | ||
427 | fmul $T3b,$A3,$bb | ||
428 | fmadd $T0a,$A0,$ba,$dota | ||
429 | fmadd $T0b,$A0,$bb,$dotb | ||
430 | stfd $A2,24($nap_d) ; save a[j+1] in double format | ||
431 | stfd $A3,32($nap_d) | ||
432 | |||
433 | fmadd $T1a,$A0,$bc,$T1a | ||
434 | fmadd $T1b,$A0,$bd,$T1b | ||
435 | fmadd $T2a,$A1,$bc,$T2a | ||
436 | fmadd $T2b,$A1,$bd,$T2b | ||
437 | stfd $N0,40($nap_d) ; save n[j] in double format | ||
438 | stfd $N1,48($nap_d) | ||
439 | fmadd $T3a,$A2,$bc,$T3a | ||
440 | fmadd $T3b,$A2,$bd,$T3b | ||
441 | add $t0,$t0,$carry ; can not overflow | ||
442 | fmul $dota,$A3,$bc | ||
443 | fmul $dotb,$A3,$bd | ||
444 | stfd $N2,56($nap_d) ; save n[j+1] in double format | ||
445 | stfdu $N3,64($nap_d) | ||
446 | srdi $carry,$t0,16 | ||
447 | add $t1,$t1,$carry | ||
448 | srdi $carry,$t1,16 | ||
449 | |||
450 | fmadd $T1a,$N1,$na,$T1a | ||
451 | fmadd $T1b,$N1,$nb,$T1b | ||
452 | insrdi $t0,$t1,16,32 | ||
453 | fmadd $T2a,$N2,$na,$T2a | ||
454 | fmadd $T2b,$N2,$nb,$T2b | ||
455 | add $t2,$t2,$carry | ||
456 | fmadd $T3a,$N3,$na,$T3a | ||
457 | fmadd $T3b,$N3,$nb,$T3b | ||
458 | srdi $carry,$t2,16 | ||
459 | fmadd $T0a,$N0,$na,$T0a | ||
460 | fmadd $T0b,$N0,$nb,$T0b | ||
461 | insrdi $t0,$t2,16,16 | ||
462 | add $t3,$t3,$carry | ||
463 | srdi $carry,$t3,16 | ||
464 | |||
465 | fmadd $T1a,$N0,$nc,$T1a | ||
466 | fmadd $T1b,$N0,$nd,$T1b | ||
467 | insrdi $t0,$t3,16,0 ; 0..63 bits | ||
468 | fmadd $T2a,$N1,$nc,$T2a | ||
469 | fmadd $T2b,$N1,$nd,$T2b | ||
470 | add $t4,$t4,$carry | ||
471 | fmadd $T3a,$N2,$nc,$T3a | ||
472 | fmadd $T3b,$N2,$nd,$T3b | ||
473 | srdi $carry,$t4,16 | ||
474 | fmadd $dota,$N3,$nc,$dota | ||
475 | fmadd $dotb,$N3,$nd,$dotb | ||
476 | add $t5,$t5,$carry | ||
477 | srdi $carry,$t5,16 | ||
478 | insrdi $t4,$t5,16,32 | ||
479 | |||
480 | fctid $T0a,$T0a | ||
481 | fctid $T0b,$T0b | ||
482 | add $t6,$t6,$carry | ||
483 | fctid $T1a,$T1a | ||
484 | fctid $T1b,$T1b | ||
485 | srdi $carry,$t6,16 | ||
486 | fctid $T2a,$T2a | ||
487 | fctid $T2b,$T2b | ||
488 | insrdi $t4,$t6,16,16 | ||
489 | fctid $T3a,$T3a | ||
490 | fctid $T3b,$T3b | ||
491 | add $t7,$t7,$carry | ||
492 | insrdi $t4,$t7,16,0 ; 64..127 bits | ||
493 | srdi $carry,$t7,16 ; upper 33 bits | ||
494 | |||
495 | stfd $T0a,`$FRAME+0`($sp) | ||
496 | stfd $T0b,`$FRAME+8`($sp) | ||
497 | stfd $T1a,`$FRAME+16`($sp) | ||
498 | stfd $T1b,`$FRAME+24`($sp) | ||
499 | stfd $T2a,`$FRAME+32`($sp) | ||
500 | stfd $T2b,`$FRAME+40`($sp) | ||
501 | stfd $T3a,`$FRAME+48`($sp) | ||
502 | stfd $T3b,`$FRAME+56`($sp) | ||
503 | std $t0,8($tp) ; tp[j-1] | ||
504 | stdu $t4,16($tp) ; tp[j] | ||
505 | bdnz- L1st | ||
506 | |||
507 | fctid $dota,$dota | ||
508 | fctid $dotb,$dotb | ||
509 | |||
510 | ld $t0,`$FRAME+0`($sp) | ||
511 | ld $t1,`$FRAME+8`($sp) | ||
512 | ld $t2,`$FRAME+16`($sp) | ||
513 | ld $t3,`$FRAME+24`($sp) | ||
514 | ld $t4,`$FRAME+32`($sp) | ||
515 | ld $t5,`$FRAME+40`($sp) | ||
516 | ld $t6,`$FRAME+48`($sp) | ||
517 | ld $t7,`$FRAME+56`($sp) | ||
518 | stfd $dota,`$FRAME+64`($sp) | ||
519 | stfd $dotb,`$FRAME+72`($sp) | ||
520 | |||
521 | add $t0,$t0,$carry ; can not overflow | ||
522 | srdi $carry,$t0,16 | ||
523 | add $t1,$t1,$carry | ||
524 | srdi $carry,$t1,16 | ||
525 | insrdi $t0,$t1,16,32 | ||
526 | add $t2,$t2,$carry | ||
527 | srdi $carry,$t2,16 | ||
528 | insrdi $t0,$t2,16,16 | ||
529 | add $t3,$t3,$carry | ||
530 | srdi $carry,$t3,16 | ||
531 | insrdi $t0,$t3,16,0 ; 0..63 bits | ||
532 | add $t4,$t4,$carry | ||
533 | srdi $carry,$t4,16 | ||
534 | add $t5,$t5,$carry | ||
535 | srdi $carry,$t5,16 | ||
536 | insrdi $t4,$t5,16,32 | ||
537 | add $t6,$t6,$carry | ||
538 | srdi $carry,$t6,16 | ||
539 | insrdi $t4,$t6,16,16 | ||
540 | add $t7,$t7,$carry | ||
541 | insrdi $t4,$t7,16,0 ; 64..127 bits | ||
542 | srdi $carry,$t7,16 ; upper 33 bits | ||
543 | ld $t6,`$FRAME+64`($sp) | ||
544 | ld $t7,`$FRAME+72`($sp) | ||
545 | |||
546 | std $t0,8($tp) ; tp[j-1] | ||
547 | stdu $t4,16($tp) ; tp[j] | ||
548 | |||
549 | add $t6,$t6,$carry ; can not overflow | ||
550 | srdi $carry,$t6,16 | ||
551 | add $t7,$t7,$carry | ||
552 | insrdi $t6,$t7,48,0 | ||
553 | srdi $ovf,$t7,48 | ||
554 | std $t6,8($tp) ; tp[num-1] | ||
555 | |||
556 | slwi $t7,$num,2 | ||
557 | subf $nap_d,$t7,$nap_d ; rewind pointer | ||
558 | |||
559 | li $i,8 ; i=1 | ||
560 | .align 5 | ||
561 | Louter: | ||
562 | ldx $t3,$bp,$i ; bp[i] | ||
563 | ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] | ||
564 | mulld $t7,$a0,$t3 ; ap[0]*bp[i] | ||
565 | |||
566 | addi $tp,$sp,`$FRAME+$TRANSFER` | ||
567 | add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0] | ||
568 | li $carry,0 | ||
569 | mulld $t7,$t7,$n0 ; tp[0]*n0 | ||
570 | mtctr $j | ||
571 | |||
572 | ; transfer bp[i] to FPU as 4x16-bit values | ||
573 | extrdi $t0,$t3,16,48 | ||
574 | extrdi $t1,$t3,16,32 | ||
575 | extrdi $t2,$t3,16,16 | ||
576 | extrdi $t3,$t3,16,0 | ||
577 | std $t0,`$FRAME+0`($sp) | ||
578 | std $t1,`$FRAME+8`($sp) | ||
579 | std $t2,`$FRAME+16`($sp) | ||
580 | std $t3,`$FRAME+24`($sp) | ||
581 | ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values | ||
582 | extrdi $t4,$t7,16,48 | ||
583 | extrdi $t5,$t7,16,32 | ||
584 | extrdi $t6,$t7,16,16 | ||
585 | extrdi $t7,$t7,16,0 | ||
586 | std $t4,`$FRAME+32`($sp) | ||
587 | std $t5,`$FRAME+40`($sp) | ||
588 | std $t6,`$FRAME+48`($sp) | ||
589 | std $t7,`$FRAME+56`($sp) | ||
590 | |||
591 | lfd $A0,8($nap_d) ; load a[j] in double format | ||
592 | lfd $A1,16($nap_d) | ||
593 | lfd $A2,24($nap_d) ; load a[j+1] in double format | ||
594 | lfd $A3,32($nap_d) | ||
595 | lfd $N0,40($nap_d) ; load n[j] in double format | ||
596 | lfd $N1,48($nap_d) | ||
597 | lfd $N2,56($nap_d) ; load n[j+1] in double format | ||
598 | lfdu $N3,64($nap_d) | ||
599 | |||
600 | lfd $ba,`$FRAME+0`($sp) | ||
601 | lfd $bb,`$FRAME+8`($sp) | ||
602 | lfd $bc,`$FRAME+16`($sp) | ||
603 | lfd $bd,`$FRAME+24`($sp) | ||
604 | lfd $na,`$FRAME+32`($sp) | ||
605 | lfd $nb,`$FRAME+40`($sp) | ||
606 | lfd $nc,`$FRAME+48`($sp) | ||
607 | lfd $nd,`$FRAME+56`($sp) | ||
608 | |||
609 | fcfid $ba,$ba | ||
610 | fcfid $bb,$bb | ||
611 | fcfid $bc,$bc | ||
612 | fcfid $bd,$bd | ||
613 | fcfid $na,$na | ||
614 | fcfid $nb,$nb | ||
615 | fcfid $nc,$nc | ||
616 | fcfid $nd,$nd | ||
617 | |||
618 | fmul $T1a,$A1,$ba | ||
619 | fmul $T1b,$A1,$bb | ||
620 | fmul $T2a,$A2,$ba | ||
621 | fmul $T2b,$A2,$bb | ||
622 | fmul $T3a,$A3,$ba | ||
623 | fmul $T3b,$A3,$bb | ||
624 | fmul $T0a,$A0,$ba | ||
625 | fmul $T0b,$A0,$bb | ||
626 | |||
627 | fmadd $T1a,$A0,$bc,$T1a | ||
628 | fmadd $T1b,$A0,$bd,$T1b | ||
629 | fmadd $T2a,$A1,$bc,$T2a | ||
630 | fmadd $T2b,$A1,$bd,$T2b | ||
631 | fmadd $T3a,$A2,$bc,$T3a | ||
632 | fmadd $T3b,$A2,$bd,$T3b | ||
633 | fmul $dota,$A3,$bc | ||
634 | fmul $dotb,$A3,$bd | ||
635 | |||
636 | fmadd $T1a,$N1,$na,$T1a | ||
637 | fmadd $T1b,$N1,$nb,$T1b | ||
638 | lfd $A0,8($nap_d) ; load a[j] in double format | ||
639 | lfd $A1,16($nap_d) | ||
640 | fmadd $T2a,$N2,$na,$T2a | ||
641 | fmadd $T2b,$N2,$nb,$T2b | ||
642 | lfd $A2,24($nap_d) ; load a[j+1] in double format | ||
643 | lfd $A3,32($nap_d) | ||
644 | fmadd $T3a,$N3,$na,$T3a | ||
645 | fmadd $T3b,$N3,$nb,$T3b | ||
646 | fmadd $T0a,$N0,$na,$T0a | ||
647 | fmadd $T0b,$N0,$nb,$T0b | ||
648 | |||
649 | fmadd $T1a,$N0,$nc,$T1a | ||
650 | fmadd $T1b,$N0,$nd,$T1b | ||
651 | fmadd $T2a,$N1,$nc,$T2a | ||
652 | fmadd $T2b,$N1,$nd,$T2b | ||
653 | fmadd $T3a,$N2,$nc,$T3a | ||
654 | fmadd $T3b,$N2,$nd,$T3b | ||
655 | fmadd $dota,$N3,$nc,$dota | ||
656 | fmadd $dotb,$N3,$nd,$dotb | ||
657 | |||
658 | fctid $T0a,$T0a | ||
659 | fctid $T0b,$T0b | ||
660 | fctid $T1a,$T1a | ||
661 | fctid $T1b,$T1b | ||
662 | fctid $T2a,$T2a | ||
663 | fctid $T2b,$T2b | ||
664 | fctid $T3a,$T3a | ||
665 | fctid $T3b,$T3b | ||
666 | |||
667 | stfd $T0a,`$FRAME+0`($sp) | ||
668 | stfd $T0b,`$FRAME+8`($sp) | ||
669 | stfd $T1a,`$FRAME+16`($sp) | ||
670 | stfd $T1b,`$FRAME+24`($sp) | ||
671 | stfd $T2a,`$FRAME+32`($sp) | ||
672 | stfd $T2b,`$FRAME+40`($sp) | ||
673 | stfd $T3a,`$FRAME+48`($sp) | ||
674 | stfd $T3b,`$FRAME+56`($sp) | ||
675 | |||
676 | .align 5 | ||
677 | Linner: | ||
678 | fmul $T1a,$A1,$ba | ||
679 | fmul $T1b,$A1,$bb | ||
680 | fmul $T2a,$A2,$ba | ||
681 | fmul $T2b,$A2,$bb | ||
682 | lfd $N0,40($nap_d) ; load n[j] in double format | ||
683 | lfd $N1,48($nap_d) | ||
684 | fmul $T3a,$A3,$ba | ||
685 | fmul $T3b,$A3,$bb | ||
686 | fmadd $T0a,$A0,$ba,$dota | ||
687 | fmadd $T0b,$A0,$bb,$dotb | ||
688 | lfd $N2,56($nap_d) ; load n[j+1] in double format | ||
689 | lfdu $N3,64($nap_d) | ||
690 | |||
691 | fmadd $T1a,$A0,$bc,$T1a | ||
692 | fmadd $T1b,$A0,$bd,$T1b | ||
693 | fmadd $T2a,$A1,$bc,$T2a | ||
694 | fmadd $T2b,$A1,$bd,$T2b | ||
695 | lfd $A0,8($nap_d) ; load a[j] in double format | ||
696 | lfd $A1,16($nap_d) | ||
697 | fmadd $T3a,$A2,$bc,$T3a | ||
698 | fmadd $T3b,$A2,$bd,$T3b | ||
699 | fmul $dota,$A3,$bc | ||
700 | fmul $dotb,$A3,$bd | ||
701 | lfd $A2,24($nap_d) ; load a[j+1] in double format | ||
702 | lfd $A3,32($nap_d) | ||
703 | |||
704 | fmadd $T1a,$N1,$na,$T1a | ||
705 | fmadd $T1b,$N1,$nb,$T1b | ||
706 | ld $t0,`$FRAME+0`($sp) | ||
707 | ld $t1,`$FRAME+8`($sp) | ||
708 | fmadd $T2a,$N2,$na,$T2a | ||
709 | fmadd $T2b,$N2,$nb,$T2b | ||
710 | ld $t2,`$FRAME+16`($sp) | ||
711 | ld $t3,`$FRAME+24`($sp) | ||
712 | fmadd $T3a,$N3,$na,$T3a | ||
713 | fmadd $T3b,$N3,$nb,$T3b | ||
714 | add $t0,$t0,$carry ; can not overflow | ||
715 | ld $t4,`$FRAME+32`($sp) | ||
716 | ld $t5,`$FRAME+40`($sp) | ||
717 | fmadd $T0a,$N0,$na,$T0a | ||
718 | fmadd $T0b,$N0,$nb,$T0b | ||
719 | srdi $carry,$t0,16 | ||
720 | add $t1,$t1,$carry | ||
721 | srdi $carry,$t1,16 | ||
722 | ld $t6,`$FRAME+48`($sp) | ||
723 | ld $t7,`$FRAME+56`($sp) | ||
724 | |||
725 | fmadd $T1a,$N0,$nc,$T1a | ||
726 | fmadd $T1b,$N0,$nd,$T1b | ||
727 | insrdi $t0,$t1,16,32 | ||
728 | ld $t1,8($tp) ; tp[j] | ||
729 | fmadd $T2a,$N1,$nc,$T2a | ||
730 | fmadd $T2b,$N1,$nd,$T2b | ||
731 | add $t2,$t2,$carry | ||
732 | fmadd $T3a,$N2,$nc,$T3a | ||
733 | fmadd $T3b,$N2,$nd,$T3b | ||
734 | srdi $carry,$t2,16 | ||
735 | insrdi $t0,$t2,16,16 | ||
736 | fmadd $dota,$N3,$nc,$dota | ||
737 | fmadd $dotb,$N3,$nd,$dotb | ||
738 | add $t3,$t3,$carry | ||
739 | ldu $t2,16($tp) ; tp[j+1] | ||
740 | srdi $carry,$t3,16 | ||
741 | insrdi $t0,$t3,16,0 ; 0..63 bits | ||
742 | add $t4,$t4,$carry | ||
743 | |||
744 | fctid $T0a,$T0a | ||
745 | fctid $T0b,$T0b | ||
746 | srdi $carry,$t4,16 | ||
747 | fctid $T1a,$T1a | ||
748 | fctid $T1b,$T1b | ||
749 | add $t5,$t5,$carry | ||
750 | fctid $T2a,$T2a | ||
751 | fctid $T2b,$T2b | ||
752 | srdi $carry,$t5,16 | ||
753 | insrdi $t4,$t5,16,32 | ||
754 | fctid $T3a,$T3a | ||
755 | fctid $T3b,$T3b | ||
756 | add $t6,$t6,$carry | ||
757 | srdi $carry,$t6,16 | ||
758 | insrdi $t4,$t6,16,16 | ||
759 | |||
760 | stfd $T0a,`$FRAME+0`($sp) | ||
761 | stfd $T0b,`$FRAME+8`($sp) | ||
762 | add $t7,$t7,$carry | ||
763 | addc $t3,$t0,$t1 | ||
764 | stfd $T1a,`$FRAME+16`($sp) | ||
765 | stfd $T1b,`$FRAME+24`($sp) | ||
766 | insrdi $t4,$t7,16,0 ; 64..127 bits | ||
767 | srdi $carry,$t7,16 ; upper 33 bits | ||
768 | stfd $T2a,`$FRAME+32`($sp) | ||
769 | stfd $T2b,`$FRAME+40`($sp) | ||
770 | adde $t5,$t4,$t2 | ||
771 | stfd $T3a,`$FRAME+48`($sp) | ||
772 | stfd $T3b,`$FRAME+56`($sp) | ||
773 | addze $carry,$carry | ||
774 | std $t3,-16($tp) ; tp[j-1] | ||
775 | std $t5,-8($tp) ; tp[j] | ||
776 | bdnz- Linner | ||
777 | |||
778 | fctid $dota,$dota | ||
779 | fctid $dotb,$dotb | ||
780 | ld $t0,`$FRAME+0`($sp) | ||
781 | ld $t1,`$FRAME+8`($sp) | ||
782 | ld $t2,`$FRAME+16`($sp) | ||
783 | ld $t3,`$FRAME+24`($sp) | ||
784 | ld $t4,`$FRAME+32`($sp) | ||
785 | ld $t5,`$FRAME+40`($sp) | ||
786 | ld $t6,`$FRAME+48`($sp) | ||
787 | ld $t7,`$FRAME+56`($sp) | ||
788 | stfd $dota,`$FRAME+64`($sp) | ||
789 | stfd $dotb,`$FRAME+72`($sp) | ||
790 | |||
791 | add $t0,$t0,$carry ; can not overflow | ||
792 | srdi $carry,$t0,16 | ||
793 | add $t1,$t1,$carry | ||
794 | srdi $carry,$t1,16 | ||
795 | insrdi $t0,$t1,16,32 | ||
796 | add $t2,$t2,$carry | ||
797 | ld $t1,8($tp) ; tp[j] | ||
798 | srdi $carry,$t2,16 | ||
799 | insrdi $t0,$t2,16,16 | ||
800 | add $t3,$t3,$carry | ||
801 | ldu $t2,16($tp) ; tp[j+1] | ||
802 | srdi $carry,$t3,16 | ||
803 | insrdi $t0,$t3,16,0 ; 0..63 bits | ||
804 | add $t4,$t4,$carry | ||
805 | srdi $carry,$t4,16 | ||
806 | add $t5,$t5,$carry | ||
807 | srdi $carry,$t5,16 | ||
808 | insrdi $t4,$t5,16,32 | ||
809 | add $t6,$t6,$carry | ||
810 | srdi $carry,$t6,16 | ||
811 | insrdi $t4,$t6,16,16 | ||
812 | add $t7,$t7,$carry | ||
813 | insrdi $t4,$t7,16,0 ; 64..127 bits | ||
814 | srdi $carry,$t7,16 ; upper 33 bits | ||
815 | ld $t6,`$FRAME+64`($sp) | ||
816 | ld $t7,`$FRAME+72`($sp) | ||
817 | |||
818 | addc $t3,$t0,$t1 | ||
819 | adde $t5,$t4,$t2 | ||
820 | addze $carry,$carry | ||
821 | |||
822 | std $t3,-16($tp) ; tp[j-1] | ||
823 | std $t5,-8($tp) ; tp[j] | ||
824 | |||
825 | add $carry,$carry,$ovf ; consume upmost overflow | ||
826 | add $t6,$t6,$carry ; can not overflow | ||
827 | srdi $carry,$t6,16 | ||
828 | add $t7,$t7,$carry | ||
829 | insrdi $t6,$t7,48,0 | ||
830 | srdi $ovf,$t7,48 | ||
831 | std $t6,0($tp) ; tp[num-1] | ||
832 | |||
833 | slwi $t7,$num,2 | ||
834 | addi $i,$i,8 | ||
835 | subf $nap_d,$t7,$nap_d ; rewind pointer | ||
836 | cmpw $i,$num | ||
837 | blt- Louter | ||
838 | |||
839 | subf $np,$num,$np ; rewind np | ||
840 | addi $j,$j,1 ; restore counter | ||
841 | subfc $i,$i,$i ; j=0 and "clear" XER[CA] | ||
842 | addi $tp,$sp,`$FRAME+$TRANSFER+8` | ||
843 | addi $t4,$sp,`$FRAME+$TRANSFER+16` | ||
844 | addi $t5,$np,8 | ||
845 | addi $t6,$rp,8 | ||
846 | mtctr $j | ||
847 | |||
848 | .align 4 | ||
849 | Lsub: ldx $t0,$tp,$i | ||
850 | ldx $t1,$np,$i | ||
851 | ldx $t2,$t4,$i | ||
852 | ldx $t3,$t5,$i | ||
853 | subfe $t0,$t1,$t0 ; tp[j]-np[j] | ||
854 | subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1] | ||
855 | stdx $t0,$rp,$i | ||
856 | stdx $t2,$t6,$i | ||
857 | addi $i,$i,16 | ||
858 | bdnz- Lsub | ||
859 | |||
860 | li $i,0 | ||
861 | subfe $ovf,$i,$ovf ; handle upmost overflow bit | ||
862 | and $ap,$tp,$ovf | ||
863 | andc $np,$rp,$ovf | ||
864 | or $ap,$ap,$np ; ap=borrow?tp:rp | ||
865 | addi $t7,$ap,8 | ||
866 | mtctr $j | ||
867 | |||
868 | .align 4 | ||
869 | Lcopy: ; copy or in-place refresh | ||
870 | ldx $t0,$ap,$i | ||
871 | ldx $t1,$t7,$i | ||
872 | std $i,8($nap_d) ; zap nap_d | ||
873 | std $i,16($nap_d) | ||
874 | std $i,24($nap_d) | ||
875 | std $i,32($nap_d) | ||
876 | std $i,40($nap_d) | ||
877 | std $i,48($nap_d) | ||
878 | std $i,56($nap_d) | ||
879 | stdu $i,64($nap_d) | ||
880 | stdx $t0,$rp,$i | ||
881 | stdx $t1,$t6,$i | ||
882 | stdx $i,$tp,$i ; zap tp at once | ||
883 | stdx $i,$t4,$i | ||
884 | addi $i,$i,16 | ||
885 | bdnz- Lcopy | ||
886 | |||
887 | $POP r14,`2*$SIZE_T`($sp) | ||
888 | $POP r15,`3*$SIZE_T`($sp) | ||
889 | $POP r16,`4*$SIZE_T`($sp) | ||
890 | $POP r17,`5*$SIZE_T`($sp) | ||
891 | $POP r18,`6*$SIZE_T`($sp) | ||
892 | $POP r19,`7*$SIZE_T`($sp) | ||
893 | $POP r20,`8*$SIZE_T`($sp) | ||
894 | $POP r21,`9*$SIZE_T`($sp) | ||
895 | $POP r22,`10*$SIZE_T`($sp) | ||
896 | $POP r23,`11*$SIZE_T`($sp) | ||
897 | lfd f14,`12*$SIZE_T+0`($sp) | ||
898 | lfd f15,`12*$SIZE_T+8`($sp) | ||
899 | lfd f16,`12*$SIZE_T+16`($sp) | ||
900 | lfd f17,`12*$SIZE_T+24`($sp) | ||
901 | lfd f18,`12*$SIZE_T+32`($sp) | ||
902 | lfd f19,`12*$SIZE_T+40`($sp) | ||
903 | lfd f20,`12*$SIZE_T+48`($sp) | ||
904 | lfd f21,`12*$SIZE_T+56`($sp) | ||
905 | lfd f22,`12*$SIZE_T+64`($sp) | ||
906 | lfd f23,`12*$SIZE_T+72`($sp) | ||
907 | lfd f24,`12*$SIZE_T+80`($sp) | ||
908 | lfd f25,`12*$SIZE_T+88`($sp) | ||
909 | $POP $sp,0($sp) | ||
910 | li r3,1 ; signal "handled" | ||
911 | blr | ||
912 | .long 0 | ||
913 | .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>" | ||
914 | ___ | ||
915 | |||
916 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
917 | print $code; | ||
918 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/s390x-mont.pl b/src/lib/libcrypto/bn/asm/s390x-mont.pl new file mode 100644 index 0000000000..d23251033b --- /dev/null +++ b/src/lib/libcrypto/bn/asm/s390x-mont.pl | |||
@@ -0,0 +1,225 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # April 2007. | ||
11 | # | ||
12 | # Performance improvement over vanilla C code varies from 85% to 45% | ||
13 | # depending on key length and benchmark. Unfortunately in this context | ||
14 | # these are not very impressive results [for code that utilizes "wide" | ||
15 | # 64x64=128-bit multiplication, which is not commonly available to C | ||
16 | # programmers]; at least a hand-coded bn_asm.c replacement is known to | ||
17 | # provide 30-40% better results for the longest keys. On second | ||
18 | # thought it's not very surprising, because z-CPUs are single-issue | ||
19 | # with _strictly_ in-order execution, while bn_mul_mont more or less | ||
20 | # depends on the CPU's ability to pipeline instructions and keep | ||
21 | # several of them "in-flight" at the same time. While other methods, | ||
22 | # for example Karatsuba, aim to minimize the number of multiplications | ||
23 | # at the cost of an increase in other operations, bn_mul_mont aims to | ||
24 | # neatly "overlap" multiplications with the other operations [and on | ||
25 | # most platforms even to minimize the number of the other operations, | ||
26 | # in particular references to memory]. But it is possible to improve | ||
27 | # this module's performance by adding a dedicated squaring code path and | ||
28 | # possibly by unrolling loops... | ||
29 | |||
30 | # January 2009. | ||
31 | # | ||
32 | # Reschedule to minimize/avoid Address Generation Interlock hazard, | ||
33 | # make inner loops counter-based. | ||
34 | |||
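# For orientation, one pass of the .Louter/.Linner code below roughly
# corresponds to the C-level sketch of word-wise Montgomery
# multiplication shown here ("u128" stands for an assumed unsigned
# 128-bit type such as gcc's unsigned __int128; this is an
# illustration, not the exact register-level schedule):
#
#	u128 t  = (u128)ap[0]*bp[i] + tp[0];
#	BN_ULONG m = (BN_ULONG)t*n0;
#	u128 tn = (u128)np[0]*m + (BN_ULONG)t;	/* low word becomes 0 */
#	for (j=1; j<num; j++) {
#		t  = (u128)ap[j]*bp[i] + tp[j] + (BN_ULONG)(t>>64);
#		tn = (u128)np[j]*m + (BN_ULONG)t + (tn>>64);
#		tp[j-1] = (BN_ULONG)tn;
#	}
#	tn = (t>>64) + (tn>>64) + tp[num];	/* tp[num] holds the carry */
#	tp[num-1] = (BN_ULONG)tn;
#	tp[num]   = (BN_ULONG)(tn>>64);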
35 | $mn0="%r0"; | ||
36 | $num="%r1"; | ||
37 | |||
38 | # int bn_mul_mont( | ||
39 | $rp="%r2"; # BN_ULONG *rp, | ||
40 | $ap="%r3"; # const BN_ULONG *ap, | ||
41 | $bp="%r4"; # const BN_ULONG *bp, | ||
42 | $np="%r5"; # const BN_ULONG *np, | ||
43 | $n0="%r6"; # const BN_ULONG *n0, | ||
44 | #$num="160(%r15)" # int num); | ||
45 | |||
46 | $bi="%r2"; # zaps rp | ||
47 | $j="%r7"; | ||
48 | |||
49 | $ahi="%r8"; | ||
50 | $alo="%r9"; | ||
51 | $nhi="%r10"; | ||
52 | $nlo="%r11"; | ||
53 | $AHI="%r12"; | ||
54 | $NHI="%r13"; | ||
55 | $count="%r14"; | ||
56 | $sp="%r15"; | ||
57 | |||
58 | $code.=<<___; | ||
59 | .text | ||
60 | .globl bn_mul_mont | ||
61 | .type bn_mul_mont,\@function | ||
62 | bn_mul_mont: | ||
63 | lgf $num,164($sp) # pull $num | ||
64 | sla $num,3 # $num to enumerate bytes | ||
65 | la $bp,0($num,$bp) | ||
66 | |||
67 | stg %r2,16($sp) | ||
68 | |||
69 | cghi $num,16 # | ||
70 | lghi %r2,0 # | ||
71 | blr %r14 # if($num<16) return 0; | ||
72 | cghi $num,128 # | ||
73 | bhr %r14 # if($num>128) return 0; | ||
74 | |||
75 | stmg %r3,%r15,24($sp) | ||
76 | |||
77 | lghi $rp,-160-8 # leave room for carry bit | ||
78 | lcgr $j,$num # -$num | ||
79 | lgr %r0,$sp | ||
80 | la $rp,0($rp,$sp) | ||
81 | la $sp,0($j,$rp) # alloca | ||
82 | stg %r0,0($sp) # back chain | ||
83 | |||
84 | sra $num,3 # restore $num | ||
85 | la $bp,0($j,$bp) # restore $bp | ||
86 | ahi $num,-1 # adjust $num for inner loop | ||
87 | lg $n0,0($n0) # pull n0 | ||
88 | |||
89 | lg $bi,0($bp) | ||
90 | lg $alo,0($ap) | ||
91 | mlgr $ahi,$bi # ap[0]*bp[0] | ||
92 | lgr $AHI,$ahi | ||
93 | |||
94 | lgr $mn0,$alo # "tp[0]"*n0 | ||
95 | msgr $mn0,$n0 | ||
96 | |||
97 | lg $nlo,0($np) # | ||
98 | mlgr $nhi,$mn0 # np[0]*m1 | ||
99 | algr $nlo,$alo # +="tp[0]" | ||
100 | lghi $NHI,0 | ||
101 | alcgr $NHI,$nhi | ||
102 | |||
103 | la $j,8(%r0) # j=1 | ||
104 | lr $count,$num | ||
105 | |||
106 | .align 16 | ||
107 | .L1st: | ||
108 | lg $alo,0($j,$ap) | ||
109 | mlgr $ahi,$bi # ap[j]*bp[0] | ||
110 | algr $alo,$AHI | ||
111 | lghi $AHI,0 | ||
112 | alcgr $AHI,$ahi | ||
113 | |||
114 | lg $nlo,0($j,$np) | ||
115 | mlgr $nhi,$mn0 # np[j]*m1 | ||
116 | algr $nlo,$NHI | ||
117 | lghi $NHI,0 | ||
118 | alcgr $nhi,$NHI # +="tp[j]" | ||
119 | algr $nlo,$alo | ||
120 | alcgr $NHI,$nhi | ||
121 | |||
122 | stg $nlo,160-8($j,$sp) # tp[j-1]= | ||
123 | la $j,8($j) # j++ | ||
124 | brct $count,.L1st | ||
125 | |||
126 | algr $NHI,$AHI | ||
127 | lghi $AHI,0 | ||
128 | alcgr $AHI,$AHI # upmost overflow bit | ||
129 | stg $NHI,160-8($j,$sp) | ||
130 | stg $AHI,160($j,$sp) | ||
131 | la $bp,8($bp) # bp++ | ||
132 | |||
133 | .Louter: | ||
134 | lg $bi,0($bp) # bp[i] | ||
135 | lg $alo,0($ap) | ||
136 | mlgr $ahi,$bi # ap[0]*bp[i] | ||
137 | alg $alo,160($sp) # +=tp[0] | ||
138 | lghi $AHI,0 | ||
139 | alcgr $AHI,$ahi | ||
140 | |||
141 | lgr $mn0,$alo | ||
142 | msgr $mn0,$n0 # tp[0]*n0 | ||
143 | |||
144 | lg $nlo,0($np) # np[0] | ||
145 | mlgr $nhi,$mn0 # np[0]*m1 | ||
146 | algr $nlo,$alo # +="tp[0]" | ||
147 | lghi $NHI,0 | ||
148 | alcgr $NHI,$nhi | ||
149 | |||
150 | la $j,8(%r0) # j=1 | ||
151 | lr $count,$num | ||
152 | |||
153 | .align 16 | ||
154 | .Linner: | ||
155 | lg $alo,0($j,$ap) | ||
156 | mlgr $ahi,$bi # ap[j]*bp[i] | ||
157 | algr $alo,$AHI | ||
158 | lghi $AHI,0 | ||
159 | alcgr $ahi,$AHI | ||
160 | alg $alo,160($j,$sp)# +=tp[j] | ||
161 | alcgr $AHI,$ahi | ||
162 | |||
163 | lg $nlo,0($j,$np) | ||
164 | mlgr $nhi,$mn0 # np[j]*m1 | ||
165 | algr $nlo,$NHI | ||
166 | lghi $NHI,0 | ||
167 | alcgr $nhi,$NHI | ||
168 | algr $nlo,$alo # +="tp[j]" | ||
169 | alcgr $NHI,$nhi | ||
170 | |||
171 | stg $nlo,160-8($j,$sp) # tp[j-1]= | ||
172 | la $j,8($j) # j++ | ||
173 | brct $count,.Linner | ||
174 | |||
175 | algr $NHI,$AHI | ||
176 | lghi $AHI,0 | ||
177 | alcgr $AHI,$AHI | ||
178 | alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit | ||
179 | lghi $ahi,0 | ||
180 | alcgr $AHI,$ahi # new upmost overflow bit | ||
181 | stg $NHI,160-8($j,$sp) | ||
182 | stg $AHI,160($j,$sp) | ||
183 | |||
184 | la $bp,8($bp) # bp++ | ||
185 | clg $bp,160+8+32($j,$sp) # compare to &bp[num] | ||
186 | jne .Louter | ||
187 | |||
188 | lg $rp,160+8+16($j,$sp) # reincarnate rp | ||
189 | la $ap,160($sp) | ||
190 | ahi $num,1 # restore $num, incidentally clears "borrow" | ||
191 | |||
192 | la $j,0(%r0) | ||
193 | lr $count,$num | ||
194 | .Lsub: lg $alo,0($j,$ap) | ||
195 | slbg $alo,0($j,$np) | ||
196 | stg $alo,0($j,$rp) | ||
197 | la $j,8($j) | ||
198 | brct $count,.Lsub | ||
199 | lghi $ahi,0 | ||
200 | slbgr $AHI,$ahi # handle upmost carry | ||
201 | |||
202 | ngr $ap,$AHI | ||
203 | lghi $np,-1 | ||
204 | xgr $np,$AHI | ||
205 | ngr $np,$rp | ||
206 | ogr $ap,$np # ap=borrow?tp:rp | ||
207 | |||
208 | la $j,0(%r0) | ||
209 | lgr $count,$num | ||
210 | .Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh | ||
211 | stg $j,160($j,$sp) # zap tp | ||
212 | stg $alo,0($j,$rp) | ||
213 | la $j,8($j) | ||
214 | brct $count,.Lcopy | ||
215 | |||
216 | la %r1,160+8+48($j,$sp) | ||
217 | lmg %r6,%r15,0(%r1) | ||
218 | lghi %r2,1 # signal "processed" | ||
219 | br %r14 | ||
220 | .size bn_mul_mont,.-bn_mul_mont | ||
221 | .string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" | ||
222 | ___ | ||
223 | |||
224 | print $code; | ||
225 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/s390x.S b/src/lib/libcrypto/bn/asm/s390x.S new file mode 100755 index 0000000000..8f45f5d513 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/s390x.S | |||
@@ -0,0 +1,678 @@ | |||
1 | .ident "s390x.S, version 1.0" | ||
2 | // ==================================================================== | ||
3 | // Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
4 | // project. | ||
5 | // | ||
6 | // Rights for redistribution and usage in source and binary forms are | ||
7 | // granted according to the OpenSSL license. Warranty of any kind is | ||
8 | // disclaimed. | ||
9 | // ==================================================================== | ||
10 | |||
11 | .text | ||
12 | |||
13 | #define zero %r0 | ||
14 | |||
15 | // BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5); | ||
16 | .globl bn_mul_add_words | ||
17 | .type bn_mul_add_words,@function | ||
18 | .align 4 | ||
19 | bn_mul_add_words: | ||
20 | lghi zero,0 // zero = 0 | ||
21 | la %r1,0(%r2) // put rp aside | ||
22 | lghi %r2,0 // i=0; | ||
23 | ltgfr %r4,%r4 | ||
24 | bler %r14 // if (len<=0) return 0; | ||
25 | |||
26 | stmg %r6,%r10,48(%r15) | ||
27 | lghi %r8,0 // carry = 0 | ||
28 | srag %r10,%r4,2 // cnt=len/4 | ||
29 | jz .Loop1_madd | ||
30 | |||
31 | .Loop4_madd: | ||
32 | lg %r7,0(%r2,%r3) // ap[i] | ||
33 | mlgr %r6,%r5 // *=w | ||
34 | algr %r7,%r8 // +=carry | ||
35 | alcgr %r6,zero | ||
36 | alg %r7,0(%r2,%r1) // +=rp[i] | ||
37 | alcgr %r6,zero | ||
38 | stg %r7,0(%r2,%r1) // rp[i]= | ||
39 | |||
40 | lg %r9,8(%r2,%r3) | ||
41 | mlgr %r8,%r5 | ||
42 | algr %r9,%r6 | ||
43 | alcgr %r8,zero | ||
44 | alg %r9,8(%r2,%r1) | ||
45 | alcgr %r8,zero | ||
46 | stg %r9,8(%r2,%r1) | ||
47 | |||
48 | lg %r7,16(%r2,%r3) | ||
49 | mlgr %r6,%r5 | ||
50 | algr %r7,%r8 | ||
51 | alcgr %r6,zero | ||
52 | alg %r7,16(%r2,%r1) | ||
53 | alcgr %r6,zero | ||
54 | stg %r7,16(%r2,%r1) | ||
55 | |||
56 | lg %r9,24(%r2,%r3) | ||
57 | mlgr %r8,%r5 | ||
58 | algr %r9,%r6 | ||
59 | alcgr %r8,zero | ||
60 | alg %r9,24(%r2,%r1) | ||
61 | alcgr %r8,zero | ||
62 | stg %r9,24(%r2,%r1) | ||
63 | |||
64 | la %r2,32(%r2) // i+=4 | ||
65 | brct %r10,.Loop4_madd | ||
66 | |||
67 | lghi %r10,3 | ||
68 | nr %r4,%r10 // cnt=len%4 | ||
69 | jz .Lend_madd | ||
70 | |||
71 | .Loop1_madd: | ||
72 | lg %r7,0(%r2,%r3) // ap[i] | ||
73 | mlgr %r6,%r5 // *=w | ||
74 | algr %r7,%r8 // +=carry | ||
75 | alcgr %r6,zero | ||
76 | alg %r7,0(%r2,%r1) // +=rp[i] | ||
77 | alcgr %r6,zero | ||
78 | stg %r7,0(%r2,%r1) // rp[i]= | ||
79 | |||
80 | lgr %r8,%r6 | ||
81 | la %r2,8(%r2) // i++ | ||
82 | brct %r4,.Loop1_madd | ||
83 | |||
84 | .Lend_madd: | ||
85 | lgr %r2,%r8 | ||
86 | lmg %r6,%r10,48(%r15) | ||
87 | br %r14 | ||
88 | .size bn_mul_add_words,.-bn_mul_add_words | ||
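// For reference, the mlgr-based loop above computes essentially the
// following C (a sketch; "u128" is an assumed unsigned 128-bit type,
// and rp/ap/num/w name the %r2/%r3/%r4/%r5 arguments):
//
//	BN_ULONG c=0;
//	while (num-- > 0) {
//		u128 t = (u128)(*ap++)*w + *rp + c;
//		*rp++ = (BN_ULONG)t;
//		c = (BN_ULONG)(t>>64);
//	}
//	return c;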
89 | |||
90 | // BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5); | ||
91 | .globl bn_mul_words | ||
92 | .type bn_mul_words,@function | ||
93 | .align 4 | ||
94 | bn_mul_words: | ||
95 | lghi zero,0 // zero = 0 | ||
96 | la %r1,0(%r2) // put rp aside | ||
97 | lghi %r2,0 // i=0; | ||
98 | ltgfr %r4,%r4 | ||
99 | bler %r14 // if (len<=0) return 0; | ||
100 | |||
101 | stmg %r6,%r10,48(%r15) | ||
102 | lghi %r8,0 // carry = 0 | ||
103 | srag %r10,%r4,2 // cnt=len/4 | ||
104 | jz .Loop1_mul | ||
105 | |||
106 | .Loop4_mul: | ||
107 | lg %r7,0(%r2,%r3) // ap[i] | ||
108 | mlgr %r6,%r5 // *=w | ||
109 | algr %r7,%r8 // +=carry | ||
110 | alcgr %r6,zero | ||
111 | stg %r7,0(%r2,%r1) // rp[i]= | ||
112 | |||
113 | lg %r9,8(%r2,%r3) | ||
114 | mlgr %r8,%r5 | ||
115 | algr %r9,%r6 | ||
116 | alcgr %r8,zero | ||
117 | stg %r9,8(%r2,%r1) | ||
118 | |||
119 | lg %r7,16(%r2,%r3) | ||
120 | mlgr %r6,%r5 | ||
121 | algr %r7,%r8 | ||
122 | alcgr %r6,zero | ||
123 | stg %r7,16(%r2,%r1) | ||
124 | |||
125 | lg %r9,24(%r2,%r3) | ||
126 | mlgr %r8,%r5 | ||
127 | algr %r9,%r6 | ||
128 | alcgr %r8,zero | ||
129 | stg %r9,24(%r2,%r1) | ||
130 | |||
131 | la %r2,32(%r2) // i+=4 | ||
132 | brct %r10,.Loop4_mul | ||
133 | |||
134 | lghi %r10,3 | ||
135 | nr %r4,%r10 // cnt=len%4 | ||
136 | jz .Lend_mul | ||
137 | |||
138 | .Loop1_mul: | ||
139 | lg %r7,0(%r2,%r3) // ap[i] | ||
140 | mlgr %r6,%r5 // *=w | ||
141 | algr %r7,%r8 // +=carry | ||
142 | alcgr %r6,zero | ||
143 | stg %r7,0(%r2,%r1) // rp[i]= | ||
144 | |||
145 | lgr %r8,%r6 | ||
146 | la %r2,8(%r2) // i++ | ||
147 | brct %r4,.Loop1_mul | ||
148 | |||
149 | .Lend_mul: | ||
150 | lgr %r2,%r8 | ||
151 | lmg %r6,%r10,48(%r15) | ||
152 | br %r14 | ||
153 | .size bn_mul_words,.-bn_mul_words | ||
154 | |||
155 | // void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r2,int r4) | ||
156 | .globl bn_sqr_words | ||
157 | .type bn_sqr_words,@function | ||
158 | .align 4 | ||
159 | bn_sqr_words: | ||
160 | ltgfr %r4,%r4 | ||
161 | bler %r14 | ||
162 | |||
163 | stmg %r6,%r7,48(%r15) | ||
164 | srag %r1,%r4,2 // cnt=len/4 | ||
165 | jz .Loop1_sqr | ||
166 | |||
167 | .Loop4_sqr: | ||
168 | lg %r7,0(%r3) | ||
169 | mlgr %r6,%r7 | ||
170 | stg %r7,0(%r2) | ||
171 | stg %r6,8(%r2) | ||
172 | |||
173 | lg %r7,8(%r3) | ||
174 | mlgr %r6,%r7 | ||
175 | stg %r7,16(%r2) | ||
176 | stg %r6,24(%r2) | ||
177 | |||
178 | lg %r7,16(%r3) | ||
179 | mlgr %r6,%r7 | ||
180 | stg %r7,32(%r2) | ||
181 | stg %r6,40(%r2) | ||
182 | |||
183 | lg %r7,24(%r3) | ||
184 | mlgr %r6,%r7 | ||
185 | stg %r7,48(%r2) | ||
186 | stg %r6,56(%r2) | ||
187 | |||
188 | la %r3,32(%r3) | ||
189 | la %r2,64(%r2) | ||
190 | brct %r1,.Loop4_sqr | ||
191 | |||
192 | lghi %r1,3 | ||
193 | nr %r4,%r1 // cnt=len%4 | ||
194 | jz .Lend_sqr | ||
195 | |||
196 | .Loop1_sqr: | ||
197 | lg %r7,0(%r3) | ||
198 | mlgr %r6,%r7 | ||
199 | stg %r7,0(%r2) | ||
200 | stg %r6,8(%r2) | ||
201 | |||
202 | la %r3,8(%r3) | ||
203 | la %r2,16(%r2) | ||
204 | brct %r4,.Loop1_sqr | ||
205 | |||
206 | .Lend_sqr: | ||
207 | lmg %r6,%r7,48(%r15) | ||
208 | br %r14 | ||
209 | .size bn_sqr_words,.-bn_sqr_words | ||
210 | |||
211 | // BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d); | ||
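// i.e. it returns the 64-bit quotient of the 128-bit value h:l divided
// by d (dlgr leaves the quotient in %r3 and the remainder in %r2, hence
// the final lgr), presumably assuming h < d so that the quotient fits.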
212 | .globl bn_div_words | ||
213 | .type bn_div_words,@function | ||
214 | .align 4 | ||
215 | bn_div_words: | ||
216 | dlgr %r2,%r4 | ||
217 | lgr %r2,%r3 | ||
218 | br %r14 | ||
219 | .size bn_div_words,.-bn_div_words | ||
220 | |||
221 | // BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5); | ||
222 | .globl bn_add_words | ||
223 | .type bn_add_words,@function | ||
224 | .align 4 | ||
225 | bn_add_words: | ||
226 | la %r1,0(%r2) // put rp aside | ||
227 | lghi %r2,0 // i=0 | ||
228 | ltgfr %r5,%r5 | ||
229 | bler %r14 // if (len<=0) return 0; | ||
230 | |||
231 | stg %r6,48(%r15) | ||
232 | lghi %r6,3 | ||
233 | nr %r6,%r5 // len%4 | ||
234 | sra %r5,2 // len/4, use sra because it sets condition code | ||
235 | jz .Loop1_add // carry is incidentally cleared if branch taken | ||
236 | algr %r2,%r2 // clear carry | ||
237 | |||
238 | .Loop4_add: | ||
239 | lg %r0,0(%r2,%r3) | ||
240 | alcg %r0,0(%r2,%r4) | ||
241 | stg %r0,0(%r2,%r1) | ||
242 | lg %r0,8(%r2,%r3) | ||
243 | alcg %r0,8(%r2,%r4) | ||
244 | stg %r0,8(%r2,%r1) | ||
245 | lg %r0,16(%r2,%r3) | ||
246 | alcg %r0,16(%r2,%r4) | ||
247 | stg %r0,16(%r2,%r1) | ||
248 | lg %r0,24(%r2,%r3) | ||
249 | alcg %r0,24(%r2,%r4) | ||
250 | stg %r0,24(%r2,%r1) | ||
251 | |||
252 | la %r2,32(%r2) // i+=4 | ||
253 | brct %r5,.Loop4_add | ||
254 | |||
255 | la %r6,1(%r6) // see if len%4 is zero ... | ||
256 | brct %r6,.Loop1_add // without touching condition code:-) | ||
257 | |||
258 | .Lexit_add: | ||
259 | lghi %r2,0 | ||
260 | alcgr %r2,%r2 | ||
261 | lg %r6,48(%r15) | ||
262 | br %r14 | ||
263 | |||
264 | .Loop1_add: | ||
265 | lg %r0,0(%r2,%r3) | ||
266 | alcg %r0,0(%r2,%r4) | ||
267 | stg %r0,0(%r2,%r1) | ||
268 | |||
269 | la %r2,8(%r2) // i++ | ||
270 | brct %r6,.Loop1_add | ||
271 | |||
272 | j .Lexit_add | ||
273 | .size bn_add_words,.-bn_add_words | ||
274 | |||
275 | // BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5); | ||
276 | .globl bn_sub_words | ||
277 | .type bn_sub_words,@function | ||
278 | .align 4 | ||
279 | bn_sub_words: | ||
280 | la %r1,0(%r2) // put rp aside | ||
281 | lghi %r2,0 // i=0 | ||
282 | ltgfr %r5,%r5 | ||
283 | bler %r14 // if (len<=0) return 0; | ||
284 | |||
285 | stg %r6,48(%r15) | ||
286 | lghi %r6,3 | ||
287 | nr %r6,%r5 // len%4 | ||
288 | sra %r5,2 // len/4, use sra because it sets condition code | ||
289 | jnz .Loop4_sub // borrow is incidentally cleared if branch taken | ||
290 | slgr %r2,%r2 // clear borrow | ||
291 | |||
292 | .Loop1_sub: | ||
293 | lg %r0,0(%r2,%r3) | ||
294 | slbg %r0,0(%r2,%r4) | ||
295 | stg %r0,0(%r2,%r1) | ||
296 | |||
297 | la %r2,8(%r2) // i++ | ||
298 | brct %r6,.Loop1_sub | ||
299 | j .Lexit_sub | ||
300 | |||
301 | .Loop4_sub: | ||
302 | lg %r0,0(%r2,%r3) | ||
303 | slbg %r0,0(%r2,%r4) | ||
304 | stg %r0,0(%r2,%r1) | ||
305 | lg %r0,8(%r2,%r3) | ||
306 | slbg %r0,8(%r2,%r4) | ||
307 | stg %r0,8(%r2,%r1) | ||
308 | lg %r0,16(%r2,%r3) | ||
309 | slbg %r0,16(%r2,%r4) | ||
310 | stg %r0,16(%r2,%r1) | ||
311 | lg %r0,24(%r2,%r3) | ||
312 | slbg %r0,24(%r2,%r4) | ||
313 | stg %r0,24(%r2,%r1) | ||
314 | |||
315 | la %r2,32(%r2) // i+=4 | ||
316 | brct %r5,.Loop4_sub | ||
317 | |||
318 | la %r6,1(%r6) // see if len%4 is zero ... | ||
319 | brct %r6,.Loop1_sub // without touching condition code:-) | ||
320 | |||
321 | .Lexit_sub: | ||
322 | lghi %r2,0 | ||
323 | slbgr %r2,%r2 | ||
324 | lcgr %r2,%r2 | ||
325 | lg %r6,48(%r15) | ||
326 | br %r14 | ||
327 | .size bn_sub_words,.-bn_sub_words | ||
328 | |||
329 | #define c1 %r1 | ||
330 | #define c2 %r5 | ||
331 | #define c3 %r8 | ||
332 | |||
333 | #define mul_add_c(ai,bi,c1,c2,c3) \ | ||
334 | lg %r7,ai*8(%r3); \ | ||
335 | mlg %r6,bi*8(%r4); \ | ||
336 | algr c1,%r7; \ | ||
337 | alcgr c2,%r6; \ | ||
338 | alcgr c3,zero | ||
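// i.e. the 192-bit accumulator (c3:c2:c1) += a[ai]*b[bi] (a full
// 128-bit product), with the two alcgr instructions rippling the
// carries upwards.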
339 | |||
340 | // void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4); | ||
341 | .globl bn_mul_comba8 | ||
342 | .type bn_mul_comba8,@function | ||
343 | .align 4 | ||
344 | bn_mul_comba8: | ||
345 | stmg %r6,%r8,48(%r15) | ||
346 | |||
347 | lghi c1,0 | ||
348 | lghi c2,0 | ||
349 | lghi c3,0 | ||
350 | lghi zero,0 | ||
351 | |||
352 | mul_add_c(0,0,c1,c2,c3); | ||
353 | stg c1,0*8(%r2) | ||
354 | lghi c1,0 | ||
355 | |||
356 | mul_add_c(0,1,c2,c3,c1); | ||
357 | mul_add_c(1,0,c2,c3,c1); | ||
358 | stg c2,1*8(%r2) | ||
359 | lghi c2,0 | ||
360 | |||
361 | mul_add_c(2,0,c3,c1,c2); | ||
362 | mul_add_c(1,1,c3,c1,c2); | ||
363 | mul_add_c(0,2,c3,c1,c2); | ||
364 | stg c3,2*8(%r2) | ||
365 | lghi c3,0 | ||
366 | |||
367 | mul_add_c(0,3,c1,c2,c3); | ||
368 | mul_add_c(1,2,c1,c2,c3); | ||
369 | mul_add_c(2,1,c1,c2,c3); | ||
370 | mul_add_c(3,0,c1,c2,c3); | ||
371 | stg c1,3*8(%r2) | ||
372 | lghi c1,0 | ||
373 | |||
374 | mul_add_c(4,0,c2,c3,c1); | ||
375 | mul_add_c(3,1,c2,c3,c1); | ||
376 | mul_add_c(2,2,c2,c3,c1); | ||
377 | mul_add_c(1,3,c2,c3,c1); | ||
378 | mul_add_c(0,4,c2,c3,c1); | ||
379 | stg c2,4*8(%r2) | ||
380 | lghi c2,0 | ||
381 | |||
382 | mul_add_c(0,5,c3,c1,c2); | ||
383 | mul_add_c(1,4,c3,c1,c2); | ||
384 | mul_add_c(2,3,c3,c1,c2); | ||
385 | mul_add_c(3,2,c3,c1,c2); | ||
386 | mul_add_c(4,1,c3,c1,c2); | ||
387 | mul_add_c(5,0,c3,c1,c2); | ||
388 | stg c3,5*8(%r2) | ||
389 | lghi c3,0 | ||
390 | |||
391 | mul_add_c(6,0,c1,c2,c3); | ||
392 | mul_add_c(5,1,c1,c2,c3); | ||
393 | mul_add_c(4,2,c1,c2,c3); | ||
394 | mul_add_c(3,3,c1,c2,c3); | ||
395 | mul_add_c(2,4,c1,c2,c3); | ||
396 | mul_add_c(1,5,c1,c2,c3); | ||
397 | mul_add_c(0,6,c1,c2,c3); | ||
398 | stg c1,6*8(%r2) | ||
399 | lghi c1,0 | ||
400 | |||
401 | mul_add_c(0,7,c2,c3,c1); | ||
402 | mul_add_c(1,6,c2,c3,c1); | ||
403 | mul_add_c(2,5,c2,c3,c1); | ||
404 | mul_add_c(3,4,c2,c3,c1); | ||
405 | mul_add_c(4,3,c2,c3,c1); | ||
406 | mul_add_c(5,2,c2,c3,c1); | ||
407 | mul_add_c(6,1,c2,c3,c1); | ||
408 | mul_add_c(7,0,c2,c3,c1); | ||
409 | stg c2,7*8(%r2) | ||
410 | lghi c2,0 | ||
411 | |||
412 | mul_add_c(7,1,c3,c1,c2); | ||
413 | mul_add_c(6,2,c3,c1,c2); | ||
414 | mul_add_c(5,3,c3,c1,c2); | ||
415 | mul_add_c(4,4,c3,c1,c2); | ||
416 | mul_add_c(3,5,c3,c1,c2); | ||
417 | mul_add_c(2,6,c3,c1,c2); | ||
418 | mul_add_c(1,7,c3,c1,c2); | ||
419 | stg c3,8*8(%r2) | ||
420 | lghi c3,0 | ||
421 | |||
422 | mul_add_c(2,7,c1,c2,c3); | ||
423 | mul_add_c(3,6,c1,c2,c3); | ||
424 | mul_add_c(4,5,c1,c2,c3); | ||
425 | mul_add_c(5,4,c1,c2,c3); | ||
426 | mul_add_c(6,3,c1,c2,c3); | ||
427 | mul_add_c(7,2,c1,c2,c3); | ||
428 | stg c1,9*8(%r2) | ||
429 | lghi c1,0 | ||
430 | |||
431 | mul_add_c(7,3,c2,c3,c1); | ||
432 | mul_add_c(6,4,c2,c3,c1); | ||
433 | mul_add_c(5,5,c2,c3,c1); | ||
434 | mul_add_c(4,6,c2,c3,c1); | ||
435 | mul_add_c(3,7,c2,c3,c1); | ||
436 | stg c2,10*8(%r2) | ||
437 | lghi c2,0 | ||
438 | |||
439 | mul_add_c(4,7,c3,c1,c2); | ||
440 | mul_add_c(5,6,c3,c1,c2); | ||
441 | mul_add_c(6,5,c3,c1,c2); | ||
442 | mul_add_c(7,4,c3,c1,c2); | ||
443 | stg c3,11*8(%r2) | ||
444 | lghi c3,0 | ||
445 | |||
446 | mul_add_c(7,5,c1,c2,c3); | ||
447 | mul_add_c(6,6,c1,c2,c3); | ||
448 | mul_add_c(5,7,c1,c2,c3); | ||
449 | stg c1,12*8(%r2) | ||
450 | lghi c1,0 | ||
451 | |||
452 | |||
453 | mul_add_c(6,7,c2,c3,c1); | ||
454 | mul_add_c(7,6,c2,c3,c1); | ||
455 | stg c2,13*8(%r2) | ||
456 | lghi c2,0 | ||
457 | |||
458 | mul_add_c(7,7,c3,c1,c2); | ||
459 | stg c3,14*8(%r2) | ||
460 | stg c1,15*8(%r2) | ||
461 | |||
462 | lmg %r6,%r8,48(%r15) | ||
463 | br %r14 | ||
464 | .size bn_mul_comba8,.-bn_mul_comba8 | ||
465 | |||
466 | // void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4); | ||
467 | .globl bn_mul_comba4 | ||
468 | .type bn_mul_comba4,@function | ||
469 | .align 4 | ||
470 | bn_mul_comba4: | ||
471 | stmg %r6,%r8,48(%r15) | ||
472 | |||
473 | lghi c1,0 | ||
474 | lghi c2,0 | ||
475 | lghi c3,0 | ||
476 | lghi zero,0 | ||
477 | |||
478 | mul_add_c(0,0,c1,c2,c3); | ||
479 | stg c1,0*8(%r2) | ||
480 | lghi c1,0 | ||
481 | |||
482 | mul_add_c(0,1,c2,c3,c1); | ||
483 | mul_add_c(1,0,c2,c3,c1); | ||
484 | stg c2,1*8(%r2) | ||
485 | lghi c2,0 | ||
486 | |||
487 | mul_add_c(2,0,c3,c1,c2); | ||
488 | mul_add_c(1,1,c3,c1,c2); | ||
489 | mul_add_c(0,2,c3,c1,c2); | ||
490 | stg c3,2*8(%r2) | ||
491 | lghi c3,0 | ||
492 | |||
493 | mul_add_c(0,3,c1,c2,c3); | ||
494 | mul_add_c(1,2,c1,c2,c3); | ||
495 | mul_add_c(2,1,c1,c2,c3); | ||
496 | mul_add_c(3,0,c1,c2,c3); | ||
497 | stg c1,3*8(%r2) | ||
498 | lghi c1,0 | ||
499 | |||
500 | mul_add_c(3,1,c2,c3,c1); | ||
501 | mul_add_c(2,2,c2,c3,c1); | ||
502 | mul_add_c(1,3,c2,c3,c1); | ||
503 | stg c2,4*8(%r2) | ||
504 | lghi c2,0 | ||
505 | |||
506 | mul_add_c(2,3,c3,c1,c2); | ||
507 | mul_add_c(3,2,c3,c1,c2); | ||
508 | stg c3,5*8(%r2) | ||
509 | lghi c3,0 | ||
510 | |||
511 | mul_add_c(3,3,c1,c2,c3); | ||
512 | stg c1,6*8(%r2) | ||
513 | stg c2,7*8(%r2) | ||
514 | |||
515 | lmg %r6,%r8,48(%r15) | ||
516 | br %r14 | ||
517 | .size bn_mul_comba4,.-bn_mul_comba4 | ||
518 | |||
519 | #define sqr_add_c(ai,c1,c2,c3) \ | ||
520 | lg %r7,ai*8(%r3); \ | ||
521 | mlgr %r6,%r7; \ | ||
522 | algr c1,%r7; \ | ||
523 | alcgr c2,%r6; \ | ||
524 | alcgr c3,zero | ||
525 | |||
526 | #define sqr_add_c2(ai,aj,c1,c2,c3) \ | ||
527 | lg %r7,ai*8(%r3); \ | ||
528 | mlg %r6,aj*8(%r3); \ | ||
529 | algr c1,%r7; \ | ||
530 | alcgr c2,%r6; \ | ||
531 | alcgr c3,zero; \ | ||
532 | algr c1,%r7; \ | ||
533 | alcgr c2,%r6; \ | ||
534 | alcgr c3,zero | ||
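// sqr_add_c adds a[ai]^2 once into the (c3:c2:c1) accumulator, while
// sqr_add_c2 adds the cross product a[ai]*a[aj] twice, since in a
// squaring a[i]*a[j] and a[j]*a[i] coincide and only one of the pair
// is actually computed.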
535 | |||
536 | // void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3); | ||
537 | .globl bn_sqr_comba8 | ||
538 | .type bn_sqr_comba8,@function | ||
539 | .align 4 | ||
540 | bn_sqr_comba8: | ||
541 | stmg %r6,%r8,48(%r15) | ||
542 | |||
543 | lghi c1,0 | ||
544 | lghi c2,0 | ||
545 | lghi c3,0 | ||
546 | lghi zero,0 | ||
547 | |||
548 | sqr_add_c(0,c1,c2,c3); | ||
549 | stg c1,0*8(%r2) | ||
550 | lghi c1,0 | ||
551 | |||
552 | sqr_add_c2(1,0,c2,c3,c1); | ||
553 | stg c2,1*8(%r2) | ||
554 | lghi c2,0 | ||
555 | |||
556 | sqr_add_c(1,c3,c1,c2); | ||
557 | sqr_add_c2(2,0,c3,c1,c2); | ||
558 | stg c3,2*8(%r2) | ||
559 | lghi c3,0 | ||
560 | |||
561 | sqr_add_c2(3,0,c1,c2,c3); | ||
562 | sqr_add_c2(2,1,c1,c2,c3); | ||
563 | stg c1,3*8(%r2) | ||
564 | lghi c1,0 | ||
565 | |||
566 | sqr_add_c(2,c2,c3,c1); | ||
567 | sqr_add_c2(3,1,c2,c3,c1); | ||
568 | sqr_add_c2(4,0,c2,c3,c1); | ||
569 | stg c2,4*8(%r2) | ||
570 | lghi c2,0 | ||
571 | |||
572 | sqr_add_c2(5,0,c3,c1,c2); | ||
573 | sqr_add_c2(4,1,c3,c1,c2); | ||
574 | sqr_add_c2(3,2,c3,c1,c2); | ||
575 | stg c3,5*8(%r2) | ||
576 | lghi c3,0 | ||
577 | |||
578 | sqr_add_c(3,c1,c2,c3); | ||
579 | sqr_add_c2(4,2,c1,c2,c3); | ||
580 | sqr_add_c2(5,1,c1,c2,c3); | ||
581 | sqr_add_c2(6,0,c1,c2,c3); | ||
582 | stg c1,6*8(%r2) | ||
583 | lghi c1,0 | ||
584 | |||
585 | sqr_add_c2(7,0,c2,c3,c1); | ||
586 | sqr_add_c2(6,1,c2,c3,c1); | ||
587 | sqr_add_c2(5,2,c2,c3,c1); | ||
588 | sqr_add_c2(4,3,c2,c3,c1); | ||
589 | stg c2,7*8(%r2) | ||
590 | lghi c2,0 | ||
591 | |||
592 | sqr_add_c(4,c3,c1,c2); | ||
593 | sqr_add_c2(5,3,c3,c1,c2); | ||
594 | sqr_add_c2(6,2,c3,c1,c2); | ||
595 | sqr_add_c2(7,1,c3,c1,c2); | ||
596 | stg c3,8*8(%r2) | ||
597 | lghi c3,0 | ||
598 | |||
599 | sqr_add_c2(7,2,c1,c2,c3); | ||
600 | sqr_add_c2(6,3,c1,c2,c3); | ||
601 | sqr_add_c2(5,4,c1,c2,c3); | ||
602 | stg c1,9*8(%r2) | ||
603 | lghi c1,0 | ||
604 | |||
605 | sqr_add_c(5,c2,c3,c1); | ||
606 | sqr_add_c2(6,4,c2,c3,c1); | ||
607 | sqr_add_c2(7,3,c2,c3,c1); | ||
608 | stg c2,10*8(%r2) | ||
609 | lghi c2,0 | ||
610 | |||
611 | sqr_add_c2(7,4,c3,c1,c2); | ||
612 | sqr_add_c2(6,5,c3,c1,c2); | ||
613 | stg c3,11*8(%r2) | ||
614 | lghi c3,0 | ||
615 | |||
616 | sqr_add_c(6,c1,c2,c3); | ||
617 | sqr_add_c2(7,5,c1,c2,c3); | ||
618 | stg c1,12*8(%r2) | ||
619 | lghi c1,0 | ||
620 | |||
621 | sqr_add_c2(7,6,c2,c3,c1); | ||
622 | stg c2,13*8(%r2) | ||
623 | lghi c2,0 | ||
624 | |||
625 | sqr_add_c(7,c3,c1,c2); | ||
626 | stg c3,14*8(%r2) | ||
627 | stg c1,15*8(%r2) | ||
628 | |||
629 | lmg %r6,%r8,48(%r15) | ||
630 | br %r14 | ||
631 | .size bn_sqr_comba8,.-bn_sqr_comba8 | ||
632 | |||
633 | // void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3); | ||
634 | .globl bn_sqr_comba4 | ||
635 | .type bn_sqr_comba4,@function | ||
636 | .align 4 | ||
637 | bn_sqr_comba4: | ||
638 | stmg %r6,%r8,48(%r15) | ||
639 | |||
640 | lghi c1,0 | ||
641 | lghi c2,0 | ||
642 | lghi c3,0 | ||
643 | lghi zero,0 | ||
644 | |||
645 | sqr_add_c(0,c1,c2,c3); | ||
646 | stg c1,0*8(%r2) | ||
647 | lghi c1,0 | ||
648 | |||
649 | sqr_add_c2(1,0,c2,c3,c1); | ||
650 | stg c2,1*8(%r2) | ||
651 | lghi c2,0 | ||
652 | |||
653 | sqr_add_c(1,c3,c1,c2); | ||
654 | sqr_add_c2(2,0,c3,c1,c2); | ||
655 | stg c3,2*8(%r2) | ||
656 | lghi c3,0 | ||
657 | |||
658 | sqr_add_c2(3,0,c1,c2,c3); | ||
659 | sqr_add_c2(2,1,c1,c2,c3); | ||
660 | stg c1,3*8(%r2) | ||
661 | lghi c1,0 | ||
662 | |||
663 | sqr_add_c(2,c2,c3,c1); | ||
664 | sqr_add_c2(3,1,c2,c3,c1); | ||
665 | stg c2,4*8(%r2) | ||
666 | lghi c2,0 | ||
667 | |||
668 | sqr_add_c2(3,2,c3,c1,c2); | ||
669 | stg c3,5*8(%r2) | ||
670 | lghi c3,0 | ||
671 | |||
672 | sqr_add_c(3,c1,c2,c3); | ||
673 | stg c1,6*8(%r2) | ||
674 | stg c2,7*8(%r2) | ||
675 | |||
676 | lmg %r6,%r8,48(%r15) | ||
677 | br %r14 | ||
678 | .size bn_sqr_comba4,.-bn_sqr_comba4 | ||
diff --git a/src/lib/libcrypto/bn/asm/sparcv9-mont.pl b/src/lib/libcrypto/bn/asm/sparcv9-mont.pl new file mode 100644 index 0000000000..b8fb1e8a25 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/sparcv9-mont.pl | |||
@@ -0,0 +1,606 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # December 2005 | ||
11 | # | ||
12 | # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons | ||
13 | # for undertaking this effort are multiple. First of all, UltraSPARC | ||
14 | # is not the whole SPARCv9 universe and other VIS-free implementations | ||
15 | # deserve optimized code as much. Secondly, the newly introduced | ||
16 | # UltraSPARC T1, a.k.a. Niagara, has a shared FPU, and concurrent | ||
17 | # FPU-intensive paths such as sparcv9a-mont would simply sink it. Yes, | ||
18 | # T1 is equipped with several integrated RSA/DSA accelerator circuits | ||
19 | # accessible through a kernel driver [only(*)], but having a decent | ||
20 | # user-land software implementation is important too. Finally, there | ||
21 | # was the desire to experiment with a dedicated squaring procedure. | ||
22 | # Yes, this module implements one, because it was easiest to draft it | ||
23 | # in SPARCv9 instructions... | ||
24 | |||
25 | # (*) Engine accessing the driver in question is on my TODO list. | ||
26 | # For reference, the accelerator is estimated to give a 6 to 10 times | ||
27 | # improvement on single-threaded RSA sign. It should be noted | ||
28 | # that a 6-10x improvement coefficient does not actually mean | ||
29 | # something extraordinary in terms of absolute [single-threaded] | ||
30 | # performance, as the SPARCv9 instruction set is by all means the | ||
31 | # least suitable for high-performance crypto among 64-bit | ||
32 | # platforms. The 6-10x factor simply places T1 in the same performance | ||
33 | # domain as, say, AMD64 and IA-64. The improvement of RSA verify doesn't | ||
34 | # appear impressive at all, but it's the sign operation which is | ||
35 | # far more critical/interesting. | ||
36 | |||
37 | # You might notice that inner loops are modulo-scheduled:-) This has | ||
38 | # essentially negligible impact on UltraSPARC performance; it's | ||
39 | # Fujitsu SPARC64 V users who should notice and hopefully appreciate | ||
40 | # the advantage... Currently this module surpasses sparcv9a-mont.pl | ||
41 | # by ~20% on UltraSPARC-III and later cores, but recall that the | ||
42 | # sparcv9a module still has hidden potential [see TODO list there], which is | ||
43 | # estimated to be larger than 20%... | ||
44 | |||
45 | # int bn_mul_mont( | ||
46 | $rp="%i0"; # BN_ULONG *rp, | ||
47 | $ap="%i1"; # const BN_ULONG *ap, | ||
48 | $bp="%i2"; # const BN_ULONG *bp, | ||
49 | $np="%i3"; # const BN_ULONG *np, | ||
50 | $n0="%i4"; # const BN_ULONG *n0, | ||
51 | $num="%i5"; # int num); | ||
52 | |||
53 | $bits=32; | ||
54 | for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } | ||
55 | if ($bits==64) { $bias=2047; $frame=192; } | ||
56 | else { $bias=0; $frame=128; } | ||
57 | |||
58 | $car0="%o0"; | ||
59 | $car1="%o1"; | ||
60 | $car2="%o2"; # 1 bit | ||
61 | $acc0="%o3"; | ||
62 | $acc1="%o4"; | ||
63 | $mask="%g1"; # 32 bits, what a waste... | ||
64 | $tmp0="%g4"; | ||
65 | $tmp1="%g5"; | ||
66 | |||
67 | $i="%l0"; | ||
68 | $j="%l1"; | ||
69 | $mul0="%l2"; | ||
70 | $mul1="%l3"; | ||
71 | $tp="%l4"; | ||
72 | $apj="%l5"; | ||
73 | $npj="%l6"; | ||
74 | $tpj="%l7"; | ||
75 | |||
76 | $fname="bn_mul_mont_int"; | ||
77 | |||
78 | $code=<<___; | ||
79 | .section ".text",#alloc,#execinstr | ||
80 | |||
81 | .global $fname | ||
82 | .align 32 | ||
83 | $fname: | ||
84 | cmp %o5,4 ! 128 bits minimum | ||
85 | bge,pt %icc,.Lenter | ||
86 | sethi %hi(0xffffffff),$mask | ||
87 | retl | ||
88 | clr %o0 | ||
89 | .align 32 | ||
90 | .Lenter: | ||
91 | save %sp,-$frame,%sp | ||
92 | sll $num,2,$num ! num*=4 | ||
93 | or $mask,%lo(0xffffffff),$mask | ||
94 | ld [$n0],$n0 | ||
95 | cmp $ap,$bp | ||
96 | and $num,$mask,$num | ||
97 | ld [$bp],$mul0 ! bp[0] | ||
98 | nop | ||
99 | |||
100 | add %sp,$bias,%o7 ! real top of stack | ||
101 | ld [$ap],$car0 ! ap[0] ! redundant in squaring context | ||
102 | sub %o7,$num,%o7 | ||
103 | ld [$ap+4],$apj ! ap[1] | ||
104 | and %o7,-1024,%o7 | ||
105 | ld [$np],$car1 ! np[0] | ||
106 | sub %o7,$bias,%sp ! alloca | ||
107 | ld [$np+4],$npj ! np[1] | ||
108 | be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont | ||
109 | mov 12,$j | ||
110 | |||
111 | mulx $car0,$mul0,$car0 ! ap[0]*bp[0] | ||
112 | mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0] | ||
113 | and $car0,$mask,$acc0 | ||
114 | add %sp,$bias+$frame,$tp | ||
115 | ld [$ap+8],$apj !prologue! | ||
116 | |||
117 | mulx $n0,$acc0,$mul1 ! "t[0]"*n0 | ||
118 | and $mul1,$mask,$mul1 | ||
119 | |||
120 | mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 | ||
121 | mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0 | ||
122 | srlx $car0,32,$car0 | ||
123 | add $acc0,$car1,$car1 | ||
124 | ld [$np+8],$npj !prologue! | ||
125 | srlx $car1,32,$car1 | ||
126 | mov $tmp0,$acc0 !prologue! | ||
127 | |||
128 | .L1st: | ||
129 | mulx $apj,$mul0,$tmp0 | ||
130 | mulx $npj,$mul1,$tmp1 | ||
131 | add $acc0,$car0,$car0 | ||
132 | ld [$ap+$j],$apj ! ap[j] | ||
133 | and $car0,$mask,$acc0 | ||
134 | add $acc1,$car1,$car1 | ||
135 | ld [$np+$j],$npj ! np[j] | ||
136 | srlx $car0,32,$car0 | ||
137 | add $acc0,$car1,$car1 | ||
138 | add $j,4,$j ! j++ | ||
139 | mov $tmp0,$acc0 | ||
140 | st $car1,[$tp] | ||
141 | cmp $j,$num | ||
142 | mov $tmp1,$acc1 | ||
143 | srlx $car1,32,$car1 | ||
144 | bl %icc,.L1st | ||
145 | add $tp,4,$tp ! tp++ | ||
146 | !.L1st | ||
147 | |||
148 | mulx $apj,$mul0,$tmp0 !epilogue! | ||
149 | mulx $npj,$mul1,$tmp1 | ||
150 | add $acc0,$car0,$car0 | ||
151 | and $car0,$mask,$acc0 | ||
152 | add $acc1,$car1,$car1 | ||
153 | srlx $car0,32,$car0 | ||
154 | add $acc0,$car1,$car1 | ||
155 | st $car1,[$tp] | ||
156 | srlx $car1,32,$car1 | ||
157 | |||
158 | add $tmp0,$car0,$car0 | ||
159 | and $car0,$mask,$acc0 | ||
160 | add $tmp1,$car1,$car1 | ||
161 | srlx $car0,32,$car0 | ||
162 | add $acc0,$car1,$car1 | ||
163 | st $car1,[$tp+4] | ||
164 | srlx $car1,32,$car1 | ||
165 | |||
166 | add $car0,$car1,$car1 | ||
167 | st $car1,[$tp+8] | ||
168 | srlx $car1,32,$car2 | ||
169 | |||
170 | mov 4,$i ! i++ | ||
171 | ld [$bp+4],$mul0 ! bp[1] | ||
172 | .Louter: | ||
173 | add %sp,$bias+$frame,$tp | ||
174 | ld [$ap],$car0 ! ap[0] | ||
175 | ld [$ap+4],$apj ! ap[1] | ||
176 | ld [$np],$car1 ! np[0] | ||
177 | ld [$np+4],$npj ! np[1] | ||
178 | ld [$tp],$tmp1 ! tp[0] | ||
179 | ld [$tp+4],$tpj ! tp[1] | ||
180 | mov 12,$j | ||
181 | |||
182 | mulx $car0,$mul0,$car0 | ||
183 | mulx $apj,$mul0,$tmp0 !prologue! | ||
184 | add $tmp1,$car0,$car0 | ||
185 | ld [$ap+8],$apj !prologue! | ||
186 | and $car0,$mask,$acc0 | ||
187 | |||
188 | mulx $n0,$acc0,$mul1 | ||
189 | and $mul1,$mask,$mul1 | ||
190 | |||
191 | mulx $car1,$mul1,$car1 | ||
192 | mulx $npj,$mul1,$acc1 !prologue! | ||
193 | srlx $car0,32,$car0 | ||
194 | add $acc0,$car1,$car1 | ||
195 | ld [$np+8],$npj !prologue! | ||
196 | srlx $car1,32,$car1 | ||
197 | mov $tmp0,$acc0 !prologue! | ||
198 | |||
199 | .Linner: | ||
200 | mulx $apj,$mul0,$tmp0 | ||
201 | mulx $npj,$mul1,$tmp1 | ||
202 | add $tpj,$car0,$car0 | ||
203 | ld [$ap+$j],$apj ! ap[j] | ||
204 | add $acc0,$car0,$car0 | ||
205 | add $acc1,$car1,$car1 | ||
206 | ld [$np+$j],$npj ! np[j] | ||
207 | and $car0,$mask,$acc0 | ||
208 | ld [$tp+8],$tpj ! tp[j] | ||
209 | srlx $car0,32,$car0 | ||
210 | add $acc0,$car1,$car1 | ||
211 | add $j,4,$j ! j++ | ||
212 | mov $tmp0,$acc0 | ||
213 | st $car1,[$tp] ! tp[j-1] | ||
214 | srlx $car1,32,$car1 | ||
215 | mov $tmp1,$acc1 | ||
216 | cmp $j,$num | ||
217 | bl %icc,.Linner | ||
218 | add $tp,4,$tp ! tp++ | ||
219 | !.Linner | ||
220 | |||
221 | mulx $apj,$mul0,$tmp0 !epilogue! | ||
222 | mulx $npj,$mul1,$tmp1 | ||
223 | add $tpj,$car0,$car0 | ||
224 | add $acc0,$car0,$car0 | ||
225 | ld [$tp+8],$tpj ! tp[j] | ||
226 | and $car0,$mask,$acc0 | ||
227 | add $acc1,$car1,$car1 | ||
228 | srlx $car0,32,$car0 | ||
229 | add $acc0,$car1,$car1 | ||
230 | st $car1,[$tp] ! tp[j-1] | ||
231 | srlx $car1,32,$car1 | ||
232 | |||
233 | add $tpj,$car0,$car0 | ||
234 | add $tmp0,$car0,$car0 | ||
235 | and $car0,$mask,$acc0 | ||
236 | add $tmp1,$car1,$car1 | ||
237 | add $acc0,$car1,$car1 | ||
238 | st $car1,[$tp+4] ! tp[j-1] | ||
239 | srlx $car0,32,$car0 | ||
240 | add $i,4,$i ! i++ | ||
241 | srlx $car1,32,$car1 | ||
242 | |||
243 | add $car0,$car1,$car1 | ||
244 | cmp $i,$num | ||
245 | add $car2,$car1,$car1 | ||
246 | st $car1,[$tp+8] | ||
247 | |||
248 | srlx $car1,32,$car2 | ||
249 | bl,a %icc,.Louter | ||
250 | ld [$bp+$i],$mul0 ! bp[i] | ||
251 | !.Louter | ||
252 | |||
253 | add $tp,12,$tp | ||
254 | |||
255 | .Ltail: | ||
256 | add $np,$num,$np | ||
257 | add $rp,$num,$rp | ||
258 | mov $tp,$ap | ||
259 | sub %g0,$num,%o7 ! k=-num | ||
260 | ba .Lsub | ||
261 | subcc %g0,%g0,%g0 ! clear %icc.c | ||
262 | .align 16 | ||
263 | .Lsub: | ||
264 | ld [$tp+%o7],%o0 | ||
265 | ld [$np+%o7],%o1 | ||
266 | subccc %o0,%o1,%o1 ! tp[j]-np[j] | ||
267 | add $rp,%o7,$i | ||
268 | add %o7,4,%o7 | ||
269 | brnz %o7,.Lsub | ||
270 | st %o1,[$i] | ||
271 | subc $car2,0,$car2 ! handle upmost overflow bit | ||
272 | and $tp,$car2,$ap | ||
273 | andn $rp,$car2,$np | ||
274 | or $ap,$np,$ap | ||
275 | sub %g0,$num,%o7 | ||
276 | |||
277 | .Lcopy: | ||
278 | ld [$ap+%o7],%o0 ! copy or in-place refresh | ||
279 | st %g0,[$tp+%o7] ! zap tp | ||
280 | st %o0,[$rp+%o7] | ||
281 | add %o7,4,%o7 | ||
282 | brnz %o7,.Lcopy | ||
283 | nop | ||
284 | mov 1,%i0 | ||
285 | ret | ||
286 | restore | ||
287 | ___ | ||
288 | |||
289 | ######## | ||
290 | ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over | ||
291 | ######## code without the dedicated squaring procedure that follows. | ||
292 | ######## | ||
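######## The saving comes from the cross products: a[i]*a[j] equals
######## a[j]*a[i], so each one is computed once and doubled, with the
######## bit shifted out by the doubling carried over in $sbit.
########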
293 | $sbit="%i2"; # re-use $bp! | ||
294 | |||
295 | $code.=<<___; | ||
296 | .align 32 | ||
297 | .Lbn_sqr_mont: | ||
298 | mulx $mul0,$mul0,$car0 ! ap[0]*ap[0] | ||
299 | mulx $apj,$mul0,$tmp0 !prologue! | ||
300 | and $car0,$mask,$acc0 | ||
301 | add %sp,$bias+$frame,$tp | ||
302 | ld [$ap+8],$apj !prologue! | ||
303 | |||
304 | mulx $n0,$acc0,$mul1 ! "t[0]"*n0 | ||
305 | srlx $car0,32,$car0 | ||
306 | and $mul1,$mask,$mul1 | ||
307 | |||
308 | mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 | ||
309 | mulx $npj,$mul1,$acc1 !prologue! | ||
310 | and $car0,1,$sbit | ||
311 | ld [$np+8],$npj !prologue! | ||
312 | srlx $car0,1,$car0 | ||
313 | add $acc0,$car1,$car1 | ||
314 | srlx $car1,32,$car1 | ||
315 | mov $tmp0,$acc0 !prologue! | ||
316 | |||
317 | .Lsqr_1st: | ||
318 | mulx $apj,$mul0,$tmp0 | ||
319 | mulx $npj,$mul1,$tmp1 | ||
320 | add $acc0,$car0,$car0 ! ap[j]*a0+c0 | ||
321 | add $acc1,$car1,$car1 | ||
322 | ld [$ap+$j],$apj ! ap[j] | ||
323 | and $car0,$mask,$acc0 | ||
324 | ld [$np+$j],$npj ! np[j] | ||
325 | srlx $car0,32,$car0 | ||
326 | add $acc0,$acc0,$acc0 | ||
327 | or $sbit,$acc0,$acc0 | ||
328 | mov $tmp1,$acc1 | ||
329 | srlx $acc0,32,$sbit | ||
330 | add $j,4,$j ! j++ | ||
331 | and $acc0,$mask,$acc0 | ||
332 | cmp $j,$num | ||
333 | add $acc0,$car1,$car1 | ||
334 | st $car1,[$tp] | ||
335 | mov $tmp0,$acc0 | ||
336 | srlx $car1,32,$car1 | ||
337 | bl %icc,.Lsqr_1st | ||
338 | add $tp,4,$tp ! tp++ | ||
339 | !.Lsqr_1st | ||
340 | |||
341 | mulx $apj,$mul0,$tmp0 ! epilogue | ||
342 | mulx $npj,$mul1,$tmp1 | ||
343 | add $acc0,$car0,$car0 ! ap[j]*a0+c0 | ||
344 | add $acc1,$car1,$car1 | ||
345 | and $car0,$mask,$acc0 | ||
346 | srlx $car0,32,$car0 | ||
347 | add $acc0,$acc0,$acc0 | ||
348 | or $sbit,$acc0,$acc0 | ||
349 | srlx $acc0,32,$sbit | ||
350 | and $acc0,$mask,$acc0 | ||
351 | add $acc0,$car1,$car1 | ||
352 | st $car1,[$tp] | ||
353 | srlx $car1,32,$car1 | ||
354 | |||
355 | add $tmp0,$car0,$car0 ! ap[j]*a0+c0 | ||
356 | add $tmp1,$car1,$car1 | ||
357 | and $car0,$mask,$acc0 | ||
358 | srlx $car0,32,$car0 | ||
359 | add $acc0,$acc0,$acc0 | ||
360 | or $sbit,$acc0,$acc0 | ||
361 | srlx $acc0,32,$sbit | ||
362 | and $acc0,$mask,$acc0 | ||
363 | add $acc0,$car1,$car1 | ||
364 | st $car1,[$tp+4] | ||
365 | srlx $car1,32,$car1 | ||
366 | |||
367 | add $car0,$car0,$car0 | ||
368 | or $sbit,$car0,$car0 | ||
369 | add $car0,$car1,$car1 | ||
370 | st $car1,[$tp+8] | ||
371 | srlx $car1,32,$car2 | ||
372 | |||
373 | ld [%sp+$bias+$frame],$tmp0 ! tp[0] | ||
374 | ld [%sp+$bias+$frame+4],$tmp1 ! tp[1] | ||
375 | ld [%sp+$bias+$frame+8],$tpj ! tp[2] | ||
376 | ld [$ap+4],$mul0 ! ap[1] | ||
377 | ld [$ap+8],$apj ! ap[2] | ||
378 | ld [$np],$car1 ! np[0] | ||
379 | ld [$np+4],$npj ! np[1] | ||
380 | mulx $n0,$tmp0,$mul1 | ||
381 | |||
382 | mulx $mul0,$mul0,$car0 | ||
383 | and $mul1,$mask,$mul1 | ||
384 | |||
385 | mulx $car1,$mul1,$car1 | ||
386 | mulx $npj,$mul1,$acc1 | ||
387 | add $tmp0,$car1,$car1 | ||
388 | and $car0,$mask,$acc0 | ||
389 | ld [$np+8],$npj ! np[2] | ||
390 | srlx $car1,32,$car1 | ||
391 | add $tmp1,$car1,$car1 | ||
392 | srlx $car0,32,$car0 | ||
393 | add $acc0,$car1,$car1 | ||
394 | and $car0,1,$sbit | ||
395 | add $acc1,$car1,$car1 | ||
396 | srlx $car0,1,$car0 | ||
397 | mov 12,$j | ||
398 | st $car1,[%sp+$bias+$frame] ! tp[0]= | ||
399 | srlx $car1,32,$car1 | ||
400 | add %sp,$bias+$frame+4,$tp | ||
401 | |||
402 | .Lsqr_2nd: | ||
403 | mulx $apj,$mul0,$acc0 | ||
404 | mulx $npj,$mul1,$acc1 | ||
405 | add $acc0,$car0,$car0 | ||
406 | add $tpj,$car1,$car1 | ||
407 | ld [$ap+$j],$apj ! ap[j] | ||
408 | and $car0,$mask,$acc0 | ||
409 | ld [$np+$j],$npj ! np[j] | ||
410 | srlx $car0,32,$car0 | ||
411 | add $acc1,$car1,$car1 | ||
412 | ld [$tp+8],$tpj ! tp[j] | ||
413 | add $acc0,$acc0,$acc0 | ||
414 | add $j,4,$j ! j++ | ||
415 | or $sbit,$acc0,$acc0 | ||
416 | srlx $acc0,32,$sbit | ||
417 | and $acc0,$mask,$acc0 | ||
418 | cmp $j,$num | ||
419 | add $acc0,$car1,$car1 | ||
420 | st $car1,[$tp] ! tp[j-1] | ||
421 | srlx $car1,32,$car1 | ||
422 | bl %icc,.Lsqr_2nd | ||
423 | add $tp,4,$tp ! tp++ | ||
424 | !.Lsqr_2nd | ||
425 | |||
426 | mulx $apj,$mul0,$acc0 | ||
427 | mulx $npj,$mul1,$acc1 | ||
428 | add $acc0,$car0,$car0 | ||
429 | add $tpj,$car1,$car1 | ||
430 | and $car0,$mask,$acc0 | ||
431 | srlx $car0,32,$car0 | ||
432 | add $acc1,$car1,$car1 | ||
433 | add $acc0,$acc0,$acc0 | ||
434 | or $sbit,$acc0,$acc0 | ||
435 | srlx $acc0,32,$sbit | ||
436 | and $acc0,$mask,$acc0 | ||
437 | add $acc0,$car1,$car1 | ||
438 | st $car1,[$tp] ! tp[j-1] | ||
439 | srlx $car1,32,$car1 | ||
440 | |||
441 | add $car0,$car0,$car0 | ||
442 | or $sbit,$car0,$car0 | ||
443 | add $car0,$car1,$car1 | ||
444 | add $car2,$car1,$car1 | ||
445 | st $car1,[$tp+4] | ||
446 | srlx $car1,32,$car2 | ||
447 | |||
448 | ld [%sp+$bias+$frame],$tmp1 ! tp[0] | ||
449 | ld [%sp+$bias+$frame+4],$tpj ! tp[1] | ||
450 | ld [$ap+8],$mul0 ! ap[2] | ||
451 | ld [$np],$car1 ! np[0] | ||
452 | ld [$np+4],$npj ! np[1] | ||
453 | mulx $n0,$tmp1,$mul1 | ||
454 | and $mul1,$mask,$mul1 | ||
455 | mov 8,$i | ||
456 | |||
457 | mulx $mul0,$mul0,$car0 | ||
458 | mulx $car1,$mul1,$car1 | ||
459 | and $car0,$mask,$acc0 | ||
460 | add $tmp1,$car1,$car1 | ||
461 | srlx $car0,32,$car0 | ||
462 | add %sp,$bias+$frame,$tp | ||
463 | srlx $car1,32,$car1 | ||
464 | and $car0,1,$sbit | ||
465 | srlx $car0,1,$car0 | ||
466 | mov 4,$j | ||
467 | |||
468 | .Lsqr_outer: | ||
469 | .Lsqr_inner1: | ||
470 | mulx $npj,$mul1,$acc1 | ||
471 | add $tpj,$car1,$car1 | ||
472 | add $j,4,$j | ||
473 | ld [$tp+8],$tpj | ||
474 | cmp $j,$i | ||
475 | add $acc1,$car1,$car1 | ||
476 | ld [$np+$j],$npj | ||
477 | st $car1,[$tp] | ||
478 | srlx $car1,32,$car1 | ||
479 | bl %icc,.Lsqr_inner1 | ||
480 | add $tp,4,$tp | ||
481 | !.Lsqr_inner1 | ||
482 | |||
483 | add $j,4,$j | ||
484 | ld [$ap+$j],$apj ! ap[j] | ||
485 | mulx $npj,$mul1,$acc1 | ||
486 | add $tpj,$car1,$car1 | ||
487 | ld [$np+$j],$npj ! np[j] | ||
488 | add $acc0,$car1,$car1 | ||
489 | ld [$tp+8],$tpj ! tp[j] | ||
490 | add $acc1,$car1,$car1 | ||
491 | st $car1,[$tp] | ||
492 | srlx $car1,32,$car1 | ||
493 | |||
494 | add $j,4,$j | ||
495 | cmp $j,$num | ||
496 | be,pn %icc,.Lsqr_no_inner2 | ||
497 | add $tp,4,$tp | ||
498 | |||
499 | .Lsqr_inner2: | ||
500 | mulx $apj,$mul0,$acc0 | ||
501 | mulx $npj,$mul1,$acc1 | ||
502 | add $tpj,$car1,$car1 | ||
503 | add $acc0,$car0,$car0 | ||
504 | ld [$ap+$j],$apj ! ap[j] | ||
505 | and $car0,$mask,$acc0 | ||
506 | ld [$np+$j],$npj ! np[j] | ||
507 | srlx $car0,32,$car0 | ||
508 | add $acc0,$acc0,$acc0 | ||
509 | ld [$tp+8],$tpj ! tp[j] | ||
510 | or $sbit,$acc0,$acc0 | ||
511 | add $j,4,$j ! j++ | ||
512 | srlx $acc0,32,$sbit | ||
513 | and $acc0,$mask,$acc0 | ||
514 | cmp $j,$num | ||
515 | add $acc0,$car1,$car1 | ||
516 | add $acc1,$car1,$car1 | ||
517 | st $car1,[$tp] ! tp[j-1] | ||
518 | srlx $car1,32,$car1 | ||
519 | bl %icc,.Lsqr_inner2 | ||
520 | add $tp,4,$tp ! tp++ | ||
521 | |||
522 | .Lsqr_no_inner2: | ||
523 | mulx $apj,$mul0,$acc0 | ||
524 | mulx $npj,$mul1,$acc1 | ||
525 | add $tpj,$car1,$car1 | ||
526 | add $acc0,$car0,$car0 | ||
527 | and $car0,$mask,$acc0 | ||
528 | srlx $car0,32,$car0 | ||
529 | add $acc0,$acc0,$acc0 | ||
530 | or $sbit,$acc0,$acc0 | ||
531 | srlx $acc0,32,$sbit | ||
532 | and $acc0,$mask,$acc0 | ||
533 | add $acc0,$car1,$car1 | ||
534 | add $acc1,$car1,$car1 | ||
535 | st $car1,[$tp] ! tp[j-1] | ||
536 | srlx $car1,32,$car1 | ||
537 | |||
538 | add $car0,$car0,$car0 | ||
539 | or $sbit,$car0,$car0 | ||
540 | add $car0,$car1,$car1 | ||
541 | add $car2,$car1,$car1 | ||
542 | st $car1,[$tp+4] | ||
543 | srlx $car1,32,$car2 | ||
544 | |||
545 | add $i,4,$i ! i++ | ||
546 | ld [%sp+$bias+$frame],$tmp1 ! tp[0] | ||
547 | ld [%sp+$bias+$frame+4],$tpj ! tp[1] | ||
548 | ld [$ap+$i],$mul0 ! ap[j] | ||
549 | ld [$np],$car1 ! np[0] | ||
550 | ld [$np+4],$npj ! np[1] | ||
551 | mulx $n0,$tmp1,$mul1 | ||
552 | and $mul1,$mask,$mul1 | ||
553 | add $i,4,$tmp0 | ||
554 | |||
555 | mulx $mul0,$mul0,$car0 | ||
556 | mulx $car1,$mul1,$car1 | ||
557 | and $car0,$mask,$acc0 | ||
558 | add $tmp1,$car1,$car1 | ||
559 | srlx $car0,32,$car0 | ||
560 | add %sp,$bias+$frame,$tp | ||
561 | srlx $car1,32,$car1 | ||
562 | and $car0,1,$sbit | ||
563 | srlx $car0,1,$car0 | ||
564 | |||
565 | cmp $tmp0,$num ! i<num-1 | ||
566 | bl %icc,.Lsqr_outer | ||
567 | mov 4,$j | ||
568 | |||
569 | .Lsqr_last: | ||
570 | mulx $npj,$mul1,$acc1 | ||
571 | add $tpj,$car1,$car1 | ||
572 | add $j,4,$j | ||
573 | ld [$tp+8],$tpj | ||
574 | cmp $j,$i | ||
575 | add $acc1,$car1,$car1 | ||
576 | ld [$np+$j],$npj | ||
577 | st $car1,[$tp] | ||
578 | srlx $car1,32,$car1 | ||
579 | bl %icc,.Lsqr_last | ||
580 | add $tp,4,$tp | ||
581 | !.Lsqr_last | ||
582 | |||
583 | mulx $npj,$mul1,$acc1 | ||
584 | add $tpj,$car1,$car1 | ||
585 | add $acc0,$car1,$car1 | ||
586 | add $acc1,$car1,$car1 | ||
587 | st $car1,[$tp] | ||
588 | srlx $car1,32,$car1 | ||
589 | |||
590 | add $car0,$car0,$car0 ! recover $car0 | ||
591 | or $sbit,$car0,$car0 | ||
592 | add $car0,$car1,$car1 | ||
593 | add $car2,$car1,$car1 | ||
594 | st $car1,[$tp+4] | ||
595 | srlx $car1,32,$car2 | ||
596 | |||
597 | ba .Ltail | ||
598 | add $tp,8,$tp | ||
599 | .type $fname,#function | ||
600 | .size $fname,(.-$fname) | ||
601 | .asciz "Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" | ||
602 | .align 32 | ||
603 | ___ | ||
604 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
605 | print $code; | ||
606 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl b/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl new file mode 100755 index 0000000000..a14205f2f0 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/sparcv9a-mont.pl | |||
@@ -0,0 +1,882 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # October 2005 | ||
11 | # | ||
12 | # "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU? | ||
13 | # Because unlike the integer multiplier, which simply stalls the whole | ||
14 | # CPU, the FPU is fully pipelined and can effectively emit a 48-bit | ||
15 | # partial product every cycle. Why not blended SPARC v9? One can argue | ||
16 | # that making this module dependent on the UltraSPARC VIS extension | ||
17 | # limits its binary compatibility. Well yes, it does exclude SPARC64 | ||
18 | # prior-V(!) implementations from the compatibility matrix. But the | ||
19 | # rest, the whole Sun UltraSPARC family and Fujitsu's brand new SPARC64 | ||
20 | # V, all support the VIS extension instructions used in this module. | ||
21 | # This is considered good enough to not care about HAL SPARC64 users | ||
22 | # [if any] who have the integer-only pure SPARCv9 module to fall back on. | ||
23 | |||
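# In outline: ap[j] and np[j] are kept as 32-bit halves in double
# format, while bp[i] and the per-iteration n0 multiplier are split
# into 16-bit limbs (loaded through the FL16 ASI), e.g.
#	b = b0 + b1*2^16 + b2*2^32 + b3*2^48.
# Each fmuld therefore produces an exact integer product below 2^48,
# and the handful of faddd accumulations stays safely below 2^53, so
# all FP arithmetic remains exact until the fdtox conversions and the
# integer carry propagation that reassemble the 64-bit result words.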
24 | # USI&II cores currently exhibit uniform 2x improvement [over pre- | ||
25 | # bn_mul_mont codebase] for all key lengths and benchmarks. On USIII | ||
26 | # performance improves a few percent for shorter keys and worsens a | ||
27 | # few percent for longer keys. This is because USIII integer multiplier | ||
28 | # is >3x faster than USI&II one, which is harder to match [but see | ||
29 | # TODO list below]. It should also be noted that SPARC64 V features | ||
30 | # out-of-order execution, which *might* mean that integer multiplier | ||
31 | # is pipelined, which in turn *might* be impossible to match... On | ||
32 | # additional note, SPARC64 V implements FP Multiply-Add instruction, | ||
33 | # which is perfectly usable in this context... In other words, as far | ||
34 | # as Fujitsu SPARC64 V goes, talk to the author:-) | ||
35 | |||
36 | # The implementation implies the following "non-natural" limitations on | ||
37 | # input arguments: | ||
38 | # - num may not be less than 4; | ||
39 | # - num has to be even; | ||
40 | # Failure to meet either condition has no fatal effect; it simply | ||
41 | # doesn't give any performance gain. | ||
42 | |||
43 | # TODO: | ||
44 | # - modulo-schedule inner loop for better performance (on in-order | ||
45 | # execution core such as UltraSPARC this shall result in further | ||
46 | # noticeable(!) improvement); | ||
47 | # - dedicated squaring procedure[?]; | ||
48 | |||
49 | ###################################################################### | ||
50 | # November 2006 | ||
51 | # | ||
52 | # Modulo-scheduled inner loops allow interleaving floating point and | ||
53 | # integer instructions and minimize Read-After-Write penalties. This | ||
54 | # results in a *further* 20-50% performance improvement [depending on | ||
55 | # key length, more for longer keys] on USI&II cores and 30-80% - on | ||
56 | # USIII&IV. | ||
57 | |||
58 | $fname="bn_mul_mont_fpu"; | ||
59 | $bits=32; | ||
60 | for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } | ||
61 | |||
62 | if ($bits==64) { | ||
63 | $bias=2047; | ||
64 | $frame=192; | ||
65 | } else { | ||
66 | $bias=0; | ||
67 | $frame=128; # 96 rounded up to largest known cache-line | ||
68 | } | ||
69 | $locals=64; | ||
70 | |||
71 | # In order to provide for 32-/64-bit ABI duality, I keep integers wider | ||
72 | # than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used | ||
73 | # exclusively for pointers, indexes and other small values... | ||
74 | # int bn_mul_mont( | ||
75 | $rp="%i0"; # BN_ULONG *rp, | ||
76 | $ap="%i1"; # const BN_ULONG *ap, | ||
77 | $bp="%i2"; # const BN_ULONG *bp, | ||
78 | $np="%i3"; # const BN_ULONG *np, | ||
79 | $n0="%i4"; # const BN_ULONG *n0, | ||
80 | $num="%i5"; # int num); | ||
81 | |||
82 | $tp="%l0"; # t[num] | ||
83 | $ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved | ||
84 | $ap_h="%l2"; # to these four vectors as double-precision FP values. | ||
85 | $np_l="%l3"; # This way a bunch of fxtods are eliminated in second | ||
86 | $np_h="%l4"; # loop and L1-cache aliasing is minimized... | ||
87 | $i="%l5"; | ||
88 | $j="%l6"; | ||
89 | $mask="%l7"; # 16-bit mask, 0xffff | ||
90 | |||
91 | $n0="%g4"; # reassigned(!) to "64-bit" register | ||
92 | $carry="%i4"; # %i4 reused(!) for a carry bit | ||
93 | |||
94 | # FP register naming chart | ||
95 | # | ||
96 | # ..HILO | ||
97 | # dcba | ||
98 | # -------- | ||
99 | # LOa | ||
100 | # LOb | ||
101 | # LOc | ||
102 | # LOd | ||
103 | # HIa | ||
104 | # HIb | ||
105 | # HIc | ||
106 | # HId | ||
107 | # ..a | ||
108 | # ..b | ||
109 | $ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6"; | ||
110 | $na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14"; | ||
111 | $alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19"; | ||
112 | $nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23"; | ||
113 | |||
114 | $dota="%f24"; $dotb="%f26"; | ||
115 | |||
116 | $aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38"; | ||
117 | $ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46"; | ||
118 | $nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54"; | ||
119 | $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62"; | ||
120 | |||
121 | $ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load | ||
122 | |||
123 | $code=<<___; | ||
124 | .section ".text",#alloc,#execinstr | ||
125 | |||
126 | .global $fname | ||
127 | .align 32 | ||
128 | $fname: | ||
129 | save %sp,-$frame-$locals,%sp | ||
130 | |||
131 | cmp $num,4 | ||
132 | bl,a,pn %icc,.Lret | ||
133 | clr %i0 | ||
134 | andcc $num,1,%g0 ! $num has to be even... | ||
135 | bnz,a,pn %icc,.Lret | ||
136 | clr %i0 ! signal "unsupported input value" | ||
137 | |||
138 | srl $num,1,$num | ||
139 | sethi %hi(0xffff),$mask | ||
140 | ld [%i4+0],$n0 ! $n0 reassigned, remember? | ||
141 | or $mask,%lo(0xffff),$mask | ||
142 | ld [%i4+4],%o0 | ||
143 | sllx %o0,32,%o0 | ||
144 | or %o0,$n0,$n0 ! $n0=n0[1].n0[0] | ||
145 | |||
146 | sll $num,3,$num ! num*=8 | ||
147 | |||
148 | add %sp,$bias,%o0 ! real top of stack | ||
149 | sll $num,2,%o1 | ||
150 | add %o1,$num,%o1 ! %o1=num*5 | ||
151 | sub %o0,%o1,%o0 | ||
152 | and %o0,-2048,%o0 ! optimize TLB utilization | ||
153 | sub %o0,$bias,%sp ! alloca(5*num*8) | ||
154 | |||
155 | rd %asi,%o7 ! save %asi | ||
156 | add %sp,$bias+$frame+$locals,$tp | ||
157 | add $tp,$num,$ap_l | ||
158 | add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends ! | ||
159 | add $ap_l,$num,$ap_h | ||
160 | add $ap_h,$num,$np_l | ||
161 | add $np_l,$num,$np_h | ||
162 | |||
163 | wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads | ||
164 | |||
165 | add $rp,$num,$rp ! readjust input pointers to point | ||
166 | add $ap,$num,$ap ! at the ends too... | ||
167 | add $bp,$num,$bp | ||
168 | add $np,$num,$np | ||
169 | |||
170 | stx %o7,[%sp+$bias+$frame+48] ! save %asi | ||
171 | |||
172 | sub %g0,$num,$i ! i=-num | ||
173 | sub %g0,$num,$j ! j=-num | ||
174 | |||
175 | add $ap,$j,%o3 | ||
176 | add $bp,$i,%o4 | ||
177 | |||
178 | ld [%o3+4],%g1 ! ap[0] | ||
179 | ld [%o3+0],%o0 | ||
180 | ld [%o4+4],%g5 ! bp[0] | ||
181 | sllx %g1,32,%g1 | ||
182 | ld [%o4+0],%o1 | ||
183 | sllx %g5,32,%g5 | ||
184 | or %g1,%o0,%o0 | ||
185 | or %g5,%o1,%o1 | ||
186 | |||
187 | add $np,$j,%o5 | ||
188 | |||
189 | mulx %o1,%o0,%o0 ! ap[0]*bp[0] | ||
190 | mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0 | ||
191 | stx %o0,[%sp+$bias+$frame+0] | ||
192 | |||
193 | ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words | ||
194 | fzeros $alo | ||
195 | ld [%o3+4],$ahi_ | ||
196 | fzeros $ahi | ||
197 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words | ||
198 | fzeros $nlo | ||
199 | ld [%o5+4],$nhi_ | ||
200 | fzeros $nhi | ||
201 | |||
202 | ! transfer b[i] to FPU as 4x16-bit values | ||
203 | ldda [%o4+2]%asi,$ba | ||
204 | fxtod $alo,$alo | ||
205 | ldda [%o4+0]%asi,$bb | ||
206 | fxtod $ahi,$ahi | ||
207 | ldda [%o4+6]%asi,$bc | ||
208 | fxtod $nlo,$nlo | ||
209 | ldda [%o4+4]%asi,$bd | ||
210 | fxtod $nhi,$nhi | ||
211 | |||
212 | ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values | ||
213 | ldda [%sp+$bias+$frame+6]%asi,$na | ||
214 | fxtod $ba,$ba | ||
215 | ldda [%sp+$bias+$frame+4]%asi,$nb | ||
216 | fxtod $bb,$bb | ||
217 | ldda [%sp+$bias+$frame+2]%asi,$nc | ||
218 | fxtod $bc,$bc | ||
219 | ldda [%sp+$bias+$frame+0]%asi,$nd | ||
220 | fxtod $bd,$bd | ||
221 | |||
222 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | ||
223 | fxtod $na,$na | ||
224 | std $ahi,[$ap_h+$j] | ||
225 | fxtod $nb,$nb | ||
226 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | ||
227 | fxtod $nc,$nc | ||
228 | std $nhi,[$np_h+$j] | ||
229 | fxtod $nd,$nd | ||
230 | |||
231 | fmuld $alo,$ba,$aloa | ||
232 | fmuld $nlo,$na,$nloa | ||
233 | fmuld $alo,$bb,$alob | ||
234 | fmuld $nlo,$nb,$nlob | ||
235 | fmuld $alo,$bc,$aloc | ||
236 | faddd $aloa,$nloa,$nloa | ||
237 | fmuld $nlo,$nc,$nloc | ||
238 | fmuld $alo,$bd,$alod | ||
239 | faddd $alob,$nlob,$nlob | ||
240 | fmuld $nlo,$nd,$nlod | ||
241 | fmuld $ahi,$ba,$ahia | ||
242 | faddd $aloc,$nloc,$nloc | ||
243 | fmuld $nhi,$na,$nhia | ||
244 | fmuld $ahi,$bb,$ahib | ||
245 | faddd $alod,$nlod,$nlod | ||
246 | fmuld $nhi,$nb,$nhib | ||
247 | fmuld $ahi,$bc,$ahic | ||
248 | faddd $ahia,$nhia,$nhia | ||
249 | fmuld $nhi,$nc,$nhic | ||
250 | fmuld $ahi,$bd,$ahid | ||
251 | faddd $ahib,$nhib,$nhib | ||
252 | fmuld $nhi,$nd,$nhid | ||
253 | |||
254 | faddd $ahic,$nhic,$dota ! $nhic | ||
255 | faddd $ahid,$nhid,$dotb ! $nhid | ||
256 | |||
257 | faddd $nloc,$nhia,$nloc | ||
258 | faddd $nlod,$nhib,$nlod | ||
259 | |||
260 | fdtox $nloa,$nloa | ||
261 | fdtox $nlob,$nlob | ||
262 | fdtox $nloc,$nloc | ||
263 | fdtox $nlod,$nlod | ||
264 | |||
265 | std $nloa,[%sp+$bias+$frame+0] | ||
266 | add $j,8,$j | ||
267 | std $nlob,[%sp+$bias+$frame+8] | ||
268 | add $ap,$j,%o4 | ||
269 | std $nloc,[%sp+$bias+$frame+16] | ||
270 | add $np,$j,%o5 | ||
271 | std $nlod,[%sp+$bias+$frame+24] | ||
272 | |||
273 | ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words | ||
274 | fzeros $alo | ||
275 | ld [%o4+4],$ahi_ | ||
276 | fzeros $ahi | ||
277 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words | ||
278 | fzeros $nlo | ||
279 | ld [%o5+4],$nhi_ | ||
280 | fzeros $nhi | ||
281 | |||
282 | fxtod $alo,$alo | ||
283 | fxtod $ahi,$ahi | ||
284 | fxtod $nlo,$nlo | ||
285 | fxtod $nhi,$nhi | ||
286 | |||
287 | ldx [%sp+$bias+$frame+0],%o0 | ||
288 | fmuld $alo,$ba,$aloa | ||
289 | ldx [%sp+$bias+$frame+8],%o1 | ||
290 | fmuld $nlo,$na,$nloa | ||
291 | ldx [%sp+$bias+$frame+16],%o2 | ||
292 | fmuld $alo,$bb,$alob | ||
293 | ldx [%sp+$bias+$frame+24],%o3 | ||
294 | fmuld $nlo,$nb,$nlob | ||
295 | |||
296 | srlx %o0,16,%o7 | ||
297 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | ||
298 | fmuld $alo,$bc,$aloc | ||
299 | add %o7,%o1,%o1 | ||
300 | std $ahi,[$ap_h+$j] | ||
301 | faddd $aloa,$nloa,$nloa | ||
302 | fmuld $nlo,$nc,$nloc | ||
303 | srlx %o1,16,%o7 | ||
304 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | ||
305 | fmuld $alo,$bd,$alod | ||
306 | add %o7,%o2,%o2 | ||
307 | std $nhi,[$np_h+$j] | ||
308 | faddd $alob,$nlob,$nlob | ||
309 | fmuld $nlo,$nd,$nlod | ||
310 | srlx %o2,16,%o7 | ||
311 | fmuld $ahi,$ba,$ahia | ||
312 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
313 | faddd $aloc,$nloc,$nloc | ||
314 | fmuld $nhi,$na,$nhia | ||
315 | !and %o0,$mask,%o0 | ||
316 | !and %o1,$mask,%o1 | ||
317 | !and %o2,$mask,%o2 | ||
318 | !sllx %o1,16,%o1 | ||
319 | !sllx %o2,32,%o2 | ||
320 | !sllx %o3,48,%o7 | ||
321 | !or %o1,%o0,%o0 | ||
322 | !or %o2,%o0,%o0 | ||
323 | !or %o7,%o0,%o0 ! 64-bit result | ||
324 | srlx %o3,16,%g1 ! 34-bit carry | ||
325 | fmuld $ahi,$bb,$ahib | ||
326 | |||
327 | faddd $alod,$nlod,$nlod | ||
328 | fmuld $nhi,$nb,$nhib | ||
329 | fmuld $ahi,$bc,$ahic | ||
330 | faddd $ahia,$nhia,$nhia | ||
331 | fmuld $nhi,$nc,$nhic | ||
332 | fmuld $ahi,$bd,$ahid | ||
333 | faddd $ahib,$nhib,$nhib | ||
334 | fmuld $nhi,$nd,$nhid | ||
335 | |||
336 | faddd $dota,$nloa,$nloa | ||
337 | faddd $dotb,$nlob,$nlob | ||
338 | faddd $ahic,$nhic,$dota ! $nhic | ||
339 | faddd $ahid,$nhid,$dotb ! $nhid | ||
340 | |||
341 | faddd $nloc,$nhia,$nloc | ||
342 | faddd $nlod,$nhib,$nlod | ||
343 | |||
344 | fdtox $nloa,$nloa | ||
345 | fdtox $nlob,$nlob | ||
346 | fdtox $nloc,$nloc | ||
347 | fdtox $nlod,$nlod | ||
348 | |||
349 | std $nloa,[%sp+$bias+$frame+0] | ||
350 | std $nlob,[%sp+$bias+$frame+8] | ||
351 | addcc $j,8,$j | ||
352 | std $nloc,[%sp+$bias+$frame+16] | ||
353 | bz,pn %icc,.L1stskip | ||
354 | std $nlod,[%sp+$bias+$frame+24] | ||
355 | |||
356 | .align 32 ! incidentally already aligned ! | ||
357 | .L1st: | ||
358 | add $ap,$j,%o4 | ||
359 | add $np,$j,%o5 | ||
360 | ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words | ||
361 | fzeros $alo | ||
362 | ld [%o4+4],$ahi_ | ||
363 | fzeros $ahi | ||
364 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words | ||
365 | fzeros $nlo | ||
366 | ld [%o5+4],$nhi_ | ||
367 | fzeros $nhi | ||
368 | |||
369 | fxtod $alo,$alo | ||
370 | fxtod $ahi,$ahi | ||
371 | fxtod $nlo,$nlo | ||
372 | fxtod $nhi,$nhi | ||
373 | |||
374 | ldx [%sp+$bias+$frame+0],%o0 | ||
375 | fmuld $alo,$ba,$aloa | ||
376 | ldx [%sp+$bias+$frame+8],%o1 | ||
377 | fmuld $nlo,$na,$nloa | ||
378 | ldx [%sp+$bias+$frame+16],%o2 | ||
379 | fmuld $alo,$bb,$alob | ||
380 | ldx [%sp+$bias+$frame+24],%o3 | ||
381 | fmuld $nlo,$nb,$nlob | ||
382 | |||
383 | srlx %o0,16,%o7 | ||
384 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | ||
385 | fmuld $alo,$bc,$aloc | ||
386 | add %o7,%o1,%o1 | ||
387 | std $ahi,[$ap_h+$j] | ||
388 | faddd $aloa,$nloa,$nloa | ||
389 | fmuld $nlo,$nc,$nloc | ||
390 | srlx %o1,16,%o7 | ||
391 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | ||
392 | fmuld $alo,$bd,$alod | ||
393 | add %o7,%o2,%o2 | ||
394 | std $nhi,[$np_h+$j] | ||
395 | faddd $alob,$nlob,$nlob | ||
396 | fmuld $nlo,$nd,$nlod | ||
397 | srlx %o2,16,%o7 | ||
398 | fmuld $ahi,$ba,$ahia | ||
399 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
400 | and %o0,$mask,%o0 | ||
401 | faddd $aloc,$nloc,$nloc | ||
402 | fmuld $nhi,$na,$nhia | ||
403 | and %o1,$mask,%o1 | ||
404 | and %o2,$mask,%o2 | ||
405 | fmuld $ahi,$bb,$ahib | ||
406 | sllx %o1,16,%o1 | ||
407 | faddd $alod,$nlod,$nlod | ||
408 | fmuld $nhi,$nb,$nhib | ||
409 | sllx %o2,32,%o2 | ||
410 | fmuld $ahi,$bc,$ahic | ||
411 | sllx %o3,48,%o7 | ||
412 | or %o1,%o0,%o0 | ||
413 | faddd $ahia,$nhia,$nhia | ||
414 | fmuld $nhi,$nc,$nhic | ||
415 | or %o2,%o0,%o0 | ||
416 | fmuld $ahi,$bd,$ahid | ||
417 | or %o7,%o0,%o0 ! 64-bit result | ||
418 | faddd $ahib,$nhib,$nhib | ||
419 | fmuld $nhi,$nd,$nhid | ||
420 | addcc %g1,%o0,%o0 | ||
421 | faddd $dota,$nloa,$nloa | ||
422 | srlx %o3,16,%g1 ! 34-bit carry | ||
423 | faddd $dotb,$nlob,$nlob | ||
424 | bcs,a %xcc,.+8 | ||
425 | add %g1,1,%g1 | ||
426 | |||
427 | stx %o0,[$tp] ! tp[j-1]= | ||
428 | |||
429 | faddd $ahic,$nhic,$dota ! $nhic | ||
430 | faddd $ahid,$nhid,$dotb ! $nhid | ||
431 | |||
432 | faddd $nloc,$nhia,$nloc | ||
433 | faddd $nlod,$nhib,$nlod | ||
434 | |||
435 | fdtox $nloa,$nloa | ||
436 | fdtox $nlob,$nlob | ||
437 | fdtox $nloc,$nloc | ||
438 | fdtox $nlod,$nlod | ||
439 | |||
440 | std $nloa,[%sp+$bias+$frame+0] | ||
441 | std $nlob,[%sp+$bias+$frame+8] | ||
442 | std $nloc,[%sp+$bias+$frame+16] | ||
443 | std $nlod,[%sp+$bias+$frame+24] | ||
444 | |||
445 | addcc $j,8,$j | ||
446 | bnz,pt %icc,.L1st | ||
447 | add $tp,8,$tp | ||
448 | |||
449 | .L1stskip: | ||
450 | fdtox $dota,$dota | ||
451 | fdtox $dotb,$dotb | ||
452 | |||
453 | ldx [%sp+$bias+$frame+0],%o0 | ||
454 | ldx [%sp+$bias+$frame+8],%o1 | ||
455 | ldx [%sp+$bias+$frame+16],%o2 | ||
456 | ldx [%sp+$bias+$frame+24],%o3 | ||
457 | |||
458 | srlx %o0,16,%o7 | ||
459 | std $dota,[%sp+$bias+$frame+32] | ||
460 | add %o7,%o1,%o1 | ||
461 | std $dotb,[%sp+$bias+$frame+40] | ||
462 | srlx %o1,16,%o7 | ||
463 | add %o7,%o2,%o2 | ||
464 | srlx %o2,16,%o7 | ||
465 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
466 | and %o0,$mask,%o0 | ||
467 | and %o1,$mask,%o1 | ||
468 | and %o2,$mask,%o2 | ||
469 | sllx %o1,16,%o1 | ||
470 | sllx %o2,32,%o2 | ||
471 | sllx %o3,48,%o7 | ||
472 | or %o1,%o0,%o0 | ||
473 | or %o2,%o0,%o0 | ||
474 | or %o7,%o0,%o0 ! 64-bit result | ||
475 | ldx [%sp+$bias+$frame+32],%o4 | ||
476 | addcc %g1,%o0,%o0 | ||
477 | ldx [%sp+$bias+$frame+40],%o5 | ||
478 | srlx %o3,16,%g1 ! 34-bit carry | ||
479 | bcs,a %xcc,.+8 | ||
480 | add %g1,1,%g1 | ||
481 | |||
482 | stx %o0,[$tp] ! tp[j-1]= | ||
483 | add $tp,8,$tp | ||
484 | |||
485 | srlx %o4,16,%o7 | ||
486 | add %o7,%o5,%o5 | ||
487 | and %o4,$mask,%o4 | ||
488 | sllx %o5,16,%o7 | ||
489 | or %o7,%o4,%o4 | ||
490 | addcc %g1,%o4,%o4 | ||
491 | srlx %o5,48,%g1 | ||
492 | bcs,a %xcc,.+8 | ||
493 | add %g1,1,%g1 | ||
494 | |||
495 | mov %g1,$carry | ||
496 | stx %o4,[$tp] ! tp[num-1]= | ||
497 | |||
498 | ba .Louter | ||
499 | add $i,8,$i | ||
500 | .align 32 | ||
501 | .Louter: | ||
502 | sub %g0,$num,$j ! j=-num | ||
503 | add %sp,$bias+$frame+$locals,$tp | ||
504 | |||
505 | add $ap,$j,%o3 | ||
506 | add $bp,$i,%o4 | ||
507 | |||
508 | ld [%o3+4],%g1 ! ap[0] | ||
509 | ld [%o3+0],%o0 | ||
510 | ld [%o4+4],%g5 ! bp[i] | ||
511 | sllx %g1,32,%g1 | ||
512 | ld [%o4+0],%o1 | ||
513 | sllx %g5,32,%g5 | ||
514 | or %g1,%o0,%o0 | ||
515 | or %g5,%o1,%o1 | ||
516 | |||
517 | ldx [$tp],%o2 ! tp[0] | ||
518 | mulx %o1,%o0,%o0 | ||
519 | addcc %o2,%o0,%o0 | ||
520 | mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0 | ||
521 | stx %o0,[%sp+$bias+$frame+0] | ||
522 | |||
523 | ! transfer b[i] to FPU as 4x16-bit values | ||
524 | ldda [%o4+2]%asi,$ba | ||
525 | ldda [%o4+0]%asi,$bb | ||
526 | ldda [%o4+6]%asi,$bc | ||
527 | ldda [%o4+4]%asi,$bd | ||
528 | |||
529 | ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values | ||
530 | ldda [%sp+$bias+$frame+6]%asi,$na | ||
531 | fxtod $ba,$ba | ||
532 | ldda [%sp+$bias+$frame+4]%asi,$nb | ||
533 | fxtod $bb,$bb | ||
534 | ldda [%sp+$bias+$frame+2]%asi,$nc | ||
535 | fxtod $bc,$bc | ||
536 | ldda [%sp+$bias+$frame+0]%asi,$nd | ||
537 | fxtod $bd,$bd | ||
538 | ldd [$ap_l+$j],$alo ! load a[j] in double format | ||
539 | fxtod $na,$na | ||
540 | ldd [$ap_h+$j],$ahi | ||
541 | fxtod $nb,$nb | ||
542 | ldd [$np_l+$j],$nlo ! load n[j] in double format | ||
543 | fxtod $nc,$nc | ||
544 | ldd [$np_h+$j],$nhi | ||
545 | fxtod $nd,$nd | ||
546 | |||
547 | fmuld $alo,$ba,$aloa | ||
548 | fmuld $nlo,$na,$nloa | ||
549 | fmuld $alo,$bb,$alob | ||
550 | fmuld $nlo,$nb,$nlob | ||
551 | fmuld $alo,$bc,$aloc | ||
552 | faddd $aloa,$nloa,$nloa | ||
553 | fmuld $nlo,$nc,$nloc | ||
554 | fmuld $alo,$bd,$alod | ||
555 | faddd $alob,$nlob,$nlob | ||
556 | fmuld $nlo,$nd,$nlod | ||
557 | fmuld $ahi,$ba,$ahia | ||
558 | faddd $aloc,$nloc,$nloc | ||
559 | fmuld $nhi,$na,$nhia | ||
560 | fmuld $ahi,$bb,$ahib | ||
561 | faddd $alod,$nlod,$nlod | ||
562 | fmuld $nhi,$nb,$nhib | ||
563 | fmuld $ahi,$bc,$ahic | ||
564 | faddd $ahia,$nhia,$nhia | ||
565 | fmuld $nhi,$nc,$nhic | ||
566 | fmuld $ahi,$bd,$ahid | ||
567 | faddd $ahib,$nhib,$nhib | ||
568 | fmuld $nhi,$nd,$nhid | ||
569 | |||
570 | faddd $ahic,$nhic,$dota ! $nhic | ||
571 | faddd $ahid,$nhid,$dotb ! $nhid | ||
572 | |||
573 | faddd $nloc,$nhia,$nloc | ||
574 | faddd $nlod,$nhib,$nlod | ||
575 | |||
576 | fdtox $nloa,$nloa | ||
577 | fdtox $nlob,$nlob | ||
578 | fdtox $nloc,$nloc | ||
579 | fdtox $nlod,$nlod | ||
580 | |||
581 | std $nloa,[%sp+$bias+$frame+0] | ||
582 | std $nlob,[%sp+$bias+$frame+8] | ||
583 | std $nloc,[%sp+$bias+$frame+16] | ||
584 | add $j,8,$j | ||
585 | std $nlod,[%sp+$bias+$frame+24] | ||
586 | |||
587 | ldd [$ap_l+$j],$alo ! load a[j] in double format | ||
588 | ldd [$ap_h+$j],$ahi | ||
589 | ldd [$np_l+$j],$nlo ! load n[j] in double format | ||
590 | ldd [$np_h+$j],$nhi | ||
591 | |||
592 | fmuld $alo,$ba,$aloa | ||
593 | fmuld $nlo,$na,$nloa | ||
594 | fmuld $alo,$bb,$alob | ||
595 | fmuld $nlo,$nb,$nlob | ||
596 | fmuld $alo,$bc,$aloc | ||
597 | ldx [%sp+$bias+$frame+0],%o0 | ||
598 | faddd $aloa,$nloa,$nloa | ||
599 | fmuld $nlo,$nc,$nloc | ||
600 | ldx [%sp+$bias+$frame+8],%o1 | ||
601 | fmuld $alo,$bd,$alod | ||
602 | ldx [%sp+$bias+$frame+16],%o2 | ||
603 | faddd $alob,$nlob,$nlob | ||
604 | fmuld $nlo,$nd,$nlod | ||
605 | ldx [%sp+$bias+$frame+24],%o3 | ||
606 | fmuld $ahi,$ba,$ahia | ||
607 | |||
608 | srlx %o0,16,%o7 | ||
609 | faddd $aloc,$nloc,$nloc | ||
610 | fmuld $nhi,$na,$nhia | ||
611 | add %o7,%o1,%o1 | ||
612 | fmuld $ahi,$bb,$ahib | ||
613 | srlx %o1,16,%o7 | ||
614 | faddd $alod,$nlod,$nlod | ||
615 | fmuld $nhi,$nb,$nhib | ||
616 | add %o7,%o2,%o2 | ||
617 | fmuld $ahi,$bc,$ahic | ||
618 | srlx %o2,16,%o7 | ||
619 | faddd $ahia,$nhia,$nhia | ||
620 | fmuld $nhi,$nc,$nhic | ||
621 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
622 | ! why? | ||
623 | and %o0,$mask,%o0 | ||
624 | fmuld $ahi,$bd,$ahid | ||
625 | and %o1,$mask,%o1 | ||
626 | and %o2,$mask,%o2 | ||
627 | faddd $ahib,$nhib,$nhib | ||
628 | fmuld $nhi,$nd,$nhid | ||
629 | sllx %o1,16,%o1 | ||
630 | faddd $dota,$nloa,$nloa | ||
631 | sllx %o2,32,%o2 | ||
632 | faddd $dotb,$nlob,$nlob | ||
633 | sllx %o3,48,%o7 | ||
634 | or %o1,%o0,%o0 | ||
635 | faddd $ahic,$nhic,$dota ! $nhic | ||
636 | or %o2,%o0,%o0 | ||
637 | faddd $ahid,$nhid,$dotb ! $nhid | ||
638 | or %o7,%o0,%o0 ! 64-bit result | ||
639 | ldx [$tp],%o7 | ||
640 | faddd $nloc,$nhia,$nloc | ||
641 | addcc %o7,%o0,%o0 | ||
642 | ! end-of-why? | ||
643 | faddd $nlod,$nhib,$nlod | ||
644 | srlx %o3,16,%g1 ! 34-bit carry | ||
645 | fdtox $nloa,$nloa | ||
646 | bcs,a %xcc,.+8 | ||
647 | add %g1,1,%g1 | ||
648 | |||
649 | fdtox $nlob,$nlob | ||
650 | fdtox $nloc,$nloc | ||
651 | fdtox $nlod,$nlod | ||
652 | |||
653 | std $nloa,[%sp+$bias+$frame+0] | ||
654 | std $nlob,[%sp+$bias+$frame+8] | ||
655 | addcc $j,8,$j | ||
656 | std $nloc,[%sp+$bias+$frame+16] | ||
657 | bz,pn %icc,.Linnerskip | ||
658 | std $nlod,[%sp+$bias+$frame+24] | ||
659 | |||
660 | ba .Linner | ||
661 | nop | ||
662 | .align 32 | ||
663 | .Linner: | ||
664 | ldd [$ap_l+$j],$alo ! load a[j] in double format | ||
665 | ldd [$ap_h+$j],$ahi | ||
666 | ldd [$np_l+$j],$nlo ! load n[j] in double format | ||
667 | ldd [$np_h+$j],$nhi | ||
668 | |||
669 | fmuld $alo,$ba,$aloa | ||
670 | fmuld $nlo,$na,$nloa | ||
671 | fmuld $alo,$bb,$alob | ||
672 | fmuld $nlo,$nb,$nlob | ||
673 | fmuld $alo,$bc,$aloc | ||
674 | ldx [%sp+$bias+$frame+0],%o0 | ||
675 | faddd $aloa,$nloa,$nloa | ||
676 | fmuld $nlo,$nc,$nloc | ||
677 | ldx [%sp+$bias+$frame+8],%o1 | ||
678 | fmuld $alo,$bd,$alod | ||
679 | ldx [%sp+$bias+$frame+16],%o2 | ||
680 | faddd $alob,$nlob,$nlob | ||
681 | fmuld $nlo,$nd,$nlod | ||
682 | ldx [%sp+$bias+$frame+24],%o3 | ||
683 | fmuld $ahi,$ba,$ahia | ||
684 | |||
685 | srlx %o0,16,%o7 | ||
686 | faddd $aloc,$nloc,$nloc | ||
687 | fmuld $nhi,$na,$nhia | ||
688 | add %o7,%o1,%o1 | ||
689 | fmuld $ahi,$bb,$ahib | ||
690 | srlx %o1,16,%o7 | ||
691 | faddd $alod,$nlod,$nlod | ||
692 | fmuld $nhi,$nb,$nhib | ||
693 | add %o7,%o2,%o2 | ||
694 | fmuld $ahi,$bc,$ahic | ||
695 | srlx %o2,16,%o7 | ||
696 | faddd $ahia,$nhia,$nhia | ||
697 | fmuld $nhi,$nc,$nhic | ||
698 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
699 | and %o0,$mask,%o0 | ||
700 | fmuld $ahi,$bd,$ahid | ||
701 | and %o1,$mask,%o1 | ||
702 | and %o2,$mask,%o2 | ||
703 | faddd $ahib,$nhib,$nhib | ||
704 | fmuld $nhi,$nd,$nhid | ||
705 | sllx %o1,16,%o1 | ||
706 | faddd $dota,$nloa,$nloa | ||
707 | sllx %o2,32,%o2 | ||
708 | faddd $dotb,$nlob,$nlob | ||
709 | sllx %o3,48,%o7 | ||
710 | or %o1,%o0,%o0 | ||
711 | faddd $ahic,$nhic,$dota ! $nhic | ||
712 | or %o2,%o0,%o0 | ||
713 | faddd $ahid,$nhid,$dotb ! $nhid | ||
714 | or %o7,%o0,%o0 ! 64-bit result | ||
715 | faddd $nloc,$nhia,$nloc | ||
716 | addcc %g1,%o0,%o0 | ||
717 | ldx [$tp+8],%o7 ! tp[j] | ||
718 | faddd $nlod,$nhib,$nlod | ||
719 | srlx %o3,16,%g1 ! 34-bit carry | ||
720 | fdtox $nloa,$nloa | ||
721 | bcs,a %xcc,.+8 | ||
722 | add %g1,1,%g1 | ||
723 | fdtox $nlob,$nlob | ||
724 | addcc %o7,%o0,%o0 | ||
725 | fdtox $nloc,$nloc | ||
726 | bcs,a %xcc,.+8 | ||
727 | add %g1,1,%g1 | ||
728 | |||
729 | stx %o0,[$tp] ! tp[j-1] | ||
730 | fdtox $nlod,$nlod | ||
731 | |||
732 | std $nloa,[%sp+$bias+$frame+0] | ||
733 | std $nlob,[%sp+$bias+$frame+8] | ||
734 | std $nloc,[%sp+$bias+$frame+16] | ||
735 | addcc $j,8,$j | ||
736 | std $nlod,[%sp+$bias+$frame+24] | ||
737 | bnz,pt %icc,.Linner | ||
738 | add $tp,8,$tp | ||
739 | |||
740 | .Linnerskip: | ||
741 | fdtox $dota,$dota | ||
742 | fdtox $dotb,$dotb | ||
743 | |||
744 | ldx [%sp+$bias+$frame+0],%o0 | ||
745 | ldx [%sp+$bias+$frame+8],%o1 | ||
746 | ldx [%sp+$bias+$frame+16],%o2 | ||
747 | ldx [%sp+$bias+$frame+24],%o3 | ||
748 | |||
749 | srlx %o0,16,%o7 | ||
750 | std $dota,[%sp+$bias+$frame+32] | ||
751 | add %o7,%o1,%o1 | ||
752 | std $dotb,[%sp+$bias+$frame+40] | ||
753 | srlx %o1,16,%o7 | ||
754 | add %o7,%o2,%o2 | ||
755 | srlx %o2,16,%o7 | ||
756 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | ||
757 | and %o0,$mask,%o0 | ||
758 | and %o1,$mask,%o1 | ||
759 | and %o2,$mask,%o2 | ||
760 | sllx %o1,16,%o1 | ||
761 | sllx %o2,32,%o2 | ||
762 | sllx %o3,48,%o7 | ||
763 | or %o1,%o0,%o0 | ||
764 | or %o2,%o0,%o0 | ||
765 | ldx [%sp+$bias+$frame+32],%o4 | ||
766 | or %o7,%o0,%o0 ! 64-bit result | ||
767 | ldx [%sp+$bias+$frame+40],%o5 | ||
768 | addcc %g1,%o0,%o0 | ||
769 | ldx [$tp+8],%o7 ! tp[j] | ||
770 | srlx %o3,16,%g1 ! 34-bit carry | ||
771 | bcs,a %xcc,.+8 | ||
772 | add %g1,1,%g1 | ||
773 | |||
774 | addcc %o7,%o0,%o0 | ||
775 | bcs,a %xcc,.+8 | ||
776 | add %g1,1,%g1 | ||
777 | |||
778 | stx %o0,[$tp] ! tp[j-1] | ||
779 | add $tp,8,$tp | ||
780 | |||
781 | srlx %o4,16,%o7 | ||
782 | add %o7,%o5,%o5 | ||
783 | and %o4,$mask,%o4 | ||
784 | sllx %o5,16,%o7 | ||
785 | or %o7,%o4,%o4 | ||
786 | addcc %g1,%o4,%o4 | ||
787 | srlx %o5,48,%g1 | ||
788 | bcs,a %xcc,.+8 | ||
789 | add %g1,1,%g1 | ||
790 | |||
791 | addcc $carry,%o4,%o4 | ||
792 | stx %o4,[$tp] ! tp[num-1] | ||
793 | mov %g1,$carry | ||
794 | bcs,a %xcc,.+8 | ||
795 | add $carry,1,$carry | ||
796 | |||
797 | addcc $i,8,$i | ||
798 | bnz %icc,.Louter | ||
799 | nop | ||
800 | |||
801 | add $tp,8,$tp ! adjust tp to point at the end | ||
802 | orn %g0,%g0,%g4 | ||
803 | sub %g0,$num,%o7 ! n=-num | ||
804 | ba .Lsub | ||
805 | subcc %g0,%g0,%g0 ! clear %icc.c | ||
806 | |||
807 | .align 32 | ||
808 | .Lsub: | ||
809 | ldx [$tp+%o7],%o0 | ||
810 | add $np,%o7,%g1 | ||
811 | ld [%g1+0],%o2 | ||
812 | ld [%g1+4],%o3 | ||
813 | srlx %o0,32,%o1 | ||
814 | subccc %o0,%o2,%o2 | ||
815 | add $rp,%o7,%g1 | ||
816 | subccc %o1,%o3,%o3 | ||
817 | st %o2,[%g1+0] | ||
818 | add %o7,8,%o7 | ||
819 | brnz,pt %o7,.Lsub | ||
820 | st %o3,[%g1+4] | ||
821 | subc $carry,0,%g4 | ||
822 | sub %g0,$num,%o7 ! n=-num | ||
823 | ba .Lcopy | ||
824 | nop | ||
825 | |||
826 | .align 32 | ||
827 | .Lcopy: | ||
828 | ldx [$tp+%o7],%o0 | ||
829 | add $rp,%o7,%g1 | ||
830 | ld [%g1+0],%o2 | ||
831 | ld [%g1+4],%o3 | ||
832 | stx %g0,[$tp+%o7] | ||
833 | and %o0,%g4,%o0 | ||
834 | srlx %o0,32,%o1 | ||
835 | andn %o2,%g4,%o2 | ||
836 | andn %o3,%g4,%o3 | ||
837 | or %o2,%o0,%o0 | ||
838 | or %o3,%o1,%o1 | ||
839 | st %o0,[%g1+0] | ||
840 | add %o7,8,%o7 | ||
841 | brnz,pt %o7,.Lcopy | ||
842 | st %o1,[%g1+4] | ||
843 | sub %g0,$num,%o7 ! n=-num | ||
844 | |||
845 | .Lzap: | ||
846 | stx %g0,[$ap_l+%o7] | ||
847 | stx %g0,[$ap_h+%o7] | ||
848 | stx %g0,[$np_l+%o7] | ||
849 | stx %g0,[$np_h+%o7] | ||
850 | add %o7,8,%o7 | ||
851 | brnz,pt %o7,.Lzap | ||
852 | nop | ||
853 | |||
854 | ldx [%sp+$bias+$frame+48],%o7 | ||
855 | wr %g0,%o7,%asi ! restore %asi | ||
856 | |||
857 | mov 1,%i0 | ||
858 | .Lret: | ||
859 | ret | ||
860 | restore | ||
861 | .type $fname,#function | ||
862 | .size $fname,(.-$fname) | ||
863 | .asciz "Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>" | ||
864 | .align 32 | ||
865 | ___ | ||
866 | |||
867 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
868 | |||
869 | # The substitution below makes it possible to compile without demanding | ||
870 | # VIS extensions on the command line, e.g. -xarch=v9 vs. -xarch=v9a. I | ||
871 | # dare to do this because VIS capability is detected at run-time now | ||
872 | # and this routine is not called on CPUs that cannot execute it. Do | ||
873 | # note that fzeros is not the only VIS dependency! Another dependency | ||
874 | # is implicit and is just _a_ numerical value loaded into the %asi register, | ||
875 | # which the assembler can't recognize as VIS-specific... | ||
876 | $code =~ s/fzeros\s+%f([0-9]+)/ | ||
877 | sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1) | ||
878 | /gem; | ||
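# For example, with $1 == 16 the substitution above emits
#	.word	0xa1b00c20	! fzeros %f16
# (i.e. 0x81b00c20 | 16<<25), so the output assembles without a VIS-aware assembler.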
879 | |||
880 | print $code; | ||
881 | # flush | ||
882 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/via-mont.pl b/src/lib/libcrypto/bn/asm/via-mont.pl new file mode 100644 index 0000000000..c046a514c8 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/via-mont.pl | |||
@@ -0,0 +1,242 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # Wrapper around 'rep montmul', a VIA-specific instruction accessing | ||
11 | # the PadLock Montgomery Multiplier. The wrapper is designed as a drop-in | ||
12 | # replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9]. | ||
13 | # | ||
14 | # Below are interleaved outputs from 'openssl speed rsa dsa' for 4 | ||
15 | # different software configurations on 1.5GHz VIA Esther processor. | ||
16 | # Lines marked with "software integer" denote performance of hand- | ||
17 | # coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2" | ||
18 | # refers to the hand-coded SSE2 Montgomery multiplication procedure found | ||
19 | # in OpenSSL 0.9.9. "Hardware VIA SDK" refers to the padlock_pmm routine from | ||
20 | # Padlock SDK 2.0.1 available for download from VIA, which naturally | ||
21 | # utilizes the magic 'repz montmul' instruction. And finally "hardware | ||
22 | # this" refers to *this* implementation, which also uses 'repz montmul'. | ||
23 | # | ||
24 | # sign verify sign/s verify/s | ||
25 | # rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer | ||
26 | # rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2 | ||
27 | # rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK | ||
28 | # rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this | ||
29 | # | ||
30 | # rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer | ||
31 | # rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2 | ||
32 | # rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK | ||
33 | # rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this | ||
34 | # | ||
35 | # rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer | ||
36 | # rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2 | ||
37 | # rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK | ||
38 | # rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this | ||
39 | # | ||
40 | # rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer | ||
41 | # rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2 | ||
42 | # rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK | ||
43 | # rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this | ||
44 | # | ||
45 | # dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer | ||
46 | # dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2 | ||
47 | # dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK | ||
48 | # dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this | ||
49 | # | ||
50 | # dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer | ||
51 | # dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2 | ||
52 | # dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK | ||
53 | # dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this | ||
54 | # | ||
55 | # dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer | ||
56 | # dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2 | ||
57 | # dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK | ||
58 | # dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this | ||
59 | # | ||
60 | # To give you some other reference point here is output for 2.4GHz P4 | ||
61 | # running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software | ||
62 | # SSE2" in above terms. | ||
63 | # | ||
64 | # rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0 | ||
65 | # rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0 | ||
66 | # rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9 | ||
67 | # rsa 4096 bits 0.109770s 0.002379s 9.1 420.3 | ||
68 | # dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1 | ||
69 | # dsa 1024 bits 0.001346s 0.001595s 742.7 627.0 | ||
70 | # dsa 2048 bits 0.004745s 0.005582s 210.7 179.1 | ||
71 | # | ||
72 | # Conclusions: | ||
73 | # - the VIA SDK leaves a *lot* of room for improvement (which this | ||
74 | # implementation successfully fills:-); | ||
75 | # - 'rep montmul' gives up to a >3x performance improvement, depending on | ||
76 | # key length; | ||
77 | # - in terms of absolute performance it delivers approximately as much | ||
78 | # as modern out-of-order 32-bit cores [again, for longer keys]. | ||
79 | |||
80 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
81 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
82 | require "x86asm.pl"; | ||
83 | |||
84 | &asm_init($ARGV[0],"via-mont.pl"); | ||
85 | |||
86 | # int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); | ||
87 | $func="bn_mul_mont_padlock"; | ||
88 | |||
89 | $pad=16*1; # amount of reserved bytes on top of every vector | ||
90 | |||
91 | # stack layout | ||
92 | $mZeroPrime=&DWP(0,"esp"); # these are specified by VIA | ||
93 | $A=&DWP(4,"esp"); | ||
94 | $B=&DWP(8,"esp"); | ||
95 | $T=&DWP(12,"esp"); | ||
96 | $M=&DWP(16,"esp"); | ||
97 | $scratch=&DWP(20,"esp"); | ||
98 | $rp=&DWP(24,"esp"); # these are mine | ||
99 | $sp=&DWP(28,"esp"); | ||
100 | # &DWP(32,"esp") # 32 byte scratch area | ||
101 | # &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num] | ||
102 | # &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num] | ||
103 | # &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num] | ||
104 | # &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num] | ||
105 | # Note that the SDK suggests unconditionally allocating 2K per vector. This | ||
106 | # has quite an impact on performance. It naturally depends on key length, | ||
107 | # but to give an example, 1024-bit private RSA key operations suffer a >30% | ||
108 | # penalty. I allocate only as much as actually required... | ||
109 | |||
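A minimal sketch of the resulting allocation (the helper name is mine, not the module's): four padded vectors plus the 64-byte control/scratch area, before the cache-line alignment performed below.

sub padlock_frame_bytes {
    my ($num, $pad) = @_;           # $num in 32-bit words, $pad = 16 in this module
    my $vector = 4 * $num + $pad;   # one padded vector (tp, ap, bp or np), in bytes
    return 64 + 4 * $vector;        # control/scratch area + the four vectors
}

# e.g. a 1024-bit modulus (num = 32): 64 + 4*(128+16) = 640 bytes, far below the
# 2K-per-vector allocation the SDK suggests.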
110 | &function_begin($func); | ||
111 | &xor ("eax","eax"); | ||
112 | &mov ("ecx",&wparam(5)); # num | ||
113 | # meet VIA's limitations for num [note that the specification | ||
114 | # expresses them in bits, while we work with the number of 32-bit words] | ||
115 | &test ("ecx",3); | ||
116 | &jnz (&label("leave")); # num % 4 != 0 | ||
117 | &cmp ("ecx",8); | ||
118 | &jb (&label("leave")); # num < 8 | ||
119 | &cmp ("ecx",1024); | ||
120 | &ja (&label("leave")); # num > 1024 | ||
121 | |||
122 | &pushf (); | ||
123 | &cld (); | ||
124 | |||
125 | &mov ("edi",&wparam(0)); # rp | ||
126 | &mov ("eax",&wparam(1)); # ap | ||
127 | &mov ("ebx",&wparam(2)); # bp | ||
128 | &mov ("edx",&wparam(3)); # np | ||
129 | &mov ("esi",&wparam(4)); # n0 | ||
130 | &mov ("esi",&DWP(0,"esi")); # *n0 | ||
131 | |||
132 | &lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes | ||
133 | &lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes | ||
134 | &neg ("ebp"); | ||
135 | &add ("ebp","esp"); | ||
136 | &and ("ebp",-64); # align to cache-line | ||
137 | &xchg ("ebp","esp"); # alloca | ||
138 | |||
139 | &mov ($rp,"edi"); # save rp | ||
140 | &mov ($sp,"ebp"); # save esp | ||
141 | |||
142 | &mov ($mZeroPrime,"esi"); | ||
143 | &lea ("esi",&DWP(64,"esp")); # tp | ||
144 | &mov ($T,"esi"); | ||
145 | &lea ("edi",&DWP(32,"esp")); # scratch area | ||
146 | &mov ($scratch,"edi"); | ||
147 | &mov ("esi","eax"); | ||
148 | |||
149 | &lea ("ebp",&DWP(-$pad,"ecx")); | ||
150 | &shr ("ebp",2); # restore original num value in ebp | ||
151 | |||
152 | &xor ("eax","eax"); | ||
153 | |||
154 | &mov ("ecx","ebp"); | ||
155 | &lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch | ||
156 | &data_byte(0xf3,0xab); # rep stosl, bzero | ||
157 | |||
158 | &mov ("ecx","ebp"); | ||
159 | &lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy | ||
160 | &mov ($A,"edi"); | ||
161 | &data_byte(0xf3,0xa5); # rep movsl, memcpy | ||
162 | &mov ("ecx",$pad/4); | ||
163 | &data_byte(0xf3,0xab); # rep stosl, bzero pad | ||
164 | # edi points at the end of padded ap copy... | ||
165 | |||
166 | &mov ("ecx","ebp"); | ||
167 | &mov ("esi","ebx"); | ||
168 | &mov ($B,"edi"); | ||
169 | &data_byte(0xf3,0xa5); # rep movsl, memcpy | ||
170 | &mov ("ecx",$pad/4); | ||
171 | &data_byte(0xf3,0xab); # rep stosl, bzero pad | ||
172 | # edi points at the end of padded bp copy... | ||
173 | |||
174 | &mov ("ecx","ebp"); | ||
175 | &mov ("esi","edx"); | ||
176 | &mov ($M,"edi"); | ||
177 | &data_byte(0xf3,0xa5); # rep movsl, memcpy | ||
178 | &mov ("ecx",$pad/4); | ||
179 | &data_byte(0xf3,0xab); # rep stosl, bzero pad | ||
180 | # edi points at the end of padded np copy... | ||
181 | |||
182 | # let magic happen... | ||
183 | &mov ("ecx","ebp"); | ||
184 | &mov ("esi","esp"); | ||
185 | &shl ("ecx",5); # convert word counter to bit counter | ||
186 | &align (4); | ||
187 | &data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul | ||
188 | |||
189 | &mov ("ecx","ebp"); | ||
190 | &lea ("esi",&DWP(64,"esp")); # tp | ||
191 | # edi still points at the end of padded np copy... | ||
192 | &neg ("ebp"); | ||
193 | &lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind" | ||
194 | &mov ("edi",$rp); # restore rp | ||
195 | &xor ("edx","edx"); # i=0 and clear CF | ||
196 | |||
197 | &set_label("sub",8); | ||
198 | &mov ("eax",&DWP(0,"esi","edx",4)); | ||
199 | &sbb ("eax",&DWP(0,"ebp","edx",4)); | ||
200 | &mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i] | ||
201 | &lea ("edx",&DWP(1,"edx")); # i++ | ||
202 | &loop (&label("sub")); # doesn't affect CF! | ||
203 | |||
204 | &mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit | ||
205 | &sbb ("eax",0); | ||
206 | &and ("esi","eax"); | ||
207 | ¬ ("eax"); | ||
208 | &mov ("ebp","edi"); | ||
209 | &and ("ebp","eax"); | ||
210 | &or ("esi","ebp"); # tp=carry?tp:rp | ||
211 | |||
212 | &mov ("ecx","edx"); # num | ||
213 | &xor ("edx","edx"); # i=0 | ||
214 | |||
215 | &set_label("copy",8); | ||
216 | &mov ("eax",&DWP(0,"esi","edx",4)); | ||
217 | &mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp | ||
218 | &mov (&DWP(0,"edi","edx",4),"eax"); | ||
219 | &lea ("edx",&DWP(1,"edx")); # i++ | ||
220 | &loop (&label("copy")); | ||
221 | |||
222 | &mov ("ebp",$sp); | ||
223 | &xor ("eax","eax"); | ||
224 | |||
225 | &mov ("ecx",64/4); | ||
226 | &mov ("edi","esp"); # zap frame including scratch area | ||
227 | &data_byte(0xf3,0xab); # rep stosl, bzero | ||
228 | |||
229 | # zap copies of ap, bp and np | ||
230 | &lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap | ||
231 | &lea ("ecx",&DWP(3*$pad/4,"edx","edx",2)); | ||
232 | &data_byte(0xf3,0xab); # rep stosl, bzero | ||
233 | |||
234 | &mov ("esp","ebp"); | ||
235 | &inc ("eax"); # signal "done" | ||
236 | &popf (); | ||
237 | &set_label("leave"); | ||
238 | &function_end($func); | ||
239 | |||
240 | &asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>"); | ||
241 | |||
242 | &asm_finish(); | ||
diff --git a/src/lib/libcrypto/bn/asm/x86-mont.pl b/src/lib/libcrypto/bn/asm/x86-mont.pl new file mode 100755 index 0000000000..5cd3cd2ed5 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/x86-mont.pl | |||
@@ -0,0 +1,591 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # October 2005 | ||
11 | # | ||
12 | # This is "teaser" code, as it can be improved in several ways... | ||
13 | # First of all, a non-SSE2 path should be implemented (yes, for now it | ||
14 | # performs Montgomery multiplication/convolution only on SSE2-capable | ||
15 | # CPUs such as P4; others fall back to the original code). Then the inner loop | ||
16 | # can be unrolled and modulo-scheduled to improve ILP and possibly | ||
17 | # moved to the 128-bit XMM register bank (though that would require input | ||
18 | # rearrangement and/or increase bus bandwidth utilization). A dedicated | ||
19 | # squaring procedure should give a further performance improvement... | ||
20 | # Yet, even as a draft, the code improves the rsa512 *sign* benchmark by | ||
21 | # 110%(!), rsa1024 by 70% and rsa4096 by 20%:-) | ||
22 | |||
23 | # December 2006 | ||
24 | # | ||
25 | # Modulo-scheduling the SSE2 loops results in a further 15-20% improvement. | ||
26 | # The integer-only code [equipped with a dedicated squaring procedure] | ||
27 | # gains ~40% on the rsa512 sign benchmark... | ||
28 | |||
29 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
30 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
31 | require "x86asm.pl"; | ||
32 | |||
33 | &asm_init($ARGV[0],$0); | ||
34 | |||
35 | $sse2=0; | ||
36 | for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | ||
37 | |||
38 | &external_label("OPENSSL_ia32cap_P") if ($sse2); | ||
39 | |||
40 | &function_begin("bn_mul_mont"); | ||
41 | |||
42 | $i="edx"; | ||
43 | $j="ecx"; | ||
44 | $ap="esi"; $tp="esi"; # overlapping variables!!! | ||
45 | $rp="edi"; $bp="edi"; # overlapping variables!!! | ||
46 | $np="ebp"; | ||
47 | $num="ebx"; | ||
48 | |||
49 | $_num=&DWP(4*0,"esp"); # stack top layout | ||
50 | $_rp=&DWP(4*1,"esp"); | ||
51 | $_ap=&DWP(4*2,"esp"); | ||
52 | $_bp=&DWP(4*3,"esp"); | ||
53 | $_np=&DWP(4*4,"esp"); | ||
54 | $_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp"); | ||
55 | $_sp=&DWP(4*6,"esp"); | ||
56 | $_bpend=&DWP(4*7,"esp"); | ||
57 | $frame=32; # size of above frame rounded up to 16n | ||
58 | |||
59 | &xor ("eax","eax"); | ||
60 | &mov ("edi",&wparam(5)); # int num | ||
61 | &cmp ("edi",4); | ||
62 | &jl (&label("just_leave")); | ||
63 | |||
64 | &lea ("esi",&wparam(0)); # put aside pointer to argument block | ||
65 | &lea ("edx",&wparam(1)); # load ap | ||
66 | &mov ("ebp","esp"); # saved stack pointer! | ||
67 | &add ("edi",2); # extra two words on top of tp | ||
68 | &neg ("edi"); | ||
69 | &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2)) | ||
70 | &neg ("edi"); | ||
71 | |||
72 | # minimize cache contention by arranging a 2K window between the stack | ||
73 | # pointer and the ap argument [np is also a position-sensitive vector, | ||
74 | # but it's assumed to be near ap, as it's allocated at ~the same | ||
75 | # time]. | ||
76 | &mov ("eax","esp"); | ||
77 | &sub ("eax","edx"); | ||
78 | &and ("eax",2047); | ||
79 | &sub ("esp","eax"); # this aligns sp and ap modulo 2048 | ||
80 | |||
81 | &xor ("edx","esp"); | ||
82 | &and ("edx",2048); | ||
83 | &xor ("edx",2048); | ||
84 | &sub ("esp","edx"); # this splits them apart modulo 4096 | ||
85 | |||
86 | &and ("esp",-64); # align to cache line | ||
87 | |||
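A sketch of the pointer arithmetic above (the helper name is mine): given the prospective stack top and the location of the ap argument, make them congruent modulo 2048, then force them exactly 2048 apart modulo 4096, and finally round down to a 64-byte cache line.

sub separate_2k {
    my ($sp, $ap) = @_;                  # plain integers standing in for the two addresses
    $sp -= ($sp - $ap) & 2047;           # sp == ap (mod 2048)
    $sp -= (($ap ^ $sp) & 2048) ^ 2048;  # sp and ap now differ by 2048 (mod 4096)
    return $sp & ~63;                    # align to a 64-byte cache line
}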
88 | ################################# load argument block... | ||
89 | &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp | ||
90 | &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap | ||
91 | &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp | ||
92 | &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np | ||
93 | &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0 | ||
94 | #&mov ("edi",&DWP(5*4,"esi"));# int num | ||
95 | |||
96 | &mov ("esi",&DWP(0,"esi")); # pull n0[0] | ||
97 | &mov ($_rp,"eax"); # ... save a copy of argument block | ||
98 | &mov ($_ap,"ebx"); | ||
99 | &mov ($_bp,"ecx"); | ||
100 | &mov ($_np,"edx"); | ||
101 | &mov ($_n0,"esi"); | ||
102 | &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling | ||
103 | #&mov ($_num,$num); # redundant as $num is not reused | ||
104 | &mov ($_sp,"ebp"); # saved stack pointer! | ||
105 | |||
106 | if($sse2) { | ||
107 | $acc0="mm0"; # mmx register bank layout | ||
108 | $acc1="mm1"; | ||
109 | $car0="mm2"; | ||
110 | $car1="mm3"; | ||
111 | $mul0="mm4"; | ||
112 | $mul1="mm5"; | ||
113 | $temp="mm6"; | ||
114 | $mask="mm7"; | ||
115 | |||
116 | &picmeup("eax","OPENSSL_ia32cap_P"); | ||
117 | &bt (&DWP(0,"eax"),26); | ||
118 | &jnc (&label("non_sse2")); | ||
119 | |||
120 | &mov ("eax",-1); | ||
121 | &movd ($mask,"eax"); # mask 32 lower bits | ||
122 | |||
123 | &mov ($ap,$_ap); # load input pointers | ||
124 | &mov ($bp,$_bp); | ||
125 | &mov ($np,$_np); | ||
126 | |||
127 | &xor ($i,$i); # i=0 | ||
128 | &xor ($j,$j); # j=0 | ||
129 | |||
130 | &movd ($mul0,&DWP(0,$bp)); # bp[0] | ||
131 | &movd ($mul1,&DWP(0,$ap)); # ap[0] | ||
132 | &movd ($car1,&DWP(0,$np)); # np[0] | ||
133 | |||
134 | &pmuludq($mul1,$mul0); # ap[0]*bp[0] | ||
135 | &movq ($car0,$mul1); | ||
136 | &movq ($acc0,$mul1); # I wish movd worked for | ||
137 | &pand ($acc0,$mask); # inter-register transfers | ||
138 | |||
139 | &pmuludq($mul1,$_n0q); # *=n0 | ||
140 | |||
141 | &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0 | ||
142 | &paddq ($car1,$acc0); | ||
143 | |||
144 | &movd ($acc1,&DWP(4,$np)); # np[1] | ||
145 | &movd ($acc0,&DWP(4,$ap)); # ap[1] | ||
146 | |||
147 | &psrlq ($car0,32); | ||
148 | &psrlq ($car1,32); | ||
149 | |||
150 | &inc ($j); # j++ | ||
151 | &set_label("1st",16); | ||
152 | &pmuludq($acc0,$mul0); # ap[j]*bp[0] | ||
153 | &pmuludq($acc1,$mul1); # np[j]*m1 | ||
154 | &paddq ($car0,$acc0); # +=c0 | ||
155 | &paddq ($car1,$acc1); # +=c1 | ||
156 | |||
157 | &movq ($acc0,$car0); | ||
158 | &pand ($acc0,$mask); | ||
159 | &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] | ||
160 | &paddq ($car1,$acc0); # +=ap[j]*bp[0]; | ||
161 | &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] | ||
162 | &psrlq ($car0,32); | ||
163 | &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]= | ||
164 | &psrlq ($car1,32); | ||
165 | |||
166 | &lea ($j,&DWP(1,$j)); | ||
167 | &cmp ($j,$num); | ||
168 | &jl (&label("1st")); | ||
169 | |||
170 | &pmuludq($acc0,$mul0); # ap[num-1]*bp[0] | ||
171 | &pmuludq($acc1,$mul1); # np[num-1]*m1 | ||
172 | &paddq ($car0,$acc0); # +=c0 | ||
173 | &paddq ($car1,$acc1); # +=c1 | ||
174 | |||
175 | &movq ($acc0,$car0); | ||
176 | &pand ($acc0,$mask); | ||
177 | &paddq ($car1,$acc0); # +=ap[num-1]*bp[0]; | ||
178 | &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= | ||
179 | |||
180 | &psrlq ($car0,32); | ||
181 | &psrlq ($car1,32); | ||
182 | |||
183 | &paddq ($car1,$car0); | ||
184 | &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] | ||
185 | |||
186 | &inc ($i); # i++ | ||
187 | &set_label("outer"); | ||
188 | &xor ($j,$j); # j=0 | ||
189 | |||
190 | &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i] | ||
191 | &movd ($mul1,&DWP(0,$ap)); # ap[0] | ||
192 | &movd ($temp,&DWP($frame,"esp")); # tp[0] | ||
193 | &movd ($car1,&DWP(0,$np)); # np[0] | ||
194 | &pmuludq($mul1,$mul0); # ap[0]*bp[i] | ||
195 | |||
196 | &paddq ($mul1,$temp); # +=tp[0] | ||
197 | &movq ($acc0,$mul1); | ||
198 | &movq ($car0,$mul1); | ||
199 | &pand ($acc0,$mask); | ||
200 | |||
201 | &pmuludq($mul1,$_n0q); # *=n0 | ||
202 | |||
203 | &pmuludq($car1,$mul1); | ||
204 | &paddq ($car1,$acc0); | ||
205 | |||
206 | &movd ($temp,&DWP($frame+4,"esp")); # tp[1] | ||
207 | &movd ($acc1,&DWP(4,$np)); # np[1] | ||
208 | &movd ($acc0,&DWP(4,$ap)); # ap[1] | ||
209 | |||
210 | &psrlq ($car0,32); | ||
211 | &psrlq ($car1,32); | ||
212 | &paddq ($car0,$temp); # +=tp[1] | ||
213 | |||
214 | &inc ($j); # j++ | ||
215 | &dec ($num); | ||
216 | &set_label("inner"); | ||
217 | &pmuludq($acc0,$mul0); # ap[j]*bp[i] | ||
218 | &pmuludq($acc1,$mul1); # np[j]*m1 | ||
219 | &paddq ($car0,$acc0); # +=c0 | ||
220 | &paddq ($car1,$acc1); # +=c1 | ||
221 | |||
222 | &movq ($acc0,$car0); | ||
223 | &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1] | ||
224 | &pand ($acc0,$mask); | ||
225 | &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] | ||
226 | &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j] | ||
227 | &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] | ||
228 | &psrlq ($car0,32); | ||
229 | &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]= | ||
230 | &psrlq ($car1,32); | ||
231 | &paddq ($car0,$temp); # +=tp[j+1] | ||
232 | |||
233 | &dec ($num); | ||
234 | &lea ($j,&DWP(1,$j)); # j++ | ||
235 | &jnz (&label("inner")); | ||
236 | |||
237 | &mov ($num,$j); | ||
238 | &pmuludq($acc0,$mul0); # ap[num-1]*bp[i] | ||
239 | &pmuludq($acc1,$mul1); # np[num-1]*m1 | ||
240 | &paddq ($car0,$acc0); # +=c0 | ||
241 | &paddq ($car1,$acc1); # +=c1 | ||
242 | |||
243 | &movq ($acc0,$car0); | ||
244 | &pand ($acc0,$mask); | ||
245 | &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1] | ||
246 | &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= | ||
247 | &psrlq ($car0,32); | ||
248 | &psrlq ($car1,32); | ||
249 | |||
250 | &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num] | ||
251 | &paddq ($car1,$car0); | ||
252 | &paddq ($car1,$temp); | ||
253 | &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] | ||
254 | |||
255 | &lea ($i,&DWP(1,$i)); # i++ | ||
256 | &cmp ($i,$num); | ||
257 | &jle (&label("outer")); | ||
258 | |||
259 | &emms (); # done with mmx bank | ||
260 | &jmp (&label("common_tail")); | ||
261 | |||
262 | &set_label("non_sse2",16); | ||
263 | } | ||
264 | |||
265 | if (0) { | ||
266 | &mov ("esp",$_sp); | ||
267 | &xor ("eax","eax"); # signal "not fast enough [yet]" | ||
268 | &jmp (&label("just_leave")); | ||
269 | # While the code below provides competitive performance for | ||
270 | # all key lengths on modern Intel cores, it's still more | ||
271 | # than 10% slower for a 4096-bit key elsewhere:-( "Competitive" | ||
272 | # means compared to the original integer-only assembler. | ||
273 | # 512-bit RSA sign is better by ~40%, but that's about all | ||
274 | # one can say about all CPUs... | ||
275 | } else { | ||
276 | $inp="esi"; # integer path uses these registers differently | ||
277 | $word="edi"; | ||
278 | $carry="ebp"; | ||
279 | |||
280 | &mov ($inp,$_ap); | ||
281 | &lea ($carry,&DWP(1,$num)); | ||
282 | &mov ($word,$_bp); | ||
283 | &xor ($j,$j); # j=0 | ||
284 | &mov ("edx",$inp); | ||
285 | &and ($carry,1); # see if num is even | ||
286 | &sub ("edx",$word); # see if ap==bp | ||
287 | &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num] | ||
288 | &or ($carry,"edx"); | ||
289 | &mov ($word,&DWP(0,$word)); # bp[0] | ||
290 | &jz (&label("bn_sqr_mont")); | ||
291 | &mov ($_bpend,"eax"); | ||
292 | &mov ("eax",&DWP(0,$inp)); | ||
293 | &xor ("edx","edx"); | ||
294 | |||
295 | &set_label("mull",16); | ||
296 | &mov ($carry,"edx"); | ||
297 | &mul ($word); # ap[j]*bp[0] | ||
298 | &add ($carry,"eax"); | ||
299 | &lea ($j,&DWP(1,$j)); | ||
300 | &adc ("edx",0); | ||
301 | &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1] | ||
302 | &cmp ($j,$num); | ||
303 | &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= | ||
304 | &jl (&label("mull")); | ||
305 | |||
306 | &mov ($carry,"edx"); | ||
307 | &mul ($word); # ap[num-1]*bp[0] | ||
308 | &mov ($word,$_n0); | ||
309 | &add ("eax",$carry); | ||
310 | &mov ($inp,$_np); | ||
311 | &adc ("edx",0); | ||
312 | &imul ($word,&DWP($frame,"esp")); # n0*tp[0] | ||
313 | |||
314 | &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]= | ||
315 | &xor ($j,$j); | ||
316 | &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= | ||
317 | &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= | ||
318 | |||
319 | &mov ("eax",&DWP(0,$inp)); # np[0] | ||
320 | &mul ($word); # np[0]*m | ||
321 | &add ("eax",&DWP($frame,"esp")); # +=tp[0] | ||
322 | &mov ("eax",&DWP(4,$inp)); # np[1] | ||
323 | &adc ("edx",0); | ||
324 | &inc ($j); | ||
325 | |||
326 | &jmp (&label("2ndmadd")); | ||
327 | |||
328 | &set_label("1stmadd",16); | ||
329 | &mov ($carry,"edx"); | ||
330 | &mul ($word); # ap[j]*bp[i] | ||
331 | &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] | ||
332 | &lea ($j,&DWP(1,$j)); | ||
333 | &adc ("edx",0); | ||
334 | &add ($carry,"eax"); | ||
335 | &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1] | ||
336 | &adc ("edx",0); | ||
337 | &cmp ($j,$num); | ||
338 | &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= | ||
339 | &jl (&label("1stmadd")); | ||
340 | |||
341 | &mov ($carry,"edx"); | ||
342 | &mul ($word); # ap[num-1]*bp[i] | ||
343 | &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1] | ||
344 | &mov ($word,$_n0); | ||
345 | &adc ("edx",0); | ||
346 | &mov ($inp,$_np); | ||
347 | &add ($carry,"eax"); | ||
348 | &adc ("edx",0); | ||
349 | &imul ($word,&DWP($frame,"esp")); # n0*tp[0] | ||
350 | |||
351 | &xor ($j,$j); | ||
352 | &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] | ||
353 | &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]= | ||
354 | &adc ($j,0); | ||
355 | &mov ("eax",&DWP(0,$inp)); # np[0] | ||
356 | &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= | ||
357 | &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= | ||
358 | |||
359 | &mul ($word); # np[0]*m | ||
360 | &add ("eax",&DWP($frame,"esp")); # +=tp[0] | ||
361 | &mov ("eax",&DWP(4,$inp)); # np[1] | ||
362 | &adc ("edx",0); | ||
363 | &mov ($j,1); | ||
364 | |||
365 | &set_label("2ndmadd",16); | ||
366 | &mov ($carry,"edx"); | ||
367 | &mul ($word); # np[j]*m | ||
368 | &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] | ||
369 | &lea ($j,&DWP(1,$j)); | ||
370 | &adc ("edx",0); | ||
371 | &add ($carry,"eax"); | ||
372 | &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1] | ||
373 | &adc ("edx",0); | ||
374 | &cmp ($j,$num); | ||
375 | &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]= | ||
376 | &jl (&label("2ndmadd")); | ||
377 | |||
378 | &mov ($carry,"edx"); | ||
379 | &mul ($word); # np[j]*m | ||
380 | &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1] | ||
381 | &adc ("edx",0); | ||
382 | &add ($carry,"eax"); | ||
383 | &adc ("edx",0); | ||
384 | &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]= | ||
385 | |||
386 | &xor ("eax","eax"); | ||
387 | &mov ($j,$_bp); # &bp[i] | ||
388 | &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] | ||
389 | &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] | ||
390 | &lea ($j,&DWP(4,$j)); | ||
391 | &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= | ||
392 | &cmp ($j,$_bpend); | ||
393 | &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= | ||
394 | &je (&label("common_tail")); | ||
395 | |||
396 | &mov ($word,&DWP(0,$j)); # bp[i+1] | ||
397 | &mov ($inp,$_ap); | ||
398 | &mov ($_bp,$j); # &bp[++i] | ||
399 | &xor ($j,$j); | ||
400 | &xor ("edx","edx"); | ||
401 | &mov ("eax",&DWP(0,$inp)); | ||
402 | &jmp (&label("1stmadd")); | ||
403 | |||
404 | &set_label("bn_sqr_mont",16); | ||
405 | $sbit=$num; | ||
406 | &mov ($_num,$num); | ||
407 | &mov ($_bp,$j); # i=0 | ||
408 | |||
409 | &mov ("eax",$word); # ap[0] | ||
410 | &mul ($word); # ap[0]*ap[0] | ||
411 | &mov (&DWP($frame,"esp"),"eax"); # tp[0]= | ||
412 | &mov ($sbit,"edx"); | ||
413 | &shr ("edx",1); | ||
414 | &and ($sbit,1); | ||
415 | &inc ($j); | ||
416 | &set_label("sqr",16); | ||
417 | &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] | ||
418 | &mov ($carry,"edx"); | ||
419 | &mul ($word); # ap[j]*ap[0] | ||
420 | &add ("eax",$carry); | ||
421 | &lea ($j,&DWP(1,$j)); | ||
422 | &adc ("edx",0); | ||
423 | &lea ($carry,&DWP(0,$sbit,"eax",2)); | ||
424 | &shr ("eax",31); | ||
425 | &cmp ($j,$_num); | ||
426 | &mov ($sbit,"eax"); | ||
427 | &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= | ||
428 | &jl (&label("sqr")); | ||
429 | |||
430 | &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1] | ||
431 | &mov ($carry,"edx"); | ||
432 | &mul ($word); # ap[num-1]*ap[0] | ||
433 | &add ("eax",$carry); | ||
434 | &mov ($word,$_n0); | ||
435 | &adc ("edx",0); | ||
436 | &mov ($inp,$_np); | ||
437 | &lea ($carry,&DWP(0,$sbit,"eax",2)); | ||
438 | &imul ($word,&DWP($frame,"esp")); # n0*tp[0] | ||
439 | &shr ("eax",31); | ||
440 | &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]= | ||
441 | |||
442 | &lea ($carry,&DWP(0,"eax","edx",2)); | ||
443 | &mov ("eax",&DWP(0,$inp)); # np[0] | ||
444 | &shr ("edx",31); | ||
445 | &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]= | ||
446 | &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]= | ||
447 | |||
448 | &mul ($word); # np[0]*m | ||
449 | &add ("eax",&DWP($frame,"esp")); # +=tp[0] | ||
450 | &mov ($num,$j); | ||
451 | &adc ("edx",0); | ||
452 | &mov ("eax",&DWP(4,$inp)); # np[1] | ||
453 | &mov ($j,1); | ||
454 | |||
455 | &set_label("3rdmadd",16); | ||
456 | &mov ($carry,"edx"); | ||
457 | &mul ($word); # np[j]*m | ||
458 | &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] | ||
459 | &adc ("edx",0); | ||
460 | &add ($carry,"eax"); | ||
461 | &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1] | ||
462 | &adc ("edx",0); | ||
463 | &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]= | ||
464 | |||
465 | &mov ($carry,"edx"); | ||
466 | &mul ($word); # np[j+1]*m | ||
467 | &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1] | ||
468 | &lea ($j,&DWP(2,$j)); | ||
469 | &adc ("edx",0); | ||
470 | &add ($carry,"eax"); | ||
471 | &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2] | ||
472 | &adc ("edx",0); | ||
473 | &cmp ($j,$num); | ||
474 | &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]= | ||
475 | &jl (&label("3rdmadd")); | ||
476 | |||
477 | &mov ($carry,"edx"); | ||
478 | &mul ($word); # np[j]*m | ||
479 | &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1] | ||
480 | &adc ("edx",0); | ||
481 | &add ($carry,"eax"); | ||
482 | &adc ("edx",0); | ||
483 | &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]= | ||
484 | |||
485 | &mov ($j,$_bp); # i | ||
486 | &xor ("eax","eax"); | ||
487 | &mov ($inp,$_ap); | ||
488 | &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] | ||
489 | &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] | ||
490 | &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= | ||
491 | &cmp ($j,$num); | ||
492 | &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= | ||
493 | &je (&label("common_tail")); | ||
494 | |||
495 | &mov ($word,&DWP(4,$inp,$j,4)); # ap[i] | ||
496 | &lea ($j,&DWP(1,$j)); | ||
497 | &mov ("eax",$word); | ||
498 | &mov ($_bp,$j); # ++i | ||
499 | &mul ($word); # ap[i]*ap[i] | ||
500 | &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i] | ||
501 | &adc ("edx",0); | ||
502 | &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]= | ||
503 | &xor ($carry,$carry); | ||
504 | &cmp ($j,$num); | ||
505 | &lea ($j,&DWP(1,$j)); | ||
506 | &je (&label("sqrlast")); | ||
507 | |||
508 | &mov ($sbit,"edx"); # zaps $num | ||
509 | &shr ("edx",1); | ||
510 | &and ($sbit,1); | ||
511 | &set_label("sqradd",16); | ||
512 | &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] | ||
513 | &mov ($carry,"edx"); | ||
514 | &mul ($word); # ap[j]*ap[i] | ||
515 | &add ("eax",$carry); | ||
516 | &lea ($carry,&DWP(0,"eax","eax")); | ||
517 | &adc ("edx",0); | ||
518 | &shr ("eax",31); | ||
519 | &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] | ||
520 | &lea ($j,&DWP(1,$j)); | ||
521 | &adc ("eax",0); | ||
522 | &add ($carry,$sbit); | ||
523 | &adc ("eax",0); | ||
524 | &cmp ($j,$_num); | ||
525 | &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= | ||
526 | &mov ($sbit,"eax"); | ||
527 | &jle (&label("sqradd")); | ||
528 | |||
529 | &mov ($carry,"edx"); | ||
530 | &lea ("edx",&DWP(0,$sbit,"edx",2)); | ||
531 | &shr ($carry,31); | ||
532 | &set_label("sqrlast"); | ||
533 | &mov ($word,$_n0); | ||
534 | &mov ($inp,$_np); | ||
535 | &imul ($word,&DWP($frame,"esp")); # n0*tp[0] | ||
536 | |||
537 | &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num] | ||
538 | &mov ("eax",&DWP(0,$inp)); # np[0] | ||
539 | &adc ($carry,0); | ||
540 | &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]= | ||
541 | &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]= | ||
542 | |||
543 | &mul ($word); # np[0]*m | ||
544 | &add ("eax",&DWP($frame,"esp")); # +=tp[0] | ||
545 | &lea ($num,&DWP(-1,$j)); | ||
546 | &adc ("edx",0); | ||
547 | &mov ($j,1); | ||
548 | &mov ("eax",&DWP(4,$inp)); # np[1] | ||
549 | |||
550 | &jmp (&label("3rdmadd")); | ||
551 | } | ||
552 | |||
553 | &set_label("common_tail",16); | ||
554 | &mov ($np,$_np); # load modulus pointer | ||
555 | &mov ($rp,$_rp); # load result pointer | ||
556 | &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped] | ||
557 | |||
558 | &mov ("eax",&DWP(0,$tp)); # tp[0] | ||
559 | &mov ($j,$num); # j=num-1 | ||
560 | &xor ($i,$i); # i=0 and clear CF! | ||
561 | |||
562 | &set_label("sub",16); | ||
563 | &sbb ("eax",&DWP(0,$np,$i,4)); | ||
564 | &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i] | ||
565 | &dec ($j); # doesn't affect CF! | ||
566 | &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1] | ||
567 | &lea ($i,&DWP(1,$i)); # i++ | ||
568 | &jge (&label("sub")); | ||
569 | |||
570 | &sbb ("eax",0); # handle upmost overflow bit | ||
571 | &and ($tp,"eax"); | ||
572 | ¬ ("eax"); | ||
573 | &mov ($np,$rp); | ||
574 | &and ($np,"eax"); | ||
575 | &or ($tp,$np); # tp=carry?tp:rp | ||
576 | |||
577 | &set_label("copy",16); # copy or in-place refresh | ||
578 | &mov ("eax",&DWP(0,$tp,$num,4)); | ||
579 | &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i] | ||
580 | &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector | ||
581 | &dec ($num); | ||
582 | &jge (&label("copy")); | ||
583 | |||
584 | &mov ("esp",$_sp); # pull saved stack pointer | ||
585 | &mov ("eax",1); | ||
586 | &set_label("just_leave"); | ||
587 | &function_end("bn_mul_mont"); | ||
588 | |||
589 | &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>"); | ||
590 | |||
591 | &asm_finish(); | ||