Diffstat (limited to 'src')
-rwxr-xr-x  src/lib/libcrypto/bn/asm/x86_64-mont5.pl | 513
1 file changed, 314 insertions, 199 deletions
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont5.pl b/src/lib/libcrypto/bn/asm/x86_64-mont5.pl
index 81e5c53728..bb7ad4c4b7 100755
--- a/src/lib/libcrypto/bn/asm/x86_64-mont5.pl
+++ b/src/lib/libcrypto/bn/asm/x86_64-mont5.pl
@@ -66,60 +66,113 @@ bn_mul_mont_gather5:
66 | .align 16 | 66 | .align 16 |
67 | .Lmul_enter: | 67 | .Lmul_enter: |
68 | mov ${num}d,${num}d | 68 | mov ${num}d,${num}d |
69 | mov `($win64?56:8)`(%rsp),%r10d # load 7th argument | 69 | movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument |
70 | lea .Linc(%rip),%r10 | ||
70 | push %rbx | 71 | push %rbx |
71 | push %rbp | 72 | push %rbp |
72 | push %r12 | 73 | push %r12 |
73 | push %r13 | 74 | push %r13 |
74 | push %r14 | 75 | push %r14 |
75 | push %r15 | 76 | push %r15 |
76 | ___ | 77 | |
77 | $code.=<<___ if ($win64); | ||
78 | lea -0x28(%rsp),%rsp | ||
79 | movaps %xmm6,(%rsp) | ||
80 | movaps %xmm7,0x10(%rsp) | ||
81 | .Lmul_alloca: | 78 | .Lmul_alloca: |
82 | ___ | ||
83 | $code.=<<___; | ||
84 | mov %rsp,%rax | 79 | mov %rsp,%rax |
85 | lea 2($num),%r11 | 80 | lea 2($num),%r11 |
86 | neg %r11 | 81 | neg %r11 |
87 | lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)) | 82 | lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8) |
88 | and \$-1024,%rsp # minimize TLB usage | 83 | and \$-1024,%rsp # minimize TLB usage |
89 | 84 | ||
90 | mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp | 85 | mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp |
91 | .Lmul_body: | 86 | .Lmul_body: |
92 | mov $bp,%r12 # reassign $bp | 87 | lea 128($bp),%r12 # reassign $bp (+size optimization) |
93 | ___ | 88 | ___ |
94 | $bp="%r12"; | 89 | $bp="%r12"; |
95 | $STRIDE=2**5*8; # 5 is "window size" | 90 | $STRIDE=2**5*8; # 5 is "window size" |
96 | $N=$STRIDE/4; # should match cache line size | 91 | $N=$STRIDE/4; # should match cache line size |
97 | $code.=<<___; | 92 | $code.=<<___; |
98 | mov %r10,%r11 | 93 | movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 |
99 | shr \$`log($N/8)/log(2)`,%r10 | 94 | movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 |
100 | and \$`$N/8-1`,%r11 | 95 | lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization) |
101 | not %r10 | 96 | and \$-16,%r10 |
102 | lea .Lmagic_masks(%rip),%rax | 97 | |
103 | and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" | 98 | pshufd \$0,%xmm5,%xmm5 # broadcast index |
104 | lea 96($bp,%r11,8),$bp # pointer within 1st cache line | 99 | movdqa %xmm1,%xmm4 |
105 | movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which | 100 | movdqa %xmm1,%xmm2 |
106 | movq 8(%rax,%r10,8),%xmm5 # cache line contains element | 101 | ___ |
107 | movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument | 102 | ######################################################################## |
108 | movq 24(%rax,%r10,8),%xmm7 | 103 | # calculate mask by comparing 0..31 to index and save result to stack |
109 | 104 | # | |
110 | movq `0*$STRIDE/4-96`($bp),%xmm0 | 105 | $code.=<<___; |
111 | movq `1*$STRIDE/4-96`($bp),%xmm1 | 106 | paddd %xmm0,%xmm1 |
112 | pand %xmm4,%xmm0 | 107 | pcmpeqd %xmm5,%xmm0 # compare to 1,0 |
113 | movq `2*$STRIDE/4-96`($bp),%xmm2 | 108 | .byte 0x67 |
114 | pand %xmm5,%xmm1 | 109 | movdqa %xmm4,%xmm3 |
115 | movq `3*$STRIDE/4-96`($bp),%xmm3 | 110 | ___ |
116 | pand %xmm6,%xmm2 | 111 | for($k=0;$k<$STRIDE/16-4;$k+=4) { |
117 | por %xmm1,%xmm0 | 112 | $code.=<<___; |
118 | pand %xmm7,%xmm3 | 113 | paddd %xmm1,%xmm2 |
114 | pcmpeqd %xmm5,%xmm1 # compare to 3,2 | ||
115 | movdqa %xmm0,`16*($k+0)+112`(%r10) | ||
116 | movdqa %xmm4,%xmm0 | ||
117 | |||
118 | paddd %xmm2,%xmm3 | ||
119 | pcmpeqd %xmm5,%xmm2 # compare to 5,4 | ||
120 | movdqa %xmm1,`16*($k+1)+112`(%r10) | ||
121 | movdqa %xmm4,%xmm1 | ||
122 | |||
123 | paddd %xmm3,%xmm0 | ||
124 | pcmpeqd %xmm5,%xmm3 # compare to 7,6 | ||
125 | movdqa %xmm2,`16*($k+2)+112`(%r10) | ||
126 | movdqa %xmm4,%xmm2 | ||
127 | |||
128 | paddd %xmm0,%xmm1 | ||
129 | pcmpeqd %xmm5,%xmm0 | ||
130 | movdqa %xmm3,`16*($k+3)+112`(%r10) | ||
131 | movdqa %xmm4,%xmm3 | ||
132 | ___ | ||
133 | } | ||
134 | $code.=<<___; # last iteration can be optimized | ||
135 | paddd %xmm1,%xmm2 | ||
136 | pcmpeqd %xmm5,%xmm1 | ||
137 | movdqa %xmm0,`16*($k+0)+112`(%r10) | ||
138 | |||
139 | paddd %xmm2,%xmm3 | ||
140 | .byte 0x67 | ||
141 | pcmpeqd %xmm5,%xmm2 | ||
142 | movdqa %xmm1,`16*($k+1)+112`(%r10) | ||
143 | |||
144 | pcmpeqd %xmm5,%xmm3 | ||
145 | movdqa %xmm2,`16*($k+2)+112`(%r10) | ||
146 | pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register | ||
147 | |||
148 | pand `16*($k+1)-128`($bp),%xmm1 | ||
149 | pand `16*($k+2)-128`($bp),%xmm2 | ||
150 | movdqa %xmm3,`16*($k+3)+112`(%r10) | ||
151 | pand `16*($k+3)-128`($bp),%xmm3 | ||
152 | por %xmm2,%xmm0 | ||
153 | por %xmm3,%xmm1 | ||
154 | ___ | ||
155 | for($k=0;$k<$STRIDE/16-4;$k+=4) { | ||
156 | $code.=<<___; | ||
157 | movdqa `16*($k+0)-128`($bp),%xmm4 | ||
158 | movdqa `16*($k+1)-128`($bp),%xmm5 | ||
159 | movdqa `16*($k+2)-128`($bp),%xmm2 | ||
160 | pand `16*($k+0)+112`(%r10),%xmm4 | ||
161 | movdqa `16*($k+3)-128`($bp),%xmm3 | ||
162 | pand `16*($k+1)+112`(%r10),%xmm5 | ||
163 | por %xmm4,%xmm0 | ||
164 | pand `16*($k+2)+112`(%r10),%xmm2 | ||
165 | por %xmm5,%xmm1 | ||
166 | pand `16*($k+3)+112`(%r10),%xmm3 | ||
119 | por %xmm2,%xmm0 | 167 | por %xmm2,%xmm0 |
168 | por %xmm3,%xmm1 | ||
169 | ___ | ||
170 | } | ||
171 | $code.=<<___; | ||
172 | por %xmm1,%xmm0 | ||
173 | pshufd \$0x4e,%xmm0,%xmm1 | ||
174 | por %xmm1,%xmm0 | ||
120 | lea $STRIDE($bp),$bp | 175 | lea $STRIDE($bp),$bp |
121 | por %xmm3,%xmm0 | ||
122 | |||
123 | movd %xmm0,$m0 # m0=bp[0] | 176 | movd %xmm0,$m0 # m0=bp[0] |
124 | 177 | ||
125 | mov ($n0),$n0 # pull n0[0] value | 178 | mov ($n0),$n0 # pull n0[0] value |
@@ -128,29 +181,14 @@ $code.=<<___;
128 | xor $i,$i # i=0 | 181 | xor $i,$i # i=0 |
129 | xor $j,$j # j=0 | 182 | xor $j,$j # j=0 |
130 | 183 | ||
131 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
132 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
133 | pand %xmm4,%xmm0 | ||
134 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
135 | pand %xmm5,%xmm1 | ||
136 | |||
137 | mov $n0,$m1 | 184 | mov $n0,$m1 |
138 | mulq $m0 # ap[0]*bp[0] | 185 | mulq $m0 # ap[0]*bp[0] |
139 | mov %rax,$lo0 | 186 | mov %rax,$lo0 |
140 | mov ($np),%rax | 187 | mov ($np),%rax |
141 | 188 | ||
142 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
143 | pand %xmm6,%xmm2 | ||
144 | por %xmm1,%xmm0 | ||
145 | pand %xmm7,%xmm3 | ||
146 | |||
147 | imulq $lo0,$m1 # "tp[0]"*n0 | 189 | imulq $lo0,$m1 # "tp[0]"*n0 |
148 | mov %rdx,$hi0 | 190 | mov %rdx,$hi0 |
149 | 191 | ||
150 | por %xmm2,%xmm0 | ||
151 | lea $STRIDE($bp),$bp | ||
152 | por %xmm3,%xmm0 | ||
153 | |||
154 | mulq $m1 # np[0]*m1 | 192 | mulq $m1 # np[0]*m1 |
155 | add %rax,$lo0 # discarded | 193 | add %rax,$lo0 # discarded |
156 | mov 8($ap),%rax | 194 | mov 8($ap),%rax |
@@ -183,8 +221,6 @@ $code.=<<___;
183 | cmp $num,$j | 221 | cmp $num,$j |
184 | jl .L1st | 222 | jl .L1st |
185 | 223 | ||
186 | movd %xmm0,$m0 # bp[1] | ||
187 | |||
188 | add %rax,$hi1 | 224 | add %rax,$hi1 |
189 | mov ($ap),%rax # ap[0] | 225 | mov ($ap),%rax # ap[0] |
190 | adc \$0,%rdx | 226 | adc \$0,%rdx |
@@ -204,33 +240,46 @@ $code.=<<___;
204 | jmp .Louter | 240 | jmp .Louter |
205 | .align 16 | 241 | .align 16 |
206 | .Louter: | 242 | .Louter: |
243 | lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) | ||
244 | and \$-16,%rdx | ||
245 | pxor %xmm4,%xmm4 | ||
246 | pxor %xmm5,%xmm5 | ||
247 | ___ | ||
248 | for($k=0;$k<$STRIDE/16;$k+=4) { | ||
249 | $code.=<<___; | ||
250 | movdqa `16*($k+0)-128`($bp),%xmm0 | ||
251 | movdqa `16*($k+1)-128`($bp),%xmm1 | ||
252 | movdqa `16*($k+2)-128`($bp),%xmm2 | ||
253 | movdqa `16*($k+3)-128`($bp),%xmm3 | ||
254 | pand `16*($k+0)-128`(%rdx),%xmm0 | ||
255 | pand `16*($k+1)-128`(%rdx),%xmm1 | ||
256 | por %xmm0,%xmm4 | ||
257 | pand `16*($k+2)-128`(%rdx),%xmm2 | ||
258 | por %xmm1,%xmm5 | ||
259 | pand `16*($k+3)-128`(%rdx),%xmm3 | ||
260 | por %xmm2,%xmm4 | ||
261 | por %xmm3,%xmm5 | ||
262 | ___ | ||
263 | } | ||
264 | $code.=<<___; | ||
265 | por %xmm5,%xmm4 | ||
266 | pshufd \$0x4e,%xmm4,%xmm0 | ||
267 | por %xmm4,%xmm0 | ||
268 | lea $STRIDE($bp),$bp | ||
269 | movd %xmm0,$m0 # m0=bp[i] | ||
270 | |||
207 | xor $j,$j # j=0 | 271 | xor $j,$j # j=0 |
208 | mov $n0,$m1 | 272 | mov $n0,$m1 |
209 | mov (%rsp),$lo0 | 273 | mov (%rsp),$lo0 |
210 | 274 | ||
211 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
212 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
213 | pand %xmm4,%xmm0 | ||
214 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
215 | pand %xmm5,%xmm1 | ||
216 | |||
217 | mulq $m0 # ap[0]*bp[i] | 275 | mulq $m0 # ap[0]*bp[i] |
218 | add %rax,$lo0 # ap[0]*bp[i]+tp[0] | 276 | add %rax,$lo0 # ap[0]*bp[i]+tp[0] |
219 | mov ($np),%rax | 277 | mov ($np),%rax |
220 | adc \$0,%rdx | 278 | adc \$0,%rdx |
221 | 279 | ||
222 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
223 | pand %xmm6,%xmm2 | ||
224 | por %xmm1,%xmm0 | ||
225 | pand %xmm7,%xmm3 | ||
226 | |||
227 | imulq $lo0,$m1 # tp[0]*n0 | 280 | imulq $lo0,$m1 # tp[0]*n0 |
228 | mov %rdx,$hi0 | 281 | mov %rdx,$hi0 |
229 | 282 | ||
230 | por %xmm2,%xmm0 | ||
231 | lea $STRIDE($bp),$bp | ||
232 | por %xmm3,%xmm0 | ||
233 | |||
234 | mulq $m1 # np[0]*m1 | 283 | mulq $m1 # np[0]*m1 |
235 | add %rax,$lo0 # discarded | 284 | add %rax,$lo0 # discarded |
236 | mov 8($ap),%rax | 285 | mov 8($ap),%rax |
@@ -266,8 +315,6 @@ $code.=<<___;
266 | cmp $num,$j | 315 | cmp $num,$j |
267 | jl .Linner | 316 | jl .Linner |
268 | 317 | ||
269 | movd %xmm0,$m0 # bp[i+1] | ||
270 | |||
271 | add %rax,$hi1 | 318 | add %rax,$hi1 |
272 | mov ($ap),%rax # ap[0] | 319 | mov ($ap),%rax # ap[0] |
273 | adc \$0,%rdx | 320 | adc \$0,%rdx |
@@ -321,13 +368,7 @@ $code.=<<___;
321 | 368 | ||
322 | mov 8(%rsp,$num,8),%rsi # restore %rsp | 369 | mov 8(%rsp,$num,8),%rsi # restore %rsp |
323 | mov \$1,%rax | 370 | mov \$1,%rax |
324 | ___ | 371 | |
325 | $code.=<<___ if ($win64); | ||
326 | movaps (%rsi),%xmm6 | ||
327 | movaps 0x10(%rsi),%xmm7 | ||
328 | lea 0x28(%rsi),%rsi | ||
329 | ___ | ||
330 | $code.=<<___; | ||
331 | mov (%rsi),%r15 | 372 | mov (%rsi),%r15 |
332 | mov 8(%rsi),%r14 | 373 | mov 8(%rsi),%r14 |
333 | mov 16(%rsi),%r13 | 374 | mov 16(%rsi),%r13 |
@@ -348,91 +389,130 @@ $code.=<<___;
348 | bn_mul4x_mont_gather5: | 389 | bn_mul4x_mont_gather5: |
349 | .Lmul4x_enter: | 390 | .Lmul4x_enter: |
350 | mov ${num}d,${num}d | 391 | mov ${num}d,${num}d |
351 | mov `($win64?56:8)`(%rsp),%r10d # load 7th argument | 392 | movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument |
393 | lea .Linc(%rip),%r10 | ||
352 | push %rbx | 394 | push %rbx |
353 | push %rbp | 395 | push %rbp |
354 | push %r12 | 396 | push %r12 |
355 | push %r13 | 397 | push %r13 |
356 | push %r14 | 398 | push %r14 |
357 | push %r15 | 399 | push %r15 |
358 | ___ | 400 | |
359 | $code.=<<___ if ($win64); | ||
360 | lea -0x28(%rsp),%rsp | ||
361 | movaps %xmm6,(%rsp) | ||
362 | movaps %xmm7,0x10(%rsp) | ||
363 | .Lmul4x_alloca: | 401 | .Lmul4x_alloca: |
364 | ___ | ||
365 | $code.=<<___; | ||
366 | mov %rsp,%rax | 402 | mov %rsp,%rax |
367 | lea 4($num),%r11 | 403 | lea 4($num),%r11 |
368 | neg %r11 | 404 | neg %r11 |
369 | lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)) | 405 | lea -256(%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)+256) |
370 | and \$-1024,%rsp # minimize TLB usage | 406 | and \$-1024,%rsp # minimize TLB usage |
371 | 407 | ||
372 | mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp | 408 | mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp |
373 | .Lmul4x_body: | 409 | .Lmul4x_body: |
374 | mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp | 410 | mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp |
375 | mov %rdx,%r12 # reassign $bp | 411 | lea 128(%rdx),%r12 # reassign $bp (+size optimization) |
376 | ___ | 412 | ___ |
377 | $bp="%r12"; | 413 | $bp="%r12"; |
378 | $STRIDE=2**5*8; # 5 is "window size" | 414 | $STRIDE=2**5*8; # 5 is "window size" |
379 | $N=$STRIDE/4; # should match cache line size | 415 | $N=$STRIDE/4; # should match cache line size |
380 | $code.=<<___; | 416 | $code.=<<___; |
381 | mov %r10,%r11 | 417 | movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 |
382 | shr \$`log($N/8)/log(2)`,%r10 | 418 | movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 |
383 | and \$`$N/8-1`,%r11 | 419 | lea 32-112(%rsp,$num,8),%r10# place the mask after tp[num+4] (+ICache optimization) |
384 | not %r10 | 420 | |
385 | lea .Lmagic_masks(%rip),%rax | 421 | pshufd \$0,%xmm5,%xmm5 # broadcast index |
386 | and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" | 422 | movdqa %xmm1,%xmm4 |
387 | lea 96($bp,%r11,8),$bp # pointer within 1st cache line | 423 | .byte 0x67,0x67 |
388 | movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which | 424 | movdqa %xmm1,%xmm2 |
389 | movq 8(%rax,%r10,8),%xmm5 # cache line contains element | 425 | ___ |
390 | movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument | 426 | ######################################################################## |
391 | movq 24(%rax,%r10,8),%xmm7 | 427 | # calculate mask by comparing 0..31 to index and save result to stack |
392 | 428 | # | |
393 | movq `0*$STRIDE/4-96`($bp),%xmm0 | 429 | $code.=<<___; |
394 | movq `1*$STRIDE/4-96`($bp),%xmm1 | 430 | paddd %xmm0,%xmm1 |
395 | pand %xmm4,%xmm0 | 431 | pcmpeqd %xmm5,%xmm0 # compare to 1,0 |
396 | movq `2*$STRIDE/4-96`($bp),%xmm2 | 432 | .byte 0x67 |
397 | pand %xmm5,%xmm1 | 433 | movdqa %xmm4,%xmm3 |
398 | movq `3*$STRIDE/4-96`($bp),%xmm3 | 434 | ___ |
399 | pand %xmm6,%xmm2 | 435 | for($k=0;$k<$STRIDE/16-4;$k+=4) { |
400 | por %xmm1,%xmm0 | 436 | $code.=<<___; |
401 | pand %xmm7,%xmm3 | 437 | paddd %xmm1,%xmm2 |
438 | pcmpeqd %xmm5,%xmm1 # compare to 3,2 | ||
439 | movdqa %xmm0,`16*($k+0)+112`(%r10) | ||
440 | movdqa %xmm4,%xmm0 | ||
441 | |||
442 | paddd %xmm2,%xmm3 | ||
443 | pcmpeqd %xmm5,%xmm2 # compare to 5,4 | ||
444 | movdqa %xmm1,`16*($k+1)+112`(%r10) | ||
445 | movdqa %xmm4,%xmm1 | ||
446 | |||
447 | paddd %xmm3,%xmm0 | ||
448 | pcmpeqd %xmm5,%xmm3 # compare to 7,6 | ||
449 | movdqa %xmm2,`16*($k+2)+112`(%r10) | ||
450 | movdqa %xmm4,%xmm2 | ||
451 | |||
452 | paddd %xmm0,%xmm1 | ||
453 | pcmpeqd %xmm5,%xmm0 | ||
454 | movdqa %xmm3,`16*($k+3)+112`(%r10) | ||
455 | movdqa %xmm4,%xmm3 | ||
456 | ___ | ||
457 | } | ||
458 | $code.=<<___; # last iteration can be optimized | ||
459 | paddd %xmm1,%xmm2 | ||
460 | pcmpeqd %xmm5,%xmm1 | ||
461 | movdqa %xmm0,`16*($k+0)+112`(%r10) | ||
462 | |||
463 | paddd %xmm2,%xmm3 | ||
464 | .byte 0x67 | ||
465 | pcmpeqd %xmm5,%xmm2 | ||
466 | movdqa %xmm1,`16*($k+1)+112`(%r10) | ||
467 | |||
468 | pcmpeqd %xmm5,%xmm3 | ||
469 | movdqa %xmm2,`16*($k+2)+112`(%r10) | ||
470 | pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register | ||
471 | |||
472 | pand `16*($k+1)-128`($bp),%xmm1 | ||
473 | pand `16*($k+2)-128`($bp),%xmm2 | ||
474 | movdqa %xmm3,`16*($k+3)+112`(%r10) | ||
475 | pand `16*($k+3)-128`($bp),%xmm3 | ||
476 | por %xmm2,%xmm0 | ||
477 | por %xmm3,%xmm1 | ||
478 | ___ | ||
479 | for($k=0;$k<$STRIDE/16-4;$k+=4) { | ||
480 | $code.=<<___; | ||
481 | movdqa `16*($k+0)-128`($bp),%xmm4 | ||
482 | movdqa `16*($k+1)-128`($bp),%xmm5 | ||
483 | movdqa `16*($k+2)-128`($bp),%xmm2 | ||
484 | pand `16*($k+0)+112`(%r10),%xmm4 | ||
485 | movdqa `16*($k+3)-128`($bp),%xmm3 | ||
486 | pand `16*($k+1)+112`(%r10),%xmm5 | ||
487 | por %xmm4,%xmm0 | ||
488 | pand `16*($k+2)+112`(%r10),%xmm2 | ||
489 | por %xmm5,%xmm1 | ||
490 | pand `16*($k+3)+112`(%r10),%xmm3 | ||
402 | por %xmm2,%xmm0 | 491 | por %xmm2,%xmm0 |
492 | por %xmm3,%xmm1 | ||
493 | ___ | ||
494 | } | ||
495 | $code.=<<___; | ||
496 | por %xmm1,%xmm0 | ||
497 | pshufd \$0x4e,%xmm0,%xmm1 | ||
498 | por %xmm1,%xmm0 | ||
403 | lea $STRIDE($bp),$bp | 499 | lea $STRIDE($bp),$bp |
404 | por %xmm3,%xmm0 | ||
405 | |||
406 | movd %xmm0,$m0 # m0=bp[0] | 500 | movd %xmm0,$m0 # m0=bp[0] |
501 | |||
407 | mov ($n0),$n0 # pull n0[0] value | 502 | mov ($n0),$n0 # pull n0[0] value |
408 | mov ($ap),%rax | 503 | mov ($ap),%rax |
409 | 504 | ||
410 | xor $i,$i # i=0 | 505 | xor $i,$i # i=0 |
411 | xor $j,$j # j=0 | 506 | xor $j,$j # j=0 |
412 | 507 | ||
413 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
414 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
415 | pand %xmm4,%xmm0 | ||
416 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
417 | pand %xmm5,%xmm1 | ||
418 | |||
419 | mov $n0,$m1 | 508 | mov $n0,$m1 |
420 | mulq $m0 # ap[0]*bp[0] | 509 | mulq $m0 # ap[0]*bp[0] |
421 | mov %rax,$A[0] | 510 | mov %rax,$A[0] |
422 | mov ($np),%rax | 511 | mov ($np),%rax |
423 | 512 | ||
424 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
425 | pand %xmm6,%xmm2 | ||
426 | por %xmm1,%xmm0 | ||
427 | pand %xmm7,%xmm3 | ||
428 | |||
429 | imulq $A[0],$m1 # "tp[0]"*n0 | 513 | imulq $A[0],$m1 # "tp[0]"*n0 |
430 | mov %rdx,$A[1] | 514 | mov %rdx,$A[1] |
431 | 515 | ||
432 | por %xmm2,%xmm0 | ||
433 | lea $STRIDE($bp),$bp | ||
434 | por %xmm3,%xmm0 | ||
435 | |||
436 | mulq $m1 # np[0]*m1 | 516 | mulq $m1 # np[0]*m1 |
437 | add %rax,$A[0] # discarded | 517 | add %rax,$A[0] # discarded |
438 | mov 8($ap),%rax | 518 | mov 8($ap),%rax |
@@ -550,8 +630,6 @@ $code.=<<___;
550 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | 630 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] |
551 | mov %rdx,$N[0] | 631 | mov %rdx,$N[0] |
552 | 632 | ||
553 | movd %xmm0,$m0 # bp[1] | ||
554 | |||
555 | xor $N[1],$N[1] | 633 | xor $N[1],$N[1] |
556 | add $A[0],$N[0] | 634 | add $A[0],$N[0] |
557 | adc \$0,$N[1] | 635 | adc \$0,$N[1] |
@@ -561,12 +639,34 @@ $code.=<<___;
561 | lea 1($i),$i # i++ | 639 | lea 1($i),$i # i++ |
562 | .align 4 | 640 | .align 4 |
563 | .Louter4x: | 641 | .Louter4x: |
642 | lea 32+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) | ||
643 | pxor %xmm4,%xmm4 | ||
644 | pxor %xmm5,%xmm5 | ||
645 | ___ | ||
646 | for($k=0;$k<$STRIDE/16;$k+=4) { | ||
647 | $code.=<<___; | ||
648 | movdqa `16*($k+0)-128`($bp),%xmm0 | ||
649 | movdqa `16*($k+1)-128`($bp),%xmm1 | ||
650 | movdqa `16*($k+2)-128`($bp),%xmm2 | ||
651 | movdqa `16*($k+3)-128`($bp),%xmm3 | ||
652 | pand `16*($k+0)-128`(%rdx),%xmm0 | ||
653 | pand `16*($k+1)-128`(%rdx),%xmm1 | ||
654 | por %xmm0,%xmm4 | ||
655 | pand `16*($k+2)-128`(%rdx),%xmm2 | ||
656 | por %xmm1,%xmm5 | ||
657 | pand `16*($k+3)-128`(%rdx),%xmm3 | ||
658 | por %xmm2,%xmm4 | ||
659 | por %xmm3,%xmm5 | ||
660 | ___ | ||
661 | } | ||
662 | $code.=<<___; | ||
663 | por %xmm5,%xmm4 | ||
664 | pshufd \$0x4e,%xmm4,%xmm0 | ||
665 | por %xmm4,%xmm0 | ||
666 | lea $STRIDE($bp),$bp | ||
667 | movd %xmm0,$m0 # m0=bp[i] | ||
668 | |||
564 | xor $j,$j # j=0 | 669 | xor $j,$j # j=0 |
565 | movq `0*$STRIDE/4-96`($bp),%xmm0 | ||
566 | movq `1*$STRIDE/4-96`($bp),%xmm1 | ||
567 | pand %xmm4,%xmm0 | ||
568 | movq `2*$STRIDE/4-96`($bp),%xmm2 | ||
569 | pand %xmm5,%xmm1 | ||
570 | 670 | ||
571 | mov (%rsp),$A[0] | 671 | mov (%rsp),$A[0] |
572 | mov $n0,$m1 | 672 | mov $n0,$m1 |
@@ -575,18 +675,9 @@ $code.=<<___;
575 | mov ($np),%rax | 675 | mov ($np),%rax |
576 | adc \$0,%rdx | 676 | adc \$0,%rdx |
577 | 677 | ||
578 | movq `3*$STRIDE/4-96`($bp),%xmm3 | ||
579 | pand %xmm6,%xmm2 | ||
580 | por %xmm1,%xmm0 | ||
581 | pand %xmm7,%xmm3 | ||
582 | |||
583 | imulq $A[0],$m1 # tp[0]*n0 | 678 | imulq $A[0],$m1 # tp[0]*n0 |
584 | mov %rdx,$A[1] | 679 | mov %rdx,$A[1] |
585 | 680 | ||
586 | por %xmm2,%xmm0 | ||
587 | lea $STRIDE($bp),$bp | ||
588 | por %xmm3,%xmm0 | ||
589 | |||
590 | mulq $m1 # np[0]*m1 | 681 | mulq $m1 # np[0]*m1 |
591 | add %rax,$A[0] # "$N[0]", discarded | 682 | add %rax,$A[0] # "$N[0]", discarded |
592 | mov 8($ap),%rax | 683 | mov 8($ap),%rax |
@@ -718,7 +809,6 @@ $code.=<<___;
718 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] | 809 | mov $N[0],-24(%rsp,$j,8) # tp[j-1] |
719 | mov %rdx,$N[0] | 810 | mov %rdx,$N[0] |
720 | 811 | ||
721 | movd %xmm0,$m0 # bp[i+1] | ||
722 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] | 812 | mov $N[1],-16(%rsp,$j,8) # tp[j-1] |
723 | 813 | ||
724 | xor $N[1],$N[1] | 814 | xor $N[1],$N[1] |
@@ -809,13 +899,7 @@ ___
809 | $code.=<<___; | 899 | $code.=<<___; |
810 | mov 8(%rsp,$num,8),%rsi # restore %rsp | 900 | mov 8(%rsp,$num,8),%rsi # restore %rsp |
811 | mov \$1,%rax | 901 | mov \$1,%rax |
812 | ___ | 902 | |
813 | $code.=<<___ if ($win64); | ||
814 | movaps (%rsi),%xmm6 | ||
815 | movaps 0x10(%rsi),%xmm7 | ||
816 | lea 0x28(%rsi),%rsi | ||
817 | ___ | ||
818 | $code.=<<___; | ||
819 | mov (%rsi),%r15 | 903 | mov (%rsi),%r15 |
820 | mov 8(%rsi),%r14 | 904 | mov 8(%rsi),%r14 |
821 | mov 16(%rsi),%r13 | 905 | mov 16(%rsi),%r13 |
@@ -830,8 +914,8 @@ ___
830 | }}} | 914 | }}} |
831 | 915 | ||
832 | { | 916 | { |
833 | my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order | 917 | my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9d") : # Win64 order |
834 | ("%rdi","%rsi","%rdx","%rcx"); # Unix order | 918 | ("%rdi","%rsi","%rdx","%ecx"); # Unix order |
835 | my $out=$inp; | 919 | my $out=$inp; |
836 | my $STRIDE=2**5*8; | 920 | my $STRIDE=2**5*8; |
837 | my $N=$STRIDE/4; | 921 | my $N=$STRIDE/4; |
@@ -859,53 +943,89 @@ bn_scatter5:
859 | .type bn_gather5,\@abi-omnipotent | 943 | .type bn_gather5,\@abi-omnipotent |
860 | .align 16 | 944 | .align 16 |
861 | bn_gather5: | 945 | bn_gather5: |
862 | ___ | 946 | .LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases |
863 | $code.=<<___ if ($win64); | ||
864 | .LSEH_begin_bn_gather5: | ||
865 | # I can't trust assembler to use specific encoding:-( | 947 | # I can't trust assembler to use specific encoding:-( |
866 | .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp | 948 | .byte 0x4c,0x8d,0x14,0x24 # lea (%rsp),%r10 |
867 | .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) | 949 | .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 # sub $0x108,%rsp |
868 | .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) | 950 | lea .Linc(%rip),%rax |
951 | and \$-16,%rsp # shouldn't be formally required | ||
952 | |||
953 | movd $idx,%xmm5 | ||
954 | movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 | ||
955 | movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 | ||
956 | lea 128($tbl),%r11 # size optimization | ||
957 | lea 128(%rsp),%rax # size optimization | ||
958 | |||
959 | pshufd \$0,%xmm5,%xmm5 # broadcast $idx | ||
960 | movdqa %xmm1,%xmm4 | ||
961 | movdqa %xmm1,%xmm2 | ||
869 | ___ | 962 | ___ |
963 | ######################################################################## | ||
964 | # calculate mask by comparing 0..31 to $idx and save result to stack | ||
965 | # | ||
966 | for($i=0;$i<$STRIDE/16;$i+=4) { | ||
967 | $code.=<<___; | ||
968 | paddd %xmm0,%xmm1 | ||
969 | pcmpeqd %xmm5,%xmm0 # compare to 1,0 | ||
970 | ___ | ||
971 | $code.=<<___ if ($i); | ||
972 | movdqa %xmm3,`16*($i-1)-128`(%rax) | ||
973 | ___ | ||
974 | $code.=<<___; | ||
975 | movdqa %xmm4,%xmm3 | ||
976 | |||
977 | paddd %xmm1,%xmm2 | ||
978 | pcmpeqd %xmm5,%xmm1 # compare to 3,2 | ||
979 | movdqa %xmm0,`16*($i+0)-128`(%rax) | ||
980 | movdqa %xmm4,%xmm0 | ||
981 | |||
982 | paddd %xmm2,%xmm3 | ||
983 | pcmpeqd %xmm5,%xmm2 # compare to 5,4 | ||
984 | movdqa %xmm1,`16*($i+1)-128`(%rax) | ||
985 | movdqa %xmm4,%xmm1 | ||
986 | |||
987 | paddd %xmm3,%xmm0 | ||
988 | pcmpeqd %xmm5,%xmm3 # compare to 7,6 | ||
989 | movdqa %xmm2,`16*($i+2)-128`(%rax) | ||
990 | movdqa %xmm4,%xmm2 | ||
991 | ___ | ||
992 | } | ||
870 | $code.=<<___; | 993 | $code.=<<___; |
871 | mov $idx,%r11 | 994 | movdqa %xmm3,`16*($i-1)-128`(%rax) |
872 | shr \$`log($N/8)/log(2)`,$idx | ||
873 | and \$`$N/8-1`,%r11 | ||
874 | not $idx | ||
875 | lea .Lmagic_masks(%rip),%rax | ||
876 | and \$`2**5/($N/8)-1`,$idx # 5 is "window size" | ||
877 | lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line | ||
878 | movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which | ||
879 | movq 8(%rax,$idx,8),%xmm5 # cache line contains element | ||
880 | movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument | ||
881 | movq 24(%rax,$idx,8),%xmm7 | ||
882 | jmp .Lgather | 995 | jmp .Lgather |
883 | .align 16 | ||
884 | .Lgather: | ||
885 | movq `0*$STRIDE/4-96`($tbl),%xmm0 | ||
886 | movq `1*$STRIDE/4-96`($tbl),%xmm1 | ||
887 | pand %xmm4,%xmm0 | ||
888 | movq `2*$STRIDE/4-96`($tbl),%xmm2 | ||
889 | pand %xmm5,%xmm1 | ||
890 | movq `3*$STRIDE/4-96`($tbl),%xmm3 | ||
891 | pand %xmm6,%xmm2 | ||
892 | por %xmm1,%xmm0 | ||
893 | pand %xmm7,%xmm3 | ||
894 | por %xmm2,%xmm0 | ||
895 | lea $STRIDE($tbl),$tbl | ||
896 | por %xmm3,%xmm0 | ||
897 | 996 | ||
997 | .align 32 | ||
998 | .Lgather: | ||
999 | pxor %xmm4,%xmm4 | ||
1000 | pxor %xmm5,%xmm5 | ||
1001 | ___ | ||
1002 | for($i=0;$i<$STRIDE/16;$i+=4) { | ||
1003 | $code.=<<___; | ||
1004 | movdqa `16*($i+0)-128`(%r11),%xmm0 | ||
1005 | movdqa `16*($i+1)-128`(%r11),%xmm1 | ||
1006 | movdqa `16*($i+2)-128`(%r11),%xmm2 | ||
1007 | pand `16*($i+0)-128`(%rax),%xmm0 | ||
1008 | movdqa `16*($i+3)-128`(%r11),%xmm3 | ||
1009 | pand `16*($i+1)-128`(%rax),%xmm1 | ||
1010 | por %xmm0,%xmm4 | ||
1011 | pand `16*($i+2)-128`(%rax),%xmm2 | ||
1012 | por %xmm1,%xmm5 | ||
1013 | pand `16*($i+3)-128`(%rax),%xmm3 | ||
1014 | por %xmm2,%xmm4 | ||
1015 | por %xmm3,%xmm5 | ||
1016 | ___ | ||
1017 | } | ||
1018 | $code.=<<___; | ||
1019 | por %xmm5,%xmm4 | ||
1020 | lea $STRIDE(%r11),%r11 | ||
1021 | pshufd \$0x4e,%xmm4,%xmm0 | ||
1022 | por %xmm4,%xmm0 | ||
898 | movq %xmm0,($out) # m0=bp[0] | 1023 | movq %xmm0,($out) # m0=bp[0] |
899 | lea 8($out),$out | 1024 | lea 8($out),$out |
900 | sub \$1,$num | 1025 | sub \$1,$num |
901 | jnz .Lgather | 1026 | jnz .Lgather |
902 | ___ | 1027 | |
903 | $code.=<<___ if ($win64); | 1028 | lea (%r10),%rsp |
904 | movaps (%rsp),%xmm6 | ||
905 | movaps 0x10(%rsp),%xmm7 | ||
906 | lea 0x28(%rsp),%rsp | ||
907 | ___ | ||
908 | $code.=<<___; | ||
909 | ret | 1029 | ret |
910 | .LSEH_end_bn_gather5: | 1030 | .LSEH_end_bn_gather5: |
911 | .size bn_gather5,.-bn_gather5 | 1031 | .size bn_gather5,.-bn_gather5 |
@@ -913,9 +1033,9 @@ ___
913 | } | 1033 | } |
914 | $code.=<<___; | 1034 | $code.=<<___; |
915 | .align 64 | 1035 | .align 64 |
916 | .Lmagic_masks: | 1036 | .Linc: |
917 | .long 0,0, 0,0, 0,0, -1,-1 | 1037 | .long 0,0, 1,1 |
918 | .long 0,0, 0,0, 0,0, 0,0 | 1038 | .long 2,2, 2,2 |
919 | .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | 1039 | .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
920 | ___ | 1040 | ___ |
921 | 1041 | ||
@@ -954,7 +1074,7 @@ mul_handler:
954 | cmp %r10,%rbx # context->Rip<end of prologue label | 1074 | cmp %r10,%rbx # context->Rip<end of prologue label |
955 | jb .Lcommon_seh_tail | 1075 | jb .Lcommon_seh_tail |
956 | 1076 | ||
957 | lea `40+48`(%rax),%rax | 1077 | lea 48(%rax),%rax |
958 | 1078 | ||
959 | mov 4(%r11),%r10d # HandlerData[1] | 1079 | mov 4(%r11),%r10d # HandlerData[1] |
960 | lea (%rsi,%r10),%r10 # end of alloca label | 1080 | lea (%rsi,%r10),%r10 # end of alloca label |
@@ -971,9 +1091,7 @@ mul_handler:
971 | mov 192($context),%r10 # pull $num | 1091 | mov 192($context),%r10 # pull $num |
972 | mov 8(%rax,%r10,8),%rax # pull saved stack pointer | 1092 | mov 8(%rax,%r10,8),%rax # pull saved stack pointer |
973 | 1093 | ||
974 | movaps (%rax),%xmm0 | 1094 | lea 48(%rax),%rax |
975 | movaps 16(%rax),%xmm1 | ||
976 | lea `40+48`(%rax),%rax | ||
977 | 1095 | ||
978 | mov -8(%rax),%rbx | 1096 | mov -8(%rax),%rbx |
979 | mov -16(%rax),%rbp | 1097 | mov -16(%rax),%rbp |
@@ -987,8 +1105,6 @@ mul_handler:
987 | mov %r13,224($context) # restore context->R13 | 1105 | mov %r13,224($context) # restore context->R13 |
988 | mov %r14,232($context) # restore context->R14 | 1106 | mov %r14,232($context) # restore context->R14 |
989 | mov %r15,240($context) # restore context->R15 | 1107 | mov %r15,240($context) # restore context->R15 |
990 | movups %xmm0,512($context) # restore context->Xmm6 | ||
991 | movups %xmm1,528($context) # restore context->Xmm7 | ||
992 | 1108 | ||
993 | .Lcommon_seh_tail: | 1109 | .Lcommon_seh_tail: |
994 | mov 8(%rax),%rdi | 1110 | mov 8(%rax),%rdi |
@@ -1057,10 +1173,9 @@ mul_handler:
1057 | .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] | 1173 | .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] |
1058 | .align 8 | 1174 | .align 8 |
1059 | .LSEH_info_bn_gather5: | 1175 | .LSEH_info_bn_gather5: |
1060 | .byte 0x01,0x0d,0x05,0x00 | 1176 | .byte 0x01,0x0b,0x03,0x0a |
1061 | .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 | 1177 | .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108 |
1062 | .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 | 1178 | .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp), set_frame r10 |
1063 | .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28 | ||
1064 | .align 8 | 1179 | .align 8 |
1065 | ___ | 1180 | ___ |
1066 | } | 1181 | } |
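The point of the rewritten gather code above (the .Linc constants, the pcmpeqd mask calculation, and the pand/por selection loops that replace .Lmagic_masks) is to make the table lookup constant-time: instead of touching only the cache line that holds entry bp[idx], every one of the 32 table slots is read on every pass and the wanted entry is selected with compare-generated masks, so the memory access pattern no longer depends on the secret index. Below is a minimal, branch-free C sketch of that selection step; it is illustrative only and not code from this patch (the helper name const_time_gather is made up here, and it gathers one 64-bit word per slot, whereas the assembly moves 16 bytes per movdqa).

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative sketch, not code from the patch: select table[idx] from a
 * 32-entry table without an index-dependent memory access pattern.  For
 * every slot i a mask is built that is all-ones when i == idx and zero
 * otherwise (the pcmpeqd step above); the entry is AND-ed with its mask
 * and all results are OR-ed together (the pand/por step above).
 */
static uint64_t
const_time_gather(const uint64_t table[32], uint32_t idx)
{
	uint64_t acc = 0;
	uint64_t i;

	for (i = 0; i < 32; i++) {
		uint64_t d = i ^ idx;
		/* all-ones when d == 0, all-zeros otherwise, branch-free */
		uint64_t mask = ((d | (0 - d)) >> 63) - 1;

		acc |= table[i] & mask;		/* every slot is read */
	}
	return acc;
}

int
main(void)
{
	uint64_t table[32];
	int i;

	for (i = 0; i < 32; i++)
		table[i] = 0x1000 + (uint64_t)i;
	/* prints "1007" */
	printf("%llx\n", (unsigned long long)const_time_gather(table, 7));
	return 0;
}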