summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rwxr-xr-xsrc/lib/libcrypto/bn/asm/x86_64-mont5.pl513
1 files changed, 314 insertions, 199 deletions
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont5.pl b/src/lib/libcrypto/bn/asm/x86_64-mont5.pl
index 81e5c53728..bb7ad4c4b7 100755
--- a/src/lib/libcrypto/bn/asm/x86_64-mont5.pl
+++ b/src/lib/libcrypto/bn/asm/x86_64-mont5.pl
@@ -66,60 +66,113 @@ bn_mul_mont_gather5:
66.align 16 66.align 16
67.Lmul_enter: 67.Lmul_enter:
68 mov ${num}d,${num}d 68 mov ${num}d,${num}d
69 mov `($win64?56:8)`(%rsp),%r10d # load 7th argument 69 movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
70 lea .Linc(%rip),%r10
70 push %rbx 71 push %rbx
71 push %rbp 72 push %rbp
72 push %r12 73 push %r12
73 push %r13 74 push %r13
74 push %r14 75 push %r14
75 push %r15 76 push %r15
76___ 77
77$code.=<<___ if ($win64);
78 lea -0x28(%rsp),%rsp
79 movaps %xmm6,(%rsp)
80 movaps %xmm7,0x10(%rsp)
81.Lmul_alloca: 78.Lmul_alloca:
82___
83$code.=<<___;
84 mov %rsp,%rax 79 mov %rsp,%rax
85 lea 2($num),%r11 80 lea 2($num),%r11
86 neg %r11 81 neg %r11
87 lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)) 82 lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8)
88 and \$-1024,%rsp # minimize TLB usage 83 and \$-1024,%rsp # minimize TLB usage
89 84
90 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp 85 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
91.Lmul_body: 86.Lmul_body:
92 mov $bp,%r12 # reassign $bp 87 lea 128($bp),%r12 # reassign $bp (+size optimization)
93___ 88___
94 $bp="%r12"; 89 $bp="%r12";
95 $STRIDE=2**5*8; # 5 is "window size" 90 $STRIDE=2**5*8; # 5 is "window size"
96 $N=$STRIDE/4; # should match cache line size 91 $N=$STRIDE/4; # should match cache line size
97$code.=<<___; 92$code.=<<___;
98 mov %r10,%r11 93 movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000
99 shr \$`log($N/8)/log(2)`,%r10 94 movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002
100 and \$`$N/8-1`,%r11 95 lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
101 not %r10 96 and \$-16,%r10
102 lea .Lmagic_masks(%rip),%rax 97
103 and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" 98 pshufd \$0,%xmm5,%xmm5 # broadcast index
104 lea 96($bp,%r11,8),$bp # pointer within 1st cache line 99 movdqa %xmm1,%xmm4
105 movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which 100 movdqa %xmm1,%xmm2
106 movq 8(%rax,%r10,8),%xmm5 # cache line contains element 101___
107 movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument 102########################################################################
108 movq 24(%rax,%r10,8),%xmm7 103# calculate mask by comparing 0..31 to index and save result to stack
109 104#
110 movq `0*$STRIDE/4-96`($bp),%xmm0 105$code.=<<___;
111 movq `1*$STRIDE/4-96`($bp),%xmm1 106 paddd %xmm0,%xmm1
112 pand %xmm4,%xmm0 107 pcmpeqd %xmm5,%xmm0 # compare to 1,0
113 movq `2*$STRIDE/4-96`($bp),%xmm2 108 .byte 0x67
114 pand %xmm5,%xmm1 109 movdqa %xmm4,%xmm3
115 movq `3*$STRIDE/4-96`($bp),%xmm3 110___
116 pand %xmm6,%xmm2 111for($k=0;$k<$STRIDE/16-4;$k+=4) {
117 por %xmm1,%xmm0 112$code.=<<___;
118 pand %xmm7,%xmm3 113 paddd %xmm1,%xmm2
114 pcmpeqd %xmm5,%xmm1 # compare to 3,2
115 movdqa %xmm0,`16*($k+0)+112`(%r10)
116 movdqa %xmm4,%xmm0
117
118 paddd %xmm2,%xmm3
119 pcmpeqd %xmm5,%xmm2 # compare to 5,4
120 movdqa %xmm1,`16*($k+1)+112`(%r10)
121 movdqa %xmm4,%xmm1
122
123 paddd %xmm3,%xmm0
124 pcmpeqd %xmm5,%xmm3 # compare to 7,6
125 movdqa %xmm2,`16*($k+2)+112`(%r10)
126 movdqa %xmm4,%xmm2
127
128 paddd %xmm0,%xmm1
129 pcmpeqd %xmm5,%xmm0
130 movdqa %xmm3,`16*($k+3)+112`(%r10)
131 movdqa %xmm4,%xmm3
132___
133}
134$code.=<<___; # last iteration can be optimized
135 paddd %xmm1,%xmm2
136 pcmpeqd %xmm5,%xmm1
137 movdqa %xmm0,`16*($k+0)+112`(%r10)
138
139 paddd %xmm2,%xmm3
140 .byte 0x67
141 pcmpeqd %xmm5,%xmm2
142 movdqa %xmm1,`16*($k+1)+112`(%r10)
143
144 pcmpeqd %xmm5,%xmm3
145 movdqa %xmm2,`16*($k+2)+112`(%r10)
146 pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register
147
148 pand `16*($k+1)-128`($bp),%xmm1
149 pand `16*($k+2)-128`($bp),%xmm2
150 movdqa %xmm3,`16*($k+3)+112`(%r10)
151 pand `16*($k+3)-128`($bp),%xmm3
152 por %xmm2,%xmm0
153 por %xmm3,%xmm1
154___
155for($k=0;$k<$STRIDE/16-4;$k+=4) {
156$code.=<<___;
157 movdqa `16*($k+0)-128`($bp),%xmm4
158 movdqa `16*($k+1)-128`($bp),%xmm5
159 movdqa `16*($k+2)-128`($bp),%xmm2
160 pand `16*($k+0)+112`(%r10),%xmm4
161 movdqa `16*($k+3)-128`($bp),%xmm3
162 pand `16*($k+1)+112`(%r10),%xmm5
163 por %xmm4,%xmm0
164 pand `16*($k+2)+112`(%r10),%xmm2
165 por %xmm5,%xmm1
166 pand `16*($k+3)+112`(%r10),%xmm3
119 por %xmm2,%xmm0 167 por %xmm2,%xmm0
168 por %xmm3,%xmm1
169___
170}
171$code.=<<___;
172 por %xmm1,%xmm0
173 pshufd \$0x4e,%xmm0,%xmm1
174 por %xmm1,%xmm0
120 lea $STRIDE($bp),$bp 175 lea $STRIDE($bp),$bp
121 por %xmm3,%xmm0
122
123 movd %xmm0,$m0 # m0=bp[0] 176 movd %xmm0,$m0 # m0=bp[0]
124 177
125 mov ($n0),$n0 # pull n0[0] value 178 mov ($n0),$n0 # pull n0[0] value
@@ -128,29 +181,14 @@ $code.=<<___;
128 xor $i,$i # i=0 181 xor $i,$i # i=0
129 xor $j,$j # j=0 182 xor $j,$j # j=0
130 183
131 movq `0*$STRIDE/4-96`($bp),%xmm0
132 movq `1*$STRIDE/4-96`($bp),%xmm1
133 pand %xmm4,%xmm0
134 movq `2*$STRIDE/4-96`($bp),%xmm2
135 pand %xmm5,%xmm1
136
137 mov $n0,$m1 184 mov $n0,$m1
138 mulq $m0 # ap[0]*bp[0] 185 mulq $m0 # ap[0]*bp[0]
139 mov %rax,$lo0 186 mov %rax,$lo0
140 mov ($np),%rax 187 mov ($np),%rax
141 188
142 movq `3*$STRIDE/4-96`($bp),%xmm3
143 pand %xmm6,%xmm2
144 por %xmm1,%xmm0
145 pand %xmm7,%xmm3
146
147 imulq $lo0,$m1 # "tp[0]"*n0 189 imulq $lo0,$m1 # "tp[0]"*n0
148 mov %rdx,$hi0 190 mov %rdx,$hi0
149 191
150 por %xmm2,%xmm0
151 lea $STRIDE($bp),$bp
152 por %xmm3,%xmm0
153
154 mulq $m1 # np[0]*m1 192 mulq $m1 # np[0]*m1
155 add %rax,$lo0 # discarded 193 add %rax,$lo0 # discarded
156 mov 8($ap),%rax 194 mov 8($ap),%rax
@@ -183,8 +221,6 @@ $code.=<<___;
183 cmp $num,$j 221 cmp $num,$j
184 jl .L1st 222 jl .L1st
185 223
186 movd %xmm0,$m0 # bp[1]
187
188 add %rax,$hi1 224 add %rax,$hi1
189 mov ($ap),%rax # ap[0] 225 mov ($ap),%rax # ap[0]
190 adc \$0,%rdx 226 adc \$0,%rdx
@@ -204,33 +240,46 @@ $code.=<<___;
204 jmp .Louter 240 jmp .Louter
205.align 16 241.align 16
206.Louter: 242.Louter:
243 lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization)
244 and \$-16,%rdx
245 pxor %xmm4,%xmm4
246 pxor %xmm5,%xmm5
247___
248for($k=0;$k<$STRIDE/16;$k+=4) {
249$code.=<<___;
250 movdqa `16*($k+0)-128`($bp),%xmm0
251 movdqa `16*($k+1)-128`($bp),%xmm1
252 movdqa `16*($k+2)-128`($bp),%xmm2
253 movdqa `16*($k+3)-128`($bp),%xmm3
254 pand `16*($k+0)-128`(%rdx),%xmm0
255 pand `16*($k+1)-128`(%rdx),%xmm1
256 por %xmm0,%xmm4
257 pand `16*($k+2)-128`(%rdx),%xmm2
258 por %xmm1,%xmm5
259 pand `16*($k+3)-128`(%rdx),%xmm3
260 por %xmm2,%xmm4
261 por %xmm3,%xmm5
262___
263}
264$code.=<<___;
265 por %xmm5,%xmm4
266 pshufd \$0x4e,%xmm4,%xmm0
267 por %xmm4,%xmm0
268 lea $STRIDE($bp),$bp
269 movd %xmm0,$m0 # m0=bp[i]
270
207 xor $j,$j # j=0 271 xor $j,$j # j=0
208 mov $n0,$m1 272 mov $n0,$m1
209 mov (%rsp),$lo0 273 mov (%rsp),$lo0
210 274
211 movq `0*$STRIDE/4-96`($bp),%xmm0
212 movq `1*$STRIDE/4-96`($bp),%xmm1
213 pand %xmm4,%xmm0
214 movq `2*$STRIDE/4-96`($bp),%xmm2
215 pand %xmm5,%xmm1
216
217 mulq $m0 # ap[0]*bp[i] 275 mulq $m0 # ap[0]*bp[i]
218 add %rax,$lo0 # ap[0]*bp[i]+tp[0] 276 add %rax,$lo0 # ap[0]*bp[i]+tp[0]
219 mov ($np),%rax 277 mov ($np),%rax
220 adc \$0,%rdx 278 adc \$0,%rdx
221 279
222 movq `3*$STRIDE/4-96`($bp),%xmm3
223 pand %xmm6,%xmm2
224 por %xmm1,%xmm0
225 pand %xmm7,%xmm3
226
227 imulq $lo0,$m1 # tp[0]*n0 280 imulq $lo0,$m1 # tp[0]*n0
228 mov %rdx,$hi0 281 mov %rdx,$hi0
229 282
230 por %xmm2,%xmm0
231 lea $STRIDE($bp),$bp
232 por %xmm3,%xmm0
233
234 mulq $m1 # np[0]*m1 283 mulq $m1 # np[0]*m1
235 add %rax,$lo0 # discarded 284 add %rax,$lo0 # discarded
236 mov 8($ap),%rax 285 mov 8($ap),%rax
@@ -266,8 +315,6 @@ $code.=<<___;
266 cmp $num,$j 315 cmp $num,$j
267 jl .Linner 316 jl .Linner
268 317
269 movd %xmm0,$m0 # bp[i+1]
270
271 add %rax,$hi1 318 add %rax,$hi1
272 mov ($ap),%rax # ap[0] 319 mov ($ap),%rax # ap[0]
273 adc \$0,%rdx 320 adc \$0,%rdx
@@ -321,13 +368,7 @@ $code.=<<___;
321 368
322 mov 8(%rsp,$num,8),%rsi # restore %rsp 369 mov 8(%rsp,$num,8),%rsi # restore %rsp
323 mov \$1,%rax 370 mov \$1,%rax
324___ 371
325$code.=<<___ if ($win64);
326 movaps (%rsi),%xmm6
327 movaps 0x10(%rsi),%xmm7
328 lea 0x28(%rsi),%rsi
329___
330$code.=<<___;
331 mov (%rsi),%r15 372 mov (%rsi),%r15
332 mov 8(%rsi),%r14 373 mov 8(%rsi),%r14
333 mov 16(%rsi),%r13 374 mov 16(%rsi),%r13
@@ -348,91 +389,130 @@ $code.=<<___;
348bn_mul4x_mont_gather5: 389bn_mul4x_mont_gather5:
349.Lmul4x_enter: 390.Lmul4x_enter:
350 mov ${num}d,${num}d 391 mov ${num}d,${num}d
351 mov `($win64?56:8)`(%rsp),%r10d # load 7th argument 392 movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
393 lea .Linc(%rip),%r10
352 push %rbx 394 push %rbx
353 push %rbp 395 push %rbp
354 push %r12 396 push %r12
355 push %r13 397 push %r13
356 push %r14 398 push %r14
357 push %r15 399 push %r15
358___ 400
359$code.=<<___ if ($win64);
360 lea -0x28(%rsp),%rsp
361 movaps %xmm6,(%rsp)
362 movaps %xmm7,0x10(%rsp)
363.Lmul4x_alloca: 401.Lmul4x_alloca:
364___
365$code.=<<___;
366 mov %rsp,%rax 402 mov %rsp,%rax
367 lea 4($num),%r11 403 lea 4($num),%r11
368 neg %r11 404 neg %r11
369 lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)) 405 lea -256(%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)+256)
370 and \$-1024,%rsp # minimize TLB usage 406 and \$-1024,%rsp # minimize TLB usage
371 407
372 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp 408 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
373.Lmul4x_body: 409.Lmul4x_body:
374 mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp 410 mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
375 mov %rdx,%r12 # reassign $bp 411 lea 128(%rdx),%r12 # reassign $bp (+size optimization)
376___ 412___
377 $bp="%r12"; 413 $bp="%r12";
378 $STRIDE=2**5*8; # 5 is "window size" 414 $STRIDE=2**5*8; # 5 is "window size"
379 $N=$STRIDE/4; # should match cache line size 415 $N=$STRIDE/4; # should match cache line size
380$code.=<<___; 416$code.=<<___;
381 mov %r10,%r11 417 movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000
382 shr \$`log($N/8)/log(2)`,%r10 418 movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002
383 and \$`$N/8-1`,%r11 419 lea 32-112(%rsp,$num,8),%r10# place the mask after tp[num+4] (+ICache optimization)
384 not %r10 420
385 lea .Lmagic_masks(%rip),%rax 421 pshufd \$0,%xmm5,%xmm5 # broadcast index
386 and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" 422 movdqa %xmm1,%xmm4
387 lea 96($bp,%r11,8),$bp # pointer within 1st cache line 423 .byte 0x67,0x67
388 movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which 424 movdqa %xmm1,%xmm2
389 movq 8(%rax,%r10,8),%xmm5 # cache line contains element 425___
390 movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument 426########################################################################
391 movq 24(%rax,%r10,8),%xmm7 427# calculate mask by comparing 0..31 to index and save result to stack
392 428#
393 movq `0*$STRIDE/4-96`($bp),%xmm0 429$code.=<<___;
394 movq `1*$STRIDE/4-96`($bp),%xmm1 430 paddd %xmm0,%xmm1
395 pand %xmm4,%xmm0 431 pcmpeqd %xmm5,%xmm0 # compare to 1,0
396 movq `2*$STRIDE/4-96`($bp),%xmm2 432 .byte 0x67
397 pand %xmm5,%xmm1 433 movdqa %xmm4,%xmm3
398 movq `3*$STRIDE/4-96`($bp),%xmm3 434___
399 pand %xmm6,%xmm2 435for($k=0;$k<$STRIDE/16-4;$k+=4) {
400 por %xmm1,%xmm0 436$code.=<<___;
401 pand %xmm7,%xmm3 437 paddd %xmm1,%xmm2
438 pcmpeqd %xmm5,%xmm1 # compare to 3,2
439 movdqa %xmm0,`16*($k+0)+112`(%r10)
440 movdqa %xmm4,%xmm0
441
442 paddd %xmm2,%xmm3
443 pcmpeqd %xmm5,%xmm2 # compare to 5,4
444 movdqa %xmm1,`16*($k+1)+112`(%r10)
445 movdqa %xmm4,%xmm1
446
447 paddd %xmm3,%xmm0
448 pcmpeqd %xmm5,%xmm3 # compare to 7,6
449 movdqa %xmm2,`16*($k+2)+112`(%r10)
450 movdqa %xmm4,%xmm2
451
452 paddd %xmm0,%xmm1
453 pcmpeqd %xmm5,%xmm0
454 movdqa %xmm3,`16*($k+3)+112`(%r10)
455 movdqa %xmm4,%xmm3
456___
457}
458$code.=<<___; # last iteration can be optimized
459 paddd %xmm1,%xmm2
460 pcmpeqd %xmm5,%xmm1
461 movdqa %xmm0,`16*($k+0)+112`(%r10)
462
463 paddd %xmm2,%xmm3
464 .byte 0x67
465 pcmpeqd %xmm5,%xmm2
466 movdqa %xmm1,`16*($k+1)+112`(%r10)
467
468 pcmpeqd %xmm5,%xmm3
469 movdqa %xmm2,`16*($k+2)+112`(%r10)
470 pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register
471
472 pand `16*($k+1)-128`($bp),%xmm1
473 pand `16*($k+2)-128`($bp),%xmm2
474 movdqa %xmm3,`16*($k+3)+112`(%r10)
475 pand `16*($k+3)-128`($bp),%xmm3
476 por %xmm2,%xmm0
477 por %xmm3,%xmm1
478___
479for($k=0;$k<$STRIDE/16-4;$k+=4) {
480$code.=<<___;
481 movdqa `16*($k+0)-128`($bp),%xmm4
482 movdqa `16*($k+1)-128`($bp),%xmm5
483 movdqa `16*($k+2)-128`($bp),%xmm2
484 pand `16*($k+0)+112`(%r10),%xmm4
485 movdqa `16*($k+3)-128`($bp),%xmm3
486 pand `16*($k+1)+112`(%r10),%xmm5
487 por %xmm4,%xmm0
488 pand `16*($k+2)+112`(%r10),%xmm2
489 por %xmm5,%xmm1
490 pand `16*($k+3)+112`(%r10),%xmm3
402 por %xmm2,%xmm0 491 por %xmm2,%xmm0
492 por %xmm3,%xmm1
493___
494}
495$code.=<<___;
496 por %xmm1,%xmm0
497 pshufd \$0x4e,%xmm0,%xmm1
498 por %xmm1,%xmm0
403 lea $STRIDE($bp),$bp 499 lea $STRIDE($bp),$bp
404 por %xmm3,%xmm0
405
406 movd %xmm0,$m0 # m0=bp[0] 500 movd %xmm0,$m0 # m0=bp[0]
501
407 mov ($n0),$n0 # pull n0[0] value 502 mov ($n0),$n0 # pull n0[0] value
408 mov ($ap),%rax 503 mov ($ap),%rax
409 504
410 xor $i,$i # i=0 505 xor $i,$i # i=0
411 xor $j,$j # j=0 506 xor $j,$j # j=0
412 507
413 movq `0*$STRIDE/4-96`($bp),%xmm0
414 movq `1*$STRIDE/4-96`($bp),%xmm1
415 pand %xmm4,%xmm0
416 movq `2*$STRIDE/4-96`($bp),%xmm2
417 pand %xmm5,%xmm1
418
419 mov $n0,$m1 508 mov $n0,$m1
420 mulq $m0 # ap[0]*bp[0] 509 mulq $m0 # ap[0]*bp[0]
421 mov %rax,$A[0] 510 mov %rax,$A[0]
422 mov ($np),%rax 511 mov ($np),%rax
423 512
424 movq `3*$STRIDE/4-96`($bp),%xmm3
425 pand %xmm6,%xmm2
426 por %xmm1,%xmm0
427 pand %xmm7,%xmm3
428
429 imulq $A[0],$m1 # "tp[0]"*n0 513 imulq $A[0],$m1 # "tp[0]"*n0
430 mov %rdx,$A[1] 514 mov %rdx,$A[1]
431 515
432 por %xmm2,%xmm0
433 lea $STRIDE($bp),$bp
434 por %xmm3,%xmm0
435
436 mulq $m1 # np[0]*m1 516 mulq $m1 # np[0]*m1
437 add %rax,$A[0] # discarded 517 add %rax,$A[0] # discarded
438 mov 8($ap),%rax 518 mov 8($ap),%rax
@@ -550,8 +630,6 @@ $code.=<<___;
550 mov $N[1],-16(%rsp,$j,8) # tp[j-1] 630 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
551 mov %rdx,$N[0] 631 mov %rdx,$N[0]
552 632
553 movd %xmm0,$m0 # bp[1]
554
555 xor $N[1],$N[1] 633 xor $N[1],$N[1]
556 add $A[0],$N[0] 634 add $A[0],$N[0]
557 adc \$0,$N[1] 635 adc \$0,$N[1]
@@ -561,12 +639,34 @@ $code.=<<___;
561 lea 1($i),$i # i++ 639 lea 1($i),$i # i++
562.align 4 640.align 4
563.Louter4x: 641.Louter4x:
642 lea 32+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization)
643 pxor %xmm4,%xmm4
644 pxor %xmm5,%xmm5
645___
646for($k=0;$k<$STRIDE/16;$k+=4) {
647$code.=<<___;
648 movdqa `16*($k+0)-128`($bp),%xmm0
649 movdqa `16*($k+1)-128`($bp),%xmm1
650 movdqa `16*($k+2)-128`($bp),%xmm2
651 movdqa `16*($k+3)-128`($bp),%xmm3
652 pand `16*($k+0)-128`(%rdx),%xmm0
653 pand `16*($k+1)-128`(%rdx),%xmm1
654 por %xmm0,%xmm4
655 pand `16*($k+2)-128`(%rdx),%xmm2
656 por %xmm1,%xmm5
657 pand `16*($k+3)-128`(%rdx),%xmm3
658 por %xmm2,%xmm4
659 por %xmm3,%xmm5
660___
661}
662$code.=<<___;
663 por %xmm5,%xmm4
664 pshufd \$0x4e,%xmm4,%xmm0
665 por %xmm4,%xmm0
666 lea $STRIDE($bp),$bp
667 movd %xmm0,$m0 # m0=bp[i]
668
564 xor $j,$j # j=0 669 xor $j,$j # j=0
565 movq `0*$STRIDE/4-96`($bp),%xmm0
566 movq `1*$STRIDE/4-96`($bp),%xmm1
567 pand %xmm4,%xmm0
568 movq `2*$STRIDE/4-96`($bp),%xmm2
569 pand %xmm5,%xmm1
570 670
571 mov (%rsp),$A[0] 671 mov (%rsp),$A[0]
572 mov $n0,$m1 672 mov $n0,$m1
@@ -575,18 +675,9 @@ $code.=<<___;
575 mov ($np),%rax 675 mov ($np),%rax
576 adc \$0,%rdx 676 adc \$0,%rdx
577 677
578 movq `3*$STRIDE/4-96`($bp),%xmm3
579 pand %xmm6,%xmm2
580 por %xmm1,%xmm0
581 pand %xmm7,%xmm3
582
583 imulq $A[0],$m1 # tp[0]*n0 678 imulq $A[0],$m1 # tp[0]*n0
584 mov %rdx,$A[1] 679 mov %rdx,$A[1]
585 680
586 por %xmm2,%xmm0
587 lea $STRIDE($bp),$bp
588 por %xmm3,%xmm0
589
590 mulq $m1 # np[0]*m1 681 mulq $m1 # np[0]*m1
591 add %rax,$A[0] # "$N[0]", discarded 682 add %rax,$A[0] # "$N[0]", discarded
592 mov 8($ap),%rax 683 mov 8($ap),%rax
@@ -718,7 +809,6 @@ $code.=<<___;
718 mov $N[0],-24(%rsp,$j,8) # tp[j-1] 809 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
719 mov %rdx,$N[0] 810 mov %rdx,$N[0]
720 811
721 movd %xmm0,$m0 # bp[i+1]
722 mov $N[1],-16(%rsp,$j,8) # tp[j-1] 812 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
723 813
724 xor $N[1],$N[1] 814 xor $N[1],$N[1]
@@ -809,13 +899,7 @@ ___
809$code.=<<___; 899$code.=<<___;
810 mov 8(%rsp,$num,8),%rsi # restore %rsp 900 mov 8(%rsp,$num,8),%rsi # restore %rsp
811 mov \$1,%rax 901 mov \$1,%rax
812___ 902
813$code.=<<___ if ($win64);
814 movaps (%rsi),%xmm6
815 movaps 0x10(%rsi),%xmm7
816 lea 0x28(%rsi),%rsi
817___
818$code.=<<___;
819 mov (%rsi),%r15 903 mov (%rsi),%r15
820 mov 8(%rsi),%r14 904 mov 8(%rsi),%r14
821 mov 16(%rsi),%r13 905 mov 16(%rsi),%r13
@@ -830,8 +914,8 @@ ___
830}}} 914}}}
831 915
832{ 916{
833my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order 917my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9d") : # Win64 order
834 ("%rdi","%rsi","%rdx","%rcx"); # Unix order 918 ("%rdi","%rsi","%rdx","%ecx"); # Unix order
835my $out=$inp; 919my $out=$inp;
836my $STRIDE=2**5*8; 920my $STRIDE=2**5*8;
837my $N=$STRIDE/4; 921my $N=$STRIDE/4;
@@ -859,53 +943,89 @@ bn_scatter5:
859.type bn_gather5,\@abi-omnipotent 943.type bn_gather5,\@abi-omnipotent
860.align 16 944.align 16
861bn_gather5: 945bn_gather5:
862___ 946.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases
863$code.=<<___ if ($win64);
864.LSEH_begin_bn_gather5:
865 # I can't trust assembler to use specific encoding:-( 947 # I can't trust assembler to use specific encoding:-(
866 .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp 948 .byte 0x4c,0x8d,0x14,0x24 # lea (%rsp),%r10
867 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) 949 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 # sub $0x108,%rsp
868 .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) 950 lea .Linc(%rip),%rax
951 and \$-16,%rsp # shouldn't be formally required
952
953 movd $idx,%xmm5
954 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
955 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
956 lea 128($tbl),%r11 # size optimization
957 lea 128(%rsp),%rax # size optimization
958
959 pshufd \$0,%xmm5,%xmm5 # broadcast $idx
960 movdqa %xmm1,%xmm4
961 movdqa %xmm1,%xmm2
869___ 962___
963########################################################################
964# calculate mask by comparing 0..31 to $idx and save result to stack
965#
966for($i=0;$i<$STRIDE/16;$i+=4) {
967$code.=<<___;
968 paddd %xmm0,%xmm1
969 pcmpeqd %xmm5,%xmm0 # compare to 1,0
970___
971$code.=<<___ if ($i);
972 movdqa %xmm3,`16*($i-1)-128`(%rax)
973___
974$code.=<<___;
975 movdqa %xmm4,%xmm3
976
977 paddd %xmm1,%xmm2
978 pcmpeqd %xmm5,%xmm1 # compare to 3,2
979 movdqa %xmm0,`16*($i+0)-128`(%rax)
980 movdqa %xmm4,%xmm0
981
982 paddd %xmm2,%xmm3
983 pcmpeqd %xmm5,%xmm2 # compare to 5,4
984 movdqa %xmm1,`16*($i+1)-128`(%rax)
985 movdqa %xmm4,%xmm1
986
987 paddd %xmm3,%xmm0
988 pcmpeqd %xmm5,%xmm3 # compare to 7,6
989 movdqa %xmm2,`16*($i+2)-128`(%rax)
990 movdqa %xmm4,%xmm2
991___
992}
870$code.=<<___; 993$code.=<<___;
871 mov $idx,%r11 994 movdqa %xmm3,`16*($i-1)-128`(%rax)
872 shr \$`log($N/8)/log(2)`,$idx
873 and \$`$N/8-1`,%r11
874 not $idx
875 lea .Lmagic_masks(%rip),%rax
876 and \$`2**5/($N/8)-1`,$idx # 5 is "window size"
877 lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line
878 movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which
879 movq 8(%rax,$idx,8),%xmm5 # cache line contains element
880 movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument
881 movq 24(%rax,$idx,8),%xmm7
882 jmp .Lgather 995 jmp .Lgather
883.align 16
884.Lgather:
885 movq `0*$STRIDE/4-96`($tbl),%xmm0
886 movq `1*$STRIDE/4-96`($tbl),%xmm1
887 pand %xmm4,%xmm0
888 movq `2*$STRIDE/4-96`($tbl),%xmm2
889 pand %xmm5,%xmm1
890 movq `3*$STRIDE/4-96`($tbl),%xmm3
891 pand %xmm6,%xmm2
892 por %xmm1,%xmm0
893 pand %xmm7,%xmm3
894 por %xmm2,%xmm0
895 lea $STRIDE($tbl),$tbl
896 por %xmm3,%xmm0
897 996
997.align 32
998.Lgather:
999 pxor %xmm4,%xmm4
1000 pxor %xmm5,%xmm5
1001___
1002for($i=0;$i<$STRIDE/16;$i+=4) {
1003$code.=<<___;
1004 movdqa `16*($i+0)-128`(%r11),%xmm0
1005 movdqa `16*($i+1)-128`(%r11),%xmm1
1006 movdqa `16*($i+2)-128`(%r11),%xmm2
1007 pand `16*($i+0)-128`(%rax),%xmm0
1008 movdqa `16*($i+3)-128`(%r11),%xmm3
1009 pand `16*($i+1)-128`(%rax),%xmm1
1010 por %xmm0,%xmm4
1011 pand `16*($i+2)-128`(%rax),%xmm2
1012 por %xmm1,%xmm5
1013 pand `16*($i+3)-128`(%rax),%xmm3
1014 por %xmm2,%xmm4
1015 por %xmm3,%xmm5
1016___
1017}
1018$code.=<<___;
1019 por %xmm5,%xmm4
1020 lea $STRIDE(%r11),%r11
1021 pshufd \$0x4e,%xmm4,%xmm0
1022 por %xmm4,%xmm0
898 movq %xmm0,($out) # m0=bp[0] 1023 movq %xmm0,($out) # m0=bp[0]
899 lea 8($out),$out 1024 lea 8($out),$out
900 sub \$1,$num 1025 sub \$1,$num
901 jnz .Lgather 1026 jnz .Lgather
902___ 1027
903$code.=<<___ if ($win64); 1028 lea (%r10),%rsp
904 movaps (%rsp),%xmm6
905 movaps 0x10(%rsp),%xmm7
906 lea 0x28(%rsp),%rsp
907___
908$code.=<<___;
909 ret 1029 ret
910.LSEH_end_bn_gather5: 1030.LSEH_end_bn_gather5:
911.size bn_gather5,.-bn_gather5 1031.size bn_gather5,.-bn_gather5
@@ -913,9 +1033,9 @@ ___
913} 1033}
914$code.=<<___; 1034$code.=<<___;
915.align 64 1035.align 64
916.Lmagic_masks: 1036.Linc:
917 .long 0,0, 0,0, 0,0, -1,-1 1037 .long 0,0, 1,1
918 .long 0,0, 0,0, 0,0, 0,0 1038 .long 2,2, 2,2
919.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 1039.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
920___ 1040___
921 1041
@@ -954,7 +1074,7 @@ mul_handler:
954 cmp %r10,%rbx # context->Rip<end of prologue label 1074 cmp %r10,%rbx # context->Rip<end of prologue label
955 jb .Lcommon_seh_tail 1075 jb .Lcommon_seh_tail
956 1076
957 lea `40+48`(%rax),%rax 1077 lea 48(%rax),%rax
958 1078
959 mov 4(%r11),%r10d # HandlerData[1] 1079 mov 4(%r11),%r10d # HandlerData[1]
960 lea (%rsi,%r10),%r10 # end of alloca label 1080 lea (%rsi,%r10),%r10 # end of alloca label
@@ -971,9 +1091,7 @@ mul_handler:
971 mov 192($context),%r10 # pull $num 1091 mov 192($context),%r10 # pull $num
972 mov 8(%rax,%r10,8),%rax # pull saved stack pointer 1092 mov 8(%rax,%r10,8),%rax # pull saved stack pointer
973 1093
974 movaps (%rax),%xmm0 1094 lea 48(%rax),%rax
975 movaps 16(%rax),%xmm1
976 lea `40+48`(%rax),%rax
977 1095
978 mov -8(%rax),%rbx 1096 mov -8(%rax),%rbx
979 mov -16(%rax),%rbp 1097 mov -16(%rax),%rbp
@@ -987,8 +1105,6 @@ mul_handler:
987 mov %r13,224($context) # restore context->R13 1105 mov %r13,224($context) # restore context->R13
988 mov %r14,232($context) # restore context->R14 1106 mov %r14,232($context) # restore context->R14
989 mov %r15,240($context) # restore context->R15 1107 mov %r15,240($context) # restore context->R15
990 movups %xmm0,512($context) # restore context->Xmm6
991 movups %xmm1,528($context) # restore context->Xmm7
992 1108
993.Lcommon_seh_tail: 1109.Lcommon_seh_tail:
994 mov 8(%rax),%rdi 1110 mov 8(%rax),%rdi
@@ -1057,10 +1173,9 @@ mul_handler:
1057 .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] 1173 .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
1058.align 8 1174.align 8
1059.LSEH_info_bn_gather5: 1175.LSEH_info_bn_gather5:
1060 .byte 0x01,0x0d,0x05,0x00 1176 .byte 0x01,0x0b,0x03,0x0a
1061 .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 1177 .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108
1062 .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 1178 .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp), set_frame r10
1063 .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28
1064.align 8 1179.align 8
1065___ 1180___
1066} 1181}