summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/bn/asm/x86_64-mont5.pl
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/libcrypto/bn/asm/x86_64-mont5.pl')
-rwxr-xr-xsrc/lib/libcrypto/bn/asm/x86_64-mont5.pl1192
1 files changed, 0 insertions, 1192 deletions
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont5.pl b/src/lib/libcrypto/bn/asm/x86_64-mont5.pl
deleted file mode 100755
index 38751ec5de..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-mont5.pl
+++ /dev/null
@@ -1,1192 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# August 2011.
11#
12# Companion to x86_64-mont.pl that optimizes cache-timing attack
13# countermeasures. The subroutines are produced by replacing bp[i]
14# references in their x86_64-mont.pl counterparts with cache-neutral
15# references to powers table computed in BN_mod_exp_mont_consttime.
16# In addition subroutine that scatters elements of the powers table
17# is implemented, so that scatter-/gathering can be tuned without
18# bn_exp.c modifications.
19
# Command line: [flavour] output. A first argument containing a dot is
# taken to be the output file name, in which case no flavour is set.
20$flavour = shift;
21$output = shift;
22if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
23
# Win64 conventions are selected by a nasm/masm/mingw64 flavour or by an
# output file name ending in .asm.
24$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
25
# Locate the x86_64-xlate.pl translator, first next to this script and
# then in ../../perlasm/ relative to it.
26$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
27( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
28( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
29die "can't locate x86_64-xlate.pl";
30
# Everything printed to STDOUT is piped through the translator. Abort if
# the pipe cannot be started instead of silently generating no output.
31open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
32*STDOUT=*OUT;
33
# bn_mul_mont_gather5: C-level argument list with the register that carries
# each argument, followed by the scratch registers the generated code uses.
34# int bn_mul_mont_gather5(
35$rp="%rdi"; # BN_ULONG *rp,
36$ap="%rsi"; # const BN_ULONG *ap,
37$bp="%rdx"; # const BN_ULONG *bp,
38$np="%rcx"; # const BN_ULONG *np,
39$n0="%r8"; # const BN_ULONG *n0,
40$num="%r9"; # int num,
41 # int idx); # 0 to 2^5-1, "index" in $bp holding
42 # pre-computed powers of a', interlaced
43 # in such manner that b[0] is $bp[idx],
44 # b[1] is [2^5+idx], etc.
45$lo0="%r10";
46$hi0="%r11";
47$hi1="%r13";
48$i="%r14";
49$j="%r15";
50$m0="%rbx";
51$m1="%rbp";
52
# Entry point. A num that is a multiple of 4 and at least 8 jumps to
# .Lmul4x_enter; everything else falls into .Lmul_enter, which saves the six
# callee-saved registers, carves tp[] plus the mask area out of the stack and
# records the pre-allocation stack pointer in tp[num+1].
53$code=<<___;
54.text
55
56.globl bn_mul_mont_gather5
57.type bn_mul_mont_gather5,\@function,6
58.align 64
59bn_mul_mont_gather5:
60 _CET_ENDBR
61 test \$3,${num}d # num not a multiple of 4?
62 jnz .Lmul_enter
63 cmp \$8,${num}d # fewer than 8 words?
64 jb .Lmul_enter
65 jmp .Lmul4x_enter # otherwise use the 4x-unrolled code
66
67.align 16
68.Lmul_enter:
69 mov ${num}d,${num}d # 32-bit move zero-extends num
70 movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
71 lea .Linc(%rip),%r10 # address of the .Linc constants
72 push %rbx
73 push %rbp
74 push %r12
75 push %r13
76 push %r14
77 push %r15
78
79.Lmul_alloca:
80 mov %rsp,%rax # keep the stack pointer as it is after the pushes
81 lea 2($num),%r11
82 neg %r11 # -(num+2)
83 lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8)
84 and \$-1024,%rsp # minimize TLB usage
85
86 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
87.Lmul_body:
88 lea 128($bp),%r12 # reassign $bp (+size optimization)
89___
# From here on $bp names %r12, which the preceding lea set to the table
# pointer plus 128; table accesses below therefore use a -128 bias.
90 $bp="%r12";
91 $STRIDE=2**5*8; # 5 is "window size"
92 $N=$STRIDE/4; # should match cache line size
# NOTE(review): the scalar $N does not appear to be read anywhere in this file.
93$code.=<<___;
94 movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000
95 movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002
96 lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
97 and \$-16,%r10
98
99 pshufd \$0,%xmm5,%xmm5 # broadcast index
100 movdqa %xmm1,%xmm4
101 movdqa %xmm1,%xmm2
102___
103########################################################################
104# calculate mask by comparing 0..31 to index and save result to stack
105#
106$code.=<<___;
107 paddd %xmm0,%xmm1
108 pcmpeqd %xmm5,%xmm0 # compare to 1,0
109 .byte 0x67
110 movdqa %xmm4,%xmm3
111___
# Three unrolled passes (k=0,4,8); each stores four 16-byte masks at
# 112(%r10) onwards. $k is left at 12 and is reused by the tail below.
112for($k=0;$k<$STRIDE/16-4;$k+=4) {
113$code.=<<___;
114 paddd %xmm1,%xmm2
115 pcmpeqd %xmm5,%xmm1 # compare to 3,2
116 movdqa %xmm0,`16*($k+0)+112`(%r10)
117 movdqa %xmm4,%xmm0
118
119 paddd %xmm2,%xmm3
120 pcmpeqd %xmm5,%xmm2 # compare to 5,4
121 movdqa %xmm1,`16*($k+1)+112`(%r10)
122 movdqa %xmm4,%xmm1
123
124 paddd %xmm3,%xmm0
125 pcmpeqd %xmm5,%xmm3 # compare to 7,6
126 movdqa %xmm2,`16*($k+2)+112`(%r10)
127 movdqa %xmm4,%xmm2
128
129 paddd %xmm0,%xmm1
130 pcmpeqd %xmm5,%xmm0
131 movdqa %xmm3,`16*($k+3)+112`(%r10)
132 movdqa %xmm4,%xmm3
133___
134}
# Tail (k=12..15): the last four masks are stored and, while still in
# registers, immediately ANDed with the matching table entries.
135$code.=<<___; # last iteration can be optimized
136 paddd %xmm1,%xmm2
137 pcmpeqd %xmm5,%xmm1
138 movdqa %xmm0,`16*($k+0)+112`(%r10)
139
140 paddd %xmm2,%xmm3
141 .byte 0x67
142 pcmpeqd %xmm5,%xmm2
143 movdqa %xmm1,`16*($k+1)+112`(%r10)
144
145 pcmpeqd %xmm5,%xmm3
146 movdqa %xmm2,`16*($k+2)+112`(%r10)
147 pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register
148
149 pand `16*($k+1)-128`($bp),%xmm1
150 pand `16*($k+2)-128`($bp),%xmm2
151 movdqa %xmm3,`16*($k+3)+112`(%r10)
152 pand `16*($k+3)-128`($bp),%xmm3
153 por %xmm2,%xmm0
154 por %xmm3,%xmm1
155___
# Remaining entries (k=0,4,8): load every table entry, AND it with the mask
# saved on the stack and OR the result into the %xmm0/%xmm1 accumulators.
156for($k=0;$k<$STRIDE/16-4;$k+=4) {
157$code.=<<___;
158 movdqa `16*($k+0)-128`($bp),%xmm4
159 movdqa `16*($k+1)-128`($bp),%xmm5
160 movdqa `16*($k+2)-128`($bp),%xmm2
161 pand `16*($k+0)+112`(%r10),%xmm4
162 movdqa `16*($k+3)-128`($bp),%xmm3
163 pand `16*($k+1)+112`(%r10),%xmm5
164 por %xmm4,%xmm0
165 pand `16*($k+2)+112`(%r10),%xmm2
166 por %xmm5,%xmm1
167 pand `16*($k+3)+112`(%r10),%xmm3
168 por %xmm2,%xmm0
169 por %xmm3,%xmm1
170___
171}
# Fold the two accumulators into the selected 64-bit word bp[0], then run
# the first pass (.L1st), which fills tp[] from ap[j]*bp[0] and np[j]*m1.
# The heredoc ends at the top of .Louter, where the gather for bp[i] starts.
172$code.=<<___;
173 por %xmm1,%xmm0
174 pshufd \$0x4e,%xmm0,%xmm1 # swap the two 64-bit halves
175 por %xmm1,%xmm0
176 lea $STRIDE($bp),$bp
177 movd %xmm0,$m0 # m0=bp[0]
178
179 mov ($n0),$n0 # pull n0[0] value
180 mov ($ap),%rax
181
182 xor $i,$i # i=0
183 xor $j,$j # j=0
184
185 mov $n0,$m1
186 mulq $m0 # ap[0]*bp[0]
187 mov %rax,$lo0
188 mov ($np),%rax
189
190 imulq $lo0,$m1 # "tp[0]"*n0
191 mov %rdx,$hi0
192
193 mulq $m1 # np[0]*m1
194 add %rax,$lo0 # discarded
195 mov 8($ap),%rax
196 adc \$0,%rdx
197 mov %rdx,$hi1
198
199 lea 1($j),$j # j++
200 jmp .L1st_enter
201
202.align 16
203.L1st:
204 add %rax,$hi1
205 mov ($ap,$j,8),%rax
206 adc \$0,%rdx
207 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
208 mov $lo0,$hi0
209 adc \$0,%rdx
210 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
211 mov %rdx,$hi1
212
213.L1st_enter:
214 mulq $m0 # ap[j]*bp[0]
215 add %rax,$hi0
216 mov ($np,$j,8),%rax
217 adc \$0,%rdx
218 lea 1($j),$j # j++
219 mov %rdx,$lo0
220
221 mulq $m1 # np[j]*m1
222 cmp $num,$j
223 jl .L1st
224
225 add %rax,$hi1
226 mov ($ap),%rax # ap[0]
227 adc \$0,%rdx
228 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
229 adc \$0,%rdx
230 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
231 mov %rdx,$hi1
232 mov $lo0,$hi0
233
234 xor %rdx,%rdx
235 add $hi0,$hi1
236 adc \$0,%rdx
237 mov $hi1,-8(%rsp,$num,8)
238 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
239
240 lea 1($i),$i # i++
241 jmp .Louter
242.align 16
243.Louter:
244 lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization)
245 and \$-16,%rdx
246 pxor %xmm4,%xmm4
247 pxor %xmm5,%xmm5
248___
# Gather bp[i]: all 16 table entries (k=0,4,8,12) are loaded, ANDed with the
# masks saved on the stack and ORed into the %xmm4/%xmm5 accumulators.
249for($k=0;$k<$STRIDE/16;$k+=4) {
250$code.=<<___;
251 movdqa `16*($k+0)-128`($bp),%xmm0
252 movdqa `16*($k+1)-128`($bp),%xmm1
253 movdqa `16*($k+2)-128`($bp),%xmm2
254 movdqa `16*($k+3)-128`($bp),%xmm3
255 pand `16*($k+0)-128`(%rdx),%xmm0
256 pand `16*($k+1)-128`(%rdx),%xmm1
257 por %xmm0,%xmm4
258 pand `16*($k+2)-128`(%rdx),%xmm2
259 por %xmm1,%xmm5
260 pand `16*($k+3)-128`(%rdx),%xmm3
261 por %xmm2,%xmm4
262 por %xmm3,%xmm5
263___
264}
# Rest of the outer loop: fold the gathered bp[i], run .Linner over all j,
# and loop until i reaches num. Afterwards .Lsub writes tp-np to rp, .Lcopy
# copies either tp or rp into rp (selected by the final borrow) while
# overwriting tp[], and the epilogue restores the saved registers and %rsp.
265$code.=<<___;
266 por %xmm5,%xmm4
267 pshufd \$0x4e,%xmm4,%xmm0 # swap the two 64-bit halves
268 por %xmm4,%xmm0
269 lea $STRIDE($bp),$bp
270 movd %xmm0,$m0 # m0=bp[i]
271
272 xor $j,$j # j=0
273 mov $n0,$m1
274 mov (%rsp),$lo0
275
276 mulq $m0 # ap[0]*bp[i]
277 add %rax,$lo0 # ap[0]*bp[i]+tp[0]
278 mov ($np),%rax
279 adc \$0,%rdx
280
281 imulq $lo0,$m1 # tp[0]*n0
282 mov %rdx,$hi0
283
284 mulq $m1 # np[0]*m1
285 add %rax,$lo0 # discarded
286 mov 8($ap),%rax
287 adc \$0,%rdx
288 mov 8(%rsp),$lo0 # tp[1]
289 mov %rdx,$hi1
290
291 lea 1($j),$j # j++
292 jmp .Linner_enter
293
294.align 16
295.Linner:
296 add %rax,$hi1
297 mov ($ap,$j,8),%rax
298 adc \$0,%rdx
299 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
300 mov (%rsp,$j,8),$lo0
301 adc \$0,%rdx
302 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
303 mov %rdx,$hi1
304
305.Linner_enter:
306 mulq $m0 # ap[j]*bp[i]
307 add %rax,$hi0
308 mov ($np,$j,8),%rax
309 adc \$0,%rdx
310 add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
311 mov %rdx,$hi0
312 adc \$0,$hi0
313 lea 1($j),$j # j++
314
315 mulq $m1 # np[j]*m1
316 cmp $num,$j
317 jl .Linner
318
319 add %rax,$hi1
320 mov ($ap),%rax # ap[0]
321 adc \$0,%rdx
322 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
323 mov (%rsp,$j,8),$lo0
324 adc \$0,%rdx
325 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
326 mov %rdx,$hi1
327
328 xor %rdx,%rdx
329 add $hi0,$hi1
330 adc \$0,%rdx
331 add $lo0,$hi1 # pull upmost overflow bit
332 adc \$0,%rdx
333 mov $hi1,-8(%rsp,$num,8)
334 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
335
336 lea 1($i),$i # i++
337 cmp $num,$i
338 jl .Louter
339
340 xor $i,$i # i=0 and clear CF!
341 mov (%rsp),%rax # tp[0]
342 lea (%rsp),$ap # borrow ap for tp
343 mov $num,$j # j=num
344 jmp .Lsub
345.align 16
346.Lsub: sbb ($np,$i,8),%rax
347 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
348 mov 8($ap,$i,8),%rax # tp[i+1]
349 lea 1($i),$i # i++
350 dec $j # doesn't affect CF!
351 jnz .Lsub
352
353 sbb \$0,%rax # handle upmost overflow bit
354 xor $i,$i
355 and %rax,$ap
356 not %rax
357 mov $rp,$np
358 and %rax,$np
359 mov $num,$j # j=num
360 or $np,$ap # ap=borrow?tp:rp
361.align 16
362.Lcopy: # copy or in-place refresh
363 mov ($ap,$i,8),%rax
364 mov $i,(%rsp,$i,8) # zap temporary vector
365 mov %rax,($rp,$i,8) # rp[i]=tp[i]
366 lea 1($i),$i
367 sub \$1,$j
368 jnz .Lcopy
369
370 mov 8(%rsp,$num,8),%rsi # restore %rsp
371 mov \$1,%rax
372
373 mov (%rsi),%r15
374 mov 8(%rsi),%r14
375 mov 16(%rsi),%r13
376 mov 24(%rsi),%r12
377 mov 32(%rsi),%rbp
378 mov 40(%rsi),%rbx
379 lea 48(%rsi),%rsp
380.Lmul_epilogue:
381 ret
382.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
383___
# bn_mul4x_mont_gather5: 4x-unrolled variant, reached from
# bn_mul_mont_gather5 through .Lmul4x_enter. @A and @N name the two
# accumulator register pairs. $N[1] is %rdi, the register that carries rp,
# so rp is parked in tp[num+2] right after the stack frame is set up.
384{{{
385my @A=("%r10","%r11");
386my @N=("%r13","%rdi");
387$code.=<<___;
388.type bn_mul4x_mont_gather5,\@function,6
389.align 16
390bn_mul4x_mont_gather5:
391 _CET_ENDBR
392.Lmul4x_enter:
393 mov ${num}d,${num}d # 32-bit move zero-extends num
394 movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
395 lea .Linc(%rip),%r10 # address of the .Linc constants
396 push %rbx
397 push %rbp
398 push %r12
399 push %r13
400 push %r14
401 push %r15
402
403.Lmul4x_alloca:
404 mov %rsp,%rax # keep the stack pointer as it is after the pushes
405 lea 4($num),%r11
406 neg %r11 # -(num+4)
407 lea -256(%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)+256)
408 and \$-1024,%rsp # minimize TLB usage
409
410 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
411.Lmul4x_body:
412 mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
413 lea 128(%rdx),%r12 # reassign $bp (+size optimization)
414___
# From here on $bp names %r12, which the preceding lea set to the table
# pointer plus 128; table accesses below therefore use a -128 bias.
415 $bp="%r12";
416 $STRIDE=2**5*8; # 5 is "window size"
417 $N=$STRIDE/4; # should match cache line size
# NOTE(review): the scalar $N does not appear to be read anywhere in this file
# (the @N array used by this routine is a different variable).
418$code.=<<___;
419 movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000
420 movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002
421 lea 32-112(%rsp,$num,8),%r10# place the mask after tp[num+4] (+ICache optimization)
422
423 pshufd \$0,%xmm5,%xmm5 # broadcast index
424 movdqa %xmm1,%xmm4
425 .byte 0x67,0x67
426 movdqa %xmm1,%xmm2
427___
428########################################################################
429# calculate mask by comparing 0..31 to index and save result to stack
430#
431$code.=<<___;
432 paddd %xmm0,%xmm1
433 pcmpeqd %xmm5,%xmm0 # compare to 1,0
434 .byte 0x67
435 movdqa %xmm4,%xmm3
436___
# Three unrolled passes (k=0,4,8); each stores four 16-byte masks at
# 112(%r10) onwards. $k is left at 12 and is reused by the tail below.
437for($k=0;$k<$STRIDE/16-4;$k+=4) {
438$code.=<<___;
439 paddd %xmm1,%xmm2
440 pcmpeqd %xmm5,%xmm1 # compare to 3,2
441 movdqa %xmm0,`16*($k+0)+112`(%r10)
442 movdqa %xmm4,%xmm0
443
444 paddd %xmm2,%xmm3
445 pcmpeqd %xmm5,%xmm2 # compare to 5,4
446 movdqa %xmm1,`16*($k+1)+112`(%r10)
447 movdqa %xmm4,%xmm1
448
449 paddd %xmm3,%xmm0
450 pcmpeqd %xmm5,%xmm3 # compare to 7,6
451 movdqa %xmm2,`16*($k+2)+112`(%r10)
452 movdqa %xmm4,%xmm2
453
454 paddd %xmm0,%xmm1
455 pcmpeqd %xmm5,%xmm0
456 movdqa %xmm3,`16*($k+3)+112`(%r10)
457 movdqa %xmm4,%xmm3
458___
459}
# Tail (k=12..15): the last four masks are stored and, while still in
# registers, immediately ANDed with the matching table entries.
460$code.=<<___; # last iteration can be optimized
461 paddd %xmm1,%xmm2
462 pcmpeqd %xmm5,%xmm1
463 movdqa %xmm0,`16*($k+0)+112`(%r10)
464
465 paddd %xmm2,%xmm3
466 .byte 0x67
467 pcmpeqd %xmm5,%xmm2
468 movdqa %xmm1,`16*($k+1)+112`(%r10)
469
470 pcmpeqd %xmm5,%xmm3
471 movdqa %xmm2,`16*($k+2)+112`(%r10)
472 pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register
473
474 pand `16*($k+1)-128`($bp),%xmm1
475 pand `16*($k+2)-128`($bp),%xmm2
476 movdqa %xmm3,`16*($k+3)+112`(%r10)
477 pand `16*($k+3)-128`($bp),%xmm3
478 por %xmm2,%xmm0
479 por %xmm3,%xmm1
480___
# Remaining entries (k=0,4,8): load every table entry, AND it with the mask
# saved on the stack and OR the result into the %xmm0/%xmm1 accumulators.
481for($k=0;$k<$STRIDE/16-4;$k+=4) {
482$code.=<<___;
483 movdqa `16*($k+0)-128`($bp),%xmm4
484 movdqa `16*($k+1)-128`($bp),%xmm5
485 movdqa `16*($k+2)-128`($bp),%xmm2
486 pand `16*($k+0)+112`(%r10),%xmm4
487 movdqa `16*($k+3)-128`($bp),%xmm3
488 pand `16*($k+1)+112`(%r10),%xmm5
489 por %xmm4,%xmm0
490 pand `16*($k+2)+112`(%r10),%xmm2
491 por %xmm5,%xmm1
492 pand `16*($k+3)+112`(%r10),%xmm3
493 por %xmm2,%xmm0
494 por %xmm3,%xmm1
495___
496}
# Fold the accumulators into bp[0] and run the first pass (.L1st4x), which
# handles four words of ap/np per iteration. The heredoc ends at the top of
# .Louter4x, where the gather for bp[i] starts.
497$code.=<<___;
498 por %xmm1,%xmm0
499 pshufd \$0x4e,%xmm0,%xmm1 # swap the two 64-bit halves
500 por %xmm1,%xmm0
501 lea $STRIDE($bp),$bp
502 movd %xmm0,$m0 # m0=bp[0]
503
504 mov ($n0),$n0 # pull n0[0] value
505 mov ($ap),%rax
506
507 xor $i,$i # i=0
508 xor $j,$j # j=0
509
510 mov $n0,$m1
511 mulq $m0 # ap[0]*bp[0]
512 mov %rax,$A[0]
513 mov ($np),%rax
514
515 imulq $A[0],$m1 # "tp[0]"*n0
516 mov %rdx,$A[1]
517
518 mulq $m1 # np[0]*m1
519 add %rax,$A[0] # discarded
520 mov 8($ap),%rax
521 adc \$0,%rdx
522 mov %rdx,$N[1]
523
524 mulq $m0
525 add %rax,$A[1]
526 mov 8($np),%rax
527 adc \$0,%rdx
528 mov %rdx,$A[0]
529
530 mulq $m1
531 add %rax,$N[1]
532 mov 16($ap),%rax
533 adc \$0,%rdx
534 add $A[1],$N[1]
535 lea 4($j),$j # j+=4
536 adc \$0,%rdx
537 mov $N[1],(%rsp)
538 mov %rdx,$N[0]
539 jmp .L1st4x
540.align 16
541.L1st4x:
542 mulq $m0 # ap[j]*bp[0]
543 add %rax,$A[0]
544 mov -16($np,$j,8),%rax
545 adc \$0,%rdx
546 mov %rdx,$A[1]
547
548 mulq $m1 # np[j]*m1
549 add %rax,$N[0]
550 mov -8($ap,$j,8),%rax
551 adc \$0,%rdx
552 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
553 adc \$0,%rdx
554 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
555 mov %rdx,$N[1]
556
557 mulq $m0 # ap[j]*bp[0]
558 add %rax,$A[1]
559 mov -8($np,$j,8),%rax
560 adc \$0,%rdx
561 mov %rdx,$A[0]
562
563 mulq $m1 # np[j]*m1
564 add %rax,$N[1]
565 mov ($ap,$j,8),%rax
566 adc \$0,%rdx
567 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
568 adc \$0,%rdx
569 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
570 mov %rdx,$N[0]
571
572 mulq $m0 # ap[j]*bp[0]
573 add %rax,$A[0]
574 mov ($np,$j,8),%rax
575 adc \$0,%rdx
576 mov %rdx,$A[1]
577
578 mulq $m1 # np[j]*m1
579 add %rax,$N[0]
580 mov 8($ap,$j,8),%rax
581 adc \$0,%rdx
582 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
583 adc \$0,%rdx
584 mov $N[0],-8(%rsp,$j,8) # tp[j-1]
585 mov %rdx,$N[1]
586
587 mulq $m0 # ap[j]*bp[0]
588 add %rax,$A[1]
589 mov 8($np,$j,8),%rax
590 adc \$0,%rdx
591 lea 4($j),$j # j+=4
592 mov %rdx,$A[0]
593
594 mulq $m1 # np[j]*m1
595 add %rax,$N[1]
596 mov -16($ap,$j,8),%rax
597 adc \$0,%rdx
598 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
599 adc \$0,%rdx
600 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
601 mov %rdx,$N[0]
602 cmp $num,$j
603 jl .L1st4x
604
605 mulq $m0 # ap[j]*bp[0]
606 add %rax,$A[0]
607 mov -16($np,$j,8),%rax
608 adc \$0,%rdx
609 mov %rdx,$A[1]
610
611 mulq $m1 # np[j]*m1
612 add %rax,$N[0]
613 mov -8($ap,$j,8),%rax
614 adc \$0,%rdx
615 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
616 adc \$0,%rdx
617 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
618 mov %rdx,$N[1]
619
620 mulq $m0 # ap[j]*bp[0]
621 add %rax,$A[1]
622 mov -8($np,$j,8),%rax
623 adc \$0,%rdx
624 mov %rdx,$A[0]
625
626 mulq $m1 # np[j]*m1
627 add %rax,$N[1]
628 mov ($ap),%rax # ap[0]
629 adc \$0,%rdx
630 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
631 adc \$0,%rdx
632 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
633 mov %rdx,$N[0]
634
635 xor $N[1],$N[1]
636 add $A[0],$N[0]
637 adc \$0,$N[1]
638 mov $N[0],-8(%rsp,$j,8)
639 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
640
641 lea 1($i),$i # i++
642.align 4
643.Louter4x:
644 lea 32+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization)
645 pxor %xmm4,%xmm4
646 pxor %xmm5,%xmm5
647___
# Gather bp[i]: all 16 table entries (k=0,4,8,12) are loaded, ANDed with the
# masks saved on the stack and ORed into the %xmm4/%xmm5 accumulators.
648for($k=0;$k<$STRIDE/16;$k+=4) {
649$code.=<<___;
650 movdqa `16*($k+0)-128`($bp),%xmm0
651 movdqa `16*($k+1)-128`($bp),%xmm1
652 movdqa `16*($k+2)-128`($bp),%xmm2
653 movdqa `16*($k+3)-128`($bp),%xmm3
654 pand `16*($k+0)-128`(%rdx),%xmm0
655 pand `16*($k+1)-128`(%rdx),%xmm1
656 por %xmm0,%xmm4
657 pand `16*($k+2)-128`(%rdx),%xmm2
658 por %xmm1,%xmm5
659 pand `16*($k+3)-128`(%rdx),%xmm3
660 por %xmm2,%xmm4
661 por %xmm3,%xmm5
662___
663}
# Rest of the outer loop: fold the gathered bp[i], then .Linner4x adds
# ap[j]*bp[i] and np[j]*m1 into tp[], four words per iteration, and the
# outer loop repeats until i reaches num.
664$code.=<<___;
665 por %xmm5,%xmm4
666 pshufd \$0x4e,%xmm4,%xmm0 # swap the two 64-bit halves
667 por %xmm4,%xmm0
668 lea $STRIDE($bp),$bp
669 movd %xmm0,$m0 # m0=bp[i]
670
671 xor $j,$j # j=0
672
673 mov (%rsp),$A[0]
674 mov $n0,$m1
675 mulq $m0 # ap[0]*bp[i]
676 add %rax,$A[0] # ap[0]*bp[i]+tp[0]
677 mov ($np),%rax
678 adc \$0,%rdx
679
680 imulq $A[0],$m1 # tp[0]*n0
681 mov %rdx,$A[1]
682
683 mulq $m1 # np[0]*m1
684 add %rax,$A[0] # "$N[0]", discarded
685 mov 8($ap),%rax
686 adc \$0,%rdx
687 mov %rdx,$N[1]
688
689 mulq $m0 # ap[j]*bp[i]
690 add %rax,$A[1]
691 mov 8($np),%rax
692 adc \$0,%rdx
693 add 8(%rsp),$A[1] # +tp[1]
694 adc \$0,%rdx
695 mov %rdx,$A[0]
696
697 mulq $m1 # np[j]*m1
698 add %rax,$N[1]
699 mov 16($ap),%rax
700 adc \$0,%rdx
701 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
702 lea 4($j),$j # j+=4
703 adc \$0,%rdx
704 mov %rdx,$N[0]
705 jmp .Linner4x
706.align 16
707.Linner4x:
708 mulq $m0 # ap[j]*bp[i]
709 add %rax,$A[0]
710 mov -16($np,$j,8),%rax
711 adc \$0,%rdx
712 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
713 adc \$0,%rdx
714 mov %rdx,$A[1]
715
716 mulq $m1 # np[j]*m1
717 add %rax,$N[0]
718 mov -8($ap,$j,8),%rax
719 adc \$0,%rdx
720 add $A[0],$N[0]
721 adc \$0,%rdx
722 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
723 mov %rdx,$N[1]
724
725 mulq $m0 # ap[j]*bp[i]
726 add %rax,$A[1]
727 mov -8($np,$j,8),%rax
728 adc \$0,%rdx
729 add -8(%rsp,$j,8),$A[1]
730 adc \$0,%rdx
731 mov %rdx,$A[0]
732
733 mulq $m1 # np[j]*m1
734 add %rax,$N[1]
735 mov ($ap,$j,8),%rax
736 adc \$0,%rdx
737 add $A[1],$N[1]
738 adc \$0,%rdx
739 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
740 mov %rdx,$N[0]
741
742 mulq $m0 # ap[j]*bp[i]
743 add %rax,$A[0]
744 mov ($np,$j,8),%rax
745 adc \$0,%rdx
746 add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
747 adc \$0,%rdx
748 mov %rdx,$A[1]
749
750 mulq $m1 # np[j]*m1
751 add %rax,$N[0]
752 mov 8($ap,$j,8),%rax
753 adc \$0,%rdx
754 add $A[0],$N[0]
755 adc \$0,%rdx
756 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
757 mov %rdx,$N[1]
758
759 mulq $m0 # ap[j]*bp[i]
760 add %rax,$A[1]
761 mov 8($np,$j,8),%rax
762 adc \$0,%rdx
763 add 8(%rsp,$j,8),$A[1]
764 adc \$0,%rdx
765 lea 4($j),$j # j+=4
766 mov %rdx,$A[0]
767
768 mulq $m1 # np[j]*m1
769 add %rax,$N[1]
770 mov -16($ap,$j,8),%rax
771 adc \$0,%rdx
772 add $A[1],$N[1]
773 adc \$0,%rdx
774 mov $N[0],-40(%rsp,$j,8) # tp[j-1]
775 mov %rdx,$N[0]
776 cmp $num,$j
777 jl .Linner4x
778
779 mulq $m0 # ap[j]*bp[i]
780 add %rax,$A[0]
781 mov -16($np,$j,8),%rax
782 adc \$0,%rdx
783 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
784 adc \$0,%rdx
785 mov %rdx,$A[1]
786
787 mulq $m1 # np[j]*m1
788 add %rax,$N[0]
789 mov -8($ap,$j,8),%rax
790 adc \$0,%rdx
791 add $A[0],$N[0]
792 adc \$0,%rdx
793 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
794 mov %rdx,$N[1]
795
796 mulq $m0 # ap[j]*bp[i]
797 add %rax,$A[1]
798 mov -8($np,$j,8),%rax
799 adc \$0,%rdx
800 add -8(%rsp,$j,8),$A[1]
801 adc \$0,%rdx
802 lea 1($i),$i # i++
803 mov %rdx,$A[0]
804
805 mulq $m1 # np[j]*m1
806 add %rax,$N[1]
807 mov ($ap),%rax # ap[0]
808 adc \$0,%rdx
809 add $A[1],$N[1]
810 adc \$0,%rdx
811 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
812 mov %rdx,$N[0]
813
814 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
815
816 xor $N[1],$N[1]
817 add $A[0],$N[0]
818 adc \$0,$N[1]
819 add (%rsp,$num,8),$N[0] # pull upmost overflow bit
820 adc \$0,$N[1]
821 mov $N[0],-8(%rsp,$j,8)
822 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
823
824 cmp $num,$i
825 jl .Louter4x
826___
# Final reduction for the 4x path. rp is reloaded from tp[num+2], .Lsub4x
# writes tp-np to rp four words per iteration, and .Lcopy4x then copies
# either tp or rp into rp (selected by the final borrow) while zeroing tp[].
# @ri names the four registers that carry the words being subtracted.
827{
828my @ri=("%rax","%rdx",$m0,$m1);
829$code.=<<___;
830 mov 16(%rsp,$num,8),$rp # restore $rp
831 mov 0(%rsp),@ri[0] # tp[0]
832 pxor %xmm0,%xmm0
833 mov 8(%rsp),@ri[1] # tp[1]
834 shr \$2,$num # num/=4
835 lea (%rsp),$ap # borrow ap for tp
836 xor $i,$i # i=0 and clear CF!
837
838 sub 0($np),@ri[0]
839 mov 16($ap),@ri[2] # tp[2]
840 mov 24($ap),@ri[3] # tp[3]
841 sbb 8($np),@ri[1]
842 lea -1($num),$j # j=num/4-1
843 jmp .Lsub4x
844.align 16
845.Lsub4x:
846 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
847 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
848 sbb 16($np,$i,8),@ri[2]
849 mov 32($ap,$i,8),@ri[0] # tp[i+1]
850 mov 40($ap,$i,8),@ri[1]
851 sbb 24($np,$i,8),@ri[3]
852 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
853 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
854 sbb 32($np,$i,8),@ri[0]
855 mov 48($ap,$i,8),@ri[2]
856 mov 56($ap,$i,8),@ri[3]
857 sbb 40($np,$i,8),@ri[1]
858 lea 4($i),$i # i+=4
859 dec $j # doesn't affect CF!
860 jnz .Lsub4x
861
862 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
863 mov 32($ap,$i,8),@ri[0] # load overflow bit
864 sbb 16($np,$i,8),@ri[2]
865 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
866 sbb 24($np,$i,8),@ri[3]
867 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
868
869 sbb \$0,@ri[0] # handle upmost overflow bit
870 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
871 xor $i,$i # i=0
872 and @ri[0],$ap
873 not @ri[0]
874 mov $rp,$np
875 and @ri[0],$np
876 lea -1($num),$j
877 or $np,$ap # ap=borrow?tp:rp
878
879 movdqu ($ap),%xmm1
880 movdqa %xmm0,(%rsp)
881 movdqu %xmm1,($rp)
882 jmp .Lcopy4x
883.align 16
884.Lcopy4x: # copy or in-place refresh
885 movdqu 16($ap,$i),%xmm2
886 movdqu 32($ap,$i),%xmm1
887 movdqa %xmm0,16(%rsp,$i)
888 movdqu %xmm2,16($rp,$i)
889 movdqa %xmm0,32(%rsp,$i)
890 movdqu %xmm1,32($rp,$i)
891 lea 32($i),$i # byte offset advances by 32 per iteration
892 dec $j
893 jnz .Lcopy4x
894
895 shl \$2,$num # undo the earlier num/=4
896 movdqu 16($ap,$i),%xmm2
897 movdqa %xmm0,16(%rsp,$i)
898 movdqu %xmm2,16($rp,$i)
899___
900}
# Epilogue of the 4x path: reload the stack pointer saved in tp[num+1],
# restore the six callee-saved registers from it and return 1.
901$code.=<<___;
902 mov 8(%rsp,$num,8),%rsi # restore %rsp
903 mov \$1,%rax
904
905 mov (%rsi),%r15
906 mov 8(%rsi),%r14
907 mov 16(%rsi),%r13
908 mov 24(%rsi),%r12
909 mov 32(%rsi),%rbp
910 mov 40(%rsi),%rbx
911 lea 48(%rsi),%rsp
912.Lmul4x_epilogue:
913 ret
914.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
915___
916}}}
917
# bn_scatter5 and bn_gather5. Both are declared @abi-omnipotent, so the
# argument registers are chosen here per calling convention rather than by
# the translator. bn_scatter5 stores num words from inp into column idx of
# the table, one word every 32*8 bytes. bn_gather5 reads num words back by
# building 16 compare masks on the stack and ANDing/ORing every table entry.
# NOTE(review): the loops below use the file-global $i (no "my") as a counter,
# overwriting the register name assigned to it at the top of the file; no
# later code in this file appears to read $i as a register.
918{
919my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9d") : # Win64 order
920 ("%rdi","%rsi","%rdx","%ecx"); # Unix order
921my $out=$inp;
922my $STRIDE=2**5*8;
923my $N=$STRIDE/4;
924
925$code.=<<___;
926.globl bn_scatter5
927.type bn_scatter5,\@abi-omnipotent
928.align 16
929bn_scatter5:
930 _CET_ENDBR
931 cmp \$0, $num
932 jz .Lscatter_epilogue
933 lea ($tbl,$idx,8),$tbl
934.Lscatter:
935 mov ($inp),%rax
936 lea 8($inp),$inp
937 mov %rax,($tbl)
938 lea 32*8($tbl),$tbl
939 sub \$1,$num
940 jnz .Lscatter
941.Lscatter_epilogue:
942 ret
943.size bn_scatter5,.-bn_scatter5
944
945.globl bn_gather5
946.type bn_gather5,\@abi-omnipotent
947.align 16
948bn_gather5:
949 _CET_ENDBR
950.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases
951 # I can't trust assembler to use specific encoding:-(
952 .byte 0x4c,0x8d,0x14,0x24 # lea (%rsp),%r10
953 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 # sub $0x108,%rsp
954 lea .Linc(%rip),%rax
955 and \$-16,%rsp # shouldn't be formally required
956
957 movd $idx,%xmm5
958 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
959 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
960 lea 128($tbl),%r11 # size optimization
961 lea 128(%rsp),%rax # size optimization
962
963 pshufd \$0,%xmm5,%xmm5 # broadcast $idx
964 movdqa %xmm1,%xmm4
965 movdqa %xmm1,%xmm2
966___
967########################################################################
968# calculate mask by comparing 0..31 to $idx and save result to stack
969#
# Four passes (i=0,4,8,12). The fourth mask of each pass is stored at the
# start of the next pass, and the very last one after the loop.
970for($i=0;$i<$STRIDE/16;$i+=4) {
971$code.=<<___;
972 paddd %xmm0,%xmm1
973 pcmpeqd %xmm5,%xmm0 # compare to 1,0
974___
975$code.=<<___ if ($i);
976 movdqa %xmm3,`16*($i-1)-128`(%rax)
977___
978$code.=<<___;
979 movdqa %xmm4,%xmm3
980
981 paddd %xmm1,%xmm2
982 pcmpeqd %xmm5,%xmm1 # compare to 3,2
983 movdqa %xmm0,`16*($i+0)-128`(%rax)
984 movdqa %xmm4,%xmm0
985
986 paddd %xmm2,%xmm3
987 pcmpeqd %xmm5,%xmm2 # compare to 5,4
988 movdqa %xmm1,`16*($i+1)-128`(%rax)
989 movdqa %xmm4,%xmm1
990
991 paddd %xmm3,%xmm0
992 pcmpeqd %xmm5,%xmm3 # compare to 7,6
993 movdqa %xmm2,`16*($i+2)-128`(%rax)
994 movdqa %xmm4,%xmm2
995___
996}
# $i is 16 here, so the store below writes the last (16th) mask.
997$code.=<<___;
998 movdqa %xmm3,`16*($i-1)-128`(%rax)
999 jmp .Lgather
1000
1001.align 32
1002.Lgather:
1003 pxor %xmm4,%xmm4
1004 pxor %xmm5,%xmm5
1005___
# Per output word: AND all 16 table entries with their masks and OR the
# results into the %xmm4/%xmm5 accumulators.
1006for($i=0;$i<$STRIDE/16;$i+=4) {
1007$code.=<<___;
1008 movdqa `16*($i+0)-128`(%r11),%xmm0
1009 movdqa `16*($i+1)-128`(%r11),%xmm1
1010 movdqa `16*($i+2)-128`(%r11),%xmm2
1011 pand `16*($i+0)-128`(%rax),%xmm0
1012 movdqa `16*($i+3)-128`(%r11),%xmm3
1013 pand `16*($i+1)-128`(%rax),%xmm1
1014 por %xmm0,%xmm4
1015 pand `16*($i+2)-128`(%rax),%xmm2
1016 por %xmm1,%xmm5
1017 pand `16*($i+3)-128`(%rax),%xmm3
1018 por %xmm2,%xmm4
1019 por %xmm3,%xmm5
1020___
1021}
1022$code.=<<___;
1023 por %xmm5,%xmm4
1024 lea $STRIDE(%r11),%r11
1025 pshufd \$0x4e,%xmm4,%xmm0
1026 por %xmm4,%xmm0
1027 movq %xmm0,($out) # store the gathered 64-bit word to the output
1028 lea 8($out),$out
1029 sub \$1,$num
1030 jnz .Lgather
1031
1032 lea (%r10),%rsp
1033 ret
1034.LSEH_end_bn_gather5:
1035.size bn_gather5,.-bn_gather5
1036___
1037}
# .Linc: two 16-byte constants loaded by the mask-building code above. The
# first row seeds the index counters (0,0,1,1) and the second row is the
# per-step increment (2,2,2,2). Output switches back to .text afterwards.
1038$code.=<<___;
1039.section .rodata
1040.align 64
1041.Linc:
1042 .long 0,0, 1,1
1043 .long 2,2, 2,2
1044.text
1045___
1046
# Win64-only structured exception handling support, emitted only when
# $win64 is set. mul_handler compares context->Rip with the three labels in
# HandlerData[] (end of prologue, end of alloca, epilogue) to decide how to
# recover the caller's stack pointer and non-volatile registers, copies the
# context, calls RtlVirtualUnwind and returns ExceptionContinueSearch.
# The .pdata/.xdata tables that follow register it for both multiply routines
# and give bn_gather5 its own unwind codes.
1047# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1048# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1049if ($win64) {
1050$rec="%rcx";
1051$frame="%rdx";
1052$context="%r8";
1053$disp="%r9";
1054
1055$code.=<<___;
1056.extern __imp_RtlVirtualUnwind
1057.type mul_handler,\@abi-omnipotent
1058.align 16
1059mul_handler:
1060 _CET_ENDBR
1061 push %rsi
1062 push %rdi
1063 push %rbx
1064 push %rbp
1065 push %r12
1066 push %r13
1067 push %r14
1068 push %r15
1069 pushfq
1070 sub \$64,%rsp
1071
1072 mov 120($context),%rax # pull context->Rax
1073 mov 248($context),%rbx # pull context->Rip
1074
1075 mov 8($disp),%rsi # disp->ImageBase
1076 mov 56($disp),%r11 # disp->HandlerData
1077
1078 mov 0(%r11),%r10d # HandlerData[0]
1079 lea (%rsi,%r10),%r10 # end of prologue label
1080 cmp %r10,%rbx # context->Rip<end of prologue label
1081 jb .Lcommon_seh_tail
1082
1083 lea 48(%rax),%rax
1084
1085 mov 4(%r11),%r10d # HandlerData[1]
1086 lea (%rsi,%r10),%r10 # end of alloca label
1087 cmp %r10,%rbx # context->Rip<end of alloca label
1088 jb .Lcommon_seh_tail
1089
1090 mov 152($context),%rax # pull context->Rsp
1091
1092 mov 8(%r11),%r10d # HandlerData[2]
1093 lea (%rsi,%r10),%r10 # epilogue label
1094 cmp %r10,%rbx # context->Rip>=epilogue label
1095 jae .Lcommon_seh_tail
1096
1097 mov 192($context),%r10 # pull $num
1098 mov 8(%rax,%r10,8),%rax # pull saved stack pointer
1099
1100 lea 48(%rax),%rax
1101
1102 mov -8(%rax),%rbx
1103 mov -16(%rax),%rbp
1104 mov -24(%rax),%r12
1105 mov -32(%rax),%r13
1106 mov -40(%rax),%r14
1107 mov -48(%rax),%r15
1108 mov %rbx,144($context) # restore context->Rbx
1109 mov %rbp,160($context) # restore context->Rbp
1110 mov %r12,216($context) # restore context->R12
1111 mov %r13,224($context) # restore context->R13
1112 mov %r14,232($context) # restore context->R14
1113 mov %r15,240($context) # restore context->R15
1114
1115.Lcommon_seh_tail:
1116 mov 8(%rax),%rdi
1117 mov 16(%rax),%rsi
1118 mov %rax,152($context) # restore context->Rsp
1119 mov %rsi,168($context) # restore context->Rsi
1120 mov %rdi,176($context) # restore context->Rdi
1121
1122 mov 40($disp),%rdi # disp->ContextRecord
1123 mov $context,%rsi # context
1124 mov \$154,%ecx # sizeof(CONTEXT), as a quadword count for rep movsq
1125 .long 0xa548f3fc # cld; rep movsq
1126
1127 mov $disp,%rsi
1128 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1129 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1130 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1131 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1132 mov 40(%rsi),%r10 # disp->ContextRecord
1133 lea 56(%rsi),%r11 # &disp->HandlerData
1134 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1135 mov %r10,32(%rsp) # arg5
1136 mov %r11,40(%rsp) # arg6
1137 mov %r12,48(%rsp) # arg7
1138 mov %rcx,56(%rsp) # arg8, (NULL)
1139 call *__imp_RtlVirtualUnwind(%rip)
1140
1141 mov \$1,%eax # ExceptionContinueSearch
1142 add \$64,%rsp
1143 popfq
1144 pop %r15
1145 pop %r14
1146 pop %r13
1147 pop %r12
1148 pop %rbp
1149 pop %rbx
1150 pop %rdi
1151 pop %rsi
1152 ret
1153.size mul_handler,.-mul_handler
1154
1155.section .pdata
1156.align 4
1157 .rva .LSEH_begin_bn_mul_mont_gather5
1158 .rva .LSEH_end_bn_mul_mont_gather5
1159 .rva .LSEH_info_bn_mul_mont_gather5
1160
1161 .rva .LSEH_begin_bn_mul4x_mont_gather5
1162 .rva .LSEH_end_bn_mul4x_mont_gather5
1163 .rva .LSEH_info_bn_mul4x_mont_gather5
1164
1165 .rva .LSEH_begin_bn_gather5
1166 .rva .LSEH_end_bn_gather5
1167 .rva .LSEH_info_bn_gather5
1168
1169.section .xdata
1170.align 8
1171.LSEH_info_bn_mul_mont_gather5:
1172 .byte 9,0,0,0
1173 .rva mul_handler
1174 .rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[]
1175.align 8
1176.LSEH_info_bn_mul4x_mont_gather5:
1177 .byte 9,0,0,0
1178 .rva mul_handler
1179 .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
1180.align 8
1181.LSEH_info_bn_gather5:
1182 .byte 0x01,0x0b,0x03,0x0a
1183 .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108
1184 .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp), set_frame r10
1185.align 8
1186___
1187}
1188
# Replace every back-quoted expression in the generated text with the
# result of evaluating it as Perl (used for the computed offsets above).
1189$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1190
1191print $code;
# STDOUT is the pipe to the translator. Closing a pipe waits for the child
# and reports failure if it exited non-zero or a write failed, so check it
# instead of silently leaving a truncated or missing output file behind.
1192close STDOUT or die "error closing STDOUT: $!";