summary refs log tree commit diff
path: root/src/lib/libcrypto/bn/asm/x86_64-mont.pl
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/libcrypto/bn/asm/x86_64-mont.pl')
-rwxr-xr-x src/lib/libcrypto/bn/asm/x86_64-mont.pl 330
1 files changed, 0 insertions, 330 deletions
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont.pl b/src/lib/libcrypto/bn/asm/x86_64-mont.pl
deleted file mode 100755
index 3b7a6f243f..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-mont.pl
+++ /dev/null
@@ -1,330 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# October 2005.
11#
12# Montgomery multiplication routine for x86_64. While it gives a modest
13# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
14# than twice (>2x) as fast. The most common case, rsa1024 sign, is
15# improved by a respectable 50%. It remains to be seen if loop unrolling
16# and a dedicated squaring routine can provide further improvement...
17
18$flavour = shift; # 1st argument: perlasm flavour, or the output file name when it contains a dot
19$output = shift; # 2nd argument: output file name
20if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } # only a file name was supplied
21
22$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); # Win64 targets also get the SEH handler
23
24$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's own path
25( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
26( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
27die "can't locate x86_64-xlate.pl";
28
29open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!"; # all output goes through the translator; fail loudly if it cannot be started
30
31# int bn_mul_mont(
32$rp="%rdi"; # BN_ULONG *rp,
33$ap="%rsi"; # const BN_ULONG *ap,
34$bp="%rdx"; # const BN_ULONG *bp,
35$np="%rcx"; # const BN_ULONG *np,
36$n0="%r8"; # const BN_ULONG *n0,
37$num="%r9"; # int num);
38$lo0="%r10"; # low word of the ap[j]*bp[i] accumulation
39$hi0="%r11"; # high word (carry) of the ap[j]*bp[i] accumulation
40$bp="%r12"; # reassign $bp: mulq overwrites %rdx, so bp is copied here at .Lprologue
41$hi1="%r13"; # high word (carry) of the np[j]*m1 accumulation
42$i="%r14"; # outer loop counter, index into bp
43$j="%r15"; # inner loop counter, index into ap/np/tp
44$m0="%rbx"; # bp[i]
45$m1="%rbp"; # tp[0]*n0
46
47$code=<<___;
48.text
49
50.globl bn_mul_mont
51.type bn_mul_mont,\@function,6
52.align 16
53bn_mul_mont:
54 push %rbx # callee-saved registers; reloaded from this frame before ret
55 push %rbp
56 push %r12
57 push %r13
58 push %r14
59 push %r15
60
61 mov ${num}d,${num}d # 32-bit move zero-extends num to 64 bits
62 lea 2($num),%r10 # r10=num+2
63 mov %rsp,%r11 # remember %rsp as it is after the pushes
64 neg %r10 # r10=-(num+2)
65 lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2))
66 and \$-1024,%rsp # round tp down to a 1024-byte boundary to minimize TLB usage
67
68 mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp saved before the alloca
69.Lprologue:
70 mov %rdx,$bp # $bp reassigned, remember? (mulq overwrites %rdx)
71
72 mov ($n0),$n0 # pull n0[0] value
73
74 xor $i,$i # i=0
75 xor $j,$j # j=0
76
77 mov ($bp),$m0 # m0=bp[0]
78 mov ($ap),%rax # rax=ap[0]
79 mulq $m0 # ap[0]*bp[0]
80 mov %rax,$lo0 # lo0=low word of the product
81 mov %rdx,$hi0 # hi0=high word of the product
82
83 imulq $n0,%rax # "tp[0]"*n0
84 mov %rax,$m1 # m1=low 64 bits of tp[0]*n0
85
86 mulq ($np) # np[0]*m1
87 add $lo0,%rax # sum discarded, only its carry is used
88 adc \$0,%rdx
89 mov %rdx,$hi1 # hi1=high word plus carry
90
91 lea 1($j),$j # j++
92.L1st:
93 mov ($ap,$j,8),%rax # rax=ap[j]
94 mulq $m0 # ap[j]*bp[0]
95 add $hi0,%rax
96 adc \$0,%rdx
97 mov %rax,$lo0
98 mov ($np,$j,8),%rax # rax=np[j]
99 mov %rdx,$hi0
100
101 mulq $m1 # np[j]*m1
102 add $hi1,%rax
103 lea 1($j),$j # j++ (lea keeps the carry for the adc below)
104 adc \$0,%rdx
105 add $lo0,%rax # np[j]*m1+ap[j]*bp[0]
106 adc \$0,%rdx
107 mov %rax,-16(%rsp,$j,8) # tp[j-1]
108 cmp $num,$j
109 mov %rdx,$hi1
110 jl .L1st # loop while j<num (mov above leaves the flags intact)
111
112 xor %rdx,%rdx
113 add $hi0,$hi1
114 adc \$0,%rdx # rdx=carry out of hi0+hi1
115 mov $hi1,-8(%rsp,$num,8) # tp[num-1]
116 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
117
118 lea 1($i),$i # i++
119.align 4
120.Louter:
121 xor $j,$j # j=0
122
123 mov ($bp,$i,8),$m0 # m0=bp[i]
124 mov ($ap),%rax # ap[0]
125 mulq $m0 # ap[0]*bp[i]
126 add (%rsp),%rax # ap[0]*bp[i]+tp[0]
127 adc \$0,%rdx
128 mov %rax,$lo0
129 mov %rdx,$hi0
130
131 imulq $n0,%rax # tp[0]*n0
132 mov %rax,$m1 # m1=low 64 bits of tp[0]*n0
133
134 mulq ($np,$j,8) # np[0]*m1
135 add $lo0,%rax # sum discarded, only its carry is used
136 mov 8(%rsp),$lo0 # tp[1]
137 adc \$0,%rdx
138 mov %rdx,$hi1
139
140 lea 1($j),$j # j++
141.align 4
142.Linner:
143 mov ($ap,$j,8),%rax # rax=ap[j]
144 mulq $m0 # ap[j]*bp[i]
145 add $hi0,%rax
146 adc \$0,%rdx
147 add %rax,$lo0 # ap[j]*bp[i]+tp[j]
148 mov ($np,$j,8),%rax # rax=np[j]
149 adc \$0,%rdx
150 mov %rdx,$hi0
151
152 mulq $m1 # np[j]*m1
153 add $hi1,%rax
154 lea 1($j),$j # j++ (lea keeps the carry for the adc below)
155 adc \$0,%rdx
156 add $lo0,%rax # np[j]*m1+ap[j]*bp[i]+tp[j]
157 adc \$0,%rdx
158 mov (%rsp,$j,8),$lo0 # preload tp[j] for the next pass; on exit this is tp[num]
159 cmp $num,$j
160 mov %rax,-16(%rsp,$j,8) # tp[j-1]
161 mov %rdx,$hi1
162 jl .Linner # loop while j<num (mov above leaves the flags intact)
163
164 xor %rdx,%rdx
165 add $hi0,$hi1
166 adc \$0,%rdx
167 add $lo0,$hi1 # pull upmost overflow bit
168 adc \$0,%rdx
169 mov $hi1,-8(%rsp,$num,8) # tp[num-1]
170 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
171
172 lea 1($i),$i # i++
173 cmp $num,$i
174 jl .Louter # loop while i<num
175
176 lea (%rsp),$ap # borrow ap for tp
177 lea -1($num),$j # j=num-1
178
179 mov ($ap),%rax # tp[0]
180 xor $i,$i # i=0 and clear CF!
181 jmp .Lsub
182.align 16
183.Lsub: sbb ($np,$i,8),%rax # rax=tp[i]-np[i]-CF
184 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
185 dec $j # doesn't affect CF!
186 mov 8($ap,$i,8),%rax # tp[i+1]
187 lea 1($i),$i # i++
188 jge .Lsub # tests the flags from dec; mov and lea do not touch them
189
190 sbb \$0,%rax # handle upmost overflow bit: rax=tp[num]-CF, used as a mask below
191 and %rax,$ap # keep the tp pointer only under the mask
192 not %rax
193 mov $rp,$np # np is free now, borrow it for rp
194 and %rax,$np # keep the rp pointer only under the inverted mask
195 lea -1($num),$j # j=num-1
196 or $np,$ap # ap=borrow?tp:rp
197.align 16
198.Lcopy: # copy or in-place refresh
199 mov ($ap,$j,8),%rax
200 mov %rax,($rp,$j,8) # rp[i]=tp[i]
201 mov $i,(%rsp,$j,8) # zap temporary vector (overwritten with i, which equals num here)
202 dec $j
203 jge .Lcopy # walk j from num-1 down to 0
204
205 mov 8(%rsp,$num,8),%rsi # rsi=tp[num+1], the saved %rsp
206 mov \$1,%rax # return value 1
207 mov (%rsi),%r15 # reload the registers pushed on entry
208 mov 8(%rsi),%r14
209 mov 16(%rsi),%r13
210 mov 24(%rsi),%r12
211 mov 32(%rsi),%rbp
212 mov 40(%rsi),%rbx
213 lea 48(%rsi),%rsp # step over the six saved registers, restoring %rsp
214.Lepilogue:
215 ret
216.size bn_mul_mont,.-bn_mul_mont
217.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
218.align 16
219___
220
221# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
222# CONTEXT *context,DISPATCHER_CONTEXT *disp)
223if ($win64) { # the handler and unwind tables are appended for Win64 output only
224$rec="%rcx"; # 1st handler argument, EXCEPTION_RECORD *rec
225$frame="%rdx"; # 2nd handler argument, ULONG64 frame
226$context="%r8"; # 3rd handler argument, CONTEXT *context
227$disp="%r9"; # 4th handler argument, DISPATCHER_CONTEXT *disp
228
229$code.=<<___;
230.extern __imp_RtlVirtualUnwind
231.type se_handler,\@abi-omnipotent
232.align 16
233se_handler:
234 push %rsi # save every register the handler modifies; popped in reverse before ret
235 push %rdi
236 push %rbx
237 push %rbp
238 push %r12
239 push %r13
240 push %r14
241 push %r15
242 pushfq
243 sub \$64,%rsp # 32 bytes of shadow space plus four stack-passed arguments for the call below
244
245 mov 120($context),%rax # pull context->Rax
246 mov 248($context),%rbx # pull context->Rip
247
248 lea .Lprologue(%rip),%r10
249 cmp %r10,%rbx # context->Rip<.Lprologue
250 jb .Lin_prologue # fault before .Lprologue: continue with context->Rax
251
252 mov 152($context),%rax # pull context->Rsp
253
254 lea .Lepilogue(%rip),%r10
255 cmp %r10,%rbx # context->Rip>=.Lepilogue
256 jae .Lin_prologue # fault at or after .Lepilogue: context->Rsp is used as is
257
258 mov 192($context),%r10 # pull context->R9, which holds num inside bn_mul_mont
259 mov 8(%rax,%r10,8),%rax # pull saved stack pointer
260 lea 48(%rax),%rax # step over the six registers pushed on entry
261
262 mov -8(%rax),%rbx # fetch the saved registers in push order
263 mov -16(%rax),%rbp
264 mov -24(%rax),%r12
265 mov -32(%rax),%r13
266 mov -40(%rax),%r14
267 mov -48(%rax),%r15
268 mov %rbx,144($context) # restore context->Rbx
269 mov %rbp,160($context) # restore context->Rbp
270 mov %r12,216($context) # restore context->R12
271 mov %r13,224($context) # restore context->R13
272 mov %r14,232($context) # restore context->R14
273 mov %r15,240($context) # restore context->R15
274
275.Lin_prologue:
276 mov 8(%rax),%rdi # NOTE(review): presumably the rdi copy saved by the translator-generated Win64 entry code - confirm
277 mov 16(%rax),%rsi # NOTE(review): presumably the matching rsi copy - confirm
278 mov %rax,152($context) # restore context->Rsp
279 mov %rsi,168($context) # restore context->Rsi
280 mov %rdi,176($context) # restore context->Rdi
281
282 mov 40($disp),%rdi # disp->ContextRecord
283 mov $context,%rsi # context
284 mov \$154,%ecx # sizeof(CONTEXT) in quadwords, the count for rep movsq
285 .long 0xa548f3fc # cld; rep movsq
286
287 mov $disp,%rsi
288 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
289 mov 8(%rsi),%rdx # arg2, disp->ImageBase
290 mov 0(%rsi),%r8 # arg3, disp->ControlPc
291 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
292 mov 40(%rsi),%r10 # disp->ContextRecord
293 lea 56(%rsi),%r11 # &disp->HandlerData
294 lea 24(%rsi),%r12 # &disp->EstablisherFrame
295 mov %r10,32(%rsp) # arg5
296 mov %r11,40(%rsp) # arg6
297 mov %r12,48(%rsp) # arg7
298 mov %rcx,56(%rsp) # arg8, (NULL)
299 call *__imp_RtlVirtualUnwind(%rip)
300
301 mov \$1,%eax # ExceptionContinueSearch
302 add \$64,%rsp # release the argument area
303 popfq
304 pop %r15
305 pop %r14
306 pop %r13
307 pop %r12
308 pop %rbp
309 pop %rbx
310 pop %rdi
311 pop %rsi
312 ret
313.size se_handler,.-se_handler
314
315.section .pdata
316.align 4
317 .rva .LSEH_begin_bn_mul_mont
318 .rva .LSEH_end_bn_mul_mont
319 .rva .LSEH_info_bn_mul_mont
320
321.section .xdata
322.align 8
323.LSEH_info_bn_mul_mont:
324 .byte 9,0,0,0 # NOTE(review): presumably UNWIND_INFO version 1 with the exception-handler flag - confirm
325 .rva se_handler
326___
327}
328
329print $code; # send the generated perlasm through the translator pipe opened on STDOUT
330close STDOUT or die "error closing STDOUT: $!"; # closing a pipe waits for the translator; fail if it failed