path: root/src/lib/libcrypto/bn/asm
author		djm <>	2008-09-06 12:15:56 +0000
committer	djm <>	2008-09-06 12:15:56 +0000
commit		5a3c0a05c7f2c5d3c584b7c8d6aec836dd724c80 (patch)
tree		aba68249883aa9d2361d92eef69a81d0c4961732 /src/lib/libcrypto/bn/asm
parent		f6198d4d0ab97685dc56be2d48715ed39fcc74b9 (diff)
download	openbsd-5a3c0a05c7f2c5d3c584b7c8d6aec836dd724c80.tar.gz
		openbsd-5a3c0a05c7f2c5d3c584b7c8d6aec836dd724c80.tar.bz2
		openbsd-5a3c0a05c7f2c5d3c584b7c8d6aec836dd724c80.zip
import of OpenSSL 0.9.8h
Diffstat (limited to 'src/lib/libcrypto/bn/asm')
-rw-r--r--	src/lib/libcrypto/bn/asm/bn-586.pl	86
-rw-r--r--	src/lib/libcrypto/bn/asm/ia64.S		35
-rw-r--r--	src/lib/libcrypto/bn/asm/x86_64-gcc.c	4
-rwxr-xr-x	src/lib/libcrypto/bn/asm/x86_64-mont.pl	214
4 files changed, 317 insertions, 22 deletions
diff --git a/src/lib/libcrypto/bn/asm/bn-586.pl b/src/lib/libcrypto/bn/asm/bn-586.pl
index c4de4a2bee..26c2685a72 100644
--- a/src/lib/libcrypto/bn/asm/bn-586.pl
+++ b/src/lib/libcrypto/bn/asm/bn-586.pl
@@ -5,13 +5,18 @@ require "x86asm.pl";
 
 &asm_init($ARGV[0],$0);
 
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
 &bn_mul_add_words("bn_mul_add_words");
 &bn_mul_words("bn_mul_words");
 &bn_sqr_words("bn_sqr_words");
 &bn_div_words("bn_div_words");
 &bn_add_words("bn_add_words");
 &bn_sub_words("bn_sub_words");
-#&bn_sub_part_words("bn_sub_part_words");
+&bn_sub_part_words("bn_sub_part_words");
 
 &asm_finish();
 
@@ -19,7 +24,7 @@ sub bn_mul_add_words
 	{
 	local($name)=@_;
 
-	&function_begin($name,"");
+	&function_begin($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
 
 	&comment("");
 	$Low="eax";
@@ -42,6 +47,83 @@ sub bn_mul_add_words
 
 	&jz(&label("maw_finish"));
 
+	if ($sse2) {
+		&picmeup("eax","OPENSSL_ia32cap_P");
+		&bt(&DWP(0,"eax"),26);
+		&jnc(&label("maw_loop"));
+
+		&movd("mm0",$w);		# mm0 = w
+		&pxor("mm1","mm1");		# mm1 = carry_in
+
+	&set_label("maw_sse2_loop",0);
+		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
+		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
+		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
+		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
+		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
+		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
+		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
+		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
+		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
+		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
+		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
+		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
+		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
+		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
+		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
+		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
+		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
+		&movd(&DWP(0,$r,"",0),"mm1");
+		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
+		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
+		&psrlq("mm1",32);		# mm1 = carry0
+		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
+		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
+		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
+		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
+		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
+		&movd(&DWP(4,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry1
+		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
+		&add($a,32);
+		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
+		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
+		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
+		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
+		&movd(&DWP(8,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry2
+		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
+		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
+		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
+		&movd(&DWP(12,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry3
+		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
+		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
+		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
+		&movd(&DWP(16,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry4
+		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
+		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
+		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
+		&movd(&DWP(20,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry5
+		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
+		&movd(&DWP(24,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry6
+		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
+		&movd(&DWP(28,$r,"",0),"mm1");
+		&add($r,32);
+		&psrlq("mm1",32);		# mm1 = carry_out
+
+		&sub("ecx",8);
+		&jnz(&label("maw_sse2_loop"));
+
+		&movd($c,"mm1");		# c = carry_out
+		&emms();
+
+		&jmp(&label("maw_finish"));
+	}
+
 	&set_label("maw_loop",0);
 
 	&mov(&swtmp(0),"ecx");	#
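
The comments in the new SSE2 block above describe a word-level multiply-accumulate: mm0 holds w, mm1 carries between words, and each pass folds the eight products w*a[0..7] into r[0..7]. The C sketch below is illustrative only (mul_add_words_ref is not part of this tree, and the real bn_mul_add_words works on BN_ULONGs); it shows the arithmetic that both the scalar maw_loop and the eight-word SSE2 loop implement.

#include <stdint.h>

/* Reference semantics, a sketch: each 32-bit word of r accumulates
 * a[i]*w plus the carry from the previous word; the final carry is
 * returned.  The SSE2 path keeps the running carry in mm1 and does
 * eight words per iteration. */
static uint32_t mul_add_words_ref(uint32_t *r, const uint32_t *a,
                                  int num, uint32_t w)
{
	uint64_t carry = 0;

	for (int i = 0; i < num; i++) {
		uint64_t t = (uint64_t)a[i] * w + r[i] + carry;
		r[i]  = (uint32_t)t;	/* low 32 bits stay in place      */
		carry = t >> 32;	/* high 32 bits feed the next word */
	}
	return (uint32_t)carry;
}
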
diff --git a/src/lib/libcrypto/bn/asm/ia64.S b/src/lib/libcrypto/bn/asm/ia64.S
index 7b82b820e6..951abc53ea 100644
--- a/src/lib/libcrypto/bn/asm/ia64.S
+++ b/src/lib/libcrypto/bn/asm/ia64.S
@@ -171,21 +171,21 @@
 .skip	32	// makes the loop body aligned at 64-byte boundary
 bn_add_words:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 { .mii;	alloc		r2=ar.pfs,4,12,0,16
 	cmp4.le		p6,p0=r35,r0	};;
 { .mfb;	mov		r8=r0			// return value
 (p6)	br.ret.spnt.many	b0	};;
 
-	.save	ar.lc,r3
 { .mib;	sub		r10=r35,r0,1
+	.save	ar.lc,r3
 	mov		r3=ar.lc
 	brp.loop.imp	.L_bn_add_words_ctop,.L_bn_add_words_cend-16
 	}
-	.body
 { .mib;	ADDP		r14=0,r32		// rp
+	.save	pr,r9
 	mov		r9=pr	};;
+	.body
 { .mii;	ADDP		r15=0,r33		// ap
 	mov		ar.lc=r10
 	mov		ar.ec=6	}
@@ -224,21 +224,21 @@ bn_add_words:
 .skip	32	// makes the loop body aligned at 64-byte boundary
 bn_sub_words:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 { .mii;	alloc		r2=ar.pfs,4,12,0,16
 	cmp4.le		p6,p0=r35,r0	};;
 { .mfb;	mov		r8=r0			// return value
 (p6)	br.ret.spnt.many	b0	};;
 
-	.save	ar.lc,r3
 { .mib;	sub		r10=r35,r0,1
+	.save	ar.lc,r3
 	mov		r3=ar.lc
 	brp.loop.imp	.L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
 	}
-	.body
 { .mib;	ADDP		r14=0,r32		// rp
+	.save	pr,r9
 	mov		r9=pr	};;
+	.body
 { .mii;	ADDP		r15=0,r33		// ap
 	mov		ar.lc=r10
 	mov		ar.ec=6	}
@@ -283,7 +283,6 @@ bn_sub_words:
 .skip	32	// makes the loop body aligned at 64-byte boundary
 bn_mul_words:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 #ifdef XMA_TEMPTATION
 { .mfi;	alloc		r2=ar.pfs,4,0,0,0	};;
@@ -294,9 +293,10 @@ bn_mul_words:
 	cmp4.le		p6,p0=r34,r0
 (p6)	br.ret.spnt.many	b0	};;
 
-	.save	ar.lc,r3
 { .mii;	sub		r10=r34,r0,1
+	.save	ar.lc,r3
 	mov		r3=ar.lc
+	.save	pr,r9
 	mov		r9=pr	};;
 
 	.body
@@ -397,22 +397,21 @@ bn_mul_words:
 .skip	48	// makes the loop body aligned at 64-byte boundary
 bn_mul_add_words:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
-	.save	ar.lc,r3
-	.save	pr,r9
 { .mmi;	alloc		r2=ar.pfs,4,4,0,8
 	cmp4.le		p6,p0=r34,r0
+	.save	ar.lc,r3
 	mov		r3=ar.lc	};;
 { .mib;	mov		r8=r0		// return value
 	sub		r10=r34,r0,1
 (p6)	br.ret.spnt.many	b0	};;
 
-	.body
 { .mib;	setf.sig	f8=r35		// w
+	.save	pr,r9
 	mov		r9=pr
 	brp.loop.imp	.L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
 	}
+	.body
 { .mmi;	ADDP		r14=0,r32	// rp
 	ADDP		r15=0,r33	// ap
 	mov		ar.lc=r10	}
@@ -466,7 +465,6 @@ bn_mul_add_words:
 .skip	32	// makes the loop body aligned at 64-byte boundary
 bn_sqr_words:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 { .mii;	alloc		r2=ar.pfs,3,0,0,0
 	sxt4		r34=r34		};;
@@ -476,9 +474,10 @@ bn_sqr_words:
 	nop.f		0x0
 (p6)	br.ret.spnt.many	b0	};;
 
-	.save	ar.lc,r3
 { .mii;	sub		r10=r34,r0,1
+	.save	ar.lc,r3
 	mov		r3=ar.lc
+	.save	pr,r9
 	mov		r9=pr	};;
 
 	.body
@@ -545,7 +544,6 @@ bn_sqr_words:
 .align	64
 bn_sqr_comba8:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 #if defined(_HPUX_SOURCE) && !defined(_LP64)
 { .mii;	alloc	r2=ar.pfs,2,1,0,0
@@ -617,7 +615,6 @@ bn_sqr_comba8:
 .align	64
 bn_mul_comba8:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 #if defined(_HPUX_SOURCE) && !defined(_LP64)
 { .mii;	alloc	r2=ar.pfs,3,0,0,0
@@ -1175,7 +1172,6 @@ bn_mul_comba8:
 .align	64
 bn_sqr_comba4:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 #if defined(_HPUX_SOURCE) && !defined(_LP64)
 { .mii;	alloc	r2=ar.pfs,2,1,0,0
@@ -1208,7 +1204,6 @@ bn_sqr_comba4:
 .align	64
 bn_mul_comba4:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 #if defined(_HPUX_SOURCE) && !defined(_LP64)
 { .mii;	alloc	r2=ar.pfs,3,0,0,0
@@ -1411,11 +1406,11 @@ equ=p24
 .align	64
 bn_div_words:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
-	.save	b0,r3
 { .mii;	alloc		r2=ar.pfs,3,5,0,8
+	.save	b0,r3
 	mov		r3=b0
+	.save	pr,r10
 	mov		r10=pr	};;
 { .mmb;	cmp.eq		p6,p0=r34,r0
 	mov		r8=-1
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gcc.c b/src/lib/libcrypto/bn/asm/x86_64-gcc.c
index 7378344251..f13f52dd85 100644
--- a/src/lib/libcrypto/bn/asm/x86_64-gcc.c
+++ b/src/lib/libcrypto/bn/asm/x86_64-gcc.c
@@ -1,3 +1,6 @@
+#ifdef __SUNPRO_C
+# include "../bn_asm.c"	/* kind of dirty hack for Sun Studio */
+#else
 /*
  * x86_64 BIGNUM accelerator version 0.1, December 2002.
  *
@@ -591,3 +594,4 @@ void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
 	r[6]=c1;
 	r[7]=c2;
 	}
+#endif
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont.pl b/src/lib/libcrypto/bn/asm/x86_64-mont.pl
new file mode 100755
index 0000000000..c43b69592a
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/x86_64-mont.pl
@@ -0,0 +1,214 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005.
+#
+# Montgomery multiplication routine for x86_64. While it gives modest
+# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
+# than twice, >2x, as fast. Most common rsa1024 sign is improved by
+# respectful 50%. It remains to be seen if loop unrolling and
+# dedicated squaring routine can provide further improvement...
+
+$output=shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $output";
+
+# int bn_mul_mont(
+$rp="%rdi";	# BN_ULONG *rp,
+$ap="%rsi";	# const BN_ULONG *ap,
+$bp="%rdx";	# const BN_ULONG *bp,
+$np="%rcx";	# const BN_ULONG *np,
+$n0="%r8";	# const BN_ULONG *n0,
+$num="%r9";	# int num);
+$lo0="%r10";
+$hi0="%r11";
+$bp="%r12";	# reassign $bp
+$hi1="%r13";
+$i="%r14";
+$j="%r15";
+$m0="%rbx";
+$m1="%rbp";
+
+$code=<<___;
+.text
+
+.globl	bn_mul_mont
+.type	bn_mul_mont,\@function,6
+.align	16
+bn_mul_mont:
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	${num}d,${num}d
+	lea	2($num),%rax
+	mov	%rsp,%rbp
+	neg	%rax
+	lea	(%rsp,%rax,8),%rsp	# tp=alloca(8*(num+2))
+	and	\$-1024,%rsp		# minimize TLB usage
+
+	mov	%rbp,8(%rsp,$num,8)	# tp[num+1]=%rsp
+	mov	%rdx,$bp		# $bp reassigned, remember?
+
+	mov	($n0),$n0		# pull n0[0] value
+
+	xor	$i,$i			# i=0
+	xor	$j,$j			# j=0
+
+	mov	($bp),$m0		# m0=bp[0]
+	mov	($ap),%rax
+	mulq	$m0			# ap[0]*bp[0]
+	mov	%rax,$lo0
+	mov	%rdx,$hi0
+
+	imulq	$n0,%rax		# "tp[0]"*n0
+	mov	%rax,$m1
+
+	mulq	($np)			# np[0]*m1
+	add	$lo0,%rax		# discarded
+	adc	\$0,%rdx
+	mov	%rdx,$hi1
+
+	lea	1($j),$j		# j++
+.L1st:
+	mov	($ap,$j,8),%rax
+	mulq	$m0			# ap[j]*bp[0]
+	add	$hi0,%rax
+	adc	\$0,%rdx
+	mov	%rax,$lo0
+	mov	($np,$j,8),%rax
+	mov	%rdx,$hi0
+
+	mulq	$m1			# np[j]*m1
+	add	$hi1,%rax
+	lea	1($j),$j		# j++
+	adc	\$0,%rdx
+	add	$lo0,%rax		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	%rax,-16(%rsp,$j,8)	# tp[j-1]
+	cmp	$num,$j
+	mov	%rdx,$hi1
+	jl	.L1st
+
+	xor	%rdx,%rdx
+	add	$hi0,$hi1
+	adc	\$0,%rdx
+	mov	$hi1,-8(%rsp,$num,8)
+	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
+
+	lea	1($i),$i		# i++
+.align	4
+.Louter:
+	xor	$j,$j			# j=0
+
+	mov	($bp,$i,8),$m0		# m0=bp[i]
+	mov	($ap),%rax		# ap[0]
+	mulq	$m0			# ap[0]*bp[i]
+	add	(%rsp),%rax		# ap[0]*bp[i]+tp[0]
+	adc	\$0,%rdx
+	mov	%rax,$lo0
+	mov	%rdx,$hi0
+
+	imulq	$n0,%rax		# tp[0]*n0
+	mov	%rax,$m1
+
+	mulq	($np,$j,8)		# np[0]*m1
+	add	$lo0,%rax		# discarded
+	mov	8(%rsp),$lo0		# tp[1]
+	adc	\$0,%rdx
+	mov	%rdx,$hi1
+
+	lea	1($j),$j		# j++
+.align	4
+.Linner:
+	mov	($ap,$j,8),%rax
+	mulq	$m0			# ap[j]*bp[i]
+	add	$hi0,%rax
+	adc	\$0,%rdx
+	add	%rax,$lo0		# ap[j]*bp[i]+tp[j]
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$hi0
+
+	mulq	$m1			# np[j]*m1
+	add	$hi1,%rax
+	lea	1($j),$j		# j++
+	adc	\$0,%rdx
+	add	$lo0,%rax		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	adc	\$0,%rdx
+	mov	(%rsp,$j,8),$lo0
+	cmp	$num,$j
+	mov	%rax,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+	jl	.Linner
+
+	xor	%rdx,%rdx
+	add	$hi0,$hi1
+	adc	\$0,%rdx
+	add	$lo0,$hi1		# pull upmost overflow bit
+	adc	\$0,%rdx
+	mov	$hi1,-8(%rsp,$num,8)
+	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
+
+	lea	1($i),$i		# i++
+	cmp	$num,$i
+	jl	.Louter
+
+	lea	(%rsp),$ap		# borrow ap for tp
+	lea	-1($num),$j		# j=num-1
+
+	mov	($ap),%rax		# tp[0]
+	xor	$i,$i			# i=0 and clear CF!
+	jmp	.Lsub
+.align	16
+.Lsub:	sbb	($np,$i,8),%rax
+	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
+	dec	$j			# doesn't affect CF!
+	mov	8($ap,$i,8),%rax	# tp[i+1]
+	lea	1($i),$i		# i++
+	jge	.Lsub
+
+	sbb	\$0,%rax		# handle upmost overflow bit
+	and	%rax,$ap
+	not	%rax
+	mov	$rp,$np
+	and	%rax,$np
+	lea	-1($num),$j
+	or	$np,$ap			# ap=borrow?tp:rp
+.align	16
+.Lcopy:					# copy or in-place refresh
+	mov	($ap,$j,8),%rax
+	mov	%rax,($rp,$j,8)		# rp[i]=tp[i]
+	mov	$i,(%rsp,$j,8)		# zap temporary vector
+	dec	$j
+	jge	.Lcopy
+
+	mov	8(%rsp,$num,8),%rsp	# restore %rsp
+	mov	\$1,%rax
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	ret
+.size	bn_mul_mont,.-bn_mul_mont
+.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+print $code;
+close STDOUT;
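
For reference, the bn_mul_mont interface spelled out in the comments of the new file (rp, ap, bp, np, n0, num) computes rp = ap*bp*R^-1 mod np, with R = 2^(64*num) on x86_64. The sketch below is an illustrative word-serial Montgomery multiplication in C, not the OpenSSL code: it uses 32-bit words so the double-width products fit in uint64_t, and it keeps the multiply and reduce passes separate, whereas the assembly fuses them into the single .Linner loop and ends with the same conditional subtraction seen at .Lsub/.Lcopy.

#include <stdint.h>
#include <string.h>

/* A sketch (mont_mul_ref is hypothetical, not part of the tree) with the
 * same argument order as bn_mul_mont.  n0 points at -np[0]^(-1) mod 2^32,
 * num is the length in words, and t[] plays the role of the stack vector
 * tp in the assembly. */
static int mont_mul_ref(uint32_t *rp, const uint32_t *ap, const uint32_t *bp,
                        const uint32_t *np, const uint32_t *n0, int num)
{
	uint32_t t[num + 2];		/* accumulator, two extra words */
	uint32_t n0w = n0[0];
	memset(t, 0, sizeof(t));

	for (int i = 0; i < num; i++) {
		uint64_t c = 0, v;

		/* t += ap * bp[i] */
		for (int j = 0; j < num; j++) {
			v = (uint64_t)ap[j] * bp[i] + t[j] + c;
			t[j] = (uint32_t)v;
			c = v >> 32;
		}
		v = (uint64_t)t[num] + c;
		t[num]     = (uint32_t)v;
		t[num + 1] = (uint32_t)(v >> 32);

		/* t = (t + m1*np) / 2^32; the low word cancels by choice of m1 */
		uint32_t m1 = t[0] * n0w;
		c = ((uint64_t)np[0] * m1 + t[0]) >> 32;	/* low word discarded */
		for (int j = 1; j < num; j++) {
			v = (uint64_t)np[j] * m1 + t[j] + c;
			t[j - 1] = (uint32_t)v;
			c = v >> 32;
		}
		v = (uint64_t)t[num] + c;
		t[num - 1] = (uint32_t)v;
		t[num]     = t[num + 1] + (uint32_t)(v >> 32);
	}

	/* conditional final subtraction, as in .Lsub/.Lcopy */
	uint32_t borrow = 0;
	for (int j = 0; j < num; j++) {
		uint64_t v = (uint64_t)t[j] - np[j] - borrow;
		rp[j] = (uint32_t)v;
		borrow = (uint32_t)(v >> 32) & 1;
	}
	if (t[num] < borrow)		/* t was already < np: keep t */
		memcpy(rp, t, num * sizeof(uint32_t));
	return 1;
}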