summary | refs | log | tree | commit | diff
path: root/src/lib/libcrypto/bn/asm
diff options
context:
space:
mode:
author djm <> 2008-09-06 12:17:54 +0000
committer djm <> 2008-09-06 12:17:54 +0000
commit 38ce604e3cc97706b876b0525ddff0121115456d (patch)
tree 7ccc28afe1789ea3dbedf72365f955d5b8e105b5 /src/lib/libcrypto/bn/asm
parent 12867252827c8efaa8ddd1fa3b3d6e321e2bcdef (diff)
download openbsd-38ce604e3cc97706b876b0525ddff0121115456d.tar.gz
download openbsd-38ce604e3cc97706b876b0525ddff0121115456d.tar.bz2
download openbsd-38ce604e3cc97706b876b0525ddff0121115456d.zip
resolve conflicts
Diffstat (limited to 'src/lib/libcrypto/bn/asm')
-rw-r--r-- src/lib/libcrypto/bn/asm/bn-586.pl 86
-rw-r--r-- src/lib/libcrypto/bn/asm/ia64.S 35
2 files changed, 99 insertions, 22 deletions
diff --git a/src/lib/libcrypto/bn/asm/bn-586.pl b/src/lib/libcrypto/bn/asm/bn-586.pl
index c4de4a2bee..26c2685a72 100644
--- a/src/lib/libcrypto/bn/asm/bn-586.pl
+++ b/src/lib/libcrypto/bn/asm/bn-586.pl
@@ -5,13 +5,18 @@ require "x86asm.pl";
5 5
6&asm_init($ARGV[0],$0); 6&asm_init($ARGV[0],$0);
7 7
8$sse2=0;
9for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
10
11&external_label("OPENSSL_ia32cap_P") if ($sse2);
12
8&bn_mul_add_words("bn_mul_add_words"); 13&bn_mul_add_words("bn_mul_add_words");
9&bn_mul_words("bn_mul_words"); 14&bn_mul_words("bn_mul_words");
10&bn_sqr_words("bn_sqr_words"); 15&bn_sqr_words("bn_sqr_words");
11&bn_div_words("bn_div_words"); 16&bn_div_words("bn_div_words");
12&bn_add_words("bn_add_words"); 17&bn_add_words("bn_add_words");
13&bn_sub_words("bn_sub_words"); 18&bn_sub_words("bn_sub_words");
14#&bn_sub_part_words("bn_sub_part_words"); 19&bn_sub_part_words("bn_sub_part_words");
15 20
16&asm_finish(); 21&asm_finish();
17 22
@@ -19,7 +24,7 @@ sub bn_mul_add_words
19 { 24 {
20 local($name)=@_; 25 local($name)=@_;
21 26
22 &function_begin($name,""); 27 &function_begin($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
23 28
24 &comment(""); 29 &comment("");
25 $Low="eax"; 30 $Low="eax";
@@ -42,6 +47,83 @@ sub bn_mul_add_words
42 47
43 &jz(&label("maw_finish")); 48 &jz(&label("maw_finish"));
44 49
50 if ($sse2) {
51 &picmeup("eax","OPENSSL_ia32cap_P");
52 &bt(&DWP(0,"eax"),26);
53 &jnc(&label("maw_loop"));
54
55 &movd("mm0",$w); # mm0 = w
56 &pxor("mm1","mm1"); # mm1 = carry_in
57
58 &set_label("maw_sse2_loop",0);
59 &movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
60 &paddq("mm1","mm3"); # mm1 = carry_in + r[0]
61 &movd("mm2",&DWP(0,$a,"",0)); # mm2 = a[0]
62 &pmuludq("mm2","mm0"); # mm2 = w*a[0]
63 &movd("mm4",&DWP(4,$a,"",0)); # mm4 = a[1]
64 &pmuludq("mm4","mm0"); # mm4 = w*a[1]
65 &movd("mm6",&DWP(8,$a,"",0)); # mm6 = a[2]
66 &pmuludq("mm6","mm0"); # mm6 = w*a[2]
67 &movd("mm7",&DWP(12,$a,"",0)); # mm7 = a[3]
68 &pmuludq("mm7","mm0"); # mm7 = w*a[3]
69 &paddq("mm1","mm2"); # mm1 = carry_in + r[0] + w*a[0]
70 &movd("mm3",&DWP(4,$r,"",0)); # mm3 = r[1]
71 &paddq("mm3","mm4"); # mm3 = r[1] + w*a[1]
72 &movd("mm5",&DWP(8,$r,"",0)); # mm5 = r[2]
73 &paddq("mm5","mm6"); # mm5 = r[2] + w*a[2]
74 &movd("mm4",&DWP(12,$r,"",0)); # mm4 = r[3]
75 &paddq("mm7","mm4"); # mm7 = r[3] + w*a[3]
76 &movd(&DWP(0,$r,"",0),"mm1");
77 &movd("mm2",&DWP(16,$a,"",0)); # mm2 = a[4]
78 &pmuludq("mm2","mm0"); # mm2 = w*a[4]
79 &psrlq("mm1",32); # mm1 = carry0
80 &movd("mm4",&DWP(20,$a,"",0)); # mm4 = a[5]
81 &pmuludq("mm4","mm0"); # mm4 = w*a[5]
82 &paddq("mm1","mm3"); # mm1 = carry0 + r[1] + w*a[1]
83 &movd("mm6",&DWP(24,$a,"",0)); # mm6 = a[6]
84 &pmuludq("mm6","mm0"); # mm6 = w*a[6]
85 &movd(&DWP(4,$r,"",0),"mm1");
86 &psrlq("mm1",32); # mm1 = carry1
87 &movd("mm3",&DWP(28,$a,"",0)); # mm3 = a[7]
88 &add($a,32);
89 &pmuludq("mm3","mm0"); # mm3 = w*a[7]
90 &paddq("mm1","mm5"); # mm1 = carry1 + r[2] + w*a[2]
91 &movd("mm5",&DWP(16,$r,"",0)); # mm5 = r[4]
92 &paddq("mm2","mm5"); # mm2 = r[4] + w*a[4]
93 &movd(&DWP(8,$r,"",0),"mm1");
94 &psrlq("mm1",32); # mm1 = carry2
95 &paddq("mm1","mm7"); # mm1 = carry2 + r[3] + w*a[3]
96 &movd("mm5",&DWP(20,$r,"",0)); # mm5 = r[5]
97 &paddq("mm4","mm5"); # mm4 = r[5] + w*a[5]
98 &movd(&DWP(12,$r,"",0),"mm1");
99 &psrlq("mm1",32); # mm1 = carry3
100 &paddq("mm1","mm2"); # mm1 = carry3 + r[4] + w*a[4]
101 &movd("mm5",&DWP(24,$r,"",0)); # mm5 = r[6]
102 &paddq("mm6","mm5"); # mm6 = r[6] + w*a[6]
103 &movd(&DWP(16,$r,"",0),"mm1");
104 &psrlq("mm1",32); # mm1 = carry4
105 &paddq("mm1","mm4"); # mm1 = carry4 + r[5] + w*a[5]
106 &movd("mm5",&DWP(28,$r,"",0)); # mm5 = r[7]
107 &paddq("mm3","mm5"); # mm3 = r[7] + w*a[7]
108 &movd(&DWP(20,$r,"",0),"mm1");
109 &psrlq("mm1",32); # mm1 = carry5
110 &paddq("mm1","mm6"); # mm1 = carry5 + r[6] + w*a[6]
111 &movd(&DWP(24,$r,"",0),"mm1");
112 &psrlq("mm1",32); # mm1 = carry6
113 &paddq("mm1","mm3"); # mm1 = carry6 + r[7] + w*a[7]
114 &movd(&DWP(28,$r,"",0),"mm1");
115 &add($r,32);
116 &psrlq("mm1",32); # mm1 = carry_out
117
118 &sub("ecx",8);
119 &jnz(&label("maw_sse2_loop"));
120
121 &movd($c,"mm1"); # c = carry_out
122 &emms();
123
124 &jmp(&label("maw_finish"));
125 }
126
45 &set_label("maw_loop",0); 127 &set_label("maw_loop",0);
46 128
47 &mov(&swtmp(0),"ecx"); # 129 &mov(&swtmp(0),"ecx"); #
diff --git a/src/lib/libcrypto/bn/asm/ia64.S b/src/lib/libcrypto/bn/asm/ia64.S
index 7b82b820e6..951abc53ea 100644
--- a/src/lib/libcrypto/bn/asm/ia64.S
+++ b/src/lib/libcrypto/bn/asm/ia64.S
@@ -171,21 +171,21 @@
171.skip 32 // makes the loop body aligned at 64-byte boundary 171.skip 32 // makes the loop body aligned at 64-byte boundary
172bn_add_words: 172bn_add_words:
173 .prologue 173 .prologue
174 .fframe 0
175 .save ar.pfs,r2 174 .save ar.pfs,r2
176{ .mii; alloc r2=ar.pfs,4,12,0,16 175{ .mii; alloc r2=ar.pfs,4,12,0,16
177 cmp4.le p6,p0=r35,r0 };; 176 cmp4.le p6,p0=r35,r0 };;
178{ .mfb; mov r8=r0 // return value 177{ .mfb; mov r8=r0 // return value
179(p6) br.ret.spnt.many b0 };; 178(p6) br.ret.spnt.many b0 };;
180 179
181 .save ar.lc,r3
182{ .mib; sub r10=r35,r0,1 180{ .mib; sub r10=r35,r0,1
181 .save ar.lc,r3
183 mov r3=ar.lc 182 mov r3=ar.lc
184 brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16 183 brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16
185 } 184 }
186 .body
187{ .mib; ADDP r14=0,r32 // rp 185{ .mib; ADDP r14=0,r32 // rp
186 .save pr,r9
188 mov r9=pr };; 187 mov r9=pr };;
188 .body
189{ .mii; ADDP r15=0,r33 // ap 189{ .mii; ADDP r15=0,r33 // ap
190 mov ar.lc=r10 190 mov ar.lc=r10
191 mov ar.ec=6 } 191 mov ar.ec=6 }
@@ -224,21 +224,21 @@ bn_add_words:
224.skip 32 // makes the loop body aligned at 64-byte boundary 224.skip 32 // makes the loop body aligned at 64-byte boundary
225bn_sub_words: 225bn_sub_words:
226 .prologue 226 .prologue
227 .fframe 0
228 .save ar.pfs,r2 227 .save ar.pfs,r2
229{ .mii; alloc r2=ar.pfs,4,12,0,16 228{ .mii; alloc r2=ar.pfs,4,12,0,16
230 cmp4.le p6,p0=r35,r0 };; 229 cmp4.le p6,p0=r35,r0 };;
231{ .mfb; mov r8=r0 // return value 230{ .mfb; mov r8=r0 // return value
232(p6) br.ret.spnt.many b0 };; 231(p6) br.ret.spnt.many b0 };;
233 232
234 .save ar.lc,r3
235{ .mib; sub r10=r35,r0,1 233{ .mib; sub r10=r35,r0,1
234 .save ar.lc,r3
236 mov r3=ar.lc 235 mov r3=ar.lc
237 brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16 236 brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
238 } 237 }
239 .body
240{ .mib; ADDP r14=0,r32 // rp 238{ .mib; ADDP r14=0,r32 // rp
239 .save pr,r9
241 mov r9=pr };; 240 mov r9=pr };;
241 .body
242{ .mii; ADDP r15=0,r33 // ap 242{ .mii; ADDP r15=0,r33 // ap
243 mov ar.lc=r10 243 mov ar.lc=r10
244 mov ar.ec=6 } 244 mov ar.ec=6 }
@@ -283,7 +283,6 @@ bn_sub_words:
283.skip 32 // makes the loop body aligned at 64-byte boundary 283.skip 32 // makes the loop body aligned at 64-byte boundary
284bn_mul_words: 284bn_mul_words:
285 .prologue 285 .prologue
286 .fframe 0
287 .save ar.pfs,r2 286 .save ar.pfs,r2
288#ifdef XMA_TEMPTATION 287#ifdef XMA_TEMPTATION
289{ .mfi; alloc r2=ar.pfs,4,0,0,0 };; 288{ .mfi; alloc r2=ar.pfs,4,0,0,0 };;
@@ -294,9 +293,10 @@ bn_mul_words:
294 cmp4.le p6,p0=r34,r0 293 cmp4.le p6,p0=r34,r0
295(p6) br.ret.spnt.many b0 };; 294(p6) br.ret.spnt.many b0 };;
296 295
297 .save ar.lc,r3
298{ .mii; sub r10=r34,r0,1 296{ .mii; sub r10=r34,r0,1
297 .save ar.lc,r3
299 mov r3=ar.lc 298 mov r3=ar.lc
299 .save pr,r9
300 mov r9=pr };; 300 mov r9=pr };;
301 301
302 .body 302 .body
@@ -397,22 +397,21 @@ bn_mul_words:
397.skip 48 // makes the loop body aligned at 64-byte boundary 397.skip 48 // makes the loop body aligned at 64-byte boundary
398bn_mul_add_words: 398bn_mul_add_words:
399 .prologue 399 .prologue
400 .fframe 0
401 .save ar.pfs,r2 400 .save ar.pfs,r2
402 .save ar.lc,r3
403 .save pr,r9
404{ .mmi; alloc r2=ar.pfs,4,4,0,8 401{ .mmi; alloc r2=ar.pfs,4,4,0,8
405 cmp4.le p6,p0=r34,r0 402 cmp4.le p6,p0=r34,r0
403 .save ar.lc,r3
406 mov r3=ar.lc };; 404 mov r3=ar.lc };;
407{ .mib; mov r8=r0 // return value 405{ .mib; mov r8=r0 // return value
408 sub r10=r34,r0,1 406 sub r10=r34,r0,1
409(p6) br.ret.spnt.many b0 };; 407(p6) br.ret.spnt.many b0 };;
410 408
411 .body
412{ .mib; setf.sig f8=r35 // w 409{ .mib; setf.sig f8=r35 // w
410 .save pr,r9
413 mov r9=pr 411 mov r9=pr
414 brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16 412 brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
415 } 413 }
414 .body
416{ .mmi; ADDP r14=0,r32 // rp 415{ .mmi; ADDP r14=0,r32 // rp
417 ADDP r15=0,r33 // ap 416 ADDP r15=0,r33 // ap
418 mov ar.lc=r10 } 417 mov ar.lc=r10 }
@@ -466,7 +465,6 @@ bn_mul_add_words:
466.skip 32 // makes the loop body aligned at 64-byte boundary 465.skip 32 // makes the loop body aligned at 64-byte boundary
467bn_sqr_words: 466bn_sqr_words:
468 .prologue 467 .prologue
469 .fframe 0
470 .save ar.pfs,r2 468 .save ar.pfs,r2
471{ .mii; alloc r2=ar.pfs,3,0,0,0 469{ .mii; alloc r2=ar.pfs,3,0,0,0
472 sxt4 r34=r34 };; 470 sxt4 r34=r34 };;
@@ -476,9 +474,10 @@ bn_sqr_words:
476 nop.f 0x0 474 nop.f 0x0
477(p6) br.ret.spnt.many b0 };; 475(p6) br.ret.spnt.many b0 };;
478 476
479 .save ar.lc,r3
480{ .mii; sub r10=r34,r0,1 477{ .mii; sub r10=r34,r0,1
478 .save ar.lc,r3
481 mov r3=ar.lc 479 mov r3=ar.lc
480 .save pr,r9
482 mov r9=pr };; 481 mov r9=pr };;
483 482
484 .body 483 .body
@@ -545,7 +544,6 @@ bn_sqr_words:
545.align 64 544.align 64
546bn_sqr_comba8: 545bn_sqr_comba8:
547 .prologue 546 .prologue
548 .fframe 0
549 .save ar.pfs,r2 547 .save ar.pfs,r2
550#if defined(_HPUX_SOURCE) && !defined(_LP64) 548#if defined(_HPUX_SOURCE) && !defined(_LP64)
551{ .mii; alloc r2=ar.pfs,2,1,0,0 549{ .mii; alloc r2=ar.pfs,2,1,0,0
@@ -617,7 +615,6 @@ bn_sqr_comba8:
617.align 64 615.align 64
618bn_mul_comba8: 616bn_mul_comba8:
619 .prologue 617 .prologue
620 .fframe 0
621 .save ar.pfs,r2 618 .save ar.pfs,r2
622#if defined(_HPUX_SOURCE) && !defined(_LP64) 619#if defined(_HPUX_SOURCE) && !defined(_LP64)
623{ .mii; alloc r2=ar.pfs,3,0,0,0 620{ .mii; alloc r2=ar.pfs,3,0,0,0
@@ -1175,7 +1172,6 @@ bn_mul_comba8:
1175.align 64 1172.align 64
1176bn_sqr_comba4: 1173bn_sqr_comba4:
1177 .prologue 1174 .prologue
1178 .fframe 0
1179 .save ar.pfs,r2 1175 .save ar.pfs,r2
1180#if defined(_HPUX_SOURCE) && !defined(_LP64) 1176#if defined(_HPUX_SOURCE) && !defined(_LP64)
1181{ .mii; alloc r2=ar.pfs,2,1,0,0 1177{ .mii; alloc r2=ar.pfs,2,1,0,0
@@ -1208,7 +1204,6 @@ bn_sqr_comba4:
1208.align 64 1204.align 64
1209bn_mul_comba4: 1205bn_mul_comba4:
1210 .prologue 1206 .prologue
1211 .fframe 0
1212 .save ar.pfs,r2 1207 .save ar.pfs,r2
1213#if defined(_HPUX_SOURCE) && !defined(_LP64) 1208#if defined(_HPUX_SOURCE) && !defined(_LP64)
1214{ .mii; alloc r2=ar.pfs,3,0,0,0 1209{ .mii; alloc r2=ar.pfs,3,0,0,0
@@ -1411,11 +1406,11 @@ equ=p24
1411.align 64 1406.align 64
1412bn_div_words: 1407bn_div_words:
1413 .prologue 1408 .prologue
1414 .fframe 0
1415 .save ar.pfs,r2 1409 .save ar.pfs,r2
1416 .save b0,r3
1417{ .mii; alloc r2=ar.pfs,3,5,0,8 1410{ .mii; alloc r2=ar.pfs,3,5,0,8
1411 .save b0,r3
1418 mov r3=b0 1412 mov r3=b0
1413 .save pr,r10
1419 mov r10=pr };; 1414 mov r10=pr };;
1420{ .mmb; cmp.eq p6,p0=r34,r0 1415{ .mmb; cmp.eq p6,p0=r34,r0
1421 mov r8=-1 1416 mov r8=-1