summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/bn/asm/ia64-mont.pl
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/libcrypto/bn/asm/ia64-mont.pl')
-rw-r--r--src/lib/libcrypto/bn/asm/ia64-mont.pl851
1 files changed, 0 insertions, 851 deletions
diff --git a/src/lib/libcrypto/bn/asm/ia64-mont.pl b/src/lib/libcrypto/bn/asm/ia64-mont.pl
deleted file mode 100644
index e258658428..0000000000
--- a/src/lib/libcrypto/bn/asm/ia64-mont.pl
+++ /dev/null
@@ -1,851 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# January 2010
11#
12# "Teaser" Montgomery multiplication module for IA-64. There are
13# several possibilities for improvement:
14#
15# - modulo-scheduling outer loop would eliminate quite a number of
16# stalls after ldf8, xma and getf.sig outside inner loop and
17# improve shorter key performance;
18# - shorter vector support [with input vectors being fetched only
19# once] should be added;
20# - 2x unroll with help of n0[1] would make the code scalable on
21# "wider" IA-64, "wider" than Itanium 2 that is, which is not of
22# acute interest, because upcoming Tukwila's individual cores are
23# reportedly based on Itanium 2 design;
24# - dedicated squaring procedure(?);
25#
26# January 2010
27#
28# Shorter vector support is implemented by zero-padding ap and np
29# vectors up to 8 elements, or 512 bits. This means that 256-bit
30# inputs will be processed only 2 times faster than 512-bit inputs,
31# not 4 [as one would expect, because algorithm complexity is n^2].
32# The reason for padding is that inputs shorter than 512 bits won't
33# be processed faster anyway, because minimal critical path of the
34# core loop happens to match 512-bit timing. Either way, it resulted
35# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
36# 1024-bit one [in comparison to original version of *this* module].
37#
38# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
39# this module is:
40# sign verify sign/s verify/s
41# rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4
42# rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0
43# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
44# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
45# dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0
46# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
47# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
48#
49# ... and *without* (but still with ia64.S):
50#
51# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
52# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
53# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
54# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
55# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
56# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
57# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
58#
59# As can be seen, RSA sign performance improves by 130-30%
60# (less for longer keys), while verify improves by 74-13%.
61# DSA performance improves by 115-30%.
62
# Select the instruction used to copy pointer arguments into scratch
# registers.  On HP-UX the default is "addp4"; plain "add" is used when
# a 64-bit data model flag (+DD64 or -mlp64) appears on the command
# line, and on every other OS.
# NOTE(review): addp4 is presumably required for the 32-bit HP-UX data
# model -- confirm against the platform ABI.
63if ($^O eq "hpux") {
64 $ADDP="addp4";
# The two flags must be an alternation group.  A bracketed character
# class would match any single one of + D | - m l p followed by 64
# (for example "-m64"), not just the two intended flags.
65 for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
66} else { $ADDP="add"; }
67
68$code=<<___;
69.explicit
70.text
71
72// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
73// const BN_ULONG *bp,const BN_ULONG *np,
74// const BN_ULONG *n0p,int num);
//
// Entry point.  It only dispatches on num, which is read straight from
// r37 (in5, the sixth argument) because no alloc has been executed yet:
//   num < 2        return with ret0 = 0 (the worker procedures below
//                  return 1 to signal "handled")
//   2 <= num <= 8  branch to bn_mul_mont_8
//   num > 8        branch to bn_mul_mont_general
// The workers are reached with plain branches, not calls, so their
// br.ret goes directly back to the caller of bn_mul_mont.
75.align 64
76.global bn_mul_mont#
77.proc bn_mul_mont#
78bn_mul_mont:
79 .prologue
80 .body
// p6 = (2 <= num), p7 = its complement
81{ .mmi; cmp4.le p6,p7=2,r37;;
// under p6: p8 = (8 < num), p9 = its complement.  The .unc form clears
// both p8 and p9 when p6 is false.
82(p6) cmp4.lt.unc p8,p9=8,r37
83 mov ret0=r0 };;
84{ .bbb;
85(p9) br.cond.dptk.many bn_mul_mont_8
86(p8) br.cond.dpnt.many bn_mul_mont_general
87(p7) br.ret.spnt.many b0 };;
88.endp bn_mul_mont#
89
// Symbolic register names used by the worker procedures below.
// prev* receive the caller state saved in the prologues
// (ar.pfs, pr, ar.lc and sp respectively).
90prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
91
// Working copies of the rp/ap/bp/np pointer arguments.
92rptr=r8; aptr=r9; bptr=r14; nptr=r15;
93tptr=r16; // &tp[0]
94tp_1=r17; // &tp[-1]
// num = word count, len = num*8 (vector length in bytes),
// lc = value loaded into ar.lc before each pipelined loop.
95num=r18; len=r19; lc=r20;
96topbit=r21; // carry bit from tmp[num]
97
// Floating-point operands: n0 is the word loaded from n0p, m0 is the
// low 64 bits of (first accumulated word)*n0 computed in every pass,
// bi is the current word of bp.
98n0=f6;
99m0=f7;
100bi=f8;
101
// bn_mul_mont_general: worker reached from bn_mul_mont when num > 8.
// It receives the same six arguments (in0..in5) and keeps the
// temporary vector tp[] in stack space carved out below.
102.align 64
103.local bn_mul_mont_general#
104.proc bn_mul_mont_general#
105bn_mul_mont_general:
106 .prologue
// Frame: 6 inputs, 2 locals, 0 outputs, 8 rotating registers.
// ar.pfs, ar.lc, sp and pr are saved in scratch registers and
// described to the unwinder by the .save/.vframe directives.
// NOTE(review): the 8 rotating general registers presumably overlap
// in0..in5, which would be why every argument is copied out before the
// first loop starts -- confirm against the IA-64 register stack rules.
107{ .mmi; .save ar.pfs,prevfs
108 alloc prevfs=ar.pfs,6,2,0,8
109 $ADDP aptr=0,in1
110 .save ar.lc,prevlc
111 mov prevlc=ar.lc }
112{ .mmi; .vframe prevsp
113 mov prevsp=sp
114 $ADDP bptr=0,in2
115 .save pr,prevpr
116 mov prevpr=pr };;
117
118 .body
// Rotating register names: alo/ahi hold low/high halves of ap[j]*bi,
// nlo/nhi low/high halves of np[j]*m0, a[]/n[]/t[] their integer copies.
119 .rotf alo[6],nlo[4],ahi[8],nhi[6]
120 .rotr a[3],n[3],t[2]
121
// Preload bp[0], ap[0..3], n0 and np[0..1] so that the pipelined loop
// can start with its first stages already fed.  aptr and r30 walk the
// even and odd words of ap in steps of 16 bytes.
122{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
123 ldf8 alo[4]=[aptr],16 // ap[0]
124 $ADDP r30=8,in1 };;
125{ .mmi; ldf8 alo[3]=[r30],16 // ap[1]
126 ldf8 alo[2]=[aptr],16 // ap[2]
127 $ADDP in4=0,in4 };;
128{ .mmi; ldf8 alo[1]=[r30] // ap[3]
129 ldf8 n0=[in4] // n0
130 $ADDP rptr=0,in0 }
131{ .mmi; $ADDP nptr=0,in3
132 mov r31=16
133 zxt4 num=in5 };;
// len = num*8, r31 = num*8 + 16
134{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
135 shladd len=num,3,r0
136 shladd r31=num,3,r31 };;
// lc = num-5 is the ar.lc seed for the multiply loops
137{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
138 add lc=-5,num
139 sub r31=sp,r31 };;
// New sp = (old sp - num*8 - 16) rounded down to a multiple of 16.
// tp[0] is addressed at sp+16 and tp[-1] at sp+8 further below.
140{ .mfb; and sp=-16,r31 // alloca
141 xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
142 nop.b 0 }
143{ .mfb; nop.m 0
144 xmpy.lu alo[4]=alo[4],bi
145 brp.loop.imp .L1st_ctop,.L1st_cend-16
146 };;
147{ .mfi; nop.m 0
148 xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
149 add tp_1=8,sp }
150{ .mfi; nop.m 0
151 xma.lu alo[3]=alo[3],bi,ahi[2]
152 mov pr.rot=0x20001f<<16
153 // ------^----- (p40) at first (p23)
154 // ----------^^ p[16:20]=1
155 };;
156{ .mfi; nop.m 0
157 xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
158 mov ar.lc=lc }
// nhi[1] = 0 (integer conversion of f0): no carry into np[0]*m0.
// ar.ec = 8 gives the loop eight pipeline stages.
159{ .mfi; nop.m 0
160 fcvt.fxu.s1 nhi[1]=f0
161 mov ar.ec=8 };;
162
// First pass (bp[0]): software-pipelined br.ctop loop over the words
// of ap and np.  Stage predicates gate the steps of one word:
//   p16  load the next ap word
//   p18  load the next np word, multiply ap word by bi and add the
//        previous high half
//   p20  multiply np word by m0 and add the previous high half
//   p21  move both low halves to integer registers a[], n[]
//   p23  n[2] += a[2] plus running carry, then store to [tp_1]
// p40/p42 select the add without/with carry-in.  The compares write
// p41/p39, which are referenced as p42/p40 in the next iteration.
163.align 32
164.L1st_ctop:
165.pred.rel "mutex",p40,p42
166{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
167 (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
168 (p40) add n[2]=n[2],a[2] } // (p23) }
169{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
170 (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
171 (p42) add n[2]=n[2],a[2],1 };; // (p23)
172{ .mfi; (p21) getf.sig a[0]=alo[5]
173 (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
174 (p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
175{ .mfi; (p23) st8 [tp_1]=n[2],8
176 (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
177 (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
178{ .mmb; (p21) getf.sig n[0]=nlo[3]
179 (p16) nop.m 0
180 br.ctop.sptk .L1st_ctop };;
181.L1st_cend:
182
// Tail of the first pass: the two remaining high halves are added
// together with the last carry to form the top word of tp.
183{ .mmi; getf.sig a[0]=ahi[6] // (p24)
184 getf.sig n[0]=nhi[4]
185 add num=-1,num };; // num--
186{ .mmi; .pred.rel "mutex",p40,p42
187(p40) add n[0]=n[0],a[0]
188(p42) add n[0]=n[0],a[0],1
189 sub aptr=aptr,len };; // rewind
190{ .mmi; .pred.rel "mutex",p40,p42
191(p40) cmp.ltu p41,p39=n[0],a[0]
192(p42) cmp.leu p41,p39=n[0],a[0]
193 sub nptr=nptr,len };;
// topbit = carry out of the top word (0 or 1)
194{ .mmi; .pred.rel "mutex",p39,p41
195(p39) add topbit=r0,r0
196(p41) add topbit=r0,r0,1
197 nop.i 0 }
// Store the top word and reset tptr/tp_1 to tp[0] and tp[-1] for the
// next pass.
198{ .mmi; st8 [tp_1]=n[0]
199 add tptr=16,sp
200 add tp_1=8,sp };;
201
// Outer loop head: executed once for every remaining word of bp.
// It primes the pipeline the same way as the first-pass setup, with
// two differences: tp[0] is folded into the first product, and a
// second carry chain (p30 at first p22) is seeded for adding tp[].
202.Louter:
203{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
204 ldf8 ahi[3]=[tptr] // tp[0]
205 add r30=8,aptr };;
206{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
207 ldf8 alo[3]=[r30],16 // ap[1]
208 add r31=8,nptr };;
209{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
210 xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
211 brp.loop.imp .Linner_ctop,.Linner_cend-16
212 }
// clrrrb.pr resets the predicate rotation base before pr.rot is
// reloaded below.
213{ .mfb; ldf8 alo[1]=[r30] // ap[3]
214 xma.lu alo[4]=alo[4],bi,ahi[3]
215 clrrrb.pr };;
216{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
217 xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
218 nop.i 0 }
219{ .mfi; ldf8 nlo[1]=[r31] // np[1]
220 xma.lu alo[3]=alo[3],bi,ahi[2]
221 mov pr.rot=0x20101f<<16
222 // ------^----- (p40) at first (p23)
223 // --------^--- (p30) at first (p22)
224 // ----------^^ p[16:20]=1
225 };;
// tp[0] was already added by the xma above, so the slot is zeroed
// before the inner loop reads it back through tptr.
226{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted
227 xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
228 mov ar.lc=lc }
// nhi[1] = 0, eight pipeline stages as in the first pass
229{ .mfi;
230 fcvt.fxu.s1 nhi[1]=f0
231 mov ar.ec=8 };;
232
233// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
234// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
235// in latter case accounts for two-tick pipeline stall, which means
236// that its performance would be ~20% lower than optimal one. No
237// attempt was made to address this, because original Itanium is
238// hardly represented out in the wild...
//
// Same pipeline as the first pass with one extra step: at stage p21
// the old tp word is loaded into t[0], and at stage p22 it is added to
// the ap*bi low half (a[1] += t[1]) using its own carry chain
// (p30 = no carry in, p32 = carry in, compares write p31/p29).
// At stage p23 the np*m0 low half is added (p40/p42 carry chain) and
// the result is stored through tp_1.
239.align 32
240.Linner_ctop:
241.pred.rel "mutex",p40,p42
242.pred.rel "mutex",p30,p32
243{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
244 (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
245 (p40) add n[2]=n[2],a[2] } // (p23)
246{ .mfi; (p16) nop.m 0
247 (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
248 (p42) add n[2]=n[2],a[2],1 };; // (p23)
249{ .mfi; (p21) getf.sig a[0]=alo[5]
250 (p16) nop.f 0
251 (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
252{ .mfi; (p21) ld8 t[0]=[tptr],8
253 (p16) nop.f 0
254 (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
255{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
256 (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
257 (p30) add a[1]=a[1],t[1] } // (p22)
258{ .mfi; (p16) nop.m 0
259 (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
260 (p32) add a[1]=a[1],t[1],1 };; // (p22)
261{ .mmi; (p21) getf.sig n[0]=nlo[3]
262 (p16) nop.m 0
263 (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
264{ .mmb; (p23) st8 [tp_1]=n[2],8
265 (p32) cmp.leu p31,p29=a[1],t[1] // (p22)
266 br.ctop.sptk .Linner_ctop };;
267.Linner_cend:
268
// Tail of one outer pass: form the top word of tp from the two
// remaining high halves, the previous topbit and the pending carries,
// then either start the next pass or fall through to the subtraction.
269{ .mmi; getf.sig a[0]=ahi[6] // (p24)
270 getf.sig n[0]=nhi[4]
271 nop.i 0 };;
272
// a[0] += topbit plus the carry left by the tp[] addition chain
// (p31 = no carry, p33 = carry).  topbit is cleared in the same
// instruction group, so these adds still read its old value.
273{ .mmi; .pred.rel "mutex",p31,p33
274(p31) add a[0]=a[0],topbit
275(p33) add a[0]=a[0],topbit,1
276 mov topbit=r0 };;
// p32 = carry out of that addition.
// NOTE(review): topbit is already zero here, so the compares test
// a[0] against 0 -- this relies on the high half in a[0] never being
// all ones, confirm the bound on ap assumed by the caller.
277{ .mfi; .pred.rel "mutex",p31,p33
278(p31) cmp.ltu p32,p30=a[0],topbit
279(p33) cmp.leu p32,p30=a[0],topbit
280 }
// n[0] += a[0] plus the carry of the np*m0 chain (p40/p42)
281{ .mfi; .pred.rel "mutex",p40,p42
282(p40) add n[0]=n[0],a[0]
283(p42) add n[0]=n[0],a[0],1
284 };;
// The mutex annotation names the predicates that actually guard the
// two compares below (p40/p42), matching the first-pass tail.
285{ .mmi; .pred.rel "mutex",p40,p42
286(p40) cmp.ltu p41,p39=n[0],a[0]
287(p42) cmp.leu p41,p39=n[0],a[0]
288(p32) add topbit=r0,r0,1 }
289
// Store the top word.  p6 = (num != 1) decides whether another pass
// follows.  aptr/nptr are rewound by len bytes.
290{ .mmi; st8 [tp_1]=n[0],8
291 cmp4.ne p6,p0=1,num
292 sub aptr=aptr,len };; // rewind
// topbit is set to 1 if either of the two final additions carried
// (p32 above, p41 here).  tptr/tp_1 go back to tp[0] and tp[-1].
293{ .mmi; sub nptr=nptr,len
294(p41) add topbit=r0,r0,1
295 add tptr=16,sp }
296{ .mmb; add tp_1=8,sp
297 add num=-1,num // num--
298(p6) br.cond.sptk.many .Louter };;
299
// Final subtraction rp[] = tp[] - np[], word by word with borrow.
// lc was num-5 (with the original num), so lc+4 = num-1 is loaded
// into ar.lc.  The loop has three stages (ar.ec = 3):
//   p16  load the tp and np words
//   p17  subtract, p33 = no borrow in, p35 = borrow in
//   p18  store the difference to rp
300{ .mbb; add lc=4,lc
301 brp.loop.imp .Lsub_ctop,.Lsub_cend-16
302 clrrrb.pr };;
303{ .mii; nop.m 0
304 mov pr.rot=0x10001<<16
305 // ------^---- (p33) at first (p17)
306 mov ar.lc=lc }
307{ .mii; nop.m 0
308 mov ar.ec=3
309 nop.i 0 };;
310
311.Lsub_ctop:
312.pred.rel "mutex",p33,p35
313{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
314 (p16) nop.f 0
315 (p33) sub n[1]=t[1],n[1] } // (p17)
316{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
317 (p16) nop.f 0
318 (p35) sub n[1]=t[1],n[1],1 };; // (p17)
// The compares write the borrow out as p34 (p32 = no borrow).  After
// rotation they are seen as p35/p33 by the next word.
319{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
320 (p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
321 (p18) nop.b 0 }
322{ .mib; (p18) nop.m 0
323 (p35) cmp.geu p34,p32=n[1],t[1] // (p17)
324 br.ctop.sptk .Lsub_ctop };;
325.Lsub_cend:
326
// Branch-free selection of the final result followed by a copy loop.
// The last borrow (p36, or p34 for none) is subtracted from topbit.
// NOTE(review): topbit is then expected to be either 0 (keep the
// difference already stored in rp) or all ones (tp was smaller than
// np, take tp instead) -- confirm that topbit=1 without borrow cannot
// occur for valid inputs.
327{ .mmb; .pred.rel "mutex",p34,p36
328(p34) sub topbit=topbit,r0 // (p19)
329(p36) sub topbit=topbit,r0,1
330 brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
331 }
332{ .mmb; sub rptr=rptr,len // rewind
333 sub tptr=tptr,len
334 clrrrb.pr };;
// nptr = (tptr AND topbit) OR (rptr AND NOT topbit): the source
// pointer of the copy, chosen with topbit used as a mask.
335{ .mmi; and aptr=tptr,topbit
336 andcm bptr=rptr,topbit
337 mov pr.rot=1<<16 };;
338{ .mii; or nptr=aptr,bptr
339 mov ar.lc=lc
340 mov ar.ec=3 };;
341
// Copy the selected vector to rp and overwrite tp[] with zeros in the
// same sweep (p16 load, p18 both stores).
342.Lcopy_ctop:
343{ .mmb; (p16) ld8 n[0]=[nptr],8
344 (p18) st8 [tptr]=r0,8
345 (p16) nop.b 0 }
346{ .mmb; (p16) nop.m 0
347 (p18) st8 [rptr]=n[2],8
348 br.ctop.sptk .Lcopy_ctop };;
349.Lcopy_cend:
350
// Epilogue: return 1 and restore ar.lc, sp and the predicates saved in
// the prologue.
351{ .mmi; mov ret0=1 // signal "handled"
352 rum 1<<5 // clear um.mfh
353 mov ar.lc=prevlc }
354{ .mib; .restore sp
355 mov sp=prevsp
356 mov pr=prevpr,0x1ffff
357 br.ret.sptk.many b0 };;
358.endp bn_mul_mont_general#
359
// Register names for bn_mul_mont_8.
// a1..a8 and n1..n8 are integer registers that receive words of the
// ap*b[i] and np*m0 products, t0 the lowest word of the accumulator.
// These reuse scratch registers that carry other names (tptr, num,
// nptr, ...) in bn_mul_mont_general.
360a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
361n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
362t0=r15;
363
// ai0..ai7 keep the (zero-padded) words of ap and ni0..ni7 those of np
// in floating-point registers for the whole procedure.
364ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
365ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
366
// bn_mul_mont_8: worker reached from bn_mul_mont when 2 <= num <= 8.
// Same six arguments.  All input vectors are held in registers, padded
// with zeros up to 8 words.
367.align 64
368.skip 48 // aligns loop body
369.local bn_mul_mont_8#
370.proc bn_mul_mont_8#
371bn_mul_mont_8:
372 .prologue
// Frame: 6 inputs, 2 locals, 0 outputs, 8 rotating registers.
// ar.pfs, sp, ar.lc and pr are saved in scratch registers.
373{ .mmi; .save ar.pfs,prevfs
374 alloc prevfs=ar.pfs,6,2,0,8
375 .vframe prevsp
376 mov prevsp=sp
377 .save ar.lc,prevlc
378 mov prevlc=ar.lc }
379{ .mmi; add r17=-6*16,sp
380 add sp=-7*16,sp
381 .save pr,prevpr
382 mov prevpr=pr };;
383
// Spill f16..f23, which are used as ni0..ni7 below.  f16 goes to
// prevsp-7*16 and each following register 16 bytes higher, so f23 ends
// up at [prevsp].  r16 and r17 walk alternate slots in steps of 32.
// NOTE(review): presumably these are callee-saved registers in the
// IA-64 software conventions -- confirm.
384{ .mmi; .save.gf 0,0x10
385 stf.spill [sp]=f16,-16
386 .save.gf 0,0x20
387 stf.spill [r17]=f17,32
388 add r16=-5*16,prevsp};;
389{ .mmi; .save.gf 0,0x40
390 stf.spill [r16]=f18,32
391 .save.gf 0,0x80
392 stf.spill [r17]=f19,32
393 $ADDP aptr=0,in1 };;
394{ .mmi; .save.gf 0,0x100
395 stf.spill [r16]=f20,32
396 .save.gf 0,0x200
397 stf.spill [r17]=f21,32
398 $ADDP r29=8,in1 };;
399{ .mmi; .save.gf 0,0x400
400 stf.spill [r16]=f22
401 .save.gf 0,0x800
402 stf.spill [r17]=f23
403 $ADDP rptr=0,in0 };;
404
405 .body
// Rotating names: bj[] holds the words of bp (bp[0] in bj[7] down to
// bp[7] in bj[0]), mj[] the per-pass multiplier, tf[] the lowest
// accumulator word as a floating-point operand, t[] the accumulator.
406 .rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
407 .rotr t[8]
408
409// load input vectors padding them to 8 elements
// Words 0 and 1 always exist (num >= 2).  For words 2..7 a predicate
// pair is computed from num (in5): the first predicate loads the word,
// its complement sets the register to zero with fcvt.fxu of f0.
//   p4/p5: 3 <= num    p6/p7: 4 <= num    p8/p9: 5 <= num
//   p10/p11: 6 <= num  p12/p13: 7 <= num  p14/p15: 8 <= num
// The complements p5..p15 are used again when the result is stored.
// Even words are read through aptr/bptr/nptr, odd words through
// r29/r30/r31, each advancing by 16 bytes.
410{ .mmi; ldf8 ai0=[aptr],16 // ap[0]
411 ldf8 ai1=[r29],16 // ap[1]
412 $ADDP bptr=0,in2 }
413{ .mmi; $ADDP r30=8,in2
414 $ADDP nptr=0,in3
415 $ADDP r31=8,in3 };;
416{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
417 ldf8 bj[6]=[r30],16 // bp[1]
418 cmp4.le p4,p5=3,in5 }
419{ .mmi; ldf8 ni0=[nptr],16 // np[0]
420 ldf8 ni1=[r31],16 // np[1]
421 cmp4.le p6,p7=4,in5 };;
422
423{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
424 (p5)fcvt.fxu ai2=f0
425 cmp4.le p8,p9=5,in5 }
426{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
427 (p7)fcvt.fxu ai3=f0
428 cmp4.le p10,p11=6,in5 }
429{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
430 (p5)fcvt.fxu bj[5]=f0
431 cmp4.le p12,p13=7,in5 }
432{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
433 (p7)fcvt.fxu bj[4]=f0
434 cmp4.le p14,p15=8,in5 }
// r28 = num-1, later loaded into ar.lc.
// NOTE(review): addp4 is presumably used here to obtain a clean
// 64-bit value from the 32-bit int argument -- confirm.
435{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
436 (p5)fcvt.fxu ni2=f0
437 addp4 r28=-1,in5 }
438{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
439 (p7)fcvt.fxu ni3=f0
440 $ADDP in4=0,in4 };;
441
// n0 is loaded through n0p, tf[1] and the accumulator t[0..7] start
// at zero.
442{ .mfi; ldf8 n0=[in4]
443 fcvt.fxu tf[1]=f0
444 nop.i 0 }
445
446{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
447 (p9)fcvt.fxu ai4=f0
448 mov t[0]=r0 }
449{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
450 (p11)fcvt.fxu ai5=f0
451 mov t[1]=r0 }
452{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
453 (p9)fcvt.fxu bj[3]=f0
454 mov t[2]=r0 }
455{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
456 (p11)fcvt.fxu bj[2]=f0
457 mov t[3]=r0 }
458{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
459 (p9)fcvt.fxu ni4=f0
460 mov t[4]=r0 }
461{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
462 (p11)fcvt.fxu ni5=f0
463 mov t[5]=r0 };;
464
// Loop control for .Louter_8_ctop: ar.lc = num-1, ar.ec = 1, and only
// p16 set among the rotating predicates.
465{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
466 (p13)fcvt.fxu ai6=f0
467 mov t[6]=r0 }
468{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
469 (p15)fcvt.fxu ai7=f0
470 mov t[7]=r0 }
471{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
472 (p13)fcvt.fxu bj[1]=f0
473 mov ar.lc=r28 }
474{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
475 (p15)fcvt.fxu bj[0]=f0
476 mov ar.ec=1 }
477{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
478 (p13)fcvt.fxu ni6=f0
479 mov pr.rot=1<<16 }
480{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
481 (p15)fcvt.fxu ni7=f0
482 brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
483 };;
484
485// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
486// to measure with help of Interval Time Counter indicated that the
487// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
488// addressing the issue is problematic, because I don't have access
489// to platform-specific instruction-level profiler. On Itanium it
490// should run in 56*n ticks, because of higher xma latency...
//
// Structure of one iteration (the numbers in the comments below are
// the scheduled cycle of each instruction group):
//  - instructions under (p16) belong to the current word of bp, always
//    referenced as bj[7]: the eight products ap[k]*b[i] chained through
//    ahi[], the multiplier mj[0] = lo(alo[0]*n0), the eight products
//    np[k]*m0 chained through nhi[], and the first additions into the
//    accumulator (a1+n1 into t0, a2+n2 into t[6]).
//  - instructions under (p17) and the carry predicates p40..p43 and
//    p48..p51 finish the previous word: the remaining np products
//    (ni6, ni7 with mj[1]) and the additions a3..a8 + n3..n8 into
//    t[6] down to t[1].
//  - p40/p42 (alternating with p41/p43) carry the a+n chain,
//    p48/p50 (alternating with p49/p51) carry the t+a chain.
// Only p16 is set on entry, so in the first iteration every
// instruction guarded by p17 or by a carry predicate is skipped.
491.Louter_8_ctop:
492 .pred.rel "mutex",p40,p42
493 .pred.rel "mutex",p48,p50
494{ .mfi; (p16) nop.m 0 // 0:
495 (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
496 (p40) add a3=a3,n3 } // (p17) a3+=n3
497{ .mfi; (p42) add a3=a3,n3,1
498 (p16) xma.lu alo[0]=ai0,bj[7],tf[1]
499 (p16) nop.i 0 };;
500{ .mii; (p17) getf.sig a7=alo[8] // 1:
501 (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
502 (p50) add t[6]=t[6],a3,1 };;
503{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
504 (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
505 (p40) cmp.ltu p43,p41=a3,n3 }
506{ .mfi; (p42) cmp.leu p43,p41=a3,n3
507 (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
508 (p16) nop.i 0 };;
509{ .mii; (p17) getf.sig n5=nlo[6] // 3:
510 (p48) cmp.ltu p51,p49=t[6],a3
511 (p50) cmp.leu p51,p49=t[6],a3 };;
512 .pred.rel "mutex",p41,p43
513 .pred.rel "mutex",p49,p51
514{ .mfi; (p16) nop.m 0 // 4:
515 (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
516 (p41) add a4=a4,n4 } // (p17) a4+=n4
517{ .mfi; (p43) add a4=a4,n4,1
518 (p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
519 (p16) nop.i 0 };;
520{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
521 (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
522 (p51) add t[5]=t[5],a4,1 };;
523{ .mfi; (p16) nop.m 0 // 6:
524 (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
525 (p41) cmp.ltu p42,p40=a4,n4 }
526{ .mfi; (p43) cmp.leu p42,p40=a4,n4
527 (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
528 (p16) nop.i 0 };;
529{ .mii; (p17) getf.sig n6=nlo[7] // 7:
530 (p49) cmp.ltu p50,p48=t[5],a4
531 (p51) cmp.leu p50,p48=t[5],a4 };;
532 .pred.rel "mutex",p40,p42
533 .pred.rel "mutex",p48,p50
534{ .mfi; (p16) nop.m 0 // 8:
535 (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
536 (p40) add a5=a5,n5 } // (p17) a5+=n5
537{ .mfi; (p42) add a5=a5,n5,1
538 (p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
539 (p16) nop.i 0 };;
540{ .mii; (p16) getf.sig a1=alo[1] // 9:
541 (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
542 (p50) add t[4]=t[4],a5,1 };;
543{ .mfi; (p16) nop.m 0 // 10:
544 (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
545 (p40) cmp.ltu p43,p41=a5,n5 }
546{ .mfi; (p42) cmp.leu p43,p41=a5,n5
547 (p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
548 (p16) nop.i 0 };;
549{ .mii; (p17) getf.sig n7=nlo[8] // 11:
550 (p48) cmp.ltu p51,p49=t[4],a5
551 (p50) cmp.leu p51,p49=t[4],a5 };;
552 .pred.rel "mutex",p41,p43
553 .pred.rel "mutex",p49,p51
554{ .mfi; (p17) getf.sig n8=nhi[8] // 12:
555 (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
556 (p41) add a6=a6,n6 } // (p17) a6+=n6
557{ .mfi; (p43) add a6=a6,n6,1
558 (p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
559 (p16) nop.i 0 };;
560{ .mii; (p16) getf.sig a2=alo[2] // 13:
561 (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
562 (p51) add t[3]=t[3],a6,1 };;
563{ .mfi; (p16) nop.m 0 // 14:
564 (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
565 (p41) cmp.ltu p42,p40=a6,n6 }
566{ .mfi; (p43) cmp.leu p42,p40=a6,n6
567 (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
568 (p16) nop.i 0 };;
569{ .mii; (p16) nop.m 0 // 15:
570 (p49) cmp.ltu p50,p48=t[3],a6
571 (p51) cmp.leu p50,p48=t[3],a6 };;
572 .pred.rel "mutex",p40,p42
573 .pred.rel "mutex",p48,p50
574{ .mfi; (p16) nop.m 0 // 16:
575 (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
576 (p40) add a7=a7,n7 } // (p17) a7+=n7
577{ .mfi; (p42) add a7=a7,n7,1
578 (p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
579 (p16) nop.i 0 };;
580{ .mii; (p16) getf.sig a3=alo[3] // 17:
581 (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
582 (p50) add t[2]=t[2],a7,1 };;
583{ .mfi; (p16) nop.m 0 // 18:
584 (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
585 (p40) cmp.ltu p43,p41=a7,n7 }
586{ .mfi; (p42) cmp.leu p43,p41=a7,n7
587 (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
588 (p16) nop.i 0 };;
589{ .mii; (p16) getf.sig n1=nlo[1] // 19:
590 (p48) cmp.ltu p51,p49=t[2],a7
591 (p50) cmp.leu p51,p49=t[2],a7 };;
592 .pred.rel "mutex",p41,p43
593 .pred.rel "mutex",p49,p51
594{ .mfi; (p16) nop.m 0 // 20:
595 (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
596 (p41) add a8=a8,n8 } // (p17) a8+=n8
597{ .mfi; (p43) add a8=a8,n8,1
598 (p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
599 (p16) nop.i 0 };;
600{ .mii; (p16) getf.sig a4=alo[4] // 21:
601 (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
602 (p51) add t[1]=t[1],a8,1 };;
603{ .mfi; (p16) nop.m 0 // 22:
604 (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
605 (p41) cmp.ltu p42,p40=a8,n8 }
606{ .mfi; (p43) cmp.leu p42,p40=a8,n8
607 (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
608 (p16) nop.i 0 };;
609{ .mii; (p16) getf.sig n2=nlo[2] // 23:
610 (p49) cmp.ltu p50,p48=t[1],a8
611 (p51) cmp.leu p50,p48=t[1],a8 };;
// Cycles 24-27: start the additions of the current word.  t[0] is
// rebuilt from the two carries left by the previous word (p42, p50),
// t0 = t[7] + a1 + n1 is also copied to tf[0] for use as an xma
// addend, and the .unc compares restart both carry chains.
612{ .mfi; (p16) nop.m 0 // 24:
613 (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
614 (p16) add a1=a1,n1 } // (p16) a1+=n1
615{ .mfi; (p16) nop.m 0
616 (p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
617 (p17) mov t[0]=r0 };;
618{ .mii; (p16) getf.sig a5=alo[5] // 25:
619 (p16) add t0=t[7],a1 // (p16) t[7]+=a1
620 (p42) add t[0]=t[0],r0,1 };;
621{ .mfi; (p16) setf.sig tf[0]=t0 // 26:
622 (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
623 (p50) add t[0]=t[0],r0,1 }
624{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
625 (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
626 (p16) nop.i 0 };;
627{ .mii; (p16) getf.sig n3=nlo[3] // 27:
628 (p16) cmp.ltu.unc p50,p48=t0,a1
629 (p16) nop.i 0 };;
630 .pred.rel "mutex",p40,p42
631 .pred.rel "mutex",p48,p50
632{ .mfi; (p16) nop.m 0 // 28:
633 (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
634 (p40) add a2=a2,n2 } // (p16) a2+=n2
635{ .mfi; (p42) add a2=a2,n2,1
636 (p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
637 (p16) nop.i 0 };;
638{ .mii; (p16) getf.sig a6=alo[6] // 29:
639 (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
640 (p50) add t[6]=t[6],a2,1 };;
// The last two compares write p41/p39 and p49/p47.  After the
// rotation performed by br.ctop they are referenced as p42/p40 and
// p50/p48 at the top of the next iteration.
641{ .mfi; (p16) nop.m 0 // 30:
642 (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
643 (p40) cmp.ltu p41,p39=a2,n2 }
644{ .mfi; (p42) cmp.leu p41,p39=a2,n2
645 (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
646 (p16) nop.i 0 };;
647{ .mfi; (p16) getf.sig n4=nlo[4] // 31:
648 (p16) nop.f 0
649 (p48) cmp.ltu p49,p47=t[6],a2 }
650{ .mfb; (p50) cmp.leu p49,p47=t[6],a2
651 (p16) nop.f 0
652 br.ctop.sptk.many .Louter_8_ctop };;
653.Louter_8_cend:
654
655// above loop has to execute one more time, without (p16), which is
656// replaced with merged move of np[8] to GPR bank
//
// This is a straight-line copy of the (p17) and carry-predicated half
// of the loop body, finishing the additions for the last word of bp.
// The slots freed by the dropped (p16) instructions move ni0..ni7 into
// n1..n8 (n5..n8 only after their last use as product words) so that
// the subtraction that follows has np in integer registers.
657 .pred.rel "mutex",p40,p42
658 .pred.rel "mutex",p48,p50
659{ .mmi; (p0) getf.sig n1=ni0 // 0:
660 (p40) add a3=a3,n3 // (p17) a3+=n3
661 (p42) add a3=a3,n3,1 };;
662{ .mii; (p17) getf.sig a7=alo[8] // 1:
663 (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
664 (p50) add t[6]=t[6],a3,1 };;
665{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
666 (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
667 (p40) cmp.ltu p43,p41=a3,n3 }
668{ .mfi; (p42) cmp.leu p43,p41=a3,n3
669 (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
670 (p0) nop.i 0 };;
671{ .mii; (p17) getf.sig n5=nlo[6] // 3:
672 (p48) cmp.ltu p51,p49=t[6],a3
673 (p50) cmp.leu p51,p49=t[6],a3 };;
674 .pred.rel "mutex",p41,p43
675 .pred.rel "mutex",p49,p51
676{ .mmi; (p0) getf.sig n2=ni1 // 4:
677 (p41) add a4=a4,n4 // (p17) a4+=n4
678 (p43) add a4=a4,n4,1 };;
679{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
680 (p0) nop.f 0
681 (p51) add t[5]=t[5],a4,1 };;
682{ .mfi; (p0) getf.sig n3=ni2 // 6:
683 (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
684 (p41) cmp.ltu p42,p40=a4,n4 }
685{ .mfi; (p43) cmp.leu p42,p40=a4,n4
686 (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
687 (p0) nop.i 0 };;
688{ .mii; (p17) getf.sig n6=nlo[7] // 7:
689 (p49) cmp.ltu p50,p48=t[5],a4
690 (p51) cmp.leu p50,p48=t[5],a4 };;
691 .pred.rel "mutex",p40,p42
692 .pred.rel "mutex",p48,p50
693{ .mii; (p0) getf.sig n4=ni3 // 8:
694 (p40) add a5=a5,n5 // (p17) a5+=n5
695 (p42) add a5=a5,n5,1 };;
696{ .mii; (p0) nop.m 0 // 9:
697 (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
698 (p50) add t[4]=t[4],a5,1 };;
699{ .mii; (p0) nop.m 0 // 10:
700 (p40) cmp.ltu p43,p41=a5,n5
701 (p42) cmp.leu p43,p41=a5,n5 };;
702{ .mii; (p17) getf.sig n7=nlo[8] // 11:
703 (p48) cmp.ltu p51,p49=t[4],a5
704 (p50) cmp.leu p51,p49=t[4],a5 };;
705 .pred.rel "mutex",p41,p43
706 .pred.rel "mutex",p49,p51
707{ .mii; (p17) getf.sig n8=nhi[8] // 12:
708 (p41) add a6=a6,n6 // (p17) a6+=n6
709 (p43) add a6=a6,n6,1 };;
710{ .mii; (p0) getf.sig n5=ni4 // 13:
711 (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
712 (p51) add t[3]=t[3],a6,1 };;
713{ .mii; (p0) nop.m 0 // 14:
714 (p41) cmp.ltu p42,p40=a6,n6
715 (p43) cmp.leu p42,p40=a6,n6 };;
716{ .mii; (p0) getf.sig n6=ni5 // 15:
717 (p49) cmp.ltu p50,p48=t[3],a6
718 (p51) cmp.leu p50,p48=t[3],a6 };;
719 .pred.rel "mutex",p40,p42
720 .pred.rel "mutex",p48,p50
721{ .mii; (p0) nop.m 0 // 16:
722 (p40) add a7=a7,n7 // (p17) a7+=n7
723 (p42) add a7=a7,n7,1 };;
724{ .mii; (p0) nop.m 0 // 17:
725 (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
726 (p50) add t[2]=t[2],a7,1 };;
727{ .mii; (p0) nop.m 0 // 18:
728 (p40) cmp.ltu p43,p41=a7,n7
729 (p42) cmp.leu p43,p41=a7,n7 };;
730{ .mii; (p0) getf.sig n7=ni6 // 19:
731 (p48) cmp.ltu p51,p49=t[2],a7
732 (p50) cmp.leu p51,p49=t[2],a7 };;
733 .pred.rel "mutex",p41,p43
734 .pred.rel "mutex",p49,p51
735{ .mii; (p0) nop.m 0 // 20:
736 (p41) add a8=a8,n8 // (p17) a8+=n8
737 (p43) add a8=a8,n8,1 };;
738{ .mmi; (p0) nop.m 0 // 21:
739 (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
740 (p51) add t[1]=t[1],a8,1 }
741{ .mmi; (p17) mov t[0]=r0
742 (p41) cmp.ltu p42,p40=a8,n8
743 (p43) cmp.leu p42,p40=a8,n8 };;
// t[0] collects the final carries: p42 here, p50 in the first bundle
// of the subtraction.  r16 and r17 are set to the spill slots of f16
// and f17 for the epilogue.
744{ .mmi; (p0) getf.sig n8=ni7 // 22:
745 (p49) cmp.ltu p50,p48=t[1],a8
746 (p51) cmp.leu p50,p48=t[1],a8 }
747{ .mmi; (p42) add t[0]=t[0],r0,1
748 (p0) add r16=-7*16,prevsp
749 (p0) add r17=-6*16,prevsp };;
750
751// subtract np[8] from carrybit|tmp[8]
752// carrybit|tmp[8] layout upon exit from above loop is:
753// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
//
// The differences are written over n1..n8, lowest word first.  The
// borrow alternates between two predicate pairs: p34/p32 and p35/p33,
// where the first of each pair means "borrow".  The last step
// subtracts the borrow from the carry word t[0] into a8, so that the
// final p34 tells whether the whole subtraction went negative.
// r18 is set to the spill slot of f18 for the epilogue.
754{ .mmi; (p50)add t[0]=t[0],r0,1
755 add r18=-5*16,prevsp
756 sub n1=t0,n1 };;
757{ .mmi; cmp.gtu p34,p32=n1,t0;;
758 .pred.rel "mutex",p32,p34
759 (p32)sub n2=t[7],n2
760 (p34)sub n2=t[7],n2,1 };;
761{ .mii; (p32)cmp.gtu p35,p33=n2,t[7]
762 (p34)cmp.geu p35,p33=n2,t[7];;
763 .pred.rel "mutex",p33,p35
764 (p33)sub n3=t[6],n3 }
765{ .mmi; (p35)sub n3=t[6],n3,1;;
766 (p33)cmp.gtu p34,p32=n3,t[6]
767 (p35)cmp.geu p34,p32=n3,t[6] };;
768 .pred.rel "mutex",p32,p34
769{ .mii; (p32)sub n4=t[5],n4
770 (p34)sub n4=t[5],n4,1;;
771 (p32)cmp.gtu p35,p33=n4,t[5] }
772{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
773 .pred.rel "mutex",p33,p35
774 (p33)sub n5=t[4],n5
775 (p35)sub n5=t[4],n5,1 };;
776{ .mii; (p33)cmp.gtu p34,p32=n5,t[4]
777 (p35)cmp.geu p34,p32=n5,t[4];;
778 .pred.rel "mutex",p32,p34
779 (p32)sub n6=t[3],n6 }
780{ .mmi; (p34)sub n6=t[3],n6,1;;
781 (p32)cmp.gtu p35,p33=n6,t[3]
782 (p34)cmp.geu p35,p33=n6,t[3] };;
783 .pred.rel "mutex",p33,p35
784{ .mii; (p33)sub n7=t[2],n7
785 (p35)sub n7=t[2],n7,1;;
786 (p33)cmp.gtu p34,p32=n7,t[2] }
787{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
788 .pred.rel "mutex",p32,p34
789 (p32)sub n8=t[1],n8
790 (p34)sub n8=t[1],n8,1 };;
791{ .mii; (p32)cmp.gtu p35,p33=n8,t[1]
792 (p34)cmp.geu p35,p33=n8,t[1];;
793 .pred.rel "mutex",p33,p35
794 (p33)sub a8=t[0],r0 }
795{ .mmi; (p35)sub a8=t[0],r0,1;;
796 (p33)cmp.gtu p34,p32=a8,t[0]
797 (p35)cmp.geu p34,p32=a8,t[0] };;
798
799// save the result, either tmp[num] or tmp[num]-np[num]
//
// p32 (no final borrow) stores the differences n1..n8, p34 (final
// borrow) stores the unreduced words t0, t[7] .. t[1].  Only num words
// are written: after each store from the second one on, the
// complement predicate computed while loading the inputs
// (p5: num < 3, p7: num < 4, ... p15: num < 8) branches to .Ldone.
// r19 is set to the spill slot of f19 for the epilogue.
800 .pred.rel "mutex",p32,p34
801{ .mmi; (p32)st8 [rptr]=n1,8
802 (p34)st8 [rptr]=t0,8
803 add r19=-4*16,prevsp};;
804{ .mmb; (p32)st8 [rptr]=n2,8
805 (p34)st8 [rptr]=t[7],8
806 (p5)br.cond.dpnt.few .Ldone };;
807{ .mmb; (p32)st8 [rptr]=n3,8
808 (p34)st8 [rptr]=t[6],8
809 (p7)br.cond.dpnt.few .Ldone };;
810{ .mmb; (p32)st8 [rptr]=n4,8
811 (p34)st8 [rptr]=t[5],8
812 (p9)br.cond.dpnt.few .Ldone };;
813{ .mmb; (p32)st8 [rptr]=n5,8
814 (p34)st8 [rptr]=t[4],8
815 (p11)br.cond.dpnt.few .Ldone };;
816{ .mmb; (p32)st8 [rptr]=n6,8
817 (p34)st8 [rptr]=t[3],8
818 (p13)br.cond.dpnt.few .Ldone };;
819{ .mmb; (p32)st8 [rptr]=n7,8
820 (p34)st8 [rptr]=t[2],8
821 (p15)br.cond.dpnt.few .Ldone };;
822{ .mmb; (p32)st8 [rptr]=n8,8
823 (p34)st8 [rptr]=t[1],8
824 nop.b 0 };;
// Reload f16..f23 from the slots written in the prologue (r16..r19
// start at the slots of f16..f19 and advance by 64 to those of
// f20..f23), restore pr, ar.lc and sp, and return 1.
825.Ldone: // epilogue
826{ .mmi; ldf.fill f16=[r16],64
827 ldf.fill f17=[r17],64
828 nop.i 0 }
829{ .mmi; ldf.fill f18=[r18],64
830 ldf.fill f19=[r19],64
831 mov pr=prevpr,0x1ffff };;
832{ .mmi; ldf.fill f20=[r16]
833 ldf.fill f21=[r17]
834 mov ar.lc=prevlc }
835{ .mmi; ldf.fill f22=[r18]
836 ldf.fill f23=[r19]
837 mov ret0=1 } // signal "handled"
838{ .mib; rum 1<<5
839 .restore sp
840 mov sp=prevsp
841 br.ret.sptk.many b0 };;
842.endp bn_mul_mont_8#
843
// Attribution string placed in the current section after the code.
844.type copyright#,\@object
845copyright:
846stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
847___
848
# Emit the generated assembly.  The first command-line argument, if
# any, names the output file; otherwise the code goes to the inherited
# STDOUT.  A failure to open the file, or to flush and close the
# output, is fatal so that a missing or truncated file cannot go
# unnoticed by the build.
849if ($output=shift) { open STDOUT,">$output" or die "can't open $output: $!"; }
850print $code;
851close STDOUT or die "error closing STDOUT: $!";