summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/bn
diff options
context:
space:
mode:
authorjsing <>2016-09-04 14:06:46 +0000
committerjsing <>2016-09-04 14:06:46 +0000
commit392813b7d9ed86b80127b58bc6e108cc28530eca (patch)
tree8494faa8d6a64a635803db2bcff9d555fe5adcae /src/lib/libcrypto/bn
parentbecd55246777151f47f161f226165d6bbae02434 (diff)
downloadopenbsd-392813b7d9ed86b80127b58bc6e108cc28530eca.tar.gz
openbsd-392813b7d9ed86b80127b58bc6e108cc28530eca.tar.bz2
openbsd-392813b7d9ed86b80127b58bc6e108cc28530eca.zip
Less IA64.
ok deraadt@
Diffstat (limited to 'src/lib/libcrypto/bn')
-rw-r--r--src/lib/libcrypto/bn/asm/ia64-mont.pl851
-rw-r--r--src/lib/libcrypto/bn/asm/ia64.S1555
2 files changed, 0 insertions, 2406 deletions
diff --git a/src/lib/libcrypto/bn/asm/ia64-mont.pl b/src/lib/libcrypto/bn/asm/ia64-mont.pl
deleted file mode 100644
index e258658428..0000000000
--- a/src/lib/libcrypto/bn/asm/ia64-mont.pl
+++ /dev/null
@@ -1,851 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# January 2010
11#
12# "Teaser" Montgomery multiplication module for IA-64. There are
13# several possibilities for improvement:
14#
15# - modulo-scheduling outer loop would eliminate quite a number of
16# stalls after ldf8, xma and getf.sig outside inner loop and
17# improve shorter key performance;
18# - shorter vector support [with input vectors being fetched only
19# once] should be added;
20# - 2x unroll with help of n0[1] would make the code scalable on
21# "wider" IA-64, "wider" than Itanium 2 that is, which is not of
22# acute interest, because upcoming Tukwila's individual cores are
23# reportedly based on Itanium 2 design;
24# - dedicated squaring procedure(?);
25#
26# January 2010
27#
28# Shorter vector support is implemented by zero-padding ap and np
29# vectors up to 8 elements, or 512 bits. This means that 256-bit
30# inputs will be processed only 2 times faster than 512-bit inputs,
31# not 4 [as one would expect, because algorithm complexity is n^2].
32# The reason for padding is that inputs shorter than 512 bits won't
33# be processed faster anyway, because minimal critical path of the
34# core loop happens to match 512-bit timing. Either way, it resulted
35# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
36# 1024-bit one [in comparison to original version of *this* module].
37#
38# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
39# this module is:
40# sign verify sign/s verify/s
41# rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4
42# rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0
43# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
44# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
45# dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0
46# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
47# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
48#
49# ... and *without* (but still with ia64.S):
50#
51# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
52# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
53# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
54# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
55# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
56# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
57# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
58#
59# As it can be seen, RSA sign performance improves by 130-30%,
60# hereafter less for longer keys, while verify - by 74-13%.
61# DSA performance improves by 115-30%.
62
63if ($^O eq "hpux") {
64 $ADDP="addp4";
65 for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
66} else { $ADDP="add"; }
67
68$code=<<___;
69.explicit
70.text
71
72// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
73// const BN_ULONG *bp,const BN_ULONG *np,
74// const BN_ULONG *n0p,int num);
75.align 64
76.global bn_mul_mont#
77.proc bn_mul_mont#
78bn_mul_mont:
79 .prologue
80 .body
81{ .mmi; cmp4.le p6,p7=2,r37;;
82(p6) cmp4.lt.unc p8,p9=8,r37
83 mov ret0=r0 };;
84{ .bbb;
85(p9) br.cond.dptk.many bn_mul_mont_8
86(p8) br.cond.dpnt.many bn_mul_mont_general
87(p7) br.ret.spnt.many b0 };;
88.endp bn_mul_mont#
89
90prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
91
92rptr=r8; aptr=r9; bptr=r14; nptr=r15;
93tptr=r16; // &tp[0]
94tp_1=r17; // &tp[-1]
95num=r18; len=r19; lc=r20;
96topbit=r21; // carry bit from tmp[num]
97
98n0=f6;
99m0=f7;
100bi=f8;
101
102.align 64
103.local bn_mul_mont_general#
104.proc bn_mul_mont_general#
105bn_mul_mont_general:
106 .prologue
107{ .mmi; .save ar.pfs,prevfs
108 alloc prevfs=ar.pfs,6,2,0,8
109 $ADDP aptr=0,in1
110 .save ar.lc,prevlc
111 mov prevlc=ar.lc }
112{ .mmi; .vframe prevsp
113 mov prevsp=sp
114 $ADDP bptr=0,in2
115 .save pr,prevpr
116 mov prevpr=pr };;
117
118 .body
119 .rotf alo[6],nlo[4],ahi[8],nhi[6]
120 .rotr a[3],n[3],t[2]
121
122{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
123 ldf8 alo[4]=[aptr],16 // ap[0]
124 $ADDP r30=8,in1 };;
125{ .mmi; ldf8 alo[3]=[r30],16 // ap[1]
126 ldf8 alo[2]=[aptr],16 // ap[2]
127 $ADDP in4=0,in4 };;
128{ .mmi; ldf8 alo[1]=[r30] // ap[3]
129 ldf8 n0=[in4] // n0
130 $ADDP rptr=0,in0 }
131{ .mmi; $ADDP nptr=0,in3
132 mov r31=16
133 zxt4 num=in5 };;
134{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
135 shladd len=num,3,r0
136 shladd r31=num,3,r31 };;
137{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
138 add lc=-5,num
139 sub r31=sp,r31 };;
140{ .mfb; and sp=-16,r31 // alloca
141 xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
142 nop.b 0 }
143{ .mfb; nop.m 0
144 xmpy.lu alo[4]=alo[4],bi
145 brp.loop.imp .L1st_ctop,.L1st_cend-16
146 };;
147{ .mfi; nop.m 0
148 xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
149 add tp_1=8,sp }
150{ .mfi; nop.m 0
151 xma.lu alo[3]=alo[3],bi,ahi[2]
152 mov pr.rot=0x20001f<<16
153 // ------^----- (p40) at first (p23)
154 // ----------^^ p[16:20]=1
155 };;
156{ .mfi; nop.m 0
157 xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
158 mov ar.lc=lc }
159{ .mfi; nop.m 0
160 fcvt.fxu.s1 nhi[1]=f0
161 mov ar.ec=8 };;
162
163.align 32
164.L1st_ctop:
165.pred.rel "mutex",p40,p42
166{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
167 (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
168 (p40) add n[2]=n[2],a[2] } // (p23) }
169{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
170 (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
171 (p42) add n[2]=n[2],a[2],1 };; // (p23)
172{ .mfi; (p21) getf.sig a[0]=alo[5]
173 (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
174 (p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
175{ .mfi; (p23) st8 [tp_1]=n[2],8
176 (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
177 (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
178{ .mmb; (p21) getf.sig n[0]=nlo[3]
179 (p16) nop.m 0
180 br.ctop.sptk .L1st_ctop };;
181.L1st_cend:
182
183{ .mmi; getf.sig a[0]=ahi[6] // (p24)
184 getf.sig n[0]=nhi[4]
185 add num=-1,num };; // num--
186{ .mmi; .pred.rel "mutex",p40,p42
187(p40) add n[0]=n[0],a[0]
188(p42) add n[0]=n[0],a[0],1
189 sub aptr=aptr,len };; // rewind
190{ .mmi; .pred.rel "mutex",p40,p42
191(p40) cmp.ltu p41,p39=n[0],a[0]
192(p42) cmp.leu p41,p39=n[0],a[0]
193 sub nptr=nptr,len };;
194{ .mmi; .pred.rel "mutex",p39,p41
195(p39) add topbit=r0,r0
196(p41) add topbit=r0,r0,1
197 nop.i 0 }
198{ .mmi; st8 [tp_1]=n[0]
199 add tptr=16,sp
200 add tp_1=8,sp };;
201
202.Louter:
203{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
204 ldf8 ahi[3]=[tptr] // tp[0]
205 add r30=8,aptr };;
206{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
207 ldf8 alo[3]=[r30],16 // ap[1]
208 add r31=8,nptr };;
209{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
210 xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
211 brp.loop.imp .Linner_ctop,.Linner_cend-16
212 }
213{ .mfb; ldf8 alo[1]=[r30] // ap[3]
214 xma.lu alo[4]=alo[4],bi,ahi[3]
215 clrrrb.pr };;
216{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
217 xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
218 nop.i 0 }
219{ .mfi; ldf8 nlo[1]=[r31] // np[1]
220 xma.lu alo[3]=alo[3],bi,ahi[2]
221 mov pr.rot=0x20101f<<16
222 // ------^----- (p40) at first (p23)
223 // --------^--- (p30) at first (p22)
224 // ----------^^ p[16:20]=1
225 };;
226{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted
227 xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
228 mov ar.lc=lc }
229{ .mfi;
230 fcvt.fxu.s1 nhi[1]=f0
231 mov ar.ec=8 };;
232
233// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
234// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
235// in latter case accounts for two-tick pipeline stall, which means
236// that its performance would be ~20% lower than optimal one. No
237// attempt was made to address this, because original Itanium is
238// hardly represented out in the wild...
239.align 32
240.Linner_ctop:
241.pred.rel "mutex",p40,p42
242.pred.rel "mutex",p30,p32
243{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
244 (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
245 (p40) add n[2]=n[2],a[2] } // (p23)
246{ .mfi; (p16) nop.m 0
247 (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
248 (p42) add n[2]=n[2],a[2],1 };; // (p23)
249{ .mfi; (p21) getf.sig a[0]=alo[5]
250 (p16) nop.f 0
251 (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
252{ .mfi; (p21) ld8 t[0]=[tptr],8
253 (p16) nop.f 0
254 (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
255{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
256 (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
257 (p30) add a[1]=a[1],t[1] } // (p22)
258{ .mfi; (p16) nop.m 0
259 (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
260 (p32) add a[1]=a[1],t[1],1 };; // (p22)
261{ .mmi; (p21) getf.sig n[0]=nlo[3]
262 (p16) nop.m 0
263 (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
264{ .mmb; (p23) st8 [tp_1]=n[2],8
265 (p32) cmp.leu p31,p29=a[1],t[1] // (p22)
266 br.ctop.sptk .Linner_ctop };;
267.Linner_cend:
268
269{ .mmi; getf.sig a[0]=ahi[6] // (p24)
270 getf.sig n[0]=nhi[4]
271 nop.i 0 };;
272
273{ .mmi; .pred.rel "mutex",p31,p33
274(p31) add a[0]=a[0],topbit
275(p33) add a[0]=a[0],topbit,1
276 mov topbit=r0 };;
277{ .mfi; .pred.rel "mutex",p31,p33
278(p31) cmp.ltu p32,p30=a[0],topbit
279(p33) cmp.leu p32,p30=a[0],topbit
280 }
281{ .mfi; .pred.rel "mutex",p40,p42
282(p40) add n[0]=n[0],a[0]
283(p42) add n[0]=n[0],a[0],1
284 };;
285{ .mmi; .pred.rel "mutex",p44,p46
286(p40) cmp.ltu p41,p39=n[0],a[0]
287(p42) cmp.leu p41,p39=n[0],a[0]
288(p32) add topbit=r0,r0,1 }
289
290{ .mmi; st8 [tp_1]=n[0],8
291 cmp4.ne p6,p0=1,num
292 sub aptr=aptr,len };; // rewind
293{ .mmi; sub nptr=nptr,len
294(p41) add topbit=r0,r0,1
295 add tptr=16,sp }
296{ .mmb; add tp_1=8,sp
297 add num=-1,num // num--
298(p6) br.cond.sptk.many .Louter };;
299
300{ .mbb; add lc=4,lc
301 brp.loop.imp .Lsub_ctop,.Lsub_cend-16
302 clrrrb.pr };;
303{ .mii; nop.m 0
304 mov pr.rot=0x10001<<16
305 // ------^---- (p33) at first (p17)
306 mov ar.lc=lc }
307{ .mii; nop.m 0
308 mov ar.ec=3
309 nop.i 0 };;
310
311.Lsub_ctop:
312.pred.rel "mutex",p33,p35
313{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
314 (p16) nop.f 0
315 (p33) sub n[1]=t[1],n[1] } // (p17)
316{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
317 (p16) nop.f 0
318 (p35) sub n[1]=t[1],n[1],1 };; // (p17)
319{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
320 (p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
321 (p18) nop.b 0 }
322{ .mib; (p18) nop.m 0
323 (p35) cmp.geu p34,p32=n[1],t[1] // (p17)
324 br.ctop.sptk .Lsub_ctop };;
325.Lsub_cend:
326
327{ .mmb; .pred.rel "mutex",p34,p36
328(p34) sub topbit=topbit,r0 // (p19)
329(p36) sub topbit=topbit,r0,1
330 brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
331 }
332{ .mmb; sub rptr=rptr,len // rewind
333 sub tptr=tptr,len
334 clrrrb.pr };;
335{ .mmi; and aptr=tptr,topbit
336 andcm bptr=rptr,topbit
337 mov pr.rot=1<<16 };;
338{ .mii; or nptr=aptr,bptr
339 mov ar.lc=lc
340 mov ar.ec=3 };;
341
342.Lcopy_ctop:
343{ .mmb; (p16) ld8 n[0]=[nptr],8
344 (p18) st8 [tptr]=r0,8
345 (p16) nop.b 0 }
346{ .mmb; (p16) nop.m 0
347 (p18) st8 [rptr]=n[2],8
348 br.ctop.sptk .Lcopy_ctop };;
349.Lcopy_cend:
350
351{ .mmi; mov ret0=1 // signal "handled"
352 rum 1<<5 // clear um.mfh
353 mov ar.lc=prevlc }
354{ .mib; .restore sp
355 mov sp=prevsp
356 mov pr=prevpr,0x1ffff
357 br.ret.sptk.many b0 };;
358.endp bn_mul_mont_general#
359
360a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
361n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
362t0=r15;
363
364ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
365ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
366
367.align 64
368.skip 48 // aligns loop body
369.local bn_mul_mont_8#
370.proc bn_mul_mont_8#
371bn_mul_mont_8:
372 .prologue
373{ .mmi; .save ar.pfs,prevfs
374 alloc prevfs=ar.pfs,6,2,0,8
375 .vframe prevsp
376 mov prevsp=sp
377 .save ar.lc,prevlc
378 mov prevlc=ar.lc }
379{ .mmi; add r17=-6*16,sp
380 add sp=-7*16,sp
381 .save pr,prevpr
382 mov prevpr=pr };;
383
384{ .mmi; .save.gf 0,0x10
385 stf.spill [sp]=f16,-16
386 .save.gf 0,0x20
387 stf.spill [r17]=f17,32
388 add r16=-5*16,prevsp};;
389{ .mmi; .save.gf 0,0x40
390 stf.spill [r16]=f18,32
391 .save.gf 0,0x80
392 stf.spill [r17]=f19,32
393 $ADDP aptr=0,in1 };;
394{ .mmi; .save.gf 0,0x100
395 stf.spill [r16]=f20,32
396 .save.gf 0,0x200
397 stf.spill [r17]=f21,32
398 $ADDP r29=8,in1 };;
399{ .mmi; .save.gf 0,0x400
400 stf.spill [r16]=f22
401 .save.gf 0,0x800
402 stf.spill [r17]=f23
403 $ADDP rptr=0,in0 };;
404
405 .body
406 .rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
407 .rotr t[8]
408
409// load input vectors padding them to 8 elements
410{ .mmi; ldf8 ai0=[aptr],16 // ap[0]
411 ldf8 ai1=[r29],16 // ap[1]
412 $ADDP bptr=0,in2 }
413{ .mmi; $ADDP r30=8,in2
414 $ADDP nptr=0,in3
415 $ADDP r31=8,in3 };;
416{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
417 ldf8 bj[6]=[r30],16 // bp[1]
418 cmp4.le p4,p5=3,in5 }
419{ .mmi; ldf8 ni0=[nptr],16 // np[0]
420 ldf8 ni1=[r31],16 // np[1]
421 cmp4.le p6,p7=4,in5 };;
422
423{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
424 (p5)fcvt.fxu ai2=f0
425 cmp4.le p8,p9=5,in5 }
426{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
427 (p7)fcvt.fxu ai3=f0
428 cmp4.le p10,p11=6,in5 }
429{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
430 (p5)fcvt.fxu bj[5]=f0
431 cmp4.le p12,p13=7,in5 }
432{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
433 (p7)fcvt.fxu bj[4]=f0
434 cmp4.le p14,p15=8,in5 }
435{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
436 (p5)fcvt.fxu ni2=f0
437 addp4 r28=-1,in5 }
438{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
439 (p7)fcvt.fxu ni3=f0
440 $ADDP in4=0,in4 };;
441
442{ .mfi; ldf8 n0=[in4]
443 fcvt.fxu tf[1]=f0
444 nop.i 0 }
445
446{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
447 (p9)fcvt.fxu ai4=f0
448 mov t[0]=r0 }
449{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
450 (p11)fcvt.fxu ai5=f0
451 mov t[1]=r0 }
452{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
453 (p9)fcvt.fxu bj[3]=f0
454 mov t[2]=r0 }
455{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
456 (p11)fcvt.fxu bj[2]=f0
457 mov t[3]=r0 }
458{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
459 (p9)fcvt.fxu ni4=f0
460 mov t[4]=r0 }
461{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
462 (p11)fcvt.fxu ni5=f0
463 mov t[5]=r0 };;
464
465{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
466 (p13)fcvt.fxu ai6=f0
467 mov t[6]=r0 }
468{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
469 (p15)fcvt.fxu ai7=f0
470 mov t[7]=r0 }
471{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
472 (p13)fcvt.fxu bj[1]=f0
473 mov ar.lc=r28 }
474{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
475 (p15)fcvt.fxu bj[0]=f0
476 mov ar.ec=1 }
477{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
478 (p13)fcvt.fxu ni6=f0
479 mov pr.rot=1<<16 }
480{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
481 (p15)fcvt.fxu ni7=f0
482 brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
483 };;
484
485// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
486// to measure with help of Interval Time Counter indicated that the
487// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
488// addressing the issue is problematic, because I don't have access
489// to platform-specific instruction-level profiler. On Itanium it
490// should run in 56*n ticks, because of higher xma latency...
491.Louter_8_ctop:
492 .pred.rel "mutex",p40,p42
493 .pred.rel "mutex",p48,p50
494{ .mfi; (p16) nop.m 0 // 0:
495 (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
496 (p40) add a3=a3,n3 } // (p17) a3+=n3
497{ .mfi; (p42) add a3=a3,n3,1
498 (p16) xma.lu alo[0]=ai0,bj[7],tf[1]
499 (p16) nop.i 0 };;
500{ .mii; (p17) getf.sig a7=alo[8] // 1:
501 (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
502 (p50) add t[6]=t[6],a3,1 };;
503{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
504 (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
505 (p40) cmp.ltu p43,p41=a3,n3 }
506{ .mfi; (p42) cmp.leu p43,p41=a3,n3
507 (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
508 (p16) nop.i 0 };;
509{ .mii; (p17) getf.sig n5=nlo[6] // 3:
510 (p48) cmp.ltu p51,p49=t[6],a3
511 (p50) cmp.leu p51,p49=t[6],a3 };;
512 .pred.rel "mutex",p41,p43
513 .pred.rel "mutex",p49,p51
514{ .mfi; (p16) nop.m 0 // 4:
515 (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
516 (p41) add a4=a4,n4 } // (p17) a4+=n4
517{ .mfi; (p43) add a4=a4,n4,1
518 (p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
519 (p16) nop.i 0 };;
520{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
521 (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
522 (p51) add t[5]=t[5],a4,1 };;
523{ .mfi; (p16) nop.m 0 // 6:
524 (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
525 (p41) cmp.ltu p42,p40=a4,n4 }
526{ .mfi; (p43) cmp.leu p42,p40=a4,n4
527 (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
528 (p16) nop.i 0 };;
529{ .mii; (p17) getf.sig n6=nlo[7] // 7:
530 (p49) cmp.ltu p50,p48=t[5],a4
531 (p51) cmp.leu p50,p48=t[5],a4 };;
532 .pred.rel "mutex",p40,p42
533 .pred.rel "mutex",p48,p50
534{ .mfi; (p16) nop.m 0 // 8:
535 (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
536 (p40) add a5=a5,n5 } // (p17) a5+=n5
537{ .mfi; (p42) add a5=a5,n5,1
538 (p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
539 (p16) nop.i 0 };;
540{ .mii; (p16) getf.sig a1=alo[1] // 9:
541 (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
542 (p50) add t[4]=t[4],a5,1 };;
543{ .mfi; (p16) nop.m 0 // 10:
544 (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
545 (p40) cmp.ltu p43,p41=a5,n5 }
546{ .mfi; (p42) cmp.leu p43,p41=a5,n5
547 (p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
548 (p16) nop.i 0 };;
549{ .mii; (p17) getf.sig n7=nlo[8] // 11:
550 (p48) cmp.ltu p51,p49=t[4],a5
551 (p50) cmp.leu p51,p49=t[4],a5 };;
552 .pred.rel "mutex",p41,p43
553 .pred.rel "mutex",p49,p51
554{ .mfi; (p17) getf.sig n8=nhi[8] // 12:
555 (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
556 (p41) add a6=a6,n6 } // (p17) a6+=n6
557{ .mfi; (p43) add a6=a6,n6,1
558 (p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
559 (p16) nop.i 0 };;
560{ .mii; (p16) getf.sig a2=alo[2] // 13:
561 (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
562 (p51) add t[3]=t[3],a6,1 };;
563{ .mfi; (p16) nop.m 0 // 14:
564 (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
565 (p41) cmp.ltu p42,p40=a6,n6 }
566{ .mfi; (p43) cmp.leu p42,p40=a6,n6
567 (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
568 (p16) nop.i 0 };;
569{ .mii; (p16) nop.m 0 // 15:
570 (p49) cmp.ltu p50,p48=t[3],a6
571 (p51) cmp.leu p50,p48=t[3],a6 };;
572 .pred.rel "mutex",p40,p42
573 .pred.rel "mutex",p48,p50
574{ .mfi; (p16) nop.m 0 // 16:
575 (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
576 (p40) add a7=a7,n7 } // (p17) a7+=n7
577{ .mfi; (p42) add a7=a7,n7,1
578 (p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
579 (p16) nop.i 0 };;
580{ .mii; (p16) getf.sig a3=alo[3] // 17:
581 (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
582 (p50) add t[2]=t[2],a7,1 };;
583{ .mfi; (p16) nop.m 0 // 18:
584 (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
585 (p40) cmp.ltu p43,p41=a7,n7 }
586{ .mfi; (p42) cmp.leu p43,p41=a7,n7
587 (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
588 (p16) nop.i 0 };;
589{ .mii; (p16) getf.sig n1=nlo[1] // 19:
590 (p48) cmp.ltu p51,p49=t[2],a7
591 (p50) cmp.leu p51,p49=t[2],a7 };;
592 .pred.rel "mutex",p41,p43
593 .pred.rel "mutex",p49,p51
594{ .mfi; (p16) nop.m 0 // 20:
595 (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
596 (p41) add a8=a8,n8 } // (p17) a8+=n8
597{ .mfi; (p43) add a8=a8,n8,1
598 (p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
599 (p16) nop.i 0 };;
600{ .mii; (p16) getf.sig a4=alo[4] // 21:
601 (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
602 (p51) add t[1]=t[1],a8,1 };;
603{ .mfi; (p16) nop.m 0 // 22:
604 (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
605 (p41) cmp.ltu p42,p40=a8,n8 }
606{ .mfi; (p43) cmp.leu p42,p40=a8,n8
607 (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
608 (p16) nop.i 0 };;
609{ .mii; (p16) getf.sig n2=nlo[2] // 23:
610 (p49) cmp.ltu p50,p48=t[1],a8
611 (p51) cmp.leu p50,p48=t[1],a8 };;
612{ .mfi; (p16) nop.m 0 // 24:
613 (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
614 (p16) add a1=a1,n1 } // (p16) a1+=n1
615{ .mfi; (p16) nop.m 0
616 (p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
617 (p17) mov t[0]=r0 };;
618{ .mii; (p16) getf.sig a5=alo[5] // 25:
619 (p16) add t0=t[7],a1 // (p16) t[7]+=a1
620 (p42) add t[0]=t[0],r0,1 };;
621{ .mfi; (p16) setf.sig tf[0]=t0 // 26:
622 (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
623 (p50) add t[0]=t[0],r0,1 }
624{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
625 (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
626 (p16) nop.i 0 };;
627{ .mii; (p16) getf.sig n3=nlo[3] // 27:
628 (p16) cmp.ltu.unc p50,p48=t0,a1
629 (p16) nop.i 0 };;
630 .pred.rel "mutex",p40,p42
631 .pred.rel "mutex",p48,p50
632{ .mfi; (p16) nop.m 0 // 28:
633 (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
634 (p40) add a2=a2,n2 } // (p16) a2+=n2
635{ .mfi; (p42) add a2=a2,n2,1
636 (p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
637 (p16) nop.i 0 };;
638{ .mii; (p16) getf.sig a6=alo[6] // 29:
639 (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
640 (p50) add t[6]=t[6],a2,1 };;
641{ .mfi; (p16) nop.m 0 // 30:
642 (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
643 (p40) cmp.ltu p41,p39=a2,n2 }
644{ .mfi; (p42) cmp.leu p41,p39=a2,n2
645 (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
646 (p16) nop.i 0 };;
647{ .mfi; (p16) getf.sig n4=nlo[4] // 31:
648 (p16) nop.f 0
649 (p48) cmp.ltu p49,p47=t[6],a2 }
650{ .mfb; (p50) cmp.leu p49,p47=t[6],a2
651 (p16) nop.f 0
652 br.ctop.sptk.many .Louter_8_ctop };;
653.Louter_8_cend:
654
655// above loop has to execute one more time, without (p16), which is
656// replaced with merged move of np[8] to GPR bank
657 .pred.rel "mutex",p40,p42
658 .pred.rel "mutex",p48,p50
659{ .mmi; (p0) getf.sig n1=ni0 // 0:
660 (p40) add a3=a3,n3 // (p17) a3+=n3
661 (p42) add a3=a3,n3,1 };;
662{ .mii; (p17) getf.sig a7=alo[8] // 1:
663 (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
664 (p50) add t[6]=t[6],a3,1 };;
665{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
666 (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
667 (p40) cmp.ltu p43,p41=a3,n3 }
668{ .mfi; (p42) cmp.leu p43,p41=a3,n3
669 (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
670 (p0) nop.i 0 };;
671{ .mii; (p17) getf.sig n5=nlo[6] // 3:
672 (p48) cmp.ltu p51,p49=t[6],a3
673 (p50) cmp.leu p51,p49=t[6],a3 };;
674 .pred.rel "mutex",p41,p43
675 .pred.rel "mutex",p49,p51
676{ .mmi; (p0) getf.sig n2=ni1 // 4:
677 (p41) add a4=a4,n4 // (p17) a4+=n4
678 (p43) add a4=a4,n4,1 };;
679{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
680 (p0) nop.f 0
681 (p51) add t[5]=t[5],a4,1 };;
682{ .mfi; (p0) getf.sig n3=ni2 // 6:
683 (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
684 (p41) cmp.ltu p42,p40=a4,n4 }
685{ .mfi; (p43) cmp.leu p42,p40=a4,n4
686 (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
687 (p0) nop.i 0 };;
688{ .mii; (p17) getf.sig n6=nlo[7] // 7:
689 (p49) cmp.ltu p50,p48=t[5],a4
690 (p51) cmp.leu p50,p48=t[5],a4 };;
691 .pred.rel "mutex",p40,p42
692 .pred.rel "mutex",p48,p50
693{ .mii; (p0) getf.sig n4=ni3 // 8:
694 (p40) add a5=a5,n5 // (p17) a5+=n5
695 (p42) add a5=a5,n5,1 };;
696{ .mii; (p0) nop.m 0 // 9:
697 (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
698 (p50) add t[4]=t[4],a5,1 };;
699{ .mii; (p0) nop.m 0 // 10:
700 (p40) cmp.ltu p43,p41=a5,n5
701 (p42) cmp.leu p43,p41=a5,n5 };;
702{ .mii; (p17) getf.sig n7=nlo[8] // 11:
703 (p48) cmp.ltu p51,p49=t[4],a5
704 (p50) cmp.leu p51,p49=t[4],a5 };;
705 .pred.rel "mutex",p41,p43
706 .pred.rel "mutex",p49,p51
707{ .mii; (p17) getf.sig n8=nhi[8] // 12:
708 (p41) add a6=a6,n6 // (p17) a6+=n6
709 (p43) add a6=a6,n6,1 };;
710{ .mii; (p0) getf.sig n5=ni4 // 13:
711 (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
712 (p51) add t[3]=t[3],a6,1 };;
713{ .mii; (p0) nop.m 0 // 14:
714 (p41) cmp.ltu p42,p40=a6,n6
715 (p43) cmp.leu p42,p40=a6,n6 };;
716{ .mii; (p0) getf.sig n6=ni5 // 15:
717 (p49) cmp.ltu p50,p48=t[3],a6
718 (p51) cmp.leu p50,p48=t[3],a6 };;
719 .pred.rel "mutex",p40,p42
720 .pred.rel "mutex",p48,p50
721{ .mii; (p0) nop.m 0 // 16:
722 (p40) add a7=a7,n7 // (p17) a7+=n7
723 (p42) add a7=a7,n7,1 };;
724{ .mii; (p0) nop.m 0 // 17:
725 (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
726 (p50) add t[2]=t[2],a7,1 };;
727{ .mii; (p0) nop.m 0 // 18:
728 (p40) cmp.ltu p43,p41=a7,n7
729 (p42) cmp.leu p43,p41=a7,n7 };;
730{ .mii; (p0) getf.sig n7=ni6 // 19:
731 (p48) cmp.ltu p51,p49=t[2],a7
732 (p50) cmp.leu p51,p49=t[2],a7 };;
733 .pred.rel "mutex",p41,p43
734 .pred.rel "mutex",p49,p51
735{ .mii; (p0) nop.m 0 // 20:
736 (p41) add a8=a8,n8 // (p17) a8+=n8
737 (p43) add a8=a8,n8,1 };;
738{ .mmi; (p0) nop.m 0 // 21:
739 (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
740 (p51) add t[1]=t[1],a8,1 }
741{ .mmi; (p17) mov t[0]=r0
742 (p41) cmp.ltu p42,p40=a8,n8
743 (p43) cmp.leu p42,p40=a8,n8 };;
744{ .mmi; (p0) getf.sig n8=ni7 // 22:
745 (p49) cmp.ltu p50,p48=t[1],a8
746 (p51) cmp.leu p50,p48=t[1],a8 }
747{ .mmi; (p42) add t[0]=t[0],r0,1
748 (p0) add r16=-7*16,prevsp
749 (p0) add r17=-6*16,prevsp };;
750
751// subtract np[8] from carrybit|tmp[8]
752// carrybit|tmp[8] layout upon exit from above loop is:
753// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
754{ .mmi; (p50)add t[0]=t[0],r0,1
755 add r18=-5*16,prevsp
756 sub n1=t0,n1 };;
757{ .mmi; cmp.gtu p34,p32=n1,t0;;
758 .pred.rel "mutex",p32,p34
759 (p32)sub n2=t[7],n2
760 (p34)sub n2=t[7],n2,1 };;
761{ .mii; (p32)cmp.gtu p35,p33=n2,t[7]
762 (p34)cmp.geu p35,p33=n2,t[7];;
763 .pred.rel "mutex",p33,p35
764 (p33)sub n3=t[6],n3 }
765{ .mmi; (p35)sub n3=t[6],n3,1;;
766 (p33)cmp.gtu p34,p32=n3,t[6]
767 (p35)cmp.geu p34,p32=n3,t[6] };;
768 .pred.rel "mutex",p32,p34
769{ .mii; (p32)sub n4=t[5],n4
770 (p34)sub n4=t[5],n4,1;;
771 (p32)cmp.gtu p35,p33=n4,t[5] }
772{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
773 .pred.rel "mutex",p33,p35
774 (p33)sub n5=t[4],n5
775 (p35)sub n5=t[4],n5,1 };;
776{ .mii; (p33)cmp.gtu p34,p32=n5,t[4]
777 (p35)cmp.geu p34,p32=n5,t[4];;
778 .pred.rel "mutex",p32,p34
779 (p32)sub n6=t[3],n6 }
780{ .mmi; (p34)sub n6=t[3],n6,1;;
781 (p32)cmp.gtu p35,p33=n6,t[3]
782 (p34)cmp.geu p35,p33=n6,t[3] };;
783 .pred.rel "mutex",p33,p35
784{ .mii; (p33)sub n7=t[2],n7
785 (p35)sub n7=t[2],n7,1;;
786 (p33)cmp.gtu p34,p32=n7,t[2] }
787{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
788 .pred.rel "mutex",p32,p34
789 (p32)sub n8=t[1],n8
790 (p34)sub n8=t[1],n8,1 };;
791{ .mii; (p32)cmp.gtu p35,p33=n8,t[1]
792 (p34)cmp.geu p35,p33=n8,t[1];;
793 .pred.rel "mutex",p33,p35
794 (p33)sub a8=t[0],r0 }
795{ .mmi; (p35)sub a8=t[0],r0,1;;
796 (p33)cmp.gtu p34,p32=a8,t[0]
797 (p35)cmp.geu p34,p32=a8,t[0] };;
798
799// save the result, either tmp[num] or tmp[num]-np[num]
800 .pred.rel "mutex",p32,p34
801{ .mmi; (p32)st8 [rptr]=n1,8
802 (p34)st8 [rptr]=t0,8
803 add r19=-4*16,prevsp};;
804{ .mmb; (p32)st8 [rptr]=n2,8
805 (p34)st8 [rptr]=t[7],8
806 (p5)br.cond.dpnt.few .Ldone };;
807{ .mmb; (p32)st8 [rptr]=n3,8
808 (p34)st8 [rptr]=t[6],8
809 (p7)br.cond.dpnt.few .Ldone };;
810{ .mmb; (p32)st8 [rptr]=n4,8
811 (p34)st8 [rptr]=t[5],8
812 (p9)br.cond.dpnt.few .Ldone };;
813{ .mmb; (p32)st8 [rptr]=n5,8
814 (p34)st8 [rptr]=t[4],8
815 (p11)br.cond.dpnt.few .Ldone };;
816{ .mmb; (p32)st8 [rptr]=n6,8
817 (p34)st8 [rptr]=t[3],8
818 (p13)br.cond.dpnt.few .Ldone };;
819{ .mmb; (p32)st8 [rptr]=n7,8
820 (p34)st8 [rptr]=t[2],8
821 (p15)br.cond.dpnt.few .Ldone };;
822{ .mmb; (p32)st8 [rptr]=n8,8
823 (p34)st8 [rptr]=t[1],8
824 nop.b 0 };;
825.Ldone: // epilogue
826{ .mmi; ldf.fill f16=[r16],64
827 ldf.fill f17=[r17],64
828 nop.i 0 }
829{ .mmi; ldf.fill f18=[r18],64
830 ldf.fill f19=[r19],64
831 mov pr=prevpr,0x1ffff };;
832{ .mmi; ldf.fill f20=[r16]
833 ldf.fill f21=[r17]
834 mov ar.lc=prevlc }
835{ .mmi; ldf.fill f22=[r18]
836 ldf.fill f23=[r19]
837 mov ret0=1 } // signal "handled"
838{ .mib; rum 1<<5
839 .restore sp
840 mov sp=prevsp
841 br.ret.sptk.many b0 };;
842.endp bn_mul_mont_8#
843
844.type copyright#,\@object
845copyright:
846stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
847___
848
849$output=shift and open STDOUT,">$output";
850print $code;
851close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/ia64.S b/src/lib/libcrypto/bn/asm/ia64.S
deleted file mode 100644
index 0cf805ddc4..0000000000
--- a/src/lib/libcrypto/bn/asm/ia64.S
+++ /dev/null
@@ -1,1555 +0,0 @@
1.explicit
2.text
3.ident "ia64.S, Version 2.1"
4.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
5
6//
7// ====================================================================
8// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9// project.
10//
11// Rights for redistribution and usage in source and binary forms are
12// granted according to the OpenSSL license. Warranty of any kind is
13// disclaimed.
14// ====================================================================
15//
16// Version 2.x is Itanium2 re-tune. Few words about how Itanum2 is
17// different from Itanium to this module viewpoint. Most notably, is it
18// "wider" than Itanium? Can you experience loop scalability as
19// discussed in commentary sections? Not really:-( Itanium2 has 6
20// integer ALU ports, i.e. it's 2 ports wider, but it's not enough to
21// spin twice as fast, as I need 8 IALU ports. Amount of floating point
22// ports is the same, i.e. 2, while I need 4. In other words, to this
23// module Itanium2 remains effectively as "wide" as Itanium. Yet it's
24// essentially different in respect to this module, and a re-tune was
25// required. Well, because some intruction latencies has changed. Most
26// noticeably those intensively used:
27//
28// Itanium Itanium2
29// ldf8 9 6 L2 hit
30// ld8 2 1 L1 hit
31// getf 2 5
32// xma[->getf] 7[+1] 4[+0]
33// add[->st8] 1[+1] 1[+0]
34//
35// What does it mean? You might ratiocinate that the original code
36// should run just faster... Because sum of latencies is smaller...
37// Wrong! Note that getf latency increased. This means that if a loop is
38// scheduled for lower latency (as they were), then it will suffer from
39// stall condition and the code will therefore turn anti-scalable, e.g.
40// original bn_mul_words spun at 5*n or 2.5 times slower than expected
41// on Itanium2! What to do? Reschedule loops for Itanium2? But then
42// Itanium would exhibit anti-scalability. So I've chosen to reschedule
43// for worst latency for every instruction aiming for best *all-round*
44// performance.
45
46// Q. How much faster does it get?
47// A. Here is the output from 'openssl speed rsa dsa' for vanilla
48// 0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
49// Linux 7.1 2.96-81):
50//
51// sign verify sign/s verify/s
52// rsa 512 bits 0.0036s 0.0003s 275.3 2999.2
53// rsa 1024 bits 0.0203s 0.0011s 49.3 894.1
54// rsa 2048 bits 0.1331s 0.0040s 7.5 250.9
55// rsa 4096 bits 0.9270s 0.0147s 1.1 68.1
56// sign verify sign/s verify/s
57// dsa 512 bits 0.0035s 0.0043s 288.3 234.8
58// dsa 1024 bits 0.0111s 0.0135s 90.0 74.2
59//
60// And here is similar output but for this assembler
61// implementation:-)
62//
63// sign verify sign/s verify/s
64// rsa 512 bits 0.0021s 0.0001s 549.4 9638.5
65// rsa 1024 bits 0.0055s 0.0002s 183.8 4481.1
66// rsa 2048 bits 0.0244s 0.0006s 41.4 1726.3
67// rsa 4096 bits 0.1295s 0.0018s 7.7 561.5
68// sign verify sign/s verify/s
69// dsa 512 bits 0.0012s 0.0013s 891.9 756.6
70// dsa 1024 bits 0.0023s 0.0028s 440.4 376.2
71//
72// Yes, you may argue that it's not fair comparison as it's
73// possible to craft the C implementation with BN_UMULT_HIGH
74// inline assembler macro. But of course! Here is the output
75// with the macro:
76//
77// sign verify sign/s verify/s
78// rsa 512 bits 0.0020s 0.0002s 495.0 6561.0
79// rsa 1024 bits 0.0086s 0.0004s 116.2 2235.7
80// rsa 2048 bits 0.0519s 0.0015s 19.3 667.3
81// rsa 4096 bits 0.3464s 0.0053s 2.9 187.7
82// sign verify sign/s verify/s
83// dsa 512 bits 0.0016s 0.0020s 613.1 510.5
84// dsa 1024 bits 0.0045s 0.0054s 221.0 183.9
85//
86// My code is still way faster, huh:-) And I believe that even
87// higher performance can be achieved. Note that as keys get
88// longer, performance gain is larger. Why? According to the
89// profiler there is another player in the field, namely
90// BN_from_montgomery consuming larger and larger portion of CPU
91// time as keysize decreases. I therefore consider putting effort
92// to assembler implementation of the following routine:
93//
94// void bn_mul_add_mont (BN_ULONG *rp,BN_ULONG *np,int nl,BN_ULONG n0)
95// {
96// int i,j;
97// BN_ULONG v;
98//
99// for (i=0; i<nl; i++)
100// {
101// v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
102// nrp++;
103// rp++;
104// if (((nrp[-1]+=v)&BN_MASK2) < v)
105// for (j=0; ((++nrp[j])&BN_MASK2) == 0; j++) ;
106// }
107// }
108//
109// It might as well be beneficial to implement even combaX
110// variants, as it appears as it can literally unleash the
111// performance (see comment section to bn_mul_comba8 below).
112//
113// And finally for your reference the output for 0.9.6a compiled
114// with SGIcc version 0.01.0-12 (keep in mind that for the moment
115// of this writing it's not possible to convince SGIcc to use
116// BN_UMULT_HIGH inline assembler macro, yet the code is fast,
117// i.e. for a compiler generated one:-):
118//
119// sign verify sign/s verify/s
120// rsa 512 bits 0.0022s 0.0002s 452.7 5894.3
121// rsa 1024 bits 0.0097s 0.0005s 102.7 2002.9
122// rsa 2048 bits 0.0578s 0.0017s 17.3 600.2
123// rsa 4096 bits 0.3838s 0.0061s 2.6 164.5
124// sign verify sign/s verify/s
125// dsa 512 bits 0.0018s 0.0022s 547.3 459.6
126// dsa 1024 bits 0.0051s 0.0062s 196.6 161.3
127//
128// Oh! Benchmarks were performed on 733MHz Lion-class Itanium
129// system running Redhat Linux 7.1 (very special thanks to Ray
130// McCaffity of Williams Communications for providing an account).
131//
132// Q. What's the heck with 'rum 1<<5' at the end of every function?
133// A. Well, by clearing the "upper FP registers written" bit of the
134// User Mask I want to excuse the kernel from preserving upper
135// (f32-f128) FP register bank over process context switch, thus
136// minimizing bus bandwidth consumption during the switch (i.e.
137// after PKI opration completes and the program is off doing
138// something else like bulk symmetric encryption). Having said
139// this, I also want to point out that it might be good idea
140// to compile the whole toolkit (as well as majority of the
141// programs for that matter) with -mfixed-range=f32-f127 command
142// line option. No, it doesn't prevent the compiler from writing
143// to upper bank, but at least discourages to do so. If you don't
144// like the idea you have the option to compile the module with
145// -Drum=nop.m in command line.
146//
147
148#if defined(_HPUX_SOURCE) && !defined(_LP64)
149#define ADDP addp4
150#else
151#define ADDP add
152#endif
153
154#if 1
155//
156// bn_[add|sub]_words routines.
157//
158// Loops are spinning in 2*(n+5) ticks on Itanuim (provided that the
159// data reside in L1 cache, i.e. 2 ticks away). It's possible to
160// compress the epilogue and get down to 2*n+6, but at the cost of
161// scalability (the neat feature of this implementation is that it
162// shall automagically spin in n+5 on "wider" IA-64 implementations:-)
163// I consider that the epilogue is short enough as it is to trade tiny
164// performance loss on Itanium for scalability.
165//
166// BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
167//
168.global bn_add_words#
169.proc bn_add_words#
170.align 64
171.skip 32 // makes the loop body aligned at 64-byte boundary
172bn_add_words:
173 .prologue
174 .save ar.pfs,r2
175{ .mii; alloc r2=ar.pfs,4,12,0,16
176 cmp4.le p6,p0=r35,r0 };;
177{ .mfb; mov r8=r0 // return value
178(p6) br.ret.spnt.many b0 };;
179
180{ .mib; sub r10=r35,r0,1
181 .save ar.lc,r3
182 mov r3=ar.lc
183 brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16
184 }
185{ .mib; ADDP r14=0,r32 // rp
186 .save pr,r9
187 mov r9=pr };;
188 .body
189{ .mii; ADDP r15=0,r33 // ap
190 mov ar.lc=r10
191 mov ar.ec=6 }
192{ .mib; ADDP r16=0,r34 // bp
193 mov pr.rot=1<<16 };;
194
195.L_bn_add_words_ctop:
196{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
197 (p18) add r39=r37,r34
198 (p19) cmp.ltu.unc p56,p0=r40,r38 }
199{ .mfb; (p0) nop.m 0x0
200 (p0) nop.f 0x0
201 (p0) nop.b 0x0 }
202{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++)
203 (p58) cmp.eq.or p57,p0=-1,r41 // (p20)
204 (p58) add r41=1,r41 } // (p20)
205{ .mfb; (p21) st8 [r14]=r42,8 // *(rp++)=r
206 (p0) nop.f 0x0
207 br.ctop.sptk .L_bn_add_words_ctop };;
208.L_bn_add_words_cend:
209
210{ .mii;
211(p59) add r8=1,r8 // return value
212 mov pr=r9,0x1ffff
213 mov ar.lc=r3 }
214{ .mbb; nop.b 0x0
215 br.ret.sptk.many b0 };;
216.endp bn_add_words#
217
218//
219// BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
220//
221.global bn_sub_words#
222.proc bn_sub_words#
223.align 64
224.skip 32 // makes the loop body aligned at 64-byte boundary
225bn_sub_words:
226 .prologue
227 .save ar.pfs,r2
228{ .mii; alloc r2=ar.pfs,4,12,0,16
229 cmp4.le p6,p0=r35,r0 };;
230{ .mfb; mov r8=r0 // return value
231(p6) br.ret.spnt.many b0 };;
232
233{ .mib; sub r10=r35,r0,1
234 .save ar.lc,r3
235 mov r3=ar.lc
236 brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
237 }
238{ .mib; ADDP r14=0,r32 // rp
239 .save pr,r9
240 mov r9=pr };;
241 .body
242{ .mii; ADDP r15=0,r33 // ap
243 mov ar.lc=r10
244 mov ar.ec=6 }
245{ .mib; ADDP r16=0,r34 // bp
246 mov pr.rot=1<<16 };;
247
248.L_bn_sub_words_ctop:
249{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
250 (p18) sub r39=r37,r34
251 (p19) cmp.gtu.unc p56,p0=r40,r38 }
252{ .mfb; (p0) nop.m 0x0
253 (p0) nop.f 0x0
254 (p0) nop.b 0x0 }
255{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++)
256 (p58) cmp.eq.or p57,p0=0,r41 // (p20)
257 (p58) add r41=-1,r41 } // (p20)
258{ .mbb; (p21) st8 [r14]=r42,8 // *(rp++)=r
259 (p0) nop.b 0x0
260 br.ctop.sptk .L_bn_sub_words_ctop };;
261.L_bn_sub_words_cend:
262
263{ .mii;
264(p59) add r8=1,r8 // return value
265 mov pr=r9,0x1ffff
266 mov ar.lc=r3 }
267{ .mbb; nop.b 0x0
268 br.ret.sptk.many b0 };;
269.endp bn_sub_words#
270#endif
271
272#if 0
273#define XMA_TEMPTATION
274#endif
275
276#if 1
277//
278// BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
279//
280.global bn_mul_words#
281.proc bn_mul_words#
282.align 64
283.skip 32 // makes the loop body aligned at 64-byte boundary
284bn_mul_words:
285 .prologue
286 .save ar.pfs,r2
287#ifdef XMA_TEMPTATION
288{ .mfi; alloc r2=ar.pfs,4,0,0,0 };;
289#else
290{ .mfi; alloc r2=ar.pfs,4,12,0,16 };;
291#endif
292{ .mib; mov r8=r0 // return value
293 cmp4.le p6,p0=r34,r0
294(p6) br.ret.spnt.many b0 };;
295
296{ .mii; sub r10=r34,r0,1
297 .save ar.lc,r3
298 mov r3=ar.lc
299 .save pr,r9
300 mov r9=pr };;
301
302 .body
303{ .mib; setf.sig f8=r35 // w
304 mov pr.rot=0x800001<<16
305 // ------^----- serves as (p50) at first (p27)
306 brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
307 }
308
309#ifndef XMA_TEMPTATION
310
311{ .mmi; ADDP r14=0,r32 // rp
312 ADDP r15=0,r33 // ap
313 mov ar.lc=r10 }
314{ .mmi; mov r40=0 // serves as r35 at first (p27)
315 mov ar.ec=13 };;
316
317// This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
318// L2 cache (i.e. 9 ticks away) as floating point load/store instructions
319// bypass L1 cache and L2 latency is actually best-case scenario for
320// ldf8. The loop is not scalable and shall run in 2*(n+12) even on
321// "wider" IA-64 implementations. It's a trade-off here. n+24 loop
322// would give us ~5% in *overall* performance improvement on "wider"
323// IA-64, but would hurt Itanium for about same because of longer
324// epilogue. As it's a matter of few percents in either case I've
325// chosen to trade the scalability for development time (you can see
326// this very instruction sequence in bn_mul_add_words loop which in
327// turn is scalable).
328.L_bn_mul_words_ctop:
329{ .mfi; (p25) getf.sig r36=f52 // low
330 (p21) xmpy.lu f48=f37,f8
331 (p28) cmp.ltu p54,p50=r41,r39 }
332{ .mfi; (p16) ldf8 f32=[r15],8
333 (p21) xmpy.hu f40=f37,f8
334 (p0) nop.i 0x0 };;
335{ .mii; (p25) getf.sig r32=f44 // high
336 .pred.rel "mutex",p50,p54
337 (p50) add r40=r38,r35 // (p27)
338 (p54) add r40=r38,r35,1 } // (p27)
339{ .mfb; (p28) st8 [r14]=r41,8
340 (p0) nop.f 0x0
341 br.ctop.sptk .L_bn_mul_words_ctop };;
342.L_bn_mul_words_cend:
343
344{ .mii; nop.m 0x0
345.pred.rel "mutex",p51,p55
346(p51) add r8=r36,r0
347(p55) add r8=r36,r0,1 }
348{ .mfb; nop.m 0x0
349 nop.f 0x0
350 nop.b 0x0 }
351
352#else // XMA_TEMPTATION
353
354 setf.sig f37=r0 // serves as carry at (p18) tick
355 mov ar.lc=r10
356 mov ar.ec=5;;
357
358// Most of you examining this code very likely wonder why in the name
359// of Intel the following loop is commented out? Indeed, it looks so
360// neat that you find it hard to believe that it's something wrong
361// with it, right? The catch is that every iteration depends on the
362// result from previous one and the latter isn't available instantly.
363// The loop therefore spins at the latency of xma minus 1, or in other
364// words at 6*(n+4) ticks:-( Compare to the "production" loop above
365// that runs in 2*(n+11) where the low latency problem is worked around
366// by moving the dependency to one-tick latent interger ALU. Note that
367// "distance" between ldf8 and xma is not latency of ldf8, but the
368// *difference* between xma and ldf8 latencies.
369.L_bn_mul_words_ctop:
370{ .mfi; (p16) ldf8 f32=[r33],8
371 (p18) xma.hu f38=f34,f8,f39 }
372{ .mfb; (p20) stf8 [r32]=f37,8
373 (p18) xma.lu f35=f34,f8,f39
374 br.ctop.sptk .L_bn_mul_words_ctop };;
375.L_bn_mul_words_cend:
376
377 getf.sig r8=f41 // the return value
378
379#endif // XMA_TEMPTATION
380
381{ .mii; nop.m 0x0
382 mov pr=r9,0x1ffff
383 mov ar.lc=r3 }
384{ .mfb; rum 1<<5 // clear um.mfh
385 nop.f 0x0
386 br.ret.sptk.many b0 };;
387.endp bn_mul_words#
388#endif
389
390#if 1
391//
392// BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
393//
394.global bn_mul_add_words#
395.proc bn_mul_add_words#
396.align 64
397.skip 48 // makes the loop body aligned at 64-byte boundary
398bn_mul_add_words:
399 .prologue
400 .save ar.pfs,r2
401{ .mmi; alloc r2=ar.pfs,4,4,0,8
402 cmp4.le p6,p0=r34,r0
403 .save ar.lc,r3
404 mov r3=ar.lc };;
405{ .mib; mov r8=r0 // return value
406 sub r10=r34,r0,1
407(p6) br.ret.spnt.many b0 };;
408
409{ .mib; setf.sig f8=r35 // w
410 .save pr,r9
411 mov r9=pr
412 brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
413 }
414 .body
415{ .mmi; ADDP r14=0,r32 // rp
416 ADDP r15=0,r33 // ap
417 mov ar.lc=r10 }
418{ .mii; ADDP r16=0,r32 // rp copy
419 mov pr.rot=0x2001<<16
420 // ------^----- serves as (p40) at first (p27)
421 mov ar.ec=11 };;
422
423// This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
424// Itanium 2. Yes, unlike previous versions it scales:-) Previous
425// version was performing *all* additions in IALU and was starving
426// for those even on Itanium 2. In this version one addition is
427// moved to FPU and is folded with multiplication. This is at cost
428// of propogating the result from previous call to this subroutine
429// to L2 cache... In other words negligible even for shorter keys.
430// *Overall* performance improvement [over previous version] varies
431// from 11 to 22 percent depending on key length.
432.L_bn_mul_add_words_ctop:
433.pred.rel "mutex",p40,p42
434{ .mfi; (p23) getf.sig r36=f45 // low
435 (p20) xma.lu f42=f36,f8,f50 // low
436 (p40) add r39=r39,r35 } // (p27)
437{ .mfi; (p16) ldf8 f32=[r15],8 // *(ap++)
438 (p20) xma.hu f36=f36,f8,f50 // high
439 (p42) add r39=r39,r35,1 };; // (p27)
440{ .mmi; (p24) getf.sig r32=f40 // high
441 (p16) ldf8 f46=[r16],8 // *(rp1++)
442 (p40) cmp.ltu p41,p39=r39,r35 } // (p27)
443{ .mib; (p26) st8 [r14]=r39,8 // *(rp2++)
444 (p42) cmp.leu p41,p39=r39,r35 // (p27)
445 br.ctop.sptk .L_bn_mul_add_words_ctop};;
446.L_bn_mul_add_words_cend:
447
448{ .mmi; .pred.rel "mutex",p40,p42
449(p40) add r8=r35,r0
450(p42) add r8=r35,r0,1
451 mov pr=r9,0x1ffff }
452{ .mib; rum 1<<5 // clear um.mfh
453 mov ar.lc=r3
454 br.ret.sptk.many b0 };;
455.endp bn_mul_add_words#
456#endif
457
458#if 1
459//
460// void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
461//
462.global bn_sqr_words#
463.proc bn_sqr_words#
464.align 64
465.skip 32 // makes the loop body aligned at 64-byte boundary
466bn_sqr_words:
467 .prologue
468 .save ar.pfs,r2
469{ .mii; alloc r2=ar.pfs,3,0,0,0
470 sxt4 r34=r34 };;
471{ .mii; cmp.le p6,p0=r34,r0
472 mov r8=r0 } // return value
473{ .mfb; ADDP r32=0,r32
474 nop.f 0x0
475(p6) br.ret.spnt.many b0 };;
476
477{ .mii; sub r10=r34,r0,1
478 .save ar.lc,r3
479 mov r3=ar.lc
480 .save pr,r9
481 mov r9=pr };;
482
483 .body
484{ .mib; ADDP r33=0,r33
485 mov pr.rot=1<<16
486 brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
487 }
488{ .mii; add r34=8,r32
489 mov ar.lc=r10
490 mov ar.ec=18 };;
491
492// 2*(n+17) on Itanium, (n+17) on "wider" IA-64 implementations. It's
493// possible to compress the epilogue (I'm getting tired to write this
494// comment over and over) and get down to 2*n+16 at the cost of
495// scalability. The decision will very likely be reconsidered after the
496// benchmark program is profiled. I.e. if performance gain on Itanium
497// will appear larger than loss on "wider" IA-64, then the loop should
498// be explicitely split and the epilogue compressed.
499.L_bn_sqr_words_ctop:
500{ .mfi; (p16) ldf8 f32=[r33],8
501 (p25) xmpy.lu f42=f41,f41
502 (p0) nop.i 0x0 }
503{ .mib; (p33) stf8 [r32]=f50,16
504 (p0) nop.i 0x0
505 (p0) nop.b 0x0 }
506{ .mfi; (p0) nop.m 0x0
507 (p25) xmpy.hu f52=f41,f41
508 (p0) nop.i 0x0 }
509{ .mib; (p33) stf8 [r34]=f60,16
510 (p0) nop.i 0x0
511 br.ctop.sptk .L_bn_sqr_words_ctop };;
512.L_bn_sqr_words_cend:
513
514{ .mii; nop.m 0x0
515 mov pr=r9,0x1ffff
516 mov ar.lc=r3 }
517{ .mfb; rum 1<<5 // clear um.mfh
518 nop.f 0x0
519 br.ret.sptk.many b0 };;
520.endp bn_sqr_words#
521#endif
522
523#if 1
524// Apparently we win nothing by implementing special bn_sqr_comba8.
525// Yes, it is possible to reduce the number of multiplications by
526// almost factor of two, but then the amount of additions would
527// increase by factor of two (as we would have to perform those
528// otherwise performed by xma ourselves). Normally we would trade
529// anyway as multiplications are way more expensive, but not this
530// time... Multiplication kernel is fully pipelined and as we drain
531// one 128-bit multiplication result per clock cycle multiplications
532// are effectively as inexpensive as additions. Special implementation
533// might become of interest for "wider" IA-64 implementation as you'll
534// be able to get through the multiplication phase faster (there won't
535// be any stall issues as discussed in the commentary section below and
536// you therefore will be able to employ all 4 FP units)... But these
537// Itanium days it's simply too hard to justify the effort so I just
538// drop down to bn_mul_comba8 code:-)
539//
540// void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
541//
542.global bn_sqr_comba8#
543.proc bn_sqr_comba8#
544.align 64
545bn_sqr_comba8:
546 .prologue
547 .save ar.pfs,r2
548#if defined(_HPUX_SOURCE) && !defined(_LP64)
549{ .mii; alloc r2=ar.pfs,2,1,0,0
550 addp4 r33=0,r33
551 addp4 r32=0,r32 };;
552{ .mii;
553#else
554{ .mii; alloc r2=ar.pfs,2,1,0,0
555#endif
556 mov r34=r33
557 add r14=8,r33 };;
558 .body
559{ .mii; add r17=8,r34
560 add r15=16,r33
561 add r18=16,r34 }
562{ .mfb; add r16=24,r33
563 br .L_cheat_entry_point8 };;
564.endp bn_sqr_comba8#
565#endif
566
567#if 1
568// I've estimated this routine to run in ~120 ticks, but in reality
569// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
570// cycles consumed for instructions fetch? Or did I misinterpret some
571// clause in Itanium µ-architecture manual? Comments are welcomed and
572// highly appreciated.
573//
574// On Itanium 2 it takes ~190 ticks. This is because of stalls on
575// result from getf.sig. I do nothing about it at this point for
576// reasons depicted below.
577//
578// However! It should be noted that even 160 ticks is darn good result
579// as it's over 10 (yes, ten, spelled as t-e-n) times faster than the
580// C version (compiled with gcc with inline assembler). I really
581// kicked compiler's butt here, didn't I? Yeah! This brings us to the
582// following statement. It's damn shame that this routine isn't called
583// very often nowadays! According to the profiler most CPU time is
584// consumed by bn_mul_add_words called from BN_from_montgomery. In
585// order to estimate what we're missing, I've compared the performance
586// of this routine against "traditional" implementation, i.e. against
587// following routine:
588//
589// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
590// { r[ 8]=bn_mul_words( &(r[0]),a,8,b[0]);
591// r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
592// r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
593// r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
594// r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
595// r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
596// r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
597// r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
598// }
599//
600// The one below is over 8 times faster than the one above:-( Even
601// more reasons to "combafy" bn_mul_add_mont...
602//
603// And yes, this routine really made me wish there were an optimizing
604// assembler! It also feels like it deserves a dedication.
605//
606// To my wife for being there and to my kids...
607//
608// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
609//
610#define carry1 r14
611#define carry2 r15
612#define carry3 r34
613.global bn_mul_comba8#
614.proc bn_mul_comba8#
615.align 64
616bn_mul_comba8:
617 .prologue
618 .save ar.pfs,r2
619#if defined(_HPUX_SOURCE) && !defined(_LP64)
620{ .mii; alloc r2=ar.pfs,3,0,0,0
621 addp4 r33=0,r33
622 addp4 r34=0,r34 };;
623{ .mii; addp4 r32=0,r32
624#else
625{ .mii; alloc r2=ar.pfs,3,0,0,0
626#endif
627 add r14=8,r33
628 add r17=8,r34 }
629 .body
630{ .mii; add r15=16,r33
631 add r18=16,r34
632 add r16=24,r33 }
633.L_cheat_entry_point8:
634{ .mmi; add r19=24,r34
635
636 ldf8 f32=[r33],32 };;
637
638{ .mmi; ldf8 f120=[r34],32
639 ldf8 f121=[r17],32 }
640{ .mmi; ldf8 f122=[r18],32
641 ldf8 f123=[r19],32 };;
642{ .mmi; ldf8 f124=[r34]
643 ldf8 f125=[r17] }
644{ .mmi; ldf8 f126=[r18]
645 ldf8 f127=[r19] }
646
647{ .mmi; ldf8 f33=[r14],32
648 ldf8 f34=[r15],32 }
649{ .mmi; ldf8 f35=[r16],32;;
650 ldf8 f36=[r33] }
651{ .mmi; ldf8 f37=[r14]
652 ldf8 f38=[r15] }
653{ .mfi; ldf8 f39=[r16]
654// -------\ Entering multiplier's heaven /-------
655// ------------\ /------------
656// -----------------\ /-----------------
657// ----------------------\/----------------------
658 xma.hu f41=f32,f120,f0 }
659{ .mfi; xma.lu f40=f32,f120,f0 };; // (*)
660{ .mfi; xma.hu f51=f32,f121,f0 }
661{ .mfi; xma.lu f50=f32,f121,f0 };;
662{ .mfi; xma.hu f61=f32,f122,f0 }
663{ .mfi; xma.lu f60=f32,f122,f0 };;
664{ .mfi; xma.hu f71=f32,f123,f0 }
665{ .mfi; xma.lu f70=f32,f123,f0 };;
666{ .mfi; xma.hu f81=f32,f124,f0 }
667{ .mfi; xma.lu f80=f32,f124,f0 };;
668{ .mfi; xma.hu f91=f32,f125,f0 }
669{ .mfi; xma.lu f90=f32,f125,f0 };;
670{ .mfi; xma.hu f101=f32,f126,f0 }
671{ .mfi; xma.lu f100=f32,f126,f0 };;
672{ .mfi; xma.hu f111=f32,f127,f0 }
673{ .mfi; xma.lu f110=f32,f127,f0 };;//
674// (*) You can argue that splitting at every second bundle would
675// prevent "wider" IA-64 implementations from achieving the peak
676// performance. Well, not really... The catch is that if you
677// intend to keep 4 FP units busy by splitting at every fourth
678// bundle and thus perform these 16 multiplications in 4 ticks,
679// the first bundle *below* would stall because the result from
680// the first xma bundle *above* won't be available for another 3
681// ticks (if not more, being an optimist, I assume that "wider"
682// implementation will have same latency:-). This stall will hold
683// you back and the performance would be as if every second bundle
684// were split *anyway*...
685{ .mfi; getf.sig r16=f40
686 xma.hu f42=f33,f120,f41
687 add r33=8,r32 }
688{ .mfi; xma.lu f41=f33,f120,f41 };;
689{ .mfi; getf.sig r24=f50
690 xma.hu f52=f33,f121,f51 }
691{ .mfi; xma.lu f51=f33,f121,f51 };;
692{ .mfi; st8 [r32]=r16,16
693 xma.hu f62=f33,f122,f61 }
694{ .mfi; xma.lu f61=f33,f122,f61 };;
695{ .mfi; xma.hu f72=f33,f123,f71 }
696{ .mfi; xma.lu f71=f33,f123,f71 };;
697{ .mfi; xma.hu f82=f33,f124,f81 }
698{ .mfi; xma.lu f81=f33,f124,f81 };;
699{ .mfi; xma.hu f92=f33,f125,f91 }
700{ .mfi; xma.lu f91=f33,f125,f91 };;
701{ .mfi; xma.hu f102=f33,f126,f101 }
702{ .mfi; xma.lu f101=f33,f126,f101 };;
703{ .mfi; xma.hu f112=f33,f127,f111 }
704{ .mfi; xma.lu f111=f33,f127,f111 };;//
705//-------------------------------------------------//
706{ .mfi; getf.sig r25=f41
707 xma.hu f43=f34,f120,f42 }
708{ .mfi; xma.lu f42=f34,f120,f42 };;
709{ .mfi; getf.sig r16=f60
710 xma.hu f53=f34,f121,f52 }
711{ .mfi; xma.lu f52=f34,f121,f52 };;
712{ .mfi; getf.sig r17=f51
713 xma.hu f63=f34,f122,f62
714 add r25=r25,r24 }
715{ .mfi; xma.lu f62=f34,f122,f62
716 mov carry1=0 };;
717{ .mfi; cmp.ltu p6,p0=r25,r24
718 xma.hu f73=f34,f123,f72 }
719{ .mfi; xma.lu f72=f34,f123,f72 };;
720{ .mfi; st8 [r33]=r25,16
721 xma.hu f83=f34,f124,f82
722(p6) add carry1=1,carry1 }
723{ .mfi; xma.lu f82=f34,f124,f82 };;
724{ .mfi; xma.hu f93=f34,f125,f92 }
725{ .mfi; xma.lu f92=f34,f125,f92 };;
726{ .mfi; xma.hu f103=f34,f126,f102 }
727{ .mfi; xma.lu f102=f34,f126,f102 };;
728{ .mfi; xma.hu f113=f34,f127,f112 }
729{ .mfi; xma.lu f112=f34,f127,f112 };;//
730//-------------------------------------------------//
731{ .mfi; getf.sig r18=f42
732 xma.hu f44=f35,f120,f43
733 add r17=r17,r16 }
734{ .mfi; xma.lu f43=f35,f120,f43 };;
735{ .mfi; getf.sig r24=f70
736 xma.hu f54=f35,f121,f53 }
737{ .mfi; mov carry2=0
738 xma.lu f53=f35,f121,f53 };;
739{ .mfi; getf.sig r25=f61
740 xma.hu f64=f35,f122,f63
741 cmp.ltu p7,p0=r17,r16 }
742{ .mfi; add r18=r18,r17
743 xma.lu f63=f35,f122,f63 };;
744{ .mfi; getf.sig r26=f52
745 xma.hu f74=f35,f123,f73
746(p7) add carry2=1,carry2 }
747{ .mfi; cmp.ltu p7,p0=r18,r17
748 xma.lu f73=f35,f123,f73
749 add r18=r18,carry1 };;
750{ .mfi;
751 xma.hu f84=f35,f124,f83
752(p7) add carry2=1,carry2 }
753{ .mfi; cmp.ltu p7,p0=r18,carry1
754 xma.lu f83=f35,f124,f83 };;
755{ .mfi; st8 [r32]=r18,16
756 xma.hu f94=f35,f125,f93
757(p7) add carry2=1,carry2 }
758{ .mfi; xma.lu f93=f35,f125,f93 };;
759{ .mfi; xma.hu f104=f35,f126,f103 }
760{ .mfi; xma.lu f103=f35,f126,f103 };;
761{ .mfi; xma.hu f114=f35,f127,f113 }
762{ .mfi; mov carry1=0
763 xma.lu f113=f35,f127,f113
764 add r25=r25,r24 };;//
765//-------------------------------------------------//
766{ .mfi; getf.sig r27=f43
767 xma.hu f45=f36,f120,f44
768 cmp.ltu p6,p0=r25,r24 }
769{ .mfi; xma.lu f44=f36,f120,f44
770 add r26=r26,r25 };;
771{ .mfi; getf.sig r16=f80
772 xma.hu f55=f36,f121,f54
773(p6) add carry1=1,carry1 }
774{ .mfi; xma.lu f54=f36,f121,f54 };;
775{ .mfi; getf.sig r17=f71
776 xma.hu f65=f36,f122,f64
777 cmp.ltu p6,p0=r26,r25 }
778{ .mfi; xma.lu f64=f36,f122,f64
779 add r27=r27,r26 };;
780{ .mfi; getf.sig r18=f62
781 xma.hu f75=f36,f123,f74
782(p6) add carry1=1,carry1 }
783{ .mfi; cmp.ltu p6,p0=r27,r26
784 xma.lu f74=f36,f123,f74
785 add r27=r27,carry2 };;
786{ .mfi; getf.sig r19=f53
787 xma.hu f85=f36,f124,f84
788(p6) add carry1=1,carry1 }
789{ .mfi; xma.lu f84=f36,f124,f84
790 cmp.ltu p6,p0=r27,carry2 };;
791{ .mfi; st8 [r33]=r27,16
792 xma.hu f95=f36,f125,f94
793(p6) add carry1=1,carry1 }
794{ .mfi; xma.lu f94=f36,f125,f94 };;
795{ .mfi; xma.hu f105=f36,f126,f104 }
796{ .mfi; mov carry2=0
797 xma.lu f104=f36,f126,f104
798 add r17=r17,r16 };;
799{ .mfi; xma.hu f115=f36,f127,f114
800 cmp.ltu p7,p0=r17,r16 }
801{ .mfi; xma.lu f114=f36,f127,f114
802 add r18=r18,r17 };;//
803//-------------------------------------------------//
804{ .mfi; getf.sig r20=f44
805 xma.hu f46=f37,f120,f45
806(p7) add carry2=1,carry2 }
807{ .mfi; cmp.ltu p7,p0=r18,r17
808 xma.lu f45=f37,f120,f45
809 add r19=r19,r18 };;
810{ .mfi; getf.sig r24=f90
811 xma.hu f56=f37,f121,f55 }
812{ .mfi; xma.lu f55=f37,f121,f55 };;
813{ .mfi; getf.sig r25=f81
814 xma.hu f66=f37,f122,f65
815(p7) add carry2=1,carry2 }
816{ .mfi; cmp.ltu p7,p0=r19,r18
817 xma.lu f65=f37,f122,f65
818 add r20=r20,r19 };;
819{ .mfi; getf.sig r26=f72
820 xma.hu f76=f37,f123,f75
821(p7) add carry2=1,carry2 }
822{ .mfi; cmp.ltu p7,p0=r20,r19
823 xma.lu f75=f37,f123,f75
824 add r20=r20,carry1 };;
825{ .mfi; getf.sig r27=f63
826 xma.hu f86=f37,f124,f85
827(p7) add carry2=1,carry2 }
828{ .mfi; xma.lu f85=f37,f124,f85
829 cmp.ltu p7,p0=r20,carry1 };;
830{ .mfi; getf.sig r28=f54
831 xma.hu f96=f37,f125,f95
832(p7) add carry2=1,carry2 }
833{ .mfi; st8 [r32]=r20,16
834 xma.lu f95=f37,f125,f95 };;
835{ .mfi; xma.hu f106=f37,f126,f105 }
836{ .mfi; mov carry1=0
837 xma.lu f105=f37,f126,f105
838 add r25=r25,r24 };;
839{ .mfi; xma.hu f116=f37,f127,f115
840 cmp.ltu p6,p0=r25,r24 }
841{ .mfi; xma.lu f115=f37,f127,f115
842 add r26=r26,r25 };;//
843//-------------------------------------------------//
844{ .mfi; getf.sig r29=f45
845 xma.hu f47=f38,f120,f46
846(p6) add carry1=1,carry1 }
847{ .mfi; cmp.ltu p6,p0=r26,r25
848 xma.lu f46=f38,f120,f46
849 add r27=r27,r26 };;
850{ .mfi; getf.sig r16=f100
851 xma.hu f57=f38,f121,f56
852(p6) add carry1=1,carry1 }
853{ .mfi; cmp.ltu p6,p0=r27,r26
854 xma.lu f56=f38,f121,f56
855 add r28=r28,r27 };;
856{ .mfi; getf.sig r17=f91
857 xma.hu f67=f38,f122,f66
858(p6) add carry1=1,carry1 }
859{ .mfi; cmp.ltu p6,p0=r28,r27
860 xma.lu f66=f38,f122,f66
861 add r29=r29,r28 };;
862{ .mfi; getf.sig r18=f82
863 xma.hu f77=f38,f123,f76
864(p6) add carry1=1,carry1 }
865{ .mfi; cmp.ltu p6,p0=r29,r28
866 xma.lu f76=f38,f123,f76
867 add r29=r29,carry2 };;
868{ .mfi; getf.sig r19=f73
869 xma.hu f87=f38,f124,f86
870(p6) add carry1=1,carry1 }
871{ .mfi; xma.lu f86=f38,f124,f86
872 cmp.ltu p6,p0=r29,carry2 };;
873{ .mfi; getf.sig r20=f64
874 xma.hu f97=f38,f125,f96
875(p6) add carry1=1,carry1 }
876{ .mfi; st8 [r33]=r29,16
877 xma.lu f96=f38,f125,f96 };;
878{ .mfi; getf.sig r21=f55
879 xma.hu f107=f38,f126,f106 }
880{ .mfi; mov carry2=0
881 xma.lu f106=f38,f126,f106
882 add r17=r17,r16 };;
883{ .mfi; xma.hu f117=f38,f127,f116
884 cmp.ltu p7,p0=r17,r16 }
885{ .mfi; xma.lu f116=f38,f127,f116
886 add r18=r18,r17 };;//
887//-------------------------------------------------//
888{ .mfi; getf.sig r22=f46
889 xma.hu f48=f39,f120,f47
890(p7) add carry2=1,carry2 }
891{ .mfi; cmp.ltu p7,p0=r18,r17
892 xma.lu f47=f39,f120,f47
893 add r19=r19,r18 };;
894{ .mfi; getf.sig r24=f110
895 xma.hu f58=f39,f121,f57
896(p7) add carry2=1,carry2 }
897{ .mfi; cmp.ltu p7,p0=r19,r18
898 xma.lu f57=f39,f121,f57
899 add r20=r20,r19 };;
900{ .mfi; getf.sig r25=f101
901 xma.hu f68=f39,f122,f67
902(p7) add carry2=1,carry2 }
903{ .mfi; cmp.ltu p7,p0=r20,r19
904 xma.lu f67=f39,f122,f67
905 add r21=r21,r20 };;
906{ .mfi; getf.sig r26=f92
907 xma.hu f78=f39,f123,f77
908(p7) add carry2=1,carry2 }
909{ .mfi; cmp.ltu p7,p0=r21,r20
910 xma.lu f77=f39,f123,f77
911 add r22=r22,r21 };;
912{ .mfi; getf.sig r27=f83
913 xma.hu f88=f39,f124,f87
914(p7) add carry2=1,carry2 }
915{ .mfi; cmp.ltu p7,p0=r22,r21
916 xma.lu f87=f39,f124,f87
917 add r22=r22,carry1 };;
918{ .mfi; getf.sig r28=f74
919 xma.hu f98=f39,f125,f97
920(p7) add carry2=1,carry2 }
921{ .mfi; xma.lu f97=f39,f125,f97
922 cmp.ltu p7,p0=r22,carry1 };;
923{ .mfi; getf.sig r29=f65
924 xma.hu f108=f39,f126,f107
925(p7) add carry2=1,carry2 }
926{ .mfi; st8 [r32]=r22,16
927 xma.lu f107=f39,f126,f107 };;
928{ .mfi; getf.sig r30=f56
929 xma.hu f118=f39,f127,f117 }
930{ .mfi; xma.lu f117=f39,f127,f117 };;//
931//-------------------------------------------------//
932// Leaving muliplier's heaven... Quite a ride, huh?
933
934{ .mii; getf.sig r31=f47
935 add r25=r25,r24
936 mov carry1=0 };;
937{ .mii; getf.sig r16=f111
938 cmp.ltu p6,p0=r25,r24
939 add r26=r26,r25 };;
940{ .mfb; getf.sig r17=f102 }
941{ .mii;
942(p6) add carry1=1,carry1
943 cmp.ltu p6,p0=r26,r25
944 add r27=r27,r26 };;
945{ .mfb; nop.m 0x0 }
946{ .mii;
947(p6) add carry1=1,carry1
948 cmp.ltu p6,p0=r27,r26
949 add r28=r28,r27 };;
950{ .mii; getf.sig r18=f93
951 add r17=r17,r16
952 mov carry3=0 }
953{ .mii;
954(p6) add carry1=1,carry1
955 cmp.ltu p6,p0=r28,r27
956 add r29=r29,r28 };;
957{ .mii; getf.sig r19=f84
958 cmp.ltu p7,p0=r17,r16 }
959{ .mii;
960(p6) add carry1=1,carry1
961 cmp.ltu p6,p0=r29,r28
962 add r30=r30,r29 };;
963{ .mii; getf.sig r20=f75
964 add r18=r18,r17 }
965{ .mii;
966(p6) add carry1=1,carry1
967 cmp.ltu p6,p0=r30,r29
968 add r31=r31,r30 };;
969{ .mfb; getf.sig r21=f66 }
970{ .mii; (p7) add carry3=1,carry3
971 cmp.ltu p7,p0=r18,r17
972 add r19=r19,r18 }
973{ .mfb; nop.m 0x0 }
974{ .mii;
975(p6) add carry1=1,carry1
976 cmp.ltu p6,p0=r31,r30
977 add r31=r31,carry2 };;
978{ .mfb; getf.sig r22=f57 }
979{ .mii; (p7) add carry3=1,carry3
980 cmp.ltu p7,p0=r19,r18
981 add r20=r20,r19 }
982{ .mfb; nop.m 0x0 }
983{ .mii;
984(p6) add carry1=1,carry1
985 cmp.ltu p6,p0=r31,carry2 };;
986{ .mfb; getf.sig r23=f48 }
987{ .mii; (p7) add carry3=1,carry3
988 cmp.ltu p7,p0=r20,r19
989 add r21=r21,r20 }
990{ .mii;
991(p6) add carry1=1,carry1 }
992{ .mfb; st8 [r33]=r31,16 };;
993
994{ .mfb; getf.sig r24=f112 }
995{ .mii; (p7) add carry3=1,carry3
996 cmp.ltu p7,p0=r21,r20
997 add r22=r22,r21 };;
998{ .mfb; getf.sig r25=f103 }
999{ .mii; (p7) add carry3=1,carry3
1000 cmp.ltu p7,p0=r22,r21
1001 add r23=r23,r22 };;
1002{ .mfb; getf.sig r26=f94 }
1003{ .mii; (p7) add carry3=1,carry3
1004 cmp.ltu p7,p0=r23,r22
1005 add r23=r23,carry1 };;
1006{ .mfb; getf.sig r27=f85 }
1007{ .mii; (p7) add carry3=1,carry3
1008 cmp.ltu p7,p8=r23,carry1};;
1009{ .mii; getf.sig r28=f76
1010 add r25=r25,r24
1011 mov carry1=0 }
1012{ .mii; st8 [r32]=r23,16
1013 (p7) add carry2=1,carry3
1014 (p8) add carry2=0,carry3 };;
1015
1016{ .mfb; nop.m 0x0 }
1017{ .mii; getf.sig r29=f67
1018 cmp.ltu p6,p0=r25,r24
1019 add r26=r26,r25 };;
1020{ .mfb; getf.sig r30=f58 }
1021{ .mii;
1022(p6) add carry1=1,carry1
1023 cmp.ltu p6,p0=r26,r25
1024 add r27=r27,r26 };;
1025{ .mfb; getf.sig r16=f113 }
1026{ .mii;
1027(p6) add carry1=1,carry1
1028 cmp.ltu p6,p0=r27,r26
1029 add r28=r28,r27 };;
1030{ .mfb; getf.sig r17=f104 }
1031{ .mii;
1032(p6) add carry1=1,carry1
1033 cmp.ltu p6,p0=r28,r27
1034 add r29=r29,r28 };;
1035{ .mfb; getf.sig r18=f95 }
1036{ .mii;
1037(p6) add carry1=1,carry1
1038 cmp.ltu p6,p0=r29,r28
1039 add r30=r30,r29 };;
1040{ .mii; getf.sig r19=f86
1041 add r17=r17,r16
1042 mov carry3=0 }
1043{ .mii;
1044(p6) add carry1=1,carry1
1045 cmp.ltu p6,p0=r30,r29
1046 add r30=r30,carry2 };;
1047{ .mii; getf.sig r20=f77
1048 cmp.ltu p7,p0=r17,r16
1049 add r18=r18,r17 }
1050{ .mii;
1051(p6) add carry1=1,carry1
1052 cmp.ltu p6,p0=r30,carry2 };;
1053{ .mfb; getf.sig r21=f68 }
1054{ .mii; st8 [r33]=r30,16
1055(p6) add carry1=1,carry1 };;
1056
1057{ .mfb; getf.sig r24=f114 }
1058{ .mii; (p7) add carry3=1,carry3
1059 cmp.ltu p7,p0=r18,r17
1060 add r19=r19,r18 };;
1061{ .mfb; getf.sig r25=f105 }
1062{ .mii; (p7) add carry3=1,carry3
1063 cmp.ltu p7,p0=r19,r18
1064 add r20=r20,r19 };;
1065{ .mfb; getf.sig r26=f96 }
1066{ .mii; (p7) add carry3=1,carry3
1067 cmp.ltu p7,p0=r20,r19
1068 add r21=r21,r20 };;
1069{ .mfb; getf.sig r27=f87 }
1070{ .mii; (p7) add carry3=1,carry3
1071 cmp.ltu p7,p0=r21,r20
1072 add r21=r21,carry1 };;
1073{ .mib; getf.sig r28=f78
1074 add r25=r25,r24 }
1075{ .mib; (p7) add carry3=1,carry3
1076 cmp.ltu p7,p8=r21,carry1};;
1077{ .mii; st8 [r32]=r21,16
1078 (p7) add carry2=1,carry3
1079 (p8) add carry2=0,carry3 }
1080
1081{ .mii; mov carry1=0
1082 cmp.ltu p6,p0=r25,r24
1083 add r26=r26,r25 };;
1084{ .mfb; getf.sig r16=f115 }
1085{ .mii;
1086(p6) add carry1=1,carry1
1087 cmp.ltu p6,p0=r26,r25
1088 add r27=r27,r26 };;
1089{ .mfb; getf.sig r17=f106 }
1090{ .mii;
1091(p6) add carry1=1,carry1
1092 cmp.ltu p6,p0=r27,r26
1093 add r28=r28,r27 };;
1094{ .mfb; getf.sig r18=f97 }
1095{ .mii;
1096(p6) add carry1=1,carry1
1097 cmp.ltu p6,p0=r28,r27
1098 add r28=r28,carry2 };;
1099{ .mib; getf.sig r19=f88
1100 add r17=r17,r16 }
1101{ .mib;
1102(p6) add carry1=1,carry1
1103 cmp.ltu p6,p0=r28,carry2 };;
1104{ .mii; st8 [r33]=r28,16
1105(p6) add carry1=1,carry1 }
1106
1107{ .mii; mov carry2=0
1108 cmp.ltu p7,p0=r17,r16
1109 add r18=r18,r17 };;
1110{ .mfb; getf.sig r24=f116 }
1111{ .mii; (p7) add carry2=1,carry2
1112 cmp.ltu p7,p0=r18,r17
1113 add r19=r19,r18 };;
1114{ .mfb; getf.sig r25=f107 }
1115{ .mii; (p7) add carry2=1,carry2
1116 cmp.ltu p7,p0=r19,r18
1117 add r19=r19,carry1 };;
1118{ .mfb; getf.sig r26=f98 }
1119{ .mii; (p7) add carry2=1,carry2
1120 cmp.ltu p7,p0=r19,carry1};;
1121{ .mii; st8 [r32]=r19,16
1122 (p7) add carry2=1,carry2 }
1123
1124{ .mfb; add r25=r25,r24 };;
1125
1126{ .mfb; getf.sig r16=f117 }
1127{ .mii; mov carry1=0
1128 cmp.ltu p6,p0=r25,r24
1129 add r26=r26,r25 };;
1130{ .mfb; getf.sig r17=f108 }
1131{ .mii;
1132(p6) add carry1=1,carry1
1133 cmp.ltu p6,p0=r26,r25
1134 add r26=r26,carry2 };;
1135{ .mfb; nop.m 0x0 }
1136{ .mii;
1137(p6) add carry1=1,carry1
1138 cmp.ltu p6,p0=r26,carry2 };;
1139{ .mii; st8 [r33]=r26,16
1140(p6) add carry1=1,carry1 }
1141
1142{ .mfb; add r17=r17,r16 };;
1143{ .mfb; getf.sig r24=f118 }
1144{ .mii; mov carry2=0
1145 cmp.ltu p7,p0=r17,r16
1146 add r17=r17,carry1 };;
1147{ .mii; (p7) add carry2=1,carry2
1148 cmp.ltu p7,p0=r17,carry1};;
1149{ .mii; st8 [r32]=r17
1150 (p7) add carry2=1,carry2 };;
1151{ .mfb; add r24=r24,carry2 };;
1152{ .mib; st8 [r33]=r24 }
1153
1154{ .mib; rum 1<<5 // clear um.mfh
1155 br.ret.sptk.many b0 };;
1156.endp bn_mul_comba8#
1157#undef carry3
1158#undef carry2
1159#undef carry1
1160#endif
1161
1162#if 1
1163// It's possible to make it faster (see comment to bn_sqr_comba8), but
1164// I reckon it doesn't worth the effort. Basically because the routine
1165// (actually both of them) practically never called... So I just play
1166// same trick as with bn_sqr_comba8.
1167//
1168// void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
1169//
1170.global bn_sqr_comba4#
1171.proc bn_sqr_comba4#
1172.align 64
1173bn_sqr_comba4:
1174 .prologue
1175 .save ar.pfs,r2
1176#if defined(_HPUX_SOURCE) && !defined(_LP64)
1177{ .mii; alloc r2=ar.pfs,2,1,0,0
1178 addp4 r32=0,r32
1179 addp4 r33=0,r33 };;
1180{ .mii;
1181#else
1182{ .mii; alloc r2=ar.pfs,2,1,0,0
1183#endif
1184 mov r34=r33
1185 add r14=8,r33 };;
1186 .body
1187{ .mii; add r17=8,r34
1188 add r15=16,r33
1189 add r18=16,r34 }
1190{ .mfb; add r16=24,r33
1191 br .L_cheat_entry_point4 };;
1192.endp bn_sqr_comba4#
1193#endif
1194
1195#if 1
1196// Runs in ~115 cycles and ~4.5 times faster than C. Well, whatever...
1197//
1198// void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1199//
1200#define carry1 r14
1201#define carry2 r15
1202.global bn_mul_comba4#
1203.proc bn_mul_comba4#
1204.align 64
1205bn_mul_comba4:
1206 .prologue
1207 .save ar.pfs,r2
1208#if defined(_HPUX_SOURCE) && !defined(_LP64)
1209{ .mii; alloc r2=ar.pfs,3,0,0,0
1210 addp4 r33=0,r33
1211 addp4 r34=0,r34 };;
1212{ .mii; addp4 r32=0,r32
1213#else
1214{ .mii; alloc r2=ar.pfs,3,0,0,0
1215#endif
1216 add r14=8,r33
1217 add r17=8,r34 }
1218 .body
1219{ .mii; add r15=16,r33
1220 add r18=16,r34
1221 add r16=24,r33 };;
1222.L_cheat_entry_point4:
1223{ .mmi; add r19=24,r34
1224
1225 ldf8 f32=[r33] }
1226
1227{ .mmi; ldf8 f120=[r34]
1228 ldf8 f121=[r17] };;
1229{ .mmi; ldf8 f122=[r18]
1230 ldf8 f123=[r19] }
1231
1232{ .mmi; ldf8 f33=[r14]
1233 ldf8 f34=[r15] }
1234{ .mfi; ldf8 f35=[r16]
1235
1236 xma.hu f41=f32,f120,f0 }
1237{ .mfi; xma.lu f40=f32,f120,f0 };;
1238{ .mfi; xma.hu f51=f32,f121,f0 }
1239{ .mfi; xma.lu f50=f32,f121,f0 };;
1240{ .mfi; xma.hu f61=f32,f122,f0 }
1241{ .mfi; xma.lu f60=f32,f122,f0 };;
1242{ .mfi; xma.hu f71=f32,f123,f0 }
1243{ .mfi; xma.lu f70=f32,f123,f0 };;//
1244// Major stall takes place here, and 3 more places below. Result from
1245// first xma is not available for another 3 ticks.
1246{ .mfi; getf.sig r16=f40
1247 xma.hu f42=f33,f120,f41
1248 add r33=8,r32 }
1249{ .mfi; xma.lu f41=f33,f120,f41 };;
1250{ .mfi; getf.sig r24=f50
1251 xma.hu f52=f33,f121,f51 }
1252{ .mfi; xma.lu f51=f33,f121,f51 };;
1253{ .mfi; st8 [r32]=r16,16
1254 xma.hu f62=f33,f122,f61 }
1255{ .mfi; xma.lu f61=f33,f122,f61 };;
1256{ .mfi; xma.hu f72=f33,f123,f71 }
1257{ .mfi; xma.lu f71=f33,f123,f71 };;//
1258//-------------------------------------------------//
1259{ .mfi; getf.sig r25=f41
1260 xma.hu f43=f34,f120,f42 }
1261{ .mfi; xma.lu f42=f34,f120,f42 };;
1262{ .mfi; getf.sig r16=f60
1263 xma.hu f53=f34,f121,f52 }
1264{ .mfi; xma.lu f52=f34,f121,f52 };;
1265{ .mfi; getf.sig r17=f51
1266 xma.hu f63=f34,f122,f62
1267 add r25=r25,r24 }
1268{ .mfi; mov carry1=0
1269 xma.lu f62=f34,f122,f62 };;
1270{ .mfi; st8 [r33]=r25,16
1271 xma.hu f73=f34,f123,f72
1272 cmp.ltu p6,p0=r25,r24 }
1273{ .mfi; xma.lu f72=f34,f123,f72 };;//
1274//-------------------------------------------------//
1275{ .mfi; getf.sig r18=f42
1276 xma.hu f44=f35,f120,f43
1277(p6) add carry1=1,carry1 }
1278{ .mfi; add r17=r17,r16
1279 xma.lu f43=f35,f120,f43
1280 mov carry2=0 };;
1281{ .mfi; getf.sig r24=f70
1282 xma.hu f54=f35,f121,f53
1283 cmp.ltu p7,p0=r17,r16 }
1284{ .mfi; xma.lu f53=f35,f121,f53 };;
1285{ .mfi; getf.sig r25=f61
1286 xma.hu f64=f35,f122,f63
1287 add r18=r18,r17 }
1288{ .mfi; xma.lu f63=f35,f122,f63
1289(p7) add carry2=1,carry2 };;
1290{ .mfi; getf.sig r26=f52
1291 xma.hu f74=f35,f123,f73
1292 cmp.ltu p7,p0=r18,r17 }
1293{ .mfi; xma.lu f73=f35,f123,f73
1294 add r18=r18,carry1 };;
1295//-------------------------------------------------//
1296{ .mii; st8 [r32]=r18,16
1297(p7) add carry2=1,carry2
1298 cmp.ltu p7,p0=r18,carry1 };;
1299
1300{ .mfi; getf.sig r27=f43 // last major stall
1301(p7) add carry2=1,carry2 };;
1302{ .mii; getf.sig r16=f71
1303 add r25=r25,r24
1304 mov carry1=0 };;
1305{ .mii; getf.sig r17=f62
1306 cmp.ltu p6,p0=r25,r24
1307 add r26=r26,r25 };;
1308{ .mii;
1309(p6) add carry1=1,carry1
1310 cmp.ltu p6,p0=r26,r25
1311 add r27=r27,r26 };;
1312{ .mii;
1313(p6) add carry1=1,carry1
1314 cmp.ltu p6,p0=r27,r26
1315 add r27=r27,carry2 };;
1316{ .mii; getf.sig r18=f53
1317(p6) add carry1=1,carry1
1318 cmp.ltu p6,p0=r27,carry2 };;
1319{ .mfi; st8 [r33]=r27,16
1320(p6) add carry1=1,carry1 }
1321
1322{ .mii; getf.sig r19=f44
1323 add r17=r17,r16
1324 mov carry2=0 };;
1325{ .mii; getf.sig r24=f72
1326 cmp.ltu p7,p0=r17,r16
1327 add r18=r18,r17 };;
1328{ .mii; (p7) add carry2=1,carry2
1329 cmp.ltu p7,p0=r18,r17
1330 add r19=r19,r18 };;
1331{ .mii; (p7) add carry2=1,carry2
1332 cmp.ltu p7,p0=r19,r18
1333 add r19=r19,carry1 };;
1334{ .mii; getf.sig r25=f63
1335 (p7) add carry2=1,carry2
1336 cmp.ltu p7,p0=r19,carry1};;
1337{ .mii; st8 [r32]=r19,16
1338 (p7) add carry2=1,carry2 }
1339
1340{ .mii; getf.sig r26=f54
1341 add r25=r25,r24
1342 mov carry1=0 };;
1343{ .mii; getf.sig r16=f73
1344 cmp.ltu p6,p0=r25,r24
1345 add r26=r26,r25 };;
1346{ .mii;
1347(p6) add carry1=1,carry1
1348 cmp.ltu p6,p0=r26,r25
1349 add r26=r26,carry2 };;
1350{ .mii; getf.sig r17=f64
1351(p6) add carry1=1,carry1
1352 cmp.ltu p6,p0=r26,carry2 };;
1353{ .mii; st8 [r33]=r26,16
1354(p6) add carry1=1,carry1 }
1355
1356{ .mii; getf.sig r24=f74
1357 add r17=r17,r16
1358 mov carry2=0 };;
1359{ .mii; cmp.ltu p7,p0=r17,r16
1360 add r17=r17,carry1 };;
1361
1362{ .mii; (p7) add carry2=1,carry2
1363 cmp.ltu p7,p0=r17,carry1};;
1364{ .mii; st8 [r32]=r17,16
1365 (p7) add carry2=1,carry2 };;
1366
1367{ .mii; add r24=r24,carry2 };;
1368{ .mii; st8 [r33]=r24 }
1369
1370{ .mib; rum 1<<5 // clear um.mfh
1371 br.ret.sptk.many b0 };;
1372.endp bn_mul_comba4#
1373#undef carry2
1374#undef carry1
1375#endif
1376
1377#if 1
1378//
1379// BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
1380//
1381// In the nutshell it's a port of my MIPS III/IV implementation.
1382//
1383#define AT r14
1384#define H r16
1385#define HH r20
1386#define L r17
1387#define D r18
1388#define DH r22
1389#define I r21
1390
1391#if 0
1392// Some preprocessors (most notably HP-UX) appear to be allergic to
1393// macros enclosed to parenthesis [as these three were].
1394#define cont p16
1395#define break p0 // p20
1396#define equ p24
1397#else
1398cont=p16
1399break=p0
1400equ=p24
1401#endif
1402
1403.global abort#
1404.global bn_div_words#
1405.proc bn_div_words#
1406.align 64
1407bn_div_words:
1408 .prologue
1409 .save ar.pfs,r2
1410{ .mii; alloc r2=ar.pfs,3,5,0,8
1411 .save b0,r3
1412 mov r3=b0
1413 .save pr,r10
1414 mov r10=pr };;
1415{ .mmb; cmp.eq p6,p0=r34,r0
1416 mov r8=-1
1417(p6) br.ret.spnt.many b0 };;
1418
1419 .body
1420{ .mii; mov H=r32 // save h
1421 mov ar.ec=0 // don't rotate at exit
1422 mov pr.rot=0 }
1423{ .mii; mov L=r33 // save l
1424 mov r36=r0 };;
1425
1426.L_divw_shift: // -vv- note signed comparison
1427{ .mfi; (p0) cmp.lt p16,p0=r0,r34 // d
1428 (p0) shladd r33=r34,1,r0 }
1429{ .mfb; (p0) add r35=1,r36
1430 (p0) nop.f 0x0
1431(p16) br.wtop.dpnt .L_divw_shift };;
1432
1433{ .mii; mov D=r34
1434 shr.u DH=r34,32
1435 sub r35=64,r36 };;
1436{ .mii; setf.sig f7=DH
1437 shr.u AT=H,r35
1438 mov I=r36 };;
1439{ .mib; cmp.ne p6,p0=r0,AT
1440 shl H=H,r36
1441(p6) br.call.spnt.clr b0=abort };; // overflow, die...
1442
1443{ .mfi; fcvt.xuf.s1 f7=f7
1444 shr.u AT=L,r35 };;
1445{ .mii; shl L=L,r36
1446 or H=H,AT };;
1447
1448{ .mii; nop.m 0x0
1449 cmp.leu p6,p0=D,H;;
1450(p6) sub H=H,D }
1451
1452{ .mlx; setf.sig f14=D
1453 movl AT=0xffffffff };;
1454///////////////////////////////////////////////////////////
1455{ .mii; setf.sig f6=H
1456 shr.u HH=H,32;;
1457 cmp.eq p6,p7=HH,DH };;
1458{ .mfb;
1459(p6) setf.sig f8=AT
1460(p7) fcvt.xuf.s1 f6=f6
1461(p7) br.call.sptk b6=.L_udiv64_32_b6 };;
1462
1463{ .mfi; getf.sig r33=f8 // q
1464 xmpy.lu f9=f8,f14 }
1465{ .mfi; xmpy.hu f10=f8,f14
1466 shrp H=H,L,32 };;
1467
1468{ .mmi; getf.sig r35=f9 // tl
1469 getf.sig r31=f10 };; // th
1470
1471.L_divw_1st_iter:
1472{ .mii; (p0) add r32=-1,r33
1473 (p0) cmp.eq equ,cont=HH,r31 };;
1474{ .mii; (p0) cmp.ltu p8,p0=r35,D
1475 (p0) sub r34=r35,D
1476 (equ) cmp.leu break,cont=r35,H };;
1477{ .mib; (cont) cmp.leu cont,break=HH,r31
1478 (p8) add r31=-1,r31
1479(cont) br.wtop.spnt .L_divw_1st_iter };;
1480///////////////////////////////////////////////////////////
1481{ .mii; sub H=H,r35
1482 shl r8=r33,32
1483 shl L=L,32 };;
1484///////////////////////////////////////////////////////////
1485{ .mii; setf.sig f6=H
1486 shr.u HH=H,32;;
1487 cmp.eq p6,p7=HH,DH };;
1488{ .mfb;
1489(p6) setf.sig f8=AT
1490(p7) fcvt.xuf.s1 f6=f6
1491(p7) br.call.sptk b6=.L_udiv64_32_b6 };;
1492
1493{ .mfi; getf.sig r33=f8 // q
1494 xmpy.lu f9=f8,f14 }
1495{ .mfi; xmpy.hu f10=f8,f14
1496 shrp H=H,L,32 };;
1497
1498{ .mmi; getf.sig r35=f9 // tl
1499 getf.sig r31=f10 };; // th
1500
1501.L_divw_2nd_iter:
1502{ .mii; (p0) add r32=-1,r33
1503 (p0) cmp.eq equ,cont=HH,r31 };;
1504{ .mii; (p0) cmp.ltu p8,p0=r35,D
1505 (p0) sub r34=r35,D
1506 (equ) cmp.leu break,cont=r35,H };;
1507{ .mib; (cont) cmp.leu cont,break=HH,r31
1508 (p8) add r31=-1,r31
1509(cont) br.wtop.spnt .L_divw_2nd_iter };;
1510///////////////////////////////////////////////////////////
1511{ .mii; sub H=H,r35
1512 or r8=r8,r33
1513 mov ar.pfs=r2 };;
1514{ .mii; shr.u r9=H,I // remainder if anybody wants it
1515 mov pr=r10,0x1ffff }
1516{ .mfb; br.ret.sptk.many b0 };;
1517
1518// Unsigned 64 by 32 (well, by 64 for the moment) bit integer division
1519// procedure.
1520//
1521// inputs: f6 = (double)a, f7 = (double)b
1522// output: f8 = (int)(a/b)
1523// clobbered: f8,f9,f10,f11,pred
1524pred=p15
1525// One can argue that this snippet is copyrighted to Intel
1526// Corporation, as it's essentially identical to one of those
1527// found in "Divide, Square Root and Remainder" section at
1528// http://www.intel.com/software/products/opensource/libraries/num.htm.
1529// Yes, I admit that the referred code was used as template,
1530// but after I realized that there hardly is any other instruction
1531// sequence which would perform this operation. I mean I figure that
1532// any independent attempt to implement high-performance division
1533// will result in code virtually identical to the Intel code. It
1534// should be noted though that below division kernel is 1 cycle
1535// faster than Intel one (note commented splits:-), not to mention
1536// original prologue (rather lack of one) and epilogue.
1537.align 32
1538.skip 16
1539.L_udiv64_32_b6:
1540 frcpa.s1 f8,pred=f6,f7;; // [0] y0 = 1 / b
1541
1542(pred) fnma.s1 f9=f7,f8,f1 // [5] e0 = 1 - b * y0
1543(pred) fmpy.s1 f10=f6,f8;; // [5] q0 = a * y0
1544(pred) fmpy.s1 f11=f9,f9 // [10] e1 = e0 * e0
1545(pred) fma.s1 f10=f9,f10,f10;; // [10] q1 = q0 + e0 * q0
1546(pred) fma.s1 f8=f9,f8,f8 //;; // [15] y1 = y0 + e0 * y0
1547(pred) fma.s1 f9=f11,f10,f10;; // [15] q2 = q1 + e1 * q1
1548(pred) fma.s1 f8=f11,f8,f8 //;; // [20] y2 = y1 + e1 * y1
1549(pred) fnma.s1 f10=f7,f9,f6;; // [20] r2 = a - b * q2
1550(pred) fma.s1 f8=f10,f8,f9;; // [25] q3 = q2 + r2 * y2
1551
1552 fcvt.fxu.trunc.s1 f8=f8 // [30] q = trunc(q3)
1553 br.ret.sptk.many b6;;
1554.endp bn_div_words#
1555#endif