1 files changed, 131 insertions, 86 deletions
diff --git a/src/lib/libcrypto/bn/asm/ia64.S b/src/lib/libcrypto/bn/asm/ia64.S
index 7b82b820e6..7dfda85566 100644
--- a/src/lib/libcrypto/bn/asm/ia64.S
+++ b/src/lib/libcrypto/bn/asm/ia64.S
@@ -1,6 +1,6 @@
 .explicit
 .text
-.ident  "ia64.S, Version 2.1"
+.ident  "ia64.S, Version 2.0"
 .ident  "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
 //
@@ -35,7 +35,7 @@
 // What does it mean? You might ratiocinate that the original code
 // should run just faster... Because sum of latencies is smaller...
 // Wrong! Note that getf latency increased. This means that if a loop is
-// scheduled for lower latency (as they were), then it will suffer from
+// scheduled for lower latency (and they are), then it will suffer from
 // stall condition and the code will therefore turn anti-scalable, e.g.
 // original bn_mul_words spun at 5*n or 2.5 times slower than expected
 // on Itanium2! What to do? Reschedule loops for Itanium2? But then
@@ -145,12 +145,6 @@
 //      -Drum=nop.m in command line.
 //
-#if defined(_HPUX_SOURCE) && !defined(_LP64)
-#define ADDP    addp4
-#else
-#define ADDP    add
-#endif
 #if 1
 //
 // bn_[add|sub]_words routines.
@@ -184,12 +178,27 @@ bn_add_words:
        brp.loop.imp    .L_bn_add_words_ctop,.L_bn_add_words_cend-16
                                        }
        .body
-{ .mib; ADDP            r14=0,r32               // rp
+{ .mib;
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+        addp4           r14=0,r32               // rp
+#else
+        mov             r14=r32                 // rp
+#endif
        mov             r9=pr           };;
-{ .mii; ADDP            r15=0,r33               // ap
+{ .mii;
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+        addp4           r15=0,r33               // ap
+#else
+        mov             r15=r33                 // ap
+#endif
        mov             ar.lc=r10
        mov             ar.ec=6         }
-{ .mib; ADDP            r16=0,r34               // bp
+{ .mib;
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+        addp4           r16=0,r34               // bp
+#else
+        mov             r16=r34                 // bp
+#endif
        mov             pr.rot=1<<16    };;
 .L_bn_add_words_ctop:
@@ -237,12 +246,27 @@ bn_sub_words:
        brp.loop.imp    .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
                                        }
        .body
-{ .mib; ADDP            r14=0,r32               // rp
+{ .mib;
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+        addp4           r14=0,r32               // rp
+#else
+        mov             r14=r32                 // rp
+#endif
        mov             r9=pr           };;
-{ .mii; ADDP            r15=0,r33               // ap
+{ .mii;
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+        addp4           r15=0,r33               // ap
+#else
+        mov             r15=r33                 // ap
+#endif
        mov             ar.lc=r10
        mov             ar.ec=6         }
-{ .mib; ADDP            r16=0,r34               // bp
+{ .mib;
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+        addp4           r16=0,r34               // bp
+#else
+        mov             r16=r34                 // bp
+#endif
        mov             pr.rot=1<<16    };;
 .L_bn_sub_words_ctop:
@@ -308,10 +332,16 @@ bn_mul_words:
 #ifndef XMA_TEMPTATION
-{ .mmi; ADDP            r14=0,r32       // rp
+{ .mii;
-        ADDP            r15=0,r33       // ap
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+        addp4           r14=0,r32       // rp
+        addp4           r15=0,r33       // ap
+#else
+        mov             r14=r32         // rp
+        mov             r15=r33         // ap
+#endif
        mov             ar.lc=r10       }
-{ .mmi; mov             r40=0           // serves as r35 at first (p27)
+{ .mii; mov             r40=0   // serves as r35 at first (p27)
        mov             ar.ec=13        };;
 // This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
@@ -394,64 +424,89 @@ bn_mul_words:
 .global bn_mul_add_words#
 .proc   bn_mul_add_words#
 .align  64
-.skip   48      // makes the loop body aligned at 64-byte boundary
+//.skip 0       // makes the loop split at 64-byte boundary
 bn_mul_add_words:
        .prologue
        .fframe 0
        .save   ar.pfs,r2
-        .save   ar.lc,r3
+{ .mii; alloc           r2=ar.pfs,4,12,0,16
-        .save   pr,r9
+        cmp4.le         p6,p0=r34,r0    };;
-{ .mmi; alloc           r2=ar.pfs,4,4,0,8
+{ .mfb; mov             r8=r0                   // return value
-        cmp4.le         p6,p0=r34,r0
-        mov             r3=ar.lc        };;
-{ .mib; mov             r8=r0           // return value
-        sub             r10=r34,r0,1
 (p6)    br.ret.spnt.many        b0      };;
+        .save   ar.lc,r3
+{ .mii; sub     r10=r34,r0,1
+        mov     r3=ar.lc
+        mov     r9=pr                   };;
        .body
-{ .mib; setf.sig        f8=r35          // w
+{ .mib; setf.sig        f8=r35  // w
-        mov             r9=pr
+        mov             pr.rot=0x800001<<16
+                        // ------^----- serves as (p50) at first (p27)
        brp.loop.imp    .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
                                        }
-{ .mmi; ADDP            r14=0,r32       // rp
+{ .mii;
-        ADDP            r15=0,r33       // ap
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+        addp4           r14=0,r32       // rp
+        addp4           r15=0,r33       // ap
+#else
+        mov             r14=r32         // rp
+        mov             r15=r33         // ap
+#endif
        mov             ar.lc=r10       }
-{ .mii; ADDP            r16=0,r32       // rp copy
+{ .mii; mov             r40=0   // serves as r35 at first (p27)
-        mov             pr.rot=0x2001<<16
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
-                        // ------^----- serves as (p40) at first (p27)
+        addp4           r18=0,r32       // rp copy
-        mov             ar.ec=11        };;
+#else
+        mov             r18=r32         // rp copy
-// This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
+#endif
-// Itanium 2. Yes, unlike previous versions it scales:-) Previous
+        mov             ar.ec=15        };;
-// version was peforming *all* additions in IALU and was starving
-// for those even on Itanium 2. In this version one addition is
+// This loop spins in 3*(n+14) ticks on Itanium and should spin in
-// moved to FPU and is folded with multiplication. This is at cost
+// 2*(n+14) on "wider" IA-64 implementations (to be verified with new
-// of propogating the result from previous call to this subroutine
+// µ-architecture manuals as they become available). As usual it's
-// to L2 cache... In other words negligible even for shorter keys.
+// possible to compress the epilogue, down to 10 in this case, at the
-// *Overall* performance improvement [over previous version] varies
+// cost of scalability. Compressed (and therefore non-scalable) loop
-// from 11 to 22 percent depending on key length.
+// running at 3*(n+11) would buy you ~10% on Itanium but take ~35%
+// from "wider" IA-64 so let it be scalable! Special attention was
+// paid for having the loop body split at 64-byte boundary. ld8 is
+// scheduled for L1 cache as the data is more than likely there.
+// Indeed, bn_mul_words has put it there a moment ago:-)
 .L_bn_mul_add_words_ctop:
-.pred.rel       "mutex",p40,p42
+{ .mfi; (p25)   getf.sig        r36=f52                 // low
-{ .mfi; (p23)   getf.sig        r36=f45                 // low
+        (p21)   xmpy.lu         f48=f37,f8
-        (p20)   xma.lu          f42=f36,f8,f50          // low
+        (p28)   cmp.ltu         p54,p50=r41,r39 }
-        (p40)   add             r39=r39,r35     }       // (p27)
+{ .mfi; (p16)   ldf8            f32=[r15],8
-{ .mfi; (p16)   ldf8            f32=[r15],8             // *(ap++)
+        (p21)   xmpy.hu         f40=f37,f8
-        (p20)   xma.hu          f36=f36,f8,f50          // high
+        (p28)   add             r45=r45,r41     };;
-        (p42)   add             r39=r39,r35,1   };;     // (p27)
+{ .mii; (p25)   getf.sig        r32=f44                 // high
-{ .mmi; (p24)   getf.sig        r32=f40                 // high
+        .pred.rel       "mutex",p50,p54
-        (p16)   ldf8            f46=[r16],8             // *(rp1++)
+        (p50)   add             r40=r38,r35             // (p27)
-        (p40)   cmp.ltu         p41,p39=r39,r35 }       // (p27)
+        (p54)   add             r40=r38,r35,1   }       // (p27)
-{ .mib; (p26)   st8             [r14]=r39,8             // *(rp2++)
+{ .mfb; (p28)   cmp.ltu.unc     p60,p0=r45,r41
-        (p42)   cmp.leu         p41,p39=r39,r35         // (p27)
+        (p0)    nop.f           0x0
+        (p0)    nop.b           0x0             }
+{ .mii; (p27)   ld8             r44=[r18],8
+        (p62)   cmp.eq.or       p61,p0=-1,r46
+        (p62)   add             r46=1,r46       }
+{ .mfb; (p30)   st8             [r14]=r47,8
+        (p0)    nop.f           0x0
        br.ctop.sptk    .L_bn_mul_add_words_ctop};;
 .L_bn_mul_add_words_cend:
-{ .mmi; .pred.rel       "mutex",p40,p42
+{ .mii; nop.m           0x0
-(p40)   add             r8=r35,r0
+.pred.rel       "mutex",p53,p57
-(p42)   add             r8=r35,r0,1
+(p53)   add             r8=r38,r0
-        mov             pr=r9,0x1ffff   }
+(p57)   add             r8=r38,r0,1     }
-{ .mib; rum             1<<5            // clear um.mfh
+{ .mfb; nop.m   0x0
-        mov             ar.lc=r3
+        nop.f   0x0
+        nop.b   0x0                     };;
+{ .mii;
+(p63)   add             r8=1,r8
+        mov             pr=r9,0x1ffff
+        mov             ar.lc=r3        }
+{ .mfb; rum             1<<5            // clear um.mfh
+        nop.f           0x0
        br.ret.sptk.many        b0      };;
 .endp   bn_mul_add_words#
 #endif
@@ -472,8 +527,7 @@ bn_sqr_words:
        sxt4            r34=r34         };;
 { .mii; cmp.le          p6,p0=r34,r0
        mov             r8=r0           }       // return value
-{ .mfb; ADDP            r32=0,r32
+{ .mfb; nop.f           0x0
-        nop.f           0x0
 (p6)    br.ret.spnt.many        b0      };;
        .save   ar.lc,r3
@@ -482,7 +536,11 @@ bn_sqr_words:
        mov     r9=pr                   };;
        .body
-{ .mib; ADDP            r33=0,r33
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+{ .mii; addp4           r32=0,r32
+        addp4           r33=0,r33       };;
+#endif
+{ .mib;
        mov             pr.rot=1<<16
        brp.loop.imp    .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
                                        }
@@ -547,7 +605,7 @@ bn_sqr_comba8:
        .prologue
        .fframe 0
        .save   ar.pfs,r2
-#if defined(_HPUX_SOURCE) && !defined(_LP64)
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
 { .mii; alloc   r2=ar.pfs,2,1,0,0
        addp4   r33=0,r33
        addp4   r32=0,r32               };;
@@ -573,10 +631,6 @@ bn_sqr_comba8:
 // clause in Itanium µ-architecture manual? Comments are welcomed and
 // highly appreciated.
 //
-// On Itanium 2 it takes ~190 ticks. This is because of stalls on
-// result from getf.sig. I do nothing about it at this point for
-// reasons depicted below.
-//
 // However! It should be noted that even 160 ticks is darn good result
 // as it's over 10 (yes, ten, spelled as t-e-n) times faster than the
 // C version (compiled with gcc with inline assembler). I really
@@ -619,7 +673,7 @@ bn_mul_comba8:
        .prologue
        .fframe 0
        .save   ar.pfs,r2
-#if defined(_HPUX_SOURCE) && !defined(_LP64)
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
 { .mii; alloc   r2=ar.pfs,3,0,0,0
        addp4   r33=0,r33
        addp4   r34=0,r34               };;
@@ -1177,7 +1231,7 @@ bn_sqr_comba4:
        .prologue
        .fframe 0
        .save   ar.pfs,r2
-#if defined(_HPUX_SOURCE) && !defined(_LP64)
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
 { .mii; alloc   r2=ar.pfs,2,1,0,0
        addp4   r32=0,r32
        addp4   r33=0,r33               };;
@@ -1210,7 +1264,7 @@ bn_mul_comba4:
        .prologue
        .fframe 0
        .save   ar.pfs,r2
-#if defined(_HPUX_SOURCE) && !defined(_LP64)
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
 { .mii; alloc   r2=ar.pfs,3,0,0,0
        addp4   r33=0,r33
        addp4   r34=0,r34               };;
@@ -1394,8 +1448,8 @@ bn_mul_comba4:
 #define I       r21
 #if 0
-// Some preprocessors (most notably HP-UX) appear to be allergic to
+// Some preprocessors (most notably HP-UX) appear to be allergic to
-// macros enclosed to parenthesis [as these three were].
+// macros enclosed to parenthesis as these three will be.
 #define cont    p16
 #define break   p0      // p20
 #define equ     p24
@@ -1527,18 +1581,9 @@ bn_div_words:
 // output:      f8 = (int)(a/b)
 // clobbered:   f8,f9,f10,f11,pred
 pred=p15
-// One can argue that this snippet is copyrighted to Intel
+// This procedure is essentially Intel code and therefore is
-// Corporation, as it's essentially identical to one of those
+// copyrighted to Intel Corporation (I suppose...). It's slightly
-// found in "Divide, Square Root and Remainder" section at
+// modified for specific needs.
-// http://www.intel.com/software/products/opensource/libraries/num.htm.
-// Yes, I admit that the referred code was used as template,
-// but after I realized that there hardly is any other instruction
-// sequence which would perform this operation. I mean I figure that
-// any independent attempt to implement high-performance division
-// will result in code virtually identical to the Intel code. It
-// should be noted though that below division kernel is 1 cycle
-// faster than Intel one (note commented splits:-), not to mention
-// original prologue (rather lack of one) and epilogue.
 .align  32
 .skip   16
 .L_udiv64_32_b6:

diff --git a/src/lib/libcrypto/bn/asm/ia64.S b/src/lib/libcrypto/bn/asm/ia64.S index 7b82b820e6..7dfda85566 100644 --- a/src/lib/libcrypto/bn/asm/ia64.S +++ b/src/lib/libcrypto/bn/asm/ia64.S
@@ -1,6 +1,6 @@
1	.explicit	1	.explicit
2	.text	2	.text
3	.ident "ia64.S, Version 2.1"	3	.ident "ia64.S, Version 2.0"
4	.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"	4	.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
5		5
6	//	6	//
@@ -35,7 +35,7 @@
35	// What does it mean? You might ratiocinate that the original code	35	// What does it mean? You might ratiocinate that the original code
36	// should run just faster... Because sum of latencies is smaller...	36	// should run just faster... Because sum of latencies is smaller...
37	// Wrong! Note that getf latency increased. This means that if a loop is	37	// Wrong! Note that getf latency increased. This means that if a loop is
38	// scheduled for lower latency (as they were), then it will suffer from	38	// scheduled for lower latency (and they are), then it will suffer from
39	// stall condition and the code will therefore turn anti-scalable, e.g.	39	// stall condition and the code will therefore turn anti-scalable, e.g.
40	// original bn_mul_words spun at 5*n or 2.5 times slower than expected	40	// original bn_mul_words spun at 5*n or 2.5 times slower than expected
41	// on Itanium2! What to do? Reschedule loops for Itanium2? But then	41	// on Itanium2! What to do? Reschedule loops for Itanium2? But then
@@ -145,12 +145,6 @@
145	// -Drum=nop.m in command line.	145	// -Drum=nop.m in command line.
146	//	146	//
147		147
148	#if defined(_HPUX_SOURCE) && !defined(_LP64)
149	#define ADDP addp4
150	#else
151	#define ADDP add
152	#endif
153
154	#if 1	148	#if 1
155	//	149	//
156	// bn_[add\|sub]_words routines.	150	// bn_[add\|sub]_words routines.
@@ -184,12 +178,27 @@ bn_add_words:
184	brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16	178	brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16
185	}	179	}
186	.body	180	.body
187	{ .mib; ADDP r14=0,r32 // rp	181	{ .mib;
		182	#if defined(_HPUX_SOURCE) && defined(_ILP32)
		183	addp4 r14=0,r32 // rp
		184	#else
		185	mov r14=r32 // rp
		186	#endif
188	mov r9=pr };;	187	mov r9=pr };;
189	{ .mii; ADDP r15=0,r33 // ap	188	{ .mii;
		189	#if defined(_HPUX_SOURCE) && defined(_ILP32)
		190	addp4 r15=0,r33 // ap
		191	#else
		192	mov r15=r33 // ap
		193	#endif
190	mov ar.lc=r10	194	mov ar.lc=r10
191	mov ar.ec=6 }	195	mov ar.ec=6 }
192	{ .mib; ADDP r16=0,r34 // bp	196	{ .mib;
		197	#if defined(_HPUX_SOURCE) && defined(_ILP32)
		198	addp4 r16=0,r34 // bp
		199	#else
		200	mov r16=r34 // bp
		201	#endif
193	mov pr.rot=1<<16 };;	202	mov pr.rot=1<<16 };;
194		203
195	.L_bn_add_words_ctop:	204	.L_bn_add_words_ctop:
@@ -237,12 +246,27 @@ bn_sub_words:
237	brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16	246	brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
238	}	247	}
239	.body	248	.body
240	{ .mib; ADDP r14=0,r32 // rp	249	{ .mib;
		250	#if defined(_HPUX_SOURCE) && defined(_ILP32)
		251	addp4 r14=0,r32 // rp
		252	#else
		253	mov r14=r32 // rp
		254	#endif
241	mov r9=pr };;	255	mov r9=pr };;
242	{ .mii; ADDP r15=0,r33 // ap	256	{ .mii;
		257	#if defined(_HPUX_SOURCE) && defined(_ILP32)
		258	addp4 r15=0,r33 // ap
		259	#else
		260	mov r15=r33 // ap
		261	#endif
243	mov ar.lc=r10	262	mov ar.lc=r10
244	mov ar.ec=6 }	263	mov ar.ec=6 }
245	{ .mib; ADDP r16=0,r34 // bp	264	{ .mib;
		265	#if defined(_HPUX_SOURCE) && defined(_ILP32)
		266	addp4 r16=0,r34 // bp
		267	#else
		268	mov r16=r34 // bp
		269	#endif
246	mov pr.rot=1<<16 };;	270	mov pr.rot=1<<16 };;
247		271
248	.L_bn_sub_words_ctop:	272	.L_bn_sub_words_ctop:
@@ -308,10 +332,16 @@ bn_mul_words:
308		332
309	#ifndef XMA_TEMPTATION	333	#ifndef XMA_TEMPTATION
310		334
311	{ .mmi; ADDP r14=0,r32 // rp	335	{ .mii;
312	ADDP r15=0,r33 // ap	336	#if defined(_HPUX_SOURCE) && defined(_ILP32)
		337	addp4 r14=0,r32 // rp
		338	addp4 r15=0,r33 // ap
		339	#else
		340	mov r14=r32 // rp
		341	mov r15=r33 // ap
		342	#endif
313	mov ar.lc=r10 }	343	mov ar.lc=r10 }
314	{ .mmi; mov r40=0 // serves as r35 at first (p27)	344	{ .mii; mov r40=0 // serves as r35 at first (p27)
315	mov ar.ec=13 };;	345	mov ar.ec=13 };;
316		346
317	// This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium	347	// This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
@@ -394,64 +424,89 @@ bn_mul_words:
394	.global bn_mul_add_words#	424	.global bn_mul_add_words#
395	.proc bn_mul_add_words#	425	.proc bn_mul_add_words#
396	.align 64	426	.align 64
397	.skip 48 // makes the loop body aligned at 64-byte boundary	427	//.skip 0 // makes the loop split at 64-byte boundary
398	bn_mul_add_words:	428	bn_mul_add_words:
399	.prologue	429	.prologue
400	.fframe 0	430	.fframe 0
401	.save ar.pfs,r2	431	.save ar.pfs,r2
402	.save ar.lc,r3	432	{ .mii; alloc r2=ar.pfs,4,12,0,16
403	.save pr,r9	433	cmp4.le p6,p0=r34,r0 };;
404	{ .mmi; alloc r2=ar.pfs,4,4,0,8	434	{ .mfb; mov r8=r0 // return value
405	cmp4.le p6,p0=r34,r0
406	mov r3=ar.lc };;
407	{ .mib; mov r8=r0 // return value
408	sub r10=r34,r0,1
409	(p6) br.ret.spnt.many b0 };;	435	(p6) br.ret.spnt.many b0 };;
410		436
		437	.save ar.lc,r3
		438	{ .mii; sub r10=r34,r0,1
		439	mov r3=ar.lc
		440	mov r9=pr };;
		441
411	.body	442	.body
412	{ .mib; setf.sig f8=r35 // w	443	{ .mib; setf.sig f8=r35 // w
413	mov r9=pr	444	mov pr.rot=0x800001<<16
		445	// ------^----- serves as (p50) at first (p27)
414	brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16	446	brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
415	}	447	}
416	{ .mmi; ADDP r14=0,r32 // rp	448	{ .mii;
417	ADDP r15=0,r33 // ap	449	#if defined(_HPUX_SOURCE) && defined(_ILP32)
		450	addp4 r14=0,r32 // rp
		451	addp4 r15=0,r33 // ap
		452	#else
		453	mov r14=r32 // rp
		454	mov r15=r33 // ap
		455	#endif
418	mov ar.lc=r10 }	456	mov ar.lc=r10 }
419	{ .mii; ADDP r16=0,r32 // rp copy	457	{ .mii; mov r40=0 // serves as r35 at first (p27)
420	mov pr.rot=0x2001<<16	458	#if defined(_HPUX_SOURCE) && defined(_ILP32)
421	// ------^----- serves as (p40) at first (p27)	459	addp4 r18=0,r32 // rp copy
422	mov ar.ec=11 };;	460	#else
423		461	mov r18=r32 // rp copy
424	// This loop spins in 3(n+10) ticks on Itanium and in 2(n+10) on	462	#endif
425	// Itanium 2. Yes, unlike previous versions it scales:-) Previous	463	mov ar.ec=15 };;
426	// version was peforming all additions in IALU and was starving	464
427	// for those even on Itanium 2. In this version one addition is	465	// This loop spins in 3*(n+14) ticks on Itanium and should spin in
428	// moved to FPU and is folded with multiplication. This is at cost	466	// 2*(n+14) on "wider" IA-64 implementations (to be verified with new
429	// of propogating the result from previous call to this subroutine	467	// �-architecture manuals as they become available). As usual it's
430	// to L2 cache... In other words negligible even for shorter keys.	468	// possible to compress the epilogue, down to 10 in this case, at the
431	// Overall performance improvement [over previous version] varies	469	// cost of scalability. Compressed (and therefore non-scalable) loop
432	// from 11 to 22 percent depending on key length.	470	// running at 3*(n+11) would buy you ~10% on Itanium but take ~35%
		471	// from "wider" IA-64 so let it be scalable! Special attention was
		472	// paid for having the loop body split at 64-byte boundary. ld8 is
		473	// scheduled for L1 cache as the data is more than likely there.
		474	// Indeed, bn_mul_words has put it there a moment ago:-)
433	.L_bn_mul_add_words_ctop:	475	.L_bn_mul_add_words_ctop:
434	.pred.rel "mutex",p40,p42	476	{ .mfi; (p25) getf.sig r36=f52 // low
435	{ .mfi; (p23) getf.sig r36=f45 // low	477	(p21) xmpy.lu f48=f37,f8
436	(p20) xma.lu f42=f36,f8,f50 // low	478	(p28) cmp.ltu p54,p50=r41,r39 }
437	(p40) add r39=r39,r35 } // (p27)	479	{ .mfi; (p16) ldf8 f32=[r15],8
438	{ .mfi; (p16) ldf8 f32=[r15],8 // *(ap++)	480	(p21) xmpy.hu f40=f37,f8
439	(p20) xma.hu f36=f36,f8,f50 // high	481	(p28) add r45=r45,r41 };;
440	(p42) add r39=r39,r35,1 };; // (p27)	482	{ .mii; (p25) getf.sig r32=f44 // high
441	{ .mmi; (p24) getf.sig r32=f40 // high	483	.pred.rel "mutex",p50,p54
442	(p16) ldf8 f46=[r16],8 // *(rp1++)	484	(p50) add r40=r38,r35 // (p27)
443	(p40) cmp.ltu p41,p39=r39,r35 } // (p27)	485	(p54) add r40=r38,r35,1 } // (p27)
444	{ .mib; (p26) st8 [r14]=r39,8 // *(rp2++)	486	{ .mfb; (p28) cmp.ltu.unc p60,p0=r45,r41
445	(p42) cmp.leu p41,p39=r39,r35 // (p27)	487	(p0) nop.f 0x0
		488	(p0) nop.b 0x0 }
		489	{ .mii; (p27) ld8 r44=[r18],8
		490	(p62) cmp.eq.or p61,p0=-1,r46
		491	(p62) add r46=1,r46 }
		492	{ .mfb; (p30) st8 [r14]=r47,8
		493	(p0) nop.f 0x0
446	br.ctop.sptk .L_bn_mul_add_words_ctop};;	494	br.ctop.sptk .L_bn_mul_add_words_ctop};;
447	.L_bn_mul_add_words_cend:	495	.L_bn_mul_add_words_cend:
448		496
449	{ .mmi; .pred.rel "mutex",p40,p42	497	{ .mii; nop.m 0x0
450	(p40) add r8=r35,r0	498	.pred.rel "mutex",p53,p57
451	(p42) add r8=r35,r0,1	499	(p53) add r8=r38,r0
452	mov pr=r9,0x1ffff }	500	(p57) add r8=r38,r0,1 }
453	{ .mib; rum 1<<5 // clear um.mfh	501	{ .mfb; nop.m 0x0
454	mov ar.lc=r3	502	nop.f 0x0
		503	nop.b 0x0 };;
		504	{ .mii;
		505	(p63) add r8=1,r8
		506	mov pr=r9,0x1ffff
		507	mov ar.lc=r3 }
		508	{ .mfb; rum 1<<5 // clear um.mfh
		509	nop.f 0x0
455	br.ret.sptk.many b0 };;	510	br.ret.sptk.many b0 };;
456	.endp bn_mul_add_words#	511	.endp bn_mul_add_words#
457	#endif	512	#endif
@@ -472,8 +527,7 @@ bn_sqr_words:
472	sxt4 r34=r34 };;	527	sxt4 r34=r34 };;
473	{ .mii; cmp.le p6,p0=r34,r0	528	{ .mii; cmp.le p6,p0=r34,r0
474	mov r8=r0 } // return value	529	mov r8=r0 } // return value
475	{ .mfb; ADDP r32=0,r32	530	{ .mfb; nop.f 0x0
476	nop.f 0x0
477	(p6) br.ret.spnt.many b0 };;	531	(p6) br.ret.spnt.many b0 };;
478		532
479	.save ar.lc,r3	533	.save ar.lc,r3
@@ -482,7 +536,11 @@ bn_sqr_words:
482	mov r9=pr };;	536	mov r9=pr };;
483		537
484	.body	538	.body
485	{ .mib; ADDP r33=0,r33	539	#if defined(_HPUX_SOURCE) && defined(_ILP32)
		540	{ .mii; addp4 r32=0,r32
		541	addp4 r33=0,r33 };;
		542	#endif
		543	{ .mib;
486	mov pr.rot=1<<16	544	mov pr.rot=1<<16
487	brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16	545	brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
488	}	546	}
@@ -547,7 +605,7 @@ bn_sqr_comba8:
547	.prologue	605	.prologue
548	.fframe 0	606	.fframe 0
549	.save ar.pfs,r2	607	.save ar.pfs,r2
550	#if defined(_HPUX_SOURCE) && !defined(_LP64)	608	#if defined(_HPUX_SOURCE) && defined(_ILP32)
551	{ .mii; alloc r2=ar.pfs,2,1,0,0	609	{ .mii; alloc r2=ar.pfs,2,1,0,0
552	addp4 r33=0,r33	610	addp4 r33=0,r33
553	addp4 r32=0,r32 };;	611	addp4 r32=0,r32 };;
@@ -573,10 +631,6 @@ bn_sqr_comba8:
573	// clause in Itanium µ-architecture manual? Comments are welcomed and	631	// clause in Itanium µ-architecture manual? Comments are welcomed and
574	// highly appreciated.	632	// highly appreciated.
575	//	633	//
576	// On Itanium 2 it takes ~190 ticks. This is because of stalls on
577	// result from getf.sig. I do nothing about it at this point for
578	// reasons depicted below.
579	//
580	// However! It should be noted that even 160 ticks is darn good result	634	// However! It should be noted that even 160 ticks is darn good result
581	// as it's over 10 (yes, ten, spelled as t-e-n) times faster than the	635	// as it's over 10 (yes, ten, spelled as t-e-n) times faster than the
582	// C version (compiled with gcc with inline assembler). I really	636	// C version (compiled with gcc with inline assembler). I really
@@ -619,7 +673,7 @@ bn_mul_comba8:
619	.prologue	673	.prologue
620	.fframe 0	674	.fframe 0
621	.save ar.pfs,r2	675	.save ar.pfs,r2
622	#if defined(_HPUX_SOURCE) && !defined(_LP64)	676	#if defined(_HPUX_SOURCE) && defined(_ILP32)
623	{ .mii; alloc r2=ar.pfs,3,0,0,0	677	{ .mii; alloc r2=ar.pfs,3,0,0,0
624	addp4 r33=0,r33	678	addp4 r33=0,r33
625	addp4 r34=0,r34 };;	679	addp4 r34=0,r34 };;
@@ -1177,7 +1231,7 @@ bn_sqr_comba4:
1177	.prologue	1231	.prologue
1178	.fframe 0	1232	.fframe 0
1179	.save ar.pfs,r2	1233	.save ar.pfs,r2
1180	#if defined(_HPUX_SOURCE) && !defined(_LP64)	1234	#if defined(_HPUX_SOURCE) && defined(_ILP32)
1181	{ .mii; alloc r2=ar.pfs,2,1,0,0	1235	{ .mii; alloc r2=ar.pfs,2,1,0,0
1182	addp4 r32=0,r32	1236	addp4 r32=0,r32
1183	addp4 r33=0,r33 };;	1237	addp4 r33=0,r33 };;
@@ -1210,7 +1264,7 @@ bn_mul_comba4:
1210	.prologue	1264	.prologue
1211	.fframe 0	1265	.fframe 0
1212	.save ar.pfs,r2	1266	.save ar.pfs,r2
1213	#if defined(_HPUX_SOURCE) && !defined(_LP64)	1267	#if defined(_HPUX_SOURCE) && defined(_ILP32)
1214	{ .mii; alloc r2=ar.pfs,3,0,0,0	1268	{ .mii; alloc r2=ar.pfs,3,0,0,0
1215	addp4 r33=0,r33	1269	addp4 r33=0,r33
1216	addp4 r34=0,r34 };;	1270	addp4 r34=0,r34 };;
@@ -1394,8 +1448,8 @@ bn_mul_comba4:
1394	#define I r21	1448	#define I r21
1395		1449
1396	#if 0	1450	#if 0
1397	// Some preprocessors (most notably HP-UX) appear to be allergic to	1451	// Some preprocessors (most notably HP-UX) appear to be allergic to
1398	// macros enclosed to parenthesis [as these three were].	1452	// macros enclosed to parenthesis as these three will be.
1399	#define cont p16	1453	#define cont p16
1400	#define break p0 // p20	1454	#define break p0 // p20
1401	#define equ p24	1455	#define equ p24
@@ -1527,18 +1581,9 @@ bn_div_words:
1527	// output: f8 = (int)(a/b)	1581	// output: f8 = (int)(a/b)
1528	// clobbered: f8,f9,f10,f11,pred	1582	// clobbered: f8,f9,f10,f11,pred
1529	pred=p15	1583	pred=p15
1530	// One can argue that this snippet is copyrighted to Intel	1584	// This procedure is essentially Intel code and therefore is
1531	// Corporation, as it's essentially identical to one of those	1585	// copyrighted to Intel Corporation (I suppose...). It's slightly
1532	// found in "Divide, Square Root and Remainder" section at	1586	// modified for specific needs.
1533	// http://www.intel.com/software/products/opensource/libraries/num.htm.
1534	// Yes, I admit that the referred code was used as template,
1535	// but after I realized that there hardly is any other instruction
1536	// sequence which would perform this operation. I mean I figure that
1537	// any independent attempt to implement high-performance division
1538	// will result in code virtually identical to the Intel code. It
1539	// should be noted though that below division kernel is 1 cycle
1540	// faster than Intel one (note commented splits:-), not to mention
1541	// original prologue (rather lack of one) and epilogue.
1542	.align 32	1587	.align 32
1543	.skip 16	1588	.skip 16
1544	.L_udiv64_32_b6:	1589	.L_udiv64_32_b6: