resolve conflicts

author: djm <> 2005-04-29 05:39:33 +0000
committer: djm <> 2005-04-29 05:39:33 +0000
commit: 68edd00d9258df93b1366c71ac124e0cadf7bc08 (patch)
tree: 3ce4ae2a9747bbc11aed1f95f9bbea92c41f8683 /src/lib/libcrypto/bn/asm
parent: f396ed0f5ce0af56bfde2e75e15cf1f52924c779 (diff)
download: openbsd-68edd00d9258df93b1366c71ac124e0cadf7bc08.tar.gz
openbsd-68edd00d9258df93b1366c71ac124e0cadf7bc08.tar.bz2
openbsd-68edd00d9258df93b1366c71ac124e0cadf7bc08.zip
1 files changed, 86 insertions, 131 deletions
diff --git a/src/lib/libcrypto/bn/asm/ia64.S b/src/lib/libcrypto/bn/asm/ia64.S
index 7dfda85566..7b82b820e6 100644
--- a/src/lib/libcrypto/bn/asm/ia64.S
+++ b/src/lib/libcrypto/bn/asm/ia64.S
@@ -1,6 +1,6 @@
 .explicit
 .text
-.ident  "ia64.S, Version 2.0"
+.ident  "ia64.S, Version 2.1"
 .ident  "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
 //
@@ -35,7 +35,7 @@
 // What does it mean? You might ratiocinate that the original code
 // should run just faster... Because sum of latencies is smaller...
 // Wrong! Note that getf latency increased. This means that if a loop is
-// scheduled for lower latency (and they are), then it will suffer from
+// scheduled for lower latency (as they were), then it will suffer from
 // stall condition and the code will therefore turn anti-scalable, e.g.
 // original bn_mul_words spun at 5*n or 2.5 times slower than expected
 // on Itanium2! What to do? Reschedule loops for Itanium2? But then
@@ -145,6 +145,12 @@
 //      -Drum=nop.m in command line.
 //
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
+#define ADDP    addp4
+#else
+#define ADDP    add
+#endif
 #if 1
 //
 // bn_[add|sub]_words routines.
@@ -178,27 +184,12 @@ bn_add_words:
        brp.loop.imp    .L_bn_add_words_ctop,.L_bn_add_words_cend-16
                                        }
        .body
-{ .mib;
+{ .mib; ADDP            r14=0,r32               // rp
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
-        addp4           r14=0,r32               // rp
-#else
-        mov             r14=r32                 // rp
-#endif
        mov             r9=pr           };;
-{ .mii;
+{ .mii; ADDP            r15=0,r33               // ap
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
-        addp4           r15=0,r33               // ap
-#else
-        mov             r15=r33                 // ap
-#endif
        mov             ar.lc=r10
        mov             ar.ec=6         }
-{ .mib;
+{ .mib; ADDP            r16=0,r34               // bp
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
-        addp4           r16=0,r34               // bp
-#else
-        mov             r16=r34                 // bp
-#endif
        mov             pr.rot=1<<16    };;
 .L_bn_add_words_ctop:
@@ -246,27 +237,12 @@ bn_sub_words:
        brp.loop.imp    .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
                                        }
        .body
-{ .mib;
+{ .mib; ADDP            r14=0,r32               // rp
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
-        addp4           r14=0,r32               // rp
-#else
-        mov             r14=r32                 // rp
-#endif
        mov             r9=pr           };;
-{ .mii;
+{ .mii; ADDP            r15=0,r33               // ap
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
-        addp4           r15=0,r33               // ap
-#else
-        mov             r15=r33                 // ap
-#endif
        mov             ar.lc=r10
        mov             ar.ec=6         }
-{ .mib;
+{ .mib; ADDP            r16=0,r34               // bp
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
-        addp4           r16=0,r34               // bp
-#else
-        mov             r16=r34                 // bp
-#endif
        mov             pr.rot=1<<16    };;
 .L_bn_sub_words_ctop:
@@ -332,16 +308,10 @@ bn_mul_words:
 #ifndef XMA_TEMPTATION
-{ .mii;
+{ .mmi; ADDP            r14=0,r32       // rp
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
+        ADDP            r15=0,r33       // ap
-        addp4           r14=0,r32       // rp
-        addp4           r15=0,r33       // ap
-#else
-        mov             r14=r32         // rp
-        mov             r15=r33         // ap
-#endif
        mov             ar.lc=r10       }
-{ .mii; mov             r40=0   // serves as r35 at first (p27)
+{ .mmi; mov             r40=0           // serves as r35 at first (p27)
        mov             ar.ec=13        };;
 // This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
@@ -424,89 +394,64 @@ bn_mul_words:
 .global bn_mul_add_words#
 .proc   bn_mul_add_words#
 .align  64
-//.skip 0       // makes the loop split at 64-byte boundary
+.skip   48      // makes the loop body aligned at 64-byte boundary
 bn_mul_add_words:
        .prologue
        .fframe 0
        .save   ar.pfs,r2
-{ .mii; alloc           r2=ar.pfs,4,12,0,16
-        cmp4.le         p6,p0=r34,r0    };;
-{ .mfb; mov             r8=r0                   // return value
-(p6)    br.ret.spnt.many        b0      };;
        .save   ar.lc,r3
-{ .mii; sub     r10=r34,r0,1
+        .save   pr,r9
-        mov     r3=ar.lc
+{ .mmi; alloc           r2=ar.pfs,4,4,0,8
-        mov     r9=pr                   };;
+        cmp4.le         p6,p0=r34,r0
+        mov             r3=ar.lc        };;
+{ .mib; mov             r8=r0           // return value
+        sub             r10=r34,r0,1
+(p6)    br.ret.spnt.many        b0      };;
        .body
-{ .mib; setf.sig        f8=r35  // w
+{ .mib; setf.sig        f8=r35          // w
-        mov             pr.rot=0x800001<<16
+        mov             r9=pr
-                        // ------^----- serves as (p50) at first (p27)
        brp.loop.imp    .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
                                        }
-{ .mii;
+{ .mmi; ADDP            r14=0,r32       // rp
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
+        ADDP            r15=0,r33       // ap
-        addp4           r14=0,r32       // rp
-        addp4           r15=0,r33       // ap
-#else
-        mov             r14=r32         // rp
-        mov             r15=r33         // ap
-#endif
        mov             ar.lc=r10       }
-{ .mii; mov             r40=0   // serves as r35 at first (p27)
+{ .mii; ADDP            r16=0,r32       // rp copy
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
+        mov             pr.rot=0x2001<<16
-        addp4           r18=0,r32       // rp copy
+                        // ------^----- serves as (p40) at first (p27)
-#else
+        mov             ar.ec=11        };;
-        mov             r18=r32         // rp copy
-#endif
+// This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
-        mov             ar.ec=15        };;
+// Itanium 2. Yes, unlike previous versions it scales:-) Previous
+// version was peforming *all* additions in IALU and was starving
-// This loop spins in 3*(n+14) ticks on Itanium and should spin in
+// for those even on Itanium 2. In this version one addition is
-// 2*(n+14) on "wider" IA-64 implementations (to be verified with new
+// moved to FPU and is folded with multiplication. This is at cost
-// µ-architecture manuals as they become available). As usual it's
+// of propogating the result from previous call to this subroutine
-// possible to compress the epilogue, down to 10 in this case, at the
+// to L2 cache... In other words negligible even for shorter keys.
-// cost of scalability. Compressed (and therefore non-scalable) loop
+// *Overall* performance improvement [over previous version] varies
-// running at 3*(n+11) would buy you ~10% on Itanium but take ~35%
+// from 11 to 22 percent depending on key length.
-// from "wider" IA-64 so let it be scalable! Special attention was
-// paid for having the loop body split at 64-byte boundary. ld8 is
-// scheduled for L1 cache as the data is more than likely there.
-// Indeed, bn_mul_words has put it there a moment ago:-)
 .L_bn_mul_add_words_ctop:
-{ .mfi; (p25)   getf.sig        r36=f52                 // low
+.pred.rel       "mutex",p40,p42
-        (p21)   xmpy.lu         f48=f37,f8
+{ .mfi; (p23)   getf.sig        r36=f45                 // low
-        (p28)   cmp.ltu         p54,p50=r41,r39 }
+        (p20)   xma.lu          f42=f36,f8,f50          // low
-{ .mfi; (p16)   ldf8            f32=[r15],8
+        (p40)   add             r39=r39,r35     }       // (p27)
-        (p21)   xmpy.hu         f40=f37,f8
+{ .mfi; (p16)   ldf8            f32=[r15],8             // *(ap++)
-        (p28)   add             r45=r45,r41     };;
+        (p20)   xma.hu          f36=f36,f8,f50          // high
-{ .mii; (p25)   getf.sig        r32=f44                 // high
+        (p42)   add             r39=r39,r35,1   };;     // (p27)
-        .pred.rel       "mutex",p50,p54
+{ .mmi; (p24)   getf.sig        r32=f40                 // high
-        (p50)   add             r40=r38,r35             // (p27)
+        (p16)   ldf8            f46=[r16],8             // *(rp1++)
-        (p54)   add             r40=r38,r35,1   }       // (p27)
+        (p40)   cmp.ltu         p41,p39=r39,r35 }       // (p27)
-{ .mfb; (p28)   cmp.ltu.unc     p60,p0=r45,r41
+{ .mib; (p26)   st8             [r14]=r39,8             // *(rp2++)
-        (p0)    nop.f           0x0
+        (p42)   cmp.leu         p41,p39=r39,r35         // (p27)
-        (p0)    nop.b           0x0             }
-{ .mii; (p27)   ld8             r44=[r18],8
-        (p62)   cmp.eq.or       p61,p0=-1,r46
-        (p62)   add             r46=1,r46       }
-{ .mfb; (p30)   st8             [r14]=r47,8
-        (p0)    nop.f           0x0
        br.ctop.sptk    .L_bn_mul_add_words_ctop};;
 .L_bn_mul_add_words_cend:
-{ .mii; nop.m           0x0
+{ .mmi; .pred.rel       "mutex",p40,p42
-.pred.rel       "mutex",p53,p57
+(p40)   add             r8=r35,r0
-(p53)   add             r8=r38,r0
+(p42)   add             r8=r35,r0,1
-(p57)   add             r8=r38,r0,1     }
+        mov             pr=r9,0x1ffff   }
-{ .mfb; nop.m   0x0
+{ .mib; rum             1<<5            // clear um.mfh
-        nop.f   0x0
+        mov             ar.lc=r3
-        nop.b   0x0                     };;
-{ .mii;
-(p63)   add             r8=1,r8
-        mov             pr=r9,0x1ffff
-        mov             ar.lc=r3        }
-{ .mfb; rum             1<<5            // clear um.mfh
-        nop.f           0x0
        br.ret.sptk.many        b0      };;
 .endp   bn_mul_add_words#
 #endif
@@ -527,7 +472,8 @@ bn_sqr_words:
        sxt4            r34=r34         };;
 { .mii; cmp.le          p6,p0=r34,r0
        mov             r8=r0           }       // return value
-{ .mfb; nop.f           0x0
+{ .mfb; ADDP            r32=0,r32
+        nop.f           0x0
 (p6)    br.ret.spnt.many        b0      };;
        .save   ar.lc,r3
@@ -536,11 +482,7 @@ bn_sqr_words:
        mov     r9=pr                   };;
        .body
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
+{ .mib; ADDP            r33=0,r33
-{ .mii; addp4           r32=0,r32
-        addp4           r33=0,r33       };;
-#endif
-{ .mib;
        mov             pr.rot=1<<16
        brp.loop.imp    .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
                                        }
@@ -605,7 +547,7 @@ bn_sqr_comba8:
        .prologue
        .fframe 0
        .save   ar.pfs,r2
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
 { .mii; alloc   r2=ar.pfs,2,1,0,0
        addp4   r33=0,r33
        addp4   r32=0,r32               };;
@@ -631,6 +573,10 @@ bn_sqr_comba8:
 // clause in Itanium µ-architecture manual? Comments are welcomed and
 // highly appreciated.
 //
+// On Itanium 2 it takes ~190 ticks. This is because of stalls on
+// result from getf.sig. I do nothing about it at this point for
+// reasons depicted below.
+//
 // However! It should be noted that even 160 ticks is darn good result
 // as it's over 10 (yes, ten, spelled as t-e-n) times faster than the
 // C version (compiled with gcc with inline assembler). I really
@@ -673,7 +619,7 @@ bn_mul_comba8:
        .prologue
        .fframe 0
        .save   ar.pfs,r2
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
 { .mii; alloc   r2=ar.pfs,3,0,0,0
        addp4   r33=0,r33
        addp4   r34=0,r34               };;
@@ -1231,7 +1177,7 @@ bn_sqr_comba4:
        .prologue
        .fframe 0
        .save   ar.pfs,r2
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
 { .mii; alloc   r2=ar.pfs,2,1,0,0
        addp4   r32=0,r32
        addp4   r33=0,r33               };;
@@ -1264,7 +1210,7 @@ bn_mul_comba4:
        .prologue
        .fframe 0
        .save   ar.pfs,r2
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
 { .mii; alloc   r2=ar.pfs,3,0,0,0
        addp4   r33=0,r33
        addp4   r34=0,r34               };;
@@ -1448,8 +1394,8 @@ bn_mul_comba4:
 #define I       r21
 #if 0
-// Some preprocessors (most notably HP-UX) apper to be allergic to
+// Some preprocessors (most notably HP-UX) appear to be allergic to
-// macros enclosed to parenthesis as these three will be.
+// macros enclosed to parenthesis [as these three were].
 #define cont    p16
 #define break   p0      // p20
 #define equ     p24
@@ -1581,9 +1527,18 @@ bn_div_words:
 // output:      f8 = (int)(a/b)
 // clobbered:   f8,f9,f10,f11,pred
 pred=p15
-// This procedure is essentially Intel code and therefore is
+// One can argue that this snippet is copyrighted to Intel
-// copyrighted to Intel Corporation (I suppose...). It's sligtly
+// Corporation, as it's essentially identical to one of those
-// modified for specific needs.
+// found in "Divide, Square Root and Remainder" section at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+// Yes, I admit that the referred code was used as template,
+// but after I realized that there hardly is any other instruction
+// sequence which would perform this operation. I mean I figure that
+// any independent attempt to implement high-performance division
+// will result in code virtually identical to the Intel code. It
+// should be noted though that below division kernel is 1 cycle
+// faster than Intel one (note commented splits:-), not to mention
+// original prologue (rather lack of one) and epilogue.
 .align  32
 .skip   16
 .L_udiv64_32_b6:
author	djm <>	2005-04-29 05:39:33 +0000
committer	djm <>	2005-04-29 05:39:33 +0000
commit	68edd00d9258df93b1366c71ac124e0cadf7bc08 (patch)
tree	3ce4ae2a9747bbc11aed1f95f9bbea92c41f8683 /src/lib/libcrypto/bn/asm
parent	f396ed0f5ce0af56bfde2e75e15cf1f52924c779 (diff)
download	openbsd-68edd00d9258df93b1366c71ac124e0cadf7bc08.tar.gz openbsd-68edd00d9258df93b1366c71ac124e0cadf7bc08.tar.bz2 openbsd-68edd00d9258df93b1366c71ac124e0cadf7bc08.zip

diff --git a/src/lib/libcrypto/bn/asm/ia64.S b/src/lib/libcrypto/bn/asm/ia64.S index 7dfda85566..7b82b820e6 100644 --- a/src/lib/libcrypto/bn/asm/ia64.S +++ b/src/lib/libcrypto/bn/asm/ia64.S
@@ -1,6 +1,6 @@
1	.explicit	1	.explicit
2	.text	2	.text
3	.ident "ia64.S, Version 2.0"	3	.ident "ia64.S, Version 2.1"
4	.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"	4	.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
5		5
6	//	6	//
@@ -35,7 +35,7 @@
35	// What does it mean? You might ratiocinate that the original code	35	// What does it mean? You might ratiocinate that the original code
36	// should run just faster... Because sum of latencies is smaller...	36	// should run just faster... Because sum of latencies is smaller...
37	// Wrong! Note that getf latency increased. This means that if a loop is	37	// Wrong! Note that getf latency increased. This means that if a loop is
38	// scheduled for lower latency (and they are), then it will suffer from	38	// scheduled for lower latency (as they were), then it will suffer from
39	// stall condition and the code will therefore turn anti-scalable, e.g.	39	// stall condition and the code will therefore turn anti-scalable, e.g.
40	// original bn_mul_words spun at 5*n or 2.5 times slower than expected	40	// original bn_mul_words spun at 5*n or 2.5 times slower than expected
41	// on Itanium2! What to do? Reschedule loops for Itanium2? But then	41	// on Itanium2! What to do? Reschedule loops for Itanium2? But then
@@ -145,6 +145,12 @@
145	// -Drum=nop.m in command line.	145	// -Drum=nop.m in command line.
146	//	146	//
147		147
		148	#if defined(_HPUX_SOURCE) && !defined(_LP64)
		149	#define ADDP addp4
		150	#else
		151	#define ADDP add
		152	#endif
		153
148	#if 1	154	#if 1
149	//	155	//
150	// bn_[add|sub]_words routines.	156	// bn_[add|sub]_words routines.
@@ -178,27 +184,12 @@ bn_add_words:
178	brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16	184	brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16
179	}	185	}
180	.body	186	.body
181	{ .mib;	187	{ .mib; ADDP r14=0,r32 // rp
182	#if defined(_HPUX_SOURCE) && defined(_ILP32)
183	addp4 r14=0,r32 // rp
184	#else
185	mov r14=r32 // rp
186	#endif
187	mov r9=pr };;	188	mov r9=pr };;
188	{ .mii;	189	{ .mii; ADDP r15=0,r33 // ap
189	#if defined(_HPUX_SOURCE) && defined(_ILP32)
190	addp4 r15=0,r33 // ap
191	#else
192	mov r15=r33 // ap
193	#endif
194	mov ar.lc=r10	190	mov ar.lc=r10
195	mov ar.ec=6 }	191	mov ar.ec=6 }
196	{ .mib;	192	{ .mib; ADDP r16=0,r34 // bp
197	#if defined(_HPUX_SOURCE) && defined(_ILP32)
198	addp4 r16=0,r34 // bp
199	#else
200	mov r16=r34 // bp
201	#endif
202	mov pr.rot=1<<16 };;	193	mov pr.rot=1<<16 };;
203		194
204	.L_bn_add_words_ctop:	195	.L_bn_add_words_ctop:
@@ -246,27 +237,12 @@ bn_sub_words:
246	brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16	237	brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
247	}	238	}
248	.body	239	.body
249	{ .mib;	240	{ .mib; ADDP r14=0,r32 // rp
250	#if defined(_HPUX_SOURCE) && defined(_ILP32)
251	addp4 r14=0,r32 // rp
252	#else
253	mov r14=r32 // rp
254	#endif
255	mov r9=pr };;	241	mov r9=pr };;
256	{ .mii;	242	{ .mii; ADDP r15=0,r33 // ap
257	#if defined(_HPUX_SOURCE) && defined(_ILP32)
258	addp4 r15=0,r33 // ap
259	#else
260	mov r15=r33 // ap
261	#endif
262	mov ar.lc=r10	243	mov ar.lc=r10
263	mov ar.ec=6 }	244	mov ar.ec=6 }
264	{ .mib;	245	{ .mib; ADDP r16=0,r34 // bp
265	#if defined(_HPUX_SOURCE) && defined(_ILP32)
266	addp4 r16=0,r34 // bp
267	#else
268	mov r16=r34 // bp
269	#endif
270	mov pr.rot=1<<16 };;	246	mov pr.rot=1<<16 };;
271		247
272	.L_bn_sub_words_ctop:	248	.L_bn_sub_words_ctop:
@@ -332,16 +308,10 @@ bn_mul_words:
332		308
333	#ifndef XMA_TEMPTATION	309	#ifndef XMA_TEMPTATION
334		310
335	{ .mii;	311	{ .mmi; ADDP r14=0,r32 // rp
336	#if defined(_HPUX_SOURCE) && defined(_ILP32)	312	ADDP r15=0,r33 // ap
337	addp4 r14=0,r32 // rp
338	addp4 r15=0,r33 // ap
339	#else
340	mov r14=r32 // rp
341	mov r15=r33 // ap
342	#endif
343	mov ar.lc=r10 }	313	mov ar.lc=r10 }
344	{ .mii; mov r40=0 // serves as r35 at first (p27)	314	{ .mmi; mov r40=0 // serves as r35 at first (p27)
345	mov ar.ec=13 };;	315	mov ar.ec=13 };;
346		316
347	// This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium	317	// This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
@@ -424,89 +394,64 @@ bn_mul_words:
424	.global bn_mul_add_words#	394	.global bn_mul_add_words#
425	.proc bn_mul_add_words#	395	.proc bn_mul_add_words#
426	.align 64	396	.align 64
427	//.skip 0 // makes the loop split at 64-byte boundary	397	.skip 48 // makes the loop body aligned at 64-byte boundary
428	bn_mul_add_words:	398	bn_mul_add_words:
429	.prologue	399	.prologue
430	.fframe 0	400	.fframe 0
431	.save ar.pfs,r2	401	.save ar.pfs,r2
432	{ .mii; alloc r2=ar.pfs,4,12,0,16
433	cmp4.le p6,p0=r34,r0 };;
434	{ .mfb; mov r8=r0 // return value
435	(p6) br.ret.spnt.many b0 };;
436
437	.save ar.lc,r3	402	.save ar.lc,r3
438	{ .mii; sub r10=r34,r0,1	403	.save pr,r9
439	mov r3=ar.lc	404	{ .mmi; alloc r2=ar.pfs,4,4,0,8
440	mov r9=pr };;	405	cmp4.le p6,p0=r34,r0
		406	mov r3=ar.lc };;
		407	{ .mib; mov r8=r0 // return value
		408	sub r10=r34,r0,1
		409	(p6) br.ret.spnt.many b0 };;
441		410
442	.body	411	.body
443	{ .mib; setf.sig f8=r35 // w	412	{ .mib; setf.sig f8=r35 // w
444	mov pr.rot=0x800001<<16	413	mov r9=pr
445	// ------^----- serves as (p50) at first (p27)
446	brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16	414	brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
447	}	415	}
448	{ .mii;	416	{ .mmi; ADDP r14=0,r32 // rp
449	#if defined(_HPUX_SOURCE) && defined(_ILP32)	417	ADDP r15=0,r33 // ap
450	addp4 r14=0,r32 // rp
451	addp4 r15=0,r33 // ap
452	#else
453	mov r14=r32 // rp
454	mov r15=r33 // ap
455	#endif
456	mov ar.lc=r10 }	418	mov ar.lc=r10 }
457	{ .mii; mov r40=0 // serves as r35 at first (p27)	419	{ .mii; ADDP r16=0,r32 // rp copy
458	#if defined(_HPUX_SOURCE) && defined(_ILP32)	420	mov pr.rot=0x2001<<16
459	addp4 r18=0,r32 // rp copy	421	// ------^----- serves as (p40) at first (p27)
460	#else	422	mov ar.ec=11 };;
461	mov r18=r32 // rp copy	423
462	#endif	424	// This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
463	mov ar.ec=15 };;	425	// Itanium 2. Yes, unlike previous versions it scales:-) Previous
464		426	// version was peforming *all* additions in IALU and was starving
465	// This loop spins in 3*(n+14) ticks on Itanium and should spin in	427	// for those even on Itanium 2. In this version one addition is
466	// 2*(n+14) on "wider" IA-64 implementations (to be verified with new	428	// moved to FPU and is folded with multiplication. This is at cost
467	// µ-architecture manuals as they become available). As usual it's	429	// of propogating the result from previous call to this subroutine
468	// possible to compress the epilogue, down to 10 in this case, at the	430	// to L2 cache... In other words negligible even for shorter keys.
469	// cost of scalability. Compressed (and therefore non-scalable) loop	431	// *Overall* performance improvement [over previous version] varies
470	// running at 3*(n+11) would buy you ~10% on Itanium but take ~35%	432	// from 11 to 22 percent depending on key length.
471	// from "wider" IA-64 so let it be scalable! Special attention was
472	// paid for having the loop body split at 64-byte boundary. ld8 is
473	// scheduled for L1 cache as the data is more than likely there.
474	// Indeed, bn_mul_words has put it there a moment ago:-)
475	.L_bn_mul_add_words_ctop:	433	.L_bn_mul_add_words_ctop:
476	{ .mfi; (p25) getf.sig r36=f52 // low	434	.pred.rel "mutex",p40,p42
477	(p21) xmpy.lu f48=f37,f8	435	{ .mfi; (p23) getf.sig r36=f45 // low
478	(p28) cmp.ltu p54,p50=r41,r39 }	436	(p20) xma.lu f42=f36,f8,f50 // low
479	{ .mfi; (p16) ldf8 f32=[r15],8	437	(p40) add r39=r39,r35 } // (p27)
480	(p21) xmpy.hu f40=f37,f8	438	{ .mfi; (p16) ldf8 f32=[r15],8 // *(ap++)
481	(p28) add r45=r45,r41 };;	439	(p20) xma.hu f36=f36,f8,f50 // high
482	{ .mii; (p25) getf.sig r32=f44 // high	440	(p42) add r39=r39,r35,1 };; // (p27)
483	.pred.rel "mutex",p50,p54	441	{ .mmi; (p24) getf.sig r32=f40 // high
484	(p50) add r40=r38,r35 // (p27)	442	(p16) ldf8 f46=[r16],8 // *(rp1++)
485	(p54) add r40=r38,r35,1 } // (p27)	443	(p40) cmp.ltu p41,p39=r39,r35 } // (p27)
486	{ .mfb; (p28) cmp.ltu.unc p60,p0=r45,r41	444	{ .mib; (p26) st8 [r14]=r39,8 // *(rp2++)
487	(p0) nop.f 0x0	445	(p42) cmp.leu p41,p39=r39,r35 // (p27)
488	(p0) nop.b 0x0 }
489	{ .mii; (p27) ld8 r44=[r18],8
490	(p62) cmp.eq.or p61,p0=-1,r46
491	(p62) add r46=1,r46 }
492	{ .mfb; (p30) st8 [r14]=r47,8
493	(p0) nop.f 0x0
494	br.ctop.sptk .L_bn_mul_add_words_ctop};;	446	br.ctop.sptk .L_bn_mul_add_words_ctop};;
495	.L_bn_mul_add_words_cend:	447	.L_bn_mul_add_words_cend:
496		448
497	{ .mii; nop.m 0x0	449	{ .mmi; .pred.rel "mutex",p40,p42
498	.pred.rel "mutex",p53,p57	450	(p40) add r8=r35,r0
499	(p53) add r8=r38,r0	451	(p42) add r8=r35,r0,1
500	(p57) add r8=r38,r0,1 }	452	mov pr=r9,0x1ffff }
501	{ .mfb; nop.m 0x0	453	{ .mib; rum 1<<5 // clear um.mfh
502	nop.f 0x0	454	mov ar.lc=r3
503	nop.b 0x0 };;
504	{ .mii;
505	(p63) add r8=1,r8
506	mov pr=r9,0x1ffff
507	mov ar.lc=r3 }
508	{ .mfb; rum 1<<5 // clear um.mfh
509	nop.f 0x0
510	br.ret.sptk.many b0 };;	455	br.ret.sptk.many b0 };;
511	.endp bn_mul_add_words#	456	.endp bn_mul_add_words#
512	#endif	457	#endif
@@ -527,7 +472,8 @@ bn_sqr_words:
527	sxt4 r34=r34 };;	472	sxt4 r34=r34 };;
528	{ .mii; cmp.le p6,p0=r34,r0	473	{ .mii; cmp.le p6,p0=r34,r0
529	mov r8=r0 } // return value	474	mov r8=r0 } // return value
530	{ .mfb; nop.f 0x0	475	{ .mfb; ADDP r32=0,r32
		476	nop.f 0x0
531	(p6) br.ret.spnt.many b0 };;	477	(p6) br.ret.spnt.many b0 };;
532		478
533	.save ar.lc,r3	479	.save ar.lc,r3
@@ -536,11 +482,7 @@ bn_sqr_words:
536	mov r9=pr };;	482	mov r9=pr };;
537		483
538	.body	484	.body
539	#if defined(_HPUX_SOURCE) && defined(_ILP32)	485	{ .mib; ADDP r33=0,r33
540	{ .mii; addp4 r32=0,r32
541	addp4 r33=0,r33 };;
542	#endif
543	{ .mib;
544	mov pr.rot=1<<16	486	mov pr.rot=1<<16
545	brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16	487	brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
546	}	488	}
@@ -605,7 +547,7 @@ bn_sqr_comba8:
605	.prologue	547	.prologue
606	.fframe 0	548	.fframe 0
607	.save ar.pfs,r2	549	.save ar.pfs,r2
608	#if defined(_HPUX_SOURCE) && defined(_ILP32)	550	#if defined(_HPUX_SOURCE) && !defined(_LP64)
609	{ .mii; alloc r2=ar.pfs,2,1,0,0	551	{ .mii; alloc r2=ar.pfs,2,1,0,0
610	addp4 r33=0,r33	552	addp4 r33=0,r33
611	addp4 r32=0,r32 };;	553	addp4 r32=0,r32 };;
@@ -631,6 +573,10 @@ bn_sqr_comba8:
631	// clause in Itanium µ-architecture manual? Comments are welcomed and	573	// clause in Itanium µ-architecture manual? Comments are welcomed and
632	// highly appreciated.	574	// highly appreciated.
633	//	575	//
		576	// On Itanium 2 it takes ~190 ticks. This is because of stalls on
		577	// result from getf.sig. I do nothing about it at this point for
		578	// reasons depicted below.
		579	//
634	// However! It should be noted that even 160 ticks is darn good result	580	// However! It should be noted that even 160 ticks is darn good result
635	// as it's over 10 (yes, ten, spelled as t-e-n) times faster than the	581	// as it's over 10 (yes, ten, spelled as t-e-n) times faster than the
636	// C version (compiled with gcc with inline assembler). I really	582	// C version (compiled with gcc with inline assembler). I really
@@ -673,7 +619,7 @@ bn_mul_comba8:
673	.prologue	619	.prologue
674	.fframe 0	620	.fframe 0
675	.save ar.pfs,r2	621	.save ar.pfs,r2
676	#if defined(_HPUX_SOURCE) && defined(_ILP32)	622	#if defined(_HPUX_SOURCE) && !defined(_LP64)
677	{ .mii; alloc r2=ar.pfs,3,0,0,0	623	{ .mii; alloc r2=ar.pfs,3,0,0,0
678	addp4 r33=0,r33	624	addp4 r33=0,r33
679	addp4 r34=0,r34 };;	625	addp4 r34=0,r34 };;
@@ -1231,7 +1177,7 @@ bn_sqr_comba4:
1231	.prologue	1177	.prologue
1232	.fframe 0	1178	.fframe 0
1233	.save ar.pfs,r2	1179	.save ar.pfs,r2
1234	#if defined(_HPUX_SOURCE) && defined(_ILP32)	1180	#if defined(_HPUX_SOURCE) && !defined(_LP64)
1235	{ .mii; alloc r2=ar.pfs,2,1,0,0	1181	{ .mii; alloc r2=ar.pfs,2,1,0,0
1236	addp4 r32=0,r32	1182	addp4 r32=0,r32
1237	addp4 r33=0,r33 };;	1183	addp4 r33=0,r33 };;
@@ -1264,7 +1210,7 @@ bn_mul_comba4:
1264	.prologue	1210	.prologue
1265	.fframe 0	1211	.fframe 0
1266	.save ar.pfs,r2	1212	.save ar.pfs,r2
1267	#if defined(_HPUX_SOURCE) && defined(_ILP32)	1213	#if defined(_HPUX_SOURCE) && !defined(_LP64)
1268	{ .mii; alloc r2=ar.pfs,3,0,0,0	1214	{ .mii; alloc r2=ar.pfs,3,0,0,0
1269	addp4 r33=0,r33	1215	addp4 r33=0,r33
1270	addp4 r34=0,r34 };;	1216	addp4 r34=0,r34 };;
@@ -1448,8 +1394,8 @@ bn_mul_comba4:
1448	#define I r21	1394	#define I r21
1449		1395
1450	#if 0	1396	#if 0
1451	// Some preprocessors (most notably HP-UX) apper to be allergic to	1397	// Some preprocessors (most notably HP-UX) appear to be allergic to
1452	// macros enclosed to parenthesis as these three will be.	1398	// macros enclosed to parenthesis [as these three were].
1453	#define cont p16	1399	#define cont p16
1454	#define break p0 // p20	1400	#define break p0 // p20
1455	#define equ p24	1401	#define equ p24
@@ -1581,9 +1527,18 @@ bn_div_words:
1581	// output: f8 = (int)(a/b)	1527	// output: f8 = (int)(a/b)
1582	// clobbered: f8,f9,f10,f11,pred	1528	// clobbered: f8,f9,f10,f11,pred
1583	pred=p15	1529	pred=p15
1584	// This procedure is essentially Intel code and therefore is	1530	// One can argue that this snippet is copyrighted to Intel
1585	// copyrighted to Intel Corporation (I suppose...). It's sligtly	1531	// Corporation, as it's essentially identical to one of those
1586	// modified for specific needs.	1532	// found in "Divide, Square Root and Remainder" section at
		1533	// http://www.intel.com/software/products/opensource/libraries/num.htm.
		1534	// Yes, I admit that the referred code was used as template,
		1535	// but after I realized that there hardly is any other instruction
		1536	// sequence which would perform this operation. I mean I figure that
		1537	// any independent attempt to implement high-performance division
		1538	// will result in code virtually identical to the Intel code. It
		1539	// should be noted though that below division kernel is 1 cycle
		1540	// faster than Intel one (note commented splits:-), not to mention
		1541	// original prologue (rather lack of one) and epilogue.
1587	.align 32	1542	.align 32
1588	.skip 16	1543	.skip 16
1589	.L_udiv64_32_b6:	1544	.L_udiv64_32_b6: