diff options
author | jsing <> | 2016-09-04 14:06:46 +0000 |
---|---|---|
committer | jsing <> | 2016-09-04 14:06:46 +0000 |
commit | 392813b7d9ed86b80127b58bc6e108cc28530eca (patch) | |
tree | 8494faa8d6a64a635803db2bcff9d555fe5adcae /src | |
parent | becd55246777151f47f161f226165d6bbae02434 (diff) | |
download | openbsd-392813b7d9ed86b80127b58bc6e108cc28530eca.tar.gz openbsd-392813b7d9ed86b80127b58bc6e108cc28530eca.tar.bz2 openbsd-392813b7d9ed86b80127b58bc6e108cc28530eca.zip |
Less IA64.
ok deraadt@
Diffstat (limited to 'src')
-rw-r--r-- | src/lib/libcrypto/aes/asm/aes-ia64.S | 1123 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/asm/ia64-mont.pl | 851 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/asm/ia64.S | 1555 | ||||
-rw-r--r-- | src/lib/libcrypto/ia64cpuid.S | 121 | ||||
-rw-r--r-- | src/lib/libcrypto/md5/asm/md5-ia64.S | 992 | ||||
-rwxr-xr-x | src/lib/libcrypto/modes/asm/ghash-ia64.pl | 463 | ||||
-rw-r--r-- | src/lib/libcrypto/rc4/asm/rc4-ia64.pl | 755 | ||||
-rw-r--r-- | src/lib/libcrypto/sha/asm/sha1-ia64.pl | 305 | ||||
-rwxr-xr-x | src/lib/libcrypto/sha/asm/sha512-ia64.pl | 672 | ||||
-rw-r--r-- | src/lib/libcrypto/whrlpool/wp_block.c | 10 |
10 files changed, 1 insertions, 6846 deletions
diff --git a/src/lib/libcrypto/aes/asm/aes-ia64.S b/src/lib/libcrypto/aes/asm/aes-ia64.S deleted file mode 100644 index 7f6c4c3662..0000000000 --- a/src/lib/libcrypto/aes/asm/aes-ia64.S +++ /dev/null | |||
@@ -1,1123 +0,0 @@ | |||
1 | // ==================================================================== | ||
2 | // Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
3 | // project. Rights for redistribution and usage in source and binary | ||
4 | // forms are granted according to the OpenSSL license. | ||
5 | // ==================================================================== | ||
6 | // | ||
7 | // What's wrong with compiler generated code? Compiler never uses | ||
8 | // variable 'shr' which is pairable with 'extr'/'dep' instructions. | ||
9 | // Then it uses 'zxt' which is an I-type, but can be replaced with | ||
10 | // 'and' which in turn can be assigned to M-port [there are twice as | ||
11 | // many M-ports as there are I-ports on Itanium 2]. By sacrificing a few | ||
12 | // registers for small constants (255, 24 and 16) to be used with | ||
13 | // 'shr' and 'and' instructions I can achieve better ILP, Instruction | ||
14 | // Level Parallelism, and performance. This code outperforms GCC 3.3 | ||
15 | // generated code by over a factor of 2 (two), GCC 3.4 - by 70% and | ||
16 | // HP C - by 40%. Measured best-case scenario, i.e. aligned | ||
17 | // big-endian input, ECB timing on Itanium 2 is (18 + 13*rounds) | ||
18 | // ticks per block, or 9.25 CPU cycles per byte for 128 bit key. | ||
19 | |||
20 | // Version 1.2 mitigates the hazard of cache-timing attacks by | ||
21 | // a) compressing S-boxes from 8KB to 2KB+256B, b) scheduling | ||
22 | // references to S-boxes for L2 cache latency, c) prefetching T[ed]4 | ||
23 | // prior to last round. As a result performance dropped to (26 + 15*rounds) | ||
24 | // ticks per block or 11 cycles per byte processed with 128-bit key. | ||
25 | // This is ~16% deterioration. For reference Itanium 2 L1 cache has | ||
26 | // 64 bytes line size and L2 - 128 bytes... | ||
27 | |||
28 | .ident "aes-ia64.S, version 1.2" | ||
29 | .ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" | ||
30 | .explicit | ||
31 | .text | ||
32 | |||
33 | rk0=r8; rk1=r9; | ||
34 | |||
35 | pfssave=r2; | ||
36 | lcsave=r10; | ||
37 | prsave=r3; | ||
38 | maskff=r11; | ||
39 | twenty4=r14; | ||
40 | sixteen=r15; | ||
41 | |||
42 | te00=r16; te11=r17; te22=r18; te33=r19; | ||
43 | te01=r20; te12=r21; te23=r22; te30=r23; | ||
44 | te02=r24; te13=r25; te20=r26; te31=r27; | ||
45 | te03=r28; te10=r29; te21=r30; te32=r31; | ||
46 | |||
47 | // these are rotating (r32-r39, see the alloc in _ia64_AES_*)... | ||
48 | t0=r32; s0=r33; | ||
49 | t1=r34; s1=r35; | ||
50 | t2=r36; s2=r37; | ||
51 | t3=r38; s3=r39; | ||
52 | |||
53 | te0=r40; te1=r41; te2=r42; te3=r43; | ||
54 | |||
55 | #if defined(_HPUX_SOURCE) && !defined(_LP64) | ||
56 | # define ADDP addp4 | ||
57 | #else | ||
58 | # define ADDP add | ||
59 | #endif | ||
60 | |||
61 | // Byte offsets from Te0 added to form te1-te3 (TE1/TE3 swap on big-endian) | ||
62 | #define TE0 0 | ||
63 | #define TE2 2 | ||
64 | #if defined(_HPUX_SOURCE) || defined(B_ENDIAN) | ||
65 | #define TE1 3 | ||
66 | #define TE3 1 | ||
67 | #else | ||
68 | #define TE1 1 | ||
69 | #define TE3 3 | ||
70 | #endif | ||
71 | |||
72 | // This implies that AES_KEY comprises 32-bit key schedule elements | ||
73 | // even on LP64 platforms. | ||
74 | #ifndef KSZ | ||
75 | # define KSZ 4 | ||
76 | # define LDKEY ld4 | ||
77 | #endif | ||
78 | |||
79 | .proc _ia64_AES_encrypt# | ||
80 | // Input: rk0-rk1 (pointers to rk[0] and rk[1]) | ||
81 | // te0 (address of Te0) | ||
82 | // te3 as AES_KEY->rounds!!! | ||
83 | // s0-s3 | ||
84 | // maskff,twenty4,sixteen (constants 0xff, 24, 16) | ||
85 | // Output: r16,r20,r24,r28 as s0-s3 | ||
86 | // Clobber: r16-r31,rk0-rk1,r32-r43 | ||
87 | .align 32 | ||
88 | _ia64_AES_encrypt: | ||
89 | .prologue | ||
90 | .altrp b6 | ||
91 | .body | ||
92 | { .mmi; alloc r16=ar.pfs,12,0,0,8 | ||
93 | LDKEY t0=[rk0],2*KSZ | ||
94 | mov pr.rot=1<<16 } | ||
95 | { .mmi; LDKEY t1=[rk1],2*KSZ | ||
96 | add te1=TE1,te0 | ||
97 | add te3=-3,te3 };; | ||
98 | { .mib; LDKEY t2=[rk0],2*KSZ | ||
99 | mov ar.ec=2 } | ||
100 | { .mib; LDKEY t3=[rk1],2*KSZ | ||
101 | add te2=TE2,te0 | ||
102 | brp.loop.imp .Le_top,.Le_end-16 };; | ||
103 | |||
104 | { .mmi; xor s0=s0,t0 | ||
105 | xor s1=s1,t1 | ||
106 | mov ar.lc=te3 } | ||
107 | { .mmi; xor s2=s2,t2 | ||
108 | xor s3=s3,t3 | ||
109 | add te3=TE3,te0 };; | ||
110 | |||
111 | .align 32 | ||
112 | .Le_top: | ||
113 | { .mmi; (p0) LDKEY t0=[rk0],2*KSZ // 0/0:rk[0] | ||
114 | (p0) and te33=s3,maskff // 0/0:s3&0xff | ||
115 | (p0) extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff | ||
116 | { .mmi; (p0) LDKEY t1=[rk1],2*KSZ // 0/1:rk[1] | ||
117 | (p0) and te30=s0,maskff // 0/1:s0&0xff | ||
118 | (p0) shr.u te00=s0,twenty4 };; // 0/0:s0>>24 | ||
119 | { .mmi; (p0) LDKEY t2=[rk0],2*KSZ // 1/2:rk[2] | ||
120 | (p0) shladd te33=te33,3,te3 // 1/0:te3+s3&0xff | ||
121 | (p0) extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff | ||
122 | { .mmi; (p0) LDKEY t3=[rk1],2*KSZ // 1/3:rk[3] | ||
123 | (p0) shladd te30=te30,3,te3 // 1/1:te3+s0&0xff | ||
124 | (p0) shr.u te01=s1,twenty4 };; // 1/1:s1>>24 | ||
125 | { .mmi; (p0) ld4 te33=[te33] // 2/0:te3[s3&0xff] | ||
126 | (p0) shladd te22=te22,3,te2 // 2/0:te2+s2>>8&0xff | ||
127 | (p0) extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff | ||
128 | { .mmi; (p0) ld4 te30=[te30] // 2/1:te3[s0] | ||
129 | (p0) shladd te23=te23,3,te2 // 2/1:te2+s3>>8 | ||
130 | (p0) shr.u te02=s2,twenty4 };; // 2/2:s2>>24 | ||
131 | { .mmi; (p0) ld4 te22=[te22] // 3/0:te2[s2>>8] | ||
132 | (p0) shladd te20=te20,3,te2 // 3/2:te2+s0>>8 | ||
133 | (p0) extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff | ||
134 | { .mmi; (p0) ld4 te23=[te23] // 3/1:te2[s3>>8] | ||
135 | (p0) shladd te00=te00,3,te0 // 3/0:te0+s0>>24 | ||
136 | (p0) shr.u te03=s3,twenty4 };; // 3/3:s3>>24 | ||
137 | { .mmi; (p0) ld4 te20=[te20] // 4/2:te2[s0>>8] | ||
138 | (p0) shladd te21=te21,3,te2 // 4/3:te2+s1>>8 | ||
139 | (p0) extr.u te11=s1,16,8 } // 4/0:s1>>16&0xff | ||
140 | { .mmi; (p0) ld4 te00=[te00] // 4/0:te0[s0>>24] | ||
141 | (p0) shladd te01=te01,3,te0 // 4/1:te0+s1>>24 | ||
142 | (p0) shr.u te13=s3,sixteen };; // 4/2:s3>>16 | ||
143 | { .mmi; (p0) ld4 te21=[te21] // 5/3:te2[s1>>8] | ||
144 | (p0) shladd te11=te11,3,te1 // 5/0:te1+s1>>16 | ||
145 | (p0) extr.u te12=s2,16,8 } // 5/1:s2>>16&0xff | ||
146 | { .mmi; (p0) ld4 te01=[te01] // 5/1:te0[s1>>24] | ||
147 | (p0) shladd te02=te02,3,te0 // 5/2:te0+s2>>24 | ||
148 | (p0) and te31=s1,maskff };; // 5/2:s1&0xff | ||
149 | { .mmi; (p0) ld4 te11=[te11] // 6/0:te1[s1>>16] | ||
150 | (p0) shladd te12=te12,3,te1 // 6/1:te1+s2>>16 | ||
151 | (p0) extr.u te10=s0,16,8 } // 6/3:s0>>16&0xff | ||
152 | { .mmi; (p0) ld4 te02=[te02] // 6/2:te0[s2>>24] | ||
153 | (p0) shladd te03=te03,3,te0 // 6/3:te0+s3>>24 | ||
154 | (p0) and te32=s2,maskff };; // 6/3:s2&0xff | ||
155 | |||
156 | { .mmi; (p0) ld4 te12=[te12] // 7/1:te1[s2>>16] | ||
157 | (p0) shladd te31=te31,3,te3 // 7/2:te3+s1&0xff | ||
158 | (p0) and te13=te13,maskff} // 7/2:s3>>16&0xff | ||
159 | { .mmi; (p0) ld4 te03=[te03] // 7/3:te0[s3>>24] | ||
160 | (p0) shladd te32=te32,3,te3 // 7/3:te3+s2 | ||
161 | (p0) xor t0=t0,te33 };; // 7/0: | ||
162 | { .mmi; (p0) ld4 te31=[te31] // 8/2:te3[s1] | ||
163 | (p0) shladd te13=te13,3,te1 // 8/2:te1+s3>>16 | ||
164 | (p0) xor t0=t0,te22 } // 8/0: | ||
165 | { .mmi; (p0) ld4 te32=[te32] // 8/3:te3[s2] | ||
166 | (p0) shladd te10=te10,3,te1 // 8/3:te1+s0>>16 | ||
167 | (p0) xor t1=t1,te30 };; // 8/1: | ||
168 | { .mmi; (p0) ld4 te13=[te13] // 9/2:te1[s3>>16] | ||
169 | (p0) ld4 te10=[te10] // 9/3:te1[s0>>16] | ||
170 | (p0) xor t0=t0,te00 };; // 9/0: !L2 scheduling | ||
171 | { .mmi; (p0) xor t1=t1,te23 // 10[9]/1: | ||
172 | (p0) xor t2=t2,te20 // 10[9]/2: | ||
173 | (p0) xor t3=t3,te21 };; // 10[9]/3: | ||
174 | { .mmi; (p0) xor t0=t0,te11 // 11[10]/0:done! | ||
175 | (p0) xor t1=t1,te01 // 11[10]/1: | ||
176 | (p0) xor t2=t2,te02 };; // 11[10]/2: !L2 scheduling | ||
177 | { .mmi; (p0) xor t3=t3,te03 // 12[10]/3: | ||
178 | (p16) cmp.eq p0,p17=r0,r0 };; // 12[10]/clear (p17) | ||
179 | { .mmi; (p0) xor t1=t1,te12 // 13[11]/1:done! | ||
180 | (p0) xor t2=t2,te31 // 13[11]/2: | ||
181 | (p0) xor t3=t3,te32 } // 13[11]/3: | ||
182 | { .mmi; (p17) add te0=2048,te0 // 13[11]/te0=Te4 (Te0+2048) | ||
183 | (p17) add te1=2048+64-TE1,te1};; // 13[11]/te1=Te4+64 | ||
184 | { .mib; (p0) xor t2=t2,te13 // 14[12]/2:done! | ||
185 | (p17) add te2=2048+128-TE2,te2} // 14[12]/te2=Te4+128 | ||
186 | { .mib; (p0) xor t3=t3,te10 // 14[12]/3:done! | ||
187 | (p17) add te3=2048+192-TE3,te3 // 14[12]/te3=Te4+192 | ||
188 | br.ctop.sptk .Le_top };; | ||
189 | .Le_end: | ||
190 | |||
191 | |||
192 | { .mmi; ld8 te12=[te0] // prefetch Te4 (te0-te3 are 64 bytes apart here) | ||
193 | ld8 te31=[te1] } | ||
194 | { .mmi; ld8 te10=[te2] | ||
195 | ld8 te32=[te3] } | ||
196 | |||
197 | { .mmi; LDKEY t0=[rk0],2*KSZ // 0/0:rk[0] | ||
198 | and te33=s3,maskff // 0/0:s3&0xff | ||
199 | extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff | ||
200 | { .mmi; LDKEY t1=[rk1],2*KSZ // 0/1:rk[1] | ||
201 | and te30=s0,maskff // 0/1:s0&0xff | ||
202 | shr.u te00=s0,twenty4 };; // 0/0:s0>>24 | ||
203 | { .mmi; LDKEY t2=[rk0],2*KSZ // 1/2:rk[2] | ||
204 | add te33=te33,te0 // 1/0:te0+s3&0xff | ||
205 | extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff | ||
206 | { .mmi; LDKEY t3=[rk1],2*KSZ // 1/3:rk[3] | ||
207 | add te30=te30,te0 // 1/1:te0+s0&0xff | ||
208 | shr.u te01=s1,twenty4 };; // 1/1:s1>>24 | ||
209 | { .mmi; ld1 te33=[te33] // 2/0:te0[s3&0xff] | ||
210 | add te22=te22,te0 // 2/0:te0+s2>>8&0xff | ||
211 | extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff | ||
212 | { .mmi; ld1 te30=[te30] // 2/1:te0[s0] | ||
213 | add te23=te23,te0 // 2/1:te0+s3>>8 | ||
214 | shr.u te02=s2,twenty4 };; // 2/2:s2>>24 | ||
215 | { .mmi; ld1 te22=[te22] // 3/0:te0[s2>>8] | ||
216 | add te20=te20,te0 // 3/2:te0+s0>>8 | ||
217 | extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff | ||
218 | { .mmi; ld1 te23=[te23] // 3/1:te0[s3>>8] | ||
219 | add te00=te00,te0 // 3/0:te0+s0>>24 | ||
220 | shr.u te03=s3,twenty4 };; // 3/3:s3>>24 | ||
221 | { .mmi; ld1 te20=[te20] // 4/2:te0[s0>>8] | ||
222 | add te21=te21,te0 // 4/3:te0+s1>>8 | ||
223 | extr.u te11=s1,16,8 } // 4/0:s1>>16&0xff | ||
224 | { .mmi; ld1 te00=[te00] // 4/0:te0[s0>>24] | ||
225 | add te01=te01,te0 // 4/1:te0+s1>>24 | ||
226 | shr.u te13=s3,sixteen };; // 4/2:s3>>16 | ||
227 | { .mmi; ld1 te21=[te21] // 5/3:te0[s1>>8] | ||
228 | add te11=te11,te0 // 5/0:te0+s1>>16 | ||
229 | extr.u te12=s2,16,8 } // 5/1:s2>>16&0xff | ||
230 | { .mmi; ld1 te01=[te01] // 5/1:te0[s1>>24] | ||
231 | add te02=te02,te0 // 5/2:te0+s2>>24 | ||
232 | and te31=s1,maskff };; // 5/2:s1&0xff | ||
233 | { .mmi; ld1 te11=[te11] // 6/0:te0[s1>>16] | ||
234 | add te12=te12,te0 // 6/1:te0+s2>>16 | ||
235 | extr.u te10=s0,16,8 } // 6/3:s0>>16&0xff | ||
236 | { .mmi; ld1 te02=[te02] // 6/2:te0[s2>>24] | ||
237 | add te03=te03,te0 // 6/3:te0+s3>>24 | ||
238 | and te32=s2,maskff };; // 6/3:s2&0xff | ||
239 | |||
240 | { .mmi; ld1 te12=[te12] // 7/1:te0[s2>>16] | ||
241 | add te31=te31,te0 // 7/2:te0+s1&0xff | ||
242 | dep te33=te22,te33,8,8} // 7/0: | ||
243 | { .mmi; ld1 te03=[te03] // 7/3:te0[s3>>24] | ||
244 | add te32=te32,te0 // 7/3:te0+s2 | ||
245 | and te13=te13,maskff};; // 7/2:s3>>16&0xff | ||
246 | { .mmi; ld1 te31=[te31] // 8/2:te0[s1] | ||
247 | add te13=te13,te0 // 8/2:te0+s3>>16 | ||
248 | dep te30=te23,te30,8,8} // 8/1: | ||
249 | { .mmi; ld1 te32=[te32] // 8/3:te0[s2] | ||
250 | add te10=te10,te0 // 8/3:te0+s0>>16 | ||
251 | shl te00=te00,twenty4};; // 8/0: | ||
252 | { .mii; ld1 te13=[te13] // 9/2:te0[s3>>16] | ||
253 | dep te33=te11,te33,16,8 // 9/0: | ||
254 | shl te01=te01,twenty4};; // 9/1: | ||
255 | { .mii; ld1 te10=[te10] // 10/3:te0[s0>>16] | ||
256 | dep te31=te20,te31,8,8 // 10/2: | ||
257 | shl te02=te02,twenty4};; // 10/2: | ||
258 | { .mii; xor t0=t0,te33 // 11/0: | ||
259 | dep te32=te21,te32,8,8 // 11/3: | ||
260 | shl te12=te12,sixteen};; // 11/1: | ||
261 | { .mii; xor r16=t0,te00 // 12/0:done! | ||
262 | dep te31=te13,te31,16,8 // 12/2: | ||
263 | shl te03=te03,twenty4};; // 12/3: | ||
264 | { .mmi; xor t1=t1,te01 // 13/1: | ||
265 | xor t2=t2,te02 // 13/2: | ||
266 | dep te32=te10,te32,16,8};; // 13/3: | ||
267 | { .mmi; xor t1=t1,te30 // 14/1: | ||
268 | xor r24=t2,te31 // 14/2:done! | ||
269 | xor t3=t3,te32 };; // 14/3: | ||
270 | { .mib; xor r20=t1,te12 // 15/1:done! | ||
271 | xor r28=t3,te03 // 15/3:done! | ||
272 | br.ret.sptk b6 };; | ||
273 | .endp _ia64_AES_encrypt# | ||
274 | |||
275 | // void AES_encrypt (const void *in,void *out,const AES_KEY *key); | ||
276 | .global AES_encrypt# | ||
277 | .proc AES_encrypt# | ||
278 | .align 32 | ||
279 | AES_encrypt: | ||
280 | .prologue | ||
281 | .save ar.pfs,pfssave | ||
282 | { .mmi; alloc pfssave=ar.pfs,3,1,12,0 | ||
283 | and out0=3,in0 | ||
284 | mov r3=ip } | ||
285 | { .mmi; ADDP in0=0,in0 | ||
286 | mov loc0=psr.um | ||
287 | ADDP out11=KSZ*60,in2 };; // &AES_KEY->rounds | ||
288 | |||
289 | { .mmi; ld4 out11=[out11] // AES_KEY->rounds (te3 in callee) | ||
290 | add out8=(AES_Te#-AES_encrypt#),r3 // Te0, ip-relative (te0 in callee) | ||
291 | .save pr,prsave | ||
292 | mov prsave=pr } | ||
293 | { .mmi; rum 1<<3 // clear um.ac | ||
294 | .save ar.lc,lcsave | ||
295 | mov lcsave=ar.lc };; | ||
296 | |||
297 | .body | ||
298 | #if defined(_HPUX_SOURCE) // HPUX is big-endian, cut 15+15 cycles... | ||
299 | { .mib; cmp.ne p6,p0=out0,r0 | ||
300 | add out0=4,in0 | ||
301 | (p6) br.dpnt.many .Le_i_unaligned };; | ||
302 | |||
303 | { .mmi; ld4 out1=[in0],8 // s0 | ||
304 | and out9=3,in1 | ||
305 | mov twenty4=24 } | ||
306 | { .mmi; ld4 out3=[out0],8 // s1 | ||
307 | ADDP rk0=0,in2 | ||
308 | mov sixteen=16 };; | ||
309 | { .mmi; ld4 out5=[in0] // s2 | ||
310 | cmp.ne p6,p0=out9,r0 | ||
311 | mov maskff=0xff } | ||
312 | { .mmb; ld4 out7=[out0] // s3 | ||
313 | ADDP rk1=KSZ,in2 | ||
314 | br.call.sptk.many b6=_ia64_AES_encrypt };; | ||
315 | |||
316 | { .mib; ADDP in0=4,in1 | ||
317 | ADDP in1=0,in1 | ||
318 | (p6) br.spnt .Le_o_unaligned };; | ||
319 | |||
320 | { .mii; mov psr.um=loc0 | ||
321 | mov ar.pfs=pfssave | ||
322 | mov ar.lc=lcsave };; | ||
323 | { .mmi; st4 [in1]=r16,8 // s0 | ||
324 | st4 [in0]=r20,8 // s1 | ||
325 | mov pr=prsave,0x1ffff };; | ||
326 | { .mmb; st4 [in1]=r24 // s2 | ||
327 | st4 [in0]=r28 // s3 | ||
328 | br.ret.sptk.many b0 };; | ||
329 | #endif | ||
330 | |||
331 | .align 32 | ||
332 | .Le_i_unaligned: | ||
333 | { .mmi; add out0=1,in0 | ||
334 | add out2=2,in0 | ||
335 | add out4=3,in0 };; | ||
336 | { .mmi; ld1 r16=[in0],4 | ||
337 | ld1 r17=[out0],4 }//;; | ||
338 | { .mmi; ld1 r18=[out2],4 | ||
339 | ld1 out1=[out4],4 };; // s0 | ||
340 | { .mmi; ld1 r20=[in0],4 | ||
341 | ld1 r21=[out0],4 }//;; | ||
342 | { .mmi; ld1 r22=[out2],4 | ||
343 | ld1 out3=[out4],4 };; // s1 | ||
344 | { .mmi; ld1 r24=[in0],4 | ||
345 | ld1 r25=[out0],4 }//;; | ||
346 | { .mmi; ld1 r26=[out2],4 | ||
347 | ld1 out5=[out4],4 };; // s2 | ||
348 | { .mmi; ld1 r28=[in0] | ||
349 | ld1 r29=[out0] }//;; | ||
350 | { .mmi; ld1 r30=[out2] | ||
351 | ld1 out7=[out4] };; // s3 | ||
352 | |||
353 | { .mii; | ||
354 | dep out1=r16,out1,24,8 //;; | ||
355 | dep out3=r20,out3,24,8 }//;; | ||
356 | { .mii; ADDP rk0=0,in2 | ||
357 | dep out5=r24,out5,24,8 //;; | ||
358 | dep out7=r28,out7,24,8 };; | ||
359 | { .mii; ADDP rk1=KSZ,in2 | ||
360 | dep out1=r17,out1,16,8 //;; | ||
361 | dep out3=r21,out3,16,8 }//;; | ||
362 | { .mii; mov twenty4=24 | ||
363 | dep out5=r25,out5,16,8 //;; | ||
364 | dep out7=r29,out7,16,8 };; | ||
365 | { .mii; mov sixteen=16 | ||
366 | dep out1=r18,out1,8,8 //;; | ||
367 | dep out3=r22,out3,8,8 }//;; | ||
368 | { .mii; mov maskff=0xff | ||
369 | dep out5=r26,out5,8,8 //;; | ||
370 | dep out7=r30,out7,8,8 };; | ||
371 | |||
372 | { .mib; br.call.sptk.many b6=_ia64_AES_encrypt };; | ||
373 | |||
374 | .Le_o_unaligned: | ||
375 | { .mii; ADDP out0=0,in1 | ||
376 | extr.u r17=r16,8,8 // s0 | ||
377 | shr.u r19=r16,twenty4 }//;; | ||
378 | { .mii; ADDP out1=1,in1 | ||
379 | extr.u r18=r16,16,8 | ||
380 | shr.u r23=r20,twenty4 }//;; // s1 | ||
381 | { .mii; ADDP out2=2,in1 | ||
382 | extr.u r21=r20,8,8 | ||
383 | shr.u r22=r20,sixteen }//;; | ||
384 | { .mii; ADDP out3=3,in1 | ||
385 | extr.u r25=r24,8,8 // s2 | ||
386 | shr.u r27=r24,twenty4 };; | ||
387 | { .mii; st1 [out3]=r16,4 | ||
388 | extr.u r26=r24,16,8 | ||
389 | shr.u r31=r28,twenty4 }//;; // s3 | ||
390 | { .mii; st1 [out2]=r17,4 | ||
391 | extr.u r29=r28,8,8 | ||
392 | shr.u r30=r28,sixteen }//;; | ||
393 | |||
394 | { .mmi; st1 [out1]=r18,4 | ||
395 | st1 [out0]=r19,4 };; | ||
396 | { .mmi; st1 [out3]=r20,4 | ||
397 | st1 [out2]=r21,4 }//;; | ||
398 | { .mmi; st1 [out1]=r22,4 | ||
399 | st1 [out0]=r23,4 };; | ||
400 | { .mmi; st1 [out3]=r24,4 | ||
401 | st1 [out2]=r25,4 | ||
402 | mov pr=prsave,0x1ffff }//;; | ||
403 | { .mmi; st1 [out1]=r26,4 | ||
404 | st1 [out0]=r27,4 | ||
405 | mov ar.pfs=pfssave };; | ||
406 | { .mmi; st1 [out3]=r28 | ||
407 | st1 [out2]=r29 | ||
408 | mov ar.lc=lcsave }//;; | ||
409 | { .mmi; st1 [out1]=r30 | ||
410 | st1 [out0]=r31 } | ||
411 | { .mfb; mov psr.um=loc0 // restore user mask saved in loc0 | ||
412 | br.ret.sptk.many b0 };; | ||
413 | .endp AES_encrypt# | ||
414 | |||
415 | // *AES_decrypt are autogenerated from *AES_encrypt by the following script: | ||
416 | #if 0 | ||
417 | #!/usr/bin/env perl | ||
418 | print "// *AES_decrypt are autogenerated by the following script:\n#if 0\n"; | ||
419 | open(PROG,'<'.$0); while(<PROG>) { print; } close(PROG); | ||
420 | print "#endif\n"; | ||
421 | while(<>) { | ||
422 | $process=1 if (/\.proc\s+_ia64_AES_encrypt/); | ||
423 | next if (!$process); | ||
424 | |||
425 | #s/te00=s0/td00=s0/; s/te00/td00/g; | ||
426 | s/te11=s1/td13=s3/; s/te11/td13/g; | ||
427 | #s/te22=s2/td22=s2/; s/te22/td22/g; | ||
428 | s/te33=s3/td31=s1/; s/te33/td31/g; | ||
429 | |||
430 | #s/te01=s1/td01=s1/; s/te01/td01/g; | ||
431 | s/te12=s2/td10=s0/; s/te12/td10/g; | ||
432 | #s/te23=s3/td23=s3/; s/te23/td23/g; | ||
433 | s/te30=s0/td32=s2/; s/te30/td32/g; | ||
434 | |||
435 | #s/te02=s2/td02=s2/; s/te02/td02/g; | ||
436 | s/te13=s3/td11=s1/; s/te13/td11/g; | ||
437 | #s/te20=s0/td20=s0/; s/te20/td20/g; | ||
438 | s/te31=s1/td33=s3/; s/te31/td33/g; | ||
439 | |||
440 | #s/te03=s3/td03=s3/; s/te03/td03/g; | ||
441 | s/te10=s0/td12=s2/; s/te10/td12/g; | ||
442 | #s/te21=s1/td21=s1/; s/te21/td21/g; | ||
443 | s/te32=s2/td30=s0/; s/te32/td30/g; | ||
444 | |||
445 | s/td/te/g; | ||
446 | |||
447 | s/AES_encrypt/AES_decrypt/g; | ||
448 | s/\.Le_/.Ld_/g; | ||
449 | s/AES_Te#/AES_Td#/g; | ||
450 | |||
451 | print; | ||
452 | |||
453 | exit if (/\.endp\s+AES_decrypt/); | ||
454 | } | ||
455 | #endif | ||
456 | .proc _ia64_AES_decrypt# | ||
457 | // Input: rk0-rk1 (pointers to rk[0] and rk[1]) | ||
458 | // te0 (address of Td0) | ||
459 | // te3 as AES_KEY->rounds!!! | ||
460 | // s0-s3 | ||
461 | // maskff,twenty4,sixteen (constants 0xff, 24, 16) | ||
462 | // Output: r16,r20,r24,r28 as s0-s3 | ||
463 | // Clobber: r16-r31,rk0-rk1,r32-r43 | ||
464 | .align 32 | ||
465 | _ia64_AES_decrypt: | ||
466 | .prologue | ||
467 | .altrp b6 | ||
468 | .body | ||
469 | { .mmi; alloc r16=ar.pfs,12,0,0,8 | ||
470 | LDKEY t0=[rk0],2*KSZ | ||
471 | mov pr.rot=1<<16 } | ||
472 | { .mmi; LDKEY t1=[rk1],2*KSZ | ||
473 | add te1=TE1,te0 | ||
474 | add te3=-3,te3 };; | ||
475 | { .mib; LDKEY t2=[rk0],2*KSZ | ||
476 | mov ar.ec=2 } | ||
477 | { .mib; LDKEY t3=[rk1],2*KSZ | ||
478 | add te2=TE2,te0 | ||
479 | brp.loop.imp .Ld_top,.Ld_end-16 };; | ||
480 | |||
481 | { .mmi; xor s0=s0,t0 | ||
482 | xor s1=s1,t1 | ||
483 | mov ar.lc=te3 } | ||
484 | { .mmi; xor s2=s2,t2 | ||
485 | xor s3=s3,t3 | ||
486 | add te3=TE3,te0 };; | ||
487 | |||
488 | .align 32 | ||
489 | .Ld_top: | ||
490 | { .mmi; (p0) LDKEY t0=[rk0],2*KSZ // 0/0:rk[0] | ||
491 | (p0) and te31=s1,maskff // 0/0:s1&0xff | ||
492 | (p0) extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff | ||
493 | { .mmi; (p0) LDKEY t1=[rk1],2*KSZ // 0/1:rk[1] | ||
494 | (p0) and te32=s2,maskff // 0/1:s2&0xff | ||
495 | (p0) shr.u te00=s0,twenty4 };; // 0/0:s0>>24 | ||
496 | { .mmi; (p0) LDKEY t2=[rk0],2*KSZ // 1/2:rk[2] | ||
497 | (p0) shladd te31=te31,3,te3 // 1/0:te3+s1&0xff | ||
498 | (p0) extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff | ||
499 | { .mmi; (p0) LDKEY t3=[rk1],2*KSZ // 1/3:rk[3] | ||
500 | (p0) shladd te32=te32,3,te3 // 1/1:te3+s2&0xff | ||
501 | (p0) shr.u te01=s1,twenty4 };; // 1/1:s1>>24 | ||
502 | { .mmi; (p0) ld4 te31=[te31] // 2/0:te3[s1&0xff] | ||
503 | (p0) shladd te22=te22,3,te2 // 2/0:te2+s2>>8&0xff | ||
504 | (p0) extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff | ||
505 | { .mmi; (p0) ld4 te32=[te32] // 2/1:te3[s2&0xff] | ||
506 | (p0) shladd te23=te23,3,te2 // 2/1:te2+s3>>8 | ||
507 | (p0) shr.u te02=s2,twenty4 };; // 2/2:s2>>24 | ||
508 | { .mmi; (p0) ld4 te22=[te22] // 3/0:te2[s2>>8] | ||
509 | (p0) shladd te20=te20,3,te2 // 3/2:te2+s0>>8 | ||
510 | (p0) extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff | ||
511 | { .mmi; (p0) ld4 te23=[te23] // 3/1:te2[s3>>8] | ||
512 | (p0) shladd te00=te00,3,te0 // 3/0:te0+s0>>24 | ||
513 | (p0) shr.u te03=s3,twenty4 };; // 3/3:s3>>24 | ||
514 | { .mmi; (p0) ld4 te20=[te20] // 4/2:te2[s0>>8] | ||
515 | (p0) shladd te21=te21,3,te2 // 4/3:te2+s1>>8 | ||
516 | (p0) extr.u te13=s3,16,8 } // 4/0:s3>>16&0xff | ||
517 | { .mmi; (p0) ld4 te00=[te00] // 4/0:te0[s0>>24] | ||
518 | (p0) shladd te01=te01,3,te0 // 4/1:te0+s1>>24 | ||
519 | (p0) shr.u te11=s1,sixteen };; // 4/2:s1>>16 | ||
520 | { .mmi; (p0) ld4 te21=[te21] // 5/3:te2[s1>>8] | ||
521 | (p0) shladd te13=te13,3,te1 // 5/0:te1+s3>>16 | ||
522 | (p0) extr.u te10=s0,16,8 } // 5/1:s0>>16&0xff | ||
523 | { .mmi; (p0) ld4 te01=[te01] // 5/1:te0[s1>>24] | ||
524 | (p0) shladd te02=te02,3,te0 // 5/2:te0+s2>>24 | ||
525 | (p0) and te33=s3,maskff };; // 5/2:s3&0xff | ||
526 | { .mmi; (p0) ld4 te13=[te13] // 6/0:te1[s3>>16] | ||
527 | (p0) shladd te10=te10,3,te1 // 6/1:te1+s0>>16 | ||
528 | (p0) extr.u te12=s2,16,8 } // 6/3:s2>>16&0xff | ||
529 | { .mmi; (p0) ld4 te02=[te02] // 6/2:te0[s2>>24] | ||
530 | (p0) shladd te03=te03,3,te0 // 6/3:te0+s3>>24 | ||
531 | (p0) and te30=s0,maskff };; // 6/3:s0&0xff | ||
532 | |||
533 | { .mmi; (p0) ld4 te10=[te10] // 7/1:te1[s0>>16] | ||
534 | (p0) shladd te33=te33,3,te3 // 7/2:te3+s3&0xff | ||
535 | (p0) and te11=te11,maskff} // 7/2:s1>>16&0xff | ||
536 | { .mmi; (p0) ld4 te03=[te03] // 7/3:te0[s3>>24] | ||
537 | (p0) shladd te30=te30,3,te3 // 7/3:te3+s0&0xff | ||
538 | (p0) xor t0=t0,te31 };; // 7/0: | ||
539 | { .mmi; (p0) ld4 te33=[te33] // 8/2:te3[s3&0xff] | ||
540 | (p0) shladd te11=te11,3,te1 // 8/2:te1+s1>>16 | ||
541 | (p0) xor t0=t0,te22 } // 8/0: | ||
542 | { .mmi; (p0) ld4 te30=[te30] // 8/3:te3[s0&0xff] | ||
543 | (p0) shladd te12=te12,3,te1 // 8/3:te1+s2>>16 | ||
544 | (p0) xor t1=t1,te32 };; // 8/1: | ||
545 | { .mmi; (p0) ld4 te11=[te11] // 9/2:te1[s1>>16] | ||
546 | (p0) ld4 te12=[te12] // 9/3:te1[s2>>16] | ||
547 | (p0) xor t0=t0,te00 };; // 9/0: !L2 scheduling | ||
548 | { .mmi; (p0) xor t1=t1,te23 // 10[9]/1: | ||
549 | (p0) xor t2=t2,te20 // 10[9]/2: | ||
550 | (p0) xor t3=t3,te21 };; // 10[9]/3: | ||
551 | { .mmi; (p0) xor t0=t0,te13 // 11[10]/0:done! | ||
552 | (p0) xor t1=t1,te01 // 11[10]/1: | ||
553 | (p0) xor t2=t2,te02 };; // 11[10]/2: !L2 scheduling | ||
554 | { .mmi; (p0) xor t3=t3,te03 // 12[10]/3: | ||
555 | (p16) cmp.eq p0,p17=r0,r0 };; // 12[10]/clear (p17) | ||
556 | { .mmi; (p0) xor t1=t1,te10 // 13[11]/1:done! | ||
557 | (p0) xor t2=t2,te33 // 13[11]/2: | ||
558 | (p0) xor t3=t3,te30 } // 13[11]/3: | ||
559 | { .mmi; (p17) add te0=2048,te0 // 13[11]/te0=Td4 (Td0+2048) | ||
560 | (p17) add te1=2048+64-TE1,te1};; // 13[11]/te1=Td4+64 | ||
561 | { .mib; (p0) xor t2=t2,te11 // 14[12]/2:done! | ||
562 | (p17) add te2=2048+128-TE2,te2} // 14[12]/te2=Td4+128 | ||
563 | { .mib; (p0) xor t3=t3,te12 // 14[12]/3:done! | ||
564 | (p17) add te3=2048+192-TE3,te3 // 14[12]/te3=Td4+192 | ||
565 | br.ctop.sptk .Ld_top };; | ||
566 | .Ld_end: | ||
567 | |||
568 | |||
569 | { .mmi; ld8 te10=[te0] // prefetch Td4 (te0-te3 are 64 bytes apart here) | ||
570 | ld8 te33=[te1] } | ||
571 | { .mmi; ld8 te12=[te2] | ||
572 | ld8 te30=[te3] } | ||
573 | |||
574 | { .mmi; LDKEY t0=[rk0],2*KSZ // 0/0:rk[0] | ||
575 | and te31=s1,maskff // 0/0:s1&0xff | ||
576 | extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff | ||
577 | { .mmi; LDKEY t1=[rk1],2*KSZ // 0/1:rk[1] | ||
578 | and te32=s2,maskff // 0/1:s2&0xff | ||
579 | shr.u te00=s0,twenty4 };; // 0/0:s0>>24 | ||
580 | { .mmi; LDKEY t2=[rk0],2*KSZ // 1/2:rk[2] | ||
581 | add te31=te31,te0 // 1/0:te0+s1&0xff | ||
582 | extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff | ||
583 | { .mmi; LDKEY t3=[rk1],2*KSZ // 1/3:rk[3] | ||
584 | add te32=te32,te0 // 1/1:te0+s2&0xff | ||
585 | shr.u te01=s1,twenty4 };; // 1/1:s1>>24 | ||
586 | { .mmi; ld1 te31=[te31] // 2/0:te0[s1&0xff] | ||
587 | add te22=te22,te0 // 2/0:te0+s2>>8&0xff | ||
588 | extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff | ||
589 | { .mmi; ld1 te32=[te32] // 2/1:te0[s2&0xff] | ||
590 | add te23=te23,te0 // 2/1:te0+s3>>8 | ||
591 | shr.u te02=s2,twenty4 };; // 2/2:s2>>24 | ||
592 | { .mmi; ld1 te22=[te22] // 3/0:te0[s2>>8] | ||
593 | add te20=te20,te0 // 3/2:te0+s0>>8 | ||
594 | extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff | ||
595 | { .mmi; ld1 te23=[te23] // 3/1:te0[s3>>8] | ||
596 | add te00=te00,te0 // 3/0:te0+s0>>24 | ||
597 | shr.u te03=s3,twenty4 };; // 3/3:s3>>24 | ||
598 | { .mmi; ld1 te20=[te20] // 4/2:te0[s0>>8] | ||
599 | add te21=te21,te0 // 4/3:te0+s1>>8 | ||
600 | extr.u te13=s3,16,8 } // 4/0:s3>>16&0xff | ||
601 | { .mmi; ld1 te00=[te00] // 4/0:te0[s0>>24] | ||
602 | add te01=te01,te0 // 4/1:te0+s1>>24 | ||
603 | shr.u te11=s1,sixteen };; // 4/2:s1>>16 | ||
604 | { .mmi; ld1 te21=[te21] // 5/3:te0[s1>>8] | ||
605 | add te13=te13,te0 // 5/0:te0+s3>>16 | ||
606 | extr.u te10=s0,16,8 } // 5/1:s0>>16&0xff | ||
607 | { .mmi; ld1 te01=[te01] // 5/1:te0[s1>>24] | ||
608 | add te02=te02,te0 // 5/2:te0+s2>>24 | ||
609 | and te33=s3,maskff };; // 5/2:s3&0xff | ||
610 | { .mmi; ld1 te13=[te13] // 6/0:te0[s3>>16] | ||
611 | add te10=te10,te0 // 6/1:te0+s0>>16 | ||
612 | extr.u te12=s2,16,8 } // 6/3:s2>>16&0xff | ||
613 | { .mmi; ld1 te02=[te02] // 6/2:te0[s2>>24] | ||
614 | add te03=te03,te0 // 6/3:te0+s3>>24 | ||
615 | and te30=s0,maskff };; // 6/3:s0&0xff | ||
616 | |||
617 | { .mmi; ld1 te10=[te10] // 7/1:te0[s0>>16] | ||
618 | add te33=te33,te0 // 7/2:te0+s3&0xff | ||
619 | dep te31=te22,te31,8,8} // 7/0: | ||
620 | { .mmi; ld1 te03=[te03] // 7/3:te0[s3>>24] | ||
621 | add te30=te30,te0 // 7/3:te0+s0&0xff | ||
622 | and te11=te11,maskff};; // 7/2:s1>>16&0xff | ||
623 | { .mmi; ld1 te33=[te33] // 8/2:te0[s3&0xff] | ||
624 | add te11=te11,te0 // 8/2:te0+s1>>16 | ||
625 | dep te32=te23,te32,8,8} // 8/1: | ||
626 | { .mmi; ld1 te30=[te30] // 8/3:te0[s0&0xff] | ||
627 | add te12=te12,te0 // 8/3:te0+s2>>16 | ||
628 | shl te00=te00,twenty4};; // 8/0: | ||
629 | { .mii; ld1 te11=[te11] // 9/2:te0[s1>>16] | ||
630 | dep te31=te13,te31,16,8 // 9/0: | ||
631 | shl te01=te01,twenty4};; // 9/1: | ||
632 | { .mii; ld1 te12=[te12] // 10/3:te0[s2>>16] | ||
633 | dep te33=te20,te33,8,8 // 10/2: | ||
634 | shl te02=te02,twenty4};; // 10/2: | ||
635 | { .mii; xor t0=t0,te31 // 11/0: | ||
636 | dep te30=te21,te30,8,8 // 11/3: | ||
637 | shl te10=te10,sixteen};; // 11/1: | ||
638 | { .mii; xor r16=t0,te00 // 12/0:done! | ||
639 | dep te33=te11,te33,16,8 // 12/2: | ||
640 | shl te03=te03,twenty4};; // 12/3: | ||
641 | { .mmi; xor t1=t1,te01 // 13/1: | ||
642 | xor t2=t2,te02 // 13/2: | ||
643 | dep te30=te12,te30,16,8};; // 13/3: | ||
644 | { .mmi; xor t1=t1,te32 // 14/1: | ||
645 | xor r24=t2,te33 // 14/2:done! | ||
646 | xor t3=t3,te30 };; // 14/3: | ||
647 | { .mib; xor r20=t1,te10 // 15/1:done! | ||
648 | xor r28=t3,te03 // 15/3:done! | ||
649 | br.ret.sptk b6 };; | ||
650 | .endp _ia64_AES_decrypt# | ||
651 | |||
652 | // void AES_decrypt (const void *in,void *out,const AES_KEY *key); | ||
653 | .global AES_decrypt# | ||
654 | .proc AES_decrypt# | ||
655 | .align 32 | ||
656 | AES_decrypt: | ||
657 | .prologue | ||
658 | .save ar.pfs,pfssave | ||
659 | { .mmi; alloc pfssave=ar.pfs,3,1,12,0 | ||
660 | and out0=3,in0 | ||
661 | mov r3=ip } | ||
662 | { .mmi; ADDP in0=0,in0 | ||
663 | mov loc0=psr.um | ||
664 | ADDP out11=KSZ*60,in2 };; // &AES_KEY->rounds | ||
665 | |||
666 | { .mmi; ld4 out11=[out11] // AES_KEY->rounds (te3 in callee) | ||
667 | add out8=(AES_Td#-AES_decrypt#),r3 // Td0, ip-relative (te0 in callee) | ||
668 | .save pr,prsave | ||
669 | mov prsave=pr } | ||
670 | { .mmi; rum 1<<3 // clear um.ac | ||
671 | .save ar.lc,lcsave | ||
672 | mov lcsave=ar.lc };; | ||
673 | |||
674 | .body | ||
675 | #if defined(_HPUX_SOURCE) // HPUX is big-endian, cut 15+15 cycles... | ||
676 | { .mib; cmp.ne p6,p0=out0,r0 | ||
677 | add out0=4,in0 | ||
678 | (p6) br.dpnt.many .Ld_i_unaligned };; | ||
679 | |||
680 | { .mmi; ld4 out1=[in0],8 // s0 | ||
681 | and out9=3,in1 | ||
682 | mov twenty4=24 } | ||
683 | { .mmi; ld4 out3=[out0],8 // s1 | ||
684 | ADDP rk0=0,in2 | ||
685 | mov sixteen=16 };; | ||
686 | { .mmi; ld4 out5=[in0] // s2 | ||
687 | cmp.ne p6,p0=out9,r0 | ||
688 | mov maskff=0xff } | ||
689 | { .mmb; ld4 out7=[out0] // s3 | ||
690 | ADDP rk1=KSZ,in2 | ||
691 | br.call.sptk.many b6=_ia64_AES_decrypt };; | ||
692 | |||
693 | { .mib; ADDP in0=4,in1 | ||
694 | ADDP in1=0,in1 | ||
695 | (p6) br.spnt .Ld_o_unaligned };; | ||
696 | |||
697 | { .mii; mov psr.um=loc0 | ||
698 | mov ar.pfs=pfssave | ||
699 | mov ar.lc=lcsave };; | ||
700 | { .mmi; st4 [in1]=r16,8 // s0 | ||
701 | st4 [in0]=r20,8 // s1 | ||
702 | mov pr=prsave,0x1ffff };; | ||
703 | { .mmb; st4 [in1]=r24 // s2 | ||
704 | st4 [in0]=r28 // s3 | ||
705 | br.ret.sptk.many b0 };; | ||
706 | #endif | ||
707 | |||
708 | .align 32 | ||
709 | .Ld_i_unaligned: | ||
710 | { .mmi; add out0=1,in0 | ||
711 | add out2=2,in0 | ||
712 | add out4=3,in0 };; | ||
713 | { .mmi; ld1 r16=[in0],4 | ||
714 | ld1 r17=[out0],4 }//;; | ||
715 | { .mmi; ld1 r18=[out2],4 | ||
716 | ld1 out1=[out4],4 };; // s0 | ||
717 | { .mmi; ld1 r20=[in0],4 | ||
718 | ld1 r21=[out0],4 }//;; | ||
719 | { .mmi; ld1 r22=[out2],4 | ||
720 | ld1 out3=[out4],4 };; // s1 | ||
721 | { .mmi; ld1 r24=[in0],4 | ||
722 | ld1 r25=[out0],4 }//;; | ||
723 | { .mmi; ld1 r26=[out2],4 | ||
724 | ld1 out5=[out4],4 };; // s2 | ||
725 | { .mmi; ld1 r28=[in0] | ||
726 | ld1 r29=[out0] }//;; | ||
727 | { .mmi; ld1 r30=[out2] | ||
728 | ld1 out7=[out4] };; // s3 | ||
729 | |||
730 | { .mii; | ||
731 | dep out1=r16,out1,24,8 //;; | ||
732 | dep out3=r20,out3,24,8 }//;; | ||
733 | { .mii; ADDP rk0=0,in2 | ||
734 | dep out5=r24,out5,24,8 //;; | ||
735 | dep out7=r28,out7,24,8 };; | ||
736 | { .mii; ADDP rk1=KSZ,in2 | ||
737 | dep out1=r17,out1,16,8 //;; | ||
738 | dep out3=r21,out3,16,8 }//;; | ||
739 | { .mii; mov twenty4=24 | ||
740 | dep out5=r25,out5,16,8 //;; | ||
741 | dep out7=r29,out7,16,8 };; | ||
742 | { .mii; mov sixteen=16 | ||
743 | dep out1=r18,out1,8,8 //;; | ||
744 | dep out3=r22,out3,8,8 }//;; | ||
745 | { .mii; mov maskff=0xff | ||
746 | dep out5=r26,out5,8,8 //;; | ||
747 | dep out7=r30,out7,8,8 };; | ||
748 | |||
749 | { .mib; br.call.sptk.many b6=_ia64_AES_decrypt };; | ||
750 | |||
751 | .Ld_o_unaligned: | ||
752 | { .mii; ADDP out0=0,in1 | ||
753 | extr.u r17=r16,8,8 // s0 | ||
754 | shr.u r19=r16,twenty4 }//;; | ||
755 | { .mii; ADDP out1=1,in1 | ||
756 | extr.u r18=r16,16,8 | ||
757 | shr.u r23=r20,twenty4 }//;; // s1 | ||
758 | { .mii; ADDP out2=2,in1 | ||
759 | extr.u r21=r20,8,8 | ||
760 | shr.u r22=r20,sixteen }//;; | ||
761 | { .mii; ADDP out3=3,in1 | ||
762 | extr.u r25=r24,8,8 // s2 | ||
763 | shr.u r27=r24,twenty4 };; | ||
764 | { .mii; st1 [out3]=r16,4 | ||
765 | extr.u r26=r24,16,8 | ||
766 | shr.u r31=r28,twenty4 }//;; // s3 | ||
767 | { .mii; st1 [out2]=r17,4 | ||
768 | extr.u r29=r28,8,8 | ||
769 | shr.u r30=r28,sixteen }//;; | ||
770 | |||
771 | { .mmi; st1 [out1]=r18,4 | ||
772 | st1 [out0]=r19,4 };; | ||
773 | { .mmi; st1 [out3]=r20,4 | ||
774 | st1 [out2]=r21,4 }//;; | ||
775 | { .mmi; st1 [out1]=r22,4 | ||
776 | st1 [out0]=r23,4 };; | ||
777 | { .mmi; st1 [out3]=r24,4 | ||
778 | st1 [out2]=r25,4 | ||
779 | mov pr=prsave,0x1ffff }//;; | ||
780 | { .mmi; st1 [out1]=r26,4 | ||
781 | st1 [out0]=r27,4 | ||
782 | mov ar.pfs=pfssave };; | ||
783 | { .mmi; st1 [out3]=r28 | ||
784 | st1 [out2]=r29 | ||
785 | mov ar.lc=lcsave }//;; | ||
786 | { .mmi; st1 [out1]=r30 | ||
787 | st1 [out0]=r31 } | ||
788 | { .mfb; mov psr.um=loc0 // restore user mask saved in loc0 | ||
789 | br.ret.sptk.many b0 };; | ||
790 | .endp AES_decrypt# | ||
791 | |||
792 | // The lookup tables below are deliberately left in the .text segment... | ||
793 | .align 64 | ||
794 | .global AES_Te# | ||
795 | .type AES_Te#,@object | ||
796 | AES_Te: data4 0xc66363a5,0xc66363a5, 0xf87c7c84,0xf87c7c84 | ||
797 | data4 0xee777799,0xee777799, 0xf67b7b8d,0xf67b7b8d | ||
798 | data4 0xfff2f20d,0xfff2f20d, 0xd66b6bbd,0xd66b6bbd | ||
799 | data4 0xde6f6fb1,0xde6f6fb1, 0x91c5c554,0x91c5c554 | ||
800 | data4 0x60303050,0x60303050, 0x02010103,0x02010103 | ||
801 | data4 0xce6767a9,0xce6767a9, 0x562b2b7d,0x562b2b7d | ||
802 | data4 0xe7fefe19,0xe7fefe19, 0xb5d7d762,0xb5d7d762 | ||
803 | data4 0x4dababe6,0x4dababe6, 0xec76769a,0xec76769a | ||
804 | data4 0x8fcaca45,0x8fcaca45, 0x1f82829d,0x1f82829d | ||
805 | data4 0x89c9c940,0x89c9c940, 0xfa7d7d87,0xfa7d7d87 | ||
806 | data4 0xeffafa15,0xeffafa15, 0xb25959eb,0xb25959eb | ||
807 | data4 0x8e4747c9,0x8e4747c9, 0xfbf0f00b,0xfbf0f00b | ||
808 | data4 0x41adadec,0x41adadec, 0xb3d4d467,0xb3d4d467 | ||
809 | data4 0x5fa2a2fd,0x5fa2a2fd, 0x45afafea,0x45afafea | ||
810 | data4 0x239c9cbf,0x239c9cbf, 0x53a4a4f7,0x53a4a4f7 | ||
811 | data4 0xe4727296,0xe4727296, 0x9bc0c05b,0x9bc0c05b | ||
812 | data4 0x75b7b7c2,0x75b7b7c2, 0xe1fdfd1c,0xe1fdfd1c | ||
813 | data4 0x3d9393ae,0x3d9393ae, 0x4c26266a,0x4c26266a | ||
814 | data4 0x6c36365a,0x6c36365a, 0x7e3f3f41,0x7e3f3f41 | ||
815 | data4 0xf5f7f702,0xf5f7f702, 0x83cccc4f,0x83cccc4f | ||
816 | data4 0x6834345c,0x6834345c, 0x51a5a5f4,0x51a5a5f4 | ||
817 | data4 0xd1e5e534,0xd1e5e534, 0xf9f1f108,0xf9f1f108 | ||
818 | data4 0xe2717193,0xe2717193, 0xabd8d873,0xabd8d873 | ||
819 | data4 0x62313153,0x62313153, 0x2a15153f,0x2a15153f | ||
820 | data4 0x0804040c,0x0804040c, 0x95c7c752,0x95c7c752 | ||
821 | data4 0x46232365,0x46232365, 0x9dc3c35e,0x9dc3c35e | ||
822 | data4 0x30181828,0x30181828, 0x379696a1,0x379696a1 | ||
823 | data4 0x0a05050f,0x0a05050f, 0x2f9a9ab5,0x2f9a9ab5 | ||
824 | data4 0x0e070709,0x0e070709, 0x24121236,0x24121236 | ||
825 | data4 0x1b80809b,0x1b80809b, 0xdfe2e23d,0xdfe2e23d | ||
826 | data4 0xcdebeb26,0xcdebeb26, 0x4e272769,0x4e272769 | ||
827 | data4 0x7fb2b2cd,0x7fb2b2cd, 0xea75759f,0xea75759f | ||
828 | data4 0x1209091b,0x1209091b, 0x1d83839e,0x1d83839e | ||
829 | data4 0x582c2c74,0x582c2c74, 0x341a1a2e,0x341a1a2e | ||
830 | data4 0x361b1b2d,0x361b1b2d, 0xdc6e6eb2,0xdc6e6eb2 | ||
831 | data4 0xb45a5aee,0xb45a5aee, 0x5ba0a0fb,0x5ba0a0fb | ||
832 | data4 0xa45252f6,0xa45252f6, 0x763b3b4d,0x763b3b4d | ||
833 | data4 0xb7d6d661,0xb7d6d661, 0x7db3b3ce,0x7db3b3ce | ||
834 | data4 0x5229297b,0x5229297b, 0xdde3e33e,0xdde3e33e | ||
835 | data4 0x5e2f2f71,0x5e2f2f71, 0x13848497,0x13848497 | ||
836 | data4 0xa65353f5,0xa65353f5, 0xb9d1d168,0xb9d1d168 | ||
837 | data4 0x00000000,0x00000000, 0xc1eded2c,0xc1eded2c | ||
838 | data4 0x40202060,0x40202060, 0xe3fcfc1f,0xe3fcfc1f | ||
839 | data4 0x79b1b1c8,0x79b1b1c8, 0xb65b5bed,0xb65b5bed | ||
840 | data4 0xd46a6abe,0xd46a6abe, 0x8dcbcb46,0x8dcbcb46 | ||
841 | data4 0x67bebed9,0x67bebed9, 0x7239394b,0x7239394b | ||
842 | data4 0x944a4ade,0x944a4ade, 0x984c4cd4,0x984c4cd4 | ||
843 | data4 0xb05858e8,0xb05858e8, 0x85cfcf4a,0x85cfcf4a | ||
844 | data4 0xbbd0d06b,0xbbd0d06b, 0xc5efef2a,0xc5efef2a | ||
845 | data4 0x4faaaae5,0x4faaaae5, 0xedfbfb16,0xedfbfb16 | ||
846 | data4 0x864343c5,0x864343c5, 0x9a4d4dd7,0x9a4d4dd7 | ||
847 | data4 0x66333355,0x66333355, 0x11858594,0x11858594 | ||
848 | data4 0x8a4545cf,0x8a4545cf, 0xe9f9f910,0xe9f9f910 | ||
849 | data4 0x04020206,0x04020206, 0xfe7f7f81,0xfe7f7f81 | ||
850 | data4 0xa05050f0,0xa05050f0, 0x783c3c44,0x783c3c44 | ||
851 | data4 0x259f9fba,0x259f9fba, 0x4ba8a8e3,0x4ba8a8e3 | ||
852 | data4 0xa25151f3,0xa25151f3, 0x5da3a3fe,0x5da3a3fe | ||
853 | data4 0x804040c0,0x804040c0, 0x058f8f8a,0x058f8f8a | ||
854 | data4 0x3f9292ad,0x3f9292ad, 0x219d9dbc,0x219d9dbc | ||
855 | data4 0x70383848,0x70383848, 0xf1f5f504,0xf1f5f504 | ||
856 | data4 0x63bcbcdf,0x63bcbcdf, 0x77b6b6c1,0x77b6b6c1 | ||
857 | data4 0xafdada75,0xafdada75, 0x42212163,0x42212163 | ||
858 | data4 0x20101030,0x20101030, 0xe5ffff1a,0xe5ffff1a | ||
859 | data4 0xfdf3f30e,0xfdf3f30e, 0xbfd2d26d,0xbfd2d26d | ||
860 | data4 0x81cdcd4c,0x81cdcd4c, 0x180c0c14,0x180c0c14 | ||
861 | data4 0x26131335,0x26131335, 0xc3ecec2f,0xc3ecec2f | ||
862 | data4 0xbe5f5fe1,0xbe5f5fe1, 0x359797a2,0x359797a2 | ||
863 | data4 0x884444cc,0x884444cc, 0x2e171739,0x2e171739 | ||
864 | data4 0x93c4c457,0x93c4c457, 0x55a7a7f2,0x55a7a7f2 | ||
865 | data4 0xfc7e7e82,0xfc7e7e82, 0x7a3d3d47,0x7a3d3d47 | ||
866 | data4 0xc86464ac,0xc86464ac, 0xba5d5de7,0xba5d5de7 | ||
867 | data4 0x3219192b,0x3219192b, 0xe6737395,0xe6737395 | ||
868 | data4 0xc06060a0,0xc06060a0, 0x19818198,0x19818198 | ||
869 | data4 0x9e4f4fd1,0x9e4f4fd1, 0xa3dcdc7f,0xa3dcdc7f | ||
870 | data4 0x44222266,0x44222266, 0x542a2a7e,0x542a2a7e | ||
871 | data4 0x3b9090ab,0x3b9090ab, 0x0b888883,0x0b888883 | ||
872 | data4 0x8c4646ca,0x8c4646ca, 0xc7eeee29,0xc7eeee29 | ||
873 | data4 0x6bb8b8d3,0x6bb8b8d3, 0x2814143c,0x2814143c | ||
874 | data4 0xa7dede79,0xa7dede79, 0xbc5e5ee2,0xbc5e5ee2 | ||
875 | data4 0x160b0b1d,0x160b0b1d, 0xaddbdb76,0xaddbdb76 | ||
876 | data4 0xdbe0e03b,0xdbe0e03b, 0x64323256,0x64323256 | ||
877 | data4 0x743a3a4e,0x743a3a4e, 0x140a0a1e,0x140a0a1e | ||
878 | data4 0x924949db,0x924949db, 0x0c06060a,0x0c06060a | ||
879 | data4 0x4824246c,0x4824246c, 0xb85c5ce4,0xb85c5ce4 | ||
880 | data4 0x9fc2c25d,0x9fc2c25d, 0xbdd3d36e,0xbdd3d36e | ||
881 | data4 0x43acacef,0x43acacef, 0xc46262a6,0xc46262a6 | ||
882 | data4 0x399191a8,0x399191a8, 0x319595a4,0x319595a4 | ||
883 | data4 0xd3e4e437,0xd3e4e437, 0xf279798b,0xf279798b | ||
884 | data4 0xd5e7e732,0xd5e7e732, 0x8bc8c843,0x8bc8c843 | ||
885 | data4 0x6e373759,0x6e373759, 0xda6d6db7,0xda6d6db7 | ||
886 | data4 0x018d8d8c,0x018d8d8c, 0xb1d5d564,0xb1d5d564 | ||
887 | data4 0x9c4e4ed2,0x9c4e4ed2, 0x49a9a9e0,0x49a9a9e0 | ||
888 | data4 0xd86c6cb4,0xd86c6cb4, 0xac5656fa,0xac5656fa | ||
889 | data4 0xf3f4f407,0xf3f4f407, 0xcfeaea25,0xcfeaea25 | ||
890 | data4 0xca6565af,0xca6565af, 0xf47a7a8e,0xf47a7a8e | ||
891 | data4 0x47aeaee9,0x47aeaee9, 0x10080818,0x10080818 | ||
892 | data4 0x6fbabad5,0x6fbabad5, 0xf0787888,0xf0787888 | ||
893 | data4 0x4a25256f,0x4a25256f, 0x5c2e2e72,0x5c2e2e72 | ||
894 | data4 0x381c1c24,0x381c1c24, 0x57a6a6f1,0x57a6a6f1 | ||
895 | data4 0x73b4b4c7,0x73b4b4c7, 0x97c6c651,0x97c6c651 | ||
896 | data4 0xcbe8e823,0xcbe8e823, 0xa1dddd7c,0xa1dddd7c | ||
897 | data4 0xe874749c,0xe874749c, 0x3e1f1f21,0x3e1f1f21 | ||
898 | data4 0x964b4bdd,0x964b4bdd, 0x61bdbddc,0x61bdbddc | ||
899 | data4 0x0d8b8b86,0x0d8b8b86, 0x0f8a8a85,0x0f8a8a85 | ||
900 | data4 0xe0707090,0xe0707090, 0x7c3e3e42,0x7c3e3e42 | ||
901 | data4 0x71b5b5c4,0x71b5b5c4, 0xcc6666aa,0xcc6666aa | ||
902 | data4 0x904848d8,0x904848d8, 0x06030305,0x06030305 | ||
903 | data4 0xf7f6f601,0xf7f6f601, 0x1c0e0e12,0x1c0e0e12 | ||
904 | data4 0xc26161a3,0xc26161a3, 0x6a35355f,0x6a35355f | ||
905 | data4 0xae5757f9,0xae5757f9, 0x69b9b9d0,0x69b9b9d0 | ||
906 | data4 0x17868691,0x17868691, 0x99c1c158,0x99c1c158 | ||
907 | data4 0x3a1d1d27,0x3a1d1d27, 0x279e9eb9,0x279e9eb9 | ||
908 | data4 0xd9e1e138,0xd9e1e138, 0xebf8f813,0xebf8f813 | ||
909 | data4 0x2b9898b3,0x2b9898b3, 0x22111133,0x22111133 | ||
910 | data4 0xd26969bb,0xd26969bb, 0xa9d9d970,0xa9d9d970 | ||
911 | data4 0x078e8e89,0x078e8e89, 0x339494a7,0x339494a7 | ||
912 | data4 0x2d9b9bb6,0x2d9b9bb6, 0x3c1e1e22,0x3c1e1e22 | ||
913 | data4 0x15878792,0x15878792, 0xc9e9e920,0xc9e9e920 | ||
914 | data4 0x87cece49,0x87cece49, 0xaa5555ff,0xaa5555ff | ||
915 | data4 0x50282878,0x50282878, 0xa5dfdf7a,0xa5dfdf7a | ||
916 | data4 0x038c8c8f,0x038c8c8f, 0x59a1a1f8,0x59a1a1f8 | ||
917 | data4 0x09898980,0x09898980, 0x1a0d0d17,0x1a0d0d17 | ||
918 | data4 0x65bfbfda,0x65bfbfda, 0xd7e6e631,0xd7e6e631 | ||
919 | data4 0x844242c6,0x844242c6, 0xd06868b8,0xd06868b8 | ||
920 | data4 0x824141c3,0x824141c3, 0x299999b0,0x299999b0 | ||
921 | data4 0x5a2d2d77,0x5a2d2d77, 0x1e0f0f11,0x1e0f0f11 | ||
922 | data4 0x7bb0b0cb,0x7bb0b0cb, 0xa85454fc,0xa85454fc | ||
923 | data4 0x6dbbbbd6,0x6dbbbbd6, 0x2c16163a,0x2c16163a | ||
924 | // Te4: 256 one-byte entries (the forward S-box), placed right after the 2048 bytes of data4 words above | ||
925 | data1 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 | ||
926 | data1 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 | ||
927 | data1 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 | ||
928 | data1 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 | ||
929 | data1 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc | ||
930 | data1 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 | ||
931 | data1 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a | ||
932 | data1 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 | ||
933 | data1 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 | ||
934 | data1 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 | ||
935 | data1 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b | ||
936 | data1 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf | ||
937 | data1 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 | ||
938 | data1 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 | ||
939 | data1 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 | ||
940 | data1 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 | ||
941 | data1 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 | ||
942 | data1 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 | ||
943 | data1 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 | ||
944 | data1 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb | ||
945 | data1 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c | ||
946 | data1 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 | ||
947 | data1 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 | ||
948 | data1 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 | ||
949 | data1 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 | ||
950 | data1 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a | ||
951 | data1 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e | ||
952 | data1 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e | ||
953 | data1 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 | ||
954 | data1 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf | ||
955 | data1 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 | ||
956 | data1 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 | ||
957 | .size AES_Te#,2048+256 // HP-UX assembler fails to ".-AES_Te#" | ||
958 | |||
959 | .align 64 | ||
960 | .global AES_Td# | ||
961 | .type AES_Td#,@object | ||
962 | AES_Td: data4 0x51f4a750,0x51f4a750, 0x7e416553,0x7e416553 | ||
963 | data4 0x1a17a4c3,0x1a17a4c3, 0x3a275e96,0x3a275e96 | ||
964 | data4 0x3bab6bcb,0x3bab6bcb, 0x1f9d45f1,0x1f9d45f1 | ||
965 | data4 0xacfa58ab,0xacfa58ab, 0x4be30393,0x4be30393 | ||
966 | data4 0x2030fa55,0x2030fa55, 0xad766df6,0xad766df6 | ||
967 | data4 0x88cc7691,0x88cc7691, 0xf5024c25,0xf5024c25 | ||
968 | data4 0x4fe5d7fc,0x4fe5d7fc, 0xc52acbd7,0xc52acbd7 | ||
969 | data4 0x26354480,0x26354480, 0xb562a38f,0xb562a38f | ||
970 | data4 0xdeb15a49,0xdeb15a49, 0x25ba1b67,0x25ba1b67 | ||
971 | data4 0x45ea0e98,0x45ea0e98, 0x5dfec0e1,0x5dfec0e1 | ||
972 | data4 0xc32f7502,0xc32f7502, 0x814cf012,0x814cf012 | ||
973 | data4 0x8d4697a3,0x8d4697a3, 0x6bd3f9c6,0x6bd3f9c6 | ||
974 | data4 0x038f5fe7,0x038f5fe7, 0x15929c95,0x15929c95 | ||
975 | data4 0xbf6d7aeb,0xbf6d7aeb, 0x955259da,0x955259da | ||
976 | data4 0xd4be832d,0xd4be832d, 0x587421d3,0x587421d3 | ||
977 | data4 0x49e06929,0x49e06929, 0x8ec9c844,0x8ec9c844 | ||
978 | data4 0x75c2896a,0x75c2896a, 0xf48e7978,0xf48e7978 | ||
979 | data4 0x99583e6b,0x99583e6b, 0x27b971dd,0x27b971dd | ||
980 | data4 0xbee14fb6,0xbee14fb6, 0xf088ad17,0xf088ad17 | ||
981 | data4 0xc920ac66,0xc920ac66, 0x7dce3ab4,0x7dce3ab4 | ||
982 | data4 0x63df4a18,0x63df4a18, 0xe51a3182,0xe51a3182 | ||
983 | data4 0x97513360,0x97513360, 0x62537f45,0x62537f45 | ||
984 | data4 0xb16477e0,0xb16477e0, 0xbb6bae84,0xbb6bae84 | ||
985 | data4 0xfe81a01c,0xfe81a01c, 0xf9082b94,0xf9082b94 | ||
986 | data4 0x70486858,0x70486858, 0x8f45fd19,0x8f45fd19 | ||
987 | data4 0x94de6c87,0x94de6c87, 0x527bf8b7,0x527bf8b7 | ||
988 | data4 0xab73d323,0xab73d323, 0x724b02e2,0x724b02e2 | ||
989 | data4 0xe31f8f57,0xe31f8f57, 0x6655ab2a,0x6655ab2a | ||
990 | data4 0xb2eb2807,0xb2eb2807, 0x2fb5c203,0x2fb5c203 | ||
991 | data4 0x86c57b9a,0x86c57b9a, 0xd33708a5,0xd33708a5 | ||
992 | data4 0x302887f2,0x302887f2, 0x23bfa5b2,0x23bfa5b2 | ||
993 | data4 0x02036aba,0x02036aba, 0xed16825c,0xed16825c | ||
994 | data4 0x8acf1c2b,0x8acf1c2b, 0xa779b492,0xa779b492 | ||
995 | data4 0xf307f2f0,0xf307f2f0, 0x4e69e2a1,0x4e69e2a1 | ||
996 | data4 0x65daf4cd,0x65daf4cd, 0x0605bed5,0x0605bed5 | ||
997 | data4 0xd134621f,0xd134621f, 0xc4a6fe8a,0xc4a6fe8a | ||
998 | data4 0x342e539d,0x342e539d, 0xa2f355a0,0xa2f355a0 | ||
999 | data4 0x058ae132,0x058ae132, 0xa4f6eb75,0xa4f6eb75 | ||
1000 | data4 0x0b83ec39,0x0b83ec39, 0x4060efaa,0x4060efaa | ||
1001 | data4 0x5e719f06,0x5e719f06, 0xbd6e1051,0xbd6e1051 | ||
1002 | data4 0x3e218af9,0x3e218af9, 0x96dd063d,0x96dd063d | ||
1003 | data4 0xdd3e05ae,0xdd3e05ae, 0x4de6bd46,0x4de6bd46 | ||
1004 | data4 0x91548db5,0x91548db5, 0x71c45d05,0x71c45d05 | ||
1005 | data4 0x0406d46f,0x0406d46f, 0x605015ff,0x605015ff | ||
1006 | data4 0x1998fb24,0x1998fb24, 0xd6bde997,0xd6bde997 | ||
1007 | data4 0x894043cc,0x894043cc, 0x67d99e77,0x67d99e77 | ||
1008 | data4 0xb0e842bd,0xb0e842bd, 0x07898b88,0x07898b88 | ||
1009 | data4 0xe7195b38,0xe7195b38, 0x79c8eedb,0x79c8eedb | ||
1010 | data4 0xa17c0a47,0xa17c0a47, 0x7c420fe9,0x7c420fe9 | ||
1011 | data4 0xf8841ec9,0xf8841ec9, 0x00000000,0x00000000 | ||
1012 | data4 0x09808683,0x09808683, 0x322bed48,0x322bed48 | ||
1013 | data4 0x1e1170ac,0x1e1170ac, 0x6c5a724e,0x6c5a724e | ||
1014 | data4 0xfd0efffb,0xfd0efffb, 0x0f853856,0x0f853856 | ||
1015 | data4 0x3daed51e,0x3daed51e, 0x362d3927,0x362d3927 | ||
1016 | data4 0x0a0fd964,0x0a0fd964, 0x685ca621,0x685ca621 | ||
1017 | data4 0x9b5b54d1,0x9b5b54d1, 0x24362e3a,0x24362e3a | ||
1018 | data4 0x0c0a67b1,0x0c0a67b1, 0x9357e70f,0x9357e70f | ||
1019 | data4 0xb4ee96d2,0xb4ee96d2, 0x1b9b919e,0x1b9b919e | ||
1020 | data4 0x80c0c54f,0x80c0c54f, 0x61dc20a2,0x61dc20a2 | ||
1021 | data4 0x5a774b69,0x5a774b69, 0x1c121a16,0x1c121a16 | ||
1022 | data4 0xe293ba0a,0xe293ba0a, 0xc0a02ae5,0xc0a02ae5 | ||
1023 | data4 0x3c22e043,0x3c22e043, 0x121b171d,0x121b171d | ||
1024 | data4 0x0e090d0b,0x0e090d0b, 0xf28bc7ad,0xf28bc7ad | ||
1025 | data4 0x2db6a8b9,0x2db6a8b9, 0x141ea9c8,0x141ea9c8 | ||
1026 | data4 0x57f11985,0x57f11985, 0xaf75074c,0xaf75074c | ||
1027 | data4 0xee99ddbb,0xee99ddbb, 0xa37f60fd,0xa37f60fd | ||
1028 | data4 0xf701269f,0xf701269f, 0x5c72f5bc,0x5c72f5bc | ||
1029 | data4 0x44663bc5,0x44663bc5, 0x5bfb7e34,0x5bfb7e34 | ||
1030 | data4 0x8b432976,0x8b432976, 0xcb23c6dc,0xcb23c6dc | ||
1031 | data4 0xb6edfc68,0xb6edfc68, 0xb8e4f163,0xb8e4f163 | ||
1032 | data4 0xd731dcca,0xd731dcca, 0x42638510,0x42638510 | ||
1033 | data4 0x13972240,0x13972240, 0x84c61120,0x84c61120 | ||
1034 | data4 0x854a247d,0x854a247d, 0xd2bb3df8,0xd2bb3df8 | ||
1035 | data4 0xaef93211,0xaef93211, 0xc729a16d,0xc729a16d | ||
1036 | data4 0x1d9e2f4b,0x1d9e2f4b, 0xdcb230f3,0xdcb230f3 | ||
1037 | data4 0x0d8652ec,0x0d8652ec, 0x77c1e3d0,0x77c1e3d0 | ||
1038 | data4 0x2bb3166c,0x2bb3166c, 0xa970b999,0xa970b999 | ||
1039 | data4 0x119448fa,0x119448fa, 0x47e96422,0x47e96422 | ||
1040 | data4 0xa8fc8cc4,0xa8fc8cc4, 0xa0f03f1a,0xa0f03f1a | ||
1041 | data4 0x567d2cd8,0x567d2cd8, 0x223390ef,0x223390ef | ||
1042 | data4 0x87494ec7,0x87494ec7, 0xd938d1c1,0xd938d1c1 | ||
1043 | data4 0x8ccaa2fe,0x8ccaa2fe, 0x98d40b36,0x98d40b36 | ||
1044 | data4 0xa6f581cf,0xa6f581cf, 0xa57ade28,0xa57ade28 | ||
1045 | data4 0xdab78e26,0xdab78e26, 0x3fadbfa4,0x3fadbfa4 | ||
1046 | data4 0x2c3a9de4,0x2c3a9de4, 0x5078920d,0x5078920d | ||
1047 | data4 0x6a5fcc9b,0x6a5fcc9b, 0x547e4662,0x547e4662 | ||
1048 | data4 0xf68d13c2,0xf68d13c2, 0x90d8b8e8,0x90d8b8e8 | ||
1049 | data4 0x2e39f75e,0x2e39f75e, 0x82c3aff5,0x82c3aff5 | ||
1050 | data4 0x9f5d80be,0x9f5d80be, 0x69d0937c,0x69d0937c | ||
1051 | data4 0x6fd52da9,0x6fd52da9, 0xcf2512b3,0xcf2512b3 | ||
1052 | data4 0xc8ac993b,0xc8ac993b, 0x10187da7,0x10187da7 | ||
1053 | data4 0xe89c636e,0xe89c636e, 0xdb3bbb7b,0xdb3bbb7b | ||
1054 | data4 0xcd267809,0xcd267809, 0x6e5918f4,0x6e5918f4 | ||
1055 | data4 0xec9ab701,0xec9ab701, 0x834f9aa8,0x834f9aa8 | ||
1056 | data4 0xe6956e65,0xe6956e65, 0xaaffe67e,0xaaffe67e | ||
1057 | data4 0x21bccf08,0x21bccf08, 0xef15e8e6,0xef15e8e6 | ||
1058 | data4 0xbae79bd9,0xbae79bd9, 0x4a6f36ce,0x4a6f36ce | ||
1059 | data4 0xea9f09d4,0xea9f09d4, 0x29b07cd6,0x29b07cd6 | ||
1060 | data4 0x31a4b2af,0x31a4b2af, 0x2a3f2331,0x2a3f2331 | ||
1061 | data4 0xc6a59430,0xc6a59430, 0x35a266c0,0x35a266c0 | ||
1062 | data4 0x744ebc37,0x744ebc37, 0xfc82caa6,0xfc82caa6 | ||
1063 | data4 0xe090d0b0,0xe090d0b0, 0x33a7d815,0x33a7d815 | ||
1064 | data4 0xf104984a,0xf104984a, 0x41ecdaf7,0x41ecdaf7 | ||
1065 | data4 0x7fcd500e,0x7fcd500e, 0x1791f62f,0x1791f62f | ||
1066 | data4 0x764dd68d,0x764dd68d, 0x43efb04d,0x43efb04d | ||
1067 | data4 0xccaa4d54,0xccaa4d54, 0xe49604df,0xe49604df | ||
1068 | data4 0x9ed1b5e3,0x9ed1b5e3, 0x4c6a881b,0x4c6a881b | ||
1069 | data4 0xc12c1fb8,0xc12c1fb8, 0x4665517f,0x4665517f | ||
1070 | data4 0x9d5eea04,0x9d5eea04, 0x018c355d,0x018c355d | ||
1071 | data4 0xfa877473,0xfa877473, 0xfb0b412e,0xfb0b412e | ||
1072 | data4 0xb3671d5a,0xb3671d5a, 0x92dbd252,0x92dbd252 | ||
1073 | data4 0xe9105633,0xe9105633, 0x6dd64713,0x6dd64713 | ||
1074 | data4 0x9ad7618c,0x9ad7618c, 0x37a10c7a,0x37a10c7a | ||
1075 | data4 0x59f8148e,0x59f8148e, 0xeb133c89,0xeb133c89 | ||
1076 | data4 0xcea927ee,0xcea927ee, 0xb761c935,0xb761c935 | ||
1077 | data4 0xe11ce5ed,0xe11ce5ed, 0x7a47b13c,0x7a47b13c | ||
1078 | data4 0x9cd2df59,0x9cd2df59, 0x55f2733f,0x55f2733f | ||
1079 | data4 0x1814ce79,0x1814ce79, 0x73c737bf,0x73c737bf | ||
1080 | data4 0x53f7cdea,0x53f7cdea, 0x5ffdaa5b,0x5ffdaa5b | ||
1081 | data4 0xdf3d6f14,0xdf3d6f14, 0x7844db86,0x7844db86 | ||
1082 | data4 0xcaaff381,0xcaaff381, 0xb968c43e,0xb968c43e | ||
1083 | data4 0x3824342c,0x3824342c, 0xc2a3405f,0xc2a3405f | ||
1084 | data4 0x161dc372,0x161dc372, 0xbce2250c,0xbce2250c | ||
1085 | data4 0x283c498b,0x283c498b, 0xff0d9541,0xff0d9541 | ||
1086 | data4 0x39a80171,0x39a80171, 0x080cb3de,0x080cb3de | ||
1087 | data4 0xd8b4e49c,0xd8b4e49c, 0x6456c190,0x6456c190 | ||
1088 | data4 0x7bcb8461,0x7bcb8461, 0xd532b670,0xd532b670 | ||
1089 | data4 0x486c5c74,0x486c5c74, 0xd0b85742,0xd0b85742 | ||
1090 | // Td4: 256 one-byte entries (the inverse S-box), placed right after the 2048 bytes of data4 words above | ||
1091 | data1 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 | ||
1092 | data1 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb | ||
1093 | data1 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 | ||
1094 | data1 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb | ||
1095 | data1 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d | ||
1096 | data1 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e | ||
1097 | data1 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 | ||
1098 | data1 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 | ||
1099 | data1 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 | ||
1100 | data1 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 | ||
1101 | data1 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda | ||
1102 | data1 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 | ||
1103 | data1 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a | ||
1104 | data1 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 | ||
1105 | data1 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 | ||
1106 | data1 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b | ||
1107 | data1 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea | ||
1108 | data1 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 | ||
1109 | data1 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 | ||
1110 | data1 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e | ||
1111 | data1 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 | ||
1112 | data1 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b | ||
1113 | data1 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 | ||
1114 | data1 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 | ||
1115 | data1 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 | ||
1116 | data1 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f | ||
1117 | data1 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d | ||
1118 | data1 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef | ||
1119 | data1 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 | ||
1120 | data1 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 | ||
1121 | data1 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 | ||
1122 | data1 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d | ||
1123 | .size AES_Td#,2048+256 // HP-UX assembler fails to ".-AES_Td#" | ||
diff --git a/src/lib/libcrypto/bn/asm/ia64-mont.pl b/src/lib/libcrypto/bn/asm/ia64-mont.pl deleted file mode 100644 index e258658428..0000000000 --- a/src/lib/libcrypto/bn/asm/ia64-mont.pl +++ /dev/null | |||
@@ -1,851 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # January 2010 | ||
11 | # | ||
12 | # "Teaser" Montgomery multiplication module for IA-64. There are | ||
13 | # several possibilities for improvement: | ||
14 | # | ||
15 | # - modulo-scheduling outer loop would eliminate quite a number of | ||
16 | # stalls after ldf8, xma and getf.sig outside inner loop and | ||
17 | # improve shorter key performance; | ||
18 | # - shorter vector support [with input vectors being fetched only | ||
19 | # once] should be added; | ||
20 | # - 2x unroll with help of n0[1] would make the code scalable on | ||
21 | # "wider" IA-64, "wider" than Itanium 2 that is, which is not of | ||
22 | # acute interest, because upcoming Tukwila's individual cores are | ||
23 | # reportedly based on Itanium 2 design; | ||
24 | # - dedicated squaring procedure(?); | ||
25 | # | ||
26 | # January 2010 | ||
27 | # | ||
28 | # Shorter vector support is implemented by zero-padding ap and np | ||
29 | # vectors up to 8 elements, or 512 bits. This means that 256-bit | ||
30 | # inputs will be processed only 2 times faster than 512-bit inputs, | ||
31 | # not 4 [as one would expect, because algorithm complexity is n^2]. | ||
32 | # The reason for padding is that inputs shorter than 512 bits won't | ||
33 | # be processed faster anyway, because minimal critical path of the | ||
34 | # core loop happens to match 512-bit timing. Either way, it resulted | ||
35 | # in a >100% improvement of the 512-bit RSA sign benchmark and a 50% | ||
36 | # improvement of the 1024-bit one [compared to the original version of *this* module]. | ||
37 | # | ||
38 | # So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with* | ||
39 | # this module is: | ||
40 | # sign verify sign/s verify/s | ||
41 | # rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4 | ||
42 | # rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0 | ||
43 | # rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0 | ||
44 | # rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6 | ||
45 | # dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0 | ||
46 | # dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4 | ||
47 | # dsa 2048 bits 0.001453s 0.001703s 688.1 587.4 | ||
48 | # | ||
49 | # ... and *without* (but still with ia64.S): | ||
50 | # | ||
51 | # rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5 | ||
52 | # rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3 | ||
53 | # rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9 | ||
54 | # rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9 | ||
55 | # dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6 | ||
56 | # dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2 | ||
57 | # dsa 2048 bits 0.001894s 0.002179s 528.1 458.9 | ||
58 | # | ||
59 | # As can be seen, RSA sign performance improves by 130% down to 30% | ||
60 | # (the gain shrinks as keys get longer), while RSA verify improves by | ||
61 | # 74% down to 13%. DSA performance improves by 115% down to 30%. | ||
62 | |||
63 | if ($^O eq "hpux") { # HP-UX: default to addp4 (presumably for its 32-bit pointer model -- confirm) | ||
64 | $ADDP="addp4"; | ||
65 | for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); } # only the literal flags +DD64 / -mlp64 select plain add; the old [...] class matched any one of "+D|-mlp" before "64" | ||
66 | } else { $ADDP="add"; } # every other OS always uses plain add | ||
67 | |||
68 | $code=<<___; | ||
69 | .explicit | ||
70 | .text | ||
71 | |||
72 | // int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap, | ||
73 | // const BN_ULONG *bp,const BN_ULONG *np, | ||
74 | // const BN_ULONG *n0p,int num); | ||
75 | .align 64 | ||
76 | .global bn_mul_mont# | ||
77 | .proc bn_mul_mont# | ||
78 | bn_mul_mont: // entry point: only compares num and branches, does no arithmetic itself | ||
79 | .prologue | ||
80 | .body | ||
81 | { .mmi; cmp4.le p6,p7=2,r37;; // p6 = (2 <= num), p7 = (num < 2); r37 is assumed to be the 6th argument, num (bn_mul_mont_general reads it as in5) -- confirm | ||
82 | (p6) cmp4.lt.unc p8,p9=8,r37 // when p6: p8 = (8 < num), p9 = (num <= 8); .unc clears both p8 and p9 when p6 is false | ||
83 | mov ret0=r0 };; // ret0 = 0, the value handed back by the (p7) return below | ||
84 | { .bbb; | ||
85 | (p9) br.cond.dptk.many bn_mul_mont_8 // 2 <= num <= 8 | ||
86 | (p8) br.cond.dpnt.many bn_mul_mont_general // num > 8 | ||
87 | (p7) br.ret.spnt.many b0 };; // num < 2: return to the caller with ret0 = 0 | ||
88 | .endp bn_mul_mont# | ||
89 | |||
90 | prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11; // saved ar.pfs, pr, ar.lc and sp (written by the prologue of bn_mul_mont_general) | ||
91 | |||
92 | rptr=r8; aptr=r9; bptr=r14; nptr=r15; // working pointers formed from in0 (rp), in1 (ap), in2 (bp), in3 (np) | ||
93 | tptr=r16; // &tp[0] | ||
94 | tp_1=r17; // &tp[-1] | ||
95 | num=r18; len=r19; lc=r20; // num = zero-extended in5; len = num*8; lc = num-5, later moved into ar.lc | ||
96 | topbit=r21; // carry bit from tmp[num] | ||
97 | |||
98 | n0=f6; // value loaded from [in4] | ||
99 | m0=f7; // low half (xmpy.lu) of alo[4]*n0 | ||
100 | bi=f8; // current word fetched through bptr, i.e. *bp++ | ||
101 | |||
102 | .align 64 | ||
103 | .local bn_mul_mont_general# | ||
104 | .proc bn_mul_mont_general# | ||
105 | bn_mul_mont_general: | ||
106 | .prologue | ||
107 | { .mmi; .save ar.pfs,prevfs | ||
108 | alloc prevfs=ar.pfs,6,2,0,8 | ||
109 | $ADDP aptr=0,in1 | ||
110 | .save ar.lc,prevlc | ||
111 | mov prevlc=ar.lc } | ||
112 | { .mmi; .vframe prevsp | ||
113 | mov prevsp=sp | ||
114 | $ADDP bptr=0,in2 | ||
115 | .save pr,prevpr | ||
116 | mov prevpr=pr };; | ||
117 | |||
118 | .body | ||
119 | .rotf alo[6],nlo[4],ahi[8],nhi[6] | ||
120 | .rotr a[3],n[3],t[2] | ||
121 | |||
122 | { .mmi; ldf8 bi=[bptr],8 // (*bp++) | ||
123 | ldf8 alo[4]=[aptr],16 // ap[0] | ||
124 | $ADDP r30=8,in1 };; | ||
125 | { .mmi; ldf8 alo[3]=[r30],16 // ap[1] | ||
126 | ldf8 alo[2]=[aptr],16 // ap[2] | ||
127 | $ADDP in4=0,in4 };; | ||
128 | { .mmi; ldf8 alo[1]=[r30] // ap[3] | ||
129 | ldf8 n0=[in4] // n0 | ||
130 | $ADDP rptr=0,in0 } | ||
131 | { .mmi; $ADDP nptr=0,in3 | ||
132 | mov r31=16 | ||
133 | zxt4 num=in5 };; | ||
134 | { .mmi; ldf8 nlo[2]=[nptr],8 // np[0] | ||
135 | shladd len=num,3,r0 | ||
136 | shladd r31=num,3,r31 };; | ||
137 | { .mmi; ldf8 nlo[1]=[nptr],8 // np[1] | ||
138 | add lc=-5,num | ||
139 | sub r31=sp,r31 };; | ||
140 | { .mfb; and sp=-16,r31 // alloca | ||
141 | xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0] | ||
142 | nop.b 0 } | ||
143 | { .mfb; nop.m 0 | ||
144 | xmpy.lu alo[4]=alo[4],bi | ||
145 | brp.loop.imp .L1st_ctop,.L1st_cend-16 | ||
146 | };; | ||
147 | { .mfi; nop.m 0 | ||
148 | xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0] | ||
149 | add tp_1=8,sp } | ||
150 | { .mfi; nop.m 0 | ||
151 | xma.lu alo[3]=alo[3],bi,ahi[2] | ||
152 | mov pr.rot=0x20001f<<16 | ||
153 | // ------^----- (p40) at first (p23) | ||
154 | // ----------^^ p[16:20]=1 | ||
155 | };; | ||
156 | { .mfi; nop.m 0 | ||
157 | xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0 | ||
158 | mov ar.lc=lc } | ||
159 | { .mfi; nop.m 0 | ||
160 | fcvt.fxu.s1 nhi[1]=f0 | ||
161 | mov ar.ec=8 };; | ||
162 | |||
163 | .align 32 | ||
164 | .L1st_ctop: | ||
165 | .pred.rel "mutex",p40,p42 | ||
166 | { .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++) | ||
167 | (p18) xma.hu ahi[0]=alo[2],bi,ahi[1] | ||
168 | (p40) add n[2]=n[2],a[2] } // (p23) } | ||
169 | { .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16) | ||
170 | (p18) xma.lu alo[2]=alo[2],bi,ahi[1] | ||
171 | (p42) add n[2]=n[2],a[2],1 };; // (p23) | ||
172 | { .mfi; (p21) getf.sig a[0]=alo[5] | ||
173 | (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1] | ||
174 | (p42) cmp.leu p41,p39=n[2],a[2] } // (p23) | ||
175 | { .mfi; (p23) st8 [tp_1]=n[2],8 | ||
176 | (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1] | ||
177 | (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23) | ||
178 | { .mmb; (p21) getf.sig n[0]=nlo[3] | ||
179 | (p16) nop.m 0 | ||
180 | br.ctop.sptk .L1st_ctop };; | ||
181 | .L1st_cend: | ||
182 | |||
183 | { .mmi; getf.sig a[0]=ahi[6] // (p24) | ||
184 | getf.sig n[0]=nhi[4] | ||
185 | add num=-1,num };; // num-- | ||
186 | { .mmi; .pred.rel "mutex",p40,p42 | ||
187 | (p40) add n[0]=n[0],a[0] | ||
188 | (p42) add n[0]=n[0],a[0],1 | ||
189 | sub aptr=aptr,len };; // rewind | ||
190 | { .mmi; .pred.rel "mutex",p40,p42 | ||
191 | (p40) cmp.ltu p41,p39=n[0],a[0] | ||
192 | (p42) cmp.leu p41,p39=n[0],a[0] | ||
193 | sub nptr=nptr,len };; | ||
194 | { .mmi; .pred.rel "mutex",p39,p41 | ||
195 | (p39) add topbit=r0,r0 | ||
196 | (p41) add topbit=r0,r0,1 | ||
197 | nop.i 0 } | ||
198 | { .mmi; st8 [tp_1]=n[0] | ||
199 | add tptr=16,sp | ||
200 | add tp_1=8,sp };; | ||
201 | |||
202 | .Louter: | ||
203 | { .mmi; ldf8 bi=[bptr],8 // (*bp++) | ||
204 | ldf8 ahi[3]=[tptr] // tp[0] | ||
205 | add r30=8,aptr };; | ||
206 | { .mmi; ldf8 alo[4]=[aptr],16 // ap[0] | ||
207 | ldf8 alo[3]=[r30],16 // ap[1] | ||
208 | add r31=8,nptr };; | ||
209 | { .mfb; ldf8 alo[2]=[aptr],16 // ap[2] | ||
210 | xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0] | ||
211 | brp.loop.imp .Linner_ctop,.Linner_cend-16 | ||
212 | } | ||
213 | { .mfb; ldf8 alo[1]=[r30] // ap[3] | ||
214 | xma.lu alo[4]=alo[4],bi,ahi[3] | ||
215 | clrrrb.pr };; | ||
216 | { .mfi; ldf8 nlo[2]=[nptr],16 // np[0] | ||
217 | xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i] | ||
218 | nop.i 0 } | ||
219 | { .mfi; ldf8 nlo[1]=[r31] // np[1] | ||
220 | xma.lu alo[3]=alo[3],bi,ahi[2] | ||
221 | mov pr.rot=0x20101f<<16 | ||
222 | // ------^----- (p40) at first (p23) | ||
223 | // --------^--- (p30) at first (p22) | ||
224 | // ----------^^ p[16:20]=1 | ||
225 | };; | ||
226 | { .mfi; st8 [tptr]=r0 // tp[0] is already accounted | ||
227 | xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0 | ||
228 | mov ar.lc=lc } | ||
229 | { .mfi; | ||
230 | fcvt.fxu.s1 nhi[1]=f0 | ||
231 | mov ar.ec=8 };; | ||
232 | |||
233 | // This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in | ||
234 | // 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7 | ||
235 | // in latter case accounts for two-tick pipeline stall, which means | ||
236 | // that its performance would be ~20% lower than optimal one. No | ||
237 | // attempt was made to address this, because original Itanium is | ||
238 | // hardly represented out in the wild... | ||
239 | .align 32 | ||
240 | .Linner_ctop: | ||
241 | .pred.rel "mutex",p40,p42 | ||
242 | .pred.rel "mutex",p30,p32 | ||
243 | { .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++) | ||
244 | (p18) xma.hu ahi[0]=alo[2],bi,ahi[1] | ||
245 | (p40) add n[2]=n[2],a[2] } // (p23) | ||
246 | { .mfi; (p16) nop.m 0 | ||
247 | (p18) xma.lu alo[2]=alo[2],bi,ahi[1] | ||
248 | (p42) add n[2]=n[2],a[2],1 };; // (p23) | ||
249 | { .mfi; (p21) getf.sig a[0]=alo[5] | ||
250 | (p16) nop.f 0 | ||
251 | (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23) | ||
252 | { .mfi; (p21) ld8 t[0]=[tptr],8 | ||
253 | (p16) nop.f 0 | ||
254 | (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23) | ||
255 | { .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++) | ||
256 | (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1] | ||
257 | (p30) add a[1]=a[1],t[1] } // (p22) | ||
258 | { .mfi; (p16) nop.m 0 | ||
259 | (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1] | ||
260 | (p32) add a[1]=a[1],t[1],1 };; // (p22) | ||
261 | { .mmi; (p21) getf.sig n[0]=nlo[3] | ||
262 | (p16) nop.m 0 | ||
263 | (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22) | ||
264 | { .mmb; (p23) st8 [tp_1]=n[2],8 | ||
265 | (p32) cmp.leu p31,p29=a[1],t[1] // (p22) | ||
266 | br.ctop.sptk .Linner_ctop };; | ||
267 | .Linner_cend: | ||
268 | |||
269 | { .mmi; getf.sig a[0]=ahi[6] // (p24) | ||
270 | getf.sig n[0]=nhi[4] | ||
271 | nop.i 0 };; | ||
272 | |||
273 | { .mmi; .pred.rel "mutex",p31,p33 | ||
274 | (p31) add a[0]=a[0],topbit | ||
275 | (p33) add a[0]=a[0],topbit,1 | ||
276 | mov topbit=r0 };; | ||
277 | { .mfi; .pred.rel "mutex",p31,p33 | ||
278 | (p31) cmp.ltu p32,p30=a[0],topbit | ||
279 | (p33) cmp.leu p32,p30=a[0],topbit | ||
280 | } | ||
281 | { .mfi; .pred.rel "mutex",p40,p42 | ||
282 | (p40) add n[0]=n[0],a[0] | ||
283 | (p42) add n[0]=n[0],a[0],1 | ||
284 | };; | ||
285 | { .mmi; .pred.rel "mutex",p44,p46 | ||
286 | (p40) cmp.ltu p41,p39=n[0],a[0] | ||
287 | (p42) cmp.leu p41,p39=n[0],a[0] | ||
288 | (p32) add topbit=r0,r0,1 } | ||
289 | |||
290 | { .mmi; st8 [tp_1]=n[0],8 | ||
291 | cmp4.ne p6,p0=1,num | ||
292 | sub aptr=aptr,len };; // rewind | ||
293 | { .mmi; sub nptr=nptr,len | ||
294 | (p41) add topbit=r0,r0,1 | ||
295 | add tptr=16,sp } | ||
296 | { .mmb; add tp_1=8,sp | ||
297 | add num=-1,num // num-- | ||
298 | (p6) br.cond.sptk.many .Louter };; | ||
299 | |||
300 | { .mbb; add lc=4,lc | ||
301 | brp.loop.imp .Lsub_ctop,.Lsub_cend-16 | ||
302 | clrrrb.pr };; | ||
303 | { .mii; nop.m 0 | ||
304 | mov pr.rot=0x10001<<16 | ||
305 | // ------^---- (p33) at first (p17) | ||
306 | mov ar.lc=lc } | ||
307 | { .mii; nop.m 0 | ||
308 | mov ar.ec=3 | ||
309 | nop.i 0 };; | ||
310 | |||
311 | .Lsub_ctop: | ||
312 | .pred.rel "mutex",p33,p35 | ||
313 | { .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++) | ||
314 | (p16) nop.f 0 | ||
315 | (p33) sub n[1]=t[1],n[1] } // (p17) | ||
316 | { .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++) | ||
317 | (p16) nop.f 0 | ||
318 | (p35) sub n[1]=t[1],n[1],1 };; // (p17) | ||
319 | { .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r | ||
320 | (p33) cmp.gtu p34,p32=n[1],t[1] // (p17) | ||
321 | (p18) nop.b 0 } | ||
322 | { .mib; (p18) nop.m 0 | ||
323 | (p35) cmp.geu p34,p32=n[1],t[1] // (p17) | ||
324 | br.ctop.sptk .Lsub_ctop };; | ||
325 | .Lsub_cend: | ||
326 | |||
327 | { .mmb; .pred.rel "mutex",p34,p36 | ||
328 | (p34) sub topbit=topbit,r0 // (p19) | ||
329 | (p36) sub topbit=topbit,r0,1 | ||
330 | brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16 | ||
331 | } | ||
332 | { .mmb; sub rptr=rptr,len // rewind | ||
333 | sub tptr=tptr,len | ||
334 | clrrrb.pr };; | ||
335 | { .mmi; and aptr=tptr,topbit | ||
336 | andcm bptr=rptr,topbit | ||
337 | mov pr.rot=1<<16 };; | ||
338 | { .mii; or nptr=aptr,bptr | ||
339 | mov ar.lc=lc | ||
340 | mov ar.ec=3 };; | ||
341 | |||
342 | .Lcopy_ctop: | ||
343 | { .mmb; (p16) ld8 n[0]=[nptr],8 | ||
344 | (p18) st8 [tptr]=r0,8 | ||
345 | (p16) nop.b 0 } | ||
346 | { .mmb; (p16) nop.m 0 | ||
347 | (p18) st8 [rptr]=n[2],8 | ||
348 | br.ctop.sptk .Lcopy_ctop };; | ||
349 | .Lcopy_cend: | ||
350 | |||
351 | { .mmi; mov ret0=1 // signal "handled" | ||
352 | rum 1<<5 // clear um.mfh | ||
353 | mov ar.lc=prevlc } | ||
354 | { .mib; .restore sp | ||
355 | mov sp=prevsp | ||
356 | mov pr=prevpr,0x1ffff | ||
357 | br.ret.sptk.many b0 };; | ||
358 | .endp bn_mul_mont_general# | ||
359 | |||
360 | a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23; | ||
361 | n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31; | ||
362 | t0=r15; | ||
363 | |||
364 | ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15; | ||
365 | ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23; | ||
366 | |||
367 | .align 64 | ||
368 | .skip 48 // aligns loop body | ||
// bn_mul_mont_8: file-local (.local) procedure. As used in this procedure:
// in0 -> rptr, in1 -> aptr, in2 -> bptr, in3 -> nptr, in4 = address that n0
// is loaded from, in5 = element count (tested against 3..8 when the inputs
// are padded). ret0 is set to 1 before returning.
369 | .local bn_mul_mont_8# | ||
370 | .proc bn_mul_mont_8# | ||
371 | bn_mul_mont_8: | ||
372 | .prologue | ||
// Prologue: alloc (6 in, 2 local, 0 out, 8 rotating) saves ar.pfs; sp,
// ar.lc and pr are copied to prevsp/prevlc/prevpr for the epilogue.
373 | { .mmi; .save ar.pfs,prevfs | ||
374 | alloc prevfs=ar.pfs,6,2,0,8 | ||
375 | .vframe prevsp | ||
376 | mov prevsp=sp | ||
377 | .save ar.lc,prevlc | ||
378 | mov prevlc=ar.lc } | ||
379 | { .mmi; add r17=-6*16,sp | ||
380 | add sp=-7*16,sp | ||
381 | .save pr,prevpr | ||
382 | mov prevpr=pr };; | ||
383 | |||
// Spill f16-f23 into eight consecutive 16-byte slots starting at
// prevsp-7*16 (f16) and ending at prevsp+0 (f23); r16/r17 leapfrog by 32.
// The same slots are refilled at .Ldone.
// NOTE(review): the ADDP mnemonic is substituted by the enclosing script;
// its definition is outside this chunk.
384 | { .mmi; .save.gf 0,0x10 | ||
385 | stf.spill [sp]=f16,-16 | ||
386 | .save.gf 0,0x20 | ||
387 | stf.spill [r17]=f17,32 | ||
388 | add r16=-5*16,prevsp};; | ||
389 | { .mmi; .save.gf 0,0x40 | ||
390 | stf.spill [r16]=f18,32 | ||
391 | .save.gf 0,0x80 | ||
392 | stf.spill [r17]=f19,32 | ||
393 | $ADDP aptr=0,in1 };; | ||
394 | { .mmi; .save.gf 0,0x100 | ||
395 | stf.spill [r16]=f20,32 | ||
396 | .save.gf 0,0x200 | ||
397 | stf.spill [r17]=f21,32 | ||
398 | $ADDP r29=8,in1 };; | ||
399 | { .mmi; .save.gf 0,0x400 | ||
400 | stf.spill [r16]=f22 | ||
401 | .save.gf 0,0x800 | ||
402 | stf.spill [r17]=f23 | ||
403 | $ADDP rptr=0,in0 };; | ||
404 | |||
405 | .body | ||
// Rotating register names used by the loop: FP sets via .rotf, GPR t[8]
// via .rotr.
406 | .rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10] | ||
407 | .rotr t[8] | ||
408 | |||
409 | // load input vectors padding them to 8 elements | ||
// Elements 0 and 1 of ap/bp/np are loaded unconditionally. For each later
// element k (2..7) a predicate pair from "cmp4.le k+1,in5" selects between
// loading the element and substituting zero (fcvt.fxu of f0).
// NOTE(review): this presumes in5 >= 2 -- confirm against the caller.
// bp[] is loaded into bj[7] down to bj[0], so bp[0] sits in bj[7].
410 | { .mmi; ldf8 ai0=[aptr],16 // ap[0] | ||
411 | ldf8 ai1=[r29],16 // ap[1] | ||
412 | $ADDP bptr=0,in2 } | ||
413 | { .mmi; $ADDP r30=8,in2 | ||
414 | $ADDP nptr=0,in3 | ||
415 | $ADDP r31=8,in3 };; | ||
416 | { .mmi; ldf8 bj[7]=[bptr],16 // bp[0] | ||
417 | ldf8 bj[6]=[r30],16 // bp[1] | ||
418 | cmp4.le p4,p5=3,in5 } | ||
419 | { .mmi; ldf8 ni0=[nptr],16 // np[0] | ||
420 | ldf8 ni1=[r31],16 // np[1] | ||
421 | cmp4.le p6,p7=4,in5 };; | ||
422 | |||
423 | { .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2] | ||
424 | (p5)fcvt.fxu ai2=f0 | ||
425 | cmp4.le p8,p9=5,in5 } | ||
426 | { .mfi; (p6)ldf8 ai3=[r29],16 // ap[3] | ||
427 | (p7)fcvt.fxu ai3=f0 | ||
428 | cmp4.le p10,p11=6,in5 } | ||
429 | { .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2] | ||
430 | (p5)fcvt.fxu bj[5]=f0 | ||
431 | cmp4.le p12,p13=7,in5 } | ||
432 | { .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3] | ||
433 | (p7)fcvt.fxu bj[4]=f0 | ||
434 | cmp4.le p14,p15=8,in5 } | ||
435 | { .mfi; (p4)ldf8 ni2=[nptr],16 // np[2] | ||
436 | (p5)fcvt.fxu ni2=f0 | ||
437 | addp4 r28=-1,in5 } | ||
438 | { .mfi; (p6)ldf8 ni3=[r31],16 // np[3] | ||
439 | (p7)fcvt.fxu ni3=f0 | ||
440 | $ADDP in4=0,in4 };; | ||
441 | |||
// n0 is loaded from [in4]; tf[1] and t[0..7] are zeroed below.
442 | { .mfi; ldf8 n0=[in4] | ||
443 | fcvt.fxu tf[1]=f0 | ||
444 | nop.i 0 } | ||
445 | |||
446 | { .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4] | ||
447 | (p9)fcvt.fxu ai4=f0 | ||
448 | mov t[0]=r0 } | ||
449 | { .mfi; (p10)ldf8 ai5=[r29],16 // ap[5] | ||
450 | (p11)fcvt.fxu ai5=f0 | ||
451 | mov t[1]=r0 } | ||
452 | { .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4] | ||
453 | (p9)fcvt.fxu bj[3]=f0 | ||
454 | mov t[2]=r0 } | ||
455 | { .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5] | ||
456 | (p11)fcvt.fxu bj[2]=f0 | ||
457 | mov t[3]=r0 } | ||
458 | { .mfi; (p8)ldf8 ni4=[nptr],16 // np[4] | ||
459 | (p9)fcvt.fxu ni4=f0 | ||
460 | mov t[4]=r0 } | ||
461 | { .mfi; (p10)ldf8 ni5=[r31],16 // np[5] | ||
462 | (p11)fcvt.fxu ni5=f0 | ||
463 | mov t[5]=r0 };; | ||
464 | |||
// Loop control is armed alongside the last loads: ar.lc=r28 (r28 was set
// to in5-1 by addp4 above), ar.ec=1, and pr.rot=1<<16 sets only p16.
465 | { .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6] | ||
466 | (p13)fcvt.fxu ai6=f0 | ||
467 | mov t[6]=r0 } | ||
468 | { .mfi; (p14)ldf8 ai7=[r29],16 // ap[7] | ||
469 | (p15)fcvt.fxu ai7=f0 | ||
470 | mov t[7]=r0 } | ||
471 | { .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6] | ||
472 | (p13)fcvt.fxu bj[1]=f0 | ||
473 | mov ar.lc=r28 } | ||
474 | { .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7] | ||
475 | (p15)fcvt.fxu bj[0]=f0 | ||
476 | mov ar.ec=1 } | ||
477 | { .mfi; (p12)ldf8 ni6=[nptr],16 // np[6] | ||
478 | (p13)fcvt.fxu ni6=f0 | ||
479 | mov pr.rot=1<<16 } | ||
480 | { .mfb; (p14)ldf8 ni7=[r31],16 // np[7] | ||
481 | (p15)fcvt.fxu ni7=f0 | ||
482 | brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16 | ||
483 | };; | ||
484 | |||
485 | // The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt | ||
486 | // to measure with help of Interval Time Counter indicated that the | ||
487 | // factor is a tad higher: 33 or 34, if not 35. Exact measurement and | ||
488 | // addressing the issue is problematic, because I don't have access | ||
489 | // to platform-specific instruction-level profiler. On Itanium it | ||
490 | // should run in 56*n ticks, because of higher xma latency... | ||
// Two overlapped stages per pass: (p16) instructions start the products
// for the current b element (bj[7]) and fold the low words into t0 and
// t[6]; instructions annotated (p17) finish folding the previous pass's
// remaining words into t[6]..t[1]. Only p16 is set on entry, so the (p17)
// half is idle during the first pass. The numbered comments (0: .. 31:)
// mark the scheduled tick of each instruction group.
491 | .Louter_8_ctop: | ||
492 | .pred.rel "mutex",p40,p42 | ||
493 | .pred.rel "mutex",p48,p50 | ||
494 | { .mfi; (p16) nop.m 0 // 0: | ||
495 | (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0] | ||
496 | (p40) add a3=a3,n3 } // (p17) a3+=n3 | ||
497 | { .mfi; (p42) add a3=a3,n3,1 | ||
498 | (p16) xma.lu alo[0]=ai0,bj[7],tf[1] | ||
499 | (p16) nop.i 0 };; | ||
500 | { .mii; (p17) getf.sig a7=alo[8] // 1: | ||
501 | (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3 | ||
502 | (p50) add t[6]=t[6],a3,1 };; | ||
503 | { .mfi; (p17) getf.sig a8=ahi[8] // 2: | ||
504 | (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0 | ||
505 | (p40) cmp.ltu p43,p41=a3,n3 } | ||
506 | { .mfi; (p42) cmp.leu p43,p41=a3,n3 | ||
507 | (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6] | ||
508 | (p16) nop.i 0 };; | ||
509 | { .mii; (p17) getf.sig n5=nlo[6] // 3: | ||
510 | (p48) cmp.ltu p51,p49=t[6],a3 | ||
511 | (p50) cmp.leu p51,p49=t[6],a3 };; | ||
512 | .pred.rel "mutex",p41,p43 | ||
513 | .pred.rel "mutex",p49,p51 | ||
514 | { .mfi; (p16) nop.m 0 // 4: | ||
515 | (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i] | ||
516 | (p41) add a4=a4,n4 } // (p17) a4+=n4 | ||
517 | { .mfi; (p43) add a4=a4,n4,1 | ||
518 | (p16) xma.lu alo[1]=ai1,bj[7],ahi[0] | ||
519 | (p16) nop.i 0 };; | ||
520 | { .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4 | ||
521 | (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0 | ||
522 | (p51) add t[5]=t[5],a4,1 };; | ||
523 | { .mfi; (p16) nop.m 0 // 6: | ||
524 | (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0 | ||
525 | (p41) cmp.ltu p42,p40=a4,n4 } | ||
526 | { .mfi; (p43) cmp.leu p42,p40=a4,n4 | ||
527 | (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7] | ||
528 | (p16) nop.i 0 };; | ||
529 | { .mii; (p17) getf.sig n6=nlo[7] // 7: | ||
530 | (p49) cmp.ltu p50,p48=t[5],a4 | ||
531 | (p51) cmp.leu p50,p48=t[5],a4 };; | ||
532 | .pred.rel "mutex",p40,p42 | ||
533 | .pred.rel "mutex",p48,p50 | ||
534 | { .mfi; (p16) nop.m 0 // 8: | ||
535 | (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i] | ||
536 | (p40) add a5=a5,n5 } // (p17) a5+=n5 | ||
537 | { .mfi; (p42) add a5=a5,n5,1 | ||
538 | (p16) xma.lu alo[2]=ai2,bj[7],ahi[1] | ||
539 | (p16) nop.i 0 };; | ||
540 | { .mii; (p16) getf.sig a1=alo[1] // 9: | ||
541 | (p48) add t[4]=t[4],a5 // (p17) t[4]+=a5 | ||
542 | (p50) add t[4]=t[4],a5,1 };; | ||
543 | { .mfi; (p16) nop.m 0 // 10: | ||
544 | (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0 | ||
545 | (p40) cmp.ltu p43,p41=a5,n5 } | ||
546 | { .mfi; (p42) cmp.leu p43,p41=a5,n5 | ||
547 | (p16) xma.lu nlo[0]=ni0,mj[0],alo[0] | ||
548 | (p16) nop.i 0 };; | ||
549 | { .mii; (p17) getf.sig n7=nlo[8] // 11: | ||
550 | (p48) cmp.ltu p51,p49=t[4],a5 | ||
551 | (p50) cmp.leu p51,p49=t[4],a5 };; | ||
552 | .pred.rel "mutex",p41,p43 | ||
553 | .pred.rel "mutex",p49,p51 | ||
554 | { .mfi; (p17) getf.sig n8=nhi[8] // 12: | ||
555 | (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i] | ||
556 | (p41) add a6=a6,n6 } // (p17) a6+=n6 | ||
557 | { .mfi; (p43) add a6=a6,n6,1 | ||
558 | (p16) xma.lu alo[3]=ai3,bj[7],ahi[2] | ||
559 | (p16) nop.i 0 };; | ||
560 | { .mii; (p16) getf.sig a2=alo[2] // 13: | ||
561 | (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6 | ||
562 | (p51) add t[3]=t[3],a6,1 };; | ||
563 | { .mfi; (p16) nop.m 0 // 14: | ||
564 | (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0 | ||
565 | (p41) cmp.ltu p42,p40=a6,n6 } | ||
566 | { .mfi; (p43) cmp.leu p42,p40=a6,n6 | ||
567 | (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0] | ||
568 | (p16) nop.i 0 };; | ||
569 | { .mii; (p16) nop.m 0 // 15: | ||
570 | (p49) cmp.ltu p50,p48=t[3],a6 | ||
571 | (p51) cmp.leu p50,p48=t[3],a6 };; | ||
572 | .pred.rel "mutex",p40,p42 | ||
573 | .pred.rel "mutex",p48,p50 | ||
574 | { .mfi; (p16) nop.m 0 // 16: | ||
575 | (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i] | ||
576 | (p40) add a7=a7,n7 } // (p17) a7+=n7 | ||
577 | { .mfi; (p42) add a7=a7,n7,1 | ||
578 | (p16) xma.lu alo[4]=ai4,bj[7],ahi[3] | ||
579 | (p16) nop.i 0 };; | ||
580 | { .mii; (p16) getf.sig a3=alo[3] // 17: | ||
581 | (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7 | ||
582 | (p50) add t[2]=t[2],a7,1 };; | ||
583 | { .mfi; (p16) nop.m 0 // 18: | ||
584 | (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0 | ||
585 | (p40) cmp.ltu p43,p41=a7,n7 } | ||
586 | { .mfi; (p42) cmp.leu p43,p41=a7,n7 | ||
587 | (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1] | ||
588 | (p16) nop.i 0 };; | ||
589 | { .mii; (p16) getf.sig n1=nlo[1] // 19: | ||
590 | (p48) cmp.ltu p51,p49=t[2],a7 | ||
591 | (p50) cmp.leu p51,p49=t[2],a7 };; | ||
592 | .pred.rel "mutex",p41,p43 | ||
593 | .pred.rel "mutex",p49,p51 | ||
594 | { .mfi; (p16) nop.m 0 // 20: | ||
595 | (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i] | ||
596 | (p41) add a8=a8,n8 } // (p17) a8+=n8 | ||
597 | { .mfi; (p43) add a8=a8,n8,1 | ||
598 | (p16) xma.lu alo[5]=ai5,bj[7],ahi[4] | ||
599 | (p16) nop.i 0 };; | ||
600 | { .mii; (p16) getf.sig a4=alo[4] // 21: | ||
601 | (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8 | ||
602 | (p51) add t[1]=t[1],a8,1 };; | ||
603 | { .mfi; (p16) nop.m 0 // 22: | ||
604 | (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0 | ||
605 | (p41) cmp.ltu p42,p40=a8,n8 } | ||
606 | { .mfi; (p43) cmp.leu p42,p40=a8,n8 | ||
607 | (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2] | ||
608 | (p16) nop.i 0 };; | ||
609 | { .mii; (p16) getf.sig n2=nlo[2] // 23: | ||
610 | (p49) cmp.ltu p50,p48=t[1],a8 | ||
611 | (p51) cmp.leu p50,p48=t[1],a8 };; | ||
612 | { .mfi; (p16) nop.m 0 // 24: | ||
613 | (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i] | ||
614 | (p16) add a1=a1,n1 } // (p16) a1+=n1 | ||
615 | { .mfi; (p16) nop.m 0 | ||
616 | (p16) xma.lu alo[6]=ai6,bj[7],ahi[5] | ||
617 | (p17) mov t[0]=r0 };; | ||
618 | { .mii; (p16) getf.sig a5=alo[5] // 25: | ||
619 | (p16) add t0=t[7],a1 // (p16) t[7]+=a1 | ||
620 | (p42) add t[0]=t[0],r0,1 };; | ||
621 | { .mfi; (p16) setf.sig tf[0]=t0 // 26: | ||
622 | (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0 | ||
623 | (p50) add t[0]=t[0],r0,1 } | ||
624 | { .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1 | ||
625 | (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3] | ||
626 | (p16) nop.i 0 };; | ||
627 | { .mii; (p16) getf.sig n3=nlo[3] // 27: | ||
628 | (p16) cmp.ltu.unc p50,p48=t0,a1 | ||
629 | (p16) nop.i 0 };; | ||
630 | .pred.rel "mutex",p40,p42 | ||
631 | .pred.rel "mutex",p48,p50 | ||
632 | { .mfi; (p16) nop.m 0 // 28: | ||
633 | (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i] | ||
634 | (p40) add a2=a2,n2 } // (p16) a2+=n2 | ||
635 | { .mfi; (p42) add a2=a2,n2,1 | ||
636 | (p16) xma.lu alo[7]=ai7,bj[7],ahi[6] | ||
637 | (p16) nop.i 0 };; | ||
638 | { .mii; (p16) getf.sig a6=alo[6] // 29: | ||
639 | (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2 | ||
640 | (p50) add t[6]=t[6],a2,1 };; | ||
641 | { .mfi; (p16) nop.m 0 // 30: | ||
642 | (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0 | ||
643 | (p40) cmp.ltu p41,p39=a2,n2 } | ||
644 | { .mfi; (p42) cmp.leu p41,p39=a2,n2 | ||
645 | (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4] | ||
646 | (p16) nop.i 0 };; | ||
647 | { .mfi; (p16) getf.sig n4=nlo[4] // 31: | ||
648 | (p16) nop.f 0 | ||
649 | (p48) cmp.ltu p49,p47=t[6],a2 } | ||
650 | { .mfb; (p50) cmp.leu p49,p47=t[6],a2 | ||
651 | (p16) nop.f 0 | ||
652 | br.ctop.sptk.many .Louter_8_ctop };; | ||
653 | .Louter_8_cend: | ||
654 | |||
655 | // above loop has to execute one more time, without (p16), which is | ||
656 | // replaced with merged move of np[8] to GPR bank | ||
// In this drain pass the slots that held (p16) work instead copy ni0..ni7
// into n1..n8 with getf.sig; those GPR copies feed the subtraction that
// follows. The numbered comments keep the tick numbering of the loop body.
657 | .pred.rel "mutex",p40,p42 | ||
658 | .pred.rel "mutex",p48,p50 | ||
659 | { .mmi; (p0) getf.sig n1=ni0 // 0: | ||
660 | (p40) add a3=a3,n3 // (p17) a3+=n3 | ||
661 | (p42) add a3=a3,n3,1 };; | ||
662 | { .mii; (p17) getf.sig a7=alo[8] // 1: | ||
663 | (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3 | ||
664 | (p50) add t[6]=t[6],a3,1 };; | ||
665 | { .mfi; (p17) getf.sig a8=ahi[8] // 2: | ||
666 | (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0 | ||
667 | (p40) cmp.ltu p43,p41=a3,n3 } | ||
668 | { .mfi; (p42) cmp.leu p43,p41=a3,n3 | ||
669 | (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6] | ||
670 | (p0) nop.i 0 };; | ||
671 | { .mii; (p17) getf.sig n5=nlo[6] // 3: | ||
672 | (p48) cmp.ltu p51,p49=t[6],a3 | ||
673 | (p50) cmp.leu p51,p49=t[6],a3 };; | ||
674 | .pred.rel "mutex",p41,p43 | ||
675 | .pred.rel "mutex",p49,p51 | ||
676 | { .mmi; (p0) getf.sig n2=ni1 // 4: | ||
677 | (p41) add a4=a4,n4 // (p17) a4+=n4 | ||
678 | (p43) add a4=a4,n4,1 };; | ||
679 | { .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4 | ||
680 | (p0) nop.f 0 | ||
681 | (p51) add t[5]=t[5],a4,1 };; | ||
682 | { .mfi; (p0) getf.sig n3=ni2 // 6: | ||
683 | (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0 | ||
684 | (p41) cmp.ltu p42,p40=a4,n4 } | ||
685 | { .mfi; (p43) cmp.leu p42,p40=a4,n4 | ||
686 | (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7] | ||
687 | (p0) nop.i 0 };; | ||
688 | { .mii; (p17) getf.sig n6=nlo[7] // 7: | ||
689 | (p49) cmp.ltu p50,p48=t[5],a4 | ||
690 | (p51) cmp.leu p50,p48=t[5],a4 };; | ||
691 | .pred.rel "mutex",p40,p42 | ||
692 | .pred.rel "mutex",p48,p50 | ||
693 | { .mii; (p0) getf.sig n4=ni3 // 8: | ||
694 | (p40) add a5=a5,n5 // (p17) a5+=n5 | ||
695 | (p42) add a5=a5,n5,1 };; | ||
696 | { .mii; (p0) nop.m 0 // 9: | ||
697 | (p48) add t[4]=t[4],a5 // (p17) t[4]+=a5 | ||
698 | (p50) add t[4]=t[4],a5,1 };; | ||
699 | { .mii; (p0) nop.m 0 // 10: | ||
700 | (p40) cmp.ltu p43,p41=a5,n5 | ||
701 | (p42) cmp.leu p43,p41=a5,n5 };; | ||
702 | { .mii; (p17) getf.sig n7=nlo[8] // 11: | ||
703 | (p48) cmp.ltu p51,p49=t[4],a5 | ||
704 | (p50) cmp.leu p51,p49=t[4],a5 };; | ||
705 | .pred.rel "mutex",p41,p43 | ||
706 | .pred.rel "mutex",p49,p51 | ||
707 | { .mii; (p17) getf.sig n8=nhi[8] // 12: | ||
708 | (p41) add a6=a6,n6 // (p17) a6+=n6 | ||
709 | (p43) add a6=a6,n6,1 };; | ||
710 | { .mii; (p0) getf.sig n5=ni4 // 13: | ||
711 | (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6 | ||
712 | (p51) add t[3]=t[3],a6,1 };; | ||
713 | { .mii; (p0) nop.m 0 // 14: | ||
714 | (p41) cmp.ltu p42,p40=a6,n6 | ||
715 | (p43) cmp.leu p42,p40=a6,n6 };; | ||
716 | { .mii; (p0) getf.sig n6=ni5 // 15: | ||
717 | (p49) cmp.ltu p50,p48=t[3],a6 | ||
718 | (p51) cmp.leu p50,p48=t[3],a6 };; | ||
719 | .pred.rel "mutex",p40,p42 | ||
720 | .pred.rel "mutex",p48,p50 | ||
721 | { .mii; (p0) nop.m 0 // 16: | ||
722 | (p40) add a7=a7,n7 // (p17) a7+=n7 | ||
723 | (p42) add a7=a7,n7,1 };; | ||
724 | { .mii; (p0) nop.m 0 // 17: | ||
725 | (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7 | ||
726 | (p50) add t[2]=t[2],a7,1 };; | ||
727 | { .mii; (p0) nop.m 0 // 18: | ||
728 | (p40) cmp.ltu p43,p41=a7,n7 | ||
729 | (p42) cmp.leu p43,p41=a7,n7 };; | ||
730 | { .mii; (p0) getf.sig n7=ni6 // 19: | ||
731 | (p48) cmp.ltu p51,p49=t[2],a7 | ||
732 | (p50) cmp.leu p51,p49=t[2],a7 };; | ||
733 | .pred.rel "mutex",p41,p43 | ||
734 | .pred.rel "mutex",p49,p51 | ||
735 | { .mii; (p0) nop.m 0 // 20: | ||
736 | (p41) add a8=a8,n8 // (p17) a8+=n8 | ||
737 | (p43) add a8=a8,n8,1 };; | ||
738 | { .mmi; (p0) nop.m 0 // 21: | ||
739 | (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8 | ||
740 | (p51) add t[1]=t[1],a8,1 } | ||
741 | { .mmi; (p17) mov t[0]=r0 | ||
742 | (p41) cmp.ltu p42,p40=a8,n8 | ||
743 | (p43) cmp.leu p42,p40=a8,n8 };; | ||
744 | { .mmi; (p0) getf.sig n8=ni7 // 22: | ||
745 | (p49) cmp.ltu p50,p48=t[1],a8 | ||
746 | (p51) cmp.leu p50,p48=t[1],a8 } | ||
// r16/r17 are pointed at the f16/f17 spill slots (prevsp-7*16, prevsp-6*16)
// ahead of the refill at .Ldone.
747 | { .mmi; (p42) add t[0]=t[0],r0,1 | ||
748 | (p0) add r16=-7*16,prevsp | ||
749 | (p0) add r17=-6*16,prevsp };; | ||
750 | |||
751 | // subtract np[8] from carrybit|tmp[8] | ||
752 | // carrybit|tmp[8] layout upon exit from above loop is: | ||
753 | // t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant) | ||
// The borrow ripples word by word through the predicate pairs p32/p34 and
// p33/p35: the "gtu" compare is used when no borrow came in, "geu" when one
// did. Differences land in n1..n8; the last step subtracts the borrow from
// the carry word t[0] into a8. On exit p34 means the subtraction borrowed
// out of the top word and p32 means it did not.
// r18 is set to the f18 spill slot here for the refill at .Ldone.
754 | { .mmi; (p50)add t[0]=t[0],r0,1 | ||
755 | add r18=-5*16,prevsp | ||
756 | sub n1=t0,n1 };; | ||
757 | { .mmi; cmp.gtu p34,p32=n1,t0;; | ||
758 | .pred.rel "mutex",p32,p34 | ||
759 | (p32)sub n2=t[7],n2 | ||
760 | (p34)sub n2=t[7],n2,1 };; | ||
761 | { .mii; (p32)cmp.gtu p35,p33=n2,t[7] | ||
762 | (p34)cmp.geu p35,p33=n2,t[7];; | ||
763 | .pred.rel "mutex",p33,p35 | ||
764 | (p33)sub n3=t[6],n3 } | ||
765 | { .mmi; (p35)sub n3=t[6],n3,1;; | ||
766 | (p33)cmp.gtu p34,p32=n3,t[6] | ||
767 | (p35)cmp.geu p34,p32=n3,t[6] };; | ||
768 | .pred.rel "mutex",p32,p34 | ||
769 | { .mii; (p32)sub n4=t[5],n4 | ||
770 | (p34)sub n4=t[5],n4,1;; | ||
771 | (p32)cmp.gtu p35,p33=n4,t[5] } | ||
772 | { .mmi; (p34)cmp.geu p35,p33=n4,t[5];; | ||
773 | .pred.rel "mutex",p33,p35 | ||
774 | (p33)sub n5=t[4],n5 | ||
775 | (p35)sub n5=t[4],n5,1 };; | ||
776 | { .mii; (p33)cmp.gtu p34,p32=n5,t[4] | ||
777 | (p35)cmp.geu p34,p32=n5,t[4];; | ||
778 | .pred.rel "mutex",p32,p34 | ||
779 | (p32)sub n6=t[3],n6 } | ||
780 | { .mmi; (p34)sub n6=t[3],n6,1;; | ||
781 | (p32)cmp.gtu p35,p33=n6,t[3] | ||
782 | (p34)cmp.geu p35,p33=n6,t[3] };; | ||
783 | .pred.rel "mutex",p33,p35 | ||
784 | { .mii; (p33)sub n7=t[2],n7 | ||
785 | (p35)sub n7=t[2],n7,1;; | ||
786 | (p33)cmp.gtu p34,p32=n7,t[2] } | ||
787 | { .mmi; (p35)cmp.geu p34,p32=n7,t[2];; | ||
788 | .pred.rel "mutex",p32,p34 | ||
789 | (p32)sub n8=t[1],n8 | ||
790 | (p34)sub n8=t[1],n8,1 };; | ||
791 | { .mii; (p32)cmp.gtu p35,p33=n8,t[1] | ||
792 | (p34)cmp.geu p35,p33=n8,t[1];; | ||
793 | .pred.rel "mutex",p33,p35 | ||
794 | (p33)sub a8=t[0],r0 } | ||
795 | { .mmi; (p35)sub a8=t[0],r0,1;; | ||
796 | (p33)cmp.gtu p34,p32=a8,t[0] | ||
797 | (p35)cmp.geu p34,p32=a8,t[0] };; | ||
798 | |||
799 | // save the result, either tmp[num] or tmp[num]-np[num] | ||
// (p32) stores the differences n1..n8; (p34) stores the unreduced words
// t0, t[7] .. t[1]. Words are written least significant first through rptr.
// p5/p7/p9/p11/p13/p15 were set by the cmp4.le tests against in5 while the
// inputs were loaded, so each branch leaves for .Ldone as soon as in5 words
// have been written. r19 is set to the f19 spill slot for the refill.
800 | .pred.rel "mutex",p32,p34 | ||
801 | { .mmi; (p32)st8 [rptr]=n1,8 | ||
802 | (p34)st8 [rptr]=t0,8 | ||
803 | add r19=-4*16,prevsp};; | ||
804 | { .mmb; (p32)st8 [rptr]=n2,8 | ||
805 | (p34)st8 [rptr]=t[7],8 | ||
806 | (p5)br.cond.dpnt.few .Ldone };; | ||
807 | { .mmb; (p32)st8 [rptr]=n3,8 | ||
808 | (p34)st8 [rptr]=t[6],8 | ||
809 | (p7)br.cond.dpnt.few .Ldone };; | ||
810 | { .mmb; (p32)st8 [rptr]=n4,8 | ||
811 | (p34)st8 [rptr]=t[5],8 | ||
812 | (p9)br.cond.dpnt.few .Ldone };; | ||
813 | { .mmb; (p32)st8 [rptr]=n5,8 | ||
814 | (p34)st8 [rptr]=t[4],8 | ||
815 | (p11)br.cond.dpnt.few .Ldone };; | ||
816 | { .mmb; (p32)st8 [rptr]=n6,8 | ||
817 | (p34)st8 [rptr]=t[3],8 | ||
818 | (p13)br.cond.dpnt.few .Ldone };; | ||
819 | { .mmb; (p32)st8 [rptr]=n7,8 | ||
820 | (p34)st8 [rptr]=t[2],8 | ||
821 | (p15)br.cond.dpnt.few .Ldone };; | ||
822 | { .mmb; (p32)st8 [rptr]=n8,8 | ||
823 | (p34)st8 [rptr]=t[1],8 | ||
824 | nop.b 0 };; | ||
825 | .Ldone: // epilogue | ||
// Refill f16-f23 from the spill slots: r16..r19 address f16..f19 and are
// post-incremented by 64 to reach f20..f23. Then pr, ar.lc and sp are
// restored from prevpr/prevlc/prevsp and ret0 is set to 1.
// The "rum 1<<5" here matches the one in bn_mul_mont_general, where it is
// annotated as clearing um.mfh.
826 | { .mmi; ldf.fill f16=[r16],64 | ||
827 | ldf.fill f17=[r17],64 | ||
828 | nop.i 0 } | ||
829 | { .mmi; ldf.fill f18=[r18],64 | ||
830 | ldf.fill f19=[r19],64 | ||
831 | mov pr=prevpr,0x1ffff };; | ||
832 | { .mmi; ldf.fill f20=[r16] | ||
833 | ldf.fill f21=[r17] | ||
834 | mov ar.lc=prevlc } | ||
835 | { .mmi; ldf.fill f22=[r18] | ||
836 | ldf.fill f23=[r19] | ||
837 | mov ret0=1 } // signal "handled" | ||
838 | { .mib; rum 1<<5 | ||
839 | .restore sp | ||
840 | mov sp=prevsp | ||
841 | br.ret.sptk.many b0 };; | ||
842 | .endp bn_mul_mont_8# | ||
843 | |||
844 | .type copyright#,\@object | ||
845 | copyright: | ||
846 | stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>" | ||
847 | ___ | ||
848 | |||
# Take the next command-line argument as the output file name; if it is
# true, reopen STDOUT onto that file, then print $code and close STDOUT.
# NOTE(review): $code is assigned outside this chunk (presumably the
# assembly text that ends at the "___" line above) -- confirm.
849 | $output=shift and open STDOUT,">$output"; | ||
850 | print $code; | ||
851 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/bn/asm/ia64.S b/src/lib/libcrypto/bn/asm/ia64.S deleted file mode 100644 index 0cf805ddc4..0000000000 --- a/src/lib/libcrypto/bn/asm/ia64.S +++ /dev/null | |||
@@ -1,1555 +0,0 @@ | |||
1 | .explicit | ||
2 | .text | ||
3 | .ident "ia64.S, Version 2.1" | ||
4 | .ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" | ||
5 | |||
6 | // | ||
7 | // ==================================================================== | ||
8 | // Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
9 | // project. | ||
10 | // | ||
11 | // Rights for redistribution and usage in source and binary forms are | ||
12 | // granted according to the OpenSSL license. Warranty of any kind is | ||
13 | // disclaimed. | ||
14 | // ==================================================================== | ||
15 | // | ||
16 | // Version 2.x is Itanium2 re-tune. Few words about how Itanium2 is | ||
17 | // different from Itanium to this module viewpoint. Most notably, is it | ||
18 | // "wider" than Itanium? Can you experience loop scalability as | ||
19 | // discussed in commentary sections? Not really:-( Itanium2 has 6 | ||
20 | // integer ALU ports, i.e. it's 2 ports wider, but it's not enough to | ||
21 | // spin twice as fast, as I need 8 IALU ports. Amount of floating point | ||
22 | // ports is the same, i.e. 2, while I need 4. In other words, to this | ||
23 | // module Itanium2 remains effectively as "wide" as Itanium. Yet it's | ||
24 | // essentially different in respect to this module, and a re-tune was | ||
25 | // required. Well, because some instruction latencies have changed. Most | ||
26 | // noticeably those intensively used: | ||
27 | // | ||
28 | // Itanium Itanium2 | ||
29 | // ldf8 9 6 L2 hit | ||
30 | // ld8 2 1 L1 hit | ||
31 | // getf 2 5 | ||
32 | // xma[->getf] 7[+1] 4[+0] | ||
33 | // add[->st8] 1[+1] 1[+0] | ||
34 | // | ||
35 | // What does it mean? You might ratiocinate that the original code | ||
36 | // should run just faster... Because sum of latencies is smaller... | ||
37 | // Wrong! Note that getf latency increased. This means that if a loop is | ||
38 | // scheduled for lower latency (as they were), then it will suffer from | ||
39 | // stall condition and the code will therefore turn anti-scalable, e.g. | ||
40 | // original bn_mul_words spun at 5*n or 2.5 times slower than expected | ||
41 | // on Itanium2! What to do? Reschedule loops for Itanium2? But then | ||
42 | // Itanium would exhibit anti-scalability. So I've chosen to reschedule | ||
43 | // for worst latency for every instruction aiming for best *all-round* | ||
44 | // performance. | ||
45 | |||
46 | // Q. How much faster does it get? | ||
47 | // A. Here is the output from 'openssl speed rsa dsa' for vanilla | ||
48 | // 0.9.6a compiled with gcc version 2.96 20000731 (Red Hat | ||
49 | // Linux 7.1 2.96-81): | ||
50 | // | ||
51 | // sign verify sign/s verify/s | ||
52 | // rsa 512 bits 0.0036s 0.0003s 275.3 2999.2 | ||
53 | // rsa 1024 bits 0.0203s 0.0011s 49.3 894.1 | ||
54 | // rsa 2048 bits 0.1331s 0.0040s 7.5 250.9 | ||
55 | // rsa 4096 bits 0.9270s 0.0147s 1.1 68.1 | ||
56 | // sign verify sign/s verify/s | ||
57 | // dsa 512 bits 0.0035s 0.0043s 288.3 234.8 | ||
58 | // dsa 1024 bits 0.0111s 0.0135s 90.0 74.2 | ||
59 | // | ||
60 | // And here is similar output but for this assembler | ||
61 | // implementation:-) | ||
62 | // | ||
63 | // sign verify sign/s verify/s | ||
64 | // rsa 512 bits 0.0021s 0.0001s 549.4 9638.5 | ||
65 | // rsa 1024 bits 0.0055s 0.0002s 183.8 4481.1 | ||
66 | // rsa 2048 bits 0.0244s 0.0006s 41.4 1726.3 | ||
67 | // rsa 4096 bits 0.1295s 0.0018s 7.7 561.5 | ||
68 | // sign verify sign/s verify/s | ||
69 | // dsa 512 bits 0.0012s 0.0013s 891.9 756.6 | ||
70 | // dsa 1024 bits 0.0023s 0.0028s 440.4 376.2 | ||
71 | // | ||
72 | // Yes, you may argue that it's not fair comparison as it's | ||
73 | // possible to craft the C implementation with BN_UMULT_HIGH | ||
74 | // inline assembler macro. But of course! Here is the output | ||
75 | // with the macro: | ||
76 | // | ||
77 | // sign verify sign/s verify/s | ||
78 | // rsa 512 bits 0.0020s 0.0002s 495.0 6561.0 | ||
79 | // rsa 1024 bits 0.0086s 0.0004s 116.2 2235.7 | ||
80 | // rsa 2048 bits 0.0519s 0.0015s 19.3 667.3 | ||
81 | // rsa 4096 bits 0.3464s 0.0053s 2.9 187.7 | ||
82 | // sign verify sign/s verify/s | ||
83 | // dsa 512 bits 0.0016s 0.0020s 613.1 510.5 | ||
84 | // dsa 1024 bits 0.0045s 0.0054s 221.0 183.9 | ||
85 | // | ||
86 | // My code is still way faster, huh:-) And I believe that even | ||
87 | // higher performance can be achieved. Note that as keys get | ||
88 | // longer, performance gain is larger. Why? According to the | ||
89 | // profiler there is another player in the field, namely | ||
90 | // BN_from_montgomery consuming larger and larger portion of CPU | ||
91 | // time as keysize decreases. I therefore consider putting effort | ||
92 | // to assembler implementation of the following routine: | ||
93 | // | ||
94 | // void bn_mul_add_mont (BN_ULONG *rp,BN_ULONG *np,int nl,BN_ULONG n0) | ||
95 | // { | ||
96 | // int i,j; | ||
97 | // BN_ULONG v; | ||
98 | // | ||
99 | // for (i=0; i<nl; i++) | ||
100 | // { | ||
101 | // v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2); | ||
102 | // nrp++; | ||
103 | // rp++; | ||
104 | // if (((nrp[-1]+=v)&BN_MASK2) < v) | ||
105 | // for (j=0; ((++nrp[j])&BN_MASK2) == 0; j++) ; | ||
106 | // } | ||
107 | // } | ||
108 | // | ||
109 | // It might as well be beneficial to implement even combaX | ||
110 | // variants, as it appears as it can literally unleash the | ||
111 | // performance (see comment section to bn_mul_comba8 below). | ||
112 | // | ||
113 | // And finally for your reference the output for 0.9.6a compiled | ||
114 | // with SGIcc version 0.01.0-12 (keep in mind that for the moment | ||
115 | // of this writing it's not possible to convince SGIcc to use | ||
116 | // BN_UMULT_HIGH inline assembler macro, yet the code is fast, | ||
117 | // i.e. for a compiler generated one:-): | ||
118 | // | ||
119 | // sign verify sign/s verify/s | ||
120 | // rsa 512 bits 0.0022s 0.0002s 452.7 5894.3 | ||
121 | // rsa 1024 bits 0.0097s 0.0005s 102.7 2002.9 | ||
122 | // rsa 2048 bits 0.0578s 0.0017s 17.3 600.2 | ||
123 | // rsa 4096 bits 0.3838s 0.0061s 2.6 164.5 | ||
124 | // sign verify sign/s verify/s | ||
125 | // dsa 512 bits 0.0018s 0.0022s 547.3 459.6 | ||
126 | // dsa 1024 bits 0.0051s 0.0062s 196.6 161.3 | ||
127 | // | ||
128 | // Oh! Benchmarks were performed on 733MHz Lion-class Itanium | ||
129 | // system running Redhat Linux 7.1 (very special thanks to Ray | ||
130 | // McCaffity of Williams Communications for providing an account). | ||
131 | // | ||
132 | // Q. What's the heck with 'rum 1<<5' at the end of every function? | ||
133 | // A. Well, by clearing the "upper FP registers written" bit of the | ||
134 | // User Mask I want to excuse the kernel from preserving upper | ||
135 | // (f32-f127) FP register bank over process context switch, thus | ||
136 | // minimizing bus bandwidth consumption during the switch (i.e. | ||
137 | // after PKI operation completes and the program is off doing | ||
138 | // something else like bulk symmetric encryption). Having said | ||
139 | // this, I also want to point out that it might be good idea | ||
140 | // to compile the whole toolkit (as well as majority of the | ||
141 | // programs for that matter) with -mfixed-range=f32-f127 command | ||
142 | // line option. No, it doesn't prevent the compiler from writing | ||
143 | // to upper bank, but at least discourages to do so. If you don't | ||
144 | // like the idea you have the option to compile the module with | ||
145 | // -Drum=nop.m in command line. | ||
146 | // | ||
147 | |||
148 | #if defined(_HPUX_SOURCE) && !defined(_LP64) | ||
149 | #define ADDP addp4 | ||
150 | #else | ||
151 | #define ADDP add | ||
152 | #endif | ||
153 | |||
154 | #if 1 | ||
155 | // | ||
156 | // bn_[add|sub]_words routines. | ||
157 | // | ||
158 | // Loops are spinning in 2*(n+5) ticks on Itanium (provided that the | ||
159 | // data reside in L1 cache, i.e. 2 ticks away). It's possible to | ||
160 | // compress the epilogue and get down to 2*n+6, but at the cost of | ||
161 | // scalability (the neat feature of this implementation is that it | ||
162 | // shall automagically spin in n+5 on "wider" IA-64 implementations:-) | ||
163 | // I consider that the epilogue is short enough as it is to trade tiny | ||
164 | // performance loss on Itanium for scalability. | ||
165 | // | ||
166 | // BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num) | ||
167 | // | ||
168 | .global bn_add_words# | ||
169 | .proc bn_add_words# | ||
170 | .align 64 | ||
171 | .skip 32 // makes the loop body aligned at 64-byte boundary | ||
172 | bn_add_words: | ||
173 | .prologue | ||
174 | .save ar.pfs,r2 | ||
175 | { .mii; alloc r2=ar.pfs,4,12,0,16 | ||
176 | cmp4.le p6,p0=r35,r0 };; | ||
177 | { .mfb; mov r8=r0 // return value | ||
178 | (p6) br.ret.spnt.many b0 };; | ||
179 | |||
180 | { .mib; sub r10=r35,r0,1 | ||
181 | .save ar.lc,r3 | ||
182 | mov r3=ar.lc | ||
183 | brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16 | ||
184 | } | ||
185 | { .mib; ADDP r14=0,r32 // rp | ||
186 | .save pr,r9 | ||
187 | mov r9=pr };; | ||
188 | .body | ||
189 | { .mii; ADDP r15=0,r33 // ap | ||
190 | mov ar.lc=r10 | ||
191 | mov ar.ec=6 } | ||
192 | { .mib; ADDP r16=0,r34 // bp | ||
193 | mov pr.rot=1<<16 };; | ||
194 | |||
195 | .L_bn_add_words_ctop: | ||
196 | { .mii; (p16) ld8 r32=[r16],8 // b=*(bp++) | ||
197 | (p18) add r39=r37,r34 | ||
198 | (p19) cmp.ltu.unc p56,p0=r40,r38 } | ||
199 | { .mfb; (p0) nop.m 0x0 | ||
200 | (p0) nop.f 0x0 | ||
201 | (p0) nop.b 0x0 } | ||
202 | { .mii; (p16) ld8 r35=[r15],8 // a=*(ap++) | ||
203 | (p58) cmp.eq.or p57,p0=-1,r41 // (p20) | ||
204 | (p58) add r41=1,r41 } // (p20) | ||
205 | { .mfb; (p21) st8 [r14]=r42,8 // *(rp++)=r | ||
206 | (p0) nop.f 0x0 | ||
207 | br.ctop.sptk .L_bn_add_words_ctop };; | ||
208 | .L_bn_add_words_cend: | ||
209 | |||
210 | { .mii; | ||
211 | (p59) add r8=1,r8 // return value | ||
212 | mov pr=r9,0x1ffff | ||
213 | mov ar.lc=r3 } | ||
214 | { .mbb; nop.b 0x0 | ||
215 | br.ret.sptk.many b0 };; | ||
216 | .endp bn_add_words# | ||
217 | |||
218 | // | ||
219 | // BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num) | ||
220 | // | ||
221 | .global bn_sub_words# | ||
222 | .proc bn_sub_words# | ||
223 | .align 64 | ||
224 | .skip 32 // makes the loop body aligned at 64-byte boundary | ||
225 | bn_sub_words: | ||
226 | .prologue | ||
227 | .save ar.pfs,r2 | ||
228 | { .mii; alloc r2=ar.pfs,4,12,0,16 | ||
229 | cmp4.le p6,p0=r35,r0 };; | ||
230 | { .mfb; mov r8=r0 // return value | ||
231 | (p6) br.ret.spnt.many b0 };; | ||
232 | |||
233 | { .mib; sub r10=r35,r0,1 | ||
234 | .save ar.lc,r3 | ||
235 | mov r3=ar.lc | ||
236 | brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16 | ||
237 | } | ||
238 | { .mib; ADDP r14=0,r32 // rp | ||
239 | .save pr,r9 | ||
240 | mov r9=pr };; | ||
241 | .body | ||
242 | { .mii; ADDP r15=0,r33 // ap | ||
243 | mov ar.lc=r10 | ||
244 | mov ar.ec=6 } | ||
245 | { .mib; ADDP r16=0,r34 // bp | ||
246 | mov pr.rot=1<<16 };; | ||
247 | |||
248 | .L_bn_sub_words_ctop: | ||
249 | { .mii; (p16) ld8 r32=[r16],8 // b=*(bp++) | ||
250 | (p18) sub r39=r37,r34 | ||
251 | (p19) cmp.gtu.unc p56,p0=r40,r38 } | ||
252 | { .mfb; (p0) nop.m 0x0 | ||
253 | (p0) nop.f 0x0 | ||
254 | (p0) nop.b 0x0 } | ||
255 | { .mii; (p16) ld8 r35=[r15],8 // a=*(ap++) | ||
256 | (p58) cmp.eq.or p57,p0=0,r41 // (p20) | ||
257 | (p58) add r41=-1,r41 } // (p20) | ||
258 | { .mbb; (p21) st8 [r14]=r42,8 // *(rp++)=r | ||
259 | (p0) nop.b 0x0 | ||
260 | br.ctop.sptk .L_bn_sub_words_ctop };; | ||
261 | .L_bn_sub_words_cend: | ||
262 | |||
263 | { .mii; | ||
264 | (p59) add r8=1,r8 // return value | ||
265 | mov pr=r9,0x1ffff | ||
266 | mov ar.lc=r3 } | ||
267 | { .mbb; nop.b 0x0 | ||
268 | br.ret.sptk.many b0 };; | ||
269 | .endp bn_sub_words# | ||
270 | #endif | ||
271 | |||
272 | #if 0 | ||
273 | #define XMA_TEMPTATION | ||
274 | #endif | ||
275 | |||
276 | #if 1 | ||
277 | // | ||
278 | // BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | ||
279 | // | ||
280 | .global bn_mul_words# | ||
281 | .proc bn_mul_words# | ||
282 | .align 64 | ||
283 | .skip 32 // makes the loop body aligned at 64-byte boundary | ||
284 | bn_mul_words: | ||
285 | .prologue | ||
286 | .save ar.pfs,r2 | ||
287 | #ifdef XMA_TEMPTATION | ||
288 | { .mfi; alloc r2=ar.pfs,4,0,0,0 };; | ||
289 | #else | ||
290 | { .mfi; alloc r2=ar.pfs,4,12,0,16 };; | ||
291 | #endif | ||
292 | { .mib; mov r8=r0 // return value | ||
293 | cmp4.le p6,p0=r34,r0 | ||
294 | (p6) br.ret.spnt.many b0 };; | ||
295 | |||
296 | { .mii; sub r10=r34,r0,1 | ||
297 | .save ar.lc,r3 | ||
298 | mov r3=ar.lc | ||
299 | .save pr,r9 | ||
300 | mov r9=pr };; | ||
301 | |||
302 | .body | ||
303 | { .mib; setf.sig f8=r35 // w | ||
304 | mov pr.rot=0x800001<<16 | ||
305 | // ------^----- serves as (p50) at first (p27) | ||
306 | brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16 | ||
307 | } | ||
308 | |||
309 | #ifndef XMA_TEMPTATION | ||
310 | |||
311 | { .mmi; ADDP r14=0,r32 // rp | ||
312 | ADDP r15=0,r33 // ap | ||
313 | mov ar.lc=r10 } | ||
314 | { .mmi; mov r40=0 // serves as r35 at first (p27) | ||
315 | mov ar.ec=13 };; | ||
316 | |||
317 | // This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium | ||
318 | // L2 cache (i.e. 9 ticks away) as floating point load/store instructions | ||
319 | // bypass L1 cache and L2 latency is actually best-case scenario for | ||
320 | // ldf8. The loop is not scalable and shall run in 2*(n+12) even on | ||
321 | // "wider" IA-64 implementations. It's a trade-off here. n+24 loop | ||
322 | // would give us ~5% in *overall* performance improvement on "wider" | ||
323 | // IA-64, but would hurt Itanium for about same because of longer | ||
324 | // epilogue. As it's a matter of few percents in either case I've | ||
325 | // chosen to trade the scalability for development time (you can see | ||
326 | // this very instruction sequence in bn_mul_add_words loop which in | ||
327 | // turn is scalable). | ||
328 | .L_bn_mul_words_ctop: | ||
329 | { .mfi; (p25) getf.sig r36=f52 // low | ||
330 | (p21) xmpy.lu f48=f37,f8 | ||
331 | (p28) cmp.ltu p54,p50=r41,r39 } | ||
332 | { .mfi; (p16) ldf8 f32=[r15],8 | ||
333 | (p21) xmpy.hu f40=f37,f8 | ||
334 | (p0) nop.i 0x0 };; | ||
335 | { .mii; (p25) getf.sig r32=f44 // high | ||
336 | .pred.rel "mutex",p50,p54 | ||
337 | (p50) add r40=r38,r35 // (p27) | ||
338 | (p54) add r40=r38,r35,1 } // (p27) | ||
339 | { .mfb; (p28) st8 [r14]=r41,8 | ||
340 | (p0) nop.f 0x0 | ||
341 | br.ctop.sptk .L_bn_mul_words_ctop };; | ||
342 | .L_bn_mul_words_cend: | ||
343 | |||
344 | { .mii; nop.m 0x0 | ||
345 | .pred.rel "mutex",p51,p55 | ||
346 | (p51) add r8=r36,r0 | ||
347 | (p55) add r8=r36,r0,1 } | ||
348 | { .mfb; nop.m 0x0 | ||
349 | nop.f 0x0 | ||
350 | nop.b 0x0 } | ||
351 | |||
352 | #else // XMA_TEMPTATION | ||
353 | |||
354 | setf.sig f37=r0 // serves as carry at (p18) tick | ||
355 | mov ar.lc=r10 | ||
356 | mov ar.ec=5;; | ||
357 | |||
358 | // Most of you examining this code very likely wonder why in the name | ||
359 | // of Intel the following loop is commented out? Indeed, it looks so | ||
360 | // neat that you find it hard to believe that it's something wrong | ||
361 | // with it, right? The catch is that every iteration depends on the | ||
362 | // result from previous one and the latter isn't available instantly. | ||
363 | // The loop therefore spins at the latency of xma minus 1, or in other | ||
364 | // words at 6*(n+4) ticks:-( Compare to the "production" loop above | ||
365 | // that runs in 2*(n+12) where the low latency problem is worked around | ||
366 | // by moving the dependency to one-tick latent integer ALU. Note that | ||
367 | // "distance" between ldf8 and xma is not latency of ldf8, but the | ||
368 | // *difference* between xma and ldf8 latencies. | ||
369 | .L_bn_mul_words_ctop: | ||
370 | { .mfi; (p16) ldf8 f32=[r33],8 | ||
371 | (p18) xma.hu f38=f34,f8,f39 } | ||
372 | { .mfb; (p20) stf8 [r32]=f37,8 | ||
373 | (p18) xma.lu f35=f34,f8,f39 | ||
374 | br.ctop.sptk .L_bn_mul_words_ctop };; | ||
375 | .L_bn_mul_words_cend: | ||
376 | |||
377 | getf.sig r8=f41 // the return value | ||
378 | |||
379 | #endif // XMA_TEMPTATION | ||
380 | |||
381 | { .mii; nop.m 0x0 | ||
382 | mov pr=r9,0x1ffff | ||
383 | mov ar.lc=r3 } | ||
384 | { .mfb; rum 1<<5 // clear um.mfh | ||
385 | nop.f 0x0 | ||
386 | br.ret.sptk.many b0 };; | ||
387 | .endp bn_mul_words# | ||
388 | #endif | ||
389 | |||
390 | #if 1 | ||
391 | // | ||
392 | // BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | ||
393 | // | ||
394 | .global bn_mul_add_words# | ||
395 | .proc bn_mul_add_words# | ||
396 | .align 64 | ||
397 | .skip 48 // makes the loop body aligned at 64-byte boundary | ||
398 | bn_mul_add_words: | ||
399 | .prologue | ||
400 | .save ar.pfs,r2 | ||
401 | { .mmi; alloc r2=ar.pfs,4,4,0,8 | ||
402 | cmp4.le p6,p0=r34,r0 | ||
403 | .save ar.lc,r3 | ||
404 | mov r3=ar.lc };; | ||
405 | { .mib; mov r8=r0 // return value | ||
406 | sub r10=r34,r0,1 | ||
407 | (p6) br.ret.spnt.many b0 };; | ||
408 | |||
409 | { .mib; setf.sig f8=r35 // w | ||
410 | .save pr,r9 | ||
411 | mov r9=pr | ||
412 | brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16 | ||
413 | } | ||
414 | .body | ||
415 | { .mmi; ADDP r14=0,r32 // rp | ||
416 | ADDP r15=0,r33 // ap | ||
417 | mov ar.lc=r10 } | ||
418 | { .mii; ADDP r16=0,r32 // rp copy | ||
419 | mov pr.rot=0x2001<<16 | ||
420 | // ------^----- serves as (p40) at first (p27) | ||
421 | mov ar.ec=11 };; | ||
422 | |||
423 | // This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on | ||
424 | // Itanium 2. Yes, unlike previous versions it scales:-) Previous | ||
425 | // version was performing *all* additions in IALU and was starving | ||
426 | // for those even on Itanium 2. In this version one addition is | ||
427 | // moved to FPU and is folded with multiplication. This is at cost | ||
428 | // of propagating the result from previous call to this subroutine | ||
429 | // to L2 cache... In other words negligible even for shorter keys. | ||
430 | // *Overall* performance improvement [over previous version] varies | ||
431 | // from 11 to 22 percent depending on key length. | ||
432 | .L_bn_mul_add_words_ctop: | ||
433 | .pred.rel "mutex",p40,p42 | ||
434 | { .mfi; (p23) getf.sig r36=f45 // low | ||
435 | (p20) xma.lu f42=f36,f8,f50 // low | ||
436 | (p40) add r39=r39,r35 } // (p27) | ||
437 | { .mfi; (p16) ldf8 f32=[r15],8 // *(ap++) | ||
438 | (p20) xma.hu f36=f36,f8,f50 // high | ||
439 | (p42) add r39=r39,r35,1 };; // (p27) | ||
440 | { .mmi; (p24) getf.sig r32=f40 // high | ||
441 | (p16) ldf8 f46=[r16],8 // *(rp1++) | ||
442 | (p40) cmp.ltu p41,p39=r39,r35 } // (p27) | ||
443 | { .mib; (p26) st8 [r14]=r39,8 // *(rp2++) | ||
444 | (p42) cmp.leu p41,p39=r39,r35 // (p27) | ||
445 | br.ctop.sptk .L_bn_mul_add_words_ctop};; | ||
446 | .L_bn_mul_add_words_cend: | ||
447 | |||
448 | { .mmi; .pred.rel "mutex",p40,p42 | ||
449 | (p40) add r8=r35,r0 | ||
450 | (p42) add r8=r35,r0,1 | ||
451 | mov pr=r9,0x1ffff } | ||
452 | { .mib; rum 1<<5 // clear um.mfh | ||
453 | mov ar.lc=r3 | ||
454 | br.ret.sptk.many b0 };; | ||
455 | .endp bn_mul_add_words# | ||
456 | #endif | ||
457 | |||
458 | #if 1 | ||
459 | // | ||
460 | // void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num) | ||
461 | // | ||
462 | .global bn_sqr_words# | ||
463 | .proc bn_sqr_words# | ||
464 | .align 64 | ||
465 | .skip 32 // makes the loop body aligned at 64-byte boundary | ||
466 | bn_sqr_words: | ||
467 | .prologue | ||
468 | .save ar.pfs,r2 | ||
469 | { .mii; alloc r2=ar.pfs,3,0,0,0 | ||
470 | sxt4 r34=r34 };; | ||
471 | { .mii; cmp.le p6,p0=r34,r0 | ||
472 | mov r8=r0 } // return value | ||
473 | { .mfb; ADDP r32=0,r32 | ||
474 | nop.f 0x0 | ||
475 | (p6) br.ret.spnt.many b0 };; | ||
476 | |||
477 | { .mii; sub r10=r34,r0,1 | ||
478 | .save ar.lc,r3 | ||
479 | mov r3=ar.lc | ||
480 | .save pr,r9 | ||
481 | mov r9=pr };; | ||
482 | |||
483 | .body | ||
484 | { .mib; ADDP r33=0,r33 | ||
485 | mov pr.rot=1<<16 | ||
486 | brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16 | ||
487 | } | ||
488 | { .mii; add r34=8,r32 | ||
489 | mov ar.lc=r10 | ||
490 | mov ar.ec=18 };; | ||
491 | |||
492 | // 2*(n+17) on Itanium, (n+17) on "wider" IA-64 implementations. It's | ||
493 | // possible to compress the epilogue (I'm getting tired to write this | ||
494 | // comment over and over) and get down to 2*n+16 at the cost of | ||
495 | // scalability. The decision will very likely be reconsidered after the | ||
496 | // benchmark program is profiled. I.e. if performance gain on Itanium | ||
497 | // will appear larger than loss on "wider" IA-64, then the loop should | ||
498 | // be explicitly split and the epilogue compressed. | ||
499 | .L_bn_sqr_words_ctop: | ||
500 | { .mfi; (p16) ldf8 f32=[r33],8 | ||
501 | (p25) xmpy.lu f42=f41,f41 | ||
502 | (p0) nop.i 0x0 } | ||
503 | { .mib; (p33) stf8 [r32]=f50,16 | ||
504 | (p0) nop.i 0x0 | ||
505 | (p0) nop.b 0x0 } | ||
506 | { .mfi; (p0) nop.m 0x0 | ||
507 | (p25) xmpy.hu f52=f41,f41 | ||
508 | (p0) nop.i 0x0 } | ||
509 | { .mib; (p33) stf8 [r34]=f60,16 | ||
510 | (p0) nop.i 0x0 | ||
511 | br.ctop.sptk .L_bn_sqr_words_ctop };; | ||
512 | .L_bn_sqr_words_cend: | ||
513 | |||
514 | { .mii; nop.m 0x0 | ||
515 | mov pr=r9,0x1ffff | ||
516 | mov ar.lc=r3 } | ||
517 | { .mfb; rum 1<<5 // clear um.mfh | ||
518 | nop.f 0x0 | ||
519 | br.ret.sptk.many b0 };; | ||
520 | .endp bn_sqr_words# | ||
521 | #endif | ||
522 | |||
523 | #if 1 | ||
524 | // Apparently we win nothing by implementing special bn_sqr_comba8. | ||
525 | // Yes, it is possible to reduce the number of multiplications by | ||
526 | // almost factor of two, but then the amount of additions would | ||
527 | // increase by factor of two (as we would have to perform those | ||
528 | // otherwise performed by xma ourselves). Normally we would trade | ||
529 | // anyway as multiplications are way more expensive, but not this | ||
530 | // time... Multiplication kernel is fully pipelined and as we drain | ||
531 | // one 128-bit multiplication result per clock cycle multiplications | ||
532 | // are effectively as inexpensive as additions. Special implementation | ||
533 | // might become of interest for "wider" IA-64 implementation as you'll | ||
534 | // be able to get through the multiplication phase faster (there won't | ||
535 | // be any stall issues as discussed in the commentary section below and | ||
536 | // you therefore will be able to employ all 4 FP units)... But these | ||
537 | // Itanium days it's simply too hard to justify the effort so I just | ||
538 | // drop down to bn_mul_comba8 code:-) | ||
539 | // | ||
540 | // void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) | ||
541 | // | ||
542 | .global bn_sqr_comba8# | ||
543 | .proc bn_sqr_comba8# | ||
544 | .align 64 | ||
545 | bn_sqr_comba8: | ||
546 | .prologue | ||
547 | .save ar.pfs,r2 | ||
548 | #if defined(_HPUX_SOURCE) && !defined(_LP64) | ||
549 | { .mii; alloc r2=ar.pfs,2,1,0,0 | ||
550 | addp4 r33=0,r33 | ||
551 | addp4 r32=0,r32 };; | ||
552 | { .mii; | ||
553 | #else | ||
554 | { .mii; alloc r2=ar.pfs,2,1,0,0 | ||
555 | #endif | ||
556 | mov r34=r33 | ||
557 | add r14=8,r33 };; | ||
558 | .body | ||
559 | { .mii; add r17=8,r34 | ||
560 | add r15=16,r33 | ||
561 | add r18=16,r34 } | ||
562 | { .mfb; add r16=24,r33 | ||
563 | br .L_cheat_entry_point8 };; | ||
564 | .endp bn_sqr_comba8# | ||
565 | #endif | ||
566 | |||
567 | #if 1 | ||
568 | // I've estimated this routine to run in ~120 ticks, but in reality | ||
569 | // (i.e. according to ar.itc) it takes ~160 ticks. Are those extra | ||
570 | // cycles consumed for instructions fetch? Or did I misinterpret some | ||
571 | // clause in Itanium µ-architecture manual? Comments are welcomed and | ||
572 | // highly appreciated. | ||
573 | // | ||
574 | // On Itanium 2 it takes ~190 ticks. This is because of stalls on | ||
575 | // result from getf.sig. I do nothing about it at this point for | ||
576 | // reasons depicted below. | ||
577 | // | ||
578 | // However! It should be noted that even 160 ticks is darn good result | ||
579 | // as it's over 10 (yes, ten, spelled as t-e-n) times faster than the | ||
580 | // C version (compiled with gcc with inline assembler). I really | ||
581 | // kicked compiler's butt here, didn't I? Yeah! This brings us to the | ||
582 | // following statement. It's damn shame that this routine isn't called | ||
583 | // very often nowadays! According to the profiler most CPU time is | ||
584 | // consumed by bn_mul_add_words called from BN_from_montgomery. In | ||
585 | // order to estimate what we're missing, I've compared the performance | ||
586 | // of this routine against "traditional" implementation, i.e. against | ||
587 | // following routine: | ||
588 | // | ||
589 | // void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | ||
590 | // { r[ 8]=bn_mul_words( &(r[0]),a,8,b[0]); | ||
591 | // r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]); | ||
592 | // r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]); | ||
593 | // r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]); | ||
594 | // r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]); | ||
595 | // r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]); | ||
596 | // r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]); | ||
597 | // r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]); | ||
598 | // } | ||
599 | // | ||
600 | // The one below is over 8 times faster than the one above:-( Even | ||
601 | // more reasons to "combafy" bn_mul_add_mont... | ||
602 | // | ||
603 | // And yes, this routine really made me wish there were an optimizing | ||
604 | // assembler! It also feels like it deserves a dedication. | ||
605 | // | ||
606 | // To my wife for being there and to my kids... | ||
607 | // | ||
608 | // void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | ||
609 | // | ||
610 | #define carry1 r14 | ||
611 | #define carry2 r15 | ||
612 | #define carry3 r34 | ||
613 | .global bn_mul_comba8# | ||
614 | .proc bn_mul_comba8# | ||
615 | .align 64 | ||
616 | bn_mul_comba8: | ||
617 | .prologue | ||
618 | .save ar.pfs,r2 | ||
619 | #if defined(_HPUX_SOURCE) && !defined(_LP64) | ||
620 | { .mii; alloc r2=ar.pfs,3,0,0,0 | ||
621 | addp4 r33=0,r33 | ||
622 | addp4 r34=0,r34 };; | ||
623 | { .mii; addp4 r32=0,r32 | ||
624 | #else | ||
625 | { .mii; alloc r2=ar.pfs,3,0,0,0 | ||
626 | #endif | ||
627 | add r14=8,r33 | ||
628 | add r17=8,r34 } | ||
629 | .body | ||
630 | { .mii; add r15=16,r33 | ||
631 | add r18=16,r34 | ||
632 | add r16=24,r33 } | ||
633 | .L_cheat_entry_point8: | ||
634 | { .mmi; add r19=24,r34 | ||
635 | |||
636 | ldf8 f32=[r33],32 };; | ||
637 | |||
638 | { .mmi; ldf8 f120=[r34],32 | ||
639 | ldf8 f121=[r17],32 } | ||
640 | { .mmi; ldf8 f122=[r18],32 | ||
641 | ldf8 f123=[r19],32 };; | ||
642 | { .mmi; ldf8 f124=[r34] | ||
643 | ldf8 f125=[r17] } | ||
644 | { .mmi; ldf8 f126=[r18] | ||
645 | ldf8 f127=[r19] } | ||
646 | |||
647 | { .mmi; ldf8 f33=[r14],32 | ||
648 | ldf8 f34=[r15],32 } | ||
649 | { .mmi; ldf8 f35=[r16],32;; | ||
650 | ldf8 f36=[r33] } | ||
651 | { .mmi; ldf8 f37=[r14] | ||
652 | ldf8 f38=[r15] } | ||
653 | { .mfi; ldf8 f39=[r16] | ||
654 | // -------\ Entering multiplier's heaven /------- | ||
655 | // ------------\ /------------ | ||
656 | // -----------------\ /----------------- | ||
657 | // ----------------------\/---------------------- | ||
658 | xma.hu f41=f32,f120,f0 } | ||
659 | { .mfi; xma.lu f40=f32,f120,f0 };; // (*) | ||
660 | { .mfi; xma.hu f51=f32,f121,f0 } | ||
661 | { .mfi; xma.lu f50=f32,f121,f0 };; | ||
662 | { .mfi; xma.hu f61=f32,f122,f0 } | ||
663 | { .mfi; xma.lu f60=f32,f122,f0 };; | ||
664 | { .mfi; xma.hu f71=f32,f123,f0 } | ||
665 | { .mfi; xma.lu f70=f32,f123,f0 };; | ||
666 | { .mfi; xma.hu f81=f32,f124,f0 } | ||
667 | { .mfi; xma.lu f80=f32,f124,f0 };; | ||
668 | { .mfi; xma.hu f91=f32,f125,f0 } | ||
669 | { .mfi; xma.lu f90=f32,f125,f0 };; | ||
670 | { .mfi; xma.hu f101=f32,f126,f0 } | ||
671 | { .mfi; xma.lu f100=f32,f126,f0 };; | ||
672 | { .mfi; xma.hu f111=f32,f127,f0 } | ||
673 | { .mfi; xma.lu f110=f32,f127,f0 };;// | ||
674 | // (*) You can argue that splitting at every second bundle would | ||
675 | // prevent "wider" IA-64 implementations from achieving the peak | ||
676 | // performance. Well, not really... The catch is that if you | ||
677 | // intend to keep 4 FP units busy by splitting at every fourth | ||
678 | // bundle and thus perform these 16 multiplications in 4 ticks, | ||
679 | // the first bundle *below* would stall because the result from | ||
680 | // the first xma bundle *above* won't be available for another 3 | ||
681 | // ticks (if not more, being an optimist, I assume that "wider" | ||
682 | // implementation will have same latency:-). This stall will hold | ||
683 | // you back and the performance would be as if every second bundle | ||
684 | // were split *anyway*... | ||
685 | { .mfi; getf.sig r16=f40 | ||
686 | xma.hu f42=f33,f120,f41 | ||
687 | add r33=8,r32 } | ||
688 | { .mfi; xma.lu f41=f33,f120,f41 };; | ||
689 | { .mfi; getf.sig r24=f50 | ||
690 | xma.hu f52=f33,f121,f51 } | ||
691 | { .mfi; xma.lu f51=f33,f121,f51 };; | ||
692 | { .mfi; st8 [r32]=r16,16 | ||
693 | xma.hu f62=f33,f122,f61 } | ||
694 | { .mfi; xma.lu f61=f33,f122,f61 };; | ||
695 | { .mfi; xma.hu f72=f33,f123,f71 } | ||
696 | { .mfi; xma.lu f71=f33,f123,f71 };; | ||
697 | { .mfi; xma.hu f82=f33,f124,f81 } | ||
698 | { .mfi; xma.lu f81=f33,f124,f81 };; | ||
699 | { .mfi; xma.hu f92=f33,f125,f91 } | ||
700 | { .mfi; xma.lu f91=f33,f125,f91 };; | ||
701 | { .mfi; xma.hu f102=f33,f126,f101 } | ||
702 | { .mfi; xma.lu f101=f33,f126,f101 };; | ||
703 | { .mfi; xma.hu f112=f33,f127,f111 } | ||
704 | { .mfi; xma.lu f111=f33,f127,f111 };;// | ||
705 | //-------------------------------------------------// | ||
706 | { .mfi; getf.sig r25=f41 | ||
707 | xma.hu f43=f34,f120,f42 } | ||
708 | { .mfi; xma.lu f42=f34,f120,f42 };; | ||
709 | { .mfi; getf.sig r16=f60 | ||
710 | xma.hu f53=f34,f121,f52 } | ||
711 | { .mfi; xma.lu f52=f34,f121,f52 };; | ||
712 | { .mfi; getf.sig r17=f51 | ||
713 | xma.hu f63=f34,f122,f62 | ||
714 | add r25=r25,r24 } | ||
715 | { .mfi; xma.lu f62=f34,f122,f62 | ||
716 | mov carry1=0 };; | ||
717 | { .mfi; cmp.ltu p6,p0=r25,r24 | ||
718 | xma.hu f73=f34,f123,f72 } | ||
719 | { .mfi; xma.lu f72=f34,f123,f72 };; | ||
720 | { .mfi; st8 [r33]=r25,16 | ||
721 | xma.hu f83=f34,f124,f82 | ||
722 | (p6) add carry1=1,carry1 } | ||
723 | { .mfi; xma.lu f82=f34,f124,f82 };; | ||
724 | { .mfi; xma.hu f93=f34,f125,f92 } | ||
725 | { .mfi; xma.lu f92=f34,f125,f92 };; | ||
726 | { .mfi; xma.hu f103=f34,f126,f102 } | ||
727 | { .mfi; xma.lu f102=f34,f126,f102 };; | ||
728 | { .mfi; xma.hu f113=f34,f127,f112 } | ||
729 | { .mfi; xma.lu f112=f34,f127,f112 };;// | ||
730 | //-------------------------------------------------// | ||
731 | { .mfi; getf.sig r18=f42 | ||
732 | xma.hu f44=f35,f120,f43 | ||
733 | add r17=r17,r16 } | ||
734 | { .mfi; xma.lu f43=f35,f120,f43 };; | ||
735 | { .mfi; getf.sig r24=f70 | ||
736 | xma.hu f54=f35,f121,f53 } | ||
737 | { .mfi; mov carry2=0 | ||
738 | xma.lu f53=f35,f121,f53 };; | ||
739 | { .mfi; getf.sig r25=f61 | ||
740 | xma.hu f64=f35,f122,f63 | ||
741 | cmp.ltu p7,p0=r17,r16 } | ||
742 | { .mfi; add r18=r18,r17 | ||
743 | xma.lu f63=f35,f122,f63 };; | ||
744 | { .mfi; getf.sig r26=f52 | ||
745 | xma.hu f74=f35,f123,f73 | ||
746 | (p7) add carry2=1,carry2 } | ||
747 | { .mfi; cmp.ltu p7,p0=r18,r17 | ||
748 | xma.lu f73=f35,f123,f73 | ||
749 | add r18=r18,carry1 };; | ||
750 | { .mfi; | ||
751 | xma.hu f84=f35,f124,f83 | ||
752 | (p7) add carry2=1,carry2 } | ||
753 | { .mfi; cmp.ltu p7,p0=r18,carry1 | ||
754 | xma.lu f83=f35,f124,f83 };; | ||
755 | { .mfi; st8 [r32]=r18,16 | ||
756 | xma.hu f94=f35,f125,f93 | ||
757 | (p7) add carry2=1,carry2 } | ||
758 | { .mfi; xma.lu f93=f35,f125,f93 };; | ||
759 | { .mfi; xma.hu f104=f35,f126,f103 } | ||
760 | { .mfi; xma.lu f103=f35,f126,f103 };; | ||
761 | { .mfi; xma.hu f114=f35,f127,f113 } | ||
762 | { .mfi; mov carry1=0 | ||
763 | xma.lu f113=f35,f127,f113 | ||
764 | add r25=r25,r24 };;// | ||
765 | //-------------------------------------------------// | ||
766 | { .mfi; getf.sig r27=f43 | ||
767 | xma.hu f45=f36,f120,f44 | ||
768 | cmp.ltu p6,p0=r25,r24 } | ||
769 | { .mfi; xma.lu f44=f36,f120,f44 | ||
770 | add r26=r26,r25 };; | ||
771 | { .mfi; getf.sig r16=f80 | ||
772 | xma.hu f55=f36,f121,f54 | ||
773 | (p6) add carry1=1,carry1 } | ||
774 | { .mfi; xma.lu f54=f36,f121,f54 };; | ||
775 | { .mfi; getf.sig r17=f71 | ||
776 | xma.hu f65=f36,f122,f64 | ||
777 | cmp.ltu p6,p0=r26,r25 } | ||
778 | { .mfi; xma.lu f64=f36,f122,f64 | ||
779 | add r27=r27,r26 };; | ||
780 | { .mfi; getf.sig r18=f62 | ||
781 | xma.hu f75=f36,f123,f74 | ||
782 | (p6) add carry1=1,carry1 } | ||
783 | { .mfi; cmp.ltu p6,p0=r27,r26 | ||
784 | xma.lu f74=f36,f123,f74 | ||
785 | add r27=r27,carry2 };; | ||
786 | { .mfi; getf.sig r19=f53 | ||
787 | xma.hu f85=f36,f124,f84 | ||
788 | (p6) add carry1=1,carry1 } | ||
789 | { .mfi; xma.lu f84=f36,f124,f84 | ||
790 | cmp.ltu p6,p0=r27,carry2 };; | ||
791 | { .mfi; st8 [r33]=r27,16 | ||
792 | xma.hu f95=f36,f125,f94 | ||
793 | (p6) add carry1=1,carry1 } | ||
794 | { .mfi; xma.lu f94=f36,f125,f94 };; | ||
795 | { .mfi; xma.hu f105=f36,f126,f104 } | ||
796 | { .mfi; mov carry2=0 | ||
797 | xma.lu f104=f36,f126,f104 | ||
798 | add r17=r17,r16 };; | ||
799 | { .mfi; xma.hu f115=f36,f127,f114 | ||
800 | cmp.ltu p7,p0=r17,r16 } | ||
801 | { .mfi; xma.lu f114=f36,f127,f114 | ||
802 | add r18=r18,r17 };;// | ||
803 | //-------------------------------------------------// | ||
804 | { .mfi; getf.sig r20=f44 | ||
805 | xma.hu f46=f37,f120,f45 | ||
806 | (p7) add carry2=1,carry2 } | ||
807 | { .mfi; cmp.ltu p7,p0=r18,r17 | ||
808 | xma.lu f45=f37,f120,f45 | ||
809 | add r19=r19,r18 };; | ||
810 | { .mfi; getf.sig r24=f90 | ||
811 | xma.hu f56=f37,f121,f55 } | ||
812 | { .mfi; xma.lu f55=f37,f121,f55 };; | ||
813 | { .mfi; getf.sig r25=f81 | ||
814 | xma.hu f66=f37,f122,f65 | ||
815 | (p7) add carry2=1,carry2 } | ||
816 | { .mfi; cmp.ltu p7,p0=r19,r18 | ||
817 | xma.lu f65=f37,f122,f65 | ||
818 | add r20=r20,r19 };; | ||
819 | { .mfi; getf.sig r26=f72 | ||
820 | xma.hu f76=f37,f123,f75 | ||
821 | (p7) add carry2=1,carry2 } | ||
822 | { .mfi; cmp.ltu p7,p0=r20,r19 | ||
823 | xma.lu f75=f37,f123,f75 | ||
824 | add r20=r20,carry1 };; | ||
825 | { .mfi; getf.sig r27=f63 | ||
826 | xma.hu f86=f37,f124,f85 | ||
827 | (p7) add carry2=1,carry2 } | ||
828 | { .mfi; xma.lu f85=f37,f124,f85 | ||
829 | cmp.ltu p7,p0=r20,carry1 };; | ||
830 | { .mfi; getf.sig r28=f54 | ||
831 | xma.hu f96=f37,f125,f95 | ||
832 | (p7) add carry2=1,carry2 } | ||
833 | { .mfi; st8 [r32]=r20,16 | ||
834 | xma.lu f95=f37,f125,f95 };; | ||
835 | { .mfi; xma.hu f106=f37,f126,f105 } | ||
836 | { .mfi; mov carry1=0 | ||
837 | xma.lu f105=f37,f126,f105 | ||
838 | add r25=r25,r24 };; | ||
839 | { .mfi; xma.hu f116=f37,f127,f115 | ||
840 | cmp.ltu p6,p0=r25,r24 } | ||
841 | { .mfi; xma.lu f115=f37,f127,f115 | ||
842 | add r26=r26,r25 };;// | ||
843 | //-------------------------------------------------// | ||
844 | { .mfi; getf.sig r29=f45 | ||
845 | xma.hu f47=f38,f120,f46 | ||
846 | (p6) add carry1=1,carry1 } | ||
847 | { .mfi; cmp.ltu p6,p0=r26,r25 | ||
848 | xma.lu f46=f38,f120,f46 | ||
849 | add r27=r27,r26 };; | ||
850 | { .mfi; getf.sig r16=f100 | ||
851 | xma.hu f57=f38,f121,f56 | ||
852 | (p6) add carry1=1,carry1 } | ||
853 | { .mfi; cmp.ltu p6,p0=r27,r26 | ||
854 | xma.lu f56=f38,f121,f56 | ||
855 | add r28=r28,r27 };; | ||
856 | { .mfi; getf.sig r17=f91 | ||
857 | xma.hu f67=f38,f122,f66 | ||
858 | (p6) add carry1=1,carry1 } | ||
859 | { .mfi; cmp.ltu p6,p0=r28,r27 | ||
860 | xma.lu f66=f38,f122,f66 | ||
861 | add r29=r29,r28 };; | ||
862 | { .mfi; getf.sig r18=f82 | ||
863 | xma.hu f77=f38,f123,f76 | ||
864 | (p6) add carry1=1,carry1 } | ||
865 | { .mfi; cmp.ltu p6,p0=r29,r28 | ||
866 | xma.lu f76=f38,f123,f76 | ||
867 | add r29=r29,carry2 };; | ||
868 | { .mfi; getf.sig r19=f73 | ||
869 | xma.hu f87=f38,f124,f86 | ||
870 | (p6) add carry1=1,carry1 } | ||
871 | { .mfi; xma.lu f86=f38,f124,f86 | ||
872 | cmp.ltu p6,p0=r29,carry2 };; | ||
873 | { .mfi; getf.sig r20=f64 | ||
874 | xma.hu f97=f38,f125,f96 | ||
875 | (p6) add carry1=1,carry1 } | ||
876 | { .mfi; st8 [r33]=r29,16 | ||
877 | xma.lu f96=f38,f125,f96 };; | ||
878 | { .mfi; getf.sig r21=f55 | ||
879 | xma.hu f107=f38,f126,f106 } | ||
880 | { .mfi; mov carry2=0 | ||
881 | xma.lu f106=f38,f126,f106 | ||
882 | add r17=r17,r16 };; | ||
883 | { .mfi; xma.hu f117=f38,f127,f116 | ||
884 | cmp.ltu p7,p0=r17,r16 } | ||
885 | { .mfi; xma.lu f116=f38,f127,f116 | ||
886 | add r18=r18,r17 };;// | ||
887 | //-------------------------------------------------// | ||
888 | { .mfi; getf.sig r22=f46 | ||
889 | xma.hu f48=f39,f120,f47 | ||
890 | (p7) add carry2=1,carry2 } | ||
891 | { .mfi; cmp.ltu p7,p0=r18,r17 | ||
892 | xma.lu f47=f39,f120,f47 | ||
893 | add r19=r19,r18 };; | ||
894 | { .mfi; getf.sig r24=f110 | ||
895 | xma.hu f58=f39,f121,f57 | ||
896 | (p7) add carry2=1,carry2 } | ||
897 | { .mfi; cmp.ltu p7,p0=r19,r18 | ||
898 | xma.lu f57=f39,f121,f57 | ||
899 | add r20=r20,r19 };; | ||
900 | { .mfi; getf.sig r25=f101 | ||
901 | xma.hu f68=f39,f122,f67 | ||
902 | (p7) add carry2=1,carry2 } | ||
903 | { .mfi; cmp.ltu p7,p0=r20,r19 | ||
904 | xma.lu f67=f39,f122,f67 | ||
905 | add r21=r21,r20 };; | ||
906 | { .mfi; getf.sig r26=f92 | ||
907 | xma.hu f78=f39,f123,f77 | ||
908 | (p7) add carry2=1,carry2 } | ||
909 | { .mfi; cmp.ltu p7,p0=r21,r20 | ||
910 | xma.lu f77=f39,f123,f77 | ||
911 | add r22=r22,r21 };; | ||
912 | { .mfi; getf.sig r27=f83 | ||
913 | xma.hu f88=f39,f124,f87 | ||
914 | (p7) add carry2=1,carry2 } | ||
915 | { .mfi; cmp.ltu p7,p0=r22,r21 | ||
916 | xma.lu f87=f39,f124,f87 | ||
917 | add r22=r22,carry1 };; | ||
918 | { .mfi; getf.sig r28=f74 | ||
919 | xma.hu f98=f39,f125,f97 | ||
920 | (p7) add carry2=1,carry2 } | ||
921 | { .mfi; xma.lu f97=f39,f125,f97 | ||
922 | cmp.ltu p7,p0=r22,carry1 };; | ||
923 | { .mfi; getf.sig r29=f65 | ||
924 | xma.hu f108=f39,f126,f107 | ||
925 | (p7) add carry2=1,carry2 } | ||
926 | { .mfi; st8 [r32]=r22,16 | ||
927 | xma.lu f107=f39,f126,f107 };; | ||
928 | { .mfi; getf.sig r30=f56 | ||
929 | xma.hu f118=f39,f127,f117 } | ||
930 | { .mfi; xma.lu f117=f39,f127,f117 };;// | ||
931 | //-------------------------------------------------// | ||
932 | // Leaving multiplier's heaven... Quite a ride, huh? | ||
933 | |||
934 | { .mii; getf.sig r31=f47 | ||
935 | add r25=r25,r24 | ||
936 | mov carry1=0 };; | ||
937 | { .mii; getf.sig r16=f111 | ||
938 | cmp.ltu p6,p0=r25,r24 | ||
939 | add r26=r26,r25 };; | ||
940 | { .mfb; getf.sig r17=f102 } | ||
941 | { .mii; | ||
942 | (p6) add carry1=1,carry1 | ||
943 | cmp.ltu p6,p0=r26,r25 | ||
944 | add r27=r27,r26 };; | ||
945 | { .mfb; nop.m 0x0 } | ||
946 | { .mii; | ||
947 | (p6) add carry1=1,carry1 | ||
948 | cmp.ltu p6,p0=r27,r26 | ||
949 | add r28=r28,r27 };; | ||
950 | { .mii; getf.sig r18=f93 | ||
951 | add r17=r17,r16 | ||
952 | mov carry3=0 } | ||
953 | { .mii; | ||
954 | (p6) add carry1=1,carry1 | ||
955 | cmp.ltu p6,p0=r28,r27 | ||
956 | add r29=r29,r28 };; | ||
957 | { .mii; getf.sig r19=f84 | ||
958 | cmp.ltu p7,p0=r17,r16 } | ||
959 | { .mii; | ||
960 | (p6) add carry1=1,carry1 | ||
961 | cmp.ltu p6,p0=r29,r28 | ||
962 | add r30=r30,r29 };; | ||
963 | { .mii; getf.sig r20=f75 | ||
964 | add r18=r18,r17 } | ||
965 | { .mii; | ||
966 | (p6) add carry1=1,carry1 | ||
967 | cmp.ltu p6,p0=r30,r29 | ||
968 | add r31=r31,r30 };; | ||
969 | { .mfb; getf.sig r21=f66 } | ||
970 | { .mii; (p7) add carry3=1,carry3 | ||
971 | cmp.ltu p7,p0=r18,r17 | ||
972 | add r19=r19,r18 } | ||
973 | { .mfb; nop.m 0x0 } | ||
974 | { .mii; | ||
975 | (p6) add carry1=1,carry1 | ||
976 | cmp.ltu p6,p0=r31,r30 | ||
977 | add r31=r31,carry2 };; | ||
978 | { .mfb; getf.sig r22=f57 } | ||
979 | { .mii; (p7) add carry3=1,carry3 | ||
980 | cmp.ltu p7,p0=r19,r18 | ||
981 | add r20=r20,r19 } | ||
982 | { .mfb; nop.m 0x0 } | ||
983 | { .mii; | ||
984 | (p6) add carry1=1,carry1 | ||
985 | cmp.ltu p6,p0=r31,carry2 };; | ||
986 | { .mfb; getf.sig r23=f48 } | ||
987 | { .mii; (p7) add carry3=1,carry3 | ||
988 | cmp.ltu p7,p0=r20,r19 | ||
989 | add r21=r21,r20 } | ||
990 | { .mii; | ||
991 | (p6) add carry1=1,carry1 } | ||
992 | { .mfb; st8 [r33]=r31,16 };; | ||
993 | |||
994 | { .mfb; getf.sig r24=f112 } | ||
995 | { .mii; (p7) add carry3=1,carry3 | ||
996 | cmp.ltu p7,p0=r21,r20 | ||
997 | add r22=r22,r21 };; | ||
998 | { .mfb; getf.sig r25=f103 } | ||
999 | { .mii; (p7) add carry3=1,carry3 | ||
1000 | cmp.ltu p7,p0=r22,r21 | ||
1001 | add r23=r23,r22 };; | ||
1002 | { .mfb; getf.sig r26=f94 } | ||
1003 | { .mii; (p7) add carry3=1,carry3 | ||
1004 | cmp.ltu p7,p0=r23,r22 | ||
1005 | add r23=r23,carry1 };; | ||
1006 | { .mfb; getf.sig r27=f85 } | ||
1007 | { .mii; (p7) add carry3=1,carry3 | ||
1008 | cmp.ltu p7,p8=r23,carry1};; | ||
1009 | { .mii; getf.sig r28=f76 | ||
1010 | add r25=r25,r24 | ||
1011 | mov carry1=0 } | ||
1012 | { .mii; st8 [r32]=r23,16 | ||
1013 | (p7) add carry2=1,carry3 | ||
1014 | (p8) add carry2=0,carry3 };; | ||
1015 | |||
1016 | { .mfb; nop.m 0x0 } | ||
1017 | { .mii; getf.sig r29=f67 | ||
1018 | cmp.ltu p6,p0=r25,r24 | ||
1019 | add r26=r26,r25 };; | ||
1020 | { .mfb; getf.sig r30=f58 } | ||
1021 | { .mii; | ||
1022 | (p6) add carry1=1,carry1 | ||
1023 | cmp.ltu p6,p0=r26,r25 | ||
1024 | add r27=r27,r26 };; | ||
1025 | { .mfb; getf.sig r16=f113 } | ||
1026 | { .mii; | ||
1027 | (p6) add carry1=1,carry1 | ||
1028 | cmp.ltu p6,p0=r27,r26 | ||
1029 | add r28=r28,r27 };; | ||
1030 | { .mfb; getf.sig r17=f104 } | ||
1031 | { .mii; | ||
1032 | (p6) add carry1=1,carry1 | ||
1033 | cmp.ltu p6,p0=r28,r27 | ||
1034 | add r29=r29,r28 };; | ||
1035 | { .mfb; getf.sig r18=f95 } | ||
1036 | { .mii; | ||
1037 | (p6) add carry1=1,carry1 | ||
1038 | cmp.ltu p6,p0=r29,r28 | ||
1039 | add r30=r30,r29 };; | ||
1040 | { .mii; getf.sig r19=f86 | ||
1041 | add r17=r17,r16 | ||
1042 | mov carry3=0 } | ||
1043 | { .mii; | ||
1044 | (p6) add carry1=1,carry1 | ||
1045 | cmp.ltu p6,p0=r30,r29 | ||
1046 | add r30=r30,carry2 };; | ||
1047 | { .mii; getf.sig r20=f77 | ||
1048 | cmp.ltu p7,p0=r17,r16 | ||
1049 | add r18=r18,r17 } | ||
1050 | { .mii; | ||
1051 | (p6) add carry1=1,carry1 | ||
1052 | cmp.ltu p6,p0=r30,carry2 };; | ||
1053 | { .mfb; getf.sig r21=f68 } | ||
1054 | { .mii; st8 [r33]=r30,16 | ||
1055 | (p6) add carry1=1,carry1 };; | ||
1056 | |||
1057 | { .mfb; getf.sig r24=f114 } | ||
1058 | { .mii; (p7) add carry3=1,carry3 | ||
1059 | cmp.ltu p7,p0=r18,r17 | ||
1060 | add r19=r19,r18 };; | ||
1061 | { .mfb; getf.sig r25=f105 } | ||
1062 | { .mii; (p7) add carry3=1,carry3 | ||
1063 | cmp.ltu p7,p0=r19,r18 | ||
1064 | add r20=r20,r19 };; | ||
1065 | { .mfb; getf.sig r26=f96 } | ||
1066 | { .mii; (p7) add carry3=1,carry3 | ||
1067 | cmp.ltu p7,p0=r20,r19 | ||
1068 | add r21=r21,r20 };; | ||
1069 | { .mfb; getf.sig r27=f87 } | ||
1070 | { .mii; (p7) add carry3=1,carry3 | ||
1071 | cmp.ltu p7,p0=r21,r20 | ||
1072 | add r21=r21,carry1 };; | ||
1073 | { .mib; getf.sig r28=f78 | ||
1074 | add r25=r25,r24 } | ||
1075 | { .mib; (p7) add carry3=1,carry3 | ||
1076 | cmp.ltu p7,p8=r21,carry1};; | ||
1077 | { .mii; st8 [r32]=r21,16 | ||
1078 | (p7) add carry2=1,carry3 | ||
1079 | (p8) add carry2=0,carry3 } | ||
1080 | |||
1081 | { .mii; mov carry1=0 | ||
1082 | cmp.ltu p6,p0=r25,r24 | ||
1083 | add r26=r26,r25 };; | ||
1084 | { .mfb; getf.sig r16=f115 } | ||
1085 | { .mii; | ||
1086 | (p6) add carry1=1,carry1 | ||
1087 | cmp.ltu p6,p0=r26,r25 | ||
1088 | add r27=r27,r26 };; | ||
1089 | { .mfb; getf.sig r17=f106 } | ||
1090 | { .mii; | ||
1091 | (p6) add carry1=1,carry1 | ||
1092 | cmp.ltu p6,p0=r27,r26 | ||
1093 | add r28=r28,r27 };; | ||
1094 | { .mfb; getf.sig r18=f97 } | ||
1095 | { .mii; | ||
1096 | (p6) add carry1=1,carry1 | ||
1097 | cmp.ltu p6,p0=r28,r27 | ||
1098 | add r28=r28,carry2 };; | ||
1099 | { .mib; getf.sig r19=f88 | ||
1100 | add r17=r17,r16 } | ||
1101 | { .mib; | ||
1102 | (p6) add carry1=1,carry1 | ||
1103 | cmp.ltu p6,p0=r28,carry2 };; | ||
1104 | { .mii; st8 [r33]=r28,16 | ||
1105 | (p6) add carry1=1,carry1 } | ||
1106 | |||
1107 | { .mii; mov carry2=0 | ||
1108 | cmp.ltu p7,p0=r17,r16 | ||
1109 | add r18=r18,r17 };; | ||
1110 | { .mfb; getf.sig r24=f116 } | ||
1111 | { .mii; (p7) add carry2=1,carry2 | ||
1112 | cmp.ltu p7,p0=r18,r17 | ||
1113 | add r19=r19,r18 };; | ||
1114 | { .mfb; getf.sig r25=f107 } | ||
1115 | { .mii; (p7) add carry2=1,carry2 | ||
1116 | cmp.ltu p7,p0=r19,r18 | ||
1117 | add r19=r19,carry1 };; | ||
1118 | { .mfb; getf.sig r26=f98 } | ||
1119 | { .mii; (p7) add carry2=1,carry2 | ||
1120 | cmp.ltu p7,p0=r19,carry1};; | ||
1121 | { .mii; st8 [r32]=r19,16 | ||
1122 | (p7) add carry2=1,carry2 } | ||
1123 | |||
1124 | { .mfb; add r25=r25,r24 };; | ||
1125 | |||
1126 | { .mfb; getf.sig r16=f117 } | ||
1127 | { .mii; mov carry1=0 | ||
1128 | cmp.ltu p6,p0=r25,r24 | ||
1129 | add r26=r26,r25 };; | ||
1130 | { .mfb; getf.sig r17=f108 } | ||
1131 | { .mii; | ||
1132 | (p6) add carry1=1,carry1 | ||
1133 | cmp.ltu p6,p0=r26,r25 | ||
1134 | add r26=r26,carry2 };; | ||
1135 | { .mfb; nop.m 0x0 } | ||
1136 | { .mii; | ||
1137 | (p6) add carry1=1,carry1 | ||
1138 | cmp.ltu p6,p0=r26,carry2 };; | ||
1139 | { .mii; st8 [r33]=r26,16 | ||
1140 | (p6) add carry1=1,carry1 } | ||
1141 | |||
1142 | { .mfb; add r17=r17,r16 };; | ||
1143 | { .mfb; getf.sig r24=f118 } | ||
1144 | { .mii; mov carry2=0 | ||
1145 | cmp.ltu p7,p0=r17,r16 | ||
1146 | add r17=r17,carry1 };; | ||
1147 | { .mii; (p7) add carry2=1,carry2 | ||
1148 | cmp.ltu p7,p0=r17,carry1};; | ||
1149 | { .mii; st8 [r32]=r17 | ||
1150 | (p7) add carry2=1,carry2 };; | ||
1151 | { .mfb; add r24=r24,carry2 };; | ||
1152 | { .mib; st8 [r33]=r24 } | ||
1153 | |||
1154 | { .mib; rum 1<<5 // clear um.mfh | ||
1155 | br.ret.sptk.many b0 };; | ||
1156 | .endp bn_mul_comba8# | ||
1157 | #undef carry3 | ||
1158 | #undef carry2 | ||
1159 | #undef carry1 | ||
1160 | #endif | ||
1161 | |||
1162 | #if 1 | ||
1163 | // It's possible to make it faster (see comment to bn_sqr_comba8), but | ||
1164 | // I reckon it isn't worth the effort, basically because the routine | ||
1165 | // (actually both of them) is practically never called... So I just play | ||
1166 | // the same trick as with bn_sqr_comba8: alias b to a (r34=r33), set up the | ||
1167 | // operand pointers and branch to .L_cheat_entry_point4 in bn_mul_comba4. | ||
1168 | // void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) | ||
1169 | // | ||
1170 | .global bn_sqr_comba4# | ||
1171 | .proc bn_sqr_comba4# | ||
1172 | .align 64 | ||
1173 | bn_sqr_comba4: | ||
1174 | .prologue | ||
1175 | .save ar.pfs,r2 | ||
1176 | #if defined(_HPUX_SOURCE) && !defined(_LP64) | ||
1177 | { .mii; alloc r2=ar.pfs,2,1,0,0 | ||
1178 | addp4 r32=0,r32 | ||
1179 | addp4 r33=0,r33 };; | ||
1180 | { .mii; | ||
1181 | #else | ||
1182 | { .mii; alloc r2=ar.pfs,2,1,0,0 | ||
1183 | #endif | ||
1184 | mov r34=r33 | ||
1185 | add r14=8,r33 };; | ||
1186 | .body | ||
1187 | { .mii; add r17=8,r34 | ||
1188 | add r15=16,r33 | ||
1189 | add r18=16,r34 } | ||
1190 | { .mfb; add r16=24,r33 | ||
1191 | br .L_cheat_entry_point4 };; | ||
1192 | .endp bn_sqr_comba4# | ||
1193 | #endif | ||
1194 | |||
1195 | #if 1 | ||
1196 | // Runs in ~115 cycles and ~4.5 times faster than C. Well, whatever... | ||
1197 | // In: r32 = r, r33 = a, r34 = b. f32-f35 hold a[0..3], f120-f123 hold b[0..3]. | ||
1198 | // void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | ||
1199 | // carry1/carry2 reuse r14/r15 once a[1]/a[2] are loaded; r[] is stored alternately via r32 and r33 (= r+8). | ||
1200 | #define carry1 r14 | ||
1201 | #define carry2 r15 | ||
1202 | .global bn_mul_comba4# | ||
1203 | .proc bn_mul_comba4# | ||
1204 | .align 64 | ||
1205 | bn_mul_comba4: | ||
1206 | .prologue | ||
1207 | .save ar.pfs,r2 | ||
1208 | #if defined(_HPUX_SOURCE) && !defined(_LP64) | ||
1209 | { .mii; alloc r2=ar.pfs,3,0,0,0 | ||
1210 | addp4 r33=0,r33 | ||
1211 | addp4 r34=0,r34 };; | ||
1212 | { .mii; addp4 r32=0,r32 | ||
1213 | #else | ||
1214 | { .mii; alloc r2=ar.pfs,3,0,0,0 | ||
1215 | #endif | ||
1216 | add r14=8,r33 | ||
1217 | add r17=8,r34 } | ||
1218 | .body | ||
1219 | { .mii; add r15=16,r33 | ||
1220 | add r18=16,r34 | ||
1221 | add r16=24,r33 };; | ||
1222 | .L_cheat_entry_point4: | ||
1223 | { .mmi; add r19=24,r34 | ||
1224 | |||
1225 | ldf8 f32=[r33] } | ||
1226 | |||
1227 | { .mmi; ldf8 f120=[r34] | ||
1228 | ldf8 f121=[r17] };; | ||
1229 | { .mmi; ldf8 f122=[r18] | ||
1230 | ldf8 f123=[r19] } | ||
1231 | |||
1232 | { .mmi; ldf8 f33=[r14] | ||
1233 | ldf8 f34=[r15] } | ||
1234 | { .mfi; ldf8 f35=[r16] | ||
1235 | |||
1236 | xma.hu f41=f32,f120,f0 } | ||
1237 | { .mfi; xma.lu f40=f32,f120,f0 };; | ||
1238 | { .mfi; xma.hu f51=f32,f121,f0 } | ||
1239 | { .mfi; xma.lu f50=f32,f121,f0 };; | ||
1240 | { .mfi; xma.hu f61=f32,f122,f0 } | ||
1241 | { .mfi; xma.lu f60=f32,f122,f0 };; | ||
1242 | { .mfi; xma.hu f71=f32,f123,f0 } | ||
1243 | { .mfi; xma.lu f70=f32,f123,f0 };;// a[0]*b[0..3]: lows in f40/f50/f60/f70, highs in f41/f51/f61/f71 | ||
1244 | // Major stall takes place here, and in 3 more places below. Result from | ||
1245 | // the first xma is not available for another 3 ticks. | ||
1246 | { .mfi; getf.sig r16=f40 | ||
1247 | xma.hu f42=f33,f120,f41 | ||
1248 | add r33=8,r32 } | ||
1249 | { .mfi; xma.lu f41=f33,f120,f41 };; | ||
1250 | { .mfi; getf.sig r24=f50 | ||
1251 | xma.hu f52=f33,f121,f51 } | ||
1252 | { .mfi; xma.lu f51=f33,f121,f51 };; | ||
1253 | { .mfi; st8 [r32]=r16,16 | ||
1254 | xma.hu f62=f33,f122,f61 } | ||
1255 | { .mfi; xma.lu f61=f33,f122,f61 };; | ||
1256 | { .mfi; xma.hu f72=f33,f123,f71 } | ||
1257 | { .mfi; xma.lu f71=f33,f123,f71 };;// a[1]*b[0..3] + high words of the previous row | ||
1258 | //-------------------------------------------------// | ||
1259 | { .mfi; getf.sig r25=f41 | ||
1260 | xma.hu f43=f34,f120,f42 } | ||
1261 | { .mfi; xma.lu f42=f34,f120,f42 };; | ||
1262 | { .mfi; getf.sig r16=f60 | ||
1263 | xma.hu f53=f34,f121,f52 } | ||
1264 | { .mfi; xma.lu f52=f34,f121,f52 };; | ||
1265 | { .mfi; getf.sig r17=f51 | ||
1266 | xma.hu f63=f34,f122,f62 | ||
1267 | add r25=r25,r24 } | ||
1268 | { .mfi; mov carry1=0 | ||
1269 | xma.lu f62=f34,f122,f62 };; | ||
1270 | { .mfi; st8 [r33]=r25,16 | ||
1271 | xma.hu f73=f34,f123,f72 | ||
1272 | cmp.ltu p6,p0=r25,r24 } | ||
1273 | { .mfi; xma.lu f72=f34,f123,f72 };;// a[2]*b[0..3] + high words of the previous row | ||
1274 | //-------------------------------------------------// | ||
1275 | { .mfi; getf.sig r18=f42 | ||
1276 | xma.hu f44=f35,f120,f43 | ||
1277 | (p6) add carry1=1,carry1 } | ||
1278 | { .mfi; add r17=r17,r16 | ||
1279 | xma.lu f43=f35,f120,f43 | ||
1280 | mov carry2=0 };; | ||
1281 | { .mfi; getf.sig r24=f70 | ||
1282 | xma.hu f54=f35,f121,f53 | ||
1283 | cmp.ltu p7,p0=r17,r16 } | ||
1284 | { .mfi; xma.lu f53=f35,f121,f53 };; | ||
1285 | { .mfi; getf.sig r25=f61 | ||
1286 | xma.hu f64=f35,f122,f63 | ||
1287 | add r18=r18,r17 } | ||
1288 | { .mfi; xma.lu f63=f35,f122,f63 | ||
1289 | (p7) add carry2=1,carry2 };; | ||
1290 | { .mfi; getf.sig r26=f52 | ||
1291 | xma.hu f74=f35,f123,f73 | ||
1292 | cmp.ltu p7,p0=r18,r17 } | ||
1293 | { .mfi; xma.lu f73=f35,f123,f73 | ||
1294 | add r18=r18,carry1 };; | ||
1295 | //-------------------------------------------------// | ||
1296 | { .mii; st8 [r32]=r18,16 | ||
1297 | (p7) add carry2=1,carry2 | ||
1298 | cmp.ltu p7,p0=r18,carry1 };; | ||
1299 | |||
1300 | { .mfi; getf.sig r27=f43 // last major stall | ||
1301 | (p7) add carry2=1,carry2 };; | ||
1302 | { .mii; getf.sig r16=f71 | ||
1303 | add r25=r25,r24 | ||
1304 | mov carry1=0 };; | ||
1305 | { .mii; getf.sig r17=f62 | ||
1306 | cmp.ltu p6,p0=r25,r24 | ||
1307 | add r26=r26,r25 };; | ||
1308 | { .mii; | ||
1309 | (p6) add carry1=1,carry1 | ||
1310 | cmp.ltu p6,p0=r26,r25 | ||
1311 | add r27=r27,r26 };; | ||
1312 | { .mii; | ||
1313 | (p6) add carry1=1,carry1 | ||
1314 | cmp.ltu p6,p0=r27,r26 | ||
1315 | add r27=r27,carry2 };; | ||
1316 | { .mii; getf.sig r18=f53 | ||
1317 | (p6) add carry1=1,carry1 | ||
1318 | cmp.ltu p6,p0=r27,carry2 };; | ||
1319 | { .mfi; st8 [r33]=r27,16 | ||
1320 | (p6) add carry1=1,carry1 } | ||
1321 | |||
1322 | { .mii; getf.sig r19=f44 | ||
1323 | add r17=r17,r16 | ||
1324 | mov carry2=0 };; | ||
1325 | { .mii; getf.sig r24=f72 | ||
1326 | cmp.ltu p7,p0=r17,r16 | ||
1327 | add r18=r18,r17 };; | ||
1328 | { .mii; (p7) add carry2=1,carry2 | ||
1329 | cmp.ltu p7,p0=r18,r17 | ||
1330 | add r19=r19,r18 };; | ||
1331 | { .mii; (p7) add carry2=1,carry2 | ||
1332 | cmp.ltu p7,p0=r19,r18 | ||
1333 | add r19=r19,carry1 };; | ||
1334 | { .mii; getf.sig r25=f63 | ||
1335 | (p7) add carry2=1,carry2 | ||
1336 | cmp.ltu p7,p0=r19,carry1};; | ||
1337 | { .mii; st8 [r32]=r19,16 | ||
1338 | (p7) add carry2=1,carry2 } | ||
1339 | |||
1340 | { .mii; getf.sig r26=f54 | ||
1341 | add r25=r25,r24 | ||
1342 | mov carry1=0 };; | ||
1343 | { .mii; getf.sig r16=f73 | ||
1344 | cmp.ltu p6,p0=r25,r24 | ||
1345 | add r26=r26,r25 };; | ||
1346 | { .mii; | ||
1347 | (p6) add carry1=1,carry1 | ||
1348 | cmp.ltu p6,p0=r26,r25 | ||
1349 | add r26=r26,carry2 };; | ||
1350 | { .mii; getf.sig r17=f64 | ||
1351 | (p6) add carry1=1,carry1 | ||
1352 | cmp.ltu p6,p0=r26,carry2 };; | ||
1353 | { .mii; st8 [r33]=r26,16 | ||
1354 | (p6) add carry1=1,carry1 } | ||
1355 | |||
1356 | { .mii; getf.sig r24=f74 | ||
1357 | add r17=r17,r16 | ||
1358 | mov carry2=0 };; | ||
1359 | { .mii; cmp.ltu p7,p0=r17,r16 | ||
1360 | add r17=r17,carry1 };; | ||
1361 | |||
1362 | { .mii; (p7) add carry2=1,carry2 | ||
1363 | cmp.ltu p7,p0=r17,carry1};; | ||
1364 | { .mii; st8 [r32]=r17,16 | ||
1365 | (p7) add carry2=1,carry2 };; | ||
1366 | |||
1367 | { .mii; add r24=r24,carry2 };; | ||
1368 | { .mii; st8 [r33]=r24 } | ||
1369 | |||
1370 | { .mib; rum 1<<5 // clear um.mfh | ||
1371 | br.ret.sptk.many b0 };; | ||
1372 | .endp bn_mul_comba4# | ||
1373 | #undef carry2 | ||
1374 | #undef carry1 | ||
1375 | #endif | ||
1376 | |||
1377 | #if 1 | ||
1378 | // Divides the double word h:l by d; the quotient is left in r8 (r9 gets the remainder). | ||
1379 | // BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) | ||
1380 | // Returns -1 when d is 0; calls abort() if normalizing d would shift bits out of h. | ||
1381 | // In a nutshell it's a port of my MIPS III/IV implementation. | ||
1382 | // | ||
1383 | #define AT r14 | ||
1384 | #define H r16 | ||
1385 | #define HH r20 | ||
1386 | #define L r17 | ||
1387 | #define D r18 | ||
1388 | #define DH r22 | ||
1389 | #define I r21 | ||
1390 | |||
1391 | #if 0 | ||
1392 | // Some preprocessors (most notably HP-UX) appear to be allergic to | ||
1393 | // macros enclosed in parentheses [as these three were]. | ||
1394 | #define cont p16 | ||
1395 | #define break p0 // p20 | ||
1396 | #define equ p24 | ||
1397 | #else | ||
1398 | cont=p16 | ||
1399 | break=p0 | ||
1400 | equ=p24 | ||
1401 | #endif | ||
1402 | |||
1403 | .global abort# | ||
1404 | .global bn_div_words# | ||
1405 | .proc bn_div_words# | ||
1406 | .align 64 | ||
1407 | bn_div_words: | ||
1408 | .prologue | ||
1409 | .save ar.pfs,r2 | ||
1410 | { .mii; alloc r2=ar.pfs,3,5,0,8 | ||
1411 | .save b0,r3 | ||
1412 | mov r3=b0 | ||
1413 | .save pr,r10 | ||
1414 | mov r10=pr };; | ||
1415 | { .mmb; cmp.eq p6,p0=r34,r0 | ||
1416 | mov r8=-1 | ||
1417 | (p6) br.ret.spnt.many b0 };; | ||
1418 | |||
1419 | .body | ||
1420 | { .mii; mov H=r32 // save h | ||
1421 | mov ar.ec=0 // don't rotate at exit | ||
1422 | mov pr.rot=0 } | ||
1423 | { .mii; mov L=r33 // save l | ||
1424 | mov r36=r0 };; | ||
1425 | |||
1426 | .L_divw_shift: // -vv- note signed comparison | ||
1427 | { .mfi; (p0) cmp.lt p16,p0=r0,r34 // d; loop while d > 0 as signed, i.e. until bit 63 is set (rotation renames r33->r34, r35->r36) | ||
1428 | (p0) shladd r33=r34,1,r0 } | ||
1429 | { .mfb; (p0) add r35=1,r36 | ||
1430 | (p0) nop.f 0x0 | ||
1431 | (p16) br.wtop.dpnt .L_divw_shift };; | ||
1432 | |||
1433 | { .mii; mov D=r34 | ||
1434 | shr.u DH=r34,32 | ||
1435 | sub r35=64,r36 };; | ||
1436 | { .mii; setf.sig f7=DH | ||
1437 | shr.u AT=H,r35 | ||
1438 | mov I=r36 };; | ||
1439 | { .mib; cmp.ne p6,p0=r0,AT | ||
1440 | shl H=H,r36 | ||
1441 | (p6) br.call.spnt.clr b0=abort };; // overflow, die... | ||
1442 | |||
1443 | { .mfi; fcvt.xuf.s1 f7=f7 | ||
1444 | shr.u AT=L,r35 };; | ||
1445 | { .mii; shl L=L,r36 | ||
1446 | or H=H,AT };; | ||
1447 | |||
1448 | { .mii; nop.m 0x0 | ||
1449 | cmp.leu p6,p0=D,H;; | ||
1450 | (p6) sub H=H,D } | ||
1451 | |||
1452 | { .mlx; setf.sig f14=D | ||
1453 | movl AT=0xffffffff };; | ||
1454 | /////////////////////////////////////////////////////////// | ||
1455 | { .mii; setf.sig f6=H | ||
1456 | shr.u HH=H,32;; | ||
1457 | cmp.eq p6,p7=HH,DH };; | ||
1458 | { .mfb; | ||
1459 | (p6) setf.sig f8=AT | ||
1460 | (p7) fcvt.xuf.s1 f6=f6 | ||
1461 | (p7) br.call.sptk b6=.L_udiv64_32_b6 };; | ||
1462 | |||
1463 | { .mfi; getf.sig r33=f8 // q | ||
1464 | xmpy.lu f9=f8,f14 } | ||
1465 | { .mfi; xmpy.hu f10=f8,f14 | ||
1466 | shrp H=H,L,32 };; | ||
1467 | |||
1468 | { .mmi; getf.sig r35=f9 // tl | ||
1469 | getf.sig r31=f10 };; // th | ||
1470 | |||
1471 | .L_divw_1st_iter: | ||
1472 | { .mii; (p0) add r32=-1,r33 | ||
1473 | (p0) cmp.eq equ,cont=HH,r31 };; | ||
1474 | { .mii; (p0) cmp.ltu p8,p0=r35,D | ||
1475 | (p0) sub r34=r35,D | ||
1476 | (equ) cmp.leu break,cont=r35,H };; | ||
1477 | { .mib; (cont) cmp.leu cont,break=HH,r31 | ||
1478 | (p8) add r31=-1,r31 | ||
1479 | (cont) br.wtop.spnt .L_divw_1st_iter };; | ||
1480 | /////////////////////////////////////////////////////////// | ||
1481 | { .mii; sub H=H,r35 | ||
1482 | shl r8=r33,32 | ||
1483 | shl L=L,32 };; | ||
1484 | /////////////////////////////////////////////////////////// | ||
1485 | { .mii; setf.sig f6=H | ||
1486 | shr.u HH=H,32;; | ||
1487 | cmp.eq p6,p7=HH,DH };; | ||
1488 | { .mfb; | ||
1489 | (p6) setf.sig f8=AT | ||
1490 | (p7) fcvt.xuf.s1 f6=f6 | ||
1491 | (p7) br.call.sptk b6=.L_udiv64_32_b6 };; | ||
1492 | |||
1493 | { .mfi; getf.sig r33=f8 // q | ||
1494 | xmpy.lu f9=f8,f14 } | ||
1495 | { .mfi; xmpy.hu f10=f8,f14 | ||
1496 | shrp H=H,L,32 };; | ||
1497 | |||
1498 | { .mmi; getf.sig r35=f9 // tl | ||
1499 | getf.sig r31=f10 };; // th | ||
1500 | |||
1501 | .L_divw_2nd_iter: | ||
1502 | { .mii; (p0) add r32=-1,r33 | ||
1503 | (p0) cmp.eq equ,cont=HH,r31 };; | ||
1504 | { .mii; (p0) cmp.ltu p8,p0=r35,D | ||
1505 | (p0) sub r34=r35,D | ||
1506 | (equ) cmp.leu break,cont=r35,H };; | ||
1507 | { .mib; (cont) cmp.leu cont,break=HH,r31 | ||
1508 | (p8) add r31=-1,r31 | ||
1509 | (cont) br.wtop.spnt .L_divw_2nd_iter };; | ||
1510 | /////////////////////////////////////////////////////////// | ||
1511 | { .mii; sub H=H,r35 | ||
1512 | or r8=r8,r33 | ||
1513 | mov ar.pfs=r2 };; | ||
1514 | { .mii; shr.u r9=H,I // remainder if anybody wants it | ||
1515 | mov pr=r10,0x1ffff } | ||
1516 | { .mfb; br.ret.sptk.many b0 };; | ||
1517 | |||
1518 | // Unsigned 64 by 32 (well, by 64 for the moment) bit integer division | ||
1519 | // procedure. | ||
1520 | // | ||
1521 | // inputs: f6 = (double)a, f7 = (double)b | ||
1522 | // output: f8 = (int)(a/b) | ||
1523 | // clobbered: f8,f9,f10,f11,pred | ||
1524 | pred=p15 | ||
1525 | // One can argue that this snippet is copyrighted to Intel | ||
1526 | // Corporation, as it's essentially identical to one of those | ||
1527 | // found in "Divide, Square Root and Remainder" section at | ||
1528 | // http://www.intel.com/software/products/opensource/libraries/num.htm. | ||
1529 | // Yes, I admit that the referred code was used as template, | ||
1530 | // but after I realized that there hardly is any other instruction | ||
1531 | // sequence which would perform this operation. I mean I figure that | ||
1532 | // any independent attempt to implement high-performance division | ||
1533 | // will result in code virtually identical to the Intel code. It | ||
1534 | // should be noted though that the division kernel below is 1 cycle | ||
1535 | // faster than the Intel one (note commented splits:-), not to mention | ||
1536 | // original prologue (rather lack of one) and epilogue. | ||
1537 | .align 32 | ||
1538 | .skip 16 | ||
1539 | .L_udiv64_32_b6: | ||
1540 | frcpa.s1 f8,pred=f6,f7;; // [0] y0 = 1 / b | ||
1541 | |||
1542 | (pred) fnma.s1 f9=f7,f8,f1 // [5] e0 = 1 - b * y0 | ||
1543 | (pred) fmpy.s1 f10=f6,f8;; // [5] q0 = a * y0 | ||
1544 | (pred) fmpy.s1 f11=f9,f9 // [10] e1 = e0 * e0 | ||
1545 | (pred) fma.s1 f10=f9,f10,f10;; // [10] q1 = q0 + e0 * q0 | ||
1546 | (pred) fma.s1 f8=f9,f8,f8 //;; // [15] y1 = y0 + e0 * y0 | ||
1547 | (pred) fma.s1 f9=f11,f10,f10;; // [15] q2 = q1 + e1 * q1 | ||
1548 | (pred) fma.s1 f8=f11,f8,f8 //;; // [20] y2 = y1 + e1 * y1 | ||
1549 | (pred) fnma.s1 f10=f7,f9,f6;; // [20] r2 = a - b * q2 | ||
1550 | (pred) fma.s1 f8=f10,f8,f9;; // [25] q3 = q2 + r2 * y2 | ||
1551 | |||
1552 | fcvt.fxu.trunc.s1 f8=f8 // [30] q = trunc(q3) | ||
1553 | br.ret.sptk.many b6;; | ||
1554 | .endp bn_div_words# | ||
1555 | #endif | ||
diff --git a/src/lib/libcrypto/ia64cpuid.S b/src/lib/libcrypto/ia64cpuid.S deleted file mode 100644 index 39e8093c6c..0000000000 --- a/src/lib/libcrypto/ia64cpuid.S +++ /dev/null | |||
@@ -1,121 +0,0 @@ | |||
1 | // Works on all IA-64 platforms: Linux, HP-UX, Win64i... | ||
2 | // On Win64i compile with ias.exe. | ||
3 | .text | ||
4 | |||
5 | .global OPENSSL_cpuid_setup# | ||
6 | .proc OPENSSL_cpuid_setup# | ||
7 | OPENSSL_cpuid_setup: | ||
8 | { .mib; br.ret.sptk.many b0 };; // empty stub: returns immediately without touching any state | ||
9 | .endp OPENSSL_cpuid_setup# | ||
10 | |||
11 | .global OPENSSL_atomic_add# | ||
12 | .proc OPENSSL_atomic_add# | ||
13 | .align 32 | ||
14 | OPENSSL_atomic_add: // r32 = address of a 32-bit value, r33 = amount to add; the sum is left in r8 | ||
15 | { .mii; ld4 r2=[r32] // r2 = current 32-bit value | ||
16 | nop.i 0 | ||
17 | nop.i 0 };; | ||
18 | .Lspin: // compare-and-exchange retry loop | ||
19 | { .mii; mov ar.ccv=r2 // value cmpxchg4 must find in memory | ||
20 | add r8=r2,r33 // r8 = candidate new value | ||
21 | mov r3=r2 };; | ||
22 | { .mmi; mf;; | ||
23 | cmpxchg4.acq r2=[r32],r8,ar.ccv // r2 = value actually found at [r32] | ||
24 | nop.i 0 };; | ||
25 | { .mib; cmp.ne p6,p0=r2,r3 // did memory differ from what we expected? | ||
26 | nop.i 0 | ||
27 | (p6) br.dpnt .Lspin };; // yes: retry using the value just read | ||
28 | { .mib; nop.m 0 | ||
29 | sxt4 r8=r8 // sign-extend the 32-bit sum | ||
30 | br.ret.sptk.many b0 };; | ||
31 | .endp OPENSSL_atomic_add# | ||
32 | |||
33 | // Returns a structure comprising a pointer to the top of the caller's | ||
34 | // stack (r8 = sp) and a pointer beyond the backing storage for the | ||
35 | // current register frame (r9). The latter is required, because it might | ||
36 | // be insufficient to wipe backing storage for the current frame | ||
37 | // (as this procedure does), one might have to go further, toward | ||
38 | // higher addresses to reach for whole "retroactively" saved | ||
39 | // context... | ||
40 | .global OPENSSL_wipe_cpu# | ||
41 | .proc OPENSSL_wipe_cpu# | ||
42 | .align 32 | ||
43 | OPENSSL_wipe_cpu: | ||
44 | .prologue | ||
45 | .fframe 0 | ||
46 | .save ar.pfs,r2 | ||
47 | .save ar.lc,r3 | ||
48 | { .mib; alloc r2=ar.pfs,0,96,0,96 | ||
49 | mov r3=ar.lc | ||
50 | brp.loop.imp .L_wipe_top,.L_wipe_end-16 | ||
51 | };; | ||
52 | { .mii; mov r9=ar.bsp | ||
53 | mov r8=pr | ||
54 | mov ar.lc=96 };; | ||
55 | .body | ||
56 | { .mii; add r9=96*8-8,r9 | ||
57 | mov ar.ec=1 };; | ||
58 | |||
59 | // One can sweep twice as fast, but then we can't guarantee | ||
60 | // that backing storage is wiped... | ||
61 | .L_wipe_top: | ||
62 | { .mfi; st8 [r9]=r0,-8 | ||
63 | mov f127=f0 | ||
64 | mov r127=r0 } | ||
65 | { .mfb; nop.m 0 | ||
66 | nop.f 0 | ||
67 | br.ctop.sptk .L_wipe_top };; | ||
68 | .L_wipe_end: | ||
69 | |||
70 | { .mfi; mov r11=r0 | ||
71 | mov f6=f0 | ||
72 | mov r14=r0 } | ||
73 | { .mfi; mov r15=r0 | ||
74 | mov f7=f0 | ||
75 | mov r16=r0 } | ||
76 | { .mfi; mov r17=r0 | ||
77 | mov f8=f0 | ||
78 | mov r18=r0 } | ||
79 | { .mfi; mov r19=r0 | ||
80 | mov f9=f0 | ||
81 | mov r20=r0 } | ||
82 | { .mfi; mov r21=r0 | ||
83 | mov f10=f0 | ||
84 | mov r22=r0 } | ||
85 | { .mfi; mov r23=r0 | ||
86 | mov f11=f0 | ||
87 | mov r24=r0 } | ||
88 | { .mfi; mov r25=r0 | ||
89 | mov f12=f0 | ||
90 | mov r26=r0 } | ||
91 | { .mfi; mov r27=r0 | ||
92 | mov f13=f0 | ||
93 | mov r28=r0 } | ||
94 | { .mfi; mov r29=r0 | ||
95 | mov f14=f0 | ||
96 | mov r30=r0 } | ||
97 | { .mfi; mov r31=r0 | ||
98 | mov f15=f0 | ||
99 | nop.i 0 } | ||
100 | { .mfi; mov f16=f0 } | ||
101 | { .mfi; mov f17=f0 } | ||
102 | { .mfi; mov f18=f0 } | ||
103 | { .mfi; mov f19=f0 } | ||
104 | { .mfi; mov f20=f0 } | ||
105 | { .mfi; mov f21=f0 } | ||
106 | { .mfi; mov f22=f0 } | ||
107 | { .mfi; mov f23=f0 } | ||
108 | { .mfi; mov f24=f0 } | ||
109 | { .mfi; mov f25=f0 } | ||
110 | { .mfi; mov f26=f0 } | ||
111 | { .mfi; mov f27=f0 } | ||
112 | { .mfi; mov f28=f0 } | ||
113 | { .mfi; mov f29=f0 } | ||
114 | { .mfi; mov f30=f0 } | ||
115 | { .mfi; add r9=96*8+8,r9 | ||
116 | mov f31=f0 | ||
117 | mov pr=r8,0x1ffff } | ||
118 | { .mib; mov r8=sp | ||
119 | mov ar.lc=r3 | ||
120 | br.ret.sptk b0 };; | ||
121 | .endp OPENSSL_wipe_cpu# | ||
diff --git a/src/lib/libcrypto/md5/asm/md5-ia64.S b/src/lib/libcrypto/md5/asm/md5-ia64.S deleted file mode 100644 index e7de08d46a..0000000000 --- a/src/lib/libcrypto/md5/asm/md5-ia64.S +++ /dev/null | |||
@@ -1,992 +0,0 @@ | |||
1 | /* Copyright (c) 2005 Hewlett-Packard Development Company, L.P. | ||
2 | |||
3 | Permission is hereby granted, free of charge, to any person obtaining | ||
4 | a copy of this software and associated documentation files (the | ||
5 | "Software"), to deal in the Software without restriction, including | ||
6 | without limitation the rights to use, copy, modify, merge, publish, | ||
7 | distribute, sublicense, and/or sell copies of the Software, and to | ||
8 | permit persons to whom the Software is furnished to do so, subject to | ||
9 | the following conditions: | ||
10 | |||
11 | The above copyright notice and this permission notice shall be | ||
12 | included in all copies or substantial portions of the Software. | ||
13 | |||
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE | ||
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | ||
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION | ||
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ | ||
21 | |||
22 | // Common registers are assigned as follows: | ||
23 | // | ||
24 | // COMMON | ||
25 | // | ||
26 | // t0 Const Tbl Ptr TPtr | ||
27 | // t1 Round Constant TRound | ||
28 | // t4 Block residual LenResid | ||
29 | // t5 Residual Data DTmp | ||
30 | // | ||
31 | // {in,out}0 Block 0 Cycle RotateM0 | ||
32 | // {in,out}1 Block Value 12 M12 | ||
33 | // {in,out}2 Block Value 8 M8 | ||
34 | // {in,out}3 Block Value 4 M4 | ||
35 | // {in,out}4 Block Value 0 M0 | ||
36 | // {in,out}5 Block 1 Cycle RotateM1 | ||
37 | // {in,out}6 Block Value 13 M13 | ||
38 | // {in,out}7 Block Value 9 M9 | ||
39 | // {in,out}8 Block Value 5 M5 | ||
40 | // {in,out}9 Block Value 1 M1 | ||
41 | // {in,out}10 Block 2 Cycle RotateM2 | ||
42 | // {in,out}11 Block Value 14 M14 | ||
43 | // {in,out}12 Block Value 10 M10 | ||
44 | // {in,out}13 Block Value 6 M6 | ||
45 | // {in,out}14 Block Value 2 M2 | ||
46 | // {in,out}15 Block 3 Cycle RotateM3 | ||
47 | // {in,out}16 Block Value 15 M15 | ||
48 | // {in,out}17 Block Value 11 M11 | ||
49 | // {in,out}18 Block Value 7 M7 | ||
50 | // {in,out}19 Block Value 3 M3 | ||
51 | // {in,out}20 Scratch Z | ||
52 | // {in,out}21 Scratch Y | ||
53 | // {in,out}22 Scratch X | ||
54 | // {in,out}23 Scratch W | ||
55 | // {in,out}24 Digest A A | ||
56 | // {in,out}25 Digest B B | ||
57 | // {in,out}26 Digest C C | ||
58 | // {in,out}27 Digest D D | ||
59 | // {in,out}28 Active Data Ptr DPtr | ||
60 | // in28 Dummy Value - | ||
61 | // out28 Dummy Value - | ||
62 | // bt0 Coroutine Link QUICK_RTN | ||
63 | // | ||
64 | /// These predicates are used for computing the padding block(s) and | ||
65 | /// are shared between the driver and digest co-routines | ||
66 | // | ||
67 | // pt0 Extra Pad Block pExtra | ||
68 | // pt1 Load next word pLoad | ||
69 | // pt2 Skip next word pSkip | ||
70 | // pt3 Search for Pad pNoPad | ||
71 | // pt4 Pad Word 0 pPad0 | ||
72 | // pt5 Pad Word 1 pPad1 | ||
73 | // pt6 Pad Word 2 pPad2 | ||
74 | // pt7 Pad Word 3 pPad3 | ||
75 | |||
76 | #define DTmp r19 | ||
77 | #define LenResid r18 | ||
78 | #define QUICK_RTN b6 | ||
79 | #define TPtr r14 | ||
80 | #define TRound r15 | ||
81 | #define pExtra p6 | ||
82 | #define pLoad p7 | ||
83 | #define pNoPad p9 | ||
84 | #define pPad0 p10 | ||
85 | #define pPad1 p11 | ||
86 | #define pPad2 p12 | ||
87 | #define pPad3 p13 | ||
88 | #define pSkip p8 | ||
89 | |||
90 | #define A_ out24 | ||
91 | #define B_ out25 | ||
92 | #define C_ out26 | ||
93 | #define D_ out27 | ||
94 | #define DPtr_ out28 | ||
95 | #define M0_ out4 | ||
96 | #define M1_ out9 | ||
97 | #define M10_ out12 | ||
98 | #define M11_ out17 | ||
99 | #define M12_ out1 | ||
100 | #define M13_ out6 | ||
101 | #define M14_ out11 | ||
102 | #define M15_ out16 | ||
103 | #define M2_ out14 | ||
104 | #define M3_ out19 | ||
105 | #define M4_ out3 | ||
106 | #define M5_ out8 | ||
107 | #define M6_ out13 | ||
108 | #define M7_ out18 | ||
109 | #define M8_ out2 | ||
110 | #define M9_ out7 | ||
111 | #define RotateM0_ out0 | ||
112 | #define RotateM1_ out5 | ||
113 | #define RotateM2_ out10 | ||
114 | #define RotateM3_ out15 | ||
115 | #define W_ out23 | ||
116 | #define X_ out22 | ||
117 | #define Y_ out21 | ||
118 | #define Z_ out20 | ||
119 | |||
120 | #define A in24 | ||
121 | #define B in25 | ||
122 | #define C in26 | ||
123 | #define D in27 | ||
124 | #define DPtr in28 | ||
125 | #define M0 in4 | ||
126 | #define M1 in9 | ||
127 | #define M10 in12 | ||
128 | #define M11 in17 | ||
129 | #define M12 in1 | ||
130 | #define M13 in6 | ||
131 | #define M14 in11 | ||
132 | #define M15 in16 | ||
133 | #define M2 in14 | ||
134 | #define M3 in19 | ||
135 | #define M4 in3 | ||
136 | #define M5 in8 | ||
137 | #define M6 in13 | ||
138 | #define M7 in18 | ||
139 | #define M8 in2 | ||
140 | #define M9 in7 | ||
141 | #define RotateM0 in0 | ||
142 | #define RotateM1 in5 | ||
143 | #define RotateM2 in10 | ||
144 | #define RotateM3 in15 | ||
145 | #define W in23 | ||
146 | #define X in22 | ||
147 | #define Y in21 | ||
148 | #define Z in20 | ||
149 | |||
150 | /* register stack configuration for md5_block_asm_data_order(): */ | ||
151 | #define MD5_NINP 3 | ||
152 | #define MD5_NLOC 0 | ||
153 | #define MD5_NOUT 29 | ||
154 | #define MD5_NROT 0 | ||
155 | |||
156 | /* register stack configuration for helpers: */ | ||
157 | #define _NINPUTS MD5_NOUT | ||
158 | #define _NLOCALS 0 | ||
159 | #define _NOUTPUT 0 | ||
160 | #define _NROTATE 24 /* this must be <= _NINPUTS */ | ||
161 | |||
162 | #if defined(_HPUX_SOURCE) && !defined(_LP64) | ||
163 | #define ADDP addp4 | ||
164 | #else | ||
165 | #define ADDP add | ||
166 | #endif | ||
167 | |||
168 | #if defined(_HPUX_SOURCE) || defined(B_ENDIAN) | ||
169 | #define HOST_IS_BIG_ENDIAN | ||
170 | #endif | ||
171 | |||
172 | // Macros for getting the left and right portions of little-endian words | ||
173 | |||
174 | #define GETLW(dst, src, align) dep.z dst = src, 32 - 8 * align, 8 * align | ||
175 | #define GETRW(dst, src, align) extr.u dst = src, 8 * align, 32 - 8 * align | ||
176 | |||
177 | // MD5 driver | ||
178 | // | ||
179 | // Reads an input block, then calls the digest block | ||
180 | // subroutine and adds the results to the accumulated | ||
181 | // digest. It allocates MD5_NOUT (29) outs which the subroutine | ||
182 | // uses as its inputs and rotating | ||
183 | // registers. Initializes the round constant pointer and | ||
184 | // takes care of saving/restoring ar.lc | ||
185 | // | ||
186 | /// INPUT | ||
187 | // | ||
188 | // in0 Context Ptr CtxPtr0 | ||
189 | // in1 Input Data Ptr DPtrIn | ||
190 | // in2 Integral Blocks BlockCount | ||
191 | // rp Return Address - | ||
192 | // | ||
193 | /// CODE | ||
194 | // | ||
195 | // v2 Input Align InAlign | ||
196 | // t0 Shared w/digest - | ||
197 | // t1 Shared w/digest - | ||
198 | // t2 Shared w/digest - | ||
199 | // t3 Shared w/digest - | ||
200 | // t4 Shared w/digest - | ||
201 | // t5 Shared w/digest - | ||
202 | // t6 PFS Save PFSSave | ||
203 | // t7 ar.lc Save LCSave | ||
204 | // t8 Saved PR PRSave | ||
205 | // t9 2nd CtxPtr CtxPtr1 | ||
206 | // t10 Table Base CTable | ||
207 | // t11 Table[0] CTable0 | ||
208 | // t13 Accumulator A AccumA | ||
209 | // t14 Accumulator B AccumB | ||
210 | // t15 Accumulator C AccumC | ||
211 | // t16 Accumulator D AccumD | ||
212 | // pt0 Shared w/digest - | ||
213 | // pt1 Shared w/digest - | ||
214 | // pt2 Shared w/digest - | ||
215 | // pt3 Shared w/digest - | ||
216 | // pt4 Shared w/digest - | ||
217 | // pt5 Shared w/digest - | ||
218 | // pt6 Shared w/digest - | ||
219 | // pt7 Shared w/digest - | ||
220 | // pt8 Not Aligned pOff | ||
221 | // pt8 Blocks Left pAgain | ||
222 | |||
223 | #define AccumA r27 | ||
224 | #define AccumB r28 | ||
225 | #define AccumC r29 | ||
226 | #define AccumD r30 | ||
227 | #define CTable r24 | ||
228 | #define CTable0 r25 | ||
229 | #define CtxPtr0 in0 | ||
230 | #define CtxPtr1 r23 | ||
231 | #define DPtrIn in1 | ||
232 | #define BlockCount in2 | ||
233 | #define InAlign r10 | ||
234 | #define LCSave r21 | ||
235 | #define PFSSave r20 | ||
236 | #define PRSave r22 | ||
237 | #define pAgain p63 | ||
238 | #define pOff p63 | ||
239 | |||
240 | .text | ||
241 | |||
242 | /* md5_block_asm_data_order(MD5_CTX *c, const void *data, size_t num) | ||
243 | |||
244 | where: | ||
245 | c: a pointer to a structure of this type: | ||
246 | |||
247 | typedef struct MD5state_st | ||
248 | { | ||
249 | MD5_LONG A,B,C,D; | ||
250 | MD5_LONG Nl,Nh; | ||
251 | MD5_LONG data[MD5_LBLOCK]; | ||
252 | unsigned int num; | ||
253 | } | ||
254 | MD5_CTX; | ||
255 | |||
256 | data: a pointer to the input data (may be misaligned) | ||
257 | num: the number of 64-byte blocks to hash (i.e., the length | ||
258 | of DATA is 64*NUM). | ||
259 | |||
260 | */ | ||
261 | |||
262 | .type md5_block_asm_data_order, @function | ||
263 | .global md5_block_asm_data_order | ||
264 | .align 32 | ||
265 | .proc md5_block_asm_data_order | ||
266 | md5_block_asm_data_order: | ||
267 | .md5_block: | ||
268 | .prologue | ||
269 | { .mmi | ||
270 | .save ar.pfs, PFSSave | ||
271 | alloc PFSSave = ar.pfs, MD5_NINP, MD5_NLOC, MD5_NOUT, MD5_NROT | ||
272 | ADDP CtxPtr1 = 8, CtxPtr0 | ||
273 | mov CTable = ip | ||
274 | } | ||
275 | { .mmi | ||
276 | ADDP DPtrIn = 0, DPtrIn | ||
277 | ADDP CtxPtr0 = 0, CtxPtr0 | ||
278 | .save ar.lc, LCSave | ||
279 | mov LCSave = ar.lc | ||
280 | } | ||
281 | ;; | ||
282 | { .mmi | ||
283 | add CTable = .md5_tbl_data_order#-.md5_block#, CTable | ||
284 | and InAlign = 0x3, DPtrIn | ||
285 | } | ||
286 | |||
287 | { .mmi | ||
288 | ld4 AccumA = [CtxPtr0], 4 | ||
289 | ld4 AccumC = [CtxPtr1], 4 | ||
290 | .save pr, PRSave | ||
291 | mov PRSave = pr | ||
292 | .body | ||
293 | } | ||
294 | ;; | ||
295 | { .mmi | ||
296 | ld4 AccumB = [CtxPtr0] | ||
297 | ld4 AccumD = [CtxPtr1] | ||
298 | dep DPtr_ = 0, DPtrIn, 0, 2 | ||
299 | } ;; | ||
300 | #ifdef HOST_IS_BIG_ENDIAN | ||
301 | rum psr.be;; // switch to little-endian | ||
302 | #endif | ||
303 | { .mmb | ||
304 | ld4 CTable0 = [CTable], 4 | ||
305 | cmp.ne pOff, p0 = 0, InAlign | ||
306 | (pOff) br.cond.spnt.many .md5_unaligned | ||
307 | } ;; | ||
308 | |||
309 | // The FF load/compute loop rotates values three times, so that | ||
310 | // loading into M12 here produces the M0 value, M13 -> M1, etc. | ||
311 | |||
312 | .md5_block_loop0: | ||
313 | { .mmi | ||
314 | ld4 M12_ = [DPtr_], 4 | ||
315 | mov TPtr = CTable | ||
316 | mov TRound = CTable0 | ||
317 | } ;; | ||
318 | { .mmi | ||
319 | ld4 M13_ = [DPtr_], 4 | ||
320 | mov A_ = AccumA | ||
321 | mov B_ = AccumB | ||
322 | } ;; | ||
323 | { .mmi | ||
324 | ld4 M14_ = [DPtr_], 4 | ||
325 | mov C_ = AccumC | ||
326 | mov D_ = AccumD | ||
327 | } ;; | ||
328 | { .mmb | ||
329 | ld4 M15_ = [DPtr_], 4 | ||
330 | add BlockCount = -1, BlockCount | ||
331 | br.call.sptk.many QUICK_RTN = md5_digest_block0 | ||
332 | } ;; | ||
333 | |||
334 | // Now, we add the new digest values and do some clean-up | ||
335 | // before checking if there's another full block to process | ||
336 | |||
337 | { .mmi | ||
338 | add AccumA = AccumA, A_ | ||
339 | add AccumB = AccumB, B_ | ||
340 | cmp.ne pAgain, p0 = 0, BlockCount | ||
341 | } | ||
342 | { .mib | ||
343 | add AccumC = AccumC, C_ | ||
344 | add AccumD = AccumD, D_ | ||
345 | (pAgain) br.cond.dptk.many .md5_block_loop0 | ||
346 | } ;; | ||
347 | |||
348 | .md5_exit: | ||
349 | #ifdef HOST_IS_BIG_ENDIAN | ||
350 | sum psr.be;; // switch back to big-endian mode | ||
351 | #endif | ||
352 | { .mmi | ||
353 | st4 [CtxPtr0] = AccumB, -4 | ||
354 | st4 [CtxPtr1] = AccumD, -4 | ||
355 | mov pr = PRSave, 0x1ffff ;; | ||
356 | } | ||
357 | { .mmi | ||
358 | st4 [CtxPtr0] = AccumA | ||
359 | st4 [CtxPtr1] = AccumC | ||
360 | mov ar.lc = LCSave | ||
361 | } ;; | ||
362 | { .mib | ||
363 | mov ar.pfs = PFSSave | ||
364 | br.ret.sptk.few rp | ||
365 | } ;; | ||
366 | |||
367 | #define MD5UNALIGNED(offset) \ | ||
368 | .md5_process##offset: \ | ||
369 | { .mib ; \ | ||
370 | nop 0x0 ; \ | ||
371 | GETRW(DTmp, DTmp, offset) ; \ | ||
372 | } ;; \ | ||
373 | .md5_block_loop##offset: \ | ||
374 | { .mmi ; \ | ||
375 | ld4 Y_ = [DPtr_], 4 ; \ | ||
376 | mov TPtr = CTable ; \ | ||
377 | mov TRound = CTable0 ; \ | ||
378 | } ;; \ | ||
379 | { .mmi ; \ | ||
380 | ld4 M13_ = [DPtr_], 4 ; \ | ||
381 | mov A_ = AccumA ; \ | ||
382 | mov B_ = AccumB ; \ | ||
383 | } ;; \ | ||
384 | { .mii ; \ | ||
385 | ld4 M14_ = [DPtr_], 4 ; \ | ||
386 | GETLW(W_, Y_, offset) ; \ | ||
387 | mov C_ = AccumC ; \ | ||
388 | } \ | ||
389 | { .mmi ; \ | ||
390 | mov D_ = AccumD ;; \ | ||
391 | or M12_ = W_, DTmp ; \ | ||
392 | GETRW(DTmp, Y_, offset) ; \ | ||
393 | } \ | ||
394 | { .mib ; \ | ||
395 | ld4 M15_ = [DPtr_], 4 ; \ | ||
396 | add BlockCount = -1, BlockCount ; \ | ||
397 | br.call.sptk.many QUICK_RTN = md5_digest_block##offset; \ | ||
398 | } ;; \ | ||
399 | { .mmi ; \ | ||
400 | add AccumA = AccumA, A_ ; \ | ||
401 | add AccumB = AccumB, B_ ; \ | ||
402 | cmp.ne pAgain, p0 = 0, BlockCount ; \ | ||
403 | } \ | ||
404 | { .mib ; \ | ||
405 | add AccumC = AccumC, C_ ; \ | ||
406 | add AccumD = AccumD, D_ ; \ | ||
407 | (pAgain) br.cond.dptk.many .md5_block_loop##offset ; \ | ||
408 | } ;; \ | ||
409 | { .mib ; \ | ||
410 | nop 0x0 ; \ | ||
411 | nop 0x0 ; \ | ||
412 | br.cond.sptk.many .md5_exit ; \ | ||
413 | } ;; | ||
414 | |||
415 | .align 32 | ||
416 | .md5_unaligned: | ||
417 | // | ||
418 | // Because variable shifts are expensive, we special case each of | ||
419 | // the four alignments. In practice, this won't hurt too much | ||
420 | // since only one working set of code will be loaded. | ||
421 | // | ||
422 | { .mib | ||
423 | ld4 DTmp = [DPtr_], 4 | ||
424 | cmp.eq pOff, p0 = 1, InAlign | ||
425 | (pOff) br.cond.dpnt.many .md5_process1 | ||
426 | } ;; | ||
427 | { .mib | ||
428 | cmp.eq pOff, p0 = 2, InAlign | ||
429 | nop 0x0 | ||
430 | (pOff) br.cond.dpnt.many .md5_process2 | ||
431 | } ;; | ||
432 | MD5UNALIGNED(3) | ||
433 | MD5UNALIGNED(1) | ||
434 | MD5UNALIGNED(2) | ||
435 | |||
436 | .endp md5_block_asm_data_order | ||
437 | |||
438 | |||
439 | // MD5 Perform the F function and load | ||
440 | // | ||
441 | // Passed the first 4 words (M0 - M3) and initial (A, B, C, D) values, | ||
442 | // computes the FF() round of functions, then branches to the common | ||
443 | // digest code to finish up with GG(), HH(), and II(). | ||
444 | // | ||
445 | // INPUT | ||
446 | // | ||
447 | // rp Return Address - | ||
448 | // | ||
449 | // CODE | ||
450 | // | ||
451 | // v0 PFS bit bucket PFS | ||
452 | // v1 Loop Trip Count LTrip | ||
453 | // pt0 Load next word pMore | ||
454 | |||
455 | /* For F round: */ | ||
456 | #define LTrip r9 | ||
457 | #define PFS r8 | ||
458 | #define pMore p6 | ||
459 | |||
460 | /* For GHI rounds: */ | ||
461 | #define T r9 | ||
462 | #define U r10 | ||
463 | #define V r11 | ||
464 | |||
465 | #define COMPUTE(a, b, s, M, R) \ | ||
466 | { \ | ||
467 | .mii ; \ | ||
468 | ld4 TRound = [TPtr], 4 ; \ | ||
469 | dep.z Y = Z, 32, 32 ;; \ | ||
470 | shrp Z = Z, Y, 64 - s ; \ | ||
471 | } ;; \ | ||
472 | { \ | ||
473 | .mmi ; \ | ||
474 | add a = Z, b ; \ | ||
475 | mov R = M ; \ | ||
476 | nop 0x0 ; \ | ||
477 | } ;; | ||
478 | |||
479 | #define LOOP(a, b, s, M, R, label) \ | ||
480 | { .mii ; \ | ||
481 | ld4 TRound = [TPtr], 4 ; \ | ||
482 | dep.z Y = Z, 32, 32 ;; \ | ||
483 | shrp Z = Z, Y, 64 - s ; \ | ||
484 | } ;; \ | ||
485 | { .mib ; \ | ||
486 | add a = Z, b ; \ | ||
487 | mov R = M ; \ | ||
488 | br.ctop.sptk.many label ; \ | ||
489 | } ;; | ||
490 | |||
491 | // G(B, C, D) = (B & D) | (C & ~D) | ||
492 | |||
493 | #define G(a, b, c, d, M) \ | ||
494 | { .mmi ; \ | ||
495 | add Z = M, TRound ; \ | ||
496 | and Y = b, d ; \ | ||
497 | andcm X = c, d ; \ | ||
498 | } ;; \ | ||
499 | { .mii ; \ | ||
500 | add Z = Z, a ; \ | ||
501 | or Y = Y, X ;; \ | ||
502 | add Z = Z, Y ; \ | ||
503 | } ;; | ||
504 | |||
505 | // H(B, C, D) = B ^ C ^ D | ||
506 | |||
507 | #define H(a, b, c, d, M) \ | ||
508 | { .mmi ; \ | ||
509 | add Z = M, TRound ; \ | ||
510 | xor Y = b, c ; \ | ||
511 | nop 0x0 ; \ | ||
512 | } ;; \ | ||
513 | { .mii ; \ | ||
514 | add Z = Z, a ; \ | ||
515 | xor Y = Y, d ;; \ | ||
516 | add Z = Z, Y ; \ | ||
517 | } ;; | ||
518 | |||
519 | // I(B, C, D) = C ^ (B | ~D) | ||
520 | // | ||
521 | // However, since we have an andcm operator, we use the fact that | ||
522 | // | ||
523 | // Y ^ Z == ~Y ^ ~Z | ||
524 | // | ||
525 | // to rewrite the expression as | ||
526 | // | ||
527 | // I(B, C, D) = ~C ^ (~B & D) | ||
528 | |||
529 | #define I(a, b, c, d, M) \ | ||
530 | { .mmi ; \ | ||
531 | add Z = M, TRound ; \ | ||
532 | andcm Y = d, b ; \ | ||
533 | andcm X = -1, c ; \ | ||
534 | } ;; \ | ||
535 | { .mii ; \ | ||
536 | add Z = Z, a ; \ | ||
537 | xor Y = Y, X ;; \ | ||
538 | add Z = Z, Y ; \ | ||
539 | } ;; | ||
540 | |||
541 | #define GG4(label) \ | ||
542 | G(A, B, C, D, M0) \ | ||
543 | COMPUTE(A, B, 5, M0, RotateM0) \ | ||
544 | G(D, A, B, C, M1) \ | ||
545 | COMPUTE(D, A, 9, M1, RotateM1) \ | ||
546 | G(C, D, A, B, M2) \ | ||
547 | COMPUTE(C, D, 14, M2, RotateM2) \ | ||
548 | G(B, C, D, A, M3) \ | ||
549 | LOOP(B, C, 20, M3, RotateM3, label) | ||
550 | |||
551 | #define HH4(label) \ | ||
552 | H(A, B, C, D, M0) \ | ||
553 | COMPUTE(A, B, 4, M0, RotateM0) \ | ||
554 | H(D, A, B, C, M1) \ | ||
555 | COMPUTE(D, A, 11, M1, RotateM1) \ | ||
556 | H(C, D, A, B, M2) \ | ||
557 | COMPUTE(C, D, 16, M2, RotateM2) \ | ||
558 | H(B, C, D, A, M3) \ | ||
559 | LOOP(B, C, 23, M3, RotateM3, label) | ||
560 | |||
561 | #define II4(label) \ | ||
562 | I(A, B, C, D, M0) \ | ||
563 | COMPUTE(A, B, 6, M0, RotateM0) \ | ||
564 | I(D, A, B, C, M1) \ | ||
565 | COMPUTE(D, A, 10, M1, RotateM1) \ | ||
566 | I(C, D, A, B, M2) \ | ||
567 | COMPUTE(C, D, 15, M2, RotateM2) \ | ||
568 | I(B, C, D, A, M3) \ | ||
569 | LOOP(B, C, 21, M3, RotateM3, label) | ||
570 | |||
571 | #define FFLOAD(a, b, c, d, M, N, s) \ | ||
572 | { .mii ; \ | ||
573 | (pMore) ld4 N = [DPtr], 4 ; \ | ||
574 | add Z = M, TRound ; \ | ||
575 | and Y = c, b ; \ | ||
576 | } \ | ||
577 | { .mmi ; \ | ||
578 | andcm X = d, b ;; \ | ||
579 | add Z = Z, a ; \ | ||
580 | or Y = Y, X ; \ | ||
581 | } ;; \ | ||
582 | { .mii ; \ | ||
583 | ld4 TRound = [TPtr], 4 ; \ | ||
584 | add Z = Z, Y ;; \ | ||
585 | dep.z Y = Z, 32, 32 ; \ | ||
586 | } ;; \ | ||
587 | { .mii ; \ | ||
588 | nop 0x0 ; \ | ||
589 | shrp Z = Z, Y, 64 - s ;; \ | ||
590 | add a = Z, b ; \ | ||
591 | } ;; | ||
592 | |||
593 | #define FFLOOP(a, b, c, d, M, N, s, dest) \ | ||
594 | { .mii ; \ | ||
595 | (pMore) ld4 N = [DPtr], 4 ; \ | ||
596 | add Z = M, TRound ; \ | ||
597 | and Y = c, b ; \ | ||
598 | } \ | ||
599 | { .mmi ; \ | ||
600 | andcm X = d, b ;; \ | ||
601 | add Z = Z, a ; \ | ||
602 | or Y = Y, X ; \ | ||
603 | } ;; \ | ||
604 | { .mii ; \ | ||
605 | ld4 TRound = [TPtr], 4 ; \ | ||
606 | add Z = Z, Y ;; \ | ||
607 | dep.z Y = Z, 32, 32 ; \ | ||
608 | } ;; \ | ||
609 | { .mii ; \ | ||
610 | nop 0x0 ; \ | ||
611 | shrp Z = Z, Y, 64 - s ;; \ | ||
612 | add a = Z, b ; \ | ||
613 | } \ | ||
614 | { .mib ; \ | ||
615 | cmp.ne pMore, p0 = 0, LTrip ; \ | ||
616 | add LTrip = -1, LTrip ; \ | ||
617 | br.ctop.dptk.many dest ; \ | ||
618 | } ;; | ||
619 | |||
620 | .type md5_digest_block0, @function | ||
621 | .align 32 | ||
622 | |||
623 | .proc md5_digest_block0 | ||
624 | .prologue | ||
625 | md5_digest_block0: | ||
626 | .altrp QUICK_RTN | ||
627 | .body | ||
628 | { .mmi | ||
629 | alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE | ||
630 | mov LTrip = 2 | ||
631 | mov ar.lc = 3 | ||
632 | } ;; | ||
633 | { .mii | ||
634 | cmp.eq pMore, p0 = r0, r0 | ||
635 | mov ar.ec = 0 | ||
636 | nop 0x0 | ||
637 | } ;; | ||
638 | |||
639 | .md5_FF_round0: | ||
640 | FFLOAD(A, B, C, D, M12, RotateM0, 7) | ||
641 | FFLOAD(D, A, B, C, M13, RotateM1, 12) | ||
642 | FFLOAD(C, D, A, B, M14, RotateM2, 17) | ||
643 | FFLOOP(B, C, D, A, M15, RotateM3, 22, .md5_FF_round0) | ||
644 | // | ||
645 | // !!! Fall through to md5_digest_GHI | ||
646 | // | ||
647 | .endp md5_digest_block0 | ||
648 | |||
649 | .type md5_digest_GHI, @function | ||
650 | .align 32 | ||
651 | |||
652 | .proc md5_digest_GHI | ||
653 | .prologue | ||
654 | .regstk _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE | ||
655 | md5_digest_GHI: | ||
656 | .altrp QUICK_RTN | ||
657 | .body | ||
658 | // | ||
659 | // The following sequence shuffles the block constants round for the | ||
660 | // next round: | ||
661 | // | ||
662 | // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | ||
663 | // 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12 | ||
664 | // | ||
665 | { .mmi | ||
666 | mov Z = M0 | ||
667 | mov Y = M15 | ||
668 | mov ar.lc = 3 | ||
669 | } | ||
670 | { .mmi | ||
671 | mov X = M2 | ||
672 | mov W = M9 | ||
673 | mov V = M4 | ||
674 | } ;; | ||
675 | |||
676 | { .mmi | ||
677 | mov M0 = M1 | ||
678 | mov M15 = M12 | ||
679 | mov ar.ec = 1 | ||
680 | } | ||
681 | { .mmi | ||
682 | mov M2 = M11 | ||
683 | mov M9 = M14 | ||
684 | mov M4 = M5 | ||
685 | } ;; | ||
686 | |||
687 | { .mmi | ||
688 | mov M1 = M6 | ||
689 | mov M12 = M13 | ||
690 | mov U = M3 | ||
691 | } | ||
692 | { .mmi | ||
693 | mov M11 = M8 | ||
694 | mov M14 = M7 | ||
695 | mov M5 = M10 | ||
696 | } ;; | ||
697 | |||
698 | { .mmi | ||
699 | mov M6 = Y | ||
700 | mov M13 = X | ||
701 | mov M3 = Z | ||
702 | } | ||
703 | { .mmi | ||
704 | mov M8 = W | ||
705 | mov M7 = V | ||
706 | mov M10 = U | ||
707 | } ;; | ||
708 | |||
709 | .md5_GG_round: | ||
710 | GG4(.md5_GG_round) | ||
711 | |||
712 | // The following sequence shuffles the block constants round for the | ||
713 | // next round: | ||
714 | // | ||
715 | // 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12 | ||
716 | // 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2 | ||
717 | |||
718 | { .mmi | ||
719 | mov Z = M0 | ||
720 | mov Y = M1 | ||
721 | mov ar.lc = 3 | ||
722 | } | ||
723 | { .mmi | ||
724 | mov X = M3 | ||
725 | mov W = M5 | ||
726 | mov V = M6 | ||
727 | } ;; | ||
728 | |||
729 | { .mmi | ||
730 | mov M0 = M4 | ||
731 | mov M1 = M11 | ||
732 | mov ar.ec = 1 | ||
733 | } | ||
734 | { .mmi | ||
735 | mov M3 = M9 | ||
736 | mov U = M8 | ||
737 | mov T = M13 | ||
738 | } ;; | ||
739 | |||
740 | { .mmi | ||
741 | mov M4 = Z | ||
742 | mov M11 = Y | ||
743 | mov M5 = M7 | ||
744 | } | ||
745 | { .mmi | ||
746 | mov M6 = M14 | ||
747 | mov M8 = M12 | ||
748 | mov M13 = M15 | ||
749 | } ;; | ||
750 | |||
751 | { .mmi | ||
752 | mov M7 = W | ||
753 | mov M14 = V | ||
754 | nop 0x0 | ||
755 | } | ||
756 | { .mmi | ||
757 | mov M9 = X | ||
758 | mov M12 = U | ||
759 | mov M15 = T | ||
760 | } ;; | ||
761 | |||
762 | .md5_HH_round: | ||
763 | HH4(.md5_HH_round) | ||
764 | |||
765 | // The following sequence shuffles the block constants round for the | ||
766 | // next round: | ||
767 | // | ||
768 | // 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2 | ||
769 | // 0 7 14 5 12 3 10 1 8 15 6 13 4 11 2 9 | ||
770 | |||
771 | { .mmi | ||
772 | mov Z = M0 | ||
773 | mov Y = M15 | ||
774 | mov ar.lc = 3 | ||
775 | } | ||
776 | { .mmi | ||
777 | mov X = M10 | ||
778 | mov W = M1 | ||
779 | mov V = M4 | ||
780 | } ;; | ||
781 | |||
782 | { .mmi | ||
783 | mov M0 = M9 | ||
784 | mov M15 = M12 | ||
785 | mov ar.ec = 1 | ||
786 | } | ||
787 | { .mmi | ||
788 | mov M10 = M11 | ||
789 | mov M1 = M6 | ||
790 | mov M4 = M13 | ||
791 | } ;; | ||
792 | |||
793 | { .mmi | ||
794 | mov M9 = M14 | ||
795 | mov M12 = M5 | ||
796 | mov U = M3 | ||
797 | } | ||
798 | { .mmi | ||
799 | mov M11 = M8 | ||
800 | mov M6 = M7 | ||
801 | mov M13 = M2 | ||
802 | } ;; | ||
803 | |||
804 | { .mmi | ||
805 | mov M14 = Y | ||
806 | mov M5 = X | ||
807 | mov M3 = Z | ||
808 | } | ||
809 | { .mmi | ||
810 | mov M8 = W | ||
811 | mov M7 = V | ||
812 | mov M2 = U | ||
813 | } ;; | ||
814 | |||
815 | .md5_II_round: | ||
816 | II4(.md5_II_round) | ||
817 | |||
818 | { .mib | ||
819 | nop 0x0 | ||
820 | nop 0x0 | ||
821 | br.ret.sptk.many QUICK_RTN | ||
822 | } ;; | ||
823 | |||
824 | .endp md5_digest_GHI | ||
825 | |||
826 | #define FFLOADU(a, b, c, d, M, P, N, s, offset) \ | ||
827 | { .mii ; \ | ||
828 | (pMore) ld4 N = [DPtr], 4 ; \ | ||
829 | add Z = M, TRound ; \ | ||
830 | and Y = c, b ; \ | ||
831 | } \ | ||
832 | { .mmi ; \ | ||
833 | andcm X = d, b ;; \ | ||
834 | add Z = Z, a ; \ | ||
835 | or Y = Y, X ; \ | ||
836 | } ;; \ | ||
837 | { .mii ; \ | ||
838 | ld4 TRound = [TPtr], 4 ; \ | ||
839 | GETLW(W, P, offset) ; \ | ||
840 | add Z = Z, Y ; \ | ||
841 | } ;; \ | ||
842 | { .mii ; \ | ||
843 | or W = W, DTmp ; \ | ||
844 | dep.z Y = Z, 32, 32 ;; \ | ||
845 | shrp Z = Z, Y, 64 - s ; \ | ||
846 | } ;; \ | ||
847 | { .mii ; \ | ||
848 | add a = Z, b ; \ | ||
849 | GETRW(DTmp, P, offset) ; \ | ||
850 | mov P = W ; \ | ||
851 | } ;; | ||
852 | |||
853 | #define FFLOOPU(a, b, c, d, M, P, N, s, offset) \ | ||
854 | { .mii ; \ | ||
855 | (pMore) ld4 N = [DPtr], 4 ; \ | ||
856 | add Z = M, TRound ; \ | ||
857 | and Y = c, b ; \ | ||
858 | } \ | ||
859 | { .mmi ; \ | ||
860 | andcm X = d, b ;; \ | ||
861 | add Z = Z, a ; \ | ||
862 | or Y = Y, X ; \ | ||
863 | } ;; \ | ||
864 | { .mii ; \ | ||
865 | ld4 TRound = [TPtr], 4 ; \ | ||
866 | (pMore) GETLW(W, P, offset) ; \ | ||
867 | add Z = Z, Y ; \ | ||
868 | } ;; \ | ||
869 | { .mii ; \ | ||
870 | (pMore) or W = W, DTmp ; \ | ||
871 | dep.z Y = Z, 32, 32 ;; \ | ||
872 | shrp Z = Z, Y, 64 - s ; \ | ||
873 | } ;; \ | ||
874 | { .mii ; \ | ||
875 | add a = Z, b ; \ | ||
876 | (pMore) GETRW(DTmp, P, offset) ; \ | ||
877 | (pMore) mov P = W ; \ | ||
878 | } \ | ||
879 | { .mib ; \ | ||
880 | cmp.ne pMore, p0 = 0, LTrip ; \ | ||
881 | add LTrip = -1, LTrip ; \ | ||
882 | br.ctop.sptk.many .md5_FF_round##offset ; \ | ||
883 | } ;; | ||
884 | |||
885 | #define MD5FBLOCK(offset) \ | ||
886 | .type md5_digest_block##offset, @function ; \ | ||
887 | \ | ||
888 | .align 32 ; \ | ||
889 | .proc md5_digest_block##offset ; \ | ||
890 | .prologue ; \ | ||
891 | .altrp QUICK_RTN ; \ | ||
892 | .body ; \ | ||
893 | md5_digest_block##offset: \ | ||
894 | { .mmi ; \ | ||
895 | alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE ; \ | ||
896 | mov LTrip = 2 ; \ | ||
897 | mov ar.lc = 3 ; \ | ||
898 | } ;; \ | ||
899 | { .mii ; \ | ||
900 | cmp.eq pMore, p0 = r0, r0 ; \ | ||
901 | mov ar.ec = 0 ; \ | ||
902 | nop 0x0 ; \ | ||
903 | } ;; \ | ||
904 | \ | ||
905 | .pred.rel "mutex", pLoad, pSkip ; \ | ||
906 | .md5_FF_round##offset: \ | ||
907 | FFLOADU(A, B, C, D, M12, M13, RotateM0, 7, offset) \ | ||
908 | FFLOADU(D, A, B, C, M13, M14, RotateM1, 12, offset) \ | ||
909 | FFLOADU(C, D, A, B, M14, M15, RotateM2, 17, offset) \ | ||
910 | FFLOOPU(B, C, D, A, M15, RotateM0, RotateM3, 22, offset) \ | ||
911 | \ | ||
912 | { .mib ; \ | ||
913 | nop 0x0 ; \ | ||
914 | nop 0x0 ; \ | ||
915 | br.cond.sptk.many md5_digest_GHI ; \ | ||
916 | } ;; \ | ||
917 | .endp md5_digest_block##offset | ||
918 | |||
919 | MD5FBLOCK(1) | ||
920 | MD5FBLOCK(2) | ||
921 | MD5FBLOCK(3) | ||
922 | |||
923 | .align 64 | ||
924 | .type md5_constants, @object | ||
925 | md5_constants: | ||
926 | .md5_tbl_data_order: // To ensure little-endian data | ||
927 | // order, code as bytes. | ||
928 | data1 0x78, 0xa4, 0x6a, 0xd7 // 0 | ||
929 | data1 0x56, 0xb7, 0xc7, 0xe8 // 1 | ||
930 | data1 0xdb, 0x70, 0x20, 0x24 // 2 | ||
931 | data1 0xee, 0xce, 0xbd, 0xc1 // 3 | ||
932 | data1 0xaf, 0x0f, 0x7c, 0xf5 // 4 | ||
933 | data1 0x2a, 0xc6, 0x87, 0x47 // 5 | ||
934 | data1 0x13, 0x46, 0x30, 0xa8 // 6 | ||
935 | data1 0x01, 0x95, 0x46, 0xfd // 7 | ||
936 | data1 0xd8, 0x98, 0x80, 0x69 // 8 | ||
937 | data1 0xaf, 0xf7, 0x44, 0x8b // 9 | ||
938 | data1 0xb1, 0x5b, 0xff, 0xff // 10 | ||
939 | data1 0xbe, 0xd7, 0x5c, 0x89 // 11 | ||
940 | data1 0x22, 0x11, 0x90, 0x6b // 12 | ||
941 | data1 0x93, 0x71, 0x98, 0xfd // 13 | ||
942 | data1 0x8e, 0x43, 0x79, 0xa6 // 14 | ||
943 | data1 0x21, 0x08, 0xb4, 0x49 // 15 | ||
944 | data1 0x62, 0x25, 0x1e, 0xf6 // 16 | ||
945 | data1 0x40, 0xb3, 0x40, 0xc0 // 17 | ||
946 | data1 0x51, 0x5a, 0x5e, 0x26 // 18 | ||
947 | data1 0xaa, 0xc7, 0xb6, 0xe9 // 19 | ||
948 | data1 0x5d, 0x10, 0x2f, 0xd6 // 20 | ||
949 | data1 0x53, 0x14, 0x44, 0x02 // 21 | ||
950 | data1 0x81, 0xe6, 0xa1, 0xd8 // 22 | ||
951 | data1 0xc8, 0xfb, 0xd3, 0xe7 // 23 | ||
952 | data1 0xe6, 0xcd, 0xe1, 0x21 // 24 | ||
953 | data1 0xd6, 0x07, 0x37, 0xc3 // 25 | ||
954 | data1 0x87, 0x0d, 0xd5, 0xf4 // 26 | ||
955 | data1 0xed, 0x14, 0x5a, 0x45 // 27 | ||
956 | data1 0x05, 0xe9, 0xe3, 0xa9 // 28 | ||
957 | data1 0xf8, 0xa3, 0xef, 0xfc // 29 | ||
958 | data1 0xd9, 0x02, 0x6f, 0x67 // 30 | ||
959 | data1 0x8a, 0x4c, 0x2a, 0x8d // 31 | ||
960 | data1 0x42, 0x39, 0xfa, 0xff // 32 | ||
961 | data1 0x81, 0xf6, 0x71, 0x87 // 33 | ||
962 | data1 0x22, 0x61, 0x9d, 0x6d // 34 | ||
963 | data1 0x0c, 0x38, 0xe5, 0xfd // 35 | ||
964 | data1 0x44, 0xea, 0xbe, 0xa4 // 36 | ||
965 | data1 0xa9, 0xcf, 0xde, 0x4b // 37 | ||
966 | data1 0x60, 0x4b, 0xbb, 0xf6 // 38 | ||
967 | data1 0x70, 0xbc, 0xbf, 0xbe // 39 | ||
968 | data1 0xc6, 0x7e, 0x9b, 0x28 // 40 | ||
969 | data1 0xfa, 0x27, 0xa1, 0xea // 41 | ||
970 | data1 0x85, 0x30, 0xef, 0xd4 // 42 | ||
971 | data1 0x05, 0x1d, 0x88, 0x04 // 43 | ||
972 | data1 0x39, 0xd0, 0xd4, 0xd9 // 44 | ||
973 | data1 0xe5, 0x99, 0xdb, 0xe6 // 45 | ||
974 | data1 0xf8, 0x7c, 0xa2, 0x1f // 46 | ||
975 | data1 0x65, 0x56, 0xac, 0xc4 // 47 | ||
976 | data1 0x44, 0x22, 0x29, 0xf4 // 48 | ||
977 | data1 0x97, 0xff, 0x2a, 0x43 // 49 | ||
978 | data1 0xa7, 0x23, 0x94, 0xab // 50 | ||
979 | data1 0x39, 0xa0, 0x93, 0xfc // 51 | ||
980 | data1 0xc3, 0x59, 0x5b, 0x65 // 52 | ||
981 | data1 0x92, 0xcc, 0x0c, 0x8f // 53 | ||
982 | data1 0x7d, 0xf4, 0xef, 0xff // 54 | ||
983 | data1 0xd1, 0x5d, 0x84, 0x85 // 55 | ||
984 | data1 0x4f, 0x7e, 0xa8, 0x6f // 56 | ||
985 | data1 0xe0, 0xe6, 0x2c, 0xfe // 57 | ||
986 | data1 0x14, 0x43, 0x01, 0xa3 // 58 | ||
987 | data1 0xa1, 0x11, 0x08, 0x4e // 59 | ||
988 | data1 0x82, 0x7e, 0x53, 0xf7 // 60 | ||
989 | data1 0x35, 0xf2, 0x3a, 0xbd // 61 | ||
990 | data1 0xbb, 0xd2, 0xd7, 0x2a // 62 | ||
991 | data1 0x91, 0xd3, 0x86, 0xeb // 63 | ||
992 | .size md5_constants#,64*4 | ||
diff --git a/src/lib/libcrypto/modes/asm/ghash-ia64.pl b/src/lib/libcrypto/modes/asm/ghash-ia64.pl deleted file mode 100755 index 0354c95444..0000000000 --- a/src/lib/libcrypto/modes/asm/ghash-ia64.pl +++ /dev/null | |||
@@ -1,463 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # March 2010 | ||
11 | # | ||
12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
14 | # uses 256 bytes per-key table [+128 bytes shared table]. Streamed | ||
15 | # GHASH performance was measured to be 6.67 cycles per processed byte | ||
16 | # on Itanium 2, which is >90% better than Microsoft compiler generated | ||
17 | # code. To anchor to something else sha1-ia64.pl module processes one | ||
18 | # byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per | ||
19 | # byte. | ||
20 | |||
21 | # September 2010 | ||
22 | # | ||
23 | # It was originally thought that it makes less sense to implement | ||
24 | # "528B" variant on Itanium 2 for the following reason. Because the number of | ||
25 | # functional units is naturally limited, it appeared impossible to | ||
26 | # implement "528B" loop in 4 cycles, only in 5. This would mean that | ||
27 | # theoretically performance improvement couldn't be more than 20%. | ||
28 | # But occasionally you prove yourself wrong:-) I figured out a way to | ||
29 | # fold couple of instructions and having freed yet another instruction | ||
30 | # slot by unrolling the loop... Resulting performance is 4.45 cycles | ||
31 | # per processed byte and 50% better than "256B" version. On original | ||
32 | # Itanium performance should remain the same as the "256B" version, | ||
33 | # i.e. ~8.5 cycles. | ||
34 | |||
35 | $output=shift and (open STDOUT,">$output" or die "can't open $output: $!"); | ||
36 | |||
37 | if ($^O eq "hpux") { | ||
38 | $ADDP="addp4"; | ||
39 | for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } | ||
40 | } else { $ADDP="add"; } | ||
41 | for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); | ||
42 | $big_endian=0 if (/\-DL_ENDIAN/); } | ||
43 | if (!defined($big_endian)) | ||
44 | { $big_endian=(unpack('L',pack('N',1))==1); } | ||
45 | |||
46 | sub loop() { | ||
47 | my $label=shift; | ||
48 | my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp | ||
49 | |||
50 | # Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e. | ||
51 | # in scalable manner;-) Naturally assuming data in L1 cache... | ||
52 | # Special note about 'dep' instruction, which is used to construct | ||
53 | # &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128 | ||
54 | # bytes boundary and lower 7 bits of its address are guaranteed to | ||
55 | # be zero. | ||
56 | $code.=<<___; | ||
57 | $label: | ||
58 | { .mfi; (p18) ld8 Hlo=[Hi[1]],-8 | ||
59 | (p19) dep rem=Zlo,rem_4bitp,3,4 } | ||
60 | { .mfi; (p19) xor Zhi=Zhi,Hhi | ||
61 | ($p17) xor xi[1]=xi[1],in[1] };; | ||
62 | { .mfi; (p18) ld8 Hhi=[Hi[1]] | ||
63 | (p19) shrp Zlo=Zhi,Zlo,4 } | ||
64 | { .mfi; (p19) ld8 rem=[rem] | ||
65 | (p18) and Hi[1]=mask0xf0,xi[2] };; | ||
66 | { .mmi; ($p16) ld1 in[0]=[inp],-1 | ||
67 | (p18) xor Zlo=Zlo,Hlo | ||
68 | (p19) shr.u Zhi=Zhi,4 } | ||
69 | { .mib; (p19) xor Hhi=Hhi,rem | ||
70 | (p18) add Hi[1]=Htbl,Hi[1] };; | ||
71 | |||
72 | { .mfi; (p18) ld8 Hlo=[Hi[1]],-8 | ||
73 | (p18) dep rem=Zlo,rem_4bitp,3,4 } | ||
74 | { .mfi; (p17) shladd Hi[0]=xi[1],4,r0 | ||
75 | (p18) xor Zhi=Zhi,Hhi };; | ||
76 | { .mfi; (p18) ld8 Hhi=[Hi[1]] | ||
77 | (p18) shrp Zlo=Zhi,Zlo,4 } | ||
78 | { .mfi; (p18) ld8 rem=[rem] | ||
79 | (p17) and Hi[0]=mask0xf0,Hi[0] };; | ||
80 | { .mmi; (p16) ld1 xi[0]=[Xi],-1 | ||
81 | (p18) xor Zlo=Zlo,Hlo | ||
82 | (p18) shr.u Zhi=Zhi,4 } | ||
83 | { .mib; (p18) xor Hhi=Hhi,rem | ||
84 | (p17) add Hi[0]=Htbl,Hi[0] | ||
85 | br.ctop.sptk $label };; | ||
86 | ___ | ||
87 | } | ||
88 | |||
89 | $code=<<___; | ||
90 | .explicit | ||
91 | .text | ||
92 | |||
93 | prevfs=r2; prevlc=r3; prevpr=r8; | ||
94 | mask0xf0=r21; | ||
95 | rem=r22; rem_4bitp=r23; | ||
96 | Xi=r24; Htbl=r25; | ||
97 | inp=r26; end=r27; | ||
98 | Hhi=r28; Hlo=r29; | ||
99 | Zhi=r30; Zlo=r31; | ||
100 | |||
101 | .align 128 | ||
102 | .skip 16 // aligns loop body | ||
103 | .global gcm_gmult_4bit# | ||
104 | .proc gcm_gmult_4bit# | ||
105 | gcm_gmult_4bit: | ||
106 | .prologue | ||
107 | { .mmi; .save ar.pfs,prevfs | ||
108 | alloc prevfs=ar.pfs,2,6,0,8 | ||
109 | $ADDP Xi=15,in0 // &Xi[15] | ||
110 | mov rem_4bitp=ip } | ||
111 | { .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo | ||
112 | .save ar.lc,prevlc | ||
113 | mov prevlc=ar.lc | ||
114 | .save pr,prevpr | ||
115 | mov prevpr=pr };; | ||
116 | |||
117 | .body | ||
118 | .rotr in[3],xi[3],Hi[2] | ||
119 | |||
120 | { .mib; ld1 xi[2]=[Xi],-1 // Xi[15] | ||
121 | mov mask0xf0=0xf0 | ||
122 | brp.loop.imp .Loop1,.Lend1-16};; | ||
123 | { .mmi; ld1 xi[1]=[Xi],-1 // Xi[14] | ||
124 | };; | ||
125 | { .mii; shladd Hi[1]=xi[2],4,r0 | ||
126 | mov pr.rot=0x7<<16 | ||
127 | mov ar.lc=13 };; | ||
128 | { .mii; and Hi[1]=mask0xf0,Hi[1] | ||
129 | mov ar.ec=3 | ||
130 | xor Zlo=Zlo,Zlo };; | ||
131 | { .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo | ||
132 | add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp | ||
133 | xor Zhi=Zhi,Zhi };; | ||
134 | ___ | ||
135 | &loop (".Loop1",1); | ||
136 | $code.=<<___; | ||
137 | .Lend1: | ||
138 | { .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact | ||
139 | { .mib; mux1 Zlo=Zlo,\@rev };; | ||
140 | { .mib; mux1 Zhi=Zhi,\@rev };; | ||
141 | { .mmi; add Hlo=9,Xi;; // ;; is here to prevent | ||
142 | add Hhi=1,Xi };; // pipeline flush on Itanium | ||
143 | { .mib; st8 [Hlo]=Zlo | ||
144 | mov pr=prevpr,0x1ffff };; | ||
145 | { .mib; st8 [Hhi]=Zhi | ||
146 | mov ar.lc=prevlc | ||
147 | br.ret.sptk.many b0 };; | ||
148 | .endp gcm_gmult_4bit# | ||
149 | ___ | ||
150 | |||
151 | ###################################################################### | ||
152 | # "528B" (well, "512B" actually) streamed GHASH | ||
153 | # | ||
154 | $Xip="in0"; | ||
155 | $Htbl="in1"; | ||
156 | $inp="in2"; | ||
157 | $len="in3"; | ||
158 | $rem_8bit="loc0"; | ||
159 | $mask0xff="loc1"; | ||
160 | ($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum"); | ||
161 | |||
162 | sub load_htable() { | ||
163 | for (my $i=0;$i<8;$i++) { | ||
164 | $code.=<<___; | ||
165 | { .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi | ||
166 | ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo | ||
167 | { .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi | ||
168 | ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo | ||
169 | ___ | ||
170 | $code.=shift if (($i+$#_)==7); | ||
171 | $code.="\t};;\n" | ||
172 | } | ||
173 | } | ||
174 | |||
175 | $code.=<<___; | ||
176 | prevsp=r3; | ||
177 | |||
178 | .align 32 | ||
179 | .skip 16 // aligns loop body | ||
180 | .global gcm_ghash_4bit# | ||
181 | .proc gcm_ghash_4bit# | ||
182 | gcm_ghash_4bit: | ||
183 | .prologue | ||
184 | { .mmi; .save ar.pfs,prevfs | ||
185 | alloc prevfs=ar.pfs,4,2,0,0 | ||
186 | .vframe prevsp | ||
187 | mov prevsp=sp | ||
188 | mov $rem_8bit=ip };; | ||
189 | .body | ||
190 | { .mfi; $ADDP r8=0+0,$Htbl | ||
191 | $ADDP r9=0+8,$Htbl } | ||
192 | { .mfi; $ADDP r10=128+0,$Htbl | ||
193 | $ADDP r11=128+8,$Htbl };; | ||
194 | ___ | ||
195 | &load_htable( | ||
196 | " $ADDP $Xip=15,$Xip", # &Xi[15] | ||
197 | " $ADDP $len=$len,$inp", # &inp[len] | ||
198 | " $ADDP $inp=15,$inp", # &inp[15] | ||
199 | " mov $mask0xff=0xff", | ||
200 | " add sp=-512,sp", | ||
201 | " andcm sp=sp,$mask0xff", # align stack frame | ||
202 | " add r14=0,sp", | ||
203 | " add r15=8,sp"); | ||
204 | $code.=<<___; | ||
205 | { .mmi; $sum 1<<1 // go big-endian | ||
206 | add r8=256+0,sp | ||
207 | add r9=256+8,sp } | ||
208 | { .mmi; add r10=256+128+0,sp | ||
209 | add r11=256+128+8,sp | ||
210 | add $len=-17,$len };; | ||
211 | ___ | ||
212 | for($i=0;$i<8;$i++) { # generate first half of Hshr4[] | ||
213 | my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1)); | ||
214 | $code.=<<___; | ||
215 | { .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo | ||
216 | st8 [r9]=$rhi,16 // Htable[$i].hi | ||
217 | shrp $rlo=$rhi,$rlo,4 }//;; | ||
218 | { .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo | ||
219 | stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi | ||
220 | shr.u $rhi=$rhi,4 };; | ||
221 | { .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4 | ||
222 | st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4 | ||
223 | ___ | ||
224 | } | ||
225 | $code.=<<___; | ||
226 | { .mmi; ld8 r16=[r8],16 // Htable[8].lo | ||
227 | ld8 r17=[r9],16 };; // Htable[8].hi | ||
228 | { .mmi; ld8 r18=[r8],16 // Htable[9].lo | ||
229 | ld8 r19=[r9],16 } // Htable[9].hi | ||
230 | { .mmi; rum 1<<5 // clear um.mfh | ||
231 | shrp r16=r17,r16,4 };; | ||
232 | ___ | ||
233 | for($i=0;$i<6;$i++) { # generate second half of Hshr4[] | ||
234 | $code.=<<___; | ||
235 | { .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo | ||
236 | ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi | ||
237 | shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };; | ||
238 | { .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4 | ||
239 | st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4 | ||
240 | shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 } | ||
241 | ___ | ||
242 | } | ||
243 | $code.=<<___; | ||
244 | { .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };; | ||
245 | { .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4 | ||
246 | st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4 | ||
247 | shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 } | ||
248 | { .mmi; add $Htbl=256,sp // &Htable[0] | ||
249 | add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit | ||
250 | shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };; | ||
251 | { .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4 | ||
252 | st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4 | ||
253 | ___ | ||
254 | |||
255 | $in="r15"; | ||
256 | @xi=("r16","r17"); | ||
257 | @rem=("r18","r19"); | ||
258 | ($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25"); | ||
259 | ($Atbl,$Btbl)=("r26","r27"); | ||
260 | |||
261 | $code.=<<___; # (p16) | ||
262 | { .mmi; ld1 $in=[$inp],-1 //(p16) *inp-- | ||
263 | ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- | ||
264 | cmp.eq p0,p6=r0,r0 };; // clear p6 | ||
265 | ___ | ||
266 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
267 | |||
268 | $code.=<<___; # (p16),(p17) | ||
269 | { .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- | ||
270 | xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] | ||
271 | { .mii; ld1 $in=[$inp],-1 //(p16) *inp-- | ||
272 | dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo | ||
273 | and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 | ||
274 | .align 32 | ||
275 | .LOOP: | ||
276 | { .mmi; | ||
277 | (p6) st8 [$Xip]=$Zhi,13 | ||
278 | xor $Zlo=$Zlo,$Zlo | ||
279 | add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo | ||
280 | ___ | ||
281 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
282 | |||
283 | $code.=<<___; # (p16),(p17),(p18) | ||
284 | { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi | ||
285 | ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo | ||
286 | xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] | ||
287 | { .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi | ||
288 | dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo | ||
289 | { .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 | ||
290 | xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo | ||
291 | { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi | ||
292 | ld1 $in=[$inp],-1 } //(p16) *inp-- | ||
293 | { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) | ||
294 | mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi | ||
295 | and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 | ||
296 | { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi | ||
297 | ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- | ||
298 | shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) | ||
299 | { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff | ||
300 | add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] | ||
301 | ___ | ||
302 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
303 | |||
304 | for ($i=1;$i<14;$i++) { | ||
305 | # Above and below fragments are derived from this one by removing | ||
306 | # unsuitable (p??) instructions. | ||
307 | $code.=<<___; # (p16),(p17),(p18),(p19) | ||
308 | { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi | ||
309 | ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo | ||
310 | shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 | ||
311 | { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] | ||
312 | xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo | ||
313 | xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] | ||
314 | { .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi | ||
315 | ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] | ||
316 | dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo | ||
317 | { .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 | ||
318 | xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo | ||
319 | xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi | ||
320 | { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi | ||
321 | ld1 $in=[$inp],-1 //(p16) *inp-- | ||
322 | shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 | ||
323 | { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) | ||
324 | xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi | ||
325 | and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 | ||
326 | { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi | ||
327 | ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- | ||
328 | shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) | ||
329 | { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff | ||
330 | xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 | ||
331 | add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] | ||
332 | ___ | ||
333 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
334 | } | ||
335 | |||
336 | $code.=<<___; # (p17),(p18),(p19) | ||
337 | { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi | ||
338 | ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo | ||
339 | shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 | ||
340 | { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] | ||
341 | xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo | ||
342 | xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] | ||
343 | { .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi | ||
344 | ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] | ||
345 | dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo | ||
346 | { .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 | ||
347 | xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo | ||
348 | xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi | ||
349 | { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi | ||
350 | shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 | ||
351 | { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) | ||
352 | xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi | ||
353 | and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 | ||
354 | { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi | ||
355 | shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) | ||
356 | { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff | ||
357 | xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 | ||
358 | add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] | ||
359 | ___ | ||
360 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
361 | |||
362 | $code.=<<___; # (p18),(p19) | ||
363 | { .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi | ||
364 | shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 | ||
365 | { .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] | ||
366 | xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo | ||
367 | { .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi | ||
368 | xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo | ||
369 | { .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] | ||
370 | xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi | ||
371 | { .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi | ||
372 | shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 | ||
373 | { .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4 | ||
374 | xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi | ||
375 | { .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi | ||
376 | shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4) | ||
377 | { .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff | ||
378 | xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48 | ||
379 | ___ | ||
380 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
381 | |||
382 | $code.=<<___; # (p19) | ||
383 | { .mmi; cmp.ltu p6,p0=$inp,$len | ||
384 | add $inp=32,$inp | ||
385 | shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4 | ||
386 | { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] | ||
387 | xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo | ||
388 | add $Xip=9,$Xip };; // &Xi.lo | ||
389 | { .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] | ||
390 | (p6) ld1 $in=[$inp],-1 //[p16] *inp-- | ||
391 | (p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14] | ||
392 | { .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi | ||
393 | (p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15] | ||
394 | { .mmi; st8 [$Xip]=$Zlo,-8 | ||
395 | (p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i] | ||
396 | shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48 | ||
397 | { .mmi; | ||
398 | (p6) ld1 $in=[$inp],-1 //[p16] *inp-- | ||
399 | xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 | ||
400 | (p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo | ||
401 | { .mib; | ||
402 | (p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0 | ||
403 | (p6) br.cond.dptk.many .LOOP };; | ||
404 | |||
405 | { .mib; st8 [$Xip]=$Zhi };; | ||
406 | { .mib; $rum 1<<1 // return to little-endian | ||
407 | .restore sp | ||
408 | mov sp=prevsp | ||
409 | br.ret.sptk.many b0 };; | ||
410 | .endp gcm_ghash_4bit# | ||
411 | ___ | ||
412 | $code.=<<___; | ||
413 | .align 128 | ||
414 | .type rem_4bit#,\@object | ||
415 | rem_4bit: | ||
416 | data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48 | ||
417 | data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48 | ||
418 | data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48 | ||
419 | data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48 | ||
420 | .size rem_4bit#,128 | ||
421 | .type rem_8bit#,\@object | ||
422 | rem_8bit: | ||
423 | data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E | ||
424 | data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E | ||
425 | data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E | ||
426 | data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E | ||
427 | data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E | ||
428 | data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E | ||
429 | data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E | ||
430 | data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E | ||
431 | data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE | ||
432 | data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE | ||
433 | data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE | ||
434 | data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE | ||
435 | data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E | ||
436 | data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E | ||
437 | data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE | ||
438 | data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE | ||
439 | data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E | ||
440 | data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E | ||
441 | data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E | ||
442 | data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E | ||
443 | data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E | ||
444 | data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E | ||
445 | data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E | ||
446 | data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E | ||
447 | data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE | ||
448 | data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE | ||
449 | data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE | ||
450 | data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE | ||
451 | data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E | ||
452 | data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E | ||
453 | data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE | ||
454 | data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE | ||
455 | .size rem_8bit#,512 | ||
456 | stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>" | ||
457 | ___ | ||
458 | |||
459 | $code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian); | ||
460 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
461 | |||
462 | print $code; | ||
463 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/rc4/asm/rc4-ia64.pl b/src/lib/libcrypto/rc4/asm/rc4-ia64.pl deleted file mode 100644 index 49cd5b5e69..0000000000 --- a/src/lib/libcrypto/rc4/asm/rc4-ia64.pl +++ /dev/null | |||
@@ -1,755 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by David Mosberger <David.Mosberger@acm.org> based on the | ||
5 | # Itanium optimized Crypto code which was released by HP Labs at | ||
6 | # http://www.hpl.hp.com/research/linux/crypto/. | ||
7 | # | ||
8 | # Copyright (c) 2005 Hewlett-Packard Development Company, L.P. | ||
9 | # | ||
10 | # Permission is hereby granted, free of charge, to any person obtaining | ||
11 | # a copy of this software and associated documentation files (the | ||
12 | # "Software"), to deal in the Software without restriction, including | ||
13 | # without limitation the rights to use, copy, modify, merge, publish, | ||
14 | # distribute, sublicense, and/or sell copies of the Software, and to | ||
15 | # permit persons to whom the Software is furnished to do so, subject to | ||
16 | # the following conditions: | ||
17 | # | ||
18 | # The above copyright notice and this permission notice shall be | ||
19 | # included in all copies or substantial portions of the Software. | ||
20 | |||
21 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
22 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
23 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
24 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE | ||
25 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | ||
26 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION | ||
27 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ | ||
28 | |||
29 | |||
30 | |||
31 | # This is a little helper program which generates a software-pipelined | ||
32 | # loop for RC4 encryption. The basic algorithm looks like this: | ||
33 | # | ||
34 | # for (counter = 0; counter < len; ++counter) | ||
35 | # { | ||
36 | # in = inp[counter]; | ||
37 | # SI = S[I]; | ||
38 | # J = (SI + J) & 0xff; | ||
39 | # SJ = S[J]; | ||
40 | # T = (SI + SJ) & 0xff; | ||
41 | # S[I] = SJ, S[J] = SI; | ||
42 | # ST = S[T]; | ||
43 | # outp[counter] = in ^ ST; | ||
44 | # I = (I + 1) & 0xff; | ||
45 | # } | ||
46 | # | ||
47 | # Pipelining this loop isn't easy, because the stores to the S[] array | ||
48 | # need to be observed in the right order. The loop generated by the | ||
49 | # code below has the following pipeline diagram: | ||
50 | # | ||
51 | # cycle | ||
52 | # | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 | | ||
53 | # iter | ||
54 | # 1: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx | ||
55 | # 2: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx | ||
56 | # 3: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx | ||
57 | # | ||
58 | # where: | ||
59 | # LDI = load of S[I] | ||
60 | # LDJ = load of S[J] | ||
61 | # SWP = swap of S[I] and S[J] | ||
62 | # LDT = load of S[T] | ||
63 | # | ||
64 | # Note that in the above diagram, the major trouble-spot is that LDI | ||
65 | # of the 2nd iteration is performed BEFORE the SWP of the first | ||
66 | # iteration. Fortunately, this is easy to detect (I of the 1st | ||
67 | # iteration will be equal to J of the 2nd iteration) and when this | ||
68 | # happens, we simply forward the proper value from the 1st iteration | ||
69 | # to the 2nd one. The proper value in this case is simply the value | ||
70 | # of S[I] from the first iteration (thanks to the fact that SWP | ||
71 | # simply swaps the contents of S[I] and S[J]). | ||
72 | # | ||
73 | # Another potential trouble-spot is in cycle 7, where SWP of the 1st | ||
74 | # iteration issues at the same time as the LDI of the 3rd iteration. | ||
75 | # However, thanks to IA-64 execution semantics, this can be taken | ||
76 | # care of simply by placing LDI later in the instruction-group than | ||
77 | # SWP. IA-64 CPUs will automatically forward the value if they | ||
78 | # detect that the SWP and LDI are accessing the same memory-location. | ||
79 | |||
80 | # The core-loop that can be pipelined then looks like this (annotated | ||
81 | # with McKinley/Madison issue port & latency numbers, assuming L1 | ||
82 | # cache hits for the most part): | ||
83 | |||
84 | # operation: instruction: issue-ports: latency | ||
85 | # ------------------ ----------------------------- ------------- ------- | ||
86 | |||
87 | # Data = *inp++ ld1 data = [inp], 1 M0-M1 1 cyc c0 | ||
88 | # shladd Iptr = I, KeyTable, 3 M0-M3, I0, I1 1 cyc | ||
89 | # I = (I + 1) & 0xff padd1 nextI = I, one M0-M3, I0, I1 3 cyc | ||
90 | # ;; | ||
91 | # SI = S[I] ld8 SI = [Iptr] M0-M1 1 cyc c1 * after SWAP! | ||
92 | # ;; | ||
93 | # cmp.eq.unc pBypass = I, J * after J is valid! | ||
94 | # J = SI + J add J = J, SI M0-M3, I0, I1 1 cyc c2 | ||
95 | # (pBypass) br.cond.spnt Bypass | ||
96 | # ;; | ||
97 | # --------------------------------------------------------------------------------------- | ||
98 | # J = J & 0xff zxt1 J = J I0, I1, 1 cyc c3 | ||
99 | # ;; | ||
100 | # shladd Jptr = J, KeyTable, 3 M0-M3, I0, I1 1 cyc c4 | ||
101 | # ;; | ||
102 | # SJ = S[J] ld8 SJ = [Jptr] M0-M1 1 cyc c5 | ||
103 | # ;; | ||
104 | # --------------------------------------------------------------------------------------- | ||
105 | # T = (SI + SJ) add T = SI, SJ M0-M3, I0, I1 1 cyc c6 | ||
106 | # ;; | ||
107 | # T = T & 0xff zxt1 T = T I0, I1 1 cyc | ||
108 | # S[I] = SJ st8 [Iptr] = SJ M2-M3 c7 | ||
109 | # S[J] = SI st8 [Jptr] = SI M2-M3 | ||
110 | # ;; | ||
111 | # shladd Tptr = T, KeyTable, 3 M0-M3, I0, I1 1 cyc c8 | ||
112 | # ;; | ||
113 | # --------------------------------------------------------------------------------------- | ||
114 | # T = S[T] ld8 T = [Tptr] M0-M1 1 cyc c9 | ||
115 | # ;; | ||
116 | # data ^= T xor data = data, T M0-M3, I0, I1 1 cyc c10 | ||
117 | # ;; | ||
118 | # *out++ = Data ^ T dep word = word, data, 8, POS I0, I1 1 cyc c11 | ||
119 | # ;; | ||
120 | # --------------------------------------------------------------------------------------- | ||
121 | |||
122 | # There are several points worth making here: | ||
123 | |||
124 | # - Note that due to the bypass/forwarding-path, the first two | ||
125 | # phases of the loop are strangely mingled together. In | ||
126 | # particular, note that the first stage of the pipeline is | ||
127 | # using the value of "J", as calculated by the second stage. | ||
128 | # - Each bundle-pair will have exactly 6 instructions. | ||
129 | # - Pipelined, the loop can execute in 3 cycles/iteration and | ||
130 | # 4 stages. However, McKinley/Madison can issue "st1" to | ||
131 | # the same bank at a rate of at most one per 4 cycles. Thus, | ||
132 | # instead of storing each byte, we accumulate them in a word | ||
133 | # and then write them back at once with a single "st8" (this | ||
134 | # implies that the setup code needs to ensure that the output | ||
135 | # buffer is properly aligned, if need be, by encoding the | ||
136 | # first few bytes separately). | ||
137 | # - There is no space for a "br.ctop" instruction. For this | ||
138 | # reason we can't use modulo-loop support in IA-64 and have | ||
139 | # to do a traditional, purely software-pipelined loop. | ||
140 | # - We can't replace any of the remaining "add/zxt1" pairs with | ||
141 | # "padd1" because the latency for that instruction is too high | ||
142 | # and would push the loop to the point where more bypasses | ||
143 | # would be needed, which we don't have space for. | ||
144 | # - The above loop runs at around 3.26 cycles/byte, or roughly | ||
145 | # 440 MByte/sec on a 1.5GHz Madison. This is well below the | ||
146 | # system bus bandwidth and hence with judicious use of | ||
147 | # "lfetch" this loop can run at (almost) peak speed even when | ||
148 | # the input and output data reside in memory. The | ||
149 | # max. latency that can be tolerated is (PREFETCH_DISTANCE * | ||
150 | # L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at | ||
151 | # least) 1-ahead prefetching of 128 byte cache-lines. Note | ||
152 | # that we do NOT prefetch into L1, since that would only | ||
153 | # interfere with the S[] table values stored there. This is | ||
154 | # acceptable because there is a 10 cycle latency between | ||
155 | # load and first use of the input data. | ||
156 | # - We use a branch to out-of-line bypass-code because of cycle-pressure: | ||
157 | # we calculate the next J, check for the need to activate the | ||
158 | # bypass path, and activate the bypass path ALL IN THE SAME | ||
159 | # CYCLE. If we didn't have these constraints, we could do | ||
160 | # the bypass with a simple conditional move instruction. | ||
161 | # Fortunately, the bypass paths get activated relatively | ||
162 | # infrequently, so the extra branches don't cost all that much | ||
163 | # (about 0.04 cycles/byte, measured on a 16396 byte file with | ||
164 | # random input data). | ||
165 | # | ||
166 | |||
167 | $phases = 4; # number of stages/phases in the pipelined-loop | ||
168 | $unroll_count = 6; # number of times we unrolled it | ||
169 | $pComI = (1 << 0); | ||
170 | $pComJ = (1 << 1); | ||
171 | $pComT = (1 << 2); | ||
172 | $pOut = (1 << 3); | ||
173 | |||
174 | $NData = 4; | ||
175 | $NIP = 3; | ||
176 | $NJP = 2; | ||
177 | $NI = 2; | ||
178 | $NSI = 3; | ||
179 | $NSJ = 2; | ||
180 | $NT = 2; | ||
181 | $NOutWord = 2; | ||
182 | |||
183 | # | ||
184 | # $threshold is the minimum length before we attempt to use the | ||
185 | # big software-pipelined loop. It MUST be greater-or-equal | ||
186 | # to: | ||
187 | # PHASES * (UNROLL_COUNT + 1) + 7 | ||
188 | # | ||
189 | # The "+ 7" comes from the fact we may have to encode up to | ||
190 | # 7 bytes separately before the output pointer is aligned. | ||
191 | # | ||
192 | $threshold = (3 * ($phases * ($unroll_count + 1)) + 7); | ||
193 | |||
194 | sub I { | ||
195 | local *code = shift; | ||
196 | local $format = shift; | ||
197 | $code .= sprintf ("\t\t".$format."\n", @_); | ||
198 | } | ||
199 | |||
200 | sub P { | ||
201 | local *code = shift; | ||
202 | local $format = shift; | ||
203 | $code .= sprintf ($format."\n", @_); | ||
204 | } | ||
205 | |||
206 | sub STOP { | ||
207 | local *code = shift; | ||
208 | $code .=<<___; | ||
209 | ;; | ||
210 | ___ | ||
211 | } | ||
212 | |||
213 | sub emit_body { | ||
214 | local *c = shift; | ||
215 | local *bypass = shift; | ||
216 | local ($iteration, $p) = @_; | ||
217 | |||
218 | local $i0 = $iteration; | ||
219 | local $i1 = $iteration - 1; | ||
220 | local $i2 = $iteration - 2; | ||
221 | local $i3 = $iteration - 3; | ||
222 | local $iw0 = ($iteration - 3) / 8; | ||
223 | local $iw1 = ($iteration > 3) ? ($iteration - 4) / 8 : 1; | ||
224 | local $byte_num = ($iteration - 3) % 8; | ||
225 | local $label = $iteration + 1; | ||
226 | local $pAny = ($p & 0xf) == 0xf; | ||
227 | local $pByp = (($p & $pComI) && ($iteration > 0)); | ||
228 | |||
229 | $c.=<<___; | ||
230 | ////////////////////////////////////////////////// | ||
231 | ___ | ||
232 | |||
233 | if (($p & 0xf) == 0) { | ||
234 | $c.="#ifdef HOST_IS_BIG_ENDIAN\n"; | ||
235 | &I(\$c,"shr.u OutWord[%u] = OutWord[%u], 32;;", | ||
236 | $iw1 % $NOutWord, $iw1 % $NOutWord); | ||
237 | $c.="#endif\n"; | ||
238 | &I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord); | ||
239 | return; | ||
240 | } | ||
241 | |||
242 | # Cycle 0 | ||
243 | &I(\$c, "{ .mmi") if ($pAny); | ||
244 | &I(\$c, "ld1 Data[%u] = [InPtr], 1", $i0 % $NData) if ($p & $pComI); | ||
245 | &I(\$c, "padd1 I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI)if ($p & $pComI); | ||
246 | &I(\$c, "zxt1 J = J") if ($p & $pComJ); | ||
247 | &I(\$c, "}") if ($pAny); | ||
248 | &I(\$c, "{ .mmi") if ($pAny); | ||
249 | &I(\$c, "LKEY T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT) if ($p & $pOut); | ||
250 | &I(\$c, "add T[%u] = SI[%u], SJ[%u]", | ||
251 | $i0 % $NT, $i2 % $NSI, $i1 % $NSJ) if ($p & $pComT); | ||
252 | &I(\$c, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI) if ($p & $pComI); | ||
253 | &I(\$c, "}") if ($pAny); | ||
254 | &STOP(\$c); | ||
255 | |||
256 | # Cycle 1 | ||
257 | &I(\$c, "{ .mmi") if ($pAny); | ||
258 | &I(\$c, "SKEY [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1%$NSJ)if ($p & $pComT); | ||
259 | &I(\$c, "SKEY [JP[%u]] = SI[%u]", $i1 % $NJP, $i2%$NSI) if ($p & $pComT); | ||
260 | &I(\$c, "zxt1 T[%u] = T[%u]", $i0 % $NT, $i0 % $NT) if ($p & $pComT); | ||
261 | &I(\$c, "}") if ($pAny); | ||
262 | &I(\$c, "{ .mmi") if ($pAny); | ||
263 | &I(\$c, "LKEY SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0%$NIP)if ($p & $pComI); | ||
264 | &I(\$c, "KEYADDR(JP[%u], J)", $i0 % $NJP) if ($p & $pComJ); | ||
265 | &I(\$c, "xor Data[%u] = Data[%u], T[%u]", | ||
266 | $i3 % $NData, $i3 % $NData, $i1 % $NT) if ($p & $pOut); | ||
267 | &I(\$c, "}") if ($pAny); | ||
268 | &STOP(\$c); | ||
269 | |||
270 | # Cycle 2 | ||
271 | &I(\$c, "{ .mmi") if ($pAny); | ||
272 | &I(\$c, "LKEY SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0%$NJP) if ($p & $pComJ); | ||
273 | &I(\$c, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI) if ($pByp); | ||
274 | &I(\$c, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8", | ||
275 | $iw0%$NOutWord, $i3%$NData, $iw1%$NOutWord, $byte_num) if ($p & $pOut); | ||
276 | &I(\$c, "}") if ($pAny); | ||
277 | &I(\$c, "{ .mmb") if ($pAny); | ||
278 | &I(\$c, "add J = J, SI[%u]", $i0 % $NSI) if ($p & $pComI); | ||
279 | &I(\$c, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT) if ($p & $pComT); | ||
280 | &P(\$c, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u",$label)if ($pByp); | ||
281 | &I(\$c, "}") if ($pAny); | ||
282 | &STOP(\$c); | ||
283 | |||
284 | &P(\$c, ".rc4Resume%u:", $label) if ($pByp); | ||
285 | if ($byte_num == 0 && $iteration >= $phases) { | ||
286 | &I(\$c, "st8 [OutPtr] = OutWord[%u], 8", | ||
287 | $iw1 % $NOutWord) if ($p & $pOut); | ||
288 | if ($iteration == (1 + $unroll_count) * $phases - 1) { | ||
289 | if ($unroll_count == 6) { | ||
290 | &I(\$c, "mov OutWord[%u] = OutWord[%u]", | ||
291 | $iw1 % $NOutWord, $iw0 % $NOutWord); | ||
292 | } | ||
293 | &I(\$c, "lfetch.nt1 [InPrefetch], %u", | ||
294 | $unroll_count * $phases); | ||
295 | &I(\$c, "lfetch.excl.nt1 [OutPrefetch], %u", | ||
296 | $unroll_count * $phases); | ||
297 | &I(\$c, "br.cloop.sptk.few .rc4Loop"); | ||
298 | } | ||
299 | } | ||
300 | |||
301 | if ($pByp) { | ||
302 | &P(\$bypass, ".rc4Bypass%u:", $label); | ||
303 | &I(\$bypass, "sub J = J, SI[%u]", $i0 % $NSI); | ||
304 | &I(\$bypass, "nop 0"); | ||
305 | &I(\$bypass, "nop 0"); | ||
306 | &I(\$bypass, ";;"); | ||
307 | &I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI); | ||
308 | &I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI); | ||
309 | &I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label); | ||
310 | &I(\$bypass, ";;"); | ||
311 | } | ||
312 | } | ||
313 | |||
314 | $code=<<___; | ||
315 | .ident \"rc4-ia64.s, version 3.0\" | ||
316 | .ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\" | ||
317 | |||
318 | #define LCSave r8 | ||
319 | #define PRSave r9 | ||
320 | |||
321 | /* Inputs become invalid once rotation begins! */ | ||
322 | |||
323 | #define StateTable in0 | ||
324 | #define DataLen in1 | ||
325 | #define InputBuffer in2 | ||
326 | #define OutputBuffer in3 | ||
327 | |||
328 | #define KTable r14 | ||
329 | #define J r15 | ||
330 | #define InPtr r16 | ||
331 | #define OutPtr r17 | ||
332 | #define InPrefetch r18 | ||
333 | #define OutPrefetch r19 | ||
334 | #define One r20 | ||
335 | #define LoopCount r21 | ||
336 | #define Remainder r22 | ||
337 | #define IFinal r23 | ||
338 | #define EndPtr r24 | ||
339 | |||
340 | #define tmp0 r25 | ||
341 | #define tmp1 r26 | ||
342 | |||
343 | #define pBypass p6 | ||
344 | #define pDone p7 | ||
345 | #define pSmall p8 | ||
346 | #define pAligned p9 | ||
347 | #define pUnaligned p10 | ||
348 | |||
349 | #define pComputeI pPhase[0] | ||
350 | #define pComputeJ pPhase[1] | ||
351 | #define pComputeT pPhase[2] | ||
352 | #define pOutput pPhase[3] | ||
353 | |||
354 | #define RetVal r8 | ||
355 | #define L_OK p7 | ||
356 | #define L_NOK p8 | ||
357 | |||
358 | #define _NINPUTS 4 | ||
359 | #define _NOUTPUT 0 | ||
360 | |||
361 | #define _NROTATE 24 | ||
362 | #define _NLOCALS (_NROTATE - _NINPUTS - _NOUTPUT) | ||
363 | |||
364 | #ifndef SZ | ||
365 | # define SZ 4 // this must be set to sizeof(RC4_INT) | ||
366 | #endif | ||
367 | |||
368 | #if SZ == 1 | ||
369 | # define LKEY ld1 | ||
370 | # define SKEY st1 | ||
371 | # define KEYADDR(dst, i) add dst = i, KTable | ||
372 | #elif SZ == 2 | ||
373 | # define LKEY ld2 | ||
374 | # define SKEY st2 | ||
375 | # define KEYADDR(dst, i) shladd dst = i, 1, KTable | ||
376 | #elif SZ == 4 | ||
377 | # define LKEY ld4 | ||
378 | # define SKEY st4 | ||
379 | # define KEYADDR(dst, i) shladd dst = i, 2, KTable | ||
380 | #else | ||
381 | # define LKEY ld8 | ||
382 | # define SKEY st8 | ||
383 | # define KEYADDR(dst, i) shladd dst = i, 3, KTable | ||
384 | #endif | ||
385 | |||
386 | #if defined(_HPUX_SOURCE) && !defined(_LP64) | ||
387 | # define ADDP addp4 | ||
388 | #else | ||
389 | # define ADDP add | ||
390 | #endif | ||
391 | |||
392 | /* Define a macro for the bit number of the n-th byte: */ | ||
393 | |||
394 | #if defined(_HPUX_SOURCE) || defined(B_ENDIAN) | ||
395 | # define HOST_IS_BIG_ENDIAN | ||
396 | # define BYTE_POS(n) (56 - (8 * (n))) | ||
397 | #else | ||
398 | # define BYTE_POS(n) (8 * (n)) | ||
399 | #endif | ||
400 | |||
401 | /* | ||
402 | We must perform the first phase of the pipeline explicitly since | ||
403 | we will always load from the S[] table the first time. The br.cexit | ||
404 | will never be taken, regardless of the number of bytes, because | ||
405 | the epilogue count is 4. | ||
406 | */ | ||
407 | /* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX | ||
408 | assembler failed on original macro with syntax error. <appro> */ | ||
409 | #define MODSCHED_RC4_PROLOGUE \\ | ||
410 | { \\ | ||
411 | ld1 Data[0] = [InPtr], 1; \\ | ||
412 | add IFinal = 1, I[1]; \\ | ||
413 | KEYADDR(IPr[0], I[1]); \\ | ||
414 | } ;; \\ | ||
415 | { \\ | ||
416 | LKEY SI[0] = [IPr[0]]; \\ | ||
417 | mov pr.rot = 0x10000; \\ | ||
418 | mov ar.ec = 4; \\ | ||
419 | } ;; \\ | ||
420 | { \\ | ||
421 | add J = J, SI[0]; \\ | ||
422 | zxt1 I[0] = IFinal; \\ | ||
423 | br.cexit.spnt.few .+16; /* never taken */ \\ | ||
424 | } ;; | ||
425 | #define MODSCHED_RC4_LOOP(label) \\ | ||
426 | label: \\ | ||
427 | { .mmi; \\ | ||
428 | (pComputeI) ld1 Data[0] = [InPtr], 1; \\ | ||
429 | (pComputeI) add IFinal = 1, I[1]; \\ | ||
430 | (pComputeJ) zxt1 J = J; \\ | ||
431 | }{ .mmi; \\ | ||
432 | (pOutput) LKEY T[1] = [T[1]]; \\ | ||
433 | (pComputeT) add T[0] = SI[2], SJ[1]; \\ | ||
434 | (pComputeI) KEYADDR(IPr[0], I[1]); \\ | ||
435 | } ;; \\ | ||
436 | { .mmi; \\ | ||
437 | (pComputeT) SKEY [IPr[2]] = SJ[1]; \\ | ||
438 | (pComputeT) SKEY [JP[1]] = SI[2]; \\ | ||
439 | (pComputeT) zxt1 T[0] = T[0]; \\ | ||
440 | }{ .mmi; \\ | ||
441 | (pComputeI) LKEY SI[0] = [IPr[0]]; \\ | ||
442 | (pComputeJ) KEYADDR(JP[0], J); \\ | ||
443 | (pComputeI) cmp.eq.unc pBypass, p0 = I[1], J; \\ | ||
444 | } ;; \\ | ||
445 | { .mmi; \\ | ||
446 | (pComputeJ) LKEY SJ[0] = [JP[0]]; \\ | ||
447 | (pOutput) xor Data[3] = Data[3], T[1]; \\ | ||
448 | nop 0x0; \\ | ||
449 | }{ .mmi; \\ | ||
450 | (pComputeT) KEYADDR(T[0], T[0]); \\ | ||
451 | (pBypass) mov SI[0] = SI[1]; \\ | ||
452 | (pComputeI) zxt1 I[0] = IFinal; \\ | ||
453 | } ;; \\ | ||
454 | { .mmb; \\ | ||
455 | (pOutput) st1 [OutPtr] = Data[3], 1; \\ | ||
456 | (pComputeI) add J = J, SI[0]; \\ | ||
457 | br.ctop.sptk.few label; \\ | ||
458 | } ;; | ||
459 | |||
460 | .text | ||
461 | |||
462 | .align 32 | ||
463 | |||
464 | .type RC4, \@function | ||
465 | .global RC4 | ||
466 | |||
467 | .proc RC4 | ||
468 | .prologue | ||
469 | |||
470 | RC4: | ||
471 | { | ||
472 | .mmi | ||
473 | alloc r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE | ||
474 | |||
475 | .rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\ | ||
476 | OutWord[2] | ||
477 | .rotp pPhase[4] | ||
478 | |||
479 | ADDP InPrefetch = 0, InputBuffer | ||
480 | ADDP KTable = 0, StateTable | ||
481 | } | ||
482 | { | ||
483 | .mmi | ||
484 | ADDP InPtr = 0, InputBuffer | ||
485 | ADDP OutPtr = 0, OutputBuffer | ||
486 | mov RetVal = r0 | ||
487 | } | ||
488 | ;; | ||
489 | { | ||
490 | .mmi | ||
491 | lfetch.nt1 [InPrefetch], 0x80 | ||
492 | ADDP OutPrefetch = 0, OutputBuffer | ||
493 | } | ||
494 | { // Return 0 if the input length is nonsensical | ||
495 | .mib | ||
496 | ADDP StateTable = 0, StateTable | ||
497 | cmp.ge.unc L_NOK, L_OK = r0, DataLen | ||
498 | (L_NOK) br.ret.sptk.few rp | ||
499 | } | ||
500 | ;; | ||
501 | { | ||
502 | .mib | ||
503 | cmp.eq.or L_NOK, L_OK = r0, InPtr | ||
504 | cmp.eq.or L_NOK, L_OK = r0, OutPtr | ||
505 | nop 0x0 | ||
506 | } | ||
507 | { | ||
508 | .mib | ||
509 | cmp.eq.or L_NOK, L_OK = r0, StateTable | ||
510 | nop 0x0 | ||
511 | (L_NOK) br.ret.sptk.few rp | ||
512 | } | ||
513 | ;; | ||
514 | LKEY I[1] = [KTable], SZ | ||
515 | /* Prefetch the state-table. It contains 256 elements of size SZ */ | ||
516 | |||
517 | #if SZ == 1 | ||
518 | ADDP tmp0 = 1*128, StateTable | ||
519 | #elif SZ == 2 | ||
520 | ADDP tmp0 = 3*128, StateTable | ||
521 | ADDP tmp1 = 2*128, StateTable | ||
522 | #elif SZ == 4 | ||
523 | ADDP tmp0 = 7*128, StateTable | ||
524 | ADDP tmp1 = 6*128, StateTable | ||
525 | #elif SZ == 8 | ||
526 | ADDP tmp0 = 15*128, StateTable | ||
527 | ADDP tmp1 = 14*128, StateTable | ||
528 | #endif | ||
529 | ;; | ||
530 | #if SZ >= 8 | ||
531 | lfetch.fault.nt1 [tmp0], -256 // 15 | ||
532 | lfetch.fault.nt1 [tmp1], -256;; | ||
533 | lfetch.fault.nt1 [tmp0], -256 // 13 | ||
534 | lfetch.fault.nt1 [tmp1], -256;; | ||
535 | lfetch.fault.nt1 [tmp0], -256 // 11 | ||
536 | lfetch.fault.nt1 [tmp1], -256;; | ||
537 | lfetch.fault.nt1 [tmp0], -256 // 9 | ||
538 | lfetch.fault.nt1 [tmp1], -256;; | ||
539 | #endif | ||
540 | #if SZ >= 4 | ||
541 | lfetch.fault.nt1 [tmp0], -256 // 7 | ||
542 | lfetch.fault.nt1 [tmp1], -256;; | ||
543 | lfetch.fault.nt1 [tmp0], -256 // 5 | ||
544 | lfetch.fault.nt1 [tmp1], -256;; | ||
545 | #endif | ||
546 | #if SZ >= 2 | ||
547 | lfetch.fault.nt1 [tmp0], -256 // 3 | ||
548 | lfetch.fault.nt1 [tmp1], -256;; | ||
549 | #endif | ||
550 | { | ||
551 | .mii | ||
552 | lfetch.fault.nt1 [tmp0] // 1 | ||
553 | add I[1]=1,I[1];; | ||
554 | zxt1 I[1]=I[1] | ||
555 | } | ||
556 | { | ||
557 | .mmi | ||
558 | lfetch.nt1 [InPrefetch], 0x80 | ||
559 | lfetch.excl.nt1 [OutPrefetch], 0x80 | ||
560 | .save pr, PRSave | ||
561 | mov PRSave = pr | ||
562 | } ;; | ||
563 | { | ||
564 | .mmi | ||
565 | lfetch.excl.nt1 [OutPrefetch], 0x80 | ||
566 | LKEY J = [KTable], SZ | ||
567 | ADDP EndPtr = DataLen, InPtr | ||
568 | } ;; | ||
569 | { | ||
570 | .mmi | ||
571 | ADDP EndPtr = -1, EndPtr // Make it point to | ||
572 | // last data byte. | ||
573 | mov One = 1 | ||
574 | .save ar.lc, LCSave | ||
575 | mov LCSave = ar.lc | ||
576 | .body | ||
577 | } ;; | ||
578 | { | ||
579 | .mmb | ||
580 | sub Remainder = 0, OutPtr | ||
581 | cmp.gtu pSmall, p0 = $threshold, DataLen | ||
582 | (pSmall) br.cond.dpnt .rc4Remainder // Data too small for | ||
583 | // big loop. | ||
584 | } ;; | ||
585 | { | ||
586 | .mmi | ||
587 | and Remainder = 0x7, Remainder | ||
588 | ;; | ||
589 | cmp.eq pAligned, pUnaligned = Remainder, r0 | ||
590 | nop 0x0 | ||
591 | } ;; | ||
592 | { | ||
593 | .mmb | ||
594 | .pred.rel "mutex",pUnaligned,pAligned | ||
595 | (pUnaligned) add Remainder = -1, Remainder | ||
596 | (pAligned) sub Remainder = EndPtr, InPtr | ||
597 | (pAligned) br.cond.dptk.many .rc4Aligned | ||
598 | } ;; | ||
599 | { | ||
600 | .mmi | ||
601 | nop 0x0 | ||
602 | nop 0x0 | ||
603 | mov.i ar.lc = Remainder | ||
604 | } | ||
605 | |||
606 | /* Do the initial few bytes via the compact, modulo-scheduled loop | ||
607 | until the output pointer is 8-byte-aligned. */ | ||
608 | |||
609 | MODSCHED_RC4_PROLOGUE | ||
610 | MODSCHED_RC4_LOOP(.RC4AlignLoop) | ||
611 | |||
612 | { | ||
613 | .mib | ||
614 | sub Remainder = EndPtr, InPtr | ||
615 | zxt1 IFinal = IFinal | ||
616 | clrrrb // Clear CFM.rrb.pr so | ||
617 | ;; // next "mov pr.rot = N" | ||
618 | // does the right thing. | ||
619 | } | ||
620 | { | ||
621 | .mmi | ||
622 | mov I[1] = IFinal | ||
623 | nop 0x0 | ||
624 | nop 0x0 | ||
625 | } ;; | ||
626 | |||
627 | |||
628 | .rc4Aligned: | ||
629 | |||
630 | /* | ||
631 | Unrolled loop count = (Remainder - ($unroll_count+1)*$phases)/($unroll_count*$phases) | ||
632 | */ | ||
633 | |||
634 | { | ||
635 | .mlx | ||
636 | add LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder | ||
637 | movl Remainder = 0xaaaaaaaaaaaaaaab | ||
638 | } ;; | ||
639 | { | ||
640 | .mmi | ||
641 | setf.sig f6 = LoopCount // M2, M3 6 cyc | ||
642 | setf.sig f7 = Remainder // M2, M3 6 cyc | ||
643 | nop 0x0 | ||
644 | } ;; | ||
645 | { | ||
646 | .mfb | ||
647 | nop 0x0 | ||
648 | xmpy.hu f6 = f6, f7 | ||
649 | nop 0x0 | ||
650 | } ;; | ||
651 | { | ||
652 | .mmi | ||
653 | getf.sig LoopCount = f6;; // M2 5 cyc | ||
654 | nop 0x0 | ||
655 | shr.u LoopCount = LoopCount, 4 | ||
656 | } ;; | ||
657 | { | ||
658 | .mmi | ||
659 | nop 0x0 | ||
660 | nop 0x0 | ||
661 | mov.i ar.lc = LoopCount | ||
662 | } ;; | ||
663 | |||
664 | /* Now comes the unrolled loop: */ | ||
665 | |||
666 | .rc4Prologue: | ||
667 | ___ | ||
668 | |||
669 | $iteration = 0; | ||
670 | |||
671 | # Generate the prologue: | ||
672 | $predicates = 1; | ||
673 | for ($i = 0; $i < $phases; ++$i) { | ||
674 | &emit_body (\$code, \$bypass, $iteration++, $predicates); | ||
675 | $predicates = ($predicates << 1) | 1; | ||
676 | } | ||
677 | |||
678 | $code.=<<___; | ||
679 | .rc4Loop: | ||
680 | ___ | ||
681 | |||
682 | # Generate the body: | ||
683 | for ($i = 0; $i < $unroll_count*$phases; ++$i) { | ||
684 | &emit_body (\$code, \$bypass, $iteration++, $predicates); | ||
685 | } | ||
686 | |||
687 | $code.=<<___; | ||
688 | .rc4Epilogue: | ||
689 | ___ | ||
690 | |||
691 | # Generate the epilogue: | ||
692 | for ($i = 0; $i < $phases; ++$i) { | ||
693 | $predicates <<= 1; | ||
694 | &emit_body (\$code, \$bypass, $iteration++, $predicates); | ||
695 | } | ||
696 | |||
697 | $code.=<<___; | ||
698 | { | ||
699 | .mmi | ||
700 | lfetch.nt1 [EndPtr] // fetch line with last byte | ||
701 | mov IFinal = I[1] | ||
702 | nop 0x0 | ||
703 | } | ||
704 | |||
705 | .rc4Remainder: | ||
706 | { | ||
707 | .mmi | ||
708 | sub Remainder = EndPtr, InPtr // Calculate | ||
709 | // # of bytes | ||
710 | // left - 1 | ||
711 | nop 0x0 | ||
712 | nop 0x0 | ||
713 | } ;; | ||
714 | { | ||
715 | .mib | ||
716 | cmp.eq pDone, p0 = -1, Remainder // done already? | ||
717 | mov.i ar.lc = Remainder | ||
718 | (pDone) br.cond.dptk.few .rc4Complete | ||
719 | } | ||
720 | |||
721 | /* Do the remaining bytes via the compact, modulo-scheduled loop */ | ||
722 | |||
723 | MODSCHED_RC4_PROLOGUE | ||
724 | MODSCHED_RC4_LOOP(.RC4RestLoop) | ||
725 | |||
726 | .rc4Complete: | ||
727 | { | ||
728 | .mmi | ||
729 | add KTable = -SZ, KTable | ||
730 | add IFinal = -1, IFinal | ||
731 | mov ar.lc = LCSave | ||
732 | } ;; | ||
733 | { | ||
734 | .mii | ||
735 | SKEY [KTable] = J,-SZ | ||
736 | zxt1 IFinal = IFinal | ||
737 | mov pr = PRSave, 0x1FFFF | ||
738 | } ;; | ||
739 | { | ||
740 | .mib | ||
741 | SKEY [KTable] = IFinal | ||
742 | add RetVal = 1, r0 | ||
743 | br.ret.sptk.few rp | ||
744 | } ;; | ||
745 | ___ | ||
746 | |||
747 | # Last but not least, emit the code for the bypass-code of the unrolled loop: | ||
748 | |||
749 | $code.=$bypass; | ||
750 | |||
751 | $code.=<<___; | ||
752 | .endp RC4 | ||
753 | ___ | ||
754 | |||
755 | print $code; | ||
diff --git a/src/lib/libcrypto/sha/asm/sha1-ia64.pl b/src/lib/libcrypto/sha/asm/sha1-ia64.pl deleted file mode 100644 index 02d35d1614..0000000000 --- a/src/lib/libcrypto/sha/asm/sha1-ia64.pl +++ /dev/null | |||
@@ -1,305 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # Eternal question is what's wrong with compiler generated code? The | ||
11 | # trick is that it's possible to reduce the number of shifts required | ||
12 | # to perform rotations by maintaining copy of 32-bit value in upper | ||
13 | # bits of 64-bit register. Just follow mux2 and shrp instructions... | ||
14 | # Performance under big-endian OS such as HP-UX is 179MBps*1GHz, which | ||
15 | # is >50% better than HP C and >2x better than gcc. | ||
16 | |||
17 | $code=<<___; | ||
18 | .ident \"sha1-ia64.s, version 1.3\" | ||
19 | .ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\" | ||
20 | .explicit | ||
21 | |||
22 | ___ | ||
23 | |||
24 | |||
25 | if ($^O eq "hpux") { | ||
26 | $ADDP="addp4"; | ||
27 | for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); } # plain add only when a 64-bit flag (+DD64 or -mlp64) is passed | ||
28 | } else { $ADDP="add"; } | ||
29 | |||
30 | #$human=1; | ||
31 | if ($human) { # useful for visual code auditing... | ||
32 | ($A,$B,$C,$D,$E) = ("A","B","C","D","E"); | ||
33 | ($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4"); | ||
34 | ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = | ||
35 | ( "K_00_19","K_20_39","K_40_59","K_60_79" ); | ||
36 | @X= ( "X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", | ||
37 | "X8", "X9","X10","X11","X12","X13","X14","X15" ); | ||
38 | } | ||
39 | else { | ||
40 | ($A,$B,$C,$D,$E) = ("loc0","loc1","loc2","loc3","loc4"); | ||
41 | ($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9"); | ||
42 | ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = | ||
43 | ( "r14", "r15", "loc10", "loc11" ); | ||
44 | @X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", | ||
45 | "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" ); | ||
46 | } | ||
47 | |||
48 | sub BODY_00_15 { | ||
49 | local *code=shift; | ||
50 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
51 | my $j=$i+1; | ||
52 | my $Xn=@X[$j%16]; | ||
53 | |||
54 | $code.=<<___ if ($i==0); | ||
55 | { .mmi; ld1 $X[$i]=[inp],2 // MSB | ||
56 | ld1 tmp2=[tmp3],2 };; | ||
57 | { .mmi; ld1 tmp0=[inp],2 | ||
58 | ld1 tmp4=[tmp3],2 // LSB | ||
59 | dep $X[$i]=$X[$i],tmp2,8,8 };; | ||
60 | ___ | ||
61 | if ($i<15) { | ||
62 | $code.=<<___; | ||
63 | { .mmi; ld1 $Xn=[inp],2 // forward Xload | ||
64 | nop.m 0x0 | ||
65 | dep tmp1=tmp0,tmp4,8,8 };; | ||
66 | { .mmi; ld1 tmp2=[tmp3],2 // forward Xload | ||
67 | and tmp4=$c,$b | ||
68 | dep $X[$i]=$X[$i],tmp1,16,16} //;; | ||
69 | { .mmi; add $e=$e,$K_00_19 // e+=K_00_19 | ||
70 | andcm tmp1=$d,$b | ||
71 | dep.z tmp5=$a,5,27 };; // a<<5 | ||
72 | { .mmi; add $e=$e,$X[$i] // e+=Xload | ||
73 | or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) | ||
74 | extr.u tmp1=$a,27,5 };; // a>>27 | ||
75 | { .mmi; ld1 tmp0=[inp],2 // forward Xload | ||
76 | add $e=$e,tmp4 // e+=F_00_19(b,c,d) | ||
77 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) | ||
78 | { .mmi; ld1 tmp4=[tmp3],2 // forward Xload | ||
79 | or tmp5=tmp1,tmp5 // ROTATE(a,5) | ||
80 | mux2 tmp6=$a,0x44 };; // see b in next iteration | ||
81 | { .mii; add $e=$e,tmp5 // e+=ROTATE(a,5) | ||
82 | dep $Xn=$Xn,tmp2,8,8 // forward Xload | ||
83 | mux2 $X[$i]=$X[$i],0x44 } //;; | ||
84 | |||
85 | ___ | ||
86 | } | ||
87 | else { | ||
88 | $code.=<<___; | ||
89 | { .mii; and tmp3=$c,$b | ||
90 | dep tmp1=tmp0,tmp4,8,8;; | ||
91 | dep $X[$i]=$X[$i],tmp1,16,16} //;; | ||
92 | { .mmi; add $e=$e,$K_00_19 // e+=K_00_19 | ||
93 | andcm tmp1=$d,$b | ||
94 | dep.z tmp5=$a,5,27 };; // a<<5 | ||
95 | { .mmi; add $e=$e,$X[$i] // e+=Xupdate | ||
96 | or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) | ||
97 | extr.u tmp1=$a,27,5 } // a>>27 | ||
98 | { .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate | ||
99 | xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate | ||
100 | nop.i 0 };; | ||
101 | { .mmi; add $e=$e,tmp4 // e+=F_00_19(b,c,d) | ||
102 | xor $Xn=$Xn,tmp3 // forward Xupdate | ||
103 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) | ||
104 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) | ||
105 | mux2 tmp6=$a,0x44 };; // see b in next iteration | ||
106 | { .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) | ||
107 | shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) | ||
108 | mux2 $X[$i]=$X[$i],0x44 };; | ||
109 | |||
110 | ___ | ||
111 | } | ||
112 | } | ||
113 | |||
114 | sub BODY_16_19 { | ||
115 | local *code=shift; | ||
116 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
117 | my $j=$i+1; | ||
118 | my $Xn=@X[$j%16]; | ||
119 | |||
120 | $code.=<<___; | ||
121 | { .mib; add $e=$e,$K_00_19 // e+=K_00_19 | ||
122 | dep.z tmp5=$a,5,27 } // a<<5 | ||
123 | { .mib; andcm tmp1=$d,$b | ||
124 | and tmp0=$c,$b };; | ||
125 | { .mmi; add $e=$e,$X[$i%16] // e+=Xupdate | ||
126 | or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) | ||
127 | extr.u tmp1=$a,27,5 } // a>>27 | ||
128 | { .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate | ||
129 | xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate | ||
130 | nop.i 0 };; | ||
131 | { .mmi; add $e=$e,tmp0 // e+=F_00_19(b,c,d) | ||
132 | xor $Xn=$Xn,tmp3 // forward Xupdate | ||
133 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) | ||
134 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) | ||
135 | mux2 tmp6=$a,0x44 };; // see b in next iteration | ||
136 | { .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) | ||
137 | shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) | ||
138 | nop.i 0 };; | ||
139 | |||
140 | ___ | ||
141 | } | ||
142 | |||
143 | sub BODY_20_39 { | ||
144 | local *code=shift; | ||
145 | my ($i,$a,$b,$c,$d,$e,$Konst)=@_; | ||
146 | $Konst = $K_20_39 if (!defined($Konst)); | ||
147 | my $j=$i+1; | ||
148 | my $Xn=@X[$j%16]; | ||
149 | |||
150 | if ($i<79) { | ||
151 | $code.=<<___; | ||
152 | { .mib; add $e=$e,$Konst // e+=K_XX_XX | ||
153 | dep.z tmp5=$a,5,27 } // a<<5 | ||
154 | { .mib; xor tmp0=$c,$b | ||
155 | xor $Xn=$Xn,$X[($j+2)%16] };; // forward Xupdate | ||
156 | { .mib; add $e=$e,$X[$i%16] // e+=Xupdate | ||
157 | extr.u tmp1=$a,27,5 } // a>>27 | ||
158 | { .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d | ||
159 | xor $Xn=$Xn,$X[($j+8)%16] };; // forward Xupdate | ||
160 | { .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d) | ||
161 | xor $Xn=$Xn,$X[($j+13)%16] // forward Xupdate | ||
162 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) | ||
163 | { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) | ||
164 | mux2 tmp6=$a,0x44 };; // see b in next iteration | ||
165 | { .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) | ||
166 | shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) | ||
167 | nop.i 0 };; | ||
168 | |||
169 | ___ | ||
170 | } | ||
171 | else { | ||
172 | $code.=<<___; | ||
173 | { .mib; add $e=$e,$Konst // e+=K_60_79 | ||
174 | dep.z tmp5=$a,5,27 } // a<<5 | ||
175 | { .mib; xor tmp0=$c,$b | ||
176 | add $h1=$h1,$a };; // wrap up | ||
177 | { .mib; add $e=$e,$X[$i%16] // e+=Xupdate | ||
178 | extr.u tmp1=$a,27,5 } // a>>27 | ||
179 | { .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d | ||
180 | add $h3=$h3,$c };; // wrap up | ||
181 | { .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d) | ||
182 | or tmp1=tmp1,tmp5 // ROTATE(a,5) | ||
183 | shrp $b=tmp6,tmp6,2 };; // b=ROTATE(b,30) ;;? | ||
184 | { .mmi; add $e=$e,tmp1 // e+=ROTATE(a,5) | ||
185 | add tmp3=1,inp // used in unaligned codepath | ||
186 | add $h4=$h4,$d };; // wrap up | ||
187 | |||
188 | ___ | ||
189 | } | ||
190 | } | ||
191 | |||
192 | sub BODY_40_59 { | ||
193 | local *code=shift; | ||
194 | my ($i,$a,$b,$c,$d,$e)=@_; | ||
195 | my $j=$i+1; | ||
196 | my $Xn=@X[$j%16]; | ||
197 | |||
198 | $code.=<<___; | ||
199 | { .mib; add $e=$e,$K_40_59 // e+=K_40_59 | ||
200 | dep.z tmp5=$a,5,27 } // a<<5 | ||
201 | { .mib; and tmp1=$c,$d | ||
202 | xor tmp0=$c,$d };; | ||
203 | { .mmi; add $e=$e,$X[$i%16] // e+=Xupdate | ||
204 | add tmp5=tmp5,tmp1 // a<<5+(c&d) | ||
205 | extr.u tmp1=$a,27,5 } // a>>27 | ||
206 | { .mmi; and tmp0=tmp0,$b | ||
207 | xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate | ||
208 | xor tmp3=$X[($j+8)%16],$X[($j+13)%16] };; // forward Xupdate | ||
209 | { .mmi; add $e=$e,tmp0 // e+=b&(c^d) | ||
210 | add tmp5=tmp5,tmp1 // ROTATE(a,5)+(c&d) | ||
211 | shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) | ||
212 | { .mmi; xor $Xn=$Xn,tmp3 | ||
213 | mux2 tmp6=$a,0x44 };; // see b in next iteration | ||
214 | { .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)+(c&d) | ||
215 | shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) | ||
216 | nop.i 0x0 };; | ||
217 | |||
218 | ___ | ||
219 | } | ||
220 | sub BODY_60_79 { &BODY_20_39(@_,$K_60_79); } | ||
221 | |||
222 | $code.=<<___; | ||
223 | .text | ||
224 | |||
225 | tmp0=r8; | ||
226 | tmp1=r9; | ||
227 | tmp2=r10; | ||
228 | tmp3=r11; | ||
229 | ctx=r32; // in0 | ||
230 | inp=r33; // in1 | ||
231 | |||
232 | // void sha1_block_data_order(SHA_CTX *c,const void *p,size_t num); | ||
233 | .global sha1_block_data_order# | ||
234 | .proc sha1_block_data_order# | ||
235 | .align 32 | ||
236 | sha1_block_data_order: | ||
237 | .prologue | ||
238 | { .mmi; alloc tmp1=ar.pfs,3,14,0,0 | ||
239 | $ADDP tmp0=4,ctx | ||
240 | .save ar.lc,r3 | ||
241 | mov r3=ar.lc } | ||
242 | { .mmi; $ADDP ctx=0,ctx | ||
243 | $ADDP inp=0,inp | ||
244 | mov r2=pr };; | ||
245 | tmp4=in2; | ||
246 | tmp5=loc12; | ||
247 | tmp6=loc13; | ||
248 | .body | ||
249 | { .mlx; ld4 $h0=[ctx],8 | ||
250 | movl $K_00_19=0x5a827999 } | ||
251 | { .mlx; ld4 $h1=[tmp0],8 | ||
252 | movl $K_20_39=0x6ed9eba1 };; | ||
253 | { .mlx; ld4 $h2=[ctx],8 | ||
254 | movl $K_40_59=0x8f1bbcdc } | ||
255 | { .mlx; ld4 $h3=[tmp0] | ||
256 | movl $K_60_79=0xca62c1d6 };; | ||
257 | { .mmi; ld4 $h4=[ctx],-16 | ||
258 | add in2=-1,in2 // adjust num for ar.lc | ||
259 | mov ar.ec=1 };; | ||
260 | { .mmi; nop.m 0 | ||
261 | add tmp3=1,inp | ||
262 | mov ar.lc=in2 };; // brp.loop.imp: too far | ||
263 | |||
264 | .Ldtop: | ||
265 | { .mmi; mov $A=$h0 | ||
266 | mov $B=$h1 | ||
267 | mux2 tmp6=$h1,0x44 } | ||
268 | { .mmi; mov $C=$h2 | ||
269 | mov $D=$h3 | ||
270 | mov $E=$h4 };; | ||
271 | |||
272 | ___ | ||
273 | |||
274 | { my $i; | ||
275 | my @V=($A,$B,$C,$D,$E); | ||
276 | |||
277 | for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); } | ||
278 | for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); } | ||
279 | for(;$i<40;$i++) { &BODY_20_39(\$code,$i,@V); unshift(@V,pop(@V)); } | ||
280 | for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); } | ||
281 | for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); } | ||
282 | |||
283 | (($V[0] eq $A) and ($V[4] eq $E)) or die; # double-check | ||
284 | } | ||
285 | |||
286 | $code.=<<___; | ||
287 | { .mmb; add $h0=$h0,$A | ||
288 | add $h2=$h2,$C | ||
289 | br.ctop.dptk.many .Ldtop };; | ||
290 | .Ldend: | ||
291 | { .mmi; add tmp0=4,ctx | ||
292 | mov ar.lc=r3 };; | ||
293 | { .mmi; st4 [ctx]=$h0,8 | ||
294 | st4 [tmp0]=$h1,8 };; | ||
295 | { .mmi; st4 [ctx]=$h2,8 | ||
296 | st4 [tmp0]=$h3 };; | ||
297 | { .mib; st4 [ctx]=$h4,-16 | ||
298 | mov pr=r2,0x1ffff | ||
299 | br.ret.sptk.many b0 };; | ||
300 | .endp sha1_block_data_order# | ||
301 | stringz "SHA1 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>" | ||
302 | ___ | ||
303 | |||
304 | $output=shift and open STDOUT,">$output"; | ||
305 | print $code; | ||
diff --git a/src/lib/libcrypto/sha/asm/sha512-ia64.pl b/src/lib/libcrypto/sha/asm/sha512-ia64.pl deleted file mode 100755 index 1c6ce56522..0000000000 --- a/src/lib/libcrypto/sha/asm/sha512-ia64.pl +++ /dev/null | |||
@@ -1,672 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # SHA256/512_Transform for Itanium. | ||
11 | # | ||
12 | # sha512_block runs in 1003 cycles on Itanium 2, which is almost 50% | ||
13 | # faster than gcc and >60%(!) faster than code generated by HP-UX | ||
14 | # compiler (yes, HP-UX is generating slower code, because unlike gcc, | ||
15 | # it failed to deploy "shift right pair," 'shrp' instruction, which | ||
16 | # substitutes for 64-bit rotate). | ||
17 | # | ||
18 | # 924 cycles long sha256_block outperforms gcc by over a factor of 2(!) | ||
19 | # and HP-UX compiler - by >40% (yes, gcc won sha512_block, but lost | ||
20 | # this one big time). Note that "formally" 924 is about 100 cycles | ||
21 | # too much. I mean it's 64 32-bit rounds vs. 80 virtually identical | ||
22 | # 64-bit ones and 1003*64/80 gives 802. Extra cycles, 2 per round, | ||
23 | # are spent on extra work to provide for 32-bit rotations. 32-bit | ||
24 | # rotations are still handled by 'shrp' instruction and for this | ||
25 | # reason lower 32 bits are deposited to upper half of 64-bit register | ||
26 | # prior to 'shrp' issue. And in order to minimize the amount of such | ||
27 | # operations, X[16] values are *maintained* with copies of lower | ||
28 | # halves in upper halves, which is why you'll spot such instructions | ||
29 | # as custom 'mux2', "parallel 32-bit add," 'padd4' and "parallel | ||
30 | # 32-bit unsigned right shift," 'pshr4.u' instructions here. | ||
31 | # | ||
32 | # Rules of engagement. | ||
33 | # | ||
34 | # There is only one integer shifter meaning that if I have two rotate, | ||
35 | # deposit or extract instructions in adjacent bundles, they shall | ||
36 | # split [at run-time if they have to]. But note that variable and | ||
37 | # parallel shifts are performed by multi-media ALU and *are* pairable | ||
38 | # with rotates [and alike]. On the backside MMALU is rather slow: it | ||
39 | # takes 2 extra cycles before the result of integer operation is | ||
40 | # available *to* MMALU and 2(*) extra cycles before the result of MM | ||
41 | # operation is available "back" *to* integer ALU, not to mention that | ||
42 | # MMALU itself has 2 cycles latency. However! I explicitly scheduled | ||
43 | # these MM instructions to avoid MM stalls, so that all these extra | ||
44 | # latencies get "hidden" in instruction-level parallelism. | ||
45 | # | ||
46 | # (*) 2 cycles on Itanium 1 and 1 cycle on Itanium 2. But I schedule | ||
47 | # for 2 in order to provide for best *overall* performance, | ||
48 | # because on Itanium 1 stall on MM result is accompanied by | ||
49 | # pipeline flush, which takes 6 cycles:-( | ||
50 | # | ||
51 | # Resulting performance numbers for 900MHz Itanium 2 system: | ||
52 | # | ||
53 | # The 'numbers' are in 1000s of bytes per second processed. | ||
54 | # type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes | ||
55 | # sha1(*) 6210.14k 20376.30k 52447.83k 85870.05k 105478.12k | ||
56 | # sha256 7476.45k 20572.05k 41538.34k 56062.29k 62093.18k | ||
57 | # sha512 4996.56k 20026.28k 47597.20k 85278.79k 111501.31k | ||
58 | # | ||
59 | # (*) SHA1 numbers are for HP-UX compiler and are presented purely | ||
60 | # for reference purposes. I bet it can be improved too... | ||
61 | # | ||
62 | # To generate code, pass the file name with either 256 or 512 in its | ||
63 | # name and compiler flags. | ||
64 | |||
65 | $output=shift; # first argument: output file name; it also selects the SHA-512 or SHA-256 flavour | ||
66 | |||
67 | if ($output =~ /512.*\.(?:s|asm)/) { # alternation (.s or .asm); the former [s|asm] was a one-character class | ||
68 | $SZ=8; # word size in bytes | ||
69 | $BITS=8*$SZ; | ||
70 | $LDW="ld8"; | ||
71 | $STW="st8"; | ||
72 | $ADD="add"; | ||
73 | $SHRU="shr.u"; | ||
74 | $TABLE="K512"; | ||
75 | $func="sha512_block_data_order"; | ||
76 | @Sigma0=(28,34,39); | ||
77 | @Sigma1=(14,18,41); | ||
78 | @sigma0=(1, 8, 7); # third entry of sigma0/sigma1 is loaded into sgm0/sgm1 as a shift count | ||
79 | @sigma1=(19,61, 6); | ||
80 | $rounds=80; | ||
81 | } elsif ($output =~ /256.*\.(?:s|asm)/) { # same extension test for the SHA-256 flavour | ||
82 | $SZ=4; # word size in bytes | ||
83 | $BITS=8*$SZ; | ||
84 | $LDW="ld4"; | ||
85 | $STW="st4"; | ||
86 | $ADD="padd4"; | ||
87 | $SHRU="pshr4.u"; | ||
88 | $TABLE="K256"; | ||
89 | $func="sha256_block_data_order"; | ||
90 | @Sigma0=( 2,13,22); | ||
91 | @Sigma1=( 6,11,25); | ||
92 | @sigma0=( 7,18, 3); | ||
93 | @sigma1=(17,19,10); | ||
94 | $rounds=64; | ||
95 | } else { die "nonsense $output"; } | ||
96 | |||
97 | open STDOUT,">$output" or die "can't open $output: $!"; # low-precedence 'or': with '||' the die was attached to the file-name string and could never run | ||
98 | |||
99 | if ($^O eq "hpux") { # HP-UX: addp4 for the pointer arguments unless a 64-bit flag is passed | ||
100 | $ADDP="addp4"; | ||
101 | for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); } # matches +DD64 or -mlp64; the former [\+DD|\-mlp] was a one-character class | ||
102 | } else { $ADDP="add"; } | ||
103 | for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); # explicit endianness from the compiler flags | ||
104 | $big_endian=0 if (/\-DL_ENDIAN/); } | ||
105 | if (!defined($big_endian)) # no flag given: probe the byte order of the machine running this script | ||
106 | { $big_endian=(unpack('L',pack('N',1))==1); } | ||
107 | |||
108 | $code=<<___; # function prologue: register aliases, frame setup, constant-table address, load of the chaining state | ||
109 | .ident \"$output, version 1.1\" | ||
110 | .ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\" | ||
111 | .explicit | ||
112 | .text | ||
113 | |||
114 | pfssave=r2; // receives ar.pfs from alloc | ||
115 | lcsave=r3; // caller's ar.lc, restored before return | ||
116 | prsave=r14; // caller's predicates, restored before return | ||
117 | K=r15; | ||
118 | A=r16; B=r17; C=r18; D=r19; | ||
119 | E=r20; F=r21; G=r22; H=r23; | ||
120 | T1=r24; T2=r25; | ||
121 | s0=r26; s1=r27; t0=r28; t1=r29; | ||
122 | Ktbl=r30; // running pointer into the constant table | ||
123 | ctx=r31; // 1st arg | ||
124 | input=r48; // 2nd arg | ||
125 | num=r49; // 3rd arg | ||
126 | sgm0=r50; sgm1=r51; // small constants | ||
127 | A_=r54; B_=r55; C_=r56; D_=r57; // A_-H_: state loaded from ctx and accumulated after every block | ||
128 | E_=r58; F_=r59; G_=r60; H_=r61; | ||
129 | |||
130 | // void $func (SHA_CTX *ctx, const void *in,size_t num[,int host]) | ||
131 | .global $func# | ||
132 | .proc $func# | ||
133 | .align 32 | ||
134 | $func: | ||
135 | .prologue | ||
136 | .save ar.pfs,pfssave | ||
137 | { .mmi; alloc pfssave=ar.pfs,3,27,0,16 | ||
138 | $ADDP ctx=0,r32 // 1st arg | ||
139 | .save ar.lc,lcsave | ||
140 | mov lcsave=ar.lc } | ||
141 | { .mmi; $ADDP input=0,r33 // 2nd arg | ||
142 | mov num=r34 // 3rd arg | ||
143 | .save pr,prsave | ||
144 | mov prsave=pr };; | ||
145 | |||
146 | .body | ||
147 | { .mib; add r8=0*$SZ,ctx // r8-r11: four pointers into ctx, one per state word | ||
148 | add r9=1*$SZ,ctx | ||
149 | brp.loop.imp .L_first16,.L_first16_end-16 } | ||
150 | { .mib; add r10=2*$SZ,ctx | ||
151 | add r11=3*$SZ,ctx | ||
152 | brp.loop.imp .L_rest,.L_rest_end-16 };; | ||
153 | |||
154 | // load A-H | ||
155 | .Lpic_point: // ip is sampled in the next bundle; the table address is formed relative to this label | ||
156 | { .mmi; $LDW A_=[r8],4*$SZ | ||
157 | $LDW B_=[r9],4*$SZ | ||
158 | mov Ktbl=ip } | ||
159 | { .mmi; $LDW C_=[r10],4*$SZ | ||
160 | $LDW D_=[r11],4*$SZ | ||
161 | mov sgm0=$sigma0[2] };; | ||
162 | { .mmi; $LDW E_=[r8] | ||
163 | $LDW F_=[r9] | ||
164 | add Ktbl=($TABLE#-.Lpic_point),Ktbl } | ||
165 | { .mmi; $LDW G_=[r10] | ||
166 | $LDW H_=[r11] | ||
167 | cmp.ne p0,p16=0,r0 };; // used in sha256_block | ||
168 | ___ | ||
169 | $code.=<<___ if ($BITS==64); # SHA-512 only: record input misalignment (input & 7) in p9-p15, round input down to 8 bytes | ||
170 | { .mii; and r8=7,input // r8 = misalignment in bytes | ||
171 | and input=~7,input;; | ||
172 | cmp.eq p9,p0=1,r8 } // p9..p15 <- misalignment is 1..7 | ||
173 | { .mmi; cmp.eq p10,p0=2,r8 | ||
174 | cmp.eq p11,p0=3,r8 | ||
175 | cmp.eq p12,p0=4,r8 } | ||
176 | { .mmi; cmp.eq p13,p0=5,r8 | ||
177 | cmp.eq p14,p0=6,r8 | ||
178 | cmp.eq p15,p0=7,r8 };; | ||
179 | ___ | ||
180 | $code.=<<___; # top of the per-block loop: copy the chaining state into the working registers, set loop counters | ||
181 | .L_outer: // re-entered once per further input block | ||
182 | .rotr X[16] | ||
183 | { .mmi; mov A=A_ | ||
184 | mov B=B_ | ||
185 | mov ar.lc=14 } // NOTE(review): with ar.ec=2 below this presumably gives 16 passes of .L_first16 -- confirm against br.ctop semantics | ||
186 | { .mmi; mov C=C_ | ||
187 | mov D=D_ | ||
188 | mov E=E_ } | ||
189 | { .mmi; mov F=F_ | ||
190 | mov G=G_ | ||
191 | mov ar.ec=2 } | ||
192 | { .mmi; ld1 X[15]=[input],$SZ // eliminated in 64-bit | ||
193 | mov H=H_ | ||
194 | mov sgm1=$sigma1[2] };; | ||
195 | |||
196 | ___ | ||
197 | $t0="t0", $t1="t1", $code.=<<___ if ($BITS==32); # SHA-256 head of .L_first16: assemble X[15] from single bytes, build doubled copies of a and e in t0/t1 | ||
198 | .align 32 | ||
199 | .L_first16: | ||
200 | { .mmi; add r9=1-$SZ,input // input already points past this word; address its bytes 1..3 | ||
201 | add r10=2-$SZ,input | ||
202 | add r11=3-$SZ,input };; | ||
203 | { .mmi; ld1 r9=[r9] | ||
204 | ld1 r10=[r10] | ||
205 | dep.z $t1=E,32,32 } // t1 = low half of E placed in the upper 32 bits | ||
206 | { .mmi; $LDW K=[Ktbl],$SZ | ||
207 | ld1 r11=[r11] | ||
208 | zxt4 E=E };; | ||
209 | { .mii; or $t1=$t1,E // t1 now holds e in both halves | ||
210 | dep X[15]=X[15],r9,8,8 // byte 0 goes above byte 1 | ||
211 | dep r11=r10,r11,8,8 };; // byte 2 goes above byte 3 | ||
212 | { .mmi; and T1=F,E | ||
213 | and T2=A,B | ||
214 | dep X[15]=X[15],r11,16,16 } // join the two halves into one 32-bit word | ||
215 | { .mmi; andcm r8=G,E | ||
216 | and r9=A,C | ||
217 | mux2 $t0=A,0x44 };; // copy lower half to upper | ||
218 | { .mmi; (p16) ld1 X[15-1]=[input],$SZ // prefetch | ||
219 | xor T1=T1,r8 // T1=((e & f) ^ (~e & g)) | ||
220 | _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14) | ||
221 | { .mib; and r10=B,C | ||
222 | xor T2=T2,r9 };; | ||
223 | ___ | ||
224 | $t0="A", $t1="E", $code.=<<___ if ($BITS==64); # SHA-512: load all 16 message words; t0/t1 interpolate to the plain A and E registers from here on | ||
225 | // in 64-bit mode I load whole X[16] at once and take care of alignment... | ||
226 | { .mmi; add r8=1*$SZ,input // four pointers (input, r8, r9, r10), each stepped by 4 words per load | ||
227 | add r9=2*$SZ,input | ||
228 | add r10=3*$SZ,input };; | ||
229 | { .mmb; $LDW X[15]=[input],4*$SZ | ||
230 | $LDW X[14]=[r8],4*$SZ | ||
231 | (p9) br.cond.dpnt.many .L1byte };; // misaligned input: the matching handler resumes the loads | ||
232 | { .mmb; $LDW X[13]=[r9],4*$SZ | ||
233 | $LDW X[12]=[r10],4*$SZ | ||
234 | (p10) br.cond.dpnt.many .L2byte };; | ||
235 | { .mmb; $LDW X[11]=[input],4*$SZ | ||
236 | $LDW X[10]=[r8],4*$SZ | ||
237 | (p11) br.cond.dpnt.many .L3byte };; | ||
238 | { .mmb; $LDW X[ 9]=[r9],4*$SZ | ||
239 | $LDW X[ 8]=[r10],4*$SZ | ||
240 | (p12) br.cond.dpnt.many .L4byte };; | ||
241 | { .mmb; $LDW X[ 7]=[input],4*$SZ | ||
242 | $LDW X[ 6]=[r8],4*$SZ | ||
243 | (p13) br.cond.dpnt.many .L5byte };; | ||
244 | { .mmb; $LDW X[ 5]=[r9],4*$SZ | ||
245 | $LDW X[ 4]=[r10],4*$SZ | ||
246 | (p14) br.cond.dpnt.many .L6byte };; | ||
247 | { .mmb; $LDW X[ 3]=[input],4*$SZ | ||
248 | $LDW X[ 2]=[r8],4*$SZ | ||
249 | (p15) br.cond.dpnt.many .L7byte };; | ||
250 | { .mmb; $LDW X[ 1]=[r9],4*$SZ | ||
251 | $LDW X[ 0]=[r10],4*$SZ | ||
252 | br.many .L_first16 };; // aligned input: all 16 words are loaded | ||
253 | .L1byte: // taken when (input & 7) == 1: resumes the loads, then merges every word with its successor (count 56, rewritten to 64-56 for little-endian builds) | ||
254 | { .mmi; $LDW X[13]=[r9],4*$SZ | ||
255 | $LDW X[12]=[r10],4*$SZ | ||
256 | shrp X[15]=X[15],X[14],56 };; | ||
257 | { .mmi; $LDW X[11]=[input],4*$SZ | ||
258 | $LDW X[10]=[r8],4*$SZ | ||
259 | shrp X[14]=X[14],X[13],56 } | ||
260 | { .mmi; $LDW X[ 9]=[r9],4*$SZ | ||
261 | $LDW X[ 8]=[r10],4*$SZ | ||
262 | shrp X[13]=X[13],X[12],56 };; | ||
263 | { .mmi; $LDW X[ 7]=[input],4*$SZ | ||
264 | $LDW X[ 6]=[r8],4*$SZ | ||
265 | shrp X[12]=X[12],X[11],56 } | ||
266 | { .mmi; $LDW X[ 5]=[r9],4*$SZ | ||
267 | $LDW X[ 4]=[r10],4*$SZ | ||
268 | shrp X[11]=X[11],X[10],56 };; | ||
269 | { .mmi; $LDW X[ 3]=[input],4*$SZ | ||
270 | $LDW X[ 2]=[r8],4*$SZ | ||
271 | shrp X[10]=X[10],X[ 9],56 } | ||
272 | { .mmi; $LDW X[ 1]=[r9],4*$SZ | ||
273 | $LDW X[ 0]=[r10],4*$SZ | ||
274 | shrp X[ 9]=X[ 9],X[ 8],56 };; | ||
275 | { .mii; $LDW T1=[input] // one word beyond the 16 message words; merged into X[0] at the end | ||
276 | shrp X[ 8]=X[ 8],X[ 7],56 | ||
277 | shrp X[ 7]=X[ 7],X[ 6],56 } | ||
278 | { .mii; shrp X[ 6]=X[ 6],X[ 5],56 | ||
279 | shrp X[ 5]=X[ 5],X[ 4],56 };; | ||
280 | { .mii; shrp X[ 4]=X[ 4],X[ 3],56 | ||
281 | shrp X[ 3]=X[ 3],X[ 2],56 } | ||
282 | { .mii; shrp X[ 2]=X[ 2],X[ 1],56 | ||
283 | shrp X[ 1]=X[ 1],X[ 0],56 } | ||
284 | { .mib; shrp X[ 0]=X[ 0],T1,56 | ||
285 | br.many .L_first16 };; | ||
286 | .L2byte: // taken when (input & 7) == 2: resumes the loads, then merges every word with its successor (count 48, rewritten to 64-48 for little-endian builds) | ||
287 | { .mmi; $LDW X[11]=[input],4*$SZ | ||
288 | $LDW X[10]=[r8],4*$SZ | ||
289 | shrp X[15]=X[15],X[14],48 } | ||
290 | { .mmi; $LDW X[ 9]=[r9],4*$SZ | ||
291 | $LDW X[ 8]=[r10],4*$SZ | ||
292 | shrp X[14]=X[14],X[13],48 };; | ||
293 | { .mmi; $LDW X[ 7]=[input],4*$SZ | ||
294 | $LDW X[ 6]=[r8],4*$SZ | ||
295 | shrp X[13]=X[13],X[12],48 } | ||
296 | { .mmi; $LDW X[ 5]=[r9],4*$SZ | ||
297 | $LDW X[ 4]=[r10],4*$SZ | ||
298 | shrp X[12]=X[12],X[11],48 };; | ||
299 | { .mmi; $LDW X[ 3]=[input],4*$SZ | ||
300 | $LDW X[ 2]=[r8],4*$SZ | ||
301 | shrp X[11]=X[11],X[10],48 } | ||
302 | { .mmi; $LDW X[ 1]=[r9],4*$SZ | ||
303 | $LDW X[ 0]=[r10],4*$SZ | ||
304 | shrp X[10]=X[10],X[ 9],48 };; | ||
305 | { .mii; $LDW T1=[input] // one word beyond the 16 message words; merged into X[0] at the end | ||
306 | shrp X[ 9]=X[ 9],X[ 8],48 | ||
307 | shrp X[ 8]=X[ 8],X[ 7],48 } | ||
308 | { .mii; shrp X[ 7]=X[ 7],X[ 6],48 | ||
309 | shrp X[ 6]=X[ 6],X[ 5],48 };; | ||
310 | { .mii; shrp X[ 5]=X[ 5],X[ 4],48 | ||
311 | shrp X[ 4]=X[ 4],X[ 3],48 } | ||
312 | { .mii; shrp X[ 3]=X[ 3],X[ 2],48 | ||
313 | shrp X[ 2]=X[ 2],X[ 1],48 } | ||
314 | { .mii; shrp X[ 1]=X[ 1],X[ 0],48 | ||
315 | shrp X[ 0]=X[ 0],T1,48 } | ||
316 | { .mfb; br.many .L_first16 };; | ||
317 | .L3byte: // taken when (input & 7) == 3: resumes the loads, then merges every word with its successor (count 40, rewritten to 64-40 for little-endian builds) | ||
318 | { .mmi; $LDW X[ 9]=[r9],4*$SZ | ||
319 | $LDW X[ 8]=[r10],4*$SZ | ||
320 | shrp X[15]=X[15],X[14],40 };; | ||
321 | { .mmi; $LDW X[ 7]=[input],4*$SZ | ||
322 | $LDW X[ 6]=[r8],4*$SZ | ||
323 | shrp X[14]=X[14],X[13],40 } | ||
324 | { .mmi; $LDW X[ 5]=[r9],4*$SZ | ||
325 | $LDW X[ 4]=[r10],4*$SZ | ||
326 | shrp X[13]=X[13],X[12],40 };; | ||
327 | { .mmi; $LDW X[ 3]=[input],4*$SZ | ||
328 | $LDW X[ 2]=[r8],4*$SZ | ||
329 | shrp X[12]=X[12],X[11],40 } | ||
330 | { .mmi; $LDW X[ 1]=[r9],4*$SZ | ||
331 | $LDW X[ 0]=[r10],4*$SZ | ||
332 | shrp X[11]=X[11],X[10],40 };; | ||
333 | { .mii; $LDW T1=[input] // one word beyond the 16 message words; merged into X[0] at the end | ||
334 | shrp X[10]=X[10],X[ 9],40 | ||
335 | shrp X[ 9]=X[ 9],X[ 8],40 } | ||
336 | { .mii; shrp X[ 8]=X[ 8],X[ 7],40 | ||
337 | shrp X[ 7]=X[ 7],X[ 6],40 };; | ||
338 | { .mii; shrp X[ 6]=X[ 6],X[ 5],40 | ||
339 | shrp X[ 5]=X[ 5],X[ 4],40 } | ||
340 | { .mii; shrp X[ 4]=X[ 4],X[ 3],40 | ||
341 | shrp X[ 3]=X[ 3],X[ 2],40 } | ||
342 | { .mii; shrp X[ 2]=X[ 2],X[ 1],40 | ||
343 | shrp X[ 1]=X[ 1],X[ 0],40 } | ||
344 | { .mib; shrp X[ 0]=X[ 0],T1,40 | ||
345 | br.many .L_first16 };; | ||
346 | .L4byte: // taken when (input & 7) == 4: resumes the loads, then merges every word with its successor (count 32, rewritten to 64-32 for little-endian builds) | ||
347 | { .mmi; $LDW X[ 7]=[input],4*$SZ | ||
348 | $LDW X[ 6]=[r8],4*$SZ | ||
349 | shrp X[15]=X[15],X[14],32 } | ||
350 | { .mmi; $LDW X[ 5]=[r9],4*$SZ | ||
351 | $LDW X[ 4]=[r10],4*$SZ | ||
352 | shrp X[14]=X[14],X[13],32 };; | ||
353 | { .mmi; $LDW X[ 3]=[input],4*$SZ | ||
354 | $LDW X[ 2]=[r8],4*$SZ | ||
355 | shrp X[13]=X[13],X[12],32 } | ||
356 | { .mmi; $LDW X[ 1]=[r9],4*$SZ | ||
357 | $LDW X[ 0]=[r10],4*$SZ | ||
358 | shrp X[12]=X[12],X[11],32 };; | ||
359 | { .mii; $LDW T1=[input] // one word beyond the 16 message words; merged into X[0] at the end | ||
360 | shrp X[11]=X[11],X[10],32 | ||
361 | shrp X[10]=X[10],X[ 9],32 } | ||
362 | { .mii; shrp X[ 9]=X[ 9],X[ 8],32 | ||
363 | shrp X[ 8]=X[ 8],X[ 7],32 };; | ||
364 | { .mii; shrp X[ 7]=X[ 7],X[ 6],32 | ||
365 | shrp X[ 6]=X[ 6],X[ 5],32 } | ||
366 | { .mii; shrp X[ 5]=X[ 5],X[ 4],32 | ||
367 | shrp X[ 4]=X[ 4],X[ 3],32 } | ||
368 | { .mii; shrp X[ 3]=X[ 3],X[ 2],32 | ||
369 | shrp X[ 2]=X[ 2],X[ 1],32 } | ||
370 | { .mii; shrp X[ 1]=X[ 1],X[ 0],32 | ||
371 | shrp X[ 0]=X[ 0],T1,32 } | ||
372 | { .mfb; br.many .L_first16 };; | ||
373 | .L5byte: // taken when (input & 7) == 5: resumes the loads, then merges every word with its successor (count 24, rewritten to 64-24 for little-endian builds) | ||
374 | { .mmi; $LDW X[ 5]=[r9],4*$SZ | ||
375 | $LDW X[ 4]=[r10],4*$SZ | ||
376 | shrp X[15]=X[15],X[14],24 };; | ||
377 | { .mmi; $LDW X[ 3]=[input],4*$SZ | ||
378 | $LDW X[ 2]=[r8],4*$SZ | ||
379 | shrp X[14]=X[14],X[13],24 } | ||
380 | { .mmi; $LDW X[ 1]=[r9],4*$SZ | ||
381 | $LDW X[ 0]=[r10],4*$SZ | ||
382 | shrp X[13]=X[13],X[12],24 };; | ||
383 | { .mii; $LDW T1=[input] // one word beyond the 16 message words; merged into X[0] at the end | ||
384 | shrp X[12]=X[12],X[11],24 | ||
385 | shrp X[11]=X[11],X[10],24 } | ||
386 | { .mii; shrp X[10]=X[10],X[ 9],24 | ||
387 | shrp X[ 9]=X[ 9],X[ 8],24 };; | ||
388 | { .mii; shrp X[ 8]=X[ 8],X[ 7],24 | ||
389 | shrp X[ 7]=X[ 7],X[ 6],24 } | ||
390 | { .mii; shrp X[ 6]=X[ 6],X[ 5],24 | ||
391 | shrp X[ 5]=X[ 5],X[ 4],24 } | ||
392 | { .mii; shrp X[ 4]=X[ 4],X[ 3],24 | ||
393 | shrp X[ 3]=X[ 3],X[ 2],24 } | ||
394 | { .mii; shrp X[ 2]=X[ 2],X[ 1],24 | ||
395 | shrp X[ 1]=X[ 1],X[ 0],24 } | ||
396 | { .mib; shrp X[ 0]=X[ 0],T1,24 | ||
397 | br.many .L_first16 };; | ||
398 | .L6byte: // taken when (input & 7) == 6: resumes the loads, then merges every word with its successor (count 16, rewritten to 64-16 for little-endian builds) | ||
399 | { .mmi; $LDW X[ 3]=[input],4*$SZ | ||
400 | $LDW X[ 2]=[r8],4*$SZ | ||
401 | shrp X[15]=X[15],X[14],16 } | ||
402 | { .mmi; $LDW X[ 1]=[r9],4*$SZ | ||
403 | $LDW X[ 0]=[r10],4*$SZ | ||
404 | shrp X[14]=X[14],X[13],16 };; | ||
405 | { .mii; $LDW T1=[input] // one word beyond the 16 message words; merged into X[0] at the end | ||
406 | shrp X[13]=X[13],X[12],16 | ||
407 | shrp X[12]=X[12],X[11],16 } | ||
408 | { .mii; shrp X[11]=X[11],X[10],16 | ||
409 | shrp X[10]=X[10],X[ 9],16 };; | ||
410 | { .mii; shrp X[ 9]=X[ 9],X[ 8],16 | ||
411 | shrp X[ 8]=X[ 8],X[ 7],16 } | ||
412 | { .mii; shrp X[ 7]=X[ 7],X[ 6],16 | ||
413 | shrp X[ 6]=X[ 6],X[ 5],16 } | ||
414 | { .mii; shrp X[ 5]=X[ 5],X[ 4],16 | ||
415 | shrp X[ 4]=X[ 4],X[ 3],16 } | ||
416 | { .mii; shrp X[ 3]=X[ 3],X[ 2],16 | ||
417 | shrp X[ 2]=X[ 2],X[ 1],16 } | ||
418 | { .mii; shrp X[ 1]=X[ 1],X[ 0],16 | ||
419 | shrp X[ 0]=X[ 0],T1,16 } | ||
420 | { .mfb; br.many .L_first16 };; | ||
421 | .L7byte: // taken when (input & 7) == 7: resumes the loads, then merges every word with its successor (count 8, rewritten to 64-8 for little-endian builds) | ||
422 | { .mmi; $LDW X[ 1]=[r9],4*$SZ | ||
423 | $LDW X[ 0]=[r10],4*$SZ | ||
424 | shrp X[15]=X[15],X[14],8 };; | ||
425 | { .mii; $LDW T1=[input] // one word beyond the 16 message words; merged into X[0] at the end | ||
426 | shrp X[14]=X[14],X[13],8 | ||
427 | shrp X[13]=X[13],X[12],8 } | ||
428 | { .mii; shrp X[12]=X[12],X[11],8 | ||
429 | shrp X[11]=X[11],X[10],8 };; | ||
430 | { .mii; shrp X[10]=X[10],X[ 9],8 | ||
431 | shrp X[ 9]=X[ 9],X[ 8],8 } | ||
432 | { .mii; shrp X[ 8]=X[ 8],X[ 7],8 | ||
433 | shrp X[ 7]=X[ 7],X[ 6],8 } | ||
434 | { .mii; shrp X[ 6]=X[ 6],X[ 5],8 | ||
435 | shrp X[ 5]=X[ 5],X[ 4],8 } | ||
436 | { .mii; shrp X[ 4]=X[ 4],X[ 3],8 | ||
437 | shrp X[ 3]=X[ 3],X[ 2],8 } | ||
438 | { .mii; shrp X[ 2]=X[ 2],X[ 1],8 | ||
439 | shrp X[ 1]=X[ 1],X[ 0],8 } | ||
440 | { .mib; shrp X[ 0]=X[ 0],T1,8 | ||
441 | br.many .L_first16 };; | ||
442 | |||
443 | .align 32 | ||
444 | .L_first16: // 64-bit head of the loop: the message word is already in a register, only its byte order is fixed up | ||
445 | { .mmi; $LDW K=[Ktbl],$SZ // fetch the constant for this pass and advance the table pointer | ||
446 | and T1=F,E | ||
447 | and T2=A,B } | ||
448 | { .mmi; //$LDW X[15]=[input],$SZ // X[i]=*input++ | ||
449 | andcm r8=G,E | ||
450 | and r9=A,C };; | ||
451 | { .mmi; xor T1=T1,r8 //T1=((e & f) ^ (~e & g)) | ||
452 | and r10=B,C | ||
453 | _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14) | ||
454 | { .mmi; xor T2=T2,r9 | ||
455 | mux1 X[15]=X[15],\@rev };; // eliminated in big-endian | ||
456 | ___ | ||
457 | $code.=<<___; # shared tail of .L_first16, then head of .L_rest; the ROTR(...) counts quoted in the comments are the SHA-512 ones, real counts come from the Sigma/sigma arrays | ||
458 | { .mib; add T1=T1,H // T1=Ch(e,f,g)+h | ||
459 | _rotr r8=$t1,$Sigma1[1] } // ROTR(e,18) | ||
460 | { .mib; xor T2=T2,r10 // T2=((a & b) ^ (a & c) ^ (b & c)) | ||
461 | mov H=G };; | ||
462 | { .mib; xor r11=r8,r11 | ||
463 | _rotr r9=$t1,$Sigma1[2] } // ROTR(e,41) | ||
464 | { .mib; mov G=F | ||
465 | mov F=E };; | ||
466 | { .mib; xor r9=r9,r11 // r9=Sigma1(e) | ||
467 | _rotr r10=$t0,$Sigma0[0] } // ROTR(a,28) | ||
468 | { .mib; add T1=T1,K // T1=Ch(e,f,g)+h+K512[i] | ||
469 | mov E=D };; | ||
470 | { .mib; add T1=T1,r9 // T1+=Sigma1(e) | ||
471 | _rotr r11=$t0,$Sigma0[1] } // ROTR(a,34) | ||
472 | { .mib; mov D=C | ||
473 | mov C=B };; | ||
474 | { .mib; add T1=T1,X[15] // T1+=X[i] | ||
475 | _rotr r8=$t0,$Sigma0[2] } // ROTR(a,39) | ||
476 | { .mib; xor r10=r10,r11 | ||
477 | mux2 X[15]=X[15],0x44 };; // eliminated in 64-bit | ||
478 | { .mmi; xor r10=r8,r10 // r10=Sigma0(a) | ||
479 | mov B=A | ||
480 | add A=T1,T2 };; | ||
481 | { .mib; add E=E,T1 | ||
482 | add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a) | ||
483 | br.ctop.sptk .L_first16 };; | ||
484 | .L_first16_end: | ||
485 | |||
486 | { .mii; mov ar.lc=$rounds-17 // NOTE(review): with ar.ec=1 this presumably gives rounds-16 passes of .L_rest -- confirm against br.ctop semantics | ||
487 | mov ar.ec=1 };; | ||
488 | |||
489 | .align 32 | ||
490 | .L_rest: | ||
491 | .rotr X[16] | ||
492 | { .mib; $LDW K=[Ktbl],$SZ | ||
493 | _rotr r8=X[15-1],$sigma0[0] } // ROTR(s0,1) | ||
494 | { .mib; $ADD X[15]=X[15],X[15-9] // X[i&0xF]+=X[(i+9)&0xF] | ||
495 | $SHRU s0=X[15-1],sgm0 };; // s0=X[(i+1)&0xF]>>7 | ||
496 | { .mib; and T1=F,E | ||
497 | _rotr r9=X[15-1],$sigma0[1] } // ROTR(s0,8) | ||
498 | { .mib; andcm r10=G,E | ||
499 | $SHRU s1=X[15-14],sgm1 };; // s1=X[(i+14)&0xF]>>6 | ||
500 | { .mmi; xor T1=T1,r10 // T1=((e & f) ^ (~e & g)) | ||
501 | xor r9=r8,r9 | ||
502 | _rotr r10=X[15-14],$sigma1[0] };;// ROTR(s1,19) | ||
503 | { .mib; and T2=A,B | ||
504 | _rotr r11=X[15-14],$sigma1[1] }// ROTR(s1,61) | ||
505 | { .mib; and r8=A,C };; | ||
506 | ___ | ||
507 | $t0="t0", $t1="t1", $code.=<<___ if ($BITS==32); # SHA-256 middle of .L_rest: finish sigma0/sigma1 and rebuild the doubled copies of a and e in t0/t1 | ||
508 | // I adhere to mmi; in order to hold Itanium 1 back and avoid 6 cycle | ||
509 | // pipeline flush in last bundle. Note that even on Itanium2 the | ||
510 | // latter stalls for one clock cycle... | ||
511 | { .mmi; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF]) | ||
512 | dep.z $t1=E,32,32 } // t1 = low half of E placed in the upper 32 bits | ||
513 | { .mmi; xor r10=r11,r10 | ||
514 | zxt4 E=E };; | ||
515 | { .mmi; or $t1=$t1,E // t1 now holds e in both halves | ||
516 | xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF]) | ||
517 | mux2 $t0=A,0x44 };; // copy lower half to upper | ||
518 | { .mmi; xor T2=T2,r8 | ||
519 | _rotr r9=$t1,$Sigma1[0] } // ROTR(e,14) | ||
520 | { .mmi; and r10=B,C | ||
521 | add T1=T1,H // T1=Ch(e,f,g)+h | ||
522 | $ADD X[15]=X[15],s0 };; // X[i&0xF]+=sigma0(X[(i+1)&0xF]) | ||
523 | ___ | ||
524 | $t0="A", $t1="E", $code.=<<___ if ($BITS==64); # SHA-512 middle of .L_rest: same steps on full 64-bit registers, t0/t1 interpolate to plain A and E | ||
525 | { .mib; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF]) | ||
526 | _rotr r9=$t1,$Sigma1[0] } // ROTR(e,14) | ||
527 | { .mib; xor r10=r11,r10 | ||
528 | xor T2=T2,r8 };; | ||
529 | { .mib; xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF]) | ||
530 | add T1=T1,H } // T1=Ch(e,f,g)+h | ||
531 | { .mib; and r10=B,C | ||
532 | $ADD X[15]=X[15],s0 };; // X[i&0xF]+=sigma0(X[(i+1)&0xF]) | ||
533 | ___ | ||
534 | $code.=<<___; # shared tail of .L_rest, per-block accumulation, outer-loop branch, state store and return | ||
535 | { .mmi; xor T2=T2,r10 // T2=((a & b) ^ (a & c) ^ (b & c)) | ||
536 | mov H=G | ||
537 | _rotr r8=$t1,$Sigma1[1] };; // ROTR(e,18) | ||
538 | { .mmi; xor r11=r8,r9 | ||
539 | $ADD X[15]=X[15],s1 // X[i&0xF]+=sigma1(X[(i+14)&0xF]) | ||
540 | _rotr r9=$t1,$Sigma1[2] } // ROTR(e,41) | ||
541 | { .mmi; mov G=F | ||
542 | mov F=E };; | ||
543 | { .mib; xor r9=r9,r11 // r9=Sigma1(e) | ||
544 | _rotr r10=$t0,$Sigma0[0] } // ROTR(a,28) | ||
545 | { .mib; add T1=T1,K // T1=Ch(e,f,g)+h+K512[i] | ||
546 | mov E=D };; | ||
547 | { .mib; add T1=T1,r9 // T1+=Sigma1(e) | ||
548 | _rotr r11=$t0,$Sigma0[1] } // ROTR(a,34) | ||
549 | { .mib; mov D=C | ||
550 | mov C=B };; | ||
551 | { .mmi; add T1=T1,X[15] // T1+=X[i] | ||
552 | xor r10=r10,r11 | ||
553 | _rotr r8=$t0,$Sigma0[2] };; // ROTR(a,39) | ||
554 | { .mmi; xor r10=r8,r10 // r10=Sigma0(a) | ||
555 | mov B=A | ||
556 | add A=T1,T2 };; | ||
557 | { .mib; add E=E,T1 | ||
558 | add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a) | ||
559 | br.ctop.sptk .L_rest };; | ||
560 | .L_rest_end: | ||
561 | |||
562 | { .mmi; add A_=A_,A // fold the working registers back into the chaining state | ||
563 | add B_=B_,B | ||
564 | add C_=C_,C } | ||
565 | { .mmi; add D_=D_,D | ||
566 | add E_=E_,E | ||
567 | cmp.ltu p16,p0=1,num };; // p16 <- more than one block was left (1 < num, unsigned) | ||
568 | { .mmi; add F_=F_,F | ||
569 | add G_=G_,G | ||
570 | add H_=H_,H } | ||
571 | { .mmb; add Ktbl=-$SZ*$rounds,Ktbl // rewind the table pointer by one word per round | ||
572 | (p16) add num=-1,num | ||
573 | (p16) br.dptk.many .L_outer };; // NOTE(review): num is only tested here, so one block is processed even if num is 0 on entry -- confirm callers never pass 0 | ||
574 | |||
575 | { .mib; add r8=0*$SZ,ctx // same four-pointer layout as the initial load | ||
576 | add r9=1*$SZ,ctx } | ||
577 | { .mib; add r10=2*$SZ,ctx | ||
578 | add r11=3*$SZ,ctx };; | ||
579 | { .mmi; $STW [r8]=A_,4*$SZ | ||
580 | $STW [r9]=B_,4*$SZ | ||
581 | mov ar.lc=lcsave } // restore the caller's loop counter | ||
582 | { .mmi; $STW [r10]=C_,4*$SZ | ||
583 | $STW [r11]=D_,4*$SZ | ||
584 | mov pr=prsave,0x1ffff };; // restore the caller's predicates under mask 0x1ffff | ||
585 | { .mmb; $STW [r8]=E_ | ||
586 | $STW [r9]=F_ } | ||
587 | { .mmb; $STW [r10]=G_ | ||
588 | $STW [r11]=H_ | ||
589 | br.ret.sptk.many b0 };; | ||
590 | .endp $func# | ||
591 | ___ | ||
592 | |||
593 | $code =~ s/\`([^\`]*)\`/eval $1/gem; # evaluate any text between backticks as Perl and splice in the result | ||
594 | $code =~ s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm; # expand the _rotr pseudo-op: shrp with the same register as both sources | ||
595 | if ($BITS==64) { | ||
596 | $code =~ s/mux2(\s+)\S+/nop.i$1 0x0/gm; # 64-bit flavour: every mux2 becomes a no-op | ||
597 | $code =~ s/mux1(\s+)\S+/nop.i$1 0x0/gm if ($big_endian); # big-endian: the mux1 byte reversal becomes a no-op | ||
598 | $code =~ s/(shrp\s+X\[[^=]+)=([^,]+),([^,]+),([1-9]+)/$1=$3,$2,64-$4/gm | ||
599 | if (!$big_endian); # little-endian: swap the two sources and use 64-count; [1-9]+ stops at a 0 digit, which stays in place, so 40 still reads 64-40 | ||
600 | $code =~ s/ld1(\s+)X\[\S+/nop.m$1 0x0/gm; # 64-bit flavour: byte loads into X[] become no-ops | ||
601 | } | ||
602 | |||
603 | print $code; # emit the generated assembly on STDOUT (reopened onto the output file above) | ||
604 | |||
605 | print<<___ if ($BITS==32); | ||
606 | .align 64 | ||
607 | .type K256#,\@object | ||
608 | K256: data4 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | ||
609 | data4 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | ||
610 | data4 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 | ||
611 | data4 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 | ||
612 | data4 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc | ||
613 | data4 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da | ||
614 | data4 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 | ||
615 | data4 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 | ||
616 | data4 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 | ||
617 | data4 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 | ||
618 | data4 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 | ||
619 | data4 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 | ||
620 | data4 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 | ||
621 | data4 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 | ||
622 | data4 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 | ||
623 | data4 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | ||
624 | .size K256#,$SZ*$rounds | ||
625 | stringz "SHA256 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>" | ||
626 | ___ | ||
627 | print<<___ if ($BITS==64); | ||
628 | .align 64 | ||
629 | .type K512#,\@object | ||
630 | K512: data8 0x428a2f98d728ae22,0x7137449123ef65cd | ||
631 | data8 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc | ||
632 | data8 0x3956c25bf348b538,0x59f111f1b605d019 | ||
633 | data8 0x923f82a4af194f9b,0xab1c5ed5da6d8118 | ||
634 | data8 0xd807aa98a3030242,0x12835b0145706fbe | ||
635 | data8 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 | ||
636 | data8 0x72be5d74f27b896f,0x80deb1fe3b1696b1 | ||
637 | data8 0x9bdc06a725c71235,0xc19bf174cf692694 | ||
638 | data8 0xe49b69c19ef14ad2,0xefbe4786384f25e3 | ||
639 | data8 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 | ||
640 | data8 0x2de92c6f592b0275,0x4a7484aa6ea6e483 | ||
641 | data8 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 | ||
642 | data8 0x983e5152ee66dfab,0xa831c66d2db43210 | ||
643 | data8 0xb00327c898fb213f,0xbf597fc7beef0ee4 | ||
644 | data8 0xc6e00bf33da88fc2,0xd5a79147930aa725 | ||
645 | data8 0x06ca6351e003826f,0x142929670a0e6e70 | ||
646 | data8 0x27b70a8546d22ffc,0x2e1b21385c26c926 | ||
647 | data8 0x4d2c6dfc5ac42aed,0x53380d139d95b3df | ||
648 | data8 0x650a73548baf63de,0x766a0abb3c77b2a8 | ||
649 | data8 0x81c2c92e47edaee6,0x92722c851482353b | ||
650 | data8 0xa2bfe8a14cf10364,0xa81a664bbc423001 | ||
651 | data8 0xc24b8b70d0f89791,0xc76c51a30654be30 | ||
652 | data8 0xd192e819d6ef5218,0xd69906245565a910 | ||
653 | data8 0xf40e35855771202a,0x106aa07032bbd1b8 | ||
654 | data8 0x19a4c116b8d2d0c8,0x1e376c085141ab53 | ||
655 | data8 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 | ||
656 | data8 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb | ||
657 | data8 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 | ||
658 | data8 0x748f82ee5defb2fc,0x78a5636f43172f60 | ||
659 | data8 0x84c87814a1f0ab72,0x8cc702081a6439ec | ||
660 | data8 0x90befffa23631e28,0xa4506cebde82bde9 | ||
661 | data8 0xbef9a3f7b2c67915,0xc67178f2e372532b | ||
662 | data8 0xca273eceea26619c,0xd186b8c721c0c207 | ||
663 | data8 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 | ||
664 | data8 0x06f067aa72176fba,0x0a637dc5a2c898a6 | ||
665 | data8 0x113f9804bef90dae,0x1b710b35131c471b | ||
666 | data8 0x28db77f523047d84,0x32caab7b40c72493 | ||
667 | data8 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c | ||
668 | data8 0x4cc5d4becb3e42b6,0x597f299cfc657e2a | ||
669 | data8 0x5fcb6fab3ad6faec,0x6c44198c4a475817 | ||
670 | .size K512#,$SZ*$rounds | ||
671 | stringz "SHA512 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>" | ||
672 | ___ | ||
diff --git a/src/lib/libcrypto/whrlpool/wp_block.c b/src/lib/libcrypto/whrlpool/wp_block.c index 57f5b5df7a..d8c1b89ba3 100644 --- a/src/lib/libcrypto/whrlpool/wp_block.c +++ b/src/lib/libcrypto/whrlpool/wp_block.c | |||
@@ -1,4 +1,4 @@ | |||
1 | /* $OpenBSD: wp_block.c,v 1.11 2016/09/04 13:39:48 jsing Exp $ */ | 1 | /* $OpenBSD: wp_block.c,v 1.12 2016/09/04 14:06:46 jsing Exp $ */ |
2 | /** | 2 | /** |
3 | * The Whirlpool hashing function. | 3 | * The Whirlpool hashing function. |
4 | * | 4 | * |
@@ -73,14 +73,6 @@ typedef unsigned long long u64; | |||
73 | # if defined(__x86_64) || defined(__x86_64__) | 73 | # if defined(__x86_64) || defined(__x86_64__) |
74 | # define ROTATE(a,n) ({ u64 ret; asm ("rolq %1,%0" \ | 74 | # define ROTATE(a,n) ({ u64 ret; asm ("rolq %1,%0" \ |
75 | : "=r"(ret) : "J"(n),"0"(a) : "cc"); ret; }) | 75 | : "=r"(ret) : "J"(n),"0"(a) : "cc"); ret; }) |
76 | # elif defined(__ia64) || defined(__ia64__) | ||
77 | # if BYTE_ORDER == LITTLE_ENDIAN | ||
78 | # define ROTATE(a,n) ({ u64 ret; asm ("shrp %0=%1,%1,%2" \ | ||
79 | : "=r"(ret) : "r"(a),"M"(64-(n))); ret; }) | ||
80 | # else | ||
81 | # define ROTATE(a,n) ({ u64 ret; asm ("shrp %0=%1,%1,%2" \ | ||
82 | : "=r"(ret) : "r"(a),"M"(n)); ret; }) | ||
83 | # endif | ||
84 | # endif | 76 | # endif |
85 | #endif | 77 | #endif |
86 | 78 | ||