diff options
Diffstat (limited to 'src/lib/libcrypto/modes')
| -rwxr-xr-x | src/lib/libcrypto/modes/asm/ghash-ia64.pl | 463 |
1 files changed, 0 insertions, 463 deletions
diff --git a/src/lib/libcrypto/modes/asm/ghash-ia64.pl b/src/lib/libcrypto/modes/asm/ghash-ia64.pl deleted file mode 100755 index 0354c95444..0000000000 --- a/src/lib/libcrypto/modes/asm/ghash-ia64.pl +++ /dev/null | |||
| @@ -1,463 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # March 2010 | ||
| 11 | # | ||
| 12 | # The module implements "4-bit" GCM GHASH function and underlying | ||
| 13 | # single multiplication operation in GF(2^128). "4-bit" means that it | ||
| 14 | # uses 256 bytes per-key table [+128 bytes shared table]. Streamed | ||
| 15 | # GHASH performance was measured to be 6.67 cycles per processed byte | ||
| 16 | # on Itanium 2, which is >90% better than Microsoft compiler generated | ||
| 17 | # code. To anchor to something else sha1-ia64.pl module processes one | ||
| 18 | # byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per | ||
| 19 | # byte. | ||
| 20 | |||
| 21 | # September 2010 | ||
| 22 | # | ||
| 23 | # It was originally thought that it makes less sense to implement the | ||
| 24 | # "528B" variant on Itanium 2 for the following reason. Because the number of | ||
| 25 | # functional units is naturally limited, it appeared impossible to | ||
| 26 | # implement "528B" loop in 4 cycles, only in 5. This would mean that | ||
| 27 | # theoretically performance improvement couldn't be more than 20%. | ||
| 28 | # But occasionally you prove yourself wrong:-) I figured out a way to | ||
| 29 | # fold a couple of instructions and free yet another instruction | ||
| 30 | # slot by unrolling the loop... Resulting performance is 4.45 cycles | ||
| 31 | # per processed byte and 50% better than "256B" version. On original | ||
| 32 | # Itanium performance should remain the same as the "256B" version, | ||
| 33 | # i.e. ~8.5 cycles. | ||
| 34 | |||
| 35 | $output=shift and (open STDOUT,">$output" or die "can't open $output: $!"); # first argument, if any, is the output file | ||
| 36 | |||
| 37 | if ($^O eq "hpux") { # on HP-UX use addp4 unless a 64-bit data model flag is passed | ||
| 38 | $ADDP="addp4"; | ||
| 39 | for (@ARGV) { $ADDP="add" if (/(?:\+DD|\-mlp)64/); } # alternation: exactly "+DD64" or "-mlp64", not a character class | ||
| 40 | } else { $ADDP="add"; } | ||
| 41 | for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); # explicit endianness flags win; the last one seen decides | ||
| 42 | $big_endian=0 if (/\-DL_ENDIAN/); } | ||
| 43 | if (!defined($big_endian)) # no flag given: fall back to the byte order of the machine running this script | ||
| 44 | { $big_endian=(unpack('L',pack('N',1))==1); } | ||
| 45 | |||
# loop(LABEL, MASK_INP) -- append one copy of the "4-bit" multiplication
# loop body to the global $code.
#   LABEL    - assembler label emitted at the top of the body and used as the
#              br.ctop target at the bottom.
#   MASK_INP - when true, the two instructions that reference inp/in[] are
#              predicated on p63 instead of p16/p17.
#              NOTE(review): p63 is presumably never set during the short
#              loops that use this, which would make those instructions
#              no-ops -- confirm against the pr.rot setup of each caller.
# The sub is declared with an empty prototype but is invoked as &loop(...),
# and the & call form bypasses prototype checking.
| 46 | sub loop() { | ||
| 47 | my $label=shift; | ||
| 48 | my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp | ||
| 49 | |||
| 50 | # Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e. | ||
| 51 | # in scalable manner;-) Naturally assuming data in L1 cache... | ||
| 52 | # Special note about 'dep' instruction, which is used to construct | ||
| 53 | # &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128 | ||
| 54 | # bytes boundary and lower 7 bits of its address are guaranteed to | ||
| 55 | # be zero. | ||
| 56 | $code.=<<___; | ||
| 57 | $label: | ||
| 58 | { .mfi; (p18) ld8 Hlo=[Hi[1]],-8 | ||
| 59 | (p19) dep rem=Zlo,rem_4bitp,3,4 } | ||
| 60 | { .mfi; (p19) xor Zhi=Zhi,Hhi | ||
| 61 | ($p17) xor xi[1]=xi[1],in[1] };; | ||
| 62 | { .mfi; (p18) ld8 Hhi=[Hi[1]] | ||
| 63 | (p19) shrp Zlo=Zhi,Zlo,4 } | ||
| 64 | { .mfi; (p19) ld8 rem=[rem] | ||
| 65 | (p18) and Hi[1]=mask0xf0,xi[2] };; | ||
| 66 | { .mmi; ($p16) ld1 in[0]=[inp],-1 | ||
| 67 | (p18) xor Zlo=Zlo,Hlo | ||
| 68 | (p19) shr.u Zhi=Zhi,4 } | ||
| 69 | { .mib; (p19) xor Hhi=Hhi,rem | ||
| 70 | (p18) add Hi[1]=Htbl,Hi[1] };; | ||
| 71 | |||
| 72 | { .mfi; (p18) ld8 Hlo=[Hi[1]],-8 | ||
| 73 | (p18) dep rem=Zlo,rem_4bitp,3,4 } | ||
| 74 | { .mfi; (p17) shladd Hi[0]=xi[1],4,r0 | ||
| 75 | (p18) xor Zhi=Zhi,Hhi };; | ||
| 76 | { .mfi; (p18) ld8 Hhi=[Hi[1]] | ||
| 77 | (p18) shrp Zlo=Zhi,Zlo,4 } | ||
| 78 | { .mfi; (p18) ld8 rem=[rem] | ||
| 79 | (p17) and Hi[0]=mask0xf0,Hi[0] };; | ||
| 80 | { .mmi; (p16) ld1 xi[0]=[Xi],-1 | ||
| 81 | (p18) xor Zlo=Zlo,Hlo | ||
| 82 | (p18) shr.u Zhi=Zhi,4 } | ||
| 83 | { .mib; (p18) xor Hhi=Hhi,rem | ||
| 84 | (p17) add Hi[0]=Htbl,Hi[0] | ||
| 85 | br.ctop.sptk $label };; | ||
| 86 | ___ | ||
| 87 | } | ||
| 88 | |||
# Start of the generated assembly: symbolic register aliases followed by
# gcm_gmult_4bit. The routine takes two arguments (in0 is used as the Xi
# pointer, in1 as the Htbl pointer), preloads Xi[15] and Xi[14], programs
# the loop counters (ar.lc=13, ar.ec=3) and rotating predicates, and turns the
# saved ip into the address of rem_4bit.
| 89 | $code=<<___; | ||
| 90 | .explicit | ||
| 91 | .text | ||
| 92 | |||
| 93 | prevfs=r2; prevlc=r3; prevpr=r8; | ||
| 94 | mask0xf0=r21; | ||
| 95 | rem=r22; rem_4bitp=r23; | ||
| 96 | Xi=r24; Htbl=r25; | ||
| 97 | inp=r26; end=r27; | ||
| 98 | Hhi=r28; Hlo=r29; | ||
| 99 | Zhi=r30; Zlo=r31; | ||
| 100 | |||
| 101 | .align 128 | ||
| 102 | .skip 16 // aligns loop body | ||
| 103 | .global gcm_gmult_4bit# | ||
| 104 | .proc gcm_gmult_4bit# | ||
| 105 | gcm_gmult_4bit: | ||
| 106 | .prologue | ||
| 107 | { .mmi; .save ar.pfs,prevfs | ||
| 108 | alloc prevfs=ar.pfs,2,6,0,8 | ||
| 109 | $ADDP Xi=15,in0 // &Xi[15] | ||
| 110 | mov rem_4bitp=ip } | ||
| 111 | { .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo | ||
| 112 | .save ar.lc,prevlc | ||
| 113 | mov prevlc=ar.lc | ||
| 114 | .save pr,prevpr | ||
| 115 | mov prevpr=pr };; | ||
| 116 | |||
| 117 | .body | ||
| 118 | .rotr in[3],xi[3],Hi[2] | ||
| 119 | |||
| 120 | { .mib; ld1 xi[2]=[Xi],-1 // Xi[15] | ||
| 121 | mov mask0xf0=0xf0 | ||
| 122 | brp.loop.imp .Loop1,.Lend1-16};; | ||
| 123 | { .mmi; ld1 xi[1]=[Xi],-1 // Xi[14] | ||
| 124 | };; | ||
| 125 | { .mii; shladd Hi[1]=xi[2],4,r0 | ||
| 126 | mov pr.rot=0x7<<16 | ||
| 127 | mov ar.lc=13 };; | ||
| 128 | { .mii; and Hi[1]=mask0xf0,Hi[1] | ||
| 129 | mov ar.ec=3 | ||
| 130 | xor Zlo=Zlo,Zlo };; | ||
| 131 | { .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo | ||
| 132 | add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp | ||
| 133 | xor Zhi=Zhi,Zhi };; | ||
| 134 | ___ | ||
# Emit the loop body with the inp references masked: this routine never
# initialises inp, so only Xi is consumed.
| 135 | &loop (".Loop1",1); | ||
# Epilogue: fold in the last pending xor, byte-reverse Zlo/Zhi with mux1
# (rewritten to nop.i for big-endian builds by the substitution at the end of
# this script), store both halves back through the Xi pointer, then restore
# pr and ar.lc and return.
| 136 | $code.=<<___; | ||
| 137 | .Lend1: | ||
| 138 | { .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact | ||
| 139 | { .mib; mux1 Zlo=Zlo,\@rev };; | ||
| 140 | { .mib; mux1 Zhi=Zhi,\@rev };; | ||
| 141 | { .mmi; add Hlo=9,Xi;; // ;; is here to prevent | ||
| 142 | add Hhi=1,Xi };; // pipeline flush on Itanium | ||
| 143 | { .mib; st8 [Hlo]=Zlo | ||
| 144 | mov pr=prevpr,0x1ffff };; | ||
| 145 | { .mib; st8 [Hhi]=Zhi | ||
| 146 | mov ar.lc=prevlc | ||
| 147 | br.ret.sptk.many b0 };; | ||
| 148 | .endp gcm_gmult_4bit# | ||
| 149 | ___ | ||
| 150 | |||
| 151 | ###################################################################### | ||
| 152 | # "528B" (well, "512B" actually) streamed GHASH | ||
| 153 | # | ||
| 154 | ($Xip,$Htbl,$inp,$len)=("in0","in1","in2","in3"); # the four input registers of gcm_ghash_4bit | ||
| 155 | ($rem_8bit,$mask0xff)=("loc0","loc1"); # its two local registers | ||
| 156 | if ($big_endian) { # big-endian builds need no user-mask toggling, so emit nop.m | ||
| 157 | ($sum,$rum)=("nop.m","nop.m"); | ||
| 158 | } else { | ||
| 159 | ($sum,$rum)=("sum","rum"); | ||
| 160 | } | ||
| 161 | |||
# load_htable(LINES...) -- append eight bundle pairs to the global $code that
# load the 16 Htable entries: entries 0-7 into r16-r31 through the r8/r9
# pointers, entries 8-15 into f32-f47 through r10/r11 (all post-incremented
# by 16).
#   LINES - assembly text to interleave with the loads. One string is
#           consumed per iteration, starting once the number of strings left
#           equals the number of iterations left (that is what the
#           `($i+$#_)==7` test checks), so N strings land in the last N
#           iterations. Each is spliced into the second bundle of the pair,
#           just before the closing "};;".
# The sub is declared with an empty prototype but is invoked as
# &load_htable(...), and the & call form bypasses prototype checking.
| 162 | sub load_htable() { | ||
| 163 | for (my $i=0;$i<8;$i++) { | ||
| 164 | $code.=<<___; | ||
| 165 | { .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi | ||
| 166 | ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo | ||
| 167 | { .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi | ||
| 168 | ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo | ||
| 169 | ___ | ||
| 170 | $code.=shift if (($i+$#_)==7); | ||
| 171 | $code.="\t};;\n" | ||
| 172 | } | ||
| 173 | } | ||
| 174 | |||
# gcm_ghash_4bit entry: allocate the frame (4 inputs, 2 locals), save sp in
# prevsp, capture ip so rem_8bit can be addressed later, and point r8-r11 at
# Htbl+0, +8, +128 and +136 for load_htable.
| 175 | $code.=<<___; | ||
| 176 | prevsp=r3; | ||
| 177 | |||
| 178 | .align 32 | ||
| 179 | .skip 16 // aligns loop body | ||
| 180 | .global gcm_ghash_4bit# | ||
| 181 | .proc gcm_ghash_4bit# | ||
| 182 | gcm_ghash_4bit: | ||
| 183 | .prologue | ||
| 184 | { .mmi; .save ar.pfs,prevfs | ||
| 185 | alloc prevfs=ar.pfs,4,2,0,0 | ||
| 186 | .vframe prevsp | ||
| 187 | mov prevsp=sp | ||
| 188 | mov $rem_8bit=ip };; | ||
| 189 | .body | ||
| 190 | { .mfi; $ADDP r8=0+0,$Htbl | ||
| 191 | $ADDP r9=0+8,$Htbl } | ||
| 192 | { .mfi; $ADDP r10=128+0,$Htbl | ||
| 193 | $ADDP r11=128+8,$Htbl };; | ||
| 194 | ___ | ||
# Load Htable; the eight strings are spliced one per load bundle pair. They
# set up the Xi/inp/end pointers, reserve 512 bytes of stack, clear the low
# eight bits of sp, and leave r14/r15 pointing at sp+0 and sp+8.
| 195 | &load_htable( | ||
| 196 | " $ADDP $Xip=15,$Xip", # &Xi[15] | ||
| 197 | " $ADDP $len=$len,$inp", # &inp[len] | ||
| 198 | " $ADDP $inp=15,$inp", # &inp[15] | ||
| 199 | " mov $mask0xff=0xff", | ||
| 200 | " add sp=-512,sp", | ||
| 201 | " andcm sp=sp,$mask0xff", # align stack frame | ||
| 202 | " add r14=0,sp", | ||
| 203 | " add r15=8,sp"); | ||
# r8-r11 are re-pointed at sp+256, +264, +384 and +392. The loop below stores
# Htable there and writes copies shifted right by 4 bits through r14/r15.
| 204 | $code.=<<___; | ||
| 205 | { .mmi; $sum 1<<1 // go big-endian | ||
| 206 | add r8=256+0,sp | ||
| 207 | add r9=256+8,sp } | ||
| 208 | { .mmi; add r10=256+128+0,sp | ||
| 209 | add r11=256+128+8,sp | ||
| 210 | add $len=-17,$len };; | ||
| 211 | ___ | ||
| 212 | for($i=0;$i<8;$i++) { # generate first half of Hshr4[] | ||
| 213 | my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1)); | ||
| 214 | $code.=<<___; | ||
| 215 | { .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo | ||
| 216 | st8 [r9]=$rhi,16 // Htable[$i].hi | ||
| 217 | shrp $rlo=$rhi,$rlo,4 }//;; | ||
| 218 | { .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo | ||
| 219 | stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi | ||
| 220 | shr.u $rhi=$rhi,4 };; | ||
| 221 | { .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4 | ||
| 222 | st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4 | ||
| 223 | ___ | ||
| 224 | } | ||
# Entries 8-15 were stored from floating-point registers above, so they are
# reloaded into integer registers here before being shifted.
| 225 | $code.=<<___; | ||
| 226 | { .mmi; ld8 r16=[r8],16 // Htable[8].lo | ||
| 227 | ld8 r17=[r9],16 };; // Htable[8].hi | ||
| 228 | { .mmi; ld8 r18=[r8],16 // Htable[9].lo | ||
| 229 | ld8 r19=[r9],16 } // Htable[9].hi | ||
| 230 | { .mmi; rum 1<<5 // clear um.mfh | ||
| 231 | shrp r16=r17,r16,4 };; | ||
| 232 | ___ | ||
| 233 | for($i=0;$i<6;$i++) { # generate second half of Hshr4[] | ||
| 234 | $code.=<<___; | ||
| 235 | { .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo | ||
| 236 | ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi | ||
| 237 | shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };; | ||
| 238 | { .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4 | ||
| 239 | st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4 | ||
| 240 | shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 } | ||
| 241 | ___ | ||
| 242 | } | ||
# Tail of the second half: $i is 6 here (the value the loop above exited
# with), so the backtick expressions address the last two entries. $Htbl is
# re-pointed at the stack copy (sp+256) and $rem_8bit becomes the address of
# the rem_8bit table.
| 243 | $code.=<<___; | ||
| 244 | { .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };; | ||
| 245 | { .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4 | ||
| 246 | st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4 | ||
| 247 | shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 } | ||
| 248 | { .mmi; add $Htbl=256,sp // &Htable[0] | ||
| 249 | add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit | ||
| 250 | shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };; | ||
| 251 | { .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4 | ||
| 252 | st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4 | ||
| 253 | ___ | ||
| 254 | |||
# Register names for the hand-unrolled main loop of gcm_ghash_4bit.
# @xi and @rem each hold two register names that are swapped after every
# emitted fragment (the push/shift pairs below), so $xi[1]/$rem[1] in one
# fragment name the registers the previous fragment called $xi[0]/$rem[0].
# The (pNN) tags in the comments label software-pipeline stages; the
# instructions themselves are not predicated on them.
| 255 | $in="r15"; | ||
| 256 | @xi=("r16","r17"); | ||
| 257 | @rem=("r18","r19"); | ||
| 258 | ($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25"); | ||
| 259 | ($Atbl,$Btbl)=("r26","r27"); | ||
| 260 | |||
| 261 | $code.=<<___; # (p16) | ||
| 262 | { .mmi; ld1 $in=[$inp],-1 //(p16) *inp-- | ||
| 263 | ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- | ||
| 264 | cmp.eq p0,p6=r0,r0 };; // clear p6 | ||
| 265 | ___ | ||
| 266 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
| 267 | |||
| 268 | $code.=<<___; # (p16),(p17) | ||
| 269 | { .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- | ||
| 270 | xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] | ||
| 271 | { .mii; ld1 $in=[$inp],-1 //(p16) *inp-- | ||
| 272 | dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo | ||
| 273 | and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 | ||
| 274 | .align 32 | ||
| 275 | .LOOP: | ||
| 276 | { .mmi; | ||
| 277 | (p6) st8 [$Xip]=$Zhi,13 | ||
| 278 | xor $Zlo=$Zlo,$Zlo | ||
| 279 | add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo | ||
| 280 | ___ | ||
| 281 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
| 282 | |||
| 283 | $code.=<<___; # (p16),(p17),(p18) | ||
| 284 | { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi | ||
| 285 | ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo | ||
| 286 | xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] | ||
| 287 | { .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi | ||
| 288 | dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo | ||
| 289 | { .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 | ||
| 290 | xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo | ||
| 291 | { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi | ||
| 292 | ld1 $in=[$inp],-1 } //(p16) *inp-- | ||
| 293 | { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) | ||
| 294 | mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi | ||
| 295 | and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 | ||
| 296 | { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi | ||
| 297 | ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- | ||
| 298 | shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) | ||
| 299 | { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff | ||
| 300 | add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] | ||
| 301 | ___ | ||
| 302 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
| 303 | |||
# Steady state: thirteen copies of the full four-stage fragment, with the
# register names swapped between copies.
| 304 | for ($i=1;$i<14;$i++) { | ||
| 305 | # Above and below fragments are derived from this one by removing | ||
| 306 | # unsuitable (p??) instructions. | ||
| 307 | $code.=<<___; # (p16),(p17),(p18),(p19) | ||
| 308 | { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi | ||
| 309 | ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo | ||
| 310 | shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 | ||
| 311 | { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] | ||
| 312 | xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo | ||
| 313 | xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] | ||
| 314 | { .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi | ||
| 315 | ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] | ||
| 316 | dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo | ||
| 317 | { .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 | ||
| 318 | xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo | ||
| 319 | xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi | ||
| 320 | { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi | ||
| 321 | ld1 $in=[$inp],-1 //(p16) *inp-- | ||
| 322 | shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 | ||
| 323 | { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) | ||
| 324 | xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi | ||
| 325 | and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 | ||
| 326 | { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi | ||
| 327 | ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- | ||
| 328 | shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) | ||
| 329 | { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff | ||
| 330 | xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 | ||
| 331 | add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] | ||
| 332 | ___ | ||
| 333 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
| 334 | } | ||
| 335 | |||
| 336 | $code.=<<___; # (p17),(p18),(p19) | ||
| 337 | { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi | ||
| 338 | ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo | ||
| 339 | shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 | ||
| 340 | { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] | ||
| 341 | xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo | ||
| 342 | xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] | ||
| 343 | { .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi | ||
| 344 | ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] | ||
| 345 | dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo | ||
| 346 | { .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 | ||
| 347 | xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo | ||
| 348 | xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi | ||
| 349 | { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi | ||
| 350 | shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 | ||
| 351 | { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) | ||
| 352 | xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi | ||
| 353 | and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 | ||
| 354 | { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi | ||
| 355 | shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) | ||
| 356 | { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff | ||
| 357 | xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 | ||
| 358 | add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] | ||
| 359 | ___ | ||
| 360 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
| 361 | |||
| 362 | $code.=<<___; # (p18),(p19) | ||
| 363 | { .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi | ||
| 364 | shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 | ||
| 365 | { .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] | ||
| 366 | xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo | ||
| 367 | { .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi | ||
| 368 | xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo | ||
| 369 | { .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] | ||
| 370 | xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi | ||
| 371 | { .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi | ||
| 372 | shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 | ||
| 373 | { .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4 | ||
| 374 | xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi | ||
| 375 | { .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi | ||
| 376 | shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4) | ||
| 377 | { .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff | ||
| 378 | xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48 | ||
| 379 | ___ | ||
| 380 | push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers | ||
| 381 | |||
# Final fragment: drain the last stage and store Z.lo. p6 is set while inp is
# still below the adjusted end pointer; in that case the first instructions of
# the next input block are issued here and control branches back to .LOOP
# (where the pending Z.hi store is also predicated on p6). Otherwise Z.hi is
# stored, sp is restored from prevsp and the routine returns.
| 382 | $code.=<<___; # (p19) | ||
| 383 | { .mmi; cmp.ltu p6,p0=$inp,$len | ||
| 384 | add $inp=32,$inp | ||
| 385 | shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4 | ||
| 386 | { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] | ||
| 387 | xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo | ||
| 388 | add $Xip=9,$Xip };; // &Xi.lo | ||
| 389 | { .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] | ||
| 390 | (p6) ld1 $in=[$inp],-1 //[p16] *inp-- | ||
| 391 | (p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14] | ||
| 392 | { .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi | ||
| 393 | (p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15] | ||
| 394 | { .mmi; st8 [$Xip]=$Zlo,-8 | ||
| 395 | (p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i] | ||
| 396 | shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48 | ||
| 397 | { .mmi; | ||
| 398 | (p6) ld1 $in=[$inp],-1 //[p16] *inp-- | ||
| 399 | xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 | ||
| 400 | (p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo | ||
| 401 | { .mib; | ||
| 402 | (p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0 | ||
| 403 | (p6) br.cond.dptk.many .LOOP };; | ||
| 404 | |||
| 405 | { .mib; st8 [$Xip]=$Zhi };; | ||
| 406 | { .mib; $rum 1<<1 // return to little-endian | ||
| 407 | .restore sp | ||
| 408 | mov sp=prevsp | ||
| 409 | br.ret.sptk.many b0 };; | ||
| 410 | .endp gcm_ghash_4bit# | ||
| 411 | ___ | ||
# Constant tables appended after the code:
#   rem_4bit - 16 data8 entries, each a 16-bit constant shifted left by 48
#              (128 bytes, aligned to 128 bytes so the 'dep' addressing trick
#              described in loop() works);
#   rem_8bit - 256 two-byte entries written as data1 byte pairs (512 bytes);
# followed by an identification string.
| 412 | $code.=<<___; | ||
| 413 | .align 128 | ||
| 414 | .type rem_4bit#,\@object | ||
| 415 | rem_4bit: | ||
| 416 | data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48 | ||
| 417 | data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48 | ||
| 418 | data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48 | ||
| 419 | data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48 | ||
| 420 | .size rem_4bit#,128 | ||
| 421 | .type rem_8bit#,\@object | ||
| 422 | rem_8bit: | ||
| 423 | data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E | ||
| 424 | data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E | ||
| 425 | data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E | ||
| 426 | data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E | ||
| 427 | data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E | ||
| 428 | data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E | ||
| 429 | data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E | ||
| 430 | data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E | ||
| 431 | data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE | ||
| 432 | data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE | ||
| 433 | data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE | ||
| 434 | data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE | ||
| 435 | data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E | ||
| 436 | data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E | ||
| 437 | data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE | ||
| 438 | data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE | ||
| 439 | data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E | ||
| 440 | data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E | ||
| 441 | data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E | ||
| 442 | data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E | ||
| 443 | data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E | ||
| 444 | data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E | ||
| 445 | data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E | ||
| 446 | data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E | ||
| 447 | data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE | ||
| 448 | data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE | ||
| 449 | data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE | ||
| 450 | data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE | ||
| 451 | data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E | ||
| 452 | data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E | ||
| 453 | data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE | ||
| 454 | data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE | ||
| 455 | .size rem_8bit#,512 | ||
| 456 | stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 457 | ___ | ||
| 458 | |||
| 459 | $code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian); # big-endian builds need no byte reversal: turn each mux1 ...,@rev into a nop.i | ||
| 460 | $code =~ s/\`([^\`]*)\`/eval $1/gem; # replace every `expr` with the value of the Perl expression | ||
| 461 | |||
| 462 | print $code; | ||
| 463 | close STDOUT or die "error closing STDOUT: $!"; # buffered write errors only surface at close; fail loudly rather than leave a truncated file | ||
