diff options
| author | djm <> | 2012-10-13 21:23:50 +0000 |
|---|---|---|
| committer | djm <> | 2012-10-13 21:23:50 +0000 |
| commit | d56dbc3c72494d4b68c03f5bcc3ae1f9df7b17df (patch) | |
| tree | 10ebe51c3542099b0ab8325d8f322372375dc3b4 /src/lib/libcrypto/rc4 | |
| parent | bc685bd401e5657f7fb51b4e1a62a7a5c5ea4098 (diff) | |
| parent | 228cae30b117c2493f69ad3c195341cd6ec8d430 (diff) | |
| download | openbsd-d56dbc3c72494d4b68c03f5bcc3ae1f9df7b17df.tar.gz openbsd-d56dbc3c72494d4b68c03f5bcc3ae1f9df7b17df.tar.bz2 openbsd-d56dbc3c72494d4b68c03f5bcc3ae1f9df7b17df.zip | |
This commit was generated by cvs2git to track changes on a CVS vendor
branch.
Diffstat (limited to 'src/lib/libcrypto/rc4')
| -rw-r--r-- | src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl | 631 | ||||
| -rw-r--r-- | src/lib/libcrypto/rc4/asm/rc4-parisc.pl | 313 | ||||
| -rw-r--r-- | src/lib/libcrypto/rc4/asm/rc4-s390x.pl | 47 |
3 files changed, 982 insertions, 9 deletions
diff --git a/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl new file mode 100644 index 0000000000..7f684092d4 --- /dev/null +++ b/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl | |||
| @@ -0,0 +1,631 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # June 2011 | ||
| 11 | # | ||
| 12 | # This is RC4+MD5 "stitch" implementation. The idea, as spelled out in | ||
| 13 | # http://download.intel.com/design/intarch/papers/323686.pdf, is that | ||
| 14 | # since both algorithms exhibit instruction-level parallelism, ILP, | ||
| 15 | # below theoretical maximum, interleaving them would allow to utilize | ||
| 16 | # processor resources better and achieve better performance. RC4 | ||
| 17 | # instruction sequence is virtually identical to rc4-x86_64.pl, which | ||
| 18 | # is heavily based on submission by Maxim Perminov, Maxim Locktyukhin | ||
| 19 | # and Jim Guilford of Intel. MD5 is fresh implementation aiming to | ||
| 20 | # minimize register usage, which was used as "main thread" with RC4 | ||
| 21 | # weaved into it, one RC4 round per one MD5 round. In addition to the | ||
| 22 | # stitched subroutine the script can generate standalone replacement | ||
| 23 | # md5_block_asm_data_order and RC4. Below are performance numbers in | ||
| 24 | # cycles per processed byte, less is better, for the standalone | ||
| 25 | # subroutines, sum of them, and stitched one: | ||
| 26 | # | ||
| 27 | # RC4 MD5 RC4+MD5 stitch gain | ||
| 28 | # Opteron 6.5(*) 5.4 11.9 7.0 +70%(*) | ||
| 29 | # Core2 6.5 5.8 12.3 7.7 +60% | ||
| 30 | # Westmere 4.3 5.2 9.5 7.0 +36% | ||
| 31 | # Sandy Bridge 4.2 5.5 9.7 6.8 +43% | ||
| 32 | # Atom 9.3 6.5 15.8 11.1 +42% | ||
| 33 | # | ||
| 34 | # (*) rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement | ||
| 35 | # is +53%... | ||
| 36 | |||
| 37 | my ($rc4,$md5)=(1,1); # what to generate? | ||
| 38 | my $D=$md5?undef:"#"; # if set to "#", MD5 is stitched into RC4(), | ||
| 39 | # but its result is discarded. Idea here is | ||
| 40 | # to be able to use 'openssl speed rc4' for | ||
| 41 | # benchmarking the stitched subroutine... Set via ?: since "my ... if" is undefined behaviour (perlsyn). | ||
| 42 | |||
| 43 | my $flavour = shift; | ||
| 44 | my $output = shift; | ||
| 45 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } # lone argument containing a dot is the output file | ||
| 46 | # Win64 is selected by an nasm/masm/mingw64 flavour or by a .asm output name | ||
| 47 | my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
| 48 | # look for x86_64-xlate.pl beside this script, then in ../../perlasm | ||
| 49 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate; | ||
| 50 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
| 51 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
| 52 | die "can't locate x86_64-xlate.pl"; | ||
| 53 | # all further output is piped through the translator; die rather than silently lose it if the pipe cannot be opened | ||
| 54 | open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!"; | ||
| 55 | |||
| 56 | my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs); | ||
| 57 | |||
| 58 | if ($rc4 && !$md5) { | ||
| 59 | ($dat,$len,$in0,$out) = ("%rdi","%rsi","%rdx","%rcx"); | ||
| 60 | $func="RC4"; $nargs=4; | ||
| 61 | } elsif ($md5 && !$rc4) { | ||
| 62 | ($ctx,$inp,$len) = ("%rdi","%rsi","%rdx"); | ||
| 63 | $func="md5_block_asm_data_order"; $nargs=3; | ||
| 64 | } else { | ||
| 65 | ($dat,$in0,$out,$ctx,$inp,$len) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); | ||
| 66 | $func="rc4_md5_enc"; $nargs=6; | ||
| 67 | # void rc4_md5_enc( | ||
| 68 | # RC4_KEY *key, # | ||
| 69 | # const void *in0, # RC4 input | ||
| 70 | # void *out, # RC4 output | ||
| 71 | # MD5_CTX *ctx, # | ||
| 72 | # const void *inp, # MD5 input | ||
| 73 | # size_t len); # number of 64-byte blocks | ||
| 74 | } | ||
| 75 | |||
| 76 | my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee, | ||
| 77 | 0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501, | ||
| 78 | 0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be, | ||
| 79 | 0x6b901122,0xfd987193,0xa679438e,0x49b40821, | ||
| 80 | |||
| 81 | 0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa, | ||
| 82 | 0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8, | ||
| 83 | 0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed, | ||
| 84 | 0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a, | ||
| 85 | |||
| 86 | 0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c, | ||
| 87 | 0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70, | ||
| 88 | 0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05, | ||
| 89 | 0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665, | ||
| 90 | |||
| 91 | 0xf4292244,0x432aff97,0xab9423a7,0xfc93a039, | ||
| 92 | 0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1, | ||
| 93 | 0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1, | ||
| 94 | 0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391 ); | ||
| 95 | |||
| 96 | my @V=("%r8d","%r9d","%r10d","%r11d"); # MD5 registers | ||
| 97 | my $tmp="%r12d"; | ||
| 98 | |||
| 99 | my @XX=("%rbp","%rsi"); # RC4 registers | ||
| 100 | my @TX=("%rax","%rbx"); | ||
| 101 | my $YY="%rcx"; | ||
| 102 | my $TY="%rdx"; | ||
| 103 | |||
| 104 | my $MOD=32; # 16, 32 or 64 | ||
| 105 | |||
| 106 | $code.=<<___; | ||
| 107 | .text | ||
| 108 | .align 16 | ||
| 109 | |||
| 110 | .globl $func | ||
| 111 | .type $func,\@function,$nargs | ||
| 112 | $func: | ||
| 113 | cmp \$0,$len | ||
| 114 | je .Labort | ||
| 115 | push %rbx | ||
| 116 | push %rbp | ||
| 117 | push %r12 | ||
| 118 | push %r13 | ||
| 119 | push %r14 | ||
| 120 | push %r15 | ||
| 121 | sub \$40,%rsp | ||
| 122 | .Lbody: | ||
| 123 | ___ | ||
| 124 | if ($rc4) { | ||
| 125 | $code.=<<___; | ||
| 126 | $D#md5# mov $ctx,%r11 # reassign arguments | ||
| 127 | mov $len,%r12 | ||
| 128 | mov $in0,%r13 | ||
| 129 | mov $out,%r14 | ||
| 130 | $D#md5# mov $inp,%r15 | ||
| 131 | ___ | ||
| 132 | $ctx="%r11" if ($md5); # reassign arguments | ||
| 133 | $len="%r12"; | ||
| 134 | $in0="%r13"; | ||
| 135 | $out="%r14"; | ||
| 136 | $inp="%r15" if ($md5); | ||
| 137 | $inp=$in0 if (!$md5); | ||
| 138 | $code.=<<___; | ||
| 139 | xor $XX[0],$XX[0] | ||
| 140 | xor $YY,$YY | ||
| 141 | |||
| 142 | lea 8($dat),$dat | ||
| 143 | mov -8($dat),$XX[0]#b | ||
| 144 | mov -4($dat),$YY#b | ||
| 145 | |||
| 146 | inc $XX[0]#b | ||
| 147 | sub $in0,$out | ||
| 148 | movl ($dat,$XX[0],4),$TX[0]#d | ||
| 149 | ___ | ||
| 150 | $code.=<<___ if (!$md5); | ||
| 151 | xor $TX[1],$TX[1] | ||
| 152 | test \$-128,$len | ||
| 153 | jz .Loop1 | ||
| 154 | sub $XX[0],$TX[1] | ||
| 155 | and \$`$MOD-1`,$TX[1] | ||
| 156 | jz .Loop${MOD}_is_hot | ||
| 157 | sub $TX[1],$len | ||
| 158 | .Loop${MOD}_warmup: | ||
| 159 | add $TX[0]#b,$YY#b | ||
| 160 | movl ($dat,$YY,4),$TY#d | ||
| 161 | movl $TX[0]#d,($dat,$YY,4) | ||
| 162 | movl $TY#d,($dat,$XX[0],4) | ||
| 163 | add $TY#b,$TX[0]#b | ||
| 164 | inc $XX[0]#b | ||
| 165 | movl ($dat,$TX[0],4),$TY#d | ||
| 166 | movl ($dat,$XX[0],4),$TX[0]#d | ||
| 167 | xorb ($in0),$TY#b | ||
| 168 | movb $TY#b,($out,$in0) | ||
| 169 | lea 1($in0),$in0 | ||
| 170 | dec $TX[1] | ||
| 171 | jnz .Loop${MOD}_warmup | ||
| 172 | |||
| 173 | mov $YY,$TX[1] | ||
| 174 | xor $YY,$YY | ||
| 175 | mov $TX[1]#b,$YY#b | ||
| 176 | |||
| 177 | .Loop${MOD}_is_hot: | ||
| 178 | mov $len,32(%rsp) # save original $len | ||
| 179 | shr \$6,$len # number of 64-byte blocks | ||
| 180 | ___ | ||
| 181 | if ($D && !$md5) { # stitch in dummy MD5 | ||
| 182 | $md5=1; | ||
| 183 | $ctx="%r11"; | ||
| 184 | $inp="%r15"; | ||
| 185 | $code.=<<___; | ||
| 186 | mov %rsp,$ctx | ||
| 187 | mov $in0,$inp | ||
| 188 | ___ | ||
| 189 | } | ||
| 190 | } | ||
| 191 | $code.=<<___; | ||
| 192 | #rc4# add $TX[0]#b,$YY#b | ||
| 193 | #rc4# lea ($dat,$XX[0],4),$XX[1] | ||
| 194 | shl \$6,$len | ||
| 195 | add $inp,$len # pointer to the end of input | ||
| 196 | mov $len,16(%rsp) | ||
| 197 | |||
| 198 | #md5# mov $ctx,24(%rsp) # save pointer to MD5_CTX | ||
| 199 | #md5# mov 0*4($ctx),$V[0] # load current hash value from MD5_CTX | ||
| 200 | #md5# mov 1*4($ctx),$V[1] | ||
| 201 | #md5# mov 2*4($ctx),$V[2] | ||
| 202 | #md5# mov 3*4($ctx),$V[3] | ||
| 203 | jmp .Loop | ||
| 204 | |||
| 205 | .align 16 | ||
| 206 | .Loop: | ||
| 207 | #md5# mov $V[0],0*4(%rsp) # put aside current hash value | ||
| 208 | #md5# mov $V[1],1*4(%rsp) | ||
| 209 | #md5# mov $V[2],2*4(%rsp) | ||
| 210 | #md5# mov $V[3],$tmp # forward reference | ||
| 211 | #md5# mov $V[3],3*4(%rsp) | ||
| 212 | ___ | ||
| 213 | |||
| 214 | sub R0 { | ||
| 215 | my ($i,$a,$b,$c,$d)=@_; | ||
| 216 | my @rot0=(7,12,17,22); | ||
| 217 | my $j=$i%16; | ||
| 218 | my $k=$i%$MOD; | ||
| 219 | my $xmm="%xmm".($j&1); | ||
| 220 | $code.=" movdqu ($in0),%xmm2\n" if ($rc4 && $j==15); | ||
| 221 | $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 222 | $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); | ||
| 223 | $code.=<<___; | ||
| 224 | #rc4# movl ($dat,$YY,4),$TY#d | ||
| 225 | #md5# xor $c,$tmp | ||
| 226 | #rc4# movl $TX[0]#d,($dat,$YY,4) | ||
| 227 | #md5# and $b,$tmp | ||
| 228 | #md5# add 4*`$j`($inp),$a | ||
| 229 | #rc4# add $TY#b,$TX[0]#b | ||
| 230 | #rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d | ||
| 231 | #md5# add \$$K[$i],$a | ||
| 232 | #md5# xor $d,$tmp | ||
| 233 | #rc4# movz $TX[0]#b,$TX[0]#d | ||
| 234 | #rc4# movl $TY#d,4*$k($XX[1]) | ||
| 235 | #md5# add $tmp,$a | ||
| 236 | #rc4# add $TX[1]#b,$YY#b | ||
| 237 | #md5# rol \$$rot0[$j%4],$a | ||
| 238 | #md5# mov `$j==15?"$b":"$c"`,$tmp # forward reference | ||
| 239 | #rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n | ||
| 240 | #md5# add $b,$a | ||
| 241 | ___ | ||
| 242 | $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 243 | mov $YY,$XX[1] | ||
| 244 | xor $YY,$YY # keyword to partial register | ||
| 245 | mov $XX[1]#b,$YY#b | ||
| 246 | lea ($dat,$XX[0],4),$XX[1] | ||
| 247 | ___ | ||
| 248 | $code.=<<___ if ($rc4 && $j==15); | ||
| 249 | psllq \$8,%xmm1 | ||
| 250 | pxor %xmm0,%xmm2 | ||
| 251 | pxor %xmm1,%xmm2 | ||
| 252 | ___ | ||
| 253 | } | ||
| 254 | sub R1 { | ||
| 255 | my ($i,$a,$b,$c,$d)=@_; | ||
| 256 | my @rot1=(5,9,14,20); | ||
| 257 | my $j=$i%16; | ||
| 258 | my $k=$i%$MOD; | ||
| 259 | my $xmm="%xmm".($j&1); | ||
| 260 | $code.=" movdqu 16($in0),%xmm3\n" if ($rc4 && $j==15); | ||
| 261 | $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 262 | $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); | ||
| 263 | $code.=<<___; | ||
| 264 | #rc4# movl ($dat,$YY,4),$TY#d | ||
| 265 | #md5# xor $b,$tmp | ||
| 266 | #rc4# movl $TX[0]#d,($dat,$YY,4) | ||
| 267 | #md5# and $d,$tmp | ||
| 268 | #md5# add 4*`((1+5*$j)%16)`($inp),$a | ||
| 269 | #rc4# add $TY#b,$TX[0]#b | ||
| 270 | #rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d | ||
| 271 | #md5# add \$$K[$i],$a | ||
| 272 | #md5# xor $c,$tmp | ||
| 273 | #rc4# movz $TX[0]#b,$TX[0]#d | ||
| 274 | #rc4# movl $TY#d,4*$k($XX[1]) | ||
| 275 | #md5# add $tmp,$a | ||
| 276 | #rc4# add $TX[1]#b,$YY#b | ||
| 277 | #md5# rol \$$rot1[$j%4],$a | ||
| 278 | #md5# mov `$j==15?"$c":"$b"`,$tmp # forward reference | ||
| 279 | #rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n | ||
| 280 | #md5# add $b,$a | ||
| 281 | ___ | ||
| 282 | $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 283 | mov $YY,$XX[1] | ||
| 284 | xor $YY,$YY # keyword to partial register | ||
| 285 | mov $XX[1]#b,$YY#b | ||
| 286 | lea ($dat,$XX[0],4),$XX[1] | ||
| 287 | ___ | ||
| 288 | $code.=<<___ if ($rc4 && $j==15); | ||
| 289 | psllq \$8,%xmm1 | ||
| 290 | pxor %xmm0,%xmm3 | ||
| 291 | pxor %xmm1,%xmm3 | ||
| 292 | ___ | ||
| 293 | } | ||
| 294 | sub R2 { | ||
| 295 | my ($i,$a,$b,$c,$d)=@_; | ||
| 296 | my @rot2=(4,11,16,23); | ||
| 297 | my $j=$i%16; | ||
| 298 | my $k=$i%$MOD; | ||
| 299 | my $xmm="%xmm".($j&1); | ||
| 300 | $code.=" movdqu 32($in0),%xmm4\n" if ($rc4 && $j==15); | ||
| 301 | $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 302 | $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); | ||
| 303 | $code.=<<___; | ||
| 304 | #rc4# movl ($dat,$YY,4),$TY#d | ||
| 305 | #md5# xor $c,$tmp | ||
| 306 | #rc4# movl $TX[0]#d,($dat,$YY,4) | ||
| 307 | #md5# xor $b,$tmp | ||
| 308 | #md5# add 4*`((5+3*$j)%16)`($inp),$a | ||
| 309 | #rc4# add $TY#b,$TX[0]#b | ||
| 310 | #rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d | ||
| 311 | #md5# add \$$K[$i],$a | ||
| 312 | #rc4# movz $TX[0]#b,$TX[0]#d | ||
| 313 | #md5# add $tmp,$a | ||
| 314 | #rc4# movl $TY#d,4*$k($XX[1]) | ||
| 315 | #rc4# add $TX[1]#b,$YY#b | ||
| 316 | #md5# rol \$$rot2[$j%4],$a | ||
| 317 | #md5# mov `$j==15?"\\\$-1":"$c"`,$tmp # forward reference | ||
| 318 | #rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n | ||
| 319 | #md5# add $b,$a | ||
| 320 | ___ | ||
| 321 | $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 322 | mov $YY,$XX[1] | ||
| 323 | xor $YY,$YY # keyword to partial register | ||
| 324 | mov $XX[1]#b,$YY#b | ||
| 325 | lea ($dat,$XX[0],4),$XX[1] | ||
| 326 | ___ | ||
| 327 | $code.=<<___ if ($rc4 && $j==15); | ||
| 328 | psllq \$8,%xmm1 | ||
| 329 | pxor %xmm0,%xmm4 | ||
| 330 | pxor %xmm1,%xmm4 | ||
| 331 | ___ | ||
| 332 | } | ||
| 333 | sub R3 { | ||
| 334 | my ($i,$a,$b,$c,$d)=@_; | ||
| 335 | my @rot3=(6,10,15,21); | ||
| 336 | my $j=$i%16; | ||
| 337 | my $k=$i%$MOD; | ||
| 338 | my $xmm="%xmm".($j&1); | ||
| 339 | $code.=" movdqu 48($in0),%xmm5\n" if ($rc4 && $j==15); | ||
| 340 | $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); | ||
| 341 | $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); | ||
| 342 | $code.=<<___; | ||
| 343 | #rc4# movl ($dat,$YY,4),$TY#d | ||
| 344 | #md5# xor $d,$tmp | ||
| 345 | #rc4# movl $TX[0]#d,($dat,$YY,4) | ||
| 346 | #md5# or $b,$tmp | ||
| 347 | #md5# add 4*`((7*$j)%16)`($inp),$a | ||
| 348 | #rc4# add $TY#b,$TX[0]#b | ||
| 349 | #rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d | ||
| 350 | #md5# add \$$K[$i],$a | ||
| 351 | #rc4# movz $TX[0]#b,$TX[0]#d | ||
| 352 | #md5# xor $c,$tmp | ||
| 353 | #rc4# movl $TY#d,4*$k($XX[1]) | ||
| 354 | #md5# add $tmp,$a | ||
| 355 | #rc4# add $TX[1]#b,$YY#b | ||
| 356 | #md5# rol \$$rot3[$j%4],$a | ||
| 357 | #md5# mov \$-1,$tmp # forward reference | ||
| 358 | #rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n | ||
| 359 | #md5# add $b,$a | ||
| 360 | ___ | ||
| 361 | $code.=<<___ if ($rc4 && $j==15); | ||
| 362 | mov $XX[0],$XX[1] | ||
| 363 | xor $XX[0],$XX[0] # keyword to partial register | ||
| 364 | mov $XX[1]#b,$XX[0]#b | ||
| 365 | mov $YY,$XX[1] | ||
| 366 | xor $YY,$YY # keyword to partial register | ||
| 367 | mov $XX[1]#b,$YY#b | ||
| 368 | lea ($dat,$XX[0],4),$XX[1] | ||
| 369 | psllq \$8,%xmm1 | ||
| 370 | pxor %xmm0,%xmm5 | ||
| 371 | pxor %xmm1,%xmm5 | ||
| 372 | ___ | ||
| 373 | } | ||
| 374 | |||
| 375 | my $i=0; | ||
| 376 | for(;$i<16;$i++) { R0($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } | ||
| 377 | for(;$i<32;$i++) { R1($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } | ||
| 378 | for(;$i<48;$i++) { R2($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } | ||
| 379 | for(;$i<64;$i++) { R3($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } | ||
| 380 | |||
| 381 | $code.=<<___; | ||
| 382 | #md5# add 0*4(%rsp),$V[0] # accumulate hash value | ||
| 383 | #md5# add 1*4(%rsp),$V[1] | ||
| 384 | #md5# add 2*4(%rsp),$V[2] | ||
| 385 | #md5# add 3*4(%rsp),$V[3] | ||
| 386 | |||
| 387 | #rc4# movdqu %xmm2,($out,$in0) # write RC4 output | ||
| 388 | #rc4# movdqu %xmm3,16($out,$in0) | ||
| 389 | #rc4# movdqu %xmm4,32($out,$in0) | ||
| 390 | #rc4# movdqu %xmm5,48($out,$in0) | ||
| 391 | #md5# lea 64($inp),$inp | ||
| 392 | #rc4# lea 64($in0),$in0 | ||
| 393 | cmp 16(%rsp),$inp # are we done? | ||
| 394 | jb .Loop | ||
| 395 | |||
| 396 | #md5# mov 24(%rsp),$len # restore pointer to MD5_CTX | ||
| 397 | #rc4# sub $TX[0]#b,$YY#b # correct $YY | ||
| 398 | #md5# mov $V[0],0*4($len) # write MD5_CTX | ||
| 399 | #md5# mov $V[1],1*4($len) | ||
| 400 | #md5# mov $V[2],2*4($len) | ||
| 401 | #md5# mov $V[3],3*4($len) | ||
| 402 | ___ | ||
| 403 | $code.=<<___ if ($rc4 && (!$md5 || $D)); | ||
| 404 | mov 32(%rsp),$len # restore original $len | ||
| 405 | and \$63,$len # remaining bytes | ||
| 406 | jnz .Loop1 | ||
| 407 | jmp .Ldone | ||
| 408 | |||
| 409 | .align 16 | ||
| 410 | .Loop1: | ||
| 411 | add $TX[0]#b,$YY#b | ||
| 412 | movl ($dat,$YY,4),$TY#d | ||
| 413 | movl $TX[0]#d,($dat,$YY,4) | ||
| 414 | movl $TY#d,($dat,$XX[0],4) | ||
| 415 | add $TY#b,$TX[0]#b | ||
| 416 | inc $XX[0]#b | ||
| 417 | movl ($dat,$TX[0],4),$TY#d | ||
| 418 | movl ($dat,$XX[0],4),$TX[0]#d | ||
| 419 | xorb ($in0),$TY#b | ||
| 420 | movb $TY#b,($out,$in0) | ||
| 421 | lea 1($in0),$in0 | ||
| 422 | dec $len | ||
| 423 | jnz .Loop1 | ||
| 424 | |||
| 425 | .Ldone: | ||
| 426 | ___ | ||
| 427 | $code.=<<___; | ||
| 428 | #rc4# sub \$1,$XX[0]#b | ||
| 429 | #rc4# movl $XX[0]#d,-8($dat) | ||
| 430 | #rc4# movl $YY#d,-4($dat) | ||
| 431 | |||
| 432 | mov 40(%rsp),%r15 | ||
| 433 | mov 48(%rsp),%r14 | ||
| 434 | mov 56(%rsp),%r13 | ||
| 435 | mov 64(%rsp),%r12 | ||
| 436 | mov 72(%rsp),%rbp | ||
| 437 | mov 80(%rsp),%rbx | ||
| 438 | lea 88(%rsp),%rsp | ||
| 439 | .Lepilogue: | ||
| 440 | .Labort: | ||
| 441 | ret | ||
| 442 | .size $func,.-$func | ||
| 443 | ___ | ||
| 444 | |||
| 445 | if ($rc4 && $D) { # sole purpose of this section is to provide | ||
| 446 | # option to use the generated module as drop-in | ||
| 447 | # replacement for rc4-x86_64.pl for debugging | ||
| 448 | # and testing purposes... | ||
| 449 | my ($idx,$ido)=("%r8","%r9"); | ||
| 450 | my ($dat,$len,$inp)=("%rdi","%rsi","%rdx"); | ||
| 451 | |||
| 452 | $code.=<<___; | ||
| 453 | .globl RC4_set_key | ||
| 454 | .type RC4_set_key,\@function,3 | ||
| 455 | .align 16 | ||
| 456 | RC4_set_key: | ||
| 457 | lea 8($dat),$dat | ||
| 458 | lea ($inp,$len),$inp | ||
| 459 | neg $len | ||
| 460 | mov $len,%rcx | ||
| 461 | xor %eax,%eax | ||
| 462 | xor $ido,$ido | ||
| 463 | xor %r10,%r10 | ||
| 464 | xor %r11,%r11 | ||
| 465 | jmp .Lw1stloop | ||
| 466 | |||
| 467 | .align 16 | ||
| 468 | .Lw1stloop: | ||
| 469 | mov %eax,($dat,%rax,4) | ||
| 470 | add \$1,%al | ||
| 471 | jnc .Lw1stloop | ||
| 472 | |||
| 473 | xor $ido,$ido | ||
| 474 | xor $idx,$idx | ||
| 475 | .align 16 | ||
| 476 | .Lw2ndloop: | ||
| 477 | mov ($dat,$ido,4),%r10d | ||
| 478 | add ($inp,$len,1),$idx#b | ||
| 479 | add %r10b,$idx#b | ||
| 480 | add \$1,$len | ||
| 481 | mov ($dat,$idx,4),%r11d | ||
| 482 | cmovz %rcx,$len | ||
| 483 | mov %r10d,($dat,$idx,4) | ||
| 484 | mov %r11d,($dat,$ido,4) | ||
| 485 | add \$1,$ido#b | ||
| 486 | jnc .Lw2ndloop | ||
| 487 | |||
| 488 | xor %eax,%eax | ||
| 489 | mov %eax,-8($dat) | ||
| 490 | mov %eax,-4($dat) | ||
| 491 | ret | ||
| 492 | .size RC4_set_key,.-RC4_set_key | ||
| 493 | |||
| 494 | .globl RC4_options | ||
| 495 | .type RC4_options,\@abi-omnipotent | ||
| 496 | .align 16 | ||
| 497 | RC4_options: | ||
| 498 | lea .Lopts(%rip),%rax | ||
| 499 | ret | ||
| 500 | .align 64 | ||
| 501 | .Lopts: | ||
| 502 | .asciz "rc4(64x,int)" | ||
| 503 | .align 64 | ||
| 504 | .size RC4_options,.-RC4_options | ||
| 505 | ___ | ||
| 506 | } | ||
| 507 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
| 508 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
| 509 | if ($win64) { # Win64 only: emit the SEH unwind handler plus the .pdata/.xdata records for $func | ||
| 510 | my $rec="%rcx"; # handler arg1: EXCEPTION_RECORD *rec | ||
| 511 | my $frame="%rdx"; # handler arg2: ULONG64 frame | ||
| 512 | my $context="%r8"; # handler arg3: CONTEXT *context | ||
| 513 | my $disp="%r9"; # handler arg4: DISPATCHER_CONTEXT *disp | ||
| 514 | # only $context and $disp are referenced by the template below; $rec and $frame are listed for completeness | ||
| 515 | $code.=<<___; | ||
| 516 | .extern __imp_RtlVirtualUnwind | ||
| 517 | .type se_handler,\@abi-omnipotent | ||
| 518 | .align 16 | ||
| 519 | se_handler: | ||
| 520 | push %rsi | ||
| 521 | push %rdi | ||
| 522 | push %rbx | ||
| 523 | push %rbp | ||
| 524 | push %r12 | ||
| 525 | push %r13 | ||
| 526 | push %r14 | ||
| 527 | push %r15 | ||
| 528 | pushfq | ||
| 529 | sub \$64,%rsp | ||
| 530 | |||
| 531 | mov 120($context),%rax # pull context->Rax | ||
| 532 | mov 248($context),%rbx # pull context->Rip | ||
| 533 | |||
| 534 | lea .Lbody(%rip),%r10 | ||
| 535 | cmp %r10,%rbx # context->Rip<.Lbody | ||
| 536 | jb .Lin_prologue | ||
| 537 | |||
| 538 | mov 152($context),%rax # pull context->Rsp | ||
| 539 | |||
| 540 | lea .Lepilogue(%rip),%r10 | ||
| 541 | cmp %r10,%rbx # context->Rip>=.Lepilogue | ||
| 542 | jae .Lin_prologue | ||
| 543 | |||
| 544 | mov 40(%rax),%r15 | ||
| 545 | mov 48(%rax),%r14 | ||
| 546 | mov 56(%rax),%r13 | ||
| 547 | mov 64(%rax),%r12 | ||
| 548 | mov 72(%rax),%rbp | ||
| 549 | mov 80(%rax),%rbx | ||
| 550 | lea 88(%rax),%rax | ||
| 551 | |||
| 552 | mov %rbx,144($context) # restore context->Rbx | ||
| 553 | mov %rbp,160($context) # restore context->Rbp | ||
| 554 | mov %r12,216($context) # restore context->R12 | ||
| 555 | mov %r13,224($context) # restore context->R13 | ||
| 556 | mov %r14,232($context) # restore context->R14 | ||
| 557 | mov %r15,240($context) # restore context->R15 | ||
| 558 | |||
| 559 | .Lin_prologue: | ||
| 560 | mov 8(%rax),%rdi | ||
| 561 | mov 16(%rax),%rsi | ||
| 562 | mov %rax,152($context) # restore context->Rsp | ||
| 563 | mov %rsi,168($context) # restore context->Rsi | ||
| 564 | mov %rdi,176($context) # restore context->Rdi | ||
| 565 | |||
| 566 | mov 40($disp),%rdi # disp->ContextRecord | ||
| 567 | mov $context,%rsi # context | ||
| 568 | mov \$154,%ecx # sizeof(CONTEXT) | ||
| 569 | .long 0xa548f3fc # cld; rep movsq | ||
| 570 | |||
| 571 | mov $disp,%rsi | ||
| 572 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
| 573 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
| 574 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
| 575 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
| 576 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
| 577 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
| 578 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
| 579 | mov %r10,32(%rsp) # arg5 | ||
| 580 | mov %r11,40(%rsp) # arg6 | ||
| 581 | mov %r12,48(%rsp) # arg7 | ||
| 582 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
| 583 | call *__imp_RtlVirtualUnwind(%rip) | ||
| 584 | |||
| 585 | mov \$1,%eax # ExceptionContinueSearch | ||
| 586 | add \$64,%rsp | ||
| 587 | popfq | ||
| 588 | pop %r15 | ||
| 589 | pop %r14 | ||
| 590 | pop %r13 | ||
| 591 | pop %r12 | ||
| 592 | pop %rbp | ||
| 593 | pop %rbx | ||
| 594 | pop %rdi | ||
| 595 | pop %rsi | ||
| 596 | ret | ||
| 597 | .size se_handler,.-se_handler | ||
| 598 | |||
| 599 | .section .pdata | ||
| 600 | .align 4 | ||
| 601 | .rva .LSEH_begin_$func | ||
| 602 | .rva .LSEH_end_$func | ||
| 603 | .rva .LSEH_info_$func | ||
| 604 | |||
| 605 | .section .xdata | ||
| 606 | .align 8 | ||
| 607 | .LSEH_info_$func: | ||
| 608 | .byte 9,0,0,0 | ||
| 609 | .rva se_handler | ||
| 610 | ___ | ||
| 611 | } | ||
| 612 | |||
| 613 | sub reg_part { # turn a "%reg#b|w|d" reference into the matching byte/word/dword register name | ||
| 614 | my ($name,$width)=@_; # register as written in $code, and the one-letter size suffix | ||
| 615 | return $name.$width if ($name =~ /%r[0-9]+/); # numbered register: the suffix is simply appended | ||
| 616 | $name =~ s/%[er]([^x]+)x?/%$1l/ if ($width eq "b"); # byte: strip e/r prefix and trailing "x", add "l" | ||
| 617 | $name =~ s/%[er](.+)/%$1/ if ($width eq "w"); # word: strip e/r prefix | ||
| 618 | $name =~ s/%[er](.+)/%e$1/ if ($width eq "d"); # dword: replace prefix with "e" | ||
| 619 | return $name; # any other suffix leaves the name untouched | ||
| 620 | } | ||
| 621 | |||
| 622 | $code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem; | ||
| 623 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 624 | $code =~ s/pinsrw\s+\$0,/movd /gm; | ||
| 625 | |||
| 626 | $code =~ s/#md5#//gm if ($md5); | ||
| 627 | $code =~ s/#rc4#//gm if ($rc4); | ||
| 628 | |||
| 629 | print $code; | ||
| 630 | |||
| 631 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/rc4/asm/rc4-parisc.pl b/src/lib/libcrypto/rc4/asm/rc4-parisc.pl new file mode 100644 index 0000000000..9165067080 --- /dev/null +++ b/src/lib/libcrypto/rc4/asm/rc4-parisc.pl | |||
| @@ -0,0 +1,313 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | |||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # RC4 for PA-RISC. | ||
| 11 | |||
| 12 | # June 2009. | ||
| 13 | # | ||
| 14 | # Performance is 33% better than gcc 3.2 generated code on PA-7100LC. | ||
| 15 | # For reference, [4x] unrolled loop is >40% faster than folded one. | ||
| 16 | # It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement | ||
| 17 | # is believed to be not sufficient to justify the effort... | ||
| 18 | # | ||
| 19 | # Special thanks to polarhome.com for providing HP-UX account. | ||
| 20 | |||
| 21 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's path, separator included | ||
| 22 | # command line: <flavour> <output file> | ||
| 23 | $flavour = shift; | ||
| 24 | $output = shift; | ||
| 25 | open STDOUT,">$output" or die "can't open $output: $!"; # generated code goes to STDOUT; don't lose it silently | ||
| 26 | |||
| 27 | if ($flavour =~ /64/) { | ||
| 28 | $LEVEL ="2.0W"; | ||
| 29 | $SIZE_T =8; | ||
| 30 | $FRAME_MARKER =80; | ||
| 31 | $SAVED_RP =16; | ||
| 32 | $PUSH ="std"; | ||
| 33 | $PUSHMA ="std,ma"; | ||
| 34 | $POP ="ldd"; | ||
| 35 | $POPMB ="ldd,mb"; | ||
| 36 | } else { | ||
| 37 | $LEVEL ="1.0"; | ||
| 38 | $SIZE_T =4; | ||
| 39 | $FRAME_MARKER =48; | ||
| 40 | $SAVED_RP =20; | ||
| 41 | $PUSH ="stw"; | ||
| 42 | $PUSHMA ="stwm"; | ||
| 43 | $POP ="ldw"; | ||
| 44 | $POPMB ="ldwm"; | ||
| 45 | } | ||
| 46 | |||
| 47 | $FRAME=4*$SIZE_T+$FRAME_MARKER; # 4 saved regs + frame marker | ||
| 48 | # [+ argument transfer] | ||
| 49 | $SZ=1; # defaults to RC4_CHAR | ||
| 50 | if (open CONF,"<${dir}../../opensslconf.h") { | ||
| 51 | while(<CONF>) { | ||
| 52 | if (m/#\s*define\s+RC4_INT\s+(.*)/) { | ||
| 53 | $SZ = ($1=~/char$/) ? 1 : 4; | ||
| 54 | last; | ||
| 55 | } | ||
| 56 | } | ||
| 57 | close CONF; | ||
| 58 | } | ||
| 59 | |||
| 60 | if ($SZ==1) { # RC4_CHAR | ||
| 61 | $LD="ldb"; | ||
| 62 | $LDX="ldbx"; | ||
| 63 | $MKX="addl"; | ||
| 64 | $ST="stb"; | ||
| 65 | } else { # RC4_INT (~5% faster than RC4_CHAR on PA-7100LC) | ||
| 66 | $LD="ldw"; | ||
| 67 | $LDX="ldwx,s"; | ||
| 68 | $MKX="sh2addl"; | ||
| 69 | $ST="stw"; | ||
| 70 | } | ||
| 71 | |||
| 72 | $key="%r26"; | ||
| 73 | $len="%r25"; | ||
| 74 | $inp="%r24"; | ||
| 75 | $out="%r23"; | ||
| 76 | |||
| 77 | @XX=("%r19","%r20"); | ||
| 78 | @TX=("%r21","%r22"); | ||
| 79 | $YY="%r28"; | ||
| 80 | $TY="%r29"; | ||
| 81 | |||
| 82 | $acc="%r1"; | ||
| 83 | $ix="%r2"; | ||
| 84 | $iy="%r3"; | ||
| 85 | $dat0="%r4"; | ||
| 86 | $dat1="%r5"; | ||
| 87 | $rem="%r6"; | ||
| 88 | $mask="%r31"; | ||
| 89 | |||
| 90 | sub unrolledloopbody { # append four copies of the loop-body template below to $code; takes no arguments | ||
| 91 | for ($i=0;$i<4;$i++) { # $i is not declared with "my"; its current value is interpolated into the `...` expressions | ||
| 92 | $code.=<<___; | ||
| 93 | ldo 1($XX[0]),$XX[1] | ||
| 94 | `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)` | ||
| 95 | and $mask,$XX[1],$XX[1] | ||
| 96 | $LDX $YY($key),$TY | ||
| 97 | $MKX $YY,$key,$ix | ||
| 98 | $LDX $XX[1]($key),$TX[1] | ||
| 99 | $MKX $XX[0],$key,$iy | ||
| 100 | $ST $TX[0],0($ix) | ||
| 101 | comclr,<> $XX[1],$YY,%r0 ; conditional | ||
| 102 | copy $TX[0],$TX[1] ; move | ||
| 103 | `sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)` | ||
| 104 | $ST $TY,0($iy) | ||
| 105 | addl $TX[0],$TY,$TY | ||
| 106 | addl $TX[1],$YY,$YY | ||
| 107 | and $mask,$TY,$TY | ||
| 108 | and $mask,$YY,$YY | ||
| 109 | ___ | ||
| 110 | push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers | ||
| 111 | } } # NOTE(review): the `...` spans are presumably eval'ed by a later pass that is outside this view - confirm | ||
| 112 | |||
| 113 | sub foldedloop { # append a loop that handles one byte per iteration (ldbx/stb, $out advanced by 1) to $code | ||
| 114 | my ($label,$count)=@_; # $label: loop label to emit and branch back to; $count: register decremented by addib | ||
| 115 | $code.=<<___; | ||
| 116 | $label | ||
| 117 | $MKX $YY,$key,$iy | ||
| 118 | $LDX $YY($key),$TY | ||
| 119 | $MKX $XX[0],$key,$ix | ||
| 120 | $ST $TX[0],0($iy) | ||
| 121 | ldo 1($XX[0]),$XX[0] | ||
| 122 | $ST $TY,0($ix) | ||
| 123 | addl $TX[0],$TY,$TY | ||
| 124 | ldbx $inp($out),$dat1 | ||
| 125 | and $mask,$TY,$TY | ||
| 126 | and $mask,$XX[0],$XX[0] | ||
| 127 | $LDX $TY($key),$acc | ||
| 128 | $LDX $XX[0]($key),$TX[0] | ||
| 129 | ldo 1($out),$out | ||
| 130 | xor $dat1,$acc,$acc | ||
| 131 | addl $TX[0],$YY,$YY | ||
| 132 | stb $acc,-1($out) | ||
| 133 | addib,<> -1,$count,$label ; $count is always small | ||
| 134 | and $mask,$YY,$YY | ||
| 135 | ___ | ||
| 136 | } | ||
| 137 | |||
| 138 | $code=<<___; | ||
| 139 | .LEVEL $LEVEL | ||
| 140 | .SPACE \$TEXT\$ | ||
| 141 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY | ||
| 142 | |||
| 143 | .EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR | ||
| 144 | RC4 | ||
| 145 | .PROC | ||
| 146 | .CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6 | ||
| 147 | .ENTRY | ||
| 148 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue | ||
| 149 | $PUSHMA %r3,$FRAME(%sp) | ||
| 150 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) | ||
| 151 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) | ||
| 152 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) | ||
| 153 | |||
| 154 | cmpib,*= 0,$len,L\$abort | ||
| 155 | sub $inp,$out,$inp ; distance between $inp and $out | ||
| 156 | |||
| 157 | $LD `0*$SZ`($key),$XX[0] | ||
| 158 | $LD `1*$SZ`($key),$YY | ||
| 159 | ldo `2*$SZ`($key),$key | ||
| 160 | |||
| 161 | ldi 0xff,$mask | ||
| 162 | ldi 3,$dat0 | ||
| 163 | |||
| 164 | ldo 1($XX[0]),$XX[0] ; warm up loop | ||
| 165 | and $mask,$XX[0],$XX[0] | ||
| 166 | $LDX $XX[0]($key),$TX[0] | ||
| 167 | addl $TX[0],$YY,$YY | ||
| 168 | cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother? | ||
| 169 | and $mask,$YY,$YY | ||
| 170 | |||
| 171 | and,<> $out,$dat0,$rem ; is $out aligned? | ||
| 172 | b L\$alignedout | ||
| 173 | subi 4,$rem,$rem | ||
| 174 | sub $len,$rem,$len | ||
| 175 | ___ | ||
| 176 | &foldedloop("L\$alignout",$rem); # process till $out is aligned | ||
| 177 | |||
| 178 | $code.=<<___; | ||
| 179 | L\$alignedout ; $len is at least 4 here | ||
| 180 | and,<> $inp,$dat0,$acc ; is $inp aligned? | ||
| 181 | b L\$oop4 | ||
| 182 | sub $inp,$acc,$rem ; align $inp | ||
| 183 | |||
| 184 | sh3addl $acc,%r0,$acc | ||
| 185 | subi 32,$acc,$acc | ||
| 186 | mtctl $acc,%cr11 ; load %sar with vshd align factor | ||
| 187 | ldwx $rem($out),$dat0 | ||
| 188 | ldo 4($rem),$rem | ||
| 189 | L\$oop4misalignedinp | ||
| 190 | ___ | ||
| 191 | &unrolledloopbody(); | ||
| 192 | $code.=<<___; | ||
| 193 | $LDX $TY($key),$ix | ||
| 194 | ldwx $rem($out),$dat1 | ||
| 195 | ldo -4($len),$len | ||
| 196 | or $ix,$acc,$acc ; last piece, no need to dep | ||
| 197 | vshd $dat0,$dat1,$iy ; align data | ||
| 198 | copy $dat1,$dat0 | ||
| 199 | xor $iy,$acc,$acc | ||
| 200 | stw $acc,0($out) | ||
| 201 | cmpib,*<< 3,$len,L\$oop4misalignedinp | ||
| 202 | ldo 4($out),$out | ||
| 203 | cmpib,*= 0,$len,L\$done | ||
| 204 | nop | ||
| 205 | b L\$oop1 | ||
| 206 | nop | ||
| 207 | |||
| 208 | .ALIGN 8 | ||
| 209 | L\$oop4 | ||
| 210 | ___ | ||
| 211 | &unrolledloopbody(); | ||
| 212 | $code.=<<___; | ||
| 213 | $LDX $TY($key),$ix | ||
| 214 | ldwx $inp($out),$dat0 | ||
| 215 | ldo -4($len),$len | ||
| 216 | or $ix,$acc,$acc ; last piece, no need to dep | ||
| 217 | xor $dat0,$acc,$acc | ||
| 218 | stw $acc,0($out) | ||
| 219 | cmpib,*<< 3,$len,L\$oop4 | ||
| 220 | ldo 4($out),$out | ||
| 221 | cmpib,*= 0,$len,L\$done | ||
| 222 | nop | ||
| 223 | ___ | ||
| 224 | &foldedloop("L\$oop1",$len); | ||
| 225 | $code.=<<___; | ||
| 226 | L\$done | ||
| 227 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 | ||
| 228 | ldo -1($XX[0]),$XX[0] ; chill out loop | ||
| 229 | sub $YY,$TX[0],$YY | ||
| 230 | and $mask,$XX[0],$XX[0] | ||
| 231 | and $mask,$YY,$YY | ||
| 232 | $ST $XX[0],`-2*$SZ`($key) | ||
| 233 | $ST $YY,`-1*$SZ`($key) | ||
| 234 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 | ||
| 235 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 | ||
| 236 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 | ||
| 237 | L\$abort | ||
| 238 | bv (%r2) | ||
| 239 | .EXIT | ||
| 240 | $POPMB -$FRAME(%sp),%r3 | ||
| 241 | .PROCEND | ||
| 242 | ___ | ||
| 243 | |||
| 244 | $code.=<<___; | ||
| 245 | |||
| 246 | .EXPORT private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR | ||
| 247 | .ALIGN 8 | ||
| 248 | private_RC4_set_key | ||
| 249 | .PROC | ||
| 250 | .CALLINFO NO_CALLS | ||
| 251 | .ENTRY | ||
| 252 | $ST %r0,`0*$SZ`($key) | ||
| 253 | $ST %r0,`1*$SZ`($key) | ||
| 254 | ldo `2*$SZ`($key),$key | ||
| 255 | copy %r0,@XX[0] | ||
| 256 | L\$1st | ||
| 257 | $ST @XX[0],0($key) | ||
| 258 | ldo 1(@XX[0]),@XX[0] | ||
| 259 | bb,>= @XX[0],`31-8`,L\$1st ; @XX[0]<256 | ||
| 260 | ldo $SZ($key),$key | ||
| 261 | |||
| 262 | ldo `-256*$SZ`($key),$key ; rewind $key | ||
| 263 | addl $len,$inp,$inp ; $inp to point at the end | ||
| 264 | sub %r0,$len,%r23 ; inverse index | ||
| 265 | copy %r0,@XX[0] | ||
| 266 | copy %r0,@XX[1] | ||
| 267 | ldi 0xff,$mask | ||
| 268 | |||
| 269 | L\$2nd | ||
| 270 | $LDX @XX[0]($key),@TX[0] | ||
| 271 | ldbx %r23($inp),@TX[1] | ||
| 272 | addi,nuv 1,%r23,%r23 ; increment and conditional | ||
| 273 | sub %r0,$len,%r23 ; inverse index | ||
| 274 | addl @TX[0],@XX[1],@XX[1] | ||
| 275 | addl @TX[1],@XX[1],@XX[1] | ||
| 276 | and $mask,@XX[1],@XX[1] | ||
| 277 | $MKX @XX[0],$key,$TY | ||
| 278 | $LDX @XX[1]($key),@TX[1] | ||
| 279 | $MKX @XX[1],$key,$YY | ||
| 280 | ldo 1(@XX[0]),@XX[0] | ||
| 281 | $ST @TX[0],0($YY) | ||
| 282 | bb,>= @XX[0],`31-8`,L\$2nd ; @XX[0]<256 | ||
| 283 | $ST @TX[1],0($TY) | ||
| 284 | |||
| 285 | bv,n (%r2) | ||
| 286 | .EXIT | ||
| 287 | nop | ||
| 288 | .PROCEND | ||
| 289 | |||
| 290 | .EXPORT RC4_options,ENTRY | ||
| 291 | .ALIGN 8 | ||
| 292 | RC4_options | ||
| 293 | .PROC | ||
| 294 | .CALLINFO NO_CALLS | ||
| 295 | .ENTRY | ||
| 296 | blr %r0,%r28 | ||
| 297 | ldi 3,%r1 | ||
| 298 | L\$pic | ||
| 299 | andcm %r28,%r1,%r28 | ||
| 300 | bv (%r2) | ||
| 301 | .EXIT | ||
| 302 | ldo L\$opts-L\$pic(%r28),%r28 | ||
| 303 | .PROCEND | ||
| 304 | .ALIGN 8 | ||
| 305 | L\$opts | ||
| 306 | .STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)" | ||
| 307 | .STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 308 | ___ | ||
| 309 | $code =~ s/\`([^\`]*)\`/eval $1/gem; | ||
| 310 | $code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4); | ||
| 311 | |||
| 312 | print $code; | ||
| 313 | close STDOUT; | ||
diff --git a/src/lib/libcrypto/rc4/asm/rc4-s390x.pl b/src/lib/libcrypto/rc4/asm/rc4-s390x.pl index 96681fa05e..7528ece13c 100644 --- a/src/lib/libcrypto/rc4/asm/rc4-s390x.pl +++ b/src/lib/libcrypto/rc4/asm/rc4-s390x.pl | |||
| @@ -13,6 +13,29 @@ | |||
| 13 | # "cluster" Address Generation Interlocks, so that one pipeline stall | 13 | # "cluster" Address Generation Interlocks, so that one pipeline stall |
| 14 | # resolves several dependencies. | 14 | # resolves several dependencies. |
| 15 | 15 | ||
| 16 | # November 2010. | ||
| 17 | # | ||
| 18 | # Adapt for -m31 build. If the kernel supports what's called the "highgprs" | ||
| 19 | # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit | ||
| 20 | # instructions and achieve "64-bit" performance even in a 31-bit legacy | ||
| 21 | # application context. The feature is not specific to any particular | ||
| 22 | # processor, as long as it's a "z-CPU". The latter implies that the code | ||
| 23 | # remains z/Architecture specific. On z990 it was measured to perform | ||
| 24 | # 50% better than code generated by gcc 4.3. | ||
| 25 | |||
| 26 | $flavour = shift; | ||
| 27 | |||
| 28 | if ($flavour =~ /3[12]/) { | ||
| 29 | $SIZE_T=4; | ||
| 30 | $g=""; | ||
| 31 | } else { | ||
| 32 | $SIZE_T=8; | ||
| 33 | $g="g"; | ||
| 34 | } | ||
| 35 | |||
| 36 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
| 37 | open STDOUT,">$output"; | ||
| 38 | |||
| 16 | $rp="%r14"; | 39 | $rp="%r14"; |
| 17 | $sp="%r15"; | 40 | $sp="%r15"; |
| 18 | $code=<<___; | 41 | $code=<<___; |
| @@ -39,7 +62,12 @@ $code.=<<___; | |||
| 39 | .type RC4,\@function | 62 | .type RC4,\@function |
| 40 | .align 64 | 63 | .align 64 |
| 41 | RC4: | 64 | RC4: |
| 42 | stmg %r6,%r11,48($sp) | 65 | stm${g} %r6,%r11,6*$SIZE_T($sp) |
| 66 | ___ | ||
| 67 | $code.=<<___ if ($flavour =~ /3[12]/); | ||
| 68 | llgfr $len,$len | ||
| 69 | ___ | ||
| 70 | $code.=<<___; | ||
| 43 | llgc $XX[0],0($key) | 71 | llgc $XX[0],0($key) |
| 44 | llgc $YY,1($key) | 72 | llgc $YY,1($key) |
| 45 | la $XX[0],1($XX[0]) | 73 | la $XX[0],1($XX[0]) |
| @@ -90,7 +118,7 @@ $code.=<<___; | |||
| 90 | xgr $acc,$TX[1] | 118 | xgr $acc,$TX[1] |
| 91 | stg $acc,0($out) | 119 | stg $acc,0($out) |
| 92 | la $out,8($out) | 120 | la $out,8($out) |
| 93 | brct $cnt,.Loop8 | 121 | brctg $cnt,.Loop8 |
| 94 | 122 | ||
| 95 | .Lshort: | 123 | .Lshort: |
| 96 | lghi $acc,7 | 124 | lghi $acc,7 |
| @@ -122,7 +150,7 @@ $code.=<<___; | |||
| 122 | ahi $XX[0],-1 | 150 | ahi $XX[0],-1 |
| 123 | stc $XX[0],0($key) | 151 | stc $XX[0],0($key) |
| 124 | stc $YY,1($key) | 152 | stc $YY,1($key) |
| 125 | lmg %r6,%r11,48($sp) | 153 | lm${g} %r6,%r11,6*$SIZE_T($sp) |
| 126 | br $rp | 154 | br $rp |
| 127 | .size RC4,.-RC4 | 155 | .size RC4,.-RC4 |
| 128 | .string "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>" | 156 | .string "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>" |
| @@ -143,11 +171,11 @@ $ikey="%r7"; | |||
| 143 | $iinp="%r8"; | 171 | $iinp="%r8"; |
| 144 | 172 | ||
| 145 | $code.=<<___; | 173 | $code.=<<___; |
| 146 | .globl RC4_set_key | 174 | .globl private_RC4_set_key |
| 147 | .type RC4_set_key,\@function | 175 | .type private_RC4_set_key,\@function |
| 148 | .align 64 | 176 | .align 64 |
| 149 | RC4_set_key: | 177 | private_RC4_set_key: |
| 150 | stmg %r6,%r8,48($sp) | 178 | stm${g} %r6,%r8,6*$SIZE_T($sp) |
| 151 | lhi $cnt,256 | 179 | lhi $cnt,256 |
| 152 | la $idx,0(%r0) | 180 | la $idx,0(%r0) |
| 153 | sth $idx,0($key) | 181 | sth $idx,0($key) |
| @@ -180,9 +208,9 @@ RC4_set_key: | |||
| 180 | la $iinp,0(%r0) | 208 | la $iinp,0(%r0) |
| 181 | j .L2ndloop | 209 | j .L2ndloop |
| 182 | .Ldone: | 210 | .Ldone: |
| 183 | lmg %r6,%r8,48($sp) | 211 | lm${g} %r6,%r8,6*$SIZE_T($sp) |
| 184 | br $rp | 212 | br $rp |
| 185 | .size RC4_set_key,.-RC4_set_key | 213 | .size private_RC4_set_key,.-private_RC4_set_key |
| 186 | 214 | ||
| 187 | ___ | 215 | ___ |
| 188 | } | 216 | } |
| @@ -203,3 +231,4 @@ RC4_options: | |||
| 203 | ___ | 231 | ___ |
| 204 | 232 | ||
| 205 | print $code; | 233 | print $code; |
| 234 | close STDOUT; # force flush | ||
