diff options
Diffstat (limited to 'src/lib/libcrypto/rc4')
| -rw-r--r-- | src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl | 515 |
1 files changed, 0 insertions, 515 deletions
diff --git a/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl deleted file mode 100644 index e5e8aa08a1..0000000000 --- a/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl +++ /dev/null | |||
| @@ -1,515 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | |||
| 10 | # June 2011 | ||
| 11 | # | ||
| 12 | # This is RC4+MD5 "stitch" implementation. The idea, as spelled in | ||
| 13 | # http://download.intel.com/design/intarch/papers/323686.pdf, is that | ||
| 14 | # since both algorithms exhibit instruction-level parallelism, ILP, | ||
| 15 | # below theoretical maximum, interleaving them would allow to utilize | ||
| 16 | # processor resources better and achieve better performance. RC4 | ||
| 17 | # instruction sequence is virtually identical to rc4-x86_64.pl, which | ||
| 18 | # is heavily based on submission by Maxim Perminov, Maxim Locktyukhin | ||
| 19 | # and Jim Guilford of Intel. MD5 is fresh implementation aiming to | ||
| 20 | # minimize register usage, which was used as "main thread" with RC4 | ||
| 21 | # weaved into it, one RC4 round per one MD5 round. In addition to the | ||
| 22 | # stitched subroutine the script can generate standalone replacement | ||
| 23 | # md5_block_asm_data_order and RC4. Below are performance numbers in | ||
| 24 | # cycles per processed byte, less is better, for the standalone | ||
| 25 | # subroutines, sum of them, and stitched one: | ||
| 26 | # | ||
| 27 | # RC4 MD5 RC4+MD5 stitch gain | ||
| 28 | # Opteron 6.5(*) 5.4 11.9 7.0 +70%(*) | ||
| 29 | # Core2 6.5 5.8 12.3 7.7 +60% | ||
| 30 | # Westmere 4.3 5.2 9.5 7.0 +36% | ||
| 31 | # Sandy Bridge 4.2 5.5 9.7 6.8 +43% | ||
| 32 | # Atom 9.3 6.5 15.8 11.1 +42% | ||
| 33 | # | ||
| 34 | # (*) rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement | ||
| 35 | # is +53%... | ||
| 36 | |||
# ---- generator configuration -----------------------------------------
my ($rc4,$md5)=(1,1);   # what to generate?

# $D is "#" only when MD5 is not requested: MD5 is then still stitched
# into RC4(), but its result is discarded.  Idea here is to be able to
# use 'openssl speed rc4' for benchmarking the stitched subroutine...
# Assigned unconditionally, because the behaviour of "my $x = ... if COND"
# is undefined (see perlsyn, "Statement Modifiers").
my $D = $md5 ? "" : "#";

# ---- command line: [flavour] [output] ---------------------------------
# A first argument containing a dot is taken to be the output file name.
my $flavour = shift;
my $output = shift;
if (defined($flavour) && $flavour =~ /\./) { $output = $flavour; undef $flavour; }

# ---- locate the perlasm translator -------------------------------------
# Directory part of the script path (with trailing separator); empty when
# $0 has no directory component.  The capture is only read when the match
# succeeded, so a stale $1 can never leak in.
my $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
my $xlate;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
    die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  The translator path is quoted so that directories with
# spaces work, and a failure to start the pipe is fatal instead of
# being silently ignored.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
| 54 | |||
# Registers in which the generated function receives its arguments, plus
# the function's name and argument count.  Which variables get a value
# depends on what is being generated.
my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs);

if ($rc4 && !$md5) {
    # RC4 only: four arguments.
    $func  = "RC4";
    $nargs = 4;
    $dat   = "%rdi";
    $len   = "%rsi";
    $in0   = "%rdx";
    $out   = "%rcx";
} elsif ($md5 && !$rc4) {
    # MD5 only: three arguments.
    $func  = "md5_block_asm_data_order";
    $nargs = 3;
    $ctx   = "%rdi";
    $inp   = "%rsi";
    $len   = "%rdx";
} else {
    # Stitched routine:
    #
    # void rc4_md5_enc(
    #         RC4_KEY *key,           #
    #         const void *in0,        # RC4 input
    #         void *out,              # RC4 output
    #         MD5_CTX *ctx,           #
    #         const void *inp,        # MD5 input
    #         size_t len);            # number of 64-byte blocks
    $func  = "rc4_md5_enc";
    $nargs = 6;
    $dat   = "%rdi";
    $in0   = "%rsi";
    $out   = "%rdx";
    $ctx   = "%rcx";
    $inp   = "%r8";
    $len   = "%r9";
}
| 74 | |||
# MD5 additive constants, one per step; indexed by the step number 0..63.
my @K=(
    # steps 0..15
    0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
    0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
    0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
    0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
    # steps 16..31
    0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
    0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
    0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
    0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
    # steps 32..47
    0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
    0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
    0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
    0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
    # steps 48..63
    0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
    0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
    0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
    0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
);

# MD5 registers: the four working variables (%r8d..%r11d) and a scratch.
my @V   = map { "%r${_}d" } (8 .. 11);
my $tmp = "%r12d";

# RC4 registers.
my @XX = qw(%rbp %rsi);
my @TX = qw(%rax %rbx);
my $YY = "%rcx";
my $TY = "%rdx";

my $MOD = 32;   # 16, 32 or 64
| 104 | |||
# Function header: bail out on zero length, save the callee-saved
# registers that are used below and reserve 40 bytes of stack scratch.
$code .= <<___;
.text
.align 16

.globl $func
.type $func,\@function,$nargs
$func:
_CET_ENDBR
cmp \$0,$len
je .Labort
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
sub \$40,%rsp
.Lbody:
___

if ($rc4) {
    # Move the incoming arguments out of the way of the RC4 registers...
    $code .= <<___;
$D#md5# mov $ctx,%r11 # reassign arguments
mov $len,%r12
mov $in0,%r13
mov $out,%r14
$D#md5# mov $inp,%r15
___
    # ...and make the Perl-side names follow the generated moves.
    ($len,$in0,$out) = ("%r12","%r13","%r14");
    $ctx = "%r11" if ($md5);
    $inp = $md5 ? "%r15" : $in0;

    # Load the RC4 indices stored in front of the table at $dat and
    # pre-fetch the first table entry; $out becomes an offset from $in0.
    $code .= <<___;
xor $XX[0],$XX[0]
xor $YY,$YY

lea 8($dat),$dat
mov -8($dat),$XX[0]#b
mov -4($dat),$YY#b

inc $XX[0]#b
sub $in0,$out
movl ($dat,$XX[0],4),$TX[0]#d
___

    if (!$md5) {
        # Without MD5: byte-at-a-time warm-up loop in front of the
        # main loop; the original length is kept at 32(%rsp).
        $code .= <<___;
xor $TX[1],$TX[1]
test \$-128,$len
jz .Loop1
sub $XX[0],$TX[1]
and \$`$MOD-1`,$TX[1]
jz .Loop${MOD}_is_hot
sub $TX[1],$len
.Loop${MOD}_warmup:
add $TX[0]#b,$YY#b
movl ($dat,$YY,4),$TY#d
movl $TX[0]#d,($dat,$YY,4)
movl $TY#d,($dat,$XX[0],4)
add $TY#b,$TX[0]#b
inc $XX[0]#b
movl ($dat,$TX[0],4),$TY#d
movl ($dat,$XX[0],4),$TX[0]#d
xorb ($in0),$TY#b
movb $TY#b,($out,$in0)
lea 1($in0),$in0
dec $TX[1]
jnz .Loop${MOD}_warmup

mov $YY,$TX[1]
xor $YY,$YY
mov $TX[1]#b,$YY#b

.Loop${MOD}_is_hot:
mov $len,32(%rsp) # save original $len
shr \$6,$len # number of 64-byte blocks
___
        if ($D) {
            # stitch in dummy MD5: the context lives on the stack and
            # the MD5 input is the RC4 input
            $md5 = 1;
            $ctx = "%r11";
            $inp = "%r15";
            $code .= <<___;
mov %rsp,$ctx
mov $in0,$inp
___
        }
    }
}

# Common loop set-up: end-of-input pointer goes to 16(%rsp), the MD5_CTX
# pointer to 24(%rsp), and the current hash value is loaded into @V.
$code .= <<___;
#rc4# add $TX[0]#b,$YY#b
#rc4# lea ($dat,$XX[0],4),$XX[1]
shl \$6,$len
add $inp,$len # pointer to the end of input
mov $len,16(%rsp)

#md5# mov $ctx,24(%rsp) # save pointer to MD5_CTX
#md5# mov 0*4($ctx),$V[0] # load current hash value from MD5_CTX
#md5# mov 1*4($ctx),$V[1]
#md5# mov 2*4($ctx),$V[2]
#md5# mov 3*4($ctx),$V[3]
jmp .Loop

.align 16
.Loop:
#md5# mov $V[0],0*4(%rsp) # put aside current hash value
#md5# mov $V[1],1*4(%rsp)
#md5# mov $V[2],2*4(%rsp)
#md5# mov $V[3],$tmp # forward reference
#md5# mov $V[3],3*4(%rsp)
___
| 213 | |||
# R0(i, a, b, c, d) -- append to $code one MD5 step of the first group
# (message word j = i%16, rotations 7/12/17/22) interleaved with one RC4
# step.  $tmp is expected to hold $d on entry (the "forward reference"
# left by the previous step).  On the 16th step of the group (j==15) the
# collected key stream in %xmm0/%xmm1 is xor-ed with 16 input bytes
# loaded into %xmm2.
sub R0 {
    my ($i,$a,$b,$c,$d)=@_;
    my @rot0=(7,12,17,22);
    my $j=$i%16;
    my $k=$i%$MOD;
    my $xmm="%xmm".($j&1);

    my $last_of_16 = ($rc4 && $j==15);              # end of 16-byte chunk
    my $wraps      = ($last_of_16 && $k==$MOD-1);   # end of $MOD-step window

    if ($last_of_16)       { $code.=" movdqu ($in0),%xmm2\n"; }
    if ($wraps)            { $code.=" add \$$MOD,$XX[0]#b\n"; }
    if ($rc4 && $j<=1)     { $code.=" pxor $xmm,$xmm\n"; }

    $code.=<<___;
#rc4# movl ($dat,$YY,4),$TY#d
#md5# xor $c,$tmp
#rc4# movl $TX[0]#d,($dat,$YY,4)
#md5# and $b,$tmp
#md5# add 4*`$j`($inp),$a
#rc4# add $TY#b,$TX[0]#b
#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5# add \$$K[$i],$a
#md5# xor $d,$tmp
#rc4# movz $TX[0]#b,$TX[0]#d
#rc4# movl $TY#d,4*$k($XX[1])
#md5# add $tmp,$a
#rc4# add $TX[1]#b,$YY#b
#md5# rol \$$rot0[$j%4],$a
#md5# mov `$j==15?"$b":"$c"`,$tmp # forward reference
#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5# add $b,$a
___

    if ($wraps) {
        $code.=<<___;
mov $YY,$XX[1]
xor $YY,$YY # keyword to partial register
mov $XX[1]#b,$YY#b
lea ($dat,$XX[0],4),$XX[1]
___
    }
    if ($last_of_16) {
        $code.=<<___;
psllq \$8,%xmm1
pxor %xmm0,%xmm2
pxor %xmm1,%xmm2
___
    }
}
# R1(i, a, b, c, d) -- append to $code one MD5 step of the second group
# (message word (1+5*j)%16 with j = i%16, rotations 5/9/14/20)
# interleaved with one RC4 step.  $tmp is expected to hold $c on entry
# (the "forward reference" left by the previous step).  On the 16th step
# of the group (j==15) the collected key stream in %xmm0/%xmm1 is xor-ed
# with input bytes 16..31 loaded into %xmm3.
sub R1 {
    my ($i,$a,$b,$c,$d)=@_;
    my @rot1=(5,9,14,20);
    my $j=$i%16;
    my $k=$i%$MOD;
    my $xmm="%xmm".($j&1);

    my $last_of_16 = ($rc4 && $j==15);              # end of 16-byte chunk
    my $wraps      = ($last_of_16 && $k==$MOD-1);   # end of $MOD-step window

    if ($last_of_16)       { $code.=" movdqu 16($in0),%xmm3\n"; }
    if ($wraps)            { $code.=" add \$$MOD,$XX[0]#b\n"; }
    if ($rc4 && $j<=1)     { $code.=" pxor $xmm,$xmm\n"; }

    $code.=<<___;
#rc4# movl ($dat,$YY,4),$TY#d
#md5# xor $b,$tmp
#rc4# movl $TX[0]#d,($dat,$YY,4)
#md5# and $d,$tmp
#md5# add 4*`((1+5*$j)%16)`($inp),$a
#rc4# add $TY#b,$TX[0]#b
#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5# add \$$K[$i],$a
#md5# xor $c,$tmp
#rc4# movz $TX[0]#b,$TX[0]#d
#rc4# movl $TY#d,4*$k($XX[1])
#md5# add $tmp,$a
#rc4# add $TX[1]#b,$YY#b
#md5# rol \$$rot1[$j%4],$a
#md5# mov `$j==15?"$c":"$b"`,$tmp # forward reference
#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5# add $b,$a
___

    if ($wraps) {
        $code.=<<___;
mov $YY,$XX[1]
xor $YY,$YY # keyword to partial register
mov $XX[1]#b,$YY#b
lea ($dat,$XX[0],4),$XX[1]
___
    }
    if ($last_of_16) {
        $code.=<<___;
psllq \$8,%xmm1
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
___
    }
}
# R2(i, a, b, c, d) -- append to $code one MD5 step of the third group
# (message word (5+3*j)%16 with j = i%16, rotations 4/11/16/23)
# interleaved with one RC4 step.  $tmp is expected to hold $d on entry;
# after the last step of the group it is set to -1 for the fourth group.
# On the 16th step (j==15) the collected key stream in %xmm0/%xmm1 is
# xor-ed with input bytes 32..47 loaded into %xmm4.
sub R2 {
    my ($i,$a,$b,$c,$d)=@_;
    my @rot2=(4,11,16,23);
    my $j=$i%16;
    my $k=$i%$MOD;
    my $xmm="%xmm".($j&1);

    my $last_of_16 = ($rc4 && $j==15);              # end of 16-byte chunk
    my $wraps      = ($last_of_16 && $k==$MOD-1);   # end of $MOD-step window

    if ($last_of_16)       { $code.=" movdqu 32($in0),%xmm4\n"; }
    if ($wraps)            { $code.=" add \$$MOD,$XX[0]#b\n"; }
    if ($rc4 && $j<=1)     { $code.=" pxor $xmm,$xmm\n"; }

    $code.=<<___;
#rc4# movl ($dat,$YY,4),$TY#d
#md5# xor $c,$tmp
#rc4# movl $TX[0]#d,($dat,$YY,4)
#md5# xor $b,$tmp
#md5# add 4*`((5+3*$j)%16)`($inp),$a
#rc4# add $TY#b,$TX[0]#b
#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5# add \$$K[$i],$a
#rc4# movz $TX[0]#b,$TX[0]#d
#md5# add $tmp,$a
#rc4# movl $TY#d,4*$k($XX[1])
#rc4# add $TX[1]#b,$YY#b
#md5# rol \$$rot2[$j%4],$a
#md5# mov `$j==15?"\\\$-1":"$c"`,$tmp # forward reference
#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5# add $b,$a
___

    if ($wraps) {
        $code.=<<___;
mov $YY,$XX[1]
xor $YY,$YY # keyword to partial register
mov $XX[1]#b,$YY#b
lea ($dat,$XX[0],4),$XX[1]
___
    }
    if ($last_of_16) {
        $code.=<<___;
psllq \$8,%xmm1
pxor %xmm0,%xmm4
pxor %xmm1,%xmm4
___
    }
}
# R3(i, a, b, c, d) -- append to $code one MD5 step of the fourth group
# (message word (7*j)%16 with j = i%16, rotations 6/10/15/21)
# interleaved with one RC4 step.  $tmp is expected to hold -1 on entry
# and is reset to -1 on exit.  On the 16th step (j==15) both RC4 index
# registers are reduced to their low byte, the table pointer in $XX[1]
# is recomputed, and the collected key stream in %xmm0/%xmm1 is xor-ed
# with input bytes 48..63 loaded into %xmm5.
sub R3 {
    my ($i,$a,$b,$c,$d)=@_;
    my @rot3=(6,10,15,21);
    my $j=$i%16;
    my $k=$i%$MOD;
    my $xmm="%xmm".($j&1);

    my $last_of_16 = ($rc4 && $j==15);              # end of 16-byte chunk
    my $wraps      = ($last_of_16 && $k==$MOD-1);   # end of $MOD-step window

    if ($last_of_16)       { $code.=" movdqu 48($in0),%xmm5\n"; }
    if ($wraps)            { $code.=" add \$$MOD,$XX[0]#b\n"; }
    if ($rc4 && $j<=1)     { $code.=" pxor $xmm,$xmm\n"; }

    $code.=<<___;
#rc4# movl ($dat,$YY,4),$TY#d
#md5# xor $d,$tmp
#rc4# movl $TX[0]#d,($dat,$YY,4)
#md5# or $b,$tmp
#md5# add 4*`((7*$j)%16)`($inp),$a
#rc4# add $TY#b,$TX[0]#b
#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5# add \$$K[$i],$a
#rc4# movz $TX[0]#b,$TX[0]#d
#md5# xor $c,$tmp
#rc4# movl $TY#d,4*$k($XX[1])
#md5# add $tmp,$a
#rc4# add $TX[1]#b,$YY#b
#md5# rol \$$rot3[$j%4],$a
#md5# mov \$-1,$tmp # forward reference
#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5# add $b,$a
___

    if ($last_of_16) {
        $code.=<<___;
mov $XX[0],$XX[1]
xor $XX[0],$XX[0] # keyword to partial register
mov $XX[1]#b,$XX[0]#b
mov $YY,$XX[1]
xor $YY,$YY # keyword to partial register
mov $XX[1]#b,$YY#b
lea ($dat,$XX[0],4),$XX[1]
psllq \$8,%xmm1
pxor %xmm0,%xmm5
pxor %xmm1,%xmm5
___
    }
}
| 374 | |||
# Emit the 64 steps: sixteen consecutive steps from each of R0..R3.
# After every step the MD5 register list is rotated right by one and
# the two RC4 temporaries are swapped, so the next step sees them in
# their new roles.
my $i=0;
for my $emit_step (\&R0, \&R1, \&R2, \&R3) {
    for my $n (1 .. 16) {
        $emit_step->($i, @V);
        unshift(@V, pop(@V));
        push(@TX, shift(@TX));
        $i++;
    }
}
| 380 | |||
# End of the main loop body: fold the saved hash value back in, store the
# four 16-byte RC4 output chunks, advance the pointers and loop until the
# end-of-input pointer saved at 16(%rsp) is reached; then write the hash
# value back through the MD5_CTX pointer saved at 24(%rsp).
$code .= <<___;
#md5# add 0*4(%rsp),$V[0] # accumulate hash value
#md5# add 1*4(%rsp),$V[1]
#md5# add 2*4(%rsp),$V[2]
#md5# add 3*4(%rsp),$V[3]

#rc4# movdqu %xmm2,($out,$in0) # write RC4 output
#rc4# movdqu %xmm3,16($out,$in0)
#rc4# movdqu %xmm4,32($out,$in0)
#rc4# movdqu %xmm5,48($out,$in0)
#md5# lea 64($inp),$inp
#rc4# lea 64($in0),$in0
cmp 16(%rsp),$inp # are we done?
jb .Loop

#md5# mov 24(%rsp),$len # restore pointer to MD5_CTX
#rc4# sub $TX[0]#b,$YY#b # correct $YY
#md5# mov $V[0],0*4($len) # write MD5_CTX
#md5# mov $V[1],1*4($len)
#md5# mov $V[2],2*4($len)
#md5# mov $V[3],3*4($len)
___

# RC4 without real MD5 (or with the dummy MD5 stitched in): handle the
# bytes left over after the last full 64-byte block one at a time.
if ($rc4 && (!$md5 || $D)) {
    $code .= <<___;
mov 32(%rsp),$len # restore original $len
and \$63,$len # remaining bytes
jnz .Loop1
jmp .Ldone

.align 16
.Loop1:
add $TX[0]#b,$YY#b
movl ($dat,$YY,4),$TY#d
movl $TX[0]#d,($dat,$YY,4)
movl $TY#d,($dat,$XX[0],4)
add $TY#b,$TX[0]#b
inc $XX[0]#b
movl ($dat,$TX[0],4),$TY#d
movl ($dat,$XX[0],4),$TX[0]#d
xorb ($in0),$TY#b
movb $TY#b,($out,$in0)
lea 1($in0),$in0
dec $len
jnz .Loop1

.Ldone:
___
}

# Store the RC4 indices back in front of the table, restore the saved
# registers from the stack and return.
$code .= <<___;
#rc4# sub \$1,$XX[0]#b
#rc4# movl $XX[0]#d,-8($dat)
#rc4# movl $YY#d,-4($dat)

mov 40(%rsp),%r15
mov 48(%rsp),%r14
mov 56(%rsp),%r13
mov 64(%rsp),%r12
mov 72(%rsp),%rbp
mov 80(%rsp),%rbx
lea 88(%rsp),%rsp
.Lepilogue:
.Labort:
ret
.size $func,.-$func
___
| 444 | |||
# Sole purpose of this section is to provide the option to use the
# generated module as a drop-in replacement for rc4-x86_64.pl for
# debugging and testing purposes: when RC4 is generated with the dummy
# MD5 ($D set), an RC4_set_key routine is emitted as well.
if ($rc4 && $D) {
    # Local register names for this routine only; $dat/$len/$inp
    # deliberately shadow the file-level variables of the same names.
    my $idx = "%r8";
    my $ido = "%r9";
    my $dat = "%rdi";
    my $len = "%rsi";
    my $inp = "%rdx";

    $code .= <<___;
.globl RC4_set_key
.type RC4_set_key,\@function,3
.align 16
RC4_set_key:
_CET_ENDBR
lea 8($dat),$dat
lea ($inp,$len),$inp
neg $len
mov $len,%rcx
xor %eax,%eax
xor $ido,$ido
xor %r10,%r10
xor %r11,%r11
jmp .Lw1stloop

.align 16
.Lw1stloop:
mov %eax,($dat,%rax,4)
add \$1,%al
jnc .Lw1stloop

xor $ido,$ido
xor $idx,$idx
.align 16
.Lw2ndloop:
mov ($dat,$ido,4),%r10d
add ($inp,$len,1),$idx#b
add %r10b,$idx#b
add \$1,$len
mov ($dat,$idx,4),%r11d
cmovz %rcx,$len
mov %r10d,($dat,$idx,4)
mov %r11d,($dat,$ido,4)
add \$1,$ido#b
jnc .Lw2ndloop

xor %eax,%eax
mov %eax,-8($dat)
mov %eax,-4($dat)
ret
.size RC4_set_key,.-RC4_set_key
___
}
| 496 | |||
# reg_part(REG, CONV) -- return the name of the part of register REG
# selected by CONV: "b" (low byte), "w" (16 bits) or "d" (32 bits).
# Numbered registers (%r8, %r12, ...) simply get CONV appended; for the
# named registers the %r/%e prefix is rewritten.  Any other CONV leaves
# a named register unchanged.
sub reg_part {
    my ($name,$width)=@_;

    # %rN -> %rNb / %rNw / %rNd
    return $name.$width if ($name =~ /%r[0-9]+/);

    if ($width eq "b") {
        # %rax -> %al, %rbp -> %bpl, %rsi -> %sil
        $name =~ s/%[er]([^x]+)x?/%$1l/;
    }
    elsif ($width eq "w") {
        # %rax -> %ax
        $name =~ s/%[er](.+)/%$1/;
    }
    elsif ($width eq "d") {
        # %rax -> %eax
        $name =~ s/%[er](.+)/%e$1/;
    }

    return $name;
}
| 505 | |||
# ---- post-process the accumulated template and emit it -----------------

# "%reg#b", "%reg#w", "%reg#d" -> the matching sub-register name.
$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
# `expr` -> value of the Perl expression between the backticks.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
# "pinsrw $0," is rewritten as "movd ".
$code =~ s/pinsrw\s+\$0,/movd /gm;

# Enable the lines tagged for the algorithms being generated; tags of a
# disabled algorithm are left in place (presumably so that the
# translator treats those lines as comments -- confirm against
# x86_64-xlate.pl).
$code =~ s/#md5#//gm if ($md5);
$code =~ s/#rc4#//gm if ($rc4);

print $code or die "error writing assembly to translator: $!";

# STDOUT is the pipe to the translator: closing it waits for the child,
# and a false return means the output could not be flushed or the
# translator exited with a non-zero status.  Fail loudly so that a
# broken or truncated assembly file does not go unnoticed.
close STDOUT or die "error closing STDOUT: $! (translator status $?)";
