diff options
| author | djm <> | 2006-06-27 05:05:42 +0000 |
|---|---|---|
| committer | djm <> | 2006-06-27 05:05:42 +0000 |
| commit | 3f764f48d2626a43b6eeef7652c28303269d1204 (patch) | |
| tree | 764d513589e09d2d10dbe70039b5f3bf58a36803 /src/lib/libcrypto/rc4 | |
| parent | 0d2f07cb82812dd6f9e33c493104f4c24e5b13a3 (diff) | |
| parent | f6198d4d0ab97685dc56be2d48715ed39fcc74b9 (diff) | |
| download | openbsd-3f764f48d2626a43b6eeef7652c28303269d1204.tar.gz openbsd-3f764f48d2626a43b6eeef7652c28303269d1204.tar.bz2 openbsd-3f764f48d2626a43b6eeef7652c28303269d1204.zip | |
This commit was generated by cvs2git to track changes on a CVS vendor
branch.
Diffstat (limited to 'src/lib/libcrypto/rc4')
| -rwxr-xr-x | src/lib/libcrypto/rc4/asm/rc4-x86_64.pl | 150 |
1 files changed, 150 insertions, 0 deletions
diff --git a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl new file mode 100755 index 0000000000..b628daca70 --- /dev/null +++ b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl | |||
| @@ -0,0 +1,150 @@ | |||
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# Unlike 0.9.7f this code expects RC4_CHAR back in config line! See
# commentary section in corresponding script in development branch
# for background information about this option carousel. For those
# who don't have energy to figure out these gory details, here is
# basis in form of performance matrix relative to the original
# 0.9.7e C code-base:
#
#           0.9.7e  0.9.7f  this
# AMD64     1x      3.3x    2.4x
# EM64T     1x      0.8x    1.5x
#
# In other words idea is to trade -25% AMD64 performance to compensate
# for deterioration and gain +90% on EM64T core. Development branch
# maintains best performance for either target, i.e. 3.3x for AMD64
# and 1.5x for EM64T.
#
# Usage: rc4-x86_64.pl [output-file]
#
# Builds the assembly text for a global function named RC4 and prints
# it.  When an output file name is given, STDOUT is redirected to that
# file first; otherwise the text goes to the STDOUT the script was
# started with.

use strict;
use warnings;

my $output=shift;

# Redirect STDOUT to the requested file.  The low-precedence `or` is
# required here: `||` binds tighter than the list comma, so it would
# attach to the (always true) last operand and the die could never fire.
if (defined $output && length $output) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}

# Incoming arguments of RC4(), in the registers named below.
my $dat="%rdi";	# arg1
my $len="%rsi";	# arg2
my $inp="%rdx";	# arg3
my $out="%rcx";	# arg4

# Scratch registers.  @XX and @TX each hold a "current" ([0]) and a
# "next" ([1]) register; the unrolled loop swaps their roles after every
# round instead of moving values between registers.
my @XX=("%r8","%r10");
my @TX=("%r9","%r11");
my $YY="%r12";
my $TY="%r13";

# Register names are written as "<reg>#b" / "<reg>#d" inside the
# templates so that Perl does not read the size suffix as part of the
# variable name; the '#' is stripped just before printing.
#
# Prologue: return immediately when $len is zero, save %r12/%r13,
# advance $dat by 2 and load the two bytes that now sit at -2($dat) and
# -1($dat) into $XX[0] and $YY.  If $len has no bit set above the low
# three (i.e. fewer than 8 bytes), jump straight to the byte-at-a-time
# loop; otherwise save %rbx and enter the 8-byte loop, which starts by
# loading two 32-bit input words into %eax and %ebx.
my $code=<<___;
.text

.globl RC4
.type RC4,\@function
.align 16
RC4: or $len,$len
jne .Lentry
repret
.Lentry:
push %r12
push %r13

add \$2,$dat
movzb -2($dat),$XX[0]#d
movzb -1($dat),$YY#d

add \$1,$XX[0]#b
movzb ($dat,$XX[0]),$TX[0]#d
test \$-8,$len
jz .Lcloop1
push %rbx
.align 16 # incidentally aligned already
.Lcloop8:
mov ($inp),%eax
mov 4($inp),%ebx
___

# unroll 2x4-wise, because 64-bit rotates kill Intel P4...
#
# Eight rounds per .Lcloop8 iteration.  Rounds 0-3 XOR one table byte
# into %al and rotate %eax right by 8; rounds 4-7 do the same with
# %bl/%ebx.  After four rotations each 32-bit register is back in its
# original byte order with all four of its bytes processed.
#
# Every round preloads the table byte for the next index ($XX[1]) into
# $TX[1] before storing $TX[0] to ($dat,$YY).  When the next index
# equals $YY that preloaded byte is stale, so the cmp/jne/mov sequence
# replaces it with the byte that was just stored.
for my $i (0 .. 7) {
	my ($byte_reg, $word_reg) = $i < 4 ? ('%al', '%eax') : ('%bl', '%ebx');

	$code.=<<___;
add $TX[0]#b,$YY#b
lea 1($XX[0]),$XX[1]
movzb ($dat,$YY),$TY#d
movzb $XX[1]#b,$XX[1]#d
movzb ($dat,$XX[1]),$TX[1]#d
movb $TX[0]#b,($dat,$YY)
cmp $XX[1],$YY
movb $TY#b,($dat,$XX[0])
jne .Lcmov$i # Intel cmov is sloooow...
mov $TX[0],$TX[1]
.Lcmov$i:
add $TX[0]#b,$TY#b
xor ($dat,$TY),$byte_reg
ror \$8,$word_reg
___

	# "rotate" registers: the "next" pair becomes "current" for the
	# following round.  Eight swaps leave both arrays in their initial
	# order, which the epilogue template below relies on.
	push(@TX,shift(@TX));
	push(@XX,shift(@XX));
}

# Tail of the 8-byte loop (store both words, advance pointers, loop
# while at least 8 bytes remain), then .Lexit, which writes $XX[0]
# (less one, undoing the pre-increment) and $YY back to -2($dat) and
# -1($dat) and restores the saved registers.  .Lcloop1 handles the
# remaining bytes one at a time and then jumps to .Lexit.
$code.=<<___;
lea -8($len),$len
mov %eax,($out)
lea 8($inp),$inp
mov %ebx,4($out)
lea 8($out),$out

test \$-8,$len
jnz .Lcloop8
pop %rbx
cmp \$0,$len
jne .Lcloop1
.Lexit:
sub \$1,$XX[0]#b
movb $XX[0]#b,-2($dat)
movb $YY#b,-1($dat)

pop %r13
pop %r12
repret

.align 16
.Lcloop1:
add $TX[0]#b,$YY#b
movzb ($dat,$YY),$TY#d
movb $TX[0]#b,($dat,$YY)
movb $TY#b,($dat,$XX[0])
add $TX[0]#b,$TY#b
add \$1,$XX[0]#b
movzb ($dat,$TY),$TY#d
movzb ($dat,$XX[0]),$TX[0]#d
xorb ($inp),$TY#b
lea 1($inp),$inp
movb $TY#b,($out)
lea 1($out),$out
sub \$1,$len
jnz .Lcloop1
jmp .Lexit
.size RC4,.-RC4
___

# Drop the '#' separator so "%r8#d" becomes "%r8d", "%r12#b" becomes
# "%r12b", and so on.
$code =~ s/#([bwd])/$1/gm;

# Emit the "repret" placeholder as the raw bytes 0xF3,0xC3.
$code =~ s/repret/.byte\t0xF3,0xC3/gm;

print $code;

# Buffered write errors (e.g. a full disk) only surface when the handle
# is closed, so check it rather than leave a truncated file behind.
close STDOUT or die "can't write output: $!";
