diff options
author | djm <> | 2005-04-29 05:39:33 +0000 |
---|---|---|
committer | djm <> | 2005-04-29 05:39:33 +0000 |
commit | 68edd00d9258df93b1366c71ac124e0cadf7bc08 (patch) | |
tree | 3ce4ae2a9747bbc11aed1f95f9bbea92c41f8683 /src/lib/libcrypto/rc4/asm/rc4-586.pl | |
parent | f396ed0f5ce0af56bfde2e75e15cf1f52924c779 (diff) | |
download | openbsd-68edd00d9258df93b1366c71ac124e0cadf7bc08.tar.gz openbsd-68edd00d9258df93b1366c71ac124e0cadf7bc08.tar.bz2 openbsd-68edd00d9258df93b1366c71ac124e0cadf7bc08.zip |
resolve conflicts
Diffstat (limited to 'src/lib/libcrypto/rc4/asm/rc4-586.pl')
-rw-r--r-- | src/lib/libcrypto/rc4/asm/rc4-586.pl | 114 |
1 files changed, 85 insertions, 29 deletions
diff --git a/src/lib/libcrypto/rc4/asm/rc4-586.pl b/src/lib/libcrypto/rc4/asm/rc4-586.pl index 7ef889e5a1..d6e98f0811 100644 --- a/src/lib/libcrypto/rc4/asm/rc4-586.pl +++ b/src/lib/libcrypto/rc4/asm/rc4-586.pl | |||
@@ -1,16 +1,37 @@ | |||
1 | #!/usr/local/bin/perl | 1 | #!/usr/local/bin/perl |
2 | 2 | ||
3 | # define for pentium pro friendly version | 3 | # At some point it became apparent that the original SSLeay RC4 |
4 | # assembler implementation performs suboptimally on latest IA-32 | ||
5 | # microarchitectures. After re-tuning, performance has changed as | ||
6 | # follows: | ||
7 | # | ||
8 | # Pentium +0% | ||
9 | # Pentium III +17% | ||
10 | # AMD +52%(*) | ||
11 | # P4 +180%(**) | ||
12 | # | ||
13 | # (*) This number is actually a trade-off:-) It's possible to | ||
14 | # achieve +72%, but at the cost of -48% off PIII performance. | ||
15 | # In other words code performing further 13% faster on AMD | ||
16 | # would perform almost 2 times slower on Intel PIII... | ||
17 | # For reference! This code delivers ~80% of rc4-amd64.pl | ||
18 | # performance on the same Opteron machine. | ||
19 | # (**) This number requires compressed key schedule set up by | ||
20 | # RC4_set_key and therefore doesn't apply to 0.9.7 [option for | ||
21 | # compressed key schedule is implemented in 0.9.8 and later, | ||
22 | # see commentary section in rc4_skey.c for further details]. | ||
23 | # | ||
24 | # <appro@fy.chalmers.se> | ||
4 | 25 | ||
5 | push(@INC,"perlasm","../../perlasm"); | 26 | push(@INC,"perlasm","../../perlasm"); |
6 | require "x86asm.pl"; | 27 | require "x86asm.pl"; |
7 | 28 | ||
8 | &asm_init($ARGV[0],"rc4-586.pl"); | 29 | &asm_init($ARGV[0],"rc4-586.pl"); |
9 | 30 | ||
10 | $tx="eax"; | 31 | $x="eax"; |
11 | $ty="ebx"; | 32 | $y="ebx"; |
12 | $x="ecx"; | 33 | $tx="ecx"; |
13 | $y="edx"; | 34 | $ty="edx"; |
14 | $in="esi"; | 35 | $in="esi"; |
15 | $out="edi"; | 36 | $out="edi"; |
16 | $d="ebp"; | 37 | $d="ebp"; |
@@ -31,7 +52,7 @@ sub RC4_loop | |||
31 | { | 52 | { |
32 | &mov($ty, &swtmp(2)); | 53 | &mov($ty, &swtmp(2)); |
33 | &cmp($ty, $in); | 54 | &cmp($ty, $in); |
34 | &jle(&label("finished")); | 55 | &jbe(&label("finished")); |
35 | &inc($in); | 56 | &inc($in); |
36 | } | 57 | } |
37 | else | 58 | else |
@@ -39,27 +60,23 @@ sub RC4_loop | |||
39 | &add($ty, 8); | 60 | &add($ty, 8); |
40 | &inc($in); | 61 | &inc($in); |
41 | &cmp($ty, $in); | 62 | &cmp($ty, $in); |
42 | &jl(&label("finished")); | 63 | &jb(&label("finished")); |
43 | &mov(&swtmp(2), $ty); | 64 | &mov(&swtmp(2), $ty); |
44 | } | 65 | } |
45 | } | 66 | } |
46 | # Moved out | 67 | # Moved out |
47 | # &mov( $tx, &DWP(0,$d,$x,4)) if $p < 0; | 68 | # &mov( $tx, &DWP(0,$d,$x,4)) if $p < 0; |
48 | 69 | ||
49 | &add( $y, $tx); | 70 | &add( &LB($y), &LB($tx)); |
50 | &and( $y, 0xff); | ||
51 | &inc( $x); # NEXT ROUND | ||
52 | &mov( $ty, &DWP(0,$d,$y,4)); | 71 | &mov( $ty, &DWP(0,$d,$y,4)); |
53 | # XXX | 72 | # XXX |
54 | &mov( &DWP(-4,$d,$x,4),$ty); # AGI | 73 | &mov( &DWP(0,$d,$x,4),$ty); |
55 | &add( $ty, $tx); | 74 | &add( $ty, $tx); |
56 | &and( $x, 0xff); # NEXT ROUND | ||
57 | &and( $ty, 0xff); | ||
58 | &mov( &DWP(0,$d,$y,4),$tx); | 75 | &mov( &DWP(0,$d,$y,4),$tx); |
59 | &nop(); | 76 | &and( $ty, 0xff); |
60 | &mov( $ty, &DWP(0,$d,$ty,4)); | 77 | &inc( &LB($x)); # NEXT ROUND |
61 | &mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND | 78 | &mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND |
62 | # XXX | 79 | &mov( $ty, &DWP(0,$d,$ty,4)); |
63 | 80 | ||
64 | if (!$char) | 81 | if (!$char) |
65 | { | 82 | { |
@@ -88,35 +105,47 @@ sub RC4 | |||
88 | 105 | ||
89 | &function_begin_B($name,""); | 106 | &function_begin_B($name,""); |
90 | 107 | ||
108 | &mov($ty,&wparam(1)); # len | ||
109 | &cmp($ty,0); | ||
110 | &jne(&label("proceed")); | ||
111 | &ret(); | ||
112 | &set_label("proceed"); | ||
113 | |||
91 | &comment(""); | 114 | &comment(""); |
92 | 115 | ||
93 | &push("ebp"); | 116 | &push("ebp"); |
94 | &push("ebx"); | 117 | &push("ebx"); |
95 | &mov( $d, &wparam(0)); # key | ||
96 | &mov( $ty, &wparam(1)); # num | ||
97 | &push("esi"); | 118 | &push("esi"); |
98 | &push("edi"); | 119 | &xor( $x, $x); # avoid partial register stalls |
120 | &push("edi"); | ||
121 | &xor( $y, $y); # avoid partial register stalls | ||
122 | &mov( $d, &wparam(0)); # key | ||
123 | &mov( $in, &wparam(2)); | ||
99 | 124 | ||
100 | &mov( $x, &DWP(0,$d,"",1)); | 125 | &movb( &LB($x), &BP(0,$d,"",1)); |
101 | &mov( $y, &DWP(4,$d,"",1)); | 126 | &movb( &LB($y), &BP(4,$d,"",1)); |
102 | 127 | ||
103 | &mov( $in, &wparam(2)); | 128 | &mov( $out, &wparam(3)); |
104 | &inc( $x); | 129 | &inc( &LB($x)); |
105 | 130 | ||
106 | &stack_push(3); # 3 temp variables | 131 | &stack_push(3); # 3 temp variables |
107 | &add( $d, 8); | 132 | &add( $d, 8); |
108 | &and( $x, 0xff); | 133 | |
134 | # detect compressed schedule, see commentary section in rc4_skey.c... | ||
135 | # in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant, | ||
136 | # as compressed key schedule is set up in 0.9.8 and later. | ||
137 | &cmp(&DWP(256,$d),-1); | ||
138 | &je(&label("RC4_CHAR")); | ||
109 | 139 | ||
110 | &lea( $ty, &DWP(-8,$ty,$in)); | 140 | &lea( $ty, &DWP(-8,$ty,$in)); |
111 | 141 | ||
112 | # check for 0 length input | 142 | # check for 0 length input |
113 | 143 | ||
114 | &mov( $out, &wparam(3)); | ||
115 | &mov( &swtmp(2), $ty); # this is now address to exit at | 144 | &mov( &swtmp(2), $ty); # this is now address to exit at |
116 | &mov( $tx, &DWP(0,$d,$x,4)); | 145 | &mov( $tx, &DWP(0,$d,$x,4)); |
117 | 146 | ||
118 | &cmp( $ty, $in); | 147 | &cmp( $ty, $in); |
119 | &jl( &label("end")); # less than 8 bytes | 148 | &jb( &label("end")); # less than 8 bytes |
120 | 149 | ||
121 | &set_label("start"); | 150 | &set_label("start"); |
122 | 151 | ||
@@ -148,7 +177,7 @@ sub RC4 | |||
148 | &mov( &DWP(-4,$out,"",0), $tx); | 177 | &mov( &DWP(-4,$out,"",0), $tx); |
149 | &mov( $tx, &DWP(0,$d,$x,4)); | 178 | &mov( $tx, &DWP(0,$d,$x,4)); |
150 | &cmp($in, $ty); | 179 | &cmp($in, $ty); |
151 | &jle(&label("start")); | 180 | &jbe(&label("start")); |
152 | 181 | ||
153 | &set_label("end"); | 182 | &set_label("end"); |
154 | 183 | ||
@@ -162,10 +191,37 @@ sub RC4 | |||
162 | &RC4_loop(5,0,1); | 191 | &RC4_loop(5,0,1); |
163 | &RC4_loop(6,1,1); | 192 | &RC4_loop(6,1,1); |
164 | 193 | ||
194 | &jmp(&label("finished")); | ||
195 | |||
196 | &align(16); | ||
197 | # this is essentially Intel P4 specific codepath, see rc4_skey.c, | ||
198 | # and is engaged in 0.9.8 and later context... | ||
199 | &set_label("RC4_CHAR"); | ||
200 | |||
201 | &lea ($ty,&DWP(0,$in,$ty)); | ||
202 | &mov (&swtmp(2),$ty); | ||
203 | |||
204 | # strangely enough unrolled loop performs over 20% slower... | ||
205 | &set_label("RC4_CHAR_loop"); | ||
206 | &movz ($tx,&BP(0,$d,$x)); | ||
207 | &add (&LB($y),&LB($tx)); | ||
208 | &movz ($ty,&BP(0,$d,$y)); | ||
209 | &movb (&BP(0,$d,$y),&LB($tx)); | ||
210 | &movb (&BP(0,$d,$x),&LB($ty)); | ||
211 | &add (&LB($ty),&LB($tx)); | ||
212 | &movz ($ty,&BP(0,$d,$ty)); | ||
213 | &xorb (&LB($ty),&BP(0,$in)); | ||
214 | &movb (&BP(0,$out),&LB($ty)); | ||
215 | &inc (&LB($x)); | ||
216 | &inc ($in); | ||
217 | &inc ($out); | ||
218 | &cmp ($in,&swtmp(2)); | ||
219 | &jb (&label("RC4_CHAR_loop")); | ||
220 | |||
165 | &set_label("finished"); | 221 | &set_label("finished"); |
166 | &dec( $x); | 222 | &dec( $x); |
167 | &stack_pop(3); | 223 | &stack_pop(3); |
168 | &mov( &DWP(-4,$d,"",0),$y); | 224 | &movb( &BP(-4,$d,"",0),&LB($y)); |
169 | &movb( &BP(-8,$d,"",0),&LB($x)); | 225 | &movb( &BP(-8,$d,"",0),&LB($x)); |
170 | 226 | ||
171 | &function_end($name); | 227 | &function_end($name); |