diff options
author | ryker <> | 1998-10-05 20:13:15 +0000 |
---|---|---|
committer | ryker <> | 1998-10-05 20:13:15 +0000 |
commit | 9e77c62555877f9a64805c49d0dcd7dbfbb40f4e (patch) | |
tree | 2a6396b738ecede1e1dd3ad84c90e47e21d0bcbd /src/lib/libcrypto/des/asm | |
parent | fe5d0717e2760d02faf23bf5a714f17b33ae4abb (diff) | |
parent | 536c76cbb863bab152f19842ab88772c01e922c7 (diff) | |
download | openbsd-9e77c62555877f9a64805c49d0dcd7dbfbb40f4e.tar.gz openbsd-9e77c62555877f9a64805c49d0dcd7dbfbb40f4e.tar.bz2 openbsd-9e77c62555877f9a64805c49d0dcd7dbfbb40f4e.zip |
This commit was generated by cvs2git to track changes on a CVS vendor
branch.
Diffstat (limited to 'src/lib/libcrypto/des/asm')
-rw-r--r-- | src/lib/libcrypto/des/asm/des686.pl | 230 | ||||
-rw-r--r-- | src/lib/libcrypto/des/asm/readme | 131 |
2 files changed, 361 insertions, 0 deletions
diff --git a/src/lib/libcrypto/des/asm/des686.pl b/src/lib/libcrypto/des/asm/des686.pl new file mode 100644 index 0000000000..cf1a82fb5c --- /dev/null +++ b/src/lib/libcrypto/des/asm/des686.pl | |||
@@ -0,0 +1,230 @@ | |||
1 | #!/usr/bin/perl | ||
2 | |||
3 | $prog="des686.pl"; | ||
4 | |||
5 | # base code is in microsft | ||
6 | # op dest, source | ||
7 | # format. | ||
8 | # | ||
9 | |||
10 | # WILL NOT WORK ANYMORE WITH desboth.pl | ||
11 | require "desboth.pl"; | ||
12 | |||
13 | if ( ($ARGV[0] eq "elf")) | ||
14 | { require "x86unix.pl"; } | ||
15 | elsif ( ($ARGV[0] eq "a.out")) | ||
16 | { $aout=1; require "x86unix.pl"; } | ||
17 | elsif ( ($ARGV[0] eq "sol")) | ||
18 | { $sol=1; require "x86unix.pl"; } | ||
19 | elsif ( ($ARGV[0] eq "cpp")) | ||
20 | { $cpp=1; require "x86unix.pl"; } | ||
21 | elsif ( ($ARGV[0] eq "win32")) | ||
22 | { require "x86ms.pl"; } | ||
23 | else | ||
24 | { | ||
25 | print STDERR <<"EOF"; | ||
26 | Pick one target type from | ||
27 | elf - linux, FreeBSD etc | ||
28 | a.out - old linux | ||
29 | sol - x86 solaris | ||
30 | cpp - format so x86unix.cpp can be used | ||
31 | win32 - Windows 95/Windows NT | ||
32 | EOF | ||
33 | exit(1); | ||
34 | } | ||
35 | |||
36 | &comment("Don't even think of reading this code"); | ||
37 | &comment("It was automatically generated by $prog"); | ||
38 | &comment("Which is a perl program used to generate the x86 assember for"); | ||
39 | &comment("any of elf, a.out, Win32, or Solaris"); | ||
40 | &comment("It can be found in SSLeay 0.6.5+ or in libdes 3.26+"); | ||
41 | &comment("eric <eay\@cryptsoft.com>"); | ||
42 | &comment(""); | ||
43 | |||
44 | &file("dx86xxxx"); | ||
45 | |||
46 | $L="edi"; | ||
47 | $R="esi"; | ||
48 | |||
49 | &des_encrypt("des_encrypt",1); | ||
50 | &des_encrypt("des_encrypt2",0); | ||
51 | |||
52 | &des_encrypt3("des_encrypt3",1); | ||
53 | &des_encrypt3("des_decrypt3",0); | ||
54 | |||
55 | &file_end(); | ||
56 | |||
57 | sub des_encrypt | ||
58 | { | ||
59 | local($name,$do_ip)=@_; | ||
60 | |||
61 | &function_begin($name,"EXTRN _des_SPtrans:DWORD"); | ||
62 | |||
63 | &comment(""); | ||
64 | &comment("Load the 2 words"); | ||
65 | &mov("eax",&wparam(0)); | ||
66 | &mov($L,&DWP(0,"eax","",0)); | ||
67 | &mov($R,&DWP(4,"eax","",0)); | ||
68 | |||
69 | $ksp=&wparam(1); | ||
70 | |||
71 | if ($do_ip) | ||
72 | { | ||
73 | &comment(""); | ||
74 | &comment("IP"); | ||
75 | &IP_new($L,$R,"eax"); | ||
76 | } | ||
77 | |||
78 | &comment(""); | ||
79 | &comment("fixup rotate"); | ||
80 | &rotl($R,3); | ||
81 | &rotl($L,3); | ||
82 | &exch($L,$R); | ||
83 | |||
84 | &comment(""); | ||
85 | &comment("load counter, key_schedule and enc flag"); | ||
86 | &mov("eax",&wparam(2)); # get encrypt flag | ||
87 | &mov("ebp",&wparam(1)); # get ks | ||
88 | &cmp("eax","0"); | ||
89 | &je(&label("start_decrypt")); | ||
90 | |||
91 | # encrypting part | ||
92 | |||
93 | for ($i=0; $i<16; $i+=2) | ||
94 | { | ||
95 | &comment(""); | ||
96 | &comment("Round $i"); | ||
97 | &D_ENCRYPT($L,$R,$i*2,"ebp","des_SPtrans","ecx","edx","eax","ebx"); | ||
98 | |||
99 | &comment(""); | ||
100 | &comment("Round ".sprintf("%d",$i+1)); | ||
101 | &D_ENCRYPT($R,$L,($i+1)*2,"ebp","des_SPtrans","ecx","edx","eax","ebx"); | ||
102 | } | ||
103 | &jmp(&label("end")); | ||
104 | |||
105 | &set_label("start_decrypt"); | ||
106 | |||
107 | for ($i=15; $i>0; $i-=2) | ||
108 | { | ||
109 | &comment(""); | ||
110 | &comment("Round $i"); | ||
111 | &D_ENCRYPT($L,$R,$i*2,"ebp","des_SPtrans","ecx","edx","eax","ebx"); | ||
112 | &comment(""); | ||
113 | &comment("Round ".sprintf("%d",$i-1)); | ||
114 | &D_ENCRYPT($R,$L,($i-1)*2,"ebp","des_SPtrans","ecx","edx","eax","ebx"); | ||
115 | } | ||
116 | |||
117 | &set_label("end"); | ||
118 | |||
119 | &comment(""); | ||
120 | &comment("Fixup"); | ||
121 | &rotr($L,3); # r | ||
122 | &rotr($R,3); # l | ||
123 | |||
124 | if ($do_ip) | ||
125 | { | ||
126 | &comment(""); | ||
127 | &comment("FP"); | ||
128 | &FP_new($R,$L,"eax"); | ||
129 | } | ||
130 | |||
131 | &mov("eax",&wparam(0)); | ||
132 | &mov(&DWP(0,"eax","",0),$L); | ||
133 | &mov(&DWP(4,"eax","",0),$R); | ||
134 | |||
135 | &function_end($name); | ||
136 | } | ||
137 | |||
138 | |||
139 | # The logic is to load R into 2 registers and operate on both at the same time. | ||
140 | # We also load the 2 R's into 2 more registers so we can do the 'move word down a byte' | ||
141 | # while also masking the other copy and doing a lookup. We then also accumulate the | ||
142 | # L value in 2 registers then combine them at the end. | ||
143 | sub D_ENCRYPT | ||
144 | { | ||
145 | local($L,$R,$S,$ks,$desSP,$u,$t,$tmp1,$tmp2,$tmp3)=@_; | ||
146 | |||
147 | &mov( $u, &DWP(&n2a($S*4),$ks,"",0)); | ||
148 | &mov( $t, &DWP(&n2a(($S+1)*4),$ks,"",0)); | ||
149 | &xor( $u, $R ); | ||
150 | &xor( $t, $R ); | ||
151 | &rotr( $t, 4 ); | ||
152 | |||
153 | # the numbers at the end of the line are origional instruction order | ||
154 | &mov( $tmp2, $u ); # 1 2 | ||
155 | &mov( $tmp1, $t ); # 1 1 | ||
156 | &and( $tmp2, "0xfc" ); # 1 4 | ||
157 | &and( $tmp1, "0xfc" ); # 1 3 | ||
158 | &shr( $t, 8 ); # 1 5 | ||
159 | &xor( $L, &DWP("0x100+$desSP",$tmp1,"",0)); # 1 7 | ||
160 | &shr( $u, 8 ); # 1 6 | ||
161 | &mov( $tmp1, &DWP(" $desSP",$tmp2,"",0)); # 1 8 | ||
162 | |||
163 | &mov( $tmp2, $u ); # 2 2 | ||
164 | &xor( $L, $tmp1 ); # 1 9 | ||
165 | &and( $tmp2, "0xfc" ); # 2 4 | ||
166 | &mov( $tmp1, $t ); # 2 1 | ||
167 | &and( $tmp1, "0xfc" ); # 2 3 | ||
168 | &shr( $t, 8 ); # 2 5 | ||
169 | &xor( $L, &DWP("0x300+$desSP",$tmp1,"",0)); # 2 7 | ||
170 | &shr( $u, 8 ); # 2 6 | ||
171 | &mov( $tmp1, &DWP("0x200+$desSP",$tmp2,"",0)); # 2 8 | ||
172 | &mov( $tmp2, $u ); # 3 2 | ||
173 | |||
174 | &xor( $L, $tmp1 ); # 2 9 | ||
175 | &and( $tmp2, "0xfc" ); # 3 4 | ||
176 | |||
177 | &mov( $tmp1, $t ); # 3 1 | ||
178 | &shr( $u, 8 ); # 3 6 | ||
179 | &and( $tmp1, "0xfc" ); # 3 3 | ||
180 | &shr( $t, 8 ); # 3 5 | ||
181 | &xor( $L, &DWP("0x500+$desSP",$tmp1,"",0)); # 3 7 | ||
182 | &mov( $tmp1, &DWP("0x400+$desSP",$tmp2,"",0)); # 3 8 | ||
183 | |||
184 | &and( $t, "0xfc" ); # 4 1 | ||
185 | &xor( $L, $tmp1 ); # 3 9 | ||
186 | |||
187 | &and( $u, "0xfc" ); # 4 2 | ||
188 | &xor( $L, &DWP("0x700+$desSP",$t,"",0)); # 4 3 | ||
189 | &xor( $L, &DWP("0x600+$desSP",$u,"",0)); # 4 4 | ||
190 | } | ||
191 | |||
192 | sub PERM_OP | ||
193 | { | ||
194 | local($a,$b,$tt,$shift,$mask)=@_; | ||
195 | |||
196 | &mov( $tt, $a ); | ||
197 | &shr( $tt, $shift ); | ||
198 | &xor( $tt, $b ); | ||
199 | &and( $tt, $mask ); | ||
200 | &xor( $b, $tt ); | ||
201 | &shl( $tt, $shift ); | ||
202 | &xor( $a, $tt ); | ||
203 | } | ||
204 | |||
205 | sub IP_new | ||
206 | { | ||
207 | local($l,$r,$tt)=@_; | ||
208 | |||
209 | &PERM_OP($r,$l,$tt, 4,"0x0f0f0f0f"); | ||
210 | &PERM_OP($l,$r,$tt,16,"0x0000ffff"); | ||
211 | &PERM_OP($r,$l,$tt, 2,"0x33333333"); | ||
212 | &PERM_OP($l,$r,$tt, 8,"0x00ff00ff"); | ||
213 | &PERM_OP($r,$l,$tt, 1,"0x55555555"); | ||
214 | } | ||
215 | |||
216 | sub FP_new | ||
217 | { | ||
218 | local($l,$r,$tt)=@_; | ||
219 | |||
220 | &PERM_OP($l,$r,$tt, 1,"0x55555555"); | ||
221 | &PERM_OP($r,$l,$tt, 8,"0x00ff00ff"); | ||
222 | &PERM_OP($l,$r,$tt, 2,"0x33333333"); | ||
223 | &PERM_OP($r,$l,$tt,16,"0x0000ffff"); | ||
224 | &PERM_OP($l,$r,$tt, 4,"0x0f0f0f0f"); | ||
225 | } | ||
226 | |||
227 | sub n2a | ||
228 | { | ||
229 | sprintf("%d",$_[0]); | ||
230 | } | ||
diff --git a/src/lib/libcrypto/des/asm/readme b/src/lib/libcrypto/des/asm/readme new file mode 100644 index 0000000000..f8529d9307 --- /dev/null +++ b/src/lib/libcrypto/des/asm/readme | |||
@@ -0,0 +1,131 @@ | |||
1 | First up, let me say I don't like writing in assembler. It is not portable, | ||
2 | dependant on the particular CPU architecture release and is generally a pig | ||
3 | to debug and get right. Having said that, the x86 architecture is probably | ||
4 | the most important for speed due to number of boxes and since | ||
5 | it appears to be the worst architecture to to get | ||
6 | good C compilers for. So due to this, I have lowered myself to do | ||
7 | assembler for the inner DES routines in libdes :-). | ||
8 | |||
9 | The file to implement in assembler is des_enc.c. Replace the following | ||
10 | 4 functions | ||
11 | des_encrypt(DES_LONG data[2],des_key_schedule ks, int encrypt); | ||
12 | des_encrypt2(DES_LONG data[2],des_key_schedule ks, int encrypt); | ||
13 | des_encrypt3(DES_LONG data[2],des_key_schedule ks1,ks2,ks3); | ||
14 | des_decrypt3(DES_LONG data[2],des_key_schedule ks1,ks2,ks3); | ||
15 | |||
16 | They encrypt/decrypt the 64 bits held in 'data' using | ||
17 | the 'ks' key schedules. The only difference between the 4 functions is that | ||
18 | des_encrypt2() does not perform IP() or FP() on the data (this is an | ||
19 | optimization for when doing triple DES and des_encrypt3() and des_decrypt3() | ||
20 | perform triple des. The triple DES routines are in here because it does | ||
21 | make a big difference to have them located near the des_encrypt2 function | ||
22 | at link time.. | ||
23 | |||
24 | Now as we all know, there are lots of different operating systems running on | ||
25 | x86 boxes, and unfortunately they normally try to make sure their assembler | ||
26 | formating is not the same as the other peoples. | ||
27 | The 4 main formats I know of are | ||
28 | Microsoft Windows 95/Windows NT | ||
29 | Elf Includes Linux and FreeBSD(?). | ||
30 | a.out The older Linux. | ||
31 | Solaris Same as Elf but different comments :-(. | ||
32 | |||
33 | Now I was not overly keen to write 4 different copies of the same code, | ||
34 | so I wrote a few perl routines to output the correct assembler, given | ||
35 | a target assembler type. This code is ugly and is just a hack. | ||
36 | The libraries are x86unix.pl and x86ms.pl. | ||
37 | des586.pl, des686.pl and des-som[23].pl are the programs to actually | ||
38 | generate the assembler. | ||
39 | |||
40 | So to generate elf assembler | ||
41 | perl des-som3.pl elf >dx86-elf.s | ||
42 | For Windows 95/NT | ||
43 | perl des-som2.pl win32 >win32.asm | ||
44 | |||
45 | [ update 4 Jan 1996 ] | ||
46 | I have added another way to do things. | ||
47 | perl des-som3.pl cpp >dx86-cpp.s | ||
48 | generates a file that will be included by dx86unix.cpp when it is compiled. | ||
49 | To build for elf, a.out, solaris, bsdi etc, | ||
50 | cc -E -DELF asm/dx86unix.cpp | as -o asm/dx86-elf.o | ||
51 | cc -E -DSOL asm/dx86unix.cpp | as -o asm/dx86-sol.o | ||
52 | cc -E -DOUT asm/dx86unix.cpp | as -o asm/dx86-out.o | ||
53 | cc -E -DBSDI asm/dx86unix.cpp | as -o asm/dx86bsdi.o | ||
54 | This was done to cut down the number of files in the distribution. | ||
55 | |||
56 | Now the ugly part. I acquired my copy of Intels | ||
57 | "Optimization's For Intel's 32-Bit Processors" and found a few interesting | ||
58 | things. First, the aim of the exersize is to 'extract' one byte at a time | ||
59 | from a word and do an array lookup. This involves getting the byte from | ||
60 | the 4 locations in the word and moving it to a new word and doing the lookup. | ||
61 | The most obvious way to do this is | ||
62 | xor eax, eax # clear word | ||
63 | movb al, cl # get low byte | ||
64 | xor edi DWORD PTR 0x100+des_SP[eax] # xor in word | ||
65 | movb al, ch # get next byte | ||
66 | xor edi DWORD PTR 0x300+des_SP[eax] # xor in word | ||
67 | shr ecx 16 | ||
68 | which seems ok. For the pentium, this system appears to be the best. | ||
69 | One has to do instruction interleaving to keep both functional units | ||
70 | operating, but it is basically very efficient. | ||
71 | |||
72 | Now the crunch. When a full register is used after a partial write, eg. | ||
73 | mov al, cl | ||
74 | xor edi, DWORD PTR 0x100+des_SP[eax] | ||
75 | 386 - 1 cycle stall | ||
76 | 486 - 1 cycle stall | ||
77 | 586 - 0 cycle stall | ||
78 | 686 - at least 7 cycle stall (page 22 of the above mentioned document). | ||
79 | |||
80 | So the technique that produces the best results on a pentium, according to | ||
81 | the documentation, will produce hideous results on a pentium pro. | ||
82 | |||
83 | To get around this, des686.pl will generate code that is not as fast on | ||
84 | a pentium, should be very good on a pentium pro. | ||
85 | mov eax, ecx # copy word | ||
86 | shr ecx, 8 # line up next byte | ||
87 | and eax, 0fch # mask byte | ||
88 | xor edi DWORD PTR 0x100+des_SP[eax] # xor in array lookup | ||
89 | mov eax, ecx # get word | ||
90 | shr ecx 8 # line up next byte | ||
91 | and eax, 0fch # mask byte | ||
92 | xor edi DWORD PTR 0x300+des_SP[eax] # xor in array lookup | ||
93 | |||
94 | Due to the execution units in the pentium, this actually works quite well. | ||
95 | For a pentium pro it should be very good. This is the type of output | ||
96 | Visual C++ generates. | ||
97 | |||
98 | There is a third option. instead of using | ||
99 | mov al, ch | ||
100 | which is bad on the pentium pro, one may be able to use | ||
101 | movzx eax, ch | ||
102 | which may not incur the partial write penalty. On the pentium, | ||
103 | this instruction takes 4 cycles so is not worth using but on the | ||
104 | pentium pro it appears it may be worth while. I need access to one to | ||
105 | experiment :-). | ||
106 | |||
107 | eric (20 Oct 1996) | ||
108 | |||
109 | 22 Nov 1996 - I have asked people to run the 2 different version on pentium | ||
110 | pros and it appears that the intel documentation is wrong. The | ||
111 | mov al,bh is still faster on a pentium pro, so just use the des586.pl | ||
112 | install des686.pl | ||
113 | |||
114 | 3 Dec 1996 - I added des_encrypt3/des_decrypt3 because I have moved these | ||
115 | functions into des_enc.c because it does make a massive performance | ||
116 | difference on some boxes to have the functions code located close to | ||
117 | the des_encrypt2() function. | ||
118 | |||
119 | 9 Jan 1997 - des-som2.pl is now the correct perl script to use for | ||
120 | pentiums. It contains an inner loop from | ||
121 | Svend Olaf Mikkelsen <svolaf@inet.uni-c.dk> which does raw ecb DES calls at | ||
122 | 273,000 per second. He had a previous version at 250,000 and the best | ||
123 | I was able to get was 203,000. The content has not changed, this is all | ||
124 | due to instruction sequencing (and actual instructions choice) which is able | ||
125 | to keep both functional units of the pentium going. | ||
126 | We may have lost the ugly register usage restrictions when x86 went 32 bit | ||
127 | but for the pentium it has been replaced by evil instruction ordering tricks. | ||
128 | |||
129 | 13 Jan 1997 - des-som3.pl, more optimizations from Svend Olaf. | ||
130 | raw DES at 281,000 per second on a pentium 100. | ||
131 | |||