summaryrefslogtreecommitdiff
path: root/src/lib
diff options
context:
space:
mode:
authormiod <>2015-09-12 09:01:45 +0000
committermiod <>2015-09-12 09:01:45 +0000
commit6144f48e89ae4129a0d4588a071fb1025e5211b7 (patch)
tree19a680ffcbc96d38180b75e3c4aaf549df5f076f /src/lib
parent4c26aee9b32371e04636caf4e3188a68b04c2a0c (diff)
downloadopenbsd-6144f48e89ae4129a0d4588a071fb1025e5211b7.tar.gz
openbsd-6144f48e89ae4129a0d4588a071fb1025e5211b7.tar.bz2
openbsd-6144f48e89ae4129a0d4588a071fb1025e5211b7.zip
Remove horribly old and outdated `documentation' for the assembly code.
Diffstat (limited to 'src/lib')
-rw-r--r--src/lib/libcrypto/bf/asm/bf-686.pl127
-rw-r--r--src/lib/libcrypto/bf/asm/readme10
-rw-r--r--src/lib/libcrypto/bn/asm/README27
-rw-r--r--src/lib/libcrypto/camellia/asm/BSD_license.txt24
-rw-r--r--src/lib/libcrypto/cast/asm/readme7
-rw-r--r--src/lib/libcrypto/des/asm/readme131
-rw-r--r--src/lib/libssl/src/crypto/bf/asm/bf-686.pl127
-rw-r--r--src/lib/libssl/src/crypto/bf/asm/readme10
-rw-r--r--src/lib/libssl/src/crypto/bn/asm/README27
-rw-r--r--src/lib/libssl/src/crypto/camellia/asm/BSD_license.txt24
-rw-r--r--src/lib/libssl/src/crypto/cast/asm/readme7
-rw-r--r--src/lib/libssl/src/crypto/des/asm/readme131
12 files changed, 0 insertions, 652 deletions
diff --git a/src/lib/libcrypto/bf/asm/bf-686.pl b/src/lib/libcrypto/bf/asm/bf-686.pl
deleted file mode 100644
index 8e4c25f598..0000000000
--- a/src/lib/libcrypto/bf/asm/bf-686.pl
+++ /dev/null
@@ -1,127 +0,0 @@
1#!/usr/local/bin/perl
2
3push(@INC,"perlasm","../../perlasm");
4require "x86asm.pl";
5require "cbc.pl";
6
7&asm_init($ARGV[0],"bf-686.pl");
8
9$BF_ROUNDS=16;
10$BF_OFF=($BF_ROUNDS+2)*4;
11$L="ecx";
12$R="edx";
13$P="edi";
14$tot="esi";
15$tmp1="eax";
16$tmp2="ebx";
17$tmp3="ebp";
18
19&des_encrypt("BF_encrypt",1);
20&des_encrypt("BF_decrypt",0);
21&cbc("BF_cbc_encrypt","BF_encrypt","BF_decrypt",1,4,5,3,-1,-1);
22
23&asm_finish();
24
25&file_end();
26
27sub des_encrypt
28 {
29 local($name,$enc)=@_;
30
31 &function_begin($name,"");
32
33 &comment("");
34 &comment("Load the 2 words");
35 &mov("eax",&wparam(0));
36 &mov($L,&DWP(0,"eax","",0));
37 &mov($R,&DWP(4,"eax","",0));
38
39 &comment("");
40 &comment("P pointer, s and enc flag");
41 &mov($P,&wparam(1));
42
43 &xor( $tmp1, $tmp1);
44 &xor( $tmp2, $tmp2);
45
46 # encrypting part
47
48 if ($enc)
49 {
50 &xor($L,&DWP(0,$P,"",0));
51 for ($i=0; $i<$BF_ROUNDS; $i+=2)
52 {
53 &comment("");
54 &comment("Round $i");
55 &BF_ENCRYPT($i+1,$R,$L,$P,$tot,$tmp1,$tmp2,$tmp3);
56
57 &comment("");
58 &comment("Round ".sprintf("%d",$i+1));
59 &BF_ENCRYPT($i+2,$L,$R,$P,$tot,$tmp1,$tmp2,$tmp3);
60 }
61 &xor($R,&DWP(($BF_ROUNDS+1)*4,$P,"",0));
62
63 &mov("eax",&wparam(0));
64 &mov(&DWP(0,"eax","",0),$R);
65 &mov(&DWP(4,"eax","",0),$L);
66 &function_end_A($name);
67 }
68 else
69 {
70 &xor($L,&DWP(($BF_ROUNDS+1)*4,$P,"",0));
71 for ($i=$BF_ROUNDS; $i>0; $i-=2)
72 {
73 &comment("");
74 &comment("Round $i");
75 &BF_ENCRYPT($i,$R,$L,$P,$tot,$tmp1,$tmp2,$tmp3);
76 &comment("");
77 &comment("Round ".sprintf("%d",$i-1));
78 &BF_ENCRYPT($i-1,$L,$R,$P,$tot,$tmp1,$tmp2,$tmp3);
79 }
80 &xor($R,&DWP(0,$P,"",0));
81
82 &mov("eax",&wparam(0));
83 &mov(&DWP(0,"eax","",0),$R);
84 &mov(&DWP(4,"eax","",0),$L);
85 &function_end_A($name);
86 }
87
88 &function_end_B($name);
89 }
90
91sub BF_ENCRYPT
92 {
93 local($i,$L,$R,$P,$tot,$tmp1,$tmp2,$tmp3)=@_;
94
95 &rotr( $R, 16);
96 &mov( $tot, &DWP(&n2a($i*4),$P,"",0));
97
98 &movb( &LB($tmp1), &HB($R));
99 &movb( &LB($tmp2), &LB($R));
100
101 &rotr( $R, 16);
102 &xor( $L, $tot);
103
104 &mov( $tot, &DWP(&n2a($BF_OFF+0x0000),$P,$tmp1,4));
105 &mov( $tmp3, &DWP(&n2a($BF_OFF+0x0400),$P,$tmp2,4));
106
107 &movb( &LB($tmp1), &HB($R));
108 &movb( &LB($tmp2), &LB($R));
109
110 &add( $tot, $tmp3);
111 &mov( $tmp1, &DWP(&n2a($BF_OFF+0x0800),$P,$tmp1,4)); # delay
112
113 &xor( $tot, $tmp1);
114 &mov( $tmp3, &DWP(&n2a($BF_OFF+0x0C00),$P,$tmp2,4));
115
116 &add( $tot, $tmp3);
117 &xor( $tmp1, $tmp1);
118
119 &xor( $L, $tot);
120 # delay
121 }
122
123sub n2a
124 {
125 sprintf("%d",$_[0]);
126 }
127
diff --git a/src/lib/libcrypto/bf/asm/readme b/src/lib/libcrypto/bf/asm/readme
deleted file mode 100644
index 2385fa3812..0000000000
--- a/src/lib/libcrypto/bf/asm/readme
+++ /dev/null
@@ -1,10 +0,0 @@
1There are blowfish assembler generation scripts.
2bf-586.pl version is for the pentium and
3bf-686.pl is my original version, which is faster on the pentium pro.
4
5When using a bf-586.pl, the pentium pro/II is %8 slower than using
6bf-686.pl. When using a bf-686.pl, the pentium is %16 slower
7than bf-586.pl
8
9So the default is bf-586.pl
10
diff --git a/src/lib/libcrypto/bn/asm/README b/src/lib/libcrypto/bn/asm/README
deleted file mode 100644
index 323d1a06b9..0000000000
--- a/src/lib/libcrypto/bn/asm/README
+++ /dev/null
@@ -1,27 +0,0 @@
1<OBSOLETE>
2
3All assembler in this directory are just version of the file
4crypto/bn/bn_asm.c.
5
6Quite a few of these files are just the assembler output from gcc since on
7quite a few machines they are 2 times faster than the system compiler.
8
9For the x86, I have hand written assembler because of the bad job all
10compilers seem to do on it. This normally gives a 2 time speed up in the RSA
11routines.
12
13For the DEC alpha, I also hand wrote the assembler (except the division which
14is just the output from the C compiler pasted on the end of the file).
15On the 2 alpha C compilers I had access to, it was not possible to do
1664b x 64b -> 128b calculations (both long and the long long data types
17were 64 bits). So the hand assembler gives access to the 128 bit result and
18a 2 times speedup :-).
19
20There are 3 versions of assembler for the HP PA-RISC.
21
22pa-risc.s is the origional one which works fine and generated using gcc :-)
23
24pa-risc2W.s and pa-risc2.s are 64 and 32-bit PA-RISC 2.0 implementations
25by Chris Ruemmler from HP (with some help from the HP C compiler).
26
27</OBSOLETE>
diff --git a/src/lib/libcrypto/camellia/asm/BSD_license.txt b/src/lib/libcrypto/camellia/asm/BSD_license.txt
deleted file mode 100644
index 591975cb98..0000000000
--- a/src/lib/libcrypto/camellia/asm/BSD_license.txt
+++ /dev/null
@@ -1,24 +0,0 @@
1Camellia assembler implementation.
2
3Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions
7are met:
81. Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer as
10 the first lines of this file unmodified.
112. Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15THIS SOFTWARE IS PROVIDED BY Andy Polyakov ``AS IS'' AND ANY EXPRESS OR
16IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18IN NO EVENT SHALL NTT BE LIABLE FOR ANY DIRECT, INDIRECT,
19INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/lib/libcrypto/cast/asm/readme b/src/lib/libcrypto/cast/asm/readme
deleted file mode 100644
index fbcd76289e..0000000000
--- a/src/lib/libcrypto/cast/asm/readme
+++ /dev/null
@@ -1,7 +0,0 @@
1There is a ppro flag in cast-586 which turns on/off
2generation of pentium pro/II friendly code
3
4This flag makes the inner loop one cycle longer, but generates
5code that runs %30 faster on the pentium pro/II, while only %7 slower
6on the pentium. By default, this flag is on.
7
diff --git a/src/lib/libcrypto/des/asm/readme b/src/lib/libcrypto/des/asm/readme
deleted file mode 100644
index 1beafe253b..0000000000
--- a/src/lib/libcrypto/des/asm/readme
+++ /dev/null
@@ -1,131 +0,0 @@
1First up, let me say I don't like writing in assembler. It is not portable,
2dependant on the particular CPU architecture release and is generally a pig
3to debug and get right. Having said that, the x86 architecture is probably
4the most important for speed due to number of boxes and since
5it appears to be the worst architecture to to get
6good C compilers for. So due to this, I have lowered myself to do
7assembler for the inner DES routines in libdes :-).
8
9The file to implement in assembler is des_enc.c. Replace the following
104 functions
11des_encrypt1(DES_LONG data[2],des_key_schedule ks, int encrypt);
12des_encrypt2(DES_LONG data[2],des_key_schedule ks, int encrypt);
13des_encrypt3(DES_LONG data[2],des_key_schedule ks1,ks2,ks3);
14des_decrypt3(DES_LONG data[2],des_key_schedule ks1,ks2,ks3);
15
16They encrypt/decrypt the 64 bits held in 'data' using
17the 'ks' key schedules. The only difference between the 4 functions is that
18des_encrypt2() does not perform IP() or FP() on the data (this is an
19optimization for when doing triple DES and des_encrypt3() and des_decrypt3()
20perform triple des. The triple DES routines are in here because it does
21make a big difference to have them located near the des_encrypt2 function
22at link time..
23
24Now as we all know, there are lots of different operating systems running on
25x86 boxes, and unfortunately they normally try to make sure their assembler
26formating is not the same as the other peoples.
27The 4 main formats I know of are
28Microsoft Windows 95/Windows NT
29Elf Includes Linux and FreeBSD(?).
30a.out The older Linux.
31Solaris Same as Elf but different comments :-(.
32
33Now I was not overly keen to write 4 different copies of the same code,
34so I wrote a few perl routines to output the correct assembler, given
35a target assembler type. This code is ugly and is just a hack.
36The libraries are x86unix.pl and x86ms.pl.
37des586.pl, des686.pl and des-som[23].pl are the programs to actually
38generate the assembler.
39
40So to generate elf assembler
41perl des-som3.pl elf >dx86-elf.s
42For Windows 95/NT
43perl des-som2.pl win32 >win32.asm
44
45[ update 4 Jan 1996 ]
46I have added another way to do things.
47perl des-som3.pl cpp >dx86-cpp.s
48generates a file that will be included by dx86unix.cpp when it is compiled.
49To build for elf, a.out, solaris, bsdi etc,
50cc -E -DELF asm/dx86unix.cpp | as -o asm/dx86-elf.o
51cc -E -DSOL asm/dx86unix.cpp | as -o asm/dx86-sol.o
52cc -E -DOUT asm/dx86unix.cpp | as -o asm/dx86-out.o
53cc -E -DBSDI asm/dx86unix.cpp | as -o asm/dx86bsdi.o
54This was done to cut down the number of files in the distribution.
55
56Now the ugly part. I acquired my copy of Intels
57"Optimization's For Intel's 32-Bit Processors" and found a few interesting
58things. First, the aim of the exersize is to 'extract' one byte at a time
59from a word and do an array lookup. This involves getting the byte from
60the 4 locations in the word and moving it to a new word and doing the lookup.
61The most obvious way to do this is
62xor eax, eax # clear word
63movb al, cl # get low byte
64xor edi DWORD PTR 0x100+des_SP[eax] # xor in word
65movb al, ch # get next byte
66xor edi DWORD PTR 0x300+des_SP[eax] # xor in word
67shr ecx 16
68which seems ok. For the pentium, this system appears to be the best.
69One has to do instruction interleaving to keep both functional units
70operating, but it is basically very efficient.
71
72Now the crunch. When a full register is used after a partial write, eg.
73mov al, cl
74xor edi, DWORD PTR 0x100+des_SP[eax]
75386 - 1 cycle stall
76486 - 1 cycle stall
77586 - 0 cycle stall
78686 - at least 7 cycle stall (page 22 of the above mentioned document).
79
80So the technique that produces the best results on a pentium, according to
81the documentation, will produce hideous results on a pentium pro.
82
83To get around this, des686.pl will generate code that is not as fast on
84a pentium, should be very good on a pentium pro.
85mov eax, ecx # copy word
86shr ecx, 8 # line up next byte
87and eax, 0fch # mask byte
88xor edi DWORD PTR 0x100+des_SP[eax] # xor in array lookup
89mov eax, ecx # get word
90shr ecx 8 # line up next byte
91and eax, 0fch # mask byte
92xor edi DWORD PTR 0x300+des_SP[eax] # xor in array lookup
93
94Due to the execution units in the pentium, this actually works quite well.
95For a pentium pro it should be very good. This is the type of output
96Visual C++ generates.
97
98There is a third option. instead of using
99mov al, ch
100which is bad on the pentium pro, one may be able to use
101movzx eax, ch
102which may not incur the partial write penalty. On the pentium,
103this instruction takes 4 cycles so is not worth using but on the
104pentium pro it appears it may be worth while. I need access to one to
105experiment :-).
106
107eric (20 Oct 1996)
108
10922 Nov 1996 - I have asked people to run the 2 different version on pentium
110pros and it appears that the intel documentation is wrong. The
111mov al,bh is still faster on a pentium pro, so just use the des586.pl
112install des686.pl
113
1143 Dec 1996 - I added des_encrypt3/des_decrypt3 because I have moved these
115functions into des_enc.c because it does make a massive performance
116difference on some boxes to have the functions code located close to
117the des_encrypt2() function.
118
1199 Jan 1997 - des-som2.pl is now the correct perl script to use for
120pentiums. It contains an inner loop from
121Svend Olaf Mikkelsen <svolaf@inet.uni-c.dk> which does raw ecb DES calls at
122273,000 per second. He had a previous version at 250,000 and the best
123I was able to get was 203,000. The content has not changed, this is all
124due to instruction sequencing (and actual instructions choice) which is able
125to keep both functional units of the pentium going.
126We may have lost the ugly register usage restrictions when x86 went 32 bit
127but for the pentium it has been replaced by evil instruction ordering tricks.
128
12913 Jan 1997 - des-som3.pl, more optimizations from Svend Olaf.
130raw DES at 281,000 per second on a pentium 100.
131
diff --git a/src/lib/libssl/src/crypto/bf/asm/bf-686.pl b/src/lib/libssl/src/crypto/bf/asm/bf-686.pl
deleted file mode 100644
index 8e4c25f598..0000000000
--- a/src/lib/libssl/src/crypto/bf/asm/bf-686.pl
+++ /dev/null
@@ -1,127 +0,0 @@
1#!/usr/local/bin/perl
2
3push(@INC,"perlasm","../../perlasm");
4require "x86asm.pl";
5require "cbc.pl";
6
7&asm_init($ARGV[0],"bf-686.pl");
8
9$BF_ROUNDS=16;
10$BF_OFF=($BF_ROUNDS+2)*4;
11$L="ecx";
12$R="edx";
13$P="edi";
14$tot="esi";
15$tmp1="eax";
16$tmp2="ebx";
17$tmp3="ebp";
18
19&des_encrypt("BF_encrypt",1);
20&des_encrypt("BF_decrypt",0);
21&cbc("BF_cbc_encrypt","BF_encrypt","BF_decrypt",1,4,5,3,-1,-1);
22
23&asm_finish();
24
25&file_end();
26
27sub des_encrypt
28 {
29 local($name,$enc)=@_;
30
31 &function_begin($name,"");
32
33 &comment("");
34 &comment("Load the 2 words");
35 &mov("eax",&wparam(0));
36 &mov($L,&DWP(0,"eax","",0));
37 &mov($R,&DWP(4,"eax","",0));
38
39 &comment("");
40 &comment("P pointer, s and enc flag");
41 &mov($P,&wparam(1));
42
43 &xor( $tmp1, $tmp1);
44 &xor( $tmp2, $tmp2);
45
46 # encrypting part
47
48 if ($enc)
49 {
50 &xor($L,&DWP(0,$P,"",0));
51 for ($i=0; $i<$BF_ROUNDS; $i+=2)
52 {
53 &comment("");
54 &comment("Round $i");
55 &BF_ENCRYPT($i+1,$R,$L,$P,$tot,$tmp1,$tmp2,$tmp3);
56
57 &comment("");
58 &comment("Round ".sprintf("%d",$i+1));
59 &BF_ENCRYPT($i+2,$L,$R,$P,$tot,$tmp1,$tmp2,$tmp3);
60 }
61 &xor($R,&DWP(($BF_ROUNDS+1)*4,$P,"",0));
62
63 &mov("eax",&wparam(0));
64 &mov(&DWP(0,"eax","",0),$R);
65 &mov(&DWP(4,"eax","",0),$L);
66 &function_end_A($name);
67 }
68 else
69 {
70 &xor($L,&DWP(($BF_ROUNDS+1)*4,$P,"",0));
71 for ($i=$BF_ROUNDS; $i>0; $i-=2)
72 {
73 &comment("");
74 &comment("Round $i");
75 &BF_ENCRYPT($i,$R,$L,$P,$tot,$tmp1,$tmp2,$tmp3);
76 &comment("");
77 &comment("Round ".sprintf("%d",$i-1));
78 &BF_ENCRYPT($i-1,$L,$R,$P,$tot,$tmp1,$tmp2,$tmp3);
79 }
80 &xor($R,&DWP(0,$P,"",0));
81
82 &mov("eax",&wparam(0));
83 &mov(&DWP(0,"eax","",0),$R);
84 &mov(&DWP(4,"eax","",0),$L);
85 &function_end_A($name);
86 }
87
88 &function_end_B($name);
89 }
90
91sub BF_ENCRYPT
92 {
93 local($i,$L,$R,$P,$tot,$tmp1,$tmp2,$tmp3)=@_;
94
95 &rotr( $R, 16);
96 &mov( $tot, &DWP(&n2a($i*4),$P,"",0));
97
98 &movb( &LB($tmp1), &HB($R));
99 &movb( &LB($tmp2), &LB($R));
100
101 &rotr( $R, 16);
102 &xor( $L, $tot);
103
104 &mov( $tot, &DWP(&n2a($BF_OFF+0x0000),$P,$tmp1,4));
105 &mov( $tmp3, &DWP(&n2a($BF_OFF+0x0400),$P,$tmp2,4));
106
107 &movb( &LB($tmp1), &HB($R));
108 &movb( &LB($tmp2), &LB($R));
109
110 &add( $tot, $tmp3);
111 &mov( $tmp1, &DWP(&n2a($BF_OFF+0x0800),$P,$tmp1,4)); # delay
112
113 &xor( $tot, $tmp1);
114 &mov( $tmp3, &DWP(&n2a($BF_OFF+0x0C00),$P,$tmp2,4));
115
116 &add( $tot, $tmp3);
117 &xor( $tmp1, $tmp1);
118
119 &xor( $L, $tot);
120 # delay
121 }
122
123sub n2a
124 {
125 sprintf("%d",$_[0]);
126 }
127
diff --git a/src/lib/libssl/src/crypto/bf/asm/readme b/src/lib/libssl/src/crypto/bf/asm/readme
deleted file mode 100644
index 2385fa3812..0000000000
--- a/src/lib/libssl/src/crypto/bf/asm/readme
+++ /dev/null
@@ -1,10 +0,0 @@
1There are blowfish assembler generation scripts.
2bf-586.pl version is for the pentium and
3bf-686.pl is my original version, which is faster on the pentium pro.
4
5When using a bf-586.pl, the pentium pro/II is %8 slower than using
6bf-686.pl. When using a bf-686.pl, the pentium is %16 slower
7than bf-586.pl
8
9So the default is bf-586.pl
10
diff --git a/src/lib/libssl/src/crypto/bn/asm/README b/src/lib/libssl/src/crypto/bn/asm/README
deleted file mode 100644
index 323d1a06b9..0000000000
--- a/src/lib/libssl/src/crypto/bn/asm/README
+++ /dev/null
@@ -1,27 +0,0 @@
1<OBSOLETE>
2
3All assembler in this directory are just version of the file
4crypto/bn/bn_asm.c.
5
6Quite a few of these files are just the assembler output from gcc since on
7quite a few machines they are 2 times faster than the system compiler.
8
9For the x86, I have hand written assembler because of the bad job all
10compilers seem to do on it. This normally gives a 2 time speed up in the RSA
11routines.
12
13For the DEC alpha, I also hand wrote the assembler (except the division which
14is just the output from the C compiler pasted on the end of the file).
15On the 2 alpha C compilers I had access to, it was not possible to do
1664b x 64b -> 128b calculations (both long and the long long data types
17were 64 bits). So the hand assembler gives access to the 128 bit result and
18a 2 times speedup :-).
19
20There are 3 versions of assembler for the HP PA-RISC.
21
22pa-risc.s is the origional one which works fine and generated using gcc :-)
23
24pa-risc2W.s and pa-risc2.s are 64 and 32-bit PA-RISC 2.0 implementations
25by Chris Ruemmler from HP (with some help from the HP C compiler).
26
27</OBSOLETE>
diff --git a/src/lib/libssl/src/crypto/camellia/asm/BSD_license.txt b/src/lib/libssl/src/crypto/camellia/asm/BSD_license.txt
deleted file mode 100644
index 591975cb98..0000000000
--- a/src/lib/libssl/src/crypto/camellia/asm/BSD_license.txt
+++ /dev/null
@@ -1,24 +0,0 @@
1Camellia assembler implementation.
2
3Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions
7are met:
81. Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer as
10 the first lines of this file unmodified.
112. Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15THIS SOFTWARE IS PROVIDED BY Andy Polyakov ``AS IS'' AND ANY EXPRESS OR
16IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18IN NO EVENT SHALL NTT BE LIABLE FOR ANY DIRECT, INDIRECT,
19INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/lib/libssl/src/crypto/cast/asm/readme b/src/lib/libssl/src/crypto/cast/asm/readme
deleted file mode 100644
index fbcd76289e..0000000000
--- a/src/lib/libssl/src/crypto/cast/asm/readme
+++ /dev/null
@@ -1,7 +0,0 @@
1There is a ppro flag in cast-586 which turns on/off
2generation of pentium pro/II friendly code
3
4This flag makes the inner loop one cycle longer, but generates
5code that runs %30 faster on the pentium pro/II, while only %7 slower
6on the pentium. By default, this flag is on.
7
diff --git a/src/lib/libssl/src/crypto/des/asm/readme b/src/lib/libssl/src/crypto/des/asm/readme
deleted file mode 100644
index 1beafe253b..0000000000
--- a/src/lib/libssl/src/crypto/des/asm/readme
+++ /dev/null
@@ -1,131 +0,0 @@
1First up, let me say I don't like writing in assembler. It is not portable,
2dependant on the particular CPU architecture release and is generally a pig
3to debug and get right. Having said that, the x86 architecture is probably
4the most important for speed due to number of boxes and since
5it appears to be the worst architecture to to get
6good C compilers for. So due to this, I have lowered myself to do
7assembler for the inner DES routines in libdes :-).
8
9The file to implement in assembler is des_enc.c. Replace the following
104 functions
11des_encrypt1(DES_LONG data[2],des_key_schedule ks, int encrypt);
12des_encrypt2(DES_LONG data[2],des_key_schedule ks, int encrypt);
13des_encrypt3(DES_LONG data[2],des_key_schedule ks1,ks2,ks3);
14des_decrypt3(DES_LONG data[2],des_key_schedule ks1,ks2,ks3);
15
16They encrypt/decrypt the 64 bits held in 'data' using
17the 'ks' key schedules. The only difference between the 4 functions is that
18des_encrypt2() does not perform IP() or FP() on the data (this is an
19optimization for when doing triple DES and des_encrypt3() and des_decrypt3()
20perform triple des. The triple DES routines are in here because it does
21make a big difference to have them located near the des_encrypt2 function
22at link time..
23
24Now as we all know, there are lots of different operating systems running on
25x86 boxes, and unfortunately they normally try to make sure their assembler
26formating is not the same as the other peoples.
27The 4 main formats I know of are
28Microsoft Windows 95/Windows NT
29Elf Includes Linux and FreeBSD(?).
30a.out The older Linux.
31Solaris Same as Elf but different comments :-(.
32
33Now I was not overly keen to write 4 different copies of the same code,
34so I wrote a few perl routines to output the correct assembler, given
35a target assembler type. This code is ugly and is just a hack.
36The libraries are x86unix.pl and x86ms.pl.
37des586.pl, des686.pl and des-som[23].pl are the programs to actually
38generate the assembler.
39
40So to generate elf assembler
41perl des-som3.pl elf >dx86-elf.s
42For Windows 95/NT
43perl des-som2.pl win32 >win32.asm
44
45[ update 4 Jan 1996 ]
46I have added another way to do things.
47perl des-som3.pl cpp >dx86-cpp.s
48generates a file that will be included by dx86unix.cpp when it is compiled.
49To build for elf, a.out, solaris, bsdi etc,
50cc -E -DELF asm/dx86unix.cpp | as -o asm/dx86-elf.o
51cc -E -DSOL asm/dx86unix.cpp | as -o asm/dx86-sol.o
52cc -E -DOUT asm/dx86unix.cpp | as -o asm/dx86-out.o
53cc -E -DBSDI asm/dx86unix.cpp | as -o asm/dx86bsdi.o
54This was done to cut down the number of files in the distribution.
55
56Now the ugly part. I acquired my copy of Intels
57"Optimization's For Intel's 32-Bit Processors" and found a few interesting
58things. First, the aim of the exersize is to 'extract' one byte at a time
59from a word and do an array lookup. This involves getting the byte from
60the 4 locations in the word and moving it to a new word and doing the lookup.
61The most obvious way to do this is
62xor eax, eax # clear word
63movb al, cl # get low byte
64xor edi DWORD PTR 0x100+des_SP[eax] # xor in word
65movb al, ch # get next byte
66xor edi DWORD PTR 0x300+des_SP[eax] # xor in word
67shr ecx 16
68which seems ok. For the pentium, this system appears to be the best.
69One has to do instruction interleaving to keep both functional units
70operating, but it is basically very efficient.
71
72Now the crunch. When a full register is used after a partial write, eg.
73mov al, cl
74xor edi, DWORD PTR 0x100+des_SP[eax]
75386 - 1 cycle stall
76486 - 1 cycle stall
77586 - 0 cycle stall
78686 - at least 7 cycle stall (page 22 of the above mentioned document).
79
80So the technique that produces the best results on a pentium, according to
81the documentation, will produce hideous results on a pentium pro.
82
83To get around this, des686.pl will generate code that is not as fast on
84a pentium, should be very good on a pentium pro.
85mov eax, ecx # copy word
86shr ecx, 8 # line up next byte
87and eax, 0fch # mask byte
88xor edi DWORD PTR 0x100+des_SP[eax] # xor in array lookup
89mov eax, ecx # get word
90shr ecx 8 # line up next byte
91and eax, 0fch # mask byte
92xor edi DWORD PTR 0x300+des_SP[eax] # xor in array lookup
93
94Due to the execution units in the pentium, this actually works quite well.
95For a pentium pro it should be very good. This is the type of output
96Visual C++ generates.
97
98There is a third option. instead of using
99mov al, ch
100which is bad on the pentium pro, one may be able to use
101movzx eax, ch
102which may not incur the partial write penalty. On the pentium,
103this instruction takes 4 cycles so is not worth using but on the
104pentium pro it appears it may be worth while. I need access to one to
105experiment :-).
106
107eric (20 Oct 1996)
108
10922 Nov 1996 - I have asked people to run the 2 different version on pentium
110pros and it appears that the intel documentation is wrong. The
111mov al,bh is still faster on a pentium pro, so just use the des586.pl
112install des686.pl
113
1143 Dec 1996 - I added des_encrypt3/des_decrypt3 because I have moved these
115functions into des_enc.c because it does make a massive performance
116difference on some boxes to have the functions code located close to
117the des_encrypt2() function.
118
1199 Jan 1997 - des-som2.pl is now the correct perl script to use for
120pentiums. It contains an inner loop from
121Svend Olaf Mikkelsen <svolaf@inet.uni-c.dk> which does raw ecb DES calls at
122273,000 per second. He had a previous version at 250,000 and the best
123I was able to get was 203,000. The content has not changed, this is all
124due to instruction sequencing (and actual instructions choice) which is able
125to keep both functional units of the pentium going.
126We may have lost the ugly register usage restrictions when x86 went 32 bit
127but for the pentium it has been replaced by evil instruction ordering tricks.
128
12913 Jan 1997 - des-som3.pl, more optimizations from Svend Olaf.
130raw DES at 281,000 per second on a pentium 100.
131