diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/lib/libcrypto/bf/asm/bf-686.pl | 127 | ||||
| -rw-r--r-- | src/lib/libcrypto/bf/asm/readme | 10 | ||||
| -rw-r--r-- | src/lib/libcrypto/bn/asm/README | 27 | ||||
| -rw-r--r-- | src/lib/libcrypto/camellia/asm/BSD_license.txt | 24 | ||||
| -rw-r--r-- | src/lib/libcrypto/cast/asm/readme | 7 | ||||
| -rw-r--r-- | src/lib/libcrypto/des/asm/readme | 131 | ||||
| -rw-r--r-- | src/lib/libssl/src/crypto/bf/asm/bf-686.pl | 127 | ||||
| -rw-r--r-- | src/lib/libssl/src/crypto/bf/asm/readme | 10 | ||||
| -rw-r--r-- | src/lib/libssl/src/crypto/bn/asm/README | 27 | ||||
| -rw-r--r-- | src/lib/libssl/src/crypto/camellia/asm/BSD_license.txt | 24 | ||||
| -rw-r--r-- | src/lib/libssl/src/crypto/cast/asm/readme | 7 | ||||
| -rw-r--r-- | src/lib/libssl/src/crypto/des/asm/readme | 131 | 
12 files changed, 0 insertions, 652 deletions
| diff --git a/src/lib/libcrypto/bf/asm/bf-686.pl b/src/lib/libcrypto/bf/asm/bf-686.pl deleted file mode 100644 index 8e4c25f598..0000000000 --- a/src/lib/libcrypto/bf/asm/bf-686.pl +++ /dev/null | |||
| @@ -1,127 +0,0 @@ | |||
| 1 | #!/usr/local/bin/perl | ||
| 2 | |||
| 3 | push(@INC,"perlasm","../../perlasm"); | ||
| 4 | require "x86asm.pl"; | ||
| 5 | require "cbc.pl"; | ||
| 6 | |||
| 7 | &asm_init($ARGV[0],"bf-686.pl"); | ||
| 8 | |||
| 9 | $BF_ROUNDS=16; | ||
| 10 | $BF_OFF=($BF_ROUNDS+2)*4; | ||
| 11 | $L="ecx"; | ||
| 12 | $R="edx"; | ||
| 13 | $P="edi"; | ||
| 14 | $tot="esi"; | ||
| 15 | $tmp1="eax"; | ||
| 16 | $tmp2="ebx"; | ||
| 17 | $tmp3="ebp"; | ||
| 18 | |||
| 19 | &des_encrypt("BF_encrypt",1); | ||
| 20 | &des_encrypt("BF_decrypt",0); | ||
| 21 | &cbc("BF_cbc_encrypt","BF_encrypt","BF_decrypt",1,4,5,3,-1,-1); | ||
| 22 | |||
| 23 | &asm_finish(); | ||
| 24 | |||
| 25 | &file_end(); | ||
| 26 | |||
| 27 | sub des_encrypt | ||
| 28 | { | ||
| 29 | local($name,$enc)=@_; | ||
| 30 | |||
| 31 | &function_begin($name,""); | ||
| 32 | |||
| 33 | &comment(""); | ||
| 34 | &comment("Load the 2 words"); | ||
| 35 | &mov("eax",&wparam(0)); | ||
| 36 | &mov($L,&DWP(0,"eax","",0)); | ||
| 37 | &mov($R,&DWP(4,"eax","",0)); | ||
| 38 | |||
| 39 | &comment(""); | ||
| 40 | &comment("P pointer, s and enc flag"); | ||
| 41 | &mov($P,&wparam(1)); | ||
| 42 | |||
| 43 | &xor( $tmp1, $tmp1); | ||
| 44 | &xor( $tmp2, $tmp2); | ||
| 45 | |||
| 46 | # encrypting part | ||
| 47 | |||
| 48 | if ($enc) | ||
| 49 | { | ||
| 50 | &xor($L,&DWP(0,$P,"",0)); | ||
| 51 | for ($i=0; $i<$BF_ROUNDS; $i+=2) | ||
| 52 | { | ||
| 53 | &comment(""); | ||
| 54 | &comment("Round $i"); | ||
| 55 | &BF_ENCRYPT($i+1,$R,$L,$P,$tot,$tmp1,$tmp2,$tmp3); | ||
| 56 | |||
| 57 | &comment(""); | ||
| 58 | &comment("Round ".sprintf("%d",$i+1)); | ||
| 59 | &BF_ENCRYPT($i+2,$L,$R,$P,$tot,$tmp1,$tmp2,$tmp3); | ||
| 60 | } | ||
| 61 | &xor($R,&DWP(($BF_ROUNDS+1)*4,$P,"",0)); | ||
| 62 | |||
| 63 | &mov("eax",&wparam(0)); | ||
| 64 | &mov(&DWP(0,"eax","",0),$R); | ||
| 65 | &mov(&DWP(4,"eax","",0),$L); | ||
| 66 | &function_end_A($name); | ||
| 67 | } | ||
| 68 | else | ||
| 69 | { | ||
| 70 | &xor($L,&DWP(($BF_ROUNDS+1)*4,$P,"",0)); | ||
| 71 | for ($i=$BF_ROUNDS; $i>0; $i-=2) | ||
| 72 | { | ||
| 73 | &comment(""); | ||
| 74 | &comment("Round $i"); | ||
| 75 | &BF_ENCRYPT($i,$R,$L,$P,$tot,$tmp1,$tmp2,$tmp3); | ||
| 76 | &comment(""); | ||
| 77 | &comment("Round ".sprintf("%d",$i-1)); | ||
| 78 | &BF_ENCRYPT($i-1,$L,$R,$P,$tot,$tmp1,$tmp2,$tmp3); | ||
| 79 | } | ||
| 80 | &xor($R,&DWP(0,$P,"",0)); | ||
| 81 | |||
| 82 | &mov("eax",&wparam(0)); | ||
| 83 | &mov(&DWP(0,"eax","",0),$R); | ||
| 84 | &mov(&DWP(4,"eax","",0),$L); | ||
| 85 | &function_end_A($name); | ||
| 86 | } | ||
| 87 | |||
| 88 | &function_end_B($name); | ||
| 89 | } | ||
| 90 | |||
| 91 | sub BF_ENCRYPT | ||
| 92 | { | ||
| 93 | local($i,$L,$R,$P,$tot,$tmp1,$tmp2,$tmp3)=@_; | ||
| 94 | |||
| 95 | &rotr( $R, 16); | ||
| 96 | &mov( $tot, &DWP(&n2a($i*4),$P,"",0)); | ||
| 97 | |||
| 98 | &movb( &LB($tmp1), &HB($R)); | ||
| 99 | &movb( &LB($tmp2), &LB($R)); | ||
| 100 | |||
| 101 | &rotr( $R, 16); | ||
| 102 | &xor( $L, $tot); | ||
| 103 | |||
| 104 | &mov( $tot, &DWP(&n2a($BF_OFF+0x0000),$P,$tmp1,4)); | ||
| 105 | &mov( $tmp3, &DWP(&n2a($BF_OFF+0x0400),$P,$tmp2,4)); | ||
| 106 | |||
| 107 | &movb( &LB($tmp1), &HB($R)); | ||
| 108 | &movb( &LB($tmp2), &LB($R)); | ||
| 109 | |||
| 110 | &add( $tot, $tmp3); | ||
| 111 | &mov( $tmp1, &DWP(&n2a($BF_OFF+0x0800),$P,$tmp1,4)); # delay | ||
| 112 | |||
| 113 | &xor( $tot, $tmp1); | ||
| 114 | &mov( $tmp3, &DWP(&n2a($BF_OFF+0x0C00),$P,$tmp2,4)); | ||
| 115 | |||
| 116 | &add( $tot, $tmp3); | ||
| 117 | &xor( $tmp1, $tmp1); | ||
| 118 | |||
| 119 | &xor( $L, $tot); | ||
| 120 | # delay | ||
| 121 | } | ||
| 122 | |||
| 123 | sub n2a | ||
| 124 | { | ||
| 125 | sprintf("%d",$_[0]); | ||
| 126 | } | ||
| 127 | |||
| diff --git a/src/lib/libcrypto/bf/asm/readme b/src/lib/libcrypto/bf/asm/readme deleted file mode 100644 index 2385fa3812..0000000000 --- a/src/lib/libcrypto/bf/asm/readme +++ /dev/null | |||
| @@ -1,10 +0,0 @@ | |||
| 1 | There are blowfish assembler generation scripts. | ||
| 2 | bf-586.pl version is for the pentium and | ||
| 3 | bf-686.pl is my original version, which is faster on the pentium pro. | ||
| 4 | |||
| 5 | When using bf-586.pl, the pentium pro/II is 8% slower than when using | ||
| 6 | bf-686.pl. When using bf-686.pl, the pentium is 16% slower | ||
| 7 | than with bf-586.pl. | ||
| 8 | |||
| 9 | So the default is bf-586.pl | ||
| 10 | |||
| diff --git a/src/lib/libcrypto/bn/asm/README b/src/lib/libcrypto/bn/asm/README deleted file mode 100644 index 323d1a06b9..0000000000 --- a/src/lib/libcrypto/bn/asm/README +++ /dev/null | |||
| @@ -1,27 +0,0 @@ | |||
| 1 | <OBSOLETE> | ||
| 2 | |||
| 3 | All assembler files in this directory are just versions of the file | ||
| 4 | crypto/bn/bn_asm.c. | ||
| 5 | |||
| 6 | Quite a few of these files are just the assembler output from gcc since on | ||
| 7 | quite a few machines they are 2 times faster than the system compiler. | ||
| 8 | |||
| 9 | For the x86, I have hand written assembler because of the bad job all | ||
| 10 | compilers seem to do on it. This normally gives a 2 time speed up in the RSA | ||
| 11 | routines. | ||
| 12 | |||
| 13 | For the DEC alpha, I also hand wrote the assembler (except the division which | ||
| 14 | is just the output from the C compiler pasted on the end of the file). | ||
| 15 | On the 2 alpha C compilers I had access to, it was not possible to do | ||
| 16 | 64b x 64b -> 128b calculations (both long and the long long data types | ||
| 17 | were 64 bits). So the hand assembler gives access to the 128 bit result and | ||
| 18 | a 2 times speedup :-). | ||
| 19 | |||
| 20 | There are 3 versions of assembler for the HP PA-RISC. | ||
| 21 | |||
| 22 | pa-risc.s is the original one, which works fine and was generated using gcc :-) | ||
| 23 | |||
| 24 | pa-risc2W.s and pa-risc2.s are 64 and 32-bit PA-RISC 2.0 implementations | ||
| 25 | by Chris Ruemmler from HP (with some help from the HP C compiler). | ||
| 26 | |||
| 27 | </OBSOLETE> | ||
| diff --git a/src/lib/libcrypto/camellia/asm/BSD_license.txt b/src/lib/libcrypto/camellia/asm/BSD_license.txt deleted file mode 100644 index 591975cb98..0000000000 --- a/src/lib/libcrypto/camellia/asm/BSD_license.txt +++ /dev/null | |||
| @@ -1,24 +0,0 @@ | |||
| 1 | Camellia assembler implementation. | ||
| 2 | |||
| 3 | Copyright (c) 2008 Andy Polyakov <appro@openssl.org> | ||
| 4 | |||
| 5 | Redistribution and use in source and binary forms, with or without | ||
| 6 | modification, are permitted provided that the following conditions | ||
| 7 | are met: | ||
| 8 | 1. Redistributions of source code must retain the above copyright | ||
| 9 | notice, this list of conditions and the following disclaimer as | ||
| 10 | the first lines of this file unmodified. | ||
| 11 | 2. Redistributions in binary form must reproduce the above copyright | ||
| 12 | notice, this list of conditions and the following disclaimer in the | ||
| 13 | documentation and/or other materials provided with the distribution. | ||
| 14 | |||
| 15 | THIS SOFTWARE IS PROVIDED BY Andy Polyakov ``AS IS'' AND ANY EXPRESS OR | ||
| 16 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES | ||
| 17 | OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. | ||
| 18 | IN NO EVENT SHALL NTT BE LIABLE FOR ANY DIRECT, INDIRECT, | ||
| 19 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 20 | NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||
| 21 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||
| 22 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
| 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF | ||
| 24 | THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| diff --git a/src/lib/libcrypto/cast/asm/readme b/src/lib/libcrypto/cast/asm/readme deleted file mode 100644 index fbcd76289e..0000000000 --- a/src/lib/libcrypto/cast/asm/readme +++ /dev/null | |||
| @@ -1,7 +0,0 @@ | |||
| 1 | There is a ppro flag in cast-586 which turns on/off | ||
| 2 | generation of pentium pro/II friendly code | ||
| 3 | |||
| 4 | This flag makes the inner loop one cycle longer, but generates | ||
| 5 | code that runs 30% faster on the pentium pro/II, while only 7% slower | ||
| 6 | on the pentium. By default, this flag is on. | ||
| 7 | |||
| diff --git a/src/lib/libcrypto/des/asm/readme b/src/lib/libcrypto/des/asm/readme deleted file mode 100644 index 1beafe253b..0000000000 --- a/src/lib/libcrypto/des/asm/readme +++ /dev/null | |||
| @@ -1,131 +0,0 @@ | |||
| 1 | First up, let me say I don't like writing in assembler. It is not portable, | ||
| 2 | dependent on the particular CPU architecture release and is generally a pig | ||
| 3 | to debug and get right. Having said that, the x86 architecture is probably | ||
| 4 | the most important for speed due to the number of boxes and since | ||
| 5 | it appears to be the worst architecture to get | ||
| 6 | good C compilers for. So due to this, I have lowered myself to do | ||
| 7 | assembler for the inner DES routines in libdes :-). | ||
| 8 | |||
| 9 | The file to implement in assembler is des_enc.c. Replace the following | ||
| 10 | 4 functions | ||
| 11 | des_encrypt1(DES_LONG data[2],des_key_schedule ks, int encrypt); | ||
| 12 | des_encrypt2(DES_LONG data[2],des_key_schedule ks, int encrypt); | ||
| 13 | des_encrypt3(DES_LONG data[2],des_key_schedule ks1,ks2,ks3); | ||
| 14 | des_decrypt3(DES_LONG data[2],des_key_schedule ks1,ks2,ks3); | ||
| 15 | |||
| 16 | They encrypt/decrypt the 64 bits held in 'data' using | ||
| 17 | the 'ks' key schedules. The only difference between the 4 functions is that | ||
| 18 | des_encrypt2() does not perform IP() or FP() on the data (this is an | ||
| 19 | optimization for when doing triple DES) and des_encrypt3() and des_decrypt3() | ||
| 20 | perform triple DES. The triple DES routines are in here because it does | ||
| 21 | make a big difference to have them located near the des_encrypt2 function | ||
| 22 | at link time. | ||
| 23 | |||
| 24 | Now as we all know, there are lots of different operating systems running on | ||
| 25 | x86 boxes, and unfortunately they normally try to make sure their assembler | ||
| 26 | formatting is not the same as other people's. | ||
| 27 | The 4 main formats I know of are | ||
| 28 | Microsoft Windows 95/Windows NT | ||
| 29 | Elf Includes Linux and FreeBSD(?). | ||
| 30 | a.out The older Linux. | ||
| 31 | Solaris Same as Elf but different comments :-(. | ||
| 32 | |||
| 33 | Now I was not overly keen to write 4 different copies of the same code, | ||
| 34 | so I wrote a few perl routines to output the correct assembler, given | ||
| 35 | a target assembler type. This code is ugly and is just a hack. | ||
| 36 | The libraries are x86unix.pl and x86ms.pl. | ||
| 37 | des586.pl, des686.pl and des-som[23].pl are the programs to actually | ||
| 38 | generate the assembler. | ||
| 39 | |||
| 40 | So to generate elf assembler | ||
| 41 | perl des-som3.pl elf >dx86-elf.s | ||
| 42 | For Windows 95/NT | ||
| 43 | perl des-som2.pl win32 >win32.asm | ||
| 44 | |||
| 45 | [ update 4 Jan 1996 ] | ||
| 46 | I have added another way to do things. | ||
| 47 | perl des-som3.pl cpp >dx86-cpp.s | ||
| 48 | generates a file that will be included by dx86unix.cpp when it is compiled. | ||
| 49 | To build for elf, a.out, solaris, bsdi etc, | ||
| 50 | cc -E -DELF asm/dx86unix.cpp | as -o asm/dx86-elf.o | ||
| 51 | cc -E -DSOL asm/dx86unix.cpp | as -o asm/dx86-sol.o | ||
| 52 | cc -E -DOUT asm/dx86unix.cpp | as -o asm/dx86-out.o | ||
| 53 | cc -E -DBSDI asm/dx86unix.cpp | as -o asm/dx86bsdi.o | ||
| 54 | This was done to cut down the number of files in the distribution. | ||
| 55 | |||
| 56 | Now the ugly part. I acquired my copy of Intel's | ||
| 57 | "Optimization's For Intel's 32-Bit Processors" and found a few interesting | ||
| 58 | things. First, the aim of the exercise is to 'extract' one byte at a time | ||
| 59 | from a word and do an array lookup. This involves getting the byte from | ||
| 60 | the 4 locations in the word and moving it to a new word and doing the lookup. | ||
| 61 | The most obvious way to do this is | ||
| 62 | xor eax, eax # clear word | ||
| 63 | movb al, cl # get low byte | ||
| 64 | xor edi DWORD PTR 0x100+des_SP[eax] # xor in word | ||
| 65 | movb al, ch # get next byte | ||
| 66 | xor edi DWORD PTR 0x300+des_SP[eax] # xor in word | ||
| 67 | shr ecx 16 | ||
| 68 | which seems ok. For the pentium, this system appears to be the best. | ||
| 69 | One has to do instruction interleaving to keep both functional units | ||
| 70 | operating, but it is basically very efficient. | ||
| 71 | |||
| 72 | Now the crunch. When a full register is used after a partial write, eg. | ||
| 73 | mov al, cl | ||
| 74 | xor edi, DWORD PTR 0x100+des_SP[eax] | ||
| 75 | 386 - 1 cycle stall | ||
| 76 | 486 - 1 cycle stall | ||
| 77 | 586 - 0 cycle stall | ||
| 78 | 686 - at least 7 cycle stall (page 22 of the above mentioned document). | ||
| 79 | |||
| 80 | So the technique that produces the best results on a pentium, according to | ||
| 81 | the documentation, will produce hideous results on a pentium pro. | ||
| 82 | |||
| 83 | To get around this, des686.pl will generate code that is not as fast on | ||
| 84 | a pentium, but should be very good on a pentium pro. | ||
| 85 | mov eax, ecx # copy word | ||
| 86 | shr ecx, 8 # line up next byte | ||
| 87 | and eax, 0fch # mask byte | ||
| 88 | xor edi DWORD PTR 0x100+des_SP[eax] # xor in array lookup | ||
| 89 | mov eax, ecx # get word | ||
| 90 | shr ecx 8 # line up next byte | ||
| 91 | and eax, 0fch # mask byte | ||
| 92 | xor edi DWORD PTR 0x300+des_SP[eax] # xor in array lookup | ||
| 93 | |||
| 94 | Due to the execution units in the pentium, this actually works quite well. | ||
| 95 | For a pentium pro it should be very good. This is the type of output | ||
| 96 | Visual C++ generates. | ||
| 97 | |||
| 98 | There is a third option. instead of using | ||
| 99 | mov al, ch | ||
| 100 | which is bad on the pentium pro, one may be able to use | ||
| 101 | movzx eax, ch | ||
| 102 | which may not incur the partial write penalty. On the pentium, | ||
| 103 | this instruction takes 4 cycles so is not worth using but on the | ||
| 104 | pentium pro it appears it may be worth while. I need access to one to | ||
| 105 | experiment :-). | ||
| 106 | |||
| 107 | eric (20 Oct 1996) | ||
| 108 | |||
| 109 | 22 Nov 1996 - I have asked people to run the 2 different versions on pentium | ||
| 110 | pros and it appears that the intel documentation is wrong. The | ||
| 111 | mov al,bh is still faster on a pentium pro, so just use the des586.pl | ||
| 112 | install des686.pl | ||
| 113 | |||
| 114 | 3 Dec 1996 - I added des_encrypt3/des_decrypt3 because I have moved these | ||
| 115 | functions into des_enc.c because it does make a massive performance | ||
| 116 | difference on some boxes to have the functions code located close to | ||
| 117 | the des_encrypt2() function. | ||
| 118 | |||
| 119 | 9 Jan 1997 - des-som2.pl is now the correct perl script to use for | ||
| 120 | pentiums. It contains an inner loop from | ||
| 121 | Svend Olaf Mikkelsen <svolaf@inet.uni-c.dk> which does raw ecb DES calls at | ||
| 122 | 273,000 per second. He had a previous version at 250,000 and the best | ||
| 123 | I was able to get was 203,000. The content has not changed, this is all | ||
| 124 | due to instruction sequencing (and actual instructions choice) which is able | ||
| 125 | to keep both functional units of the pentium going. | ||
| 126 | We may have lost the ugly register usage restrictions when x86 went 32 bit | ||
| 127 | but for the pentium it has been replaced by evil instruction ordering tricks. | ||
| 128 | |||
| 129 | 13 Jan 1997 - des-som3.pl, more optimizations from Svend Olaf. | ||
| 130 | raw DES at 281,000 per second on a pentium 100. | ||
| 131 | |||
| diff --git a/src/lib/libssl/src/crypto/bf/asm/bf-686.pl b/src/lib/libssl/src/crypto/bf/asm/bf-686.pl deleted file mode 100644 index 8e4c25f598..0000000000 --- a/src/lib/libssl/src/crypto/bf/asm/bf-686.pl +++ /dev/null | |||
| @@ -1,127 +0,0 @@ | |||
| 1 | #!/usr/local/bin/perl | ||
| 2 | |||
| 3 | push(@INC,"perlasm","../../perlasm"); | ||
| 4 | require "x86asm.pl"; | ||
| 5 | require "cbc.pl"; | ||
| 6 | |||
| 7 | &asm_init($ARGV[0],"bf-686.pl"); | ||
| 8 | |||
| 9 | $BF_ROUNDS=16; | ||
| 10 | $BF_OFF=($BF_ROUNDS+2)*4; | ||
| 11 | $L="ecx"; | ||
| 12 | $R="edx"; | ||
| 13 | $P="edi"; | ||
| 14 | $tot="esi"; | ||
| 15 | $tmp1="eax"; | ||
| 16 | $tmp2="ebx"; | ||
| 17 | $tmp3="ebp"; | ||
| 18 | |||
| 19 | &des_encrypt("BF_encrypt",1); | ||
| 20 | &des_encrypt("BF_decrypt",0); | ||
| 21 | &cbc("BF_cbc_encrypt","BF_encrypt","BF_decrypt",1,4,5,3,-1,-1); | ||
| 22 | |||
| 23 | &asm_finish(); | ||
| 24 | |||
| 25 | &file_end(); | ||
| 26 | |||
| 27 | sub des_encrypt | ||
| 28 | { | ||
| 29 | local($name,$enc)=@_; | ||
| 30 | |||
| 31 | &function_begin($name,""); | ||
| 32 | |||
| 33 | &comment(""); | ||
| 34 | &comment("Load the 2 words"); | ||
| 35 | &mov("eax",&wparam(0)); | ||
| 36 | &mov($L,&DWP(0,"eax","",0)); | ||
| 37 | &mov($R,&DWP(4,"eax","",0)); | ||
| 38 | |||
| 39 | &comment(""); | ||
| 40 | &comment("P pointer, s and enc flag"); | ||
| 41 | &mov($P,&wparam(1)); | ||
| 42 | |||
| 43 | &xor( $tmp1, $tmp1); | ||
| 44 | &xor( $tmp2, $tmp2); | ||
| 45 | |||
| 46 | # encrypting part | ||
| 47 | |||
| 48 | if ($enc) | ||
| 49 | { | ||
| 50 | &xor($L,&DWP(0,$P,"",0)); | ||
| 51 | for ($i=0; $i<$BF_ROUNDS; $i+=2) | ||
| 52 | { | ||
| 53 | &comment(""); | ||
| 54 | &comment("Round $i"); | ||
| 55 | &BF_ENCRYPT($i+1,$R,$L,$P,$tot,$tmp1,$tmp2,$tmp3); | ||
| 56 | |||
| 57 | &comment(""); | ||
| 58 | &comment("Round ".sprintf("%d",$i+1)); | ||
| 59 | &BF_ENCRYPT($i+2,$L,$R,$P,$tot,$tmp1,$tmp2,$tmp3); | ||
| 60 | } | ||
| 61 | &xor($R,&DWP(($BF_ROUNDS+1)*4,$P,"",0)); | ||
| 62 | |||
| 63 | &mov("eax",&wparam(0)); | ||
| 64 | &mov(&DWP(0,"eax","",0),$R); | ||
| 65 | &mov(&DWP(4,"eax","",0),$L); | ||
| 66 | &function_end_A($name); | ||
| 67 | } | ||
| 68 | else | ||
| 69 | { | ||
| 70 | &xor($L,&DWP(($BF_ROUNDS+1)*4,$P,"",0)); | ||
| 71 | for ($i=$BF_ROUNDS; $i>0; $i-=2) | ||
| 72 | { | ||
| 73 | &comment(""); | ||
| 74 | &comment("Round $i"); | ||
| 75 | &BF_ENCRYPT($i,$R,$L,$P,$tot,$tmp1,$tmp2,$tmp3); | ||
| 76 | &comment(""); | ||
| 77 | &comment("Round ".sprintf("%d",$i-1)); | ||
| 78 | &BF_ENCRYPT($i-1,$L,$R,$P,$tot,$tmp1,$tmp2,$tmp3); | ||
| 79 | } | ||
| 80 | &xor($R,&DWP(0,$P,"",0)); | ||
| 81 | |||
| 82 | &mov("eax",&wparam(0)); | ||
| 83 | &mov(&DWP(0,"eax","",0),$R); | ||
| 84 | &mov(&DWP(4,"eax","",0),$L); | ||
| 85 | &function_end_A($name); | ||
| 86 | } | ||
| 87 | |||
| 88 | &function_end_B($name); | ||
| 89 | } | ||
| 90 | |||
| 91 | sub BF_ENCRYPT | ||
| 92 | { | ||
| 93 | local($i,$L,$R,$P,$tot,$tmp1,$tmp2,$tmp3)=@_; | ||
| 94 | |||
| 95 | &rotr( $R, 16); | ||
| 96 | &mov( $tot, &DWP(&n2a($i*4),$P,"",0)); | ||
| 97 | |||
| 98 | &movb( &LB($tmp1), &HB($R)); | ||
| 99 | &movb( &LB($tmp2), &LB($R)); | ||
| 100 | |||
| 101 | &rotr( $R, 16); | ||
| 102 | &xor( $L, $tot); | ||
| 103 | |||
| 104 | &mov( $tot, &DWP(&n2a($BF_OFF+0x0000),$P,$tmp1,4)); | ||
| 105 | &mov( $tmp3, &DWP(&n2a($BF_OFF+0x0400),$P,$tmp2,4)); | ||
| 106 | |||
| 107 | &movb( &LB($tmp1), &HB($R)); | ||
| 108 | &movb( &LB($tmp2), &LB($R)); | ||
| 109 | |||
| 110 | &add( $tot, $tmp3); | ||
| 111 | &mov( $tmp1, &DWP(&n2a($BF_OFF+0x0800),$P,$tmp1,4)); # delay | ||
| 112 | |||
| 113 | &xor( $tot, $tmp1); | ||
| 114 | &mov( $tmp3, &DWP(&n2a($BF_OFF+0x0C00),$P,$tmp2,4)); | ||
| 115 | |||
| 116 | &add( $tot, $tmp3); | ||
| 117 | &xor( $tmp1, $tmp1); | ||
| 118 | |||
| 119 | &xor( $L, $tot); | ||
| 120 | # delay | ||
| 121 | } | ||
| 122 | |||
| 123 | sub n2a | ||
| 124 | { | ||
| 125 | sprintf("%d",$_[0]); | ||
| 126 | } | ||
| 127 | |||
| diff --git a/src/lib/libssl/src/crypto/bf/asm/readme b/src/lib/libssl/src/crypto/bf/asm/readme deleted file mode 100644 index 2385fa3812..0000000000 --- a/src/lib/libssl/src/crypto/bf/asm/readme +++ /dev/null | |||
| @@ -1,10 +0,0 @@ | |||
| 1 | There are blowfish assembler generation scripts. | ||
| 2 | bf-586.pl version is for the pentium and | ||
| 3 | bf-686.pl is my original version, which is faster on the pentium pro. | ||
| 4 | |||
| 5 | When using bf-586.pl, the pentium pro/II is 8% slower than when using | ||
| 6 | bf-686.pl. When using bf-686.pl, the pentium is 16% slower | ||
| 7 | than with bf-586.pl. | ||
| 8 | |||
| 9 | So the default is bf-586.pl | ||
| 10 | |||
| diff --git a/src/lib/libssl/src/crypto/bn/asm/README b/src/lib/libssl/src/crypto/bn/asm/README deleted file mode 100644 index 323d1a06b9..0000000000 --- a/src/lib/libssl/src/crypto/bn/asm/README +++ /dev/null | |||
| @@ -1,27 +0,0 @@ | |||
| 1 | <OBSOLETE> | ||
| 2 | |||
| 3 | All assembler files in this directory are just versions of the file | ||
| 4 | crypto/bn/bn_asm.c. | ||
| 5 | |||
| 6 | Quite a few of these files are just the assembler output from gcc since on | ||
| 7 | quite a few machines they are 2 times faster than the system compiler. | ||
| 8 | |||
| 9 | For the x86, I have hand written assembler because of the bad job all | ||
| 10 | compilers seem to do on it. This normally gives a 2 time speed up in the RSA | ||
| 11 | routines. | ||
| 12 | |||
| 13 | For the DEC alpha, I also hand wrote the assembler (except the division which | ||
| 14 | is just the output from the C compiler pasted on the end of the file). | ||
| 15 | On the 2 alpha C compilers I had access to, it was not possible to do | ||
| 16 | 64b x 64b -> 128b calculations (both long and the long long data types | ||
| 17 | were 64 bits). So the hand assembler gives access to the 128 bit result and | ||
| 18 | a 2 times speedup :-). | ||
| 19 | |||
| 20 | There are 3 versions of assembler for the HP PA-RISC. | ||
| 21 | |||
| 22 | pa-risc.s is the original one, which works fine and was generated using gcc :-) | ||
| 23 | |||
| 24 | pa-risc2W.s and pa-risc2.s are 64 and 32-bit PA-RISC 2.0 implementations | ||
| 25 | by Chris Ruemmler from HP (with some help from the HP C compiler). | ||
| 26 | |||
| 27 | </OBSOLETE> | ||
| diff --git a/src/lib/libssl/src/crypto/camellia/asm/BSD_license.txt b/src/lib/libssl/src/crypto/camellia/asm/BSD_license.txt deleted file mode 100644 index 591975cb98..0000000000 --- a/src/lib/libssl/src/crypto/camellia/asm/BSD_license.txt +++ /dev/null | |||
| @@ -1,24 +0,0 @@ | |||
| 1 | Camellia assembler implementation. | ||
| 2 | |||
| 3 | Copyright (c) 2008 Andy Polyakov <appro@openssl.org> | ||
| 4 | |||
| 5 | Redistribution and use in source and binary forms, with or without | ||
| 6 | modification, are permitted provided that the following conditions | ||
| 7 | are met: | ||
| 8 | 1. Redistributions of source code must retain the above copyright | ||
| 9 | notice, this list of conditions and the following disclaimer as | ||
| 10 | the first lines of this file unmodified. | ||
| 11 | 2. Redistributions in binary form must reproduce the above copyright | ||
| 12 | notice, this list of conditions and the following disclaimer in the | ||
| 13 | documentation and/or other materials provided with the distribution. | ||
| 14 | |||
| 15 | THIS SOFTWARE IS PROVIDED BY Andy Polyakov ``AS IS'' AND ANY EXPRESS OR | ||
| 16 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES | ||
| 17 | OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. | ||
| 18 | IN NO EVENT SHALL NTT BE LIABLE FOR ANY DIRECT, INDIRECT, | ||
| 19 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
| 20 | NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||
| 21 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||
| 22 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
| 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF | ||
| 24 | THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| diff --git a/src/lib/libssl/src/crypto/cast/asm/readme b/src/lib/libssl/src/crypto/cast/asm/readme deleted file mode 100644 index fbcd76289e..0000000000 --- a/src/lib/libssl/src/crypto/cast/asm/readme +++ /dev/null | |||
| @@ -1,7 +0,0 @@ | |||
| 1 | There is a ppro flag in cast-586 which turns on/off | ||
| 2 | generation of pentium pro/II friendly code | ||
| 3 | |||
| 4 | This flag makes the inner loop one cycle longer, but generates | ||
| 5 | code that runs 30% faster on the pentium pro/II, while only 7% slower | ||
| 6 | on the pentium. By default, this flag is on. | ||
| 7 | |||
| diff --git a/src/lib/libssl/src/crypto/des/asm/readme b/src/lib/libssl/src/crypto/des/asm/readme deleted file mode 100644 index 1beafe253b..0000000000 --- a/src/lib/libssl/src/crypto/des/asm/readme +++ /dev/null | |||
| @@ -1,131 +0,0 @@ | |||
| 1 | First up, let me say I don't like writing in assembler. It is not portable, | ||
| 2 | dependent on the particular CPU architecture release and is generally a pig | ||
| 3 | to debug and get right. Having said that, the x86 architecture is probably | ||
| 4 | the most important for speed due to the number of boxes and since | ||
| 5 | it appears to be the worst architecture to get | ||
| 6 | good C compilers for. So due to this, I have lowered myself to do | ||
| 7 | assembler for the inner DES routines in libdes :-). | ||
| 8 | |||
| 9 | The file to implement in assembler is des_enc.c. Replace the following | ||
| 10 | 4 functions | ||
| 11 | des_encrypt1(DES_LONG data[2],des_key_schedule ks, int encrypt); | ||
| 12 | des_encrypt2(DES_LONG data[2],des_key_schedule ks, int encrypt); | ||
| 13 | des_encrypt3(DES_LONG data[2],des_key_schedule ks1,ks2,ks3); | ||
| 14 | des_decrypt3(DES_LONG data[2],des_key_schedule ks1,ks2,ks3); | ||
| 15 | |||
| 16 | They encrypt/decrypt the 64 bits held in 'data' using | ||
| 17 | the 'ks' key schedules. The only difference between the 4 functions is that | ||
| 18 | des_encrypt2() does not perform IP() or FP() on the data (this is an | ||
| 19 | optimization for when doing triple DES) and des_encrypt3() and des_decrypt3() | ||
| 20 | perform triple DES. The triple DES routines are in here because it does | ||
| 21 | make a big difference to have them located near the des_encrypt2 function | ||
| 22 | at link time. | ||
| 23 | |||
| 24 | Now as we all know, there are lots of different operating systems running on | ||
| 25 | x86 boxes, and unfortunately they normally try to make sure their assembler | ||
| 26 | formatting is not the same as other people's. | ||
| 27 | The 4 main formats I know of are | ||
| 28 | Microsoft Windows 95/Windows NT | ||
| 29 | Elf Includes Linux and FreeBSD(?). | ||
| 30 | a.out The older Linux. | ||
| 31 | Solaris Same as Elf but different comments :-(. | ||
| 32 | |||
| 33 | Now I was not overly keen to write 4 different copies of the same code, | ||
| 34 | so I wrote a few perl routines to output the correct assembler, given | ||
| 35 | a target assembler type. This code is ugly and is just a hack. | ||
| 36 | The libraries are x86unix.pl and x86ms.pl. | ||
| 37 | des586.pl, des686.pl and des-som[23].pl are the programs to actually | ||
| 38 | generate the assembler. | ||
| 39 | |||
| 40 | So to generate elf assembler | ||
| 41 | perl des-som3.pl elf >dx86-elf.s | ||
| 42 | For Windows 95/NT | ||
| 43 | perl des-som2.pl win32 >win32.asm | ||
| 44 | |||
| 45 | [ update 4 Jan 1996 ] | ||
| 46 | I have added another way to do things. | ||
| 47 | perl des-som3.pl cpp >dx86-cpp.s | ||
| 48 | generates a file that will be included by dx86unix.cpp when it is compiled. | ||
| 49 | To build for elf, a.out, solaris, bsdi etc, | ||
| 50 | cc -E -DELF asm/dx86unix.cpp | as -o asm/dx86-elf.o | ||
| 51 | cc -E -DSOL asm/dx86unix.cpp | as -o asm/dx86-sol.o | ||
| 52 | cc -E -DOUT asm/dx86unix.cpp | as -o asm/dx86-out.o | ||
| 53 | cc -E -DBSDI asm/dx86unix.cpp | as -o asm/dx86bsdi.o | ||
| 54 | This was done to cut down the number of files in the distribution. | ||
| 55 | |||
| 56 | Now the ugly part. I acquired my copy of Intel's | ||
| 57 | "Optimization's For Intel's 32-Bit Processors" and found a few interesting | ||
| 58 | things. First, the aim of the exercise is to 'extract' one byte at a time | ||
| 59 | from a word and do an array lookup. This involves getting the byte from | ||
| 60 | the 4 locations in the word and moving it to a new word and doing the lookup. | ||
| 61 | The most obvious way to do this is | ||
| 62 | xor eax, eax # clear word | ||
| 63 | movb al, cl # get low byte | ||
| 64 | xor edi DWORD PTR 0x100+des_SP[eax] # xor in word | ||
| 65 | movb al, ch # get next byte | ||
| 66 | xor edi DWORD PTR 0x300+des_SP[eax] # xor in word | ||
| 67 | shr ecx 16 | ||
| 68 | which seems ok. For the pentium, this system appears to be the best. | ||
| 69 | One has to do instruction interleaving to keep both functional units | ||
| 70 | operating, but it is basically very efficient. | ||
| 71 | |||
| 72 | Now the crunch. When a full register is used after a partial write, eg. | ||
| 73 | mov al, cl | ||
| 74 | xor edi, DWORD PTR 0x100+des_SP[eax] | ||
| 75 | 386 - 1 cycle stall | ||
| 76 | 486 - 1 cycle stall | ||
| 77 | 586 - 0 cycle stall | ||
| 78 | 686 - at least 7 cycle stall (page 22 of the above mentioned document). | ||
| 79 | |||
| 80 | So the technique that produces the best results on a pentium, according to | ||
| 81 | the documentation, will produce hideous results on a pentium pro. | ||
| 82 | |||
| 83 | To get around this, des686.pl will generate code that is not as fast on | ||
| 84 | a pentium, but should be very good on a pentium pro. | ||
| 85 | mov eax, ecx # copy word | ||
| 86 | shr ecx, 8 # line up next byte | ||
| 87 | and eax, 0fch # mask byte | ||
| 88 | xor edi DWORD PTR 0x100+des_SP[eax] # xor in array lookup | ||
| 89 | mov eax, ecx # get word | ||
| 90 | shr ecx 8 # line up next byte | ||
| 91 | and eax, 0fch # mask byte | ||
| 92 | xor edi DWORD PTR 0x300+des_SP[eax] # xor in array lookup | ||
| 93 | |||
| 94 | Due to the execution units in the pentium, this actually works quite well. | ||
| 95 | For a pentium pro it should be very good. This is the type of output | ||
| 96 | Visual C++ generates. | ||
| 97 | |||
| 98 | There is a third option. instead of using | ||
| 99 | mov al, ch | ||
| 100 | which is bad on the pentium pro, one may be able to use | ||
| 101 | movzx eax, ch | ||
| 102 | which may not incur the partial write penalty. On the pentium, | ||
| 103 | this instruction takes 4 cycles so is not worth using but on the | ||
| 104 | pentium pro it appears it may be worth while. I need access to one to | ||
| 105 | experiment :-). | ||
| 106 | |||
| 107 | eric (20 Oct 1996) | ||
| 108 | |||
| 109 | 22 Nov 1996 - I have asked people to run the 2 different versions on pentium | ||
| 110 | pros and it appears that the intel documentation is wrong. The | ||
| 111 | mov al,bh is still faster on a pentium pro, so just use the des586.pl | ||
| 112 | install des686.pl | ||
| 113 | |||
| 114 | 3 Dec 1996 - I added des_encrypt3/des_decrypt3 because I have moved these | ||
| 115 | functions into des_enc.c because it does make a massive performance | ||
| 116 | difference on some boxes to have the functions code located close to | ||
| 117 | the des_encrypt2() function. | ||
| 118 | |||
| 119 | 9 Jan 1997 - des-som2.pl is now the correct perl script to use for | ||
| 120 | pentiums. It contains an inner loop from | ||
| 121 | Svend Olaf Mikkelsen <svolaf@inet.uni-c.dk> which does raw ecb DES calls at | ||
| 122 | 273,000 per second. He had a previous version at 250,000 and the best | ||
| 123 | I was able to get was 203,000. The content has not changed, this is all | ||
| 124 | due to instruction sequencing (and actual instructions choice) which is able | ||
| 125 | to keep both functional units of the pentium going. | ||
| 126 | We may have lost the ugly register usage restrictions when x86 went 32 bit | ||
| 127 | but for the pentium it has been replaced by evil instruction ordering tricks. | ||
| 128 | |||
| 129 | 13 Jan 1997 - des-som3.pl, more optimizations from Svend Olaf. | ||
| 130 | raw DES at 281,000 per second on a pentium 100. | ||
| 131 | |||
