diff options
author | djm <> | 2006-06-27 05:05:40 +0000 |
---|---|---|
committer | djm <> | 2006-06-27 05:05:40 +0000 |
commit | 4f828b924f54507141fb95ebe49dfcd261945e85 (patch) | |
tree | f6b05913ca6b34db73a343a7bb36d57ff4105356 /src/lib/libcrypto/rc4 | |
parent | 588543a0946f1dbf0f1dd5135f8f6447486dc183 (diff) | |
download | openbsd-4f828b924f54507141fb95ebe49dfcd261945e85.tar.gz openbsd-4f828b924f54507141fb95ebe49dfcd261945e85.tar.bz2 openbsd-4f828b924f54507141fb95ebe49dfcd261945e85.zip |
import of openssl-0.9.7j
Diffstat (limited to 'src/lib/libcrypto/rc4')
-rw-r--r-- | src/lib/libcrypto/rc4/Makefile | 13 | ||||
-rw-r--r-- | src/lib/libcrypto/rc4/asm/rc4-ia64.S | 65 |
2 files changed, 43 insertions, 35 deletions
diff --git a/src/lib/libcrypto/rc4/Makefile b/src/lib/libcrypto/rc4/Makefile index 64e06924f4..20d078ec87 100644 --- a/src/lib/libcrypto/rc4/Makefile +++ b/src/lib/libcrypto/rc4/Makefile | |||
@@ -1,5 +1,5 @@ | |||
1 | # | 1 | # |
2 | # SSLeay/crypto/rc4/Makefile | 2 | # OpenSSL/crypto/rc4/Makefile |
3 | # | 3 | # |
4 | 4 | ||
5 | DIR= rc4 | 5 | DIR= rc4 |
@@ -66,10 +66,14 @@ asm/rx86bsdi.o: asm/rx86unix.cpp | |||
66 | asm/rx86unix.cpp: asm/rc4-586.pl ../perlasm/x86asm.pl | 66 | asm/rx86unix.cpp: asm/rc4-586.pl ../perlasm/x86asm.pl |
67 | (cd asm; $(PERL) rc4-586.pl cpp >rx86unix.cpp) | 67 | (cd asm; $(PERL) rc4-586.pl cpp >rx86unix.cpp) |
68 | 68 | ||
69 | asm/rc4-amd64.s: asm/rc4-amd64.pl; $(PERL) asm/rc4-amd64.pl $@ | 69 | asm/rc4-x86_64.s: asm/rc4-x86_64.pl; $(PERL) asm/rc4-x86_64.pl $@ |
70 | 70 | ||
71 | asm/rc4-ia64.s: asm/rc4-ia64.S | 71 | asm/rc4-ia64.s: asm/rc4-ia64.S |
72 | $(CC) $(CFLAGS) -E asm/rc4-ia64.S > $@ | 72 | @case `awk '/^#define RC4_INT/{print$$NF}' $(TOP)/include/openssl/opensslconf.h` in \ |
73 | int) set -x; $(CC) $(CFLAGS) -DSZ=4 -E asm/rc4-ia64.S > $@ ;; \ | ||
74 | char) set -x; $(CC) $(CFLAGS) -DSZ=1 -E asm/rc4-ia64.S > $@ ;; \ | ||
75 | *) exit 1 ;; \ | ||
76 | esac | ||
73 | 77 | ||
74 | files: | 78 | files: |
75 | $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO | 79 | $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO |
@@ -116,7 +120,8 @@ rc4_enc.o: ../../include/openssl/symhacks.h ../cryptlib.h rc4_enc.c rc4_locl.h | |||
116 | rc4_skey.o: ../../e_os.h ../../include/openssl/bio.h | 120 | rc4_skey.o: ../../e_os.h ../../include/openssl/bio.h |
117 | rc4_skey.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h | 121 | rc4_skey.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h |
118 | rc4_skey.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h | 122 | rc4_skey.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h |
119 | rc4_skey.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h | 123 | rc4_skey.o: ../../include/openssl/fips.h ../../include/openssl/lhash.h |
124 | rc4_skey.o: ../../include/openssl/opensslconf.h | ||
120 | rc4_skey.o: ../../include/openssl/opensslv.h ../../include/openssl/rc4.h | 125 | rc4_skey.o: ../../include/openssl/opensslv.h ../../include/openssl/rc4.h |
121 | rc4_skey.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h | 126 | rc4_skey.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h |
122 | rc4_skey.o: ../../include/openssl/symhacks.h ../cryptlib.h rc4_locl.h | 127 | rc4_skey.o: ../../include/openssl/symhacks.h ../cryptlib.h rc4_locl.h |
diff --git a/src/lib/libcrypto/rc4/asm/rc4-ia64.S b/src/lib/libcrypto/rc4/asm/rc4-ia64.S index b517d2e88f..a322d0c718 100644 --- a/src/lib/libcrypto/rc4/asm/rc4-ia64.S +++ b/src/lib/libcrypto/rc4/asm/rc4-ia64.S | |||
@@ -7,7 +7,7 @@ | |||
7 | // disclaimed. | 7 | // disclaimed. |
8 | // ==================================================================== | 8 | // ==================================================================== |
9 | 9 | ||
10 | .ident "rc4-ia64.S, Version 1.1" | 10 | .ident "rc4-ia64.S, Version 2.0" |
11 | .ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" | 11 | .ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" |
12 | 12 | ||
13 | // What's wrong with compiler generated code? Because of the nature of | 13 | // What's wrong with compiler generated code? Because of the nature of |
@@ -27,17 +27,10 @@ | |||
27 | // Legitimate "collisions" do occur within every 256^2 bytes window. | 27 | // Legitimate "collisions" do occur within every 256^2 bytes window. |
28 | // Fortunately there're enough free instruction slots to keep prior | 28 | // Fortunately there're enough free instruction slots to keep prior |
29 | // reference to key[x+1], detect "collision" and compensate for it. | 29 | // reference to key[x+1], detect "collision" and compensate for it. |
30 | // All this without sacrificing a single clock cycle:-) | 30 | // All this without sacrificing a single clock cycle:-) Throughput is |
31 | // Furthermore. In order to compress loop body to the minimum, I chose | 31 | // ~210MBps on 900MHz CPU, which is >3x faster than gcc generated |
32 | // to deploy deposit instruction, which substitutes for the whole | 32 | // code and +30% - if compared to HP-UX C. Unrolling loop below should |
33 | // key->data+((x&255)<<log2(sizeof(key->data[0]))). This unfortunately | 33 | // give >30% on top of that... |
34 | // requires key->data to be aligned at sizeof(key->data) boundary. | ||
35 | // This is why you'll find "RC4_INT pad[512-256-2];" addendum to RC4_KEY | ||
36 | // and "d=(RC4_INT *)(((size_t)(d+255))&~(sizeof(key->data)-1));" in | ||
37 | // rc4_skey.c [and rc4_enc.c, where it's retained for debugging | ||
38 | // purposes]. Throughput is ~210MBps on 900MHz CPU, which is >3x | ||
39 | // faster than gcc generated code and +30% - if compared to HP-UX C. | ||
40 | // Unrolling loop below should give >30% on top of that... | ||
41 | 34 | ||
42 | .text | 35 | .text |
43 | .explicit | 36 | .explicit |
@@ -48,7 +41,9 @@ | |||
48 | # define ADDP add | 41 | # define ADDP add |
49 | #endif | 42 | #endif |
50 | 43 | ||
44 | #ifndef SZ | ||
51 | #define SZ 4 // this is set to sizeof(RC4_INT) | 45 | #define SZ 4 // this is set to sizeof(RC4_INT) |
46 | #endif | ||
52 | // SZ==4 seems to be optimal. At least SZ==8 is not any faster, not for | 47 | // SZ==4 seems to be optimal. At least SZ==8 is not any faster, not for |
53 | // assembler implementation, while SZ==1 code is ~30% slower. | 48 | // assembler implementation, while SZ==1 code is ~30% slower. |
54 | #if SZ==1 // RC4_INT is unsigned char | 49 | #if SZ==1 // RC4_INT is unsigned char |
@@ -101,45 +96,53 @@ RC4: | |||
101 | ADDP out=0,in3 | 96 | ADDP out=0,in3 |
102 | brp.loop.imp .Ltop,.Lexit-16 };; | 97 | brp.loop.imp .Ltop,.Lexit-16 };; |
103 | { .mmi; LDKEY yy=[key] // load key->y | 98 | { .mmi; LDKEY yy=[key] // load key->y |
104 | add ksch=(255+1)*SZ,key // as ksch will be used with | 99 | add ksch=SZ,key |
105 | // deposit instruction only, | ||
106 | // I don't have to &~255... | ||
107 | mov ar.lc=in1 } | 100 | mov ar.lc=in1 } |
108 | { .mmi; mov key_y[1]=r0 // guarantee inequality | 101 | { .mmi; mov key_y[1]=r0 // guarantee inequality |
109 | // in first iteration | 102 | // in first iteration |
110 | add xx=1,xx | 103 | add xx=1,xx |
111 | mov pr.rot=1<<16 };; | 104 | mov pr.rot=1<<16 };; |
112 | { .mii; nop.m 0 | 105 | { .mii; nop.m 0 |
113 | dep key_x[1]=xx,ksch,OFF,8 | 106 | dep key_x[1]=xx,r0,OFF,8 |
114 | mov ar.ec=3 };; // note that epilogue counter | 107 | mov ar.ec=3 };; // note that epilogue counter |
115 | // is off by 1. I compensate | 108 | // is off by 1. I compensate |
116 | // for this at exit... | 109 | // for this at exit... |
117 | .Ltop: | 110 | .Ltop: |
118 | // The loop is scheduled for 3*(n+2) spin-rate on Itanium 2, which | 111 | // The loop is scheduled for 4*(n+2) spin-rate on Itanium 2, which |
119 | // theoretically gives asymptotic performance of clock frequency | 112 | // theoretically gives asymptotic performance of clock frequency |
120 | // divided by 3 bytes per second, or 500MBps on 1.5GHz CPU. Measured | 113 | // divided by 4 bytes per second, or 400MBps on 1.6GHz CPU. This is |
121 | // performance however is distinctly lower than 1/4:-( The culprit | 114 | // for sizeof(RC4_INT)==4. For smaller RC4_INT STKEY inadvertently |
122 | // seems to be *(out++)=dat, which inadvertently splits the bundle, | 115 | // splits the last bundle and you end up with 5*n spin-rate:-( |
123 | // even though there is M-port available... Unrolling is due... | 116 | // Originally the loop was scheduled for 3*n and relied on key |
124 | // Unrolled loop should collect output with variable shift instruction | 117 | // schedule to be aligned at 256*sizeof(RC4_INT) boundary. But |
125 | // in order to avoid starvation for integer shifter... It should be | 118 | // *(out++)=dat, which maps to st1, had same effect [inadvertent |
126 | // possible to get pretty close to theoretical peak... | 119 | // bundle split] and held the loop back. Rescheduling for 4*n |
127 | { .mmi; (p16) LDKEY tx[0]=[key_x[1]] // tx=key[xx] | 120 | // made it possible to eliminate dependence on specific alignment |
128 | (p17) LDKEY ty[0]=[key_y[1]] // ty=key[yy] | 121 | // and allow OpenSSH to keep "abusing" our API. Reaching for 3*n would |
129 | (p18) dep rnd[1]=rnd[1],ksch,OFF,8} // &key[(tx+ty)&255] | 122 | // require unrolling, sticking to variable shift instruction for |
123 | // collecting output [to avoid starvation for integer shifter] and | ||
124 | // copying of key schedule to controlled place in stack [so that | ||
125 | // deposit instruction can serve as substitute for whole | ||
126 | // key->data+((x&255)<<log2(sizeof(key->data[0])))]... | ||
130 | { .mmi; (p19) st1 [out]=dat[3],1 // *(out++)=dat | 127 | { .mmi; (p19) st1 [out]=dat[3],1 // *(out++)=dat |
131 | (p16) add xx=1,xx // x++ | 128 | (p16) add xx=1,xx // x++ |
132 | (p16) cmp.ne.unc p20,p21=key_x[1],key_y[1] };; | 129 | (p18) dep rnd[1]=rnd[1],r0,OFF,8 } // ((tx+ty)&255)<<OFF |
130 | { .mmi; (p16) add key_x[1]=ksch,key_x[1] // &key[xx&255] | ||
131 | (p17) add key_y[1]=ksch,key_y[1] };; // &key[yy&255] | ||
132 | { .mmi; (p16) LDKEY tx[0]=[key_x[1]] // tx=key[xx] | ||
133 | (p17) LDKEY ty[0]=[key_y[1]] // ty=key[yy] | ||
134 | (p16) dep key_x[0]=xx,r0,OFF,8 } // (xx&255)<<OFF | ||
135 | { .mmi; (p18) add rnd[1]=ksch,rnd[1] // &key[(tx+ty)&255] | ||
136 | (p16) cmp.ne.unc p20,p21=key_x[1],key_y[1] };; | ||
133 | { .mmi; (p18) LDKEY rnd[1]=[rnd[1]] // rnd=key[(tx+ty)&255] | 137 | { .mmi; (p18) LDKEY rnd[1]=[rnd[1]] // rnd=key[(tx+ty)&255] |
134 | (p16) ld1 dat[0]=[inp],1 // dat=*(inp++) | 138 | (p16) ld1 dat[0]=[inp],1 } // dat=*(inp++) |
135 | (p16) dep key_x[0]=xx,ksch,OFF,8 } // &key[xx&255] | ||
136 | .pred.rel "mutex",p20,p21 | 139 | .pred.rel "mutex",p20,p21 |
137 | { .mmi; (p21) add yy=yy,tx[1] // (p16) | 140 | { .mmi; (p21) add yy=yy,tx[1] // (p16) |
138 | (p20) add yy=yy,tx[0] // (p16) y+=tx | 141 | (p20) add yy=yy,tx[0] // (p16) y+=tx |
139 | (p21) mov tx[0]=tx[1] };; // (p16) | 142 | (p21) mov tx[0]=tx[1] };; // (p16) |
140 | { .mmi; (p17) STKEY [key_y[1]]=tx[1] // key[yy]=tx | 143 | { .mmi; (p17) STKEY [key_y[1]]=tx[1] // key[yy]=tx |
141 | (p17) STKEY [key_x[2]]=ty[0] // key[xx]=ty | 144 | (p17) STKEY [key_x[2]]=ty[0] // key[xx]=ty |
142 | (p16) dep key_y[0]=yy,ksch,OFF,8 } // &key[yy&255] | 145 | (p16) dep key_y[0]=yy,r0,OFF,8 } // (yy&255)<<OFF |
143 | { .mmb; (p17) add rnd[0]=tx[1],ty[0] // tx+=ty | 146 | { .mmb; (p17) add rnd[0]=tx[1],ty[0] // tx+=ty |
144 | (p18) xor dat[2]=dat[2],rnd[1] // dat^=rnd | 147 | (p18) xor dat[2]=dat[2],rnd[1] // dat^=rnd |
145 | br.ctop.sptk .Ltop };; | 148 | br.ctop.sptk .Ltop };; |