summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/sha
diff options
context:
space:
mode:
authordjm <>2008-09-06 12:15:54 +0000
committerdjm <>2008-09-06 12:15:54 +0000
commitf1625f274acf5dcd5601f6cb5e29e233b2a441a3 (patch)
tree44899ddfbef8f24aacbea50086c041c1b150f6d6 /src/lib/libcrypto/sha
parent4f828b924f54507141fb95ebe49dfcd261945e85 (diff)
downloadopenbsd-f1625f274acf5dcd5601f6cb5e29e233b2a441a3.tar.gz
openbsd-f1625f274acf5dcd5601f6cb5e29e233b2a441a3.tar.bz2
openbsd-f1625f274acf5dcd5601f6cb5e29e233b2a441a3.zip
import of OpenSSL 0.9.8h
Diffstat (limited to 'src/lib/libcrypto/sha')
-rw-r--r--src/lib/libcrypto/sha/Makefile102
-rw-r--r--src/lib/libcrypto/sha/asm/sha512-sse2.pl404
-rw-r--r--src/lib/libcrypto/sha/sha1test.c12
-rw-r--r--src/lib/libcrypto/sha/sha_dgst.c3
-rw-r--r--src/lib/libcrypto/sha/sha_one.c2
-rw-r--r--src/lib/libcrypto/sha/shatest.c12
6 files changed, 480 insertions, 55 deletions
diff --git a/src/lib/libcrypto/sha/Makefile b/src/lib/libcrypto/sha/Makefile
index 46103bbc83..ac64fb61d3 100644
--- a/src/lib/libcrypto/sha/Makefile
+++ b/src/lib/libcrypto/sha/Makefile
@@ -8,11 +8,6 @@ CC= cc
8CPP= $(CC) -E 8CPP= $(CC) -E
9INCLUDES= 9INCLUDES=
10CFLAG=-g 10CFLAG=-g
11INSTALL_PREFIX=
12OPENSSLDIR= /usr/local/ssl
13INSTALLTOP=/usr/local/ssl
14MAKEDEPPROG= makedepend
15MAKEDEPEND= $(TOP)/util/domd $(TOP) -MD $(MAKEDEPPROG)
16MAKEFILE= Makefile 11MAKEFILE= Makefile
17AR= ar r 12AR= ar r
18 13
@@ -20,14 +15,15 @@ SHA1_ASM_OBJ=
20 15
21CFLAGS= $(INCLUDES) $(CFLAG) 16CFLAGS= $(INCLUDES) $(CFLAG)
22ASFLAGS= $(INCLUDES) $(ASFLAG) 17ASFLAGS= $(INCLUDES) $(ASFLAG)
18AFLAGS= $(ASFLAGS)
23 19
24GENERAL=Makefile 20GENERAL=Makefile
25TEST=shatest.c sha1test.c 21TEST=shatest.c sha1test.c sha256t.c sha512t.c
26APPS= 22APPS=
27 23
28LIB=$(TOP)/libcrypto.a 24LIB=$(TOP)/libcrypto.a
29LIBSRC=sha_dgst.c sha1dgst.c sha_one.c sha1_one.c 25LIBSRC=sha_dgst.c sha1dgst.c sha_one.c sha1_one.c sha256.c sha512.c
30LIBOBJ=sha_dgst.o sha1dgst.o sha_one.o sha1_one.o $(SHA1_ASM_OBJ) 26LIBOBJ=sha_dgst.o sha1dgst.o sha_one.o sha1_one.o sha256.o sha512.o $(SHA1_ASM_OBJ)
31 27
32SRC= $(LIBSRC) 28SRC= $(LIBSRC)
33 29
@@ -46,23 +42,33 @@ lib: $(LIBOBJ)
46 $(RANLIB) $(LIB) || echo Never mind. 42 $(RANLIB) $(LIB) || echo Never mind.
47 @touch lib 43 @touch lib
48 44
49# elf 45# ELF
50asm/sx86-elf.s: asm/sha1-586.pl ../perlasm/x86asm.pl 46sx86-elf.s: asm/sha1-586.pl ../perlasm/x86asm.pl
51 (cd asm; $(PERL) sha1-586.pl elf $(CFLAGS) $(PROCESSOR) > sx86-elf.s) 47 (cd asm; $(PERL) sha1-586.pl elf $(CFLAGS) $(PROCESSOR) > ../$@)
52 48s512sse2-elf.s: asm/sha512-sse2.pl ../perlasm/x86asm.pl
49 (cd asm; $(PERL) sha512-sse2.pl elf $(CFLAGS) $(PROCESSOR) > ../$@)
50# COFF
51sx86-cof.s: asm/sha1-586.pl ../perlasm/x86asm.pl
52 (cd asm; $(PERL) sha1-586.pl coff $(CFLAGS) $(PROCESSOR) > ../$@)
53s512sse2-cof.s: asm/sha512-sse2.pl ../perlasm/x86asm.pl
54 (cd asm; $(PERL) sha512-sse2.pl coff $(CFLAGS) $(PROCESSOR) > ../$@)
53# a.out 55# a.out
54asm/sx86-out.o: asm/sx86unix.cpp 56sx86-out.s: asm/sha1-586.pl ../perlasm/x86asm.pl
55 $(CPP) -DOUT asm/sx86unix.cpp | as -o asm/sx86-out.o 57 (cd asm; $(PERL) sha1-586.pl a.out $(CFLAGS) $(PROCESSOR) > ../$@)
56 58s512sse2-out.s: asm/sha512-sse2.pl ../perlasm/x86asm.pl
57# bsdi 59 (cd asm; $(PERL) sha512-sse2.pl a.out $(CFLAGS) $(PROCESSOR) > ../$@)
58asm/sx86bsdi.o: asm/sx86unix.cpp
59 $(CPP) -DBSDI asm/sx86unix.cpp | sed 's/ :/:/' | as -o asm/sx86bsdi.o
60 60
61asm/sx86unix.cpp: asm/sha1-586.pl ../perlasm/x86asm.pl 61sha1-ia64.s: asm/sha1-ia64.pl
62 (cd asm; $(PERL) sha1-586.pl cpp $(PROCESSOR) >sx86unix.cpp)
63
64asm/sha1-ia64.s: asm/sha1-ia64.pl
65 (cd asm; $(PERL) sha1-ia64.pl $(CFLAGS) ) > $@ 62 (cd asm; $(PERL) sha1-ia64.pl $(CFLAGS) ) > $@
63sha256-ia64.s: asm/sha512-ia64.pl
64 (cd asm; $(PERL) sha512-ia64.pl ../$@ $(CFLAGS))
65sha512-ia64.s: asm/sha512-ia64.pl
66 (cd asm; $(PERL) sha512-ia64.pl ../$@ $(CFLAGS))
67
68# Solaris make has to be explicitly told
69sha1-x86_64.s: asm/sha1-x86_64.pl; $(PERL) asm/sha1-x86_64.pl $@
70sha256-x86_64.s:asm/sha512-x86_64.pl; $(PERL) asm/sha512-x86_64.pl $@
71sha512-x86_64.s:asm/sha512-x86_64.pl; $(PERL) asm/sha512-x86_64.pl $@
66 72
67files: 73files:
68 $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO 74 $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
@@ -73,6 +79,7 @@ links:
73 @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS) 79 @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS)
74 80
75install: 81install:
82 @[ -n "$(INSTALLTOP)" ] # should be set by top Makefile...
76 @headerlist="$(EXHEADER)"; for i in $$headerlist ; \ 83 @headerlist="$(EXHEADER)"; for i in $$headerlist ; \
77 do \ 84 do \
78 (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \ 85 (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \
@@ -88,6 +95,7 @@ lint:
88 lint -DLINT $(INCLUDES) $(SRC)>fluff 95 lint -DLINT $(INCLUDES) $(SRC)>fluff
89 96
90depend: 97depend:
98 @[ -n "$(MAKEDEPEND)" ] # should be set by upper Makefile...
91 $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC) 99 $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC)
92 100
93dclean: 101dclean:
@@ -95,33 +103,37 @@ dclean:
95 mv -f Makefile.new $(MAKEFILE) 103 mv -f Makefile.new $(MAKEFILE)
96 104
97clean: 105clean:
98 rm -f asm/sx86unix.cpp asm/*-elf.* *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff asm/*.o 106 rm -f *.s *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
99 107
100# DO NOT DELETE THIS LINE -- make depend depends on it. 108# DO NOT DELETE THIS LINE -- make depend depends on it.
101 109
102sha1_one.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h 110sha1_one.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
103sha1_one.o: ../../include/openssl/opensslconf.h 111sha1_one.o: ../../include/openssl/opensslconf.h
104sha1_one.o: ../../include/openssl/opensslv.h ../../include/openssl/safestack.h 112sha1_one.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
105sha1_one.o: ../../include/openssl/sha.h ../../include/openssl/stack.h 113sha1_one.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h
106sha1_one.o: ../../include/openssl/symhacks.h sha1_one.c 114sha1_one.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
107sha1dgst.o: ../../include/openssl/bio.h ../../include/openssl/crypto.h 115sha1_one.o: sha1_one.c
108sha1dgst.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h 116sha1dgst.o: ../../include/openssl/e_os2.h ../../include/openssl/opensslconf.h
109sha1dgst.o: ../../include/openssl/fips.h ../../include/openssl/lhash.h 117sha1dgst.o: ../../include/openssl/opensslv.h ../../include/openssl/sha.h
110sha1dgst.o: ../../include/openssl/opensslconf.h 118sha1dgst.o: ../md32_common.h sha1dgst.c sha_locl.h
111sha1dgst.o: ../../include/openssl/opensslv.h ../../include/openssl/safestack.h 119sha256.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
112sha1dgst.o: ../../include/openssl/sha.h ../../include/openssl/stack.h 120sha256.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
113sha1dgst.o: ../../include/openssl/symhacks.h ../md32_common.h sha1dgst.c 121sha256.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h
114sha1dgst.o: sha_locl.h 122sha256.o: ../../include/openssl/sha.h ../../include/openssl/stack.h
115sha_dgst.o: ../../include/openssl/bio.h ../../include/openssl/crypto.h 123sha256.o: ../../include/openssl/symhacks.h ../md32_common.h sha256.c
116sha_dgst.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h 124sha512.o: ../../e_os.h ../../include/openssl/bio.h
117sha_dgst.o: ../../include/openssl/fips.h ../../include/openssl/lhash.h 125sha512.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
118sha_dgst.o: ../../include/openssl/opensslconf.h 126sha512.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
119sha_dgst.o: ../../include/openssl/opensslv.h ../../include/openssl/safestack.h 127sha512.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
120sha_dgst.o: ../../include/openssl/sha.h ../../include/openssl/stack.h 128sha512.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
121sha_dgst.o: ../../include/openssl/symhacks.h ../md32_common.h sha_dgst.c 129sha512.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h
122sha_dgst.o: sha_locl.h 130sha512.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
131sha512.o: ../cryptlib.h sha512.c
132sha_dgst.o: ../../include/openssl/e_os2.h ../../include/openssl/opensslconf.h
133sha_dgst.o: ../../include/openssl/opensslv.h ../../include/openssl/sha.h
134sha_dgst.o: ../md32_common.h sha_dgst.c sha_locl.h
123sha_one.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h 135sha_one.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
124sha_one.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h 136sha_one.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
125sha_one.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h 137sha_one.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h
126sha_one.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h 138sha_one.o: ../../include/openssl/sha.h ../../include/openssl/stack.h
127sha_one.o: sha_one.c 139sha_one.o: ../../include/openssl/symhacks.h sha_one.c
diff --git a/src/lib/libcrypto/sha/asm/sha512-sse2.pl b/src/lib/libcrypto/sha/asm/sha512-sse2.pl
new file mode 100644
index 0000000000..10902bf673
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha512-sse2.pl
@@ -0,0 +1,404 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary
6# forms are granted according to the OpenSSL license.
7# ====================================================================
8#
9# SHA512_Transform_SSE2.
10#
11# As the name suggests, this is an IA-32 SSE2 implementation of
12# SHA512_Transform. Motivating factor for the undertaken effort was that
13# SHA512 was observed to *consistently* perform *significantly* poorer
14# than SHA256 [2x and slower is common] on 32-bit platforms. On 64-bit
15# platforms on the other hand SHA512 tends to outperform SHA256 [~50%
16# seems to be a common improvement factor]. All this is perfectly natural,
17# as SHA512 is a 64-bit algorithm. But isn't IA-32 SSE2 essentially
18# a 64-bit instruction set? Is it rich enough to implement SHA512?
19# If answer was "no," then you wouldn't have been reading this...
20#
21# Throughput performance in MBps (larger is better):
22#
23# 2.4GHz P4 1.4GHz AMD32 1.4GHz AMD64(*)
24# SHA256/gcc(*) 54 43 59
25# SHA512/gcc 17 23 92
26# SHA512/sse2 61(**) 57(**)
27# SHA512/icc 26 28
28# SHA256/icc(*) 65 54
29#
30# (*) AMD64 and SHA256 numbers are presented mostly for amusement or
31# reference purposes.
32# (**) I.e. it gives ~2-3x speed-up if compared with compiler generated
33# code. One can argue that hand-coded *non*-SSE2 implementation
34# would perform better than compiler generated one as well, and
35# that comparison is therefore not exactly fair. Well, as SHA512
36# puts enormous pressure on IA-32 GP register bank, I reckon that
37# hand-coded version wouldn't perform significantly better than
38# one compiled with icc, ~20% perhaps... So that this code would
39# still outperform it by a distinguishing margin. But feel free
40# to prove me wrong:-)
41# <appro@fy.chalmers.se>
42push(@INC,"perlasm","../../perlasm");
43require "x86asm.pl";
44
45&asm_init($ARGV[0],"sha512-sse2.pl",$ARGV[$#ARGV] eq "386");
46
47$K512="esi"; # K512[80] table, found at the end...
48#$W512="esp"; # $W512 is not just W512[16]: it comprises *two* copies
49 # of W512[16] and a copy of A-H variables...
50$W512_SZ=8*(16+16+8); # see above...
51#$Kidx="ebx"; # index in K512 table, advances from 0 to 80...
52$Widx="edx"; # index in W512, wraps around at 16...
53$data="edi"; # 16 qwords of input data...
54$A="mm0"; # B-D and
55$E="mm1"; # F-H are allocated dynamically...
56$Aoff=256+0; # A-H offsets relative to $W512...
57$Boff=256+8;
58$Coff=256+16;
59$Doff=256+24;
60$Eoff=256+32;
61$Foff=256+40;
62$Goff=256+48;
63$Hoff=256+56;
64
65sub SHA2_ROUND()
66{ local ($kidx,$widx)=@_;
67
68 # One can argue that one could reorder instructions for better
69 # performance. Well, I tried and it doesn't seem to make any
70 # noticeable difference. Modern out-of-order execution cores
71 # reorder instructions to their liking in either case and they
72 # apparently do a decent job. So we can keep the code more
73 # readable/regular/comprehensible:-)
74
75 # I adhere to 64-bit %mmX registers in order to avoid/not care
76 # about #GP exceptions on misaligned 128-bit access, most
77 # notably in paddq with memory operand. Not to mention that
78 # SSE2 instructions operating on %mmX can be scheduled every
79 # cycle [and not every second one if operating on %xmmN].
80
81 &movq ("mm4",&QWP($Foff,$W512)); # load f
82 &movq ("mm5",&QWP($Goff,$W512)); # load g
83 &movq ("mm6",&QWP($Hoff,$W512)); # load h
84
85 &movq ("mm2",$E); # %mm2 is sliding right
86 &movq ("mm3",$E); # %mm3 is sliding left
87 &psrlq ("mm2",14);
88 &psllq ("mm3",23);
89 &movq ("mm7","mm2"); # %mm7 is T1
90 &pxor ("mm7","mm3");
91 &psrlq ("mm2",4);
92 &psllq ("mm3",23);
93 &pxor ("mm7","mm2");
94 &pxor ("mm7","mm3");
95 &psrlq ("mm2",23);
96 &psllq ("mm3",4);
97 &pxor ("mm7","mm2");
98 &pxor ("mm7","mm3"); # T1=Sigma1_512(e)
99
100 &movq (&QWP($Foff,$W512),$E); # f = e
101 &movq (&QWP($Goff,$W512),"mm4"); # g = f
102 &movq (&QWP($Hoff,$W512),"mm5"); # h = g
103
104 &pxor ("mm4","mm5"); # f^=g
105 &pand ("mm4",$E); # f&=e
106 &pxor ("mm4","mm5"); # f^=g
107 &paddq ("mm7","mm4"); # T1+=Ch(e,f,g)
108
109 &movq ("mm2",&QWP($Boff,$W512)); # load b
110 &movq ("mm3",&QWP($Coff,$W512)); # load c
111 &movq ($E,&QWP($Doff,$W512)); # e = d
112
113 &paddq ("mm7","mm6"); # T1+=h
114 &paddq ("mm7",&QWP(0,$K512,$kidx,8)); # T1+=K512[i]
115 &paddq ("mm7",&QWP(0,$W512,$widx,8)); # T1+=W512[i]
116 &paddq ($E,"mm7"); # e += T1
117
118 &movq ("mm4",$A); # %mm4 is sliding right
119 &movq ("mm5",$A); # %mm5 is sliding left
120 &psrlq ("mm4",28);
121 &psllq ("mm5",25);
122 &movq ("mm6","mm4"); # %mm6 is T2
123 &pxor ("mm6","mm5");
124 &psrlq ("mm4",6);
125 &psllq ("mm5",5);
126 &pxor ("mm6","mm4");
127 &pxor ("mm6","mm5");
128 &psrlq ("mm4",5);
129 &psllq ("mm5",6);
130 &pxor ("mm6","mm4");
131 &pxor ("mm6","mm5"); # T2=Sigma0_512(a)
132
133 &movq (&QWP($Boff,$W512),$A); # b = a
134 &movq (&QWP($Coff,$W512),"mm2"); # c = b
135 &movq (&QWP($Doff,$W512),"mm3"); # d = c
136
137 &movq ("mm4",$A); # %mm4=a
138 &por ($A,"mm3"); # a=a|c
139 &pand ("mm4","mm3"); # %mm4=a&c
140 &pand ($A,"mm2"); # a=(a|c)&b
141 &por ("mm4",$A); # %mm4=(a&c)|((a|c)&b)
142 &paddq ("mm6","mm4"); # T2+=Maj(a,b,c)
143
144 &movq ($A,"mm7"); # a=T1
145 &paddq ($A,"mm6"); # a+=T2
146}
147
148$func="sha512_block_sse2";
149
150&function_begin_B($func);
151 if (0) {# Caller is expected to check if it's appropriate to
152 # call this routine. Below 3 lines are retained for
153 # debugging purposes...
154 &picmeup("eax","OPENSSL_ia32cap");
155 &bt (&DWP(0,"eax"),26);
156 &jnc ("SHA512_Transform");
157 }
158
159 &push ("ebp");
160 &mov ("ebp","esp");
161 &push ("ebx");
162 &push ("esi");
163 &push ("edi");
164
165 &mov ($Widx,&DWP(8,"ebp")); # A-H state, 1st arg
166 &mov ($data,&DWP(12,"ebp")); # input data, 2nd arg
167 &call (&label("pic_point")); # make it PIC!
168&set_label("pic_point");
169 &blindpop($K512);
170 &lea ($K512,&DWP(&label("K512")."-".&label("pic_point"),$K512));
171
172 $W512 = "esp"; # start using %esp as W512
173 &sub ($W512,$W512_SZ);
174 &and ($W512,-16); # ensure 128-bit alignment
175
176 # make private copy of A-H
177 # v assume the worst and stick to unaligned load
178 &movdqu ("xmm0",&QWP(0,$Widx));
179 &movdqu ("xmm1",&QWP(16,$Widx));
180 &movdqu ("xmm2",&QWP(32,$Widx));
181 &movdqu ("xmm3",&QWP(48,$Widx));
182
183&align(8);
184&set_label("_chunk_loop");
185
186 &movdqa (&QWP($Aoff,$W512),"xmm0"); # a,b
187 &movdqa (&QWP($Coff,$W512),"xmm1"); # c,d
188 &movdqa (&QWP($Eoff,$W512),"xmm2"); # e,f
189 &movdqa (&QWP($Goff,$W512),"xmm3"); # g,h
190
191 &xor ($Widx,$Widx);
192
193 &movdq2q($A,"xmm0"); # load a
194 &movdq2q($E,"xmm2"); # load e
195
196 # Why aren't loops unrolled? It makes sense to unroll if
197 # execution time for loop body is comparable with branch
198 # penalties and/or if whole data-set resides in register bank.
199 # Neither is the case here... Well, it would be possible to
200 # eliminate a few store operations, but it would hardly affect
201 # so to say stop-watch performance, as there is a lot of
202 # available memory slots to fill. It will only relieve some
203 # pressure off memory bus...
204
205 # flip input stream byte order...
206 &mov ("eax",&DWP(0,$data,$Widx,8));
207 &mov ("ebx",&DWP(4,$data,$Widx,8));
208 &bswap ("eax");
209 &bswap ("ebx");
210 &mov (&DWP(0,$W512,$Widx,8),"ebx"); # W512[i]
211 &mov (&DWP(4,$W512,$Widx,8),"eax");
212 &mov (&DWP(128+0,$W512,$Widx,8),"ebx"); # copy of W512[i]
213 &mov (&DWP(128+4,$W512,$Widx,8),"eax");
214
215&align(8);
216&set_label("_1st_loop"); # 0-15
217 # flip input stream byte order...
218 &mov ("eax",&DWP(0+8,$data,$Widx,8));
219 &mov ("ebx",&DWP(4+8,$data,$Widx,8));
220 &bswap ("eax");
221 &bswap ("ebx");
222 &mov (&DWP(0+8,$W512,$Widx,8),"ebx"); # W512[i]
223 &mov (&DWP(4+8,$W512,$Widx,8),"eax");
224 &mov (&DWP(128+0+8,$W512,$Widx,8),"ebx"); # copy of W512[i]
225 &mov (&DWP(128+4+8,$W512,$Widx,8),"eax");
226&set_label("_1st_looplet");
227 &SHA2_ROUND($Widx,$Widx); &inc($Widx);
228
229&cmp ($Widx,15)
230&jl (&label("_1st_loop"));
231&je (&label("_1st_looplet")); # playing similar trick on 2nd loop
232 # does not improve performance...
233
234 $Kidx = "ebx"; # start using %ebx as Kidx
235 &mov ($Kidx,$Widx);
236
237&align(8);
238&set_label("_2nd_loop"); # 16-79
239 &and($Widx,0xf);
240
241 # 128-bit fragment! I update W512[i] and W512[i+1] in
242 # parallel:-) Note that I refer to W512[(i&0xf)+N] and not to
243 # W512[(i+N)&0xf]! This is exactly what I maintain the second
244 # copy of W512[16] for...
245 &movdqu ("xmm0",&QWP(8*1,$W512,$Widx,8)); # s0=W512[i+1]
246 &movdqa ("xmm2","xmm0"); # %xmm2 is sliding right
247 &movdqa ("xmm3","xmm0"); # %xmm3 is sliding left
248 &psrlq ("xmm2",1);
249 &psllq ("xmm3",56);
250 &movdqa ("xmm0","xmm2");
251 &pxor ("xmm0","xmm3");
252 &psrlq ("xmm2",6);
253 &psllq ("xmm3",7);
254 &pxor ("xmm0","xmm2");
255 &pxor ("xmm0","xmm3");
256 &psrlq ("xmm2",1);
257 &pxor ("xmm0","xmm2"); # s0 = sigma0_512(s0);
258
259 &movdqa ("xmm1",&QWP(8*14,$W512,$Widx,8)); # s1=W512[i+14]
260 &movdqa ("xmm4","xmm1"); # %xmm4 is sliding right
261 &movdqa ("xmm5","xmm1"); # %xmm5 is sliding left
262 &psrlq ("xmm4",6);
263 &psllq ("xmm5",3);
264 &movdqa ("xmm1","xmm4");
265 &pxor ("xmm1","xmm5");
266 &psrlq ("xmm4",13);
267 &psllq ("xmm5",42);
268 &pxor ("xmm1","xmm4");
269 &pxor ("xmm1","xmm5");
270 &psrlq ("xmm4",42);
271 &pxor ("xmm1","xmm4"); # s1 = sigma1_512(s1);
272
273 # + have to explicitly load W512[i+9] as it's not 128-bit
274 # v aligned and paddq would throw an exception...
275 &movdqu ("xmm6",&QWP(8*9,$W512,$Widx,8));
276 &paddq ("xmm0","xmm1"); # s0 += s1
277 &paddq ("xmm0","xmm6"); # s0 += W512[i+9]
278 &paddq ("xmm0",&QWP(0,$W512,$Widx,8)); # s0 += W512[i]
279
280 &movdqa (&QWP(0,$W512,$Widx,8),"xmm0"); # W512[i] = s0
281 &movdqa (&QWP(16*8,$W512,$Widx,8),"xmm0"); # copy of W512[i]
282
283 # as the above fragment was 128-bit, we "owe" 2 rounds...
284 &SHA2_ROUND($Kidx,$Widx); &inc($Kidx); &inc($Widx);
285 &SHA2_ROUND($Kidx,$Widx); &inc($Kidx); &inc($Widx);
286
287&cmp ($Kidx,80);
288&jl (&label("_2nd_loop"));
289
290 # update A-H state
291 &mov ($Widx,&DWP(8,"ebp")); # A-H state, 1st arg
292 &movq (&QWP($Aoff,$W512),$A); # write out a
293 &movq (&QWP($Eoff,$W512),$E); # write out e
294 &movdqu ("xmm0",&QWP(0,$Widx));
295 &movdqu ("xmm1",&QWP(16,$Widx));
296 &movdqu ("xmm2",&QWP(32,$Widx));
297 &movdqu ("xmm3",&QWP(48,$Widx));
298 &paddq ("xmm0",&QWP($Aoff,$W512)); # 128-bit additions...
299 &paddq ("xmm1",&QWP($Coff,$W512));
300 &paddq ("xmm2",&QWP($Eoff,$W512));
301 &paddq ("xmm3",&QWP($Goff,$W512));
302 &movdqu (&QWP(0,$Widx),"xmm0");
303 &movdqu (&QWP(16,$Widx),"xmm1");
304 &movdqu (&QWP(32,$Widx),"xmm2");
305 &movdqu (&QWP(48,$Widx),"xmm3");
306
307&add ($data,16*8); # advance input data pointer
308&dec (&DWP(16,"ebp")); # decrement 3rd arg
309&jnz (&label("_chunk_loop"));
310
311 # epilogue
312 &emms (); # required for at least ELF and Win32 ABIs
313 &mov ("edi",&DWP(-12,"ebp"));
314 &mov ("esi",&DWP(-8,"ebp"));
315 &mov ("ebx",&DWP(-4,"ebp"));
316 &leave ();
317&ret ();
318
319&align(64);
320&set_label("K512"); # Yes! I keep it in the code segment!
321 &data_word(0xd728ae22,0x428a2f98); # u64
322 &data_word(0x23ef65cd,0x71374491); # u64
323 &data_word(0xec4d3b2f,0xb5c0fbcf); # u64
324 &data_word(0x8189dbbc,0xe9b5dba5); # u64
325 &data_word(0xf348b538,0x3956c25b); # u64
326 &data_word(0xb605d019,0x59f111f1); # u64
327 &data_word(0xaf194f9b,0x923f82a4); # u64
328 &data_word(0xda6d8118,0xab1c5ed5); # u64
329 &data_word(0xa3030242,0xd807aa98); # u64
330 &data_word(0x45706fbe,0x12835b01); # u64
331 &data_word(0x4ee4b28c,0x243185be); # u64
332 &data_word(0xd5ffb4e2,0x550c7dc3); # u64
333 &data_word(0xf27b896f,0x72be5d74); # u64
334 &data_word(0x3b1696b1,0x80deb1fe); # u64
335 &data_word(0x25c71235,0x9bdc06a7); # u64
336 &data_word(0xcf692694,0xc19bf174); # u64
337 &data_word(0x9ef14ad2,0xe49b69c1); # u64
338 &data_word(0x384f25e3,0xefbe4786); # u64
339 &data_word(0x8b8cd5b5,0x0fc19dc6); # u64
340 &data_word(0x77ac9c65,0x240ca1cc); # u64
341 &data_word(0x592b0275,0x2de92c6f); # u64
342 &data_word(0x6ea6e483,0x4a7484aa); # u64
343 &data_word(0xbd41fbd4,0x5cb0a9dc); # u64
344 &data_word(0x831153b5,0x76f988da); # u64
345 &data_word(0xee66dfab,0x983e5152); # u64
346 &data_word(0x2db43210,0xa831c66d); # u64
347 &data_word(0x98fb213f,0xb00327c8); # u64
348 &data_word(0xbeef0ee4,0xbf597fc7); # u64
349 &data_word(0x3da88fc2,0xc6e00bf3); # u64
350 &data_word(0x930aa725,0xd5a79147); # u64
351 &data_word(0xe003826f,0x06ca6351); # u64
352 &data_word(0x0a0e6e70,0x14292967); # u64
353 &data_word(0x46d22ffc,0x27b70a85); # u64
354 &data_word(0x5c26c926,0x2e1b2138); # u64
355 &data_word(0x5ac42aed,0x4d2c6dfc); # u64
356 &data_word(0x9d95b3df,0x53380d13); # u64
357 &data_word(0x8baf63de,0x650a7354); # u64
358 &data_word(0x3c77b2a8,0x766a0abb); # u64
359 &data_word(0x47edaee6,0x81c2c92e); # u64
360 &data_word(0x1482353b,0x92722c85); # u64
361 &data_word(0x4cf10364,0xa2bfe8a1); # u64
362 &data_word(0xbc423001,0xa81a664b); # u64
363 &data_word(0xd0f89791,0xc24b8b70); # u64
364 &data_word(0x0654be30,0xc76c51a3); # u64
365 &data_word(0xd6ef5218,0xd192e819); # u64
366 &data_word(0x5565a910,0xd6990624); # u64
367 &data_word(0x5771202a,0xf40e3585); # u64
368 &data_word(0x32bbd1b8,0x106aa070); # u64
369 &data_word(0xb8d2d0c8,0x19a4c116); # u64
370 &data_word(0x5141ab53,0x1e376c08); # u64
371 &data_word(0xdf8eeb99,0x2748774c); # u64
372 &data_word(0xe19b48a8,0x34b0bcb5); # u64
373 &data_word(0xc5c95a63,0x391c0cb3); # u64
374 &data_word(0xe3418acb,0x4ed8aa4a); # u64
375 &data_word(0x7763e373,0x5b9cca4f); # u64
376 &data_word(0xd6b2b8a3,0x682e6ff3); # u64
377 &data_word(0x5defb2fc,0x748f82ee); # u64
378 &data_word(0x43172f60,0x78a5636f); # u64
379 &data_word(0xa1f0ab72,0x84c87814); # u64
380 &data_word(0x1a6439ec,0x8cc70208); # u64
381 &data_word(0x23631e28,0x90befffa); # u64
382 &data_word(0xde82bde9,0xa4506ceb); # u64
383 &data_word(0xb2c67915,0xbef9a3f7); # u64
384 &data_word(0xe372532b,0xc67178f2); # u64
385 &data_word(0xea26619c,0xca273ece); # u64
386 &data_word(0x21c0c207,0xd186b8c7); # u64
387 &data_word(0xcde0eb1e,0xeada7dd6); # u64
388 &data_word(0xee6ed178,0xf57d4f7f); # u64
389 &data_word(0x72176fba,0x06f067aa); # u64
390 &data_word(0xa2c898a6,0x0a637dc5); # u64
391 &data_word(0xbef90dae,0x113f9804); # u64
392 &data_word(0x131c471b,0x1b710b35); # u64
393 &data_word(0x23047d84,0x28db77f5); # u64
394 &data_word(0x40c72493,0x32caab7b); # u64
395 &data_word(0x15c9bebc,0x3c9ebe0a); # u64
396 &data_word(0x9c100d4c,0x431d67c4); # u64
397 &data_word(0xcb3e42b6,0x4cc5d4be); # u64
398 &data_word(0xfc657e2a,0x597f299c); # u64
399 &data_word(0x3ad6faec,0x5fcb6fab); # u64
400 &data_word(0x4a475817,0x6c44198c); # u64
401
402&function_end_B($func);
403
404&asm_finish();
diff --git a/src/lib/libcrypto/sha/sha1test.c b/src/lib/libcrypto/sha/sha1test.c
index 4f2e4ada2d..6feb3964c7 100644
--- a/src/lib/libcrypto/sha/sha1test.c
+++ b/src/lib/libcrypto/sha/sha1test.c
@@ -106,7 +106,7 @@ static char *pt(unsigned char *md);
106int main(int argc, char *argv[]) 106int main(int argc, char *argv[])
107 { 107 {
108 int i,err=0; 108 int i,err=0;
109 unsigned char **P,**R; 109 char **P,**R;
110 static unsigned char buf[1000]; 110 static unsigned char buf[1000];
111 char *p,*r; 111 char *p,*r;
112 EVP_MD_CTX c; 112 EVP_MD_CTX c;
@@ -118,12 +118,12 @@ int main(int argc, char *argv[])
118#endif 118#endif
119 119
120 EVP_MD_CTX_init(&c); 120 EVP_MD_CTX_init(&c);
121 P=(unsigned char **)test; 121 P=test;
122 R=(unsigned char **)ret; 122 R=ret;
123 i=1; 123 i=1;
124 while (*P != NULL) 124 while (*P != NULL)
125 { 125 {
126 EVP_Digest(*P,(unsigned long)strlen((char *)*P),md,NULL,EVP_sha1(), NULL); 126 EVP_Digest(*P,strlen((char *)*P),md,NULL,EVP_sha1(), NULL);
127 p=pt(md); 127 p=pt(md);
128 if (strcmp(p,(char *)*R) != 0) 128 if (strcmp(p,(char *)*R) != 0)
129 { 129 {
@@ -157,6 +157,10 @@ int main(int argc, char *argv[])
157 } 157 }
158 else 158 else
159 printf("test 3 ok\n"); 159 printf("test 3 ok\n");
160
161#ifdef OPENSSL_SYS_NETWARE
162 if (err) printf("ERROR: %d\n", err);
163#endif
160 EXIT(err); 164 EXIT(err);
161 EVP_MD_CTX_cleanup(&c); 165 EVP_MD_CTX_cleanup(&c);
162 return(0); 166 return(0);
diff --git a/src/lib/libcrypto/sha/sha_dgst.c b/src/lib/libcrypto/sha/sha_dgst.c
index 5a4b3ab204..70eb56032c 100644
--- a/src/lib/libcrypto/sha/sha_dgst.c
+++ b/src/lib/libcrypto/sha/sha_dgst.c
@@ -56,6 +56,7 @@
56 * [including the GNU Public Licence.] 56 * [including the GNU Public Licence.]
57 */ 57 */
58 58
59#include <openssl/opensslconf.h>
59#if !defined(OPENSSL_NO_SHA0) && !defined(OPENSSL_NO_SHA) 60#if !defined(OPENSSL_NO_SHA0) && !defined(OPENSSL_NO_SHA)
60 61
61#undef SHA_1 62#undef SHA_1
@@ -63,7 +64,7 @@
63 64
64#include <openssl/opensslv.h> 65#include <openssl/opensslv.h>
65 66
66const char *SHA_version="SHA" OPENSSL_VERSION_PTEXT; 67const char SHA_version[]="SHA" OPENSSL_VERSION_PTEXT;
67 68
68/* The implementation is in ../md32_common.h */ 69/* The implementation is in ../md32_common.h */
69 70
diff --git a/src/lib/libcrypto/sha/sha_one.c b/src/lib/libcrypto/sha/sha_one.c
index d4f4d344df..3bae623ce8 100644
--- a/src/lib/libcrypto/sha/sha_one.c
+++ b/src/lib/libcrypto/sha/sha_one.c
@@ -62,7 +62,7 @@
62#include <openssl/crypto.h> 62#include <openssl/crypto.h>
63 63
64#ifndef OPENSSL_NO_SHA0 64#ifndef OPENSSL_NO_SHA0
65unsigned char *SHA(const unsigned char *d, unsigned long n, unsigned char *md) 65unsigned char *SHA(const unsigned char *d, size_t n, unsigned char *md)
66 { 66 {
67 SHA_CTX c; 67 SHA_CTX c;
68 static unsigned char m[SHA_DIGEST_LENGTH]; 68 static unsigned char m[SHA_DIGEST_LENGTH];
diff --git a/src/lib/libcrypto/sha/shatest.c b/src/lib/libcrypto/sha/shatest.c
index ff702aa53e..ed0fe06a7b 100644
--- a/src/lib/libcrypto/sha/shatest.c
+++ b/src/lib/libcrypto/sha/shatest.c
@@ -106,7 +106,7 @@ static char *pt(unsigned char *md);
106int main(int argc, char *argv[]) 106int main(int argc, char *argv[])
107 { 107 {
108 int i,err=0; 108 int i,err=0;
109 unsigned char **P,**R; 109 char **P,**R;
110 static unsigned char buf[1000]; 110 static unsigned char buf[1000];
111 char *p,*r; 111 char *p,*r;
112 EVP_MD_CTX c; 112 EVP_MD_CTX c;
@@ -118,12 +118,12 @@ int main(int argc, char *argv[])
118#endif 118#endif
119 119
120 EVP_MD_CTX_init(&c); 120 EVP_MD_CTX_init(&c);
121 P=(unsigned char **)test; 121 P=test;
122 R=(unsigned char **)ret; 122 R=ret;
123 i=1; 123 i=1;
124 while (*P != NULL) 124 while (*P != NULL)
125 { 125 {
126 EVP_Digest(*P,(unsigned long)strlen((char *)*P),md,NULL,EVP_sha(), NULL); 126 EVP_Digest(*P,strlen((char *)*P),md,NULL,EVP_sha(), NULL);
127 p=pt(md); 127 p=pt(md);
128 if (strcmp(p,(char *)*R) != 0) 128 if (strcmp(p,(char *)*R) != 0)
129 { 129 {
@@ -157,6 +157,10 @@ int main(int argc, char *argv[])
157 } 157 }
158 else 158 else
159 printf("test 3 ok\n"); 159 printf("test 3 ok\n");
160
161#ifdef OPENSSL_SYS_NETWARE
162 if (err) printf("ERROR: %d\n", err);
163#endif
160 EVP_MD_CTX_cleanup(&c); 164 EVP_MD_CTX_cleanup(&c);
161 EXIT(err); 165 EXIT(err);
162 return(0); 166 return(0);