path: root/src/lib/libcrypto/sha
author    cvs2svn <admin@example.com>  2015-08-02 21:54:22 +0000
committer cvs2svn <admin@example.com>  2015-08-02 21:54:22 +0000
commit    ed3760bf4be4a96a89233fb8f8b84a0d44725862 (patch)
tree      5609c82060f75c53af0a7641d9b33a88574876cd  /src/lib/libcrypto/sha
parent    f8b563fb5ba1524c821d37308f4e6abfc866bc3f (diff)
This commit was manufactured by cvs2git to create tag 'OPENBSD_5_8_BASE'. (tag: OPENBSD_5_8_BASE)
Diffstat (limited to 'src/lib/libcrypto/sha')
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-586.pl          1225
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-alpha.pl         317
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-armv4-large.pl   248
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-ia64.pl          305
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-mips.pl          350
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-parisc.pl        266
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha1-ppc.pl           326
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-s390x.pl         246
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-sparcv9.pl       284
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl      601
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-thumb.pl         259
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha1-x86_64.pl       1261
-rw-r--r--  src/lib/libcrypto/sha/asm/sha256-586.pl         249
-rw-r--r--  src/lib/libcrypto/sha/asm/sha256-armv4.pl       211
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-586.pl         644
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-armv4.pl       582
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-ia64.pl        672
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-mips.pl        457
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-parisc.pl      805
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-ppc.pl         460
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-s390x.pl       322
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-sparcv9.pl     594
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-x86_64.pl      342
-rw-r--r--  src/lib/libcrypto/sha/sha.h                     201
-rw-r--r--  src/lib/libcrypto/sha/sha1_one.c                 81
-rw-r--r--  src/lib/libcrypto/sha/sha1dgst.c                 75
-rw-r--r--  src/lib/libcrypto/sha/sha256.c                  284
-rw-r--r--  src/lib/libcrypto/sha/sha512.c                  558
-rw-r--r--  src/lib/libcrypto/sha/sha_locl.h                435
29 files changed, 0 insertions, 12660 deletions
diff --git a/src/lib/libcrypto/sha/asm/sha1-586.pl b/src/lib/libcrypto/sha/asm/sha1-586.pl
deleted file mode 100644
index 6fbea34d78..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-586.pl
+++ /dev/null
@@ -1,1225 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# "[Re]written" was achieved in two major overhauls. In 2004 BODY_*
11# functions were re-implemented to address a P4 performance issue [see
12# commentary below], and in 2006 the rest was rewritten in order to
13# gain freedom to liberate licensing terms.
14
15# January, September 2004.
16#
17# It was noted that the Intel IA-32 C compiler generates code which
18# performs ~30% *faster* on the P4 CPU than the original *hand-coded*
19# SHA1 assembler implementation. To address this problem (and
20# prove that humans are still better than machines:-), the
21# original code was overhauled, which resulted in the following
22# performance changes:
23#
24# compared with original compared with Intel cc
25# assembler impl. generated code
26# Pentium -16% +48%
27# PIII/AMD +8% +16%
28# P4 +85%(!) +45%
29#
30# As you can see, Pentium came out as the loser:-( Yet I reckoned that
31# the improvement on P4 outweighs the loss and incorporated this
32# re-tuned code into 0.9.7 and later.
33# ----------------------------------------------------------------
34# <appro@fy.chalmers.se>
35
36# August 2009.
37#
38# George Spelvin has pointed out that F_40_59(b,c,d) can be rewritten as
39# '(c&d) + (b&(c^d))', which makes it possible to accumulate partial results
40# and lighten "pressure" on scratch registers. This resulted in
41# >12% performance improvement on contemporary AMD cores (with no
42# degradation on other CPUs:-). Also, the code was revised to maximize
43# "distance" between instructions producing input to 'lea' instruction
44# and the 'lea' instruction itself, which is essential for Intel Atom
45# core and resulted in ~15% improvement.
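#
# A minimal standalone Perl sketch (illustration only, not used by the
# generator) of why Spelvin's form equals the usual majority function:
# (c&d) and (b&(c^d)) are never set in the same bit position, so the
# addition cannot carry and matches (b&c)|(b&d)|(c&d) bit for bit.
sub check_spelvin_form {
	for my $b (0,1) { for my $c (0,1) { for my $d (0,1) {
		my $maj = ($b & $c) | ($b & $d) | ($c & $d);	# original F_40_59
		my $alt = ($c & $d) + ($b & ($c ^ $d));		# Spelvin's rewrite
		die "mismatch at b=$b c=$c d=$d\n" unless $maj == $alt;
	}}}
	return 1;
}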
46
47# October 2010.
48#
49# Add an SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
50# is to offload the message schedule, denoted Wt in the NIST specification
51# (or Xupdate in the OpenSSL source), to the SIMD unit. The idea is not
52# novel, and in an SSE2 context was first explored by Dean Gaudet in 2004, see
53# http://arctic.org/~dean/crypto/sha1.html. Since then several things
54# have changed that made it interesting again:
55#
56# a) XMM units became faster and wider;
57# b) instruction set became more versatile;
58# c) an important observation was made by Max Locktyukhin, which made
59#    it possible to reduce the number of instructions required to perform
60# the operation in question, for further details see
61# http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/.
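#
# For reference, the scalar recurrence being offloaded is the standard
# FIPS 180 one, W[t] = ROTL1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]); a
# minimal Perl sketch (illustration only, not used by the generator) of
# one such Xupdate step over the same 16-word ring indexing used below:
sub xupdate_scalar {
	my ($X, $j) = @_;		# $X: ref to 16 32-bit words, $j >= 16
	my $w = $X->[$j % 16] ^ $X->[($j + 2) % 16] ^
		$X->[($j + 8) % 16] ^ $X->[($j + 13) % 16];
	$X->[$j % 16] = (($w << 1) | ($w >> 31)) & 0xffffffff;	# ROTATE(w,1)
	return $X->[$j % 16];
}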
62
63# April 2011.
64#
65# Add AVX code path, probably most controversial... The thing is that
66# switch to AVX alone improves performance by as little as 4% in
67# comparison to the SSSE3 code path. But the result below doesn't look like
68# a 4% improvement... The trouble is that Sandy Bridge decodes 'ro[rl]' as a
69# pair of µ-ops, and it's the additional µ-ops, two per round, that
70# make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded
71# as a single µ-op by Sandy Bridge, and it's replacing 'ro[rl]' with the
72# equivalent 'sh[rl]d' that is responsible for the impressive 5.1
73# cycles per processed byte. But 'sh[rl]d' has not traditionally been
74# fast, nor does it appear to be fast in the upcoming Bulldozer
75# [according to its optimization manual]. Which is why the AVX code path
76# is guarded by *both* AVX and a synthetic bit denoting Intel CPUs.
77# One can argue that it's unfair to AMD, but without 'sh[rl]d' it
78# makes no sense to keep the AVX code path. If somebody feels that
79# strongly, it's probably more appropriate to discuss the possibility of
80# using vector rotate XOP on AMD...
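#
# A hedged aside (illustration only, not used by the generator): 'shld'
# can stand in for 'rol' because shld(a,b,n) computes (a<<n)|(b>>(32-n)),
# so passing the same value as both operands yields a plain left rotate.
# In Perl, for 0 < $n < 32:
sub shld32 { my ($a, $b, $n) = @_; (($a << $n) | ($b >> (32 - $n))) & 0xffffffff }
sub rol32  { my ($a, $n)     = @_; shld32($a, $a, $n) }	# rol a,n == shld a,a,n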
81
82######################################################################
83# Current performance is summarized in the following table. Numbers are
84# CPU clock cycles spent to process a single byte (less is better).
85#
86# x86 SSSE3 AVX
87# Pentium 15.7 -
88# PIII 11.5 -
89# P4 10.6 -
90# AMD K8 7.1 -
91# Core2 7.3 6.1/+20% -
92# Atom 12.5 9.5(*)/+32% -
93# Westmere 7.3 5.6/+30% -
94# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+70%
95#
96# (*)	The loop is 1056 instructions long and the expected result is ~8.25.
97#	It remains a mystery [to me] why ILP is limited to 1.7.
98#
99# (**) As per above comment, the result is for AVX *plus* sh[rl]d.
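#
# Since the table is in cycles per processed byte, throughput is simply
# clock/cpb; a back-of-the-envelope helper (illustration only, figures
# below assume a hypothetical 3 GHz part, not a measured benchmark):
sub mbytes_per_sec { my ($ghz, $cpb) = @_; $ghz * 1000 / $cpb }
# e.g. mbytes_per_sec(3.0, 5.1) comes out to roughly 588 MB/s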
100
101$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
102push(@INC,"${dir}","${dir}../../perlasm");
103require "x86asm.pl";
104
105&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");
106
107$xmm=$ymm=0;
108for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
109
110$ymm=1 if ($xmm &&
111 `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
112 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
113 $1>=2.19); # first version supporting AVX
114
115&external_label("OPENSSL_ia32cap_P") if ($xmm);
116
117
118$A="eax";
119$B="ebx";
120$C="ecx";
121$D="edx";
122$E="edi";
123$T="esi";
124$tmp1="ebp";
125
126@V=($A,$B,$C,$D,$E,$T);
127
128$alt=0; # 1 denotes alternative IALU implementation, which performs
129 # 8% *worse* on P4, same on Westmere and Atom, 2% better on
130 # Sandy Bridge...
131
132sub BODY_00_15
133 {
134 local($n,$a,$b,$c,$d,$e,$f)=@_;
135
136 &comment("00_15 $n");
137
138 &mov($f,$c); # f to hold F_00_19(b,c,d)
139 if ($n==0) { &mov($tmp1,$a); }
140 else { &mov($a,$tmp1); }
141 &rotl($tmp1,5); # tmp1=ROTATE(a,5)
142 &xor($f,$d);
143 &add($tmp1,$e); # tmp1+=e;
144 &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded
145 # with xi, also note that e becomes
146 # f in next round...
147 &and($f,$b);
148 &rotr($b,2); # b=ROTATE(b,30)
149 &xor($f,$d); # f holds F_00_19(b,c,d)
150 &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi
151
152 if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round
153 &add($f,$tmp1); } # f+=tmp1
154 else { &add($tmp1,$f); } # f becomes a in next round
155 &mov($tmp1,$a) if ($alt && $n==15);
156 }
157
158sub BODY_16_19
159 {
160 local($n,$a,$b,$c,$d,$e,$f)=@_;
161
162 &comment("16_19 $n");
163
164if ($alt) {
165 &xor($c,$d);
166 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
167 &and($tmp1,$c); # tmp1 to hold F_00_19(b,c,d), b&=c^d
168 &xor($f,&swtmp(($n+8)%16));
169 &xor($tmp1,$d); # tmp1=F_00_19(b,c,d)
170 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
171 &rotl($f,1); # f=ROTATE(f,1)
172 &add($e,$tmp1); # e+=F_00_19(b,c,d)
173 &xor($c,$d); # restore $c
174 &mov($tmp1,$a); # b in next round
175 &rotr($b,$n==16?2:7); # b=ROTATE(b,30)
176 &mov(&swtmp($n%16),$f); # xi=f
177 &rotl($a,5); # ROTATE(a,5)
178 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
179 &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
180 &add($f,$a); # f+=ROTATE(a,5)
181} else {
182 &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d)
183 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
184 &xor($tmp1,$d);
185 &xor($f,&swtmp(($n+8)%16));
186 &and($tmp1,$b);
187 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
188 &rotl($f,1); # f=ROTATE(f,1)
189 &xor($tmp1,$d); # tmp1=F_00_19(b,c,d)
190 &add($e,$tmp1); # e+=F_00_19(b,c,d)
191 &mov($tmp1,$a);
192 &rotr($b,2); # b=ROTATE(b,30)
193 &mov(&swtmp($n%16),$f); # xi=f
194 &rotl($tmp1,5); # ROTATE(a,5)
195 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
196 &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
197 &add($f,$tmp1); # f+=ROTATE(a,5)
198}
199 }
200
201sub BODY_20_39
202 {
203 local($n,$a,$b,$c,$d,$e,$f)=@_;
204 local $K=($n<40)?0x6ed9eba1:0xca62c1d6;
205
206 &comment("20_39 $n");
207
208if ($alt) {
209 &xor($tmp1,$c); # tmp1 to hold F_20_39(b,c,d), b^=c
210 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
211 &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d)
212 &xor($f,&swtmp(($n+8)%16));
213 &add($e,$tmp1); # e+=F_20_39(b,c,d)
214 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
215 &rotl($f,1); # f=ROTATE(f,1)
216 &mov($tmp1,$a); # b in next round
217 &rotr($b,7); # b=ROTATE(b,30)
218 &mov(&swtmp($n%16),$f) if($n<77);# xi=f
219 &rotl($a,5); # ROTATE(a,5)
220 &xor($b,$c) if($n==39);# warm up for BODY_40_59
221 &and($tmp1,$b) if($n==39);
222 &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY
223 &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
224 &add($f,$a); # f+=ROTATE(a,5)
225 &rotr($a,5) if ($n==79);
226} else {
227 &mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d)
228 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
229 &xor($tmp1,$c);
230 &xor($f,&swtmp(($n+8)%16));
231 &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d)
232 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
233 &rotl($f,1); # f=ROTATE(f,1)
234 &add($e,$tmp1); # e+=F_20_39(b,c,d)
235 &rotr($b,2); # b=ROTATE(b,30)
236 &mov($tmp1,$a);
237 &rotl($tmp1,5); # ROTATE(a,5)
238 &mov(&swtmp($n%16),$f) if($n<77);# xi=f
239 &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY
240 &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
241 &add($f,$tmp1); # f+=ROTATE(a,5)
242}
243 }
244
245sub BODY_40_59
246 {
247 local($n,$a,$b,$c,$d,$e,$f)=@_;
248
249 &comment("40_59 $n");
250
251if ($alt) {
252 &add($e,$tmp1); # e+=b&(c^d)
253 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
254 &mov($tmp1,$d);
255 &xor($f,&swtmp(($n+8)%16));
256 &xor($c,$d); # restore $c
257 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
258 &rotl($f,1); # f=ROTATE(f,1)
259 &and($tmp1,$c);
260 &rotr($b,7); # b=ROTATE(b,30)
261 &add($e,$tmp1); # e+=c&d
262 &mov($tmp1,$a); # b in next round
263 &mov(&swtmp($n%16),$f); # xi=f
264 &rotl($a,5); # ROTATE(a,5)
265 &xor($b,$c) if ($n<59);
266 &and($tmp1,$b) if ($n<59);# tmp1 to hold F_40_59(b,c,d)
267 &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d))
268 &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
269 &add($f,$a); # f+=ROTATE(a,5)
270} else {
271 &mov($tmp1,$c); # tmp1 to hold F_40_59(b,c,d)
272 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
273 &xor($tmp1,$d);
274 &xor($f,&swtmp(($n+8)%16));
275 &and($tmp1,$b);
276 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
277 &rotl($f,1); # f=ROTATE(f,1)
278 &add($tmp1,$e); # b&(c^d)+=e
279 &rotr($b,2); # b=ROTATE(b,30)
280 &mov($e,$a); # e becomes volatile
281 &rotl($e,5); # ROTATE(a,5)
282 &mov(&swtmp($n%16),$f); # xi=f
283 &lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d))
284 &mov($tmp1,$c);
285 &add($f,$e); # f+=ROTATE(a,5)
286 &and($tmp1,$d);
287 &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
288 &add($f,$tmp1); # f+=c&d
289}
290 }
291
292&function_begin("sha1_block_data_order");
293if ($xmm) {
294 &static_label("ssse3_shortcut");
295 &static_label("avx_shortcut") if ($ymm);
296 &static_label("K_XX_XX");
297
298 &call (&label("pic_point")); # make it PIC!
299 &set_label("pic_point");
300 &blindpop($tmp1);
301 &picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point"));
302 &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
303
304 &mov ($A,&DWP(0,$T));
305 &mov ($D,&DWP(4,$T));
306 &test ($D,1<<9); # check SSSE3 bit
307 &jz (&label("x86"));
308 &test ($A,1<<24); # check FXSR bit
309 &jz (&label("x86"));
310 if ($ymm) {
311 &and ($D,1<<28); # mask AVX bit
312 &and ($A,1<<30); # mask "Intel CPU" bit
313 &or ($A,$D);
314 &cmp ($A,1<<28|1<<30);
315 &je (&label("avx_shortcut"));
316 }
317 &jmp (&label("ssse3_shortcut"));
318 &set_label("x86",16);
319}
320 &mov($tmp1,&wparam(0)); # SHA_CTX *c
321 &mov($T,&wparam(1)); # const void *input
322 &mov($A,&wparam(2)); # size_t num
323 &stack_push(16+3); # allocate X[16]
324 &shl($A,6);
325 &add($A,$T);
326 &mov(&wparam(2),$A); # pointer beyond the end of input
327 &mov($E,&DWP(16,$tmp1));# pre-load E
328 &jmp(&label("loop"));
329
330&set_label("loop",16);
331
332 # copy input chunk to X, but reversing byte order!
333 for ($i=0; $i<16; $i+=4)
334 {
335 &mov($A,&DWP(4*($i+0),$T));
336 &mov($B,&DWP(4*($i+1),$T));
337 &mov($C,&DWP(4*($i+2),$T));
338 &mov($D,&DWP(4*($i+3),$T));
339 &bswap($A);
340 &bswap($B);
341 &bswap($C);
342 &bswap($D);
343 &mov(&swtmp($i+0),$A);
344 &mov(&swtmp($i+1),$B);
345 &mov(&swtmp($i+2),$C);
346 &mov(&swtmp($i+3),$D);
347 }
348 &mov(&wparam(1),$T); # redundant in 1st spin
349
350 &mov($A,&DWP(0,$tmp1)); # load SHA_CTX
351 &mov($B,&DWP(4,$tmp1));
352 &mov($C,&DWP(8,$tmp1));
353 &mov($D,&DWP(12,$tmp1));
354 # E is pre-loaded
355
356 for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
357 for(;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
358 for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
359 for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
360 for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
361
362 (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check
363
364 &mov($tmp1,&wparam(0)); # re-load SHA_CTX*
365 &mov($D,&wparam(1)); # D is last "T" and is discarded
366
367 &add($E,&DWP(0,$tmp1)); # E is last "A"...
368 &add($T,&DWP(4,$tmp1));
369 &add($A,&DWP(8,$tmp1));
370 &add($B,&DWP(12,$tmp1));
371 &add($C,&DWP(16,$tmp1));
372
373 &mov(&DWP(0,$tmp1),$E); # update SHA_CTX
374 &add($D,64); # advance input pointer
375 &mov(&DWP(4,$tmp1),$T);
376 &cmp($D,&wparam(2)); # have we reached the end yet?
377 &mov(&DWP(8,$tmp1),$A);
378 &mov($E,$C); # C is last "E" which needs to be "pre-loaded"
379 &mov(&DWP(12,$tmp1),$B);
380 &mov($T,$D); # input pointer
381 &mov(&DWP(16,$tmp1),$C);
382 &jb(&label("loop"));
383
384 &stack_pop(16+3);
385&function_end("sha1_block_data_order");
386
387if ($xmm) {
388######################################################################
389# The SSSE3 implementation.
390#
391# %xmm[0-7] are used as a ring @X[] buffer containing quadruples of the
392# last 32 elements of the message schedule or Xupdate outputs. The first 4
393# quadruples are simply byte-swapped input, the next 4 are calculated
394# according to the method originally suggested by Dean Gaudet (modulo
395# being implemented in SSSE3). Once 8 quadruples or 32 elements are
396# collected, it switches to the routine proposed by Max Locktyukhin.
397#
398# Calculations inevitably require temporary registers, and there are
399# no %xmm registers left to spare. For this reason part of the ring
400# buffer, X[2..4] to be specific, is offloaded to a 3-quadruple ring
401# buffer on the stack. Keep in mind that X[2] is an alias for X[-6],
402# X[3] for X[-5], and X[4] for X[-4]...
403#
404# Another notable optimization is aggressive stack frame compression
405# aiming to minimize the number of 9-byte instructions...
406#
407# Yet another notable optimization is the "jumping" $B variable: no
408# register is permanently allocated for the $B value, which makes it
409# possible to eliminate one instruction from body_20_39...
410#
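# A hedged aside (illustration only, not used by the generator): the ring
# bookkeeping relies on rotating the Perl-level @X array itself, so a
# fixed index such as -4&7 (== 4) keeps naming the quadruple four steps
# behind the current "X[0]", whichever %xmm register that happens to be.
sub demo_ring_rotation {
	my @ring = map { "xmm$_" } (4..7, 0..3);	# same seeding as @X below
	for my $step (1 .. 3) {
		printf("step %d: X[0]=%s X[-4]=%s\n", $step, $ring[0], $ring[-4 & 7]);
		push(@ring, shift(@ring));		# "rotate" X[], as the real code does
	}
}
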
411my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded
412my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
413my @V=($A,$B,$C,$D,$E);
414my $j=0; # hash round
415my @T=($T,$tmp1);
416my $inp;
417
418my $_rol=sub { &rol(@_) };
419my $_ror=sub { &ror(@_) };
420
421&function_begin("_sha1_block_data_order_ssse3");
422 &call (&label("pic_point")); # make it PIC!
423 &set_label("pic_point");
424 &blindpop($tmp1);
425 &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
426&set_label("ssse3_shortcut");
427
428 &movdqa (@X[3],&QWP(0,$tmp1)); # K_00_19
429 &movdqa (@X[4],&QWP(16,$tmp1)); # K_20_39
430 &movdqa (@X[5],&QWP(32,$tmp1)); # K_40_59
431 &movdqa (@X[6],&QWP(48,$tmp1)); # K_60_79
432 &movdqa (@X[2],&QWP(64,$tmp1)); # pbswap mask
433
434 &mov ($E,&wparam(0)); # load argument block
435 &mov ($inp=@T[1],&wparam(1));
436 &mov ($D,&wparam(2));
437 &mov (@T[0],"esp");
438
439 # stack frame layout
440 #
441 # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area
442 # X[4]+K X[5]+K X[6]+K X[7]+K
443 # X[8]+K X[9]+K X[10]+K X[11]+K
444 # X[12]+K X[13]+K X[14]+K X[15]+K
445 #
446 # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area
447 # X[4] X[5] X[6] X[7]
448 # X[8] X[9] X[10] X[11] # even borrowed for K_00_19
449 #
450 # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants
451 # K_40_59 K_40_59 K_40_59 K_40_59
452 # K_60_79 K_60_79 K_60_79 K_60_79
453 # K_00_19 K_00_19 K_00_19 K_00_19
454 # pbswap mask
455 #
456 # +192 ctx # argument block
457 # +196 inp
458 # +200 end
459 # +204 esp
460 &sub ("esp",208);
461 &and ("esp",-64);
462
463 &movdqa (&QWP(112+0,"esp"),@X[4]); # copy constants
464 &movdqa (&QWP(112+16,"esp"),@X[5]);
465 &movdqa (&QWP(112+32,"esp"),@X[6]);
466 &shl ($D,6); # len*64
467 &movdqa (&QWP(112+48,"esp"),@X[3]);
468 &add ($D,$inp); # end of input
469 &movdqa (&QWP(112+64,"esp"),@X[2]);
470 &add ($inp,64);
471 &mov (&DWP(192+0,"esp"),$E); # save argument block
472 &mov (&DWP(192+4,"esp"),$inp);
473 &mov (&DWP(192+8,"esp"),$D);
474 &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp
475
476 &mov ($A,&DWP(0,$E)); # load context
477 &mov ($B,&DWP(4,$E));
478 &mov ($C,&DWP(8,$E));
479 &mov ($D,&DWP(12,$E));
480 &mov ($E,&DWP(16,$E));
481 &mov (@T[0],$B); # magic seed
482
483 &movdqu (@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3]
484 &movdqu (@X[-3&7],&QWP(-48,$inp));
485 &movdqu (@X[-2&7],&QWP(-32,$inp));
486 &movdqu (@X[-1&7],&QWP(-16,$inp));
487 &pshufb (@X[-4&7],@X[2]); # byte swap
488 &pshufb (@X[-3&7],@X[2]);
489 &pshufb (@X[-2&7],@X[2]);
490 &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
491 &pshufb (@X[-1&7],@X[2]);
492 &paddd (@X[-4&7],@X[3]); # add K_00_19
493 &paddd (@X[-3&7],@X[3]);
494 &paddd (@X[-2&7],@X[3]);
495 &movdqa (&QWP(0,"esp"),@X[-4&7]); # X[]+K xfer to IALU
496 &psubd (@X[-4&7],@X[3]); # restore X[]
497 &movdqa (&QWP(0+16,"esp"),@X[-3&7]);
498 &psubd (@X[-3&7],@X[3]);
499 &movdqa (&QWP(0+32,"esp"),@X[-2&7]);
500 &psubd (@X[-2&7],@X[3]);
501 &movdqa (@X[0],@X[-3&7]);
502 &jmp (&label("loop"));
503
504######################################################################
505# The SSE instruction sequence is first broken into groups of independent
506# instructions, independent with respect to their inputs and the shifter
507# (not all architectures have more than one). Then IALU instructions
508# are "knitted in" between the SSE groups. Distance is maintained for
509# an SSE latency of 2, in the hope that it fits the upcoming AMD Bulldozer
510# [which allegedly also implements SSSE3] better...
511#
512# Temporary register usage: X[2] is volatile at the entry and at the
513# end is restored from the backtrace ring buffer. X[3] is expected to
514# contain the current K_XX_XX constant and is used to calculate X[-1]+K
515# from the previous round; it becomes volatile the moment the value is
516# saved to the stack for transfer to the IALU. X[4] becomes volatile whenever
517# X[-4] is accumulated and offloaded to the backtrace ring buffer; at the
518# end it is loaded with the next K_XX_XX [which becomes X[3] in the next
519# round]...
520#
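# A hedged miniature (illustration only, not used by the generator) of
# the "knitting" pattern employed below: the scalar round body is handed
# in as a list of code strings, and eval'ing a few of them between SIMD
# statements is what interleaves the IALU work with the SSE groups.
sub demo_knitting {
	my @insns = ('print "ialu op 1\n";', 'print "ialu op 2\n";', 'print "ialu op 3\n";');
	print "simd group A\n";
	eval(shift(@insns));		# knit one scalar op in
	eval(shift(@insns));
	print "simd group B\n";
	foreach (@insns) { eval; }	# flush whatever scalar ops remain
}
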
521sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
522{ use integer;
523 my $body = shift;
524 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
525 my ($a,$b,$c,$d,$e);
526
527 eval(shift(@insns));
528 eval(shift(@insns));
529 &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
530 &movdqa (@X[2],@X[-1&7]);
531 eval(shift(@insns));
532 eval(shift(@insns));
533
534 &paddd (@X[3],@X[-1&7]);
535 &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
536 eval(shift(@insns));
537 eval(shift(@insns));
538 &psrldq (@X[2],4); # "X[-3]", 3 dwords
539 eval(shift(@insns));
540 eval(shift(@insns));
541 &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
542 eval(shift(@insns));
543 eval(shift(@insns));
544
545 &pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]"
546 eval(shift(@insns));
547 eval(shift(@insns));
548 eval(shift(@insns));
549 eval(shift(@insns));
550
551 &pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]"
552 eval(shift(@insns));
553 eval(shift(@insns));
554 &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
555 eval(shift(@insns));
556 eval(shift(@insns));
557
558 &movdqa (@X[4],@X[0]);
559 &movdqa (@X[2],@X[0]);
560 eval(shift(@insns));
561 eval(shift(@insns));
562 eval(shift(@insns));
563 eval(shift(@insns));
564
565 &pslldq (@X[4],12); # "X[0]"<<96, extract one dword
566 &paddd (@X[0],@X[0]);
567 eval(shift(@insns));
568 eval(shift(@insns));
569 eval(shift(@insns));
570 eval(shift(@insns));
571
572 &psrld (@X[2],31);
573 eval(shift(@insns));
574 eval(shift(@insns));
575 &movdqa (@X[3],@X[4]);
576 eval(shift(@insns));
577 eval(shift(@insns));
578
579 &psrld (@X[4],30);
580 &por (@X[0],@X[2]); # "X[0]"<<<=1
581 eval(shift(@insns));
582 eval(shift(@insns));
583 &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
584 eval(shift(@insns));
585 eval(shift(@insns));
586
587 &pslld (@X[3],2);
588 &pxor (@X[0],@X[4]);
589 eval(shift(@insns));
590 eval(shift(@insns));
591 &movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
592 eval(shift(@insns));
593 eval(shift(@insns));
594
595 &pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2
596 &movdqa (@X[1],@X[-2&7]) if ($Xi<7);
597 eval(shift(@insns));
598 eval(shift(@insns));
599
600 foreach (@insns) { eval; } # remaining instructions [if any]
601
602 $Xi++; push(@X,shift(@X)); # "rotate" X[]
603}
604
605sub Xupdate_ssse3_32_79()
606{ use integer;
607 my $body = shift;
608 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
609 my ($a,$b,$c,$d,$e);
610
611 &movdqa (@X[2],@X[-1&7]) if ($Xi==8);
612 eval(shift(@insns)); # body_20_39
613 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
614 &palignr(@X[2],@X[-2&7],8); # compose "X[-6]"
615 eval(shift(@insns));
616 eval(shift(@insns));
617 eval(shift(@insns)); # rol
618
619 &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
620 &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer
621 eval(shift(@insns));
622 eval(shift(@insns));
623 if ($Xi%5) {
624 &movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX...
625 } else { # ... or load next one
626 &movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
627 }
628 &paddd (@X[3],@X[-1&7]);
629 eval(shift(@insns)); # ror
630 eval(shift(@insns));
631
632 &pxor (@X[0],@X[2]); # "X[0]"^="X[-6]"
633 eval(shift(@insns)); # body_20_39
634 eval(shift(@insns));
635 eval(shift(@insns));
636 eval(shift(@insns)); # rol
637
638 &movdqa (@X[2],@X[0]);
639 &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
640 eval(shift(@insns));
641 eval(shift(@insns));
642 eval(shift(@insns)); # ror
643 eval(shift(@insns));
644
645 &pslld (@X[0],2);
646 eval(shift(@insns)); # body_20_39
647 eval(shift(@insns));
648 &psrld (@X[2],30);
649 eval(shift(@insns));
650 eval(shift(@insns)); # rol
651 eval(shift(@insns));
652 eval(shift(@insns));
653 eval(shift(@insns)); # ror
654 eval(shift(@insns));
655
656 &por (@X[0],@X[2]); # "X[0]"<<<=2
657 eval(shift(@insns)); # body_20_39
658 eval(shift(@insns));
659 &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer
660 eval(shift(@insns));
661 eval(shift(@insns)); # rol
662 eval(shift(@insns));
663 eval(shift(@insns));
664 eval(shift(@insns)); # ror
665 &movdqa (@X[3],@X[0]) if ($Xi<19);
666 eval(shift(@insns));
667
668 foreach (@insns) { eval; } # remaining instructions
669
670 $Xi++; push(@X,shift(@X)); # "rotate" X[]
671}
672
673sub Xuplast_ssse3_80()
674{ use integer;
675 my $body = shift;
676 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
677 my ($a,$b,$c,$d,$e);
678
679 eval(shift(@insns));
680 &paddd (@X[3],@X[-1&7]);
681 eval(shift(@insns));
682 eval(shift(@insns));
683 eval(shift(@insns));
684 eval(shift(@insns));
685
686 &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU
687
688 foreach (@insns) { eval; } # remaining instructions
689
690 &mov ($inp=@T[1],&DWP(192+4,"esp"));
691 &cmp ($inp,&DWP(192+8,"esp"));
692 &je (&label("done"));
693
694 &movdqa (@X[3],&QWP(112+48,"esp")); # K_00_19
695 &movdqa (@X[2],&QWP(112+64,"esp")); # pbswap mask
696 &movdqu (@X[-4&7],&QWP(0,$inp)); # load input
697 &movdqu (@X[-3&7],&QWP(16,$inp));
698 &movdqu (@X[-2&7],&QWP(32,$inp));
699 &movdqu (@X[-1&7],&QWP(48,$inp));
700 &add ($inp,64);
701 &pshufb (@X[-4&7],@X[2]); # byte swap
702 &mov (&DWP(192+4,"esp"),$inp);
703 &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
704
705 $Xi=0;
706}
707
708sub Xloop_ssse3()
709{ use integer;
710 my $body = shift;
711 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
712 my ($a,$b,$c,$d,$e);
713
714 eval(shift(@insns));
715 eval(shift(@insns));
716 &pshufb (@X[($Xi-3)&7],@X[2]);
717 eval(shift(@insns));
718 eval(shift(@insns));
719 &paddd (@X[($Xi-4)&7],@X[3]);
720 eval(shift(@insns));
721 eval(shift(@insns));
722 eval(shift(@insns));
723 eval(shift(@insns));
724 &movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU
725 eval(shift(@insns));
726 eval(shift(@insns));
727 &psubd (@X[($Xi-4)&7],@X[3]);
728
729 foreach (@insns) { eval; }
730 $Xi++;
731}
732
733sub Xtail_ssse3()
734{ use integer;
735 my $body = shift;
736 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
737 my ($a,$b,$c,$d,$e);
738
739 foreach (@insns) { eval; }
740}
741
742sub body_00_19 () {
743 (
744 '($a,$b,$c,$d,$e)=@V;'.
745 '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer
746 '&xor ($c,$d);',
747 '&mov (@T[1],$a);', # $b in next round
748 '&$_rol ($a,5);',
749 '&and (@T[0],$c);', # ($b&($c^$d))
750 '&xor ($c,$d);', # restore $c
751 '&xor (@T[0],$d);',
752 '&add ($e,$a);',
753 '&$_ror ($b,$j?7:2);', # $b>>>2
754 '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
755 );
756}
757
758sub body_20_39 () {
759 (
760 '($a,$b,$c,$d,$e)=@V;'.
761 '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
762 '&xor (@T[0],$d);', # ($b^$d)
763 '&mov (@T[1],$a);', # $b in next round
764 '&$_rol ($a,5);',
765 '&xor (@T[0],$c);', # ($b^$d^$c)
766 '&add ($e,$a);',
767 '&$_ror ($b,7);', # $b>>>2
768 '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
769 );
770}
771
772sub body_40_59 () {
773 (
774 '($a,$b,$c,$d,$e)=@V;'.
775 '&mov (@T[1],$c);',
776 '&xor ($c,$d);',
777 '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
778 '&and (@T[1],$d);',
779 '&and (@T[0],$c);', # ($b&($c^$d))
780 '&$_ror ($b,7);', # $b>>>2
781 '&add ($e,@T[1]);',
782 '&mov (@T[1],$a);', # $b in next round
783 '&$_rol ($a,5);',
784 '&add ($e,@T[0]);',
785 '&xor ($c,$d);', # restore $c
786 '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
787 );
788}
789
790&set_label("loop",16);
791 &Xupdate_ssse3_16_31(\&body_00_19);
792 &Xupdate_ssse3_16_31(\&body_00_19);
793 &Xupdate_ssse3_16_31(\&body_00_19);
794 &Xupdate_ssse3_16_31(\&body_00_19);
795 &Xupdate_ssse3_32_79(\&body_00_19);
796 &Xupdate_ssse3_32_79(\&body_20_39);
797 &Xupdate_ssse3_32_79(\&body_20_39);
798 &Xupdate_ssse3_32_79(\&body_20_39);
799 &Xupdate_ssse3_32_79(\&body_20_39);
800 &Xupdate_ssse3_32_79(\&body_20_39);
801 &Xupdate_ssse3_32_79(\&body_40_59);
802 &Xupdate_ssse3_32_79(\&body_40_59);
803 &Xupdate_ssse3_32_79(\&body_40_59);
804 &Xupdate_ssse3_32_79(\&body_40_59);
805 &Xupdate_ssse3_32_79(\&body_40_59);
806 &Xupdate_ssse3_32_79(\&body_20_39);
807 &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
808
809 $saved_j=$j; @saved_V=@V;
810
811 &Xloop_ssse3(\&body_20_39);
812 &Xloop_ssse3(\&body_20_39);
813 &Xloop_ssse3(\&body_20_39);
814
815 &mov (@T[1],&DWP(192,"esp")); # update context
816 &add ($A,&DWP(0,@T[1]));
817 &add (@T[0],&DWP(4,@T[1])); # $b
818 &add ($C,&DWP(8,@T[1]));
819 &mov (&DWP(0,@T[1]),$A);
820 &add ($D,&DWP(12,@T[1]));
821 &mov (&DWP(4,@T[1]),@T[0]);
822 &add ($E,&DWP(16,@T[1]));
823 &mov (&DWP(8,@T[1]),$C);
824 &mov ($B,@T[0]);
825 &mov (&DWP(12,@T[1]),$D);
826 &mov (&DWP(16,@T[1]),$E);
827 &movdqa (@X[0],@X[-3&7]);
828
829 &jmp (&label("loop"));
830
831&set_label("done",16); $j=$saved_j; @V=@saved_V;
832
833 &Xtail_ssse3(\&body_20_39);
834 &Xtail_ssse3(\&body_20_39);
835 &Xtail_ssse3(\&body_20_39);
836
837 &mov (@T[1],&DWP(192,"esp")); # update context
838 &add ($A,&DWP(0,@T[1]));
839 &mov ("esp",&DWP(192+12,"esp")); # restore %esp
840 &add (@T[0],&DWP(4,@T[1])); # $b
841 &add ($C,&DWP(8,@T[1]));
842 &mov (&DWP(0,@T[1]),$A);
843 &add ($D,&DWP(12,@T[1]));
844 &mov (&DWP(4,@T[1]),@T[0]);
845 &add ($E,&DWP(16,@T[1]));
846 &mov (&DWP(8,@T[1]),$C);
847 &mov (&DWP(12,@T[1]),$D);
848 &mov (&DWP(16,@T[1]),$E);
849
850&function_end("_sha1_block_data_order_ssse3");
851
852if ($ymm) {
853my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded
854my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
855my @V=($A,$B,$C,$D,$E);
856my $j=0; # hash round
857my @T=($T,$tmp1);
858my $inp;
859
860my $_rol=sub { &shld(@_[0],@_) };
861my $_ror=sub { &shrd(@_[0],@_) };
862
863&function_begin("_sha1_block_data_order_avx");
864 &call (&label("pic_point")); # make it PIC!
865 &set_label("pic_point");
866 &blindpop($tmp1);
867 &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
868&set_label("avx_shortcut");
869 &vzeroall();
870
871 &vmovdqa(@X[3],&QWP(0,$tmp1)); # K_00_19
872 &vmovdqa(@X[4],&QWP(16,$tmp1)); # K_20_39
873 &vmovdqa(@X[5],&QWP(32,$tmp1)); # K_40_59
874 &vmovdqa(@X[6],&QWP(48,$tmp1)); # K_60_79
875 &vmovdqa(@X[2],&QWP(64,$tmp1)); # pbswap mask
876
877 &mov ($E,&wparam(0)); # load argument block
878 &mov ($inp=@T[1],&wparam(1));
879 &mov ($D,&wparam(2));
880 &mov (@T[0],"esp");
881
882 # stack frame layout
883 #
884 # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area
885 # X[4]+K X[5]+K X[6]+K X[7]+K
886 # X[8]+K X[9]+K X[10]+K X[11]+K
887 # X[12]+K X[13]+K X[14]+K X[15]+K
888 #
889 # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area
890 # X[4] X[5] X[6] X[7]
891 # X[8] X[9] X[10] X[11] # even borrowed for K_00_19
892 #
893 # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants
894 # K_40_59 K_40_59 K_40_59 K_40_59
895 # K_60_79 K_60_79 K_60_79 K_60_79
896 # K_00_19 K_00_19 K_00_19 K_00_19
897 # pbswap mask
898 #
899 # +192 ctx # argument block
900 # +196 inp
901 # +200 end
902 # +204 esp
903 &sub ("esp",208);
904 &and ("esp",-64);
905
906 &vmovdqa(&QWP(112+0,"esp"),@X[4]); # copy constants
907 &vmovdqa(&QWP(112+16,"esp"),@X[5]);
908 &vmovdqa(&QWP(112+32,"esp"),@X[6]);
909 &shl ($D,6); # len*64
910 &vmovdqa(&QWP(112+48,"esp"),@X[3]);
911 &add ($D,$inp); # end of input
912 &vmovdqa(&QWP(112+64,"esp"),@X[2]);
913 &add ($inp,64);
914 &mov (&DWP(192+0,"esp"),$E); # save argument block
915 &mov (&DWP(192+4,"esp"),$inp);
916 &mov (&DWP(192+8,"esp"),$D);
917 &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp
918
919 &mov ($A,&DWP(0,$E)); # load context
920 &mov ($B,&DWP(4,$E));
921 &mov ($C,&DWP(8,$E));
922 &mov ($D,&DWP(12,$E));
923 &mov ($E,&DWP(16,$E));
924 &mov (@T[0],$B); # magic seed
925
926 &vmovdqu(@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3]
927 &vmovdqu(@X[-3&7],&QWP(-48,$inp));
928 &vmovdqu(@X[-2&7],&QWP(-32,$inp));
929 &vmovdqu(@X[-1&7],&QWP(-16,$inp));
930 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
931 &vpshufb(@X[-3&7],@X[-3&7],@X[2]);
932 &vpshufb(@X[-2&7],@X[-2&7],@X[2]);
933 &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
934 &vpshufb(@X[-1&7],@X[-1&7],@X[2]);
935 &vpaddd (@X[0],@X[-4&7],@X[3]); # add K_00_19
936 &vpaddd (@X[1],@X[-3&7],@X[3]);
937 &vpaddd (@X[2],@X[-2&7],@X[3]);
938 &vmovdqa(&QWP(0,"esp"),@X[0]); # X[]+K xfer to IALU
939 &vmovdqa(&QWP(0+16,"esp"),@X[1]);
940 &vmovdqa(&QWP(0+32,"esp"),@X[2]);
941 &jmp (&label("loop"));
942
943sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
944{ use integer;
945 my $body = shift;
946 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
947 my ($a,$b,$c,$d,$e);
948
949 eval(shift(@insns));
950 eval(shift(@insns));
951 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
952 eval(shift(@insns));
953 eval(shift(@insns));
954
955 &vpaddd (@X[3],@X[3],@X[-1&7]);
956 &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
957 eval(shift(@insns));
958 eval(shift(@insns));
959 &vpsrldq(@X[2],@X[-1&7],4); # "X[-3]", 3 dwords
960 eval(shift(@insns));
961 eval(shift(@insns));
962 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
963 eval(shift(@insns));
964 eval(shift(@insns));
965
966 &vpxor (@X[2],@X[2],@X[-2&7]); # "X[-3]"^"X[-8]"
967 eval(shift(@insns));
968 eval(shift(@insns));
969 &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
970 eval(shift(@insns));
971 eval(shift(@insns));
972
973 &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]"
974 eval(shift(@insns));
975 eval(shift(@insns));
976 eval(shift(@insns));
977 eval(shift(@insns));
978
979 &vpsrld (@X[2],@X[0],31);
980 eval(shift(@insns));
981 eval(shift(@insns));
982 eval(shift(@insns));
983 eval(shift(@insns));
984
985 &vpslldq(@X[4],@X[0],12); # "X[0]"<<96, extract one dword
986 &vpaddd (@X[0],@X[0],@X[0]);
987 eval(shift(@insns));
988 eval(shift(@insns));
989 eval(shift(@insns));
990 eval(shift(@insns));
991
992 &vpsrld (@X[3],@X[4],30);
993 &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=1
994 eval(shift(@insns));
995 eval(shift(@insns));
996 eval(shift(@insns));
997 eval(shift(@insns));
998
999 &vpslld (@X[4],@X[4],2);
1000 &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
1001 eval(shift(@insns));
1002 eval(shift(@insns));
1003 &vpxor (@X[0],@X[0],@X[3]);
1004 eval(shift(@insns));
1005 eval(shift(@insns));
1006 eval(shift(@insns));
1007 eval(shift(@insns));
1008
1009 &vpxor (@X[0],@X[0],@X[4]); # "X[0]"^=("X[0]"<<96)<<<2
1010 eval(shift(@insns));
1011 eval(shift(@insns));
1012 &vmovdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
1013 eval(shift(@insns));
1014 eval(shift(@insns));
1015
1016 foreach (@insns) { eval; } # remaining instructions [if any]
1017
1018 $Xi++; push(@X,shift(@X)); # "rotate" X[]
1019}
1020
1021sub Xupdate_avx_32_79()
1022{ use integer;
1023 my $body = shift;
1024 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
1025 my ($a,$b,$c,$d,$e);
1026
1027 &vpalignr(@X[2],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
1028 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
1029 eval(shift(@insns)); # body_20_39
1030 eval(shift(@insns));
1031 eval(shift(@insns));
1032 eval(shift(@insns)); # rol
1033
1034 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
1035 &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer
1036 eval(shift(@insns));
1037 eval(shift(@insns));
1038 if ($Xi%5) {
1039 &vmovdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX...
1040 } else { # ... or load next one
1041 &vmovdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
1042 }
1043 &vpaddd (@X[3],@X[3],@X[-1&7]);
1044 eval(shift(@insns)); # ror
1045 eval(shift(@insns));
1046
1047 &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-6]"
1048 eval(shift(@insns)); # body_20_39
1049 eval(shift(@insns));
1050 eval(shift(@insns));
1051 eval(shift(@insns)); # rol
1052
1053 &vpsrld (@X[2],@X[0],30);
1054 &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
1055 eval(shift(@insns));
1056 eval(shift(@insns));
1057 eval(shift(@insns)); # ror
1058 eval(shift(@insns));
1059
1060 &vpslld (@X[0],@X[0],2);
1061 eval(shift(@insns)); # body_20_39
1062 eval(shift(@insns));
1063 eval(shift(@insns));
1064 eval(shift(@insns)); # rol
1065 eval(shift(@insns));
1066 eval(shift(@insns));
1067 eval(shift(@insns)); # ror
1068 eval(shift(@insns));
1069
1070 &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=2
1071 eval(shift(@insns)); # body_20_39
1072 eval(shift(@insns));
1073 &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer
1074 eval(shift(@insns));
1075 eval(shift(@insns)); # rol
1076 eval(shift(@insns));
1077 eval(shift(@insns));
1078 eval(shift(@insns)); # ror
1079 eval(shift(@insns));
1080
1081 foreach (@insns) { eval; } # remaining instructions
1082
1083 $Xi++; push(@X,shift(@X)); # "rotate" X[]
1084}
1085
1086sub Xuplast_avx_80()
1087{ use integer;
1088 my $body = shift;
1089 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
1090 my ($a,$b,$c,$d,$e);
1091
1092 eval(shift(@insns));
1093 &vpaddd (@X[3],@X[3],@X[-1&7]);
1094 eval(shift(@insns));
1095 eval(shift(@insns));
1096 eval(shift(@insns));
1097 eval(shift(@insns));
1098
1099 &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU
1100
1101 foreach (@insns) { eval; } # remaining instructions
1102
1103 &mov ($inp=@T[1],&DWP(192+4,"esp"));
1104 &cmp ($inp,&DWP(192+8,"esp"));
1105 &je (&label("done"));
1106
1107 &vmovdqa(@X[3],&QWP(112+48,"esp")); # K_00_19
1108 &vmovdqa(@X[2],&QWP(112+64,"esp")); # pbswap mask
1109 &vmovdqu(@X[-4&7],&QWP(0,$inp)); # load input
1110 &vmovdqu(@X[-3&7],&QWP(16,$inp));
1111 &vmovdqu(@X[-2&7],&QWP(32,$inp));
1112 &vmovdqu(@X[-1&7],&QWP(48,$inp));
1113 &add ($inp,64);
1114 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
1115 &mov (&DWP(192+4,"esp"),$inp);
1116 &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
1117
1118 $Xi=0;
1119}
1120
1121sub Xloop_avx()
1122{ use integer;
1123 my $body = shift;
1124 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
1125 my ($a,$b,$c,$d,$e);
1126
1127 eval(shift(@insns));
1128 eval(shift(@insns));
1129 &vpshufb (@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
1130 eval(shift(@insns));
1131 eval(shift(@insns));
1132 &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@X[3]);
1133 eval(shift(@insns));
1134 eval(shift(@insns));
1135 eval(shift(@insns));
1136 eval(shift(@insns));
1137 &vmovdqa (&QWP(0+16*$Xi,"esp"),@X[$Xi&7]); # X[]+K xfer to IALU
1138 eval(shift(@insns));
1139 eval(shift(@insns));
1140
1141 foreach (@insns) { eval; }
1142 $Xi++;
1143}
1144
1145sub Xtail_avx()
1146{ use integer;
1147 my $body = shift;
1148 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
1149 my ($a,$b,$c,$d,$e);
1150
1151 foreach (@insns) { eval; }
1152}
1153
1154&set_label("loop",16);
1155 &Xupdate_avx_16_31(\&body_00_19);
1156 &Xupdate_avx_16_31(\&body_00_19);
1157 &Xupdate_avx_16_31(\&body_00_19);
1158 &Xupdate_avx_16_31(\&body_00_19);
1159 &Xupdate_avx_32_79(\&body_00_19);
1160 &Xupdate_avx_32_79(\&body_20_39);
1161 &Xupdate_avx_32_79(\&body_20_39);
1162 &Xupdate_avx_32_79(\&body_20_39);
1163 &Xupdate_avx_32_79(\&body_20_39);
1164 &Xupdate_avx_32_79(\&body_20_39);
1165 &Xupdate_avx_32_79(\&body_40_59);
1166 &Xupdate_avx_32_79(\&body_40_59);
1167 &Xupdate_avx_32_79(\&body_40_59);
1168 &Xupdate_avx_32_79(\&body_40_59);
1169 &Xupdate_avx_32_79(\&body_40_59);
1170 &Xupdate_avx_32_79(\&body_20_39);
1171 &Xuplast_avx_80(\&body_20_39); # can jump to "done"
1172
1173 $saved_j=$j; @saved_V=@V;
1174
1175 &Xloop_avx(\&body_20_39);
1176 &Xloop_avx(\&body_20_39);
1177 &Xloop_avx(\&body_20_39);
1178
1179 &mov (@T[1],&DWP(192,"esp")); # update context
1180 &add ($A,&DWP(0,@T[1]));
1181 &add (@T[0],&DWP(4,@T[1])); # $b
1182 &add ($C,&DWP(8,@T[1]));
1183 &mov (&DWP(0,@T[1]),$A);
1184 &add ($D,&DWP(12,@T[1]));
1185 &mov (&DWP(4,@T[1]),@T[0]);
1186 &add ($E,&DWP(16,@T[1]));
1187 &mov (&DWP(8,@T[1]),$C);
1188 &mov ($B,@T[0]);
1189 &mov (&DWP(12,@T[1]),$D);
1190 &mov (&DWP(16,@T[1]),$E);
1191
1192 &jmp (&label("loop"));
1193
1194&set_label("done",16); $j=$saved_j; @V=@saved_V;
1195
1196 &Xtail_avx(\&body_20_39);
1197 &Xtail_avx(\&body_20_39);
1198 &Xtail_avx(\&body_20_39);
1199
1200 &vzeroall();
1201
1202 &mov (@T[1],&DWP(192,"esp")); # update context
1203 &add ($A,&DWP(0,@T[1]));
1204 &mov ("esp",&DWP(192+12,"esp")); # restore %esp
1205 &add (@T[0],&DWP(4,@T[1])); # $b
1206 &add ($C,&DWP(8,@T[1]));
1207 &mov (&DWP(0,@T[1]),$A);
1208 &add ($D,&DWP(12,@T[1]));
1209 &mov (&DWP(4,@T[1]),@T[0]);
1210 &add ($E,&DWP(16,@T[1]));
1211 &mov (&DWP(8,@T[1]),$C);
1212 &mov (&DWP(12,@T[1]),$D);
1213 &mov (&DWP(16,@T[1]),$E);
1214&function_end("_sha1_block_data_order_avx");
1215}
1216&set_label("K_XX_XX",64);
1217&data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999); # K_00_19
1218&data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1); # K_20_39
1219&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc); # K_40_59
1220&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6); # K_60_79
1221&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # pbswap mask
1222}
1223&asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
1224
1225&asm_finish();
diff --git a/src/lib/libcrypto/sha/asm/sha1-alpha.pl b/src/lib/libcrypto/sha/asm/sha1-alpha.pl
deleted file mode 100644
index 44720c418c..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-alpha.pl
+++ /dev/null
@@ -1,317 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA1 block procedure for Alpha.
11
12# On 21264 performance is 33% better than code generated by the vendor
13# compiler, and 75% better than GCC [3.4]; in absolute terms it is
14# 8.7 cycles per processed byte. The implementation features a vectorized
15# byte swap, but not Xupdate.
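#
# A hedged sketch (illustration only, not used by the generator, assumes
# a 64-bit perl) of what "vectorized byte swap" means here: both 32-bit
# words sitting in one 64-bit register are byte-reversed at once with
# nothing but shifts and byte masks, which is the dataflow the
# srl/sll/zapnot sequence below implements.
sub bswap32x2 {
	my ($x) = @_;		# two 32-bit words packed in one 64-bit value
	return (($x >> 24) & 0x000000ff000000ff)	# byte 3 -> byte 0 in each word
	     | (($x >>  8) & 0x0000ff000000ff00)	# byte 2 -> byte 1
	     | (($x <<  8) & 0x00ff000000ff0000)	# byte 1 -> byte 2
	     | (($x << 24) & 0xff000000ff000000);	# byte 0 -> byte 3
}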
16
17@X=( "\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7",
18 "\$8", "\$9", "\$10", "\$11", "\$12", "\$13", "\$14", "\$15");
19$ctx="a0"; # $16
20$inp="a1";
21$num="a2";
22$A="a3";
23$B="a4"; # 20
24$C="a5";
25$D="t8";
26$E="t9"; @V=($A,$B,$C,$D,$E);
27$t0="t10"; # 24
28$t1="t11";
29$t2="ra";
30$t3="t12";
31$K="AT"; # 28
32
33sub BODY_00_19 {
34my ($i,$a,$b,$c,$d,$e)=@_;
35my $j=$i+1;
36$code.=<<___ if ($i==0);
37 ldq_u @X[0],0+0($inp)
38 ldq_u @X[1],0+7($inp)
39___
40$code.=<<___ if (!($i&1) && $i<14);
41 ldq_u @X[$i+2],($i+2)*4+0($inp)
42 ldq_u @X[$i+3],($i+2)*4+7($inp)
43___
44$code.=<<___ if (!($i&1) && $i<15);
45 extql @X[$i],$inp,@X[$i]
46 extqh @X[$i+1],$inp,@X[$i+1]
47
48 or @X[$i+1],@X[$i],@X[$i] # pair of 32-bit values are fetched
49
50 srl @X[$i],24,$t0 # vectorized byte swap
51 srl @X[$i],8,$t2
52
53 sll @X[$i],8,$t3
54 sll @X[$i],24,@X[$i]
55 zapnot $t0,0x11,$t0
56 zapnot $t2,0x22,$t2
57
58 zapnot @X[$i],0x88,@X[$i]
59 or $t0,$t2,$t0
60 zapnot $t3,0x44,$t3
61 sll $a,5,$t1
62
63 or @X[$i],$t0,@X[$i]
64 addl $K,$e,$e
65 and $b,$c,$t2
66 zapnot $a,0xf,$a
67
68 or @X[$i],$t3,@X[$i]
69 srl $a,27,$t0
70 bic $d,$b,$t3
71 sll $b,30,$b
72
73 extll @X[$i],4,@X[$i+1] # extract upper half
74 or $t2,$t3,$t2
75 addl @X[$i],$e,$e
76
77 addl $t1,$e,$e
78 srl $b,32,$t3
79 zapnot @X[$i],0xf,@X[$i]
80
81 addl $t0,$e,$e
82 addl $t2,$e,$e
83 or $t3,$b,$b
84___
85$code.=<<___ if (($i&1) && $i<15);
86 sll $a,5,$t1
87 addl $K,$e,$e
88 and $b,$c,$t2
89 zapnot $a,0xf,$a
90
91 srl $a,27,$t0
92 addl @X[$i%16],$e,$e
93 bic $d,$b,$t3
94 sll $b,30,$b
95
96 or $t2,$t3,$t2
97 addl $t1,$e,$e
98 srl $b,32,$t3
99 zapnot @X[$i],0xf,@X[$i]
100
101 addl $t0,$e,$e
102 addl $t2,$e,$e
103 or $t3,$b,$b
104___
105$code.=<<___ if ($i>=15); # with forward Xupdate
106 sll $a,5,$t1
107 addl $K,$e,$e
108 and $b,$c,$t2
109 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
110
111 zapnot $a,0xf,$a
112 addl @X[$i%16],$e,$e
113 bic $d,$b,$t3
114 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
115
116 srl $a,27,$t0
117 addl $t1,$e,$e
118 or $t2,$t3,$t2
119 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
120
121 sll $b,30,$b
122 addl $t0,$e,$e
123 srl @X[$j%16],31,$t1
124
125 addl $t2,$e,$e
126 srl $b,32,$t3
127 addl @X[$j%16],@X[$j%16],@X[$j%16]
128
129 or $t3,$b,$b
130 zapnot @X[$i%16],0xf,@X[$i%16]
131 or $t1,@X[$j%16],@X[$j%16]
132___
133}
134
135sub BODY_20_39 {
136my ($i,$a,$b,$c,$d,$e)=@_;
137my $j=$i+1;
138$code.=<<___ if ($i<79); # with forward Xupdate
139 sll $a,5,$t1
140 addl $K,$e,$e
141 zapnot $a,0xf,$a
142 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
143
144 sll $b,30,$t3
145 addl $t1,$e,$e
146 xor $b,$c,$t2
147 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
148
149 srl $b,2,$b
150 addl @X[$i%16],$e,$e
151 xor $d,$t2,$t2
152 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
153
154 srl @X[$j%16],31,$t1
155 addl $t2,$e,$e
156 srl $a,27,$t0
157 addl @X[$j%16],@X[$j%16],@X[$j%16]
158
159 or $t3,$b,$b
160 addl $t0,$e,$e
161 or $t1,@X[$j%16],@X[$j%16]
162___
163$code.=<<___ if ($i<77);
164 zapnot @X[$i%16],0xf,@X[$i%16]
165___
166$code.=<<___ if ($i==79); # with context fetch
167 sll $a,5,$t1
168 addl $K,$e,$e
169 zapnot $a,0xf,$a
170 ldl @X[0],0($ctx)
171
172 sll $b,30,$t3
173 addl $t1,$e,$e
174 xor $b,$c,$t2
175 ldl @X[1],4($ctx)
176
177 srl $b,2,$b
178 addl @X[$i%16],$e,$e
179 xor $d,$t2,$t2
180 ldl @X[2],8($ctx)
181
182 srl $a,27,$t0
183 addl $t2,$e,$e
184 ldl @X[3],12($ctx)
185
186 or $t3,$b,$b
187 addl $t0,$e,$e
188 ldl @X[4],16($ctx)
189___
190}
191
192sub BODY_40_59 {
193my ($i,$a,$b,$c,$d,$e)=@_;
194my $j=$i+1;
195$code.=<<___; # with forward Xupdate
196 sll $a,5,$t1
197 addl $K,$e,$e
198 zapnot $a,0xf,$a
199 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
200
201 srl $a,27,$t0
202 and $b,$c,$t2
203 and $b,$d,$t3
204 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
205
206 sll $b,30,$b
207 addl $t1,$e,$e
208 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
209
210 srl @X[$j%16],31,$t1
211 addl $t0,$e,$e
212 or $t2,$t3,$t2
213 and $c,$d,$t3
214
215 or $t2,$t3,$t2
216 srl $b,32,$t3
217 addl @X[$i%16],$e,$e
218 addl @X[$j%16],@X[$j%16],@X[$j%16]
219
220 or $t3,$b,$b
221 addl $t2,$e,$e
222 or $t1,@X[$j%16],@X[$j%16]
223 zapnot @X[$i%16],0xf,@X[$i%16]
224___
225}
226
227$code=<<___;
228#include <machine/asm.h>
229
230.text
231
232.set noat
233.set noreorder
234.globl sha1_block_data_order
235.align 5
236.ent sha1_block_data_order
237sha1_block_data_order:
238 lda sp,-64(sp)
239 stq ra,0(sp)
240 stq s0,8(sp)
241 stq s1,16(sp)
242 stq s2,24(sp)
243 stq s3,32(sp)
244 stq s4,40(sp)
245 stq s5,48(sp)
246 stq fp,56(sp)
247 .mask 0x0400fe00,-64
248 .frame sp,64,ra
249 .prologue 0
250
251 ldl $A,0($ctx)
252 ldl $B,4($ctx)
253 sll $num,6,$num
254 ldl $C,8($ctx)
255 ldl $D,12($ctx)
256 ldl $E,16($ctx)
257 addq $inp,$num,$num
258
259.Lloop:
260 .set noreorder
261 ldah $K,23170(zero)
262 zapnot $B,0xf,$B
263 lda $K,31129($K) # K_00_19
264___
265for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
266
267$code.=<<___;
268 ldah $K,28378(zero)
269 lda $K,-5215($K) # K_20_39
270___
271for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
272
273$code.=<<___;
274 ldah $K,-28900(zero)
275 lda $K,-17188($K) # K_40_59
276___
277for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
278
279$code.=<<___;
280 ldah $K,-13725(zero)
281 lda $K,-15914($K) # K_60_79
282___
283for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
284
285$code.=<<___;
286 addl @X[0],$A,$A
287 addl @X[1],$B,$B
288 addl @X[2],$C,$C
289 addl @X[3],$D,$D
290 addl @X[4],$E,$E
291 stl $A,0($ctx)
292 stl $B,4($ctx)
293 addq $inp,64,$inp
294 stl $C,8($ctx)
295 stl $D,12($ctx)
296 stl $E,16($ctx)
297 cmpult $inp,$num,$t1
298 bne $t1,.Lloop
299
300 .set noreorder
301 ldq ra,0(sp)
302 ldq s0,8(sp)
303 ldq s1,16(sp)
304 ldq s2,24(sp)
305 ldq s3,32(sp)
306 ldq s4,40(sp)
307 ldq s5,48(sp)
308 ldq fp,56(sp)
309 lda sp,64(sp)
310 ret (ra)
311.end sha1_block_data_order
312.ascii "SHA1 block transform for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
313.align 2
314___
315$output=shift and open STDOUT,">$output";
316print $code;
317close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
deleted file mode 100644
index 33da3e0e3c..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
+++ /dev/null
@@ -1,248 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# sha1_block procedure for ARMv4.
11#
12# January 2007.
13
14# Size/performance trade-off
15# ====================================================================
16# impl size in bytes comp cycles[*] measured performance
17# ====================================================================
18# thumb 304 3212 4420
19# armv4-small 392/+29% 1958/+64% 2250/+96%
20# armv4-compact 740/+89% 1552/+26% 1840/+22%
21# armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
22# full unroll ~5100/+260% ~1260/+4% ~1300/+5%
23# ====================================================================
24# thumb = same as 'small' but in Thumb instructions[**] and
25# with recurring code in two private functions;
26# small = detached Xload/update, loops are folded;
27# compact = detached Xload/update, 5x unroll;
28# large = interleaved Xload/update, 5x unroll;
29# full unroll = interleaved Xload/update, full unroll, estimated[!];
30#
31# [*] Manually counted instructions in "grand" loop body. Measured
32# performance is affected by prologue and epilogue overhead,
33# i-cache availability, branch penalties, etc.
34# [**]	While each Thumb instruction is half the size, they are not as
35#	diverse as ARM ones: e.g., there are only two arithmetic
36#	instructions with 3 arguments, no [fixed] rotate, and addressing
37#	modes are limited. As a result it takes more instructions to do
38#	the same job in Thumb, so the code is never half the size
39#	and is always slower.
40# [***]	which is also ~35% better than compiler-generated code. A dual-
41#	issue Cortex A8 core was measured to process an input block in
42#	~990 cycles.
43
44# August 2010.
45#
46# Rescheduling for the dual-issue pipeline resulted in a 13% improvement on
47# the Cortex A8 core and, in absolute terms, ~870 cycles per input block
48# [or 13.6 cycles per byte].
49
50# February 2011.
51#
52# Profiler-assisted and platform-specific optimization resulted in 10%
53# improvement on Cortex A8 core and 12.2 cycles per byte.
54
55while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
56open STDOUT,">$output";
57
58$ctx="r0";
59$inp="r1";
60$len="r2";
61$a="r3";
62$b="r4";
63$c="r5";
64$d="r6";
65$e="r7";
66$K="r8";
67$t0="r9";
68$t1="r10";
69$t2="r11";
70$t3="r12";
71$Xi="r14";
72@V=($a,$b,$c,$d,$e);
73
74sub Xupdate {
75my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
76$code.=<<___;
77 ldr $t0,[$Xi,#15*4]
78 ldr $t1,[$Xi,#13*4]
79 ldr $t2,[$Xi,#7*4]
80 add $e,$K,$e,ror#2 @ E+=K_xx_xx
81 ldr $t3,[$Xi,#2*4]
82 eor $t0,$t0,$t1
83 eor $t2,$t2,$t3 @ 1 cycle stall
84 eor $t1,$c,$d @ F_xx_xx
85 mov $t0,$t0,ror#31
86 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
87 eor $t0,$t0,$t2,ror#31
88 str $t0,[$Xi,#-4]!
89 $opt1 @ F_xx_xx
90 $opt2 @ F_xx_xx
91 add $e,$e,$t0 @ E+=X[i]
92___
93}
94
95sub BODY_00_15 {
96my ($a,$b,$c,$d,$e)=@_;
97$code.=<<___;
98#if __ARM_ARCH__<7
99 ldrb $t1,[$inp,#2]
100 ldrb $t0,[$inp,#3]
101 ldrb $t2,[$inp,#1]
102 add $e,$K,$e,ror#2 @ E+=K_00_19
103 ldrb $t3,[$inp],#4
104 orr $t0,$t0,$t1,lsl#8
105 eor $t1,$c,$d @ F_xx_xx
106 orr $t0,$t0,$t2,lsl#16
107 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
108 orr $t0,$t0,$t3,lsl#24
109#else
110 ldr $t0,[$inp],#4 @ handles unaligned
111 add $e,$K,$e,ror#2 @ E+=K_00_19
112 eor $t1,$c,$d @ F_xx_xx
113 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
114#ifdef __ARMEL__
115 rev $t0,$t0 @ byte swap
116#endif
117#endif
118 and $t1,$b,$t1,ror#2
119 add $e,$e,$t0 @ E+=X[i]
120 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
121 str $t0,[$Xi,#-4]!
122 add $e,$e,$t1 @ E+=F_00_19(B,C,D)
123___
124}
125
126sub BODY_16_19 {
127my ($a,$b,$c,$d,$e)=@_;
128 &Xupdate(@_,"and $t1,$b,$t1,ror#2");
129$code.=<<___;
130 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
131 add $e,$e,$t1 @ E+=F_00_19(B,C,D)
132___
133}
134
135sub BODY_20_39 {
136my ($a,$b,$c,$d,$e)=@_;
137 &Xupdate(@_,"eor $t1,$b,$t1,ror#2");
138$code.=<<___;
139 add $e,$e,$t1 @ E+=F_20_39(B,C,D)
140___
141}
142
143sub BODY_40_59 {
144my ($a,$b,$c,$d,$e)=@_;
145 &Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
146$code.=<<___;
147 add $e,$e,$t1 @ E+=F_40_59(B,C,D)
148 add $e,$e,$t2,ror#2
149___
150}
151
152$code=<<___;
153#include "arm_arch.h"
154
155.text
156
157.global sha1_block_data_order
158.type sha1_block_data_order,%function
159
160.align 2
161sha1_block_data_order:
162 stmdb sp!,{r4-r12,lr}
163 add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
164 ldmia $ctx,{$a,$b,$c,$d,$e}
165.Lloop:
166 ldr $K,.LK_00_19
167 mov $Xi,sp
168 sub sp,sp,#15*4
169 mov $c,$c,ror#30
170 mov $d,$d,ror#30
171 mov $e,$e,ror#30 @ [6]
172.L_00_15:
173___
174for($i=0;$i<5;$i++) {
175 &BODY_00_15(@V); unshift(@V,pop(@V));
176}
177$code.=<<___;
178 teq $Xi,sp
179 bne .L_00_15 @ [((11+4)*5+2)*3]
180 sub sp,sp,#25*4
181___
182 &BODY_00_15(@V); unshift(@V,pop(@V));
183 &BODY_16_19(@V); unshift(@V,pop(@V));
184 &BODY_16_19(@V); unshift(@V,pop(@V));
185 &BODY_16_19(@V); unshift(@V,pop(@V));
186 &BODY_16_19(@V); unshift(@V,pop(@V));
187$code.=<<___;
188
189 ldr $K,.LK_20_39 @ [+15+16*4]
190 cmn sp,#0 @ [+3], clear carry to denote 20_39
191.L_20_39_or_60_79:
192___
193for($i=0;$i<5;$i++) {
194 &BODY_20_39(@V); unshift(@V,pop(@V));
195}
196$code.=<<___;
197 teq $Xi,sp @ preserve carry
198 bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
199 bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
200
201 ldr $K,.LK_40_59
202 sub sp,sp,#20*4 @ [+2]
203.L_40_59:
204___
205for($i=0;$i<5;$i++) {
206 &BODY_40_59(@V); unshift(@V,pop(@V));
207}
208$code.=<<___;
209 teq $Xi,sp
210 bne .L_40_59 @ [+((12+5)*5+2)*4]
211
212 ldr $K,.LK_60_79
213 sub sp,sp,#20*4
214 cmp sp,#0 @ set carry to denote 60_79
215 b .L_20_39_or_60_79 @ [+4], spare 300 bytes
216.L_done:
217 add sp,sp,#80*4 @ "deallocate" stack frame
218 ldmia $ctx,{$K,$t0,$t1,$t2,$t3}
219 add $a,$K,$a
220 add $b,$t0,$b
221 add $c,$t1,$c,ror#2
222 add $d,$t2,$d,ror#2
223 add $e,$t3,$e,ror#2
224 stmia $ctx,{$a,$b,$c,$d,$e}
225 teq $inp,$len
226 bne .Lloop @ [+18], total 1307
227
228#if __ARM_ARCH__>=5
229 ldmia sp!,{r4-r12,pc}
230#else
231 ldmia sp!,{r4-r12,lr}
232 tst lr,#1
233 moveq pc,lr @ be binary compatible with V4, yet
234 bx lr @ interoperable with Thumb ISA:-)
235#endif
236.align 2
237.LK_00_19: .word 0x5a827999
238.LK_20_39: .word 0x6ed9eba1
239.LK_40_59: .word 0x8f1bbcdc
240.LK_60_79: .word 0xca62c1d6
241.size sha1_block_data_order,.-sha1_block_data_order
242.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
243.align 2
244___
245
246$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
247print $code;
248close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/sha/asm/sha1-ia64.pl b/src/lib/libcrypto/sha/asm/sha1-ia64.pl
deleted file mode 100644
index 02d35d1614..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-ia64.pl
+++ /dev/null
@@ -1,305 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# The eternal question is: what's wrong with compiler-generated code? The
11# trick is that it's possible to reduce the number of shifts required
12# to perform rotations by maintaining a copy of the 32-bit value in the
13# upper bits of a 64-bit register. Just follow the mux2 and shrp instructions...
14# Performance under a big-endian OS such as HP-UX is 179MBps*1GHz, which
15# is >50% better than HP C and >2x better than gcc.
16
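# A plain-Perl sketch of the trick described above (illustration only,
# assuming a 64-bit perl): with the 32-bit value duplicated into both
# halves of a 64-bit register (what mux2 with pattern 0x44 does), any
# rotation becomes a single shift-right-pair, which is what shrp does.
sub rotr32_via_copy {
	my ($x, $n) = @_;                       # 0 < $n < 32
	my $dup = (($x << 32) | $x);            # mux2 x,0x44
	return ($dup >> $n) & 0xffffffff;       # shrp dup,dup,n
}
# ROTATE(a,5) used by SHA-1 is then rotr32_via_copy($a,27), and
# b = ROTATE(b,30) is rotr32_via_copy($b,2), matching the code below.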
17$code=<<___;
18.ident \"sha1-ia64.s, version 1.3\"
19.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
20.explicit
21
22___
23
24
25if ($^O eq "hpux") {
26 $ADDP="addp4";
27 for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
28} else { $ADDP="add"; }
29
30#$human=1;
31if ($human) { # useful for visual code auditing...
32 ($A,$B,$C,$D,$E) = ("A","B","C","D","E");
33 ($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4");
34 ($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
35 ( "K_00_19","K_20_39","K_40_59","K_60_79" );
36 @X= ( "X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7",
37 "X8", "X9","X10","X11","X12","X13","X14","X15" );
38}
39else {
40 ($A,$B,$C,$D,$E) = ("loc0","loc1","loc2","loc3","loc4");
41 ($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9");
42 ($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
43 ( "r14", "r15", "loc10", "loc11" );
44 @X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
45 "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" );
46}
47
48sub BODY_00_15 {
49local *code=shift;
50my ($i,$a,$b,$c,$d,$e)=@_;
51my $j=$i+1;
52my $Xn=@X[$j%16];
53
54$code.=<<___ if ($i==0);
55{ .mmi; ld1 $X[$i]=[inp],2 // MSB
56 ld1 tmp2=[tmp3],2 };;
57{ .mmi; ld1 tmp0=[inp],2
58 ld1 tmp4=[tmp3],2 // LSB
59 dep $X[$i]=$X[$i],tmp2,8,8 };;
60___
61if ($i<15) {
62 $code.=<<___;
63{ .mmi; ld1 $Xn=[inp],2 // forward Xload
64 nop.m 0x0
65 dep tmp1=tmp0,tmp4,8,8 };;
66{ .mmi; ld1 tmp2=[tmp3],2 // forward Xload
67 and tmp4=$c,$b
68 dep $X[$i]=$X[$i],tmp1,16,16} //;;
69{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19
70 andcm tmp1=$d,$b
71 dep.z tmp5=$a,5,27 };; // a<<5
72{ .mmi; add $e=$e,$X[$i] // e+=Xload
73 or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
74 extr.u tmp1=$a,27,5 };; // a>>27
75{ .mmi; ld1 tmp0=[inp],2 // forward Xload
76 add $e=$e,tmp4 // e+=F_00_19(b,c,d)
77 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
78{ .mmi; ld1 tmp4=[tmp3],2 // forward Xload
79 or tmp5=tmp1,tmp5 // ROTATE(a,5)
80 mux2 tmp6=$a,0x44 };; // see b in next iteration
81{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)
82 dep $Xn=$Xn,tmp2,8,8 // forward Xload
83 mux2 $X[$i]=$X[$i],0x44 } //;;
84
85___
86 }
87else {
88 $code.=<<___;
89{ .mii; and tmp3=$c,$b
90 dep tmp1=tmp0,tmp4,8,8;;
91 dep $X[$i]=$X[$i],tmp1,16,16} //;;
92{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19
93 andcm tmp1=$d,$b
94 dep.z tmp5=$a,5,27 };; // a<<5
95{ .mmi; add $e=$e,$X[$i] // e+=Xupdate
96 or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
97 extr.u tmp1=$a,27,5 } // a>>27
98{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
99 xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
100 nop.i 0 };;
101{ .mmi; add $e=$e,tmp4 // e+=F_00_19(b,c,d)
102 xor $Xn=$Xn,tmp3 // forward Xupdate
103 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
104{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
105 mux2 tmp6=$a,0x44 };; // see b in next iteration
106{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
107 shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
108 mux2 $X[$i]=$X[$i],0x44 };;
109
110___
111 }
112}
113
114sub BODY_16_19 {
115local *code=shift;
116my ($i,$a,$b,$c,$d,$e)=@_;
117my $j=$i+1;
118my $Xn=@X[$j%16];
119
120$code.=<<___;
121{ .mib; add $e=$e,$K_00_19 // e+=K_00_19
122 dep.z tmp5=$a,5,27 } // a<<5
123{ .mib; andcm tmp1=$d,$b
124 and tmp0=$c,$b };;
125{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate
126 or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
127 extr.u tmp1=$a,27,5 } // a>>27
128{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
129 xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
130 nop.i 0 };;
131{ .mmi;	add	$e=$e,tmp0		// e+=F_00_19(b,c,d)
132 xor $Xn=$Xn,tmp3 // forward Xupdate
133 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
134{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
135 mux2 tmp6=$a,0x44 };; // see b in next iteration
136{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
137 shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
138 nop.i 0 };;
139
140___
141}
142
143sub BODY_20_39 {
144local *code=shift;
145my ($i,$a,$b,$c,$d,$e,$Konst)=@_;
146 $Konst = $K_20_39 if (!defined($Konst));
147my $j=$i+1;
148my $Xn=@X[$j%16];
149
150if ($i<79) {
151$code.=<<___;
152{ .mib; add $e=$e,$Konst // e+=K_XX_XX
153 dep.z tmp5=$a,5,27 } // a<<5
154{ .mib; xor tmp0=$c,$b
155 xor $Xn=$Xn,$X[($j+2)%16] };; // forward Xupdate
156{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate
157 extr.u tmp1=$a,27,5 } // a>>27
158{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
159 xor $Xn=$Xn,$X[($j+8)%16] };; // forward Xupdate
160{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d)
161 xor $Xn=$Xn,$X[($j+13)%16] // forward Xupdate
162 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
163{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
164 mux2 tmp6=$a,0x44 };; // see b in next iteration
165{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
166 shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
167 nop.i 0 };;
168
169___
170}
171else {
172$code.=<<___;
173{ .mib; add $e=$e,$Konst // e+=K_60_79
174 dep.z tmp5=$a,5,27 } // a<<5
175{ .mib; xor tmp0=$c,$b
176 add $h1=$h1,$a };; // wrap up
177{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate
178 extr.u tmp1=$a,27,5 } // a>>27
179{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
180 add $h3=$h3,$c };; // wrap up
181{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d)
182 or tmp1=tmp1,tmp5 // ROTATE(a,5)
183 shrp $b=tmp6,tmp6,2 };; // b=ROTATE(b,30) ;;?
184{ .mmi; add $e=$e,tmp1 // e+=ROTATE(a,5)
185 add tmp3=1,inp // used in unaligned codepath
186 add $h4=$h4,$d };; // wrap up
187
188___
189}
190}
191
192sub BODY_40_59 {
193local *code=shift;
194my ($i,$a,$b,$c,$d,$e)=@_;
195my $j=$i+1;
196my $Xn=@X[$j%16];
197
198$code.=<<___;
199{ .mib; add $e=$e,$K_40_59 // e+=K_40_59
200 dep.z tmp5=$a,5,27 } // a<<5
201{ .mib; and tmp1=$c,$d
202 xor tmp0=$c,$d };;
203{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate
204 add tmp5=tmp5,tmp1 // a<<5+(c&d)
205 extr.u tmp1=$a,27,5 } // a>>27
206{ .mmi; and tmp0=tmp0,$b
207 xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
208 xor tmp3=$X[($j+8)%16],$X[($j+13)%16] };; // forward Xupdate
209{ .mmi; add $e=$e,tmp0 // e+=b&(c^d)
210 add tmp5=tmp5,tmp1 // ROTATE(a,5)+(c&d)
211 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
212{ .mmi; xor $Xn=$Xn,tmp3
213 mux2 tmp6=$a,0x44 };; // see b in next iteration
214{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)+(c&d)
215 shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
216 nop.i 0x0 };;
217
218___
219}
220sub BODY_60_79 { &BODY_20_39(@_,$K_60_79); }
221
222$code.=<<___;
223.text
224
225tmp0=r8;
226tmp1=r9;
227tmp2=r10;
228tmp3=r11;
229ctx=r32; // in0
230inp=r33; // in1
231
232// void sha1_block_data_order(SHA_CTX *c,const void *p,size_t num);
233.global sha1_block_data_order#
234.proc sha1_block_data_order#
235.align 32
236sha1_block_data_order:
237 .prologue
238{ .mmi; alloc tmp1=ar.pfs,3,14,0,0
239 $ADDP tmp0=4,ctx
240 .save ar.lc,r3
241 mov r3=ar.lc }
242{ .mmi; $ADDP ctx=0,ctx
243 $ADDP inp=0,inp
244 mov r2=pr };;
245tmp4=in2;
246tmp5=loc12;
247tmp6=loc13;
248 .body
249{ .mlx; ld4 $h0=[ctx],8
250 movl $K_00_19=0x5a827999 }
251{ .mlx; ld4 $h1=[tmp0],8
252 movl $K_20_39=0x6ed9eba1 };;
253{ .mlx; ld4 $h2=[ctx],8
254 movl $K_40_59=0x8f1bbcdc }
255{ .mlx; ld4 $h3=[tmp0]
256 movl $K_60_79=0xca62c1d6 };;
257{ .mmi; ld4 $h4=[ctx],-16
258 add in2=-1,in2 // adjust num for ar.lc
259 mov ar.ec=1 };;
260{ .mmi; nop.m 0
261 add tmp3=1,inp
262 mov ar.lc=in2 };; // brp.loop.imp: too far
263
264.Ldtop:
265{ .mmi; mov $A=$h0
266 mov $B=$h1
267 mux2 tmp6=$h1,0x44 }
268{ .mmi; mov $C=$h2
269 mov $D=$h3
270 mov $E=$h4 };;
271
272___
273
274{ my $i;
275 my @V=($A,$B,$C,$D,$E);
276
277 for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); }
278 for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); }
279 for(;$i<40;$i++) { &BODY_20_39(\$code,$i,@V); unshift(@V,pop(@V)); }
280 for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); }
281 for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); }
282
283 (($V[0] eq $A) and ($V[4] eq $E)) or die; # double-check
284}
285
286$code.=<<___;
287{ .mmb; add $h0=$h0,$A
288 add $h2=$h2,$C
289 br.ctop.dptk.many .Ldtop };;
290.Ldend:
291{ .mmi; add tmp0=4,ctx
292 mov ar.lc=r3 };;
293{ .mmi; st4 [ctx]=$h0,8
294 st4 [tmp0]=$h1,8 };;
295{ .mmi; st4 [ctx]=$h2,8
296 st4 [tmp0]=$h3 };;
297{ .mib; st4 [ctx]=$h4,-16
298 mov pr=r2,0x1ffff
299 br.ret.sptk.many b0 };;
300.endp sha1_block_data_order#
301stringz "SHA1 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
302___
303
304$output=shift and open STDOUT,">$output";
305print $code;
diff --git a/src/lib/libcrypto/sha/asm/sha1-mips.pl b/src/lib/libcrypto/sha/asm/sha1-mips.pl
deleted file mode 100644
index 0590b7cdb2..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-mips.pl
+++ /dev/null
@@ -1,350 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA1 block procedure for MIPS.
11
12# Performance improvement is 30% on unaligned input. The "secret" is
13# to deploy the lwl/lwr pair to load unaligned input. One could have
14# vectorized Xupdate on MIPSIII/IV, but the goal was to code a MIPS32-
15# compatible subroutine. There is room for minor optimization on
16# little-endian platforms...
17
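# A plain-Perl model of what the lwl/lwr pair achieves (illustration only,
# not used by the generator): a 32-bit big-endian word is assembled from
# four bytes at an arbitrary, possibly unaligned, offset without any
# alignment fixup in the caller.
sub load_u32_be_ref {
	my ($data, $off) = @_;          # $data is a string of raw input bytes
	my $w = 0;
	$w = ($w << 8) | ord(substr($data, $off + $_, 1)) for (0 .. 3);
	return $w;
}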
18######################################################################
19# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
20# widely used. Then there is a new contender: NUBI. It appears that if
21# one picks the latter, it's possible to arrange code in an ABI-neutral
22# manner. Therefore let's stick to the NUBI register layout:
23#
24($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
25($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
26($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
27($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
28#
29# The return value is placed in $a0. Following coding rules facilitate
30# interoperability:
31#
32# - never ever touch $tp, "thread pointer", former $gp;
33# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
34# old code];
35# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
36#
37# For reference here is register layout for N32/64 MIPS ABIs:
38#
39# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
40# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
41# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
42# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
43# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
44#
45$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
46
47if ($flavour =~ /64|n32/i) {
48 $PTR_ADD="dadd"; # incidentally works even on n32
49 $PTR_SUB="dsub"; # incidentally works even on n32
50 $REG_S="sd";
51 $REG_L="ld";
52 $PTR_SLL="dsll"; # incidentally works even on n32
53 $SZREG=8;
54} else {
55 $PTR_ADD="add";
56 $PTR_SUB="sub";
57 $REG_S="sw";
58 $REG_L="lw";
59 $PTR_SLL="sll";
60 $SZREG=4;
61}
62#
63# <appro@openssl.org>
64#
65######################################################################
66
67$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
68
69for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
70open STDOUT,">$output";
71
72if (!defined($big_endian))
73 { $big_endian=(unpack('L',pack('N',1))==1); }
74
75# offsets of the Most and Least Significant Bytes
76$MSB=$big_endian?0:3;
77$LSB=3&~$MSB;
78
79@X=map("\$$_",(8..23)); # a4-a7,s0-s11
80
81$ctx=$a0;
82$inp=$a1;
83$num=$a2;
84$A="\$1";
85$B="\$2";
86$C="\$3";
87$D="\$7";
88$E="\$24"; @V=($A,$B,$C,$D,$E);
89$t0="\$25";
90$t1=$num; # $num is offloaded to stack
91$t2="\$30"; # fp
92$K="\$31"; # ra
93
94sub BODY_00_14 {
95my ($i,$a,$b,$c,$d,$e)=@_;
96my $j=$i+1;
97$code.=<<___ if (!$big_endian);
98 srl $t0,@X[$i],24 # byte swap($i)
99 srl $t1,@X[$i],8
100 andi $t2,@X[$i],0xFF00
101 sll @X[$i],@X[$i],24
102 andi $t1,0xFF00
103 sll $t2,$t2,8
104 or @X[$i],$t0
105 or $t1,$t2
106 or @X[$i],$t1
107___
108$code.=<<___;
109 lwl @X[$j],$j*4+$MSB($inp)
110 sll $t0,$a,5 # $i
111 addu $e,$K
112 lwr @X[$j],$j*4+$LSB($inp)
113 srl $t1,$a,27
114 addu $e,$t0
115 xor $t0,$c,$d
116 addu $e,$t1
117 sll $t2,$b,30
118 and $t0,$b
119 srl $b,$b,2
120 xor $t0,$d
121 addu $e,@X[$i]
122 or $b,$t2
123 addu $e,$t0
124___
125}
126
127sub BODY_15_19 {
128my ($i,$a,$b,$c,$d,$e)=@_;
129my $j=$i+1;
130
131$code.=<<___ if (!$big_endian && $i==15);
132 srl $t0,@X[$i],24 # byte swap($i)
133 srl $t1,@X[$i],8
134 andi $t2,@X[$i],0xFF00
135 sll @X[$i],@X[$i],24
136 andi $t1,0xFF00
137 sll $t2,$t2,8
138 or @X[$i],$t0
139 or @X[$i],$t1
140 or @X[$i],$t2
141___
142$code.=<<___;
143 xor @X[$j%16],@X[($j+2)%16]
144 sll $t0,$a,5 # $i
145 addu $e,$K
146 srl $t1,$a,27
147 addu $e,$t0
148 xor @X[$j%16],@X[($j+8)%16]
149 xor $t0,$c,$d
150 addu $e,$t1
151 xor @X[$j%16],@X[($j+13)%16]
152 sll $t2,$b,30
153 and $t0,$b
154 srl $t1,@X[$j%16],31
155 addu @X[$j%16],@X[$j%16]
156 srl $b,$b,2
157 xor $t0,$d
158 or @X[$j%16],$t1
159 addu $e,@X[$i%16]
160 or $b,$t2
161 addu $e,$t0
162___
163}
164
165sub BODY_20_39 {
166my ($i,$a,$b,$c,$d,$e)=@_;
167my $j=$i+1;
168$code.=<<___ if ($i<79);
169 xor @X[$j%16],@X[($j+2)%16]
170 sll $t0,$a,5 # $i
171 addu $e,$K
172 srl $t1,$a,27
173 addu $e,$t0
174 xor @X[$j%16],@X[($j+8)%16]
175 xor $t0,$c,$d
176 addu $e,$t1
177 xor @X[$j%16],@X[($j+13)%16]
178 sll $t2,$b,30
179 xor $t0,$b
180 srl $t1,@X[$j%16],31
181 addu @X[$j%16],@X[$j%16]
182 srl $b,$b,2
183 addu $e,@X[$i%16]
184 or @X[$j%16],$t1
185 or $b,$t2
186 addu $e,$t0
187___
188$code.=<<___ if ($i==79);
189 lw @X[0],0($ctx)
190 sll $t0,$a,5 # $i
191 addu $e,$K
192 lw @X[1],4($ctx)
193 srl $t1,$a,27
194 addu $e,$t0
195 lw @X[2],8($ctx)
196 xor $t0,$c,$d
197 addu $e,$t1
198 lw @X[3],12($ctx)
199 sll $t2,$b,30
200 xor $t0,$b
201 lw @X[4],16($ctx)
202 srl $b,$b,2
203 addu $e,@X[$i%16]
204 or $b,$t2
205 addu $e,$t0
206___
207}
208
209sub BODY_40_59 {
210my ($i,$a,$b,$c,$d,$e)=@_;
211my $j=$i+1;
212$code.=<<___ if ($i<79);
213 xor @X[$j%16],@X[($j+2)%16]
214 sll $t0,$a,5 # $i
215 addu $e,$K
216 srl $t1,$a,27
217 addu $e,$t0
218 xor @X[$j%16],@X[($j+8)%16]
219 and $t0,$c,$d
220 addu $e,$t1
221 xor @X[$j%16],@X[($j+13)%16]
222 sll $t2,$b,30
223 addu $e,$t0
224 srl $t1,@X[$j%16],31
225 xor $t0,$c,$d
226 addu @X[$j%16],@X[$j%16]
227 and $t0,$b
228 srl $b,$b,2
229 or @X[$j%16],$t1
230 addu $e,@X[$i%16]
231 or $b,$t2
232 addu $e,$t0
233___
234}
235
236$FRAMESIZE=16; # large enough to accommodate NUBI saved registers
237$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
238
239$code=<<___;
240.text
241
242.set noat
243.set noreorder
244.align 5
245.globl sha1_block_data_order
246.ent sha1_block_data_order
247sha1_block_data_order:
248 .frame $sp,$FRAMESIZE*$SZREG,$ra
249 .mask $SAVED_REGS_MASK,-$SZREG
250 .set noreorder
251 $PTR_SUB $sp,$FRAMESIZE*$SZREG
252 $REG_S $ra,($FRAMESIZE-1)*$SZREG($sp)
253 $REG_S $fp,($FRAMESIZE-2)*$SZREG($sp)
254 $REG_S $s11,($FRAMESIZE-3)*$SZREG($sp)
255 $REG_S $s10,($FRAMESIZE-4)*$SZREG($sp)
256 $REG_S $s9,($FRAMESIZE-5)*$SZREG($sp)
257 $REG_S $s8,($FRAMESIZE-6)*$SZREG($sp)
258 $REG_S $s7,($FRAMESIZE-7)*$SZREG($sp)
259 $REG_S $s6,($FRAMESIZE-8)*$SZREG($sp)
260 $REG_S $s5,($FRAMESIZE-9)*$SZREG($sp)
261 $REG_S $s4,($FRAMESIZE-10)*$SZREG($sp)
262___
263$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
264 $REG_S $s3,($FRAMESIZE-11)*$SZREG($sp)
265 $REG_S $s2,($FRAMESIZE-12)*$SZREG($sp)
266 $REG_S $s1,($FRAMESIZE-13)*$SZREG($sp)
267 $REG_S $s0,($FRAMESIZE-14)*$SZREG($sp)
268 $REG_S $gp,($FRAMESIZE-15)*$SZREG($sp)
269___
270$code.=<<___;
271 $PTR_SLL $num,6
272 $PTR_ADD $num,$inp
273 $REG_S $num,0($sp)
274 lw $A,0($ctx)
275 lw $B,4($ctx)
276 lw $C,8($ctx)
277 lw $D,12($ctx)
278 b .Loop
279 lw $E,16($ctx)
280.align 4
281.Loop:
282 .set reorder
283 lwl @X[0],$MSB($inp)
284 lui $K,0x5a82
285 lwr @X[0],$LSB($inp)
286 ori $K,0x7999 # K_00_19
287___
288for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); }
289for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); }
290$code.=<<___;
291 lui $K,0x6ed9
292 ori $K,0xeba1 # K_20_39
293___
294for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
295$code.=<<___;
296 lui $K,0x8f1b
297 ori $K,0xbcdc # K_40_59
298___
299for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
300$code.=<<___;
301 lui $K,0xca62
302 ori $K,0xc1d6 # K_60_79
303___
304for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
305$code.=<<___;
306 $PTR_ADD $inp,64
307 $REG_L $num,0($sp)
308
309 addu $A,$X[0]
310 addu $B,$X[1]
311 sw $A,0($ctx)
312 addu $C,$X[2]
313 addu $D,$X[3]
314 sw $B,4($ctx)
315 addu $E,$X[4]
316 sw $C,8($ctx)
317 sw $D,12($ctx)
318 sw $E,16($ctx)
319 .set noreorder
320 bne $inp,$num,.Loop
321 nop
322
323 .set noreorder
324 $REG_L $ra,($FRAMESIZE-1)*$SZREG($sp)
325 $REG_L $fp,($FRAMESIZE-2)*$SZREG($sp)
326 $REG_L $s11,($FRAMESIZE-3)*$SZREG($sp)
327 $REG_L $s10,($FRAMESIZE-4)*$SZREG($sp)
328 $REG_L $s9,($FRAMESIZE-5)*$SZREG($sp)
329 $REG_L $s8,($FRAMESIZE-6)*$SZREG($sp)
330 $REG_L $s7,($FRAMESIZE-7)*$SZREG($sp)
331 $REG_L $s6,($FRAMESIZE-8)*$SZREG($sp)
332 $REG_L $s5,($FRAMESIZE-9)*$SZREG($sp)
333 $REG_L $s4,($FRAMESIZE-10)*$SZREG($sp)
334___
335$code.=<<___ if ($flavour =~ /nubi/i);
336 $REG_L $s3,($FRAMESIZE-11)*$SZREG($sp)
337 $REG_L $s2,($FRAMESIZE-12)*$SZREG($sp)
338 $REG_L $s1,($FRAMESIZE-13)*$SZREG($sp)
339 $REG_L $s0,($FRAMESIZE-14)*$SZREG($sp)
340 $REG_L $gp,($FRAMESIZE-15)*$SZREG($sp)
341___
342$code.=<<___;
343 jr $ra
344 $PTR_ADD $sp,$FRAMESIZE*$SZREG
345.end sha1_block_data_order
346.rdata
347.asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
348___
349print $code;
350close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-parisc.pl b/src/lib/libcrypto/sha/asm/sha1-parisc.pl
deleted file mode 100644
index 6cb4656422..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-parisc.pl
+++ /dev/null
@@ -1,266 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA1 block procedure for PA-RISC.
11
12# June 2009.
13#
14# On PA-7100LC performance is >30% better than gcc 3.2 generated code
15# for aligned input and >50% better for unaligned. Compared to the vendor
16# compiler on PA-8600 it's almost 60% faster in a 64-bit build and just a
17# few percent faster in a 32-bit one (this is for aligned input; data for
18# unaligned input is not available).
19#
20# Special thanks to polarhome.com for providing HP-UX account.
21
22$flavour = shift;
23$output = shift;
24open STDOUT,">$output";
25
26if ($flavour =~ /64/) {
27 $LEVEL ="2.0W";
28 $SIZE_T =8;
29 $FRAME_MARKER =80;
30 $SAVED_RP =16;
31 $PUSH ="std";
32 $PUSHMA ="std,ma";
33 $POP ="ldd";
34 $POPMB ="ldd,mb";
35} else {
36 $LEVEL ="1.0";
37 $SIZE_T =4;
38 $FRAME_MARKER =48;
39 $SAVED_RP =20;
40 $PUSH ="stw";
41 $PUSHMA ="stwm";
42 $POP ="ldw";
43 $POPMB ="ldwm";
44}
45
46$FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker
47 # [+ argument transfer]
48$ctx="%r26"; # arg0
49$inp="%r25"; # arg1
50$num="%r24"; # arg2
51
52$t0="%r28";
53$t1="%r29";
54$K="%r31";
55
56@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
57 "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0);
58
59@V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23");
60
61sub BODY_00_19 {
62my ($i,$a,$b,$c,$d,$e)=@_;
63my $j=$i+1;
64$code.=<<___ if ($i<15);
65 addl $K,$e,$e ; $i
66 shd $a,$a,27,$t1
67 addl @X[$i],$e,$e
68 and $c,$b,$t0
69 addl $t1,$e,$e
70 andcm $d,$b,$t1
71 shd $b,$b,2,$b
72 or $t1,$t0,$t0
73 addl $t0,$e,$e
74___
75$code.=<<___ if ($i>=15); # with forward Xupdate
76 addl $K,$e,$e ; $i
77 shd $a,$a,27,$t1
78 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
79 addl @X[$i%16],$e,$e
80 and $c,$b,$t0
81 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
82 addl $t1,$e,$e
83 andcm $d,$b,$t1
84 shd $b,$b,2,$b
85 or $t1,$t0,$t0
86 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
87 add $t0,$e,$e
88 shd @X[$j%16],@X[$j%16],31,@X[$j%16]
89___
90}
91
92sub BODY_20_39 {
93my ($i,$a,$b,$c,$d,$e)=@_;
94my $j=$i+1;
95$code.=<<___ if ($i<79);
96 xor @X[($j+2)%16],@X[$j%16],@X[$j%16] ; $i
97 addl $K,$e,$e
98 shd $a,$a,27,$t1
99 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
100 addl @X[$i%16],$e,$e
101 xor $b,$c,$t0
102 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
103 addl $t1,$e,$e
104 shd $b,$b,2,$b
105 xor $d,$t0,$t0
106 shd @X[$j%16],@X[$j%16],31,@X[$j%16]
107 addl $t0,$e,$e
108___
109$code.=<<___ if ($i==79); # with context load
110 ldw 0($ctx),@X[0] ; $i
111 addl $K,$e,$e
112 shd $a,$a,27,$t1
113 ldw 4($ctx),@X[1]
114 addl @X[$i%16],$e,$e
115 xor $b,$c,$t0
116 ldw 8($ctx),@X[2]
117 addl $t1,$e,$e
118 shd $b,$b,2,$b
119 xor $d,$t0,$t0
120 ldw 12($ctx),@X[3]
121 addl $t0,$e,$e
122 ldw 16($ctx),@X[4]
123___
124}
125
126sub BODY_40_59 {
127my ($i,$a,$b,$c,$d,$e)=@_;
128my $j=$i+1;
129$code.=<<___;
130 shd $a,$a,27,$t1 ; $i
131 addl $K,$e,$e
132 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
133 xor $d,$c,$t0
134 addl @X[$i%16],$e,$e
135 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
136 and $b,$t0,$t0
137 addl $t1,$e,$e
138 shd $b,$b,2,$b
139 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
140 addl $t0,$e,$e
141 and $d,$c,$t1
142 shd @X[$j%16],@X[$j%16],31,@X[$j%16]
143 addl $t1,$e,$e
144___
145}
146
147$code=<<___;
148 .LEVEL $LEVEL
149#if 0
150 .SPACE \$TEXT\$
151 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
152#else
153 .text
154#endif
155
156 .EXPORT sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
157sha1_block_data_order
158 .PROC
159 .CALLINFO FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16
160 .ENTRY
161 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
162 $PUSHMA %r3,$FRAME(%sp)
163 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
164 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
165 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
166 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
167 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
168 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
169 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
170 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
171 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
172 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
173 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
174 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
175 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
176
177 ldw 0($ctx),$A
178 ldw 4($ctx),$B
179 ldw 8($ctx),$C
180 ldw 12($ctx),$D
181 ldw 16($ctx),$E
182
183 extru $inp,31,2,$t0 ; t0=inp&3;
184 sh3addl $t0,%r0,$t0 ; t0*=8;
185 subi 32,$t0,$t0 ; t0=32-t0;
186 mtctl $t0,%cr11 ; %sar=t0;
187
188L\$oop
189 ldi 3,$t0
190 andcm $inp,$t0,$t0 ; 64-bit neutral
191___
192 for ($i=0;$i<15;$i++) { # load input block
193 $code.="\tldw `4*$i`($t0),@X[$i]\n"; }
194$code.=<<___;
195 cmpb,*= $inp,$t0,L\$aligned
196 ldw 60($t0),@X[15]
197 ldw 64($t0),@X[16]
198___
199 for ($i=0;$i<16;$i++) { # align input
200 $code.="\tvshd @X[$i],@X[$i+1],@X[$i]\n"; }
201$code.=<<___;
202L\$aligned
203 ldil L'0x5a827000,$K ; K_00_19
204 ldo 0x999($K),$K
205___
206for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
207$code.=<<___;
208 ldil L'0x6ed9e000,$K ; K_20_39
209 ldo 0xba1($K),$K
210___
211
212for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
213$code.=<<___;
214 ldil L'0x8f1bb000,$K ; K_40_59
215 ldo 0xcdc($K),$K
216___
217
218for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
219$code.=<<___;
220 ldil L'0xca62c000,$K ; K_60_79
221 ldo 0x1d6($K),$K
222___
223for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
224
225$code.=<<___;
226 addl @X[0],$A,$A
227 addl @X[1],$B,$B
228 addl @X[2],$C,$C
229 addl @X[3],$D,$D
230 addl @X[4],$E,$E
231 stw $A,0($ctx)
232 stw $B,4($ctx)
233 stw $C,8($ctx)
234 stw $D,12($ctx)
235 stw $E,16($ctx)
236 addib,*<> -1,$num,L\$oop
237 ldo 64($inp),$inp
238
239 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
240 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
241 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
242 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
243 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
244 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
245 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
246 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
247 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
248 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
249 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
250 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
251 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
252 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
253 bv (%r2)
254 .EXIT
255 $POPMB -$FRAME(%sp),%r3
256 .PROCEND
257
258 .data
259 .STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
260___
261
262$code =~ s/\`([^\`]*)\`/eval $1/gem;
263$code =~ s/,\*/,/gm if ($SIZE_T==4);
264$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8);
265print $code;
266close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-ppc.pl b/src/lib/libcrypto/sha/asm/sha1-ppc.pl
deleted file mode 100755
index 2140dd2f8d..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-ppc.pl
+++ /dev/null
@@ -1,326 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# I let hardware handle unaligned input(*), except on page boundaries
11# (see below for details). Otherwise it is a straightforward implementation
12# with the X vector kept in the register bank. The module is big-endian [which is
13# not a big deal, as there are no little-endian targets left around].
14#
15# (*) Does this mean that this module is inappropriate for PPC403? Does
16# anybody know if pre-POWER3 can sustain unaligned loads?
17
18# -m64 -m32
19# ----------------------------------
20# PPC970,gcc-4.0.0 +76% +59%
21# Power6,xlc-7 +68% +33%
22
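# A plain-Perl sketch of the page-boundary test performed at Lunaligned
# below (illustration only; 4096-byte pages and 64-byte SHA-1 blocks are
# assumed, matching the constants in the assembly):
sub blocks_before_page_boundary {
	my ($inp) = @_;                     # byte address of the input
	my $dist = (4096 - $inp) & 4095;    # distance to the next page boundary
	return $dist >> 6;                  # whole 64-byte blocks that fit
}
# If the result is zero, the next block straddles the boundary and is
# copied to an aligned buffer on the stack before being hashed; otherwise
# that many blocks are hashed in place first.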
23$flavour = shift;
24
25if ($flavour =~ /64/) {
26 $SIZE_T =8;
27 $LRSAVE =2*$SIZE_T;
28 $UCMP ="cmpld";
29 $STU ="stdu";
30 $POP ="ld";
31 $PUSH ="std";
32} elsif ($flavour =~ /32/) {
33 $SIZE_T =4;
34 $LRSAVE =$SIZE_T;
35 $UCMP ="cmplw";
36 $STU ="stwu";
37 $POP ="lwz";
38 $PUSH ="stw";
39} else { die "nonsense $flavour"; }
40
41$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
42( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
43( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
44die "can't locate ppc-xlate.pl";
45
46open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
47
48$FRAME=24*$SIZE_T+64;
49$LOCALS=6*$SIZE_T;
50
51$K ="r0";
52$sp ="r1";
53$toc="r2";
54$ctx="r3";
55$inp="r4";
56$num="r5";
57$t0 ="r15";
58$t1 ="r6";
59
60$A ="r7";
61$B ="r8";
62$C ="r9";
63$D ="r10";
64$E ="r11";
65$T ="r12";
66
67@V=($A,$B,$C,$D,$E,$T);
68@X=("r16","r17","r18","r19","r20","r21","r22","r23",
69 "r24","r25","r26","r27","r28","r29","r30","r31");
70
71sub BODY_00_19 {
72my ($i,$a,$b,$c,$d,$e,$f)=@_;
73my $j=$i+1;
74$code.=<<___ if ($i==0);
75 lwz @X[$i],`$i*4`($inp)
76___
77$code.=<<___ if ($i<15);
78 lwz @X[$j],`$j*4`($inp)
79 add $f,$K,$e
80 rotlwi $e,$a,5
81 add $f,$f,@X[$i]
82 and $t0,$c,$b
83 add $f,$f,$e
84 andc $t1,$d,$b
85 rotlwi $b,$b,30
86 or $t0,$t0,$t1
87 add $f,$f,$t0
88___
89$code.=<<___ if ($i>=15);
90 add $f,$K,$e
91 rotlwi $e,$a,5
92 xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
93 add $f,$f,@X[$i%16]
94 and $t0,$c,$b
95 xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
96 add $f,$f,$e
97 andc $t1,$d,$b
98 rotlwi $b,$b,30
99 or $t0,$t0,$t1
100 xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
101 add $f,$f,$t0
102 rotlwi @X[$j%16],@X[$j%16],1
103___
104}
105
106sub BODY_20_39 {
107my ($i,$a,$b,$c,$d,$e,$f)=@_;
108my $j=$i+1;
109$code.=<<___ if ($i<79);
110 add $f,$K,$e
111 rotlwi $e,$a,5
112 xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
113 add $f,$f,@X[$i%16]
114 xor $t0,$b,$c
115 xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
116 add $f,$f,$e
117 rotlwi $b,$b,30
118 xor $t0,$t0,$d
119 xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
120 add $f,$f,$t0
121 rotlwi @X[$j%16],@X[$j%16],1
122___
123$code.=<<___ if ($i==79);
124 add $f,$K,$e
125 rotlwi $e,$a,5
126 lwz r16,0($ctx)
127 add $f,$f,@X[$i%16]
128 xor $t0,$b,$c
129 lwz r17,4($ctx)
130 add $f,$f,$e
131 rotlwi $b,$b,30
132 lwz r18,8($ctx)
133 xor $t0,$t0,$d
134 lwz r19,12($ctx)
135 add $f,$f,$t0
136 lwz r20,16($ctx)
137___
138}
139
140sub BODY_40_59 {
141my ($i,$a,$b,$c,$d,$e,$f)=@_;
142my $j=$i+1;
143$code.=<<___;
144 add $f,$K,$e
145 rotlwi $e,$a,5
146 xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
147 add $f,$f,@X[$i%16]
148 and $t0,$b,$c
149 xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
150 add $f,$f,$e
151 or $t1,$b,$c
152 rotlwi $b,$b,30
153 xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
154 and $t1,$t1,$d
155 or $t0,$t0,$t1
156 rotlwi @X[$j%16],@X[$j%16],1
157 add $f,$f,$t0
158___
159}
160
161$code=<<___;
162.machine "any"
163.text
164
165.globl .sha1_block_data_order
166.align 4
167.sha1_block_data_order:
168 $STU $sp,-$FRAME($sp)
169 mflr r0
170 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
171 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
172 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
173 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
174 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
175 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
176 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
177 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
178 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
179 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
180 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
181 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
182 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
183 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
184 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
185 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
186 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
187 $PUSH r0,`$FRAME+$LRSAVE`($sp)
188 lwz $A,0($ctx)
189 lwz $B,4($ctx)
190 lwz $C,8($ctx)
191 lwz $D,12($ctx)
192 lwz $E,16($ctx)
193 andi. r0,$inp,3
194 bne Lunaligned
195Laligned:
196 mtctr $num
197 bl Lsha1_block_private
198 b Ldone
199
200; The PowerPC specification allows an implementation to be ill-behaved
201; upon an unaligned access which crosses a page boundary. The "better safe
202; than sorry" principle makes me treat it specially. I don't look for the
203; particular offending word, but rather for the 64-byte input block which
204; crosses the boundary. Once found, that block is copied to an aligned
205; buffer and hashed separately...
206.align 4
207Lunaligned:
208 subfic $t1,$inp,4096
209 andi. $t1,$t1,4095 ; distance to closest page boundary
210 srwi. $t1,$t1,6 ; t1/=64
211 beq Lcross_page
212 $UCMP $num,$t1
213 ble- Laligned ; didn't cross the page boundary
214 mtctr $t1
215 subfc $num,$t1,$num
216 bl Lsha1_block_private
217Lcross_page:
218 li $t1,16
219 mtctr $t1
220 addi r20,$sp,$LOCALS ; spot within the frame
221Lmemcpy:
222 lbz r16,0($inp)
223 lbz r17,1($inp)
224 lbz r18,2($inp)
225 lbz r19,3($inp)
226 addi $inp,$inp,4
227 stb r16,0(r20)
228 stb r17,1(r20)
229 stb r18,2(r20)
230 stb r19,3(r20)
231 addi r20,r20,4
232 bdnz Lmemcpy
233
234 $PUSH $inp,`$FRAME-$SIZE_T*18`($sp)
235 li $t1,1
236 addi $inp,$sp,$LOCALS
237 mtctr $t1
238 bl Lsha1_block_private
239 $POP $inp,`$FRAME-$SIZE_T*18`($sp)
240 addic. $num,$num,-1
241 bne- Lunaligned
242
243Ldone:
244 $POP r0,`$FRAME+$LRSAVE`($sp)
245 $POP r15,`$FRAME-$SIZE_T*17`($sp)
246 $POP r16,`$FRAME-$SIZE_T*16`($sp)
247 $POP r17,`$FRAME-$SIZE_T*15`($sp)
248 $POP r18,`$FRAME-$SIZE_T*14`($sp)
249 $POP r19,`$FRAME-$SIZE_T*13`($sp)
250 $POP r20,`$FRAME-$SIZE_T*12`($sp)
251 $POP r21,`$FRAME-$SIZE_T*11`($sp)
252 $POP r22,`$FRAME-$SIZE_T*10`($sp)
253 $POP r23,`$FRAME-$SIZE_T*9`($sp)
254 $POP r24,`$FRAME-$SIZE_T*8`($sp)
255 $POP r25,`$FRAME-$SIZE_T*7`($sp)
256 $POP r26,`$FRAME-$SIZE_T*6`($sp)
257 $POP r27,`$FRAME-$SIZE_T*5`($sp)
258 $POP r28,`$FRAME-$SIZE_T*4`($sp)
259 $POP r29,`$FRAME-$SIZE_T*3`($sp)
260 $POP r30,`$FRAME-$SIZE_T*2`($sp)
261 $POP r31,`$FRAME-$SIZE_T*1`($sp)
262 mtlr r0
263 addi $sp,$sp,$FRAME
264 blr
265 .long 0
266 .byte 0,12,4,1,0x80,18,3,0
267 .long 0
268___
269
270# This is a private block function with a tailored calling interface:
271# upon entry the SHA_CTX is pre-loaded into the given registers and the
272# counter register contains the number of chunks to
273# digest...
274$code.=<<___;
275.align 4
276Lsha1_block_private:
277___
278$code.=<<___; # load K_00_19
279 lis $K,0x5a82
280 ori $K,$K,0x7999
281___
282for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
283$code.=<<___; # load K_20_39
284 lis $K,0x6ed9
285 ori $K,$K,0xeba1
286___
287for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
288$code.=<<___; # load K_40_59
289 lis $K,0x8f1b
290 ori $K,$K,0xbcdc
291___
292for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
293$code.=<<___; # load K_60_79
294 lis $K,0xca62
295 ori $K,$K,0xc1d6
296___
297for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
298$code.=<<___;
299 add r16,r16,$E
300 add r17,r17,$T
301 add r18,r18,$A
302 add r19,r19,$B
303 add r20,r20,$C
304 stw r16,0($ctx)
305 mr $A,r16
306 stw r17,4($ctx)
307 mr $B,r17
308 stw r18,8($ctx)
309 mr $C,r18
310 stw r19,12($ctx)
311 mr $D,r19
312 stw r20,16($ctx)
313 mr $E,r20
314 addi $inp,$inp,`16*4`
315 bdnz- Lsha1_block_private
316 blr
317 .long 0
318 .byte 0,12,0x14,0,0,0,0,0
319___
320$code.=<<___;
321.asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
322___
323
324$code =~ s/\`([^\`]*)\`/eval $1/gem;
325print $code;
326close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-s390x.pl b/src/lib/libcrypto/sha/asm/sha1-s390x.pl
deleted file mode 100644
index 9193dda45e..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-s390x.pl
+++ /dev/null
@@ -1,246 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA1 block procedure for s390x.
11
12# April 2007.
13#
14# Performance is >30% better than gcc 3.3 generated code. But the real
15# twist is that SHA1 hardware support is detected and utilized. In
16# that case performance can be a further >4.5x better for larger chunks.
17
18# January 2009.
19#
20# Optimize Xupdate for the number of memory references and reschedule
21# instructions to favour the dual-issue z10 pipeline. On z10 the hardware is
22# "only" ~2.3x faster than software.
23
24# November 2010.
25#
26# Adapt for -m31 build. If the kernel supports what's called the "highgprs"
27# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
28# instructions and achieve "64-bit" performance even in a 31-bit legacy
29# application context. The feature is not specific to any particular
30# processor, as long as it's a "z-CPU". The latter implies that the code
31# remains z/Architecture specific.
32
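# A plain-Perl sketch of the run-time dispatch implemented in the prologue
# below (illustration only; the two flags stand for the facility-bit test
# on OPENSSL_s390xcap_P and the KIMD query that the assembly performs):
sub sha1_impl_ref {
	my ($has_msa, $kimd_has_sha1) = @_;
	return ($has_msa && $kimd_has_sha1) ? "kimd" : "software";
}
# The hardware path issues KIMD with function code 1 (SHA-1) and, because
# the instruction may end with partial completion, simply re-executes it
# until it reports done ("brc 1,.-4" below).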
33$kimdfunc=1; # magic function code for kimd instruction
34
35$flavour = shift;
36
37if ($flavour =~ /3[12]/) {
38 $SIZE_T=4;
39 $g="";
40} else {
41 $SIZE_T=8;
42 $g="g";
43}
44
45while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
46open STDOUT,">$output";
47
48$K_00_39="%r0"; $K=$K_00_39;
49$K_40_79="%r1";
50$ctx="%r2"; $prefetch="%r2";
51$inp="%r3";
52$len="%r4";
53
54$A="%r5";
55$B="%r6";
56$C="%r7";
57$D="%r8";
58$E="%r9"; @V=($A,$B,$C,$D,$E);
59$t0="%r10";
60$t1="%r11";
61@X=("%r12","%r13","%r14");
62$sp="%r15";
63
64$stdframe=16*$SIZE_T+4*8;
65$frame=$stdframe+16*4;
66
67sub Xupdate {
68my $i=shift;
69
70$code.=<<___ if ($i==15);
71 lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up
72 lr $X[0],$X[2]
73___
74return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle
75$code.=<<___ if ($i<16);
76 lg $X[0],`$i*4`($inp) ### Xload($i)
77 rllg $X[1],$X[0],32
78___
79$code.=<<___ if ($i>=16);
80 xgr $X[0],$prefetch ### Xupdate($i)
81 lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp)
82 xg $X[0],`$stdframe+4*(($i+8)%16)`($sp)
83 xgr $X[0],$prefetch
84 rll $X[0],$X[0],1
85 rllg $X[1],$X[0],32
86 rll $X[1],$X[1],1
87 rllg $X[0],$X[1],32
88 lr $X[2],$X[1] # feedback
89___
90$code.=<<___ if ($i<=70);
91 stg $X[0],`$stdframe+4*($i%16)`($sp)
92___
93unshift(@X,pop(@X));
94}
95
96sub BODY_00_19 {
97my ($i,$a,$b,$c,$d,$e)=@_;
98my $xi=$X[1];
99
100 &Xupdate($i);
101$code.=<<___;
102 alr $e,$K ### $i
103 rll $t1,$a,5
104 lr $t0,$d
105 xr $t0,$c
106 alr $e,$t1
107 nr $t0,$b
108 alr $e,$xi
109 xr $t0,$d
110 rll $b,$b,30
111 alr $e,$t0
112___
113}
114
115sub BODY_20_39 {
116my ($i,$a,$b,$c,$d,$e)=@_;
117my $xi=$X[1];
118
119 &Xupdate($i);
120$code.=<<___;
121 alr $e,$K ### $i
122 rll $t1,$a,5
123 lr $t0,$b
124 alr $e,$t1
125 xr $t0,$c
126 alr $e,$xi
127 xr $t0,$d
128 rll $b,$b,30
129 alr $e,$t0
130___
131}
132
133sub BODY_40_59 {
134my ($i,$a,$b,$c,$d,$e)=@_;
135my $xi=$X[1];
136
137 &Xupdate($i);
138$code.=<<___;
139 alr $e,$K ### $i
140 rll $t1,$a,5
141 lr $t0,$b
142 alr $e,$t1
143 or $t0,$c
144 lr $t1,$b
145 nr $t0,$d
146 nr $t1,$c
147 alr $e,$xi
148 or $t0,$t1
149 rll $b,$b,30
150 alr $e,$t0
151___
152}
153
154$code.=<<___;
155.text
156.align 64
157.type Ktable,\@object
158Ktable: .long 0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
159 .skip 48 #.long 0,0,0,0,0,0,0,0,0,0,0,0
160.size Ktable,.-Ktable
161.globl sha1_block_data_order
162.type sha1_block_data_order,\@function
163sha1_block_data_order:
164___
165$code.=<<___ if ($kimdfunc);
166 larl %r1,OPENSSL_s390xcap_P
167 lg %r0,0(%r1)
168 tmhl %r0,0x4000 # check for message-security assist
169 jz .Lsoftware
170 lghi %r0,0
171 la %r1,`2*$SIZE_T`($sp)
172 .long 0xb93e0002 # kimd %r0,%r2
173 lg %r0,`2*$SIZE_T`($sp)
174 tmhh %r0,`0x8000>>$kimdfunc`
175 jz .Lsoftware
176 lghi %r0,$kimdfunc
177 lgr %r1,$ctx
178 lgr %r2,$inp
179 sllg %r3,$len,6
180 .long 0xb93e0002 # kimd %r0,%r2
181 brc 1,.-4 # pay attention to "partial completion"
182 br %r14
183.align 16
184.Lsoftware:
185___
186$code.=<<___;
187 lghi %r1,-$frame
188 st${g} $ctx,`2*$SIZE_T`($sp)
189 stm${g} %r6,%r15,`6*$SIZE_T`($sp)
190 lgr %r0,$sp
191 la $sp,0(%r1,$sp)
192 st${g} %r0,0($sp)
193
194 larl $t0,Ktable
195 llgf $A,0($ctx)
196 llgf $B,4($ctx)
197 llgf $C,8($ctx)
198 llgf $D,12($ctx)
199 llgf $E,16($ctx)
200
201 lg $K_00_39,0($t0)
202 lg $K_40_79,8($t0)
203
204.Lloop:
205 rllg $K_00_39,$K_00_39,32
206___
207for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
208$code.=<<___;
209 rllg $K_00_39,$K_00_39,32
210___
211for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
212$code.=<<___; $K=$K_40_79;
213 rllg $K_40_79,$K_40_79,32
214___
215for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
216$code.=<<___;
217 rllg $K_40_79,$K_40_79,32
218___
219for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
220$code.=<<___;
221
222 l${g} $ctx,`$frame+2*$SIZE_T`($sp)
223 la $inp,64($inp)
224 al $A,0($ctx)
225 al $B,4($ctx)
226 al $C,8($ctx)
227 al $D,12($ctx)
228 al $E,16($ctx)
229 st $A,0($ctx)
230 st $B,4($ctx)
231 st $C,8($ctx)
232 st $D,12($ctx)
233 st $E,16($ctx)
234 brct${g} $len,.Lloop
235
236 lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
237 br %r14
238.size sha1_block_data_order,.-sha1_block_data_order
239.string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
240.comm OPENSSL_s390xcap_P,16,8
241___
242
243$code =~ s/\`([^\`]*)\`/eval $1/gem;
244
245print $code;
246close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl
deleted file mode 100644
index 5c161cecd6..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl
+++ /dev/null
@@ -1,284 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# Performance improvement is not really impressive on pre-T1 CPUs: +8%
11# over Sun C and +25% over gcc [3.3]. On T1, a.k.a. Niagara, however, it
12# turned out to be 40% faster than 64-bit code generated by Sun C 5.8 and
13# >2x faster than 64-bit code generated by gcc 3.4. And there is a gimmick.
14# The X[16] vector is packed into 8 64-bit registers and as a result nothing
15# is spilled on the stack. In addition, input data is loaded in a compact
16# instruction sequence, thus minimizing the window in which the code is
17# subject to the [inter-thread] cache-thrashing hazard. The goal is to
18# ensure scalability on UltraSPARC T1, or rather to avoid decay when the
19# number of active threads exceeds the number of physical cores.
20
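# A plain-Perl sketch of the X[16] packing described above (illustration
# only, assuming a 64-bit perl): each of the 8 registers holds two adjacent
# big-endian message words, the even-indexed word in the upper half and the
# odd-indexed one in the lower half, matching the srlx-by-32 extractions in
# the round bodies below.
sub pack_pair { my ($even, $odd) = @_; return ($even << 32) | $odd; }
sub even_word { my ($pair) = @_; return ($pair >> 32) & 0xffffffff; }   # srlx pair,32
sub odd_word  { my ($pair) = @_; return  $pair        & 0xffffffff; }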
21$bits=32;
22for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
23if ($bits==64) { $bias=2047; $frame=192; }
24else { $bias=0; $frame=112; }
25
26$output=shift;
27open STDOUT,">$output";
28
29@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
30$rot1m="%g2";
31$tmp64="%g3";
32$Xi="%g4";
33$A="%l0";
34$B="%l1";
35$C="%l2";
36$D="%l3";
37$E="%l4";
38@V=($A,$B,$C,$D,$E);
39$K_00_19="%l5";
40$K_20_39="%l6";
41$K_40_59="%l7";
42$K_60_79="%g5";
43@K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);
44
45$ctx="%i0";
46$inp="%i1";
47$len="%i2";
48$tmp0="%i3";
49$tmp1="%i4";
50$tmp2="%i5";
51
52sub BODY_00_15 {
53my ($i,$a,$b,$c,$d,$e)=@_;
54my $xi=($i&1)?@X[($i/2)%8]:$Xi;
55
56$code.=<<___;
57 sll $a,5,$tmp0 !! $i
58 add @K[$i/20],$e,$e
59 srl $a,27,$tmp1
60 add $tmp0,$e,$e
61 and $c,$b,$tmp0
62 add $tmp1,$e,$e
63 sll $b,30,$tmp2
64 andn $d,$b,$tmp1
65 srl $b,2,$b
66 or $tmp1,$tmp0,$tmp1
67 or $tmp2,$b,$b
68 add $xi,$e,$e
69___
70if ($i&1 && $i<15) {
71 $code.=
72 " srlx @X[(($i+1)/2)%8],32,$Xi\n";
73}
74$code.=<<___;
75 add $tmp1,$e,$e
76___
77}
78
79sub Xupdate {
80my ($i,$a,$b,$c,$d,$e)=@_;
81my $j=$i/2;
82
83if ($i&1) {
84$code.=<<___;
85 sll $a,5,$tmp0 !! $i
86 add @K[$i/20],$e,$e
87 srl $a,27,$tmp1
88___
89} else {
90$code.=<<___;
91 sllx @X[($j+6)%8],32,$Xi ! Xupdate($i)
92 xor @X[($j+1)%8],@X[$j%8],@X[$j%8]
93 srlx @X[($j+7)%8],32,$tmp1
94 xor @X[($j+4)%8],@X[$j%8],@X[$j%8]
95 sll $a,5,$tmp0 !! $i
96 or $tmp1,$Xi,$Xi
97 add @K[$i/20],$e,$e !!
98 xor $Xi,@X[$j%8],@X[$j%8]
99 srlx @X[$j%8],31,$Xi
100 add @X[$j%8],@X[$j%8],@X[$j%8]
101 and $Xi,$rot1m,$Xi
102 andn @X[$j%8],$rot1m,@X[$j%8]
103 srl $a,27,$tmp1 !!
104 or $Xi,@X[$j%8],@X[$j%8]
105___
106}
107}
108
109sub BODY_16_19 {
110my ($i,$a,$b,$c,$d,$e)=@_;
111
112 &Xupdate(@_);
113 if ($i&1) {
114 $xi=@X[($i/2)%8];
115 } else {
116 $xi=$Xi;
117 $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
118 }
119$code.=<<___;
120 add $tmp0,$e,$e !!
121 and $c,$b,$tmp0
122 add $tmp1,$e,$e
123 sll $b,30,$tmp2
124 add $xi,$e,$e
125 andn $d,$b,$tmp1
126 srl $b,2,$b
127 or $tmp1,$tmp0,$tmp1
128 or $tmp2,$b,$b
129 add $tmp1,$e,$e
130___
131}
132
133sub BODY_20_39 {
134my ($i,$a,$b,$c,$d,$e)=@_;
135my $xi;
136 &Xupdate(@_);
137 if ($i&1) {
138 $xi=@X[($i/2)%8];
139 } else {
140 $xi=$Xi;
141 $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
142 }
143$code.=<<___;
144 add $tmp0,$e,$e !!
145 xor $c,$b,$tmp0
146 add $tmp1,$e,$e
147 sll $b,30,$tmp2
148 xor $d,$tmp0,$tmp1
149 srl $b,2,$b
150 add $tmp1,$e,$e
151 or $tmp2,$b,$b
152 add $xi,$e,$e
153___
154}
155
156sub BODY_40_59 {
157my ($i,$a,$b,$c,$d,$e)=@_;
158my $xi;
159 &Xupdate(@_);
160 if ($i&1) {
161 $xi=@X[($i/2)%8];
162 } else {
163 $xi=$Xi;
164 $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
165 }
166$code.=<<___;
167 add $tmp0,$e,$e !!
168 and $c,$b,$tmp0
169 add $tmp1,$e,$e
170 sll $b,30,$tmp2
171 or $c,$b,$tmp1
172 srl $b,2,$b
173 and $d,$tmp1,$tmp1
174 add $xi,$e,$e
175 or $tmp1,$tmp0,$tmp1
176 or $tmp2,$b,$b
177 add $tmp1,$e,$e
178___
179}
180
181$code.=<<___ if ($bits==64);
182.register %g2,#scratch
183.register %g3,#scratch
184___
185$code.=<<___;
186.section ".text",#alloc,#execinstr
187
188.align 32
189.globl sha1_block_data_order
190sha1_block_data_order:
191 save %sp,-$frame,%sp
192 sllx $len,6,$len
193 add $inp,$len,$len
194
195 or %g0,1,$rot1m
196 sllx $rot1m,32,$rot1m
197 or $rot1m,1,$rot1m
198
199 ld [$ctx+0],$A
200 ld [$ctx+4],$B
201 ld [$ctx+8],$C
202 ld [$ctx+12],$D
203 ld [$ctx+16],$E
204 andn $inp,7,$tmp0
205
206 sethi %hi(0x5a827999),$K_00_19
207 or $K_00_19,%lo(0x5a827999),$K_00_19
208 sethi %hi(0x6ed9eba1),$K_20_39
209 or $K_20_39,%lo(0x6ed9eba1),$K_20_39
210 sethi %hi(0x8f1bbcdc),$K_40_59
211 or $K_40_59,%lo(0x8f1bbcdc),$K_40_59
212 sethi %hi(0xca62c1d6),$K_60_79
213 or $K_60_79,%lo(0xca62c1d6),$K_60_79
214
215.Lloop:
216 ldx [$tmp0+0],@X[0]
217 ldx [$tmp0+16],@X[2]
218 ldx [$tmp0+32],@X[4]
219 ldx [$tmp0+48],@X[6]
220 and $inp,7,$tmp1
221 ldx [$tmp0+8],@X[1]
222 sll $tmp1,3,$tmp1
223 ldx [$tmp0+24],@X[3]
224 subcc %g0,$tmp1,$tmp2 ! should be 64-$tmp1, but -$tmp1 works too
225 ldx [$tmp0+40],@X[5]
226 bz,pt %icc,.Laligned
227 ldx [$tmp0+56],@X[7]
228
229 sllx @X[0],$tmp1,@X[0]
230 ldx [$tmp0+64],$tmp64
231___
232for($i=0;$i<7;$i++)
233{ $code.=<<___;
234 srlx @X[$i+1],$tmp2,$Xi
235 sllx @X[$i+1],$tmp1,@X[$i+1]
236 or $Xi,@X[$i],@X[$i]
237___
238}
239$code.=<<___;
240 srlx $tmp64,$tmp2,$tmp64
241 or $tmp64,@X[7],@X[7]
242.Laligned:
243 srlx @X[0],32,$Xi
244___
245for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
246for (;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
247for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
248for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
249for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
250$code.=<<___;
251
252 ld [$ctx+0],@X[0]
253 ld [$ctx+4],@X[1]
254 ld [$ctx+8],@X[2]
255 ld [$ctx+12],@X[3]
256 add $inp,64,$inp
257 ld [$ctx+16],@X[4]
258 cmp $inp,$len
259
260 add $A,@X[0],$A
261 st $A,[$ctx+0]
262 add $B,@X[1],$B
263 st $B,[$ctx+4]
264 add $C,@X[2],$C
265 st $C,[$ctx+8]
266 add $D,@X[3],$D
267 st $D,[$ctx+12]
268 add $E,@X[4],$E
269 st $E,[$ctx+16]
270
271 bne `$bits==64?"%xcc":"%icc"`,.Lloop
272 andn $inp,7,$tmp0
273
274 ret
275 restore
276.type sha1_block_data_order,#function
277.size sha1_block_data_order,(.-sha1_block_data_order)
278.asciz "SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
279.align 4
280___
281
282$code =~ s/\`([^\`]*)\`/eval $1/gem;
283print $code;
284close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl b/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl
deleted file mode 100644
index e65291bbd9..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl
+++ /dev/null
@@ -1,601 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# January 2009
11#
12# Provided that UltraSPARC VIS instructions are pipe-lined(*) and
13# pairable(*) with IALU ones, offloading of Xupdate to the UltraSPARC
14# Graphic Unit would make it possible to achieve higher instruction-
15# level parallelism, ILP, and thus higher performance. It should be
16# explicitly noted that ILP is the keyword, and it means that this
17# code would be unsuitable for cores like UltraSPARC-Tx. The idea is
18# not really novel, Sun had VIS-powered implementation for a while.
19# Unlike Sun's implementation this one can process multiple unaligned
20# input blocks, and as such works as drop-in replacement for OpenSSL
21# sha1_block_data_order. Performance improvement was measured to be
22# 40% over pure IALU sha1-sparcv9.pl on UltraSPARC-IIi, but 12% on
23# UltraSPARC-III. See below for discussion...
24#
25# The module does not present direct interest for OpenSSL, because
26# it doesn't provide better performance on contemporary SPARCv9 CPUs,
27# UltraSPARC-Tx and SPARC64-V[II] to be specific. Those who feel they
28# absolutely must score on UltraSPARC-I-IV can simply replace
29# crypto/sha/asm/sha1-sparcv9.pl with this module.
30#
31# (*) "Pipe-lined" means that even if it takes several cycles to
32# complete, next instruction using same functional unit [but not
33# depending on the result of the current instruction] can start
34# execution without having to wait for the unit. "Pairable"
35# means that two [or more] independent instructions can be
36# issued at the very same time.
37
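# For reference, the scalar Xupdate recurrence that the VIS code below
# evaluates two words at a time (an illustration only, assuming a 64-bit
# perl for the 32-bit arithmetic):
sub xupdate_ref {
	my ($X, $i) = @_;                   # $X is a ref to the 16-word window
	my $t = $X->[($i+13)%16] ^ $X->[($i+8)%16] ^ $X->[($i+2)%16] ^ $X->[$i%16];
	$X->[$i%16] = (($t << 1) | ($t >> 31)) & 0xffffffff;   # ROTATE(t,1)
	return $X->[$i%16];
}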
38$bits=32;
39for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
40if ($bits==64) { $bias=2047; $frame=192; }
41else { $bias=0; $frame=112; }
42
43$output=shift;
44open STDOUT,">$output";
45
46$ctx="%i0";
47$inp="%i1";
48$len="%i2";
49$tmp0="%i3";
50$tmp1="%i4";
51$tmp2="%i5";
52$tmp3="%g5";
53
54$base="%g1";
55$align="%g4";
56$Xfer="%o5";
57$nXfer=$tmp3;
58$Xi="%o7";
59
60$A="%l0";
61$B="%l1";
62$C="%l2";
63$D="%l3";
64$E="%l4";
65@V=($A,$B,$C,$D,$E);
66
67$Actx="%o0";
68$Bctx="%o1";
69$Cctx="%o2";
70$Dctx="%o3";
71$Ectx="%o4";
72
73$fmul="%f32";
74$VK_00_19="%f34";
75$VK_20_39="%f36";
76$VK_40_59="%f38";
77$VK_60_79="%f40";
78@VK=($VK_00_19,$VK_20_39,$VK_40_59,$VK_60_79);
79@X=("%f0", "%f1", "%f2", "%f3", "%f4", "%f5", "%f6", "%f7",
80 "%f8", "%f9","%f10","%f11","%f12","%f13","%f14","%f15","%f16");
81
82# This is reference 2x-parallelized VIS-powered Xupdate procedure. It
83# covers even K_NN_MM addition...
84sub Xupdate {
85my ($i)=@_;
86my $K=@VK[($i+16)/20];
87my $j=($i+16)%16;
88
89# [ provided that GSR.alignaddr_offset is 5, $mul contains
90# 0x100ULL<<32|0x100 value and K_NN_MM are pre-loaded to
91# chosen registers... ]
92$code.=<<___;
93 fxors @X[($j+13)%16],@X[$j],@X[$j] !-1/-1/-1:X[0]^=X[13]
94 fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
95 fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
96 fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
97 faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
98 fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
99 fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
100 ![fxors %f15,%f2,%f2]
101 for %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
102 ![fxors %f0,%f3,%f3] !10/17/12:X[0] dependency
103 fpadd32 $K,@X[$j],%f20
104 std %f20,[$Xfer+`4*$j`]
105___
106# The numbers delimited with slash are the earliest possible dispatch
107# cycles for given instruction assuming 1 cycle latency for simple VIS
108# instructions, such as on UltraSPARC-I&II, 3 cycles latency, such as
109# on UltraSPARC-III&IV, and 2 cycles latency(*), respectively. Being
110# 2x-parallelized the procedure is "worth" 5, 8.5 or 6 ticks per SHA1
111# round. As [long as] FPU/VIS instructions are perfectly pairable with
112# IALU ones, the round timing is defined by the maximum between VIS
113# and IALU timings. The latter varies from round to round and averages
114# out at 6.25 ticks. This means that USI&II should operate at IALU
115# rate, while USIII&IV - at VIS rate. This explains why the performance
116# improvement varies among processors, given that the pure IALU
117# sha1-sparcv9.pl module exhibits virtually uniform performance of
118# ~9.3 cycles per SHA1 round. The timings mentioned above are theoretical
119# lower limits. Real-life performance was measured to be 6.6 cycles
120# per SHA1 round on USIIi and 8.3 on USIII. The latter is lower than
121# half-round VIS timing, because there are 16 Xupdate-free rounds,
122# which "push down" average theoretical timing to 8 cycles...
123
124# (*) SPARC64-V[II] was originally believed to have 2 cycles VIS
125# latency. Well, it might have, but it doesn't have dedicated
126# VIS-unit. Instead, VIS instructions are executed by other
127# functional units, ones used here - by IALU. This doesn't
128# improve effective ILP...
129}
130
131# The reference Xupdate procedure is then "strained" over *pairs* of
132# BODY_NN_MM and kind of modulo-scheduled in respect to X[n]^=X[n+13]
133# and K_NN_MM addition. It's "running" 15 rounds ahead, which leaves
134# plenty of room to amortize the read-after-write hazard, as well as
135# to fetch and align input for the next spin. The VIS instructions are
136# scheduled for latency of 2 cycles, because there are not enough IALU
137# instructions to schedule for latency of 3, while scheduling for 1
138# would give no gain on USI&II anyway.
139
140sub BODY_00_19 {
141my ($i,$a,$b,$c,$d,$e)=@_;
142my $j=$i&~1;
143my $k=($j+16+2)%16; # ahead reference
144my $l=($j+16-2)%16; # behind reference
145my $K=@VK[($j+16-2)/20];
146
147$j=($j+16)%16;
148
149$code.=<<___ if (!($i&1));
150 sll $a,5,$tmp0 !! $i
151 and $c,$b,$tmp3
152 ld [$Xfer+`4*($i%16)`],$Xi
153 fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
154 srl $a,27,$tmp1
155 add $tmp0,$e,$e
156 fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
157 sll $b,30,$tmp2
158 add $tmp1,$e,$e
159 andn $d,$b,$tmp1
160 add $Xi,$e,$e
161 fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
162 srl $b,2,$b
163 or $tmp1,$tmp3,$tmp1
164 or $tmp2,$b,$b
165 add $tmp1,$e,$e
166 faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
167___
168$code.=<<___ if ($i&1);
169 sll $a,5,$tmp0 !! $i
170 and $c,$b,$tmp3
171 ld [$Xfer+`4*($i%16)`],$Xi
172 fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
173 srl $a,27,$tmp1
174 add $tmp0,$e,$e
175 fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
176 sll $b,30,$tmp2
177 add $tmp1,$e,$e
178 fpadd32 $K,@X[$l],%f20 !
179 andn $d,$b,$tmp1
180 add $Xi,$e,$e
181 fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
182 srl $b,2,$b
183 or $tmp1,$tmp3,$tmp1
184 fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
185 or $tmp2,$b,$b
186 add $tmp1,$e,$e
187___
188$code.=<<___ if ($i&1 && $i>=2);
189 std %f20,[$Xfer+`4*$l`] !
190___
191}
192
193sub BODY_20_39 {
194my ($i,$a,$b,$c,$d,$e)=@_;
195my $j=$i&~1;
196my $k=($j+16+2)%16; # ahead reference
197my $l=($j+16-2)%16; # behind reference
198my $K=@VK[($j+16-2)/20];
199
200$j=($j+16)%16;
201
202$code.=<<___ if (!($i&1) && $i<64);
203 sll $a,5,$tmp0 !! $i
204 ld [$Xfer+`4*($i%16)`],$Xi
205 fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
206 srl $a,27,$tmp1
207 add $tmp0,$e,$e
208 fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
209 xor $c,$b,$tmp0
210 add $tmp1,$e,$e
211 sll $b,30,$tmp2
212 xor $d,$tmp0,$tmp1
213 fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
214 srl $b,2,$b
215 add $tmp1,$e,$e
216 or $tmp2,$b,$b
217 add $Xi,$e,$e
218 faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
219___
220$code.=<<___ if ($i&1 && $i<64);
221 sll $a,5,$tmp0 !! $i
222 ld [$Xfer+`4*($i%16)`],$Xi
223 fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
224 srl $a,27,$tmp1
225 add $tmp0,$e,$e
226 fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
227 xor $c,$b,$tmp0
228 add $tmp1,$e,$e
229 fpadd32 $K,@X[$l],%f20 !
230 sll $b,30,$tmp2
231 xor $d,$tmp0,$tmp1
232 fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
233 srl $b,2,$b
234 add $tmp1,$e,$e
235 fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
236 or $tmp2,$b,$b
237 add $Xi,$e,$e
238 std %f20,[$Xfer+`4*$l`] !
239___
240$code.=<<___ if ($i==64);
241 sll $a,5,$tmp0 !! $i
242 ld [$Xfer+`4*($i%16)`],$Xi
243 fpadd32 $K,@X[$l],%f20
244 srl $a,27,$tmp1
245 add $tmp0,$e,$e
246 xor $c,$b,$tmp0
247 add $tmp1,$e,$e
248 sll $b,30,$tmp2
249 xor $d,$tmp0,$tmp1
250 std %f20,[$Xfer+`4*$l`]
251 srl $b,2,$b
252 add $tmp1,$e,$e
253 or $tmp2,$b,$b
254 add $Xi,$e,$e
255___
256$code.=<<___ if ($i>64);
257 sll $a,5,$tmp0 !! $i
258 ld [$Xfer+`4*($i%16)`],$Xi
259 srl $a,27,$tmp1
260 add $tmp0,$e,$e
261 xor $c,$b,$tmp0
262 add $tmp1,$e,$e
263 sll $b,30,$tmp2
264 xor $d,$tmp0,$tmp1
265 srl $b,2,$b
266 add $tmp1,$e,$e
267 or $tmp2,$b,$b
268 add $Xi,$e,$e
269___
270}
271
272sub BODY_40_59 {
273my ($i,$a,$b,$c,$d,$e)=@_;
274my $j=$i&~1;
275my $k=($j+16+2)%16; # ahead reference
276my $l=($j+16-2)%16; # behind reference
277my $K=@VK[($j+16-2)/20];
278
279$j=($j+16)%16;
280
281$code.=<<___ if (!($i&1));
282 sll $a,5,$tmp0 !! $i
283 ld [$Xfer+`4*($i%16)`],$Xi
284 fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
285 srl $a,27,$tmp1
286 add $tmp0,$e,$e
287 fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
288 and $c,$b,$tmp0
289 add $tmp1,$e,$e
290 sll $b,30,$tmp2
291 or $c,$b,$tmp1
292 fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
293 srl $b,2,$b
294 and $d,$tmp1,$tmp1
295 add $Xi,$e,$e
296 or $tmp1,$tmp0,$tmp1
297 faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
298 or $tmp2,$b,$b
299 add $tmp1,$e,$e
300 fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
301___
302$code.=<<___ if ($i&1);
303 sll $a,5,$tmp0 !! $i
304 ld [$Xfer+`4*($i%16)`],$Xi
305 srl $a,27,$tmp1
306 add $tmp0,$e,$e
307 fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
308 and $c,$b,$tmp0
309 add $tmp1,$e,$e
310 fpadd32 $K,@X[$l],%f20 !
311 sll $b,30,$tmp2
312 or $c,$b,$tmp1
313 fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
314 srl $b,2,$b
315 and $d,$tmp1,$tmp1
316 fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
317 add $Xi,$e,$e
318 or $tmp1,$tmp0,$tmp1
319 or $tmp2,$b,$b
320 add $tmp1,$e,$e
321 std %f20,[$Xfer+`4*$l`] !
322___
323}
324
325# If there is more data to process, then we pre-fetch the data for
326# the next iteration in the last ten rounds...
327sub BODY_70_79 {
328my ($i,$a,$b,$c,$d,$e)=@_;
329my $j=$i&~1;
330my $m=($i%8)*2;
331
332$j=($j+16)%16;
333
334$code.=<<___ if ($i==70);
335 sll $a,5,$tmp0 !! $i
336 ld [$Xfer+`4*($i%16)`],$Xi
337 srl $a,27,$tmp1
338 add $tmp0,$e,$e
339 ldd [$inp+64],@X[0]
340 xor $c,$b,$tmp0
341 add $tmp1,$e,$e
342 sll $b,30,$tmp2
343 xor $d,$tmp0,$tmp1
344 srl $b,2,$b
345 add $tmp1,$e,$e
346 or $tmp2,$b,$b
347 add $Xi,$e,$e
348
349 and $inp,-64,$nXfer
350 inc 64,$inp
351 and $nXfer,255,$nXfer
352 alignaddr %g0,$align,%g0
353 add $base,$nXfer,$nXfer
354___
355$code.=<<___ if ($i==71);
356 sll $a,5,$tmp0 !! $i
357 ld [$Xfer+`4*($i%16)`],$Xi
358 srl $a,27,$tmp1
359 add $tmp0,$e,$e
360 xor $c,$b,$tmp0
361 add $tmp1,$e,$e
362 sll $b,30,$tmp2
363 xor $d,$tmp0,$tmp1
364 srl $b,2,$b
365 add $tmp1,$e,$e
366 or $tmp2,$b,$b
367 add $Xi,$e,$e
368___
369$code.=<<___ if ($i>=72);
370 faligndata @X[$m],@X[$m+2],@X[$m]
371 sll $a,5,$tmp0 !! $i
372 ld [$Xfer+`4*($i%16)`],$Xi
373 srl $a,27,$tmp1
374 add $tmp0,$e,$e
375 xor $c,$b,$tmp0
376 add $tmp1,$e,$e
377 fpadd32 $VK_00_19,@X[$m],%f20
378 sll $b,30,$tmp2
379 xor $d,$tmp0,$tmp1
380 srl $b,2,$b
381 add $tmp1,$e,$e
382 or $tmp2,$b,$b
383 add $Xi,$e,$e
384___
385$code.=<<___ if ($i<77);
386 ldd [$inp+`8*($i+1-70)`],@X[2*($i+1-70)]
387___
388$code.=<<___ if ($i==77); # redundant if $inp was aligned
389 add $align,63,$tmp0
390 and $tmp0,-8,$tmp0
391 ldd [$inp+$tmp0],@X[16]
392___
393$code.=<<___ if ($i>=72);
394 std %f20,[$nXfer+`4*$m`]
395___
396}
397
398$code.=<<___;
399.section ".text",#alloc,#execinstr
400
401.align 64
402vis_const:
403.long 0x5a827999,0x5a827999 ! K_00_19
404.long 0x6ed9eba1,0x6ed9eba1 ! K_20_39
405.long 0x8f1bbcdc,0x8f1bbcdc ! K_40_59
406.long 0xca62c1d6,0xca62c1d6 ! K_60_79
407.long 0x00000100,0x00000100
408.align 64
409.type vis_const,#object
410.size vis_const,(.-vis_const)
411
412.globl sha1_block_data_order
413sha1_block_data_order:
414 save %sp,-$frame,%sp
415 add %fp,$bias-256,$base
416
4171: call .+8
418 add %o7,vis_const-1b,$tmp0
419
420 ldd [$tmp0+0],$VK_00_19
421 ldd [$tmp0+8],$VK_20_39
422 ldd [$tmp0+16],$VK_40_59
423 ldd [$tmp0+24],$VK_60_79
424 ldd [$tmp0+32],$fmul
425
426 ld [$ctx+0],$Actx
427 and $base,-256,$base
428 ld [$ctx+4],$Bctx
429 sub $base,$bias+$frame,%sp
430 ld [$ctx+8],$Cctx
431 and $inp,7,$align
432 ld [$ctx+12],$Dctx
433 and $inp,-8,$inp
434 ld [$ctx+16],$Ectx
435
436 ! X[16] is maintained in FP register bank
437 alignaddr %g0,$align,%g0
438 ldd [$inp+0],@X[0]
439 sub $inp,-64,$Xfer
440 ldd [$inp+8],@X[2]
441 and $Xfer,-64,$Xfer
442 ldd [$inp+16],@X[4]
443 and $Xfer,255,$Xfer
444 ldd [$inp+24],@X[6]
445 add $base,$Xfer,$Xfer
446 ldd [$inp+32],@X[8]
447 ldd [$inp+40],@X[10]
448 ldd [$inp+48],@X[12]
449 brz,pt $align,.Laligned
450 ldd [$inp+56],@X[14]
451
452 ldd [$inp+64],@X[16]
453 faligndata @X[0],@X[2],@X[0]
454 faligndata @X[2],@X[4],@X[2]
455 faligndata @X[4],@X[6],@X[4]
456 faligndata @X[6],@X[8],@X[6]
457 faligndata @X[8],@X[10],@X[8]
458 faligndata @X[10],@X[12],@X[10]
459 faligndata @X[12],@X[14],@X[12]
460 faligndata @X[14],@X[16],@X[14]
461
462.Laligned:
463 mov 5,$tmp0
464 dec 1,$len
465 alignaddr %g0,$tmp0,%g0
466 fpadd32 $VK_00_19,@X[0],%f16
467 fpadd32 $VK_00_19,@X[2],%f18
468 fpadd32 $VK_00_19,@X[4],%f20
469 fpadd32 $VK_00_19,@X[6],%f22
470 fpadd32 $VK_00_19,@X[8],%f24
471 fpadd32 $VK_00_19,@X[10],%f26
472 fpadd32 $VK_00_19,@X[12],%f28
473 fpadd32 $VK_00_19,@X[14],%f30
474 std %f16,[$Xfer+0]
475 mov $Actx,$A
476 std %f18,[$Xfer+8]
477 mov $Bctx,$B
478 std %f20,[$Xfer+16]
479 mov $Cctx,$C
480 std %f22,[$Xfer+24]
481 mov $Dctx,$D
482 std %f24,[$Xfer+32]
483 mov $Ectx,$E
484 std %f26,[$Xfer+40]
485 fxors @X[13],@X[0],@X[0]
486 std %f28,[$Xfer+48]
487 ba .Loop
488 std %f30,[$Xfer+56]
489.align 32
490.Loop:
491___
492for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
493for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
494for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
495for (;$i<70;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
496$code.=<<___;
497 tst $len
498 bz,pn `$bits==32?"%icc":"%xcc"`,.Ltail
499 nop
500___
501for (;$i<80;$i++) { &BODY_70_79($i,@V); unshift(@V,pop(@V)); }
502$code.=<<___;
503 add $A,$Actx,$Actx
504 add $B,$Bctx,$Bctx
505 add $C,$Cctx,$Cctx
506 add $D,$Dctx,$Dctx
507 add $E,$Ectx,$Ectx
508 mov 5,$tmp0
509 fxors @X[13],@X[0],@X[0]
510 mov $Actx,$A
511 mov $Bctx,$B
512 mov $Cctx,$C
513 mov $Dctx,$D
514 mov $Ectx,$E
515 alignaddr %g0,$tmp0,%g0
516 dec 1,$len
517 ba .Loop
518 mov $nXfer,$Xfer
519
520.align 32
521.Ltail:
522___
523for($i=70;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
524$code.=<<___;
525 add $A,$Actx,$Actx
526 add $B,$Bctx,$Bctx
527 add $C,$Cctx,$Cctx
528 add $D,$Dctx,$Dctx
529 add $E,$Ectx,$Ectx
530
531 st $Actx,[$ctx+0]
532 st $Bctx,[$ctx+4]
533 st $Cctx,[$ctx+8]
534 st $Dctx,[$ctx+12]
535 st $Ectx,[$ctx+16]
536
537 ret
538 restore
539.type sha1_block_data_order,#function
540.size sha1_block_data_order,(.-sha1_block_data_order)
541.asciz "SHA1 block transform for SPARCv9a, CRYPTOGAMS by <appro\@openssl.org>"
542.align 4
543___
544
545# Purpose of these subroutines is to explicitly encode VIS instructions,
546# so that one can compile the module without having to specify VIS
547# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
548# The idea is to reserve the option of producing a "universal" binary and
549# let the programmer detect at run-time whether the current CPU is VIS capable.
550sub unvis {
551my ($mnemonic,$rs1,$rs2,$rd)=@_;
552my ($ref,$opf);
553my %visopf = ( "fmul8ulx16" => 0x037,
554 "faligndata" => 0x048,
555 "fpadd32" => 0x052,
556 "fxor" => 0x06c,
557 "fxors" => 0x06d );
558
559 $ref = "$mnemonic\t$rs1,$rs2,$rd";
560
561 if ($opf=$visopf{$mnemonic}) {
562 foreach ($rs1,$rs2,$rd) {
563 return $ref if (!/%f([0-9]{1,2})/);
564 $_=$1;
565 if ($1>=32) {
566 return $ref if ($1&1);
567 # re-encode for upper double register addressing
568 $_=($1|$1>>5)&31;
569 }
570 }
571
572 return sprintf ".word\t0x%08x !%s",
573 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
574 $ref;
575 } else {
576 return $ref;
577 }
578}
579sub unalignaddr {
580my ($mnemonic,$rs1,$rs2,$rd)=@_;
581my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
582my $ref="$mnemonic\t$rs1,$rs2,$rd";
583
584 foreach ($rs1,$rs2,$rd) {
585 if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
586 else { return $ref; }
587 }
588 return sprintf ".word\t0x%08x !%s",
589 0x81b00300|$rd<<25|$rs1<<14|$rs2,
590 $ref;
591}
592
593$code =~ s/\`([^\`]*)\`/eval $1/gem;
594$code =~ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),(%f[0-9]{1,2}),(%f[0-9]{1,2})/
595 &unvis($1,$2,$3,$4)
596 /gem;
597$code =~ s/\b(alignaddr)\s+(%[goli][0-7]),(%[goli][0-7]),(%[goli][0-7])/
598 &unalignaddr($1,$2,$3,$4)
599 /gem;
600print $code;
601close STDOUT;
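As a reference for the VIS hard-coding technique described in the comment above unvis(), the same .word template can be exercised in isolation. The following is a hypothetical standalone Perl sketch (the helper name encode_vis and the numeric register arguments are illustrative, not part of the module):

	#!/usr/bin/env perl
	# Sketch of the VIS hard-coding used by unvis(): build the instruction word
	# directly, so no VIS-capable assembler flags are needed at build time.
	my %visopf = ( "fmul8ulx16" => 0x037, "faligndata" => 0x048,
	               "fpadd32"    => 0x052, "fxor" => 0x06c, "fxors" => 0x06d );
	sub encode_vis {
	    my ($mnemonic,$rs1,$rs2,$rd) = @_;          # plain register numbers
	    return sprintf(".word\t0x%08x\t! %s %%f%d,%%f%d,%%f%d",
	        0x81b00000|$rd<<25|$rs1<<14|$visopf{$mnemonic}<<5|$rs2,
	        $mnemonic,$rs1,$rs2,$rd);
	}
	print encode_vis("faligndata",0,2,4), "\n";     # faligndata %f0,%f2,%f4

The real unvis() additionally parses the %fN operands and re-encodes upper double registers (%f32 and above) before applying the same template.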
diff --git a/src/lib/libcrypto/sha/asm/sha1-thumb.pl b/src/lib/libcrypto/sha/asm/sha1-thumb.pl
deleted file mode 100644
index 7c9ea9b029..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-thumb.pl
+++ /dev/null
@@ -1,259 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# sha1_block for Thumb.
11#
12# January 2007.
13#
14# The code does not present direct interest to OpenSSL, because of low
15# performance. Its purpose is to establish _size_ benchmark. Pretty
16# useless one I must say, because 30% or 88 bytes larger ARMv4 code
17# [available on demand] is almost _twice_ as fast. It should also be
18# noted that in-lining of .Lcommon and .Lrotate improves performance
19# by over 40%, while code increases by only 10% or 32 bytes. But once
20# again, the goal was to establish _size_ benchmark, not performance.
21
22$output=shift;
23open STDOUT,">$output";
24
25$inline=0;
26#$cheat_on_binutils=1;
27
28$t0="r0";
29$t1="r1";
30$t2="r2";
31$a="r3";
32$b="r4";
33$c="r5";
34$d="r6";
35$e="r7";
36$K="r8"; # "upper" registers can be used in add/sub and mov insns
37$ctx="r9";
38$inp="r10";
39$len="r11";
40$Xi="r12";
41
42sub common {
43<<___;
44 sub $t0,#4
45 ldr $t1,[$t0]
46 add $e,$K @ E+=K_xx_xx
47 lsl $t2,$a,#5
48 add $t2,$e
49 lsr $e,$a,#27
50 add $t2,$e @ E+=ROR(A,27)
51 add $t2,$t1 @ E+=X[i]
52___
53}
54sub rotate {
55<<___;
56 mov $e,$d @ E=D
57 mov $d,$c @ D=C
58 lsl $c,$b,#30
59 lsr $b,$b,#2
60 orr $c,$b @ C=ROR(B,2)
61 mov $b,$a @ B=A
62 add $a,$t2,$t1 @ A=E+F_xx_xx(B,C,D)
63___
64}
65
66sub BODY_00_19 {
67$code.=$inline?&common():"\tbl .Lcommon\n";
68$code.=<<___;
69 mov $t1,$c
70 eor $t1,$d
71 and $t1,$b
72 eor $t1,$d @ F_00_19(B,C,D)
73___
74$code.=$inline?&rotate():"\tbl .Lrotate\n";
75}
76
77sub BODY_20_39 {
78$code.=$inline?&common():"\tbl .Lcommon\n";
79$code.=<<___;
80 mov $t1,$b
81 eor $t1,$c
82 eor $t1,$d @ F_20_39(B,C,D)
83___
84$code.=$inline?&rotate():"\tbl .Lrotate\n";
85}
86
87sub BODY_40_59 {
88$code.=$inline?&common():"\tbl .Lcommon\n";
89$code.=<<___;
90 mov $t1,$b
91 and $t1,$c
92 mov $e,$b
93 orr $e,$c
94 and $e,$d
95 orr $t1,$e @ F_40_59(B,C,D)
96___
97$code.=$inline?&rotate():"\tbl .Lrotate\n";
98}
99
100$code=<<___;
101.text
102.code 16
103
104.global sha1_block_data_order
105.type sha1_block_data_order,%function
106
107.align 2
108sha1_block_data_order:
109___
110if ($cheat_on_binutils) {
111$code.=<<___;
112.code 32
113 add r3,pc,#1
114 bx r3 @ switch to Thumb ISA
115.code 16
116___
117}
118$code.=<<___;
119 push {r4-r7}
120 mov r3,r8
121 mov r4,r9
122 mov r5,r10
123 mov r6,r11
124 mov r7,r12
125 push {r3-r7,lr}
126 lsl r2,#6
127 mov $ctx,r0 @ save context
128 mov $inp,r1 @ save inp
129 mov $len,r2 @ save len
130 add $len,$inp @ $len to point at inp end
131
132.Lloop:
133 mov $Xi,sp
134 mov $t2,sp
135 sub $t2,#16*4 @ [3]
136.LXload:
137 ldrb $a,[$t1,#0] @ $t1 is r1 and holds inp
138 ldrb $b,[$t1,#1]
139 ldrb $c,[$t1,#2]
140 ldrb $d,[$t1,#3]
141 lsl $a,#24
142 lsl $b,#16
143 lsl $c,#8
144 orr $a,$b
145 orr $a,$c
146 orr $a,$d
147 add $t1,#4
148 push {$a}
149 cmp sp,$t2
150 bne .LXload @ [+14*16]
151
152 mov $inp,$t1 @ update $inp
153 sub $t2,#32*4
154 sub $t2,#32*4
155 mov $e,#31 @ [+4]
156.LXupdate:
157 ldr $a,[sp,#15*4]
158 ldr $b,[sp,#13*4]
159 ldr $c,[sp,#7*4]
160 ldr $d,[sp,#2*4]
161 eor $a,$b
162 eor $a,$c
163 eor $a,$d
164 ror $a,$e
165 push {$a}
166 cmp sp,$t2
167 bne .LXupdate @ [+(11+1)*64]
168
169 ldmia $t0!,{$a,$b,$c,$d,$e} @ $t0 is r0 and holds ctx
170 mov $t0,$Xi
171
172 ldr $t2,.LK_00_19
173 mov $t1,$t0
174 sub $t1,#20*4
175 mov $Xi,$t1
176 mov $K,$t2 @ [+7+4]
177.L_00_19:
178___
179 &BODY_00_19();
180$code.=<<___;
181 cmp $Xi,$t0
182 bne .L_00_19 @ [+(2+9+4+2+8+2)*20]
183
184 ldr $t2,.LK_20_39
185 mov $t1,$t0
186 sub $t1,#20*4
187 mov $Xi,$t1
188 mov $K,$t2 @ [+5]
189.L_20_39_or_60_79:
190___
191 &BODY_20_39();
192$code.=<<___;
193 cmp $Xi,$t0
194 bne .L_20_39_or_60_79 @ [+(2+9+3+2+8+2)*20*2]
195 cmp sp,$t0
196 beq .Ldone @ [+2]
197
198 ldr $t2,.LK_40_59
199 mov $t1,$t0
200 sub $t1,#20*4
201 mov $Xi,$t1
202 mov $K,$t2 @ [+5]
203.L_40_59:
204___
205 &BODY_40_59();
206$code.=<<___;
207 cmp $Xi,$t0
208 bne .L_40_59 @ [+(2+9+6+2+8+2)*20]
209
210 ldr $t2,.LK_60_79
211 mov $Xi,sp
212 mov $K,$t2
213 b .L_20_39_or_60_79 @ [+4]
214.Ldone:
215 mov $t0,$ctx
216 ldr $t1,[$t0,#0]
217 ldr $t2,[$t0,#4]
218 add $a,$t1
219 ldr $t1,[$t0,#8]
220 add $b,$t2
221 ldr $t2,[$t0,#12]
222 add $c,$t1
223 ldr $t1,[$t0,#16]
224 add $d,$t2
225 add $e,$t1
226 stmia $t0!,{$a,$b,$c,$d,$e} @ [+20]
227
228 add sp,#80*4 @ deallocate stack frame
229 mov $t0,$ctx @ restore ctx
230 mov $t1,$inp @ restore inp
231 cmp $t1,$len
232 beq .Lexit
233 b .Lloop @ [+6] total 3212 cycles
234.Lexit:
235 pop {r2-r7}
236 mov r8,r2
237 mov r9,r3
238 mov r10,r4
239 mov r11,r5
240 mov r12,r6
241 mov lr,r7
242 pop {r4-r7}
243 bx lr
244.align 2
245___
246$code.=".Lcommon:\n".&common()."\tmov pc,lr\n" if (!$inline);
247$code.=".Lrotate:\n".&rotate()."\tmov pc,lr\n" if (!$inline);
248$code.=<<___;
249.align 2
250.LK_00_19: .word 0x5a827999
251.LK_20_39: .word 0x6ed9eba1
252.LK_40_59: .word 0x8f1bbcdc
253.LK_60_79: .word 0xca62c1d6
254.size sha1_block_data_order,.-sha1_block_data_order
255.asciz "SHA1 block transform for Thumb, CRYPTOGAMS by <appro\@openssl.org>"
256___
257
258print $code;
259close STDOUT; # enforce flush
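The BODY_00_19/BODY_20_39/BODY_40_59 sequences above are straight-line Thumb encodings of the standard SHA-1 boolean round functions. A plain-Perl reference model of those functions, assuming 32-bit inputs (sketch only, not emitted by the generator):

	# SHA-1 round functions as computed by the Thumb sequences above:
	# F_00_19 via ((C^D)&B)^D, F_40_59 via (B&C)|((B|C)&D).
	sub F_00_19 { my ($b,$c,$d)=@_; return (($c ^ $d) & $b) ^ $d; }        # Ch
	sub F_20_39 { my ($b,$c,$d)=@_; return $b ^ $c ^ $d; }                 # Parity
	sub F_40_59 { my ($b,$c,$d)=@_; return ($b & $c) | (($b | $c) & $d); } # Maj
	sub rol32   { my ($x,$n)=@_; return (($x << $n) | ($x >> (32-$n))) & 0xffffffff; }
	# One round: E += rol32(A,5) + F(B,C,D) + X[i] + K, then B = rol32(B,30)
	# and the registers rotate A->B->C->D->E.

Each .Lcommon call above contributes the rol32(A,5)+X[i]+K part, each F_xx_xx block the boolean function, and .Lrotate the register rotation.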
diff --git a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
deleted file mode 100755
index f15c7ec39b..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
+++ /dev/null
@@ -1,1261 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# sha1_block procedure for x86_64.
11#
12# It was brought to my attention that on EM64T compiler-generated code
13# was far behind 32-bit assembler implementation. This is unlike on
14# Opteron where compiler-generated code was only 15% behind 32-bit
15# assembler, which originally made it hard to motivate the effort.
16# There was suggestion to mechanically translate 32-bit code, but I
17# dismissed it, reasoning that x86_64 offers enough register bank
18# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
19# implementation:-) However! While 64-bit code does perform better
20# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
21# x86_64 does offer larger *addressable* bank, but out-of-order core
22# reaches for even more registers through dynamic aliasing, and EM64T
23# core must have managed to run-time optimize even 32-bit code just as
24# well as 64-bit one. Performance improvement is summarized in the
25# following table:
26#
27# gcc 3.4 32-bit asm cycles/byte
28# Opteron +45% +20% 6.8
29# Xeon P4 +65% +0% 9.9
30# Core2 +60% +10% 7.0
31
32# August 2009.
33#
34# The code was revised to minimize code size and to maximize
35# "distance" between instructions producing input to 'lea'
36# instruction and the 'lea' instruction itself, which is essential
37# for Intel Atom core.
38
39# October 2010.
40#
41# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
42# is to offload message schedule denoted by Wt in NIST specification,
43# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
44# for background and implementation details. The only difference from
45# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
46# to free temporary registers.
47
48# April 2011.
49#
50# Add AVX code path. See sha1-586.pl for further information.
51
52######################################################################
53# Current performance is summarized in following table. Numbers are
54# CPU clock cycles spent to process single byte (less is better).
55#
56# x86_64 SSSE3 AVX
57# P4 9.8 -
58# Opteron 6.6 -
59# Core2 6.7 6.1/+10% -
60# Atom 11.0 9.7/+13% -
61# Westmere 7.1 5.6/+27% -
62# Sandy Bridge 7.9 6.3/+25% 5.2/+51%
63
64$flavour = shift;
65$output = shift;
66if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
67
68$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
69
70$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
71( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
72( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
73die "can't locate x86_64-xlate.pl";
74
75$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
76 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
77 $1>=2.19);
78$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
79 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
80 $1>=2.09);
81$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
82 `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
83 $1>=10);
84
85open OUT,"| \"$^X\" $xlate $flavour $output";
86*STDOUT=*OUT;
87
88$ctx="%rdi"; # 1st arg
89$inp="%rsi"; # 2nd arg
90$num="%rdx"; # 3rd arg
91
92# reassign arguments in order to produce more compact code
93$ctx="%r8";
94$inp="%r9";
95$num="%r10";
96
97$t0="%eax";
98$t1="%ebx";
99$t2="%ecx";
100@xi=("%edx","%ebp");
101$A="%esi";
102$B="%edi";
103$C="%r11d";
104$D="%r12d";
105$E="%r13d";
106
107@V=($A,$B,$C,$D,$E);
108
109sub BODY_00_19 {
110my ($i,$a,$b,$c,$d,$e)=@_;
111my $j=$i+1;
112$code.=<<___ if ($i==0);
113 mov `4*$i`($inp),$xi[0]
114 bswap $xi[0]
115 mov $xi[0],`4*$i`(%rsp)
116___
117$code.=<<___ if ($i<15);
118 mov $c,$t0
119 mov `4*$j`($inp),$xi[1]
120 mov $a,$t2
121 xor $d,$t0
122 bswap $xi[1]
123 rol \$5,$t2
124 lea 0x5a827999($xi[0],$e),$e
125 and $b,$t0
126 mov $xi[1],`4*$j`(%rsp)
127 add $t2,$e
128 xor $d,$t0
129 rol \$30,$b
130 add $t0,$e
131___
132$code.=<<___ if ($i>=15);
133 mov `4*($j%16)`(%rsp),$xi[1]
134 mov $c,$t0
135 mov $a,$t2
136 xor `4*(($j+2)%16)`(%rsp),$xi[1]
137 xor $d,$t0
138 rol \$5,$t2
139 xor `4*(($j+8)%16)`(%rsp),$xi[1]
140 and $b,$t0
141 lea 0x5a827999($xi[0],$e),$e
142 xor `4*(($j+13)%16)`(%rsp),$xi[1]
143 xor $d,$t0
144 rol \$1,$xi[1]
145 add $t2,$e
146 rol \$30,$b
147 mov $xi[1],`4*($j%16)`(%rsp)
148 add $t0,$e
149___
150unshift(@xi,pop(@xi));
151}
152
153sub BODY_20_39 {
154my ($i,$a,$b,$c,$d,$e)=@_;
155my $j=$i+1;
156my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
157$code.=<<___ if ($i<79);
158 mov `4*($j%16)`(%rsp),$xi[1]
159 mov $c,$t0
160 mov $a,$t2
161 xor `4*(($j+2)%16)`(%rsp),$xi[1]
162 xor $b,$t0
163 rol \$5,$t2
164 lea $K($xi[0],$e),$e
165 xor `4*(($j+8)%16)`(%rsp),$xi[1]
166 xor $d,$t0
167 add $t2,$e
168 xor `4*(($j+13)%16)`(%rsp),$xi[1]
169 rol \$30,$b
170 add $t0,$e
171 rol \$1,$xi[1]
172___
173$code.=<<___ if ($i<76);
174 mov $xi[1],`4*($j%16)`(%rsp)
175___
176$code.=<<___ if ($i==79);
177 mov $c,$t0
178 mov $a,$t2
179 xor $b,$t0
180 lea $K($xi[0],$e),$e
181 rol \$5,$t2
182 xor $d,$t0
183 add $t2,$e
184 rol \$30,$b
185 add $t0,$e
186___
187unshift(@xi,pop(@xi));
188}
189
190sub BODY_40_59 {
191my ($i,$a,$b,$c,$d,$e)=@_;
192my $j=$i+1;
193$code.=<<___;
194 mov `4*($j%16)`(%rsp),$xi[1]
195 mov $c,$t0
196 mov $c,$t1
197 xor `4*(($j+2)%16)`(%rsp),$xi[1]
198 and $d,$t0
199 mov $a,$t2
200 xor `4*(($j+8)%16)`(%rsp),$xi[1]
201 xor $d,$t1
202 lea 0x8f1bbcdc($xi[0],$e),$e
203 rol \$5,$t2
204 xor `4*(($j+13)%16)`(%rsp),$xi[1]
205 add $t0,$e
206 and $b,$t1
207 rol \$1,$xi[1]
208 add $t1,$e
209 rol \$30,$b
210 mov $xi[1],`4*($j%16)`(%rsp)
211 add $t2,$e
212___
213unshift(@xi,pop(@xi));
214}
215
216$code.=<<___;
217.text
218.extern OPENSSL_ia32cap_P
219
220.globl sha1_block_data_order
221.type sha1_block_data_order,\@function,3
222.align 16
223sha1_block_data_order:
224 mov OPENSSL_ia32cap_P+0(%rip),%r9d
225 mov OPENSSL_ia32cap_P+4(%rip),%r8d
226 test \$`1<<9`,%r8d # check SSSE3 bit
227 jz .Lialu
228___
229$code.=<<___ if ($avx);
230 and \$`1<<28`,%r8d # mask AVX bit
231 and \$`1<<30`,%r9d # mask "Intel CPU" bit
232 or %r9d,%r8d
233 cmp \$`1<<28|1<<30`,%r8d
234 je _avx_shortcut
235___
236$code.=<<___;
237 jmp _ssse3_shortcut
238
239.align 16
240.Lialu:
241 push %rbx
242 push %rbp
243 push %r12
244 push %r13
245 mov %rsp,%r11
246 mov %rdi,$ctx # reassigned argument
247 sub \$`8+16*4`,%rsp
248 mov %rsi,$inp # reassigned argument
249 and \$-64,%rsp
250 mov %rdx,$num # reassigned argument
251 mov %r11,`16*4`(%rsp)
252.Lprologue:
253
254 mov 0($ctx),$A
255 mov 4($ctx),$B
256 mov 8($ctx),$C
257 mov 12($ctx),$D
258 mov 16($ctx),$E
259 jmp .Lloop
260
261.align 16
262.Lloop:
263___
264for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
265for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
266for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
267for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
268$code.=<<___;
269 add 0($ctx),$A
270 add 4($ctx),$B
271 add 8($ctx),$C
272 add 12($ctx),$D
273 add 16($ctx),$E
274 mov $A,0($ctx)
275 mov $B,4($ctx)
276 mov $C,8($ctx)
277 mov $D,12($ctx)
278 mov $E,16($ctx)
279
280 sub \$1,$num
281 lea `16*4`($inp),$inp
282 jnz .Lloop
283
284 mov `16*4`(%rsp),%rsi
285 mov (%rsi),%r13
286 mov 8(%rsi),%r12
287 mov 16(%rsi),%rbp
288 mov 24(%rsi),%rbx
289 lea 32(%rsi),%rsp
290.Lepilogue:
291 ret
292.size sha1_block_data_order,.-sha1_block_data_order
293___
294{{{
295my $Xi=4;
296my @X=map("%xmm$_",(4..7,0..3));
297my @Tx=map("%xmm$_",(8..10));
298my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
299my @T=("%esi","%edi");
300my $j=0;
301my $K_XX_XX="%r11";
302
303my $_rol=sub { &rol(@_) };
304my $_ror=sub { &ror(@_) };
305
306$code.=<<___;
307.type sha1_block_data_order_ssse3,\@function,3
308.align 16
309sha1_block_data_order_ssse3:
310_ssse3_shortcut:
311 push %rbx
312 push %rbp
313 push %r12
314 lea `-64-($win64?5*16:0)`(%rsp),%rsp
315___
316$code.=<<___ if ($win64);
317 movaps %xmm6,64+0(%rsp)
318 movaps %xmm7,64+16(%rsp)
319 movaps %xmm8,64+32(%rsp)
320 movaps %xmm9,64+48(%rsp)
321 movaps %xmm10,64+64(%rsp)
322.Lprologue_ssse3:
323___
324$code.=<<___;
325 mov %rdi,$ctx # reassigned argument
326 mov %rsi,$inp # reassigned argument
327 mov %rdx,$num # reassigned argument
328
329 shl \$6,$num
330 add $inp,$num
331 lea K_XX_XX(%rip),$K_XX_XX
332
333 mov 0($ctx),$A # load context
334 mov 4($ctx),$B
335 mov 8($ctx),$C
336 mov 12($ctx),$D
337 mov $B,@T[0] # magic seed
338 mov 16($ctx),$E
339
340 movdqa 64($K_XX_XX),@X[2] # pbswap mask
341 movdqa 0($K_XX_XX),@Tx[1] # K_00_19
342 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
343 movdqu 16($inp),@X[-3&7]
344 movdqu 32($inp),@X[-2&7]
345 movdqu 48($inp),@X[-1&7]
346 pshufb @X[2],@X[-4&7] # byte swap
347 add \$64,$inp
348 pshufb @X[2],@X[-3&7]
349 pshufb @X[2],@X[-2&7]
350 pshufb @X[2],@X[-1&7]
351 paddd @Tx[1],@X[-4&7] # add K_00_19
352 paddd @Tx[1],@X[-3&7]
353 paddd @Tx[1],@X[-2&7]
354 movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
355 psubd @Tx[1],@X[-4&7] # restore X[]
356 movdqa @X[-3&7],16(%rsp)
357 psubd @Tx[1],@X[-3&7]
358 movdqa @X[-2&7],32(%rsp)
359 psubd @Tx[1],@X[-2&7]
360 jmp .Loop_ssse3
361___
362
363sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
364{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
365 my $arg = pop;
366 $arg = "\$$arg" if ($arg*1 eq $arg);
367 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
368}
369
370sub Xupdate_ssse3_16_31()	# recall that $Xi starts with 4
371{ use integer;
372 my $body = shift;
373 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
374 my ($a,$b,$c,$d,$e);
375
376 &movdqa (@X[0],@X[-3&7]);
377 eval(shift(@insns));
378 eval(shift(@insns));
379 &movdqa (@Tx[0],@X[-1&7]);
380 &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
381 eval(shift(@insns));
382 eval(shift(@insns));
383
384 &paddd (@Tx[1],@X[-1&7]);
385 eval(shift(@insns));
386 eval(shift(@insns));
387 &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
388 eval(shift(@insns));
389 eval(shift(@insns));
390 &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
391 eval(shift(@insns));
392 eval(shift(@insns));
393
394 &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
395 eval(shift(@insns));
396 eval(shift(@insns));
397 eval(shift(@insns));
398 eval(shift(@insns));
399
400 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
401 eval(shift(@insns));
402 eval(shift(@insns));
403 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
404 eval(shift(@insns));
405 eval(shift(@insns));
406
407 &movdqa (@Tx[2],@X[0]);
408 &movdqa (@Tx[0],@X[0]);
409 eval(shift(@insns));
410 eval(shift(@insns));
411 eval(shift(@insns));
412 eval(shift(@insns));
413
414 &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
415 &paddd (@X[0],@X[0]);
416 eval(shift(@insns));
417 eval(shift(@insns));
418 eval(shift(@insns));
419 eval(shift(@insns));
420
421 &psrld (@Tx[0],31);
422 eval(shift(@insns));
423 eval(shift(@insns));
424 &movdqa (@Tx[1],@Tx[2]);
425 eval(shift(@insns));
426 eval(shift(@insns));
427
428 &psrld (@Tx[2],30);
429 &por (@X[0],@Tx[0]); # "X[0]"<<<=1
430 eval(shift(@insns));
431 eval(shift(@insns));
432 eval(shift(@insns));
433 eval(shift(@insns));
434
435 &pslld (@Tx[1],2);
436 &pxor (@X[0],@Tx[2]);
437 eval(shift(@insns));
438 eval(shift(@insns));
439 &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
440 eval(shift(@insns));
441 eval(shift(@insns));
442
443 &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
444
445 foreach (@insns) { eval; } # remaining instructions [if any]
446
447 $Xi++; push(@X,shift(@X)); # "rotate" X[]
448 push(@Tx,shift(@Tx));
449}
450
451sub Xupdate_ssse3_32_79()
452{ use integer;
453 my $body = shift;
454 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
455 my ($a,$b,$c,$d,$e);
456
457 &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
458 eval(shift(@insns)); # body_20_39
459 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
460 &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
461 eval(shift(@insns));
462 eval(shift(@insns));
463 eval(shift(@insns)); # rol
464
465 &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
466 eval(shift(@insns));
467 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
468 if ($Xi%5) {
469 &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
470 } else { # ... or load next one
471 &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
472 }
473 &paddd (@Tx[1],@X[-1&7]);
474 eval(shift(@insns)); # ror
475 eval(shift(@insns));
476
477 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
478 eval(shift(@insns)); # body_20_39
479 eval(shift(@insns));
480 eval(shift(@insns));
481 eval(shift(@insns)); # rol
482
483 &movdqa (@Tx[0],@X[0]);
484 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
485 eval(shift(@insns));
486 eval(shift(@insns));
487 eval(shift(@insns)); # ror
488 eval(shift(@insns));
489
490 &pslld (@X[0],2);
491 eval(shift(@insns)); # body_20_39
492 eval(shift(@insns));
493 &psrld (@Tx[0],30);
494 eval(shift(@insns));
495 eval(shift(@insns)); # rol
496 eval(shift(@insns));
497 eval(shift(@insns));
498 eval(shift(@insns)); # ror
499 eval(shift(@insns));
500
501 &por (@X[0],@Tx[0]); # "X[0]"<<<=2
502 eval(shift(@insns)); # body_20_39
503 eval(shift(@insns));
504 &movdqa (@Tx[1],@X[0]) if ($Xi<19);
505 eval(shift(@insns));
506 eval(shift(@insns)); # rol
507 eval(shift(@insns));
508 eval(shift(@insns));
509 eval(shift(@insns)); # rol
510 eval(shift(@insns));
511
512 foreach (@insns) { eval; } # remaining instructions
513
514 $Xi++; push(@X,shift(@X)); # "rotate" X[]
515 push(@Tx,shift(@Tx));
516}
517
518sub Xuplast_ssse3_80()
519{ use integer;
520 my $body = shift;
521 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
522 my ($a,$b,$c,$d,$e);
523
524 eval(shift(@insns));
525 &paddd (@Tx[1],@X[-1&7]);
526 eval(shift(@insns));
527 eval(shift(@insns));
528 eval(shift(@insns));
529 eval(shift(@insns));
530
531 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
532
533 foreach (@insns) { eval; } # remaining instructions
534
535 &cmp ($inp,$num);
536 &je (".Ldone_ssse3");
537
538 unshift(@Tx,pop(@Tx));
539
540 &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
541 &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
542 &movdqu (@X[-4&7],"0($inp)"); # load input
543 &movdqu (@X[-3&7],"16($inp)");
544 &movdqu (@X[-2&7],"32($inp)");
545 &movdqu (@X[-1&7],"48($inp)");
546 &pshufb (@X[-4&7],@X[2]); # byte swap
547 &add ($inp,64);
548
549 $Xi=0;
550}
551
552sub Xloop_ssse3()
553{ use integer;
554 my $body = shift;
555 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
556 my ($a,$b,$c,$d,$e);
557
558 eval(shift(@insns));
559 eval(shift(@insns));
560 &pshufb (@X[($Xi-3)&7],@X[2]);
561 eval(shift(@insns));
562 eval(shift(@insns));
563 &paddd (@X[($Xi-4)&7],@Tx[1]);
564 eval(shift(@insns));
565 eval(shift(@insns));
566 eval(shift(@insns));
567 eval(shift(@insns));
568 &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
569 eval(shift(@insns));
570 eval(shift(@insns));
571 &psubd (@X[($Xi-4)&7],@Tx[1]);
572
573 foreach (@insns) { eval; }
574 $Xi++;
575}
576
577sub Xtail_ssse3()
578{ use integer;
579 my $body = shift;
580 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
581 my ($a,$b,$c,$d,$e);
582
583 foreach (@insns) { eval; }
584}
585
586sub body_00_19 () {
587 (
588 '($a,$b,$c,$d,$e)=@V;'.
589 '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
590 '&xor ($c,$d);',
591 '&mov (@T[1],$a);', # $b in next round
592 '&$_rol ($a,5);',
593 '&and (@T[0],$c);', # ($b&($c^$d))
594 '&xor ($c,$d);', # restore $c
595 '&xor (@T[0],$d);',
596 '&add ($e,$a);',
597 '&$_ror ($b,$j?7:2);', # $b>>>2
598 '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
599 );
600}
601
602sub body_20_39 () {
603 (
604 '($a,$b,$c,$d,$e)=@V;'.
605 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
606 '&xor (@T[0],$d);', # ($b^$d)
607 '&mov (@T[1],$a);', # $b in next round
608 '&$_rol ($a,5);',
609 '&xor (@T[0],$c);', # ($b^$d^$c)
610 '&add ($e,$a);',
611 '&$_ror ($b,7);', # $b>>>2
612 '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
613 );
614}
615
616sub body_40_59 () {
617 (
618 '($a,$b,$c,$d,$e)=@V;'.
619 '&mov (@T[1],$c);',
620 '&xor ($c,$d);',
621 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
622 '&and (@T[1],$d);',
623 '&and (@T[0],$c);', # ($b&($c^$d))
624 '&$_ror ($b,7);', # $b>>>2
625 '&add ($e,@T[1]);',
626 '&mov (@T[1],$a);', # $b in next round
627 '&$_rol ($a,5);',
628 '&add ($e,@T[0]);',
629 '&xor ($c,$d);', # restore $c
630 '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
631 );
632}
633$code.=<<___;
634.align 16
635.Loop_ssse3:
636___
637 &Xupdate_ssse3_16_31(\&body_00_19);
638 &Xupdate_ssse3_16_31(\&body_00_19);
639 &Xupdate_ssse3_16_31(\&body_00_19);
640 &Xupdate_ssse3_16_31(\&body_00_19);
641 &Xupdate_ssse3_32_79(\&body_00_19);
642 &Xupdate_ssse3_32_79(\&body_20_39);
643 &Xupdate_ssse3_32_79(\&body_20_39);
644 &Xupdate_ssse3_32_79(\&body_20_39);
645 &Xupdate_ssse3_32_79(\&body_20_39);
646 &Xupdate_ssse3_32_79(\&body_20_39);
647 &Xupdate_ssse3_32_79(\&body_40_59);
648 &Xupdate_ssse3_32_79(\&body_40_59);
649 &Xupdate_ssse3_32_79(\&body_40_59);
650 &Xupdate_ssse3_32_79(\&body_40_59);
651 &Xupdate_ssse3_32_79(\&body_40_59);
652 &Xupdate_ssse3_32_79(\&body_20_39);
653 &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
654
655 $saved_j=$j; @saved_V=@V;
656
657 &Xloop_ssse3(\&body_20_39);
658 &Xloop_ssse3(\&body_20_39);
659 &Xloop_ssse3(\&body_20_39);
660
661$code.=<<___;
662 add 0($ctx),$A # update context
663 add 4($ctx),@T[0]
664 add 8($ctx),$C
665 add 12($ctx),$D
666 mov $A,0($ctx)
667 add 16($ctx),$E
668 mov @T[0],4($ctx)
669 mov @T[0],$B # magic seed
670 mov $C,8($ctx)
671 mov $D,12($ctx)
672 mov $E,16($ctx)
673 jmp .Loop_ssse3
674
675.align 16
676.Ldone_ssse3:
677___
678 $j=$saved_j; @V=@saved_V;
679
680 &Xtail_ssse3(\&body_20_39);
681 &Xtail_ssse3(\&body_20_39);
682 &Xtail_ssse3(\&body_20_39);
683
684$code.=<<___;
685 add 0($ctx),$A # update context
686 add 4($ctx),@T[0]
687 add 8($ctx),$C
688 mov $A,0($ctx)
689 add 12($ctx),$D
690 mov @T[0],4($ctx)
691 add 16($ctx),$E
692 mov $C,8($ctx)
693 mov $D,12($ctx)
694 mov $E,16($ctx)
695___
696$code.=<<___ if ($win64);
697 movaps 64+0(%rsp),%xmm6
698 movaps 64+16(%rsp),%xmm7
699 movaps 64+32(%rsp),%xmm8
700 movaps 64+48(%rsp),%xmm9
701 movaps 64+64(%rsp),%xmm10
702___
703$code.=<<___;
704 lea `64+($win64?5*16:0)`(%rsp),%rsi
705 mov 0(%rsi),%r12
706 mov 8(%rsi),%rbp
707 mov 16(%rsi),%rbx
708 lea 24(%rsi),%rsp
709.Lepilogue_ssse3:
710 ret
711.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
712___
713
714if ($avx) {
715my $Xi=4;
716my @X=map("%xmm$_",(4..7,0..3));
717my @Tx=map("%xmm$_",(8..10));
718my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
719my @T=("%esi","%edi");
720my $j=0;
721my $K_XX_XX="%r11";
722
723my $_rol=sub { &shld(@_[0],@_) };
724my $_ror=sub { &shrd(@_[0],@_) };
725
726$code.=<<___;
727.type sha1_block_data_order_avx,\@function,3
728.align 16
729sha1_block_data_order_avx:
730_avx_shortcut:
731 push %rbx
732 push %rbp
733 push %r12
734 lea `-64-($win64?5*16:0)`(%rsp),%rsp
735___
736$code.=<<___ if ($win64);
737 movaps %xmm6,64+0(%rsp)
738 movaps %xmm7,64+16(%rsp)
739 movaps %xmm8,64+32(%rsp)
740 movaps %xmm9,64+48(%rsp)
741 movaps %xmm10,64+64(%rsp)
742.Lprologue_avx:
743___
744$code.=<<___;
745 mov %rdi,$ctx # reassigned argument
746 mov %rsi,$inp # reassigned argument
747 mov %rdx,$num # reassigned argument
748 vzeroupper
749
750 shl \$6,$num
751 add $inp,$num
752 lea K_XX_XX(%rip),$K_XX_XX
753
754 mov 0($ctx),$A # load context
755 mov 4($ctx),$B
756 mov 8($ctx),$C
757 mov 12($ctx),$D
758 mov $B,@T[0] # magic seed
759 mov 16($ctx),$E
760
761 vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
762 vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19
763 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
764 vmovdqu 16($inp),@X[-3&7]
765 vmovdqu 32($inp),@X[-2&7]
766 vmovdqu 48($inp),@X[-1&7]
767 vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
768 add \$64,$inp
769 vpshufb @X[2],@X[-3&7],@X[-3&7]
770 vpshufb @X[2],@X[-2&7],@X[-2&7]
771 vpshufb @X[2],@X[-1&7],@X[-1&7]
772 vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19
773 vpaddd @Tx[1],@X[-3&7],@X[1]
774 vpaddd @Tx[1],@X[-2&7],@X[2]
775 vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
776 vmovdqa @X[1],16(%rsp)
777 vmovdqa @X[2],32(%rsp)
778 jmp .Loop_avx
779___
780
781sub Xupdate_avx_16_31()	# recall that $Xi starts with 4
782{ use integer;
783 my $body = shift;
784 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
785 my ($a,$b,$c,$d,$e);
786
787 eval(shift(@insns));
788 eval(shift(@insns));
789 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
790 eval(shift(@insns));
791 eval(shift(@insns));
792
793 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
794 eval(shift(@insns));
795 eval(shift(@insns));
796 &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
797 eval(shift(@insns));
798 eval(shift(@insns));
799 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
800 eval(shift(@insns));
801 eval(shift(@insns));
802
803 &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
804 eval(shift(@insns));
805 eval(shift(@insns));
806 eval(shift(@insns));
807 eval(shift(@insns));
808
809 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
810 eval(shift(@insns));
811 eval(shift(@insns));
812 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
813 eval(shift(@insns));
814 eval(shift(@insns));
815
816 &vpsrld (@Tx[0],@X[0],31);
817 eval(shift(@insns));
818 eval(shift(@insns));
819 eval(shift(@insns));
820 eval(shift(@insns));
821
822 &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
823 &vpaddd (@X[0],@X[0],@X[0]);
824 eval(shift(@insns));
825 eval(shift(@insns));
826 eval(shift(@insns));
827 eval(shift(@insns));
828
829 &vpsrld (@Tx[1],@Tx[2],30);
830 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
831 eval(shift(@insns));
832 eval(shift(@insns));
833 eval(shift(@insns));
834 eval(shift(@insns));
835
836 &vpslld (@Tx[2],@Tx[2],2);
837 &vpxor (@X[0],@X[0],@Tx[1]);
838 eval(shift(@insns));
839 eval(shift(@insns));
840 eval(shift(@insns));
841 eval(shift(@insns));
842
843 &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
844 eval(shift(@insns));
845 eval(shift(@insns));
846 &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
847 eval(shift(@insns));
848 eval(shift(@insns));
849
850
851 foreach (@insns) { eval; } # remaining instructions [if any]
852
853 $Xi++; push(@X,shift(@X)); # "rotate" X[]
854 push(@Tx,shift(@Tx));
855}
856
857sub Xupdate_avx_32_79()
858{ use integer;
859 my $body = shift;
860 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
861 my ($a,$b,$c,$d,$e);
862
863 &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
864 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
865 eval(shift(@insns)); # body_20_39
866 eval(shift(@insns));
867 eval(shift(@insns));
868 eval(shift(@insns)); # rol
869
870 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
871 eval(shift(@insns));
872 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
873 if ($Xi%5) {
874 &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
875 } else { # ... or load next one
876 &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
877 }
878 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
879 eval(shift(@insns)); # ror
880 eval(shift(@insns));
881
882 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
883 eval(shift(@insns)); # body_20_39
884 eval(shift(@insns));
885 eval(shift(@insns));
886 eval(shift(@insns)); # rol
887
888 &vpsrld (@Tx[0],@X[0],30);
889 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
890 eval(shift(@insns));
891 eval(shift(@insns));
892 eval(shift(@insns)); # ror
893 eval(shift(@insns));
894
895 &vpslld (@X[0],@X[0],2);
896 eval(shift(@insns)); # body_20_39
897 eval(shift(@insns));
898 eval(shift(@insns));
899 eval(shift(@insns)); # rol
900 eval(shift(@insns));
901 eval(shift(@insns));
902 eval(shift(@insns)); # ror
903 eval(shift(@insns));
904
905 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
906 eval(shift(@insns)); # body_20_39
907 eval(shift(@insns));
908 &vmovdqa (@Tx[1],@X[0]) if ($Xi<19);
909 eval(shift(@insns));
910 eval(shift(@insns)); # rol
911 eval(shift(@insns));
912 eval(shift(@insns));
913 eval(shift(@insns)); # rol
914 eval(shift(@insns));
915
916 foreach (@insns) { eval; } # remaining instructions
917
918 $Xi++; push(@X,shift(@X)); # "rotate" X[]
919 push(@Tx,shift(@Tx));
920}
921
922sub Xuplast_avx_80()
923{ use integer;
924 my $body = shift;
925 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
926 my ($a,$b,$c,$d,$e);
927
928 eval(shift(@insns));
929 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
930 eval(shift(@insns));
931 eval(shift(@insns));
932 eval(shift(@insns));
933 eval(shift(@insns));
934
935 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
936
937 foreach (@insns) { eval; } # remaining instructions
938
939 &cmp ($inp,$num);
940 &je (".Ldone_avx");
941
942 unshift(@Tx,pop(@Tx));
943
944 &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
945 &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
946 &vmovdqu(@X[-4&7],"0($inp)"); # load input
947 &vmovdqu(@X[-3&7],"16($inp)");
948 &vmovdqu(@X[-2&7],"32($inp)");
949 &vmovdqu(@X[-1&7],"48($inp)");
950 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
951 &add ($inp,64);
952
953 $Xi=0;
954}
955
956sub Xloop_avx()
957{ use integer;
958 my $body = shift;
959 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
960 my ($a,$b,$c,$d,$e);
961
962 eval(shift(@insns));
963 eval(shift(@insns));
964 &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
965 eval(shift(@insns));
966 eval(shift(@insns));
967 &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
968 eval(shift(@insns));
969 eval(shift(@insns));
970 eval(shift(@insns));
971 eval(shift(@insns));
972 &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU
973 eval(shift(@insns));
974 eval(shift(@insns));
975
976 foreach (@insns) { eval; }
977 $Xi++;
978}
979
980sub Xtail_avx()
981{ use integer;
982 my $body = shift;
983 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
984 my ($a,$b,$c,$d,$e);
985
986 foreach (@insns) { eval; }
987}
988
989$code.=<<___;
990.align 16
991.Loop_avx:
992___
993 &Xupdate_avx_16_31(\&body_00_19);
994 &Xupdate_avx_16_31(\&body_00_19);
995 &Xupdate_avx_16_31(\&body_00_19);
996 &Xupdate_avx_16_31(\&body_00_19);
997 &Xupdate_avx_32_79(\&body_00_19);
998 &Xupdate_avx_32_79(\&body_20_39);
999 &Xupdate_avx_32_79(\&body_20_39);
1000 &Xupdate_avx_32_79(\&body_20_39);
1001 &Xupdate_avx_32_79(\&body_20_39);
1002 &Xupdate_avx_32_79(\&body_20_39);
1003 &Xupdate_avx_32_79(\&body_40_59);
1004 &Xupdate_avx_32_79(\&body_40_59);
1005 &Xupdate_avx_32_79(\&body_40_59);
1006 &Xupdate_avx_32_79(\&body_40_59);
1007 &Xupdate_avx_32_79(\&body_40_59);
1008 &Xupdate_avx_32_79(\&body_20_39);
1009 &Xuplast_avx_80(\&body_20_39); # can jump to "done"
1010
1011 $saved_j=$j; @saved_V=@V;
1012
1013 &Xloop_avx(\&body_20_39);
1014 &Xloop_avx(\&body_20_39);
1015 &Xloop_avx(\&body_20_39);
1016
1017$code.=<<___;
1018 add 0($ctx),$A # update context
1019 add 4($ctx),@T[0]
1020 add 8($ctx),$C
1021 add 12($ctx),$D
1022 mov $A,0($ctx)
1023 add 16($ctx),$E
1024 mov @T[0],4($ctx)
1025 mov @T[0],$B # magic seed
1026 mov $C,8($ctx)
1027 mov $D,12($ctx)
1028 mov $E,16($ctx)
1029 jmp .Loop_avx
1030
1031.align 16
1032.Ldone_avx:
1033___
1034 $j=$saved_j; @V=@saved_V;
1035
1036 &Xtail_avx(\&body_20_39);
1037 &Xtail_avx(\&body_20_39);
1038 &Xtail_avx(\&body_20_39);
1039
1040$code.=<<___;
1041 vzeroupper
1042
1043 add 0($ctx),$A # update context
1044 add 4($ctx),@T[0]
1045 add 8($ctx),$C
1046 mov $A,0($ctx)
1047 add 12($ctx),$D
1048 mov @T[0],4($ctx)
1049 add 16($ctx),$E
1050 mov $C,8($ctx)
1051 mov $D,12($ctx)
1052 mov $E,16($ctx)
1053___
1054$code.=<<___ if ($win64);
1055 movaps 64+0(%rsp),%xmm6
1056 movaps 64+16(%rsp),%xmm7
1057 movaps 64+32(%rsp),%xmm8
1058 movaps 64+48(%rsp),%xmm9
1059 movaps 64+64(%rsp),%xmm10
1060___
1061$code.=<<___;
1062 lea `64+($win64?5*16:0)`(%rsp),%rsi
1063 mov 0(%rsi),%r12
1064 mov 8(%rsi),%rbp
1065 mov 16(%rsi),%rbx
1066 lea 24(%rsi),%rsp
1067.Lepilogue_avx:
1068 ret
1069.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
1070___
1071}
1072$code.=<<___;
1073.align 64
1074K_XX_XX:
1075.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
1076.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
1077.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
1078.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
1079.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
1080___
1081}}}
1082$code.=<<___;
1083.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1084.align 64
1085___
1086
1087# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1088# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1089if ($win64) {
1090$rec="%rcx";
1091$frame="%rdx";
1092$context="%r8";
1093$disp="%r9";
1094
1095$code.=<<___;
1096.extern __imp_RtlVirtualUnwind
1097.type se_handler,\@abi-omnipotent
1098.align 16
1099se_handler:
1100 push %rsi
1101 push %rdi
1102 push %rbx
1103 push %rbp
1104 push %r12
1105 push %r13
1106 push %r14
1107 push %r15
1108 pushfq
1109 sub \$64,%rsp
1110
1111 mov 120($context),%rax # pull context->Rax
1112 mov 248($context),%rbx # pull context->Rip
1113
1114 lea .Lprologue(%rip),%r10
1115 cmp %r10,%rbx # context->Rip<.Lprologue
1116 jb .Lcommon_seh_tail
1117
1118 mov 152($context),%rax # pull context->Rsp
1119
1120 lea .Lepilogue(%rip),%r10
1121 cmp %r10,%rbx # context->Rip>=.Lepilogue
1122 jae .Lcommon_seh_tail
1123
1124 mov `16*4`(%rax),%rax # pull saved stack pointer
1125 lea 32(%rax),%rax
1126
1127 mov -8(%rax),%rbx
1128 mov -16(%rax),%rbp
1129 mov -24(%rax),%r12
1130 mov -32(%rax),%r13
1131 mov %rbx,144($context) # restore context->Rbx
1132 mov %rbp,160($context) # restore context->Rbp
1133 mov %r12,216($context) # restore context->R12
1134 mov %r13,224($context) # restore context->R13
1135
1136 jmp .Lcommon_seh_tail
1137.size se_handler,.-se_handler
1138
1139.type ssse3_handler,\@abi-omnipotent
1140.align 16
1141ssse3_handler:
1142 push %rsi
1143 push %rdi
1144 push %rbx
1145 push %rbp
1146 push %r12
1147 push %r13
1148 push %r14
1149 push %r15
1150 pushfq
1151 sub \$64,%rsp
1152
1153 mov 120($context),%rax # pull context->Rax
1154 mov 248($context),%rbx # pull context->Rip
1155
1156 mov 8($disp),%rsi # disp->ImageBase
1157 mov 56($disp),%r11 # disp->HandlerData
1158
1159 mov 0(%r11),%r10d # HandlerData[0]
1160 lea (%rsi,%r10),%r10 # prologue label
1161 cmp %r10,%rbx # context->Rip<prologue label
1162 jb .Lcommon_seh_tail
1163
1164 mov 152($context),%rax # pull context->Rsp
1165
1166 mov 4(%r11),%r10d # HandlerData[1]
1167 lea (%rsi,%r10),%r10 # epilogue label
1168 cmp %r10,%rbx # context->Rip>=epilogue label
1169 jae .Lcommon_seh_tail
1170
1171 lea 64(%rax),%rsi
1172 lea 512($context),%rdi # &context.Xmm6
1173 mov \$10,%ecx
1174 .long 0xa548f3fc # cld; rep movsq
1175 lea `24+64+5*16`(%rax),%rax # adjust stack pointer
1176
1177 mov -8(%rax),%rbx
1178 mov -16(%rax),%rbp
1179 mov -24(%rax),%r12
1180 mov %rbx,144($context) # restore context->Rbx
1181 mov %rbp,160($context) # restore context->Rbp
1182	mov	%r12,216($context)	# restore context->R12
1183
1184.Lcommon_seh_tail:
1185 mov 8(%rax),%rdi
1186 mov 16(%rax),%rsi
1187 mov %rax,152($context) # restore context->Rsp
1188 mov %rsi,168($context) # restore context->Rsi
1189 mov %rdi,176($context) # restore context->Rdi
1190
1191 mov 40($disp),%rdi # disp->ContextRecord
1192 mov $context,%rsi # context
1193 mov \$154,%ecx # sizeof(CONTEXT)
1194 .long 0xa548f3fc # cld; rep movsq
1195
1196 mov $disp,%rsi
1197 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1198 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1199 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1200 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1201 mov 40(%rsi),%r10 # disp->ContextRecord
1202 lea 56(%rsi),%r11 # &disp->HandlerData
1203 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1204 mov %r10,32(%rsp) # arg5
1205 mov %r11,40(%rsp) # arg6
1206 mov %r12,48(%rsp) # arg7
1207 mov %rcx,56(%rsp) # arg8, (NULL)
1208 call *__imp_RtlVirtualUnwind(%rip)
1209
1210 mov \$1,%eax # ExceptionContinueSearch
1211 add \$64,%rsp
1212 popfq
1213 pop %r15
1214 pop %r14
1215 pop %r13
1216 pop %r12
1217 pop %rbp
1218 pop %rbx
1219 pop %rdi
1220 pop %rsi
1221 ret
1222.size ssse3_handler,.-ssse3_handler
1223
1224.section .pdata
1225.align 4
1226 .rva .LSEH_begin_sha1_block_data_order
1227 .rva .LSEH_end_sha1_block_data_order
1228 .rva .LSEH_info_sha1_block_data_order
1229 .rva .LSEH_begin_sha1_block_data_order_ssse3
1230 .rva .LSEH_end_sha1_block_data_order_ssse3
1231 .rva .LSEH_info_sha1_block_data_order_ssse3
1232___
1233$code.=<<___ if ($avx);
1234 .rva .LSEH_begin_sha1_block_data_order_avx
1235 .rva .LSEH_end_sha1_block_data_order_avx
1236 .rva .LSEH_info_sha1_block_data_order_avx
1237___
1238$code.=<<___;
1239.section .xdata
1240.align 8
1241.LSEH_info_sha1_block_data_order:
1242 .byte 9,0,0,0
1243 .rva se_handler
1244.LSEH_info_sha1_block_data_order_ssse3:
1245 .byte 9,0,0,0
1246 .rva ssse3_handler
1247 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
1248___
1249$code.=<<___ if ($avx);
1250.LSEH_info_sha1_block_data_order_avx:
1251 .byte 9,0,0,0
1252 .rva ssse3_handler
1253 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
1254___
1255}
1256
1257####################################################################
1258
1259$code =~ s/\`([^\`]*)\`/eval $1/gem;
1260print $code;
1261close STDOUT;
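Both the integer path and the SSSE3/AVX paths above compute the same message schedule; the in-place XOR offsets j+2, j+8 and j+13 (mod 16) address W[i-14], W[i-8] and W[i-3] relative to the slot holding W[i-16]. A plain-Perl model of that recurrence, offered as a reference sketch rather than part of the generator:

	# SHA-1 message schedule ("Xupdate"): for i >= 16,
	#   W[i] = ROTL1(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]),
	# maintained in a sliding 16-word window, like the %rsp frame above.
	sub rol32 { my ($x,$n)=@_; return (($x << $n) | ($x >> (32-$n))) & 0xffffffff; }
	sub xupdate16 {
	    my @W = @_;                                  # the 16 most recent words
	    my $w = rol32($W[-3] ^ $W[-8] ^ $W[-14] ^ $W[-16], 1);
	    return (@W[1..15], $w);                      # window advanced by one
	}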
diff --git a/src/lib/libcrypto/sha/asm/sha256-586.pl b/src/lib/libcrypto/sha/asm/sha256-586.pl
deleted file mode 100644
index 928ec53123..0000000000
--- a/src/lib/libcrypto/sha/asm/sha256-586.pl
+++ /dev/null
@@ -1,249 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# SHA256 block transform for x86. September 2007.
11#
12# Performance in clock cycles per processed byte (less is better):
13#
14# Pentium PIII P4 AMD K8 Core2
15# gcc 46 36 41 27 26
16# icc 57 33 38 25 23
17# x86 asm 40 30 33 20 18
18# x86_64 asm(*) - - 21 16 16
19#
20# (*) x86_64 assembler performance is presented for reference
21# purposes.
22#
23# Performance improvement over compiler generated code varies from
24# 10% to 40% [see above]. Not very impressive on some µ-archs, but
25# it's 5 times smaller and optimizes the amount of writes.
26
27$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
28push(@INC,"${dir}","${dir}../../perlasm");
29require "x86asm.pl";
30
31&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
32
33$A="eax";
34$E="edx";
35$T="ebx";
36$Aoff=&DWP(0,"esp");
37$Boff=&DWP(4,"esp");
38$Coff=&DWP(8,"esp");
39$Doff=&DWP(12,"esp");
40$Eoff=&DWP(16,"esp");
41$Foff=&DWP(20,"esp");
42$Goff=&DWP(24,"esp");
43$Hoff=&DWP(28,"esp");
44$Xoff=&DWP(32,"esp");
45$K256="ebp";
46
47sub BODY_00_15() {
48 my $in_16_63=shift;
49
50 &mov ("ecx",$E);
51 &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2])
52 &ror ("ecx",25-11);
53 &mov ("esi",$Foff);
54 &xor ("ecx",$E);
55 &ror ("ecx",11-6);
56 &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0]
57 &xor ("ecx",$E);
58 &ror ("ecx",6); # Sigma1(e)
59 &mov ("edi",$Goff);
60 &add ($T,"ecx"); # T += Sigma1(e)
61
62 &xor ("esi","edi");
63 &mov ($Eoff,$E); # modulo-scheduled
64 &mov ("ecx",$A);
65 &and ("esi",$E);
66 &mov ($E,$Doff); # e becomes d, which is e in next iteration
67 &xor ("esi","edi"); # Ch(e,f,g)
68 &mov ("edi",$A);
69 &add ($T,"esi"); # T += Ch(e,f,g)
70
71 &ror ("ecx",22-13);
72 &add ($T,$Hoff); # T += h
73 &xor ("ecx",$A);
74 &ror ("ecx",13-2);
75 &mov ("esi",$Boff);
76 &xor ("ecx",$A);
77 &ror ("ecx",2); # Sigma0(a)
78 &add ($E,$T); # d += T
79 &mov ("edi",$Coff);
80
81 &add ($T,"ecx"); # T += Sigma0(a)
82 &mov ($Aoff,$A); # modulo-scheduled
83
84 &mov ("ecx",$A);
85 &sub ("esp",4);
86 &or ($A,"esi"); # a becomes h, which is a in next iteration
87 &and ("ecx","esi");
88 &and ($A,"edi");
89 &mov ("esi",&DWP(0,$K256));
90 &or ($A,"ecx"); # h=Maj(a,b,c)
91
92 &add ($K256,4);
93 &add ($A,$T); # h += T
94 &mov ($T,&DWP(4*(8+15+16-1),"esp")) if ($in_16_63); # preload T
95 &add ($E,"esi"); # d += K256[i]
96 &add ($A,"esi"); # h += K256[i]
97}
98
99&function_begin("sha256_block_data_order");
100 &mov ("esi",wparam(0)); # ctx
101 &mov ("edi",wparam(1)); # inp
102 &mov ("eax",wparam(2)); # num
103 &mov ("ebx","esp"); # saved sp
104
105 &call (&label("pic_point")); # make it PIC!
106&set_label("pic_point");
107 &blindpop($K256);
108 &lea ($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));
109
110 &sub ("esp",16);
111 &and ("esp",-64);
112
113 &shl ("eax",6);
114 &add ("eax","edi");
115 &mov (&DWP(0,"esp"),"esi"); # ctx
116 &mov (&DWP(4,"esp"),"edi"); # inp
117 &mov (&DWP(8,"esp"),"eax"); # inp+num*128
118 &mov (&DWP(12,"esp"),"ebx"); # saved sp
119
120&set_label("loop",16);
121 # copy input block to stack reversing byte and dword order
122 for($i=0;$i<4;$i++) {
123 &mov ("eax",&DWP($i*16+0,"edi"));
124 &mov ("ebx",&DWP($i*16+4,"edi"));
125 &mov ("ecx",&DWP($i*16+8,"edi"));
126 &mov ("edx",&DWP($i*16+12,"edi"));
127 &bswap ("eax");
128 &bswap ("ebx");
129 &bswap ("ecx");
130 &bswap ("edx");
131 &push ("eax");
132 &push ("ebx");
133 &push ("ecx");
134 &push ("edx");
135 }
136 &add ("edi",64);
137 &sub ("esp",4*8); # place for A,B,C,D,E,F,G,H
138 &mov (&DWP(4*(8+16)+4,"esp"),"edi");
139
140 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
141 &mov ($A,&DWP(0,"esi"));
142 &mov ("ebx",&DWP(4,"esi"));
143 &mov ("ecx",&DWP(8,"esi"));
144 &mov ("edi",&DWP(12,"esi"));
145 # &mov ($Aoff,$A);
146 &mov ($Boff,"ebx");
147 &mov ($Coff,"ecx");
148 &mov ($Doff,"edi");
149 &mov ($E,&DWP(16,"esi"));
150 &mov ("ebx",&DWP(20,"esi"));
151 &mov ("ecx",&DWP(24,"esi"));
152 &mov ("edi",&DWP(28,"esi"));
153 # &mov ($Eoff,$E);
154 &mov ($Foff,"ebx");
155 &mov ($Goff,"ecx");
156 &mov ($Hoff,"edi");
157
158&set_label("00_15",16);
159 &mov ($T,&DWP(4*(8+15),"esp"));
160
161 &BODY_00_15();
162
163 &cmp ("esi",0xc19bf174);
164 &jne (&label("00_15"));
165
166 &mov ($T,&DWP(4*(8+15+16-1),"esp")); # preloaded in BODY_00_15(1)
167&set_label("16_63",16);
168 &mov ("esi",$T);
169 &mov ("ecx",&DWP(4*(8+15+16-14),"esp"));
170 &ror ("esi",18-7);
171 &mov ("edi","ecx");
172 &xor ("esi",$T);
173 &ror ("esi",7);
174 &shr ($T,3);
175
176 &ror ("edi",19-17);
177 &xor ($T,"esi"); # T = sigma0(X[-15])
178 &xor ("edi","ecx");
179 &ror ("edi",17);
180 &shr ("ecx",10);
181 &add ($T,&DWP(4*(8+15+16),"esp")); # T += X[-16]
182 &xor ("edi","ecx"); # sigma1(X[-2])
183
184 &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7]
185 # &add ($T,"edi"); # T += sigma1(X[-2])
186 # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0]
187
188 &BODY_00_15(1);
189
190 &cmp ("esi",0xc67178f2);
191 &jne (&label("16_63"));
192
193 &mov ("esi",&DWP(4*(8+16+64)+0,"esp"));#ctx
194 # &mov ($A,$Aoff);
195 &mov ("ebx",$Boff);
196 &mov ("ecx",$Coff);
197 &mov ("edi",$Doff);
198 &add ($A,&DWP(0,"esi"));
199 &add ("ebx",&DWP(4,"esi"));
200 &add ("ecx",&DWP(8,"esi"));
201 &add ("edi",&DWP(12,"esi"));
202 &mov (&DWP(0,"esi"),$A);
203 &mov (&DWP(4,"esi"),"ebx");
204 &mov (&DWP(8,"esi"),"ecx");
205 &mov (&DWP(12,"esi"),"edi");
206 # &mov ($E,$Eoff);
207 &mov ("eax",$Foff);
208 &mov ("ebx",$Goff);
209 &mov ("ecx",$Hoff);
210 &mov ("edi",&DWP(4*(8+16+64)+4,"esp"));#inp
211 &add ($E,&DWP(16,"esi"));
212 &add ("eax",&DWP(20,"esi"));
213 &add ("ebx",&DWP(24,"esi"));
214 &add ("ecx",&DWP(28,"esi"));
215 &mov (&DWP(16,"esi"),$E);
216 &mov (&DWP(20,"esi"),"eax");
217 &mov (&DWP(24,"esi"),"ebx");
218 &mov (&DWP(28,"esi"),"ecx");
219
220 &add ("esp",4*(8+16+64)); # destroy frame
221 &sub ($K256,4*64); # rewind K
222
223 &cmp ("edi",&DWP(8,"esp")); # are we done yet?
224 &jb (&label("loop"));
225
226 &mov ("esp",&DWP(12,"esp")); # restore sp
227&function_end_A();
228
229&set_label("K256",64); # Yes! I keep it in the code segment!
230 &data_word(0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5);
231 &data_word(0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5);
232 &data_word(0xd807aa98,0x12835b01,0x243185be,0x550c7dc3);
233 &data_word(0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174);
234 &data_word(0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc);
235 &data_word(0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da);
236 &data_word(0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7);
237 &data_word(0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967);
238 &data_word(0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13);
239 &data_word(0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85);
240 &data_word(0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3);
241 &data_word(0xd192e819,0xd6990624,0xf40e3585,0x106aa070);
242 &data_word(0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5);
243 &data_word(0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3);
244 &data_word(0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208);
245 &data_word(0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2);
246&function_end_B("sha256_block_data_order");
247&asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
248
249&asm_finish();
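The chained ror/xor sequences in BODY_00_15 and the 16_63 loop above (e.g. ror 25-11, xor, ror 11-6, xor, ror 6 for Sigma1) are factored forms of the standard SHA-256 Sigma/sigma functions. A plain-Perl reference of those functions, assuming 32-bit inputs (sketch only, names are not part of the generator):

	# SHA-256 big/small sigma functions as computed above with staged rotates:
	sub ror32  { my ($x,$n)=@_; return (($x >> $n) | ($x << (32-$n))) & 0xffffffff; }
	sub Sigma0 { my $x=shift; return ror32($x,2)  ^ ror32($x,13) ^ ror32($x,22); }
	sub Sigma1 { my $x=shift; return ror32($x,6)  ^ ror32($x,11) ^ ror32($x,25); }
	sub sigma0 { my $x=shift; return ror32($x,7)  ^ ror32($x,18) ^ ($x >> 3);  }
	sub sigma1 { my $x=shift; return ror32($x,17) ^ ror32($x,19) ^ ($x >> 10); }
	# Schedule: W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]  (mod 2^32)

The comparisons against 0xc19bf174 and 0xc67178f2 above terminate the 00_15 and 16_63 loops because those are the last K256 entries of the first 16 and of all 64 rounds, respectively.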
diff --git a/src/lib/libcrypto/sha/asm/sha256-armv4.pl b/src/lib/libcrypto/sha/asm/sha256-armv4.pl
deleted file mode 100644
index 9c84e8d93c..0000000000
--- a/src/lib/libcrypto/sha/asm/sha256-armv4.pl
+++ /dev/null
@@ -1,211 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256 block procedure for ARMv4. May 2007.
11
12# Performance is ~2x better than gcc 3.4 generated code and in "abso-
13# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
14# byte [on single-issue Xscale PXA250 core].
15
16# July 2010.
17#
18# Rescheduling for dual-issue pipeline resulted in 22% improvement on
19# Cortex A8 core and ~20 cycles per processed byte.
20
21# February 2011.
22#
23# Profiler-assisted and platform-specific optimization resulted in 16%
24# improvement on Cortex A8 core and ~17 cycles per processed byte.
25
26while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
27open STDOUT,">$output";
28
29$ctx="r0"; $t0="r0";
30$inp="r1"; $t3="r1";
31$len="r2"; $t1="r2";
32$T1="r3";
33$A="r4";
34$B="r5";
35$C="r6";
36$D="r7";
37$E="r8";
38$F="r9";
39$G="r10";
40$H="r11";
41@V=($A,$B,$C,$D,$E,$F,$G,$H);
42$t2="r12";
43$Ktbl="r14";
44
45@Sigma0=( 2,13,22);
46@Sigma1=( 6,11,25);
47@sigma0=( 7,18, 3);
48@sigma1=(17,19,10);
49
50sub BODY_00_15 {
51my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
52
53$code.=<<___ if ($i<16);
54#if __ARM_ARCH__>=7
55 ldr $T1,[$inp],#4
56#else
57 ldrb $T1,[$inp,#3] @ $i
58 ldrb $t2,[$inp,#2]
59 ldrb $t1,[$inp,#1]
60 ldrb $t0,[$inp],#4
61 orr $T1,$T1,$t2,lsl#8
62 orr $T1,$T1,$t1,lsl#16
63 orr $T1,$T1,$t0,lsl#24
64#endif
65___
66$code.=<<___;
67 mov $t0,$e,ror#$Sigma1[0]
68 ldr $t2,[$Ktbl],#4 @ *K256++
69 eor $t0,$t0,$e,ror#$Sigma1[1]
70 eor $t1,$f,$g
71#if $i>=16
72 add $T1,$T1,$t3 @ from BODY_16_xx
73#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
74 rev $T1,$T1
75#endif
76#if $i==15
77 str $inp,[sp,#17*4] @ leave room for $t3
78#endif
79 eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
80 and $t1,$t1,$e
81 str $T1,[sp,#`$i%16`*4]
82 add $T1,$T1,$t0
83 eor $t1,$t1,$g @ Ch(e,f,g)
84 add $T1,$T1,$h
85 mov $h,$a,ror#$Sigma0[0]
86 add $T1,$T1,$t1
87 eor $h,$h,$a,ror#$Sigma0[1]
88 add $T1,$T1,$t2
89 eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
90#if $i>=15
91 ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx
92#endif
93 orr $t0,$a,$b
94 and $t1,$a,$b
95 and $t0,$t0,$c
96 add $h,$h,$T1
97 orr $t0,$t0,$t1 @ Maj(a,b,c)
98 add $d,$d,$T1
99 add $h,$h,$t0
100___
101}
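# Not part of the original module: a small hedged sketch (standalone, names
# illustrative) of the boolean identities the round body above relies on --
# Ch is folded as ((f^g)&e)^g (the eor/and/eor sequence) and Maj as
# ((a|b)&c)|(a&b) (the orr/and/and/orr sequence); the loop checks both
# against the FIPS 180 definitions.
for my $x (0,1) { for my $y (0,1) { for my $z (0,1) {
	my $ch  = ($x & $y) ^ ((1 - $x) & $z);		# Ch(e,f,g)  = (e&f)^(~e&g)
	my $maj = ($x & $y) ^ ($x & $z) ^ ($y & $z);	# Maj(a,b,c) = (a&b)^(a&c)^(b&c)
	die "Ch identity"  unless $ch  == ((($y ^ $z) & $x) ^ $z);
	die "Maj identity" unless $maj == ((($x | $y) & $z) | ($x & $y));
}}}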
102
103sub BODY_16_XX {
104my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
105
106$code.=<<___;
107 @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i
108 ldr $t2,[sp,#`($i+14)%16`*4]
109 mov $t0,$t3,ror#$sigma0[0]
110 ldr $T1,[sp,#`($i+0)%16`*4]
111 eor $t0,$t0,$t3,ror#$sigma0[1]
112 ldr $t1,[sp,#`($i+9)%16`*4]
113 eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1])
114 mov $t3,$t2,ror#$sigma1[0]
115 add $T1,$T1,$t0
116 eor $t3,$t3,$t2,ror#$sigma1[1]
117 add $T1,$T1,$t1
118 eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
119 @ add $T1,$T1,$t3
120___
121 &BODY_00_15(@_);
122}
123
124$code=<<___;
125#include "arm_arch.h"
126
127.text
128.code 32
129
130.type K256,%object
131.align 5
132K256:
133.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
134.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
135.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
136.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
137.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
138.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
139.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
140.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
141.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
142.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
143.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
144.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
145.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
146.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
147.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
148.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
149.size K256,.-K256
150
151.global sha256_block_data_order
152.type sha256_block_data_order,%function
153sha256_block_data_order:
154 sub r3,pc,#8 @ sha256_block_data_order
155 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
156 stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
157 ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
158 sub $Ktbl,r3,#256 @ K256
159 sub sp,sp,#16*4 @ alloca(X[16])
160.Loop:
161___
162for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
163$code.=".Lrounds_16_xx:\n";
164for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
165$code.=<<___;
166 and $t2,$t2,#0xff
167 cmp $t2,#0xf2
168 bne .Lrounds_16_xx
169
170 ldr $T1,[sp,#16*4] @ pull ctx
171 ldr $t0,[$T1,#0]
172 ldr $t1,[$T1,#4]
173 ldr $t2,[$T1,#8]
174 add $A,$A,$t0
175 ldr $t0,[$T1,#12]
176 add $B,$B,$t1
177 ldr $t1,[$T1,#16]
178 add $C,$C,$t2
179 ldr $t2,[$T1,#20]
180 add $D,$D,$t0
181 ldr $t0,[$T1,#24]
182 add $E,$E,$t1
183 ldr $t1,[$T1,#28]
184 add $F,$F,$t2
185 ldr $inp,[sp,#17*4] @ pull inp
186 ldr $t2,[sp,#18*4] @ pull inp+len
187 add $G,$G,$t0
188 add $H,$H,$t1
189 stmia $T1,{$A,$B,$C,$D,$E,$F,$G,$H}
190 cmp $inp,$t2
191 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
192 bne .Loop
193
194 add sp,sp,#`16+3`*4 @ destroy frame
195#if __ARM_ARCH__>=5
196 ldmia sp!,{r4-r11,pc}
197#else
198 ldmia sp!,{r4-r11,lr}
199 tst lr,#1
200 moveq pc,lr @ be binary compatible with V4, yet
201 bx lr @ interoperable with Thumb ISA:-)
202#endif
203.size sha256_block_data_order,.-sha256_block_data_order
204.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
205.align 2
206___
207
208$code =~ s/\`([^\`]*)\`/eval $1/gem;
209$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
210print $code;
211close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/sha/asm/sha512-586.pl b/src/lib/libcrypto/sha/asm/sha512-586.pl
deleted file mode 100644
index 7eab6a5b88..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-586.pl
+++ /dev/null
@@ -1,644 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# SHA512 block transform for x86. September 2007.
11#
12# Performance in clock cycles per processed byte (less is better):
13#
14# Pentium PIII P4 AMD K8 Core2
15# gcc 100 75 116 54 66
16# icc 97 77 95 55 57
17# x86 asm 61 56 82 36 40
18# SSE2 asm - - 38 24 20
19# x86_64 asm(*) - - 30 10.0 10.5
20#
21# (*) x86_64 assembler performance is presented for reference
22# purposes.
23#
24# IALU code-path is optimized for elder Pentiums. On vanilla Pentium
25# performance improvement over compiler generated code reaches ~60%,
26# while on PIII - ~35%. On newer µ-archs improvement varies from 15%
27# to 50%, but it's less important as they are expected to execute SSE2
28# code-path, which is commonly ~2-3x faster [than compiler generated
29# code]. SSE2 code-path is as fast as original sha512-sse2.pl, even
30# though it does not use 128-bit operations. The latter means that
31# SSE2-aware kernel is no longer required to execute the code. Another
32# difference is that new code optimizes amount of writes, but at the
33# cost of increased data cache "footprint" by 1/2KB.
34
35$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
36push(@INC,"${dir}","${dir}../../perlasm");
37require "x86asm.pl";
38
39&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
40
41$sse2=0;
42for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
43
44&external_label("OPENSSL_ia32cap_P") if ($sse2);
45
46$Tlo=&DWP(0,"esp"); $Thi=&DWP(4,"esp");
47$Alo=&DWP(8,"esp"); $Ahi=&DWP(8+4,"esp");
48$Blo=&DWP(16,"esp"); $Bhi=&DWP(16+4,"esp");
49$Clo=&DWP(24,"esp"); $Chi=&DWP(24+4,"esp");
50$Dlo=&DWP(32,"esp"); $Dhi=&DWP(32+4,"esp");
51$Elo=&DWP(40,"esp"); $Ehi=&DWP(40+4,"esp");
52$Flo=&DWP(48,"esp"); $Fhi=&DWP(48+4,"esp");
53$Glo=&DWP(56,"esp"); $Ghi=&DWP(56+4,"esp");
54$Hlo=&DWP(64,"esp"); $Hhi=&DWP(64+4,"esp");
55$K512="ebp";
56
57$Asse2=&QWP(0,"esp");
58$Bsse2=&QWP(8,"esp");
59$Csse2=&QWP(16,"esp");
60$Dsse2=&QWP(24,"esp");
61$Esse2=&QWP(32,"esp");
62$Fsse2=&QWP(40,"esp");
63$Gsse2=&QWP(48,"esp");
64$Hsse2=&QWP(56,"esp");
65
66$A="mm0"; # B-D and
67$E="mm4"; # F-H are commonly loaded to respectively mm1-mm3 and
68 # mm5-mm7, but it's done on an on-demand basis...
69
70sub BODY_00_15_sse2 {
71 my $prefetch=shift;
72
73 &movq ("mm5",$Fsse2); # load f
74 &movq ("mm6",$Gsse2); # load g
75 &movq ("mm7",$Hsse2); # load h
76
77 &movq ("mm1",$E); # %mm1 is sliding right
78 &movq ("mm2",$E); # %mm2 is sliding left
79 &psrlq ("mm1",14);
80 &movq ($Esse2,$E); # modulo-scheduled save e
81 &psllq ("mm2",23);
82 &movq ("mm3","mm1"); # %mm3 is T1
83 &psrlq ("mm1",4);
84 &pxor ("mm3","mm2");
85 &psllq ("mm2",23);
86 &pxor ("mm3","mm1");
87 &psrlq ("mm1",23);
88 &pxor ("mm3","mm2");
89 &psllq ("mm2",4);
90 &pxor ("mm3","mm1");
91 &paddq ("mm7",QWP(0,$K512)); # h+=K512[i]
92 &pxor ("mm3","mm2"); # T1=Sigma1_512(e)
93
94 &pxor ("mm5","mm6"); # f^=g
95 &movq ("mm1",$Bsse2); # load b
96 &pand ("mm5",$E); # f&=e
97 &movq ("mm2",$Csse2); # load c
98 &pxor ("mm5","mm6"); # f^=g
99 &movq ($E,$Dsse2); # e = load d
100 &paddq ("mm3","mm5"); # T1+=Ch(e,f,g)
101 &movq (&QWP(0,"esp"),$A); # modulo-scheduled save a
102 &paddq ("mm3","mm7"); # T1+=h
103
104 &movq ("mm5",$A); # %mm5 is sliding right
105 &movq ("mm6",$A); # %mm6 is sliding left
106 &paddq ("mm3",&QWP(8*9,"esp")); # T1+=X[0]
107 &psrlq ("mm5",28);
108 &paddq ($E,"mm3"); # e += T1
109 &psllq ("mm6",25);
110 &movq ("mm7","mm5"); # %mm7 is T2
111 &psrlq ("mm5",6);
112 &pxor ("mm7","mm6");
113 &psllq ("mm6",5);
114 &pxor ("mm7","mm5");
115 &psrlq ("mm5",5);
116 &pxor ("mm7","mm6");
117 &psllq ("mm6",6);
118 &pxor ("mm7","mm5");
119 &sub ("esp",8);
120 &pxor ("mm7","mm6"); # T2=Sigma0_512(a)
121
122 &movq ("mm5",$A); # %mm5=a
123 &por ($A,"mm2"); # a=a|c
124 &movq ("mm6",&QWP(8*(9+16-14),"esp")) if ($prefetch);
125 &pand ("mm5","mm2"); # %mm5=a&c
126 &pand ($A,"mm1"); # a=(a|c)&b
127 &movq ("mm2",&QWP(8*(9+16-1),"esp")) if ($prefetch);
128 &por ("mm5",$A); # %mm5=(a&c)|((a|c)&b)
129 &paddq ("mm7","mm5"); # T2+=Maj(a,b,c)
130 &movq ($A,"mm3"); # a=T1
131
132 &mov (&LB("edx"),&BP(0,$K512));
133 &paddq ($A,"mm7"); # a+=T2
134 &add ($K512,8);
135}
136
137sub BODY_00_15_x86 {
138 #define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
139 # LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
140 # HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
141 &mov ("ecx",$Elo);
142 &mov ("edx",$Ehi);
143 &mov ("esi","ecx");
144
145 &shr ("ecx",9); # lo>>9
146 &mov ("edi","edx");
147 &shr ("edx",9); # hi>>9
148 &mov ("ebx","ecx");
149 &shl ("esi",14); # lo<<14
150 &mov ("eax","edx");
151 &shl ("edi",14); # hi<<14
152 &xor ("ebx","esi");
153
154 &shr ("ecx",14-9); # lo>>14
155 &xor ("eax","edi");
156 &shr ("edx",14-9); # hi>>14
157 &xor ("eax","ecx");
158 &shl ("esi",18-14); # lo<<18
159 &xor ("ebx","edx");
160 &shl ("edi",18-14); # hi<<18
161 &xor ("ebx","esi");
162
163 &shr ("ecx",18-14); # lo>>18
164 &xor ("eax","edi");
165 &shr ("edx",18-14); # hi>>18
166 &xor ("eax","ecx");
167 &shl ("esi",23-18); # lo<<23
168 &xor ("ebx","edx");
169 &shl ("edi",23-18); # hi<<23
170 &xor ("eax","esi");
171 &xor ("ebx","edi"); # T1 = Sigma1(e)
172
173 &mov ("ecx",$Flo);
174 &mov ("edx",$Fhi);
175 &mov ("esi",$Glo);
176 &mov ("edi",$Ghi);
177 &add ("eax",$Hlo);
178 &adc ("ebx",$Hhi); # T1 += h
179 &xor ("ecx","esi");
180 &xor ("edx","edi");
181 &and ("ecx",$Elo);
182 &and ("edx",$Ehi);
183 &add ("eax",&DWP(8*(9+15)+0,"esp"));
184 &adc ("ebx",&DWP(8*(9+15)+4,"esp")); # T1 += X[0]
185 &xor ("ecx","esi");
186	&xor	("edx","edi");			# Ch(e,f,g) = ((f^g)&e)^g
187
188 &mov ("esi",&DWP(0,$K512));
189 &mov ("edi",&DWP(4,$K512)); # K[i]
190 &add ("eax","ecx");
191 &adc ("ebx","edx"); # T1 += Ch(e,f,g)
192 &mov ("ecx",$Dlo);
193 &mov ("edx",$Dhi);
194 &add ("eax","esi");
195 &adc ("ebx","edi"); # T1 += K[i]
196 &mov ($Tlo,"eax");
197 &mov ($Thi,"ebx"); # put T1 away
198 &add ("eax","ecx");
199 &adc ("ebx","edx"); # d += T1
200
201 #define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
202 # LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
203 # HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
204 &mov ("ecx",$Alo);
205 &mov ("edx",$Ahi);
206 &mov ($Dlo,"eax");
207 &mov ($Dhi,"ebx");
208 &mov ("esi","ecx");
209
210 &shr ("ecx",2); # lo>>2
211 &mov ("edi","edx");
212 &shr ("edx",2); # hi>>2
213 &mov ("ebx","ecx");
214 &shl ("esi",4); # lo<<4
215 &mov ("eax","edx");
216 &shl ("edi",4); # hi<<4
217 &xor ("ebx","esi");
218
219 &shr ("ecx",7-2); # lo>>7
220 &xor ("eax","edi");
221 &shr ("edx",7-2); # hi>>7
222 &xor ("ebx","ecx");
223 &shl ("esi",25-4); # lo<<25
224 &xor ("eax","edx");
225 &shl ("edi",25-4); # hi<<25
226 &xor ("eax","esi");
227
228 &shr ("ecx",28-7); # lo>>28
229 &xor ("ebx","edi");
230 &shr ("edx",28-7); # hi>>28
231 &xor ("eax","ecx");
232 &shl ("esi",30-25); # lo<<30
233 &xor ("ebx","edx");
234 &shl ("edi",30-25); # hi<<30
235 &xor ("eax","esi");
236 &xor ("ebx","edi"); # Sigma0(a)
237
238 &mov ("ecx",$Alo);
239 &mov ("edx",$Ahi);
240 &mov ("esi",$Blo);
241 &mov ("edi",$Bhi);
242 &add ("eax",$Tlo);
243 &adc ("ebx",$Thi); # T1 = Sigma0(a)+T1
244 &or ("ecx","esi");
245 &or ("edx","edi");
246 &and ("ecx",$Clo);
247 &and ("edx",$Chi);
248 &and ("esi",$Alo);
249 &and ("edi",$Ahi);
250 &or ("ecx","esi");
251 &or ("edx","edi"); # Maj(a,b,c) = ((a|b)&c)|(a&b)
252
253 &add ("eax","ecx");
254 &adc ("ebx","edx"); # T1 += Maj(a,b,c)
255 &mov ($Tlo,"eax");
256 &mov ($Thi,"ebx");
257
258 &mov (&LB("edx"),&BP(0,$K512)); # pre-fetch LSB of *K
259 &sub ("esp",8);
260 &lea ($K512,&DWP(8,$K512)); # K++
261}
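# Not part of the original module: a standalone sketch (needs a perl built
# with 64-bit integers; names illustrative) of the lo/hi decomposition used
# by BODY_00_15_x86 above -- a 64-bit ROTR(x,r) is assembled from 32-bit
# shifts of the two halves, with the halves swapped first when r>=32.
sub _rotr64       { my ($x,$r)=@_; (($x>>$r)|($x<<(64-$r))) & 0xffffffffffffffff; }
sub _rotr64_split {
	my ($lo,$hi,$r) = @_;
	($lo,$hi,$r) = ($hi,$lo,$r-32) if ($r>=32);	# e.g. ROTR(x,41) -> swap halves, ROTR 9
	return ((($lo>>$r)|($hi<<(32-$r))) & 0xffffffff,
		(($hi>>$r)|($lo<<(32-$r))) & 0xffffffff);
}
for my $r (14,18,41) {					# the Sigma1_512 rotation amounts
	my $x = 0x0123456789abcdef;
	my ($lo,$hi) = _rotr64_split($x & 0xffffffff, $x>>32, $r);
	die "rotr split r=$r" unless (($hi<<32)|$lo) == _rotr64($x,$r);
}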
262
263
264&function_begin("sha512_block_data_order");
265 &mov ("esi",wparam(0)); # ctx
266 &mov ("edi",wparam(1)); # inp
267 &mov ("eax",wparam(2)); # num
268 &mov ("ebx","esp"); # saved sp
269
270 &call (&label("pic_point")); # make it PIC!
271&set_label("pic_point");
272 &blindpop($K512);
273 &lea ($K512,&DWP(&label("K512")."-".&label("pic_point"),$K512));
274
275 &sub ("esp",16);
276 &and ("esp",-64);
277
278 &shl ("eax",7);
279 &add ("eax","edi");
280 &mov (&DWP(0,"esp"),"esi"); # ctx
281 &mov (&DWP(4,"esp"),"edi"); # inp
282 &mov (&DWP(8,"esp"),"eax"); # inp+num*128
283 &mov (&DWP(12,"esp"),"ebx"); # saved sp
284
285if ($sse2) {
286 &picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("K512"));
287 &bt (&DWP(0,"edx"),26);
288 &jnc (&label("loop_x86"));
289
290 # load ctx->h[0-7]
291 &movq ($A,&QWP(0,"esi"));
292 &movq ("mm1",&QWP(8,"esi"));
293 &movq ("mm2",&QWP(16,"esi"));
294 &movq ("mm3",&QWP(24,"esi"));
295 &movq ($E,&QWP(32,"esi"));
296 &movq ("mm5",&QWP(40,"esi"));
297 &movq ("mm6",&QWP(48,"esi"));
298 &movq ("mm7",&QWP(56,"esi"));
299 &sub ("esp",8*10);
300
301&set_label("loop_sse2",16);
302 # &movq ($Asse2,$A);
303 &movq ($Bsse2,"mm1");
304 &movq ($Csse2,"mm2");
305 &movq ($Dsse2,"mm3");
306 # &movq ($Esse2,$E);
307 &movq ($Fsse2,"mm5");
308 &movq ($Gsse2,"mm6");
309 &movq ($Hsse2,"mm7");
310
311 &mov ("ecx",&DWP(0,"edi"));
312 &mov ("edx",&DWP(4,"edi"));
313 &add ("edi",8);
314 &bswap ("ecx");
315 &bswap ("edx");
316 &mov (&DWP(8*9+4,"esp"),"ecx");
317 &mov (&DWP(8*9+0,"esp"),"edx");
318
319&set_label("00_14_sse2",16);
320 &mov ("eax",&DWP(0,"edi"));
321 &mov ("ebx",&DWP(4,"edi"));
322 &add ("edi",8);
323 &bswap ("eax");
324 &bswap ("ebx");
325 &mov (&DWP(8*8+4,"esp"),"eax");
326 &mov (&DWP(8*8+0,"esp"),"ebx");
327
328 &BODY_00_15_sse2();
329
330 &cmp (&LB("edx"),0x35);
331 &jne (&label("00_14_sse2"));
332
333 &BODY_00_15_sse2(1);
334
335&set_label("16_79_sse2",16);
336 #&movq ("mm2",&QWP(8*(9+16-1),"esp")); #prefetched in BODY_00_15
337 #&movq ("mm6",&QWP(8*(9+16-14),"esp"));
338 &movq ("mm1","mm2");
339
340 &psrlq ("mm2",1);
341 &movq ("mm7","mm6");
342 &psrlq ("mm6",6);
343 &movq ("mm3","mm2");
344
345 &psrlq ("mm2",7-1);
346 &movq ("mm5","mm6");
347 &psrlq ("mm6",19-6);
348 &pxor ("mm3","mm2");
349
350 &psrlq ("mm2",8-7);
351 &pxor ("mm5","mm6");
352 &psrlq ("mm6",61-19);
353 &pxor ("mm3","mm2");
354
355 &movq ("mm2",&QWP(8*(9+16),"esp"));
356
357 &psllq ("mm1",56);
358 &pxor ("mm5","mm6");
359 &psllq ("mm7",3);
360 &pxor ("mm3","mm1");
361
362 &paddq ("mm2",&QWP(8*(9+16-9),"esp"));
363
364 &psllq ("mm1",63-56);
365 &pxor ("mm5","mm7");
366 &psllq ("mm7",45-3);
367 &pxor ("mm3","mm1");
368 &pxor ("mm5","mm7");
369
370 &paddq ("mm3","mm5");
371 &paddq ("mm3","mm2");
372 &movq (&QWP(8*9,"esp"),"mm3");
373
374 &BODY_00_15_sse2(1);
375
376 &cmp (&LB("edx"),0x17);
377 &jne (&label("16_79_sse2"));
378
379 # &movq ($A,$Asse2);
380 &movq ("mm1",$Bsse2);
381 &movq ("mm2",$Csse2);
382 &movq ("mm3",$Dsse2);
383 # &movq ($E,$Esse2);
384 &movq ("mm5",$Fsse2);
385 &movq ("mm6",$Gsse2);
386 &movq ("mm7",$Hsse2);
387
388 &paddq ($A,&QWP(0,"esi"));
389 &paddq ("mm1",&QWP(8,"esi"));
390 &paddq ("mm2",&QWP(16,"esi"));
391 &paddq ("mm3",&QWP(24,"esi"));
392 &paddq ($E,&QWP(32,"esi"));
393 &paddq ("mm5",&QWP(40,"esi"));
394 &paddq ("mm6",&QWP(48,"esi"));
395 &paddq ("mm7",&QWP(56,"esi"));
396
397 &movq (&QWP(0,"esi"),$A);
398 &movq (&QWP(8,"esi"),"mm1");
399 &movq (&QWP(16,"esi"),"mm2");
400 &movq (&QWP(24,"esi"),"mm3");
401 &movq (&QWP(32,"esi"),$E);
402 &movq (&QWP(40,"esi"),"mm5");
403 &movq (&QWP(48,"esi"),"mm6");
404 &movq (&QWP(56,"esi"),"mm7");
405
406 &add ("esp",8*80); # destroy frame
407 &sub ($K512,8*80); # rewind K
408
409 &cmp ("edi",&DWP(8*10+8,"esp")); # are we done yet?
410 &jb (&label("loop_sse2"));
411
412 &emms ();
413 &mov ("esp",&DWP(8*10+12,"esp")); # restore sp
414&function_end_A();
415}
416&set_label("loop_x86",16);
417 # copy input block to stack reversing byte and qword order
418 for ($i=0;$i<8;$i++) {
419 &mov ("eax",&DWP($i*16+0,"edi"));
420 &mov ("ebx",&DWP($i*16+4,"edi"));
421 &mov ("ecx",&DWP($i*16+8,"edi"));
422 &mov ("edx",&DWP($i*16+12,"edi"));
423 &bswap ("eax");
424 &bswap ("ebx");
425 &bswap ("ecx");
426 &bswap ("edx");
427 &push ("eax");
428 &push ("ebx");
429 &push ("ecx");
430 &push ("edx");
431 }
432 &add ("edi",128);
433 &sub ("esp",9*8); # place for T,A,B,C,D,E,F,G,H
434 &mov (&DWP(8*(9+16)+4,"esp"),"edi");
435
436 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
437 &lea ("edi",&DWP(8,"esp"));
438 &mov ("ecx",16);
439 &data_word(0xA5F3F689); # rep movsd
440
441&set_label("00_15_x86",16);
442 &BODY_00_15_x86();
443
444 &cmp (&LB("edx"),0x94);
445 &jne (&label("00_15_x86"));
446
447&set_label("16_79_x86",16);
448 #define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
449 # LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
450 # HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
451 &mov ("ecx",&DWP(8*(9+15+16-1)+0,"esp"));
452 &mov ("edx",&DWP(8*(9+15+16-1)+4,"esp"));
453 &mov ("esi","ecx");
454
455 &shr ("ecx",1); # lo>>1
456 &mov ("edi","edx");
457 &shr ("edx",1); # hi>>1
458 &mov ("eax","ecx");
459 &shl ("esi",24); # lo<<24
460 &mov ("ebx","edx");
461 &shl ("edi",24); # hi<<24
462 &xor ("ebx","esi");
463
464 &shr ("ecx",7-1); # lo>>7
465 &xor ("eax","edi");
466 &shr ("edx",7-1); # hi>>7
467 &xor ("eax","ecx");
468 &shl ("esi",31-24); # lo<<31
469 &xor ("ebx","edx");
470 &shl ("edi",25-24); # hi<<25
471 &xor ("ebx","esi");
472
473 &shr ("ecx",8-7); # lo>>8
474 &xor ("eax","edi");
475 &shr ("edx",8-7); # hi>>8
476 &xor ("eax","ecx");
477 &shl ("edi",31-25); # hi<<31
478 &xor ("ebx","edx");
479 &xor ("eax","edi"); # T1 = sigma0(X[-15])
480
481 &mov (&DWP(0,"esp"),"eax");
482 &mov (&DWP(4,"esp"),"ebx"); # put T1 away
483
484 #define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
485 # LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
486 # HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
487 &mov ("ecx",&DWP(8*(9+15+16-14)+0,"esp"));
488 &mov ("edx",&DWP(8*(9+15+16-14)+4,"esp"));
489 &mov ("esi","ecx");
490
491 &shr ("ecx",6); # lo>>6
492 &mov ("edi","edx");
493 &shr ("edx",6); # hi>>6
494 &mov ("eax","ecx");
495 &shl ("esi",3); # lo<<3
496 &mov ("ebx","edx");
497 &shl ("edi",3); # hi<<3
498 &xor ("eax","esi");
499
500 &shr ("ecx",19-6); # lo>>19
501 &xor ("ebx","edi");
502 &shr ("edx",19-6); # hi>>19
503 &xor ("eax","ecx");
504 &shl ("esi",13-3); # lo<<13
505 &xor ("ebx","edx");
506 &shl ("edi",13-3); # hi<<13
507 &xor ("ebx","esi");
508
509 &shr ("ecx",29-19); # lo>>29
510 &xor ("eax","edi");
511 &shr ("edx",29-19); # hi>>29
512 &xor ("ebx","ecx");
513 &shl ("edi",26-13); # hi<<26
514 &xor ("eax","edx");
515 &xor ("eax","edi"); # sigma1(X[-2])
516
517 &mov ("ecx",&DWP(8*(9+15+16)+0,"esp"));
518 &mov ("edx",&DWP(8*(9+15+16)+4,"esp"));
519 &add ("eax",&DWP(0,"esp"));
520 &adc ("ebx",&DWP(4,"esp")); # T1 = sigma1(X[-2])+T1
521 &mov ("esi",&DWP(8*(9+15+16-9)+0,"esp"));
522 &mov ("edi",&DWP(8*(9+15+16-9)+4,"esp"));
523 &add ("eax","ecx");
524 &adc ("ebx","edx"); # T1 += X[-16]
525 &add ("eax","esi");
526 &adc ("ebx","edi"); # T1 += X[-7]
527 &mov (&DWP(8*(9+15)+0,"esp"),"eax");
528 &mov (&DWP(8*(9+15)+4,"esp"),"ebx"); # save X[0]
529
530 &BODY_00_15_x86();
531
532 &cmp (&LB("edx"),0x17);
533 &jne (&label("16_79_x86"));
534
535 &mov ("esi",&DWP(8*(9+16+80)+0,"esp"));# ctx
536 &mov ("edi",&DWP(8*(9+16+80)+4,"esp"));# inp
537 for($i=0;$i<4;$i++) {
538 &mov ("eax",&DWP($i*16+0,"esi"));
539 &mov ("ebx",&DWP($i*16+4,"esi"));
540 &mov ("ecx",&DWP($i*16+8,"esi"));
541 &mov ("edx",&DWP($i*16+12,"esi"));
542 &add ("eax",&DWP(8+($i*16)+0,"esp"));
543 &adc ("ebx",&DWP(8+($i*16)+4,"esp"));
544 &mov (&DWP($i*16+0,"esi"),"eax");
545 &mov (&DWP($i*16+4,"esi"),"ebx");
546 &add ("ecx",&DWP(8+($i*16)+8,"esp"));
547 &adc ("edx",&DWP(8+($i*16)+12,"esp"));
548 &mov (&DWP($i*16+8,"esi"),"ecx");
549 &mov (&DWP($i*16+12,"esi"),"edx");
550 }
551 &add ("esp",8*(9+16+80)); # destroy frame
552 &sub ($K512,8*80); # rewind K
553
554 &cmp ("edi",&DWP(8,"esp")); # are we done yet?
555 &jb (&label("loop_x86"));
556
557 &mov ("esp",&DWP(12,"esp")); # restore sp
558&function_end_A();
559
560&set_label("K512",64); # Yes! I keep it in the code segment!
561 &data_word(0xd728ae22,0x428a2f98); # u64
562 &data_word(0x23ef65cd,0x71374491); # u64
563 &data_word(0xec4d3b2f,0xb5c0fbcf); # u64
564 &data_word(0x8189dbbc,0xe9b5dba5); # u64
565 &data_word(0xf348b538,0x3956c25b); # u64
566 &data_word(0xb605d019,0x59f111f1); # u64
567 &data_word(0xaf194f9b,0x923f82a4); # u64
568 &data_word(0xda6d8118,0xab1c5ed5); # u64
569 &data_word(0xa3030242,0xd807aa98); # u64
570 &data_word(0x45706fbe,0x12835b01); # u64
571 &data_word(0x4ee4b28c,0x243185be); # u64
572 &data_word(0xd5ffb4e2,0x550c7dc3); # u64
573 &data_word(0xf27b896f,0x72be5d74); # u64
574 &data_word(0x3b1696b1,0x80deb1fe); # u64
575 &data_word(0x25c71235,0x9bdc06a7); # u64
576 &data_word(0xcf692694,0xc19bf174); # u64
577 &data_word(0x9ef14ad2,0xe49b69c1); # u64
578 &data_word(0x384f25e3,0xefbe4786); # u64
579 &data_word(0x8b8cd5b5,0x0fc19dc6); # u64
580 &data_word(0x77ac9c65,0x240ca1cc); # u64
581 &data_word(0x592b0275,0x2de92c6f); # u64
582 &data_word(0x6ea6e483,0x4a7484aa); # u64
583 &data_word(0xbd41fbd4,0x5cb0a9dc); # u64
584 &data_word(0x831153b5,0x76f988da); # u64
585 &data_word(0xee66dfab,0x983e5152); # u64
586 &data_word(0x2db43210,0xa831c66d); # u64
587 &data_word(0x98fb213f,0xb00327c8); # u64
588 &data_word(0xbeef0ee4,0xbf597fc7); # u64
589 &data_word(0x3da88fc2,0xc6e00bf3); # u64
590 &data_word(0x930aa725,0xd5a79147); # u64
591 &data_word(0xe003826f,0x06ca6351); # u64
592 &data_word(0x0a0e6e70,0x14292967); # u64
593 &data_word(0x46d22ffc,0x27b70a85); # u64
594 &data_word(0x5c26c926,0x2e1b2138); # u64
595 &data_word(0x5ac42aed,0x4d2c6dfc); # u64
596 &data_word(0x9d95b3df,0x53380d13); # u64
597 &data_word(0x8baf63de,0x650a7354); # u64
598 &data_word(0x3c77b2a8,0x766a0abb); # u64
599 &data_word(0x47edaee6,0x81c2c92e); # u64
600 &data_word(0x1482353b,0x92722c85); # u64
601 &data_word(0x4cf10364,0xa2bfe8a1); # u64
602 &data_word(0xbc423001,0xa81a664b); # u64
603 &data_word(0xd0f89791,0xc24b8b70); # u64
604 &data_word(0x0654be30,0xc76c51a3); # u64
605 &data_word(0xd6ef5218,0xd192e819); # u64
606 &data_word(0x5565a910,0xd6990624); # u64
607 &data_word(0x5771202a,0xf40e3585); # u64
608 &data_word(0x32bbd1b8,0x106aa070); # u64
609 &data_word(0xb8d2d0c8,0x19a4c116); # u64
610 &data_word(0x5141ab53,0x1e376c08); # u64
611 &data_word(0xdf8eeb99,0x2748774c); # u64
612 &data_word(0xe19b48a8,0x34b0bcb5); # u64
613 &data_word(0xc5c95a63,0x391c0cb3); # u64
614 &data_word(0xe3418acb,0x4ed8aa4a); # u64
615 &data_word(0x7763e373,0x5b9cca4f); # u64
616 &data_word(0xd6b2b8a3,0x682e6ff3); # u64
617 &data_word(0x5defb2fc,0x748f82ee); # u64
618 &data_word(0x43172f60,0x78a5636f); # u64
619 &data_word(0xa1f0ab72,0x84c87814); # u64
620 &data_word(0x1a6439ec,0x8cc70208); # u64
621 &data_word(0x23631e28,0x90befffa); # u64
622 &data_word(0xde82bde9,0xa4506ceb); # u64
623 &data_word(0xb2c67915,0xbef9a3f7); # u64
624 &data_word(0xe372532b,0xc67178f2); # u64
625 &data_word(0xea26619c,0xca273ece); # u64
626 &data_word(0x21c0c207,0xd186b8c7); # u64
627 &data_word(0xcde0eb1e,0xeada7dd6); # u64
628 &data_word(0xee6ed178,0xf57d4f7f); # u64
629 &data_word(0x72176fba,0x06f067aa); # u64
630 &data_word(0xa2c898a6,0x0a637dc5); # u64
631 &data_word(0xbef90dae,0x113f9804); # u64
632 &data_word(0x131c471b,0x1b710b35); # u64
633 &data_word(0x23047d84,0x28db77f5); # u64
634 &data_word(0x40c72493,0x32caab7b); # u64
635 &data_word(0x15c9bebc,0x3c9ebe0a); # u64
636 &data_word(0x9c100d4c,0x431d67c4); # u64
637 &data_word(0xcb3e42b6,0x4cc5d4be); # u64
638 &data_word(0xfc657e2a,0x597f299c); # u64
639 &data_word(0x3ad6faec,0x5fcb6fab); # u64
640 &data_word(0x4a475817,0x6c44198c); # u64
641&function_end_B("sha512_block_data_order");
642&asciz("SHA512 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
643
644&asm_finish();
diff --git a/src/lib/libcrypto/sha/asm/sha512-armv4.pl b/src/lib/libcrypto/sha/asm/sha512-armv4.pl
deleted file mode 100644
index 7faf37b147..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-armv4.pl
+++ /dev/null
@@ -1,582 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA512 block procedure for ARMv4. September 2007.
11
12# This code is ~4.5 (four and a half) times faster than code generated
13# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
14# Xscale PXA250 core].
15#
16# July 2010.
17#
18# Rescheduling for dual-issue pipeline resulted in 6% improvement on
19# Cortex A8 core and ~40 cycles per processed byte.
20
21# February 2011.
22#
23# Profiler-assisted and platform-specific optimization resulted in 7%
24# improvement on Cortex A8 core and ~38 cycles per byte.
25
26# March 2011.
27#
28# Add NEON implementation. On Cortex A8 it was measured to process
29# one byte in 25.5 cycles or 47% faster than integer-only code.
30
31# Byte order [in]dependence. =========================================
32#
33# Originally caller was expected to maintain specific *dword* order in
34# h[0-7], namely with most significant dword at *lower* address, which
35# was reflected in the two parameters below as 0 and 4. Now the caller is
36# expected to maintain native byte order for whole 64-bit values.
37$hi="HI";
38$lo="LO";
39# ====================================================================
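# Illustrative sketch, not part of the module (needs a perl with 64-bit
# integers for pack 'Q<'): with h[i] kept in native byte order, the
# little-endian (__ARMEL__) mapping puts the low dword of the value at
# offset LO=0 and the high dword at HI=4, which is all the $lo/$hi
# ldr/str offsets below depend on.
my $demo   = pack("Q<", 0x0123456789abcdef);	# h[i] as stored in memory (LE)
my $demolo = unpack("V", substr($demo,0,4));	# word at offset LO=0 -> 0x89abcdef
my $demohi = unpack("V", substr($demo,4,4));	# word at offset HI=4 -> 0x01234567
die "HI/LO demo" unless $demolo==0x89abcdef && $demohi==0x01234567;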
40
41while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
42open STDOUT,">$output";
43
44$ctx="r0"; # parameter block
45$inp="r1";
46$len="r2";
47
48$Tlo="r3";
49$Thi="r4";
50$Alo="r5";
51$Ahi="r6";
52$Elo="r7";
53$Ehi="r8";
54$t0="r9";
55$t1="r10";
56$t2="r11";
57$t3="r12";
58############ r13 is stack pointer
59$Ktbl="r14";
60############ r15 is program counter
61
62$Aoff=8*0;
63$Boff=8*1;
64$Coff=8*2;
65$Doff=8*3;
66$Eoff=8*4;
67$Foff=8*5;
68$Goff=8*6;
69$Hoff=8*7;
70$Xoff=8*8;
71
72sub BODY_00_15() {
73my $magic = shift;
74$code.=<<___;
75 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
76 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
77 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
78 mov $t0,$Elo,lsr#14
79 str $Tlo,[sp,#$Xoff+0]
80 mov $t1,$Ehi,lsr#14
81 str $Thi,[sp,#$Xoff+4]
82 eor $t0,$t0,$Ehi,lsl#18
83 ldr $t2,[sp,#$Hoff+0] @ h.lo
84 eor $t1,$t1,$Elo,lsl#18
85 ldr $t3,[sp,#$Hoff+4] @ h.hi
86 eor $t0,$t0,$Elo,lsr#18
87 eor $t1,$t1,$Ehi,lsr#18
88 eor $t0,$t0,$Ehi,lsl#14
89 eor $t1,$t1,$Elo,lsl#14
90 eor $t0,$t0,$Ehi,lsr#9
91 eor $t1,$t1,$Elo,lsr#9
92 eor $t0,$t0,$Elo,lsl#23
93 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
94 adds $Tlo,$Tlo,$t0
95 ldr $t0,[sp,#$Foff+0] @ f.lo
96 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
97 ldr $t1,[sp,#$Foff+4] @ f.hi
98 adds $Tlo,$Tlo,$t2
99 ldr $t2,[sp,#$Goff+0] @ g.lo
100 adc $Thi,$Thi,$t3 @ T += h
101 ldr $t3,[sp,#$Goff+4] @ g.hi
102
103 eor $t0,$t0,$t2
104 str $Elo,[sp,#$Eoff+0]
105 eor $t1,$t1,$t3
106 str $Ehi,[sp,#$Eoff+4]
107 and $t0,$t0,$Elo
108 str $Alo,[sp,#$Aoff+0]
109 and $t1,$t1,$Ehi
110 str $Ahi,[sp,#$Aoff+4]
111 eor $t0,$t0,$t2
112 ldr $t2,[$Ktbl,#$lo] @ K[i].lo
113 eor $t1,$t1,$t3 @ Ch(e,f,g)
114 ldr $t3,[$Ktbl,#$hi] @ K[i].hi
115
116 adds $Tlo,$Tlo,$t0
117 ldr $Elo,[sp,#$Doff+0] @ d.lo
118 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
119 ldr $Ehi,[sp,#$Doff+4] @ d.hi
120 adds $Tlo,$Tlo,$t2
121 and $t0,$t2,#0xff
122 adc $Thi,$Thi,$t3 @ T += K[i]
123 adds $Elo,$Elo,$Tlo
124 ldr $t2,[sp,#$Boff+0] @ b.lo
125 adc $Ehi,$Ehi,$Thi @ d += T
126 teq $t0,#$magic
127
128 ldr $t3,[sp,#$Coff+0] @ c.lo
129 orreq $Ktbl,$Ktbl,#1
130 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
131 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
132 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
133 mov $t0,$Alo,lsr#28
134 mov $t1,$Ahi,lsr#28
135 eor $t0,$t0,$Ahi,lsl#4
136 eor $t1,$t1,$Alo,lsl#4
137 eor $t0,$t0,$Ahi,lsr#2
138 eor $t1,$t1,$Alo,lsr#2
139 eor $t0,$t0,$Alo,lsl#30
140 eor $t1,$t1,$Ahi,lsl#30
141 eor $t0,$t0,$Ahi,lsr#7
142 eor $t1,$t1,$Alo,lsr#7
143 eor $t0,$t0,$Alo,lsl#25
144 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
145 adds $Tlo,$Tlo,$t0
146 and $t0,$Alo,$t2
147 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
148
149 ldr $t1,[sp,#$Boff+4] @ b.hi
150 orr $Alo,$Alo,$t2
151 ldr $t2,[sp,#$Coff+4] @ c.hi
152 and $Alo,$Alo,$t3
153 and $t3,$Ahi,$t1
154 orr $Ahi,$Ahi,$t1
155 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
156 and $Ahi,$Ahi,$t2
157 adds $Alo,$Alo,$Tlo
158 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
159 sub sp,sp,#8
160 adc $Ahi,$Ahi,$Thi @ h += T
161 tst $Ktbl,#1
162 add $Ktbl,$Ktbl,#8
163___
164}
165$code=<<___;
166#include "arm_arch.h"
167#ifdef __ARMEL__
168# define LO 0
169# define HI 4
170# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
171#else
172# define HI 0
173# define LO 4
174# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
175#endif
176
177.text
178.code 32
179.type K512,%object
180.align 5
181K512:
182WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
183WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
184WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
185WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
186WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
187WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
188WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
189WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
190WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
191WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
192WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
193WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
194WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
195WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
196WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
197WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
198WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
199WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
200WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
201WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
202WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
203WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
204WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
205WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
206WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
207WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
208WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
209WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
210WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
211WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
212WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
213WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
214WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
215WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
216WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
217WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
218WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
219WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
220WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
221WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
222.size K512,.-K512
223.LOPENSSL_armcap:
224.word OPENSSL_armcap_P-sha512_block_data_order
225.skip 32-4
226
227.global sha512_block_data_order
228.type sha512_block_data_order,%function
229sha512_block_data_order:
230 sub r3,pc,#8 @ sha512_block_data_order
231 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
232#if __ARM_ARCH__>=7
233 ldr r12,.LOPENSSL_armcap
234 ldr r12,[r3,r12] @ OPENSSL_armcap_P
235 tst r12,#1
236 bne .LNEON
237#endif
238 stmdb sp!,{r4-r12,lr}
239 sub $Ktbl,r3,#672 @ K512
240 sub sp,sp,#9*8
241
242 ldr $Elo,[$ctx,#$Eoff+$lo]
243 ldr $Ehi,[$ctx,#$Eoff+$hi]
244 ldr $t0, [$ctx,#$Goff+$lo]
245 ldr $t1, [$ctx,#$Goff+$hi]
246 ldr $t2, [$ctx,#$Hoff+$lo]
247 ldr $t3, [$ctx,#$Hoff+$hi]
248.Loop:
249 str $t0, [sp,#$Goff+0]
250 str $t1, [sp,#$Goff+4]
251 str $t2, [sp,#$Hoff+0]
252 str $t3, [sp,#$Hoff+4]
253 ldr $Alo,[$ctx,#$Aoff+$lo]
254 ldr $Ahi,[$ctx,#$Aoff+$hi]
255 ldr $Tlo,[$ctx,#$Boff+$lo]
256 ldr $Thi,[$ctx,#$Boff+$hi]
257 ldr $t0, [$ctx,#$Coff+$lo]
258 ldr $t1, [$ctx,#$Coff+$hi]
259 ldr $t2, [$ctx,#$Doff+$lo]
260 ldr $t3, [$ctx,#$Doff+$hi]
261 str $Tlo,[sp,#$Boff+0]
262 str $Thi,[sp,#$Boff+4]
263 str $t0, [sp,#$Coff+0]
264 str $t1, [sp,#$Coff+4]
265 str $t2, [sp,#$Doff+0]
266 str $t3, [sp,#$Doff+4]
267 ldr $Tlo,[$ctx,#$Foff+$lo]
268 ldr $Thi,[$ctx,#$Foff+$hi]
269 str $Tlo,[sp,#$Foff+0]
270 str $Thi,[sp,#$Foff+4]
271
272.L00_15:
273#if __ARM_ARCH__<7
274 ldrb $Tlo,[$inp,#7]
275 ldrb $t0, [$inp,#6]
276 ldrb $t1, [$inp,#5]
277 ldrb $t2, [$inp,#4]
278 ldrb $Thi,[$inp,#3]
279 ldrb $t3, [$inp,#2]
280 orr $Tlo,$Tlo,$t0,lsl#8
281 ldrb $t0, [$inp,#1]
282 orr $Tlo,$Tlo,$t1,lsl#16
283 ldrb $t1, [$inp],#8
284 orr $Tlo,$Tlo,$t2,lsl#24
285 orr $Thi,$Thi,$t3,lsl#8
286 orr $Thi,$Thi,$t0,lsl#16
287 orr $Thi,$Thi,$t1,lsl#24
288#else
289 ldr $Tlo,[$inp,#4]
290 ldr $Thi,[$inp],#8
291#ifdef __ARMEL__
292 rev $Tlo,$Tlo
293 rev $Thi,$Thi
294#endif
295#endif
296___
297 &BODY_00_15(0x94);
298$code.=<<___;
299 tst $Ktbl,#1
300 beq .L00_15
301 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
302 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
303 bic $Ktbl,$Ktbl,#1
304.L16_79:
305 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
306 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
307 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
308 mov $Tlo,$t0,lsr#1
309 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
310 mov $Thi,$t1,lsr#1
311 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
312 eor $Tlo,$Tlo,$t1,lsl#31
313 eor $Thi,$Thi,$t0,lsl#31
314 eor $Tlo,$Tlo,$t0,lsr#8
315 eor $Thi,$Thi,$t1,lsr#8
316 eor $Tlo,$Tlo,$t1,lsl#24
317 eor $Thi,$Thi,$t0,lsl#24
318 eor $Tlo,$Tlo,$t0,lsr#7
319 eor $Thi,$Thi,$t1,lsr#7
320 eor $Tlo,$Tlo,$t1,lsl#25
321
322 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
323 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
324 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
325 mov $t0,$t2,lsr#19
326 mov $t1,$t3,lsr#19
327 eor $t0,$t0,$t3,lsl#13
328 eor $t1,$t1,$t2,lsl#13
329 eor $t0,$t0,$t3,lsr#29
330 eor $t1,$t1,$t2,lsr#29
331 eor $t0,$t0,$t2,lsl#3
332 eor $t1,$t1,$t3,lsl#3
333 eor $t0,$t0,$t2,lsr#6
334 eor $t1,$t1,$t3,lsr#6
335 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
336 eor $t0,$t0,$t3,lsl#26
337
338 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
339 adds $Tlo,$Tlo,$t0
340 ldr $t0,[sp,#`$Xoff+8*16`+0]
341 adc $Thi,$Thi,$t1
342
343 ldr $t1,[sp,#`$Xoff+8*16`+4]
344 adds $Tlo,$Tlo,$t2
345 adc $Thi,$Thi,$t3
346 adds $Tlo,$Tlo,$t0
347 adc $Thi,$Thi,$t1
348___
349 &BODY_00_15(0x17);
350$code.=<<___;
351 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
352 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
353 beq .L16_79
354 bic $Ktbl,$Ktbl,#1
355
356 ldr $Tlo,[sp,#$Boff+0]
357 ldr $Thi,[sp,#$Boff+4]
358 ldr $t0, [$ctx,#$Aoff+$lo]
359 ldr $t1, [$ctx,#$Aoff+$hi]
360 ldr $t2, [$ctx,#$Boff+$lo]
361 ldr $t3, [$ctx,#$Boff+$hi]
362 adds $t0,$Alo,$t0
363 str $t0, [$ctx,#$Aoff+$lo]
364 adc $t1,$Ahi,$t1
365 str $t1, [$ctx,#$Aoff+$hi]
366 adds $t2,$Tlo,$t2
367 str $t2, [$ctx,#$Boff+$lo]
368 adc $t3,$Thi,$t3
369 str $t3, [$ctx,#$Boff+$hi]
370
371 ldr $Alo,[sp,#$Coff+0]
372 ldr $Ahi,[sp,#$Coff+4]
373 ldr $Tlo,[sp,#$Doff+0]
374 ldr $Thi,[sp,#$Doff+4]
375 ldr $t0, [$ctx,#$Coff+$lo]
376 ldr $t1, [$ctx,#$Coff+$hi]
377 ldr $t2, [$ctx,#$Doff+$lo]
378 ldr $t3, [$ctx,#$Doff+$hi]
379 adds $t0,$Alo,$t0
380 str $t0, [$ctx,#$Coff+$lo]
381 adc $t1,$Ahi,$t1
382 str $t1, [$ctx,#$Coff+$hi]
383 adds $t2,$Tlo,$t2
384 str $t2, [$ctx,#$Doff+$lo]
385 adc $t3,$Thi,$t3
386 str $t3, [$ctx,#$Doff+$hi]
387
388 ldr $Tlo,[sp,#$Foff+0]
389 ldr $Thi,[sp,#$Foff+4]
390 ldr $t0, [$ctx,#$Eoff+$lo]
391 ldr $t1, [$ctx,#$Eoff+$hi]
392 ldr $t2, [$ctx,#$Foff+$lo]
393 ldr $t3, [$ctx,#$Foff+$hi]
394 adds $Elo,$Elo,$t0
395 str $Elo,[$ctx,#$Eoff+$lo]
396 adc $Ehi,$Ehi,$t1
397 str $Ehi,[$ctx,#$Eoff+$hi]
398 adds $t2,$Tlo,$t2
399 str $t2, [$ctx,#$Foff+$lo]
400 adc $t3,$Thi,$t3
401 str $t3, [$ctx,#$Foff+$hi]
402
403 ldr $Alo,[sp,#$Goff+0]
404 ldr $Ahi,[sp,#$Goff+4]
405 ldr $Tlo,[sp,#$Hoff+0]
406 ldr $Thi,[sp,#$Hoff+4]
407 ldr $t0, [$ctx,#$Goff+$lo]
408 ldr $t1, [$ctx,#$Goff+$hi]
409 ldr $t2, [$ctx,#$Hoff+$lo]
410 ldr $t3, [$ctx,#$Hoff+$hi]
411 adds $t0,$Alo,$t0
412 str $t0, [$ctx,#$Goff+$lo]
413 adc $t1,$Ahi,$t1
414 str $t1, [$ctx,#$Goff+$hi]
415 adds $t2,$Tlo,$t2
416 str $t2, [$ctx,#$Hoff+$lo]
417 adc $t3,$Thi,$t3
418 str $t3, [$ctx,#$Hoff+$hi]
419
420 add sp,sp,#640
421 sub $Ktbl,$Ktbl,#640
422
423 teq $inp,$len
424 bne .Loop
425
426 add sp,sp,#8*9 @ destroy frame
427#if __ARM_ARCH__>=5
428 ldmia sp!,{r4-r12,pc}
429#else
430 ldmia sp!,{r4-r12,lr}
431 tst lr,#1
432 moveq pc,lr @ be binary compatible with V4, yet
433 bx lr @ interoperable with Thumb ISA:-)
434#endif
435___
436
437{
438my @Sigma0=(28,34,39);
439my @Sigma1=(14,18,41);
440my @sigma0=(1, 8, 7);
441my @sigma1=(19,61,6);
442
443my $Ktbl="r3";
444my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
445
446my @X=map("d$_",(0..15));
447my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
448
449sub NEON_00_15() {
450my $i=shift;
451my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
452my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
453
454$code.=<<___ if ($i<16 || $i&1);
455 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
456#if $i<16
457 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
458#endif
459 vshr.u64 $t1,$e,#@Sigma1[1]
460 vshr.u64 $t2,$e,#@Sigma1[2]
461___
462$code.=<<___;
463 vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
464 vsli.64 $t0,$e,#`64-@Sigma1[0]`
465 vsli.64 $t1,$e,#`64-@Sigma1[1]`
466 vsli.64 $t2,$e,#`64-@Sigma1[2]`
467#if $i<16 && defined(__ARMEL__)
468 vrev64.8 @X[$i],@X[$i]
469#endif
470 vadd.i64 $T1,$K,$h
471 veor $Ch,$f,$g
472 veor $t0,$t1
473 vand $Ch,$e
474 veor $t0,$t2 @ Sigma1(e)
475 veor $Ch,$g @ Ch(e,f,g)
476 vadd.i64 $T1,$t0
477 vshr.u64 $t0,$a,#@Sigma0[0]
478 vadd.i64 $T1,$Ch
479 vshr.u64 $t1,$a,#@Sigma0[1]
480 vshr.u64 $t2,$a,#@Sigma0[2]
481 vsli.64 $t0,$a,#`64-@Sigma0[0]`
482 vsli.64 $t1,$a,#`64-@Sigma0[1]`
483 vsli.64 $t2,$a,#`64-@Sigma0[2]`
484 vadd.i64 $T1,@X[$i%16]
485 vorr $Maj,$a,$c
486 vand $Ch,$a,$c
487 veor $h,$t0,$t1
488 vand $Maj,$b
489 veor $h,$t2 @ Sigma0(a)
490 vorr $Maj,$Ch @ Maj(a,b,c)
491 vadd.i64 $h,$T1
492 vadd.i64 $d,$T1
493 vadd.i64 $h,$Maj
494___
495}
496
497sub NEON_16_79() {
498my $i=shift;
499
500if ($i&1) { &NEON_00_15($i,@_); return; }
501
502# 2x-vectorized, therefore runs every 2nd round
503my @X=map("q$_",(0..7)); # view @X as 128-bit vector
504my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
505my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
506my $e=@_[4]; # $e from NEON_00_15
507$i /= 2;
508$code.=<<___;
509 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
510 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
511 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
512 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
513 vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
514 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
515 veor $s1,$t0
516 vshr.u64 $t0,$s0,#@sigma0[0]
517 veor $s1,$t1 @ sigma1(X[i+14])
518 vshr.u64 $t1,$s0,#@sigma0[1]
519 vadd.i64 @X[$i%8],$s1
520 vshr.u64 $s1,$s0,#@sigma0[2]
521 vsli.64 $t0,$s0,#`64-@sigma0[0]`
522 vsli.64 $t1,$s0,#`64-@sigma0[1]`
523 vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
524 veor $s1,$t0
525 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
526 vadd.i64 @X[$i%8],$s0
527 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
528 veor $s1,$t1 @ sigma0(X[i+1])
529 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
530 vadd.i64 @X[$i%8],$s1
531___
532 &NEON_00_15(2*$i,@_);
533}
534
535$code.=<<___;
536#if __ARM_ARCH__>=7
537.fpu neon
538
539.align 4
540.LNEON:
541 dmb @ errata #451034 on early Cortex A8
542 vstmdb sp!,{d8-d15} @ ABI specification says so
543 sub $Ktbl,r3,#672 @ K512
544 vldmia $ctx,{$A-$H} @ load context
545.Loop_neon:
546___
547for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
548$code.=<<___;
549 mov $cnt,#4
550.L16_79_neon:
551 subs $cnt,#1
552___
553for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
554$code.=<<___;
555 bne .L16_79_neon
556
557 vldmia $ctx,{d24-d31} @ load context to temp
558 vadd.i64 q8,q12 @ vectorized accumulate
559 vadd.i64 q9,q13
560 vadd.i64 q10,q14
561 vadd.i64 q11,q15
562 vstmia $ctx,{$A-$H} @ save context
563 teq $inp,$len
564 sub $Ktbl,#640 @ rewind K512
565 bne .Loop_neon
566
567 vldmia sp!,{d8-d15} @ epilogue
568 bx lr
569#endif
570___
571}
572$code.=<<___;
573.size sha512_block_data_order,.-sha512_block_data_order
574.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
575.align 2
576.comm OPENSSL_armcap_P,4,4
577___
578
579$code =~ s/\`([^\`]*)\`/eval $1/gem;
580$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
581print $code;
582close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/sha/asm/sha512-ia64.pl b/src/lib/libcrypto/sha/asm/sha512-ia64.pl
deleted file mode 100755
index 1c6ce56522..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-ia64.pl
+++ /dev/null
@@ -1,672 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# SHA256/512_Transform for Itanium.
11#
12# sha512_block runs in 1003 cycles on Itanium 2, which is almost 50%
13# faster than gcc and >60%(!) faster than code generated by HP-UX
14# compiler (yes, HP-UX is generating slower code, because unlike gcc,
15# it failed to deploy "shift right pair," 'shrp' instruction, which
16# substitutes for 64-bit rotate).
17#
18# 924 cycles long sha256_block outperforms gcc by over factor of 2(!)
19# and HP-UX compiler - by >40% (yes, gcc won sha512_block, but lost
20# this one big time). Note that "formally" 924 is about 100 cycles
21# too much. I mean it's 64 32-bit rounds vs. 80 virtually identical
22# 64-bit ones and 1003*64/80 gives 802. Extra cycles, 2 per round,
23# are spent on extra work to provide for 32-bit rotations. 32-bit
24# rotations are still handled by 'shrp' instruction and for this
25# reason lower 32 bits are deposited to upper half of 64-bit register
26# prior 'shrp' issue. And in order to minimize the amount of such
27# operations, X[16] values are *maintained* with copies of lower
28# halves in upper halves, which is why you'll spot such instructions
29# as custom 'mux2', "parallel 32-bit add," 'padd4' and "parallel
30# 32-bit unsigned right shift," 'pshr4.u' instructions here.
31#
32# Rules of engagement.
33#
34# There is only one integer shifter meaning that if I have two rotate,
35# deposit or extract instructions in adjacent bundles, they shall
36# split [at run-time if they have to]. But note that variable and
37# parallel shifts are performed by multi-media ALU and *are* pairable
38# with rotates [and alike]. On the backside MMALU is rather slow: it
39# takes 2 extra cycles before the result of integer operation is
40# available *to* MMALU and 2(*) extra cycles before the result of MM
41# operation is available "back" *to* integer ALU, not to mention that
42# MMALU itself has 2 cycles latency. However! I explicitly scheduled
43# these MM instructions to avoid MM stalls, so that all these extra
44# latencies get "hidden" in instruction-level parallelism.
45#
46# (*) 2 cycles on Itanium 1 and 1 cycle on Itanium 2. But I schedule
47# for 2 in order to provide for best *overall* performance,
48# because on Itanium 1 stall on MM result is accompanied by
49# pipeline flush, which takes 6 cycles:-(
50#
51# Resulting performance numbers for 900MHz Itanium 2 system:
52#
53# The 'numbers' are in 1000s of bytes per second processed.
54# type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes
55# sha1(*) 6210.14k 20376.30k 52447.83k 85870.05k 105478.12k
56# sha256 7476.45k 20572.05k 41538.34k 56062.29k 62093.18k
57# sha512 4996.56k 20026.28k 47597.20k 85278.79k 111501.31k
58#
59# (*) SHA1 numbers are for HP-UX compiler and are presented purely
60# for reference purposes. I bet it can be improved too...
61#
62# To generate code, pass the file name with either 256 or 512 in its
63# name and compiler flags.
64
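# Not part of the original script: a minimal standalone sketch (needs a
# 64-bit perl; names illustrative) of the trick described above -- keeping a
# copy of the low 32 bits in the upper half of a 64-bit register (what
# 'mux2 x,0x44' arranges) lets the 64-bit shift-right-pair act as a 32-bit
# rotate: ROTR32(x,r) == (((x<<32)|x) >> r) & 0xffffffff.
sub _rotr32      { my ($x,$r)=@_; (($x>>$r)|($x<<(32-$r))) & 0xffffffff; }
sub _rotr32_shrp { my ($x,$r)=@_; (((($x<<32)|$x) >> $r) & 0xffffffff); }
for my $r (2,13,22,6,11,25) {			# the SHA-256 Sigma rotation amounts
	die "shrp rotate r=$r" unless _rotr32(0xdeadbeef,$r) == _rotr32_shrp(0xdeadbeef,$r);
}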
65$output=shift;
66
67if ($output =~ /512.*\.[s|asm]/) {
68 $SZ=8;
69 $BITS=8*$SZ;
70 $LDW="ld8";
71 $STW="st8";
72 $ADD="add";
73 $SHRU="shr.u";
74 $TABLE="K512";
75 $func="sha512_block_data_order";
76 @Sigma0=(28,34,39);
77 @Sigma1=(14,18,41);
78 @sigma0=(1, 8, 7);
79 @sigma1=(19,61, 6);
80 $rounds=80;
81} elsif ($output =~ /256.*\.[s|asm]/) {
82 $SZ=4;
83 $BITS=8*$SZ;
84 $LDW="ld4";
85 $STW="st4";
86 $ADD="padd4";
87 $SHRU="pshr4.u";
88 $TABLE="K256";
89 $func="sha256_block_data_order";
90 @Sigma0=( 2,13,22);
91 @Sigma1=( 6,11,25);
92 @sigma0=( 7,18, 3);
93 @sigma1=(17,19,10);
94 $rounds=64;
95} else { die "nonsense $output"; }
96
97open STDOUT,">$output" or die "can't open $output: $!";
98
99if ($^O eq "hpux") {
100 $ADDP="addp4";
101 for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
102} else { $ADDP="add"; }
103for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
104 $big_endian=0 if (/\-DL_ENDIAN/); }
105if (!defined($big_endian))
106 { $big_endian=(unpack('L',pack('N',1))==1); }
107
108$code=<<___;
109.ident \"$output, version 1.1\"
110.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
111.explicit
112.text
113
114pfssave=r2;
115lcsave=r3;
116prsave=r14;
117K=r15;
118A=r16; B=r17; C=r18; D=r19;
119E=r20; F=r21; G=r22; H=r23;
120T1=r24; T2=r25;
121s0=r26; s1=r27; t0=r28; t1=r29;
122Ktbl=r30;
123ctx=r31; // 1st arg
124input=r48; // 2nd arg
125num=r49; // 3rd arg
126sgm0=r50; sgm1=r51; // small constants
127A_=r54; B_=r55; C_=r56; D_=r57;
128E_=r58; F_=r59; G_=r60; H_=r61;
129
130// void $func (SHA_CTX *ctx, const void *in,size_t num[,int host])
131.global $func#
132.proc $func#
133.align 32
134$func:
135 .prologue
136 .save ar.pfs,pfssave
137{ .mmi; alloc pfssave=ar.pfs,3,27,0,16
138 $ADDP ctx=0,r32 // 1st arg
139 .save ar.lc,lcsave
140 mov lcsave=ar.lc }
141{ .mmi; $ADDP input=0,r33 // 2nd arg
142 mov num=r34 // 3rd arg
143 .save pr,prsave
144 mov prsave=pr };;
145
146 .body
147{ .mib; add r8=0*$SZ,ctx
148 add r9=1*$SZ,ctx
149 brp.loop.imp .L_first16,.L_first16_end-16 }
150{ .mib; add r10=2*$SZ,ctx
151 add r11=3*$SZ,ctx
152 brp.loop.imp .L_rest,.L_rest_end-16 };;
153
154// load A-H
155.Lpic_point:
156{ .mmi; $LDW A_=[r8],4*$SZ
157 $LDW B_=[r9],4*$SZ
158 mov Ktbl=ip }
159{ .mmi; $LDW C_=[r10],4*$SZ
160 $LDW D_=[r11],4*$SZ
161 mov sgm0=$sigma0[2] };;
162{ .mmi; $LDW E_=[r8]
163 $LDW F_=[r9]
164 add Ktbl=($TABLE#-.Lpic_point),Ktbl }
165{ .mmi; $LDW G_=[r10]
166 $LDW H_=[r11]
167 cmp.ne p0,p16=0,r0 };; // used in sha256_block
168___
169$code.=<<___ if ($BITS==64);
170{ .mii; and r8=7,input
171 and input=~7,input;;
172 cmp.eq p9,p0=1,r8 }
173{ .mmi; cmp.eq p10,p0=2,r8
174 cmp.eq p11,p0=3,r8
175 cmp.eq p12,p0=4,r8 }
176{ .mmi; cmp.eq p13,p0=5,r8
177 cmp.eq p14,p0=6,r8
178 cmp.eq p15,p0=7,r8 };;
179___
180$code.=<<___;
181.L_outer:
182.rotr X[16]
183{ .mmi; mov A=A_
184 mov B=B_
185 mov ar.lc=14 }
186{ .mmi; mov C=C_
187 mov D=D_
188 mov E=E_ }
189{ .mmi; mov F=F_
190 mov G=G_
191 mov ar.ec=2 }
192{ .mmi; ld1 X[15]=[input],$SZ // eliminated in 64-bit
193 mov H=H_
194 mov sgm1=$sigma1[2] };;
195
196___
197$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
198.align 32
199.L_first16:
200{ .mmi; add r9=1-$SZ,input
201 add r10=2-$SZ,input
202 add r11=3-$SZ,input };;
203{ .mmi; ld1 r9=[r9]
204 ld1 r10=[r10]
205 dep.z $t1=E,32,32 }
206{ .mmi; $LDW K=[Ktbl],$SZ
207 ld1 r11=[r11]
208 zxt4 E=E };;
209{ .mii; or $t1=$t1,E
210 dep X[15]=X[15],r9,8,8
211 dep r11=r10,r11,8,8 };;
212{ .mmi; and T1=F,E
213 and T2=A,B
214 dep X[15]=X[15],r11,16,16 }
215{ .mmi; andcm r8=G,E
216 and r9=A,C
217 mux2 $t0=A,0x44 };; // copy lower half to upper
218{ .mmi; (p16) ld1 X[15-1]=[input],$SZ // prefetch
219 xor T1=T1,r8 // T1=((e & f) ^ (~e & g))
220 _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14)
221{ .mib; and r10=B,C
222 xor T2=T2,r9 };;
223___
224$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
225// in 64-bit mode I load whole X[16] at once and take care of alignment...
226{ .mmi; add r8=1*$SZ,input
227 add r9=2*$SZ,input
228 add r10=3*$SZ,input };;
229{ .mmb; $LDW X[15]=[input],4*$SZ
230 $LDW X[14]=[r8],4*$SZ
231(p9) br.cond.dpnt.many .L1byte };;
232{ .mmb; $LDW X[13]=[r9],4*$SZ
233 $LDW X[12]=[r10],4*$SZ
234(p10) br.cond.dpnt.many .L2byte };;
235{ .mmb; $LDW X[11]=[input],4*$SZ
236 $LDW X[10]=[r8],4*$SZ
237(p11) br.cond.dpnt.many .L3byte };;
238{ .mmb; $LDW X[ 9]=[r9],4*$SZ
239 $LDW X[ 8]=[r10],4*$SZ
240(p12) br.cond.dpnt.many .L4byte };;
241{ .mmb; $LDW X[ 7]=[input],4*$SZ
242 $LDW X[ 6]=[r8],4*$SZ
243(p13) br.cond.dpnt.many .L5byte };;
244{ .mmb; $LDW X[ 5]=[r9],4*$SZ
245 $LDW X[ 4]=[r10],4*$SZ
246(p14) br.cond.dpnt.many .L6byte };;
247{ .mmb; $LDW X[ 3]=[input],4*$SZ
248 $LDW X[ 2]=[r8],4*$SZ
249(p15) br.cond.dpnt.many .L7byte };;
250{ .mmb; $LDW X[ 1]=[r9],4*$SZ
251 $LDW X[ 0]=[r10],4*$SZ
252 br.many .L_first16 };;
253.L1byte:
254{ .mmi; $LDW X[13]=[r9],4*$SZ
255 $LDW X[12]=[r10],4*$SZ
256 shrp X[15]=X[15],X[14],56 };;
257{ .mmi; $LDW X[11]=[input],4*$SZ
258 $LDW X[10]=[r8],4*$SZ
259 shrp X[14]=X[14],X[13],56 }
260{ .mmi; $LDW X[ 9]=[r9],4*$SZ
261 $LDW X[ 8]=[r10],4*$SZ
262 shrp X[13]=X[13],X[12],56 };;
263{ .mmi; $LDW X[ 7]=[input],4*$SZ
264 $LDW X[ 6]=[r8],4*$SZ
265 shrp X[12]=X[12],X[11],56 }
266{ .mmi; $LDW X[ 5]=[r9],4*$SZ
267 $LDW X[ 4]=[r10],4*$SZ
268 shrp X[11]=X[11],X[10],56 };;
269{ .mmi; $LDW X[ 3]=[input],4*$SZ
270 $LDW X[ 2]=[r8],4*$SZ
271 shrp X[10]=X[10],X[ 9],56 }
272{ .mmi; $LDW X[ 1]=[r9],4*$SZ
273 $LDW X[ 0]=[r10],4*$SZ
274 shrp X[ 9]=X[ 9],X[ 8],56 };;
275{ .mii; $LDW T1=[input]
276 shrp X[ 8]=X[ 8],X[ 7],56
277 shrp X[ 7]=X[ 7],X[ 6],56 }
278{ .mii; shrp X[ 6]=X[ 6],X[ 5],56
279 shrp X[ 5]=X[ 5],X[ 4],56 };;
280{ .mii; shrp X[ 4]=X[ 4],X[ 3],56
281 shrp X[ 3]=X[ 3],X[ 2],56 }
282{ .mii; shrp X[ 2]=X[ 2],X[ 1],56
283 shrp X[ 1]=X[ 1],X[ 0],56 }
284{ .mib; shrp X[ 0]=X[ 0],T1,56
285 br.many .L_first16 };;
286.L2byte:
287{ .mmi; $LDW X[11]=[input],4*$SZ
288 $LDW X[10]=[r8],4*$SZ
289 shrp X[15]=X[15],X[14],48 }
290{ .mmi; $LDW X[ 9]=[r9],4*$SZ
291 $LDW X[ 8]=[r10],4*$SZ
292 shrp X[14]=X[14],X[13],48 };;
293{ .mmi; $LDW X[ 7]=[input],4*$SZ
294 $LDW X[ 6]=[r8],4*$SZ
295 shrp X[13]=X[13],X[12],48 }
296{ .mmi; $LDW X[ 5]=[r9],4*$SZ
297 $LDW X[ 4]=[r10],4*$SZ
298 shrp X[12]=X[12],X[11],48 };;
299{ .mmi; $LDW X[ 3]=[input],4*$SZ
300 $LDW X[ 2]=[r8],4*$SZ
301 shrp X[11]=X[11],X[10],48 }
302{ .mmi; $LDW X[ 1]=[r9],4*$SZ
303 $LDW X[ 0]=[r10],4*$SZ
304 shrp X[10]=X[10],X[ 9],48 };;
305{ .mii; $LDW T1=[input]
306 shrp X[ 9]=X[ 9],X[ 8],48
307 shrp X[ 8]=X[ 8],X[ 7],48 }
308{ .mii; shrp X[ 7]=X[ 7],X[ 6],48
309 shrp X[ 6]=X[ 6],X[ 5],48 };;
310{ .mii; shrp X[ 5]=X[ 5],X[ 4],48
311 shrp X[ 4]=X[ 4],X[ 3],48 }
312{ .mii; shrp X[ 3]=X[ 3],X[ 2],48
313 shrp X[ 2]=X[ 2],X[ 1],48 }
314{ .mii; shrp X[ 1]=X[ 1],X[ 0],48
315 shrp X[ 0]=X[ 0],T1,48 }
316{ .mfb; br.many .L_first16 };;
317.L3byte:
318{ .mmi; $LDW X[ 9]=[r9],4*$SZ
319 $LDW X[ 8]=[r10],4*$SZ
320 shrp X[15]=X[15],X[14],40 };;
321{ .mmi; $LDW X[ 7]=[input],4*$SZ
322 $LDW X[ 6]=[r8],4*$SZ
323 shrp X[14]=X[14],X[13],40 }
324{ .mmi; $LDW X[ 5]=[r9],4*$SZ
325 $LDW X[ 4]=[r10],4*$SZ
326 shrp X[13]=X[13],X[12],40 };;
327{ .mmi; $LDW X[ 3]=[input],4*$SZ
328 $LDW X[ 2]=[r8],4*$SZ
329 shrp X[12]=X[12],X[11],40 }
330{ .mmi; $LDW X[ 1]=[r9],4*$SZ
331 $LDW X[ 0]=[r10],4*$SZ
332 shrp X[11]=X[11],X[10],40 };;
333{ .mii; $LDW T1=[input]
334 shrp X[10]=X[10],X[ 9],40
335 shrp X[ 9]=X[ 9],X[ 8],40 }
336{ .mii; shrp X[ 8]=X[ 8],X[ 7],40
337 shrp X[ 7]=X[ 7],X[ 6],40 };;
338{ .mii; shrp X[ 6]=X[ 6],X[ 5],40
339 shrp X[ 5]=X[ 5],X[ 4],40 }
340{ .mii; shrp X[ 4]=X[ 4],X[ 3],40
341 shrp X[ 3]=X[ 3],X[ 2],40 }
342{ .mii; shrp X[ 2]=X[ 2],X[ 1],40
343 shrp X[ 1]=X[ 1],X[ 0],40 }
344{ .mib; shrp X[ 0]=X[ 0],T1,40
345 br.many .L_first16 };;
346.L4byte:
347{ .mmi; $LDW X[ 7]=[input],4*$SZ
348 $LDW X[ 6]=[r8],4*$SZ
349 shrp X[15]=X[15],X[14],32 }
350{ .mmi; $LDW X[ 5]=[r9],4*$SZ
351 $LDW X[ 4]=[r10],4*$SZ
352 shrp X[14]=X[14],X[13],32 };;
353{ .mmi; $LDW X[ 3]=[input],4*$SZ
354 $LDW X[ 2]=[r8],4*$SZ
355 shrp X[13]=X[13],X[12],32 }
356{ .mmi; $LDW X[ 1]=[r9],4*$SZ
357 $LDW X[ 0]=[r10],4*$SZ
358 shrp X[12]=X[12],X[11],32 };;
359{ .mii; $LDW T1=[input]
360 shrp X[11]=X[11],X[10],32
361 shrp X[10]=X[10],X[ 9],32 }
362{ .mii; shrp X[ 9]=X[ 9],X[ 8],32
363 shrp X[ 8]=X[ 8],X[ 7],32 };;
364{ .mii; shrp X[ 7]=X[ 7],X[ 6],32
365 shrp X[ 6]=X[ 6],X[ 5],32 }
366{ .mii; shrp X[ 5]=X[ 5],X[ 4],32
367 shrp X[ 4]=X[ 4],X[ 3],32 }
368{ .mii; shrp X[ 3]=X[ 3],X[ 2],32
369 shrp X[ 2]=X[ 2],X[ 1],32 }
370{ .mii; shrp X[ 1]=X[ 1],X[ 0],32
371 shrp X[ 0]=X[ 0],T1,32 }
372{ .mfb; br.many .L_first16 };;
373.L5byte:
374{ .mmi; $LDW X[ 5]=[r9],4*$SZ
375 $LDW X[ 4]=[r10],4*$SZ
376 shrp X[15]=X[15],X[14],24 };;
377{ .mmi; $LDW X[ 3]=[input],4*$SZ
378 $LDW X[ 2]=[r8],4*$SZ
379 shrp X[14]=X[14],X[13],24 }
380{ .mmi; $LDW X[ 1]=[r9],4*$SZ
381 $LDW X[ 0]=[r10],4*$SZ
382 shrp X[13]=X[13],X[12],24 };;
383{ .mii; $LDW T1=[input]
384 shrp X[12]=X[12],X[11],24
385 shrp X[11]=X[11],X[10],24 }
386{ .mii; shrp X[10]=X[10],X[ 9],24
387 shrp X[ 9]=X[ 9],X[ 8],24 };;
388{ .mii; shrp X[ 8]=X[ 8],X[ 7],24
389 shrp X[ 7]=X[ 7],X[ 6],24 }
390{ .mii; shrp X[ 6]=X[ 6],X[ 5],24
391 shrp X[ 5]=X[ 5],X[ 4],24 }
392{ .mii; shrp X[ 4]=X[ 4],X[ 3],24
393 shrp X[ 3]=X[ 3],X[ 2],24 }
394{ .mii; shrp X[ 2]=X[ 2],X[ 1],24
395 shrp X[ 1]=X[ 1],X[ 0],24 }
396{ .mib; shrp X[ 0]=X[ 0],T1,24
397 br.many .L_first16 };;
398.L6byte:
399{ .mmi; $LDW X[ 3]=[input],4*$SZ
400 $LDW X[ 2]=[r8],4*$SZ
401 shrp X[15]=X[15],X[14],16 }
402{ .mmi; $LDW X[ 1]=[r9],4*$SZ
403 $LDW X[ 0]=[r10],4*$SZ
404 shrp X[14]=X[14],X[13],16 };;
405{ .mii; $LDW T1=[input]
406 shrp X[13]=X[13],X[12],16
407 shrp X[12]=X[12],X[11],16 }
408{ .mii; shrp X[11]=X[11],X[10],16
409 shrp X[10]=X[10],X[ 9],16 };;
410{ .mii; shrp X[ 9]=X[ 9],X[ 8],16
411 shrp X[ 8]=X[ 8],X[ 7],16 }
412{ .mii; shrp X[ 7]=X[ 7],X[ 6],16
413 shrp X[ 6]=X[ 6],X[ 5],16 }
414{ .mii; shrp X[ 5]=X[ 5],X[ 4],16
415 shrp X[ 4]=X[ 4],X[ 3],16 }
416{ .mii; shrp X[ 3]=X[ 3],X[ 2],16
417 shrp X[ 2]=X[ 2],X[ 1],16 }
418{ .mii; shrp X[ 1]=X[ 1],X[ 0],16
419 shrp X[ 0]=X[ 0],T1,16 }
420{ .mfb; br.many .L_first16 };;
421.L7byte:
422{ .mmi; $LDW X[ 1]=[r9],4*$SZ
423 $LDW X[ 0]=[r10],4*$SZ
424 shrp X[15]=X[15],X[14],8 };;
425{ .mii; $LDW T1=[input]
426 shrp X[14]=X[14],X[13],8
427 shrp X[13]=X[13],X[12],8 }
428{ .mii; shrp X[12]=X[12],X[11],8
429 shrp X[11]=X[11],X[10],8 };;
430{ .mii; shrp X[10]=X[10],X[ 9],8
431 shrp X[ 9]=X[ 9],X[ 8],8 }
432{ .mii; shrp X[ 8]=X[ 8],X[ 7],8
433 shrp X[ 7]=X[ 7],X[ 6],8 }
434{ .mii; shrp X[ 6]=X[ 6],X[ 5],8
435 shrp X[ 5]=X[ 5],X[ 4],8 }
436{ .mii; shrp X[ 4]=X[ 4],X[ 3],8
437 shrp X[ 3]=X[ 3],X[ 2],8 }
438{ .mii; shrp X[ 2]=X[ 2],X[ 1],8
439 shrp X[ 1]=X[ 1],X[ 0],8 }
440{ .mib; shrp X[ 0]=X[ 0],T1,8
441 br.many .L_first16 };;
442
443.align 32
444.L_first16:
445{ .mmi; $LDW K=[Ktbl],$SZ
446 and T1=F,E
447 and T2=A,B }
448{ .mmi; //$LDW X[15]=[input],$SZ // X[i]=*input++
449 andcm r8=G,E
450 and r9=A,C };;
451{ .mmi; xor T1=T1,r8 //T1=((e & f) ^ (~e & g))
452 and r10=B,C
453 _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14)
454{ .mmi; xor T2=T2,r9
455 mux1 X[15]=X[15],\@rev };; // eliminated in big-endian
456___
457$code.=<<___;
458{ .mib; add T1=T1,H // T1=Ch(e,f,g)+h
459 _rotr r8=$t1,$Sigma1[1] } // ROTR(e,18)
460{ .mib; xor T2=T2,r10 // T2=((a & b) ^ (a & c) ^ (b & c))
461 mov H=G };;
462{ .mib; xor r11=r8,r11
463 _rotr r9=$t1,$Sigma1[2] } // ROTR(e,41)
464{ .mib; mov G=F
465 mov F=E };;
466{ .mib; xor r9=r9,r11 // r9=Sigma1(e)
467 _rotr r10=$t0,$Sigma0[0] } // ROTR(a,28)
468{ .mib; add T1=T1,K // T1=Ch(e,f,g)+h+K512[i]
469 mov E=D };;
470{ .mib; add T1=T1,r9 // T1+=Sigma1(e)
471 _rotr r11=$t0,$Sigma0[1] } // ROTR(a,34)
472{ .mib; mov D=C
473 mov C=B };;
474{ .mib; add T1=T1,X[15] // T1+=X[i]
475 _rotr r8=$t0,$Sigma0[2] } // ROTR(a,39)
476{ .mib; xor r10=r10,r11
477 mux2 X[15]=X[15],0x44 };; // eliminated in 64-bit
478{ .mmi; xor r10=r8,r10 // r10=Sigma0(a)
479 mov B=A
480 add A=T1,T2 };;
481{ .mib; add E=E,T1
482 add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a)
483 br.ctop.sptk .L_first16 };;
484.L_first16_end:
485
486{ .mii; mov ar.lc=$rounds-17
487 mov ar.ec=1 };;
488
489.align 32
490.L_rest:
491.rotr X[16]
492{ .mib; $LDW K=[Ktbl],$SZ
493 _rotr r8=X[15-1],$sigma0[0] } // ROTR(s0,1)
494{ .mib; $ADD X[15]=X[15],X[15-9] // X[i&0xF]+=X[(i+9)&0xF]
495 $SHRU s0=X[15-1],sgm0 };; // s0=X[(i+1)&0xF]>>7
496{ .mib; and T1=F,E
497 _rotr r9=X[15-1],$sigma0[1] } // ROTR(s0,8)
498{ .mib; andcm r10=G,E
499 $SHRU s1=X[15-14],sgm1 };; // s1=X[(i+14)&0xF]>>6
500{ .mmi; xor T1=T1,r10 // T1=((e & f) ^ (~e & g))
501 xor r9=r8,r9
502 _rotr r10=X[15-14],$sigma1[0] };;// ROTR(s1,19)
503{ .mib; and T2=A,B
504 _rotr r11=X[15-14],$sigma1[1] }// ROTR(s1,61)
505{ .mib; and r8=A,C };;
506___
507$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
508// I adhere to mmi; in order to hold Itanium 1 back and avoid a 6-cycle
509// pipeline flush in the last bundle. Note that even on Itanium2 the
510// latter stalls for one clock cycle...
511{ .mmi; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF])
512 dep.z $t1=E,32,32 }
513{ .mmi; xor r10=r11,r10
514 zxt4 E=E };;
515{ .mmi; or $t1=$t1,E
516 xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF])
517 mux2 $t0=A,0x44 };; // copy lower half to upper
518{ .mmi; xor T2=T2,r8
519 _rotr r9=$t1,$Sigma1[0] } // ROTR(e,14)
520{ .mmi; and r10=B,C
521 add T1=T1,H // T1=Ch(e,f,g)+h
522 $ADD X[15]=X[15],s0 };; // X[i&0xF]+=sigma0(X[(i+1)&0xF])
523___
524$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
525{ .mib; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF])
526 _rotr r9=$t1,$Sigma1[0] } // ROTR(e,14)
527{ .mib; xor r10=r11,r10
528 xor T2=T2,r8 };;
529{ .mib; xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF])
530 add T1=T1,H }
531{ .mib; and r10=B,C
532 $ADD X[15]=X[15],s0 };; // X[i&0xF]+=sigma0(X[(i+1)&0xF])
533___
534$code.=<<___;
535{ .mmi; xor T2=T2,r10 // T2=((a & b) ^ (a & c) ^ (b & c))
536 mov H=G
537 _rotr r8=$t1,$Sigma1[1] };; // ROTR(e,18)
538{ .mmi; xor r11=r8,r9
539 $ADD X[15]=X[15],s1 // X[i&0xF]+=sigma1(X[(i+14)&0xF])
540 _rotr r9=$t1,$Sigma1[2] } // ROTR(e,41)
541{ .mmi; mov G=F
542 mov F=E };;
543{ .mib; xor r9=r9,r11 // r9=Sigma1(e)
544 _rotr r10=$t0,$Sigma0[0] } // ROTR(a,28)
545{ .mib; add T1=T1,K // T1=Ch(e,f,g)+h+K512[i]
546 mov E=D };;
547{ .mib; add T1=T1,r9 // T1+=Sigma1(e)
548 _rotr r11=$t0,$Sigma0[1] } // ROTR(a,34)
549{ .mib; mov D=C
550 mov C=B };;
551{ .mmi; add T1=T1,X[15] // T1+=X[i]
552 xor r10=r10,r11
553 _rotr r8=$t0,$Sigma0[2] };; // ROTR(a,39)
554{ .mmi; xor r10=r8,r10 // r10=Sigma0(a)
555 mov B=A
556 add A=T1,T2 };;
557{ .mib; add E=E,T1
558 add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a)
559 br.ctop.sptk .L_rest };;
560.L_rest_end:
561
562{ .mmi; add A_=A_,A
563 add B_=B_,B
564 add C_=C_,C }
565{ .mmi; add D_=D_,D
566 add E_=E_,E
567 cmp.ltu p16,p0=1,num };;
568{ .mmi; add F_=F_,F
569 add G_=G_,G
570 add H_=H_,H }
571{ .mmb; add Ktbl=-$SZ*$rounds,Ktbl
572(p16) add num=-1,num
573(p16) br.dptk.many .L_outer };;
574
575{ .mib; add r8=0*$SZ,ctx
576 add r9=1*$SZ,ctx }
577{ .mib; add r10=2*$SZ,ctx
578 add r11=3*$SZ,ctx };;
579{ .mmi; $STW [r8]=A_,4*$SZ
580 $STW [r9]=B_,4*$SZ
581 mov ar.lc=lcsave }
582{ .mmi; $STW [r10]=C_,4*$SZ
583 $STW [r11]=D_,4*$SZ
584 mov pr=prsave,0x1ffff };;
585{ .mmb; $STW [r8]=E_
586 $STW [r9]=F_ }
587{ .mmb; $STW [r10]=G_
588 $STW [r11]=H_
589 br.ret.sptk.many b0 };;
590.endp $func#
591___
592
593$code =~ s/\`([^\`]*)\`/eval $1/gem;
594$code =~ s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm;
595if ($BITS==64) {
596 $code =~ s/mux2(\s+)\S+/nop.i$1 0x0/gm;
597 $code =~ s/mux1(\s+)\S+/nop.i$1 0x0/gm if ($big_endian);
598 $code =~ s/(shrp\s+X\[[^=]+)=([^,]+),([^,]+),([1-9]+)/$1=$3,$2,64-$4/gm
599 if (!$big_endian);
600 $code =~ s/ld1(\s+)X\[\S+/nop.m$1 0x0/gm;
601}
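# [Editorial sketch, not part of this module; the sample line below is made
# up.] The _rotr substitution above rewrites the pseudo rotate into a shrp
# (shift-right-pair) with the source register supplied twice, e.g.:
my $rotr_demo = "\t_rotr\tr8=r20,14";
(my $shrp_demo = $rotr_demo) =~ s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/;
# $shrp_demo now holds "\tshrp\tr8=r20,r20,14"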
602
603print $code;
604
605print<<___ if ($BITS==32);
606.align 64
607.type K256#,\@object
608K256: data4 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
609 data4 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
610 data4 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
611 data4 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
612 data4 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
613 data4 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
614 data4 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
615 data4 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
616 data4 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
617 data4 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
618 data4 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
619 data4 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
620 data4 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
621 data4 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
622 data4 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
623 data4 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
624.size K256#,$SZ*$rounds
625stringz "SHA256 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
626___
627print<<___ if ($BITS==64);
628.align 64
629.type K512#,\@object
630K512: data8 0x428a2f98d728ae22,0x7137449123ef65cd
631 data8 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
632 data8 0x3956c25bf348b538,0x59f111f1b605d019
633 data8 0x923f82a4af194f9b,0xab1c5ed5da6d8118
634 data8 0xd807aa98a3030242,0x12835b0145706fbe
635 data8 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
636 data8 0x72be5d74f27b896f,0x80deb1fe3b1696b1
637 data8 0x9bdc06a725c71235,0xc19bf174cf692694
638 data8 0xe49b69c19ef14ad2,0xefbe4786384f25e3
639 data8 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
640 data8 0x2de92c6f592b0275,0x4a7484aa6ea6e483
641 data8 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
642 data8 0x983e5152ee66dfab,0xa831c66d2db43210
643 data8 0xb00327c898fb213f,0xbf597fc7beef0ee4
644 data8 0xc6e00bf33da88fc2,0xd5a79147930aa725
645 data8 0x06ca6351e003826f,0x142929670a0e6e70
646 data8 0x27b70a8546d22ffc,0x2e1b21385c26c926
647 data8 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
648 data8 0x650a73548baf63de,0x766a0abb3c77b2a8
649 data8 0x81c2c92e47edaee6,0x92722c851482353b
650 data8 0xa2bfe8a14cf10364,0xa81a664bbc423001
651 data8 0xc24b8b70d0f89791,0xc76c51a30654be30
652 data8 0xd192e819d6ef5218,0xd69906245565a910
653 data8 0xf40e35855771202a,0x106aa07032bbd1b8
654 data8 0x19a4c116b8d2d0c8,0x1e376c085141ab53
655 data8 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
656 data8 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
657 data8 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
658 data8 0x748f82ee5defb2fc,0x78a5636f43172f60
659 data8 0x84c87814a1f0ab72,0x8cc702081a6439ec
660 data8 0x90befffa23631e28,0xa4506cebde82bde9
661 data8 0xbef9a3f7b2c67915,0xc67178f2e372532b
662 data8 0xca273eceea26619c,0xd186b8c721c0c207
663 data8 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
664 data8 0x06f067aa72176fba,0x0a637dc5a2c898a6
665 data8 0x113f9804bef90dae,0x1b710b35131c471b
666 data8 0x28db77f523047d84,0x32caab7b40c72493
667 data8 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
668 data8 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
669 data8 0x5fcb6fab3ad6faec,0x6c44198c4a475817
670.size K512#,$SZ*$rounds
671stringz "SHA512 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
672___
diff --git a/src/lib/libcrypto/sha/asm/sha512-mips.pl b/src/lib/libcrypto/sha/asm/sha512-mips.pl
deleted file mode 100644
index 495a000695..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-mips.pl
+++ /dev/null
@@ -1,457 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA2 block procedures for MIPS.
11
12# October 2010.
13#
14# SHA256 performance improvement on MIPS R5000 CPU is ~27% over gcc-
15# generated code in o32 build and ~55% in n32/64 build. SHA512 [which
16# for now can only be compiled for MIPS64 ISA] improvement is modest
 17# ~17%, but it comes for free, because it's the same instruction sequence.
18# Improvement coefficients are for aligned input.
19
20######################################################################
 21# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
 22# widely used. Then there is a new contender: NUBI. It appears that if
 23# one picks the latter, it's possible to arrange code in an ABI-neutral
 24# manner. Therefore let's stick to the NUBI register layout:
25#
26($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
27($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
28($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
29($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
30#
31# The return value is placed in $a0. Following coding rules facilitate
32# interoperability:
33#
34# - never ever touch $tp, "thread pointer", former $gp [o32 can be
35# excluded from the rule, because it's specified volatile];
36# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
37# old code];
38# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
39#
40# For reference here is register layout for N32/64 MIPS ABIs:
41#
42# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
43# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
44# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
45# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
46# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
47#
48$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
49
50if ($flavour =~ /64/i) {
51 $LA="dla";
52} else {
53 $LA="la";
54}
55
56if ($flavour =~ /64|n32/i) {
57 $PTR_ADD="dadd"; # incidentally works even on n32
58 $PTR_SUB="dsub"; # incidentally works even on n32
59 $REG_S="sd";
60 $REG_L="ld";
61 $PTR_SLL="dsll"; # incidentally works even on n32
62 $SZREG=8;
63} else {
64 $PTR_ADD="add";
65 $PTR_SUB="sub";
66 $REG_S="sw";
67 $REG_L="lw";
68 $PTR_SLL="sll";
69 $SZREG=4;
70}
71$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
72#
73# <appro@openssl.org>
74#
75######################################################################
76
77$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
78
79for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
80open STDOUT,">$output";
81
82if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); }
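# [Editorial aside, not part of the original script.] The fallback probe on
# the line above works because pack('N',1) always yields big-endian byte
# order, so reading those bytes back with the native 'L' template gives 1
# only on a big-endian host, e.g.
#	unpack('L',pack('N',1)) == 1	# true on big-endian, false on little-endian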
83
84if ($output =~ /512/) {
85 $label="512";
86 $SZ=8;
87 $LD="ld"; # load from memory
88 $ST="sd"; # store to memory
89 $SLL="dsll"; # shift left logical
90 $SRL="dsrl"; # shift right logical
91 $ADDU="daddu";
92 @Sigma0=(28,34,39);
93 @Sigma1=(14,18,41);
94 @sigma0=( 7, 1, 8); # right shift first
95 @sigma1=( 6,19,61); # right shift first
96 $lastK=0x817;
97 $rounds=80;
98} else {
99 $label="256";
100 $SZ=4;
101 $LD="lw"; # load from memory
102 $ST="sw"; # store to memory
103 $SLL="sll"; # shift left logical
104 $SRL="srl"; # shift right logical
105 $ADDU="addu";
106 @Sigma0=( 2,13,22);
107 @Sigma1=( 6,11,25);
108 @sigma0=( 3, 7,18); # right shift first
109 @sigma1=(10,17,19); # right shift first
110 $lastK=0x8f2;
111 $rounds=64;
112}
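# [Editorial aside, not part of the original script.] $lastK above is simply
# the low 12 bits of the final round constant; the main loop masks the K
# value it just loaded with 0xfff and compares against $lastK to detect the
# end of the table without a separate round counter:
#	0xc67178f2         & 0xfff == 0x8f2	# SHA-256
#	0x6c44198c4a475817 & 0xfff == 0x817	# SHA-512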
113
114$MSB = $big_endian ? 0 : ($SZ-1);
115$LSB = ($SZ-1)&~$MSB;
116
117@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31));
118@X=map("\$$_",(8..23));
119
120$ctx=$a0;
121$inp=$a1;
122$len=$a2; $Ktbl=$len;
123
124sub BODY_00_15 {
125my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
126my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]);
127
128$code.=<<___ if ($i<15);
129 ${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp)
130 ${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp)
131___
132$code.=<<___ if (!$big_endian && $i<16 && $SZ==4);
133 srl $tmp0,@X[0],24 # byte swap($i)
134 srl $tmp1,@X[0],8
135 andi $tmp2,@X[0],0xFF00
136 sll @X[0],@X[0],24
137 andi $tmp1,0xFF00
138 sll $tmp2,$tmp2,8
139 or @X[0],$tmp0
140 or $tmp1,$tmp2
141 or @X[0],$tmp1
142___
143$code.=<<___ if (!$big_endian && $i<16 && $SZ==8);
144 ori $tmp0,$zero,0xFF
145 dsll $tmp2,$tmp0,32
146 or $tmp0,$tmp2 # 0x000000FF000000FF
147 and $tmp1,@X[0],$tmp0 # byte swap($i)
148 dsrl $tmp2,@X[0],24
149 dsll $tmp1,24
150 and $tmp2,$tmp0
151 dsll $tmp0,8 # 0x0000FF000000FF00
152 or $tmp1,$tmp2
153 and $tmp2,@X[0],$tmp0
154 dsrl @X[0],8
155 dsll $tmp2,8
156 and @X[0],$tmp0
157 or $tmp1,$tmp2
158 or @X[0],$tmp1
159 dsrl $tmp1,@X[0],32
160 dsll @X[0],32
161 or @X[0],$tmp1
162___
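# [Editorial aside, not part of the original module.] The two sequences
# above byte-swap each freshly loaded word on little-endian targets; the
# 32-bit variant computes the same value as
#	(($x>>24)&0xff)|(($x>>8)&0xff00)|(($x<<8)&0xff0000)|(($x<<24)&0xff000000)
# i.e. unpack('N',pack('V',$x)).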
163$code.=<<___;
164 $ADDU $T1,$X[0],$h # $i
165 $SRL $h,$e,@Sigma1[0]
166 xor $tmp2,$f,$g
167 $SLL $tmp1,$e,`$SZ*8-@Sigma1[2]`
168 and $tmp2,$e
169 $SRL $tmp0,$e,@Sigma1[1]
170 xor $h,$tmp1
171 $SLL $tmp1,$e,`$SZ*8-@Sigma1[1]`
172 xor $h,$tmp0
173 $SRL $tmp0,$e,@Sigma1[2]
174 xor $h,$tmp1
175 $SLL $tmp1,$e,`$SZ*8-@Sigma1[0]`
176 xor $h,$tmp0
177 xor $tmp2,$g # Ch(e,f,g)
178 xor $tmp0,$tmp1,$h # Sigma1(e)
179
180 $SRL $h,$a,@Sigma0[0]
181 $ADDU $T1,$tmp2
182 $LD $tmp2,`$i*$SZ`($Ktbl) # K[$i]
183 $SLL $tmp1,$a,`$SZ*8-@Sigma0[2]`
184 $ADDU $T1,$tmp0
185 $SRL $tmp0,$a,@Sigma0[1]
186 xor $h,$tmp1
187 $SLL $tmp1,$a,`$SZ*8-@Sigma0[1]`
188 xor $h,$tmp0
189 $SRL $tmp0,$a,@Sigma0[2]
190 xor $h,$tmp1
191 $SLL $tmp1,$a,`$SZ*8-@Sigma0[0]`
192 xor $h,$tmp0
193 $ST @X[0],`($i%16)*$SZ`($sp) # offload to ring buffer
194 xor $h,$tmp1 # Sigma0(a)
195
196 or $tmp0,$a,$b
197 and $tmp1,$a,$b
198 and $tmp0,$c
199 or $tmp1,$tmp0 # Maj(a,b,c)
200 $ADDU $T1,$tmp2 # +=K[$i]
201 $ADDU $h,$tmp1
202
203 $ADDU $d,$T1
204 $ADDU $h,$T1
205___
206$code.=<<___ if ($i>=13);
207 $LD @X[3],`(($i+3)%16)*$SZ`($sp) # prefetch from ring buffer
208___
209}
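# [Editorial sketch, not part of the original module; the helper names are
# made up and the rotation counts are the SHA-256 ones selected above.]
# The baseline MIPS ISA targeted here has no rotate instruction, so each
# ROTR in the round above is synthesized from an SRL/SLL pair; in plain
# perl the identities relied on are:
sub _rotr32_demo { my ($x,$n) = @_; (($x>>$n) | ($x<<(32-$n))) & 0xffffffff }
sub _Sigma1_demo { my $x = shift; _rotr32_demo($x,6) ^ _rotr32_demo($x,11) ^ _rotr32_demo($x,25) }
# Maj is computed with or/and instead of the canonical xor form; the two
# agree bit for bit: (a&b)|((a|b)&c) == (a&b)^(a&c)^(b&c)
sub _Maj_demo { my ($a,$b,$c) = @_; ($a & $b) | (($a | $b) & $c) }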
210
211sub BODY_16_XX {
212my $i=@_[0];
213my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]);
214
215$code.=<<___;
216 $SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i)
217 $ADDU @X[0],@X[9] # +=X[i+9]
218 $SLL $tmp1,@X[1],`$SZ*8-@sigma0[2]`
219 $SRL $tmp0,@X[1],@sigma0[1]
220 xor $tmp2,$tmp1
221 $SLL $tmp1,`@sigma0[2]-@sigma0[1]`
222 xor $tmp2,$tmp0
223 $SRL $tmp0,@X[1],@sigma0[2]
224 xor $tmp2,$tmp1
225
226 $SRL $tmp3,@X[14],@sigma1[0]
227 xor $tmp2,$tmp0 # sigma0(X[i+1])
228 $SLL $tmp1,@X[14],`$SZ*8-@sigma1[2]`
229 $ADDU @X[0],$tmp2
230 $SRL $tmp0,@X[14],@sigma1[1]
231 xor $tmp3,$tmp1
232 $SLL $tmp1,`@sigma1[2]-@sigma1[1]`
233 xor $tmp3,$tmp0
234 $SRL $tmp0,@X[14],@sigma1[2]
235 xor $tmp3,$tmp1
236
237 xor $tmp3,$tmp0 # sigma1(X[i+14])
238 $ADDU @X[0],$tmp3
239___
240 &BODY_00_15(@_);
241}
242
243$FRAMESIZE=16*$SZ+16*$SZREG;
244$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
245
246$code.=<<___;
247.text
248.set noat
249#if !defined(__vxworks) || defined(__pic__)
250.option pic2
251#endif
252
253.align 5
254.globl sha${label}_block_data_order
255.ent sha${label}_block_data_order
256sha${label}_block_data_order:
257 .frame $sp,$FRAMESIZE,$ra
258 .mask $SAVED_REGS_MASK,-$SZREG
259 .set noreorder
260___
261$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
262 .cpload $pf
263___
264$code.=<<___;
265 $PTR_SUB $sp,$FRAMESIZE
266 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
267 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
268 $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
269 $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
270 $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
271 $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
272 $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
273 $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
274 $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
275 $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
276___
277$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
278 $REG_S $s3,$FRAMESIZE-11*$SZREG($sp)
279 $REG_S $s2,$FRAMESIZE-12*$SZREG($sp)
280 $REG_S $s1,$FRAMESIZE-13*$SZREG($sp)
281 $REG_S $s0,$FRAMESIZE-14*$SZREG($sp)
282 $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
283___
284$code.=<<___;
285 $PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)`
286___
287$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
288 .cplocal $Ktbl
289 .cpsetup $pf,$zero,sha${label}_block_data_order
290___
291$code.=<<___;
292 .set reorder
293 $LA $Ktbl,K${label} # PIC-ified 'load address'
294
295 $LD $A,0*$SZ($ctx) # load context
296 $LD $B,1*$SZ($ctx)
297 $LD $C,2*$SZ($ctx)
298 $LD $D,3*$SZ($ctx)
299 $LD $E,4*$SZ($ctx)
300 $LD $F,5*$SZ($ctx)
301 $LD $G,6*$SZ($ctx)
302 $LD $H,7*$SZ($ctx)
303
304 $PTR_ADD @X[15],$inp # pointer to the end of input
305 $REG_S @X[15],16*$SZ($sp)
306 b .Loop
307
308.align 5
309.Loop:
310 ${LD}l @X[0],$MSB($inp)
311 ${LD}r @X[0],$LSB($inp)
312___
313for ($i=0;$i<16;$i++)
314{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
315$code.=<<___;
316 b .L16_xx
317.align 4
318.L16_xx:
319___
320for (;$i<32;$i++)
321{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
322$code.=<<___;
323 and @X[6],0xfff
324 li @X[7],$lastK
325 .set noreorder
326 bne @X[6],@X[7],.L16_xx
327 $PTR_ADD $Ktbl,16*$SZ # Ktbl+=16
328
329 $REG_L @X[15],16*$SZ($sp) # restore pointer to the end of input
330 $LD @X[0],0*$SZ($ctx)
331 $LD @X[1],1*$SZ($ctx)
332 $LD @X[2],2*$SZ($ctx)
333 $PTR_ADD $inp,16*$SZ
334 $LD @X[3],3*$SZ($ctx)
335 $ADDU $A,@X[0]
336 $LD @X[4],4*$SZ($ctx)
337 $ADDU $B,@X[1]
338 $LD @X[5],5*$SZ($ctx)
339 $ADDU $C,@X[2]
340 $LD @X[6],6*$SZ($ctx)
341 $ADDU $D,@X[3]
342 $LD @X[7],7*$SZ($ctx)
343 $ADDU $E,@X[4]
344 $ST $A,0*$SZ($ctx)
345 $ADDU $F,@X[5]
346 $ST $B,1*$SZ($ctx)
347 $ADDU $G,@X[6]
348 $ST $C,2*$SZ($ctx)
349 $ADDU $H,@X[7]
350 $ST $D,3*$SZ($ctx)
351 $ST $E,4*$SZ($ctx)
352 $ST $F,5*$SZ($ctx)
353 $ST $G,6*$SZ($ctx)
354 $ST $H,7*$SZ($ctx)
355
356 bne $inp,@X[15],.Loop
357 $PTR_SUB $Ktbl,`($rounds-16)*$SZ` # rewind $Ktbl
358
359 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
360 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
361 $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
362 $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
363 $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
364 $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
365 $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
366 $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
367 $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
368 $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
369___
370$code.=<<___ if ($flavour =~ /nubi/i);
371 $REG_L $s3,$FRAMESIZE-11*$SZREG($sp)
372 $REG_L $s2,$FRAMESIZE-12*$SZREG($sp)
373 $REG_L $s1,$FRAMESIZE-13*$SZREG($sp)
374 $REG_L $s0,$FRAMESIZE-14*$SZREG($sp)
375 $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
376___
377$code.=<<___;
378 jr $ra
379 $PTR_ADD $sp,$FRAMESIZE
380.end sha${label}_block_data_order
381
382.rdata
383.align 5
384K${label}:
385___
386if ($SZ==4) {
387$code.=<<___;
388 .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
389 .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
390 .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
391 .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
392 .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
393 .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
394 .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
395 .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
396 .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
397 .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
398 .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
399 .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
400 .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
401 .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
402 .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
403 .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
404___
405} else {
406$code.=<<___;
407 .dword 0x428a2f98d728ae22, 0x7137449123ef65cd
408 .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
409 .dword 0x3956c25bf348b538, 0x59f111f1b605d019
410 .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
411 .dword 0xd807aa98a3030242, 0x12835b0145706fbe
412 .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
413 .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
414 .dword 0x9bdc06a725c71235, 0xc19bf174cf692694
415 .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
416 .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
417 .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
418 .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
419 .dword 0x983e5152ee66dfab, 0xa831c66d2db43210
420 .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4
421 .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725
422 .dword 0x06ca6351e003826f, 0x142929670a0e6e70
423 .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926
424 .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
425 .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8
426 .dword 0x81c2c92e47edaee6, 0x92722c851482353b
427 .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001
428 .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30
429 .dword 0xd192e819d6ef5218, 0xd69906245565a910
430 .dword 0xf40e35855771202a, 0x106aa07032bbd1b8
431 .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
432 .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
433 .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
434 .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
435 .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60
436 .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec
437 .dword 0x90befffa23631e28, 0xa4506cebde82bde9
438 .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b
439 .dword 0xca273eceea26619c, 0xd186b8c721c0c207
440 .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
441 .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6
442 .dword 0x113f9804bef90dae, 0x1b710b35131c471b
443 .dword 0x28db77f523047d84, 0x32caab7b40c72493
444 .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
445 .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
446 .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
447___
448}
449$code.=<<___;
450.asciiz "SHA${label} for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
451.align 5
452
453___
454
455$code =~ s/\`([^\`]*)\`/eval $1/gem;
456print $code;
457close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-parisc.pl b/src/lib/libcrypto/sha/asm/sha512-parisc.pl
deleted file mode 100755
index 4af7731661..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-parisc.pl
+++ /dev/null
@@ -1,805 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256/512 block procedure for PA-RISC.
11
12# June 2009.
13#
14# SHA256 performance is >75% better than gcc 3.2 generated code on
 15# PA-7100LC. Compared to code generated by the vendor compiler, this
 16# implementation is almost 70% faster in a 64-bit build, but delivers
 17# virtually the same performance in a 32-bit build on PA-8600.
18#
19# SHA512 performance is >2.9x better than gcc 3.2 generated code on
 20# PA-7100LC, a PA-RISC 1.1 processor. The implementation then detects if the
 21# code is executed on a PA-RISC 2.0 processor and switches to the 64-bit
 22# code path, delivering adequate performance even in a "blended" 32-bit
 23# build. Though the 64-bit code is not any faster than code generated by
 24# the vendor compiler on PA-8600...
25#
26# Special thanks to polarhome.com for providing HP-UX account.
27
28$flavour = shift;
29$output = shift;
30open STDOUT,">$output";
31
32if ($flavour =~ /64/) {
33 $LEVEL ="2.0W";
34 $SIZE_T =8;
35 $FRAME_MARKER =80;
36 $SAVED_RP =16;
37 $PUSH ="std";
38 $PUSHMA ="std,ma";
39 $POP ="ldd";
40 $POPMB ="ldd,mb";
41} else {
42 $LEVEL ="1.0";
43 $SIZE_T =4;
44 $FRAME_MARKER =48;
45 $SAVED_RP =20;
46 $PUSH ="stw";
47 $PUSHMA ="stwm";
48 $POP ="ldw";
49 $POPMB ="ldwm";
50}
51
52if ($output =~ /512/) {
53 $func="sha512_block_data_order";
54 $SZ=8;
55 @Sigma0=(28,34,39);
56 @Sigma1=(14,18,41);
57 @sigma0=(1, 8, 7);
58 @sigma1=(19,61, 6);
59 $rounds=80;
60 $LAST10BITS=0x017;
61 $LD="ldd";
62 $LDM="ldd,ma";
63 $ST="std";
64} else {
65 $func="sha256_block_data_order";
66 $SZ=4;
67 @Sigma0=( 2,13,22);
68 @Sigma1=( 6,11,25);
69 @sigma0=( 7,18, 3);
70 @sigma1=(17,19,10);
71 $rounds=64;
72 $LAST10BITS=0x0f2;
73 $LD="ldw";
74 $LDM="ldwm";
75 $ST="stw";
76}
77
78$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
79 # [+ argument transfer]
80$XOFF=16*$SZ+32; # local variables
81$FRAME+=$XOFF;
82$XOFF+=$FRAME_MARKER; # distance between %sp and local variables
83
84$ctx="%r26"; # zapped by $a0
85$inp="%r25"; # zapped by $a1
86$num="%r24"; # zapped by $t0
87
88$a0 ="%r26";
89$a1 ="%r25";
90$t0 ="%r24";
91$t1 ="%r29";
92$Tbl="%r31";
93
94@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");
95
96@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
97 "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);
98
99sub ROUND_00_15 {
100my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
101$code.=<<___;
102 _ror $e,$Sigma1[0],$a0
103 and $f,$e,$t0
104 _ror $e,$Sigma1[1],$a1
105 addl $t1,$h,$h
106 andcm $g,$e,$t1
107 xor $a1,$a0,$a0
108 _ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1
109 or $t0,$t1,$t1 ; Ch(e,f,g)
110 addl @X[$i%16],$h,$h
111 xor $a0,$a1,$a1 ; Sigma1(e)
112 addl $t1,$h,$h
113 _ror $a,$Sigma0[0],$a0
114 addl $a1,$h,$h
115
116 _ror $a,$Sigma0[1],$a1
117 and $a,$b,$t0
118 and $a,$c,$t1
119 xor $a1,$a0,$a0
120 _ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1
121 xor $t1,$t0,$t0
122 and $b,$c,$t1
123 xor $a0,$a1,$a1 ; Sigma0(a)
124 addl $h,$d,$d
125 xor $t1,$t0,$t0 ; Maj(a,b,c)
126 `"$LDM $SZ($Tbl),$t1" if ($i<15)`
127 addl $a1,$h,$h
128 addl $t0,$h,$h
129
130___
131}
132
133sub ROUND_16_xx {
134my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
135$i-=16;
136$code.=<<___;
137 _ror @X[($i+1)%16],$sigma0[0],$a0
138 _ror @X[($i+1)%16],$sigma0[1],$a1
139 addl @X[($i+9)%16],@X[$i],@X[$i]
140 _ror @X[($i+14)%16],$sigma1[0],$t0
141 _ror @X[($i+14)%16],$sigma1[1],$t1
142 xor $a1,$a0,$a0
143 _shr @X[($i+1)%16],$sigma0[2],$a1
144 xor $t1,$t0,$t0
145 _shr @X[($i+14)%16],$sigma1[2],$t1
146 xor $a1,$a0,$a0 ; sigma0(X[(i+1)&0x0f])
147 xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f])
148 $LDM $SZ($Tbl),$t1
149 addl $a0,@X[$i],@X[$i]
150 addl $t0,@X[$i],@X[$i]
151___
152$code.=<<___ if ($i==15);
153 extru $t1,31,10,$a1
154 comiclr,<> $LAST10BITS,$a1,%r0
155 ldo 1($Tbl),$Tbl ; signal end of $Tbl
156___
157&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
158}
159
160$code=<<___;
161 .LEVEL $LEVEL
162#if 0
163 .SPACE \$TEXT\$
164 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
165#else
166 .text
167#endif
168
169 .ALIGN 64
170L\$table
171___
172$code.=<<___ if ($SZ==8);
173 .WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
174 .WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
175 .WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
176 .WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
177 .WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
178 .WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
179 .WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
180 .WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
181 .WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
182 .WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
183 .WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
184 .WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
185 .WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
186 .WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
187 .WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
188 .WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
189 .WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
190 .WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
191 .WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
192 .WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
193 .WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
194 .WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
195 .WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
196 .WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
197 .WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
198 .WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
199 .WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
200 .WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
201 .WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
202 .WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
203 .WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
204 .WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
205 .WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
206 .WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
207 .WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
208 .WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
209 .WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
210 .WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
211 .WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
212 .WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
213___
214$code.=<<___ if ($SZ==4);
215 .WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
216 .WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
217 .WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
218 .WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
219 .WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
220 .WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
221 .WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
222 .WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
223 .WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
224 .WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
225 .WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
226 .WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
227 .WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
228 .WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
229 .WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
230 .WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
231___
232$code.=<<___;
233
234 .EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
235 .ALIGN 64
236$func
237 .PROC
238 .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
239 .ENTRY
240 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
241 $PUSHMA %r3,$FRAME(%sp)
242 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
243 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
244 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
245 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
246 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
247 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
248 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
249 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
250 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
251 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
252 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
253 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
254 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
255 $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
256 $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
257
258 _shl $num,`log(16*$SZ)/log(2)`,$num
259 addl $inp,$num,$num ; $num to point at the end of $inp
260
261 $PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments
262 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
263 $PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)
264
265 blr %r0,$Tbl
266 ldi 3,$t1
267L\$pic
268 andcm $Tbl,$t1,$Tbl ; wipe privilege level
269 ldo L\$table-L\$pic($Tbl),$Tbl
270___
271$code.=<<___ if ($SZ==8 && $SIZE_T==4);
272#ifndef __OpenBSD__
273___
274$code.=<<___ if ($SZ==8 && $SIZE_T==4);
275 ldi 31,$t1
276 mtctl $t1,%cr11
277 extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0
278 b L\$parisc1
279 nop
280___
281$code.=<<___;
282 $LD `0*$SZ`($ctx),$A ; load context
283 $LD `1*$SZ`($ctx),$B
284 $LD `2*$SZ`($ctx),$C
285 $LD `3*$SZ`($ctx),$D
286 $LD `4*$SZ`($ctx),$E
287 $LD `5*$SZ`($ctx),$F
288 $LD `6*$SZ`($ctx),$G
289 $LD `7*$SZ`($ctx),$H
290
291 extru $inp,31,`log($SZ)/log(2)`,$t0
292 sh3addl $t0,%r0,$t0
293 subi `8*$SZ`,$t0,$t0
294 mtctl $t0,%cr11 ; load %sar with align factor
295
296L\$oop
297 ldi `$SZ-1`,$t0
298 $LDM $SZ($Tbl),$t1
299 andcm $inp,$t0,$t0 ; align $inp
300___
301 for ($i=0;$i<15;$i++) { # load input block
302 $code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; }
303$code.=<<___;
304 cmpb,*= $inp,$t0,L\$aligned
305 $LD `$SZ*15`($t0),@X[15]
306 $LD `$SZ*16`($t0),@X[16]
307___
308 for ($i=0;$i<16;$i++) { # align data
309 $code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; }
310$code.=<<___;
311L\$aligned
312 nop ; otherwise /usr/ccs/bin/as is confused by below .WORD
313___
314
315for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
316$code.=<<___;
317L\$rounds
318 nop ; otherwise /usr/ccs/bin/as is confused by below .WORD
319___
320for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
321$code.=<<___;
322 bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled?
323 nop
324
325 $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
326 $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
327 $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
328 ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl
329
330 $LD `0*$SZ`($ctx),@X[0] ; load context
331 $LD `1*$SZ`($ctx),@X[1]
332 $LD `2*$SZ`($ctx),@X[2]
333 $LD `3*$SZ`($ctx),@X[3]
334 $LD `4*$SZ`($ctx),@X[4]
335 $LD `5*$SZ`($ctx),@X[5]
336 addl @X[0],$A,$A
337 $LD `6*$SZ`($ctx),@X[6]
338 addl @X[1],$B,$B
339 $LD `7*$SZ`($ctx),@X[7]
340 ldo `16*$SZ`($inp),$inp ; advance $inp
341
342 $ST $A,`0*$SZ`($ctx) ; save context
343 addl @X[2],$C,$C
344 $ST $B,`1*$SZ`($ctx)
345 addl @X[3],$D,$D
346 $ST $C,`2*$SZ`($ctx)
347 addl @X[4],$E,$E
348 $ST $D,`3*$SZ`($ctx)
349 addl @X[5],$F,$F
350 $ST $E,`4*$SZ`($ctx)
351 addl @X[6],$G,$G
352 $ST $F,`5*$SZ`($ctx)
353 addl @X[7],$H,$H
354 $ST $G,`6*$SZ`($ctx)
355 $ST $H,`7*$SZ`($ctx)
356
357 cmpb,*<>,n $inp,$num,L\$oop
358 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
359___
360if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0
361{{
362$code.=<<___;
363 b L\$done
364 nop
365
366 .ALIGN 64
367L\$parisc1
368___
369$code.=<<___ if ($SZ==8 && $SIZE_T==4);
370#endif
371___
372
373@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo,
374 $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
375 ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
376 "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
377$a0 ="%r17";
378$a1 ="%r18";
379$a2 ="%r19";
380$a3 ="%r20";
381$t0 ="%r21";
382$t1 ="%r22";
383$t2 ="%r28";
384$t3 ="%r29";
385$Tbl="%r31";
386
387@X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx
388
389sub ROUND_00_15_pa1 {
390my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
391 $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
392my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
393
394$code.=<<___ if (!$flag);
395 ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
396 ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
397___
398$code.=<<___;
399 shd $ehi,$elo,$Sigma1[0],$t0
400 add $Xlo,$hlo,$hlo
401 shd $elo,$ehi,$Sigma1[0],$t1
402 addc $Xhi,$hhi,$hhi ; h += X[i]
403 shd $ehi,$elo,$Sigma1[1],$t2
404 ldwm 8($Tbl),$Xhi
405 shd $elo,$ehi,$Sigma1[1],$t3
406 ldw -4($Tbl),$Xlo ; load K[i]
407 xor $t2,$t0,$t0
408 xor $t3,$t1,$t1
409 and $flo,$elo,$a0
410 and $fhi,$ehi,$a1
411 shd $ehi,$elo,$Sigma1[2],$t2
412 andcm $glo,$elo,$a2
413 shd $elo,$ehi,$Sigma1[2],$t3
414 andcm $ghi,$ehi,$a3
415 xor $t2,$t0,$t0
416 xor $t3,$t1,$t1 ; Sigma1(e)
417 add $Xlo,$hlo,$hlo
418 xor $a2,$a0,$a0
419 addc $Xhi,$hhi,$hhi ; h += K[i]
420 xor $a3,$a1,$a1 ; Ch(e,f,g)
421
422 add $t0,$hlo,$hlo
423 shd $ahi,$alo,$Sigma0[0],$t0
424 addc $t1,$hhi,$hhi ; h += Sigma1(e)
425 shd $alo,$ahi,$Sigma0[0],$t1
426 add $a0,$hlo,$hlo
427 shd $ahi,$alo,$Sigma0[1],$t2
428 addc $a1,$hhi,$hhi ; h += Ch(e,f,g)
429 shd $alo,$ahi,$Sigma0[1],$t3
430
431 xor $t2,$t0,$t0
432 xor $t3,$t1,$t1
433 shd $ahi,$alo,$Sigma0[2],$t2
434 and $alo,$blo,$a0
435 shd $alo,$ahi,$Sigma0[2],$t3
436 and $ahi,$bhi,$a1
437 xor $t2,$t0,$t0
438 xor $t3,$t1,$t1 ; Sigma0(a)
439
440 and $alo,$clo,$a2
441 and $ahi,$chi,$a3
442 xor $a2,$a0,$a0
443 add $hlo,$dlo,$dlo
444 xor $a3,$a1,$a1
445 addc $hhi,$dhi,$dhi ; d += h
446 and $blo,$clo,$a2
447 add $t0,$hlo,$hlo
448 and $bhi,$chi,$a3
449 addc $t1,$hhi,$hhi ; h += Sigma0(a)
450 xor $a2,$a0,$a0
451 add $a0,$hlo,$hlo
452 xor $a3,$a1,$a1 ; Maj(a,b,c)
453 addc $a1,$hhi,$hhi ; h += Maj(a,b,c)
454
455___
456$code.=<<___ if ($i==15 && $flag);
457 extru $Xlo,31,10,$Xlo
458 comiclr,= $LAST10BITS,$Xlo,%r0
459 b L\$rounds_pa1
460 nop
461___
462push(@X,shift(@X)); push(@X,shift(@X));
463}
464
465sub ROUND_16_xx_pa1 {
466my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
467my ($i)=shift;
468$i-=16;
469$code.=<<___;
470 ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
471 ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
472 ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1
473 ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load X[i+9]
474 ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3
475 ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14]
476 shd $Xnhi,$Xnlo,$sigma0[0],$t0
477 shd $Xnlo,$Xnhi,$sigma0[0],$t1
478 add $a0,$Xlo,$Xlo
479 shd $Xnhi,$Xnlo,$sigma0[1],$t2
480 addc $a1,$Xhi,$Xhi
481 shd $Xnlo,$Xnhi,$sigma0[1],$t3
482 xor $t2,$t0,$t0
483 shd $Xnhi,$Xnlo,$sigma0[2],$t2
484 xor $t3,$t1,$t1
485 extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
486 xor $t2,$t0,$t0
487 shd $a3,$a2,$sigma1[0],$a0
488	 xor	$t3,$t1,$t1		; sigma0(X[(i+1)&0x0f])
489 shd $a2,$a3,$sigma1[0],$a1
490 add $t0,$Xlo,$Xlo
491 shd $a3,$a2,$sigma1[1],$t2
492 addc $t1,$Xhi,$Xhi
493 shd $a2,$a3,$sigma1[1],$t3
494 xor $t2,$a0,$a0
495 shd $a3,$a2,$sigma1[2],$t2
496 xor $t3,$a1,$a1
497 extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
498 xor $t2,$a0,$a0
499	 xor	$t3,$a1,$a1		; sigma1(X[(i+14)&0x0f])
500 add $a0,$Xlo,$Xlo
501 addc $a1,$Xhi,$Xhi
502
503 stw $Xhi,`-$XOFF+8*($i%16)`(%sp)
504 stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp)
505___
506&ROUND_00_15_pa1($i,@_,1);
507}
508$code.=<<___;
509 ldw `0*4`($ctx),$Ahi ; load context
510 ldw `1*4`($ctx),$Alo
511 ldw `2*4`($ctx),$Bhi
512 ldw `3*4`($ctx),$Blo
513 ldw `4*4`($ctx),$Chi
514 ldw `5*4`($ctx),$Clo
515 ldw `6*4`($ctx),$Dhi
516 ldw `7*4`($ctx),$Dlo
517 ldw `8*4`($ctx),$Ehi
518 ldw `9*4`($ctx),$Elo
519 ldw `10*4`($ctx),$Fhi
520 ldw `11*4`($ctx),$Flo
521 ldw `12*4`($ctx),$Ghi
522 ldw `13*4`($ctx),$Glo
523 ldw `14*4`($ctx),$Hhi
524 ldw `15*4`($ctx),$Hlo
525
526 extru $inp,31,2,$t0
527 sh3addl $t0,%r0,$t0
528 subi 32,$t0,$t0
529 mtctl $t0,%cr11 ; load %sar with align factor
530
531L\$oop_pa1
532 extru $inp,31,2,$a3
533 comib,= 0,$a3,L\$aligned_pa1
534 sub $inp,$a3,$inp
535
536 ldw `0*4`($inp),$X[0]
537 ldw `1*4`($inp),$X[1]
538 ldw `2*4`($inp),$t2
539 ldw `3*4`($inp),$t3
540 ldw `4*4`($inp),$a0
541 ldw `5*4`($inp),$a1
542 ldw `6*4`($inp),$a2
543 ldw `7*4`($inp),$a3
544 vshd $X[0],$X[1],$X[0]
545 vshd $X[1],$t2,$X[1]
546 stw $X[0],`-$XOFF+0*4`(%sp)
547 ldw `8*4`($inp),$t0
548 vshd $t2,$t3,$t2
549 stw $X[1],`-$XOFF+1*4`(%sp)
550 ldw `9*4`($inp),$t1
551 vshd $t3,$a0,$t3
552___
553{
554my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
555for ($i=2;$i<=(128/4-8);$i++) {
556$code.=<<___;
557 stw $t[0],`-$XOFF+$i*4`(%sp)
558 ldw `(8+$i)*4`($inp),$t[0]
559 vshd $t[1],$t[2],$t[1]
560___
561push(@t,shift(@t));
562}
563for (;$i<(128/4-1);$i++) {
564$code.=<<___;
565 stw $t[0],`-$XOFF+$i*4`(%sp)
566 vshd $t[1],$t[2],$t[1]
567___
568push(@t,shift(@t));
569}
570$code.=<<___;
571 b L\$collected_pa1
572 stw $t[0],`-$XOFF+$i*4`(%sp)
573
574___
575}
576$code.=<<___;
577L\$aligned_pa1
578 ldw `0*4`($inp),$X[0]
579 ldw `1*4`($inp),$X[1]
580 ldw `2*4`($inp),$t2
581 ldw `3*4`($inp),$t3
582 ldw `4*4`($inp),$a0
583 ldw `5*4`($inp),$a1
584 ldw `6*4`($inp),$a2
585 ldw `7*4`($inp),$a3
586 stw $X[0],`-$XOFF+0*4`(%sp)
587 ldw `8*4`($inp),$t0
588 stw $X[1],`-$XOFF+1*4`(%sp)
589 ldw `9*4`($inp),$t1
590___
591{
592my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
593for ($i=2;$i<(128/4-8);$i++) {
594$code.=<<___;
595 stw $t[0],`-$XOFF+$i*4`(%sp)
596 ldw `(8+$i)*4`($inp),$t[0]
597___
598push(@t,shift(@t));
599}
600for (;$i<128/4;$i++) {
601$code.=<<___;
602 stw $t[0],`-$XOFF+$i*4`(%sp)
603___
604push(@t,shift(@t));
605}
606$code.="L\$collected_pa1\n";
607}
608
609for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
610$code.="L\$rounds_pa1\n";
611for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
612
613$code.=<<___;
614 $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
615 $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
616 $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
617 ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl
618
619 ldw `0*4`($ctx),$t1 ; update context
620 ldw `1*4`($ctx),$t0
621 ldw `2*4`($ctx),$t3
622 ldw `3*4`($ctx),$t2
623 ldw `4*4`($ctx),$a1
624 ldw `5*4`($ctx),$a0
625 ldw `6*4`($ctx),$a3
626 add $t0,$Alo,$Alo
627 ldw `7*4`($ctx),$a2
628 addc $t1,$Ahi,$Ahi
629 ldw `8*4`($ctx),$t1
630 add $t2,$Blo,$Blo
631 ldw `9*4`($ctx),$t0
632 addc $t3,$Bhi,$Bhi
633 ldw `10*4`($ctx),$t3
634 add $a0,$Clo,$Clo
635 ldw `11*4`($ctx),$t2
636 addc $a1,$Chi,$Chi
637 ldw `12*4`($ctx),$a1
638 add $a2,$Dlo,$Dlo
639 ldw `13*4`($ctx),$a0
640 addc $a3,$Dhi,$Dhi
641 ldw `14*4`($ctx),$a3
642 add $t0,$Elo,$Elo
643 ldw `15*4`($ctx),$a2
644 addc $t1,$Ehi,$Ehi
645 stw $Ahi,`0*4`($ctx)
646 add $t2,$Flo,$Flo
647 stw $Alo,`1*4`($ctx)
648 addc $t3,$Fhi,$Fhi
649 stw $Bhi,`2*4`($ctx)
650 add $a0,$Glo,$Glo
651 stw $Blo,`3*4`($ctx)
652 addc $a1,$Ghi,$Ghi
653 stw $Chi,`4*4`($ctx)
654 add $a2,$Hlo,$Hlo
655 stw $Clo,`5*4`($ctx)
656 addc $a3,$Hhi,$Hhi
657 stw $Dhi,`6*4`($ctx)
658 ldo `16*$SZ`($inp),$inp ; advance $inp
659 stw $Dlo,`7*4`($ctx)
660 stw $Ehi,`8*4`($ctx)
661 stw $Elo,`9*4`($ctx)
662 stw $Fhi,`10*4`($ctx)
663 stw $Flo,`11*4`($ctx)
664 stw $Ghi,`12*4`($ctx)
665 stw $Glo,`13*4`($ctx)
666 stw $Hhi,`14*4`($ctx)
667 comb,= $inp,$num,L\$done
668 stw $Hlo,`15*4`($ctx)
669 b L\$oop_pa1
670 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
671L\$done
672___
673}}
674$code.=<<___;
675 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
676 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
677 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
678 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
679 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
680 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
681 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
682 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
683 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
684 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
685 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
686 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
687 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
688 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
689 $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
690 $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
691 bv (%r2)
692 .EXIT
693 $POPMB -$FRAME(%sp),%r3
694 .PROCEND
695
696 .data
697 .STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
698___
699
700# Explicitly encode PA-RISC 2.0 instructions used in this module, so
701# that it can be compiled with .LEVEL 1.0. It should be noted that I
702# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
703# directive...
704
705my $ldd = sub {
706 my ($mod,$args) = @_;
707 my $orig = "ldd$mod\t$args";
708
709 if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices
710 { my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1);
711 $opcode|=(1<<3) if ($mod =~ /^,m/);
712 $opcode|=(1<<2) if ($mod =~ /^,mb/);
713 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
714 }
715 else { "\t".$orig; }
716};
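# [Editorial sketch, not part of the original module; the operands are made
# up.] The closure above hand-encodes a PA-RISC 2.0 "ldd" as a raw .WORD so
# that the module still assembles under .LEVEL 1.0; for example
# "ldd -8(%r30),%r17" maps to
#	(0x14<<26)|(30<<21)|(17<<16)|((-8 & 0x1FF8)<<1)|((-8>>13)&1)
# and is printed as ".WORD 0x%08x ; ldd -8(%r30),%r17".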
717
718my $std = sub {
719 my ($mod,$args) = @_;
720 my $orig = "std$mod\t$args";
721
722 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
723 { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
724 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
725 }
726 else { "\t".$orig; }
727};
728
729my $extrd = sub {
730 my ($mod,$args) = @_;
731 my $orig = "extrd$mod\t$args";
732
733 # I only have ",u" completer, it's implicitly encoded...
734 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
735 { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
736 my $len=32-$3;
737 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
738 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
739 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
740 }
741 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
742 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
743 my $len=32-$2;
744 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
745 $opcode |= (1<<13) if ($mod =~ /,\**=/);
746 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
747 }
748 else { "\t".$orig; }
749};
750
751my $shrpd = sub {
752 my ($mod,$args) = @_;
753 my $orig = "shrpd$mod\t$args";
754
755 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
756 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
757 my $cpos=63-$3;
758 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
759 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
760 }
761 elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
762 { sprintf "\t.WORD\t0x%08x\t; %s",
763 (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
764 }
765 else { "\t".$orig; }
766};
767
768sub assemble {
769 my ($mnemonic,$mod,$args)=@_;
770 my $opcode = eval("\$$mnemonic");
771
772 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
773}
774
775foreach (split("\n",$code)) {
776 s/\`([^\`]*)\`/eval $1/ge;
777
778 s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
779 $3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32
780 : sprintf("shd\t%$1,%$2,%d",$3)/e or
781	# translate made-up instructions: _ror, _shr, _align, _shl
782 s/_ror(\s+)(%r[0-9]+),/
783 ($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or
784
785 s/_shr(\s+%r[0-9]+),([0-9]+),/
786 $SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
787 : sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or
788
789 s/_align(\s+%r[0-9]+,%r[0-9]+),/
790 ($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or
791
792 s/_shl(\s+%r[0-9]+),([0-9]+),/
793 $SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
794 : sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;
795
796 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);
797
798 s/cmpb,\*/comb,/ if ($SIZE_T==4);
799
800 s/\bbv\b/bve/ if ($SIZE_T==8);
801
802 print $_,"\n";
803}
804
805close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-ppc.pl b/src/lib/libcrypto/sha/asm/sha512-ppc.pl
deleted file mode 100755
index 6b44a68e59..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-ppc.pl
+++ /dev/null
@@ -1,460 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# I let hardware handle unaligned input, except on page boundaries
 11# (see below for details). Otherwise it is a straightforward implementation
 12# with the X vector in the register bank. The module is big-endian [which is
 13# not a big deal as there are no little-endian targets left around].
14
15# sha256 | sha512
16# -m64 -m32 | -m64 -m32
17# --------------------------------------+-----------------------
18# PPC970,gcc-4.0.0 +50% +38% | +40% +410%(*)
19# Power6,xlc-7 +150% +90% | +100% +430%(*)
20#
21# (*) 64-bit code in 32-bit application context, which actually is
22# on TODO list. It should be noted that for safe deployment in
 23#	32-bit *multi-threaded* context asynchronous signals should be
 24#	blocked upon entry to the SHA512 block routine. This is because
 25#	the 32-bit signaling procedure invalidates upper halves of GPRs.
 26#	The context switch procedure preserves them, but signaling does not:-(
27
 28# The second version is truly multi-thread safe. The trouble with the
 29# original version was that it used the thread-local storage pointer register.
 30# Well, it scrupulously preserved it, but the problem would arise the
 31# moment an asynchronous signal was delivered and the signal handler
 32# dereferenced the TLS pointer. While that is never the case in the openssl
 33# application or test suite, we have to respect this scenario and not
 34# use the TLS pointer register. The alternative would be to require the
 35# caller to block signals prior to calling this routine. For the record, in
 36# 32-bit context R2 serves as the TLS pointer, while in 64-bit context R13 does.
37
38$flavour=shift;
39$output =shift;
40
41if ($flavour =~ /64/) {
42 $SIZE_T=8;
43 $LRSAVE=2*$SIZE_T;
44 $STU="stdu";
45 $UCMP="cmpld";
46 $SHL="sldi";
47 $POP="ld";
48 $PUSH="std";
49} elsif ($flavour =~ /32/) {
50 $SIZE_T=4;
51 $LRSAVE=$SIZE_T;
52 $STU="stwu";
53 $UCMP="cmplw";
54 $SHL="slwi";
55 $POP="lwz";
56 $PUSH="stw";
57} else { die "nonsense $flavour"; }
58
59$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
60( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
61( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
62die "can't locate ppc-xlate.pl";
63
64open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
65
66if ($output =~ /512/) {
67 $func="sha512_block_data_order";
68 $SZ=8;
69 @Sigma0=(28,34,39);
70 @Sigma1=(14,18,41);
71 @sigma0=(1, 8, 7);
72 @sigma1=(19,61, 6);
73 $rounds=80;
74 $LD="ld";
75 $ST="std";
76 $ROR="rotrdi";
77 $SHR="srdi";
78} else {
79 $func="sha256_block_data_order";
80 $SZ=4;
81 @Sigma0=( 2,13,22);
82 @Sigma1=( 6,11,25);
83 @sigma0=( 7,18, 3);
84 @sigma1=(17,19,10);
85 $rounds=64;
86 $LD="lwz";
87 $ST="stw";
88 $ROR="rotrwi";
89 $SHR="srwi";
90}
91
92$FRAME=32*$SIZE_T+16*$SZ;
93$LOCALS=6*$SIZE_T;
94
95$sp ="r1";
96$toc="r2";
97$ctx="r3"; # zapped by $a0
98$inp="r4"; # zapped by $a1
99$num="r5"; # zapped by $t0
100
101$T ="r0";
102$a0 ="r3";
103$a1 ="r4";
104$t0 ="r5";
105$t1 ="r6";
106$Tbl="r7";
107
108$A ="r8";
109$B ="r9";
110$C ="r10";
111$D ="r11";
112$E ="r12";
113$F ="r13"; $F="r2" if ($SIZE_T==8);# reassigned to exempt TLS pointer
114$G ="r14";
115$H ="r15";
116
117@V=($A,$B,$C,$D,$E,$F,$G,$H);
118@X=("r16","r17","r18","r19","r20","r21","r22","r23",
119 "r24","r25","r26","r27","r28","r29","r30","r31");
120
121$inp="r31"; # reassigned $inp! aliases with @X[15]
122
123sub ROUND_00_15 {
124my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
125$code.=<<___;
126 $LD $T,`$i*$SZ`($Tbl)
127 $ROR $a0,$e,$Sigma1[0]
128 $ROR $a1,$e,$Sigma1[1]
129 and $t0,$f,$e
130 andc $t1,$g,$e
131 add $T,$T,$h
132 xor $a0,$a0,$a1
133 $ROR $a1,$a1,`$Sigma1[2]-$Sigma1[1]`
134 or $t0,$t0,$t1 ; Ch(e,f,g)
135 add $T,$T,@X[$i]
136 xor $a0,$a0,$a1 ; Sigma1(e)
137 add $T,$T,$t0
138 add $T,$T,$a0
139
140 $ROR $a0,$a,$Sigma0[0]
141 $ROR $a1,$a,$Sigma0[1]
142 and $t0,$a,$b
143 and $t1,$a,$c
144 xor $a0,$a0,$a1
145 $ROR $a1,$a1,`$Sigma0[2]-$Sigma0[1]`
146 xor $t0,$t0,$t1
147 and $t1,$b,$c
148 xor $a0,$a0,$a1 ; Sigma0(a)
149 add $d,$d,$T
150 xor $t0,$t0,$t1 ; Maj(a,b,c)
151 add $h,$T,$a0
152 add $h,$h,$t0
153
154___
155}
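# [Editorial aside, not part of the original module.] The and/andc/or
# sequence above computes Ch(e,f,g) as (e&f)|(~e&g); since the two terms
# can never both have the same bit set, this equals the canonical
# (e&f)^(~e&g), so "or" and "xor" are interchangeable here.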
156
157sub ROUND_16_xx {
158my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
159$i-=16;
160$code.=<<___;
161 $ROR $a0,@X[($i+1)%16],$sigma0[0]
162 $ROR $a1,@X[($i+1)%16],$sigma0[1]
163 $ROR $t0,@X[($i+14)%16],$sigma1[0]
164 $ROR $t1,@X[($i+14)%16],$sigma1[1]
165 xor $a0,$a0,$a1
166 $SHR $a1,@X[($i+1)%16],$sigma0[2]
167 xor $t0,$t0,$t1
168 $SHR $t1,@X[($i+14)%16],$sigma1[2]
169 add @X[$i],@X[$i],@X[($i+9)%16]
170 xor $a0,$a0,$a1 ; sigma0(X[(i+1)&0x0f])
171 xor $t0,$t0,$t1 ; sigma1(X[(i+14)&0x0f])
172 add @X[$i],@X[$i],$a0
173 add @X[$i],@X[$i],$t0
174___
175&ROUND_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h);
176}
177
178$code=<<___;
179.machine "any"
180.text
181
182.globl $func
183.align 6
184$func:
185 $STU $sp,-$FRAME($sp)
186 mflr r0
187 $SHL $num,$num,`log(16*$SZ)/log(2)`
188
189 $PUSH $ctx,`$FRAME-$SIZE_T*22`($sp)
190
191 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
192 $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
193 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
194 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
195 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
196 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
197 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
198 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
199 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
200 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
201 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
202 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
203 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
204 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
205 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
206 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
207 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
208 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
209 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
210 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
211 $PUSH r0,`$FRAME+$LRSAVE`($sp)
212
213 $LD $A,`0*$SZ`($ctx)
214 mr $inp,r4 ; incarnate $inp
215 $LD $B,`1*$SZ`($ctx)
216 $LD $C,`2*$SZ`($ctx)
217 $LD $D,`3*$SZ`($ctx)
218 $LD $E,`4*$SZ`($ctx)
219 $LD $F,`5*$SZ`($ctx)
220 $LD $G,`6*$SZ`($ctx)
221 $LD $H,`7*$SZ`($ctx)
222
223 bl LPICmeup
224LPICedup:
225 andi. r0,$inp,3
226 bne Lunaligned
227Laligned:
228 add $num,$inp,$num
229 $PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
230 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
231 bl Lsha2_block_private
232 b Ldone
233
234; The PowerPC specification allows an implementation to be ill-behaved
235; upon an unaligned access which crosses a page boundary. The "better
236; safe than sorry" principle makes me treat it specially. I don't
237; look for the particular offending word, but rather for the input
238; block which crosses the boundary. Once found, that block is aligned
239; and hashed separately...
240.align 4
241Lunaligned:
242 subfic $t1,$inp,4096
243 andi. $t1,$t1,`4096-16*$SZ` ; distance to closest page boundary
244 beq Lcross_page
245 $UCMP $num,$t1
246 ble- Laligned ; didn't cross the page boundary
247 subfc $num,$t1,$num
248 add $t1,$inp,$t1
249 $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real remaining num
250 $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; intermediate end pointer
251 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
252 bl Lsha2_block_private
253	 ; $inp equals the intermediate end pointer here
254 $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real remaining num
255Lcross_page:
256 li $t1,`16*$SZ/4`
257 mtctr $t1
258 addi r20,$sp,$LOCALS ; aligned spot below the frame
259Lmemcpy:
260 lbz r16,0($inp)
261 lbz r17,1($inp)
262 lbz r18,2($inp)
263 lbz r19,3($inp)
264 addi $inp,$inp,4
265 stb r16,0(r20)
266 stb r17,1(r20)
267 stb r18,2(r20)
268 stb r19,3(r20)
269 addi r20,r20,4
270 bdnz Lmemcpy
271
272 $PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp
273 addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer
274 addi $inp,$sp,$LOCALS ; fictitious inp pointer
275 $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num
276 $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer
277 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
278 bl Lsha2_block_private
279 $POP $inp,`$FRAME-$SIZE_T*26`($sp) ; restore real inp
280 $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num
281 addic. $num,$num,`-16*$SZ` ; num--
282 bne- Lunaligned
283
284Ldone:
285 $POP r0,`$FRAME+$LRSAVE`($sp)
286 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
287 $POP r13,`$FRAME-$SIZE_T*19`($sp)
288 $POP r14,`$FRAME-$SIZE_T*18`($sp)
289 $POP r15,`$FRAME-$SIZE_T*17`($sp)
290 $POP r16,`$FRAME-$SIZE_T*16`($sp)
291 $POP r17,`$FRAME-$SIZE_T*15`($sp)
292 $POP r18,`$FRAME-$SIZE_T*14`($sp)
293 $POP r19,`$FRAME-$SIZE_T*13`($sp)
294 $POP r20,`$FRAME-$SIZE_T*12`($sp)
295 $POP r21,`$FRAME-$SIZE_T*11`($sp)
296 $POP r22,`$FRAME-$SIZE_T*10`($sp)
297 $POP r23,`$FRAME-$SIZE_T*9`($sp)
298 $POP r24,`$FRAME-$SIZE_T*8`($sp)
299 $POP r25,`$FRAME-$SIZE_T*7`($sp)
300 $POP r26,`$FRAME-$SIZE_T*6`($sp)
301 $POP r27,`$FRAME-$SIZE_T*5`($sp)
302 $POP r28,`$FRAME-$SIZE_T*4`($sp)
303 $POP r29,`$FRAME-$SIZE_T*3`($sp)
304 $POP r30,`$FRAME-$SIZE_T*2`($sp)
305 $POP r31,`$FRAME-$SIZE_T*1`($sp)
306 mtlr r0
307 addi $sp,$sp,$FRAME
308 blr
309 .long 0
310 .byte 0,12,4,1,0x80,18,3,0
311 .long 0
312
313.align 4
314Lsha2_block_private:
315___
316for($i=0;$i<16;$i++) {
317$code.=<<___ if ($SZ==4);
318 lwz @X[$i],`$i*$SZ`($inp)
319___
320# 64-bit loads are split into 2x32-bit ones, as the CPU can't handle
321# unaligned 64-bit loads, only 32-bit ones...
322$code.=<<___ if ($SZ==8);
323 lwz $t0,`$i*$SZ`($inp)
324 lwz @X[$i],`$i*$SZ+4`($inp)
325 insrdi @X[$i],$t0,32,0
326___
327 &ROUND_00_15($i,@V);
328 unshift(@V,pop(@V));
329}
330$code.=<<___;
331 li $T,`$rounds/16-1`
332 mtctr $T
333.align 4
334Lrounds:
335 addi $Tbl,$Tbl,`16*$SZ`
336___
337for(;$i<32;$i++) {
338 &ROUND_16_xx($i,@V);
339 unshift(@V,pop(@V));
340}
341$code.=<<___;
342 bdnz- Lrounds
343
344 $POP $ctx,`$FRAME-$SIZE_T*22`($sp)
345 $POP $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
346 $POP $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
347 subi $Tbl,$Tbl,`($rounds-16)*$SZ` ; rewind Tbl
348
349 $LD r16,`0*$SZ`($ctx)
350 $LD r17,`1*$SZ`($ctx)
351 $LD r18,`2*$SZ`($ctx)
352 $LD r19,`3*$SZ`($ctx)
353 $LD r20,`4*$SZ`($ctx)
354 $LD r21,`5*$SZ`($ctx)
355 $LD r22,`6*$SZ`($ctx)
356 addi $inp,$inp,`16*$SZ` ; advance inp
357 $LD r23,`7*$SZ`($ctx)
358 add $A,$A,r16
359 add $B,$B,r17
360 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp)
361 add $C,$C,r18
362 $ST $A,`0*$SZ`($ctx)
363 add $D,$D,r19
364 $ST $B,`1*$SZ`($ctx)
365 add $E,$E,r20
366 $ST $C,`2*$SZ`($ctx)
367 add $F,$F,r21
368 $ST $D,`3*$SZ`($ctx)
369 add $G,$G,r22
370 $ST $E,`4*$SZ`($ctx)
371 add $H,$H,r23
372 $ST $F,`5*$SZ`($ctx)
373 $ST $G,`6*$SZ`($ctx)
374 $UCMP $inp,$num
375 $ST $H,`7*$SZ`($ctx)
376 bne Lsha2_block_private
377 blr
378 .long 0
379 .byte 0,12,0x14,0,0,0,0,0
380___
381
382# Ugly hack here, because PPC assembler syntax seems to vary too
383# much from platform to platform...
384$code.=<<___;
385.align 6
386LPICmeup:
387 mflr r0
388 bcl 20,31,\$+4
389 mflr $Tbl ; vvvvvv "distance" between . and 1st data entry
390 addi $Tbl,$Tbl,`64-8`
391 mtlr r0
392 blr
393 .long 0
394 .byte 0,12,0x14,0,0,0,0,0
395 .space `64-9*4`
396___
397$code.=<<___ if ($SZ==8);
398 .long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
399 .long 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
400 .long 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
401 .long 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
402 .long 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
403 .long 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
404 .long 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
405 .long 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
406 .long 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
407 .long 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
408 .long 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
409 .long 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
410 .long 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
411 .long 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
412 .long 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
413 .long 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
414 .long 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
415 .long 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
416 .long 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
417 .long 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
418 .long 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
419 .long 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
420 .long 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
421 .long 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
422 .long 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
423 .long 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
424 .long 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
425 .long 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
426 .long 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
427 .long 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
428 .long 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
429 .long 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
430 .long 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
431 .long 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
432 .long 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
433 .long 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
434 .long 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
435 .long 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
436 .long 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
437 .long 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
438___
439$code.=<<___ if ($SZ==4);
440 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
441 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
442 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
443 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
444 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
445 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
446 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
447 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
448 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
449 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
450 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
451 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
452 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
453 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
454 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
455 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
456___
457
458$code =~ s/\`([^\`]*)\`/eval $1/gem;
459print $code;
460close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-s390x.pl b/src/lib/libcrypto/sha/asm/sha512-s390x.pl
deleted file mode 100644
index 079a3fc78a..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-s390x.pl
+++ /dev/null
@@ -1,322 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256/512 block procedures for s390x.
11
12# April 2007.
13#
14# sha256_block_data_order is reportedly >3 times faster than gcc 3.3
15# generated code (must be a bug in compiler, as improvement is
16# "pathologically" high, in particular in comparison to other SHA
17# modules). But the real twist is that it detects if hardware support
18# for SHA256 is available and in such case utilizes it. Then the
19# performance can reach >6.5x of assembler one for larger chunks.
20#
21# sha512_block_data_order is ~70% faster than gcc 3.3 generated code.
22
23# January 2009.
24#
25# Add support for hardware SHA512 and reschedule instructions to
26# favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
27# than software.
28
29# November 2010.
30#
31# Adapt for -m31 build. If kernel supports what's called "highgprs"
32# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
33# instructions and achieve "64-bit" performance even in 31-bit legacy
34# application context. The feature is not specific to any particular
35# processor, as long as it's "z-CPU". Latter implies that the code
36# remains z/Architecture specific. On z900 SHA256 was measured to
37# perform 2.4x and SHA512 - 13x better than code generated by gcc 4.3.
38
39$flavour = shift;
40
41if ($flavour =~ /3[12]/) {
42 $SIZE_T=4;
43 $g="";
44} else {
45 $SIZE_T=8;
46 $g="g";
47}
48
49$t0="%r0";
50$t1="%r1";
51$ctx="%r2"; $t2="%r2";
52$inp="%r3";
53$len="%r4"; # used as index in inner loop
54
55$A="%r5";
56$B="%r6";
57$C="%r7";
58$D="%r8";
59$E="%r9";
60$F="%r10";
61$G="%r11";
62$H="%r12"; @V=($A,$B,$C,$D,$E,$F,$G,$H);
63$tbl="%r13";
64$T1="%r14";
65$sp="%r15";
66
67while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
68open STDOUT,">$output";
69
70if ($output =~ /512/) {
71 $label="512";
72 $SZ=8;
73 $LD="lg"; # load from memory
74 $ST="stg"; # store to memory
75 $ADD="alg"; # add with memory operand
76 $ROT="rllg"; # rotate left
77 $SHR="srlg"; # logical right shift [see even at the end]
78 @Sigma0=(25,30,36);
79 @Sigma1=(23,46,50);
80 @sigma0=(56,63, 7);
81 @sigma1=( 3,45, 6);
82 $rounds=80;
83 $kimdfunc=3; # 0 means unknown/unsupported/unimplemented/disabled
84} else {
85 $label="256";
86 $SZ=4;
87 $LD="llgf"; # load from memory
88 $ST="st"; # store to memory
89 $ADD="al"; # add with memory operand
90 $ROT="rll"; # rotate left
91 $SHR="srl"; # logical right shift
92 @Sigma0=(10,19,30);
93 @Sigma1=( 7,21,26);
94 @sigma0=(14,25, 3);
95 @sigma1=(13,15,10);
96 $rounds=64;
97 $kimdfunc=2; # magic function code for kimd instruction
98}
99$Func="sha${label}_block_data_order";
100$Table="K${label}";
101$stdframe=16*$SIZE_T+4*8;
102$frame=$stdframe+16*$SZ;
103
104sub BODY_00_15 {
105my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
106
107$code.=<<___ if ($i<16);
108 $LD $T1,`$i*$SZ`($inp) ### $i
109___
110$code.=<<___;
111 $ROT $t0,$e,$Sigma1[0]
112 $ROT $t1,$e,$Sigma1[1]
113 lgr $t2,$f
114 xgr $t0,$t1
115 $ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]`
116 xgr $t2,$g
117 $ST $T1,`$stdframe+$SZ*($i%16)`($sp)
118 xgr $t0,$t1 # Sigma1(e)
119 algr $T1,$h # T1+=h
120 ngr $t2,$e
121 lgr $t1,$a
122 algr $T1,$t0 # T1+=Sigma1(e)
123 $ROT $h,$a,$Sigma0[0]
124 xgr $t2,$g # Ch(e,f,g)
125 $ADD $T1,`$i*$SZ`($len,$tbl) # T1+=K[i]
126 $ROT $t0,$a,$Sigma0[1]
127 algr $T1,$t2 # T1+=Ch(e,f,g)
128 ogr $t1,$b
129 xgr $h,$t0
130 lgr $t2,$a
131 ngr $t1,$c
132 $ROT $t0,$t0,`$Sigma0[2]-$Sigma0[1]`
133 xgr $h,$t0 # h=Sigma0(a)
134 ngr $t2,$b
135 algr $h,$T1 # h+=T1
136 ogr $t2,$t1 # Maj(a,b,c)
137 algr $d,$T1 # d+=T1
138 algr $h,$t2 # h+=Maj(a,b,c)
139___
140}
141
142sub BODY_16_XX {
143my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
144
145$code.=<<___;
146 $LD $T1,`$stdframe+$SZ*(($i+1)%16)`($sp) ### $i
147 $LD $t1,`$stdframe+$SZ*(($i+14)%16)`($sp)
148 $ROT $t0,$T1,$sigma0[0]
149 $SHR $T1,$sigma0[2]
150 $ROT $t2,$t0,`$sigma0[1]-$sigma0[0]`
151 xgr $T1,$t0
152 $ROT $t0,$t1,$sigma1[0]
153 xgr $T1,$t2 # sigma0(X[i+1])
154 $SHR $t1,$sigma1[2]
155 $ADD $T1,`$stdframe+$SZ*($i%16)`($sp) # +=X[i]
156 xgr $t1,$t0
157 $ROT $t0,$t0,`$sigma1[1]-$sigma1[0]`
158 $ADD $T1,`$stdframe+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
159 xgr $t1,$t0 # sigma1(X[i+14])
160 algr $T1,$t1 # +=sigma1(X[i+14])
161___
162 &BODY_00_15(@_);
163}
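For reference, a minimal C sketch (not part of the module) of the computation that BODY_00_15 and BODY_16_XX above emit as s390x instructions, shown for the SHA-256 case; the rotation and shift counts are the FIPS 180-2 values (the script expresses the rotations as left-rotate amounts, 32 minus these, which is equivalent):

#include <stdint.h>

#define ROTR32(x, n)	(((x) >> (n)) | ((x) << (32 - (n))))

/* One SHA-256 round: the T1/Sigma/Ch/Maj computation annotated above. */
static void
sha256_round(uint32_t S[8], uint32_t Ki, uint32_t Xi)
{
	uint32_t a = S[0], b = S[1], c = S[2], d = S[3];
	uint32_t e = S[4], f = S[5], g = S[6], h = S[7];
	uint32_t Sigma1 = ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25);
	uint32_t Ch = (e & f) ^ (~e & g);
	uint32_t Sigma0 = ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22);
	uint32_t Maj = (a & b) ^ (a & c) ^ (b & c);
	uint32_t T1 = h + Sigma1 + Ch + Ki + Xi;
	uint32_t T2 = Sigma0 + Maj;

	S[7] = g; S[6] = f; S[5] = e; S[4] = d + T1;
	S[3] = c; S[2] = b; S[1] = a; S[0] = T1 + T2;
}

/* One message-schedule step, as in BODY_16_XX: W[i] for i >= 16,
 * updated in place in a rolling 16-word window. */
static uint32_t
sha256_schedule(uint32_t W[16], int i)
{
	uint32_t x1 = W[(i + 1) & 15], x14 = W[(i + 14) & 15];
	uint32_t s0 = ROTR32(x1, 7) ^ ROTR32(x1, 18) ^ (x1 >> 3);
	uint32_t s1 = ROTR32(x14, 17) ^ ROTR32(x14, 19) ^ (x14 >> 10);

	return W[i & 15] += s0 + W[(i + 9) & 15] + s1;
}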
164
165$code.=<<___;
166.text
167.align 64
168.type $Table,\@object
169$Table:
170___
171$code.=<<___ if ($SZ==4);
172 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
173 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
174 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
175 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
176 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
177 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
178 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
179 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
180 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
181 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
182 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
183 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
184 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
185 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
186 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
187 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
188___
189$code.=<<___ if ($SZ==8);
190 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
191 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
192 .quad 0x3956c25bf348b538,0x59f111f1b605d019
193 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
194 .quad 0xd807aa98a3030242,0x12835b0145706fbe
195 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
196 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
197 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
198 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
199 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
200 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
201 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
202 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
203 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
204 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
205 .quad 0x06ca6351e003826f,0x142929670a0e6e70
206 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
207 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
208 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
209 .quad 0x81c2c92e47edaee6,0x92722c851482353b
210 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
211 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
212 .quad 0xd192e819d6ef5218,0xd69906245565a910
213 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
214 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
215 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
216 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
217 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
218 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
219 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
220 .quad 0x90befffa23631e28,0xa4506cebde82bde9
221 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
222 .quad 0xca273eceea26619c,0xd186b8c721c0c207
223 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
224 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
225 .quad 0x113f9804bef90dae,0x1b710b35131c471b
226 .quad 0x28db77f523047d84,0x32caab7b40c72493
227 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
228 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
229 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
230___
231$code.=<<___;
232.size $Table,.-$Table
233.globl $Func
234.type $Func,\@function
235$Func:
236 sllg $len,$len,`log(16*$SZ)/log(2)`
237___
238$code.=<<___ if ($kimdfunc);
239 larl %r1,OPENSSL_s390xcap_P
240 lg %r0,0(%r1)
241 tmhl %r0,0x4000 # check for message-security assist
242 jz .Lsoftware
243 lghi %r0,0
244 la %r1,`2*$SIZE_T`($sp)
245 .long 0xb93e0002 # kimd %r0,%r2
246 lg %r0,`2*$SIZE_T`($sp)
247 tmhh %r0,`0x8000>>$kimdfunc`
248 jz .Lsoftware
249 lghi %r0,$kimdfunc
250 lgr %r1,$ctx
251 lgr %r2,$inp
252 lgr %r3,$len
253 .long 0xb93e0002 # kimd %r0,%r2
254 brc 1,.-4 # pay attention to "partial completion"
255 br %r14
256.align 16
257.Lsoftware:
258___
259$code.=<<___;
260 lghi %r1,-$frame
261 la $len,0($len,$inp)
262 stm${g} $ctx,%r15,`2*$SIZE_T`($sp)
263 lgr %r0,$sp
264 la $sp,0(%r1,$sp)
265 st${g} %r0,0($sp)
266
267 larl $tbl,$Table
268 $LD $A,`0*$SZ`($ctx)
269 $LD $B,`1*$SZ`($ctx)
270 $LD $C,`2*$SZ`($ctx)
271 $LD $D,`3*$SZ`($ctx)
272 $LD $E,`4*$SZ`($ctx)
273 $LD $F,`5*$SZ`($ctx)
274 $LD $G,`6*$SZ`($ctx)
275 $LD $H,`7*$SZ`($ctx)
276
277.Lloop:
278 lghi $len,0
279___
280for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
281$code.=".Lrounds_16_xx:\n";
282for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
283$code.=<<___;
284 aghi $len,`16*$SZ`
285 lghi $t0,`($rounds-16)*$SZ`
286 clgr $len,$t0
287 jne .Lrounds_16_xx
288
289 l${g} $ctx,`$frame+2*$SIZE_T`($sp)
290 la $inp,`16*$SZ`($inp)
291 $ADD $A,`0*$SZ`($ctx)
292 $ADD $B,`1*$SZ`($ctx)
293 $ADD $C,`2*$SZ`($ctx)
294 $ADD $D,`3*$SZ`($ctx)
295 $ADD $E,`4*$SZ`($ctx)
296 $ADD $F,`5*$SZ`($ctx)
297 $ADD $G,`6*$SZ`($ctx)
298 $ADD $H,`7*$SZ`($ctx)
299 $ST $A,`0*$SZ`($ctx)
300 $ST $B,`1*$SZ`($ctx)
301 $ST $C,`2*$SZ`($ctx)
302 $ST $D,`3*$SZ`($ctx)
303 $ST $E,`4*$SZ`($ctx)
304 $ST $F,`5*$SZ`($ctx)
305 $ST $G,`6*$SZ`($ctx)
306 $ST $H,`7*$SZ`($ctx)
307 cl${g} $inp,`$frame+4*$SIZE_T`($sp)
308 jne .Lloop
309
310 lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
311 br %r14
312.size $Func,.-$Func
313.string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
314.comm OPENSSL_s390xcap_P,16,8
315___
316
317$code =~ s/\`([^\`]*)\`/eval $1/gem;
318# unlike 32-bit shift 64-bit one takes three arguments
319$code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;
320
321print $code;
322close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
deleted file mode 100644
index 585740789e..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
+++ /dev/null
@@ -1,594 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256 performance improvement over compiler generated code varies
11# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
12# build]. Just like in SHA1 module I aim to ensure scalability on
13# UltraSPARC T1 by packing X[16] to 8 64-bit registers.
14
15# SHA512 on pre-T1 UltraSPARC.
16#
17# Performance is >75% better than 64-bit code generated by Sun C and
18# over 2x than 32-bit code. X[16] resides on stack, but access to it
19# is scheduled for L2 latency and staged through 32 least significant
20# bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
21# duality. Nevertheless it's ~40% faster than SHA256, which is pretty
22# good [optimal coefficient is 50%].
23#
24# SHA512 on UltraSPARC T1.
25#
26# It's not any faster than 64-bit code generated by Sun C 5.8. This is
27# because 64-bit code generator has the advantage of using 64-bit
28# loads(*) to access X[16], which I consciously traded for 32-/64-bit
29# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
30# code by 60%, not to mention that it doesn't suffer from severe decay
31# when running 4 threads per physical core and that it leaves gcc
32# [3.4] behind by over 4x factor! If compared to SHA256, single thread
33# performance is only 10% better, but overall throughput for maximum
34# amount of threads for given CPU exceeds corresponding one of SHA256
35# by 30% [again, optimal coefficient is 50%].
36#
37# (*)	Unlike pre-T1 UltraSPARC, loads on T1 are executed strictly
38#	in-order, i.e. a load instruction has to complete before the next
39#	instruction in the given thread is executed, even if the latter is
40# not dependent on load result! This means that on T1 two 32-bit
41# loads are always slower than one 64-bit load. Once again this
42# is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
43# 2x32-bit loads can be as fast as 1x64-bit ones.
44
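As a rough C sketch of the packing mentioned above (names illustrative, not part of the module): the SHA256 path keeps two 32-bit schedule words per 64-bit register, even indices in the high half, so fetching word i looks like:

#include <stdint.h>

static uint32_t
packed_X(const uint64_t Xp[8], int i)
{
	return (i & 1) ? (uint32_t)Xp[i / 2]		/* low 32 bits  */
	    : (uint32_t)(Xp[i / 2] >> 32);		/* srlx ..., 32 */
}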
45$bits=32;
46for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
47if ($bits==64) { $bias=2047; $frame=192; }
48else { $bias=0; $frame=112; }
49
50$output=shift;
51open STDOUT,">$output";
52
53if ($output =~ /512/) {
54 $label="512";
55 $SZ=8;
56 $LD="ldx"; # load from memory
57 $ST="stx"; # store to memory
58 $SLL="sllx"; # shift left logical
59 $SRL="srlx"; # shift right logical
60 @Sigma0=(28,34,39);
61 @Sigma1=(14,18,41);
62 @sigma0=( 7, 1, 8); # right shift first
63 @sigma1=( 6,19,61); # right shift first
64 $lastK=0x817;
65 $rounds=80;
66 $align=4;
67
68 $locals=16*$SZ; # X[16]
69
70 $A="%o0";
71 $B="%o1";
72 $C="%o2";
73 $D="%o3";
74 $E="%o4";
75 $F="%o5";
76 $G="%g1";
77 $H="%o7";
78 @V=($A,$B,$C,$D,$E,$F,$G,$H);
79} else {
80 $label="256";
81 $SZ=4;
82 $LD="ld"; # load from memory
83 $ST="st"; # store to memory
84 $SLL="sll"; # shift left logical
85 $SRL="srl"; # shift right logical
86 @Sigma0=( 2,13,22);
87 @Sigma1=( 6,11,25);
88 @sigma0=( 3, 7,18); # right shift first
89 @sigma1=(10,17,19); # right shift first
90 $lastK=0x8f2;
91 $rounds=64;
92 $align=8;
93
94 $locals=0; # X[16] is register resident
95 @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
96
97 $A="%l0";
98 $B="%l1";
99 $C="%l2";
100 $D="%l3";
101 $E="%l4";
102 $F="%l5";
103 $G="%l6";
104 $H="%l7";
105 @V=($A,$B,$C,$D,$E,$F,$G,$H);
106}
107$T1="%g2";
108$tmp0="%g3";
109$tmp1="%g4";
110$tmp2="%g5";
111
112$ctx="%i0";
113$inp="%i1";
114$len="%i2";
115$Ktbl="%i3";
116$tmp31="%i4";
117$tmp32="%i5";
118
119########### SHA256
120$Xload = sub {
121my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
122
123 if ($i==0) {
124$code.=<<___;
125 ldx [$inp+0],@X[0]
126 ldx [$inp+16],@X[2]
127 ldx [$inp+32],@X[4]
128 ldx [$inp+48],@X[6]
129 ldx [$inp+8],@X[1]
130 ldx [$inp+24],@X[3]
131 subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
132 ldx [$inp+40],@X[5]
133 bz,pt %icc,.Laligned
134 ldx [$inp+56],@X[7]
135
136 sllx @X[0],$tmp31,@X[0]
137 ldx [$inp+64],$T1
138___
139for($j=0;$j<7;$j++)
140{ $code.=<<___;
141 srlx @X[$j+1],$tmp32,$tmp1
142 sllx @X[$j+1],$tmp31,@X[$j+1]
143 or $tmp1,@X[$j],@X[$j]
144___
145}
146$code.=<<___;
147 srlx $T1,$tmp32,$T1
148 or $T1,@X[7],@X[7]
149.Laligned:
150___
151 }
152
153 if ($i&1) {
154 $code.="\tadd @X[$i/2],$h,$T1\n";
155 } else {
156 $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
157 }
158} if ($SZ==4);
159
160########### SHA512
161$Xload = sub {
162my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
163my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
164
165$code.=<<___ if ($i==0);
166 ld [$inp+0],%l0
167 ld [$inp+4],%l1
168 ld [$inp+8],%l2
169 ld [$inp+12],%l3
170 ld [$inp+16],%l4
171 ld [$inp+20],%l5
172 ld [$inp+24],%l6
173 ld [$inp+28],%l7
174___
175$code.=<<___ if ($i<15);
176 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
177 add $tmp31,32,$tmp0
178 sllx @pair[0],$tmp0,$tmp1
179 `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
180 srlx @pair[2],$tmp32,@pair[1]
181 or $tmp1,$tmp2,$tmp2
182 or @pair[1],$tmp2,$tmp2
183 `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
184 add $h,$tmp2,$T1
185 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
186___
187$code.=<<___ if ($i==12);
188 brnz,a $tmp31,.+8
189 ld [$inp+128],%l0
190___
191$code.=<<___ if ($i==15);
192 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
193 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
194 add $tmp31,32,$tmp0
195 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
196 sllx @pair[0],$tmp0,$tmp1
197 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
198 srlx @pair[2],$tmp32,@pair[1]
199 or $tmp1,$tmp2,$tmp2
200 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
201 or @pair[1],$tmp2,$tmp2
202 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
203 add $h,$tmp2,$T1
204 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
205 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
206 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
207 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
208___
209} if ($SZ==8);
210
211########### common
212sub BODY_00_15 {
213my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
214
215 if ($i<16) {
216 &$Xload(@_);
217 } else {
218 $code.="\tadd $h,$T1,$T1\n";
219 }
220
221$code.=<<___;
222 $SRL $e,@Sigma1[0],$h !! $i
223 xor $f,$g,$tmp2
224 $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
225 and $e,$tmp2,$tmp2
226 $SRL $e,@Sigma1[1],$tmp0
227 xor $tmp1,$h,$h
228 $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
229 xor $tmp0,$h,$h
230 $SRL $e,@Sigma1[2],$tmp0
231 xor $tmp1,$h,$h
232 $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
233 xor $tmp0,$h,$h
234 xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
235 xor $tmp1,$h,$tmp0 ! Sigma1(e)
236
237 $SRL $a,@Sigma0[0],$h
238 add $tmp2,$T1,$T1
239 $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
240 $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
241 add $tmp0,$T1,$T1
242 $SRL $a,@Sigma0[1],$tmp0
243 xor $tmp1,$h,$h
244 $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
245 xor $tmp0,$h,$h
246 $SRL $a,@Sigma0[2],$tmp0
247 xor $tmp1,$h,$h
248 $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
249 xor $tmp0,$h,$h
250 xor $tmp1,$h,$h ! Sigma0(a)
251
252 or $a,$b,$tmp0
253 and $a,$b,$tmp1
254 and $c,$tmp0,$tmp0
255 or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
256 add $tmp2,$T1,$T1 ! +=K[$i]
257 add $tmp1,$h,$h
258
259 add $T1,$d,$d
260 add $T1,$h,$h
261___
262}
263
264########### SHA256
265$BODY_16_XX = sub {
266my $i=@_[0];
267my $xi;
268
269 if ($i&1) {
270 $xi=$tmp32;
271 $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
272 } else {
273 $xi=@X[(($i+1)/2)%8];
274 }
275$code.=<<___;
276 srl $xi,@sigma0[0],$T1 !! Xupdate($i)
277 sll $xi,`32-@sigma0[2]`,$tmp1
278 srl $xi,@sigma0[1],$tmp0
279 xor $tmp1,$T1,$T1
280 sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
281 xor $tmp0,$T1,$T1
282 srl $xi,@sigma0[2],$tmp0
283 xor $tmp1,$T1,$T1
284___
285 if ($i&1) {
286 $xi=@X[(($i+14)/2)%8];
287 } else {
288 $xi=$tmp32;
289 $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
290 }
291$code.=<<___;
292 srl $xi,@sigma1[0],$tmp2
293 xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
294 sll $xi,`32-@sigma1[2]`,$tmp1
295 srl $xi,@sigma1[1],$tmp0
296 xor $tmp1,$tmp2,$tmp2
297 sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
298 xor $tmp0,$tmp2,$tmp2
299 srl $xi,@sigma1[2],$tmp0
300 xor $tmp1,$tmp2,$tmp2
301___
302 if ($i&1) {
303 $xi=@X[($i/2)%8];
304$code.=<<___;
305 srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
306 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
307 srl @X[($i/2)%8],0,$tmp0
308 add $tmp2,$tmp1,$tmp1
309 add $xi,$T1,$T1 ! +=X[i]
310 xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
311 add $tmp1,$T1,$T1
312
313 srl $T1,0,$T1
314 or $T1,@X[($i/2)%8],@X[($i/2)%8]
315___
316 } else {
317 $xi=@X[(($i+9)/2)%8];
318$code.=<<___;
319 srlx @X[($i/2)%8],32,$tmp1 ! X[i]
320 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
321 add $xi,$T1,$T1 ! +=X[i+9]
322 add $tmp2,$tmp1,$tmp1
323 srl @X[($i/2)%8],0,@X[($i/2)%8]
324 add $tmp1,$T1,$T1
325
326 sllx $T1,32,$tmp0
327 or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
328___
329 }
330 &BODY_00_15(@_);
331} if ($SZ==4);
332
333########### SHA512
334$BODY_16_XX = sub {
335my $i=@_[0];
336my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
337
338$code.=<<___;
339 sllx %l2,32,$tmp0 !! Xupdate($i)
340 or %l3,$tmp0,$tmp0
341
342 srlx $tmp0,@sigma0[0],$T1
343 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
344 sllx $tmp0,`64-@sigma0[2]`,$tmp1
345 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
346 srlx $tmp0,@sigma0[1],$tmp0
347 xor $tmp1,$T1,$T1
348 sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
349 xor $tmp0,$T1,$T1
350 srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
351 xor $tmp1,$T1,$T1
352 sllx %l6,32,$tmp2
353 xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
354 or %l7,$tmp2,$tmp2
355
356 srlx $tmp2,@sigma1[0],$tmp1
357 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
358 sllx $tmp2,`64-@sigma1[2]`,$tmp0
359 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
360 srlx $tmp2,@sigma1[1],$tmp2
361 xor $tmp0,$tmp1,$tmp1
362 sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
363 xor $tmp2,$tmp1,$tmp1
364 srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
365 xor $tmp0,$tmp1,$tmp1
366 sllx %l4,32,$tmp0
367 xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
368 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
369 or %l5,$tmp0,$tmp0
370 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
371
372 sllx %l0,32,$tmp2
373 add $tmp1,$T1,$T1
374 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
375 or %l1,$tmp2,$tmp2
376 add $tmp0,$T1,$T1 ! +=X[$i+9]
377 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
378 add $tmp2,$T1,$T1 ! +=X[$i]
379 $ST $T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
380___
381 &BODY_00_15(@_);
382} if ($SZ==8);
383
384$code.=<<___ if ($bits==64);
385.register %g2,#scratch
386.register %g3,#scratch
387___
388$code.=<<___;
389.section ".text",#alloc,#execinstr
390
391.align 64
392K${label}:
393.type K${label},#object
394___
395if ($SZ==4) {
396$code.=<<___;
397 .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
398 .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
399 .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
400 .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
401 .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
402 .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
403 .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
404 .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
405 .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
406 .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
407 .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
408 .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
409 .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
410 .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
411 .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
412 .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
413___
414} else {
415$code.=<<___;
416 .long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
417 .long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
418 .long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
419 .long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
420 .long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
421 .long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
422 .long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
423 .long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
424 .long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
425 .long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
426 .long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
427 .long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
428 .long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
429 .long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
430 .long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
431 .long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
432 .long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
433 .long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
434 .long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
435 .long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
436 .long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
437 .long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
438 .long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
439 .long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
440 .long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
441 .long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
442 .long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
443 .long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
444 .long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
445 .long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
446 .long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
447 .long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
448 .long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
449 .long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
450 .long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
451 .long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
452 .long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
453 .long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
454 .long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
455 .long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
456___
457}
458$code.=<<___;
459.size K${label},.-K${label}
460.globl sha${label}_block_data_order
461sha${label}_block_data_order:
462 save %sp,`-$frame-$locals`,%sp
463 and $inp,`$align-1`,$tmp31
464 sllx $len,`log(16*$SZ)/log(2)`,$len
465 andn $inp,`$align-1`,$inp
466 sll $tmp31,3,$tmp31
467 add $inp,$len,$len
468___
469$code.=<<___ if ($SZ==8); # SHA512
470 mov 32,$tmp32
471 sub $tmp32,$tmp31,$tmp32
472___
473$code.=<<___;
474.Lpic: call .+8
475 add %o7,K${label}-.Lpic,$Ktbl
476
477 $LD [$ctx+`0*$SZ`],$A
478 $LD [$ctx+`1*$SZ`],$B
479 $LD [$ctx+`2*$SZ`],$C
480 $LD [$ctx+`3*$SZ`],$D
481 $LD [$ctx+`4*$SZ`],$E
482 $LD [$ctx+`5*$SZ`],$F
483 $LD [$ctx+`6*$SZ`],$G
484 $LD [$ctx+`7*$SZ`],$H
485
486.Lloop:
487___
488for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
489$code.=".L16_xx:\n";
490for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
491$code.=<<___;
492 and $tmp2,0xfff,$tmp2
493 cmp $tmp2,$lastK
494 bne .L16_xx
495 add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
496
497___
498$code.=<<___ if ($SZ==4); # SHA256
499 $LD [$ctx+`0*$SZ`],@X[0]
500 $LD [$ctx+`1*$SZ`],@X[1]
501 $LD [$ctx+`2*$SZ`],@X[2]
502 $LD [$ctx+`3*$SZ`],@X[3]
503 $LD [$ctx+`4*$SZ`],@X[4]
504 $LD [$ctx+`5*$SZ`],@X[5]
505 $LD [$ctx+`6*$SZ`],@X[6]
506 $LD [$ctx+`7*$SZ`],@X[7]
507
508 add $A,@X[0],$A
509 $ST $A,[$ctx+`0*$SZ`]
510 add $B,@X[1],$B
511 $ST $B,[$ctx+`1*$SZ`]
512 add $C,@X[2],$C
513 $ST $C,[$ctx+`2*$SZ`]
514 add $D,@X[3],$D
515 $ST $D,[$ctx+`3*$SZ`]
516 add $E,@X[4],$E
517 $ST $E,[$ctx+`4*$SZ`]
518 add $F,@X[5],$F
519 $ST $F,[$ctx+`5*$SZ`]
520 add $G,@X[6],$G
521 $ST $G,[$ctx+`6*$SZ`]
522 add $H,@X[7],$H
523 $ST $H,[$ctx+`7*$SZ`]
524___
525$code.=<<___ if ($SZ==8); # SHA512
526 ld [$ctx+`0*$SZ+0`],%l0
527 ld [$ctx+`0*$SZ+4`],%l1
528 ld [$ctx+`1*$SZ+0`],%l2
529 ld [$ctx+`1*$SZ+4`],%l3
530 ld [$ctx+`2*$SZ+0`],%l4
531 ld [$ctx+`2*$SZ+4`],%l5
532 ld [$ctx+`3*$SZ+0`],%l6
533
534 sllx %l0,32,$tmp0
535 ld [$ctx+`3*$SZ+4`],%l7
536 sllx %l2,32,$tmp1
537 or %l1,$tmp0,$tmp0
538 or %l3,$tmp1,$tmp1
539 add $tmp0,$A,$A
540 add $tmp1,$B,$B
541 $ST $A,[$ctx+`0*$SZ`]
542 sllx %l4,32,$tmp2
543 $ST $B,[$ctx+`1*$SZ`]
544 sllx %l6,32,$T1
545 or %l5,$tmp2,$tmp2
546 or %l7,$T1,$T1
547 add $tmp2,$C,$C
548 $ST $C,[$ctx+`2*$SZ`]
549 add $T1,$D,$D
550 $ST $D,[$ctx+`3*$SZ`]
551
552 ld [$ctx+`4*$SZ+0`],%l0
553 ld [$ctx+`4*$SZ+4`],%l1
554 ld [$ctx+`5*$SZ+0`],%l2
555 ld [$ctx+`5*$SZ+4`],%l3
556 ld [$ctx+`6*$SZ+0`],%l4
557 ld [$ctx+`6*$SZ+4`],%l5
558 ld [$ctx+`7*$SZ+0`],%l6
559
560 sllx %l0,32,$tmp0
561 ld [$ctx+`7*$SZ+4`],%l7
562 sllx %l2,32,$tmp1
563 or %l1,$tmp0,$tmp0
564 or %l3,$tmp1,$tmp1
565 add $tmp0,$E,$E
566 add $tmp1,$F,$F
567 $ST $E,[$ctx+`4*$SZ`]
568 sllx %l4,32,$tmp2
569 $ST $F,[$ctx+`5*$SZ`]
570 sllx %l6,32,$T1
571 or %l5,$tmp2,$tmp2
572 or %l7,$T1,$T1
573 add $tmp2,$G,$G
574 $ST $G,[$ctx+`6*$SZ`]
575 add $T1,$H,$H
576 $ST $H,[$ctx+`7*$SZ`]
577___
578$code.=<<___;
579 add $inp,`16*$SZ`,$inp ! advance inp
580 cmp $inp,$len
581 bne `$bits==64?"%xcc":"%icc"`,.Lloop
582 sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
583
584 ret
585 restore
586.type sha${label}_block_data_order,#function
587.size sha${label}_block_data_order,(.-sha${label}_block_data_order)
588.asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
589.align 4
590___
591
592$code =~ s/\`([^\`]*)\`/eval $1/gem;
593print $code;
594close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl b/src/lib/libcrypto/sha/asm/sha512-x86_64.pl
deleted file mode 100755
index feb0f9e776..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl
+++ /dev/null
@@ -1,342 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary
6# forms are granted according to the OpenSSL license.
7# ====================================================================
8#
9# sha256/512_block procedure for x86_64.
10#
11# 40% improvement over compiler-generated code on Opteron. On EM64T
12# sha256 was observed to run >80% faster and sha512 - >40%. No magical
13# tricks, just straight implementation... I really wonder why gcc
14# [being armed with inline assembler] fails to generate as fast code.
15# The only thing which is cool about this module is that it's the very
16# same instruction sequence used for both SHA-256 and SHA-512. In the
17# former case the instructions operate on 32-bit operands, while in the
18# latter - on 64-bit ones. All I had to do was get one flavor right;
19# the other one passed the test right away:-)
20#
21# sha256_block runs in ~1005 cycles on Opteron, which gives you
22# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
23# frequency in GHz. sha512_block runs in ~1275 cycles, which results
24# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
25# Well, if you compare it to IA-64 implementation, which maintains
26# X[16] in register bank[!], tends to 4 instructions per CPU clock
27# cycle and runs in 1003 cycles, 1275 is very good result for 3-way
28# issue Opteron pipeline and X[16] maintained in memory. So that *if*
29# there is a way to improve it, *then* the only way would be to try to
30# offload X[16] updates to SSE unit, but that would require "deeper"
31# loop unroll, which in turn would naturally cause size blow-up, not
32# to mention increased complexity! And once again, only *if* it's
33# actually possible to noticeably improve overall ILP, instruction
34# level parallelism, on a given CPU implementation in this case.
35#
36# Special note on Intel EM64T. While Opteron CPU exhibits perfect
37# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
38# [currently available] EM64T CPUs apparently are far from it. On the
39# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
40# sha256_block:-( This is presumably because 64-bit shifts/rotates
41# apparently are not atomic instructions, but implemented in microcode.
42
43$flavour = shift;
44$output = shift;
45if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
46
47$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
48( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
49( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
50die "can't locate x86_64-xlate.pl";
51
52open OUT,"| \"$^X\" $xlate $flavour $output";
53*STDOUT=*OUT;
54
55if ($output =~ /512/) {
56 $func="sha512_block_data_order";
57 $TABLE="K512";
58 $SZ=8;
59 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
60 "%r8", "%r9", "%r10","%r11");
61 ($T1,$a0,$a1,$a2)=("%r12","%r13","%r14","%r15");
62 @Sigma0=(28,34,39);
63 @Sigma1=(14,18,41);
64 @sigma0=(1, 8, 7);
65 @sigma1=(19,61, 6);
66 $rounds=80;
67} else {
68 $func="sha256_block_data_order";
69 $TABLE="K256";
70 $SZ=4;
71 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
72 "%r8d","%r9d","%r10d","%r11d");
73 ($T1,$a0,$a1,$a2)=("%r12d","%r13d","%r14d","%r15d");
74 @Sigma0=( 2,13,22);
75 @Sigma1=( 6,11,25);
76 @sigma0=( 7,18, 3);
77 @sigma1=(17,19,10);
78 $rounds=64;
79}
80
81$ctx="%rdi"; # 1st arg
82$round="%rdi"; # zaps $ctx
83$inp="%rsi"; # 2nd arg
84$Tbl="%rbp";
85
86$_ctx="16*$SZ+0*8(%rsp)";
87$_inp="16*$SZ+1*8(%rsp)";
88$_end="16*$SZ+2*8(%rsp)";
89$_rsp="16*$SZ+3*8(%rsp)";
90$framesz="16*$SZ+4*8";
91
92
93sub ROUND_00_15()
94{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
95
96$code.=<<___;
97 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
98 mov $f,$a2
99 mov $T1,`$SZ*($i&0xf)`(%rsp)
100
101 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
102 xor $e,$a0
103 xor $g,$a2 # f^g
104
105 ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
106 add $h,$T1 # T1+=h
107 xor $a,$a1
108
109 add ($Tbl,$round,$SZ),$T1 # T1+=K[round]
110 and $e,$a2 # (f^g)&e
111 mov $b,$h
112
113 ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
114 xor $e,$a0
115 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
116
117 xor $c,$h # b^c
118 xor $a,$a1
119 add $a2,$T1 # T1+=Ch(e,f,g)
120 mov $b,$a2
121
122 ror \$$Sigma1[0],$a0 # Sigma1(e)
123 and $a,$h # h=(b^c)&a
124 and $c,$a2 # b&c
125
126 ror \$$Sigma0[0],$a1 # Sigma0(a)
127 add $a0,$T1 # T1+=Sigma1(e)
128	add	$a2,$h			# h+=b&c (completes +=Maj(a,b,c))
129
130 add $T1,$d # d+=T1
131 add $T1,$h # h+=T1
132 lea 1($round),$round # round++
133 add $a1,$h # h+=Sigma0(a)
134
135___
136}
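The round above leans on two Boolean identities noted in its comments; a quick throwaway check (not part of the module) verifies them per bit — being bitwise, they then hold for full words, and since ((b^c)&a) and (b&c) never share set bits, the code may add them instead of XORing:

#include <assert.h>

int
main(void)
{
	unsigned a, b, c, e, f, g;

	/* Maj(a,b,c) = (a&b)^(a&c)^(b&c) = ((b^c)&a)^(b&c) */
	for (a = 0; a < 2; a++) for (b = 0; b < 2; b++) for (c = 0; c < 2; c++)
		assert((((b ^ c) & a) ^ (b & c)) ==
		    ((a & b) ^ (a & c) ^ (b & c)));
	/* Ch(e,f,g) = (e&f)^(~e&g) = ((f^g)&e)^g */
	for (e = 0; e < 2; e++) for (f = 0; f < 2; f++) for (g = 0; g < 2; g++)
		assert((((f ^ g) & e) ^ g) == ((e & f) ^ ((e ^ 1) & g)));
	return 0;
}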
137
138sub ROUND_16_XX()
139{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
140
141$code.=<<___;
142 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
143 mov `$SZ*(($i+14)&0xf)`(%rsp),$a1
144 mov $a0,$T1
145 mov $a1,$a2
146
147 ror \$`$sigma0[1]-$sigma0[0]`,$T1
148 xor $a0,$T1
149 shr \$$sigma0[2],$a0
150
151 ror \$$sigma0[0],$T1
152 xor $T1,$a0 # sigma0(X[(i+1)&0xf])
153 mov `$SZ*(($i+9)&0xf)`(%rsp),$T1
154
155 ror \$`$sigma1[1]-$sigma1[0]`,$a2
156 xor $a1,$a2
157 shr \$$sigma1[2],$a1
158
159 ror \$$sigma1[0],$a2
160 add $a0,$T1
161 xor $a2,$a1 # sigma1(X[(i+14)&0xf])
162
163 add `$SZ*($i&0xf)`(%rsp),$T1
164 mov $e,$a0
165 add $a1,$T1
166 mov $a,$a1
167___
168 &ROUND_00_15(@_);
169}
170
171$code=<<___;
172.text
173
174.globl $func
175.type $func,\@function,4
176.align 16
177$func:
178 push %rbx
179 push %rbp
180 push %r12
181 push %r13
182 push %r14
183 push %r15
184 mov %rsp,%r11 # copy %rsp
185 shl \$4,%rdx # num*16
186 sub \$$framesz,%rsp
187 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
188 and \$-64,%rsp # align stack frame
189 mov $ctx,$_ctx # save ctx, 1st arg
190	mov	$inp,$_inp		# save inp, 2nd arg
191 mov %rdx,$_end # save end pointer, "3rd" arg
192 mov %r11,$_rsp # save copy of %rsp
193.Lprologue:
194
195 lea $TABLE(%rip),$Tbl
196
197 mov $SZ*0($ctx),$A
198 mov $SZ*1($ctx),$B
199 mov $SZ*2($ctx),$C
200 mov $SZ*3($ctx),$D
201 mov $SZ*4($ctx),$E
202 mov $SZ*5($ctx),$F
203 mov $SZ*6($ctx),$G
204 mov $SZ*7($ctx),$H
205 jmp .Lloop
206
207.align 16
208.Lloop:
209 xor $round,$round
210___
211 for($i=0;$i<16;$i++) {
212 $code.=" mov $SZ*$i($inp),$T1\n";
213 $code.=" mov @ROT[4],$a0\n";
214 $code.=" mov @ROT[0],$a1\n";
215 $code.=" bswap $T1\n";
216 &ROUND_00_15($i,@ROT);
217 unshift(@ROT,pop(@ROT));
218 }
219$code.=<<___;
220 jmp .Lrounds_16_xx
221.align 16
222.Lrounds_16_xx:
223___
224 for(;$i<32;$i++) {
225 &ROUND_16_XX($i,@ROT);
226 unshift(@ROT,pop(@ROT));
227 }
228
229$code.=<<___;
230 cmp \$$rounds,$round
231 jb .Lrounds_16_xx
232
233 mov $_ctx,$ctx
234 lea 16*$SZ($inp),$inp
235
236 add $SZ*0($ctx),$A
237 add $SZ*1($ctx),$B
238 add $SZ*2($ctx),$C
239 add $SZ*3($ctx),$D
240 add $SZ*4($ctx),$E
241 add $SZ*5($ctx),$F
242 add $SZ*6($ctx),$G
243 add $SZ*7($ctx),$H
244
245 cmp $_end,$inp
246
247 mov $A,$SZ*0($ctx)
248 mov $B,$SZ*1($ctx)
249 mov $C,$SZ*2($ctx)
250 mov $D,$SZ*3($ctx)
251 mov $E,$SZ*4($ctx)
252 mov $F,$SZ*5($ctx)
253 mov $G,$SZ*6($ctx)
254 mov $H,$SZ*7($ctx)
255 jb .Lloop
256
257 mov $_rsp,%rsi
258 mov (%rsi),%r15
259 mov 8(%rsi),%r14
260 mov 16(%rsi),%r13
261 mov 24(%rsi),%r12
262 mov 32(%rsi),%rbp
263 mov 40(%rsi),%rbx
264 lea 48(%rsi),%rsp
265.Lepilogue:
266 ret
267.size $func,.-$func
268___
269
270if ($SZ==4) {
271$code.=<<___;
272.align 64
273.type $TABLE,\@object
274$TABLE:
275 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
276 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
277 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
278 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
279 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
280 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
281 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
282 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
283 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
284 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
285 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
286 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
287 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
288 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
289 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
290 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
291___
292} else {
293$code.=<<___;
294.align 64
295.type $TABLE,\@object
296$TABLE:
297 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
298 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
299 .quad 0x3956c25bf348b538,0x59f111f1b605d019
300 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
301 .quad 0xd807aa98a3030242,0x12835b0145706fbe
302 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
303 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
304 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
305 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
306 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
307 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
308 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
309 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
310 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
311 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
312 .quad 0x06ca6351e003826f,0x142929670a0e6e70
313 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
314 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
315 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
316 .quad 0x81c2c92e47edaee6,0x92722c851482353b
317 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
318 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
319 .quad 0xd192e819d6ef5218,0xd69906245565a910
320 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
321 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
322 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
323 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
324 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
325 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
326 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
327 .quad 0x90befffa23631e28,0xa4506cebde82bde9
328 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
329 .quad 0xca273eceea26619c,0xd186b8c721c0c207
330 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
331 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
332 .quad 0x113f9804bef90dae,0x1b710b35131c471b
333 .quad 0x28db77f523047d84,0x32caab7b40c72493
334 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
335 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
336 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
337___
338}
339
340$code =~ s/\`([^\`]*)\`/eval $1/gem;
341print $code;
342close STDOUT;
diff --git a/src/lib/libcrypto/sha/sha.h b/src/lib/libcrypto/sha/sha.h
deleted file mode 100644
index d890175159..0000000000
--- a/src/lib/libcrypto/sha/sha.h
+++ /dev/null
@@ -1,201 +0,0 @@
1/* $OpenBSD: sha.h,v 1.20 2014/10/20 13:06:54 bcook Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stddef.h>
60
61#ifndef HEADER_SHA_H
62#define HEADER_SHA_H
63#if !defined(HAVE_ATTRIBUTE__BOUNDED__) && !defined(__OpenBSD__)
64#define __bounded__(x, y, z)
65#endif
66
67#include <openssl/opensslconf.h>
68
69#ifdef __cplusplus
70extern "C" {
71#endif
72
73#if defined(OPENSSL_NO_SHA) || (defined(OPENSSL_NO_SHA0) && defined(OPENSSL_NO_SHA1))
74#error SHA is disabled.
75#endif
76
77/*
78 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
79 * ! SHA_LONG has to be at least 32 bits wide. !
80 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
81 */
82
83#define SHA_LONG unsigned int
84
85#define SHA_LBLOCK 16
86#define SHA_CBLOCK (SHA_LBLOCK*4) /* SHA treats input data as a
87 * contiguous array of 32 bit
88 * wide big-endian values. */
89#define SHA_LAST_BLOCK (SHA_CBLOCK-8)
90#define SHA_DIGEST_LENGTH 20
91
92typedef struct SHAstate_st
93 {
94 SHA_LONG h0,h1,h2,h3,h4;
95 SHA_LONG Nl,Nh;
96 SHA_LONG data[SHA_LBLOCK];
97 unsigned int num;
98 } SHA_CTX;
99
100#ifndef OPENSSL_NO_SHA0
101int SHA_Init(SHA_CTX *c);
102int SHA_Update(SHA_CTX *c, const void *data, size_t len)
103 __attribute__ ((__bounded__(__buffer__,2,3)));
104int SHA_Final(unsigned char *md, SHA_CTX *c);
105unsigned char *SHA(const unsigned char *d, size_t n, unsigned char *md)
106 __attribute__ ((__bounded__(__buffer__,1,2)));
107void SHA_Transform(SHA_CTX *c, const unsigned char *data);
108#endif
109#ifndef OPENSSL_NO_SHA1
110int SHA1_Init(SHA_CTX *c);
111int SHA1_Update(SHA_CTX *c, const void *data, size_t len)
112 __attribute__ ((__bounded__(__buffer__,2,3)));
113int SHA1_Final(unsigned char *md, SHA_CTX *c);
114unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md)
115 __attribute__ ((__bounded__(__buffer__,1,2)));
116void SHA1_Transform(SHA_CTX *c, const unsigned char *data);
117#endif
118
119#define SHA256_CBLOCK (SHA_LBLOCK*4) /* SHA-256 treats input data as a
120 * contiguous array of 32 bit
121 * wide big-endian values. */
122#define SHA224_DIGEST_LENGTH 28
123#define SHA256_DIGEST_LENGTH 32
124
125typedef struct SHA256state_st
126 {
127 SHA_LONG h[8];
128 SHA_LONG Nl,Nh;
129 SHA_LONG data[SHA_LBLOCK];
130 unsigned int num,md_len;
131 } SHA256_CTX;
132
133#ifndef OPENSSL_NO_SHA256
134int SHA224_Init(SHA256_CTX *c);
135int SHA224_Update(SHA256_CTX *c, const void *data, size_t len)
136 __attribute__ ((__bounded__(__buffer__,2,3)));
137int SHA224_Final(unsigned char *md, SHA256_CTX *c);
138unsigned char *SHA224(const unsigned char *d, size_t n,unsigned char *md)
139 __attribute__ ((__bounded__(__buffer__,1,2)));
140int SHA256_Init(SHA256_CTX *c);
141int SHA256_Update(SHA256_CTX *c, const void *data, size_t len)
142 __attribute__ ((__bounded__(__buffer__,2,3)));
143int SHA256_Final(unsigned char *md, SHA256_CTX *c);
144unsigned char *SHA256(const unsigned char *d, size_t n,unsigned char *md)
145 __attribute__ ((__bounded__(__buffer__,1,2)));
146void SHA256_Transform(SHA256_CTX *c, const unsigned char *data);
147#endif
148
149#define SHA384_DIGEST_LENGTH 48
150#define SHA512_DIGEST_LENGTH 64
151
152#ifndef OPENSSL_NO_SHA512
153/*
154 * Unlike 32-bit digest algorithms, SHA-512 *relies* on SHA_LONG64
155 * being exactly 64-bit wide. See Implementation Notes in sha512.c
156 * for further details.
157 */
158#define SHA512_CBLOCK (SHA_LBLOCK*8) /* SHA-512 treats input data as a
159 * contiguous array of 64 bit
160 * wide big-endian values. */
161#if defined(_LP64)
162#define SHA_LONG64 unsigned long
163#define U64(C) C##UL
164#else
165#define SHA_LONG64 unsigned long long
166#define U64(C) C##ULL
167#endif
168
169typedef struct SHA512state_st
170 {
171 SHA_LONG64 h[8];
172 SHA_LONG64 Nl,Nh;
173 union {
174 SHA_LONG64 d[SHA_LBLOCK];
175 unsigned char p[SHA512_CBLOCK];
176 } u;
177 unsigned int num,md_len;
178 } SHA512_CTX;
179#endif
180
181#ifndef OPENSSL_NO_SHA512
182int SHA384_Init(SHA512_CTX *c);
183int SHA384_Update(SHA512_CTX *c, const void *data, size_t len)
184 __attribute__ ((__bounded__(__buffer__,2,3)));
185int SHA384_Final(unsigned char *md, SHA512_CTX *c);
186unsigned char *SHA384(const unsigned char *d, size_t n,unsigned char *md)
187 __attribute__ ((__bounded__(__buffer__,1,2)));
188int SHA512_Init(SHA512_CTX *c);
189int SHA512_Update(SHA512_CTX *c, const void *data, size_t len)
190 __attribute__ ((__bounded__(__buffer__,2,3)));
191int SHA512_Final(unsigned char *md, SHA512_CTX *c);
192unsigned char *SHA512(const unsigned char *d, size_t n,unsigned char *md)
193 __attribute__ ((__bounded__(__buffer__,1,2)));
194void SHA512_Transform(SHA512_CTX *c, const unsigned char *data);
195#endif
196
197#ifdef __cplusplus
198}
199#endif
200
201#endif
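For orientation, a minimal usage sketch of the streaming interface declared above (not part of the header; error handling omitted):

#include <stdio.h>
#include <string.h>

#include <openssl/sha.h>

int
main(void)
{
	SHA256_CTX c;
	unsigned char md[SHA256_DIGEST_LENGTH];
	const char *msg = "abc";
	unsigned int i;

	SHA256_Init(&c);
	SHA256_Update(&c, msg, strlen(msg));
	SHA256_Final(md, &c);

	for (i = 0; i < SHA256_DIGEST_LENGTH; i++)
		printf("%02x", md[i]);
	printf("\n");	/* FIPS 180-2 "abc" vector begins ba7816bf... */
	return 0;
}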
diff --git a/src/lib/libcrypto/sha/sha1_one.c b/src/lib/libcrypto/sha/sha1_one.c
deleted file mode 100644
index f6b5e4bacf..0000000000
--- a/src/lib/libcrypto/sha/sha1_one.c
+++ /dev/null
@@ -1,81 +0,0 @@
1/* $OpenBSD: sha1_one.c,v 1.11 2014/07/10 22:45:58 jsing Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60#include <string.h>
61
62#include <openssl/opensslconf.h>
63
64#include <openssl/crypto.h>
65#include <openssl/sha.h>
66
67#ifndef OPENSSL_NO_SHA1
68unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md)
69 {
70 SHA_CTX c;
71 static unsigned char m[SHA_DIGEST_LENGTH];
72
73 if (md == NULL) md=m;
74 if (!SHA1_Init(&c))
75 return NULL;
76 SHA1_Update(&c,d,n);
77 SHA1_Final(md,&c);
78 OPENSSL_cleanse(&c,sizeof(c));
79 return(md);
80 }
81#endif
diff --git a/src/lib/libcrypto/sha/sha1dgst.c b/src/lib/libcrypto/sha/sha1dgst.c
deleted file mode 100644
index aac27bdd2d..0000000000
--- a/src/lib/libcrypto/sha/sha1dgst.c
+++ /dev/null
@@ -1,75 +0,0 @@
1/* $OpenBSD: sha1dgst.c,v 1.13 2014/07/10 22:45:58 jsing Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <openssl/opensslconf.h>
60
61#include <openssl/crypto.h>
62
63#if !defined(OPENSSL_NO_SHA1) && !defined(OPENSSL_NO_SHA)
64
65#undef SHA_0
66#define SHA_1
67
68#include <openssl/opensslv.h>
69
70/* The implementation is in ../md32_common.h */
71
72#include "sha_locl.h"
73
74#endif
75
diff --git a/src/lib/libcrypto/sha/sha256.c b/src/lib/libcrypto/sha/sha256.c
deleted file mode 100644
index c5ab56852f..0000000000
--- a/src/lib/libcrypto/sha/sha256.c
+++ /dev/null
@@ -1,284 +0,0 @@
1/* $OpenBSD: sha256.c,v 1.8 2014/08/18 19:11:48 bcook Exp $ */
2/* ====================================================================
3 * Copyright (c) 2004 The OpenSSL Project. All rights reserved
4 * according to the OpenSSL license [found in ../../LICENSE].
5 * ====================================================================
6 */
7
8#include <openssl/opensslconf.h>
9
10#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA256)
11
12#include <machine/endian.h>
13
14#include <stdlib.h>
15#include <string.h>
16
17#include <openssl/crypto.h>
18#include <openssl/sha.h>
19#include <openssl/opensslv.h>
20
21int SHA224_Init(SHA256_CTX *c)
22 {
23 memset (c,0,sizeof(*c));
24 c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL;
25 c->h[2]=0x3070dd17UL; c->h[3]=0xf70e5939UL;
26 c->h[4]=0xffc00b31UL; c->h[5]=0x68581511UL;
27 c->h[6]=0x64f98fa7UL; c->h[7]=0xbefa4fa4UL;
28 c->md_len=SHA224_DIGEST_LENGTH;
29 return 1;
30 }
31
32int SHA256_Init(SHA256_CTX *c)
33 {
34 memset (c,0,sizeof(*c));
35 c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL;
36 c->h[2]=0x3c6ef372UL; c->h[3]=0xa54ff53aUL;
37 c->h[4]=0x510e527fUL; c->h[5]=0x9b05688cUL;
38 c->h[6]=0x1f83d9abUL; c->h[7]=0x5be0cd19UL;
39 c->md_len=SHA256_DIGEST_LENGTH;
40 return 1;
41 }
42
43unsigned char *SHA224(const unsigned char *d, size_t n, unsigned char *md)
44 {
45 SHA256_CTX c;
46 static unsigned char m[SHA224_DIGEST_LENGTH];
47
48 if (md == NULL) md=m;
49 SHA224_Init(&c);
50 SHA256_Update(&c,d,n);
51 SHA256_Final(md,&c);
52 OPENSSL_cleanse(&c,sizeof(c));
53 return(md);
54 }
55
56unsigned char *SHA256(const unsigned char *d, size_t n, unsigned char *md)
57 {
58 SHA256_CTX c;
59 static unsigned char m[SHA256_DIGEST_LENGTH];
60
61 if (md == NULL) md=m;
62 SHA256_Init(&c);
63 SHA256_Update(&c,d,n);
64 SHA256_Final(md,&c);
65 OPENSSL_cleanse(&c,sizeof(c));
66 return(md);
67 }
68
69int SHA224_Update(SHA256_CTX *c, const void *data, size_t len)
70{ return SHA256_Update (c,data,len); }
71int SHA224_Final (unsigned char *md, SHA256_CTX *c)
72{ return SHA256_Final (md,c); }
73
74#define DATA_ORDER_IS_BIG_ENDIAN
75
76#define HASH_LONG SHA_LONG
77#define HASH_CTX SHA256_CTX
78#define HASH_CBLOCK SHA_CBLOCK
79/*
80 * Note that FIPS180-2 discusses "Truncation of the Hash Function Output."
81 * The default: case below covers it. It's not clear, however, whether it's
82 * permitted to truncate to a number of bytes not divisible by 4. I bet not,
83 * but if it is, then the default: case shall be extended. For reference.
84 * The idea behind separate cases for pre-defined lengths is to let the
85 * compiler decide if it's appropriate to unroll small loops.
86 */
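/*
 * For example, SHA-224 emits SHA224_DIGEST_LENGTH/4 = 7 of the 8 state
 * words (the first 28 bytes, i.e. 224 bits), while SHA-256 emits all 8.
 */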
87#define HASH_MAKE_STRING(c,s) do { \
88 unsigned long ll; \
89 unsigned int nn; \
90 switch ((c)->md_len) \
91 { case SHA224_DIGEST_LENGTH: \
92 for (nn=0;nn<SHA224_DIGEST_LENGTH/4;nn++) \
93 { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
94 break; \
95 case SHA256_DIGEST_LENGTH: \
96 for (nn=0;nn<SHA256_DIGEST_LENGTH/4;nn++) \
97 { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
98 break; \
99 default: \
100 if ((c)->md_len > SHA256_DIGEST_LENGTH) \
101 return 0; \
102 for (nn=0;nn<(c)->md_len/4;nn++) \
103 { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
104 break; \
105 } \
106 } while (0)
107
108#define HASH_UPDATE SHA256_Update
109#define HASH_TRANSFORM SHA256_Transform
110#define HASH_FINAL SHA256_Final
111#define HASH_BLOCK_DATA_ORDER sha256_block_data_order
112#ifndef SHA256_ASM
113static
114#endif
115void sha256_block_data_order (SHA256_CTX *ctx, const void *in, size_t num);
116
117#include "md32_common.h"
118
119#ifndef SHA256_ASM
120static const SHA_LONG K256[64] = {
121 0x428a2f98UL,0x71374491UL,0xb5c0fbcfUL,0xe9b5dba5UL,
122 0x3956c25bUL,0x59f111f1UL,0x923f82a4UL,0xab1c5ed5UL,
123 0xd807aa98UL,0x12835b01UL,0x243185beUL,0x550c7dc3UL,
124 0x72be5d74UL,0x80deb1feUL,0x9bdc06a7UL,0xc19bf174UL,
125 0xe49b69c1UL,0xefbe4786UL,0x0fc19dc6UL,0x240ca1ccUL,
126 0x2de92c6fUL,0x4a7484aaUL,0x5cb0a9dcUL,0x76f988daUL,
127 0x983e5152UL,0xa831c66dUL,0xb00327c8UL,0xbf597fc7UL,
128 0xc6e00bf3UL,0xd5a79147UL,0x06ca6351UL,0x14292967UL,
129 0x27b70a85UL,0x2e1b2138UL,0x4d2c6dfcUL,0x53380d13UL,
130 0x650a7354UL,0x766a0abbUL,0x81c2c92eUL,0x92722c85UL,
131 0xa2bfe8a1UL,0xa81a664bUL,0xc24b8b70UL,0xc76c51a3UL,
132 0xd192e819UL,0xd6990624UL,0xf40e3585UL,0x106aa070UL,
133 0x19a4c116UL,0x1e376c08UL,0x2748774cUL,0x34b0bcb5UL,
134 0x391c0cb3UL,0x4ed8aa4aUL,0x5b9cca4fUL,0x682e6ff3UL,
135 0x748f82eeUL,0x78a5636fUL,0x84c87814UL,0x8cc70208UL,
136 0x90befffaUL,0xa4506cebUL,0xbef9a3f7UL,0xc67178f2UL };
137
138/*
139 * The FIPS specification refers to right rotations, while our ROTATE macro
140 * is a left one. This is why you might notice that the rotation coefficients
141 * differ from those in the FIPS document by 32-N...
142 */
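/*
 * For instance: a left rotation by 32-N is the same 32-bit permutation as a
 * right rotation by N, so FIPS's Sigma0(x) = ROTR(x,2)^ROTR(x,13)^ROTR(x,22)
 * appears below as ROTATE(x,30)^ROTATE(x,19)^ROTATE(x,10); the plain shifts
 * in sigma0/sigma1 are unaffected.
 */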
143#define Sigma0(x) (ROTATE((x),30) ^ ROTATE((x),19) ^ ROTATE((x),10))
144#define Sigma1(x) (ROTATE((x),26) ^ ROTATE((x),21) ^ ROTATE((x),7))
145#define sigma0(x) (ROTATE((x),25) ^ ROTATE((x),14) ^ ((x)>>3))
146#define sigma1(x) (ROTATE((x),15) ^ ROTATE((x),13) ^ ((x)>>10))
147
148#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
149#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
150
151#ifdef OPENSSL_SMALL_FOOTPRINT
152
153static void sha256_block_data_order (SHA256_CTX *ctx, const void *in, size_t num)
154 {
155 unsigned MD32_REG_T a,b,c,d,e,f,g,h,s0,s1,T1,T2;
156 SHA_LONG X[16],l;
157 int i;
158 const unsigned char *data=in;
159
160 while (num--) {
161
162 a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
163 e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7];
164
165 for (i=0;i<16;i++)
166 {
167 HOST_c2l(data,l); T1 = X[i] = l;
168 T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i];
169 T2 = Sigma0(a) + Maj(a,b,c);
170 h = g; g = f; f = e; e = d + T1;
171 d = c; c = b; b = a; a = T1 + T2;
172 }
173
174 for (;i<64;i++)
175 {
176 s0 = X[(i+1)&0x0f]; s0 = sigma0(s0);
177 s1 = X[(i+14)&0x0f]; s1 = sigma1(s1);
178
179 T1 = X[i&0xf] += s0 + s1 + X[(i+9)&0xf];
180 T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i];
181 T2 = Sigma0(a) + Maj(a,b,c);
182 h = g; g = f; f = e; e = d + T1;
183 d = c; c = b; b = a; a = T1 + T2;
184 }
185
186 ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
187 ctx->h[4] += e; ctx->h[5] += f; ctx->h[6] += g; ctx->h[7] += h;
188
189 }
190}
191
192#else
193
194#define ROUND_00_15(i,a,b,c,d,e,f,g,h) do { \
195 T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i]; \
196 h = Sigma0(a) + Maj(a,b,c); \
197 d += T1; h += T1; } while (0)
198
199#define ROUND_16_63(i,a,b,c,d,e,f,g,h,X) do { \
200 s0 = X[(i+1)&0x0f]; s0 = sigma0(s0); \
201 s1 = X[(i+14)&0x0f]; s1 = sigma1(s1); \
202 T1 = X[(i)&0x0f] += s0 + s1 + X[(i+9)&0x0f]; \
203 ROUND_00_15(i,a,b,c,d,e,f,g,h); } while (0)
204
205static void sha256_block_data_order (SHA256_CTX *ctx, const void *in, size_t num)
206 {
207 unsigned MD32_REG_T a,b,c,d,e,f,g,h,s0,s1,T1;
208 SHA_LONG X[16];
209 int i;
210 const unsigned char *data=in;
211
212 while (num--) {
213
214 a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
215 e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7];
216
217 if (BYTE_ORDER != LITTLE_ENDIAN &&
218 sizeof(SHA_LONG)==4 && ((size_t)in%4)==0)
219 {
220 const SHA_LONG *W=(const SHA_LONG *)data;
221
222 T1 = X[0] = W[0]; ROUND_00_15(0,a,b,c,d,e,f,g,h);
223 T1 = X[1] = W[1]; ROUND_00_15(1,h,a,b,c,d,e,f,g);
224 T1 = X[2] = W[2]; ROUND_00_15(2,g,h,a,b,c,d,e,f);
225 T1 = X[3] = W[3]; ROUND_00_15(3,f,g,h,a,b,c,d,e);
226 T1 = X[4] = W[4]; ROUND_00_15(4,e,f,g,h,a,b,c,d);
227 T1 = X[5] = W[5]; ROUND_00_15(5,d,e,f,g,h,a,b,c);
228 T1 = X[6] = W[6]; ROUND_00_15(6,c,d,e,f,g,h,a,b);
229 T1 = X[7] = W[7]; ROUND_00_15(7,b,c,d,e,f,g,h,a);
230 T1 = X[8] = W[8]; ROUND_00_15(8,a,b,c,d,e,f,g,h);
231 T1 = X[9] = W[9]; ROUND_00_15(9,h,a,b,c,d,e,f,g);
232 T1 = X[10] = W[10]; ROUND_00_15(10,g,h,a,b,c,d,e,f);
233 T1 = X[11] = W[11]; ROUND_00_15(11,f,g,h,a,b,c,d,e);
234 T1 = X[12] = W[12]; ROUND_00_15(12,e,f,g,h,a,b,c,d);
235 T1 = X[13] = W[13]; ROUND_00_15(13,d,e,f,g,h,a,b,c);
236 T1 = X[14] = W[14]; ROUND_00_15(14,c,d,e,f,g,h,a,b);
237 T1 = X[15] = W[15]; ROUND_00_15(15,b,c,d,e,f,g,h,a);
238
239 data += SHA256_CBLOCK;
240 }
241 else
242 {
243 SHA_LONG l;
244
245 HOST_c2l(data,l); T1 = X[0] = l; ROUND_00_15(0,a,b,c,d,e,f,g,h);
246 HOST_c2l(data,l); T1 = X[1] = l; ROUND_00_15(1,h,a,b,c,d,e,f,g);
247 HOST_c2l(data,l); T1 = X[2] = l; ROUND_00_15(2,g,h,a,b,c,d,e,f);
248 HOST_c2l(data,l); T1 = X[3] = l; ROUND_00_15(3,f,g,h,a,b,c,d,e);
249 HOST_c2l(data,l); T1 = X[4] = l; ROUND_00_15(4,e,f,g,h,a,b,c,d);
250 HOST_c2l(data,l); T1 = X[5] = l; ROUND_00_15(5,d,e,f,g,h,a,b,c);
251 HOST_c2l(data,l); T1 = X[6] = l; ROUND_00_15(6,c,d,e,f,g,h,a,b);
252 HOST_c2l(data,l); T1 = X[7] = l; ROUND_00_15(7,b,c,d,e,f,g,h,a);
253 HOST_c2l(data,l); T1 = X[8] = l; ROUND_00_15(8,a,b,c,d,e,f,g,h);
254 HOST_c2l(data,l); T1 = X[9] = l; ROUND_00_15(9,h,a,b,c,d,e,f,g);
255 HOST_c2l(data,l); T1 = X[10] = l; ROUND_00_15(10,g,h,a,b,c,d,e,f);
256 HOST_c2l(data,l); T1 = X[11] = l; ROUND_00_15(11,f,g,h,a,b,c,d,e);
257 HOST_c2l(data,l); T1 = X[12] = l; ROUND_00_15(12,e,f,g,h,a,b,c,d);
258 HOST_c2l(data,l); T1 = X[13] = l; ROUND_00_15(13,d,e,f,g,h,a,b,c);
259 HOST_c2l(data,l); T1 = X[14] = l; ROUND_00_15(14,c,d,e,f,g,h,a,b);
260 HOST_c2l(data,l); T1 = X[15] = l; ROUND_00_15(15,b,c,d,e,f,g,h,a);
261 }
262
263 for (i=16;i<64;i+=8)
264 {
265 ROUND_16_63(i+0,a,b,c,d,e,f,g,h,X);
266 ROUND_16_63(i+1,h,a,b,c,d,e,f,g,X);
267 ROUND_16_63(i+2,g,h,a,b,c,d,e,f,X);
268 ROUND_16_63(i+3,f,g,h,a,b,c,d,e,X);
269 ROUND_16_63(i+4,e,f,g,h,a,b,c,d,X);
270 ROUND_16_63(i+5,d,e,f,g,h,a,b,c,X);
271 ROUND_16_63(i+6,c,d,e,f,g,h,a,b,X);
272 ROUND_16_63(i+7,b,c,d,e,f,g,h,a,X);
273 }
274
275 ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
276 ctx->h[4] += e; ctx->h[5] += f; ctx->h[6] += g; ctx->h[7] += h;
277
278 }
279 }
280
281#endif
282#endif /* SHA256_ASM */
283
284#endif /* OPENSSL_NO_SHA256 */
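A minimal caller sketch for the interface removed above (assuming a libcrypto build that still provides <openssl/sha.h> and is linked with -lcrypto); it exercises both the one-shot SHA256() entry point and the incremental SHA256_Init/SHA256_Update/SHA256_Final interface and prints the digest in hex:

#include <stdio.h>
#include <string.h>

#include <openssl/sha.h>

int
main(void)
{
	const char *msg = "abc";
	unsigned char md[SHA256_DIGEST_LENGTH];
	SHA256_CTX ctx;
	int i;

	/* One-shot interface. */
	SHA256((const unsigned char *)msg, strlen(msg), md);

	/* Incremental interface; yields the same digest. */
	SHA256_Init(&ctx);
	SHA256_Update(&ctx, msg, strlen(msg));
	SHA256_Final(md, &ctx);

	for (i = 0; i < SHA256_DIGEST_LENGTH; i++)
		printf("%02x", md[i]);
	printf("\n");
	return (0);
}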
diff --git a/src/lib/libcrypto/sha/sha512.c b/src/lib/libcrypto/sha/sha512.c
deleted file mode 100644
index ad72b7e6f1..0000000000
--- a/src/lib/libcrypto/sha/sha512.c
+++ /dev/null
@@ -1,558 +0,0 @@
1/* $OpenBSD: sha512.c,v 1.13 2014/07/11 08:44:49 jsing Exp $ */
2/* ====================================================================
3 * Copyright (c) 2004 The OpenSSL Project. All rights reserved
4 * according to the OpenSSL license [found in ../../LICENSE].
5 * ====================================================================
6 */
7
8#include <machine/endian.h>
9
10#include <stdlib.h>
11#include <string.h>
12
13#include <openssl/opensslconf.h>
14
15#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA512)
16/*
17 * IMPLEMENTATION NOTES.
18 *
19 * As you might have noticed, 32-bit hash algorithms:
20 *
21 * - permit SHA_LONG to be wider than 32-bit (case on CRAY);
22 * - optimized versions implement two transform functions: one operating
23 * on [aligned] data in host byte order and one - on data in input
24 * stream byte order;
25 * - share common byte-order neutral collector and padding function
26 * implementations, ../md32_common.h;
27 *
28 * None of the above applies to this SHA-512 implementation. Reasons
29 * [in reverse order] are:
30 *
31 * - it's the only 64-bit hash algorithm at the moment of this writing,
32 * there is no need for common collector/padding implementation [yet];
33 * - by supporting only one transform function [which operates on
34 * *aligned* data in input stream byte order, big-endian in this case]
35 * we minimize burden of maintenance in two ways: a) collector/padding
36 * function is simpler; b) only one transform function to stare at;
37 * - SHA_LONG64 is required to be exactly 64-bit in order to be able to
38 * apply a number of optimizations to mitigate potential performance
39 * penalties caused by previous design decision;
40 *
41 * Caveat lector.
42 *
43 * Implementation relies on the fact that "long long" is 64-bit on
44 * both 32- and 64-bit platforms. If some compiler vendor comes up
45 * with 128-bit long long, adjustment to sha.h would be required.
46 * As this implementation relies on 64-bit integer type, it's totally
47 * inappropriate for platforms which don't support it, most notably
48 * 16-bit platforms.
49 * <appro@fy.chalmers.se>
50 */
51
52#include <openssl/crypto.h>
53#include <openssl/opensslv.h>
54#include <openssl/sha.h>
55
56#if !defined(__STRICT_ALIGNMENT) || defined(SHA512_ASM)
57#define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
58#endif
59
60int SHA384_Init(SHA512_CTX *c)
61 {
62 c->h[0]=U64(0xcbbb9d5dc1059ed8);
63 c->h[1]=U64(0x629a292a367cd507);
64 c->h[2]=U64(0x9159015a3070dd17);
65 c->h[3]=U64(0x152fecd8f70e5939);
66 c->h[4]=U64(0x67332667ffc00b31);
67 c->h[5]=U64(0x8eb44a8768581511);
68 c->h[6]=U64(0xdb0c2e0d64f98fa7);
69 c->h[7]=U64(0x47b5481dbefa4fa4);
70
71 c->Nl=0; c->Nh=0;
72 c->num=0; c->md_len=SHA384_DIGEST_LENGTH;
73 return 1;
74 }
75
76int SHA512_Init(SHA512_CTX *c)
77 {
78 c->h[0]=U64(0x6a09e667f3bcc908);
79 c->h[1]=U64(0xbb67ae8584caa73b);
80 c->h[2]=U64(0x3c6ef372fe94f82b);
81 c->h[3]=U64(0xa54ff53a5f1d36f1);
82 c->h[4]=U64(0x510e527fade682d1);
83 c->h[5]=U64(0x9b05688c2b3e6c1f);
84 c->h[6]=U64(0x1f83d9abfb41bd6b);
85 c->h[7]=U64(0x5be0cd19137e2179);
86
87 c->Nl=0; c->Nh=0;
88 c->num=0; c->md_len=SHA512_DIGEST_LENGTH;
89 return 1;
90 }
91
92#ifndef SHA512_ASM
93static
94#endif
95void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num);
96
97int SHA512_Final (unsigned char *md, SHA512_CTX *c)
98 {
99 unsigned char *p=(unsigned char *)c->u.p;
100 size_t n=c->num;
101
102	p[n]=0x80; /* There is always room for one */
103 n++;
104 if (n > (sizeof(c->u)-16))
105 memset (p+n,0,sizeof(c->u)-n), n=0,
106 sha512_block_data_order (c,p,1);
107
108 memset (p+n,0,sizeof(c->u)-16-n);
109#if BYTE_ORDER == BIG_ENDIAN
110 c->u.d[SHA_LBLOCK-2] = c->Nh;
111 c->u.d[SHA_LBLOCK-1] = c->Nl;
112#else
113 p[sizeof(c->u)-1] = (unsigned char)(c->Nl);
114 p[sizeof(c->u)-2] = (unsigned char)(c->Nl>>8);
115 p[sizeof(c->u)-3] = (unsigned char)(c->Nl>>16);
116 p[sizeof(c->u)-4] = (unsigned char)(c->Nl>>24);
117 p[sizeof(c->u)-5] = (unsigned char)(c->Nl>>32);
118 p[sizeof(c->u)-6] = (unsigned char)(c->Nl>>40);
119 p[sizeof(c->u)-7] = (unsigned char)(c->Nl>>48);
120 p[sizeof(c->u)-8] = (unsigned char)(c->Nl>>56);
121 p[sizeof(c->u)-9] = (unsigned char)(c->Nh);
122 p[sizeof(c->u)-10] = (unsigned char)(c->Nh>>8);
123 p[sizeof(c->u)-11] = (unsigned char)(c->Nh>>16);
124 p[sizeof(c->u)-12] = (unsigned char)(c->Nh>>24);
125 p[sizeof(c->u)-13] = (unsigned char)(c->Nh>>32);
126 p[sizeof(c->u)-14] = (unsigned char)(c->Nh>>40);
127 p[sizeof(c->u)-15] = (unsigned char)(c->Nh>>48);
128 p[sizeof(c->u)-16] = (unsigned char)(c->Nh>>56);
129#endif
130
131 sha512_block_data_order (c,p,1);
132
133 if (md==0) return 0;
134
135 switch (c->md_len)
136 {
137 /* Let compiler decide if it's appropriate to unroll... */
138 case SHA384_DIGEST_LENGTH:
139 for (n=0;n<SHA384_DIGEST_LENGTH/8;n++)
140 {
141 SHA_LONG64 t = c->h[n];
142
143 *(md++) = (unsigned char)(t>>56);
144 *(md++) = (unsigned char)(t>>48);
145 *(md++) = (unsigned char)(t>>40);
146 *(md++) = (unsigned char)(t>>32);
147 *(md++) = (unsigned char)(t>>24);
148 *(md++) = (unsigned char)(t>>16);
149 *(md++) = (unsigned char)(t>>8);
150 *(md++) = (unsigned char)(t);
151 }
152 break;
153 case SHA512_DIGEST_LENGTH:
154 for (n=0;n<SHA512_DIGEST_LENGTH/8;n++)
155 {
156 SHA_LONG64 t = c->h[n];
157
158 *(md++) = (unsigned char)(t>>56);
159 *(md++) = (unsigned char)(t>>48);
160 *(md++) = (unsigned char)(t>>40);
161 *(md++) = (unsigned char)(t>>32);
162 *(md++) = (unsigned char)(t>>24);
163 *(md++) = (unsigned char)(t>>16);
164 *(md++) = (unsigned char)(t>>8);
165 *(md++) = (unsigned char)(t);
166 }
167 break;
168 /* ... as well as make sure md_len is not abused. */
169 default: return 0;
170 }
171
172 return 1;
173 }
174
175int SHA384_Final (unsigned char *md,SHA512_CTX *c)
176{ return SHA512_Final (md,c); }
177
178int SHA512_Update (SHA512_CTX *c, const void *_data, size_t len)
179 {
180 SHA_LONG64 l;
181 unsigned char *p=c->u.p;
182 const unsigned char *data=(const unsigned char *)_data;
183
184 if (len==0) return 1;
185
186 l = (c->Nl+(((SHA_LONG64)len)<<3))&U64(0xffffffffffffffff);
187 if (l < c->Nl) c->Nh++;
188 if (sizeof(len)>=8) c->Nh+=(((SHA_LONG64)len)>>61);
189 c->Nl=l;
190
191 if (c->num != 0)
192 {
193 size_t n = sizeof(c->u) - c->num;
194
195 if (len < n)
196 {
197 memcpy (p+c->num,data,len), c->num += (unsigned int)len;
198 return 1;
199 }
200 else {
201 memcpy (p+c->num,data,n), c->num = 0;
202 len-=n, data+=n;
203 sha512_block_data_order (c,p,1);
204 }
205 }
206
207 if (len >= sizeof(c->u))
208 {
209#ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
210 if ((size_t)data%sizeof(c->u.d[0]) != 0)
211 while (len >= sizeof(c->u))
212 memcpy (p,data,sizeof(c->u)),
213 sha512_block_data_order (c,p,1),
214 len -= sizeof(c->u),
215 data += sizeof(c->u);
216 else
217#endif
218 sha512_block_data_order (c,data,len/sizeof(c->u)),
219 data += len,
220 len %= sizeof(c->u),
221 data -= len;
222 }
223
224 if (len != 0) memcpy (p,data,len), c->num = (int)len;
225
226 return 1;
227 }
228
229int SHA384_Update (SHA512_CTX *c, const void *data, size_t len)
230{ return SHA512_Update (c,data,len); }
231
232void SHA512_Transform (SHA512_CTX *c, const unsigned char *data)
233 {
234#ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
235 if ((size_t)data%sizeof(c->u.d[0]) != 0)
236 memcpy(c->u.p,data,sizeof(c->u.p)),
237 data = c->u.p;
238#endif
239 sha512_block_data_order (c,data,1);
240 }
241
242unsigned char *SHA384(const unsigned char *d, size_t n, unsigned char *md)
243 {
244 SHA512_CTX c;
245 static unsigned char m[SHA384_DIGEST_LENGTH];
246
247 if (md == NULL) md=m;
248 SHA384_Init(&c);
249 SHA512_Update(&c,d,n);
250 SHA512_Final(md,&c);
251 OPENSSL_cleanse(&c,sizeof(c));
252 return(md);
253 }
254
255unsigned char *SHA512(const unsigned char *d, size_t n, unsigned char *md)
256 {
257 SHA512_CTX c;
258 static unsigned char m[SHA512_DIGEST_LENGTH];
259
260 if (md == NULL) md=m;
261 SHA512_Init(&c);
262 SHA512_Update(&c,d,n);
263 SHA512_Final(md,&c);
264 OPENSSL_cleanse(&c,sizeof(c));
265 return(md);
266 }
267
268#ifndef SHA512_ASM
269static const SHA_LONG64 K512[80] = {
270 U64(0x428a2f98d728ae22),U64(0x7137449123ef65cd),
271 U64(0xb5c0fbcfec4d3b2f),U64(0xe9b5dba58189dbbc),
272 U64(0x3956c25bf348b538),U64(0x59f111f1b605d019),
273 U64(0x923f82a4af194f9b),U64(0xab1c5ed5da6d8118),
274 U64(0xd807aa98a3030242),U64(0x12835b0145706fbe),
275 U64(0x243185be4ee4b28c),U64(0x550c7dc3d5ffb4e2),
276 U64(0x72be5d74f27b896f),U64(0x80deb1fe3b1696b1),
277 U64(0x9bdc06a725c71235),U64(0xc19bf174cf692694),
278 U64(0xe49b69c19ef14ad2),U64(0xefbe4786384f25e3),
279 U64(0x0fc19dc68b8cd5b5),U64(0x240ca1cc77ac9c65),
280 U64(0x2de92c6f592b0275),U64(0x4a7484aa6ea6e483),
281 U64(0x5cb0a9dcbd41fbd4),U64(0x76f988da831153b5),
282 U64(0x983e5152ee66dfab),U64(0xa831c66d2db43210),
283 U64(0xb00327c898fb213f),U64(0xbf597fc7beef0ee4),
284 U64(0xc6e00bf33da88fc2),U64(0xd5a79147930aa725),
285 U64(0x06ca6351e003826f),U64(0x142929670a0e6e70),
286 U64(0x27b70a8546d22ffc),U64(0x2e1b21385c26c926),
287 U64(0x4d2c6dfc5ac42aed),U64(0x53380d139d95b3df),
288 U64(0x650a73548baf63de),U64(0x766a0abb3c77b2a8),
289 U64(0x81c2c92e47edaee6),U64(0x92722c851482353b),
290 U64(0xa2bfe8a14cf10364),U64(0xa81a664bbc423001),
291 U64(0xc24b8b70d0f89791),U64(0xc76c51a30654be30),
292 U64(0xd192e819d6ef5218),U64(0xd69906245565a910),
293 U64(0xf40e35855771202a),U64(0x106aa07032bbd1b8),
294 U64(0x19a4c116b8d2d0c8),U64(0x1e376c085141ab53),
295 U64(0x2748774cdf8eeb99),U64(0x34b0bcb5e19b48a8),
296 U64(0x391c0cb3c5c95a63),U64(0x4ed8aa4ae3418acb),
297 U64(0x5b9cca4f7763e373),U64(0x682e6ff3d6b2b8a3),
298 U64(0x748f82ee5defb2fc),U64(0x78a5636f43172f60),
299 U64(0x84c87814a1f0ab72),U64(0x8cc702081a6439ec),
300 U64(0x90befffa23631e28),U64(0xa4506cebde82bde9),
301 U64(0xbef9a3f7b2c67915),U64(0xc67178f2e372532b),
302 U64(0xca273eceea26619c),U64(0xd186b8c721c0c207),
303 U64(0xeada7dd6cde0eb1e),U64(0xf57d4f7fee6ed178),
304 U64(0x06f067aa72176fba),U64(0x0a637dc5a2c898a6),
305 U64(0x113f9804bef90dae),U64(0x1b710b35131c471b),
306 U64(0x28db77f523047d84),U64(0x32caab7b40c72493),
307 U64(0x3c9ebe0a15c9bebc),U64(0x431d67c49c100d4c),
308 U64(0x4cc5d4becb3e42b6),U64(0x597f299cfc657e2a),
309 U64(0x5fcb6fab3ad6faec),U64(0x6c44198c4a475817) };
310
311#if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
312# if defined(__x86_64) || defined(__x86_64__)
313# define ROTR(a,n) ({ SHA_LONG64 ret; \
314 asm ("rorq %1,%0" \
315 : "=r"(ret) \
316 : "J"(n),"0"(a) \
317 : "cc"); ret; })
318# define PULL64(x) ({ SHA_LONG64 ret=*((const SHA_LONG64 *)(&(x))); \
319 asm ("bswapq %0" \
320 : "=r"(ret) \
321 : "0"(ret)); ret; })
322# elif (defined(__i386) || defined(__i386__))
323# if defined(I386_ONLY)
324# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\
325 unsigned int hi=p[0],lo=p[1]; \
326 asm("xchgb %%ah,%%al;xchgb %%dh,%%dl;"\
327 "roll $16,%%eax; roll $16,%%edx; "\
328 "xchgb %%ah,%%al;xchgb %%dh,%%dl;" \
329 : "=a"(lo),"=d"(hi) \
330 : "0"(lo),"1"(hi) : "cc"); \
331 ((SHA_LONG64)hi)<<32|lo; })
332# else
333# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\
334 unsigned int hi=p[0],lo=p[1]; \
335 asm ("bswapl %0; bswapl %1;" \
336 : "=r"(lo),"=r"(hi) \
337 : "0"(lo),"1"(hi)); \
338 ((SHA_LONG64)hi)<<32|lo; })
339# endif
340# elif (defined(_ARCH_PPC) && defined(__64BIT__)) || defined(_ARCH_PPC64)
341# define ROTR(a,n) ({ SHA_LONG64 ret; \
342 asm ("rotrdi %0,%1,%2" \
343 : "=r"(ret) \
344 : "r"(a),"K"(n)); ret; })
345# endif
346#endif
347
348#ifndef PULL64
349#define B(x,j) (((SHA_LONG64)(*(((const unsigned char *)(&x))+j)))<<((7-j)*8))
350#define PULL64(x) (B(x,0)|B(x,1)|B(x,2)|B(x,3)|B(x,4)|B(x,5)|B(x,6)|B(x,7))
351#endif
352
353#ifndef ROTR
354#define ROTR(x,s) (((x)>>s) | (x)<<(64-s))
355#endif
356
357#define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
358#define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
359#define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
360#define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
361
362#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
363#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
364
365
366#if defined(__i386) || defined(__i386__) || defined(_M_IX86)
367/*
368 * This code should give better results on 32-bit CPUs with fewer than
369 * ~24 registers, both size- and performance-wise...
370 */
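/*
 * The trick used below: F starts at X+80 and is decremented once per round,
 * so F[0..7] always address the current working variables (A and E are kept
 * in registers and written to F[0]/F[4] each round) and the decrement itself
 * performs the usual a..h shift.  The schedule word for the current round is
 * stored at F[8]; because the window keeps sliding, F[8+k] later addresses
 * the word produced k rounds earlier, which is exactly what the F[8+16-1],
 * F[8+16-14], F[8+16-9] and F[8+16] operands in the second loop rely on.
 * Hence the 9+80 elements.
 */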
371static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num)
372 {
373 const SHA_LONG64 *W=in;
374 SHA_LONG64 A,E,T;
375 SHA_LONG64 X[9+80],*F;
376 int i;
377
378 while (num--) {
379
380 F = X+80;
381 A = ctx->h[0]; F[1] = ctx->h[1];
382 F[2] = ctx->h[2]; F[3] = ctx->h[3];
383 E = ctx->h[4]; F[5] = ctx->h[5];
384 F[6] = ctx->h[6]; F[7] = ctx->h[7];
385
386 for (i=0;i<16;i++,F--)
387 {
388 T = PULL64(W[i]);
389 F[0] = A;
390 F[4] = E;
391 F[8] = T;
392 T += F[7] + Sigma1(E) + Ch(E,F[5],F[6]) + K512[i];
393 E = F[3] + T;
394 A = T + Sigma0(A) + Maj(A,F[1],F[2]);
395 }
396
397 for (;i<80;i++,F--)
398 {
399 T = sigma0(F[8+16-1]);
400 T += sigma1(F[8+16-14]);
401 T += F[8+16] + F[8+16-9];
402
403 F[0] = A;
404 F[4] = E;
405 F[8] = T;
406 T += F[7] + Sigma1(E) + Ch(E,F[5],F[6]) + K512[i];
407 E = F[3] + T;
408 A = T + Sigma0(A) + Maj(A,F[1],F[2]);
409 }
410
411 ctx->h[0] += A; ctx->h[1] += F[1];
412 ctx->h[2] += F[2]; ctx->h[3] += F[3];
413 ctx->h[4] += E; ctx->h[5] += F[5];
414 ctx->h[6] += F[6]; ctx->h[7] += F[7];
415
416 W+=SHA_LBLOCK;
417 }
418 }
419
420#elif defined(OPENSSL_SMALL_FOOTPRINT)
421
422static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num)
423 {
424 const SHA_LONG64 *W=in;
425 SHA_LONG64 a,b,c,d,e,f,g,h,s0,s1,T1,T2;
426 SHA_LONG64 X[16];
427 int i;
428
429 while (num--) {
430
431 a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
432 e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7];
433
434 for (i=0;i<16;i++)
435 {
436#if BYTE_ORDER == BIG_ENDIAN
437 T1 = X[i] = W[i];
438#else
439 T1 = X[i] = PULL64(W[i]);
440#endif
441 T1 += h + Sigma1(e) + Ch(e,f,g) + K512[i];
442 T2 = Sigma0(a) + Maj(a,b,c);
443 h = g; g = f; f = e; e = d + T1;
444 d = c; c = b; b = a; a = T1 + T2;
445 }
446
447 for (;i<80;i++)
448 {
449 s0 = X[(i+1)&0x0f]; s0 = sigma0(s0);
450 s1 = X[(i+14)&0x0f]; s1 = sigma1(s1);
451
452 T1 = X[i&0xf] += s0 + s1 + X[(i+9)&0xf];
453 T1 += h + Sigma1(e) + Ch(e,f,g) + K512[i];
454 T2 = Sigma0(a) + Maj(a,b,c);
455 h = g; g = f; f = e; e = d + T1;
456 d = c; c = b; b = a; a = T1 + T2;
457 }
458
459 ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
460 ctx->h[4] += e; ctx->h[5] += f; ctx->h[6] += g; ctx->h[7] += h;
461
462 W+=SHA_LBLOCK;
463 }
464 }
465
466#else
467
468#define ROUND_00_15(i,a,b,c,d,e,f,g,h) do { \
469 T1 += h + Sigma1(e) + Ch(e,f,g) + K512[i]; \
470 h = Sigma0(a) + Maj(a,b,c); \
471 d += T1; h += T1; } while (0)
472
473#define ROUND_16_80(i,j,a,b,c,d,e,f,g,h,X) do { \
474 s0 = X[(j+1)&0x0f]; s0 = sigma0(s0); \
475 s1 = X[(j+14)&0x0f]; s1 = sigma1(s1); \
476 T1 = X[(j)&0x0f] += s0 + s1 + X[(j+9)&0x0f]; \
477 ROUND_00_15(i+j,a,b,c,d,e,f,g,h); } while (0)
478
479static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num)
480 {
481 const SHA_LONG64 *W=in;
482 SHA_LONG64 a,b,c,d,e,f,g,h,s0,s1,T1;
483 SHA_LONG64 X[16];
484 int i;
485
486 while (num--) {
487
488 a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
489 e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7];
490
491#if BYTE_ORDER == BIG_ENDIAN
492 T1 = X[0] = W[0]; ROUND_00_15(0,a,b,c,d,e,f,g,h);
493 T1 = X[1] = W[1]; ROUND_00_15(1,h,a,b,c,d,e,f,g);
494 T1 = X[2] = W[2]; ROUND_00_15(2,g,h,a,b,c,d,e,f);
495 T1 = X[3] = W[3]; ROUND_00_15(3,f,g,h,a,b,c,d,e);
496 T1 = X[4] = W[4]; ROUND_00_15(4,e,f,g,h,a,b,c,d);
497 T1 = X[5] = W[5]; ROUND_00_15(5,d,e,f,g,h,a,b,c);
498 T1 = X[6] = W[6]; ROUND_00_15(6,c,d,e,f,g,h,a,b);
499 T1 = X[7] = W[7]; ROUND_00_15(7,b,c,d,e,f,g,h,a);
500 T1 = X[8] = W[8]; ROUND_00_15(8,a,b,c,d,e,f,g,h);
501 T1 = X[9] = W[9]; ROUND_00_15(9,h,a,b,c,d,e,f,g);
502 T1 = X[10] = W[10]; ROUND_00_15(10,g,h,a,b,c,d,e,f);
503 T1 = X[11] = W[11]; ROUND_00_15(11,f,g,h,a,b,c,d,e);
504 T1 = X[12] = W[12]; ROUND_00_15(12,e,f,g,h,a,b,c,d);
505 T1 = X[13] = W[13]; ROUND_00_15(13,d,e,f,g,h,a,b,c);
506 T1 = X[14] = W[14]; ROUND_00_15(14,c,d,e,f,g,h,a,b);
507 T1 = X[15] = W[15]; ROUND_00_15(15,b,c,d,e,f,g,h,a);
508#else
509 T1 = X[0] = PULL64(W[0]); ROUND_00_15(0,a,b,c,d,e,f,g,h);
510 T1 = X[1] = PULL64(W[1]); ROUND_00_15(1,h,a,b,c,d,e,f,g);
511 T1 = X[2] = PULL64(W[2]); ROUND_00_15(2,g,h,a,b,c,d,e,f);
512 T1 = X[3] = PULL64(W[3]); ROUND_00_15(3,f,g,h,a,b,c,d,e);
513 T1 = X[4] = PULL64(W[4]); ROUND_00_15(4,e,f,g,h,a,b,c,d);
514 T1 = X[5] = PULL64(W[5]); ROUND_00_15(5,d,e,f,g,h,a,b,c);
515 T1 = X[6] = PULL64(W[6]); ROUND_00_15(6,c,d,e,f,g,h,a,b);
516 T1 = X[7] = PULL64(W[7]); ROUND_00_15(7,b,c,d,e,f,g,h,a);
517 T1 = X[8] = PULL64(W[8]); ROUND_00_15(8,a,b,c,d,e,f,g,h);
518 T1 = X[9] = PULL64(W[9]); ROUND_00_15(9,h,a,b,c,d,e,f,g);
519 T1 = X[10] = PULL64(W[10]); ROUND_00_15(10,g,h,a,b,c,d,e,f);
520 T1 = X[11] = PULL64(W[11]); ROUND_00_15(11,f,g,h,a,b,c,d,e);
521 T1 = X[12] = PULL64(W[12]); ROUND_00_15(12,e,f,g,h,a,b,c,d);
522 T1 = X[13] = PULL64(W[13]); ROUND_00_15(13,d,e,f,g,h,a,b,c);
523 T1 = X[14] = PULL64(W[14]); ROUND_00_15(14,c,d,e,f,g,h,a,b);
524 T1 = X[15] = PULL64(W[15]); ROUND_00_15(15,b,c,d,e,f,g,h,a);
525#endif
526
527 for (i=16;i<80;i+=16)
528 {
529 ROUND_16_80(i, 0,a,b,c,d,e,f,g,h,X);
530 ROUND_16_80(i, 1,h,a,b,c,d,e,f,g,X);
531 ROUND_16_80(i, 2,g,h,a,b,c,d,e,f,X);
532 ROUND_16_80(i, 3,f,g,h,a,b,c,d,e,X);
533 ROUND_16_80(i, 4,e,f,g,h,a,b,c,d,X);
534 ROUND_16_80(i, 5,d,e,f,g,h,a,b,c,X);
535 ROUND_16_80(i, 6,c,d,e,f,g,h,a,b,X);
536 ROUND_16_80(i, 7,b,c,d,e,f,g,h,a,X);
537 ROUND_16_80(i, 8,a,b,c,d,e,f,g,h,X);
538 ROUND_16_80(i, 9,h,a,b,c,d,e,f,g,X);
539 ROUND_16_80(i,10,g,h,a,b,c,d,e,f,X);
540 ROUND_16_80(i,11,f,g,h,a,b,c,d,e,X);
541 ROUND_16_80(i,12,e,f,g,h,a,b,c,d,X);
542 ROUND_16_80(i,13,d,e,f,g,h,a,b,c,X);
543 ROUND_16_80(i,14,c,d,e,f,g,h,a,b,X);
544 ROUND_16_80(i,15,b,c,d,e,f,g,h,a,X);
545 }
546
547 ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
548 ctx->h[4] += e; ctx->h[5] += f; ctx->h[6] += g; ctx->h[7] += h;
549
550 W+=SHA_LBLOCK;
551 }
552 }
553
554#endif
555
556#endif /* SHA512_ASM */
557
558#endif /* !OPENSSL_NO_SHA512 */
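The least obvious lines in SHA512_Update() above are the length bookkeeping around c->Nl and c->Nh. A small stand-alone sketch of just that arithmetic (add_bits is a hypothetical illustration, not part of the library), showing that the pair forms a 128-bit count of message bits:

#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical mirror of the bookkeeping in SHA512_Update(): Nl holds the
 * low 64 bits of the running bit count, Nh the high 64 bits.  Adding `len'
 * bytes adds len*8 bits: the low word gets len<<3, a wrap-around of the low
 * word carries one into Nh, and the three bits shifted out of the top of
 * len<<3 (i.e. len>>61) are added to Nh directly.
 */
static void
add_bits(uint64_t *Nl, uint64_t *Nh, uint64_t len)
{
	uint64_t l = *Nl + (len << 3);

	if (l < *Nl)			/* low word wrapped: carry */
		(*Nh)++;
	*Nh += len >> 61;		/* high bits of len*8 */
	*Nl = l;
}

int
main(void)
{
	uint64_t Nl = 0, Nh = 0;

	add_bits(&Nl, &Nh, 1);				/* 8 bits */
	add_bits(&Nl, &Nh, 0xffffffffffffffffULL);	/* exercises len>>61 */
	/* 8 + (2^64-1)*8 = 2^67 bits, i.e. Nh=8, Nl=0. */
	printf("Nh=%llu Nl=%llu\n",
	    (unsigned long long)Nh, (unsigned long long)Nl);
	return (0);
}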
diff --git a/src/lib/libcrypto/sha/sha_locl.h b/src/lib/libcrypto/sha/sha_locl.h
deleted file mode 100644
index f2f9a31ee7..0000000000
--- a/src/lib/libcrypto/sha/sha_locl.h
+++ /dev/null
@@ -1,435 +0,0 @@
1/* $OpenBSD: sha_locl.h,v 1.19 2014/10/28 07:35:59 jsg Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscape's SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are adhered to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the routines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publicly available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdlib.h>
60#include <string.h>
61
62#include <openssl/opensslconf.h>
63#include <openssl/sha.h>
64
65#define DATA_ORDER_IS_BIG_ENDIAN
66
67#define HASH_LONG SHA_LONG
68#define HASH_CTX SHA_CTX
69#define HASH_CBLOCK SHA_CBLOCK
70#define HASH_MAKE_STRING(c,s) do { \
71 unsigned long ll; \
72 ll=(c)->h0; HOST_l2c(ll,(s)); \
73 ll=(c)->h1; HOST_l2c(ll,(s)); \
74 ll=(c)->h2; HOST_l2c(ll,(s)); \
75 ll=(c)->h3; HOST_l2c(ll,(s)); \
76 ll=(c)->h4; HOST_l2c(ll,(s)); \
77 } while (0)
78
79#if defined(SHA_0)
80
81# define HASH_UPDATE SHA_Update
82# define HASH_TRANSFORM SHA_Transform
83# define HASH_FINAL SHA_Final
84# define HASH_INIT SHA_Init
85# define HASH_BLOCK_DATA_ORDER sha_block_data_order
86# define Xupdate(a,ix,ia,ib,ic,id) (ix=(a)=(ia^ib^ic^id))
87
88static void sha_block_data_order (SHA_CTX *c, const void *p,size_t num);
89
90#elif defined(SHA_1)
91
92# define HASH_UPDATE SHA1_Update
93# define HASH_TRANSFORM SHA1_Transform
94# define HASH_FINAL SHA1_Final
95# define HASH_INIT SHA1_Init
96# define HASH_BLOCK_DATA_ORDER sha1_block_data_order
97# define Xupdate(a,ix,ia,ib,ic,id) ( (a)=(ia^ib^ic^id), \
98 ix=(a)=ROTATE((a),1) \
99 )
100
101#ifndef SHA1_ASM
102static
103#endif
104void sha1_block_data_order (SHA_CTX *c, const void *p,size_t num);
105
106#else
107# error "Either SHA_0 or SHA_1 must be defined."
108#endif
109
110#include "md32_common.h"
111
112#define INIT_DATA_h0 0x67452301UL
113#define INIT_DATA_h1 0xefcdab89UL
114#define INIT_DATA_h2 0x98badcfeUL
115#define INIT_DATA_h3 0x10325476UL
116#define INIT_DATA_h4 0xc3d2e1f0UL
117
118#ifdef SHA_0
119int SHA_Init(SHA_CTX *c)
120#else
121int SHA1_Init(SHA_CTX *c)
122#endif
123 {
124 memset (c,0,sizeof(*c));
125 c->h0=INIT_DATA_h0;
126 c->h1=INIT_DATA_h1;
127 c->h2=INIT_DATA_h2;
128 c->h3=INIT_DATA_h3;
129 c->h4=INIT_DATA_h4;
130 return 1;
131 }
132
133#define K_00_19 0x5a827999UL
134#define K_20_39 0x6ed9eba1UL
135#define K_40_59 0x8f1bbcdcUL
136#define K_60_79 0xca62c1d6UL
137
138/* As pointed out by Wei Dai <weidai@eskimo.com>, F() below can be
139 * simplified to the code in F_00_19. Wei attributes these optimisations
140 * to Peter Gutmann's SHS code, and Gutmann in turn attributes it to Rich Schroeppel.
141 * #define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
142 * I've just become aware of another tweak to be made, again from Wei Dai,
143 * in F_40_59, (x&a)|(y&a) -> (x|y)&a
144 */
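/*
 * For reference, both identities are easy to verify bitwise: with b=1,
 * ((c^d)&b)^d = (c^d)^d = c and (b&c)|(~b&d) = c; with b=0 both reduce to d.
 * For the majority function, (b&d)|(c&d) = (b|c)&d by distributivity, so
 * (b&c)|(b&d)|(c&d) = (b&c)|((b|c)&d), which is F_40_59 below.
 */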
145#define F_00_19(b,c,d) ((((c) ^ (d)) & (b)) ^ (d))
146#define F_20_39(b,c,d) ((b) ^ (c) ^ (d))
147#define F_40_59(b,c,d) (((b) & (c)) | (((b)|(c)) & (d)))
148#define F_60_79(b,c,d) F_20_39(b,c,d)
149
150#ifndef OPENSSL_SMALL_FOOTPRINT
151
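/*
 * In the unrolled code below the five working variables are never shifted
 * explicitly.  Instead, each BODY_* invocation receives the registers in an
 * order rotated by one relative to the previous round (A,B,C,D,E,T ->
 * T,A,B,C,D,E -> ...), with the sixth argument acting as the scratch word
 * that receives the new "a" value for the next round.
 */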
152#define BODY_00_15(i,a,b,c,d,e,f,xi) \
153 (f)=xi+(e)+K_00_19+ROTATE((a),5)+F_00_19((b),(c),(d)); \
154 (b)=ROTATE((b),30);
155
156#define BODY_16_19(i,a,b,c,d,e,f,xi,xa,xb,xc,xd) \
157 Xupdate(f,xi,xa,xb,xc,xd); \
158 (f)+=(e)+K_00_19+ROTATE((a),5)+F_00_19((b),(c),(d)); \
159 (b)=ROTATE((b),30);
160
161#define BODY_20_31(i,a,b,c,d,e,f,xi,xa,xb,xc,xd) \
162 Xupdate(f,xi,xa,xb,xc,xd); \
163 (f)+=(e)+K_20_39+ROTATE((a),5)+F_20_39((b),(c),(d)); \
164 (b)=ROTATE((b),30);
165
166#define BODY_32_39(i,a,b,c,d,e,f,xa,xb,xc,xd) \
167 Xupdate(f,xa,xa,xb,xc,xd); \
168 (f)+=(e)+K_20_39+ROTATE((a),5)+F_20_39((b),(c),(d)); \
169 (b)=ROTATE((b),30);
170
171#define BODY_40_59(i,a,b,c,d,e,f,xa,xb,xc,xd) \
172 Xupdate(f,xa,xa,xb,xc,xd); \
173 (f)+=(e)+K_40_59+ROTATE((a),5)+F_40_59((b),(c),(d)); \
174 (b)=ROTATE((b),30);
175
176#define BODY_60_79(i,a,b,c,d,e,f,xa,xb,xc,xd) \
177 Xupdate(f,xa,xa,xb,xc,xd); \
178 (f)=xa+(e)+K_60_79+ROTATE((a),5)+F_60_79((b),(c),(d)); \
179 (b)=ROTATE((b),30);
180
181#ifdef X
182#undef X
183#endif
184#ifndef MD32_XARRAY
185 /*
186 * Originally X was an array. As it's automatic, it's natural
187 * to expect a RISC compiler to accommodate at least part of it in
188 * the register bank, isn't it? Unfortunately, not all compilers
189 * "find" this expectation reasonable:-( In order to make such
190 * compilers generate better code, I replace X[] with a bunch of
191 * X0, X1, etc. See the function body below...
192 * <appro@fy.chalmers.se>
193 */
194# define X(i) XX##i
195#else
196 /*
197 * However! Some compilers (most notably HP C) get overwhelmed by
198 * that many local variables, so we have to have a way to
199 * fall back to the original behavior.
200 */
201# define X(i) XX[i]
202#endif
203
204#if !defined(SHA_1) || !defined(SHA1_ASM)
205#include <machine/endian.h>
206static void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num)
207 {
208 const unsigned char *data=p;
209 unsigned MD32_REG_T A,B,C,D,E,T,l;
210#ifndef MD32_XARRAY
211 unsigned MD32_REG_T XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7,
212 XX8, XX9,XX10,XX11,XX12,XX13,XX14,XX15;
213#else
214 SHA_LONG XX[16];
215#endif
216
217 A=c->h0;
218 B=c->h1;
219 C=c->h2;
220 D=c->h3;
221 E=c->h4;
222
223 for (;;)
224 {
225
226 if (BYTE_ORDER != LITTLE_ENDIAN &&
227 sizeof(SHA_LONG)==4 && ((size_t)p%4)==0)
228 {
229 const SHA_LONG *W=(const SHA_LONG *)data;
230
231 X( 0) = W[0]; X( 1) = W[ 1];
232 BODY_00_15( 0,A,B,C,D,E,T,X( 0)); X( 2) = W[ 2];
233 BODY_00_15( 1,T,A,B,C,D,E,X( 1)); X( 3) = W[ 3];
234 BODY_00_15( 2,E,T,A,B,C,D,X( 2)); X( 4) = W[ 4];
235 BODY_00_15( 3,D,E,T,A,B,C,X( 3)); X( 5) = W[ 5];
236 BODY_00_15( 4,C,D,E,T,A,B,X( 4)); X( 6) = W[ 6];
237 BODY_00_15( 5,B,C,D,E,T,A,X( 5)); X( 7) = W[ 7];
238 BODY_00_15( 6,A,B,C,D,E,T,X( 6)); X( 8) = W[ 8];
239 BODY_00_15( 7,T,A,B,C,D,E,X( 7)); X( 9) = W[ 9];
240 BODY_00_15( 8,E,T,A,B,C,D,X( 8)); X(10) = W[10];
241 BODY_00_15( 9,D,E,T,A,B,C,X( 9)); X(11) = W[11];
242 BODY_00_15(10,C,D,E,T,A,B,X(10)); X(12) = W[12];
243 BODY_00_15(11,B,C,D,E,T,A,X(11)); X(13) = W[13];
244 BODY_00_15(12,A,B,C,D,E,T,X(12)); X(14) = W[14];
245 BODY_00_15(13,T,A,B,C,D,E,X(13)); X(15) = W[15];
246 BODY_00_15(14,E,T,A,B,C,D,X(14));
247 BODY_00_15(15,D,E,T,A,B,C,X(15));
248
249 data += SHA_CBLOCK;
250 }
251 else
252 {
253 HOST_c2l(data,l); X( 0)=l; HOST_c2l(data,l); X( 1)=l;
254 BODY_00_15( 0,A,B,C,D,E,T,X( 0)); HOST_c2l(data,l); X( 2)=l;
255 BODY_00_15( 1,T,A,B,C,D,E,X( 1)); HOST_c2l(data,l); X( 3)=l;
256 BODY_00_15( 2,E,T,A,B,C,D,X( 2)); HOST_c2l(data,l); X( 4)=l;
257 BODY_00_15( 3,D,E,T,A,B,C,X( 3)); HOST_c2l(data,l); X( 5)=l;
258 BODY_00_15( 4,C,D,E,T,A,B,X( 4)); HOST_c2l(data,l); X( 6)=l;
259 BODY_00_15( 5,B,C,D,E,T,A,X( 5)); HOST_c2l(data,l); X( 7)=l;
260 BODY_00_15( 6,A,B,C,D,E,T,X( 6)); HOST_c2l(data,l); X( 8)=l;
261 BODY_00_15( 7,T,A,B,C,D,E,X( 7)); HOST_c2l(data,l); X( 9)=l;
262 BODY_00_15( 8,E,T,A,B,C,D,X( 8)); HOST_c2l(data,l); X(10)=l;
263 BODY_00_15( 9,D,E,T,A,B,C,X( 9)); HOST_c2l(data,l); X(11)=l;
264 BODY_00_15(10,C,D,E,T,A,B,X(10)); HOST_c2l(data,l); X(12)=l;
265 BODY_00_15(11,B,C,D,E,T,A,X(11)); HOST_c2l(data,l); X(13)=l;
266 BODY_00_15(12,A,B,C,D,E,T,X(12)); HOST_c2l(data,l); X(14)=l;
267 BODY_00_15(13,T,A,B,C,D,E,X(13)); HOST_c2l(data,l); X(15)=l;
268 BODY_00_15(14,E,T,A,B,C,D,X(14));
269 BODY_00_15(15,D,E,T,A,B,C,X(15));
270 }
271
272 BODY_16_19(16,C,D,E,T,A,B,X( 0),X( 0),X( 2),X( 8),X(13));
273 BODY_16_19(17,B,C,D,E,T,A,X( 1),X( 1),X( 3),X( 9),X(14));
274 BODY_16_19(18,A,B,C,D,E,T,X( 2),X( 2),X( 4),X(10),X(15));
275 BODY_16_19(19,T,A,B,C,D,E,X( 3),X( 3),X( 5),X(11),X( 0));
276
277 BODY_20_31(20,E,T,A,B,C,D,X( 4),X( 4),X( 6),X(12),X( 1));
278 BODY_20_31(21,D,E,T,A,B,C,X( 5),X( 5),X( 7),X(13),X( 2));
279 BODY_20_31(22,C,D,E,T,A,B,X( 6),X( 6),X( 8),X(14),X( 3));
280 BODY_20_31(23,B,C,D,E,T,A,X( 7),X( 7),X( 9),X(15),X( 4));
281 BODY_20_31(24,A,B,C,D,E,T,X( 8),X( 8),X(10),X( 0),X( 5));
282 BODY_20_31(25,T,A,B,C,D,E,X( 9),X( 9),X(11),X( 1),X( 6));
283 BODY_20_31(26,E,T,A,B,C,D,X(10),X(10),X(12),X( 2),X( 7));
284 BODY_20_31(27,D,E,T,A,B,C,X(11),X(11),X(13),X( 3),X( 8));
285 BODY_20_31(28,C,D,E,T,A,B,X(12),X(12),X(14),X( 4),X( 9));
286 BODY_20_31(29,B,C,D,E,T,A,X(13),X(13),X(15),X( 5),X(10));
287 BODY_20_31(30,A,B,C,D,E,T,X(14),X(14),X( 0),X( 6),X(11));
288 BODY_20_31(31,T,A,B,C,D,E,X(15),X(15),X( 1),X( 7),X(12));
289
290 BODY_32_39(32,E,T,A,B,C,D,X( 0),X( 2),X( 8),X(13));
291 BODY_32_39(33,D,E,T,A,B,C,X( 1),X( 3),X( 9),X(14));
292 BODY_32_39(34,C,D,E,T,A,B,X( 2),X( 4),X(10),X(15));
293 BODY_32_39(35,B,C,D,E,T,A,X( 3),X( 5),X(11),X( 0));
294 BODY_32_39(36,A,B,C,D,E,T,X( 4),X( 6),X(12),X( 1));
295 BODY_32_39(37,T,A,B,C,D,E,X( 5),X( 7),X(13),X( 2));
296 BODY_32_39(38,E,T,A,B,C,D,X( 6),X( 8),X(14),X( 3));
297 BODY_32_39(39,D,E,T,A,B,C,X( 7),X( 9),X(15),X( 4));
298
299 BODY_40_59(40,C,D,E,T,A,B,X( 8),X(10),X( 0),X( 5));
300 BODY_40_59(41,B,C,D,E,T,A,X( 9),X(11),X( 1),X( 6));
301 BODY_40_59(42,A,B,C,D,E,T,X(10),X(12),X( 2),X( 7));
302 BODY_40_59(43,T,A,B,C,D,E,X(11),X(13),X( 3),X( 8));
303 BODY_40_59(44,E,T,A,B,C,D,X(12),X(14),X( 4),X( 9));
304 BODY_40_59(45,D,E,T,A,B,C,X(13),X(15),X( 5),X(10));
305 BODY_40_59(46,C,D,E,T,A,B,X(14),X( 0),X( 6),X(11));
306 BODY_40_59(47,B,C,D,E,T,A,X(15),X( 1),X( 7),X(12));
307 BODY_40_59(48,A,B,C,D,E,T,X( 0),X( 2),X( 8),X(13));
308 BODY_40_59(49,T,A,B,C,D,E,X( 1),X( 3),X( 9),X(14));
309 BODY_40_59(50,E,T,A,B,C,D,X( 2),X( 4),X(10),X(15));
310 BODY_40_59(51,D,E,T,A,B,C,X( 3),X( 5),X(11),X( 0));
311 BODY_40_59(52,C,D,E,T,A,B,X( 4),X( 6),X(12),X( 1));
312 BODY_40_59(53,B,C,D,E,T,A,X( 5),X( 7),X(13),X( 2));
313 BODY_40_59(54,A,B,C,D,E,T,X( 6),X( 8),X(14),X( 3));
314 BODY_40_59(55,T,A,B,C,D,E,X( 7),X( 9),X(15),X( 4));
315 BODY_40_59(56,E,T,A,B,C,D,X( 8),X(10),X( 0),X( 5));
316 BODY_40_59(57,D,E,T,A,B,C,X( 9),X(11),X( 1),X( 6));
317 BODY_40_59(58,C,D,E,T,A,B,X(10),X(12),X( 2),X( 7));
318 BODY_40_59(59,B,C,D,E,T,A,X(11),X(13),X( 3),X( 8));
319
320 BODY_60_79(60,A,B,C,D,E,T,X(12),X(14),X( 4),X( 9));
321 BODY_60_79(61,T,A,B,C,D,E,X(13),X(15),X( 5),X(10));
322 BODY_60_79(62,E,T,A,B,C,D,X(14),X( 0),X( 6),X(11));
323 BODY_60_79(63,D,E,T,A,B,C,X(15),X( 1),X( 7),X(12));
324 BODY_60_79(64,C,D,E,T,A,B,X( 0),X( 2),X( 8),X(13));
325 BODY_60_79(65,B,C,D,E,T,A,X( 1),X( 3),X( 9),X(14));
326 BODY_60_79(66,A,B,C,D,E,T,X( 2),X( 4),X(10),X(15));
327 BODY_60_79(67,T,A,B,C,D,E,X( 3),X( 5),X(11),X( 0));
328 BODY_60_79(68,E,T,A,B,C,D,X( 4),X( 6),X(12),X( 1));
329 BODY_60_79(69,D,E,T,A,B,C,X( 5),X( 7),X(13),X( 2));
330 BODY_60_79(70,C,D,E,T,A,B,X( 6),X( 8),X(14),X( 3));
331 BODY_60_79(71,B,C,D,E,T,A,X( 7),X( 9),X(15),X( 4));
332 BODY_60_79(72,A,B,C,D,E,T,X( 8),X(10),X( 0),X( 5));
333 BODY_60_79(73,T,A,B,C,D,E,X( 9),X(11),X( 1),X( 6));
334 BODY_60_79(74,E,T,A,B,C,D,X(10),X(12),X( 2),X( 7));
335 BODY_60_79(75,D,E,T,A,B,C,X(11),X(13),X( 3),X( 8));
336 BODY_60_79(76,C,D,E,T,A,B,X(12),X(14),X( 4),X( 9));
337 BODY_60_79(77,B,C,D,E,T,A,X(13),X(15),X( 5),X(10));
338 BODY_60_79(78,A,B,C,D,E,T,X(14),X( 0),X( 6),X(11));
339 BODY_60_79(79,T,A,B,C,D,E,X(15),X( 1),X( 7),X(12));
340
341 c->h0=(c->h0+E)&0xffffffffL;
342 c->h1=(c->h1+T)&0xffffffffL;
343 c->h2=(c->h2+A)&0xffffffffL;
344 c->h3=(c->h3+B)&0xffffffffL;
345 c->h4=(c->h4+C)&0xffffffffL;
346
347 if (--num == 0) break;
348
349 A=c->h0;
350 B=c->h1;
351 C=c->h2;
352 D=c->h3;
353 E=c->h4;
354
355 }
356 }
357#endif
358
359#else /* OPENSSL_SMALL_FOOTPRINT */
360
361#define BODY_00_15(xi) do { \
362 T=E+K_00_19+F_00_19(B,C,D); \
363 E=D, D=C, C=ROTATE(B,30), B=A; \
364 A=ROTATE(A,5)+T+xi; } while(0)
365
366#define BODY_16_19(xa,xb,xc,xd) do { \
367 Xupdate(T,xa,xa,xb,xc,xd); \
368 T+=E+K_00_19+F_00_19(B,C,D); \
369 E=D, D=C, C=ROTATE(B,30), B=A; \
370 A=ROTATE(A,5)+T; } while(0)
371
372#define BODY_20_39(xa,xb,xc,xd) do { \
373 Xupdate(T,xa,xa,xb,xc,xd); \
374 T+=E+K_20_39+F_20_39(B,C,D); \
375 E=D, D=C, C=ROTATE(B,30), B=A; \
376 A=ROTATE(A,5)+T; } while(0)
377
378#define BODY_40_59(xa,xb,xc,xd) do { \
379 Xupdate(T,xa,xa,xb,xc,xd); \
380 T+=E+K_40_59+F_40_59(B,C,D); \
381 E=D, D=C, C=ROTATE(B,30), B=A; \
382 A=ROTATE(A,5)+T; } while(0)
383
384#define BODY_60_79(xa,xb,xc,xd) do { \
385 Xupdate(T,xa,xa,xb,xc,xd); \
386 T=E+K_60_79+F_60_79(B,C,D); \
387 E=D, D=C, C=ROTATE(B,30), B=A; \
388 A=ROTATE(A,5)+T+xa; } while(0)
389
390#if !defined(SHA_1) || !defined(SHA1_ASM)
391static void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num)
392 {
393 const unsigned char *data=p;
394 unsigned MD32_REG_T A,B,C,D,E,T,l;
395 int i;
396 SHA_LONG X[16];
397
398 A=c->h0;
399 B=c->h1;
400 C=c->h2;
401 D=c->h3;
402 E=c->h4;
403
404 for (;;)
405 {
406 for (i=0;i<16;i++)
407 { HOST_c2l(data,l); X[i]=l; BODY_00_15(X[i]); }
408 for (i=0;i<4;i++)
409 { BODY_16_19(X[i], X[i+2], X[i+8], X[(i+13)&15]); }
410 for (;i<24;i++)
411 { BODY_20_39(X[i&15], X[(i+2)&15], X[(i+8)&15],X[(i+13)&15]); }
412 for (i=0;i<20;i++)
413 { BODY_40_59(X[(i+8)&15],X[(i+10)&15],X[i&15], X[(i+5)&15]); }
414 for (i=4;i<24;i++)
415 { BODY_60_79(X[(i+8)&15],X[(i+10)&15],X[i&15], X[(i+5)&15]); }
416
417 c->h0=(c->h0+A)&0xffffffffL;
418 c->h1=(c->h1+B)&0xffffffffL;
419 c->h2=(c->h2+C)&0xffffffffL;
420 c->h3=(c->h3+D)&0xffffffffL;
421 c->h4=(c->h4+E)&0xffffffffL;
422
423 if (--num == 0) break;
424
425 A=c->h0;
426 B=c->h1;
427 C=c->h2;
428 D=c->h3;
429 E=c->h4;
430
431 }
432 }
433#endif
434
435#endif