commit     726818f36b5221c023cd04c4b90bdbc08e94cd96 (patch)
author     cvs2svn <admin@example.com>  2014-02-27 21:04:58 +0000
committer  cvs2svn <admin@example.com>  2014-02-27 21:04:58 +0000
tree       cf8221f3aa5bf5a578ddf1ecf5677ad08c04d342  /src/lib/libcrypto/sha
parent     3b6d92e82b1421b811bcdec7f7fdfb31eeef18de (diff)
This commit was manufactured by cvs2git to create tag 'OPENBSD_5_5_BASE'.
Diffstat (limited to 'src/lib/libcrypto/sha')
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-586.pl          1229
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-alpha.pl         322
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-armv4-large.pl   248
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-ia64.pl          304
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-mips.pl          354
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-parisc.pl        259
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha1-ppc.pl           326
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-s390x.pl         246
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-sparcv9.pl       284
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl      601
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-thumb.pl         259
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha1-x86_64.pl       1260
-rw-r--r--  src/lib/libcrypto/sha/asm/sha256-586.pl         249
-rw-r--r--  src/lib/libcrypto/sha/asm/sha256-armv4.pl       211
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-586.pl         644
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-armv4.pl       582
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-ia64.pl        672
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-mips.pl        455
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-parisc.pl      791
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-ppc.pl         460
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-s390x.pl       322
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-sparcv9.pl     594
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-x86_64.pl      450
-rw-r--r--  src/lib/libcrypto/sha/sha.h                     214
-rw-r--r--  src/lib/libcrypto/sha/sha1_one.c                 78
-rw-r--r--  src/lib/libcrypto/sha/sha1dgst.c                 75
-rw-r--r--  src/lib/libcrypto/sha/sha256.c                  282
-rw-r--r--  src/lib/libcrypto/sha/sha512.c                  604
-rw-r--r--  src/lib/libcrypto/sha/sha_locl.h                441
29 files changed, 0 insertions, 12816 deletions
diff --git a/src/lib/libcrypto/sha/asm/sha1-586.pl b/src/lib/libcrypto/sha/asm/sha1-586.pl
deleted file mode 100644
index 1084d227fe..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-586.pl
+++ /dev/null
@@ -1,1229 +0,0 @@
#!/usr/bin/env perl

# ====================================================================
# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

10# "[Re]written" was achieved in two major overhauls. In 2004 BODY_*
11# functions were re-implemented to address P4 performance issue [see
12# commentary below], and in 2006 the rest was rewritten in order to
13# gain freedom to liberate licensing terms.
14
# January, September 2004.
#
# It was noted that the Intel IA-32 C compiler generates code which
# performs ~30% *faster* on the P4 CPU than the original *hand-coded*
# SHA1 assembler implementation. To address this problem (and
# prove that humans are still better than machines:-), the
# original code was overhauled, which resulted in the following
# performance changes:
#
#		compared with original	compared with Intel cc
#		assembler impl.		generated code
# Pentium	-16%			+48%
# PIII/AMD	+8%			+16%
# P4		+85%(!)			+45%
#
# As you can see, the Pentium came out as the loser:-( Yet I reckoned
# that the improvement on P4 outweighs the loss, and incorporated this
# re-tuned code into 0.9.7 and later.
# ----------------------------------------------------------------
# <appro@fy.chalmers.se>

# August 2009.
#
# George Spelvin has pointed out that F_40_59(b,c,d) can be rewritten
# as '(c&d) + (b&(c^d))', which makes it possible to accumulate partial
# results and lighten the "pressure" on scratch registers. This resulted
# in a >12% performance improvement on contemporary AMD cores (with no
# degradation on other CPUs:-). Also, the code was revised to maximize
# the "distance" between instructions producing input to the 'lea'
# instruction and the 'lea' instruction itself, which is essential for
# the Intel Atom core and resulted in a ~15% improvement.

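# A quick exhaustive check of that identity (an illustrative aside,
# not part of the original module; bits only, so plain Perl integers
# suffice): the canonical majority function and the rewritten form
# agree on all eight input combinations, and the rewritten form
# exposes an addition whose '(c&d)' half can be accumulated early.
for my $bb (0,1) { for my $cc (0,1) { for my $dd (0,1) {
	my $maj = ($bb&$cc)|($bb&$dd)|($cc&$dd);	# canonical F_40_59
	my $sum = ($cc&$dd) + ($bb&($cc^$dd));		# Spelvin's form
	die "F_40_59 identity broken" if $maj != $sum;
}}}
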
# October 2010.
#
# Add an SSSE3, Supplemental[!] SSE3, implementation. The idea behind
# it is to offload the message schedule, denoted by Wt in the NIST
# specification, or Xupdate in OpenSSL source, to the SIMD unit. The
# idea is not novel, and in the SSE2 context was first explored by Dean
# Gaudet in 2004, see http://arctic.org/~dean/crypto/sha1.html. Since
# then several things have changed that made it interesting again:
#
# a) XMM units became faster and wider;
# b) the instruction set became more versatile;
# c) an important observation was made by Max Locktyukhin, which made
#    it possible to reduce the number of instructions required to
#    perform the operation in question, for further details see
#    http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/.

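# For reference, the scalar recurrence being offloaded (an
# illustrative sketch, not part of the original module) is the
# standard SHA-1 message schedule over a 16-word ring buffer, which
# the SIMD code below evaluates four elements at a time:
sub Xupdate_scalar_sketch {
	my @W = @_;			# W[0..15], 32-bit words
	for my $t (16..79) {
		my $x = $W[($t-3)%16] ^ $W[($t-8)%16] ^
			$W[($t-14)%16] ^ $W[($t-16)%16];
		$W[$t%16] = (($x<<1)|($x>>31)) & 0xffffffff;	# ROTATE(x,1)
	}
	@W;
}
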
# April 2011.
#
# Add the AVX code path, probably the most controversial one... The
# thing is that switching to AVX alone improves performance by as
# little as 4% in comparison to the SSSE3 code path. But the result
# below doesn't look like a mere 4% improvement... The trouble is that
# Sandy Bridge decodes 'ro[rl]' as a pair of µ-ops, and it's the
# additional µ-ops, two per round, that make it run slower than Core2
# and Westmere. But 'sh[rl]d' is decoded as a single µ-op by Sandy
# Bridge, and it's replacing 'ro[rl]' with the equivalent 'sh[rl]d'
# that is responsible for the impressive 5.1 cycles per processed
# byte. But 'sh[rl]d' is not something that used to be fast, nor does
# it appear to be fast on the upcoming Bulldozer [according to its
# optimization manual]. Which is why the AVX code path is guarded by
# *both* the AVX bit and a synthetic bit denoting Intel CPUs. One can
# argue that it's unfair to AMD, but without 'sh[rl]d' it makes no
# sense to keep the AVX code path. If somebody feels that strongly,
# it's probably more appropriate to discuss the possibility of using
# the vector rotate XOP on AMD...

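# To make the substitution concrete (an illustrative aside, not part
# of the original module): 'shld r,r,n' with both operands naming the
# same register computes exactly 'rol r,n', a 32-bit left rotation,
# which is what makes the two interchangeable here:
sub rol32_sketch  { my($x,$n)=@_; (($x<<$n)|($x>>(32-$n))) & 0xffffffff }
sub shld32_sketch { my($hi,$lo,$n)=@_; (($hi<<$n)|($lo>>(32-$n))) & 0xffffffff }
die "rol/shld sketch broken"
	if rol32_sketch(0xdeadbeef,5) != shld32_sketch(0xdeadbeef,0xdeadbeef,5);
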
######################################################################
# Current performance is summarized in the following table. Numbers
# are CPU clock cycles spent to process a single byte (less is better).
#
#		x86	SSSE3		AVX
# Pentium	15.7	-
# PIII		11.5	-
# P4		10.6	-
# AMD K8	7.1	-
# Core2		7.3	6.1/+20%	-
# Atom		12.5	9.5(*)/+32%	-
# Westmere	7.3	5.6/+30%	-
# Sandy Bridge	8.8	6.2/+40%	5.1(**)/+70%
#
# (*)	The loop is 1056 instructions long and the expected result is
#	~8.25. It remains a mystery [to me] why ILP is limited to 1.7.
#
# (**)	As per the above comment, the result is for AVX *plus* sh[rl]d.

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");

$xmm=$ymm=0;
for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }

$ymm=1 if ($xmm &&
	`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
	$1>=2.19);	# first version supporting AVX

$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
	$1>=2.03);	# first version supporting AVX

&external_label("OPENSSL_ia32cap_P") if ($xmm);


$A="eax";
$B="ebx";
$C="ecx";
$D="edx";
$E="edi";
$T="esi";
$tmp1="ebp";

@V=($A,$B,$C,$D,$E,$T);

$alt=0;	# 1 denotes alternative IALU implementation, which performs
	# 8% *worse* on P4, same on Westmere and Atom, 2% better on
	# Sandy Bridge...

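# For orientation (an illustrative aside, not part of the original
# module), the boolean functions the BODY_* generators below compute
# are the standard SHA-1 round functions:
sub F_00_19_sketch { my($b,$c,$d)=@_; ($b&$c)|((~$b)&$d) }		# "choose"
sub F_20_39_sketch { my($b,$c,$d)=@_; $b^$c^$d }			# "parity"
sub F_40_59_sketch { my($b,$c,$d)=@_; ($b&$c)|($b&$d)|($c&$d) }	# "majority"
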
sub BODY_00_15
	{
	local($n,$a,$b,$c,$d,$e,$f)=@_;

	&comment("00_15 $n");

	&mov($f,$c);			# f to hold F_00_19(b,c,d)
	if ($n==0)  { &mov($tmp1,$a); }
	else        { &mov($a,$tmp1); }
	&rotl($tmp1,5);			# tmp1=ROTATE(a,5)
	&xor($f,$d);
	&add($tmp1,$e);			# tmp1+=e;
	&mov($e,&swtmp($n%16));		# e becomes volatile and is loaded
					# with xi, also note that e becomes
					# f in next round...
	&and($f,$b);
	&rotr($b,2);			# b=ROTATE(b,30)
	&xor($f,$d);			# f holds F_00_19(b,c,d)
	&lea($tmp1,&DWP(0x5a827999,$tmp1,$e));	# tmp1+=K_00_19+xi

	if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round
		      &add($f,$tmp1); }	# f+=tmp1
	else        { &add($tmp1,$f); }	# f becomes a in next round
	&mov($tmp1,$a)			if ($alt && $n==15);
	}

sub BODY_16_19
	{
	local($n,$a,$b,$c,$d,$e,$f)=@_;

	&comment("16_19 $n");

if ($alt) {
	&xor($c,$d);
	&xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&and($tmp1,$c);			# tmp1 to hold F_00_19(b,c,d), b&=c^d
	&xor($f,&swtmp(($n+8)%16));
	&xor($tmp1,$d);			# tmp1=F_00_19(b,c,d)
	&xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	&add($e,$tmp1);			# e+=F_00_19(b,c,d)
	&xor($c,$d);			# restore $c
	&mov($tmp1,$a);			# b in next round
	&rotr($b,$n==16?2:7);		# b=ROTATE(b,30)
	&mov(&swtmp($n%16),$f);		# xi=f
	&rotl($a,5);			# ROTATE(a,5)
	&lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
	&add($f,$a);			# f+=ROTATE(a,5)
} else {
	&mov($tmp1,$c);			# tmp1 to hold F_00_19(b,c,d)
	&xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&xor($tmp1,$d);
	&xor($f,&swtmp(($n+8)%16));
	&and($tmp1,$b);
	&xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	&xor($tmp1,$d);			# tmp1=F_00_19(b,c,d)
	&add($e,$tmp1);			# e+=F_00_19(b,c,d)
	&mov($tmp1,$a);
	&rotr($b,2);			# b=ROTATE(b,30)
	&mov(&swtmp($n%16),$f);		# xi=f
	&rotl($tmp1,5);			# ROTATE(a,5)
	&lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
	&add($f,$tmp1);			# f+=ROTATE(a,5)
}
	}

sub BODY_20_39
	{
	local($n,$a,$b,$c,$d,$e,$f)=@_;
	local $K=($n<40)?0x6ed9eba1:0xca62c1d6;

	&comment("20_39 $n");

if ($alt) {
	&xor($tmp1,$c);			# tmp1 to hold F_20_39(b,c,d), b^=c
	&xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&xor($tmp1,$d);			# tmp1 holds F_20_39(b,c,d)
	&xor($f,&swtmp(($n+8)%16));
	&add($e,$tmp1);			# e+=F_20_39(b,c,d)
	&xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	&mov($tmp1,$a);			# b in next round
	&rotr($b,7);			# b=ROTATE(b,30)
	&mov(&swtmp($n%16),$f) if($n<77);# xi=f
	&rotl($a,5);			# ROTATE(a,5)
	&xor($b,$c) if($n==39);		# warm up for BODY_40_59
	&and($tmp1,$b) if($n==39);
	&lea($f,&DWP($K,$f,$e));	# f+=e+K_XX_YY
	&mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
	&add($f,$a);			# f+=ROTATE(a,5)
	&rotr($a,5) if ($n==79);
} else {
	&mov($tmp1,$b);			# tmp1 to hold F_20_39(b,c,d)
	&xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&xor($tmp1,$c);
	&xor($f,&swtmp(($n+8)%16));
	&xor($tmp1,$d);			# tmp1 holds F_20_39(b,c,d)
	&xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	&add($e,$tmp1);			# e+=F_20_39(b,c,d)
	&rotr($b,2);			# b=ROTATE(b,30)
	&mov($tmp1,$a);
	&rotl($tmp1,5);			# ROTATE(a,5)
	&mov(&swtmp($n%16),$f) if($n<77);# xi=f
	&lea($f,&DWP($K,$f,$e));	# f+=e+K_XX_YY
	&mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
	&add($f,$tmp1);			# f+=ROTATE(a,5)
}
	}

sub BODY_40_59
	{
	local($n,$a,$b,$c,$d,$e,$f)=@_;

	&comment("40_59 $n");

if ($alt) {
	&add($e,$tmp1);			# e+=b&(c^d)
	&xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&mov($tmp1,$d);
	&xor($f,&swtmp(($n+8)%16));
	&xor($c,$d);			# restore $c
	&xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	&and($tmp1,$c);
	&rotr($b,7);			# b=ROTATE(b,30)
	&add($e,$tmp1);			# e+=c&d
	&mov($tmp1,$a);			# b in next round
	&mov(&swtmp($n%16),$f);		# xi=f
	&rotl($a,5);			# ROTATE(a,5)
	&xor($b,$c) if ($n<59);
	&and($tmp1,$b) if ($n<59);	# tmp1 to hold F_40_59(b,c,d)
	&lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d))
	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
	&add($f,$a);			# f+=ROTATE(a,5)
} else {
	&mov($tmp1,$c);			# tmp1 to hold F_40_59(b,c,d)
	&xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&xor($tmp1,$d);
	&xor($f,&swtmp(($n+8)%16));
	&and($tmp1,$b);
	&xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	&add($tmp1,$e);			# b&(c^d)+=e
	&rotr($b,2);			# b=ROTATE(b,30)
	&mov($e,$a);			# e becomes volatile
	&rotl($e,5);			# ROTATE(a,5)
	&mov(&swtmp($n%16),$f);		# xi=f
	&lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d))
	&mov($tmp1,$c);
	&add($f,$e);			# f+=ROTATE(a,5)
	&and($tmp1,$d);
	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
	&add($f,$tmp1);			# f+=c&d
}
	}

&function_begin("sha1_block_data_order");
if ($xmm) {
  &static_label("ssse3_shortcut");
  &static_label("avx_shortcut") if ($ymm);
  &static_label("K_XX_XX");

	&call	(&label("pic_point"));	# make it PIC!
  &set_label("pic_point");
	&blindpop($tmp1);
	&picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point"));
	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));

	&mov	($A,&DWP(0,$T));
	&mov	($D,&DWP(4,$T));
	&test	($D,1<<9);		# check SSSE3 bit
	&jz	(&label("x86"));
	&test	($A,1<<24);		# check FXSR bit
	&jz	(&label("x86"));
	if ($ymm) {
	&and	($D,1<<28);		# mask AVX bit
	&and	($A,1<<30);		# mask "Intel CPU" bit
	&or	($A,$D);
	&cmp	($A,1<<28|1<<30);
	&je	(&label("avx_shortcut"));
	}
	&jmp	(&label("ssse3_shortcut"));
  &set_label("x86",16);
}
	&mov($tmp1,&wparam(0));	# SHA_CTX *c
	&mov($T,&wparam(1));	# const void *input
	&mov($A,&wparam(2));	# size_t num
	&stack_push(16+3);	# allocate X[16]
	&shl($A,6);
	&add($A,$T);
	&mov(&wparam(2),$A);	# pointer beyond the end of input
	&mov($E,&DWP(16,$tmp1));# pre-load E
	&jmp(&label("loop"));

&set_label("loop",16);

	# copy input chunk to X, but reversing byte order!
	for ($i=0; $i<16; $i+=4)
		{
		&mov($A,&DWP(4*($i+0),$T));
		&mov($B,&DWP(4*($i+1),$T));
		&mov($C,&DWP(4*($i+2),$T));
		&mov($D,&DWP(4*($i+3),$T));
		&bswap($A);
		&bswap($B);
		&bswap($C);
		&bswap($D);
		&mov(&swtmp($i+0),$A);
		&mov(&swtmp($i+1),$B);
		&mov(&swtmp($i+2),$C);
		&mov(&swtmp($i+3),$D);
		}
	&mov(&wparam(1),$T);	# redundant in 1st spin

	&mov($A,&DWP(0,$tmp1));	# load SHA_CTX
	&mov($B,&DWP(4,$tmp1));
	&mov($C,&DWP(8,$tmp1));
	&mov($D,&DWP(12,$tmp1));
	# E is pre-loaded

	for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
	for(;$i<20;$i++)	{ &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
	for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
	for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
	for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }

	(($V[5] eq $D) and ($V[0] eq $E)) or die;	# double-check

	&mov($tmp1,&wparam(0));	# re-load SHA_CTX*
	&mov($D,&wparam(1));	# D is last "T" and is discarded

	&add($E,&DWP(0,$tmp1));	# E is last "A"...
	&add($T,&DWP(4,$tmp1));
	&add($A,&DWP(8,$tmp1));
	&add($B,&DWP(12,$tmp1));
	&add($C,&DWP(16,$tmp1));

	&mov(&DWP(0,$tmp1),$E);	# update SHA_CTX
	&add($D,64);		# advance input pointer
	&mov(&DWP(4,$tmp1),$T);
	&cmp($D,&wparam(2));	# have we reached the end yet?
	&mov(&DWP(8,$tmp1),$A);
	&mov($E,$C);		# C is last "E" which needs to be "pre-loaded"
	&mov(&DWP(12,$tmp1),$B);
	&mov($T,$D);		# input pointer
	&mov(&DWP(16,$tmp1),$C);
	&jb(&label("loop"));

	&stack_pop(16+3);
&function_end("sha1_block_data_order");

if ($xmm) {
######################################################################
# The SSSE3 implementation.
#
# %xmm[0-7] are used as a ring @X[] buffer containing quadruples of the
# last 32 elements of the message schedule or Xupdate outputs. The
# first 4 quadruples are simply byte-swapped input, the next 4 are
# calculated according to the method originally suggested by Dean
# Gaudet (modulo being implemented in SSSE3). Once 8 quadruples or 32
# elements are collected, it switches to the routine proposed by Max
# Locktyukhin.
#
# Calculations inevitably require temporary registers, and there are
# no %xmm registers left to spare. For this reason part of the ring
# buffer, X[2..4] to be specific, is offloaded to a 3-quadruple ring
# buffer on the stack. Keep in mind that X[2] aliases X[-6], X[3]
# aliases X[-5], and X[4] aliases X[-4]...
#
# Another notable optimization is aggressive stack frame compression
# aiming to minimize the number of 9-byte instructions...
#
# Yet another notable optimization is the "jumping" $B variable. It
# means that there is no register permanently allocated for the $B
# value. This made it possible to eliminate one instruction from
# body_20_39...
#
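# How the "@X[-4&7]"-style indexing below works (an illustrative
# aside, not part of the original module): the ring is 8 registers
# wide, a logical index is reduced modulo 8 with '&7', and negative
# logical indices alias earlier slots, e.g.:
die "ring aliasing sketch broken"
	unless (-6&7)==2 && (-5&7)==3 && (-4&7)==4;	# X[2]=X[-6], etc.
#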
my $Xi=4;			# 4xSIMD Xupdate round, start pre-seeded
my @X=map("xmm$_",(4..7,0..3));	# pre-seeded for $Xi=4
my @V=($A,$B,$C,$D,$E);
my $j=0;			# hash round
my @T=($T,$tmp1);
my $inp;

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

&function_begin("_sha1_block_data_order_ssse3");
	&call	(&label("pic_point"));	# make it PIC!
	&set_label("pic_point");
	&blindpop($tmp1);
	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
&set_label("ssse3_shortcut");

	&movdqa	(@X[3],&QWP(0,$tmp1));		# K_00_19
	&movdqa	(@X[4],&QWP(16,$tmp1));		# K_20_39
	&movdqa	(@X[5],&QWP(32,$tmp1));		# K_40_59
	&movdqa	(@X[6],&QWP(48,$tmp1));		# K_60_79
	&movdqa	(@X[2],&QWP(64,$tmp1));		# pbswap mask

	&mov	($E,&wparam(0));		# load argument block
	&mov	($inp=@T[1],&wparam(1));
	&mov	($D,&wparam(2));
	&mov	(@T[0],"esp");

	# stack frame layout
	#
	# +0	X[0]+K	X[1]+K	X[2]+K	X[3]+K	# XMM->IALU xfer area
	#	X[4]+K	X[5]+K	X[6]+K	X[7]+K
	#	X[8]+K	X[9]+K	X[10]+K	X[11]+K
	#	X[12]+K	X[13]+K	X[14]+K	X[15]+K
	#
	# +64	X[0]	X[1]	X[2]	X[3]	# XMM->XMM backtrace area
	#	X[4]	X[5]	X[6]	X[7]
	#	X[8]	X[9]	X[10]	X[11]	# even borrowed for K_00_19
	#
	# +112	K_20_39	K_20_39	K_20_39	K_20_39	# constants
	#	K_40_59	K_40_59	K_40_59	K_40_59
	#	K_60_79	K_60_79	K_60_79	K_60_79
	#	K_00_19	K_00_19	K_00_19	K_00_19
	#	pbswap mask
	#
	# +192	ctx				# argument block
	# +196	inp
	# +200	end
	# +204	esp
	&sub	("esp",208);
	&and	("esp",-64);

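	# The '&and("esp",-64)' above is the usual alignment trick (an
	# illustrative aside, not part of the original module): -64 is
	# ~63, so 'x & -64' rounds x down to a multiple of 64, e.g.:
	die "alignment sketch broken" unless (0x1234 & -64) == 0x1200;
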
	&movdqa	(&QWP(112+0,"esp"),@X[4]);	# copy constants
	&movdqa	(&QWP(112+16,"esp"),@X[5]);
	&movdqa	(&QWP(112+32,"esp"),@X[6]);
	&shl	($D,6);				# len*64
	&movdqa	(&QWP(112+48,"esp"),@X[3]);
	&add	($D,$inp);			# end of input
	&movdqa	(&QWP(112+64,"esp"),@X[2]);
	&add	($inp,64);
	&mov	(&DWP(192+0,"esp"),$E);		# save argument block
	&mov	(&DWP(192+4,"esp"),$inp);
	&mov	(&DWP(192+8,"esp"),$D);
	&mov	(&DWP(192+12,"esp"),@T[0]);	# save original %esp

	&mov	($A,&DWP(0,$E));		# load context
	&mov	($B,&DWP(4,$E));
	&mov	($C,&DWP(8,$E));
	&mov	($D,&DWP(12,$E));
	&mov	($E,&DWP(16,$E));
	&mov	(@T[0],$B);			# magic seed

	&movdqu	(@X[-4&7],&QWP(-64,$inp));	# load input to %xmm[0-3]
	&movdqu	(@X[-3&7],&QWP(-48,$inp));
	&movdqu	(@X[-2&7],&QWP(-32,$inp));
	&movdqu	(@X[-1&7],&QWP(-16,$inp));
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&pshufb	(@X[-3&7],@X[2]);
	&pshufb	(@X[-2&7],@X[2]);
	&movdqa	(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
	&pshufb	(@X[-1&7],@X[2]);
	&paddd	(@X[-4&7],@X[3]);		# add K_00_19
	&paddd	(@X[-3&7],@X[3]);
	&paddd	(@X[-2&7],@X[3]);
	&movdqa	(&QWP(0,"esp"),@X[-4&7]);	# X[]+K xfer to IALU
	&psubd	(@X[-4&7],@X[3]);		# restore X[]
	&movdqa	(&QWP(0+16,"esp"),@X[-3&7]);
	&psubd	(@X[-3&7],@X[3]);
	&movdqa	(&QWP(0+32,"esp"),@X[-2&7]);
	&psubd	(@X[-2&7],@X[3]);
	&movdqa	(@X[0],@X[-3&7]);
	&jmp	(&label("loop"));

######################################################################
# The SSE instruction sequence is first broken into groups of
# independent instructions, independent with respect to their inputs
# and the shifter (not all architectures have more than one). Then the
# IALU instructions are "knitted in" between the SSE groups. A distance
# matching an SSE latency of 2 is maintained, in the hope that it fits
# the upcoming AMD Bulldozer better [which allegedly also implements
# SSSE3]...
#
# Temporary register usage. X[2] is volatile at the entry and at the
# end is restored from the backtrace ring buffer. X[3] is expected to
# contain the current K_XX_XX constant and is used to calculate X[-1]+K
# from the previous round; it becomes volatile the moment the value is
# saved to the stack for transfer to the IALU. X[4] becomes volatile
# whenever X[-4] is accumulated and offloaded to the backtrace ring
# buffer; at the end it is loaded with the next K_XX_XX [which becomes
# X[3] in the next round]...
#
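# The scheduling idiom used by the subs below (an illustrative sketch,
# not part of the original module): each body_* sub further down
# returns one round as a list of stringified instruction calls, and
# the Xupdate_* subs "knit" a few of them in after each SSE group by
# eval'ing them one at a time:
sub knit_sketch {
	my @insns = @_;			# stringified IALU instructions
	for my $sse_group (1..4) {
		# ... one group of SSE instructions would be emitted here ...
		eval(shift(@insns)) if @insns;	# then one IALU instruction
	}
	foreach (@insns) { eval; }	# flush whatever remains
}
#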
sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	&movdqa	(@X[2],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&paddd	(@X[3],@X[-1&7]);
	&movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrldq	(@X[2],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[2],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[2]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@X[4],@X[0]);
	&movdqa	(@X[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslldq	(@X[4],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@X[2],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@X[3],@X[4]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@X[4],30);
	&por	(@X[0],@X[2]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);	# restore X[] from backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@X[3],2);
	&pxor	(@X[0],@X[4]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@X[4],&QWP(112-16+16*(($Xi)/5),"esp"));	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[3]);		# "X[0]"^=("X[0]"<<96)<<<2
	&movdqa	(@X[1],@X[-2&7])	if ($Xi<7);
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@X[2],@X[-1&7])	if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	&palignr(@X[2],@X[-2&7],8);	# compose "X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	&movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);	# save X[] to backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));
	 if ($Xi%5) {
	&movdqa	(@X[4],@X[3]);		# "perpetuate" K_XX_XX...
	 } else {			# ... or load next one
	&movdqa	(@X[4],&QWP(112-16+16*($Xi/5),"esp"));
	 }
	&paddd	(@X[3],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&pxor	(@X[0],@X[2]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&movdqa	(@X[2],@X[0]);
	&movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&pslld	(@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	&psrld	(@X[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&por	(@X[0],@X[2]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	&movdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);	# restore X[] from backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@X[3],@X[0])	if ($Xi<19);
	 eval(shift(@insns));

	foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	&paddd	(@X[3],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer IALU

	foreach (@insns) { eval; }		# remaining instructions

	&mov	($inp=@T[1],&DWP(192+4,"esp"));
	&cmp	($inp,&DWP(192+8,"esp"));
	&je	(&label("done"));

	&movdqa	(@X[3],&QWP(112+48,"esp"));	# K_00_19
	&movdqa	(@X[2],&QWP(112+64,"esp"));	# pbswap mask
	&movdqu	(@X[-4&7],&QWP(0,$inp));	# load input
	&movdqu	(@X[-3&7],&QWP(16,$inp));
	&movdqu	(@X[-2&7],&QWP(32,$inp));
	&movdqu	(@X[-1&7],&QWP(48,$inp));
	&add	($inp,64);
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&mov	(&DWP(192+4,"esp"),$inp);
	&movdqa	(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot

  $Xi=0;
}

sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@X[3]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@X[3]);

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

sub body_00_19 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,&DWP(4*($j&15),"esp"));',	# X[]+K xfer
	'&xor	($c,$d);',
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&and	(@T[0],$c);',	# ($b&($c^$d))
	'&xor	($c,$d);',	# restore $c
	'&xor	(@T[0],$d);',
	'&add	($e,$a);',
	'&$_ror	($b,$j?7:2);',	# $b>>>2
	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_20_39 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,&DWP(4*($j++&15),"esp"));',	# X[]+K xfer
	'&xor	(@T[0],$d);',	# ($b^$d)
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&xor	(@T[0],$c);',	# ($b^$d^$c)
	'&add	($e,$a);',
	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&mov	(@T[1],$c);',
	'&xor	($c,$d);',
	'&add	($e,&DWP(4*($j++&15),"esp"));',	# X[]+K xfer
	'&and	(@T[1],$d);',
	'&and	(@T[0],$c);',	# ($b&($c^$d))
	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,@T[1]);',
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	($c,$d);',	# restore $c
	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

&set_label("loop",16);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

	$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

	&mov	(@T[1],&DWP(192,"esp"));	# update context
	&add	($A,&DWP(0,@T[1]));
	&add	(@T[0],&DWP(4,@T[1]));		# $b
	&add	($C,&DWP(8,@T[1]));
	&mov	(&DWP(0,@T[1]),$A);
	&add	($D,&DWP(12,@T[1]));
	&mov	(&DWP(4,@T[1]),@T[0]);
	&add	($E,&DWP(16,@T[1]));
	&mov	(&DWP(8,@T[1]),$C);
	&mov	($B,@T[0]);
	&mov	(&DWP(12,@T[1]),$D);
	&mov	(&DWP(16,@T[1]),$E);
	&movdqa	(@X[0],@X[-3&7]);

	&jmp	(&label("loop"));

&set_label("done",16);		$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

	&mov	(@T[1],&DWP(192,"esp"));	# update context
	&add	($A,&DWP(0,@T[1]));
	&mov	("esp",&DWP(192+12,"esp"));	# restore %esp
	&add	(@T[0],&DWP(4,@T[1]));		# $b
	&add	($C,&DWP(8,@T[1]));
	&mov	(&DWP(0,@T[1]),$A);
	&add	($D,&DWP(12,@T[1]));
	&mov	(&DWP(4,@T[1]),@T[0]);
	&add	($E,&DWP(16,@T[1]));
	&mov	(&DWP(8,@T[1]),$C);
	&mov	(&DWP(12,@T[1]),$D);
	&mov	(&DWP(16,@T[1]),$E);

&function_end("_sha1_block_data_order_ssse3");

if ($ymm) {
my $Xi=4;			# 4xSIMD Xupdate round, start pre-seeded
my @X=map("xmm$_",(4..7,0..3));	# pre-seeded for $Xi=4
my @V=($A,$B,$C,$D,$E);
my $j=0;			# hash round
my @T=($T,$tmp1);
my $inp;

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

&function_begin("_sha1_block_data_order_avx");
	&call	(&label("pic_point"));	# make it PIC!
	&set_label("pic_point");
	&blindpop($tmp1);
	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
&set_label("avx_shortcut");
	&vzeroall();

	&vmovdqa(@X[3],&QWP(0,$tmp1));		# K_00_19
	&vmovdqa(@X[4],&QWP(16,$tmp1));		# K_20_39
	&vmovdqa(@X[5],&QWP(32,$tmp1));		# K_40_59
	&vmovdqa(@X[6],&QWP(48,$tmp1));		# K_60_79
	&vmovdqa(@X[2],&QWP(64,$tmp1));		# pbswap mask

	&mov	($E,&wparam(0));		# load argument block
	&mov	($inp=@T[1],&wparam(1));
	&mov	($D,&wparam(2));
	&mov	(@T[0],"esp");

	# stack frame layout
	#
	# +0	X[0]+K	X[1]+K	X[2]+K	X[3]+K	# XMM->IALU xfer area
	#	X[4]+K	X[5]+K	X[6]+K	X[7]+K
	#	X[8]+K	X[9]+K	X[10]+K	X[11]+K
	#	X[12]+K	X[13]+K	X[14]+K	X[15]+K
	#
	# +64	X[0]	X[1]	X[2]	X[3]	# XMM->XMM backtrace area
	#	X[4]	X[5]	X[6]	X[7]
	#	X[8]	X[9]	X[10]	X[11]	# even borrowed for K_00_19
	#
	# +112	K_20_39	K_20_39	K_20_39	K_20_39	# constants
	#	K_40_59	K_40_59	K_40_59	K_40_59
	#	K_60_79	K_60_79	K_60_79	K_60_79
	#	K_00_19	K_00_19	K_00_19	K_00_19
	#	pbswap mask
	#
	# +192	ctx				# argument block
	# +196	inp
	# +200	end
	# +204	esp
	&sub	("esp",208);
	&and	("esp",-64);

	&vmovdqa(&QWP(112+0,"esp"),@X[4]);	# copy constants
	&vmovdqa(&QWP(112+16,"esp"),@X[5]);
	&vmovdqa(&QWP(112+32,"esp"),@X[6]);
	&shl	($D,6);				# len*64
	&vmovdqa(&QWP(112+48,"esp"),@X[3]);
	&add	($D,$inp);			# end of input
	&vmovdqa(&QWP(112+64,"esp"),@X[2]);
	&add	($inp,64);
	&mov	(&DWP(192+0,"esp"),$E);		# save argument block
	&mov	(&DWP(192+4,"esp"),$inp);
	&mov	(&DWP(192+8,"esp"),$D);
	&mov	(&DWP(192+12,"esp"),@T[0]);	# save original %esp

	&mov	($A,&DWP(0,$E));		# load context
	&mov	($B,&DWP(4,$E));
	&mov	($C,&DWP(8,$E));
	&mov	($D,&DWP(12,$E));
	&mov	($E,&DWP(16,$E));
	&mov	(@T[0],$B);			# magic seed

	&vmovdqu(@X[-4&7],&QWP(-64,$inp));	# load input to %xmm[0-3]
	&vmovdqu(@X[-3&7],&QWP(-48,$inp));
	&vmovdqu(@X[-2&7],&QWP(-32,$inp));
	&vmovdqu(@X[-1&7],&QWP(-16,$inp));
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&vpshufb(@X[-3&7],@X[-3&7],@X[2]);
	&vpshufb(@X[-2&7],@X[-2&7],@X[2]);
	&vmovdqa(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
	&vpshufb(@X[-1&7],@X[-1&7],@X[2]);
	&vpaddd	(@X[0],@X[-4&7],@X[3]);		# add K_00_19
	&vpaddd	(@X[1],@X[-3&7],@X[3]);
	&vpaddd	(@X[2],@X[-2&7],@X[3]);
	&vmovdqa(&QWP(0,"esp"),@X[0]);		# X[]+K xfer to IALU
	&vmovdqa(&QWP(0+16,"esp"),@X[1]);
	&vmovdqa(&QWP(0+32,"esp"),@X[2]);
	&jmp	(&label("loop"));

sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@X[3],@X[3],@X[-1&7]);
	&vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@X[2],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[2],@X[2],@X[-2&7]);		# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[2]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@X[2],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@X[4],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@X[3],@X[4],30);
	&vpor	(@X[0],@X[0],@X[2]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@X[4],@X[4],2);
	&vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);	# restore X[] from backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[3]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[4]);		# "X[0]"^=("X[0]"<<96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp"));	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@X[2],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	&vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);	# save X[] to backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));
	 if ($Xi%5) {
	&vmovdqa (@X[4],@X[3]);		# "perpetuate" K_XX_XX...
	 } else {			# ... or load next one
	&vmovdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
	 }
	&vpaddd	(@X[3],@X[3],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[2]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@X[2],@X[0],30);
	&vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@X[2]);	# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	&vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);	# restore X[] from backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	&vpaddd	(@X[3],@X[3],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer IALU

	foreach (@insns) { eval; }		# remaining instructions

	&mov	($inp=@T[1],&DWP(192+4,"esp"));
	&cmp	($inp,&DWP(192+8,"esp"));
	&je	(&label("done"));

	&vmovdqa(@X[3],&QWP(112+48,"esp"));	# K_00_19
	&vmovdqa(@X[2],&QWP(112+64,"esp"));	# pbswap mask
	&vmovdqu(@X[-4&7],&QWP(0,$inp));	# load input
	&vmovdqu(@X[-3&7],&QWP(16,$inp));
	&vmovdqu(@X[-2&7],&QWP(32,$inp));
	&vmovdqu(@X[-1&7],&QWP(48,$inp));
	&add	($inp,64);
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&mov	(&DWP(192+4,"esp"),$inp);
	&vmovdqa(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot

  $Xi=0;
}

sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb (@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@X[3]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa (&QWP(0+16*$Xi,"esp"),@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

&set_label("loop",16);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

	$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

	&mov	(@T[1],&DWP(192,"esp"));	# update context
	&add	($A,&DWP(0,@T[1]));
	&add	(@T[0],&DWP(4,@T[1]));		# $b
	&add	($C,&DWP(8,@T[1]));
	&mov	(&DWP(0,@T[1]),$A);
	&add	($D,&DWP(12,@T[1]));
	&mov	(&DWP(4,@T[1]),@T[0]);
	&add	($E,&DWP(16,@T[1]));
	&mov	(&DWP(8,@T[1]),$C);
	&mov	($B,@T[0]);
	&mov	(&DWP(12,@T[1]),$D);
	&mov	(&DWP(16,@T[1]),$E);

	&jmp	(&label("loop"));

&set_label("done",16);		$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

	&vzeroall();

	&mov	(@T[1],&DWP(192,"esp"));	# update context
	&add	($A,&DWP(0,@T[1]));
	&mov	("esp",&DWP(192+12,"esp"));	# restore %esp
	&add	(@T[0],&DWP(4,@T[1]));		# $b
	&add	($C,&DWP(8,@T[1]));
	&mov	(&DWP(0,@T[1]),$A);
	&add	($D,&DWP(12,@T[1]));
	&mov	(&DWP(4,@T[1]),@T[0]);
	&add	($E,&DWP(16,@T[1]));
	&mov	(&DWP(8,@T[1]),$C);
	&mov	(&DWP(12,@T[1]),$D);
	&mov	(&DWP(16,@T[1]),$E);
&function_end("_sha1_block_data_order_avx");
}
&set_label("K_XX_XX",64);
&data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999);	# K_00_19
&data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1);	# K_20_39
&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc);	# K_40_59
&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6);	# K_60_79
&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f);	# pbswap mask
}
&asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();
diff --git a/src/lib/libcrypto/sha/asm/sha1-alpha.pl b/src/lib/libcrypto/sha/asm/sha1-alpha.pl
deleted file mode 100644
index 6c4b9251fd..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-alpha.pl
+++ /dev/null
@@ -1,322 +0,0 @@
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA1 block procedure for Alpha.

# On the 21264, performance is 33% better than code generated by the
# vendor compiler, 75% better than GCC [3.4], and in absolute terms is
# 8.7 cycles per processed byte. The implementation features vectorized
# byte swap, but not Xupdate.

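# What "vectorized byte swap" means here (an illustrative sketch, not
# part of the original module): two 32-bit words travel in one 64-bit
# register and are swapped together with shifts plus byte-select masks
# (zapnot); per 32-bit lane the effect is the familiar:
sub bswap32_sketch {
	my $x = shift;
	(($x>>24) & 0x000000ff) | (($x>>8) & 0x0000ff00) |
	(($x<<8)  & 0x00ff0000) | (($x<<24) & 0xff000000);
}
die "bswap sketch broken" if bswap32_sketch(0x11223344) != 0x44332211;
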
@X=(	"\$0",	"\$1",	"\$2",	"\$3",	"\$4",	"\$5",	"\$6",	"\$7",
	"\$8",	"\$9",	"\$10",	"\$11",	"\$12",	"\$13",	"\$14",	"\$15");
$ctx="a0";	# $16
$inp="a1";
$num="a2";
$A="a3";
$B="a4";	# 20
$C="a5";
$D="t8";
$E="t9";	@V=($A,$B,$C,$D,$E);
$t0="t10";	# 24
$t1="t11";
$t2="ra";
$t3="t12";
$K="AT";	# 28

sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	ldq_u	@X[0],0+0($inp)
	ldq_u	@X[1],0+7($inp)
___
$code.=<<___ if (!($i&1) && $i<14);
	ldq_u	@X[$i+2],($i+2)*4+0($inp)
	ldq_u	@X[$i+3],($i+2)*4+7($inp)
___
$code.=<<___ if (!($i&1) && $i<15);
	extql	@X[$i],$inp,@X[$i]
	extqh	@X[$i+1],$inp,@X[$i+1]

	or	@X[$i+1],@X[$i],@X[$i]	# pair of 32-bit values are fetched

	srl	@X[$i],24,$t0		# vectorized byte swap
	srl	@X[$i],8,$t2

	sll	@X[$i],8,$t3
	sll	@X[$i],24,@X[$i]
	zapnot	$t0,0x11,$t0
	zapnot	$t2,0x22,$t2

	zapnot	@X[$i],0x88,@X[$i]
	or	$t0,$t2,$t0
	zapnot	$t3,0x44,$t3
	sll	$a,5,$t1

	or	@X[$i],$t0,@X[$i]
	addl	$K,$e,$e
	and	$b,$c,$t2
	zapnot	$a,0xf,$a

	or	@X[$i],$t3,@X[$i]
	srl	$a,27,$t0
	bic	$d,$b,$t3
	sll	$b,30,$b

	extll	@X[$i],4,@X[$i+1]	# extract upper half
	or	$t2,$t3,$t2
	addl	@X[$i],$e,$e

	addl	$t1,$e,$e
	srl	$b,32,$t3
	zapnot	@X[$i],0xf,@X[$i]

	addl	$t0,$e,$e
	addl	$t2,$e,$e
	or	$t3,$b,$b
___
$code.=<<___ if (($i&1) && $i<15);
	sll	$a,5,$t1
	addl	$K,$e,$e
	and	$b,$c,$t2
	zapnot	$a,0xf,$a

	srl	$a,27,$t0
	addl	@X[$i%16],$e,$e
	bic	$d,$b,$t3
	sll	$b,30,$b

	or	$t2,$t3,$t2
	addl	$t1,$e,$e
	srl	$b,32,$t3
	zapnot	@X[$i],0xf,@X[$i]

	addl	$t0,$e,$e
	addl	$t2,$e,$e
	or	$t3,$b,$b
___
$code.=<<___ if ($i>=15);	# with forward Xupdate
	sll	$a,5,$t1
	addl	$K,$e,$e
	and	$b,$c,$t2
	xor	@X[($j+2)%16],@X[$j%16],@X[$j%16]

	zapnot	$a,0xf,$a
	addl	@X[$i%16],$e,$e
	bic	$d,$b,$t3
	xor	@X[($j+8)%16],@X[$j%16],@X[$j%16]

	srl	$a,27,$t0
	addl	$t1,$e,$e
	or	$t2,$t3,$t2
	xor	@X[($j+13)%16],@X[$j%16],@X[$j%16]

	sll	$b,30,$b
	addl	$t0,$e,$e
	srl	@X[$j%16],31,$t1

	addl	$t2,$e,$e
	srl	$b,32,$t3
	addl	@X[$j%16],@X[$j%16],@X[$j%16]

	or	$t3,$b,$b
	zapnot	@X[$i%16],0xf,@X[$i%16]
	or	$t1,@X[$j%16],@X[$j%16]
___
}

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);	# with forward Xupdate
	sll	$a,5,$t1
	addl	$K,$e,$e
	zapnot	$a,0xf,$a
	xor	@X[($j+2)%16],@X[$j%16],@X[$j%16]

	sll	$b,30,$t3
	addl	$t1,$e,$e
	xor	$b,$c,$t2
	xor	@X[($j+8)%16],@X[$j%16],@X[$j%16]

	srl	$b,2,$b
	addl	@X[$i%16],$e,$e
	xor	$d,$t2,$t2
	xor	@X[($j+13)%16],@X[$j%16],@X[$j%16]

	srl	@X[$j%16],31,$t1
	addl	$t2,$e,$e
	srl	$a,27,$t0
	addl	@X[$j%16],@X[$j%16],@X[$j%16]

	or	$t3,$b,$b
	addl	$t0,$e,$e
	or	$t1,@X[$j%16],@X[$j%16]
___
$code.=<<___ if ($i<77);
	zapnot	@X[$i%16],0xf,@X[$i%16]
___
$code.=<<___ if ($i==79);	# with context fetch
	sll	$a,5,$t1
	addl	$K,$e,$e
	zapnot	$a,0xf,$a
	ldl	@X[0],0($ctx)

	sll	$b,30,$t3
	addl	$t1,$e,$e
	xor	$b,$c,$t2
	ldl	@X[1],4($ctx)

	srl	$b,2,$b
	addl	@X[$i%16],$e,$e
	xor	$d,$t2,$t2
	ldl	@X[2],8($ctx)

	srl	$a,27,$t0
	addl	$t2,$e,$e
	ldl	@X[3],12($ctx)

	or	$t3,$b,$b
	addl	$t0,$e,$e
	ldl	@X[4],16($ctx)
___
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;	# with forward Xupdate
	sll	$a,5,$t1
	addl	$K,$e,$e
	zapnot	$a,0xf,$a
	xor	@X[($j+2)%16],@X[$j%16],@X[$j%16]

	srl	$a,27,$t0
	and	$b,$c,$t2
	and	$b,$d,$t3
	xor	@X[($j+8)%16],@X[$j%16],@X[$j%16]

	sll	$b,30,$b
	addl	$t1,$e,$e
	xor	@X[($j+13)%16],@X[$j%16],@X[$j%16]

	srl	@X[$j%16],31,$t1
	addl	$t0,$e,$e
	or	$t2,$t3,$t2
	and	$c,$d,$t3

	or	$t2,$t3,$t2
	srl	$b,32,$t3
	addl	@X[$i%16],$e,$e
	addl	@X[$j%16],@X[$j%16],@X[$j%16]

	or	$t3,$b,$b
	addl	$t2,$e,$e
	or	$t1,@X[$j%16],@X[$j%16]
	zapnot	@X[$i%16],0xf,@X[$i%16]
___
}

$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder
.globl	sha1_block_data_order
.align	5
.ent	sha1_block_data_order
sha1_block_data_order:
	lda	sp,-64(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	stq	s2,24(sp)
	stq	s3,32(sp)
	stq	s4,40(sp)
	stq	s5,48(sp)
	stq	fp,56(sp)
	.mask	0x0400fe00,-64
	.frame	sp,64,ra
	.prologue 0

	ldl	$A,0($ctx)
	ldl	$B,4($ctx)
	sll	$num,6,$num
	ldl	$C,8($ctx)
	ldl	$D,12($ctx)
	ldl	$E,16($ctx)
	addq	$inp,$num,$num

.Lloop:
	.set	noreorder
	ldah	$K,23170(zero)
	zapnot	$B,0xf,$B
	lda	$K,31129($K)	# K_00_19
___
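# ldah/lda build each 32-bit round constant from two signed 16-bit
# halves, ldah contributing imm<<16 and lda a signed 16-bit addend
# (an illustrative check, not part of the original module):
die "K_00_19 halves wrong" if (23170*65536 + 31129) != 0x5a827999;
die "K_20_39 halves wrong" if (28378*65536 -  5215) != 0x6ed9eba1;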
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }

$code.=<<___;
	ldah	$K,28378(zero)
	lda	$K,-5215($K)	# K_20_39
___
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }

$code.=<<___;
	ldah	$K,-28900(zero)
	lda	$K,-17188($K)	# K_40_59
___
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }

$code.=<<___;
	ldah	$K,-13725(zero)
	lda	$K,-15914($K)	# K_60_79
___
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }

$code.=<<___;
	addl	@X[0],$A,$A
	addl	@X[1],$B,$B
	addl	@X[2],$C,$C
	addl	@X[3],$D,$D
	addl	@X[4],$E,$E
	stl	$A,0($ctx)
	stl	$B,4($ctx)
	addq	$inp,64,$inp
	stl	$C,8($ctx)
	stl	$D,12($ctx)
	stl	$E,16($ctx)
	cmpult	$inp,$num,$t1
	bne	$t1,.Lloop

	.set	noreorder
	ldq	ra,0(sp)
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	ldq	s2,24(sp)
	ldq	s3,32(sp)
	ldq	s4,40(sp)
	ldq	s5,48(sp)
	ldq	fp,56(sp)
	lda	sp,64(sp)
	ret	(ra)
.end	sha1_block_data_order
.ascii	"SHA1 block transform for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___
$output=shift and open STDOUT,">$output";
print $code;
close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
deleted file mode 100644
index fe8207f77f..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
+++ /dev/null
@@ -1,248 +0,0 @@
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# sha1_block procedure for ARMv4.
#
# January 2007.

# Size/performance trade-off
# ====================================================================
# impl		size in bytes	comp cycles[*]	measured performance
# ====================================================================
# thumb		304		3212		4420
# armv4-small	392/+29%	1958/+64%	2250/+96%
# armv4-compact	740/+89%	1552/+26%	1840/+22%
# armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
# full unroll	~5100/+260%	~1260/+4%	~1300/+5%
# ====================================================================
# thumb		= same as 'small' but in Thumb instructions[**] and
#		  with recurring code in two private functions;
# small		= detached Xload/update, loops are folded;
# compact	= detached Xload/update, 5x unroll;
# large		= interleaved Xload/update, 5x unroll;
# full unroll	= interleaved Xload/update, full unroll, estimated[!];
#
# [*]	Manually counted instructions in "grand" loop body. Measured
#	performance is affected by prologue and epilogue overhead,
#	i-cache availability, branch penalties, etc.
# [**]	While each Thumb instruction is half the size of an ARM one,
#	they are not as diverse: e.g., there are only two arithmetic
#	instructions with 3 arguments, no [fixed] rotate, and the
#	addressing modes are limited. As a result it takes more
#	instructions to do the same job in Thumb, therefore the code
#	is never half the size and is always slower.
# [***]	This is also ~35% better than compiler-generated code. The
#	dual-issue Cortex A8 core was measured to process an input
#	block in ~990 cycles.

# August 2010.
#
# Rescheduling for the dual-issue pipeline resulted in a 13% improvement
# on the Cortex A8 core, and in absolute terms ~870 cycles per input
# block [or 13.6 cycles per byte].

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in a 10%
# improvement on the Cortex A8 core and 12.2 cycles per byte.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";
$inp="r1";
$len="r2";
$a="r3";
$b="r4";
$c="r5";
$d="r6";
$e="r7";
$K="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
$Xi="r14";
@V=($a,$b,$c,$d,$e);

sub Xupdate {
my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
$code.=<<___;
	ldr	$t0,[$Xi,#15*4]
	ldr	$t1,[$Xi,#13*4]
	ldr	$t2,[$Xi,#7*4]
	add	$e,$K,$e,ror#2			@ E+=K_xx_xx
	ldr	$t3,[$Xi,#2*4]
	eor	$t0,$t0,$t1
	eor	$t2,$t2,$t3			@ 1 cycle stall
	eor	$t1,$c,$d			@ F_xx_xx
	mov	$t0,$t0,ror#31
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
	eor	$t0,$t0,$t2,ror#31
	str	$t0,[$Xi,#-4]!
	$opt1					@ F_xx_xx
	$opt2					@ F_xx_xx
	add	$e,$e,$t0			@ E+=X[i]
___
}

sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_;
$code.=<<___;
#if __ARM_ARCH__<7
	ldrb	$t1,[$inp,#2]
	ldrb	$t0,[$inp,#3]
	ldrb	$t2,[$inp,#1]
	add	$e,$K,$e,ror#2			@ E+=K_00_19
	ldrb	$t3,[$inp],#4
	orr	$t0,$t0,$t1,lsl#8
	eor	$t1,$c,$d			@ F_xx_xx
	orr	$t0,$t0,$t2,lsl#16
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
	orr	$t0,$t0,$t3,lsl#24
#else
	ldr	$t0,[$inp],#4			@ handles unaligned
	add	$e,$K,$e,ror#2			@ E+=K_00_19
	eor	$t1,$c,$d			@ F_xx_xx
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	$t0,$t0				@ byte swap
#endif
#endif
	and	$t1,$b,$t1,ror#2
	add	$e,$e,$t0			@ E+=X[i]
	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
	str	$t0,[$Xi,#-4]!
	add	$e,$e,$t1			@ E+=F_00_19(B,C,D)
___
}

sub BODY_16_19 {
my ($a,$b,$c,$d,$e)=@_;
	&Xupdate(@_,"and $t1,$b,$t1,ror#2");
$code.=<<___;
	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
	add	$e,$e,$t1			@ E+=F_00_19(B,C,D)
___
}

sub BODY_20_39 {
my ($a,$b,$c,$d,$e)=@_;
	&Xupdate(@_,"eor $t1,$b,$t1,ror#2");
$code.=<<___;
	add	$e,$e,$t1			@ E+=F_20_39(B,C,D)
___
}

sub BODY_40_59 {
my ($a,$b,$c,$d,$e)=@_;
	&Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
$code.=<<___;
	add	$e,$e,$t1			@ E+=F_40_59(B,C,D)
	add	$e,$e,$t2,ror#2
___
}

$code=<<___;
#include "arm_arch.h"

.text

.global	sha1_block_data_order
.type	sha1_block_data_order,%function

.align	2
sha1_block_data_order:
	stmdb	sp!,{r4-r12,lr}
	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
	ldmia	$ctx,{$a,$b,$c,$d,$e}
.Lloop:
	ldr	$K,.LK_00_19
	mov	$Xi,sp
	sub	sp,sp,#15*4
	mov	$c,$c,ror#30
	mov	$d,$d,ror#30
	mov	$e,$e,ror#30		@ [6]
.L_00_15:
___
for($i=0;$i<5;$i++) {
	&BODY_00_15(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp
	bne	.L_00_15		@ [((11+4)*5+2)*3]
___
	&BODY_00_15(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
$code.=<<___;

	ldr	$K,.LK_20_39		@ [+15+16*4]
	sub	sp,sp,#25*4
	cmn	sp,#0			@ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
___
for($i=0;$i<5;$i++) {
	&BODY_20_39(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp			@ preserve carry
	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes

	ldr	$K,.LK_40_59
	sub	sp,sp,#20*4		@ [+2]
.L_40_59:
___
for($i=0;$i<5;$i++) {
	&BODY_40_59(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp
	bne	.L_40_59		@ [+((12+5)*5+2)*4]

	ldr	$K,.LK_60_79
	sub	sp,sp,#20*4
	cmp	sp,#0			@ set carry to denote 60_79
	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
.L_done:
	add	sp,sp,#80*4		@ "deallocate" stack frame
	ldmia	$ctx,{$K,$t0,$t1,$t2,$t3}
	add	$a,$K,$a
	add	$b,$t0,$b
	add	$c,$t1,$c,ror#2
	add	$d,$t2,$d,ror#2
	add	$e,$t3,$e,ror#2
	stmia	$ctx,{$a,$b,$c,$d,$e}
	teq	$inp,$len
	bne	.Lloop			@ [+18], total 1307

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.align	2
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
.size	sha1_block_data_order,.-sha1_block_data_order
.asciz	"SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
close STDOUT;	# enforce flush
diff --git a/src/lib/libcrypto/sha/asm/sha1-ia64.pl b/src/lib/libcrypto/sha/asm/sha1-ia64.pl
deleted file mode 100644
index db28f0805a..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-ia64.pl
+++ /dev/null
@@ -1,304 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# The eternal question is: what's wrong with compiler-generated code?
11# The trick is that it's possible to reduce the number of shifts needed
12# for rotations by maintaining a copy of the 32-bit value in the upper
13# bits of a 64-bit register. Just follow the mux2 and shrp instructions...
14# Performance under a big-endian OS such as HP-UX is 179MBps*1GHz, which
15# is >50% better than HP C and >2x better than gcc.
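#
# For instance, once the 32 working bits are replicated into the upper
# half of a register (mux2 with the 0x44 shuffle), a 32-bit rotation
# costs a single funnel shift instead of two shifts plus an or:
#
#	mux2	t=a,0x44	// t = low 32 bits of a, replicated twice
#	shrp	b=t,t,2		// b = 32-bit ROTATE(a,30), one instruction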
16
17$code=<<___;
18.ident \"sha1-ia64.s, version 1.3\"
19.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
20.explicit
21
22___
23
24
25if ($^O eq "hpux") {
26 $ADDP="addp4";
27 for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
28} else { $ADDP="add"; }
29
30#$human=1;
31if ($human) { # useful for visual code auditing...
32 ($A,$B,$C,$D,$E) = ("A","B","C","D","E");
33 ($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4");
34 ($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
35 ( "K_00_19","K_20_39","K_40_59","K_60_79" );
36 @X= ( "X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7",
37 "X8", "X9","X10","X11","X12","X13","X14","X15" );
38}
39else {
40 ($A,$B,$C,$D,$E) = ("loc0","loc1","loc2","loc3","loc4");
41 ($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9");
42 ($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
43 ( "r14", "r15", "loc10", "loc11" );
44 @X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
45 "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" );
46}
47
48sub BODY_00_15 {
49local *code=shift;
50my ($i,$a,$b,$c,$d,$e)=@_;
51my $j=$i+1;
52my $Xn=@X[$j%16];
53
54$code.=<<___ if ($i==0);
55{ .mmi; ld1 $X[$i]=[inp],2 // MSB
56 ld1 tmp2=[tmp3],2 };;
57{ .mmi; ld1 tmp0=[inp],2
58 ld1 tmp4=[tmp3],2 // LSB
59 dep $X[$i]=$X[$i],tmp2,8,8 };;
60___
61if ($i<15) {
62 $code.=<<___;
63{ .mmi; ld1 $Xn=[inp],2 // forward Xload
64 nop.m 0x0
65 dep tmp1=tmp0,tmp4,8,8 };;
66{ .mmi; ld1 tmp2=[tmp3],2 // forward Xload
67 and tmp4=$c,$b
68 dep $X[$i]=$X[$i],tmp1,16,16} //;;
69{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19
70 andcm tmp1=$d,$b
71 dep.z tmp5=$a,5,27 };; // a<<5
72{ .mmi; add $e=$e,$X[$i] // e+=Xload
73 or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
74 extr.u tmp1=$a,27,5 };; // a>>27
75{ .mmi; ld1 tmp0=[inp],2 // forward Xload
76 add $e=$e,tmp4 // e+=F_00_19(b,c,d)
77 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
78{ .mmi; ld1 tmp4=[tmp3],2 // forward Xload
79 or tmp5=tmp1,tmp5 // ROTATE(a,5)
80 mux2 tmp6=$a,0x44 };; // see b in next iteration
81{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)
82 dep $Xn=$Xn,tmp2,8,8 // forward Xload
83 mux2 $X[$i]=$X[$i],0x44 } //;;
84
85___
86 }
87else {
88 $code.=<<___;
89{ .mii; and tmp3=$c,$b
90 dep tmp1=tmp0,tmp4,8,8;;
91 dep $X[$i]=$X[$i],tmp1,16,16} //;;
92{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19
93 andcm tmp1=$d,$b
94 dep.z tmp5=$a,5,27 };; // a<<5
95{ .mmi; add $e=$e,$X[$i] // e+=Xupdate
96 or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
97 extr.u tmp1=$a,27,5 } // a>>27
98{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
99 xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
100 nop.i 0 };;
101{ .mmi; add $e=$e,tmp4 // e+=F_00_19(b,c,d)
102 xor $Xn=$Xn,tmp3 // forward Xupdate
103 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
104{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
105 mux2 tmp6=$a,0x44 };; // see b in next iteration
106{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
107 shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
108 mux2 $X[$i]=$X[$i],0x44 };;
109
110___
111 }
112}
113
114sub BODY_16_19 {
115local *code=shift;
116my ($i,$a,$b,$c,$d,$e)=@_;
117my $j=$i+1;
118my $Xn=@X[$j%16];
119
120$code.=<<___;
121{ .mib; add $e=$e,$K_00_19 // e+=K_00_19
122 dep.z tmp5=$a,5,27 } // a<<5
123{ .mib; andcm tmp1=$d,$b
124 and tmp0=$c,$b };;
125{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate
126 or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
127 extr.u tmp1=$a,27,5 } // a>>27
128{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
129 xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
130 nop.i 0 };;
131{ .mmi; add $e=$e,tmp0 // f+=F_00_19(b,c,d)
132 xor $Xn=$Xn,tmp3 // forward Xupdate
133 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
134{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
135 mux2 tmp6=$a,0x44 };; // see b in next iteration
136{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
137 shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
138 nop.i 0 };;
139
140___
141}
142
143sub BODY_20_39 {
144local *code=shift;
145my ($i,$a,$b,$c,$d,$e,$Konst)=@_;
146 $Konst = $K_20_39 if (!defined($Konst));
147my $j=$i+1;
148my $Xn=@X[$j%16];
149
150if ($i<79) {
151$code.=<<___;
152{ .mib; add $e=$e,$Konst // e+=K_XX_XX
153 dep.z tmp5=$a,5,27 } // a<<5
154{ .mib; xor tmp0=$c,$b
155 xor $Xn=$Xn,$X[($j+2)%16] };; // forward Xupdate
156{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate
157 extr.u tmp1=$a,27,5 } // a>>27
158{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
159 xor $Xn=$Xn,$X[($j+8)%16] };; // forward Xupdate
160{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d)
161 xor $Xn=$Xn,$X[($j+13)%16] // forward Xupdate
162 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
163{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
164 mux2 tmp6=$a,0x44 };; // see b in next iteration
165{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
166 shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
167 nop.i 0 };;
168
169___
170}
171else {
172$code.=<<___;
173{ .mib; add $e=$e,$Konst // e+=K_60_79
174 dep.z tmp5=$a,5,27 } // a<<5
175{ .mib; xor tmp0=$c,$b
176 add $h1=$h1,$a };; // wrap up
177{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate
178 extr.u tmp1=$a,27,5 } // a>>27
179{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
180 add $h3=$h3,$c };; // wrap up
181{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d)
182 or tmp1=tmp1,tmp5 // ROTATE(a,5)
183 shrp $b=tmp6,tmp6,2 };; // b=ROTATE(b,30) ;;?
184{ .mmi; add $e=$e,tmp1 // e+=ROTATE(a,5)
185 add tmp3=1,inp // used in unaligned codepath
186 add $h4=$h4,$d };; // wrap up
187
188___
189}
190}
191
192sub BODY_40_59 {
193local *code=shift;
194my ($i,$a,$b,$c,$d,$e)=@_;
195my $j=$i+1;
196my $Xn=@X[$j%16];
197
198$code.=<<___;
199{ .mib; add $e=$e,$K_40_59 // e+=K_40_59
200 dep.z tmp5=$a,5,27 } // a<<5
201{ .mib; and tmp1=$c,$d
202 xor tmp0=$c,$d };;
203{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate
204 add tmp5=tmp5,tmp1 // a<<5+(c&d)
205 extr.u tmp1=$a,27,5 } // a>>27
206{ .mmi; and tmp0=tmp0,$b
207 xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
208 xor tmp3=$X[($j+8)%16],$X[($j+13)%16] };; // forward Xupdate
209{ .mmi; add $e=$e,tmp0 // e+=b&(c^d)
210 add tmp5=tmp5,tmp1 // ROTATE(a,5)+(c&d)
211 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
212{ .mmi; xor $Xn=$Xn,tmp3
213 mux2 tmp6=$a,0x44 };; // see b in next iteration
214{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)+(c&d)
215 shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
216 nop.i 0x0 };;
217
218___
219}
220sub BODY_60_79 { &BODY_20_39(@_,$K_60_79); }
221
222$code.=<<___;
223.text
224
225tmp0=r8;
226tmp1=r9;
227tmp2=r10;
228tmp3=r11;
229ctx=r32; // in0
230inp=r33; // in1
231
232// void sha1_block_data_order(SHA_CTX *c,const void *p,size_t num);
233.global sha1_block_data_order#
234.proc sha1_block_data_order#
235.align 32
236sha1_block_data_order:
237 .prologue
238{ .mmi; alloc tmp1=ar.pfs,3,14,0,0
239 $ADDP tmp0=4,ctx
240 .save ar.lc,r3
241 mov r3=ar.lc }
242{ .mmi; $ADDP ctx=0,ctx
243 $ADDP inp=0,inp
244 mov r2=pr };;
245tmp4=in2;
246tmp5=loc12;
247tmp6=loc13;
248 .body
249{ .mlx; ld4 $h0=[ctx],8
250 movl $K_00_19=0x5a827999 }
251{ .mlx; ld4 $h1=[tmp0],8
252 movl $K_20_39=0x6ed9eba1 };;
253{ .mlx; ld4 $h2=[ctx],8
254 movl $K_40_59=0x8f1bbcdc }
255{ .mlx; ld4 $h3=[tmp0]
256 movl $K_60_79=0xca62c1d6 };;
257{ .mmi; ld4 $h4=[ctx],-16
258 add in2=-1,in2 // adjust num for ar.lc
259 mov ar.ec=1 };;
260{ .mmi; nop.m 0
261 add tmp3=1,inp
262 mov ar.lc=in2 };; // brp.loop.imp: too far
263
264.Ldtop:
265{ .mmi; mov $A=$h0
266 mov $B=$h1
267 mux2 tmp6=$h1,0x44 }
268{ .mmi; mov $C=$h2
269 mov $D=$h3
270 mov $E=$h4 };;
271
272___
273
274{ my $i,@V=($A,$B,$C,$D,$E);
275
276 for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); }
277 for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); }
278 for(;$i<40;$i++) { &BODY_20_39(\$code,$i,@V); unshift(@V,pop(@V)); }
279 for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); }
280 for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); }
281
282 (($V[0] eq $A) and ($V[4] eq $E)) or die; # double-check
283}
284
285$code.=<<___;
286{ .mmb; add $h0=$h0,$A
287 add $h2=$h2,$C
288 br.ctop.dptk.many .Ldtop };;
289.Ldend:
290{ .mmi; add tmp0=4,ctx
291 mov ar.lc=r3 };;
292{ .mmi; st4 [ctx]=$h0,8
293 st4 [tmp0]=$h1,8 };;
294{ .mmi; st4 [ctx]=$h2,8
295 st4 [tmp0]=$h3 };;
296{ .mib; st4 [ctx]=$h4,-16
297 mov pr=r2,0x1ffff
298 br.ret.sptk.many b0 };;
299.endp sha1_block_data_order#
300stringz "SHA1 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
301___
302
303$output=shift and open STDOUT,">$output";
304print $code;
diff --git a/src/lib/libcrypto/sha/asm/sha1-mips.pl b/src/lib/libcrypto/sha/asm/sha1-mips.pl
deleted file mode 100644
index f1a702f38f..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-mips.pl
+++ /dev/null
@@ -1,354 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA1 block procedure for MIPS.
11
12# The performance improvement is 30% on unaligned input. The "secret" is
13# to deploy the lwl/lwr pair to load unaligned input. One could have
14# vectorized Xupdate on MIPS III/IV, but the goal was to code a MIPS32-
15# compatible subroutine. There is room for minor optimization on
16# little-endian platforms...
17
18######################################################################
19# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
20# widely used. Then there is a new contender: NUBI. It appears that if
21# one picks the latter, it's possible to arrange code in an ABI-neutral
22# manner. Therefore let's stick to the NUBI register layout:
23#
24($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
25($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
26($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
27($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
28#
29# The return value is placed in $a0. Following coding rules facilitate
30# interoperability:
31#
32# - never ever touch $tp, "thread pointer", former $gp;
33# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
34# old code];
35# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
36#
37# For reference here is register layout for N32/64 MIPS ABIs:
38#
39# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
40# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
41# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
42# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
43# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
44#
45$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
46
47if ($flavour =~ /64|n32/i) {
48 $PTR_ADD="dadd"; # incidentally works even on n32
49 $PTR_SUB="dsub"; # incidentally works even on n32
50 $REG_S="sd";
51 $REG_L="ld";
52 $PTR_SLL="dsll"; # incidentally works even on n32
53 $SZREG=8;
54} else {
55 $PTR_ADD="add";
56 $PTR_SUB="sub";
57 $REG_S="sw";
58 $REG_L="lw";
59 $PTR_SLL="sll";
60 $SZREG=4;
61}
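# A typical invocation, picking the ABI flavour and naming the output
# file scanned for below (the output file name here is hypothetical):
#
#	perl sha1-mips.pl o32 sha1-mips.S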
62#
63# <appro@openssl.org>
64#
65######################################################################
66
67$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
68
69for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
70open STDOUT,">$output";
71
72if (!defined($big_endian))
73 { $big_endian=(unpack('L',pack('N',1))==1); }
74
75# offsets of the Most and Least Significant Bytes
76$MSB=$big_endian?0:3;
77$LSB=3&~$MSB;
78
79@X=map("\$$_",(8..23)); # a4-a7,s0-s11
80
81$ctx=$a0;
82$inp=$a1;
83$num=$a2;
84$A="\$1";
85$B="\$2";
86$C="\$3";
87$D="\$7";
88$E="\$24"; @V=($A,$B,$C,$D,$E);
89$t0="\$25";
90$t1=$num; # $num is offloaded to stack
91$t2="\$30"; # fp
92$K="\$31"; # ra
93
94sub BODY_00_14 {
95my ($i,$a,$b,$c,$d,$e)=@_;
96my $j=$i+1;
97$code.=<<___ if (!$big_endian);
98 srl $t0,@X[$i],24 # byte swap($i)
99 srl $t1,@X[$i],8
100 andi $t2,@X[$i],0xFF00
101 sll @X[$i],@X[$i],24
102 andi $t1,0xFF00
103 sll $t2,$t2,8
104 or @X[$i],$t0
105 or $t1,$t2
106 or @X[$i],$t1
107___
108$code.=<<___;
109 lwl @X[$j],$j*4+$MSB($inp)
110 sll $t0,$a,5 # $i
111 addu $e,$K
112 lwr @X[$j],$j*4+$LSB($inp)
113 srl $t1,$a,27
114 addu $e,$t0
115 xor $t0,$c,$d
116 addu $e,$t1
117 sll $t2,$b,30
118 and $t0,$b
119 srl $b,$b,2
120 xor $t0,$d
121 addu $e,@X[$i]
122 or $b,$t2
123 addu $e,$t0
124___
125}
126
127sub BODY_15_19 {
128my ($i,$a,$b,$c,$d,$e)=@_;
129my $j=$i+1;
130
131$code.=<<___ if (!$big_endian && $i==15);
132 srl $t0,@X[$i],24 # byte swap($i)
133 srl $t1,@X[$i],8
134 andi $t2,@X[$i],0xFF00
135 sll @X[$i],@X[$i],24
136 andi $t1,0xFF00
137 sll $t2,$t2,8
138 or @X[$i],$t0
139 or @X[$i],$t1
140 or @X[$i],$t2
141___
142$code.=<<___;
143 xor @X[$j%16],@X[($j+2)%16]
144 sll $t0,$a,5 # $i
145 addu $e,$K
146 srl $t1,$a,27
147 addu $e,$t0
148 xor @X[$j%16],@X[($j+8)%16]
149 xor $t0,$c,$d
150 addu $e,$t1
151 xor @X[$j%16],@X[($j+13)%16]
152 sll $t2,$b,30
153 and $t0,$b
154 srl $t1,@X[$j%16],31
155 addu @X[$j%16],@X[$j%16]
156 srl $b,$b,2
157 xor $t0,$d
158 or @X[$j%16],$t1
159 addu $e,@X[$i%16]
160 or $b,$t2
161 addu $e,$t0
162___
163}
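# A note on the rotate in Xupdate above: MIPS32 (pre-R2) has no rotate
# instruction, so ROTATE(x,1) is synthesized as (x<<1)|(x>>>31), with
# "addu x,x" standing in for the left shift by 1.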
164
165sub BODY_20_39 {
166my ($i,$a,$b,$c,$d,$e)=@_;
167my $j=$i+1;
168$code.=<<___ if ($i<79);
169 xor @X[$j%16],@X[($j+2)%16]
170 sll $t0,$a,5 # $i
171 addu $e,$K
172 srl $t1,$a,27
173 addu $e,$t0
174 xor @X[$j%16],@X[($j+8)%16]
175 xor $t0,$c,$d
176 addu $e,$t1
177 xor @X[$j%16],@X[($j+13)%16]
178 sll $t2,$b,30
179 xor $t0,$b
180 srl $t1,@X[$j%16],31
181 addu @X[$j%16],@X[$j%16]
182 srl $b,$b,2
183 addu $e,@X[$i%16]
184 or @X[$j%16],$t1
185 or $b,$t2
186 addu $e,$t0
187___
188$code.=<<___ if ($i==79);
189 lw @X[0],0($ctx)
190 sll $t0,$a,5 # $i
191 addu $e,$K
192 lw @X[1],4($ctx)
193 srl $t1,$a,27
194 addu $e,$t0
195 lw @X[2],8($ctx)
196 xor $t0,$c,$d
197 addu $e,$t1
198 lw @X[3],12($ctx)
199 sll $t2,$b,30
200 xor $t0,$b
201 lw @X[4],16($ctx)
202 srl $b,$b,2
203 addu $e,@X[$i%16]
204 or $b,$t2
205 addu $e,$t0
206___
207}
208
209sub BODY_40_59 {
210my ($i,$a,$b,$c,$d,$e)=@_;
211my $j=$i+1;
212$code.=<<___ if ($i<79);
213 xor @X[$j%16],@X[($j+2)%16]
214 sll $t0,$a,5 # $i
215 addu $e,$K
216 srl $t1,$a,27
217 addu $e,$t0
218 xor @X[$j%16],@X[($j+8)%16]
219 and $t0,$c,$d
220 addu $e,$t1
221 xor @X[$j%16],@X[($j+13)%16]
222 sll $t2,$b,30
223 addu $e,$t0
224 srl $t1,@X[$j%16],31
225 xor $t0,$c,$d
226 addu @X[$j%16],@X[$j%16]
227 and $t0,$b
228 srl $b,$b,2
229 or @X[$j%16],$t1
230 addu $e,@X[$i%16]
231 or $b,$t2
232 addu $e,$t0
233___
234}
235
236$FRAMESIZE=16; # large enough to accommodate NUBI saved registers
237$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
238
239$code=<<___;
240#ifdef OPENSSL_FIPSCANISTER
241# include <openssl/fipssyms.h>
242#endif
243
244.text
245
246.set noat
247.set noreorder
248.align 5
249.globl sha1_block_data_order
250.ent sha1_block_data_order
251sha1_block_data_order:
252 .frame $sp,$FRAMESIZE*$SZREG,$ra
253 .mask $SAVED_REGS_MASK,-$SZREG
254 .set noreorder
255 $PTR_SUB $sp,$FRAMESIZE*$SZREG
256 $REG_S $ra,($FRAMESIZE-1)*$SZREG($sp)
257 $REG_S $fp,($FRAMESIZE-2)*$SZREG($sp)
258 $REG_S $s11,($FRAMESIZE-3)*$SZREG($sp)
259 $REG_S $s10,($FRAMESIZE-4)*$SZREG($sp)
260 $REG_S $s9,($FRAMESIZE-5)*$SZREG($sp)
261 $REG_S $s8,($FRAMESIZE-6)*$SZREG($sp)
262 $REG_S $s7,($FRAMESIZE-7)*$SZREG($sp)
263 $REG_S $s6,($FRAMESIZE-8)*$SZREG($sp)
264 $REG_S $s5,($FRAMESIZE-9)*$SZREG($sp)
265 $REG_S $s4,($FRAMESIZE-10)*$SZREG($sp)
266___
267$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
268 $REG_S $s3,($FRAMESIZE-11)*$SZREG($sp)
269 $REG_S $s2,($FRAMESIZE-12)*$SZREG($sp)
270 $REG_S $s1,($FRAMESIZE-13)*$SZREG($sp)
271 $REG_S $s0,($FRAMESIZE-14)*$SZREG($sp)
272 $REG_S $gp,($FRAMESIZE-15)*$SZREG($sp)
273___
274$code.=<<___;
275 $PTR_SLL $num,6
276 $PTR_ADD $num,$inp
277 $REG_S $num,0($sp)
278 lw $A,0($ctx)
279 lw $B,4($ctx)
280 lw $C,8($ctx)
281 lw $D,12($ctx)
282 b .Loop
283 lw $E,16($ctx)
284.align 4
285.Loop:
286 .set reorder
287 lwl @X[0],$MSB($inp)
288 lui $K,0x5a82
289 lwr @X[0],$LSB($inp)
290 ori $K,0x7999 # K_00_19
291___
292for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); }
293for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); }
294$code.=<<___;
295 lui $K,0x6ed9
296 ori $K,0xeba1 # K_20_39
297___
298for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
299$code.=<<___;
300 lui $K,0x8f1b
301 ori $K,0xbcdc # K_40_59
302___
303for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
304$code.=<<___;
305 lui $K,0xca62
306 ori $K,0xc1d6 # K_60_79
307___
308for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
309$code.=<<___;
310 $PTR_ADD $inp,64
311 $REG_L $num,0($sp)
312
313 addu $A,$X[0]
314 addu $B,$X[1]
315 sw $A,0($ctx)
316 addu $C,$X[2]
317 addu $D,$X[3]
318 sw $B,4($ctx)
319 addu $E,$X[4]
320 sw $C,8($ctx)
321 sw $D,12($ctx)
322 sw $E,16($ctx)
323 .set noreorder
324 bne $inp,$num,.Loop
325 nop
326
327 .set noreorder
328 $REG_L $ra,($FRAMESIZE-1)*$SZREG($sp)
329 $REG_L $fp,($FRAMESIZE-2)*$SZREG($sp)
330 $REG_L $s11,($FRAMESIZE-3)*$SZREG($sp)
331 $REG_L $s10,($FRAMESIZE-4)*$SZREG($sp)
332 $REG_L $s9,($FRAMESIZE-5)*$SZREG($sp)
333 $REG_L $s8,($FRAMESIZE-6)*$SZREG($sp)
334 $REG_L $s7,($FRAMESIZE-7)*$SZREG($sp)
335 $REG_L $s6,($FRAMESIZE-8)*$SZREG($sp)
336 $REG_L $s5,($FRAMESIZE-9)*$SZREG($sp)
337 $REG_L $s4,($FRAMESIZE-10)*$SZREG($sp)
338___
339$code.=<<___ if ($flavour =~ /nubi/i);
340 $REG_L $s3,($FRAMESIZE-11)*$SZREG($sp)
341 $REG_L $s2,($FRAMESIZE-12)*$SZREG($sp)
342 $REG_L $s1,($FRAMESIZE-13)*$SZREG($sp)
343 $REG_L $s0,($FRAMESIZE-14)*$SZREG($sp)
344 $REG_L $gp,($FRAMESIZE-15)*$SZREG($sp)
345___
346$code.=<<___;
347 jr $ra
348 $PTR_ADD $sp,$FRAMESIZE*$SZREG
349.end sha1_block_data_order
350.rdata
351.asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
352___
353print $code;
354close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-parisc.pl b/src/lib/libcrypto/sha/asm/sha1-parisc.pl
deleted file mode 100644
index 6d7bf495b2..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-parisc.pl
+++ /dev/null
@@ -1,259 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA1 block procedure for PA-RISC.
11
12# June 2009.
13#
14# On PA-7100LC performance is >30% better than gcc 3.2 generated code
15# for aligned input and >50% better for unaligned. Compared to the vendor
16# compiler on PA-8600 it's almost 60% faster in the 64-bit build and just
17# a few percent faster in the 32-bit one (this is for aligned input; data
18# for unaligned input is not available).
19#
20# Special thanks to polarhome.com for providing HP-UX account.
21
22$flavour = shift;
23$output = shift;
24open STDOUT,">$output";
25
26if ($flavour =~ /64/) {
27 $LEVEL ="2.0W";
28 $SIZE_T =8;
29 $FRAME_MARKER =80;
30 $SAVED_RP =16;
31 $PUSH ="std";
32 $PUSHMA ="std,ma";
33 $POP ="ldd";
34 $POPMB ="ldd,mb";
35} else {
36 $LEVEL ="1.0";
37 $SIZE_T =4;
38 $FRAME_MARKER =48;
39 $SAVED_RP =20;
40 $PUSH ="stw";
41 $PUSHMA ="stwm";
42 $POP ="ldw";
43 $POPMB ="ldwm";
44}
45
46$FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker
47 # [+ argument transfer]
48$ctx="%r26"; # arg0
49$inp="%r25"; # arg1
50$num="%r24"; # arg2
51
52$t0="%r28";
53$t1="%r29";
54$K="%r31";
55
56@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
57 "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0);
58
59@V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23");
60
61sub BODY_00_19 {
62my ($i,$a,$b,$c,$d,$e)=@_;
63my $j=$i+1;
64$code.=<<___ if ($i<15);
65 addl $K,$e,$e ; $i
66 shd $a,$a,27,$t1
67 addl @X[$i],$e,$e
68 and $c,$b,$t0
69 addl $t1,$e,$e
70 andcm $d,$b,$t1
71 shd $b,$b,2,$b
72 or $t1,$t0,$t0
73 addl $t0,$e,$e
74___
75$code.=<<___ if ($i>=15); # with forward Xupdate
76 addl $K,$e,$e ; $i
77 shd $a,$a,27,$t1
78 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
79 addl @X[$i%16],$e,$e
80 and $c,$b,$t0
81 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
82 addl $t1,$e,$e
83 andcm $d,$b,$t1
84 shd $b,$b,2,$b
85 or $t1,$t0,$t0
86 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
87 add $t0,$e,$e
88 shd @X[$j%16],@X[$j%16],31,@X[$j%16]
89___
90}
91
92sub BODY_20_39 {
93my ($i,$a,$b,$c,$d,$e)=@_;
94my $j=$i+1;
95$code.=<<___ if ($i<79);
96 xor @X[($j+2)%16],@X[$j%16],@X[$j%16] ; $i
97 addl $K,$e,$e
98 shd $a,$a,27,$t1
99 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
100 addl @X[$i%16],$e,$e
101 xor $b,$c,$t0
102 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
103 addl $t1,$e,$e
104 shd $b,$b,2,$b
105 xor $d,$t0,$t0
106 shd @X[$j%16],@X[$j%16],31,@X[$j%16]
107 addl $t0,$e,$e
108___
109$code.=<<___ if ($i==79); # with context load
110 ldw 0($ctx),@X[0] ; $i
111 addl $K,$e,$e
112 shd $a,$a,27,$t1
113 ldw 4($ctx),@X[1]
114 addl @X[$i%16],$e,$e
115 xor $b,$c,$t0
116 ldw 8($ctx),@X[2]
117 addl $t1,$e,$e
118 shd $b,$b,2,$b
119 xor $d,$t0,$t0
120 ldw 12($ctx),@X[3]
121 addl $t0,$e,$e
122 ldw 16($ctx),@X[4]
123___
124}
125
126sub BODY_40_59 {
127my ($i,$a,$b,$c,$d,$e)=@_;
128my $j=$i+1;
129$code.=<<___;
130 shd $a,$a,27,$t1 ; $i
131 addl $K,$e,$e
132 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
133 xor $d,$c,$t0
134 addl @X[$i%16],$e,$e
135 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
136 and $b,$t0,$t0
137 addl $t1,$e,$e
138 shd $b,$b,2,$b
139 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
140 addl $t0,$e,$e
141 and $d,$c,$t1
142 shd @X[$j%16],@X[$j%16],31,@X[$j%16]
143 addl $t1,$e,$e
144___
145}
146
147$code=<<___;
148 .LEVEL $LEVEL
149 .SPACE \$TEXT\$
150 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
151
152 .EXPORT sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
153sha1_block_data_order
154 .PROC
155 .CALLINFO FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16
156 .ENTRY
157 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
158 $PUSHMA %r3,$FRAME(%sp)
159 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
160 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
161 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
162 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
163 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
164 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
165 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
166 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
167 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
168 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
169 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
170 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
171 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
172
173 ldw 0($ctx),$A
174 ldw 4($ctx),$B
175 ldw 8($ctx),$C
176 ldw 12($ctx),$D
177 ldw 16($ctx),$E
178
179 extru $inp,31,2,$t0 ; t0=inp&3;
180 sh3addl $t0,%r0,$t0 ; t0*=8;
181 subi 32,$t0,$t0 ; t0=32-t0;
182 mtctl $t0,%cr11 ; %sar=t0;
183
184L\$oop
185 ldi 3,$t0
186 andcm $inp,$t0,$t0 ; 64-bit neutral
187___
188 for ($i=0;$i<15;$i++) { # load input block
189 $code.="\tldw `4*$i`($t0),@X[$i]\n"; }
190$code.=<<___;
191 cmpb,*= $inp,$t0,L\$aligned
192 ldw 60($t0),@X[15]
193 ldw 64($t0),@X[16]
194___
195 for ($i=0;$i<16;$i++) { # align input
196 $code.="\tvshd @X[$i],@X[$i+1],@X[$i]\n"; }
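# vshd funnel-shifts the 64-bit pair formed by two adjacent words by the
# bit count pre-loaded into %sar (the mtctl above), so a block fetched
# from the rounded-down address is realigned entirely in registers.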
197$code.=<<___;
198L\$aligned
199 ldil L'0x5a827000,$K ; K_00_19
200 ldo 0x999($K),$K
201___
202for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
203$code.=<<___;
204 ldil L'0x6ed9e000,$K ; K_20_39
205 ldo 0xba1($K),$K
206___
207
208for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
209$code.=<<___;
210 ldil L'0x8f1bb000,$K ; K_40_59
211 ldo 0xcdc($K),$K
212___
213
214for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
215$code.=<<___;
216 ldil L'0xca62c000,$K ; K_60_79
217 ldo 0x1d6($K),$K
218___
219for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
220
221$code.=<<___;
222 addl @X[0],$A,$A
223 addl @X[1],$B,$B
224 addl @X[2],$C,$C
225 addl @X[3],$D,$D
226 addl @X[4],$E,$E
227 stw $A,0($ctx)
228 stw $B,4($ctx)
229 stw $C,8($ctx)
230 stw $D,12($ctx)
231 stw $E,16($ctx)
232 addib,*<> -1,$num,L\$oop
233 ldo 64($inp),$inp
234
235 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
236 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
237 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
238 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
239 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
240 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
241 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
242 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
243 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
244 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
245 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
246 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
247 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
248 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
249 bv (%r2)
250 .EXIT
251 $POPMB -$FRAME(%sp),%r3
252 .PROCEND
253 .STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
254___
255
256$code =~ s/\`([^\`]*)\`/eval $1/gem;
257$code =~ s/,\*/,/gm if ($SIZE_T==4);
258print $code;
259close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-ppc.pl b/src/lib/libcrypto/sha/asm/sha1-ppc.pl
deleted file mode 100755
index 2140dd2f8d..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-ppc.pl
+++ /dev/null
@@ -1,326 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# I let hardware handle unaligned input(*), except on page boundaries
11# (see below for details). Otherwise it is a straightforward implementation
12# with the X vector in the register bank. The module is big-endian [which
13# is no big deal, as there are no little-endian targets left around].
14#
15# (*) this means that this module is inappropriate for PPC403? Does
16# anybody know if pre-POWER3 can sustain unaligned load?
17
18# -m64 -m32
19# ----------------------------------
20# PPC970,gcc-4.0.0 +76% +59%
21# Power6,xlc-7 +68% +33%
22
23$flavour = shift;
24
25if ($flavour =~ /64/) {
26 $SIZE_T =8;
27 $LRSAVE =2*$SIZE_T;
28 $UCMP ="cmpld";
29 $STU ="stdu";
30 $POP ="ld";
31 $PUSH ="std";
32} elsif ($flavour =~ /32/) {
33 $SIZE_T =4;
34 $LRSAVE =$SIZE_T;
35 $UCMP ="cmplw";
36 $STU ="stwu";
37 $POP ="lwz";
38 $PUSH ="stw";
39} else { die "nonsense $flavour"; }
40
41$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
42( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
43( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
44die "can't locate ppc-xlate.pl";
45
46open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
47
48$FRAME=24*$SIZE_T+64;
49$LOCALS=6*$SIZE_T;
50
51$K ="r0";
52$sp ="r1";
53$toc="r2";
54$ctx="r3";
55$inp="r4";
56$num="r5";
57$t0 ="r15";
58$t1 ="r6";
59
60$A ="r7";
61$B ="r8";
62$C ="r9";
63$D ="r10";
64$E ="r11";
65$T ="r12";
66
67@V=($A,$B,$C,$D,$E,$T);
68@X=("r16","r17","r18","r19","r20","r21","r22","r23",
69 "r24","r25","r26","r27","r28","r29","r30","r31");
70
71sub BODY_00_19 {
72my ($i,$a,$b,$c,$d,$e,$f)=@_;
73my $j=$i+1;
74$code.=<<___ if ($i==0);
75 lwz @X[$i],`$i*4`($inp)
76___
77$code.=<<___ if ($i<15);
78 lwz @X[$j],`$j*4`($inp)
79 add $f,$K,$e
80 rotlwi $e,$a,5
81 add $f,$f,@X[$i]
82 and $t0,$c,$b
83 add $f,$f,$e
84 andc $t1,$d,$b
85 rotlwi $b,$b,30
86 or $t0,$t0,$t1
87 add $f,$f,$t0
88___
89$code.=<<___ if ($i>=15);
90 add $f,$K,$e
91 rotlwi $e,$a,5
92 xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
93 add $f,$f,@X[$i%16]
94 and $t0,$c,$b
95 xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
96 add $f,$f,$e
97 andc $t1,$d,$b
98 rotlwi $b,$b,30
99 or $t0,$t0,$t1
100 xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
101 add $f,$f,$t0
102 rotlwi @X[$j%16],@X[$j%16],1
103___
104}
105
106sub BODY_20_39 {
107my ($i,$a,$b,$c,$d,$e,$f)=@_;
108my $j=$i+1;
109$code.=<<___ if ($i<79);
110 add $f,$K,$e
111 rotlwi $e,$a,5
112 xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
113 add $f,$f,@X[$i%16]
114 xor $t0,$b,$c
115 xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
116 add $f,$f,$e
117 rotlwi $b,$b,30
118 xor $t0,$t0,$d
119 xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
120 add $f,$f,$t0
121 rotlwi @X[$j%16],@X[$j%16],1
122___
123$code.=<<___ if ($i==79);
124 add $f,$K,$e
125 rotlwi $e,$a,5
126 lwz r16,0($ctx)
127 add $f,$f,@X[$i%16]
128 xor $t0,$b,$c
129 lwz r17,4($ctx)
130 add $f,$f,$e
131 rotlwi $b,$b,30
132 lwz r18,8($ctx)
133 xor $t0,$t0,$d
134 lwz r19,12($ctx)
135 add $f,$f,$t0
136 lwz r20,16($ctx)
137___
138}
139
140sub BODY_40_59 {
141my ($i,$a,$b,$c,$d,$e,$f)=@_;
142my $j=$i+1;
143$code.=<<___;
144 add $f,$K,$e
145 rotlwi $e,$a,5
146 xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
147 add $f,$f,@X[$i%16]
148 and $t0,$b,$c
149 xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
150 add $f,$f,$e
151 or $t1,$b,$c
152 rotlwi $b,$b,30
153 xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
154 and $t1,$t1,$d
155 or $t0,$t0,$t1
156 rotlwi @X[$j%16],@X[$j%16],1
157 add $f,$f,$t0
158___
159}
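# F_40_59 is the majority function; computing it as (b&c)|(d&(b|c))
# instead of the canonical (b&c)|(b&d)|(c&d) saves one logical operation
# per round.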
160
161$code=<<___;
162.machine "any"
163.text
164
165.globl .sha1_block_data_order
166.align 4
167.sha1_block_data_order:
168 $STU $sp,-$FRAME($sp)
169 mflr r0
170 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
171 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
172 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
173 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
174 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
175 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
176 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
177 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
178 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
179 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
180 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
181 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
182 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
183 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
184 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
185 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
186 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
187 $PUSH r0,`$FRAME+$LRSAVE`($sp)
188 lwz $A,0($ctx)
189 lwz $B,4($ctx)
190 lwz $C,8($ctx)
191 lwz $D,12($ctx)
192 lwz $E,16($ctx)
193 andi. r0,$inp,3
194 bne Lunaligned
195Laligned:
196 mtctr $num
197 bl Lsha1_block_private
198 b Ldone
199
200; The PowerPC specification allows an implementation to be ill-behaved
201; upon an unaligned access which crosses a page boundary. The "better
202; safe than sorry" principle makes me treat this case specially. I don't
203; look for the particular offending word, but rather for a 64-byte input
204; block which crosses the boundary. Once found, that block is aligned
205; and hashed separately...
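; As a worked example: with 4096-byte pages and $inp ending in 0xfc0,
; subfic below yields 64 bytes to the page end, so srwi gives exactly
; one whole 64-byte block that can be hashed in place before the
; copy-and-align slow path takes over.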
206.align 4
207Lunaligned:
208 subfic $t1,$inp,4096
209 andi. $t1,$t1,4095 ; distance to closest page boundary
210 srwi. $t1,$t1,6 ; t1/=64
211 beq Lcross_page
212 $UCMP $num,$t1
213 ble- Laligned ; didn't cross the page boundary
214 mtctr $t1
215 subfc $num,$t1,$num
216 bl Lsha1_block_private
217Lcross_page:
218 li $t1,16
219 mtctr $t1
220 addi r20,$sp,$LOCALS ; spot within the frame
221Lmemcpy:
222 lbz r16,0($inp)
223 lbz r17,1($inp)
224 lbz r18,2($inp)
225 lbz r19,3($inp)
226 addi $inp,$inp,4
227 stb r16,0(r20)
228 stb r17,1(r20)
229 stb r18,2(r20)
230 stb r19,3(r20)
231 addi r20,r20,4
232 bdnz Lmemcpy
233
234 $PUSH $inp,`$FRAME-$SIZE_T*18`($sp)
235 li $t1,1
236 addi $inp,$sp,$LOCALS
237 mtctr $t1
238 bl Lsha1_block_private
239 $POP $inp,`$FRAME-$SIZE_T*18`($sp)
240 addic. $num,$num,-1
241 bne- Lunaligned
242
243Ldone:
244 $POP r0,`$FRAME+$LRSAVE`($sp)
245 $POP r15,`$FRAME-$SIZE_T*17`($sp)
246 $POP r16,`$FRAME-$SIZE_T*16`($sp)
247 $POP r17,`$FRAME-$SIZE_T*15`($sp)
248 $POP r18,`$FRAME-$SIZE_T*14`($sp)
249 $POP r19,`$FRAME-$SIZE_T*13`($sp)
250 $POP r20,`$FRAME-$SIZE_T*12`($sp)
251 $POP r21,`$FRAME-$SIZE_T*11`($sp)
252 $POP r22,`$FRAME-$SIZE_T*10`($sp)
253 $POP r23,`$FRAME-$SIZE_T*9`($sp)
254 $POP r24,`$FRAME-$SIZE_T*8`($sp)
255 $POP r25,`$FRAME-$SIZE_T*7`($sp)
256 $POP r26,`$FRAME-$SIZE_T*6`($sp)
257 $POP r27,`$FRAME-$SIZE_T*5`($sp)
258 $POP r28,`$FRAME-$SIZE_T*4`($sp)
259 $POP r29,`$FRAME-$SIZE_T*3`($sp)
260 $POP r30,`$FRAME-$SIZE_T*2`($sp)
261 $POP r31,`$FRAME-$SIZE_T*1`($sp)
262 mtlr r0
263 addi $sp,$sp,$FRAME
264 blr
265 .long 0
266 .byte 0,12,4,1,0x80,18,3,0
267 .long 0
268___
269
270# This is a private block function, which uses a tailored calling
271# interface: upon entry the SHA_CTX is pre-loaded into the given
272# registers and the counter register contains the number of chunks
273# to digest...
274$code.=<<___;
275.align 4
276Lsha1_block_private:
277___
278$code.=<<___; # load K_00_19
279 lis $K,0x5a82
280 ori $K,$K,0x7999
281___
282for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
283$code.=<<___; # load K_20_39
284 lis $K,0x6ed9
285 ori $K,$K,0xeba1
286___
287for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
288$code.=<<___; # load K_40_59
289 lis $K,0x8f1b
290 ori $K,$K,0xbcdc
291___
292for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
293$code.=<<___; # load K_60_79
294 lis $K,0xca62
295 ori $K,$K,0xc1d6
296___
297for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
298$code.=<<___;
299 add r16,r16,$E
300 add r17,r17,$T
301 add r18,r18,$A
302 add r19,r19,$B
303 add r20,r20,$C
304 stw r16,0($ctx)
305 mr $A,r16
306 stw r17,4($ctx)
307 mr $B,r17
308 stw r18,8($ctx)
309 mr $C,r18
310 stw r19,12($ctx)
311 mr $D,r19
312 stw r20,16($ctx)
313 mr $E,r20
314 addi $inp,$inp,`16*4`
315 bdnz- Lsha1_block_private
316 blr
317 .long 0
318 .byte 0,12,0x14,0,0,0,0,0
319___
320$code.=<<___;
321.asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
322___
323
324$code =~ s/\`([^\`]*)\`/eval $1/gem;
325print $code;
326close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-s390x.pl b/src/lib/libcrypto/sha/asm/sha1-s390x.pl
deleted file mode 100644
index 9193dda45e..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-s390x.pl
+++ /dev/null
@@ -1,246 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA1 block procedure for s390x.
11
12# April 2007.
13#
14# Performance is >30% better than gcc 3.3 generated code. But the real
15# twist is that SHA1 hardware support is detected and utilized, in which
16# case performance improves by a further >4.5x for larger chunks.
17
18# January 2009.
19#
20# Optimize Xupdate for the number of memory references and reschedule
21# instructions to favour the dual-issue z10 pipeline. On z10, hardware is
22# "only" ~2.3x faster than software.
23
24# November 2010.
25#
26# Adapt for -m31 build. If the kernel supports what's called the
27# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
28# 64-bit instructions and achieve "64-bit" performance even in a 31-bit
29# legacy application context. The feature is not specific to any
30# particular processor, as long as it's a "z-CPU". The latter implies
31# that the code remains z/Architecture-specific.
32
33$kimdfunc=1; # magic function code for kimd instruction
34
35$flavour = shift;
36
37if ($flavour =~ /3[12]/) {
38 $SIZE_T=4;
39 $g="";
40} else {
41 $SIZE_T=8;
42 $g="g";
43}
44
45while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
46open STDOUT,">$output";
47
48$K_00_39="%r0"; $K=$K_00_39;
49$K_40_79="%r1";
50$ctx="%r2"; $prefetch="%r2";
51$inp="%r3";
52$len="%r4";
53
54$A="%r5";
55$B="%r6";
56$C="%r7";
57$D="%r8";
58$E="%r9"; @V=($A,$B,$C,$D,$E);
59$t0="%r10";
60$t1="%r11";
61@X=("%r12","%r13","%r14");
62$sp="%r15";
63
64$stdframe=16*$SIZE_T+4*8;
65$frame=$stdframe+16*4;
66
67sub Xupdate {
68my $i=shift;
69
70$code.=<<___ if ($i==15);
71 lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up
72 lr $X[0],$X[2]
73___
74return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle
75$code.=<<___ if ($i<16);
76 lg $X[0],`$i*4`($inp) ### Xload($i)
77 rllg $X[1],$X[0],32
78___
79$code.=<<___ if ($i>=16);
80 xgr $X[0],$prefetch ### Xupdate($i)
81 lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp)
82 xg $X[0],`$stdframe+4*(($i+8)%16)`($sp)
83 xgr $X[0],$prefetch
84 rll $X[0],$X[0],1
85 rllg $X[1],$X[0],32
86 rll $X[1],$X[1],1
87 rllg $X[0],$X[1],32
88 lr $X[2],$X[1] # feedback
89___
90$code.=<<___ if ($i<=70);
91 stg $X[0],`$stdframe+4*($i%16)`($sp)
92___
93unshift(@X,pop(@X));
94}
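# Xupdate keeps two 32-bit X[] words per 64-bit register, which is why it
# runs only on even rounds: one xgr/xg pass updates a pair at a time, and
# the rll/rllg juggling rotates each 32-bit half left by 1 without
# disturbing its neighbour.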
95
96sub BODY_00_19 {
97my ($i,$a,$b,$c,$d,$e)=@_;
98my $xi=$X[1];
99
100 &Xupdate($i);
101$code.=<<___;
102 alr $e,$K ### $i
103 rll $t1,$a,5
104 lr $t0,$d
105 xr $t0,$c
106 alr $e,$t1
107 nr $t0,$b
108 alr $e,$xi
109 xr $t0,$d
110 rll $b,$b,30
111 alr $e,$t0
112___
113}
114
115sub BODY_20_39 {
116my ($i,$a,$b,$c,$d,$e)=@_;
117my $xi=$X[1];
118
119 &Xupdate($i);
120$code.=<<___;
121 alr $e,$K ### $i
122 rll $t1,$a,5
123 lr $t0,$b
124 alr $e,$t1
125 xr $t0,$c
126 alr $e,$xi
127 xr $t0,$d
128 rll $b,$b,30
129 alr $e,$t0
130___
131}
132
133sub BODY_40_59 {
134my ($i,$a,$b,$c,$d,$e)=@_;
135my $xi=$X[1];
136
137 &Xupdate($i);
138$code.=<<___;
139 alr $e,$K ### $i
140 rll $t1,$a,5
141 lr $t0,$b
142 alr $e,$t1
143 or $t0,$c
144 lr $t1,$b
145 nr $t0,$d
146 nr $t1,$c
147 alr $e,$xi
148 or $t0,$t1
149 rll $b,$b,30
150 alr $e,$t0
151___
152}
153
154$code.=<<___;
155.text
156.align 64
157.type Ktable,\@object
158Ktable: .long 0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
159 .skip 48 #.long 0,0,0,0,0,0,0,0,0,0,0,0
160.size Ktable,.-Ktable
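# Two round constants are packed per 64-bit register; the rllg-by-32
# before each 20-round stretch below swaps halves so that the constant
# currently in use sits in the low word consumed by alr.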
161.globl sha1_block_data_order
162.type sha1_block_data_order,\@function
163sha1_block_data_order:
164___
165$code.=<<___ if ($kimdfunc);
166 larl %r1,OPENSSL_s390xcap_P
167 lg %r0,0(%r1)
168 tmhl %r0,0x4000 # check for message-security assist
169 jz .Lsoftware
170 lghi %r0,0
171 la %r1,`2*$SIZE_T`($sp)
172 .long 0xb93e0002 # kimd %r0,%r2
173 lg %r0,`2*$SIZE_T`($sp)
174 tmhh %r0,`0x8000>>$kimdfunc`
175 jz .Lsoftware
176 lghi %r0,$kimdfunc
177 lgr %r1,$ctx
178 lgr %r2,$inp
179 sllg %r3,$len,6
180 .long 0xb93e0002 # kimd %r0,%r2
181 brc 1,.-4 # pay attention to "partial completion"
182 br %r14
183.align 16
184.Lsoftware:
185___
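# A note on the hardware path above: KIMD with function code 0 is the
# query function, and the tmhh test checks the capability bit for
# function code $kimdfunc (1 = KIMD-SHA-1). The "brc 1,.-4" loop
# re-executes KIMD while condition code 3 signals partial completion.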
186$code.=<<___;
187 lghi %r1,-$frame
188 st${g} $ctx,`2*$SIZE_T`($sp)
189 stm${g} %r6,%r15,`6*$SIZE_T`($sp)
190 lgr %r0,$sp
191 la $sp,0(%r1,$sp)
192 st${g} %r0,0($sp)
193
194 larl $t0,Ktable
195 llgf $A,0($ctx)
196 llgf $B,4($ctx)
197 llgf $C,8($ctx)
198 llgf $D,12($ctx)
199 llgf $E,16($ctx)
200
201 lg $K_00_39,0($t0)
202 lg $K_40_79,8($t0)
203
204.Lloop:
205 rllg $K_00_39,$K_00_39,32
206___
207for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
208$code.=<<___;
209 rllg $K_00_39,$K_00_39,32
210___
211for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
212$code.=<<___; $K=$K_40_79;
213 rllg $K_40_79,$K_40_79,32
214___
215for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
216$code.=<<___;
217 rllg $K_40_79,$K_40_79,32
218___
219for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
220$code.=<<___;
221
222 l${g} $ctx,`$frame+2*$SIZE_T`($sp)
223 la $inp,64($inp)
224 al $A,0($ctx)
225 al $B,4($ctx)
226 al $C,8($ctx)
227 al $D,12($ctx)
228 al $E,16($ctx)
229 st $A,0($ctx)
230 st $B,4($ctx)
231 st $C,8($ctx)
232 st $D,12($ctx)
233 st $E,16($ctx)
234 brct${g} $len,.Lloop
235
236 lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
237 br %r14
238.size sha1_block_data_order,.-sha1_block_data_order
239.string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
240.comm OPENSSL_s390xcap_P,16,8
241___
242
243$code =~ s/\`([^\`]*)\`/eval $1/gem;
244
245print $code;
246close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl
deleted file mode 100644
index 5c161cecd6..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl
+++ /dev/null
@@ -1,284 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# The performance improvement is not really impressive on pre-T1 CPUs:
11# +8% over Sun C and +25% over gcc [3.3]. On T1, a.k.a. Niagara, however,
12# it turned out to be 40% faster than 64-bit code generated by Sun C 5.8
13# and >2x faster than 64-bit code from gcc 3.4. And there is a gimmick:
14# the X[16] vector is packed into 8 64-bit registers, and as a result
15# nothing is spilled on the stack. In addition, input data is loaded in a
16# compact instruction sequence, minimizing the window in which the code
17# is subject to the [inter-thread] cache-thrashing hazard. The goal is to
18# ensure scalability on UltraSPARC T1, or rather to avoid decay when the
19# number of active threads exceeds the number of physical cores.
20
21$bits=32;
22for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
23if ($bits==64) { $bias=2047; $frame=192; }
24else { $bias=0; $frame=112; }
25
26$output=shift;
27open STDOUT,">$output";
28
29@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
30$rot1m="%g2";
31$tmp64="%g3";
32$Xi="%g4";
33$A="%l0";
34$B="%l1";
35$C="%l2";
36$D="%l3";
37$E="%l4";
38@V=($A,$B,$C,$D,$E);
39$K_00_19="%l5";
40$K_20_39="%l6";
41$K_40_59="%l7";
42$K_60_79="%g5";
43@K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);
44
45$ctx="%i0";
46$inp="%i1";
47$len="%i2";
48$tmp0="%i3";
49$tmp1="%i4";
50$tmp2="%i5";
51
52sub BODY_00_15 {
53my ($i,$a,$b,$c,$d,$e)=@_;
54my $xi=($i&1)?@X[($i/2)%8]:$Xi;
55
56$code.=<<___;
57 sll $a,5,$tmp0 !! $i
58 add @K[$i/20],$e,$e
59 srl $a,27,$tmp1
60 add $tmp0,$e,$e
61 and $c,$b,$tmp0
62 add $tmp1,$e,$e
63 sll $b,30,$tmp2
64 andn $d,$b,$tmp1
65 srl $b,2,$b
66 or $tmp1,$tmp0,$tmp1
67 or $tmp2,$b,$b
68 add $xi,$e,$e
69___
70if ($i&1 && $i<15) {
71 $code.=
72 " srlx @X[(($i+1)/2)%8],32,$Xi\n";
73}
74$code.=<<___;
75 add $tmp1,$e,$e
76___
77}
78
79sub Xupdate {
80my ($i,$a,$b,$c,$d,$e)=@_;
81my $j=$i/2;
82
83if ($i&1) {
84$code.=<<___;
85 sll $a,5,$tmp0 !! $i
86 add @K[$i/20],$e,$e
87 srl $a,27,$tmp1
88___
89} else {
90$code.=<<___;
91 sllx @X[($j+6)%8],32,$Xi ! Xupdate($i)
92 xor @X[($j+1)%8],@X[$j%8],@X[$j%8]
93 srlx @X[($j+7)%8],32,$tmp1
94 xor @X[($j+4)%8],@X[$j%8],@X[$j%8]
95 sll $a,5,$tmp0 !! $i
96 or $tmp1,$Xi,$Xi
97 add @K[$i/20],$e,$e !!
98 xor $Xi,@X[$j%8],@X[$j%8]
99 srlx @X[$j%8],31,$Xi
100 add @X[$j%8],@X[$j%8],@X[$j%8]
101 and $Xi,$rot1m,$Xi
102 andn @X[$j%8],$rot1m,@X[$j%8]
103 srl $a,27,$tmp1 !!
104 or $Xi,@X[$j%8],@X[$j%8]
105___
106}
107}
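# The packed rotate-by-1 above works on both 32-bit halves at once:
# "add x,x,x" doubles the whole 64-bit register, $rot1m (bits 0 and 32
# set) is used to mask off the bits that would leak across the half-word
# boundary, and the srlx/and pair reinserts each half's own carried-out
# bit 31 as its new bit 0.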
108
109sub BODY_16_19 {
110my ($i,$a,$b,$c,$d,$e)=@_;
111
112 &Xupdate(@_);
113 if ($i&1) {
114 $xi=@X[($i/2)%8];
115 } else {
116 $xi=$Xi;
117 $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
118 }
119$code.=<<___;
120 add $tmp0,$e,$e !!
121 and $c,$b,$tmp0
122 add $tmp1,$e,$e
123 sll $b,30,$tmp2
124 add $xi,$e,$e
125 andn $d,$b,$tmp1
126 srl $b,2,$b
127 or $tmp1,$tmp0,$tmp1
128 or $tmp2,$b,$b
129 add $tmp1,$e,$e
130___
131}
132
133sub BODY_20_39 {
134my ($i,$a,$b,$c,$d,$e)=@_;
135my $xi;
136 &Xupdate(@_);
137 if ($i&1) {
138 $xi=@X[($i/2)%8];
139 } else {
140 $xi=$Xi;
141 $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
142 }
143$code.=<<___;
144 add $tmp0,$e,$e !!
145 xor $c,$b,$tmp0
146 add $tmp1,$e,$e
147 sll $b,30,$tmp2
148 xor $d,$tmp0,$tmp1
149 srl $b,2,$b
150 add $tmp1,$e,$e
151 or $tmp2,$b,$b
152 add $xi,$e,$e
153___
154}
155
156sub BODY_40_59 {
157my ($i,$a,$b,$c,$d,$e)=@_;
158my $xi;
159 &Xupdate(@_);
160 if ($i&1) {
161 $xi=@X[($i/2)%8];
162 } else {
163 $xi=$Xi;
164 $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
165 }
166$code.=<<___;
167 add $tmp0,$e,$e !!
168 and $c,$b,$tmp0
169 add $tmp1,$e,$e
170 sll $b,30,$tmp2
171 or $c,$b,$tmp1
172 srl $b,2,$b
173 and $d,$tmp1,$tmp1
174 add $xi,$e,$e
175 or $tmp1,$tmp0,$tmp1
176 or $tmp2,$b,$b
177 add $tmp1,$e,$e
178___
179}
180
181$code.=<<___ if ($bits==64);
182.register %g2,#scratch
183.register %g3,#scratch
184___
185$code.=<<___;
186.section ".text",#alloc,#execinstr
187
188.align 32
189.globl sha1_block_data_order
190sha1_block_data_order:
191 save %sp,-$frame,%sp
192 sllx $len,6,$len
193 add $inp,$len,$len
194
195 or %g0,1,$rot1m
196 sllx $rot1m,32,$rot1m
197 or $rot1m,1,$rot1m
198
199 ld [$ctx+0],$A
200 ld [$ctx+4],$B
201 ld [$ctx+8],$C
202 ld [$ctx+12],$D
203 ld [$ctx+16],$E
204 andn $inp,7,$tmp0
205
206 sethi %hi(0x5a827999),$K_00_19
207 or $K_00_19,%lo(0x5a827999),$K_00_19
208 sethi %hi(0x6ed9eba1),$K_20_39
209 or $K_20_39,%lo(0x6ed9eba1),$K_20_39
210 sethi %hi(0x8f1bbcdc),$K_40_59
211 or $K_40_59,%lo(0x8f1bbcdc),$K_40_59
212 sethi %hi(0xca62c1d6),$K_60_79
213 or $K_60_79,%lo(0xca62c1d6),$K_60_79
214
215.Lloop:
216 ldx [$tmp0+0],@X[0]
217 ldx [$tmp0+16],@X[2]
218 ldx [$tmp0+32],@X[4]
219 ldx [$tmp0+48],@X[6]
220 and $inp,7,$tmp1
221 ldx [$tmp0+8],@X[1]
222 sll $tmp1,3,$tmp1
223 ldx [$tmp0+24],@X[3]
224 subcc %g0,$tmp1,$tmp2 ! should be 64-$tmp1, but -$tmp1 works too
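	! (sllx/srlx use only the low 6 bits of their count, so -$tmp1
	! is congruent to 64-$tmp1 and shifts by the same amount)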
225 ldx [$tmp0+40],@X[5]
226 bz,pt %icc,.Laligned
227 ldx [$tmp0+56],@X[7]
228
229 sllx @X[0],$tmp1,@X[0]
230 ldx [$tmp0+64],$tmp64
231___
232for($i=0;$i<7;$i++)
233{ $code.=<<___;
234 srlx @X[$i+1],$tmp2,$Xi
235 sllx @X[$i+1],$tmp1,@X[$i+1]
236 or $Xi,@X[$i],@X[$i]
237___
238}
239$code.=<<___;
240 srlx $tmp64,$tmp2,$tmp64
241 or $tmp64,@X[7],@X[7]
242.Laligned:
243 srlx @X[0],32,$Xi
244___
245for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
246for (;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
247for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
248for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
249for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
250$code.=<<___;
251
252 ld [$ctx+0],@X[0]
253 ld [$ctx+4],@X[1]
254 ld [$ctx+8],@X[2]
255 ld [$ctx+12],@X[3]
256 add $inp,64,$inp
257 ld [$ctx+16],@X[4]
258 cmp $inp,$len
259
260 add $A,@X[0],$A
261 st $A,[$ctx+0]
262 add $B,@X[1],$B
263 st $B,[$ctx+4]
264 add $C,@X[2],$C
265 st $C,[$ctx+8]
266 add $D,@X[3],$D
267 st $D,[$ctx+12]
268 add $E,@X[4],$E
269 st $E,[$ctx+16]
270
271 bne `$bits==64?"%xcc":"%icc"`,.Lloop
272 andn $inp,7,$tmp0
273
274 ret
275 restore
276.type sha1_block_data_order,#function
277.size sha1_block_data_order,(.-sha1_block_data_order)
278.asciz "SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
279.align 4
280___
281
282$code =~ s/\`([^\`]*)\`/eval $1/gem;
283print $code;
284close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl b/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl
deleted file mode 100644
index 85e8d68086..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl
+++ /dev/null
@@ -1,601 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# January 2009
11#
12# Provided that UltraSPARC VIS instructions are pipe-lined(*) and
13# pairable(*) with IALU ones, offloading of Xupdate to the UltraSPARC
14# Graphic Unit would make it possible to achieve higher instruction-
15# level parallelism, ILP, and thus higher performance. It should be
16# explicitly noted that ILP is the keyword, and it means that this
17# code would be unsuitable for cores like UltraSPARC-Tx. The idea is
18# not really novel; Sun has had a VIS-powered implementation for a while.
19# Unlike Sun's implementation this one can process multiple unaligned
20# input blocks, and as such works as a drop-in replacement for OpenSSL's
21# sha1_block_data_order. The performance improvement was measured to be
22# 40% over pure IALU sha1-sparcv9.pl on UltraSPARC-IIi, but 12% on
23# UltraSPARC-III. See below for discussion...
24#
25# The module is not of direct interest for OpenSSL, because it doesn't
26# provide better performance on contemporary SPARCv9 CPUs,
27# UltraSPARC-Tx and SPARC64-V[II] to be specific. Those who feel they
28# absolutely must score on UltraSPARC I-IV can simply replace
29# crypto/sha/asm/sha1-sparcv9.pl with this module.
30#
31# (*) "Pipe-lined" means that even if it takes several cycles to
32# complete, next instruction using same functional unit [but not
33# depending on the result of the current instruction] can start
34# execution without having to wait for the unit. "Pairable"
35# means that two [or more] independent instructions can be
36# issued at the very same time.
37
38$bits=32;
39for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
40if ($bits==64) { $bias=2047; $frame=192; }
41else { $bias=0; $frame=112; }
42
43$output=shift;
44open STDOUT,">$output";
45
46$ctx="%i0";
47$inp="%i1";
48$len="%i2";
49$tmp0="%i3";
50$tmp1="%i4";
51$tmp2="%i5";
52$tmp3="%g5";
53
54$base="%g1";
55$align="%g4";
56$Xfer="%o5";
57$nXfer=$tmp3;
58$Xi="%o7";
59
60$A="%l0";
61$B="%l1";
62$C="%l2";
63$D="%l3";
64$E="%l4";
65@V=($A,$B,$C,$D,$E);
66
67$Actx="%o0";
68$Bctx="%o1";
69$Cctx="%o2";
70$Dctx="%o3";
71$Ectx="%o4";
72
73$fmul="%f32";
74$VK_00_19="%f34";
75$VK_20_39="%f36";
76$VK_40_59="%f38";
77$VK_60_79="%f40";
78@VK=($VK_00_19,$VK_20_39,$VK_40_59,$VK_60_79);
79@X=("%f0", "%f1", "%f2", "%f3", "%f4", "%f5", "%f6", "%f7",
80 "%f8", "%f9","%f10","%f11","%f12","%f13","%f14","%f15","%f16");
81
82# This is reference 2x-parallelized VIS-powered Xupdate procedure. It
83# covers even K_NN_MM addition...
84sub Xupdate {
85my ($i)=@_;
86my $K=@VK[($i+16)/20];
87my $j=($i+16)%16;
88
89# [ provided that GSR.alignaddr_offset is 5, $fmul contains the
90# 0x100ULL<<32|0x100 value and the K_NN_MM constants are pre-loaded
91# into the chosen registers... ]
92$code.=<<___;
93 fxors @X[($j+13)%16],@X[$j],@X[$j] !-1/-1/-1:X[0]^=X[13]
94 fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
95 fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
96 fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
97 faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
98 fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
99 fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
100 ![fxors %f15,%f2,%f2]
101 for %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
102 ![fxors %f0,%f3,%f3] !10/17/12:X[0] dependency
103 fpadd32 $K,@X[$j],%f20
104 std %f20,[$Xfer+`4*$j`]
105___
106# The numbers delimited with slashes are the earliest possible dispatch
107# cycles for a given instruction, assuming 1-cycle latency for simple VIS
108# instructions (as on UltraSPARC-I&II), 3-cycle latency (as on
109# UltraSPARC-III&IV), and 2-cycle latency(*), respectively. Being
110# 2x-parallelized the procedure is "worth" 5, 8.5 or 6 ticks per SHA1
111# round. As [long as] FPU/VIS instructions are perfectly pairable with
112# IALU ones, the round timing is defined by the maximum between VIS
113# and IALU timings. The latter varies from round to round and averages
114# out at 6.25 ticks. This means that USI&II should operate at the IALU
115# rate, while USIII&IV at the VIS rate. This explains why the performance
116# improvement varies among processors, given that the pure IALU
117# sha1-sparcv9.pl module exhibits virtually uniform performance of
118# ~9.3 cycles per SHA1 round. The timings mentioned above are theoretical
119# lower limits. Real-life performance was measured to be 6.6 cycles
120# per SHA1 round on USIIi and 8.3 on USIII. The latter is lower than
121# half-round VIS timing, because there are 16 Xupdate-free rounds,
122# which "push down" average theoretical timing to 8 cycles...
123
124# (*) SPARC64-V[II] was originally believed to have 2 cycles VIS
125# latency. Well, it might have, but it doesn't have dedicated
126# VIS-unit. Instead, VIS instructions are executed by other
127# functional units, ones used here - by IALU. This doesn't
128# improve effective ILP...
129}
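# Net effect per register pair: fpadd32 doubles both 32-bit lanes
# (x<<=1), while the faligndata/fmul8ulx16 detour extracts each lane's
# original bit 31, which "for" then merges back in as bit 0; that is,
# ROTATE(x,1) is performed entirely in the VIS datapath, with the
# K_NN_MM addition folded into the same pass via fpadd32/std.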
130
131# The reference Xupdate procedure is then "strained" over *pairs* of
132# BODY_NN_MM and, in effect, modulo-scheduled with respect to X[n]^=X[n+13]
133# and the K_NN_MM addition. It's "running" 15 rounds ahead, which leaves
134# plenty of room to amortize for read-after-write hazard, as well as
135# to fetch and align input for the next spin. The VIS instructions are
136# scheduled for latency of 2 cycles, because there are not enough IALU
137# instructions to schedule for latency of 3, while scheduling for 1
138# would give no gain on USI&II anyway.
139
140sub BODY_00_19 {
141my ($i,$a,$b,$c,$d,$e)=@_;
142my $j=$i&~1;
143my $k=($j+16+2)%16; # ahead reference
144my $l=($j+16-2)%16; # behind reference
145my $K=@VK[($j+16-2)/20];
146
147$j=($j+16)%16;
148
149$code.=<<___ if (!($i&1));
150 sll $a,5,$tmp0 !! $i
151 and $c,$b,$tmp3
152 ld [$Xfer+`4*($i%16)`],$Xi
153 fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
154 srl $a,27,$tmp1
155 add $tmp0,$e,$e
156 fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
157 sll $b,30,$tmp2
158 add $tmp1,$e,$e
159 andn $d,$b,$tmp1
160 add $Xi,$e,$e
161 fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
162 srl $b,2,$b
163 or $tmp1,$tmp3,$tmp1
164 or $tmp2,$b,$b
165 add $tmp1,$e,$e
166 faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
167___
168$code.=<<___ if ($i&1);
169 sll $a,5,$tmp0 !! $i
170 and $c,$b,$tmp3
171 ld [$Xfer+`4*($i%16)`],$Xi
172 fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
173 srl $a,27,$tmp1
174 add $tmp0,$e,$e
175 fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
176 sll $b,30,$tmp2
177 add $tmp1,$e,$e
178 fpadd32 $K,@X[$l],%f20 !
179 andn $d,$b,$tmp1
180 add $Xi,$e,$e
181 fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
182 srl $b,2,$b
183 or $tmp1,$tmp3,$tmp1
184 fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
185 or $tmp2,$b,$b
186 add $tmp1,$e,$e
187___
188$code.=<<___ if ($i&1 && $i>=2);
189 std %f20,[$Xfer+`4*$l`] !
190___
191}
192
193sub BODY_20_39 {
194my ($i,$a,$b,$c,$d,$e)=@_;
195my $j=$i&~1;
196my $k=($j+16+2)%16; # ahead reference
197my $l=($j+16-2)%16; # behind reference
198my $K=@VK[($j+16-2)/20];
199
200$j=($j+16)%16;
201
202$code.=<<___ if (!($i&1) && $i<64);
203 sll $a,5,$tmp0 !! $i
204 ld [$Xfer+`4*($i%16)`],$Xi
205 fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
206 srl $a,27,$tmp1
207 add $tmp0,$e,$e
208 fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
209 xor $c,$b,$tmp0
210 add $tmp1,$e,$e
211 sll $b,30,$tmp2
212 xor $d,$tmp0,$tmp1
213 fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
214 srl $b,2,$b
215 add $tmp1,$e,$e
216 or $tmp2,$b,$b
217 add $Xi,$e,$e
218 faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
219___
220$code.=<<___ if ($i&1 && $i<64);
221 sll $a,5,$tmp0 !! $i
222 ld [$Xfer+`4*($i%16)`],$Xi
223 fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
224 srl $a,27,$tmp1
225 add $tmp0,$e,$e
226 fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
227 xor $c,$b,$tmp0
228 add $tmp1,$e,$e
229 fpadd32 $K,@X[$l],%f20 !
230 sll $b,30,$tmp2
231 xor $d,$tmp0,$tmp1
232 fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
233 srl $b,2,$b
234 add $tmp1,$e,$e
235 fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
236 or $tmp2,$b,$b
237 add $Xi,$e,$e
238 std %f20,[$Xfer+`4*$l`] !
239___
240$code.=<<___ if ($i==64);
241 sll $a,5,$tmp0 !! $i
242 ld [$Xfer+`4*($i%16)`],$Xi
243 fpadd32 $K,@X[$l],%f20
244 srl $a,27,$tmp1
245 add $tmp0,$e,$e
246 xor $c,$b,$tmp0
247 add $tmp1,$e,$e
248 sll $b,30,$tmp2
249 xor $d,$tmp0,$tmp1
250 std %f20,[$Xfer+`4*$l`]
251 srl $b,2,$b
252 add $tmp1,$e,$e
253 or $tmp2,$b,$b
254 add $Xi,$e,$e
255___
256$code.=<<___ if ($i>64);
257 sll $a,5,$tmp0 !! $i
258 ld [$Xfer+`4*($i%16)`],$Xi
259 srl $a,27,$tmp1
260 add $tmp0,$e,$e
261 xor $c,$b,$tmp0
262 add $tmp1,$e,$e
263 sll $b,30,$tmp2
264 xor $d,$tmp0,$tmp1
265 srl $b,2,$b
266 add $tmp1,$e,$e
267 or $tmp2,$b,$b
268 add $Xi,$e,$e
269___
270}
271
272sub BODY_40_59 {
273my ($i,$a,$b,$c,$d,$e)=@_;
274my $j=$i&~1;
275my $k=($j+16+2)%16; # ahead reference
276my $l=($j+16-2)%16; # behind reference
277my $K=@VK[($j+16-2)/20];
278
279$j=($j+16)%16;
280
281$code.=<<___ if (!($i&1));
282 sll $a,5,$tmp0 !! $i
283 ld [$Xfer+`4*($i%16)`],$Xi
284 fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
285 srl $a,27,$tmp1
286 add $tmp0,$e,$e
287 fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
288 and $c,$b,$tmp0
289 add $tmp1,$e,$e
290 sll $b,30,$tmp2
291 or $c,$b,$tmp1
292 fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
293 srl $b,2,$b
294 and $d,$tmp1,$tmp1
295 add $Xi,$e,$e
296 or $tmp1,$tmp0,$tmp1
297 faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
298 or $tmp2,$b,$b
299 add $tmp1,$e,$e
300 fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
301___
302$code.=<<___ if ($i&1);
303 sll $a,5,$tmp0 !! $i
304 ld [$Xfer+`4*($i%16)`],$Xi
305 srl $a,27,$tmp1
306 add $tmp0,$e,$e
307 fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
308 and $c,$b,$tmp0
309 add $tmp1,$e,$e
310 fpadd32 $K,@X[$l],%f20 !
311 sll $b,30,$tmp2
312 or $c,$b,$tmp1
313 fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
314 srl $b,2,$b
315 and $d,$tmp1,$tmp1
316 fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
317 add $Xi,$e,$e
318 or $tmp1,$tmp0,$tmp1
319 or $tmp2,$b,$b
320 add $tmp1,$e,$e
321 std %f20,[$Xfer+`4*$l`] !
322___
323}
324
325# If there is more data to process, then we pre-fetch the data for the
326# next iteration during the last ten rounds...
327sub BODY_70_79 {
328my ($i,$a,$b,$c,$d,$e)=@_;
329my $j=$i&~1;
330my $m=($i%8)*2;
331
332$j=($j+16)%16;
333
334$code.=<<___ if ($i==70);
335 sll $a,5,$tmp0 !! $i
336 ld [$Xfer+`4*($i%16)`],$Xi
337 srl $a,27,$tmp1
338 add $tmp0,$e,$e
339 ldd [$inp+64],@X[0]
340 xor $c,$b,$tmp0
341 add $tmp1,$e,$e
342 sll $b,30,$tmp2
343 xor $d,$tmp0,$tmp1
344 srl $b,2,$b
345 add $tmp1,$e,$e
346 or $tmp2,$b,$b
347 add $Xi,$e,$e
348
349 and $inp,-64,$nXfer
350 inc 64,$inp
351 and $nXfer,255,$nXfer
352 alignaddr %g0,$align,%g0
353 add $base,$nXfer,$nXfer
354___
355$code.=<<___ if ($i==71);
356 sll $a,5,$tmp0 !! $i
357 ld [$Xfer+`4*($i%16)`],$Xi
358 srl $a,27,$tmp1
359 add $tmp0,$e,$e
360 xor $c,$b,$tmp0
361 add $tmp1,$e,$e
362 sll $b,30,$tmp2
363 xor $d,$tmp0,$tmp1
364 srl $b,2,$b
365 add $tmp1,$e,$e
366 or $tmp2,$b,$b
367 add $Xi,$e,$e
368___
369$code.=<<___ if ($i>=72);
370 faligndata @X[$m],@X[$m+2],@X[$m]
371 sll $a,5,$tmp0 !! $i
372 ld [$Xfer+`4*($i%16)`],$Xi
373 srl $a,27,$tmp1
374 add $tmp0,$e,$e
375 xor $c,$b,$tmp0
376 add $tmp1,$e,$e
377 fpadd32 $VK_00_19,@X[$m],%f20
378 sll $b,30,$tmp2
379 xor $d,$tmp0,$tmp1
380 srl $b,2,$b
381 add $tmp1,$e,$e
382 or $tmp2,$b,$b
383 add $Xi,$e,$e
384___
385$code.=<<___ if ($i<77);
386 ldd [$inp+`8*($i+1-70)`],@X[2*($i+1-70)]
387___
388$code.=<<___ if ($i==77); # redundant if $inp was aligned
389 add $align,63,$tmp0
390 and $tmp0,-8,$tmp0
391 ldd [$inp+$tmp0],@X[16]
392___
393$code.=<<___ if ($i>=72);
394 std %f20,[$nXfer+`4*$m`]
395___
396}
397
398$code.=<<___;
399.section ".text",#alloc,#execinstr
400
401.align 64
402vis_const:
403.long 0x5a827999,0x5a827999 ! K_00_19
404.long 0x6ed9eba1,0x6ed9eba1 ! K_20_39
405.long 0x8f1bbcdc,0x8f1bbcdc ! K_40_59
406.long 0xca62c1d6,0xca62c1d6 ! K_60_79
407.long 0x00000100,0x00000100
408.align 64
409.type vis_const,#object
410.size vis_const,(.-vis_const)
411
412.globl sha1_block_data_order
413sha1_block_data_order:
414 save %sp,-$frame,%sp
415 add %fp,$bias-256,$base
416
4171: call .+8
418 add %o7,vis_const-1b,$tmp0
419
420 ldd [$tmp0+0],$VK_00_19
421 ldd [$tmp0+8],$VK_20_39
422 ldd [$tmp0+16],$VK_40_59
423 ldd [$tmp0+24],$VK_60_79
424 ldd [$tmp0+32],$fmul
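! ("call .+8" above is the usual SPARC PIC idiom: it deposits the address
! of the call instruction itself in %o7, from which vis_const is then
! reached pc-relatively, so no absolute relocation is needed.)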
425
426 ld [$ctx+0],$Actx
427 and $base,-256,$base
428 ld [$ctx+4],$Bctx
429 sub $base,$bias+$frame,%sp
430 ld [$ctx+8],$Cctx
431 and $inp,7,$align
432 ld [$ctx+12],$Dctx
433 and $inp,-8,$inp
434 ld [$ctx+16],$Ectx
435
436 ! X[16] is maintained in FP register bank
437 alignaddr %g0,$align,%g0
438 ldd [$inp+0],@X[0]
439 sub $inp,-64,$Xfer
440 ldd [$inp+8],@X[2]
441 and $Xfer,-64,$Xfer
442 ldd [$inp+16],@X[4]
443 and $Xfer,255,$Xfer
444 ldd [$inp+24],@X[6]
445 add $base,$Xfer,$Xfer
446 ldd [$inp+32],@X[8]
447 ldd [$inp+40],@X[10]
448 ldd [$inp+48],@X[12]
449 brz,pt $align,.Laligned
450 ldd [$inp+56],@X[14]
451
452 ldd [$inp+64],@X[16]
453 faligndata @X[0],@X[2],@X[0]
454 faligndata @X[2],@X[4],@X[2]
455 faligndata @X[4],@X[6],@X[4]
456 faligndata @X[6],@X[8],@X[6]
457 faligndata @X[8],@X[10],@X[8]
458 faligndata @X[10],@X[12],@X[10]
459 faligndata @X[12],@X[14],@X[12]
460 faligndata @X[14],@X[16],@X[14]
461
462.Laligned:
463 mov 5,$tmp0
464 dec 1,$len
465 alignaddr %g0,$tmp0,%g0
466 fpadd32 $VK_00_19,@X[0],%f16
467 fpadd32 $VK_00_19,@X[2],%f18
468 fpadd32 $VK_00_19,@X[4],%f20
469 fpadd32 $VK_00_19,@X[6],%f22
470 fpadd32 $VK_00_19,@X[8],%f24
471 fpadd32 $VK_00_19,@X[10],%f26
472 fpadd32 $VK_00_19,@X[12],%f28
473 fpadd32 $VK_00_19,@X[14],%f30
474 std %f16,[$Xfer+0]
475 mov $Actx,$A
476 std %f18,[$Xfer+8]
477 mov $Bctx,$B
478 std %f20,[$Xfer+16]
479 mov $Cctx,$C
480 std %f22,[$Xfer+24]
481 mov $Dctx,$D
482 std %f24,[$Xfer+32]
483 mov $Ectx,$E
484 std %f26,[$Xfer+40]
485 fxors @X[13],@X[0],@X[0]
486 std %f28,[$Xfer+48]
487 ba .Loop
488 std %f30,[$Xfer+56]
489.align 32
490.Loop:
491___
492for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
493for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
494for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
495for (;$i<70;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
496$code.=<<___;
497 tst $len
498 bz,pn `$bits==32?"%icc":"%xcc"`,.Ltail
499 nop
500___
501for (;$i<80;$i++) { &BODY_70_79($i,@V); unshift(@V,pop(@V)); }
502$code.=<<___;
503 add $A,$Actx,$Actx
504 add $B,$Bctx,$Bctx
505 add $C,$Cctx,$Cctx
506 add $D,$Dctx,$Dctx
507 add $E,$Ectx,$Ectx
508 mov 5,$tmp0
509 fxors @X[13],@X[0],@X[0]
510 mov $Actx,$A
511 mov $Bctx,$B
512 mov $Cctx,$C
513 mov $Dctx,$D
514 mov $Ectx,$E
515 alignaddr %g0,$tmp0,%g0
516 dec 1,$len
517 ba .Loop
518 mov $nXfer,$Xfer
519
520.align 32
521.Ltail:
522___
523for($i=70;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
524$code.=<<___;
525 add $A,$Actx,$Actx
526 add $B,$Bctx,$Bctx
527 add $C,$Cctx,$Cctx
528 add $D,$Dctx,$Dctx
529 add $E,$Ectx,$Ectx
530
531 st $Actx,[$ctx+0]
532 st $Bctx,[$ctx+4]
533 st $Cctx,[$ctx+8]
534 st $Dctx,[$ctx+12]
535 st $Ectx,[$ctx+16]
536
537 ret
538 restore
539.type sha1_block_data_order,#function
540.size sha1_block_data_order,(.-sha1_block_data_order)
541.asciz "SHA1 block transform for SPARCv9a, CRYPTOGAMS by <appro\@openssl.org>"
542.align 4
543___
544
545# The purpose of these subroutines is to explicitly encode VIS
546# instructions, so that one can compile the module without specifying VIS
547# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
548# The idea is to keep open the option of producing a "universal" binary,
549# letting the programmer detect at run-time whether the CPU is VIS-capable.
550sub unvis {
551my ($mnemonic,$rs1,$rs2,$rd)=@_;
552my ($ref,$opf);
553my %visopf = ( "fmul8ulx16" => 0x037,
554 "faligndata" => 0x048,
555 "fpadd32" => 0x052,
556 "fxor" => 0x06c,
557 "fxors" => 0x06d );
558
559 $ref = "$mnemonic\t$rs1,$rs2,$rd";
560
561 if ($opf=$visopf{$mnemonic}) {
562 foreach ($rs1,$rs2,$rd) {
563 return $ref if (!/%f([0-9]{1,2})/);
564 $_=$1;
565 if ($1>=32) {
566 return $ref if ($1&1);
567 # re-encode for upper double register addressing
568 $_=($1|$1>>5)&31;
569 }
570 }
571
572 return sprintf ".word\t0x%08x !%s",
573 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
574 $ref;
575 } else {
576 return $ref;
577 }
578}
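# A worked example of the encoding above (a sketch derived from the
# %visopf table and the sprintf format; the call itself is hypothetical):
# unvis("faligndata","%f0","%f2","%f4") maps the operands to rs1=0,
# rs2=2, rd=4 and returns
#
#	.word	0x89b00902 !faligndata	%f0,%f2,%f4
#
# since 0x81b00000|4<<25|0<<14|0x048<<5|2 == 0x89b00902.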
579sub unalignaddr {
580my ($mnemonic,$rs1,$rs2,$rd)=@_;
581my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
582my $ref="$mnemonic\t$rs1,$rs2,$rd";
583
584 foreach ($rs1,$rs2,$rd) {
585 if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
586 else { return $ref; }
587 }
588 return sprintf ".word\t0x%08x !%s",
589 0x81b00300|$rd<<25|$rs1<<14|$rs2,
590 $ref;
591}
592
593$code =~ s/\`([^\`]*)\`/eval $1/gem;
594$code =~ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),(%f[0-9]{1,2}),(%f[0-9]{1,2})/
595 &unvis($1,$2,$3,$4)
596 /gem;
597$code =~ s/\b(alignaddr)\s+(%[goli][0-7]),(%[goli][0-7]),(%[goli][0-7])/
598 &unalignaddr($1,$2,$3,$4)
599 /gem;
600print $code;
601close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-thumb.pl b/src/lib/libcrypto/sha/asm/sha1-thumb.pl
deleted file mode 100644
index 7c9ea9b029..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-thumb.pl
+++ /dev/null
@@ -1,259 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# sha1_block for Thumb.
11#
12# January 2007.
13#
14# The code is of no direct interest to OpenSSL because of its low
15# performance. Its purpose is to establish a _size_ benchmark. A pretty
16# useless one, I must say, because the 30% or 88 bytes larger ARMv4 code
17# [available on demand] is almost _twice_ as fast. It should also be
18# noted that in-lining .Lcommon and .Lrotate improves performance by
19# over 40%, while code size grows by only 10% or 32 bytes. But once
20# again, the goal was to establish a _size_ benchmark, not performance.
21
22$output=shift;
23open STDOUT,">$output";
24
25$inline=0;
26#$cheat_on_binutils=1;
27
28$t0="r0";
29$t1="r1";
30$t2="r2";
31$a="r3";
32$b="r4";
33$c="r5";
34$d="r6";
35$e="r7";
36$K="r8"; # "upper" registers can be used in add/sub and mov insns
37$ctx="r9";
38$inp="r10";
39$len="r11";
40$Xi="r12";
41
42sub common {
43<<___;
44 sub $t0,#4
45 ldr $t1,[$t0]
46 add $e,$K @ E+=K_xx_xx
47 lsl $t2,$a,#5
48 add $t2,$e
49 lsr $e,$a,#27
50 add $t2,$e @ E+=ROR(A,27)
51 add $t2,$t1 @ E+=X[i]
52___
53}
54sub rotate {
55<<___;
56 mov $e,$d @ E=D
57 mov $d,$c @ D=C
58 lsl $c,$b,#30
59 lsr $b,$b,#2
60 orr $c,$b @ C=ROR(B,2)
61 mov $b,$a @ B=A
62 add $a,$t2,$t1 @ A=E+F_xx_xx(B,C,D)
63___
64}
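# Taken together, &common and &rotate emit one SHA-1 round. A minimal
# plain-Perl sketch of what the emitted code computes, assuming rol() is
# a 32-bit left rotate and $f holds F_xx_xx(B,C,D) for the current round:
#
#	$t2 = ($e + $K + rol($a,5) + $X[$i]) & 0xffffffff;	# &common
#	($a,$b,$c,$d,$e) = (($t2 + $f) & 0xffffffff,		# &rotate
#			    $a, rol($b,30), $c, $d);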
65
66sub BODY_00_19 {
67$code.=$inline?&common():"\tbl .Lcommon\n";
68$code.=<<___;
69 mov $t1,$c
70 eor $t1,$d
71 and $t1,$b
72 eor $t1,$d @ F_00_19(B,C,D)
73___
74$code.=$inline?&rotate():"\tbl .Lrotate\n";
75}
76
77sub BODY_20_39 {
78$code.=$inline?&common():"\tbl .Lcommon\n";
79$code.=<<___;
80 mov $t1,$b
81 eor $t1,$c
82 eor $t1,$d @ F_20_39(B,C,D)
83___
84$code.=$inline?&rotate():"\tbl .Lrotate\n";
85}
86
87sub BODY_40_59 {
88$code.=$inline?&common():"\tbl .Lcommon\n";
89$code.=<<___;
90 mov $t1,$b
91 and $t1,$c
92 mov $e,$b
93 orr $e,$c
94 and $e,$d
95 orr $t1,$e @ F_40_59(B,C,D)
96___
97$code.=$inline?&rotate():"\tbl .Lrotate\n";
98}
99
100$code=<<___;
101.text
102.code 16
103
104.global sha1_block_data_order
105.type sha1_block_data_order,%function
106
107.align 2
108sha1_block_data_order:
109___
110if ($cheat_on_binutils) {
111$code.=<<___;
112.code 32
113 add r3,pc,#1
114 bx r3 @ switch to Thumb ISA
115.code 16
116___
117}
118$code.=<<___;
119 push {r4-r7}
120 mov r3,r8
121 mov r4,r9
122 mov r5,r10
123 mov r6,r11
124 mov r7,r12
125 push {r3-r7,lr}
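@ Thumb-1 push can only encode r0-r7 and lr, so the high registers
@ r8-r12 are staged through low registers here and restored the same
@ way in .Lexit below.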
126 lsl r2,#6
127 mov $ctx,r0 @ save context
128 mov $inp,r1 @ save inp
129 mov $len,r2 @ save len
130 add $len,$inp @ $len to point at inp end
131
132.Lloop:
133 mov $Xi,sp
134 mov $t2,sp
135 sub $t2,#16*4 @ [3]
136.LXload:
137 ldrb $a,[$t1,#0] @ $t1 is r1 and holds inp
138 ldrb $b,[$t1,#1]
139 ldrb $c,[$t1,#2]
140 ldrb $d,[$t1,#3]
141 lsl $a,#24
142 lsl $b,#16
143 lsl $c,#8
144 orr $a,$b
145 orr $a,$c
146 orr $a,$d
147 add $t1,#4
148 push {$a}
149 cmp sp,$t2
150 bne .LXload @ [+14*16]
151
152 mov $inp,$t1 @ update $inp
153 sub $t2,#32*4
154 sub $t2,#32*4
155 mov $e,#31 @ [+4]
156.LXupdate:
157 ldr $a,[sp,#15*4]
158 ldr $b,[sp,#13*4]
159 ldr $c,[sp,#7*4]
160 ldr $d,[sp,#2*4]
161 eor $a,$b
162 eor $a,$c
163 eor $a,$d
164 ror $a,$e
165 push {$a}
166 cmp sp,$t2
167 bne .LXupdate @ [+(11+1)*64]
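@ (with $e preloaded to 31, "ror $a,$e" rotates left by one, so the
@ loop above computes the standard SHA-1 schedule word
@ X[i] = ROL(X[i-3]^X[i-8]^X[i-14]^X[i-16],1) on the descending stack)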
168
169 ldmia $t0!,{$a,$b,$c,$d,$e} @ $t0 is r0 and holds ctx
170 mov $t0,$Xi
171
172 ldr $t2,.LK_00_19
173 mov $t1,$t0
174 sub $t1,#20*4
175 mov $Xi,$t1
176 mov $K,$t2 @ [+7+4]
177.L_00_19:
178___
179 &BODY_00_19();
180$code.=<<___;
181 cmp $Xi,$t0
182 bne .L_00_19 @ [+(2+9+4+2+8+2)*20]
183
184 ldr $t2,.LK_20_39
185 mov $t1,$t0
186 sub $t1,#20*4
187 mov $Xi,$t1
188 mov $K,$t2 @ [+5]
189.L_20_39_or_60_79:
190___
191 &BODY_20_39();
192$code.=<<___;
193 cmp $Xi,$t0
194 bne .L_20_39_or_60_79 @ [+(2+9+3+2+8+2)*20*2]
195 cmp sp,$t0
196 beq .Ldone @ [+2]
197
198 ldr $t2,.LK_40_59
199 mov $t1,$t0
200 sub $t1,#20*4
201 mov $Xi,$t1
202 mov $K,$t2 @ [+5]
203.L_40_59:
204___
205 &BODY_40_59();
206$code.=<<___;
207 cmp $Xi,$t0
208 bne .L_40_59 @ [+(2+9+6+2+8+2)*20]
209
210 ldr $t2,.LK_60_79
211 mov $Xi,sp
212 mov $K,$t2
213 b .L_20_39_or_60_79 @ [+4]
214.Ldone:
215 mov $t0,$ctx
216 ldr $t1,[$t0,#0]
217 ldr $t2,[$t0,#4]
218 add $a,$t1
219 ldr $t1,[$t0,#8]
220 add $b,$t2
221 ldr $t2,[$t0,#12]
222 add $c,$t1
223 ldr $t1,[$t0,#16]
224 add $d,$t2
225 add $e,$t1
226 stmia $t0!,{$a,$b,$c,$d,$e} @ [+20]
227
228 add sp,#80*4 @ deallocate stack frame
229 mov $t0,$ctx @ restore ctx
230 mov $t1,$inp @ restore inp
231 cmp $t1,$len
232 beq .Lexit
233 b .Lloop @ [+6] total 3212 cycles
234.Lexit:
235 pop {r2-r7}
236 mov r8,r2
237 mov r9,r3
238 mov r10,r4
239 mov r11,r5
240 mov r12,r6
241 mov lr,r7
242 pop {r4-r7}
243 bx lr
244.align 2
245___
246$code.=".Lcommon:\n".&common()."\tmov pc,lr\n" if (!$inline);
247$code.=".Lrotate:\n".&rotate()."\tmov pc,lr\n" if (!$inline);
248$code.=<<___;
249.align 2
250.LK_00_19: .word 0x5a827999
251.LK_20_39: .word 0x6ed9eba1
252.LK_40_59: .word 0x8f1bbcdc
253.LK_60_79: .word 0xca62c1d6
254.size sha1_block_data_order,.-sha1_block_data_order
255.asciz "SHA1 block transform for Thumb, CRYPTOGAMS by <appro\@openssl.org>"
256___
257
258print $code;
259close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
deleted file mode 100755
index f27c1e3fb0..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
+++ /dev/null
@@ -1,1260 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# sha1_block procedure for x86_64.
11#
12# It was brought to my attention that on EM64T the compiler-generated
13# code was far behind the 32-bit assembler implementation. This is unlike
14# Opteron, where compiler-generated code was only 15% behind the 32-bit
15# assembler, which originally made it hard to motivate the effort.
16# There was a suggestion to mechanically translate the 32-bit code, but I
17# dismissed it, reasoning that x86_64 offers enough register bank
18# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
19# implementation:-) However! While 64-bit code does perform better
20# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
21# x86_64 does offer a larger *addressable* bank, but the out-of-order core
22# reaches for even more registers through dynamic aliasing, and the EM64T
23# core must have managed to run-time optimize even 32-bit code just as
24# well as the 64-bit one. The performance improvement is summarized in the
25# following table:
26#
27# gcc 3.4 32-bit asm cycles/byte
28# Opteron +45% +20% 6.8
29# Xeon P4 +65% +0% 9.9
30# Core2 +60% +10% 7.0
31
32# August 2009.
33#
34# The code was revised to minimize code size and to maximize
35# "distance" between instructions producing input to 'lea'
36# instruction and the 'lea' instruction itself, which is essential
37# for Intel Atom core.
38
39# October 2010.
40#
41# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
42# is to offload message schedule denoted by Wt in NIST specification,
43# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
44# for background and implementation details. The only difference from
45# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
46# to free temporary registers.
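# For reference, the schedule being offloaded is the standard FIPS 180
# SHA-1 recurrence; a minimal plain-Perl sketch (assuming @W already
# holds the 16 words of the current block):
#
#	sub rol1 { my $x = shift; (($x << 1) | ($x >> 31)) & 0xffffffff }
#	for my $t (16 .. 79) {
#		$W[$t] = rol1($W[$t-3] ^ $W[$t-8] ^ $W[$t-14] ^ $W[$t-16]);
#	}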
47
48# April 2011.
49#
50# Add AVX code path. See sha1-586.pl for further information.
51
52######################################################################
53# Current performance is summarized in following table. Numbers are
54# CPU clock cycles spent to process single byte (less is better).
55#
56# x86_64 SSSE3 AVX
57# P4 9.8 -
58# Opteron 6.6 -
59# Core2 6.7 6.1/+10% -
60# Atom 11.0 9.7/+13% -
61# Westmere 7.1 5.6/+27% -
62# Sandy Bridge 7.9 6.3/+25% 5.2/+51%
63
64$flavour = shift;
65$output = shift;
66if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
67
68$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
69
70$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
71( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
72( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
73die "can't locate x86_64-xlate.pl";
74
75$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
76 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
77 $1>=2.19);
78$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
79 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
80 $1>=2.09);
81$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
82 `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
83 $1>=10);
84
85open STDOUT,"| $^X $xlate $flavour $output";
86
87$ctx="%rdi"; # 1st arg
88$inp="%rsi"; # 2nd arg
89$num="%rdx"; # 3rd arg
90
91# reassign arguments in order to produce more compact code
92$ctx="%r8";
93$inp="%r9";
94$num="%r10";
95
96$t0="%eax";
97$t1="%ebx";
98$t2="%ecx";
99@xi=("%edx","%ebp");
100$A="%esi";
101$B="%edi";
102$C="%r11d";
103$D="%r12d";
104$E="%r13d";
105
106@V=($A,$B,$C,$D,$E);
107
108sub BODY_00_19 {
109my ($i,$a,$b,$c,$d,$e)=@_;
110my $j=$i+1;
111$code.=<<___ if ($i==0);
112 mov `4*$i`($inp),$xi[0]
113 bswap $xi[0]
114 mov $xi[0],`4*$i`(%rsp)
115___
116$code.=<<___ if ($i<15);
117 mov $c,$t0
118 mov `4*$j`($inp),$xi[1]
119 mov $a,$t2
120 xor $d,$t0
121 bswap $xi[1]
122 rol \$5,$t2
123 lea 0x5a827999($xi[0],$e),$e
124 and $b,$t0
125 mov $xi[1],`4*$j`(%rsp)
126 add $t2,$e
127 xor $d,$t0
128 rol \$30,$b
129 add $t0,$e
130___
131$code.=<<___ if ($i>=15);
132 mov `4*($j%16)`(%rsp),$xi[1]
133 mov $c,$t0
134 mov $a,$t2
135 xor `4*(($j+2)%16)`(%rsp),$xi[1]
136 xor $d,$t0
137 rol \$5,$t2
138 xor `4*(($j+8)%16)`(%rsp),$xi[1]
139 and $b,$t0
140 lea 0x5a827999($xi[0],$e),$e
141 xor `4*(($j+13)%16)`(%rsp),$xi[1]
142 xor $d,$t0
143 rol \$1,$xi[1]
144 add $t2,$e
145 rol \$30,$b
146 mov $xi[1],`4*($j%16)`(%rsp)
147 add $t0,$e
148___
149unshift(@xi,pop(@xi));
150}
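# F_00_19 above is evaluated as Ch(b,c,d) = ((c ^ d) & b) ^ d, which
# matches the textbook (b & c) | (~b & d) while needing only a single
# temporary; in plain Perl:
#
#	$ch = ((($c ^ $d) & $b) ^ $d) & 0xffffffff;	# == ($b&$c)|(~$b&$d)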
151
152sub BODY_20_39 {
153my ($i,$a,$b,$c,$d,$e)=@_;
154my $j=$i+1;
155my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
156$code.=<<___ if ($i<79);
157 mov `4*($j%16)`(%rsp),$xi[1]
158 mov $c,$t0
159 mov $a,$t2
160 xor `4*(($j+2)%16)`(%rsp),$xi[1]
161 xor $b,$t0
162 rol \$5,$t2
163 lea $K($xi[0],$e),$e
164 xor `4*(($j+8)%16)`(%rsp),$xi[1]
165 xor $d,$t0
166 add $t2,$e
167 xor `4*(($j+13)%16)`(%rsp),$xi[1]
168 rol \$30,$b
169 add $t0,$e
170 rol \$1,$xi[1]
171___
172$code.=<<___ if ($i<76);
173 mov $xi[1],`4*($j%16)`(%rsp)
174___
175$code.=<<___ if ($i==79);
176 mov $c,$t0
177 mov $a,$t2
178 xor $b,$t0
179 lea $K($xi[0],$e),$e
180 rol \$5,$t2
181 xor $d,$t0
182 add $t2,$e
183 rol \$30,$b
184 add $t0,$e
185___
186unshift(@xi,pop(@xi));
187}
188
189sub BODY_40_59 {
190my ($i,$a,$b,$c,$d,$e)=@_;
191my $j=$i+1;
192$code.=<<___;
193 mov `4*($j%16)`(%rsp),$xi[1]
194 mov $c,$t0
195 mov $c,$t1
196 xor `4*(($j+2)%16)`(%rsp),$xi[1]
197 and $d,$t0
198 mov $a,$t2
199 xor `4*(($j+8)%16)`(%rsp),$xi[1]
200 xor $d,$t1
201 lea 0x8f1bbcdc($xi[0],$e),$e
202 rol \$5,$t2
203 xor `4*(($j+13)%16)`(%rsp),$xi[1]
204 add $t0,$e
205 and $b,$t1
206 rol \$1,$xi[1]
207 add $t1,$e
208 rol \$30,$b
209 mov $xi[1],`4*($j%16)`(%rsp)
210 add $t2,$e
211___
212unshift(@xi,pop(@xi));
213}
214
215$code.=<<___;
216.text
217.extern OPENSSL_ia32cap_P
218
219.globl sha1_block_data_order
220.type sha1_block_data_order,\@function,3
221.align 16
222sha1_block_data_order:
223 mov OPENSSL_ia32cap_P+0(%rip),%r9d
224 mov OPENSSL_ia32cap_P+4(%rip),%r8d
225 test \$`1<<9`,%r8d # check SSSE3 bit
226 jz .Lialu
227___
228$code.=<<___ if ($avx);
229 and \$`1<<28`,%r8d # mask AVX bit
230 and \$`1<<30`,%r9d # mask "Intel CPU" bit
231 or %r9d,%r8d
232 cmp \$`1<<28|1<<30`,%r8d
233 je _avx_shortcut
234___
235$code.=<<___;
236 jmp _ssse3_shortcut
237
238.align 16
239.Lialu:
240 push %rbx
241 push %rbp
242 push %r12
243 push %r13
244 mov %rsp,%r11
245 mov %rdi,$ctx # reassigned argument
246 sub \$`8+16*4`,%rsp
247 mov %rsi,$inp # reassigned argument
248 and \$-64,%rsp
249 mov %rdx,$num # reassigned argument
250 mov %r11,`16*4`(%rsp)
251.Lprologue:
252
253 mov 0($ctx),$A
254 mov 4($ctx),$B
255 mov 8($ctx),$C
256 mov 12($ctx),$D
257 mov 16($ctx),$E
258 jmp .Lloop
259
260.align 16
261.Lloop:
262___
263for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
264for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
265for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
266for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
267$code.=<<___;
268 add 0($ctx),$A
269 add 4($ctx),$B
270 add 8($ctx),$C
271 add 12($ctx),$D
272 add 16($ctx),$E
273 mov $A,0($ctx)
274 mov $B,4($ctx)
275 mov $C,8($ctx)
276 mov $D,12($ctx)
277 mov $E,16($ctx)
278
279 sub \$1,$num
280 lea `16*4`($inp),$inp
281 jnz .Lloop
282
283 mov `16*4`(%rsp),%rsi
284 mov (%rsi),%r13
285 mov 8(%rsi),%r12
286 mov 16(%rsi),%rbp
287 mov 24(%rsi),%rbx
288 lea 32(%rsi),%rsp
289.Lepilogue:
290 ret
291.size sha1_block_data_order,.-sha1_block_data_order
292___
293{{{
294my $Xi=4;
295my @X=map("%xmm$_",(4..7,0..3));
296my @Tx=map("%xmm$_",(8..10));
297my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
298my @T=("%esi","%edi");
299my $j=0;
300my $K_XX_XX="%r11";
301
302my $_rol=sub { &rol(@_) };
303my $_ror=sub { &ror(@_) };
304
305$code.=<<___;
306.type sha1_block_data_order_ssse3,\@function,3
307.align 16
308sha1_block_data_order_ssse3:
309_ssse3_shortcut:
310 push %rbx
311 push %rbp
312 push %r12
313 lea `-64-($win64?5*16:0)`(%rsp),%rsp
314___
315$code.=<<___ if ($win64);
316 movaps %xmm6,64+0(%rsp)
317 movaps %xmm7,64+16(%rsp)
318 movaps %xmm8,64+32(%rsp)
319 movaps %xmm9,64+48(%rsp)
320 movaps %xmm10,64+64(%rsp)
321.Lprologue_ssse3:
322___
323$code.=<<___;
324 mov %rdi,$ctx # reassigned argument
325 mov %rsi,$inp # reassigned argument
326 mov %rdx,$num # reassigned argument
327
328 shl \$6,$num
329 add $inp,$num
330 lea K_XX_XX(%rip),$K_XX_XX
331
332 mov 0($ctx),$A # load context
333 mov 4($ctx),$B
334 mov 8($ctx),$C
335 mov 12($ctx),$D
336 mov $B,@T[0] # magic seed
337 mov 16($ctx),$E
338
339 movdqa 64($K_XX_XX),@X[2] # pbswap mask
340 movdqa 0($K_XX_XX),@Tx[1] # K_00_19
341 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
342 movdqu 16($inp),@X[-3&7]
343 movdqu 32($inp),@X[-2&7]
344 movdqu 48($inp),@X[-1&7]
345 pshufb @X[2],@X[-4&7] # byte swap
346 add \$64,$inp
347 pshufb @X[2],@X[-3&7]
348 pshufb @X[2],@X[-2&7]
349 pshufb @X[2],@X[-1&7]
350 paddd @Tx[1],@X[-4&7] # add K_00_19
351 paddd @Tx[1],@X[-3&7]
352 paddd @Tx[1],@X[-2&7]
353 movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
354 psubd @Tx[1],@X[-4&7] # restore X[]
355 movdqa @X[-3&7],16(%rsp)
356 psubd @Tx[1],@X[-3&7]
357 movdqa @X[-2&7],32(%rsp)
358 psubd @Tx[1],@X[-2&7]
359 jmp .Loop_ssse3
360___
361
362sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
363{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
364 my $arg = pop;
365 $arg = "\$$arg" if ($arg*1 eq $arg);
366 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
367}
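# AUTOLOAD is the catch-all that turns any unrecognized &mnemonic(...)
# call into a line of AT&T-syntax assembly, the last Perl argument
# becoming the first operand; this is what lets the SIMD paths below be
# written in the 32-bit perlasm style.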
368
369sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
370{ use integer;
371 my $body = shift;
372 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
373 my ($a,$b,$c,$d,$e);
374
375 &movdqa (@X[0],@X[-3&7]);
376 eval(shift(@insns));
377 eval(shift(@insns));
378 &movdqa (@Tx[0],@X[-1&7]);
379 &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
380 eval(shift(@insns));
381 eval(shift(@insns));
382
383 &paddd (@Tx[1],@X[-1&7]);
384 eval(shift(@insns));
385 eval(shift(@insns));
386 &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
387 eval(shift(@insns));
388 eval(shift(@insns));
389 &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
390 eval(shift(@insns));
391 eval(shift(@insns));
392
393 &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
394 eval(shift(@insns));
395 eval(shift(@insns));
396 eval(shift(@insns));
397 eval(shift(@insns));
398
399 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
400 eval(shift(@insns));
401 eval(shift(@insns));
402 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
403 eval(shift(@insns));
404 eval(shift(@insns));
405
406 &movdqa (@Tx[2],@X[0]);
407 &movdqa (@Tx[0],@X[0]);
408 eval(shift(@insns));
409 eval(shift(@insns));
410 eval(shift(@insns));
411 eval(shift(@insns));
412
413 &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
414 &paddd (@X[0],@X[0]);
415 eval(shift(@insns));
416 eval(shift(@insns));
417 eval(shift(@insns));
418 eval(shift(@insns));
419
420 &psrld (@Tx[0],31);
421 eval(shift(@insns));
422 eval(shift(@insns));
423 &movdqa (@Tx[1],@Tx[2]);
424 eval(shift(@insns));
425 eval(shift(@insns));
426
427 &psrld (@Tx[2],30);
428 &por (@X[0],@Tx[0]); # "X[0]"<<<=1
429 eval(shift(@insns));
430 eval(shift(@insns));
431 eval(shift(@insns));
432 eval(shift(@insns));
433
434 &pslld (@Tx[1],2);
435 &pxor (@X[0],@Tx[2]);
436 eval(shift(@insns));
437 eval(shift(@insns));
438 &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
439 eval(shift(@insns));
440 eval(shift(@insns));
441
442 &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
443
444 foreach (@insns) { eval; } # remaining instructions [if any]
445
446 $Xi++; push(@X,shift(@X)); # "rotate" X[]
447 push(@Tx,shift(@Tx));
448}
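# The function above produces four schedule words per call: the XORs are
# done on whole 128-bit vectors, the <<<1 rotate is synthesized from
# paddd (x+x equals x<<1) plus psrld/por to bring the carry bits around,
# and the trailing fix-up XOR supplies the lane whose input word is
# generated within the same vector (the "X[0]"^=("X[0]">>96)<<<2 step).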
449
450sub Xupdate_ssse3_32_79()
451{ use integer;
452 my $body = shift;
453 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
454 my ($a,$b,$c,$d,$e);
455
456 &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
457 eval(shift(@insns)); # body_20_39
458 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
459 &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
460 eval(shift(@insns));
461 eval(shift(@insns));
462 eval(shift(@insns)); # rol
463
464 &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
465 eval(shift(@insns));
466 eval(shift(@insns)) if ($insns[0] !~ /&ro[rl]/);
467 if ($Xi%5) {
468 &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
469 } else { # ... or load next one
470 &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
471 }
472 &paddd (@Tx[1],@X[-1&7]);
473 eval(shift(@insns)); # ror
474 eval(shift(@insns));
475
476 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
477 eval(shift(@insns)); # body_20_39
478 eval(shift(@insns));
479 eval(shift(@insns));
480 eval(shift(@insns)); # rol
481
482 &movdqa (@Tx[0],@X[0]);
483 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
484 eval(shift(@insns));
485 eval(shift(@insns));
486 eval(shift(@insns)); # ror
487 eval(shift(@insns));
488
489 &pslld (@X[0],2);
490 eval(shift(@insns)); # body_20_39
491 eval(shift(@insns));
492 &psrld (@Tx[0],30);
493 eval(shift(@insns));
494 eval(shift(@insns)); # rol
495 eval(shift(@insns));
496 eval(shift(@insns));
497 eval(shift(@insns)); # ror
498 eval(shift(@insns));
499
500 &por (@X[0],@Tx[0]); # "X[0]"<<<=2
501 eval(shift(@insns)); # body_20_39
502 eval(shift(@insns));
503 &movdqa (@Tx[1],@X[0]) if ($Xi<19);
504 eval(shift(@insns));
505 eval(shift(@insns)); # rol
506 eval(shift(@insns));
507 eval(shift(@insns));
508 eval(shift(@insns)); # rol
509 eval(shift(@insns));
510
511 foreach (@insns) { eval; } # remaining instructions
512
513 $Xi++; push(@X,shift(@X)); # "rotate" X[]
514 push(@Tx,shift(@Tx));
515}
516
517sub Xuplast_ssse3_80()
518{ use integer;
519 my $body = shift;
520 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
521 my ($a,$b,$c,$d,$e);
522
523 eval(shift(@insns));
524 &paddd (@Tx[1],@X[-1&7]);
525 eval(shift(@insns));
526 eval(shift(@insns));
527 eval(shift(@insns));
528 eval(shift(@insns));
529
530 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
531
532 foreach (@insns) { eval; } # remaining instructions
533
534 &cmp ($inp,$num);
535 &je (".Ldone_ssse3");
536
537 unshift(@Tx,pop(@Tx));
538
539 &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
540 &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
541 &movdqu (@X[-4&7],"0($inp)"); # load input
542 &movdqu (@X[-3&7],"16($inp)");
543 &movdqu (@X[-2&7],"32($inp)");
544 &movdqu (@X[-1&7],"48($inp)");
545 &pshufb (@X[-4&7],@X[2]); # byte swap
546 &add ($inp,64);
547
548 $Xi=0;
549}
550
551sub Xloop_ssse3()
552{ use integer;
553 my $body = shift;
554 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
555 my ($a,$b,$c,$d,$e);
556
557 eval(shift(@insns));
558 eval(shift(@insns));
559 &pshufb (@X[($Xi-3)&7],@X[2]);
560 eval(shift(@insns));
561 eval(shift(@insns));
562 &paddd (@X[($Xi-4)&7],@Tx[1]);
563 eval(shift(@insns));
564 eval(shift(@insns));
565 eval(shift(@insns));
566 eval(shift(@insns));
567 &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
568 eval(shift(@insns));
569 eval(shift(@insns));
570 &psubd (@X[($Xi-4)&7],@Tx[1]);
571
572 foreach (@insns) { eval; }
573 $Xi++;
574}
575
576sub Xtail_ssse3()
577{ use integer;
578 my $body = shift;
579 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
580 my ($a,$b,$c,$d,$e);
581
582 foreach (@insns) { eval; }
583}
584
585sub body_00_19 () {
586 (
587 '($a,$b,$c,$d,$e)=@V;'.
588 '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
589 '&xor ($c,$d);',
590 '&mov (@T[1],$a);', # $b in next round
591 '&$_rol ($a,5);',
592 '&and (@T[0],$c);', # ($b&($c^$d))
593 '&xor ($c,$d);', # restore $c
594 '&xor (@T[0],$d);',
595 '&add ($e,$a);',
596 '&$_ror ($b,$j?7:2);', # $b>>>2
597 '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
598 );
599}
600
601sub body_20_39 () {
602 (
603 '($a,$b,$c,$d,$e)=@V;'.
604 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
605 '&xor (@T[0],$d);', # ($b^$d)
606 '&mov (@T[1],$a);', # $b in next round
607 '&$_rol ($a,5);',
608 '&xor (@T[0],$c);', # ($b^$d^$c)
609 '&add ($e,$a);',
610 '&$_ror ($b,7);', # $b>>>2
611 '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
612 );
613}
614
615sub body_40_59 () {
616 (
617 '($a,$b,$c,$d,$e)=@V;'.
618 '&mov (@T[1],$c);',
619 '&xor ($c,$d);',
620 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
621 '&and (@T[1],$d);',
622 '&and (@T[0],$c);', # ($b&($c^$d))
623 '&$_ror ($b,7);', # $b>>>2
624 '&add ($e,@T[1]);',
625 '&mov (@T[1],$a);', # $b in next round
626 '&$_rol ($a,5);',
627 '&add ($e,@T[0]);',
628 '&xor ($c,$d);', # restore $c
629 '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
630 );
631}
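# Note that body_40_59 evaluates F_40_59 = Maj(b,c,d) as two bitwise-
# disjoint terms, (b & (c^d)) and (c & d): whenever c&d is 1, c^d is 0,
# so no bit can carry and adding the terms into $e separately is
# equivalent to OR-ing them. In plain Perl:
#
#	$maj = (($b & ($c ^ $d)) | ($c & $d)) & 0xffffffff;	# == Maj(b,c,d)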
632$code.=<<___;
633.align 16
634.Loop_ssse3:
635___
636 &Xupdate_ssse3_16_31(\&body_00_19);
637 &Xupdate_ssse3_16_31(\&body_00_19);
638 &Xupdate_ssse3_16_31(\&body_00_19);
639 &Xupdate_ssse3_16_31(\&body_00_19);
640 &Xupdate_ssse3_32_79(\&body_00_19);
641 &Xupdate_ssse3_32_79(\&body_20_39);
642 &Xupdate_ssse3_32_79(\&body_20_39);
643 &Xupdate_ssse3_32_79(\&body_20_39);
644 &Xupdate_ssse3_32_79(\&body_20_39);
645 &Xupdate_ssse3_32_79(\&body_20_39);
646 &Xupdate_ssse3_32_79(\&body_40_59);
647 &Xupdate_ssse3_32_79(\&body_40_59);
648 &Xupdate_ssse3_32_79(\&body_40_59);
649 &Xupdate_ssse3_32_79(\&body_40_59);
650 &Xupdate_ssse3_32_79(\&body_40_59);
651 &Xupdate_ssse3_32_79(\&body_20_39);
652 &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
653
654 $saved_j=$j; @saved_V=@V;
655
656 &Xloop_ssse3(\&body_20_39);
657 &Xloop_ssse3(\&body_20_39);
658 &Xloop_ssse3(\&body_20_39);
659
660$code.=<<___;
661 add 0($ctx),$A # update context
662 add 4($ctx),@T[0]
663 add 8($ctx),$C
664 add 12($ctx),$D
665 mov $A,0($ctx)
666 add 16($ctx),$E
667 mov @T[0],4($ctx)
668 mov @T[0],$B # magic seed
669 mov $C,8($ctx)
670 mov $D,12($ctx)
671 mov $E,16($ctx)
672 jmp .Loop_ssse3
673
674.align 16
675.Ldone_ssse3:
676___
677 $j=$saved_j; @V=@saved_V;
678
679 &Xtail_ssse3(\&body_20_39);
680 &Xtail_ssse3(\&body_20_39);
681 &Xtail_ssse3(\&body_20_39);
682
683$code.=<<___;
684 add 0($ctx),$A # update context
685 add 4($ctx),@T[0]
686 add 8($ctx),$C
687 mov $A,0($ctx)
688 add 12($ctx),$D
689 mov @T[0],4($ctx)
690 add 16($ctx),$E
691 mov $C,8($ctx)
692 mov $D,12($ctx)
693 mov $E,16($ctx)
694___
695$code.=<<___ if ($win64);
696 movaps 64+0(%rsp),%xmm6
697 movaps 64+16(%rsp),%xmm7
698 movaps 64+32(%rsp),%xmm8
699 movaps 64+48(%rsp),%xmm9
700 movaps 64+64(%rsp),%xmm10
701___
702$code.=<<___;
703 lea `64+($win64?5*16:0)`(%rsp),%rsi
704 mov 0(%rsi),%r12
705 mov 8(%rsi),%rbp
706 mov 16(%rsi),%rbx
707 lea 24(%rsi),%rsp
708.Lepilogue_ssse3:
709 ret
710.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
711___
712
713if ($avx) {
714my $Xi=4;
715my @X=map("%xmm$_",(4..7,0..3));
716my @Tx=map("%xmm$_",(8..10));
717my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
718my @T=("%esi","%edi");
719my $j=0;
720my $K_XX_XX="%r11";
721
722my $_rol=sub { &shld(@_[0],@_) };
723my $_ror=sub { &shrd(@_[0],@_) };
724
725$code.=<<___;
726.type sha1_block_data_order_avx,\@function,3
727.align 16
728sha1_block_data_order_avx:
729_avx_shortcut:
730 push %rbx
731 push %rbp
732 push %r12
733 lea `-64-($win64?5*16:0)`(%rsp),%rsp
734___
735$code.=<<___ if ($win64);
736 movaps %xmm6,64+0(%rsp)
737 movaps %xmm7,64+16(%rsp)
738 movaps %xmm8,64+32(%rsp)
739 movaps %xmm9,64+48(%rsp)
740 movaps %xmm10,64+64(%rsp)
741.Lprologue_avx:
742___
743$code.=<<___;
744 mov %rdi,$ctx # reassigned argument
745 mov %rsi,$inp # reassigned argument
746 mov %rdx,$num # reassigned argument
747 vzeroall
748
749 shl \$6,$num
750 add $inp,$num
751 lea K_XX_XX(%rip),$K_XX_XX
752
753 mov 0($ctx),$A # load context
754 mov 4($ctx),$B
755 mov 8($ctx),$C
756 mov 12($ctx),$D
757 mov $B,@T[0] # magic seed
758 mov 16($ctx),$E
759
760 vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
761 vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19
762 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
763 vmovdqu 16($inp),@X[-3&7]
764 vmovdqu 32($inp),@X[-2&7]
765 vmovdqu 48($inp),@X[-1&7]
766 vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
767 add \$64,$inp
768 vpshufb @X[2],@X[-3&7],@X[-3&7]
769 vpshufb @X[2],@X[-2&7],@X[-2&7]
770 vpshufb @X[2],@X[-1&7],@X[-1&7]
771 vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19
772 vpaddd @Tx[1],@X[-3&7],@X[1]
773 vpaddd @Tx[1],@X[-2&7],@X[2]
774 vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
775 vmovdqa @X[1],16(%rsp)
776 vmovdqa @X[2],32(%rsp)
777 jmp .Loop_avx
778___
779
780sub Xupdate_avx_16_31() # recall that $Xi starts with 4
781{ use integer;
782 my $body = shift;
783 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
784 my ($a,$b,$c,$d,$e);
785
786 eval(shift(@insns));
787 eval(shift(@insns));
788 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
789 eval(shift(@insns));
790 eval(shift(@insns));
791
792 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
793 eval(shift(@insns));
794 eval(shift(@insns));
795 &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
796 eval(shift(@insns));
797 eval(shift(@insns));
798 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
799 eval(shift(@insns));
800 eval(shift(@insns));
801
802 &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
803 eval(shift(@insns));
804 eval(shift(@insns));
805 eval(shift(@insns));
806 eval(shift(@insns));
807
808 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
809 eval(shift(@insns));
810 eval(shift(@insns));
811 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
812 eval(shift(@insns));
813 eval(shift(@insns));
814
815 &vpsrld (@Tx[0],@X[0],31);
816 eval(shift(@insns));
817 eval(shift(@insns));
818 eval(shift(@insns));
819 eval(shift(@insns));
820
821 &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
822 &vpaddd (@X[0],@X[0],@X[0]);
823 eval(shift(@insns));
824 eval(shift(@insns));
825 eval(shift(@insns));
826 eval(shift(@insns));
827
828 &vpsrld (@Tx[1],@Tx[2],30);
829 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
830 eval(shift(@insns));
831 eval(shift(@insns));
832 eval(shift(@insns));
833 eval(shift(@insns));
834
835 &vpslld (@Tx[2],@Tx[2],2);
836 &vpxor (@X[0],@X[0],@Tx[1]);
837 eval(shift(@insns));
838 eval(shift(@insns));
839 eval(shift(@insns));
840 eval(shift(@insns));
841
842 &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
843 eval(shift(@insns));
844 eval(shift(@insns));
845 &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
846 eval(shift(@insns));
847 eval(shift(@insns));
848
849
850 foreach (@insns) { eval; } # remaining instructions [if any]
851
852 $Xi++; push(@X,shift(@X)); # "rotate" X[]
853 push(@Tx,shift(@Tx));
854}
855
856sub Xupdate_avx_32_79()
857{ use integer;
858 my $body = shift;
859 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
860 my ($a,$b,$c,$d,$e);
861
862 &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
863 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
864 eval(shift(@insns)); # body_20_39
865 eval(shift(@insns));
866 eval(shift(@insns));
867 eval(shift(@insns)); # rol
868
869 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
870 eval(shift(@insns));
871 eval(shift(@insns)) if ($insns[0] !~ /&ro[rl]/);
872 if ($Xi%5) {
873 &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
874 } else { # ... or load next one
875 &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
876 }
877 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
878 eval(shift(@insns)); # ror
879 eval(shift(@insns));
880
881 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
882 eval(shift(@insns)); # body_20_39
883 eval(shift(@insns));
884 eval(shift(@insns));
885 eval(shift(@insns)); # rol
886
887 &vpsrld (@Tx[0],@X[0],30);
888 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
889 eval(shift(@insns));
890 eval(shift(@insns));
891 eval(shift(@insns)); # ror
892 eval(shift(@insns));
893
894 &vpslld (@X[0],@X[0],2);
895 eval(shift(@insns)); # body_20_39
896 eval(shift(@insns));
897 eval(shift(@insns));
898 eval(shift(@insns)); # rol
899 eval(shift(@insns));
900 eval(shift(@insns));
901 eval(shift(@insns)); # ror
902 eval(shift(@insns));
903
904 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
905 eval(shift(@insns)); # body_20_39
906 eval(shift(@insns));
907 &vmovdqa (@Tx[1],@X[0]) if ($Xi<19);
908 eval(shift(@insns));
909 eval(shift(@insns)); # rol
910 eval(shift(@insns));
911 eval(shift(@insns));
912 eval(shift(@insns)); # rol
913 eval(shift(@insns));
914
915 foreach (@insns) { eval; } # remaining instructions
916
917 $Xi++; push(@X,shift(@X)); # "rotate" X[]
918 push(@Tx,shift(@Tx));
919}
920
921sub Xuplast_avx_80()
922{ use integer;
923 my $body = shift;
924 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
925 my ($a,$b,$c,$d,$e);
926
927 eval(shift(@insns));
928 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
929 eval(shift(@insns));
930 eval(shift(@insns));
931 eval(shift(@insns));
932 eval(shift(@insns));
933
934 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
935
936 foreach (@insns) { eval; } # remaining instructions
937
938 &cmp ($inp,$num);
939 &je (".Ldone_avx");
940
941 unshift(@Tx,pop(@Tx));
942
943 &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
944 &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
945 &vmovdqu(@X[-4&7],"0($inp)"); # load input
946 &vmovdqu(@X[-3&7],"16($inp)");
947 &vmovdqu(@X[-2&7],"32($inp)");
948 &vmovdqu(@X[-1&7],"48($inp)");
949 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
950 &add ($inp,64);
951
952 $Xi=0;
953}
954
955sub Xloop_avx()
956{ use integer;
957 my $body = shift;
958 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
959 my ($a,$b,$c,$d,$e);
960
961 eval(shift(@insns));
962 eval(shift(@insns));
963 &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
964 eval(shift(@insns));
965 eval(shift(@insns));
966 &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
967 eval(shift(@insns));
968 eval(shift(@insns));
969 eval(shift(@insns));
970 eval(shift(@insns));
971 &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU
972 eval(shift(@insns));
973 eval(shift(@insns));
974
975 foreach (@insns) { eval; }
976 $Xi++;
977}
978
979sub Xtail_avx()
980{ use integer;
981 my $body = shift;
982 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
983 my ($a,$b,$c,$d,$e);
984
985 foreach (@insns) { eval; }
986}
987
988$code.=<<___;
989.align 16
990.Loop_avx:
991___
992 &Xupdate_avx_16_31(\&body_00_19);
993 &Xupdate_avx_16_31(\&body_00_19);
994 &Xupdate_avx_16_31(\&body_00_19);
995 &Xupdate_avx_16_31(\&body_00_19);
996 &Xupdate_avx_32_79(\&body_00_19);
997 &Xupdate_avx_32_79(\&body_20_39);
998 &Xupdate_avx_32_79(\&body_20_39);
999 &Xupdate_avx_32_79(\&body_20_39);
1000 &Xupdate_avx_32_79(\&body_20_39);
1001 &Xupdate_avx_32_79(\&body_20_39);
1002 &Xupdate_avx_32_79(\&body_40_59);
1003 &Xupdate_avx_32_79(\&body_40_59);
1004 &Xupdate_avx_32_79(\&body_40_59);
1005 &Xupdate_avx_32_79(\&body_40_59);
1006 &Xupdate_avx_32_79(\&body_40_59);
1007 &Xupdate_avx_32_79(\&body_20_39);
1008 &Xuplast_avx_80(\&body_20_39); # can jump to "done"
1009
1010 $saved_j=$j; @saved_V=@V;
1011
1012 &Xloop_avx(\&body_20_39);
1013 &Xloop_avx(\&body_20_39);
1014 &Xloop_avx(\&body_20_39);
1015
1016$code.=<<___;
1017 add 0($ctx),$A # update context
1018 add 4($ctx),@T[0]
1019 add 8($ctx),$C
1020 add 12($ctx),$D
1021 mov $A,0($ctx)
1022 add 16($ctx),$E
1023 mov @T[0],4($ctx)
1024 mov @T[0],$B # magic seed
1025 mov $C,8($ctx)
1026 mov $D,12($ctx)
1027 mov $E,16($ctx)
1028 jmp .Loop_avx
1029
1030.align 16
1031.Ldone_avx:
1032___
1033 $j=$saved_j; @V=@saved_V;
1034
1035 &Xtail_avx(\&body_20_39);
1036 &Xtail_avx(\&body_20_39);
1037 &Xtail_avx(\&body_20_39);
1038
1039$code.=<<___;
1040 vzeroall
1041
1042 add 0($ctx),$A # update context
1043 add 4($ctx),@T[0]
1044 add 8($ctx),$C
1045 mov $A,0($ctx)
1046 add 12($ctx),$D
1047 mov @T[0],4($ctx)
1048 add 16($ctx),$E
1049 mov $C,8($ctx)
1050 mov $D,12($ctx)
1051 mov $E,16($ctx)
1052___
1053$code.=<<___ if ($win64);
1054 movaps 64+0(%rsp),%xmm6
1055 movaps 64+16(%rsp),%xmm7
1056 movaps 64+32(%rsp),%xmm8
1057 movaps 64+48(%rsp),%xmm9
1058 movaps 64+64(%rsp),%xmm10
1059___
1060$code.=<<___;
1061 lea `64+($win64?5*16:0)`(%rsp),%rsi
1062 mov 0(%rsi),%r12
1063 mov 8(%rsi),%rbp
1064 mov 16(%rsi),%rbx
1065 lea 24(%rsi),%rsp
1066.Lepilogue_avx:
1067 ret
1068.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
1069___
1070}
1071$code.=<<___;
1072.align 64
1073K_XX_XX:
1074.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
1075.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
1076.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
1077.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
1078.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
1079___
1080}}}
1081$code.=<<___;
1082.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1083.align 64
1084___
1085
1086# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1087# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1088if ($win64) {
1089$rec="%rcx";
1090$frame="%rdx";
1091$context="%r8";
1092$disp="%r9";
1093
1094$code.=<<___;
1095.extern __imp_RtlVirtualUnwind
1096.type se_handler,\@abi-omnipotent
1097.align 16
1098se_handler:
1099 push %rsi
1100 push %rdi
1101 push %rbx
1102 push %rbp
1103 push %r12
1104 push %r13
1105 push %r14
1106 push %r15
1107 pushfq
1108 sub \$64,%rsp
1109
1110 mov 120($context),%rax # pull context->Rax
1111 mov 248($context),%rbx # pull context->Rip
1112
1113 lea .Lprologue(%rip),%r10
1114 cmp %r10,%rbx # context->Rip<.Lprologue
1115 jb .Lcommon_seh_tail
1116
1117 mov 152($context),%rax # pull context->Rsp
1118
1119 lea .Lepilogue(%rip),%r10
1120 cmp %r10,%rbx # context->Rip>=.Lepilogue
1121 jae .Lcommon_seh_tail
1122
1123 mov `16*4`(%rax),%rax # pull saved stack pointer
1124 lea 32(%rax),%rax
1125
1126 mov -8(%rax),%rbx
1127 mov -16(%rax),%rbp
1128 mov -24(%rax),%r12
1129 mov -32(%rax),%r13
1130 mov %rbx,144($context) # restore context->Rbx
1131 mov %rbp,160($context) # restore context->Rbp
1132 mov %r12,216($context) # restore context->R12
1133 mov %r13,224($context) # restore context->R13
1134
1135 jmp .Lcommon_seh_tail
1136.size se_handler,.-se_handler
1137
1138.type ssse3_handler,\@abi-omnipotent
1139.align 16
1140ssse3_handler:
1141 push %rsi
1142 push %rdi
1143 push %rbx
1144 push %rbp
1145 push %r12
1146 push %r13
1147 push %r14
1148 push %r15
1149 pushfq
1150 sub \$64,%rsp
1151
1152 mov 120($context),%rax # pull context->Rax
1153 mov 248($context),%rbx # pull context->Rip
1154
1155 mov 8($disp),%rsi # disp->ImageBase
1156 mov 56($disp),%r11 # disp->HandlerData
1157
1158 mov 0(%r11),%r10d # HandlerData[0]
1159 lea (%rsi,%r10),%r10 # prologue label
1160 cmp %r10,%rbx # context->Rip<prologue label
1161 jb .Lcommon_seh_tail
1162
1163 mov 152($context),%rax # pull context->Rsp
1164
1165 mov 4(%r11),%r10d # HandlerData[1]
1166 lea (%rsi,%r10),%r10 # epilogue label
1167 cmp %r10,%rbx # context->Rip>=epilogue label
1168 jae .Lcommon_seh_tail
1169
1170 lea 64(%rax),%rsi
1171 lea 512($context),%rdi # &context.Xmm6
1172 mov \$10,%ecx
1173 .long 0xa548f3fc # cld; rep movsq
1174 lea `24+64+5*16`(%rax),%rax # adjust stack pointer
1175
1176 mov -8(%rax),%rbx
1177 mov -16(%rax),%rbp
1178 mov -24(%rax),%r12
1179 mov %rbx,144($context) # restore context->Rbx
1180 mov %rbp,160($context) # restore context->Rbp
1181 mov %r12,216($context) # restore context->R12
1182
1183.Lcommon_seh_tail:
1184 mov 8(%rax),%rdi
1185 mov 16(%rax),%rsi
1186 mov %rax,152($context) # restore context->Rsp
1187 mov %rsi,168($context) # restore context->Rsi
1188 mov %rdi,176($context) # restore context->Rdi
1189
1190 mov 40($disp),%rdi # disp->ContextRecord
1191 mov $context,%rsi # context
1192 mov \$154,%ecx # sizeof(CONTEXT)
1193 .long 0xa548f3fc # cld; rep movsq
1194
1195 mov $disp,%rsi
1196 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1197 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1198 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1199 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1200 mov 40(%rsi),%r10 # disp->ContextRecord
1201 lea 56(%rsi),%r11 # &disp->HandlerData
1202 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1203 mov %r10,32(%rsp) # arg5
1204 mov %r11,40(%rsp) # arg6
1205 mov %r12,48(%rsp) # arg7
1206 mov %rcx,56(%rsp) # arg8, (NULL)
1207 call *__imp_RtlVirtualUnwind(%rip)
1208
1209 mov \$1,%eax # ExceptionContinueSearch
1210 add \$64,%rsp
1211 popfq
1212 pop %r15
1213 pop %r14
1214 pop %r13
1215 pop %r12
1216 pop %rbp
1217 pop %rbx
1218 pop %rdi
1219 pop %rsi
1220 ret
1221.size ssse3_handler,.-ssse3_handler
1222
1223.section .pdata
1224.align 4
1225 .rva .LSEH_begin_sha1_block_data_order
1226 .rva .LSEH_end_sha1_block_data_order
1227 .rva .LSEH_info_sha1_block_data_order
1228 .rva .LSEH_begin_sha1_block_data_order_ssse3
1229 .rva .LSEH_end_sha1_block_data_order_ssse3
1230 .rva .LSEH_info_sha1_block_data_order_ssse3
1231___
1232$code.=<<___ if ($avx);
1233 .rva .LSEH_begin_sha1_block_data_order_avx
1234 .rva .LSEH_end_sha1_block_data_order_avx
1235 .rva .LSEH_info_sha1_block_data_order_avx
1236___
1237$code.=<<___;
1238.section .xdata
1239.align 8
1240.LSEH_info_sha1_block_data_order:
1241 .byte 9,0,0,0
1242 .rva se_handler
1243.LSEH_info_sha1_block_data_order_ssse3:
1244 .byte 9,0,0,0
1245 .rva ssse3_handler
1246 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
1247___
1248$code.=<<___ if ($avx);
1249.LSEH_info_sha1_block_data_order_avx:
1250 .byte 9,0,0,0
1251 .rva ssse3_handler
1252 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
1253___
1254}
1255
1256####################################################################
1257
1258$code =~ s/\`([^\`]*)\`/eval $1/gem;
1259print $code;
1260close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha256-586.pl b/src/lib/libcrypto/sha/asm/sha256-586.pl
deleted file mode 100644
index 928ec53123..0000000000
--- a/src/lib/libcrypto/sha/asm/sha256-586.pl
+++ /dev/null
@@ -1,249 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# SHA256 block transform for x86. September 2007.
11#
12# Performance in clock cycles per processed byte (less is better):
13#
14# Pentium PIII P4 AMD K8 Core2
15# gcc 46 36 41 27 26
16# icc 57 33 38 25 23
17# x86 asm 40 30 33 20 18
18# x86_64 asm(*) - - 21 16 16
19#
20# (*) x86_64 assembler performance is presented for reference
21# purposes.
22#
23# Performance improvement over compiler generated code varies from
24# 10% to 40% [see above]. Not very impressive on some µ-archs, but
25# it's 5 times smaller and optimizes the number of writes.
26
27$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
28push(@INC,"${dir}","${dir}../../perlasm");
29require "x86asm.pl";
30
31&asm_init($ARGV[0],"sha256-586.pl",$ARGV[$#ARGV] eq "386");
32
33$A="eax";
34$E="edx";
35$T="ebx";
36$Aoff=&DWP(0,"esp");
37$Boff=&DWP(4,"esp");
38$Coff=&DWP(8,"esp");
39$Doff=&DWP(12,"esp");
40$Eoff=&DWP(16,"esp");
41$Foff=&DWP(20,"esp");
42$Goff=&DWP(24,"esp");
43$Hoff=&DWP(28,"esp");
44$Xoff=&DWP(32,"esp");
45$K256="ebp";
46
47sub BODY_00_15() {
48 my $in_16_63=shift;
49
50 &mov ("ecx",$E);
51 &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2])
52 &ror ("ecx",25-11);
53 &mov ("esi",$Foff);
54 &xor ("ecx",$E);
55 &ror ("ecx",11-6);
56 &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0]
57 &xor ("ecx",$E);
58 &ror ("ecx",6); # Sigma1(e)
59 &mov ("edi",$Goff);
60 &add ($T,"ecx"); # T += Sigma1(e)
61
62 &xor ("esi","edi");
63 &mov ($Eoff,$E); # modulo-scheduled
64 &mov ("ecx",$A);
65 &and ("esi",$E);
66 &mov ($E,$Doff); # e becomes d, which is e in next iteration
67 &xor ("esi","edi"); # Ch(e,f,g)
68 &mov ("edi",$A);
69 &add ($T,"esi"); # T += Ch(e,f,g)
70
71 &ror ("ecx",22-13);
72 &add ($T,$Hoff); # T += h
73 &xor ("ecx",$A);
74 &ror ("ecx",13-2);
75 &mov ("esi",$Boff);
76 &xor ("ecx",$A);
77 &ror ("ecx",2); # Sigma0(a)
78 &add ($E,$T); # d += T
79 &mov ("edi",$Coff);
80
81 &add ($T,"ecx"); # T += Sigma0(a)
82 &mov ($Aoff,$A); # modulo-scheduled
83
84 &mov ("ecx",$A);
85 &sub ("esp",4);
86 &or ($A,"esi"); # a becomes h, which is a in next iteration
87 &and ("ecx","esi");
88 &and ($A,"edi");
89 &mov ("esi",&DWP(0,$K256));
90 &or ($A,"ecx"); # h=Maj(a,b,c)
91
92 &add ($K256,4);
93 &add ($A,$T); # h += T
94 &mov ($T,&DWP(4*(8+15+16-1),"esp")) if ($in_16_63); # preload T
95 &add ($E,"esi"); # d += K256[i]
96 &add ($A,"esi"); # h += K256[i]
97}
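# The Maj(a,b,c) sequence above uses the ((a|b) & c) | (a & b) form of
# the majority function, which needs only two temporaries; in plain Perl:
#
#	$h = ((($a | $b) & $c) | ($a & $b)) & 0xffffffff;	# == Maj(a,b,c)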
98
99&function_begin("sha256_block_data_order");
100 &mov ("esi",wparam(0)); # ctx
101 &mov ("edi",wparam(1)); # inp
102 &mov ("eax",wparam(2)); # num
103 &mov ("ebx","esp"); # saved sp
104
105 &call (&label("pic_point")); # make it PIC!
106&set_label("pic_point");
107 &blindpop($K256);
108 &lea ($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));
109
110 &sub ("esp",16);
111 &and ("esp",-64);
112
113 &shl ("eax",6);
114 &add ("eax","edi");
115 &mov (&DWP(0,"esp"),"esi"); # ctx
116 &mov (&DWP(4,"esp"),"edi"); # inp
117 &mov (&DWP(8,"esp"),"eax"); # inp+num*128
118 &mov (&DWP(12,"esp"),"ebx"); # saved sp
119
120&set_label("loop",16);
121 # copy input block to stack reversing byte and dword order
122 for($i=0;$i<4;$i++) {
123 &mov ("eax",&DWP($i*16+0,"edi"));
124 &mov ("ebx",&DWP($i*16+4,"edi"));
125 &mov ("ecx",&DWP($i*16+8,"edi"));
126 &mov ("edx",&DWP($i*16+12,"edi"));
127 &bswap ("eax");
128 &bswap ("ebx");
129 &bswap ("ecx");
130 &bswap ("edx");
131 &push ("eax");
132 &push ("ebx");
133 &push ("ecx");
134 &push ("edx");
135 }
136 &add ("edi",64);
137 &sub ("esp",4*8); # place for A,B,C,D,E,F,G,H
138 &mov (&DWP(4*(8+16)+4,"esp"),"edi");
139
140 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
141 &mov ($A,&DWP(0,"esi"));
142 &mov ("ebx",&DWP(4,"esi"));
143 &mov ("ecx",&DWP(8,"esi"));
144 &mov ("edi",&DWP(12,"esi"));
145 # &mov ($Aoff,$A);
146 &mov ($Boff,"ebx");
147 &mov ($Coff,"ecx");
148 &mov ($Doff,"edi");
149 &mov ($E,&DWP(16,"esi"));
150 &mov ("ebx",&DWP(20,"esi"));
151 &mov ("ecx",&DWP(24,"esi"));
152 &mov ("edi",&DWP(28,"esi"));
153 # &mov ($Eoff,$E);
154 &mov ($Foff,"ebx");
155 &mov ($Goff,"ecx");
156 &mov ($Hoff,"edi");
157
158&set_label("00_15",16);
159 &mov ($T,&DWP(4*(8+15),"esp"));
160
161 &BODY_00_15();
162
163 &cmp ("esi",0xc19bf174);
164 &jne (&label("00_15"));
165
166 &mov ($T,&DWP(4*(8+15+16-1),"esp")); # preloaded in BODY_00_15(1)
167&set_label("16_63",16);
168 &mov ("esi",$T);
169 &mov ("ecx",&DWP(4*(8+15+16-14),"esp"));
170 &ror ("esi",18-7);
171 &mov ("edi","ecx");
172 &xor ("esi",$T);
173 &ror ("esi",7);
174 &shr ($T,3);
175
176 &ror ("edi",19-17);
177 &xor ($T,"esi"); # T = sigma0(X[-15])
178 &xor ("edi","ecx");
179 &ror ("edi",17);
180 &shr ("ecx",10);
181 &add ($T,&DWP(4*(8+15+16),"esp")); # T += X[-16]
182 &xor ("edi","ecx"); # sigma1(X[-2])
183
184 &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7]
185 # &add ($T,"edi"); # T += sigma1(X[-2])
186 # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0]
187
188 &BODY_00_15(1);
189
190 &cmp ("esi",0xc67178f2);
191 &jne (&label("16_63"));
192
193 &mov ("esi",&DWP(4*(8+16+64)+0,"esp"));#ctx
194 # &mov ($A,$Aoff);
195 &mov ("ebx",$Boff);
196 &mov ("ecx",$Coff);
197 &mov ("edi",$Doff);
198 &add ($A,&DWP(0,"esi"));
199 &add ("ebx",&DWP(4,"esi"));
200 &add ("ecx",&DWP(8,"esi"));
201 &add ("edi",&DWP(12,"esi"));
202 &mov (&DWP(0,"esi"),$A);
203 &mov (&DWP(4,"esi"),"ebx");
204 &mov (&DWP(8,"esi"),"ecx");
205 &mov (&DWP(12,"esi"),"edi");
206 # &mov ($E,$Eoff);
207 &mov ("eax",$Foff);
208 &mov ("ebx",$Goff);
209 &mov ("ecx",$Hoff);
210 &mov ("edi",&DWP(4*(8+16+64)+4,"esp"));#inp
211 &add ($E,&DWP(16,"esi"));
212 &add ("eax",&DWP(20,"esi"));
213 &add ("ebx",&DWP(24,"esi"));
214 &add ("ecx",&DWP(28,"esi"));
215 &mov (&DWP(16,"esi"),$E);
216 &mov (&DWP(20,"esi"),"eax");
217 &mov (&DWP(24,"esi"),"ebx");
218 &mov (&DWP(28,"esi"),"ecx");
219
220 &add ("esp",4*(8+16+64)); # destroy frame
221 &sub ($K256,4*64); # rewind K
222
223 &cmp ("edi",&DWP(8,"esp")); # are we done yet?
224 &jb (&label("loop"));
225
226 &mov ("esp",&DWP(12,"esp")); # restore sp
227&function_end_A();
228
229&set_label("K256",64); # Yes! I keep it in the code segment!
230 &data_word(0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5);
231 &data_word(0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5);
232 &data_word(0xd807aa98,0x12835b01,0x243185be,0x550c7dc3);
233 &data_word(0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174);
234 &data_word(0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc);
235 &data_word(0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da);
236 &data_word(0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7);
237 &data_word(0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967);
238 &data_word(0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13);
239 &data_word(0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85);
240 &data_word(0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3);
241 &data_word(0xd192e819,0xd6990624,0xf40e3585,0x106aa070);
242 &data_word(0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5);
243 &data_word(0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3);
244 &data_word(0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208);
245 &data_word(0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2);
246&function_end_B("sha256_block_data_order");
247&asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
248
249&asm_finish();
diff --git a/src/lib/libcrypto/sha/asm/sha256-armv4.pl b/src/lib/libcrypto/sha/asm/sha256-armv4.pl
deleted file mode 100644
index 9c84e8d93c..0000000000
--- a/src/lib/libcrypto/sha/asm/sha256-armv4.pl
+++ /dev/null
@@ -1,211 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256 block procedure for ARMv4. May 2007.
11
12# Performance is ~2x better than gcc 3.4 generated code and in "absolute"
13# terms is ~2250 cycles per 64-byte block or ~35 cycles per
14# byte [on single-issue Xscale PXA250 core].
15
16# July 2010.
17#
18# Rescheduling for dual-issue pipeline resulted in 22% improvement on
19# Cortex A8 core and ~20 cycles per processed byte.
20
21# February 2011.
22#
23# Profiler-assisted and platform-specific optimization resulted in 16%
24# improvement on Cortex A8 core and ~17 cycles per processed byte.
25
26while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
27open STDOUT,">$output";
28
29$ctx="r0"; $t0="r0";
30$inp="r1"; $t3="r1";
31$len="r2"; $t1="r2";
32$T1="r3";
33$A="r4";
34$B="r5";
35$C="r6";
36$D="r7";
37$E="r8";
38$F="r9";
39$G="r10";
40$H="r11";
41@V=($A,$B,$C,$D,$E,$F,$G,$H);
42$t2="r12";
43$Ktbl="r14";
44
45@Sigma0=( 2,13,22);
46@Sigma1=( 6,11,25);
47@sigma0=( 7,18, 3);
48@sigma1=(17,19,10);
49
50sub BODY_00_15 {
51my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
52
53$code.=<<___ if ($i<16);
54#if __ARM_ARCH__>=7
55 ldr $T1,[$inp],#4
56#else
57 ldrb $T1,[$inp,#3] @ $i
58 ldrb $t2,[$inp,#2]
59 ldrb $t1,[$inp,#1]
60 ldrb $t0,[$inp],#4
61 orr $T1,$T1,$t2,lsl#8
62 orr $T1,$T1,$t1,lsl#16
63 orr $T1,$T1,$t0,lsl#24
64#endif
65___
66$code.=<<___;
67 mov $t0,$e,ror#$Sigma1[0]
68 ldr $t2,[$Ktbl],#4 @ *K256++
69 eor $t0,$t0,$e,ror#$Sigma1[1]
70 eor $t1,$f,$g
71#if $i>=16
72 add $T1,$T1,$t3 @ from BODY_16_xx
73#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
74 rev $T1,$T1
75#endif
76#if $i==15
77 str $inp,[sp,#17*4] @ leave room for $t3
78#endif
79 eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
80 and $t1,$t1,$e
81 str $T1,[sp,#`$i%16`*4]
82 add $T1,$T1,$t0
83 eor $t1,$t1,$g @ Ch(e,f,g)
84 add $T1,$T1,$h
85 mov $h,$a,ror#$Sigma0[0]
86 add $T1,$T1,$t1
87 eor $h,$h,$a,ror#$Sigma0[1]
88 add $T1,$T1,$t2
89 eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
90#if $i>=15
91 ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx
92#endif
93 orr $t0,$a,$b
94 and $t1,$a,$b
95 and $t0,$t0,$c
96 add $h,$h,$T1
97 orr $t0,$t0,$t1 @ Maj(a,b,c)
98 add $d,$d,$T1
99 add $h,$h,$t0
100___
101}
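# The orr/and sequence above computes Maj(a,b,c) as ((a|b)&c)|(a&b) rather
# than the textbook (a&b)^(a&c)^(b&c); the two agree bit for bit. A
# stand-alone Perl spot-check (illustrative only, not part of the module):
for (1 .. 1000) {
	my ($a, $b, $c) = map { int(rand(2**32)) } 1 .. 3;
	my $ref = ($a & $b) ^ ($a & $c) ^ ($b & $c);
	my $opt = (($a | $b) & $c) | ($a & $b);
	die "Maj identity broken" unless $ref == $opt;
}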
102
103sub BODY_16_XX {
104my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
105
106$code.=<<___;
107 @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i
108 ldr $t2,[sp,#`($i+14)%16`*4]
109 mov $t0,$t3,ror#$sigma0[0]
110 ldr $T1,[sp,#`($i+0)%16`*4]
111 eor $t0,$t0,$t3,ror#$sigma0[1]
112 ldr $t1,[sp,#`($i+9)%16`*4]
113 eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1])
114 mov $t3,$t2,ror#$sigma1[0]
115 add $T1,$T1,$t0
116 eor $t3,$t3,$t2,ror#$sigma1[1]
117 add $T1,$T1,$t1
118 eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
119 @ add $T1,$T1,$t3
120___
121 &BODY_00_15(@_);
122}
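# For reference, the message-schedule recurrence that BODY_16_XX unrolls,
# written as plain Perl with the rotation counts from @sigma0/@sigma1
# above (a sketch, illustrative only, not part of the module):
sub ROTR32 { my ($x, $n) = @_; (($x >> $n) | ($x << (32 - $n))) & 0xffffffff }
sub sigma0_ref { my $x = shift; ROTR32($x, 7) ^ ROTR32($x, 18) ^ ($x >> 3) }
sub sigma1_ref { my $x = shift; ROTR32($x, 17) ^ ROTR32($x, 19) ^ ($x >> 10) }
my @W = (0) x 16;	# the 16 input words would come from the message block
for my $i (16 .. 63) {
	$W[$i] = (sigma1_ref($W[$i-2]) + $W[$i-7] +
		  sigma0_ref($W[$i-15]) + $W[$i-16]) & 0xffffffff;
}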
123
124$code=<<___;
125#include "arm_arch.h"
126
127.text
128.code 32
129
130.type K256,%object
131.align 5
132K256:
133.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
134.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
135.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
136.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
137.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
138.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
139.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
140.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
141.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
142.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
143.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
144.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
145.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
146.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
147.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
148.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
149.size K256,.-K256
150
151.global sha256_block_data_order
152.type sha256_block_data_order,%function
153sha256_block_data_order:
154 sub r3,pc,#8 @ sha256_block_data_order
155 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
156 stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
157 ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
158 sub $Ktbl,r3,#256 @ K256
159 sub sp,sp,#16*4 @ alloca(X[16])
160.Loop:
161___
162for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
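# unshift(@V,pop(@V)) rotates the variable list at generation time: the
# register written as $h this round (the new "a") is addressed as $a in
# the next round, so the emitted code never moves data between registers.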
163$code.=".Lrounds_16_xx:\n";
164for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
165$code.=<<___;
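	@ $t2 still holds the K256 word fetched this round; the table ends
	@ with 0xc67178f2, so a low byte of 0xf2 identifies round 63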
166 and $t2,$t2,#0xff
167 cmp $t2,#0xf2
168 bne .Lrounds_16_xx
169
170 ldr $T1,[sp,#16*4] @ pull ctx
171 ldr $t0,[$T1,#0]
172 ldr $t1,[$T1,#4]
173 ldr $t2,[$T1,#8]
174 add $A,$A,$t0
175 ldr $t0,[$T1,#12]
176 add $B,$B,$t1
177 ldr $t1,[$T1,#16]
178 add $C,$C,$t2
179 ldr $t2,[$T1,#20]
180 add $D,$D,$t0
181 ldr $t0,[$T1,#24]
182 add $E,$E,$t1
183 ldr $t1,[$T1,#28]
184 add $F,$F,$t2
185 ldr $inp,[sp,#17*4] @ pull inp
186 ldr $t2,[sp,#18*4] @ pull inp+len
187 add $G,$G,$t0
188 add $H,$H,$t1
189 stmia $T1,{$A,$B,$C,$D,$E,$F,$G,$H}
190 cmp $inp,$t2
191 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
192 bne .Loop
193
194 add sp,sp,#`16+3`*4 @ destroy frame
195#if __ARM_ARCH__>=5
196 ldmia sp!,{r4-r11,pc}
197#else
198 ldmia sp!,{r4-r11,lr}
199 tst lr,#1
200 moveq pc,lr @ be binary compatible with V4, yet
201 bx lr @ interoperable with Thumb ISA:-)
202#endif
203.size sha256_block_data_order,.-sha256_block_data_order
204.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
205.align 2
206___
207
208$code =~ s/\`([^\`]*)\`/eval $1/gem;
209$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
210print $code;
211close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/sha/asm/sha512-586.pl b/src/lib/libcrypto/sha/asm/sha512-586.pl
deleted file mode 100644
index 5b9f3337ad..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-586.pl
+++ /dev/null
@@ -1,644 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# SHA512 block transform for x86. September 2007.
11#
12# Performance in clock cycles per processed byte (less is better):
13#
14# Pentium PIII P4 AMD K8 Core2
15# gcc 100 75 116 54 66
16# icc 97 77 95 55 57
17# x86 asm 61 56 82 36 40
18# SSE2 asm - - 38 24 20
19# x86_64 asm(*) - - 30 10.0 10.5
20#
21# (*) x86_64 assembler performance is presented for reference
22# purposes.
23#
24# IALU code-path is optimized for older Pentiums. On vanilla Pentium
25# performance improvement over compiler generated code reaches ~60%,
26# while on PIII - ~35%. On newer µ-archs improvement varies from 15%
27# to 50%, but it's less important as they are expected to execute SSE2
28# code-path, which is commonly ~2-3x faster [than compiler generated
29# code]. SSE2 code-path is as fast as original sha512-sse2.pl, even
30# though it does not use 128-bit operations. The latter means that
31# SSE2-aware kernel is no longer required to execute the code. Another
32# difference is that new code optimizes amount of writes, but at the
33# cost of increased data cache "footprint" by 1/2KB.
34
35$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
36push(@INC,"${dir}","${dir}../../perlasm");
37require "x86asm.pl";
38
39&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
40
41$sse2=0;
42for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
43
44&external_label("OPENSSL_ia32cap_P") if ($sse2);
45
46$Tlo=&DWP(0,"esp"); $Thi=&DWP(4,"esp");
47$Alo=&DWP(8,"esp"); $Ahi=&DWP(8+4,"esp");
48$Blo=&DWP(16,"esp"); $Bhi=&DWP(16+4,"esp");
49$Clo=&DWP(24,"esp"); $Chi=&DWP(24+4,"esp");
50$Dlo=&DWP(32,"esp"); $Dhi=&DWP(32+4,"esp");
51$Elo=&DWP(40,"esp"); $Ehi=&DWP(40+4,"esp");
52$Flo=&DWP(48,"esp"); $Fhi=&DWP(48+4,"esp");
53$Glo=&DWP(56,"esp"); $Ghi=&DWP(56+4,"esp");
54$Hlo=&DWP(64,"esp"); $Hhi=&DWP(64+4,"esp");
55$K512="ebp";
56
57$Asse2=&QWP(0,"esp");
58$Bsse2=&QWP(8,"esp");
59$Csse2=&QWP(16,"esp");
60$Dsse2=&QWP(24,"esp");
61$Esse2=&QWP(32,"esp");
62$Fsse2=&QWP(40,"esp");
63$Gsse2=&QWP(48,"esp");
64$Hsse2=&QWP(56,"esp");
65
66$A="mm0"; # B-D and
67$E="mm4"; # F-H are commonly loaded to respectively mm1-mm3 and
68	# mm5-mm7, but it's done on an on-demand basis...
69
70sub BODY_00_15_sse2 {
71 my $prefetch=shift;
72
73 &movq ("mm5",$Fsse2); # load f
74 &movq ("mm6",$Gsse2); # load g
75 &movq ("mm7",$Hsse2); # load h
76
77 &movq ("mm1",$E); # %mm1 is sliding right
78 &movq ("mm2",$E); # %mm2 is sliding left
79 &psrlq ("mm1",14);
80 &movq ($Esse2,$E); # modulo-scheduled save e
81 &psllq ("mm2",23);
82 &movq ("mm3","mm1"); # %mm3 is T1
83 &psrlq ("mm1",4);
84 &pxor ("mm3","mm2");
85 &psllq ("mm2",23);
86 &pxor ("mm3","mm1");
87 &psrlq ("mm1",23);
88 &pxor ("mm3","mm2");
89 &psllq ("mm2",4);
90 &pxor ("mm3","mm1");
91 &paddq ("mm7",QWP(0,$K512)); # h+=K512[i]
92 &pxor ("mm3","mm2"); # T1=Sigma1_512(e)
93
94 &pxor ("mm5","mm6"); # f^=g
95 &movq ("mm1",$Bsse2); # load b
96 &pand ("mm5",$E); # f&=e
97 &movq ("mm2",$Csse2); # load c
98 &pxor ("mm5","mm6"); # f^=g
99 &movq ($E,$Dsse2); # e = load d
100 &paddq ("mm3","mm5"); # T1+=Ch(e,f,g)
101 &movq (&QWP(0,"esp"),$A); # modulo-scheduled save a
102 &paddq ("mm3","mm7"); # T1+=h
103
104 &movq ("mm5",$A); # %mm5 is sliding right
105 &movq ("mm6",$A); # %mm6 is sliding left
106 &paddq ("mm3",&QWP(8*9,"esp")); # T1+=X[0]
107 &psrlq ("mm5",28);
108 &paddq ($E,"mm3"); # e += T1
109 &psllq ("mm6",25);
110 &movq ("mm7","mm5"); # %mm7 is T2
111 &psrlq ("mm5",6);
112 &pxor ("mm7","mm6");
113 &psllq ("mm6",5);
114 &pxor ("mm7","mm5");
115 &psrlq ("mm5",5);
116 &pxor ("mm7","mm6");
117 &psllq ("mm6",6);
118 &pxor ("mm7","mm5");
119 &sub ("esp",8);
120 &pxor ("mm7","mm6"); # T2=Sigma0_512(a)
121
122 &movq ("mm5",$A); # %mm5=a
123 &por ($A,"mm2"); # a=a|c
124 &movq ("mm6",&QWP(8*(9+16-14),"esp")) if ($prefetch);
125 &pand ("mm5","mm2"); # %mm5=a&c
126 &pand ($A,"mm1"); # a=(a|c)&b
127 &movq ("mm2",&QWP(8*(9+16-1),"esp")) if ($prefetch);
128 &por ("mm5",$A); # %mm5=(a&c)|((a|c)&b)
129 &paddq ("mm7","mm5"); # T2+=Maj(a,b,c)
130 &movq ($A,"mm3"); # a=T1
131
132 &mov (&LB("edx"),&BP(0,$K512));
133 &paddq ($A,"mm7"); # a+=T2
134 &add ($K512,8);
135}
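# Each 64-bit rotate above is built from a psrlq/psllq pair, and the three
# rotates share intermediates (right 14, 18, 41; left 23, 46, 50); xor-ing
# all six terms gives Sigma1(e) because (x>>n) and (x<<(64-n)) occupy
# disjoint bits. In plain Perl, assuming a perl built with 64-bit integers
# (a sketch, illustrative only, not part of the module):
sub ROTR64 { my ($x, $n) = @_; (($x >> $n) | ($x << (64 - $n))) & 0xffffffffffffffff }
sub Sigma1_512 { my $x = shift; ROTR64($x, 14) ^ ROTR64($x, 18) ^ ROTR64($x, 41) }
printf "%016x\n", Sigma1_512(0x510e527fade682d1);	# SHA-512's initial e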
136
137sub BODY_00_15_x86 {
138 #define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
139 # LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
140 # HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
141 &mov ("ecx",$Elo);
142 &mov ("edx",$Ehi);
143 &mov ("esi","ecx");
144
145 &shr ("ecx",9) # lo>>9
146 &mov ("edi","edx");
147 &shr ("edx",9) # hi>>9
148 &mov ("ebx","ecx");
149 &shl ("esi",14); # lo<<14
150 &mov ("eax","edx");
151 &shl ("edi",14); # hi<<14
152 &xor ("ebx","esi");
153
154 &shr ("ecx",14-9); # lo>>14
155 &xor ("eax","edi");
156 &shr ("edx",14-9); # hi>>14
157 &xor ("eax","ecx");
158 &shl ("esi",18-14); # lo<<18
159 &xor ("ebx","edx");
160 &shl ("edi",18-14); # hi<<18
161 &xor ("ebx","esi");
162
163 &shr ("ecx",18-14); # lo>>18
164 &xor ("eax","edi");
165 &shr ("edx",18-14); # hi>>18
166 &xor ("eax","ecx");
167 &shl ("esi",23-18); # lo<<23
168 &xor ("ebx","edx");
169 &shl ("edi",23-18); # hi<<23
170 &xor ("eax","esi");
171 &xor ("ebx","edi"); # T1 = Sigma1(e)
172
173 &mov ("ecx",$Flo);
174 &mov ("edx",$Fhi);
175 &mov ("esi",$Glo);
176 &mov ("edi",$Ghi);
177 &add ("eax",$Hlo);
178 &adc ("ebx",$Hhi); # T1 += h
179 &xor ("ecx","esi");
180 &xor ("edx","edi");
181 &and ("ecx",$Elo);
182 &and ("edx",$Ehi);
183 &add ("eax",&DWP(8*(9+15)+0,"esp"));
184 &adc ("ebx",&DWP(8*(9+15)+4,"esp")); # T1 += X[0]
185 &xor ("ecx","esi");
186	&xor	("edx","edi");	# Ch(e,f,g) = ((f^g)&e)^g
187
188 &mov ("esi",&DWP(0,$K512));
189 &mov ("edi",&DWP(4,$K512)); # K[i]
190 &add ("eax","ecx");
191 &adc ("ebx","edx"); # T1 += Ch(e,f,g)
192 &mov ("ecx",$Dlo);
193 &mov ("edx",$Dhi);
194 &add ("eax","esi");
195 &adc ("ebx","edi"); # T1 += K[i]
196 &mov ($Tlo,"eax");
197 &mov ($Thi,"ebx"); # put T1 away
198 &add ("eax","ecx");
199 &adc ("ebx","edx"); # d += T1
200
201 #define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
202 # LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
203 # HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
204 &mov ("ecx",$Alo);
205 &mov ("edx",$Ahi);
206 &mov ($Dlo,"eax");
207 &mov ($Dhi,"ebx");
208 &mov ("esi","ecx");
209
210 &shr ("ecx",2) # lo>>2
211 &mov ("edi","edx");
212 &shr ("edx",2) # hi>>2
213 &mov ("ebx","ecx");
214 &shl ("esi",4); # lo<<4
215 &mov ("eax","edx");
216 &shl ("edi",4); # hi<<4
217 &xor ("ebx","esi");
218
219 &shr ("ecx",7-2); # lo>>7
220 &xor ("eax","edi");
221 &shr ("edx",7-2); # hi>>7
222 &xor ("ebx","ecx");
223 &shl ("esi",25-4); # lo<<25
224 &xor ("eax","edx");
225 &shl ("edi",25-4); # hi<<25
226 &xor ("eax","esi");
227
228 &shr ("ecx",28-7); # lo>>28
229 &xor ("ebx","edi");
230 &shr ("edx",28-7); # hi>>28
231 &xor ("eax","ecx");
232 &shl ("esi",30-25); # lo<<30
233 &xor ("ebx","edx");
234 &shl ("edi",30-25); # hi<<30
235 &xor ("eax","esi");
236 &xor ("ebx","edi"); # Sigma0(a)
237
238 &mov ("ecx",$Alo);
239 &mov ("edx",$Ahi);
240 &mov ("esi",$Blo);
241 &mov ("edi",$Bhi);
242 &add ("eax",$Tlo);
243 &adc ("ebx",$Thi); # T1 = Sigma0(a)+T1
244 &or ("ecx","esi");
245 &or ("edx","edi");
246 &and ("ecx",$Clo);
247 &and ("edx",$Chi);
248 &and ("esi",$Alo);
249 &and ("edi",$Ahi);
250 &or ("ecx","esi");
251 &or ("edx","edi"); # Maj(a,b,c) = ((a|b)&c)|(a&b)
252
253 &add ("eax","ecx");
254 &adc ("ebx","edx"); # T1 += Maj(a,b,c)
255 &mov ($Tlo,"eax");
256 &mov ($Thi,"ebx");
257
258 &mov (&LB("edx"),&BP(0,$K512)); # pre-fetch LSB of *K
259 &sub ("esp",8);
260 &lea ($K512,&DWP(8,$K512)); # K++
261}
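# The LO/HI comment lines above follow one rule for rotating a 64-bit
# value held as two 32-bit halves: for n < 32 each half becomes
# (half>>n)|(other<<(32-n)), and for n >= 32 the halves swap roles with
# n-32. A stand-alone Perl self-check, assuming a perl built with 64-bit
# integers (illustrative only, not part of the module):
sub rotr64_halves {
	my ($hi, $lo, $n) = @_;
	($hi, $lo, $n) = ($lo, $hi, $n - 32) if $n >= 32;	# swap past 32
	my $m = 0xffffffff;
	return ((($hi >> $n) | ($lo << (32 - $n))) & $m,	# high half
		(($lo >> $n) | ($hi << (32 - $n))) & $m);	# low half
}
my $x = 0x428a2f98d728ae22;
for my $n (14, 18, 41, 28, 34, 39) {	# every Sigma rotation count
	my ($h, $l) = rotr64_halves($x >> 32, $x & 0xffffffff, $n);
	my $ref = (($x >> $n) | ($x << (64 - $n))) & 0xffffffffffffffff;
	die "n=$n" unless (($h << 32) | $l) == $ref;
}
print "ok\n";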
262
263
264&function_begin("sha512_block_data_order");
265 &mov ("esi",wparam(0)); # ctx
266 &mov ("edi",wparam(1)); # inp
267 &mov ("eax",wparam(2)); # num
268 &mov ("ebx","esp"); # saved sp
269
270 &call (&label("pic_point")); # make it PIC!
271&set_label("pic_point");
272 &blindpop($K512);
273 &lea ($K512,&DWP(&label("K512")."-".&label("pic_point"),$K512));
274
275 &sub ("esp",16);
276 &and ("esp",-64);
277
278 &shl ("eax",7);
279 &add ("eax","edi");
280 &mov (&DWP(0,"esp"),"esi"); # ctx
281 &mov (&DWP(4,"esp"),"edi"); # inp
282 &mov (&DWP(8,"esp"),"eax"); # inp+num*128
283 &mov (&DWP(12,"esp"),"ebx"); # saved sp
284
285if ($sse2) {
286 &picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("K512"));
287 &bt (&DWP(0,"edx"),26);
288 &jnc (&label("loop_x86"));
289
290 # load ctx->h[0-7]
291 &movq ($A,&QWP(0,"esi"));
292 &movq ("mm1",&QWP(8,"esi"));
293 &movq ("mm2",&QWP(16,"esi"));
294 &movq ("mm3",&QWP(24,"esi"));
295 &movq ($E,&QWP(32,"esi"));
296 &movq ("mm5",&QWP(40,"esi"));
297 &movq ("mm6",&QWP(48,"esi"));
298 &movq ("mm7",&QWP(56,"esi"));
299 &sub ("esp",8*10);
300
301&set_label("loop_sse2",16);
302 # &movq ($Asse2,$A);
303 &movq ($Bsse2,"mm1");
304 &movq ($Csse2,"mm2");
305 &movq ($Dsse2,"mm3");
306 # &movq ($Esse2,$E);
307 &movq ($Fsse2,"mm5");
308 &movq ($Gsse2,"mm6");
309 &movq ($Hsse2,"mm7");
310
311 &mov ("ecx",&DWP(0,"edi"));
312 &mov ("edx",&DWP(4,"edi"));
313 &add ("edi",8);
314 &bswap ("ecx");
315 &bswap ("edx");
316 &mov (&DWP(8*9+4,"esp"),"ecx");
317 &mov (&DWP(8*9+0,"esp"),"edx");
318
319&set_label("00_14_sse2",16);
320 &mov ("eax",&DWP(0,"edi"));
321 &mov ("ebx",&DWP(4,"edi"));
322 &add ("edi",8);
323 &bswap ("eax");
324 &bswap ("ebx");
325 &mov (&DWP(8*8+4,"esp"),"eax");
326 &mov (&DWP(8*8+0,"esp"),"ebx");
327
328 &BODY_00_15_sse2();
329
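	# dl was loaded with the low byte of the K512 entry consumed this
	# round; 0x35 is K512[14]'s (0x25c71235), so rounds 0..14 run here,
	# while the 0x17 test further down is K512[79]'s (0x4a475817)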
330 &cmp (&LB("edx"),0x35);
331 &jne (&label("00_14_sse2"));
332
333 &BODY_00_15_sse2(1);
334
335&set_label("16_79_sse2",16);
336 #&movq ("mm2",&QWP(8*(9+16-1),"esp")); #prefetched in BODY_00_15
337 #&movq ("mm6",&QWP(8*(9+16-14),"esp"));
338 &movq ("mm1","mm2");
339
340 &psrlq ("mm2",1);
341 &movq ("mm7","mm6");
342 &psrlq ("mm6",6);
343 &movq ("mm3","mm2");
344
345 &psrlq ("mm2",7-1);
346 &movq ("mm5","mm6");
347 &psrlq ("mm6",19-6);
348 &pxor ("mm3","mm2");
349
350 &psrlq ("mm2",8-7);
351 &pxor ("mm5","mm6");
352 &psrlq ("mm6",61-19);
353 &pxor ("mm3","mm2");
354
355 &movq ("mm2",&QWP(8*(9+16),"esp"));
356
357 &psllq ("mm1",56);
358 &pxor ("mm5","mm6");
359 &psllq ("mm7",3);
360 &pxor ("mm3","mm1");
361
362 &paddq ("mm2",&QWP(8*(9+16-9),"esp"));
363
364 &psllq ("mm1",63-56);
365 &pxor ("mm5","mm7");
366 &psllq ("mm7",45-3);
367 &pxor ("mm3","mm1");
368 &pxor ("mm5","mm7");
369
370 &paddq ("mm3","mm5");
371 &paddq ("mm3","mm2");
372 &movq (&QWP(8*9,"esp"),"mm3");
373
374 &BODY_00_15_sse2(1);
375
376 &cmp (&LB("edx"),0x17);
377 &jne (&label("16_79_sse2"));
378
379 # &movq ($A,$Asse2);
380 &movq ("mm1",$Bsse2);
381 &movq ("mm2",$Csse2);
382 &movq ("mm3",$Dsse2);
383 # &movq ($E,$Esse2);
384 &movq ("mm5",$Fsse2);
385 &movq ("mm6",$Gsse2);
386 &movq ("mm7",$Hsse2);
387
388 &paddq ($A,&QWP(0,"esi"));
389 &paddq ("mm1",&QWP(8,"esi"));
390 &paddq ("mm2",&QWP(16,"esi"));
391 &paddq ("mm3",&QWP(24,"esi"));
392 &paddq ($E,&QWP(32,"esi"));
393 &paddq ("mm5",&QWP(40,"esi"));
394 &paddq ("mm6",&QWP(48,"esi"));
395 &paddq ("mm7",&QWP(56,"esi"));
396
397 &movq (&QWP(0,"esi"),$A);
398 &movq (&QWP(8,"esi"),"mm1");
399 &movq (&QWP(16,"esi"),"mm2");
400 &movq (&QWP(24,"esi"),"mm3");
401 &movq (&QWP(32,"esi"),$E);
402 &movq (&QWP(40,"esi"),"mm5");
403 &movq (&QWP(48,"esi"),"mm6");
404 &movq (&QWP(56,"esi"),"mm7");
405
406 &add ("esp",8*80); # destroy frame
407 &sub ($K512,8*80); # rewind K
408
409 &cmp ("edi",&DWP(8*10+8,"esp")); # are we done yet?
410 &jb (&label("loop_sse2"));
411
412 &emms ();
413 &mov ("esp",&DWP(8*10+12,"esp")); # restore sp
414&function_end_A();
415}
416&set_label("loop_x86",16);
417 # copy input block to stack reversing byte and qword order
418 for ($i=0;$i<8;$i++) {
419 &mov ("eax",&DWP($i*16+0,"edi"));
420 &mov ("ebx",&DWP($i*16+4,"edi"));
421 &mov ("ecx",&DWP($i*16+8,"edi"));
422 &mov ("edx",&DWP($i*16+12,"edi"));
423 &bswap ("eax");
424 &bswap ("ebx");
425 &bswap ("ecx");
426 &bswap ("edx");
427 &push ("eax");
428 &push ("ebx");
429 &push ("ecx");
430 &push ("edx");
431 }
432 &add ("edi",128);
433 &sub ("esp",9*8); # place for T,A,B,C,D,E,F,G,H
434 &mov (&DWP(8*(9+16)+4,"esp"),"edi");
435
436 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
437 &lea ("edi",&DWP(8,"esp"));
438 &mov ("ecx",16);
439 &data_word(0xA5F3F689); # rep movsd
440
441&set_label("00_15_x86",16);
442 &BODY_00_15_x86();
443
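	# same sentinel trick as the SSE2 path: dl carries the low byte of
	# the K512 entry just used, 0x94 being K512[15]'s (0xcf692694), so
	# this loop covers the first 16 rounds; 0x17 below is K512[79]'s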
444 &cmp (&LB("edx"),0x94);
445 &jne (&label("00_15_x86"));
446
447&set_label("16_79_x86",16);
448 #define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
449 # LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
450 # HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
451 &mov ("ecx",&DWP(8*(9+15+16-1)+0,"esp"));
452 &mov ("edx",&DWP(8*(9+15+16-1)+4,"esp"));
453 &mov ("esi","ecx");
454
455 &shr ("ecx",1) # lo>>1
456 &mov ("edi","edx");
457 &shr ("edx",1) # hi>>1
458 &mov ("eax","ecx");
459 &shl ("esi",24); # lo<<24
460 &mov ("ebx","edx");
461 &shl ("edi",24); # hi<<24
462 &xor ("ebx","esi");
463
464 &shr ("ecx",7-1); # lo>>7
465 &xor ("eax","edi");
466 &shr ("edx",7-1); # hi>>7
467 &xor ("eax","ecx");
468 &shl ("esi",31-24); # lo<<31
469 &xor ("ebx","edx");
470 &shl ("edi",25-24); # hi<<25
471 &xor ("ebx","esi");
472
473 &shr ("ecx",8-7); # lo>>8
474 &xor ("eax","edi");
475 &shr ("edx",8-7); # hi>>8
476 &xor ("eax","ecx");
477 &shl ("edi",31-25); # hi<<31
478 &xor ("ebx","edx");
479 &xor ("eax","edi"); # T1 = sigma0(X[-15])
480
481 &mov (&DWP(0,"esp"),"eax");
482 &mov (&DWP(4,"esp"),"ebx"); # put T1 away
483
484 #define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
485 # LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
486 # HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
487 &mov ("ecx",&DWP(8*(9+15+16-14)+0,"esp"));
488 &mov ("edx",&DWP(8*(9+15+16-14)+4,"esp"));
489 &mov ("esi","ecx");
490
491 &shr ("ecx",6) # lo>>6
492 &mov ("edi","edx");
493 &shr ("edx",6) # hi>>6
494 &mov ("eax","ecx");
495 &shl ("esi",3); # lo<<3
496 &mov ("ebx","edx");
497 &shl ("edi",3); # hi<<3
498 &xor ("eax","esi");
499
500 &shr ("ecx",19-6); # lo>>19
501 &xor ("ebx","edi");
502 &shr ("edx",19-6); # hi>>19
503 &xor ("eax","ecx");
504 &shl ("esi",13-3); # lo<<13
505 &xor ("ebx","edx");
506 &shl ("edi",13-3); # hi<<13
507 &xor ("ebx","esi");
508
509 &shr ("ecx",29-19); # lo>>29
510 &xor ("eax","edi");
511 &shr ("edx",29-19); # hi>>29
512 &xor ("ebx","ecx");
513 &shl ("edi",26-13); # hi<<26
514 &xor ("eax","edx");
515 &xor ("eax","edi"); # sigma1(X[-2])
516
517 &mov ("ecx",&DWP(8*(9+15+16)+0,"esp"));
518 &mov ("edx",&DWP(8*(9+15+16)+4,"esp"));
519 &add ("eax",&DWP(0,"esp"));
520 &adc ("ebx",&DWP(4,"esp")); # T1 = sigma1(X[-2])+T1
521 &mov ("esi",&DWP(8*(9+15+16-9)+0,"esp"));
522 &mov ("edi",&DWP(8*(9+15+16-9)+4,"esp"));
523 &add ("eax","ecx");
524 &adc ("ebx","edx"); # T1 += X[-16]
525 &add ("eax","esi");
526 &adc ("ebx","edi"); # T1 += X[-7]
527 &mov (&DWP(8*(9+15)+0,"esp"),"eax");
528 &mov (&DWP(8*(9+15)+4,"esp"),"ebx"); # save X[0]
529
530 &BODY_00_15_x86();
531
532 &cmp (&LB("edx"),0x17);
533 &jne (&label("16_79_x86"));
534
535 &mov ("esi",&DWP(8*(9+16+80)+0,"esp"));# ctx
536 &mov ("edi",&DWP(8*(9+16+80)+4,"esp"));# inp
537 for($i=0;$i<4;$i++) {
538 &mov ("eax",&DWP($i*16+0,"esi"));
539 &mov ("ebx",&DWP($i*16+4,"esi"));
540 &mov ("ecx",&DWP($i*16+8,"esi"));
541 &mov ("edx",&DWP($i*16+12,"esi"));
542 &add ("eax",&DWP(8+($i*16)+0,"esp"));
543 &adc ("ebx",&DWP(8+($i*16)+4,"esp"));
544 &mov (&DWP($i*16+0,"esi"),"eax");
545 &mov (&DWP($i*16+4,"esi"),"ebx");
546 &add ("ecx",&DWP(8+($i*16)+8,"esp"));
547 &adc ("edx",&DWP(8+($i*16)+12,"esp"));
548 &mov (&DWP($i*16+8,"esi"),"ecx");
549 &mov (&DWP($i*16+12,"esi"),"edx");
550 }
551 &add ("esp",8*(9+16+80)); # destroy frame
552 &sub ($K512,8*80); # rewind K
553
554 &cmp ("edi",&DWP(8,"esp")); # are we done yet?
555 &jb (&label("loop_x86"));
556
557 &mov ("esp",&DWP(12,"esp")); # restore sp
558&function_end_A();
559
560&set_label("K512",64); # Yes! I keep it in the code segment!
561 &data_word(0xd728ae22,0x428a2f98); # u64
562 &data_word(0x23ef65cd,0x71374491); # u64
563 &data_word(0xec4d3b2f,0xb5c0fbcf); # u64
564 &data_word(0x8189dbbc,0xe9b5dba5); # u64
565 &data_word(0xf348b538,0x3956c25b); # u64
566 &data_word(0xb605d019,0x59f111f1); # u64
567 &data_word(0xaf194f9b,0x923f82a4); # u64
568 &data_word(0xda6d8118,0xab1c5ed5); # u64
569 &data_word(0xa3030242,0xd807aa98); # u64
570 &data_word(0x45706fbe,0x12835b01); # u64
571 &data_word(0x4ee4b28c,0x243185be); # u64
572 &data_word(0xd5ffb4e2,0x550c7dc3); # u64
573 &data_word(0xf27b896f,0x72be5d74); # u64
574 &data_word(0x3b1696b1,0x80deb1fe); # u64
575 &data_word(0x25c71235,0x9bdc06a7); # u64
576 &data_word(0xcf692694,0xc19bf174); # u64
577 &data_word(0x9ef14ad2,0xe49b69c1); # u64
578 &data_word(0x384f25e3,0xefbe4786); # u64
579 &data_word(0x8b8cd5b5,0x0fc19dc6); # u64
580 &data_word(0x77ac9c65,0x240ca1cc); # u64
581 &data_word(0x592b0275,0x2de92c6f); # u64
582 &data_word(0x6ea6e483,0x4a7484aa); # u64
583 &data_word(0xbd41fbd4,0x5cb0a9dc); # u64
584 &data_word(0x831153b5,0x76f988da); # u64
585 &data_word(0xee66dfab,0x983e5152); # u64
586 &data_word(0x2db43210,0xa831c66d); # u64
587 &data_word(0x98fb213f,0xb00327c8); # u64
588 &data_word(0xbeef0ee4,0xbf597fc7); # u64
589 &data_word(0x3da88fc2,0xc6e00bf3); # u64
590 &data_word(0x930aa725,0xd5a79147); # u64
591 &data_word(0xe003826f,0x06ca6351); # u64
592 &data_word(0x0a0e6e70,0x14292967); # u64
593 &data_word(0x46d22ffc,0x27b70a85); # u64
594 &data_word(0x5c26c926,0x2e1b2138); # u64
595 &data_word(0x5ac42aed,0x4d2c6dfc); # u64
596 &data_word(0x9d95b3df,0x53380d13); # u64
597 &data_word(0x8baf63de,0x650a7354); # u64
598 &data_word(0x3c77b2a8,0x766a0abb); # u64
599 &data_word(0x47edaee6,0x81c2c92e); # u64
600 &data_word(0x1482353b,0x92722c85); # u64
601 &data_word(0x4cf10364,0xa2bfe8a1); # u64
602 &data_word(0xbc423001,0xa81a664b); # u64
603 &data_word(0xd0f89791,0xc24b8b70); # u64
604 &data_word(0x0654be30,0xc76c51a3); # u64
605 &data_word(0xd6ef5218,0xd192e819); # u64
606 &data_word(0x5565a910,0xd6990624); # u64
607 &data_word(0x5771202a,0xf40e3585); # u64
608 &data_word(0x32bbd1b8,0x106aa070); # u64
609 &data_word(0xb8d2d0c8,0x19a4c116); # u64
610 &data_word(0x5141ab53,0x1e376c08); # u64
611 &data_word(0xdf8eeb99,0x2748774c); # u64
612 &data_word(0xe19b48a8,0x34b0bcb5); # u64
613 &data_word(0xc5c95a63,0x391c0cb3); # u64
614 &data_word(0xe3418acb,0x4ed8aa4a); # u64
615 &data_word(0x7763e373,0x5b9cca4f); # u64
616 &data_word(0xd6b2b8a3,0x682e6ff3); # u64
617 &data_word(0x5defb2fc,0x748f82ee); # u64
618 &data_word(0x43172f60,0x78a5636f); # u64
619 &data_word(0xa1f0ab72,0x84c87814); # u64
620 &data_word(0x1a6439ec,0x8cc70208); # u64
621 &data_word(0x23631e28,0x90befffa); # u64
622 &data_word(0xde82bde9,0xa4506ceb); # u64
623 &data_word(0xb2c67915,0xbef9a3f7); # u64
624 &data_word(0xe372532b,0xc67178f2); # u64
625 &data_word(0xea26619c,0xca273ece); # u64
626 &data_word(0x21c0c207,0xd186b8c7); # u64
627 &data_word(0xcde0eb1e,0xeada7dd6); # u64
628 &data_word(0xee6ed178,0xf57d4f7f); # u64
629 &data_word(0x72176fba,0x06f067aa); # u64
630 &data_word(0xa2c898a6,0x0a637dc5); # u64
631 &data_word(0xbef90dae,0x113f9804); # u64
632 &data_word(0x131c471b,0x1b710b35); # u64
633 &data_word(0x23047d84,0x28db77f5); # u64
634 &data_word(0x40c72493,0x32caab7b); # u64
635 &data_word(0x15c9bebc,0x3c9ebe0a); # u64
636 &data_word(0x9c100d4c,0x431d67c4); # u64
637 &data_word(0xcb3e42b6,0x4cc5d4be); # u64
638 &data_word(0xfc657e2a,0x597f299c); # u64
639 &data_word(0x3ad6faec,0x5fcb6fab); # u64
640 &data_word(0x4a475817,0x6c44198c); # u64
641&function_end_B("sha512_block_data_order");
642&asciz("SHA512 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
643
644&asm_finish();
diff --git a/src/lib/libcrypto/sha/asm/sha512-armv4.pl b/src/lib/libcrypto/sha/asm/sha512-armv4.pl
deleted file mode 100644
index 7faf37b147..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-armv4.pl
+++ /dev/null
@@ -1,582 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA512 block procedure for ARMv4. September 2007.
11
12# This code is ~4.5 (four and a half) times faster than code generated
13# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
14# Xscale PXA250 core].
15#
16# July 2010.
17#
18# Rescheduling for dual-issue pipeline resulted in 6% improvement on
19# Cortex A8 core and ~40 cycles per processed byte.
20
21# February 2011.
22#
23# Profiler-assisted and platform-specific optimization resulted in 7%
24# improvement on Cortex A8 core and ~38 cycles per byte.
25
26# March 2011.
27#
28# Add NEON implementation. On Cortex A8 it was measured to process
29# one byte in 25.5 cycles or 47% faster than integer-only code.
30
31# Byte order [in]dependence. =========================================
32#
33# Originally caller was expected to maintain specific *dword* order in
34# h[0-7], namely with most significant dword at *lower* address, which
35# was reflected in the two parameters below as 0 and 4. Now caller is
36# expected to maintain native byte order for whole 64-bit values.
37$hi="HI";
38$lo="LO";
39# ====================================================================
40
41while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
42open STDOUT,">$output";
43
44$ctx="r0"; # parameter block
45$inp="r1";
46$len="r2";
47
48$Tlo="r3";
49$Thi="r4";
50$Alo="r5";
51$Ahi="r6";
52$Elo="r7";
53$Ehi="r8";
54$t0="r9";
55$t1="r10";
56$t2="r11";
57$t3="r12";
58############ r13 is stack pointer
59$Ktbl="r14";
60############ r15 is program counter
61
62$Aoff=8*0;
63$Boff=8*1;
64$Coff=8*2;
65$Doff=8*3;
66$Eoff=8*4;
67$Foff=8*5;
68$Goff=8*6;
69$Hoff=8*7;
70$Xoff=8*8;
71
72sub BODY_00_15() {
73my $magic = shift;
74$code.=<<___;
75 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
76 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
77 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
78 mov $t0,$Elo,lsr#14
79 str $Tlo,[sp,#$Xoff+0]
80 mov $t1,$Ehi,lsr#14
81 str $Thi,[sp,#$Xoff+4]
82 eor $t0,$t0,$Ehi,lsl#18
83 ldr $t2,[sp,#$Hoff+0] @ h.lo
84 eor $t1,$t1,$Elo,lsl#18
85 ldr $t3,[sp,#$Hoff+4] @ h.hi
86 eor $t0,$t0,$Elo,lsr#18
87 eor $t1,$t1,$Ehi,lsr#18
88 eor $t0,$t0,$Ehi,lsl#14
89 eor $t1,$t1,$Elo,lsl#14
90 eor $t0,$t0,$Ehi,lsr#9
91 eor $t1,$t1,$Elo,lsr#9
92 eor $t0,$t0,$Elo,lsl#23
93 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
94 adds $Tlo,$Tlo,$t0
95 ldr $t0,[sp,#$Foff+0] @ f.lo
96 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
97 ldr $t1,[sp,#$Foff+4] @ f.hi
98 adds $Tlo,$Tlo,$t2
99 ldr $t2,[sp,#$Goff+0] @ g.lo
100 adc $Thi,$Thi,$t3 @ T += h
101 ldr $t3,[sp,#$Goff+4] @ g.hi
102
103 eor $t0,$t0,$t2
104 str $Elo,[sp,#$Eoff+0]
105 eor $t1,$t1,$t3
106 str $Ehi,[sp,#$Eoff+4]
107 and $t0,$t0,$Elo
108 str $Alo,[sp,#$Aoff+0]
109 and $t1,$t1,$Ehi
110 str $Ahi,[sp,#$Aoff+4]
111 eor $t0,$t0,$t2
112 ldr $t2,[$Ktbl,#$lo] @ K[i].lo
113 eor $t1,$t1,$t3 @ Ch(e,f,g)
114 ldr $t3,[$Ktbl,#$hi] @ K[i].hi
115
116 adds $Tlo,$Tlo,$t0
117 ldr $Elo,[sp,#$Doff+0] @ d.lo
118 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
119 ldr $Ehi,[sp,#$Doff+4] @ d.hi
120 adds $Tlo,$Tlo,$t2
121 and $t0,$t2,#0xff
122 adc $Thi,$Thi,$t3 @ T += K[i]
123 adds $Elo,$Elo,$Tlo
124 ldr $t2,[sp,#$Boff+0] @ b.lo
125 adc $Ehi,$Ehi,$Thi @ d += T
126 teq $t0,#$magic
127
128 ldr $t3,[sp,#$Coff+0] @ c.lo
129 orreq $Ktbl,$Ktbl,#1
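	@ the magic byte marks the last K512 entry of the current phase;
	@ K512 is 8-byte aligned, so bit 0 of $Ktbl is free and doubles as
	@ the loop-exit flag which the tst/bic pairs below test and clear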
130 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
131 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
132 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
133 mov $t0,$Alo,lsr#28
134 mov $t1,$Ahi,lsr#28
135 eor $t0,$t0,$Ahi,lsl#4
136 eor $t1,$t1,$Alo,lsl#4
137 eor $t0,$t0,$Ahi,lsr#2
138 eor $t1,$t1,$Alo,lsr#2
139 eor $t0,$t0,$Alo,lsl#30
140 eor $t1,$t1,$Ahi,lsl#30
141 eor $t0,$t0,$Ahi,lsr#7
142 eor $t1,$t1,$Alo,lsr#7
143 eor $t0,$t0,$Alo,lsl#25
144 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
145 adds $Tlo,$Tlo,$t0
146 and $t0,$Alo,$t2
147 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
148
149 ldr $t1,[sp,#$Boff+4] @ b.hi
150 orr $Alo,$Alo,$t2
151 ldr $t2,[sp,#$Coff+4] @ c.hi
152 and $Alo,$Alo,$t3
153 and $t3,$Ahi,$t1
154 orr $Ahi,$Ahi,$t1
155 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
156 and $Ahi,$Ahi,$t2
157 adds $Alo,$Alo,$Tlo
158 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
159 sub sp,sp,#8
160 adc $Ahi,$Ahi,$Thi @ h += T
161 tst $Ktbl,#1
162 add $Ktbl,$Ktbl,#8
163___
164}
165$code=<<___;
166#include "arm_arch.h"
167#ifdef __ARMEL__
168# define LO 0
169# define HI 4
170# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
171#else
172# define HI 0
173# define LO 4
174# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
175#endif
176
177.text
178.code 32
179.type K512,%object
180.align 5
181K512:
182WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
183WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
184WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
185WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
186WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
187WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
188WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
189WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
190WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
191WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
192WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
193WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
194WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
195WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
196WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
197WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
198WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
199WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
200WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
201WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
202WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
203WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
204WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
205WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
206WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
207WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
208WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
209WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
210WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
211WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
212WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
213WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
214WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
215WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
216WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
217WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
218WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
219WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
220WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
221WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
222.size K512,.-K512
223.LOPENSSL_armcap:
224.word OPENSSL_armcap_P-sha512_block_data_order
225.skip 32-4
226
227.global sha512_block_data_order
228.type sha512_block_data_order,%function
229sha512_block_data_order:
230 sub r3,pc,#8 @ sha512_block_data_order
231 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
232#if __ARM_ARCH__>=7
233 ldr r12,.LOPENSSL_armcap
234 ldr r12,[r3,r12] @ OPENSSL_armcap_P
235 tst r12,#1
236 bne .LNEON
237#endif
238 stmdb sp!,{r4-r12,lr}
239 sub $Ktbl,r3,#672 @ K512
240 sub sp,sp,#9*8
241
242 ldr $Elo,[$ctx,#$Eoff+$lo]
243 ldr $Ehi,[$ctx,#$Eoff+$hi]
244 ldr $t0, [$ctx,#$Goff+$lo]
245 ldr $t1, [$ctx,#$Goff+$hi]
246 ldr $t2, [$ctx,#$Hoff+$lo]
247 ldr $t3, [$ctx,#$Hoff+$hi]
248.Loop:
249 str $t0, [sp,#$Goff+0]
250 str $t1, [sp,#$Goff+4]
251 str $t2, [sp,#$Hoff+0]
252 str $t3, [sp,#$Hoff+4]
253 ldr $Alo,[$ctx,#$Aoff+$lo]
254 ldr $Ahi,[$ctx,#$Aoff+$hi]
255 ldr $Tlo,[$ctx,#$Boff+$lo]
256 ldr $Thi,[$ctx,#$Boff+$hi]
257 ldr $t0, [$ctx,#$Coff+$lo]
258 ldr $t1, [$ctx,#$Coff+$hi]
259 ldr $t2, [$ctx,#$Doff+$lo]
260 ldr $t3, [$ctx,#$Doff+$hi]
261 str $Tlo,[sp,#$Boff+0]
262 str $Thi,[sp,#$Boff+4]
263 str $t0, [sp,#$Coff+0]
264 str $t1, [sp,#$Coff+4]
265 str $t2, [sp,#$Doff+0]
266 str $t3, [sp,#$Doff+4]
267 ldr $Tlo,[$ctx,#$Foff+$lo]
268 ldr $Thi,[$ctx,#$Foff+$hi]
269 str $Tlo,[sp,#$Foff+0]
270 str $Thi,[sp,#$Foff+4]
271
272.L00_15:
273#if __ARM_ARCH__<7
274 ldrb $Tlo,[$inp,#7]
275 ldrb $t0, [$inp,#6]
276 ldrb $t1, [$inp,#5]
277 ldrb $t2, [$inp,#4]
278 ldrb $Thi,[$inp,#3]
279 ldrb $t3, [$inp,#2]
280 orr $Tlo,$Tlo,$t0,lsl#8
281 ldrb $t0, [$inp,#1]
282 orr $Tlo,$Tlo,$t1,lsl#16
283 ldrb $t1, [$inp],#8
284 orr $Tlo,$Tlo,$t2,lsl#24
285 orr $Thi,$Thi,$t3,lsl#8
286 orr $Thi,$Thi,$t0,lsl#16
287 orr $Thi,$Thi,$t1,lsl#24
288#else
289 ldr $Tlo,[$inp,#4]
290 ldr $Thi,[$inp],#8
291#ifdef __ARMEL__
292 rev $Tlo,$Tlo
293 rev $Thi,$Thi
294#endif
295#endif
296___
297 &BODY_00_15(0x94);
298$code.=<<___;
299 tst $Ktbl,#1
300 beq .L00_15
301 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
302 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
303 bic $Ktbl,$Ktbl,#1
304.L16_79:
305 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
306 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
307 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
308 mov $Tlo,$t0,lsr#1
309 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
310 mov $Thi,$t1,lsr#1
311 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
312 eor $Tlo,$Tlo,$t1,lsl#31
313 eor $Thi,$Thi,$t0,lsl#31
314 eor $Tlo,$Tlo,$t0,lsr#8
315 eor $Thi,$Thi,$t1,lsr#8
316 eor $Tlo,$Tlo,$t1,lsl#24
317 eor $Thi,$Thi,$t0,lsl#24
318 eor $Tlo,$Tlo,$t0,lsr#7
319 eor $Thi,$Thi,$t1,lsr#7
320 eor $Tlo,$Tlo,$t1,lsl#25
321
322 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
323 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
324 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
325 mov $t0,$t2,lsr#19
326 mov $t1,$t3,lsr#19
327 eor $t0,$t0,$t3,lsl#13
328 eor $t1,$t1,$t2,lsl#13
329 eor $t0,$t0,$t3,lsr#29
330 eor $t1,$t1,$t2,lsr#29
331 eor $t0,$t0,$t2,lsl#3
332 eor $t1,$t1,$t3,lsl#3
333 eor $t0,$t0,$t2,lsr#6
334 eor $t1,$t1,$t3,lsr#6
335 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
336 eor $t0,$t0,$t3,lsl#26
337
338 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
339 adds $Tlo,$Tlo,$t0
340 ldr $t0,[sp,#`$Xoff+8*16`+0]
341 adc $Thi,$Thi,$t1
342
343 ldr $t1,[sp,#`$Xoff+8*16`+4]
344 adds $Tlo,$Tlo,$t2
345 adc $Thi,$Thi,$t3
346 adds $Tlo,$Tlo,$t0
347 adc $Thi,$Thi,$t1
348___
349 &BODY_00_15(0x17);
350$code.=<<___;
351 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
352 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
353 beq .L16_79
354 bic $Ktbl,$Ktbl,#1
355
356 ldr $Tlo,[sp,#$Boff+0]
357 ldr $Thi,[sp,#$Boff+4]
358 ldr $t0, [$ctx,#$Aoff+$lo]
359 ldr $t1, [$ctx,#$Aoff+$hi]
360 ldr $t2, [$ctx,#$Boff+$lo]
361 ldr $t3, [$ctx,#$Boff+$hi]
362 adds $t0,$Alo,$t0
363 str $t0, [$ctx,#$Aoff+$lo]
364 adc $t1,$Ahi,$t1
365 str $t1, [$ctx,#$Aoff+$hi]
366 adds $t2,$Tlo,$t2
367 str $t2, [$ctx,#$Boff+$lo]
368 adc $t3,$Thi,$t3
369 str $t3, [$ctx,#$Boff+$hi]
370
371 ldr $Alo,[sp,#$Coff+0]
372 ldr $Ahi,[sp,#$Coff+4]
373 ldr $Tlo,[sp,#$Doff+0]
374 ldr $Thi,[sp,#$Doff+4]
375 ldr $t0, [$ctx,#$Coff+$lo]
376 ldr $t1, [$ctx,#$Coff+$hi]
377 ldr $t2, [$ctx,#$Doff+$lo]
378 ldr $t3, [$ctx,#$Doff+$hi]
379 adds $t0,$Alo,$t0
380 str $t0, [$ctx,#$Coff+$lo]
381 adc $t1,$Ahi,$t1
382 str $t1, [$ctx,#$Coff+$hi]
383 adds $t2,$Tlo,$t2
384 str $t2, [$ctx,#$Doff+$lo]
385 adc $t3,$Thi,$t3
386 str $t3, [$ctx,#$Doff+$hi]
387
388 ldr $Tlo,[sp,#$Foff+0]
389 ldr $Thi,[sp,#$Foff+4]
390 ldr $t0, [$ctx,#$Eoff+$lo]
391 ldr $t1, [$ctx,#$Eoff+$hi]
392 ldr $t2, [$ctx,#$Foff+$lo]
393 ldr $t3, [$ctx,#$Foff+$hi]
394 adds $Elo,$Elo,$t0
395 str $Elo,[$ctx,#$Eoff+$lo]
396 adc $Ehi,$Ehi,$t1
397 str $Ehi,[$ctx,#$Eoff+$hi]
398 adds $t2,$Tlo,$t2
399 str $t2, [$ctx,#$Foff+$lo]
400 adc $t3,$Thi,$t3
401 str $t3, [$ctx,#$Foff+$hi]
402
403 ldr $Alo,[sp,#$Goff+0]
404 ldr $Ahi,[sp,#$Goff+4]
405 ldr $Tlo,[sp,#$Hoff+0]
406 ldr $Thi,[sp,#$Hoff+4]
407 ldr $t0, [$ctx,#$Goff+$lo]
408 ldr $t1, [$ctx,#$Goff+$hi]
409 ldr $t2, [$ctx,#$Hoff+$lo]
410 ldr $t3, [$ctx,#$Hoff+$hi]
411 adds $t0,$Alo,$t0
412 str $t0, [$ctx,#$Goff+$lo]
413 adc $t1,$Ahi,$t1
414 str $t1, [$ctx,#$Goff+$hi]
415 adds $t2,$Tlo,$t2
416 str $t2, [$ctx,#$Hoff+$lo]
417 adc $t3,$Thi,$t3
418 str $t3, [$ctx,#$Hoff+$hi]
419
420 add sp,sp,#640
421 sub $Ktbl,$Ktbl,#640
422
423 teq $inp,$len
424 bne .Loop
425
426 add sp,sp,#8*9 @ destroy frame
427#if __ARM_ARCH__>=5
428 ldmia sp!,{r4-r12,pc}
429#else
430 ldmia sp!,{r4-r12,lr}
431 tst lr,#1
432 moveq pc,lr @ be binary compatible with V4, yet
433 bx lr @ interoperable with Thumb ISA:-)
434#endif
435___
436
437{
438my @Sigma0=(28,34,39);
439my @Sigma1=(14,18,41);
440my @sigma0=(1, 8, 7);
441my @sigma1=(19,61,6);
442
443my $Ktbl="r3";
444my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
445
446my @X=map("d$_",(0..15));
447my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
448
449sub NEON_00_15() {
450my $i=shift;
451my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
452my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
453
454$code.=<<___ if ($i<16 || $i&1);
455 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
456#if $i<16
457 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
458#endif
459 vshr.u64 $t1,$e,#@Sigma1[1]
460 vshr.u64 $t2,$e,#@Sigma1[2]
461___
462$code.=<<___;
463 vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
464 vsli.64 $t0,$e,#`64-@Sigma1[0]`
465 vsli.64 $t1,$e,#`64-@Sigma1[1]`
466 vsli.64 $t2,$e,#`64-@Sigma1[2]`
467#if $i<16 && defined(__ARMEL__)
468 vrev64.8 @X[$i],@X[$i]
469#endif
470 vadd.i64 $T1,$K,$h
471 veor $Ch,$f,$g
472 veor $t0,$t1
473 vand $Ch,$e
474 veor $t0,$t2 @ Sigma1(e)
475 veor $Ch,$g @ Ch(e,f,g)
476 vadd.i64 $T1,$t0
477 vshr.u64 $t0,$a,#@Sigma0[0]
478 vadd.i64 $T1,$Ch
479 vshr.u64 $t1,$a,#@Sigma0[1]
480 vshr.u64 $t2,$a,#@Sigma0[2]
481 vsli.64 $t0,$a,#`64-@Sigma0[0]`
482 vsli.64 $t1,$a,#`64-@Sigma0[1]`
483 vsli.64 $t2,$a,#`64-@Sigma0[2]`
484 vadd.i64 $T1,@X[$i%16]
485 vorr $Maj,$a,$c
486 vand $Ch,$a,$c
487 veor $h,$t0,$t1
488 vand $Maj,$b
489 veor $h,$t2 @ Sigma0(a)
490 vorr $Maj,$Ch @ Maj(a,b,c)
491 vadd.i64 $h,$T1
492 vadd.i64 $d,$T1
493 vadd.i64 $h,$Maj
494___
495}
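# Each vshr.u64/vsli.64 pair above composes a 64-bit rotate,
# t = (e>>n)|(e<<(64-n)), without a separate orr instruction.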
496
497sub NEON_16_79() {
498my $i=shift;
499
500if ($i&1) { &NEON_00_15($i,@_); return; }
501
502# 2x-vectorized, therefore runs every 2nd round
503my @X=map("q$_",(0..7)); # view @X as 128-bit vector
504my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
505my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
506my $e=@_[4]; # $e from NEON_00_15
507$i /= 2;
508$code.=<<___;
509 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
510 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
511 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
512 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
513 vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
514 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
515 veor $s1,$t0
516 vshr.u64 $t0,$s0,#@sigma0[0]
517 veor $s1,$t1 @ sigma1(X[i+14])
518 vshr.u64 $t1,$s0,#@sigma0[1]
519 vadd.i64 @X[$i%8],$s1
520 vshr.u64 $s1,$s0,#@sigma0[2]
521 vsli.64 $t0,$s0,#`64-@sigma0[0]`
522 vsli.64 $t1,$s0,#`64-@sigma0[1]`
523 vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
524 veor $s1,$t0
525 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
526 vadd.i64 @X[$i%8],$s0
527 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
528 veor $s1,$t1 @ sigma0(X[i+1])
529 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
530 vadd.i64 @X[$i%8],$s1
531___
532 &NEON_00_15(2*$i,@_);
533}
534
535$code.=<<___;
536#if __ARM_ARCH__>=7
537.fpu neon
538
539.align 4
540.LNEON:
541 dmb @ errata #451034 on early Cortex A8
542 vstmdb sp!,{d8-d15} @ ABI specification says so
543 sub $Ktbl,r3,#672 @ K512
544 vldmia $ctx,{$A-$H} @ load context
545.Loop_neon:
546___
547for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
548$code.=<<___;
549 mov $cnt,#4
550.L16_79_neon:
551 subs $cnt,#1
552___
553for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
554$code.=<<___;
555 bne .L16_79_neon
556
557 vldmia $ctx,{d24-d31} @ load context to temp
558 vadd.i64 q8,q12 @ vectorized accumulate
559 vadd.i64 q9,q13
560 vadd.i64 q10,q14
561 vadd.i64 q11,q15
562 vstmia $ctx,{$A-$H} @ save context
563 teq $inp,$len
564 sub $Ktbl,#640 @ rewind K512
565 bne .Loop_neon
566
567 vldmia sp!,{d8-d15} @ epilogue
568 bx lr
569#endif
570___
571}
572$code.=<<___;
573.size sha512_block_data_order,.-sha512_block_data_order
574.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
575.align 2
576.comm OPENSSL_armcap_P,4,4
577___
578
579$code =~ s/\`([^\`]*)\`/eval $1/gem;
580$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
581print $code;
582close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/sha/asm/sha512-ia64.pl b/src/lib/libcrypto/sha/asm/sha512-ia64.pl
deleted file mode 100755
index 1c6ce56522..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-ia64.pl
+++ /dev/null
@@ -1,672 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# SHA256/512_Transform for Itanium.
11#
12# sha512_block runs in 1003 cycles on Itanium 2, which is almost 50%
13# faster than gcc and >60%(!) faster than code generated by HP-UX
14# compiler (yes, HP-UX is generating slower code, because unlike gcc,
15# it failed to deploy "shift right pair," 'shrp' instruction, which
16# substitutes for 64-bit rotate).
17#
18# 924 cycles long sha256_block outperforms gcc by over a factor of 2(!)
19# and HP-UX compiler - by >40% (yes, gcc won sha512_block, but lost
20# this one big time). Note that "formally" 924 is about 100 cycles
21# too much. I mean it's 64 32-bit rounds vs. 80 virtually identical
22# 64-bit ones and 1003*64/80 gives 802. Extra cycles, 2 per round,
23# are spent on extra work to provide for 32-bit rotations. 32-bit
24# rotations are still handled by 'shrp' instruction and for this
25# reason lower 32 bits are deposited to upper half of 64-bit register
26# prior to 'shrp' issue. And in order to minimize the number of such
27# operations, X[16] values are *maintained* with copies of lower
28# halves in upper halves, which is why you'll spot such instructions
29# as custom 'mux2', "parallel 32-bit add," 'padd4' and "parallel
30# 32-bit unsigned right shift," 'pshr4.u' instructions here.
31#
32# Rules of engagement.
33#
34# There is only one integer shifter meaning that if I have two rotate,
35# deposit or extract instructions in adjacent bundles, they shall
36# split [at run-time if they have to]. But note that variable and
37# parallel shifts are performed by multi-media ALU and *are* pairable
38# with rotates [and alike]. On the backside MMALU is rather slow: it
39# takes 2 extra cycles before the result of integer operation is
40# available *to* MMALU and 2(*) extra cycles before the result of MM
41# operation is available "back" *to* integer ALU, not to mention that
42# MMALU itself has 2 cycles latency. However! I explicitly scheduled
43# these MM instructions to avoid MM stalls, so that all these extra
44# latencies get "hidden" in instruction-level parallelism.
45#
46# (*) 2 cycles on Itanium 1 and 1 cycle on Itanium 2. But I schedule
47# for 2 in order to provide for best *overall* performance,
48# because on Itanium 1 stall on MM result is accompanied by
49# pipeline flush, which takes 6 cycles:-(
50#
51# Resulting performance numbers for 900MHz Itanium 2 system:
52#
53# The 'numbers' are in 1000s of bytes per second processed.
54# type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes
55# sha1(*) 6210.14k 20376.30k 52447.83k 85870.05k 105478.12k
56# sha256 7476.45k 20572.05k 41538.34k 56062.29k 62093.18k
57# sha512 4996.56k 20026.28k 47597.20k 85278.79k 111501.31k
58#
59# (*) SHA1 numbers are for HP-UX compiler and are presented purely
60# for reference purposes. I bet it can be improved too...
61#
62# To generate code, pass the file name with either 256 or 512 in its
63# name and compiler flags.
64
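# The mux2/shrp trick described above, in one line: replicate a 32-bit
# value into both halves of a 64-bit register and a plain 64-bit right
# shift by n < 32 leaves ROTR32(x,n) in the low half. A stand-alone Perl
# sketch, assuming a perl built with 64-bit integers (illustrative only,
# not part of the module):
sub rotr32_via_shrp {
	my ($x, $n) = @_;
	my $x64 = (($x << 32) | $x) & 0xffffffffffffffff;	# mux2 ...,0x44
	return ($x64 >> $n) & 0xffffffff;			# what shrp computes
}
printf "%08x\n", rotr32_via_shrp(0x6a09e667, 2);	# ROTR(a,2) term of Sigma0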
65$output=shift;
66
67if ($output =~ /512.*\.(s|asm)/) {
68 $SZ=8;
69 $BITS=8*$SZ;
70 $LDW="ld8";
71 $STW="st8";
72 $ADD="add";
73 $SHRU="shr.u";
74 $TABLE="K512";
75 $func="sha512_block_data_order";
76 @Sigma0=(28,34,39);
77 @Sigma1=(14,18,41);
78 @sigma0=(1, 8, 7);
79 @sigma1=(19,61, 6);
80 $rounds=80;
81} elsif ($output =~ /256.*\.(s|asm)/) {
82 $SZ=4;
83 $BITS=8*$SZ;
84 $LDW="ld4";
85 $STW="st4";
86 $ADD="padd4";
87 $SHRU="pshr4.u";
88 $TABLE="K256";
89 $func="sha256_block_data_order";
90 @Sigma0=( 2,13,22);
91 @Sigma1=( 6,11,25);
92 @sigma0=( 7,18, 3);
93 @sigma1=(17,19,10);
94 $rounds=64;
95} else { die "nonsense $output"; }
96
97open STDOUT,">$output" or die "can't open $output: $!";
98
99if ($^O eq "hpux") {
100 $ADDP="addp4";
101	for (@ARGV) { $ADDP="add" if (/(\+DD|\-mlp)64/); }
102} else { $ADDP="add"; }
103for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
104 $big_endian=0 if (/\-DL_ENDIAN/); }
105if (!defined($big_endian))
106 { $big_endian=(unpack('L',pack('N',1))==1); }
107
108$code=<<___;
109.ident \"$output, version 1.1\"
110.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
111.explicit
112.text
113
114pfssave=r2;
115lcsave=r3;
116prsave=r14;
117K=r15;
118A=r16; B=r17; C=r18; D=r19;
119E=r20; F=r21; G=r22; H=r23;
120T1=r24; T2=r25;
121s0=r26; s1=r27; t0=r28; t1=r29;
122Ktbl=r30;
123ctx=r31; // 1st arg
124input=r48; // 2nd arg
125num=r49; // 3rd arg
126sgm0=r50; sgm1=r51; // small constants
127A_=r54; B_=r55; C_=r56; D_=r57;
128E_=r58; F_=r59; G_=r60; H_=r61;
129
130// void $func (SHA_CTX *ctx, const void *in,size_t num[,int host])
131.global $func#
132.proc $func#
133.align 32
134$func:
135 .prologue
136 .save ar.pfs,pfssave
137{ .mmi; alloc pfssave=ar.pfs,3,27,0,16
138 $ADDP ctx=0,r32 // 1st arg
139 .save ar.lc,lcsave
140 mov lcsave=ar.lc }
141{ .mmi; $ADDP input=0,r33 // 2nd arg
142 mov num=r34 // 3rd arg
143 .save pr,prsave
144 mov prsave=pr };;
145
146 .body
147{ .mib; add r8=0*$SZ,ctx
148 add r9=1*$SZ,ctx
149 brp.loop.imp .L_first16,.L_first16_end-16 }
150{ .mib; add r10=2*$SZ,ctx
151 add r11=3*$SZ,ctx
152 brp.loop.imp .L_rest,.L_rest_end-16 };;
153
154// load A-H
155.Lpic_point:
156{ .mmi; $LDW A_=[r8],4*$SZ
157 $LDW B_=[r9],4*$SZ
158 mov Ktbl=ip }
159{ .mmi; $LDW C_=[r10],4*$SZ
160 $LDW D_=[r11],4*$SZ
161 mov sgm0=$sigma0[2] };;
162{ .mmi; $LDW E_=[r8]
163 $LDW F_=[r9]
164 add Ktbl=($TABLE#-.Lpic_point),Ktbl }
165{ .mmi; $LDW G_=[r10]
166 $LDW H_=[r11]
167 cmp.ne p0,p16=0,r0 };; // used in sha256_block
168___
169$code.=<<___ if ($BITS==64);
170{ .mii; and r8=7,input
171 and input=~7,input;;
172 cmp.eq p9,p0=1,r8 }
173{ .mmi; cmp.eq p10,p0=2,r8
174 cmp.eq p11,p0=3,r8
175 cmp.eq p12,p0=4,r8 }
176{ .mmi; cmp.eq p13,p0=5,r8
177 cmp.eq p14,p0=6,r8
178 cmp.eq p15,p0=7,r8 };;
179___
180$code.=<<___;
181.L_outer:
182.rotr X[16]
183{ .mmi; mov A=A_
184 mov B=B_
185 mov ar.lc=14 }
186{ .mmi; mov C=C_
187 mov D=D_
188 mov E=E_ }
189{ .mmi; mov F=F_
190 mov G=G_
191 mov ar.ec=2 }
192{ .mmi; ld1 X[15]=[input],$SZ // eliminated in 64-bit
193 mov H=H_
194 mov sgm1=$sigma1[2] };;
195
196___
197$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
198.align 32
199.L_first16:
200{ .mmi; add r9=1-$SZ,input
201 add r10=2-$SZ,input
202 add r11=3-$SZ,input };;
203{ .mmi; ld1 r9=[r9]
204 ld1 r10=[r10]
205 dep.z $t1=E,32,32 }
206{ .mmi; $LDW K=[Ktbl],$SZ
207 ld1 r11=[r11]
208 zxt4 E=E };;
209{ .mii; or $t1=$t1,E
210 dep X[15]=X[15],r9,8,8
211 dep r11=r10,r11,8,8 };;
212{ .mmi; and T1=F,E
213 and T2=A,B
214 dep X[15]=X[15],r11,16,16 }
215{ .mmi; andcm r8=G,E
216 and r9=A,C
217 mux2 $t0=A,0x44 };; // copy lower half to upper
218{ .mmi; (p16) ld1 X[15-1]=[input],$SZ // prefetch
219 xor T1=T1,r8 // T1=((e & f) ^ (~e & g))
220 _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14)
221{ .mib; and r10=B,C
222 xor T2=T2,r9 };;
223___
224$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
225// in 64-bit mode I load whole X[16] at once and take care of alignment...
226{ .mmi; add r8=1*$SZ,input
227 add r9=2*$SZ,input
228 add r10=3*$SZ,input };;
229{ .mmb; $LDW X[15]=[input],4*$SZ
230 $LDW X[14]=[r8],4*$SZ
231(p9) br.cond.dpnt.many .L1byte };;
232{ .mmb; $LDW X[13]=[r9],4*$SZ
233 $LDW X[12]=[r10],4*$SZ
234(p10) br.cond.dpnt.many .L2byte };;
235{ .mmb; $LDW X[11]=[input],4*$SZ
236 $LDW X[10]=[r8],4*$SZ
237(p11) br.cond.dpnt.many .L3byte };;
238{ .mmb; $LDW X[ 9]=[r9],4*$SZ
239 $LDW X[ 8]=[r10],4*$SZ
240(p12) br.cond.dpnt.many .L4byte };;
241{ .mmb; $LDW X[ 7]=[input],4*$SZ
242 $LDW X[ 6]=[r8],4*$SZ
243(p13) br.cond.dpnt.many .L5byte };;
244{ .mmb; $LDW X[ 5]=[r9],4*$SZ
245 $LDW X[ 4]=[r10],4*$SZ
246(p14) br.cond.dpnt.many .L6byte };;
247{ .mmb; $LDW X[ 3]=[input],4*$SZ
248 $LDW X[ 2]=[r8],4*$SZ
249(p15) br.cond.dpnt.many .L7byte };;
250{ .mmb; $LDW X[ 1]=[r9],4*$SZ
251 $LDW X[ 0]=[r10],4*$SZ
252 br.many .L_first16 };;
253.L1byte:
254{ .mmi; $LDW X[13]=[r9],4*$SZ
255 $LDW X[12]=[r10],4*$SZ
256 shrp X[15]=X[15],X[14],56 };;
257{ .mmi; $LDW X[11]=[input],4*$SZ
258 $LDW X[10]=[r8],4*$SZ
259 shrp X[14]=X[14],X[13],56 }
260{ .mmi; $LDW X[ 9]=[r9],4*$SZ
261 $LDW X[ 8]=[r10],4*$SZ
262 shrp X[13]=X[13],X[12],56 };;
263{ .mmi; $LDW X[ 7]=[input],4*$SZ
264 $LDW X[ 6]=[r8],4*$SZ
265 shrp X[12]=X[12],X[11],56 }
266{ .mmi; $LDW X[ 5]=[r9],4*$SZ
267 $LDW X[ 4]=[r10],4*$SZ
268 shrp X[11]=X[11],X[10],56 };;
269{ .mmi; $LDW X[ 3]=[input],4*$SZ
270 $LDW X[ 2]=[r8],4*$SZ
271 shrp X[10]=X[10],X[ 9],56 }
272{ .mmi; $LDW X[ 1]=[r9],4*$SZ
273 $LDW X[ 0]=[r10],4*$SZ
274 shrp X[ 9]=X[ 9],X[ 8],56 };;
275{ .mii; $LDW T1=[input]
276 shrp X[ 8]=X[ 8],X[ 7],56
277 shrp X[ 7]=X[ 7],X[ 6],56 }
278{ .mii; shrp X[ 6]=X[ 6],X[ 5],56
279 shrp X[ 5]=X[ 5],X[ 4],56 };;
280{ .mii; shrp X[ 4]=X[ 4],X[ 3],56
281 shrp X[ 3]=X[ 3],X[ 2],56 }
282{ .mii; shrp X[ 2]=X[ 2],X[ 1],56
283 shrp X[ 1]=X[ 1],X[ 0],56 }
284{ .mib; shrp X[ 0]=X[ 0],T1,56
285 br.many .L_first16 };;
286.L2byte:
287{ .mmi; $LDW X[11]=[input],4*$SZ
288 $LDW X[10]=[r8],4*$SZ
289 shrp X[15]=X[15],X[14],48 }
290{ .mmi; $LDW X[ 9]=[r9],4*$SZ
291 $LDW X[ 8]=[r10],4*$SZ
292 shrp X[14]=X[14],X[13],48 };;
293{ .mmi; $LDW X[ 7]=[input],4*$SZ
294 $LDW X[ 6]=[r8],4*$SZ
295 shrp X[13]=X[13],X[12],48 }
296{ .mmi; $LDW X[ 5]=[r9],4*$SZ
297 $LDW X[ 4]=[r10],4*$SZ
298 shrp X[12]=X[12],X[11],48 };;
299{ .mmi; $LDW X[ 3]=[input],4*$SZ
300 $LDW X[ 2]=[r8],4*$SZ
301 shrp X[11]=X[11],X[10],48 }
302{ .mmi; $LDW X[ 1]=[r9],4*$SZ
303 $LDW X[ 0]=[r10],4*$SZ
304 shrp X[10]=X[10],X[ 9],48 };;
305{ .mii; $LDW T1=[input]
306 shrp X[ 9]=X[ 9],X[ 8],48
307 shrp X[ 8]=X[ 8],X[ 7],48 }
308{ .mii; shrp X[ 7]=X[ 7],X[ 6],48
309 shrp X[ 6]=X[ 6],X[ 5],48 };;
310{ .mii; shrp X[ 5]=X[ 5],X[ 4],48
311 shrp X[ 4]=X[ 4],X[ 3],48 }
312{ .mii; shrp X[ 3]=X[ 3],X[ 2],48
313 shrp X[ 2]=X[ 2],X[ 1],48 }
314{ .mii; shrp X[ 1]=X[ 1],X[ 0],48
315 shrp X[ 0]=X[ 0],T1,48 }
316{ .mfb; br.many .L_first16 };;
317.L3byte:
318{ .mmi; $LDW X[ 9]=[r9],4*$SZ
319 $LDW X[ 8]=[r10],4*$SZ
320 shrp X[15]=X[15],X[14],40 };;
321{ .mmi; $LDW X[ 7]=[input],4*$SZ
322 $LDW X[ 6]=[r8],4*$SZ
323 shrp X[14]=X[14],X[13],40 }
324{ .mmi; $LDW X[ 5]=[r9],4*$SZ
325 $LDW X[ 4]=[r10],4*$SZ
326 shrp X[13]=X[13],X[12],40 };;
327{ .mmi; $LDW X[ 3]=[input],4*$SZ
328 $LDW X[ 2]=[r8],4*$SZ
329 shrp X[12]=X[12],X[11],40 }
330{ .mmi; $LDW X[ 1]=[r9],4*$SZ
331 $LDW X[ 0]=[r10],4*$SZ
332 shrp X[11]=X[11],X[10],40 };;
333{ .mii; $LDW T1=[input]
334 shrp X[10]=X[10],X[ 9],40
335 shrp X[ 9]=X[ 9],X[ 8],40 }
336{ .mii; shrp X[ 8]=X[ 8],X[ 7],40
337 shrp X[ 7]=X[ 7],X[ 6],40 };;
338{ .mii; shrp X[ 6]=X[ 6],X[ 5],40
339 shrp X[ 5]=X[ 5],X[ 4],40 }
340{ .mii; shrp X[ 4]=X[ 4],X[ 3],40
341 shrp X[ 3]=X[ 3],X[ 2],40 }
342{ .mii; shrp X[ 2]=X[ 2],X[ 1],40
343 shrp X[ 1]=X[ 1],X[ 0],40 }
344{ .mib; shrp X[ 0]=X[ 0],T1,40
345 br.many .L_first16 };;
346.L4byte:
347{ .mmi; $LDW X[ 7]=[input],4*$SZ
348 $LDW X[ 6]=[r8],4*$SZ
349 shrp X[15]=X[15],X[14],32 }
350{ .mmi; $LDW X[ 5]=[r9],4*$SZ
351 $LDW X[ 4]=[r10],4*$SZ
352 shrp X[14]=X[14],X[13],32 };;
353{ .mmi; $LDW X[ 3]=[input],4*$SZ
354 $LDW X[ 2]=[r8],4*$SZ
355 shrp X[13]=X[13],X[12],32 }
356{ .mmi; $LDW X[ 1]=[r9],4*$SZ
357 $LDW X[ 0]=[r10],4*$SZ
358 shrp X[12]=X[12],X[11],32 };;
359{ .mii; $LDW T1=[input]
360 shrp X[11]=X[11],X[10],32
361 shrp X[10]=X[10],X[ 9],32 }
362{ .mii; shrp X[ 9]=X[ 9],X[ 8],32
363 shrp X[ 8]=X[ 8],X[ 7],32 };;
364{ .mii; shrp X[ 7]=X[ 7],X[ 6],32
365 shrp X[ 6]=X[ 6],X[ 5],32 }
366{ .mii; shrp X[ 5]=X[ 5],X[ 4],32
367 shrp X[ 4]=X[ 4],X[ 3],32 }
368{ .mii; shrp X[ 3]=X[ 3],X[ 2],32
369 shrp X[ 2]=X[ 2],X[ 1],32 }
370{ .mii; shrp X[ 1]=X[ 1],X[ 0],32
371 shrp X[ 0]=X[ 0],T1,32 }
372{ .mfb; br.many .L_first16 };;
373.L5byte:
374{ .mmi; $LDW X[ 5]=[r9],4*$SZ
375 $LDW X[ 4]=[r10],4*$SZ
376 shrp X[15]=X[15],X[14],24 };;
377{ .mmi; $LDW X[ 3]=[input],4*$SZ
378 $LDW X[ 2]=[r8],4*$SZ
379 shrp X[14]=X[14],X[13],24 }
380{ .mmi; $LDW X[ 1]=[r9],4*$SZ
381 $LDW X[ 0]=[r10],4*$SZ
382 shrp X[13]=X[13],X[12],24 };;
383{ .mii; $LDW T1=[input]
384 shrp X[12]=X[12],X[11],24
385 shrp X[11]=X[11],X[10],24 }
386{ .mii; shrp X[10]=X[10],X[ 9],24
387 shrp X[ 9]=X[ 9],X[ 8],24 };;
388{ .mii; shrp X[ 8]=X[ 8],X[ 7],24
389 shrp X[ 7]=X[ 7],X[ 6],24 }
390{ .mii; shrp X[ 6]=X[ 6],X[ 5],24
391 shrp X[ 5]=X[ 5],X[ 4],24 }
392{ .mii; shrp X[ 4]=X[ 4],X[ 3],24
393 shrp X[ 3]=X[ 3],X[ 2],24 }
394{ .mii; shrp X[ 2]=X[ 2],X[ 1],24
395 shrp X[ 1]=X[ 1],X[ 0],24 }
396{ .mib; shrp X[ 0]=X[ 0],T1,24
397 br.many .L_first16 };;
398.L6byte:
399{ .mmi; $LDW X[ 3]=[input],4*$SZ
400 $LDW X[ 2]=[r8],4*$SZ
401 shrp X[15]=X[15],X[14],16 }
402{ .mmi; $LDW X[ 1]=[r9],4*$SZ
403 $LDW X[ 0]=[r10],4*$SZ
404 shrp X[14]=X[14],X[13],16 };;
405{ .mii; $LDW T1=[input]
406 shrp X[13]=X[13],X[12],16
407 shrp X[12]=X[12],X[11],16 }
408{ .mii; shrp X[11]=X[11],X[10],16
409 shrp X[10]=X[10],X[ 9],16 };;
410{ .mii; shrp X[ 9]=X[ 9],X[ 8],16
411 shrp X[ 8]=X[ 8],X[ 7],16 }
412{ .mii; shrp X[ 7]=X[ 7],X[ 6],16
413 shrp X[ 6]=X[ 6],X[ 5],16 }
414{ .mii; shrp X[ 5]=X[ 5],X[ 4],16
415 shrp X[ 4]=X[ 4],X[ 3],16 }
416{ .mii; shrp X[ 3]=X[ 3],X[ 2],16
417 shrp X[ 2]=X[ 2],X[ 1],16 }
418{ .mii; shrp X[ 1]=X[ 1],X[ 0],16
419 shrp X[ 0]=X[ 0],T1,16 }
420{ .mfb; br.many .L_first16 };;
421.L7byte:
422{ .mmi; $LDW X[ 1]=[r9],4*$SZ
423 $LDW X[ 0]=[r10],4*$SZ
424 shrp X[15]=X[15],X[14],8 };;
425{ .mii; $LDW T1=[input]
426 shrp X[14]=X[14],X[13],8
427 shrp X[13]=X[13],X[12],8 }
428{ .mii; shrp X[12]=X[12],X[11],8
429 shrp X[11]=X[11],X[10],8 };;
430{ .mii; shrp X[10]=X[10],X[ 9],8
431 shrp X[ 9]=X[ 9],X[ 8],8 }
432{ .mii; shrp X[ 8]=X[ 8],X[ 7],8
433 shrp X[ 7]=X[ 7],X[ 6],8 }
434{ .mii; shrp X[ 6]=X[ 6],X[ 5],8
435 shrp X[ 5]=X[ 5],X[ 4],8 }
436{ .mii; shrp X[ 4]=X[ 4],X[ 3],8
437 shrp X[ 3]=X[ 3],X[ 2],8 }
438{ .mii; shrp X[ 2]=X[ 2],X[ 1],8
439 shrp X[ 1]=X[ 1],X[ 0],8 }
440{ .mib; shrp X[ 0]=X[ 0],T1,8
441 br.many .L_first16 };;
442
443.align 32
444.L_first16:
445{ .mmi; $LDW K=[Ktbl],$SZ
446 and T1=F,E
447 and T2=A,B }
448{ .mmi; //$LDW X[15]=[input],$SZ // X[i]=*input++
449 andcm r8=G,E
450 and r9=A,C };;
451{ .mmi; xor T1=T1,r8 //T1=((e & f) ^ (~e & g))
452 and r10=B,C
453 _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14)
454{ .mmi; xor T2=T2,r9
455 mux1 X[15]=X[15],\@rev };; // eliminated in big-endian
456___
457$code.=<<___;
458{ .mib; add T1=T1,H // T1=Ch(e,f,g)+h
459 _rotr r8=$t1,$Sigma1[1] } // ROTR(e,18)
460{ .mib; xor T2=T2,r10 // T2=((a & b) ^ (a & c) ^ (b & c))
461 mov H=G };;
462{ .mib; xor r11=r8,r11
463 _rotr r9=$t1,$Sigma1[2] } // ROTR(e,41)
464{ .mib; mov G=F
465 mov F=E };;
466{ .mib; xor r9=r9,r11 // r9=Sigma1(e)
467 _rotr r10=$t0,$Sigma0[0] } // ROTR(a,28)
468{ .mib; add T1=T1,K // T1=Ch(e,f,g)+h+K512[i]
469 mov E=D };;
470{ .mib; add T1=T1,r9 // T1+=Sigma1(e)
471 _rotr r11=$t0,$Sigma0[1] } // ROTR(a,34)
472{ .mib; mov D=C
473 mov C=B };;
474{ .mib; add T1=T1,X[15] // T1+=X[i]
475 _rotr r8=$t0,$Sigma0[2] } // ROTR(a,39)
476{ .mib; xor r10=r10,r11
477 mux2 X[15]=X[15],0x44 };; // eliminated in 64-bit
478{ .mmi; xor r10=r8,r10 // r10=Sigma0(a)
479 mov B=A
480 add A=T1,T2 };;
481{ .mib; add E=E,T1
482 add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a)
483 br.ctop.sptk .L_first16 };;
484.L_first16_end:
485
486{ .mii; mov ar.lc=$rounds-17
487 mov ar.ec=1 };;
488
489.align 32
490.L_rest:
491.rotr X[16]
492{ .mib; $LDW K=[Ktbl],$SZ
493 _rotr r8=X[15-1],$sigma0[0] } // ROTR(s0,1)
494{ .mib; $ADD X[15]=X[15],X[15-9] // X[i&0xF]+=X[(i+9)&0xF]
495 $SHRU s0=X[15-1],sgm0 };; // s0=X[(i+1)&0xF]>>7
496{ .mib; and T1=F,E
497 _rotr r9=X[15-1],$sigma0[1] } // ROTR(s0,8)
498{ .mib; andcm r10=G,E
499 $SHRU s1=X[15-14],sgm1 };; // s1=X[(i+14)&0xF]>>6
500{ .mmi; xor T1=T1,r10 // T1=((e & f) ^ (~e & g))
501 xor r9=r8,r9
502 _rotr r10=X[15-14],$sigma1[0] };;// ROTR(s1,19)
503{ .mib; and T2=A,B
504 _rotr r11=X[15-14],$sigma1[1] }// ROTR(s1,61)
505{ .mib; and r8=A,C };;
506___
507$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
508// I adhere to mmi; in order to hold Itanium 1 back and avoid a 6-cycle
509// pipeline flush in the last bundle. Note that even on Itanium 2 the
510// latter stalls for one clock cycle...
511{ .mmi; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF])
512 dep.z $t1=E,32,32 }
513{ .mmi; xor r10=r11,r10
514 zxt4 E=E };;
515{ .mmi; or $t1=$t1,E
516 xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF])
517 mux2 $t0=A,0x44 };; // copy lower half to upper
518{ .mmi; xor T2=T2,r8
519 _rotr r9=$t1,$Sigma1[0] } // ROTR(e,14)
520{ .mmi; and r10=B,C
521 add T1=T1,H // T1=Ch(e,f,g)+h
522 $ADD X[15]=X[15],s0 };; // X[i&0xF]+=sigma0(X[(i+1)&0xF])
523___
524$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
525{ .mib; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF])
526 _rotr r9=$t1,$Sigma1[0] } // ROTR(e,14)
527{ .mib; xor r10=r11,r10
528 xor T2=T2,r8 };;
529{ .mib; xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF])
530 add T1=T1,H }
531{ .mib; and r10=B,C
532 $ADD X[15]=X[15],s0 };; // X[i&0xF]+=sigma0(X[(i+1)&0xF])
533___
534$code.=<<___;
535{ .mmi; xor T2=T2,r10 // T2=((a & b) ^ (a & c) ^ (b & c))
536 mov H=G
537 _rotr r8=$t1,$Sigma1[1] };; // ROTR(e,18)
538{ .mmi; xor r11=r8,r9
539 $ADD X[15]=X[15],s1 // X[i&0xF]+=sigma1(X[(i+14)&0xF])
540 _rotr r9=$t1,$Sigma1[2] } // ROTR(e,41)
541{ .mmi; mov G=F
542 mov F=E };;
543{ .mib; xor r9=r9,r11 // r9=Sigma1(e)
544 _rotr r10=$t0,$Sigma0[0] } // ROTR(a,28)
545{ .mib; add T1=T1,K // T1=Ch(e,f,g)+h+K512[i]
546 mov E=D };;
547{ .mib; add T1=T1,r9 // T1+=Sigma1(e)
548 _rotr r11=$t0,$Sigma0[1] } // ROTR(a,34)
549{ .mib; mov D=C
550 mov C=B };;
551{ .mmi; add T1=T1,X[15] // T1+=X[i]
552 xor r10=r10,r11
553 _rotr r8=$t0,$Sigma0[2] };; // ROTR(a,39)
554{ .mmi; xor r10=r8,r10 // r10=Sigma0(a)
555 mov B=A
556 add A=T1,T2 };;
557{ .mib; add E=E,T1
558 add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a)
559 br.ctop.sptk .L_rest };;
560.L_rest_end:
561
562{ .mmi; add A_=A_,A
563 add B_=B_,B
564 add C_=C_,C }
565{ .mmi; add D_=D_,D
566 add E_=E_,E
567 cmp.ltu p16,p0=1,num };;
568{ .mmi; add F_=F_,F
569 add G_=G_,G
570 add H_=H_,H }
571{ .mmb; add Ktbl=-$SZ*$rounds,Ktbl
572(p16) add num=-1,num
573(p16) br.dptk.many .L_outer };;
574
575{ .mib; add r8=0*$SZ,ctx
576 add r9=1*$SZ,ctx }
577{ .mib; add r10=2*$SZ,ctx
578 add r11=3*$SZ,ctx };;
579{ .mmi; $STW [r8]=A_,4*$SZ
580 $STW [r9]=B_,4*$SZ
581 mov ar.lc=lcsave }
582{ .mmi; $STW [r10]=C_,4*$SZ
583 $STW [r11]=D_,4*$SZ
584 mov pr=prsave,0x1ffff };;
585{ .mmb; $STW [r8]=E_
586 $STW [r9]=F_ }
587{ .mmb; $STW [r10]=G_
588 $STW [r11]=H_
589 br.ret.sptk.many b0 };;
590.endp $func#
591___
592
593$code =~ s/\`([^\`]*)\`/eval $1/gem;
594$code =~ s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm;
595if ($BITS==64) {
596 $code =~ s/mux2(\s+)\S+/nop.i$1 0x0/gm;
597 $code =~ s/mux1(\s+)\S+/nop.i$1 0x0/gm if ($big_endian);
598 $code =~ s/(shrp\s+X\[[^=]+)=([^,]+),([^,]+),([1-9]+)/$1=$3,$2,64-$4/gm
599 if (!$big_endian);
600 $code =~ s/ld1(\s+)X\[\S+/nop.m$1 0x0/gm;
601}
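The _rotr pseudo-instruction is rewritten into "shrp d=s,s,n" above because a funnel shift of a register with itself is exactly a right rotation. In C terms (a sketch, for reference only):

#include <stdint.h>

/* ROTR(x,n) == shrp x,x,n: shifting the concatenation x:x right by n
 * bits (0 < n < 64) yields the n-bit right rotation of x. */
static uint64_t rotr64(uint64_t x, unsigned n)
{
    return (x >> n) | (x << (64 - n));
}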
602
603print $code;
604
605print<<___ if ($BITS==32);
606.align 64
607.type K256#,\@object
608K256: data4 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
609 data4 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
610 data4 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
611 data4 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
612 data4 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
613 data4 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
614 data4 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
615 data4 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
616 data4 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
617 data4 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
618 data4 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
619 data4 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
620 data4 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
621 data4 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
622 data4 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
623 data4 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
624.size K256#,$SZ*$rounds
625stringz "SHA256 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
626___
627print<<___ if ($BITS==64);
628.align 64
629.type K512#,\@object
630K512: data8 0x428a2f98d728ae22,0x7137449123ef65cd
631 data8 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
632 data8 0x3956c25bf348b538,0x59f111f1b605d019
633 data8 0x923f82a4af194f9b,0xab1c5ed5da6d8118
634 data8 0xd807aa98a3030242,0x12835b0145706fbe
635 data8 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
636 data8 0x72be5d74f27b896f,0x80deb1fe3b1696b1
637 data8 0x9bdc06a725c71235,0xc19bf174cf692694
638 data8 0xe49b69c19ef14ad2,0xefbe4786384f25e3
639 data8 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
640 data8 0x2de92c6f592b0275,0x4a7484aa6ea6e483
641 data8 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
642 data8 0x983e5152ee66dfab,0xa831c66d2db43210
643 data8 0xb00327c898fb213f,0xbf597fc7beef0ee4
644 data8 0xc6e00bf33da88fc2,0xd5a79147930aa725
645 data8 0x06ca6351e003826f,0x142929670a0e6e70
646 data8 0x27b70a8546d22ffc,0x2e1b21385c26c926
647 data8 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
648 data8 0x650a73548baf63de,0x766a0abb3c77b2a8
649 data8 0x81c2c92e47edaee6,0x92722c851482353b
650 data8 0xa2bfe8a14cf10364,0xa81a664bbc423001
651 data8 0xc24b8b70d0f89791,0xc76c51a30654be30
652 data8 0xd192e819d6ef5218,0xd69906245565a910
653 data8 0xf40e35855771202a,0x106aa07032bbd1b8
654 data8 0x19a4c116b8d2d0c8,0x1e376c085141ab53
655 data8 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
656 data8 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
657 data8 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
658 data8 0x748f82ee5defb2fc,0x78a5636f43172f60
659 data8 0x84c87814a1f0ab72,0x8cc702081a6439ec
660 data8 0x90befffa23631e28,0xa4506cebde82bde9
661 data8 0xbef9a3f7b2c67915,0xc67178f2e372532b
662 data8 0xca273eceea26619c,0xd186b8c721c0c207
663 data8 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
664 data8 0x06f067aa72176fba,0x0a637dc5a2c898a6
665 data8 0x113f9804bef90dae,0x1b710b35131c471b
666 data8 0x28db77f523047d84,0x32caab7b40c72493
667 data8 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
668 data8 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
669 data8 0x5fcb6fab3ad6faec,0x6c44198c4a475817
670.size K512#,$SZ*$rounds
671stringz "SHA512 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
672___
diff --git a/src/lib/libcrypto/sha/asm/sha512-mips.pl b/src/lib/libcrypto/sha/asm/sha512-mips.pl
deleted file mode 100644
index ba5b250890..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-mips.pl
+++ /dev/null
@@ -1,455 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA2 block procedures for MIPS.
11
12# October 2010.
13#
14# SHA256 performance improvement on MIPS R5000 CPU is ~27% over gcc-
15# generated code in o32 build and ~55% in n32/64 build. SHA512 [which
16# for now can only be compiled for MIPS64 ISA] improvement is modest
17# ~17%, but it comes for free, because it's same instruction sequence.
18# Improvement coefficients are for aligned input.
19
20######################################################################
21# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
22# widely used. Then there is a new contender: NUBI. It appears that if
23# one picks the latter, it's possible to arrange code in an ABI-neutral
24# manner. Therefore let's stick to the NUBI register layout:
25#
26($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
27($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
28($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
29($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
30#
31# The return value is placed in $a0. The following coding rules facilitate
32# interoperability:
33#
34# - never ever touch $tp, "thread pointer", former $gp [o32 can be
35# excluded from the rule, because it's specified volatile];
36# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
37# old code];
38# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
39#
40# For reference here is register layout for N32/64 MIPS ABIs:
41#
42# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
43# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
44# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
45# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
46# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
47#
48$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
49
50if ($flavour =~ /64|n32/i) {
51 $PTR_ADD="dadd"; # incidentally works even on n32
52 $PTR_SUB="dsub"; # incidentally works even on n32
53 $REG_S="sd";
54 $REG_L="ld";
55 $PTR_SLL="dsll"; # incidentally works even on n32
56 $SZREG=8;
57} else {
58 $PTR_ADD="add";
59 $PTR_SUB="sub";
60 $REG_S="sw";
61 $REG_L="lw";
62 $PTR_SLL="sll";
63 $SZREG=4;
64}
65$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
66#
67# <appro@openssl.org>
68#
69######################################################################
70
71$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
72
73for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
74open STDOUT,">$output";
75
76if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); }
77
78if ($output =~ /512/) {
79 $label="512";
80 $SZ=8;
81 $LD="ld"; # load from memory
82 $ST="sd"; # store to memory
83 $SLL="dsll"; # shift left logical
84 $SRL="dsrl"; # shift right logical
85 $ADDU="daddu";
86 @Sigma0=(28,34,39);
87 @Sigma1=(14,18,41);
88 @sigma0=( 7, 1, 8); # right shift first
89 @sigma1=( 6,19,61); # right shift first
90 $lastK=0x817;
91 $rounds=80;
92} else {
93 $label="256";
94 $SZ=4;
95 $LD="lw"; # load from memory
96 $ST="sw"; # store to memory
97 $SLL="sll"; # shift left logical
98 $SRL="srl"; # shift right logical
99 $ADDU="addu";
100 @Sigma0=( 2,13,22);
101 @Sigma1=( 6,11,25);
102 @sigma0=( 3, 7,18); # right shift first
103 @sigma1=(10,17,19); # right shift first
104 $lastK=0x8f2;
105 $rounds=64;
106}
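These arrays are the FIPS 180-4 rotation and shift amounts; for the small sigmas the plain right shift is listed first because the round code consumes it first. For reference, the SHA-512 set expressed in C (an illustrative sketch, not part of the module):

#include <stdint.h>

static uint64_t rotr64(uint64_t x, unsigned n)
{
    return (x >> n) | (x << (64 - n));
}

/* SHA-512 sigma functions built from the amounts above:
 * Sigma0=(28,34,39), Sigma1=(14,18,41),
 * sigma0 = shift 7 plus rotates 1,8; sigma1 = shift 6 plus rotates 19,61. */
static uint64_t Sigma0(uint64_t x) { return rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39); }
static uint64_t Sigma1(uint64_t x) { return rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41); }
static uint64_t sigma0(uint64_t x) { return rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7); }
static uint64_t sigma1(uint64_t x) { return rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6); }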
107
108$MSB = $big_endian ? 0 : ($SZ-1);
109$LSB = ($SZ-1)&~$MSB;
110
111@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31));
112@X=map("\$$_",(8..23));
113
114$ctx=$a0;
115$inp=$a1;
116$len=$a2; $Ktbl=$len;
117
118sub BODY_00_15 {
119my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
120my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]);
121
122$code.=<<___ if ($i<15);
123 ${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp)
124 ${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp)
125___
126$code.=<<___ if (!$big_endian && $i<16 && $SZ==4);
127 srl $tmp0,@X[0],24 # byte swap($i)
128 srl $tmp1,@X[0],8
129 andi $tmp2,@X[0],0xFF00
130 sll @X[0],@X[0],24
131 andi $tmp1,0xFF00
132 sll $tmp2,$tmp2,8
133 or @X[0],$tmp0
134 or $tmp1,$tmp2
135 or @X[0],$tmp1
136___
137$code.=<<___ if (!$big_endian && $i<16 && $SZ==8);
138 ori $tmp0,$zero,0xFF
139 dsll $tmp2,$tmp0,32
140 or $tmp0,$tmp2 # 0x000000FF000000FF
141 and $tmp1,@X[0],$tmp0 # byte swap($i)
142 dsrl $tmp2,@X[0],24
143 dsll $tmp1,24
144 and $tmp2,$tmp0
145 dsll $tmp0,8 # 0x0000FF000000FF00
146 or $tmp1,$tmp2
147 and $tmp2,@X[0],$tmp0
148 dsrl @X[0],8
149 dsll $tmp2,8
150 and @X[0],$tmp0
151 or $tmp1,$tmp2
152 or @X[0],$tmp1
153 dsrl $tmp1,@X[0],32
154 dsll @X[0],32
155 or @X[0],$tmp1
156___
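On little-endian targets the two conditional blocks above byte-swap each freshly loaded word. The 32-bit sequence is equivalent to this C expression (a sketch for readability):

#include <stdint.h>

/* 32-bit byte swap matching the srl/sll/andi/or sequence above: the
 * outer bytes move with full-word shifts, the inner bytes with masked
 * 8-bit shifts. */
static uint32_t bswap32(uint32_t x)
{
    return (x << 24) | ((x & 0xff00) << 8) |
           ((x >> 8) & 0xff00) | (x >> 24);
}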
157$code.=<<___;
158 $ADDU $T1,$X[0],$h # $i
159 $SRL $h,$e,@Sigma1[0]
160 xor $tmp2,$f,$g
161 $SLL $tmp1,$e,`$SZ*8-@Sigma1[2]`
162 and $tmp2,$e
163 $SRL $tmp0,$e,@Sigma1[1]
164 xor $h,$tmp1
165 $SLL $tmp1,$e,`$SZ*8-@Sigma1[1]`
166 xor $h,$tmp0
167 $SRL $tmp0,$e,@Sigma1[2]
168 xor $h,$tmp1
169 $SLL $tmp1,$e,`$SZ*8-@Sigma1[0]`
170 xor $h,$tmp0
171 xor $tmp2,$g # Ch(e,f,g)
172 xor $tmp0,$tmp1,$h # Sigma1(e)
173
174 $SRL $h,$a,@Sigma0[0]
175 $ADDU $T1,$tmp2
176 $LD $tmp2,`$i*$SZ`($Ktbl) # K[$i]
177 $SLL $tmp1,$a,`$SZ*8-@Sigma0[2]`
178 $ADDU $T1,$tmp0
179 $SRL $tmp0,$a,@Sigma0[1]
180 xor $h,$tmp1
181 $SLL $tmp1,$a,`$SZ*8-@Sigma0[1]`
182 xor $h,$tmp0
183 $SRL $tmp0,$a,@Sigma0[2]
184 xor $h,$tmp1
185 $SLL $tmp1,$a,`$SZ*8-@Sigma0[0]`
186 xor $h,$tmp0
187 $ST @X[0],`($i%16)*$SZ`($sp) # offload to ring buffer
188 xor $h,$tmp1 # Sigma0(a)
189
190 or $tmp0,$a,$b
191 and $tmp1,$a,$b
192 and $tmp0,$c
193 or $tmp1,$tmp0 # Maj(a,b,c)
194 $ADDU $T1,$tmp2 # +=K[$i]
195 $ADDU $h,$tmp1
196
197 $ADDU $d,$T1
198 $ADDU $h,$T1
199___
200$code.=<<___ if ($i>=13);
201 $LD @X[3],`(($i+3)%16)*$SZ`($sp) # prefetch from ring buffer
202___
203}
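BODY_00_15 uses the usual operation-saving boolean forms rather than the textbook ones: Ch(e,f,g) comes out as ((f^g)&e)^g and Maj(a,b,c) as (a&b)|((a|b)&c). Both identities in C (illustrative only):

#include <stdint.h>

/* Ch selects f where e is 1 and g where e is 0; this form needs three
 * operations instead of the textbook four. */
static uint64_t Ch(uint64_t e, uint64_t f, uint64_t g)
{
    return ((f ^ g) & e) ^ g;
}

/* Majority of the three inputs, matching the or/and/and/or sequence above. */
static uint64_t Maj(uint64_t a, uint64_t b, uint64_t c)
{
    return (a & b) | ((a | b) & c);
}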
204
205sub BODY_16_XX {
206my $i=@_[0];
207my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]);
208
209$code.=<<___;
210 $SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i)
211 $ADDU @X[0],@X[9] # +=X[i+9]
212 $SLL $tmp1,@X[1],`$SZ*8-@sigma0[2]`
213 $SRL $tmp0,@X[1],@sigma0[1]
214 xor $tmp2,$tmp1
215 $SLL $tmp1,`@sigma0[2]-@sigma0[1]`
216 xor $tmp2,$tmp0
217 $SRL $tmp0,@X[1],@sigma0[2]
218 xor $tmp2,$tmp1
219
220 $SRL $tmp3,@X[14],@sigma1[0]
221 xor $tmp2,$tmp0 # sigma0(X[i+1])
222 $SLL $tmp1,@X[14],`$SZ*8-@sigma1[2]`
223 $ADDU @X[0],$tmp2
224 $SRL $tmp0,@X[14],@sigma1[1]
225 xor $tmp3,$tmp1
226 $SLL $tmp1,`@sigma1[2]-@sigma1[1]`
227 xor $tmp3,$tmp0
228 $SRL $tmp0,@X[14],@sigma1[2]
229 xor $tmp3,$tmp1
230
231 xor $tmp3,$tmp0 # sigma1(X[i+14])
232 $ADDU @X[0],$tmp3
233___
234 &BODY_00_15(@_);
235}
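BODY_16_XX performs the FIPS 180-4 message expansion in place over a 16-word ring buffer, so W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16] becomes the update below (a sketch; sigma0/sigma1 as in the earlier C fragment):

#include <stdint.h>

uint64_t sigma0(uint64_t x);  /* as sketched earlier */
uint64_t sigma1(uint64_t x);

/* In-place message expansion over a 16-entry ring buffer: X[i%16]
 * holds W[i-16] on entry and W[i] on exit. */
static void expand(uint64_t X[16], unsigned i)
{
    X[i % 16] += sigma0(X[(i + 1) % 16])    /* sigma0(W[i-15]) */
               + X[(i + 9) % 16]            /* W[i-7]          */
               + sigma1(X[(i + 14) % 16]);  /* sigma1(W[i-2])  */
}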
236
237$FRAMESIZE=16*$SZ+16*$SZREG;
238$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
239
240$code.=<<___;
241#ifdef OPENSSL_FIPSCANISTER
242# include <openssl/fipssyms.h>
243#endif
244
245.text
246.set noat
247#if !defined(__vxworks) || defined(__pic__)
248.option pic2
249#endif
250
251.align 5
252.globl sha${label}_block_data_order
253.ent sha${label}_block_data_order
254sha${label}_block_data_order:
255 .frame $sp,$FRAMESIZE,$ra
256 .mask $SAVED_REGS_MASK,-$SZREG
257 .set noreorder
258___
259$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
260 .cpload $pf
261___
262$code.=<<___;
263 $PTR_SUB $sp,$FRAMESIZE
264 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
265 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
266 $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
267 $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
268 $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
269 $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
270 $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
271 $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
272 $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
273 $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
274___
275$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
276 $REG_S $s3,$FRAMESIZE-11*$SZREG($sp)
277 $REG_S $s2,$FRAMESIZE-12*$SZREG($sp)
278 $REG_S $s1,$FRAMESIZE-13*$SZREG($sp)
279 $REG_S $s0,$FRAMESIZE-14*$SZREG($sp)
280 $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
281___
282$code.=<<___;
283 $PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)`
284___
285$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
286 .cplocal $Ktbl
287 .cpsetup $pf,$zero,sha${label}_block_data_order
288___
289$code.=<<___;
290 .set reorder
291 la $Ktbl,K${label} # PIC-ified 'load address'
292
293 $LD $A,0*$SZ($ctx) # load context
294 $LD $B,1*$SZ($ctx)
295 $LD $C,2*$SZ($ctx)
296 $LD $D,3*$SZ($ctx)
297 $LD $E,4*$SZ($ctx)
298 $LD $F,5*$SZ($ctx)
299 $LD $G,6*$SZ($ctx)
300 $LD $H,7*$SZ($ctx)
301
302 $PTR_ADD @X[15],$inp # pointer to the end of input
303 $REG_S @X[15],16*$SZ($sp)
304 b .Loop
305
306.align 5
307.Loop:
308 ${LD}l @X[0],$MSB($inp)
309 ${LD}r @X[0],$LSB($inp)
310___
311for ($i=0;$i<16;$i++)
312{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
313$code.=<<___;
314 b .L16_xx
315.align 4
316.L16_xx:
317___
318for (;$i<32;$i++)
319{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
320$code.=<<___;
321 and @X[6],0xfff
322 li @X[7],$lastK
323 .set noreorder
324 bne @X[6],@X[7],.L16_xx
325 $PTR_ADD $Ktbl,16*$SZ # Ktbl+=16
326
327 $REG_L @X[15],16*$SZ($sp) # restore pointer to the end of input
328 $LD @X[0],0*$SZ($ctx)
329 $LD @X[1],1*$SZ($ctx)
330 $LD @X[2],2*$SZ($ctx)
331 $PTR_ADD $inp,16*$SZ
332 $LD @X[3],3*$SZ($ctx)
333 $ADDU $A,@X[0]
334 $LD @X[4],4*$SZ($ctx)
335 $ADDU $B,@X[1]
336 $LD @X[5],5*$SZ($ctx)
337 $ADDU $C,@X[2]
338 $LD @X[6],6*$SZ($ctx)
339 $ADDU $D,@X[3]
340 $LD @X[7],7*$SZ($ctx)
341 $ADDU $E,@X[4]
342 $ST $A,0*$SZ($ctx)
343 $ADDU $F,@X[5]
344 $ST $B,1*$SZ($ctx)
345 $ADDU $G,@X[6]
346 $ST $C,2*$SZ($ctx)
347 $ADDU $H,@X[7]
348 $ST $D,3*$SZ($ctx)
349 $ST $E,4*$SZ($ctx)
350 $ST $F,5*$SZ($ctx)
351 $ST $G,6*$SZ($ctx)
352 $ST $H,7*$SZ($ctx)
353
354 bnel $inp,@X[15],.Loop
355 $PTR_SUB $Ktbl,`($rounds-16)*$SZ` # rewind $Ktbl
356
357 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
358 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
359 $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
360 $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
361 $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
362 $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
363 $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
364 $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
365 $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
366 $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
367___
368$code.=<<___ if ($flavour =~ /nubi/i);
369 $REG_L $s3,$FRAMESIZE-11*$SZREG($sp)
370 $REG_L $s2,$FRAMESIZE-12*$SZREG($sp)
371 $REG_L $s1,$FRAMESIZE-13*$SZREG($sp)
372 $REG_L $s0,$FRAMESIZE-14*$SZREG($sp)
373 $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
374___
375$code.=<<___;
376 jr $ra
377 $PTR_ADD $sp,$FRAMESIZE
378.end sha${label}_block_data_order
379
380.rdata
381.align 5
382K${label}:
383___
384if ($SZ==4) {
385$code.=<<___;
386 .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
387 .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
388 .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
389 .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
390 .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
391 .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
392 .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
393 .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
394 .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
395 .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
396 .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
397 .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
398 .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
399 .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
400 .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
401 .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
402___
403} else {
404$code.=<<___;
405 .dword 0x428a2f98d728ae22, 0x7137449123ef65cd
406 .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
407 .dword 0x3956c25bf348b538, 0x59f111f1b605d019
408 .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
409 .dword 0xd807aa98a3030242, 0x12835b0145706fbe
410 .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
411 .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
412 .dword 0x9bdc06a725c71235, 0xc19bf174cf692694
413 .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
414 .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
415 .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
416 .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
417 .dword 0x983e5152ee66dfab, 0xa831c66d2db43210
418 .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4
419 .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725
420 .dword 0x06ca6351e003826f, 0x142929670a0e6e70
421 .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926
422 .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
423 .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8
424 .dword 0x81c2c92e47edaee6, 0x92722c851482353b
425 .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001
426 .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30
427 .dword 0xd192e819d6ef5218, 0xd69906245565a910
428 .dword 0xf40e35855771202a, 0x106aa07032bbd1b8
429 .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
430 .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
431 .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
432 .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
433 .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60
434 .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec
435 .dword 0x90befffa23631e28, 0xa4506cebde82bde9
436 .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b
437 .dword 0xca273eceea26619c, 0xd186b8c721c0c207
438 .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
439 .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6
440 .dword 0x113f9804bef90dae, 0x1b710b35131c471b
441 .dword 0x28db77f523047d84, 0x32caab7b40c72493
442 .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
443 .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
444 .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
445___
446}
447$code.=<<___;
448.asciiz "SHA${label} for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
449.align 5
450
451___
452
453$code =~ s/\`([^\`]*)\`/eval $1/gem;
454print $code;
455close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-parisc.pl b/src/lib/libcrypto/sha/asm/sha512-parisc.pl
deleted file mode 100755
index e24ee58ae9..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-parisc.pl
+++ /dev/null
@@ -1,791 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256/512 block procedure for PA-RISC.
11
12# June 2009.
13#
14# SHA256 performance is >75% better than gcc 3.2 generated code on
15# PA-7100LC. Compared to code generated by the vendor compiler, this
16# implementation is almost 70% faster in a 64-bit build, but delivers
17# virtually the same performance in a 32-bit build on PA-8600.
18#
19# SHA512 performance is >2.9x better than gcc 3.2 generated code on
20# PA-7100LC, a PA-RISC 1.1 processor. The implementation detects if the
21# code is executed on a PA-RISC 2.0 processor and switches to a 64-bit
22# code path, delivering adequate performance even in a "blended" 32-bit
23# build. Though the 64-bit code is not any faster than code generated by
24# the vendor compiler on PA-8600...
25#
26# Special thanks to polarhome.com for providing HP-UX account.
27
28$flavour = shift;
29$output = shift;
30open STDOUT,">$output";
31
32if ($flavour =~ /64/) {
33 $LEVEL ="2.0W";
34 $SIZE_T =8;
35 $FRAME_MARKER =80;
36 $SAVED_RP =16;
37 $PUSH ="std";
38 $PUSHMA ="std,ma";
39 $POP ="ldd";
40 $POPMB ="ldd,mb";
41} else {
42 $LEVEL ="1.0";
43 $SIZE_T =4;
44 $FRAME_MARKER =48;
45 $SAVED_RP =20;
46 $PUSH ="stw";
47 $PUSHMA ="stwm";
48 $POP ="ldw";
49 $POPMB ="ldwm";
50}
51
52if ($output =~ /512/) {
53 $func="sha512_block_data_order";
54 $SZ=8;
55 @Sigma0=(28,34,39);
56 @Sigma1=(14,18,41);
57 @sigma0=(1, 8, 7);
58 @sigma1=(19,61, 6);
59 $rounds=80;
60 $LAST10BITS=0x017;
61 $LD="ldd";
62 $LDM="ldd,ma";
63 $ST="std";
64} else {
65 $func="sha256_block_data_order";
66 $SZ=4;
67 @Sigma0=( 2,13,22);
68 @Sigma1=( 6,11,25);
69 @sigma0=( 7,18, 3);
70 @sigma1=(17,19,10);
71 $rounds=64;
72 $LAST10BITS=0x0f2;
73 $LD="ldw";
74 $LDM="ldwm";
75 $ST="stw";
76}
77
78$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
79 # [+ argument transfer]
80$XOFF=16*$SZ+32; # local variables
81$FRAME+=$XOFF;
82$XOFF+=$FRAME_MARKER; # distance between %sp and local variables
83
84$ctx="%r26"; # zapped by $a0
85$inp="%r25"; # zapped by $a1
86$num="%r24"; # zapped by $t0
87
88$a0 ="%r26";
89$a1 ="%r25";
90$t0 ="%r24";
91$t1 ="%r29";
92$Tbl="%r31";
93
94@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");
95
96@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
97 "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);
98
99sub ROUND_00_15 {
100my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
101$code.=<<___;
102 _ror $e,$Sigma1[0],$a0
103 and $f,$e,$t0
104 _ror $e,$Sigma1[1],$a1
105 addl $t1,$h,$h
106 andcm $g,$e,$t1
107 xor $a1,$a0,$a0
108 _ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1
109 or $t0,$t1,$t1 ; Ch(e,f,g)
110 addl @X[$i%16],$h,$h
111 xor $a0,$a1,$a1 ; Sigma1(e)
112 addl $t1,$h,$h
113 _ror $a,$Sigma0[0],$a0
114 addl $a1,$h,$h
115
116 _ror $a,$Sigma0[1],$a1
117 and $a,$b,$t0
118 and $a,$c,$t1
119 xor $a1,$a0,$a0
120 _ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1
121 xor $t1,$t0,$t0
122 and $b,$c,$t1
123 xor $a0,$a1,$a1 ; Sigma0(a)
124 addl $h,$d,$d
125 xor $t1,$t0,$t0 ; Maj(a,b,c)
126 `"$LDM $SZ($Tbl),$t1" if ($i<15)`
127 addl $a1,$h,$h
128 addl $t0,$h,$h
129
130___
131}
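Note the PA-RISC flavour of the selection functions: andcm computes g&~e directly, so Ch(e,f,g) is assembled as (e&f)|(g&~e), while Maj uses the textbook xor form. Both are equivalent to the variants in the MIPS module above; in C (a sketch):

#include <stdint.h>

/* Ch via PA-RISC's and/andcm/or: "andcm g,e" is g & ~e. */
static uint64_t Ch(uint64_t e, uint64_t f, uint64_t g)
{
    return (e & f) | (g & ~e);
}

/* Textbook majority, matching the and/and/xor/and/xor sequence above. */
static uint64_t Maj(uint64_t a, uint64_t b, uint64_t c)
{
    return (a & b) ^ (a & c) ^ (b & c);
}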
132
133sub ROUND_16_xx {
134my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
135$i-=16;
136$code.=<<___;
137 _ror @X[($i+1)%16],$sigma0[0],$a0
138 _ror @X[($i+1)%16],$sigma0[1],$a1
139 addl @X[($i+9)%16],@X[$i],@X[$i]
140 _ror @X[($i+14)%16],$sigma1[0],$t0
141 _ror @X[($i+14)%16],$sigma1[1],$t1
142 xor $a1,$a0,$a0
143 _shr @X[($i+1)%16],$sigma0[2],$a1
144 xor $t1,$t0,$t0
145 _shr @X[($i+14)%16],$sigma1[2],$t1
146 xor $a1,$a0,$a0 ; sigma0(X[(i+1)&0x0f])
147 xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f])
148 $LDM $SZ($Tbl),$t1
149 addl $a0,@X[$i],@X[$i]
150 addl $t0,@X[$i],@X[$i]
151___
152$code.=<<___ if ($i==15);
153 extru $t1,31,10,$a1
154 comiclr,<> $LAST10BITS,$a1,%r0
155 ldo 1($Tbl),$Tbl ; signal end of $Tbl
156___
157&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
158}
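The $i==15 tail is a neat end-of-table trick: the low 10 bits of the last K constant are unique within the table ($LAST10BITS), so after the final load the code sets bit 0 of the otherwise-aligned $Tbl pointer, and the round loop's bb instruction tests that bit. Roughly, in C (an illustrative sketch; 0x017 is the SHA-512 $LAST10BITS from above):

#include <stdint.h>

/* Detect the final K[] entry by its low 10 bits and flag it in bit 0
 * of the (8-byte aligned) table pointer, as the extru/comiclr/ldo
 * triple above does. */
static int consumed_last_k(uint64_t k, uintptr_t *tbl)
{
    if ((k & 0x3ff) == 0x017)  /* low 10 bits of 0x6c44198c4a475817 */
        *tbl |= 1;             /* signal end of table */
    return (int)(*tbl & 1);    /* tested by "bb" in the round loop */
}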
159
160$code=<<___;
161 .LEVEL $LEVEL
162 .SPACE \$TEXT\$
163 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
164
165 .ALIGN 64
166L\$table
167___
168$code.=<<___ if ($SZ==8);
169 .WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
170 .WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
171 .WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
172 .WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
173 .WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
174 .WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
175 .WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
176 .WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
177 .WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
178 .WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
179 .WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
180 .WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
181 .WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
182 .WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
183 .WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
184 .WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
185 .WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
186 .WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
187 .WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
188 .WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
189 .WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
190 .WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
191 .WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
192 .WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
193 .WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
194 .WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
195 .WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
196 .WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
197 .WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
198 .WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
199 .WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
200 .WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
201 .WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
202 .WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
203 .WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
204 .WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
205 .WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
206 .WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
207 .WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
208 .WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
209___
210$code.=<<___ if ($SZ==4);
211 .WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
212 .WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
213 .WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
214 .WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
215 .WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
216 .WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
217 .WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
218 .WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
219 .WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
220 .WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
221 .WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
222 .WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
223 .WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
224 .WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
225 .WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
226 .WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
227___
228$code.=<<___;
229
230 .EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
231 .ALIGN 64
232$func
233 .PROC
234 .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
235 .ENTRY
236 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
237 $PUSHMA %r3,$FRAME(%sp)
238 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
239 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
240 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
241 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
242 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
243 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
244 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
245 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
246 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
247 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
248 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
249 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
250 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
251 $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
252 $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
253
254 _shl $num,`log(16*$SZ)/log(2)`,$num
255 addl $inp,$num,$num ; $num to point at the end of $inp
256
257 $PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments
258 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
259 $PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)
260
261 blr %r0,$Tbl
262 ldi 3,$t1
263L\$pic
264 andcm $Tbl,$t1,$Tbl ; wipe privilege level
265 ldo L\$table-L\$pic($Tbl),$Tbl
266___
267$code.=<<___ if ($SZ==8 && $SIZE_T==4);
268 ldi 31,$t1
269 mtctl $t1,%cr11
270 extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0
271 b L\$parisc1
272 nop
273___
274$code.=<<___;
275 $LD `0*$SZ`($ctx),$A ; load context
276 $LD `1*$SZ`($ctx),$B
277 $LD `2*$SZ`($ctx),$C
278 $LD `3*$SZ`($ctx),$D
279 $LD `4*$SZ`($ctx),$E
280 $LD `5*$SZ`($ctx),$F
281 $LD `6*$SZ`($ctx),$G
282 $LD `7*$SZ`($ctx),$H
283
284 extru $inp,31,`log($SZ)/log(2)`,$t0
285 sh3addl $t0,%r0,$t0
286 subi `8*$SZ`,$t0,$t0
287 mtctl $t0,%cr11 ; load %sar with align factor
288
289L\$oop
290 ldi `$SZ-1`,$t0
291 $LDM $SZ($Tbl),$t1
292 andcm $inp,$t0,$t0 ; align $inp
293___
294 for ($i=0;$i<15;$i++) { # load input block
295 $code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; }
296$code.=<<___;
297 cmpb,*= $inp,$t0,L\$aligned
298 $LD `$SZ*15`($t0),@X[15]
299 $LD `$SZ*16`($t0),@X[16]
300___
301 for ($i=0;$i<16;$i++) { # align data
302 $code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; }
303$code.=<<___;
304L\$aligned
305 nop ; otherwise /usr/ccs/bin/as is confused by below .WORD
306___
307
308for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
309$code.=<<___;
310L\$rounds
311 nop ; otherwise /usr/ccs/bin/as is confused by below .WORD
312___
313for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
314$code.=<<___;
315 bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled?
316 nop
317
318 $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
319 $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
320 $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
321 ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl
322
323 $LD `0*$SZ`($ctx),@X[0] ; load context
324 $LD `1*$SZ`($ctx),@X[1]
325 $LD `2*$SZ`($ctx),@X[2]
326 $LD `3*$SZ`($ctx),@X[3]
327 $LD `4*$SZ`($ctx),@X[4]
328 $LD `5*$SZ`($ctx),@X[5]
329 addl @X[0],$A,$A
330 $LD `6*$SZ`($ctx),@X[6]
331 addl @X[1],$B,$B
332 $LD `7*$SZ`($ctx),@X[7]
333 ldo `16*$SZ`($inp),$inp ; advance $inp
334
335 $ST $A,`0*$SZ`($ctx) ; save context
336 addl @X[2],$C,$C
337 $ST $B,`1*$SZ`($ctx)
338 addl @X[3],$D,$D
339 $ST $C,`2*$SZ`($ctx)
340 addl @X[4],$E,$E
341 $ST $D,`3*$SZ`($ctx)
342 addl @X[5],$F,$F
343 $ST $E,`4*$SZ`($ctx)
344 addl @X[6],$G,$G
345 $ST $F,`5*$SZ`($ctx)
346 addl @X[7],$H,$H
347 $ST $G,`6*$SZ`($ctx)
348 $ST $H,`7*$SZ`($ctx)
349
350 cmpb,*<>,n $inp,$num,L\$oop
351 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
352___
353if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0
354{{
355$code.=<<___;
356 b L\$done
357 nop
358
359 .ALIGN 64
360L\$parisc1
361___
362
363@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo,
364 $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
365 ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
366 "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
367$a0 ="%r17";
368$a1 ="%r18";
369$a2 ="%r19";
370$a3 ="%r20";
371$t0 ="%r21";
372$t1 ="%r22";
373$t2 ="%r28";
374$t3 ="%r29";
375$Tbl="%r31";
376
377@X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx
378
379sub ROUND_00_15_pa1 {
380my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
381 $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
382my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
383
384$code.=<<___ if (!$flag);
385 ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
386 ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
387___
388$code.=<<___;
389 shd $ehi,$elo,$Sigma1[0],$t0
390 add $Xlo,$hlo,$hlo
391 shd $elo,$ehi,$Sigma1[0],$t1
392 addc $Xhi,$hhi,$hhi ; h += X[i]
393 shd $ehi,$elo,$Sigma1[1],$t2
394 ldwm 8($Tbl),$Xhi
395 shd $elo,$ehi,$Sigma1[1],$t3
396 ldw -4($Tbl),$Xlo ; load K[i]
397 xor $t2,$t0,$t0
398 xor $t3,$t1,$t1
399 and $flo,$elo,$a0
400 and $fhi,$ehi,$a1
401 shd $ehi,$elo,$Sigma1[2],$t2
402 andcm $glo,$elo,$a2
403 shd $elo,$ehi,$Sigma1[2],$t3
404 andcm $ghi,$ehi,$a3
405 xor $t2,$t0,$t0
406 xor $t3,$t1,$t1 ; Sigma1(e)
407 add $Xlo,$hlo,$hlo
408 xor $a2,$a0,$a0
409 addc $Xhi,$hhi,$hhi ; h += K[i]
410 xor $a3,$a1,$a1 ; Ch(e,f,g)
411
412 add $t0,$hlo,$hlo
413 shd $ahi,$alo,$Sigma0[0],$t0
414 addc $t1,$hhi,$hhi ; h += Sigma1(e)
415 shd $alo,$ahi,$Sigma0[0],$t1
416 add $a0,$hlo,$hlo
417 shd $ahi,$alo,$Sigma0[1],$t2
418 addc $a1,$hhi,$hhi ; h += Ch(e,f,g)
419 shd $alo,$ahi,$Sigma0[1],$t3
420
421 xor $t2,$t0,$t0
422 xor $t3,$t1,$t1
423 shd $ahi,$alo,$Sigma0[2],$t2
424 and $alo,$blo,$a0
425 shd $alo,$ahi,$Sigma0[2],$t3
426 and $ahi,$bhi,$a1
427 xor $t2,$t0,$t0
428 xor $t3,$t1,$t1 ; Sigma0(a)
429
430 and $alo,$clo,$a2
431 and $ahi,$chi,$a3
432 xor $a2,$a0,$a0
433 add $hlo,$dlo,$dlo
434 xor $a3,$a1,$a1
435 addc $hhi,$dhi,$dhi ; d += h
436 and $blo,$clo,$a2
437 add $t0,$hlo,$hlo
438 and $bhi,$chi,$a3
439 addc $t1,$hhi,$hhi ; h += Sigma0(a)
440 xor $a2,$a0,$a0
441 add $a0,$hlo,$hlo
442 xor $a3,$a1,$a1 ; Maj(a,b,c)
443 addc $a1,$hhi,$hhi ; h += Maj(a,b,c)
444
445___
446$code.=<<___ if ($i==15 && $flag);
447 extru $Xlo,31,10,$Xlo
448 comiclr,= $LAST10BITS,$Xlo,%r0
449 b L\$rounds_pa1
450 nop
451___
452push(@X,shift(@X)); push(@X,shift(@X));
453}
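The PA-RISC 1.0 path keeps every 64-bit quantity in a hi:lo pair of 32-bit registers and chains add/addc through the round. Without a hardware carry flag, the same arithmetic reads like this in C (a sketch):

#include <stdint.h>

/* 64-bit addition on 32-bit halves, as the add/addc pairs above do:
 * the carry out of the low half is exactly (sum < addend). */
static void add64(uint32_t *hi, uint32_t *lo, uint32_t xhi, uint32_t xlo)
{
    uint32_t sum = *lo + xlo;
    *hi += xhi + (sum < xlo);  /* propagate the carry */
    *lo = sum;
}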
454
455sub ROUND_16_xx_pa1 {
456my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
457my ($i)=shift;
458$i-=16;
459$code.=<<___;
460 ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
461 ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
462 ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1
463 ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load X[i+9]
464 ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3
465 ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14]
466 shd $Xnhi,$Xnlo,$sigma0[0],$t0
467 shd $Xnlo,$Xnhi,$sigma0[0],$t1
468 add $a0,$Xlo,$Xlo
469 shd $Xnhi,$Xnlo,$sigma0[1],$t2
470 addc $a1,$Xhi,$Xhi
471 shd $Xnlo,$Xnhi,$sigma0[1],$t3
472 xor $t2,$t0,$t0
473 shd $Xnhi,$Xnlo,$sigma0[2],$t2
474 xor $t3,$t1,$t1
475 extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
476 xor $t2,$t0,$t0
477 shd $a3,$a2,$sigma1[0],$a0
478	xor $t3,$t1,$t1 ; sigma0(X[(i+1)&0x0f])
479 shd $a2,$a3,$sigma1[0],$a1
480 add $t0,$Xlo,$Xlo
481 shd $a3,$a2,$sigma1[1],$t2
482 addc $t1,$Xhi,$Xhi
483 shd $a2,$a3,$sigma1[1],$t3
484 xor $t2,$a0,$a0
485 shd $a3,$a2,$sigma1[2],$t2
486 xor $t3,$a1,$a1
487 extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
488 xor $t2,$a0,$a0
489	xor $t3,$a1,$a1 ; sigma1(X[(i+14)&0x0f])
490 add $a0,$Xlo,$Xlo
491 addc $a1,$Xhi,$Xhi
492
493 stw $Xhi,`-$XOFF+8*($i%16)`(%sp)
494 stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp)
495___
496&ROUND_00_15_pa1($i,@_,1);
497}
498$code.=<<___;
499 ldw `0*4`($ctx),$Ahi ; load context
500 ldw `1*4`($ctx),$Alo
501 ldw `2*4`($ctx),$Bhi
502 ldw `3*4`($ctx),$Blo
503 ldw `4*4`($ctx),$Chi
504 ldw `5*4`($ctx),$Clo
505 ldw `6*4`($ctx),$Dhi
506 ldw `7*4`($ctx),$Dlo
507 ldw `8*4`($ctx),$Ehi
508 ldw `9*4`($ctx),$Elo
509 ldw `10*4`($ctx),$Fhi
510 ldw `11*4`($ctx),$Flo
511 ldw `12*4`($ctx),$Ghi
512 ldw `13*4`($ctx),$Glo
513 ldw `14*4`($ctx),$Hhi
514 ldw `15*4`($ctx),$Hlo
515
516 extru $inp,31,2,$t0
517 sh3addl $t0,%r0,$t0
518 subi 32,$t0,$t0
519 mtctl $t0,%cr11 ; load %sar with align factor
520
521L\$oop_pa1
522 extru $inp,31,2,$a3
523 comib,= 0,$a3,L\$aligned_pa1
524 sub $inp,$a3,$inp
525
526 ldw `0*4`($inp),$X[0]
527 ldw `1*4`($inp),$X[1]
528 ldw `2*4`($inp),$t2
529 ldw `3*4`($inp),$t3
530 ldw `4*4`($inp),$a0
531 ldw `5*4`($inp),$a1
532 ldw `6*4`($inp),$a2
533 ldw `7*4`($inp),$a3
534 vshd $X[0],$X[1],$X[0]
535 vshd $X[1],$t2,$X[1]
536 stw $X[0],`-$XOFF+0*4`(%sp)
537 ldw `8*4`($inp),$t0
538 vshd $t2,$t3,$t2
539 stw $X[1],`-$XOFF+1*4`(%sp)
540 ldw `9*4`($inp),$t1
541 vshd $t3,$a0,$t3
542___
543{
544my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
545for ($i=2;$i<=(128/4-8);$i++) {
546$code.=<<___;
547 stw $t[0],`-$XOFF+$i*4`(%sp)
548 ldw `(8+$i)*4`($inp),$t[0]
549 vshd $t[1],$t[2],$t[1]
550___
551push(@t,shift(@t));
552}
553for (;$i<(128/4-1);$i++) {
554$code.=<<___;
555 stw $t[0],`-$XOFF+$i*4`(%sp)
556 vshd $t[1],$t[2],$t[1]
557___
558push(@t,shift(@t));
559}
560$code.=<<___;
561 b L\$collected_pa1
562 stw $t[0],`-$XOFF+$i*4`(%sp)
563
564___
565}
566$code.=<<___;
567L\$aligned_pa1
568 ldw `0*4`($inp),$X[0]
569 ldw `1*4`($inp),$X[1]
570 ldw `2*4`($inp),$t2
571 ldw `3*4`($inp),$t3
572 ldw `4*4`($inp),$a0
573 ldw `5*4`($inp),$a1
574 ldw `6*4`($inp),$a2
575 ldw `7*4`($inp),$a3
576 stw $X[0],`-$XOFF+0*4`(%sp)
577 ldw `8*4`($inp),$t0
578 stw $X[1],`-$XOFF+1*4`(%sp)
579 ldw `9*4`($inp),$t1
580___
581{
582my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
583for ($i=2;$i<(128/4-8);$i++) {
584$code.=<<___;
585 stw $t[0],`-$XOFF+$i*4`(%sp)
586 ldw `(8+$i)*4`($inp),$t[0]
587___
588push(@t,shift(@t));
589}
590for (;$i<128/4;$i++) {
591$code.=<<___;
592 stw $t[0],`-$XOFF+$i*4`(%sp)
593___
594push(@t,shift(@t));
595}
596$code.="L\$collected_pa1\n";
597}
598
599for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
600$code.="L\$rounds_pa1\n";
601for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
602
603$code.=<<___;
604 $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
605 $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
606 $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
607 ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl
608
609 ldw `0*4`($ctx),$t1 ; update context
610 ldw `1*4`($ctx),$t0
611 ldw `2*4`($ctx),$t3
612 ldw `3*4`($ctx),$t2
613 ldw `4*4`($ctx),$a1
614 ldw `5*4`($ctx),$a0
615 ldw `6*4`($ctx),$a3
616 add $t0,$Alo,$Alo
617 ldw `7*4`($ctx),$a2
618 addc $t1,$Ahi,$Ahi
619 ldw `8*4`($ctx),$t1
620 add $t2,$Blo,$Blo
621 ldw `9*4`($ctx),$t0
622 addc $t3,$Bhi,$Bhi
623 ldw `10*4`($ctx),$t3
624 add $a0,$Clo,$Clo
625 ldw `11*4`($ctx),$t2
626 addc $a1,$Chi,$Chi
627 ldw `12*4`($ctx),$a1
628 add $a2,$Dlo,$Dlo
629 ldw `13*4`($ctx),$a0
630 addc $a3,$Dhi,$Dhi
631 ldw `14*4`($ctx),$a3
632 add $t0,$Elo,$Elo
633 ldw `15*4`($ctx),$a2
634 addc $t1,$Ehi,$Ehi
635 stw $Ahi,`0*4`($ctx)
636 add $t2,$Flo,$Flo
637 stw $Alo,`1*4`($ctx)
638 addc $t3,$Fhi,$Fhi
639 stw $Bhi,`2*4`($ctx)
640 add $a0,$Glo,$Glo
641 stw $Blo,`3*4`($ctx)
642 addc $a1,$Ghi,$Ghi
643 stw $Chi,`4*4`($ctx)
644 add $a2,$Hlo,$Hlo
645 stw $Clo,`5*4`($ctx)
646 addc $a3,$Hhi,$Hhi
647 stw $Dhi,`6*4`($ctx)
648 ldo `16*$SZ`($inp),$inp ; advance $inp
649 stw $Dlo,`7*4`($ctx)
650 stw $Ehi,`8*4`($ctx)
651 stw $Elo,`9*4`($ctx)
652 stw $Fhi,`10*4`($ctx)
653 stw $Flo,`11*4`($ctx)
654 stw $Ghi,`12*4`($ctx)
655 stw $Glo,`13*4`($ctx)
656 stw $Hhi,`14*4`($ctx)
657 comb,= $inp,$num,L\$done
658 stw $Hlo,`15*4`($ctx)
659 b L\$oop_pa1
660 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
661L\$done
662___
663}}
664$code.=<<___;
665 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
666 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
667 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
668 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
669 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
670 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
671 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
672 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
673 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
674 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
675 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
676 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
677 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
678 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
679 $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
680 $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
681 bv (%r2)
682 .EXIT
683 $POPMB -$FRAME(%sp),%r3
684 .PROCEND
685 .STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
686___
687
688# Explicitly encode PA-RISC 2.0 instructions used in this module, so
689# that it can be compiled with .LEVEL 1.0. It should be noted that I
690# wouldn't have to do this if the GNU assembler understood the .ALLOW 2.0
691# directive...
692
693my $ldd = sub {
694 my ($mod,$args) = @_;
695 my $orig = "ldd$mod\t$args";
696
697 if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices
698 { my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1);
699 $opcode|=(1<<3) if ($mod =~ /^,m/);
700 $opcode|=(1<<2) if ($mod =~ /^,mb/);
701 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
702 }
703 else { "\t".$orig; }
704};
705
706my $std = sub {
707 my ($mod,$args) = @_;
708 my $orig = "std$mod\t$args";
709
710 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
711 { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
712 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
713 }
714 else { "\t".$orig; }
715};
716
717my $extrd = sub {
718 my ($mod,$args) = @_;
719 my $orig = "extrd$mod\t$args";
720
721	# I only have the ",u" completer; it's implicitly encoded...
722 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
723 { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
724 my $len=32-$3;
725 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
726 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
727 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
728 }
729 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
730 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
731 my $len=32-$2;
732 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
733 $opcode |= (1<<13) if ($mod =~ /,\**=/);
734 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
735 }
736 else { "\t".$orig; }
737};
738
739my $shrpd = sub {
740 my ($mod,$args) = @_;
741 my $orig = "shrpd$mod\t$args";
742
743 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
744 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
745 my $cpos=63-$3;
746 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
747 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
748 }
749 elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
750 { sprintf "\t.WORD\t0x%08x\t; %s",
751 (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
752 }
753 else { "\t".$orig; }
754};
755
756sub assemble {
757 my ($mnemonic,$mod,$args)=@_;
758 my $opcode = eval("\$$mnemonic");
759
760 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
761}
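Each closure above recognizes one PA-RISC 2.0 mnemonic and emits its raw 32-bit encoding as a .WORD, which is what lets the module assemble at .LEVEL 1.0. The format-3 ldd case, restated in C purely to make the bit layout visible (a sketch mirroring the Perl arithmetic above):

#include <stdint.h>

/* Format-3 "ldd disp(%rb),%rt": major opcode 0x14, base in bits
 * 21-25, target in bits 16-20, displacement bits 3-12 shifted into
 * place plus the sign bit, exactly as the $ldd closure computes. */
static uint32_t encode_ldd(int disp, unsigned rb, unsigned rt)
{
    return (0x14u << 26) | (rb << 21) | (rt << 16)
         | (((unsigned)disp & 0x1ff8u) << 1)
         | (((unsigned)disp >> 13) & 1);  /* sign bit of disp */
}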
762
763foreach (split("\n",$code)) {
764 s/\`([^\`]*)\`/eval $1/ge;
765
766 s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
767 $3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32
768 : sprintf("shd\t%$1,%$2,%d",$3)/e or
769	# translate made-up instructions: _ror, _shr, _align, _shl
770 s/_ror(\s+)(%r[0-9]+),/
771 ($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or
772
773 s/_shr(\s+%r[0-9]+),([0-9]+),/
774 $SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
775 : sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or
776
777 s/_align(\s+%r[0-9]+,%r[0-9]+),/
778 ($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or
779
780 s/_shl(\s+%r[0-9]+),([0-9]+),/
781 $SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
782 : sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;
783
784 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);
785
786 s/cmpb,\*/comb,/ if ($SIZE_T==4);
787
788 print $_,"\n";
789}
790
791close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-ppc.pl b/src/lib/libcrypto/sha/asm/sha512-ppc.pl
deleted file mode 100755
index 6b44a68e59..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-ppc.pl
+++ /dev/null
@@ -1,460 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# I let hardware handle unaligned input, except on page boundaries
11# (see below for details). Otherwise it is a straightforward implementation
12# with the X vector in the register bank. The module is big-endian [which is
13# no big deal as there are no little-endian targets left around].
14
15# sha256 | sha512
16# -m64 -m32 | -m64 -m32
17# --------------------------------------+-----------------------
18# PPC970,gcc-4.0.0 +50% +38% | +40% +410%(*)
19# Power6,xlc-7 +150% +90% | +100% +430%(*)
20#
21# (*) 64-bit code in 32-bit application context, which actually is
22# on the TODO list. It should be noted that for safe deployment in
23# a 32-bit *multi-threaded* context asynchronous signals should be
24# blocked upon entry to the SHA512 block routine. This is because
25# the 32-bit signaling procedure invalidates the upper halves of GPRs.
26# The context switch procedure preserves them, but signaling does not:-(
27
28# The second version is truly multi-thread safe. The trouble with the
29# original version was that it used the thread-local storage pointer
30# register. Well, it scrupulously preserved it, but the problem would arise
31# the moment an asynchronous signal was delivered and the signal handler
32# dereferenced the TLS pointer. While that is never the case in the openssl
33# application or test suite, we have to respect this scenario and not
34# use the TLS pointer register. The alternative would be to require the
35# caller to block signals prior to calling this routine. For the record, in
36# 32-bit context R2 serves as the TLS pointer; in 64-bit context, R13.
37
38$flavour=shift;
39$output =shift;
40
41if ($flavour =~ /64/) {
42 $SIZE_T=8;
43 $LRSAVE=2*$SIZE_T;
44 $STU="stdu";
45 $UCMP="cmpld";
46 $SHL="sldi";
47 $POP="ld";
48 $PUSH="std";
49} elsif ($flavour =~ /32/) {
50 $SIZE_T=4;
51 $LRSAVE=$SIZE_T;
52 $STU="stwu";
53 $UCMP="cmplw";
54 $SHL="slwi";
55 $POP="lwz";
56 $PUSH="stw";
57} else { die "nonsense $flavour"; }
58
59$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
60( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
61( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
62die "can't locate ppc-xlate.pl";
63
64open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
65
66if ($output =~ /512/) {
67 $func="sha512_block_data_order";
68 $SZ=8;
69 @Sigma0=(28,34,39);
70 @Sigma1=(14,18,41);
71 @sigma0=(1, 8, 7);
72 @sigma1=(19,61, 6);
73 $rounds=80;
74 $LD="ld";
75 $ST="std";
76 $ROR="rotrdi";
77 $SHR="srdi";
78} else {
79 $func="sha256_block_data_order";
80 $SZ=4;
81 @Sigma0=( 2,13,22);
82 @Sigma1=( 6,11,25);
83 @sigma0=( 7,18, 3);
84 @sigma1=(17,19,10);
85 $rounds=64;
86 $LD="lwz";
87 $ST="stw";
88 $ROR="rotrwi";
89 $SHR="srwi";
90}
91
92$FRAME=32*$SIZE_T+16*$SZ;
93$LOCALS=6*$SIZE_T;
94
95$sp ="r1";
96$toc="r2";
97$ctx="r3"; # zapped by $a0
98$inp="r4"; # zapped by $a1
99$num="r5"; # zapped by $t0
100
101$T ="r0";
102$a0 ="r3";
103$a1 ="r4";
104$t0 ="r5";
105$t1 ="r6";
106$Tbl="r7";
107
108$A ="r8";
109$B ="r9";
110$C ="r10";
111$D ="r11";
112$E ="r12";
113$F ="r13"; $F="r2" if ($SIZE_T==8);# reassigned to exempt TLS pointer
114$G ="r14";
115$H ="r15";
116
117@V=($A,$B,$C,$D,$E,$F,$G,$H);
118@X=("r16","r17","r18","r19","r20","r21","r22","r23",
119 "r24","r25","r26","r27","r28","r29","r30","r31");
120
121$inp="r31"; # reassigned $inp! aliases with @X[15]
122
123sub ROUND_00_15 {
124my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
125$code.=<<___;
126 $LD $T,`$i*$SZ`($Tbl)
127 $ROR $a0,$e,$Sigma1[0]
128 $ROR $a1,$e,$Sigma1[1]
129 and $t0,$f,$e
130 andc $t1,$g,$e
131 add $T,$T,$h
132 xor $a0,$a0,$a1
133 $ROR $a1,$a1,`$Sigma1[2]-$Sigma1[1]`
134 or $t0,$t0,$t1 ; Ch(e,f,g)
135 add $T,$T,@X[$i]
136 xor $a0,$a0,$a1 ; Sigma1(e)
137 add $T,$T,$t0
138 add $T,$T,$a0
139
140 $ROR $a0,$a,$Sigma0[0]
141 $ROR $a1,$a,$Sigma0[1]
142 and $t0,$a,$b
143 and $t1,$a,$c
144 xor $a0,$a0,$a1
145 $ROR $a1,$a1,`$Sigma0[2]-$Sigma0[1]`
146 xor $t0,$t0,$t1
147 and $t1,$b,$c
148 xor $a0,$a0,$a1 ; Sigma0(a)
149 add $d,$d,$T
150 xor $t0,$t0,$t1 ; Maj(a,b,c)
151 add $h,$T,$a0
152 add $h,$h,$t0
153
154___
155}
156
157sub ROUND_16_xx {
158my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
159$i-=16;
160$code.=<<___;
161 $ROR $a0,@X[($i+1)%16],$sigma0[0]
162 $ROR $a1,@X[($i+1)%16],$sigma0[1]
163 $ROR $t0,@X[($i+14)%16],$sigma1[0]
164 $ROR $t1,@X[($i+14)%16],$sigma1[1]
165 xor $a0,$a0,$a1
166 $SHR $a1,@X[($i+1)%16],$sigma0[2]
167 xor $t0,$t0,$t1
168 $SHR $t1,@X[($i+14)%16],$sigma1[2]
169 add @X[$i],@X[$i],@X[($i+9)%16]
170 xor $a0,$a0,$a1 ; sigma0(X[(i+1)&0x0f])
171 xor $t0,$t0,$t1 ; sigma1(X[(i+14)&0x0f])
172 add @X[$i],@X[$i],$a0
173 add @X[$i],@X[$i],$t0
174___
175&ROUND_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h);
176}
177
178$code=<<___;
179.machine "any"
180.text
181
182.globl $func
183.align 6
184$func:
185 $STU $sp,-$FRAME($sp)
186 mflr r0
187 $SHL $num,$num,`log(16*$SZ)/log(2)`
188
189 $PUSH $ctx,`$FRAME-$SIZE_T*22`($sp)
190
191 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
192 $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
193 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
194 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
195 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
196 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
197 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
198 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
199 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
200 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
201 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
202 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
203 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
204 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
205 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
206 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
207 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
208 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
209 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
210 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
211 $PUSH r0,`$FRAME+$LRSAVE`($sp)
212
213 $LD $A,`0*$SZ`($ctx)
214 mr $inp,r4 ; incarnate $inp
215 $LD $B,`1*$SZ`($ctx)
216 $LD $C,`2*$SZ`($ctx)
217 $LD $D,`3*$SZ`($ctx)
218 $LD $E,`4*$SZ`($ctx)
219 $LD $F,`5*$SZ`($ctx)
220 $LD $G,`6*$SZ`($ctx)
221 $LD $H,`7*$SZ`($ctx)
222
223 bl LPICmeup
224LPICedup:
225 andi. r0,$inp,3
226 bne Lunaligned
227Laligned:
228 add $num,$inp,$num
229 $PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
230 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
231 bl Lsha2_block_private
232 b Ldone
233
234; The PowerPC specification allows an implementation to be ill-behaved
235; upon an unaligned access which crosses a page boundary. The "better
236; safe than sorry" principle makes me treat it specially. But I don't
237; look for the particular offending word, but rather for the input
238; block which crosses the boundary. Once found, that block is aligned
239; and hashed separately (see the C restatement after this code block)...
240.align 4
241Lunaligned:
242 subfic $t1,$inp,4096
243 andi. $t1,$t1,`4096-16*$SZ` ; distance to closest page boundary
244 beq Lcross_page
245 $UCMP $num,$t1
246 ble- Laligned ; didn't cross the page boundary
247 subfc $num,$t1,$num
248 add $t1,$inp,$t1
249 $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real remaining num
250 $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; intermediate end pointer
251 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
252 bl Lsha2_block_private
253	; $inp equals the intermediate end pointer here
254 $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real remaining num
255Lcross_page:
256 li $t1,`16*$SZ/4`
257 mtctr $t1
258 addi r20,$sp,$LOCALS ; aligned spot below the frame
259Lmemcpy:
260 lbz r16,0($inp)
261 lbz r17,1($inp)
262 lbz r18,2($inp)
263 lbz r19,3($inp)
264 addi $inp,$inp,4
265 stb r16,0(r20)
266 stb r17,1(r20)
267 stb r18,2(r20)
268 stb r19,3(r20)
269 addi r20,r20,4
270 bdnz Lmemcpy
271
272 $PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp
273 addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer
274 addi $inp,$sp,$LOCALS ; fictitious inp pointer
275 $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num
276 $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer
277 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
278 bl Lsha2_block_private
279 $POP $inp,`$FRAME-$SIZE_T*26`($sp) ; restore real inp
280 $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num
281 addic. $num,$num,`-16*$SZ` ; num--
282 bne- Lunaligned
283
284Ldone:
285 $POP r0,`$FRAME+$LRSAVE`($sp)
286 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
287 $POP r13,`$FRAME-$SIZE_T*19`($sp)
288 $POP r14,`$FRAME-$SIZE_T*18`($sp)
289 $POP r15,`$FRAME-$SIZE_T*17`($sp)
290 $POP r16,`$FRAME-$SIZE_T*16`($sp)
291 $POP r17,`$FRAME-$SIZE_T*15`($sp)
292 $POP r18,`$FRAME-$SIZE_T*14`($sp)
293 $POP r19,`$FRAME-$SIZE_T*13`($sp)
294 $POP r20,`$FRAME-$SIZE_T*12`($sp)
295 $POP r21,`$FRAME-$SIZE_T*11`($sp)
296 $POP r22,`$FRAME-$SIZE_T*10`($sp)
297 $POP r23,`$FRAME-$SIZE_T*9`($sp)
298 $POP r24,`$FRAME-$SIZE_T*8`($sp)
299 $POP r25,`$FRAME-$SIZE_T*7`($sp)
300 $POP r26,`$FRAME-$SIZE_T*6`($sp)
301 $POP r27,`$FRAME-$SIZE_T*5`($sp)
302 $POP r28,`$FRAME-$SIZE_T*4`($sp)
303 $POP r29,`$FRAME-$SIZE_T*3`($sp)
304 $POP r30,`$FRAME-$SIZE_T*2`($sp)
305 $POP r31,`$FRAME-$SIZE_T*1`($sp)
306 mtlr r0
307 addi $sp,$sp,$FRAME
308 blr
309 .long 0
310 .byte 0,12,4,1,0x80,18,3,0
311 .long 0
312
313.align 4
314Lsha2_block_private:
315___
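Lunaligned/Lcross_page above only special-case the input block that actually straddles a 4096-byte page: whole blocks in front of the boundary are hashed in place, the straddling block is byte-copied to an aligned stack slot and hashed from there, and the loop resumes on the far side. The control flow, restated in C (an illustrative sketch; sha2_private stands in for Lsha2_block_private and BLK for 16*$SZ):

#include <stdint.h>
#include <string.h>

#define BLK 128  /* 16*$SZ for SHA-512 */

void sha2_private(void *ctx, const unsigned char *inp, size_t num);

/* num is a multiple of BLK; inp is word-misaligned on entry. */
static void hash_unaligned(void *ctx, const unsigned char *inp, size_t num)
{
    unsigned char tmp[BLK];

    while (num) {
        /* whole blocks between inp and the next page boundary */
        size_t dist = (4096 - ((uintptr_t)inp & 4095)) & ~(size_t)(BLK - 1);

        if (dist && num <= dist) {        /* nothing left crosses a page */
            sha2_private(ctx, inp, num);
            return;
        }
        if (dist) {                       /* hash up to the boundary */
            sha2_private(ctx, inp, dist);
            inp += dist; num -= dist;
        }
        memcpy(tmp, inp, BLK);            /* straddling block via stack */
        sha2_private(ctx, tmp, BLK);
        inp += BLK; num -= BLK;
    }
}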
316for($i=0;$i<16;$i++) {
317$code.=<<___ if ($SZ==4);
318 lwz @X[$i],`$i*$SZ`($inp)
319___
320# 64-bit loads are split into 2x32-bit ones, as the CPU can't handle
321# unaligned 64-bit loads, only 32-bit ones (see the sketch after this loop)...
322$code.=<<___ if ($SZ==8);
323 lwz $t0,`$i*$SZ`($inp)
324 lwz @X[$i],`$i*$SZ+4`($inp)
325 insrdi @X[$i],$t0,32,0
326___
327 &ROUND_00_15($i,@V);
328 unshift(@V,pop(@V));
329}
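The split load above, restated in C for a big-endian machine (sketch only; insrdi X,t0,32,0 deposits t0 into the upper 32 bits of X):

#include <stdint.h>

/* Assemble one big-endian 64-bit input word from two 32-bit loads;
 * here the hardware tolerates misaligned 32-bit accesses, whereas
 * plain C would require p to be 4-byte aligned. */
static uint64_t load_be64_split(const uint32_t *p)
{
    uint64_t hi = p[0];      /* lwz $t0,`$i*$SZ`($inp)      */
    uint64_t lo = p[1];      /* lwz @X[$i],`$i*$SZ+4`($inp) */
    return (hi << 32) | lo;  /* insrdi @X[$i],$t0,32,0      */
}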
330$code.=<<___;
331 li $T,`$rounds/16-1`
332 mtctr $T
333.align 4
334Lrounds:
335 addi $Tbl,$Tbl,`16*$SZ`
336___
337for(;$i<32;$i++) {
338 &ROUND_16_xx($i,@V);
339 unshift(@V,pop(@V));
340}
341$code.=<<___;
342 bdnz- Lrounds
343
344 $POP $ctx,`$FRAME-$SIZE_T*22`($sp)
345 $POP $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
346 $POP $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
347 subi $Tbl,$Tbl,`($rounds-16)*$SZ` ; rewind Tbl
348
349 $LD r16,`0*$SZ`($ctx)
350 $LD r17,`1*$SZ`($ctx)
351 $LD r18,`2*$SZ`($ctx)
352 $LD r19,`3*$SZ`($ctx)
353 $LD r20,`4*$SZ`($ctx)
354 $LD r21,`5*$SZ`($ctx)
355 $LD r22,`6*$SZ`($ctx)
356 addi $inp,$inp,`16*$SZ` ; advance inp
357 $LD r23,`7*$SZ`($ctx)
358 add $A,$A,r16
359 add $B,$B,r17
360 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp)
361 add $C,$C,r18
362 $ST $A,`0*$SZ`($ctx)
363 add $D,$D,r19
364 $ST $B,`1*$SZ`($ctx)
365 add $E,$E,r20
366 $ST $C,`2*$SZ`($ctx)
367 add $F,$F,r21
368 $ST $D,`3*$SZ`($ctx)
369 add $G,$G,r22
370 $ST $E,`4*$SZ`($ctx)
371 add $H,$H,r23
372 $ST $F,`5*$SZ`($ctx)
373 $ST $G,`6*$SZ`($ctx)
374 $UCMP $inp,$num
375 $ST $H,`7*$SZ`($ctx)
376 bne Lsha2_block_private
377 blr
378 .long 0
379 .byte 0,12,0x14,0,0,0,0,0
380___
381
382# Ugly hack here, because PPC assembler syntax seems to vary too
383# much from platform to platform...
384$code.=<<___;
385.align 6
386LPICmeup:
387 mflr r0
388 bcl 20,31,\$+4
389 mflr $Tbl ; vvvvvv "distance" between . and 1st data entry
390 addi $Tbl,$Tbl,`64-8`
391 mtlr r0
392 blr
393 .long 0
394 .byte 0,12,0x14,0,0,0,0,0
395 .space `64-9*4`
396___
397$code.=<<___ if ($SZ==8);
398 .long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
399 .long 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
400 .long 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
401 .long 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
402 .long 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
403 .long 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
404 .long 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
405 .long 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
406 .long 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
407 .long 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
408 .long 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
409 .long 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
410 .long 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
411 .long 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
412 .long 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
413 .long 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
414 .long 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
415 .long 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
416 .long 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
417 .long 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
418 .long 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
419 .long 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
420 .long 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
421 .long 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
422 .long 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
423 .long 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
424 .long 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
425 .long 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
426 .long 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
427 .long 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
428 .long 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
429 .long 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
430 .long 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
431 .long 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
432 .long 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
433 .long 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
434 .long 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
435 .long 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
436 .long 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
437 .long 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
438___
439$code.=<<___ if ($SZ==4);
440 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
441 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
442 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
443 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
444 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
445 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
446 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
447 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
448 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
449 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
450 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
451 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
452 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
453 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
454 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
455 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
456___
457
458$code =~ s/\`([^\`]*)\`/eval $1/gem;
459print $code;
460close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-s390x.pl b/src/lib/libcrypto/sha/asm/sha512-s390x.pl
deleted file mode 100644
index 079a3fc78a..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-s390x.pl
+++ /dev/null
@@ -1,322 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256/512 block procedures for s390x.
11
12# April 2007.
13#
14# sha256_block_data_order is reportedly >3 times faster than gcc 3.3
15# generated code (must be a compiler bug, as the improvement is
16# "pathologically" high, in particular in comparison to other SHA
17# modules). But the real twist is that it detects whether hardware
18# support for SHA256 is available and in that case utilizes it. Then
19# performance can reach >6.5x that of the assembler path for larger chunks.
20#
21# sha512_block_data_order is ~70% faster than gcc 3.3 generated code.
22
23# January 2009.
24#
25# Add support for hardware SHA512 and reschedule instructions to
26# favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
27# than software.
28
29# November 2010.
30#
31# Adapt for -m31 build. If kernel supports what's called "highgprs"
32# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
33# instructions and achieve "64-bit" performance even in 31-bit legacy
34# application context. The feature is not specific to any particular
35# processor, as long as it's a "z-CPU". The latter implies that the code
36# remains z/Architecture specific. On z900 SHA256 was measured to
37# perform 2.4x and SHA512 - 13x better than code generated by gcc 4.3.
38
39$flavour = shift;
40
41if ($flavour =~ /3[12]/) {
42 $SIZE_T=4;
43 $g="";
44} else {
45 $SIZE_T=8;
46 $g="g";
47}
48
49$t0="%r0";
50$t1="%r1";
51$ctx="%r2"; $t2="%r2";
52$inp="%r3";
53$len="%r4"; # used as index in inner loop
54
55$A="%r5";
56$B="%r6";
57$C="%r7";
58$D="%r8";
59$E="%r9";
60$F="%r10";
61$G="%r11";
62$H="%r12"; @V=($A,$B,$C,$D,$E,$F,$G,$H);
63$tbl="%r13";
64$T1="%r14";
65$sp="%r15";
66
67while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
68open STDOUT,">$output";
69
70if ($output =~ /512/) {
71 $label="512";
72 $SZ=8;
73 $LD="lg"; # load from memory
74 $ST="stg"; # store to memory
75 $ADD="alg"; # add with memory operand
76 $ROT="rllg"; # rotate left
77 $SHR="srlg"; # logical right shift [see the substitution at the end]
78 @Sigma0=(25,30,36);
79 @Sigma1=(23,46,50);
80 @sigma0=(56,63, 7);
81 @sigma1=( 3,45, 6);
82 $rounds=80;
83 $kimdfunc=3; # 0 means unknown/unsupported/unimplemented/disabled
84} else {
85 $label="256";
86 $SZ=4;
87 $LD="llgf"; # load from memory
88 $ST="st"; # store to memory
89 $ADD="al"; # add with memory operand
90 $ROT="rll"; # rotate left
91 $SHR="srl"; # logical right shift
92 @Sigma0=(10,19,30);
93 @Sigma1=( 7,21,26);
94 @sigma0=(14,25, 3);
95 @sigma1=(13,15,10);
96 $rounds=64;
97 $kimdfunc=2; # magic function code for kimd instruction
98}
99$Func="sha${label}_block_data_order";
100$Table="K${label}";
101$stdframe=16*$SIZE_T+4*8;
102$frame=$stdframe+16*$SZ;
103
104sub BODY_00_15 {
105my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
106
107$code.=<<___ if ($i<16);
108 $LD $T1,`$i*$SZ`($inp) ### $i
109___
110$code.=<<___;
111 $ROT $t0,$e,$Sigma1[0]
112 $ROT $t1,$e,$Sigma1[1]
113 lgr $t2,$f
114 xgr $t0,$t1
115 $ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]`
116 xgr $t2,$g
117 $ST $T1,`$stdframe+$SZ*($i%16)`($sp)
118 xgr $t0,$t1 # Sigma1(e)
119 algr $T1,$h # T1+=h
120 ngr $t2,$e
121 lgr $t1,$a
122 algr $T1,$t0 # T1+=Sigma1(e)
123 $ROT $h,$a,$Sigma0[0]
124 xgr $t2,$g # Ch(e,f,g)
125 $ADD $T1,`$i*$SZ`($len,$tbl) # T1+=K[i]
126 $ROT $t0,$a,$Sigma0[1]
127 algr $T1,$t2 # T1+=Ch(e,f,g)
128 ogr $t1,$b
129 xgr $h,$t0
130 lgr $t2,$a
131 ngr $t1,$c
132 $ROT $t0,$t0,`$Sigma0[2]-$Sigma0[1]`
133 xgr $h,$t0 # h=Sigma0(a)
134 ngr $t2,$b
135 algr $h,$T1 # h+=T1
136 ogr $t2,$t1 # Maj(a,b,c)
137 algr $d,$T1 # d+=T1
138 algr $h,$t2 # h+=Maj(a,b,c)
139___
140}
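# Editor's sketch (not in the original module): the sequence above is a
# standard FIPS 180 round; in plain Perl, assuming a 64-bit perl for
# the $SZ==8 case:
#
#   sub rotr { my ($x,$n,$w) = @_;
#              (($x >> $n) | ($x << ($w - $n))) & (~0 >> (64 - $w)) }
#   sub Ch   { my ($e,$f,$g) = @_; (($f ^ $g) & $e) ^ $g }  # == (e&f)^(~e&g)
#   sub Maj  { my ($a,$b,$c) = @_; (($a | $b) & $c) | ($a & $b) }
#
# The ((f^g)&e)^g and ((a|b)&c)|(a&b) forms save one temporary each
# versus the textbook Ch/Maj expressions, which is why the code uses them.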
141
142sub BODY_16_XX {
143my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
144
145$code.=<<___;
146 $LD $T1,`$stdframe+$SZ*(($i+1)%16)`($sp) ### $i
147 $LD $t1,`$stdframe+$SZ*(($i+14)%16)`($sp)
148 $ROT $t0,$T1,$sigma0[0]
149 $SHR $T1,$sigma0[2]
150 $ROT $t2,$t0,`$sigma0[1]-$sigma0[0]`
151 xgr $T1,$t0
152 $ROT $t0,$t1,$sigma1[0]
153 xgr $T1,$t2 # sigma0(X[i+1])
154 $SHR $t1,$sigma1[2]
155 $ADD $T1,`$stdframe+$SZ*($i%16)`($sp) # +=X[i]
156 xgr $t1,$t0
157 $ROT $t0,$t0,`$sigma1[1]-$sigma1[0]`
158 $ADD $T1,`$stdframe+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
159 xgr $t1,$t0 # sigma1(X[i+14])
160 algr $T1,$t1 # +=sigma1(X[i+14])
161___
162 &BODY_00_15(@_);
163}
164
165$code.=<<___;
166.text
167.align 64
168.type $Table,\@object
169$Table:
170___
171$code.=<<___ if ($SZ==4);
172 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
173 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
174 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
175 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
176 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
177 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
178 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
179 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
180 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
181 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
182 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
183 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
184 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
185 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
186 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
187 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
188___
189$code.=<<___ if ($SZ==8);
190 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
191 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
192 .quad 0x3956c25bf348b538,0x59f111f1b605d019
193 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
194 .quad 0xd807aa98a3030242,0x12835b0145706fbe
195 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
196 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
197 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
198 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
199 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
200 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
201 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
202 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
203 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
204 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
205 .quad 0x06ca6351e003826f,0x142929670a0e6e70
206 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
207 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
208 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
209 .quad 0x81c2c92e47edaee6,0x92722c851482353b
210 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
211 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
212 .quad 0xd192e819d6ef5218,0xd69906245565a910
213 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
214 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
215 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
216 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
217 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
218 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
219 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
220 .quad 0x90befffa23631e28,0xa4506cebde82bde9
221 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
222 .quad 0xca273eceea26619c,0xd186b8c721c0c207
223 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
224 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
225 .quad 0x113f9804bef90dae,0x1b710b35131c471b
226 .quad 0x28db77f523047d84,0x32caab7b40c72493
227 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
228 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
229 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
230___
231$code.=<<___;
232.size $Table,.-$Table
233.globl $Func
234.type $Func,\@function
235$Func:
236 sllg $len,$len,`log(16*$SZ)/log(2)`
237___
238$code.=<<___ if ($kimdfunc);
239 larl %r1,OPENSSL_s390xcap_P
240 lg %r0,0(%r1)
241 tmhl %r0,0x4000 # check for message-security assist
242 jz .Lsoftware
243 lghi %r0,0
244 la %r1,`2*$SIZE_T`($sp)
245 .long 0xb93e0002 # kimd %r0,%r2
246 lg %r0,`2*$SIZE_T`($sp)
247 tmhh %r0,`0x8000>>$kimdfunc`
248 jz .Lsoftware
249 lghi %r0,$kimdfunc
250 lgr %r1,$ctx
251 lgr %r2,$inp
252 lgr %r3,$len
253 .long 0xb93e0002 # kimd %r0,%r2
254 brc 1,.-4 # pay attention to "partial completion"
255 br %r14
256.align 16
257.Lsoftware:
258___
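# Editor's note (my reading of the probe above, flagged as such): the
# first kimd, with function code 0 in %r0, is the query function; it
# stores a 16-byte bitmap of supported function codes at the parameter
# block in %r1, and tmhh tests bit $kimdfunc of it (2 = SHA-256,
# 3 = SHA-512, most significant bit first).  The second kimd performs
# the actual hashing, and "brc 1,.-4" re-issues it for as long as the
# CPU signals partial completion (condition code 3).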
259$code.=<<___;
260 lghi %r1,-$frame
261 la $len,0($len,$inp)
262 stm${g} $ctx,%r15,`2*$SIZE_T`($sp)
263 lgr %r0,$sp
264 la $sp,0(%r1,$sp)
265 st${g} %r0,0($sp)
266
267 larl $tbl,$Table
268 $LD $A,`0*$SZ`($ctx)
269 $LD $B,`1*$SZ`($ctx)
270 $LD $C,`2*$SZ`($ctx)
271 $LD $D,`3*$SZ`($ctx)
272 $LD $E,`4*$SZ`($ctx)
273 $LD $F,`5*$SZ`($ctx)
274 $LD $G,`6*$SZ`($ctx)
275 $LD $H,`7*$SZ`($ctx)
276
277.Lloop:
278 lghi $len,0
279___
280for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
281$code.=".Lrounds_16_xx:\n";
282for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
283$code.=<<___;
284 aghi $len,`16*$SZ`
285 lghi $t0,`($rounds-16)*$SZ`
286 clgr $len,$t0
287 jne .Lrounds_16_xx
288
289 l${g} $ctx,`$frame+2*$SIZE_T`($sp)
290 la $inp,`16*$SZ`($inp)
291 $ADD $A,`0*$SZ`($ctx)
292 $ADD $B,`1*$SZ`($ctx)
293 $ADD $C,`2*$SZ`($ctx)
294 $ADD $D,`3*$SZ`($ctx)
295 $ADD $E,`4*$SZ`($ctx)
296 $ADD $F,`5*$SZ`($ctx)
297 $ADD $G,`6*$SZ`($ctx)
298 $ADD $H,`7*$SZ`($ctx)
299 $ST $A,`0*$SZ`($ctx)
300 $ST $B,`1*$SZ`($ctx)
301 $ST $C,`2*$SZ`($ctx)
302 $ST $D,`3*$SZ`($ctx)
303 $ST $E,`4*$SZ`($ctx)
304 $ST $F,`5*$SZ`($ctx)
305 $ST $G,`6*$SZ`($ctx)
306 $ST $H,`7*$SZ`($ctx)
307 cl${g} $inp,`$frame+4*$SIZE_T`($sp)
308 jne .Lloop
309
310 lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
311 br %r14
312.size $Func,.-$Func
313.string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
314.comm OPENSSL_s390xcap_P,16,8
315___
316
317$code =~ s/\`([^\`]*)\`/eval $1/gem;
318# unlike the 32-bit shift, the 64-bit one takes three arguments
319$code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;
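# (editor's example: "srlg %r1,7" becomes "srlg %r1,%r1,7")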
320
321print $code;
322close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
deleted file mode 100644
index 585740789e..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
+++ /dev/null
@@ -1,594 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256 performance improvement over compiler generated code varies
11# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
12# build]. Just like in the SHA1 module, I aim to ensure scalability on
13# UltraSPARC T1 by packing X[16] to 8 64-bit registers.
14
15# SHA512 on pre-T1 UltraSPARC.
16#
17# Performance is >75% better than 64-bit code generated by Sun C and
18# over 2x better than 32-bit code. X[16] resides on the stack, but
19# access to it is scheduled for L2 latency and staged through the 32
20# least significant bits of %l0-%l7. The latter is done to achieve
21# 32-/64-bit ABI duality. Nevertheless it's ~40% faster than SHA256, which is pretty
22# good [optimal coefficient is 50%].
23#
24# SHA512 on UltraSPARC T1.
25#
26# It's not any faster than 64-bit code generated by Sun C 5.8. This is
27# because the 64-bit code generator has the advantage of using 64-bit
28# loads(*) to access X[16], which I consciously traded for 32-/64-bit
29# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
30# code by 60%, not to mention that it doesn't suffer from severe decay
31# when running 4 times physical cores threads and that it leaves gcc
32# [3.4] behind by over 4x factor! If compared to SHA256, single thread
33# performance is only 10% better, but overall throughput for maximum
34# amount of threads for given CPU exceeds corresponding one of SHA256
35# by 30% [again, optimal coefficient is 50%].
36#
37# (*) Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
38#     in-order, i.e. a load instruction has to complete before the next
39#     instruction in the same thread is executed, even if the latter is
40# not dependent on load result! This means that on T1 two 32-bit
41# loads are always slower than one 64-bit load. Once again this
42# is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
43# 2x32-bit loads can be as fast as 1x64-bit ones.
44
45$bits=32;
46for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
47if ($bits==64) { $bias=2047; $frame=192; }
48else { $bias=0; $frame=112; }
49
50$output=shift;
51open STDOUT,">$output";
52
53if ($output =~ /512/) {
54 $label="512";
55 $SZ=8;
56 $LD="ldx"; # load from memory
57 $ST="stx"; # store to memory
58 $SLL="sllx"; # shift left logical
59 $SRL="srlx"; # shift right logical
60 @Sigma0=(28,34,39);
61 @Sigma1=(14,18,41);
62 @sigma0=( 7, 1, 8); # right shift first
63 @sigma1=( 6,19,61); # right shift first
64 $lastK=0x817;
65 $rounds=80;
66 $align=4;
67
68 $locals=16*$SZ; # X[16]
69
70 $A="%o0";
71 $B="%o1";
72 $C="%o2";
73 $D="%o3";
74 $E="%o4";
75 $F="%o5";
76 $G="%g1";
77 $H="%o7";
78 @V=($A,$B,$C,$D,$E,$F,$G,$H);
79} else {
80 $label="256";
81 $SZ=4;
82 $LD="ld"; # load from memory
83 $ST="st"; # store to memory
84 $SLL="sll"; # shift left logical
85 $SRL="srl"; # shift right logical
86 @Sigma0=( 2,13,22);
87 @Sigma1=( 6,11,25);
88 @sigma0=( 3, 7,18); # right shift first
89 @sigma1=(10,17,19); # right shift first
90 $lastK=0x8f2;
91 $rounds=64;
92 $align=8;
93
94 $locals=0; # X[16] is register resident
95 @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
96
97 $A="%l0";
98 $B="%l1";
99 $C="%l2";
100 $D="%l3";
101 $E="%l4";
102 $F="%l5";
103 $G="%l6";
104 $H="%l7";
105 @V=($A,$B,$C,$D,$E,$F,$G,$H);
106}
107$T1="%g2";
108$tmp0="%g3";
109$tmp1="%g4";
110$tmp2="%g5";
111
112$ctx="%i0";
113$inp="%i1";
114$len="%i2";
115$Ktbl="%i3";
116$tmp31="%i4";
117$tmp32="%i5";
118
119########### SHA256
120$Xload = sub {
121my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
122
123 if ($i==0) {
124$code.=<<___;
125 ldx [$inp+0],@X[0]
126 ldx [$inp+16],@X[2]
127 ldx [$inp+32],@X[4]
128 ldx [$inp+48],@X[6]
129 ldx [$inp+8],@X[1]
130 ldx [$inp+24],@X[3]
131 subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
132 ldx [$inp+40],@X[5]
133 bz,pt %icc,.Laligned
134 ldx [$inp+56],@X[7]
135
136 sllx @X[0],$tmp31,@X[0]
137 ldx [$inp+64],$T1
138___
139for($j=0;$j<7;$j++)
140{ $code.=<<___;
141 srlx @X[$j+1],$tmp32,$tmp1
142 sllx @X[$j+1],$tmp31,@X[$j+1]
143 or $tmp1,@X[$j],@X[$j]
144___
145}
146$code.=<<___;
147 srlx $T1,$tmp32,$T1
148 or $T1,@X[7],@X[7]
149.Laligned:
150___
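# Editor's note on "-$tmp31 works too" above: SPARCv9 sllx/srlx take
# only the low six bits of a register shift count, so 0-$tmp31 is
# congruent to 64-$tmp31 (mod 64) and produces the same shift for any
# non-zero misalignment (the aligned case branches to .Laligned and
# never uses it).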
151 }
152
153 if ($i&1) {
154 $code.="\tadd @X[$i/2],$h,$T1\n";
155 } else {
156 $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
157 }
158} if ($SZ==4);
159
160########### SHA512
161$Xload = sub {
162my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
163my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
164
165$code.=<<___ if ($i==0);
166 ld [$inp+0],%l0
167 ld [$inp+4],%l1
168 ld [$inp+8],%l2
169 ld [$inp+12],%l3
170 ld [$inp+16],%l4
171 ld [$inp+20],%l5
172 ld [$inp+24],%l6
173 ld [$inp+28],%l7
174___
175$code.=<<___ if ($i<15);
176 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
177 add $tmp31,32,$tmp0
178 sllx @pair[0],$tmp0,$tmp1
179 `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
180 srlx @pair[2],$tmp32,@pair[1]
181 or $tmp1,$tmp2,$tmp2
182 or @pair[1],$tmp2,$tmp2
183 `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
184 add $h,$tmp2,$T1
185 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
186___
187$code.=<<___ if ($i==12);
188 brnz,a $tmp31,.+8
189 ld [$inp+128],%l0
190___
191$code.=<<___ if ($i==15);
192 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
193 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
194 add $tmp31,32,$tmp0
195 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
196 sllx @pair[0],$tmp0,$tmp1
197 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
198 srlx @pair[2],$tmp32,@pair[1]
199 or $tmp1,$tmp2,$tmp2
200 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
201 or @pair[1],$tmp2,$tmp2
202 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
203 add $h,$tmp2,$T1
204 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
205 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
206 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
207 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
208___
209} if ($SZ==8);
210
211########### common
212sub BODY_00_15 {
213my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
214
215 if ($i<16) {
216 &$Xload(@_);
217 } else {
218 $code.="\tadd $h,$T1,$T1\n";
219 }
220
221$code.=<<___;
222 $SRL $e,@Sigma1[0],$h !! $i
223 xor $f,$g,$tmp2
224 $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
225 and $e,$tmp2,$tmp2
226 $SRL $e,@Sigma1[1],$tmp0
227 xor $tmp1,$h,$h
228 $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
229 xor $tmp0,$h,$h
230 $SRL $e,@Sigma1[2],$tmp0
231 xor $tmp1,$h,$h
232 $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
233 xor $tmp0,$h,$h
234 xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
235 xor $tmp1,$h,$tmp0 ! Sigma1(e)
236
237 $SRL $a,@Sigma0[0],$h
238 add $tmp2,$T1,$T1
239 $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
240 $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
241 add $tmp0,$T1,$T1
242 $SRL $a,@Sigma0[1],$tmp0
243 xor $tmp1,$h,$h
244 $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
245 xor $tmp0,$h,$h
246 $SRL $a,@Sigma0[2],$tmp0
247 xor $tmp1,$h,$h
248 $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
249 xor $tmp0,$h,$h
250 xor $tmp1,$h,$h ! Sigma0(a)
251
252 or $a,$b,$tmp0
253 and $a,$b,$tmp1
254 and $c,$tmp0,$tmp0
255 or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
256 add $tmp2,$T1,$T1 ! +=K[$i]
257 add $tmp1,$h,$h
258
259 add $T1,$d,$d
260 add $T1,$h,$h
261___
262}
263
264########### SHA256
265$BODY_16_XX = sub {
266my $i=@_[0];
267my $xi;
268
269 if ($i&1) {
270 $xi=$tmp32;
271 $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
272 } else {
273 $xi=@X[(($i+1)/2)%8];
274 }
275$code.=<<___;
276 srl $xi,@sigma0[0],$T1 !! Xupdate($i)
277 sll $xi,`32-@sigma0[2]`,$tmp1
278 srl $xi,@sigma0[1],$tmp0
279 xor $tmp1,$T1,$T1
280 sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
281 xor $tmp0,$T1,$T1
282 srl $xi,@sigma0[2],$tmp0
283 xor $tmp1,$T1,$T1
284___
285 if ($i&1) {
286 $xi=@X[(($i+14)/2)%8];
287 } else {
288 $xi=$tmp32;
289 $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
290 }
291$code.=<<___;
292 srl $xi,@sigma1[0],$tmp2
293 xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
294 sll $xi,`32-@sigma1[2]`,$tmp1
295 srl $xi,@sigma1[1],$tmp0
296 xor $tmp1,$tmp2,$tmp2
297 sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
298 xor $tmp0,$tmp2,$tmp2
299 srl $xi,@sigma1[2],$tmp0
300 xor $tmp1,$tmp2,$tmp2
301___
302 if ($i&1) {
303 $xi=@X[($i/2)%8];
304$code.=<<___;
305 srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
306 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
307 srl @X[($i/2)%8],0,$tmp0
308 add $tmp2,$tmp1,$tmp1
309 add $xi,$T1,$T1 ! +=X[i]
310 xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
311 add $tmp1,$T1,$T1
312
313 srl $T1,0,$T1
314 or $T1,@X[($i/2)%8],@X[($i/2)%8]
315___
316 } else {
317 $xi=@X[(($i+9)/2)%8];
318$code.=<<___;
319 srlx @X[($i/2)%8],32,$tmp1 ! X[i]
320 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
321 add $xi,$T1,$T1 ! +=X[i+9]
322 add $tmp2,$tmp1,$tmp1
323 srl @X[($i/2)%8],0,@X[($i/2)%8]
324 add $tmp1,$T1,$T1
325
326 sllx $T1,32,$tmp0
327 or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
328___
329 }
330 &BODY_00_15(@_);
331} if ($SZ==4);
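# Editor's summary of the packing used above (descriptive only): in the
# SHA256 path X[16] lives two 32-bit words per 64-bit register, with
# even-indexed words in the upper halves of @X[0..7]; the even/odd
# branches pick the needed half out with srlx, and the closing sllx/or
# splices the freshly computed word back into its slot.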
332
333########### SHA512
334$BODY_16_XX = sub {
335my $i=@_[0];
336my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
337
338$code.=<<___;
339 sllx %l2,32,$tmp0 !! Xupdate($i)
340 or %l3,$tmp0,$tmp0
341
342 srlx $tmp0,@sigma0[0],$T1
343 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
344 sllx $tmp0,`64-@sigma0[2]`,$tmp1
345 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
346 srlx $tmp0,@sigma0[1],$tmp0
347 xor $tmp1,$T1,$T1
348 sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
349 xor $tmp0,$T1,$T1
350 srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
351 xor $tmp1,$T1,$T1
352 sllx %l6,32,$tmp2
353 xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
354 or %l7,$tmp2,$tmp2
355
356 srlx $tmp2,@sigma1[0],$tmp1
357 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
358 sllx $tmp2,`64-@sigma1[2]`,$tmp0
359 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
360 srlx $tmp2,@sigma1[1],$tmp2
361 xor $tmp0,$tmp1,$tmp1
362 sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
363 xor $tmp2,$tmp1,$tmp1
364 srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
365 xor $tmp0,$tmp1,$tmp1
366 sllx %l4,32,$tmp0
367 xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
368 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
369 or %l5,$tmp0,$tmp0
370 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
371
372 sllx %l0,32,$tmp2
373 add $tmp1,$T1,$T1
374 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
375 or %l1,$tmp2,$tmp2
376 add $tmp0,$T1,$T1 ! +=X[$i+9]
377 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
378 add $tmp2,$T1,$T1 ! +=X[$i]
379 $ST $T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
380___
381 &BODY_00_15(@_);
382} if ($SZ==8);
383
384$code.=<<___ if ($bits==64);
385.register %g2,#scratch
386.register %g3,#scratch
387___
388$code.=<<___;
389.section ".text",#alloc,#execinstr
390
391.align 64
392K${label}:
393.type K${label},#object
394___
395if ($SZ==4) {
396$code.=<<___;
397 .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
398 .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
399 .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
400 .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
401 .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
402 .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
403 .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
404 .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
405 .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
406 .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
407 .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
408 .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
409 .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
410 .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
411 .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
412 .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
413___
414} else {
415$code.=<<___;
416 .long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
417 .long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
418 .long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
419 .long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
420 .long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
421 .long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
422 .long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
423 .long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
424 .long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
425 .long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
426 .long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
427 .long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
428 .long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
429 .long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
430 .long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
431 .long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
432 .long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
433 .long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
434 .long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
435 .long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
436 .long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
437 .long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
438 .long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
439 .long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
440 .long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
441 .long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
442 .long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
443 .long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
444 .long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
445 .long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
446 .long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
447 .long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
448 .long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
449 .long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
450 .long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
451 .long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
452 .long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
453 .long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
454 .long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
455 .long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
456___
457}
458$code.=<<___;
459.size K${label},.-K${label}
460.globl sha${label}_block_data_order
461sha${label}_block_data_order:
462 save %sp,`-$frame-$locals`,%sp
463 and $inp,`$align-1`,$tmp31
464 sllx $len,`log(16*$SZ)/log(2)`,$len
465 andn $inp,`$align-1`,$inp
466 sll $tmp31,3,$tmp31
467 add $inp,$len,$len
468___
469$code.=<<___ if ($SZ==8); # SHA512
470 mov 32,$tmp32
471 sub $tmp32,$tmp31,$tmp32
472___
473$code.=<<___;
474.Lpic: call .+8
475 add %o7,K${label}-.Lpic,$Ktbl
476
477 $LD [$ctx+`0*$SZ`],$A
478 $LD [$ctx+`1*$SZ`],$B
479 $LD [$ctx+`2*$SZ`],$C
480 $LD [$ctx+`3*$SZ`],$D
481 $LD [$ctx+`4*$SZ`],$E
482 $LD [$ctx+`5*$SZ`],$F
483 $LD [$ctx+`6*$SZ`],$G
484 $LD [$ctx+`7*$SZ`],$H
485
486.Lloop:
487___
488for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
489$code.=".L16_xx:\n";
490for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
491$code.=<<___;
492 and $tmp2,0xfff,$tmp2
493 cmp $tmp2,$lastK
494 bne .L16_xx
495 add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
496
497___
498$code.=<<___ if ($SZ==4); # SHA256
499 $LD [$ctx+`0*$SZ`],@X[0]
500 $LD [$ctx+`1*$SZ`],@X[1]
501 $LD [$ctx+`2*$SZ`],@X[2]
502 $LD [$ctx+`3*$SZ`],@X[3]
503 $LD [$ctx+`4*$SZ`],@X[4]
504 $LD [$ctx+`5*$SZ`],@X[5]
505 $LD [$ctx+`6*$SZ`],@X[6]
506 $LD [$ctx+`7*$SZ`],@X[7]
507
508 add $A,@X[0],$A
509 $ST $A,[$ctx+`0*$SZ`]
510 add $B,@X[1],$B
511 $ST $B,[$ctx+`1*$SZ`]
512 add $C,@X[2],$C
513 $ST $C,[$ctx+`2*$SZ`]
514 add $D,@X[3],$D
515 $ST $D,[$ctx+`3*$SZ`]
516 add $E,@X[4],$E
517 $ST $E,[$ctx+`4*$SZ`]
518 add $F,@X[5],$F
519 $ST $F,[$ctx+`5*$SZ`]
520 add $G,@X[6],$G
521 $ST $G,[$ctx+`6*$SZ`]
522 add $H,@X[7],$H
523 $ST $H,[$ctx+`7*$SZ`]
524___
525$code.=<<___ if ($SZ==8); # SHA512
526 ld [$ctx+`0*$SZ+0`],%l0
527 ld [$ctx+`0*$SZ+4`],%l1
528 ld [$ctx+`1*$SZ+0`],%l2
529 ld [$ctx+`1*$SZ+4`],%l3
530 ld [$ctx+`2*$SZ+0`],%l4
531 ld [$ctx+`2*$SZ+4`],%l5
532 ld [$ctx+`3*$SZ+0`],%l6
533
534 sllx %l0,32,$tmp0
535 ld [$ctx+`3*$SZ+4`],%l7
536 sllx %l2,32,$tmp1
537 or %l1,$tmp0,$tmp0
538 or %l3,$tmp1,$tmp1
539 add $tmp0,$A,$A
540 add $tmp1,$B,$B
541 $ST $A,[$ctx+`0*$SZ`]
542 sllx %l4,32,$tmp2
543 $ST $B,[$ctx+`1*$SZ`]
544 sllx %l6,32,$T1
545 or %l5,$tmp2,$tmp2
546 or %l7,$T1,$T1
547 add $tmp2,$C,$C
548 $ST $C,[$ctx+`2*$SZ`]
549 add $T1,$D,$D
550 $ST $D,[$ctx+`3*$SZ`]
551
552 ld [$ctx+`4*$SZ+0`],%l0
553 ld [$ctx+`4*$SZ+4`],%l1
554 ld [$ctx+`5*$SZ+0`],%l2
555 ld [$ctx+`5*$SZ+4`],%l3
556 ld [$ctx+`6*$SZ+0`],%l4
557 ld [$ctx+`6*$SZ+4`],%l5
558 ld [$ctx+`7*$SZ+0`],%l6
559
560 sllx %l0,32,$tmp0
561 ld [$ctx+`7*$SZ+4`],%l7
562 sllx %l2,32,$tmp1
563 or %l1,$tmp0,$tmp0
564 or %l3,$tmp1,$tmp1
565 add $tmp0,$E,$E
566 add $tmp1,$F,$F
567 $ST $E,[$ctx+`4*$SZ`]
568 sllx %l4,32,$tmp2
569 $ST $F,[$ctx+`5*$SZ`]
570 sllx %l6,32,$T1
571 or %l5,$tmp2,$tmp2
572 or %l7,$T1,$T1
573 add $tmp2,$G,$G
574 $ST $G,[$ctx+`6*$SZ`]
575 add $T1,$H,$H
576 $ST $H,[$ctx+`7*$SZ`]
577___
578$code.=<<___;
579 add $inp,`16*$SZ`,$inp ! advance inp
580 cmp $inp,$len
581 bne `$bits==64?"%xcc":"%icc"`,.Lloop
582 sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
583
584 ret
585 restore
586.type sha${label}_block_data_order,#function
587.size sha${label}_block_data_order,(.-sha${label}_block_data_order)
588.asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
589.align 4
590___
591
592$code =~ s/\`([^\`]*)\`/eval $1/gem;
593print $code;
594close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl b/src/lib/libcrypto/sha/asm/sha512-x86_64.pl
deleted file mode 100755
index f611a2d898..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl
+++ /dev/null
@@ -1,450 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary
6# forms are granted according to the OpenSSL license.
7# ====================================================================
8#
9# sha256/512_block procedure for x86_64.
10#
11# 40% improvement over compiler-generated code on Opteron. On EM64T
12# sha256 was observed to run >80% faster and sha512 - >40%. No magical
13# tricks, just straight implementation... I really wonder why gcc
14# [being armed with inline assembler] fails to generate as fast code.
15# The only thing which is cool about this module is that it's the very
16# same instruction sequence used for both SHA-256 and SHA-512. In the
17# former case the instructions operate on 32-bit operands, while in the
18# latter - on 64-bit ones. All I had to do was get one flavor right;
19# the other one passed the test right away:-)
20#
21# sha256_block runs in ~1005 cycles on Opteron, which gives you
22# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
23# frequency in GHz. sha512_block runs in ~1275 cycles, which results
24# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
25# Well, if you compare it to IA-64 implementation, which maintains
26# X[16] in register bank[!], tends to 4 instructions per CPU clock
27# cycle and runs in 1003 cycles, 1275 is very good result for 3-way
28# issue Opteron pipeline and X[16] maintained in memory. So that *if*
29# there is a way to improve it, *then* the only way would be to try to
30# offload X[16] updates to SSE unit, but that would require "deeper"
31# loop unroll, which in turn would naturally cause size blow-up, not
32# to mention increased complexity! And once again, only *if* it's
33# actually possible to noticeably improve overall ILP, instruction
34# level parallelism, on a given CPU implementation in this case.
35#
36# Special note on Intel EM64T. While Opteron CPU exhibits perfect
37# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
38# [currently available] EM64T CPUs apparently are far from it. On the
39# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
40# sha256_block:-( This is presumably because 64-bit shifts/rotates
41# are apparently not hardwired instructions, but implemented in microcode.
42
43$flavour = shift;
44$output = shift;
45if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
46
47$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
48
49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
51( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
52die "can't locate x86_64-xlate.pl";
53
54open STDOUT,"| $^X $xlate $flavour $output";
55
56if ($output =~ /512/) {
57 $func="sha512_block_data_order";
58 $TABLE="K512";
59 $SZ=8;
60 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
61 "%r8", "%r9", "%r10","%r11");
62 ($T1,$a0,$a1,$a2)=("%r12","%r13","%r14","%r15");
63 @Sigma0=(28,34,39);
64 @Sigma1=(14,18,41);
65 @sigma0=(1, 8, 7);
66 @sigma1=(19,61, 6);
67 $rounds=80;
68} else {
69 $func="sha256_block_data_order";
70 $TABLE="K256";
71 $SZ=4;
72 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
73 "%r8d","%r9d","%r10d","%r11d");
74 ($T1,$a0,$a1,$a2)=("%r12d","%r13d","%r14d","%r15d");
75 @Sigma0=( 2,13,22);
76 @Sigma1=( 6,11,25);
77 @sigma0=( 7,18, 3);
78 @sigma1=(17,19,10);
79 $rounds=64;
80}
81
82$ctx="%rdi"; # 1st arg
83$round="%rdi"; # zaps $ctx
84$inp="%rsi"; # 2nd arg
85$Tbl="%rbp";
86
87$_ctx="16*$SZ+0*8(%rsp)";
88$_inp="16*$SZ+1*8(%rsp)";
89$_end="16*$SZ+2*8(%rsp)";
90$_rsp="16*$SZ+3*8(%rsp)";
91$framesz="16*$SZ+4*8";
92
93
94sub ROUND_00_15()
95{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
96
97$code.=<<___;
98 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
99 mov $f,$a2
100 mov $T1,`$SZ*($i&0xf)`(%rsp)
101
102 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
103 xor $e,$a0
104 xor $g,$a2 # f^g
105
106 ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
107 add $h,$T1 # T1+=h
108 xor $a,$a1
109
110 add ($Tbl,$round,$SZ),$T1 # T1+=K[round]
111 and $e,$a2 # (f^g)&e
112 mov $b,$h
113
114 ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
115 xor $e,$a0
116 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
117
118 xor $c,$h # b^c
119 xor $a,$a1
120 add $a2,$T1 # T1+=Ch(e,f,g)
121 mov $b,$a2
122
123 ror \$$Sigma1[0],$a0 # Sigma1(e)
124 and $a,$h # h=(b^c)&a
125 and $c,$a2 # b&c
126
127 ror \$$Sigma0[0],$a1 # Sigma0(a)
128 add $a0,$T1 # T1+=Sigma1(e)
129 add $a2,$h # h+=b&c (completes +=Maj(a,b,c))
130
131 add $T1,$d # d+=T1
132 add $T1,$h # h+=T1
133 lea 1($round),$round # round++
134 add $a1,$h # h+=Sigma0(a)
135
136___
137}
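# Editor's note (not in the original): Maj(a,b,c) is assembled above as
# ((b^c)&a) + (b&c).  The two terms never have a set bit in the same
# position (a set bit in b&c forces the matching bit of b^c to zero),
# so ADD, XOR and OR all yield the same result; presumably ADD was
# chosen because it fits among the surrounding adds.  As a plain-Perl
# statement of the identity:
#
#   sub Maj { my ($a,$b,$c) = @_; (($b ^ $c) & $a) + ($b & $c) }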
138
139sub ROUND_16_XX()
140{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
141
142$code.=<<___;
143 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
144 mov `$SZ*(($i+14)&0xf)`(%rsp),$a1
145 mov $a0,$T1
146 mov $a1,$a2
147
148 ror \$`$sigma0[1]-$sigma0[0]`,$T1
149 xor $a0,$T1
150 shr \$$sigma0[2],$a0
151
152 ror \$$sigma0[0],$T1
153 xor $T1,$a0 # sigma0(X[(i+1)&0xf])
154 mov `$SZ*(($i+9)&0xf)`(%rsp),$T1
155
156 ror \$`$sigma1[1]-$sigma1[0]`,$a2
157 xor $a1,$a2
158 shr \$$sigma1[2],$a1
159
160 ror \$$sigma1[0],$a2
161 add $a0,$T1
162 xor $a2,$a1 # sigma1(X[(i+14)&0xf])
163
164 add `$SZ*($i&0xf)`(%rsp),$T1
165 mov $e,$a0
166 add $a1,$T1
167 mov $a,$a1
168___
169 &ROUND_00_15(@_);
170}
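# Editor's sketch of the message-schedule update the rounds above
# implement, spelled out for the SHA-512 parameter set (an illustration
# assuming a 64-bit perl; not part of the original script):
#
#   sub rotr64 { my ($x,$n) = @_; (($x >> $n) | ($x << (64 - $n))) & ~0 }
#   sub sigma0 { my $x = shift; rotr64($x, 1) ^ rotr64($x, 8) ^ ($x >> 7) }
#   sub sigma1 { my $x = shift; rotr64($x,19) ^ rotr64($x,61) ^ ($x >> 6) }
#
# with X[i&15] += sigma0(X[(i+1)&15]) + X[(i+9)&15] + sigma1(X[(i+14)&15]),
# the additions being performed modulo 2^64 in the registers.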
171
172$code=<<___;
173.text
174
175.globl $func
176.type $func,\@function,4
177.align 16
178$func:
179 push %rbx
180 push %rbp
181 push %r12
182 push %r13
183 push %r14
184 push %r15
185 mov %rsp,%r11 # copy %rsp
186 shl \$4,%rdx # num*16
187 sub \$$framesz,%rsp
188 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
189 and \$-64,%rsp # align stack frame
190 mov $ctx,$_ctx # save ctx, 1st arg
191 mov $inp,$_inp # save inp, 2nd arg
192 mov %rdx,$_end # save end pointer, "3rd" arg
193 mov %r11,$_rsp # save copy of %rsp
194.Lprologue:
195
196 lea $TABLE(%rip),$Tbl
197
198 mov $SZ*0($ctx),$A
199 mov $SZ*1($ctx),$B
200 mov $SZ*2($ctx),$C
201 mov $SZ*3($ctx),$D
202 mov $SZ*4($ctx),$E
203 mov $SZ*5($ctx),$F
204 mov $SZ*6($ctx),$G
205 mov $SZ*7($ctx),$H
206 jmp .Lloop
207
208.align 16
209.Lloop:
210 xor $round,$round
211___
212 for($i=0;$i<16;$i++) {
213 $code.=" mov $SZ*$i($inp),$T1\n";
214 $code.=" mov @ROT[4],$a0\n";
215 $code.=" mov @ROT[0],$a1\n";
216 $code.=" bswap $T1\n";
217 &ROUND_00_15($i,@ROT);
218 unshift(@ROT,pop(@ROT));
219 }
220$code.=<<___;
221 jmp .Lrounds_16_xx
222.align 16
223.Lrounds_16_xx:
224___
225 for(;$i<32;$i++) {
226 &ROUND_16_XX($i,@ROT);
227 unshift(@ROT,pop(@ROT));
228 }
229
230$code.=<<___;
231 cmp \$$rounds,$round
232 jb .Lrounds_16_xx
233
234 mov $_ctx,$ctx
235 lea 16*$SZ($inp),$inp
236
237 add $SZ*0($ctx),$A
238 add $SZ*1($ctx),$B
239 add $SZ*2($ctx),$C
240 add $SZ*3($ctx),$D
241 add $SZ*4($ctx),$E
242 add $SZ*5($ctx),$F
243 add $SZ*6($ctx),$G
244 add $SZ*7($ctx),$H
245
246 cmp $_end,$inp
247
248 mov $A,$SZ*0($ctx)
249 mov $B,$SZ*1($ctx)
250 mov $C,$SZ*2($ctx)
251 mov $D,$SZ*3($ctx)
252 mov $E,$SZ*4($ctx)
253 mov $F,$SZ*5($ctx)
254 mov $G,$SZ*6($ctx)
255 mov $H,$SZ*7($ctx)
256 jb .Lloop
257
258 mov $_rsp,%rsi
259 mov (%rsi),%r15
260 mov 8(%rsi),%r14
261 mov 16(%rsi),%r13
262 mov 24(%rsi),%r12
263 mov 32(%rsi),%rbp
264 mov 40(%rsi),%rbx
265 lea 48(%rsi),%rsp
266.Lepilogue:
267 ret
268.size $func,.-$func
269___
270
271if ($SZ==4) {
272$code.=<<___;
273.align 64
274.type $TABLE,\@object
275$TABLE:
276 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
277 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
278 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
279 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
280 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
281 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
282 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
283 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
284 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
285 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
286 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
287 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
288 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
289 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
290 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
291 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
292___
293} else {
294$code.=<<___;
295.align 64
296.type $TABLE,\@object
297$TABLE:
298 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
299 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
300 .quad 0x3956c25bf348b538,0x59f111f1b605d019
301 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
302 .quad 0xd807aa98a3030242,0x12835b0145706fbe
303 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
304 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
305 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
306 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
307 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
308 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
309 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
310 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
311 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
312 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
313 .quad 0x06ca6351e003826f,0x142929670a0e6e70
314 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
315 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
316 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
317 .quad 0x81c2c92e47edaee6,0x92722c851482353b
318 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
319 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
320 .quad 0xd192e819d6ef5218,0xd69906245565a910
321 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
322 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
323 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
324 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
325 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
326 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
327 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
328 .quad 0x90befffa23631e28,0xa4506cebde82bde9
329 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
330 .quad 0xca273eceea26619c,0xd186b8c721c0c207
331 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
332 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
333 .quad 0x113f9804bef90dae,0x1b710b35131c471b
334 .quad 0x28db77f523047d84,0x32caab7b40c72493
335 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
336 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
337 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
338___
339}
340
341# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
342# CONTEXT *context,DISPATCHER_CONTEXT *disp)
343if ($win64) {
344$rec="%rcx";
345$frame="%rdx";
346$context="%r8";
347$disp="%r9";
348
349$code.=<<___;
350.extern __imp_RtlVirtualUnwind
351.type se_handler,\@abi-omnipotent
352.align 16
353se_handler:
354 push %rsi
355 push %rdi
356 push %rbx
357 push %rbp
358 push %r12
359 push %r13
360 push %r14
361 push %r15
362 pushfq
363 sub \$64,%rsp
364
365 mov 120($context),%rax # pull context->Rax
366 mov 248($context),%rbx # pull context->Rip
367
368 lea .Lprologue(%rip),%r10
369 cmp %r10,%rbx # context->Rip<.Lprologue
370 jb .Lin_prologue
371
372 mov 152($context),%rax # pull context->Rsp
373
374 lea .Lepilogue(%rip),%r10
375 cmp %r10,%rbx # context->Rip>=.Lepilogue
376 jae .Lin_prologue
377
378 mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
379 lea 48(%rax),%rax
380
381 mov -8(%rax),%rbx
382 mov -16(%rax),%rbp
383 mov -24(%rax),%r12
384 mov -32(%rax),%r13
385 mov -40(%rax),%r14
386 mov -48(%rax),%r15
387 mov %rbx,144($context) # restore context->Rbx
388 mov %rbp,160($context) # restore context->Rbp
389 mov %r12,216($context) # restore context->R12
390 mov %r13,224($context) # restore context->R13
391 mov %r14,232($context) # restore context->R14
392 mov %r15,240($context) # restore context->R15
393
394.Lin_prologue:
395 mov 8(%rax),%rdi
396 mov 16(%rax),%rsi
397 mov %rax,152($context) # restore context->Rsp
398 mov %rsi,168($context) # restore context->Rsi
399 mov %rdi,176($context) # restore context->Rdi
400
401 mov 40($disp),%rdi # disp->ContextRecord
402 mov $context,%rsi # context
403 mov \$154,%ecx # sizeof(CONTEXT) in quadwords
404 .long 0xa548f3fc # cld; rep movsq
405
406 mov $disp,%rsi
407 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
408 mov 8(%rsi),%rdx # arg2, disp->ImageBase
409 mov 0(%rsi),%r8 # arg3, disp->ControlPc
410 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
411 mov 40(%rsi),%r10 # disp->ContextRecord
412 lea 56(%rsi),%r11 # &disp->HandlerData
413 lea 24(%rsi),%r12 # &disp->EstablisherFrame
414 mov %r10,32(%rsp) # arg5
415 mov %r11,40(%rsp) # arg6
416 mov %r12,48(%rsp) # arg7
417 mov %rcx,56(%rsp) # arg8, (NULL)
418 call *__imp_RtlVirtualUnwind(%rip)
419
420 mov \$1,%eax # ExceptionContinueSearch
421 add \$64,%rsp
422 popfq
423 pop %r15
424 pop %r14
425 pop %r13
426 pop %r12
427 pop %rbp
428 pop %rbx
429 pop %rdi
430 pop %rsi
431 ret
432.size se_handler,.-se_handler
433
434.section .pdata
435.align 4
436 .rva .LSEH_begin_$func
437 .rva .LSEH_end_$func
438 .rva .LSEH_info_$func
439
440.section .xdata
441.align 8
442.LSEH_info_$func:
443 .byte 9,0,0,0
444 .rva se_handler
445___
446}
447
448$code =~ s/\`([^\`]*)\`/eval $1/gem;
449print $code;
450close STDOUT;
diff --git a/src/lib/libcrypto/sha/sha.h b/src/lib/libcrypto/sha/sha.h
deleted file mode 100644
index 8a6bf4bbbb..0000000000
--- a/src/lib/libcrypto/sha/sha.h
+++ /dev/null
@@ -1,214 +0,0 @@
1/* crypto/sha/sha.h */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#ifndef HEADER_SHA_H
60#define HEADER_SHA_H
61
62#include <openssl/e_os2.h>
63#include <stddef.h>
64
65#ifdef __cplusplus
66extern "C" {
67#endif
68
69#if defined(OPENSSL_NO_SHA) || (defined(OPENSSL_NO_SHA0) && defined(OPENSSL_NO_SHA1))
70#error SHA is disabled.
71#endif
72
73#if defined(OPENSSL_FIPS)
74#define FIPS_SHA_SIZE_T size_t
75#endif
76
77/*
78 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
79 * ! SHA_LONG has to be at least 32 bits wide. If it's wider, then !
80 * ! SHA_LONG_LOG2 has to be defined along. !
81 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
82 */
83
84#if defined(__LP32__)
85#define SHA_LONG unsigned long
86#elif defined(OPENSSL_SYS_CRAY) || defined(__ILP64__)
87#define SHA_LONG unsigned long
88#define SHA_LONG_LOG2 3
89#else
90#define SHA_LONG unsigned int
91#endif
92
93#define SHA_LBLOCK 16
94#define SHA_CBLOCK (SHA_LBLOCK*4) /* SHA treats input data as a
95 * contiguous array of 32 bit
96 * wide big-endian values. */
97#define SHA_LAST_BLOCK (SHA_CBLOCK-8)
98#define SHA_DIGEST_LENGTH 20
99
100typedef struct SHAstate_st
101 {
102 SHA_LONG h0,h1,h2,h3,h4;
103 SHA_LONG Nl,Nh;
104 SHA_LONG data[SHA_LBLOCK];
105 unsigned int num;
106 } SHA_CTX;
107
108#ifndef OPENSSL_NO_SHA0
109#ifdef OPENSSL_FIPS
110int private_SHA_Init(SHA_CTX *c);
111#endif
112int SHA_Init(SHA_CTX *c);
113int SHA_Update(SHA_CTX *c, const void *data, size_t len);
114int SHA_Final(unsigned char *md, SHA_CTX *c);
115unsigned char *SHA(const unsigned char *d, size_t n, unsigned char *md);
116void SHA_Transform(SHA_CTX *c, const unsigned char *data);
117#endif
118#ifndef OPENSSL_NO_SHA1
119#ifdef OPENSSL_FIPS
120int private_SHA1_Init(SHA_CTX *c);
121#endif
122int SHA1_Init(SHA_CTX *c);
123int SHA1_Update(SHA_CTX *c, const void *data, size_t len);
124int SHA1_Final(unsigned char *md, SHA_CTX *c);
125unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md);
126void SHA1_Transform(SHA_CTX *c, const unsigned char *data);
127#endif
128
129#define SHA256_CBLOCK (SHA_LBLOCK*4) /* SHA-256 treats input data as a
130 * contiguous array of 32 bit
131 * wide big-endian values. */
132#define SHA224_DIGEST_LENGTH 28
133#define SHA256_DIGEST_LENGTH 32
134
135typedef struct SHA256state_st
136 {
137 SHA_LONG h[8];
138 SHA_LONG Nl,Nh;
139 SHA_LONG data[SHA_LBLOCK];
140 unsigned int num,md_len;
141 } SHA256_CTX;
142
143#ifndef OPENSSL_NO_SHA256
144#ifdef OPENSSL_FIPS
145int private_SHA224_Init(SHA256_CTX *c);
146int private_SHA256_Init(SHA256_CTX *c);
147#endif
148int SHA224_Init(SHA256_CTX *c);
149int SHA224_Update(SHA256_CTX *c, const void *data, size_t len);
150int SHA224_Final(unsigned char *md, SHA256_CTX *c);
151unsigned char *SHA224(const unsigned char *d, size_t n,unsigned char *md);
152int SHA256_Init(SHA256_CTX *c);
153int SHA256_Update(SHA256_CTX *c, const void *data, size_t len);
154int SHA256_Final(unsigned char *md, SHA256_CTX *c);
155unsigned char *SHA256(const unsigned char *d, size_t n,unsigned char *md);
156void SHA256_Transform(SHA256_CTX *c, const unsigned char *data);
157#endif
158
159#define SHA384_DIGEST_LENGTH 48
160#define SHA512_DIGEST_LENGTH 64
161
162#ifndef OPENSSL_NO_SHA512
163/*
164 * Unlike 32-bit digest algorithms, SHA-512 *relies* on SHA_LONG64
165 * being exactly 64-bit wide. See Implementation Notes in sha512.c
166 * for further details.
167 */
168#define SHA512_CBLOCK (SHA_LBLOCK*8) /* SHA-512 treats input data as a
169 * contiguous array of 64 bit
170 * wide big-endian values. */
171#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
172#define SHA_LONG64 unsigned __int64
173#define U64(C) C##UI64
174#elif defined(__arch64__)
175#define SHA_LONG64 unsigned long
176#define U64(C) C##UL
177#else
178#define SHA_LONG64 unsigned long long
179#define U64(C) C##ULL
180#endif
181
182typedef struct SHA512state_st
183 {
184 SHA_LONG64 h[8];
185 SHA_LONG64 Nl,Nh;
186 union {
187 SHA_LONG64 d[SHA_LBLOCK];
188 unsigned char p[SHA512_CBLOCK];
189 } u;
190 unsigned int num,md_len;
191 } SHA512_CTX;
192#endif
193
194#ifndef OPENSSL_NO_SHA512
195#ifdef OPENSSL_FIPS
196int private_SHA384_Init(SHA512_CTX *c);
197int private_SHA512_Init(SHA512_CTX *c);
198#endif
199int SHA384_Init(SHA512_CTX *c);
200int SHA384_Update(SHA512_CTX *c, const void *data, size_t len);
201int SHA384_Final(unsigned char *md, SHA512_CTX *c);
202unsigned char *SHA384(const unsigned char *d, size_t n,unsigned char *md);
203int SHA512_Init(SHA512_CTX *c);
204int SHA512_Update(SHA512_CTX *c, const void *data, size_t len);
205int SHA512_Final(unsigned char *md, SHA512_CTX *c);
206unsigned char *SHA512(const unsigned char *d, size_t n,unsigned char *md);
207void SHA512_Transform(SHA512_CTX *c, const unsigned char *data);
208#endif
209
210#ifdef __cplusplus
211}
212#endif
213
214#endif
diff --git a/src/lib/libcrypto/sha/sha1_one.c b/src/lib/libcrypto/sha/sha1_one.c
deleted file mode 100644
index 7c65b60276..0000000000
--- a/src/lib/libcrypto/sha/sha1_one.c
+++ /dev/null
@@ -1,78 +0,0 @@
1/* crypto/sha/sha1_one.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60#include <string.h>
61#include <openssl/sha.h>
62#include <openssl/crypto.h>
63
64#ifndef OPENSSL_NO_SHA1
65unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md)
66 {
67 SHA_CTX c;
68 static unsigned char m[SHA_DIGEST_LENGTH];
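	/* Editor's note: when md is NULL the result is written to this
	 * function-local static buffer, so such calls are not thread-safe. */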
69
70 if (md == NULL) md=m;
71 if (!SHA1_Init(&c))
72 return NULL;
73 SHA1_Update(&c,d,n);
74 SHA1_Final(md,&c);
75 OPENSSL_cleanse(&c,sizeof(c));
76 return(md);
77 }
78#endif
diff --git a/src/lib/libcrypto/sha/sha1dgst.c b/src/lib/libcrypto/sha/sha1dgst.c
deleted file mode 100644
index 81219af088..0000000000
--- a/src/lib/libcrypto/sha/sha1dgst.c
+++ /dev/null
@@ -1,75 +0,0 @@
1/* crypto/sha/sha1dgst.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the routines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publicly available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <openssl/opensslconf.h>
60#include <openssl/crypto.h>
61#if !defined(OPENSSL_NO_SHA1) && !defined(OPENSSL_NO_SHA)
62
63#undef SHA_0
64#define SHA_1
65
66#include <openssl/opensslv.h>
67
68const char SHA1_version[]="SHA1" OPENSSL_VERSION_PTEXT;
69
70/* The implementation is in ../md32_common.h */
71
72#include "sha_locl.h"
73
74#endif
75
diff --git a/src/lib/libcrypto/sha/sha256.c b/src/lib/libcrypto/sha/sha256.c
deleted file mode 100644
index f88d3d6dad..0000000000
--- a/src/lib/libcrypto/sha/sha256.c
+++ /dev/null
@@ -1,282 +0,0 @@
1/* crypto/sha/sha256.c */
2/* ====================================================================
3 * Copyright (c) 2004 The OpenSSL Project. All rights reserved
4 * according to the OpenSSL license [found in ../../LICENSE].
5 * ====================================================================
6 */
7#include <openssl/opensslconf.h>
8#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA256)
9
10#include <stdlib.h>
11#include <string.h>
12
13#include <openssl/crypto.h>
14#include <openssl/sha.h>
15#include <openssl/opensslv.h>
16
17const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT;
18
19fips_md_init_ctx(SHA224, SHA256)
20 {
21 memset (c,0,sizeof(*c));
22 c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL;
23 c->h[2]=0x3070dd17UL; c->h[3]=0xf70e5939UL;
24 c->h[4]=0xffc00b31UL; c->h[5]=0x68581511UL;
25 c->h[6]=0x64f98fa7UL; c->h[7]=0xbefa4fa4UL;
26 c->md_len=SHA224_DIGEST_LENGTH;
27 return 1;
28 }
29
30fips_md_init(SHA256)
31 {
32 memset (c,0,sizeof(*c));
33 c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL;
34 c->h[2]=0x3c6ef372UL; c->h[3]=0xa54ff53aUL;
35 c->h[4]=0x510e527fUL; c->h[5]=0x9b05688cUL;
36 c->h[6]=0x1f83d9abUL; c->h[7]=0x5be0cd19UL;
37 c->md_len=SHA256_DIGEST_LENGTH;
38 return 1;
39 }
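
(fips_md_init and fips_md_init_ctx are macros, defined in <openssl/crypto.h>
in this tree, that expand to the usual init prototypes; fips_md_init_ctx(SHA224,
SHA256) expands to "int SHA224_Init(SHA256_CTX *c)", which is why the function
bodies here refer to a parameter c that is not visible in the source.)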
40
41unsigned char *SHA224(const unsigned char *d, size_t n, unsigned char *md)
42 {
43 SHA256_CTX c;
44 static unsigned char m[SHA224_DIGEST_LENGTH];
45
46 if (md == NULL) md=m;
47 SHA224_Init(&c);
48 SHA256_Update(&c,d,n);
49 SHA256_Final(md,&c);
50 OPENSSL_cleanse(&c,sizeof(c));
51 return(md);
52 }
53
54unsigned char *SHA256(const unsigned char *d, size_t n, unsigned char *md)
55 {
56 SHA256_CTX c;
57 static unsigned char m[SHA256_DIGEST_LENGTH];
58
59 if (md == NULL) md=m;
60 SHA256_Init(&c);
61 SHA256_Update(&c,d,n);
62 SHA256_Final(md,&c);
63 OPENSSL_cleanse(&c,sizeof(c));
64 return(md);
65 }
66
67int SHA224_Update(SHA256_CTX *c, const void *data, size_t len)
68{ return SHA256_Update (c,data,len); }
69int SHA224_Final (unsigned char *md, SHA256_CTX *c)
70{ return SHA256_Final (md,c); }
71
72#define DATA_ORDER_IS_BIG_ENDIAN
73
74#define HASH_LONG SHA_LONG
75#define HASH_CTX SHA256_CTX
76#define HASH_CBLOCK SHA_CBLOCK
77/*
78 * Note that FIPS 180-2 discusses "Truncation of the Hash Function Output";
79 * the default: case below covers for it. It is not clear, however, whether
80 * truncation to a byte count not divisible by 4 is permitted. Probably not,
81 * but if it is, the default: case would have to be extended. For reference.
82 * The idea behind separate cases for the pre-defined lengths is to let the
83 * compiler decide whether it is appropriate to unroll the small loops.
84 */
85#define HASH_MAKE_STRING(c,s) do { \
86 unsigned long ll; \
87 unsigned int nn; \
88 switch ((c)->md_len) \
89 { case SHA224_DIGEST_LENGTH: \
90 for (nn=0;nn<SHA224_DIGEST_LENGTH/4;nn++) \
91 { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
92 break; \
93 case SHA256_DIGEST_LENGTH: \
94 for (nn=0;nn<SHA256_DIGEST_LENGTH/4;nn++) \
95 { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
96 break; \
97 default: \
98 if ((c)->md_len > SHA256_DIGEST_LENGTH) \
99 return 0; \
100 for (nn=0;nn<(c)->md_len/4;nn++) \
101 { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
102 break; \
103 } \
104 } while (0)
105
106#define HASH_UPDATE SHA256_Update
107#define HASH_TRANSFORM SHA256_Transform
108#define HASH_FINAL SHA256_Final
109#define HASH_BLOCK_DATA_ORDER sha256_block_data_order
110#ifndef SHA256_ASM
111static
112#endif
113void sha256_block_data_order (SHA256_CTX *ctx, const void *in, size_t num);
114
115#include "md32_common.h"
116
117#ifndef SHA256_ASM
118static const SHA_LONG K256[64] = {
119 0x428a2f98UL,0x71374491UL,0xb5c0fbcfUL,0xe9b5dba5UL,
120 0x3956c25bUL,0x59f111f1UL,0x923f82a4UL,0xab1c5ed5UL,
121 0xd807aa98UL,0x12835b01UL,0x243185beUL,0x550c7dc3UL,
122 0x72be5d74UL,0x80deb1feUL,0x9bdc06a7UL,0xc19bf174UL,
123 0xe49b69c1UL,0xefbe4786UL,0x0fc19dc6UL,0x240ca1ccUL,
124 0x2de92c6fUL,0x4a7484aaUL,0x5cb0a9dcUL,0x76f988daUL,
125 0x983e5152UL,0xa831c66dUL,0xb00327c8UL,0xbf597fc7UL,
126 0xc6e00bf3UL,0xd5a79147UL,0x06ca6351UL,0x14292967UL,
127 0x27b70a85UL,0x2e1b2138UL,0x4d2c6dfcUL,0x53380d13UL,
128 0x650a7354UL,0x766a0abbUL,0x81c2c92eUL,0x92722c85UL,
129 0xa2bfe8a1UL,0xa81a664bUL,0xc24b8b70UL,0xc76c51a3UL,
130 0xd192e819UL,0xd6990624UL,0xf40e3585UL,0x106aa070UL,
131 0x19a4c116UL,0x1e376c08UL,0x2748774cUL,0x34b0bcb5UL,
132 0x391c0cb3UL,0x4ed8aa4aUL,0x5b9cca4fUL,0x682e6ff3UL,
133 0x748f82eeUL,0x78a5636fUL,0x84c87814UL,0x8cc70208UL,
134 0x90befffaUL,0xa4506cebUL,0xbef9a3f7UL,0xc67178f2UL };
135
136/*
137 * The FIPS specification refers to right rotations, while our ROTATE
138 * macro is a left one. This is why the rotation coefficients here
139 * differ from those in the FIPS document by 32-N...
140 */
141#define Sigma0(x) (ROTATE((x),30) ^ ROTATE((x),19) ^ ROTATE((x),10))
142#define Sigma1(x) (ROTATE((x),26) ^ ROTATE((x),21) ^ ROTATE((x),7))
143#define sigma0(x) (ROTATE((x),25) ^ ROTATE((x),14) ^ ((x)>>3))
144#define sigma1(x) (ROTATE((x),15) ^ ROTATE((x),13) ^ ((x)>>10))
145
146#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
147#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
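
The 32-N relationship is easy to check directly. A throwaway sketch (ROTL32
and ROTR32 are local stand-ins for the left-rotating ROTATE from md32_common.h
and the right rotation used by FIPS 180-2):

	#include <assert.h>
	#include <stdint.h>

	#define ROTL32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
	#define ROTR32(x,n) (((x) >> (n)) | ((x) << (32 - (n))))

	int
	main(void)
	{
		uint32_t x = 0xdeadbeefU;

		/* FIPS Sigma0 rotates right by 2, 13, 22;
		 * the Sigma0 macro above rotates left by 30, 19, 10. */
		assert(ROTR32(x, 2) == ROTL32(x, 30));
		assert(ROTR32(x, 13) == ROTL32(x, 19));
		assert(ROTR32(x, 22) == ROTL32(x, 10));
		return 0;
	}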
148
149#ifdef OPENSSL_SMALL_FOOTPRINT
150
151static void sha256_block_data_order (SHA256_CTX *ctx, const void *in, size_t num)
152 {
153 unsigned MD32_REG_T a,b,c,d,e,f,g,h,s0,s1,T1,T2;
154 SHA_LONG X[16],l;
155 int i;
156 const unsigned char *data=in;
157
158 while (num--) {
159
160 a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
161 e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7];
162
163 for (i=0;i<16;i++)
164 {
165 HOST_c2l(data,l); T1 = X[i] = l;
166 T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i];
167 T2 = Sigma0(a) + Maj(a,b,c);
168 h = g; g = f; f = e; e = d + T1;
169 d = c; c = b; b = a; a = T1 + T2;
170 }
171
172 for (;i<64;i++)
173 {
174 s0 = X[(i+1)&0x0f]; s0 = sigma0(s0);
175 s1 = X[(i+14)&0x0f]; s1 = sigma1(s1);
176
177 T1 = X[i&0xf] += s0 + s1 + X[(i+9)&0xf];
178 T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i];
179 T2 = Sigma0(a) + Maj(a,b,c);
180 h = g; g = f; f = e; e = d + T1;
181 d = c; c = b; b = a; a = T1 + T2;
182 }
183
184 ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
185 ctx->h[4] += e; ctx->h[5] += f; ctx->h[6] += g; ctx->h[7] += h;
186
187 }
188}
189
190#else
191
192#define ROUND_00_15(i,a,b,c,d,e,f,g,h) do { \
193 T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i]; \
194 h = Sigma0(a) + Maj(a,b,c); \
195 d += T1; h += T1; } while (0)
196
197#define ROUND_16_63(i,a,b,c,d,e,f,g,h,X) do { \
198 s0 = X[(i+1)&0x0f]; s0 = sigma0(s0); \
199 s1 = X[(i+14)&0x0f]; s1 = sigma1(s1); \
200 T1 = X[(i)&0x0f] += s0 + s1 + X[(i+9)&0x0f]; \
201 ROUND_00_15(i,a,b,c,d,e,f,g,h); } while (0)
202
203static void sha256_block_data_order (SHA256_CTX *ctx, const void *in, size_t num)
204 {
205 unsigned MD32_REG_T a,b,c,d,e,f,g,h,s0,s1,T1;
206 SHA_LONG X[16];
207 int i;
208 const unsigned char *data=in;
209 const union { long one; char little; } is_endian = {1};
210
211 while (num--) {
212
213 a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
214 e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7];
215
216 if (!is_endian.little && sizeof(SHA_LONG)==4 && ((size_t)in%4)==0)
217 {
218 const SHA_LONG *W=(const SHA_LONG *)data;
219
220 T1 = X[0] = W[0]; ROUND_00_15(0,a,b,c,d,e,f,g,h);
221 T1 = X[1] = W[1]; ROUND_00_15(1,h,a,b,c,d,e,f,g);
222 T1 = X[2] = W[2]; ROUND_00_15(2,g,h,a,b,c,d,e,f);
223 T1 = X[3] = W[3]; ROUND_00_15(3,f,g,h,a,b,c,d,e);
224 T1 = X[4] = W[4]; ROUND_00_15(4,e,f,g,h,a,b,c,d);
225 T1 = X[5] = W[5]; ROUND_00_15(5,d,e,f,g,h,a,b,c);
226 T1 = X[6] = W[6]; ROUND_00_15(6,c,d,e,f,g,h,a,b);
227 T1 = X[7] = W[7]; ROUND_00_15(7,b,c,d,e,f,g,h,a);
228 T1 = X[8] = W[8]; ROUND_00_15(8,a,b,c,d,e,f,g,h);
229 T1 = X[9] = W[9]; ROUND_00_15(9,h,a,b,c,d,e,f,g);
230 T1 = X[10] = W[10]; ROUND_00_15(10,g,h,a,b,c,d,e,f);
231 T1 = X[11] = W[11]; ROUND_00_15(11,f,g,h,a,b,c,d,e);
232 T1 = X[12] = W[12]; ROUND_00_15(12,e,f,g,h,a,b,c,d);
233 T1 = X[13] = W[13]; ROUND_00_15(13,d,e,f,g,h,a,b,c);
234 T1 = X[14] = W[14]; ROUND_00_15(14,c,d,e,f,g,h,a,b);
235 T1 = X[15] = W[15]; ROUND_00_15(15,b,c,d,e,f,g,h,a);
236
237 data += SHA256_CBLOCK;
238 }
239 else
240 {
241 SHA_LONG l;
242
243 HOST_c2l(data,l); T1 = X[0] = l; ROUND_00_15(0,a,b,c,d,e,f,g,h);
244 HOST_c2l(data,l); T1 = X[1] = l; ROUND_00_15(1,h,a,b,c,d,e,f,g);
245 HOST_c2l(data,l); T1 = X[2] = l; ROUND_00_15(2,g,h,a,b,c,d,e,f);
246 HOST_c2l(data,l); T1 = X[3] = l; ROUND_00_15(3,f,g,h,a,b,c,d,e);
247 HOST_c2l(data,l); T1 = X[4] = l; ROUND_00_15(4,e,f,g,h,a,b,c,d);
248 HOST_c2l(data,l); T1 = X[5] = l; ROUND_00_15(5,d,e,f,g,h,a,b,c);
249 HOST_c2l(data,l); T1 = X[6] = l; ROUND_00_15(6,c,d,e,f,g,h,a,b);
250 HOST_c2l(data,l); T1 = X[7] = l; ROUND_00_15(7,b,c,d,e,f,g,h,a);
251 HOST_c2l(data,l); T1 = X[8] = l; ROUND_00_15(8,a,b,c,d,e,f,g,h);
252 HOST_c2l(data,l); T1 = X[9] = l; ROUND_00_15(9,h,a,b,c,d,e,f,g);
253 HOST_c2l(data,l); T1 = X[10] = l; ROUND_00_15(10,g,h,a,b,c,d,e,f);
254 HOST_c2l(data,l); T1 = X[11] = l; ROUND_00_15(11,f,g,h,a,b,c,d,e);
255 HOST_c2l(data,l); T1 = X[12] = l; ROUND_00_15(12,e,f,g,h,a,b,c,d);
256 HOST_c2l(data,l); T1 = X[13] = l; ROUND_00_15(13,d,e,f,g,h,a,b,c);
257 HOST_c2l(data,l); T1 = X[14] = l; ROUND_00_15(14,c,d,e,f,g,h,a,b);
258 HOST_c2l(data,l); T1 = X[15] = l; ROUND_00_15(15,b,c,d,e,f,g,h,a);
259 }
260
261 for (i=16;i<64;i+=8)
262 {
263 ROUND_16_63(i+0,a,b,c,d,e,f,g,h,X);
264 ROUND_16_63(i+1,h,a,b,c,d,e,f,g,X);
265 ROUND_16_63(i+2,g,h,a,b,c,d,e,f,X);
266 ROUND_16_63(i+3,f,g,h,a,b,c,d,e,X);
267 ROUND_16_63(i+4,e,f,g,h,a,b,c,d,X);
268 ROUND_16_63(i+5,d,e,f,g,h,a,b,c,X);
269 ROUND_16_63(i+6,c,d,e,f,g,h,a,b,X);
270 ROUND_16_63(i+7,b,c,d,e,f,g,h,a,X);
271 }
272
273 ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
274 ctx->h[4] += e; ctx->h[5] += f; ctx->h[6] += g; ctx->h[7] += h;
275
276 }
277 }
278
279#endif
280#endif /* SHA256_ASM */
281
282#endif /* OPENSSL_NO_SHA256 */
diff --git a/src/lib/libcrypto/sha/sha512.c b/src/lib/libcrypto/sha/sha512.c
deleted file mode 100644
index 50c229ddeb..0000000000
--- a/src/lib/libcrypto/sha/sha512.c
+++ /dev/null
@@ -1,604 +0,0 @@
1/* crypto/sha/sha512.c */
2/* ====================================================================
3 * Copyright (c) 2004 The OpenSSL Project. All rights reserved
4 * according to the OpenSSL license [found in ../../LICENSE].
5 * ====================================================================
6 */
7#include <openssl/opensslconf.h>
8#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA512)
9/*
10 * IMPLEMENTATION NOTES.
11 *
12 * As you might have noticed, the 32-bit hash algorithms:
13 *
14 * - permit SHA_LONG to be wider than 32 bits (the case on CRAY);
15 * - optimized versions implement two transform functions: one operating
16 * on [aligned] data in host byte order and one on data in input
17 * stream byte order;
18 * - share common byte-order-neutral collector and padding function
19 * implementations (../md32_common.h).
20 *
21 * None of the above applies to this SHA-512 implementation. The reasons
22 * [in reverse order] are:
23 *
24 * - it's the only 64-bit hash algorithm at the time of this writing,
25 * so there is no need for a common collector/padding implementation [yet];
26 * - by supporting only one transform function [which operates on
27 * *aligned* data in input stream byte order, big-endian in this case]
28 * we minimize the maintenance burden in two ways: a) the collector/padding
29 * function is simpler; b) there is only one transform function to stare at;
30 * - SHA_LONG64 is required to be exactly 64 bits wide in order to apply
31 * a number of optimizations that mitigate potential performance
32 * penalties caused by the previous design decision.
33 *
34 * Caveat lector.
35 *
36 * The implementation relies on the fact that "long long" is 64 bits wide
37 * on both 32- and 64-bit platforms. If some compiler vendor comes up
38 * with a 128-bit long long, an adjustment to sha.h would be required.
39 * As this implementation relies on a 64-bit integer type, it is entirely
40 * inappropriate for platforms which don't provide one, most notably
41 * 16-bit platforms.
42 * <appro@fy.chalmers.se>
43 */
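
That width requirement can be enforced at compile time without C11 static
assertions; a sketch (the typedef name is arbitrary, and it would have to sit
after the includes below so SHA_LONG64 is in scope):

	/* Fails to compile if SHA_LONG64 is not exactly 64 bits wide. */
	typedef char sha_long64_is_8_bytes[sizeof(SHA_LONG64) == 8 ? 1 : -1];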
44#include <stdlib.h>
45#include <string.h>
46
47#include <openssl/crypto.h>
48#include <openssl/sha.h>
49#include <openssl/opensslv.h>
50
51#include "cryptlib.h"
52
53const char SHA512_version[]="SHA-512" OPENSSL_VERSION_PTEXT;
54
55#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
56 defined(__x86_64) || defined(_M_AMD64) || defined(_M_X64) || \
57 defined(__s390__) || defined(__s390x__) || \
58 defined(SHA512_ASM)
59#define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
60#endif
61
62fips_md_init_ctx(SHA384, SHA512)
63 {
64 c->h[0]=U64(0xcbbb9d5dc1059ed8);
65 c->h[1]=U64(0x629a292a367cd507);
66 c->h[2]=U64(0x9159015a3070dd17);
67 c->h[3]=U64(0x152fecd8f70e5939);
68 c->h[4]=U64(0x67332667ffc00b31);
69 c->h[5]=U64(0x8eb44a8768581511);
70 c->h[6]=U64(0xdb0c2e0d64f98fa7);
71 c->h[7]=U64(0x47b5481dbefa4fa4);
72
73 c->Nl=0; c->Nh=0;
74 c->num=0; c->md_len=SHA384_DIGEST_LENGTH;
75 return 1;
76 }
77
78fips_md_init(SHA512)
79 {
80 c->h[0]=U64(0x6a09e667f3bcc908);
81 c->h[1]=U64(0xbb67ae8584caa73b);
82 c->h[2]=U64(0x3c6ef372fe94f82b);
83 c->h[3]=U64(0xa54ff53a5f1d36f1);
84 c->h[4]=U64(0x510e527fade682d1);
85 c->h[5]=U64(0x9b05688c2b3e6c1f);
86 c->h[6]=U64(0x1f83d9abfb41bd6b);
87 c->h[7]=U64(0x5be0cd19137e2179);
88
89 c->Nl=0; c->Nh=0;
90 c->num=0; c->md_len=SHA512_DIGEST_LENGTH;
91 return 1;
92 }
93
94#ifndef SHA512_ASM
95static
96#endif
97void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num);
98
99int SHA512_Final (unsigned char *md, SHA512_CTX *c)
100 {
101 unsigned char *p=(unsigned char *)c->u.p;
102 size_t n=c->num;
103
104	p[n]=0x80;	/* There is always room for one */
105 n++;
106 if (n > (sizeof(c->u)-16))
107 memset (p+n,0,sizeof(c->u)-n), n=0,
108 sha512_block_data_order (c,p,1);
109
110 memset (p+n,0,sizeof(c->u)-16-n);
111#ifdef B_ENDIAN
112 c->u.d[SHA_LBLOCK-2] = c->Nh;
113 c->u.d[SHA_LBLOCK-1] = c->Nl;
114#else
115 p[sizeof(c->u)-1] = (unsigned char)(c->Nl);
116 p[sizeof(c->u)-2] = (unsigned char)(c->Nl>>8);
117 p[sizeof(c->u)-3] = (unsigned char)(c->Nl>>16);
118 p[sizeof(c->u)-4] = (unsigned char)(c->Nl>>24);
119 p[sizeof(c->u)-5] = (unsigned char)(c->Nl>>32);
120 p[sizeof(c->u)-6] = (unsigned char)(c->Nl>>40);
121 p[sizeof(c->u)-7] = (unsigned char)(c->Nl>>48);
122 p[sizeof(c->u)-8] = (unsigned char)(c->Nl>>56);
123 p[sizeof(c->u)-9] = (unsigned char)(c->Nh);
124 p[sizeof(c->u)-10] = (unsigned char)(c->Nh>>8);
125 p[sizeof(c->u)-11] = (unsigned char)(c->Nh>>16);
126 p[sizeof(c->u)-12] = (unsigned char)(c->Nh>>24);
127 p[sizeof(c->u)-13] = (unsigned char)(c->Nh>>32);
128 p[sizeof(c->u)-14] = (unsigned char)(c->Nh>>40);
129 p[sizeof(c->u)-15] = (unsigned char)(c->Nh>>48);
130 p[sizeof(c->u)-16] = (unsigned char)(c->Nh>>56);
131#endif
132
133 sha512_block_data_order (c,p,1);
134
135 if (md==0) return 0;
136
137 switch (c->md_len)
138 {
139		/* Let the compiler decide whether it's appropriate to unroll... */
140 case SHA384_DIGEST_LENGTH:
141 for (n=0;n<SHA384_DIGEST_LENGTH/8;n++)
142 {
143 SHA_LONG64 t = c->h[n];
144
145 *(md++) = (unsigned char)(t>>56);
146 *(md++) = (unsigned char)(t>>48);
147 *(md++) = (unsigned char)(t>>40);
148 *(md++) = (unsigned char)(t>>32);
149 *(md++) = (unsigned char)(t>>24);
150 *(md++) = (unsigned char)(t>>16);
151 *(md++) = (unsigned char)(t>>8);
152 *(md++) = (unsigned char)(t);
153 }
154 break;
155 case SHA512_DIGEST_LENGTH:
156 for (n=0;n<SHA512_DIGEST_LENGTH/8;n++)
157 {
158 SHA_LONG64 t = c->h[n];
159
160 *(md++) = (unsigned char)(t>>56);
161 *(md++) = (unsigned char)(t>>48);
162 *(md++) = (unsigned char)(t>>40);
163 *(md++) = (unsigned char)(t>>32);
164 *(md++) = (unsigned char)(t>>24);
165 *(md++) = (unsigned char)(t>>16);
166 *(md++) = (unsigned char)(t>>8);
167 *(md++) = (unsigned char)(t);
168 }
169 break;
170 /* ... as well as make sure md_len is not abused. */
171 default: return 0;
172 }
173
174 return 1;
175 }
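
(The padding above is the standard FIPS 180-2 scheme: append a single 0x80
byte, zero-fill, and store the 128-bit message bit count big-endian in the
last 16 bytes of the final block. The n > sizeof(c->u)-16 test spills into an
extra block when fewer than 16 bytes of the current block remain for the
length field.)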
176
177int SHA384_Final (unsigned char *md,SHA512_CTX *c)
178{ return SHA512_Final (md,c); }
179
180int SHA512_Update (SHA512_CTX *c, const void *_data, size_t len)
181 {
182 SHA_LONG64 l;
183 unsigned char *p=c->u.p;
184 const unsigned char *data=(const unsigned char *)_data;
185
186 if (len==0) return 1;
187
188 l = (c->Nl+(((SHA_LONG64)len)<<3))&U64(0xffffffffffffffff);
189 if (l < c->Nl) c->Nh++;
190 if (sizeof(len)>=8) c->Nh+=(((SHA_LONG64)len)>>61);
191 c->Nl=l;
192
193 if (c->num != 0)
194 {
195 size_t n = sizeof(c->u) - c->num;
196
197 if (len < n)
198 {
199 memcpy (p+c->num,data,len), c->num += (unsigned int)len;
200 return 1;
201 }
202 else {
203 memcpy (p+c->num,data,n), c->num = 0;
204 len-=n, data+=n;
205 sha512_block_data_order (c,p,1);
206 }
207 }
208
209 if (len >= sizeof(c->u))
210 {
211#ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
212 if ((size_t)data%sizeof(c->u.d[0]) != 0)
213 while (len >= sizeof(c->u))
214 memcpy (p,data,sizeof(c->u)),
215 sha512_block_data_order (c,p,1),
216 len -= sizeof(c->u),
217 data += sizeof(c->u);
218 else
219#endif
220 sha512_block_data_order (c,data,len/sizeof(c->u)),
221 data += len,
222 len %= sizeof(c->u),
223 data -= len;
224 }
225
226 if (len != 0) memcpy (p,data,len), c->num = (int)len;
227
228 return 1;
229 }
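
(The message length is tracked in bits across the two 64-bit words Nh:Nl.
len<<3 converts bytes to bits, the l < c->Nl comparison detects carry into
Nh, and len>>61 recovers the high-order bits that the 3-bit shift pushed out
of the low 64-bit word when len itself is a full-width size_t.)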
230
231int SHA384_Update (SHA512_CTX *c, const void *data, size_t len)
232{ return SHA512_Update (c,data,len); }
233
234void SHA512_Transform (SHA512_CTX *c, const unsigned char *data)
235 {
236#ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
237 if ((size_t)data%sizeof(c->u.d[0]) != 0)
238 memcpy(c->u.p,data,sizeof(c->u.p)),
239 data = c->u.p;
240#endif
241 sha512_block_data_order (c,data,1);
242 }
243
244unsigned char *SHA384(const unsigned char *d, size_t n, unsigned char *md)
245 {
246 SHA512_CTX c;
247 static unsigned char m[SHA384_DIGEST_LENGTH];
248
249 if (md == NULL) md=m;
250 SHA384_Init(&c);
251 SHA512_Update(&c,d,n);
252 SHA512_Final(md,&c);
253 OPENSSL_cleanse(&c,sizeof(c));
254 return(md);
255 }
256
257unsigned char *SHA512(const unsigned char *d, size_t n, unsigned char *md)
258 {
259 SHA512_CTX c;
260 static unsigned char m[SHA512_DIGEST_LENGTH];
261
262 if (md == NULL) md=m;
263 SHA512_Init(&c);
264 SHA512_Update(&c,d,n);
265 SHA512_Final(md,&c);
266 OPENSSL_cleanse(&c,sizeof(c));
267 return(md);
268 }
269
270#ifndef SHA512_ASM
271static const SHA_LONG64 K512[80] = {
272 U64(0x428a2f98d728ae22),U64(0x7137449123ef65cd),
273 U64(0xb5c0fbcfec4d3b2f),U64(0xe9b5dba58189dbbc),
274 U64(0x3956c25bf348b538),U64(0x59f111f1b605d019),
275 U64(0x923f82a4af194f9b),U64(0xab1c5ed5da6d8118),
276 U64(0xd807aa98a3030242),U64(0x12835b0145706fbe),
277 U64(0x243185be4ee4b28c),U64(0x550c7dc3d5ffb4e2),
278 U64(0x72be5d74f27b896f),U64(0x80deb1fe3b1696b1),
279 U64(0x9bdc06a725c71235),U64(0xc19bf174cf692694),
280 U64(0xe49b69c19ef14ad2),U64(0xefbe4786384f25e3),
281 U64(0x0fc19dc68b8cd5b5),U64(0x240ca1cc77ac9c65),
282 U64(0x2de92c6f592b0275),U64(0x4a7484aa6ea6e483),
283 U64(0x5cb0a9dcbd41fbd4),U64(0x76f988da831153b5),
284 U64(0x983e5152ee66dfab),U64(0xa831c66d2db43210),
285 U64(0xb00327c898fb213f),U64(0xbf597fc7beef0ee4),
286 U64(0xc6e00bf33da88fc2),U64(0xd5a79147930aa725),
287 U64(0x06ca6351e003826f),U64(0x142929670a0e6e70),
288 U64(0x27b70a8546d22ffc),U64(0x2e1b21385c26c926),
289 U64(0x4d2c6dfc5ac42aed),U64(0x53380d139d95b3df),
290 U64(0x650a73548baf63de),U64(0x766a0abb3c77b2a8),
291 U64(0x81c2c92e47edaee6),U64(0x92722c851482353b),
292 U64(0xa2bfe8a14cf10364),U64(0xa81a664bbc423001),
293 U64(0xc24b8b70d0f89791),U64(0xc76c51a30654be30),
294 U64(0xd192e819d6ef5218),U64(0xd69906245565a910),
295 U64(0xf40e35855771202a),U64(0x106aa07032bbd1b8),
296 U64(0x19a4c116b8d2d0c8),U64(0x1e376c085141ab53),
297 U64(0x2748774cdf8eeb99),U64(0x34b0bcb5e19b48a8),
298 U64(0x391c0cb3c5c95a63),U64(0x4ed8aa4ae3418acb),
299 U64(0x5b9cca4f7763e373),U64(0x682e6ff3d6b2b8a3),
300 U64(0x748f82ee5defb2fc),U64(0x78a5636f43172f60),
301 U64(0x84c87814a1f0ab72),U64(0x8cc702081a6439ec),
302 U64(0x90befffa23631e28),U64(0xa4506cebde82bde9),
303 U64(0xbef9a3f7b2c67915),U64(0xc67178f2e372532b),
304 U64(0xca273eceea26619c),U64(0xd186b8c721c0c207),
305 U64(0xeada7dd6cde0eb1e),U64(0xf57d4f7fee6ed178),
306 U64(0x06f067aa72176fba),U64(0x0a637dc5a2c898a6),
307 U64(0x113f9804bef90dae),U64(0x1b710b35131c471b),
308 U64(0x28db77f523047d84),U64(0x32caab7b40c72493),
309 U64(0x3c9ebe0a15c9bebc),U64(0x431d67c49c100d4c),
310 U64(0x4cc5d4becb3e42b6),U64(0x597f299cfc657e2a),
311 U64(0x5fcb6fab3ad6faec),U64(0x6c44198c4a475817) };
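
(Per FIPS 180-2, these round constants are the first 64 bits of the
fractional parts of the cube roots of the first eighty primes, 2 through 409.)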
312
313#ifndef PEDANTIC
314# if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
315# if defined(__x86_64) || defined(__x86_64__)
316# define ROTR(a,n) ({ SHA_LONG64 ret; \
317 asm ("rorq %1,%0" \
318 : "=r"(ret) \
319 : "J"(n),"0"(a) \
320 : "cc"); ret; })
321# if !defined(B_ENDIAN)
322# define PULL64(x) ({ SHA_LONG64 ret=*((const SHA_LONG64 *)(&(x))); \
323 asm ("bswapq %0" \
324 : "=r"(ret) \
325 : "0"(ret)); ret; })
326# endif
327# elif (defined(__i386) || defined(__i386__)) && !defined(B_ENDIAN)
328# if defined(I386_ONLY)
329# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\
330 unsigned int hi=p[0],lo=p[1]; \
331 asm("xchgb %%ah,%%al;xchgb %%dh,%%dl;"\
332 "roll $16,%%eax; roll $16,%%edx; "\
333 "xchgb %%ah,%%al;xchgb %%dh,%%dl;" \
334 : "=a"(lo),"=d"(hi) \
335 : "0"(lo),"1"(hi) : "cc"); \
336 ((SHA_LONG64)hi)<<32|lo; })
337# else
338# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\
339 unsigned int hi=p[0],lo=p[1]; \
340 asm ("bswapl %0; bswapl %1;" \
341 : "=r"(lo),"=r"(hi) \
342 : "0"(lo),"1"(hi)); \
343 ((SHA_LONG64)hi)<<32|lo; })
344# endif
345# elif (defined(_ARCH_PPC) && defined(__64BIT__)) || defined(_ARCH_PPC64)
346# define ROTR(a,n) ({ SHA_LONG64 ret; \
347 asm ("rotrdi %0,%1,%2" \
348 : "=r"(ret) \
349 : "r"(a),"K"(n)); ret; })
350# endif
351# elif defined(_MSC_VER)
352# if defined(_WIN64) /* applies to both IA-64 and AMD64 */
353# pragma intrinsic(_rotr64)
354# define ROTR(a,n) _rotr64((a),n)
355# endif
356# if defined(_M_IX86) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
357# if defined(I386_ONLY)
358 static SHA_LONG64 __fastcall __pull64be(const void *x)
359 { _asm mov edx, [ecx + 0]
360 _asm mov eax, [ecx + 4]
361 _asm xchg dh,dl
362 _asm xchg ah,al
363 _asm rol edx,16
364 _asm rol eax,16
365 _asm xchg dh,dl
366 _asm xchg ah,al
367 }
368# else
369 static SHA_LONG64 __fastcall __pull64be(const void *x)
370 { _asm mov edx, [ecx + 0]
371 _asm mov eax, [ecx + 4]
372 _asm bswap edx
373 _asm bswap eax
374 }
375# endif
376# define PULL64(x) __pull64be(&(x))
377# if _MSC_VER<=1200
378# pragma inline_depth(0)
379# endif
380# endif
381# endif
382#endif
383
384#ifndef PULL64
385#define B(x,j) (((SHA_LONG64)(*(((const unsigned char *)(&x))+j)))<<((7-j)*8))
386#define PULL64(x) (B(x,0)|B(x,1)|B(x,2)|B(x,3)|B(x,4)|B(x,5)|B(x,6)|B(x,7))
387#endif
388
389#ifndef ROTR
390#define ROTR(x,s)	(((x)>>(s)) | ((x)<<(64-(s))))
391#endif
392
393#define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
394#define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
395#define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
396#define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
397
398#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
399#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
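
Ch is a bitwise if-then-else (bits of y where x is set, bits of z elsewhere)
and Maj is a bitwise majority vote. A throwaway check with hand-picked nibbles:

	#include <assert.h>
	#include <stdint.h>

	#define CH(x,y,z)  (((x) & (y)) ^ (~(x) & (z)))
	#define MAJ(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

	int
	main(void)
	{
		uint64_t x = 0xC, y = 0xA, z = 0x6;

		assert(CH(x, y, z) == 0xA);	/* 1100 ? 1010 : 0110 -> 1010 */
		assert(MAJ(x, y, z) == 0xE);	/* majority(1100,1010,0110) -> 1110 */
		return 0;
	}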
400
401
402#if defined(__i386) || defined(__i386__) || defined(_M_IX86)
403/*
404 * This code should give better results on a 32-bit CPU with fewer than
405 * ~24 registers, both size- and performance-wise...
406 */
407static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num)
408 {
409 const SHA_LONG64 *W=in;
410 SHA_LONG64 A,E,T;
411 SHA_LONG64 X[9+80],*F;
412 int i;
413
414 while (num--) {
415
416 F = X+80;
417 A = ctx->h[0]; F[1] = ctx->h[1];
418 F[2] = ctx->h[2]; F[3] = ctx->h[3];
419 E = ctx->h[4]; F[5] = ctx->h[5];
420 F[6] = ctx->h[6]; F[7] = ctx->h[7];
421
422 for (i=0;i<16;i++,F--)
423 {
424#ifdef B_ENDIAN
425 T = W[i];
426#else
427 T = PULL64(W[i]);
428#endif
429 F[0] = A;
430 F[4] = E;
431 F[8] = T;
432 T += F[7] + Sigma1(E) + Ch(E,F[5],F[6]) + K512[i];
433 E = F[3] + T;
434 A = T + Sigma0(A) + Maj(A,F[1],F[2]);
435 }
436
437 for (;i<80;i++,F--)
438 {
439 T = sigma0(F[8+16-1]);
440 T += sigma1(F[8+16-14]);
441 T += F[8+16] + F[8+16-9];
442
443 F[0] = A;
444 F[4] = E;
445 F[8] = T;
446 T += F[7] + Sigma1(E) + Ch(E,F[5],F[6]) + K512[i];
447 E = F[3] + T;
448 A = T + Sigma0(A) + Maj(A,F[1],F[2]);
449 }
450
451 ctx->h[0] += A; ctx->h[1] += F[1];
452 ctx->h[2] += F[2]; ctx->h[3] += F[3];
453 ctx->h[4] += E; ctx->h[5] += F[5];
454 ctx->h[6] += F[6]; ctx->h[7] += F[7];
455
456 W+=SHA_LBLOCK;
457 }
458 }
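
(The X[9+80] buffer and the descending F pointer implement a sliding window:
instead of shuffling eight working variables every round, the loop stores A,
E and the current message-schedule word at F[0], F[4] and F[8], and reaches
the older values at fixed offsets from F. Only A and E live in registers,
which keeps register pressure low on 32-bit CPUs.)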
459
460#elif defined(OPENSSL_SMALL_FOOTPRINT)
461
462static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num)
463 {
464 const SHA_LONG64 *W=in;
465 SHA_LONG64 a,b,c,d,e,f,g,h,s0,s1,T1,T2;
466 SHA_LONG64 X[16];
467 int i;
468
469 while (num--) {
470
471 a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
472 e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7];
473
474 for (i=0;i<16;i++)
475 {
476#ifdef B_ENDIAN
477 T1 = X[i] = W[i];
478#else
479 T1 = X[i] = PULL64(W[i]);
480#endif
481 T1 += h + Sigma1(e) + Ch(e,f,g) + K512[i];
482 T2 = Sigma0(a) + Maj(a,b,c);
483 h = g; g = f; f = e; e = d + T1;
484 d = c; c = b; b = a; a = T1 + T2;
485 }
486
487 for (;i<80;i++)
488 {
489 s0 = X[(i+1)&0x0f]; s0 = sigma0(s0);
490 s1 = X[(i+14)&0x0f]; s1 = sigma1(s1);
491
492 T1 = X[i&0xf] += s0 + s1 + X[(i+9)&0xf];
493 T1 += h + Sigma1(e) + Ch(e,f,g) + K512[i];
494 T2 = Sigma0(a) + Maj(a,b,c);
495 h = g; g = f; f = e; e = d + T1;
496 d = c; c = b; b = a; a = T1 + T2;
497 }
498
499 ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
500 ctx->h[4] += e; ctx->h[5] += f; ctx->h[6] += g; ctx->h[7] += h;
501
502 W+=SHA_LBLOCK;
503 }
504 }
505
506#else
507
508#define ROUND_00_15(i,a,b,c,d,e,f,g,h) do { \
509 T1 += h + Sigma1(e) + Ch(e,f,g) + K512[i]; \
510 h = Sigma0(a) + Maj(a,b,c); \
511 d += T1; h += T1; } while (0)
512
513#define ROUND_16_80(i,j,a,b,c,d,e,f,g,h,X) do { \
514 s0 = X[(j+1)&0x0f]; s0 = sigma0(s0); \
515 s1 = X[(j+14)&0x0f]; s1 = sigma1(s1); \
516 T1 = X[(j)&0x0f] += s0 + s1 + X[(j+9)&0x0f]; \
517 ROUND_00_15(i+j,a,b,c,d,e,f,g,h); } while (0)
518
519static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num)
520 {
521 const SHA_LONG64 *W=in;
522 SHA_LONG64 a,b,c,d,e,f,g,h,s0,s1,T1;
523 SHA_LONG64 X[16];
524 int i;
525
526 while (num--) {
527
528 a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
529 e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7];
530
531#ifdef B_ENDIAN
532 T1 = X[0] = W[0]; ROUND_00_15(0,a,b,c,d,e,f,g,h);
533 T1 = X[1] = W[1]; ROUND_00_15(1,h,a,b,c,d,e,f,g);
534 T1 = X[2] = W[2]; ROUND_00_15(2,g,h,a,b,c,d,e,f);
535 T1 = X[3] = W[3]; ROUND_00_15(3,f,g,h,a,b,c,d,e);
536 T1 = X[4] = W[4]; ROUND_00_15(4,e,f,g,h,a,b,c,d);
537 T1 = X[5] = W[5]; ROUND_00_15(5,d,e,f,g,h,a,b,c);
538 T1 = X[6] = W[6]; ROUND_00_15(6,c,d,e,f,g,h,a,b);
539 T1 = X[7] = W[7]; ROUND_00_15(7,b,c,d,e,f,g,h,a);
540 T1 = X[8] = W[8]; ROUND_00_15(8,a,b,c,d,e,f,g,h);
541 T1 = X[9] = W[9]; ROUND_00_15(9,h,a,b,c,d,e,f,g);
542 T1 = X[10] = W[10]; ROUND_00_15(10,g,h,a,b,c,d,e,f);
543 T1 = X[11] = W[11]; ROUND_00_15(11,f,g,h,a,b,c,d,e);
544 T1 = X[12] = W[12]; ROUND_00_15(12,e,f,g,h,a,b,c,d);
545 T1 = X[13] = W[13]; ROUND_00_15(13,d,e,f,g,h,a,b,c);
546 T1 = X[14] = W[14]; ROUND_00_15(14,c,d,e,f,g,h,a,b);
547 T1 = X[15] = W[15]; ROUND_00_15(15,b,c,d,e,f,g,h,a);
548#else
549 T1 = X[0] = PULL64(W[0]); ROUND_00_15(0,a,b,c,d,e,f,g,h);
550 T1 = X[1] = PULL64(W[1]); ROUND_00_15(1,h,a,b,c,d,e,f,g);
551 T1 = X[2] = PULL64(W[2]); ROUND_00_15(2,g,h,a,b,c,d,e,f);
552 T1 = X[3] = PULL64(W[3]); ROUND_00_15(3,f,g,h,a,b,c,d,e);
553 T1 = X[4] = PULL64(W[4]); ROUND_00_15(4,e,f,g,h,a,b,c,d);
554 T1 = X[5] = PULL64(W[5]); ROUND_00_15(5,d,e,f,g,h,a,b,c);
555 T1 = X[6] = PULL64(W[6]); ROUND_00_15(6,c,d,e,f,g,h,a,b);
556 T1 = X[7] = PULL64(W[7]); ROUND_00_15(7,b,c,d,e,f,g,h,a);
557 T1 = X[8] = PULL64(W[8]); ROUND_00_15(8,a,b,c,d,e,f,g,h);
558 T1 = X[9] = PULL64(W[9]); ROUND_00_15(9,h,a,b,c,d,e,f,g);
559 T1 = X[10] = PULL64(W[10]); ROUND_00_15(10,g,h,a,b,c,d,e,f);
560 T1 = X[11] = PULL64(W[11]); ROUND_00_15(11,f,g,h,a,b,c,d,e);
561 T1 = X[12] = PULL64(W[12]); ROUND_00_15(12,e,f,g,h,a,b,c,d);
562 T1 = X[13] = PULL64(W[13]); ROUND_00_15(13,d,e,f,g,h,a,b,c);
563 T1 = X[14] = PULL64(W[14]); ROUND_00_15(14,c,d,e,f,g,h,a,b);
564 T1 = X[15] = PULL64(W[15]); ROUND_00_15(15,b,c,d,e,f,g,h,a);
565#endif
566
567 for (i=16;i<80;i+=16)
568 {
569 ROUND_16_80(i, 0,a,b,c,d,e,f,g,h,X);
570 ROUND_16_80(i, 1,h,a,b,c,d,e,f,g,X);
571 ROUND_16_80(i, 2,g,h,a,b,c,d,e,f,X);
572 ROUND_16_80(i, 3,f,g,h,a,b,c,d,e,X);
573 ROUND_16_80(i, 4,e,f,g,h,a,b,c,d,X);
574 ROUND_16_80(i, 5,d,e,f,g,h,a,b,c,X);
575 ROUND_16_80(i, 6,c,d,e,f,g,h,a,b,X);
576 ROUND_16_80(i, 7,b,c,d,e,f,g,h,a,X);
577 ROUND_16_80(i, 8,a,b,c,d,e,f,g,h,X);
578 ROUND_16_80(i, 9,h,a,b,c,d,e,f,g,X);
579 ROUND_16_80(i,10,g,h,a,b,c,d,e,f,X);
580 ROUND_16_80(i,11,f,g,h,a,b,c,d,e,X);
581 ROUND_16_80(i,12,e,f,g,h,a,b,c,d,X);
582 ROUND_16_80(i,13,d,e,f,g,h,a,b,c,X);
583 ROUND_16_80(i,14,c,d,e,f,g,h,a,b,X);
584 ROUND_16_80(i,15,b,c,d,e,f,g,h,a,X);
585 }
586
587 ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
588 ctx->h[4] += e; ctx->h[5] += f; ctx->h[6] += g; ctx->h[7] += h;
589
590 W+=SHA_LBLOCK;
591 }
592 }
593
594#endif
595
596#endif /* SHA512_ASM */
597
598#else /* !OPENSSL_NO_SHA512 */
599
600#if defined(PEDANTIC) || defined(__DECC) || defined(OPENSSL_SYS_MACOSX)
601static void *dummy=&dummy;
602#endif
603
604#endif /* !OPENSSL_NO_SHA512 */
diff --git a/src/lib/libcrypto/sha/sha_locl.h b/src/lib/libcrypto/sha/sha_locl.h
deleted file mode 100644
index 7a0c3ca8d8..0000000000
--- a/src/lib/libcrypto/sha/sha_locl.h
+++ /dev/null
@@ -1,441 +0,0 @@
1/* crypto/sha/sha_locl.h */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscape's SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are adhered to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the routines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publicly available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdlib.h>
60#include <string.h>
61
62#include <openssl/opensslconf.h>
63#include <openssl/sha.h>
64
65#define DATA_ORDER_IS_BIG_ENDIAN
66
67#define HASH_LONG SHA_LONG
68#define HASH_CTX SHA_CTX
69#define HASH_CBLOCK SHA_CBLOCK
70#define HASH_MAKE_STRING(c,s) do { \
71 unsigned long ll; \
72 ll=(c)->h0; HOST_l2c(ll,(s)); \
73 ll=(c)->h1; HOST_l2c(ll,(s)); \
74 ll=(c)->h2; HOST_l2c(ll,(s)); \
75 ll=(c)->h3; HOST_l2c(ll,(s)); \
76 ll=(c)->h4; HOST_l2c(ll,(s)); \
77 } while (0)
78
79#if defined(SHA_0)
80
81# define HASH_UPDATE SHA_Update
82# define HASH_TRANSFORM SHA_Transform
83# define HASH_FINAL SHA_Final
84# define HASH_INIT SHA_Init
85# define HASH_BLOCK_DATA_ORDER sha_block_data_order
86# define Xupdate(a,ix,ia,ib,ic,id) (ix=(a)=(ia^ib^ic^id))
87
88static void sha_block_data_order (SHA_CTX *c, const void *p,size_t num);
89
90#elif defined(SHA_1)
91
92# define HASH_UPDATE SHA1_Update
93# define HASH_TRANSFORM SHA1_Transform
94# define HASH_FINAL SHA1_Final
95# define HASH_INIT SHA1_Init
96# define HASH_BLOCK_DATA_ORDER sha1_block_data_order
97# if defined(__MWERKS__) && defined(__MC68K__)
98 /* Metrowerks for Motorola fails otherwise:-( <appro@fy.chalmers.se> */
99# define Xupdate(a,ix,ia,ib,ic,id) do { (a)=(ia^ib^ic^id); \
100 ix=(a)=ROTATE((a),1); \
101 } while (0)
102# else
103# define Xupdate(a,ix,ia,ib,ic,id) ( (a)=(ia^ib^ic^id), \
104 ix=(a)=ROTATE((a),1) \
105 )
106# endif
107
108#ifndef SHA1_ASM
109static
110#endif
111void sha1_block_data_order (SHA_CTX *c, const void *p,size_t num);
112
113#else
114# error "Either SHA_0 or SHA_1 must be defined."
115#endif
116
117#include "md32_common.h"
118
119#define INIT_DATA_h0 0x67452301UL
120#define INIT_DATA_h1 0xefcdab89UL
121#define INIT_DATA_h2 0x98badcfeUL
122#define INIT_DATA_h3 0x10325476UL
123#define INIT_DATA_h4 0xc3d2e1f0UL
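
(These are the initial chaining values specified in FIPS 180 for both SHA-0
and SHA-1; the first four words follow the familiar MD4/MD5 pattern.)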
124
125#ifdef SHA_0
126fips_md_init(SHA)
127#else
128fips_md_init_ctx(SHA1, SHA)
129#endif
130 {
131 memset (c,0,sizeof(*c));
132 c->h0=INIT_DATA_h0;
133 c->h1=INIT_DATA_h1;
134 c->h2=INIT_DATA_h2;
135 c->h3=INIT_DATA_h3;
136 c->h4=INIT_DATA_h4;
137 return 1;
138 }
139
140#define K_00_19 0x5a827999UL
141#define K_20_39 0x6ed9eba1UL
142#define K_40_59 0x8f1bbcdcUL
143#define K_60_79 0xca62c1d6UL
144
145/* As pointed out by Wei Dai <weidai@eskimo.com>, F() below can be
146 * simplified to the code in F_00_19. Wei attributes these optimisations
147 * to Peter Gutmann's SHS code, which in turn credits Rich Schroeppel.
148 * #define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
149 * I've just become aware of another tweak to be made, again from Wei Dai,
150 * in F_40_59: (x&a)|(y&a) -> (x|y)&a
151 */
152#define F_00_19(b,c,d) ((((c) ^ (d)) & (b)) ^ (d))
153#define F_20_39(b,c,d) ((b) ^ (c) ^ (d))
154#define F_40_59(b,c,d) (((b) & (c)) | (((b)|(c)) & (d)))
155#define F_60_79(b,c,d) F_20_39(b,c,d)
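
Both simplifications are plain Boolean identities, easy to verify exhaustively
over a single bit position; a throwaway sketch:

	#include <assert.h>

	int
	main(void)
	{
		unsigned b, c, d;

		for (b = 0; b < 2; b++)
			for (c = 0; c < 2; c++)
				for (d = 0; d < 2; d++) {
					/* original F vs F_00_19 */
					assert(((b & c) | (~b & d & 1)) ==
					    (((c ^ d) & b) ^ d));
					/* majority vs F_40_59 */
					assert(((b & c) | (b & d) | (c & d)) ==
					    ((b & c) | ((b | c) & d)));
				}
		return 0;
	}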
156
157#ifndef OPENSSL_SMALL_FOOTPRINT
158
159#define BODY_00_15(i,a,b,c,d,e,f,xi) \
160 (f)=xi+(e)+K_00_19+ROTATE((a),5)+F_00_19((b),(c),(d)); \
161 (b)=ROTATE((b),30);
162
163#define BODY_16_19(i,a,b,c,d,e,f,xi,xa,xb,xc,xd) \
164 Xupdate(f,xi,xa,xb,xc,xd); \
165 (f)+=(e)+K_00_19+ROTATE((a),5)+F_00_19((b),(c),(d)); \
166 (b)=ROTATE((b),30);
167
168#define BODY_20_31(i,a,b,c,d,e,f,xi,xa,xb,xc,xd) \
169 Xupdate(f,xi,xa,xb,xc,xd); \
170 (f)+=(e)+K_20_39+ROTATE((a),5)+F_20_39((b),(c),(d)); \
171 (b)=ROTATE((b),30);
172
173#define BODY_32_39(i,a,b,c,d,e,f,xa,xb,xc,xd) \
174 Xupdate(f,xa,xa,xb,xc,xd); \
175 (f)+=(e)+K_20_39+ROTATE((a),5)+F_20_39((b),(c),(d)); \
176 (b)=ROTATE((b),30);
177
178#define BODY_40_59(i,a,b,c,d,e,f,xa,xb,xc,xd) \
179 Xupdate(f,xa,xa,xb,xc,xd); \
180 (f)+=(e)+K_40_59+ROTATE((a),5)+F_40_59((b),(c),(d)); \
181 (b)=ROTATE((b),30);
182
183#define BODY_60_79(i,a,b,c,d,e,f,xa,xb,xc,xd) \
184 Xupdate(f,xa,xa,xb,xc,xd); \
185 (f)=xa+(e)+K_60_79+ROTATE((a),5)+F_60_79((b),(c),(d)); \
186 (b)=ROTATE((b),30);
187
188#ifdef X
189#undef X
190#endif
191#ifndef MD32_XARRAY
192	/*
193	 * Originally X was an array. As it is automatic, it is natural to
194	 * expect a RISC compiler to accommodate at least part of it in the
195	 * register bank, isn't it? Unfortunately not all compilers "find"
196	 * this expectation reasonable:-( In order to make such compilers
197	 * generate better code I replace X[] with a bunch of scalars,
198	 * X0, X1, etc. See the function body below...
199	 * <appro@fy.chalmers.se>
200	 */
201# define X(i) XX##i
202#else
203	/*
204	 * However! Some compilers (most notably HP C) get overwhelmed by
205	 * that many local variables, so we need a way to fall back to the
206	 * original array behavior.
207	 */
208# define X(i) XX[i]
209#endif
210
211#if !defined(SHA_1) || !defined(SHA1_ASM)
212static void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num)
213 {
214 const unsigned char *data=p;
215 register unsigned MD32_REG_T A,B,C,D,E,T,l;
216#ifndef MD32_XARRAY
217 unsigned MD32_REG_T XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7,
218 XX8, XX9,XX10,XX11,XX12,XX13,XX14,XX15;
219#else
220 SHA_LONG XX[16];
221#endif
222
223 A=c->h0;
224 B=c->h1;
225 C=c->h2;
226 D=c->h3;
227 E=c->h4;
228
229 for (;;)
230 {
231 const union { long one; char little; } is_endian = {1};
232
233 if (!is_endian.little && sizeof(SHA_LONG)==4 && ((size_t)p%4)==0)
234 {
235 const SHA_LONG *W=(const SHA_LONG *)data;
236
237 X( 0) = W[0]; X( 1) = W[ 1];
238 BODY_00_15( 0,A,B,C,D,E,T,X( 0)); X( 2) = W[ 2];
239 BODY_00_15( 1,T,A,B,C,D,E,X( 1)); X( 3) = W[ 3];
240 BODY_00_15( 2,E,T,A,B,C,D,X( 2)); X( 4) = W[ 4];
241 BODY_00_15( 3,D,E,T,A,B,C,X( 3)); X( 5) = W[ 5];
242 BODY_00_15( 4,C,D,E,T,A,B,X( 4)); X( 6) = W[ 6];
243 BODY_00_15( 5,B,C,D,E,T,A,X( 5)); X( 7) = W[ 7];
244 BODY_00_15( 6,A,B,C,D,E,T,X( 6)); X( 8) = W[ 8];
245 BODY_00_15( 7,T,A,B,C,D,E,X( 7)); X( 9) = W[ 9];
246 BODY_00_15( 8,E,T,A,B,C,D,X( 8)); X(10) = W[10];
247 BODY_00_15( 9,D,E,T,A,B,C,X( 9)); X(11) = W[11];
248 BODY_00_15(10,C,D,E,T,A,B,X(10)); X(12) = W[12];
249 BODY_00_15(11,B,C,D,E,T,A,X(11)); X(13) = W[13];
250 BODY_00_15(12,A,B,C,D,E,T,X(12)); X(14) = W[14];
251 BODY_00_15(13,T,A,B,C,D,E,X(13)); X(15) = W[15];
252 BODY_00_15(14,E,T,A,B,C,D,X(14));
253 BODY_00_15(15,D,E,T,A,B,C,X(15));
254
255 data += SHA_CBLOCK;
256 }
257 else
258 {
259 HOST_c2l(data,l); X( 0)=l; HOST_c2l(data,l); X( 1)=l;
260 BODY_00_15( 0,A,B,C,D,E,T,X( 0)); HOST_c2l(data,l); X( 2)=l;
261 BODY_00_15( 1,T,A,B,C,D,E,X( 1)); HOST_c2l(data,l); X( 3)=l;
262 BODY_00_15( 2,E,T,A,B,C,D,X( 2)); HOST_c2l(data,l); X( 4)=l;
263 BODY_00_15( 3,D,E,T,A,B,C,X( 3)); HOST_c2l(data,l); X( 5)=l;
264 BODY_00_15( 4,C,D,E,T,A,B,X( 4)); HOST_c2l(data,l); X( 6)=l;
265 BODY_00_15( 5,B,C,D,E,T,A,X( 5)); HOST_c2l(data,l); X( 7)=l;
266 BODY_00_15( 6,A,B,C,D,E,T,X( 6)); HOST_c2l(data,l); X( 8)=l;
267 BODY_00_15( 7,T,A,B,C,D,E,X( 7)); HOST_c2l(data,l); X( 9)=l;
268 BODY_00_15( 8,E,T,A,B,C,D,X( 8)); HOST_c2l(data,l); X(10)=l;
269 BODY_00_15( 9,D,E,T,A,B,C,X( 9)); HOST_c2l(data,l); X(11)=l;
270 BODY_00_15(10,C,D,E,T,A,B,X(10)); HOST_c2l(data,l); X(12)=l;
271 BODY_00_15(11,B,C,D,E,T,A,X(11)); HOST_c2l(data,l); X(13)=l;
272 BODY_00_15(12,A,B,C,D,E,T,X(12)); HOST_c2l(data,l); X(14)=l;
273 BODY_00_15(13,T,A,B,C,D,E,X(13)); HOST_c2l(data,l); X(15)=l;
274 BODY_00_15(14,E,T,A,B,C,D,X(14));
275 BODY_00_15(15,D,E,T,A,B,C,X(15));
276 }
277
278 BODY_16_19(16,C,D,E,T,A,B,X( 0),X( 0),X( 2),X( 8),X(13));
279 BODY_16_19(17,B,C,D,E,T,A,X( 1),X( 1),X( 3),X( 9),X(14));
280 BODY_16_19(18,A,B,C,D,E,T,X( 2),X( 2),X( 4),X(10),X(15));
281 BODY_16_19(19,T,A,B,C,D,E,X( 3),X( 3),X( 5),X(11),X( 0));
282
283 BODY_20_31(20,E,T,A,B,C,D,X( 4),X( 4),X( 6),X(12),X( 1));
284 BODY_20_31(21,D,E,T,A,B,C,X( 5),X( 5),X( 7),X(13),X( 2));
285 BODY_20_31(22,C,D,E,T,A,B,X( 6),X( 6),X( 8),X(14),X( 3));
286 BODY_20_31(23,B,C,D,E,T,A,X( 7),X( 7),X( 9),X(15),X( 4));
287 BODY_20_31(24,A,B,C,D,E,T,X( 8),X( 8),X(10),X( 0),X( 5));
288 BODY_20_31(25,T,A,B,C,D,E,X( 9),X( 9),X(11),X( 1),X( 6));
289 BODY_20_31(26,E,T,A,B,C,D,X(10),X(10),X(12),X( 2),X( 7));
290 BODY_20_31(27,D,E,T,A,B,C,X(11),X(11),X(13),X( 3),X( 8));
291 BODY_20_31(28,C,D,E,T,A,B,X(12),X(12),X(14),X( 4),X( 9));
292 BODY_20_31(29,B,C,D,E,T,A,X(13),X(13),X(15),X( 5),X(10));
293 BODY_20_31(30,A,B,C,D,E,T,X(14),X(14),X( 0),X( 6),X(11));
294 BODY_20_31(31,T,A,B,C,D,E,X(15),X(15),X( 1),X( 7),X(12));
295
296 BODY_32_39(32,E,T,A,B,C,D,X( 0),X( 2),X( 8),X(13));
297 BODY_32_39(33,D,E,T,A,B,C,X( 1),X( 3),X( 9),X(14));
298 BODY_32_39(34,C,D,E,T,A,B,X( 2),X( 4),X(10),X(15));
299 BODY_32_39(35,B,C,D,E,T,A,X( 3),X( 5),X(11),X( 0));
300 BODY_32_39(36,A,B,C,D,E,T,X( 4),X( 6),X(12),X( 1));
301 BODY_32_39(37,T,A,B,C,D,E,X( 5),X( 7),X(13),X( 2));
302 BODY_32_39(38,E,T,A,B,C,D,X( 6),X( 8),X(14),X( 3));
303 BODY_32_39(39,D,E,T,A,B,C,X( 7),X( 9),X(15),X( 4));
304
305 BODY_40_59(40,C,D,E,T,A,B,X( 8),X(10),X( 0),X( 5));
306 BODY_40_59(41,B,C,D,E,T,A,X( 9),X(11),X( 1),X( 6));
307 BODY_40_59(42,A,B,C,D,E,T,X(10),X(12),X( 2),X( 7));
308 BODY_40_59(43,T,A,B,C,D,E,X(11),X(13),X( 3),X( 8));
309 BODY_40_59(44,E,T,A,B,C,D,X(12),X(14),X( 4),X( 9));
310 BODY_40_59(45,D,E,T,A,B,C,X(13),X(15),X( 5),X(10));
311 BODY_40_59(46,C,D,E,T,A,B,X(14),X( 0),X( 6),X(11));
312 BODY_40_59(47,B,C,D,E,T,A,X(15),X( 1),X( 7),X(12));
313 BODY_40_59(48,A,B,C,D,E,T,X( 0),X( 2),X( 8),X(13));
314 BODY_40_59(49,T,A,B,C,D,E,X( 1),X( 3),X( 9),X(14));
315 BODY_40_59(50,E,T,A,B,C,D,X( 2),X( 4),X(10),X(15));
316 BODY_40_59(51,D,E,T,A,B,C,X( 3),X( 5),X(11),X( 0));
317 BODY_40_59(52,C,D,E,T,A,B,X( 4),X( 6),X(12),X( 1));
318 BODY_40_59(53,B,C,D,E,T,A,X( 5),X( 7),X(13),X( 2));
319 BODY_40_59(54,A,B,C,D,E,T,X( 6),X( 8),X(14),X( 3));
320 BODY_40_59(55,T,A,B,C,D,E,X( 7),X( 9),X(15),X( 4));
321 BODY_40_59(56,E,T,A,B,C,D,X( 8),X(10),X( 0),X( 5));
322 BODY_40_59(57,D,E,T,A,B,C,X( 9),X(11),X( 1),X( 6));
323 BODY_40_59(58,C,D,E,T,A,B,X(10),X(12),X( 2),X( 7));
324 BODY_40_59(59,B,C,D,E,T,A,X(11),X(13),X( 3),X( 8));
325
326 BODY_60_79(60,A,B,C,D,E,T,X(12),X(14),X( 4),X( 9));
327 BODY_60_79(61,T,A,B,C,D,E,X(13),X(15),X( 5),X(10));
328 BODY_60_79(62,E,T,A,B,C,D,X(14),X( 0),X( 6),X(11));
329 BODY_60_79(63,D,E,T,A,B,C,X(15),X( 1),X( 7),X(12));
330 BODY_60_79(64,C,D,E,T,A,B,X( 0),X( 2),X( 8),X(13));
331 BODY_60_79(65,B,C,D,E,T,A,X( 1),X( 3),X( 9),X(14));
332 BODY_60_79(66,A,B,C,D,E,T,X( 2),X( 4),X(10),X(15));
333 BODY_60_79(67,T,A,B,C,D,E,X( 3),X( 5),X(11),X( 0));
334 BODY_60_79(68,E,T,A,B,C,D,X( 4),X( 6),X(12),X( 1));
335 BODY_60_79(69,D,E,T,A,B,C,X( 5),X( 7),X(13),X( 2));
336 BODY_60_79(70,C,D,E,T,A,B,X( 6),X( 8),X(14),X( 3));
337 BODY_60_79(71,B,C,D,E,T,A,X( 7),X( 9),X(15),X( 4));
338 BODY_60_79(72,A,B,C,D,E,T,X( 8),X(10),X( 0),X( 5));
339 BODY_60_79(73,T,A,B,C,D,E,X( 9),X(11),X( 1),X( 6));
340 BODY_60_79(74,E,T,A,B,C,D,X(10),X(12),X( 2),X( 7));
341 BODY_60_79(75,D,E,T,A,B,C,X(11),X(13),X( 3),X( 8));
342 BODY_60_79(76,C,D,E,T,A,B,X(12),X(14),X( 4),X( 9));
343 BODY_60_79(77,B,C,D,E,T,A,X(13),X(15),X( 5),X(10));
344 BODY_60_79(78,A,B,C,D,E,T,X(14),X( 0),X( 6),X(11));
345 BODY_60_79(79,T,A,B,C,D,E,X(15),X( 1),X( 7),X(12));
346
347 c->h0=(c->h0+E)&0xffffffffL;
348 c->h1=(c->h1+T)&0xffffffffL;
349 c->h2=(c->h2+A)&0xffffffffL;
350 c->h3=(c->h3+B)&0xffffffffL;
351 c->h4=(c->h4+C)&0xffffffffL;
352
353 if (--num == 0) break;
354
355 A=c->h0;
356 B=c->h1;
357 C=c->h2;
358 D=c->h3;
359 E=c->h4;
360
361 }
362 }
363#endif
364
365#else /* OPENSSL_SMALL_FOOTPRINT */
366
367#define BODY_00_15(xi) do { \
368 T=E+K_00_19+F_00_19(B,C,D); \
369 E=D, D=C, C=ROTATE(B,30), B=A; \
370 A=ROTATE(A,5)+T+xi; } while(0)
371
372#define BODY_16_19(xa,xb,xc,xd) do { \
373 Xupdate(T,xa,xa,xb,xc,xd); \
374 T+=E+K_00_19+F_00_19(B,C,D); \
375 E=D, D=C, C=ROTATE(B,30), B=A; \
376 A=ROTATE(A,5)+T; } while(0)
377
378#define BODY_20_39(xa,xb,xc,xd) do { \
379 Xupdate(T,xa,xa,xb,xc,xd); \
380 T+=E+K_20_39+F_20_39(B,C,D); \
381 E=D, D=C, C=ROTATE(B,30), B=A; \
382 A=ROTATE(A,5)+T; } while(0)
383
384#define BODY_40_59(xa,xb,xc,xd) do { \
385 Xupdate(T,xa,xa,xb,xc,xd); \
386 T+=E+K_40_59+F_40_59(B,C,D); \
387 E=D, D=C, C=ROTATE(B,30), B=A; \
388 A=ROTATE(A,5)+T; } while(0)
389
390#define BODY_60_79(xa,xb,xc,xd) do { \
391 Xupdate(T,xa,xa,xb,xc,xd); \
392 T=E+K_60_79+F_60_79(B,C,D); \
393 E=D, D=C, C=ROTATE(B,30), B=A; \
394 A=ROTATE(A,5)+T+xa; } while(0)
395
396#if !defined(SHA_1) || !defined(SHA1_ASM)
397static void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num)
398 {
399 const unsigned char *data=p;
400 register unsigned MD32_REG_T A,B,C,D,E,T,l;
401 int i;
402 SHA_LONG X[16];
403
404 A=c->h0;
405 B=c->h1;
406 C=c->h2;
407 D=c->h3;
408 E=c->h4;
409
410 for (;;)
411 {
412 for (i=0;i<16;i++)
413 { HOST_c2l(data,l); X[i]=l; BODY_00_15(X[i]); }
414 for (i=0;i<4;i++)
415 { BODY_16_19(X[i], X[i+2], X[i+8], X[(i+13)&15]); }
416 for (;i<24;i++)
417 { BODY_20_39(X[i&15], X[(i+2)&15], X[(i+8)&15],X[(i+13)&15]); }
418 for (i=0;i<20;i++)
419 { BODY_40_59(X[(i+8)&15],X[(i+10)&15],X[i&15], X[(i+5)&15]); }
420 for (i=4;i<24;i++)
421 { BODY_60_79(X[(i+8)&15],X[(i+10)&15],X[i&15], X[(i+5)&15]); }
422
423 c->h0=(c->h0+A)&0xffffffffL;
424 c->h1=(c->h1+B)&0xffffffffL;
425 c->h2=(c->h2+C)&0xffffffffL;
426 c->h3=(c->h3+D)&0xffffffffL;
427 c->h4=(c->h4+E)&0xffffffffL;
428
429 if (--num == 0) break;
430
431 A=c->h0;
432 B=c->h1;
433 C=c->h2;
434 D=c->h3;
435 E=c->h4;
436
437 }
438 }
439#endif
440
441#endif