author    cvs2svn <admin@example.com>  2025-04-14 17:32:06 +0000
committer cvs2svn <admin@example.com>  2025-04-14 17:32:06 +0000
commit    eb8dd9dca1228af0cd132f515509051ecfabf6f6 (patch)
tree      edb6da6af7e865d488dc1a29309f1e1ec226e603 /src/lib/libcrypto/sha
parent    247f0352e0ed72a4f476db9dc91f4d982bc83eb2 (diff)
This commit was manufactured by cvs2git to create tag 'tb_20250414'.
Diffstat (limited to 'src/lib/libcrypto/sha')
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-586.pl           1223
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-alpha.pl          316
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-armv4-large.pl    248
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-mips.pl           350
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-parisc.pl         258
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha1-ppc.pl            318
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-sparcv9.pl        282
-rw-r--r--  src/lib/libcrypto/sha/asm/sha256-586.pl          249
-rw-r--r--  src/lib/libcrypto/sha/asm/sha256-armv4.pl        211
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-586.pl          646
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-armv4.pl        582
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-mips.pl         457
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-parisc.pl       801
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-ppc.pl          444
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-sparcv9.pl      604
-rw-r--r--  src/lib/libcrypto/sha/sha.h                      190
-rw-r--r--  src/lib/libcrypto/sha/sha1.c                     518
-rw-r--r--  src/lib/libcrypto/sha/sha1_amd64.c                34
-rw-r--r--  src/lib/libcrypto/sha/sha1_amd64_generic.S       314
-rw-r--r--  src/lib/libcrypto/sha/sha1_amd64_shani.S         170
-rw-r--r--  src/lib/libcrypto/sha/sha256.c                   496
-rw-r--r--  src/lib/libcrypto/sha/sha256_aarch64.c            34
-rw-r--r--  src/lib/libcrypto/sha/sha256_aarch64_ce.S        189
-rw-r--r--  src/lib/libcrypto/sha/sha256_amd64.c              34
-rw-r--r--  src/lib/libcrypto/sha/sha256_amd64_generic.S     302
-rw-r--r--  src/lib/libcrypto/sha/sha256_amd64_shani.S       209
-rw-r--r--  src/lib/libcrypto/sha/sha3.c                     172
-rw-r--r--  src/lib/libcrypto/sha/sha3_internal.h             81
-rw-r--r--  src/lib/libcrypto/sha/sha512.c                   578
-rw-r--r--  src/lib/libcrypto/sha/sha512_aarch64.c            34
-rw-r--r--  src/lib/libcrypto/sha/sha512_aarch64_ce.S        312
-rw-r--r--  src/lib/libcrypto/sha/sha512_amd64.c              26
-rw-r--r--  src/lib/libcrypto/sha/sha512_amd64_generic.S     307
-rw-r--r--  src/lib/libcrypto/sha/sha_internal.h              36
34 files changed, 0 insertions, 11025 deletions
diff --git a/src/lib/libcrypto/sha/asm/sha1-586.pl b/src/lib/libcrypto/sha/asm/sha1-586.pl
deleted file mode 100644
index 5928e083c1..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-586.pl
+++ /dev/null
@@ -1,1223 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# "[Re]written" was achieved in two major overhauls. In 2004 BODY_*
11# functions were re-implemented to address P4 performance issue [see
12# commentary below], and in 2006 the rest was rewritten in order to
13# gain freedom to liberate licensing terms.
14
15# January, September 2004.
16#
17# It was noted that the Intel IA-32 C compiler generates code which
18# performs ~30% *faster* on the P4 CPU than the original *hand-coded*
19# SHA1 assembler implementation. To address this problem (and
20# prove that humans are still better than machines:-), the
21# original code was overhauled, which resulted in the following
22# performance changes:
23#
24# compared with original compared with Intel cc
25# assembler impl. generated code
26# Pentium -16% +48%
27# PIII/AMD +8% +16%
28# P4 +85%(!) +45%
29#
30# As you can see, Pentium came out as the loser:-( Yet I reckoned that
31# the improvement on P4 outweighed the loss and incorporated this
32# re-tuned code into 0.9.7 and later.
33# ----------------------------------------------------------------
34# <appro@fy.chalmers.se>
35
36# August 2009.
37#
38# George Spelvin pointed out that F_40_59(b,c,d) can be rewritten as
39# '(c&d) + (b&(c^d))', which makes it possible to accumulate partial results
40# and lighten "pressure" on scratch registers. This resulted in
41# >12% performance improvement on contemporary AMD cores (with no
42# degradation on other CPUs:-). Also, the code was revised to maximize
43# "distance" between instructions producing input to 'lea' instruction
44# and the 'lea' instruction itself, which is essential for Intel Atom
45# core and resulted in ~15% improvement.
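#
# Editor's sketch (not part of the original module): a quick standalone
# check of the identity above.  MAJ(b,c,d) = (b&c)|(b&d)|(c&d) equals
# (c&d) + (b&(c^d)) because c&d and b&(c^d) can never both be 1 in the
# same bit position, so the addition never carries -- which is what
# allows the partial results to accumulate safely.
for (1..1000) {
	my ($b, $c, $d) = map { int(rand(2**32)) } 1..3;
	my $maj = ($b & $c) | ($b & $d) | ($c & $d);
	my $acc = (($c & $d) + ($b & ($c ^ $d))) & 0xffffffff;
	die "identity mismatch" if $maj != $acc;
}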
46
47# October 2010.
48#
49# Add an SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
50# is to offload the message schedule, denoted by Wt in the NIST specification
51# or Xupdate in the OpenSSL source, to the SIMD unit. The idea is not novel,
52# and in the SSE2 context was first explored by Dean Gaudet in 2004, see
53# http://arctic.org/~dean/crypto/sha1.html. Since then several things
54# have changed that made it interesting again:
55#
56# a) XMM units became faster and wider;
57# b) instruction set became more versatile;
58# c) an important observation was made by Max Locktyukhin, which made
59#    it possible to reduce the number of instructions required to perform
60# the operation in question, for further details see
61# http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/.
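#
# Editor's sketch (not part of the original module): the message
# schedule being offloaded is the standard SHA-1 recurrence
# W[t] = ROTL1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]) for t = 16..79,
# which in plain scalar Perl reads:
sub Xupdate_scalar {
	my @W = @_;			# 16 seed words W[0..15]
	for my $t (16 .. 79) {
		my $x = $W[$t-3] ^ $W[$t-8] ^ $W[$t-14] ^ $W[$t-16];
		$W[$t] = (($x << 1) | ($x >> 31)) & 0xffffffff;
	}
	return @W;
}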
62
63# April 2011.
64#
65# Add an AVX code path, probably the most controversial... The thing is
66# that the switch to AVX alone improves performance by as little as 4% in
67# comparison to the SSSE3 code path. But the result below doesn't look like
68# a 4% improvement... The trouble is that Sandy Bridge decodes 'ro[rl]' as a
69# pair of µ-ops, and it's the additional µ-ops, two per round, that
70# make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded
71# as a single µ-op by Sandy Bridge, and it's replacing 'ro[rl]' with the
72# equivalent 'sh[rl]d' that is responsible for the impressive 5.1
73# cycles per processed byte. But 'sh[rl]d' has not historically been
74# fast, nor does it appear to be fast on the upcoming Bulldozer
75# [according to its optimization manual]. Which is why the AVX code path
76# is guarded by *both* AVX and a synthetic bit denoting Intel CPUs.
77# One can argue that it's unfair to AMD, but without 'sh[rl]d' it
78# makes no sense to keep the AVX code path. If somebody feels that
79# strongly, it's probably more appropriate to discuss the possibility of
80# using the XOP vector rotate on AMD...
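#
# (Editor's note, not part of the original module: the 'sh[rl]d'
# substitution works because 'shld r,r,imm' with both operands being the
# same register is equivalent to 'rol r,imm' -- the bits shifted in from
# the "second" operand are the register's own top bits.)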
81
82######################################################################
83# Current performance is summarized in the following table. Numbers are
84# CPU clock cycles spent to process single byte (less is better).
85#
86# x86 SSSE3 AVX
87# Pentium 15.7 -
88# PIII 11.5 -
89# P4 10.6 -
90# AMD K8 7.1 -
91# Core2 7.3 6.1/+20% -
92# Atom 12.5 9.5(*)/+32% -
93# Westmere 7.3 5.6/+30% -
94# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+70%
95#
96# (*)	The loop is 1056 instructions long and the expected result is ~8.25.
97#	It remains a mystery [to me] why ILP is limited to 1.7.
98#
99# (**) As per above comment, the result is for AVX *plus* sh[rl]d.
100
101$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
102push(@INC,"${dir}","${dir}../../perlasm");
103require "x86asm.pl";
104
105&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");
106
107$xmm=$ymm=0;
108for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
109
110$ymm=1 if ($xmm &&
111 `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
112 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
113 $1>=2.19); # first version supporting AVX
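# (Editor's note, not part of the original module: the probe above runs
# the C compiler's assembler on empty input and matches its version
# banner, e.g. "GNU assembler version 2.19 ...", enabling the AVX path
# only for gas >= 2.19.)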
114
115&external_label("OPENSSL_ia32cap_P") if ($xmm);
116
117
118$A="eax";
119$B="ebx";
120$C="ecx";
121$D="edx";
122$E="edi";
123$T="esi";
124$tmp1="ebp";
125
126@V=($A,$B,$C,$D,$E,$T);
127
128$alt=0; # 1 denotes alternative IALU implementation, which performs
129 # 8% *worse* on P4, same on Westmere and Atom, 2% better on
130 # Sandy Bridge...
131
132sub BODY_00_15
133 {
134 local($n,$a,$b,$c,$d,$e,$f)=@_;
135
136 &comment("00_15 $n");
137
138 &mov($f,$c); # f to hold F_00_19(b,c,d)
139 if ($n==0) { &mov($tmp1,$a); }
140 else { &mov($a,$tmp1); }
141 &rotl($tmp1,5); # tmp1=ROTATE(a,5)
142 &xor($f,$d);
143 &add($tmp1,$e); # tmp1+=e;
144 &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded
145 # with xi, also note that e becomes
146 # f in next round...
147 &and($f,$b);
148 &rotr($b,2); # b=ROTATE(b,30)
149 &xor($f,$d); # f holds F_00_19(b,c,d)
150 &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi
151
152 if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round
153 &add($f,$tmp1); } # f+=tmp1
154 else { &add($tmp1,$f); } # f becomes a in next round
155 &mov($tmp1,$a) if ($alt && $n==15);
156 }
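# Editor's sketch (not part of the original module): the mov/xor/and/xor
# sequence above computes the "choose" function
# F_00_19(b,c,d) = (b&c)|(~b&d) in the form ((c^d)&b)^d, which avoids a
# separate NOT and an extra temporary.  A standalone check of the identity:
for (1..1000) {
	my ($b, $c, $d) = map { int(rand(2**32)) } 1..3;
	my $f1 = (($c ^ $d) & $b) ^ $d;
	my $f2 = ($b & $c) | (~$b & $d & 0xffffffff);
	die "identity mismatch" if $f1 != $f2;
}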
157
158sub BODY_16_19
159 {
160 local($n,$a,$b,$c,$d,$e,$f)=@_;
161
162 &comment("16_19 $n");
163
164if ($alt) {
165 &xor($c,$d);
166 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
167 &and($tmp1,$c); # tmp1 to hold F_00_19(b,c,d), b&=c^d
168 &xor($f,&swtmp(($n+8)%16));
169 &xor($tmp1,$d); # tmp1=F_00_19(b,c,d)
170 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
171 &rotl($f,1); # f=ROTATE(f,1)
172 &add($e,$tmp1); # e+=F_00_19(b,c,d)
173 &xor($c,$d); # restore $c
174 &mov($tmp1,$a); # b in next round
175 &rotr($b,$n==16?2:7); # b=ROTATE(b,30)
176 &mov(&swtmp($n%16),$f); # xi=f
177 &rotl($a,5); # ROTATE(a,5)
178 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
179 &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
180 &add($f,$a); # f+=ROTATE(a,5)
181} else {
182 &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d)
183 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
184 &xor($tmp1,$d);
185 &xor($f,&swtmp(($n+8)%16));
186 &and($tmp1,$b);
187 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
188 &rotl($f,1); # f=ROTATE(f,1)
189 &xor($tmp1,$d); # tmp1=F_00_19(b,c,d)
190 &add($e,$tmp1); # e+=F_00_19(b,c,d)
191 &mov($tmp1,$a);
192 &rotr($b,2); # b=ROTATE(b,30)
193 &mov(&swtmp($n%16),$f); # xi=f
194 &rotl($tmp1,5); # ROTATE(a,5)
195 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
196 &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
197 &add($f,$tmp1); # f+=ROTATE(a,5)
198}
199 }
200
201sub BODY_20_39
202 {
203 local($n,$a,$b,$c,$d,$e,$f)=@_;
204 local $K=($n<40)?0x6ed9eba1:0xca62c1d6;
205
206 &comment("20_39 $n");
207
208if ($alt) {
209 &xor($tmp1,$c); # tmp1 to hold F_20_39(b,c,d), b^=c
210 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
211 &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d)
212 &xor($f,&swtmp(($n+8)%16));
213 &add($e,$tmp1); # e+=F_20_39(b,c,d)
214 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
215 &rotl($f,1); # f=ROTATE(f,1)
216 &mov($tmp1,$a); # b in next round
217 &rotr($b,7); # b=ROTATE(b,30)
218 &mov(&swtmp($n%16),$f) if($n<77);# xi=f
219 &rotl($a,5); # ROTATE(a,5)
220 &xor($b,$c) if($n==39);# warm up for BODY_40_59
221 &and($tmp1,$b) if($n==39);
222 &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY
223 &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
224 &add($f,$a); # f+=ROTATE(a,5)
225 &rotr($a,5) if ($n==79);
226} else {
227 &mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d)
228 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
229 &xor($tmp1,$c);
230 &xor($f,&swtmp(($n+8)%16));
231 &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d)
232 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
233 &rotl($f,1); # f=ROTATE(f,1)
234 &add($e,$tmp1); # e+=F_20_39(b,c,d)
235 &rotr($b,2); # b=ROTATE(b,30)
236 &mov($tmp1,$a);
237 &rotl($tmp1,5); # ROTATE(a,5)
238 &mov(&swtmp($n%16),$f) if($n<77);# xi=f
239 &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY
240 &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
241 &add($f,$tmp1); # f+=ROTATE(a,5)
242}
243 }
244
245sub BODY_40_59
246 {
247 local($n,$a,$b,$c,$d,$e,$f)=@_;
248
249 &comment("40_59 $n");
250
251if ($alt) {
252 &add($e,$tmp1); # e+=b&(c^d)
253 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
254 &mov($tmp1,$d);
255 &xor($f,&swtmp(($n+8)%16));
256 &xor($c,$d); # restore $c
257 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
258 &rotl($f,1); # f=ROTATE(f,1)
259 &and($tmp1,$c);
260 &rotr($b,7); # b=ROTATE(b,30)
261 &add($e,$tmp1); # e+=c&d
262 &mov($tmp1,$a); # b in next round
263 &mov(&swtmp($n%16),$f); # xi=f
264 &rotl($a,5); # ROTATE(a,5)
265 &xor($b,$c) if ($n<59);
266 &and($tmp1,$b) if ($n<59);# tmp1 to hold F_40_59(b,c,d)
267 &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d))
268 &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
269 &add($f,$a); # f+=ROTATE(a,5)
270} else {
271 &mov($tmp1,$c); # tmp1 to hold F_40_59(b,c,d)
272 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
273 &xor($tmp1,$d);
274 &xor($f,&swtmp(($n+8)%16));
275 &and($tmp1,$b);
276 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
277 &rotl($f,1); # f=ROTATE(f,1)
278 &add($tmp1,$e); # b&(c^d)+=e
279 &rotr($b,2); # b=ROTATE(b,30)
280 &mov($e,$a); # e becomes volatile
281 &rotl($e,5); # ROTATE(a,5)
282 &mov(&swtmp($n%16),$f); # xi=f
283 &lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d))
284 &mov($tmp1,$c);
285 &add($f,$e); # f+=ROTATE(a,5)
286 &and($tmp1,$d);
287 &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
288 &add($f,$tmp1); # f+=c&d
289}
290 }
291
292&function_begin("sha1_block_data_order");
293if ($xmm) {
294 &static_label("ssse3_shortcut");
295 &static_label("avx_shortcut") if ($ymm);
296 &static_label("K_XX_XX");
297
298 &picsetup($tmp1);
299 &picsymbol($T, "OPENSSL_ia32cap_P", $tmp1);
300 &picsymbol($tmp1, &label("K_XX_XX"), $tmp1);
301
302 &mov ($A,&DWP(0,$T));
303 &mov ($D,&DWP(4,$T));
304 &test ($D,"\$IA32CAP_MASK1_SSSE3"); # check SSSE3 bit
305 &jz (&label("x86"));
306 &test ($A,"\$IA32CAP_MASK0_FXSR"); # check FXSR bit
307 &jz (&label("x86"));
308 if ($ymm) {
309 &and ($D,"\$IA32CAP_MASK1_AVX"); # mask AVX bit
310 &and ($A,"\$IA32CAP_MASK0_INTEL"); # mask "Intel CPU" bit
311 &or ($A,$D);
312 &cmp ($A,"\$(IA32CAP_MASK1_AVX | IA32CAP_MASK0_INTEL)");
313 &je (&label("avx_shortcut"));
314 }
315 &jmp (&label("ssse3_shortcut"));
316 &set_label("x86",16);
317}
318 &mov($tmp1,&wparam(0)); # SHA_CTX *c
319 &mov($T,&wparam(1)); # const void *input
320 &mov($A,&wparam(2)); # size_t num
321 &stack_push(16+3); # allocate X[16]
322 &shl($A,6);
323 &add($A,$T);
324 &mov(&wparam(2),$A); # pointer beyond the end of input
325 &mov($E,&DWP(16,$tmp1));# pre-load E
326 &jmp(&label("loop"));
327
328&set_label("loop",16);
329
330 # copy input chunk to X, but reversing byte order!
331 for ($i=0; $i<16; $i+=4)
332 {
333 &mov($A,&DWP(4*($i+0),$T));
334 &mov($B,&DWP(4*($i+1),$T));
335 &mov($C,&DWP(4*($i+2),$T));
336 &mov($D,&DWP(4*($i+3),$T));
337 &bswap($A);
338 &bswap($B);
339 &bswap($C);
340 &bswap($D);
341 &mov(&swtmp($i+0),$A);
342 &mov(&swtmp($i+1),$B);
343 &mov(&swtmp($i+2),$C);
344 &mov(&swtmp($i+3),$D);
345 }
346 &mov(&wparam(1),$T); # redundant in 1st spin
347
348 &mov($A,&DWP(0,$tmp1)); # load SHA_CTX
349 &mov($B,&DWP(4,$tmp1));
350 &mov($C,&DWP(8,$tmp1));
351 &mov($D,&DWP(12,$tmp1));
352 # E is pre-loaded
353
354 for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
355 for(;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
356 for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
357 for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
358 for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
359
360 (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check
361
362 &mov($tmp1,&wparam(0)); # re-load SHA_CTX*
363 &mov($D,&wparam(1)); # D is last "T" and is discarded
364
365 &add($E,&DWP(0,$tmp1)); # E is last "A"...
366 &add($T,&DWP(4,$tmp1));
367 &add($A,&DWP(8,$tmp1));
368 &add($B,&DWP(12,$tmp1));
369 &add($C,&DWP(16,$tmp1));
370
371 &mov(&DWP(0,$tmp1),$E); # update SHA_CTX
372 &add($D,64); # advance input pointer
373 &mov(&DWP(4,$tmp1),$T);
374 &cmp($D,&wparam(2)); # have we reached the end yet?
375 &mov(&DWP(8,$tmp1),$A);
376 &mov($E,$C); # C is last "E" which needs to be "pre-loaded"
377 &mov(&DWP(12,$tmp1),$B);
378 &mov($T,$D); # input pointer
379 &mov(&DWP(16,$tmp1),$C);
380 &jb(&label("loop"));
381
382 &stack_pop(16+3);
383&function_end("sha1_block_data_order");
384
385if ($xmm) {
386######################################################################
387# The SSSE3 implementation.
388#
389# %xmm[0-7] are used as a ring @X[] buffer containing quadruples of the
390# last 32 elements of the message schedule or Xupdate outputs. The first 4
391# quadruples are simply byte-swapped input, the next 4 are calculated
392# according to the method originally suggested by Dean Gaudet (modulo
393# being implemented in SSSE3). Once 8 quadruples or 32 elements are
394# collected, it switches to the routine proposed by Max Locktyukhin.
395#
396# Calculations inevitably require temporary registers, and there are
397# no %xmm registers left to spare. For this reason part of the ring
398# buffer, X[2..4] to be specific, is offloaded to a 3-quadruple ring
399# buffer on the stack. Keep in mind that X[2] is an alias for X[-6],
400# X[3] for X[-5], and X[4] for X[-4]...
401#
402# Another notable optimization is aggressive stack frame compression
403# aiming to minimize the number of 9-byte instructions...
404#
405# Yet another notable optimization is the "jumping" $B variable. It means
406# that no register is permanently allocated for the $B value. This
407# made it possible to eliminate one instruction from body_20_39...
408#
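# Editor's sketch (not part of the original module): subscripts like
# @X[-4&7] below map a logical sliding-window index onto the 8-entry
# ring, since Perl's bitwise AND treats the negative index as unsigned:
{
	my @demo = map("xmm$_", (4..7, 0..3));	# same seeding as @X below
	# -4&7 == 4 and -1&7 == 7, so this reports "xmm0" and "xmm3"
	warn "X[-4] -> $demo[-4&7], X[-1] -> $demo[-1&7]\n" if $ENV{XRING_DEMO};
}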
409my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded
410my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
411my @V=($A,$B,$C,$D,$E);
412my $j=0; # hash round
413my @T=($T,$tmp1);
414my $inp;
415
416my $_rol=sub { &rol(@_) };
417my $_ror=sub { &ror(@_) };
418
419&function_begin("_sha1_block_data_order_ssse3");
420 &picsetup($tmp1);
421 &picsymbol($tmp1, &label("K_XX_XX"), $tmp1);
422
423&set_label("ssse3_shortcut");
424
425 &movdqa (@X[3],&QWP(0,$tmp1)); # K_00_19
426 &movdqa (@X[4],&QWP(16,$tmp1)); # K_20_39
427 &movdqa (@X[5],&QWP(32,$tmp1)); # K_40_59
428 &movdqa (@X[6],&QWP(48,$tmp1)); # K_60_79
429 &movdqa (@X[2],&QWP(64,$tmp1)); # pbswap mask
430
431 &mov ($E,&wparam(0)); # load argument block
432 &mov ($inp=@T[1],&wparam(1));
433 &mov ($D,&wparam(2));
434 &mov (@T[0],"esp");
435
436 # stack frame layout
437 #
438 # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area
439 # X[4]+K X[5]+K X[6]+K X[7]+K
440 # X[8]+K X[9]+K X[10]+K X[11]+K
441 # X[12]+K X[13]+K X[14]+K X[15]+K
442 #
443 # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area
444 # X[4] X[5] X[6] X[7]
445 # X[8] X[9] X[10] X[11] # even borrowed for K_00_19
446 #
447 # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants
448 # K_40_59 K_40_59 K_40_59 K_40_59
449 # K_60_79 K_60_79 K_60_79 K_60_79
450 # K_00_19 K_00_19 K_00_19 K_00_19
451 # pbswap mask
452 #
453 # +192 ctx # argument block
454 # +196 inp
455 # +200 end
456 # +204 esp
457 &sub ("esp",208);
458 &and ("esp",-64);
459
460 &movdqa (&QWP(112+0,"esp"),@X[4]); # copy constants
461 &movdqa (&QWP(112+16,"esp"),@X[5]);
462 &movdqa (&QWP(112+32,"esp"),@X[6]);
463 &shl ($D,6); # len*64
464 &movdqa (&QWP(112+48,"esp"),@X[3]);
465 &add ($D,$inp); # end of input
466 &movdqa (&QWP(112+64,"esp"),@X[2]);
467 &add ($inp,64);
468 &mov (&DWP(192+0,"esp"),$E); # save argument block
469 &mov (&DWP(192+4,"esp"),$inp);
470 &mov (&DWP(192+8,"esp"),$D);
471 &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp
472
473 &mov ($A,&DWP(0,$E)); # load context
474 &mov ($B,&DWP(4,$E));
475 &mov ($C,&DWP(8,$E));
476 &mov ($D,&DWP(12,$E));
477 &mov ($E,&DWP(16,$E));
478 &mov (@T[0],$B); # magic seed
479
480 &movdqu (@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3]
481 &movdqu (@X[-3&7],&QWP(-48,$inp));
482 &movdqu (@X[-2&7],&QWP(-32,$inp));
483 &movdqu (@X[-1&7],&QWP(-16,$inp));
484 &pshufb (@X[-4&7],@X[2]); # byte swap
485 &pshufb (@X[-3&7],@X[2]);
486 &pshufb (@X[-2&7],@X[2]);
487 &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
488 &pshufb (@X[-1&7],@X[2]);
489 &paddd (@X[-4&7],@X[3]); # add K_00_19
490 &paddd (@X[-3&7],@X[3]);
491 &paddd (@X[-2&7],@X[3]);
492 &movdqa (&QWP(0,"esp"),@X[-4&7]); # X[]+K xfer to IALU
493 &psubd (@X[-4&7],@X[3]); # restore X[]
494 &movdqa (&QWP(0+16,"esp"),@X[-3&7]);
495 &psubd (@X[-3&7],@X[3]);
496 &movdqa (&QWP(0+32,"esp"),@X[-2&7]);
497 &psubd (@X[-2&7],@X[3]);
498 &movdqa (@X[0],@X[-3&7]);
499 &jmp (&label("loop"));
500
501######################################################################
502# The SSE instruction sequence is first broken into groups of independent
503# instructions, independent with respect to their inputs and shifter
504# (not all architectures have more than one). Then IALU instructions
505# are "knitted in" between the SSE groups. Distance is maintained for
506# an SSE latency of 2 in the hope that it better fits the upcoming AMD Bulldozer
507# [which allegedly also implements SSSE3]...
508#
509# Temporary register usage. X[2] is volatile on entry and at the
510# end is restored from the backtrace ring buffer. X[3] is expected to
511# contain the current K_XX_XX constant and is used to calculate X[-1]+K
512# from the previous round; it becomes volatile the moment the value is
513# saved to the stack for transfer to the IALU. X[4] becomes volatile whenever
514# X[-4] is accumulated and offloaded to the backtrace ring buffer; at the
515# end it is loaded with the next K_XX_XX [which becomes X[3] in the next
516# round]...
517#
518sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
519{ use integer;
520 my $body = shift;
521 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
522 my ($a,$b,$c,$d,$e);
523
524 eval(shift(@insns));
525 eval(shift(@insns));
526 &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
527 &movdqa (@X[2],@X[-1&7]);
528 eval(shift(@insns));
529 eval(shift(@insns));
530
531 &paddd (@X[3],@X[-1&7]);
532 &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
533 eval(shift(@insns));
534 eval(shift(@insns));
535 &psrldq (@X[2],4); # "X[-3]", 3 dwords
536 eval(shift(@insns));
537 eval(shift(@insns));
538 &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
539 eval(shift(@insns));
540 eval(shift(@insns));
541
542 &pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]"
543 eval(shift(@insns));
544 eval(shift(@insns));
545 eval(shift(@insns));
546 eval(shift(@insns));
547
548 &pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]"
549 eval(shift(@insns));
550 eval(shift(@insns));
551 &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
552 eval(shift(@insns));
553 eval(shift(@insns));
554
555 &movdqa (@X[4],@X[0]);
556 &movdqa (@X[2],@X[0]);
557 eval(shift(@insns));
558 eval(shift(@insns));
559 eval(shift(@insns));
560 eval(shift(@insns));
561
562 &pslldq (@X[4],12); # "X[0]"<<96, extract one dword
563 &paddd (@X[0],@X[0]);
564 eval(shift(@insns));
565 eval(shift(@insns));
566 eval(shift(@insns));
567 eval(shift(@insns));
568
569 &psrld (@X[2],31);
570 eval(shift(@insns));
571 eval(shift(@insns));
572 &movdqa (@X[3],@X[4]);
573 eval(shift(@insns));
574 eval(shift(@insns));
575
576 &psrld (@X[4],30);
577 &por (@X[0],@X[2]); # "X[0]"<<<=1
578 eval(shift(@insns));
579 eval(shift(@insns));
580 &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
581 eval(shift(@insns));
582 eval(shift(@insns));
583
584 &pslld (@X[3],2);
585 &pxor (@X[0],@X[4]);
586 eval(shift(@insns));
587 eval(shift(@insns));
588 &movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
589 eval(shift(@insns));
590 eval(shift(@insns));
591
592 &pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2
593 &movdqa (@X[1],@X[-2&7]) if ($Xi<7);
594 eval(shift(@insns));
595 eval(shift(@insns));
596
597 foreach (@insns) { eval; } # remaining instructions [if any]
598
599 $Xi++; push(@X,shift(@X)); # "rotate" X[]
600}
601
602sub Xupdate_ssse3_32_79()
603{ use integer;
604 my $body = shift;
605 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
606 my ($a,$b,$c,$d,$e);
607
608 &movdqa (@X[2],@X[-1&7]) if ($Xi==8);
609 eval(shift(@insns)); # body_20_39
610 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
611 &palignr(@X[2],@X[-2&7],8); # compose "X[-6]"
612 eval(shift(@insns));
613 eval(shift(@insns));
614 eval(shift(@insns)); # rol
615
616 &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
617 &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer
618 eval(shift(@insns));
619 eval(shift(@insns));
620 if ($Xi%5) {
621 &movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX...
622 } else { # ... or load next one
623 &movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
624 }
625 &paddd (@X[3],@X[-1&7]);
626 eval(shift(@insns)); # ror
627 eval(shift(@insns));
628
629 &pxor (@X[0],@X[2]); # "X[0]"^="X[-6]"
630 eval(shift(@insns)); # body_20_39
631 eval(shift(@insns));
632 eval(shift(@insns));
633 eval(shift(@insns)); # rol
634
635 &movdqa (@X[2],@X[0]);
636 &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
637 eval(shift(@insns));
638 eval(shift(@insns));
639 eval(shift(@insns)); # ror
640 eval(shift(@insns));
641
642 &pslld (@X[0],2);
643 eval(shift(@insns)); # body_20_39
644 eval(shift(@insns));
645 &psrld (@X[2],30);
646 eval(shift(@insns));
647 eval(shift(@insns)); # rol
648 eval(shift(@insns));
649 eval(shift(@insns));
650 eval(shift(@insns)); # ror
651 eval(shift(@insns));
652
653 &por (@X[0],@X[2]); # "X[0]"<<<=2
654 eval(shift(@insns)); # body_20_39
655 eval(shift(@insns));
656 &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer
657 eval(shift(@insns));
658 eval(shift(@insns)); # rol
659 eval(shift(@insns));
660 eval(shift(@insns));
661 eval(shift(@insns)); # ror
662 &movdqa (@X[3],@X[0]) if ($Xi<19);
663 eval(shift(@insns));
664
665 foreach (@insns) { eval; } # remaining instructions
666
667 $Xi++; push(@X,shift(@X)); # "rotate" X[]
668}
669
670sub Xuplast_ssse3_80()
671{ use integer;
672 my $body = shift;
673 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
674 my ($a,$b,$c,$d,$e);
675
676 eval(shift(@insns));
677 &paddd (@X[3],@X[-1&7]);
678 eval(shift(@insns));
679 eval(shift(@insns));
680 eval(shift(@insns));
681 eval(shift(@insns));
682
683 &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU
684
685 foreach (@insns) { eval; } # remaining instructions
686
687 &mov ($inp=@T[1],&DWP(192+4,"esp"));
688 &cmp ($inp,&DWP(192+8,"esp"));
689 &je (&label("done"));
690
691 &movdqa (@X[3],&QWP(112+48,"esp")); # K_00_19
692 &movdqa (@X[2],&QWP(112+64,"esp")); # pbswap mask
693 &movdqu (@X[-4&7],&QWP(0,$inp)); # load input
694 &movdqu (@X[-3&7],&QWP(16,$inp));
695 &movdqu (@X[-2&7],&QWP(32,$inp));
696 &movdqu (@X[-1&7],&QWP(48,$inp));
697 &add ($inp,64);
698 &pshufb (@X[-4&7],@X[2]); # byte swap
699 &mov (&DWP(192+4,"esp"),$inp);
700 &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
701
702 $Xi=0;
703}
704
705sub Xloop_ssse3()
706{ use integer;
707 my $body = shift;
708 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
709 my ($a,$b,$c,$d,$e);
710
711 eval(shift(@insns));
712 eval(shift(@insns));
713 &pshufb (@X[($Xi-3)&7],@X[2]);
714 eval(shift(@insns));
715 eval(shift(@insns));
716 &paddd (@X[($Xi-4)&7],@X[3]);
717 eval(shift(@insns));
718 eval(shift(@insns));
719 eval(shift(@insns));
720 eval(shift(@insns));
721 &movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU
722 eval(shift(@insns));
723 eval(shift(@insns));
724 &psubd (@X[($Xi-4)&7],@X[3]);
725
726 foreach (@insns) { eval; }
727 $Xi++;
728}
729
730sub Xtail_ssse3()
731{ use integer;
732 my $body = shift;
733 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
734 my ($a,$b,$c,$d,$e);
735
736 foreach (@insns) { eval; }
737}
738
739sub body_00_19 () {
740 (
741 '($a,$b,$c,$d,$e)=@V;'.
742 '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer
743 '&xor ($c,$d);',
744 '&mov (@T[1],$a);', # $b in next round
745 '&$_rol ($a,5);',
746 '&and (@T[0],$c);', # ($b&($c^$d))
747 '&xor ($c,$d);', # restore $c
748 '&xor (@T[0],$d);',
749 '&add ($e,$a);',
750 '&$_ror ($b,$j?7:2);', # $b>>>2
751 '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
752 );
753}
754
755sub body_20_39 () {
756 (
757 '($a,$b,$c,$d,$e)=@V;'.
758 '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
759 '&xor (@T[0],$d);', # ($b^$d)
760 '&mov (@T[1],$a);', # $b in next round
761 '&$_rol ($a,5);',
762 '&xor (@T[0],$c);', # ($b^$d^$c)
763 '&add ($e,$a);',
764 '&$_ror ($b,7);', # $b>>>2
765 '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
766 );
767}
768
769sub body_40_59 () {
770 (
771 '($a,$b,$c,$d,$e)=@V;'.
772 '&mov (@T[1],$c);',
773 '&xor ($c,$d);',
774 '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
775 '&and (@T[1],$d);',
776 '&and (@T[0],$c);', # ($b&($c^$d))
777 '&$_ror ($b,7);', # $b>>>2
778 '&add ($e,@T[1]);',
779 '&mov (@T[1],$a);', # $b in next round
780 '&$_rol ($a,5);',
781 '&add ($e,@T[0]);',
782 '&xor ($c,$d);', # restore $c
783 '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
784 );
785}
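# Editor's sketch (not part of the original module): each body_* above
# returns a list of code strings, and the Xupdate_* routines eval() one
# string between every SIMD instruction, knitting four IALU rounds in
# between the SSE groups.  The mechanism in miniature (kept commented out
# here so nothing stray is printed into the generated assembly):
#
#	my @insns = ('print "ialu 1\n";', 'print "ialu 2\n";');
#	print "simd A\n";  eval(shift(@insns));
#	print "simd B\n";  eval(shift(@insns));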
786
787&set_label("loop",16);
788 &Xupdate_ssse3_16_31(\&body_00_19);
789 &Xupdate_ssse3_16_31(\&body_00_19);
790 &Xupdate_ssse3_16_31(\&body_00_19);
791 &Xupdate_ssse3_16_31(\&body_00_19);
792 &Xupdate_ssse3_32_79(\&body_00_19);
793 &Xupdate_ssse3_32_79(\&body_20_39);
794 &Xupdate_ssse3_32_79(\&body_20_39);
795 &Xupdate_ssse3_32_79(\&body_20_39);
796 &Xupdate_ssse3_32_79(\&body_20_39);
797 &Xupdate_ssse3_32_79(\&body_20_39);
798 &Xupdate_ssse3_32_79(\&body_40_59);
799 &Xupdate_ssse3_32_79(\&body_40_59);
800 &Xupdate_ssse3_32_79(\&body_40_59);
801 &Xupdate_ssse3_32_79(\&body_40_59);
802 &Xupdate_ssse3_32_79(\&body_40_59);
803 &Xupdate_ssse3_32_79(\&body_20_39);
804 &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
805
806 $saved_j=$j; @saved_V=@V;
807
808 &Xloop_ssse3(\&body_20_39);
809 &Xloop_ssse3(\&body_20_39);
810 &Xloop_ssse3(\&body_20_39);
811
812 &mov (@T[1],&DWP(192,"esp")); # update context
813 &add ($A,&DWP(0,@T[1]));
814 &add (@T[0],&DWP(4,@T[1])); # $b
815 &add ($C,&DWP(8,@T[1]));
816 &mov (&DWP(0,@T[1]),$A);
817 &add ($D,&DWP(12,@T[1]));
818 &mov (&DWP(4,@T[1]),@T[0]);
819 &add ($E,&DWP(16,@T[1]));
820 &mov (&DWP(8,@T[1]),$C);
821 &mov ($B,@T[0]);
822 &mov (&DWP(12,@T[1]),$D);
823 &mov (&DWP(16,@T[1]),$E);
824 &movdqa (@X[0],@X[-3&7]);
825
826 &jmp (&label("loop"));
827
828&set_label("done",16); $j=$saved_j; @V=@saved_V;
829
830 &Xtail_ssse3(\&body_20_39);
831 &Xtail_ssse3(\&body_20_39);
832 &Xtail_ssse3(\&body_20_39);
833
834 &mov (@T[1],&DWP(192,"esp")); # update context
835 &add ($A,&DWP(0,@T[1]));
836 &mov ("esp",&DWP(192+12,"esp")); # restore %esp
837 &add (@T[0],&DWP(4,@T[1])); # $b
838 &add ($C,&DWP(8,@T[1]));
839 &mov (&DWP(0,@T[1]),$A);
840 &add ($D,&DWP(12,@T[1]));
841 &mov (&DWP(4,@T[1]),@T[0]);
842 &add ($E,&DWP(16,@T[1]));
843 &mov (&DWP(8,@T[1]),$C);
844 &mov (&DWP(12,@T[1]),$D);
845 &mov (&DWP(16,@T[1]),$E);
846
847&function_end("_sha1_block_data_order_ssse3");
848
849if ($ymm) {
850my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded
851my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
852my @V=($A,$B,$C,$D,$E);
853my $j=0; # hash round
854my @T=($T,$tmp1);
855my $inp;
856
857my $_rol=sub { &shld(@_[0],@_) };
858my $_ror=sub { &shrd(@_[0],@_) };
859
860&function_begin("_sha1_block_data_order_avx");
861 &picsetup($tmp1);
862 &picsymbol($tmp1, &label("K_XX_XX"), $tmp1);
863
864&set_label("avx_shortcut");
865 &vzeroall();
866
867 &vmovdqa(@X[3],&QWP(0,$tmp1)); # K_00_19
868 &vmovdqa(@X[4],&QWP(16,$tmp1)); # K_20_39
869 &vmovdqa(@X[5],&QWP(32,$tmp1)); # K_40_59
870 &vmovdqa(@X[6],&QWP(48,$tmp1)); # K_60_79
871 &vmovdqa(@X[2],&QWP(64,$tmp1)); # pbswap mask
872
873 &mov ($E,&wparam(0)); # load argument block
874 &mov ($inp=@T[1],&wparam(1));
875 &mov ($D,&wparam(2));
876 &mov (@T[0],"esp");
877
878 # stack frame layout
879 #
880 # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area
881 # X[4]+K X[5]+K X[6]+K X[7]+K
882 # X[8]+K X[9]+K X[10]+K X[11]+K
883 # X[12]+K X[13]+K X[14]+K X[15]+K
884 #
885 # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area
886 # X[4] X[5] X[6] X[7]
887 # X[8] X[9] X[10] X[11] # even borrowed for K_00_19
888 #
889 # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants
890 # K_40_59 K_40_59 K_40_59 K_40_59
891 # K_60_79 K_60_79 K_60_79 K_60_79
892 # K_00_19 K_00_19 K_00_19 K_00_19
893 # pbswap mask
894 #
895 # +192 ctx # argument block
896 # +196 inp
897 # +200 end
898 # +204 esp
899 &sub ("esp",208);
900 &and ("esp",-64);
901
902 &vmovdqa(&QWP(112+0,"esp"),@X[4]); # copy constants
903 &vmovdqa(&QWP(112+16,"esp"),@X[5]);
904 &vmovdqa(&QWP(112+32,"esp"),@X[6]);
905 &shl ($D,6); # len*64
906 &vmovdqa(&QWP(112+48,"esp"),@X[3]);
907 &add ($D,$inp); # end of input
908 &vmovdqa(&QWP(112+64,"esp"),@X[2]);
909 &add ($inp,64);
910 &mov (&DWP(192+0,"esp"),$E); # save argument block
911 &mov (&DWP(192+4,"esp"),$inp);
912 &mov (&DWP(192+8,"esp"),$D);
913 &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp
914
915 &mov ($A,&DWP(0,$E)); # load context
916 &mov ($B,&DWP(4,$E));
917 &mov ($C,&DWP(8,$E));
918 &mov ($D,&DWP(12,$E));
919 &mov ($E,&DWP(16,$E));
920 &mov (@T[0],$B); # magic seed
921
922 &vmovdqu(@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3]
923 &vmovdqu(@X[-3&7],&QWP(-48,$inp));
924 &vmovdqu(@X[-2&7],&QWP(-32,$inp));
925 &vmovdqu(@X[-1&7],&QWP(-16,$inp));
926 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
927 &vpshufb(@X[-3&7],@X[-3&7],@X[2]);
928 &vpshufb(@X[-2&7],@X[-2&7],@X[2]);
929 &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
930 &vpshufb(@X[-1&7],@X[-1&7],@X[2]);
931 &vpaddd (@X[0],@X[-4&7],@X[3]); # add K_00_19
932 &vpaddd (@X[1],@X[-3&7],@X[3]);
933 &vpaddd (@X[2],@X[-2&7],@X[3]);
934 &vmovdqa(&QWP(0,"esp"),@X[0]); # X[]+K xfer to IALU
935 &vmovdqa(&QWP(0+16,"esp"),@X[1]);
936 &vmovdqa(&QWP(0+32,"esp"),@X[2]);
937 &jmp (&label("loop"));
938
939sub Xupdate_avx_16_31() # recall that $Xi starts with 4
940{ use integer;
941 my $body = shift;
942 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
943 my ($a,$b,$c,$d,$e);
944
945 eval(shift(@insns));
946 eval(shift(@insns));
947 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
948 eval(shift(@insns));
949 eval(shift(@insns));
950
951 &vpaddd (@X[3],@X[3],@X[-1&7]);
952 &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
953 eval(shift(@insns));
954 eval(shift(@insns));
955 &vpsrldq(@X[2],@X[-1&7],4); # "X[-3]", 3 dwords
956 eval(shift(@insns));
957 eval(shift(@insns));
958 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
959 eval(shift(@insns));
960 eval(shift(@insns));
961
962 &vpxor (@X[2],@X[2],@X[-2&7]); # "X[-3]"^"X[-8]"
963 eval(shift(@insns));
964 eval(shift(@insns));
965 &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
966 eval(shift(@insns));
967 eval(shift(@insns));
968
969 &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]"
970 eval(shift(@insns));
971 eval(shift(@insns));
972 eval(shift(@insns));
973 eval(shift(@insns));
974
975 &vpsrld (@X[2],@X[0],31);
976 eval(shift(@insns));
977 eval(shift(@insns));
978 eval(shift(@insns));
979 eval(shift(@insns));
980
981 &vpslldq(@X[4],@X[0],12); # "X[0]"<<96, extract one dword
982 &vpaddd (@X[0],@X[0],@X[0]);
983 eval(shift(@insns));
984 eval(shift(@insns));
985 eval(shift(@insns));
986 eval(shift(@insns));
987
988 &vpsrld (@X[3],@X[4],30);
989 &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=1
990 eval(shift(@insns));
991 eval(shift(@insns));
992 eval(shift(@insns));
993 eval(shift(@insns));
994
995 &vpslld (@X[4],@X[4],2);
996 &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
997 eval(shift(@insns));
998 eval(shift(@insns));
999 &vpxor (@X[0],@X[0],@X[3]);
1000 eval(shift(@insns));
1001 eval(shift(@insns));
1002 eval(shift(@insns));
1003 eval(shift(@insns));
1004
1005 &vpxor (@X[0],@X[0],@X[4]); # "X[0]"^=("X[0]"<<96)<<<2
1006 eval(shift(@insns));
1007 eval(shift(@insns));
1008 &vmovdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
1009 eval(shift(@insns));
1010 eval(shift(@insns));
1011
1012 foreach (@insns) { eval; } # remaining instructions [if any]
1013
1014 $Xi++; push(@X,shift(@X)); # "rotate" X[]
1015}
1016
1017sub Xupdate_avx_32_79()
1018{ use integer;
1019 my $body = shift;
1020 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
1021 my ($a,$b,$c,$d,$e);
1022
1023 &vpalignr(@X[2],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
1024 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
1025 eval(shift(@insns)); # body_20_39
1026 eval(shift(@insns));
1027 eval(shift(@insns));
1028 eval(shift(@insns)); # rol
1029
1030 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
1031 &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer
1032 eval(shift(@insns));
1033 eval(shift(@insns));
1034 if ($Xi%5) {
1035 &vmovdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX...
1036 } else { # ... or load next one
1037 &vmovdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
1038 }
1039 &vpaddd (@X[3],@X[3],@X[-1&7]);
1040 eval(shift(@insns)); # ror
1041 eval(shift(@insns));
1042
1043 &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-6]"
1044 eval(shift(@insns)); # body_20_39
1045 eval(shift(@insns));
1046 eval(shift(@insns));
1047 eval(shift(@insns)); # rol
1048
1049 &vpsrld (@X[2],@X[0],30);
1050 &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
1051 eval(shift(@insns));
1052 eval(shift(@insns));
1053 eval(shift(@insns)); # ror
1054 eval(shift(@insns));
1055
1056 &vpslld (@X[0],@X[0],2);
1057 eval(shift(@insns)); # body_20_39
1058 eval(shift(@insns));
1059 eval(shift(@insns));
1060 eval(shift(@insns)); # rol
1061 eval(shift(@insns));
1062 eval(shift(@insns));
1063 eval(shift(@insns)); # ror
1064 eval(shift(@insns));
1065
1066 &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=2
1067 eval(shift(@insns)); # body_20_39
1068 eval(shift(@insns));
1069 &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer
1070 eval(shift(@insns));
1071 eval(shift(@insns)); # rol
1072 eval(shift(@insns));
1073 eval(shift(@insns));
1074 eval(shift(@insns)); # ror
1075 eval(shift(@insns));
1076
1077 foreach (@insns) { eval; } # remaining instructions
1078
1079 $Xi++; push(@X,shift(@X)); # "rotate" X[]
1080}
1081
1082sub Xuplast_avx_80()
1083{ use integer;
1084 my $body = shift;
1085 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
1086 my ($a,$b,$c,$d,$e);
1087
1088 eval(shift(@insns));
1089 &vpaddd (@X[3],@X[3],@X[-1&7]);
1090 eval(shift(@insns));
1091 eval(shift(@insns));
1092 eval(shift(@insns));
1093 eval(shift(@insns));
1094
1095 &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU
1096
1097 foreach (@insns) { eval; } # remaining instructions
1098
1099 &mov ($inp=@T[1],&DWP(192+4,"esp"));
1100 &cmp ($inp,&DWP(192+8,"esp"));
1101 &je (&label("done"));
1102
1103 &vmovdqa(@X[3],&QWP(112+48,"esp")); # K_00_19
1104 &vmovdqa(@X[2],&QWP(112+64,"esp")); # pbswap mask
1105 &vmovdqu(@X[-4&7],&QWP(0,$inp)); # load input
1106 &vmovdqu(@X[-3&7],&QWP(16,$inp));
1107 &vmovdqu(@X[-2&7],&QWP(32,$inp));
1108 &vmovdqu(@X[-1&7],&QWP(48,$inp));
1109 &add ($inp,64);
1110 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
1111 &mov (&DWP(192+4,"esp"),$inp);
1112 &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
1113
1114 $Xi=0;
1115}
1116
1117sub Xloop_avx()
1118{ use integer;
1119 my $body = shift;
1120 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
1121 my ($a,$b,$c,$d,$e);
1122
1123 eval(shift(@insns));
1124 eval(shift(@insns));
1125 &vpshufb (@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
1126 eval(shift(@insns));
1127 eval(shift(@insns));
1128 &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@X[3]);
1129 eval(shift(@insns));
1130 eval(shift(@insns));
1131 eval(shift(@insns));
1132 eval(shift(@insns));
1133 &vmovdqa (&QWP(0+16*$Xi,"esp"),@X[$Xi&7]); # X[]+K xfer to IALU
1134 eval(shift(@insns));
1135 eval(shift(@insns));
1136
1137 foreach (@insns) { eval; }
1138 $Xi++;
1139}
1140
1141sub Xtail_avx()
1142{ use integer;
1143 my $body = shift;
1144 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
1145 my ($a,$b,$c,$d,$e);
1146
1147 foreach (@insns) { eval; }
1148}
1149
1150&set_label("loop",16);
1151 &Xupdate_avx_16_31(\&body_00_19);
1152 &Xupdate_avx_16_31(\&body_00_19);
1153 &Xupdate_avx_16_31(\&body_00_19);
1154 &Xupdate_avx_16_31(\&body_00_19);
1155 &Xupdate_avx_32_79(\&body_00_19);
1156 &Xupdate_avx_32_79(\&body_20_39);
1157 &Xupdate_avx_32_79(\&body_20_39);
1158 &Xupdate_avx_32_79(\&body_20_39);
1159 &Xupdate_avx_32_79(\&body_20_39);
1160 &Xupdate_avx_32_79(\&body_20_39);
1161 &Xupdate_avx_32_79(\&body_40_59);
1162 &Xupdate_avx_32_79(\&body_40_59);
1163 &Xupdate_avx_32_79(\&body_40_59);
1164 &Xupdate_avx_32_79(\&body_40_59);
1165 &Xupdate_avx_32_79(\&body_40_59);
1166 &Xupdate_avx_32_79(\&body_20_39);
1167 &Xuplast_avx_80(\&body_20_39); # can jump to "done"
1168
1169 $saved_j=$j; @saved_V=@V;
1170
1171 &Xloop_avx(\&body_20_39);
1172 &Xloop_avx(\&body_20_39);
1173 &Xloop_avx(\&body_20_39);
1174
1175 &mov (@T[1],&DWP(192,"esp")); # update context
1176 &add ($A,&DWP(0,@T[1]));
1177 &add (@T[0],&DWP(4,@T[1])); # $b
1178 &add ($C,&DWP(8,@T[1]));
1179 &mov (&DWP(0,@T[1]),$A);
1180 &add ($D,&DWP(12,@T[1]));
1181 &mov (&DWP(4,@T[1]),@T[0]);
1182 &add ($E,&DWP(16,@T[1]));
1183 &mov (&DWP(8,@T[1]),$C);
1184 &mov ($B,@T[0]);
1185 &mov (&DWP(12,@T[1]),$D);
1186 &mov (&DWP(16,@T[1]),$E);
1187
1188 &jmp (&label("loop"));
1189
1190&set_label("done",16); $j=$saved_j; @V=@saved_V;
1191
1192 &Xtail_avx(\&body_20_39);
1193 &Xtail_avx(\&body_20_39);
1194 &Xtail_avx(\&body_20_39);
1195
1196 &vzeroall();
1197
1198 &mov (@T[1],&DWP(192,"esp")); # update context
1199 &add ($A,&DWP(0,@T[1]));
1200 &mov ("esp",&DWP(192+12,"esp")); # restore %esp
1201 &add (@T[0],&DWP(4,@T[1])); # $b
1202 &add ($C,&DWP(8,@T[1]));
1203 &mov (&DWP(0,@T[1]),$A);
1204 &add ($D,&DWP(12,@T[1]));
1205 &mov (&DWP(4,@T[1]),@T[0]);
1206 &add ($E,&DWP(16,@T[1]));
1207 &mov (&DWP(8,@T[1]),$C);
1208 &mov (&DWP(12,@T[1]),$D);
1209 &mov (&DWP(16,@T[1]),$E);
1210&function_end("_sha1_block_data_order_avx");
1211}
1212
1213 &rodataseg();
1214&set_label("K_XX_XX",64);
1215&data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999); # K_00_19
1216&data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1); # K_20_39
1217&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc); # K_40_59
1218&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6); # K_60_79
1219&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # pbswap mask
1220 &previous();
1221}
1222
1223&asm_finish();
diff --git a/src/lib/libcrypto/sha/asm/sha1-alpha.pl b/src/lib/libcrypto/sha/asm/sha1-alpha.pl
deleted file mode 100644
index 56b3369f09..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-alpha.pl
+++ /dev/null
@@ -1,316 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA1 block procedure for Alpha.
11
12# On 21264 performance is 33% better than code generated by the vendor
13# compiler, and 75% better than GCC [3.4]; in absolute terms it is
14# 8.7 cycles per processed byte. The implementation features a vectorized
15# byte swap, but not Xupdate.
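#
# Editor's sketch (not part of the original module): per 32-bit word the
# srl/sll/zapnot sequence emitted below amounts to a plain byte swap,
# applied to the two packed words of a 64-bit register at once
# ("vectorized"); the scalar reference is:
sub bswap32_ref {
	my $x = shift;
	return (($x >> 24) & 0x000000ff) | (($x >>  8) & 0x0000ff00) |
	       (($x <<  8) & 0x00ff0000) | (($x << 24) & 0xff000000);
}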
16
17@X=( "\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7",
18 "\$8", "\$9", "\$10", "\$11", "\$12", "\$13", "\$14", "\$15");
19$ctx="a0"; # $16
20$inp="a1";
21$num="a2";
22$A="a3";
23$B="a4"; # 20
24$C="a5";
25$D="t8";
26$E="t9"; @V=($A,$B,$C,$D,$E);
27$t0="t10"; # 24
28$t1="t11";
29$t2="ra";
30$t3="t12";
31$K="AT"; # 28
32
33sub BODY_00_19 {
34my ($i,$a,$b,$c,$d,$e)=@_;
35my $j=$i+1;
36$code.=<<___ if ($i==0);
37 ldq_u @X[0],0+0($inp)
38 ldq_u @X[1],0+7($inp)
39___
40$code.=<<___ if (!($i&1) && $i<14);
41 ldq_u @X[$i+2],($i+2)*4+0($inp)
42 ldq_u @X[$i+3],($i+2)*4+7($inp)
43___
44$code.=<<___ if (!($i&1) && $i<15);
45 extql @X[$i],$inp,@X[$i]
46 extqh @X[$i+1],$inp,@X[$i+1]
47
48 or @X[$i+1],@X[$i],@X[$i] # pair of 32-bit values are fetched
49
50 srl @X[$i],24,$t0 # vectorized byte swap
51 srl @X[$i],8,$t2
52
53 sll @X[$i],8,$t3
54 sll @X[$i],24,@X[$i]
55 zapnot $t0,0x11,$t0
56 zapnot $t2,0x22,$t2
57
58 zapnot @X[$i],0x88,@X[$i]
59 or $t0,$t2,$t0
60 zapnot $t3,0x44,$t3
61 sll $a,5,$t1
62
63 or @X[$i],$t0,@X[$i]
64 addl $K,$e,$e
65 and $b,$c,$t2
66 zapnot $a,0xf,$a
67
68 or @X[$i],$t3,@X[$i]
69 srl $a,27,$t0
70 bic $d,$b,$t3
71 sll $b,30,$b
72
73 extll @X[$i],4,@X[$i+1] # extract upper half
74 or $t2,$t3,$t2
75 addl @X[$i],$e,$e
76
77 addl $t1,$e,$e
78 srl $b,32,$t3
79 zapnot @X[$i],0xf,@X[$i]
80
81 addl $t0,$e,$e
82 addl $t2,$e,$e
83 or $t3,$b,$b
84___
85$code.=<<___ if (($i&1) && $i<15);
86 sll $a,5,$t1
87 addl $K,$e,$e
88 and $b,$c,$t2
89 zapnot $a,0xf,$a
90
91 srl $a,27,$t0
92 addl @X[$i%16],$e,$e
93 bic $d,$b,$t3
94 sll $b,30,$b
95
96 or $t2,$t3,$t2
97 addl $t1,$e,$e
98 srl $b,32,$t3
99 zapnot @X[$i],0xf,@X[$i]
100
101 addl $t0,$e,$e
102 addl $t2,$e,$e
103 or $t3,$b,$b
104___
105$code.=<<___ if ($i>=15); # with forward Xupdate
106 sll $a,5,$t1
107 addl $K,$e,$e
108 and $b,$c,$t2
109 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
110
111 zapnot $a,0xf,$a
112 addl @X[$i%16],$e,$e
113 bic $d,$b,$t3
114 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
115
116 srl $a,27,$t0
117 addl $t1,$e,$e
118 or $t2,$t3,$t2
119 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
120
121 sll $b,30,$b
122 addl $t0,$e,$e
123 srl @X[$j%16],31,$t1
124
125 addl $t2,$e,$e
126 srl $b,32,$t3
127 addl @X[$j%16],@X[$j%16],@X[$j%16]
128
129 or $t3,$b,$b
130 zapnot @X[$i%16],0xf,@X[$i%16]
131 or $t1,@X[$j%16],@X[$j%16]
132___
133}
134
135sub BODY_20_39 {
136my ($i,$a,$b,$c,$d,$e)=@_;
137my $j=$i+1;
138$code.=<<___ if ($i<79); # with forward Xupdate
139 sll $a,5,$t1
140 addl $K,$e,$e
141 zapnot $a,0xf,$a
142 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
143
144 sll $b,30,$t3
145 addl $t1,$e,$e
146 xor $b,$c,$t2
147 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
148
149 srl $b,2,$b
150 addl @X[$i%16],$e,$e
151 xor $d,$t2,$t2
152 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
153
154 srl @X[$j%16],31,$t1
155 addl $t2,$e,$e
156 srl $a,27,$t0
157 addl @X[$j%16],@X[$j%16],@X[$j%16]
158
159 or $t3,$b,$b
160 addl $t0,$e,$e
161 or $t1,@X[$j%16],@X[$j%16]
162___
163$code.=<<___ if ($i<77);
164 zapnot @X[$i%16],0xf,@X[$i%16]
165___
166$code.=<<___ if ($i==79); # with context fetch
167 sll $a,5,$t1
168 addl $K,$e,$e
169 zapnot $a,0xf,$a
170 ldl @X[0],0($ctx)
171
172 sll $b,30,$t3
173 addl $t1,$e,$e
174 xor $b,$c,$t2
175 ldl @X[1],4($ctx)
176
177 srl $b,2,$b
178 addl @X[$i%16],$e,$e
179 xor $d,$t2,$t2
180 ldl @X[2],8($ctx)
181
182 srl $a,27,$t0
183 addl $t2,$e,$e
184 ldl @X[3],12($ctx)
185
186 or $t3,$b,$b
187 addl $t0,$e,$e
188 ldl @X[4],16($ctx)
189___
190}
191
192sub BODY_40_59 {
193my ($i,$a,$b,$c,$d,$e)=@_;
194my $j=$i+1;
195$code.=<<___; # with forward Xupdate
196 sll $a,5,$t1
197 addl $K,$e,$e
198 zapnot $a,0xf,$a
199 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
200
201 srl $a,27,$t0
202 and $b,$c,$t2
203 and $b,$d,$t3
204 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
205
206 sll $b,30,$b
207 addl $t1,$e,$e
208 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
209
210 srl @X[$j%16],31,$t1
211 addl $t0,$e,$e
212 or $t2,$t3,$t2
213 and $c,$d,$t3
214
215 or $t2,$t3,$t2
216 srl $b,32,$t3
217 addl @X[$i%16],$e,$e
218 addl @X[$j%16],@X[$j%16],@X[$j%16]
219
220 or $t3,$b,$b
221 addl $t2,$e,$e
222 or $t1,@X[$j%16],@X[$j%16]
223 zapnot @X[$i%16],0xf,@X[$i%16]
224___
225}
226
227$code=<<___;
228#include <machine/asm.h>
229
230.text
231
232.set noat
233.set noreorder
234.globl sha1_block_data_order
235.align 5
236.ent sha1_block_data_order
237sha1_block_data_order:
238 lda sp,-64(sp)
239 stq ra,0(sp)
240 stq s0,8(sp)
241 stq s1,16(sp)
242 stq s2,24(sp)
243 stq s3,32(sp)
244 stq s4,40(sp)
245 stq s5,48(sp)
246 stq fp,56(sp)
247 .mask 0x0400fe00,-64
248 .frame sp,64,ra
249 .prologue 0
250
251 ldl $A,0($ctx)
252 ldl $B,4($ctx)
253 sll $num,6,$num
254 ldl $C,8($ctx)
255 ldl $D,12($ctx)
256 ldl $E,16($ctx)
257 addq $inp,$num,$num
258
259.Lloop:
260 .set noreorder
261 ldah $K,23170(zero)
262 zapnot $B,0xf,$B
263 lda $K,31129($K) # K_00_19
264___
265for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
266
267$code.=<<___;
268 ldah $K,28378(zero)
269 lda $K,-5215($K) # K_20_39
270___
271for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
272
273$code.=<<___;
274 ldah $K,-28900(zero)
275 lda $K,-17188($K) # K_40_59
276___
277for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
278
279$code.=<<___;
280 ldah $K,-13725(zero)
281 lda $K,-15914($K) # K_60_79
282___
283for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
284
285$code.=<<___;
286 addl @X[0],$A,$A
287 addl @X[1],$B,$B
288 addl @X[2],$C,$C
289 addl @X[3],$D,$D
290 addl @X[4],$E,$E
291 stl $A,0($ctx)
292 stl $B,4($ctx)
293 addq $inp,64,$inp
294 stl $C,8($ctx)
295 stl $D,12($ctx)
296 stl $E,16($ctx)
297 cmpult $inp,$num,$t1
298 bne $t1,.Lloop
299
300 .set noreorder
301 ldq ra,0(sp)
302 ldq s0,8(sp)
303 ldq s1,16(sp)
304 ldq s2,24(sp)
305 ldq s3,32(sp)
306 ldq s4,40(sp)
307 ldq s5,48(sp)
308 ldq fp,56(sp)
309 lda sp,64(sp)
310 ret (ra)
311.end sha1_block_data_order
312.align 2
313___
314$output=shift and open STDOUT,">$output";
315print $code;
316close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
deleted file mode 100644
index 8f0cdaf83c..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
+++ /dev/null
@@ -1,248 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# sha1_block procedure for ARMv4.
11#
12# January 2007.
13
14# Size/performance trade-off
15# ====================================================================
16# impl size in bytes comp cycles[*] measured performance
17# ====================================================================
18# thumb 304 3212 4420
19# armv4-small 392/+29% 1958/+64% 2250/+96%
20# armv4-compact 740/+89% 1552/+26% 1840/+22%
21# armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
22# full unroll ~5100/+260% ~1260/+4% ~1300/+5%
23# ====================================================================
24# thumb = same as 'small' but in Thumb instructions[**] and
25# with recurring code in two private functions;
26# small = detached Xload/update, loops are folded;
27# compact = detached Xload/update, 5x unroll;
28# large = interleaved Xload/update, 5x unroll;
29# full unroll = interleaved Xload/update, full unroll, estimated[!];
30#
31# [*] Manually counted instructions in "grand" loop body. Measured
32# performance is affected by prologue and epilogue overhead,
33# i-cache availability, branch penalties, etc.
34# [**]	While each Thumb instruction is half the size, Thumb instructions
35#	are not as diverse as ARM ones: e.g., there are only two arithmetic
36#	instructions with 3 arguments, no [fixed] rotate, and addressing
37#	modes are limited. As a result it takes more instructions to do
38#	the same job in Thumb, so the code is never half the size
39#	and is always slower.
40# [***]	which is also ~35% better than compiler-generated code. A dual-
41#	issue Cortex A8 core was measured to process an input block in
42#	~990 cycles.
43
44# August 2010.
45#
46# Rescheduling for dual-issue pipeline resulted in 13% improvement on
47# Cortex A8 core and in absolute terms ~870 cycles per input block
48# [or 13.6 cycles per byte].
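# (Editor's note, not part of the original module: the per-byte figure
# follows from SHA-1's 64-byte block size: 870 cycles / 64 bytes is
# ~13.6 cycles per byte.)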
49
50# February 2011.
51#
52# Profiler-assisted and platform-specific optimization resulted in 10%
53# improvement on Cortex A8 core and 12.2 cycles per byte.
54
55while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
56open STDOUT,">$output";
57
58$ctx="r0";
59$inp="r1";
60$len="r2";
61$a="r3";
62$b="r4";
63$c="r5";
64$d="r6";
65$e="r7";
66$K="r8";
67$t0="r9";
68$t1="r10";
69$t2="r11";
70$t3="r12";
71$Xi="r14";
72@V=($a,$b,$c,$d,$e);
73
74sub Xupdate {
75my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
76$code.=<<___;
77 ldr $t0,[$Xi,#15*4]
78 ldr $t1,[$Xi,#13*4]
79 ldr $t2,[$Xi,#7*4]
80 add $e,$K,$e,ror#2 @ E+=K_xx_xx
81 ldr $t3,[$Xi,#2*4]
82 eor $t0,$t0,$t1
83 eor $t2,$t2,$t3 @ 1 cycle stall
84 eor $t1,$c,$d @ F_xx_xx
85 mov $t0,$t0,ror#31
86 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
87 eor $t0,$t0,$t2,ror#31
88 str $t0,[$Xi,#-4]!
89 $opt1 @ F_xx_xx
90 $opt2 @ F_xx_xx
91 add $e,$e,$t0 @ E+=X[i]
92___
93}
94
95sub BODY_00_15 {
96my ($a,$b,$c,$d,$e)=@_;
97$code.=<<___;
98#if __ARM_ARCH__<7 || defined(__STRICT_ALIGNMENT)
99 ldrb $t1,[$inp,#2]
100 ldrb $t0,[$inp,#3]
101 ldrb $t2,[$inp,#1]
102 add $e,$K,$e,ror#2 @ E+=K_00_19
103 ldrb $t3,[$inp],#4
104 orr $t0,$t0,$t1,lsl#8
105 eor $t1,$c,$d @ F_xx_xx
106 orr $t0,$t0,$t2,lsl#16
107 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
108 orr $t0,$t0,$t3,lsl#24
109#else
110 ldr $t0,[$inp],#4 @ handles unaligned
111 add $e,$K,$e,ror#2 @ E+=K_00_19
112 eor $t1,$c,$d @ F_xx_xx
113 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
114#ifdef __ARMEL__
115 rev $t0,$t0 @ byte swap
116#endif
117#endif
118 and $t1,$b,$t1,ror#2
119 add $e,$e,$t0 @ E+=X[i]
120 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
121 str $t0,[$Xi,#-4]!
122 add $e,$e,$t1 @ E+=F_00_19(B,C,D)
123___
124}
125
126sub BODY_16_19 {
127my ($a,$b,$c,$d,$e)=@_;
128 &Xupdate(@_,"and $t1,$b,$t1,ror#2");
129$code.=<<___;
130 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
131 add $e,$e,$t1 @ E+=F_00_19(B,C,D)
132___
133}
134
135sub BODY_20_39 {
136my ($a,$b,$c,$d,$e)=@_;
137 &Xupdate(@_,"eor $t1,$b,$t1,ror#2");
138$code.=<<___;
139 add $e,$e,$t1 @ E+=F_20_39(B,C,D)
140___
141}
142
143sub BODY_40_59 {
144my ($a,$b,$c,$d,$e)=@_;
145 &Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
146$code.=<<___;
147 add $e,$e,$t1 @ E+=F_40_59(B,C,D)
148 add $e,$e,$t2,ror#2
149___
150}
151
152$code=<<___;
153#include "arm_arch.h"
154
155.text
156
157.global sha1_block_data_order
158.type sha1_block_data_order,%function
159
160.align 2
161sha1_block_data_order:
162 stmdb sp!,{r4-r12,lr}
163 add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
164 ldmia $ctx,{$a,$b,$c,$d,$e}
165.Lloop:
166 ldr $K,.LK_00_19
167 mov $Xi,sp
168 sub sp,sp,#15*4
169 mov $c,$c,ror#30
170 mov $d,$d,ror#30
171 mov $e,$e,ror#30 @ [6]
172.L_00_15:
173___
174for($i=0;$i<5;$i++) {
175 &BODY_00_15(@V); unshift(@V,pop(@V));
176}
177$code.=<<___;
178 teq $Xi,sp
179 bne .L_00_15 @ [((11+4)*5+2)*3]
180 sub sp,sp,#25*4
181___
182 &BODY_00_15(@V); unshift(@V,pop(@V));
183 &BODY_16_19(@V); unshift(@V,pop(@V));
184 &BODY_16_19(@V); unshift(@V,pop(@V));
185 &BODY_16_19(@V); unshift(@V,pop(@V));
186 &BODY_16_19(@V); unshift(@V,pop(@V));
187$code.=<<___;
188
189 ldr $K,.LK_20_39 @ [+15+16*4]
190 cmn sp,#0 @ [+3], clear carry to denote 20_39
191.L_20_39_or_60_79:
192___
193for($i=0;$i<5;$i++) {
194 &BODY_20_39(@V); unshift(@V,pop(@V));
195}
196$code.=<<___;
197 teq $Xi,sp @ preserve carry
198 bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
199 bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
200
201 ldr $K,.LK_40_59
202 sub sp,sp,#20*4 @ [+2]
203.L_40_59:
204___
205for($i=0;$i<5;$i++) {
206 &BODY_40_59(@V); unshift(@V,pop(@V));
207}
208$code.=<<___;
209 teq $Xi,sp
210 bne .L_40_59 @ [+((12+5)*5+2)*4]
211
212 ldr $K,.LK_60_79
213 sub sp,sp,#20*4
214 cmp sp,#0 @ set carry to denote 60_79
215 b .L_20_39_or_60_79 @ [+4], spare 300 bytes
216.L_done:
217 add sp,sp,#80*4 @ "deallocate" stack frame
218 ldmia $ctx,{$K,$t0,$t1,$t2,$t3}
219 add $a,$K,$a
220 add $b,$t0,$b
221 add $c,$t1,$c,ror#2
222 add $d,$t2,$d,ror#2
223 add $e,$t3,$e,ror#2
224 stmia $ctx,{$a,$b,$c,$d,$e}
225 teq $inp,$len
226 bne .Lloop @ [+18], total 1307
227
228#if __ARM_ARCH__>=5
229 ldmia sp!,{r4-r12,pc}
230#else
231 ldmia sp!,{r4-r12,lr}
232 tst lr,#1
233 moveq pc,lr @ be binary compatible with V4, yet
234 bx lr @ interoperable with Thumb ISA:-)
235#endif
236.align 2
237.LK_00_19: .word 0x5a827999
238.LK_20_39: .word 0x6ed9eba1
239.LK_40_59: .word 0x8f1bbcdc
240.LK_60_79: .word 0xca62c1d6
241.size sha1_block_data_order,.-sha1_block_data_order
242.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
243.align 2
244___
245
246$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
247print $code;
248close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/sha/asm/sha1-mips.pl b/src/lib/libcrypto/sha/asm/sha1-mips.pl
deleted file mode 100644
index 75fe7113e2..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-mips.pl
+++ /dev/null
@@ -1,350 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA1 block procedure for MIPS.
11
12# Performance improvement is 30% on unaligned input. The "secret" is
13# to deploy the lwl/lwr pair to load unaligned input. One could have
14# vectorized Xupdate on MIPSIII/IV, but the goal was to code a MIPS32-
15# compatible subroutine. There is room for minor optimization on
16# little-endian platforms...
17
18######################################################################
19# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
20# widely used. Then there is a new contender: NUBI. It appears that if
21# one picks the latter, it's possible to arrange the code in an ABI-neutral
22# manner. Therefore let's stick to the NUBI register layout:
23#
24($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
25($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
26($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
27($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
28#
29# The return value is placed in $a0. The following coding rules facilitate
30# interoperability:
31#
32# - never ever touch $tp, "thread pointer", former $gp;
33# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
34# old code];
35# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
36#
37# For reference here is register layout for N32/64 MIPS ABIs:
38#
39# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
40# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
41# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
42# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
43# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
44#
45$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
46
47if ($flavour =~ /64|n32/i) {
48 $PTR_ADD="dadd"; # incidentally works even on n32
49 $PTR_SUB="dsub"; # incidentally works even on n32
50 $REG_S="sd";
51 $REG_L="ld";
52 $PTR_SLL="dsll"; # incidentally works even on n32
53 $SZREG=8;
54} else {
55 $PTR_ADD="add";
56 $PTR_SUB="sub";
57 $REG_S="sw";
58 $REG_L="lw";
59 $PTR_SLL="sll";
60 $SZREG=4;
61}
62#
63# <appro@openssl.org>
64#
65######################################################################
66
67$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
68
69for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
70open STDOUT,">$output";
71
72if (!defined($big_endian))
73 { $big_endian=(unpack('L',pack('N',1))==1); }
74
75# offsets of the Most and Least Significant Bytes
76$MSB=$big_endian?0:3;
77$LSB=3&~$MSB;
78
79@X=map("\$$_",(8..23)); # a4-a7,s0-s11
80
81$ctx=$a0;
82$inp=$a1;
83$num=$a2;
84$A="\$1";
85$B="\$2";
86$C="\$3";
87$D="\$7";
88$E="\$24"; @V=($A,$B,$C,$D,$E);
89$t0="\$25";
90$t1=$num; # $num is offloaded to stack
91$t2="\$30"; # fp
92$K="\$31"; # ra
93
94sub BODY_00_14 {
95my ($i,$a,$b,$c,$d,$e)=@_;
96my $j=$i+1;
97$code.=<<___ if (!$big_endian);
98 srl $t0,@X[$i],24 # byte swap($i)
99 srl $t1,@X[$i],8
100 andi $t2,@X[$i],0xFF00
101 sll @X[$i],@X[$i],24
102 andi $t1,0xFF00
103 sll $t2,$t2,8
104 or @X[$i],$t0
105 or $t1,$t2
106 or @X[$i],$t1
107___
108$code.=<<___;
109 lwl @X[$j],$j*4+$MSB($inp)
110 sll $t0,$a,5 # $i
111 addu $e,$K
112 lwr @X[$j],$j*4+$LSB($inp)
113 srl $t1,$a,27
114 addu $e,$t0
115 xor $t0,$c,$d
116 addu $e,$t1
117 sll $t2,$b,30
118 and $t0,$b
119 srl $b,$b,2
120 xor $t0,$d
121 addu $e,@X[$i]
122 or $b,$t2
123 addu $e,$t0
124___
125}
126
127sub BODY_15_19 {
128my ($i,$a,$b,$c,$d,$e)=@_;
129my $j=$i+1;
130
131$code.=<<___ if (!$big_endian && $i==15);
132 srl $t0,@X[$i],24 # byte swap($i)
133 srl $t1,@X[$i],8
134 andi $t2,@X[$i],0xFF00
135 sll @X[$i],@X[$i],24
136 andi $t1,0xFF00
137 sll $t2,$t2,8
138 or @X[$i],$t0
139 or @X[$i],$t1
140 or @X[$i],$t2
141___
142$code.=<<___;
143 xor @X[$j%16],@X[($j+2)%16]
144 sll $t0,$a,5 # $i
145 addu $e,$K
146 srl $t1,$a,27
147 addu $e,$t0
148 xor @X[$j%16],@X[($j+8)%16]
149 xor $t0,$c,$d
150 addu $e,$t1
151 xor @X[$j%16],@X[($j+13)%16]
152 sll $t2,$b,30
153 and $t0,$b
154 srl $t1,@X[$j%16],31
155 addu @X[$j%16],@X[$j%16]
156 srl $b,$b,2
157 xor $t0,$d
158 or @X[$j%16],$t1
159 addu $e,@X[$i%16]
160 or $b,$t2
161 addu $e,$t0
162___
163}
164
165sub BODY_20_39 {
166my ($i,$a,$b,$c,$d,$e)=@_;
167my $j=$i+1;
168$code.=<<___ if ($i<79);
169 xor @X[$j%16],@X[($j+2)%16]
170 sll $t0,$a,5 # $i
171 addu $e,$K
172 srl $t1,$a,27
173 addu $e,$t0
174 xor @X[$j%16],@X[($j+8)%16]
175 xor $t0,$c,$d
176 addu $e,$t1
177 xor @X[$j%16],@X[($j+13)%16]
178 sll $t2,$b,30
179 xor $t0,$b
180 srl $t1,@X[$j%16],31
181 addu @X[$j%16],@X[$j%16]
182 srl $b,$b,2
183 addu $e,@X[$i%16]
184 or @X[$j%16],$t1
185 or $b,$t2
186 addu $e,$t0
187___
188$code.=<<___ if ($i==79);
189 lw @X[0],0($ctx)
190 sll $t0,$a,5 # $i
191 addu $e,$K
192 lw @X[1],4($ctx)
193 srl $t1,$a,27
194 addu $e,$t0
195 lw @X[2],8($ctx)
196 xor $t0,$c,$d
197 addu $e,$t1
198 lw @X[3],12($ctx)
199 sll $t2,$b,30
200 xor $t0,$b
201 lw @X[4],16($ctx)
202 srl $b,$b,2
203 addu $e,@X[$i%16]
204 or $b,$t2
205 addu $e,$t0
206___
207}
208
209sub BODY_40_59 {
210my ($i,$a,$b,$c,$d,$e)=@_;
211my $j=$i+1;
212$code.=<<___ if ($i<79);
213 xor @X[$j%16],@X[($j+2)%16]
214 sll $t0,$a,5 # $i
215 addu $e,$K
216 srl $t1,$a,27
217 addu $e,$t0
218 xor @X[$j%16],@X[($j+8)%16]
219 and $t0,$c,$d
220 addu $e,$t1
221 xor @X[$j%16],@X[($j+13)%16]
222 sll $t2,$b,30
223 addu $e,$t0
224 srl $t1,@X[$j%16],31
225 xor $t0,$c,$d
226 addu @X[$j%16],@X[$j%16]
227 and $t0,$b
228 srl $b,$b,2
229 or @X[$j%16],$t1
230 addu $e,@X[$i%16]
231 or $b,$t2
232 addu $e,$t0
233___
234}
235
236$FRAMESIZE=16; # large enough to accommodate NUBI saved registers
237$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
238
239$code=<<___;
240.text
241
242.set noat
243.set noreorder
244.align 5
245.globl sha1_block_data_order
246.ent sha1_block_data_order
247sha1_block_data_order:
248 .frame $sp,$FRAMESIZE*$SZREG,$ra
249 .mask $SAVED_REGS_MASK,-$SZREG
250 .set noreorder
251 $PTR_SUB $sp,$FRAMESIZE*$SZREG
252 $REG_S $ra,($FRAMESIZE-1)*$SZREG($sp)
253 $REG_S $fp,($FRAMESIZE-2)*$SZREG($sp)
254 $REG_S $s11,($FRAMESIZE-3)*$SZREG($sp)
255 $REG_S $s10,($FRAMESIZE-4)*$SZREG($sp)
256 $REG_S $s9,($FRAMESIZE-5)*$SZREG($sp)
257 $REG_S $s8,($FRAMESIZE-6)*$SZREG($sp)
258 $REG_S $s7,($FRAMESIZE-7)*$SZREG($sp)
259 $REG_S $s6,($FRAMESIZE-8)*$SZREG($sp)
260 $REG_S $s5,($FRAMESIZE-9)*$SZREG($sp)
261 $REG_S $s4,($FRAMESIZE-10)*$SZREG($sp)
262___
263$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
264 $REG_S $s3,($FRAMESIZE-11)*$SZREG($sp)
265 $REG_S $s2,($FRAMESIZE-12)*$SZREG($sp)
266 $REG_S $s1,($FRAMESIZE-13)*$SZREG($sp)
267 $REG_S $s0,($FRAMESIZE-14)*$SZREG($sp)
268 $REG_S $gp,($FRAMESIZE-15)*$SZREG($sp)
269___
270$code.=<<___;
271 $PTR_SLL $num,6
272 $PTR_ADD $num,$inp
273 $REG_S $num,0($sp)
274 lw $A,0($ctx)
275 lw $B,4($ctx)
276 lw $C,8($ctx)
277 lw $D,12($ctx)
278 b .Loop
279 lw $E,16($ctx)
280.align 4
281.Loop:
282 .set reorder
283 lwl @X[0],$MSB($inp)
284 lui $K,0x5a82
285 lwr @X[0],$LSB($inp)
286 ori $K,0x7999 # K_00_19
287___
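# Illustrative note (the helper below is not used by the generator): each
# lui/ori pair above synthesizes a 32-bit round constant from two 16-bit
# halves, since MIPS has no 32-bit immediate form.
sub mips_const { my ($hi16,$lo16)=@_; return (($hi16 << 16) | $lo16) & 0xffffffff; }
# e.g. mips_const(0x5a82, 0x7999) == 0x5a827999 == K_00_19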
288for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); }
289for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); }
290$code.=<<___;
291 lui $K,0x6ed9
292 ori $K,0xeba1 # K_20_39
293___
294for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
295$code.=<<___;
296 lui $K,0x8f1b
297 ori $K,0xbcdc # K_40_59
298___
299for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
300$code.=<<___;
301 lui $K,0xca62
302 ori $K,0xc1d6 # K_60_79
303___
304for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
305$code.=<<___;
306 $PTR_ADD $inp,64
307 $REG_L $num,0($sp)
308
309 addu $A,$X[0]
310 addu $B,$X[1]
311 sw $A,0($ctx)
312 addu $C,$X[2]
313 addu $D,$X[3]
314 sw $B,4($ctx)
315 addu $E,$X[4]
316 sw $C,8($ctx)
317 sw $D,12($ctx)
318 sw $E,16($ctx)
319 .set noreorder
320 bne $inp,$num,.Loop
321 nop
322
323 .set noreorder
324 $REG_L $ra,($FRAMESIZE-1)*$SZREG($sp)
325 $REG_L $fp,($FRAMESIZE-2)*$SZREG($sp)
326 $REG_L $s11,($FRAMESIZE-3)*$SZREG($sp)
327 $REG_L $s10,($FRAMESIZE-4)*$SZREG($sp)
328 $REG_L $s9,($FRAMESIZE-5)*$SZREG($sp)
329 $REG_L $s8,($FRAMESIZE-6)*$SZREG($sp)
330 $REG_L $s7,($FRAMESIZE-7)*$SZREG($sp)
331 $REG_L $s6,($FRAMESIZE-8)*$SZREG($sp)
332 $REG_L $s5,($FRAMESIZE-9)*$SZREG($sp)
333 $REG_L $s4,($FRAMESIZE-10)*$SZREG($sp)
334___
335$code.=<<___ if ($flavour =~ /nubi/i);
336 $REG_L $s3,($FRAMESIZE-11)*$SZREG($sp)
337 $REG_L $s2,($FRAMESIZE-12)*$SZREG($sp)
338 $REG_L $s1,($FRAMESIZE-13)*$SZREG($sp)
339 $REG_L $s0,($FRAMESIZE-14)*$SZREG($sp)
340 $REG_L $gp,($FRAMESIZE-15)*$SZREG($sp)
341___
342$code.=<<___;
343 jr $ra
344 $PTR_ADD $sp,$FRAMESIZE*$SZREG
345.end sha1_block_data_order
346.rdata
347.asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
348___
349print $code;
350close STDOUT;
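The little-endian path in BODY_00_14/BODY_15_19 above byte-swaps each input word with an srl/andi/sll/or sequence, since MIPS32 has no byte-swap instruction. A hypothetical Perl model of that exact sequence (illustrative only, not part of the module):

sub bswap32 {
    my ($x) = @_;                       # 32-bit input word
    my $b0 = ($x >> 24) & 0x000000ff;   # srl $t0,X,24
    my $b1 = ($x >> 8)  & 0x0000ff00;   # srl $t1,X,8 ; andi $t1,0xFF00
    my $b2 = ($x << 8)  & 0x00ff0000;   # andi $t2,X,0xFF00 ; sll $t2,$t2,8
    my $b3 = ($x << 24) & 0xff000000;   # sll X,X,24
    return $b3 | $b2 | $b1 | $b0;       # the trailing or/or/or chain
}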
diff --git a/src/lib/libcrypto/sha/asm/sha1-parisc.pl b/src/lib/libcrypto/sha/asm/sha1-parisc.pl
deleted file mode 100644
index 783c26272b..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-parisc.pl
+++ /dev/null
@@ -1,258 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA1 block procedure for PA-RISC.
11
12# June 2009.
13#
14# On PA-7100LC performance is >30% better than gcc 3.2 generated code
15# for aligned input and >50% better for unaligned. Compared to the vendor
16# compiler on PA-8600 it's almost 60% faster in a 64-bit build and just a
17# few percent faster in a 32-bit one (this is for aligned input; data for
18# unaligned input is not available).
19#
20# Special thanks to polarhome.com for providing HP-UX account.
21
22$flavour = shift;
23$output = shift;
24open STDOUT,">$output";
25
26if ($flavour =~ /64/) {
27 $LEVEL ="2.0W";
28 $SIZE_T =8;
29 $FRAME_MARKER =80;
30 $SAVED_RP =16;
31 $PUSH ="std";
32 $PUSHMA ="std,ma";
33 $POP ="ldd";
34 $POPMB ="ldd,mb";
35} else {
36 $LEVEL ="1.0";
37 $SIZE_T =4;
38 $FRAME_MARKER =48;
39 $SAVED_RP =20;
40 $PUSH ="stw";
41 $PUSHMA ="stwm";
42 $POP ="ldw";
43 $POPMB ="ldwm";
44}
45
46$FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker
47 # [+ argument transfer]
48$ctx="%r26"; # arg0
49$inp="%r25"; # arg1
50$num="%r24"; # arg2
51
52$t0="%r28";
53$t1="%r29";
54$K="%r31";
55
56@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
57 "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0);
58
59@V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23");
60
61sub BODY_00_19 {
62my ($i,$a,$b,$c,$d,$e)=@_;
63my $j=$i+1;
64$code.=<<___ if ($i<15);
65 addl $K,$e,$e ; $i
66 shd $a,$a,27,$t1
67 addl @X[$i],$e,$e
68 and $c,$b,$t0
69 addl $t1,$e,$e
70 andcm $d,$b,$t1
71 shd $b,$b,2,$b
72 or $t1,$t0,$t0
73 addl $t0,$e,$e
74___
75$code.=<<___ if ($i>=15); # with forward Xupdate
76 addl $K,$e,$e ; $i
77 shd $a,$a,27,$t1
78 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
79 addl @X[$i%16],$e,$e
80 and $c,$b,$t0
81 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
82 addl $t1,$e,$e
83 andcm $d,$b,$t1
84 shd $b,$b,2,$b
85 or $t1,$t0,$t0
86 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
87 add $t0,$e,$e
88 shd @X[$j%16],@X[$j%16],31,@X[$j%16]
89___
90}
91
92sub BODY_20_39 {
93my ($i,$a,$b,$c,$d,$e)=@_;
94my $j=$i+1;
95$code.=<<___ if ($i<79);
96 xor @X[($j+2)%16],@X[$j%16],@X[$j%16] ; $i
97 addl $K,$e,$e
98 shd $a,$a,27,$t1
99 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
100 addl @X[$i%16],$e,$e
101 xor $b,$c,$t0
102 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
103 addl $t1,$e,$e
104 shd $b,$b,2,$b
105 xor $d,$t0,$t0
106 shd @X[$j%16],@X[$j%16],31,@X[$j%16]
107 addl $t0,$e,$e
108___
109$code.=<<___ if ($i==79); # with context load
110 ldw 0($ctx),@X[0] ; $i
111 addl $K,$e,$e
112 shd $a,$a,27,$t1
113 ldw 4($ctx),@X[1]
114 addl @X[$i%16],$e,$e
115 xor $b,$c,$t0
116 ldw 8($ctx),@X[2]
117 addl $t1,$e,$e
118 shd $b,$b,2,$b
119 xor $d,$t0,$t0
120 ldw 12($ctx),@X[3]
121 addl $t0,$e,$e
122 ldw 16($ctx),@X[4]
123___
124}
125
126sub BODY_40_59 {
127my ($i,$a,$b,$c,$d,$e)=@_;
128my $j=$i+1;
129$code.=<<___;
130 shd $a,$a,27,$t1 ; $i
131 addl $K,$e,$e
132 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
133 xor $d,$c,$t0
134 addl @X[$i%16],$e,$e
135 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
136 and $b,$t0,$t0
137 addl $t1,$e,$e
138 shd $b,$b,2,$b
139 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
140 addl $t0,$e,$e
141 and $d,$c,$t1
142 shd @X[$j%16],@X[$j%16],31,@X[$j%16]
143 addl $t1,$e,$e
144___
145}
146
147$code=<<___;
148 .LEVEL $LEVEL
149 .text
150
151 .EXPORT sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
152sha1_block_data_order
153 .PROC
154 .CALLINFO FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16
155 .ENTRY
156 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
157 $PUSHMA %r3,$FRAME(%sp)
158 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
159 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
160 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
161 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
162 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
163 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
164 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
165 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
166 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
167 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
168 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
169 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
170 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
171
172 ldw 0($ctx),$A
173 ldw 4($ctx),$B
174 ldw 8($ctx),$C
175 ldw 12($ctx),$D
176 ldw 16($ctx),$E
177
178 extru $inp,31,2,$t0 ; t0=inp&3;
179 sh3addl $t0,%r0,$t0 ; t0*=8;
180 subi 32,$t0,$t0 ; t0=32-t0;
181 mtctl $t0,%cr11 ; %sar=t0;
182
183L\$oop
184 ldi 3,$t0
185 andcm $inp,$t0,$t0 ; 64-bit neutral
186___
187 for ($i=0;$i<15;$i++) { # load input block
188 $code.="\tldw `4*$i`($t0),@X[$i]\n"; }
189$code.=<<___;
190 cmpb,*= $inp,$t0,L\$aligned
191 ldw 60($t0),@X[15]
192 ldw 64($t0),@X[16]
193___
194 for ($i=0;$i<16;$i++) { # align input
195 $code.="\tvshd @X[$i],@X[$i+1],@X[$i]\n"; }
196$code.=<<___;
197L\$aligned
198 ldil L'0x5a827000,$K ; K_00_19
199 ldo 0x999($K),$K
200___
201for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
202$code.=<<___;
203 ldil L'0x6ed9e000,$K ; K_20_39
204 ldo 0xba1($K),$K
205___
206
207for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
208$code.=<<___;
209 ldil L'0x8f1bb000,$K ; K_40_59
210 ldo 0xcdc($K),$K
211___
212
213for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
214$code.=<<___;
215 ldil L'0xca62c000,$K ; K_60_79
216 ldo 0x1d6($K),$K
217___
218for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
219
220$code.=<<___;
221 addl @X[0],$A,$A
222 addl @X[1],$B,$B
223 addl @X[2],$C,$C
224 addl @X[3],$D,$D
225 addl @X[4],$E,$E
226 stw $A,0($ctx)
227 stw $B,4($ctx)
228 stw $C,8($ctx)
229 stw $D,12($ctx)
230 stw $E,16($ctx)
231 addib,*<> -1,$num,L\$oop
232 ldo 64($inp),$inp
233
234 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
235 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
236 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
237 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
238 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
239 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
240 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
241 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
242 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
243 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
244 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
245 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
246 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
247 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
248 bv (%r2)
249 .EXIT
250 $POPMB -$FRAME(%sp),%r3
251 .PROCEND
252___
253
254$code =~ s/\`([^\`]*)\`/eval $1/gem;
255$code =~ s/,\*/,/gm if ($SIZE_T==4);
256$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8);
257print $code;
258close STDOUT;
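All rotates in the module above come from the PA-RISC shd (shift double) instruction: extracting 32 bits from a register concatenated with itself is a rotate, so "shd $a,$a,27,$t1" yields the SHA-1 rotl(a,5). A rough Perl model of the assumed semantics, for illustration only:

sub shd {
    my ($hi, $lo, $n) = @_;                          # 64-bit hi:lo, shifted right by $n,
    return ((($hi << 32) | $lo) >> $n) & 0xffffffff; # keeping the low 32 bits
}
# shd($x, $x, $n) == rotr32($x, $n), hence shd($a,$a,27,...) == rotl32($a, 5)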
diff --git a/src/lib/libcrypto/sha/asm/sha1-ppc.pl b/src/lib/libcrypto/sha/asm/sha1-ppc.pl
deleted file mode 100755
index 85342b6a82..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-ppc.pl
+++ /dev/null
@@ -1,318 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# I let hardware handle unaligned input(*), except on page boundaries
11# (see below for details). Otherwise it is a straightforward implementation
12# with the X vector in the register bank. The module is big-endian [which is
13# no big deal as there are no little-endian targets left around].
14#
15# (*) Does this mean that this module is inappropriate for PPC403? Does
16# anybody know if pre-POWER3 can sustain unaligned loads?
17
18# -m64 -m32
19# ----------------------------------
20# PPC970,gcc-4.0.0 +76% +59%
21# Power6,xlc-7 +68% +33%
22
23$flavour = shift;
24
25if ($flavour =~ /64/) {
26 $SIZE_T =8;
27 $LRSAVE =2*$SIZE_T;
28 $UCMP ="cmpld";
29 $STU ="stdu";
30 $POP ="ld";
31 $PUSH ="std";
32} elsif ($flavour =~ /32/) {
33 $SIZE_T =4;
34 $LRSAVE =$SIZE_T;
35 $UCMP ="cmplw";
36 $STU ="stwu";
37 $POP ="lwz";
38 $PUSH ="stw";
39} else { die "nonsense $flavour"; }
40
41$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
42( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
43( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
44die "can't locate ppc-xlate.pl";
45
46open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
47
48$FRAME=24*$SIZE_T+64;
49$LOCALS=6*$SIZE_T;
50
51$K ="r0";
52$sp ="r1";
53$toc="r2";
54$ctx="r3";
55$inp="r4";
56$num="r5";
57$t0 ="r15";
58$t1 ="r6";
59
60$A ="r7";
61$B ="r8";
62$C ="r9";
63$D ="r10";
64$E ="r11";
65$T ="r12";
66
67@V=($A,$B,$C,$D,$E,$T);
68@X=("r16","r17","r18","r19","r20","r21","r22","r23",
69 "r24","r25","r26","r27","r28","r29","r30","r31");
70
71sub BODY_00_19 {
72my ($i,$a,$b,$c,$d,$e,$f)=@_;
73my $j=$i+1;
74$code.=<<___ if ($i==0);
75 lwz @X[$i],`$i*4`($inp)
76___
77$code.=<<___ if ($i<15);
78 lwz @X[$j],`$j*4`($inp)
79 add $f,$K,$e
80 rotlwi $e,$a,5
81 add $f,$f,@X[$i]
82 and $t0,$c,$b
83 add $f,$f,$e
84 andc $t1,$d,$b
85 rotlwi $b,$b,30
86 or $t0,$t0,$t1
87 add $f,$f,$t0
88___
89$code.=<<___ if ($i>=15);
90 add $f,$K,$e
91 rotlwi $e,$a,5
92 xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
93 add $f,$f,@X[$i%16]
94 and $t0,$c,$b
95 xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
96 add $f,$f,$e
97 andc $t1,$d,$b
98 rotlwi $b,$b,30
99 or $t0,$t0,$t1
100 xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
101 add $f,$f,$t0
102 rotlwi @X[$j%16],@X[$j%16],1
103___
104}
105
106sub BODY_20_39 {
107my ($i,$a,$b,$c,$d,$e,$f)=@_;
108my $j=$i+1;
109$code.=<<___ if ($i<79);
110 add $f,$K,$e
111 rotlwi $e,$a,5
112 xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
113 add $f,$f,@X[$i%16]
114 xor $t0,$b,$c
115 xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
116 add $f,$f,$e
117 rotlwi $b,$b,30
118 xor $t0,$t0,$d
119 xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
120 add $f,$f,$t0
121 rotlwi @X[$j%16],@X[$j%16],1
122___
123$code.=<<___ if ($i==79);
124 add $f,$K,$e
125 rotlwi $e,$a,5
126 lwz r16,0($ctx)
127 add $f,$f,@X[$i%16]
128 xor $t0,$b,$c
129 lwz r17,4($ctx)
130 add $f,$f,$e
131 rotlwi $b,$b,30
132 lwz r18,8($ctx)
133 xor $t0,$t0,$d
134 lwz r19,12($ctx)
135 add $f,$f,$t0
136 lwz r20,16($ctx)
137___
138}
139
140sub BODY_40_59 {
141my ($i,$a,$b,$c,$d,$e,$f)=@_;
142my $j=$i+1;
143$code.=<<___;
144 add $f,$K,$e
145 rotlwi $e,$a,5
146 xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
147 add $f,$f,@X[$i%16]
148 and $t0,$b,$c
149 xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
150 add $f,$f,$e
151 or $t1,$b,$c
152 rotlwi $b,$b,30
153 xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
154 and $t1,$t1,$d
155 or $t0,$t0,$t1
156 rotlwi @X[$j%16],@X[$j%16],1
157 add $f,$f,$t0
158___
159}
160
161$code=<<___;
162.machine "any"
163.text
164
165.globl .sha1_block_data_order
166.align 4
167.sha1_block_data_order:
168 $STU $sp,-$FRAME($sp)
169 mflr r0
170 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
171 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
172 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
173 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
174 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
175 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
176 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
177 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
178 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
179 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
180 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
181 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
182 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
183 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
184 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
185 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
186 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
187 $PUSH r0,`$FRAME+$LRSAVE`($sp)
188 lwz $A,0($ctx)
189 lwz $B,4($ctx)
190 lwz $C,8($ctx)
191 lwz $D,12($ctx)
192 lwz $E,16($ctx)
193 andi. r0,$inp,3
194 bne Lunaligned
195Laligned:
196 mtctr $num
197 bl Lsha1_block_private
198 b Ldone
199
200; The PowerPC specification allows an implementation to be ill-behaved
201; upon an unaligned access which crosses a page boundary. The "better
202; safe than sorry" principle makes me treat it specially. I don't
203; look for the particular offending word, but rather for a 64-byte input
204; block which crosses the boundary. Once found, that block is aligned
205; and hashed separately...
206.align 4
207Lunaligned:
208 subfic $t1,$inp,4096
209 andi. $t1,$t1,4095 ; distance to closest page boundary
210 srwi. $t1,$t1,6 ; t1/=64
211 beq Lcross_page
212 $UCMP $num,$t1
213 ble- Laligned ; didn't cross the page boundary
214 mtctr $t1
215 subfc $num,$t1,$num
216 bl Lsha1_block_private
217Lcross_page:
218 li $t1,16
219 mtctr $t1
220 addi r20,$sp,$LOCALS ; spot within the frame
221Lmemcpy:
222 lbz r16,0($inp)
223 lbz r17,1($inp)
224 lbz r18,2($inp)
225 lbz r19,3($inp)
226 addi $inp,$inp,4
227 stb r16,0(r20)
228 stb r17,1(r20)
229 stb r18,2(r20)
230 stb r19,3(r20)
231 addi r20,r20,4
232 bdnz Lmemcpy
233
234 $PUSH $inp,`$FRAME-$SIZE_T*18`($sp)
235 li $t1,1
236 addi $inp,$sp,$LOCALS
237 mtctr $t1
238 bl Lsha1_block_private
239 $POP $inp,`$FRAME-$SIZE_T*18`($sp)
240 addic. $num,$num,-1
241 bne- Lunaligned
242
243Ldone:
244 $POP r0,`$FRAME+$LRSAVE`($sp)
245 $POP r15,`$FRAME-$SIZE_T*17`($sp)
246 $POP r16,`$FRAME-$SIZE_T*16`($sp)
247 $POP r17,`$FRAME-$SIZE_T*15`($sp)
248 $POP r18,`$FRAME-$SIZE_T*14`($sp)
249 $POP r19,`$FRAME-$SIZE_T*13`($sp)
250 $POP r20,`$FRAME-$SIZE_T*12`($sp)
251 $POP r21,`$FRAME-$SIZE_T*11`($sp)
252 $POP r22,`$FRAME-$SIZE_T*10`($sp)
253 $POP r23,`$FRAME-$SIZE_T*9`($sp)
254 $POP r24,`$FRAME-$SIZE_T*8`($sp)
255 $POP r25,`$FRAME-$SIZE_T*7`($sp)
256 $POP r26,`$FRAME-$SIZE_T*6`($sp)
257 $POP r27,`$FRAME-$SIZE_T*5`($sp)
258 $POP r28,`$FRAME-$SIZE_T*4`($sp)
259 $POP r29,`$FRAME-$SIZE_T*3`($sp)
260 $POP r30,`$FRAME-$SIZE_T*2`($sp)
261 $POP r31,`$FRAME-$SIZE_T*1`($sp)
262 mtlr r0
263 addi $sp,$sp,$FRAME
264 blr
265___
266
267# This is a private block function which uses a tailored calling
268# interface: upon entry the SHA_CTX is pre-loaded into the given
269# registers and the counter register contains the number of chunks to
270# digest...
271$code.=<<___;
272.align 4
273Lsha1_block_private:
274___
275$code.=<<___; # load K_00_19
276 lis $K,0x5a82
277 ori $K,$K,0x7999
278___
279for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
280$code.=<<___; # load K_20_39
281 lis $K,0x6ed9
282 ori $K,$K,0xeba1
283___
284for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
285$code.=<<___; # load K_40_59
286 lis $K,0x8f1b
287 ori $K,$K,0xbcdc
288___
289for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
290$code.=<<___; # load K_60_79
291 lis $K,0xca62
292 ori $K,$K,0xc1d6
293___
294for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
295$code.=<<___;
296 add r16,r16,$E
297 add r17,r17,$T
298 add r18,r18,$A
299 add r19,r19,$B
300 add r20,r20,$C
301 stw r16,0($ctx)
302 mr $A,r16
303 stw r17,4($ctx)
304 mr $B,r17
305 stw r18,8($ctx)
306 mr $C,r18
307 stw r19,12($ctx)
308 mr $D,r19
309 stw r20,16($ctx)
310 mr $E,r20
311 addi $inp,$inp,`16*4`
312 bdnz- Lsha1_block_private
313 blr
314___
315
316$code =~ s/\`([^\`]*)\`/eval $1/gem;
317print $code;
318close STDOUT;
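The Lunaligned path above first counts how many whole 64-byte blocks fit before the next 4096-byte page boundary (the subfic/andi./srwi. triplet), hashes those in place, and only copies the boundary-crossing block into the stack frame. A hedged Perl sketch of that bookkeeping (the helper name is illustrative; a zero result corresponds to the beq into Lcross_page):

sub blocks_before_page_cross {
    my ($inp) = @_;                    # byte address of the input
    my $dist = (4096 - $inp) & 4095;   # distance to the closest page boundary
    return $dist >> 6;                 # whole 64-byte blocks within it
}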
diff --git a/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl
deleted file mode 100644
index 5235c59e63..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl
+++ /dev/null
@@ -1,282 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# Performance improvement is not really impressive on pre-T1 CPUs: +8%
11# over Sun C and +25% over gcc [3.3]. On T1, a.k.a. Niagara, it
12# turned out to be 40% faster than 64-bit code generated by Sun C 5.8 and
13# >2x faster than 64-bit code generated by gcc 3.4. And there is a gimmick.
14# The X[16] vector is packed into 8 64-bit registers and as a result nothing
15# is spilled on the stack. In addition, input data is loaded in a compact
16# instruction sequence, thus minimizing the window in which the code is
17# subject to the [inter-thread] cache-thrashing hazard. The goal is to
18# ensure scalability on UltraSPARC T1, or rather to avoid decay when the
19# number of active threads exceeds the number of physical cores.
20
21$bits=32;
22for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
23if ($bits==64) { $bias=2047; $frame=192; }
24else { $bias=0; $frame=112; }
25
26$output=shift;
27open STDOUT,">$output";
28
29@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
30$rot1m="%g2";
31$tmp64="%g3";
32$Xi="%g4";
33$A="%l0";
34$B="%l1";
35$C="%l2";
36$D="%l3";
37$E="%l4";
38@V=($A,$B,$C,$D,$E);
39$K_00_19="%l5";
40$K_20_39="%l6";
41$K_40_59="%l7";
42$K_60_79="%g5";
43@K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);
44
45$ctx="%i0";
46$inp="%i1";
47$len="%i2";
48$tmp0="%i3";
49$tmp1="%i4";
50$tmp2="%i5";
51
52sub BODY_00_15 {
53my ($i,$a,$b,$c,$d,$e)=@_;
54my $xi=($i&1)?@X[($i/2)%8]:$Xi;
55
56$code.=<<___;
57 sll $a,5,$tmp0 !! $i
58 add @K[$i/20],$e,$e
59 srl $a,27,$tmp1
60 add $tmp0,$e,$e
61 and $c,$b,$tmp0
62 add $tmp1,$e,$e
63 sll $b,30,$tmp2
64 andn $d,$b,$tmp1
65 srl $b,2,$b
66 or $tmp1,$tmp0,$tmp1
67 or $tmp2,$b,$b
68 add $xi,$e,$e
69___
70if ($i&1 && $i<15) {
71 $code.=
72 " srlx @X[(($i+1)/2)%8],32,$Xi\n";
73}
74$code.=<<___;
75 add $tmp1,$e,$e
76___
77}
78
79sub Xupdate {
80my ($i,$a,$b,$c,$d,$e)=@_;
81my $j=$i/2;
82
83if ($i&1) {
84$code.=<<___;
85 sll $a,5,$tmp0 !! $i
86 add @K[$i/20],$e,$e
87 srl $a,27,$tmp1
88___
89} else {
90$code.=<<___;
91 sllx @X[($j+6)%8],32,$Xi ! Xupdate($i)
92 xor @X[($j+1)%8],@X[$j%8],@X[$j%8]
93 srlx @X[($j+7)%8],32,$tmp1
94 xor @X[($j+4)%8],@X[$j%8],@X[$j%8]
95 sll $a,5,$tmp0 !! $i
96 or $tmp1,$Xi,$Xi
97 add @K[$i/20],$e,$e !!
98 xor $Xi,@X[$j%8],@X[$j%8]
99 srlx @X[$j%8],31,$Xi
100 add @X[$j%8],@X[$j%8],@X[$j%8]
101 and $Xi,$rot1m,$Xi
102 andn @X[$j%8],$rot1m,@X[$j%8]
103 srl $a,27,$tmp1 !!
104 or $Xi,@X[$j%8],@X[$j%8]
105___
106}
107}
108
109sub BODY_16_19 {
110my ($i,$a,$b,$c,$d,$e)=@_;
111
112 &Xupdate(@_);
113 if ($i&1) {
114 $xi=@X[($i/2)%8];
115 } else {
116 $xi=$Xi;
117 $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
118 }
119$code.=<<___;
120 add $tmp0,$e,$e !!
121 and $c,$b,$tmp0
122 add $tmp1,$e,$e
123 sll $b,30,$tmp2
124 add $xi,$e,$e
125 andn $d,$b,$tmp1
126 srl $b,2,$b
127 or $tmp1,$tmp0,$tmp1
128 or $tmp2,$b,$b
129 add $tmp1,$e,$e
130___
131}
132
133sub BODY_20_39 {
134my ($i,$a,$b,$c,$d,$e)=@_;
135my $xi;
136 &Xupdate(@_);
137 if ($i&1) {
138 $xi=@X[($i/2)%8];
139 } else {
140 $xi=$Xi;
141 $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
142 }
143$code.=<<___;
144 add $tmp0,$e,$e !!
145 xor $c,$b,$tmp0
146 add $tmp1,$e,$e
147 sll $b,30,$tmp2
148 xor $d,$tmp0,$tmp1
149 srl $b,2,$b
150 add $tmp1,$e,$e
151 or $tmp2,$b,$b
152 add $xi,$e,$e
153___
154}
155
156sub BODY_40_59 {
157my ($i,$a,$b,$c,$d,$e)=@_;
158my $xi;
159 &Xupdate(@_);
160 if ($i&1) {
161 $xi=@X[($i/2)%8];
162 } else {
163 $xi=$Xi;
164 $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
165 }
166$code.=<<___;
167 add $tmp0,$e,$e !!
168 and $c,$b,$tmp0
169 add $tmp1,$e,$e
170 sll $b,30,$tmp2
171 or $c,$b,$tmp1
172 srl $b,2,$b
173 and $d,$tmp1,$tmp1
174 add $xi,$e,$e
175 or $tmp1,$tmp0,$tmp1
176 or $tmp2,$b,$b
177 add $tmp1,$e,$e
178___
179}
180
181$code.=<<___ if ($bits==64);
182.register %g2,#scratch
183.register %g3,#scratch
184___
185$code.=<<___;
186.section ".text",#alloc,#execinstr
187
188.align 32
189.globl sha1_block_data_order
190sha1_block_data_order:
191 save %sp,-$frame,%sp
192 sllx $len,6,$len
193 add $inp,$len,$len
194
195 or %g0,1,$rot1m
196 sllx $rot1m,32,$rot1m
197 or $rot1m,1,$rot1m
198
199 ld [$ctx+0],$A
200 ld [$ctx+4],$B
201 ld [$ctx+8],$C
202 ld [$ctx+12],$D
203 ld [$ctx+16],$E
204 andn $inp,7,$tmp0
205
206 sethi %hi(0x5a827999),$K_00_19
207 or $K_00_19,%lo(0x5a827999),$K_00_19
208 sethi %hi(0x6ed9eba1),$K_20_39
209 or $K_20_39,%lo(0x6ed9eba1),$K_20_39
210 sethi %hi(0x8f1bbcdc),$K_40_59
211 or $K_40_59,%lo(0x8f1bbcdc),$K_40_59
212 sethi %hi(0xca62c1d6),$K_60_79
213 or $K_60_79,%lo(0xca62c1d6),$K_60_79
214
215.Lloop:
216 ldx [$tmp0+0],@X[0]
217 ldx [$tmp0+16],@X[2]
218 ldx [$tmp0+32],@X[4]
219 ldx [$tmp0+48],@X[6]
220 and $inp,7,$tmp1
221 ldx [$tmp0+8],@X[1]
222 sll $tmp1,3,$tmp1
223 ldx [$tmp0+24],@X[3]
224 subcc %g0,$tmp1,$tmp2 ! should be 64-$tmp1, but -$tmp1 works too
225 ldx [$tmp0+40],@X[5]
226 bz,pt %icc,.Laligned
227 ldx [$tmp0+56],@X[7]
228
229 sllx @X[0],$tmp1,@X[0]
230 ldx [$tmp0+64],$tmp64
231___
232for($i=0;$i<7;$i++)
233{ $code.=<<___;
234 srlx @X[$i+1],$tmp2,$Xi
235 sllx @X[$i+1],$tmp1,@X[$i+1]
236 or $Xi,@X[$i],@X[$i]
237___
238}
239$code.=<<___;
240 srlx $tmp64,$tmp2,$tmp64
241 or $tmp64,@X[7],@X[7]
242.Laligned:
243 srlx @X[0],32,$Xi
244___
245for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
246for (;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
247for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
248for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
249for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
250$code.=<<___;
251
252 ld [$ctx+0],@X[0]
253 ld [$ctx+4],@X[1]
254 ld [$ctx+8],@X[2]
255 ld [$ctx+12],@X[3]
256 add $inp,64,$inp
257 ld [$ctx+16],@X[4]
258 cmp $inp,$len
259
260 add $A,@X[0],$A
261 st $A,[$ctx+0]
262 add $B,@X[1],$B
263 st $B,[$ctx+4]
264 add $C,@X[2],$C
265 st $C,[$ctx+8]
266 add $D,@X[3],$D
267 st $D,[$ctx+12]
268 add $E,@X[4],$E
269 st $E,[$ctx+16]
270
271 bne `$bits==64?"%xcc":"%icc"`,.Lloop
272 andn $inp,7,$tmp0
273
274 ret
275 restore
276.type sha1_block_data_order,#function
277.size sha1_block_data_order,(.-sha1_block_data_order)
278___
279
280$code =~ s/\`([^\`]*)\`/eval $1/gem;
281print $code;
282close STDOUT;
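The gimmick mentioned in the header is visible in Xupdate above: two 32-bit X[] words share each 64-bit register, and $rot1m = 0x0000000100000001 lets both lanes be rotated left by one in parallel. A hypothetical Perl model (illustrative only) of the srlx/add/and/andn/or sequence:

sub rotl1_packed {
    my ($x) = @_;                          # $x packs two 32-bit lanes
    my $m     = 0x0000000100000001;
    my $carry = ($x >> 31) & $m;           # each lane's MSB -> its bit 0
    my $shift = ($x << 1) & ~$m;           # shift, drop cross-lane leakage
    return ($shift | $carry) & 0xffffffffffffffff;
}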
diff --git a/src/lib/libcrypto/sha/asm/sha256-586.pl b/src/lib/libcrypto/sha/asm/sha256-586.pl
deleted file mode 100644
index 2b05c96063..0000000000
--- a/src/lib/libcrypto/sha/asm/sha256-586.pl
+++ /dev/null
@@ -1,249 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# SHA256 block transform for x86. September 2007.
11#
12# Performance in clock cycles per processed byte (less is better):
13#
14# Pentium PIII P4 AMD K8 Core2
15# gcc 46 36 41 27 26
16# icc 57 33 38 25 23
17# x86 asm 40 30 33 20 18
18# x86_64 asm(*) - - 21 16 16
19#
20# (*) x86_64 assembler performance is presented for reference
21# purposes.
22#
23# Performance improvement over compiler generated code varies from
24# 10% to 40% [see above]. Not very impressive on some µ-archs, but
25# it's 5 times smaller and optimizes the amount of writes.
26
27$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
28push(@INC,"${dir}","${dir}../../perlasm");
29require "x86asm.pl";
30
31&asm_init($ARGV[0],"sha256-586.pl",$ARGV[$#ARGV] eq "386");
32
33$A="eax";
34$E="edx";
35$T="ebx";
36$Aoff=&DWP(0,"esp");
37$Boff=&DWP(4,"esp");
38$Coff=&DWP(8,"esp");
39$Doff=&DWP(12,"esp");
40$Eoff=&DWP(16,"esp");
41$Foff=&DWP(20,"esp");
42$Goff=&DWP(24,"esp");
43$Hoff=&DWP(28,"esp");
44$Xoff=&DWP(32,"esp");
45$K256="ebp";
46
47sub BODY_00_15() {
48 my $in_16_63=shift;
49
50 &mov ("ecx",$E);
51 &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2])
52 &ror ("ecx",25-11);
53 &mov ("esi",$Foff);
54 &xor ("ecx",$E);
55 &ror ("ecx",11-6);
56 &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0]
57 &xor ("ecx",$E);
58 &ror ("ecx",6); # Sigma1(e)
59 &mov ("edi",$Goff);
60 &add ($T,"ecx"); # T += Sigma1(e)
61
62 &xor ("esi","edi");
63 &mov ($Eoff,$E); # modulo-scheduled
64 &mov ("ecx",$A);
65 &and ("esi",$E);
66 &mov ($E,$Doff); # e becomes d, which is e in next iteration
67 &xor ("esi","edi"); # Ch(e,f,g)
68 &mov ("edi",$A);
69 &add ($T,"esi"); # T += Ch(e,f,g)
70
71 &ror ("ecx",22-13);
72 &add ($T,$Hoff); # T += h
73 &xor ("ecx",$A);
74 &ror ("ecx",13-2);
75 &mov ("esi",$Boff);
76 &xor ("ecx",$A);
77 &ror ("ecx",2); # Sigma0(a)
78 &add ($E,$T); # d += T
79 &mov ("edi",$Coff);
80
81 &add ($T,"ecx"); # T += Sigma0(a)
82 &mov ($Aoff,$A); # modulo-scheduled
83
84 &mov ("ecx",$A);
85 &sub ("esp",4);
86 &or ($A,"esi"); # a becomes h, which is a in next iteration
87 &and ("ecx","esi");
88 &and ($A,"edi");
89 &mov ("esi",&DWP(0,$K256));
90 &or ($A,"ecx"); # h=Maj(a,b,c)
91
92 &add ($K256,4);
93 &add ($A,$T); # h += T
94 &mov ($T,&DWP(4*(8+15+16-1),"esp")) if ($in_16_63); # preload T
95 &add ($E,"esi"); # d += K256[i]
96 &add ($A,"esi"); # h += K256[i]
97}
98
99&static_label("K256");
100&function_begin("sha256_block_data_order");
101 &mov ("esi",wparam(0)); # ctx
102 &mov ("edi",wparam(1)); # inp
103 &mov ("eax",wparam(2)); # num
104 &mov ("ebx","esp"); # saved sp
105
106 &picsetup($K256);
107 &picsymbol($K256, &label("K256"), $K256);
108
109 &sub ("esp",16);
110 &and ("esp",-64);
111
112 &shl ("eax",6);
113 &add ("eax","edi");
114 &mov (&DWP(0,"esp"),"esi"); # ctx
115 &mov (&DWP(4,"esp"),"edi"); # inp
116 &mov (&DWP(8,"esp"),"eax"); # inp+num*128
117 &mov (&DWP(12,"esp"),"ebx"); # saved sp
118
119&set_label("loop",16);
120 # copy input block to stack reversing byte and dword order
121 for($i=0;$i<4;$i++) {
122 &mov ("eax",&DWP($i*16+0,"edi"));
123 &mov ("ebx",&DWP($i*16+4,"edi"));
124 &mov ("ecx",&DWP($i*16+8,"edi"));
125 &mov ("edx",&DWP($i*16+12,"edi"));
126 &bswap ("eax");
127 &bswap ("ebx");
128 &bswap ("ecx");
129 &bswap ("edx");
130 &push ("eax");
131 &push ("ebx");
132 &push ("ecx");
133 &push ("edx");
134 }
135 &add ("edi",64);
136 &sub ("esp",4*8); # place for A,B,C,D,E,F,G,H
137 &mov (&DWP(4*(8+16)+4,"esp"),"edi");
138
139 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
140 &mov ($A,&DWP(0,"esi"));
141 &mov ("ebx",&DWP(4,"esi"));
142 &mov ("ecx",&DWP(8,"esi"));
143 &mov ("edi",&DWP(12,"esi"));
144 # &mov ($Aoff,$A);
145 &mov ($Boff,"ebx");
146 &mov ($Coff,"ecx");
147 &mov ($Doff,"edi");
148 &mov ($E,&DWP(16,"esi"));
149 &mov ("ebx",&DWP(20,"esi"));
150 &mov ("ecx",&DWP(24,"esi"));
151 &mov ("edi",&DWP(28,"esi"));
152 # &mov ($Eoff,$E);
153 &mov ($Foff,"ebx");
154 &mov ($Goff,"ecx");
155 &mov ($Hoff,"edi");
156
157&set_label("00_15",16);
158 &mov ($T,&DWP(4*(8+15),"esp"));
159
160 &BODY_00_15();
161
162 &cmp ("esi",0xc19bf174);
163 &jne (&label("00_15"));
164
165 &mov ($T,&DWP(4*(8+15+16-1),"esp")); # preloaded in BODY_00_15(1)
166&set_label("16_63",16);
167 &mov ("esi",$T);
168 &mov ("ecx",&DWP(4*(8+15+16-14),"esp"));
169 &ror ("esi",18-7);
170 &mov ("edi","ecx");
171 &xor ("esi",$T);
172 &ror ("esi",7);
173 &shr ($T,3);
174
175 &ror ("edi",19-17);
176 &xor ($T,"esi"); # T = sigma0(X[-15])
177 &xor ("edi","ecx");
178 &ror ("edi",17);
179 &shr ("ecx",10);
180 &add ($T,&DWP(4*(8+15+16),"esp")); # T += X[-16]
181 &xor ("edi","ecx"); # sigma1(X[-2])
182
183 &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7]
184 # &add ($T,"edi"); # T += sigma1(X[-2])
185 # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0]
186
187 &BODY_00_15(1);
188
189 &cmp ("esi",0xc67178f2);
190 &jne (&label("16_63"));
191
192 &mov ("esi",&DWP(4*(8+16+64)+0,"esp"));#ctx
193 # &mov ($A,$Aoff);
194 &mov ("ebx",$Boff);
195 &mov ("ecx",$Coff);
196 &mov ("edi",$Doff);
197 &add ($A,&DWP(0,"esi"));
198 &add ("ebx",&DWP(4,"esi"));
199 &add ("ecx",&DWP(8,"esi"));
200 &add ("edi",&DWP(12,"esi"));
201 &mov (&DWP(0,"esi"),$A);
202 &mov (&DWP(4,"esi"),"ebx");
203 &mov (&DWP(8,"esi"),"ecx");
204 &mov (&DWP(12,"esi"),"edi");
205 # &mov ($E,$Eoff);
206 &mov ("eax",$Foff);
207 &mov ("ebx",$Goff);
208 &mov ("ecx",$Hoff);
209 &mov ("edi",&DWP(4*(8+16+64)+4,"esp"));#inp
210 &add ($E,&DWP(16,"esi"));
211 &add ("eax",&DWP(20,"esi"));
212 &add ("ebx",&DWP(24,"esi"));
213 &add ("ecx",&DWP(28,"esi"));
214 &mov (&DWP(16,"esi"),$E);
215 &mov (&DWP(20,"esi"),"eax");
216 &mov (&DWP(24,"esi"),"ebx");
217 &mov (&DWP(28,"esi"),"ecx");
218
219 &add ("esp",4*(8+16+64)); # destroy frame
220 &sub ($K256,4*64); # rewind K
221
222 &cmp ("edi",&DWP(8,"esp")); # are we done yet?
223 &jb (&label("loop"));
224
225 &mov ("esp",&DWP(12,"esp")); # restore sp
226&function_end_A();
227&function_end_B("sha256_block_data_order");
228
229 &rodataseg();
230&set_label("K256",64);
231 &data_word(0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5);
232 &data_word(0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5);
233 &data_word(0xd807aa98,0x12835b01,0x243185be,0x550c7dc3);
234 &data_word(0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174);
235 &data_word(0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc);
236 &data_word(0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da);
237 &data_word(0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7);
238 &data_word(0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967);
239 &data_word(0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13);
240 &data_word(0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85);
241 &data_word(0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3);
242 &data_word(0xd192e819,0xd6990624,0xf40e3585,0x106aa070);
243 &data_word(0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5);
244 &data_word(0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3);
245 &data_word(0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208);
246 &data_word(0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2);
247 &previous();
248
249&asm_finish();
diff --git a/src/lib/libcrypto/sha/asm/sha256-armv4.pl b/src/lib/libcrypto/sha/asm/sha256-armv4.pl
deleted file mode 100644
index 292520731c..0000000000
--- a/src/lib/libcrypto/sha/asm/sha256-armv4.pl
+++ /dev/null
@@ -1,211 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256 block procedure for ARMv4. May 2007.
11
12# Performance is ~2x better than gcc 3.4 generated code and in "abso-
13# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
14# byte [on single-issue Xscale PXA250 core].
15
16# July 2010.
17#
18# Rescheduling for dual-issue pipeline resulted in 22% improvement on
19# Cortex A8 core and ~20 cycles per processed byte.
20
21# February 2011.
22#
23# Profiler-assisted and platform-specific optimization resulted in 16%
24# improvement on Cortex A8 core and ~17 cycles per processed byte.
25
26while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
27open STDOUT,">$output";
28
29$ctx="r0"; $t0="r0";
30$inp="r1"; $t3="r1";
31$len="r2"; $t1="r2";
32$T1="r3";
33$A="r4";
34$B="r5";
35$C="r6";
36$D="r7";
37$E="r8";
38$F="r9";
39$G="r10";
40$H="r11";
41@V=($A,$B,$C,$D,$E,$F,$G,$H);
42$t2="r12";
43$Ktbl="r14";
44
45@Sigma0=( 2,13,22);
46@Sigma1=( 6,11,25);
47@sigma0=( 7,18, 3);
48@sigma1=(17,19,10);
49
50sub BODY_00_15 {
51my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
52
53$code.=<<___ if ($i<16);
54#if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
55 ldr $T1,[$inp],#4
56#else
57 ldrb $T1,[$inp,#3] @ $i
58 ldrb $t2,[$inp,#2]
59 ldrb $t1,[$inp,#1]
60 ldrb $t0,[$inp],#4
61 orr $T1,$T1,$t2,lsl#8
62 orr $T1,$T1,$t1,lsl#16
63 orr $T1,$T1,$t0,lsl#24
64#endif
65___
66$code.=<<___;
67 mov $t0,$e,ror#$Sigma1[0]
68 ldr $t2,[$Ktbl],#4 @ *K256++
69 eor $t0,$t0,$e,ror#$Sigma1[1]
70 eor $t1,$f,$g
71#if $i>=16
72 add $T1,$T1,$t3 @ from BODY_16_xx
73#elif __ARM_ARCH__>=7 && defined(__ARMEL__) && !defined(__STRICT_ALIGNMENT)
74 rev $T1,$T1
75#endif
76#if $i==15
77 str $inp,[sp,#17*4] @ leave room for $t3
78#endif
79 eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
80 and $t1,$t1,$e
81 str $T1,[sp,#`$i%16`*4]
82 add $T1,$T1,$t0
83 eor $t1,$t1,$g @ Ch(e,f,g)
84 add $T1,$T1,$h
85 mov $h,$a,ror#$Sigma0[0]
86 add $T1,$T1,$t1
87 eor $h,$h,$a,ror#$Sigma0[1]
88 add $T1,$T1,$t2
89 eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
90#if $i>=15
91 ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx
92#endif
93 orr $t0,$a,$b
94 and $t1,$a,$b
95 and $t0,$t0,$c
96 add $h,$h,$T1
97 orr $t0,$t0,$t1 @ Maj(a,b,c)
98 add $d,$d,$T1
99 add $h,$h,$t0
100___
101}
102
103sub BODY_16_XX {
104my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
105
106$code.=<<___;
107 @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i
108 ldr $t2,[sp,#`($i+14)%16`*4]
109 mov $t0,$t3,ror#$sigma0[0]
110 ldr $T1,[sp,#`($i+0)%16`*4]
111 eor $t0,$t0,$t3,ror#$sigma0[1]
112 ldr $t1,[sp,#`($i+9)%16`*4]
113 eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1])
114 mov $t3,$t2,ror#$sigma1[0]
115 add $T1,$T1,$t0
116 eor $t3,$t3,$t2,ror#$sigma1[1]
117 add $T1,$T1,$t1
118 eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
119 @ add $T1,$T1,$t3
120___
121 &BODY_00_15(@_);
122}
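# Hedged reference (illustrative helpers, unused by the generator): the
# ror/lsr mix in BODY_16_XX above computes the message-schedule functions
# parameterized by @sigma0=(7,18,3) and @sigma1=(17,19,10), i.e.
#	sigma0(x) = ROTR(x,7)  ^ ROTR(x,18) ^ (x >> 3)
#	sigma1(x) = ROTR(x,17) ^ ROTR(x,19) ^ (x >> 10)
sub rotr32 { my ($x,$n)=@_; return (($x >> $n) | ($x << (32-$n))) & 0xffffffff; }
sub sigma0 { my ($x)=@_; return rotr32($x,7)  ^ rotr32($x,18) ^ ($x >> 3); }
sub sigma1 { my ($x)=@_; return rotr32($x,17) ^ rotr32($x,19) ^ ($x >> 10); }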
123
124$code=<<___;
125#include "arm_arch.h"
126
127.text
128.code 32
129
130.type K256,%object
131.align 5
132K256:
133.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
134.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
135.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
136.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
137.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
138.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
139.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
140.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
141.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
142.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
143.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
144.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
145.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
146.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
147.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
148.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
149.size K256,.-K256
150
151.global sha256_block_data_order
152.type sha256_block_data_order,%function
153sha256_block_data_order:
154 sub r3,pc,#8 @ sha256_block_data_order
155 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
156 stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
157 ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
158 sub $Ktbl,r3,#256 @ K256
159 sub sp,sp,#16*4 @ alloca(X[16])
160.Loop:
161___
162for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
163$code.=".Lrounds_16_xx:\n";
164for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
165$code.=<<___;
166 and $t2,$t2,#0xff
167 cmp $t2,#0xf2
168 bne .Lrounds_16_xx
169
170 ldr $T1,[sp,#16*4] @ pull ctx
171 ldr $t0,[$T1,#0]
172 ldr $t1,[$T1,#4]
173 ldr $t2,[$T1,#8]
174 add $A,$A,$t0
175 ldr $t0,[$T1,#12]
176 add $B,$B,$t1
177 ldr $t1,[$T1,#16]
178 add $C,$C,$t2
179 ldr $t2,[$T1,#20]
180 add $D,$D,$t0
181 ldr $t0,[$T1,#24]
182 add $E,$E,$t1
183 ldr $t1,[$T1,#28]
184 add $F,$F,$t2
185 ldr $inp,[sp,#17*4] @ pull inp
186 ldr $t2,[sp,#18*4] @ pull inp+len
187 add $G,$G,$t0
188 add $H,$H,$t1
189 stmia $T1,{$A,$B,$C,$D,$E,$F,$G,$H}
190 cmp $inp,$t2
191 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
192 bne .Loop
193
194 add sp,sp,#`16+3`*4 @ destroy frame
195#if __ARM_ARCH__>=5
196 ldmia sp!,{r4-r11,pc}
197#else
198 ldmia sp!,{r4-r11,lr}
199 tst lr,#1
200 moveq pc,lr @ be binary compatible with V4, yet
201 bx lr @ interoperable with Thumb ISA:-)
202#endif
203.size sha256_block_data_order,.-sha256_block_data_order
204.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
205.align 2
206___
207
208$code =~ s/\`([^\`]*)\`/eval $1/gem;
209$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
210print $code;
211close STDOUT; # enforce flush
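The orr/and/and/orr sequence in BODY_00_15 above computes the majority function through the identity Maj(a,b,c) = ((a|b)&c)|(a&b), which fits in the two temporaries $t0/$t1. A reference Perl statement of that identity (illustrative, not from the module):

sub Maj { my ($a,$b,$c)=@_; return ((($a | $b) & $c) | ($a & $b)) & 0xffffffff; }
# equivalent to the textbook (a&b) ^ (a&c) ^ (b&c)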
diff --git a/src/lib/libcrypto/sha/asm/sha512-586.pl b/src/lib/libcrypto/sha/asm/sha512-586.pl
deleted file mode 100644
index c1d0684e92..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-586.pl
+++ /dev/null
@@ -1,646 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# SHA512 block transform for x86. September 2007.
11#
12# Performance in clock cycles per processed byte (less is better):
13#
14# Pentium PIII P4 AMD K8 Core2
15# gcc 100 75 116 54 66
16# icc 97 77 95 55 57
17# x86 asm 61 56 82 36 40
18# SSE2 asm - - 38 24 20
19# x86_64 asm(*) - - 30 10.0 10.5
20#
21# (*) x86_64 assembler performance is presented for reference
22# purposes.
23#
24# The IALU code-path is optimized for older Pentiums. On vanilla Pentium
25# the performance improvement over compiler generated code reaches ~60%,
26# while on PIII it is ~35%. On newer µ-archs the improvement varies from 15%
27# to 50%, but it's less important as they are expected to execute SSE2
28# code-path, which is commonly ~2-3x faster [than compiler generated
29# code]. SSE2 code-path is as fast as original sha512-sse2.pl, even
30# though it does not use 128-bit operations. The latter means that
31# SSE2-aware kernel is no longer required to execute the code. Another
32# difference is that the new code optimizes the amount of writes, but at
33# the cost of a data cache "footprint" increased by 1/2KB.
34
35$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
36push(@INC,"${dir}","${dir}../../perlasm");
37require "x86asm.pl";
38
39&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
40
41$sse2=0;
42for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
43
44&external_label("OPENSSL_ia32cap_P") if ($sse2);
45
46$Tlo=&DWP(0,"esp"); $Thi=&DWP(4,"esp");
47$Alo=&DWP(8,"esp"); $Ahi=&DWP(8+4,"esp");
48$Blo=&DWP(16,"esp"); $Bhi=&DWP(16+4,"esp");
49$Clo=&DWP(24,"esp"); $Chi=&DWP(24+4,"esp");
50$Dlo=&DWP(32,"esp"); $Dhi=&DWP(32+4,"esp");
51$Elo=&DWP(40,"esp"); $Ehi=&DWP(40+4,"esp");
52$Flo=&DWP(48,"esp"); $Fhi=&DWP(48+4,"esp");
53$Glo=&DWP(56,"esp"); $Ghi=&DWP(56+4,"esp");
54$Hlo=&DWP(64,"esp"); $Hhi=&DWP(64+4,"esp");
55$K512="ebp";
56
57$Asse2=&QWP(0,"esp");
58$Bsse2=&QWP(8,"esp");
59$Csse2=&QWP(16,"esp");
60$Dsse2=&QWP(24,"esp");
61$Esse2=&QWP(32,"esp");
62$Fsse2=&QWP(40,"esp");
63$Gsse2=&QWP(48,"esp");
64$Hsse2=&QWP(56,"esp");
65
66$A="mm0"; # B-D and
67$E="mm4"; # F-H are commonly loaded to respectively mm1-mm3 and
68 # mm5-mm7, but it's done on an on-demand basis...
69
70sub BODY_00_15_sse2 {
71 my $prefetch=shift;
72
73 &movq ("mm5",$Fsse2); # load f
74 &movq ("mm6",$Gsse2); # load g
75 &movq ("mm7",$Hsse2); # load h
76
77 &movq ("mm1",$E); # %mm1 is sliding right
78 &movq ("mm2",$E); # %mm2 is sliding left
79 &psrlq ("mm1",14);
80 &movq ($Esse2,$E); # modulo-scheduled save e
81 &psllq ("mm2",23);
82 &movq ("mm3","mm1"); # %mm3 is T1
83 &psrlq ("mm1",4);
84 &pxor ("mm3","mm2");
85 &psllq ("mm2",23);
86 &pxor ("mm3","mm1");
87 &psrlq ("mm1",23);
88 &pxor ("mm3","mm2");
89 &psllq ("mm2",4);
90 &pxor ("mm3","mm1");
91 &paddq ("mm7",QWP(0,$K512)); # h+=K512[i]
92 &pxor ("mm3","mm2"); # T1=Sigma1_512(e)
93
94 &pxor ("mm5","mm6"); # f^=g
95 &movq ("mm1",$Bsse2); # load b
96 &pand ("mm5",$E); # f&=e
97 &movq ("mm2",$Csse2); # load c
98 &pxor ("mm5","mm6"); # f^=g
99 &movq ($E,$Dsse2); # e = load d
100 &paddq ("mm3","mm5"); # T1+=Ch(e,f,g)
101 &movq (&QWP(0,"esp"),$A); # modulo-scheduled save a
102 &paddq ("mm3","mm7"); # T1+=h
103
104 &movq ("mm5",$A); # %mm5 is sliding right
105 &movq ("mm6",$A); # %mm6 is sliding left
106 &paddq ("mm3",&QWP(8*9,"esp")); # T1+=X[0]
107 &psrlq ("mm5",28);
108 &paddq ($E,"mm3"); # e += T1
109 &psllq ("mm6",25);
110 &movq ("mm7","mm5"); # %mm7 is T2
111 &psrlq ("mm5",6);
112 &pxor ("mm7","mm6");
113 &psllq ("mm6",5);
114 &pxor ("mm7","mm5");
115 &psrlq ("mm5",5);
116 &pxor ("mm7","mm6");
117 &psllq ("mm6",6);
118 &pxor ("mm7","mm5");
119 &sub ("esp",8);
120 &pxor ("mm7","mm6"); # T2=Sigma0_512(a)
121
122 &movq ("mm5",$A); # %mm5=a
123 &por ($A,"mm2"); # a=a|c
124 &movq ("mm6",&QWP(8*(9+16-14),"esp")) if ($prefetch);
125 &pand ("mm5","mm2"); # %mm5=a&c
126 &pand ($A,"mm1"); # a=(a|c)&b
127 &movq ("mm2",&QWP(8*(9+16-1),"esp")) if ($prefetch);
128 &por ("mm5",$A); # %mm5=(a&c)|((a|c)&b)
129 &paddq ("mm7","mm5"); # T2+=Maj(a,b,c)
130 &movq ($A,"mm3"); # a=T1
131
132 &mov (&LB("edx"),&BP(0,$K512));
133 &paddq ($A,"mm7"); # a+=T2
134 &add ($K512,8);
135}
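# A hedged reference sketch (helper names are illustrative and unused by
# the generator): the interleaved psrlq/psllq pairs above accumulate the
# rotation distances 14, 18 and 41, i.e. they compute
#	Sigma1_512(e) = ROTR64(e,14) ^ ROTR64(e,18) ^ ROTR64(e,41)
sub rotr64     { my ($x,$n)=@_; return (($x >> $n) | ($x << (64-$n))) & 0xffffffffffffffff; }
sub Sigma1_512 { my ($x)=@_; return rotr64($x,14) ^ rotr64($x,18) ^ rotr64($x,41); }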
136
137sub BODY_00_15_x86 {
138 #define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
139 # LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
140 # HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
141 &mov ("ecx",$Elo);
142 &mov ("edx",$Ehi);
143 &mov ("esi","ecx");
144
145 &shr ("ecx",9); # lo>>9
146 &mov ("edi","edx");
147 &shr ("edx",9); # hi>>9
148 &mov ("ebx","ecx");
149 &shl ("esi",14); # lo<<14
150 &mov ("eax","edx");
151 &shl ("edi",14); # hi<<14
152 &xor ("ebx","esi");
153
154 &shr ("ecx",14-9); # lo>>14
155 &xor ("eax","edi");
156 &shr ("edx",14-9); # hi>>14
157 &xor ("eax","ecx");
158 &shl ("esi",18-14); # lo<<18
159 &xor ("ebx","edx");
160 &shl ("edi",18-14); # hi<<18
161 &xor ("ebx","esi");
162
163 &shr ("ecx",18-14); # lo>>18
164 &xor ("eax","edi");
165 &shr ("edx",18-14); # hi>>18
166 &xor ("eax","ecx");
167 &shl ("esi",23-18); # lo<<23
168 &xor ("ebx","edx");
169 &shl ("edi",23-18); # hi<<23
170 &xor ("eax","esi");
171 &xor ("ebx","edi"); # T1 = Sigma1(e)
172
173 &mov ("ecx",$Flo);
174 &mov ("edx",$Fhi);
175 &mov ("esi",$Glo);
176 &mov ("edi",$Ghi);
177 &add ("eax",$Hlo);
178 &adc ("ebx",$Hhi); # T1 += h
179 &xor ("ecx","esi");
180 &xor ("edx","edi");
181 &and ("ecx",$Elo);
182 &and ("edx",$Ehi);
183 &add ("eax",&DWP(8*(9+15)+0,"esp"));
184 &adc ("ebx",&DWP(8*(9+15)+4,"esp")); # T1 += X[0]
185 &xor ("ecx","esi");
186 &xor ("edx","edi"); # Ch(e,f,g) = ((f^g)&e)^g
187
188 &mov ("esi",&DWP(0,$K512));
189 &mov ("edi",&DWP(4,$K512)); # K[i]
190 &add ("eax","ecx");
191 &adc ("ebx","edx"); # T1 += Ch(e,f,g)
192 &mov ("ecx",$Dlo);
193 &mov ("edx",$Dhi);
194 &add ("eax","esi");
195 &adc ("ebx","edi"); # T1 += K[i]
196 &mov ($Tlo,"eax");
197 &mov ($Thi,"ebx"); # put T1 away
198 &add ("eax","ecx");
199 &adc ("ebx","edx"); # d += T1
200
201 #define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
202 # LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
203 # HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
204 &mov ("ecx",$Alo);
205 &mov ("edx",$Ahi);
206 &mov ($Dlo,"eax");
207 &mov ($Dhi,"ebx");
208 &mov ("esi","ecx");
209
210 &shr ("ecx",2); # lo>>2
211 &mov ("edi","edx");
212 &shr ("edx",2); # hi>>2
213 &mov ("ebx","ecx");
214 &shl ("esi",4); # lo<<4
215 &mov ("eax","edx");
216 &shl ("edi",4); # hi<<4
217 &xor ("ebx","esi");
218
219 &shr ("ecx",7-2); # lo>>7
220 &xor ("eax","edi");
221 &shr ("edx",7-2); # hi>>7
222 &xor ("ebx","ecx");
223 &shl ("esi",25-4); # lo<<25
224 &xor ("eax","edx");
225 &shl ("edi",25-4); # hi<<25
226 &xor ("eax","esi");
227
228 &shr ("ecx",28-7); # lo>>28
229 &xor ("ebx","edi");
230 &shr ("edx",28-7); # hi>>28
231 &xor ("eax","ecx");
232 &shl ("esi",30-25); # lo<<30
233 &xor ("ebx","edx");
234 &shl ("edi",30-25); # hi<<30
235 &xor ("eax","esi");
236 &xor ("ebx","edi"); # Sigma0(a)
237
238 &mov ("ecx",$Alo);
239 &mov ("edx",$Ahi);
240 &mov ("esi",$Blo);
241 &mov ("edi",$Bhi);
242 &add ("eax",$Tlo);
243 &adc ("ebx",$Thi); # T1 = Sigma0(a)+T1
244 &or ("ecx","esi");
245 &or ("edx","edi");
246 &and ("ecx",$Clo);
247 &and ("edx",$Chi);
248 &and ("esi",$Alo);
249 &and ("edi",$Ahi);
250 &or ("ecx","esi");
251 &or ("edx","edi"); # Maj(a,b,c) = ((a|b)&c)|(a&b)
252
253 &add ("eax","ecx");
254 &adc ("ebx","edx"); # T1 += Maj(a,b,c)
255 &mov ($Tlo,"eax");
256 &mov ($Thi,"ebx");
257
258 &mov (&LB("edx"),&BP(0,$K512)); # pre-fetch LSB of *K
259 &sub ("esp",8);
260 &lea ($K512,&DWP(8,$K512)); # K++
261}
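# A hedged reference model (illustrative, unused) of the LO/HI comments
# above: a 64-bit rotate right by n (0 < n < 32) carried out on the 32-bit
# halves, e.g. for ROTR(x,14) the low word is lo>>14 ^ hi<<18. For
# distances >= 32 the halves swap roles, which is why Sigma1's ROTR 41
# shows up above as hi>>9 ^ lo<<23.
sub rotr64_split {
    my ($hi, $lo, $n) = @_;         # x = hi:lo, 0 < n < 32
    my $rlo = (($lo >> $n) | ($hi << (32-$n))) & 0xffffffff;
    my $rhi = (($hi >> $n) | ($lo << (32-$n))) & 0xffffffff;
    return ($rhi, $rlo);
}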
262
263
264&static_label("K512");
265&function_begin("sha512_block_data_order");
266 &mov ("esi",wparam(0)); # ctx
267 &mov ("edi",wparam(1)); # inp
268 &mov ("eax",wparam(2)); # num
269 &mov ("ebx","esp"); # saved sp
270
271 &picsetup($K512);
272if ($sse2) {
273 &picsymbol("edx", "OPENSSL_ia32cap_P", $K512);
274}
275 &picsymbol($K512, &label("K512"), $K512);
276
277 &sub ("esp",16);
278 &and ("esp",-64);
279
280 &shl ("eax",7);
281 &add ("eax","edi");
282 &mov (&DWP(0,"esp"),"esi"); # ctx
283 &mov (&DWP(4,"esp"),"edi"); # inp
284 &mov (&DWP(8,"esp"),"eax"); # inp+num*128
285 &mov (&DWP(12,"esp"),"ebx"); # saved sp
286
287if ($sse2) {
288 &bt (&DWP(0,"edx"),"\$IA32CAP_BIT0_SSE2");
289 &jnc (&label("loop_x86"));
290
291 # load ctx->h[0-7]
292 &movq ($A,&QWP(0,"esi"));
293 &movq ("mm1",&QWP(8,"esi"));
294 &movq ("mm2",&QWP(16,"esi"));
295 &movq ("mm3",&QWP(24,"esi"));
296 &movq ($E,&QWP(32,"esi"));
297 &movq ("mm5",&QWP(40,"esi"));
298 &movq ("mm6",&QWP(48,"esi"));
299 &movq ("mm7",&QWP(56,"esi"));
300 &sub ("esp",8*10);
301
302&set_label("loop_sse2",16);
303 # &movq ($Asse2,$A);
304 &movq ($Bsse2,"mm1");
305 &movq ($Csse2,"mm2");
306 &movq ($Dsse2,"mm3");
307 # &movq ($Esse2,$E);
308 &movq ($Fsse2,"mm5");
309 &movq ($Gsse2,"mm6");
310 &movq ($Hsse2,"mm7");
311
312 &mov ("ecx",&DWP(0,"edi"));
313 &mov ("edx",&DWP(4,"edi"));
314 &add ("edi",8);
315 &bswap ("ecx");
316 &bswap ("edx");
317 &mov (&DWP(8*9+4,"esp"),"ecx");
318 &mov (&DWP(8*9+0,"esp"),"edx");
319
320&set_label("00_14_sse2",16);
321 &mov ("eax",&DWP(0,"edi"));
322 &mov ("ebx",&DWP(4,"edi"));
323 &add ("edi",8);
324 &bswap ("eax");
325 &bswap ("ebx");
326 &mov (&DWP(8*8+4,"esp"),"eax");
327 &mov (&DWP(8*8+0,"esp"),"ebx");
328
329 &BODY_00_15_sse2();
330
331 &cmp (&LB("edx"),0x35);
332 &jne (&label("00_14_sse2"));
333
334 &BODY_00_15_sse2(1);
335
336&set_label("16_79_sse2",16);
337 #&movq ("mm2",&QWP(8*(9+16-1),"esp")); #prefetched in BODY_00_15
338 #&movq ("mm6",&QWP(8*(9+16-14),"esp"));
339 &movq ("mm1","mm2");
340
341 &psrlq ("mm2",1);
342 &movq ("mm7","mm6");
343 &psrlq ("mm6",6);
344 &movq ("mm3","mm2");
345
346 &psrlq ("mm2",7-1);
347 &movq ("mm5","mm6");
348 &psrlq ("mm6",19-6);
349 &pxor ("mm3","mm2");
350
351 &psrlq ("mm2",8-7);
352 &pxor ("mm5","mm6");
353 &psrlq ("mm6",61-19);
354 &pxor ("mm3","mm2");
355
356 &movq ("mm2",&QWP(8*(9+16),"esp"));
357
358 &psllq ("mm1",56);
359 &pxor ("mm5","mm6");
360 &psllq ("mm7",3);
361 &pxor ("mm3","mm1");
362
363 &paddq ("mm2",&QWP(8*(9+16-9),"esp"));
364
365 &psllq ("mm1",63-56);
366 &pxor ("mm5","mm7");
367 &psllq ("mm7",45-3);
368 &pxor ("mm3","mm1");
369 &pxor ("mm5","mm7");
370
371 &paddq ("mm3","mm5");
372 &paddq ("mm3","mm2");
373 &movq (&QWP(8*9,"esp"),"mm3");
374
375 &BODY_00_15_sse2(1);
376
377 &cmp (&LB("edx"),0x17);
378 &jne (&label("16_79_sse2"));
379
380 # &movq ($A,$Asse2);
381 &movq ("mm1",$Bsse2);
382 &movq ("mm2",$Csse2);
383 &movq ("mm3",$Dsse2);
384 # &movq ($E,$Esse2);
385 &movq ("mm5",$Fsse2);
386 &movq ("mm6",$Gsse2);
387 &movq ("mm7",$Hsse2);
388
389 &paddq ($A,&QWP(0,"esi"));
390 &paddq ("mm1",&QWP(8,"esi"));
391 &paddq ("mm2",&QWP(16,"esi"));
392 &paddq ("mm3",&QWP(24,"esi"));
393 &paddq ($E,&QWP(32,"esi"));
394 &paddq ("mm5",&QWP(40,"esi"));
395 &paddq ("mm6",&QWP(48,"esi"));
396 &paddq ("mm7",&QWP(56,"esi"));
397
398 &movq (&QWP(0,"esi"),$A);
399 &movq (&QWP(8,"esi"),"mm1");
400 &movq (&QWP(16,"esi"),"mm2");
401 &movq (&QWP(24,"esi"),"mm3");
402 &movq (&QWP(32,"esi"),$E);
403 &movq (&QWP(40,"esi"),"mm5");
404 &movq (&QWP(48,"esi"),"mm6");
405 &movq (&QWP(56,"esi"),"mm7");
406
407 &add ("esp",8*80); # destroy frame
408 &sub ($K512,8*80); # rewind K
409
410 &cmp ("edi",&DWP(8*10+8,"esp")); # are we done yet?
411 &jb (&label("loop_sse2"));
412
413 &emms ();
414 &mov ("esp",&DWP(8*10+12,"esp")); # restore sp
415&function_end_A();
416}
417&set_label("loop_x86",16);
418 # copy input block to stack reversing byte and qword order
419 for ($i=0;$i<8;$i++) {
420 &mov ("eax",&DWP($i*16+0,"edi"));
421 &mov ("ebx",&DWP($i*16+4,"edi"));
422 &mov ("ecx",&DWP($i*16+8,"edi"));
423 &mov ("edx",&DWP($i*16+12,"edi"));
424 &bswap ("eax");
425 &bswap ("ebx");
426 &bswap ("ecx");
427 &bswap ("edx");
428 &push ("eax");
429 &push ("ebx");
430 &push ("ecx");
431 &push ("edx");
432 }
433 &add ("edi",128);
434 &sub ("esp",9*8); # place for T,A,B,C,D,E,F,G,H
435 &mov (&DWP(8*(9+16)+4,"esp"),"edi");
436
437 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
438 &lea ("edi",&DWP(8,"esp"));
439 &mov ("ecx",16);
440 &data_word(0xA5F3F689); # rep movsd
441
442&set_label("00_15_x86",16);
443 &BODY_00_15_x86();
444
445 &cmp (&LB("edx"),0x94);
446 &jne (&label("00_15_x86"));
447
448&set_label("16_79_x86",16);
449 #define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
450 # LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
451 # HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
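	# e.g. ROTR64(x,1).lo = lo>>1|hi<<31 and ROTR64(x,1).hi = hi>>1|lo<<31,
	# while (x>>7) feeds hi<<25 into the LO half only; the code below builds
	# both 32-bit halves with staggered shifts so ecx/edx can be reused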
452 &mov ("ecx",&DWP(8*(9+15+16-1)+0,"esp"));
453 &mov ("edx",&DWP(8*(9+15+16-1)+4,"esp"));
454 &mov ("esi","ecx");
455
456 &shr ("ecx",1); # lo>>1
457 &mov ("edi","edx");
458 &shr ("edx",1); # hi>>1
459 &mov ("eax","ecx");
460 &shl ("esi",24); # lo<<24
461 &mov ("ebx","edx");
462 &shl ("edi",24); # hi<<24
463 &xor ("ebx","esi");
464
465 &shr ("ecx",7-1); # lo>>7
466 &xor ("eax","edi");
467 &shr ("edx",7-1); # hi>>7
468 &xor ("eax","ecx");
469 &shl ("esi",31-24); # lo<<31
470 &xor ("ebx","edx");
471 &shl ("edi",25-24); # hi<<25
472 &xor ("ebx","esi");
473
474 &shr ("ecx",8-7); # lo>>8
475 &xor ("eax","edi");
476 &shr ("edx",8-7); # hi>>8
477 &xor ("eax","ecx");
478 &shl ("edi",31-25); # hi<<31
479 &xor ("ebx","edx");
480 &xor ("eax","edi"); # T1 = sigma0(X[-15])
481
482 &mov (&DWP(0,"esp"),"eax");
483 &mov (&DWP(4,"esp"),"ebx"); # put T1 away
484
485 #define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
486 # LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
487 # HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
488 &mov ("ecx",&DWP(8*(9+15+16-14)+0,"esp"));
489 &mov ("edx",&DWP(8*(9+15+16-14)+4,"esp"));
490 &mov ("esi","ecx");
491
492 &shr ("ecx",6); # lo>>6
493 &mov ("edi","edx");
494 &shr ("edx",6); # hi>>6
495 &mov ("eax","ecx");
496 &shl ("esi",3); # lo<<3
497 &mov ("ebx","edx");
498 &shl ("edi",3); # hi<<3
499 &xor ("eax","esi");
500
501 &shr ("ecx",19-6); # lo>>19
502 &xor ("ebx","edi");
503 &shr ("edx",19-6); # hi>>19
504 &xor ("eax","ecx");
505 &shl ("esi",13-3); # lo<<13
506 &xor ("ebx","edx");
507 &shl ("edi",13-3); # hi<<13
508 &xor ("ebx","esi");
509
510 &shr ("ecx",29-19); # lo>>29
511 &xor ("eax","edi");
512 &shr ("edx",29-19); # hi>>29
513 &xor ("ebx","ecx");
514 &shl ("edi",26-13); # hi<<26
515 &xor ("eax","edx");
516 &xor ("eax","edi"); # sigma1(X[-2])
517
518 &mov ("ecx",&DWP(8*(9+15+16)+0,"esp"));
519 &mov ("edx",&DWP(8*(9+15+16)+4,"esp"));
520 &add ("eax",&DWP(0,"esp"));
521 &adc ("ebx",&DWP(4,"esp")); # T1 = sigma1(X[-2])+T1
522 &mov ("esi",&DWP(8*(9+15+16-9)+0,"esp"));
523 &mov ("edi",&DWP(8*(9+15+16-9)+4,"esp"));
524 &add ("eax","ecx");
525 &adc ("ebx","edx"); # T1 += X[-16]
526 &add ("eax","esi");
527 &adc ("ebx","edi"); # T1 += X[-7]
528 &mov (&DWP(8*(9+15)+0,"esp"),"eax");
529 &mov (&DWP(8*(9+15)+4,"esp"),"ebx"); # save X[0]
530
531 &BODY_00_15_x86();
532
533 &cmp (&LB("edx"),0x17);
534 &jne (&label("16_79_x86"));
535
536 &mov ("esi",&DWP(8*(9+16+80)+0,"esp"));# ctx
537 &mov ("edi",&DWP(8*(9+16+80)+4,"esp"));# inp
538 for($i=0;$i<4;$i++) {
539 &mov ("eax",&DWP($i*16+0,"esi"));
540 &mov ("ebx",&DWP($i*16+4,"esi"));
541 &mov ("ecx",&DWP($i*16+8,"esi"));
542 &mov ("edx",&DWP($i*16+12,"esi"));
543 &add ("eax",&DWP(8+($i*16)+0,"esp"));
544 &adc ("ebx",&DWP(8+($i*16)+4,"esp"));
545 &mov (&DWP($i*16+0,"esi"),"eax");
546 &mov (&DWP($i*16+4,"esi"),"ebx");
547 &add ("ecx",&DWP(8+($i*16)+8,"esp"));
548 &adc ("edx",&DWP(8+($i*16)+12,"esp"));
549 &mov (&DWP($i*16+8,"esi"),"ecx");
550 &mov (&DWP($i*16+12,"esi"),"edx");
551 }
552 &add ("esp",8*(9+16+80)); # destroy frame
553 &sub ($K512,8*80); # rewind K
554
555 &cmp ("edi",&DWP(8,"esp")); # are we done yet?
556 &jb (&label("loop_x86"));
557
558 &mov ("esp",&DWP(12,"esp")); # restore sp
559&function_end_A();
560&function_end_B("sha512_block_data_order");
561
562 &rodataseg();
563&set_label("K512",64);
564 &data_word(0xd728ae22,0x428a2f98); # u64
565 &data_word(0x23ef65cd,0x71374491); # u64
566 &data_word(0xec4d3b2f,0xb5c0fbcf); # u64
567 &data_word(0x8189dbbc,0xe9b5dba5); # u64
568 &data_word(0xf348b538,0x3956c25b); # u64
569 &data_word(0xb605d019,0x59f111f1); # u64
570 &data_word(0xaf194f9b,0x923f82a4); # u64
571 &data_word(0xda6d8118,0xab1c5ed5); # u64
572 &data_word(0xa3030242,0xd807aa98); # u64
573 &data_word(0x45706fbe,0x12835b01); # u64
574 &data_word(0x4ee4b28c,0x243185be); # u64
575 &data_word(0xd5ffb4e2,0x550c7dc3); # u64
576 &data_word(0xf27b896f,0x72be5d74); # u64
577 &data_word(0x3b1696b1,0x80deb1fe); # u64
578 &data_word(0x25c71235,0x9bdc06a7); # u64
579 &data_word(0xcf692694,0xc19bf174); # u64
580 &data_word(0x9ef14ad2,0xe49b69c1); # u64
581 &data_word(0x384f25e3,0xefbe4786); # u64
582 &data_word(0x8b8cd5b5,0x0fc19dc6); # u64
583 &data_word(0x77ac9c65,0x240ca1cc); # u64
584 &data_word(0x592b0275,0x2de92c6f); # u64
585 &data_word(0x6ea6e483,0x4a7484aa); # u64
586 &data_word(0xbd41fbd4,0x5cb0a9dc); # u64
587 &data_word(0x831153b5,0x76f988da); # u64
588 &data_word(0xee66dfab,0x983e5152); # u64
589 &data_word(0x2db43210,0xa831c66d); # u64
590 &data_word(0x98fb213f,0xb00327c8); # u64
591 &data_word(0xbeef0ee4,0xbf597fc7); # u64
592 &data_word(0x3da88fc2,0xc6e00bf3); # u64
593 &data_word(0x930aa725,0xd5a79147); # u64
594 &data_word(0xe003826f,0x06ca6351); # u64
595 &data_word(0x0a0e6e70,0x14292967); # u64
596 &data_word(0x46d22ffc,0x27b70a85); # u64
597 &data_word(0x5c26c926,0x2e1b2138); # u64
598 &data_word(0x5ac42aed,0x4d2c6dfc); # u64
599 &data_word(0x9d95b3df,0x53380d13); # u64
600 &data_word(0x8baf63de,0x650a7354); # u64
601 &data_word(0x3c77b2a8,0x766a0abb); # u64
602 &data_word(0x47edaee6,0x81c2c92e); # u64
603 &data_word(0x1482353b,0x92722c85); # u64
604 &data_word(0x4cf10364,0xa2bfe8a1); # u64
605 &data_word(0xbc423001,0xa81a664b); # u64
606 &data_word(0xd0f89791,0xc24b8b70); # u64
607 &data_word(0x0654be30,0xc76c51a3); # u64
608 &data_word(0xd6ef5218,0xd192e819); # u64
609 &data_word(0x5565a910,0xd6990624); # u64
610 &data_word(0x5771202a,0xf40e3585); # u64
611 &data_word(0x32bbd1b8,0x106aa070); # u64
612 &data_word(0xb8d2d0c8,0x19a4c116); # u64
613 &data_word(0x5141ab53,0x1e376c08); # u64
614 &data_word(0xdf8eeb99,0x2748774c); # u64
615 &data_word(0xe19b48a8,0x34b0bcb5); # u64
616 &data_word(0xc5c95a63,0x391c0cb3); # u64
617 &data_word(0xe3418acb,0x4ed8aa4a); # u64
618 &data_word(0x7763e373,0x5b9cca4f); # u64
619 &data_word(0xd6b2b8a3,0x682e6ff3); # u64
620 &data_word(0x5defb2fc,0x748f82ee); # u64
621 &data_word(0x43172f60,0x78a5636f); # u64
622 &data_word(0xa1f0ab72,0x84c87814); # u64
623 &data_word(0x1a6439ec,0x8cc70208); # u64
624 &data_word(0x23631e28,0x90befffa); # u64
625 &data_word(0xde82bde9,0xa4506ceb); # u64
626 &data_word(0xb2c67915,0xbef9a3f7); # u64
627 &data_word(0xe372532b,0xc67178f2); # u64
628 &data_word(0xea26619c,0xca273ece); # u64
629 &data_word(0x21c0c207,0xd186b8c7); # u64
630 &data_word(0xcde0eb1e,0xeada7dd6); # u64
631 &data_word(0xee6ed178,0xf57d4f7f); # u64
632 &data_word(0x72176fba,0x06f067aa); # u64
633 &data_word(0xa2c898a6,0x0a637dc5); # u64
634 &data_word(0xbef90dae,0x113f9804); # u64
635 &data_word(0x131c471b,0x1b710b35); # u64
636 &data_word(0x23047d84,0x28db77f5); # u64
637 &data_word(0x40c72493,0x32caab7b); # u64
638 &data_word(0x15c9bebc,0x3c9ebe0a); # u64
639 &data_word(0x9c100d4c,0x431d67c4); # u64
640 &data_word(0xcb3e42b6,0x4cc5d4be); # u64
641 &data_word(0xfc657e2a,0x597f299c); # u64
642 &data_word(0x3ad6faec,0x5fcb6fab); # u64
643 &data_word(0x4a475817,0x6c44198c); # u64
644 &previous();
645
646&asm_finish();
diff --git a/src/lib/libcrypto/sha/asm/sha512-armv4.pl b/src/lib/libcrypto/sha/asm/sha512-armv4.pl
deleted file mode 100644
index a247a00c2b..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-armv4.pl
+++ /dev/null
@@ -1,582 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA512 block procedure for ARMv4. September 2007.
11
12# This code is ~4.5 (four and a half) times faster than code generated
13# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
14# Xscale PXA250 core].
15#
16# July 2010.
17#
18# Rescheduling for dual-issue pipeline resulted in 6% improvement on
19# Cortex A8 core and ~40 cycles per processed byte.
20
21# February 2011.
22#
23# Profiler-assisted and platform-specific optimization resulted in 7%
24# improvement on Cortex A8 core and ~38 cycles per byte.
25
26# March 2011.
27#
28# Add NEON implementation. On Cortex A8 it was measured to process
29# one byte in 25.5 cycles or 47% faster than integer-only code.
30
31# Byte order [in]dependence. =========================================
32#
33# Originally caller was expected to maintain specific *dword* order in
34# h[0-7], namely with most significant dword at *lower* address, which
35# was reflected in the two parameters below as 0 and 4. Now caller is
36# expected to maintain native byte order for whole 64-bit values.
37$hi="HI";
38$lo="LO";
39# ====================================================================
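# HI and LO resolve at assembly time via the __ARMEL__ conditional in the
# preamble below, so [reg,#off+$lo] always addresses the 32 low bits of a
# 64-bit word regardless of target endianness.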
40
41while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
42open STDOUT,">$output";
43
44$ctx="r0"; # parameter block
45$inp="r1";
46$len="r2";
47
48$Tlo="r3";
49$Thi="r4";
50$Alo="r5";
51$Ahi="r6";
52$Elo="r7";
53$Ehi="r8";
54$t0="r9";
55$t1="r10";
56$t2="r11";
57$t3="r12";
58############ r13 is stack pointer
59$Ktbl="r14";
60############ r15 is program counter
61
62$Aoff=8*0;
63$Boff=8*1;
64$Coff=8*2;
65$Doff=8*3;
66$Eoff=8*4;
67$Foff=8*5;
68$Goff=8*6;
69$Hoff=8*7;
70$Xoff=8*8;
71
72sub BODY_00_15() {
73my $magic = shift;
74$code.=<<___;
75 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
76 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
77 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
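	@ e.g. ROTR64(e,14).lo = lo>>14|hi<<18 and .hi = hi>>14|lo<<18, while
	@ ROTR64(e,41) = ROTR64(e,32+9), i.e. halves swapped: hi>>9|lo<<23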
78 mov $t0,$Elo,lsr#14
79 str $Tlo,[sp,#$Xoff+0]
80 mov $t1,$Ehi,lsr#14
81 str $Thi,[sp,#$Xoff+4]
82 eor $t0,$t0,$Ehi,lsl#18
83 ldr $t2,[sp,#$Hoff+0] @ h.lo
84 eor $t1,$t1,$Elo,lsl#18
85 ldr $t3,[sp,#$Hoff+4] @ h.hi
86 eor $t0,$t0,$Elo,lsr#18
87 eor $t1,$t1,$Ehi,lsr#18
88 eor $t0,$t0,$Ehi,lsl#14
89 eor $t1,$t1,$Elo,lsl#14
90 eor $t0,$t0,$Ehi,lsr#9
91 eor $t1,$t1,$Elo,lsr#9
92 eor $t0,$t0,$Elo,lsl#23
93 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
94 adds $Tlo,$Tlo,$t0
95 ldr $t0,[sp,#$Foff+0] @ f.lo
96 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
97 ldr $t1,[sp,#$Foff+4] @ f.hi
98 adds $Tlo,$Tlo,$t2
99 ldr $t2,[sp,#$Goff+0] @ g.lo
100 adc $Thi,$Thi,$t3 @ T += h
101 ldr $t3,[sp,#$Goff+4] @ g.hi
102
103 eor $t0,$t0,$t2
104 str $Elo,[sp,#$Eoff+0]
105 eor $t1,$t1,$t3
106 str $Ehi,[sp,#$Eoff+4]
107 and $t0,$t0,$Elo
108 str $Alo,[sp,#$Aoff+0]
109 and $t1,$t1,$Ehi
110 str $Ahi,[sp,#$Aoff+4]
111 eor $t0,$t0,$t2
112 ldr $t2,[$Ktbl,#$lo] @ K[i].lo
113 eor $t1,$t1,$t3 @ Ch(e,f,g)
114 ldr $t3,[$Ktbl,#$hi] @ K[i].hi
115
116 adds $Tlo,$Tlo,$t0
117 ldr $Elo,[sp,#$Doff+0] @ d.lo
118 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
119 ldr $Ehi,[sp,#$Doff+4] @ d.hi
120 adds $Tlo,$Tlo,$t2
121 and $t0,$t2,#0xff
122 adc $Thi,$Thi,$t3 @ T += K[i]
123 adds $Elo,$Elo,$Tlo
124 ldr $t2,[sp,#$Boff+0] @ b.lo
125 adc $Ehi,$Ehi,$Thi @ d += T
126 teq $t0,#$magic
127
128 ldr $t3,[sp,#$Coff+0] @ c.lo
129 orreq $Ktbl,$Ktbl,#1
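	@ the low byte of K[i] doubles as the round counter: once it matches
	@ the magic value, bit 0 of the K pointer is set to flag end of loop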
130 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
131 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
132 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
133 mov $t0,$Alo,lsr#28
134 mov $t1,$Ahi,lsr#28
135 eor $t0,$t0,$Ahi,lsl#4
136 eor $t1,$t1,$Alo,lsl#4
137 eor $t0,$t0,$Ahi,lsr#2
138 eor $t1,$t1,$Alo,lsr#2
139 eor $t0,$t0,$Alo,lsl#30
140 eor $t1,$t1,$Ahi,lsl#30
141 eor $t0,$t0,$Ahi,lsr#7
142 eor $t1,$t1,$Alo,lsr#7
143 eor $t0,$t0,$Alo,lsl#25
144 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
145 adds $Tlo,$Tlo,$t0
146 and $t0,$Alo,$t2
147 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
148
149 ldr $t1,[sp,#$Boff+4] @ b.hi
150 orr $Alo,$Alo,$t2
151 ldr $t2,[sp,#$Coff+4] @ c.hi
152 and $Alo,$Alo,$t3
153 and $t3,$Ahi,$t1
154 orr $Ahi,$Ahi,$t1
155 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
156 and $Ahi,$Ahi,$t2
157 adds $Alo,$Alo,$Tlo
158 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
159 sub sp,sp,#8
160 adc $Ahi,$Ahi,$Thi @ h += T
161 tst $Ktbl,#1
162 add $Ktbl,$Ktbl,#8
163___
164}
165$code=<<___;
166#include "arm_arch.h"
167#ifdef __ARMEL__
168# define LO 0
169# define HI 4
170# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
171#else
172# define HI 0
173# define LO 4
174# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
175#endif
176
177.text
178.code 32
179.type K512,%object
180.align 5
181K512:
182WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
183WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
184WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
185WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
186WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
187WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
188WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
189WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
190WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
191WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
192WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
193WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
194WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
195WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
196WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
197WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
198WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
199WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
200WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
201WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
202WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
203WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
204WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
205WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
206WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
207WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
208WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
209WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
210WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
211WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
212WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
213WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
214WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
215WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
216WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
217WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
218WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
219WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
220WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
221WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
222.size K512,.-K512
223.LOPENSSL_armcap:
224.word OPENSSL_armcap_P-sha512_block_data_order
225.skip 32-4
226
227.global sha512_block_data_order
228.type sha512_block_data_order,%function
229sha512_block_data_order:
230 sub r3,pc,#8 @ sha512_block_data_order
231 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
232#if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
233 ldr r12,.LOPENSSL_armcap
234 ldr r12,[r3,r12] @ OPENSSL_armcap_P
235 tst r12,#1
236 bne .LNEON
237#endif
238 stmdb sp!,{r4-r12,lr}
239 sub $Ktbl,r3,#672 @ K512
240 sub sp,sp,#9*8
241
242 ldr $Elo,[$ctx,#$Eoff+$lo]
243 ldr $Ehi,[$ctx,#$Eoff+$hi]
244 ldr $t0, [$ctx,#$Goff+$lo]
245 ldr $t1, [$ctx,#$Goff+$hi]
246 ldr $t2, [$ctx,#$Hoff+$lo]
247 ldr $t3, [$ctx,#$Hoff+$hi]
248.Loop:
249 str $t0, [sp,#$Goff+0]
250 str $t1, [sp,#$Goff+4]
251 str $t2, [sp,#$Hoff+0]
252 str $t3, [sp,#$Hoff+4]
253 ldr $Alo,[$ctx,#$Aoff+$lo]
254 ldr $Ahi,[$ctx,#$Aoff+$hi]
255 ldr $Tlo,[$ctx,#$Boff+$lo]
256 ldr $Thi,[$ctx,#$Boff+$hi]
257 ldr $t0, [$ctx,#$Coff+$lo]
258 ldr $t1, [$ctx,#$Coff+$hi]
259 ldr $t2, [$ctx,#$Doff+$lo]
260 ldr $t3, [$ctx,#$Doff+$hi]
261 str $Tlo,[sp,#$Boff+0]
262 str $Thi,[sp,#$Boff+4]
263 str $t0, [sp,#$Coff+0]
264 str $t1, [sp,#$Coff+4]
265 str $t2, [sp,#$Doff+0]
266 str $t3, [sp,#$Doff+4]
267 ldr $Tlo,[$ctx,#$Foff+$lo]
268 ldr $Thi,[$ctx,#$Foff+$hi]
269 str $Tlo,[sp,#$Foff+0]
270 str $Thi,[sp,#$Foff+4]
271
272.L00_15:
273#if __ARM_ARCH__<7 || defined(__STRICT_ALIGNMENT)
274 ldrb $Tlo,[$inp,#7]
275 ldrb $t0, [$inp,#6]
276 ldrb $t1, [$inp,#5]
277 ldrb $t2, [$inp,#4]
278 ldrb $Thi,[$inp,#3]
279 ldrb $t3, [$inp,#2]
280 orr $Tlo,$Tlo,$t0,lsl#8
281 ldrb $t0, [$inp,#1]
282 orr $Tlo,$Tlo,$t1,lsl#16
283 ldrb $t1, [$inp],#8
284 orr $Tlo,$Tlo,$t2,lsl#24
285 orr $Thi,$Thi,$t3,lsl#8
286 orr $Thi,$Thi,$t0,lsl#16
287 orr $Thi,$Thi,$t1,lsl#24
288#else
289 ldr $Tlo,[$inp,#4]
290 ldr $Thi,[$inp],#8
291#ifdef __ARMEL__
292 rev $Tlo,$Tlo
293 rev $Thi,$Thi
294#endif
295#endif
296___
297 &BODY_00_15(0x94);
298$code.=<<___;
299 tst $Ktbl,#1
300 beq .L00_15
301 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
302 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
303 bic $Ktbl,$Ktbl,#1
304.L16_79:
305 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
306 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
307 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
308 mov $Tlo,$t0,lsr#1
309 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
310 mov $Thi,$t1,lsr#1
311 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
312 eor $Tlo,$Tlo,$t1,lsl#31
313 eor $Thi,$Thi,$t0,lsl#31
314 eor $Tlo,$Tlo,$t0,lsr#8
315 eor $Thi,$Thi,$t1,lsr#8
316 eor $Tlo,$Tlo,$t1,lsl#24
317 eor $Thi,$Thi,$t0,lsl#24
318 eor $Tlo,$Tlo,$t0,lsr#7
319 eor $Thi,$Thi,$t1,lsr#7
320 eor $Tlo,$Tlo,$t1,lsl#25
321
322 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
323 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
324 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
325 mov $t0,$t2,lsr#19
326 mov $t1,$t3,lsr#19
327 eor $t0,$t0,$t3,lsl#13
328 eor $t1,$t1,$t2,lsl#13
329 eor $t0,$t0,$t3,lsr#29
330 eor $t1,$t1,$t2,lsr#29
331 eor $t0,$t0,$t2,lsl#3
332 eor $t1,$t1,$t3,lsl#3
333 eor $t0,$t0,$t2,lsr#6
334 eor $t1,$t1,$t3,lsr#6
335 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
336 eor $t0,$t0,$t3,lsl#26
337
338 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
339 adds $Tlo,$Tlo,$t0
340 ldr $t0,[sp,#`$Xoff+8*16`+0]
341 adc $Thi,$Thi,$t1
342
343 ldr $t1,[sp,#`$Xoff+8*16`+4]
344 adds $Tlo,$Tlo,$t2
345 adc $Thi,$Thi,$t3
346 adds $Tlo,$Tlo,$t0
347 adc $Thi,$Thi,$t1
348___
349 &BODY_00_15(0x17);
350$code.=<<___;
351 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
352 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
353 beq .L16_79
354 bic $Ktbl,$Ktbl,#1
355
356 ldr $Tlo,[sp,#$Boff+0]
357 ldr $Thi,[sp,#$Boff+4]
358 ldr $t0, [$ctx,#$Aoff+$lo]
359 ldr $t1, [$ctx,#$Aoff+$hi]
360 ldr $t2, [$ctx,#$Boff+$lo]
361 ldr $t3, [$ctx,#$Boff+$hi]
362 adds $t0,$Alo,$t0
363 str $t0, [$ctx,#$Aoff+$lo]
364 adc $t1,$Ahi,$t1
365 str $t1, [$ctx,#$Aoff+$hi]
366 adds $t2,$Tlo,$t2
367 str $t2, [$ctx,#$Boff+$lo]
368 adc $t3,$Thi,$t3
369 str $t3, [$ctx,#$Boff+$hi]
370
371 ldr $Alo,[sp,#$Coff+0]
372 ldr $Ahi,[sp,#$Coff+4]
373 ldr $Tlo,[sp,#$Doff+0]
374 ldr $Thi,[sp,#$Doff+4]
375 ldr $t0, [$ctx,#$Coff+$lo]
376 ldr $t1, [$ctx,#$Coff+$hi]
377 ldr $t2, [$ctx,#$Doff+$lo]
378 ldr $t3, [$ctx,#$Doff+$hi]
379 adds $t0,$Alo,$t0
380 str $t0, [$ctx,#$Coff+$lo]
381 adc $t1,$Ahi,$t1
382 str $t1, [$ctx,#$Coff+$hi]
383 adds $t2,$Tlo,$t2
384 str $t2, [$ctx,#$Doff+$lo]
385 adc $t3,$Thi,$t3
386 str $t3, [$ctx,#$Doff+$hi]
387
388 ldr $Tlo,[sp,#$Foff+0]
389 ldr $Thi,[sp,#$Foff+4]
390 ldr $t0, [$ctx,#$Eoff+$lo]
391 ldr $t1, [$ctx,#$Eoff+$hi]
392 ldr $t2, [$ctx,#$Foff+$lo]
393 ldr $t3, [$ctx,#$Foff+$hi]
394 adds $Elo,$Elo,$t0
395 str $Elo,[$ctx,#$Eoff+$lo]
396 adc $Ehi,$Ehi,$t1
397 str $Ehi,[$ctx,#$Eoff+$hi]
398 adds $t2,$Tlo,$t2
399 str $t2, [$ctx,#$Foff+$lo]
400 adc $t3,$Thi,$t3
401 str $t3, [$ctx,#$Foff+$hi]
402
403 ldr $Alo,[sp,#$Goff+0]
404 ldr $Ahi,[sp,#$Goff+4]
405 ldr $Tlo,[sp,#$Hoff+0]
406 ldr $Thi,[sp,#$Hoff+4]
407 ldr $t0, [$ctx,#$Goff+$lo]
408 ldr $t1, [$ctx,#$Goff+$hi]
409 ldr $t2, [$ctx,#$Hoff+$lo]
410 ldr $t3, [$ctx,#$Hoff+$hi]
411 adds $t0,$Alo,$t0
412 str $t0, [$ctx,#$Goff+$lo]
413 adc $t1,$Ahi,$t1
414 str $t1, [$ctx,#$Goff+$hi]
415 adds $t2,$Tlo,$t2
416 str $t2, [$ctx,#$Hoff+$lo]
417 adc $t3,$Thi,$t3
418 str $t3, [$ctx,#$Hoff+$hi]
419
420 add sp,sp,#640
421 sub $Ktbl,$Ktbl,#640
422
423 teq $inp,$len
424 bne .Loop
425
426 add sp,sp,#8*9 @ destroy frame
427#if __ARM_ARCH__>=5
428 ldmia sp!,{r4-r12,pc}
429#else
430 ldmia sp!,{r4-r12,lr}
431 tst lr,#1
432 moveq pc,lr @ be binary compatible with V4, yet
433 bx lr @ interoperable with Thumb ISA:-)
434#endif
435___
436
437{
438my @Sigma0=(28,34,39);
439my @Sigma1=(14,18,41);
440my @sigma0=(1, 8, 7);
441my @sigma1=(19,61,6);
442
443my $Ktbl="r3";
444my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
445
446my @X=map("d$_",(0..15));
447my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
448
449sub NEON_00_15() {
450my $i=shift;
451my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
452my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
453
454$code.=<<___ if ($i<16 || $i&1);
455 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
456#if $i<16
457 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
458#endif
459 vshr.u64 $t1,$e,#@Sigma1[1]
460 vshr.u64 $t2,$e,#@Sigma1[2]
461___
462$code.=<<___;
463 vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
464 vsli.64 $t0,$e,#`64-@Sigma1[0]`
465 vsli.64 $t1,$e,#`64-@Sigma1[1]`
466 vsli.64 $t2,$e,#`64-@Sigma1[2]`
467#if $i<16 && defined(__ARMEL__)
468 vrev64.8 @X[$i],@X[$i]
469#endif
470 vadd.i64 $T1,$K,$h
471 veor $Ch,$f,$g
472 veor $t0,$t1
473 vand $Ch,$e
474 veor $t0,$t2 @ Sigma1(e)
475 veor $Ch,$g @ Ch(e,f,g)
476 vadd.i64 $T1,$t0
477 vshr.u64 $t0,$a,#@Sigma0[0]
478 vadd.i64 $T1,$Ch
479 vshr.u64 $t1,$a,#@Sigma0[1]
480 vshr.u64 $t2,$a,#@Sigma0[2]
481 vsli.64 $t0,$a,#`64-@Sigma0[0]`
482 vsli.64 $t1,$a,#`64-@Sigma0[1]`
483 vsli.64 $t2,$a,#`64-@Sigma0[2]`
484 vadd.i64 $T1,@X[$i%16]
485 vorr $Maj,$a,$c
486 vand $Ch,$a,$c
487 veor $h,$t0,$t1
488 vand $Maj,$b
489 veor $h,$t2 @ Sigma0(a)
490 vorr $Maj,$Ch @ Maj(a,b,c)
491 vadd.i64 $h,$T1
492 vadd.i64 $d,$T1
493 vadd.i64 $h,$Maj
494___
495}
496
497sub NEON_16_79() {
498my $i=shift;
499
500if ($i&1) { &NEON_00_15($i,@_); return; }
501
502# 2x-vectorized, therefore runs every 2nd round
503my @X=map("q$_",(0..7)); # view @X as 128-bit vector
504my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
505my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
506my $e=@_[4]; # $e from NEON_00_15
507$i /= 2;
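# the schedule update operates on q (128-bit) registers, so each even-round
# call produces two message words at once, and the first three Sigma1 shifts
# of the following NEON_00_15 round (the "from NEON_00_15" lines) are
# interleaved to fill dual-issue slots.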
508$code.=<<___;
509 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
510 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
511 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
512 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
513 vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
514 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
515 veor $s1,$t0
516 vshr.u64 $t0,$s0,#@sigma0[0]
517 veor $s1,$t1 @ sigma1(X[i+14])
518 vshr.u64 $t1,$s0,#@sigma0[1]
519 vadd.i64 @X[$i%8],$s1
520 vshr.u64 $s1,$s0,#@sigma0[2]
521 vsli.64 $t0,$s0,#`64-@sigma0[0]`
522 vsli.64 $t1,$s0,#`64-@sigma0[1]`
523 vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
524 veor $s1,$t0
525 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
526 vadd.i64 @X[$i%8],$s0
527 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
528 veor $s1,$t1 @ sigma0(X[i+1])
529 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
530 vadd.i64 @X[$i%8],$s1
531___
532 &NEON_00_15(2*$i,@_);
533}
534
535$code.=<<___;
536#if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
537.fpu neon
538
539.align 4
540.LNEON:
541 dmb @ errata #451034 on early Cortex A8
542 vstmdb sp!,{d8-d15} @ ABI specification says so
543 sub $Ktbl,r3,#672 @ K512
544 vldmia $ctx,{$A-$H} @ load context
545.Loop_neon:
546___
547for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
548$code.=<<___;
549 mov $cnt,#4
550.L16_79_neon:
551 subs $cnt,#1
552___
553for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
554$code.=<<___;
555 bne .L16_79_neon
556
557 vldmia $ctx,{d24-d31} @ load context to temp
558 vadd.i64 q8,q12 @ vectorized accumulate
559 vadd.i64 q9,q13
560 vadd.i64 q10,q14
561 vadd.i64 q11,q15
562 vstmia $ctx,{$A-$H} @ save context
563 teq $inp,$len
564 sub $Ktbl,#640 @ rewind K512
565 bne .Loop_neon
566
567 vldmia sp!,{d8-d15} @ epilogue
568 bx lr
569#endif
570___
571}
572$code.=<<___;
573.size sha512_block_data_order,.-sha512_block_data_order
574.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
575.align 2
576.comm OPENSSL_armcap_P,4,4
577___
578
579$code =~ s/\`([^\`]*)\`/eval $1/gem;
580$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
581print $code;
582close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/sha/asm/sha512-mips.pl b/src/lib/libcrypto/sha/asm/sha512-mips.pl
deleted file mode 100644
index 495a000695..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-mips.pl
+++ /dev/null
@@ -1,457 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA2 block procedures for MIPS.
11
12# October 2010.
13#
14# SHA256 performance improvement on MIPS R5000 CPU is ~27% over gcc-
15# generated code in o32 build and ~55% in n32/64 build. SHA512 [which
16# for now can only be compiled for MIPS64 ISA] improvement is modest
17# ~17%, but it comes for free, because it's the same instruction sequence.
18# Improvement coefficients are for aligned input.
19
20######################################################################
21# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
22# widely used. Then there is a new contender: NUBI. It appears that if
23# one picks the latter, it's possible to arrange code in an ABI-neutral
24# manner. Therefore let's stick to the NUBI register layout:
25#
26($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
27($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
28($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
29($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
30#
31# The return value is placed in $a0. The following coding rules facilitate
32# interoperability:
33#
34# - never ever touch $tp, "thread pointer", former $gp [o32 can be
35# excluded from the rule, because it's specified volatile];
36# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
37# old code];
38# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
39#
40# For reference here is register layout for N32/64 MIPS ABIs:
41#
42# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
43# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
44# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
45# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
46# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
47#
48$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
49
50if ($flavour =~ /64/i) {
51 $LA="dla";
52} else {
53 $LA="la";
54}
55
56if ($flavour =~ /64|n32/i) {
57 $PTR_ADD="dadd"; # incidentally works even on n32
58 $PTR_SUB="dsub"; # incidentally works even on n32
59 $REG_S="sd";
60 $REG_L="ld";
61 $PTR_SLL="dsll"; # incidentally works even on n32
62 $SZREG=8;
63} else {
64 $PTR_ADD="add";
65 $PTR_SUB="sub";
66 $REG_S="sw";
67 $REG_L="lw";
68 $PTR_SLL="sll";
69 $SZREG=4;
70}
71$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
72#
73# <appro@openssl.org>
74#
75######################################################################
76
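# endianness probe via the C preprocessor: little-endian MIPS compilers
# predefine MIPSEL, so the token expands away and the literal string
# survives preprocessing only on big-endian targets (assuming the compiler
# does predefine plain MIPSEL, not just __MIPSEL__).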
77$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
78
79for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
80open STDOUT,">$output";
81
82if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); }
83
84if ($output =~ /512/) {
85 $label="512";
86 $SZ=8;
87 $LD="ld"; # load from memory
88 $ST="sd"; # store to memory
89 $SLL="dsll"; # shift left logical
90 $SRL="dsrl"; # shift right logical
91 $ADDU="daddu";
92 @Sigma0=(28,34,39);
93 @Sigma1=(14,18,41);
94 @sigma0=( 7, 1, 8); # right shift first
95 @sigma1=( 6,19,61); # right shift first
96 $lastK=0x817;
97 $rounds=80;
98} else {
99 $label="256";
100 $SZ=4;
101 $LD="lw"; # load from memory
102 $ST="sw"; # store to memory
103 $SLL="sll"; # shift left logical
104 $SRL="srl"; # shift right logical
105 $ADDU="addu";
106 @Sigma0=( 2,13,22);
107 @Sigma1=( 6,11,25);
108 @sigma0=( 3, 7,18); # right shift first
109 @sigma1=(10,17,19); # right shift first
110 $lastK=0x8f2;
111 $rounds=64;
112}
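# $lastK is the low 12 bits of the final K[] entry (...4a475817 for SHA512,
# ...c67178f2 for SHA256); the main loop masks the current K word with 0xfff
# and compares it against $lastK to detect the last round without keeping a
# separate loop counter.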
113
114$MSB = $big_endian ? 0 : ($SZ-1);
115$LSB = ($SZ-1)&~$MSB;
116
117@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31));
118@X=map("\$$_",(8..23));
119
120$ctx=$a0;
121$inp=$a1;
122$len=$a2; $Ktbl=$len;
123
124sub BODY_00_15 {
125my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
126my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]);
127
128$code.=<<___ if ($i<15);
129 ${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp)
130 ${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp)
131___
132$code.=<<___ if (!$big_endian && $i<16 && $SZ==4);
133 srl $tmp0,@X[0],24 # byte swap($i)
134 srl $tmp1,@X[0],8
135 andi $tmp2,@X[0],0xFF00
136 sll @X[0],@X[0],24
137 andi $tmp1,0xFF00
138 sll $tmp2,$tmp2,8
139 or @X[0],$tmp0
140 or $tmp1,$tmp2
141 or @X[0],$tmp1
142___
143$code.=<<___ if (!$big_endian && $i<16 && $SZ==8);
144 ori $tmp0,$zero,0xFF
145 dsll $tmp2,$tmp0,32
146 or $tmp0,$tmp2 # 0x000000FF000000FF
147 and $tmp1,@X[0],$tmp0 # byte swap($i)
148 dsrl $tmp2,@X[0],24
149 dsll $tmp1,24
150 and $tmp2,$tmp0
151 dsll $tmp0,8 # 0x0000FF000000FF00
152 or $tmp1,$tmp2
153 and $tmp2,@X[0],$tmp0
154 dsrl @X[0],8
155 dsll $tmp2,8
156 and @X[0],$tmp0
157 or $tmp1,$tmp2
158 or @X[0],$tmp1
159 dsrl $tmp1,@X[0],32
160 dsll @X[0],32
161 or @X[0],$tmp1
162___
163$code.=<<___;
164 $ADDU $T1,$X[0],$h # $i
165 $SRL $h,$e,@Sigma1[0]
166 xor $tmp2,$f,$g
167 $SLL $tmp1,$e,`$SZ*8-@Sigma1[2]`
168 and $tmp2,$e
169 $SRL $tmp0,$e,@Sigma1[1]
170 xor $h,$tmp1
171 $SLL $tmp1,$e,`$SZ*8-@Sigma1[1]`
172 xor $h,$tmp0
173 $SRL $tmp0,$e,@Sigma1[2]
174 xor $h,$tmp1
175 $SLL $tmp1,$e,`$SZ*8-@Sigma1[0]`
176 xor $h,$tmp0
177 xor $tmp2,$g # Ch(e,f,g)
178 xor $tmp0,$tmp1,$h # Sigma1(e)
179
180 $SRL $h,$a,@Sigma0[0]
181 $ADDU $T1,$tmp2
182 $LD $tmp2,`$i*$SZ`($Ktbl) # K[$i]
183 $SLL $tmp1,$a,`$SZ*8-@Sigma0[2]`
184 $ADDU $T1,$tmp0
185 $SRL $tmp0,$a,@Sigma0[1]
186 xor $h,$tmp1
187 $SLL $tmp1,$a,`$SZ*8-@Sigma0[1]`
188 xor $h,$tmp0
189 $SRL $tmp0,$a,@Sigma0[2]
190 xor $h,$tmp1
191 $SLL $tmp1,$a,`$SZ*8-@Sigma0[0]`
192 xor $h,$tmp0
193 $ST @X[0],`($i%16)*$SZ`($sp) # offload to ring buffer
194 xor $h,$tmp1 # Sigma0(a)
195
196 or $tmp0,$a,$b
197 and $tmp1,$a,$b
198 and $tmp0,$c
199 or $tmp1,$tmp0 # Maj(a,b,c)
200 $ADDU $T1,$tmp2 # +=K[$i]
201 $ADDU $h,$tmp1
202
203 $ADDU $d,$T1
204 $ADDU $h,$T1
205___
206$code.=<<___ if ($i>=13);
207 $LD @X[3],`(($i+3)%16)*$SZ`($sp) # prefetch from ring buffer
208___
209}
210
211sub BODY_16_XX {
212my $i=@_[0];
213my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]);
214
215$code.=<<___;
216 $SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i)
217 $ADDU @X[0],@X[9] # +=X[i+9]
218 $SLL $tmp1,@X[1],`$SZ*8-@sigma0[2]`
219 $SRL $tmp0,@X[1],@sigma0[1]
220 xor $tmp2,$tmp1
221 $SLL $tmp1,`@sigma0[2]-@sigma0[1]`
222 xor $tmp2,$tmp0
223 $SRL $tmp0,@X[1],@sigma0[2]
224 xor $tmp2,$tmp1
225
226 $SRL $tmp3,@X[14],@sigma1[0]
227 xor $tmp2,$tmp0 # sigma0(X[i+1])
228 $SLL $tmp1,@X[14],`$SZ*8-@sigma1[2]`
229 $ADDU @X[0],$tmp2
230 $SRL $tmp0,@X[14],@sigma1[1]
231 xor $tmp3,$tmp1
232 $SLL $tmp1,`@sigma1[2]-@sigma1[1]`
233 xor $tmp3,$tmp0
234 $SRL $tmp0,@X[14],@sigma1[2]
235 xor $tmp3,$tmp1
236
237 xor $tmp3,$tmp0 # sigma1(X[i+14])
238 $ADDU @X[0],$tmp3
239___
240 &BODY_00_15(@_);
241}
242
243$FRAMESIZE=16*$SZ+16*$SZREG;
244$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
245
246$code.=<<___;
247.text
248.set noat
249#if !defined(__vxworks) || defined(__pic__)
250.option pic2
251#endif
252
253.align 5
254.globl sha${label}_block_data_order
255.ent sha${label}_block_data_order
256sha${label}_block_data_order:
257 .frame $sp,$FRAMESIZE,$ra
258 .mask $SAVED_REGS_MASK,-$SZREG
259 .set noreorder
260___
261$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
262 .cpload $pf
263___
264$code.=<<___;
265 $PTR_SUB $sp,$FRAMESIZE
266 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
267 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
268 $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
269 $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
270 $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
271 $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
272 $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
273 $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
274 $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
275 $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
276___
277$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
278 $REG_S $s3,$FRAMESIZE-11*$SZREG($sp)
279 $REG_S $s2,$FRAMESIZE-12*$SZREG($sp)
280 $REG_S $s1,$FRAMESIZE-13*$SZREG($sp)
281 $REG_S $s0,$FRAMESIZE-14*$SZREG($sp)
282 $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
283___
284$code.=<<___;
285 $PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)`
286___
287$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
288 .cplocal $Ktbl
289 .cpsetup $pf,$zero,sha${label}_block_data_order
290___
291$code.=<<___;
292 .set reorder
293 $LA $Ktbl,K${label} # PIC-ified 'load address'
294
295 $LD $A,0*$SZ($ctx) # load context
296 $LD $B,1*$SZ($ctx)
297 $LD $C,2*$SZ($ctx)
298 $LD $D,3*$SZ($ctx)
299 $LD $E,4*$SZ($ctx)
300 $LD $F,5*$SZ($ctx)
301 $LD $G,6*$SZ($ctx)
302 $LD $H,7*$SZ($ctx)
303
304 $PTR_ADD @X[15],$inp # pointer to the end of input
305 $REG_S @X[15],16*$SZ($sp)
306 b .Loop
307
308.align 5
309.Loop:
310 ${LD}l @X[0],$MSB($inp)
311 ${LD}r @X[0],$LSB($inp)
312___
313for ($i=0;$i<16;$i++)
314{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
315$code.=<<___;
316 b .L16_xx
317.align 4
318.L16_xx:
319___
320for (;$i<32;$i++)
321{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
322$code.=<<___;
323 and @X[6],0xfff
324 li @X[7],$lastK
325 .set noreorder
326 bne @X[6],@X[7],.L16_xx
327 $PTR_ADD $Ktbl,16*$SZ # Ktbl+=16
328
329 $REG_L @X[15],16*$SZ($sp) # restore pointer to the end of input
330 $LD @X[0],0*$SZ($ctx)
331 $LD @X[1],1*$SZ($ctx)
332 $LD @X[2],2*$SZ($ctx)
333 $PTR_ADD $inp,16*$SZ
334 $LD @X[3],3*$SZ($ctx)
335 $ADDU $A,@X[0]
336 $LD @X[4],4*$SZ($ctx)
337 $ADDU $B,@X[1]
338 $LD @X[5],5*$SZ($ctx)
339 $ADDU $C,@X[2]
340 $LD @X[6],6*$SZ($ctx)
341 $ADDU $D,@X[3]
342 $LD @X[7],7*$SZ($ctx)
343 $ADDU $E,@X[4]
344 $ST $A,0*$SZ($ctx)
345 $ADDU $F,@X[5]
346 $ST $B,1*$SZ($ctx)
347 $ADDU $G,@X[6]
348 $ST $C,2*$SZ($ctx)
349 $ADDU $H,@X[7]
350 $ST $D,3*$SZ($ctx)
351 $ST $E,4*$SZ($ctx)
352 $ST $F,5*$SZ($ctx)
353 $ST $G,6*$SZ($ctx)
354 $ST $H,7*$SZ($ctx)
355
356 bne $inp,@X[15],.Loop
357 $PTR_SUB $Ktbl,`($rounds-16)*$SZ` # rewind $Ktbl
358
359 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
360 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
361 $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
362 $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
363 $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
364 $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
365 $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
366 $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
367 $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
368 $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
369___
370$code.=<<___ if ($flavour =~ /nubi/i);
371 $REG_L $s3,$FRAMESIZE-11*$SZREG($sp)
372 $REG_L $s2,$FRAMESIZE-12*$SZREG($sp)
373 $REG_L $s1,$FRAMESIZE-13*$SZREG($sp)
374 $REG_L $s0,$FRAMESIZE-14*$SZREG($sp)
375 $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
376___
377$code.=<<___;
378 jr $ra
379 $PTR_ADD $sp,$FRAMESIZE
380.end sha${label}_block_data_order
381
382.rdata
383.align 5
384K${label}:
385___
386if ($SZ==4) {
387$code.=<<___;
388 .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
389 .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
390 .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
391 .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
392 .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
393 .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
394 .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
395 .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
396 .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
397 .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
398 .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
399 .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
400 .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
401 .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
402 .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
403 .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
404___
405} else {
406$code.=<<___;
407 .dword 0x428a2f98d728ae22, 0x7137449123ef65cd
408 .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
409 .dword 0x3956c25bf348b538, 0x59f111f1b605d019
410 .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
411 .dword 0xd807aa98a3030242, 0x12835b0145706fbe
412 .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
413 .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
414 .dword 0x9bdc06a725c71235, 0xc19bf174cf692694
415 .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
416 .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
417 .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
418 .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
419 .dword 0x983e5152ee66dfab, 0xa831c66d2db43210
420 .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4
421 .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725
422 .dword 0x06ca6351e003826f, 0x142929670a0e6e70
423 .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926
424 .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
425 .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8
426 .dword 0x81c2c92e47edaee6, 0x92722c851482353b
427 .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001
428 .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30
429 .dword 0xd192e819d6ef5218, 0xd69906245565a910
430 .dword 0xf40e35855771202a, 0x106aa07032bbd1b8
431 .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
432 .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
433 .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
434 .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
435 .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60
436 .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec
437 .dword 0x90befffa23631e28, 0xa4506cebde82bde9
438 .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b
439 .dword 0xca273eceea26619c, 0xd186b8c721c0c207
440 .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
441 .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6
442 .dword 0x113f9804bef90dae, 0x1b710b35131c471b
443 .dword 0x28db77f523047d84, 0x32caab7b40c72493
444 .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
445 .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
446 .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
447___
448}
449$code.=<<___;
450.asciiz "SHA${label} for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
451.align 5
452
453___
454
455$code =~ s/\`([^\`]*)\`/eval $1/gem;
456print $code;
457close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-parisc.pl b/src/lib/libcrypto/sha/asm/sha512-parisc.pl
deleted file mode 100755
index 42832e29f1..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-parisc.pl
+++ /dev/null
@@ -1,801 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256/512 block procedure for PA-RISC.
11
12# June 2009.
13#
14# SHA256 performance is >75% better than gcc 3.2 generated code on
15# PA-7100LC. Compared to code generated by vendor compiler this
16# implementation is almost 70% faster in 64-bit build, but delivers
17# virtually the same performance in 32-bit build on PA-8600.
18#
19# SHA512 performance is >2.9x better than gcc 3.2 generated code on
20# PA-7100LC, a PA-RISC 1.1 processor. The implementation then detects if the
21# code is executed on PA-RISC 2.0 processor and switches to 64-bit
22# code path delivering adequate performance even in "blended" 32-bit
23# build. Though 64-bit code is not any faster than code generated by
24# vendor compiler on PA-8600...
25#
26# Special thanks to polarhome.com for providing HP-UX account.
27
28$flavour = shift;
29$output = shift;
30open STDOUT,">$output";
31
32if ($flavour =~ /64/) {
33 $LEVEL ="2.0W";
34 $SIZE_T =8;
35 $FRAME_MARKER =80;
36 $SAVED_RP =16;
37 $PUSH ="std";
38 $PUSHMA ="std,ma";
39 $POP ="ldd";
40 $POPMB ="ldd,mb";
41} else {
42 $LEVEL ="1.0";
43 $SIZE_T =4;
44 $FRAME_MARKER =48;
45 $SAVED_RP =20;
46 $PUSH ="stw";
47 $PUSHMA ="stwm";
48 $POP ="ldw";
49 $POPMB ="ldwm";
50}
51
52if ($output =~ /512/) {
53 $func="sha512_block_data_order";
54 $SZ=8;
55 @Sigma0=(28,34,39);
56 @Sigma1=(14,18,41);
57 @sigma0=(1, 8, 7);
58 @sigma1=(19,61, 6);
59 $rounds=80;
60 $LAST10BITS=0x017;
61 $LD="ldd";
62 $LDM="ldd,ma";
63 $ST="std";
64} else {
65 $func="sha256_block_data_order";
66 $SZ=4;
67 @Sigma0=( 2,13,22);
68 @Sigma1=( 6,11,25);
69 @sigma0=( 7,18, 3);
70 @sigma1=(17,19,10);
71 $rounds=64;
72 $LAST10BITS=0x0f2;
73 $LD="ldw";
74 $LDM="ldwm";
75 $ST="stw";
76}
77
78$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
79 # [+ argument transfer]
80$XOFF=16*$SZ+32; # local variables
81$FRAME+=$XOFF;
82$XOFF+=$FRAME_MARKER; # distance between %sp and local variables
83
84$ctx="%r26"; # zapped by $a0
85$inp="%r25"; # zapped by $a1
86$num="%r24"; # zapped by $t0
87
88$a0 ="%r26";
89$a1 ="%r25";
90$t0 ="%r24";
91$t1 ="%r29";
92$Tbl="%r31";
93
94@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");
95
96@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
97 "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);
98
99sub ROUND_00_15 {
100my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
101$code.=<<___;
102 _ror $e,$Sigma1[0],$a0
103 and $f,$e,$t0
104 _ror $e,$Sigma1[1],$a1
105 addl $t1,$h,$h
106 andcm $g,$e,$t1
107 xor $a1,$a0,$a0
108 _ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1
109 or $t0,$t1,$t1 ; Ch(e,f,g)
110 addl @X[$i%16],$h,$h
111 xor $a0,$a1,$a1 ; Sigma1(e)
112 addl $t1,$h,$h
113 _ror $a,$Sigma0[0],$a0
114 addl $a1,$h,$h
115
116 _ror $a,$Sigma0[1],$a1
117 and $a,$b,$t0
118 and $a,$c,$t1
119 xor $a1,$a0,$a0
120 _ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1
121 xor $t1,$t0,$t0
122 and $b,$c,$t1
123 xor $a0,$a1,$a1 ; Sigma0(a)
124 addl $h,$d,$d
125 xor $t1,$t0,$t0 ; Maj(a,b,c)
126 `"$LDM $SZ($Tbl),$t1" if ($i<15)`
127 addl $a1,$h,$h
128 addl $t0,$h,$h
129
130___
131}
132
133sub ROUND_16_xx {
134my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
135$i-=16;
136$code.=<<___;
137 _ror @X[($i+1)%16],$sigma0[0],$a0
138 _ror @X[($i+1)%16],$sigma0[1],$a1
139 addl @X[($i+9)%16],@X[$i],@X[$i]
140 _ror @X[($i+14)%16],$sigma1[0],$t0
141 _ror @X[($i+14)%16],$sigma1[1],$t1
142 xor $a1,$a0,$a0
143 _shr @X[($i+1)%16],$sigma0[2],$a1
144 xor $t1,$t0,$t0
145 _shr @X[($i+14)%16],$sigma1[2],$t1
146 xor $a1,$a0,$a0 ; sigma0(X[(i+1)&0x0f])
147 xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f])
148 $LDM $SZ($Tbl),$t1
149 addl $a0,@X[$i],@X[$i]
150 addl $t0,@X[$i],@X[$i]
151___
152$code.=<<___ if ($i==15);
153 extru $t1,31,10,$a1
154 comiclr,<> $LAST10BITS,$a1,%r0
155 ldo 1($Tbl),$Tbl ; signal end of $Tbl
156___
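# PA-RISC numbers bits from the MSB, so the "bb,>= $Tbl,31" in the main loop
# tests the least significant bit of $Tbl; the table is 8-byte aligned, which
# leaves that bit free to carry the end-of-table flag set by the ldo above.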
157&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
158}
159
160$code=<<___;
161 .LEVEL $LEVEL
162 .text
163
164 .section .rodata
165 .ALIGN 64
166L\$table
167___
168$code.=<<___ if ($SZ==8);
169 .WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
170 .WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
171 .WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
172 .WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
173 .WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
174 .WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
175 .WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
176 .WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
177 .WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
178 .WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
179 .WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
180 .WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
181 .WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
182 .WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
183 .WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
184 .WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
185 .WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
186 .WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
187 .WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
188 .WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
189 .WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
190 .WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
191 .WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
192 .WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
193 .WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
194 .WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
195 .WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
196 .WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
197 .WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
198 .WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
199 .WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
200 .WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
201 .WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
202 .WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
203 .WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
204 .WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
205 .WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
206 .WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
207 .WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
208 .WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
209___
210$code.=<<___ if ($SZ==4);
211 .WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
212 .WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
213 .WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
214 .WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
215 .WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
216 .WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
217 .WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
218 .WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
219 .WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
220 .WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
221 .WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
222 .WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
223 .WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
224 .WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
225 .WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
226 .WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
227___
228$code.=<<___;
229 .previous
230
231 .EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
232 .ALIGN 64
233$func
234 .PROC
235 .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
236 .ENTRY
237 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
238 $PUSHMA %r3,$FRAME(%sp)
239 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
240 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
241 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
242 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
243 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
244 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
245 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
246 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
247 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
248 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
249 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
250 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
251 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
252 $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
253 $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
254
255 _shl $num,`log(16*$SZ)/log(2)`,$num
256 addl $inp,$num,$num ; $num to point at the end of $inp
257
258 $PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments
259 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
260 $PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)
261
262#ifdef __PIC__
263 addil LT'L\$table, %r19
264 ldw RT'L\$table(%r1), $Tbl
265#else
266 ldil L'L\$table, %t1
267 ldo R'L\$table(%t1), $Tbl
268#endif
269___
270$code.=<<___ if ($SZ==8 && $SIZE_T==4);
271#ifndef __OpenBSD__
272___
273$code.=<<___ if ($SZ==8 && $SIZE_T==4);
274 ldi 31,$t1
275 mtctl $t1,%cr11
276 extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0
277 b L\$parisc1
278 nop
279___
280$code.=<<___;
281 $LD `0*$SZ`($ctx),$A ; load context
282 $LD `1*$SZ`($ctx),$B
283 $LD `2*$SZ`($ctx),$C
284 $LD `3*$SZ`($ctx),$D
285 $LD `4*$SZ`($ctx),$E
286 $LD `5*$SZ`($ctx),$F
287 $LD `6*$SZ`($ctx),$G
288 $LD `7*$SZ`($ctx),$H
289
290 extru $inp,31,`log($SZ)/log(2)`,$t0
291 sh3addl $t0,%r0,$t0
292 subi `8*$SZ`,$t0,$t0
293 mtctl $t0,%cr11 ; load %sar with align factor
294
295L\$oop
296 ldi `$SZ-1`,$t0
297 $LDM $SZ($Tbl),$t1
298 andcm $inp,$t0,$t0 ; align $inp
299___
300 for ($i=0;$i<15;$i++) { # load input block
301 $code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; }
302$code.=<<___;
303 cmpb,*= $inp,$t0,L\$aligned
304 $LD `$SZ*15`($t0),@X[15]
305 $LD `$SZ*16`($t0),@X[16]
306___
307 for ($i=0;$i<16;$i++) { # align data
308 $code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; }
309$code.=<<___;
310L\$aligned
311	nop	; otherwise /usr/ccs/bin/as is confused by the .WORD below
312___
313
314for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
315$code.=<<___;
316L\$rounds
317	nop	; otherwise /usr/ccs/bin/as is confused by the .WORD below
318___
319for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
320$code.=<<___;
321 bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled?
322 nop
323
324 $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
325 $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
326 $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
327 ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl
328
329 $LD `0*$SZ`($ctx),@X[0] ; load context
330 $LD `1*$SZ`($ctx),@X[1]
331 $LD `2*$SZ`($ctx),@X[2]
332 $LD `3*$SZ`($ctx),@X[3]
333 $LD `4*$SZ`($ctx),@X[4]
334 $LD `5*$SZ`($ctx),@X[5]
335 addl @X[0],$A,$A
336 $LD `6*$SZ`($ctx),@X[6]
337 addl @X[1],$B,$B
338 $LD `7*$SZ`($ctx),@X[7]
339 ldo `16*$SZ`($inp),$inp ; advance $inp
340
341 $ST $A,`0*$SZ`($ctx) ; save context
342 addl @X[2],$C,$C
343 $ST $B,`1*$SZ`($ctx)
344 addl @X[3],$D,$D
345 $ST $C,`2*$SZ`($ctx)
346 addl @X[4],$E,$E
347 $ST $D,`3*$SZ`($ctx)
348 addl @X[5],$F,$F
349 $ST $E,`4*$SZ`($ctx)
350 addl @X[6],$G,$G
351 $ST $F,`5*$SZ`($ctx)
352 addl @X[7],$H,$H
353 $ST $G,`6*$SZ`($ctx)
354 $ST $H,`7*$SZ`($ctx)
355
356 cmpb,*<>,n $inp,$num,L\$oop
357 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
358___
359if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0
360{{
361$code.=<<___;
362 b L\$done
363 nop
364
365 .ALIGN 64
366L\$parisc1
367___
368$code.=<<___ if ($SZ==8 && $SIZE_T==4);
369#endif
370___
371
372@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo,
373 $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
374 ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
375 "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
376$a0 ="%r17";
377$a1 ="%r18";
378$a2 ="%r19";
379$a3 ="%r20";
380$t0 ="%r21";
381$t1 ="%r22";
382$t2 ="%r28";
383$t3 ="%r29";
384$Tbl="%r31";
385
386@X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx
387
388sub ROUND_00_15_pa1 {
389my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
390 $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
391my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
392
393$code.=<<___ if (!$flag);
394 ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
395 ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
396___
397$code.=<<___;
398 shd $ehi,$elo,$Sigma1[0],$t0
399 add $Xlo,$hlo,$hlo
400 shd $elo,$ehi,$Sigma1[0],$t1
401 addc $Xhi,$hhi,$hhi ; h += X[i]
402 shd $ehi,$elo,$Sigma1[1],$t2
403 ldwm 8($Tbl),$Xhi
404 shd $elo,$ehi,$Sigma1[1],$t3
405 ldw -4($Tbl),$Xlo ; load K[i]
406 xor $t2,$t0,$t0
407 xor $t3,$t1,$t1
408 and $flo,$elo,$a0
409 and $fhi,$ehi,$a1
410 shd $ehi,$elo,$Sigma1[2],$t2
411 andcm $glo,$elo,$a2
412 shd $elo,$ehi,$Sigma1[2],$t3
413 andcm $ghi,$ehi,$a3
414 xor $t2,$t0,$t0
415 xor $t3,$t1,$t1 ; Sigma1(e)
416 add $Xlo,$hlo,$hlo
417 xor $a2,$a0,$a0
418 addc $Xhi,$hhi,$hhi ; h += K[i]
419 xor $a3,$a1,$a1 ; Ch(e,f,g)
420
421 add $t0,$hlo,$hlo
422 shd $ahi,$alo,$Sigma0[0],$t0
423 addc $t1,$hhi,$hhi ; h += Sigma1(e)
424 shd $alo,$ahi,$Sigma0[0],$t1
425 add $a0,$hlo,$hlo
426 shd $ahi,$alo,$Sigma0[1],$t2
427 addc $a1,$hhi,$hhi ; h += Ch(e,f,g)
428 shd $alo,$ahi,$Sigma0[1],$t3
429
430 xor $t2,$t0,$t0
431 xor $t3,$t1,$t1
432 shd $ahi,$alo,$Sigma0[2],$t2
433 and $alo,$blo,$a0
434 shd $alo,$ahi,$Sigma0[2],$t3
435 and $ahi,$bhi,$a1
436 xor $t2,$t0,$t0
437 xor $t3,$t1,$t1 ; Sigma0(a)
438
439 and $alo,$clo,$a2
440 and $ahi,$chi,$a3
441 xor $a2,$a0,$a0
442 add $hlo,$dlo,$dlo
443 xor $a3,$a1,$a1
444 addc $hhi,$dhi,$dhi ; d += h
445 and $blo,$clo,$a2
446 add $t0,$hlo,$hlo
447 and $bhi,$chi,$a3
448 addc $t1,$hhi,$hhi ; h += Sigma0(a)
449 xor $a2,$a0,$a0
450 add $a0,$hlo,$hlo
451 xor $a3,$a1,$a1 ; Maj(a,b,c)
452 addc $a1,$hhi,$hhi ; h += Maj(a,b,c)
453
454___
455$code.=<<___ if ($i==15 && $flag);
456 extru $Xlo,31,10,$Xlo
457 comiclr,= $LAST10BITS,$Xlo,%r0
458 b L\$rounds_pa1
459 nop
460___
461push(@X,shift(@X)); push(@X,shift(@X));
462}
463
464sub ROUND_16_xx_pa1 {
465my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
466my ($i)=shift;
467$i-=16;
468$code.=<<___;
469 ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
470 ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
471 ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1
472 ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load X[i+9]
473 ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3
474 ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14]
475 shd $Xnhi,$Xnlo,$sigma0[0],$t0
476 shd $Xnlo,$Xnhi,$sigma0[0],$t1
477 add $a0,$Xlo,$Xlo
478 shd $Xnhi,$Xnlo,$sigma0[1],$t2
479 addc $a1,$Xhi,$Xhi
480 shd $Xnlo,$Xnhi,$sigma0[1],$t3
481 xor $t2,$t0,$t0
482 shd $Xnhi,$Xnlo,$sigma0[2],$t2
483 xor $t3,$t1,$t1
484 extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
485 xor $t2,$t0,$t0
486 shd $a3,$a2,$sigma1[0],$a0
487	xor	$t3,$t1,$t1		; sigma0(X[(i+1)&0x0f])
488 shd $a2,$a3,$sigma1[0],$a1
489 add $t0,$Xlo,$Xlo
490 shd $a3,$a2,$sigma1[1],$t2
491 addc $t1,$Xhi,$Xhi
492 shd $a2,$a3,$sigma1[1],$t3
493 xor $t2,$a0,$a0
494 shd $a3,$a2,$sigma1[2],$t2
495 xor $t3,$a1,$a1
496 extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
497 xor $t2,$a0,$a0
498	xor	$t3,$a1,$a1		; sigma1(X[(i+14)&0x0f])
499 add $a0,$Xlo,$Xlo
500 addc $a1,$Xhi,$Xhi
501
502 stw $Xhi,`-$XOFF+8*($i%16)`(%sp)
503 stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp)
504___
505&ROUND_00_15_pa1($i,@_,1);
506}
507$code.=<<___;
508 ldw `0*4`($ctx),$Ahi ; load context
509 ldw `1*4`($ctx),$Alo
510 ldw `2*4`($ctx),$Bhi
511 ldw `3*4`($ctx),$Blo
512 ldw `4*4`($ctx),$Chi
513 ldw `5*4`($ctx),$Clo
514 ldw `6*4`($ctx),$Dhi
515 ldw `7*4`($ctx),$Dlo
516 ldw `8*4`($ctx),$Ehi
517 ldw `9*4`($ctx),$Elo
518 ldw `10*4`($ctx),$Fhi
519 ldw `11*4`($ctx),$Flo
520 ldw `12*4`($ctx),$Ghi
521 ldw `13*4`($ctx),$Glo
522 ldw `14*4`($ctx),$Hhi
523 ldw `15*4`($ctx),$Hlo
524
525 extru $inp,31,2,$t0
526 sh3addl $t0,%r0,$t0
527 subi 32,$t0,$t0
528 mtctl $t0,%cr11 ; load %sar with align factor
529
530L\$oop_pa1
531 extru $inp,31,2,$a3
532 comib,= 0,$a3,L\$aligned_pa1
533 sub $inp,$a3,$inp
534
535 ldw `0*4`($inp),$X[0]
536 ldw `1*4`($inp),$X[1]
537 ldw `2*4`($inp),$t2
538 ldw `3*4`($inp),$t3
539 ldw `4*4`($inp),$a0
540 ldw `5*4`($inp),$a1
541 ldw `6*4`($inp),$a2
542 ldw `7*4`($inp),$a3
543 vshd $X[0],$X[1],$X[0]
544 vshd $X[1],$t2,$X[1]
545 stw $X[0],`-$XOFF+0*4`(%sp)
546 ldw `8*4`($inp),$t0
547 vshd $t2,$t3,$t2
548 stw $X[1],`-$XOFF+1*4`(%sp)
549 ldw `9*4`($inp),$t1
550 vshd $t3,$a0,$t3
551___
552{
553my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
554for ($i=2;$i<=(128/4-8);$i++) {
555$code.=<<___;
556 stw $t[0],`-$XOFF+$i*4`(%sp)
557 ldw `(8+$i)*4`($inp),$t[0]
558 vshd $t[1],$t[2],$t[1]
559___
560push(@t,shift(@t));
561}
562for (;$i<(128/4-1);$i++) {
563$code.=<<___;
564 stw $t[0],`-$XOFF+$i*4`(%sp)
565 vshd $t[1],$t[2],$t[1]
566___
567push(@t,shift(@t));
568}
569$code.=<<___;
570 b L\$collected_pa1
571 stw $t[0],`-$XOFF+$i*4`(%sp)
572
573___
574}
575$code.=<<___;
576L\$aligned_pa1
577 ldw `0*4`($inp),$X[0]
578 ldw `1*4`($inp),$X[1]
579 ldw `2*4`($inp),$t2
580 ldw `3*4`($inp),$t3
581 ldw `4*4`($inp),$a0
582 ldw `5*4`($inp),$a1
583 ldw `6*4`($inp),$a2
584 ldw `7*4`($inp),$a3
585 stw $X[0],`-$XOFF+0*4`(%sp)
586 ldw `8*4`($inp),$t0
587 stw $X[1],`-$XOFF+1*4`(%sp)
588 ldw `9*4`($inp),$t1
589___
590{
591my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
592for ($i=2;$i<(128/4-8);$i++) {
593$code.=<<___;
594 stw $t[0],`-$XOFF+$i*4`(%sp)
595 ldw `(8+$i)*4`($inp),$t[0]
596___
597push(@t,shift(@t));
598}
599for (;$i<128/4;$i++) {
600$code.=<<___;
601 stw $t[0],`-$XOFF+$i*4`(%sp)
602___
603push(@t,shift(@t));
604}
605$code.="L\$collected_pa1\n";
606}
607
608for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
609$code.="L\$rounds_pa1\n";
610for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
611
612$code.=<<___;
613 $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
614 $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
615 $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
616 ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl
617
618 ldw `0*4`($ctx),$t1 ; update context
619 ldw `1*4`($ctx),$t0
620 ldw `2*4`($ctx),$t3
621 ldw `3*4`($ctx),$t2
622 ldw `4*4`($ctx),$a1
623 ldw `5*4`($ctx),$a0
624 ldw `6*4`($ctx),$a3
625 add $t0,$Alo,$Alo
626 ldw `7*4`($ctx),$a2
627 addc $t1,$Ahi,$Ahi
628 ldw `8*4`($ctx),$t1
629 add $t2,$Blo,$Blo
630 ldw `9*4`($ctx),$t0
631 addc $t3,$Bhi,$Bhi
632 ldw `10*4`($ctx),$t3
633 add $a0,$Clo,$Clo
634 ldw `11*4`($ctx),$t2
635 addc $a1,$Chi,$Chi
636 ldw `12*4`($ctx),$a1
637 add $a2,$Dlo,$Dlo
638 ldw `13*4`($ctx),$a0
639 addc $a3,$Dhi,$Dhi
640 ldw `14*4`($ctx),$a3
641 add $t0,$Elo,$Elo
642 ldw `15*4`($ctx),$a2
643 addc $t1,$Ehi,$Ehi
644 stw $Ahi,`0*4`($ctx)
645 add $t2,$Flo,$Flo
646 stw $Alo,`1*4`($ctx)
647 addc $t3,$Fhi,$Fhi
648 stw $Bhi,`2*4`($ctx)
649 add $a0,$Glo,$Glo
650 stw $Blo,`3*4`($ctx)
651 addc $a1,$Ghi,$Ghi
652 stw $Chi,`4*4`($ctx)
653 add $a2,$Hlo,$Hlo
654 stw $Clo,`5*4`($ctx)
655 addc $a3,$Hhi,$Hhi
656 stw $Dhi,`6*4`($ctx)
657 ldo `16*$SZ`($inp),$inp ; advance $inp
658 stw $Dlo,`7*4`($ctx)
659 stw $Ehi,`8*4`($ctx)
660 stw $Elo,`9*4`($ctx)
661 stw $Fhi,`10*4`($ctx)
662 stw $Flo,`11*4`($ctx)
663 stw $Ghi,`12*4`($ctx)
664 stw $Glo,`13*4`($ctx)
665 stw $Hhi,`14*4`($ctx)
666 comb,= $inp,$num,L\$done
667 stw $Hlo,`15*4`($ctx)
668 b L\$oop_pa1
669 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
670L\$done
671___
672}}
673$code.=<<___;
674 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
675 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
676 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
677 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
678 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
679 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
680 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
681 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
682 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
683 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
684 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
685 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
686 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
687 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
688 $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
689 $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
690 bv (%r2)
691 .EXIT
692 $POPMB -$FRAME(%sp),%r3
693 .PROCEND
694___
695
696# Explicitly encode PA-RISC 2.0 instructions used in this module, so
697# that it can be compiled with .LEVEL 1.0. It should be noted that I
698# wouldn't have to do this if the GNU assembler understood the
699# .ALLOW 2.0 directive...
700
701my $ldd = sub {
702 my ($mod,$args) = @_;
703 my $orig = "ldd$mod\t$args";
704
705 if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices
706 { my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1);
707 $opcode|=(1<<3) if ($mod =~ /^,m/);
708 $opcode|=(1<<2) if ($mod =~ /^,mb/);
709 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
710 }
711 else { "\t".$orig; }
712};
713
714my $std = sub {
715 my ($mod,$args) = @_;
716 my $orig = "std$mod\t$args";
717
718 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
719 { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
720 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
721 }
722 else { "\t".$orig; }
723};
724
725my $extrd = sub {
726 my ($mod,$args) = @_;
727 my $orig = "extrd$mod\t$args";
728
729	# I only have the ",u" completer; it's implicitly encoded...
730 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
731 { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
732 my $len=32-$3;
733 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
734 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
735 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
736 }
737 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
738 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
739 my $len=32-$2;
740 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
741 $opcode |= (1<<13) if ($mod =~ /,\**=/);
742 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
743 }
744 else { "\t".$orig; }
745};
746
747my $shrpd = sub {
748 my ($mod,$args) = @_;
749 my $orig = "shrpd$mod\t$args";
750
751 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
752 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
753 my $cpos=63-$3;
754 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
755 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
756 }
757 elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
758 { sprintf "\t.WORD\t0x%08x\t; %s",
759 (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
760 }
761 else { "\t".$orig; }
762};
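Each helper packs the instruction fields into a raw 32-bit word by hand. As a worked example, a C sketch mirroring the format-3 std branch above (register numbers r and b, displacement d assumed already range-checked; the helper name is illustrative):

    #include <stdint.h>

    /* (0x1c<<26)|(b<<21)|(r<<16)|((d&0x1FF8)<<1)|((d>>13)&1), as above */
    static uint32_t
    encode_std(unsigned int r, int32_t d, unsigned int b)
    {
    	uint32_t ud = (uint32_t)d;

    	return (0x1cU << 26) | (b << 21) | (r << 16) |
    	    ((ud & 0x1FF8) << 1) | ((ud >> 13) & 1);
    }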
763
764sub assemble {
765 my ($mnemonic,$mod,$args)=@_;
766 my $opcode = eval("\$$mnemonic");
767
768 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
769}
770
771foreach (split("\n",$code)) {
772 s/\`([^\`]*)\`/eval $1/ge;
773
774 s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
775 $3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32
776 : sprintf("shd\t%$1,%$2,%d",$3)/e or
777	# translate made-up instructions: _ror, _shr, _align, _shl
778 s/_ror(\s+)(%r[0-9]+),/
779 ($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or
780
781 s/_shr(\s+%r[0-9]+),([0-9]+),/
782 $SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
783 : sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or
784
785 s/_align(\s+%r[0-9]+,%r[0-9]+),/
786 ($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or
787
788 s/_shl(\s+%r[0-9]+),([0-9]+),/
789 $SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
790 : sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;
791
792 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);
793
794 s/cmpb,\*/comb,/ if ($SIZE_T==4);
795
796 s/\bbv\b/bve/ if ($SIZE_T==8);
797
798 print $_,"\n";
799}
800
801close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-ppc.pl b/src/lib/libcrypto/sha/asm/sha512-ppc.pl
deleted file mode 100755
index 28bd997cf8..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-ppc.pl
+++ /dev/null
@@ -1,444 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# I let hardware handle unaligned input, except on page boundaries
11# (see below for details). Otherwise it is a straightforward
12# implementation with the X vector in the register bank. The module is
13# big-endian [not a big deal, as there are no little-endian targets left].
14
15# sha256 | sha512
16# -m64 -m32 | -m64 -m32
17# --------------------------------------+-----------------------
18# PPC970,gcc-4.0.0 +50% +38% | +40% +410%(*)
19# Power6,xlc-7 +150% +90% | +100% +430%(*)
20#
21# (*) 64-bit code in 32-bit application context, which actually is
22#	on the TODO list. It should be noted that for safe deployment in
23#	a 32-bit *multi-threaded* context, asynchronous signals should be
24#	blocked upon entry to the SHA512 block routine. This is because the
25#	32-bit signaling procedure invalidates the upper halves of GPRs.
26#	The context switch procedure preserves them, but signaling does not:-(
27
28# The second version is truly multi-thread safe. The trouble with the
29# original version was that it used the thread-local storage pointer
30# register. It scrupulously preserved it, but the problem would arise
31# the moment an asynchronous signal was delivered and the signal handler
32# dereferenced the TLS pointer. While this never happens in the openssl
33# application or test suite, we have to respect the scenario and not
34# use the TLS pointer register. An alternative would be to require the
35# caller to block signals prior to calling this routine. For the record,
36# R2 serves as the TLS pointer in 32-bit context, R13 in 64-bit context.
37
38$flavour=shift;
39$output =shift;
40
41if ($flavour =~ /64/) {
42 $SIZE_T=8;
43 $LRSAVE=2*$SIZE_T;
44 $STU="stdu";
45 $UCMP="cmpld";
46 $SHL="sldi";
47 $POP="ld";
48 $PUSH="std";
49} elsif ($flavour =~ /32/) {
50 $SIZE_T=4;
51 $LRSAVE=$SIZE_T;
52 $STU="stwu";
53 $UCMP="cmplw";
54 $SHL="slwi";
55 $POP="lwz";
56 $PUSH="stw";
57} else { die "nonsense $flavour"; }
58
59$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
60( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
61( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
62die "can't locate ppc-xlate.pl";
63
64open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
65
66if ($output =~ /512/) {
67 $func="sha512_block_data_order";
68 $SZ=8;
69 @Sigma0=(28,34,39);
70 @Sigma1=(14,18,41);
71 @sigma0=(1, 8, 7);
72 @sigma1=(19,61, 6);
73 $rounds=80;
74 $LD="ld";
75 $ST="std";
76 $ROR="rotrdi";
77 $SHR="srdi";
78} else {
79 $func="sha256_block_data_order";
80 $SZ=4;
81 @Sigma0=( 2,13,22);
82 @Sigma1=( 6,11,25);
83 @sigma0=( 7,18, 3);
84 @sigma1=(17,19,10);
85 $rounds=64;
86 $LD="lwz";
87 $ST="stw";
88 $ROR="rotrwi";
89 $SHR="srwi";
90}
91
92$FRAME=32*$SIZE_T+16*$SZ;
93$LOCALS=6*$SIZE_T;
94
95$sp ="r1";
96$toc="r2";
97$ctx="r3"; # zapped by $a0
98$inp="r4"; # zapped by $a1
99$num="r5"; # zapped by $t0
100
101$T ="r0";
102$a0 ="r3";
103$a1 ="r4";
104$t0 ="r5";
105$t1 ="r6";
106$Tbl="r7";
107
108$A ="r8";
109$B ="r9";
110$C ="r10";
111$D ="r11";
112$E ="r12";
113$F ="r13"; $F="r2" if ($SIZE_T==8);# reassigned to exempt TLS pointer
114$G ="r14";
115$H ="r15";
116
117@V=($A,$B,$C,$D,$E,$F,$G,$H);
118@X=("r16","r17","r18","r19","r20","r21","r22","r23",
119 "r24","r25","r26","r27","r28","r29","r30","r31");
120
121$inp="r31"; # reassigned $inp! aliases with @X[15]
122
123sub ROUND_00_15 {
124my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
125$code.=<<___;
126 $LD $T,`$i*$SZ`($Tbl)
127 $ROR $a0,$e,$Sigma1[0]
128 $ROR $a1,$e,$Sigma1[1]
129 and $t0,$f,$e
130 andc $t1,$g,$e
131 add $T,$T,$h
132 xor $a0,$a0,$a1
133 $ROR $a1,$a1,`$Sigma1[2]-$Sigma1[1]`
134 or $t0,$t0,$t1 ; Ch(e,f,g)
135 add $T,$T,@X[$i]
136 xor $a0,$a0,$a1 ; Sigma1(e)
137 add $T,$T,$t0
138 add $T,$T,$a0
139
140 $ROR $a0,$a,$Sigma0[0]
141 $ROR $a1,$a,$Sigma0[1]
142 and $t0,$a,$b
143 and $t1,$a,$c
144 xor $a0,$a0,$a1
145 $ROR $a1,$a1,`$Sigma0[2]-$Sigma0[1]`
146 xor $t0,$t0,$t1
147 and $t1,$b,$c
148 xor $a0,$a0,$a1 ; Sigma0(a)
149 add $d,$d,$T
150 xor $t0,$t0,$t1 ; Maj(a,b,c)
151 add $h,$T,$a0
152 add $h,$h,$t0
153
154___
155}
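In scalar terms this is the textbook SHA-2 round: T = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]; d += T; h = T + Sigma0(a) + Maj(a,b,c), with Ch formed as (e&f)|(~e&g) via and/andc/or. A C sketch of the 32-bit (SHA-256) case, with illustrative names:

    #include <stdint.h>

    static inline uint32_t
    rotr32(uint32_t x, unsigned int n)
    {
    	return (x >> n) | (x << (32 - n));
    }

    static void
    sha256_round(uint32_t a, uint32_t b, uint32_t c, uint32_t *d, uint32_t e,
        uint32_t f, uint32_t g, uint32_t *h, uint32_t Ki, uint32_t Xi)
    {
    	uint32_t Sigma1 = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
    	uint32_t Sigma0 = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
    	uint32_t Ch = (e & f) | (~e & g);	/* and/andc/or, as above */
    	uint32_t Maj = (a & b) ^ (a & c) ^ (b & c);
    	uint32_t T = *h + Sigma1 + Ch + Ki + Xi;

    	*d += T;
    	*h = T + Sigma0 + Maj;
    }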
156
157sub ROUND_16_xx {
158my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
159$i-=16;
160$code.=<<___;
161 $ROR $a0,@X[($i+1)%16],$sigma0[0]
162 $ROR $a1,@X[($i+1)%16],$sigma0[1]
163 $ROR $t0,@X[($i+14)%16],$sigma1[0]
164 $ROR $t1,@X[($i+14)%16],$sigma1[1]
165 xor $a0,$a0,$a1
166 $SHR $a1,@X[($i+1)%16],$sigma0[2]
167 xor $t0,$t0,$t1
168 $SHR $t1,@X[($i+14)%16],$sigma1[2]
169 add @X[$i],@X[$i],@X[($i+9)%16]
170 xor $a0,$a0,$a1 ; sigma0(X[(i+1)&0x0f])
171 xor $t0,$t0,$t1 ; sigma1(X[(i+14)&0x0f])
172 add @X[$i],@X[$i],$a0
173 add @X[$i],@X[$i],$t0
174___
175&ROUND_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h);
176}
177
178$code=<<___;
179.machine "any"
180.text
181
182.globl $func
183.align 6
184$func:
185 $STU $sp,-$FRAME($sp)
186 mflr r0
187 $SHL $num,$num,`log(16*$SZ)/log(2)`
188
189 $PUSH $ctx,`$FRAME-$SIZE_T*22`($sp)
190
191 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
192 $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
193 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
194 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
195 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
196 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
197 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
198 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
199 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
200 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
201 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
202 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
203 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
204 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
205 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
206 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
207 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
208 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
209 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
210 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
211 $PUSH r0,`$FRAME+$LRSAVE`($sp)
212
213 $LD $A,`0*$SZ`($ctx)
214 mr $inp,r4 ; incarnate $inp
215 $LD $B,`1*$SZ`($ctx)
216 $LD $C,`2*$SZ`($ctx)
217 $LD $D,`3*$SZ`($ctx)
218 $LD $E,`4*$SZ`($ctx)
219 $LD $F,`5*$SZ`($ctx)
220 $LD $G,`6*$SZ`($ctx)
221 $LD $H,`7*$SZ`($ctx)
222
223 bcl 20,31,Lpc
224Lpc:
225 mflr $Tbl
226 addis $Tbl,$Tbl,Ltable-Lpc\@ha
227 addi $Tbl,$Tbl,Ltable-Lpc\@l
228 andi. r0,$inp,3
229 bne Lunaligned
230Laligned:
231 add $num,$inp,$num
232 $PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
233 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
234 bl Lsha2_block_private
235 b Ldone
236
237; The PowerPC specification allows an implementation to be ill-behaved
238; upon an unaligned access that crosses a page boundary. The "better
239; safe than sorry" principle makes me treat it specially. I don't look
240; for the particular offending word, but rather for the input block
241; that crosses the boundary. Once found, that block is copied to an
242; aligned buffer and hashed separately...
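A C sketch of the distance computation done at Lunaligned below, assuming 4096-byte pages and a power-of-two block size (hypothetical helper mirroring the subfic/andi. pair, not part of the module):

    #include <stddef.h>
    #include <stdint.h>

    /*
     * Bytes from inp to the nearest page boundary, rounded down to a
     * whole number of blocks; 0 means the very first block already
     * crosses a boundary (the Lcross_page case).
     */
    static size_t
    blocks_before_page_end(uintptr_t inp, size_t blocksz)
    {
    	size_t dist = 4096 - (inp & 4095);	/* subfic $t1,$inp,4096 */

    	return dist & (4096 - blocksz);		/* andi. $t1,$t1,4096-16*SZ */
    }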
243.align 4
244Lunaligned:
245 subfic $t1,$inp,4096
246 andi. $t1,$t1,`4096-16*$SZ` ; distance to closest page boundary
247 beq Lcross_page
248 $UCMP $num,$t1
249 ble- Laligned ; didn't cross the page boundary
250 subfc $num,$t1,$num
251 add $t1,$inp,$t1
252 $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real remaining num
253 $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; intermediate end pointer
254 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
255 bl Lsha2_block_private
256	; $inp equals the intermediate end pointer here
257 $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real remaining num
258Lcross_page:
259 li $t1,`16*$SZ/4`
260 mtctr $t1
261 addi r20,$sp,$LOCALS ; aligned spot below the frame
262Lmemcpy:
263 lbz r16,0($inp)
264 lbz r17,1($inp)
265 lbz r18,2($inp)
266 lbz r19,3($inp)
267 addi $inp,$inp,4
268 stb r16,0(r20)
269 stb r17,1(r20)
270 stb r18,2(r20)
271 stb r19,3(r20)
272 addi r20,r20,4
273 bdnz Lmemcpy
274
275 $PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp
276 addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer
277 addi $inp,$sp,$LOCALS ; fictitious inp pointer
278 $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num
279 $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer
280 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
281 bl Lsha2_block_private
282 $POP $inp,`$FRAME-$SIZE_T*26`($sp) ; restore real inp
283 $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num
284 addic. $num,$num,`-16*$SZ` ; num--
285 bne- Lunaligned
286
287Ldone:
288 $POP r0,`$FRAME+$LRSAVE`($sp)
289 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
290 $POP r13,`$FRAME-$SIZE_T*19`($sp)
291 $POP r14,`$FRAME-$SIZE_T*18`($sp)
292 $POP r15,`$FRAME-$SIZE_T*17`($sp)
293 $POP r16,`$FRAME-$SIZE_T*16`($sp)
294 $POP r17,`$FRAME-$SIZE_T*15`($sp)
295 $POP r18,`$FRAME-$SIZE_T*14`($sp)
296 $POP r19,`$FRAME-$SIZE_T*13`($sp)
297 $POP r20,`$FRAME-$SIZE_T*12`($sp)
298 $POP r21,`$FRAME-$SIZE_T*11`($sp)
299 $POP r22,`$FRAME-$SIZE_T*10`($sp)
300 $POP r23,`$FRAME-$SIZE_T*9`($sp)
301 $POP r24,`$FRAME-$SIZE_T*8`($sp)
302 $POP r25,`$FRAME-$SIZE_T*7`($sp)
303 $POP r26,`$FRAME-$SIZE_T*6`($sp)
304 $POP r27,`$FRAME-$SIZE_T*5`($sp)
305 $POP r28,`$FRAME-$SIZE_T*4`($sp)
306 $POP r29,`$FRAME-$SIZE_T*3`($sp)
307 $POP r30,`$FRAME-$SIZE_T*2`($sp)
308 $POP r31,`$FRAME-$SIZE_T*1`($sp)
309 mtlr r0
310 addi $sp,$sp,$FRAME
311 blr
312
313.align 4
314Lsha2_block_private:
315___
316for($i=0;$i<16;$i++) {
317$code.=<<___ if ($SZ==4);
318 lwz @X[$i],`$i*$SZ`($inp)
319___
320# 64-bit loads are split into 2x32-bit ones, as the CPU can't handle
321# unaligned 64-bit loads, only 32-bit ones...
322$code.=<<___ if ($SZ==8);
323 lwz $t0,`$i*$SZ`($inp)
324 lwz @X[$i],`$i*$SZ+4`($inp)
325 insrdi @X[$i],$t0,32,0
326___
327 &ROUND_00_15($i,@V);
328 unshift(@V,pop(@V));
329}
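A one-line C equivalent of the lwz/lwz/insrdi triple, assuming big-endian word order and illustrative names:

    #include <stdint.h>

    /* hi is the word at offset i*8, lo the word at i*8+4 (big-endian). */
    static inline uint64_t
    combine32(uint32_t hi, uint32_t lo)
    {
    	return ((uint64_t)hi << 32) | lo;	/* insrdi X,t0,32,0 */
    }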
330$code.=<<___;
331 li $T,`$rounds/16-1`
332 mtctr $T
333.align 4
334Lrounds:
335 addi $Tbl,$Tbl,`16*$SZ`
336___
337for(;$i<32;$i++) {
338 &ROUND_16_xx($i,@V);
339 unshift(@V,pop(@V));
340}
341$code.=<<___;
342 bdnz- Lrounds
343
344 $POP $ctx,`$FRAME-$SIZE_T*22`($sp)
345 $POP $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
346 $POP $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
347 subi $Tbl,$Tbl,`($rounds-16)*$SZ` ; rewind Tbl
348
349 $LD r16,`0*$SZ`($ctx)
350 $LD r17,`1*$SZ`($ctx)
351 $LD r18,`2*$SZ`($ctx)
352 $LD r19,`3*$SZ`($ctx)
353 $LD r20,`4*$SZ`($ctx)
354 $LD r21,`5*$SZ`($ctx)
355 $LD r22,`6*$SZ`($ctx)
356 addi $inp,$inp,`16*$SZ` ; advance inp
357 $LD r23,`7*$SZ`($ctx)
358 add $A,$A,r16
359 add $B,$B,r17
360 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp)
361 add $C,$C,r18
362 $ST $A,`0*$SZ`($ctx)
363 add $D,$D,r19
364 $ST $B,`1*$SZ`($ctx)
365 add $E,$E,r20
366 $ST $C,`2*$SZ`($ctx)
367 add $F,$F,r21
368 $ST $D,`3*$SZ`($ctx)
369 add $G,$G,r22
370 $ST $E,`4*$SZ`($ctx)
371 add $H,$H,r23
372 $ST $F,`5*$SZ`($ctx)
373 $ST $G,`6*$SZ`($ctx)
374 $UCMP $inp,$num
375 $ST $H,`7*$SZ`($ctx)
376 bne Lsha2_block_private
377 blr
378 .section .rodata
379Ltable:
380___
381$code.=<<___ if ($SZ==8);
382 .long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
383 .long 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
384 .long 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
385 .long 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
386 .long 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
387 .long 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
388 .long 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
389 .long 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
390 .long 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
391 .long 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
392 .long 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
393 .long 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
394 .long 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
395 .long 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
396 .long 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
397 .long 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
398 .long 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
399 .long 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
400 .long 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
401 .long 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
402 .long 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
403 .long 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
404 .long 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
405 .long 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
406 .long 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
407 .long 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
408 .long 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
409 .long 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
410 .long 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
411 .long 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
412 .long 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
413 .long 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
414 .long 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
415 .long 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
416 .long 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
417 .long 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
418 .long 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
419 .long 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
420 .long 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
421 .long 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
422___
423$code.=<<___ if ($SZ==4);
424 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
425 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
426 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
427 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
428 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
429 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
430 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
431 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
432 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
433 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
434 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
435 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
436 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
437 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
438 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
439 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
440___
441
442$code =~ s/\`([^\`]*)\`/eval $1/gem;
443print $code;
444close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
deleted file mode 100644
index 3c93799446..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
+++ /dev/null
@@ -1,604 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256 performance improvement over compiler-generated code varies
11# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
12# build]. Just like in the SHA1 module, I aim to ensure scalability on
13# UltraSPARC T1 by packing X[16] into 8 64-bit registers.
14
15# SHA512 on pre-T1 UltraSPARC.
16#
17# Performance is >75% better than 64-bit code generated by Sun C and
18# over 2x better than 32-bit code. X[16] resides on the stack, but
19# access to it is scheduled for L2 latency and staged through the 32
20# least significant bits of %l0-%l7. The latter is done to achieve
21# 32-/64-bit ABI duality. Nevertheless it's ~40% faster than SHA256,
22# which is pretty good [the optimal coefficient is 50%].
23#
24# SHA512 on UltraSPARC T1.
25#
26# It's no faster than 64-bit code generated by Sun C 5.8. This is
27# because the 64-bit code generator has the advantage of using 64-bit
28# loads(*) to access X[16], which I consciously traded for 32-/64-bit
29# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
30# code by 60%, not to mention that it doesn't suffer severe decay when
31# running 4x as many threads as physical cores, and that it leaves gcc
32# [3.4] behind by over a 4x factor! Compared to SHA256, single-thread
33# performance is only 10% better, but overall throughput at the maximum
34# thread count for a given CPU exceeds that of SHA256
35# by 30% [again, the optimal coefficient is 50%].
36#
37# (*)	Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
38#	in-order, i.e. a load instruction has to complete before the next
39#	instruction in the given thread executes, even if the latter does
40#	not depend on the load result! This means that on T1 two 32-bit
41#	loads are always slower than one 64-bit load. Once again this
42#	is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
43#	2x32-bit loads can be as fast as 1x64-bit ones.
44
45$bits=32;
46for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
47if ($bits==64) { $bias=2047; $frame=192; }
48else { $bias=0; $frame=112; }
49
50$output=shift;
51open STDOUT,">$output";
52
53if ($output =~ /512/) {
54 $label="512";
55 $SZ=8;
56 $LD="ldx"; # load from memory
57 $ST="stx"; # store to memory
58 $SLL="sllx"; # shift left logical
59 $SRL="srlx"; # shift right logical
60 @Sigma0=(28,34,39);
61 @Sigma1=(14,18,41);
62 @sigma0=( 7, 1, 8); # right shift first
63 @sigma1=( 6,19,61); # right shift first
64 $lastK=0x817;
65 $rounds=80;
66 $align=4;
67
68 $locals=16*$SZ; # X[16]
69
70 $A="%o0";
71 $B="%o1";
72 $C="%o2";
73 $D="%o3";
74 $E="%o4";
75 $F="%o5";
76 $G="%g1";
77 $H="%o7";
78 @V=($A,$B,$C,$D,$E,$F,$G,$H);
79} else {
80 $label="256";
81 $SZ=4;
82 $LD="ld"; # load from memory
83 $ST="st"; # store to memory
84 $SLL="sll"; # shift left logical
85 $SRL="srl"; # shift right logical
86 @Sigma0=( 2,13,22);
87 @Sigma1=( 6,11,25);
88 @sigma0=( 3, 7,18); # right shift first
89 @sigma1=(10,17,19); # right shift first
90 $lastK=0x8f2;
91 $rounds=64;
92 $align=8;
93
94 $locals=0; # X[16] is register resident
95 @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
96
97 $A="%l0";
98 $B="%l1";
99 $C="%l2";
100 $D="%l3";
101 $E="%l4";
102 $F="%l5";
103 $G="%l6";
104 $H="%l7";
105 @V=($A,$B,$C,$D,$E,$F,$G,$H);
106}
107$T1="%g2";
108$tmp0="%g3";
109$tmp1="%g4";
110$tmp2="%g5";
111
112$ctx="%i0";
113$inp="%i1";
114$len="%i2";
115$Ktbl="%i3";
116$tmp31="%i4";
117$tmp32="%i5";
118
119########### SHA256
120$Xload = sub {
121my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
122
123 if ($i==0) {
124$code.=<<___;
125 ldx [$inp+0],@X[0]
126 ldx [$inp+16],@X[2]
127 ldx [$inp+32],@X[4]
128 ldx [$inp+48],@X[6]
129 ldx [$inp+8],@X[1]
130 ldx [$inp+24],@X[3]
131 subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
132 ldx [$inp+40],@X[5]
133 bz,pt %icc,.Laligned
134 ldx [$inp+56],@X[7]
135
136 sllx @X[0],$tmp31,@X[0]
137 ldx [$inp+64],$T1
138___
139for($j=0;$j<7;$j++)
140{ $code.=<<___;
141 srlx @X[$j+1],$tmp32,$tmp1
142 sllx @X[$j+1],$tmp31,@X[$j+1]
143 or $tmp1,@X[$j],@X[$j]
144___
145}
146$code.=<<___;
147 srlx $T1,$tmp32,$T1
148 or $T1,@X[7],@X[7]
149.Laligned:
150___
151 }
152
153 if ($i&1) {
154 $code.="\tadd @X[$i/2],$h,$T1\n";
155 } else {
156 $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
157 }
158} if ($SZ==4);
159
160########### SHA512
161$Xload = sub {
162my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
163my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
164
165$code.=<<___ if ($i==0);
166 ld [$inp+0],%l0
167 ld [$inp+4],%l1
168 ld [$inp+8],%l2
169 ld [$inp+12],%l3
170 ld [$inp+16],%l4
171 ld [$inp+20],%l5
172 ld [$inp+24],%l6
173 ld [$inp+28],%l7
174___
175$code.=<<___ if ($i<15);
176 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
177 add $tmp31,32,$tmp0
178 sllx @pair[0],$tmp0,$tmp1
179 `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
180 srlx @pair[2],$tmp32,@pair[1]
181 or $tmp1,$tmp2,$tmp2
182 or @pair[1],$tmp2,$tmp2
183 `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
184 add $h,$tmp2,$T1
185 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
186___
187$code.=<<___ if ($i==12);
188 brnz,a $tmp31,.+8
189 ld [$inp+128],%l0
190___
191$code.=<<___ if ($i==15);
192 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
193 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
194 add $tmp31,32,$tmp0
195 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
196 sllx @pair[0],$tmp0,$tmp1
197 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
198 srlx @pair[2],$tmp32,@pair[1]
199 or $tmp1,$tmp2,$tmp2
200 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
201 or @pair[1],$tmp2,$tmp2
202 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
203 add $h,$tmp2,$T1
204 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
205 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
206 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
207 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
208___
209} if ($SZ==8);
210
211########### common
212sub BODY_00_15 {
213my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
214
215 if ($i<16) {
216 &$Xload(@_);
217 } else {
218 $code.="\tadd $h,$T1,$T1\n";
219 }
220
221$code.=<<___;
222 $SRL $e,@Sigma1[0],$h !! $i
223 xor $f,$g,$tmp2
224 $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
225 and $e,$tmp2,$tmp2
226 $SRL $e,@Sigma1[1],$tmp0
227 xor $tmp1,$h,$h
228 $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
229 xor $tmp0,$h,$h
230 $SRL $e,@Sigma1[2],$tmp0
231 xor $tmp1,$h,$h
232 $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
233 xor $tmp0,$h,$h
234 xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
235 xor $tmp1,$h,$tmp0 ! Sigma1(e)
236
237 $SRL $a,@Sigma0[0],$h
238 add $tmp2,$T1,$T1
239 $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
240 $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
241 add $tmp0,$T1,$T1
242 $SRL $a,@Sigma0[1],$tmp0
243 xor $tmp1,$h,$h
244 $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
245 xor $tmp0,$h,$h
246 $SRL $a,@Sigma0[2],$tmp0
247 xor $tmp1,$h,$h
248 $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
249 xor $tmp0,$h,$h
250 xor $tmp1,$h,$h ! Sigma0(a)
251
252 or $a,$b,$tmp0
253 and $a,$b,$tmp1
254 and $c,$tmp0,$tmp0
255 or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
256 add $tmp2,$T1,$T1 ! +=K[$i]
257 add $tmp1,$h,$h
258
259 add $T1,$d,$d
260 add $T1,$h,$h
261___
262}
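Since the code does not rely on a rotate instruction, every Sigma term above is synthesized as an SRL/SLL pair folded in with xor. The same construction in C, with illustrative names:

    #include <stdint.h>

    /* rotr(x,n) from shifts, as the SRL/SLL + xor chains above */
    static inline uint32_t
    rotr32(uint32_t x, unsigned int n)
    {
    	return (x >> n) | (x << (32 - n));
    }

    /* e.g. SHA-256 Sigma1(e), accumulated term by term in the code */
    static inline uint32_t
    Sigma1_256(uint32_t e)
    {
    	return rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
    }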
263
264########### SHA256
265$BODY_16_XX = sub {
266my $i=@_[0];
267my $xi;
268
269 if ($i&1) {
270 $xi=$tmp32;
271 $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
272 } else {
273 $xi=@X[(($i+1)/2)%8];
274 }
275$code.=<<___;
276 srl $xi,@sigma0[0],$T1 !! Xupdate($i)
277 sll $xi,`32-@sigma0[2]`,$tmp1
278 srl $xi,@sigma0[1],$tmp0
279 xor $tmp1,$T1,$T1
280 sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
281 xor $tmp0,$T1,$T1
282 srl $xi,@sigma0[2],$tmp0
283 xor $tmp1,$T1,$T1
284___
285 if ($i&1) {
286 $xi=@X[(($i+14)/2)%8];
287 } else {
288 $xi=$tmp32;
289 $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
290 }
291$code.=<<___;
292 srl $xi,@sigma1[0],$tmp2
293 xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
294 sll $xi,`32-@sigma1[2]`,$tmp1
295 srl $xi,@sigma1[1],$tmp0
296 xor $tmp1,$tmp2,$tmp2
297 sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
298 xor $tmp0,$tmp2,$tmp2
299 srl $xi,@sigma1[2],$tmp0
300 xor $tmp1,$tmp2,$tmp2
301___
302 if ($i&1) {
303 $xi=@X[($i/2)%8];
304$code.=<<___;
305 srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
306 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
307 srl @X[($i/2)%8],0,$tmp0
308 add $tmp2,$tmp1,$tmp1
309 add $xi,$T1,$T1 ! +=X[i]
310 xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
311 add $tmp1,$T1,$T1
312
313 srl $T1,0,$T1
314 or $T1,@X[($i/2)%8],@X[($i/2)%8]
315___
316 } else {
317 $xi=@X[(($i+9)/2)%8];
318$code.=<<___;
319 srlx @X[($i/2)%8],32,$tmp1 ! X[i]
320 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
321 add $xi,$T1,$T1 ! +=X[i+9]
322 add $tmp2,$tmp1,$tmp1
323 srl @X[($i/2)%8],0,@X[($i/2)%8]
324 add $tmp1,$T1,$T1
325
326 sllx $T1,32,$tmp0
327 or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
328___
329 }
330 &BODY_00_15(@_);
331} if ($SZ==4);
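Because X[16] is packed two 32-bit words per 64-bit register (for 32-/64-bit ABI duality), each schedule step must select either the high or the low half of a packed register, which is what the (i&1) branches above do. A C sketch of the selection, with illustrative names:

    #include <stdint.h>

    /* X packed as 8 x 64-bit: word 2k in the high half, word 2k+1 low. */
    static inline uint32_t
    pick_word(const uint64_t X[8], unsigned int w)
    {
    	uint64_t pair = X[(w / 2) % 8];

    	return (w & 1) ? (uint32_t)pair : (uint32_t)(pair >> 32);
    }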
332
333########### SHA512
334$BODY_16_XX = sub {
335my $i=@_[0];
336my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
337
338$code.=<<___;
339 sllx %l2,32,$tmp0 !! Xupdate($i)
340 or %l3,$tmp0,$tmp0
341
342 srlx $tmp0,@sigma0[0],$T1
343 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
344 sllx $tmp0,`64-@sigma0[2]`,$tmp1
345 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
346 srlx $tmp0,@sigma0[1],$tmp0
347 xor $tmp1,$T1,$T1
348 sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
349 xor $tmp0,$T1,$T1
350 srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
351 xor $tmp1,$T1,$T1
352 sllx %l6,32,$tmp2
353 xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
354 or %l7,$tmp2,$tmp2
355
356 srlx $tmp2,@sigma1[0],$tmp1
357 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
358 sllx $tmp2,`64-@sigma1[2]`,$tmp0
359 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
360 srlx $tmp2,@sigma1[1],$tmp2
361 xor $tmp0,$tmp1,$tmp1
362 sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
363 xor $tmp2,$tmp1,$tmp1
364 srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
365 xor $tmp0,$tmp1,$tmp1
366 sllx %l4,32,$tmp0
367 xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
368 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
369 or %l5,$tmp0,$tmp0
370 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
371
372 sllx %l0,32,$tmp2
373 add $tmp1,$T1,$T1
374 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
375 or %l1,$tmp2,$tmp2
376 add $tmp0,$T1,$T1 ! +=X[$i+9]
377 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
378 add $tmp2,$T1,$T1 ! +=X[$i]
379 $ST $T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
380___
381 &BODY_00_15(@_);
382} if ($SZ==8);
383
384$code.=<<___ if ($bits==64);
385.register %g2,#scratch
386.register %g3,#scratch
387___
388$code.=<<___;
389.section ".rodata",#alloc
390
391.align 64
392K${label}:
393.type K${label},#object
394___
395if ($SZ==4) {
396$code.=<<___;
397 .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
398 .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
399 .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
400 .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
401 .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
402 .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
403 .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
404 .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
405 .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
406 .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
407 .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
408 .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
409 .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
410 .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
411 .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
412 .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
413___
414} else {
415$code.=<<___;
416 .long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
417 .long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
418 .long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
419 .long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
420 .long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
421 .long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
422 .long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
423 .long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
424 .long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
425 .long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
426 .long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
427 .long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
428 .long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
429 .long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
430 .long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
431 .long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
432 .long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
433 .long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
434 .long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
435 .long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
436 .long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
437 .long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
438 .long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
439 .long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
440 .long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
441 .long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
442 .long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
443 .long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
444 .long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
445 .long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
446 .long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
447 .long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
448 .long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
449 .long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
450 .long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
451 .long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
452 .long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
453 .long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
454 .long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
455 .long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
456___
457}
458$code.=<<___;
459.size K${label},.-K${label}
460
461.section ".text",#alloc,#execinstr
462.globl sha${label}_block_data_order
463sha${label}_block_data_order:
464 save %sp,`-$frame-$locals`,%sp
465#ifdef __PIC__
466 sethi %hi(_GLOBAL_OFFSET_TABLE_-4), %o5
467 rd %pc, %o4
468 or %o5, %lo(_GLOBAL_OFFSET_TABLE_+4), %o5
469 add %o5, %o4, %o5
470#endif
471 and $inp,`$align-1`,$tmp31
472 sllx $len,`log(16*$SZ)/log(2)`,$len
473 andn $inp,`$align-1`,$inp
474 sll $tmp31,3,$tmp31
475 add $inp,$len,$len
476___
477$code.=<<___ if ($SZ==8); # SHA512
478 mov 32,$tmp32
479 sub $tmp32,$tmp31,$tmp32
480___
481$code.=<<___;
482#ifdef __PIC__
483 set K${label}, $Ktbl
484 ldx [$Ktbl+%o5], $Ktbl
485#else
486 set K${label}, $Ktbl
487#endif
488
489 $LD [$ctx+`0*$SZ`],$A
490 $LD [$ctx+`1*$SZ`],$B
491 $LD [$ctx+`2*$SZ`],$C
492 $LD [$ctx+`3*$SZ`],$D
493 $LD [$ctx+`4*$SZ`],$E
494 $LD [$ctx+`5*$SZ`],$F
495 $LD [$ctx+`6*$SZ`],$G
496 $LD [$ctx+`7*$SZ`],$H
497
498.Lloop:
499___
500for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
501$code.=".L16_xx:\n";
502for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
503$code.=<<___;
504 and $tmp2,0xfff,$tmp2
505 cmp $tmp2,$lastK
506 bne .L16_xx
507 add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
508
509___
510$code.=<<___ if ($SZ==4); # SHA256
511 $LD [$ctx+`0*$SZ`],@X[0]
512 $LD [$ctx+`1*$SZ`],@X[1]
513 $LD [$ctx+`2*$SZ`],@X[2]
514 $LD [$ctx+`3*$SZ`],@X[3]
515 $LD [$ctx+`4*$SZ`],@X[4]
516 $LD [$ctx+`5*$SZ`],@X[5]
517 $LD [$ctx+`6*$SZ`],@X[6]
518 $LD [$ctx+`7*$SZ`],@X[7]
519
520 add $A,@X[0],$A
521 $ST $A,[$ctx+`0*$SZ`]
522 add $B,@X[1],$B
523 $ST $B,[$ctx+`1*$SZ`]
524 add $C,@X[2],$C
525 $ST $C,[$ctx+`2*$SZ`]
526 add $D,@X[3],$D
527 $ST $D,[$ctx+`3*$SZ`]
528 add $E,@X[4],$E
529 $ST $E,[$ctx+`4*$SZ`]
530 add $F,@X[5],$F
531 $ST $F,[$ctx+`5*$SZ`]
532 add $G,@X[6],$G
533 $ST $G,[$ctx+`6*$SZ`]
534 add $H,@X[7],$H
535 $ST $H,[$ctx+`7*$SZ`]
536___
537$code.=<<___ if ($SZ==8); # SHA512
538 ld [$ctx+`0*$SZ+0`],%l0
539 ld [$ctx+`0*$SZ+4`],%l1
540 ld [$ctx+`1*$SZ+0`],%l2
541 ld [$ctx+`1*$SZ+4`],%l3
542 ld [$ctx+`2*$SZ+0`],%l4
543 ld [$ctx+`2*$SZ+4`],%l5
544 ld [$ctx+`3*$SZ+0`],%l6
545
546 sllx %l0,32,$tmp0
547 ld [$ctx+`3*$SZ+4`],%l7
548 sllx %l2,32,$tmp1
549 or %l1,$tmp0,$tmp0
550 or %l3,$tmp1,$tmp1
551 add $tmp0,$A,$A
552 add $tmp1,$B,$B
553 $ST $A,[$ctx+`0*$SZ`]
554 sllx %l4,32,$tmp2
555 $ST $B,[$ctx+`1*$SZ`]
556 sllx %l6,32,$T1
557 or %l5,$tmp2,$tmp2
558 or %l7,$T1,$T1
559 add $tmp2,$C,$C
560 $ST $C,[$ctx+`2*$SZ`]
561 add $T1,$D,$D
562 $ST $D,[$ctx+`3*$SZ`]
563
564 ld [$ctx+`4*$SZ+0`],%l0
565 ld [$ctx+`4*$SZ+4`],%l1
566 ld [$ctx+`5*$SZ+0`],%l2
567 ld [$ctx+`5*$SZ+4`],%l3
568 ld [$ctx+`6*$SZ+0`],%l4
569 ld [$ctx+`6*$SZ+4`],%l5
570 ld [$ctx+`7*$SZ+0`],%l6
571
572 sllx %l0,32,$tmp0
573 ld [$ctx+`7*$SZ+4`],%l7
574 sllx %l2,32,$tmp1
575 or %l1,$tmp0,$tmp0
576 or %l3,$tmp1,$tmp1
577 add $tmp0,$E,$E
578 add $tmp1,$F,$F
579 $ST $E,[$ctx+`4*$SZ`]
580 sllx %l4,32,$tmp2
581 $ST $F,[$ctx+`5*$SZ`]
582 sllx %l6,32,$T1
583 or %l5,$tmp2,$tmp2
584 or %l7,$T1,$T1
585 add $tmp2,$G,$G
586 $ST $G,[$ctx+`6*$SZ`]
587 add $T1,$H,$H
588 $ST $H,[$ctx+`7*$SZ`]
589___
590$code.=<<___;
591 add $inp,`16*$SZ`,$inp ! advance inp
592 cmp $inp,$len
593 bne `$bits==64?"%xcc":"%icc"`,.Lloop
594 sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
595
596 ret
597 restore
598.type sha${label}_block_data_order,#function
599.size sha${label}_block_data_order,(.-sha${label}_block_data_order)
600___
601
602$code =~ s/\`([^\`]*)\`/eval $1/gem;
603print $code;
604close STDOUT;
diff --git a/src/lib/libcrypto/sha/sha.h b/src/lib/libcrypto/sha/sha.h
deleted file mode 100644
index ec97f48b2e..0000000000
--- a/src/lib/libcrypto/sha/sha.h
+++ /dev/null
@@ -1,190 +0,0 @@
1/* $OpenBSD: sha.h,v 1.26 2025/01/25 17:59:44 tb Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stddef.h>
60
61#ifndef HEADER_SHA_H
62#define HEADER_SHA_H
63#if !defined(HAVE_ATTRIBUTE__BOUNDED__) && !defined(__OpenBSD__)
64#define __bounded__(x, y, z)
65#endif
66
67#include <openssl/opensslconf.h>
68
69#ifdef __cplusplus
70extern "C" {
71#endif
72
73/*
74 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
75 * ! SHA_LONG has to be at least 32 bits wide. !
76 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
77 */
78
79#define SHA_LONG unsigned int
80
81#define SHA_LBLOCK 16
82#define SHA_CBLOCK (SHA_LBLOCK*4) /* SHA treats input data as a
83 * contiguous array of 32 bit
84 * wide big-endian values. */
85#define SHA_LAST_BLOCK (SHA_CBLOCK-8)
86#define SHA_DIGEST_LENGTH 20
87
88typedef struct SHAstate_st {
89 SHA_LONG h0, h1, h2, h3, h4;
90 SHA_LONG Nl, Nh;
91 SHA_LONG data[SHA_LBLOCK];
92 unsigned int num;
93} SHA_CTX;
94
95#ifndef OPENSSL_NO_SHA1
96int SHA1_Init(SHA_CTX *c);
97int SHA1_Update(SHA_CTX *c, const void *data, size_t len)
98 __attribute__ ((__bounded__(__buffer__, 2, 3)));
99int SHA1_Final(unsigned char *md, SHA_CTX *c);
100unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md)
101 __attribute__ ((__bounded__(__buffer__, 1, 2)))
102 __attribute__ ((__nonnull__(3)));
103void SHA1_Transform(SHA_CTX *c, const unsigned char *data);
104#endif
105
106#define SHA256_CBLOCK (SHA_LBLOCK*4) /* SHA-256 treats input data as a
107 * contiguous array of 32 bit
108 * wide big-endian values. */
109#define SHA224_DIGEST_LENGTH 28
110#define SHA256_DIGEST_LENGTH 32
111
112typedef struct SHA256state_st {
113 SHA_LONG h[8];
114 SHA_LONG Nl, Nh;
115 SHA_LONG data[SHA_LBLOCK];
116 unsigned int num, md_len;
117} SHA256_CTX;
118
119#ifndef OPENSSL_NO_SHA256
120int SHA224_Init(SHA256_CTX *c);
121int SHA224_Update(SHA256_CTX *c, const void *data, size_t len)
122 __attribute__ ((__bounded__(__buffer__, 2, 3)));
123int SHA224_Final(unsigned char *md, SHA256_CTX *c);
124unsigned char *SHA224(const unsigned char *d, size_t n, unsigned char *md)
125 __attribute__ ((__bounded__(__buffer__, 1, 2)))
126 __attribute__ ((__nonnull__(3)));
127int SHA256_Init(SHA256_CTX *c);
128int SHA256_Update(SHA256_CTX *c, const void *data, size_t len)
129 __attribute__ ((__bounded__(__buffer__, 2, 3)));
130int SHA256_Final(unsigned char *md, SHA256_CTX *c);
131unsigned char *SHA256(const unsigned char *d, size_t n, unsigned char *md)
132 __attribute__ ((__bounded__(__buffer__, 1, 2)))
133 __attribute__ ((__nonnull__(3)));
134void SHA256_Transform(SHA256_CTX *c, const unsigned char *data);
135#endif
136
137#define SHA384_DIGEST_LENGTH 48
138#define SHA512_DIGEST_LENGTH 64
139
140#ifndef OPENSSL_NO_SHA512
141/*
142 * Unlike 32-bit digest algorithms, SHA-512 *relies* on SHA_LONG64
143 * being exactly 64-bit wide. See Implementation Notes in sha512.c
144 * for further details.
145 */
146#define SHA512_CBLOCK (SHA_LBLOCK*8) /* SHA-512 treats input data as a
147 * contiguous array of 64 bit
148 * wide big-endian values. */
149#if defined(_LP64)
150#define SHA_LONG64 unsigned long
151#define U64(C) C##UL
152#else
153#define SHA_LONG64 unsigned long long
154#define U64(C) C##ULL
155#endif
156
157typedef struct SHA512state_st {
158 SHA_LONG64 h[8];
159 SHA_LONG64 Nl, Nh;
160 union {
161 SHA_LONG64 d[SHA_LBLOCK];
162 unsigned char p[SHA512_CBLOCK];
163 } u;
164 unsigned int num, md_len;
165} SHA512_CTX;
166#endif
167
168#ifndef OPENSSL_NO_SHA512
169int SHA384_Init(SHA512_CTX *c);
170int SHA384_Update(SHA512_CTX *c, const void *data, size_t len)
171 __attribute__ ((__bounded__(__buffer__, 2, 3)));
172int SHA384_Final(unsigned char *md, SHA512_CTX *c);
173unsigned char *SHA384(const unsigned char *d, size_t n, unsigned char *md)
174 __attribute__ ((__bounded__(__buffer__, 1, 2)))
175 __attribute__ ((__nonnull__(3)));
176int SHA512_Init(SHA512_CTX *c);
177int SHA512_Update(SHA512_CTX *c, const void *data, size_t len)
178 __attribute__ ((__bounded__(__buffer__, 2, 3)));
179int SHA512_Final(unsigned char *md, SHA512_CTX *c);
180unsigned char *SHA512(const unsigned char *d, size_t n, unsigned char *md)
181 __attribute__ ((__bounded__(__buffer__, 1, 2)))
182 __attribute__ ((__nonnull__(3)));
183void SHA512_Transform(SHA512_CTX *c, const unsigned char *data);
184#endif
185
186#ifdef __cplusplus
187}
188#endif
189
190#endif
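A minimal usage sketch of the SHA-256 interfaces declared above, showing both the one-shot and streaming forms (the message is illustrative):

    #include <stdio.h>

    #include <openssl/sha.h>

    int
    main(void)
    {
    	const unsigned char msg[] = "abc";
    	size_t n = sizeof(msg) - 1;
    	unsigned char md[SHA256_DIGEST_LENGTH];
    	SHA256_CTX ctx;
    	int i;

    	SHA256(msg, n, md);			/* one-shot */

    	SHA256_Init(&ctx);			/* streaming equivalent */
    	SHA256_Update(&ctx, msg, n);
    	SHA256_Final(md, &ctx);

    	for (i = 0; i < SHA256_DIGEST_LENGTH; i++)
    		printf("%02x", md[i]);
    	printf("\n");
    	return 0;
    }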
diff --git a/src/lib/libcrypto/sha/sha1.c b/src/lib/libcrypto/sha/sha1.c
deleted file mode 100644
index ab05709818..0000000000
--- a/src/lib/libcrypto/sha/sha1.c
+++ /dev/null
@@ -1,518 +0,0 @@
1/* $OpenBSD: sha1.c,v 1.16 2025/02/14 12:01:58 jsing Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdlib.h>
60#include <string.h>
61
62#include <openssl/opensslconf.h>
63
64#include <openssl/crypto.h>
65#include <openssl/sha.h>
66
67#include "crypto_internal.h"
68
69#if !defined(OPENSSL_NO_SHA1) && !defined(OPENSSL_NO_SHA)
70
71/* Ensure that SHA_LONG and uint32_t are equivalent sizes. */
72CTASSERT(sizeof(SHA_LONG) == sizeof(uint32_t));
73
74void sha1_block_data_order(SHA_CTX *ctx, const void *p, size_t num);
75void sha1_block_generic(SHA_CTX *ctx, const void *p, size_t num);
76
77#ifndef HAVE_SHA1_BLOCK_GENERIC
78static inline SHA_LONG
79Ch(SHA_LONG x, SHA_LONG y, SHA_LONG z)
80{
81 return (x & y) ^ (~x & z);
82}
83
84static inline SHA_LONG
85Parity(SHA_LONG x, SHA_LONG y, SHA_LONG z)
86{
87 return x ^ y ^ z;
88}
89
90static inline SHA_LONG
91Maj(SHA_LONG x, SHA_LONG y, SHA_LONG z)
92{
93 return (x & y) ^ (x & z) ^ (y & z);
94}
95
96static inline void
97sha1_msg_schedule_update(SHA_LONG *W0, SHA_LONG W2, SHA_LONG W8, SHA_LONG W13)
98{
99 *W0 = crypto_rol_u32(W13 ^ W8 ^ W2 ^ *W0, 1);
100}
101
102static inline void
103sha1_round1(SHA_LONG *a, SHA_LONG *b, SHA_LONG *c, SHA_LONG *d, SHA_LONG *e,
104 SHA_LONG Wt)
105{
106 SHA_LONG Kt, T;
107
108 Kt = 0x5a827999UL;
109 T = crypto_rol_u32(*a, 5) + Ch(*b, *c, *d) + *e + Kt + Wt;
110
111 *e = *d;
112 *d = *c;
113 *c = crypto_rol_u32(*b, 30);
114 *b = *a;
115 *a = T;
116}
117
118static inline void
119sha1_round2(SHA_LONG *a, SHA_LONG *b, SHA_LONG *c, SHA_LONG *d, SHA_LONG *e,
120 SHA_LONG Wt)
121{
122 SHA_LONG Kt, T;
123
124 Kt = 0x6ed9eba1UL;
125 T = crypto_rol_u32(*a, 5) + Parity(*b, *c, *d) + *e + Kt + Wt;
126
127 *e = *d;
128 *d = *c;
129 *c = crypto_rol_u32(*b, 30);
130 *b = *a;
131 *a = T;
132}
133
134static inline void
135sha1_round3(SHA_LONG *a, SHA_LONG *b, SHA_LONG *c, SHA_LONG *d, SHA_LONG *e,
136 SHA_LONG Wt)
137{
138 SHA_LONG Kt, T;
139
140 Kt = 0x8f1bbcdcUL;
141 T = crypto_rol_u32(*a, 5) + Maj(*b, *c, *d) + *e + Kt + Wt;
142
143 *e = *d;
144 *d = *c;
145 *c = crypto_rol_u32(*b, 30);
146 *b = *a;
147 *a = T;
148}
149
150static inline void
151sha1_round4(SHA_LONG *a, SHA_LONG *b, SHA_LONG *c, SHA_LONG *d, SHA_LONG *e,
152 SHA_LONG Wt)
153{
154 SHA_LONG Kt, T;
155
156 Kt = 0xca62c1d6UL;
157 T = crypto_rol_u32(*a, 5) + Parity(*b, *c, *d) + *e + Kt + Wt;
158
159 *e = *d;
160 *d = *c;
161 *c = crypto_rol_u32(*b, 30);
162 *b = *a;
163 *a = T;
164}
165
166void
167sha1_block_generic(SHA_CTX *ctx, const void *_in, size_t num)
168{
169 const uint8_t *in = _in;
170 const SHA_LONG *in32;
171 unsigned int a, b, c, d, e;
172 unsigned int X0, X1, X2, X3, X4, X5, X6, X7,
173 X8, X9, X10, X11, X12, X13, X14, X15;
174
175 while (num--) {
176 a = ctx->h0;
177 b = ctx->h1;
178 c = ctx->h2;
179 d = ctx->h3;
180 e = ctx->h4;
181
182 if ((size_t)in % 4 == 0) {
183 /* Input is 32 bit aligned. */
184 in32 = (const SHA_LONG *)in;
185 X0 = be32toh(in32[0]);
186 X1 = be32toh(in32[1]);
187 X2 = be32toh(in32[2]);
188 X3 = be32toh(in32[3]);
189 X4 = be32toh(in32[4]);
190 X5 = be32toh(in32[5]);
191 X6 = be32toh(in32[6]);
192 X7 = be32toh(in32[7]);
193 X8 = be32toh(in32[8]);
194 X9 = be32toh(in32[9]);
195 X10 = be32toh(in32[10]);
196 X11 = be32toh(in32[11]);
197 X12 = be32toh(in32[12]);
198 X13 = be32toh(in32[13]);
199 X14 = be32toh(in32[14]);
200 X15 = be32toh(in32[15]);
201 } else {
202 /* Input is not 32 bit aligned. */
203 X0 = crypto_load_be32toh(&in[0 * 4]);
204 X1 = crypto_load_be32toh(&in[1 * 4]);
205 X2 = crypto_load_be32toh(&in[2 * 4]);
206 X3 = crypto_load_be32toh(&in[3 * 4]);
207 X4 = crypto_load_be32toh(&in[4 * 4]);
208 X5 = crypto_load_be32toh(&in[5 * 4]);
209 X6 = crypto_load_be32toh(&in[6 * 4]);
210 X7 = crypto_load_be32toh(&in[7 * 4]);
211 X8 = crypto_load_be32toh(&in[8 * 4]);
212 X9 = crypto_load_be32toh(&in[9 * 4]);
213 X10 = crypto_load_be32toh(&in[10 * 4]);
214 X11 = crypto_load_be32toh(&in[11 * 4]);
215 X12 = crypto_load_be32toh(&in[12 * 4]);
216 X13 = crypto_load_be32toh(&in[13 * 4]);
217 X14 = crypto_load_be32toh(&in[14 * 4]);
218 X15 = crypto_load_be32toh(&in[15 * 4]);
219 }
220 in += SHA_CBLOCK;
221
222 sha1_round1(&a, &b, &c, &d, &e, X0);
223 sha1_round1(&a, &b, &c, &d, &e, X1);
224 sha1_round1(&a, &b, &c, &d, &e, X2);
225 sha1_round1(&a, &b, &c, &d, &e, X3);
226 sha1_round1(&a, &b, &c, &d, &e, X4);
227 sha1_round1(&a, &b, &c, &d, &e, X5);
228 sha1_round1(&a, &b, &c, &d, &e, X6);
229 sha1_round1(&a, &b, &c, &d, &e, X7);
230 sha1_round1(&a, &b, &c, &d, &e, X8);
231 sha1_round1(&a, &b, &c, &d, &e, X9);
232 sha1_round1(&a, &b, &c, &d, &e, X10);
233 sha1_round1(&a, &b, &c, &d, &e, X11);
234 sha1_round1(&a, &b, &c, &d, &e, X12);
235 sha1_round1(&a, &b, &c, &d, &e, X13);
236 sha1_round1(&a, &b, &c, &d, &e, X14);
237 sha1_round1(&a, &b, &c, &d, &e, X15);
238
239 sha1_msg_schedule_update(&X0, X2, X8, X13);
240 sha1_msg_schedule_update(&X1, X3, X9, X14);
241 sha1_msg_schedule_update(&X2, X4, X10, X15);
242 sha1_msg_schedule_update(&X3, X5, X11, X0);
243 sha1_msg_schedule_update(&X4, X6, X12, X1);
244 sha1_msg_schedule_update(&X5, X7, X13, X2);
245 sha1_msg_schedule_update(&X6, X8, X14, X3);
246 sha1_msg_schedule_update(&X7, X9, X15, X4);
247 sha1_msg_schedule_update(&X8, X10, X0, X5);
248 sha1_msg_schedule_update(&X9, X11, X1, X6);
249 sha1_msg_schedule_update(&X10, X12, X2, X7);
250 sha1_msg_schedule_update(&X11, X13, X3, X8);
251 sha1_msg_schedule_update(&X12, X14, X4, X9);
252 sha1_msg_schedule_update(&X13, X15, X5, X10);
253 sha1_msg_schedule_update(&X14, X0, X6, X11);
254 sha1_msg_schedule_update(&X15, X1, X7, X12);
255
256 sha1_round1(&a, &b, &c, &d, &e, X0);
257 sha1_round1(&a, &b, &c, &d, &e, X1);
258 sha1_round1(&a, &b, &c, &d, &e, X2);
259 sha1_round1(&a, &b, &c, &d, &e, X3);
260 sha1_round2(&a, &b, &c, &d, &e, X4);
261 sha1_round2(&a, &b, &c, &d, &e, X5);
262 sha1_round2(&a, &b, &c, &d, &e, X6);
263 sha1_round2(&a, &b, &c, &d, &e, X7);
264 sha1_round2(&a, &b, &c, &d, &e, X8);
265 sha1_round2(&a, &b, &c, &d, &e, X9);
266 sha1_round2(&a, &b, &c, &d, &e, X10);
267 sha1_round2(&a, &b, &c, &d, &e, X11);
268 sha1_round2(&a, &b, &c, &d, &e, X12);
269 sha1_round2(&a, &b, &c, &d, &e, X13);
270 sha1_round2(&a, &b, &c, &d, &e, X14);
271 sha1_round2(&a, &b, &c, &d, &e, X15);
272
273 sha1_msg_schedule_update(&X0, X2, X8, X13);
274 sha1_msg_schedule_update(&X1, X3, X9, X14);
275 sha1_msg_schedule_update(&X2, X4, X10, X15);
276 sha1_msg_schedule_update(&X3, X5, X11, X0);
277 sha1_msg_schedule_update(&X4, X6, X12, X1);
278 sha1_msg_schedule_update(&X5, X7, X13, X2);
279 sha1_msg_schedule_update(&X6, X8, X14, X3);
280 sha1_msg_schedule_update(&X7, X9, X15, X4);
281 sha1_msg_schedule_update(&X8, X10, X0, X5);
282 sha1_msg_schedule_update(&X9, X11, X1, X6);
283 sha1_msg_schedule_update(&X10, X12, X2, X7);
284 sha1_msg_schedule_update(&X11, X13, X3, X8);
285 sha1_msg_schedule_update(&X12, X14, X4, X9);
286 sha1_msg_schedule_update(&X13, X15, X5, X10);
287 sha1_msg_schedule_update(&X14, X0, X6, X11);
288 sha1_msg_schedule_update(&X15, X1, X7, X12);
289
290 sha1_round2(&a, &b, &c, &d, &e, X0);
291 sha1_round2(&a, &b, &c, &d, &e, X1);
292 sha1_round2(&a, &b, &c, &d, &e, X2);
293 sha1_round2(&a, &b, &c, &d, &e, X3);
294 sha1_round2(&a, &b, &c, &d, &e, X4);
295 sha1_round2(&a, &b, &c, &d, &e, X5);
296 sha1_round2(&a, &b, &c, &d, &e, X6);
297 sha1_round2(&a, &b, &c, &d, &e, X7);
298 sha1_round3(&a, &b, &c, &d, &e, X8);
299 sha1_round3(&a, &b, &c, &d, &e, X9);
300 sha1_round3(&a, &b, &c, &d, &e, X10);
301 sha1_round3(&a, &b, &c, &d, &e, X11);
302 sha1_round3(&a, &b, &c, &d, &e, X12);
303 sha1_round3(&a, &b, &c, &d, &e, X13);
304 sha1_round3(&a, &b, &c, &d, &e, X14);
305 sha1_round3(&a, &b, &c, &d, &e, X15);
306
307 sha1_msg_schedule_update(&X0, X2, X8, X13);
308 sha1_msg_schedule_update(&X1, X3, X9, X14);
309 sha1_msg_schedule_update(&X2, X4, X10, X15);
310 sha1_msg_schedule_update(&X3, X5, X11, X0);
311 sha1_msg_schedule_update(&X4, X6, X12, X1);
312 sha1_msg_schedule_update(&X5, X7, X13, X2);
313 sha1_msg_schedule_update(&X6, X8, X14, X3);
314 sha1_msg_schedule_update(&X7, X9, X15, X4);
315 sha1_msg_schedule_update(&X8, X10, X0, X5);
316 sha1_msg_schedule_update(&X9, X11, X1, X6);
317 sha1_msg_schedule_update(&X10, X12, X2, X7);
318 sha1_msg_schedule_update(&X11, X13, X3, X8);
319 sha1_msg_schedule_update(&X12, X14, X4, X9);
320 sha1_msg_schedule_update(&X13, X15, X5, X10);
321 sha1_msg_schedule_update(&X14, X0, X6, X11);
322 sha1_msg_schedule_update(&X15, X1, X7, X12);
323
324 sha1_round3(&a, &b, &c, &d, &e, X0);
325 sha1_round3(&a, &b, &c, &d, &e, X1);
326 sha1_round3(&a, &b, &c, &d, &e, X2);
327 sha1_round3(&a, &b, &c, &d, &e, X3);
328 sha1_round3(&a, &b, &c, &d, &e, X4);
329 sha1_round3(&a, &b, &c, &d, &e, X5);
330 sha1_round3(&a, &b, &c, &d, &e, X6);
331 sha1_round3(&a, &b, &c, &d, &e, X7);
332 sha1_round3(&a, &b, &c, &d, &e, X8);
333 sha1_round3(&a, &b, &c, &d, &e, X9);
334 sha1_round3(&a, &b, &c, &d, &e, X10);
335 sha1_round3(&a, &b, &c, &d, &e, X11);
336 sha1_round4(&a, &b, &c, &d, &e, X12);
337 sha1_round4(&a, &b, &c, &d, &e, X13);
338 sha1_round4(&a, &b, &c, &d, &e, X14);
339 sha1_round4(&a, &b, &c, &d, &e, X15);
340
341 sha1_msg_schedule_update(&X0, X2, X8, X13);
342 sha1_msg_schedule_update(&X1, X3, X9, X14);
343 sha1_msg_schedule_update(&X2, X4, X10, X15);
344 sha1_msg_schedule_update(&X3, X5, X11, X0);
345 sha1_msg_schedule_update(&X4, X6, X12, X1);
346 sha1_msg_schedule_update(&X5, X7, X13, X2);
347 sha1_msg_schedule_update(&X6, X8, X14, X3);
348 sha1_msg_schedule_update(&X7, X9, X15, X4);
349 sha1_msg_schedule_update(&X8, X10, X0, X5);
350 sha1_msg_schedule_update(&X9, X11, X1, X6);
351 sha1_msg_schedule_update(&X10, X12, X2, X7);
352 sha1_msg_schedule_update(&X11, X13, X3, X8);
353 sha1_msg_schedule_update(&X12, X14, X4, X9);
354 sha1_msg_schedule_update(&X13, X15, X5, X10);
355 sha1_msg_schedule_update(&X14, X0, X6, X11);
356 sha1_msg_schedule_update(&X15, X1, X7, X12);
357
358 sha1_round4(&a, &b, &c, &d, &e, X0);
359 sha1_round4(&a, &b, &c, &d, &e, X1);
360 sha1_round4(&a, &b, &c, &d, &e, X2);
361 sha1_round4(&a, &b, &c, &d, &e, X3);
362 sha1_round4(&a, &b, &c, &d, &e, X4);
363 sha1_round4(&a, &b, &c, &d, &e, X5);
364 sha1_round4(&a, &b, &c, &d, &e, X6);
365 sha1_round4(&a, &b, &c, &d, &e, X7);
366 sha1_round4(&a, &b, &c, &d, &e, X8);
367 sha1_round4(&a, &b, &c, &d, &e, X9);
368 sha1_round4(&a, &b, &c, &d, &e, X10);
369 sha1_round4(&a, &b, &c, &d, &e, X11);
370 sha1_round4(&a, &b, &c, &d, &e, X12);
371 sha1_round4(&a, &b, &c, &d, &e, X13);
372 sha1_round4(&a, &b, &c, &d, &e, X14);
373 sha1_round4(&a, &b, &c, &d, &e, X15);
374
375 ctx->h0 += a;
376 ctx->h1 += b;
377 ctx->h2 += c;
378 ctx->h3 += d;
379 ctx->h4 += e;
380 }
381}
382#endif
383
384#ifndef HAVE_SHA1_BLOCK_DATA_ORDER
385void
386sha1_block_data_order(SHA_CTX *ctx, const void *_in, size_t num)
387{
388 sha1_block_generic(ctx, _in, num);
389}
390#endif
391
392int
393SHA1_Init(SHA_CTX *c)
394{
395 memset(c, 0, sizeof(*c));
396
397 c->h0 = 0x67452301UL;
398 c->h1 = 0xefcdab89UL;
399 c->h2 = 0x98badcfeUL;
400 c->h3 = 0x10325476UL;
401 c->h4 = 0xc3d2e1f0UL;
402
403 return 1;
404}
405LCRYPTO_ALIAS(SHA1_Init);
406
407int
408SHA1_Update(SHA_CTX *c, const void *data_, size_t len)
409{
410 const unsigned char *data = data_;
411 unsigned char *p;
412 SHA_LONG l;
413 size_t n;
414
415 if (len == 0)
416 return 1;
417
418	l = (c->Nl + (((SHA_LONG)len) << 3)) & 0xffffffffUL;
419	/* 95-05-24 eay Fixed a bug with the overflow handling, thanks to
420	 * Wei Dai <weidai@eskimo.com> for pointing it out. */
421	if (l < c->Nl) /* overflow */
422		c->Nh++;
423	c->Nh += (SHA_LONG)(len >> 29);	/* might cause compiler warning on 16-bit */
424 c->Nl = l;
425
426 n = c->num;
427 if (n != 0) {
428 p = (unsigned char *)c->data;
429
430 if (len >= SHA_CBLOCK || len + n >= SHA_CBLOCK) {
431 memcpy(p + n, data, SHA_CBLOCK - n);
432 sha1_block_data_order(c, p, 1);
433 n = SHA_CBLOCK - n;
434 data += n;
435 len -= n;
436 c->num = 0;
437			memset(p, 0, SHA_CBLOCK);	/* keep it zeroed */
438 } else {
439 memcpy(p + n, data, len);
440 c->num += (unsigned int)len;
441 return 1;
442 }
443 }
444
445 n = len/SHA_CBLOCK;
446 if (n > 0) {
447 sha1_block_data_order(c, data, n);
448 n *= SHA_CBLOCK;
449 data += n;
450 len -= n;
451 }
452
453 if (len != 0) {
454 p = (unsigned char *)c->data;
455 c->num = (unsigned int)len;
456 memcpy(p, data, len);
457 }
458 return 1;
459}
460LCRYPTO_ALIAS(SHA1_Update);
461
462void
463SHA1_Transform(SHA_CTX *c, const unsigned char *data)
464{
465 sha1_block_data_order(c, data, 1);
466}
467LCRYPTO_ALIAS(SHA1_Transform);
468
469int
470SHA1_Final(unsigned char *md, SHA_CTX *c)
471{
472 unsigned char *p = (unsigned char *)c->data;
473 size_t n = c->num;
474
475 p[n] = 0x80; /* there is always room for one */
476 n++;
477
478 if (n > (SHA_CBLOCK - 8)) {
479 memset(p + n, 0, SHA_CBLOCK - n);
480 n = 0;
481 sha1_block_data_order(c, p, 1);
482 }
483
484 memset(p + n, 0, SHA_CBLOCK - 8 - n);
485 c->data[SHA_LBLOCK - 2] = htobe32(c->Nh);
486 c->data[SHA_LBLOCK - 1] = htobe32(c->Nl);
487
488 sha1_block_data_order(c, p, 1);
489 c->num = 0;
490 memset(p, 0, SHA_CBLOCK);
491
492 crypto_store_htobe32(&md[0 * 4], c->h0);
493 crypto_store_htobe32(&md[1 * 4], c->h1);
494 crypto_store_htobe32(&md[2 * 4], c->h2);
495 crypto_store_htobe32(&md[3 * 4], c->h3);
496 crypto_store_htobe32(&md[4 * 4], c->h4);
497
498 return 1;
499}
500LCRYPTO_ALIAS(SHA1_Final);
501
502unsigned char *
503SHA1(const unsigned char *d, size_t n, unsigned char *md)
504{
505 SHA_CTX c;
506
507 if (!SHA1_Init(&c))
508 return NULL;
509 SHA1_Update(&c, d, n);
510 SHA1_Final(md, &c);
511
512 explicit_bzero(&c, sizeof(c));
513
514 return (md);
515}
516LCRYPTO_ALIAS(SHA1);
517
518#endif
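
A minimal usage sketch for the streaming interface above, assuming only the
<openssl/sha.h> API implemented in this file; splitting the input across two
SHA1_Update() calls exercises the partial-block buffering path:

	#include <stdio.h>
	#include <string.h>

	#include <openssl/sha.h>

	int
	main(void)
	{
		SHA_CTX ctx;
		unsigned char md[SHA_DIGEST_LENGTH];
		const char *msg = "abcdbcdecdefdefgefghfghighijhijk";
		size_t i;

		if (!SHA1_Init(&ctx))
			return 1;

		/* The second update lands mid-block and is buffered in ctx.data. */
		SHA1_Update(&ctx, msg, 10);
		SHA1_Update(&ctx, msg + 10, strlen(msg) - 10);
		SHA1_Final(md, &ctx);

		for (i = 0; i < sizeof(md); i++)
			printf("%02x", md[i]);
		printf("\n");

		return 0;
	}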
diff --git a/src/lib/libcrypto/sha/sha1_amd64.c b/src/lib/libcrypto/sha/sha1_amd64.c
deleted file mode 100644
index 2976cc7e6e..0000000000
--- a/src/lib/libcrypto/sha/sha1_amd64.c
+++ /dev/null
@@ -1,34 +0,0 @@
1/* $OpenBSD: sha1_amd64.c,v 1.2 2024/12/06 11:57:18 jsing Exp $ */
2/*
3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#include <openssl/sha.h>
19
20#include "crypto_arch.h"
21
22void sha1_block_generic(SHA_CTX *ctx, const void *in, size_t num);
23void sha1_block_shani(SHA_CTX *ctx, const void *in, size_t num);
24
25void
26sha1_block_data_order(SHA_CTX *ctx, const void *in, size_t num)
27{
28 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_SHA) != 0) {
29 sha1_block_shani(ctx, in, num);
30 return;
31 }
32
33 sha1_block_generic(ctx, in, num);
34}
diff --git a/src/lib/libcrypto/sha/sha1_amd64_generic.S b/src/lib/libcrypto/sha/sha1_amd64_generic.S
deleted file mode 100644
index 38f49b0c3c..0000000000
--- a/src/lib/libcrypto/sha/sha1_amd64_generic.S
+++ /dev/null
@@ -1,314 +0,0 @@
1/* $OpenBSD: sha1_amd64_generic.S,v 1.2 2025/01/18 02:56:07 jsing Exp $ */
2/*
3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#ifdef __CET__
19#include <cet.h>
20#else
21#define _CET_ENDBR
22#endif
23
24#define ctx %rdi
25#define in %rsi
26#define num %rdx
27
28#define end %rbp
29
30#define hs0 %r8d
31#define hs1 %r9d
32#define hs2 %r10d
33#define hs3 %r11d
34#define hs4 %r12d
35
36#define tmp0 %eax
37#define tmp1 %ebx
38#define tmp2 %ecx
39#define tmp3 %edx
40
41/*
42 * Load message into wt, storing a copy in the message schedule:
43 *
44 * Wt = Mt
45 */
46#define sha1_message_schedule_load(idx, m, w, wt) \
47 movl ((idx&0xf)*4)(m), wt; \
48 bswapl wt; \
49 movl wt, ((idx&0xf)*4)(w);
50
51/*
52 * Update message schedule and return current value in wt:
53 *
54 * W0 = rol(W13 ^ W8 ^ W2 ^ W0, 1)
55 */
56#define sha1_message_schedule_update(idx, w, wt) \
57 movl (((idx-3)&0xf)*4)(w), wt; /* W13 */ \
58 xorl (((idx-8)&0xf)*4)(w), wt; /* W8 */ \
59 xorl (((idx-14)&0xf)*4)(w), wt; /* W2 */ \
60 xorl (((idx)&0xf)*4)(w), wt; /* W0 */ \
61 roll $1, wt; \
62 \
63 movl wt, ((idx&0xf)*4)(w);
64
65/*
66 * Compute a SHA-1 round without logic function:
67 *
68 * T = rol(a, 5) + e + Kt + Wt
69 *
70 * The caller is required to compute the appropriate logic function
71 * (Ch, Maj, Parity) and add it to e.
72 *
73 * Upon completion b = rol(b, 30), e = T, pending rotation.
74 */
75#define sha1_round(a, b, c, d, e, kt, wt) \
76 leal kt(wt, e, 1), e; /* Kt + Wt */ \
77 \
78 movl a, tmp1; /* rol(a, 5) */ \
79 roll $5, tmp1; \
80 addl tmp1, e; \
81 \
82 roll $30, b; /* rol(b, 30) */
83
84/*
85 * Compute a SHA-1 round with Ch:
86 *
87 * T = rol(a, 5) + Ch(b, c, d) + e + Kt + Wt
88 *
89 * Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z
90 *
91 * Upon completion b = rol(b, 30), e = T, pending rotation.
92 */
93#define sha1_round_ch(a, b, c, d, e, kt, wt) \
94 movl c, tmp2; /* Ch */ \
95 xorl d, tmp2; /* Ch */ \
96 andl b, tmp2; /* Ch */ \
97 xorl d, tmp2; /* Ch */ \
98 addl tmp2, e; /* Ch */ \
99 \
100 sha1_round(a, b, c, d, e, kt, wt);
101
102/*
103 * Compute a SHA-1 round with Parity:
104 *
105 * T = rol(a, 5) + Parity(b, c, d) + e + Kt + Wt
106 *
107 * Parity(x, y, z) = x ^ y ^ z
108 *
109 * Upon completion b = rol(b, 30), e = T, pending rotation.
110 */
111#define sha1_round_parity(a, b, c, d, e, kt, wt) \
112 movl b, tmp2; /* Parity */ \
113 xorl c, tmp2; /* Parity */ \
114 xorl d, tmp2; /* Parity */ \
115 addl tmp2, e; /* Parity */ \
116 \
117 sha1_round(a, b, c, d, e, kt, wt);
118
119/*
120 * Compute a SHA-1 round with Maj:
121 *
122 * T = rol(a, 5) + Maj(b, c, d) + e + Kt + Wt
123 *
124 * Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z)
125 *
126 * Upon completion b = rol(b, 30), e = T, pending rotation.
127 */
128#define sha1_round_maj(a, b, c, d, e, kt, wt) \
129 movl c, tmp2; /* Maj */ \
130 xorl d, tmp2; /* Maj */ \
131 andl b, tmp2; /* Maj */ \
132 movl c, tmp3; /* Maj */ \
133 andl d, tmp3; /* Maj */ \
134 xorl tmp2, tmp3; /* Maj */ \
135 addl tmp3, e; /* Maj */ \
136 \
137 sha1_round(a, b, c, d, e, kt, wt);
138
139#define sha1_round1_load(idx, a, b, c, d, e) \
140 sha1_message_schedule_load(idx, in, %rsp, tmp0) \
141 sha1_round_ch(a, b, c, d, e, 0x5a827999, tmp0)
142
143#define sha1_round1_update(idx, a, b, c, d, e) \
144 sha1_message_schedule_update(idx, %rsp, tmp0) \
145 sha1_round_ch(a, b, c, d, e, 0x5a827999, tmp0)
146
147#define sha1_round2_update(idx, a, b, c, d, e) \
148 sha1_message_schedule_update(idx, %rsp, tmp0) \
149 sha1_round_parity(a, b, c, d, e, 0x6ed9eba1, tmp0)
150
151#define sha1_round3_update(idx, a, b, c, d, e) \
152 sha1_message_schedule_update(idx, %rsp, tmp0) \
153 sha1_round_maj(a, b, c, d, e, 0x8f1bbcdc, tmp0)
154
155#define sha1_round4_update(idx, a, b, c, d, e) \
156 sha1_message_schedule_update(idx, %rsp, tmp0) \
157 sha1_round_parity(a, b, c, d, e, 0xca62c1d6, tmp0)
158
159.text
160
161/*
162 * void sha1_block_generic(SHA_CTX *ctx, const void *in, size_t num);
163 *
164 * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
165 */
166.align 16
167.globl sha1_block_generic
168.type sha1_block_generic,@function
169sha1_block_generic:
170 _CET_ENDBR
171
172 /* Save callee save registers. */
173 pushq %rbx
174 pushq %rbp
175 pushq %r12
176
177	/* Allocate space for message schedule and saved stack pointer. */
178 movq %rsp, %rax
179 subq $(64+1*8), %rsp
180 andq $~63, %rsp
181 movq %rax, (64+0*8)(%rsp)
182
183 /* Compute end of message. */
184 shlq $6, num
185 leaq (in, num, 1), end
186
187 /* Load current hash state from context. */
188 movl (0*4)(ctx), hs0
189 movl (1*4)(ctx), hs1
190 movl (2*4)(ctx), hs2
191 movl (3*4)(ctx), hs3
192 movl (4*4)(ctx), hs4
193
194 jmp .Lblock_loop
195
196.align 16
197.Lblock_loop:
198
199 /* Round 0 through 15. */
200 sha1_round1_load(0, hs0, hs1, hs2, hs3, hs4)
201 sha1_round1_load(1, hs4, hs0, hs1, hs2, hs3)
202 sha1_round1_load(2, hs3, hs4, hs0, hs1, hs2)
203 sha1_round1_load(3, hs2, hs3, hs4, hs0, hs1)
204 sha1_round1_load(4, hs1, hs2, hs3, hs4, hs0)
205 sha1_round1_load(5, hs0, hs1, hs2, hs3, hs4)
206 sha1_round1_load(6, hs4, hs0, hs1, hs2, hs3)
207 sha1_round1_load(7, hs3, hs4, hs0, hs1, hs2)
208 sha1_round1_load(8, hs2, hs3, hs4, hs0, hs1)
209 sha1_round1_load(9, hs1, hs2, hs3, hs4, hs0)
210 sha1_round1_load(10, hs0, hs1, hs2, hs3, hs4)
211 sha1_round1_load(11, hs4, hs0, hs1, hs2, hs3)
212 sha1_round1_load(12, hs3, hs4, hs0, hs1, hs2)
213 sha1_round1_load(13, hs2, hs3, hs4, hs0, hs1)
214 sha1_round1_load(14, hs1, hs2, hs3, hs4, hs0)
215 sha1_round1_load(15, hs0, hs1, hs2, hs3, hs4)
216
217 /* Round 16 through 31. */
218 sha1_round1_update(16, hs4, hs0, hs1, hs2, hs3);
219 sha1_round1_update(17, hs3, hs4, hs0, hs1, hs2);
220 sha1_round1_update(18, hs2, hs3, hs4, hs0, hs1);
221 sha1_round1_update(19, hs1, hs2, hs3, hs4, hs0);
222 sha1_round2_update(20, hs0, hs1, hs2, hs3, hs4);
223 sha1_round2_update(21, hs4, hs0, hs1, hs2, hs3);
224 sha1_round2_update(22, hs3, hs4, hs0, hs1, hs2);
225 sha1_round2_update(23, hs2, hs3, hs4, hs0, hs1);
226 sha1_round2_update(24, hs1, hs2, hs3, hs4, hs0);
227 sha1_round2_update(25, hs0, hs1, hs2, hs3, hs4);
228 sha1_round2_update(26, hs4, hs0, hs1, hs2, hs3);
229 sha1_round2_update(27, hs3, hs4, hs0, hs1, hs2);
230 sha1_round2_update(28, hs2, hs3, hs4, hs0, hs1);
231 sha1_round2_update(29, hs1, hs2, hs3, hs4, hs0);
232 sha1_round2_update(30, hs0, hs1, hs2, hs3, hs4);
233 sha1_round2_update(31, hs4, hs0, hs1, hs2, hs3);
234
235 /* Round 32 through 47. */
236 sha1_round2_update(32, hs3, hs4, hs0, hs1, hs2);
237 sha1_round2_update(33, hs2, hs3, hs4, hs0, hs1);
238 sha1_round2_update(34, hs1, hs2, hs3, hs4, hs0);
239 sha1_round2_update(35, hs0, hs1, hs2, hs3, hs4);
240 sha1_round2_update(36, hs4, hs0, hs1, hs2, hs3);
241 sha1_round2_update(37, hs3, hs4, hs0, hs1, hs2);
242 sha1_round2_update(38, hs2, hs3, hs4, hs0, hs1);
243 sha1_round2_update(39, hs1, hs2, hs3, hs4, hs0);
244 sha1_round3_update(40, hs0, hs1, hs2, hs3, hs4);
245 sha1_round3_update(41, hs4, hs0, hs1, hs2, hs3);
246 sha1_round3_update(42, hs3, hs4, hs0, hs1, hs2);
247 sha1_round3_update(43, hs2, hs3, hs4, hs0, hs1);
248 sha1_round3_update(44, hs1, hs2, hs3, hs4, hs0);
249 sha1_round3_update(45, hs0, hs1, hs2, hs3, hs4);
250 sha1_round3_update(46, hs4, hs0, hs1, hs2, hs3);
251 sha1_round3_update(47, hs3, hs4, hs0, hs1, hs2);
252
253 /* Round 48 through 63. */
254 sha1_round3_update(48, hs2, hs3, hs4, hs0, hs1);
255 sha1_round3_update(49, hs1, hs2, hs3, hs4, hs0);
256 sha1_round3_update(50, hs0, hs1, hs2, hs3, hs4);
257 sha1_round3_update(51, hs4, hs0, hs1, hs2, hs3);
258 sha1_round3_update(52, hs3, hs4, hs0, hs1, hs2);
259 sha1_round3_update(53, hs2, hs3, hs4, hs0, hs1);
260 sha1_round3_update(54, hs1, hs2, hs3, hs4, hs0);
261 sha1_round3_update(55, hs0, hs1, hs2, hs3, hs4);
262 sha1_round3_update(56, hs4, hs0, hs1, hs2, hs3);
263 sha1_round3_update(57, hs3, hs4, hs0, hs1, hs2);
264 sha1_round3_update(58, hs2, hs3, hs4, hs0, hs1);
265 sha1_round3_update(59, hs1, hs2, hs3, hs4, hs0);
266 sha1_round4_update(60, hs0, hs1, hs2, hs3, hs4);
267 sha1_round4_update(61, hs4, hs0, hs1, hs2, hs3);
268 sha1_round4_update(62, hs3, hs4, hs0, hs1, hs2);
269 sha1_round4_update(63, hs2, hs3, hs4, hs0, hs1);
270
271 /* Round 64 through 79. */
272 sha1_round4_update(64, hs1, hs2, hs3, hs4, hs0);
273 sha1_round4_update(65, hs0, hs1, hs2, hs3, hs4);
274 sha1_round4_update(66, hs4, hs0, hs1, hs2, hs3);
275 sha1_round4_update(67, hs3, hs4, hs0, hs1, hs2);
276 sha1_round4_update(68, hs2, hs3, hs4, hs0, hs1);
277 sha1_round4_update(69, hs1, hs2, hs3, hs4, hs0);
278 sha1_round4_update(70, hs0, hs1, hs2, hs3, hs4);
279 sha1_round4_update(71, hs4, hs0, hs1, hs2, hs3);
280 sha1_round4_update(72, hs3, hs4, hs0, hs1, hs2);
281 sha1_round4_update(73, hs2, hs3, hs4, hs0, hs1);
282 sha1_round4_update(74, hs1, hs2, hs3, hs4, hs0);
283 sha1_round4_update(75, hs0, hs1, hs2, hs3, hs4);
284 sha1_round4_update(76, hs4, hs0, hs1, hs2, hs3);
285 sha1_round4_update(77, hs3, hs4, hs0, hs1, hs2);
286 sha1_round4_update(78, hs2, hs3, hs4, hs0, hs1);
287 sha1_round4_update(79, hs1, hs2, hs3, hs4, hs0);
288
289 /* Add intermediate state to hash state. */
290 addl (0*4)(ctx), hs0
291 addl (1*4)(ctx), hs1
292 addl (2*4)(ctx), hs2
293 addl (3*4)(ctx), hs3
294 addl (4*4)(ctx), hs4
295
296 /* Store new hash state to context. */
297 movl hs0, (0*4)(ctx)
298 movl hs1, (1*4)(ctx)
299 movl hs2, (2*4)(ctx)
300 movl hs3, (3*4)(ctx)
301 movl hs4, (4*4)(ctx)
302
303 addq $64, in
304 cmpq end, in
305 jb .Lblock_loop
306
307 movq (64+0*8)(%rsp), %rsp
308
309 /* Restore callee save registers. */
310 popq %r12
311 popq %rbp
312 popq %rbx
313
314 ret
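
The round macros above avoid shuffling the five state words between registers:
each round leaves e = T and b = rol(b, 30), and the caller rotates the register
roles on the next invocation instead of moving values. A hedged C rendition of
the same scheme (rol32 and round_ch are names invented for this sketch):

	#include <stdint.h>

	static inline uint32_t
	rol32(uint32_t x, int n)
	{
		return (x << n) | (x >> (32 - n));
	}

	/*
	 * One Ch round (K = 0x5a827999), mirroring sha1_round_ch. Callers
	 * rename the arguments each round rather than rotating the values:
	 *
	 *	round_ch(&a, &b, &c, &d, &e, w0);
	 *	round_ch(&e, &a, &b, &c, &d, w1);
	 *	round_ch(&d, &e, &a, &b, &c, w2);
	 */
	static inline void
	round_ch(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
	    uint32_t *e, uint32_t wt)
	{
		*e += rol32(*a, 5) + (((*c ^ *d) & *b) ^ *d) +
		    0x5a827999UL + wt;
		*b = rol32(*b, 30);
	}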
diff --git a/src/lib/libcrypto/sha/sha1_amd64_shani.S b/src/lib/libcrypto/sha/sha1_amd64_shani.S
deleted file mode 100644
index d7699d10f1..0000000000
--- a/src/lib/libcrypto/sha/sha1_amd64_shani.S
+++ /dev/null
@@ -1,170 +0,0 @@
1/* $OpenBSD: sha1_amd64_shani.S,v 1.1 2024/12/06 11:57:18 jsing Exp $ */
2/*
3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#ifdef __CET__
19#include <cet.h>
20#else
21#define _CET_ENDBR
22#endif
23
24/*
25 * SHA-1 implementation using the Intel SHA extensions:
26 *
27 * https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
28 */
29
30#define ctx %rdi
31#define in %rsi
32#define num %rdx
33
34#define end %rbx
35
36#define xabcd_save %xmm0
37#define xe_save %xmm1
38
39#define xabcd %xmm2
40#define xe0 %xmm3
41#define xe1 %xmm4
42
43#define xmsg0 %xmm5
44#define xmsg1 %xmm6
45#define xmsg2 %xmm7
46#define xmsg3 %xmm8
47
48#define xshufmask %xmm9
49
50
51#define sha1_message_schedule_load(idx, m, xmsg) \
52 movdqu (idx*16)(m), xmsg; \
53 pshufb xshufmask, xmsg;
54
55#define sha1_message_schedule_update(xm0, xm1, xm2, xm3) \
56 sha1msg1 xm1, xm0; \
57 pxor xm2, xm0; \
58 sha1msg2 xm3, xm0;
59
60#define sha1_shani_round(fn, xmsg, xe, xe_next) \
61 sha1nexte xmsg, xe; \
62 movdqa xabcd, xe_next; \
63 sha1rnds4 fn, xe, xabcd;
64
65#define sha1_shani_round_load(fn, idx, m, xmsg, xe, xe_next) \
66 sha1_message_schedule_load(idx, m, xmsg); \
67 sha1_shani_round(fn, xmsg, xe, xe_next);
68
69#define sha1_shani_round_update(fn, xm0, xm1, xm2, xm3, xe, xe_next) \
70 sha1_message_schedule_update(xm0, xm1, xm2, xm3); \
71 sha1_shani_round(fn, xm0, xe, xe_next);
72
73
74.text
75
76/*
77 * void sha1_block_shani(SHA_CTX *ctx, const void *in, size_t num);
78 *
79 * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
80 */
81.align 16
82.globl sha1_block_shani
83.type sha1_block_shani,@function
84sha1_block_shani:
85 _CET_ENDBR
86
87 /* Save callee save registers. */
88 pushq %rbx
89
90 /* Compute end of message. */
91 shlq $6, num
92 leaq (in, num, 1), end
93
94 /* Load endian shuffle mask. */
95 movdqa shufmask(%rip), xshufmask
96
97 /* Load current hash state from context. */
98 movdqu (0*16)(ctx), xabcd
99 pshufd $0x1b, xabcd, xabcd /* dcba -> abcd */
100 pxor xe0, xe0
101 pinsrd $3, (1*16)(ctx), xe0 /* e */
102
103 jmp .Lshani_block_loop
104
105.align 16
106.Lshani_block_loop:
107 /* Save state for accumulation. */
108 movdqa xabcd, xabcd_save
109 movdqa xe0, xe_save
110
111 /* Rounds 0 through 15 (four rounds at a time). */
112 sha1_message_schedule_load(0, in, xmsg0);
113 paddd xmsg0, xe0
114 movdqa xabcd, xe1
115 sha1rnds4 $0, xe0, xabcd
116
117 sha1_shani_round_load($0, 1, in, xmsg1, xe1, xe0);
118 sha1_shani_round_load($0, 2, in, xmsg2, xe0, xe1);
119 sha1_shani_round_load($0, 3, in, xmsg3, xe1, xe0);
120
121 /* Rounds 16 through 79 (four rounds at a time). */
122 sha1_shani_round_update($0, xmsg0, xmsg1, xmsg2, xmsg3, xe0, xe1)
123 sha1_shani_round_update($1, xmsg1, xmsg2, xmsg3, xmsg0, xe1, xe0)
124 sha1_shani_round_update($1, xmsg2, xmsg3, xmsg0, xmsg1, xe0, xe1)
125 sha1_shani_round_update($1, xmsg3, xmsg0, xmsg1, xmsg2, xe1, xe0)
126
127 sha1_shani_round_update($1, xmsg0, xmsg1, xmsg2, xmsg3, xe0, xe1)
128 sha1_shani_round_update($1, xmsg1, xmsg2, xmsg3, xmsg0, xe1, xe0)
129 sha1_shani_round_update($2, xmsg2, xmsg3, xmsg0, xmsg1, xe0, xe1)
130 sha1_shani_round_update($2, xmsg3, xmsg0, xmsg1, xmsg2, xe1, xe0)
131
132 sha1_shani_round_update($2, xmsg0, xmsg1, xmsg2, xmsg3, xe0, xe1)
133 sha1_shani_round_update($2, xmsg1, xmsg2, xmsg3, xmsg0, xe1, xe0)
134 sha1_shani_round_update($2, xmsg2, xmsg3, xmsg0, xmsg1, xe0, xe1)
135 sha1_shani_round_update($3, xmsg3, xmsg0, xmsg1, xmsg2, xe1, xe0)
136
137 sha1_shani_round_update($3, xmsg0, xmsg1, xmsg2, xmsg3, xe0, xe1)
138 sha1_shani_round_update($3, xmsg1, xmsg2, xmsg3, xmsg0, xe1, xe0)
139 sha1_shani_round_update($3, xmsg2, xmsg3, xmsg0, xmsg1, xe0, xe1)
140 sha1_shani_round_update($3, xmsg3, xmsg0, xmsg1, xmsg2, xe1, xe0)
141
142 /* Accumulate hash state. */
143 paddd xabcd_save, xabcd
144 sha1nexte xe_save, xe0
145
146 addq $64, in
147 cmpq end, in
148 jb .Lshani_block_loop
149
150 /* Update stored hash context. */
151 pshufd $0x1b, xabcd, xabcd /* abcd -> dcba */
152 movdqu xabcd, (0*16)(ctx)
153 pextrd $3, xe0, (1*16)(ctx) /* e */
154
155 /* Restore callee save registers. */
156 popq %rbx
157
158 ret
159
160.rodata
161
162/*
163 * Shuffle mask - byte reversal for little endian to big endian word conversion,
164 * and reordering to abcd.
165 */
166.align 16
167.type shufmask,@object
168shufmask:
169.octa 0x000102030405060708090a0b0c0d0e0f
170.size shufmask,.-shufmask
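
The same four-instruction pattern is reachable from C through the SHA-NI
intrinsics in <immintrin.h> (compiled with -msha); a rough sketch of the
message schedule update and the four-round step used above, with invented
helper names:

	#include <immintrin.h>

	/* W0:W3 = schedule update from W4:W7, W8:W11 and W12:W15. */
	static inline __m128i
	sha1_sched_update(__m128i m0, __m128i m1, __m128i m2, __m128i m3)
	{
		m0 = _mm_sha1msg1_epu32(m0, m1);	/* sha1msg1 */
		m0 = _mm_xor_si128(m0, m2);		/* pxor */
		return _mm_sha1msg2_epu32(m0, m3);	/* sha1msg2 */
	}

	/*
	 * Four rounds with logic function fn (0-3): fold e into the message
	 * words, stash the next e (the current abcd) and advance abcd.
	 */
	#define sha1_4rounds(abcd, e, e_next, msg, fn)			\
		do {							\
			(e) = _mm_sha1nexte_epu32((e), (msg));		\
			(e_next) = (abcd);				\
			(abcd) = _mm_sha1rnds4_epu32((abcd), (e), (fn)); \
		} while (0)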
diff --git a/src/lib/libcrypto/sha/sha256.c b/src/lib/libcrypto/sha/sha256.c
deleted file mode 100644
index 5d002ca62c..0000000000
--- a/src/lib/libcrypto/sha/sha256.c
+++ /dev/null
@@ -1,496 +0,0 @@
1/* $OpenBSD: sha256.c,v 1.33 2025/02/14 12:01:58 jsing Exp $ */
2/* ====================================================================
3 * Copyright (c) 1998-2011 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 * This product includes cryptographic software written by Eric Young
51 * (eay@cryptsoft.com). This product includes software written by Tim
52 * Hudson (tjh@cryptsoft.com).
53 */
54
55#include <endian.h>
56#include <stdlib.h>
57#include <string.h>
58
59#include <openssl/opensslconf.h>
60
61#include <openssl/crypto.h>
62#include <openssl/sha.h>
63
64#include "crypto_internal.h"
65
66#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA256)
67
68/* Ensure that SHA_LONG and uint32_t are equivalent. */
69CTASSERT(sizeof(SHA_LONG) == sizeof(uint32_t));
70
71void sha256_block_data_order(SHA256_CTX *ctx, const void *_in, size_t num);
72void sha256_block_generic(SHA256_CTX *ctx, const void *_in, size_t num);
73
74#ifndef HAVE_SHA256_BLOCK_GENERIC
75static const SHA_LONG K256[64] = {
76 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
77 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
78 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
79 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
80 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
81 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL,
82 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL,
83 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL,
84 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL,
85 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
86 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL,
87 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL,
88 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL,
89 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL,
90 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
91 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL,
92};
93
94static inline SHA_LONG
95Sigma0(SHA_LONG x)
96{
97 return crypto_ror_u32(x, 2) ^ crypto_ror_u32(x, 13) ^
98 crypto_ror_u32(x, 22);
99}
100
101static inline SHA_LONG
102Sigma1(SHA_LONG x)
103{
104 return crypto_ror_u32(x, 6) ^ crypto_ror_u32(x, 11) ^
105 crypto_ror_u32(x, 25);
106}
107
108static inline SHA_LONG
109sigma0(SHA_LONG x)
110{
111 return crypto_ror_u32(x, 7) ^ crypto_ror_u32(x, 18) ^ (x >> 3);
112}
113
114static inline SHA_LONG
115sigma1(SHA_LONG x)
116{
117 return crypto_ror_u32(x, 17) ^ crypto_ror_u32(x, 19) ^ (x >> 10);
118}
119
120static inline SHA_LONG
121Ch(SHA_LONG x, SHA_LONG y, SHA_LONG z)
122{
123 return (x & y) ^ (~x & z);
124}
125
126static inline SHA_LONG
127Maj(SHA_LONG x, SHA_LONG y, SHA_LONG z)
128{
129 return (x & y) ^ (x & z) ^ (y & z);
130}
131
132static inline void
133sha256_msg_schedule_update(SHA_LONG *W0, SHA_LONG W1, SHA_LONG W9, SHA_LONG W14)
134{
135 *W0 = sigma1(W14) + W9 + sigma0(W1) + *W0;
136}
137
138static inline void
139sha256_round(SHA_LONG *a, SHA_LONG *b, SHA_LONG *c, SHA_LONG *d, SHA_LONG *e,
140 SHA_LONG *f, SHA_LONG *g, SHA_LONG *h, SHA_LONG Kt, SHA_LONG Wt)
141{
142 SHA_LONG T1, T2;
143
144 T1 = *h + Sigma1(*e) + Ch(*e, *f, *g) + Kt + Wt;
145 T2 = Sigma0(*a) + Maj(*a, *b, *c);
146
147 *h = *g;
148 *g = *f;
149 *f = *e;
150 *e = *d + T1;
151 *d = *c;
152 *c = *b;
153 *b = *a;
154 *a = T1 + T2;
155}
156
157void
158sha256_block_generic(SHA256_CTX *ctx, const void *_in, size_t num)
159{
160 const uint8_t *in = _in;
161 const SHA_LONG *in32;
162 SHA_LONG a, b, c, d, e, f, g, h;
163 SHA_LONG X[16];
164 int i;
165
166 while (num--) {
167 a = ctx->h[0];
168 b = ctx->h[1];
169 c = ctx->h[2];
170 d = ctx->h[3];
171 e = ctx->h[4];
172 f = ctx->h[5];
173 g = ctx->h[6];
174 h = ctx->h[7];
175
176 if ((size_t)in % 4 == 0) {
177 /* Input is 32 bit aligned. */
178 in32 = (const SHA_LONG *)in;
179 X[0] = be32toh(in32[0]);
180 X[1] = be32toh(in32[1]);
181 X[2] = be32toh(in32[2]);
182 X[3] = be32toh(in32[3]);
183 X[4] = be32toh(in32[4]);
184 X[5] = be32toh(in32[5]);
185 X[6] = be32toh(in32[6]);
186 X[7] = be32toh(in32[7]);
187 X[8] = be32toh(in32[8]);
188 X[9] = be32toh(in32[9]);
189 X[10] = be32toh(in32[10]);
190 X[11] = be32toh(in32[11]);
191 X[12] = be32toh(in32[12]);
192 X[13] = be32toh(in32[13]);
193 X[14] = be32toh(in32[14]);
194 X[15] = be32toh(in32[15]);
195 } else {
196 /* Input is not 32 bit aligned. */
197 X[0] = crypto_load_be32toh(&in[0 * 4]);
198 X[1] = crypto_load_be32toh(&in[1 * 4]);
199 X[2] = crypto_load_be32toh(&in[2 * 4]);
200 X[3] = crypto_load_be32toh(&in[3 * 4]);
201 X[4] = crypto_load_be32toh(&in[4 * 4]);
202 X[5] = crypto_load_be32toh(&in[5 * 4]);
203 X[6] = crypto_load_be32toh(&in[6 * 4]);
204 X[7] = crypto_load_be32toh(&in[7 * 4]);
205 X[8] = crypto_load_be32toh(&in[8 * 4]);
206 X[9] = crypto_load_be32toh(&in[9 * 4]);
207 X[10] = crypto_load_be32toh(&in[10 * 4]);
208 X[11] = crypto_load_be32toh(&in[11 * 4]);
209 X[12] = crypto_load_be32toh(&in[12 * 4]);
210 X[13] = crypto_load_be32toh(&in[13 * 4]);
211 X[14] = crypto_load_be32toh(&in[14 * 4]);
212 X[15] = crypto_load_be32toh(&in[15 * 4]);
213 }
214 in += SHA256_CBLOCK;
215
216 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[0], X[0]);
217 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[1], X[1]);
218 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[2], X[2]);
219 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[3], X[3]);
220 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[4], X[4]);
221 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[5], X[5]);
222 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[6], X[6]);
223 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[7], X[7]);
224 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[8], X[8]);
225 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[9], X[9]);
226 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[10], X[10]);
227 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[11], X[11]);
228 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[12], X[12]);
229 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[13], X[13]);
230 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[14], X[14]);
231 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[15], X[15]);
232
233 for (i = 16; i < 64; i += 16) {
234 sha256_msg_schedule_update(&X[0], X[1], X[9], X[14]);
235 sha256_msg_schedule_update(&X[1], X[2], X[10], X[15]);
236 sha256_msg_schedule_update(&X[2], X[3], X[11], X[0]);
237 sha256_msg_schedule_update(&X[3], X[4], X[12], X[1]);
238 sha256_msg_schedule_update(&X[4], X[5], X[13], X[2]);
239 sha256_msg_schedule_update(&X[5], X[6], X[14], X[3]);
240 sha256_msg_schedule_update(&X[6], X[7], X[15], X[4]);
241 sha256_msg_schedule_update(&X[7], X[8], X[0], X[5]);
242 sha256_msg_schedule_update(&X[8], X[9], X[1], X[6]);
243 sha256_msg_schedule_update(&X[9], X[10], X[2], X[7]);
244 sha256_msg_schedule_update(&X[10], X[11], X[3], X[8]);
245 sha256_msg_schedule_update(&X[11], X[12], X[4], X[9]);
246 sha256_msg_schedule_update(&X[12], X[13], X[5], X[10]);
247 sha256_msg_schedule_update(&X[13], X[14], X[6], X[11]);
248 sha256_msg_schedule_update(&X[14], X[15], X[7], X[12]);
249 sha256_msg_schedule_update(&X[15], X[0], X[8], X[13]);
250
251 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[i + 0], X[0]);
252 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[i + 1], X[1]);
253 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[i + 2], X[2]);
254 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[i + 3], X[3]);
255 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[i + 4], X[4]);
256 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[i + 5], X[5]);
257 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[i + 6], X[6]);
258 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[i + 7], X[7]);
259 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[i + 8], X[8]);
260 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[i + 9], X[9]);
261 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[i + 10], X[10]);
262 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[i + 11], X[11]);
263 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[i + 12], X[12]);
264 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[i + 13], X[13]);
265 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[i + 14], X[14]);
266 sha256_round(&a, &b, &c, &d, &e, &f, &g, &h, K256[i + 15], X[15]);
267 }
268
269 ctx->h[0] += a;
270 ctx->h[1] += b;
271 ctx->h[2] += c;
272 ctx->h[3] += d;
273 ctx->h[4] += e;
274 ctx->h[5] += f;
275 ctx->h[6] += g;
276 ctx->h[7] += h;
277 }
278}
279#endif
280
281#ifndef HAVE_SHA256_BLOCK_DATA_ORDER
282void
283sha256_block_data_order(SHA256_CTX *ctx, const void *_in, size_t num)
284{
285 sha256_block_generic(ctx, _in, num);
286}
287#endif
288
289int
290SHA224_Init(SHA256_CTX *c)
291{
292 memset(c, 0, sizeof(*c));
293
294 c->h[0] = 0xc1059ed8UL;
295 c->h[1] = 0x367cd507UL;
296 c->h[2] = 0x3070dd17UL;
297 c->h[3] = 0xf70e5939UL;
298 c->h[4] = 0xffc00b31UL;
299 c->h[5] = 0x68581511UL;
300 c->h[6] = 0x64f98fa7UL;
301 c->h[7] = 0xbefa4fa4UL;
302
303 c->md_len = SHA224_DIGEST_LENGTH;
304
305 return 1;
306}
307LCRYPTO_ALIAS(SHA224_Init);
308
309int
310SHA224_Update(SHA256_CTX *c, const void *data, size_t len)
311{
312 return SHA256_Update(c, data, len);
313}
314LCRYPTO_ALIAS(SHA224_Update);
315
316int
317SHA224_Final(unsigned char *md, SHA256_CTX *c)
318{
319 return SHA256_Final(md, c);
320}
321LCRYPTO_ALIAS(SHA224_Final);
322
323unsigned char *
324SHA224(const unsigned char *d, size_t n, unsigned char *md)
325{
326 SHA256_CTX c;
327
328 SHA224_Init(&c);
329 SHA256_Update(&c, d, n);
330 SHA256_Final(md, &c);
331
332 explicit_bzero(&c, sizeof(c));
333
334 return (md);
335}
336LCRYPTO_ALIAS(SHA224);
337
338int
339SHA256_Init(SHA256_CTX *c)
340{
341 memset(c, 0, sizeof(*c));
342
343 c->h[0] = 0x6a09e667UL;
344 c->h[1] = 0xbb67ae85UL;
345 c->h[2] = 0x3c6ef372UL;
346 c->h[3] = 0xa54ff53aUL;
347 c->h[4] = 0x510e527fUL;
348 c->h[5] = 0x9b05688cUL;
349 c->h[6] = 0x1f83d9abUL;
350 c->h[7] = 0x5be0cd19UL;
351
352 c->md_len = SHA256_DIGEST_LENGTH;
353
354 return 1;
355}
356LCRYPTO_ALIAS(SHA256_Init);
357
358int
359SHA256_Update(SHA256_CTX *c, const void *data_, size_t len)
360{
361 const unsigned char *data = data_;
362 unsigned char *p;
363 SHA_LONG l;
364 size_t n;
365
366 if (len == 0)
367 return 1;
368
369 l = (c->Nl + (((SHA_LONG)len) << 3)) & 0xffffffffUL;
370 /* 95-05-24 eay Fixed a bug with the overflow handling, thanks to
371 * Wei Dai <weidai@eskimo.com> for pointing it out. */
372 if (l < c->Nl) /* overflow */
373 c->Nh++;
374 c->Nh += (SHA_LONG)(len >> 29); /* might cause compiler warning on 16-bit */
375 c->Nl = l;
376
377 n = c->num;
378 if (n != 0) {
379 p = (unsigned char *)c->data;
380
381 if (len >= SHA_CBLOCK || len + n >= SHA_CBLOCK) {
382 memcpy(p + n, data, SHA_CBLOCK - n);
383 sha256_block_data_order(c, p, 1);
384 n = SHA_CBLOCK - n;
385 data += n;
386 len -= n;
387 c->num = 0;
388 memset(p, 0, SHA_CBLOCK); /* keep it zeroed */
389 } else {
390 memcpy(p + n, data, len);
391 c->num += (unsigned int)len;
392 return 1;
393 }
394 }
395
396 n = len/SHA_CBLOCK;
397 if (n > 0) {
398 sha256_block_data_order(c, data, n);
399 n *= SHA_CBLOCK;
400 data += n;
401 len -= n;
402 }
403
404 if (len != 0) {
405 p = (unsigned char *)c->data;
406 c->num = (unsigned int)len;
407 memcpy(p, data, len);
408 }
409 return 1;
410}
411LCRYPTO_ALIAS(SHA256_Update);
412
413void
414SHA256_Transform(SHA256_CTX *c, const unsigned char *data)
415{
416 sha256_block_data_order(c, data, 1);
417}
418LCRYPTO_ALIAS(SHA256_Transform);
419
420int
421SHA256_Final(unsigned char *md, SHA256_CTX *c)
422{
423 unsigned char *p = (unsigned char *)c->data;
424 size_t n = c->num;
425 unsigned int nn;
426
427 p[n] = 0x80; /* there is always room for one */
428 n++;
429
430 if (n > (SHA_CBLOCK - 8)) {
431 memset(p + n, 0, SHA_CBLOCK - n);
432 n = 0;
433 sha256_block_data_order(c, p, 1);
434 }
435
436 memset(p + n, 0, SHA_CBLOCK - 8 - n);
437 c->data[SHA_LBLOCK - 2] = htobe32(c->Nh);
438 c->data[SHA_LBLOCK - 1] = htobe32(c->Nl);
439
440 sha256_block_data_order(c, p, 1);
441 c->num = 0;
442 memset(p, 0, SHA_CBLOCK);
443
444	/*
445	 * Note that FIPS 180-2 discusses "Truncation of the Hash Function Output";
446	 * the default: case below handles it. It is unclear whether truncation
447	 * to a byte count that is not divisible by 4 is permitted - probably
448	 * not - but if it is, the default: case will need to be extended.
449	 * The separate cases for the pre-defined lengths let the compiler
450	 * decide whether it is appropriate to unroll the small loops.
451	 */
452 switch (c->md_len) {
453 case SHA224_DIGEST_LENGTH:
454 for (nn = 0; nn < SHA224_DIGEST_LENGTH / 4; nn++) {
455 crypto_store_htobe32(md, c->h[nn]);
456 md += 4;
457 }
458 break;
459
460 case SHA256_DIGEST_LENGTH:
461 for (nn = 0; nn < SHA256_DIGEST_LENGTH / 4; nn++) {
462 crypto_store_htobe32(md, c->h[nn]);
463 md += 4;
464 }
465 break;
466
467 default:
468 if (c->md_len > SHA256_DIGEST_LENGTH)
469 return 0;
470 for (nn = 0; nn < c->md_len / 4; nn++) {
471 crypto_store_htobe32(md, c->h[nn]);
472 md += 4;
473 }
474 break;
475 }
476
477 return 1;
478}
479LCRYPTO_ALIAS(SHA256_Final);
480
481unsigned char *
482SHA256(const unsigned char *d, size_t n, unsigned char *md)
483{
484 SHA256_CTX c;
485
486 SHA256_Init(&c);
487 SHA256_Update(&c, d, n);
488 SHA256_Final(md, &c);
489
490 explicit_bzero(&c, sizeof(c));
491
492 return (md);
493}
494LCRYPTO_ALIAS(SHA256);
495
496#endif /* OPENSSL_NO_SHA256 */
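
For reference, the Nl/Nh bookkeeping in SHA256_Update() (and its SHA-1 twin
earlier in this diff) is simply a 64-bit message bit counter split across two
32-bit words. An equivalent single-counter sketch (add_bit_length is an
invented name):

	#include <stddef.h>
	#include <stdint.h>

	/* Equivalent to the Nl/Nh overflow handling in SHA256_Update(). */
	static void
	add_bit_length(uint32_t *Nh, uint32_t *Nl, size_t len)
	{
		uint64_t bits = ((uint64_t)*Nh << 32) | *Nl;

		bits += (uint64_t)len << 3;	/* len bytes = 8 * len bits */

		*Nh = (uint32_t)(bits >> 32);
		*Nl = (uint32_t)bits;
	}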
diff --git a/src/lib/libcrypto/sha/sha256_aarch64.c b/src/lib/libcrypto/sha/sha256_aarch64.c
deleted file mode 100644
index ecac64390d..0000000000
--- a/src/lib/libcrypto/sha/sha256_aarch64.c
+++ /dev/null
@@ -1,34 +0,0 @@
1/* $OpenBSD: sha256_aarch64.c,v 1.1 2025/03/07 14:21:22 jsing Exp $ */
2/*
3 * Copyright (c) 2025 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#include <openssl/sha.h>
19
20#include "crypto_arch.h"
21
22void sha256_block_ce(SHA256_CTX *ctx, const void *in, size_t num);
23void sha256_block_generic(SHA256_CTX *ctx, const void *in, size_t num);
24
25void
26sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num)
27{
28 if ((crypto_cpu_caps_aarch64 & CRYPTO_CPU_CAPS_AARCH64_SHA2) != 0) {
29 sha256_block_ce(ctx, in, num);
30 return;
31 }
32
33 sha256_block_generic(ctx, in, num);
34}
diff --git a/src/lib/libcrypto/sha/sha256_aarch64_ce.S b/src/lib/libcrypto/sha/sha256_aarch64_ce.S
deleted file mode 100644
index 15726827e6..0000000000
--- a/src/lib/libcrypto/sha/sha256_aarch64_ce.S
+++ /dev/null
@@ -1,189 +0,0 @@
1/* $OpenBSD: sha256_aarch64_ce.S,v 1.2 2025/03/12 12:53:33 jsing Exp $ */
2/*
3 * Copyright (c) 2023,2025 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18/*
19 * SHA-256 implementation using the ARM Cryptographic Extension (CE).
20 *
21 * There are four instructions that enable hardware acceleration of SHA-256,
22 * however the documentation for these is woefully inadequate:
23 *
24 * sha256h: hash update - part 1 (inconsistently left unnumbered)
25 * sha256h2: hash update - part 2
26 * sha256su0: message schedule update with sigma0 for four rounds
27 * sha256su1: message schedule update with sigma1 for four rounds
28 */
29
30#define ctx x0
31#define in x1
32#define num x2
33
34#define k256_base x9
35#define k256 x10
36
37/* Note: the lower 64 bits of v8 through v15 are callee save. */
38
39#define hc0 v16
40#define hc1 v17
41
42#define hs0 v18
43#define hs1 v19
44
45#define w0 v20
46#define w1 v21
47#define w2 v22
48#define w3 v23
49
50#define k0 v24
51#define k1 v25
52#define k2 v26
53#define k3 v27
54
55#define tmp0 v28
56#define tmp1 v29
57
58/*
59 * Update message schedule for m0 (W0:W1:W2:W3), using m1 (W4:W5:W6:W7),
60 * m2 (W8:W9:W10:W11) and m3 (W12:W13:W14:W15). The sha256su0 instruction
61 * computes the sigma0 component of the message schedule update as:
62 * W0:W1:W2:W3 = sigma0(W1:W2:W3:W4) + W0:W1:W2:W3
63 * while sha256su1 computes the sigma1 component and adds in W9 as:
64 * W0:W1:W2:W3 = sigma1(W14:W15:W0:W1) + W9:W10:W11:W12 + W0:W1:W2:W3
65 */
66#define sha256_message_schedule_update(m0, m1, m2, m3) \
67 sha256su0 m0.4s, m1.4s; \
68 sha256su1 m0.4s, m2.4s, m3.4s;
69
70/*
71 * Compute four SHA-256 rounds by adding W0:W1:W2:W3 + K0:K1:K2:K3, then
72 * computing the remainder of each round (including the shuffle) via
73 * sha256h/sha256h2.
74 */
75#define sha256_round(h0, h1, w, k) \
76 add tmp0.4s, w.4s, k.4s; /* Tt = Wt + Kt */ \
77 mov tmp1.4s, h0.4s; \
78 sha256h h0, h1, tmp0.4s; \
79 sha256h2 h1, tmp1, tmp0.4s;
80
81#define sha256_round_update(h0, h1, m0, m1, m2, m3, k) \
82 sha256_message_schedule_update(m0, m1, m2, m3) \
83 sha256_round(h0, h1, m0, k)
84
85.arch armv8-a+sha2
86
87.text
88
89/*
90 * void sha256_block_ce(SHA256_CTX *ctx, const void *in, size_t num);
91 *
92 * Standard ARM ABI: x0 = ctx, x1 = in, x2 = num
93 */
94.globl sha256_block_ce
95.type sha256_block_ce,@function
96sha256_block_ce:
97
98 /* Address of SHA-256 constants. */
99 adrp k256_base, K256
100 add k256_base, k256_base, :lo12:K256
101
102 /*
103 * Load current hash state from context.
104 * hc0 = a:b:c:d, hc1 = e:f:g:h
105 */
106 ld1 {hc0.4s, hc1.4s}, [ctx]
107
108block_loop:
109 mov k256, k256_base
110
111 /* Copy current hash state. */
112 mov hs0.4s, hc0.4s
113 mov hs1.4s, hc1.4s
114
115 /* Load and byte swap message schedule. */
116 ld1 {w0.16b, w1.16b, w2.16b, w3.16b}, [in], #64
117 rev32 w0.16b, w0.16b
118 rev32 w1.16b, w1.16b
119 rev32 w2.16b, w2.16b
120 rev32 w3.16b, w3.16b
121
122 /* Rounds 0 through 15 (four rounds at a time). */
123 ld1 {k0.4s, k1.4s, k2.4s, k3.4s}, [k256], #64
124
125 sha256_round(hs0, hs1, w0, k0)
126 sha256_round(hs0, hs1, w1, k1)
127 sha256_round(hs0, hs1, w2, k2)
128 sha256_round(hs0, hs1, w3, k3)
129
130 /* Rounds 16 through 31 (four rounds at a time). */
131 ld1 {k0.4s, k1.4s, k2.4s, k3.4s}, [k256], #64
132
133 sha256_round_update(hs0, hs1, w0, w1, w2, w3, k0)
134 sha256_round_update(hs0, hs1, w1, w2, w3, w0, k1)
135 sha256_round_update(hs0, hs1, w2, w3, w0, w1, k2)
136 sha256_round_update(hs0, hs1, w3, w0, w1, w2, k3)
137
138 /* Rounds 32 through 47 (four rounds at a time). */
139 ld1 {k0.4s, k1.4s, k2.4s, k3.4s}, [k256], #64
140
141 sha256_round_update(hs0, hs1, w0, w1, w2, w3, k0)
142 sha256_round_update(hs0, hs1, w1, w2, w3, w0, k1)
143 sha256_round_update(hs0, hs1, w2, w3, w0, w1, k2)
144 sha256_round_update(hs0, hs1, w3, w0, w1, w2, k3)
145
146 /* Rounds 48 through 63 (four rounds at a time). */
147 ld1 {k0.4s, k1.4s, k2.4s, k3.4s}, [k256], #64
148
149 sha256_round_update(hs0, hs1, w0, w1, w2, w3, k0)
150 sha256_round_update(hs0, hs1, w1, w2, w3, w0, k1)
151 sha256_round_update(hs0, hs1, w2, w3, w0, w1, k2)
152 sha256_round_update(hs0, hs1, w3, w0, w1, w2, k3)
153
154 /* Add intermediate state to hash state. */
155 add hc0.4s, hc0.4s, hs0.4s
156 add hc1.4s, hc1.4s, hs1.4s
157
158 sub num, num, #1
159 cbnz num, block_loop
160
161 /* Store hash state to context. */
162 st1 {hc0.4s, hc1.4s}, [ctx]
163
164 ret
165
166/*
167 * SHA-256 constants - see FIPS 180-4 section 4.2.3.
168 */
169.rodata
170.align 4
171.type K256,@object
172K256:
173.long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
174.long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
175.long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
176.long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
177.long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
178.long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
179.long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
180.long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
181.long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
182.long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
183.long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
184.long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
185.long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
186.long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
187.long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
188.long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
189.size K256,.-K256
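
The sha256_round macro above maps directly onto the ACLE intrinsics from
<arm_neon.h> (compiled with -march=armv8-a+sha2); a hedged C sketch of the
same four-round step, with an invented helper name:

	#include <arm_neon.h>

	/* Four SHA-256 rounds: hs0 = a:b:c:d, hs1 = e:f:g:h. */
	static inline void
	sha256_4rounds(uint32x4_t *hs0, uint32x4_t *hs1, uint32x4_t w,
	    uint32x4_t k)
	{
		uint32x4_t wk = vaddq_u32(w, k);	/* Tt = Wt + Kt */
		uint32x4_t hs0_prev = *hs0;

		*hs0 = vsha256hq_u32(*hs0, *hs1, wk);		/* sha256h */
		*hs1 = vsha256h2q_u32(*hs1, hs0_prev, wk);	/* sha256h2 */
	}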
diff --git a/src/lib/libcrypto/sha/sha256_amd64.c b/src/lib/libcrypto/sha/sha256_amd64.c
deleted file mode 100644
index 6c5d3e897f..0000000000
--- a/src/lib/libcrypto/sha/sha256_amd64.c
+++ /dev/null
@@ -1,34 +0,0 @@
1/* $OpenBSD: sha256_amd64.c,v 1.2 2024/11/16 15:31:36 jsing Exp $ */
2/*
3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#include <openssl/sha.h>
19
20#include "crypto_arch.h"
21
22void sha256_block_generic(SHA256_CTX *ctx, const void *in, size_t num);
23void sha256_block_shani(SHA256_CTX *ctx, const void *in, size_t num);
24
25void
26sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num)
27{
28 if ((crypto_cpu_caps_amd64 & CRYPTO_CPU_CAPS_AMD64_SHA) != 0) {
29 sha256_block_shani(ctx, in, num);
30 return;
31 }
32
33 sha256_block_generic(ctx, in, num);
34}
diff --git a/src/lib/libcrypto/sha/sha256_amd64_generic.S b/src/lib/libcrypto/sha/sha256_amd64_generic.S
deleted file mode 100644
index 166bce9ca8..0000000000
--- a/src/lib/libcrypto/sha/sha256_amd64_generic.S
+++ /dev/null
@@ -1,302 +0,0 @@
1/* $OpenBSD: sha256_amd64_generic.S,v 1.3 2024/11/16 12:34:16 jsing Exp $ */
2/*
3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#ifdef __CET__
19#include <cet.h>
20#else
21#define _CET_ENDBR
22#endif
23
24#define ctx %rdi
25#define in %rsi
26#define num %rdx
27
28#define round %rdi
29
30#define hs0 %r8d
31#define hs1 %r9d
32#define hs2 %r10d
33#define hs3 %r11d
34#define hs4 %r12d
35#define hs5 %r13d
36#define hs6 %r14d
37#define hs7 %r15d
38
39#define k256 %rbp
40
41#define tmp0 %eax
42#define tmp1 %ebx
43#define tmp2 %ecx
44#define tmp3 %edx
45
46/*
47 * Load message into wt, storing a copy in the message schedule:
48 *
49 * Wt = Mt
50 */
51#define sha256_message_schedule_load(idx, m, w, wt) \
52 movl (m, round, 4), wt; \
53 bswapl wt; \
54 movl wt, ((idx&0xf)*4)(w);
55
56/*
57 * Update message schedule and return current value in wt:
58 *
59 * Wt = sigma1(W(t-2)) + W(t-7) + sigma0(W(t-15)) + W(t-16)
60 *
61 * sigma0(x) = ror(x, 7) ^ ror(x, 18) ^ (x >> 3)
62 * sigma1(x) = ror(x, 17) ^ ror(x, 19) ^ (x >> 10)
63 */
64#define sha256_message_schedule_update(idx, w, wt) \
65 movl (((idx-2)&0xf)*4)(w), wt; /* sigma1 */ \
66 movl wt, tmp1; /* sigma1 */ \
67 rorl $(19-17), tmp1; /* sigma1 */ \
68 xorl wt, tmp1; /* sigma1 */ \
69 rorl $17, tmp1; /* sigma1 */ \
70 shrl $10, wt; /* sigma1 */ \
71 xorl tmp1, wt; /* sigma1 */ \
72 \
73 addl (((idx-7)&0xf)*4)(w), wt; /* Wt-7 */ \
74 addl (((idx-16)&0xf)*4)(w), wt; /* Wt-16 */ \
75 \
76 movl (((idx-15)&0xf)*4)(w), tmp2; /* sigma0 */ \
77 movl tmp2, tmp3; /* sigma0 */ \
78 rorl $(18-7), tmp2; /* sigma0 */ \
79 xorl tmp3, tmp2; /* sigma0 */ \
80 rorl $7, tmp2; /* sigma0 */ \
81 shrl $3, tmp3; /* sigma0 */ \
82 xorl tmp3, tmp2; /* sigma0 */ \
83 addl tmp2, wt; /* sigma0 */ \
84 \
85 movl wt, ((idx&0xf)*4)(w);
86
87/*
88 * Compute a SHA-256 round:
89 *
90 * T1 = h + Sigma1(e) + Ch(e, f, g) + Kt + Wt
91 * T2 = Sigma0(a) + Maj(a, b, c)
92 *
93 * Sigma0(x) = ror(x, 2) ^ ror(x, 13) ^ ror(x, 22)
94 * Sigma1(x) = ror(x, 6) ^ ror(x, 11) ^ ror(x, 25)
95 * Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z
96 * Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z)
97 *
98 * Upon completion d = d + T1, h = T1 + T2, pending rotation.
99 */
100#define sha256_round(idx, a, b, c, d, e, f, g, h, k, w, wt) \
101 addl wt, h; /* T1 Wt */ \
102 addl (k256, round, 4), h; /* T1 Kt */ \
103 \
104 movl e, tmp1; /* T1 Sigma1 */ \
105 rorl $(25-11), tmp1; /* T1 Sigma1 */ \
106 xorl e, tmp1; /* T1 Sigma1 */ \
107 rorl $(11-6), tmp1; /* T1 Sigma1 */ \
108 xorl e, tmp1; /* T1 Sigma1 */ \
109 rorl $6, tmp1; /* T1 Sigma1 */ \
110 addl tmp1, h; /* T1 Sigma1 */ \
111 \
112 movl f, tmp2; /* T1 Ch */ \
113 xorl g, tmp2; /* T1 Ch */ \
114 andl e, tmp2; /* T1 Ch */ \
115 xorl g, tmp2; /* T1 Ch */ \
116 addl tmp2, h; /* T1 Ch */ \
117 \
118 addl h, d; /* d += T1 */ \
119 \
120 movl a, tmp1; /* T2 Sigma0 */ \
121 rorl $(22-13), tmp1; /* T2 Sigma0 */ \
122 xorl a, tmp1; /* T2 Sigma0 */ \
123 rorl $(13-2), tmp1; /* T2 Sigma0 */ \
124 xorl a, tmp1; /* T2 Sigma0 */ \
125 rorl $2, tmp1; /* T2 Sigma0 */ \
126 addl tmp1, h; /* T2 Sigma0 */ \
127 \
128 movl b, tmp2; /* T2 Maj */ \
129 xorl c, tmp2; /* T2 Maj */ \
130 andl a, tmp2; /* T2 Maj */ \
131 movl b, tmp3; /* T2 Maj */ \
132 andl c, tmp3; /* T2 Maj */ \
133 xorl tmp2, tmp3; /* T2 Maj */ \
134 addl tmp3, h; /* T2 Maj */ \
135 \
136 addq $1, round;
137
138#define sha256_round_load(idx, a, b, c, d, e, f, g, h) \
139 sha256_message_schedule_load(idx, in, %rsp, tmp0) \
140 sha256_round(idx, a, b, c, d, e, f, g, h, k256, %rsp, tmp0)
141
142#define sha256_round_update(idx, a, b, c, d, e, f, g, h) \
143 sha256_message_schedule_update(idx, %rsp, tmp0) \
144 sha256_round(idx, a, b, c, d, e, f, g, h, k256, %rsp, tmp0)
145
146.text
147
148/*
149 * void sha256_block_generic(SHA256_CTX *ctx, const void *in, size_t num);
150 *
151 * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
152 */
153.align 16
154.globl sha256_block_generic
155.type sha256_block_generic,@function
156sha256_block_generic:
157 _CET_ENDBR
158
159 /* Save callee save registers. */
160 pushq %rbx
161 pushq %rbp
162 pushq %r12
163 pushq %r13
164 pushq %r14
165 pushq %r15
166
167 /* Allocate space for message schedule, context pointer and end of message. */
168 movq %rsp, %rax
169 subq $(64+3*8), %rsp
170 andq $~63, %rsp
171 movq %rax, (64+2*8)(%rsp)
172 movq ctx, (64+1*8)(%rsp)
173
174 /* Compute and store end of message. */
175 shlq $6, num
176 leaq (in, num, 1), %rbx
177 movq %rbx, (64+0*8)(%rsp)
178
179 /* Address of SHA-256 constants. */
180 leaq K256(%rip), k256
181
182 /* Load current hash state from context. */
183 movl (0*4)(ctx), hs0
184 movl (1*4)(ctx), hs1
185 movl (2*4)(ctx), hs2
186 movl (3*4)(ctx), hs3
187 movl (4*4)(ctx), hs4
188 movl (5*4)(ctx), hs5
189 movl (6*4)(ctx), hs6
190 movl (7*4)(ctx), hs7
191
192 jmp .Lblock_loop0
193
194.align 16
195.Lblock_loop0:
196 mov $0, round
197
198 /* Round 0 through 15. */
199 sha256_round_load(0, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
200 sha256_round_load(1, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
201 sha256_round_load(2, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
202 sha256_round_load(3, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
203 sha256_round_load(4, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
204 sha256_round_load(5, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
205 sha256_round_load(6, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
206 sha256_round_load(7, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
207 sha256_round_load(8, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
208 sha256_round_load(9, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
209 sha256_round_load(10, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
210 sha256_round_load(11, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
211 sha256_round_load(12, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
212 sha256_round_load(13, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
213 sha256_round_load(14, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
214 sha256_round_load(15, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
215
216 jmp .Lblock_loop16
217
218.align 16
219.Lblock_loop16:
220 /* Round 16 through 63. */
221 sha256_round_update(16, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
222 sha256_round_update(17, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
223 sha256_round_update(18, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
224 sha256_round_update(19, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
225 sha256_round_update(20, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
226 sha256_round_update(21, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
227 sha256_round_update(22, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
228 sha256_round_update(23, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
229 sha256_round_update(24, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
230 sha256_round_update(25, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
231 sha256_round_update(26, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
232 sha256_round_update(27, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
233 sha256_round_update(28, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
234 sha256_round_update(29, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
235 sha256_round_update(30, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
236 sha256_round_update(31, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
237
238 cmp $64, round
239 jb .Lblock_loop16
240
241 movq (64+1*8)(%rsp), ctx
242
243 /* Add intermediate state to hash state. */
244 addl (0*4)(ctx), hs0
245 addl (1*4)(ctx), hs1
246 addl (2*4)(ctx), hs2
247 addl (3*4)(ctx), hs3
248 addl (4*4)(ctx), hs4
249 addl (5*4)(ctx), hs5
250 addl (6*4)(ctx), hs6
251 addl (7*4)(ctx), hs7
252
253 /* Store new hash state to context. */
254 movl hs0, (0*4)(ctx)
255 movl hs1, (1*4)(ctx)
256 movl hs2, (2*4)(ctx)
257 movl hs3, (3*4)(ctx)
258 movl hs4, (4*4)(ctx)
259 movl hs5, (5*4)(ctx)
260 movl hs6, (6*4)(ctx)
261 movl hs7, (7*4)(ctx)
262
263 addq $64, in
264 cmpq (64+0*8)(%rsp), in
265 jb .Lblock_loop0
266
267 movq (64+2*8)(%rsp), %rsp
268
269 /* Restore callee save registers. */
270 popq %r15
271 popq %r14
272 popq %r13
273 popq %r12
274 popq %rbp
275 popq %rbx
276
277 ret
278
279/*
280 * SHA-256 constants - see FIPS 180-4 section 4.2.2.
281 */
282.rodata
283.align 64
284.type K256,@object
285K256:
286.long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
287.long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
288.long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
289.long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
290.long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
291.long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
292.long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
293.long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
294.long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
295.long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
296.long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
297.long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
298.long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
299.long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
300.long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
301.long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
302.size K256,.-K256
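
For reference, the schedule update that the "Rounds 16 through 63" loop above
unrolls is the FIPS 180-4 section 6.2.2 recurrence over a 16-entry circular
buffer (which the assembly keeps on the stack). A plain C sketch of the same
computation; ror32 and the W[] buffer are constructs of the sketch, not of
this file:

#include <stdint.h>

static inline uint32_t
ror32(uint32_t x, int n)
{
	return (x >> n) | (x << (32 - n));
}

/* sigma0/sigma1 as defined in FIPS 180-4 section 4.1.2. */
static inline uint32_t
sigma0(uint32_t x)
{
	return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3);
}

static inline uint32_t
sigma1(uint32_t x)
{
	return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10);
}

/* One update of the circular message schedule, indices masked with 0xf. */
static inline void
sha256_w_update(uint32_t W[16], int t)
{
	W[t & 0xf] += sigma1(W[(t - 2) & 0xf]) + W[(t - 7) & 0xf] +
	    sigma0(W[(t - 15) & 0xf]);
}
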
diff --git a/src/lib/libcrypto/sha/sha256_amd64_shani.S b/src/lib/libcrypto/sha/sha256_amd64_shani.S
deleted file mode 100644
index df3a796b45..0000000000
--- a/src/lib/libcrypto/sha/sha256_amd64_shani.S
+++ /dev/null
@@ -1,209 +0,0 @@
1/* $OpenBSD: sha256_amd64_shani.S,v 1.1 2024/11/16 15:31:36 jsing Exp $ */
2/*
3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#ifdef __CET__
19#include <cet.h>
20#else
21#define _CET_ENDBR
22#endif
23
24/*
25 * SHA-256 implementation using the Intel SHA extensions:
26 *
27 * https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
28 */
29
30#define ctx %rdi
31#define in %rsi
32#define num %rdx
33
34#define end %rbx
35
36#define k256 %rbp
37
38#define xmsg %xmm0
39
40#define xhs0 %xmm1
41#define xhs1 %xmm2
42
43#define xabef %xmm3
44#define xcdgh %xmm4
45
46#define xmsgtmp0 %xmm6
47#define xmsgtmp1 %xmm7
48#define xmsgtmp2 %xmm8
49#define xmsgtmp3 %xmm9
50#define xmsgtmp4 %xmm10
51
52#define xshufmask %xmm11
53
54#define xtmp0 %xmm12
55
56#define sha256_message_schedule_load(idx, m, xmsgtmp) \
57 movdqu (idx*16)(m), xmsg; \
58 pshufb xshufmask, xmsg; \
59 movdqa xmsg, xmsgtmp;
60
61#define sha256_message_schedule_update(xmt0, xmt1, xmt2, xmt3) \
62 sha256msg1 xmt1, xmt0; \
63 movdqa xmt3, xmsgtmp4; \
64 palignr $4, xmt2, xmsgtmp4; \
65 paddd xmsgtmp4, xmt0; \
66 sha256msg2 xmt3, xmt0;
67
68#define sha256_shani_round(idx) \
69 paddd (idx*16)(k256), xmsg; \
70 sha256rnds2 xmsg, xhs0, xhs1; \
71 pshufd $0x0e, xmsg, xmsg; \
72 sha256rnds2 xmsg, xhs1, xhs0;
73
74#define sha256_shani_round_load(idx, m, xmsgtmp) \
75 sha256_message_schedule_load(idx, m, xmsgtmp); \
76 sha256_shani_round(idx);
77
78#define sha256_shani_round_update(idx, xmt0, xmt1, xmt2, xmt3) \
79 sha256_message_schedule_update(xmt0, xmt1, xmt2, xmt3); \
80 movdqa xmt0, xmsg; \
81 sha256_shani_round(idx);
82
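
The two macros above map almost one-to-one onto the SHA-NI intrinsics from
immintrin.h. A C sketch for illustration (not part of this file; state0 and
state1 carry the abef/cdgh word arrangement constructed further down):

#include <immintrin.h>

/*
 * Sketch: schedule update matching sha256_message_schedule_update.
 * m0..m3 hold W0-W3, W4-W7, W8-W11 and W12-W15.
 */
static inline __m128i
sha256_sched_update(__m128i m0, __m128i m1, __m128i m2, __m128i m3)
{
	__m128i t = _mm_alignr_epi8(m3, m2, 4);		/* W9:W10:W11:W12 */

	m0 = _mm_sha256msg1_epu32(m0, m1);
	m0 = _mm_add_epi32(m0, t);			/* add the Wt-7 terms */
	return _mm_sha256msg2_epu32(m0, m3);
}

/* Sketch: four rounds matching sha256_shani_round. */
static inline void
sha256_four_rounds(__m128i *state0, __m128i *state1, __m128i msg, __m128i k)
{
	__m128i wk = _mm_add_epi32(msg, k);		/* Wt + Kt */

	*state1 = _mm_sha256rnds2_epu32(*state1, *state0, wk);
	wk = _mm_shuffle_epi32(wk, 0x0e);		/* high two words */
	*state0 = _mm_sha256rnds2_epu32(*state0, *state1, wk);
}
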
83.text
84
85/*
86 * void sha256_block_shani(SHA256_CTX *ctx, const void *in, size_t num);
87 *
88 * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
89 */
90.align 16
91.globl sha256_block_shani
92.type sha256_block_shani,@function
93sha256_block_shani:
94 _CET_ENDBR
95
96 /* Save callee save registers. */
97 pushq %rbx
98 pushq %rbp
99
100 /* Compute end of message. */
101 shlq $6, num
102 leaq (in, num, 1), end
103
104 /* Address of SHA-256 constants. */
105 leaq K256(%rip), k256
106
107 /* Load endian shuffle mask. */
108 movdqa shufmask(%rip), xshufmask
109
110 /* Load current hash state from context. */
111 movdqu (0*16)(ctx), xhs0 /* dcba */
112 movdqu (1*16)(ctx), xhs1 /* hgfe */
113
114 /* Rearrange words to construct abef/cdgh. */
115 pshufd $0xb1, xhs0, xhs0 /* cdab */
116 pshufd $0x1b, xhs1, xhs1 /* efgh */
117 movdqa xhs0, xtmp0
118 palignr $8, xhs1, xhs0 /* abef */
119 pblendw $0xf0, xtmp0, xhs1 /* cdgh */
120
121 jmp .Lshani_block_loop
122
123.align 16
124.Lshani_block_loop:
125 /* Save state for accumulation. */
126 movdqa xhs0, xabef
127 movdqa xhs1, xcdgh
128
129 /* Rounds 0 through 15 (four rounds at a time). */
130 sha256_shani_round_load(0, in, xmsgtmp0)
131 sha256_shani_round_load(1, in, xmsgtmp1)
132 sha256_shani_round_load(2, in, xmsgtmp2)
133 sha256_shani_round_load(3, in, xmsgtmp3)
134
135 /* Rounds 16 through 63 (four rounds at a time). */
136 sha256_shani_round_update(4, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3)
137 sha256_shani_round_update(5, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0)
138 sha256_shani_round_update(6, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1)
139 sha256_shani_round_update(7, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2)
140
141 sha256_shani_round_update(8, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3)
142 sha256_shani_round_update(9, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0)
143 sha256_shani_round_update(10, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1)
144 sha256_shani_round_update(11, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2)
145
146 sha256_shani_round_update(12, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3)
147 sha256_shani_round_update(13, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0)
148 sha256_shani_round_update(14, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1)
149 sha256_shani_round_update(15, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2)
150
151 /* Accumulate hash state. */
152 paddd xabef, xhs0
153 paddd xcdgh, xhs1
154
155 addq $64, in
156 cmpq end, in
157 jb .Lshani_block_loop
158
159 /* Rearrange words to construct dcba/hgfe. */
160 pshufd $0x1b, xhs0, xhs0 /* feba */
161 pshufd $0xb1, xhs1, xhs1 /* dchg */
162 movdqa xhs0, xtmp0
163 pblendw $0xf0, xhs1, xhs0 /* dcba */
164 palignr $8, xtmp0, xhs1 /* hgfe */
165
166 /* Update stored hash context. */
167 movdqu xhs0, (0*16)(ctx)
168 movdqu xhs1, (1*16)(ctx)
169
170 /* Restore callee save registers. */
171 popq %rbp
172 popq %rbx
173
174 ret
175
176.rodata
177
178/*
179 * Shuffle mask - little endian to big endian word conversion.
180 */
181.align 16
182.type shufmask,@object
183shufmask:
184.octa 0x0c0d0e0f08090a0b0405060700010203
185.size shufmask,.-shufmask
186
187/*
188 * SHA-256 constants - see FIPS 180-4 section 4.2.2.
189 */
190.align 64
191.type K256,@object
192K256:
193.long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
194.long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
195.long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
196.long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
197.long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
198.long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
199.long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
200.long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
201.long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
202.long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
203.long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
204.long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
205.long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
206.long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
207.long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
208.long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
209.size K256,.-K256
diff --git a/src/lib/libcrypto/sha/sha3.c b/src/lib/libcrypto/sha/sha3.c
deleted file mode 100644
index 6a7196d582..0000000000
--- a/src/lib/libcrypto/sha/sha3.c
+++ /dev/null
@@ -1,172 +0,0 @@
1/* $OpenBSD: sha3.c,v 1.16 2024/11/23 15:38:12 jsing Exp $ */
2/*
3 * The MIT License (MIT)
4 *
5 * Copyright (c) 2015 Markku-Juhani O. Saarinen
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in all
15 * copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 * SOFTWARE.
24 */
25
26#include <endian.h>
27#include <string.h>
28
29#include "sha3_internal.h"
30
31#define KECCAKF_ROUNDS 24
32
33#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
34
35static const uint64_t sha3_keccakf_rndc[24] = {
36 0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
37 0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
38 0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
39 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
40 0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
41 0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
42 0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
43 0x8000000000008080, 0x0000000080000001, 0x8000000080008008
44};
45static const int sha3_keccakf_rotc[24] = {
46 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
47 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
48};
49static const int sha3_keccakf_piln[24] = {
50 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
51 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
52};
53
54static void
55sha3_keccakf(uint64_t st[25])
56{
57 uint64_t t, bc[5];
58 int i, j, r;
59
60 for (i = 0; i < 25; i++)
61 st[i] = le64toh(st[i]);
62
63 for (r = 0; r < KECCAKF_ROUNDS; r++) {
64
65 /* Theta */
66 for (i = 0; i < 5; i++)
67 bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20];
68
69 for (i = 0; i < 5; i++) {
70 t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
71 for (j = 0; j < 25; j += 5)
72 st[j + i] ^= t;
73 }
74
75 /* Rho Pi */
76 t = st[1];
77 for (i = 0; i < 24; i++) {
78 j = sha3_keccakf_piln[i];
79 bc[0] = st[j];
80 st[j] = ROTL64(t, sha3_keccakf_rotc[i]);
81 t = bc[0];
82 }
83
84 /* Chi */
85 for (j = 0; j < 25; j += 5) {
86 for (i = 0; i < 5; i++)
87 bc[i] = st[j + i];
88 for (i = 0; i < 5; i++)
89 st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5];
90 }
91
92 /* Iota */
93 st[0] ^= sha3_keccakf_rndc[r];
94 }
95
96 for (i = 0; i < 25; i++)
97 st[i] = htole64(st[i]);
98}
99
100int
101sha3_init(sha3_ctx *c, int mdlen)
102{
103 if (mdlen < 0 || mdlen >= KECCAK_BYTE_WIDTH / 2)
104 return 0;
105
106 memset(c, 0, sizeof(*c));
107
108 c->mdlen = mdlen;
109 c->rsize = KECCAK_BYTE_WIDTH - 2 * mdlen;
110
111 return 1;
112}
113
114int
115sha3_update(sha3_ctx *c, const void *data, size_t len)
116{
117 size_t i, j;
118
119 j = c->pt;
120 for (i = 0; i < len; i++) {
121 c->state.b[j++] ^= ((const uint8_t *) data)[i];
122 if (j >= c->rsize) {
123 sha3_keccakf(c->state.q);
124 j = 0;
125 }
126 }
127 c->pt = j;
128
129 return 1;
130}
131
132int
133sha3_final(void *md, sha3_ctx *c)
134{
135 int i;
136
137 c->state.b[c->pt] ^= 0x06;
138 c->state.b[c->rsize - 1] ^= 0x80;
139 sha3_keccakf(c->state.q);
140
141 for (i = 0; i < c->mdlen; i++) {
142 ((uint8_t *) md)[i] = c->state.b[i];
143 }
144
145 return 1;
146}
147
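
Together, the three functions above form the usual init/update/final pattern;
for instance, a one-shot SHA3-256, where mdlen = 32 selects a 32-byte digest
and hence a 136-byte rate (a usage sketch):

#include <stddef.h>
#include <stdint.h>

#include "sha3_internal.h"

/* Usage sketch: SHA3-256 of a buffer via the internal API above. */
static int
sha3_256_oneshot(uint8_t digest[32], const void *buf, size_t len)
{
	sha3_ctx c;

	if (!sha3_init(&c, 32))
		return 0;
	sha3_update(&c, buf, len);
	return sha3_final(digest, &c);
}
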
148/* SHAKE128 and SHAKE256 extensible-output functionality. */
149void
150shake_xof(sha3_ctx *c)
151{
152 c->state.b[c->pt] ^= 0x1F;
153 c->state.b[c->rsize - 1] ^= 0x80;
154 sha3_keccakf(c->state.q);
155 c->pt = 0;
156}
157
158void
159shake_out(sha3_ctx *c, void *out, size_t len)
160{
161 size_t i, j;
162
163 j = c->pt;
164 for (i = 0; i < len; i++) {
165 if (j >= c->rsize) {
166 sha3_keccakf(c->state.q);
167 j = 0;
168 }
169 ((uint8_t *) out)[i] = c->state.b[j++];
170 }
171 c->pt = j;
172}
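
For the XOF case, shake_xof() applies the SHAKE padding and switches the
sponge to squeezing, after which shake_out() may be called any number of
times. A usage sketch; shake128_init and shake_update are the wrappers
defined in sha3_internal.h below:

#include <stddef.h>
#include <stdint.h>

#include "sha3_internal.h"

/* Usage sketch: draw outlen bytes of SHAKE128 output for buf. */
static void
shake128_oneshot(uint8_t *out, size_t outlen, const void *buf, size_t len)
{
	sha3_ctx c;

	shake128_init(&c);		/* sha3_init(&c, 16) */
	shake_update(&c, buf, len);
	shake_xof(&c);			/* pad and switch to squeezing */
	shake_out(&c, out, outlen);	/* may be called repeatedly */
}
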
diff --git a/src/lib/libcrypto/sha/sha3_internal.h b/src/lib/libcrypto/sha/sha3_internal.h
deleted file mode 100644
index 53a4980c19..0000000000
--- a/src/lib/libcrypto/sha/sha3_internal.h
+++ /dev/null
@@ -1,81 +0,0 @@
1/* $OpenBSD: sha3_internal.h,v 1.15 2023/04/25 19:32:19 tb Exp $ */
2/*
3 * The MIT License (MIT)
4 *
5 * Copyright (c) 2015 Markku-Juhani O. Saarinen
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in all
15 * copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 * SOFTWARE.
24 */
25
26#include <stddef.h>
27#include <stdint.h>
28
29#ifndef HEADER_SHA3_INTERNAL_H
30#define HEADER_SHA3_INTERNAL_H
31
32#define KECCAK_BIT_WIDTH 1600
33#define KECCAK_BYTE_WIDTH (KECCAK_BIT_WIDTH / 8)
34
35#define SHA3_224_BIT_LENGTH 224
36#define SHA3_224_BITRATE (2 * SHA3_224_BIT_LENGTH)
37#define SHA3_224_CAPACITY (KECCAK_BIT_WIDTH - SHA3_224_BITRATE)
38#define SHA3_224_BLOCK_SIZE (SHA3_224_CAPACITY / 8)
39#define SHA3_224_DIGEST_LENGTH (SHA3_224_BIT_LENGTH / 8)
40
41#define SHA3_256_BIT_LENGTH 256
42#define SHA3_256_BITRATE (2 * SHA3_256_BIT_LENGTH)
43#define SHA3_256_CAPACITY (KECCAK_BIT_WIDTH - SHA3_256_BITRATE)
44#define SHA3_256_BLOCK_SIZE (SHA3_256_CAPACITY / 8)
45#define SHA3_256_DIGEST_LENGTH (SHA3_256_BIT_LENGTH / 8)
46
47#define SHA3_384_BIT_LENGTH 384
48#define SHA3_384_BITRATE (2 * SHA3_384_BIT_LENGTH)
49#define SHA3_384_CAPACITY (KECCAK_BIT_WIDTH - SHA3_384_BITRATE)
50#define SHA3_384_BLOCK_SIZE (SHA3_384_CAPACITY / 8)
51#define SHA3_384_DIGEST_LENGTH (SHA3_384_BIT_LENGTH / 8)
52
53#define SHA3_512_BIT_LENGTH 512
54#define SHA3_512_BITRATE (2 * SHA3_512_BIT_LENGTH)
55#define SHA3_512_CAPACITY (KECCAK_BIT_WIDTH - SHA3_512_BITRATE)
56#define SHA3_512_BLOCK_SIZE (SHA3_512_CAPACITY / 8)
57#define SHA3_512_DIGEST_LENGTH (SHA3_512_BIT_LENGTH / 8)
58
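
Worked through for SHA3-256, these macros give a block (rate) size of
(1600 - 2 * 256) / 8 = 136 bytes and a 32-byte digest. A compile-time sanity
check (a sketch; CTASSERT is assumed available from crypto_internal.h, as it
is in the C files of this directory):

/* Sketch: sanity-check the derived block (rate) sizes at compile time. */
CTASSERT(SHA3_224_BLOCK_SIZE == 144);	/* (1600 - 2*224) / 8 */
CTASSERT(SHA3_256_BLOCK_SIZE == 136);	/* (1600 - 2*256) / 8 */
CTASSERT(SHA3_384_BLOCK_SIZE == 104);	/* (1600 - 2*384) / 8 */
CTASSERT(SHA3_512_BLOCK_SIZE == 72);	/* (1600 - 2*512) / 8 */
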
59typedef struct sha3_ctx_st {
60 union {
61 uint8_t b[200]; /* State as 8 bit bytes. */
62 uint64_t q[25]; /* State as 64 bit words. */
63 } state;
64 size_t pt;
65 size_t rsize;
66 size_t mdlen;
67} sha3_ctx;
68
69int sha3_init(sha3_ctx *c, int mdlen);
70int sha3_update(sha3_ctx *c, const void *data, size_t len);
71int sha3_final(void *md, sha3_ctx *c);
72
73/* SHAKE128 and SHAKE256 extensible-output functions. */
74#define shake128_init(c) sha3_init(c, 16)
75#define shake256_init(c) sha3_init(c, 32)
76#define shake_update sha3_update
77
78void shake_xof(sha3_ctx *c);
79void shake_out(sha3_ctx *c, void *out, size_t len);
80
81#endif
diff --git a/src/lib/libcrypto/sha/sha512.c b/src/lib/libcrypto/sha/sha512.c
deleted file mode 100644
index 43d25eb119..0000000000
--- a/src/lib/libcrypto/sha/sha512.c
+++ /dev/null
@@ -1,578 +0,0 @@
1/* $OpenBSD: sha512.c,v 1.43 2025/02/14 12:01:58 jsing Exp $ */
2/* ====================================================================
3 * Copyright (c) 1998-2011 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 * This product includes cryptographic software written by Eric Young
51 * (eay@cryptsoft.com). This product includes software written by Tim
52 * Hudson (tjh@cryptsoft.com).
53 */
54
55#include <endian.h>
56#include <stdlib.h>
57#include <string.h>
58
59#include <openssl/opensslconf.h>
60
61#include <openssl/crypto.h>
62#include <openssl/sha.h>
63
64#include "crypto_internal.h"
65#include "sha_internal.h"
66
67#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA512)
68
69/* Ensure that SHA_LONG64 and uint64_t are equivalent. */
70CTASSERT(sizeof(SHA_LONG64) == sizeof(uint64_t));
71
72void sha512_block_data_order(SHA512_CTX *ctx, const void *in, size_t num);
73void sha512_block_generic(SHA512_CTX *ctx, const void *in, size_t num);
74
75#ifndef HAVE_SHA512_BLOCK_GENERIC
76static const SHA_LONG64 K512[80] = {
77 U64(0x428a2f98d728ae22), U64(0x7137449123ef65cd),
78 U64(0xb5c0fbcfec4d3b2f), U64(0xe9b5dba58189dbbc),
79 U64(0x3956c25bf348b538), U64(0x59f111f1b605d019),
80 U64(0x923f82a4af194f9b), U64(0xab1c5ed5da6d8118),
81 U64(0xd807aa98a3030242), U64(0x12835b0145706fbe),
82 U64(0x243185be4ee4b28c), U64(0x550c7dc3d5ffb4e2),
83 U64(0x72be5d74f27b896f), U64(0x80deb1fe3b1696b1),
84 U64(0x9bdc06a725c71235), U64(0xc19bf174cf692694),
85 U64(0xe49b69c19ef14ad2), U64(0xefbe4786384f25e3),
86 U64(0x0fc19dc68b8cd5b5), U64(0x240ca1cc77ac9c65),
87 U64(0x2de92c6f592b0275), U64(0x4a7484aa6ea6e483),
88 U64(0x5cb0a9dcbd41fbd4), U64(0x76f988da831153b5),
89 U64(0x983e5152ee66dfab), U64(0xa831c66d2db43210),
90 U64(0xb00327c898fb213f), U64(0xbf597fc7beef0ee4),
91 U64(0xc6e00bf33da88fc2), U64(0xd5a79147930aa725),
92 U64(0x06ca6351e003826f), U64(0x142929670a0e6e70),
93 U64(0x27b70a8546d22ffc), U64(0x2e1b21385c26c926),
94 U64(0x4d2c6dfc5ac42aed), U64(0x53380d139d95b3df),
95 U64(0x650a73548baf63de), U64(0x766a0abb3c77b2a8),
96 U64(0x81c2c92e47edaee6), U64(0x92722c851482353b),
97 U64(0xa2bfe8a14cf10364), U64(0xa81a664bbc423001),
98 U64(0xc24b8b70d0f89791), U64(0xc76c51a30654be30),
99 U64(0xd192e819d6ef5218), U64(0xd69906245565a910),
100 U64(0xf40e35855771202a), U64(0x106aa07032bbd1b8),
101 U64(0x19a4c116b8d2d0c8), U64(0x1e376c085141ab53),
102 U64(0x2748774cdf8eeb99), U64(0x34b0bcb5e19b48a8),
103 U64(0x391c0cb3c5c95a63), U64(0x4ed8aa4ae3418acb),
104 U64(0x5b9cca4f7763e373), U64(0x682e6ff3d6b2b8a3),
105 U64(0x748f82ee5defb2fc), U64(0x78a5636f43172f60),
106 U64(0x84c87814a1f0ab72), U64(0x8cc702081a6439ec),
107 U64(0x90befffa23631e28), U64(0xa4506cebde82bde9),
108 U64(0xbef9a3f7b2c67915), U64(0xc67178f2e372532b),
109 U64(0xca273eceea26619c), U64(0xd186b8c721c0c207),
110 U64(0xeada7dd6cde0eb1e), U64(0xf57d4f7fee6ed178),
111 U64(0x06f067aa72176fba), U64(0x0a637dc5a2c898a6),
112 U64(0x113f9804bef90dae), U64(0x1b710b35131c471b),
113 U64(0x28db77f523047d84), U64(0x32caab7b40c72493),
114 U64(0x3c9ebe0a15c9bebc), U64(0x431d67c49c100d4c),
115 U64(0x4cc5d4becb3e42b6), U64(0x597f299cfc657e2a),
116 U64(0x5fcb6fab3ad6faec), U64(0x6c44198c4a475817),
117};
118
119static inline SHA_LONG64
120Sigma0(SHA_LONG64 x)
121{
122 return crypto_ror_u64(x, 28) ^ crypto_ror_u64(x, 34) ^
123 crypto_ror_u64(x, 39);
124}
125
126static inline SHA_LONG64
127Sigma1(SHA_LONG64 x)
128{
129 return crypto_ror_u64(x, 14) ^ crypto_ror_u64(x, 18) ^
130 crypto_ror_u64(x, 41);
131}
132
133static inline SHA_LONG64
134sigma0(SHA_LONG64 x)
135{
136 return crypto_ror_u64(x, 1) ^ crypto_ror_u64(x, 8) ^ (x >> 7);
137}
138
139static inline SHA_LONG64
140sigma1(SHA_LONG64 x)
141{
142 return crypto_ror_u64(x, 19) ^ crypto_ror_u64(x, 61) ^ (x >> 6);
143}
144
145static inline SHA_LONG64
146Ch(SHA_LONG64 x, SHA_LONG64 y, SHA_LONG64 z)
147{
148 return (x & y) ^ (~x & z);
149}
150
151static inline SHA_LONG64
152Maj(SHA_LONG64 x, SHA_LONG64 y, SHA_LONG64 z)
153{
154 return (x & y) ^ (x & z) ^ (y & z);
155}
156
157static inline void
158sha512_msg_schedule_update(SHA_LONG64 *W0, SHA_LONG64 W1,
159 SHA_LONG64 W9, SHA_LONG64 W14)
160{
161 *W0 = sigma1(W14) + W9 + sigma0(W1) + *W0;
162}
163
164static inline void
165sha512_round(SHA_LONG64 *a, SHA_LONG64 *b, SHA_LONG64 *c, SHA_LONG64 *d,
166 SHA_LONG64 *e, SHA_LONG64 *f, SHA_LONG64 *g, SHA_LONG64 *h,
167 SHA_LONG64 Kt, SHA_LONG64 Wt)
168{
169 SHA_LONG64 T1, T2;
170
171 T1 = *h + Sigma1(*e) + Ch(*e, *f, *g) + Kt + Wt;
172 T2 = Sigma0(*a) + Maj(*a, *b, *c);
173
174 *h = *g;
175 *g = *f;
176 *f = *e;
177 *e = *d + T1;
178 *d = *c;
179 *c = *b;
180 *b = *a;
181 *a = T1 + T2;
182}
183
184void
185sha512_block_generic(SHA512_CTX *ctx, const void *_in, size_t num)
186{
187 const uint8_t *in = _in;
188 const SHA_LONG64 *in64;
189 SHA_LONG64 a, b, c, d, e, f, g, h;
190 SHA_LONG64 X[16];
191 int i;
192
193 while (num--) {
194 a = ctx->h[0];
195 b = ctx->h[1];
196 c = ctx->h[2];
197 d = ctx->h[3];
198 e = ctx->h[4];
199 f = ctx->h[5];
200 g = ctx->h[6];
201 h = ctx->h[7];
202
203 if ((size_t)in % sizeof(SHA_LONG64) == 0) {
204 /* Input is 64 bit aligned. */
205 in64 = (const SHA_LONG64 *)in;
206 X[0] = be64toh(in64[0]);
207 X[1] = be64toh(in64[1]);
208 X[2] = be64toh(in64[2]);
209 X[3] = be64toh(in64[3]);
210 X[4] = be64toh(in64[4]);
211 X[5] = be64toh(in64[5]);
212 X[6] = be64toh(in64[6]);
213 X[7] = be64toh(in64[7]);
214 X[8] = be64toh(in64[8]);
215 X[9] = be64toh(in64[9]);
216 X[10] = be64toh(in64[10]);
217 X[11] = be64toh(in64[11]);
218 X[12] = be64toh(in64[12]);
219 X[13] = be64toh(in64[13]);
220 X[14] = be64toh(in64[14]);
221 X[15] = be64toh(in64[15]);
222 } else {
223 /* Input is not 64 bit aligned. */
224 X[0] = crypto_load_be64toh(&in[0 * 8]);
225 X[1] = crypto_load_be64toh(&in[1 * 8]);
226 X[2] = crypto_load_be64toh(&in[2 * 8]);
227 X[3] = crypto_load_be64toh(&in[3 * 8]);
228 X[4] = crypto_load_be64toh(&in[4 * 8]);
229 X[5] = crypto_load_be64toh(&in[5 * 8]);
230 X[6] = crypto_load_be64toh(&in[6 * 8]);
231 X[7] = crypto_load_be64toh(&in[7 * 8]);
232 X[8] = crypto_load_be64toh(&in[8 * 8]);
233 X[9] = crypto_load_be64toh(&in[9 * 8]);
234 X[10] = crypto_load_be64toh(&in[10 * 8]);
235 X[11] = crypto_load_be64toh(&in[11 * 8]);
236 X[12] = crypto_load_be64toh(&in[12 * 8]);
237 X[13] = crypto_load_be64toh(&in[13 * 8]);
238 X[14] = crypto_load_be64toh(&in[14 * 8]);
239 X[15] = crypto_load_be64toh(&in[15 * 8]);
240 }
241 in += SHA512_CBLOCK;
242
243 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[0], X[0]);
244 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[1], X[1]);
245 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[2], X[2]);
246 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[3], X[3]);
247 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[4], X[4]);
248 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[5], X[5]);
249 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[6], X[6]);
250 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[7], X[7]);
251 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[8], X[8]);
252 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[9], X[9]);
253 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[10], X[10]);
254 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[11], X[11]);
255 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[12], X[12]);
256 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[13], X[13]);
257 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[14], X[14]);
258 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[15], X[15]);
259
260 for (i = 16; i < 80; i += 16) {
261 sha512_msg_schedule_update(&X[0], X[1], X[9], X[14]);
262 sha512_msg_schedule_update(&X[1], X[2], X[10], X[15]);
263 sha512_msg_schedule_update(&X[2], X[3], X[11], X[0]);
264 sha512_msg_schedule_update(&X[3], X[4], X[12], X[1]);
265 sha512_msg_schedule_update(&X[4], X[5], X[13], X[2]);
266 sha512_msg_schedule_update(&X[5], X[6], X[14], X[3]);
267 sha512_msg_schedule_update(&X[6], X[7], X[15], X[4]);
268 sha512_msg_schedule_update(&X[7], X[8], X[0], X[5]);
269 sha512_msg_schedule_update(&X[8], X[9], X[1], X[6]);
270 sha512_msg_schedule_update(&X[9], X[10], X[2], X[7]);
271 sha512_msg_schedule_update(&X[10], X[11], X[3], X[8]);
272 sha512_msg_schedule_update(&X[11], X[12], X[4], X[9]);
273 sha512_msg_schedule_update(&X[12], X[13], X[5], X[10]);
274 sha512_msg_schedule_update(&X[13], X[14], X[6], X[11]);
275 sha512_msg_schedule_update(&X[14], X[15], X[7], X[12]);
276 sha512_msg_schedule_update(&X[15], X[0], X[8], X[13]);
277
278 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[i + 0], X[0]);
279 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[i + 1], X[1]);
280 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[i + 2], X[2]);
281 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[i + 3], X[3]);
282 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[i + 4], X[4]);
283 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[i + 5], X[5]);
284 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[i + 6], X[6]);
285 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[i + 7], X[7]);
286 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[i + 8], X[8]);
287 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[i + 9], X[9]);
288 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[i + 10], X[10]);
289 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[i + 11], X[11]);
290 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[i + 12], X[12]);
291 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[i + 13], X[13]);
292 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[i + 14], X[14]);
293 sha512_round(&a, &b, &c, &d, &e, &f, &g, &h, K512[i + 15], X[15]);
294 }
295
296 ctx->h[0] += a;
297 ctx->h[1] += b;
298 ctx->h[2] += c;
299 ctx->h[3] += d;
300 ctx->h[4] += e;
301 ctx->h[5] += f;
302 ctx->h[6] += g;
303 ctx->h[7] += h;
304 }
305}
306#endif
307
308#ifndef HAVE_SHA512_BLOCK_DATA_ORDER
309void
310sha512_block_data_order(SHA512_CTX *ctx, const void *_in, size_t num)
311{
312 sha512_block_generic(ctx, _in, num);
313}
314#endif
315
316int
317SHA384_Init(SHA512_CTX *c)
318{
319 memset(c, 0, sizeof(*c));
320
321 c->h[0] = U64(0xcbbb9d5dc1059ed8);
322 c->h[1] = U64(0x629a292a367cd507);
323 c->h[2] = U64(0x9159015a3070dd17);
324 c->h[3] = U64(0x152fecd8f70e5939);
325 c->h[4] = U64(0x67332667ffc00b31);
326 c->h[5] = U64(0x8eb44a8768581511);
327 c->h[6] = U64(0xdb0c2e0d64f98fa7);
328 c->h[7] = U64(0x47b5481dbefa4fa4);
329
330 c->md_len = SHA384_DIGEST_LENGTH;
331
332 return 1;
333}
334LCRYPTO_ALIAS(SHA384_Init);
335
336int
337SHA384_Update(SHA512_CTX *c, const void *data, size_t len)
338{
339 return SHA512_Update(c, data, len);
340}
341LCRYPTO_ALIAS(SHA384_Update);
342
343int
344SHA384_Final(unsigned char *md, SHA512_CTX *c)
345{
346 return SHA512_Final(md, c);
347}
348LCRYPTO_ALIAS(SHA384_Final);
349
350unsigned char *
351SHA384(const unsigned char *d, size_t n, unsigned char *md)
352{
353 SHA512_CTX c;
354
355 SHA384_Init(&c);
356 SHA512_Update(&c, d, n);
357 SHA512_Final(md, &c);
358
359 explicit_bzero(&c, sizeof(c));
360
361 return (md);
362}
363LCRYPTO_ALIAS(SHA384);
364
365int
366SHA512_Init(SHA512_CTX *c)
367{
368 memset(c, 0, sizeof(*c));
369
370 c->h[0] = U64(0x6a09e667f3bcc908);
371 c->h[1] = U64(0xbb67ae8584caa73b);
372 c->h[2] = U64(0x3c6ef372fe94f82b);
373 c->h[3] = U64(0xa54ff53a5f1d36f1);
374 c->h[4] = U64(0x510e527fade682d1);
375 c->h[5] = U64(0x9b05688c2b3e6c1f);
376 c->h[6] = U64(0x1f83d9abfb41bd6b);
377 c->h[7] = U64(0x5be0cd19137e2179);
378
379 c->md_len = SHA512_DIGEST_LENGTH;
380
381 return 1;
382}
383LCRYPTO_ALIAS(SHA512_Init);
384
385void
386SHA512_Transform(SHA512_CTX *c, const unsigned char *data)
387{
388 sha512_block_data_order(c, data, 1);
389}
390LCRYPTO_ALIAS(SHA512_Transform);
391
392int
393SHA512_Update(SHA512_CTX *c, const void *_data, size_t len)
394{
395 const unsigned char *data = _data;
396 unsigned char *p = c->u.p;
397 SHA_LONG64 l;
398
399 if (len == 0)
400 return 1;
401
402	l = (c->Nl + (((SHA_LONG64)len) << 3)) & U64(0xffffffffffffffff);
403 if (l < c->Nl)
404 c->Nh++;
405 if (sizeof(len) >= 8)
406 c->Nh += (((SHA_LONG64)len) >> 61);
407 c->Nl = l;
408
409 if (c->num != 0) {
410 size_t n = sizeof(c->u) - c->num;
411
412 if (len < n) {
413 memcpy(p + c->num, data, len);
414 c->num += (unsigned int)len;
415 return 1;
416		} else {
417 memcpy(p + c->num, data, n);
418 c->num = 0;
419 len -= n;
420 data += n;
421 sha512_block_data_order(c, p, 1);
422 }
423 }
424
425 if (len >= sizeof(c->u)) {
426 sha512_block_data_order(c, data, len/sizeof(c->u));
427 data += len;
428 len %= sizeof(c->u);
429 data -= len;
430 }
431
432 if (len != 0) {
433 memcpy(p, data, len);
434 c->num = (int)len;
435 }
436
437 return 1;
438}
439LCRYPTO_ALIAS(SHA512_Update);
440
441int
442SHA512_Final(unsigned char *md, SHA512_CTX *c)
443{
444 unsigned char *p = (unsigned char *)c->u.p;
445 size_t n = c->num;
446
447	p[n] = 0x80;	/* There is always room for one byte. */
448 n++;
449 if (n > (sizeof(c->u) - 16)) {
450 memset(p + n, 0, sizeof(c->u) - n);
451 n = 0;
452 sha512_block_data_order(c, p, 1);
453 }
454
455 memset(p + n, 0, sizeof(c->u) - 16 - n);
456 c->u.d[SHA_LBLOCK - 2] = htobe64(c->Nh);
457 c->u.d[SHA_LBLOCK - 1] = htobe64(c->Nl);
458
459 sha512_block_data_order(c, p, 1);
460
461 if (md == NULL)
462 return 0;
463
464 /* Let compiler decide if it's appropriate to unroll... */
465 switch (c->md_len) {
466 case SHA512_224_DIGEST_LENGTH:
467 for (n = 0; n < SHA512_224_DIGEST_LENGTH/8; n++) {
468 crypto_store_htobe64(md, c->h[n]);
469 md += 8;
470 }
471		crypto_store_htobe32(md, c->h[n] >> 32);	/* Top 32 bits complete the 28-byte digest. */
472 break;
473 case SHA512_256_DIGEST_LENGTH:
474 for (n = 0; n < SHA512_256_DIGEST_LENGTH/8; n++) {
475 crypto_store_htobe64(md, c->h[n]);
476 md += 8;
477 }
478 break;
479 case SHA384_DIGEST_LENGTH:
480 for (n = 0; n < SHA384_DIGEST_LENGTH/8; n++) {
481 crypto_store_htobe64(md, c->h[n]);
482 md += 8;
483 }
484 break;
485 case SHA512_DIGEST_LENGTH:
486 for (n = 0; n < SHA512_DIGEST_LENGTH/8; n++) {
487 crypto_store_htobe64(md, c->h[n]);
488 md += 8;
489 }
490 break;
491 default:
492 return 0;
493 }
494
495 return 1;
496}
497LCRYPTO_ALIAS(SHA512_Final);
498
499unsigned char *
500SHA512(const unsigned char *d, size_t n, unsigned char *md)
501{
502 SHA512_CTX c;
503
504 SHA512_Init(&c);
505 SHA512_Update(&c, d, n);
506 SHA512_Final(md, &c);
507
508 explicit_bzero(&c, sizeof(c));
509
510 return (md);
511}
512LCRYPTO_ALIAS(SHA512);
513
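
SHA512() above is a convenience wrapper over the streaming interface;
incremental use looks like the following sketch:

#include <stddef.h>

#include <openssl/sha.h>

/* Usage sketch: incremental hashing with the streaming API above. */
static void
sha512_two_parts(unsigned char md[SHA512_DIGEST_LENGTH],
    const void *p1, size_t l1, const void *p2, size_t l2)
{
	SHA512_CTX ctx;

	SHA512_Init(&ctx);
	SHA512_Update(&ctx, p1, l1);
	SHA512_Update(&ctx, p2, l2);	/* any number of updates */
	SHA512_Final(md, &ctx);
}
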
514int
515SHA512_224_Init(SHA512_CTX *c)
516{
517 memset(c, 0, sizeof(*c));
518
519 /* FIPS 180-4 section 5.3.6.1. */
520 c->h[0] = U64(0x8c3d37c819544da2);
521 c->h[1] = U64(0x73e1996689dcd4d6);
522 c->h[2] = U64(0x1dfab7ae32ff9c82);
523 c->h[3] = U64(0x679dd514582f9fcf);
524 c->h[4] = U64(0x0f6d2b697bd44da8);
525 c->h[5] = U64(0x77e36f7304c48942);
526 c->h[6] = U64(0x3f9d85a86a1d36c8);
527 c->h[7] = U64(0x1112e6ad91d692a1);
528
529 c->md_len = SHA512_224_DIGEST_LENGTH;
530
531 return 1;
532}
533
534int
535SHA512_224_Update(SHA512_CTX *c, const void *data, size_t len)
536{
537 return SHA512_Update(c, data, len);
538}
539
540int
541SHA512_224_Final(unsigned char *md, SHA512_CTX *c)
542{
543 return SHA512_Final(md, c);
544}
545
546int
547SHA512_256_Init(SHA512_CTX *c)
548{
549 memset(c, 0, sizeof(*c));
550
551 /* FIPS 180-4 section 5.3.6.2. */
552 c->h[0] = U64(0x22312194fc2bf72c);
553 c->h[1] = U64(0x9f555fa3c84c64c2);
554 c->h[2] = U64(0x2393b86b6f53b151);
555 c->h[3] = U64(0x963877195940eabd);
556 c->h[4] = U64(0x96283ee2a88effe3);
557 c->h[5] = U64(0xbe5e1e2553863992);
558 c->h[6] = U64(0x2b0199fc2c85b8aa);
559 c->h[7] = U64(0x0eb72ddc81c52ca2);
560
561 c->md_len = SHA512_256_DIGEST_LENGTH;
562
563 return 1;
564}
565
566int
567SHA512_256_Update(SHA512_CTX *c, const void *data, size_t len)
568{
569 return SHA512_Update(c, data, len);
570}
571
572int
573SHA512_256_Final(unsigned char *md, SHA512_CTX *c)
574{
575 return SHA512_Final(md, c);
576}
577
578#endif /* !OPENSSL_NO_SHA512 */
diff --git a/src/lib/libcrypto/sha/sha512_aarch64.c b/src/lib/libcrypto/sha/sha512_aarch64.c
deleted file mode 100644
index 3c997e3e89..0000000000
--- a/src/lib/libcrypto/sha/sha512_aarch64.c
+++ /dev/null
@@ -1,34 +0,0 @@
1/* $OpenBSD: sha512_aarch64.c,v 1.1 2025/03/12 14:13:41 jsing Exp $ */
2/*
3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#include <openssl/sha.h>
19
20#include "crypto_arch.h"
21
22void sha512_block_ce(SHA512_CTX *ctx, const void *in, size_t num);
23void sha512_block_generic(SHA512_CTX *ctx, const void *in, size_t num);
24
25void
26sha512_block_data_order(SHA512_CTX *ctx, const void *in, size_t num)
27{
28 if ((crypto_cpu_caps_aarch64 & CRYPTO_CPU_CAPS_AARCH64_SHA512) != 0) {
29 sha512_block_ce(ctx, in, num);
30 return;
31 }
32
33 sha512_block_generic(ctx, in, num);
34}
diff --git a/src/lib/libcrypto/sha/sha512_aarch64_ce.S b/src/lib/libcrypto/sha/sha512_aarch64_ce.S
deleted file mode 100644
index 89109a78ba..0000000000
--- a/src/lib/libcrypto/sha/sha512_aarch64_ce.S
+++ /dev/null
@@ -1,312 +0,0 @@
1/* $OpenBSD: sha512_aarch64_ce.S,v 1.1 2025/03/12 14:13:41 jsing Exp $ */
2/*
3 * Copyright (c) 2023,2025 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18/*
19 * SHA-512 implementation using the ARM Cryptographic Extension (CE).
20 *
21 * The documentation for these is rather inadequate - each instruction is
22 * described in a mechanical sense; however, their combined usage does not
23 * seem to be detailed anywhere.
24 *
25 * There are four instructions that enable hardware acceleration of SHA-512:
26 *
27 * sha512h - hash update, part 1 (without a number to be inconsistent):
28 * inputs <W1:W0 + K1:K0 + g:h>, <f:g>, <d:e>
29 * output T1 for W0, T1 for W1
30 *
31 * sha512h2 - hash update, part 2:
32 * inputs <T1 for W0, T1 for W1>, <c:d>, <a:b>
33 * output <T1 + T2 for W0, T1 + T2 for W1>
34 *
35 * sha512su0 - message schedule update with sigma0 for two rounds:
36 * inputs <W0:W1>, <W2:W3>
37 * output W0 += sigma0(W1), W1 += sigma0(W2)
38 *
39 * sha512su1 - message schedule update with sigma1 for two rounds:
40 * inputs <W0:W1>, <W14:W15>, <W9:W10>
41 * output W0 += sigma1(W14) + W9, W1 += sigma1(W15) + W10
42 */
43
44#define ctx x0
45#define in x1
46#define num x2
47
48#define k512_base x3
49#define k512 x4
50
51/* Note: the lower 64 bits of v8 through v15 are callee save. */
52
53#define hc0 v28
54#define hc1 v29
55#define hc2 v30
56#define hc3 v31
57
58#define hs0 v0
59#define hs1 v1
60#define hs2 v2
61#define hs3 v3
62#define hs4 v4
63#define hs5 v5
64#define hs6 v6
65#define hs7 v7
66
67#define w0 v10
68#define w1 v11
69#define w2 v12
70#define w3 v13
71#define w4 v14
72#define w5 v15
73#define w6 v16
74#define w7 v17
75
76#define k0 v20
77#define k1 v21
78#define k2 v22
79#define k3 v23
80#define k4 v24
81#define k5 v25
82#define k6 v26
83#define k7 v27
84
85#define tmp0 v8
86#define tmp1 v9
87#define tmp2 v18
88
89/*
90 * Update message schedule for m0 (W0:W1), using m1 (W2:W3), m4 (W8:W9),
91 * m5 (W10:W11) and m7 (W14:W15). The sha512su0 instruction computes the sigma0
92 * component of the message schedule update as m0 = sigma0(m1) + m0, while
93 * sha512su1 computes the sigma1 component as m0 = sigma1(m7) + W9:W10 + m0.
94 * Note that W9:W10 is split across two registers, hence this needs to be
95 * constructed before it is passed to sha512su1:
96 *
97 * W0 = sigma1(W14) + W9 + sigma0(W1) + W0
98 */
99#define sha512_message_schedule_update(m0, m1, m4, m5, m7) \
100 sha512su0 m0.2d, m1.2d; /* W0 += sigma0(W1) */ \
101 ext tmp2.16b, m4.16b, m5.16b, #8; /* W9:W10 */ \
102 sha512su1 m0.2d, m7.2d, tmp2.2d; /* W0 += sigma1(W14) + W9 */
103
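
In C, the same update can be written with the ACLE SHA-512 intrinsics (a
sketch under the assumption that arm_neon.h provides vsha512su0q_u64 and
vsha512su1q_u64 with instruction-order operands when built with +sha3; not
part of this file):

#include <arm_neon.h>

/*
 * Sketch: the schedule update above via intrinsics. m0 = W0:W1, m1 = W2:W3,
 * m4 = W8:W9, m5 = W10:W11, m7 = W14:W15.
 */
static inline uint64x2_t
sha512_sched_update(uint64x2_t m0, uint64x2_t m1, uint64x2_t m4,
    uint64x2_t m5, uint64x2_t m7)
{
	uint64x2_t w9_10 = vextq_u64(m4, m5, 1);	/* W9:W10 */

	m0 = vsha512su0q_u64(m0, m1);			/* += sigma0(W1) */
	return vsha512su1q_u64(m0, m7, w9_10);		/* += sigma1(W14) + W9 */
}
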
104/*
105 * Compute two SHA-512 rounds by adding W0:W1 + K0:K1, then computing T1 for two
106 * rounds by swapping the double words, adding g:h and calling sha512h with this
107 * value (W1:W0 = W1:W0 + K1:K0 + g:h), f:g and d:e. The new e:f value is then
108 * computed by adding T1 + c:d (producing the next e:f values), before calling
109 * sha512h2 with T1, c:d and a:b, computing T1 + T2 for two rounds (producing
110 * the next a:b values):
111 *
112 * T1 = h + Sigma1(e) + Ch(e, f, g) + Kt + Wt
113 * T2 = Sigma0(a) + Maj(a, b, c)
114 *
115 * h = g
116 * g = f
117 * f = e
118 * e = d + T1
119 * d = c
120 * c = b
121 * b = a
122 * a = T1 + T2
123 *
124 * The inputs are:
125 *
126 * h0 = a:b
127 * h1 = c:d
128 * h2 = e:f
129 * h3 = g:h
130 *
131 * Producing the following outputs:
132 *
133 * h4 = next a:b
134 * h5 = next e:f
135 *
136 * These values are then rotated by the caller to perform the next two rounds.
137 */
138#define sha512_round(h0, h1, h2, h3, h4, h5, w, k) \
139 add h4.2d, w.2d, k.2d; /* W0:W1 += K0:K1 */ \
140 ext h4.16b, h4.16b, h4.16b, #8; /* W1:W0 (swap) */ \
141 add h4.2d, h4.2d, h3.2d; /* W1:W0 += g:h */ \
142 ext tmp0.16b, h2.16b, h3.16b, #8; /* f:g */ \
143 ext tmp1.16b, h1.16b, h2.16b, #8; /* d:e */ \
144 sha512h h4, tmp0, tmp1.2d; /* T1 */ \
145 add h5.2d, h1.2d, h4.2d; /* c:d + T1 */ \
146 sha512h2 h4, h1, h0.2d; /* T1 + T2 */
147
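
The round pair maps onto the intrinsics the same way; a sketch under the same
arm_neon.h assumptions, mirroring the operand order of the macro above:

#include <arm_neon.h>

/*
 * Sketch: two fused rounds. h0 = a:b, h1 = c:d, h2 = e:f, h3 = g:h and
 * wk = W0:W1 + K0:K1 (precomputed with vaddq_u64). Returns the next a:b
 * in *ab and the next e:f in *ef, as the macro does in h4/h5.
 */
static inline void
sha512_round_pair(uint64x2_t h0, uint64x2_t h1, uint64x2_t h2,
    uint64x2_t h3, uint64x2_t wk, uint64x2_t *ab, uint64x2_t *ef)
{
	uint64x2_t sum, t1;

	sum = vextq_u64(wk, wk, 1);			/* swap to W1:W0 + K1:K0 */
	sum = vaddq_u64(sum, h3);			/* += g:h */
	t1 = vsha512hq_u64(sum, vextq_u64(h2, h3, 1),	/* f:g */
	    vextq_u64(h1, h2, 1));			/* d:e */
	*ef = vaddq_u64(h1, t1);			/* c:d + T1 */
	*ab = vsha512h2q_u64(t1, h1, h0);		/* T1 + T2 */
}
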
148#define sha512_round_update(h0, h1, h2, h3, h4, h5, m0, m1, m2, m3, m4, k) \
149 sha512_message_schedule_update(m0, m1, m2, m3, m4) \
150 sha512_round(h0, h1, h2, h3, h4, h5, m0, k)
151
152.arch armv8-a+sha3
153
154.text
155
156/*
157 * void sha512_block_ce(SHA512_CTX *ctx, const void *in, size_t num);
158 *
159 * Standard ARM ABI: x0 = ctx, x1 = in, x2 = num
160 */
161.globl sha512_block_ce
162sha512_block_ce:
163
164 /* Save low 64 bits of v8 through v15 to the stack. */
165 sub sp, sp, #32
166 st4 {v8.d, v9.d, v10.d, v11.d}[0], [sp]
167 sub sp, sp, #32
168 st4 {v12.d, v13.d, v14.d, v15.d}[0], [sp]
169
170 /* Address of SHA-512 constants. */
171 adrp k512_base, K512
172 add k512_base, k512_base, :lo12:K512
173
174 /*
175 * Load current hash state from context.
176 * hc0 = a:b, hc1 = c:d, hc2 = e:f, hc3 = g:h
177 */
178 ld1 {hc0.2d, hc1.2d, hc2.2d, hc3.2d}, [ctx]
179
180block_loop:
181 mov k512, k512_base
182
183 /* Copy current hash state. */
184 mov hs0.2d, hc0.2d
185 mov hs1.2d, hc1.2d
186 mov hs2.2d, hc2.2d
187 mov hs3.2d, hc3.2d
188
189 /* Load and byte swap message schedule. */
190 ld1 {w0.16b, w1.16b, w2.16b, w3.16b}, [in], #64
191 rev64 w0.16b, w0.16b
192 rev64 w1.16b, w1.16b
193 rev64 w2.16b, w2.16b
194 rev64 w3.16b, w3.16b
195
196 ld1 {w4.2d, w5.2d, w6.2d, w7.2d}, [in], #64
197 rev64 w4.16b, w4.16b
198 rev64 w5.16b, w5.16b
199 rev64 w6.16b, w6.16b
200 rev64 w7.16b, w7.16b
201
202 /* Rounds 0 through 15 (two rounds at a time). */
203 ld1 {k0.2d, k1.2d, k2.2d, k3.2d}, [k512], #64
204 ld1 {k4.2d, k5.2d, k6.2d, k7.2d}, [k512], #64
205
206 sha512_round(hs0, hs1, hs2, hs3, hs4, hs5, w0, k0)
207 sha512_round(hs4, hs0, hs5, hs2, hs6, hs7, w1, k1)
208 sha512_round(hs6, hs4, hs7, hs5, hs1, hs3, w2, k2)
209 sha512_round(hs1, hs6, hs3, hs7, hs0, hs2, w3, k3)
210 sha512_round(hs0, hs1, hs2, hs3, hs4, hs5, w4, k4)
211 sha512_round(hs4, hs0, hs5, hs2, hs6, hs7, w5, k5)
212 sha512_round(hs6, hs4, hs7, hs5, hs1, hs3, w6, k6)
213 sha512_round(hs1, hs6, hs3, hs7, hs0, hs2, w7, k7)
214
215 /* Rounds 16 through 31 (two rounds at a time). */
216 ld1 {k0.2d, k1.2d, k2.2d, k3.2d}, [k512], #64
217 ld1 {k4.2d, k5.2d, k6.2d, k7.2d}, [k512], #64
218
219 sha512_round_update(hs0, hs1, hs2, hs3, hs4, hs5, w0, w1, w4, w5, w7, k0)
220 sha512_round_update(hs4, hs0, hs5, hs2, hs6, hs7, w1, w2, w5, w6, w0, k1)
221 sha512_round_update(hs6, hs4, hs7, hs5, hs1, hs3, w2, w3, w6, w7, w1, k2)
222 sha512_round_update(hs1, hs6, hs3, hs7, hs0, hs2, w3, w4, w7, w0, w2, k3)
223 sha512_round_update(hs0, hs1, hs2, hs3, hs4, hs5, w4, w5, w0, w1, w3, k4)
224 sha512_round_update(hs4, hs0, hs5, hs2, hs6, hs7, w5, w6, w1, w2, w4, k5)
225 sha512_round_update(hs6, hs4, hs7, hs5, hs1, hs3, w6, w7, w2, w3, w5, k6)
226 sha512_round_update(hs1, hs6, hs3, hs7, hs0, hs2, w7, w0, w3, w4, w6, k7)
227
228 /* Rounds 32 through 47 (two rounds at a time). */
229 ld1 {k0.2d, k1.2d, k2.2d, k3.2d}, [k512], #64
230 ld1 {k4.2d, k5.2d, k6.2d, k7.2d}, [k512], #64
231
232 sha512_round_update(hs0, hs1, hs2, hs3, hs4, hs5, w0, w1, w4, w5, w7, k0)
233 sha512_round_update(hs4, hs0, hs5, hs2, hs6, hs7, w1, w2, w5, w6, w0, k1)
234 sha512_round_update(hs6, hs4, hs7, hs5, hs1, hs3, w2, w3, w6, w7, w1, k2)
235 sha512_round_update(hs1, hs6, hs3, hs7, hs0, hs2, w3, w4, w7, w0, w2, k3)
236 sha512_round_update(hs0, hs1, hs2, hs3, hs4, hs5, w4, w5, w0, w1, w3, k4)
237 sha512_round_update(hs4, hs0, hs5, hs2, hs6, hs7, w5, w6, w1, w2, w4, k5)
238 sha512_round_update(hs6, hs4, hs7, hs5, hs1, hs3, w6, w7, w2, w3, w5, k6)
239 sha512_round_update(hs1, hs6, hs3, hs7, hs0, hs2, w7, w0, w3, w4, w6, k7)
240
241 /* Rounds 48 through 63 (two rounds at a time). */
242 ld1 {k0.2d, k1.2d, k2.2d, k3.2d}, [k512], #64
243 ld1 {k4.2d, k5.2d, k6.2d, k7.2d}, [k512], #64
244
245 sha512_round_update(hs0, hs1, hs2, hs3, hs4, hs5, w0, w1, w4, w5, w7, k0)
246 sha512_round_update(hs4, hs0, hs5, hs2, hs6, hs7, w1, w2, w5, w6, w0, k1)
247 sha512_round_update(hs6, hs4, hs7, hs5, hs1, hs3, w2, w3, w6, w7, w1, k2)
248 sha512_round_update(hs1, hs6, hs3, hs7, hs0, hs2, w3, w4, w7, w0, w2, k3)
249 sha512_round_update(hs0, hs1, hs2, hs3, hs4, hs5, w4, w5, w0, w1, w3, k4)
250 sha512_round_update(hs4, hs0, hs5, hs2, hs6, hs7, w5, w6, w1, w2, w4, k5)
251 sha512_round_update(hs6, hs4, hs7, hs5, hs1, hs3, w6, w7, w2, w3, w5, k6)
252 sha512_round_update(hs1, hs6, hs3, hs7, hs0, hs2, w7, w0, w3, w4, w6, k7)
253
254 /* Rounds 64 through 79 (two rounds at a time). */
255 ld1 {k0.2d, k1.2d, k2.2d, k3.2d}, [k512], #64
256 ld1 {k4.2d, k5.2d, k6.2d, k7.2d}, [k512], #64
257
258 sha512_round_update(hs0, hs1, hs2, hs3, hs4, hs5, w0, w1, w4, w5, w7, k0)
259 sha512_round_update(hs4, hs0, hs5, hs2, hs6, hs7, w1, w2, w5, w6, w0, k1)
260 sha512_round_update(hs6, hs4, hs7, hs5, hs1, hs3, w2, w3, w6, w7, w1, k2)
261 sha512_round_update(hs1, hs6, hs3, hs7, hs0, hs2, w3, w4, w7, w0, w2, k3)
262 sha512_round_update(hs0, hs1, hs2, hs3, hs4, hs5, w4, w5, w0, w1, w3, k4)
263 sha512_round_update(hs4, hs0, hs5, hs2, hs6, hs7, w5, w6, w1, w2, w4, k5)
264 sha512_round_update(hs6, hs4, hs7, hs5, hs1, hs3, w6, w7, w2, w3, w5, k6)
265 sha512_round_update(hs1, hs6, hs3, hs7, hs0, hs2, w7, w0, w3, w4, w6, k7)
266
267 /* Add intermediate state to hash state. */
268 add hc0.2d, hc0.2d, hs0.2d
269 add hc1.2d, hc1.2d, hs1.2d
270 add hc2.2d, hc2.2d, hs2.2d
271 add hc3.2d, hc3.2d, hs3.2d
272
273 sub num, num, #1
274 cbnz num, block_loop
275
276 /* Store hash state to context. */
277 st1 {hc0.2d, hc1.2d, hc2.2d, hc3.2d}, [ctx]
278
279 /* Restore low 64 bits of v8 through v15 from the stack. */
280 ld4 {v12.d, v13.d, v14.d, v15.d}[0], [sp], #32
281 ld4 {v8.d, v9.d, v10.d, v11.d}[0], [sp], #32
282
283 ret
284
285/*
286 * SHA-512 constants - see FIPS 180-4 section 4.2.3.
287 */
288.rodata
289.align 4
290.type K512,@object
291K512:
292.quad 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
293.quad 0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
294.quad 0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
295.quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694
296.quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
297.quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
298.quad 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4
299.quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70
300.quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
301.quad 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b
302.quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30
303.quad 0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8
304.quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
305.quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
306.quad 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec
307.quad 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b
308.quad 0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
309.quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b
310.quad 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
311.quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
312.size K512,.-K512
diff --git a/src/lib/libcrypto/sha/sha512_amd64.c b/src/lib/libcrypto/sha/sha512_amd64.c
deleted file mode 100644
index 0b54243020..0000000000
--- a/src/lib/libcrypto/sha/sha512_amd64.c
+++ /dev/null
@@ -1,26 +0,0 @@
1/* $OpenBSD: sha512_amd64.c,v 1.1 2024/11/16 14:56:39 jsing Exp $ */
2/*
3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#include <openssl/sha.h>
19
20void sha512_block_generic(SHA512_CTX *ctx, const void *in, size_t num);
21
22void
23sha512_block_data_order(SHA512_CTX *ctx, const void *in, size_t num)
24{
25 sha512_block_generic(ctx, in, num);
26}
diff --git a/src/lib/libcrypto/sha/sha512_amd64_generic.S b/src/lib/libcrypto/sha/sha512_amd64_generic.S
deleted file mode 100644
index 8419d60b8e..0000000000
--- a/src/lib/libcrypto/sha/sha512_amd64_generic.S
+++ /dev/null
@@ -1,307 +0,0 @@
1/* $OpenBSD: sha512_amd64_generic.S,v 1.1 2024/11/16 14:56:39 jsing Exp $ */
2/*
3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#ifdef __CET__
19#include <cet.h>
20#else
21#define _CET_ENDBR
22#endif
23
24#define ctx %rdi
25#define in %rsi
26#define num %rdx
27
28#define round %rdi
29
30#define hs0 %r8
31#define hs1 %r9
32#define hs2 %r10
33#define hs3 %r11
34#define hs4 %r12
35#define hs5 %r13
36#define hs6 %r14
37#define hs7 %r15
38
39#define k512 %rbp
40
41#define tmp0 %rax
42#define tmp1 %rbx
43#define tmp2 %rcx
44#define tmp3 %rdx
45
46/*
47 * Load message into wt, storing a copy in the message schedule:
48 *
49 * Wt = Mt
50 */
51#define sha512_message_schedule_load(idx, m, w, wt) \
52 movq (m, round, 8), wt; \
53 bswapq wt; \
54 movq wt, ((idx&0xf)*8)(w);
55
56/*
57 * Update message schedule and return current value in wt:
58 *
59 * Wt = sigma1(W(t-2)) + W(t-7) + sigma0(W(t-15)) + W(t-16)
60 *
61 * sigma0(x) = ror(x, 1) ^ ror(x, 8) ^ (x >> 7)
62 * sigma1(x) = ror(x, 19) ^ ror(x, 61) ^ (x >> 6)
63 *
64 */
65#define sha512_message_schedule_update(idx, w, wt) \
66 movq (((idx-2)&0xf)*8)(w), wt; /* sigma1 */ \
67 movq wt, tmp1; /* sigma1 */ \
68 rorq $(61-19), tmp1; /* sigma1 */ \
69 xorq wt, tmp1; /* sigma1 */ \
70 rorq $19, tmp1; /* sigma1 */ \
71 shrq $6, wt; /* sigma1 */ \
72 xorq tmp1, wt; /* sigma1 */ \
73 \
74 addq (((idx-7)&0xf)*8)(w), wt; /* Wt-7 */ \
75 addq (((idx-16)&0xf)*8)(w), wt; /* Wt-16 */ \
76 \
77 movq (((idx-15)&0xf)*8)(w), tmp2; /* sigma0 */ \
78 movq tmp2, tmp3; /* sigma0 */ \
79 rorq $(8-1), tmp2; /* sigma0 */ \
80 xorq tmp3, tmp2; /* sigma0 */ \
81 rorq $1, tmp2; /* sigma0 */ \
82 shrq $7, tmp3; /* sigma0 */ \
83 xorq tmp3, tmp2; /* sigma0 */ \
84 addq tmp2, wt; /* sigma0 */ \
85 \
86 movq wt, ((idx&0xf)*8)(w);
87
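
The paired rorq instructions above rely on rotation distributing over XOR:
ror(ror(x, 61 - 19) ^ x, 19) == ror(x, 61) ^ ror(x, 19), which lets both
rotations share a single temporary register. The same trick in C (a sketch
for illustration):

#include <stdint.h>

static inline uint64_t
ror64(uint64_t x, int n)
{
	return (x >> n) | (x << (64 - n));
}

/* sigma1 via the composed-rotate trick used by the macro above. */
static inline uint64_t
sigma1(uint64_t x)
{
	return ror64(ror64(x, 61 - 19) ^ x, 19) ^ (x >> 6);
}

/* sigma0, likewise: ror(ror(x, 8 - 1) ^ x, 1) == ror(x, 8) ^ ror(x, 1). */
static inline uint64_t
sigma0(uint64_t x)
{
	return ror64(ror64(x, 8 - 1) ^ x, 1) ^ (x >> 7);
}
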
88/*
89 * Compute a SHA-512 round:
90 *
91 * T1 = h + Sigma1(e) + Ch(e, f, g) + Kt + Wt
92 * T2 = Sigma0(a) + Maj(a, b, c)
93 *
94 * Sigma0(x) = ror(x, 28) ^ ror(x, 34) ^ ror(x, 39)
95 * Sigma1(x) = ror(x, 14) ^ ror(x, 18) ^ ror(x, 41)
96 * Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z
97 * Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z)
98 *
99 * Upon completion d = d + T1, h = T1 + T2, pending rotation.
100 */
101#define sha512_round(idx, a, b, c, d, e, f, g, h, k, w, wt) \
102 addq wt, h; /* T1 Wt */ \
103 addq (k512, round, 8), h; /* T1 Kt */ \
104 \
105 movq e, tmp1; /* T1 Sigma1 */ \
106 rorq $(41-18), tmp1; /* T1 Sigma1 */ \
107 xorq e, tmp1; /* T1 Sigma1 */ \
108 rorq $(18-14), tmp1; /* T1 Sigma1 */ \
109 xorq e, tmp1; /* T1 Sigma1 */ \
110 rorq $14, tmp1; /* T1 Sigma1 */ \
111 addq tmp1, h; /* T1 Sigma1 */ \
112 \
113 movq f, tmp2; /* T1 Ch */ \
114 xorq g, tmp2; /* T1 Ch */ \
115 andq e, tmp2; /* T1 Ch */ \
116 xorq g, tmp2; /* T1 Ch */ \
117 addq tmp2, h; /* T1 Ch */ \
118 \
119 addq h, d; /* d += T1 */ \
120 \
121 movq a, tmp1; /* T2 Sigma0 */ \
122 rorq $(39-34), tmp1; /* T2 Sigma0 */ \
123 xorq a, tmp1; /* T2 Sigma0 */ \
124 rorq $(34-28), tmp1; /* T2 Sigma0 */ \
125 xorq a, tmp1; /* T2 Sigma0 */ \
126 rorq $28, tmp1; /* T2 Sigma0 */ \
127 addq tmp1, h; /* T2 Sigma0 */ \
128 \
129 movq b, tmp2; /* T2 Maj */ \
130 xorq c, tmp2; /* T2 Maj */ \
131 andq a, tmp2; /* T2 Maj */ \
132 movq b, tmp3; /* T2 Maj */ \
133 andq c, tmp3; /* T2 Maj */ \
134 xorq tmp2, tmp3; /* T2 Maj */ \
135 addq tmp3, h; /* T2 Maj */ \
136 \
137 addq $1, round;
138
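
The Ch and Maj forms noted in the comment above (and used by the round macro)
each save an operation over the textbook definitions; both identities are
easy to verify bit by bit, by case analysis on x. In C (a sketch):

#include <stdint.h>

/* Ch(x, y, z) = (x & y) ^ (~x & z), rewritten as the macro computes it. */
static inline uint64_t
ch(uint64_t x, uint64_t y, uint64_t z)
{
	return ((y ^ z) & x) ^ z;
}

/* Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z), likewise. */
static inline uint64_t
maj(uint64_t x, uint64_t y, uint64_t z)
{
	return ((y ^ z) & x) ^ (y & z);
}
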
139#define sha512_round_load(idx, a, b, c, d, e, f, g, h) \
140 sha512_message_schedule_load(idx, in, %rsp, tmp0) \
141 sha512_round(idx, a, b, c, d, e, f, g, h, k512, %rsp, tmp0)
142
143#define sha512_round_update(idx, a, b, c, d, e, f, g, h) \
144 sha512_message_schedule_update(idx, %rsp, tmp0) \
145 sha512_round(idx, a, b, c, d, e, f, g, h, k512, %rsp, tmp0)
146
147.text
148
149/*
150 * void sha512_block_generic(SHA512_CTX *ctx, const void *in, size_t num);
151 *
152 * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
153 */
154.align 16
155.globl sha512_block_generic
156.type sha512_block_generic,@function
157sha512_block_generic:
158 _CET_ENDBR
159
160 /* Save callee save registers. */
161 pushq %rbx
162 pushq %rbp
163 pushq %r12
164 pushq %r13
165 pushq %r14
166 pushq %r15
167
168	/* Allocate space for message schedule, end of message, context pointer and saved stack pointer. */
169 movq %rsp, %rax
170 subq $(128+3*8), %rsp
171 andq $~63, %rsp
172 movq %rax, (128+2*8)(%rsp)
173 movq ctx, (128+1*8)(%rsp)
174
175 /* Compute and store end of message. */
176 shlq $7, num
177 leaq (in, num, 1), %rbx
178 movq %rbx, (128+0*8)(%rsp)
179
180 /* Address of SHA-512 constants. */
181 leaq K512(%rip), k512
182
183 /* Load current hash state from context. */
184 movq (0*8)(ctx), hs0
185 movq (1*8)(ctx), hs1
186 movq (2*8)(ctx), hs2
187 movq (3*8)(ctx), hs3
188 movq (4*8)(ctx), hs4
189 movq (5*8)(ctx), hs5
190 movq (6*8)(ctx), hs6
191 movq (7*8)(ctx), hs7
192
193 jmp .Lblock_loop0
194
195.align 16
196.Lblock_loop0:
197 mov $0, round
198
199	/* Rounds 0 through 15. */
200 sha512_round_load(0, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
201 sha512_round_load(1, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
202 sha512_round_load(2, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
203 sha512_round_load(3, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
204 sha512_round_load(4, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
205 sha512_round_load(5, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
206 sha512_round_load(6, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
207 sha512_round_load(7, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
208 sha512_round_load(8, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
209 sha512_round_load(9, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
210 sha512_round_load(10, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
211 sha512_round_load(11, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
212 sha512_round_load(12, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
213 sha512_round_load(13, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
214 sha512_round_load(14, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
215 sha512_round_load(15, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
216
217 jmp .Lblock_loop16
218
219.align 16
220.Lblock_loop16:
221	/* Rounds 16 through 79 - sixteen rounds per pass, four passes. */
222 sha512_round_update(16, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
223 sha512_round_update(17, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
224 sha512_round_update(18, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
225 sha512_round_update(19, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
226 sha512_round_update(20, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
227 sha512_round_update(21, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
228 sha512_round_update(22, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
229 sha512_round_update(23, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
230 sha512_round_update(24, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
231 sha512_round_update(25, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
232 sha512_round_update(26, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
233 sha512_round_update(27, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
234 sha512_round_update(28, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
235 sha512_round_update(29, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
236 sha512_round_update(30, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
237 sha512_round_update(31, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
238
239 cmp $80, round
240 jb .Lblock_loop16
241
242 movq (128+1*8)(%rsp), ctx
243
244	/* Add the previous hash state into the intermediate state. */
245 addq (0*8)(ctx), hs0
246 addq (1*8)(ctx), hs1
247 addq (2*8)(ctx), hs2
248 addq (3*8)(ctx), hs3
249 addq (4*8)(ctx), hs4
250 addq (5*8)(ctx), hs5
251 addq (6*8)(ctx), hs6
252 addq (7*8)(ctx), hs7
253
254 /* Store new hash state to context. */
255 movq hs0, (0*8)(ctx)
256 movq hs1, (1*8)(ctx)
257 movq hs2, (2*8)(ctx)
258 movq hs3, (3*8)(ctx)
259 movq hs4, (4*8)(ctx)
260 movq hs5, (5*8)(ctx)
261 movq hs6, (6*8)(ctx)
262 movq hs7, (7*8)(ctx)
263
264 addq $128, in
265 cmpq (128+0*8)(%rsp), in
266 jb .Lblock_loop0
267
268 movq (128+2*8)(%rsp), %rsp
269
270	/* Restore callee-saved registers. */
271 popq %r15
272 popq %r14
273 popq %r13
274 popq %r12
275 popq %rbp
276 popq %rbx
277
278 ret
279
280/*
281 * SHA-512 constants - see FIPS 180-4 section 4.2.3 (the first sixty-four bits of the fractional parts of the cube roots of the first eighty primes).
282 */
283.rodata
284.align 64
285.type K512,@object
286K512:
287.quad 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
288.quad 0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
289.quad 0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
290.quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694
291.quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
292.quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
293.quad 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4
294.quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70
295.quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
296.quad 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b
297.quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30
298.quad 0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8
299.quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
300.quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
301.quad 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec
302.quad 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b
303.quad 0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
304.quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b
305.quad 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
306.quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
307.size K512,.-K512
diff --git a/src/lib/libcrypto/sha/sha_internal.h b/src/lib/libcrypto/sha/sha_internal.h
deleted file mode 100644
index 63cae3d3b3..0000000000
--- a/src/lib/libcrypto/sha/sha_internal.h
+++ /dev/null
@@ -1,36 +0,0 @@
1/* $OpenBSD: sha_internal.h,v 1.3 2023/04/25 15:47:29 tb Exp $ */
2/*
3 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#include <openssl/sha.h>
19
20#ifndef HEADER_SHA_INTERNAL_H
21#define HEADER_SHA_INTERNAL_H
22
23#define SHA512_224_DIGEST_LENGTH 28
24#define SHA512_256_DIGEST_LENGTH 32
25
26int SHA512_224_Init(SHA512_CTX *c);
27int SHA512_224_Update(SHA512_CTX *c, const void *data, size_t len)
28 __attribute__ ((__bounded__(__buffer__,2,3)));
29int SHA512_224_Final(unsigned char *md, SHA512_CTX *c);
30
31int SHA512_256_Init(SHA512_CTX *c);
32int SHA512_256_Update(SHA512_CTX *c, const void *data, size_t len)
33 __attribute__ ((__bounded__(__buffer__,2,3)));
34int SHA512_256_Final(unsigned char *md, SHA512_CTX *c);
35
36#endif
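A hedged one-shot usage sketch for the SHA-512/256 interface declared above (digest_sha512_256() is a local helper invented for this example, not part of libcrypto; the SHA-512/224 functions are used the same way). Following the usual OpenSSL convention, the Init/Update/Final functions return 1 on success and 0 on failure:

#include <stddef.h>

#include <openssl/sha.h>

#include "sha_internal.h"

static int
digest_sha512_256(const void *data, size_t len,
    unsigned char md[SHA512_256_DIGEST_LENGTH])
{
	SHA512_CTX ctx;

	if (!SHA512_256_Init(&ctx))
		return 0;
	if (!SHA512_256_Update(&ctx, data, len))
		return 0;
	return SHA512_256_Final(md, &ctx);
}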