path: root/src/lib/libcrypto/sha/asm
Diffstat (limited to 'src/lib/libcrypto/sha/asm')
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-586.pl          1223
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-alpha.pl         316
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-armv4-large.pl   248
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-mips.pl          350
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-parisc.pl        258
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha1-ppc.pl           318
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-sparcv9.pl       282
-rw-r--r--  src/lib/libcrypto/sha/asm/sha256-586.pl         249
-rw-r--r--  src/lib/libcrypto/sha/asm/sha256-armv4.pl       211
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-586.pl         646
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-armv4.pl       582
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-mips.pl        457
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-parisc.pl      801
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-ppc.pl         444
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-sparcv9.pl     604
15 files changed, 0 insertions, 6989 deletions
diff --git a/src/lib/libcrypto/sha/asm/sha1-586.pl b/src/lib/libcrypto/sha/asm/sha1-586.pl
deleted file mode 100644
index 5928e083c1..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-586.pl
+++ /dev/null
@@ -1,1223 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# "[Re]written" was achieved in two major overhauls. In 2004 BODY_*
11# functions were re-implemented to address P4 performance issue [see
12# commentary below], and in 2006 the rest was rewritten in order to
13# gain freedom to liberate licensing terms.
14
15# January, September 2004.
16#
17# It was noted that the Intel IA-32 C compiler generates code which
18# performs ~30% *faster* on the P4 CPU than the original *hand-coded*
19# SHA1 assembler implementation. To address this problem (and
20# prove that humans are still better than machines:-), the
21# original code was overhauled, which resulted in the following
22# performance changes:
23#
24# compared with original compared with Intel cc
25# assembler impl. generated code
26# Pentium -16% +48%
27# PIII/AMD +8% +16%
28# P4 +85%(!) +45%
29#
30# As you can see, Pentium came out as the loser:-( Yet I reckoned that
31# the improvement on P4 outweighs the loss and incorporated this
32# re-tuned code into 0.9.7 and later.
33# ----------------------------------------------------------------
34# <appro@fy.chalmers.se>
35
36# August 2009.
37#
38# George Spelvin has pointed out that F_40_59(b,c,d) can be rewritten as
39# '(c&d) + (b&(c^d))', which makes it possible to accumulate partial
40# results and lighten "pressure" on scratch registers. This resulted in a
41# >12% performance improvement on contemporary AMD cores (with no
42# degradation on other CPUs:-). Also, the code was revised to maximize the
43# "distance" between the instructions producing input to the 'lea'
44# instruction and the 'lea' instruction itself, which is essential for the
45# Intel Atom core and resulted in ~15% improvement.
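
A minimal stand-alone Perl sketch (illustrative, not part of the module) of why that rewrite is legal: the two terms of '(c&d) + (b&(c^d))' never set the same bit, so '+' can stand in for '|' and the addition can be split into two separate accumulations.

	for (1..1000) {
		my ($b, $c, $d) = map { int(rand(2**32)) } 1..3;
		my $maj = ($b & $c) | ($b & $d) | ($c & $d);	# F_40_59(b,c,d) as specified
		my $alt = ($c & $d) + ($b & ($c ^ $d));		# Spelvin's rewritten form
		die "mismatch\n" unless $maj == $alt;
	}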
46
47# October 2010.
48#
49# Add an SSSE3, Supplemental[!] SSE3, implementation. The idea behind it is
50# to offload the message schedule, denoted by Wt in the NIST specification,
51# or Xupdate in the OpenSSL source, to the SIMD unit. The idea is not novel,
52# and in an SSE2 context it was first explored by Dean Gaudet in 2004, see
53# http://arctic.org/~dean/crypto/sha1.html. Since then several things
54# have changed that made it interesting again:
55#
56# a) XMM units became faster and wider;
57# b) the instruction set became more versatile;
58# c) an important observation was made by Max Locktyukhin, which made
59# it possible to reduce the number of instructions required to perform
60# the operation in question; for further details see
61# http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/.
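
For reference, the recurrence being offloaded is the FIPS 180 message schedule, W[t] = ROTL1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]); a scalar Perl sketch (the helper name is illustrative), with @W holding the 16 most recent schedule words, oldest first:

	sub Xupdate_scalar {
		my @W = @_;					# W[t-16] .. W[t-1]
		my $x = $W[13] ^ $W[8] ^ $W[2] ^ $W[0];		# W[t-3]^W[t-8]^W[t-14]^W[t-16]
		return (($x << 1) | ($x >> 31)) & 0xffffffff;	# ROTL1
	}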
62
63# April 2011.
64#
65# Add an AVX code path, probably the most controversial... The thing is
66# that the switch to AVX alone improves performance by as little as 4% in
67# comparison to the SSSE3 code path. But the result below doesn't look
68# like a 4% improvement... The trouble is that Sandy Bridge decodes
69# 'ro[rl]' as a pair of µ-ops, and it's the additional µ-ops, two per
70# round, that make it run slower than Core2 and Westmere. But 'sh[rl]d'
71# is decoded as a single µ-op by Sandy Bridge, and it's replacing 'ro[rl]'
72# with the equivalent 'sh[rl]d' that is responsible for the impressive
73# 5.1 cycles per processed byte. But 'sh[rl]d' is not something that used
74# to be fast, nor does it appear to be fast in the upcoming Bulldozer
75# [according to its optimization manual]. Which is why the AVX code path
76# is guarded by *both* AVX and a synthetic bit denoting Intel CPUs.
77# One can argue that it's unfair to AMD, but without 'sh[rl]d' it
78# makes no sense to keep the AVX code path. If somebody feels that
79# strongly, it's probably more appropriate to discuss the possibility of
80# using the XOP vector rotate on AMD...
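
The substitution is legal because 'shld r,r,n' with the same register as both source and destination is a left rotate; a quick stand-alone Perl check of that equivalence on 32-bit values (an editorial sketch, not from the module):

	sub rol32  { my ($x, $n) = @_; (($x << $n) | ($x >> (32 - $n))) & 0xffffffff }
	sub shld32 { my ($dst, $src, $n) = @_;	# dst<<n, low bits filled from src's high bits
		     (($dst << $n) | ($src >> (32 - $n))) & 0xffffffff }
	for (1..100) {
		my ($x, $n) = (int(rand(2**32)), 1 + int(rand(31)));
		die "differ\n" unless rol32($x, $n) == shld32($x, $x, $n);
	}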
81
82######################################################################
83# Current performance is summarized in the following table. Numbers are
84# CPU clock cycles spent to process a single byte (less is better).
85#
86# x86 SSSE3 AVX
87# Pentium 15.7 -
88# PIII 11.5 -
89# P4 10.6 -
90# AMD K8 7.1 -
91# Core2 7.3 6.1/+20% -
92# Atom 12.5 9.5(*)/+32% -
93# Westmere 7.3 5.6/+30% -
94# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+70%
95#
96# (*) Loop is 1056 instructions long and expected result is ~8.25.
97# It remains a mystery [to me] why ILP is limited to 1.7.
98#
99# (**) As per above comment, the result is for AVX *plus* sh[rl]d.
100
101$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
102push(@INC,"${dir}","${dir}../../perlasm");
103require "x86asm.pl";
104
105&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");
106
107$xmm=$ymm=0;
108for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
109
110$ymm=1 if ($xmm &&
111 `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
112 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
113 $1>=2.19); # first version supporting AVX
114
115&external_label("OPENSSL_ia32cap_P") if ($xmm);
116
117
118$A="eax";
119$B="ebx";
120$C="ecx";
121$D="edx";
122$E="edi";
123$T="esi";
124$tmp1="ebp";
125
126@V=($A,$B,$C,$D,$E,$T);
127
128$alt=0; # 1 denotes alternative IALU implementation, which performs
129 # 8% *worse* on P4, same on Westmere and Atom, 2% better on
130 # Sandy Bridge...
131
132sub BODY_00_15
133 {
134 local($n,$a,$b,$c,$d,$e,$f)=@_;
135
136 &comment("00_15 $n");
137
138 &mov($f,$c); # f to hold F_00_19(b,c,d)
139 if ($n==0) { &mov($tmp1,$a); }
140 else { &mov($a,$tmp1); }
141 &rotl($tmp1,5); # tmp1=ROTATE(a,5)
142 &xor($f,$d);
143 &add($tmp1,$e); # tmp1+=e;
144 &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded
145 # with xi, also note that e becomes
146 # f in next round...
147 &and($f,$b);
148 &rotr($b,2); # b=ROTATE(b,30)
149 &xor($f,$d); # f holds F_00_19(b,c,d)
150 &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi
151
152 if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round
153 &add($f,$tmp1); } # f+=tmp1
154 else { &add($tmp1,$f); } # f becomes a in next round
155 &mov($tmp1,$a) if ($alt && $n==15);
156 }
157
158sub BODY_16_19
159 {
160 local($n,$a,$b,$c,$d,$e,$f)=@_;
161
162 &comment("16_19 $n");
163
164if ($alt) {
165 &xor($c,$d);
166 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
167 &and($tmp1,$c); # tmp1 to hold F_00_19(b,c,d), b&=c^d
168 &xor($f,&swtmp(($n+8)%16));
169 &xor($tmp1,$d); # tmp1=F_00_19(b,c,d)
170 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
171 &rotl($f,1); # f=ROTATE(f,1)
172 &add($e,$tmp1); # e+=F_00_19(b,c,d)
173 &xor($c,$d); # restore $c
174 &mov($tmp1,$a); # b in next round
175 &rotr($b,$n==16?2:7); # b=ROTATE(b,30)
176 &mov(&swtmp($n%16),$f); # xi=f
177 &rotl($a,5); # ROTATE(a,5)
178 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
179 &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
180 &add($f,$a); # f+=ROTATE(a,5)
181} else {
182 &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d)
183 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
184 &xor($tmp1,$d);
185 &xor($f,&swtmp(($n+8)%16));
186 &and($tmp1,$b);
187 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
188 &rotl($f,1); # f=ROTATE(f,1)
189 &xor($tmp1,$d); # tmp1=F_00_19(b,c,d)
190 &add($e,$tmp1); # e+=F_00_19(b,c,d)
191 &mov($tmp1,$a);
192 &rotr($b,2); # b=ROTATE(b,30)
193 &mov(&swtmp($n%16),$f); # xi=f
194 &rotl($tmp1,5); # ROTATE(a,5)
195 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
196 &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
197 &add($f,$tmp1); # f+=ROTATE(a,5)
198}
199 }
200
201sub BODY_20_39
202 {
203 local($n,$a,$b,$c,$d,$e,$f)=@_;
204 local $K=($n<40)?0x6ed9eba1:0xca62c1d6;
205
206 &comment("20_39 $n");
207
208if ($alt) {
209 &xor($tmp1,$c); # tmp1 to hold F_20_39(b,c,d), b^=c
210 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
211 &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d)
212 &xor($f,&swtmp(($n+8)%16));
213 &add($e,$tmp1); # e+=F_20_39(b,c,d)
214 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
215 &rotl($f,1); # f=ROTATE(f,1)
216 &mov($tmp1,$a); # b in next round
217 &rotr($b,7); # b=ROTATE(b,30)
218 &mov(&swtmp($n%16),$f) if($n<77);# xi=f
219 &rotl($a,5); # ROTATE(a,5)
220 &xor($b,$c) if($n==39);# warm up for BODY_40_59
221 &and($tmp1,$b) if($n==39);
222 &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY
223 &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
224 &add($f,$a); # f+=ROTATE(a,5)
225 &rotr($a,5) if ($n==79);
226} else {
227 &mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d)
228 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
229 &xor($tmp1,$c);
230 &xor($f,&swtmp(($n+8)%16));
231 &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d)
232 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
233 &rotl($f,1); # f=ROTATE(f,1)
234 &add($e,$tmp1); # e+=F_20_39(b,c,d)
235 &rotr($b,2); # b=ROTATE(b,30)
236 &mov($tmp1,$a);
237 &rotl($tmp1,5); # ROTATE(a,5)
238 &mov(&swtmp($n%16),$f) if($n<77);# xi=f
239 &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY
240 &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
241 &add($f,$tmp1); # f+=ROTATE(a,5)
242}
243 }
244
245sub BODY_40_59
246 {
247 local($n,$a,$b,$c,$d,$e,$f)=@_;
248
249 &comment("40_59 $n");
250
251if ($alt) {
252 &add($e,$tmp1); # e+=b&(c^d)
253 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
254 &mov($tmp1,$d);
255 &xor($f,&swtmp(($n+8)%16));
256 &xor($c,$d); # restore $c
257 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
258 &rotl($f,1); # f=ROTATE(f,1)
259 &and($tmp1,$c);
260 &rotr($b,7); # b=ROTATE(b,30)
261 &add($e,$tmp1); # e+=c&d
262 &mov($tmp1,$a); # b in next round
263 &mov(&swtmp($n%16),$f); # xi=f
264 &rotl($a,5); # ROTATE(a,5)
265 &xor($b,$c) if ($n<59);
266 &and($tmp1,$b) if ($n<59);# tmp1 to hold F_40_59(b,c,d)
267 &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d))
268 &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
269 &add($f,$a); # f+=ROTATE(a,5)
270} else {
271 &mov($tmp1,$c); # tmp1 to hold F_40_59(b,c,d)
272 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
273 &xor($tmp1,$d);
274 &xor($f,&swtmp(($n+8)%16));
275 &and($tmp1,$b);
276 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
277 &rotl($f,1); # f=ROTATE(f,1)
278 &add($tmp1,$e); # b&(c^d)+=e
279 &rotr($b,2); # b=ROTATE(b,30)
280 &mov($e,$a); # e becomes volatile
281 &rotl($e,5); # ROTATE(a,5)
282 &mov(&swtmp($n%16),$f); # xi=f
283 &lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d))
284 &mov($tmp1,$c);
285 &add($f,$e); # f+=ROTATE(a,5)
286 &and($tmp1,$d);
287 &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
288 &add($f,$tmp1); # f+=c&d
289}
290 }
291
292&function_begin("sha1_block_data_order");
293if ($xmm) {
294 &static_label("ssse3_shortcut");
295 &static_label("avx_shortcut") if ($ymm);
296 &static_label("K_XX_XX");
297
298 &picsetup($tmp1);
299 &picsymbol($T, "OPENSSL_ia32cap_P", $tmp1);
300 &picsymbol($tmp1, &label("K_XX_XX"), $tmp1);
301
302 &mov ($A,&DWP(0,$T));
303 &mov ($D,&DWP(4,$T));
304 &test ($D,"\$IA32CAP_MASK1_SSSE3"); # check SSSE3 bit
305 &jz (&label("x86"));
306 &test ($A,"\$IA32CAP_MASK0_FXSR"); # check FXSR bit
307 &jz (&label("x86"));
308 if ($ymm) {
309 &and ($D,"\$IA32CAP_MASK1_AVX"); # mask AVX bit
310 &and ($A,"\$IA32CAP_MASK0_INTEL"); # mask "Intel CPU" bit
311 &or ($A,$D);
312 &cmp ($A,"\$(IA32CAP_MASK1_AVX | IA32CAP_MASK0_INTEL)");
313 &je (&label("avx_shortcut"));
314 }
315 &jmp (&label("ssse3_shortcut"));
316 &set_label("x86",16);
317}
318 &mov($tmp1,&wparam(0)); # SHA_CTX *c
319 &mov($T,&wparam(1)); # const void *input
320 &mov($A,&wparam(2)); # size_t num
321 &stack_push(16+3); # allocate X[16]
322 &shl($A,6);
323 &add($A,$T);
324 &mov(&wparam(2),$A); # pointer beyond the end of input
325 &mov($E,&DWP(16,$tmp1));# pre-load E
326 &jmp(&label("loop"));
327
328&set_label("loop",16);
329
330 # copy input chunk to X, but reversing byte order!
331 for ($i=0; $i<16; $i+=4)
332 {
333 &mov($A,&DWP(4*($i+0),$T));
334 &mov($B,&DWP(4*($i+1),$T));
335 &mov($C,&DWP(4*($i+2),$T));
336 &mov($D,&DWP(4*($i+3),$T));
337 &bswap($A);
338 &bswap($B);
339 &bswap($C);
340 &bswap($D);
341 &mov(&swtmp($i+0),$A);
342 &mov(&swtmp($i+1),$B);
343 &mov(&swtmp($i+2),$C);
344 &mov(&swtmp($i+3),$D);
345 }
346 &mov(&wparam(1),$T); # redundant in 1st spin
347
348 &mov($A,&DWP(0,$tmp1)); # load SHA_CTX
349 &mov($B,&DWP(4,$tmp1));
350 &mov($C,&DWP(8,$tmp1));
351 &mov($D,&DWP(12,$tmp1));
352 # E is pre-loaded
353
354 for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
355 for(;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
356 for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
357 for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
358 for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
359
360 (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check
361
362 &mov($tmp1,&wparam(0)); # re-load SHA_CTX*
363 &mov($D,&wparam(1)); # D is last "T" and is discarded
364
365 &add($E,&DWP(0,$tmp1)); # E is last "A"...
366 &add($T,&DWP(4,$tmp1));
367 &add($A,&DWP(8,$tmp1));
368 &add($B,&DWP(12,$tmp1));
369 &add($C,&DWP(16,$tmp1));
370
371 &mov(&DWP(0,$tmp1),$E); # update SHA_CTX
372 &add($D,64); # advance input pointer
373 &mov(&DWP(4,$tmp1),$T);
374 &cmp($D,&wparam(2)); # have we reached the end yet?
375 &mov(&DWP(8,$tmp1),$A);
376 &mov($E,$C); # C is last "E" which needs to be "pre-loaded"
377 &mov(&DWP(12,$tmp1),$B);
378 &mov($T,$D); # input pointer
379 &mov(&DWP(16,$tmp1),$C);
380 &jb(&label("loop"));
381
382 &stack_pop(16+3);
383&function_end("sha1_block_data_order");
384
385if ($xmm) {
386######################################################################
387# The SSSE3 implementation.
388#
389# %xmm[0-7] are used as a ring @X[] buffer containing quadruples of the
390# last 32 elements of the message schedule or Xupdate outputs. The first
391# 4 quadruples are simply byte-swapped input, the next 4 are calculated
392# according to the method originally suggested by Dean Gaudet (modulo
393# being implemented in SSSE3). Once 8 quadruples or 32 elements are
394# collected, it switches to the routine proposed by Max Locktyukhin.
395#
396# Calculations inevitably require temporary registers, and there are
397# no %xmm registers left to spare. For this reason part of the ring
398# buffer, X[2..4] to be specific, is offloaded to a 3-quadruple ring
399# buffer on the stack. Keep in mind that X[2] is an alias for X[-6],
400# X[3] for X[-5], and X[4] for X[-4]...
401#
402# Another notable optimization is aggressive stack frame compression,
403# aiming to minimize the number of 9-byte instructions...
404#
405# Yet another notable optimization is the "jumping" $B variable. It means
406# that there is no register permanently allocated for the $B value. This
407# made it possible to eliminate one instruction from body_20_39...
408#
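
A small stand-alone sketch (values assumed from the code below) of the modular indexing used throughout: negative logical indices are folded into the 8-entry ring with '& 7', and push/shift "rotates" the ring once per 4-wide Xupdate.

	my @ring = map { "xmm$_" } (4..7, 0..3);	# the pre-seeded @X ring
	printf "%d\n", -6 & 7;				# 2: logical "X[-6]" and "X[2]" are one slot
	printf "%s\n", $ring[-4 & 7];			# xmm0, the slot the code calls @X[-4&7] at $Xi==4
	push(@ring, shift(@ring));			# "rotate" X[]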
409my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded
410my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
411my @V=($A,$B,$C,$D,$E);
412my $j=0; # hash round
413my @T=($T,$tmp1);
414my $inp;
415
416my $_rol=sub { &rol(@_) };
417my $_ror=sub { &ror(@_) };
418
419&function_begin("_sha1_block_data_order_ssse3");
420 &picsetup($tmp1);
421 &picsymbol($tmp1, &label("K_XX_XX"), $tmp1);
422
423&set_label("ssse3_shortcut");
424
425 &movdqa (@X[3],&QWP(0,$tmp1)); # K_00_19
426 &movdqa (@X[4],&QWP(16,$tmp1)); # K_20_39
427 &movdqa (@X[5],&QWP(32,$tmp1)); # K_40_59
428 &movdqa (@X[6],&QWP(48,$tmp1)); # K_60_79
429 &movdqa (@X[2],&QWP(64,$tmp1)); # pbswap mask
430
431 &mov ($E,&wparam(0)); # load argument block
432 &mov ($inp=@T[1],&wparam(1));
433 &mov ($D,&wparam(2));
434 &mov (@T[0],"esp");
435
436 # stack frame layout
437 #
438 # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area
439 # X[4]+K X[5]+K X[6]+K X[7]+K
440 # X[8]+K X[9]+K X[10]+K X[11]+K
441 # X[12]+K X[13]+K X[14]+K X[15]+K
442 #
443 # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area
444 # X[4] X[5] X[6] X[7]
445 # X[8] X[9] X[10] X[11] # even borrowed for K_00_19
446 #
447 # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants
448 # K_40_59 K_40_59 K_40_59 K_40_59
449 # K_60_79 K_60_79 K_60_79 K_60_79
450 # K_00_19 K_00_19 K_00_19 K_00_19
451 # pbswap mask
452 #
453 # +192 ctx # argument block
454 # +196 inp
455 # +200 end
456 # +204 esp
457 &sub ("esp",208);
458 &and ("esp",-64);
459
460 &movdqa (&QWP(112+0,"esp"),@X[4]); # copy constants
461 &movdqa (&QWP(112+16,"esp"),@X[5]);
462 &movdqa (&QWP(112+32,"esp"),@X[6]);
463 &shl ($D,6); # len*64
464 &movdqa (&QWP(112+48,"esp"),@X[3]);
465 &add ($D,$inp); # end of input
466 &movdqa (&QWP(112+64,"esp"),@X[2]);
467 &add ($inp,64);
468 &mov (&DWP(192+0,"esp"),$E); # save argument block
469 &mov (&DWP(192+4,"esp"),$inp);
470 &mov (&DWP(192+8,"esp"),$D);
471 &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp
472
473 &mov ($A,&DWP(0,$E)); # load context
474 &mov ($B,&DWP(4,$E));
475 &mov ($C,&DWP(8,$E));
476 &mov ($D,&DWP(12,$E));
477 &mov ($E,&DWP(16,$E));
478 &mov (@T[0],$B); # magic seed
479
480 &movdqu (@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3]
481 &movdqu (@X[-3&7],&QWP(-48,$inp));
482 &movdqu (@X[-2&7],&QWP(-32,$inp));
483 &movdqu (@X[-1&7],&QWP(-16,$inp));
484 &pshufb (@X[-4&7],@X[2]); # byte swap
485 &pshufb (@X[-3&7],@X[2]);
486 &pshufb (@X[-2&7],@X[2]);
487 &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
488 &pshufb (@X[-1&7],@X[2]);
489 &paddd (@X[-4&7],@X[3]); # add K_00_19
490 &paddd (@X[-3&7],@X[3]);
491 &paddd (@X[-2&7],@X[3]);
492 &movdqa (&QWP(0,"esp"),@X[-4&7]); # X[]+K xfer to IALU
493 &psubd (@X[-4&7],@X[3]); # restore X[]
494 &movdqa (&QWP(0+16,"esp"),@X[-3&7]);
495 &psubd (@X[-3&7],@X[3]);
496 &movdqa (&QWP(0+32,"esp"),@X[-2&7]);
497 &psubd (@X[-2&7],@X[3]);
498 &movdqa (@X[0],@X[-3&7]);
499 &jmp (&label("loop"));
500
501######################################################################
502# The SSE instruction sequence is first broken into groups of independent
503# instructions, independent with respect to their inputs and the shifter
504# (not all architectures have more than one). Then IALU instructions
505# are "knitted in" between the SSE groups. Distance is maintained for an
506# SSE latency of 2 in the hope that it better fits the upcoming AMD Bulldozer
507# [which allegedly also implements SSSE3]...
508#
509# Temporary register usage. X[2] is volatile at the entry and at the
510# end is restored from the backtrace ring buffer. X[3] is expected to
511# contain the current K_XX_XX constant and is used to calculate X[-1]+K
512# from the previous round; it becomes volatile the moment the value is
513# saved to the stack for transfer to the IALU. X[4] becomes volatile
514# whenever X[-4] is accumulated and offloaded to the backtrace ring
515# buffer; at the end it is loaded with the next K_XX_XX [which becomes
516# X[3] in the next round]...
517#
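
The knitting mechanism itself, in a self-contained sketch (print stands in for the perlasm emitters, names are illustrative): the round bodies further below return lists of stringified calls, and the Xupdate routines eval them one at a time between SIMD instructions.

	sub body { ( 'print "  add\n";', 'print "  rol\n";', 'print "  xor\n";' ) }
	my @insns = (&body(), &body());	# several rounds' worth of IALU instructions
	print "movdqa\n";		# one SSE instruction...
	eval(shift(@insns));		# ...then one knitted-in IALU instruction
	print "pxor\n";
	eval(shift(@insns));
	foreach (@insns) { eval; }	# remaining instructions [if any]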
518sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
519{ use integer;
520 my $body = shift;
521 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
522 my ($a,$b,$c,$d,$e);
523
524 eval(shift(@insns));
525 eval(shift(@insns));
526 &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
527 &movdqa (@X[2],@X[-1&7]);
528 eval(shift(@insns));
529 eval(shift(@insns));
530
531 &paddd (@X[3],@X[-1&7]);
532 &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
533 eval(shift(@insns));
534 eval(shift(@insns));
535 &psrldq (@X[2],4); # "X[-3]", 3 dwords
536 eval(shift(@insns));
537 eval(shift(@insns));
538 &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
539 eval(shift(@insns));
540 eval(shift(@insns));
541
542 &pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]"
543 eval(shift(@insns));
544 eval(shift(@insns));
545 eval(shift(@insns));
546 eval(shift(@insns));
547
548 &pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]"
549 eval(shift(@insns));
550 eval(shift(@insns));
551 &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
552 eval(shift(@insns));
553 eval(shift(@insns));
554
555 &movdqa (@X[4],@X[0]);
556 &movdqa (@X[2],@X[0]);
557 eval(shift(@insns));
558 eval(shift(@insns));
559 eval(shift(@insns));
560 eval(shift(@insns));
561
562 &pslldq (@X[4],12); # "X[0]"<<96, extract one dword
563 &paddd (@X[0],@X[0]);
564 eval(shift(@insns));
565 eval(shift(@insns));
566 eval(shift(@insns));
567 eval(shift(@insns));
568
569 &psrld (@X[2],31);
570 eval(shift(@insns));
571 eval(shift(@insns));
572 &movdqa (@X[3],@X[4]);
573 eval(shift(@insns));
574 eval(shift(@insns));
575
576 &psrld (@X[4],30);
577 &por (@X[0],@X[2]); # "X[0]"<<<=1
578 eval(shift(@insns));
579 eval(shift(@insns));
580 &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
581 eval(shift(@insns));
582 eval(shift(@insns));
583
584 &pslld (@X[3],2);
585 &pxor (@X[0],@X[4]);
586 eval(shift(@insns));
587 eval(shift(@insns));
588 &movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
589 eval(shift(@insns));
590 eval(shift(@insns));
591
592 &pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2
593 &movdqa (@X[1],@X[-2&7]) if ($Xi<7);
594 eval(shift(@insns));
595 eval(shift(@insns));
596
597 foreach (@insns) { eval; } # remaining instructions [if any]
598
599 $Xi++; push(@X,shift(@X)); # "rotate" X[]
600}
601
602sub Xupdate_ssse3_32_79()
603{ use integer;
604 my $body = shift;
605 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
606 my ($a,$b,$c,$d,$e);
607
608 &movdqa (@X[2],@X[-1&7]) if ($Xi==8);
609 eval(shift(@insns)); # body_20_39
610 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
611 &palignr(@X[2],@X[-2&7],8); # compose "X[-6]"
612 eval(shift(@insns));
613 eval(shift(@insns));
614 eval(shift(@insns)); # rol
615
616 &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
617 &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer
618 eval(shift(@insns));
619 eval(shift(@insns));
620 if ($Xi%5) {
621 &movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX...
622 } else { # ... or load next one
623 &movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
624 }
625 &paddd (@X[3],@X[-1&7]);
626 eval(shift(@insns)); # ror
627 eval(shift(@insns));
628
629 &pxor (@X[0],@X[2]); # "X[0]"^="X[-6]"
630 eval(shift(@insns)); # body_20_39
631 eval(shift(@insns));
632 eval(shift(@insns));
633 eval(shift(@insns)); # rol
634
635 &movdqa (@X[2],@X[0]);
636 &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
637 eval(shift(@insns));
638 eval(shift(@insns));
639 eval(shift(@insns)); # ror
640 eval(shift(@insns));
641
642 &pslld (@X[0],2);
643 eval(shift(@insns)); # body_20_39
644 eval(shift(@insns));
645 &psrld (@X[2],30);
646 eval(shift(@insns));
647 eval(shift(@insns)); # rol
648 eval(shift(@insns));
649 eval(shift(@insns));
650 eval(shift(@insns)); # ror
651 eval(shift(@insns));
652
653 &por (@X[0],@X[2]); # "X[0]"<<<=2
654 eval(shift(@insns)); # body_20_39
655 eval(shift(@insns));
656 &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer
657 eval(shift(@insns));
658 eval(shift(@insns)); # rol
659 eval(shift(@insns));
660 eval(shift(@insns));
661 eval(shift(@insns)); # ror
662 &movdqa (@X[3],@X[0]) if ($Xi<19);
663 eval(shift(@insns));
664
665 foreach (@insns) { eval; } # remaining instructions
666
667 $Xi++; push(@X,shift(@X)); # "rotate" X[]
668}
669
670sub Xuplast_ssse3_80()
671{ use integer;
672 my $body = shift;
673 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
674 my ($a,$b,$c,$d,$e);
675
676 eval(shift(@insns));
677 &paddd (@X[3],@X[-1&7]);
678 eval(shift(@insns));
679 eval(shift(@insns));
680 eval(shift(@insns));
681 eval(shift(@insns));
682
683 &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU
684
685 foreach (@insns) { eval; } # remaining instructions
686
687 &mov ($inp=@T[1],&DWP(192+4,"esp"));
688 &cmp ($inp,&DWP(192+8,"esp"));
689 &je (&label("done"));
690
691 &movdqa (@X[3],&QWP(112+48,"esp")); # K_00_19
692 &movdqa (@X[2],&QWP(112+64,"esp")); # pbswap mask
693 &movdqu (@X[-4&7],&QWP(0,$inp)); # load input
694 &movdqu (@X[-3&7],&QWP(16,$inp));
695 &movdqu (@X[-2&7],&QWP(32,$inp));
696 &movdqu (@X[-1&7],&QWP(48,$inp));
697 &add ($inp,64);
698 &pshufb (@X[-4&7],@X[2]); # byte swap
699 &mov (&DWP(192+4,"esp"),$inp);
700 &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
701
702 $Xi=0;
703}
704
705sub Xloop_ssse3()
706{ use integer;
707 my $body = shift;
708 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
709 my ($a,$b,$c,$d,$e);
710
711 eval(shift(@insns));
712 eval(shift(@insns));
713 &pshufb (@X[($Xi-3)&7],@X[2]);
714 eval(shift(@insns));
715 eval(shift(@insns));
716 &paddd (@X[($Xi-4)&7],@X[3]);
717 eval(shift(@insns));
718 eval(shift(@insns));
719 eval(shift(@insns));
720 eval(shift(@insns));
721 &movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU
722 eval(shift(@insns));
723 eval(shift(@insns));
724 &psubd (@X[($Xi-4)&7],@X[3]);
725
726 foreach (@insns) { eval; }
727 $Xi++;
728}
729
730sub Xtail_ssse3()
731{ use integer;
732 my $body = shift;
733 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
734 my ($a,$b,$c,$d,$e);
735
736 foreach (@insns) { eval; }
737}
738
739sub body_00_19 () {
740 (
741 '($a,$b,$c,$d,$e)=@V;'.
742 '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer
743 '&xor ($c,$d);',
744 '&mov (@T[1],$a);', # $b in next round
745 '&$_rol ($a,5);',
746 '&and (@T[0],$c);', # ($b&($c^$d))
747 '&xor ($c,$d);', # restore $c
748 '&xor (@T[0],$d);',
749 '&add ($e,$a);',
750 '&$_ror ($b,$j?7:2);', # $b>>>2
751 '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
752 );
753}
754
755sub body_20_39 () {
756 (
757 '($a,$b,$c,$d,$e)=@V;'.
758 '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
759 '&xor (@T[0],$d);', # ($b^$d)
760 '&mov (@T[1],$a);', # $b in next round
761 '&$_rol ($a,5);',
762 '&xor (@T[0],$c);', # ($b^$d^$c)
763 '&add ($e,$a);',
764 '&$_ror ($b,7);', # $b>>>2
765 '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
766 );
767}
768
769sub body_40_59 () {
770 (
771 '($a,$b,$c,$d,$e)=@V;'.
772 '&mov (@T[1],$c);',
773 '&xor ($c,$d);',
774 '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
775 '&and (@T[1],$d);',
776 '&and (@T[0],$c);', # ($b&($c^$d))
777 '&$_ror ($b,7);', # $b>>>2
778 '&add ($e,@T[1]);',
779 '&mov (@T[1],$a);', # $b in next round
780 '&$_rol ($a,5);',
781 '&add ($e,@T[0]);',
782 '&xor ($c,$d);', # restore $c
783 '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
784 );
785}
786
787&set_label("loop",16);
788 &Xupdate_ssse3_16_31(\&body_00_19);
789 &Xupdate_ssse3_16_31(\&body_00_19);
790 &Xupdate_ssse3_16_31(\&body_00_19);
791 &Xupdate_ssse3_16_31(\&body_00_19);
792 &Xupdate_ssse3_32_79(\&body_00_19);
793 &Xupdate_ssse3_32_79(\&body_20_39);
794 &Xupdate_ssse3_32_79(\&body_20_39);
795 &Xupdate_ssse3_32_79(\&body_20_39);
796 &Xupdate_ssse3_32_79(\&body_20_39);
797 &Xupdate_ssse3_32_79(\&body_20_39);
798 &Xupdate_ssse3_32_79(\&body_40_59);
799 &Xupdate_ssse3_32_79(\&body_40_59);
800 &Xupdate_ssse3_32_79(\&body_40_59);
801 &Xupdate_ssse3_32_79(\&body_40_59);
802 &Xupdate_ssse3_32_79(\&body_40_59);
803 &Xupdate_ssse3_32_79(\&body_20_39);
804 &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
805
806 $saved_j=$j; @saved_V=@V;
807
808 &Xloop_ssse3(\&body_20_39);
809 &Xloop_ssse3(\&body_20_39);
810 &Xloop_ssse3(\&body_20_39);
811
812 &mov (@T[1],&DWP(192,"esp")); # update context
813 &add ($A,&DWP(0,@T[1]));
814 &add (@T[0],&DWP(4,@T[1])); # $b
815 &add ($C,&DWP(8,@T[1]));
816 &mov (&DWP(0,@T[1]),$A);
817 &add ($D,&DWP(12,@T[1]));
818 &mov (&DWP(4,@T[1]),@T[0]);
819 &add ($E,&DWP(16,@T[1]));
820 &mov (&DWP(8,@T[1]),$C);
821 &mov ($B,@T[0]);
822 &mov (&DWP(12,@T[1]),$D);
823 &mov (&DWP(16,@T[1]),$E);
824 &movdqa (@X[0],@X[-3&7]);
825
826 &jmp (&label("loop"));
827
828&set_label("done",16); $j=$saved_j; @V=@saved_V;
829
830 &Xtail_ssse3(\&body_20_39);
831 &Xtail_ssse3(\&body_20_39);
832 &Xtail_ssse3(\&body_20_39);
833
834 &mov (@T[1],&DWP(192,"esp")); # update context
835 &add ($A,&DWP(0,@T[1]));
836 &mov ("esp",&DWP(192+12,"esp")); # restore %esp
837 &add (@T[0],&DWP(4,@T[1])); # $b
838 &add ($C,&DWP(8,@T[1]));
839 &mov (&DWP(0,@T[1]),$A);
840 &add ($D,&DWP(12,@T[1]));
841 &mov (&DWP(4,@T[1]),@T[0]);
842 &add ($E,&DWP(16,@T[1]));
843 &mov (&DWP(8,@T[1]),$C);
844 &mov (&DWP(12,@T[1]),$D);
845 &mov (&DWP(16,@T[1]),$E);
846
847&function_end("_sha1_block_data_order_ssse3");
848
849if ($ymm) {
850my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded
851my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
852my @V=($A,$B,$C,$D,$E);
853my $j=0; # hash round
854my @T=($T,$tmp1);
855my $inp;
856
857my $_rol=sub { &shld(@_[0],@_) };
858my $_ror=sub { &shrd(@_[0],@_) };
859
860&function_begin("_sha1_block_data_order_avx");
861 &picsetup($tmp1);
862 &picsymbol($tmp1, &label("K_XX_XX"), $tmp1);
863
864&set_label("avx_shortcut");
865 &vzeroall();
866
867 &vmovdqa(@X[3],&QWP(0,$tmp1)); # K_00_19
868 &vmovdqa(@X[4],&QWP(16,$tmp1)); # K_20_39
869 &vmovdqa(@X[5],&QWP(32,$tmp1)); # K_40_59
870 &vmovdqa(@X[6],&QWP(48,$tmp1)); # K_60_79
871 &vmovdqa(@X[2],&QWP(64,$tmp1)); # pbswap mask
872
873 &mov ($E,&wparam(0)); # load argument block
874 &mov ($inp=@T[1],&wparam(1));
875 &mov ($D,&wparam(2));
876 &mov (@T[0],"esp");
877
878 # stack frame layout
879 #
880 # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area
881 # X[4]+K X[5]+K X[6]+K X[7]+K
882 # X[8]+K X[9]+K X[10]+K X[11]+K
883 # X[12]+K X[13]+K X[14]+K X[15]+K
884 #
885 # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area
886 # X[4] X[5] X[6] X[7]
887 # X[8] X[9] X[10] X[11] # even borrowed for K_00_19
888 #
889 # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants
890 # K_40_59 K_40_59 K_40_59 K_40_59
891 # K_60_79 K_60_79 K_60_79 K_60_79
892 # K_00_19 K_00_19 K_00_19 K_00_19
893 # pbswap mask
894 #
895 # +192 ctx # argument block
896 # +196 inp
897 # +200 end
898 # +204 esp
899 &sub ("esp",208);
900 &and ("esp",-64);
901
902 &vmovdqa(&QWP(112+0,"esp"),@X[4]); # copy constants
903 &vmovdqa(&QWP(112+16,"esp"),@X[5]);
904 &vmovdqa(&QWP(112+32,"esp"),@X[6]);
905 &shl ($D,6); # len*64
906 &vmovdqa(&QWP(112+48,"esp"),@X[3]);
907 &add ($D,$inp); # end of input
908 &vmovdqa(&QWP(112+64,"esp"),@X[2]);
909 &add ($inp,64);
910 &mov (&DWP(192+0,"esp"),$E); # save argument block
911 &mov (&DWP(192+4,"esp"),$inp);
912 &mov (&DWP(192+8,"esp"),$D);
913 &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp
914
915 &mov ($A,&DWP(0,$E)); # load context
916 &mov ($B,&DWP(4,$E));
917 &mov ($C,&DWP(8,$E));
918 &mov ($D,&DWP(12,$E));
919 &mov ($E,&DWP(16,$E));
920 &mov (@T[0],$B); # magic seed
921
922 &vmovdqu(@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3]
923 &vmovdqu(@X[-3&7],&QWP(-48,$inp));
924 &vmovdqu(@X[-2&7],&QWP(-32,$inp));
925 &vmovdqu(@X[-1&7],&QWP(-16,$inp));
926 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
927 &vpshufb(@X[-3&7],@X[-3&7],@X[2]);
928 &vpshufb(@X[-2&7],@X[-2&7],@X[2]);
929 &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
930 &vpshufb(@X[-1&7],@X[-1&7],@X[2]);
931 &vpaddd (@X[0],@X[-4&7],@X[3]); # add K_00_19
932 &vpaddd (@X[1],@X[-3&7],@X[3]);
933 &vpaddd (@X[2],@X[-2&7],@X[3]);
934 &vmovdqa(&QWP(0,"esp"),@X[0]); # X[]+K xfer to IALU
935 &vmovdqa(&QWP(0+16,"esp"),@X[1]);
936 &vmovdqa(&QWP(0+32,"esp"),@X[2]);
937 &jmp (&label("loop"));
938
939sub Xupdate_avx_16_31() # recall that $Xi starts with 4
940{ use integer;
941 my $body = shift;
942 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
943 my ($a,$b,$c,$d,$e);
944
945 eval(shift(@insns));
946 eval(shift(@insns));
947 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
948 eval(shift(@insns));
949 eval(shift(@insns));
950
951 &vpaddd (@X[3],@X[3],@X[-1&7]);
952 &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
953 eval(shift(@insns));
954 eval(shift(@insns));
955 &vpsrldq(@X[2],@X[-1&7],4); # "X[-3]", 3 dwords
956 eval(shift(@insns));
957 eval(shift(@insns));
958 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
959 eval(shift(@insns));
960 eval(shift(@insns));
961
962 &vpxor (@X[2],@X[2],@X[-2&7]); # "X[-3]"^"X[-8]"
963 eval(shift(@insns));
964 eval(shift(@insns));
965 &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
966 eval(shift(@insns));
967 eval(shift(@insns));
968
969 &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]"
970 eval(shift(@insns));
971 eval(shift(@insns));
972 eval(shift(@insns));
973 eval(shift(@insns));
974
975 &vpsrld (@X[2],@X[0],31);
976 eval(shift(@insns));
977 eval(shift(@insns));
978 eval(shift(@insns));
979 eval(shift(@insns));
980
981 &vpslldq(@X[4],@X[0],12); # "X[0]"<<96, extract one dword
982 &vpaddd (@X[0],@X[0],@X[0]);
983 eval(shift(@insns));
984 eval(shift(@insns));
985 eval(shift(@insns));
986 eval(shift(@insns));
987
988 &vpsrld (@X[3],@X[4],30);
989 &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=1
990 eval(shift(@insns));
991 eval(shift(@insns));
992 eval(shift(@insns));
993 eval(shift(@insns));
994
995 &vpslld (@X[4],@X[4],2);
996 &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
997 eval(shift(@insns));
998 eval(shift(@insns));
999 &vpxor (@X[0],@X[0],@X[3]);
1000 eval(shift(@insns));
1001 eval(shift(@insns));
1002 eval(shift(@insns));
1003 eval(shift(@insns));
1004
1005 &vpxor (@X[0],@X[0],@X[4]); # "X[0]"^=("X[0]"<<96)<<<2
1006 eval(shift(@insns));
1007 eval(shift(@insns));
1008 &vmovdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
1009 eval(shift(@insns));
1010 eval(shift(@insns));
1011
1012 foreach (@insns) { eval; } # remaining instructions [if any]
1013
1014 $Xi++; push(@X,shift(@X)); # "rotate" X[]
1015}
1016
1017sub Xupdate_avx_32_79()
1018{ use integer;
1019 my $body = shift;
1020 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
1021 my ($a,$b,$c,$d,$e);
1022
1023 &vpalignr(@X[2],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
1024 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
1025 eval(shift(@insns)); # body_20_39
1026 eval(shift(@insns));
1027 eval(shift(@insns));
1028 eval(shift(@insns)); # rol
1029
1030 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
1031 &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer
1032 eval(shift(@insns));
1033 eval(shift(@insns));
1034 if ($Xi%5) {
1035 &vmovdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX...
1036 } else { # ... or load next one
1037 &vmovdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
1038 }
1039 &vpaddd (@X[3],@X[3],@X[-1&7]);
1040 eval(shift(@insns)); # ror
1041 eval(shift(@insns));
1042
1043 &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-6]"
1044 eval(shift(@insns)); # body_20_39
1045 eval(shift(@insns));
1046 eval(shift(@insns));
1047 eval(shift(@insns)); # rol
1048
1049 &vpsrld (@X[2],@X[0],30);
1050 &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
1051 eval(shift(@insns));
1052 eval(shift(@insns));
1053 eval(shift(@insns)); # ror
1054 eval(shift(@insns));
1055
1056 &vpslld (@X[0],@X[0],2);
1057 eval(shift(@insns)); # body_20_39
1058 eval(shift(@insns));
1059 eval(shift(@insns));
1060 eval(shift(@insns)); # rol
1061 eval(shift(@insns));
1062 eval(shift(@insns));
1063 eval(shift(@insns)); # ror
1064 eval(shift(@insns));
1065
1066 &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=2
1067 eval(shift(@insns)); # body_20_39
1068 eval(shift(@insns));
1069 &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer
1070 eval(shift(@insns));
1071 eval(shift(@insns)); # rol
1072 eval(shift(@insns));
1073 eval(shift(@insns));
1074 eval(shift(@insns)); # ror
1075 eval(shift(@insns));
1076
1077 foreach (@insns) { eval; } # remaining instructions
1078
1079 $Xi++; push(@X,shift(@X)); # "rotate" X[]
1080}
1081
1082sub Xuplast_avx_80()
1083{ use integer;
1084 my $body = shift;
1085 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
1086 my ($a,$b,$c,$d,$e);
1087
1088 eval(shift(@insns));
1089 &vpaddd (@X[3],@X[3],@X[-1&7]);
1090 eval(shift(@insns));
1091 eval(shift(@insns));
1092 eval(shift(@insns));
1093 eval(shift(@insns));
1094
1095 &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU
1096
1097 foreach (@insns) { eval; } # remaining instructions
1098
1099 &mov ($inp=@T[1],&DWP(192+4,"esp"));
1100 &cmp ($inp,&DWP(192+8,"esp"));
1101 &je (&label("done"));
1102
1103 &vmovdqa(@X[3],&QWP(112+48,"esp")); # K_00_19
1104 &vmovdqa(@X[2],&QWP(112+64,"esp")); # pbswap mask
1105 &vmovdqu(@X[-4&7],&QWP(0,$inp)); # load input
1106 &vmovdqu(@X[-3&7],&QWP(16,$inp));
1107 &vmovdqu(@X[-2&7],&QWP(32,$inp));
1108 &vmovdqu(@X[-1&7],&QWP(48,$inp));
1109 &add ($inp,64);
1110 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
1111 &mov (&DWP(192+4,"esp"),$inp);
1112 &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
1113
1114 $Xi=0;
1115}
1116
1117sub Xloop_avx()
1118{ use integer;
1119 my $body = shift;
1120 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
1121 my ($a,$b,$c,$d,$e);
1122
1123 eval(shift(@insns));
1124 eval(shift(@insns));
1125 &vpshufb (@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
1126 eval(shift(@insns));
1127 eval(shift(@insns));
1128 &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@X[3]);
1129 eval(shift(@insns));
1130 eval(shift(@insns));
1131 eval(shift(@insns));
1132 eval(shift(@insns));
1133 &vmovdqa (&QWP(0+16*$Xi,"esp"),@X[$Xi&7]); # X[]+K xfer to IALU
1134 eval(shift(@insns));
1135 eval(shift(@insns));
1136
1137 foreach (@insns) { eval; }
1138 $Xi++;
1139}
1140
1141sub Xtail_avx()
1142{ use integer;
1143 my $body = shift;
1144 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
1145 my ($a,$b,$c,$d,$e);
1146
1147 foreach (@insns) { eval; }
1148}
1149
1150&set_label("loop",16);
1151 &Xupdate_avx_16_31(\&body_00_19);
1152 &Xupdate_avx_16_31(\&body_00_19);
1153 &Xupdate_avx_16_31(\&body_00_19);
1154 &Xupdate_avx_16_31(\&body_00_19);
1155 &Xupdate_avx_32_79(\&body_00_19);
1156 &Xupdate_avx_32_79(\&body_20_39);
1157 &Xupdate_avx_32_79(\&body_20_39);
1158 &Xupdate_avx_32_79(\&body_20_39);
1159 &Xupdate_avx_32_79(\&body_20_39);
1160 &Xupdate_avx_32_79(\&body_20_39);
1161 &Xupdate_avx_32_79(\&body_40_59);
1162 &Xupdate_avx_32_79(\&body_40_59);
1163 &Xupdate_avx_32_79(\&body_40_59);
1164 &Xupdate_avx_32_79(\&body_40_59);
1165 &Xupdate_avx_32_79(\&body_40_59);
1166 &Xupdate_avx_32_79(\&body_20_39);
1167 &Xuplast_avx_80(\&body_20_39); # can jump to "done"
1168
1169 $saved_j=$j; @saved_V=@V;
1170
1171 &Xloop_avx(\&body_20_39);
1172 &Xloop_avx(\&body_20_39);
1173 &Xloop_avx(\&body_20_39);
1174
1175 &mov (@T[1],&DWP(192,"esp")); # update context
1176 &add ($A,&DWP(0,@T[1]));
1177 &add (@T[0],&DWP(4,@T[1])); # $b
1178 &add ($C,&DWP(8,@T[1]));
1179 &mov (&DWP(0,@T[1]),$A);
1180 &add ($D,&DWP(12,@T[1]));
1181 &mov (&DWP(4,@T[1]),@T[0]);
1182 &add ($E,&DWP(16,@T[1]));
1183 &mov (&DWP(8,@T[1]),$C);
1184 &mov ($B,@T[0]);
1185 &mov (&DWP(12,@T[1]),$D);
1186 &mov (&DWP(16,@T[1]),$E);
1187
1188 &jmp (&label("loop"));
1189
1190&set_label("done",16); $j=$saved_j; @V=@saved_V;
1191
1192 &Xtail_avx(\&body_20_39);
1193 &Xtail_avx(\&body_20_39);
1194 &Xtail_avx(\&body_20_39);
1195
1196 &vzeroall();
1197
1198 &mov (@T[1],&DWP(192,"esp")); # update context
1199 &add ($A,&DWP(0,@T[1]));
1200 &mov ("esp",&DWP(192+12,"esp")); # restore %esp
1201 &add (@T[0],&DWP(4,@T[1])); # $b
1202 &add ($C,&DWP(8,@T[1]));
1203 &mov (&DWP(0,@T[1]),$A);
1204 &add ($D,&DWP(12,@T[1]));
1205 &mov (&DWP(4,@T[1]),@T[0]);
1206 &add ($E,&DWP(16,@T[1]));
1207 &mov (&DWP(8,@T[1]),$C);
1208 &mov (&DWP(12,@T[1]),$D);
1209 &mov (&DWP(16,@T[1]),$E);
1210&function_end("_sha1_block_data_order_avx");
1211}
1212
1213 &rodataseg();
1214&set_label("K_XX_XX",64);
1215&data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999); # K_00_19
1216&data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1); # K_20_39
1217&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc); # K_40_59
1218&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6); # K_60_79
1219&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # pbswap mask
1220 &previous();
1221}
1222
1223&asm_finish();
diff --git a/src/lib/libcrypto/sha/asm/sha1-alpha.pl b/src/lib/libcrypto/sha/asm/sha1-alpha.pl
deleted file mode 100644
index 56b3369f09..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-alpha.pl
+++ /dev/null
@@ -1,316 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA1 block procedure for Alpha.
11
12# On 21264 performance is 33% better than code generated by the vendor
13# compiler, and 75% better than GCC [3.4], and in absolute terms is
14# 8.7 cycles per processed byte. The implementation features a vectorized
15# byte swap, but not Xupdate.
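
Per 32-bit word, the srl/sll/zapnot sequences below implement the usual byte swap, applied to both halves of a 64-bit register at once; a scalar Perl sketch for reference (not part of the module):

	sub bswap32 {
		my $x = shift;
		return (($x >> 24) & 0x000000ff) |
		       (($x >>  8) & 0x0000ff00) |
		       (($x <<  8) & 0x00ff0000) |
		       (($x << 24) & 0xff000000);
	}
	printf "%08x\n", bswap32(0x01020304);	# 04030201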
16
17@X=( "\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7",
18 "\$8", "\$9", "\$10", "\$11", "\$12", "\$13", "\$14", "\$15");
19$ctx="a0"; # $16
20$inp="a1";
21$num="a2";
22$A="a3";
23$B="a4"; # 20
24$C="a5";
25$D="t8";
26$E="t9"; @V=($A,$B,$C,$D,$E);
27$t0="t10"; # 24
28$t1="t11";
29$t2="ra";
30$t3="t12";
31$K="AT"; # 28
32
33sub BODY_00_19 {
34my ($i,$a,$b,$c,$d,$e)=@_;
35my $j=$i+1;
36$code.=<<___ if ($i==0);
37 ldq_u @X[0],0+0($inp)
38 ldq_u @X[1],0+7($inp)
39___
40$code.=<<___ if (!($i&1) && $i<14);
41 ldq_u @X[$i+2],($i+2)*4+0($inp)
42 ldq_u @X[$i+3],($i+2)*4+7($inp)
43___
44$code.=<<___ if (!($i&1) && $i<15);
45 extql @X[$i],$inp,@X[$i]
46 extqh @X[$i+1],$inp,@X[$i+1]
47
48 or @X[$i+1],@X[$i],@X[$i] # pair of 32-bit values are fetched
49
50 srl @X[$i],24,$t0 # vectorized byte swap
51 srl @X[$i],8,$t2
52
53 sll @X[$i],8,$t3
54 sll @X[$i],24,@X[$i]
55 zapnot $t0,0x11,$t0
56 zapnot $t2,0x22,$t2
57
58 zapnot @X[$i],0x88,@X[$i]
59 or $t0,$t2,$t0
60 zapnot $t3,0x44,$t3
61 sll $a,5,$t1
62
63 or @X[$i],$t0,@X[$i]
64 addl $K,$e,$e
65 and $b,$c,$t2
66 zapnot $a,0xf,$a
67
68 or @X[$i],$t3,@X[$i]
69 srl $a,27,$t0
70 bic $d,$b,$t3
71 sll $b,30,$b
72
73 extll @X[$i],4,@X[$i+1] # extract upper half
74 or $t2,$t3,$t2
75 addl @X[$i],$e,$e
76
77 addl $t1,$e,$e
78 srl $b,32,$t3
79 zapnot @X[$i],0xf,@X[$i]
80
81 addl $t0,$e,$e
82 addl $t2,$e,$e
83 or $t3,$b,$b
84___
85$code.=<<___ if (($i&1) && $i<15);
86 sll $a,5,$t1
87 addl $K,$e,$e
88 and $b,$c,$t2
89 zapnot $a,0xf,$a
90
91 srl $a,27,$t0
92 addl @X[$i%16],$e,$e
93 bic $d,$b,$t3
94 sll $b,30,$b
95
96 or $t2,$t3,$t2
97 addl $t1,$e,$e
98 srl $b,32,$t3
99 zapnot @X[$i],0xf,@X[$i]
100
101 addl $t0,$e,$e
102 addl $t2,$e,$e
103 or $t3,$b,$b
104___
105$code.=<<___ if ($i>=15); # with forward Xupdate
106 sll $a,5,$t1
107 addl $K,$e,$e
108 and $b,$c,$t2
109 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
110
111 zapnot $a,0xf,$a
112 addl @X[$i%16],$e,$e
113 bic $d,$b,$t3
114 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
115
116 srl $a,27,$t0
117 addl $t1,$e,$e
118 or $t2,$t3,$t2
119 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
120
121 sll $b,30,$b
122 addl $t0,$e,$e
123 srl @X[$j%16],31,$t1
124
125 addl $t2,$e,$e
126 srl $b,32,$t3
127 addl @X[$j%16],@X[$j%16],@X[$j%16]
128
129 or $t3,$b,$b
130 zapnot @X[$i%16],0xf,@X[$i%16]
131 or $t1,@X[$j%16],@X[$j%16]
132___
133}
134
135sub BODY_20_39 {
136my ($i,$a,$b,$c,$d,$e)=@_;
137my $j=$i+1;
138$code.=<<___ if ($i<79); # with forward Xupdate
139 sll $a,5,$t1
140 addl $K,$e,$e
141 zapnot $a,0xf,$a
142 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
143
144 sll $b,30,$t3
145 addl $t1,$e,$e
146 xor $b,$c,$t2
147 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
148
149 srl $b,2,$b
150 addl @X[$i%16],$e,$e
151 xor $d,$t2,$t2
152 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
153
154 srl @X[$j%16],31,$t1
155 addl $t2,$e,$e
156 srl $a,27,$t0
157 addl @X[$j%16],@X[$j%16],@X[$j%16]
158
159 or $t3,$b,$b
160 addl $t0,$e,$e
161 or $t1,@X[$j%16],@X[$j%16]
162___
163$code.=<<___ if ($i<77);
164 zapnot @X[$i%16],0xf,@X[$i%16]
165___
166$code.=<<___ if ($i==79); # with context fetch
167 sll $a,5,$t1
168 addl $K,$e,$e
169 zapnot $a,0xf,$a
170 ldl @X[0],0($ctx)
171
172 sll $b,30,$t3
173 addl $t1,$e,$e
174 xor $b,$c,$t2
175 ldl @X[1],4($ctx)
176
177 srl $b,2,$b
178 addl @X[$i%16],$e,$e
179 xor $d,$t2,$t2
180 ldl @X[2],8($ctx)
181
182 srl $a,27,$t0
183 addl $t2,$e,$e
184 ldl @X[3],12($ctx)
185
186 or $t3,$b,$b
187 addl $t0,$e,$e
188 ldl @X[4],16($ctx)
189___
190}
191
192sub BODY_40_59 {
193my ($i,$a,$b,$c,$d,$e)=@_;
194my $j=$i+1;
195$code.=<<___; # with forward Xupdate
196 sll $a,5,$t1
197 addl $K,$e,$e
198 zapnot $a,0xf,$a
199 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
200
201 srl $a,27,$t0
202 and $b,$c,$t2
203 and $b,$d,$t3
204 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
205
206 sll $b,30,$b
207 addl $t1,$e,$e
208 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
209
210 srl @X[$j%16],31,$t1
211 addl $t0,$e,$e
212 or $t2,$t3,$t2
213 and $c,$d,$t3
214
215 or $t2,$t3,$t2
216 srl $b,32,$t3
217 addl @X[$i%16],$e,$e
218 addl @X[$j%16],@X[$j%16],@X[$j%16]
219
220 or $t3,$b,$b
221 addl $t2,$e,$e
222 or $t1,@X[$j%16],@X[$j%16]
223 zapnot @X[$i%16],0xf,@X[$i%16]
224___
225}
226
227$code=<<___;
228#include <machine/asm.h>
229
230.text
231
232.set noat
233.set noreorder
234.globl sha1_block_data_order
235.align 5
236.ent sha1_block_data_order
237sha1_block_data_order:
238 lda sp,-64(sp)
239 stq ra,0(sp)
240 stq s0,8(sp)
241 stq s1,16(sp)
242 stq s2,24(sp)
243 stq s3,32(sp)
244 stq s4,40(sp)
245 stq s5,48(sp)
246 stq fp,56(sp)
247 .mask 0x0400fe00,-64
248 .frame sp,64,ra
249 .prologue 0
250
251 ldl $A,0($ctx)
252 ldl $B,4($ctx)
253 sll $num,6,$num
254 ldl $C,8($ctx)
255 ldl $D,12($ctx)
256 ldl $E,16($ctx)
257 addq $inp,$num,$num
258
259.Lloop:
260 .set noreorder
261 ldah $K,23170(zero)
262 zapnot $B,0xf,$B
263 lda $K,31129($K) # K_00_19
264___
265for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
266
267$code.=<<___;
268 ldah $K,28378(zero)
269 lda $K,-5215($K) # K_20_39
270___
271for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
272
273$code.=<<___;
274 ldah $K,-28900(zero)
275 lda $K,-17188($K) # K_40_59
276___
277for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
278
279$code.=<<___;
280 ldah $K,-13725(zero)
281 lda $K,-15914($K) # K_60_79
282___
283for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
284
285$code.=<<___;
286 addl @X[0],$A,$A
287 addl @X[1],$B,$B
288 addl @X[2],$C,$C
289 addl @X[3],$D,$D
290 addl @X[4],$E,$E
291 stl $A,0($ctx)
292 stl $B,4($ctx)
293 addq $inp,64,$inp
294 stl $C,8($ctx)
295 stl $D,12($ctx)
296 stl $E,16($ctx)
297 cmpult $inp,$num,$t1
298 bne $t1,.Lloop
299
300 .set noreorder
301 ldq ra,0(sp)
302 ldq s0,8(sp)
303 ldq s1,16(sp)
304 ldq s2,24(sp)
305 ldq s3,32(sp)
306 ldq s4,40(sp)
307 ldq s5,48(sp)
308 ldq fp,56(sp)
309 lda sp,64(sp)
310 ret (ra)
311.end sha1_block_data_order
312.align 2
313___
314$output=shift and open STDOUT,">$output";
315print $code;
316close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
deleted file mode 100644
index 8f0cdaf83c..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
+++ /dev/null
@@ -1,248 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# sha1_block procedure for ARMv4.
11#
12# January 2007.
13
14# Size/performance trade-off
15# ====================================================================
16# impl size in bytes comp cycles[*] measured performance
17# ====================================================================
18# thumb 304 3212 4420
19# armv4-small 392/+29% 1958/+64% 2250/+96%
20# armv4-compact 740/+89% 1552/+26% 1840/+22%
21# armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
22# full unroll ~5100/+260% ~1260/+4% ~1300/+5%
23# ====================================================================
24# thumb = same as 'small' but in Thumb instructions[**] and
25# with recurring code in two private functions;
26# small = detached Xload/update, loops are folded;
27# compact = detached Xload/update, 5x unroll;
28# large = interleaved Xload/update, 5x unroll;
29# full unroll = interleaved Xload/update, full unroll, estimated[!];
30#
31# [*] Manually counted instructions in "grand" loop body. Measured
32# performance is affected by prologue and epilogue overhead,
33# i-cache availability, branch penalties, etc.
34# [**] While each Thumb instruction is half the size, they are not as
35# diverse as ARM ones: e.g., there are only two arithmetic
36# instructions with 3 arguments, no [fixed] rotate, and addressing
37# modes are limited. As a result it takes more instructions to do
38# the same job in Thumb, so the code is never half the size and is
39# always slower.
40# [***] which is also ~35% better than compiler-generated code. The dual-
41# issue Cortex A8 core was measured to process an input block in
42# ~990 cycles.
43
44# August 2010.
45#
46# Rescheduling for the dual-issue pipeline resulted in a 13% improvement
47# on the Cortex A8 core and in absolute terms ~870 cycles per input block
48# [or 13.6 cycles per byte].
49
50# February 2011.
51#
52# Profiler-assisted and platform-specific optimization resulted in a 10%
53# improvement on the Cortex A8 core and 12.2 cycles per byte.
54
55while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
56open STDOUT,">$output";
57
58$ctx="r0";
59$inp="r1";
60$len="r2";
61$a="r3";
62$b="r4";
63$c="r5";
64$d="r6";
65$e="r7";
66$K="r8";
67$t0="r9";
68$t1="r10";
69$t2="r11";
70$t3="r12";
71$Xi="r14";
72@V=($a,$b,$c,$d,$e);
73
74sub Xupdate {
75my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
76$code.=<<___;
77 ldr $t0,[$Xi,#15*4]
78 ldr $t1,[$Xi,#13*4]
79 ldr $t2,[$Xi,#7*4]
80 add $e,$K,$e,ror#2 @ E+=K_xx_xx
81 ldr $t3,[$Xi,#2*4]
82 eor $t0,$t0,$t1
83 eor $t2,$t2,$t3 @ 1 cycle stall
84 eor $t1,$c,$d @ F_xx_xx
85 mov $t0,$t0,ror#31
86 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
87 eor $t0,$t0,$t2,ror#31
88 str $t0,[$Xi,#-4]!
89 $opt1 @ F_xx_xx
90 $opt2 @ F_xx_xx
91 add $e,$e,$t0 @ E+=X[i]
92___
93}
94
95sub BODY_00_15 {
96my ($a,$b,$c,$d,$e)=@_;
97$code.=<<___;
98#if __ARM_ARCH__<7 || defined(__STRICT_ALIGNMENT)
99 ldrb $t1,[$inp,#2]
100 ldrb $t0,[$inp,#3]
101 ldrb $t2,[$inp,#1]
102 add $e,$K,$e,ror#2 @ E+=K_00_19
103 ldrb $t3,[$inp],#4
104 orr $t0,$t0,$t1,lsl#8
105 eor $t1,$c,$d @ F_xx_xx
106 orr $t0,$t0,$t2,lsl#16
107 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
108 orr $t0,$t0,$t3,lsl#24
109#else
110 ldr $t0,[$inp],#4 @ handles unaligned
111 add $e,$K,$e,ror#2 @ E+=K_00_19
112 eor $t1,$c,$d @ F_xx_xx
113 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
114#ifdef __ARMEL__
115 rev $t0,$t0 @ byte swap
116#endif
117#endif
118 and $t1,$b,$t1,ror#2
119 add $e,$e,$t0 @ E+=X[i]
120 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
121 str $t0,[$Xi,#-4]!
122 add $e,$e,$t1 @ E+=F_00_19(B,C,D)
123___
124}
125
126sub BODY_16_19 {
127my ($a,$b,$c,$d,$e)=@_;
128 &Xupdate(@_,"and $t1,$b,$t1,ror#2");
129$code.=<<___;
130 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
131 add $e,$e,$t1 @ E+=F_00_19(B,C,D)
132___
133}
134
135sub BODY_20_39 {
136my ($a,$b,$c,$d,$e)=@_;
137 &Xupdate(@_,"eor $t1,$b,$t1,ror#2");
138$code.=<<___;
139 add $e,$e,$t1 @ E+=F_20_39(B,C,D)
140___
141}
142
143sub BODY_40_59 {
144my ($a,$b,$c,$d,$e)=@_;
145 &Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
146$code.=<<___;
147 add $e,$e,$t1 @ E+=F_40_59(B,C,D)
148 add $e,$e,$t2,ror#2
149___
150}
151
152$code=<<___;
153#include "arm_arch.h"
154
155.text
156
157.global sha1_block_data_order
158.type sha1_block_data_order,%function
159
160.align 2
161sha1_block_data_order:
162 stmdb sp!,{r4-r12,lr}
163 add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
164 ldmia $ctx,{$a,$b,$c,$d,$e}
165.Lloop:
166 ldr $K,.LK_00_19
167 mov $Xi,sp
168 sub sp,sp,#15*4
169 mov $c,$c,ror#30
170 mov $d,$d,ror#30
171 mov $e,$e,ror#30 @ [6]
172.L_00_15:
173___
174for($i=0;$i<5;$i++) {
175 &BODY_00_15(@V); unshift(@V,pop(@V));
176}
177$code.=<<___;
178 teq $Xi,sp
179 bne .L_00_15 @ [((11+4)*5+2)*3]
180 sub sp,sp,#25*4
181___
182 &BODY_00_15(@V); unshift(@V,pop(@V));
183 &BODY_16_19(@V); unshift(@V,pop(@V));
184 &BODY_16_19(@V); unshift(@V,pop(@V));
185 &BODY_16_19(@V); unshift(@V,pop(@V));
186 &BODY_16_19(@V); unshift(@V,pop(@V));
187$code.=<<___;
188
189 ldr $K,.LK_20_39 @ [+15+16*4]
190 cmn sp,#0 @ [+3], clear carry to denote 20_39
191.L_20_39_or_60_79:
192___
193for($i=0;$i<5;$i++) {
194 &BODY_20_39(@V); unshift(@V,pop(@V));
195}
196$code.=<<___;
197 teq $Xi,sp @ preserve carry
198 bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
199 bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
200
201 ldr $K,.LK_40_59
202 sub sp,sp,#20*4 @ [+2]
203.L_40_59:
204___
205for($i=0;$i<5;$i++) {
206 &BODY_40_59(@V); unshift(@V,pop(@V));
207}
208$code.=<<___;
209 teq $Xi,sp
210 bne .L_40_59 @ [+((12+5)*5+2)*4]
211
212 ldr $K,.LK_60_79
213 sub sp,sp,#20*4
214 cmp sp,#0 @ set carry to denote 60_79
215 b .L_20_39_or_60_79 @ [+4], spare 300 bytes
216.L_done:
217 add sp,sp,#80*4 @ "deallocate" stack frame
218 ldmia $ctx,{$K,$t0,$t1,$t2,$t3}
219 add $a,$K,$a
220 add $b,$t0,$b
221 add $c,$t1,$c,ror#2
222 add $d,$t2,$d,ror#2
223 add $e,$t3,$e,ror#2
224 stmia $ctx,{$a,$b,$c,$d,$e}
225 teq $inp,$len
226 bne .Lloop @ [+18], total 1307
227
228#if __ARM_ARCH__>=5
229 ldmia sp!,{r4-r12,pc}
230#else
231 ldmia sp!,{r4-r12,lr}
232 tst lr,#1
233 moveq pc,lr @ be binary compatible with V4, yet
234 bx lr @ interoperable with Thumb ISA:-)
235#endif
236.align 2
237.LK_00_19: .word 0x5a827999
238.LK_20_39: .word 0x6ed9eba1
239.LK_40_59: .word 0x8f1bbcdc
240.LK_60_79: .word 0xca62c1d6
241.size sha1_block_data_order,.-sha1_block_data_order
242.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
243.align 2
244___
245
246$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
247print $code;
248close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/sha/asm/sha1-mips.pl b/src/lib/libcrypto/sha/asm/sha1-mips.pl
deleted file mode 100644
index 75fe7113e2..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-mips.pl
+++ /dev/null
@@ -1,350 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA1 block procedure for MIPS.
11
12# The performance improvement is 30% on unaligned input. The "secret" is
13# to deploy the lwl/lwr pair to load unaligned input. One could have
14# vectorized Xupdate on MIPSIII/IV, but the goal was to code a MIPS32-
15# compatible subroutine. There is room for minor optimization on
16# little-endian platforms...
17
18######################################################################
19# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
20# widely used. Then there is a new contender: NUBI. It appears that if
21# one picks the latter, it's possible to arrange the code in an ABI-
22# neutral manner. Therefore let's stick to the NUBI register layout:
23#
24($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
25($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
26($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
27($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
28#
29# The return value is placed in $a0. The following coding rules facilitate
30# interoperability:
31#
32# - never ever touch $tp, "thread pointer", former $gp;
33# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
34# old code];
35# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
36#
37# For reference here is register layout for N32/64 MIPS ABIs:
38#
39# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
40# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
41# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
42# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
43# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
44#
45$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
46
47if ($flavour =~ /64|n32/i) {
48 $PTR_ADD="dadd"; # incidentally works even on n32
49 $PTR_SUB="dsub"; # incidentally works even on n32
50 $REG_S="sd";
51 $REG_L="ld";
52 $PTR_SLL="dsll"; # incidentally works even on n32
53 $SZREG=8;
54} else {
55 $PTR_ADD="add";
56 $PTR_SUB="sub";
57 $REG_S="sw";
58 $REG_L="lw";
59 $PTR_SLL="sll";
60 $SZREG=4;
61}
62#
63# <appro@openssl.org>
64#
65######################################################################
66
67$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
68
69for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
70open STDOUT,">$output";
71
72if (!defined($big_endian))
73 { $big_endian=(unpack('L',pack('N',1))==1); }
74
75# offsets of the Most and Least Significant Bytes
76$MSB=$big_endian?0:3;
77$LSB=3&~$MSB;
78
79@X=map("\$$_",(8..23)); # a4-a7,s0-s11
80
81$ctx=$a0;
82$inp=$a1;
83$num=$a2;
84$A="\$1";
85$B="\$2";
86$C="\$3";
87$D="\$7";
88$E="\$24"; @V=($A,$B,$C,$D,$E);
89$t0="\$25";
90$t1=$num; # $num is offloaded to stack
91$t2="\$30"; # fp
92$K="\$31"; # ra
93
94sub BODY_00_14 {
95my ($i,$a,$b,$c,$d,$e)=@_;
96my $j=$i+1;
97$code.=<<___ if (!$big_endian);
98 srl $t0,@X[$i],24 # byte swap($i)
99 srl $t1,@X[$i],8
100 andi $t2,@X[$i],0xFF00
101 sll @X[$i],@X[$i],24
102 andi $t1,0xFF00
103 sll $t2,$t2,8
104 or @X[$i],$t0
105 or $t1,$t2
106 or @X[$i],$t1
107___
108$code.=<<___;
109 lwl @X[$j],$j*4+$MSB($inp)
110 sll $t0,$a,5 # $i
111 addu $e,$K
112 lwr @X[$j],$j*4+$LSB($inp)
113 srl $t1,$a,27
114 addu $e,$t0
115 xor $t0,$c,$d
116 addu $e,$t1
117 sll $t2,$b,30
118 and $t0,$b
119 srl $b,$b,2
120 xor $t0,$d
121 addu $e,@X[$i]
122 or $b,$t2
123 addu $e,$t0
124___
125}
126
127sub BODY_15_19 {
128my ($i,$a,$b,$c,$d,$e)=@_;
129my $j=$i+1;
130
131$code.=<<___ if (!$big_endian && $i==15);
132 srl $t0,@X[$i],24 # byte swap($i)
133 srl $t1,@X[$i],8
134 andi $t2,@X[$i],0xFF00
135 sll @X[$i],@X[$i],24
136 andi $t1,0xFF00
137 sll $t2,$t2,8
138 or @X[$i],$t0
139 or @X[$i],$t1
140 or @X[$i],$t2
141___
142$code.=<<___;
143 xor @X[$j%16],@X[($j+2)%16]
144 sll $t0,$a,5 # $i
145 addu $e,$K
146 srl $t1,$a,27
147 addu $e,$t0
148 xor @X[$j%16],@X[($j+8)%16]
149 xor $t0,$c,$d
150 addu $e,$t1
151 xor @X[$j%16],@X[($j+13)%16]
152 sll $t2,$b,30
153 and $t0,$b
154 srl $t1,@X[$j%16],31
155 addu @X[$j%16],@X[$j%16]
156 srl $b,$b,2
157 xor $t0,$d
158 or @X[$j%16],$t1
159 addu $e,@X[$i%16]
160 or $b,$t2
161 addu $e,$t0
162___
163}
164
165sub BODY_20_39 {
166my ($i,$a,$b,$c,$d,$e)=@_;
167my $j=$i+1;
168$code.=<<___ if ($i<79);
169 xor @X[$j%16],@X[($j+2)%16]
170 sll $t0,$a,5 # $i
171 addu $e,$K
172 srl $t1,$a,27
173 addu $e,$t0
174 xor @X[$j%16],@X[($j+8)%16]
175 xor $t0,$c,$d
176 addu $e,$t1
177 xor @X[$j%16],@X[($j+13)%16]
178 sll $t2,$b,30
179 xor $t0,$b
180 srl $t1,@X[$j%16],31
181 addu @X[$j%16],@X[$j%16]
182 srl $b,$b,2
183 addu $e,@X[$i%16]
184 or @X[$j%16],$t1
185 or $b,$t2
186 addu $e,$t0
187___
188$code.=<<___ if ($i==79);
189 lw @X[0],0($ctx)
190 sll $t0,$a,5 # $i
191 addu $e,$K
192 lw @X[1],4($ctx)
193 srl $t1,$a,27
194 addu $e,$t0
195 lw @X[2],8($ctx)
196 xor $t0,$c,$d
197 addu $e,$t1
198 lw @X[3],12($ctx)
199 sll $t2,$b,30
200 xor $t0,$b
201 lw @X[4],16($ctx)
202 srl $b,$b,2
203 addu $e,@X[$i%16]
204 or $b,$t2
205 addu $e,$t0
206___
207}
208
209sub BODY_40_59 {
210my ($i,$a,$b,$c,$d,$e)=@_;
211my $j=$i+1;
212$code.=<<___ if ($i<79);
213 xor @X[$j%16],@X[($j+2)%16]
214 sll $t0,$a,5 # $i
215 addu $e,$K
216 srl $t1,$a,27
217 addu $e,$t0
218 xor @X[$j%16],@X[($j+8)%16]
219 and $t0,$c,$d
220 addu $e,$t1
221 xor @X[$j%16],@X[($j+13)%16]
222 sll $t2,$b,30
223 addu $e,$t0
224 srl $t1,@X[$j%16],31
225 xor $t0,$c,$d
226 addu @X[$j%16],@X[$j%16]
227 and $t0,$b
228 srl $b,$b,2
229 or @X[$j%16],$t1
230 addu $e,@X[$i%16]
231 or $b,$t2
232 addu $e,$t0
233___
234}
235
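# A minimal plain-Perl reference (added for illustration only, never called)
# of the SHA-1 round functions that the BODY_* subs above inline: rounds
# 0-19 use Ch, 20-39 and 60-79 use Parity, 40-59 use Maj. The assembly
# computes Ch as (b & (c ^ d)) ^ d and accumulates Maj as
# (c & d) + (b & (c ^ d)); both are equivalent to the textbook forms below.
sub sha1_ch	{ my ($b,$c,$d)=@_; (($b & $c) | (~$b & $d)) & 0xffffffff }
sub sha1_parity	{ my ($b,$c,$d)=@_; ($b ^ $c ^ $d) & 0xffffffff }
sub sha1_maj	{ my ($b,$c,$d)=@_; (($b & $c) | ($b & $d) | ($c & $d)) & 0xffffffff }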
236$FRAMESIZE=16; # large enough to accommodate NUBI saved registers
237$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
238
239$code=<<___;
240.text
241
242.set noat
243.set noreorder
244.align 5
245.globl sha1_block_data_order
246.ent sha1_block_data_order
247sha1_block_data_order:
248 .frame $sp,$FRAMESIZE*$SZREG,$ra
249 .mask $SAVED_REGS_MASK,-$SZREG
250 .set noreorder
251 $PTR_SUB $sp,$FRAMESIZE*$SZREG
252 $REG_S $ra,($FRAMESIZE-1)*$SZREG($sp)
253 $REG_S $fp,($FRAMESIZE-2)*$SZREG($sp)
254 $REG_S $s11,($FRAMESIZE-3)*$SZREG($sp)
255 $REG_S $s10,($FRAMESIZE-4)*$SZREG($sp)
256 $REG_S $s9,($FRAMESIZE-5)*$SZREG($sp)
257 $REG_S $s8,($FRAMESIZE-6)*$SZREG($sp)
258 $REG_S $s7,($FRAMESIZE-7)*$SZREG($sp)
259 $REG_S $s6,($FRAMESIZE-8)*$SZREG($sp)
260 $REG_S $s5,($FRAMESIZE-9)*$SZREG($sp)
261 $REG_S $s4,($FRAMESIZE-10)*$SZREG($sp)
262___
263$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
264 $REG_S $s3,($FRAMESIZE-11)*$SZREG($sp)
265 $REG_S $s2,($FRAMESIZE-12)*$SZREG($sp)
266 $REG_S $s1,($FRAMESIZE-13)*$SZREG($sp)
267 $REG_S $s0,($FRAMESIZE-14)*$SZREG($sp)
268 $REG_S $gp,($FRAMESIZE-15)*$SZREG($sp)
269___
270$code.=<<___;
271 $PTR_SLL $num,6
272 $PTR_ADD $num,$inp
273 $REG_S $num,0($sp)
274 lw $A,0($ctx)
275 lw $B,4($ctx)
276 lw $C,8($ctx)
277 lw $D,12($ctx)
278 b .Loop
279 lw $E,16($ctx)
280.align 4
281.Loop:
282 .set reorder
283 lwl @X[0],$MSB($inp)
284 lui $K,0x5a82
285 lwr @X[0],$LSB($inp)
286 ori $K,0x7999 # K_00_19
287___
288for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); }
289for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); }
290$code.=<<___;
291 lui $K,0x6ed9
292 ori $K,0xeba1 # K_20_39
293___
294for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
295$code.=<<___;
296 lui $K,0x8f1b
297 ori $K,0xbcdc # K_40_59
298___
299for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
300$code.=<<___;
301 lui $K,0xca62
302 ori $K,0xc1d6 # K_60_79
303___
304for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
305$code.=<<___;
306 $PTR_ADD $inp,64
307 $REG_L $num,0($sp)
308
309 addu $A,$X[0]
310 addu $B,$X[1]
311 sw $A,0($ctx)
312 addu $C,$X[2]
313 addu $D,$X[3]
314 sw $B,4($ctx)
315 addu $E,$X[4]
316 sw $C,8($ctx)
317 sw $D,12($ctx)
318 sw $E,16($ctx)
319 .set noreorder
320 bne $inp,$num,.Loop
321 nop
322
323 .set noreorder
324 $REG_L $ra,($FRAMESIZE-1)*$SZREG($sp)
325 $REG_L $fp,($FRAMESIZE-2)*$SZREG($sp)
326 $REG_L $s11,($FRAMESIZE-3)*$SZREG($sp)
327 $REG_L $s10,($FRAMESIZE-4)*$SZREG($sp)
328 $REG_L $s9,($FRAMESIZE-5)*$SZREG($sp)
329 $REG_L $s8,($FRAMESIZE-6)*$SZREG($sp)
330 $REG_L $s7,($FRAMESIZE-7)*$SZREG($sp)
331 $REG_L $s6,($FRAMESIZE-8)*$SZREG($sp)
332 $REG_L $s5,($FRAMESIZE-9)*$SZREG($sp)
333 $REG_L $s4,($FRAMESIZE-10)*$SZREG($sp)
334___
335$code.=<<___ if ($flavour =~ /nubi/i);
336 $REG_L $s3,($FRAMESIZE-11)*$SZREG($sp)
337 $REG_L $s2,($FRAMESIZE-12)*$SZREG($sp)
338 $REG_L $s1,($FRAMESIZE-13)*$SZREG($sp)
339 $REG_L $s0,($FRAMESIZE-14)*$SZREG($sp)
340 $REG_L $gp,($FRAMESIZE-15)*$SZREG($sp)
341___
342$code.=<<___;
343 jr $ra
344 $PTR_ADD $sp,$FRAMESIZE*$SZREG
345.end sha1_block_data_order
346.rdata
347.asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
348___
349print $code;
350close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-parisc.pl b/src/lib/libcrypto/sha/asm/sha1-parisc.pl
deleted file mode 100644
index 783c26272b..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-parisc.pl
+++ /dev/null
@@ -1,258 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA1 block procedure for PA-RISC.
11
12# June 2009.
13#
14# On PA-7100LC performance is >30% better than gcc 3.2 generated code
15# for aligned input and >50% better for unaligned. Compared to the vendor
16# compiler on PA-8600 it's almost 60% faster in a 64-bit build and just a
17# few percent faster in a 32-bit one (this is for aligned input; data for
18# unaligned input is not available).
19#
20# Special thanks to polarhome.com for providing HP-UX account.
21
22$flavour = shift;
23$output = shift;
24open STDOUT,">$output";
25
26if ($flavour =~ /64/) {
27 $LEVEL ="2.0W";
28 $SIZE_T =8;
29 $FRAME_MARKER =80;
30 $SAVED_RP =16;
31 $PUSH ="std";
32 $PUSHMA ="std,ma";
33 $POP ="ldd";
34 $POPMB ="ldd,mb";
35} else {
36 $LEVEL ="1.0";
37 $SIZE_T =4;
38 $FRAME_MARKER =48;
39 $SAVED_RP =20;
40 $PUSH ="stw";
41 $PUSHMA ="stwm";
42 $POP ="ldw";
43 $POPMB ="ldwm";
44}
45
46$FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker
47 # [+ argument transfer]
48$ctx="%r26"; # arg0
49$inp="%r25"; # arg1
50$num="%r24"; # arg2
51
52$t0="%r28";
53$t1="%r29";
54$K="%r31";
55
56@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
57 "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0);
58
59@V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23");
60
61sub BODY_00_19 {
62my ($i,$a,$b,$c,$d,$e)=@_;
63my $j=$i+1;
64$code.=<<___ if ($i<15);
65 addl $K,$e,$e ; $i
66 shd $a,$a,27,$t1
67 addl @X[$i],$e,$e
68 and $c,$b,$t0
69 addl $t1,$e,$e
70 andcm $d,$b,$t1
71 shd $b,$b,2,$b
72 or $t1,$t0,$t0
73 addl $t0,$e,$e
74___
75$code.=<<___ if ($i>=15); # with forward Xupdate
76 addl $K,$e,$e ; $i
77 shd $a,$a,27,$t1
78 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
79 addl @X[$i%16],$e,$e
80 and $c,$b,$t0
81 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
82 addl $t1,$e,$e
83 andcm $d,$b,$t1
84 shd $b,$b,2,$b
85 or $t1,$t0,$t0
86 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
87 add $t0,$e,$e
88 shd @X[$j%16],@X[$j%16],31,@X[$j%16]
89___
90}
91
92sub BODY_20_39 {
93my ($i,$a,$b,$c,$d,$e)=@_;
94my $j=$i+1;
95$code.=<<___ if ($i<79);
96 xor @X[($j+2)%16],@X[$j%16],@X[$j%16] ; $i
97 addl $K,$e,$e
98 shd $a,$a,27,$t1
99 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
100 addl @X[$i%16],$e,$e
101 xor $b,$c,$t0
102 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
103 addl $t1,$e,$e
104 shd $b,$b,2,$b
105 xor $d,$t0,$t0
106 shd @X[$j%16],@X[$j%16],31,@X[$j%16]
107 addl $t0,$e,$e
108___
109$code.=<<___ if ($i==79); # with context load
110 ldw 0($ctx),@X[0] ; $i
111 addl $K,$e,$e
112 shd $a,$a,27,$t1
113 ldw 4($ctx),@X[1]
114 addl @X[$i%16],$e,$e
115 xor $b,$c,$t0
116 ldw 8($ctx),@X[2]
117 addl $t1,$e,$e
118 shd $b,$b,2,$b
119 xor $d,$t0,$t0
120 ldw 12($ctx),@X[3]
121 addl $t0,$e,$e
122 ldw 16($ctx),@X[4]
123___
124}
125
126sub BODY_40_59 {
127my ($i,$a,$b,$c,$d,$e)=@_;
128my $j=$i+1;
129$code.=<<___;
130 shd $a,$a,27,$t1 ; $i
131 addl $K,$e,$e
132 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
133 xor $d,$c,$t0
134 addl @X[$i%16],$e,$e
135 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
136 and $b,$t0,$t0
137 addl $t1,$e,$e
138 shd $b,$b,2,$b
139 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
140 addl $t0,$e,$e
141 and $d,$c,$t1
142 shd @X[$j%16],@X[$j%16],31,@X[$j%16]
143 addl $t1,$e,$e
144___
145}
146
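# A minimal plain-Perl sketch (illustrative only, not used by the script) of
# the message-schedule recurrence that the "forward Xupdate" code above
# interleaves with the rounds. With a 16-word circular buffer,
# W[j] = rotl1( W[j-16] ^ W[j-14] ^ W[j-8] ^ W[j-3] ) for j >= 16, and the
# offsets j-16, j-14, j-8, j-3 map to j%16, (j+2)%16, (j+8)%16 and (j+13)%16,
# which is exactly the indexing used in the xor/shd sequences above.
sub xupdate_ref {
	my ($W, $j) = @_;		# $W is a reference to the 16-word buffer
	my $x = $W->[$j%16] ^ $W->[($j+2)%16] ^ $W->[($j+8)%16] ^ $W->[($j+13)%16];
	$W->[$j%16] = (($x << 1) | ($x >> 31)) & 0xffffffff;
	return $W->[$j%16];
}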
147$code=<<___;
148 .LEVEL $LEVEL
149 .text
150
151 .EXPORT sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
152sha1_block_data_order
153 .PROC
154 .CALLINFO FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16
155 .ENTRY
156 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
157 $PUSHMA %r3,$FRAME(%sp)
158 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
159 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
160 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
161 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
162 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
163 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
164 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
165 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
166 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
167 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
168 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
169 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
170 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
171
172 ldw 0($ctx),$A
173 ldw 4($ctx),$B
174 ldw 8($ctx),$C
175 ldw 12($ctx),$D
176 ldw 16($ctx),$E
177
178 extru $inp,31,2,$t0 ; t0=inp&3;
179 sh3addl $t0,%r0,$t0 ; t0*=8;
180 subi 32,$t0,$t0 ; t0=32-t0;
181 mtctl $t0,%cr11 ; %sar=t0;
182
183L\$oop
184 ldi 3,$t0
185 andcm $inp,$t0,$t0 ; 64-bit neutral
186___
187 for ($i=0;$i<15;$i++) { # load input block
188 $code.="\tldw `4*$i`($t0),@X[$i]\n"; }
189$code.=<<___;
190 cmpb,*= $inp,$t0,L\$aligned
191 ldw 60($t0),@X[15]
192 ldw 64($t0),@X[16]
193___
194 for ($i=0;$i<16;$i++) { # align input
195 $code.="\tvshd @X[$i],@X[$i+1],@X[$i]\n"; }
196$code.=<<___;
197L\$aligned
198 ldil L'0x5a827000,$K ; K_00_19
199 ldo 0x999($K),$K
200___
201for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
202$code.=<<___;
203 ldil L'0x6ed9e000,$K ; K_20_39
204 ldo 0xba1($K),$K
205___
206
207for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
208$code.=<<___;
209 ldil L'0x8f1bb000,$K ; K_40_59
210 ldo 0xcdc($K),$K
211___
212
213for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
214$code.=<<___;
215 ldil L'0xca62c000,$K ; K_60_79
216 ldo 0x1d6($K),$K
217___
218for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
219
220$code.=<<___;
221 addl @X[0],$A,$A
222 addl @X[1],$B,$B
223 addl @X[2],$C,$C
224 addl @X[3],$D,$D
225 addl @X[4],$E,$E
226 stw $A,0($ctx)
227 stw $B,4($ctx)
228 stw $C,8($ctx)
229 stw $D,12($ctx)
230 stw $E,16($ctx)
231 addib,*<> -1,$num,L\$oop
232 ldo 64($inp),$inp
233
234 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
235 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
236 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
237 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
238 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
239 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
240 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
241 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
242 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
243 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
244 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
245 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
246 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
247 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
248 bv (%r2)
249 .EXIT
250 $POPMB -$FRAME(%sp),%r3
251 .PROCEND
252___
253
254$code =~ s/\`([^\`]*)\`/eval $1/gem;
255$code =~ s/,\*/,/gm if ($SIZE_T==4);
256$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8);
257print $code;
258close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-ppc.pl b/src/lib/libcrypto/sha/asm/sha1-ppc.pl
deleted file mode 100755
index 85342b6a82..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-ppc.pl
+++ /dev/null
@@ -1,318 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# I let hardware handle unaligned input(*), except on page boundaries
11# (see below for details). Otherwise it's a straightforward implementation
12# with the X vector in the register bank. The module is big-endian [which is
13# not a big deal as there are no little-endian targets left around].
14#
15# (*) this means that this module is inappropriate for PPC403? Does
16# anybody know if pre-POWER3 can sustain unaligned load?
17
18# -m64 -m32
19# ----------------------------------
20# PPC970,gcc-4.0.0 +76% +59%
21# Power6,xlc-7 +68% +33%
22
23$flavour = shift;
24
25if ($flavour =~ /64/) {
26 $SIZE_T =8;
27 $LRSAVE =2*$SIZE_T;
28 $UCMP ="cmpld";
29 $STU ="stdu";
30 $POP ="ld";
31 $PUSH ="std";
32} elsif ($flavour =~ /32/) {
33 $SIZE_T =4;
34 $LRSAVE =$SIZE_T;
35 $UCMP ="cmplw";
36 $STU ="stwu";
37 $POP ="lwz";
38 $PUSH ="stw";
39} else { die "nonsense $flavour"; }
40
41$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
42( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
43( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
44die "can't locate ppc-xlate.pl";
45
46open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
47
48$FRAME=24*$SIZE_T+64;
49$LOCALS=6*$SIZE_T;
50
51$K ="r0";
52$sp ="r1";
53$toc="r2";
54$ctx="r3";
55$inp="r4";
56$num="r5";
57$t0 ="r15";
58$t1 ="r6";
59
60$A ="r7";
61$B ="r8";
62$C ="r9";
63$D ="r10";
64$E ="r11";
65$T ="r12";
66
67@V=($A,$B,$C,$D,$E,$T);
68@X=("r16","r17","r18","r19","r20","r21","r22","r23",
69 "r24","r25","r26","r27","r28","r29","r30","r31");
70
71sub BODY_00_19 {
72my ($i,$a,$b,$c,$d,$e,$f)=@_;
73my $j=$i+1;
74$code.=<<___ if ($i==0);
75 lwz @X[$i],`$i*4`($inp)
76___
77$code.=<<___ if ($i<15);
78 lwz @X[$j],`$j*4`($inp)
79 add $f,$K,$e
80 rotlwi $e,$a,5
81 add $f,$f,@X[$i]
82 and $t0,$c,$b
83 add $f,$f,$e
84 andc $t1,$d,$b
85 rotlwi $b,$b,30
86 or $t0,$t0,$t1
87 add $f,$f,$t0
88___
89$code.=<<___ if ($i>=15);
90 add $f,$K,$e
91 rotlwi $e,$a,5
92 xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
93 add $f,$f,@X[$i%16]
94 and $t0,$c,$b
95 xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
96 add $f,$f,$e
97 andc $t1,$d,$b
98 rotlwi $b,$b,30
99 or $t0,$t0,$t1
100 xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
101 add $f,$f,$t0
102 rotlwi @X[$j%16],@X[$j%16],1
103___
104}
105
106sub BODY_20_39 {
107my ($i,$a,$b,$c,$d,$e,$f)=@_;
108my $j=$i+1;
109$code.=<<___ if ($i<79);
110 add $f,$K,$e
111 rotlwi $e,$a,5
112 xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
113 add $f,$f,@X[$i%16]
114 xor $t0,$b,$c
115 xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
116 add $f,$f,$e
117 rotlwi $b,$b,30
118 xor $t0,$t0,$d
119 xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
120 add $f,$f,$t0
121 rotlwi @X[$j%16],@X[$j%16],1
122___
123$code.=<<___ if ($i==79);
124 add $f,$K,$e
125 rotlwi $e,$a,5
126 lwz r16,0($ctx)
127 add $f,$f,@X[$i%16]
128 xor $t0,$b,$c
129 lwz r17,4($ctx)
130 add $f,$f,$e
131 rotlwi $b,$b,30
132 lwz r18,8($ctx)
133 xor $t0,$t0,$d
134 lwz r19,12($ctx)
135 add $f,$f,$t0
136 lwz r20,16($ctx)
137___
138}
139
140sub BODY_40_59 {
141my ($i,$a,$b,$c,$d,$e,$f)=@_;
142my $j=$i+1;
143$code.=<<___;
144 add $f,$K,$e
145 rotlwi $e,$a,5
146 xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
147 add $f,$f,@X[$i%16]
148 and $t0,$b,$c
149 xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
150 add $f,$f,$e
151 or $t1,$b,$c
152 rotlwi $b,$b,30
153 xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
154 and $t1,$t1,$d
155 or $t0,$t0,$t1
156 rotlwi @X[$j%16],@X[$j%16],1
157 add $f,$f,$t0
158___
159}
160
161$code=<<___;
162.machine "any"
163.text
164
165.globl .sha1_block_data_order
166.align 4
167.sha1_block_data_order:
168 $STU $sp,-$FRAME($sp)
169 mflr r0
170 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
171 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
172 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
173 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
174 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
175 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
176 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
177 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
178 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
179 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
180 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
181 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
182 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
183 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
184 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
185 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
186 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
187 $PUSH r0,`$FRAME+$LRSAVE`($sp)
188 lwz $A,0($ctx)
189 lwz $B,4($ctx)
190 lwz $C,8($ctx)
191 lwz $D,12($ctx)
192 lwz $E,16($ctx)
193 andi. r0,$inp,3
194 bne Lunaligned
195Laligned:
196 mtctr $num
197 bl Lsha1_block_private
198 b Ldone
199
200; PowerPC specification allows an implementation to be ill-behaved
201; upon unaligned access which crosses page boundary. "Better safe
202; than sorry" principle makes me treat it specially. But I don't
203; look for the particular offending word, but rather for a 64-byte input
204; block which crosses the boundary. Once found, that block is aligned
205; and hashed separately...
206.align 4
207Lunaligned:
208 subfic $t1,$inp,4096
209 andi. $t1,$t1,4095 ; distance to closest page boundary
210 srwi. $t1,$t1,6 ; t1/=64
211 beq Lcross_page
212 $UCMP $num,$t1
213 ble- Laligned ; didn't cross the page boundary
214 mtctr $t1
215 subfc $num,$t1,$num
216 bl Lsha1_block_private
217Lcross_page:
218 li $t1,16
219 mtctr $t1
220 addi r20,$sp,$LOCALS ; spot within the frame
221Lmemcpy:
222 lbz r16,0($inp)
223 lbz r17,1($inp)
224 lbz r18,2($inp)
225 lbz r19,3($inp)
226 addi $inp,$inp,4
227 stb r16,0(r20)
228 stb r17,1(r20)
229 stb r18,2(r20)
230 stb r19,3(r20)
231 addi r20,r20,4
232 bdnz Lmemcpy
233
234 $PUSH $inp,`$FRAME-$SIZE_T*18`($sp)
235 li $t1,1
236 addi $inp,$sp,$LOCALS
237 mtctr $t1
238 bl Lsha1_block_private
239 $POP $inp,`$FRAME-$SIZE_T*18`($sp)
240 addic. $num,$num,-1
241 bne- Lunaligned
242
243Ldone:
244 $POP r0,`$FRAME+$LRSAVE`($sp)
245 $POP r15,`$FRAME-$SIZE_T*17`($sp)
246 $POP r16,`$FRAME-$SIZE_T*16`($sp)
247 $POP r17,`$FRAME-$SIZE_T*15`($sp)
248 $POP r18,`$FRAME-$SIZE_T*14`($sp)
249 $POP r19,`$FRAME-$SIZE_T*13`($sp)
250 $POP r20,`$FRAME-$SIZE_T*12`($sp)
251 $POP r21,`$FRAME-$SIZE_T*11`($sp)
252 $POP r22,`$FRAME-$SIZE_T*10`($sp)
253 $POP r23,`$FRAME-$SIZE_T*9`($sp)
254 $POP r24,`$FRAME-$SIZE_T*8`($sp)
255 $POP r25,`$FRAME-$SIZE_T*7`($sp)
256 $POP r26,`$FRAME-$SIZE_T*6`($sp)
257 $POP r27,`$FRAME-$SIZE_T*5`($sp)
258 $POP r28,`$FRAME-$SIZE_T*4`($sp)
259 $POP r29,`$FRAME-$SIZE_T*3`($sp)
260 $POP r30,`$FRAME-$SIZE_T*2`($sp)
261 $POP r31,`$FRAME-$SIZE_T*1`($sp)
262 mtlr r0
263 addi $sp,$sp,$FRAME
264 blr
265___
266
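# A plain-Perl sketch (illustrative only) of the Lunaligned arithmetic above:
# the distance from $inp to the next 4096-byte page boundary, divided by the
# 64-byte block size, gives how many blocks can be hashed in place before one
# block has to be copied into the stack frame and hashed from there.
sub blocks_before_page_cross {
	my ($inp) = @_;				# numeric byte address of the input
	my $dist = (4096 - ($inp & 4095)) & 4095;	# distance to closest page boundary
	return $dist >> 6;			# whole 64-byte blocks that fit before it
}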
267# This is a private block function, which uses a tailored calling
268# interface: upon entry the SHA_CTX is pre-loaded into the given
269# registers and the counter register contains the number of chunks to
270# digest...
271$code.=<<___;
272.align 4
273Lsha1_block_private:
274___
275$code.=<<___; # load K_00_19
276 lis $K,0x5a82
277 ori $K,$K,0x7999
278___
279for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
280$code.=<<___; # load K_20_39
281 lis $K,0x6ed9
282 ori $K,$K,0xeba1
283___
284for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
285$code.=<<___; # load K_40_59
286 lis $K,0x8f1b
287 ori $K,$K,0xbcdc
288___
289for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
290$code.=<<___; # load K_60_79
291 lis $K,0xca62
292 ori $K,$K,0xc1d6
293___
294for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
295$code.=<<___;
296 add r16,r16,$E
297 add r17,r17,$T
298 add r18,r18,$A
299 add r19,r19,$B
300 add r20,r20,$C
301 stw r16,0($ctx)
302 mr $A,r16
303 stw r17,4($ctx)
304 mr $B,r17
305 stw r18,8($ctx)
306 mr $C,r18
307 stw r19,12($ctx)
308 mr $D,r19
309 stw r20,16($ctx)
310 mr $E,r20
311 addi $inp,$inp,`16*4`
312 bdnz- Lsha1_block_private
313 blr
314___
315
316$code =~ s/\`([^\`]*)\`/eval $1/gem;
317print $code;
318close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl
deleted file mode 100644
index 5235c59e63..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl
+++ /dev/null
@@ -1,282 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# Performance improvement is not really impressive on pre-T1 CPU: +8%
11# over Sun C and +25% over gcc [3.3]. While on T1, a.k.a. Niagara, it
12# turned to be 40% faster than 64-bit code generated by Sun C 5.8 and
13# >2x than 64-bit code generated by gcc 3.4. And there is a gimmick.
14# X[16] vector is packed to 8 64-bit registers and as result nothing
15# is spilled on stack. In addition input data is loaded in compact
16# instruction sequence, thus minimizing the window when the code is
17# subject to [inter-thread] cache-thrashing hazard. The goal is to
18# ensure scalability on UltraSPARC T1, or rather to avoid decay when
19# amount of active threads exceeds the number of physical cores.
20
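# A minimal sketch (illustrative only, assumes a 64-bit perl) of the packed
# rotate used in Xupdate below: two 32-bit W words share one 64-bit register
# and are both rotated left by 1 with a single shift/add/mask sequence, the
# mask $rot1m = 0x0000000100000001 isolating the bit that wraps around in
# each 32-bit lane. rotl1_packed() is not called anywhere in this script.
sub rotl1_packed {
	my ($x) = @_;			# high 32 bits hold W[2k], low 32 bits W[2k+1]
	my $rot1m = (1 << 32) | 1;	# requires a 64-bit perl
	my $wrap = ($x >> 31) & $rot1m;				# MSB of each lane, moved to lane bit 0
	my $dbl  = ($x << 1) & ~$rot1m & 0xffffffffffffffff;	# lanes shifted left, cross-lane carry dropped
	return $dbl | $wrap;
}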
21$bits=32;
22for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
23if ($bits==64) { $bias=2047; $frame=192; }
24else { $bias=0; $frame=112; }
25
26$output=shift;
27open STDOUT,">$output";
28
29@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
30$rot1m="%g2";
31$tmp64="%g3";
32$Xi="%g4";
33$A="%l0";
34$B="%l1";
35$C="%l2";
36$D="%l3";
37$E="%l4";
38@V=($A,$B,$C,$D,$E);
39$K_00_19="%l5";
40$K_20_39="%l6";
41$K_40_59="%l7";
42$K_60_79="%g5";
43@K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);
44
45$ctx="%i0";
46$inp="%i1";
47$len="%i2";
48$tmp0="%i3";
49$tmp1="%i4";
50$tmp2="%i5";
51
52sub BODY_00_15 {
53my ($i,$a,$b,$c,$d,$e)=@_;
54my $xi=($i&1)?@X[($i/2)%8]:$Xi;
55
56$code.=<<___;
57 sll $a,5,$tmp0 !! $i
58 add @K[$i/20],$e,$e
59 srl $a,27,$tmp1
60 add $tmp0,$e,$e
61 and $c,$b,$tmp0
62 add $tmp1,$e,$e
63 sll $b,30,$tmp2
64 andn $d,$b,$tmp1
65 srl $b,2,$b
66 or $tmp1,$tmp0,$tmp1
67 or $tmp2,$b,$b
68 add $xi,$e,$e
69___
70if ($i&1 && $i<15) {
71 $code.=
72 " srlx @X[(($i+1)/2)%8],32,$Xi\n";
73}
74$code.=<<___;
75 add $tmp1,$e,$e
76___
77}
78
79sub Xupdate {
80my ($i,$a,$b,$c,$d,$e)=@_;
81my $j=$i/2;
82
83if ($i&1) {
84$code.=<<___;
85 sll $a,5,$tmp0 !! $i
86 add @K[$i/20],$e,$e
87 srl $a,27,$tmp1
88___
89} else {
90$code.=<<___;
91 sllx @X[($j+6)%8],32,$Xi ! Xupdate($i)
92 xor @X[($j+1)%8],@X[$j%8],@X[$j%8]
93 srlx @X[($j+7)%8],32,$tmp1
94 xor @X[($j+4)%8],@X[$j%8],@X[$j%8]
95 sll $a,5,$tmp0 !! $i
96 or $tmp1,$Xi,$Xi
97 add @K[$i/20],$e,$e !!
98 xor $Xi,@X[$j%8],@X[$j%8]
99 srlx @X[$j%8],31,$Xi
100 add @X[$j%8],@X[$j%8],@X[$j%8]
101 and $Xi,$rot1m,$Xi
102 andn @X[$j%8],$rot1m,@X[$j%8]
103 srl $a,27,$tmp1 !!
104 or $Xi,@X[$j%8],@X[$j%8]
105___
106}
107}
108
109sub BODY_16_19 {
110my ($i,$a,$b,$c,$d,$e)=@_;
111
112 &Xupdate(@_);
113 if ($i&1) {
114 $xi=@X[($i/2)%8];
115 } else {
116 $xi=$Xi;
117 $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
118 }
119$code.=<<___;
120 add $tmp0,$e,$e !!
121 and $c,$b,$tmp0
122 add $tmp1,$e,$e
123 sll $b,30,$tmp2
124 add $xi,$e,$e
125 andn $d,$b,$tmp1
126 srl $b,2,$b
127 or $tmp1,$tmp0,$tmp1
128 or $tmp2,$b,$b
129 add $tmp1,$e,$e
130___
131}
132
133sub BODY_20_39 {
134my ($i,$a,$b,$c,$d,$e)=@_;
135my $xi;
136 &Xupdate(@_);
137 if ($i&1) {
138 $xi=@X[($i/2)%8];
139 } else {
140 $xi=$Xi;
141 $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
142 }
143$code.=<<___;
144 add $tmp0,$e,$e !!
145 xor $c,$b,$tmp0
146 add $tmp1,$e,$e
147 sll $b,30,$tmp2
148 xor $d,$tmp0,$tmp1
149 srl $b,2,$b
150 add $tmp1,$e,$e
151 or $tmp2,$b,$b
152 add $xi,$e,$e
153___
154}
155
156sub BODY_40_59 {
157my ($i,$a,$b,$c,$d,$e)=@_;
158my $xi;
159 &Xupdate(@_);
160 if ($i&1) {
161 $xi=@X[($i/2)%8];
162 } else {
163 $xi=$Xi;
164 $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
165 }
166$code.=<<___;
167 add $tmp0,$e,$e !!
168 and $c,$b,$tmp0
169 add $tmp1,$e,$e
170 sll $b,30,$tmp2
171 or $c,$b,$tmp1
172 srl $b,2,$b
173 and $d,$tmp1,$tmp1
174 add $xi,$e,$e
175 or $tmp1,$tmp0,$tmp1
176 or $tmp2,$b,$b
177 add $tmp1,$e,$e
178___
179}
180
181$code.=<<___ if ($bits==64);
182.register %g2,#scratch
183.register %g3,#scratch
184___
185$code.=<<___;
186.section ".text",#alloc,#execinstr
187
188.align 32
189.globl sha1_block_data_order
190sha1_block_data_order:
191 save %sp,-$frame,%sp
192 sllx $len,6,$len
193 add $inp,$len,$len
194
195 or %g0,1,$rot1m
196 sllx $rot1m,32,$rot1m
197 or $rot1m,1,$rot1m
198
199 ld [$ctx+0],$A
200 ld [$ctx+4],$B
201 ld [$ctx+8],$C
202 ld [$ctx+12],$D
203 ld [$ctx+16],$E
204 andn $inp,7,$tmp0
205
206 sethi %hi(0x5a827999),$K_00_19
207 or $K_00_19,%lo(0x5a827999),$K_00_19
208 sethi %hi(0x6ed9eba1),$K_20_39
209 or $K_20_39,%lo(0x6ed9eba1),$K_20_39
210 sethi %hi(0x8f1bbcdc),$K_40_59
211 or $K_40_59,%lo(0x8f1bbcdc),$K_40_59
212 sethi %hi(0xca62c1d6),$K_60_79
213 or $K_60_79,%lo(0xca62c1d6),$K_60_79
214
215.Lloop:
216 ldx [$tmp0+0],@X[0]
217 ldx [$tmp0+16],@X[2]
218 ldx [$tmp0+32],@X[4]
219 ldx [$tmp0+48],@X[6]
220 and $inp,7,$tmp1
221 ldx [$tmp0+8],@X[1]
222 sll $tmp1,3,$tmp1
223 ldx [$tmp0+24],@X[3]
224 subcc %g0,$tmp1,$tmp2 ! should be 64-$tmp1, but -$tmp1 works too
225 ldx [$tmp0+40],@X[5]
226 bz,pt %icc,.Laligned
227 ldx [$tmp0+56],@X[7]
228
229 sllx @X[0],$tmp1,@X[0]
230 ldx [$tmp0+64],$tmp64
231___
232for($i=0;$i<7;$i++)
233{ $code.=<<___;
234 srlx @X[$i+1],$tmp2,$Xi
235 sllx @X[$i+1],$tmp1,@X[$i+1]
236 or $Xi,@X[$i],@X[$i]
237___
238}
239$code.=<<___;
240 srlx $tmp64,$tmp2,$tmp64
241 or $tmp64,@X[7],@X[7]
242.Laligned:
243 srlx @X[0],32,$Xi
244___
245for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
246for (;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
247for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
248for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
249for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
250$code.=<<___;
251
252 ld [$ctx+0],@X[0]
253 ld [$ctx+4],@X[1]
254 ld [$ctx+8],@X[2]
255 ld [$ctx+12],@X[3]
256 add $inp,64,$inp
257 ld [$ctx+16],@X[4]
258 cmp $inp,$len
259
260 add $A,@X[0],$A
261 st $A,[$ctx+0]
262 add $B,@X[1],$B
263 st $B,[$ctx+4]
264 add $C,@X[2],$C
265 st $C,[$ctx+8]
266 add $D,@X[3],$D
267 st $D,[$ctx+12]
268 add $E,@X[4],$E
269 st $E,[$ctx+16]
270
271 bne `$bits==64?"%xcc":"%icc"`,.Lloop
272 andn $inp,7,$tmp0
273
274 ret
275 restore
276.type sha1_block_data_order,#function
277.size sha1_block_data_order,(.-sha1_block_data_order)
278___
279
280$code =~ s/\`([^\`]*)\`/eval $1/gem;
281print $code;
282close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha256-586.pl b/src/lib/libcrypto/sha/asm/sha256-586.pl
deleted file mode 100644
index 2b05c96063..0000000000
--- a/src/lib/libcrypto/sha/asm/sha256-586.pl
+++ /dev/null
@@ -1,249 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# SHA256 block transform for x86. September 2007.
11#
12# Performance in clock cycles per processed byte (less is better):
13#
14# Pentium PIII P4 AMD K8 Core2
15# gcc 46 36 41 27 26
16# icc 57 33 38 25 23
17# x86 asm 40 30 33 20 18
18# x86_64 asm(*) - - 21 16 16
19#
20# (*) x86_64 assembler performance is presented for reference
21# purposes.
22#
23# Performance improvement over compiler generated code varies from
24# 10% to 40% [see above]. Not very impressive on some µ-archs, but
25# it's 5 times smaller and optimizes the amount of writes.
26
27$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
28push(@INC,"${dir}","${dir}../../perlasm");
29require "x86asm.pl";
30
31&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
32
33$A="eax";
34$E="edx";
35$T="ebx";
36$Aoff=&DWP(0,"esp");
37$Boff=&DWP(4,"esp");
38$Coff=&DWP(8,"esp");
39$Doff=&DWP(12,"esp");
40$Eoff=&DWP(16,"esp");
41$Foff=&DWP(20,"esp");
42$Goff=&DWP(24,"esp");
43$Hoff=&DWP(28,"esp");
44$Xoff=&DWP(32,"esp");
45$K256="ebp";
46
47sub BODY_00_15() {
48 my $in_16_63=shift;
49
50 &mov ("ecx",$E);
51 &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2])
52 &ror ("ecx",25-11);
53 &mov ("esi",$Foff);
54 &xor ("ecx",$E);
55 &ror ("ecx",11-6);
56 &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0]
57 &xor ("ecx",$E);
58 &ror ("ecx",6); # Sigma1(e)
59 &mov ("edi",$Goff);
60 &add ($T,"ecx"); # T += Sigma1(e)
61
62 &xor ("esi","edi");
63 &mov ($Eoff,$E); # modulo-scheduled
64 &mov ("ecx",$A);
65 &and ("esi",$E);
66 &mov ($E,$Doff); # e becomes d, which is e in next iteration
67 &xor ("esi","edi"); # Ch(e,f,g)
68 &mov ("edi",$A);
69 &add ($T,"esi"); # T += Ch(e,f,g)
70
71 &ror ("ecx",22-13);
72 &add ($T,$Hoff); # T += h
73 &xor ("ecx",$A);
74 &ror ("ecx",13-2);
75 &mov ("esi",$Boff);
76 &xor ("ecx",$A);
77 &ror ("ecx",2); # Sigma0(a)
78 &add ($E,$T); # d += T
79 &mov ("edi",$Coff);
80
81 &add ($T,"ecx"); # T += Sigma0(a)
82 &mov ($Aoff,$A); # modulo-scheduled
83
84 &mov ("ecx",$A);
85 &sub ("esp",4);
86 &or ($A,"esi"); # a becomes h, which is a in next iteration
87 &and ("ecx","esi");
88 &and ($A,"edi");
89 &mov ("esi",&DWP(0,$K256));
90 &or ($A,"ecx"); # h=Maj(a,b,c)
91
92 &add ($K256,4);
93 &add ($A,$T); # h += T
94 &mov ($T,&DWP(4*(8+15+16-1),"esp")) if ($in_16_63); # preload T
95 &add ($E,"esi"); # d += K256[i]
96 &add ($A,"esi"); # h += K256[i]
97}
98
99&static_label("K256");
100&function_begin("sha256_block_data_order");
101 &mov ("esi",wparam(0)); # ctx
102 &mov ("edi",wparam(1)); # inp
103 &mov ("eax",wparam(2)); # num
104 &mov ("ebx","esp"); # saved sp
105
106 &picsetup($K256);
107 &picsymbol($K256, &label("K256"), $K256);
108
109 &sub ("esp",16);
110 &and ("esp",-64);
111
112 &shl ("eax",6);
113 &add ("eax","edi");
114 &mov (&DWP(0,"esp"),"esi"); # ctx
115 &mov (&DWP(4,"esp"),"edi"); # inp
116 &mov (&DWP(8,"esp"),"eax"); # inp+num*128
117 &mov (&DWP(12,"esp"),"ebx"); # saved sp
118
119&set_label("loop",16);
120 # copy input block to stack reversing byte and dword order
121 for($i=0;$i<4;$i++) {
122 &mov ("eax",&DWP($i*16+0,"edi"));
123 &mov ("ebx",&DWP($i*16+4,"edi"));
124 &mov ("ecx",&DWP($i*16+8,"edi"));
125 &mov ("edx",&DWP($i*16+12,"edi"));
126 &bswap ("eax");
127 &bswap ("ebx");
128 &bswap ("ecx");
129 &bswap ("edx");
130 &push ("eax");
131 &push ("ebx");
132 &push ("ecx");
133 &push ("edx");
134 }
135 &add ("edi",64);
136 &sub ("esp",4*8); # place for A,B,C,D,E,F,G,H
137 &mov (&DWP(4*(8+16)+4,"esp"),"edi");
138
139 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
140 &mov ($A,&DWP(0,"esi"));
141 &mov ("ebx",&DWP(4,"esi"));
142 &mov ("ecx",&DWP(8,"esi"));
143 &mov ("edi",&DWP(12,"esi"));
144 # &mov ($Aoff,$A);
145 &mov ($Boff,"ebx");
146 &mov ($Coff,"ecx");
147 &mov ($Doff,"edi");
148 &mov ($E,&DWP(16,"esi"));
149 &mov ("ebx",&DWP(20,"esi"));
150 &mov ("ecx",&DWP(24,"esi"));
151 &mov ("edi",&DWP(28,"esi"));
152 # &mov ($Eoff,$E);
153 &mov ($Foff,"ebx");
154 &mov ($Goff,"ecx");
155 &mov ($Hoff,"edi");
156
157&set_label("00_15",16);
158 &mov ($T,&DWP(4*(8+15),"esp"));
159
160 &BODY_00_15();
161
162 &cmp ("esi",0xc19bf174);
163 &jne (&label("00_15"));
164
165 &mov ($T,&DWP(4*(8+15+16-1),"esp")); # preloaded in BODY_00_15(1)
166&set_label("16_63",16);
167 &mov ("esi",$T);
168 &mov ("ecx",&DWP(4*(8+15+16-14),"esp"));
169 &ror ("esi",18-7);
170 &mov ("edi","ecx");
171 &xor ("esi",$T);
172 &ror ("esi",7);
173 &shr ($T,3);
174
175 &ror ("edi",19-17);
176 &xor ($T,"esi"); # T = sigma0(X[-15])
177 &xor ("edi","ecx");
178 &ror ("edi",17);
179 &shr ("ecx",10);
180 &add ($T,&DWP(4*(8+15+16),"esp")); # T += X[-16]
181 &xor ("edi","ecx"); # sigma1(X[-2])
182
183 &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7]
184 # &add ($T,"edi"); # T += sigma1(X[-2])
185 # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0]
186
187 &BODY_00_15(1);
188
189 &cmp ("esi",0xc67178f2);
190 &jne (&label("16_63"));
191
192 &mov ("esi",&DWP(4*(8+16+64)+0,"esp"));#ctx
193 # &mov ($A,$Aoff);
194 &mov ("ebx",$Boff);
195 &mov ("ecx",$Coff);
196 &mov ("edi",$Doff);
197 &add ($A,&DWP(0,"esi"));
198 &add ("ebx",&DWP(4,"esi"));
199 &add ("ecx",&DWP(8,"esi"));
200 &add ("edi",&DWP(12,"esi"));
201 &mov (&DWP(0,"esi"),$A);
202 &mov (&DWP(4,"esi"),"ebx");
203 &mov (&DWP(8,"esi"),"ecx");
204 &mov (&DWP(12,"esi"),"edi");
205 # &mov ($E,$Eoff);
206 &mov ("eax",$Foff);
207 &mov ("ebx",$Goff);
208 &mov ("ecx",$Hoff);
209 &mov ("edi",&DWP(4*(8+16+64)+4,"esp"));#inp
210 &add ($E,&DWP(16,"esi"));
211 &add ("eax",&DWP(20,"esi"));
212 &add ("ebx",&DWP(24,"esi"));
213 &add ("ecx",&DWP(28,"esi"));
214 &mov (&DWP(16,"esi"),$E);
215 &mov (&DWP(20,"esi"),"eax");
216 &mov (&DWP(24,"esi"),"ebx");
217 &mov (&DWP(28,"esi"),"ecx");
218
219 &add ("esp",4*(8+16+64)); # destroy frame
220 &sub ($K256,4*64); # rewind K
221
222 &cmp ("edi",&DWP(8,"esp")); # are we done yet?
223 &jb (&label("loop"));
224
225 &mov ("esp",&DWP(12,"esp")); # restore sp
226&function_end_A();
227&function_end_B("sha256_block_data_order");
228
229 &rodataseg();
230&set_label("K256",64);
231 &data_word(0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5);
232 &data_word(0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5);
233 &data_word(0xd807aa98,0x12835b01,0x243185be,0x550c7dc3);
234 &data_word(0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174);
235 &data_word(0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc);
236 &data_word(0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da);
237 &data_word(0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7);
238 &data_word(0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967);
239 &data_word(0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13);
240 &data_word(0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85);
241 &data_word(0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3);
242 &data_word(0xd192e819,0xd6990624,0xf40e3585,0x106aa070);
243 &data_word(0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5);
244 &data_word(0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3);
245 &data_word(0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208);
246 &data_word(0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2);
247 &previous();
248
249&asm_finish();
diff --git a/src/lib/libcrypto/sha/asm/sha256-armv4.pl b/src/lib/libcrypto/sha/asm/sha256-armv4.pl
deleted file mode 100644
index 292520731c..0000000000
--- a/src/lib/libcrypto/sha/asm/sha256-armv4.pl
+++ /dev/null
@@ -1,211 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256 block procedure for ARMv4. May 2007.
11
12# Performance is ~2x better than gcc 3.4 generated code and in "abso-
13# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
14# byte [on single-issue Xscale PXA250 core].
15
16# July 2010.
17#
18# Rescheduling for dual-issue pipeline resulted in 22% improvement on
19# Cortex A8 core and ~20 cycles per processed byte.
20
21# February 2011.
22#
23# Profiler-assisted and platform-specific optimization resulted in 16%
24# improvement on Cortex A8 core and ~17 cycles per processed byte.
25
26while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
27open STDOUT,">$output";
28
29$ctx="r0"; $t0="r0";
30$inp="r1"; $t3="r1";
31$len="r2"; $t1="r2";
32$T1="r3";
33$A="r4";
34$B="r5";
35$C="r6";
36$D="r7";
37$E="r8";
38$F="r9";
39$G="r10";
40$H="r11";
41@V=($A,$B,$C,$D,$E,$F,$G,$H);
42$t2="r12";
43$Ktbl="r14";
44
45@Sigma0=( 2,13,22);
46@Sigma1=( 6,11,25);
47@sigma0=( 7,18, 3);
48@sigma1=(17,19,10);
49
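# A minimal plain-Perl reference (illustrative only, never called) of the
# SHA-256 functions that the rotation/shift counts above encode, so the
# ror/lsr based assembly below can be checked against FIPS 180-4:
sub ror32	{ my ($x,$n)=@_; (($x >> $n) | ($x << (32-$n))) & 0xffffffff }
sub Sigma0_ref	{ my $x=shift; ror32($x,2)  ^ ror32($x,13) ^ ror32($x,22) }
sub Sigma1_ref	{ my $x=shift; ror32($x,6)  ^ ror32($x,11) ^ ror32($x,25) }
sub sigma0_ref	{ my $x=shift; ror32($x,7)  ^ ror32($x,18) ^ ($x >> 3)  }
sub sigma1_ref	{ my $x=shift; ror32($x,17) ^ ror32($x,19) ^ ($x >> 10) }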
50sub BODY_00_15 {
51my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
52
53$code.=<<___ if ($i<16);
54#if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
55 ldr $T1,[$inp],#4
56#else
57 ldrb $T1,[$inp,#3] @ $i
58 ldrb $t2,[$inp,#2]
59 ldrb $t1,[$inp,#1]
60 ldrb $t0,[$inp],#4
61 orr $T1,$T1,$t2,lsl#8
62 orr $T1,$T1,$t1,lsl#16
63 orr $T1,$T1,$t0,lsl#24
64#endif
65___
66$code.=<<___;
67 mov $t0,$e,ror#$Sigma1[0]
68 ldr $t2,[$Ktbl],#4 @ *K256++
69 eor $t0,$t0,$e,ror#$Sigma1[1]
70 eor $t1,$f,$g
71#if $i>=16
72 add $T1,$T1,$t3 @ from BODY_16_xx
73#elif __ARM_ARCH__>=7 && defined(__ARMEL__) && !defined(__STRICT_ALIGNMENT)
74 rev $T1,$T1
75#endif
76#if $i==15
77 str $inp,[sp,#17*4] @ leave room for $t3
78#endif
79 eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
80 and $t1,$t1,$e
81 str $T1,[sp,#`$i%16`*4]
82 add $T1,$T1,$t0
83 eor $t1,$t1,$g @ Ch(e,f,g)
84 add $T1,$T1,$h
85 mov $h,$a,ror#$Sigma0[0]
86 add $T1,$T1,$t1
87 eor $h,$h,$a,ror#$Sigma0[1]
88 add $T1,$T1,$t2
89 eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
90#if $i>=15
91 ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx
92#endif
93 orr $t0,$a,$b
94 and $t1,$a,$b
95 and $t0,$t0,$c
96 add $h,$h,$T1
97 orr $t0,$t0,$t1 @ Maj(a,b,c)
98 add $d,$d,$T1
99 add $h,$h,$t0
100___
101}
102
103sub BODY_16_XX {
104my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
105
106$code.=<<___;
107 @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i
108 ldr $t2,[sp,#`($i+14)%16`*4]
109 mov $t0,$t3,ror#$sigma0[0]
110 ldr $T1,[sp,#`($i+0)%16`*4]
111 eor $t0,$t0,$t3,ror#$sigma0[1]
112 ldr $t1,[sp,#`($i+9)%16`*4]
113 eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1])
114 mov $t3,$t2,ror#$sigma1[0]
115 add $T1,$T1,$t0
116 eor $t3,$t3,$t2,ror#$sigma1[1]
117 add $T1,$T1,$t1
118 eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
119 @ add $T1,$T1,$t3
120___
121 &BODY_00_15(@_);
122}
123
124$code=<<___;
125#include "arm_arch.h"
126
127.text
128.code 32
129
130.type K256,%object
131.align 5
132K256:
133.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
134.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
135.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
136.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
137.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
138.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
139.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
140.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
141.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
142.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
143.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
144.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
145.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
146.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
147.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
148.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
149.size K256,.-K256
150
151.global sha256_block_data_order
152.type sha256_block_data_order,%function
153sha256_block_data_order:
154 sub r3,pc,#8 @ sha256_block_data_order
155 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
156 stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
157 ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
158 sub $Ktbl,r3,#256 @ K256
159 sub sp,sp,#16*4 @ alloca(X[16])
160.Loop:
161___
162for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
163$code.=".Lrounds_16_xx:\n";
164for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
165$code.=<<___;
166 and $t2,$t2,#0xff
167 cmp $t2,#0xf2
168 bne .Lrounds_16_xx
169
170 ldr $T1,[sp,#16*4] @ pull ctx
171 ldr $t0,[$T1,#0]
172 ldr $t1,[$T1,#4]
173 ldr $t2,[$T1,#8]
174 add $A,$A,$t0
175 ldr $t0,[$T1,#12]
176 add $B,$B,$t1
177 ldr $t1,[$T1,#16]
178 add $C,$C,$t2
179 ldr $t2,[$T1,#20]
180 add $D,$D,$t0
181 ldr $t0,[$T1,#24]
182 add $E,$E,$t1
183 ldr $t1,[$T1,#28]
184 add $F,$F,$t2
185 ldr $inp,[sp,#17*4] @ pull inp
186 ldr $t2,[sp,#18*4] @ pull inp+len
187 add $G,$G,$t0
188 add $H,$H,$t1
189 stmia $T1,{$A,$B,$C,$D,$E,$F,$G,$H}
190 cmp $inp,$t2
191 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
192 bne .Loop
193
194 add sp,sp,#`16+3`*4 @ destroy frame
195#if __ARM_ARCH__>=5
196 ldmia sp!,{r4-r11,pc}
197#else
198 ldmia sp!,{r4-r11,lr}
199 tst lr,#1
200 moveq pc,lr @ be binary compatible with V4, yet
201 bx lr @ interoperable with Thumb ISA:-)
202#endif
203.size sha256_block_data_order,.-sha256_block_data_order
204.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
205.align 2
206___
207
208$code =~ s/\`([^\`]*)\`/eval $1/gem;
209$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
210print $code;
211close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/sha/asm/sha512-586.pl b/src/lib/libcrypto/sha/asm/sha512-586.pl
deleted file mode 100644
index c1d0684e92..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-586.pl
+++ /dev/null
@@ -1,646 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# SHA512 block transform for x86. September 2007.
11#
12# Performance in clock cycles per processed byte (less is better):
13#
14# Pentium PIII P4 AMD K8 Core2
15# gcc 100 75 116 54 66
16# icc 97 77 95 55 57
17# x86 asm 61 56 82 36 40
18# SSE2 asm - - 38 24 20
19# x86_64 asm(*) - - 30 10.0 10.5
20#
21# (*) x86_64 assembler performance is presented for reference
22# purposes.
23#
24# IALU code-path is optimized for elder Pentiums. On vanilla Pentium
25# performance improvement over compiler generated code reaches ~60%,
26# while on PIII - ~35%. On newer µ-archs improvement varies from 15%
27# to 50%, but it's less important as they are expected to execute SSE2
28# code-path, which is commonly ~2-3x faster [than compiler generated
29# code]. The SSE2 code-path is as fast as the original sha512-sse2.pl, even
30# though it does not use 128-bit operations. The latter means that an
31# SSE2-aware kernel is no longer required to execute the code. Another
32# difference is that the new code optimizes the amount of writes, but at the
33# cost of increasing the data cache "footprint" by 1/2KB.
34
35$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
36push(@INC,"${dir}","${dir}../../perlasm");
37require "x86asm.pl";
38
39&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
40
41$sse2=0;
42for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
43
44&external_label("OPENSSL_ia32cap_P") if ($sse2);
45
46$Tlo=&DWP(0,"esp"); $Thi=&DWP(4,"esp");
47$Alo=&DWP(8,"esp"); $Ahi=&DWP(8+4,"esp");
48$Blo=&DWP(16,"esp"); $Bhi=&DWP(16+4,"esp");
49$Clo=&DWP(24,"esp"); $Chi=&DWP(24+4,"esp");
50$Dlo=&DWP(32,"esp"); $Dhi=&DWP(32+4,"esp");
51$Elo=&DWP(40,"esp"); $Ehi=&DWP(40+4,"esp");
52$Flo=&DWP(48,"esp"); $Fhi=&DWP(48+4,"esp");
53$Glo=&DWP(56,"esp"); $Ghi=&DWP(56+4,"esp");
54$Hlo=&DWP(64,"esp"); $Hhi=&DWP(64+4,"esp");
55$K512="ebp";
56
57$Asse2=&QWP(0,"esp");
58$Bsse2=&QWP(8,"esp");
59$Csse2=&QWP(16,"esp");
60$Dsse2=&QWP(24,"esp");
61$Esse2=&QWP(32,"esp");
62$Fsse2=&QWP(40,"esp");
63$Gsse2=&QWP(48,"esp");
64$Hsse2=&QWP(56,"esp");
65
66$A="mm0"; # B-D and
67$E="mm4";	# F-H are commonly loaded to mm1-mm3 and
68		# mm5-mm7 respectively, but it's done on an on-demand basis...
69
70sub BODY_00_15_sse2 {
71 my $prefetch=shift;
72
73 &movq ("mm5",$Fsse2); # load f
74 &movq ("mm6",$Gsse2); # load g
75 &movq ("mm7",$Hsse2); # load h
76
77 &movq ("mm1",$E); # %mm1 is sliding right
78 &movq ("mm2",$E); # %mm2 is sliding left
79 &psrlq ("mm1",14);
80 &movq ($Esse2,$E); # modulo-scheduled save e
81 &psllq ("mm2",23);
82 &movq ("mm3","mm1"); # %mm3 is T1
83 &psrlq ("mm1",4);
84 &pxor ("mm3","mm2");
85 &psllq ("mm2",23);
86 &pxor ("mm3","mm1");
87 &psrlq ("mm1",23);
88 &pxor ("mm3","mm2");
89 &psllq ("mm2",4);
90 &pxor ("mm3","mm1");
91 &paddq ("mm7",QWP(0,$K512)); # h+=K512[i]
92 &pxor ("mm3","mm2"); # T1=Sigma1_512(e)
93
94 &pxor ("mm5","mm6"); # f^=g
95 &movq ("mm1",$Bsse2); # load b
96 &pand ("mm5",$E); # f&=e
97 &movq ("mm2",$Csse2); # load c
98 &pxor ("mm5","mm6"); # f^=g
99 &movq ($E,$Dsse2); # e = load d
100 &paddq ("mm3","mm5"); # T1+=Ch(e,f,g)
101 &movq (&QWP(0,"esp"),$A); # modulo-scheduled save a
102 &paddq ("mm3","mm7"); # T1+=h
103
104 &movq ("mm5",$A); # %mm5 is sliding right
105 &movq ("mm6",$A); # %mm6 is sliding left
106 &paddq ("mm3",&QWP(8*9,"esp")); # T1+=X[0]
107 &psrlq ("mm5",28);
108 &paddq ($E,"mm3"); # e += T1
109 &psllq ("mm6",25);
110 &movq ("mm7","mm5"); # %mm7 is T2
111 &psrlq ("mm5",6);
112 &pxor ("mm7","mm6");
113 &psllq ("mm6",5);
114 &pxor ("mm7","mm5");
115 &psrlq ("mm5",5);
116 &pxor ("mm7","mm6");
117 &psllq ("mm6",6);
118 &pxor ("mm7","mm5");
119 &sub ("esp",8);
120 &pxor ("mm7","mm6"); # T2=Sigma0_512(a)
121
122 &movq ("mm5",$A); # %mm5=a
123 &por ($A,"mm2"); # a=a|c
124 &movq ("mm6",&QWP(8*(9+16-14),"esp")) if ($prefetch);
125 &pand ("mm5","mm2"); # %mm5=a&c
126 &pand ($A,"mm1"); # a=(a|c)&b
127 &movq ("mm2",&QWP(8*(9+16-1),"esp")) if ($prefetch);
128 &por ("mm5",$A); # %mm5=(a&c)|((a|c)&b)
129 &paddq ("mm7","mm5"); # T2+=Maj(a,b,c)
130 &movq ($A,"mm3"); # a=T1
131
132 &mov (&LB("edx"),&BP(0,$K512));
133 &paddq ($A,"mm7"); # a+=T2
134 &add ($K512,8);
135}
136
137sub BODY_00_15_x86 {
138 #define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
139 # LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
140 # HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
141 &mov ("ecx",$Elo);
142 &mov ("edx",$Ehi);
143 &mov ("esi","ecx");
144
145 &shr ("ecx",9); # lo>>9
146 &mov ("edi","edx");
147 &shr ("edx",9); # hi>>9
148 &mov ("ebx","ecx");
149 &shl ("esi",14); # lo<<14
150 &mov ("eax","edx");
151 &shl ("edi",14); # hi<<14
152 &xor ("ebx","esi");
153
154 &shr ("ecx",14-9); # lo>>14
155 &xor ("eax","edi");
156 &shr ("edx",14-9); # hi>>14
157 &xor ("eax","ecx");
158 &shl ("esi",18-14); # lo<<18
159 &xor ("ebx","edx");
160 &shl ("edi",18-14); # hi<<18
161 &xor ("ebx","esi");
162
163 &shr ("ecx",18-14); # lo>>18
164 &xor ("eax","edi");
165 &shr ("edx",18-14); # hi>>18
166 &xor ("eax","ecx");
167 &shl ("esi",23-18); # lo<<23
168 &xor ("ebx","edx");
169 &shl ("edi",23-18); # hi<<23
170 &xor ("eax","esi");
171 &xor ("ebx","edi"); # T1 = Sigma1(e)
172
173 &mov ("ecx",$Flo);
174 &mov ("edx",$Fhi);
175 &mov ("esi",$Glo);
176 &mov ("edi",$Ghi);
177 &add ("eax",$Hlo);
178 &adc ("ebx",$Hhi); # T1 += h
179 &xor ("ecx","esi");
180 &xor ("edx","edi");
181 &and ("ecx",$Elo);
182 &and ("edx",$Ehi);
183 &add ("eax",&DWP(8*(9+15)+0,"esp"));
184 &adc ("ebx",&DWP(8*(9+15)+4,"esp")); # T1 += X[0]
185 &xor ("ecx","esi");
186	&xor	("edx","edi");		# Ch(e,f,g) = ((f^g)&e)^g
187
188 &mov ("esi",&DWP(0,$K512));
189 &mov ("edi",&DWP(4,$K512)); # K[i]
190 &add ("eax","ecx");
191 &adc ("ebx","edx"); # T1 += Ch(e,f,g)
192 &mov ("ecx",$Dlo);
193 &mov ("edx",$Dhi);
194 &add ("eax","esi");
195 &adc ("ebx","edi"); # T1 += K[i]
196 &mov ($Tlo,"eax");
197 &mov ($Thi,"ebx"); # put T1 away
198 &add ("eax","ecx");
199 &adc ("ebx","edx"); # d += T1
200
201 #define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
202 # LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
203 # HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
204 &mov ("ecx",$Alo);
205 &mov ("edx",$Ahi);
206 &mov ($Dlo,"eax");
207 &mov ($Dhi,"ebx");
208 &mov ("esi","ecx");
209
210 &shr ("ecx",2); # lo>>2
211 &mov ("edi","edx");
212 &shr ("edx",2); # hi>>2
213 &mov ("ebx","ecx");
214 &shl ("esi",4); # lo<<4
215 &mov ("eax","edx");
216 &shl ("edi",4); # hi<<4
217 &xor ("ebx","esi");
218
219 &shr ("ecx",7-2); # lo>>7
220 &xor ("eax","edi");
221 &shr ("edx",7-2); # hi>>7
222 &xor ("ebx","ecx");
223 &shl ("esi",25-4); # lo<<25
224 &xor ("eax","edx");
225 &shl ("edi",25-4); # hi<<25
226 &xor ("eax","esi");
227
228 &shr ("ecx",28-7); # lo>>28
229 &xor ("ebx","edi");
230 &shr ("edx",28-7); # hi>>28
231 &xor ("eax","ecx");
232 &shl ("esi",30-25); # lo<<30
233 &xor ("ebx","edx");
234 &shl ("edi",30-25); # hi<<30
235 &xor ("eax","esi");
236 &xor ("ebx","edi"); # Sigma0(a)
237
238 &mov ("ecx",$Alo);
239 &mov ("edx",$Ahi);
240 &mov ("esi",$Blo);
241 &mov ("edi",$Bhi);
242 &add ("eax",$Tlo);
243 &adc ("ebx",$Thi); # T1 = Sigma0(a)+T1
244 &or ("ecx","esi");
245 &or ("edx","edi");
246 &and ("ecx",$Clo);
247 &and ("edx",$Chi);
248 &and ("esi",$Alo);
249 &and ("edi",$Ahi);
250 &or ("ecx","esi");
251 &or ("edx","edi"); # Maj(a,b,c) = ((a|b)&c)|(a&b)
252
253 &add ("eax","ecx");
254 &adc ("ebx","edx"); # T1 += Maj(a,b,c)
255 &mov ($Tlo,"eax");
256 &mov ($Thi,"ebx");
257
258 &mov (&LB("edx"),&BP(0,$K512)); # pre-fetch LSB of *K
259 &sub ("esp",8);
260 &lea ($K512,&DWP(8,$K512)); # K++
261}
262
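# A minimal plain-Perl sketch (illustrative only, never called) of the 32-bit
# decomposition spelled out in the LO/HI comments inside BODY_00_15_x86: for
# 0 < n < 32, a 64-bit rotate right by n of the pair (hi,lo) gives
#	lo' = lo>>n | hi<<(32-n)	hi' = hi>>n | lo<<(32-n)
# and a rotate by n >= 32 is a rotate by n-32 with the halves swapped first
# (which is why ROTR 41 appears above as the 9/23-bit shift pair).
sub rotr64_hilo {
	my ($hi, $lo, $n) = @_;			# 32-bit halves, 0 < $n < 32
	my $m = 0xffffffff;
	return ((($hi >> $n) | ($lo << (32-$n))) & $m,
		(($lo >> $n) | ($hi << (32-$n))) & $m);
}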
263
264&static_label("K512");
265&function_begin("sha512_block_data_order");
266 &mov ("esi",wparam(0)); # ctx
267 &mov ("edi",wparam(1)); # inp
268 &mov ("eax",wparam(2)); # num
269 &mov ("ebx","esp"); # saved sp
270
271 &picsetup($K512);
272if ($sse2) {
273 &picsymbol("edx", "OPENSSL_ia32cap_P", $K512);
274}
275 &picsymbol($K512, &label("K512"), $K512);
276
277 &sub ("esp",16);
278 &and ("esp",-64);
279
280 &shl ("eax",7);
281 &add ("eax","edi");
282 &mov (&DWP(0,"esp"),"esi"); # ctx
283 &mov (&DWP(4,"esp"),"edi"); # inp
284 &mov (&DWP(8,"esp"),"eax"); # inp+num*128
285 &mov (&DWP(12,"esp"),"ebx"); # saved sp
286
287if ($sse2) {
288 &bt (&DWP(0,"edx"),"\$IA32CAP_BIT0_SSE2");
289 &jnc (&label("loop_x86"));
290
291 # load ctx->h[0-7]
292 &movq ($A,&QWP(0,"esi"));
293 &movq ("mm1",&QWP(8,"esi"));
294 &movq ("mm2",&QWP(16,"esi"));
295 &movq ("mm3",&QWP(24,"esi"));
296 &movq ($E,&QWP(32,"esi"));
297 &movq ("mm5",&QWP(40,"esi"));
298 &movq ("mm6",&QWP(48,"esi"));
299 &movq ("mm7",&QWP(56,"esi"));
300 &sub ("esp",8*10);
301
302&set_label("loop_sse2",16);
303 # &movq ($Asse2,$A);
304 &movq ($Bsse2,"mm1");
305 &movq ($Csse2,"mm2");
306 &movq ($Dsse2,"mm3");
307 # &movq ($Esse2,$E);
308 &movq ($Fsse2,"mm5");
309 &movq ($Gsse2,"mm6");
310 &movq ($Hsse2,"mm7");
311
312 &mov ("ecx",&DWP(0,"edi"));
313 &mov ("edx",&DWP(4,"edi"));
314 &add ("edi",8);
315 &bswap ("ecx");
316 &bswap ("edx");
317 &mov (&DWP(8*9+4,"esp"),"ecx");
318 &mov (&DWP(8*9+0,"esp"),"edx");
319
320&set_label("00_14_sse2",16);
321 &mov ("eax",&DWP(0,"edi"));
322 &mov ("ebx",&DWP(4,"edi"));
323 &add ("edi",8);
324 &bswap ("eax");
325 &bswap ("ebx");
326 &mov (&DWP(8*8+4,"esp"),"eax");
327 &mov (&DWP(8*8+0,"esp"),"ebx");
328
329 &BODY_00_15_sse2();
330
331 &cmp (&LB("edx"),0x35);
332 &jne (&label("00_14_sse2"));
333
334 &BODY_00_15_sse2(1);
335
336&set_label("16_79_sse2",16);
337 #&movq ("mm2",&QWP(8*(9+16-1),"esp")); #prefetched in BODY_00_15
338 #&movq ("mm6",&QWP(8*(9+16-14),"esp"));
339 &movq ("mm1","mm2");
340
341 &psrlq ("mm2",1);
342 &movq ("mm7","mm6");
343 &psrlq ("mm6",6);
344 &movq ("mm3","mm2");
345
346 &psrlq ("mm2",7-1);
347 &movq ("mm5","mm6");
348 &psrlq ("mm6",19-6);
349 &pxor ("mm3","mm2");
350
351 &psrlq ("mm2",8-7);
352 &pxor ("mm5","mm6");
353 &psrlq ("mm6",61-19);
354 &pxor ("mm3","mm2");
355
356 &movq ("mm2",&QWP(8*(9+16),"esp"));
357
358 &psllq ("mm1",56);
359 &pxor ("mm5","mm6");
360 &psllq ("mm7",3);
361 &pxor ("mm3","mm1");
362
363 &paddq ("mm2",&QWP(8*(9+16-9),"esp"));
364
365 &psllq ("mm1",63-56);
366 &pxor ("mm5","mm7");
367 &psllq ("mm7",45-3);
368 &pxor ("mm3","mm1");
369 &pxor ("mm5","mm7");
370
371 &paddq ("mm3","mm5");
372 &paddq ("mm3","mm2");
373 &movq (&QWP(8*9,"esp"),"mm3");
374
375 &BODY_00_15_sse2(1);
376
377 &cmp (&LB("edx"),0x17);
378 &jne (&label("16_79_sse2"));
379
380 # &movq ($A,$Asse2);
381 &movq ("mm1",$Bsse2);
382 &movq ("mm2",$Csse2);
383 &movq ("mm3",$Dsse2);
384 # &movq ($E,$Esse2);
385 &movq ("mm5",$Fsse2);
386 &movq ("mm6",$Gsse2);
387 &movq ("mm7",$Hsse2);
388
389 &paddq ($A,&QWP(0,"esi"));
390 &paddq ("mm1",&QWP(8,"esi"));
391 &paddq ("mm2",&QWP(16,"esi"));
392 &paddq ("mm3",&QWP(24,"esi"));
393 &paddq ($E,&QWP(32,"esi"));
394 &paddq ("mm5",&QWP(40,"esi"));
395 &paddq ("mm6",&QWP(48,"esi"));
396 &paddq ("mm7",&QWP(56,"esi"));
397
398 &movq (&QWP(0,"esi"),$A);
399 &movq (&QWP(8,"esi"),"mm1");
400 &movq (&QWP(16,"esi"),"mm2");
401 &movq (&QWP(24,"esi"),"mm3");
402 &movq (&QWP(32,"esi"),$E);
403 &movq (&QWP(40,"esi"),"mm5");
404 &movq (&QWP(48,"esi"),"mm6");
405 &movq (&QWP(56,"esi"),"mm7");
406
407 &add ("esp",8*80); # destroy frame
408 &sub ($K512,8*80); # rewind K
409
410 &cmp ("edi",&DWP(8*10+8,"esp")); # are we done yet?
411 &jb (&label("loop_sse2"));
412
413 &emms ();
414 &mov ("esp",&DWP(8*10+12,"esp")); # restore sp
415&function_end_A();
416}
417&set_label("loop_x86",16);
418 # copy input block to stack reversing byte and qword order
419 for ($i=0;$i<8;$i++) {
420 &mov ("eax",&DWP($i*16+0,"edi"));
421 &mov ("ebx",&DWP($i*16+4,"edi"));
422 &mov ("ecx",&DWP($i*16+8,"edi"));
423 &mov ("edx",&DWP($i*16+12,"edi"));
424 &bswap ("eax");
425 &bswap ("ebx");
426 &bswap ("ecx");
427 &bswap ("edx");
428 &push ("eax");
429 &push ("ebx");
430 &push ("ecx");
431 &push ("edx");
432 }
433 &add ("edi",128);
434 &sub ("esp",9*8); # place for T,A,B,C,D,E,F,G,H
435 &mov (&DWP(8*(9+16)+4,"esp"),"edi");
436
437 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
438 &lea ("edi",&DWP(8,"esp"));
439 &mov ("ecx",16);
440 &data_word(0xA5F3F689); # rep movsd
441
442&set_label("00_15_x86",16);
443 &BODY_00_15_x86();
444
445 &cmp (&LB("edx"),0x94);
446 &jne (&label("00_15_x86"));
447
448&set_label("16_79_x86",16);
449 #define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
450 # LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
451 # HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
452 &mov ("ecx",&DWP(8*(9+15+16-1)+0,"esp"));
453 &mov ("edx",&DWP(8*(9+15+16-1)+4,"esp"));
454 &mov ("esi","ecx");
455
456 &shr ("ecx",1); # lo>>1
457 &mov ("edi","edx");
458 &shr ("edx",1); # hi>>1
459 &mov ("eax","ecx");
460 &shl ("esi",24); # lo<<24
461 &mov ("ebx","edx");
462 &shl ("edi",24); # hi<<24
463 &xor ("ebx","esi");
464
465 &shr ("ecx",7-1); # lo>>7
466 &xor ("eax","edi");
467 &shr ("edx",7-1); # hi>>7
468 &xor ("eax","ecx");
469 &shl ("esi",31-24); # lo<<31
470 &xor ("ebx","edx");
471 &shl ("edi",25-24); # hi<<25
472 &xor ("ebx","esi");
473
474 &shr ("ecx",8-7); # lo>>8
475 &xor ("eax","edi");
476 &shr ("edx",8-7); # hi>>8
477 &xor ("eax","ecx");
478 &shl ("edi",31-25); # hi<<31
479 &xor ("ebx","edx");
480 &xor ("eax","edi"); # T1 = sigma0(X[-15])
481
482 &mov (&DWP(0,"esp"),"eax");
483 &mov (&DWP(4,"esp"),"ebx"); # put T1 away
484
485 #define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
486 # LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
487 # HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
488 &mov ("ecx",&DWP(8*(9+15+16-14)+0,"esp"));
489 &mov ("edx",&DWP(8*(9+15+16-14)+4,"esp"));
490 &mov ("esi","ecx");
491
492 &shr ("ecx",6); # lo>>6
493 &mov ("edi","edx");
494 &shr ("edx",6); # hi>>6
495 &mov ("eax","ecx");
496 &shl ("esi",3); # lo<<3
497 &mov ("ebx","edx");
498 &shl ("edi",3); # hi<<3
499 &xor ("eax","esi");
500
501 &shr ("ecx",19-6); # lo>>19
502 &xor ("ebx","edi");
503 &shr ("edx",19-6); # hi>>19
504 &xor ("eax","ecx");
505 &shl ("esi",13-3); # lo<<13
506 &xor ("ebx","edx");
507 &shl ("edi",13-3); # hi<<13
508 &xor ("ebx","esi");
509
510 &shr ("ecx",29-19); # lo>>29
511 &xor ("eax","edi");
512 &shr ("edx",29-19); # hi>>29
513 &xor ("ebx","ecx");
514 &shl ("edi",26-13); # hi<<26
515 &xor ("eax","edx");
516 &xor ("eax","edi"); # sigma1(X[-2])
517
518 &mov ("ecx",&DWP(8*(9+15+16)+0,"esp"));
519 &mov ("edx",&DWP(8*(9+15+16)+4,"esp"));
520 &add ("eax",&DWP(0,"esp"));
521 &adc ("ebx",&DWP(4,"esp")); # T1 = sigma1(X[-2])+T1
522 &mov ("esi",&DWP(8*(9+15+16-9)+0,"esp"));
523 &mov ("edi",&DWP(8*(9+15+16-9)+4,"esp"));
524 &add ("eax","ecx");
525 &adc ("ebx","edx"); # T1 += X[-16]
526 &add ("eax","esi");
527 &adc ("ebx","edi"); # T1 += X[-7]
528 &mov (&DWP(8*(9+15)+0,"esp"),"eax");
529 &mov (&DWP(8*(9+15)+4,"esp"),"ebx"); # save X[0]
530
531 &BODY_00_15_x86();
532
533 &cmp (&LB("edx"),0x17);
534 &jne (&label("16_79_x86"));
535
536 &mov ("esi",&DWP(8*(9+16+80)+0,"esp"));# ctx
537 &mov ("edi",&DWP(8*(9+16+80)+4,"esp"));# inp
538 for($i=0;$i<4;$i++) {
539 &mov ("eax",&DWP($i*16+0,"esi"));
540 &mov ("ebx",&DWP($i*16+4,"esi"));
541 &mov ("ecx",&DWP($i*16+8,"esi"));
542 &mov ("edx",&DWP($i*16+12,"esi"));
543 &add ("eax",&DWP(8+($i*16)+0,"esp"));
544 &adc ("ebx",&DWP(8+($i*16)+4,"esp"));
545 &mov (&DWP($i*16+0,"esi"),"eax");
546 &mov (&DWP($i*16+4,"esi"),"ebx");
547 &add ("ecx",&DWP(8+($i*16)+8,"esp"));
548 &adc ("edx",&DWP(8+($i*16)+12,"esp"));
549 &mov (&DWP($i*16+8,"esi"),"ecx");
550 &mov (&DWP($i*16+12,"esi"),"edx");
551 }
552 &add ("esp",8*(9+16+80)); # destroy frame
553 &sub ($K512,8*80); # rewind K
554
555 &cmp ("edi",&DWP(8,"esp")); # are we done yet?
556 &jb (&label("loop_x86"));
557
558 &mov ("esp",&DWP(12,"esp")); # restore sp
559&function_end_A();
560&function_end_B("sha512_block_data_order");
561
562 &rodataseg();
563&set_label("K512",64);
564 &data_word(0xd728ae22,0x428a2f98); # u64
565 &data_word(0x23ef65cd,0x71374491); # u64
566 &data_word(0xec4d3b2f,0xb5c0fbcf); # u64
567 &data_word(0x8189dbbc,0xe9b5dba5); # u64
568 &data_word(0xf348b538,0x3956c25b); # u64
569 &data_word(0xb605d019,0x59f111f1); # u64
570 &data_word(0xaf194f9b,0x923f82a4); # u64
571 &data_word(0xda6d8118,0xab1c5ed5); # u64
572 &data_word(0xa3030242,0xd807aa98); # u64
573 &data_word(0x45706fbe,0x12835b01); # u64
574 &data_word(0x4ee4b28c,0x243185be); # u64
575 &data_word(0xd5ffb4e2,0x550c7dc3); # u64
576 &data_word(0xf27b896f,0x72be5d74); # u64
577 &data_word(0x3b1696b1,0x80deb1fe); # u64
578 &data_word(0x25c71235,0x9bdc06a7); # u64
579 &data_word(0xcf692694,0xc19bf174); # u64
580 &data_word(0x9ef14ad2,0xe49b69c1); # u64
581 &data_word(0x384f25e3,0xefbe4786); # u64
582 &data_word(0x8b8cd5b5,0x0fc19dc6); # u64
583 &data_word(0x77ac9c65,0x240ca1cc); # u64
584 &data_word(0x592b0275,0x2de92c6f); # u64
585 &data_word(0x6ea6e483,0x4a7484aa); # u64
586 &data_word(0xbd41fbd4,0x5cb0a9dc); # u64
587 &data_word(0x831153b5,0x76f988da); # u64
588 &data_word(0xee66dfab,0x983e5152); # u64
589 &data_word(0x2db43210,0xa831c66d); # u64
590 &data_word(0x98fb213f,0xb00327c8); # u64
591 &data_word(0xbeef0ee4,0xbf597fc7); # u64
592 &data_word(0x3da88fc2,0xc6e00bf3); # u64
593 &data_word(0x930aa725,0xd5a79147); # u64
594 &data_word(0xe003826f,0x06ca6351); # u64
595 &data_word(0x0a0e6e70,0x14292967); # u64
596 &data_word(0x46d22ffc,0x27b70a85); # u64
597 &data_word(0x5c26c926,0x2e1b2138); # u64
598 &data_word(0x5ac42aed,0x4d2c6dfc); # u64
599 &data_word(0x9d95b3df,0x53380d13); # u64
600 &data_word(0x8baf63de,0x650a7354); # u64
601 &data_word(0x3c77b2a8,0x766a0abb); # u64
602 &data_word(0x47edaee6,0x81c2c92e); # u64
603 &data_word(0x1482353b,0x92722c85); # u64
604 &data_word(0x4cf10364,0xa2bfe8a1); # u64
605 &data_word(0xbc423001,0xa81a664b); # u64
606 &data_word(0xd0f89791,0xc24b8b70); # u64
607 &data_word(0x0654be30,0xc76c51a3); # u64
608 &data_word(0xd6ef5218,0xd192e819); # u64
609 &data_word(0x5565a910,0xd6990624); # u64
610 &data_word(0x5771202a,0xf40e3585); # u64
611 &data_word(0x32bbd1b8,0x106aa070); # u64
612 &data_word(0xb8d2d0c8,0x19a4c116); # u64
613 &data_word(0x5141ab53,0x1e376c08); # u64
614 &data_word(0xdf8eeb99,0x2748774c); # u64
615 &data_word(0xe19b48a8,0x34b0bcb5); # u64
616 &data_word(0xc5c95a63,0x391c0cb3); # u64
617 &data_word(0xe3418acb,0x4ed8aa4a); # u64
618 &data_word(0x7763e373,0x5b9cca4f); # u64
619 &data_word(0xd6b2b8a3,0x682e6ff3); # u64
620 &data_word(0x5defb2fc,0x748f82ee); # u64
621 &data_word(0x43172f60,0x78a5636f); # u64
622 &data_word(0xa1f0ab72,0x84c87814); # u64
623 &data_word(0x1a6439ec,0x8cc70208); # u64
624 &data_word(0x23631e28,0x90befffa); # u64
625 &data_word(0xde82bde9,0xa4506ceb); # u64
626 &data_word(0xb2c67915,0xbef9a3f7); # u64
627 &data_word(0xe372532b,0xc67178f2); # u64
628 &data_word(0xea26619c,0xca273ece); # u64
629 &data_word(0x21c0c207,0xd186b8c7); # u64
630 &data_word(0xcde0eb1e,0xeada7dd6); # u64
631 &data_word(0xee6ed178,0xf57d4f7f); # u64
632 &data_word(0x72176fba,0x06f067aa); # u64
633 &data_word(0xa2c898a6,0x0a637dc5); # u64
634 &data_word(0xbef90dae,0x113f9804); # u64
635 &data_word(0x131c471b,0x1b710b35); # u64
636 &data_word(0x23047d84,0x28db77f5); # u64
637 &data_word(0x40c72493,0x32caab7b); # u64
638 &data_word(0x15c9bebc,0x3c9ebe0a); # u64
639 &data_word(0x9c100d4c,0x431d67c4); # u64
640 &data_word(0xcb3e42b6,0x4cc5d4be); # u64
641 &data_word(0xfc657e2a,0x597f299c); # u64
642 &data_word(0x3ad6faec,0x5fcb6fab); # u64
643 &data_word(0x4a475817,0x6c44198c); # u64
644 &previous();
645
646&asm_finish();
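A note on the integer-only (non-SSE2) path above, added here for clarity: it keeps every 64-bit quantity as a hi/lo pair of 32-bit registers, and each 64-bit rotate decomposes into shifts of both halves exactly as the sigma0/sigma1 comments spell out. A minimal Perl sketch of that decomposition (illustrative only, assuming a perl built with 64-bit integers; the helper names below do not appear in the module):

	#!/usr/bin/env perl
	# check the lo/hi decomposition of sigma0 quoted in the 16_79_x86 comments
	use strict; use warnings;
	sub rotr64 { my ($x,$n)=@_; (($x >> $n) | ($x << (64-$n))) & 0xffffffffffffffff }
	sub sigma0 { my $x=shift; rotr64($x,1) ^ rotr64($x,8) ^ ($x >> 7) }
	sub sigma0_split {
		my ($hi,$lo) = @_;	# 32-bit halves, as held in register pairs
		my $rlo = (($lo>>1)^($hi<<31) ^ ($lo>>8)^($hi<<24) ^ ($lo>>7)^($hi<<25)) & 0xffffffff;
		my $rhi = (($hi>>1)^($lo<<31) ^ ($hi>>8)^($lo<<24) ^ ($hi>>7)) & 0xffffffff;
		return ($rhi << 32) | $rlo;
	}
	my $x = 0x0123456789abcdef;
	printf "%016x %016x\n", sigma0($x), sigma0_split($x >> 32, $x & 0xffffffff);

Both printed values are identical; the loop_x86 code above performs the same shifts on eax/ebx/ecx/edx pairs, while the SSE2 path gets genuine 64-bit psrlq/psllq for free.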
diff --git a/src/lib/libcrypto/sha/asm/sha512-armv4.pl b/src/lib/libcrypto/sha/asm/sha512-armv4.pl
deleted file mode 100644
index a247a00c2b..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-armv4.pl
+++ /dev/null
@@ -1,582 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA512 block procedure for ARMv4. September 2007.
11
12# This code is ~4.5 (four and a half) times faster than code generated
13# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
14# Xscale PXA250 core].
15#
16# July 2010.
17#
18# Rescheduling for dual-issue pipeline resulted in 6% improvement on
19# Cortex A8 core and ~40 cycles per processed byte.
20
21# February 2011.
22#
23# Profiler-assisted and platform-specific optimization resulted in 7%
24# improvement on Cortex A8 core and ~38 cycles per byte.
25
26# March 2011.
27#
28# Add NEON implementation. On Cortex A8 it was measured to process
29# one byte in 25.5 cycles or 47% faster than integer-only code.
30
31# Byte order [in]dependence. =========================================
32#
33# Originally caller was expected to maintain specific *dword* order in
34# h[0-7], namely with most significant dword at *lower* address, which
35# was reflected in the two parameters below as 0 and 4. Now caller is
36# expected to maintain native byte order for whole 64-bit values.
37$hi="HI";
38$lo="LO";
39# ====================================================================
40
41while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
42open STDOUT,">$output";
43
44$ctx="r0"; # parameter block
45$inp="r1";
46$len="r2";
47
48$Tlo="r3";
49$Thi="r4";
50$Alo="r5";
51$Ahi="r6";
52$Elo="r7";
53$Ehi="r8";
54$t0="r9";
55$t1="r10";
56$t2="r11";
57$t3="r12";
58############ r13 is stack pointer
59$Ktbl="r14";
60############ r15 is program counter
61
62$Aoff=8*0;
63$Boff=8*1;
64$Coff=8*2;
65$Doff=8*3;
66$Eoff=8*4;
67$Foff=8*5;
68$Goff=8*6;
69$Hoff=8*7;
70$Xoff=8*8;
71
72sub BODY_00_15() {
73my $magic = shift;
74$code.=<<___;
75 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
76 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
77 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
78 mov $t0,$Elo,lsr#14
79 str $Tlo,[sp,#$Xoff+0]
80 mov $t1,$Ehi,lsr#14
81 str $Thi,[sp,#$Xoff+4]
82 eor $t0,$t0,$Ehi,lsl#18
83 ldr $t2,[sp,#$Hoff+0] @ h.lo
84 eor $t1,$t1,$Elo,lsl#18
85 ldr $t3,[sp,#$Hoff+4] @ h.hi
86 eor $t0,$t0,$Elo,lsr#18
87 eor $t1,$t1,$Ehi,lsr#18
88 eor $t0,$t0,$Ehi,lsl#14
89 eor $t1,$t1,$Elo,lsl#14
90 eor $t0,$t0,$Ehi,lsr#9
91 eor $t1,$t1,$Elo,lsr#9
92 eor $t0,$t0,$Elo,lsl#23
93 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
94 adds $Tlo,$Tlo,$t0
95 ldr $t0,[sp,#$Foff+0] @ f.lo
96 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
97 ldr $t1,[sp,#$Foff+4] @ f.hi
98 adds $Tlo,$Tlo,$t2
99 ldr $t2,[sp,#$Goff+0] @ g.lo
100 adc $Thi,$Thi,$t3 @ T += h
101 ldr $t3,[sp,#$Goff+4] @ g.hi
102
103 eor $t0,$t0,$t2
104 str $Elo,[sp,#$Eoff+0]
105 eor $t1,$t1,$t3
106 str $Ehi,[sp,#$Eoff+4]
107 and $t0,$t0,$Elo
108 str $Alo,[sp,#$Aoff+0]
109 and $t1,$t1,$Ehi
110 str $Ahi,[sp,#$Aoff+4]
111 eor $t0,$t0,$t2
112 ldr $t2,[$Ktbl,#$lo] @ K[i].lo
113 eor $t1,$t1,$t3 @ Ch(e,f,g)
114 ldr $t3,[$Ktbl,#$hi] @ K[i].hi
115
116 adds $Tlo,$Tlo,$t0
117 ldr $Elo,[sp,#$Doff+0] @ d.lo
118 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
119 ldr $Ehi,[sp,#$Doff+4] @ d.hi
120 adds $Tlo,$Tlo,$t2
121 and $t0,$t2,#0xff
122 adc $Thi,$Thi,$t3 @ T += K[i]
123 adds $Elo,$Elo,$Tlo
124 ldr $t2,[sp,#$Boff+0] @ b.lo
125 adc $Ehi,$Ehi,$Thi @ d += T
126 teq $t0,#$magic
127
128 ldr $t3,[sp,#$Coff+0] @ c.lo
129 orreq $Ktbl,$Ktbl,#1
130 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
131 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
132 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
133 mov $t0,$Alo,lsr#28
134 mov $t1,$Ahi,lsr#28
135 eor $t0,$t0,$Ahi,lsl#4
136 eor $t1,$t1,$Alo,lsl#4
137 eor $t0,$t0,$Ahi,lsr#2
138 eor $t1,$t1,$Alo,lsr#2
139 eor $t0,$t0,$Alo,lsl#30
140 eor $t1,$t1,$Ahi,lsl#30
141 eor $t0,$t0,$Ahi,lsr#7
142 eor $t1,$t1,$Alo,lsr#7
143 eor $t0,$t0,$Alo,lsl#25
144 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
145 adds $Tlo,$Tlo,$t0
146 and $t0,$Alo,$t2
147 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
148
149 ldr $t1,[sp,#$Boff+4] @ b.hi
150 orr $Alo,$Alo,$t2
151 ldr $t2,[sp,#$Coff+4] @ c.hi
152 and $Alo,$Alo,$t3
153 and $t3,$Ahi,$t1
154 orr $Ahi,$Ahi,$t1
155 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
156 and $Ahi,$Ahi,$t2
157 adds $Alo,$Alo,$Tlo
158 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
159 sub sp,sp,#8
160 adc $Ahi,$Ahi,$Thi @ h += T
161 tst $Ktbl,#1
162 add $Ktbl,$Ktbl,#8
163___
164}
165$code=<<___;
166#include "arm_arch.h"
167#ifdef __ARMEL__
168# define LO 0
169# define HI 4
170# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
171#else
172# define HI 0
173# define LO 4
174# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
175#endif
176
177.text
178.code 32
179.type K512,%object
180.align 5
181K512:
182WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
183WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
184WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
185WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
186WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
187WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
188WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
189WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
190WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
191WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
192WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
193WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
194WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
195WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
196WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
197WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
198WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
199WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
200WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
201WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
202WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
203WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
204WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
205WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
206WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
207WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
208WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
209WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
210WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
211WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
212WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
213WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
214WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
215WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
216WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
217WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
218WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
219WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
220WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
221WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
222.size K512,.-K512
223.LOPENSSL_armcap:
224.word OPENSSL_armcap_P-sha512_block_data_order
225.skip 32-4
226
227.global sha512_block_data_order
228.type sha512_block_data_order,%function
229sha512_block_data_order:
230 sub r3,pc,#8 @ sha512_block_data_order
231 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
232#if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
233 ldr r12,.LOPENSSL_armcap
234 ldr r12,[r3,r12] @ OPENSSL_armcap_P
235 tst r12,#1
236 bne .LNEON
237#endif
238 stmdb sp!,{r4-r12,lr}
239 sub $Ktbl,r3,#672 @ K512
240 sub sp,sp,#9*8
241
242 ldr $Elo,[$ctx,#$Eoff+$lo]
243 ldr $Ehi,[$ctx,#$Eoff+$hi]
244 ldr $t0, [$ctx,#$Goff+$lo]
245 ldr $t1, [$ctx,#$Goff+$hi]
246 ldr $t2, [$ctx,#$Hoff+$lo]
247 ldr $t3, [$ctx,#$Hoff+$hi]
248.Loop:
249 str $t0, [sp,#$Goff+0]
250 str $t1, [sp,#$Goff+4]
251 str $t2, [sp,#$Hoff+0]
252 str $t3, [sp,#$Hoff+4]
253 ldr $Alo,[$ctx,#$Aoff+$lo]
254 ldr $Ahi,[$ctx,#$Aoff+$hi]
255 ldr $Tlo,[$ctx,#$Boff+$lo]
256 ldr $Thi,[$ctx,#$Boff+$hi]
257 ldr $t0, [$ctx,#$Coff+$lo]
258 ldr $t1, [$ctx,#$Coff+$hi]
259 ldr $t2, [$ctx,#$Doff+$lo]
260 ldr $t3, [$ctx,#$Doff+$hi]
261 str $Tlo,[sp,#$Boff+0]
262 str $Thi,[sp,#$Boff+4]
263 str $t0, [sp,#$Coff+0]
264 str $t1, [sp,#$Coff+4]
265 str $t2, [sp,#$Doff+0]
266 str $t3, [sp,#$Doff+4]
267 ldr $Tlo,[$ctx,#$Foff+$lo]
268 ldr $Thi,[$ctx,#$Foff+$hi]
269 str $Tlo,[sp,#$Foff+0]
270 str $Thi,[sp,#$Foff+4]
271
272.L00_15:
273#if __ARM_ARCH__<7 || defined(__STRICT_ALIGNMENT)
274 ldrb $Tlo,[$inp,#7]
275 ldrb $t0, [$inp,#6]
276 ldrb $t1, [$inp,#5]
277 ldrb $t2, [$inp,#4]
278 ldrb $Thi,[$inp,#3]
279 ldrb $t3, [$inp,#2]
280 orr $Tlo,$Tlo,$t0,lsl#8
281 ldrb $t0, [$inp,#1]
282 orr $Tlo,$Tlo,$t1,lsl#16
283 ldrb $t1, [$inp],#8
284 orr $Tlo,$Tlo,$t2,lsl#24
285 orr $Thi,$Thi,$t3,lsl#8
286 orr $Thi,$Thi,$t0,lsl#16
287 orr $Thi,$Thi,$t1,lsl#24
288#else
289 ldr $Tlo,[$inp,#4]
290 ldr $Thi,[$inp],#8
291#ifdef __ARMEL__
292 rev $Tlo,$Tlo
293 rev $Thi,$Thi
294#endif
295#endif
296___
297 &BODY_00_15(0x94);
298$code.=<<___;
299 tst $Ktbl,#1
300 beq .L00_15
301 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
302 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
303 bic $Ktbl,$Ktbl,#1
304.L16_79:
305 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
306 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
307 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
308 mov $Tlo,$t0,lsr#1
309 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
310 mov $Thi,$t1,lsr#1
311 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
312 eor $Tlo,$Tlo,$t1,lsl#31
313 eor $Thi,$Thi,$t0,lsl#31
314 eor $Tlo,$Tlo,$t0,lsr#8
315 eor $Thi,$Thi,$t1,lsr#8
316 eor $Tlo,$Tlo,$t1,lsl#24
317 eor $Thi,$Thi,$t0,lsl#24
318 eor $Tlo,$Tlo,$t0,lsr#7
319 eor $Thi,$Thi,$t1,lsr#7
320 eor $Tlo,$Tlo,$t1,lsl#25
321
322 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
323 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
324 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
325 mov $t0,$t2,lsr#19
326 mov $t1,$t3,lsr#19
327 eor $t0,$t0,$t3,lsl#13
328 eor $t1,$t1,$t2,lsl#13
329 eor $t0,$t0,$t3,lsr#29
330 eor $t1,$t1,$t2,lsr#29
331 eor $t0,$t0,$t2,lsl#3
332 eor $t1,$t1,$t3,lsl#3
333 eor $t0,$t0,$t2,lsr#6
334 eor $t1,$t1,$t3,lsr#6
335 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
336 eor $t0,$t0,$t3,lsl#26
337
338 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
339 adds $Tlo,$Tlo,$t0
340 ldr $t0,[sp,#`$Xoff+8*16`+0]
341 adc $Thi,$Thi,$t1
342
343 ldr $t1,[sp,#`$Xoff+8*16`+4]
344 adds $Tlo,$Tlo,$t2
345 adc $Thi,$Thi,$t3
346 adds $Tlo,$Tlo,$t0
347 adc $Thi,$Thi,$t1
348___
349 &BODY_00_15(0x17);
350$code.=<<___;
351 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
352 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
353 beq .L16_79
354 bic $Ktbl,$Ktbl,#1
355
356 ldr $Tlo,[sp,#$Boff+0]
357 ldr $Thi,[sp,#$Boff+4]
358 ldr $t0, [$ctx,#$Aoff+$lo]
359 ldr $t1, [$ctx,#$Aoff+$hi]
360 ldr $t2, [$ctx,#$Boff+$lo]
361 ldr $t3, [$ctx,#$Boff+$hi]
362 adds $t0,$Alo,$t0
363 str $t0, [$ctx,#$Aoff+$lo]
364 adc $t1,$Ahi,$t1
365 str $t1, [$ctx,#$Aoff+$hi]
366 adds $t2,$Tlo,$t2
367 str $t2, [$ctx,#$Boff+$lo]
368 adc $t3,$Thi,$t3
369 str $t3, [$ctx,#$Boff+$hi]
370
371 ldr $Alo,[sp,#$Coff+0]
372 ldr $Ahi,[sp,#$Coff+4]
373 ldr $Tlo,[sp,#$Doff+0]
374 ldr $Thi,[sp,#$Doff+4]
375 ldr $t0, [$ctx,#$Coff+$lo]
376 ldr $t1, [$ctx,#$Coff+$hi]
377 ldr $t2, [$ctx,#$Doff+$lo]
378 ldr $t3, [$ctx,#$Doff+$hi]
379 adds $t0,$Alo,$t0
380 str $t0, [$ctx,#$Coff+$lo]
381 adc $t1,$Ahi,$t1
382 str $t1, [$ctx,#$Coff+$hi]
383 adds $t2,$Tlo,$t2
384 str $t2, [$ctx,#$Doff+$lo]
385 adc $t3,$Thi,$t3
386 str $t3, [$ctx,#$Doff+$hi]
387
388 ldr $Tlo,[sp,#$Foff+0]
389 ldr $Thi,[sp,#$Foff+4]
390 ldr $t0, [$ctx,#$Eoff+$lo]
391 ldr $t1, [$ctx,#$Eoff+$hi]
392 ldr $t2, [$ctx,#$Foff+$lo]
393 ldr $t3, [$ctx,#$Foff+$hi]
394 adds $Elo,$Elo,$t0
395 str $Elo,[$ctx,#$Eoff+$lo]
396 adc $Ehi,$Ehi,$t1
397 str $Ehi,[$ctx,#$Eoff+$hi]
398 adds $t2,$Tlo,$t2
399 str $t2, [$ctx,#$Foff+$lo]
400 adc $t3,$Thi,$t3
401 str $t3, [$ctx,#$Foff+$hi]
402
403 ldr $Alo,[sp,#$Goff+0]
404 ldr $Ahi,[sp,#$Goff+4]
405 ldr $Tlo,[sp,#$Hoff+0]
406 ldr $Thi,[sp,#$Hoff+4]
407 ldr $t0, [$ctx,#$Goff+$lo]
408 ldr $t1, [$ctx,#$Goff+$hi]
409 ldr $t2, [$ctx,#$Hoff+$lo]
410 ldr $t3, [$ctx,#$Hoff+$hi]
411 adds $t0,$Alo,$t0
412 str $t0, [$ctx,#$Goff+$lo]
413 adc $t1,$Ahi,$t1
414 str $t1, [$ctx,#$Goff+$hi]
415 adds $t2,$Tlo,$t2
416 str $t2, [$ctx,#$Hoff+$lo]
417 adc $t3,$Thi,$t3
418 str $t3, [$ctx,#$Hoff+$hi]
419
420 add sp,sp,#640
421 sub $Ktbl,$Ktbl,#640
422
423 teq $inp,$len
424 bne .Loop
425
426 add sp,sp,#8*9 @ destroy frame
427#if __ARM_ARCH__>=5
428 ldmia sp!,{r4-r12,pc}
429#else
430 ldmia sp!,{r4-r12,lr}
431 tst lr,#1
432 moveq pc,lr @ be binary compatible with V4, yet
433 bx lr @ interoperable with Thumb ISA:-)
434#endif
435___
436
437{
438my @Sigma0=(28,34,39);
439my @Sigma1=(14,18,41);
440my @sigma0=(1, 8, 7);
441my @sigma1=(19,61,6);
442
443my $Ktbl="r3";
444my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
445
446my @X=map("d$_",(0..15));
447my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
448
449sub NEON_00_15() {
450my $i=shift;
451my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
452my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
453
454$code.=<<___ if ($i<16 || $i&1);
455 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
456#if $i<16
457 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
458#endif
459 vshr.u64 $t1,$e,#@Sigma1[1]
460 vshr.u64 $t2,$e,#@Sigma1[2]
461___
462$code.=<<___;
463 vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
464 vsli.64 $t0,$e,#`64-@Sigma1[0]`
465 vsli.64 $t1,$e,#`64-@Sigma1[1]`
466 vsli.64 $t2,$e,#`64-@Sigma1[2]`
467#if $i<16 && defined(__ARMEL__)
468 vrev64.8 @X[$i],@X[$i]
469#endif
470 vadd.i64 $T1,$K,$h
471 veor $Ch,$f,$g
472 veor $t0,$t1
473 vand $Ch,$e
474 veor $t0,$t2 @ Sigma1(e)
475 veor $Ch,$g @ Ch(e,f,g)
476 vadd.i64 $T1,$t0
477 vshr.u64 $t0,$a,#@Sigma0[0]
478 vadd.i64 $T1,$Ch
479 vshr.u64 $t1,$a,#@Sigma0[1]
480 vshr.u64 $t2,$a,#@Sigma0[2]
481 vsli.64 $t0,$a,#`64-@Sigma0[0]`
482 vsli.64 $t1,$a,#`64-@Sigma0[1]`
483 vsli.64 $t2,$a,#`64-@Sigma0[2]`
484 vadd.i64 $T1,@X[$i%16]
485 vorr $Maj,$a,$c
486 vand $Ch,$a,$c
487 veor $h,$t0,$t1
488 vand $Maj,$b
489 veor $h,$t2 @ Sigma0(a)
490 vorr $Maj,$Ch @ Maj(a,b,c)
491 vadd.i64 $h,$T1
492 vadd.i64 $d,$T1
493 vadd.i64 $h,$Maj
494___
495}
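An aside on the round function above (not part of the module): NEON_00_15 evaluates the choice and majority functions in rearranged forms that need a single temporary each, Ch(e,f,g) = ((f^g)&e)^g and Maj(a,b,c) = ((a|c)&b)|(a&c). A brute-force one-bit check that these match the textbook definitions (illustrative only):

	# exhaustive check of the rewritten Ch and Maj over single bits
	use strict; use warnings;
	for my $e (0,1) { for my $f (0,1) { for my $g (0,1) {
		die "Ch mismatch"  unless ((($f^$g)&$e)^$g) == (($e&$f)^((1^$e)&$g));
	}}}
	for my $a (0,1) { for my $b (0,1) { for my $c (0,1) {
		die "Maj mismatch" unless ((($a|$c)&$b)|($a&$c)) == (($a&$b)^($a&$c)^($b&$c));
	}}}
	print "Ch and Maj identities hold\n";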
496
497sub NEON_16_79() {
498my $i=shift;
499
500if ($i&1) { &NEON_00_15($i,@_); return; }
501
502# 2x-vectorized, therefore runs every 2nd round
503my @X=map("q$_",(0..7)); # view @X as 128-bit vector
504my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
505my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
506my $e=@_[4]; # $e from NEON_00_15
507$i /= 2;
508$code.=<<___;
509 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
510 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
511 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
512 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
513 vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
514 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
515 veor $s1,$t0
516 vshr.u64 $t0,$s0,#@sigma0[0]
517 veor $s1,$t1 @ sigma1(X[i+14])
518 vshr.u64 $t1,$s0,#@sigma0[1]
519 vadd.i64 @X[$i%8],$s1
520 vshr.u64 $s1,$s0,#@sigma0[2]
521 vsli.64 $t0,$s0,#`64-@sigma0[0]`
522 vsli.64 $t1,$s0,#`64-@sigma0[1]`
523 vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
524 veor $s1,$t0
525 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
526 vadd.i64 @X[$i%8],$s0
527 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
528 veor $s1,$t1 @ sigma0(X[i+1])
529 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
530 vadd.i64 @X[$i%8],$s1
531___
532 &NEON_00_15(2*$i,@_);
533}
534
535$code.=<<___;
536#if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
537.fpu neon
538
539.align 4
540.LNEON:
541 dmb @ errata #451034 on early Cortex A8
542 vstmdb sp!,{d8-d15} @ ABI specification says so
543 sub $Ktbl,r3,#672 @ K512
544 vldmia $ctx,{$A-$H} @ load context
545.Loop_neon:
546___
547for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
548$code.=<<___;
549 mov $cnt,#4
550.L16_79_neon:
551 subs $cnt,#1
552___
553for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
554$code.=<<___;
555 bne .L16_79_neon
556
557 vldmia $ctx,{d24-d31} @ load context to temp
558 vadd.i64 q8,q12 @ vectorized accumulate
559 vadd.i64 q9,q13
560 vadd.i64 q10,q14
561 vadd.i64 q11,q15
562 vstmia $ctx,{$A-$H} @ save context
563 teq $inp,$len
564 sub $Ktbl,#640 @ rewind K512
565 bne .Loop_neon
566
567 vldmia sp!,{d8-d15} @ epilogue
568 bx lr
569#endif
570___
571}
572$code.=<<___;
573.size sha512_block_data_order,.-sha512_block_data_order
574.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
575.align 2
576.comm OPENSSL_armcap_P,4,4
577___
578
579$code =~ s/\`([^\`]*)\`/eval $1/gem;
580$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
581print $code;
582close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/sha/asm/sha512-mips.pl b/src/lib/libcrypto/sha/asm/sha512-mips.pl
deleted file mode 100644
index 495a000695..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-mips.pl
+++ /dev/null
@@ -1,457 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA2 block procedures for MIPS.
11
12# October 2010.
13#
14# SHA256 performance improvement on MIPS R5000 CPU is ~27% over gcc-
15# generated code in o32 build and ~55% in n32/64 build. SHA512 [which
16# for now can only be compiled for MIPS64 ISA] improvement is a modest
17# ~17%, but it comes for free, because it's the same instruction sequence.
18# Improvement coefficients are for aligned input.
19
20######################################################################
21# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
22# widely used. Then there is a new contender: NUBI. It appears that if
23# one picks the latter, it's possible to arrange code in ABI neutral
24# manner. Therefore let's stick to NUBI register layout:
25#
26($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
27($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
28($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
29($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
30#
31# The return value is placed in $a0. Following coding rules facilitate
32# interoperability:
33#
34# - never ever touch $tp, "thread pointer", former $gp [o32 can be
35# excluded from the rule, because it's specified volatile];
36# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
37# old code];
38# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
39#
40# For reference here is register layout for N32/64 MIPS ABIs:
41#
42# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
43# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
44# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
45# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
46# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
47#
48$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
49
50if ($flavour =~ /64/i) {
51 $LA="dla";
52} else {
53 $LA="la";
54}
55
56if ($flavour =~ /64|n32/i) {
57 $PTR_ADD="dadd"; # incidentally works even on n32
58 $PTR_SUB="dsub"; # incidentally works even on n32
59 $REG_S="sd";
60 $REG_L="ld";
61 $PTR_SLL="dsll"; # incidentally works even on n32
62 $SZREG=8;
63} else {
64 $PTR_ADD="add";
65 $PTR_SUB="sub";
66 $REG_S="sw";
67 $REG_L="lw";
68 $PTR_SLL="sll";
69 $SZREG=4;
70}
71$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
72#
73# <appro@openssl.org>
74#
75######################################################################
76
77$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
78
79for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
80open STDOUT,">$output";
81
82if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); }
83
84if ($output =~ /512/) {
85 $label="512";
86 $SZ=8;
87 $LD="ld"; # load from memory
88 $ST="sd"; # store to memory
89 $SLL="dsll"; # shift left logical
90 $SRL="dsrl"; # shift right logical
91 $ADDU="daddu";
92 @Sigma0=(28,34,39);
93 @Sigma1=(14,18,41);
94 @sigma0=( 7, 1, 8); # right shift first
95 @sigma1=( 6,19,61); # right shift first
96 $lastK=0x817;
97 $rounds=80;
98} else {
99 $label="256";
100 $SZ=4;
101 $LD="lw"; # load from memory
102 $ST="sw"; # store to memory
103 $SLL="sll"; # shift left logical
104 $SRL="srl"; # shift right logical
105 $ADDU="addu";
106 @Sigma0=( 2,13,22);
107 @Sigma1=( 6,11,25);
108 @sigma0=( 3, 7,18); # right shift first
109 @sigma1=(10,17,19); # right shift first
110 $lastK=0x8f2;
111 $rounds=64;
112}
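A side note on $lastK, for readers of the loop below (not part of the module): each round constant loaded from the K table is masked with 0xfff and compared against $lastK, so the code never needs a round counter; 0x817 and 0x8f2 are the low 12 bits of the final SHA-512 and SHA-256 constants, and presumably no earlier table entry collides with them. The x86 and PA-RISC modules in this directory play the same trick with the low byte (0x94/0x17) and the low 10 bits ($LAST10BITS) respectively. An illustrative check, assuming a 64-bit perl:

	# the .Loop/.L16_xx code below exits when (K[i] & 0xfff) == $lastK
	printf "SHA-512: 0x%03x  SHA-256: 0x%03x\n",
	       0x6c44198c4a475817 & 0xfff,	# last K512 entry -> 0x817
	       0xc67178f2 & 0xfff;		# last K256 entry -> 0x8f2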
113
114$MSB = $big_endian ? 0 : ($SZ-1);
115$LSB = ($SZ-1)&~$MSB;
116
117@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31));
118@X=map("\$$_",(8..23));
119
120$ctx=$a0;
121$inp=$a1;
122$len=$a2; $Ktbl=$len;
123
124sub BODY_00_15 {
125my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
126my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]);
127
128$code.=<<___ if ($i<15);
129 ${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp)
130 ${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp)
131___
132$code.=<<___ if (!$big_endian && $i<16 && $SZ==4);
133 srl $tmp0,@X[0],24 # byte swap($i)
134 srl $tmp1,@X[0],8
135 andi $tmp2,@X[0],0xFF00
136 sll @X[0],@X[0],24
137 andi $tmp1,0xFF00
138 sll $tmp2,$tmp2,8
139 or @X[0],$tmp0
140 or $tmp1,$tmp2
141 or @X[0],$tmp1
142___
143$code.=<<___ if (!$big_endian && $i<16 && $SZ==8);
144 ori $tmp0,$zero,0xFF
145 dsll $tmp2,$tmp0,32
146 or $tmp0,$tmp2 # 0x000000FF000000FF
147 and $tmp1,@X[0],$tmp0 # byte swap($i)
148 dsrl $tmp2,@X[0],24
149 dsll $tmp1,24
150 and $tmp2,$tmp0
151 dsll $tmp0,8 # 0x0000FF000000FF00
152 or $tmp1,$tmp2
153 and $tmp2,@X[0],$tmp0
154 dsrl @X[0],8
155 dsll $tmp2,8
156 and @X[0],$tmp0
157 or $tmp1,$tmp2
158 or @X[0],$tmp1
159 dsrl $tmp1,@X[0],32
160 dsll @X[0],32
161 or @X[0],$tmp1
162___
163$code.=<<___;
164 $ADDU $T1,$X[0],$h # $i
165 $SRL $h,$e,@Sigma1[0]
166 xor $tmp2,$f,$g
167 $SLL $tmp1,$e,`$SZ*8-@Sigma1[2]`
168 and $tmp2,$e
169 $SRL $tmp0,$e,@Sigma1[1]
170 xor $h,$tmp1
171 $SLL $tmp1,$e,`$SZ*8-@Sigma1[1]`
172 xor $h,$tmp0
173 $SRL $tmp0,$e,@Sigma1[2]
174 xor $h,$tmp1
175 $SLL $tmp1,$e,`$SZ*8-@Sigma1[0]`
176 xor $h,$tmp0
177 xor $tmp2,$g # Ch(e,f,g)
178 xor $tmp0,$tmp1,$h # Sigma1(e)
179
180 $SRL $h,$a,@Sigma0[0]
181 $ADDU $T1,$tmp2
182 $LD $tmp2,`$i*$SZ`($Ktbl) # K[$i]
183 $SLL $tmp1,$a,`$SZ*8-@Sigma0[2]`
184 $ADDU $T1,$tmp0
185 $SRL $tmp0,$a,@Sigma0[1]
186 xor $h,$tmp1
187 $SLL $tmp1,$a,`$SZ*8-@Sigma0[1]`
188 xor $h,$tmp0
189 $SRL $tmp0,$a,@Sigma0[2]
190 xor $h,$tmp1
191 $SLL $tmp1,$a,`$SZ*8-@Sigma0[0]`
192 xor $h,$tmp0
193 $ST @X[0],`($i%16)*$SZ`($sp) # offload to ring buffer
194 xor $h,$tmp1 # Sigma0(a)
195
196 or $tmp0,$a,$b
197 and $tmp1,$a,$b
198 and $tmp0,$c
199 or $tmp1,$tmp0 # Maj(a,b,c)
200 $ADDU $T1,$tmp2 # +=K[$i]
201 $ADDU $h,$tmp1
202
203 $ADDU $d,$T1
204 $ADDU $h,$T1
205___
206$code.=<<___ if ($i>=13);
207 $LD @X[3],`(($i+3)%16)*$SZ`($sp) # prefetch from ring buffer
208___
209}
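Two details of BODY_00_15 above, noted here for clarity (neither note is from the module itself): the ${LD}l/${LD}r pair expands to the MIPS lwl/lwr (or ldl/ldr in the 64-bit flavour) instructions, which together load a possibly unaligned word using the $MSB/$LSB byte offsets set up earlier; and the srl/andi/sll/or sequence emitted for little-endian SZ==4 builds is simply an in-register 32-bit byte swap. A minimal sketch of the latter (mips_swap32 is a made-up name):

	# illustrative only; the srl/andi/sll/or sequence computes bswap32
	sub mips_swap32 {
		my $x = shift;
		(($x << 24) | ($x >> 24) | (($x >> 8) & 0xFF00) | (($x & 0xFF00) << 8))
			& 0xffffffff;
	}
	printf "%08x\n", mips_swap32(0x01020304);	# prints 04030201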
210
211sub BODY_16_XX {
212my $i=@_[0];
213my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]);
214
215$code.=<<___;
216 $SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i)
217 $ADDU @X[0],@X[9] # +=X[i+9]
218 $SLL $tmp1,@X[1],`$SZ*8-@sigma0[2]`
219 $SRL $tmp0,@X[1],@sigma0[1]
220 xor $tmp2,$tmp1
221 $SLL $tmp1,`@sigma0[2]-@sigma0[1]`
222 xor $tmp2,$tmp0
223 $SRL $tmp0,@X[1],@sigma0[2]
224 xor $tmp2,$tmp1
225
226 $SRL $tmp3,@X[14],@sigma1[0]
227 xor $tmp2,$tmp0 # sigma0(X[i+1])
228 $SLL $tmp1,@X[14],`$SZ*8-@sigma1[2]`
229 $ADDU @X[0],$tmp2
230 $SRL $tmp0,@X[14],@sigma1[1]
231 xor $tmp3,$tmp1
232 $SLL $tmp1,`@sigma1[2]-@sigma1[1]`
233 xor $tmp3,$tmp0
234 $SRL $tmp0,@X[14],@sigma1[2]
235 xor $tmp3,$tmp1
236
237 xor $tmp3,$tmp0 # sigma1(X[i+14])
238 $ADDU @X[0],$tmp3
239___
240 &BODY_00_15(@_);
241}
242
243$FRAMESIZE=16*$SZ+16*$SZREG;
244$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
245
246$code.=<<___;
247.text
248.set noat
249#if !defined(__vxworks) || defined(__pic__)
250.option pic2
251#endif
252
253.align 5
254.globl sha${label}_block_data_order
255.ent sha${label}_block_data_order
256sha${label}_block_data_order:
257 .frame $sp,$FRAMESIZE,$ra
258 .mask $SAVED_REGS_MASK,-$SZREG
259 .set noreorder
260___
261$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
262 .cpload $pf
263___
264$code.=<<___;
265 $PTR_SUB $sp,$FRAMESIZE
266 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
267 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
268 $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
269 $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
270 $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
271 $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
272 $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
273 $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
274 $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
275 $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
276___
277$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
278 $REG_S $s3,$FRAMESIZE-11*$SZREG($sp)
279 $REG_S $s2,$FRAMESIZE-12*$SZREG($sp)
280 $REG_S $s1,$FRAMESIZE-13*$SZREG($sp)
281 $REG_S $s0,$FRAMESIZE-14*$SZREG($sp)
282 $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
283___
284$code.=<<___;
285 $PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)`
286___
287$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
288 .cplocal $Ktbl
289 .cpsetup $pf,$zero,sha${label}_block_data_order
290___
291$code.=<<___;
292 .set reorder
293 $LA $Ktbl,K${label} # PIC-ified 'load address'
294
295 $LD $A,0*$SZ($ctx) # load context
296 $LD $B,1*$SZ($ctx)
297 $LD $C,2*$SZ($ctx)
298 $LD $D,3*$SZ($ctx)
299 $LD $E,4*$SZ($ctx)
300 $LD $F,5*$SZ($ctx)
301 $LD $G,6*$SZ($ctx)
302 $LD $H,7*$SZ($ctx)
303
304 $PTR_ADD @X[15],$inp # pointer to the end of input
305 $REG_S @X[15],16*$SZ($sp)
306 b .Loop
307
308.align 5
309.Loop:
310 ${LD}l @X[0],$MSB($inp)
311 ${LD}r @X[0],$LSB($inp)
312___
313for ($i=0;$i<16;$i++)
314{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
315$code.=<<___;
316 b .L16_xx
317.align 4
318.L16_xx:
319___
320for (;$i<32;$i++)
321{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
322$code.=<<___;
323 and @X[6],0xfff
324 li @X[7],$lastK
325 .set noreorder
326 bne @X[6],@X[7],.L16_xx
327 $PTR_ADD $Ktbl,16*$SZ # Ktbl+=16
328
329 $REG_L @X[15],16*$SZ($sp) # restore pointer to the end of input
330 $LD @X[0],0*$SZ($ctx)
331 $LD @X[1],1*$SZ($ctx)
332 $LD @X[2],2*$SZ($ctx)
333 $PTR_ADD $inp,16*$SZ
334 $LD @X[3],3*$SZ($ctx)
335 $ADDU $A,@X[0]
336 $LD @X[4],4*$SZ($ctx)
337 $ADDU $B,@X[1]
338 $LD @X[5],5*$SZ($ctx)
339 $ADDU $C,@X[2]
340 $LD @X[6],6*$SZ($ctx)
341 $ADDU $D,@X[3]
342 $LD @X[7],7*$SZ($ctx)
343 $ADDU $E,@X[4]
344 $ST $A,0*$SZ($ctx)
345 $ADDU $F,@X[5]
346 $ST $B,1*$SZ($ctx)
347 $ADDU $G,@X[6]
348 $ST $C,2*$SZ($ctx)
349 $ADDU $H,@X[7]
350 $ST $D,3*$SZ($ctx)
351 $ST $E,4*$SZ($ctx)
352 $ST $F,5*$SZ($ctx)
353 $ST $G,6*$SZ($ctx)
354 $ST $H,7*$SZ($ctx)
355
356 bne $inp,@X[15],.Loop
357 $PTR_SUB $Ktbl,`($rounds-16)*$SZ` # rewind $Ktbl
358
359 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
360 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
361 $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
362 $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
363 $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
364 $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
365 $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
366 $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
367 $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
368 $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
369___
370$code.=<<___ if ($flavour =~ /nubi/i);
371 $REG_L $s3,$FRAMESIZE-11*$SZREG($sp)
372 $REG_L $s2,$FRAMESIZE-12*$SZREG($sp)
373 $REG_L $s1,$FRAMESIZE-13*$SZREG($sp)
374 $REG_L $s0,$FRAMESIZE-14*$SZREG($sp)
375 $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
376___
377$code.=<<___;
378 jr $ra
379 $PTR_ADD $sp,$FRAMESIZE
380.end sha${label}_block_data_order
381
382.rdata
383.align 5
384K${label}:
385___
386if ($SZ==4) {
387$code.=<<___;
388 .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
389 .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
390 .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
391 .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
392 .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
393 .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
394 .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
395 .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
396 .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
397 .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
398 .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
399 .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
400 .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
401 .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
402 .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
403 .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
404___
405} else {
406$code.=<<___;
407 .dword 0x428a2f98d728ae22, 0x7137449123ef65cd
408 .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
409 .dword 0x3956c25bf348b538, 0x59f111f1b605d019
410 .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
411 .dword 0xd807aa98a3030242, 0x12835b0145706fbe
412 .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
413 .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
414 .dword 0x9bdc06a725c71235, 0xc19bf174cf692694
415 .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
416 .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
417 .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
418 .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
419 .dword 0x983e5152ee66dfab, 0xa831c66d2db43210
420 .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4
421 .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725
422 .dword 0x06ca6351e003826f, 0x142929670a0e6e70
423 .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926
424 .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
425 .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8
426 .dword 0x81c2c92e47edaee6, 0x92722c851482353b
427 .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001
428 .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30
429 .dword 0xd192e819d6ef5218, 0xd69906245565a910
430 .dword 0xf40e35855771202a, 0x106aa07032bbd1b8
431 .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
432 .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
433 .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
434 .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
435 .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60
436 .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec
437 .dword 0x90befffa23631e28, 0xa4506cebde82bde9
438 .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b
439 .dword 0xca273eceea26619c, 0xd186b8c721c0c207
440 .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
441 .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6
442 .dword 0x113f9804bef90dae, 0x1b710b35131c471b
443 .dword 0x28db77f523047d84, 0x32caab7b40c72493
444 .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
445 .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
446 .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
447___
448}
449$code.=<<___;
450.asciiz "SHA${label} for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
451.align 5
452
453___
454
455$code =~ s/\`([^\`]*)\`/eval $1/gem;
456print $code;
457close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-parisc.pl b/src/lib/libcrypto/sha/asm/sha512-parisc.pl
deleted file mode 100755
index 42832e29f1..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-parisc.pl
+++ /dev/null
@@ -1,801 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256/512 block procedure for PA-RISC.
11
12# June 2009.
13#
14# SHA256 performance is >75% better than gcc 3.2 generated code on
15# PA-7100LC. Compared to code generated by vendor compiler this
16# implementation is almost 70% faster in 64-bit build, but delivers
17# virtually the same performance in 32-bit build on PA-8600.
18#
19# SHA512 performance is >2.9x better than gcc 3.2 generated code on
20# PA-7100LC, PA-RISC 1.1 processor. The implementation then detects if the
21# code is executed on a PA-RISC 2.0 processor and switches to the 64-bit
22# code path, delivering adequate performance even in "blended" 32-bit
23# build. Though 64-bit code is not any faster than code generated by
24# vendor compiler on PA-8600...
25#
26# Special thanks to polarhome.com for providing HP-UX account.
27
28$flavour = shift;
29$output = shift;
30open STDOUT,">$output";
31
32if ($flavour =~ /64/) {
33 $LEVEL ="2.0W";
34 $SIZE_T =8;
35 $FRAME_MARKER =80;
36 $SAVED_RP =16;
37 $PUSH ="std";
38 $PUSHMA ="std,ma";
39 $POP ="ldd";
40 $POPMB ="ldd,mb";
41} else {
42 $LEVEL ="1.0";
43 $SIZE_T =4;
44 $FRAME_MARKER =48;
45 $SAVED_RP =20;
46 $PUSH ="stw";
47 $PUSHMA ="stwm";
48 $POP ="ldw";
49 $POPMB ="ldwm";
50}
51
52if ($output =~ /512/) {
53 $func="sha512_block_data_order";
54 $SZ=8;
55 @Sigma0=(28,34,39);
56 @Sigma1=(14,18,41);
57 @sigma0=(1, 8, 7);
58 @sigma1=(19,61, 6);
59 $rounds=80;
60 $LAST10BITS=0x017;
61 $LD="ldd";
62 $LDM="ldd,ma";
63 $ST="std";
64} else {
65 $func="sha256_block_data_order";
66 $SZ=4;
67 @Sigma0=( 2,13,22);
68 @Sigma1=( 6,11,25);
69 @sigma0=( 7,18, 3);
70 @sigma1=(17,19,10);
71 $rounds=64;
72 $LAST10BITS=0x0f2;
73 $LD="ldw";
74 $LDM="ldwm";
75 $ST="stw";
76}
77
78$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
79 # [+ argument transfer]
80$XOFF=16*$SZ+32; # local variables
81$FRAME+=$XOFF;
82$XOFF+=$FRAME_MARKER; # distance between %sp and local variables
83
84$ctx="%r26"; # zapped by $a0
85$inp="%r25"; # zapped by $a1
86$num="%r24"; # zapped by $t0
87
88$a0 ="%r26";
89$a1 ="%r25";
90$t0 ="%r24";
91$t1 ="%r29";
92$Tbl="%r31";
93
94@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");
95
96@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
97 "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);
98
99sub ROUND_00_15 {
100my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
101$code.=<<___;
102 _ror $e,$Sigma1[0],$a0
103 and $f,$e,$t0
104 _ror $e,$Sigma1[1],$a1
105 addl $t1,$h,$h
106 andcm $g,$e,$t1
107 xor $a1,$a0,$a0
108 _ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1
109 or $t0,$t1,$t1 ; Ch(e,f,g)
110 addl @X[$i%16],$h,$h
111 xor $a0,$a1,$a1 ; Sigma1(e)
112 addl $t1,$h,$h
113 _ror $a,$Sigma0[0],$a0
114 addl $a1,$h,$h
115
116 _ror $a,$Sigma0[1],$a1
117 and $a,$b,$t0
118 and $a,$c,$t1
119 xor $a1,$a0,$a0
120 _ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1
121 xor $t1,$t0,$t0
122 and $b,$c,$t1
123 xor $a0,$a1,$a1 ; Sigma0(a)
124 addl $h,$d,$d
125 xor $t1,$t0,$t0 ; Maj(a,b,c)
126 `"$LDM $SZ($Tbl),$t1" if ($i<15)`
127 addl $a1,$h,$h
128 addl $t0,$h,$h
129
130___
131}
132
133sub ROUND_16_xx {
134my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
135$i-=16;
136$code.=<<___;
137 _ror @X[($i+1)%16],$sigma0[0],$a0
138 _ror @X[($i+1)%16],$sigma0[1],$a1
139 addl @X[($i+9)%16],@X[$i],@X[$i]
140 _ror @X[($i+14)%16],$sigma1[0],$t0
141 _ror @X[($i+14)%16],$sigma1[1],$t1
142 xor $a1,$a0,$a0
143 _shr @X[($i+1)%16],$sigma0[2],$a1
144 xor $t1,$t0,$t0
145 _shr @X[($i+14)%16],$sigma1[2],$t1
146 xor $a1,$a0,$a0 ; sigma0(X[(i+1)&0x0f])
147 xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f])
148 $LDM $SZ($Tbl),$t1
149 addl $a0,@X[$i],@X[$i]
150 addl $t0,@X[$i],@X[$i]
151___
152$code.=<<___ if ($i==15);
153 extru $t1,31,10,$a1
154 comiclr,<> $LAST10BITS,$a1,%r0
155 ldo 1($Tbl),$Tbl ; signal end of $Tbl
156___
157&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
158}
159
160$code=<<___;
161 .LEVEL $LEVEL
162 .text
163
164 .section .rodata
165 .ALIGN 64
166L\$table
167___
168$code.=<<___ if ($SZ==8);
169 .WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
170 .WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
171 .WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
172 .WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
173 .WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
174 .WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
175 .WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
176 .WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
177 .WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
178 .WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
179 .WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
180 .WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
181 .WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
182 .WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
183 .WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
184 .WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
185 .WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
186 .WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
187 .WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
188 .WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
189 .WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
190 .WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
191 .WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
192 .WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
193 .WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
194 .WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
195 .WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
196 .WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
197 .WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
198 .WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
199 .WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
200 .WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
201 .WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
202 .WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
203 .WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
204 .WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
205 .WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
206 .WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
207 .WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
208 .WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
209___
210$code.=<<___ if ($SZ==4);
211 .WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
212 .WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
213 .WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
214 .WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
215 .WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
216 .WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
217 .WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
218 .WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
219 .WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
220 .WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
221 .WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
222 .WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
223 .WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
224 .WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
225 .WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
226 .WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
227___
228$code.=<<___;
229 .previous
230
231 .EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
232 .ALIGN 64
233$func
234 .PROC
235 .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
236 .ENTRY
237 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
238 $PUSHMA %r3,$FRAME(%sp)
239 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
240 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
241 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
242 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
243 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
244 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
245 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
246 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
247 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
248 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
249 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
250 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
251 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
252 $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
253 $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
254
255 _shl $num,`log(16*$SZ)/log(2)`,$num
256 addl $inp,$num,$num ; $num to point at the end of $inp
257
258 $PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments
259 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
260 $PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)
261
262#ifdef __PIC__
263 addil LT'L\$table, %r19
264 ldw RT'L\$table(%r1), $Tbl
265#else
266 ldil L'L\$table, %t1
267 ldo R'L\$table(%t1), $Tbl
268#endif
269___
270$code.=<<___ if ($SZ==8 && $SIZE_T==4);
271#ifndef __OpenBSD__
272___
273$code.=<<___ if ($SZ==8 && $SIZE_T==4);
274 ldi 31,$t1
275 mtctl $t1,%cr11
276 extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0
277 b L\$parisc1
278 nop
279___
280$code.=<<___;
281 $LD `0*$SZ`($ctx),$A ; load context
282 $LD `1*$SZ`($ctx),$B
283 $LD `2*$SZ`($ctx),$C
284 $LD `3*$SZ`($ctx),$D
285 $LD `4*$SZ`($ctx),$E
286 $LD `5*$SZ`($ctx),$F
287 $LD `6*$SZ`($ctx),$G
288 $LD `7*$SZ`($ctx),$H
289
290 extru $inp,31,`log($SZ)/log(2)`,$t0
291 sh3addl $t0,%r0,$t0
292 subi `8*$SZ`,$t0,$t0
293 mtctl $t0,%cr11 ; load %sar with align factor
294
295L\$oop
296 ldi `$SZ-1`,$t0
297 $LDM $SZ($Tbl),$t1
298 andcm $inp,$t0,$t0 ; align $inp
299___
300 for ($i=0;$i<15;$i++) { # load input block
301 $code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; }
302$code.=<<___;
303 cmpb,*= $inp,$t0,L\$aligned
304 $LD `$SZ*15`($t0),@X[15]
305 $LD `$SZ*16`($t0),@X[16]
306___
307 for ($i=0;$i<16;$i++) { # align data
308 $code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; }
309$code.=<<___;
310L\$aligned
311 nop ; otherwise /usr/ccs/bin/as is confused by below .WORD
312___
313
314for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
315$code.=<<___;
316L\$rounds
317 nop ; otherwise /usr/ccs/bin/as is confused by below .WORD
318___
319for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
320$code.=<<___;
321 bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled?
322 nop
323
324 $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
325 $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
326 $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
327 ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl
328
329 $LD `0*$SZ`($ctx),@X[0] ; load context
330 $LD `1*$SZ`($ctx),@X[1]
331 $LD `2*$SZ`($ctx),@X[2]
332 $LD `3*$SZ`($ctx),@X[3]
333 $LD `4*$SZ`($ctx),@X[4]
334 $LD `5*$SZ`($ctx),@X[5]
335 addl @X[0],$A,$A
336 $LD `6*$SZ`($ctx),@X[6]
337 addl @X[1],$B,$B
338 $LD `7*$SZ`($ctx),@X[7]
339 ldo `16*$SZ`($inp),$inp ; advance $inp
340
341 $ST $A,`0*$SZ`($ctx) ; save context
342 addl @X[2],$C,$C
343 $ST $B,`1*$SZ`($ctx)
344 addl @X[3],$D,$D
345 $ST $C,`2*$SZ`($ctx)
346 addl @X[4],$E,$E
347 $ST $D,`3*$SZ`($ctx)
348 addl @X[5],$F,$F
349 $ST $E,`4*$SZ`($ctx)
350 addl @X[6],$G,$G
351 $ST $F,`5*$SZ`($ctx)
352 addl @X[7],$H,$H
353 $ST $G,`6*$SZ`($ctx)
354 $ST $H,`7*$SZ`($ctx)
355
356 cmpb,*<>,n $inp,$num,L\$oop
357 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
358___
359if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0
360{{
361$code.=<<___;
362 b L\$done
363 nop
364
365 .ALIGN 64
366L\$parisc1
367___
368$code.=<<___ if ($SZ==8 && $SIZE_T==4);
369#endif
370___
371
372@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo,
373 $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
374 ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
375 "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
376$a0 ="%r17";
377$a1 ="%r18";
378$a2 ="%r19";
379$a3 ="%r20";
380$t0 ="%r21";
381$t1 ="%r22";
382$t2 ="%r28";
383$t3 ="%r29";
384$Tbl="%r31";
385
386@X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx
387
388sub ROUND_00_15_pa1 {
389my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
390 $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
391my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
392
393$code.=<<___ if (!$flag);
394 ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
395 ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
396___
397$code.=<<___;
398 shd $ehi,$elo,$Sigma1[0],$t0
399 add $Xlo,$hlo,$hlo
400 shd $elo,$ehi,$Sigma1[0],$t1
401 addc $Xhi,$hhi,$hhi ; h += X[i]
402 shd $ehi,$elo,$Sigma1[1],$t2
403 ldwm 8($Tbl),$Xhi
404 shd $elo,$ehi,$Sigma1[1],$t3
405 ldw -4($Tbl),$Xlo ; load K[i]
406 xor $t2,$t0,$t0
407 xor $t3,$t1,$t1
408 and $flo,$elo,$a0
409 and $fhi,$ehi,$a1
410 shd $ehi,$elo,$Sigma1[2],$t2
411 andcm $glo,$elo,$a2
412 shd $elo,$ehi,$Sigma1[2],$t3
413 andcm $ghi,$ehi,$a3
414 xor $t2,$t0,$t0
415 xor $t3,$t1,$t1 ; Sigma1(e)
416 add $Xlo,$hlo,$hlo
417 xor $a2,$a0,$a0
418 addc $Xhi,$hhi,$hhi ; h += K[i]
419 xor $a3,$a1,$a1 ; Ch(e,f,g)
420
421 add $t0,$hlo,$hlo
422 shd $ahi,$alo,$Sigma0[0],$t0
423 addc $t1,$hhi,$hhi ; h += Sigma1(e)
424 shd $alo,$ahi,$Sigma0[0],$t1
425 add $a0,$hlo,$hlo
426 shd $ahi,$alo,$Sigma0[1],$t2
427 addc $a1,$hhi,$hhi ; h += Ch(e,f,g)
428 shd $alo,$ahi,$Sigma0[1],$t3
429
430 xor $t2,$t0,$t0
431 xor $t3,$t1,$t1
432 shd $ahi,$alo,$Sigma0[2],$t2
433 and $alo,$blo,$a0
434 shd $alo,$ahi,$Sigma0[2],$t3
435 and $ahi,$bhi,$a1
436 xor $t2,$t0,$t0
437 xor $t3,$t1,$t1 ; Sigma0(a)
438
439 and $alo,$clo,$a2
440 and $ahi,$chi,$a3
441 xor $a2,$a0,$a0
442 add $hlo,$dlo,$dlo
443 xor $a3,$a1,$a1
444 addc $hhi,$dhi,$dhi ; d += h
445 and $blo,$clo,$a2
446 add $t0,$hlo,$hlo
447 and $bhi,$chi,$a3
448 addc $t1,$hhi,$hhi ; h += Sigma0(a)
449 xor $a2,$a0,$a0
450 add $a0,$hlo,$hlo
451 xor $a3,$a1,$a1 ; Maj(a,b,c)
452 addc $a1,$hhi,$hhi ; h += Maj(a,b,c)
453
454___
455$code.=<<___ if ($i==15 && $flag);
456 extru $Xlo,31,10,$Xlo
457 comiclr,= $LAST10BITS,$Xlo,%r0
458 b L\$rounds_pa1
459 nop
460___
461push(@X,shift(@X)); push(@X,shift(@X));
462}
463
464sub ROUND_16_xx_pa1 {
465my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
466my ($i)=shift;
467$i-=16;
468$code.=<<___;
469 ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
470 ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
471 ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1
472 ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load X[i+9]
473 ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3
474 ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14]
475 shd $Xnhi,$Xnlo,$sigma0[0],$t0
476 shd $Xnlo,$Xnhi,$sigma0[0],$t1
477 add $a0,$Xlo,$Xlo
478 shd $Xnhi,$Xnlo,$sigma0[1],$t2
479 addc $a1,$Xhi,$Xhi
480 shd $Xnlo,$Xnhi,$sigma0[1],$t3
481 xor $t2,$t0,$t0
482 shd $Xnhi,$Xnlo,$sigma0[2],$t2
483 xor $t3,$t1,$t1
484 extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
485 xor $t2,$t0,$t0
486 shd $a3,$a2,$sigma1[0],$a0
487	xor	$t3,$t1,$t1		; sigma0(X[(i+1)&0x0f])
488 shd $a2,$a3,$sigma1[0],$a1
489 add $t0,$Xlo,$Xlo
490 shd $a3,$a2,$sigma1[1],$t2
491 addc $t1,$Xhi,$Xhi
492 shd $a2,$a3,$sigma1[1],$t3
493 xor $t2,$a0,$a0
494 shd $a3,$a2,$sigma1[2],$t2
495 xor $t3,$a1,$a1
496 extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
497 xor $t2,$a0,$a0
498	xor	$t3,$a1,$a1		; sigma1(X[(i+14)&0x0f])
499 add $a0,$Xlo,$Xlo
500 addc $a1,$Xhi,$Xhi
501
502 stw $Xhi,`-$XOFF+8*($i%16)`(%sp)
503 stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp)
504___
505&ROUND_00_15_pa1($i,@_,1);
506}
507$code.=<<___;
508 ldw `0*4`($ctx),$Ahi ; load context
509 ldw `1*4`($ctx),$Alo
510 ldw `2*4`($ctx),$Bhi
511 ldw `3*4`($ctx),$Blo
512 ldw `4*4`($ctx),$Chi
513 ldw `5*4`($ctx),$Clo
514 ldw `6*4`($ctx),$Dhi
515 ldw `7*4`($ctx),$Dlo
516 ldw `8*4`($ctx),$Ehi
517 ldw `9*4`($ctx),$Elo
518 ldw `10*4`($ctx),$Fhi
519 ldw `11*4`($ctx),$Flo
520 ldw `12*4`($ctx),$Ghi
521 ldw `13*4`($ctx),$Glo
522 ldw `14*4`($ctx),$Hhi
523 ldw `15*4`($ctx),$Hlo
524
525 extru $inp,31,2,$t0
526 sh3addl $t0,%r0,$t0
527 subi 32,$t0,$t0
528 mtctl $t0,%cr11 ; load %sar with align factor
529
530L\$oop_pa1
531 extru $inp,31,2,$a3
532 comib,= 0,$a3,L\$aligned_pa1
533 sub $inp,$a3,$inp
534
535 ldw `0*4`($inp),$X[0]
536 ldw `1*4`($inp),$X[1]
537 ldw `2*4`($inp),$t2
538 ldw `3*4`($inp),$t3
539 ldw `4*4`($inp),$a0
540 ldw `5*4`($inp),$a1
541 ldw `6*4`($inp),$a2
542 ldw `7*4`($inp),$a3
543 vshd $X[0],$X[1],$X[0]
544 vshd $X[1],$t2,$X[1]
545 stw $X[0],`-$XOFF+0*4`(%sp)
546 ldw `8*4`($inp),$t0
547 vshd $t2,$t3,$t2
548 stw $X[1],`-$XOFF+1*4`(%sp)
549 ldw `9*4`($inp),$t1
550 vshd $t3,$a0,$t3
551___
552{
553my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
554for ($i=2;$i<=(128/4-8);$i++) {
555$code.=<<___;
556 stw $t[0],`-$XOFF+$i*4`(%sp)
557 ldw `(8+$i)*4`($inp),$t[0]
558 vshd $t[1],$t[2],$t[1]
559___
560push(@t,shift(@t));
561}
562for (;$i<(128/4-1);$i++) {
563$code.=<<___;
564 stw $t[0],`-$XOFF+$i*4`(%sp)
565 vshd $t[1],$t[2],$t[1]
566___
567push(@t,shift(@t));
568}
569$code.=<<___;
570 b L\$collected_pa1
571 stw $t[0],`-$XOFF+$i*4`(%sp)
572
573___
574}
575$code.=<<___;
576L\$aligned_pa1
577 ldw `0*4`($inp),$X[0]
578 ldw `1*4`($inp),$X[1]
579 ldw `2*4`($inp),$t2
580 ldw `3*4`($inp),$t3
581 ldw `4*4`($inp),$a0
582 ldw `5*4`($inp),$a1
583 ldw `6*4`($inp),$a2
584 ldw `7*4`($inp),$a3
585 stw $X[0],`-$XOFF+0*4`(%sp)
586 ldw `8*4`($inp),$t0
587 stw $X[1],`-$XOFF+1*4`(%sp)
588 ldw `9*4`($inp),$t1
589___
590{
591my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
592for ($i=2;$i<(128/4-8);$i++) {
593$code.=<<___;
594 stw $t[0],`-$XOFF+$i*4`(%sp)
595 ldw `(8+$i)*4`($inp),$t[0]
596___
597push(@t,shift(@t));
598}
599for (;$i<128/4;$i++) {
600$code.=<<___;
601 stw $t[0],`-$XOFF+$i*4`(%sp)
602___
603push(@t,shift(@t));
604}
605$code.="L\$collected_pa1\n";
606}
607
608for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
609$code.="L\$rounds_pa1\n";
610for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
611
612$code.=<<___;
613 $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
614 $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
615 $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
616 ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl
617
618 ldw `0*4`($ctx),$t1 ; update context
619 ldw `1*4`($ctx),$t0
620 ldw `2*4`($ctx),$t3
621 ldw `3*4`($ctx),$t2
622 ldw `4*4`($ctx),$a1
623 ldw `5*4`($ctx),$a0
624 ldw `6*4`($ctx),$a3
625 add $t0,$Alo,$Alo
626 ldw `7*4`($ctx),$a2
627 addc $t1,$Ahi,$Ahi
628 ldw `8*4`($ctx),$t1
629 add $t2,$Blo,$Blo
630 ldw `9*4`($ctx),$t0
631 addc $t3,$Bhi,$Bhi
632 ldw `10*4`($ctx),$t3
633 add $a0,$Clo,$Clo
634 ldw `11*4`($ctx),$t2
635 addc $a1,$Chi,$Chi
636 ldw `12*4`($ctx),$a1
637 add $a2,$Dlo,$Dlo
638 ldw `13*4`($ctx),$a0
639 addc $a3,$Dhi,$Dhi
640 ldw `14*4`($ctx),$a3
641 add $t0,$Elo,$Elo
642 ldw `15*4`($ctx),$a2
643 addc $t1,$Ehi,$Ehi
644 stw $Ahi,`0*4`($ctx)
645 add $t2,$Flo,$Flo
646 stw $Alo,`1*4`($ctx)
647 addc $t3,$Fhi,$Fhi
648 stw $Bhi,`2*4`($ctx)
649 add $a0,$Glo,$Glo
650 stw $Blo,`3*4`($ctx)
651 addc $a1,$Ghi,$Ghi
652 stw $Chi,`4*4`($ctx)
653 add $a2,$Hlo,$Hlo
654 stw $Clo,`5*4`($ctx)
655 addc $a3,$Hhi,$Hhi
656 stw $Dhi,`6*4`($ctx)
657 ldo `16*$SZ`($inp),$inp ; advance $inp
658 stw $Dlo,`7*4`($ctx)
659 stw $Ehi,`8*4`($ctx)
660 stw $Elo,`9*4`($ctx)
661 stw $Fhi,`10*4`($ctx)
662 stw $Flo,`11*4`($ctx)
663 stw $Ghi,`12*4`($ctx)
664 stw $Glo,`13*4`($ctx)
665 stw $Hhi,`14*4`($ctx)
666 comb,= $inp,$num,L\$done
667 stw $Hlo,`15*4`($ctx)
668 b L\$oop_pa1
669 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
670L\$done
671___
672}}
673$code.=<<___;
674 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
675 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
676 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
677 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
678 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
679 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
680 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
681 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
682 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
683 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
684 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
685 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
686 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
687 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
688 $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
689 $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
690 bv (%r2)
691 .EXIT
692 $POPMB -$FRAME(%sp),%r3
693 .PROCEND
694___
695
696# Explicitly encode PA-RISC 2.0 instructions used in this module, so
697# that it can be compiled with .LEVEL 1.0. It should be noted that I
698# wouldn't have to do this if the GNU assembler understood the .ALLOW
699# 2.0 directive...
700
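# A minimal usage sketch (not part of the original module): once the
# encoder closures below are in scope, &assemble dispatches on the
# mnemonic and falls back to the textual form for anything it has no
# encoder for, e.g.
#
#	print &assemble("ldd","","0(%r26),%r28"),"\n";	# emitted as a .WORD
#	print &assemble("copy","","%r26,%r28"),"\n";	# no encoder, kept as text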
701my $ldd = sub {
702 my ($mod,$args) = @_;
703 my $orig = "ldd$mod\t$args";
704
705 if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices
706 { my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1);
707 $opcode|=(1<<3) if ($mod =~ /^,m/);
708 $opcode|=(1<<2) if ($mod =~ /^,mb/);
709 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
710 }
711 else { "\t".$orig; }
712};
713
714my $std = sub {
715 my ($mod,$args) = @_;
716 my $orig = "std$mod\t$args";
717
718 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
719 { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
720 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
721 }
722 else { "\t".$orig; }
723};
724
725my $extrd = sub {
726 my ($mod,$args) = @_;
727 my $orig = "extrd$mod\t$args";
728
729	# I only have the ",u" completer; it's implicitly encoded...
730 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
731 { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
732 my $len=32-$3;
733 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
734 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
735 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
736 }
737 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
738 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
739 my $len=32-$2;
740 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
741 $opcode |= (1<<13) if ($mod =~ /,\**=/);
742 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
743 }
744 else { "\t".$orig; }
745};
746
747my $shrpd = sub {
748 my ($mod,$args) = @_;
749 my $orig = "shrpd$mod\t$args";
750
751 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
752 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
753 my $cpos=63-$3;
754 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
755 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
756 }
757 elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
758 { sprintf "\t.WORD\t0x%08x\t; %s",
759 (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
760 }
761 else { "\t".$orig; }
762};
763
764sub assemble {
765 my ($mnemonic,$mod,$args)=@_;
766 my $opcode = eval("\$$mnemonic");
767
768 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
769}
770
771foreach (split("\n",$code)) {
772 s/\`([^\`]*)\`/eval $1/ge;
773
774 s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
775 $3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32
776 : sprintf("shd\t%$1,%$2,%d",$3)/e or
777	# translate made-up instructions: _ror, _shr, _align, _shl
778 s/_ror(\s+)(%r[0-9]+),/
779 ($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or
780
781 s/_shr(\s+%r[0-9]+),([0-9]+),/
782 $SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
783 : sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or
784
785 s/_align(\s+%r[0-9]+,%r[0-9]+),/
786 ($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or
787
788 s/_shl(\s+%r[0-9]+),([0-9]+),/
789 $SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
790 : sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;
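	# For illustration (not from the original source): with $SZ==4 the
	# line "_ror	%r5,19,%r20" is rewritten by the rule above into
	# "shd	%r5,%r5,19,%r20", i.e. a 32-bit rotate right expressed as a
	# double-word shift of the register against itself.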
791
792 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);
793
794 s/cmpb,\*/comb,/ if ($SIZE_T==4);
795
796 s/\bbv\b/bve/ if ($SIZE_T==8);
797
798 print $_,"\n";
799}
800
801close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-ppc.pl b/src/lib/libcrypto/sha/asm/sha512-ppc.pl
deleted file mode 100755
index 28bd997cf8..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-ppc.pl
+++ /dev/null
@@ -1,444 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# I let hardware handle unaligned input, except on page boundaries
11# (see below for details). Otherwise straightforward implementation
12# with X vector in register bank. The module is big-endian [which is
13# not a big deal, as there are no little-endian targets left around].
14
15# sha256 | sha512
16# -m64 -m32 | -m64 -m32
17# --------------------------------------+-----------------------
18# PPC970,gcc-4.0.0 +50% +38% | +40% +410%(*)
19# Power6,xlc-7 +150% +90% | +100% +430%(*)
20#
21#	(*)	64-bit code in 32-bit application context, which actually is
22#		on the TODO list. It should be noted that for safe deployment
23#		in a 32-bit *multi-threaded* context, asynchronous signals
24#		should be blocked upon entry to the SHA512 block routine,
25#		because the 32-bit signal-delivery procedure invalidates the
26#		upper halves of the GPRs; context switches preserve them, but signal delivery does not:-(
27
28# This second version is truly multi-thread safe. The trouble with the
29# original version was that it used the thread-local-storage pointer
30# register. It scrupulously preserved it, but a problem would arise the
31# moment an asynchronous signal was delivered and the signal handler
32# dereferenced the TLS pointer. While that never happens in the openssl
33# application or test suite, we have to respect this scenario and not
34# use the TLS pointer register. The alternative would be to require the
35# caller to block signals prior to calling this routine. For the record,
36# in 32-bit context R2 serves as the TLS pointer, while in 64-bit context it is R13.
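# (In the register assignments below this shows up as $F: it is mapped to
# r2 in the 64-bit flavour and to r13 in the 32-bit one, so the flavour's
# TLS pointer register is saved and restored but never used as a working
# register.)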
37
38$flavour=shift;
39$output =shift;
40
41if ($flavour =~ /64/) {
42 $SIZE_T=8;
43 $LRSAVE=2*$SIZE_T;
44 $STU="stdu";
45 $UCMP="cmpld";
46 $SHL="sldi";
47 $POP="ld";
48 $PUSH="std";
49} elsif ($flavour =~ /32/) {
50 $SIZE_T=4;
51 $LRSAVE=$SIZE_T;
52 $STU="stwu";
53 $UCMP="cmplw";
54 $SHL="slwi";
55 $POP="lwz";
56 $PUSH="stw";
57} else { die "nonsense $flavour"; }
58
59$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
60( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
61( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
62die "can't locate ppc-xlate.pl";
63
64open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
65
66if ($output =~ /512/) {
67 $func="sha512_block_data_order";
68 $SZ=8;
69 @Sigma0=(28,34,39);
70 @Sigma1=(14,18,41);
71 @sigma0=(1, 8, 7);
72 @sigma1=(19,61, 6);
73 $rounds=80;
74 $LD="ld";
75 $ST="std";
76 $ROR="rotrdi";
77 $SHR="srdi";
78} else {
79 $func="sha256_block_data_order";
80 $SZ=4;
81 @Sigma0=( 2,13,22);
82 @Sigma1=( 6,11,25);
83 @sigma0=( 7,18, 3);
84 @sigma1=(17,19,10);
85 $rounds=64;
86 $LD="lwz";
87 $ST="stw";
88 $ROR="rotrwi";
89 $SHR="srwi";
90}
91
92$FRAME=32*$SIZE_T+16*$SZ;
93$LOCALS=6*$SIZE_T;
94
95$sp ="r1";
96$toc="r2";
97$ctx="r3"; # zapped by $a0
98$inp="r4"; # zapped by $a1
99$num="r5"; # zapped by $t0
100
101$T ="r0";
102$a0 ="r3";
103$a1 ="r4";
104$t0 ="r5";
105$t1 ="r6";
106$Tbl="r7";
107
108$A ="r8";
109$B ="r9";
110$C ="r10";
111$D ="r11";
112$E ="r12";
113$F ="r13"; $F="r2" if ($SIZE_T==8);# reassigned to exempt TLS pointer
114$G ="r14";
115$H ="r15";
116
117@V=($A,$B,$C,$D,$E,$F,$G,$H);
118@X=("r16","r17","r18","r19","r20","r21","r22","r23",
119 "r24","r25","r26","r27","r28","r29","r30","r31");
120
121$inp="r31"; # reassigned $inp! aliases with @X[15]
122
123sub ROUND_00_15 {
124my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
125$code.=<<___;
126 $LD $T,`$i*$SZ`($Tbl)
127 $ROR $a0,$e,$Sigma1[0]
128 $ROR $a1,$e,$Sigma1[1]
129 and $t0,$f,$e
130 andc $t1,$g,$e
131 add $T,$T,$h
132 xor $a0,$a0,$a1
133 $ROR $a1,$a1,`$Sigma1[2]-$Sigma1[1]`
134 or $t0,$t0,$t1 ; Ch(e,f,g)
135 add $T,$T,@X[$i]
136 xor $a0,$a0,$a1 ; Sigma1(e)
137 add $T,$T,$t0
138 add $T,$T,$a0
139
140 $ROR $a0,$a,$Sigma0[0]
141 $ROR $a1,$a,$Sigma0[1]
142 and $t0,$a,$b
143 and $t1,$a,$c
144 xor $a0,$a0,$a1
145 $ROR $a1,$a1,`$Sigma0[2]-$Sigma0[1]`
146 xor $t0,$t0,$t1
147 and $t1,$b,$c
148 xor $a0,$a0,$a1 ; Sigma0(a)
149 add $d,$d,$T
150 xor $t0,$t0,$t1 ; Maj(a,b,c)
151 add $h,$T,$a0
152 add $h,$h,$t0
153
154___
155}
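# As a reading aid: the round above computes Ch(e,f,g) as (e&f)|(~e&g)
# via the and/andc/or sequence, Maj(a,b,c) as (a&b)^(a&c)^(b&c), and
# Sigma1(e)/Sigma0(a) from the three rotate amounts in @Sigma1/@Sigma0.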
156
157sub ROUND_16_xx {
158my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
159$i-=16;
160$code.=<<___;
161 $ROR $a0,@X[($i+1)%16],$sigma0[0]
162 $ROR $a1,@X[($i+1)%16],$sigma0[1]
163 $ROR $t0,@X[($i+14)%16],$sigma1[0]
164 $ROR $t1,@X[($i+14)%16],$sigma1[1]
165 xor $a0,$a0,$a1
166 $SHR $a1,@X[($i+1)%16],$sigma0[2]
167 xor $t0,$t0,$t1
168 $SHR $t1,@X[($i+14)%16],$sigma1[2]
169 add @X[$i],@X[$i],@X[($i+9)%16]
170 xor $a0,$a0,$a1 ; sigma0(X[(i+1)&0x0f])
171 xor $t0,$t0,$t1 ; sigma1(X[(i+14)&0x0f])
172 add @X[$i],@X[$i],$a0
173 add @X[$i],@X[$i],$t0
174___
175&ROUND_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h);
176}
177
178$code=<<___;
179.machine "any"
180.text
181
182.globl $func
183.align 6
184$func:
185 $STU $sp,-$FRAME($sp)
186 mflr r0
187 $SHL $num,$num,`log(16*$SZ)/log(2)`
188
189 $PUSH $ctx,`$FRAME-$SIZE_T*22`($sp)
190
191 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
192 $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
193 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
194 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
195 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
196 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
197 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
198 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
199 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
200 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
201 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
202 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
203 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
204 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
205 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
206 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
207 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
208 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
209 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
210 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
211 $PUSH r0,`$FRAME+$LRSAVE`($sp)
212
213 $LD $A,`0*$SZ`($ctx)
214 mr $inp,r4 ; incarnate $inp
215 $LD $B,`1*$SZ`($ctx)
216 $LD $C,`2*$SZ`($ctx)
217 $LD $D,`3*$SZ`($ctx)
218 $LD $E,`4*$SZ`($ctx)
219 $LD $F,`5*$SZ`($ctx)
220 $LD $G,`6*$SZ`($ctx)
221 $LD $H,`7*$SZ`($ctx)
222
223 bcl 20,31,Lpc
224Lpc:
225 mflr $Tbl
226 addis $Tbl,$Tbl,Ltable-Lpc\@ha
227 addi $Tbl,$Tbl,Ltable-Lpc\@l
228 andi. r0,$inp,3
229 bne Lunaligned
230Laligned:
231 add $num,$inp,$num
232 $PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
233 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
234 bl Lsha2_block_private
235 b Ldone
236
237; The PowerPC specification allows an implementation to be ill-behaved
238; upon an unaligned access that crosses a page boundary. The "better
239; safe than sorry" principle makes me treat that case specially: I don't
240; look for the particular offending word, but rather for the input
241; block which crosses the boundary. Once found, that block is copied to
242; an aligned buffer and hashed separately...
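; (Summary comment, not original code: Lunaligned computes the distance
; from $inp to the next 4096-byte page boundary, rounds it down to whole
; 16*$SZ-byte blocks, and hashes that prefix in place via
; Lsha2_block_private; Lcross_page then copies the one block straddling
; the boundary byte-by-byte to an aligned spot below the frame, hashes it
; from there, and loops back to Lunaligned until the input is exhausted.)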
243.align 4
244Lunaligned:
245 subfic $t1,$inp,4096
246 andi. $t1,$t1,`4096-16*$SZ` ; distance to closest page boundary
247 beq Lcross_page
248 $UCMP $num,$t1
249 ble- Laligned ; didn't cross the page boundary
250 subfc $num,$t1,$num
251 add $t1,$inp,$t1
252 $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real remaining num
253 $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; intermediate end pointer
254 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
255 bl Lsha2_block_private
256	; $inp is equal to the intermediate end pointer here
257 $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real remaining num
258Lcross_page:
259 li $t1,`16*$SZ/4`
260 mtctr $t1
261 addi r20,$sp,$LOCALS ; aligned spot below the frame
262Lmemcpy:
263 lbz r16,0($inp)
264 lbz r17,1($inp)
265 lbz r18,2($inp)
266 lbz r19,3($inp)
267 addi $inp,$inp,4
268 stb r16,0(r20)
269 stb r17,1(r20)
270 stb r18,2(r20)
271 stb r19,3(r20)
272 addi r20,r20,4
273 bdnz Lmemcpy
274
275 $PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp
276 addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer
277 addi $inp,$sp,$LOCALS ; fictitious inp pointer
278 $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num
279 $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer
280 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
281 bl Lsha2_block_private
282 $POP $inp,`$FRAME-$SIZE_T*26`($sp) ; restore real inp
283 $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num
284 addic. $num,$num,`-16*$SZ` ; num--
285 bne- Lunaligned
286
287Ldone:
288 $POP r0,`$FRAME+$LRSAVE`($sp)
289 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
290 $POP r13,`$FRAME-$SIZE_T*19`($sp)
291 $POP r14,`$FRAME-$SIZE_T*18`($sp)
292 $POP r15,`$FRAME-$SIZE_T*17`($sp)
293 $POP r16,`$FRAME-$SIZE_T*16`($sp)
294 $POP r17,`$FRAME-$SIZE_T*15`($sp)
295 $POP r18,`$FRAME-$SIZE_T*14`($sp)
296 $POP r19,`$FRAME-$SIZE_T*13`($sp)
297 $POP r20,`$FRAME-$SIZE_T*12`($sp)
298 $POP r21,`$FRAME-$SIZE_T*11`($sp)
299 $POP r22,`$FRAME-$SIZE_T*10`($sp)
300 $POP r23,`$FRAME-$SIZE_T*9`($sp)
301 $POP r24,`$FRAME-$SIZE_T*8`($sp)
302 $POP r25,`$FRAME-$SIZE_T*7`($sp)
303 $POP r26,`$FRAME-$SIZE_T*6`($sp)
304 $POP r27,`$FRAME-$SIZE_T*5`($sp)
305 $POP r28,`$FRAME-$SIZE_T*4`($sp)
306 $POP r29,`$FRAME-$SIZE_T*3`($sp)
307 $POP r30,`$FRAME-$SIZE_T*2`($sp)
308 $POP r31,`$FRAME-$SIZE_T*1`($sp)
309 mtlr r0
310 addi $sp,$sp,$FRAME
311 blr
312
313.align 4
314Lsha2_block_private:
315___
316for($i=0;$i<16;$i++) {
317$code.=<<___ if ($SZ==4);
318 lwz @X[$i],`$i*$SZ`($inp)
319___
320# 64-bit loads are split into 2x32-bit ones, as the CPU can't handle
321# unaligned 64-bit loads, only 32-bit ones...
322$code.=<<___ if ($SZ==8);
323 lwz $t0,`$i*$SZ`($inp)
324 lwz @X[$i],`$i*$SZ+4`($inp)
325 insrdi @X[$i],$t0,32,0
326___
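# (Reading aid for the $SZ==8 case above: the first lwz fetches the
# big-endian upper half of X[i] into $t0 and the second fetches the lower
# half into @X[$i]; insrdi then deposits $t0 into the most significant 32
# bits of @X[$i], reconstructing the 64-bit word from two 32-bit loads.)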
327 &ROUND_00_15($i,@V);
328 unshift(@V,pop(@V));
329}
330$code.=<<___;
331 li $T,`$rounds/16-1`
332 mtctr $T
333.align 4
334Lrounds:
335 addi $Tbl,$Tbl,`16*$SZ`
336___
337for(;$i<32;$i++) {
338 &ROUND_16_xx($i,@V);
339 unshift(@V,pop(@V));
340}
341$code.=<<___;
342 bdnz- Lrounds
343
344 $POP $ctx,`$FRAME-$SIZE_T*22`($sp)
345 $POP $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
346 $POP $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
347 subi $Tbl,$Tbl,`($rounds-16)*$SZ` ; rewind Tbl
348
349 $LD r16,`0*$SZ`($ctx)
350 $LD r17,`1*$SZ`($ctx)
351 $LD r18,`2*$SZ`($ctx)
352 $LD r19,`3*$SZ`($ctx)
353 $LD r20,`4*$SZ`($ctx)
354 $LD r21,`5*$SZ`($ctx)
355 $LD r22,`6*$SZ`($ctx)
356 addi $inp,$inp,`16*$SZ` ; advance inp
357 $LD r23,`7*$SZ`($ctx)
358 add $A,$A,r16
359 add $B,$B,r17
360 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp)
361 add $C,$C,r18
362 $ST $A,`0*$SZ`($ctx)
363 add $D,$D,r19
364 $ST $B,`1*$SZ`($ctx)
365 add $E,$E,r20
366 $ST $C,`2*$SZ`($ctx)
367 add $F,$F,r21
368 $ST $D,`3*$SZ`($ctx)
369 add $G,$G,r22
370 $ST $E,`4*$SZ`($ctx)
371 add $H,$H,r23
372 $ST $F,`5*$SZ`($ctx)
373 $ST $G,`6*$SZ`($ctx)
374 $UCMP $inp,$num
375 $ST $H,`7*$SZ`($ctx)
376 bne Lsha2_block_private
377 blr
378 .section .rodata
379Ltable:
380___
381$code.=<<___ if ($SZ==8);
382 .long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
383 .long 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
384 .long 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
385 .long 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
386 .long 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
387 .long 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
388 .long 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
389 .long 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
390 .long 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
391 .long 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
392 .long 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
393 .long 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
394 .long 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
395 .long 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
396 .long 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
397 .long 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
398 .long 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
399 .long 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
400 .long 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
401 .long 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
402 .long 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
403 .long 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
404 .long 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
405 .long 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
406 .long 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
407 .long 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
408 .long 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
409 .long 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
410 .long 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
411 .long 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
412 .long 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
413 .long 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
414 .long 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
415 .long 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
416 .long 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
417 .long 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
418 .long 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
419 .long 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
420 .long 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
421 .long 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
422___
423$code.=<<___ if ($SZ==4);
424 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
425 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
426 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
427 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
428 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
429 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
430 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
431 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
432 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
433 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
434 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
435 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
436 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
437 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
438 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
439 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
440___
441
442$code =~ s/\`([^\`]*)\`/eval $1/gem;
443print $code;
444close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
deleted file mode 100644
index 3c93799446..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
+++ /dev/null
@@ -1,604 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256 performance improvement over compiler generated code varies
11# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
12# build]. Just like in the SHA1 module, I aim to ensure scalability on
13# UltraSPARC T1 by packing X[16] into 8 64-bit registers.
14
15# SHA512 on pre-T1 UltraSPARC.
16#
17# Performance is >75% better than 64-bit code generated by Sun C and
18# over 2x better than 32-bit code. X[16] resides on the stack, but access to it
19# is scheduled for L2 latency and staged through 32 least significant
20# bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
21# duality. Nevertheless it's ~40% faster than SHA256, which is pretty
22# good [optimal coefficient is 50%].
23#
24# SHA512 on UltraSPARC T1.
25#
26# It's not any faster than 64-bit code generated by Sun C 5.8. This is
27# because 64-bit code generator has the advantage of using 64-bit
28# loads(*) to access X[16], which I consciously traded for 32-/64-bit
29# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
30# code by 60%, not to mention that it doesn't suffer from severe decay
31# when running four times as many threads as physical cores, and that
32# it leaves gcc [3.4] behind by over a 4x factor! Compared to SHA256,
33# single-thread performance is only 10% better, but overall throughput
34# for the maximum number of threads on a given CPU exceeds that of
35# SHA256 by 30% [again, optimal coefficient is 50%].
36#
37#	(*)	Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
38#		in-order, i.e. a load instruction has to complete before the
39#		next instruction in the given thread is executed, even if the
40#		latter does not depend on the load result! This means that on T1 two 32-bit
41# loads are always slower than one 64-bit load. Once again this
42# is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
43# 2x32-bit loads can be as fast as 1x64-bit ones.
44
45$bits=32;
46for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
47if ($bits==64) { $bias=2047; $frame=192; }
48else { $bias=0; $frame=112; }
49
50$output=shift;
51open STDOUT,">$output";
52
53if ($output =~ /512/) {
54 $label="512";
55 $SZ=8;
56 $LD="ldx"; # load from memory
57 $ST="stx"; # store to memory
58 $SLL="sllx"; # shift left logical
59 $SRL="srlx"; # shift right logical
60 @Sigma0=(28,34,39);
61 @Sigma1=(14,18,41);
62 @sigma0=( 7, 1, 8); # right shift first
63 @sigma1=( 6,19,61); # right shift first
64 $lastK=0x817;
65 $rounds=80;
66 $align=4;
67
68 $locals=16*$SZ; # X[16]
69
70 $A="%o0";
71 $B="%o1";
72 $C="%o2";
73 $D="%o3";
74 $E="%o4";
75 $F="%o5";
76 $G="%g1";
77 $H="%o7";
78 @V=($A,$B,$C,$D,$E,$F,$G,$H);
79} else {
80 $label="256";
81 $SZ=4;
82 $LD="ld"; # load from memory
83 $ST="st"; # store to memory
84 $SLL="sll"; # shift left logical
85 $SRL="srl"; # shift right logical
86 @Sigma0=( 2,13,22);
87 @Sigma1=( 6,11,25);
88 @sigma0=( 3, 7,18); # right shift first
89 @sigma1=(10,17,19); # right shift first
90 $lastK=0x8f2;
91 $rounds=64;
92 $align=8;
93
94 $locals=0; # X[16] is register resident
95 @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
96
97 $A="%l0";
98 $B="%l1";
99 $C="%l2";
100 $D="%l3";
101 $E="%l4";
102 $F="%l5";
103 $G="%l6";
104 $H="%l7";
105 @V=($A,$B,$C,$D,$E,$F,$G,$H);
106}
107$T1="%g2";
108$tmp0="%g3";
109$tmp1="%g4";
110$tmp2="%g5";
111
112$ctx="%i0";
113$inp="%i1";
114$len="%i2";
115$Ktbl="%i3";
116$tmp31="%i4";
117$tmp32="%i5";
118
119########### SHA256
120$Xload = sub {
121my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
122
123 if ($i==0) {
124$code.=<<___;
125 ldx [$inp+0],@X[0]
126 ldx [$inp+16],@X[2]
127 ldx [$inp+32],@X[4]
128 ldx [$inp+48],@X[6]
129 ldx [$inp+8],@X[1]
130 ldx [$inp+24],@X[3]
131 subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
132 ldx [$inp+40],@X[5]
133 bz,pt %icc,.Laligned
134 ldx [$inp+56],@X[7]
135
136 sllx @X[0],$tmp31,@X[0]
137 ldx [$inp+64],$T1
138___
139for($j=0;$j<7;$j++)
140{ $code.=<<___;
141 srlx @X[$j+1],$tmp32,$tmp1
142 sllx @X[$j+1],$tmp31,@X[$j+1]
143 or $tmp1,@X[$j],@X[$j]
144___
145}
146$code.=<<___;
147 srlx $T1,$tmp32,$T1
148 or $T1,@X[7],@X[7]
149.Laligned:
150___
151 }
152
153 if ($i&1) {
154 $code.="\tadd @X[$i/2],$h,$T1\n";
155 } else {
156 $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
157 }
158} if ($SZ==4);
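# (Reading aid for the $SZ==4 path: two consecutive 32-bit message words
# share one 64-bit register, even-indexed words in the upper half and
# odd-indexed ones in the lower half, which is why X[i] is picked up above
# with either a 32-bit right shift or a plain add.)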
159
160########### SHA512
161$Xload = sub {
162my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
163my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
164
165$code.=<<___ if ($i==0);
166 ld [$inp+0],%l0
167 ld [$inp+4],%l1
168 ld [$inp+8],%l2
169 ld [$inp+12],%l3
170 ld [$inp+16],%l4
171 ld [$inp+20],%l5
172 ld [$inp+24],%l6
173 ld [$inp+28],%l7
174___
175$code.=<<___ if ($i<15);
176 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
177 add $tmp31,32,$tmp0
178 sllx @pair[0],$tmp0,$tmp1
179 `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
180 srlx @pair[2],$tmp32,@pair[1]
181 or $tmp1,$tmp2,$tmp2
182 or @pair[1],$tmp2,$tmp2
183 `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
184 add $h,$tmp2,$T1
185 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
186___
187$code.=<<___ if ($i==12);
188 brnz,a $tmp31,.+8
189 ld [$inp+128],%l0
190___
191$code.=<<___ if ($i==15);
192 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
193 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
194 add $tmp31,32,$tmp0
195 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
196 sllx @pair[0],$tmp0,$tmp1
197 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
198 srlx @pair[2],$tmp32,@pair[1]
199 or $tmp1,$tmp2,$tmp2
200 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
201 or @pair[1],$tmp2,$tmp2
202 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
203 add $h,$tmp2,$T1
204 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
205 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
206 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
207 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
208___
209} if ($SZ==8);
210
211########### common
212sub BODY_00_15 {
213my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
214
215 if ($i<16) {
216 &$Xload(@_);
217 } else {
218 $code.="\tadd $h,$T1,$T1\n";
219 }
220
221$code.=<<___;
222 $SRL $e,@Sigma1[0],$h !! $i
223 xor $f,$g,$tmp2
224 $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
225 and $e,$tmp2,$tmp2
226 $SRL $e,@Sigma1[1],$tmp0
227 xor $tmp1,$h,$h
228 $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
229 xor $tmp0,$h,$h
230 $SRL $e,@Sigma1[2],$tmp0
231 xor $tmp1,$h,$h
232 $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
233 xor $tmp0,$h,$h
234 xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
235 xor $tmp1,$h,$tmp0 ! Sigma1(e)
236
237 $SRL $a,@Sigma0[0],$h
238 add $tmp2,$T1,$T1
239 $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
240 $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
241 add $tmp0,$T1,$T1
242 $SRL $a,@Sigma0[1],$tmp0
243 xor $tmp1,$h,$h
244 $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
245 xor $tmp0,$h,$h
246 $SRL $a,@Sigma0[2],$tmp0
247 xor $tmp1,$h,$h
248 $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
249 xor $tmp0,$h,$h
250 xor $tmp1,$h,$h ! Sigma0(a)
251
252 or $a,$b,$tmp0
253 and $a,$b,$tmp1
254 and $c,$tmp0,$tmp0
255 or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
256 add $tmp2,$T1,$T1 ! +=K[$i]
257 add $tmp1,$h,$h
258
259 add $T1,$d,$d
260 add $T1,$h,$h
261___
262}
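# Reading aid for the round above: Ch(e,f,g) is computed in the folded
# form ((f^g)&e)^g and Maj(a,b,c) as ((a|b)&c)|(a&b); both are equivalent
# to the textbook definitions.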
263
264########### SHA256
265$BODY_16_XX = sub {
266my $i=@_[0];
267my $xi;
268
269 if ($i&1) {
270 $xi=$tmp32;
271 $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
272 } else {
273 $xi=@X[(($i+1)/2)%8];
274 }
275$code.=<<___;
276 srl $xi,@sigma0[0],$T1 !! Xupdate($i)
277 sll $xi,`32-@sigma0[2]`,$tmp1
278 srl $xi,@sigma0[1],$tmp0
279 xor $tmp1,$T1,$T1
280 sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
281 xor $tmp0,$T1,$T1
282 srl $xi,@sigma0[2],$tmp0
283 xor $tmp1,$T1,$T1
284___
285 if ($i&1) {
286 $xi=@X[(($i+14)/2)%8];
287 } else {
288 $xi=$tmp32;
289 $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
290 }
291$code.=<<___;
292 srl $xi,@sigma1[0],$tmp2
293 xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
294 sll $xi,`32-@sigma1[2]`,$tmp1
295 srl $xi,@sigma1[1],$tmp0
296 xor $tmp1,$tmp2,$tmp2
297 sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
298 xor $tmp0,$tmp2,$tmp2
299 srl $xi,@sigma1[2],$tmp0
300 xor $tmp1,$tmp2,$tmp2
301___
302 if ($i&1) {
303 $xi=@X[($i/2)%8];
304$code.=<<___;
305 srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
306 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
307 srl @X[($i/2)%8],0,$tmp0
308 add $tmp2,$tmp1,$tmp1
309 add $xi,$T1,$T1 ! +=X[i]
310 xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
311 add $tmp1,$T1,$T1
312
313 srl $T1,0,$T1
314 or $T1,@X[($i/2)%8],@X[($i/2)%8]
315___
316 } else {
317 $xi=@X[(($i+9)/2)%8];
318$code.=<<___;
319 srlx @X[($i/2)%8],32,$tmp1 ! X[i]
320 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
321 add $xi,$T1,$T1 ! +=X[i+9]
322 add $tmp2,$tmp1,$tmp1
323 srl @X[($i/2)%8],0,@X[($i/2)%8]
324 add $tmp1,$T1,$T1
325
326 sllx $T1,32,$tmp0
327 or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
328___
329 }
330 &BODY_00_15(@_);
331} if ($SZ==4);
332
333########### SHA512
334$BODY_16_XX = sub {
335my $i=@_[0];
336my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
337
338$code.=<<___;
339 sllx %l2,32,$tmp0 !! Xupdate($i)
340 or %l3,$tmp0,$tmp0
341
342 srlx $tmp0,@sigma0[0],$T1
343 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
344 sllx $tmp0,`64-@sigma0[2]`,$tmp1
345 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
346 srlx $tmp0,@sigma0[1],$tmp0
347 xor $tmp1,$T1,$T1
348 sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
349 xor $tmp0,$T1,$T1
350 srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
351 xor $tmp1,$T1,$T1
352 sllx %l6,32,$tmp2
353 xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
354 or %l7,$tmp2,$tmp2
355
356 srlx $tmp2,@sigma1[0],$tmp1
357 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
358 sllx $tmp2,`64-@sigma1[2]`,$tmp0
359 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
360 srlx $tmp2,@sigma1[1],$tmp2
361 xor $tmp0,$tmp1,$tmp1
362 sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
363 xor $tmp2,$tmp1,$tmp1
364 srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
365 xor $tmp0,$tmp1,$tmp1
366 sllx %l4,32,$tmp0
367 xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
368 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
369 or %l5,$tmp0,$tmp0
370 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
371
372 sllx %l0,32,$tmp2
373 add $tmp1,$T1,$T1
374 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
375 or %l1,$tmp2,$tmp2
376 add $tmp0,$T1,$T1 ! +=X[$i+9]
377 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
378 add $tmp2,$T1,$T1 ! +=X[$i]
379 $ST $T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
380___
381 &BODY_00_15(@_);
382} if ($SZ==8);
383
384$code.=<<___ if ($bits==64);
385.register %g2,#scratch
386.register %g3,#scratch
387___
388$code.=<<___;
389.section ".rodata",#alloc
390
391.align 64
392K${label}:
393.type K${label},#object
394___
395if ($SZ==4) {
396$code.=<<___;
397 .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
398 .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
399 .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
400 .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
401 .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
402 .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
403 .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
404 .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
405 .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
406 .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
407 .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
408 .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
409 .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
410 .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
411 .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
412 .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
413___
414} else {
415$code.=<<___;
416 .long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
417 .long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
418 .long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
419 .long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
420 .long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
421 .long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
422 .long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
423 .long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
424 .long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
425 .long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
426 .long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
427 .long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
428 .long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
429 .long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
430 .long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
431 .long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
432 .long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
433 .long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
434 .long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
435 .long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
436 .long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
437 .long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
438 .long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
439 .long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
440 .long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
441 .long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
442 .long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
443 .long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
444 .long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
445 .long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
446 .long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
447 .long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
448 .long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
449 .long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
450 .long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
451 .long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
452 .long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
453 .long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
454 .long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
455 .long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
456___
457}
458$code.=<<___;
459.size K${label},.-K${label}
460
461.section ".text",#alloc,#execinstr
462.globl sha${label}_block_data_order
463sha${label}_block_data_order:
464 save %sp,`-$frame-$locals`,%sp
465#ifdef __PIC__
466 sethi %hi(_GLOBAL_OFFSET_TABLE_-4), %o5
467 rd %pc, %o4
468 or %o5, %lo(_GLOBAL_OFFSET_TABLE_+4), %o5
469 add %o5, %o4, %o5
470#endif
471 and $inp,`$align-1`,$tmp31
472 sllx $len,`log(16*$SZ)/log(2)`,$len
473 andn $inp,`$align-1`,$inp
474 sll $tmp31,3,$tmp31
475 add $inp,$len,$len
476___
477$code.=<<___ if ($SZ==8); # SHA512
478 mov 32,$tmp32
479 sub $tmp32,$tmp31,$tmp32
480___
481$code.=<<___;
482#ifdef __PIC__
483 set K${label}, $Ktbl
484 ldx [$Ktbl+%o5], $Ktbl
485#else
486 set K${label}, $Ktbl
487#endif
488
489 $LD [$ctx+`0*$SZ`],$A
490 $LD [$ctx+`1*$SZ`],$B
491 $LD [$ctx+`2*$SZ`],$C
492 $LD [$ctx+`3*$SZ`],$D
493 $LD [$ctx+`4*$SZ`],$E
494 $LD [$ctx+`5*$SZ`],$F
495 $LD [$ctx+`6*$SZ`],$G
496 $LD [$ctx+`7*$SZ`],$H
497
498.Lloop:
499___
500for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
501$code.=".L16_xx:\n";
502for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
503$code.=<<___;
504 and $tmp2,0xfff,$tmp2
505 cmp $tmp2,$lastK
506 bne .L16_xx
507 add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
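!	(Reading aid: $tmp2 still holds the round constant loaded in the
!	last unrolled round, so comparing its low 12 bits against $lastK
!	detects that the final K value has been consumed and terminates the
!	loop without a separate counter; the Ktbl increment above sits in
!	the branch delay slot and is executed either way.)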
508
509___
510$code.=<<___ if ($SZ==4); # SHA256
511 $LD [$ctx+`0*$SZ`],@X[0]
512 $LD [$ctx+`1*$SZ`],@X[1]
513 $LD [$ctx+`2*$SZ`],@X[2]
514 $LD [$ctx+`3*$SZ`],@X[3]
515 $LD [$ctx+`4*$SZ`],@X[4]
516 $LD [$ctx+`5*$SZ`],@X[5]
517 $LD [$ctx+`6*$SZ`],@X[6]
518 $LD [$ctx+`7*$SZ`],@X[7]
519
520 add $A,@X[0],$A
521 $ST $A,[$ctx+`0*$SZ`]
522 add $B,@X[1],$B
523 $ST $B,[$ctx+`1*$SZ`]
524 add $C,@X[2],$C
525 $ST $C,[$ctx+`2*$SZ`]
526 add $D,@X[3],$D
527 $ST $D,[$ctx+`3*$SZ`]
528 add $E,@X[4],$E
529 $ST $E,[$ctx+`4*$SZ`]
530 add $F,@X[5],$F
531 $ST $F,[$ctx+`5*$SZ`]
532 add $G,@X[6],$G
533 $ST $G,[$ctx+`6*$SZ`]
534 add $H,@X[7],$H
535 $ST $H,[$ctx+`7*$SZ`]
536___
537$code.=<<___ if ($SZ==8); # SHA512
538 ld [$ctx+`0*$SZ+0`],%l0
539 ld [$ctx+`0*$SZ+4`],%l1
540 ld [$ctx+`1*$SZ+0`],%l2
541 ld [$ctx+`1*$SZ+4`],%l3
542 ld [$ctx+`2*$SZ+0`],%l4
543 ld [$ctx+`2*$SZ+4`],%l5
544 ld [$ctx+`3*$SZ+0`],%l6
545
546 sllx %l0,32,$tmp0
547 ld [$ctx+`3*$SZ+4`],%l7
548 sllx %l2,32,$tmp1
549 or %l1,$tmp0,$tmp0
550 or %l3,$tmp1,$tmp1
551 add $tmp0,$A,$A
552 add $tmp1,$B,$B
553 $ST $A,[$ctx+`0*$SZ`]
554 sllx %l4,32,$tmp2
555 $ST $B,[$ctx+`1*$SZ`]
556 sllx %l6,32,$T1
557 or %l5,$tmp2,$tmp2
558 or %l7,$T1,$T1
559 add $tmp2,$C,$C
560 $ST $C,[$ctx+`2*$SZ`]
561 add $T1,$D,$D
562 $ST $D,[$ctx+`3*$SZ`]
563
564 ld [$ctx+`4*$SZ+0`],%l0
565 ld [$ctx+`4*$SZ+4`],%l1
566 ld [$ctx+`5*$SZ+0`],%l2
567 ld [$ctx+`5*$SZ+4`],%l3
568 ld [$ctx+`6*$SZ+0`],%l4
569 ld [$ctx+`6*$SZ+4`],%l5
570 ld [$ctx+`7*$SZ+0`],%l6
571
572 sllx %l0,32,$tmp0
573 ld [$ctx+`7*$SZ+4`],%l7
574 sllx %l2,32,$tmp1
575 or %l1,$tmp0,$tmp0
576 or %l3,$tmp1,$tmp1
577 add $tmp0,$E,$E
578 add $tmp1,$F,$F
579 $ST $E,[$ctx+`4*$SZ`]
580 sllx %l4,32,$tmp2
581 $ST $F,[$ctx+`5*$SZ`]
582 sllx %l6,32,$T1
583 or %l5,$tmp2,$tmp2
584 or %l7,$T1,$T1
585 add $tmp2,$G,$G
586 $ST $G,[$ctx+`6*$SZ`]
587 add $T1,$H,$H
588 $ST $H,[$ctx+`7*$SZ`]
589___
590$code.=<<___;
591 add $inp,`16*$SZ`,$inp ! advance inp
592 cmp $inp,$len
593 bne `$bits==64?"%xcc":"%icc"`,.Lloop
594 sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
595
596 ret
597 restore
598.type sha${label}_block_data_order,#function
599.size sha${label}_block_data_order,(.-sha${label}_block_data_order)
600___
601
602$code =~ s/\`([^\`]*)\`/eval $1/gem;
603print $code;
604close STDOUT;