author    djm <> 2012-10-13 21:23:50 +0000
committer djm <> 2012-10-13 21:23:50 +0000
commit    228cae30b117c2493f69ad3c195341cd6ec8d430 (patch)
tree      29ff00b10d52c0978077c4fd83c33b065bade73e /src/lib/libcrypto/sha
parent    731838c66b52c0ae5888333005b74115a620aa96 (diff)
import OpenSSL-1.0.1c
Diffstat (limited to 'src/lib/libcrypto/sha')
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-586.pl         | 1107
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-alpha.pl       |  322
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-armv4-large.pl |   38
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-ia64.pl        |  192
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-mips.pl        |  354
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-parisc.pl      |  259
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha1-ppc.pl         |   83
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-s390x.pl       |   50
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha1-x86_64.pl      | 1185
-rw-r--r--  src/lib/libcrypto/sha/asm/sha256-586.pl       |   52
-rw-r--r--  src/lib/libcrypto/sha/asm/sha256-armv4.pl     |   55
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-armv4.pl     |  357
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-mips.pl      |  455
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-parisc.pl    |  791
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-ppc.pl       |  114
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-s390x.pl     |   63
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-sparcv9.pl   |    6
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-x86_64.pl    |   86
-rw-r--r--  src/lib/libcrypto/sha/sha.h                   |   14
-rw-r--r--  src/lib/libcrypto/sha/sha1dgst.c              |    1
-rw-r--r--  src/lib/libcrypto/sha/sha256.c                |    4
-rw-r--r--  src/lib/libcrypto/sha/sha512.c                |   54
-rw-r--r--  src/lib/libcrypto/sha/sha_locl.h              |    6
23 files changed, 4991 insertions, 657 deletions
diff --git a/src/lib/libcrypto/sha/asm/sha1-586.pl b/src/lib/libcrypto/sha/asm/sha1-586.pl
index a1f876281a..1084d227fe 100644
--- a/src/lib/libcrypto/sha/asm/sha1-586.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-586.pl
@@ -12,6 +12,8 @@
12# commentary below], and in 2006 the rest was rewritten in order to
13# gain freedom to liberate licensing terms.
14
15# January, September 2004.
16#
17# It was noted that Intel IA-32 C compiler generates code which
18# performs ~30% *faster* on P4 CPU than original *hand-coded*
19# SHA1 assembler implementation. To address this problem (and
@@ -31,12 +33,92 @@
33# ----------------------------------------------------------------
34# <appro@fy.chalmers.se>
35
36# August 2009.
37#
38# George Spelvin has tipped that F_40_59(b,c,d) can be rewritten as
39# '(c&d) + (b&(c^d))', which allows to accumulate partial results
40# and lighten "pressure" on scratch registers. This resulted in
41# >12% performance improvement on contemporary AMD cores (with no
42# degradation on other CPUs:-). Also, the code was revised to maximize
43# "distance" between instructions producing input to 'lea' instruction
44# and the 'lea' instruction itself, which is essential for Intel Atom
45# core and resulted in ~15% improvement.
46
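(Illustrative only, not part of the patch: the identity behind this rewrite is easy to confirm with a few lines of standalone Perl. F_40_59 is the majority function (b&c)|(b&d)|(c&d); in the rewritten form the two terms never share set bits, so the addition cannot carry.)

for (1 .. 100_000) {
    my ($b, $c, $d) = map { int(rand(2**32)) } 1 .. 3;
    my $maj = ($b & $c) | ($b & $d) | ($c & $d);            # F_40_59 as specified
    my $alt = (($c & $d) + ($b & ($c ^ $d))) & 0xffffffff;  # rewritten form
    die "mismatch\n" if $maj != $alt;
}
print "F_40_59 identity holds on all sampled inputs\n";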
47# October 2010.
48#
49# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
50# is to offload message schedule denoted by Wt in NIST specification,
51# or Xupdate in OpenSSL source, to SIMD unit. The idea is not novel,
52# and in SSE2 context was first explored by Dean Gaudet in 2004, see
53# http://arctic.org/~dean/crypto/sha1.html. Since then several things
54# have changed that made it interesting again:
55#
56# a) XMM units became faster and wider;
57# b) instruction set became more versatile;
58# c) an important observation was made by Max Locktyukhin, which made
59# it possible to reduce amount of instructions required to perform
60# the operation in question, for further details see
61# http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/.
62
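(Illustrative only, not from the patch: the scalar recurrence being offloaded is the SHA-1 message expansion, W[t] = ROTL1(W[t-3]^W[t-8]^W[t-14]^W[t-16]). A minimal Perl sketch of one step, with made-up helper names and a 16-word sliding window:)

sub rol32 { my ($x, $n) = @_; (($x << $n) | ($x >> (32 - $n))) & 0xffffffff }
sub xupdate {                        # one step of the Wt/Xupdate recurrence
    my @X = @_;                      # @X holds W[t-16] .. W[t-1]
    push @X, rol32($X[13] ^ $X[8] ^ $X[2] ^ $X[0], 1);
    shift @X;                        # slide the window forward
    return @X;
}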
63# April 2011.
64#
65# Add AVX code path, probably most controversial... The thing is that
66# switch to AVX alone improves performance by as little as 4% in
67# comparison to SSSE3 code path. But below result doesn't look like
68# 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as
69# pair of µ-ops, and it's the additional µ-ops, two per round, that
70# make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded
71# as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with
72# equivalent 'sh[rl]d' that is responsible for the impressive 5.1
73# cycles per processed byte. But 'sh[rl]d' is not something that used
74# to be fast, nor does it appear to be fast in upcoming Bulldozer
75# [according to its optimization manual]. Which is why AVX code path
76# is guarded by *both* AVX and synthetic bit denoting Intel CPUs.
77# One can argue that it's unfair to AMD, but without 'sh[rl]d' it
78# makes no sense to keep the AVX code path. If somebody feels that
79# strongly, it's probably more appropriate to discuss possibility of
80# using vector rotate XOP on AMD...
81
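(Illustrative only, not from the patch: with both operands equal, the double-precision shift computes exactly a rotate, which is why 'sh[rl]d' can stand in for 'ro[rl]' on Sandy Bridge. A quick Perl check, helper names made up:)

sub rol32  { my ($x, $n) = @_; (($x << $n) | ($x >> (32 - $n))) & 0xffffffff }
sub shld32 { my ($dst, $src, $n) = @_; (($dst << $n) | ($src >> (32 - $n))) & 0xffffffff }
printf "%08x == %08x\n", rol32(0xdeadbeef, 7), shld32(0xdeadbeef, 0xdeadbeef, 7);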
82######################################################################
83# Current performance is summarized in following table. Numbers are
84# CPU clock cycles spent to process single byte (less is better).
85#
86# x86 SSSE3 AVX
87# Pentium 15.7 -
88# PIII 11.5 -
89# P4 10.6 -
90# AMD K8 7.1 -
91# Core2 7.3 6.1/+20% -
92# Atom 12.5 9.5(*)/+32% -
93# Westmere 7.3 5.6/+30% -
94# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+70%
95#
96# (*) Loop is 1056 instructions long and expected result is ~8.25.
97# It remains mystery [to me] why ILP is limited to 1.7.
98#
99# (**) As per above comment, the result is for AVX *plus* sh[rl]d.
100
101$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
102push(@INC,"${dir}","${dir}../../perlasm");
103require "x86asm.pl";
104
105&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");
106
107$xmm=$ymm=0;
108for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
109
110$ymm=1 if ($xmm &&
111 `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
112 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
113 $1>=2.19); # first version supporting AVX
114
115$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
116 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
117 $1>=2.03); # first version supporting AVX
118
119&external_label("OPENSSL_ia32cap_P") if ($xmm);
120
121
40$A="eax"; 122$A="eax";
41$B="ebx"; 123$B="ebx";
42$C="ecx"; 124$C="ecx";
@@ -47,6 +129,10 @@ $tmp1="ebp";
129
130@V=($A,$B,$C,$D,$E,$T);
131
132$alt=0; # 1 denotes alternative IALU implementation, which performs
133 # 8% *worse* on P4, same on Westmere and Atom, 2% better on
134 # Sandy Bridge...
135
136sub BODY_00_15
137 {
138 local($n,$a,$b,$c,$d,$e,$f)=@_;
@@ -59,16 +145,18 @@ sub BODY_00_15
145 &rotl($tmp1,5); # tmp1=ROTATE(a,5)
146 &xor($f,$d);
147 &add($tmp1,$e); # tmp1+=e;
62 &and($f,$b); 148 &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded
63 &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded
149 # with xi, also note that e becomes
150 # f in next round...
66 &xor($f,$d); # f holds F_00_19(b,c,d) 151 &and($f,$b);
67 &rotr($b,2); # b=ROTATE(b,30) 152 &rotr($b,2); # b=ROTATE(b,30)
68 &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi 153 &xor($f,$d); # f holds F_00_19(b,c,d)
154 &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi
69 155
70 if ($n==15) { &add($f,$tmp1); } # f+=tmp1 156 if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round
157 &add($f,$tmp1); } # f+=tmp1
71 else { &add($tmp1,$f); } # f becomes a in next round 158 else { &add($tmp1,$f); } # f becomes a in next round
159 &mov($tmp1,$a) if ($alt && $n==15);
160 }
161
162sub BODY_16_19
@@ -77,22 +165,41 @@ sub BODY_16_19
165
166 &comment("16_19 $n");
167
80 &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) 168if ($alt) {
81 &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d) 169 &xor($c,$d);
82 &xor($f,&swtmp(($n+2)%16)); 170 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
83 &xor($tmp1,$d); 171 &and($tmp1,$c); # tmp1 to hold F_00_19(b,c,d), b&=c^d
84 &xor($f,&swtmp(($n+8)%16)); 172 &xor($f,&swtmp(($n+8)%16));
85 &and($tmp1,$b); # tmp1 holds F_00_19(b,c,d) 173 &xor($tmp1,$d); # tmp1=F_00_19(b,c,d)
86 &rotr($b,2); # b=ROTATE(b,30) 174 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
175 &rotl($f,1); # f=ROTATE(f,1)
176 &add($e,$tmp1); # e+=F_00_19(b,c,d)
177 &xor($c,$d); # restore $c
178 &mov($tmp1,$a); # b in next round
179 &rotr($b,$n==16?2:7); # b=ROTATE(b,30)
180 &mov(&swtmp($n%16),$f); # xi=f
181 &rotl($a,5); # ROTATE(a,5)
182 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
183 &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
184 &add($f,$a); # f+=ROTATE(a,5)
185} else {
186 &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d)
187 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
188 &xor($tmp1,$d);
189 &xor($f,&swtmp(($n+8)%16));
190 &and($tmp1,$b);
87 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd 191 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
88 &rotl($f,1); # f=ROTATE(f,1) 192 &rotl($f,1); # f=ROTATE(f,1)
89 &xor($tmp1,$d); # tmp1=F_00_19(b,c,d) 193 &xor($tmp1,$d); # tmp1=F_00_19(b,c,d)
90 &mov(&swtmp($n%16),$f); # xi=f 194 &add($e,$tmp1); # e+=F_00_19(b,c,d)
91 &lea($f,&DWP(0x5a827999,$f,$e));# f+=K_00_19+e 195 &mov($tmp1,$a);
92 &mov($e,$a); # e becomes volatile 196 &rotr($b,2); # b=ROTATE(b,30)
93 &rotl($e,5); # e=ROTATE(a,5) 197 &mov(&swtmp($n%16),$f); # xi=f
94 &add($f,$tmp1); # f+=F_00_19(b,c,d) 198 &rotl($tmp1,5); # ROTATE(a,5)
95 &add($f,$e); # f+=ROTATE(a,5) 199 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
200 &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
201 &add($f,$tmp1); # f+=ROTATE(a,5)
202}
203 }
204
205sub BODY_20_39
@@ -102,21 +209,41 @@ sub BODY_20_39
209
210 &comment("20_39 $n");
211
212if ($alt) {
213 &xor($tmp1,$c); # tmp1 to hold F_20_39(b,c,d), b^=c
214 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
215 &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d)
216 &xor($f,&swtmp(($n+8)%16));
217 &add($e,$tmp1); # e+=F_20_39(b,c,d)
218 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
219 &rotl($f,1); # f=ROTATE(f,1)
220 &mov($tmp1,$a); # b in next round
221 &rotr($b,7); # b=ROTATE(b,30)
222 &mov(&swtmp($n%16),$f) if($n<77);# xi=f
223 &rotl($a,5); # ROTATE(a,5)
224 &xor($b,$c) if($n==39);# warm up for BODY_40_59
225 &and($tmp1,$b) if($n==39);
226 &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY
227 &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
228 &add($f,$a); # f+=ROTATE(a,5)
229 &rotr($a,5) if ($n==79);
230} else {
105 &mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d) 231 &mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d)
106 &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) 232 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
107 &rotr($b,2); # b=ROTATE(b,30)
108 &xor($f,&swtmp(($n+2)%16));
109 &xor($tmp1,$c); 233 &xor($tmp1,$c);
110 &xor($f,&swtmp(($n+8)%16)); 234 &xor($f,&swtmp(($n+8)%16));
111 &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d) 235 &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d)
112 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd 236 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
113 &rotl($f,1); # f=ROTATE(f,1) 237 &rotl($f,1); # f=ROTATE(f,1)
114 &add($tmp1,$e); 238 &add($e,$tmp1); # e+=F_20_39(b,c,d)
115 &mov(&swtmp($n%16),$f); # xi=f 239 &rotr($b,2); # b=ROTATE(b,30)
116 &mov($e,$a); # e becomes volatile 240 &mov($tmp1,$a);
117 &rotl($e,5); # e=ROTATE(a,5) 241 &rotl($tmp1,5); # ROTATE(a,5)
118 &lea($f,&DWP($K,$f,$tmp1)); # f+=K_20_39+e 242 &mov(&swtmp($n%16),$f) if($n<77);# xi=f
119 &add($f,$e); # f+=ROTATE(a,5) 243 &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY
244 &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
245 &add($f,$tmp1); # f+=ROTATE(a,5)
246}
247 }
248
249sub BODY_40_59
@@ -125,41 +252,86 @@ sub BODY_40_59
252
253 &comment("40_59 $n");
254
128 &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) 255if ($alt) {
129 &mov($tmp1,&swtmp(($n+2)%16)); 256 &add($e,$tmp1); # e+=b&(c^d)
130 &xor($f,$tmp1); 257 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
131 &mov($tmp1,&swtmp(($n+8)%16)); 258 &mov($tmp1,$d);
132 &xor($f,$tmp1); 259 &xor($f,&swtmp(($n+8)%16));
133 &mov($tmp1,&swtmp(($n+13)%16)); 260 &xor($c,$d); # restore $c
134 &xor($f,$tmp1); # f holds xa^xb^xc^xd 261 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
135 &mov($tmp1,$b); # tmp1 to hold F_40_59(b,c,d)
136 &rotl($f,1); # f=ROTATE(f,1) 262 &rotl($f,1); # f=ROTATE(f,1)
137 &or($tmp1,$c); 263 &and($tmp1,$c);
138 &mov(&swtmp($n%16),$f); # xi=f 264 &rotr($b,7); # b=ROTATE(b,30)
139 &and($tmp1,$d); 265 &add($e,$tmp1); # e+=c&d
140 &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e 266 &mov($tmp1,$a); # b in next round
141 &mov($e,$b); # e becomes volatile and is used 267 &mov(&swtmp($n%16),$f); # xi=f
142 # to calculate F_40_59(b,c,d) 268 &rotl($a,5); # ROTATE(a,5)
269 &xor($b,$c) if ($n<59);
270 &and($tmp1,$b) if ($n<59);# tmp1 to hold F_40_59(b,c,d)
271 &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d))
272 &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
273 &add($f,$a); # f+=ROTATE(a,5)
274} else {
275 &mov($tmp1,$c); # tmp1 to hold F_40_59(b,c,d)
276 &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
277 &xor($tmp1,$d);
278 &xor($f,&swtmp(($n+8)%16));
279 &and($tmp1,$b);
280 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
281 &rotl($f,1); # f=ROTATE(f,1)
282 &add($tmp1,$e); # b&(c^d)+=e
143 &rotr($b,2); # b=ROTATE(b,30) 283 &rotr($b,2); # b=ROTATE(b,30)
144 &and($e,$c); 284 &mov($e,$a); # e becomes volatile
145 &or($tmp1,$e); # tmp1 holds F_40_59(b,c,d) 285 &rotl($e,5); # ROTATE(a,5)
146 &mov($e,$a); 286 &mov(&swtmp($n%16),$f); # xi=f
147 &rotl($e,5); # e=ROTATE(a,5) 287 &lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d))
148 &add($f,$tmp1); # f+=tmp1; 288 &mov($tmp1,$c);
149 &add($f,$e); # f+=ROTATE(a,5) 289 &add($f,$e); # f+=ROTATE(a,5)
290 &and($tmp1,$d);
291 &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
292 &add($f,$tmp1); # f+=c&d
293}
294 }
295
296&function_begin("sha1_block_data_order");
297if ($xmm) {
298 &static_label("ssse3_shortcut");
299 &static_label("avx_shortcut") if ($ymm);
300 &static_label("K_XX_XX");
301
302 &call (&label("pic_point")); # make it PIC!
303 &set_label("pic_point");
304 &blindpop($tmp1);
305 &picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point"));
306 &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
307
308 &mov ($A,&DWP(0,$T));
309 &mov ($D,&DWP(4,$T));
310 &test ($D,1<<9); # check SSSE3 bit
311 &jz (&label("x86"));
312 &test ($A,1<<24); # check FXSR bit
313 &jz (&label("x86"));
314 if ($ymm) {
315 &and ($D,1<<28); # mask AVX bit
316 &and ($A,1<<30); # mask "Intel CPU" bit
317 &or ($A,$D);
318 &cmp ($A,1<<28|1<<30);
319 &je (&label("avx_shortcut"));
320 }
321 &jmp (&label("ssse3_shortcut"));
322 &set_label("x86",16);
323}
324 &mov($tmp1,&wparam(0)); # SHA_CTX *c
325 &mov($T,&wparam(1)); # const void *input
326 &mov($A,&wparam(2)); # size_t num
156 &stack_push(16); # allocate X[16] 327 &stack_push(16+3); # allocate X[16]
328 &shl($A,6);
329 &add($A,$T);
330 &mov(&wparam(2),$A); # pointer beyond the end of input
331 &mov($E,&DWP(16,$tmp1));# pre-load E
332 &jmp(&label("loop"));
161 333
162 &set_label("loop",16); 334&set_label("loop",16);
335
336 # copy input chunk to X, but reversing byte order!
337 for ($i=0; $i<16; $i+=4)
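(Illustrative only, not from the patch: the run-time dispatch added above reads two words of OPENSSL_ia32cap_P and picks a code path. Restated as a plain Perl decision, with the bit positions taken from the checks above and a made-up helper name:)

sub sha1_path {
    my ($cap_w0, $cap_w1) = @_;                 # the two 32-bit capability words
    return "x86"   unless $cap_w1 & (1 << 9);   # no SSSE3
    return "x86"   unless $cap_w0 & (1 << 24);  # no FXSR
    return "avx"   if  ($cap_w1 & (1 << 28))    # AVX present...
                   &&  ($cap_w0 & (1 << 30));   # ...and the synthetic "Intel CPU" bit
    return "ssse3";
}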
@@ -213,8 +385,845 @@ sub BODY_40_59
385 &mov(&DWP(16,$tmp1),$C);
386 &jb(&label("loop"));
387
216 &stack_pop(16); 388 &stack_pop(16+3);
217&function_end("sha1_block_data_order"); 389&function_end("sha1_block_data_order");
390
391if ($xmm) {
392######################################################################
393# The SSSE3 implementation.
394#
395# %xmm[0-7] are used as ring @X[] buffer containing quadruples of last
396# 32 elements of the message schedule or Xupdate outputs. First 4
397# quadruples are simply byte-swapped input, next 4 are calculated
398# according to method originally suggested by Dean Gaudet (modulo
399# being implemented in SSSE3). Once 8 quadruples or 32 elements are
400# collected, it switches to routine proposed by Max Locktyukhin.
401#
402# Calculations inevitably require temporary registers, and there are
403# no %xmm registers left to spare. For this reason part of the ring
404# buffer, X[2..4] to be specific, is offloaded to 3 quadruples ring
405# buffer on the stack. Keep in mind that X[2] is alias X[-6], X[3] -
406# X[-5], and X[4] - X[-4]...
407#
408# Another notable optimization is aggressive stack frame compression
409# aiming to minimize amount of 9-byte instructions...
410#
411# Yet another notable optimization is "jumping" $B variable. It means
412# that there is no register permanently allocated for $B value. This
413# allowed to eliminate one instruction from body_20_39...
414#
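(Illustrative only, not from the patch: the ring behaviour of @X comes from rotating the array after every 4-way Xupdate, so negative indices always name the most recent quadruples, and in an 8-element array X[2] is indeed the same slot as X[-6]. A small standalone Perl demonstration of the rotation used below:)

my @X = map { "xmm$_" } (4 .. 7, 0 .. 3);       # pre-seeded as in the code below
for my $Xi (4 .. 7) {
    printf "Xi=%d: X[0]=%s X[-1]=%s X[2]=%s X[-6]=%s\n",
           $Xi, $X[0], $X[-1], $X[2], $X[-6];   # X[2] and X[-6] are the same slot
    push @X, shift @X;                          # "rotate" X[], as the generator does
}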
415my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded
416my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
417my @V=($A,$B,$C,$D,$E);
418my $j=0; # hash round
419my @T=($T,$tmp1);
420my $inp;
421
422my $_rol=sub { &rol(@_) };
423my $_ror=sub { &ror(@_) };
424
425&function_begin("_sha1_block_data_order_ssse3");
426 &call (&label("pic_point")); # make it PIC!
427 &set_label("pic_point");
428 &blindpop($tmp1);
429 &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
430&set_label("ssse3_shortcut");
431
432 &movdqa (@X[3],&QWP(0,$tmp1)); # K_00_19
433 &movdqa (@X[4],&QWP(16,$tmp1)); # K_20_39
434 &movdqa (@X[5],&QWP(32,$tmp1)); # K_40_59
435 &movdqa (@X[6],&QWP(48,$tmp1)); # K_60_79
436 &movdqa (@X[2],&QWP(64,$tmp1)); # pbswap mask
437
438 &mov ($E,&wparam(0)); # load argument block
439 &mov ($inp=@T[1],&wparam(1));
440 &mov ($D,&wparam(2));
441 &mov (@T[0],"esp");
442
443 # stack frame layout
444 #
445 # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area
446 # X[4]+K X[5]+K X[6]+K X[7]+K
447 # X[8]+K X[9]+K X[10]+K X[11]+K
448 # X[12]+K X[13]+K X[14]+K X[15]+K
449 #
450 # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area
451 # X[4] X[5] X[6] X[7]
452 # X[8] X[9] X[10] X[11] # even borrowed for K_00_19
453 #
454 # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants
455 # K_40_59 K_40_59 K_40_59 K_40_59
456 # K_60_79 K_60_79 K_60_79 K_60_79
457 # K_00_19 K_00_19 K_00_19 K_00_19
458 # pbswap mask
459 #
460 # +192 ctx # argument block
461 # +196 inp
462 # +200 end
463 # +204 esp
464 &sub ("esp",208);
465 &and ("esp",-64);
466
467 &movdqa (&QWP(112+0,"esp"),@X[4]); # copy constants
468 &movdqa (&QWP(112+16,"esp"),@X[5]);
469 &movdqa (&QWP(112+32,"esp"),@X[6]);
470 &shl ($D,6); # len*64
471 &movdqa (&QWP(112+48,"esp"),@X[3]);
472 &add ($D,$inp); # end of input
473 &movdqa (&QWP(112+64,"esp"),@X[2]);
474 &add ($inp,64);
475 &mov (&DWP(192+0,"esp"),$E); # save argument block
476 &mov (&DWP(192+4,"esp"),$inp);
477 &mov (&DWP(192+8,"esp"),$D);
478 &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp
479
480 &mov ($A,&DWP(0,$E)); # load context
481 &mov ($B,&DWP(4,$E));
482 &mov ($C,&DWP(8,$E));
483 &mov ($D,&DWP(12,$E));
484 &mov ($E,&DWP(16,$E));
485 &mov (@T[0],$B); # magic seed
486
487 &movdqu (@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3]
488 &movdqu (@X[-3&7],&QWP(-48,$inp));
489 &movdqu (@X[-2&7],&QWP(-32,$inp));
490 &movdqu (@X[-1&7],&QWP(-16,$inp));
491 &pshufb (@X[-4&7],@X[2]); # byte swap
492 &pshufb (@X[-3&7],@X[2]);
493 &pshufb (@X[-2&7],@X[2]);
494 &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
495 &pshufb (@X[-1&7],@X[2]);
496 &paddd (@X[-4&7],@X[3]); # add K_00_19
497 &paddd (@X[-3&7],@X[3]);
498 &paddd (@X[-2&7],@X[3]);
499 &movdqa (&QWP(0,"esp"),@X[-4&7]); # X[]+K xfer to IALU
500 &psubd (@X[-4&7],@X[3]); # restore X[]
501 &movdqa (&QWP(0+16,"esp"),@X[-3&7]);
502 &psubd (@X[-3&7],@X[3]);
503 &movdqa (&QWP(0+32,"esp"),@X[-2&7]);
504 &psubd (@X[-2&7],@X[3]);
505 &movdqa (@X[0],@X[-3&7]);
506 &jmp (&label("loop"));
507
508######################################################################
509# SSE instruction sequence is first broken into groups of independent
510# instructions, independent in respect to their inputs and shifter
511# (not all architectures have more than one). Then IALU instructions
512# are "knitted in" between the SSE groups. Distance is maintained for
513# SSE latency of 2 in hope that it fits better upcoming AMD Bulldozer
514# [which allegedly also implements SSSE3]...
515#
516# Temporary registers usage. X[2] is volatile at the entry and at the
517# end is restored from backtrace ring buffer. X[3] is expected to
518# contain current K_XX_XX constant and is used to calculate X[-1]+K
519# from previous round, it becomes volatile the moment the value is
520# saved to stack for transfer to IALU. X[4] becomes volatile whenever
521# X[-4] is accumulated and offloaded to backtrace ring buffer, at the
522# end it is loaded with next K_XX_XX [which becomes X[3] in next
523# round]...
524#
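(Illustrative only, not from the patch: the "knitting" works by having each round-body generator return a list of Perl strings and letting the SIMD emitter eval a couple of them between vector instructions. A self-contained toy version of the pattern, with made-up names:)

sub demo_body { ('print "  scalar step 1\n";',
                 'print "  scalar step 2\n";',
                 'print "  scalar step 3\n";') }
my @insns = (demo_body(), demo_body());         # queue two scalar "rounds"
for my $simd (qw(palignr pxor movdqa)) {
    print "SIMD $simd\n";                       # emit one vector instruction...
    eval(shift(@insns)) for 1 .. 2;             # ...then knit in two scalar steps
}
eval($_) for @insns;                            # flush whatever is left (none here)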
525sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
526{ use integer;
527 my $body = shift;
528 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
529 my ($a,$b,$c,$d,$e);
530
531 eval(shift(@insns));
532 eval(shift(@insns));
533 &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
534 &movdqa (@X[2],@X[-1&7]);
535 eval(shift(@insns));
536 eval(shift(@insns));
537
538 &paddd (@X[3],@X[-1&7]);
539 &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
540 eval(shift(@insns));
541 eval(shift(@insns));
542 &psrldq (@X[2],4); # "X[-3]", 3 dwords
543 eval(shift(@insns));
544 eval(shift(@insns));
545 &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
546 eval(shift(@insns));
547 eval(shift(@insns));
548
549 &pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]"
550 eval(shift(@insns));
551 eval(shift(@insns));
552 eval(shift(@insns));
553 eval(shift(@insns));
554
555 &pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]"
556 eval(shift(@insns));
557 eval(shift(@insns));
558 &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
559 eval(shift(@insns));
560 eval(shift(@insns));
561
562 &movdqa (@X[4],@X[0]);
563 &movdqa (@X[2],@X[0]);
564 eval(shift(@insns));
565 eval(shift(@insns));
566 eval(shift(@insns));
567 eval(shift(@insns));
568
569 &pslldq (@X[4],12); # "X[0]"<<96, extract one dword
570 &paddd (@X[0],@X[0]);
571 eval(shift(@insns));
572 eval(shift(@insns));
573 eval(shift(@insns));
574 eval(shift(@insns));
575
576 &psrld (@X[2],31);
577 eval(shift(@insns));
578 eval(shift(@insns));
579 &movdqa (@X[3],@X[4]);
580 eval(shift(@insns));
581 eval(shift(@insns));
582
583 &psrld (@X[4],30);
584 &por (@X[0],@X[2]); # "X[0]"<<<=1
585 eval(shift(@insns));
586 eval(shift(@insns));
587 &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
588 eval(shift(@insns));
589 eval(shift(@insns));
590
591 &pslld (@X[3],2);
592 &pxor (@X[0],@X[4]);
593 eval(shift(@insns));
594 eval(shift(@insns));
595 &movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
596 eval(shift(@insns));
597 eval(shift(@insns));
598
599 &pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2
600 &movdqa (@X[1],@X[-2&7]) if ($Xi<7);
601 eval(shift(@insns));
602 eval(shift(@insns));
603
604 foreach (@insns) { eval; } # remaining instructions [if any]
605
606 $Xi++; push(@X,shift(@X)); # "rotate" X[]
607}
608
609sub Xupdate_ssse3_32_79()
610{ use integer;
611 my $body = shift;
612 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
613 my ($a,$b,$c,$d,$e);
614
615 &movdqa (@X[2],@X[-1&7]) if ($Xi==8);
616 eval(shift(@insns)); # body_20_39
617 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
618 &palignr(@X[2],@X[-2&7],8); # compose "X[-6]"
619 eval(shift(@insns));
620 eval(shift(@insns));
621 eval(shift(@insns)); # rol
622
623 &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
624 &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer
625 eval(shift(@insns));
626 eval(shift(@insns));
627 if ($Xi%5) {
628 &movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX...
629 } else { # ... or load next one
630 &movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
631 }
632 &paddd (@X[3],@X[-1&7]);
633 eval(shift(@insns)); # ror
634 eval(shift(@insns));
635
636 &pxor (@X[0],@X[2]); # "X[0]"^="X[-6]"
637 eval(shift(@insns)); # body_20_39
638 eval(shift(@insns));
639 eval(shift(@insns));
640 eval(shift(@insns)); # rol
641
642 &movdqa (@X[2],@X[0]);
643 &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
644 eval(shift(@insns));
645 eval(shift(@insns));
646 eval(shift(@insns)); # ror
647 eval(shift(@insns));
648
649 &pslld (@X[0],2);
650 eval(shift(@insns)); # body_20_39
651 eval(shift(@insns));
652 &psrld (@X[2],30);
653 eval(shift(@insns));
654 eval(shift(@insns)); # rol
655 eval(shift(@insns));
656 eval(shift(@insns));
657 eval(shift(@insns)); # ror
658 eval(shift(@insns));
659
660 &por (@X[0],@X[2]); # "X[0]"<<<=2
661 eval(shift(@insns)); # body_20_39
662 eval(shift(@insns));
663 &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer
664 eval(shift(@insns));
665 eval(shift(@insns)); # rol
666 eval(shift(@insns));
667 eval(shift(@insns));
668 eval(shift(@insns)); # ror
669 &movdqa (@X[3],@X[0]) if ($Xi<19);
670 eval(shift(@insns));
671
672 foreach (@insns) { eval; } # remaining instructions
673
674 $Xi++; push(@X,shift(@X)); # "rotate" X[]
675}
676
677sub Xuplast_ssse3_80()
678{ use integer;
679 my $body = shift;
680 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
681 my ($a,$b,$c,$d,$e);
682
683 eval(shift(@insns));
684 &paddd (@X[3],@X[-1&7]);
685 eval(shift(@insns));
686 eval(shift(@insns));
687 eval(shift(@insns));
688 eval(shift(@insns));
689
690 &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU
691
692 foreach (@insns) { eval; } # remaining instructions
693
694 &mov ($inp=@T[1],&DWP(192+4,"esp"));
695 &cmp ($inp,&DWP(192+8,"esp"));
696 &je (&label("done"));
697
698 &movdqa (@X[3],&QWP(112+48,"esp")); # K_00_19
699 &movdqa (@X[2],&QWP(112+64,"esp")); # pbswap mask
700 &movdqu (@X[-4&7],&QWP(0,$inp)); # load input
701 &movdqu (@X[-3&7],&QWP(16,$inp));
702 &movdqu (@X[-2&7],&QWP(32,$inp));
703 &movdqu (@X[-1&7],&QWP(48,$inp));
704 &add ($inp,64);
705 &pshufb (@X[-4&7],@X[2]); # byte swap
706 &mov (&DWP(192+4,"esp"),$inp);
707 &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
708
709 $Xi=0;
710}
711
712sub Xloop_ssse3()
713{ use integer;
714 my $body = shift;
715 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
716 my ($a,$b,$c,$d,$e);
717
718 eval(shift(@insns));
719 eval(shift(@insns));
720 &pshufb (@X[($Xi-3)&7],@X[2]);
721 eval(shift(@insns));
722 eval(shift(@insns));
723 &paddd (@X[($Xi-4)&7],@X[3]);
724 eval(shift(@insns));
725 eval(shift(@insns));
726 eval(shift(@insns));
727 eval(shift(@insns));
728 &movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU
729 eval(shift(@insns));
730 eval(shift(@insns));
731 &psubd (@X[($Xi-4)&7],@X[3]);
732
733 foreach (@insns) { eval; }
734 $Xi++;
735}
736
737sub Xtail_ssse3()
738{ use integer;
739 my $body = shift;
740 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
741 my ($a,$b,$c,$d,$e);
742
743 foreach (@insns) { eval; }
744}
745
746sub body_00_19 () {
747 (
748 '($a,$b,$c,$d,$e)=@V;'.
749 '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer
750 '&xor ($c,$d);',
751 '&mov (@T[1],$a);', # $b in next round
752 '&$_rol ($a,5);',
753 '&and (@T[0],$c);', # ($b&($c^$d))
754 '&xor ($c,$d);', # restore $c
755 '&xor (@T[0],$d);',
756 '&add ($e,$a);',
757 '&$_ror ($b,$j?7:2);', # $b>>>2
758 '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
759 );
760}
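(Illustrative only, not from the patch: body_00_19 computes F_00_19, the choose function (b&c)|(~b&d), in the rearranged form ((c^d)&b)^d so that $c can be restored with a second xor. The equivalence is easy to confirm in standalone Perl:)

for (1 .. 100_000) {
    my ($b, $c, $d) = map { int(rand(2**32)) } 1 .. 3;
    my $ch  = (($b & $c) | (~$b & $d)) & 0xffffffff;   # F_00_19 as specified
    my $alt = ((($c ^ $d) & $b) ^ $d) & 0xffffffff;    # rearranged form used here
    die "mismatch\n" if $ch != $alt;
}
print "F_00_19 identity holds on all sampled inputs\n";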
761
762sub body_20_39 () {
763 (
764 '($a,$b,$c,$d,$e)=@V;'.
765 '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
766 '&xor (@T[0],$d);', # ($b^$d)
767 '&mov (@T[1],$a);', # $b in next round
768 '&$_rol ($a,5);',
769 '&xor (@T[0],$c);', # ($b^$d^$c)
770 '&add ($e,$a);',
771 '&$_ror ($b,7);', # $b>>>2
772 '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
773 );
774}
775
776sub body_40_59 () {
777 (
778 '($a,$b,$c,$d,$e)=@V;'.
779 '&mov (@T[1],$c);',
780 '&xor ($c,$d);',
781 '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
782 '&and (@T[1],$d);',
783 '&and (@T[0],$c);', # ($b&($c^$d))
784 '&$_ror ($b,7);', # $b>>>2
785 '&add ($e,@T[1]);',
786 '&mov (@T[1],$a);', # $b in next round
787 '&$_rol ($a,5);',
788 '&add ($e,@T[0]);',
789 '&xor ($c,$d);', # restore $c
790 '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
791 );
792}
793
794&set_label("loop",16);
795 &Xupdate_ssse3_16_31(\&body_00_19);
796 &Xupdate_ssse3_16_31(\&body_00_19);
797 &Xupdate_ssse3_16_31(\&body_00_19);
798 &Xupdate_ssse3_16_31(\&body_00_19);
799 &Xupdate_ssse3_32_79(\&body_00_19);
800 &Xupdate_ssse3_32_79(\&body_20_39);
801 &Xupdate_ssse3_32_79(\&body_20_39);
802 &Xupdate_ssse3_32_79(\&body_20_39);
803 &Xupdate_ssse3_32_79(\&body_20_39);
804 &Xupdate_ssse3_32_79(\&body_20_39);
805 &Xupdate_ssse3_32_79(\&body_40_59);
806 &Xupdate_ssse3_32_79(\&body_40_59);
807 &Xupdate_ssse3_32_79(\&body_40_59);
808 &Xupdate_ssse3_32_79(\&body_40_59);
809 &Xupdate_ssse3_32_79(\&body_40_59);
810 &Xupdate_ssse3_32_79(\&body_20_39);
811 &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
812
813 $saved_j=$j; @saved_V=@V;
814
815 &Xloop_ssse3(\&body_20_39);
816 &Xloop_ssse3(\&body_20_39);
817 &Xloop_ssse3(\&body_20_39);
818
819 &mov (@T[1],&DWP(192,"esp")); # update context
820 &add ($A,&DWP(0,@T[1]));
821 &add (@T[0],&DWP(4,@T[1])); # $b
822 &add ($C,&DWP(8,@T[1]));
823 &mov (&DWP(0,@T[1]),$A);
824 &add ($D,&DWP(12,@T[1]));
825 &mov (&DWP(4,@T[1]),@T[0]);
826 &add ($E,&DWP(16,@T[1]));
827 &mov (&DWP(8,@T[1]),$C);
828 &mov ($B,@T[0]);
829 &mov (&DWP(12,@T[1]),$D);
830 &mov (&DWP(16,@T[1]),$E);
831 &movdqa (@X[0],@X[-3&7]);
832
833 &jmp (&label("loop"));
834
835&set_label("done",16); $j=$saved_j; @V=@saved_V;
836
837 &Xtail_ssse3(\&body_20_39);
838 &Xtail_ssse3(\&body_20_39);
839 &Xtail_ssse3(\&body_20_39);
840
841 &mov (@T[1],&DWP(192,"esp")); # update context
842 &add ($A,&DWP(0,@T[1]));
843 &mov ("esp",&DWP(192+12,"esp")); # restore %esp
844 &add (@T[0],&DWP(4,@T[1])); # $b
845 &add ($C,&DWP(8,@T[1]));
846 &mov (&DWP(0,@T[1]),$A);
847 &add ($D,&DWP(12,@T[1]));
848 &mov (&DWP(4,@T[1]),@T[0]);
849 &add ($E,&DWP(16,@T[1]));
850 &mov (&DWP(8,@T[1]),$C);
851 &mov (&DWP(12,@T[1]),$D);
852 &mov (&DWP(16,@T[1]),$E);
853
854&function_end("_sha1_block_data_order_ssse3");
855
856if ($ymm) {
857my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded
858my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
859my @V=($A,$B,$C,$D,$E);
860my $j=0; # hash round
861my @T=($T,$tmp1);
862my $inp;
863
864my $_rol=sub { &shld(@_[0],@_) };
865my $_ror=sub { &shrd(@_[0],@_) };
866
867&function_begin("_sha1_block_data_order_avx");
868 &call (&label("pic_point")); # make it PIC!
869 &set_label("pic_point");
870 &blindpop($tmp1);
871 &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
872&set_label("avx_shortcut");
873 &vzeroall();
874
875 &vmovdqa(@X[3],&QWP(0,$tmp1)); # K_00_19
876 &vmovdqa(@X[4],&QWP(16,$tmp1)); # K_20_39
877 &vmovdqa(@X[5],&QWP(32,$tmp1)); # K_40_59
878 &vmovdqa(@X[6],&QWP(48,$tmp1)); # K_60_79
879 &vmovdqa(@X[2],&QWP(64,$tmp1)); # pbswap mask
880
881 &mov ($E,&wparam(0)); # load argument block
882 &mov ($inp=@T[1],&wparam(1));
883 &mov ($D,&wparam(2));
884 &mov (@T[0],"esp");
885
886 # stack frame layout
887 #
888 # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area
889 # X[4]+K X[5]+K X[6]+K X[7]+K
890 # X[8]+K X[9]+K X[10]+K X[11]+K
891 # X[12]+K X[13]+K X[14]+K X[15]+K
892 #
893 # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area
894 # X[4] X[5] X[6] X[7]
895 # X[8] X[9] X[10] X[11] # even borrowed for K_00_19
896 #
897 # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants
898 # K_40_59 K_40_59 K_40_59 K_40_59
899 # K_60_79 K_60_79 K_60_79 K_60_79
900 # K_00_19 K_00_19 K_00_19 K_00_19
901 # pbswap mask
902 #
903 # +192 ctx # argument block
904 # +196 inp
905 # +200 end
906 # +204 esp
907 &sub ("esp",208);
908 &and ("esp",-64);
909
910 &vmovdqa(&QWP(112+0,"esp"),@X[4]); # copy constants
911 &vmovdqa(&QWP(112+16,"esp"),@X[5]);
912 &vmovdqa(&QWP(112+32,"esp"),@X[6]);
913 &shl ($D,6); # len*64
914 &vmovdqa(&QWP(112+48,"esp"),@X[3]);
915 &add ($D,$inp); # end of input
916 &vmovdqa(&QWP(112+64,"esp"),@X[2]);
917 &add ($inp,64);
918 &mov (&DWP(192+0,"esp"),$E); # save argument block
919 &mov (&DWP(192+4,"esp"),$inp);
920 &mov (&DWP(192+8,"esp"),$D);
921 &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp
922
923 &mov ($A,&DWP(0,$E)); # load context
924 &mov ($B,&DWP(4,$E));
925 &mov ($C,&DWP(8,$E));
926 &mov ($D,&DWP(12,$E));
927 &mov ($E,&DWP(16,$E));
928 &mov (@T[0],$B); # magic seed
929
930 &vmovdqu(@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3]
931 &vmovdqu(@X[-3&7],&QWP(-48,$inp));
932 &vmovdqu(@X[-2&7],&QWP(-32,$inp));
933 &vmovdqu(@X[-1&7],&QWP(-16,$inp));
934 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
935 &vpshufb(@X[-3&7],@X[-3&7],@X[2]);
936 &vpshufb(@X[-2&7],@X[-2&7],@X[2]);
937 &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
938 &vpshufb(@X[-1&7],@X[-1&7],@X[2]);
939 &vpaddd (@X[0],@X[-4&7],@X[3]); # add K_00_19
940 &vpaddd (@X[1],@X[-3&7],@X[3]);
941 &vpaddd (@X[2],@X[-2&7],@X[3]);
942 &vmovdqa(&QWP(0,"esp"),@X[0]); # X[]+K xfer to IALU
943 &vmovdqa(&QWP(0+16,"esp"),@X[1]);
944 &vmovdqa(&QWP(0+32,"esp"),@X[2]);
945 &jmp (&label("loop"));
946
947sub Xupdate_avx_16_31() # recall that $Xi starts with 4
948{ use integer;
949 my $body = shift;
950 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
951 my ($a,$b,$c,$d,$e);
952
953 eval(shift(@insns));
954 eval(shift(@insns));
955 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
956 eval(shift(@insns));
957 eval(shift(@insns));
958
959 &vpaddd (@X[3],@X[3],@X[-1&7]);
960 &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
961 eval(shift(@insns));
962 eval(shift(@insns));
963 &vpsrldq(@X[2],@X[-1&7],4); # "X[-3]", 3 dwords
964 eval(shift(@insns));
965 eval(shift(@insns));
966 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
967 eval(shift(@insns));
968 eval(shift(@insns));
969
970 &vpxor (@X[2],@X[2],@X[-2&7]); # "X[-3]"^"X[-8]"
971 eval(shift(@insns));
972 eval(shift(@insns));
973 &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
974 eval(shift(@insns));
975 eval(shift(@insns));
976
977 &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]"
978 eval(shift(@insns));
979 eval(shift(@insns));
980 eval(shift(@insns));
981 eval(shift(@insns));
982
983 &vpsrld (@X[2],@X[0],31);
984 eval(shift(@insns));
985 eval(shift(@insns));
986 eval(shift(@insns));
987 eval(shift(@insns));
988
989 &vpslldq(@X[4],@X[0],12); # "X[0]"<<96, extract one dword
990 &vpaddd (@X[0],@X[0],@X[0]);
991 eval(shift(@insns));
992 eval(shift(@insns));
993 eval(shift(@insns));
994 eval(shift(@insns));
995
996 &vpsrld (@X[3],@X[4],30);
997 &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=1
998 eval(shift(@insns));
999 eval(shift(@insns));
1000 eval(shift(@insns));
1001 eval(shift(@insns));
1002
1003 &vpslld (@X[4],@X[4],2);
1004 &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
1005 eval(shift(@insns));
1006 eval(shift(@insns));
1007 &vpxor (@X[0],@X[0],@X[3]);
1008 eval(shift(@insns));
1009 eval(shift(@insns));
1010 eval(shift(@insns));
1011 eval(shift(@insns));
1012
1013 &vpxor (@X[0],@X[0],@X[4]); # "X[0]"^=("X[0]"<<96)<<<2
1014 eval(shift(@insns));
1015 eval(shift(@insns));
1016 &vmovdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
1017 eval(shift(@insns));
1018 eval(shift(@insns));
1019
1020 foreach (@insns) { eval; } # remaining instructions [if any]
1021
1022 $Xi++; push(@X,shift(@X)); # "rotate" X[]
1023}
1024
1025sub Xupdate_avx_32_79()
1026{ use integer;
1027 my $body = shift;
1028 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
1029 my ($a,$b,$c,$d,$e);
1030
1031 &vpalignr(@X[2],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
1032 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
1033 eval(shift(@insns)); # body_20_39
1034 eval(shift(@insns));
1035 eval(shift(@insns));
1036 eval(shift(@insns)); # rol
1037
1038 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
1039 &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer
1040 eval(shift(@insns));
1041 eval(shift(@insns));
1042 if ($Xi%5) {
1043 &vmovdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX...
1044 } else { # ... or load next one
1045 &vmovdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
1046 }
1047 &vpaddd (@X[3],@X[3],@X[-1&7]);
1048 eval(shift(@insns)); # ror
1049 eval(shift(@insns));
1050
1051 &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-6]"
1052 eval(shift(@insns)); # body_20_39
1053 eval(shift(@insns));
1054 eval(shift(@insns));
1055 eval(shift(@insns)); # rol
1056
1057 &vpsrld (@X[2],@X[0],30);
1058 &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
1059 eval(shift(@insns));
1060 eval(shift(@insns));
1061 eval(shift(@insns)); # ror
1062 eval(shift(@insns));
1063
1064 &vpslld (@X[0],@X[0],2);
1065 eval(shift(@insns)); # body_20_39
1066 eval(shift(@insns));
1067 eval(shift(@insns));
1068 eval(shift(@insns)); # rol
1069 eval(shift(@insns));
1070 eval(shift(@insns));
1071 eval(shift(@insns)); # ror
1072 eval(shift(@insns));
1073
1074 &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=2
1075 eval(shift(@insns)); # body_20_39
1076 eval(shift(@insns));
1077 &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer
1078 eval(shift(@insns));
1079 eval(shift(@insns)); # rol
1080 eval(shift(@insns));
1081 eval(shift(@insns));
1082 eval(shift(@insns)); # ror
1083 eval(shift(@insns));
1084
1085 foreach (@insns) { eval; } # remaining instructions
1086
1087 $Xi++; push(@X,shift(@X)); # "rotate" X[]
1088}
1089
1090sub Xuplast_avx_80()
1091{ use integer;
1092 my $body = shift;
1093 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
1094 my ($a,$b,$c,$d,$e);
1095
1096 eval(shift(@insns));
1097 &vpaddd (@X[3],@X[3],@X[-1&7]);
1098 eval(shift(@insns));
1099 eval(shift(@insns));
1100 eval(shift(@insns));
1101 eval(shift(@insns));
1102
1103 &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU
1104
1105 foreach (@insns) { eval; } # remaining instructions
1106
1107 &mov ($inp=@T[1],&DWP(192+4,"esp"));
1108 &cmp ($inp,&DWP(192+8,"esp"));
1109 &je (&label("done"));
1110
1111 &vmovdqa(@X[3],&QWP(112+48,"esp")); # K_00_19
1112 &vmovdqa(@X[2],&QWP(112+64,"esp")); # pbswap mask
1113 &vmovdqu(@X[-4&7],&QWP(0,$inp)); # load input
1114 &vmovdqu(@X[-3&7],&QWP(16,$inp));
1115 &vmovdqu(@X[-2&7],&QWP(32,$inp));
1116 &vmovdqu(@X[-1&7],&QWP(48,$inp));
1117 &add ($inp,64);
1118 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
1119 &mov (&DWP(192+4,"esp"),$inp);
1120 &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
1121
1122 $Xi=0;
1123}
1124
1125sub Xloop_avx()
1126{ use integer;
1127 my $body = shift;
1128 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
1129 my ($a,$b,$c,$d,$e);
1130
1131 eval(shift(@insns));
1132 eval(shift(@insns));
1133 &vpshufb (@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
1134 eval(shift(@insns));
1135 eval(shift(@insns));
1136 &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@X[3]);
1137 eval(shift(@insns));
1138 eval(shift(@insns));
1139 eval(shift(@insns));
1140 eval(shift(@insns));
1141 &vmovdqa (&QWP(0+16*$Xi,"esp"),@X[$Xi&7]); # X[]+K xfer to IALU
1142 eval(shift(@insns));
1143 eval(shift(@insns));
1144
1145 foreach (@insns) { eval; }
1146 $Xi++;
1147}
1148
1149sub Xtail_avx()
1150{ use integer;
1151 my $body = shift;
1152 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
1153 my ($a,$b,$c,$d,$e);
1154
1155 foreach (@insns) { eval; }
1156}
1157
1158&set_label("loop",16);
1159 &Xupdate_avx_16_31(\&body_00_19);
1160 &Xupdate_avx_16_31(\&body_00_19);
1161 &Xupdate_avx_16_31(\&body_00_19);
1162 &Xupdate_avx_16_31(\&body_00_19);
1163 &Xupdate_avx_32_79(\&body_00_19);
1164 &Xupdate_avx_32_79(\&body_20_39);
1165 &Xupdate_avx_32_79(\&body_20_39);
1166 &Xupdate_avx_32_79(\&body_20_39);
1167 &Xupdate_avx_32_79(\&body_20_39);
1168 &Xupdate_avx_32_79(\&body_20_39);
1169 &Xupdate_avx_32_79(\&body_40_59);
1170 &Xupdate_avx_32_79(\&body_40_59);
1171 &Xupdate_avx_32_79(\&body_40_59);
1172 &Xupdate_avx_32_79(\&body_40_59);
1173 &Xupdate_avx_32_79(\&body_40_59);
1174 &Xupdate_avx_32_79(\&body_20_39);
1175 &Xuplast_avx_80(\&body_20_39); # can jump to "done"
1176
1177 $saved_j=$j; @saved_V=@V;
1178
1179 &Xloop_avx(\&body_20_39);
1180 &Xloop_avx(\&body_20_39);
1181 &Xloop_avx(\&body_20_39);
1182
1183 &mov (@T[1],&DWP(192,"esp")); # update context
1184 &add ($A,&DWP(0,@T[1]));
1185 &add (@T[0],&DWP(4,@T[1])); # $b
1186 &add ($C,&DWP(8,@T[1]));
1187 &mov (&DWP(0,@T[1]),$A);
1188 &add ($D,&DWP(12,@T[1]));
1189 &mov (&DWP(4,@T[1]),@T[0]);
1190 &add ($E,&DWP(16,@T[1]));
1191 &mov (&DWP(8,@T[1]),$C);
1192 &mov ($B,@T[0]);
1193 &mov (&DWP(12,@T[1]),$D);
1194 &mov (&DWP(16,@T[1]),$E);
1195
1196 &jmp (&label("loop"));
1197
1198&set_label("done",16); $j=$saved_j; @V=@saved_V;
1199
1200 &Xtail_avx(\&body_20_39);
1201 &Xtail_avx(\&body_20_39);
1202 &Xtail_avx(\&body_20_39);
1203
1204 &vzeroall();
1205
1206 &mov (@T[1],&DWP(192,"esp")); # update context
1207 &add ($A,&DWP(0,@T[1]));
1208 &mov ("esp",&DWP(192+12,"esp")); # restore %esp
1209 &add (@T[0],&DWP(4,@T[1])); # $b
1210 &add ($C,&DWP(8,@T[1]));
1211 &mov (&DWP(0,@T[1]),$A);
1212 &add ($D,&DWP(12,@T[1]));
1213 &mov (&DWP(4,@T[1]),@T[0]);
1214 &add ($E,&DWP(16,@T[1]));
1215 &mov (&DWP(8,@T[1]),$C);
1216 &mov (&DWP(12,@T[1]),$D);
1217 &mov (&DWP(16,@T[1]),$E);
1218&function_end("_sha1_block_data_order_avx");
1219}
1220&set_label("K_XX_XX",64);
1221&data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999); # K_00_19
1222&data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1); # K_20_39
1223&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc); # K_40_59
1224&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6); # K_60_79
1225&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # pbswap mask
1226}
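(Illustrative only, not from the patch: the last constant above is the pshufb control mask; stored little-endian its bytes read 3,2,1,0 7,6,5,4 ..., so the shuffle reverses the bytes of every 32-bit lane. A Perl sketch of what pshufb computes with it:)

my @mask = map { unpack "C4", pack "V", $_ }
           (0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
my @src  = (0 .. 15);                       # byte positions of the source register
my @dst  = map { $src[$_] } @mask;          # pshufb: dst[i] = src[mask[i]]
print "@dst\n";                             # 3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12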
1227&asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
1228
1229&asm_finish();
diff --git a/src/lib/libcrypto/sha/asm/sha1-alpha.pl b/src/lib/libcrypto/sha/asm/sha1-alpha.pl
new file mode 100644
index 0000000000..6c4b9251fd
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha1-alpha.pl
@@ -0,0 +1,322 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA1 block procedure for Alpha.
11
12# On 21264 performance is 33% better than code generated by vendor
13# compiler, and 75% better than GCC [3.4], and in absolute terms is
14# 8.7 cycles per processed byte. Implementation features vectorized
15# byte swap, but not Xupdate.
16
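(Illustrative only, not from the patch: the "vectorized byte swap" below uses zapnot/sll/srl to byte-reverse two 32-bit words packed in one 64-bit register; per word it is the usual bswap, e.g. in Perl, helper name made up:)

sub bswap32 {
    my ($x) = @_;
    return (($x >> 24) & 0x000000ff) | (($x >>  8) & 0x0000ff00)
         | (($x <<  8) & 0x00ff0000) | (($x << 24) & 0xff000000);
}
printf "%08x -> %08x\n", 0x01020304, bswap32(0x01020304);   # 04030201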
17@X=( "\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7",
18 "\$8", "\$9", "\$10", "\$11", "\$12", "\$13", "\$14", "\$15");
19$ctx="a0"; # $16
20$inp="a1";
21$num="a2";
22$A="a3";
23$B="a4"; # 20
24$C="a5";
25$D="t8";
26$E="t9"; @V=($A,$B,$C,$D,$E);
27$t0="t10"; # 24
28$t1="t11";
29$t2="ra";
30$t3="t12";
31$K="AT"; # 28
32
33sub BODY_00_19 {
34my ($i,$a,$b,$c,$d,$e)=@_;
35my $j=$i+1;
36$code.=<<___ if ($i==0);
37 ldq_u @X[0],0+0($inp)
38 ldq_u @X[1],0+7($inp)
39___
40$code.=<<___ if (!($i&1) && $i<14);
41 ldq_u @X[$i+2],($i+2)*4+0($inp)
42 ldq_u @X[$i+3],($i+2)*4+7($inp)
43___
44$code.=<<___ if (!($i&1) && $i<15);
45 extql @X[$i],$inp,@X[$i]
46 extqh @X[$i+1],$inp,@X[$i+1]
47
48 or @X[$i+1],@X[$i],@X[$i] # pair of 32-bit values are fetched
49
50 srl @X[$i],24,$t0 # vectorized byte swap
51 srl @X[$i],8,$t2
52
53 sll @X[$i],8,$t3
54 sll @X[$i],24,@X[$i]
55 zapnot $t0,0x11,$t0
56 zapnot $t2,0x22,$t2
57
58 zapnot @X[$i],0x88,@X[$i]
59 or $t0,$t2,$t0
60 zapnot $t3,0x44,$t3
61 sll $a,5,$t1
62
63 or @X[$i],$t0,@X[$i]
64 addl $K,$e,$e
65 and $b,$c,$t2
66 zapnot $a,0xf,$a
67
68 or @X[$i],$t3,@X[$i]
69 srl $a,27,$t0
70 bic $d,$b,$t3
71 sll $b,30,$b
72
73 extll @X[$i],4,@X[$i+1] # extract upper half
74 or $t2,$t3,$t2
75 addl @X[$i],$e,$e
76
77 addl $t1,$e,$e
78 srl $b,32,$t3
79 zapnot @X[$i],0xf,@X[$i]
80
81 addl $t0,$e,$e
82 addl $t2,$e,$e
83 or $t3,$b,$b
84___
85$code.=<<___ if (($i&1) && $i<15);
86 sll $a,5,$t1
87 addl $K,$e,$e
88 and $b,$c,$t2
89 zapnot $a,0xf,$a
90
91 srl $a,27,$t0
92 addl @X[$i%16],$e,$e
93 bic $d,$b,$t3
94 sll $b,30,$b
95
96 or $t2,$t3,$t2
97 addl $t1,$e,$e
98 srl $b,32,$t3
99 zapnot @X[$i],0xf,@X[$i]
100
101 addl $t0,$e,$e
102 addl $t2,$e,$e
103 or $t3,$b,$b
104___
105$code.=<<___ if ($i>=15); # with forward Xupdate
106 sll $a,5,$t1
107 addl $K,$e,$e
108 and $b,$c,$t2
109 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
110
111 zapnot $a,0xf,$a
112 addl @X[$i%16],$e,$e
113 bic $d,$b,$t3
114 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
115
116 srl $a,27,$t0
117 addl $t1,$e,$e
118 or $t2,$t3,$t2
119 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
120
121 sll $b,30,$b
122 addl $t0,$e,$e
123 srl @X[$j%16],31,$t1
124
125 addl $t2,$e,$e
126 srl $b,32,$t3
127 addl @X[$j%16],@X[$j%16],@X[$j%16]
128
129 or $t3,$b,$b
130 zapnot @X[$i%16],0xf,@X[$i%16]
131 or $t1,@X[$j%16],@X[$j%16]
132___
133}
134
135sub BODY_20_39 {
136my ($i,$a,$b,$c,$d,$e)=@_;
137my $j=$i+1;
138$code.=<<___ if ($i<79); # with forward Xupdate
139 sll $a,5,$t1
140 addl $K,$e,$e
141 zapnot $a,0xf,$a
142 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
143
144 sll $b,30,$t3
145 addl $t1,$e,$e
146 xor $b,$c,$t2
147 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
148
149 srl $b,2,$b
150 addl @X[$i%16],$e,$e
151 xor $d,$t2,$t2
152 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
153
154 srl @X[$j%16],31,$t1
155 addl $t2,$e,$e
156 srl $a,27,$t0
157 addl @X[$j%16],@X[$j%16],@X[$j%16]
158
159 or $t3,$b,$b
160 addl $t0,$e,$e
161 or $t1,@X[$j%16],@X[$j%16]
162___
163$code.=<<___ if ($i<77);
164 zapnot @X[$i%16],0xf,@X[$i%16]
165___
166$code.=<<___ if ($i==79); # with context fetch
167 sll $a,5,$t1
168 addl $K,$e,$e
169 zapnot $a,0xf,$a
170 ldl @X[0],0($ctx)
171
172 sll $b,30,$t3
173 addl $t1,$e,$e
174 xor $b,$c,$t2
175 ldl @X[1],4($ctx)
176
177 srl $b,2,$b
178 addl @X[$i%16],$e,$e
179 xor $d,$t2,$t2
180 ldl @X[2],8($ctx)
181
182 srl $a,27,$t0
183 addl $t2,$e,$e
184 ldl @X[3],12($ctx)
185
186 or $t3,$b,$b
187 addl $t0,$e,$e
188 ldl @X[4],16($ctx)
189___
190}
191
192sub BODY_40_59 {
193my ($i,$a,$b,$c,$d,$e)=@_;
194my $j=$i+1;
195$code.=<<___; # with forward Xupdate
196 sll $a,5,$t1
197 addl $K,$e,$e
198 zapnot $a,0xf,$a
199 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
200
201 srl $a,27,$t0
202 and $b,$c,$t2
203 and $b,$d,$t3
204 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
205
206 sll $b,30,$b
207 addl $t1,$e,$e
208 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
209
210 srl @X[$j%16],31,$t1
211 addl $t0,$e,$e
212 or $t2,$t3,$t2
213 and $c,$d,$t3
214
215 or $t2,$t3,$t2
216 srl $b,32,$t3
217 addl @X[$i%16],$e,$e
218 addl @X[$j%16],@X[$j%16],@X[$j%16]
219
220 or $t3,$b,$b
221 addl $t2,$e,$e
222 or $t1,@X[$j%16],@X[$j%16]
223 zapnot @X[$i%16],0xf,@X[$i%16]
224___
225}
226
227$code=<<___;
228#ifdef __linux__
229#include <asm/regdef.h>
230#else
231#include <asm.h>
232#include <regdef.h>
233#endif
234
235.text
236
237.set noat
238.set noreorder
239.globl sha1_block_data_order
240.align 5
241.ent sha1_block_data_order
242sha1_block_data_order:
243 lda sp,-64(sp)
244 stq ra,0(sp)
245 stq s0,8(sp)
246 stq s1,16(sp)
247 stq s2,24(sp)
248 stq s3,32(sp)
249 stq s4,40(sp)
250 stq s5,48(sp)
251 stq fp,56(sp)
252 .mask 0x0400fe00,-64
253 .frame sp,64,ra
254 .prologue 0
255
256 ldl $A,0($ctx)
257 ldl $B,4($ctx)
258 sll $num,6,$num
259 ldl $C,8($ctx)
260 ldl $D,12($ctx)
261 ldl $E,16($ctx)
262 addq $inp,$num,$num
263
264.Lloop:
265 .set noreorder
266 ldah $K,23170(zero)
267 zapnot $B,0xf,$B
268 lda $K,31129($K) # K_00_19
269___
270for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
271
272$code.=<<___;
273 ldah $K,28378(zero)
274 lda $K,-5215($K) # K_20_39
275___
276for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
277
278$code.=<<___;
279 ldah $K,-28900(zero)
280 lda $K,-17188($K) # K_40_59
281___
282for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
283
284$code.=<<___;
285 ldah $K,-13725(zero)
286 lda $K,-15914($K) # K_60_79
287___
288for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
289
290$code.=<<___;
291 addl @X[0],$A,$A
292 addl @X[1],$B,$B
293 addl @X[2],$C,$C
294 addl @X[3],$D,$D
295 addl @X[4],$E,$E
296 stl $A,0($ctx)
297 stl $B,4($ctx)
298 addq $inp,64,$inp
299 stl $C,8($ctx)
300 stl $D,12($ctx)
301 stl $E,16($ctx)
302 cmpult $inp,$num,$t1
303 bne $t1,.Lloop
304
305 .set noreorder
306 ldq ra,0(sp)
307 ldq s0,8(sp)
308 ldq s1,16(sp)
309 ldq s2,24(sp)
310 ldq s3,32(sp)
311 ldq s4,40(sp)
312 ldq s5,48(sp)
313 ldq fp,56(sp)
314 lda sp,64(sp)
315 ret (ra)
316.end sha1_block_data_order
317.ascii "SHA1 block transform for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
318.align 2
319___
320$output=shift and open STDOUT,">$output";
321print $code;
322close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
index 6e65fe3e01..fe8207f77f 100644
--- a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
@@ -47,6 +47,10 @@
47# Cortex A8 core and in absolute terms ~870 cycles per input block
48# [or 13.6 cycles per byte].
49
50# February 2011.
51#
52# Profiler-assisted and platform-specific optimization resulted in 10%
53# improvement on Cortex A8 core and 12.2 cycles per byte.
50 54
55 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
56 open STDOUT,">$output";
@@ -76,31 +80,41 @@ $code.=<<___;
80 add $e,$K,$e,ror#2 @ E+=K_xx_xx
81 ldr $t3,[$Xi,#2*4]
82 eor $t0,$t0,$t1
79 eor $t2,$t2,$t3 83 eor $t2,$t2,$t3 @ 1 cycle stall
84 eor $t1,$c,$d @ F_xx_xx
85 mov $t0,$t0,ror#31
86 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
87 eor $t0,$t0,$t2,ror#31
88 str $t0,[$Xi,#-4]!
89 $opt1 @ F_xx_xx
90 $opt2 @ F_xx_xx
91 add $e,$e,$t0 @ E+=X[i]
87 str $t0,[$Xi,#-4]!
92___
93}
94
95sub BODY_00_15 {
96my ($a,$b,$c,$d,$e)=@_;
97$code.=<<___;
94 ldrb $t0,[$inp],#4 98#if __ARM_ARCH__<7
95 ldrb $t1,[$inp,#-1] 99 ldrb $t1,[$inp,#2]
96 ldrb $t2,[$inp,#-2] 100 ldrb $t0,[$inp,#3]
101 ldrb $t2,[$inp,#1]
97 add $e,$K,$e,ror#2 @ E+=K_00_19 102 add $e,$K,$e,ror#2 @ E+=K_00_19
98 ldrb $t3,[$inp,#-3] 103 ldrb $t3,[$inp],#4
104 orr $t0,$t0,$t1,lsl#8
105 eor $t1,$c,$d @ F_xx_xx
106 orr $t0,$t0,$t2,lsl#16
99 add $e,$e,$a,ror#27 @ E+=ROR(A,27) 107 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
100 orr $t0,$t1,$t0,lsl#24 108 orr $t0,$t0,$t3,lsl#24
109#else
110 ldr $t0,[$inp],#4 @ handles unaligned
111 add $e,$K,$e,ror#2 @ E+=K_00_19
101 eor $t1,$c,$d @ F_xx_xx 112 eor $t1,$c,$d @ F_xx_xx
102 orr $t0,$t0,$t2,lsl#8 113 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
103 orr $t0,$t0,$t3,lsl#16 114#ifdef __ARMEL__
115 rev $t0,$t0 @ byte swap
116#endif
117#endif
118 and $t1,$b,$t1,ror#2
119 add $e,$e,$t0 @ E+=X[i]
120 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
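(Illustrative only, not from the patch: on pre-ARMv7 cores the four ldrb/orr instructions above assemble a big-endian 32-bit word from the message bytes, which is what the ARMv7 path gets from a single possibly-unaligned ldr plus rev on little-endian cores. In Perl terms, helper name made up:)

sub load_be32 {
    my @b = @_;                                          # four message bytes, in stream order
    return ($b[0] << 24) | ($b[1] << 16) | ($b[2] << 8) | $b[3];
}
printf "%08x\n", load_be32(0xde, 0xad, 0xbe, 0xef);      # deadbeef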
@@ -136,6 +150,8 @@ ___
150}
151
152$code=<<___;
153#include "arm_arch.h"
154
155.text
156
157.global sha1_block_data_order
@@ -209,10 +225,14 @@ $code.=<<___;
225 teq $inp,$len
226 bne .Lloop @ [+18], total 1307
227
228#if __ARM_ARCH__>=5
229 ldmia sp!,{r4-r12,pc}
230#else
231 ldmia sp!,{r4-r12,lr}
232 tst lr,#1
233 moveq pc,lr @ be binary compatible with V4, yet
234 bx lr @ interoperable with Thumb ISA:-)
235#endif
236.align 2
237.LK_00_19: .word 0x5a827999
238.LK_20_39: .word 0x6ed9eba1
diff --git a/src/lib/libcrypto/sha/asm/sha1-ia64.pl b/src/lib/libcrypto/sha/asm/sha1-ia64.pl
index 51c4f47ecb..db28f0805a 100644
--- a/src/lib/libcrypto/sha/asm/sha1-ia64.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-ia64.pl
@@ -15,7 +15,7 @@
15# is >50% better than HP C and >2x better than gcc.
16
17$code=<<___;
18.ident \"sha1-ia64.s, version 1.2\" 18.ident \"sha1-ia64.s, version 1.3\"
19.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
20.explicit
21
@@ -26,14 +26,10 @@ if ($^O eq "hpux") {
26 $ADDP="addp4";
27 for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
28} else { $ADDP="add"; }
29for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
30 $big_endian=0 if (/\-DL_ENDIAN/); }
31if (!defined($big_endian))
32 { $big_endian=(unpack('L',pack('N',1))==1); }
29
30#$human=1;
31if ($human) { # useful for visual code auditing...
36 ($A,$B,$C,$D,$E,$T) = ("A","B","C","D","E","T"); 32 ($A,$B,$C,$D,$E) = ("A","B","C","D","E");
33 ($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4");
34 ($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
35 ( "K_00_19","K_20_39","K_40_59","K_60_79" );
@@ -41,47 +37,50 @@ if ($human) { # useful for visual code auditing...
41 "X8", "X9","X10","X11","X12","X13","X14","X15" ); 37 "X8", "X9","X10","X11","X12","X13","X14","X15" );
42} 38}
43else { 39else {
44 ($A,$B,$C,$D,$E,$T) = ("loc0","loc1","loc2","loc3","loc4","loc5"); 40 ($A,$B,$C,$D,$E) = ("loc0","loc1","loc2","loc3","loc4");
45 ($h0,$h1,$h2,$h3,$h4) = ("loc6","loc7","loc8","loc9","loc10"); 41 ($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9");
46 ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = 42 ($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
47 ( "r14", "r15", "loc11", "loc12" ); 43 ( "r14", "r15", "loc10", "loc11" );
44 @X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
45 "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" );
46}
47
48sub BODY_00_15 {
49local *code=shift;
54local ($i,$a,$b,$c,$d,$e,$f)=@_; 50my ($i,$a,$b,$c,$d,$e)=@_;
51my $j=$i+1;
52my $Xn=@X[$j%16];
55 53
56$code.=<<___ if ($i==0); 54$code.=<<___ if ($i==0);
57{ .mmi; ld1 $X[$i&0xf]=[inp],2 // MSB 55{ .mmi; ld1 $X[$i]=[inp],2 // MSB
58 ld1 tmp2=[tmp3],2 };; 56 ld1 tmp2=[tmp3],2 };;
59{ .mmi; ld1 tmp0=[inp],2 57{ .mmi; ld1 tmp0=[inp],2
60 ld1 tmp4=[tmp3],2 // LSB 58 ld1 tmp4=[tmp3],2 // LSB
61 dep $X[$i&0xf]=$X[$i&0xf],tmp2,8,8 };; 59 dep $X[$i]=$X[$i],tmp2,8,8 };;
62___ 60___
63if ($i<15) { 61if ($i<15) {
64 $code.=<<___; 62 $code.=<<___;
65{ .mmi; ld1 $X[($i+1)&0xf]=[inp],2 // +1 63{ .mmi; ld1 $Xn=[inp],2 // forward Xload
64 nop.m 0x0
66 dep tmp1=tmp0,tmp4,8,8 };; 65 dep tmp1=tmp0,tmp4,8,8 };;
67{ .mmi; ld1 tmp2=[tmp3],2 // +1 66{ .mmi; ld1 tmp2=[tmp3],2 // forward Xload
68 and tmp4=$c,$b 67 and tmp4=$c,$b
69 dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;; 68 dep $X[$i]=$X[$i],tmp1,16,16} //;;
70{ .mmi; andcm tmp1=$d,$b 69{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19
71 add tmp0=$e,$K_00_19 70 andcm tmp1=$d,$b
72 dep.z tmp5=$a,5,27 };; // a<<5 71 dep.z tmp5=$a,5,27 };; // a<<5
73{ .mmi; or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) 72{ .mmi; add $e=$e,$X[$i] // e+=Xload
74 add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19 73 or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
75 extr.u tmp1=$a,27,5 };; // a>>27 74 extr.u tmp1=$a,27,5 };; // a>>27
76{ .mmi; ld1 tmp0=[inp],2 // +1 75{ .mmi; ld1 tmp0=[inp],2 // forward Xload
77 add $f=$f,tmp4 // f+=F_00_19(b,c,d) 76 add $e=$e,tmp4 // e+=F_00_19(b,c,d)
78 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) 77 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
79{ .mmi; ld1 tmp4=[tmp3],2 // +1 78{ .mmi; ld1 tmp4=[tmp3],2 // forward Xload
80 or tmp5=tmp1,tmp5 // ROTATE(a,5) 79 or tmp5=tmp1,tmp5 // ROTATE(a,5)
81 mux2 tmp6=$a,0x44 };; // see b in next iteration 80 mux2 tmp6=$a,0x44 };; // see b in next iteration
82{ .mii; add $f=$f,tmp5 // f+=ROTATE(a,5) 81{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)
83 dep $X[($i+1)&0xf]=$X[($i+1)&0xf],tmp2,8,8 // +1 82 dep $Xn=$Xn,tmp2,8,8 // forward Xload
84 mux2 $X[$i&0xf]=$X[$i&0xf],0x44 } //;; 83 mux2 $X[$i]=$X[$i],0x44 } //;;
85 84
86___ 85___
87 } 86 }
@@ -89,24 +88,24 @@ else {
89 $code.=<<___; 88 $code.=<<___;
90{ .mii; and tmp3=$c,$b 89{ .mii; and tmp3=$c,$b
91 dep tmp1=tmp0,tmp4,8,8;; 90 dep tmp1=tmp0,tmp4,8,8;;
92 dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;; 91 dep $X[$i]=$X[$i],tmp1,16,16} //;;
93{ .mmi; andcm tmp1=$d,$b 92{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19
94 add tmp0=$e,$K_00_19 93 andcm tmp1=$d,$b
95 dep.z tmp5=$a,5,27 };; // a<<5 94 dep.z tmp5=$a,5,27 };; // a<<5
96{ .mmi; or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) 95{ .mmi; add $e=$e,$X[$i] // e+=Xupdate
97 add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19 96 or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
98 extr.u tmp1=$a,27,5 } // a>>27 97 extr.u tmp1=$a,27,5 } // a>>27
99{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 98{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
100 xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 99 xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
101 nop.i 0 };; 100 nop.i 0 };;
102{ .mmi; add $f=$f,tmp4 // f+=F_00_19(b,c,d) 101{ .mmi; add $e=$e,tmp4 // e+=F_00_19(b,c,d)
103 xor tmp2=tmp2,tmp3 // +1 102 xor $Xn=$Xn,tmp3 // forward Xupdate
104 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) 103 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
105{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) 104{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
106 mux2 tmp6=$a,0x44 };; // see b in next iteration 105 mux2 tmp6=$a,0x44 };; // see b in next iteration
107{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5) 106{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
108 shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) 107 shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
109 mux2 $X[$i&0xf]=$X[$i&0xf],0x44 };; 108 mux2 $X[$i]=$X[$i],0x44 };;
110 109
111___ 110___
112 } 111 }
@@ -114,27 +113,28 @@ ___
114 113
115sub BODY_16_19 { 114sub BODY_16_19 {
116local *code=shift; 115local *code=shift;
117local ($i,$a,$b,$c,$d,$e,$f)=@_; 116my ($i,$a,$b,$c,$d,$e)=@_;
117my $j=$i+1;
118my $Xn=@X[$j%16];
118 119
119$code.=<<___; 120$code.=<<___;
120{ .mmi; mov $X[$i&0xf]=$f // Xupdate 121{ .mib; add $e=$e,$K_00_19 // e+=K_00_19
121 and tmp0=$c,$b
122 dep.z tmp5=$a,5,27 } // a<<5 122 dep.z tmp5=$a,5,27 } // a<<5
123{ .mmi; andcm tmp1=$d,$b 123{ .mib; andcm tmp1=$d,$b
124 add tmp4=$e,$K_00_19 };; 124 and tmp0=$c,$b };;
125{ .mmi; or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) 125{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate
126 add $f=$f,tmp4 // f+=e+K_00_19 126 or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
127 extr.u tmp1=$a,27,5 } // a>>27 127 extr.u tmp1=$a,27,5 } // a>>27
128{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 128{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
129 xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 129 xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
130 nop.i 0 };; 130 nop.i 0 };;
131{ .mmi; add $f=$f,tmp0 // f+=F_00_19(b,c,d) 131{ .mmi; add $e=$e,tmp0 // e+=F_00_19(b,c,d)
132 xor tmp2=tmp2,tmp3 // +1 132 xor $Xn=$Xn,tmp3 // forward Xupdate
133 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) 133 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
134{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) 134{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
135 mux2 tmp6=$a,0x44 };; // see b in next iteration 135 mux2 tmp6=$a,0x44 };; // see b in next iteration
136{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5) 136{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
137 shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) 137 shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
138 nop.i 0 };; 138 nop.i 0 };;
139 139
140___ 140___
@@ -142,49 +142,47 @@ ___
142 142
143sub BODY_20_39 { 143sub BODY_20_39 {
144local *code=shift; 144local *code=shift;
145local ($i,$a,$b,$c,$d,$e,$f,$Konst)=@_; 145my ($i,$a,$b,$c,$d,$e,$Konst)=@_;
146 $Konst = $K_20_39 if (!defined($Konst)); 146 $Konst = $K_20_39 if (!defined($Konst));
147my $j=$i+1;
148my $Xn=@X[$j%16];
147 149
148if ($i<79) { 150if ($i<79) {
149$code.=<<___; 151$code.=<<___;
150{ .mib; mov $X[$i&0xf]=$f // Xupdate 152{ .mib; add $e=$e,$Konst // e+=K_XX_XX
151 dep.z tmp5=$a,5,27 } // a<<5 153 dep.z tmp5=$a,5,27 } // a<<5
152{ .mib; xor tmp0=$c,$b 154{ .mib; xor tmp0=$c,$b
153 add tmp4=$e,$Konst };; 155 xor $Xn=$Xn,$X[($j+2)%16] };; // forward Xupdate
154{ .mmi; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d 156{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate
155 add $f=$f,tmp4 // f+=e+K_20_39
156 extr.u tmp1=$a,27,5 } // a>>27 157 extr.u tmp1=$a,27,5 } // a>>27
157{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 158{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
158 xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 159 xor $Xn=$Xn,$X[($j+8)%16] };; // forward Xupdate
159 nop.i 0 };; 160{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d)
160{ .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d) 161 xor $Xn=$Xn,$X[($j+13)%16] // forward Xupdate
161 xor tmp2=tmp2,tmp3 // +1
162 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) 162 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
163{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) 163{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
164 mux2 tmp6=$a,0x44 };; // see b in next iteration 164 mux2 tmp6=$a,0x44 };; // see b in next iteration
165{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5) 165{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
166 shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) 166 shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
167 nop.i 0 };; 167 nop.i 0 };;
168 168
169___ 169___
170} 170}
171else { 171else {
172$code.=<<___; 172$code.=<<___;
173{ .mib; mov $X[$i&0xf]=$f // Xupdate 173{ .mib; add $e=$e,$Konst // e+=K_60_79
174 dep.z tmp5=$a,5,27 } // a<<5 174 dep.z tmp5=$a,5,27 } // a<<5
175{ .mib; xor tmp0=$c,$b 175{ .mib; xor tmp0=$c,$b
176 add tmp4=$e,$Konst };;
177{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
178 extr.u tmp1=$a,27,5 } // a>>27
179{ .mib; add $f=$f,tmp4 // f+=e+K_20_39
180 add $h1=$h1,$a };; // wrap up 176 add $h1=$h1,$a };; // wrap up
181{ .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d) 177{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate
182 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) ;;? 178 extr.u tmp1=$a,27,5 } // a>>27
183{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) 179{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
184 add $h3=$h3,$c };; // wrap up 180 add $h3=$h3,$c };; // wrap up
185{ .mib; add tmp3=1,inp // used in unaligned codepath 181{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d)
186 add $f=$f,tmp1 } // f+=ROTATE(a,5) 182 or tmp1=tmp1,tmp5 // ROTATE(a,5)
187{ .mib; add $h2=$h2,$b // wrap up 183 shrp $b=tmp6,tmp6,2 };; // b=ROTATE(b,30) ;;?
184{ .mmi; add $e=$e,tmp1 // e+=ROTATE(a,5)
185 add tmp3=1,inp // used in unaligned codepath
188 add $h4=$h4,$d };; // wrap up 186 add $h4=$h4,$d };; // wrap up
189 187
190___ 188___
@@ -193,29 +191,29 @@ ___
193 191
194sub BODY_40_59 { 192sub BODY_40_59 {
195local *code=shift; 193local *code=shift;
196local ($i,$a,$b,$c,$d,$e,$f)=@_; 194my ($i,$a,$b,$c,$d,$e)=@_;
195my $j=$i+1;
196my $Xn=@X[$j%16];
197 197
198$code.=<<___; 198$code.=<<___;
199{ .mmi; mov $X[$i&0xf]=$f // Xupdate 199{ .mib; add $e=$e,$K_40_59 // e+=K_40_59
200 and tmp0=$c,$b
201 dep.z tmp5=$a,5,27 } // a<<5 200 dep.z tmp5=$a,5,27 } // a<<5
202{ .mmi; and tmp1=$d,$b 201{ .mib; and tmp1=$c,$d
203 add tmp4=$e,$K_40_59 };; 202 xor tmp0=$c,$d };;
204{ .mmi; or tmp0=tmp0,tmp1 // (b&c)|(b&d) 203{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate
205 add $f=$f,tmp4 // f+=e+K_40_59 204 add tmp5=tmp5,tmp1 // a<<5+(c&d)
206 extr.u tmp1=$a,27,5 } // a>>27 205 extr.u tmp1=$a,27,5 } // a>>27
207{ .mmi; and tmp4=$c,$d 206{ .mmi; and tmp0=tmp0,$b
208 xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 207 xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
209 xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 208 xor tmp3=$X[($j+8)%16],$X[($j+13)%16] };; // forward Xupdate
210 };; 209{ .mmi; add $e=$e,tmp0 // e+=b&(c^d)
211{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) 210 add tmp5=tmp5,tmp1 // ROTATE(a,5)+(c&d)
212 xor tmp2=tmp2,tmp3 // +1
213 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) 211 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
214{ .mmi; or tmp0=tmp0,tmp4 // F_40_59(b,c,d)=(b&c)|(b&d)|(c&d) 212{ .mmi; xor $Xn=$Xn,tmp3
215 mux2 tmp6=$a,0x44 };; // see b in next iteration 213 mux2 tmp6=$a,0x44 };; // see b in next iteration
216{ .mii; add $f=$f,tmp0 // f+=F_40_59(b,c,d) 214{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)+(c&d)
217 shrp $e=tmp2,tmp2,31;; // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) 215 shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
218 add $f=$f,tmp1 };; // f+=ROTATE(a,5) 216 nop.i 0x0 };;
219 217
220___ 218___
221} 219}
@@ -237,7 +235,7 @@ inp=r33; // in1
237.align 32 235.align 32
238sha1_block_data_order: 236sha1_block_data_order:
239 .prologue 237 .prologue
240{ .mmi; alloc tmp1=ar.pfs,3,15,0,0 238{ .mmi; alloc tmp1=ar.pfs,3,14,0,0
241 $ADDP tmp0=4,ctx 239 $ADDP tmp0=4,ctx
242 .save ar.lc,r3 240 .save ar.lc,r3
243 mov r3=ar.lc } 241 mov r3=ar.lc }
@@ -245,8 +243,8 @@ sha1_block_data_order:
245 $ADDP inp=0,inp 243 $ADDP inp=0,inp
246 mov r2=pr };; 244 mov r2=pr };;
247tmp4=in2; 245tmp4=in2;
248tmp5=loc13; 246tmp5=loc12;
249tmp6=loc14; 247tmp6=loc13;
250 .body 248 .body
251{ .mlx; ld4 $h0=[ctx],8 249{ .mlx; ld4 $h0=[ctx],8
252 movl $K_00_19=0x5a827999 } 250 movl $K_00_19=0x5a827999 }
@@ -273,7 +271,7 @@ tmp6=loc14;
273 271
274___ 272___
275 273
276{ my $i,@V=($A,$B,$C,$D,$E,$T); 274{ my $i,@V=($A,$B,$C,$D,$E);
277 275
278 for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); } 276 for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); }
279 for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); } 277 for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); }
@@ -281,12 +279,12 @@ ___
281 for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); } 279 for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); }
282 for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); } 280 for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); }
283 281
284 (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check 282 (($V[0] eq $A) and ($V[4] eq $E)) or die; # double-check
285} 283}
286 284
287$code.=<<___; 285$code.=<<___;
288{ .mmb; add $h0=$h0,$E 286{ .mmb; add $h0=$h0,$A
289 nop.m 0 287 add $h2=$h2,$C
290 br.ctop.dptk.many .Ldtop };; 288 br.ctop.dptk.many .Ldtop };;
291.Ldend: 289.Ldend:
292{ .mmi; add tmp0=4,ctx 290{ .mmi; add tmp0=4,ctx
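
The reworked IA-64 rounds drop the $T/$f temporary and accumulate every term straight into e, and the 40..59 rounds split the majority function into (c&d) plus (b&(c^d)); the two terms cover disjoint bit positions, so plain addition gives the same result as the old (b&c)|(b&d)|(c&d) while freeing a scratch register. An illustrative C sketch of that round shape (names are placeholders, not the generated code):

#include <stdint.h>

static uint32_t rotl32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

/* One SHA-1 round for t = 40..59 in "accumulate into e" form.
 * (c & d) and (b & (c ^ d)) are bitwise disjoint, so their sum equals
 * MAJ(b,c,d) = (b&c)|(b&d)|(c&d) and no carries can occur. */
static void round_40_59(uint32_t *a, uint32_t *b, uint32_t *c,
                        uint32_t *d, uint32_t *e, uint32_t x)
{
	*e += 0x8f1bbcdcU;         /* e += K_40_59      */
	*e += x;                   /* e += X[i]         */
	*e += *c & *d;             /* e += c&d          */
	*e += *b & (*c ^ *d);      /* e += b&(c^d)      */
	*e += rotl32(*a, 5);       /* e += ROTATE(a,5)  */
	*b  = rotl32(*b, 30);      /* b  = ROTATE(b,30) */
}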
diff --git a/src/lib/libcrypto/sha/asm/sha1-mips.pl b/src/lib/libcrypto/sha/asm/sha1-mips.pl
new file mode 100644
index 0000000000..f1a702f38f
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha1-mips.pl
@@ -0,0 +1,354 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA1 block procedure for MIPS.
11
12# Performance improvement is 30% on unaligned input. The "secret" is
13# to deploy lwl/lwr pair to load unaligned input. One could have
14# vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32-
15# compatible subroutine. There is room for minor optimization on
16# little-endian platforms...
17
18######################################################################
19# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
20# widely used. Then there is a new contender: NUBI. It appears that if
21# one picks the latter, it's possible to arrange code in ABI neutral
22# manner. Therefore let's stick to NUBI register layout:
23#
24($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
25($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
26($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
27($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
28#
29# The return value is placed in $a0. Following coding rules facilitate
30# interoperability:
31#
32# - never ever touch $tp, "thread pointer", former $gp;
33# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
34# old code];
35# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
36#
37# For reference here is register layout for N32/64 MIPS ABIs:
38#
39# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
40# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
41# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
42# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
43# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
44#
45$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
46
47if ($flavour =~ /64|n32/i) {
48 $PTR_ADD="dadd"; # incidentally works even on n32
49 $PTR_SUB="dsub"; # incidentally works even on n32
50 $REG_S="sd";
51 $REG_L="ld";
52 $PTR_SLL="dsll"; # incidentally works even on n32
53 $SZREG=8;
54} else {
55 $PTR_ADD="add";
56 $PTR_SUB="sub";
57 $REG_S="sw";
58 $REG_L="lw";
59 $PTR_SLL="sll";
60 $SZREG=4;
61}
62#
63# <appro@openssl.org>
64#
65######################################################################
66
67$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
68
69for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
70open STDOUT,">$output";
71
72if (!defined($big_endian))
73 { $big_endian=(unpack('L',pack('N',1))==1); }
74
75# offsets of the Most and Least Significant Bytes
76$MSB=$big_endian?0:3;
77$LSB=3&~$MSB;
78
79@X=map("\$$_",(8..23)); # a4-a7,s0-s11
80
81$ctx=$a0;
82$inp=$a1;
83$num=$a2;
84$A="\$1";
85$B="\$2";
86$C="\$3";
87$D="\$7";
88$E="\$24"; @V=($A,$B,$C,$D,$E);
89$t0="\$25";
90$t1=$num; # $num is offloaded to stack
91$t2="\$30"; # fp
92$K="\$31"; # ra
93
94sub BODY_00_14 {
95my ($i,$a,$b,$c,$d,$e)=@_;
96my $j=$i+1;
97$code.=<<___ if (!$big_endian);
98 srl $t0,@X[$i],24 # byte swap($i)
99 srl $t1,@X[$i],8
100 andi $t2,@X[$i],0xFF00
101 sll @X[$i],@X[$i],24
102 andi $t1,0xFF00
103 sll $t2,$t2,8
104 or @X[$i],$t0
105 or $t1,$t2
106 or @X[$i],$t1
107___
108$code.=<<___;
109 lwl @X[$j],$j*4+$MSB($inp)
110 sll $t0,$a,5 # $i
111 addu $e,$K
112 lwr @X[$j],$j*4+$LSB($inp)
113 srl $t1,$a,27
114 addu $e,$t0
115 xor $t0,$c,$d
116 addu $e,$t1
117 sll $t2,$b,30
118 and $t0,$b
119 srl $b,$b,2
120 xor $t0,$d
121 addu $e,@X[$i]
122 or $b,$t2
123 addu $e,$t0
124___
125}
126
127sub BODY_15_19 {
128my ($i,$a,$b,$c,$d,$e)=@_;
129my $j=$i+1;
130
131$code.=<<___ if (!$big_endian && $i==15);
132 srl $t0,@X[$i],24 # byte swap($i)
133 srl $t1,@X[$i],8
134 andi $t2,@X[$i],0xFF00
135 sll @X[$i],@X[$i],24
136 andi $t1,0xFF00
137 sll $t2,$t2,8
138 or @X[$i],$t0
139 or @X[$i],$t1
140 or @X[$i],$t2
141___
142$code.=<<___;
143 xor @X[$j%16],@X[($j+2)%16]
144 sll $t0,$a,5 # $i
145 addu $e,$K
146 srl $t1,$a,27
147 addu $e,$t0
148 xor @X[$j%16],@X[($j+8)%16]
149 xor $t0,$c,$d
150 addu $e,$t1
151 xor @X[$j%16],@X[($j+13)%16]
152 sll $t2,$b,30
153 and $t0,$b
154 srl $t1,@X[$j%16],31
155 addu @X[$j%16],@X[$j%16]
156 srl $b,$b,2
157 xor $t0,$d
158 or @X[$j%16],$t1
159 addu $e,@X[$i%16]
160 or $b,$t2
161 addu $e,$t0
162___
163}
164
165sub BODY_20_39 {
166my ($i,$a,$b,$c,$d,$e)=@_;
167my $j=$i+1;
168$code.=<<___ if ($i<79);
169 xor @X[$j%16],@X[($j+2)%16]
170 sll $t0,$a,5 # $i
171 addu $e,$K
172 srl $t1,$a,27
173 addu $e,$t0
174 xor @X[$j%16],@X[($j+8)%16]
175 xor $t0,$c,$d
176 addu $e,$t1
177 xor @X[$j%16],@X[($j+13)%16]
178 sll $t2,$b,30
179 xor $t0,$b
180 srl $t1,@X[$j%16],31
181 addu @X[$j%16],@X[$j%16]
182 srl $b,$b,2
183 addu $e,@X[$i%16]
184 or @X[$j%16],$t1
185 or $b,$t2
186 addu $e,$t0
187___
188$code.=<<___ if ($i==79);
189 lw @X[0],0($ctx)
190 sll $t0,$a,5 # $i
191 addu $e,$K
192 lw @X[1],4($ctx)
193 srl $t1,$a,27
194 addu $e,$t0
195 lw @X[2],8($ctx)
196 xor $t0,$c,$d
197 addu $e,$t1
198 lw @X[3],12($ctx)
199 sll $t2,$b,30
200 xor $t0,$b
201 lw @X[4],16($ctx)
202 srl $b,$b,2
203 addu $e,@X[$i%16]
204 or $b,$t2
205 addu $e,$t0
206___
207}
208
209sub BODY_40_59 {
210my ($i,$a,$b,$c,$d,$e)=@_;
211my $j=$i+1;
212$code.=<<___ if ($i<79);
213 xor @X[$j%16],@X[($j+2)%16]
214 sll $t0,$a,5 # $i
215 addu $e,$K
216 srl $t1,$a,27
217 addu $e,$t0
218 xor @X[$j%16],@X[($j+8)%16]
219 and $t0,$c,$d
220 addu $e,$t1
221 xor @X[$j%16],@X[($j+13)%16]
222 sll $t2,$b,30
223 addu $e,$t0
224 srl $t1,@X[$j%16],31
225 xor $t0,$c,$d
226 addu @X[$j%16],@X[$j%16]
227 and $t0,$b
228 srl $b,$b,2
229 or @X[$j%16],$t1
230 addu $e,@X[$i%16]
231 or $b,$t2
232 addu $e,$t0
233___
234}
235
236$FRAMESIZE=16; # large enough to accommodate NUBI saved registers
237$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
238
239$code=<<___;
240#ifdef OPENSSL_FIPSCANISTER
241# include <openssl/fipssyms.h>
242#endif
243
244.text
245
246.set noat
247.set noreorder
248.align 5
249.globl sha1_block_data_order
250.ent sha1_block_data_order
251sha1_block_data_order:
252 .frame $sp,$FRAMESIZE*$SZREG,$ra
253 .mask $SAVED_REGS_MASK,-$SZREG
254 .set noreorder
255 $PTR_SUB $sp,$FRAMESIZE*$SZREG
256 $REG_S $ra,($FRAMESIZE-1)*$SZREG($sp)
257 $REG_S $fp,($FRAMESIZE-2)*$SZREG($sp)
258 $REG_S $s11,($FRAMESIZE-3)*$SZREG($sp)
259 $REG_S $s10,($FRAMESIZE-4)*$SZREG($sp)
260 $REG_S $s9,($FRAMESIZE-5)*$SZREG($sp)
261 $REG_S $s8,($FRAMESIZE-6)*$SZREG($sp)
262 $REG_S $s7,($FRAMESIZE-7)*$SZREG($sp)
263 $REG_S $s6,($FRAMESIZE-8)*$SZREG($sp)
264 $REG_S $s5,($FRAMESIZE-9)*$SZREG($sp)
265 $REG_S $s4,($FRAMESIZE-10)*$SZREG($sp)
266___
267$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
268 $REG_S $s3,($FRAMESIZE-11)*$SZREG($sp)
269 $REG_S $s2,($FRAMESIZE-12)*$SZREG($sp)
270 $REG_S $s1,($FRAMESIZE-13)*$SZREG($sp)
271 $REG_S $s0,($FRAMESIZE-14)*$SZREG($sp)
272 $REG_S $gp,($FRAMESIZE-15)*$SZREG($sp)
273___
274$code.=<<___;
275 $PTR_SLL $num,6
276 $PTR_ADD $num,$inp
277 $REG_S $num,0($sp)
278 lw $A,0($ctx)
279 lw $B,4($ctx)
280 lw $C,8($ctx)
281 lw $D,12($ctx)
282 b .Loop
283 lw $E,16($ctx)
284.align 4
285.Loop:
286 .set reorder
287 lwl @X[0],$MSB($inp)
288 lui $K,0x5a82
289 lwr @X[0],$LSB($inp)
290 ori $K,0x7999 # K_00_19
291___
292for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); }
293for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); }
294$code.=<<___;
295 lui $K,0x6ed9
296 ori $K,0xeba1 # K_20_39
297___
298for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
299$code.=<<___;
300 lui $K,0x8f1b
301 ori $K,0xbcdc # K_40_59
302___
303for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
304$code.=<<___;
305 lui $K,0xca62
306 ori $K,0xc1d6 # K_60_79
307___
308for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
309$code.=<<___;
310 $PTR_ADD $inp,64
311 $REG_L $num,0($sp)
312
313 addu $A,$X[0]
314 addu $B,$X[1]
315 sw $A,0($ctx)
316 addu $C,$X[2]
317 addu $D,$X[3]
318 sw $B,4($ctx)
319 addu $E,$X[4]
320 sw $C,8($ctx)
321 sw $D,12($ctx)
322 sw $E,16($ctx)
323 .set noreorder
324 bne $inp,$num,.Loop
325 nop
326
327 .set noreorder
328 $REG_L $ra,($FRAMESIZE-1)*$SZREG($sp)
329 $REG_L $fp,($FRAMESIZE-2)*$SZREG($sp)
330 $REG_L $s11,($FRAMESIZE-3)*$SZREG($sp)
331 $REG_L $s10,($FRAMESIZE-4)*$SZREG($sp)
332 $REG_L $s9,($FRAMESIZE-5)*$SZREG($sp)
333 $REG_L $s8,($FRAMESIZE-6)*$SZREG($sp)
334 $REG_L $s7,($FRAMESIZE-7)*$SZREG($sp)
335 $REG_L $s6,($FRAMESIZE-8)*$SZREG($sp)
336 $REG_L $s5,($FRAMESIZE-9)*$SZREG($sp)
337 $REG_L $s4,($FRAMESIZE-10)*$SZREG($sp)
338___
339$code.=<<___ if ($flavour =~ /nubi/i);
340 $REG_L $s3,($FRAMESIZE-11)*$SZREG($sp)
341 $REG_L $s2,($FRAMESIZE-12)*$SZREG($sp)
342 $REG_L $s1,($FRAMESIZE-13)*$SZREG($sp)
343 $REG_L $s0,($FRAMESIZE-14)*$SZREG($sp)
344 $REG_L $gp,($FRAMESIZE-15)*$SZREG($sp)
345___
346$code.=<<___;
347 jr $ra
348 $PTR_ADD $sp,$FRAMESIZE*$SZREG
349.end sha1_block_data_order
350.rdata
351.asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
352___
353print $code;
354close STDOUT;
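
On little-endian MIPS the words fetched with lwl/lwr still have to be byte-swapped by hand, which BODY_00_14/BODY_15_19 do with the srl/andi/sll/or sequence above. For reference, the same steps in C (a real build would normally use a bswap builtin instead); the function name is ours:

#include <stdint.h>

/* Mirror of the shift/mask swap emitted on little-endian targets:
 * move each byte of x to its mirrored position. */
static uint32_t bswap32(uint32_t x)
{
	uint32_t t0 = x >> 24;              /* top byte    -> bottom byte */
	uint32_t t1 = (x >> 8) & 0xff00;    /* byte 2      -> byte 1      */
	uint32_t t2 = (x & 0xff00) << 8;    /* byte 1      -> byte 2      */
	return (x << 24) | t2 | t1 | t0;    /* bottom byte -> top byte    */
}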
diff --git a/src/lib/libcrypto/sha/asm/sha1-parisc.pl b/src/lib/libcrypto/sha/asm/sha1-parisc.pl
new file mode 100644
index 0000000000..6d7bf495b2
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha1-parisc.pl
@@ -0,0 +1,259 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA1 block procedure for PA-RISC.
11
12# June 2009.
13#
14# On PA-7100LC performance is >30% better than gcc 3.2 generated code
15# for aligned input and >50% better for unaligned. Compared to vendor
16# compiler on PA-8600 it's almost 60% faster in 64-bit build and just
17# a few percent faster in 32-bit one (this for aligned input, data for
18# unaligned input is not available).
19#
20# Special thanks to polarhome.com for providing HP-UX account.
21
22$flavour = shift;
23$output = shift;
24open STDOUT,">$output";
25
26if ($flavour =~ /64/) {
27 $LEVEL ="2.0W";
28 $SIZE_T =8;
29 $FRAME_MARKER =80;
30 $SAVED_RP =16;
31 $PUSH ="std";
32 $PUSHMA ="std,ma";
33 $POP ="ldd";
34 $POPMB ="ldd,mb";
35} else {
36 $LEVEL ="1.0";
37 $SIZE_T =4;
38 $FRAME_MARKER =48;
39 $SAVED_RP =20;
40 $PUSH ="stw";
41 $PUSHMA ="stwm";
42 $POP ="ldw";
43 $POPMB ="ldwm";
44}
45
46$FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker
47 # [+ argument transfer]
48$ctx="%r26"; # arg0
49$inp="%r25"; # arg1
50$num="%r24"; # arg2
51
52$t0="%r28";
53$t1="%r29";
54$K="%r31";
55
56@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
57 "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0);
58
59@V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23");
60
61sub BODY_00_19 {
62my ($i,$a,$b,$c,$d,$e)=@_;
63my $j=$i+1;
64$code.=<<___ if ($i<15);
65 addl $K,$e,$e ; $i
66 shd $a,$a,27,$t1
67 addl @X[$i],$e,$e
68 and $c,$b,$t0
69 addl $t1,$e,$e
70 andcm $d,$b,$t1
71 shd $b,$b,2,$b
72 or $t1,$t0,$t0
73 addl $t0,$e,$e
74___
75$code.=<<___ if ($i>=15); # with forward Xupdate
76 addl $K,$e,$e ; $i
77 shd $a,$a,27,$t1
78 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
79 addl @X[$i%16],$e,$e
80 and $c,$b,$t0
81 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
82 addl $t1,$e,$e
83 andcm $d,$b,$t1
84 shd $b,$b,2,$b
85 or $t1,$t0,$t0
86 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
87 add $t0,$e,$e
88 shd @X[$j%16],@X[$j%16],31,@X[$j%16]
89___
90}
91
92sub BODY_20_39 {
93my ($i,$a,$b,$c,$d,$e)=@_;
94my $j=$i+1;
95$code.=<<___ if ($i<79);
96 xor @X[($j+2)%16],@X[$j%16],@X[$j%16] ; $i
97 addl $K,$e,$e
98 shd $a,$a,27,$t1
99 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
100 addl @X[$i%16],$e,$e
101 xor $b,$c,$t0
102 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
103 addl $t1,$e,$e
104 shd $b,$b,2,$b
105 xor $d,$t0,$t0
106 shd @X[$j%16],@X[$j%16],31,@X[$j%16]
107 addl $t0,$e,$e
108___
109$code.=<<___ if ($i==79); # with context load
110 ldw 0($ctx),@X[0] ; $i
111 addl $K,$e,$e
112 shd $a,$a,27,$t1
113 ldw 4($ctx),@X[1]
114 addl @X[$i%16],$e,$e
115 xor $b,$c,$t0
116 ldw 8($ctx),@X[2]
117 addl $t1,$e,$e
118 shd $b,$b,2,$b
119 xor $d,$t0,$t0
120 ldw 12($ctx),@X[3]
121 addl $t0,$e,$e
122 ldw 16($ctx),@X[4]
123___
124}
125
126sub BODY_40_59 {
127my ($i,$a,$b,$c,$d,$e)=@_;
128my $j=$i+1;
129$code.=<<___;
130 shd $a,$a,27,$t1 ; $i
131 addl $K,$e,$e
132 xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
133 xor $d,$c,$t0
134 addl @X[$i%16],$e,$e
135 xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
136 and $b,$t0,$t0
137 addl $t1,$e,$e
138 shd $b,$b,2,$b
139 xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
140 addl $t0,$e,$e
141 and $d,$c,$t1
142 shd @X[$j%16],@X[$j%16],31,@X[$j%16]
143 addl $t1,$e,$e
144___
145}
146
147$code=<<___;
148 .LEVEL $LEVEL
149 .SPACE \$TEXT\$
150 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
151
152 .EXPORT sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
153sha1_block_data_order
154 .PROC
155 .CALLINFO FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16
156 .ENTRY
157 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
158 $PUSHMA %r3,$FRAME(%sp)
159 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
160 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
161 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
162 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
163 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
164 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
165 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
166 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
167 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
168 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
169 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
170 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
171 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
172
173 ldw 0($ctx),$A
174 ldw 4($ctx),$B
175 ldw 8($ctx),$C
176 ldw 12($ctx),$D
177 ldw 16($ctx),$E
178
179 extru $inp,31,2,$t0 ; t0=inp&3;
180 sh3addl $t0,%r0,$t0 ; t0*=8;
181 subi 32,$t0,$t0 ; t0=32-t0;
182 mtctl $t0,%cr11 ; %sar=t0;
183
184L\$oop
185 ldi 3,$t0
186 andcm $inp,$t0,$t0 ; 64-bit neutral
187___
188 for ($i=0;$i<15;$i++) { # load input block
189 $code.="\tldw `4*$i`($t0),@X[$i]\n"; }
190$code.=<<___;
191 cmpb,*= $inp,$t0,L\$aligned
192 ldw 60($t0),@X[15]
193 ldw 64($t0),@X[16]
194___
195 for ($i=0;$i<16;$i++) { # align input
196 $code.="\tvshd @X[$i],@X[$i+1],@X[$i]\n"; }
197$code.=<<___;
198L\$aligned
199 ldil L'0x5a827000,$K ; K_00_19
200 ldo 0x999($K),$K
201___
202for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
203$code.=<<___;
204 ldil L'0x6ed9e000,$K ; K_20_39
205 ldo 0xba1($K),$K
206___
207
208for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
209$code.=<<___;
210 ldil L'0x8f1bb000,$K ; K_40_59
211 ldo 0xcdc($K),$K
212___
213
214for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
215$code.=<<___;
216 ldil L'0xca62c000,$K ; K_60_79
217 ldo 0x1d6($K),$K
218___
219for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
220
221$code.=<<___;
222 addl @X[0],$A,$A
223 addl @X[1],$B,$B
224 addl @X[2],$C,$C
225 addl @X[3],$D,$D
226 addl @X[4],$E,$E
227 stw $A,0($ctx)
228 stw $B,4($ctx)
229 stw $C,8($ctx)
230 stw $D,12($ctx)
231 stw $E,16($ctx)
232 addib,*<> -1,$num,L\$oop
233 ldo 64($inp),$inp
234
235 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
236 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
237 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
238 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
239 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
240 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
241 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
242 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
243 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
244 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
245 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
246 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
247 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
248 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
249 bv (%r2)
250 .EXIT
251 $POPMB -$FRAME(%sp),%r3
252 .PROCEND
253 .STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
254___
255
256$code =~ s/\`([^\`]*)\`/eval $1/gem;
257$code =~ s/,\*/,/gm if ($SIZE_T==4);
258print $code;
259close STDOUT;
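
All rotates in the PA-RISC code go through shd, the shift-double instruction; with both source operands equal it degenerates into a rotate, so "shd $a,$a,27,$t1" yields ROTATE(a,5), "shd $b,$b,2,$b" yields ROTATE(b,30), and the shd by 31 in the Xupdate lines is the rotate by 1. A small C model of the primitive, assuming a shift amount of 1..31:

#include <stdint.h>

/* shd hi,lo,n: shift the 64-bit concatenation hi:lo right by n and
 * keep the low 32 bits.  With hi == lo == x this is a rotate right
 * by n, i.e. a rotate left by 32-n. */
static uint32_t shd(uint32_t hi, uint32_t lo, unsigned n)   /* 1..31 */
{
	return (hi << (32 - n)) | (lo >> n);
}

/* e.g. shd(a, a, 27) == ROTATE(a, 5); shd(b, b, 2) == ROTATE(b, 30) */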
diff --git a/src/lib/libcrypto/sha/asm/sha1-ppc.pl b/src/lib/libcrypto/sha/asm/sha1-ppc.pl
index dcd0fcdfcf..2140dd2f8d 100755
--- a/src/lib/libcrypto/sha/asm/sha1-ppc.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-ppc.pl
@@ -24,12 +24,14 @@ $flavour = shift;
24 24
25if ($flavour =~ /64/) { 25if ($flavour =~ /64/) {
26 $SIZE_T =8; 26 $SIZE_T =8;
27 $LRSAVE =2*$SIZE_T;
27 $UCMP ="cmpld"; 28 $UCMP ="cmpld";
28 $STU ="stdu"; 29 $STU ="stdu";
29 $POP ="ld"; 30 $POP ="ld";
30 $PUSH ="std"; 31 $PUSH ="std";
31} elsif ($flavour =~ /32/) { 32} elsif ($flavour =~ /32/) {
32 $SIZE_T =4; 33 $SIZE_T =4;
34 $LRSAVE =$SIZE_T;
33 $UCMP ="cmplw"; 35 $UCMP ="cmplw";
34 $STU ="stwu"; 36 $STU ="stwu";
35 $POP ="lwz"; 37 $POP ="lwz";
@@ -43,7 +45,8 @@ die "can't locate ppc-xlate.pl";
43 45
44open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; 46open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
45 47
46$FRAME=24*$SIZE_T; 48$FRAME=24*$SIZE_T+64;
49$LOCALS=6*$SIZE_T;
47 50
48$K ="r0"; 51$K ="r0";
49$sp ="r1"; 52$sp ="r1";
@@ -162,9 +165,8 @@ $code=<<___;
162.globl .sha1_block_data_order 165.globl .sha1_block_data_order
163.align 4 166.align 4
164.sha1_block_data_order: 167.sha1_block_data_order:
168 $STU $sp,-$FRAME($sp)
165 mflr r0 169 mflr r0
166 $STU $sp,`-($FRAME+64)`($sp)
167 $PUSH r0,`$FRAME-$SIZE_T*18`($sp)
168 $PUSH r15,`$FRAME-$SIZE_T*17`($sp) 170 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
169 $PUSH r16,`$FRAME-$SIZE_T*16`($sp) 171 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
170 $PUSH r17,`$FRAME-$SIZE_T*15`($sp) 172 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
@@ -182,6 +184,7 @@ $code=<<___;
182 $PUSH r29,`$FRAME-$SIZE_T*3`($sp) 184 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
183 $PUSH r30,`$FRAME-$SIZE_T*2`($sp) 185 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
184 $PUSH r31,`$FRAME-$SIZE_T*1`($sp) 186 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
187 $PUSH r0,`$FRAME+$LRSAVE`($sp)
185 lwz $A,0($ctx) 188 lwz $A,0($ctx)
186 lwz $B,4($ctx) 189 lwz $B,4($ctx)
187 lwz $C,8($ctx) 190 lwz $C,8($ctx)
@@ -192,37 +195,14 @@ $code=<<___;
192Laligned: 195Laligned:
193 mtctr $num 196 mtctr $num
194 bl Lsha1_block_private 197 bl Lsha1_block_private
195Ldone: 198 b Ldone
196 $POP r0,`$FRAME-$SIZE_T*18`($sp)
197 $POP r15,`$FRAME-$SIZE_T*17`($sp)
198 $POP r16,`$FRAME-$SIZE_T*16`($sp)
199 $POP r17,`$FRAME-$SIZE_T*15`($sp)
200 $POP r18,`$FRAME-$SIZE_T*14`($sp)
201 $POP r19,`$FRAME-$SIZE_T*13`($sp)
202 $POP r20,`$FRAME-$SIZE_T*12`($sp)
203 $POP r21,`$FRAME-$SIZE_T*11`($sp)
204 $POP r22,`$FRAME-$SIZE_T*10`($sp)
205 $POP r23,`$FRAME-$SIZE_T*9`($sp)
206 $POP r24,`$FRAME-$SIZE_T*8`($sp)
207 $POP r25,`$FRAME-$SIZE_T*7`($sp)
208 $POP r26,`$FRAME-$SIZE_T*6`($sp)
209 $POP r27,`$FRAME-$SIZE_T*5`($sp)
210 $POP r28,`$FRAME-$SIZE_T*4`($sp)
211 $POP r29,`$FRAME-$SIZE_T*3`($sp)
212 $POP r30,`$FRAME-$SIZE_T*2`($sp)
213 $POP r31,`$FRAME-$SIZE_T*1`($sp)
214 mtlr r0
215 addi $sp,$sp,`$FRAME+64`
216 blr
217___
218 199
219# PowerPC specification allows an implementation to be ill-behaved 200; PowerPC specification allows an implementation to be ill-behaved
220# upon unaligned access which crosses page boundary. "Better safe 201; upon unaligned access which crosses page boundary. "Better safe
221# than sorry" principle makes me treat it specially. But I don't 202; than sorry" principle makes me treat it specially. But I don't
222# look for particular offending word, but rather for 64-byte input 203; look for particular offending word, but rather for 64-byte input
223# block which crosses the boundary. Once found that block is aligned 204; block which crosses the boundary. Once found that block is aligned
224# and hashed separately... 205; and hashed separately...
225$code.=<<___;
226.align 4 206.align 4
227Lunaligned: 207Lunaligned:
228 subfic $t1,$inp,4096 208 subfic $t1,$inp,4096
@@ -237,7 +217,7 @@ Lunaligned:
237Lcross_page: 217Lcross_page:
238 li $t1,16 218 li $t1,16
239 mtctr $t1 219 mtctr $t1
240 addi r20,$sp,$FRAME ; spot below the frame 220 addi r20,$sp,$LOCALS ; spot within the frame
241Lmemcpy: 221Lmemcpy:
242 lbz r16,0($inp) 222 lbz r16,0($inp)
243 lbz r17,1($inp) 223 lbz r17,1($inp)
@@ -251,15 +231,40 @@ Lmemcpy:
251 addi r20,r20,4 231 addi r20,r20,4
252 bdnz Lmemcpy 232 bdnz Lmemcpy
253 233
254 $PUSH $inp,`$FRAME-$SIZE_T*19`($sp) 234 $PUSH $inp,`$FRAME-$SIZE_T*18`($sp)
255 li $t1,1 235 li $t1,1
256 addi $inp,$sp,$FRAME 236 addi $inp,$sp,$LOCALS
257 mtctr $t1 237 mtctr $t1
258 bl Lsha1_block_private 238 bl Lsha1_block_private
259 $POP $inp,`$FRAME-$SIZE_T*19`($sp) 239 $POP $inp,`$FRAME-$SIZE_T*18`($sp)
260 addic. $num,$num,-1 240 addic. $num,$num,-1
261 bne- Lunaligned 241 bne- Lunaligned
262 b Ldone 242
243Ldone:
244 $POP r0,`$FRAME+$LRSAVE`($sp)
245 $POP r15,`$FRAME-$SIZE_T*17`($sp)
246 $POP r16,`$FRAME-$SIZE_T*16`($sp)
247 $POP r17,`$FRAME-$SIZE_T*15`($sp)
248 $POP r18,`$FRAME-$SIZE_T*14`($sp)
249 $POP r19,`$FRAME-$SIZE_T*13`($sp)
250 $POP r20,`$FRAME-$SIZE_T*12`($sp)
251 $POP r21,`$FRAME-$SIZE_T*11`($sp)
252 $POP r22,`$FRAME-$SIZE_T*10`($sp)
253 $POP r23,`$FRAME-$SIZE_T*9`($sp)
254 $POP r24,`$FRAME-$SIZE_T*8`($sp)
255 $POP r25,`$FRAME-$SIZE_T*7`($sp)
256 $POP r26,`$FRAME-$SIZE_T*6`($sp)
257 $POP r27,`$FRAME-$SIZE_T*5`($sp)
258 $POP r28,`$FRAME-$SIZE_T*4`($sp)
259 $POP r29,`$FRAME-$SIZE_T*3`($sp)
260 $POP r30,`$FRAME-$SIZE_T*2`($sp)
261 $POP r31,`$FRAME-$SIZE_T*1`($sp)
262 mtlr r0
263 addi $sp,$sp,$FRAME
264 blr
265 .long 0
266 .byte 0,12,4,1,0x80,18,3,0
267 .long 0
263___ 268___
264 269
265# This is private block function, which uses tailored calling 270# This is private block function, which uses tailored calling
@@ -309,6 +314,8 @@ $code.=<<___;
309 addi $inp,$inp,`16*4` 314 addi $inp,$inp,`16*4`
310 bdnz- Lsha1_block_private 315 bdnz- Lsha1_block_private
311 blr 316 blr
317 .long 0
318 .byte 0,12,0x14,0,0,0,0,0
312___ 319___
313$code.=<<___; 320$code.=<<___;
314.asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" 321.asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
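
The comment block above spells out the unaligned-input policy on PPC: instead of hunting for the particular offending word, any 64-byte block that crosses a page boundary is copied into scratch space inside the frame (now at $LOCALS rather than below the frame) and hashed from there. A simplified per-block C sketch of that policy; sha1_compress() is a stand-in for the private block routine, not an OpenSSL symbol:

#include <stdint.h>
#include <string.h>

extern void sha1_compress(uint32_t state[5], const unsigned char *block);

void sha1_blocks(uint32_t state[5], const unsigned char *inp, size_t num)
{
	unsigned char bounce[64];           /* scratch within our own frame */

	while (num--) {
		size_t room = 4096 - ((uintptr_t)inp & 4095);
		if (room < 64) {            /* block straddles a page boundary */
			memcpy(bounce, inp, 64);
			sha1_compress(state, bounce);
		} else {
			sha1_compress(state, inp);
		}
		inp += 64;
	}
}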
diff --git a/src/lib/libcrypto/sha/asm/sha1-s390x.pl b/src/lib/libcrypto/sha/asm/sha1-s390x.pl
index 4b17848287..9193dda45e 100644
--- a/src/lib/libcrypto/sha/asm/sha1-s390x.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-s390x.pl
@@ -21,9 +21,28 @@
21# instructions to favour dual-issue z10 pipeline. On z10 hardware is 21# instructions to favour dual-issue z10 pipeline. On z10 hardware is
22# "only" ~2.3x faster than software. 22# "only" ~2.3x faster than software.
23 23
24# November 2010.
25#
26# Adapt for -m31 build. If kernel supports what's called "highgprs"
27# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
28# instructions and achieve "64-bit" performance even in 31-bit legacy
29# application context. The feature is not specific to any particular
30# processor, as long as it's "z-CPU". The latter implies that the code
31# remains z/Architecture specific.
32
24$kimdfunc=1; # magic function code for kimd instruction 33$kimdfunc=1; # magic function code for kimd instruction
25 34
26$output=shift; 35$flavour = shift;
36
37if ($flavour =~ /3[12]/) {
38 $SIZE_T=4;
39 $g="";
40} else {
41 $SIZE_T=8;
42 $g="g";
43}
44
45while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
27open STDOUT,">$output"; 46open STDOUT,">$output";
28 47
29$K_00_39="%r0"; $K=$K_00_39; 48$K_00_39="%r0"; $K=$K_00_39;
@@ -42,13 +61,14 @@ $t1="%r11";
42@X=("%r12","%r13","%r14"); 61@X=("%r12","%r13","%r14");
43$sp="%r15"; 62$sp="%r15";
44 63
45$frame=160+16*4; 64$stdframe=16*$SIZE_T+4*8;
65$frame=$stdframe+16*4;
46 66
47sub Xupdate { 67sub Xupdate {
48my $i=shift; 68my $i=shift;
49 69
50$code.=<<___ if ($i==15); 70$code.=<<___ if ($i==15);
51 lg $prefetch,160($sp) ### Xupdate(16) warm-up 71 lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up
52 lr $X[0],$X[2] 72 lr $X[0],$X[2]
53___ 73___
54return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle 74return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle
@@ -58,8 +78,8 @@ $code.=<<___ if ($i<16);
58___ 78___
59$code.=<<___ if ($i>=16); 79$code.=<<___ if ($i>=16);
60 xgr $X[0],$prefetch ### Xupdate($i) 80 xgr $X[0],$prefetch ### Xupdate($i)
61 lg $prefetch,`160+4*(($i+2)%16)`($sp) 81 lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp)
62 xg $X[0],`160+4*(($i+8)%16)`($sp) 82 xg $X[0],`$stdframe+4*(($i+8)%16)`($sp)
63 xgr $X[0],$prefetch 83 xgr $X[0],$prefetch
64 rll $X[0],$X[0],1 84 rll $X[0],$X[0],1
65 rllg $X[1],$X[0],32 85 rllg $X[1],$X[0],32
@@ -68,7 +88,7 @@ $code.=<<___ if ($i>=16);
68 lr $X[2],$X[1] # feedback 88 lr $X[2],$X[1] # feedback
69___ 89___
70$code.=<<___ if ($i<=70); 90$code.=<<___ if ($i<=70);
71 stg $X[0],`160+4*($i%16)`($sp) 91 stg $X[0],`$stdframe+4*($i%16)`($sp)
72___ 92___
73unshift(@X,pop(@X)); 93unshift(@X,pop(@X));
74} 94}
@@ -148,9 +168,9 @@ $code.=<<___ if ($kimdfunc);
148 tmhl %r0,0x4000 # check for message-security assist 168 tmhl %r0,0x4000 # check for message-security assist
149 jz .Lsoftware 169 jz .Lsoftware
150 lghi %r0,0 170 lghi %r0,0
151 la %r1,16($sp) 171 la %r1,`2*$SIZE_T`($sp)
152 .long 0xb93e0002 # kimd %r0,%r2 172 .long 0xb93e0002 # kimd %r0,%r2
153 lg %r0,16($sp) 173 lg %r0,`2*$SIZE_T`($sp)
154 tmhh %r0,`0x8000>>$kimdfunc` 174 tmhh %r0,`0x8000>>$kimdfunc`
155 jz .Lsoftware 175 jz .Lsoftware
156 lghi %r0,$kimdfunc 176 lghi %r0,$kimdfunc
@@ -165,11 +185,11 @@ $code.=<<___ if ($kimdfunc);
165___ 185___
166$code.=<<___; 186$code.=<<___;
167 lghi %r1,-$frame 187 lghi %r1,-$frame
168 stg $ctx,16($sp) 188 st${g} $ctx,`2*$SIZE_T`($sp)
169 stmg %r6,%r15,48($sp) 189 stm${g} %r6,%r15,`6*$SIZE_T`($sp)
170 lgr %r0,$sp 190 lgr %r0,$sp
171 la $sp,0(%r1,$sp) 191 la $sp,0(%r1,$sp)
172 stg %r0,0($sp) 192 st${g} %r0,0($sp)
173 193
174 larl $t0,Ktable 194 larl $t0,Ktable
175 llgf $A,0($ctx) 195 llgf $A,0($ctx)
@@ -199,7 +219,7 @@ ___
199for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } 219for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
200$code.=<<___; 220$code.=<<___;
201 221
202 lg $ctx,`$frame+16`($sp) 222 l${g} $ctx,`$frame+2*$SIZE_T`($sp)
203 la $inp,64($inp) 223 la $inp,64($inp)
204 al $A,0($ctx) 224 al $A,0($ctx)
205 al $B,4($ctx) 225 al $B,4($ctx)
@@ -211,13 +231,13 @@ $code.=<<___;
211 st $C,8($ctx) 231 st $C,8($ctx)
212 st $D,12($ctx) 232 st $D,12($ctx)
213 st $E,16($ctx) 233 st $E,16($ctx)
214 brct $len,.Lloop 234 brct${g} $len,.Lloop
215 235
216 lmg %r6,%r15,`$frame+48`($sp) 236 lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
217 br %r14 237 br %r14
218.size sha1_block_data_order,.-sha1_block_data_order 238.size sha1_block_data_order,.-sha1_block_data_order
219.string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" 239.string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
220.comm OPENSSL_s390xcap_P,8,8 240.comm OPENSSL_s390xcap_P,16,8
221___ 241___
222 242
223$code =~ s/\`([^\`]*)\`/eval $1/gem; 243$code =~ s/\`([^\`]*)\`/eval $1/gem;
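
The -m31 adaptation replaces the hardcoded 160-byte save area with a size derived from the ABI word width, so one generator serves both builds: $stdframe = 16*$SIZE_T + 4*8, i.e. 96 bytes for -m31 and the familiar 160 bytes for 64-bit. A trivial check of the arithmetic:

#include <stdio.h>

int main(void)
{
	/* stdframe = 16 pointer-sized slots + 4*8, as in the script above */
	printf("-m31: %d\n", 16 * 4 + 4 * 8);   /* 96  */
	printf("-m64: %d\n", 16 * 8 + 4 * 8);   /* 160 */
	return 0;
}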
diff --git a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
index 4edc5ea9ad..f27c1e3fb0 100755
--- a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
@@ -16,7 +16,7 @@
16# There was suggestion to mechanically translate 32-bit code, but I 16# There was suggestion to mechanically translate 32-bit code, but I
17# dismissed it, reasoning that x86_64 offers enough register bank 17# dismissed it, reasoning that x86_64 offers enough register bank
18# capacity to fully utilize SHA-1 parallelism. Therefore this fresh 18# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
19# implementation:-) However! While 64-bit code does performs better 19# implementation:-) However! While 64-bit code does perform better
20# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well, 20# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
21# x86_64 does offer larger *addressable* bank, but out-of-order core 21# x86_64 does offer larger *addressable* bank, but out-of-order core
22# reaches for even more registers through dynamic aliasing, and EM64T 22# reaches for even more registers through dynamic aliasing, and EM64T
@@ -29,6 +29,38 @@
29# Xeon P4 +65% +0% 9.9 29# Xeon P4 +65% +0% 9.9
30# Core2 +60% +10% 7.0 30# Core2 +60% +10% 7.0
31 31
32# August 2009.
33#
34# The code was revised to minimize code size and to maximize
35# "distance" between instructions producing input to 'lea'
36# instruction and the 'lea' instruction itself, which is essential
37# for Intel Atom core.
38
39# October 2010.
40#
41# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
42# is to offload message schedule denoted by Wt in NIST specification,
43# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
44# for background and implementation details. The only difference from
45# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
46# to free temporary registers.
47
48# April 2011.
49#
50# Add AVX code path. See sha1-586.pl for further information.
51
52######################################################################
53# Current performance is summarized in following table. Numbers are
54# CPU clock cycles spent to process single byte (less is better).
55#
56# x86_64 SSSE3 AVX
57# P4 9.8 -
58# Opteron 6.6 -
59# Core2 6.7 6.1/+10% -
60# Atom 11.0 9.7/+13% -
61# Westmere 7.1 5.6/+27% -
62# Sandy Bridge 7.9 6.3/+25% 5.2/+51%
63
32$flavour = shift; 64$flavour = shift;
33$output = shift; 65$output = shift;
34if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 66if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
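
The SSSE3/AVX paths exist to lift the Xupdate recurrence (Wt in the NIST specification) off the integer unit. In scalar form it is just a 16-word circular buffer updated in place; an illustrative C version for rounds 16..79 follows (the SIMD code computes four such words per step in XMM registers):

#include <stdint.h>

static uint32_t rotl32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

/* W[t] = ROTATE(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1), kept in a
 * 16-entry circular buffer and written back over the W[t-16] slot. */
static uint32_t xupdate(uint32_t X[16], int t)
{
	uint32_t w = X[(t + 13) % 16] ^ X[(t + 8) % 16] ^
	             X[(t + 2) % 16] ^ X[t % 16];
	X[t % 16] = rotl32(w, 1);
	return X[t % 16];
}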
@@ -40,6 +72,16 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
40( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 72( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
41die "can't locate x86_64-xlate.pl"; 73die "can't locate x86_64-xlate.pl";
42 74
75$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
76 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
77 $1>=2.19);
78$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
79 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
80 $1>=2.09);
81$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
82 `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
83 $1>=10);
84
43open STDOUT,"| $^X $xlate $flavour $output"; 85open STDOUT,"| $^X $xlate $flavour $output";
44 86
45$ctx="%rdi"; # 1st arg 87$ctx="%rdi"; # 1st arg
@@ -51,196 +93,994 @@ $ctx="%r8";
51$inp="%r9"; 93$inp="%r9";
52$num="%r10"; 94$num="%r10";
53 95
54$xi="%eax"; 96$t0="%eax";
55$t0="%ebx"; 97$t1="%ebx";
56$t1="%ecx"; 98$t2="%ecx";
57$A="%edx"; 99@xi=("%edx","%ebp");
58$B="%esi"; 100$A="%esi";
59$C="%edi"; 101$B="%edi";
60$D="%ebp"; 102$C="%r11d";
61$E="%r11d"; 103$D="%r12d";
62$T="%r12d"; 104$E="%r13d";
63
64@V=($A,$B,$C,$D,$E,$T);
65 105
66sub PROLOGUE { 106@V=($A,$B,$C,$D,$E);
67my $func=shift;
68$code.=<<___;
69.globl $func
70.type $func,\@function,3
71.align 16
72$func:
73 push %rbx
74 push %rbp
75 push %r12
76 mov %rsp,%r11
77 mov %rdi,$ctx # reassigned argument
78 sub \$`8+16*4`,%rsp
79 mov %rsi,$inp # reassigned argument
80 and \$-64,%rsp
81 mov %rdx,$num # reassigned argument
82 mov %r11,`16*4`(%rsp)
83.Lprologue:
84
85 mov 0($ctx),$A
86 mov 4($ctx),$B
87 mov 8($ctx),$C
88 mov 12($ctx),$D
89 mov 16($ctx),$E
90___
91}
92
93sub EPILOGUE {
94my $func=shift;
95$code.=<<___;
96 mov `16*4`(%rsp),%rsi
97 mov (%rsi),%r12
98 mov 8(%rsi),%rbp
99 mov 16(%rsi),%rbx
100 lea 24(%rsi),%rsp
101.Lepilogue:
102 ret
103.size $func,.-$func
104___
105}
106 107
107sub BODY_00_19 { 108sub BODY_00_19 {
108my ($i,$a,$b,$c,$d,$e,$f,$host)=@_; 109my ($i,$a,$b,$c,$d,$e)=@_;
109my $j=$i+1; 110my $j=$i+1;
110$code.=<<___ if ($i==0); 111$code.=<<___ if ($i==0);
111 mov `4*$i`($inp),$xi 112 mov `4*$i`($inp),$xi[0]
112 `"bswap $xi" if(!defined($host))` 113 bswap $xi[0]
113 mov $xi,`4*$i`(%rsp) 114 mov $xi[0],`4*$i`(%rsp)
114___ 115___
115$code.=<<___ if ($i<15); 116$code.=<<___ if ($i<15);
116 lea 0x5a827999($xi,$e),$f
117 mov $c,$t0 117 mov $c,$t0
118 mov `4*$j`($inp),$xi 118 mov `4*$j`($inp),$xi[1]
119 mov $a,$e 119 mov $a,$t2
120 xor $d,$t0 120 xor $d,$t0
121 `"bswap $xi" if(!defined($host))` 121 bswap $xi[1]
122 rol \$5,$e 122 rol \$5,$t2
123 lea 0x5a827999($xi[0],$e),$e
123 and $b,$t0 124 and $b,$t0
124 mov $xi,`4*$j`(%rsp) 125 mov $xi[1],`4*$j`(%rsp)
125 add $e,$f 126 add $t2,$e
126 xor $d,$t0 127 xor $d,$t0
127 rol \$30,$b 128 rol \$30,$b
128 add $t0,$f 129 add $t0,$e
129___ 130___
130$code.=<<___ if ($i>=15); 131$code.=<<___ if ($i>=15);
131 lea 0x5a827999($xi,$e),$f 132 mov `4*($j%16)`(%rsp),$xi[1]
132 mov `4*($j%16)`(%rsp),$xi
133 mov $c,$t0 133 mov $c,$t0
134 mov $a,$e 134 mov $a,$t2
135 xor `4*(($j+2)%16)`(%rsp),$xi 135 xor `4*(($j+2)%16)`(%rsp),$xi[1]
136 xor $d,$t0 136 xor $d,$t0
137 rol \$5,$e 137 rol \$5,$t2
138 xor `4*(($j+8)%16)`(%rsp),$xi 138 xor `4*(($j+8)%16)`(%rsp),$xi[1]
139 and $b,$t0 139 and $b,$t0
140 add $e,$f 140 lea 0x5a827999($xi[0],$e),$e
141 xor `4*(($j+13)%16)`(%rsp),$xi 141 xor `4*(($j+13)%16)`(%rsp),$xi[1]
142 xor $d,$t0 142 xor $d,$t0
143 rol \$1,$xi[1]
144 add $t2,$e
143 rol \$30,$b 145 rol \$30,$b
144 add $t0,$f 146 mov $xi[1],`4*($j%16)`(%rsp)
145 rol \$1,$xi 147 add $t0,$e
146 mov $xi,`4*($j%16)`(%rsp)
147___ 148___
149unshift(@xi,pop(@xi));
148} 150}
149 151
150sub BODY_20_39 { 152sub BODY_20_39 {
151my ($i,$a,$b,$c,$d,$e,$f)=@_; 153my ($i,$a,$b,$c,$d,$e)=@_;
152my $j=$i+1; 154my $j=$i+1;
153my $K=($i<40)?0x6ed9eba1:0xca62c1d6; 155my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
154$code.=<<___ if ($i<79); 156$code.=<<___ if ($i<79);
155 lea $K($xi,$e),$f 157 mov `4*($j%16)`(%rsp),$xi[1]
156 mov `4*($j%16)`(%rsp),$xi
157 mov $c,$t0 158 mov $c,$t0
158 mov $a,$e 159 mov $a,$t2
159 xor `4*(($j+2)%16)`(%rsp),$xi 160 xor `4*(($j+2)%16)`(%rsp),$xi[1]
160 xor $b,$t0 161 xor $b,$t0
161 rol \$5,$e 162 rol \$5,$t2
162 xor `4*(($j+8)%16)`(%rsp),$xi 163 lea $K($xi[0],$e),$e
164 xor `4*(($j+8)%16)`(%rsp),$xi[1]
163 xor $d,$t0 165 xor $d,$t0
164 add $e,$f 166 add $t2,$e
165 xor `4*(($j+13)%16)`(%rsp),$xi 167 xor `4*(($j+13)%16)`(%rsp),$xi[1]
166 rol \$30,$b 168 rol \$30,$b
167 add $t0,$f 169 add $t0,$e
168 rol \$1,$xi 170 rol \$1,$xi[1]
169___ 171___
170$code.=<<___ if ($i<76); 172$code.=<<___ if ($i<76);
171 mov $xi,`4*($j%16)`(%rsp) 173 mov $xi[1],`4*($j%16)`(%rsp)
172___ 174___
173$code.=<<___ if ($i==79); 175$code.=<<___ if ($i==79);
174 lea $K($xi,$e),$f
175 mov $c,$t0 176 mov $c,$t0
176 mov $a,$e 177 mov $a,$t2
177 xor $b,$t0 178 xor $b,$t0
178 rol \$5,$e 179 lea $K($xi[0],$e),$e
180 rol \$5,$t2
179 xor $d,$t0 181 xor $d,$t0
180 add $e,$f 182 add $t2,$e
181 rol \$30,$b 183 rol \$30,$b
182 add $t0,$f 184 add $t0,$e
183___ 185___
186unshift(@xi,pop(@xi));
184} 187}
185 188
186sub BODY_40_59 { 189sub BODY_40_59 {
187my ($i,$a,$b,$c,$d,$e,$f)=@_; 190my ($i,$a,$b,$c,$d,$e)=@_;
188my $j=$i+1; 191my $j=$i+1;
189$code.=<<___; 192$code.=<<___;
190 lea 0x8f1bbcdc($xi,$e),$f 193 mov `4*($j%16)`(%rsp),$xi[1]
191 mov `4*($j%16)`(%rsp),$xi 194 mov $c,$t0
192 mov $b,$t0 195 mov $c,$t1
193 mov $b,$t1 196 xor `4*(($j+2)%16)`(%rsp),$xi[1]
194 xor `4*(($j+2)%16)`(%rsp),$xi 197 and $d,$t0
195 mov $a,$e 198 mov $a,$t2
196 and $c,$t0 199 xor `4*(($j+8)%16)`(%rsp),$xi[1]
197 xor `4*(($j+8)%16)`(%rsp),$xi 200 xor $d,$t1
198 or $c,$t1 201 lea 0x8f1bbcdc($xi[0],$e),$e
199 rol \$5,$e 202 rol \$5,$t2
200 xor `4*(($j+13)%16)`(%rsp),$xi 203 xor `4*(($j+13)%16)`(%rsp),$xi[1]
201 and $d,$t1 204 add $t0,$e
202 add $e,$f 205 and $b,$t1
203 rol \$1,$xi 206 rol \$1,$xi[1]
204 or $t1,$t0 207 add $t1,$e
205 rol \$30,$b 208 rol \$30,$b
206 mov $xi,`4*($j%16)`(%rsp) 209 mov $xi[1],`4*($j%16)`(%rsp)
207 add $t0,$f 210 add $t2,$e
208___ 211___
212unshift(@xi,pop(@xi));
209} 213}
210 214
211$code=".text\n"; 215$code.=<<___;
216.text
217.extern OPENSSL_ia32cap_P
212 218
213&PROLOGUE("sha1_block_data_order"); 219.globl sha1_block_data_order
214$code.=".align 4\n.Lloop:\n"; 220.type sha1_block_data_order,\@function,3
221.align 16
222sha1_block_data_order:
223 mov OPENSSL_ia32cap_P+0(%rip),%r9d
224 mov OPENSSL_ia32cap_P+4(%rip),%r8d
225 test \$`1<<9`,%r8d # check SSSE3 bit
226 jz .Lialu
227___
228$code.=<<___ if ($avx);
229 and \$`1<<28`,%r8d # mask AVX bit
230 and \$`1<<30`,%r9d # mask "Intel CPU" bit
231 or %r9d,%r8d
232 cmp \$`1<<28|1<<30`,%r8d
233 je _avx_shortcut
234___
235$code.=<<___;
236 jmp _ssse3_shortcut
237
238.align 16
239.Lialu:
240 push %rbx
241 push %rbp
242 push %r12
243 push %r13
244 mov %rsp,%r11
245 mov %rdi,$ctx # reassigned argument
246 sub \$`8+16*4`,%rsp
247 mov %rsi,$inp # reassigned argument
248 and \$-64,%rsp
249 mov %rdx,$num # reassigned argument
250 mov %r11,`16*4`(%rsp)
251.Lprologue:
252
253 mov 0($ctx),$A
254 mov 4($ctx),$B
255 mov 8($ctx),$C
256 mov 12($ctx),$D
257 mov 16($ctx),$E
258 jmp .Lloop
259
260.align 16
261.Lloop:
262___
215for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } 263for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
216for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } 264for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
217for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } 265for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
218for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } 266for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
219$code.=<<___; 267$code.=<<___;
220 add 0($ctx),$E 268 add 0($ctx),$A
221 add 4($ctx),$T 269 add 4($ctx),$B
222 add 8($ctx),$A 270 add 8($ctx),$C
223 add 12($ctx),$B 271 add 12($ctx),$D
224 add 16($ctx),$C 272 add 16($ctx),$E
225 mov $E,0($ctx) 273 mov $A,0($ctx)
226 mov $T,4($ctx) 274 mov $B,4($ctx)
227 mov $A,8($ctx) 275 mov $C,8($ctx)
228 mov $B,12($ctx) 276 mov $D,12($ctx)
229 mov $C,16($ctx) 277 mov $E,16($ctx)
230 278
231 xchg $E,$A # mov $E,$A
232 xchg $T,$B # mov $T,$B
233 xchg $E,$C # mov $A,$C
234 xchg $T,$D # mov $B,$D
235 # mov $C,$E
236 lea `16*4`($inp),$inp
237 sub \$1,$num 279 sub \$1,$num
280 lea `16*4`($inp),$inp
238 jnz .Lloop 281 jnz .Lloop
282
283 mov `16*4`(%rsp),%rsi
284 mov (%rsi),%r13
285 mov 8(%rsi),%r12
286 mov 16(%rsi),%rbp
287 mov 24(%rsi),%rbx
288 lea 32(%rsi),%rsp
289.Lepilogue:
290 ret
291.size sha1_block_data_order,.-sha1_block_data_order
239___ 292___
240&EPILOGUE("sha1_block_data_order"); 293{{{
294my $Xi=4;
295my @X=map("%xmm$_",(4..7,0..3));
296my @Tx=map("%xmm$_",(8..10));
297my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
298my @T=("%esi","%edi");
299my $j=0;
300my $K_XX_XX="%r11";
301
302my $_rol=sub { &rol(@_) };
303my $_ror=sub { &ror(@_) };
304
305$code.=<<___;
306.type sha1_block_data_order_ssse3,\@function,3
307.align 16
308sha1_block_data_order_ssse3:
309_ssse3_shortcut:
310 push %rbx
311 push %rbp
312 push %r12
313 lea `-64-($win64?5*16:0)`(%rsp),%rsp
314___
315$code.=<<___ if ($win64);
316 movaps %xmm6,64+0(%rsp)
317 movaps %xmm7,64+16(%rsp)
318 movaps %xmm8,64+32(%rsp)
319 movaps %xmm9,64+48(%rsp)
320 movaps %xmm10,64+64(%rsp)
321.Lprologue_ssse3:
322___
323$code.=<<___;
324 mov %rdi,$ctx # reassigned argument
325 mov %rsi,$inp # reassigned argument
326 mov %rdx,$num # reassigned argument
327
328 shl \$6,$num
329 add $inp,$num
330 lea K_XX_XX(%rip),$K_XX_XX
331
332 mov 0($ctx),$A # load context
333 mov 4($ctx),$B
334 mov 8($ctx),$C
335 mov 12($ctx),$D
336 mov $B,@T[0] # magic seed
337 mov 16($ctx),$E
338
339 movdqa 64($K_XX_XX),@X[2] # pbswap mask
340 movdqa 0($K_XX_XX),@Tx[1] # K_00_19
341 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
342 movdqu 16($inp),@X[-3&7]
343 movdqu 32($inp),@X[-2&7]
344 movdqu 48($inp),@X[-1&7]
345 pshufb @X[2],@X[-4&7] # byte swap
346 add \$64,$inp
347 pshufb @X[2],@X[-3&7]
348 pshufb @X[2],@X[-2&7]
349 pshufb @X[2],@X[-1&7]
350 paddd @Tx[1],@X[-4&7] # add K_00_19
351 paddd @Tx[1],@X[-3&7]
352 paddd @Tx[1],@X[-2&7]
353 movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
354 psubd @Tx[1],@X[-4&7] # restore X[]
355 movdqa @X[-3&7],16(%rsp)
356 psubd @Tx[1],@X[-3&7]
357 movdqa @X[-2&7],32(%rsp)
358 psubd @Tx[1],@X[-2&7]
359 jmp .Loop_ssse3
360___
361
362sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
363{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
364 my $arg = pop;
365 $arg = "\$$arg" if ($arg*1 eq $arg);
366 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
367}
368
369sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
370{ use integer;
371 my $body = shift;
372 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
373 my ($a,$b,$c,$d,$e);
374
375 &movdqa (@X[0],@X[-3&7]);
376 eval(shift(@insns));
377 eval(shift(@insns));
378 &movdqa (@Tx[0],@X[-1&7]);
379 &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
380 eval(shift(@insns));
381 eval(shift(@insns));
382
383 &paddd (@Tx[1],@X[-1&7]);
384 eval(shift(@insns));
385 eval(shift(@insns));
386 &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
387 eval(shift(@insns));
388 eval(shift(@insns));
389 &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
390 eval(shift(@insns));
391 eval(shift(@insns));
392
393 &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
394 eval(shift(@insns));
395 eval(shift(@insns));
396 eval(shift(@insns));
397 eval(shift(@insns));
398
399 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
400 eval(shift(@insns));
401 eval(shift(@insns));
402 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
403 eval(shift(@insns));
404 eval(shift(@insns));
405
406 &movdqa (@Tx[2],@X[0]);
407 &movdqa (@Tx[0],@X[0]);
408 eval(shift(@insns));
409 eval(shift(@insns));
410 eval(shift(@insns));
411 eval(shift(@insns));
412
413 &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
414 &paddd (@X[0],@X[0]);
415 eval(shift(@insns));
416 eval(shift(@insns));
417 eval(shift(@insns));
418 eval(shift(@insns));
419
420 &psrld (@Tx[0],31);
421 eval(shift(@insns));
422 eval(shift(@insns));
423 &movdqa (@Tx[1],@Tx[2]);
424 eval(shift(@insns));
425 eval(shift(@insns));
426
427 &psrld (@Tx[2],30);
428 &por (@X[0],@Tx[0]); # "X[0]"<<<=1
429 eval(shift(@insns));
430 eval(shift(@insns));
431 eval(shift(@insns));
432 eval(shift(@insns));
433
434 &pslld (@Tx[1],2);
435 &pxor (@X[0],@Tx[2]);
436 eval(shift(@insns));
437 eval(shift(@insns));
438 &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
439 eval(shift(@insns));
440 eval(shift(@insns));
441
442 &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
443
444 foreach (@insns) { eval; } # remaining instructions [if any]
445
446 $Xi++; push(@X,shift(@X)); # "rotate" X[]
447 push(@Tx,shift(@Tx));
448}
449
450sub Xupdate_ssse3_32_79()
451{ use integer;
452 my $body = shift;
453 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
454 my ($a,$b,$c,$d,$e);
455
456 &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
457 eval(shift(@insns)); # body_20_39
458 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
459 &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
460 eval(shift(@insns));
461 eval(shift(@insns));
462 eval(shift(@insns)); # rol
463
464 &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
465 eval(shift(@insns));
466 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
467 if ($Xi%5) {
468 &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
469 } else { # ... or load next one
470 &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
471 }
472 &paddd (@Tx[1],@X[-1&7]);
473 eval(shift(@insns)); # ror
474 eval(shift(@insns));
475
476 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
477 eval(shift(@insns)); # body_20_39
478 eval(shift(@insns));
479 eval(shift(@insns));
480 eval(shift(@insns)); # rol
481
482 &movdqa (@Tx[0],@X[0]);
483 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
484 eval(shift(@insns));
485 eval(shift(@insns));
486 eval(shift(@insns)); # ror
487 eval(shift(@insns));
488
489 &pslld (@X[0],2);
490 eval(shift(@insns)); # body_20_39
491 eval(shift(@insns));
492 &psrld (@Tx[0],30);
493 eval(shift(@insns));
494 eval(shift(@insns)); # rol
495 eval(shift(@insns));
496 eval(shift(@insns));
497 eval(shift(@insns)); # ror
498 eval(shift(@insns));
499
500 &por (@X[0],@Tx[0]); # "X[0]"<<<=2
501 eval(shift(@insns)); # body_20_39
502 eval(shift(@insns));
503 &movdqa (@Tx[1],@X[0]) if ($Xi<19);
504 eval(shift(@insns));
505 eval(shift(@insns)); # rol
506 eval(shift(@insns));
507 eval(shift(@insns));
508 eval(shift(@insns)); # rol
509 eval(shift(@insns));
510
511 foreach (@insns) { eval; } # remaining instructions
512
513 $Xi++; push(@X,shift(@X)); # "rotate" X[]
514 push(@Tx,shift(@Tx));
515}
516
517sub Xuplast_ssse3_80()
518{ use integer;
519 my $body = shift;
520 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
521 my ($a,$b,$c,$d,$e);
522
523 eval(shift(@insns));
524 &paddd (@Tx[1],@X[-1&7]);
525 eval(shift(@insns));
526 eval(shift(@insns));
527 eval(shift(@insns));
528 eval(shift(@insns));
529
530 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
531
532 foreach (@insns) { eval; } # remaining instructions
533
534 &cmp ($inp,$num);
535 &je (".Ldone_ssse3");
536
537 unshift(@Tx,pop(@Tx));
538
539 &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
540 &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
541 &movdqu (@X[-4&7],"0($inp)"); # load input
542 &movdqu (@X[-3&7],"16($inp)");
543 &movdqu (@X[-2&7],"32($inp)");
544 &movdqu (@X[-1&7],"48($inp)");
545 &pshufb (@X[-4&7],@X[2]); # byte swap
546 &add ($inp,64);
547
548 $Xi=0;
549}
550
551sub Xloop_ssse3()
552{ use integer;
553 my $body = shift;
554 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
555 my ($a,$b,$c,$d,$e);
556
557 eval(shift(@insns));
558 eval(shift(@insns));
559 &pshufb (@X[($Xi-3)&7],@X[2]);
560 eval(shift(@insns));
561 eval(shift(@insns));
562 &paddd (@X[($Xi-4)&7],@Tx[1]);
563 eval(shift(@insns));
564 eval(shift(@insns));
565 eval(shift(@insns));
566 eval(shift(@insns));
567 &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
568 eval(shift(@insns));
569 eval(shift(@insns));
570 &psubd (@X[($Xi-4)&7],@Tx[1]);
571
572 foreach (@insns) { eval; }
573 $Xi++;
574}
575
576sub Xtail_ssse3()
577{ use integer;
578 my $body = shift;
579 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
580 my ($a,$b,$c,$d,$e);
581
582 foreach (@insns) { eval; }
583}
584
585sub body_00_19 () {
586 (
587 '($a,$b,$c,$d,$e)=@V;'.
588 '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
589 '&xor ($c,$d);',
590 '&mov (@T[1],$a);', # $b in next round
591 '&$_rol ($a,5);',
592 '&and (@T[0],$c);', # ($b&($c^$d))
593 '&xor ($c,$d);', # restore $c
594 '&xor (@T[0],$d);',
595 '&add ($e,$a);',
596 '&$_ror ($b,$j?7:2);', # $b>>>2
597 '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
598 );
599}
600
601sub body_20_39 () {
602 (
603 '($a,$b,$c,$d,$e)=@V;'.
604 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
605 '&xor (@T[0],$d);', # ($b^$d)
606 '&mov (@T[1],$a);', # $b in next round
607 '&$_rol ($a,5);',
608 '&xor (@T[0],$c);', # ($b^$d^$c)
609 '&add ($e,$a);',
610 '&$_ror ($b,7);', # $b>>>2
611 '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
612 );
613}
614
615sub body_40_59 () {
616 (
617 '($a,$b,$c,$d,$e)=@V;'.
618 '&mov (@T[1],$c);',
619 '&xor ($c,$d);',
620 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
621 '&and (@T[1],$d);',
622 '&and (@T[0],$c);', # ($b&($c^$d))
623 '&$_ror ($b,7);', # $b>>>2
624 '&add ($e,@T[1]);',
625 '&mov (@T[1],$a);', # $b in next round
626 '&$_rol ($a,5);',
627 '&add ($e,@T[0]);',
628 '&xor ($c,$d);', # restore $c
629 '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
630 );
631}
241$code.=<<___; 632$code.=<<___;
242.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
243.align 16 633.align 16
634.Loop_ssse3:
635___
636 &Xupdate_ssse3_16_31(\&body_00_19);
637 &Xupdate_ssse3_16_31(\&body_00_19);
638 &Xupdate_ssse3_16_31(\&body_00_19);
639 &Xupdate_ssse3_16_31(\&body_00_19);
640 &Xupdate_ssse3_32_79(\&body_00_19);
641 &Xupdate_ssse3_32_79(\&body_20_39);
642 &Xupdate_ssse3_32_79(\&body_20_39);
643 &Xupdate_ssse3_32_79(\&body_20_39);
644 &Xupdate_ssse3_32_79(\&body_20_39);
645 &Xupdate_ssse3_32_79(\&body_20_39);
646 &Xupdate_ssse3_32_79(\&body_40_59);
647 &Xupdate_ssse3_32_79(\&body_40_59);
648 &Xupdate_ssse3_32_79(\&body_40_59);
649 &Xupdate_ssse3_32_79(\&body_40_59);
650 &Xupdate_ssse3_32_79(\&body_40_59);
651 &Xupdate_ssse3_32_79(\&body_20_39);
652 &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
653
654 $saved_j=$j; @saved_V=@V;
655
656 &Xloop_ssse3(\&body_20_39);
657 &Xloop_ssse3(\&body_20_39);
658 &Xloop_ssse3(\&body_20_39);
659
660$code.=<<___;
661 add 0($ctx),$A # update context
662 add 4($ctx),@T[0]
663 add 8($ctx),$C
664 add 12($ctx),$D
665 mov $A,0($ctx)
666 add 16($ctx),$E
667 mov @T[0],4($ctx)
668 mov @T[0],$B # magic seed
669 mov $C,8($ctx)
670 mov $D,12($ctx)
671 mov $E,16($ctx)
672 jmp .Loop_ssse3
673
674.align 16
675.Ldone_ssse3:
676___
677 $j=$saved_j; @V=@saved_V;
678
679 &Xtail_ssse3(\&body_20_39);
680 &Xtail_ssse3(\&body_20_39);
681 &Xtail_ssse3(\&body_20_39);
682
683$code.=<<___;
684 add 0($ctx),$A # update context
685 add 4($ctx),@T[0]
686 add 8($ctx),$C
687 mov $A,0($ctx)
688 add 12($ctx),$D
689 mov @T[0],4($ctx)
690 add 16($ctx),$E
691 mov $C,8($ctx)
692 mov $D,12($ctx)
693 mov $E,16($ctx)
694___
695$code.=<<___ if ($win64);
696 movaps 64+0(%rsp),%xmm6
697 movaps 64+16(%rsp),%xmm7
698 movaps 64+32(%rsp),%xmm8
699 movaps 64+48(%rsp),%xmm9
700 movaps 64+64(%rsp),%xmm10
701___
702$code.=<<___;
703 lea `64+($win64?5*16:0)`(%rsp),%rsi
704 mov 0(%rsi),%r12
705 mov 8(%rsi),%rbp
706 mov 16(%rsi),%rbx
707 lea 24(%rsi),%rsp
708.Lepilogue_ssse3:
709 ret
710.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
711___
712
713if ($avx) {
714my $Xi=4;
715my @X=map("%xmm$_",(4..7,0..3));
716my @Tx=map("%xmm$_",(8..10));
717my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
718my @T=("%esi","%edi");
719my $j=0;
720my $K_XX_XX="%r11";
721
722my $_rol=sub { &shld(@_[0],@_) };
723my $_ror=sub { &shrd(@_[0],@_) };
724
725$code.=<<___;
726.type sha1_block_data_order_avx,\@function,3
727.align 16
728sha1_block_data_order_avx:
729_avx_shortcut:
730 push %rbx
731 push %rbp
732 push %r12
733 lea `-64-($win64?5*16:0)`(%rsp),%rsp
734___
735$code.=<<___ if ($win64);
736 movaps %xmm6,64+0(%rsp)
737 movaps %xmm7,64+16(%rsp)
738 movaps %xmm8,64+32(%rsp)
739 movaps %xmm9,64+48(%rsp)
740 movaps %xmm10,64+64(%rsp)
741.Lprologue_avx:
742___
743$code.=<<___;
744 mov %rdi,$ctx # reassigned argument
745 mov %rsi,$inp # reassigned argument
746 mov %rdx,$num # reassigned argument
747 vzeroall
748
749 shl \$6,$num
750 add $inp,$num
751 lea K_XX_XX(%rip),$K_XX_XX
752
753 mov 0($ctx),$A # load context
754 mov 4($ctx),$B
755 mov 8($ctx),$C
756 mov 12($ctx),$D
757 mov $B,@T[0] # magic seed
758 mov 16($ctx),$E
759
760 vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
761 vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19
762 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
763 vmovdqu 16($inp),@X[-3&7]
764 vmovdqu 32($inp),@X[-2&7]
765 vmovdqu 48($inp),@X[-1&7]
766 vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
767 add \$64,$inp
768 vpshufb @X[2],@X[-3&7],@X[-3&7]
769 vpshufb @X[2],@X[-2&7],@X[-2&7]
770 vpshufb @X[2],@X[-1&7],@X[-1&7]
771 vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19
772 vpaddd @Tx[1],@X[-3&7],@X[1]
773 vpaddd @Tx[1],@X[-2&7],@X[2]
774 vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
775 vmovdqa @X[1],16(%rsp)
776 vmovdqa @X[2],32(%rsp)
777 jmp .Loop_avx
778___
779
780sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
781{ use integer;
782 my $body = shift;
783 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
784 my ($a,$b,$c,$d,$e);
785
786 eval(shift(@insns));
787 eval(shift(@insns));
788 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
789 eval(shift(@insns));
790 eval(shift(@insns));
791
792 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
793 eval(shift(@insns));
794 eval(shift(@insns));
795 &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
796 eval(shift(@insns));
797 eval(shift(@insns));
798 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
799 eval(shift(@insns));
800 eval(shift(@insns));
801
802 &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
803 eval(shift(@insns));
804 eval(shift(@insns));
805 eval(shift(@insns));
806 eval(shift(@insns));
807
808 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
809 eval(shift(@insns));
810 eval(shift(@insns));
811 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
812 eval(shift(@insns));
813 eval(shift(@insns));
814
815 &vpsrld (@Tx[0],@X[0],31);
816 eval(shift(@insns));
817 eval(shift(@insns));
818 eval(shift(@insns));
819 eval(shift(@insns));
820
821 &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
822 &vpaddd (@X[0],@X[0],@X[0]);
823 eval(shift(@insns));
824 eval(shift(@insns));
825 eval(shift(@insns));
826 eval(shift(@insns));
827
828 &vpsrld (@Tx[1],@Tx[2],30);
829 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
830 eval(shift(@insns));
831 eval(shift(@insns));
832 eval(shift(@insns));
833 eval(shift(@insns));
834
835 &vpslld (@Tx[2],@Tx[2],2);
836 &vpxor (@X[0],@X[0],@Tx[1]);
837 eval(shift(@insns));
838 eval(shift(@insns));
839 eval(shift(@insns));
840 eval(shift(@insns));
841
842 &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
843 eval(shift(@insns));
844 eval(shift(@insns));
845 &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
846 eval(shift(@insns));
847 eval(shift(@insns));
848
849
850 foreach (@insns) { eval; } # remaining instructions [if any]
851
852 $Xi++; push(@X,shift(@X)); # "rotate" X[]
853 push(@Tx,shift(@Tx));
854}
855
856sub Xupdate_avx_32_79()
857{ use integer;
858 my $body = shift;
859 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
860 my ($a,$b,$c,$d,$e);
861
862 &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
863 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
864 eval(shift(@insns)); # body_20_39
865 eval(shift(@insns));
866 eval(shift(@insns));
867 eval(shift(@insns)); # rol
868
869 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
870 eval(shift(@insns));
871 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
872 if ($Xi%5) {
873 &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
874 } else { # ... or load next one
875 &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
876 }
877 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
878 eval(shift(@insns)); # ror
879 eval(shift(@insns));
880
881 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
882 eval(shift(@insns)); # body_20_39
883 eval(shift(@insns));
884 eval(shift(@insns));
885 eval(shift(@insns)); # rol
886
887 &vpsrld (@Tx[0],@X[0],30);
888 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
889 eval(shift(@insns));
890 eval(shift(@insns));
891 eval(shift(@insns)); # ror
892 eval(shift(@insns));
893
894 &vpslld (@X[0],@X[0],2);
895 eval(shift(@insns)); # body_20_39
896 eval(shift(@insns));
897 eval(shift(@insns));
898 eval(shift(@insns)); # rol
899 eval(shift(@insns));
900 eval(shift(@insns));
901 eval(shift(@insns)); # ror
902 eval(shift(@insns));
903
904 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
905 eval(shift(@insns)); # body_20_39
906 eval(shift(@insns));
907 &vmovdqa (@Tx[1],@X[0]) if ($Xi<19);
908 eval(shift(@insns));
909 eval(shift(@insns)); # rol
910 eval(shift(@insns));
911 eval(shift(@insns));
912 eval(shift(@insns)); # rol
913 eval(shift(@insns));
914
915 foreach (@insns) { eval; } # remaining instructions
916
917 $Xi++; push(@X,shift(@X)); # "rotate" X[]
918 push(@Tx,shift(@Tx));
919}
920
921sub Xuplast_avx_80()
922{ use integer;
923 my $body = shift;
924 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
925 my ($a,$b,$c,$d,$e);
926
927 eval(shift(@insns));
928 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
929 eval(shift(@insns));
930 eval(shift(@insns));
931 eval(shift(@insns));
932 eval(shift(@insns));
933
934 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
935
936 foreach (@insns) { eval; } # remaining instructions
937
938 &cmp ($inp,$num);
939 &je (".Ldone_avx");
940
941 unshift(@Tx,pop(@Tx));
942
943 &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
944 &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
945 &vmovdqu(@X[-4&7],"0($inp)"); # load input
946 &vmovdqu(@X[-3&7],"16($inp)");
947 &vmovdqu(@X[-2&7],"32($inp)");
948 &vmovdqu(@X[-1&7],"48($inp)");
949 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
950 &add ($inp,64);
951
952 $Xi=0;
953}
954
955sub Xloop_avx()
956{ use integer;
957 my $body = shift;
958 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
959 my ($a,$b,$c,$d,$e);
960
961 eval(shift(@insns));
962 eval(shift(@insns));
963 &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
964 eval(shift(@insns));
965 eval(shift(@insns));
966 &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
967 eval(shift(@insns));
968 eval(shift(@insns));
969 eval(shift(@insns));
970 eval(shift(@insns));
971 &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU
972 eval(shift(@insns));
973 eval(shift(@insns));
974
975 foreach (@insns) { eval; }
976 $Xi++;
977}
978
979sub Xtail_avx()
980{ use integer;
981 my $body = shift;
982 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
983 my ($a,$b,$c,$d,$e);
984
985 foreach (@insns) { eval; }
986}
987
988$code.=<<___;
989.align 16
990.Loop_avx:
991___
992 &Xupdate_avx_16_31(\&body_00_19);
993 &Xupdate_avx_16_31(\&body_00_19);
994 &Xupdate_avx_16_31(\&body_00_19);
995 &Xupdate_avx_16_31(\&body_00_19);
996 &Xupdate_avx_32_79(\&body_00_19);
997 &Xupdate_avx_32_79(\&body_20_39);
998 &Xupdate_avx_32_79(\&body_20_39);
999 &Xupdate_avx_32_79(\&body_20_39);
1000 &Xupdate_avx_32_79(\&body_20_39);
1001 &Xupdate_avx_32_79(\&body_20_39);
1002 &Xupdate_avx_32_79(\&body_40_59);
1003 &Xupdate_avx_32_79(\&body_40_59);
1004 &Xupdate_avx_32_79(\&body_40_59);
1005 &Xupdate_avx_32_79(\&body_40_59);
1006 &Xupdate_avx_32_79(\&body_40_59);
1007 &Xupdate_avx_32_79(\&body_20_39);
1008 &Xuplast_avx_80(\&body_20_39); # can jump to "done"
1009
1010 $saved_j=$j; @saved_V=@V;
1011
1012 &Xloop_avx(\&body_20_39);
1013 &Xloop_avx(\&body_20_39);
1014 &Xloop_avx(\&body_20_39);
1015
1016$code.=<<___;
1017 add 0($ctx),$A # update context
1018 add 4($ctx),@T[0]
1019 add 8($ctx),$C
1020 add 12($ctx),$D
1021 mov $A,0($ctx)
1022 add 16($ctx),$E
1023 mov @T[0],4($ctx)
1024 mov @T[0],$B # magic seed
1025 mov $C,8($ctx)
1026 mov $D,12($ctx)
1027 mov $E,16($ctx)
1028 jmp .Loop_avx
1029
1030.align 16
1031.Ldone_avx:
1032___
1033 $j=$saved_j; @V=@saved_V;
1034
1035 &Xtail_avx(\&body_20_39);
1036 &Xtail_avx(\&body_20_39);
1037 &Xtail_avx(\&body_20_39);
1038
1039$code.=<<___;
1040 vzeroall
1041
1042 add 0($ctx),$A # update context
1043 add 4($ctx),@T[0]
1044 add 8($ctx),$C
1045 mov $A,0($ctx)
1046 add 12($ctx),$D
1047 mov @T[0],4($ctx)
1048 add 16($ctx),$E
1049 mov $C,8($ctx)
1050 mov $D,12($ctx)
1051 mov $E,16($ctx)
1052___
1053$code.=<<___ if ($win64);
1054 movaps 64+0(%rsp),%xmm6
1055 movaps 64+16(%rsp),%xmm7
1056 movaps 64+32(%rsp),%xmm8
1057 movaps 64+48(%rsp),%xmm9
1058 movaps 64+64(%rsp),%xmm10
1059___
1060$code.=<<___;
1061 lea `64+($win64?5*16:0)`(%rsp),%rsi
1062 mov 0(%rsi),%r12
1063 mov 8(%rsi),%rbp
1064 mov 16(%rsi),%rbx
1065 lea 24(%rsi),%rsp
1066.Lepilogue_avx:
1067 ret
1068.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
1069___
1070}
1071$code.=<<___;
1072.align 64
1073K_XX_XX:
1074.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
1075.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
1076.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
1077.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
1078.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
1079___
1080}}}
1081$code.=<<___;
1082.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1083.align 64
244___ 1084___
245 1085
246# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1086# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
@@ -272,25 +1112,75 @@ se_handler:
272 1112
273 lea .Lprologue(%rip),%r10 1113 lea .Lprologue(%rip),%r10
274 cmp %r10,%rbx # context->Rip<.Lprologue 1114 cmp %r10,%rbx # context->Rip<.Lprologue
275 jb .Lin_prologue 1115 jb .Lcommon_seh_tail
276 1116
277 mov 152($context),%rax # pull context->Rsp 1117 mov 152($context),%rax # pull context->Rsp
278 1118
279 lea .Lepilogue(%rip),%r10 1119 lea .Lepilogue(%rip),%r10
280 cmp %r10,%rbx # context->Rip>=.Lepilogue 1120 cmp %r10,%rbx # context->Rip>=.Lepilogue
281 jae .Lin_prologue 1121 jae .Lcommon_seh_tail
282 1122
283 mov `16*4`(%rax),%rax # pull saved stack pointer 1123 mov `16*4`(%rax),%rax # pull saved stack pointer
284 lea 24(%rax),%rax 1124 lea 32(%rax),%rax
285 1125
286 mov -8(%rax),%rbx 1126 mov -8(%rax),%rbx
287 mov -16(%rax),%rbp 1127 mov -16(%rax),%rbp
288 mov -24(%rax),%r12 1128 mov -24(%rax),%r12
1129 mov -32(%rax),%r13
289 mov %rbx,144($context) # restore context->Rbx 1130 mov %rbx,144($context) # restore context->Rbx
290 mov %rbp,160($context) # restore context->Rbp 1131 mov %rbp,160($context) # restore context->Rbp
291 mov %r12,216($context) # restore context->R12 1132 mov %r12,216($context) # restore context->R12
1133 mov %r13,224($context) # restore context->R13
1134
1135 jmp .Lcommon_seh_tail
1136.size se_handler,.-se_handler
292 1137
293.Lin_prologue: 1138.type ssse3_handler,\@abi-omnipotent
1139.align 16
1140ssse3_handler:
1141 push %rsi
1142 push %rdi
1143 push %rbx
1144 push %rbp
1145 push %r12
1146 push %r13
1147 push %r14
1148 push %r15
1149 pushfq
1150 sub \$64,%rsp
1151
1152 mov 120($context),%rax # pull context->Rax
1153 mov 248($context),%rbx # pull context->Rip
1154
1155 mov 8($disp),%rsi # disp->ImageBase
1156 mov 56($disp),%r11 # disp->HandlerData
1157
1158 mov 0(%r11),%r10d # HandlerData[0]
1159 lea (%rsi,%r10),%r10 # prologue label
1160 cmp %r10,%rbx # context->Rip<prologue label
1161 jb .Lcommon_seh_tail
1162
1163 mov 152($context),%rax # pull context->Rsp
1164
1165 mov 4(%r11),%r10d # HandlerData[1]
1166 lea (%rsi,%r10),%r10 # epilogue label
1167 cmp %r10,%rbx # context->Rip>=epilogue label
1168 jae .Lcommon_seh_tail
1169
1170 lea 64(%rax),%rsi
1171 lea 512($context),%rdi # &context.Xmm6
1172 mov \$10,%ecx
1173 .long 0xa548f3fc # cld; rep movsq
1174 lea `24+64+5*16`(%rax),%rax # adjust stack pointer
1175
1176 mov -8(%rax),%rbx
1177 mov -16(%rax),%rbp
1178 mov -24(%rax),%r12
1179 mov %rbx,144($context) # restore context->Rbx
1180 mov %rbp,160($context) # restore context->Rbp
 1181	mov	%r12,216($context)	# restore context->R12
1182
1183.Lcommon_seh_tail:
294 mov 8(%rax),%rdi 1184 mov 8(%rax),%rdi
295 mov 16(%rax),%rsi 1185 mov 16(%rax),%rsi
296 mov %rax,152($context) # restore context->Rsp 1186 mov %rax,152($context) # restore context->Rsp
@@ -328,19 +1218,38 @@ se_handler:
328 pop %rdi 1218 pop %rdi
329 pop %rsi 1219 pop %rsi
330 ret 1220 ret
331.size se_handler,.-se_handler 1221.size ssse3_handler,.-ssse3_handler
332 1222
333.section .pdata 1223.section .pdata
334.align 4 1224.align 4
335 .rva .LSEH_begin_sha1_block_data_order 1225 .rva .LSEH_begin_sha1_block_data_order
336 .rva .LSEH_end_sha1_block_data_order 1226 .rva .LSEH_end_sha1_block_data_order
337 .rva .LSEH_info_sha1_block_data_order 1227 .rva .LSEH_info_sha1_block_data_order
338 1228 .rva .LSEH_begin_sha1_block_data_order_ssse3
1229 .rva .LSEH_end_sha1_block_data_order_ssse3
1230 .rva .LSEH_info_sha1_block_data_order_ssse3
1231___
1232$code.=<<___ if ($avx);
1233 .rva .LSEH_begin_sha1_block_data_order_avx
1234 .rva .LSEH_end_sha1_block_data_order_avx
1235 .rva .LSEH_info_sha1_block_data_order_avx
1236___
1237$code.=<<___;
339.section .xdata 1238.section .xdata
340.align 8 1239.align 8
341.LSEH_info_sha1_block_data_order: 1240.LSEH_info_sha1_block_data_order:
342 .byte 9,0,0,0 1241 .byte 9,0,0,0
343 .rva se_handler 1242 .rva se_handler
1243.LSEH_info_sha1_block_data_order_ssse3:
1244 .byte 9,0,0,0
1245 .rva ssse3_handler
1246 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
1247___
1248$code.=<<___ if ($avx);
1249.LSEH_info_sha1_block_data_order_avx:
1250 .byte 9,0,0,0
1251 .rva ssse3_handler
1252 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
344___ 1253___
345} 1254}
346 1255
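
[Editor's note] The body_00_19 and body_40_59 round generators above lean on two boolean identities that the inline comments only hint at: the choose function is computed as (b & (c ^ d)) ^ d, and rounds 40..59 accumulate the majority function as (c & d) + (b & (c ^ d)), where the two terms never share a set bit, so the addition is equivalent to an OR. Below is a minimal standalone Perl sketch, not part of the patch and with helper names (Ch, Maj) of my own choosing, that checks both identities over random 32-bit words.

#!/usr/bin/env perl
# Editorial sketch only: verify the identities used by body_00_19/body_40_59.
#   Ch(b,c,d)  == (b & (c ^ d)) ^ d
#   Maj(b,c,d) == (c & d) + (b & (c ^ d))   # terms are bit-disjoint, so '+' == '|'
use strict;
use warnings;

sub Ch  { my ($b,$c,$d) = @_; ($b & $c) | (~$b & $d & 0xffffffff) }
sub Maj { my ($b,$c,$d) = @_; ($b & $c) | ($b & $d) | ($c & $d) }

for (1 .. 1000) {
    my ($b,$c,$d) = map { int(rand(2**32)) } 1 .. 3;
    die "Ch identity broken"  unless (($b & ($c ^ $d)) ^ $d) == Ch($b,$c,$d);
    die "Maj identity broken" unless (($c & $d) + ($b & ($c ^ $d))) == Maj($b,$c,$d);
}
print "round-function identities hold\n";
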
diff --git a/src/lib/libcrypto/sha/asm/sha256-586.pl b/src/lib/libcrypto/sha/asm/sha256-586.pl
index ecc8b69c75..928ec53123 100644
--- a/src/lib/libcrypto/sha/asm/sha256-586.pl
+++ b/src/lib/libcrypto/sha/asm/sha256-586.pl
@@ -14,8 +14,8 @@
14# Pentium PIII P4 AMD K8 Core2 14# Pentium PIII P4 AMD K8 Core2
15# gcc 46 36 41 27 26 15# gcc 46 36 41 27 26
16# icc 57 33 38 25 23 16# icc 57 33 38 25 23
17# x86 asm 40 30 35 20 20 17# x86 asm 40 30 33 20 18
18# x86_64 asm(*) - - 21 15.8 16.5 18# x86_64 asm(*) - - 21 16 16
19# 19#
20# (*) x86_64 assembler performance is presented for reference 20# (*) x86_64 assembler performance is presented for reference
21# purposes. 21# purposes.
@@ -48,20 +48,19 @@ sub BODY_00_15() {
48 my $in_16_63=shift; 48 my $in_16_63=shift;
49 49
50 &mov ("ecx",$E); 50 &mov ("ecx",$E);
51 &add ($T,&DWP(4*(8+15+16-9),"esp")) if ($in_16_63); # T += X[-7] 51 &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2])
52 &ror ("ecx",6); 52 &ror ("ecx",25-11);
53 &mov ("edi",$E);
54 &ror ("edi",11);
55 &mov ("esi",$Foff); 53 &mov ("esi",$Foff);
56 &xor ("ecx","edi"); 54 &xor ("ecx",$E);
57 &ror ("edi",25-11); 55 &ror ("ecx",11-6);
58 &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0] 56 &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0]
59 &xor ("ecx","edi"); # Sigma1(e) 57 &xor ("ecx",$E);
58 &ror ("ecx",6); # Sigma1(e)
60 &mov ("edi",$Goff); 59 &mov ("edi",$Goff);
61 &add ($T,"ecx"); # T += Sigma1(e) 60 &add ($T,"ecx"); # T += Sigma1(e)
62 &mov ($Eoff,$E); # modulo-scheduled
63 61
64 &xor ("esi","edi"); 62 &xor ("esi","edi");
63 &mov ($Eoff,$E); # modulo-scheduled
65 &mov ("ecx",$A); 64 &mov ("ecx",$A);
66 &and ("esi",$E); 65 &and ("esi",$E);
67 &mov ($E,$Doff); # e becomes d, which is e in next iteration 66 &mov ($E,$Doff); # e becomes d, which is e in next iteration
@@ -69,14 +68,14 @@ sub BODY_00_15() {
69 &mov ("edi",$A); 68 &mov ("edi",$A);
70 &add ($T,"esi"); # T += Ch(e,f,g) 69 &add ($T,"esi"); # T += Ch(e,f,g)
71 70
72 &ror ("ecx",2); 71 &ror ("ecx",22-13);
73 &add ($T,$Hoff); # T += h 72 &add ($T,$Hoff); # T += h
74 &ror ("edi",13); 73 &xor ("ecx",$A);
74 &ror ("ecx",13-2);
75 &mov ("esi",$Boff); 75 &mov ("esi",$Boff);
76 &xor ("ecx","edi"); 76 &xor ("ecx",$A);
77 &ror ("edi",22-13); 77 &ror ("ecx",2); # Sigma0(a)
78 &add ($E,$T); # d += T 78 &add ($E,$T); # d += T
79 &xor ("ecx","edi"); # Sigma0(a)
80 &mov ("edi",$Coff); 79 &mov ("edi",$Coff);
81 80
82 &add ($T,"ecx"); # T += Sigma0(a) 81 &add ($T,"ecx"); # T += Sigma0(a)
@@ -168,23 +167,22 @@ sub BODY_00_15() {
168&set_label("16_63",16); 167&set_label("16_63",16);
169 &mov ("esi",$T); 168 &mov ("esi",$T);
170 &mov ("ecx",&DWP(4*(8+15+16-14),"esp")); 169 &mov ("ecx",&DWP(4*(8+15+16-14),"esp"));
171 &shr ($T,3);
172 &ror ("esi",7);
173 &xor ($T,"esi");
174 &ror ("esi",18-7); 170 &ror ("esi",18-7);
175 &mov ("edi","ecx"); 171 &mov ("edi","ecx");
176 &xor ($T,"esi"); # T = sigma0(X[-15]) 172 &xor ("esi",$T);
173 &ror ("esi",7);
174 &shr ($T,3);
177 175
178 &shr ("ecx",10);
179 &mov ("esi",&DWP(4*(8+15+16),"esp"));
180 &ror ("edi",17);
181 &xor ("ecx","edi");
182 &ror ("edi",19-17); 176 &ror ("edi",19-17);
183 &add ($T,"esi"); # T += X[-16] 177 &xor ($T,"esi"); # T = sigma0(X[-15])
184 &xor ("edi","ecx") # sigma1(X[-2]) 178 &xor ("edi","ecx");
179 &ror ("edi",17);
180 &shr ("ecx",10);
181 &add ($T,&DWP(4*(8+15+16),"esp")); # T += X[-16]
182 &xor ("edi","ecx"); # sigma1(X[-2])
185 183
186 &add ($T,"edi"); # T += sigma1(X[-2]) 184 &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7]
187 # &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7], moved to BODY_00_15(1) 185 # &add ($T,"edi"); # T += sigma1(X[-2])
188 # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0] 186 # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0]
189 187
190 &BODY_00_15(1); 188 &BODY_00_15(1);
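
[Editor's note] The reworked Sigma1/Sigma0 sequences in the sha256-586.pl hunk above fold the three rotations of one value into a chain of ror/xor on a single scratch register (ror 25-11, xor, ror 11-6, xor, ror 6), which is valid because rotation distributes over XOR and consecutive rotations add up. A short editorial Perl check of the Sigma1 folding (not part of the patch; rotr is my own helper) follows.

#!/usr/bin/env perl
# Editorial sketch only: ROTR(ROTR(ROTR(e,25-11)^e,11-6)^e,6)
#                        == ROTR(e,6)^ROTR(e,11)^ROTR(e,25) == Sigma1(e)
use strict;
use warnings;

sub rotr { my ($x,$n) = @_; (($x >> $n) | ($x << (32-$n))) & 0xffffffff }

for (1 .. 1000) {
    my $e   = int(rand(2**32));
    my $ref = rotr($e,6) ^ rotr($e,11) ^ rotr($e,25);
    my $opt = rotr(rotr(rotr($e,25-11) ^ $e, 11-6) ^ $e, 6);
    die "Sigma1 folding broken" unless $ref == $opt;
}
print "Sigma1 folding holds\n";
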
diff --git a/src/lib/libcrypto/sha/asm/sha256-armv4.pl b/src/lib/libcrypto/sha/asm/sha256-armv4.pl
index 492cb62bc0..9c84e8d93c 100644
--- a/src/lib/libcrypto/sha/asm/sha256-armv4.pl
+++ b/src/lib/libcrypto/sha/asm/sha256-armv4.pl
@@ -18,11 +18,16 @@
18# Rescheduling for dual-issue pipeline resulted in 22% improvement on 18# Rescheduling for dual-issue pipeline resulted in 22% improvement on
19# Cortex A8 core and ~20 cycles per processed byte. 19# Cortex A8 core and ~20 cycles per processed byte.
20 20
21# February 2011.
22#
23# Profiler-assisted and platform-specific optimization resulted in 16%
24# improvement on Cortex A8 core and ~17 cycles per processed byte.
25
21while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} 26while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
22open STDOUT,">$output"; 27open STDOUT,">$output";
23 28
24$ctx="r0"; $t0="r0"; 29$ctx="r0"; $t0="r0";
25$inp="r1"; 30$inp="r1"; $t3="r1";
26$len="r2"; $t1="r2"; 31$len="r2"; $t1="r2";
27$T1="r3"; 32$T1="r3";
28$A="r4"; 33$A="r4";
@@ -46,6 +51,9 @@ sub BODY_00_15 {
46my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 51my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
47 52
48$code.=<<___ if ($i<16); 53$code.=<<___ if ($i<16);
54#if __ARM_ARCH__>=7
55 ldr $T1,[$inp],#4
56#else
49 ldrb $T1,[$inp,#3] @ $i 57 ldrb $T1,[$inp,#3] @ $i
50 ldrb $t2,[$inp,#2] 58 ldrb $t2,[$inp,#2]
51 ldrb $t1,[$inp,#1] 59 ldrb $t1,[$inp,#1]
@@ -53,16 +61,24 @@ $code.=<<___ if ($i<16);
53 orr $T1,$T1,$t2,lsl#8 61 orr $T1,$T1,$t2,lsl#8
54 orr $T1,$T1,$t1,lsl#16 62 orr $T1,$T1,$t1,lsl#16
55 orr $T1,$T1,$t0,lsl#24 63 orr $T1,$T1,$t0,lsl#24
56 `"str $inp,[sp,#17*4]" if ($i==15)` 64#endif
57___ 65___
58$code.=<<___; 66$code.=<<___;
59 ldr $t2,[$Ktbl],#4 @ *K256++
60 mov $t0,$e,ror#$Sigma1[0] 67 mov $t0,$e,ror#$Sigma1[0]
61 str $T1,[sp,#`$i%16`*4] 68 ldr $t2,[$Ktbl],#4 @ *K256++
62 eor $t0,$t0,$e,ror#$Sigma1[1] 69 eor $t0,$t0,$e,ror#$Sigma1[1]
63 eor $t1,$f,$g 70 eor $t1,$f,$g
71#if $i>=16
72 add $T1,$T1,$t3 @ from BODY_16_xx
73#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
74 rev $T1,$T1
75#endif
76#if $i==15
77 str $inp,[sp,#17*4] @ leave room for $t3
78#endif
64 eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) 79 eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
65 and $t1,$t1,$e 80 and $t1,$t1,$e
81 str $T1,[sp,#`$i%16`*4]
66 add $T1,$T1,$t0 82 add $T1,$T1,$t0
67 eor $t1,$t1,$g @ Ch(e,f,g) 83 eor $t1,$t1,$g @ Ch(e,f,g)
68 add $T1,$T1,$h 84 add $T1,$T1,$h
@@ -71,6 +87,9 @@ $code.=<<___;
71 eor $h,$h,$a,ror#$Sigma0[1] 87 eor $h,$h,$a,ror#$Sigma0[1]
72 add $T1,$T1,$t2 88 add $T1,$T1,$t2
73 eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a) 89 eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
90#if $i>=15
91 ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx
92#endif
74 orr $t0,$a,$b 93 orr $t0,$a,$b
75 and $t1,$a,$b 94 and $t1,$a,$b
76 and $t0,$t0,$c 95 and $t0,$t0,$c
@@ -85,24 +104,26 @@ sub BODY_16_XX {
85my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 104my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
86 105
87$code.=<<___; 106$code.=<<___;
88 ldr $t1,[sp,#`($i+1)%16`*4] @ $i 107 @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i
89 ldr $t2,[sp,#`($i+14)%16`*4] 108 ldr $t2,[sp,#`($i+14)%16`*4]
109 mov $t0,$t3,ror#$sigma0[0]
90 ldr $T1,[sp,#`($i+0)%16`*4] 110 ldr $T1,[sp,#`($i+0)%16`*4]
91 mov $t0,$t1,ror#$sigma0[0] 111 eor $t0,$t0,$t3,ror#$sigma0[1]
92 ldr $inp,[sp,#`($i+9)%16`*4] 112 ldr $t1,[sp,#`($i+9)%16`*4]
93 eor $t0,$t0,$t1,ror#$sigma0[1] 113 eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1])
94 eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) 114 mov $t3,$t2,ror#$sigma1[0]
95 mov $t1,$t2,ror#$sigma1[0]
96 add $T1,$T1,$t0 115 add $T1,$T1,$t0
97 eor $t1,$t1,$t2,ror#$sigma1[1] 116 eor $t3,$t3,$t2,ror#$sigma1[1]
98 add $T1,$T1,$inp
99 eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
100 add $T1,$T1,$t1 117 add $T1,$T1,$t1
118 eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
119 @ add $T1,$T1,$t3
101___ 120___
102 &BODY_00_15(@_); 121 &BODY_00_15(@_);
103} 122}
104 123
105$code=<<___; 124$code=<<___;
125#include "arm_arch.h"
126
106.text 127.text
107.code 32 128.code 32
108 129
@@ -132,7 +153,7 @@ K256:
132sha256_block_data_order: 153sha256_block_data_order:
133 sub r3,pc,#8 @ sha256_block_data_order 154 sub r3,pc,#8 @ sha256_block_data_order
134 add $len,$inp,$len,lsl#6 @ len to point at the end of inp 155 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
135 stmdb sp!,{$ctx,$inp,$len,r4-r12,lr} 156 stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
136 ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} 157 ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
137 sub $Ktbl,r3,#256 @ K256 158 sub $Ktbl,r3,#256 @ K256
138 sub sp,sp,#16*4 @ alloca(X[16]) 159 sub sp,sp,#16*4 @ alloca(X[16])
@@ -171,10 +192,14 @@ $code.=<<___;
171 bne .Loop 192 bne .Loop
172 193
173 add sp,sp,#`16+3`*4 @ destroy frame 194 add sp,sp,#`16+3`*4 @ destroy frame
174 ldmia sp!,{r4-r12,lr} 195#if __ARM_ARCH__>=5
196 ldmia sp!,{r4-r11,pc}
197#else
198 ldmia sp!,{r4-r11,lr}
175 tst lr,#1 199 tst lr,#1
176 moveq pc,lr @ be binary compatible with V4, yet 200 moveq pc,lr @ be binary compatible with V4, yet
177 bx lr @ interoperable with Thumb ISA:-) 201 bx lr @ interoperable with Thumb ISA:-)
202#endif
178.size sha256_block_data_order,.-sha256_block_data_order 203.size sha256_block_data_order,.-sha256_block_data_order
179.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" 204.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
180.align 2 205.align 2
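
[Editor's note] The __ARM_ARCH__>=7 path added to sha256-armv4.pl above replaces the four ldrb/orr byte gathers with a single word ldr, plus a rev byte-reverse when the core runs little-endian (__ARMEL__). Both paths produce the same big-endian message word; the following is an editorial Perl sketch, not part of the patch, mimicking the two paths for one 4-byte group.

#!/usr/bin/env perl
# Editorial sketch only: ldrb/orr gather vs. word load + 'rev' on little-endian.
use strict;
use warnings;

my @bytes = (0x61, 0x62, 0x63, 0x64);          # in[0..3] as stored in memory

# ldrb path: T1 = in[3] | in[2]<<8 | in[1]<<16 | in[0]<<24
my $gather = $bytes[3] | $bytes[2]<<8 | $bytes[1]<<16 | $bytes[0]<<24;

# ldr+rev path: little-endian 32-bit load, then byte-reverse
my $word = unpack("V", pack("C4", @bytes));     # what 'ldr' sees on __ARMEL__
my $rev  = unpack("N", pack("V", $word));       # 'rev' == byte reverse

printf "gather=%08x rev=%08x\n", $gather, $rev; # both 0x61626364
die "paths disagree" unless $gather == $rev;
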
diff --git a/src/lib/libcrypto/sha/asm/sha512-armv4.pl b/src/lib/libcrypto/sha/asm/sha512-armv4.pl
index 3a35861ac6..7faf37b147 100644
--- a/src/lib/libcrypto/sha/asm/sha512-armv4.pl
+++ b/src/lib/libcrypto/sha/asm/sha512-armv4.pl
@@ -18,22 +18,33 @@
18# Rescheduling for dual-issue pipeline resulted in 6% improvement on 18# Rescheduling for dual-issue pipeline resulted in 6% improvement on
19# Cortex A8 core and ~40 cycles per processed byte. 19# Cortex A8 core and ~40 cycles per processed byte.
20 20
21# February 2011.
22#
23# Profiler-assisted and platform-specific optimization resulted in 7%
24# improvement on Cortex A8 core and ~38 cycles per byte.
25
26# March 2011.
27#
28# Add NEON implementation. On Cortex A8 it was measured to process
29# one byte in 25.5 cycles or 47% faster than integer-only code.
30
21# Byte order [in]dependence. ========================================= 31# Byte order [in]dependence. =========================================
22# 32#
23# Caller is expected to maintain specific *dword* order in h[0-7], 33# Originally caller was expected to maintain specific *dword* order in
24# namely with most significant dword at *lower* address, which is 34# h[0-7], namely with most significant dword at *lower* address, which
25# reflected in below two parameters. *Byte* order within these dwords 35# was reflected in below two parameters as 0 and 4. Now caller is
26# in turn is whatever *native* byte order on current platform. 36# expected to maintain native byte order for whole 64-bit values.
27$hi=0; 37$hi="HI";
28$lo=4; 38$lo="LO";
29# ==================================================================== 39# ====================================================================
30 40
31while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} 41while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
32open STDOUT,">$output"; 42open STDOUT,">$output";
33 43
34$ctx="r0"; 44$ctx="r0"; # parameter block
35$inp="r1"; 45$inp="r1";
36$len="r2"; 46$len="r2";
47
37$Tlo="r3"; 48$Tlo="r3";
38$Thi="r4"; 49$Thi="r4";
39$Alo="r5"; 50$Alo="r5";
@@ -61,15 +72,17 @@ $Xoff=8*8;
61sub BODY_00_15() { 72sub BODY_00_15() {
62my $magic = shift; 73my $magic = shift;
63$code.=<<___; 74$code.=<<___;
64 ldr $t2,[sp,#$Hoff+0] @ h.lo
65 ldr $t3,[sp,#$Hoff+4] @ h.hi
66 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) 75 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
67 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 76 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
68 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 77 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
69 mov $t0,$Elo,lsr#14 78 mov $t0,$Elo,lsr#14
79 str $Tlo,[sp,#$Xoff+0]
70 mov $t1,$Ehi,lsr#14 80 mov $t1,$Ehi,lsr#14
81 str $Thi,[sp,#$Xoff+4]
71 eor $t0,$t0,$Ehi,lsl#18 82 eor $t0,$t0,$Ehi,lsl#18
83 ldr $t2,[sp,#$Hoff+0] @ h.lo
72 eor $t1,$t1,$Elo,lsl#18 84 eor $t1,$t1,$Elo,lsl#18
85 ldr $t3,[sp,#$Hoff+4] @ h.hi
73 eor $t0,$t0,$Elo,lsr#18 86 eor $t0,$t0,$Elo,lsr#18
74 eor $t1,$t1,$Ehi,lsr#18 87 eor $t1,$t1,$Ehi,lsr#18
75 eor $t0,$t0,$Ehi,lsl#14 88 eor $t0,$t0,$Ehi,lsl#14
@@ -96,25 +109,24 @@ $code.=<<___;
96 and $t1,$t1,$Ehi 109 and $t1,$t1,$Ehi
97 str $Ahi,[sp,#$Aoff+4] 110 str $Ahi,[sp,#$Aoff+4]
98 eor $t0,$t0,$t2 111 eor $t0,$t0,$t2
99 ldr $t2,[$Ktbl,#4] @ K[i].lo 112 ldr $t2,[$Ktbl,#$lo] @ K[i].lo
100 eor $t1,$t1,$t3 @ Ch(e,f,g) 113 eor $t1,$t1,$t3 @ Ch(e,f,g)
101 ldr $t3,[$Ktbl,#0] @ K[i].hi 114 ldr $t3,[$Ktbl,#$hi] @ K[i].hi
102 115
103 adds $Tlo,$Tlo,$t0 116 adds $Tlo,$Tlo,$t0
104 ldr $Elo,[sp,#$Doff+0] @ d.lo 117 ldr $Elo,[sp,#$Doff+0] @ d.lo
105 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) 118 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
106 ldr $Ehi,[sp,#$Doff+4] @ d.hi 119 ldr $Ehi,[sp,#$Doff+4] @ d.hi
107 adds $Tlo,$Tlo,$t2 120 adds $Tlo,$Tlo,$t2
121 and $t0,$t2,#0xff
108 adc $Thi,$Thi,$t3 @ T += K[i] 122 adc $Thi,$Thi,$t3 @ T += K[i]
109 adds $Elo,$Elo,$Tlo 123 adds $Elo,$Elo,$Tlo
124 ldr $t2,[sp,#$Boff+0] @ b.lo
110 adc $Ehi,$Ehi,$Thi @ d += T 125 adc $Ehi,$Ehi,$Thi @ d += T
111
112 and $t0,$t2,#0xff
113 teq $t0,#$magic 126 teq $t0,#$magic
114 orreq $Ktbl,$Ktbl,#1
115 127
116 ldr $t2,[sp,#$Boff+0] @ b.lo
117 ldr $t3,[sp,#$Coff+0] @ c.lo 128 ldr $t3,[sp,#$Coff+0] @ c.lo
129 orreq $Ktbl,$Ktbl,#1
118 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) 130 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
119 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 131 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
120 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 132 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
@@ -131,80 +143,100 @@ $code.=<<___;
131 eor $t0,$t0,$Alo,lsl#25 143 eor $t0,$t0,$Alo,lsl#25
132 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) 144 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
133 adds $Tlo,$Tlo,$t0 145 adds $Tlo,$Tlo,$t0
146 and $t0,$Alo,$t2
134 adc $Thi,$Thi,$t1 @ T += Sigma0(a) 147 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
135 148
136 and $t0,$Alo,$t2
137 orr $Alo,$Alo,$t2
138 ldr $t1,[sp,#$Boff+4] @ b.hi 149 ldr $t1,[sp,#$Boff+4] @ b.hi
150 orr $Alo,$Alo,$t2
139 ldr $t2,[sp,#$Coff+4] @ c.hi 151 ldr $t2,[sp,#$Coff+4] @ c.hi
140 and $Alo,$Alo,$t3 152 and $Alo,$Alo,$t3
141 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
142 and $t3,$Ahi,$t1 153 and $t3,$Ahi,$t1
143 orr $Ahi,$Ahi,$t1 154 orr $Ahi,$Ahi,$t1
155 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
144 and $Ahi,$Ahi,$t2 156 and $Ahi,$Ahi,$t2
145 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
146 adds $Alo,$Alo,$Tlo 157 adds $Alo,$Alo,$Tlo
147 adc $Ahi,$Ahi,$Thi @ h += T 158 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
148
149 sub sp,sp,#8 159 sub sp,sp,#8
160 adc $Ahi,$Ahi,$Thi @ h += T
161 tst $Ktbl,#1
150 add $Ktbl,$Ktbl,#8 162 add $Ktbl,$Ktbl,#8
151___ 163___
152} 164}
153$code=<<___; 165$code=<<___;
166#include "arm_arch.h"
167#ifdef __ARMEL__
168# define LO 0
169# define HI 4
170# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
171#else
172# define HI 0
173# define LO 4
174# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
175#endif
176
154.text 177.text
155.code 32 178.code 32
156.type K512,%object 179.type K512,%object
157.align 5 180.align 5
158K512: 181K512:
159.word 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd 182WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
160.word 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc 183WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
161.word 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019 184WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
162.word 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118 185WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
163.word 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe 186WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
164.word 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2 187WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
165.word 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1 188WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
166.word 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694 189WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
167.word 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3 190WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
168.word 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65 191WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
169.word 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483 192WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
170.word 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5 193WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
171.word 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210 194WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
172.word 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4 195WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
173.word 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725 196WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
174.word 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70 197WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
175.word 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926 198WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
176.word 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df 199WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
177.word 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8 200WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
178.word 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b 201WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
179.word 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001 202WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
180.word 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30 203WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
181.word 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910 204WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
182.word 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8 205WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
183.word 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53 206WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
184.word 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8 207WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
185.word 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb 208WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
186.word 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3 209WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
187.word 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60 210WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
188.word 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec 211WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
189.word 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9 212WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
190.word 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b 213WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
191.word 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207 214WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
192.word 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178 215WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
193.word 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6 216WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
194.word 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b 217WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
195.word 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493 218WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
196.word 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c 219WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
197.word 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a 220WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
198.word 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817 221WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
199.size K512,.-K512 222.size K512,.-K512
223.LOPENSSL_armcap:
224.word OPENSSL_armcap_P-sha512_block_data_order
225.skip 32-4
200 226
201.global sha512_block_data_order 227.global sha512_block_data_order
202.type sha512_block_data_order,%function 228.type sha512_block_data_order,%function
203sha512_block_data_order: 229sha512_block_data_order:
204 sub r3,pc,#8 @ sha512_block_data_order 230 sub r3,pc,#8 @ sha512_block_data_order
205 add $len,$inp,$len,lsl#7 @ len to point at the end of inp 231 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
232#if __ARM_ARCH__>=7
233 ldr r12,.LOPENSSL_armcap
234 ldr r12,[r3,r12] @ OPENSSL_armcap_P
235 tst r12,#1
236 bne .LNEON
237#endif
206 stmdb sp!,{r4-r12,lr} 238 stmdb sp!,{r4-r12,lr}
207 sub $Ktbl,r3,#640 @ K512 239 sub $Ktbl,r3,#672 @ K512
208 sub sp,sp,#9*8 240 sub sp,sp,#9*8
209 241
210 ldr $Elo,[$ctx,#$Eoff+$lo] 242 ldr $Elo,[$ctx,#$Eoff+$lo]
@@ -238,6 +270,7 @@ sha512_block_data_order:
238 str $Thi,[sp,#$Foff+4] 270 str $Thi,[sp,#$Foff+4]
239 271
240.L00_15: 272.L00_15:
273#if __ARM_ARCH__<7
241 ldrb $Tlo,[$inp,#7] 274 ldrb $Tlo,[$inp,#7]
242 ldrb $t0, [$inp,#6] 275 ldrb $t0, [$inp,#6]
243 ldrb $t1, [$inp,#5] 276 ldrb $t1, [$inp,#5]
@@ -252,26 +285,30 @@ sha512_block_data_order:
252 orr $Thi,$Thi,$t3,lsl#8 285 orr $Thi,$Thi,$t3,lsl#8
253 orr $Thi,$Thi,$t0,lsl#16 286 orr $Thi,$Thi,$t0,lsl#16
254 orr $Thi,$Thi,$t1,lsl#24 287 orr $Thi,$Thi,$t1,lsl#24
255 str $Tlo,[sp,#$Xoff+0] 288#else
256 str $Thi,[sp,#$Xoff+4] 289 ldr $Tlo,[$inp,#4]
290 ldr $Thi,[$inp],#8
291#ifdef __ARMEL__
292 rev $Tlo,$Tlo
293 rev $Thi,$Thi
294#endif
295#endif
257___ 296___
258 &BODY_00_15(0x94); 297 &BODY_00_15(0x94);
259$code.=<<___; 298$code.=<<___;
260 tst $Ktbl,#1 299 tst $Ktbl,#1
261 beq .L00_15 300 beq .L00_15
262 bic $Ktbl,$Ktbl,#1
263
264.L16_79:
265 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] 301 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
266 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] 302 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
267 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] 303 bic $Ktbl,$Ktbl,#1
268 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] 304.L16_79:
269
270 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) 305 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
271 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 306 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
272 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 307 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
273 mov $Tlo,$t0,lsr#1 308 mov $Tlo,$t0,lsr#1
309 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
274 mov $Thi,$t1,lsr#1 310 mov $Thi,$t1,lsr#1
311 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
275 eor $Tlo,$Tlo,$t1,lsl#31 312 eor $Tlo,$Tlo,$t1,lsl#31
276 eor $Thi,$Thi,$t0,lsl#31 313 eor $Thi,$Thi,$t0,lsl#31
277 eor $Tlo,$Tlo,$t0,lsr#8 314 eor $Tlo,$Tlo,$t0,lsr#8
@@ -295,25 +332,24 @@ $code.=<<___;
295 eor $t1,$t1,$t3,lsl#3 332 eor $t1,$t1,$t3,lsl#3
296 eor $t0,$t0,$t2,lsr#6 333 eor $t0,$t0,$t2,lsr#6
297 eor $t1,$t1,$t3,lsr#6 334 eor $t1,$t1,$t3,lsr#6
335 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
298 eor $t0,$t0,$t3,lsl#26 336 eor $t0,$t0,$t3,lsl#26
299 337
300 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
301 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] 338 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
302 adds $Tlo,$Tlo,$t0 339 adds $Tlo,$Tlo,$t0
340 ldr $t0,[sp,#`$Xoff+8*16`+0]
303 adc $Thi,$Thi,$t1 341 adc $Thi,$Thi,$t1
304 342
305 ldr $t0,[sp,#`$Xoff+8*16`+0]
306 ldr $t1,[sp,#`$Xoff+8*16`+4] 343 ldr $t1,[sp,#`$Xoff+8*16`+4]
307 adds $Tlo,$Tlo,$t2 344 adds $Tlo,$Tlo,$t2
308 adc $Thi,$Thi,$t3 345 adc $Thi,$Thi,$t3
309 adds $Tlo,$Tlo,$t0 346 adds $Tlo,$Tlo,$t0
310 adc $Thi,$Thi,$t1 347 adc $Thi,$Thi,$t1
311 str $Tlo,[sp,#$Xoff+0]
312 str $Thi,[sp,#$Xoff+4]
313___ 348___
314 &BODY_00_15(0x17); 349 &BODY_00_15(0x17);
315$code.=<<___; 350$code.=<<___;
316 tst $Ktbl,#1 351 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
352 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
317 beq .L16_79 353 beq .L16_79
318 bic $Ktbl,$Ktbl,#1 354 bic $Ktbl,$Ktbl,#1
319 355
@@ -324,12 +360,12 @@ $code.=<<___;
324 ldr $t2, [$ctx,#$Boff+$lo] 360 ldr $t2, [$ctx,#$Boff+$lo]
325 ldr $t3, [$ctx,#$Boff+$hi] 361 ldr $t3, [$ctx,#$Boff+$hi]
326 adds $t0,$Alo,$t0 362 adds $t0,$Alo,$t0
327 adc $t1,$Ahi,$t1
328 adds $t2,$Tlo,$t2
329 adc $t3,$Thi,$t3
330 str $t0, [$ctx,#$Aoff+$lo] 363 str $t0, [$ctx,#$Aoff+$lo]
364 adc $t1,$Ahi,$t1
331 str $t1, [$ctx,#$Aoff+$hi] 365 str $t1, [$ctx,#$Aoff+$hi]
366 adds $t2,$Tlo,$t2
332 str $t2, [$ctx,#$Boff+$lo] 367 str $t2, [$ctx,#$Boff+$lo]
368 adc $t3,$Thi,$t3
333 str $t3, [$ctx,#$Boff+$hi] 369 str $t3, [$ctx,#$Boff+$hi]
334 370
335 ldr $Alo,[sp,#$Coff+0] 371 ldr $Alo,[sp,#$Coff+0]
@@ -341,12 +377,12 @@ $code.=<<___;
341 ldr $t2, [$ctx,#$Doff+$lo] 377 ldr $t2, [$ctx,#$Doff+$lo]
342 ldr $t3, [$ctx,#$Doff+$hi] 378 ldr $t3, [$ctx,#$Doff+$hi]
343 adds $t0,$Alo,$t0 379 adds $t0,$Alo,$t0
344 adc $t1,$Ahi,$t1
345 adds $t2,$Tlo,$t2
346 adc $t3,$Thi,$t3
347 str $t0, [$ctx,#$Coff+$lo] 380 str $t0, [$ctx,#$Coff+$lo]
381 adc $t1,$Ahi,$t1
348 str $t1, [$ctx,#$Coff+$hi] 382 str $t1, [$ctx,#$Coff+$hi]
383 adds $t2,$Tlo,$t2
349 str $t2, [$ctx,#$Doff+$lo] 384 str $t2, [$ctx,#$Doff+$lo]
385 adc $t3,$Thi,$t3
350 str $t3, [$ctx,#$Doff+$hi] 386 str $t3, [$ctx,#$Doff+$hi]
351 387
352 ldr $Tlo,[sp,#$Foff+0] 388 ldr $Tlo,[sp,#$Foff+0]
@@ -356,12 +392,12 @@ $code.=<<___;
356 ldr $t2, [$ctx,#$Foff+$lo] 392 ldr $t2, [$ctx,#$Foff+$lo]
357 ldr $t3, [$ctx,#$Foff+$hi] 393 ldr $t3, [$ctx,#$Foff+$hi]
358 adds $Elo,$Elo,$t0 394 adds $Elo,$Elo,$t0
359 adc $Ehi,$Ehi,$t1
360 adds $t2,$Tlo,$t2
361 adc $t3,$Thi,$t3
362 str $Elo,[$ctx,#$Eoff+$lo] 395 str $Elo,[$ctx,#$Eoff+$lo]
396 adc $Ehi,$Ehi,$t1
363 str $Ehi,[$ctx,#$Eoff+$hi] 397 str $Ehi,[$ctx,#$Eoff+$hi]
398 adds $t2,$Tlo,$t2
364 str $t2, [$ctx,#$Foff+$lo] 399 str $t2, [$ctx,#$Foff+$lo]
400 adc $t3,$Thi,$t3
365 str $t3, [$ctx,#$Foff+$hi] 401 str $t3, [$ctx,#$Foff+$hi]
366 402
367 ldr $Alo,[sp,#$Goff+0] 403 ldr $Alo,[sp,#$Goff+0]
@@ -373,12 +409,12 @@ $code.=<<___;
373 ldr $t2, [$ctx,#$Hoff+$lo] 409 ldr $t2, [$ctx,#$Hoff+$lo]
374 ldr $t3, [$ctx,#$Hoff+$hi] 410 ldr $t3, [$ctx,#$Hoff+$hi]
375 adds $t0,$Alo,$t0 411 adds $t0,$Alo,$t0
376 adc $t1,$Ahi,$t1
377 adds $t2,$Tlo,$t2
378 adc $t3,$Thi,$t3
379 str $t0, [$ctx,#$Goff+$lo] 412 str $t0, [$ctx,#$Goff+$lo]
413 adc $t1,$Ahi,$t1
380 str $t1, [$ctx,#$Goff+$hi] 414 str $t1, [$ctx,#$Goff+$hi]
415 adds $t2,$Tlo,$t2
381 str $t2, [$ctx,#$Hoff+$lo] 416 str $t2, [$ctx,#$Hoff+$lo]
417 adc $t3,$Thi,$t3
382 str $t3, [$ctx,#$Hoff+$hi] 418 str $t3, [$ctx,#$Hoff+$hi]
383 419
384 add sp,sp,#640 420 add sp,sp,#640
@@ -388,13 +424,156 @@ $code.=<<___;
388 bne .Loop 424 bne .Loop
389 425
390 add sp,sp,#8*9 @ destroy frame 426 add sp,sp,#8*9 @ destroy frame
427#if __ARM_ARCH__>=5
428 ldmia sp!,{r4-r12,pc}
429#else
391 ldmia sp!,{r4-r12,lr} 430 ldmia sp!,{r4-r12,lr}
392 tst lr,#1 431 tst lr,#1
393 moveq pc,lr @ be binary compatible with V4, yet 432 moveq pc,lr @ be binary compatible with V4, yet
394 bx lr @ interoperable with Thumb ISA:-) 433 bx lr @ interoperable with Thumb ISA:-)
395.size sha512_block_data_order,.-sha512_block_data_order 434#endif
396.asciz "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" 435___
436
437{
438my @Sigma0=(28,34,39);
439my @Sigma1=(14,18,41);
440my @sigma0=(1, 8, 7);
441my @sigma1=(19,61,6);
442
443my $Ktbl="r3";
444my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
445
446my @X=map("d$_",(0..15));
447my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
448
449sub NEON_00_15() {
450my $i=shift;
451my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
452my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
453
454$code.=<<___ if ($i<16 || $i&1);
455 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
456#if $i<16
457 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
458#endif
459 vshr.u64 $t1,$e,#@Sigma1[1]
460 vshr.u64 $t2,$e,#@Sigma1[2]
461___
462$code.=<<___;
463 vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
464 vsli.64 $t0,$e,#`64-@Sigma1[0]`
465 vsli.64 $t1,$e,#`64-@Sigma1[1]`
466 vsli.64 $t2,$e,#`64-@Sigma1[2]`
467#if $i<16 && defined(__ARMEL__)
468 vrev64.8 @X[$i],@X[$i]
469#endif
470 vadd.i64 $T1,$K,$h
471 veor $Ch,$f,$g
472 veor $t0,$t1
473 vand $Ch,$e
474 veor $t0,$t2 @ Sigma1(e)
475 veor $Ch,$g @ Ch(e,f,g)
476 vadd.i64 $T1,$t0
477 vshr.u64 $t0,$a,#@Sigma0[0]
478 vadd.i64 $T1,$Ch
479 vshr.u64 $t1,$a,#@Sigma0[1]
480 vshr.u64 $t2,$a,#@Sigma0[2]
481 vsli.64 $t0,$a,#`64-@Sigma0[0]`
482 vsli.64 $t1,$a,#`64-@Sigma0[1]`
483 vsli.64 $t2,$a,#`64-@Sigma0[2]`
484 vadd.i64 $T1,@X[$i%16]
485 vorr $Maj,$a,$c
486 vand $Ch,$a,$c
487 veor $h,$t0,$t1
488 vand $Maj,$b
489 veor $h,$t2 @ Sigma0(a)
490 vorr $Maj,$Ch @ Maj(a,b,c)
491 vadd.i64 $h,$T1
492 vadd.i64 $d,$T1
493 vadd.i64 $h,$Maj
494___
495}
496
497sub NEON_16_79() {
498my $i=shift;
499
500if ($i&1) { &NEON_00_15($i,@_); return; }
501
502# 2x-vectorized, therefore runs every 2nd round
503my @X=map("q$_",(0..7)); # view @X as 128-bit vector
504my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
505my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
506my $e=@_[4]; # $e from NEON_00_15
507$i /= 2;
508$code.=<<___;
509 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
510 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
511 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
512 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
513 vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
514 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
515 veor $s1,$t0
516 vshr.u64 $t0,$s0,#@sigma0[0]
517 veor $s1,$t1 @ sigma1(X[i+14])
518 vshr.u64 $t1,$s0,#@sigma0[1]
519 vadd.i64 @X[$i%8],$s1
520 vshr.u64 $s1,$s0,#@sigma0[2]
521 vsli.64 $t0,$s0,#`64-@sigma0[0]`
522 vsli.64 $t1,$s0,#`64-@sigma0[1]`
523 vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
524 veor $s1,$t0
525 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
526 vadd.i64 @X[$i%8],$s0
527 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
528 veor $s1,$t1 @ sigma0(X[i+1])
529 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
530 vadd.i64 @X[$i%8],$s1
531___
532 &NEON_00_15(2*$i,@_);
533}
534
535$code.=<<___;
536#if __ARM_ARCH__>=7
537.fpu neon
538
539.align 4
540.LNEON:
541 dmb @ errata #451034 on early Cortex A8
542 vstmdb sp!,{d8-d15} @ ABI specification says so
543 sub $Ktbl,r3,#672 @ K512
544 vldmia $ctx,{$A-$H} @ load context
545.Loop_neon:
546___
547for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
548$code.=<<___;
549 mov $cnt,#4
550.L16_79_neon:
551 subs $cnt,#1
552___
553for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
554$code.=<<___;
555 bne .L16_79_neon
556
557 vldmia $ctx,{d24-d31} @ load context to temp
558 vadd.i64 q8,q12 @ vectorized accumulate
559 vadd.i64 q9,q13
560 vadd.i64 q10,q14
561 vadd.i64 q11,q15
562 vstmia $ctx,{$A-$H} @ save context
563 teq $inp,$len
564 sub $Ktbl,#640 @ rewind K512
565 bne .Loop_neon
566
567 vldmia sp!,{d8-d15} @ epilogue
568 bx lr
569#endif
570___
571}
572$code.=<<___;
573.size sha512_block_data_order,.-sha512_block_data_order
574.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
397.align 2 575.align 2
576.comm OPENSSL_armcap_P,4,4
398___ 577___
399 578
400$code =~ s/\`([^\`]*)\`/eval $1/gem; 579$code =~ s/\`([^\`]*)\`/eval $1/gem;
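
[Editor's note] The @ LO / @ HI comments in BODY_00_15 of sha512-armv4.pl describe how each 64-bit rotate is split across two 32-bit registers, e.g. the low word of ROTR64(x,1) is lo>>1 ^ hi<<31 and the high word is hi>>1 ^ lo<<31. The sketch below is editorial only, not part of the patch, and assumes a perl built with 64-bit integers; rotr64 is my own helper.

#!/usr/bin/env perl
# Editorial sketch only: check the 32-bit lo/hi split of a 64-bit rotate.
use strict;
use warnings;

sub rotr64 { my ($x,$n) = @_; (($x >> $n) | ($x << (64-$n))) & 0xffffffffffffffff }

for (1 .. 1000) {
    my $x = (int(rand(2**32)) << 32) | int(rand(2**32));
    my ($hi,$lo) = ($x >> 32, $x & 0xffffffff);
    my $rlo = ($lo >> 1) ^ (($hi << 31) & 0xffffffff);   # LO  lo>>1 ^ hi<<31
    my $rhi = ($hi >> 1) ^ (($lo << 31) & 0xffffffff);   # HI  hi>>1 ^ lo<<31
    die "lo/hi split broken" unless rotr64($x,1) == (($rhi << 32) | $rlo);
}
print "32-bit split of ROTR64 holds\n";
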
diff --git a/src/lib/libcrypto/sha/asm/sha512-mips.pl b/src/lib/libcrypto/sha/asm/sha512-mips.pl
new file mode 100644
index 0000000000..ba5b250890
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha512-mips.pl
@@ -0,0 +1,455 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA2 block procedures for MIPS.
11
12# October 2010.
13#
14# SHA256 performance improvement on MIPS R5000 CPU is ~27% over gcc-
15# generated code in o32 build and ~55% in n32/64 build. SHA512 [which
16# for now can only be compiled for MIPS64 ISA] improvement is modest
17# ~17%, but it comes for free, because it's the same instruction sequence.
18# Improvement coefficients are for aligned input.
19
20######################################################################
21# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
22# widely used. Then there is a new contender: NUBI. It appears that if
23# one picks the latter, it's possible to arrange code in ABI neutral
24# manner. Therefore let's stick to NUBI register layout:
25#
26($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
27($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
28($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
29($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
30#
31# The return value is placed in $a0. Following coding rules facilitate
32# interoperability:
33#
34# - never ever touch $tp, "thread pointer", former $gp [o32 can be
35# excluded from the rule, because it's specified volatile];
36# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
37# old code];
38# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
39#
40# For reference here is register layout for N32/64 MIPS ABIs:
41#
42# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
43# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
44# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
45# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
46# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
47#
48$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
49
50if ($flavour =~ /64|n32/i) {
51 $PTR_ADD="dadd"; # incidentally works even on n32
52 $PTR_SUB="dsub"; # incidentally works even on n32
53 $REG_S="sd";
54 $REG_L="ld";
55 $PTR_SLL="dsll"; # incidentally works even on n32
56 $SZREG=8;
57} else {
58 $PTR_ADD="add";
59 $PTR_SUB="sub";
60 $REG_S="sw";
61 $REG_L="lw";
62 $PTR_SLL="sll";
63 $SZREG=4;
64}
65$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
66#
67# <appro@openssl.org>
68#
69######################################################################
70
71$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
72
73for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
74open STDOUT,">$output";
75
76if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); }
77
78if ($output =~ /512/) {
79 $label="512";
80 $SZ=8;
81 $LD="ld"; # load from memory
82 $ST="sd"; # store to memory
83 $SLL="dsll"; # shift left logical
84 $SRL="dsrl"; # shift right logical
85 $ADDU="daddu";
86 @Sigma0=(28,34,39);
87 @Sigma1=(14,18,41);
88 @sigma0=( 7, 1, 8); # right shift first
89 @sigma1=( 6,19,61); # right shift first
90 $lastK=0x817;
91 $rounds=80;
92} else {
93 $label="256";
94 $SZ=4;
95 $LD="lw"; # load from memory
96 $ST="sw"; # store to memory
97 $SLL="sll"; # shift left logical
98 $SRL="srl"; # shift right logical
99 $ADDU="addu";
100 @Sigma0=( 2,13,22);
101 @Sigma1=( 6,11,25);
102 @sigma0=( 3, 7,18); # right shift first
103 @sigma1=(10,17,19); # right shift first
104 $lastK=0x8f2;
105 $rounds=64;
106}
107
108$MSB = $big_endian ? 0 : ($SZ-1);
109$LSB = ($SZ-1)&~$MSB;
110
111@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31));
112@X=map("\$$_",(8..23));
113
114$ctx=$a0;
115$inp=$a1;
116$len=$a2; $Ktbl=$len;
117
118sub BODY_00_15 {
119my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
120my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]);
121
122$code.=<<___ if ($i<15);
123 ${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp)
124 ${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp)
125___
126$code.=<<___ if (!$big_endian && $i<16 && $SZ==4);
127 srl $tmp0,@X[0],24 # byte swap($i)
128 srl $tmp1,@X[0],8
129 andi $tmp2,@X[0],0xFF00
130 sll @X[0],@X[0],24
131 andi $tmp1,0xFF00
132 sll $tmp2,$tmp2,8
133 or @X[0],$tmp0
134 or $tmp1,$tmp2
135 or @X[0],$tmp1
136___
137$code.=<<___ if (!$big_endian && $i<16 && $SZ==8);
138 ori $tmp0,$zero,0xFF
139 dsll $tmp2,$tmp0,32
140 or $tmp0,$tmp2 # 0x000000FF000000FF
141 and $tmp1,@X[0],$tmp0 # byte swap($i)
142 dsrl $tmp2,@X[0],24
143 dsll $tmp1,24
144 and $tmp2,$tmp0
145 dsll $tmp0,8 # 0x0000FF000000FF00
146 or $tmp1,$tmp2
147 and $tmp2,@X[0],$tmp0
148 dsrl @X[0],8
149 dsll $tmp2,8
150 and @X[0],$tmp0
151 or $tmp1,$tmp2
152 or @X[0],$tmp1
153 dsrl $tmp1,@X[0],32
154 dsll @X[0],32
155 or @X[0],$tmp1
156___
157$code.=<<___;
158 $ADDU $T1,$X[0],$h # $i
159 $SRL $h,$e,@Sigma1[0]
160 xor $tmp2,$f,$g
161 $SLL $tmp1,$e,`$SZ*8-@Sigma1[2]`
162 and $tmp2,$e
163 $SRL $tmp0,$e,@Sigma1[1]
164 xor $h,$tmp1
165 $SLL $tmp1,$e,`$SZ*8-@Sigma1[1]`
166 xor $h,$tmp0
167 $SRL $tmp0,$e,@Sigma1[2]
168 xor $h,$tmp1
169 $SLL $tmp1,$e,`$SZ*8-@Sigma1[0]`
170 xor $h,$tmp0
171 xor $tmp2,$g # Ch(e,f,g)
172 xor $tmp0,$tmp1,$h # Sigma1(e)
173
174 $SRL $h,$a,@Sigma0[0]
175 $ADDU $T1,$tmp2
176 $LD $tmp2,`$i*$SZ`($Ktbl) # K[$i]
177 $SLL $tmp1,$a,`$SZ*8-@Sigma0[2]`
178 $ADDU $T1,$tmp0
179 $SRL $tmp0,$a,@Sigma0[1]
180 xor $h,$tmp1
181 $SLL $tmp1,$a,`$SZ*8-@Sigma0[1]`
182 xor $h,$tmp0
183 $SRL $tmp0,$a,@Sigma0[2]
184 xor $h,$tmp1
185 $SLL $tmp1,$a,`$SZ*8-@Sigma0[0]`
186 xor $h,$tmp0
187 $ST @X[0],`($i%16)*$SZ`($sp) # offload to ring buffer
188 xor $h,$tmp1 # Sigma0(a)
189
190 or $tmp0,$a,$b
191 and $tmp1,$a,$b
192 and $tmp0,$c
193 or $tmp1,$tmp0 # Maj(a,b,c)
194 $ADDU $T1,$tmp2 # +=K[$i]
195 $ADDU $h,$tmp1
196
197 $ADDU $d,$T1
198 $ADDU $h,$T1
199___
200$code.=<<___ if ($i>=13);
201 $LD @X[3],`(($i+3)%16)*$SZ`($sp) # prefetch from ring buffer
202___
203}
204
205sub BODY_16_XX {
206my $i=@_[0];
207my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]);
208
209$code.=<<___;
210 $SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i)
211 $ADDU @X[0],@X[9] # +=X[i+9]
212 $SLL $tmp1,@X[1],`$SZ*8-@sigma0[2]`
213 $SRL $tmp0,@X[1],@sigma0[1]
214 xor $tmp2,$tmp1
215 $SLL $tmp1,`@sigma0[2]-@sigma0[1]`
216 xor $tmp2,$tmp0
217 $SRL $tmp0,@X[1],@sigma0[2]
218 xor $tmp2,$tmp1
219
220 $SRL $tmp3,@X[14],@sigma1[0]
221 xor $tmp2,$tmp0 # sigma0(X[i+1])
222 $SLL $tmp1,@X[14],`$SZ*8-@sigma1[2]`
223 $ADDU @X[0],$tmp2
224 $SRL $tmp0,@X[14],@sigma1[1]
225 xor $tmp3,$tmp1
226 $SLL $tmp1,`@sigma1[2]-@sigma1[1]`
227 xor $tmp3,$tmp0
228 $SRL $tmp0,@X[14],@sigma1[2]
229 xor $tmp3,$tmp1
230
231 xor $tmp3,$tmp0 # sigma1(X[i+14])
232 $ADDU @X[0],$tmp3
233___
234 &BODY_00_15(@_);
235}
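BODY_16_XX only adds the message-schedule expansion in front of the round: the 16-entry ring buffer kept on the stack is updated as X[i%16] += sigma0(X[(i+1)%16]) + X[(i+9)%16] + sigma1(X[(i+14)%16]) before BODY_00_15 runs. A small C sketch of that update, again reusing the sigma helpers sketched earlier (the function name is illustrative):

/* expand one word of the schedule in the 16-entry ring buffer */
static void xupdate(uint32_t X[16], unsigned i)
{
    X[i % 16] += sigma0(X[(i + 1) % 16])
               + X[(i + 9) % 16]
               + sigma1(X[(i + 14) % 16]);
}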
236
237$FRAMESIZE=16*$SZ+16*$SZREG;
238$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
239
240$code.=<<___;
241#ifdef OPENSSL_FIPSCANISTER
242# include <openssl/fipssyms.h>
243#endif
244
245.text
246.set noat
247#if !defined(__vxworks) || defined(__pic__)
248.option pic2
249#endif
250
251.align 5
252.globl sha${label}_block_data_order
253.ent sha${label}_block_data_order
254sha${label}_block_data_order:
255 .frame $sp,$FRAMESIZE,$ra
256 .mask $SAVED_REGS_MASK,-$SZREG
257 .set noreorder
258___
259$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
260 .cpload $pf
261___
262$code.=<<___;
263 $PTR_SUB $sp,$FRAMESIZE
264 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
265 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
266 $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
267 $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
268 $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
269 $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
270 $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
271 $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
272 $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
273 $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
274___
275$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
276 $REG_S $s3,$FRAMESIZE-11*$SZREG($sp)
277 $REG_S $s2,$FRAMESIZE-12*$SZREG($sp)
278 $REG_S $s1,$FRAMESIZE-13*$SZREG($sp)
279 $REG_S $s0,$FRAMESIZE-14*$SZREG($sp)
280 $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
281___
282$code.=<<___;
283 $PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)`
284___
285$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
286 .cplocal $Ktbl
287 .cpsetup $pf,$zero,sha${label}_block_data_order
288___
289$code.=<<___;
290 .set reorder
291 la $Ktbl,K${label} # PIC-ified 'load address'
292
293 $LD $A,0*$SZ($ctx) # load context
294 $LD $B,1*$SZ($ctx)
295 $LD $C,2*$SZ($ctx)
296 $LD $D,3*$SZ($ctx)
297 $LD $E,4*$SZ($ctx)
298 $LD $F,5*$SZ($ctx)
299 $LD $G,6*$SZ($ctx)
300 $LD $H,7*$SZ($ctx)
301
302 $PTR_ADD @X[15],$inp # pointer to the end of input
303 $REG_S @X[15],16*$SZ($sp)
304 b .Loop
305
306.align 5
307.Loop:
308 ${LD}l @X[0],$MSB($inp)
309 ${LD}r @X[0],$LSB($inp)
310___
311for ($i=0;$i<16;$i++)
312{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
313$code.=<<___;
314 b .L16_xx
315.align 4
316.L16_xx:
317___
318for (;$i<32;$i++)
319{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
320$code.=<<___;
321 and @X[6],0xfff
322 li @X[7],$lastK
323 .set noreorder
324 bne @X[6],@X[7],.L16_xx
325 $PTR_ADD $Ktbl,16*$SZ # Ktbl+=16
326
327 $REG_L @X[15],16*$SZ($sp) # restore pointer to the end of input
328 $LD @X[0],0*$SZ($ctx)
329 $LD @X[1],1*$SZ($ctx)
330 $LD @X[2],2*$SZ($ctx)
331 $PTR_ADD $inp,16*$SZ
332 $LD @X[3],3*$SZ($ctx)
333 $ADDU $A,@X[0]
334 $LD @X[4],4*$SZ($ctx)
335 $ADDU $B,@X[1]
336 $LD @X[5],5*$SZ($ctx)
337 $ADDU $C,@X[2]
338 $LD @X[6],6*$SZ($ctx)
339 $ADDU $D,@X[3]
340 $LD @X[7],7*$SZ($ctx)
341 $ADDU $E,@X[4]
342 $ST $A,0*$SZ($ctx)
343 $ADDU $F,@X[5]
344 $ST $B,1*$SZ($ctx)
345 $ADDU $G,@X[6]
346 $ST $C,2*$SZ($ctx)
347 $ADDU $H,@X[7]
348 $ST $D,3*$SZ($ctx)
349 $ST $E,4*$SZ($ctx)
350 $ST $F,5*$SZ($ctx)
351 $ST $G,6*$SZ($ctx)
352 $ST $H,7*$SZ($ctx)
353
354 bnel $inp,@X[15],.Loop
355 $PTR_SUB $Ktbl,`($rounds-16)*$SZ` # rewind $Ktbl
356
357 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
358 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
359 $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
360 $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
361 $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
362 $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
363 $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
364 $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
365 $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
366 $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
367___
368$code.=<<___ if ($flavour =~ /nubi/i);
369 $REG_L $s3,$FRAMESIZE-11*$SZREG($sp)
370 $REG_L $s2,$FRAMESIZE-12*$SZREG($sp)
371 $REG_L $s1,$FRAMESIZE-13*$SZREG($sp)
372 $REG_L $s0,$FRAMESIZE-14*$SZREG($sp)
373 $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
374___
375$code.=<<___;
376 jr $ra
377 $PTR_ADD $sp,$FRAMESIZE
378.end sha${label}_block_data_order
379
380.rdata
381.align 5
382K${label}:
383___
384if ($SZ==4) {
385$code.=<<___;
386 .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
387 .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
388 .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
389 .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
390 .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
391 .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
392 .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
393 .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
394 .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
395 .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
396 .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
397 .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
398 .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
399 .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
400 .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
401 .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
402___
403} else {
404$code.=<<___;
405 .dword 0x428a2f98d728ae22, 0x7137449123ef65cd
406 .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
407 .dword 0x3956c25bf348b538, 0x59f111f1b605d019
408 .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
409 .dword 0xd807aa98a3030242, 0x12835b0145706fbe
410 .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
411 .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
412 .dword 0x9bdc06a725c71235, 0xc19bf174cf692694
413 .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
414 .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
415 .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
416 .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
417 .dword 0x983e5152ee66dfab, 0xa831c66d2db43210
418 .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4
419 .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725
420 .dword 0x06ca6351e003826f, 0x142929670a0e6e70
421 .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926
422 .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
423 .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8
424 .dword 0x81c2c92e47edaee6, 0x92722c851482353b
425 .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001
426 .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30
427 .dword 0xd192e819d6ef5218, 0xd69906245565a910
428 .dword 0xf40e35855771202a, 0x106aa07032bbd1b8
429 .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
430 .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
431 .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
432 .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
433 .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60
434 .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec
435 .dword 0x90befffa23631e28, 0xa4506cebde82bde9
436 .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b
437 .dword 0xca273eceea26619c, 0xd186b8c721c0c207
438 .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
439 .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6
440 .dword 0x113f9804bef90dae, 0x1b710b35131c471b
441 .dword 0x28db77f523047d84, 0x32caab7b40c72493
442 .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
443 .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
444 .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
445___
446}
447$code.=<<___;
448.asciiz "SHA${label} for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
449.align 5
450
451___
452
453$code =~ s/\`([^\`]*)\`/eval $1/gem;
454print $code;
455close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-parisc.pl b/src/lib/libcrypto/sha/asm/sha512-parisc.pl
new file mode 100755
index 0000000000..e24ee58ae9
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha512-parisc.pl
@@ -0,0 +1,791 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256/512 block procedure for PA-RISC.
11
12# June 2009.
13#
14# SHA256 performance is >75% better than gcc 3.2 generated code on
15# PA-7100LC. Compared to code generated by the vendor compiler, this
16# implementation is almost 70% faster in a 64-bit build, but delivers
17# virtually the same performance in a 32-bit build on PA-8600.
18#
19# SHA512 performance is >2.9x better than gcc 3.2 generated code on
20# PA-7100LC, a PA-RISC 1.1 processor. The implementation then detects
21# whether it is running on a PA-RISC 2.0 processor and switches to a
22# 64-bit code path, delivering adequate performance even in a "blended" 32-bit
23# build. Though 64-bit code is not any faster than code generated by
24# vendor compiler on PA-8600...
25#
26# Special thanks to polarhome.com for providing HP-UX account.
27
28$flavour = shift;
29$output = shift;
30open STDOUT,">$output";
31
32if ($flavour =~ /64/) {
33 $LEVEL ="2.0W";
34 $SIZE_T =8;
35 $FRAME_MARKER =80;
36 $SAVED_RP =16;
37 $PUSH ="std";
38 $PUSHMA ="std,ma";
39 $POP ="ldd";
40 $POPMB ="ldd,mb";
41} else {
42 $LEVEL ="1.0";
43 $SIZE_T =4;
44 $FRAME_MARKER =48;
45 $SAVED_RP =20;
46 $PUSH ="stw";
47 $PUSHMA ="stwm";
48 $POP ="ldw";
49 $POPMB ="ldwm";
50}
51
52if ($output =~ /512/) {
53 $func="sha512_block_data_order";
54 $SZ=8;
55 @Sigma0=(28,34,39);
56 @Sigma1=(14,18,41);
57 @sigma0=(1, 8, 7);
58 @sigma1=(19,61, 6);
59 $rounds=80;
60 $LAST10BITS=0x017;
61 $LD="ldd";
62 $LDM="ldd,ma";
63 $ST="std";
64} else {
65 $func="sha256_block_data_order";
66 $SZ=4;
67 @Sigma0=( 2,13,22);
68 @Sigma1=( 6,11,25);
69 @sigma0=( 7,18, 3);
70 @sigma1=(17,19,10);
71 $rounds=64;
72 $LAST10BITS=0x0f2;
73 $LD="ldw";
74 $LDM="ldwm";
75 $ST="stw";
76}
77
78$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
79 # [+ argument transfer]
80$XOFF=16*$SZ+32; # local variables
81$FRAME+=$XOFF;
82$XOFF+=$FRAME_MARKER; # distance between %sp and local variables
83
84$ctx="%r26"; # zapped by $a0
85$inp="%r25"; # zapped by $a1
86$num="%r24"; # zapped by $t0
87
88$a0 ="%r26";
89$a1 ="%r25";
90$t0 ="%r24";
91$t1 ="%r29";
92$Tbl="%r31";
93
94@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");
95
96@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
97 "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);
98
99sub ROUND_00_15 {
100my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
101$code.=<<___;
102 _ror $e,$Sigma1[0],$a0
103 and $f,$e,$t0
104 _ror $e,$Sigma1[1],$a1
105 addl $t1,$h,$h
106 andcm $g,$e,$t1
107 xor $a1,$a0,$a0
108 _ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1
109 or $t0,$t1,$t1 ; Ch(e,f,g)
110 addl @X[$i%16],$h,$h
111 xor $a0,$a1,$a1 ; Sigma1(e)
112 addl $t1,$h,$h
113 _ror $a,$Sigma0[0],$a0
114 addl $a1,$h,$h
115
116 _ror $a,$Sigma0[1],$a1
117 and $a,$b,$t0
118 and $a,$c,$t1
119 xor $a1,$a0,$a0
120 _ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1
121 xor $t1,$t0,$t0
122 and $b,$c,$t1
123 xor $a0,$a1,$a1 ; Sigma0(a)
124 addl $h,$d,$d
125 xor $t1,$t0,$t0 ; Maj(a,b,c)
126 `"$LDM $SZ($Tbl),$t1" if ($i<15)`
127 addl $a1,$h,$h
128 addl $t0,$h,$h
129
130___
131}
132
133sub ROUND_16_xx {
134my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
135$i-=16;
136$code.=<<___;
137 _ror @X[($i+1)%16],$sigma0[0],$a0
138 _ror @X[($i+1)%16],$sigma0[1],$a1
139 addl @X[($i+9)%16],@X[$i],@X[$i]
140 _ror @X[($i+14)%16],$sigma1[0],$t0
141 _ror @X[($i+14)%16],$sigma1[1],$t1
142 xor $a1,$a0,$a0
143 _shr @X[($i+1)%16],$sigma0[2],$a1
144 xor $t1,$t0,$t0
145 _shr @X[($i+14)%16],$sigma1[2],$t1
146 xor $a1,$a0,$a0 ; sigma0(X[(i+1)&0x0f])
147 xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f])
148 $LDM $SZ($Tbl),$t1
149 addl $a0,@X[$i],@X[$i]
150 addl $t0,@X[$i],@X[$i]
151___
152$code.=<<___ if ($i==15);
153 extru $t1,31,10,$a1
154 comiclr,<> $LAST10BITS,$a1,%r0
155 ldo 1($Tbl),$Tbl ; signal end of $Tbl
156___
157&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
158}
159
160$code=<<___;
161 .LEVEL $LEVEL
162 .SPACE \$TEXT\$
163 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
164
165 .ALIGN 64
166L\$table
167___
168$code.=<<___ if ($SZ==8);
169 .WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
170 .WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
171 .WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
172 .WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
173 .WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
174 .WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
175 .WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
176 .WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
177 .WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
178 .WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
179 .WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
180 .WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
181 .WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
182 .WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
183 .WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
184 .WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
185 .WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
186 .WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
187 .WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
188 .WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
189 .WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
190 .WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
191 .WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
192 .WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
193 .WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
194 .WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
195 .WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
196 .WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
197 .WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
198 .WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
199 .WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
200 .WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
201 .WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
202 .WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
203 .WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
204 .WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
205 .WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
206 .WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
207 .WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
208 .WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
209___
210$code.=<<___ if ($SZ==4);
211 .WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
212 .WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
213 .WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
214 .WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
215 .WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
216 .WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
217 .WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
218 .WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
219 .WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
220 .WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
221 .WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
222 .WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
223 .WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
224 .WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
225 .WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
226 .WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
227___
228$code.=<<___;
229
230 .EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
231 .ALIGN 64
232$func
233 .PROC
234 .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
235 .ENTRY
236 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
237 $PUSHMA %r3,$FRAME(%sp)
238 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
239 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
240 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
241 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
242 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
243 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
244 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
245 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
246 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
247 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
248 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
249 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
250 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
251 $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
252 $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
253
254 _shl $num,`log(16*$SZ)/log(2)`,$num
255 addl $inp,$num,$num ; $num to point at the end of $inp
256
257 $PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments
258 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
259 $PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)
260
261 blr %r0,$Tbl
262 ldi 3,$t1
263L\$pic
264 andcm $Tbl,$t1,$Tbl ; wipe privilege level
265 ldo L\$table-L\$pic($Tbl),$Tbl
266___
267$code.=<<___ if ($SZ==8 && $SIZE_T==4);
268 ldi 31,$t1
269 mtctl $t1,%cr11
270 extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0
271 b L\$parisc1
272 nop
273___
274$code.=<<___;
275 $LD `0*$SZ`($ctx),$A ; load context
276 $LD `1*$SZ`($ctx),$B
277 $LD `2*$SZ`($ctx),$C
278 $LD `3*$SZ`($ctx),$D
279 $LD `4*$SZ`($ctx),$E
280 $LD `5*$SZ`($ctx),$F
281 $LD `6*$SZ`($ctx),$G
282 $LD `7*$SZ`($ctx),$H
283
284 extru $inp,31,`log($SZ)/log(2)`,$t0
285 sh3addl $t0,%r0,$t0
286 subi `8*$SZ`,$t0,$t0
287 mtctl $t0,%cr11 ; load %sar with align factor
288
289L\$oop
290 ldi `$SZ-1`,$t0
291 $LDM $SZ($Tbl),$t1
292 andcm $inp,$t0,$t0 ; align $inp
293___
294 for ($i=0;$i<15;$i++) { # load input block
295 $code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; }
296$code.=<<___;
297 cmpb,*= $inp,$t0,L\$aligned
298 $LD `$SZ*15`($t0),@X[15]
299 $LD `$SZ*16`($t0),@X[16]
300___
301 for ($i=0;$i<16;$i++) { # align data
302 $code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; }
303$code.=<<___;
304L\$aligned
305 nop ; otherwise /usr/ccs/bin/as is confused by below .WORD
306___
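Note on the alignment dance above: %sar was loaded with 8*SZ minus eight times the byte misalignment of $inp, the block is read from the aligned-down address as up to 17 words, and _align (vshd or shrpd with %sar) funnel-shifts each adjacent pair so that @X ends up holding the logically unaligned big-endian input. A rough C equivalent of that extraction for the 64-bit case; align_word is a hypothetical helper, not something in this module:

#include <stdint.h>

/* rebuild the big-endian word that starts k bytes (0 <= k < 8) into w0,
 * from the two aligned loads w0,w1 that straddle it */
static uint64_t align_word(uint64_t w0, uint64_t w1, unsigned k)
{
    if (k == 0)
        return w0;                              /* already aligned */
    return (w0 << (8 * k)) | (w1 >> (64 - 8 * k));
}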
307
308for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
309$code.=<<___;
310L\$rounds
311 nop ; otherwise /usr/ccs/bin/as is confused by below .WORD
312___
313for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
314$code.=<<___;
315 bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled?
316 nop
317
318 $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
319 $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
320 $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
321 ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl
322
323 $LD `0*$SZ`($ctx),@X[0] ; load context
324 $LD `1*$SZ`($ctx),@X[1]
325 $LD `2*$SZ`($ctx),@X[2]
326 $LD `3*$SZ`($ctx),@X[3]
327 $LD `4*$SZ`($ctx),@X[4]
328 $LD `5*$SZ`($ctx),@X[5]
329 addl @X[0],$A,$A
330 $LD `6*$SZ`($ctx),@X[6]
331 addl @X[1],$B,$B
332 $LD `7*$SZ`($ctx),@X[7]
333 ldo `16*$SZ`($inp),$inp ; advance $inp
334
335 $ST $A,`0*$SZ`($ctx) ; save context
336 addl @X[2],$C,$C
337 $ST $B,`1*$SZ`($ctx)
338 addl @X[3],$D,$D
339 $ST $C,`2*$SZ`($ctx)
340 addl @X[4],$E,$E
341 $ST $D,`3*$SZ`($ctx)
342 addl @X[5],$F,$F
343 $ST $E,`4*$SZ`($ctx)
344 addl @X[6],$G,$G
345 $ST $F,`5*$SZ`($ctx)
346 addl @X[7],$H,$H
347 $ST $G,`6*$SZ`($ctx)
348 $ST $H,`7*$SZ`($ctx)
349
350 cmpb,*<>,n $inp,$num,L\$oop
351 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
352___
353if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0
354{{
355$code.=<<___;
356 b L\$done
357 nop
358
359 .ALIGN 64
360L\$parisc1
361___
362
363@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo,
364 $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
365 ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
366 "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
367$a0 ="%r17";
368$a1 ="%r18";
369$a2 ="%r19";
370$a3 ="%r20";
371$t0 ="%r21";
372$t1 ="%r22";
373$t2 ="%r28";
374$t3 ="%r29";
375$Tbl="%r31";
376
377@X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx
378
379sub ROUND_00_15_pa1 {
380my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
381 $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
382my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
383
384$code.=<<___ if (!$flag);
385 ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
386 ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
387___
388$code.=<<___;
389 shd $ehi,$elo,$Sigma1[0],$t0
390 add $Xlo,$hlo,$hlo
391 shd $elo,$ehi,$Sigma1[0],$t1
392 addc $Xhi,$hhi,$hhi ; h += X[i]
393 shd $ehi,$elo,$Sigma1[1],$t2
394 ldwm 8($Tbl),$Xhi
395 shd $elo,$ehi,$Sigma1[1],$t3
396 ldw -4($Tbl),$Xlo ; load K[i]
397 xor $t2,$t0,$t0
398 xor $t3,$t1,$t1
399 and $flo,$elo,$a0
400 and $fhi,$ehi,$a1
401 shd $ehi,$elo,$Sigma1[2],$t2
402 andcm $glo,$elo,$a2
403 shd $elo,$ehi,$Sigma1[2],$t3
404 andcm $ghi,$ehi,$a3
405 xor $t2,$t0,$t0
406 xor $t3,$t1,$t1 ; Sigma1(e)
407 add $Xlo,$hlo,$hlo
408 xor $a2,$a0,$a0
409 addc $Xhi,$hhi,$hhi ; h += K[i]
410 xor $a3,$a1,$a1 ; Ch(e,f,g)
411
412 add $t0,$hlo,$hlo
413 shd $ahi,$alo,$Sigma0[0],$t0
414 addc $t1,$hhi,$hhi ; h += Sigma1(e)
415 shd $alo,$ahi,$Sigma0[0],$t1
416 add $a0,$hlo,$hlo
417 shd $ahi,$alo,$Sigma0[1],$t2
418 addc $a1,$hhi,$hhi ; h += Ch(e,f,g)
419 shd $alo,$ahi,$Sigma0[1],$t3
420
421 xor $t2,$t0,$t0
422 xor $t3,$t1,$t1
423 shd $ahi,$alo,$Sigma0[2],$t2
424 and $alo,$blo,$a0
425 shd $alo,$ahi,$Sigma0[2],$t3
426 and $ahi,$bhi,$a1
427 xor $t2,$t0,$t0
428 xor $t3,$t1,$t1 ; Sigma0(a)
429
430 and $alo,$clo,$a2
431 and $ahi,$chi,$a3
432 xor $a2,$a0,$a0
433 add $hlo,$dlo,$dlo
434 xor $a3,$a1,$a1
435 addc $hhi,$dhi,$dhi ; d += h
436 and $blo,$clo,$a2
437 add $t0,$hlo,$hlo
438 and $bhi,$chi,$a3
439 addc $t1,$hhi,$hhi ; h += Sigma0(a)
440 xor $a2,$a0,$a0
441 add $a0,$hlo,$hlo
442 xor $a3,$a1,$a1 ; Maj(a,b,c)
443 addc $a1,$hhi,$hhi ; h += Maj(a,b,c)
444
445___
446$code.=<<___ if ($i==15 && $flag);
447 extru $Xlo,31,10,$Xlo
448 comiclr,= $LAST10BITS,$Xlo,%r0
449 b L\$rounds_pa1
450 nop
451___
452push(@X,shift(@X)); push(@X,shift(@X));
453}
454
455sub ROUND_16_xx_pa1 {
456my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
457my ($i)=shift;
458$i-=16;
459$code.=<<___;
460 ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
461 ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
462 ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1
463 ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load X[i+9]
464 ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3
465 ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14]
466 shd $Xnhi,$Xnlo,$sigma0[0],$t0
467 shd $Xnlo,$Xnhi,$sigma0[0],$t1
468 add $a0,$Xlo,$Xlo
469 shd $Xnhi,$Xnlo,$sigma0[1],$t2
470 addc $a1,$Xhi,$Xhi
471 shd $Xnlo,$Xnhi,$sigma0[1],$t3
472 xor $t2,$t0,$t0
473 shd $Xnhi,$Xnlo,$sigma0[2],$t2
474 xor $t3,$t1,$t1
475 extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
476 xor $t2,$t0,$t0
477 shd $a3,$a2,$sigma1[0],$a0
478 xor $t3,$t1,$t1 ; sigma0(X[(i+1)&0x0f])
479 shd $a2,$a3,$sigma1[0],$a1
480 add $t0,$Xlo,$Xlo
481 shd $a3,$a2,$sigma1[1],$t2
482 addc $t1,$Xhi,$Xhi
483 shd $a2,$a3,$sigma1[1],$t3
484 xor $t2,$a0,$a0
485 shd $a3,$a2,$sigma1[2],$t2
486 xor $t3,$a1,$a1
487 extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
488 xor $t2,$a0,$a0
489 xor $t3,$a1,$a1 ; sigma1(X[(i+14)&0x0f])
490 add $a0,$Xlo,$Xlo
491 addc $a1,$Xhi,$Xhi
492
493 stw $Xhi,`-$XOFF+8*($i%16)`(%sp)
494 stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp)
495___
496&ROUND_00_15_pa1($i,@_,1);
497}
498$code.=<<___;
499 ldw `0*4`($ctx),$Ahi ; load context
500 ldw `1*4`($ctx),$Alo
501 ldw `2*4`($ctx),$Bhi
502 ldw `3*4`($ctx),$Blo
503 ldw `4*4`($ctx),$Chi
504 ldw `5*4`($ctx),$Clo
505 ldw `6*4`($ctx),$Dhi
506 ldw `7*4`($ctx),$Dlo
507 ldw `8*4`($ctx),$Ehi
508 ldw `9*4`($ctx),$Elo
509 ldw `10*4`($ctx),$Fhi
510 ldw `11*4`($ctx),$Flo
511 ldw `12*4`($ctx),$Ghi
512 ldw `13*4`($ctx),$Glo
513 ldw `14*4`($ctx),$Hhi
514 ldw `15*4`($ctx),$Hlo
515
516 extru $inp,31,2,$t0
517 sh3addl $t0,%r0,$t0
518 subi 32,$t0,$t0
519 mtctl $t0,%cr11 ; load %sar with align factor
520
521L\$oop_pa1
522 extru $inp,31,2,$a3
523 comib,= 0,$a3,L\$aligned_pa1
524 sub $inp,$a3,$inp
525
526 ldw `0*4`($inp),$X[0]
527 ldw `1*4`($inp),$X[1]
528 ldw `2*4`($inp),$t2
529 ldw `3*4`($inp),$t3
530 ldw `4*4`($inp),$a0
531 ldw `5*4`($inp),$a1
532 ldw `6*4`($inp),$a2
533 ldw `7*4`($inp),$a3
534 vshd $X[0],$X[1],$X[0]
535 vshd $X[1],$t2,$X[1]
536 stw $X[0],`-$XOFF+0*4`(%sp)
537 ldw `8*4`($inp),$t0
538 vshd $t2,$t3,$t2
539 stw $X[1],`-$XOFF+1*4`(%sp)
540 ldw `9*4`($inp),$t1
541 vshd $t3,$a0,$t3
542___
543{
544my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
545for ($i=2;$i<=(128/4-8);$i++) {
546$code.=<<___;
547 stw $t[0],`-$XOFF+$i*4`(%sp)
548 ldw `(8+$i)*4`($inp),$t[0]
549 vshd $t[1],$t[2],$t[1]
550___
551push(@t,shift(@t));
552}
553for (;$i<(128/4-1);$i++) {
554$code.=<<___;
555 stw $t[0],`-$XOFF+$i*4`(%sp)
556 vshd $t[1],$t[2],$t[1]
557___
558push(@t,shift(@t));
559}
560$code.=<<___;
561 b L\$collected_pa1
562 stw $t[0],`-$XOFF+$i*4`(%sp)
563
564___
565}
566$code.=<<___;
567L\$aligned_pa1
568 ldw `0*4`($inp),$X[0]
569 ldw `1*4`($inp),$X[1]
570 ldw `2*4`($inp),$t2
571 ldw `3*4`($inp),$t3
572 ldw `4*4`($inp),$a0
573 ldw `5*4`($inp),$a1
574 ldw `6*4`($inp),$a2
575 ldw `7*4`($inp),$a3
576 stw $X[0],`-$XOFF+0*4`(%sp)
577 ldw `8*4`($inp),$t0
578 stw $X[1],`-$XOFF+1*4`(%sp)
579 ldw `9*4`($inp),$t1
580___
581{
582my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
583for ($i=2;$i<(128/4-8);$i++) {
584$code.=<<___;
585 stw $t[0],`-$XOFF+$i*4`(%sp)
586 ldw `(8+$i)*4`($inp),$t[0]
587___
588push(@t,shift(@t));
589}
590for (;$i<128/4;$i++) {
591$code.=<<___;
592 stw $t[0],`-$XOFF+$i*4`(%sp)
593___
594push(@t,shift(@t));
595}
596$code.="L\$collected_pa1\n";
597}
598
599for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
600$code.="L\$rounds_pa1\n";
601for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
602
603$code.=<<___;
604 $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
605 $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
606 $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
607 ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl
608
609 ldw `0*4`($ctx),$t1 ; update context
610 ldw `1*4`($ctx),$t0
611 ldw `2*4`($ctx),$t3
612 ldw `3*4`($ctx),$t2
613 ldw `4*4`($ctx),$a1
614 ldw `5*4`($ctx),$a0
615 ldw `6*4`($ctx),$a3
616 add $t0,$Alo,$Alo
617 ldw `7*4`($ctx),$a2
618 addc $t1,$Ahi,$Ahi
619 ldw `8*4`($ctx),$t1
620 add $t2,$Blo,$Blo
621 ldw `9*4`($ctx),$t0
622 addc $t3,$Bhi,$Bhi
623 ldw `10*4`($ctx),$t3
624 add $a0,$Clo,$Clo
625 ldw `11*4`($ctx),$t2
626 addc $a1,$Chi,$Chi
627 ldw `12*4`($ctx),$a1
628 add $a2,$Dlo,$Dlo
629 ldw `13*4`($ctx),$a0
630 addc $a3,$Dhi,$Dhi
631 ldw `14*4`($ctx),$a3
632 add $t0,$Elo,$Elo
633 ldw `15*4`($ctx),$a2
634 addc $t1,$Ehi,$Ehi
635 stw $Ahi,`0*4`($ctx)
636 add $t2,$Flo,$Flo
637 stw $Alo,`1*4`($ctx)
638 addc $t3,$Fhi,$Fhi
639 stw $Bhi,`2*4`($ctx)
640 add $a0,$Glo,$Glo
641 stw $Blo,`3*4`($ctx)
642 addc $a1,$Ghi,$Ghi
643 stw $Chi,`4*4`($ctx)
644 add $a2,$Hlo,$Hlo
645 stw $Clo,`5*4`($ctx)
646 addc $a3,$Hhi,$Hhi
647 stw $Dhi,`6*4`($ctx)
648 ldo `16*$SZ`($inp),$inp ; advance $inp
649 stw $Dlo,`7*4`($ctx)
650 stw $Ehi,`8*4`($ctx)
651 stw $Elo,`9*4`($ctx)
652 stw $Fhi,`10*4`($ctx)
653 stw $Flo,`11*4`($ctx)
654 stw $Ghi,`12*4`($ctx)
655 stw $Glo,`13*4`($ctx)
656 stw $Hhi,`14*4`($ctx)
657 comb,= $inp,$num,L\$done
658 stw $Hlo,`15*4`($ctx)
659 b L\$oop_pa1
660 $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
661L\$done
662___
663}}
664$code.=<<___;
665 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
666 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
667 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
668 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
669 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
670 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
671 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
672 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
673 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
674 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
675 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
676 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
677 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
678 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
679 $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
680 $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
681 bv (%r2)
682 .EXIT
683 $POPMB -$FRAME(%sp),%r3
684 .PROCEND
685 .STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
686___
687
688# Explicitly encode PA-RISC 2.0 instructions used in this module, so
689# that it can be compiled with .LEVEL 1.0. It should be noted that I
690# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
691# directive...
692
693my $ldd = sub {
694 my ($mod,$args) = @_;
695 my $orig = "ldd$mod\t$args";
696
697 if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices
698 { my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1);
699 $opcode|=(1<<3) if ($mod =~ /^,m/);
700 $opcode|=(1<<2) if ($mod =~ /^,mb/);
701 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
702 }
703 else { "\t".$orig; }
704};
705
706my $std = sub {
707 my ($mod,$args) = @_;
708 my $orig = "std$mod\t$args";
709
710 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
711 { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
712 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
713 }
714 else { "\t".$orig; }
715};
716
717my $extrd = sub {
718 my ($mod,$args) = @_;
719 my $orig = "extrd$mod\t$args";
720
721 # I only have ",u" completer, it's implicitly encoded...
722 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
723 { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
724 my $len=32-$3;
725 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
726 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
727 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
728 }
729 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
730 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
731 my $len=32-$2;
732 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
733 $opcode |= (1<<13) if ($mod =~ /,\**=/);
734 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
735 }
736 else { "\t".$orig; }
737};
738
739my $shrpd = sub {
740 my ($mod,$args) = @_;
741 my $orig = "shrpd$mod\t$args";
742
743 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
744 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
745 my $cpos=63-$3;
746 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
747 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
748 }
749 elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
750 { sprintf "\t.WORD\t0x%08x\t; %s",
751 (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
752 }
753 else { "\t".$orig; }
754};
755
756sub assemble {
757 my ($mnemonic,$mod,$args)=@_;
758 my $opcode = eval("\$$mnemonic");
759
760 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
761}
762
763foreach (split("\n",$code)) {
764 s/\`([^\`]*)\`/eval $1/ge;
765
766 s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
767 $3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32
768 : sprintf("shd\t%$1,%$2,%d",$3)/e or
769 # translate made-up instructions: _ror, _shr, _align, _shl
770 s/_ror(\s+)(%r[0-9]+),/
771 ($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or
772
773 s/_shr(\s+%r[0-9]+),([0-9]+),/
774 $SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
775 : sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or
776
777 s/_align(\s+%r[0-9]+,%r[0-9]+),/
778 ($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or
779
780 s/_shl(\s+%r[0-9]+),([0-9]+),/
781 $SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
782 : sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;
783
784 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);
785
786 s/cmpb,\*/comb,/ if ($SIZE_T==4);
787
788 print $_,"\n";
789}
790
791close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-ppc.pl b/src/lib/libcrypto/sha/asm/sha512-ppc.pl
index 768a6a6fad..6b44a68e59 100755
--- a/src/lib/libcrypto/sha/asm/sha512-ppc.pl
+++ b/src/lib/libcrypto/sha/asm/sha512-ppc.pl
@@ -40,6 +40,7 @@ $output =shift;
40 40
41if ($flavour =~ /64/) { 41if ($flavour =~ /64/) {
42 $SIZE_T=8; 42 $SIZE_T=8;
43 $LRSAVE=2*$SIZE_T;
43 $STU="stdu"; 44 $STU="stdu";
44 $UCMP="cmpld"; 45 $UCMP="cmpld";
45 $SHL="sldi"; 46 $SHL="sldi";
@@ -47,6 +48,7 @@ if ($flavour =~ /64/) {
47 $PUSH="std"; 48 $PUSH="std";
48} elsif ($flavour =~ /32/) { 49} elsif ($flavour =~ /32/) {
49 $SIZE_T=4; 50 $SIZE_T=4;
51 $LRSAVE=$SIZE_T;
50 $STU="stwu"; 52 $STU="stwu";
51 $UCMP="cmplw"; 53 $UCMP="cmplw";
52 $SHL="slwi"; 54 $SHL="slwi";
@@ -87,7 +89,8 @@ if ($output =~ /512/) {
87 $SHR="srwi"; 89 $SHR="srwi";
88} 90}
89 91
90$FRAME=32*$SIZE_T; 92$FRAME=32*$SIZE_T+16*$SZ;
93$LOCALS=6*$SIZE_T;
91 94
92$sp ="r1"; 95$sp ="r1";
93$toc="r2"; 96$toc="r2";
@@ -179,13 +182,12 @@ $code=<<___;
179.globl $func 182.globl $func
180.align 6 183.align 6
181$func: 184$func:
185 $STU $sp,-$FRAME($sp)
182 mflr r0 186 mflr r0
183 $STU $sp,`-($FRAME+16*$SZ)`($sp)
184 $SHL $num,$num,`log(16*$SZ)/log(2)` 187 $SHL $num,$num,`log(16*$SZ)/log(2)`
185 188
186 $PUSH $ctx,`$FRAME-$SIZE_T*22`($sp) 189 $PUSH $ctx,`$FRAME-$SIZE_T*22`($sp)
187 190
188 $PUSH r0,`$FRAME-$SIZE_T*21`($sp)
189 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) 191 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
190 $PUSH r13,`$FRAME-$SIZE_T*19`($sp) 192 $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
191 $PUSH r14,`$FRAME-$SIZE_T*18`($sp) 193 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
@@ -206,6 +208,7 @@ $func:
206 $PUSH r29,`$FRAME-$SIZE_T*3`($sp) 208 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
207 $PUSH r30,`$FRAME-$SIZE_T*2`($sp) 209 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
208 $PUSH r31,`$FRAME-$SIZE_T*1`($sp) 210 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
211 $PUSH r0,`$FRAME+$LRSAVE`($sp)
209 212
210 $LD $A,`0*$SZ`($ctx) 213 $LD $A,`0*$SZ`($ctx)
211 mr $inp,r4 ; incarnate $inp 214 mr $inp,r4 ; incarnate $inp
@@ -217,7 +220,7 @@ $func:
217 $LD $G,`6*$SZ`($ctx) 220 $LD $G,`6*$SZ`($ctx)
218 $LD $H,`7*$SZ`($ctx) 221 $LD $H,`7*$SZ`($ctx)
219 222
220 b LPICmeup 223 bl LPICmeup
221LPICedup: 224LPICedup:
222 andi. r0,$inp,3 225 andi. r0,$inp,3
223 bne Lunaligned 226 bne Lunaligned
@@ -226,40 +229,14 @@ Laligned:
226 $PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer 229 $PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
227 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer 230 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
228 bl Lsha2_block_private 231 bl Lsha2_block_private
229Ldone: 232 b Ldone
230 $POP r0,`$FRAME-$SIZE_T*21`($sp)
231 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
232 $POP r13,`$FRAME-$SIZE_T*19`($sp)
233 $POP r14,`$FRAME-$SIZE_T*18`($sp)
234 $POP r15,`$FRAME-$SIZE_T*17`($sp)
235 $POP r16,`$FRAME-$SIZE_T*16`($sp)
236 $POP r17,`$FRAME-$SIZE_T*15`($sp)
237 $POP r18,`$FRAME-$SIZE_T*14`($sp)
238 $POP r19,`$FRAME-$SIZE_T*13`($sp)
239 $POP r20,`$FRAME-$SIZE_T*12`($sp)
240 $POP r21,`$FRAME-$SIZE_T*11`($sp)
241 $POP r22,`$FRAME-$SIZE_T*10`($sp)
242 $POP r23,`$FRAME-$SIZE_T*9`($sp)
243 $POP r24,`$FRAME-$SIZE_T*8`($sp)
244 $POP r25,`$FRAME-$SIZE_T*7`($sp)
245 $POP r26,`$FRAME-$SIZE_T*6`($sp)
246 $POP r27,`$FRAME-$SIZE_T*5`($sp)
247 $POP r28,`$FRAME-$SIZE_T*4`($sp)
248 $POP r29,`$FRAME-$SIZE_T*3`($sp)
249 $POP r30,`$FRAME-$SIZE_T*2`($sp)
250 $POP r31,`$FRAME-$SIZE_T*1`($sp)
251 mtlr r0
252 addi $sp,$sp,`$FRAME+16*$SZ`
253 blr
254___
255 233
256# PowerPC specification allows an implementation to be ill-behaved 234; PowerPC specification allows an implementation to be ill-behaved
257# upon unaligned access which crosses page boundary. "Better safe 235; upon unaligned access which crosses page boundary. "Better safe
258# than sorry" principle makes me treat it specially. But I don't 236; than sorry" principle makes me treat it specially. But I don't
259# look for particular offending word, but rather for the input 237; look for particular offending word, but rather for the input
260# block which crosses the boundary. Once found that block is aligned 238; block which crosses the boundary. Once found that block is aligned
261# and hashed separately... 239; and hashed separately...
262$code.=<<___;
263.align 4 240.align 4
264Lunaligned: 241Lunaligned:
265 subfic $t1,$inp,4096 242 subfic $t1,$inp,4096
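The strategy spelled out in the comment above is per block: rather than policing every load, the code checks whether the next 16*$SZ-byte block would cross a 4096-byte page boundary and, only when it does, copies that block into an aligned spot in the local frame and hashes the copy through a fictitious input pointer. A hedged C sketch of that decision; hash_data and hash_blocks are illustrative names, not real entry points (hash_blocks stands in for Lsha2_block_private):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void hash_data(const unsigned char *inp, size_t nblk, size_t blk,
                      void (*hash_blocks)(const unsigned char *, size_t))
{
    unsigned char copy[128];                /* big enough for 16*SZ, SZ<=8 */

    while (nblk) {
        size_t room = 4096 - ((uintptr_t)inp & 4095);
        if (room >= blk) {                  /* block stays inside the page */
            size_t n = room / blk;
            if (n > nblk) n = nblk;
            hash_blocks(inp, n);
            inp += n * blk; nblk -= n;
        } else {                            /* block crosses the boundary */
            memcpy(copy, inp, blk);
            hash_blocks(copy, 1);
            inp += blk; nblk--;
        }
    }
}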
@@ -278,7 +255,7 @@ Lunaligned:
278Lcross_page: 255Lcross_page:
279 li $t1,`16*$SZ/4` 256 li $t1,`16*$SZ/4`
280 mtctr $t1 257 mtctr $t1
281 addi r20,$sp,$FRAME ; aligned spot below the frame 258 addi r20,$sp,$LOCALS ; aligned spot below the frame
282Lmemcpy: 259Lmemcpy:
283 lbz r16,0($inp) 260 lbz r16,0($inp)
284 lbz r17,1($inp) 261 lbz r17,1($inp)
@@ -293,8 +270,8 @@ Lmemcpy:
293 bdnz Lmemcpy 270 bdnz Lmemcpy
294 271
295 $PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp 272 $PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp
296 addi $t1,$sp,`$FRAME+16*$SZ` ; fictitious end pointer 273 addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer
297 addi $inp,$sp,$FRAME ; fictitious inp pointer 274 addi $inp,$sp,$LOCALS ; fictitious inp pointer
298 $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num 275 $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num
299 $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer 276 $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer
300 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer 277 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
@@ -303,10 +280,36 @@ Lmemcpy:
303 $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num 280 $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num
304 addic. $num,$num,`-16*$SZ` ; num-- 281 addic. $num,$num,`-16*$SZ` ; num--
305 bne- Lunaligned 282 bne- Lunaligned
306 b Ldone
307___
308 283
309$code.=<<___; 284Ldone:
285 $POP r0,`$FRAME+$LRSAVE`($sp)
286 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
287 $POP r13,`$FRAME-$SIZE_T*19`($sp)
288 $POP r14,`$FRAME-$SIZE_T*18`($sp)
289 $POP r15,`$FRAME-$SIZE_T*17`($sp)
290 $POP r16,`$FRAME-$SIZE_T*16`($sp)
291 $POP r17,`$FRAME-$SIZE_T*15`($sp)
292 $POP r18,`$FRAME-$SIZE_T*14`($sp)
293 $POP r19,`$FRAME-$SIZE_T*13`($sp)
294 $POP r20,`$FRAME-$SIZE_T*12`($sp)
295 $POP r21,`$FRAME-$SIZE_T*11`($sp)
296 $POP r22,`$FRAME-$SIZE_T*10`($sp)
297 $POP r23,`$FRAME-$SIZE_T*9`($sp)
298 $POP r24,`$FRAME-$SIZE_T*8`($sp)
299 $POP r25,`$FRAME-$SIZE_T*7`($sp)
300 $POP r26,`$FRAME-$SIZE_T*6`($sp)
301 $POP r27,`$FRAME-$SIZE_T*5`($sp)
302 $POP r28,`$FRAME-$SIZE_T*4`($sp)
303 $POP r29,`$FRAME-$SIZE_T*3`($sp)
304 $POP r30,`$FRAME-$SIZE_T*2`($sp)
305 $POP r31,`$FRAME-$SIZE_T*1`($sp)
306 mtlr r0
307 addi $sp,$sp,$FRAME
308 blr
309 .long 0
310 .byte 0,12,4,1,0x80,18,3,0
311 .long 0
312
310.align 4 313.align 4
311Lsha2_block_private: 314Lsha2_block_private:
312___ 315___
@@ -372,6 +375,8 @@ $code.=<<___;
372 $ST $H,`7*$SZ`($ctx) 375 $ST $H,`7*$SZ`($ctx)
373 bne Lsha2_block_private 376 bne Lsha2_block_private
374 blr 377 blr
378 .long 0
379 .byte 0,12,0x14,0,0,0,0,0
375___ 380___
376 381
377# Ugly hack here, because PPC assembler syntax seem to vary too 382# Ugly hack here, because PPC assembler syntax seem to vary too
@@ -379,22 +384,15 @@ ___
379$code.=<<___; 384$code.=<<___;
380.align 6 385.align 6
381LPICmeup: 386LPICmeup:
382 bl LPIC 387 mflr r0
383 addi $Tbl,$Tbl,`64-4` ; "distance" between . and last nop 388 bcl 20,31,\$+4
384 b LPICedup 389 mflr $Tbl ; vvvvvv "distance" between . and 1st data entry
385 nop 390 addi $Tbl,$Tbl,`64-8`
386 nop 391 mtlr r0
387 nop
388 nop
389 nop
390LPIC: mflr $Tbl
391 blr 392 blr
392 nop 393 .long 0
393 nop 394 .byte 0,12,0x14,0,0,0,0,0
394 nop 395 .space `64-9*4`
395 nop
396 nop
397 nop
398___ 396___
399$code.=<<___ if ($SZ==8); 397$code.=<<___ if ($SZ==8);
400 .long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd 398 .long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
diff --git a/src/lib/libcrypto/sha/asm/sha512-s390x.pl b/src/lib/libcrypto/sha/asm/sha512-s390x.pl
index e7ef2d5a9f..079a3fc78a 100644
--- a/src/lib/libcrypto/sha/asm/sha512-s390x.pl
+++ b/src/lib/libcrypto/sha/asm/sha512-s390x.pl
@@ -26,6 +26,26 @@
26# favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster 26# favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
27# than software. 27# than software.
28 28
29# November 2010.
30#
31# Adapt for -m31 build. If kernel supports what's called "highgprs"
32# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
33# instructions and achieve "64-bit" performance even in 31-bit legacy
34# application context. The feature is not specific to any particular
35# processor, as long as it's a "z-CPU". The latter implies that the code
36# remains z/Architecture specific. On z900 SHA256 was measured to
37# perform 2.4x, and SHA512 13x, better than code generated by gcc 4.3.
38
39$flavour = shift;
40
41if ($flavour =~ /3[12]/) {
42 $SIZE_T=4;
43 $g="";
44} else {
45 $SIZE_T=8;
46 $g="g";
47}
48
29$t0="%r0"; 49$t0="%r0";
30$t1="%r1"; 50$t1="%r1";
31$ctx="%r2"; $t2="%r2"; 51$ctx="%r2"; $t2="%r2";
@@ -44,7 +64,7 @@ $tbl="%r13";
44$T1="%r14"; 64$T1="%r14";
45$sp="%r15"; 65$sp="%r15";
46 66
47$output=shift; 67while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
48open STDOUT,">$output"; 68open STDOUT,">$output";
49 69
50if ($output =~ /512/) { 70if ($output =~ /512/) {
@@ -78,7 +98,8 @@ if ($output =~ /512/) {
78} 98}
79$Func="sha${label}_block_data_order"; 99$Func="sha${label}_block_data_order";
80$Table="K${label}"; 100$Table="K${label}";
81$frame=160+16*$SZ; 101$stdframe=16*$SIZE_T+4*8;
102$frame=$stdframe+16*$SZ;
82 103
83sub BODY_00_15 { 104sub BODY_00_15 {
84my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 105my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
@@ -93,9 +114,9 @@ $code.=<<___;
93 xgr $t0,$t1 114 xgr $t0,$t1
94 $ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]` 115 $ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]`
95 xgr $t2,$g 116 xgr $t2,$g
96 $ST $T1,`160+$SZ*($i%16)`($sp) 117 $ST $T1,`$stdframe+$SZ*($i%16)`($sp)
97 xgr $t0,$t1 # Sigma1(e) 118 xgr $t0,$t1 # Sigma1(e)
98 la $T1,0($T1,$h) # T1+=h 119 algr $T1,$h # T1+=h
99 ngr $t2,$e 120 ngr $t2,$e
100 lgr $t1,$a 121 lgr $t1,$a
101 algr $T1,$t0 # T1+=Sigma1(e) 122 algr $T1,$t0 # T1+=Sigma1(e)
@@ -113,7 +134,7 @@ $code.=<<___;
113 ngr $t2,$b 134 ngr $t2,$b
114 algr $h,$T1 # h+=T1 135 algr $h,$T1 # h+=T1
115 ogr $t2,$t1 # Maj(a,b,c) 136 ogr $t2,$t1 # Maj(a,b,c)
116 la $d,0($d,$T1) # d+=T1 137 algr $d,$T1 # d+=T1
117 algr $h,$t2 # h+=Maj(a,b,c) 138 algr $h,$t2 # h+=Maj(a,b,c)
118___ 139___
119} 140}
@@ -122,19 +143,19 @@ sub BODY_16_XX {
122my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 143my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
123 144
124$code.=<<___; 145$code.=<<___;
125 $LD $T1,`160+$SZ*(($i+1)%16)`($sp) ### $i 146 $LD $T1,`$stdframe+$SZ*(($i+1)%16)`($sp) ### $i
126 $LD $t1,`160+$SZ*(($i+14)%16)`($sp) 147 $LD $t1,`$stdframe+$SZ*(($i+14)%16)`($sp)
127 $ROT $t0,$T1,$sigma0[0] 148 $ROT $t0,$T1,$sigma0[0]
128 $SHR $T1,$sigma0[2] 149 $SHR $T1,$sigma0[2]
129 $ROT $t2,$t0,`$sigma0[1]-$sigma0[0]` 150 $ROT $t2,$t0,`$sigma0[1]-$sigma0[0]`
130 xgr $T1,$t0 151 xgr $T1,$t0
131 $ROT $t0,$t1,$sigma1[0] 152 $ROT $t0,$t1,$sigma1[0]
132 xgr $T1,$t2 # sigma0(X[i+1]) 153 xgr $T1,$t2 # sigma0(X[i+1])
133 $SHR $t1,$sigma1[2] 154 $SHR $t1,$sigma1[2]
134 $ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i] 155 $ADD $T1,`$stdframe+$SZ*($i%16)`($sp) # +=X[i]
135 xgr $t1,$t0 156 xgr $t1,$t0
136 $ROT $t0,$t0,`$sigma1[1]-$sigma1[0]` 157 $ROT $t0,$t0,`$sigma1[1]-$sigma1[0]`
137 $ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9] 158 $ADD $T1,`$stdframe+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
138 xgr $t1,$t0 # sigma1(X[i+14]) 159 xgr $t1,$t0 # sigma1(X[i+14])
139 algr $T1,$t1 # +=sigma1(X[i+14]) 160 algr $T1,$t1 # +=sigma1(X[i+14])
140___ 161___
@@ -212,6 +233,7 @@ $code.=<<___;
212.globl $Func 233.globl $Func
213.type $Func,\@function 234.type $Func,\@function
214$Func: 235$Func:
236 sllg $len,$len,`log(16*$SZ)/log(2)`
215___ 237___
216$code.=<<___ if ($kimdfunc); 238$code.=<<___ if ($kimdfunc);
217 larl %r1,OPENSSL_s390xcap_P 239 larl %r1,OPENSSL_s390xcap_P
@@ -219,15 +241,15 @@ $code.=<<___ if ($kimdfunc);
219 tmhl %r0,0x4000 # check for message-security assist 241 tmhl %r0,0x4000 # check for message-security assist
220 jz .Lsoftware 242 jz .Lsoftware
221 lghi %r0,0 243 lghi %r0,0
222 la %r1,16($sp) 244 la %r1,`2*$SIZE_T`($sp)
223 .long 0xb93e0002 # kimd %r0,%r2 245 .long 0xb93e0002 # kimd %r0,%r2
224 lg %r0,16($sp) 246 lg %r0,`2*$SIZE_T`($sp)
225 tmhh %r0,`0x8000>>$kimdfunc` 247 tmhh %r0,`0x8000>>$kimdfunc`
226 jz .Lsoftware 248 jz .Lsoftware
227 lghi %r0,$kimdfunc 249 lghi %r0,$kimdfunc
228 lgr %r1,$ctx 250 lgr %r1,$ctx
229 lgr %r2,$inp 251 lgr %r2,$inp
230 sllg %r3,$len,`log(16*$SZ)/log(2)` 252 lgr %r3,$len
231 .long 0xb93e0002 # kimd %r0,%r2 253 .long 0xb93e0002 # kimd %r0,%r2
232 brc 1,.-4 # pay attention to "partial completion" 254 brc 1,.-4 # pay attention to "partial completion"
233 br %r14 255 br %r14
@@ -235,13 +257,12 @@ $code.=<<___ if ($kimdfunc);
235.Lsoftware: 257.Lsoftware:
236___ 258___
237$code.=<<___; 259$code.=<<___;
238 sllg $len,$len,`log(16*$SZ)/log(2)`
239 lghi %r1,-$frame 260 lghi %r1,-$frame
240 agr $len,$inp 261 la $len,0($len,$inp)
241 stmg $ctx,%r15,16($sp) 262 stm${g} $ctx,%r15,`2*$SIZE_T`($sp)
242 lgr %r0,$sp 263 lgr %r0,$sp
243 la $sp,0(%r1,$sp) 264 la $sp,0(%r1,$sp)
244 stg %r0,0($sp) 265 st${g} %r0,0($sp)
245 266
246 larl $tbl,$Table 267 larl $tbl,$Table
247 $LD $A,`0*$SZ`($ctx) 268 $LD $A,`0*$SZ`($ctx)
@@ -265,7 +286,7 @@ $code.=<<___;
265 clgr $len,$t0 286 clgr $len,$t0
266 jne .Lrounds_16_xx 287 jne .Lrounds_16_xx
267 288
268 lg $ctx,`$frame+16`($sp) 289 l${g} $ctx,`$frame+2*$SIZE_T`($sp)
269 la $inp,`16*$SZ`($inp) 290 la $inp,`16*$SZ`($inp)
270 $ADD $A,`0*$SZ`($ctx) 291 $ADD $A,`0*$SZ`($ctx)
271 $ADD $B,`1*$SZ`($ctx) 292 $ADD $B,`1*$SZ`($ctx)
@@ -283,14 +304,14 @@ $code.=<<___;
283 $ST $F,`5*$SZ`($ctx) 304 $ST $F,`5*$SZ`($ctx)
284 $ST $G,`6*$SZ`($ctx) 305 $ST $G,`6*$SZ`($ctx)
285 $ST $H,`7*$SZ`($ctx) 306 $ST $H,`7*$SZ`($ctx)
286 clg $inp,`$frame+32`($sp) 307 cl${g} $inp,`$frame+4*$SIZE_T`($sp)
287 jne .Lloop 308 jne .Lloop
288 309
289 lmg %r6,%r15,`$frame+48`($sp) 310 lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
290 br %r14 311 br %r14
291.size $Func,.-$Func 312.size $Func,.-$Func
292.string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" 313.string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
293.comm OPENSSL_s390xcap_P,8,8 314.comm OPENSSL_s390xcap_P,16,8
294___ 315___
295 316
296$code =~ s/\`([^\`]*)\`/eval $1/gem; 317$code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
index ec5d78135e..585740789e 100644
--- a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
+++ b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
@@ -305,9 +305,9 @@ $code.=<<___;
305 srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9] 305 srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
306 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) 306 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
307 srl @X[($i/2)%8],0,$tmp0 307 srl @X[($i/2)%8],0,$tmp0
308 add $tmp2,$tmp1,$tmp1
308 add $xi,$T1,$T1 ! +=X[i] 309 add $xi,$T1,$T1 ! +=X[i]
309 xor $tmp0,@X[($i/2)%8],@X[($i/2)%8] 310 xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
310 add $tmp2,$T1,$T1
311 add $tmp1,$T1,$T1 311 add $tmp1,$T1,$T1
312 312
313 srl $T1,0,$T1 313 srl $T1,0,$T1
@@ -318,9 +318,9 @@ ___
318$code.=<<___; 318$code.=<<___;
319 srlx @X[($i/2)%8],32,$tmp1 ! X[i] 319 srlx @X[($i/2)%8],32,$tmp1 ! X[i]
320 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) 320 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
321 srl @X[($i/2)%8],0,@X[($i/2)%8]
322 add $xi,$T1,$T1 ! +=X[i+9] 321 add $xi,$T1,$T1 ! +=X[i+9]
323 add $tmp2,$T1,$T1 322 add $tmp2,$tmp1,$tmp1
323 srl @X[($i/2)%8],0,@X[($i/2)%8]
324 add $tmp1,$T1,$T1 324 add $tmp1,$T1,$T1
325 325
326 sllx $T1,32,$tmp0 326 sllx $T1,32,$tmp0
diff --git a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl b/src/lib/libcrypto/sha/asm/sha512-x86_64.pl
index e6643f8cf6..f611a2d898 100755
--- a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl
+++ b/src/lib/libcrypto/sha/asm/sha512-x86_64.pl
@@ -95,50 +95,44 @@ sub ROUND_00_15()
95{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 95{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
96 96
97$code.=<<___; 97$code.=<<___;
98 mov $e,$a0 98 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
99 mov $e,$a1
100 mov $f,$a2 99 mov $f,$a2
100 mov $T1,`$SZ*($i&0xf)`(%rsp)
101 101
102 ror \$$Sigma1[0],$a0 102 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
103 ror \$$Sigma1[1],$a1 103 xor $e,$a0
104 xor $g,$a2 # f^g 104 xor $g,$a2 # f^g
105 105
106 xor $a1,$a0 106 ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
107 ror \$`$Sigma1[2]-$Sigma1[1]`,$a1 107 add $h,$T1 # T1+=h
108 xor $a,$a1
109
110 add ($Tbl,$round,$SZ),$T1 # T1+=K[round]
108 and $e,$a2 # (f^g)&e 111 and $e,$a2 # (f^g)&e
109 mov $T1,`$SZ*($i&0xf)`(%rsp) 112 mov $b,$h
110 113
111 xor $a1,$a0 # Sigma1(e) 114 ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
115 xor $e,$a0
112 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g 116 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
113 add $h,$T1 # T1+=h
114
115 mov $a,$h
116 add $a0,$T1 # T1+=Sigma1(e)
117 117
118 xor $c,$h # b^c
119 xor $a,$a1
118 add $a2,$T1 # T1+=Ch(e,f,g) 120 add $a2,$T1 # T1+=Ch(e,f,g)
119 mov $a,$a0 121 mov $b,$a2
120 mov $a,$a1
121 122
122 ror \$$Sigma0[0],$h 123 ror \$$Sigma1[0],$a0 # Sigma1(e)
123 ror \$$Sigma0[1],$a0 124 and $a,$h # h=(b^c)&a
124 mov $a,$a2 125 and $c,$a2 # b&c
125 add ($Tbl,$round,$SZ),$T1 # T1+=K[round]
126 126
127 xor $a0,$h 127 ror \$$Sigma0[0],$a1 # Sigma0(a)
128 ror \$`$Sigma0[2]-$Sigma0[1]`,$a0 128 add $a0,$T1 # T1+=Sigma1(e)
129 or $c,$a1 # a|c 129 add $a2,$h # h+=b&c (completes +=Maj(a,b,c)
130 130
131 xor $a0,$h # h=Sigma0(a)
132 and $c,$a2 # a&c
133 add $T1,$d # d+=T1 131 add $T1,$d # d+=T1
134
135 and $b,$a1 # (a|c)&b
136 add $T1,$h # h+=T1 132 add $T1,$h # h+=T1
137
138 or $a2,$a1 # Maj(a,b,c)=((a|c)&b)|(a&c)
139 lea 1($round),$round # round++ 133 lea 1($round),$round # round++
134 add $a1,$h # h+=Sigma0(a)
140 135
141 add $a1,$h # h+=Maj(a,b,c)
142___ 136___
143} 137}
144 138
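The comments in the rewritten round above ("b^c", "h=(b^c)&a", "h+=b&c (completes +=Maj(a,b,c))") rely on splitting Maj into two bit-disjoint terms, Maj(a,b,c) = ((b^c)&a) + (b&c): wherever b and c differ the second term is zero and the majority bit is a's, and wherever they agree the first term is zero and the majority bit is b&c, so the addition never carries. A small self-checking C sketch of the identity (names are illustrative):

#include <assert.h>
#include <stdint.h>

static uint32_t maj_split(uint32_t a, uint32_t b, uint32_t c)
{ return ((b ^ c) & a) + (b & c); }          /* as the new x86_64 code computes it */

static uint32_t maj_ref(uint32_t a, uint32_t b, uint32_t c)
{ return (a & b) ^ (a & c) ^ (b & c); }      /* textbook Maj */

int main(void)
{
    uint32_t v[] = { 0, 0xffffffffu, 0x12345678u, 0xdeadbeefu };
    for (int i = 0; i < 4; i++)
        for (int j = 0; j < 4; j++)
            for (int k = 0; k < 4; k++)
                assert(maj_split(v[i], v[j], v[k]) == maj_ref(v[i], v[j], v[k]));
    return 0;
}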
@@ -147,32 +141,30 @@ sub ROUND_16_XX()
147 141
148$code.=<<___; 142$code.=<<___;
149 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 143 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
150 mov `$SZ*(($i+14)&0xf)`(%rsp),$T1 144 mov `$SZ*(($i+14)&0xf)`(%rsp),$a1
151 145 mov $a0,$T1
152 mov $a0,$a2 146 mov $a1,$a2
153 147
148 ror \$`$sigma0[1]-$sigma0[0]`,$T1
149 xor $a0,$T1
154 shr \$$sigma0[2],$a0 150 shr \$$sigma0[2],$a0
155 ror \$$sigma0[0],$a2
156
157 xor $a2,$a0
158 ror \$`$sigma0[1]-$sigma0[0]`,$a2
159 151
160 xor $a2,$a0 # sigma0(X[(i+1)&0xf]) 152 ror \$$sigma0[0],$T1
161 mov $T1,$a1 153 xor $T1,$a0 # sigma0(X[(i+1)&0xf])
154 mov `$SZ*(($i+9)&0xf)`(%rsp),$T1
162 155
163 shr \$$sigma1[2],$T1 156 ror \$`$sigma1[1]-$sigma1[0]`,$a2
164 ror \$$sigma1[0],$a1 157 xor $a1,$a2
165 158 shr \$$sigma1[2],$a1
166 xor $a1,$T1
167 ror \$`$sigma1[1]-$sigma1[0]`,$a1
168
169 xor $a1,$T1 # sigma1(X[(i+14)&0xf])
170 159
160 ror \$$sigma1[0],$a2
171 add $a0,$T1 161 add $a0,$T1
172 162 xor $a2,$a1 # sigma1(X[(i+14)&0xf])
173 add `$SZ*(($i+9)&0xf)`(%rsp),$T1
174 163
175 add `$SZ*($i&0xf)`(%rsp),$T1 164 add `$SZ*($i&0xf)`(%rsp),$T1
165 mov $e,$a0
166 add $a1,$T1
167 mov $a,$a1
176___ 168___
177 &ROUND_00_15(@_); 169 &ROUND_00_15(@_);
178} 170}
@@ -219,6 +211,8 @@ $func:
219___ 211___
220 for($i=0;$i<16;$i++) { 212 for($i=0;$i<16;$i++) {
221 $code.=" mov $SZ*$i($inp),$T1\n"; 213 $code.=" mov $SZ*$i($inp),$T1\n";
214 $code.=" mov @ROT[4],$a0\n";
215 $code.=" mov @ROT[0],$a1\n";
222 $code.=" bswap $T1\n"; 216 $code.=" bswap $T1\n";
223 &ROUND_00_15($i,@ROT); 217 &ROUND_00_15($i,@ROT);
224 unshift(@ROT,pop(@ROT)); 218 unshift(@ROT,pop(@ROT));
diff --git a/src/lib/libcrypto/sha/sha.h b/src/lib/libcrypto/sha/sha.h
index 16cacf9fc0..8a6bf4bbbb 100644
--- a/src/lib/libcrypto/sha/sha.h
+++ b/src/lib/libcrypto/sha/sha.h
@@ -106,6 +106,9 @@ typedef struct SHAstate_st
106 } SHA_CTX; 106 } SHA_CTX;
107 107
108#ifndef OPENSSL_NO_SHA0 108#ifndef OPENSSL_NO_SHA0
109#ifdef OPENSSL_FIPS
110int private_SHA_Init(SHA_CTX *c);
111#endif
109int SHA_Init(SHA_CTX *c); 112int SHA_Init(SHA_CTX *c);
110int SHA_Update(SHA_CTX *c, const void *data, size_t len); 113int SHA_Update(SHA_CTX *c, const void *data, size_t len);
111int SHA_Final(unsigned char *md, SHA_CTX *c); 114int SHA_Final(unsigned char *md, SHA_CTX *c);
@@ -113,6 +116,9 @@ unsigned char *SHA(const unsigned char *d, size_t n, unsigned char *md);
113void SHA_Transform(SHA_CTX *c, const unsigned char *data); 116void SHA_Transform(SHA_CTX *c, const unsigned char *data);
114#endif 117#endif
115#ifndef OPENSSL_NO_SHA1 118#ifndef OPENSSL_NO_SHA1
119#ifdef OPENSSL_FIPS
120int private_SHA1_Init(SHA_CTX *c);
121#endif
116int SHA1_Init(SHA_CTX *c); 122int SHA1_Init(SHA_CTX *c);
117int SHA1_Update(SHA_CTX *c, const void *data, size_t len); 123int SHA1_Update(SHA_CTX *c, const void *data, size_t len);
118int SHA1_Final(unsigned char *md, SHA_CTX *c); 124int SHA1_Final(unsigned char *md, SHA_CTX *c);
@@ -135,6 +141,10 @@ typedef struct SHA256state_st
135 } SHA256_CTX; 141 } SHA256_CTX;
136 142
137#ifndef OPENSSL_NO_SHA256 143#ifndef OPENSSL_NO_SHA256
144#ifdef OPENSSL_FIPS
145int private_SHA224_Init(SHA256_CTX *c);
146int private_SHA256_Init(SHA256_CTX *c);
147#endif
138int SHA224_Init(SHA256_CTX *c); 148int SHA224_Init(SHA256_CTX *c);
139int SHA224_Update(SHA256_CTX *c, const void *data, size_t len); 149int SHA224_Update(SHA256_CTX *c, const void *data, size_t len);
140int SHA224_Final(unsigned char *md, SHA256_CTX *c); 150int SHA224_Final(unsigned char *md, SHA256_CTX *c);
@@ -182,6 +192,10 @@ typedef struct SHA512state_st
182#endif 192#endif
183 193
184#ifndef OPENSSL_NO_SHA512 194#ifndef OPENSSL_NO_SHA512
195#ifdef OPENSSL_FIPS
196int private_SHA384_Init(SHA512_CTX *c);
197int private_SHA512_Init(SHA512_CTX *c);
198#endif
185int SHA384_Init(SHA512_CTX *c); 199int SHA384_Init(SHA512_CTX *c);
186int SHA384_Update(SHA512_CTX *c, const void *data, size_t len); 200int SHA384_Update(SHA512_CTX *c, const void *data, size_t len);
187int SHA384_Final(unsigned char *md, SHA512_CTX *c); 201int SHA384_Final(unsigned char *md, SHA512_CTX *c);
diff --git a/src/lib/libcrypto/sha/sha1dgst.c b/src/lib/libcrypto/sha/sha1dgst.c
index 50d1925cde..81219af088 100644
--- a/src/lib/libcrypto/sha/sha1dgst.c
+++ b/src/lib/libcrypto/sha/sha1dgst.c
@@ -57,6 +57,7 @@
57 */ 57 */
58 58
59#include <openssl/opensslconf.h> 59#include <openssl/opensslconf.h>
60#include <openssl/crypto.h>
60#if !defined(OPENSSL_NO_SHA1) && !defined(OPENSSL_NO_SHA) 61#if !defined(OPENSSL_NO_SHA1) && !defined(OPENSSL_NO_SHA)
61 62
62#undef SHA_0 63#undef SHA_0
diff --git a/src/lib/libcrypto/sha/sha256.c b/src/lib/libcrypto/sha/sha256.c
index 8952d87673..f88d3d6dad 100644
--- a/src/lib/libcrypto/sha/sha256.c
+++ b/src/lib/libcrypto/sha/sha256.c
@@ -16,7 +16,7 @@
16 16
17const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT; 17const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT;
18 18
19int SHA224_Init (SHA256_CTX *c) 19fips_md_init_ctx(SHA224, SHA256)
20 { 20 {
21 memset (c,0,sizeof(*c)); 21 memset (c,0,sizeof(*c));
22 c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL; 22 c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL;
@@ -27,7 +27,7 @@ int SHA224_Init (SHA256_CTX *c)
27 return 1; 27 return 1;
28 } 28 }
29 29
30int SHA256_Init (SHA256_CTX *c) 30fips_md_init(SHA256)
31 { 31 {
32 memset (c,0,sizeof(*c)); 32 memset (c,0,sizeof(*c));
33 c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL; 33 c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL;
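The fips_md_init()/fips_md_init_ctx() macros used in the hunks above come from <openssl/crypto.h> (hence the new include added to sha1dgst.c) and pair up with the private_*_Init() prototypes added to sha.h: in a FIPS-capable build the public init function becomes a guard that defers to the private one. A hedged sketch of roughly how such a macro pair expands; the exact definition lives in crypto.h and may differ in detail:

/* approximate shape only, not the verbatim OpenSSL definition */
#ifdef OPENSSL_FIPS
# define fips_md_init_ctx(alg, cx)                                      \
    int alg##_Init(cx##_CTX *c)                                         \
    {                                                                   \
        if (FIPS_mode())                                                \
            OpenSSLDie(__FILE__, __LINE__,                              \
                "Low level API call to digest " #alg " forbidden in FIPS mode!"); \
        return private_##alg##_Init(c);                                 \
    }                                                                   \
    int private_##alg##_Init(cx##_CTX *c)
#else
# define fips_md_init_ctx(alg, cx)                                      \
    int alg##_Init(cx##_CTX *c)
#endif
#define fips_md_init(alg) fips_md_init_ctx(alg, alg)

So "fips_md_init_ctx(SHA224, SHA256) { ... }" defines SHA224_Init(), and under OPENSSL_FIPS the body becomes private_SHA224_Init() while SHA224_Init() is reduced to the FIPS guard.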
diff --git a/src/lib/libcrypto/sha/sha512.c b/src/lib/libcrypto/sha/sha512.c
index cbc0e58c48..50dd7dc744 100644
--- a/src/lib/libcrypto/sha/sha512.c
+++ b/src/lib/libcrypto/sha/sha512.c
@@ -59,21 +59,8 @@ const char SHA512_version[]="SHA-512" OPENSSL_VERSION_PTEXT;
59#define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA 59#define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
60#endif 60#endif
61 61
62int SHA384_Init (SHA512_CTX *c) 62fips_md_init_ctx(SHA384, SHA512)
63 { 63 {
64#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
65 /* maintain dword order required by assembler module */
66 unsigned int *h = (unsigned int *)c->h;
67
68 h[0] = 0xcbbb9d5d; h[1] = 0xc1059ed8;
69 h[2] = 0x629a292a; h[3] = 0x367cd507;
70 h[4] = 0x9159015a; h[5] = 0x3070dd17;
71 h[6] = 0x152fecd8; h[7] = 0xf70e5939;
72 h[8] = 0x67332667; h[9] = 0xffc00b31;
73 h[10] = 0x8eb44a87; h[11] = 0x68581511;
74 h[12] = 0xdb0c2e0d; h[13] = 0x64f98fa7;
75 h[14] = 0x47b5481d; h[15] = 0xbefa4fa4;
76#else
77 c->h[0]=U64(0xcbbb9d5dc1059ed8); 64 c->h[0]=U64(0xcbbb9d5dc1059ed8);
78 c->h[1]=U64(0x629a292a367cd507); 65 c->h[1]=U64(0x629a292a367cd507);
79 c->h[2]=U64(0x9159015a3070dd17); 66 c->h[2]=U64(0x9159015a3070dd17);
@@ -82,27 +69,14 @@ int SHA384_Init (SHA512_CTX *c)
82 c->h[5]=U64(0x8eb44a8768581511); 69 c->h[5]=U64(0x8eb44a8768581511);
83 c->h[6]=U64(0xdb0c2e0d64f98fa7); 70 c->h[6]=U64(0xdb0c2e0d64f98fa7);
84 c->h[7]=U64(0x47b5481dbefa4fa4); 71 c->h[7]=U64(0x47b5481dbefa4fa4);
85#endif 72
86 c->Nl=0; c->Nh=0; 73 c->Nl=0; c->Nh=0;
87 c->num=0; c->md_len=SHA384_DIGEST_LENGTH; 74 c->num=0; c->md_len=SHA384_DIGEST_LENGTH;
88 return 1; 75 return 1;
89 } 76 }
90 77
91int SHA512_Init (SHA512_CTX *c) 78fips_md_init(SHA512)
92 { 79 {
93#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
94 /* maintain dword order required by assembler module */
95 unsigned int *h = (unsigned int *)c->h;
96
97 h[0] = 0x6a09e667; h[1] = 0xf3bcc908;
98 h[2] = 0xbb67ae85; h[3] = 0x84caa73b;
99 h[4] = 0x3c6ef372; h[5] = 0xfe94f82b;
100 h[6] = 0xa54ff53a; h[7] = 0x5f1d36f1;
101 h[8] = 0x510e527f; h[9] = 0xade682d1;
102 h[10] = 0x9b05688c; h[11] = 0x2b3e6c1f;
103 h[12] = 0x1f83d9ab; h[13] = 0xfb41bd6b;
104 h[14] = 0x5be0cd19; h[15] = 0x137e2179;
105#else
106 c->h[0]=U64(0x6a09e667f3bcc908); 80 c->h[0]=U64(0x6a09e667f3bcc908);
107 c->h[1]=U64(0xbb67ae8584caa73b); 81 c->h[1]=U64(0xbb67ae8584caa73b);
108 c->h[2]=U64(0x3c6ef372fe94f82b); 82 c->h[2]=U64(0x3c6ef372fe94f82b);
@@ -111,7 +85,7 @@ int SHA512_Init (SHA512_CTX *c)
111 c->h[5]=U64(0x9b05688c2b3e6c1f); 85 c->h[5]=U64(0x9b05688c2b3e6c1f);
112 c->h[6]=U64(0x1f83d9abfb41bd6b); 86 c->h[6]=U64(0x1f83d9abfb41bd6b);
113 c->h[7]=U64(0x5be0cd19137e2179); 87 c->h[7]=U64(0x5be0cd19137e2179);
114#endif 88
115 c->Nl=0; c->Nh=0; 89 c->Nl=0; c->Nh=0;
116 c->num=0; c->md_len=SHA512_DIGEST_LENGTH; 90 c->num=0; c->md_len=SHA512_DIGEST_LENGTH;
117 return 1; 91 return 1;
@@ -160,24 +134,6 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
160 134
161 if (md==0) return 0; 135 if (md==0) return 0;
162 136
163#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
164 /* recall assembler dword order... */
165 n = c->md_len;
166 if (n == SHA384_DIGEST_LENGTH || n == SHA512_DIGEST_LENGTH)
167 {
168 unsigned int *h = (unsigned int *)c->h, t;
169
170 for (n/=4;n;n--)
171 {
172 t = *(h++);
173 *(md++) = (unsigned char)(t>>24);
174 *(md++) = (unsigned char)(t>>16);
175 *(md++) = (unsigned char)(t>>8);
176 *(md++) = (unsigned char)(t);
177 }
178 }
179 else return 0;
180#else
181 switch (c->md_len) 137 switch (c->md_len)
182 { 138 {
183 /* Let compiler decide if it's appropriate to unroll... */ 139 /* Let compiler decide if it's appropriate to unroll... */
@@ -214,7 +170,7 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
214 /* ... as well as make sure md_len is not abused. */ 170 /* ... as well as make sure md_len is not abused. */
215 default: return 0; 171 default: return 0;
216 } 172 }
217#endif 173
218 return 1; 174 return 1;
219 } 175 }
220 176
diff --git a/src/lib/libcrypto/sha/sha_locl.h b/src/lib/libcrypto/sha/sha_locl.h
index 672c26eee1..7a0c3ca8d8 100644
--- a/src/lib/libcrypto/sha/sha_locl.h
+++ b/src/lib/libcrypto/sha/sha_locl.h
@@ -122,7 +122,11 @@ void sha1_block_data_order (SHA_CTX *c, const void *p,size_t num);
122#define INIT_DATA_h3 0x10325476UL 122#define INIT_DATA_h3 0x10325476UL
123#define INIT_DATA_h4 0xc3d2e1f0UL 123#define INIT_DATA_h4 0xc3d2e1f0UL
124 124
125int HASH_INIT (SHA_CTX *c) 125#ifdef SHA_0
126fips_md_init(SHA)
127#else
128fips_md_init_ctx(SHA1, SHA)
129#endif
126 { 130 {
127 memset (c,0,sizeof(*c)); 131 memset (c,0,sizeof(*c));
128 c->h0=INIT_DATA_h0; 132 c->h0=INIT_DATA_h0;