path: root/src/lib/libcrypto/sha
author    cvs2svn <admin@example.com>  2012-07-13 17:49:55 +0000
committer cvs2svn <admin@example.com>  2012-07-13 17:49:55 +0000
commit    6fdb436ab2cd5b35066babb3a03be7ad0daf1ae2 (patch)
tree      a760cf389e7ea59961bb306a1f50bf5443205176 /src/lib/libcrypto/sha
parent    9204e59073bcf27e1487ec4ac46e981902ddd904 (diff)
This commit was manufactured by cvs2git to create tag 'OPENBSD_5_2_BASE'. (tag: OPENBSD_5_2_BASE)
Diffstat (limited to 'src/lib/libcrypto/sha')
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-586.pl          220
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-armv4-large.pl  228
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-ia64.pl         306
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha1-ppc.pl          319
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-s390x.pl        226
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-sparcv9.pl      284
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl     601
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-thumb.pl        259
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha1-x86_64.pl       351
-rw-r--r--  src/lib/libcrypto/sha/asm/sha256-586.pl        251
-rw-r--r--  src/lib/libcrypto/sha/asm/sha256-armv4.pl      186
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-586.pl        644
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-armv4.pl      403
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-ia64.pl       672
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-ppc.pl        462
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-s390x.pl      301
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-sparcv9.pl    594
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-x86_64.pl     456
-rw-r--r--  src/lib/libcrypto/sha/sha.h                    200
-rw-r--r--  src/lib/libcrypto/sha/sha1_one.c                78
-rw-r--r--  src/lib/libcrypto/sha/sha1dgst.c                74
-rw-r--r--  src/lib/libcrypto/sha/sha256.c                 282
-rw-r--r--  src/lib/libcrypto/sha/sha512.c                 641
-rw-r--r--  src/lib/libcrypto/sha/sha_locl.h               437
24 files changed, 0 insertions, 8475 deletions
diff --git a/src/lib/libcrypto/sha/asm/sha1-586.pl b/src/lib/libcrypto/sha/asm/sha1-586.pl
deleted file mode 100644
index a1f876281a..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-586.pl
+++ /dev/null
@@ -1,220 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# "[Re]written" was achieved in two major overhauls. In 2004 BODY_*
11# functions were re-implemented to address a P4 performance issue [see
12# commentary below], and in 2006 the rest was rewritten in order to
13# gain freedom to liberate licensing terms.
14
15# It was noted that the Intel IA-32 C compiler generates code which
16# performs ~30% *faster* on P4 CPU than the original *hand-coded*
17# SHA1 assembler implementation. To address this problem (and
18# prove that humans are still better than machines:-), the
19# original code was overhauled, which resulted in the following
20# performance changes:
21#
22# compared with original compared with Intel cc
23# assembler impl. generated code
24# Pentium -16% +48%
25# PIII/AMD +8% +16%
26# P4 +85%(!) +45%
27#
28# As you can see, Pentium came out the loser:-( Yet I reckoned that the
29# improvement on P4 outweighs the loss, and incorporated this
30# re-tuned code into 0.9.7 and later.
31# ----------------------------------------------------------------
32# <appro@fy.chalmers.se>
33
34$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
35push(@INC,"${dir}","${dir}../../perlasm");
36require "x86asm.pl";
37
38&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");
39
40$A="eax";
41$B="ebx";
42$C="ecx";
43$D="edx";
44$E="edi";
45$T="esi";
46$tmp1="ebp";
47
48@V=($A,$B,$C,$D,$E,$T);
49
50sub BODY_00_15
51 {
52 local($n,$a,$b,$c,$d,$e,$f)=@_;
53
54 &comment("00_15 $n");
55
56 &mov($f,$c); # f to hold F_00_19(b,c,d)
57 if ($n==0) { &mov($tmp1,$a); }
58 else { &mov($a,$tmp1); }
59 &rotl($tmp1,5); # tmp1=ROTATE(a,5)
60 &xor($f,$d);
61 &add($tmp1,$e); # tmp1+=e;
62 &and($f,$b);
63 &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded
64 # with xi, also note that e becomes
65 # f in next round...
66 &xor($f,$d); # f holds F_00_19(b,c,d)
67 &rotr($b,2); # b=ROTATE(b,30)
68 &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi
69
70 if ($n==15) { &add($f,$tmp1); } # f+=tmp1
71 else { &add($tmp1,$f); } # f becomes a in next round
72 }
73
74sub BODY_16_19
75 {
76 local($n,$a,$b,$c,$d,$e,$f)=@_;
77
78 &comment("16_19 $n");
79
80 &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
81 &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d)
82 &xor($f,&swtmp(($n+2)%16));
83 &xor($tmp1,$d);
84 &xor($f,&swtmp(($n+8)%16));
85 &and($tmp1,$b); # tmp1 holds F_00_19(b,c,d)
86 &rotr($b,2); # b=ROTATE(b,30)
87 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
88 &rotl($f,1); # f=ROTATE(f,1)
89 &xor($tmp1,$d); # tmp1=F_00_19(b,c,d)
90 &mov(&swtmp($n%16),$f); # xi=f
91 &lea($f,&DWP(0x5a827999,$f,$e));# f+=K_00_19+e
92 &mov($e,$a); # e becomes volatile
93 &rotl($e,5); # e=ROTATE(a,5)
94 &add($f,$tmp1); # f+=F_00_19(b,c,d)
95 &add($f,$e); # f+=ROTATE(a,5)
96 }
97
98sub BODY_20_39
99 {
100 local($n,$a,$b,$c,$d,$e,$f)=@_;
101 local $K=($n<40)?0x6ed9eba1:0xca62c1d6;
102
103 &comment("20_39 $n");
104
105 &mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d)
106 &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
107 &rotr($b,2); # b=ROTATE(b,30)
108 &xor($f,&swtmp(($n+2)%16));
109 &xor($tmp1,$c);
110 &xor($f,&swtmp(($n+8)%16));
111 &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d)
112 &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
113 &rotl($f,1); # f=ROTATE(f,1)
114 &add($tmp1,$e);
115 &mov(&swtmp($n%16),$f); # xi=f
116 &mov($e,$a); # e becomes volatile
117 &rotl($e,5); # e=ROTATE(a,5)
118 &lea($f,&DWP($K,$f,$tmp1)); # f+=K_20_39+e
119 &add($f,$e); # f+=ROTATE(a,5)
120 }
121
122sub BODY_40_59
123 {
124 local($n,$a,$b,$c,$d,$e,$f)=@_;
125
126 &comment("40_59 $n");
127
128 &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
129 &mov($tmp1,&swtmp(($n+2)%16));
130 &xor($f,$tmp1);
131 &mov($tmp1,&swtmp(($n+8)%16));
132 &xor($f,$tmp1);
133 &mov($tmp1,&swtmp(($n+13)%16));
134 &xor($f,$tmp1); # f holds xa^xb^xc^xd
135 &mov($tmp1,$b); # tmp1 to hold F_40_59(b,c,d)
136 &rotl($f,1); # f=ROTATE(f,1)
137 &or($tmp1,$c);
138 &mov(&swtmp($n%16),$f); # xi=f
139 &and($tmp1,$d);
140 &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e
141 &mov($e,$b); # e becomes volatile and is used
142 # to calculate F_40_59(b,c,d)
143 &rotr($b,2); # b=ROTATE(b,30)
144 &and($e,$c);
145 &or($tmp1,$e); # tmp1 holds F_40_59(b,c,d)
146 &mov($e,$a);
147 &rotl($e,5); # e=ROTATE(a,5)
148 &add($f,$tmp1); # f+=tmp1;
149 &add($f,$e); # f+=ROTATE(a,5)
150 }
151
152&function_begin("sha1_block_data_order");
153 &mov($tmp1,&wparam(0)); # SHA_CTX *c
154 &mov($T,&wparam(1)); # const void *input
155 &mov($A,&wparam(2)); # size_t num
156 &stack_push(16); # allocate X[16]
157 &shl($A,6);
158 &add($A,$T);
159 &mov(&wparam(2),$A); # pointer beyond the end of input
160 &mov($E,&DWP(16,$tmp1));# pre-load E
161
162 &set_label("loop",16);
163
164 # copy input chunk to X, but reversing byte order!
165 for ($i=0; $i<16; $i+=4)
166 {
167 &mov($A,&DWP(4*($i+0),$T));
168 &mov($B,&DWP(4*($i+1),$T));
169 &mov($C,&DWP(4*($i+2),$T));
170 &mov($D,&DWP(4*($i+3),$T));
171 &bswap($A);
172 &bswap($B);
173 &bswap($C);
174 &bswap($D);
175 &mov(&swtmp($i+0),$A);
176 &mov(&swtmp($i+1),$B);
177 &mov(&swtmp($i+2),$C);
178 &mov(&swtmp($i+3),$D);
179 }
180 &mov(&wparam(1),$T); # redundant in 1st spin
181
182 &mov($A,&DWP(0,$tmp1)); # load SHA_CTX
183 &mov($B,&DWP(4,$tmp1));
184 &mov($C,&DWP(8,$tmp1));
185 &mov($D,&DWP(12,$tmp1));
186 # E is pre-loaded
187
188 for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
189 for(;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
190 for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
191 for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
192 for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
193
194 (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check
195
196 &mov($tmp1,&wparam(0)); # re-load SHA_CTX*
197 &mov($D,&wparam(1)); # D is last "T" and is discarded
198
199 &add($E,&DWP(0,$tmp1)); # E is last "A"...
200 &add($T,&DWP(4,$tmp1));
201 &add($A,&DWP(8,$tmp1));
202 &add($B,&DWP(12,$tmp1));
203 &add($C,&DWP(16,$tmp1));
204
205 &mov(&DWP(0,$tmp1),$E); # update SHA_CTX
206 &add($D,64); # advance input pointer
207 &mov(&DWP(4,$tmp1),$T);
208 &cmp($D,&wparam(2)); # have we reached the end yet?
209 &mov(&DWP(8,$tmp1),$A);
210 &mov($E,$C); # C is last "E" which needs to be "pre-loaded"
211 &mov(&DWP(12,$tmp1),$B);
212 &mov($T,$D); # input pointer
213 &mov(&DWP(16,$tmp1),$C);
214 &jb(&label("loop"));
215
216 &stack_pop(16);
217&function_end("sha1_block_data_order");
218&asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
219
220&asm_finish();
diff --git a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
deleted file mode 100644
index 6e65fe3e01..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
+++ /dev/null
@@ -1,228 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# sha1_block procedure for ARMv4.
11#
12# January 2007.
13
14# Size/performance trade-off
15# ====================================================================
16# impl size in bytes comp cycles[*] measured performance
17# ====================================================================
18# thumb 304 3212 4420
19# armv4-small 392/+29% 1958/+64% 2250/+96%
20# armv4-compact 740/+89% 1552/+26% 1840/+22%
21# armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
22# full unroll ~5100/+260% ~1260/+4% ~1300/+5%
23# ====================================================================
24# thumb = same as 'small' but in Thumb instructions[**] and
25# with recurring code in two private functions;
26# small = detached Xload/update, loops are folded;
27# compact = detached Xload/update, 5x unroll;
28# large = interleaved Xload/update, 5x unroll;
29# full unroll = interleaved Xload/update, full unroll, estimated[!];
30#
31# [*] Manually counted instructions in "grand" loop body. Measured
32# performance is affected by prologue and epilogue overhead,
33# i-cache availability, branch penalties, etc.
34# [**] While each Thumb instruction is half the size, they are not as
35# diverse as ARM ones: e.g., there are only two arithmetic
36# instructions with 3 arguments, no [fixed] rotate, and addressing
37# modes are limited. As a result it takes more instructions to do
38# the same job in Thumb, so the code is never half the size and is
39# always slower.
40# [***] which is also ~35% better than compiler-generated code. A
41# dual-issue Cortex A8 core was measured to process an input block
42# in ~990 cycles.
43
44# August 2010.
45#
46# Rescheduling for dual-issue pipeline resulted in 13% improvement on
47# Cortex A8 core and in absolute terms ~870 cycles per input block
48# [or 13.6 cycles per byte].
49
50
51while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
52open STDOUT,">$output";
53
54$ctx="r0";
55$inp="r1";
56$len="r2";
57$a="r3";
58$b="r4";
59$c="r5";
60$d="r6";
61$e="r7";
62$K="r8";
63$t0="r9";
64$t1="r10";
65$t2="r11";
66$t3="r12";
67$Xi="r14";
68@V=($a,$b,$c,$d,$e);
69
70sub Xupdate {
71my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
72$code.=<<___;
73 ldr $t0,[$Xi,#15*4]
74 ldr $t1,[$Xi,#13*4]
75 ldr $t2,[$Xi,#7*4]
76 add $e,$K,$e,ror#2 @ E+=K_xx_xx
77 ldr $t3,[$Xi,#2*4]
78 eor $t0,$t0,$t1
79 eor $t2,$t2,$t3
80 eor $t1,$c,$d @ F_xx_xx
81 mov $t0,$t0,ror#31
82 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
83 eor $t0,$t0,$t2,ror#31
84 $opt1 @ F_xx_xx
85 $opt2 @ F_xx_xx
86 add $e,$e,$t0 @ E+=X[i]
87 str $t0,[$Xi,#-4]!
88___
89}
90
91sub BODY_00_15 {
92my ($a,$b,$c,$d,$e)=@_;
93$code.=<<___;
94 ldrb $t0,[$inp],#4
95 ldrb $t1,[$inp,#-1]
96 ldrb $t2,[$inp,#-2]
97 add $e,$K,$e,ror#2 @ E+=K_00_19
98 ldrb $t3,[$inp,#-3]
99 add $e,$e,$a,ror#27 @ E+=ROR(A,27)
100 orr $t0,$t1,$t0,lsl#24
101 eor $t1,$c,$d @ F_xx_xx
102 orr $t0,$t0,$t2,lsl#8
103 orr $t0,$t0,$t3,lsl#16
104 and $t1,$b,$t1,ror#2
105 add $e,$e,$t0 @ E+=X[i]
106 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
107 str $t0,[$Xi,#-4]!
108 add $e,$e,$t1 @ E+=F_00_19(B,C,D)
109___
110}
111
112sub BODY_16_19 {
113my ($a,$b,$c,$d,$e)=@_;
114 &Xupdate(@_,"and $t1,$b,$t1,ror#2");
115$code.=<<___;
116 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
117 add $e,$e,$t1 @ E+=F_00_19(B,C,D)
118___
119}
120
121sub BODY_20_39 {
122my ($a,$b,$c,$d,$e)=@_;
123 &Xupdate(@_,"eor $t1,$b,$t1,ror#2");
124$code.=<<___;
125 add $e,$e,$t1 @ E+=F_20_39(B,C,D)
126___
127}
128
129sub BODY_40_59 {
130my ($a,$b,$c,$d,$e)=@_;
131 &Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
132$code.=<<___;
133 add $e,$e,$t1 @ E+=F_40_59(B,C,D)
134 add $e,$e,$t2,ror#2
135___
136}
137
138$code=<<___;
139.text
140
141.global sha1_block_data_order
142.type sha1_block_data_order,%function
143
144.align 2
145sha1_block_data_order:
146 stmdb sp!,{r4-r12,lr}
147 add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
148 ldmia $ctx,{$a,$b,$c,$d,$e}
149.Lloop:
150 ldr $K,.LK_00_19
151 mov $Xi,sp
152 sub sp,sp,#15*4
153 mov $c,$c,ror#30
154 mov $d,$d,ror#30
155 mov $e,$e,ror#30 @ [6]
156.L_00_15:
157___
158for($i=0;$i<5;$i++) {
159 &BODY_00_15(@V); unshift(@V,pop(@V));
160}
161$code.=<<___;
162 teq $Xi,sp
163 bne .L_00_15 @ [((11+4)*5+2)*3]
164___
165 &BODY_00_15(@V); unshift(@V,pop(@V));
166 &BODY_16_19(@V); unshift(@V,pop(@V));
167 &BODY_16_19(@V); unshift(@V,pop(@V));
168 &BODY_16_19(@V); unshift(@V,pop(@V));
169 &BODY_16_19(@V); unshift(@V,pop(@V));
170$code.=<<___;
171
172 ldr $K,.LK_20_39 @ [+15+16*4]
173 sub sp,sp,#25*4
174 cmn sp,#0 @ [+3], clear carry to denote 20_39
175.L_20_39_or_60_79:
176___
177for($i=0;$i<5;$i++) {
178 &BODY_20_39(@V); unshift(@V,pop(@V));
179}
180$code.=<<___;
181 teq $Xi,sp @ preserve carry
182 bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
183 bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
184
185 ldr $K,.LK_40_59
186 sub sp,sp,#20*4 @ [+2]
187.L_40_59:
188___
189for($i=0;$i<5;$i++) {
190 &BODY_40_59(@V); unshift(@V,pop(@V));
191}
192$code.=<<___;
193 teq $Xi,sp
194 bne .L_40_59 @ [+((12+5)*5+2)*4]
195
196 ldr $K,.LK_60_79
197 sub sp,sp,#20*4
198 cmp sp,#0 @ set carry to denote 60_79
199 b .L_20_39_or_60_79 @ [+4], spare 300 bytes
200.L_done:
201 add sp,sp,#80*4 @ "deallocate" stack frame
202 ldmia $ctx,{$K,$t0,$t1,$t2,$t3}
203 add $a,$K,$a
204 add $b,$t0,$b
205 add $c,$t1,$c,ror#2
206 add $d,$t2,$d,ror#2
207 add $e,$t3,$e,ror#2
208 stmia $ctx,{$a,$b,$c,$d,$e}
209 teq $inp,$len
210 bne .Lloop @ [+18], total 1307
211
212 ldmia sp!,{r4-r12,lr}
213 tst lr,#1
214 moveq pc,lr @ be binary compatible with V4, yet
215 bx lr @ interoperable with Thumb ISA:-)
216.align 2
217.LK_00_19: .word 0x5a827999
218.LK_20_39: .word 0x6ed9eba1
219.LK_40_59: .word 0x8f1bbcdc
220.LK_60_79: .word 0xca62c1d6
221.size sha1_block_data_order,.-sha1_block_data_order
222.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
223.align 2
224___
225
226$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
227print $code;
228close STDOUT; # enforce flush
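
Note how BODY_00_15 above assembles each big-endian input word from four ldrb byte loads merged with orr/lsl, which is what lets the routine accept unaligned input on ARMv4. An equivalent sketch in Perl (illustrative only, not part of the deleted module):

# Assemble a big-endian 32-bit word from four unaligned input bytes,
# as the ldrb/orr sequence in BODY_00_15 does.
sub load_be32 {
    my ($buf, $off) = @_;
    my @b = unpack "C4", substr($buf, $off, 4);  # b[0] is the MSB
    return ($b[0] << 24) | ($b[1] << 16) | ($b[2] << 8) | $b[3];
}
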
diff --git a/src/lib/libcrypto/sha/asm/sha1-ia64.pl b/src/lib/libcrypto/sha/asm/sha1-ia64.pl
deleted file mode 100644
index 51c4f47ecb..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-ia64.pl
+++ /dev/null
@@ -1,306 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# The eternal question is: what's wrong with compiler-generated code?
11# The trick is that it's possible to reduce the number of shifts needed
12# to perform rotations by maintaining a copy of the 32-bit value in the
13# upper bits of a 64-bit register. Just follow the mux2 and shrp instructions...
14# Performance under big-endian OS such as HP-UX is 179MBps*1GHz, which
15# is >50% better than HP C and >2x better than gcc.
16
17$code=<<___;
18.ident \"sha1-ia64.s, version 1.2\"
19.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
20.explicit
21
22___
23
24
25if ($^O eq "hpux") {
26 $ADDP="addp4";
27 for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
28} else { $ADDP="add"; }
29for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
30 $big_endian=0 if (/\-DL_ENDIAN/); }
31if (!defined($big_endian))
32 { $big_endian=(unpack('L',pack('N',1))==1); }
33
34#$human=1;
35if ($human) { # useful for visual code auditing...
36 ($A,$B,$C,$D,$E,$T) = ("A","B","C","D","E","T");
37 ($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4");
38 ($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
39 ( "K_00_19","K_20_39","K_40_59","K_60_79" );
40 @X= ( "X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7",
41 "X8", "X9","X10","X11","X12","X13","X14","X15" );
42}
43else {
44 ($A,$B,$C,$D,$E,$T) = ("loc0","loc1","loc2","loc3","loc4","loc5");
45 ($h0,$h1,$h2,$h3,$h4) = ("loc6","loc7","loc8","loc9","loc10");
46 ($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
47 ( "r14", "r15", "loc11", "loc12" );
48 @X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
49 "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" );
50}
51
52sub BODY_00_15 {
53local *code=shift;
54local ($i,$a,$b,$c,$d,$e,$f)=@_;
55
56$code.=<<___ if ($i==0);
57{ .mmi; ld1 $X[$i&0xf]=[inp],2 // MSB
58 ld1 tmp2=[tmp3],2 };;
59{ .mmi; ld1 tmp0=[inp],2
60 ld1 tmp4=[tmp3],2 // LSB
61 dep $X[$i&0xf]=$X[$i&0xf],tmp2,8,8 };;
62___
63if ($i<15) {
64 $code.=<<___;
65{ .mmi; ld1 $X[($i+1)&0xf]=[inp],2 // +1
66 dep tmp1=tmp0,tmp4,8,8 };;
67{ .mmi; ld1 tmp2=[tmp3],2 // +1
68 and tmp4=$c,$b
69 dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;;
70{ .mmi; andcm tmp1=$d,$b
71 add tmp0=$e,$K_00_19
72 dep.z tmp5=$a,5,27 };; // a<<5
73{ .mmi; or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
74 add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19
75 extr.u tmp1=$a,27,5 };; // a>>27
76{ .mmi; ld1 tmp0=[inp],2 // +1
77 add $f=$f,tmp4 // f+=F_00_19(b,c,d)
78 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
79{ .mmi; ld1 tmp4=[tmp3],2 // +1
80 or tmp5=tmp1,tmp5 // ROTATE(a,5)
81 mux2 tmp6=$a,0x44 };; // see b in next iteration
82{ .mii; add $f=$f,tmp5 // f+=ROTATE(a,5)
83 dep $X[($i+1)&0xf]=$X[($i+1)&0xf],tmp2,8,8 // +1
84 mux2 $X[$i&0xf]=$X[$i&0xf],0x44 } //;;
85
86___
87 }
88else {
89 $code.=<<___;
90{ .mii; and tmp3=$c,$b
91 dep tmp1=tmp0,tmp4,8,8;;
92 dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;;
93{ .mmi; andcm tmp1=$d,$b
94 add tmp0=$e,$K_00_19
95 dep.z tmp5=$a,5,27 };; // a<<5
96{ .mmi; or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
97 add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19
98 extr.u tmp1=$a,27,5 } // a>>27
99{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1
100 xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
101 nop.i 0 };;
102{ .mmi; add $f=$f,tmp4 // f+=F_00_19(b,c,d)
103 xor tmp2=tmp2,tmp3 // +1
104 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
105{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
106 mux2 tmp6=$a,0x44 };; // see b in next iteration
107{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5)
108 shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
109 mux2 $X[$i&0xf]=$X[$i&0xf],0x44 };;
110
111___
112 }
113}
114
115sub BODY_16_19 {
116local *code=shift;
117local ($i,$a,$b,$c,$d,$e,$f)=@_;
118
119$code.=<<___;
120{ .mmi; mov $X[$i&0xf]=$f // Xupdate
121 and tmp0=$c,$b
122 dep.z tmp5=$a,5,27 } // a<<5
123{ .mmi; andcm tmp1=$d,$b
124 add tmp4=$e,$K_00_19 };;
125{ .mmi; or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
126 add $f=$f,tmp4 // f+=e+K_00_19
127 extr.u tmp1=$a,27,5 } // a>>27
128{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1
129 xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
130 nop.i 0 };;
131{ .mmi; add $f=$f,tmp0 // f+=F_00_19(b,c,d)
132 xor tmp2=tmp2,tmp3 // +1
133 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
134{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
135 mux2 tmp6=$a,0x44 };; // see b in next iteration
136{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5)
137 shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
138 nop.i 0 };;
139
140___
141}
142
143sub BODY_20_39 {
144local *code=shift;
145local ($i,$a,$b,$c,$d,$e,$f,$Konst)=@_;
146 $Konst = $K_20_39 if (!defined($Konst));
147
148if ($i<79) {
149$code.=<<___;
150{ .mib; mov $X[$i&0xf]=$f // Xupdate
151 dep.z tmp5=$a,5,27 } // a<<5
152{ .mib; xor tmp0=$c,$b
153 add tmp4=$e,$Konst };;
154{ .mmi; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
155 add $f=$f,tmp4 // f+=e+K_20_39
156 extr.u tmp1=$a,27,5 } // a>>27
157{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1
158 xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
159 nop.i 0 };;
160{ .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d)
161 xor tmp2=tmp2,tmp3 // +1
162 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
163{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
164 mux2 tmp6=$a,0x44 };; // see b in next iteration
165{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5)
166 shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
167 nop.i 0 };;
168
169___
170}
171else {
172$code.=<<___;
173{ .mib; mov $X[$i&0xf]=$f // Xupdate
174 dep.z tmp5=$a,5,27 } // a<<5
175{ .mib; xor tmp0=$c,$b
176 add tmp4=$e,$Konst };;
177{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
178 extr.u tmp1=$a,27,5 } // a>>27
179{ .mib; add $f=$f,tmp4 // f+=e+K_20_39
180 add $h1=$h1,$a };; // wrap up
181{ .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d)
182 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) ;;?
183{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
184 add $h3=$h3,$c };; // wrap up
185{ .mib; add tmp3=1,inp // used in unaligned codepath
186 add $f=$f,tmp1 } // f+=ROTATE(a,5)
187{ .mib; add $h2=$h2,$b // wrap up
188 add $h4=$h4,$d };; // wrap up
189
190___
191}
192}
193
194sub BODY_40_59 {
195local *code=shift;
196local ($i,$a,$b,$c,$d,$e,$f)=@_;
197
198$code.=<<___;
199{ .mmi; mov $X[$i&0xf]=$f // Xupdate
200 and tmp0=$c,$b
201 dep.z tmp5=$a,5,27 } // a<<5
202{ .mmi; and tmp1=$d,$b
203 add tmp4=$e,$K_40_59 };;
204{ .mmi; or tmp0=tmp0,tmp1 // (b&c)|(b&d)
205 add $f=$f,tmp4 // f+=e+K_40_59
206 extr.u tmp1=$a,27,5 } // a>>27
207{ .mmi; and tmp4=$c,$d
208 xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1
209 xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
210 };;
211{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
212 xor tmp2=tmp2,tmp3 // +1
213 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
214{ .mmi; or tmp0=tmp0,tmp4 // F_40_59(b,c,d)=(b&c)|(b&d)|(c&d)
215 mux2 tmp6=$a,0x44 };; // see b in next iteration
216{ .mii; add $f=$f,tmp0 // f+=F_40_59(b,c,d)
217 shrp $e=tmp2,tmp2,31;; // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
218 add $f=$f,tmp1 };; // f+=ROTATE(a,5)
219
220___
221}
222sub BODY_60_79 { &BODY_20_39(@_,$K_60_79); }
223
224$code.=<<___;
225.text
226
227tmp0=r8;
228tmp1=r9;
229tmp2=r10;
230tmp3=r11;
231ctx=r32; // in0
232inp=r33; // in1
233
234// void sha1_block_data_order(SHA_CTX *c,const void *p,size_t num);
235.global sha1_block_data_order#
236.proc sha1_block_data_order#
237.align 32
238sha1_block_data_order:
239 .prologue
240{ .mmi; alloc tmp1=ar.pfs,3,15,0,0
241 $ADDP tmp0=4,ctx
242 .save ar.lc,r3
243 mov r3=ar.lc }
244{ .mmi; $ADDP ctx=0,ctx
245 $ADDP inp=0,inp
246 mov r2=pr };;
247tmp4=in2;
248tmp5=loc13;
249tmp6=loc14;
250 .body
251{ .mlx; ld4 $h0=[ctx],8
252 movl $K_00_19=0x5a827999 }
253{ .mlx; ld4 $h1=[tmp0],8
254 movl $K_20_39=0x6ed9eba1 };;
255{ .mlx; ld4 $h2=[ctx],8
256 movl $K_40_59=0x8f1bbcdc }
257{ .mlx; ld4 $h3=[tmp0]
258 movl $K_60_79=0xca62c1d6 };;
259{ .mmi; ld4 $h4=[ctx],-16
260 add in2=-1,in2 // adjust num for ar.lc
261 mov ar.ec=1 };;
262{ .mmi; nop.m 0
263 add tmp3=1,inp
264 mov ar.lc=in2 };; // brp.loop.imp: too far
265
266.Ldtop:
267{ .mmi; mov $A=$h0
268 mov $B=$h1
269 mux2 tmp6=$h1,0x44 }
270{ .mmi; mov $C=$h2
271 mov $D=$h3
272 mov $E=$h4 };;
273
274___
275
276{ my $i,@V=($A,$B,$C,$D,$E,$T);
277
278 for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); }
279 for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); }
280 for(;$i<40;$i++) { &BODY_20_39(\$code,$i,@V); unshift(@V,pop(@V)); }
281 for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); }
282 for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); }
283
284 (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check
285}
286
287$code.=<<___;
288{ .mmb; add $h0=$h0,$E
289 nop.m 0
290 br.ctop.dptk.many .Ldtop };;
291.Ldend:
292{ .mmi; add tmp0=4,ctx
293 mov ar.lc=r3 };;
294{ .mmi; st4 [ctx]=$h0,8
295 st4 [tmp0]=$h1,8 };;
296{ .mmi; st4 [ctx]=$h2,8
297 st4 [tmp0]=$h3 };;
298{ .mib; st4 [ctx]=$h4,-16
299 mov pr=r2,0x1ffff
300 br.ret.sptk.many b0 };;
301.endp sha1_block_data_order#
302stringz "SHA1 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
303___
304
305$output=shift and open STDOUT,">$output";
306print $code;
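
The mux2/shrp pairing mentioned in the header comment is this module's rotation trick: mux2 ...,0x44 replicates a 32-bit value into both halves of a 64-bit register, after which any rotation is a single shrp funnel shift rather than two shifts and an or. A sketch of the idea (assuming 64-bit Perl; names are illustrative):

sub rotr32_via_pair {
    my ($x, $n) = @_;
    my $pair = ($x << 32) | $x;          # what mux2 ...,0x44 builds
    return ($pair >> $n) & 0xffffffff;   # what shrp extracts
}
# SHA-1's ROTATE(a,5) is then rotr32_via_pair($a, 27), and
# b=ROTATE(b,30) is the "shrp $b,tmp6,tmp6,2" seen above.
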
diff --git a/src/lib/libcrypto/sha/asm/sha1-ppc.pl b/src/lib/libcrypto/sha/asm/sha1-ppc.pl
deleted file mode 100755
index dcd0fcdfcf..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-ppc.pl
+++ /dev/null
@@ -1,319 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# I let hardware handle unaligned input(*), except on page boundaries
11# (see below for details). Otherwise straightforward implementation
12# with X vector in register bank. The module is big-endian [which is
13# no big deal, as there are no little-endian targets left around].
14#
15# (*) this means that this module is inappropriate for PPC403? Does
16# anybody know if pre-POWER3 can sustain unaligned load?
17
18# -m64 -m32
19# ----------------------------------
20# PPC970,gcc-4.0.0 +76% +59%
21# Power6,xlc-7 +68% +33%
22
23$flavour = shift;
24
25if ($flavour =~ /64/) {
26 $SIZE_T =8;
27 $UCMP ="cmpld";
28 $STU ="stdu";
29 $POP ="ld";
30 $PUSH ="std";
31} elsif ($flavour =~ /32/) {
32 $SIZE_T =4;
33 $UCMP ="cmplw";
34 $STU ="stwu";
35 $POP ="lwz";
36 $PUSH ="stw";
37} else { die "nonsense $flavour"; }
38
39$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
40( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
41( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
42die "can't locate ppc-xlate.pl";
43
44open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
45
46$FRAME=24*$SIZE_T;
47
48$K ="r0";
49$sp ="r1";
50$toc="r2";
51$ctx="r3";
52$inp="r4";
53$num="r5";
54$t0 ="r15";
55$t1 ="r6";
56
57$A ="r7";
58$B ="r8";
59$C ="r9";
60$D ="r10";
61$E ="r11";
62$T ="r12";
63
64@V=($A,$B,$C,$D,$E,$T);
65@X=("r16","r17","r18","r19","r20","r21","r22","r23",
66 "r24","r25","r26","r27","r28","r29","r30","r31");
67
68sub BODY_00_19 {
69my ($i,$a,$b,$c,$d,$e,$f)=@_;
70my $j=$i+1;
71$code.=<<___ if ($i==0);
72 lwz @X[$i],`$i*4`($inp)
73___
74$code.=<<___ if ($i<15);
75 lwz @X[$j],`$j*4`($inp)
76 add $f,$K,$e
77 rotlwi $e,$a,5
78 add $f,$f,@X[$i]
79 and $t0,$c,$b
80 add $f,$f,$e
81 andc $t1,$d,$b
82 rotlwi $b,$b,30
83 or $t0,$t0,$t1
84 add $f,$f,$t0
85___
86$code.=<<___ if ($i>=15);
87 add $f,$K,$e
88 rotlwi $e,$a,5
89 xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
90 add $f,$f,@X[$i%16]
91 and $t0,$c,$b
92 xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
93 add $f,$f,$e
94 andc $t1,$d,$b
95 rotlwi $b,$b,30
96 or $t0,$t0,$t1
97 xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
98 add $f,$f,$t0
99 rotlwi @X[$j%16],@X[$j%16],1
100___
101}
102
103sub BODY_20_39 {
104my ($i,$a,$b,$c,$d,$e,$f)=@_;
105my $j=$i+1;
106$code.=<<___ if ($i<79);
107 add $f,$K,$e
108 rotlwi $e,$a,5
109 xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
110 add $f,$f,@X[$i%16]
111 xor $t0,$b,$c
112 xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
113 add $f,$f,$e
114 rotlwi $b,$b,30
115 xor $t0,$t0,$d
116 xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
117 add $f,$f,$t0
118 rotlwi @X[$j%16],@X[$j%16],1
119___
120$code.=<<___ if ($i==79);
121 add $f,$K,$e
122 rotlwi $e,$a,5
123 lwz r16,0($ctx)
124 add $f,$f,@X[$i%16]
125 xor $t0,$b,$c
126 lwz r17,4($ctx)
127 add $f,$f,$e
128 rotlwi $b,$b,30
129 lwz r18,8($ctx)
130 xor $t0,$t0,$d
131 lwz r19,12($ctx)
132 add $f,$f,$t0
133 lwz r20,16($ctx)
134___
135}
136
137sub BODY_40_59 {
138my ($i,$a,$b,$c,$d,$e,$f)=@_;
139my $j=$i+1;
140$code.=<<___;
141 add $f,$K,$e
142 rotlwi $e,$a,5
143 xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
144 add $f,$f,@X[$i%16]
145 and $t0,$b,$c
146 xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
147 add $f,$f,$e
148 or $t1,$b,$c
149 rotlwi $b,$b,30
150 xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
151 and $t1,$t1,$d
152 or $t0,$t0,$t1
153 rotlwi @X[$j%16],@X[$j%16],1
154 add $f,$f,$t0
155___
156}
157
158$code=<<___;
159.machine "any"
160.text
161
162.globl .sha1_block_data_order
163.align 4
164.sha1_block_data_order:
165 mflr r0
166 $STU $sp,`-($FRAME+64)`($sp)
167 $PUSH r0,`$FRAME-$SIZE_T*18`($sp)
168 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
169 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
170 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
171 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
172 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
173 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
174 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
175 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
176 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
177 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
178 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
179 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
180 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
181 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
182 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
183 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
184 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
185 lwz $A,0($ctx)
186 lwz $B,4($ctx)
187 lwz $C,8($ctx)
188 lwz $D,12($ctx)
189 lwz $E,16($ctx)
190 andi. r0,$inp,3
191 bne Lunaligned
192Laligned:
193 mtctr $num
194 bl Lsha1_block_private
195Ldone:
196 $POP r0,`$FRAME-$SIZE_T*18`($sp)
197 $POP r15,`$FRAME-$SIZE_T*17`($sp)
198 $POP r16,`$FRAME-$SIZE_T*16`($sp)
199 $POP r17,`$FRAME-$SIZE_T*15`($sp)
200 $POP r18,`$FRAME-$SIZE_T*14`($sp)
201 $POP r19,`$FRAME-$SIZE_T*13`($sp)
202 $POP r20,`$FRAME-$SIZE_T*12`($sp)
203 $POP r21,`$FRAME-$SIZE_T*11`($sp)
204 $POP r22,`$FRAME-$SIZE_T*10`($sp)
205 $POP r23,`$FRAME-$SIZE_T*9`($sp)
206 $POP r24,`$FRAME-$SIZE_T*8`($sp)
207 $POP r25,`$FRAME-$SIZE_T*7`($sp)
208 $POP r26,`$FRAME-$SIZE_T*6`($sp)
209 $POP r27,`$FRAME-$SIZE_T*5`($sp)
210 $POP r28,`$FRAME-$SIZE_T*4`($sp)
211 $POP r29,`$FRAME-$SIZE_T*3`($sp)
212 $POP r30,`$FRAME-$SIZE_T*2`($sp)
213 $POP r31,`$FRAME-$SIZE_T*1`($sp)
214 mtlr r0
215 addi $sp,$sp,`$FRAME+64`
216 blr
217___
218
219# PowerPC specification allows an implementation to be ill-behaved
220# upon unaligned access which crosses a page boundary. The "better
221# safe than sorry" principle makes me treat it specially. But I don't
222# look for the particular offending word, rather for the 64-byte input
223# block which crosses the boundary. Once found, that block is aligned
224# and hashed separately...
225$code.=<<___;
226.align 4
227Lunaligned:
228 subfic $t1,$inp,4096
229 andi. $t1,$t1,4095 ; distance to closest page boundary
230 srwi. $t1,$t1,6 ; t1/=64
231 beq Lcross_page
232 $UCMP $num,$t1
233 ble- Laligned ; didn't cross the page boundary
234 mtctr $t1
235 subfc $num,$t1,$num
236 bl Lsha1_block_private
237Lcross_page:
238 li $t1,16
239 mtctr $t1
240 addi r20,$sp,$FRAME ; spot below the frame
241Lmemcpy:
242 lbz r16,0($inp)
243 lbz r17,1($inp)
244 lbz r18,2($inp)
245 lbz r19,3($inp)
246 addi $inp,$inp,4
247 stb r16,0(r20)
248 stb r17,1(r20)
249 stb r18,2(r20)
250 stb r19,3(r20)
251 addi r20,r20,4
252 bdnz Lmemcpy
253
254 $PUSH $inp,`$FRAME-$SIZE_T*19`($sp)
255 li $t1,1
256 addi $inp,$sp,$FRAME
257 mtctr $t1
258 bl Lsha1_block_private
259 $POP $inp,`$FRAME-$SIZE_T*19`($sp)
260 addic. $num,$num,-1
261 bne- Lunaligned
262 b Ldone
263___
264
265# This is private block function, which uses tailored calling
266# interface, namely upon entry SHA_CTX is pre-loaded to given
267# registers and counter register contains amount of chunks to
268# digest...
269$code.=<<___;
270.align 4
271Lsha1_block_private:
272___
273$code.=<<___; # load K_00_19
274 lis $K,0x5a82
275 ori $K,$K,0x7999
276___
277for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
278$code.=<<___; # load K_20_39
279 lis $K,0x6ed9
280 ori $K,$K,0xeba1
281___
282for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
283$code.=<<___; # load K_40_59
284 lis $K,0x8f1b
285 ori $K,$K,0xbcdc
286___
287for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
288$code.=<<___; # load K_60_79
289 lis $K,0xca62
290 ori $K,$K,0xc1d6
291___
292for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
293$code.=<<___;
294 add r16,r16,$E
295 add r17,r17,$T
296 add r18,r18,$A
297 add r19,r19,$B
298 add r20,r20,$C
299 stw r16,0($ctx)
300 mr $A,r16
301 stw r17,4($ctx)
302 mr $B,r17
303 stw r18,8($ctx)
304 mr $C,r18
305 stw r19,12($ctx)
306 mr $D,r19
307 stw r20,16($ctx)
308 mr $E,r20
309 addi $inp,$inp,`16*4`
310 bdnz- Lsha1_block_private
311 blr
312___
313$code.=<<___;
314.asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
315___
316
317$code =~ s/\`([^\`]*)\`/eval $1/gem;
318print $code;
319close STDOUT;
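
The Lunaligned path above never issues a load that could cross a page: it counts how many whole 64-byte blocks fit before the next 4096-byte boundary, hashes those in place, and copies the one block that straddles the boundary onto the stack (Lmemcpy) before hashing it. The subfic/andi./srwi. arithmetic as a Perl sketch (illustrative only; assumes 4096-byte pages, as the code does):

sub blocks_before_page_boundary {
    my ($inp) = @_;                      # numeric input address
    my $dist = (4096 - $inp) & 4095;     # subfic/andi.: bytes to the boundary
    return $dist >> 6;                   # srwi.: whole 64-byte blocks that fit
}
# A zero result means the very next block straddles the boundary, i.e.
# the "beq Lcross_page" case ($inp is known to be misaligned here).
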
diff --git a/src/lib/libcrypto/sha/asm/sha1-s390x.pl b/src/lib/libcrypto/sha/asm/sha1-s390x.pl
deleted file mode 100644
index 4b17848287..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-s390x.pl
+++ /dev/null
@@ -1,226 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA1 block procedure for s390x.
11
12# April 2007.
13#
14# Performance is >30% better than gcc 3.3 generated code. But the real
15# twist is that SHA1 hardware support is detected and utilized, in
16# which case performance improves by a further >4.5x for larger chunks.
17
18# January 2009.
19#
20# Optimize Xupdate for amount of memory references and reschedule
21# instructions to favour the dual-issue z10 pipeline. On z10, hardware
22# is "only" ~2.3x faster than software.
23
24$kimdfunc=1; # magic function code for kimd instruction
25
26$output=shift;
27open STDOUT,">$output";
28
29$K_00_39="%r0"; $K=$K_00_39;
30$K_40_79="%r1";
31$ctx="%r2"; $prefetch="%r2";
32$inp="%r3";
33$len="%r4";
34
35$A="%r5";
36$B="%r6";
37$C="%r7";
38$D="%r8";
39$E="%r9"; @V=($A,$B,$C,$D,$E);
40$t0="%r10";
41$t1="%r11";
42@X=("%r12","%r13","%r14");
43$sp="%r15";
44
45$frame=160+16*4;
46
47sub Xupdate {
48my $i=shift;
49
50$code.=<<___ if ($i==15);
51 lg $prefetch,160($sp) ### Xupdate(16) warm-up
52 lr $X[0],$X[2]
53___
54return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle
55$code.=<<___ if ($i<16);
56 lg $X[0],`$i*4`($inp) ### Xload($i)
57 rllg $X[1],$X[0],32
58___
59$code.=<<___ if ($i>=16);
60 xgr $X[0],$prefetch ### Xupdate($i)
61 lg $prefetch,`160+4*(($i+2)%16)`($sp)
62 xg $X[0],`160+4*(($i+8)%16)`($sp)
63 xgr $X[0],$prefetch
64 rll $X[0],$X[0],1
65 rllg $X[1],$X[0],32
66 rll $X[1],$X[1],1
67 rllg $X[0],$X[1],32
68 lr $X[2],$X[1] # feedback
69___
70$code.=<<___ if ($i<=70);
71 stg $X[0],`160+4*($i%16)`($sp)
72___
73unshift(@X,pop(@X));
74}
75
76sub BODY_00_19 {
77my ($i,$a,$b,$c,$d,$e)=@_;
78my $xi=$X[1];
79
80 &Xupdate($i);
81$code.=<<___;
82 alr $e,$K ### $i
83 rll $t1,$a,5
84 lr $t0,$d
85 xr $t0,$c
86 alr $e,$t1
87 nr $t0,$b
88 alr $e,$xi
89 xr $t0,$d
90 rll $b,$b,30
91 alr $e,$t0
92___
93}
94
95sub BODY_20_39 {
96my ($i,$a,$b,$c,$d,$e)=@_;
97my $xi=$X[1];
98
99 &Xupdate($i);
100$code.=<<___;
101 alr $e,$K ### $i
102 rll $t1,$a,5
103 lr $t0,$b
104 alr $e,$t1
105 xr $t0,$c
106 alr $e,$xi
107 xr $t0,$d
108 rll $b,$b,30
109 alr $e,$t0
110___
111}
112
113sub BODY_40_59 {
114my ($i,$a,$b,$c,$d,$e)=@_;
115my $xi=$X[1];
116
117 &Xupdate($i);
118$code.=<<___;
119 alr $e,$K ### $i
120 rll $t1,$a,5
121 lr $t0,$b
122 alr $e,$t1
123 or $t0,$c
124 lr $t1,$b
125 nr $t0,$d
126 nr $t1,$c
127 alr $e,$xi
128 or $t0,$t1
129 rll $b,$b,30
130 alr $e,$t0
131___
132}
133
134$code.=<<___;
135.text
136.align 64
137.type Ktable,\@object
138Ktable: .long 0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
139 .skip 48 #.long 0,0,0,0,0,0,0,0,0,0,0,0
140.size Ktable,.-Ktable
141.globl sha1_block_data_order
142.type sha1_block_data_order,\@function
143sha1_block_data_order:
144___
145$code.=<<___ if ($kimdfunc);
146 larl %r1,OPENSSL_s390xcap_P
147 lg %r0,0(%r1)
148 tmhl %r0,0x4000 # check for message-security assist
149 jz .Lsoftware
150 lghi %r0,0
151 la %r1,16($sp)
152 .long 0xb93e0002 # kimd %r0,%r2
153 lg %r0,16($sp)
154 tmhh %r0,`0x8000>>$kimdfunc`
155 jz .Lsoftware
156 lghi %r0,$kimdfunc
157 lgr %r1,$ctx
158 lgr %r2,$inp
159 sllg %r3,$len,6
160 .long 0xb93e0002 # kimd %r0,%r2
161 brc 1,.-4 # pay attention to "partial completion"
162 br %r14
163.align 16
164.Lsoftware:
165___
166$code.=<<___;
167 lghi %r1,-$frame
168 stg $ctx,16($sp)
169 stmg %r6,%r15,48($sp)
170 lgr %r0,$sp
171 la $sp,0(%r1,$sp)
172 stg %r0,0($sp)
173
174 larl $t0,Ktable
175 llgf $A,0($ctx)
176 llgf $B,4($ctx)
177 llgf $C,8($ctx)
178 llgf $D,12($ctx)
179 llgf $E,16($ctx)
180
181 lg $K_00_39,0($t0)
182 lg $K_40_79,8($t0)
183
184.Lloop:
185 rllg $K_00_39,$K_00_39,32
186___
187for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
188$code.=<<___;
189 rllg $K_00_39,$K_00_39,32
190___
191for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
192$code.=<<___; $K=$K_40_79;
193 rllg $K_40_79,$K_40_79,32
194___
195for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
196$code.=<<___;
197 rllg $K_40_79,$K_40_79,32
198___
199for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
200$code.=<<___;
201
202 lg $ctx,`$frame+16`($sp)
203 la $inp,64($inp)
204 al $A,0($ctx)
205 al $B,4($ctx)
206 al $C,8($ctx)
207 al $D,12($ctx)
208 al $E,16($ctx)
209 st $A,0($ctx)
210 st $B,4($ctx)
211 st $C,8($ctx)
212 st $D,12($ctx)
213 st $E,16($ctx)
214 brct $len,.Lloop
215
216 lmg %r6,%r15,`$frame+48`($sp)
217 br %r14
218.size sha1_block_data_order,.-sha1_block_data_order
219.string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
220.comm OPENSSL_s390xcap_P,8,8
221___
222
223$code =~ s/\`([^\`]*)\`/eval $1/gem;
224
225print $code;
226close STDOUT;
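
The Xupdate above keeps the 16 schedule words in a circular buffer on the stack; stripped of the two-at-a-time 64-bit register packing, it is the standard SHA-1 recurrence W[i] = ROTATE(W[i-3]^W[i-8]^W[i-14]^W[i-16],1), where W[i-3], W[i-8] and W[i-14] sit at offsets ($i+13)%16, ($i+8)%16 and ($i+2)%16 from W[i-16]. As a Perl sketch (illustrative only, assuming 64-bit Perl integers):

sub xupdate {
    my ($X, $i) = @_;                    # $X: ref to 16 32-bit words
    my $t = $X->[$i % 16] ^ $X->[($i + 2) % 16]
          ^ $X->[($i + 8) % 16] ^ $X->[($i + 13) % 16];
    $X->[$i % 16] = (($t << 1) | ($t >> 31)) & 0xffffffff;  # ROTATE(t,1)
    return $X->[$i % 16];
}
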
diff --git a/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl
deleted file mode 100644
index 5c161cecd6..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl
+++ /dev/null
@@ -1,284 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# Performance improvement is not really impressive on pre-T1 CPU: +8%
11# over Sun C and +25% over gcc [3.3]. While on T1, a.k.a. Niagara, it
12# turned out to be 40% faster than 64-bit code generated by Sun C 5.8 and
13# >2x faster than 64-bit code generated by gcc 3.4. And there is a gimmick.
14# X[16] vector is packed into 8 64-bit registers and as a result nothing
15# is spilled on stack. In addition, input data is loaded in a compact
16# instruction sequence, thus minimizing the window when the code is
17# subject to [inter-thread] cache-thrashing hazard. The goal is to
18# ensure scalability on UltraSPARC T1, or rather to avoid decay when
19# amount of active threads exceeds the number of physical cores.
20
21$bits=32;
22for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
23if ($bits==64) { $bias=2047; $frame=192; }
24else { $bias=0; $frame=112; }
25
26$output=shift;
27open STDOUT,">$output";
28
29@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
30$rot1m="%g2";
31$tmp64="%g3";
32$Xi="%g4";
33$A="%l0";
34$B="%l1";
35$C="%l2";
36$D="%l3";
37$E="%l4";
38@V=($A,$B,$C,$D,$E);
39$K_00_19="%l5";
40$K_20_39="%l6";
41$K_40_59="%l7";
42$K_60_79="%g5";
43@K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);
44
45$ctx="%i0";
46$inp="%i1";
47$len="%i2";
48$tmp0="%i3";
49$tmp1="%i4";
50$tmp2="%i5";
51
52sub BODY_00_15 {
53my ($i,$a,$b,$c,$d,$e)=@_;
54my $xi=($i&1)?@X[($i/2)%8]:$Xi;
55
56$code.=<<___;
57 sll $a,5,$tmp0 !! $i
58 add @K[$i/20],$e,$e
59 srl $a,27,$tmp1
60 add $tmp0,$e,$e
61 and $c,$b,$tmp0
62 add $tmp1,$e,$e
63 sll $b,30,$tmp2
64 andn $d,$b,$tmp1
65 srl $b,2,$b
66 or $tmp1,$tmp0,$tmp1
67 or $tmp2,$b,$b
68 add $xi,$e,$e
69___
70if ($i&1 && $i<15) {
71 $code.=
72 " srlx @X[(($i+1)/2)%8],32,$Xi\n";
73}
74$code.=<<___;
75 add $tmp1,$e,$e
76___
77}
78
79sub Xupdate {
80my ($i,$a,$b,$c,$d,$e)=@_;
81my $j=$i/2;
82
83if ($i&1) {
84$code.=<<___;
85 sll $a,5,$tmp0 !! $i
86 add @K[$i/20],$e,$e
87 srl $a,27,$tmp1
88___
89} else {
90$code.=<<___;
91 sllx @X[($j+6)%8],32,$Xi ! Xupdate($i)
92 xor @X[($j+1)%8],@X[$j%8],@X[$j%8]
93 srlx @X[($j+7)%8],32,$tmp1
94 xor @X[($j+4)%8],@X[$j%8],@X[$j%8]
95 sll $a,5,$tmp0 !! $i
96 or $tmp1,$Xi,$Xi
97 add @K[$i/20],$e,$e !!
98 xor $Xi,@X[$j%8],@X[$j%8]
99 srlx @X[$j%8],31,$Xi
100 add @X[$j%8],@X[$j%8],@X[$j%8]
101 and $Xi,$rot1m,$Xi
102 andn @X[$j%8],$rot1m,@X[$j%8]
103 srl $a,27,$tmp1 !!
104 or $Xi,@X[$j%8],@X[$j%8]
105___
106}
107}
108
109sub BODY_16_19 {
110my ($i,$a,$b,$c,$d,$e)=@_;
111
112 &Xupdate(@_);
113 if ($i&1) {
114 $xi=@X[($i/2)%8];
115 } else {
116 $xi=$Xi;
117 $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
118 }
119$code.=<<___;
120 add $tmp0,$e,$e !!
121 and $c,$b,$tmp0
122 add $tmp1,$e,$e
123 sll $b,30,$tmp2
124 add $xi,$e,$e
125 andn $d,$b,$tmp1
126 srl $b,2,$b
127 or $tmp1,$tmp0,$tmp1
128 or $tmp2,$b,$b
129 add $tmp1,$e,$e
130___
131}
132
133sub BODY_20_39 {
134my ($i,$a,$b,$c,$d,$e)=@_;
135my $xi;
136 &Xupdate(@_);
137 if ($i&1) {
138 $xi=@X[($i/2)%8];
139 } else {
140 $xi=$Xi;
141 $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
142 }
143$code.=<<___;
144 add $tmp0,$e,$e !!
145 xor $c,$b,$tmp0
146 add $tmp1,$e,$e
147 sll $b,30,$tmp2
148 xor $d,$tmp0,$tmp1
149 srl $b,2,$b
150 add $tmp1,$e,$e
151 or $tmp2,$b,$b
152 add $xi,$e,$e
153___
154}
155
156sub BODY_40_59 {
157my ($i,$a,$b,$c,$d,$e)=@_;
158my $xi;
159 &Xupdate(@_);
160 if ($i&1) {
161 $xi=@X[($i/2)%8];
162 } else {
163 $xi=$Xi;
164 $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
165 }
166$code.=<<___;
167 add $tmp0,$e,$e !!
168 and $c,$b,$tmp0
169 add $tmp1,$e,$e
170 sll $b,30,$tmp2
171 or $c,$b,$tmp1
172 srl $b,2,$b
173 and $d,$tmp1,$tmp1
174 add $xi,$e,$e
175 or $tmp1,$tmp0,$tmp1
176 or $tmp2,$b,$b
177 add $tmp1,$e,$e
178___
179}
180
181$code.=<<___ if ($bits==64);
182.register %g2,#scratch
183.register %g3,#scratch
184___
185$code.=<<___;
186.section ".text",#alloc,#execinstr
187
188.align 32
189.globl sha1_block_data_order
190sha1_block_data_order:
191 save %sp,-$frame,%sp
192 sllx $len,6,$len
193 add $inp,$len,$len
194
195 or %g0,1,$rot1m
196 sllx $rot1m,32,$rot1m
197 or $rot1m,1,$rot1m
198
199 ld [$ctx+0],$A
200 ld [$ctx+4],$B
201 ld [$ctx+8],$C
202 ld [$ctx+12],$D
203 ld [$ctx+16],$E
204 andn $inp,7,$tmp0
205
206 sethi %hi(0x5a827999),$K_00_19
207 or $K_00_19,%lo(0x5a827999),$K_00_19
208 sethi %hi(0x6ed9eba1),$K_20_39
209 or $K_20_39,%lo(0x6ed9eba1),$K_20_39
210 sethi %hi(0x8f1bbcdc),$K_40_59
211 or $K_40_59,%lo(0x8f1bbcdc),$K_40_59
212 sethi %hi(0xca62c1d6),$K_60_79
213 or $K_60_79,%lo(0xca62c1d6),$K_60_79
214
215.Lloop:
216 ldx [$tmp0+0],@X[0]
217 ldx [$tmp0+16],@X[2]
218 ldx [$tmp0+32],@X[4]
219 ldx [$tmp0+48],@X[6]
220 and $inp,7,$tmp1
221 ldx [$tmp0+8],@X[1]
222 sll $tmp1,3,$tmp1
223 ldx [$tmp0+24],@X[3]
224 subcc %g0,$tmp1,$tmp2 ! should be 64-$tmp1, but -$tmp1 works too
225 ldx [$tmp0+40],@X[5]
226 bz,pt %icc,.Laligned
227 ldx [$tmp0+56],@X[7]
228
229 sllx @X[0],$tmp1,@X[0]
230 ldx [$tmp0+64],$tmp64
231___
232for($i=0;$i<7;$i++)
233{ $code.=<<___;
234 srlx @X[$i+1],$tmp2,$Xi
235 sllx @X[$i+1],$tmp1,@X[$i+1]
236 or $Xi,@X[$i],@X[$i]
237___
238}
239$code.=<<___;
240 srlx $tmp64,$tmp2,$tmp64
241 or $tmp64,@X[7],@X[7]
242.Laligned:
243 srlx @X[0],32,$Xi
244___
245for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
246for (;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
247for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
248for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
249for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
250$code.=<<___;
251
252 ld [$ctx+0],@X[0]
253 ld [$ctx+4],@X[1]
254 ld [$ctx+8],@X[2]
255 ld [$ctx+12],@X[3]
256 add $inp,64,$inp
257 ld [$ctx+16],@X[4]
258 cmp $inp,$len
259
260 add $A,@X[0],$A
261 st $A,[$ctx+0]
262 add $B,@X[1],$B
263 st $B,[$ctx+4]
264 add $C,@X[2],$C
265 st $C,[$ctx+8]
266 add $D,@X[3],$D
267 st $D,[$ctx+12]
268 add $E,@X[4],$E
269 st $E,[$ctx+16]
270
271 bne `$bits==64?"%xcc":"%icc"`,.Lloop
272 andn $inp,7,$tmp0
273
274 ret
275 restore
276.type sha1_block_data_order,#function
277.size sha1_block_data_order,(.-sha1_block_data_order)
278.asciz "SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
279.align 4
280___
281
282$code =~ s/\`([^\`]*)\`/eval $1/gem;
283print $code;
284close STDOUT;
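
The $rot1m mask (0x0000000100000001) is the gimmick the header comment refers to: with two schedule words packed per 64-bit register, Xupdate's rotate-left-by-1 of both 32-bit lanes is one doubling plus a masked carry fix-up (the srlx/add/and/andn/or group above). A Perl sketch of that lane rotation (assuming 64-bit Perl; illustrative only):

sub rotl1_two_lanes {
    my ($x) = @_;                        # two 32-bit words packed in 64 bits
    my $rot1m = (1 << 32) | 1;           # bit 0 of each lane
    my $top = ($x >> 31) & $rot1m;       # srlx/and: each lane's former top bit
    my $dbl = ($x << 1) & ~$rot1m;       # add/andn: lanes shifted left, with the
                                         # lane-0 carry into lane 1 cleared
    return $dbl | $top;                  # or: rotated-out bits re-inserted
}
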
diff --git a/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl b/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl
deleted file mode 100644
index 85e8d68086..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl
+++ /dev/null
@@ -1,601 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# January 2009
11#
12# Provided that UltraSPARC VIS instructions are pipe-lined(*) and
13# pairable(*) with IALU ones, offloading of Xupdate to the UltraSPARC
14# Graphic Unit would make it possible to achieve higher instruction-
15# level parallelism, ILP, and thus higher performance. It should be
16# explicitly noted that ILP is the keyword, and it means that this
17# code would be unsuitable for cores like UltraSPARC-Tx. The idea is
18# not really novel; Sun has had a VIS-powered implementation for a while.
19# Unlike Sun's implementation this one can process multiple unaligned
20# input blocks, and as such works as drop-in replacement for OpenSSL
21# sha1_block_data_order. Performance improvement was measured to be
22# 40% over pure IALU sha1-sparcv9.pl on UltraSPARC-IIi, but 12% on
23# UltraSPARC-III. See below for discussion...
24#
25# The module is of no direct interest for OpenSSL, because
26# it doesn't provide better performance on contemporary SPARCv9 CPUs,
27# UltraSPARC-Tx and SPARC64-V[II] to be specific. Those who feel they
28# absolutely must score on UltraSPARC-I-IV can simply replace
29# crypto/sha/asm/sha1-sparcv9.pl with this module.
30#
31# (*) "Pipe-lined" means that even if it takes several cycles to
32# complete, next instruction using same functional unit [but not
33# depending on the result of the current instruction] can start
34# execution without having to wait for the unit. "Pairable"
35# means that two [or more] independent instructions can be
36# issued at the very same time.
37
38$bits=32;
39for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
40if ($bits==64) { $bias=2047; $frame=192; }
41else { $bias=0; $frame=112; }
42
43$output=shift;
44open STDOUT,">$output";
45
46$ctx="%i0";
47$inp="%i1";
48$len="%i2";
49$tmp0="%i3";
50$tmp1="%i4";
51$tmp2="%i5";
52$tmp3="%g5";
53
54$base="%g1";
55$align="%g4";
56$Xfer="%o5";
57$nXfer=$tmp3;
58$Xi="%o7";
59
60$A="%l0";
61$B="%l1";
62$C="%l2";
63$D="%l3";
64$E="%l4";
65@V=($A,$B,$C,$D,$E);
66
67$Actx="%o0";
68$Bctx="%o1";
69$Cctx="%o2";
70$Dctx="%o3";
71$Ectx="%o4";
72
73$fmul="%f32";
74$VK_00_19="%f34";
75$VK_20_39="%f36";
76$VK_40_59="%f38";
77$VK_60_79="%f40";
78@VK=($VK_00_19,$VK_20_39,$VK_40_59,$VK_60_79);
79@X=("%f0", "%f1", "%f2", "%f3", "%f4", "%f5", "%f6", "%f7",
80 "%f8", "%f9","%f10","%f11","%f12","%f13","%f14","%f15","%f16");
81
82# This is reference 2x-parallelized VIS-powered Xupdate procedure. It
83# covers even K_NN_MM addition...
84sub Xupdate {
85my ($i)=@_;
86my $K=@VK[($i+16)/20];
87my $j=($i+16)%16;
88
89# [ provided that GSR.alignaddr_offset is 5, $mul contains
90# 0x100ULL<<32|0x100 value and K_NN_MM are pre-loaded to
91# chosen registers... ]
92$code.=<<___;
93 fxors @X[($j+13)%16],@X[$j],@X[$j] !-1/-1/-1:X[0]^=X[13]
94 fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
95 fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
96 fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
97 faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
98 fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
99 fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
100 ![fxors %f15,%f2,%f2]
101 for %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
102 ![fxors %f0,%f3,%f3] !10/17/12:X[0] dependency
103 fpadd32 $K,@X[$j],%f20
104 std %f20,[$Xfer+`4*$j`]
105___
106# The numbers delimited with slash are the earliest possible dispatch
107# cycles for given instruction assuming 1 cycle latency for simple VIS
108# instructions, such as on UltraSPARC-I&II, 3 cycles latency, such as
109# on UltraSPARC-III&IV, and 2 cycles latency(*), respectively. Being
110# 2x-parallelized the procedure is "worth" 5, 8.5 or 6 ticks per SHA1
111# round. As [long as] FPU/VIS instructions are perfectly pairable with
112# IALU ones, the round timing is defined by the maximum between VIS
113# and IALU timings. The latter varies from round to round and averages
114# out at 6.25 ticks. This means that USI&II should operate at IALU
115# rate, while USIII&IV - at VIS rate. This explains why the performance
116# improvement varies among processors, given that the pure IALU
117# sha1-sparcv9.pl module exhibits virtually uniform performance of
118# ~9.3 cycles per SHA1 round. Timings mentioned above are theoretical
119# lower limits. Real-life performance was measured to be 6.6 cycles
120# per SHA1 round on USIIi and 8.3 on USIII. The latter is lower than
121# half-round VIS timing, because there are 16 Xupdate-free rounds,
122# which "push down" average theoretical timing to 8 cycles...
123
124# (*) SPARC64-V[II] was originally believed to have 2 cycles VIS
125# latency. Well, it might have, but it doesn't have dedicated
126# VIS-unit. Instead, VIS instructions are executed by other
127# functional units, ones used here - by IALU. This doesn't
128# improve effective ILP...
129}
130
131# The reference Xupdate procedure is then "strained" over *pairs* of
132# BODY_NN_MM and kind of modulo-scheduled in respect to X[n]^=X[n+13]
133# and K_NN_MM addition. It's "running" 15 rounds ahead, which leaves
134# plenty of room to amortize for read-after-write hazard, as well as
135# to fetch and align input for the next spin. The VIS instructions are
136# scheduled for latency of 2 cycles, because there are not enough IALU
137# instructions to schedule for latency of 3, while scheduling for 1
138# would give no gain on USI&II anyway.
139
140sub BODY_00_19 {
141my ($i,$a,$b,$c,$d,$e)=@_;
142my $j=$i&~1;
143my $k=($j+16+2)%16; # ahead reference
144my $l=($j+16-2)%16; # behind reference
145my $K=@VK[($j+16-2)/20];
146
147$j=($j+16)%16;
148
149$code.=<<___ if (!($i&1));
150 sll $a,5,$tmp0 !! $i
151 and $c,$b,$tmp3
152 ld [$Xfer+`4*($i%16)`],$Xi
153 fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
154 srl $a,27,$tmp1
155 add $tmp0,$e,$e
156 fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
157 sll $b,30,$tmp2
158 add $tmp1,$e,$e
159 andn $d,$b,$tmp1
160 add $Xi,$e,$e
161 fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
162 srl $b,2,$b
163 or $tmp1,$tmp3,$tmp1
164 or $tmp2,$b,$b
165 add $tmp1,$e,$e
166 faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
167___
168$code.=<<___ if ($i&1);
169 sll $a,5,$tmp0 !! $i
170 and $c,$b,$tmp3
171 ld [$Xfer+`4*($i%16)`],$Xi
172 fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
173 srl $a,27,$tmp1
174 add $tmp0,$e,$e
175 fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
176 sll $b,30,$tmp2
177 add $tmp1,$e,$e
178 fpadd32 $K,@X[$l],%f20 !
179 andn $d,$b,$tmp1
180 add $Xi,$e,$e
181 fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
182 srl $b,2,$b
183 or $tmp1,$tmp3,$tmp1
184 fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
185 or $tmp2,$b,$b
186 add $tmp1,$e,$e
187___
188$code.=<<___ if ($i&1 && $i>=2);
189 std %f20,[$Xfer+`4*$l`] !
190___
191}
192
193sub BODY_20_39 {
194my ($i,$a,$b,$c,$d,$e)=@_;
195my $j=$i&~1;
196my $k=($j+16+2)%16; # ahead reference
197my $l=($j+16-2)%16; # behind reference
198my $K=@VK[($j+16-2)/20];
199
200$j=($j+16)%16;
201
202$code.=<<___ if (!($i&1) && $i<64);
203 sll $a,5,$tmp0 !! $i
204 ld [$Xfer+`4*($i%16)`],$Xi
205 fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
206 srl $a,27,$tmp1
207 add $tmp0,$e,$e
208 fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
209 xor $c,$b,$tmp0
210 add $tmp1,$e,$e
211 sll $b,30,$tmp2
212 xor $d,$tmp0,$tmp1
213 fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
214 srl $b,2,$b
215 add $tmp1,$e,$e
216 or $tmp2,$b,$b
217 add $Xi,$e,$e
218 faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
219___
220$code.=<<___ if ($i&1 && $i<64);
221 sll $a,5,$tmp0 !! $i
222 ld [$Xfer+`4*($i%16)`],$Xi
223 fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
224 srl $a,27,$tmp1
225 add $tmp0,$e,$e
226 fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
227 xor $c,$b,$tmp0
228 add $tmp1,$e,$e
229 fpadd32 $K,@X[$l],%f20 !
230 sll $b,30,$tmp2
231 xor $d,$tmp0,$tmp1
232 fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
233 srl $b,2,$b
234 add $tmp1,$e,$e
235 fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
236 or $tmp2,$b,$b
237 add $Xi,$e,$e
238 std %f20,[$Xfer+`4*$l`] !
239___
240$code.=<<___ if ($i==64);
241 sll $a,5,$tmp0 !! $i
242 ld [$Xfer+`4*($i%16)`],$Xi
243 fpadd32 $K,@X[$l],%f20
244 srl $a,27,$tmp1
245 add $tmp0,$e,$e
246 xor $c,$b,$tmp0
247 add $tmp1,$e,$e
248 sll $b,30,$tmp2
249 xor $d,$tmp0,$tmp1
250 std %f20,[$Xfer+`4*$l`]
251 srl $b,2,$b
252 add $tmp1,$e,$e
253 or $tmp2,$b,$b
254 add $Xi,$e,$e
255___
256$code.=<<___ if ($i>64);
257 sll $a,5,$tmp0 !! $i
258 ld [$Xfer+`4*($i%16)`],$Xi
259 srl $a,27,$tmp1
260 add $tmp0,$e,$e
261 xor $c,$b,$tmp0
262 add $tmp1,$e,$e
263 sll $b,30,$tmp2
264 xor $d,$tmp0,$tmp1
265 srl $b,2,$b
266 add $tmp1,$e,$e
267 or $tmp2,$b,$b
268 add $Xi,$e,$e
269___
270}
271
272sub BODY_40_59 {
273my ($i,$a,$b,$c,$d,$e)=@_;
274my $j=$i&~1;
275my $k=($j+16+2)%16; # ahead reference
276my $l=($j+16-2)%16; # behind reference
277my $K=@VK[($j+16-2)/20];
278
279$j=($j+16)%16;
280
281$code.=<<___ if (!($i&1));
282 sll $a,5,$tmp0 !! $i
283 ld [$Xfer+`4*($i%16)`],$Xi
284 fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
285 srl $a,27,$tmp1
286 add $tmp0,$e,$e
287 fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
288 and $c,$b,$tmp0
289 add $tmp1,$e,$e
290 sll $b,30,$tmp2
291 or $c,$b,$tmp1
292 fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
293 srl $b,2,$b
294 and $d,$tmp1,$tmp1
295 add $Xi,$e,$e
296 or $tmp1,$tmp0,$tmp1
297 faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
298 or $tmp2,$b,$b
299 add $tmp1,$e,$e
300 fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
301___
302$code.=<<___ if ($i&1);
303 sll $a,5,$tmp0 !! $i
304 ld [$Xfer+`4*($i%16)`],$Xi
305 srl $a,27,$tmp1
306 add $tmp0,$e,$e
307 fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
308 and $c,$b,$tmp0
309 add $tmp1,$e,$e
310 fpadd32 $K,@X[$l],%f20 !
311 sll $b,30,$tmp2
312 or $c,$b,$tmp1
313 fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
314 srl $b,2,$b
315 and $d,$tmp1,$tmp1
316 fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
317 add $Xi,$e,$e
318 or $tmp1,$tmp0,$tmp1
319 or $tmp2,$b,$b
320 add $tmp1,$e,$e
321 std %f20,[$Xfer+`4*$l`] !
322___
323}
324
325# If there is more data to process, then we pre-fetch the data for
326# next iteration in last ten rounds...
327sub BODY_70_79 {
328my ($i,$a,$b,$c,$d,$e)=@_;
329my $j=$i&~1;
330my $m=($i%8)*2;
331
332$j=($j+16)%16;
333
334$code.=<<___ if ($i==70);
335 sll $a,5,$tmp0 !! $i
336 ld [$Xfer+`4*($i%16)`],$Xi
337 srl $a,27,$tmp1
338 add $tmp0,$e,$e
339 ldd [$inp+64],@X[0]
340 xor $c,$b,$tmp0
341 add $tmp1,$e,$e
342 sll $b,30,$tmp2
343 xor $d,$tmp0,$tmp1
344 srl $b,2,$b
345 add $tmp1,$e,$e
346 or $tmp2,$b,$b
347 add $Xi,$e,$e
348
349 and $inp,-64,$nXfer
350 inc 64,$inp
351 and $nXfer,255,$nXfer
352 alignaddr %g0,$align,%g0
353 add $base,$nXfer,$nXfer
354___
355$code.=<<___ if ($i==71);
356 sll $a,5,$tmp0 !! $i
357 ld [$Xfer+`4*($i%16)`],$Xi
358 srl $a,27,$tmp1
359 add $tmp0,$e,$e
360 xor $c,$b,$tmp0
361 add $tmp1,$e,$e
362 sll $b,30,$tmp2
363 xor $d,$tmp0,$tmp1
364 srl $b,2,$b
365 add $tmp1,$e,$e
366 or $tmp2,$b,$b
367 add $Xi,$e,$e
368___
369$code.=<<___ if ($i>=72);
370 faligndata @X[$m],@X[$m+2],@X[$m]
371 sll $a,5,$tmp0 !! $i
372 ld [$Xfer+`4*($i%16)`],$Xi
373 srl $a,27,$tmp1
374 add $tmp0,$e,$e
375 xor $c,$b,$tmp0
376 add $tmp1,$e,$e
377 fpadd32 $VK_00_19,@X[$m],%f20
378 sll $b,30,$tmp2
379 xor $d,$tmp0,$tmp1
380 srl $b,2,$b
381 add $tmp1,$e,$e
382 or $tmp2,$b,$b
383 add $Xi,$e,$e
384___
385$code.=<<___ if ($i<77);
386 ldd [$inp+`8*($i+1-70)`],@X[2*($i+1-70)]
387___
388$code.=<<___ if ($i==77); # redundant if $inp was aligned
389 add $align,63,$tmp0
390 and $tmp0,-8,$tmp0
391 ldd [$inp+$tmp0],@X[16]
392___
393$code.=<<___ if ($i>=72);
394 std %f20,[$nXfer+`4*$m`]
395___
396}
397
398$code.=<<___;
399.section ".text",#alloc,#execinstr
400
401.align 64
402vis_const:
403.long 0x5a827999,0x5a827999 ! K_00_19
404.long 0x6ed9eba1,0x6ed9eba1 ! K_20_39
405.long 0x8f1bbcdc,0x8f1bbcdc ! K_40_59
406.long 0xca62c1d6,0xca62c1d6 ! K_60_79
407.long 0x00000100,0x00000100
408.align 64
409.type vis_const,#object
410.size vis_const,(.-vis_const)
411
412.globl sha1_block_data_order
413sha1_block_data_order:
414 save %sp,-$frame,%sp
415 add %fp,$bias-256,$base
416
4171: call .+8
418 add %o7,vis_const-1b,$tmp0
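	! (editor's note, not in the original source: "call .+8" deposits the
	! call instruction's own address in %o7, so the add above computes
	! %o7 + (vis_const - 1b), i.e. the run-time address of vis_const,
	! without needing an absolute relocation -- the usual SPARC PIC idiom)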
419
420 ldd [$tmp0+0],$VK_00_19
421 ldd [$tmp0+8],$VK_20_39
422 ldd [$tmp0+16],$VK_40_59
423 ldd [$tmp0+24],$VK_60_79
424 ldd [$tmp0+32],$fmul
425
426 ld [$ctx+0],$Actx
427 and $base,-256,$base
428 ld [$ctx+4],$Bctx
429 sub $base,$bias+$frame,%sp
430 ld [$ctx+8],$Cctx
431 and $inp,7,$align
432 ld [$ctx+12],$Dctx
433 and $inp,-8,$inp
434 ld [$ctx+16],$Ectx
435
436 ! X[16] is maintained in FP register bank
437 alignaddr %g0,$align,%g0
438 ldd [$inp+0],@X[0]
439 sub $inp,-64,$Xfer
440 ldd [$inp+8],@X[2]
441 and $Xfer,-64,$Xfer
442 ldd [$inp+16],@X[4]
443 and $Xfer,255,$Xfer
444 ldd [$inp+24],@X[6]
445 add $base,$Xfer,$Xfer
446 ldd [$inp+32],@X[8]
447 ldd [$inp+40],@X[10]
448 ldd [$inp+48],@X[12]
449 brz,pt $align,.Laligned
450 ldd [$inp+56],@X[14]
451
452 ldd [$inp+64],@X[16]
453 faligndata @X[0],@X[2],@X[0]
454 faligndata @X[2],@X[4],@X[2]
455 faligndata @X[4],@X[6],@X[4]
456 faligndata @X[6],@X[8],@X[6]
457 faligndata @X[8],@X[10],@X[8]
458 faligndata @X[10],@X[12],@X[10]
459 faligndata @X[12],@X[14],@X[12]
460 faligndata @X[14],@X[16],@X[14]
461
462.Laligned:
463 mov 5,$tmp0
464 dec 1,$len
465 alignaddr %g0,$tmp0,%g0
466 fpadd32 $VK_00_19,@X[0],%f16
467 fpadd32 $VK_00_19,@X[2],%f18
468 fpadd32 $VK_00_19,@X[4],%f20
469 fpadd32 $VK_00_19,@X[6],%f22
470 fpadd32 $VK_00_19,@X[8],%f24
471 fpadd32 $VK_00_19,@X[10],%f26
472 fpadd32 $VK_00_19,@X[12],%f28
473 fpadd32 $VK_00_19,@X[14],%f30
474 std %f16,[$Xfer+0]
475 mov $Actx,$A
476 std %f18,[$Xfer+8]
477 mov $Bctx,$B
478 std %f20,[$Xfer+16]
479 mov $Cctx,$C
480 std %f22,[$Xfer+24]
481 mov $Dctx,$D
482 std %f24,[$Xfer+32]
483 mov $Ectx,$E
484 std %f26,[$Xfer+40]
485 fxors @X[13],@X[0],@X[0]
486 std %f28,[$Xfer+48]
487 ba .Loop
488 std %f30,[$Xfer+56]
489.align 32
490.Loop:
491___
492for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
493for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
494for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
495for (;$i<70;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
496$code.=<<___;
497 tst $len
498 bz,pn `$bits==32?"%icc":"%xcc"`,.Ltail
499 nop
500___
501for (;$i<80;$i++) { &BODY_70_79($i,@V); unshift(@V,pop(@V)); }
502$code.=<<___;
503 add $A,$Actx,$Actx
504 add $B,$Bctx,$Bctx
505 add $C,$Cctx,$Cctx
506 add $D,$Dctx,$Dctx
507 add $E,$Ectx,$Ectx
508 mov 5,$tmp0
509 fxors @X[13],@X[0],@X[0]
510 mov $Actx,$A
511 mov $Bctx,$B
512 mov $Cctx,$C
513 mov $Dctx,$D
514 mov $Ectx,$E
515 alignaddr %g0,$tmp0,%g0
516 dec 1,$len
517 ba .Loop
518 mov $nXfer,$Xfer
519
520.align 32
521.Ltail:
522___
523for($i=70;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
524$code.=<<___;
525 add $A,$Actx,$Actx
526 add $B,$Bctx,$Bctx
527 add $C,$Cctx,$Cctx
528 add $D,$Dctx,$Dctx
529 add $E,$Ectx,$Ectx
530
531 st $Actx,[$ctx+0]
532 st $Bctx,[$ctx+4]
533 st $Cctx,[$ctx+8]
534 st $Dctx,[$ctx+12]
535 st $Ectx,[$ctx+16]
536
537 ret
538 restore
539.type sha1_block_data_order,#function
540.size sha1_block_data_order,(.-sha1_block_data_order)
541.asciz "SHA1 block transform for SPARCv9a, CRYPTOGAMS by <appro\@openssl.org>"
542.align 4
543___
544
545# The purpose of these subroutines is to explicitly encode VIS instructions,
546# so that one can compile the module without having to specify VIS
547# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
548# The idea is to preserve the option of producing a "universal" binary and
549# let the programmer detect at run-time whether the current CPU is VIS capable.
550sub unvis {
551my ($mnemonic,$rs1,$rs2,$rd)=@_;
552my ($ref,$opf);
553my %visopf = ( "fmul8ulx16" => 0x037,
554 "faligndata" => 0x048,
555 "fpadd32" => 0x052,
556 "fxor" => 0x06c,
557 "fxors" => 0x06d );
558
559 $ref = "$mnemonic\t$rs1,$rs2,$rd";
560
561 if ($opf=$visopf{$mnemonic}) {
562 foreach ($rs1,$rs2,$rd) {
563 return $ref if (!/%f([0-9]{1,2})/);
564 $_=$1;
565 if ($1>=32) {
566 return $ref if ($1&1);
567 # re-encode for upper double register addressing
568 $_=($1|$1>>5)&31;
569 }
570 }
571
572 return sprintf ".word\t0x%08x !%s",
573 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
574 $ref;
575 } else {
576 return $ref;
577 }
578}
579sub unalignaddr {
580my ($mnemonic,$rs1,$rs2,$rd)=@_;
581my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
582my $ref="$mnemonic\t$rs1,$rs2,$rd";
583
584 foreach ($rs1,$rs2,$rd) {
585 if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
586 else { return $ref; }
587 }
588 return sprintf ".word\t0x%08x !%s",
589 0x81b00300|$rd<<25|$rs1<<14|$rs2,
590 $ref;
591}
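# A worked example of the encoding above (an editor's sketch, not part of
# the original module): feeding "fpadd32 %f16,%f18,%f20" through unvis()
# selects opf 0x052 and packs rd=20, rs1=16, rs2=18 into the template:
#
#	0x81b00000 | 20<<25 | 16<<14 | 0x052<<5 | 18  ==  0xa9b40a52
#
# so the routine returns
#
#	.word	0xa9b40a52 !fpadd32	%f16,%f18,%f20
#
# which any pre-VIS assembler accepts verbatim, while a VIS-capable CPU
# executes it as the intended instruction.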
592
593$code =~ s/\`([^\`]*)\`/eval $1/gem;
594$code =~ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),(%f[0-9]{1,2}),(%f[0-9]{1,2})/
595 &unvis($1,$2,$3,$4)
596 /gem;
597$code =~ s/\b(alignaddr)\s+(%[goli][0-7]),(%[goli][0-7]),(%[goli][0-7])/
598 &unalignaddr($1,$2,$3,$4)
599 /gem;
600print $code;
601close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-thumb.pl b/src/lib/libcrypto/sha/asm/sha1-thumb.pl
deleted file mode 100644
index 7c9ea9b029..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-thumb.pl
+++ /dev/null
@@ -1,259 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# sha1_block for Thumb.
11#
12# January 2007.
13#
14# The code is not of direct interest to OpenSSL, because of its low
15# performance. Its purpose is to establish a _size_ benchmark. A pretty
16# useless one, I must say, because the 30% or 88 bytes larger ARMv4 code
17# [available on demand] is almost _twice_ as fast. It should also be
18# noted that in-lining of .Lcommon and .Lrotate improves performance
19# by over 40%, while code size increases by only 10% or 32 bytes. But once
20# again, the goal was to establish a _size_ benchmark, not performance.
21
22$output=shift;
23open STDOUT,">$output";
24
25$inline=0;
26#$cheat_on_binutils=1;
27
28$t0="r0";
29$t1="r1";
30$t2="r2";
31$a="r3";
32$b="r4";
33$c="r5";
34$d="r6";
35$e="r7";
36$K="r8"; # "upper" registers can be used in add/sub and mov insns
37$ctx="r9";
38$inp="r10";
39$len="r11";
40$Xi="r12";
41
42sub common {
43<<___;
44 sub $t0,#4
45 ldr $t1,[$t0]
46 add $e,$K @ E+=K_xx_xx
47 lsl $t2,$a,#5
48 add $t2,$e
49 lsr $e,$a,#27
50 add $t2,$e @ E+=ROR(A,27)
51 add $t2,$t1 @ E+=X[i]
52___
53}
54sub rotate {
55<<___;
56 mov $e,$d @ E=D
57 mov $d,$c @ D=C
58 lsl $c,$b,#30
59 lsr $b,$b,#2
60 orr $c,$b @ C=ROR(B,2)
61 mov $b,$a @ B=A
62 add $a,$t2,$t1 @ A=E+F_xx_xx(B,C,D)
63___
64}
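# Editor's sketch, not part of the original module: a plain-Perl reference
# model of the round that &common() and &rotate() implement between them,
# useful for cross-checking the generated Thumb code. $f is the already
# computed F_00_19/F_20_39/F_40_59 value; all arithmetic is 32-bit.
sub sha1_round_ref {
	my ($a,$b,$c,$d,$e,$f,$x,$k) = @_;
	my $rol5 = (($a<<5)|($a>>27)) & 0xffffffff;	# ROL(A,5), built as lsl+lsr
	my $t = ($e+$rol5+$x+$k+$f) & 0xffffffff;	# E+=K, E+=ROL(A,5), E+=X[i], +F
	my $ror2 = (($b>>2)|($b<<30)) & 0xffffffff;	# C=ROR(B,2)
	return ($t,$a,$ror2,$c,$d);			# new (A,B,C,D,E)
}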
65
66sub BODY_00_19 {
67$code.=$inline?&common():"\tbl .Lcommon\n";
68$code.=<<___;
69 mov $t1,$c
70 eor $t1,$d
71 and $t1,$b
72 eor $t1,$d @ F_00_19(B,C,D)
73___
74$code.=$inline?&rotate():"\tbl .Lrotate\n";
75}
76
77sub BODY_20_39 {
78$code.=$inline?&common():"\tbl .Lcommon\n";
79$code.=<<___;
80 mov $t1,$b
81 eor $t1,$c
82 eor $t1,$d @ F_20_39(B,C,D)
83___
84$code.=$inline?&rotate():"\tbl .Lrotate\n";
85}
86
87sub BODY_40_59 {
88$code.=$inline?&common():"\tbl .Lcommon\n";
89$code.=<<___;
90 mov $t1,$b
91 and $t1,$c
92 mov $e,$b
93 orr $e,$c
94 and $e,$d
95 orr $t1,$e @ F_40_59(B,C,D)
96___
97$code.=$inline?&rotate():"\tbl .Lrotate\n";
98}
99
100$code=<<___;
101.text
102.code 16
103
104.global sha1_block_data_order
105.type sha1_block_data_order,%function
106
107.align 2
108sha1_block_data_order:
109___
110if ($cheat_on_binutils) {
111$code.=<<___;
112.code 32
113 add r3,pc,#1
114 bx r3 @ switch to Thumb ISA
115.code 16
116___
117}
118$code.=<<___;
119 push {r4-r7}
120 mov r3,r8
121 mov r4,r9
122 mov r5,r10
123 mov r6,r11
124 mov r7,r12
125 push {r3-r7,lr}
126 lsl r2,#6
127 mov $ctx,r0 @ save context
128 mov $inp,r1 @ save inp
129 mov $len,r2 @ save len
130 add $len,$inp @ $len to point at inp end
131
132.Lloop:
133 mov $Xi,sp
134 mov $t2,sp
135 sub $t2,#16*4 @ [3]
136.LXload:
137 ldrb $a,[$t1,#0] @ $t1 is r1 and holds inp
138 ldrb $b,[$t1,#1]
139 ldrb $c,[$t1,#2]
140 ldrb $d,[$t1,#3]
141 lsl $a,#24
142 lsl $b,#16
143 lsl $c,#8
144 orr $a,$b
145 orr $a,$c
146 orr $a,$d
147 add $t1,#4
148 push {$a}
149 cmp sp,$t2
150 bne .LXload @ [+14*16]
151
152 mov $inp,$t1 @ update $inp
153 sub $t2,#32*4
154 sub $t2,#32*4
155 mov $e,#31 @ [+4]
156.LXupdate:
157 ldr $a,[sp,#15*4]
158 ldr $b,[sp,#13*4]
159 ldr $c,[sp,#7*4]
160 ldr $d,[sp,#2*4]
161 eor $a,$b
162 eor $a,$c
163 eor $a,$d
164 ror $a,$e
165 push {$a}
166 cmp sp,$t2
167 bne .LXupdate @ [+(11+1)*64]
168
169 ldmia $t0!,{$a,$b,$c,$d,$e} @ $t0 is r0 and holds ctx
170 mov $t0,$Xi
171
172 ldr $t2,.LK_00_19
173 mov $t1,$t0
174 sub $t1,#20*4
175 mov $Xi,$t1
176 mov $K,$t2 @ [+7+4]
177.L_00_19:
178___
179 &BODY_00_19();
180$code.=<<___;
181 cmp $Xi,$t0
182 bne .L_00_19 @ [+(2+9+4+2+8+2)*20]
183
184 ldr $t2,.LK_20_39
185 mov $t1,$t0
186 sub $t1,#20*4
187 mov $Xi,$t1
188 mov $K,$t2 @ [+5]
189.L_20_39_or_60_79:
190___
191 &BODY_20_39();
192$code.=<<___;
193 cmp $Xi,$t0
194 bne .L_20_39_or_60_79 @ [+(2+9+3+2+8+2)*20*2]
195 cmp sp,$t0
196 beq .Ldone @ [+2]
197
198 ldr $t2,.LK_40_59
199 mov $t1,$t0
200 sub $t1,#20*4
201 mov $Xi,$t1
202 mov $K,$t2 @ [+5]
203.L_40_59:
204___
205 &BODY_40_59();
206$code.=<<___;
207 cmp $Xi,$t0
208 bne .L_40_59 @ [+(2+9+6+2+8+2)*20]
209
210 ldr $t2,.LK_60_79
211 mov $Xi,sp
212 mov $K,$t2
213 b .L_20_39_or_60_79 @ [+4]
214.Ldone:
215 mov $t0,$ctx
216 ldr $t1,[$t0,#0]
217 ldr $t2,[$t0,#4]
218 add $a,$t1
219 ldr $t1,[$t0,#8]
220 add $b,$t2
221 ldr $t2,[$t0,#12]
222 add $c,$t1
223 ldr $t1,[$t0,#16]
224 add $d,$t2
225 add $e,$t1
226 stmia $t0!,{$a,$b,$c,$d,$e} @ [+20]
227
228 add sp,#80*4 @ deallocate stack frame
229 mov $t0,$ctx @ restore ctx
230 mov $t1,$inp @ restore inp
231 cmp $t1,$len
232 beq .Lexit
233 b .Lloop @ [+6] total 3212 cycles
234.Lexit:
235 pop {r2-r7}
236 mov r8,r2
237 mov r9,r3
238 mov r10,r4
239 mov r11,r5
240 mov r12,r6
241 mov lr,r7
242 pop {r4-r7}
243 bx lr
244.align 2
245___
246$code.=".Lcommon:\n".&common()."\tmov pc,lr\n" if (!$inline);
247$code.=".Lrotate:\n".&rotate()."\tmov pc,lr\n" if (!$inline);
248$code.=<<___;
249.align 2
250.LK_00_19: .word 0x5a827999
251.LK_20_39: .word 0x6ed9eba1
252.LK_40_59: .word 0x8f1bbcdc
253.LK_60_79: .word 0xca62c1d6
254.size sha1_block_data_order,.-sha1_block_data_order
255.asciz "SHA1 block transform for Thumb, CRYPTOGAMS by <appro\@openssl.org>"
256___
257
258print $code;
259close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
deleted file mode 100755
index 4edc5ea9ad..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
+++ /dev/null
@@ -1,351 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# sha1_block procedure for x86_64.
11#
12# It was brought to my attention that on EM64T compiler-generated code
13# was far behind the 32-bit assembler implementation. This is unlike
14# Opteron, where compiler-generated code was only 15% behind the 32-bit
15# assembler, which originally made it hard to motivate the effort.
16# There was a suggestion to mechanically translate the 32-bit code, but I
17# dismissed it, reasoning that x86_64 offers enough register bank
18# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
19# implementation:-) However! While 64-bit code does perform better
20# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
21# x86_64 does offer a larger *addressable* bank, but the out-of-order core
22# reaches for even more registers through dynamic aliasing, and the EM64T
23# core must have managed to run-time optimize even 32-bit code just as
24# well as the 64-bit one. Performance improvement is summarized in the
25# following table:
26#
27# gcc 3.4 32-bit asm cycles/byte
28# Opteron +45% +20% 6.8
29# Xeon P4 +65% +0% 9.9
30# Core2 +60% +10% 7.0
31
32$flavour = shift;
33$output = shift;
34if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
35
36$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
37
38$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
39( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
40( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
41die "can't locate x86_64-xlate.pl";
42
43open STDOUT,"| $^X $xlate $flavour $output";
44
45$ctx="%rdi"; # 1st arg
46$inp="%rsi"; # 2nd arg
47$num="%rdx"; # 3rd arg
48
49# reassign arguments in order to produce more compact code
50$ctx="%r8";
51$inp="%r9";
52$num="%r10";
53
54$xi="%eax";
55$t0="%ebx";
56$t1="%ecx";
57$A="%edx";
58$B="%esi";
59$C="%edi";
60$D="%ebp";
61$E="%r11d";
62$T="%r12d";
63
64@V=($A,$B,$C,$D,$E,$T);
65
66sub PROLOGUE {
67my $func=shift;
68$code.=<<___;
69.globl $func
70.type $func,\@function,3
71.align 16
72$func:
73 push %rbx
74 push %rbp
75 push %r12
76 mov %rsp,%r11
77 mov %rdi,$ctx # reassigned argument
78 sub \$`8+16*4`,%rsp
79 mov %rsi,$inp # reassigned argument
80 and \$-64,%rsp
81 mov %rdx,$num # reassigned argument
82 mov %r11,`16*4`(%rsp)
83.Lprologue:
84
85 mov 0($ctx),$A
86 mov 4($ctx),$B
87 mov 8($ctx),$C
88 mov 12($ctx),$D
89 mov 16($ctx),$E
90___
91}
92
93sub EPILOGUE {
94my $func=shift;
95$code.=<<___;
96 mov `16*4`(%rsp),%rsi
97 mov (%rsi),%r12
98 mov 8(%rsi),%rbp
99 mov 16(%rsi),%rbx
100 lea 24(%rsi),%rsp
101.Lepilogue:
102 ret
103.size $func,.-$func
104___
105}
106
107sub BODY_00_19 {
108my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
109my $j=$i+1;
110$code.=<<___ if ($i==0);
111 mov `4*$i`($inp),$xi
112 `"bswap $xi" if(!defined($host))`
113 mov $xi,`4*$i`(%rsp)
114___
115$code.=<<___ if ($i<15);
116 lea 0x5a827999($xi,$e),$f
117 mov $c,$t0
118 mov `4*$j`($inp),$xi
119 mov $a,$e
120 xor $d,$t0
121 `"bswap $xi" if(!defined($host))`
122 rol \$5,$e
123 and $b,$t0
124 mov $xi,`4*$j`(%rsp)
125 add $e,$f
126 xor $d,$t0
127 rol \$30,$b
128 add $t0,$f
129___
130$code.=<<___ if ($i>=15);
131 lea 0x5a827999($xi,$e),$f
132 mov `4*($j%16)`(%rsp),$xi
133 mov $c,$t0
134 mov $a,$e
135 xor `4*(($j+2)%16)`(%rsp),$xi
136 xor $d,$t0
137 rol \$5,$e
138 xor `4*(($j+8)%16)`(%rsp),$xi
139 and $b,$t0
140 add $e,$f
141 xor `4*(($j+13)%16)`(%rsp),$xi
142 xor $d,$t0
143 rol \$30,$b
144 add $t0,$f
145 rol \$1,$xi
146 mov $xi,`4*($j%16)`(%rsp)
147___
148}
149
150sub BODY_20_39 {
151my ($i,$a,$b,$c,$d,$e,$f)=@_;
152my $j=$i+1;
153my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
154$code.=<<___ if ($i<79);
155 lea $K($xi,$e),$f
156 mov `4*($j%16)`(%rsp),$xi
157 mov $c,$t0
158 mov $a,$e
159 xor `4*(($j+2)%16)`(%rsp),$xi
160 xor $b,$t0
161 rol \$5,$e
162 xor `4*(($j+8)%16)`(%rsp),$xi
163 xor $d,$t0
164 add $e,$f
165 xor `4*(($j+13)%16)`(%rsp),$xi
166 rol \$30,$b
167 add $t0,$f
168 rol \$1,$xi
169___
170$code.=<<___ if ($i<76);
171 mov $xi,`4*($j%16)`(%rsp)
172___
173$code.=<<___ if ($i==79);
174 lea $K($xi,$e),$f
175 mov $c,$t0
176 mov $a,$e
177 xor $b,$t0
178 rol \$5,$e
179 xor $d,$t0
180 add $e,$f
181 rol \$30,$b
182 add $t0,$f
183___
184}
185
186sub BODY_40_59 {
187my ($i,$a,$b,$c,$d,$e,$f)=@_;
188my $j=$i+1;
189$code.=<<___;
190 lea 0x8f1bbcdc($xi,$e),$f
191 mov `4*($j%16)`(%rsp),$xi
192 mov $b,$t0
193 mov $b,$t1
194 xor `4*(($j+2)%16)`(%rsp),$xi
195 mov $a,$e
196 and $c,$t0
197 xor `4*(($j+8)%16)`(%rsp),$xi
198 or $c,$t1
199 rol \$5,$e
200 xor `4*(($j+13)%16)`(%rsp),$xi
201 and $d,$t1
202 add $e,$f
203 rol \$1,$xi
204 or $t1,$t0
205 rol \$30,$b
206 mov $xi,`4*($j%16)`(%rsp)
207 add $t0,$f
208___
209}
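# Editor's note, not part of the original module: BODY_40_59 evaluates the
# majority function through the identity Maj(b,c,d) = (b&c)|((b|c)&d),
# one boolean operation cheaper than the textbook (b&c)|(b&d)|(c&d). A
# throwaway randomized self-check (illustrative name, not used above):
sub maj_identity_holds {
	for (1..1000) {
		my ($b,$c,$d) = map { int(rand(2**32)) } 1..3;
		return 0 if ((($b&$c)|($b&$d)|($c&$d)) != (($b&$c)|(($b|$c)&$d)));
	}
	return 1;
}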
210
211$code=".text\n";
212
213&PROLOGUE("sha1_block_data_order");
214$code.=".align 4\n.Lloop:\n";
215for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
216for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
217for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
218for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
219$code.=<<___;
220 add 0($ctx),$E
221 add 4($ctx),$T
222 add 8($ctx),$A
223 add 12($ctx),$B
224 add 16($ctx),$C
225 mov $E,0($ctx)
226 mov $T,4($ctx)
227 mov $A,8($ctx)
228 mov $B,12($ctx)
229 mov $C,16($ctx)
230
231 xchg $E,$A # mov $E,$A
232 xchg $T,$B # mov $T,$B
233 xchg $E,$C # mov $A,$C
234 xchg $T,$D # mov $B,$D
235 # mov $C,$E
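	# (editor's note, not in the original source: each round macro
	# rotates the six names (A,B,C,D,E,T) by one, and 80 mod 6 = 2,
	# so after 80 rounds the values sit two registers away from where
	# the next iteration expects them; the xchg pairs above restore
	# the A..E mapping without touching memory)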
236 lea `16*4`($inp),$inp
237 sub \$1,$num
238 jnz .Lloop
239___
240&EPILOGUE("sha1_block_data_order");
241$code.=<<___;
242.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
243.align 16
244___
245
246# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
247# CONTEXT *context,DISPATCHER_CONTEXT *disp)
248if ($win64) {
249$rec="%rcx";
250$frame="%rdx";
251$context="%r8";
252$disp="%r9";
253
254$code.=<<___;
255.extern __imp_RtlVirtualUnwind
256.type se_handler,\@abi-omnipotent
257.align 16
258se_handler:
259 push %rsi
260 push %rdi
261 push %rbx
262 push %rbp
263 push %r12
264 push %r13
265 push %r14
266 push %r15
267 pushfq
268 sub \$64,%rsp
269
270 mov 120($context),%rax # pull context->Rax
271 mov 248($context),%rbx # pull context->Rip
272
273 lea .Lprologue(%rip),%r10
274 cmp %r10,%rbx # context->Rip<.Lprologue
275 jb .Lin_prologue
276
277 mov 152($context),%rax # pull context->Rsp
278
279 lea .Lepilogue(%rip),%r10
280 cmp %r10,%rbx # context->Rip>=.Lepilogue
281 jae .Lin_prologue
282
283 mov `16*4`(%rax),%rax # pull saved stack pointer
284 lea 24(%rax),%rax
285
286 mov -8(%rax),%rbx
287 mov -16(%rax),%rbp
288 mov -24(%rax),%r12
289 mov %rbx,144($context) # restore context->Rbx
290 mov %rbp,160($context) # restore context->Rbp
291 mov %r12,216($context) # restore context->R12
292
293.Lin_prologue:
294 mov 8(%rax),%rdi
295 mov 16(%rax),%rsi
296 mov %rax,152($context) # restore context->Rsp
297 mov %rsi,168($context) # restore context->Rsi
298 mov %rdi,176($context) # restore context->Rdi
299
300 mov 40($disp),%rdi # disp->ContextRecord
301 mov $context,%rsi # context
302 mov \$154,%ecx # sizeof(CONTEXT)
303 .long 0xa548f3fc # cld; rep movsq
304
305 mov $disp,%rsi
306 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
307 mov 8(%rsi),%rdx # arg2, disp->ImageBase
308 mov 0(%rsi),%r8 # arg3, disp->ControlPc
309 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
310 mov 40(%rsi),%r10 # disp->ContextRecord
311 lea 56(%rsi),%r11 # &disp->HandlerData
312 lea 24(%rsi),%r12 # &disp->EstablisherFrame
313 mov %r10,32(%rsp) # arg5
314 mov %r11,40(%rsp) # arg6
315 mov %r12,48(%rsp) # arg7
316 mov %rcx,56(%rsp) # arg8, (NULL)
317 call *__imp_RtlVirtualUnwind(%rip)
318
319 mov \$1,%eax # ExceptionContinueSearch
320 add \$64,%rsp
321 popfq
322 pop %r15
323 pop %r14
324 pop %r13
325 pop %r12
326 pop %rbp
327 pop %rbx
328 pop %rdi
329 pop %rsi
330 ret
331.size se_handler,.-se_handler
332
333.section .pdata
334.align 4
335 .rva .LSEH_begin_sha1_block_data_order
336 .rva .LSEH_end_sha1_block_data_order
337 .rva .LSEH_info_sha1_block_data_order
338
339.section .xdata
340.align 8
341.LSEH_info_sha1_block_data_order:
342 .byte 9,0,0,0
343 .rva se_handler
344___
345}
346
347####################################################################
348
349$code =~ s/\`([^\`]*)\`/eval $1/gem;
350print $code;
351close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha256-586.pl b/src/lib/libcrypto/sha/asm/sha256-586.pl
deleted file mode 100644
index ecc8b69c75..0000000000
--- a/src/lib/libcrypto/sha/asm/sha256-586.pl
+++ /dev/null
@@ -1,251 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# SHA256 block transform for x86. September 2007.
11#
12# Performance in clock cycles per processed byte (less is better):
13#
14# Pentium PIII P4 AMD K8 Core2
15# gcc 46 36 41 27 26
16# icc 57 33 38 25 23
17# x86 asm 40 30 35 20 20
18# x86_64 asm(*) - - 21 15.8 16.5
19#
20# (*) x86_64 assembler performance is presented for reference
21# purposes.
22#
23# Performance improvement over compiler generated code varies from
24# 10% to 40% [see above]. Not very impressive on some µ-archs, but
25# it's 5 times smaller and optimizes the amount of writes.
26
27$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
28push(@INC,"${dir}","${dir}../../perlasm");
29require "x86asm.pl";
30
31&asm_init($ARGV[0],"sha256-586.pl",$ARGV[$#ARGV] eq "386");
32
33$A="eax";
34$E="edx";
35$T="ebx";
36$Aoff=&DWP(0,"esp");
37$Boff=&DWP(4,"esp");
38$Coff=&DWP(8,"esp");
39$Doff=&DWP(12,"esp");
40$Eoff=&DWP(16,"esp");
41$Foff=&DWP(20,"esp");
42$Goff=&DWP(24,"esp");
43$Hoff=&DWP(28,"esp");
44$Xoff=&DWP(32,"esp");
45$K256="ebp";
46
47sub BODY_00_15() {
48 my $in_16_63=shift;
49
50 &mov ("ecx",$E);
51 &add ($T,&DWP(4*(8+15+16-9),"esp")) if ($in_16_63); # T += X[-7]
52 &ror ("ecx",6);
53 &mov ("edi",$E);
54 &ror ("edi",11);
55 &mov ("esi",$Foff);
56 &xor ("ecx","edi");
57 &ror ("edi",25-11);
58 &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0]
59 &xor ("ecx","edi"); # Sigma1(e)
60 &mov ("edi",$Goff);
61 &add ($T,"ecx"); # T += Sigma1(e)
62 &mov ($Eoff,$E); # modulo-scheduled
63
64 &xor ("esi","edi");
65 &mov ("ecx",$A);
66 &and ("esi",$E);
67 &mov ($E,$Doff); # e becomes d, which is e in next iteration
68 &xor ("esi","edi"); # Ch(e,f,g)
69 &mov ("edi",$A);
70 &add ($T,"esi"); # T += Ch(e,f,g)
71
72 &ror ("ecx",2);
73 &add ($T,$Hoff); # T += h
74 &ror ("edi",13);
75 &mov ("esi",$Boff);
76 &xor ("ecx","edi");
77 &ror ("edi",22-13);
78 &add ($E,$T); # d += T
79 &xor ("ecx","edi"); # Sigma0(a)
80 &mov ("edi",$Coff);
81
82 &add ($T,"ecx"); # T += Sigma0(a)
83 &mov ($Aoff,$A); # modulo-scheduled
84
85 &mov ("ecx",$A);
86 &sub ("esp",4);
87 &or ($A,"esi"); # a becomes h, which is a in next iteration
88 &and ("ecx","esi");
89 &and ($A,"edi");
90 &mov ("esi",&DWP(0,$K256));
91 &or ($A,"ecx"); # h=Maj(a,b,c)
92
93 &add ($K256,4);
94 &add ($A,$T); # h += T
95 &mov ($T,&DWP(4*(8+15+16-1),"esp")) if ($in_16_63); # preload T
96 &add ($E,"esi"); # d += K256[i]
97 &add ($A,"esi"); # h += K256[i]
98}
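# Editor's sketch, not part of the original module: a plain-Perl reference
# for the heavily modulo-scheduled round above, straight from FIPS 180-2.
# ror32() and sha256_round_ref() are illustrative helpers, 32-bit throughout.
sub ror32 { my ($x,$n) = @_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff; }
sub sha256_round_ref {
	my ($a,$b,$c,$d,$e,$f,$g,$h,$k,$w) = @_;
	my $S1 = ror32($e,6)^ror32($e,11)^ror32($e,25);	# Sigma1(e)
	my $ch = (($f^$g)&$e)^$g;			# Ch(e,f,g)
	my $T1 = ($h+$S1+$ch+$k+$w) & 0xffffffff;
	my $S0 = ror32($a,2)^ror32($a,13)^ror32($a,22);	# Sigma0(a)
	my $mj = ($a&$b)|(($a|$b)&$c);			# Maj(a,b,c)
	return ((($T1+$S0+$mj)&0xffffffff),$a,$b,$c,(($d+$T1)&0xffffffff),$e,$f,$g);
}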
99
100&function_begin("sha256_block_data_order");
101 &mov ("esi",wparam(0)); # ctx
102 &mov ("edi",wparam(1)); # inp
103 &mov ("eax",wparam(2)); # num
104 &mov ("ebx","esp"); # saved sp
105
106 &call (&label("pic_point")); # make it PIC!
107&set_label("pic_point");
108 &blindpop($K256);
109 &lea ($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));
110
111 &sub ("esp",16);
112 &and ("esp",-64);
113
114 &shl ("eax",6);
115 &add ("eax","edi");
116 &mov (&DWP(0,"esp"),"esi"); # ctx
117 &mov (&DWP(4,"esp"),"edi"); # inp
118 &mov (&DWP(8,"esp"),"eax"); # inp+num*128
119 &mov (&DWP(12,"esp"),"ebx"); # saved sp
120
121&set_label("loop",16);
122 # copy input block to stack reversing byte and dword order
123 for($i=0;$i<4;$i++) {
124 &mov ("eax",&DWP($i*16+0,"edi"));
125 &mov ("ebx",&DWP($i*16+4,"edi"));
126 &mov ("ecx",&DWP($i*16+8,"edi"));
127 &mov ("edx",&DWP($i*16+12,"edi"));
128 &bswap ("eax");
129 &bswap ("ebx");
130 &bswap ("ecx");
131 &bswap ("edx");
132 &push ("eax");
133 &push ("ebx");
134 &push ("ecx");
135 &push ("edx");
136 }
137 &add ("edi",64);
138 &sub ("esp",4*8); # place for A,B,C,D,E,F,G,H
139 &mov (&DWP(4*(8+16)+4,"esp"),"edi");
140
141 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
142 &mov ($A,&DWP(0,"esi"));
143 &mov ("ebx",&DWP(4,"esi"));
144 &mov ("ecx",&DWP(8,"esi"));
145 &mov ("edi",&DWP(12,"esi"));
146 # &mov ($Aoff,$A);
147 &mov ($Boff,"ebx");
148 &mov ($Coff,"ecx");
149 &mov ($Doff,"edi");
150 &mov ($E,&DWP(16,"esi"));
151 &mov ("ebx",&DWP(20,"esi"));
152 &mov ("ecx",&DWP(24,"esi"));
153 &mov ("edi",&DWP(28,"esi"));
154 # &mov ($Eoff,$E);
155 &mov ($Foff,"ebx");
156 &mov ($Goff,"ecx");
157 &mov ($Hoff,"edi");
158
159&set_label("00_15",16);
160 &mov ($T,&DWP(4*(8+15),"esp"));
161
162 &BODY_00_15();
163
164 &cmp ("esi",0xc19bf174);
165 &jne (&label("00_15"));
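	# (editor's note, not in the original source: no round counter is
	# kept -- "esi" holds the K256[i] word just consumed, so comparing
	# against K256[15]=0xc19bf174 here and K256[63]=0xc67178f2 below
	# terminates the two loops)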
166
167 &mov ($T,&DWP(4*(8+15+16-1),"esp")); # preloaded in BODY_00_15(1)
168&set_label("16_63",16);
169 &mov ("esi",$T);
170 &mov ("ecx",&DWP(4*(8+15+16-14),"esp"));
171 &shr ($T,3);
172 &ror ("esi",7);
173 &xor ($T,"esi");
174 &ror ("esi",18-7);
175 &mov ("edi","ecx");
176 &xor ($T,"esi"); # T = sigma0(X[-15])
177
178 &shr ("ecx",10);
179 &mov ("esi",&DWP(4*(8+15+16),"esp"));
180 &ror ("edi",17);
181 &xor ("ecx","edi");
182 &ror ("edi",19-17);
183 &add ($T,"esi"); # T += X[-16]
184 &xor ("edi","ecx"); # sigma1(X[-2])
185
186 &add ($T,"edi"); # T += sigma1(X[-2])
187 # &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7], moved to BODY_00_15(1)
188 # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0]
189
190 &BODY_00_15(1);
191
192 &cmp ("esi",0xc67178f2);
193 &jne (&label("16_63"));
194
195 &mov ("esi",&DWP(4*(8+16+64)+0,"esp"));#ctx
196 # &mov ($A,$Aoff);
197 &mov ("ebx",$Boff);
198 &mov ("ecx",$Coff);
199 &mov ("edi",$Doff);
200 &add ($A,&DWP(0,"esi"));
201 &add ("ebx",&DWP(4,"esi"));
202 &add ("ecx",&DWP(8,"esi"));
203 &add ("edi",&DWP(12,"esi"));
204 &mov (&DWP(0,"esi"),$A);
205 &mov (&DWP(4,"esi"),"ebx");
206 &mov (&DWP(8,"esi"),"ecx");
207 &mov (&DWP(12,"esi"),"edi");
208 # &mov ($E,$Eoff);
209 &mov ("eax",$Foff);
210 &mov ("ebx",$Goff);
211 &mov ("ecx",$Hoff);
212 &mov ("edi",&DWP(4*(8+16+64)+4,"esp"));#inp
213 &add ($E,&DWP(16,"esi"));
214 &add ("eax",&DWP(20,"esi"));
215 &add ("ebx",&DWP(24,"esi"));
216 &add ("ecx",&DWP(28,"esi"));
217 &mov (&DWP(16,"esi"),$E);
218 &mov (&DWP(20,"esi"),"eax");
219 &mov (&DWP(24,"esi"),"ebx");
220 &mov (&DWP(28,"esi"),"ecx");
221
222 &add ("esp",4*(8+16+64)); # destroy frame
223 &sub ($K256,4*64); # rewind K
224
225 &cmp ("edi",&DWP(8,"esp")); # are we done yet?
226 &jb (&label("loop"));
227
228 &mov ("esp",&DWP(12,"esp")); # restore sp
229&function_end_A();
230
231&set_label("K256",64); # Yes! I keep it in the code segment!
232 &data_word(0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5);
233 &data_word(0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5);
234 &data_word(0xd807aa98,0x12835b01,0x243185be,0x550c7dc3);
235 &data_word(0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174);
236 &data_word(0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc);
237 &data_word(0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da);
238 &data_word(0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7);
239 &data_word(0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967);
240 &data_word(0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13);
241 &data_word(0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85);
242 &data_word(0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3);
243 &data_word(0xd192e819,0xd6990624,0xf40e3585,0x106aa070);
244 &data_word(0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5);
245 &data_word(0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3);
246 &data_word(0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208);
247 &data_word(0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2);
248&function_end_B("sha256_block_data_order");
249&asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
250
251&asm_finish();
diff --git a/src/lib/libcrypto/sha/asm/sha256-armv4.pl b/src/lib/libcrypto/sha/asm/sha256-armv4.pl
deleted file mode 100644
index 492cb62bc0..0000000000
--- a/src/lib/libcrypto/sha/asm/sha256-armv4.pl
+++ /dev/null
@@ -1,186 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256 block procedure for ARMv4. May 2007.
11
12# Performance is ~2x better than gcc 3.4 generated code and in "abso-
13# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
14# byte [on single-issue Xscale PXA250 core].
15
16# July 2010.
17#
18# Rescheduling for dual-issue pipeline resulted in 22% improvement on
19# Cortex A8 core and ~20 cycles per processed byte.
20
21while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
22open STDOUT,">$output";
23
24$ctx="r0"; $t0="r0";
25$inp="r1";
26$len="r2"; $t1="r2";
27$T1="r3";
28$A="r4";
29$B="r5";
30$C="r6";
31$D="r7";
32$E="r8";
33$F="r9";
34$G="r10";
35$H="r11";
36@V=($A,$B,$C,$D,$E,$F,$G,$H);
37$t2="r12";
38$Ktbl="r14";
39
40@Sigma0=( 2,13,22);
41@Sigma1=( 6,11,25);
42@sigma0=( 7,18, 3);
43@sigma1=(17,19,10);
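# Editor's note, not part of the original module: these are the FIPS 180-2
# rotate/shift amounts, so "$e,ror#$Sigma1[0]" below expands to "$e,ror#6"
# and the eor chain builds Sigma1(e) = ROR(e,6)^ROR(e,11)^ROR(e,25); the
# last @sigma0/@sigma1 entry is applied as a logical shift (lsr), matching
# sigma0(x) = ROR(x,7)^ROR(x,18)^(x>>3) and sigma1(x) = ROR(x,17)^ROR(x,19)^(x>>10).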
44
45sub BODY_00_15 {
46my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
47
48$code.=<<___ if ($i<16);
49 ldrb $T1,[$inp,#3] @ $i
50 ldrb $t2,[$inp,#2]
51 ldrb $t1,[$inp,#1]
52 ldrb $t0,[$inp],#4
53 orr $T1,$T1,$t2,lsl#8
54 orr $T1,$T1,$t1,lsl#16
55 orr $T1,$T1,$t0,lsl#24
56 `"str $inp,[sp,#17*4]" if ($i==15)`
57___
58$code.=<<___;
59 ldr $t2,[$Ktbl],#4 @ *K256++
60 mov $t0,$e,ror#$Sigma1[0]
61 str $T1,[sp,#`$i%16`*4]
62 eor $t0,$t0,$e,ror#$Sigma1[1]
63 eor $t1,$f,$g
64 eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
65 and $t1,$t1,$e
66 add $T1,$T1,$t0
67 eor $t1,$t1,$g @ Ch(e,f,g)
68 add $T1,$T1,$h
69 mov $h,$a,ror#$Sigma0[0]
70 add $T1,$T1,$t1
71 eor $h,$h,$a,ror#$Sigma0[1]
72 add $T1,$T1,$t2
73 eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
74 orr $t0,$a,$b
75 and $t1,$a,$b
76 and $t0,$t0,$c
77 add $h,$h,$T1
78 orr $t0,$t0,$t1 @ Maj(a,b,c)
79 add $d,$d,$T1
80 add $h,$h,$t0
81___
82}
83
84sub BODY_16_XX {
85my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
86
87$code.=<<___;
88 ldr $t1,[sp,#`($i+1)%16`*4] @ $i
89 ldr $t2,[sp,#`($i+14)%16`*4]
90 ldr $T1,[sp,#`($i+0)%16`*4]
91 mov $t0,$t1,ror#$sigma0[0]
92 ldr $inp,[sp,#`($i+9)%16`*4]
93 eor $t0,$t0,$t1,ror#$sigma0[1]
94 eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
95 mov $t1,$t2,ror#$sigma1[0]
96 add $T1,$T1,$t0
97 eor $t1,$t1,$t2,ror#$sigma1[1]
98 add $T1,$T1,$inp
99 eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
100 add $T1,$T1,$t1
101___
102 &BODY_00_15(@_);
103}
104
105$code=<<___;
106.text
107.code 32
108
109.type K256,%object
110.align 5
111K256:
112.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
113.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
114.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
115.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
116.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
117.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
118.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
119.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
120.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
121.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
122.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
123.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
124.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
125.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
126.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
127.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
128.size K256,.-K256
129
130.global sha256_block_data_order
131.type sha256_block_data_order,%function
132sha256_block_data_order:
133 sub r3,pc,#8 @ sha256_block_data_order
134 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
135 stmdb sp!,{$ctx,$inp,$len,r4-r12,lr}
136 ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
137 sub $Ktbl,r3,#256 @ K256
138 sub sp,sp,#16*4 @ alloca(X[16])
139.Loop:
140___
141for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
142$code.=".Lrounds_16_xx:\n";
143for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
144$code.=<<___;
145 and $t2,$t2,#0xff
146 cmp $t2,#0xf2
147 bne .Lrounds_16_xx
148
149 ldr $T1,[sp,#16*4] @ pull ctx
150 ldr $t0,[$T1,#0]
151 ldr $t1,[$T1,#4]
152 ldr $t2,[$T1,#8]
153 add $A,$A,$t0
154 ldr $t0,[$T1,#12]
155 add $B,$B,$t1
156 ldr $t1,[$T1,#16]
157 add $C,$C,$t2
158 ldr $t2,[$T1,#20]
159 add $D,$D,$t0
160 ldr $t0,[$T1,#24]
161 add $E,$E,$t1
162 ldr $t1,[$T1,#28]
163 add $F,$F,$t2
164 ldr $inp,[sp,#17*4] @ pull inp
165 ldr $t2,[sp,#18*4] @ pull inp+len
166 add $G,$G,$t0
167 add $H,$H,$t1
168 stmia $T1,{$A,$B,$C,$D,$E,$F,$G,$H}
169 cmp $inp,$t2
170 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
171 bne .Loop
172
173 add sp,sp,#`16+3`*4 @ destroy frame
174 ldmia sp!,{r4-r12,lr}
175 tst lr,#1
176 moveq pc,lr @ be binary compatible with V4, yet
177 bx lr @ interoperable with Thumb ISA:-)
178.size sha256_block_data_order,.-sha256_block_data_order
179.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
180.align 2
181___
182
183$code =~ s/\`([^\`]*)\`/eval $1/gem;
184$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
185print $code;
186close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/sha/asm/sha512-586.pl b/src/lib/libcrypto/sha/asm/sha512-586.pl
deleted file mode 100644
index 5b9f3337ad..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-586.pl
+++ /dev/null
@@ -1,644 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# SHA512 block transform for x86. September 2007.
11#
12# Performance in clock cycles per processed byte (less is better):
13#
14# Pentium PIII P4 AMD K8 Core2
15# gcc 100 75 116 54 66
16# icc 97 77 95 55 57
17# x86 asm 61 56 82 36 40
18# SSE2 asm - - 38 24 20
19# x86_64 asm(*) - - 30 10.0 10.5
20#
21# (*) x86_64 assembler performance is presented for reference
22# purposes.
23#
24# The IALU code-path is optimized for older Pentiums. On vanilla Pentium
25# performance improvement over compiler generated code reaches ~60%,
26# while on PIII - ~35%. On newer µ-archs improvement varies from 15%
27# to 50%, but it's less important as they are expected to execute SSE2
28# code-path, which is commonly ~2-3x faster [than compiler generated
29# code]. SSE2 code-path is as fast as original sha512-sse2.pl, even
30# though it does not use 128-bit operations. The latter means that
31# SSE2-aware kernel is no longer required to execute the code. Another
32# difference is that new code optimizes amount of writes, but at the
33# cost of increased data cache "footprint" by 1/2KB.
34
35$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
36push(@INC,"${dir}","${dir}../../perlasm");
37require "x86asm.pl";
38
39&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
40
41$sse2=0;
42for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
43
44&external_label("OPENSSL_ia32cap_P") if ($sse2);
45
46$Tlo=&DWP(0,"esp"); $Thi=&DWP(4,"esp");
47$Alo=&DWP(8,"esp"); $Ahi=&DWP(8+4,"esp");
48$Blo=&DWP(16,"esp"); $Bhi=&DWP(16+4,"esp");
49$Clo=&DWP(24,"esp"); $Chi=&DWP(24+4,"esp");
50$Dlo=&DWP(32,"esp"); $Dhi=&DWP(32+4,"esp");
51$Elo=&DWP(40,"esp"); $Ehi=&DWP(40+4,"esp");
52$Flo=&DWP(48,"esp"); $Fhi=&DWP(48+4,"esp");
53$Glo=&DWP(56,"esp"); $Ghi=&DWP(56+4,"esp");
54$Hlo=&DWP(64,"esp"); $Hhi=&DWP(64+4,"esp");
55$K512="ebp";
56
57$Asse2=&QWP(0,"esp");
58$Bsse2=&QWP(8,"esp");
59$Csse2=&QWP(16,"esp");
60$Dsse2=&QWP(24,"esp");
61$Esse2=&QWP(32,"esp");
62$Fsse2=&QWP(40,"esp");
63$Gsse2=&QWP(48,"esp");
64$Hsse2=&QWP(56,"esp");
65
66$A="mm0"; # B-D and
67$E="mm4"; # F-H are commonly loaded to respectively mm1-mm3 and
68 # mm5-mm7, but it's done on on-demand basis...
69
70sub BODY_00_15_sse2 {
71 my $prefetch=shift;
72
73 &movq ("mm5",$Fsse2); # load f
74 &movq ("mm6",$Gsse2); # load g
75 &movq ("mm7",$Hsse2); # load h
76
77 &movq ("mm1",$E); # %mm1 is sliding right
78 &movq ("mm2",$E); # %mm2 is sliding left
79 &psrlq ("mm1",14);
80 &movq ($Esse2,$E); # modulo-scheduled save e
81 &psllq ("mm2",23);
82 &movq ("mm3","mm1"); # %mm3 is T1
83 &psrlq ("mm1",4);
84 &pxor ("mm3","mm2");
85 &psllq ("mm2",23);
86 &pxor ("mm3","mm1");
87 &psrlq ("mm1",23);
88 &pxor ("mm3","mm2");
89 &psllq ("mm2",4);
90 &pxor ("mm3","mm1");
91 &paddq ("mm7",QWP(0,$K512)); # h+=K512[i]
92 &pxor ("mm3","mm2"); # T1=Sigma1_512(e)
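	# (editor's note, not in the original source: the incremental psrlq
	# amounts 14,+4,+23 visit 14/18/41 and the psllq amounts 23,+23,+4
	# visit 23/46/50 = 64-41/64-18/64-14, so the six terms XORed into
	# %mm3 are exactly ROTR(e,14)^ROTR(e,18)^ROTR(e,41), with
	# ROTR(x,n) = (x>>n)|(x<<(64-n)))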
93
94 &pxor ("mm5","mm6"); # f^=g
95 &movq ("mm1",$Bsse2); # load b
96 &pand ("mm5",$E); # f&=e
97 &movq ("mm2",$Csse2); # load c
98 &pxor ("mm5","mm6"); # f^=g
99 &movq ($E,$Dsse2); # e = load d
100 &paddq ("mm3","mm5"); # T1+=Ch(e,f,g)
101 &movq (&QWP(0,"esp"),$A); # modulo-scheduled save a
102 &paddq ("mm3","mm7"); # T1+=h
103
104 &movq ("mm5",$A); # %mm5 is sliding right
105 &movq ("mm6",$A); # %mm6 is sliding left
106 &paddq ("mm3",&QWP(8*9,"esp")); # T1+=X[0]
107 &psrlq ("mm5",28);
108 &paddq ($E,"mm3"); # e += T1
109 &psllq ("mm6",25);
110 &movq ("mm7","mm5"); # %mm7 is T2
111 &psrlq ("mm5",6);
112 &pxor ("mm7","mm6");
113 &psllq ("mm6",5);
114 &pxor ("mm7","mm5");
115 &psrlq ("mm5",5);
116 &pxor ("mm7","mm6");
117 &psllq ("mm6",6);
118 &pxor ("mm7","mm5");
119 &sub ("esp",8);
120 &pxor ("mm7","mm6"); # T2=Sigma0_512(a)
121
122 &movq ("mm5",$A); # %mm5=a
123 &por ($A,"mm2"); # a=a|c
124 &movq ("mm6",&QWP(8*(9+16-14),"esp")) if ($prefetch);
125 &pand ("mm5","mm2"); # %mm5=a&c
126 &pand ($A,"mm1"); # a=(a|c)&b
127 &movq ("mm2",&QWP(8*(9+16-1),"esp")) if ($prefetch);
128 &por ("mm5",$A); # %mm5=(a&c)|((a|c)&b)
129 &paddq ("mm7","mm5"); # T2+=Maj(a,b,c)
130 &movq ($A,"mm3"); # a=T1
131
132 &mov (&LB("edx"),&BP(0,$K512));
133 &paddq ($A,"mm7"); # a+=T2
134 &add ($K512,8);
135}
136
137sub BODY_00_15_x86 {
138 #define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
139 # LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
140 # HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
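	#
	# (editor's derivation, not in the original source: for 64-bit x
	# kept as 32-bit halves lo/hi, ROTR(x,r) splits per half as
	#	r<32: LO = lo>>r ^ hi<<(32-r),      HI = hi>>r ^ lo<<(32-r)
	#	r>32: LO = hi>>(r-32) ^ lo<<(64-r), HI = lo>>(r-32) ^ hi<<(64-r)
	# e.g. r=41 gives LO = hi>>9 ^ lo<<23, the third term above)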
141 &mov ("ecx",$Elo);
142 &mov ("edx",$Ehi);
143 &mov ("esi","ecx");
144
145 &shr ("ecx",9); # lo>>9
146 &mov ("edi","edx");
147 &shr ("edx",9); # hi>>9
148 &mov ("ebx","ecx");
149 &shl ("esi",14); # lo<<14
150 &mov ("eax","edx");
151 &shl ("edi",14); # hi<<14
152 &xor ("ebx","esi");
153
154 &shr ("ecx",14-9); # lo>>14
155 &xor ("eax","edi");
156 &shr ("edx",14-9); # hi>>14
157 &xor ("eax","ecx");
158 &shl ("esi",18-14); # lo<<18
159 &xor ("ebx","edx");
160 &shl ("edi",18-14); # hi<<18
161 &xor ("ebx","esi");
162
163 &shr ("ecx",18-14); # lo>>18
164 &xor ("eax","edi");
165 &shr ("edx",18-14); # hi>>18
166 &xor ("eax","ecx");
167 &shl ("esi",23-18); # lo<<23
168 &xor ("ebx","edx");
169 &shl ("edi",23-18); # hi<<23
170 &xor ("eax","esi");
171 &xor ("ebx","edi"); # T1 = Sigma1(e)
172
173 &mov ("ecx",$Flo);
174 &mov ("edx",$Fhi);
175 &mov ("esi",$Glo);
176 &mov ("edi",$Ghi);
177 &add ("eax",$Hlo);
178 &adc ("ebx",$Hhi); # T1 += h
179 &xor ("ecx","esi");
180 &xor ("edx","edi");
181 &and ("ecx",$Elo);
182 &and ("edx",$Ehi);
183 &add ("eax",&DWP(8*(9+15)+0,"esp"));
184 &adc ("ebx",&DWP(8*(9+15)+4,"esp")); # T1 += X[0]
185 &xor ("ecx","esi");
186 &xor ("edx","edi"); # Ch(e,f,g) = ((f^g)&e)^g
187
188 &mov ("esi",&DWP(0,$K512));
189 &mov ("edi",&DWP(4,$K512)); # K[i]
190 &add ("eax","ecx");
191 &adc ("ebx","edx"); # T1 += Ch(e,f,g)
192 &mov ("ecx",$Dlo);
193 &mov ("edx",$Dhi);
194 &add ("eax","esi");
195 &adc ("ebx","edi"); # T1 += K[i]
196 &mov ($Tlo,"eax");
197 &mov ($Thi,"ebx"); # put T1 away
198 &add ("eax","ecx");
199 &adc ("ebx","edx"); # d += T1
200
201 #define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
202 # LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
203 # HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
204 &mov ("ecx",$Alo);
205 &mov ("edx",$Ahi);
206 &mov ($Dlo,"eax");
207 &mov ($Dhi,"ebx");
208 &mov ("esi","ecx");
209
210 &shr ("ecx",2); # lo>>2
211 &mov ("edi","edx");
212 &shr ("edx",2); # hi>>2
213 &mov ("ebx","ecx");
214 &shl ("esi",4); # lo<<4
215 &mov ("eax","edx");
216 &shl ("edi",4); # hi<<4
217 &xor ("ebx","esi");
218
219 &shr ("ecx",7-2); # lo>>7
220 &xor ("eax","edi");
221 &shr ("edx",7-2); # hi>>7
222 &xor ("ebx","ecx");
223 &shl ("esi",25-4); # lo<<25
224 &xor ("eax","edx");
225 &shl ("edi",25-4); # hi<<25
226 &xor ("eax","esi");
227
228 &shr ("ecx",28-7); # lo>>28
229 &xor ("ebx","edi");
230 &shr ("edx",28-7); # hi>>28
231 &xor ("eax","ecx");
232 &shl ("esi",30-25); # lo<<30
233 &xor ("ebx","edx");
234 &shl ("edi",30-25); # hi<<30
235 &xor ("eax","esi");
236 &xor ("ebx","edi"); # Sigma0(a)
237
238 &mov ("ecx",$Alo);
239 &mov ("edx",$Ahi);
240 &mov ("esi",$Blo);
241 &mov ("edi",$Bhi);
242 &add ("eax",$Tlo);
243 &adc ("ebx",$Thi); # T1 = Sigma0(a)+T1
244 &or ("ecx","esi");
245 &or ("edx","edi");
246 &and ("ecx",$Clo);
247 &and ("edx",$Chi);
248 &and ("esi",$Alo);
249 &and ("edi",$Ahi);
250 &or ("ecx","esi");
251 &or ("edx","edi"); # Maj(a,b,c) = ((a|b)&c)|(a&b)
252
253 &add ("eax","ecx");
254 &adc ("ebx","edx"); # T1 += Maj(a,b,c)
255 &mov ($Tlo,"eax");
256 &mov ($Thi,"ebx");
257
258 &mov (&LB("edx"),&BP(0,$K512)); # pre-fetch LSB of *K
259 &sub ("esp",8);
260 &lea ($K512,&DWP(8,$K512)); # K++
261}
262
263
264&function_begin("sha512_block_data_order");
265 &mov ("esi",wparam(0)); # ctx
266 &mov ("edi",wparam(1)); # inp
267 &mov ("eax",wparam(2)); # num
268 &mov ("ebx","esp"); # saved sp
269
270 &call (&label("pic_point")); # make it PIC!
271&set_label("pic_point");
272 &blindpop($K512);
273 &lea ($K512,&DWP(&label("K512")."-".&label("pic_point"),$K512));
274
275 &sub ("esp",16);
276 &and ("esp",-64);
277
278 &shl ("eax",7);
279 &add ("eax","edi");
280 &mov (&DWP(0,"esp"),"esi"); # ctx
281 &mov (&DWP(4,"esp"),"edi"); # inp
282 &mov (&DWP(8,"esp"),"eax"); # inp+num*128
283 &mov (&DWP(12,"esp"),"ebx"); # saved sp
284
285if ($sse2) {
286 &picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("K512"));
287 &bt (&DWP(0,"edx"),26);
288 &jnc (&label("loop_x86"));
289
290 # load ctx->h[0-7]
291 &movq ($A,&QWP(0,"esi"));
292 &movq ("mm1",&QWP(8,"esi"));
293 &movq ("mm2",&QWP(16,"esi"));
294 &movq ("mm3",&QWP(24,"esi"));
295 &movq ($E,&QWP(32,"esi"));
296 &movq ("mm5",&QWP(40,"esi"));
297 &movq ("mm6",&QWP(48,"esi"));
298 &movq ("mm7",&QWP(56,"esi"));
299 &sub ("esp",8*10);
300
301&set_label("loop_sse2",16);
302 # &movq ($Asse2,$A);
303 &movq ($Bsse2,"mm1");
304 &movq ($Csse2,"mm2");
305 &movq ($Dsse2,"mm3");
306 # &movq ($Esse2,$E);
307 &movq ($Fsse2,"mm5");
308 &movq ($Gsse2,"mm6");
309 &movq ($Hsse2,"mm7");
310
311 &mov ("ecx",&DWP(0,"edi"));
312 &mov ("edx",&DWP(4,"edi"));
313 &add ("edi",8);
314 &bswap ("ecx");
315 &bswap ("edx");
316 &mov (&DWP(8*9+4,"esp"),"ecx");
317 &mov (&DWP(8*9+0,"esp"),"edx");
318
319&set_label("00_14_sse2",16);
320 &mov ("eax",&DWP(0,"edi"));
321 &mov ("ebx",&DWP(4,"edi"));
322 &add ("edi",8);
323 &bswap ("eax");
324 &bswap ("ebx");
325 &mov (&DWP(8*8+4,"esp"),"eax");
326 &mov (&DWP(8*8+0,"esp"),"ebx");
327
328 &BODY_00_15_sse2();
329
330 &cmp (&LB("edx"),0x35);
331 &jne (&label("00_14_sse2"));
332
333 &BODY_00_15_sse2(1);
334
335&set_label("16_79_sse2",16);
336 #&movq ("mm2",&QWP(8*(9+16-1),"esp")); #prefetched in BODY_00_15
337 #&movq ("mm6",&QWP(8*(9+16-14),"esp"));
338 &movq ("mm1","mm2");
339
340 &psrlq ("mm2",1);
341 &movq ("mm7","mm6");
342 &psrlq ("mm6",6);
343 &movq ("mm3","mm2");
344
345 &psrlq ("mm2",7-1);
346 &movq ("mm5","mm6");
347 &psrlq ("mm6",19-6);
348 &pxor ("mm3","mm2");
349
350 &psrlq ("mm2",8-7);
351 &pxor ("mm5","mm6");
352 &psrlq ("mm6",61-19);
353 &pxor ("mm3","mm2");
354
355 &movq ("mm2",&QWP(8*(9+16),"esp"));
356
357 &psllq ("mm1",56);
358 &pxor ("mm5","mm6");
359 &psllq ("mm7",3);
360 &pxor ("mm3","mm1");
361
362 &paddq ("mm2",&QWP(8*(9+16-9),"esp"));
363
364 &psllq ("mm1",63-56);
365 &pxor ("mm5","mm7");
366 &psllq ("mm7",45-3);
367 &pxor ("mm3","mm1");
368 &pxor ("mm5","mm7");
369
370 &paddq ("mm3","mm5");
371 &paddq ("mm3","mm2");
372 &movq (&QWP(8*9,"esp"),"mm3");
373
374 &BODY_00_15_sse2(1);
375
376 &cmp (&LB("edx"),0x17);
377 &jne (&label("16_79_sse2"));
378
379 # &movq ($A,$Asse2);
380 &movq ("mm1",$Bsse2);
381 &movq ("mm2",$Csse2);
382 &movq ("mm3",$Dsse2);
383 # &movq ($E,$Esse2);
384 &movq ("mm5",$Fsse2);
385 &movq ("mm6",$Gsse2);
386 &movq ("mm7",$Hsse2);
387
388 &paddq ($A,&QWP(0,"esi"));
389 &paddq ("mm1",&QWP(8,"esi"));
390 &paddq ("mm2",&QWP(16,"esi"));
391 &paddq ("mm3",&QWP(24,"esi"));
392 &paddq ($E,&QWP(32,"esi"));
393 &paddq ("mm5",&QWP(40,"esi"));
394 &paddq ("mm6",&QWP(48,"esi"));
395 &paddq ("mm7",&QWP(56,"esi"));
396
397 &movq (&QWP(0,"esi"),$A);
398 &movq (&QWP(8,"esi"),"mm1");
399 &movq (&QWP(16,"esi"),"mm2");
400 &movq (&QWP(24,"esi"),"mm3");
401 &movq (&QWP(32,"esi"),$E);
402 &movq (&QWP(40,"esi"),"mm5");
403 &movq (&QWP(48,"esi"),"mm6");
404 &movq (&QWP(56,"esi"),"mm7");
405
406 &add ("esp",8*80); # destroy frame
407 &sub ($K512,8*80); # rewind K
408
409 &cmp ("edi",&DWP(8*10+8,"esp")); # are we done yet?
410 &jb (&label("loop_sse2"));
411
412 &emms ();
413 &mov ("esp",&DWP(8*10+12,"esp")); # restore sp
414&function_end_A();
415}
416&set_label("loop_x86",16);
417 # copy input block to stack reversing byte and qword order
418 for ($i=0;$i<8;$i++) {
419 &mov ("eax",&DWP($i*16+0,"edi"));
420 &mov ("ebx",&DWP($i*16+4,"edi"));
421 &mov ("ecx",&DWP($i*16+8,"edi"));
422 &mov ("edx",&DWP($i*16+12,"edi"));
423 &bswap ("eax");
424 &bswap ("ebx");
425 &bswap ("ecx");
426 &bswap ("edx");
427 &push ("eax");
428 &push ("ebx");
429 &push ("ecx");
430 &push ("edx");
431 }
432 &add ("edi",128);
433 &sub ("esp",9*8); # place for T,A,B,C,D,E,F,G,H
434 &mov (&DWP(8*(9+16)+4,"esp"),"edi");
435
436 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
437 &lea ("edi",&DWP(8,"esp"));
438 &mov ("ecx",16);
439 &data_word(0xA5F3F689); # rep movsd
440
441&set_label("00_15_x86",16);
442 &BODY_00_15_x86();
443
444 &cmp (&LB("edx"),0x94);
445 &jne (&label("00_15_x86"));
446
447&set_label("16_79_x86",16);
448 #define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
449 # LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
450 # HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
451 &mov ("ecx",&DWP(8*(9+15+16-1)+0,"esp"));
452 &mov ("edx",&DWP(8*(9+15+16-1)+4,"esp"));
453 &mov ("esi","ecx");
454
455 &shr ("ecx",1); # lo>>1
456 &mov ("edi","edx");
457 &shr ("edx",1); # hi>>1
458 &mov ("eax","ecx");
459 &shl ("esi",24); # lo<<24
460 &mov ("ebx","edx");
461 &shl ("edi",24); # hi<<24
462 &xor ("ebx","esi");
463
464 &shr ("ecx",7-1); # lo>>7
465 &xor ("eax","edi");
466 &shr ("edx",7-1); # hi>>7
467 &xor ("eax","ecx");
468 &shl ("esi",31-24); # lo<<31
469 &xor ("ebx","edx");
470 &shl ("edi",25-24); # hi<<25
471 &xor ("ebx","esi");
472
473 &shr ("ecx",8-7); # lo>>8
474 &xor ("eax","edi");
475 &shr ("edx",8-7); # hi>>8
476 &xor ("eax","ecx");
477 &shl ("edi",31-25); # hi<<31
478 &xor ("ebx","edx");
479 &xor ("eax","edi"); # T1 = sigma0(X[-15])
480
481 &mov (&DWP(0,"esp"),"eax");
482 &mov (&DWP(4,"esp"),"ebx"); # put T1 away
483
484 #define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
485 # LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
486 # HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
487 &mov ("ecx",&DWP(8*(9+15+16-14)+0,"esp"));
488 &mov ("edx",&DWP(8*(9+15+16-14)+4,"esp"));
489 &mov ("esi","ecx");
490
491 &shr ("ecx",6); # lo>>6
492 &mov ("edi","edx");
493 &shr ("edx",6); # hi>>6
494 &mov ("eax","ecx");
495 &shl ("esi",3); # lo<<3
496 &mov ("ebx","edx");
497 &shl ("edi",3); # hi<<3
498 &xor ("eax","esi");
499
500 &shr ("ecx",19-6); # lo>>19
501 &xor ("ebx","edi");
502 &shr ("edx",19-6); # hi>>19
503 &xor ("eax","ecx");
504 &shl ("esi",13-3); # lo<<13
505 &xor ("ebx","edx");
506 &shl ("edi",13-3); # hi<<13
507 &xor ("ebx","esi");
508
509 &shr ("ecx",29-19); # lo>>29
510 &xor ("eax","edi");
511 &shr ("edx",29-19); # hi>>29
512 &xor ("ebx","ecx");
513 &shl ("edi",26-13); # hi<<26
514 &xor ("eax","edx");
515 &xor ("eax","edi"); # sigma1(X[-2])
516
517 &mov ("ecx",&DWP(8*(9+15+16)+0,"esp"));
518 &mov ("edx",&DWP(8*(9+15+16)+4,"esp"));
519 &add ("eax",&DWP(0,"esp"));
520 &adc ("ebx",&DWP(4,"esp")); # T1 = sigma1(X[-2])+T1
521 &mov ("esi",&DWP(8*(9+15+16-9)+0,"esp"));
522 &mov ("edi",&DWP(8*(9+15+16-9)+4,"esp"));
523 &add ("eax","ecx");
524 &adc ("ebx","edx"); # T1 += X[-16]
525 &add ("eax","esi");
526 &adc ("ebx","edi"); # T1 += X[-7]
527 &mov (&DWP(8*(9+15)+0,"esp"),"eax");
528 &mov (&DWP(8*(9+15)+4,"esp"),"ebx"); # save X[0]
529
530 &BODY_00_15_x86();
531
532 &cmp (&LB("edx"),0x17);
533 &jne (&label("16_79_x86"));
534
535 &mov ("esi",&DWP(8*(9+16+80)+0,"esp"));# ctx
536 &mov ("edi",&DWP(8*(9+16+80)+4,"esp"));# inp
537 for($i=0;$i<4;$i++) {
538 &mov ("eax",&DWP($i*16+0,"esi"));
539 &mov ("ebx",&DWP($i*16+4,"esi"));
540 &mov ("ecx",&DWP($i*16+8,"esi"));
541 &mov ("edx",&DWP($i*16+12,"esi"));
542 &add ("eax",&DWP(8+($i*16)+0,"esp"));
543 &adc ("ebx",&DWP(8+($i*16)+4,"esp"));
544 &mov (&DWP($i*16+0,"esi"),"eax");
545 &mov (&DWP($i*16+4,"esi"),"ebx");
546 &add ("ecx",&DWP(8+($i*16)+8,"esp"));
547 &adc ("edx",&DWP(8+($i*16)+12,"esp"));
548 &mov (&DWP($i*16+8,"esi"),"ecx");
549 &mov (&DWP($i*16+12,"esi"),"edx");
550 }
551 &add ("esp",8*(9+16+80)); # destroy frame
552 &sub ($K512,8*80); # rewind K
553
554 &cmp ("edi",&DWP(8,"esp")); # are we done yet?
555 &jb (&label("loop_x86"));
556
557 &mov ("esp",&DWP(12,"esp")); # restore sp
558&function_end_A();
559
560&set_label("K512",64); # Yes! I keep it in the code segment!
561 &data_word(0xd728ae22,0x428a2f98); # u64
562 &data_word(0x23ef65cd,0x71374491); # u64
563 &data_word(0xec4d3b2f,0xb5c0fbcf); # u64
564 &data_word(0x8189dbbc,0xe9b5dba5); # u64
565 &data_word(0xf348b538,0x3956c25b); # u64
566 &data_word(0xb605d019,0x59f111f1); # u64
567 &data_word(0xaf194f9b,0x923f82a4); # u64
568 &data_word(0xda6d8118,0xab1c5ed5); # u64
569 &data_word(0xa3030242,0xd807aa98); # u64
570 &data_word(0x45706fbe,0x12835b01); # u64
571 &data_word(0x4ee4b28c,0x243185be); # u64
572 &data_word(0xd5ffb4e2,0x550c7dc3); # u64
573 &data_word(0xf27b896f,0x72be5d74); # u64
574 &data_word(0x3b1696b1,0x80deb1fe); # u64
575 &data_word(0x25c71235,0x9bdc06a7); # u64
576 &data_word(0xcf692694,0xc19bf174); # u64
577 &data_word(0x9ef14ad2,0xe49b69c1); # u64
578 &data_word(0x384f25e3,0xefbe4786); # u64
579 &data_word(0x8b8cd5b5,0x0fc19dc6); # u64
580 &data_word(0x77ac9c65,0x240ca1cc); # u64
581 &data_word(0x592b0275,0x2de92c6f); # u64
582 &data_word(0x6ea6e483,0x4a7484aa); # u64
583 &data_word(0xbd41fbd4,0x5cb0a9dc); # u64
584 &data_word(0x831153b5,0x76f988da); # u64
585 &data_word(0xee66dfab,0x983e5152); # u64
586 &data_word(0x2db43210,0xa831c66d); # u64
587 &data_word(0x98fb213f,0xb00327c8); # u64
588 &data_word(0xbeef0ee4,0xbf597fc7); # u64
589 &data_word(0x3da88fc2,0xc6e00bf3); # u64
590 &data_word(0x930aa725,0xd5a79147); # u64
591 &data_word(0xe003826f,0x06ca6351); # u64
592 &data_word(0x0a0e6e70,0x14292967); # u64
593 &data_word(0x46d22ffc,0x27b70a85); # u64
594 &data_word(0x5c26c926,0x2e1b2138); # u64
595 &data_word(0x5ac42aed,0x4d2c6dfc); # u64
596 &data_word(0x9d95b3df,0x53380d13); # u64
597 &data_word(0x8baf63de,0x650a7354); # u64
598 &data_word(0x3c77b2a8,0x766a0abb); # u64
599 &data_word(0x47edaee6,0x81c2c92e); # u64
600 &data_word(0x1482353b,0x92722c85); # u64
601 &data_word(0x4cf10364,0xa2bfe8a1); # u64
602 &data_word(0xbc423001,0xa81a664b); # u64
603 &data_word(0xd0f89791,0xc24b8b70); # u64
604 &data_word(0x0654be30,0xc76c51a3); # u64
605 &data_word(0xd6ef5218,0xd192e819); # u64
606 &data_word(0x5565a910,0xd6990624); # u64
607 &data_word(0x5771202a,0xf40e3585); # u64
608 &data_word(0x32bbd1b8,0x106aa070); # u64
609 &data_word(0xb8d2d0c8,0x19a4c116); # u64
610 &data_word(0x5141ab53,0x1e376c08); # u64
611 &data_word(0xdf8eeb99,0x2748774c); # u64
612 &data_word(0xe19b48a8,0x34b0bcb5); # u64
613 &data_word(0xc5c95a63,0x391c0cb3); # u64
614 &data_word(0xe3418acb,0x4ed8aa4a); # u64
615 &data_word(0x7763e373,0x5b9cca4f); # u64
616 &data_word(0xd6b2b8a3,0x682e6ff3); # u64
617 &data_word(0x5defb2fc,0x748f82ee); # u64
618 &data_word(0x43172f60,0x78a5636f); # u64
619 &data_word(0xa1f0ab72,0x84c87814); # u64
620 &data_word(0x1a6439ec,0x8cc70208); # u64
621 &data_word(0x23631e28,0x90befffa); # u64
622 &data_word(0xde82bde9,0xa4506ceb); # u64
623 &data_word(0xb2c67915,0xbef9a3f7); # u64
624 &data_word(0xe372532b,0xc67178f2); # u64
625 &data_word(0xea26619c,0xca273ece); # u64
626 &data_word(0x21c0c207,0xd186b8c7); # u64
627 &data_word(0xcde0eb1e,0xeada7dd6); # u64
628 &data_word(0xee6ed178,0xf57d4f7f); # u64
629 &data_word(0x72176fba,0x06f067aa); # u64
630 &data_word(0xa2c898a6,0x0a637dc5); # u64
631 &data_word(0xbef90dae,0x113f9804); # u64
632 &data_word(0x131c471b,0x1b710b35); # u64
633 &data_word(0x23047d84,0x28db77f5); # u64
634 &data_word(0x40c72493,0x32caab7b); # u64
635 &data_word(0x15c9bebc,0x3c9ebe0a); # u64
636 &data_word(0x9c100d4c,0x431d67c4); # u64
637 &data_word(0xcb3e42b6,0x4cc5d4be); # u64
638 &data_word(0xfc657e2a,0x597f299c); # u64
639 &data_word(0x3ad6faec,0x5fcb6fab); # u64
640 &data_word(0x4a475817,0x6c44198c); # u64
641&function_end_B("sha512_block_data_order");
642&asciz("SHA512 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
643
644&asm_finish();
diff --git a/src/lib/libcrypto/sha/asm/sha512-armv4.pl b/src/lib/libcrypto/sha/asm/sha512-armv4.pl
deleted file mode 100644
index 3a35861ac6..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-armv4.pl
+++ /dev/null
@@ -1,403 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA512 block procedure for ARMv4. September 2007.
11
12# This code is ~4.5 (four and a half) times faster than code generated
13# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
14# Xscale PXA250 core].
15#
16# July 2010.
17#
18# Rescheduling for dual-issue pipeline resulted in 6% improvement on
19# Cortex A8 core and ~40 cycles per processed byte.
20
21# Byte order [in]dependence. =========================================
22#
23# The caller is expected to maintain a specific *dword* order in h[0-7],
24# namely with the most significant dword at the *lower* address, which is
25# reflected in the two parameters below. The *byte* order within these
26# dwords is, in turn, whatever the *native* byte order is on the platform.
27$hi=0;
28$lo=4;
29# ====================================================================
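# Editor's sketch, not part of the original module: the contract above,
# written out for the standard SHA-512 IV word h[0] = 0x6a09e667f3bcc908
# (purely illustrative values):
#
#	my ($hi32,$lo32) = (0x6a09e667,0xf3bcc908);
#	my $h0 = pack("L2",$hi32,$lo32);	# MS dword at offset $hi==0,
#						# byte order within each dword native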
30
31while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
32open STDOUT,">$output";
33
34$ctx="r0";
35$inp="r1";
36$len="r2";
37$Tlo="r3";
38$Thi="r4";
39$Alo="r5";
40$Ahi="r6";
41$Elo="r7";
42$Ehi="r8";
43$t0="r9";
44$t1="r10";
45$t2="r11";
46$t3="r12";
47############ r13 is stack pointer
48$Ktbl="r14";
49############ r15 is program counter
50
51$Aoff=8*0;
52$Boff=8*1;
53$Coff=8*2;
54$Doff=8*3;
55$Eoff=8*4;
56$Foff=8*5;
57$Goff=8*6;
58$Hoff=8*7;
59$Xoff=8*8;
60
61sub BODY_00_15() {
62my $magic = shift;
63$code.=<<___;
64 ldr $t2,[sp,#$Hoff+0] @ h.lo
65 ldr $t3,[sp,#$Hoff+4] @ h.hi
66 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
67 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
68 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
69 mov $t0,$Elo,lsr#14
70 mov $t1,$Ehi,lsr#14
71 eor $t0,$t0,$Ehi,lsl#18
72 eor $t1,$t1,$Elo,lsl#18
73 eor $t0,$t0,$Elo,lsr#18
74 eor $t1,$t1,$Ehi,lsr#18
75 eor $t0,$t0,$Ehi,lsl#14
76 eor $t1,$t1,$Elo,lsl#14
77 eor $t0,$t0,$Ehi,lsr#9
78 eor $t1,$t1,$Elo,lsr#9
79 eor $t0,$t0,$Elo,lsl#23
80 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
81 adds $Tlo,$Tlo,$t0
82 ldr $t0,[sp,#$Foff+0] @ f.lo
83 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
84 ldr $t1,[sp,#$Foff+4] @ f.hi
85 adds $Tlo,$Tlo,$t2
86 ldr $t2,[sp,#$Goff+0] @ g.lo
87 adc $Thi,$Thi,$t3 @ T += h
88 ldr $t3,[sp,#$Goff+4] @ g.hi
89
90 eor $t0,$t0,$t2
91 str $Elo,[sp,#$Eoff+0]
92 eor $t1,$t1,$t3
93 str $Ehi,[sp,#$Eoff+4]
94 and $t0,$t0,$Elo
95 str $Alo,[sp,#$Aoff+0]
96 and $t1,$t1,$Ehi
97 str $Ahi,[sp,#$Aoff+4]
98 eor $t0,$t0,$t2
99 ldr $t2,[$Ktbl,#4] @ K[i].lo
100 eor $t1,$t1,$t3 @ Ch(e,f,g)
101 ldr $t3,[$Ktbl,#0] @ K[i].hi
102
103 adds $Tlo,$Tlo,$t0
104 ldr $Elo,[sp,#$Doff+0] @ d.lo
105 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
106 ldr $Ehi,[sp,#$Doff+4] @ d.hi
107 adds $Tlo,$Tlo,$t2
108 adc $Thi,$Thi,$t3 @ T += K[i]
109 adds $Elo,$Elo,$Tlo
110 adc $Ehi,$Ehi,$Thi @ d += T
111
112 and $t0,$t2,#0xff
113 teq $t0,#$magic
114 orreq $Ktbl,$Ktbl,#1
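	@ the low byte of K[i].lo doubles as an end-of-loop marker:
	@ K[15].lo ends in 0x94 and K[79].lo in 0x17, the $magic values
	@ BODY_00_15 is invoked with below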
115
116 ldr $t2,[sp,#$Boff+0] @ b.lo
117 ldr $t3,[sp,#$Coff+0] @ c.lo
118 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
119 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
120 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
121 mov $t0,$Alo,lsr#28
122 mov $t1,$Ahi,lsr#28
123 eor $t0,$t0,$Ahi,lsl#4
124 eor $t1,$t1,$Alo,lsl#4
125 eor $t0,$t0,$Ahi,lsr#2
126 eor $t1,$t1,$Alo,lsr#2
127 eor $t0,$t0,$Alo,lsl#30
128 eor $t1,$t1,$Ahi,lsl#30
129 eor $t0,$t0,$Ahi,lsr#7
130 eor $t1,$t1,$Alo,lsr#7
131 eor $t0,$t0,$Alo,lsl#25
132 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
133 adds $Tlo,$Tlo,$t0
134 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
135
136 and $t0,$Alo,$t2
137 orr $Alo,$Alo,$t2
138 ldr $t1,[sp,#$Boff+4] @ b.hi
139 ldr $t2,[sp,#$Coff+4] @ c.hi
140 and $Alo,$Alo,$t3
141 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
142 and $t3,$Ahi,$t1
143 orr $Ahi,$Ahi,$t1
144 and $Ahi,$Ahi,$t2
145 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
146 adds $Alo,$Alo,$Tlo
147 adc $Ahi,$Ahi,$Thi @ h += T
148
149 sub sp,sp,#8
150 add $Ktbl,$Ktbl,#8
151___
152}
153$code=<<___;
154.text
155.code 32
156.type K512,%object
157.align 5
158K512:
159.word 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
160.word 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
161.word 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
162.word 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
163.word 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
164.word 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
165.word 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
166.word 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
167.word 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
168.word 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
169.word 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
170.word 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
171.word 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
172.word 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
173.word 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
174.word 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
175.word 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
176.word 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
177.word 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
178.word 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
179.word 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
180.word 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
181.word 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
182.word 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
183.word 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
184.word 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
185.word 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
186.word 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
187.word 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
188.word 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
189.word 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
190.word 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
191.word 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
192.word 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
193.word 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
194.word 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
195.word 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
196.word 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
197.word 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
198.word 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
199.size K512,.-K512
200
201.global sha512_block_data_order
202.type sha512_block_data_order,%function
203sha512_block_data_order:
204 sub r3,pc,#8 @ sha512_block_data_order
205 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
206 stmdb sp!,{r4-r12,lr}
207 sub $Ktbl,r3,#640 @ K512
208 sub sp,sp,#9*8
209
210 ldr $Elo,[$ctx,#$Eoff+$lo]
211 ldr $Ehi,[$ctx,#$Eoff+$hi]
212 ldr $t0, [$ctx,#$Goff+$lo]
213 ldr $t1, [$ctx,#$Goff+$hi]
214 ldr $t2, [$ctx,#$Hoff+$lo]
215 ldr $t3, [$ctx,#$Hoff+$hi]
216.Loop:
217 str $t0, [sp,#$Goff+0]
218 str $t1, [sp,#$Goff+4]
219 str $t2, [sp,#$Hoff+0]
220 str $t3, [sp,#$Hoff+4]
221 ldr $Alo,[$ctx,#$Aoff+$lo]
222 ldr $Ahi,[$ctx,#$Aoff+$hi]
223 ldr $Tlo,[$ctx,#$Boff+$lo]
224 ldr $Thi,[$ctx,#$Boff+$hi]
225 ldr $t0, [$ctx,#$Coff+$lo]
226 ldr $t1, [$ctx,#$Coff+$hi]
227 ldr $t2, [$ctx,#$Doff+$lo]
228 ldr $t3, [$ctx,#$Doff+$hi]
229 str $Tlo,[sp,#$Boff+0]
230 str $Thi,[sp,#$Boff+4]
231 str $t0, [sp,#$Coff+0]
232 str $t1, [sp,#$Coff+4]
233 str $t2, [sp,#$Doff+0]
234 str $t3, [sp,#$Doff+4]
235 ldr $Tlo,[$ctx,#$Foff+$lo]
236 ldr $Thi,[$ctx,#$Foff+$hi]
237 str $Tlo,[sp,#$Foff+0]
238 str $Thi,[sp,#$Foff+4]
239
240.L00_15:
241 ldrb $Tlo,[$inp,#7]
242 ldrb $t0, [$inp,#6]
243 ldrb $t1, [$inp,#5]
244 ldrb $t2, [$inp,#4]
245 ldrb $Thi,[$inp,#3]
246 ldrb $t3, [$inp,#2]
247 orr $Tlo,$Tlo,$t0,lsl#8
248 ldrb $t0, [$inp,#1]
249 orr $Tlo,$Tlo,$t1,lsl#16
250 ldrb $t1, [$inp],#8
251 orr $Tlo,$Tlo,$t2,lsl#24
252 orr $Thi,$Thi,$t3,lsl#8
253 orr $Thi,$Thi,$t0,lsl#16
254 orr $Thi,$Thi,$t1,lsl#24
255 str $Tlo,[sp,#$Xoff+0]
256 str $Thi,[sp,#$Xoff+4]
257___
258 &BODY_00_15(0x94);
259$code.=<<___;
260 tst $Ktbl,#1
261 beq .L00_15
262 bic $Ktbl,$Ktbl,#1
263
264.L16_79:
265 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
266 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
267 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
268 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
269
270 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
271 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
272 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
273 mov $Tlo,$t0,lsr#1
274 mov $Thi,$t1,lsr#1
275 eor $Tlo,$Tlo,$t1,lsl#31
276 eor $Thi,$Thi,$t0,lsl#31
277 eor $Tlo,$Tlo,$t0,lsr#8
278 eor $Thi,$Thi,$t1,lsr#8
279 eor $Tlo,$Tlo,$t1,lsl#24
280 eor $Thi,$Thi,$t0,lsl#24
281 eor $Tlo,$Tlo,$t0,lsr#7
282 eor $Thi,$Thi,$t1,lsr#7
283 eor $Tlo,$Tlo,$t1,lsl#25
284
285 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
286 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
287 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
288 mov $t0,$t2,lsr#19
289 mov $t1,$t3,lsr#19
290 eor $t0,$t0,$t3,lsl#13
291 eor $t1,$t1,$t2,lsl#13
292 eor $t0,$t0,$t3,lsr#29
293 eor $t1,$t1,$t2,lsr#29
294 eor $t0,$t0,$t2,lsl#3
295 eor $t1,$t1,$t3,lsl#3
296 eor $t0,$t0,$t2,lsr#6
297 eor $t1,$t1,$t3,lsr#6
298 eor $t0,$t0,$t3,lsl#26
299
300 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
301 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
302 adds $Tlo,$Tlo,$t0
303 adc $Thi,$Thi,$t1
304
305 ldr $t0,[sp,#`$Xoff+8*16`+0]
306 ldr $t1,[sp,#`$Xoff+8*16`+4]
307 adds $Tlo,$Tlo,$t2
308 adc $Thi,$Thi,$t3
309 adds $Tlo,$Tlo,$t0
310 adc $Thi,$Thi,$t1
311 str $Tlo,[sp,#$Xoff+0]
312 str $Thi,[sp,#$Xoff+4]
313___
314 &BODY_00_15(0x17);
315$code.=<<___;
316 tst $Ktbl,#1
317 beq .L16_79
318 bic $Ktbl,$Ktbl,#1
319
320 ldr $Tlo,[sp,#$Boff+0]
321 ldr $Thi,[sp,#$Boff+4]
322 ldr $t0, [$ctx,#$Aoff+$lo]
323 ldr $t1, [$ctx,#$Aoff+$hi]
324 ldr $t2, [$ctx,#$Boff+$lo]
325 ldr $t3, [$ctx,#$Boff+$hi]
326 adds $t0,$Alo,$t0
327 adc $t1,$Ahi,$t1
328 adds $t2,$Tlo,$t2
329 adc $t3,$Thi,$t3
330 str $t0, [$ctx,#$Aoff+$lo]
331 str $t1, [$ctx,#$Aoff+$hi]
332 str $t2, [$ctx,#$Boff+$lo]
333 str $t3, [$ctx,#$Boff+$hi]
334
335 ldr $Alo,[sp,#$Coff+0]
336 ldr $Ahi,[sp,#$Coff+4]
337 ldr $Tlo,[sp,#$Doff+0]
338 ldr $Thi,[sp,#$Doff+4]
339 ldr $t0, [$ctx,#$Coff+$lo]
340 ldr $t1, [$ctx,#$Coff+$hi]
341 ldr $t2, [$ctx,#$Doff+$lo]
342 ldr $t3, [$ctx,#$Doff+$hi]
343 adds $t0,$Alo,$t0
344 adc $t1,$Ahi,$t1
345 adds $t2,$Tlo,$t2
346 adc $t3,$Thi,$t3
347 str $t0, [$ctx,#$Coff+$lo]
348 str $t1, [$ctx,#$Coff+$hi]
349 str $t2, [$ctx,#$Doff+$lo]
350 str $t3, [$ctx,#$Doff+$hi]
351
352 ldr $Tlo,[sp,#$Foff+0]
353 ldr $Thi,[sp,#$Foff+4]
354 ldr $t0, [$ctx,#$Eoff+$lo]
355 ldr $t1, [$ctx,#$Eoff+$hi]
356 ldr $t2, [$ctx,#$Foff+$lo]
357 ldr $t3, [$ctx,#$Foff+$hi]
358 adds $Elo,$Elo,$t0
359 adc $Ehi,$Ehi,$t1
360 adds $t2,$Tlo,$t2
361 adc $t3,$Thi,$t3
362 str $Elo,[$ctx,#$Eoff+$lo]
363 str $Ehi,[$ctx,#$Eoff+$hi]
364 str $t2, [$ctx,#$Foff+$lo]
365 str $t3, [$ctx,#$Foff+$hi]
366
367 ldr $Alo,[sp,#$Goff+0]
368 ldr $Ahi,[sp,#$Goff+4]
369 ldr $Tlo,[sp,#$Hoff+0]
370 ldr $Thi,[sp,#$Hoff+4]
371 ldr $t0, [$ctx,#$Goff+$lo]
372 ldr $t1, [$ctx,#$Goff+$hi]
373 ldr $t2, [$ctx,#$Hoff+$lo]
374 ldr $t3, [$ctx,#$Hoff+$hi]
375 adds $t0,$Alo,$t0
376 adc $t1,$Ahi,$t1
377 adds $t2,$Tlo,$t2
378 adc $t3,$Thi,$t3
379 str $t0, [$ctx,#$Goff+$lo]
380 str $t1, [$ctx,#$Goff+$hi]
381 str $t2, [$ctx,#$Hoff+$lo]
382 str $t3, [$ctx,#$Hoff+$hi]
383
384 add sp,sp,#640
385 sub $Ktbl,$Ktbl,#640
386
387 teq $inp,$len
388 bne .Loop
389
390 add sp,sp,#8*9 @ destroy frame
391 ldmia sp!,{r4-r12,lr}
392 tst lr,#1
393 moveq pc,lr @ be binary compatible with V4, yet
394 bx lr @ interoperable with Thumb ISA:-)
395.size sha512_block_data_order,.-sha512_block_data_order
396.asciz "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
397.align 2
398___
399
400$code =~ s/\`([^\`]*)\`/eval $1/gem;
401$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
402print $code;
403close STDOUT; # enforce flush
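The BODY_00_15 scheme above evaluates each 64-bit rotate with 32-bit
shifts on (hi,lo) register pairs. A standalone Perl sketch (not part of
the module; assumes a perl built with 64-bit integers) checking the
low-word identity quoted in its Sigma1 comment:

	# Sigma1(x) = ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41), low word:
	# LO = lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	sub rotr64 { my ($x,$n)=@_; (($x>>$n) | ($x<<(64-$n))) & 0xffffffffffffffff }
	my $e = 0x510e527fade682d1;		# SHA-512 initial e, as sample input
	my ($hi, $lo) = ($e >> 32, $e & 0xffffffff);
	my $ref = (rotr64($e,14) ^ rotr64($e,18) ^ rotr64($e,41)) & 0xffffffff;
	my $LO  = ($lo>>14 ^ $hi<<18 ^ $lo>>18 ^ $hi<<14 ^ $hi>>9 ^ $lo<<23) & 0xffffffff;
	die "Sigma1 low-word mismatch" unless $LO == $ref;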
diff --git a/src/lib/libcrypto/sha/asm/sha512-ia64.pl b/src/lib/libcrypto/sha/asm/sha512-ia64.pl
deleted file mode 100755
index 1c6ce56522..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-ia64.pl
+++ /dev/null
@@ -1,672 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# SHA256/512_Transform for Itanium.
11#
12# sha512_block runs in 1003 cycles on Itanium 2, which is almost 50%
13# faster than gcc and >60%(!) faster than code generated by HP-UX
14# compiler (yes, HP-UX is generating slower code, because unlike gcc,
15# it failed to deploy "shift right pair," 'shrp' instruction, which
16# substitutes for 64-bit rotate).
17#
18# The 924-cycle sha256_block outperforms gcc by over a factor of 2(!)
19# and the HP-UX compiler by >40% (yes, gcc won sha512_block, but lost
20# this one big time). Note that "formally" 924 is about 100 cycles
21# too much. I mean it's 64 32-bit rounds vs. 80 virtually identical
22# 64-bit ones and 1003*64/80 gives 802. Extra cycles, 2 per round,
23# are spent on extra work to provide for 32-bit rotations. 32-bit
24# rotations are still handled by 'shrp' instruction and for this
25# reason lower 32 bits are deposited to upper half of 64-bit register
26# prior to issuing 'shrp'. And in order to minimize the number of such
27# operations, X[16] values are *maintained* with copies of lower
28# halves in upper halves, which is why you'll spot such instructions
29# as custom 'mux2', "parallel 32-bit add," 'padd4' and "parallel
30# 32-bit unsigned right shift," 'pshr4.u' instructions here.
31#
32# Rules of engagement.
33#
34# There is only one integer shifter meaning that if I have two rotate,
35# deposit or extract instructions in adjacent bundles, they shall
36# split [at run-time if they have to]. But note that variable and
37# parallel shifts are performed by multi-media ALU and *are* pairable
38# with rotates [and the like]. On the downside, the MMALU is rather slow: it
39# takes 2 extra cycles before the result of integer operation is
40# available *to* MMALU and 2(*) extra cycles before the result of MM
41# operation is available "back" *to* integer ALU, not to mention that
42# MMALU itself has 2 cycles latency. However! I explicitly scheduled
43# these MM instructions to avoid MM stalls, so that all these extra
44# latencies get "hidden" in instruction-level parallelism.
45#
46# (*) 2 cycles on Itanium 1 and 1 cycle on Itanium 2. But I schedule
47# for 2 in order to provide for best *overall* performance,
48# because on Itanium 1 stall on MM result is accompanied by
49# pipeline flush, which takes 6 cycles:-(
50#
51# Resulting performance numbers for 900MHz Itanium 2 system:
52#
53# The 'numbers' are in 1000s of bytes per second processed.
54# type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes
55# sha1(*) 6210.14k 20376.30k 52447.83k 85870.05k 105478.12k
56# sha256 7476.45k 20572.05k 41538.34k 56062.29k 62093.18k
57# sha512 4996.56k 20026.28k 47597.20k 85278.79k 111501.31k
58#
59# (*) SHA1 numbers are for HP-UX compiler and are presented purely
60# for reference purposes. I bet it can be improved too...
61#
62# To generate code, pass the file name with either 256 or 512 in its
63# name and compiler flags.
64
65$output=shift;
66
67if ($output =~ /512.*\.(?:s|asm)/) {
68 $SZ=8;
69 $BITS=8*$SZ;
70 $LDW="ld8";
71 $STW="st8";
72 $ADD="add";
73 $SHRU="shr.u";
74 $TABLE="K512";
75 $func="sha512_block_data_order";
76 @Sigma0=(28,34,39);
77 @Sigma1=(14,18,41);
78 @sigma0=(1, 8, 7);
79 @sigma1=(19,61, 6);
80 $rounds=80;
81} elsif ($output =~ /256.*\.(?:s|asm)/) {
82 $SZ=4;
83 $BITS=8*$SZ;
84 $LDW="ld4";
85 $STW="st4";
86 $ADD="padd4";
87 $SHRU="pshr4.u";
88 $TABLE="K256";
89 $func="sha256_block_data_order";
90 @Sigma0=( 2,13,22);
91 @Sigma1=( 6,11,25);
92 @sigma0=( 7,18, 3);
93 @sigma1=(17,19,10);
94 $rounds=64;
95} else { die "nonsense $output"; }
96
97open STDOUT,">$output" or die "can't open $output: $!";
98
99if ($^O eq "hpux") {
100 $ADDP="addp4";
101	for (@ARGV) { $ADDP="add" if (/(?:\+DD|\-mlp)64/); }
102} else { $ADDP="add"; }
103for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
104 $big_endian=0 if (/\-DL_ENDIAN/); }
105if (!defined($big_endian))
106 { $big_endian=(unpack('L',pack('N',1))==1); }
107
108$code=<<___;
109.ident \"$output, version 1.1\"
110.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
111.explicit
112.text
113
114pfssave=r2;
115lcsave=r3;
116prsave=r14;
117K=r15;
118A=r16; B=r17; C=r18; D=r19;
119E=r20; F=r21; G=r22; H=r23;
120T1=r24; T2=r25;
121s0=r26; s1=r27; t0=r28; t1=r29;
122Ktbl=r30;
123ctx=r31; // 1st arg
124input=r48; // 2nd arg
125num=r49; // 3rd arg
126sgm0=r50; sgm1=r51; // small constants
127A_=r54; B_=r55; C_=r56; D_=r57;
128E_=r58; F_=r59; G_=r60; H_=r61;
129
130// void $func (SHA_CTX *ctx, const void *in,size_t num[,int host])
131.global $func#
132.proc $func#
133.align 32
134$func:
135 .prologue
136 .save ar.pfs,pfssave
137{ .mmi; alloc pfssave=ar.pfs,3,27,0,16
138 $ADDP ctx=0,r32 // 1st arg
139 .save ar.lc,lcsave
140 mov lcsave=ar.lc }
141{ .mmi; $ADDP input=0,r33 // 2nd arg
142 mov num=r34 // 3rd arg
143 .save pr,prsave
144 mov prsave=pr };;
145
146 .body
147{ .mib; add r8=0*$SZ,ctx
148 add r9=1*$SZ,ctx
149 brp.loop.imp .L_first16,.L_first16_end-16 }
150{ .mib; add r10=2*$SZ,ctx
151 add r11=3*$SZ,ctx
152 brp.loop.imp .L_rest,.L_rest_end-16 };;
153
154// load A-H
155.Lpic_point:
156{ .mmi; $LDW A_=[r8],4*$SZ
157 $LDW B_=[r9],4*$SZ
158 mov Ktbl=ip }
159{ .mmi; $LDW C_=[r10],4*$SZ
160 $LDW D_=[r11],4*$SZ
161 mov sgm0=$sigma0[2] };;
162{ .mmi; $LDW E_=[r8]
163 $LDW F_=[r9]
164 add Ktbl=($TABLE#-.Lpic_point),Ktbl }
165{ .mmi; $LDW G_=[r10]
166 $LDW H_=[r11]
167 cmp.ne p0,p16=0,r0 };; // used in sha256_block
168___
169$code.=<<___ if ($BITS==64);
170{ .mii; and r8=7,input
171 and input=~7,input;;
172 cmp.eq p9,p0=1,r8 }
173{ .mmi; cmp.eq p10,p0=2,r8
174 cmp.eq p11,p0=3,r8
175 cmp.eq p12,p0=4,r8 }
176{ .mmi; cmp.eq p13,p0=5,r8
177 cmp.eq p14,p0=6,r8
178 cmp.eq p15,p0=7,r8 };;
179___
180$code.=<<___;
181.L_outer:
182.rotr X[16]
183{ .mmi; mov A=A_
184 mov B=B_
185 mov ar.lc=14 }
186{ .mmi; mov C=C_
187 mov D=D_
188 mov E=E_ }
189{ .mmi; mov F=F_
190 mov G=G_
191 mov ar.ec=2 }
192{ .mmi; ld1 X[15]=[input],$SZ // eliminated in 64-bit
193 mov H=H_
194 mov sgm1=$sigma1[2] };;
195
196___
197$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
198.align 32
199.L_first16:
200{ .mmi; add r9=1-$SZ,input
201 add r10=2-$SZ,input
202 add r11=3-$SZ,input };;
203{ .mmi; ld1 r9=[r9]
204 ld1 r10=[r10]
205 dep.z $t1=E,32,32 }
206{ .mmi; $LDW K=[Ktbl],$SZ
207 ld1 r11=[r11]
208 zxt4 E=E };;
209{ .mii; or $t1=$t1,E
210 dep X[15]=X[15],r9,8,8
211 dep r11=r10,r11,8,8 };;
212{ .mmi; and T1=F,E
213 and T2=A,B
214 dep X[15]=X[15],r11,16,16 }
215{ .mmi; andcm r8=G,E
216 and r9=A,C
217 mux2 $t0=A,0x44 };; // copy lower half to upper
218{ .mmi; (p16) ld1 X[15-1]=[input],$SZ // prefetch
219 xor T1=T1,r8 // T1=((e & f) ^ (~e & g))
220 _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14)
221{ .mib; and r10=B,C
222 xor T2=T2,r9 };;
223___
224$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
225// in 64-bit mode I load whole X[16] at once and take care of alignment...
226{ .mmi; add r8=1*$SZ,input
227 add r9=2*$SZ,input
228 add r10=3*$SZ,input };;
229{ .mmb; $LDW X[15]=[input],4*$SZ
230 $LDW X[14]=[r8],4*$SZ
231(p9) br.cond.dpnt.many .L1byte };;
232{ .mmb; $LDW X[13]=[r9],4*$SZ
233 $LDW X[12]=[r10],4*$SZ
234(p10) br.cond.dpnt.many .L2byte };;
235{ .mmb; $LDW X[11]=[input],4*$SZ
236 $LDW X[10]=[r8],4*$SZ
237(p11) br.cond.dpnt.many .L3byte };;
238{ .mmb; $LDW X[ 9]=[r9],4*$SZ
239 $LDW X[ 8]=[r10],4*$SZ
240(p12) br.cond.dpnt.many .L4byte };;
241{ .mmb; $LDW X[ 7]=[input],4*$SZ
242 $LDW X[ 6]=[r8],4*$SZ
243(p13) br.cond.dpnt.many .L5byte };;
244{ .mmb; $LDW X[ 5]=[r9],4*$SZ
245 $LDW X[ 4]=[r10],4*$SZ
246(p14) br.cond.dpnt.many .L6byte };;
247{ .mmb; $LDW X[ 3]=[input],4*$SZ
248 $LDW X[ 2]=[r8],4*$SZ
249(p15) br.cond.dpnt.many .L7byte };;
250{ .mmb; $LDW X[ 1]=[r9],4*$SZ
251 $LDW X[ 0]=[r10],4*$SZ
252 br.many .L_first16 };;
253.L1byte:
254{ .mmi; $LDW X[13]=[r9],4*$SZ
255 $LDW X[12]=[r10],4*$SZ
256 shrp X[15]=X[15],X[14],56 };;
257{ .mmi; $LDW X[11]=[input],4*$SZ
258 $LDW X[10]=[r8],4*$SZ
259 shrp X[14]=X[14],X[13],56 }
260{ .mmi; $LDW X[ 9]=[r9],4*$SZ
261 $LDW X[ 8]=[r10],4*$SZ
262 shrp X[13]=X[13],X[12],56 };;
263{ .mmi; $LDW X[ 7]=[input],4*$SZ
264 $LDW X[ 6]=[r8],4*$SZ
265 shrp X[12]=X[12],X[11],56 }
266{ .mmi; $LDW X[ 5]=[r9],4*$SZ
267 $LDW X[ 4]=[r10],4*$SZ
268 shrp X[11]=X[11],X[10],56 };;
269{ .mmi; $LDW X[ 3]=[input],4*$SZ
270 $LDW X[ 2]=[r8],4*$SZ
271 shrp X[10]=X[10],X[ 9],56 }
272{ .mmi; $LDW X[ 1]=[r9],4*$SZ
273 $LDW X[ 0]=[r10],4*$SZ
274 shrp X[ 9]=X[ 9],X[ 8],56 };;
275{ .mii; $LDW T1=[input]
276 shrp X[ 8]=X[ 8],X[ 7],56
277 shrp X[ 7]=X[ 7],X[ 6],56 }
278{ .mii; shrp X[ 6]=X[ 6],X[ 5],56
279 shrp X[ 5]=X[ 5],X[ 4],56 };;
280{ .mii; shrp X[ 4]=X[ 4],X[ 3],56
281 shrp X[ 3]=X[ 3],X[ 2],56 }
282{ .mii; shrp X[ 2]=X[ 2],X[ 1],56
283 shrp X[ 1]=X[ 1],X[ 0],56 }
284{ .mib; shrp X[ 0]=X[ 0],T1,56
285 br.many .L_first16 };;
286.L2byte:
287{ .mmi; $LDW X[11]=[input],4*$SZ
288 $LDW X[10]=[r8],4*$SZ
289 shrp X[15]=X[15],X[14],48 }
290{ .mmi; $LDW X[ 9]=[r9],4*$SZ
291 $LDW X[ 8]=[r10],4*$SZ
292 shrp X[14]=X[14],X[13],48 };;
293{ .mmi; $LDW X[ 7]=[input],4*$SZ
294 $LDW X[ 6]=[r8],4*$SZ
295 shrp X[13]=X[13],X[12],48 }
296{ .mmi; $LDW X[ 5]=[r9],4*$SZ
297 $LDW X[ 4]=[r10],4*$SZ
298 shrp X[12]=X[12],X[11],48 };;
299{ .mmi; $LDW X[ 3]=[input],4*$SZ
300 $LDW X[ 2]=[r8],4*$SZ
301 shrp X[11]=X[11],X[10],48 }
302{ .mmi; $LDW X[ 1]=[r9],4*$SZ
303 $LDW X[ 0]=[r10],4*$SZ
304 shrp X[10]=X[10],X[ 9],48 };;
305{ .mii; $LDW T1=[input]
306 shrp X[ 9]=X[ 9],X[ 8],48
307 shrp X[ 8]=X[ 8],X[ 7],48 }
308{ .mii; shrp X[ 7]=X[ 7],X[ 6],48
309 shrp X[ 6]=X[ 6],X[ 5],48 };;
310{ .mii; shrp X[ 5]=X[ 5],X[ 4],48
311 shrp X[ 4]=X[ 4],X[ 3],48 }
312{ .mii; shrp X[ 3]=X[ 3],X[ 2],48
313 shrp X[ 2]=X[ 2],X[ 1],48 }
314{ .mii; shrp X[ 1]=X[ 1],X[ 0],48
315 shrp X[ 0]=X[ 0],T1,48 }
316{ .mfb; br.many .L_first16 };;
317.L3byte:
318{ .mmi; $LDW X[ 9]=[r9],4*$SZ
319 $LDW X[ 8]=[r10],4*$SZ
320 shrp X[15]=X[15],X[14],40 };;
321{ .mmi; $LDW X[ 7]=[input],4*$SZ
322 $LDW X[ 6]=[r8],4*$SZ
323 shrp X[14]=X[14],X[13],40 }
324{ .mmi; $LDW X[ 5]=[r9],4*$SZ
325 $LDW X[ 4]=[r10],4*$SZ
326 shrp X[13]=X[13],X[12],40 };;
327{ .mmi; $LDW X[ 3]=[input],4*$SZ
328 $LDW X[ 2]=[r8],4*$SZ
329 shrp X[12]=X[12],X[11],40 }
330{ .mmi; $LDW X[ 1]=[r9],4*$SZ
331 $LDW X[ 0]=[r10],4*$SZ
332 shrp X[11]=X[11],X[10],40 };;
333{ .mii; $LDW T1=[input]
334 shrp X[10]=X[10],X[ 9],40
335 shrp X[ 9]=X[ 9],X[ 8],40 }
336{ .mii; shrp X[ 8]=X[ 8],X[ 7],40
337 shrp X[ 7]=X[ 7],X[ 6],40 };;
338{ .mii; shrp X[ 6]=X[ 6],X[ 5],40
339 shrp X[ 5]=X[ 5],X[ 4],40 }
340{ .mii; shrp X[ 4]=X[ 4],X[ 3],40
341 shrp X[ 3]=X[ 3],X[ 2],40 }
342{ .mii; shrp X[ 2]=X[ 2],X[ 1],40
343 shrp X[ 1]=X[ 1],X[ 0],40 }
344{ .mib; shrp X[ 0]=X[ 0],T1,40
345 br.many .L_first16 };;
346.L4byte:
347{ .mmi; $LDW X[ 7]=[input],4*$SZ
348 $LDW X[ 6]=[r8],4*$SZ
349 shrp X[15]=X[15],X[14],32 }
350{ .mmi; $LDW X[ 5]=[r9],4*$SZ
351 $LDW X[ 4]=[r10],4*$SZ
352 shrp X[14]=X[14],X[13],32 };;
353{ .mmi; $LDW X[ 3]=[input],4*$SZ
354 $LDW X[ 2]=[r8],4*$SZ
355 shrp X[13]=X[13],X[12],32 }
356{ .mmi; $LDW X[ 1]=[r9],4*$SZ
357 $LDW X[ 0]=[r10],4*$SZ
358 shrp X[12]=X[12],X[11],32 };;
359{ .mii; $LDW T1=[input]
360 shrp X[11]=X[11],X[10],32
361 shrp X[10]=X[10],X[ 9],32 }
362{ .mii; shrp X[ 9]=X[ 9],X[ 8],32
363 shrp X[ 8]=X[ 8],X[ 7],32 };;
364{ .mii; shrp X[ 7]=X[ 7],X[ 6],32
365 shrp X[ 6]=X[ 6],X[ 5],32 }
366{ .mii; shrp X[ 5]=X[ 5],X[ 4],32
367 shrp X[ 4]=X[ 4],X[ 3],32 }
368{ .mii; shrp X[ 3]=X[ 3],X[ 2],32
369 shrp X[ 2]=X[ 2],X[ 1],32 }
370{ .mii; shrp X[ 1]=X[ 1],X[ 0],32
371 shrp X[ 0]=X[ 0],T1,32 }
372{ .mfb; br.many .L_first16 };;
373.L5byte:
374{ .mmi; $LDW X[ 5]=[r9],4*$SZ
375 $LDW X[ 4]=[r10],4*$SZ
376 shrp X[15]=X[15],X[14],24 };;
377{ .mmi; $LDW X[ 3]=[input],4*$SZ
378 $LDW X[ 2]=[r8],4*$SZ
379 shrp X[14]=X[14],X[13],24 }
380{ .mmi; $LDW X[ 1]=[r9],4*$SZ
381 $LDW X[ 0]=[r10],4*$SZ
382 shrp X[13]=X[13],X[12],24 };;
383{ .mii; $LDW T1=[input]
384 shrp X[12]=X[12],X[11],24
385 shrp X[11]=X[11],X[10],24 }
386{ .mii; shrp X[10]=X[10],X[ 9],24
387 shrp X[ 9]=X[ 9],X[ 8],24 };;
388{ .mii; shrp X[ 8]=X[ 8],X[ 7],24
389 shrp X[ 7]=X[ 7],X[ 6],24 }
390{ .mii; shrp X[ 6]=X[ 6],X[ 5],24
391 shrp X[ 5]=X[ 5],X[ 4],24 }
392{ .mii; shrp X[ 4]=X[ 4],X[ 3],24
393 shrp X[ 3]=X[ 3],X[ 2],24 }
394{ .mii; shrp X[ 2]=X[ 2],X[ 1],24
395 shrp X[ 1]=X[ 1],X[ 0],24 }
396{ .mib; shrp X[ 0]=X[ 0],T1,24
397 br.many .L_first16 };;
398.L6byte:
399{ .mmi; $LDW X[ 3]=[input],4*$SZ
400 $LDW X[ 2]=[r8],4*$SZ
401 shrp X[15]=X[15],X[14],16 }
402{ .mmi; $LDW X[ 1]=[r9],4*$SZ
403 $LDW X[ 0]=[r10],4*$SZ
404 shrp X[14]=X[14],X[13],16 };;
405{ .mii; $LDW T1=[input]
406 shrp X[13]=X[13],X[12],16
407 shrp X[12]=X[12],X[11],16 }
408{ .mii; shrp X[11]=X[11],X[10],16
409 shrp X[10]=X[10],X[ 9],16 };;
410{ .mii; shrp X[ 9]=X[ 9],X[ 8],16
411 shrp X[ 8]=X[ 8],X[ 7],16 }
412{ .mii; shrp X[ 7]=X[ 7],X[ 6],16
413 shrp X[ 6]=X[ 6],X[ 5],16 }
414{ .mii; shrp X[ 5]=X[ 5],X[ 4],16
415 shrp X[ 4]=X[ 4],X[ 3],16 }
416{ .mii; shrp X[ 3]=X[ 3],X[ 2],16
417 shrp X[ 2]=X[ 2],X[ 1],16 }
418{ .mii; shrp X[ 1]=X[ 1],X[ 0],16
419 shrp X[ 0]=X[ 0],T1,16 }
420{ .mfb; br.many .L_first16 };;
421.L7byte:
422{ .mmi; $LDW X[ 1]=[r9],4*$SZ
423 $LDW X[ 0]=[r10],4*$SZ
424 shrp X[15]=X[15],X[14],8 };;
425{ .mii; $LDW T1=[input]
426 shrp X[14]=X[14],X[13],8
427 shrp X[13]=X[13],X[12],8 }
428{ .mii; shrp X[12]=X[12],X[11],8
429 shrp X[11]=X[11],X[10],8 };;
430{ .mii; shrp X[10]=X[10],X[ 9],8
431 shrp X[ 9]=X[ 9],X[ 8],8 }
432{ .mii; shrp X[ 8]=X[ 8],X[ 7],8
433 shrp X[ 7]=X[ 7],X[ 6],8 }
434{ .mii; shrp X[ 6]=X[ 6],X[ 5],8
435 shrp X[ 5]=X[ 5],X[ 4],8 }
436{ .mii; shrp X[ 4]=X[ 4],X[ 3],8
437 shrp X[ 3]=X[ 3],X[ 2],8 }
438{ .mii; shrp X[ 2]=X[ 2],X[ 1],8
439 shrp X[ 1]=X[ 1],X[ 0],8 }
440{ .mib; shrp X[ 0]=X[ 0],T1,8
441 br.many .L_first16 };;
442
443.align 32
444.L_first16:
445{ .mmi; $LDW K=[Ktbl],$SZ
446 and T1=F,E
447 and T2=A,B }
448{ .mmi; //$LDW X[15]=[input],$SZ // X[i]=*input++
449 andcm r8=G,E
450 and r9=A,C };;
451{ .mmi; xor T1=T1,r8 //T1=((e & f) ^ (~e & g))
452 and r10=B,C
453 _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14)
454{ .mmi; xor T2=T2,r9
455 mux1 X[15]=X[15],\@rev };; // eliminated in big-endian
456___
457$code.=<<___;
458{ .mib; add T1=T1,H // T1=Ch(e,f,g)+h
459 _rotr r8=$t1,$Sigma1[1] } // ROTR(e,18)
460{ .mib; xor T2=T2,r10 // T2=((a & b) ^ (a & c) ^ (b & c))
461 mov H=G };;
462{ .mib; xor r11=r8,r11
463 _rotr r9=$t1,$Sigma1[2] } // ROTR(e,41)
464{ .mib; mov G=F
465 mov F=E };;
466{ .mib; xor r9=r9,r11 // r9=Sigma1(e)
467 _rotr r10=$t0,$Sigma0[0] } // ROTR(a,28)
468{ .mib; add T1=T1,K // T1=Ch(e,f,g)+h+K512[i]
469 mov E=D };;
470{ .mib; add T1=T1,r9 // T1+=Sigma1(e)
471 _rotr r11=$t0,$Sigma0[1] } // ROTR(a,34)
472{ .mib; mov D=C
473 mov C=B };;
474{ .mib; add T1=T1,X[15] // T1+=X[i]
475 _rotr r8=$t0,$Sigma0[2] } // ROTR(a,39)
476{ .mib; xor r10=r10,r11
477 mux2 X[15]=X[15],0x44 };; // eliminated in 64-bit
478{ .mmi; xor r10=r8,r10 // r10=Sigma0(a)
479 mov B=A
480 add A=T1,T2 };;
481{ .mib; add E=E,T1
482 add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a)
483 br.ctop.sptk .L_first16 };;
484.L_first16_end:
485
486{ .mii; mov ar.lc=$rounds-17
487 mov ar.ec=1 };;
488
489.align 32
490.L_rest:
491.rotr X[16]
492{ .mib; $LDW K=[Ktbl],$SZ
493 _rotr r8=X[15-1],$sigma0[0] } // ROTR(s0,1)
494{ .mib; $ADD X[15]=X[15],X[15-9] // X[i&0xF]+=X[(i+9)&0xF]
495 $SHRU s0=X[15-1],sgm0 };; // s0=X[(i+1)&0xF]>>7
496{ .mib; and T1=F,E
497 _rotr r9=X[15-1],$sigma0[1] } // ROTR(s0,8)
498{ .mib; andcm r10=G,E
499 $SHRU s1=X[15-14],sgm1 };; // s1=X[(i+14)&0xF]>>6
500{ .mmi; xor T1=T1,r10 // T1=((e & f) ^ (~e & g))
501 xor r9=r8,r9
502 _rotr r10=X[15-14],$sigma1[0] };;// ROTR(s1,19)
503{ .mib; and T2=A,B
504 _rotr r11=X[15-14],$sigma1[1] }// ROTR(s1,61)
505{ .mib; and r8=A,C };;
506___
507$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
508// I adhere to mmi; in order to hold Itanium 1 back and avoid the 6-cycle
509// pipeline flush in the last bundle. Note that even on Itanium 2 the
510// latter stalls for one clock cycle...
511{ .mmi; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF])
512 dep.z $t1=E,32,32 }
513{ .mmi; xor r10=r11,r10
514 zxt4 E=E };;
515{ .mmi; or $t1=$t1,E
516 xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF])
517 mux2 $t0=A,0x44 };; // copy lower half to upper
518{ .mmi; xor T2=T2,r8
519 _rotr r9=$t1,$Sigma1[0] } // ROTR(e,14)
520{ .mmi; and r10=B,C
521 add T1=T1,H // T1=Ch(e,f,g)+h
522 $ADD X[15]=X[15],s0 };; // X[i&0xF]+=sigma0(X[(i+1)&0xF])
523___
524$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
525{ .mib; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF])
526 _rotr r9=$t1,$Sigma1[0] } // ROTR(e,14)
527{ .mib; xor r10=r11,r10
528 xor T2=T2,r8 };;
529{ .mib; xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF])
530 add T1=T1,H }
531{ .mib; and r10=B,C
532 $ADD X[15]=X[15],s0 };; // X[i&0xF]+=sigma0(X[(i+1)&0xF])
533___
534$code.=<<___;
535{ .mmi; xor T2=T2,r10 // T2=((a & b) ^ (a & c) ^ (b & c))
536 mov H=G
537 _rotr r8=$t1,$Sigma1[1] };; // ROTR(e,18)
538{ .mmi; xor r11=r8,r9
539 $ADD X[15]=X[15],s1 // X[i&0xF]+=sigma1(X[(i+14)&0xF])
540 _rotr r9=$t1,$Sigma1[2] } // ROTR(e,41)
541{ .mmi; mov G=F
542 mov F=E };;
543{ .mib; xor r9=r9,r11 // r9=Sigma1(e)
544 _rotr r10=$t0,$Sigma0[0] } // ROTR(a,28)
545{ .mib; add T1=T1,K // T1=Ch(e,f,g)+h+K512[i]
546 mov E=D };;
547{ .mib; add T1=T1,r9 // T1+=Sigma1(e)
548 _rotr r11=$t0,$Sigma0[1] } // ROTR(a,34)
549{ .mib; mov D=C
550 mov C=B };;
551{ .mmi; add T1=T1,X[15] // T1+=X[i]
552 xor r10=r10,r11
553 _rotr r8=$t0,$Sigma0[2] };; // ROTR(a,39)
554{ .mmi; xor r10=r8,r10 // r10=Sigma0(a)
555 mov B=A
556 add A=T1,T2 };;
557{ .mib; add E=E,T1
558 add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a)
559 br.ctop.sptk .L_rest };;
560.L_rest_end:
561
562{ .mmi; add A_=A_,A
563 add B_=B_,B
564 add C_=C_,C }
565{ .mmi; add D_=D_,D
566 add E_=E_,E
567 cmp.ltu p16,p0=1,num };;
568{ .mmi; add F_=F_,F
569 add G_=G_,G
570 add H_=H_,H }
571{ .mmb; add Ktbl=-$SZ*$rounds,Ktbl
572(p16) add num=-1,num
573(p16) br.dptk.many .L_outer };;
574
575{ .mib; add r8=0*$SZ,ctx
576 add r9=1*$SZ,ctx }
577{ .mib; add r10=2*$SZ,ctx
578 add r11=3*$SZ,ctx };;
579{ .mmi; $STW [r8]=A_,4*$SZ
580 $STW [r9]=B_,4*$SZ
581 mov ar.lc=lcsave }
582{ .mmi; $STW [r10]=C_,4*$SZ
583 $STW [r11]=D_,4*$SZ
584 mov pr=prsave,0x1ffff };;
585{ .mmb; $STW [r8]=E_
586 $STW [r9]=F_ }
587{ .mmb; $STW [r10]=G_
588 $STW [r11]=H_
589 br.ret.sptk.many b0 };;
590.endp $func#
591___
592
593$code =~ s/\`([^\`]*)\`/eval $1/gem;
594$code =~ s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm;
595if ($BITS==64) {
596 $code =~ s/mux2(\s+)\S+/nop.i$1 0x0/gm;
597 $code =~ s/mux1(\s+)\S+/nop.i$1 0x0/gm if ($big_endian);
598 $code =~ s/(shrp\s+X\[[^=]+)=([^,]+),([^,]+),([1-9]+)/$1=$3,$2,64-$4/gm
599 if (!$big_endian);
600 $code =~ s/ld1(\s+)X\[\S+/nop.m$1 0x0/gm;
601}
602
603print $code;
604
605print<<___ if ($BITS==32);
606.align 64
607.type K256#,\@object
608K256: data4 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
609 data4 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
610 data4 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
611 data4 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
612 data4 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
613 data4 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
614 data4 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
615 data4 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
616 data4 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
617 data4 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
618 data4 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
619 data4 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
620 data4 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
621 data4 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
622 data4 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
623 data4 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
624.size K256#,$SZ*$rounds
625stringz "SHA256 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
626___
627print<<___ if ($BITS==64);
628.align 64
629.type K512#,\@object
630K512: data8 0x428a2f98d728ae22,0x7137449123ef65cd
631 data8 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
632 data8 0x3956c25bf348b538,0x59f111f1b605d019
633 data8 0x923f82a4af194f9b,0xab1c5ed5da6d8118
634 data8 0xd807aa98a3030242,0x12835b0145706fbe
635 data8 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
636 data8 0x72be5d74f27b896f,0x80deb1fe3b1696b1
637 data8 0x9bdc06a725c71235,0xc19bf174cf692694
638 data8 0xe49b69c19ef14ad2,0xefbe4786384f25e3
639 data8 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
640 data8 0x2de92c6f592b0275,0x4a7484aa6ea6e483
641 data8 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
642 data8 0x983e5152ee66dfab,0xa831c66d2db43210
643 data8 0xb00327c898fb213f,0xbf597fc7beef0ee4
644 data8 0xc6e00bf33da88fc2,0xd5a79147930aa725
645 data8 0x06ca6351e003826f,0x142929670a0e6e70
646 data8 0x27b70a8546d22ffc,0x2e1b21385c26c926
647 data8 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
648 data8 0x650a73548baf63de,0x766a0abb3c77b2a8
649 data8 0x81c2c92e47edaee6,0x92722c851482353b
650 data8 0xa2bfe8a14cf10364,0xa81a664bbc423001
651 data8 0xc24b8b70d0f89791,0xc76c51a30654be30
652 data8 0xd192e819d6ef5218,0xd69906245565a910
653 data8 0xf40e35855771202a,0x106aa07032bbd1b8
654 data8 0x19a4c116b8d2d0c8,0x1e376c085141ab53
655 data8 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
656 data8 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
657 data8 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
658 data8 0x748f82ee5defb2fc,0x78a5636f43172f60
659 data8 0x84c87814a1f0ab72,0x8cc702081a6439ec
660 data8 0x90befffa23631e28,0xa4506cebde82bde9
661 data8 0xbef9a3f7b2c67915,0xc67178f2e372532b
662 data8 0xca273eceea26619c,0xd186b8c721c0c207
663 data8 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
664 data8 0x06f067aa72176fba,0x0a637dc5a2c898a6
665 data8 0x113f9804bef90dae,0x1b710b35131c471b
666 data8 0x28db77f523047d84,0x32caab7b40c72493
667 data8 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
668 data8 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
669 data8 0x5fcb6fab3ad6faec,0x6c44198c4a475817
670.size K512#,$SZ*$rounds
671stringz "SHA512 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
672___
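The header above explains that 32-bit rotates are synthesized with
'shrp' after depositing the low 32 bits into the upper register half
(the mux2 ...,0x44 "copy lower half to upper"). A standalone Perl sketch
(not part of the module; assumes a perl built with 64-bit integers) of
that trick:

	sub rot32_via_shrp {
		my ($x, $n) = @_;			# 0 < $n < 32
		my $dup = ($x << 32) | $x;		# mux2 t=x,0x44
		return ($dup >> $n) & 0xffffffff;	# low word of "shrp r=t,t,$n"
	}
	my ($x, $n) = (0x12345678, 7);
	die "rotate mismatch" unless rot32_via_shrp($x, $n)
	    == ((($x >> $n) | ($x << (32-$n))) & 0xffffffff);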
diff --git a/src/lib/libcrypto/sha/asm/sha512-ppc.pl b/src/lib/libcrypto/sha/asm/sha512-ppc.pl
deleted file mode 100755
index 768a6a6fad..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-ppc.pl
+++ /dev/null
@@ -1,462 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# I let hardware handle unaligned input, except on page boundaries
11# (see below for details). Otherwise straightforward implementation
12# with X vector in register bank. The module is big-endian [which is
13# not a big deal, as there are no little-endian targets left around].
14
15# sha256 | sha512
16# -m64 -m32 | -m64 -m32
17# --------------------------------------+-----------------------
18# PPC970,gcc-4.0.0 +50% +38% | +40% +410%(*)
19# Power6,xlc-7 +150% +90% | +100% +430%(*)
20#
21# (*) 64-bit code in 32-bit application context, which actually is
22# on TODO list. It should be noted that for safe deployment in
23# 32-bit *multi-threaded* context asynchronous signals should be
24# blocked upon entry to SHA512 block routine. This is because
25# 32-bit signaling procedure invalidates upper halves of GPRs.
26# Context switch procedure preserves them, but not signaling:-(
27
28# The second version is truly multi-thread safe. The trouble with the
29# original version was that it used the thread-local storage pointer
30# register. Well, it scrupulously preserved it, but the problem would
31# arise the moment an asynchronous signal was delivered and the signal
32# handler dereferenced the TLS pointer. While that is never the case in the
33# openssl application or test suite, we have to respect this scenario and
34# not use the TLS pointer register. The alternative would be to require the
35# caller to block signals prior to calling this routine. For the record, in 32-bit
36# context R2 serves as TLS pointer, while in 64-bit context - R13.
37
38$flavour=shift;
39$output =shift;
40
41if ($flavour =~ /64/) {
42 $SIZE_T=8;
43 $STU="stdu";
44 $UCMP="cmpld";
45 $SHL="sldi";
46 $POP="ld";
47 $PUSH="std";
48} elsif ($flavour =~ /32/) {
49 $SIZE_T=4;
50 $STU="stwu";
51 $UCMP="cmplw";
52 $SHL="slwi";
53 $POP="lwz";
54 $PUSH="stw";
55} else { die "nonsense $flavour"; }
56
57$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
58( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
59( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
60die "can't locate ppc-xlate.pl";
61
62open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
63
64if ($output =~ /512/) {
65 $func="sha512_block_data_order";
66 $SZ=8;
67 @Sigma0=(28,34,39);
68 @Sigma1=(14,18,41);
69 @sigma0=(1, 8, 7);
70 @sigma1=(19,61, 6);
71 $rounds=80;
72 $LD="ld";
73 $ST="std";
74 $ROR="rotrdi";
75 $SHR="srdi";
76} else {
77 $func="sha256_block_data_order";
78 $SZ=4;
79 @Sigma0=( 2,13,22);
80 @Sigma1=( 6,11,25);
81 @sigma0=( 7,18, 3);
82 @sigma1=(17,19,10);
83 $rounds=64;
84 $LD="lwz";
85 $ST="stw";
86 $ROR="rotrwi";
87 $SHR="srwi";
88}
89
90$FRAME=32*$SIZE_T;
91
92$sp ="r1";
93$toc="r2";
94$ctx="r3"; # zapped by $a0
95$inp="r4"; # zapped by $a1
96$num="r5"; # zapped by $t0
97
98$T ="r0";
99$a0 ="r3";
100$a1 ="r4";
101$t0 ="r5";
102$t1 ="r6";
103$Tbl="r7";
104
105$A ="r8";
106$B ="r9";
107$C ="r10";
108$D ="r11";
109$E ="r12";
110$F ="r13"; $F="r2" if ($SIZE_T==8);# reassigned to exempt TLS pointer
111$G ="r14";
112$H ="r15";
113
114@V=($A,$B,$C,$D,$E,$F,$G,$H);
115@X=("r16","r17","r18","r19","r20","r21","r22","r23",
116 "r24","r25","r26","r27","r28","r29","r30","r31");
117
118$inp="r31"; # reassigned $inp! aliases with @X[15]
119
120sub ROUND_00_15 {
121my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
122$code.=<<___;
123 $LD $T,`$i*$SZ`($Tbl)
124 $ROR $a0,$e,$Sigma1[0]
125 $ROR $a1,$e,$Sigma1[1]
126 and $t0,$f,$e
127 andc $t1,$g,$e
128 add $T,$T,$h
129 xor $a0,$a0,$a1
130 $ROR $a1,$a1,`$Sigma1[2]-$Sigma1[1]`
131 or $t0,$t0,$t1 ; Ch(e,f,g)
132 add $T,$T,@X[$i]
133 xor $a0,$a0,$a1 ; Sigma1(e)
134 add $T,$T,$t0
135 add $T,$T,$a0
136
137 $ROR $a0,$a,$Sigma0[0]
138 $ROR $a1,$a,$Sigma0[1]
139 and $t0,$a,$b
140 and $t1,$a,$c
141 xor $a0,$a0,$a1
142 $ROR $a1,$a1,`$Sigma0[2]-$Sigma0[1]`
143 xor $t0,$t0,$t1
144 and $t1,$b,$c
145 xor $a0,$a0,$a1 ; Sigma0(a)
146 add $d,$d,$T
147 xor $t0,$t0,$t1 ; Maj(a,b,c)
148 add $h,$T,$a0
149 add $h,$h,$t0
150
151___
152}
153
154sub ROUND_16_xx {
155my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
156$i-=16;
157$code.=<<___;
158 $ROR $a0,@X[($i+1)%16],$sigma0[0]
159 $ROR $a1,@X[($i+1)%16],$sigma0[1]
160 $ROR $t0,@X[($i+14)%16],$sigma1[0]
161 $ROR $t1,@X[($i+14)%16],$sigma1[1]
162 xor $a0,$a0,$a1
163 $SHR $a1,@X[($i+1)%16],$sigma0[2]
164 xor $t0,$t0,$t1
165 $SHR $t1,@X[($i+14)%16],$sigma1[2]
166 add @X[$i],@X[$i],@X[($i+9)%16]
167 xor $a0,$a0,$a1 ; sigma0(X[(i+1)&0x0f])
168 xor $t0,$t0,$t1 ; sigma1(X[(i+14)&0x0f])
169 add @X[$i],@X[$i],$a0
170 add @X[$i],@X[$i],$t0
171___
172&ROUND_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h);
173}
174
175$code=<<___;
176.machine "any"
177.text
178
179.globl $func
180.align 6
181$func:
182 mflr r0
183 $STU $sp,`-($FRAME+16*$SZ)`($sp)
184 $SHL $num,$num,`log(16*$SZ)/log(2)`
185
186 $PUSH $ctx,`$FRAME-$SIZE_T*22`($sp)
187
188 $PUSH r0,`$FRAME-$SIZE_T*21`($sp)
189 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
190 $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
191 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
192 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
193 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
194 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
195 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
196 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
197 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
198 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
199 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
200 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
201 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
202 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
203 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
204 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
205 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
206 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
207 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
208 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
209
210 $LD $A,`0*$SZ`($ctx)
211 mr $inp,r4 ; incarnate $inp
212 $LD $B,`1*$SZ`($ctx)
213 $LD $C,`2*$SZ`($ctx)
214 $LD $D,`3*$SZ`($ctx)
215 $LD $E,`4*$SZ`($ctx)
216 $LD $F,`5*$SZ`($ctx)
217 $LD $G,`6*$SZ`($ctx)
218 $LD $H,`7*$SZ`($ctx)
219
220 b LPICmeup
221LPICedup:
222 andi. r0,$inp,3
223 bne Lunaligned
224Laligned:
225 add $num,$inp,$num
226 $PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
227 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
228 bl Lsha2_block_private
229Ldone:
230 $POP r0,`$FRAME-$SIZE_T*21`($sp)
231 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
232 $POP r13,`$FRAME-$SIZE_T*19`($sp)
233 $POP r14,`$FRAME-$SIZE_T*18`($sp)
234 $POP r15,`$FRAME-$SIZE_T*17`($sp)
235 $POP r16,`$FRAME-$SIZE_T*16`($sp)
236 $POP r17,`$FRAME-$SIZE_T*15`($sp)
237 $POP r18,`$FRAME-$SIZE_T*14`($sp)
238 $POP r19,`$FRAME-$SIZE_T*13`($sp)
239 $POP r20,`$FRAME-$SIZE_T*12`($sp)
240 $POP r21,`$FRAME-$SIZE_T*11`($sp)
241 $POP r22,`$FRAME-$SIZE_T*10`($sp)
242 $POP r23,`$FRAME-$SIZE_T*9`($sp)
243 $POP r24,`$FRAME-$SIZE_T*8`($sp)
244 $POP r25,`$FRAME-$SIZE_T*7`($sp)
245 $POP r26,`$FRAME-$SIZE_T*6`($sp)
246 $POP r27,`$FRAME-$SIZE_T*5`($sp)
247 $POP r28,`$FRAME-$SIZE_T*4`($sp)
248 $POP r29,`$FRAME-$SIZE_T*3`($sp)
249 $POP r30,`$FRAME-$SIZE_T*2`($sp)
250 $POP r31,`$FRAME-$SIZE_T*1`($sp)
251 mtlr r0
252 addi $sp,$sp,`$FRAME+16*$SZ`
253 blr
254___
255
256# PowerPC specification allows an implementation to be ill-behaved
257# upon unaligned access which crosses page boundary. "Better safe
258# than sorry" principle makes me treat it specially. But I don't
259# look for the particular offending word, but rather for the input
260# block which crosses the boundary. Once found, that block is copied
261# to an aligned spot and hashed separately...
262$code.=<<___;
263.align 4
264Lunaligned:
265 subfic $t1,$inp,4096
266 andi. $t1,$t1,`4096-16*$SZ` ; distance to closest page boundary
267 beq Lcross_page
268 $UCMP $num,$t1
269 ble- Laligned ; didn't cross the page boundary
270 subfc $num,$t1,$num
271 add $t1,$inp,$t1
272 $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real remaining num
273 $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; intermediate end pointer
274 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
275 bl Lsha2_block_private
276 ; $inp equals to the intermediate end pointer here
277 $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real remaining num
278Lcross_page:
279 li $t1,`16*$SZ/4`
280 mtctr $t1
281 addi r20,$sp,$FRAME ; aligned spot below the frame
282Lmemcpy:
283 lbz r16,0($inp)
284 lbz r17,1($inp)
285 lbz r18,2($inp)
286 lbz r19,3($inp)
287 addi $inp,$inp,4
288 stb r16,0(r20)
289 stb r17,1(r20)
290 stb r18,2(r20)
291 stb r19,3(r20)
292 addi r20,r20,4
293 bdnz Lmemcpy
294
295 $PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp
296 addi $t1,$sp,`$FRAME+16*$SZ` ; fictitious end pointer
297 addi $inp,$sp,$FRAME ; fictitious inp pointer
298 $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num
299 $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer
300 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
301 bl Lsha2_block_private
302 $POP $inp,`$FRAME-$SIZE_T*26`($sp) ; restore real inp
303 $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num
304 addic. $num,$num,`-16*$SZ` ; num--
305 bne- Lunaligned
306 b Ldone
307___
308
309$code.=<<___;
310.align 4
311Lsha2_block_private:
312___
313for($i=0;$i<16;$i++) {
314$code.=<<___ if ($SZ==4);
315 lwz @X[$i],`$i*$SZ`($inp)
316___
317# 64-bit loads are split into 2x32-bit ones, as the CPU can't handle
318# unaligned 64-bit loads, only unaligned 32-bit ones...
319$code.=<<___ if ($SZ==8);
320 lwz $t0,`$i*$SZ`($inp)
321 lwz @X[$i],`$i*$SZ+4`($inp)
322 insrdi @X[$i],$t0,32,0
323___
324 &ROUND_00_15($i,@V);
325 unshift(@V,pop(@V));
326}
327$code.=<<___;
328 li $T,`$rounds/16-1`
329 mtctr $T
330.align 4
331Lrounds:
332 addi $Tbl,$Tbl,`16*$SZ`
333___
334for(;$i<32;$i++) {
335 &ROUND_16_xx($i,@V);
336 unshift(@V,pop(@V));
337}
338$code.=<<___;
339 bdnz- Lrounds
340
341 $POP $ctx,`$FRAME-$SIZE_T*22`($sp)
342 $POP $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
343 $POP $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
344 subi $Tbl,$Tbl,`($rounds-16)*$SZ` ; rewind Tbl
345
346 $LD r16,`0*$SZ`($ctx)
347 $LD r17,`1*$SZ`($ctx)
348 $LD r18,`2*$SZ`($ctx)
349 $LD r19,`3*$SZ`($ctx)
350 $LD r20,`4*$SZ`($ctx)
351 $LD r21,`5*$SZ`($ctx)
352 $LD r22,`6*$SZ`($ctx)
353 addi $inp,$inp,`16*$SZ` ; advance inp
354 $LD r23,`7*$SZ`($ctx)
355 add $A,$A,r16
356 add $B,$B,r17
357 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp)
358 add $C,$C,r18
359 $ST $A,`0*$SZ`($ctx)
360 add $D,$D,r19
361 $ST $B,`1*$SZ`($ctx)
362 add $E,$E,r20
363 $ST $C,`2*$SZ`($ctx)
364 add $F,$F,r21
365 $ST $D,`3*$SZ`($ctx)
366 add $G,$G,r22
367 $ST $E,`4*$SZ`($ctx)
368 add $H,$H,r23
369 $ST $F,`5*$SZ`($ctx)
370 $ST $G,`6*$SZ`($ctx)
371 $UCMP $inp,$num
372 $ST $H,`7*$SZ`($ctx)
373 bne Lsha2_block_private
374 blr
375___
376
377# Ugly hack here, because PPC assembler syntax seems to vary too
378# much from platform to platform...
379$code.=<<___;
380.align 6
381LPICmeup:
382 bl LPIC
383 addi $Tbl,$Tbl,`64-4` ; "distance" between . and last nop
384 b LPICedup
385 nop
386 nop
387 nop
388 nop
389 nop
390LPIC: mflr $Tbl
391 blr
392 nop
393 nop
394 nop
395 nop
396 nop
397 nop
398___
399$code.=<<___ if ($SZ==8);
400 .long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
401 .long 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
402 .long 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
403 .long 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
404 .long 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
405 .long 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
406 .long 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
407 .long 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
408 .long 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
409 .long 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
410 .long 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
411 .long 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
412 .long 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
413 .long 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
414 .long 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
415 .long 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
416 .long 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
417 .long 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
418 .long 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
419 .long 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
420 .long 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
421 .long 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
422 .long 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
423 .long 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
424 .long 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
425 .long 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
426 .long 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
427 .long 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
428 .long 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
429 .long 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
430 .long 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
431 .long 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
432 .long 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
433 .long 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
434 .long 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
435 .long 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
436 .long 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
437 .long 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
438 .long 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
439 .long 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
440___
441$code.=<<___ if ($SZ==4);
442 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
443 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
444 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
445 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
446 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
447 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
448 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
449 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
450 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
451 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
452 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
453 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
454 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
455 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
456 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
457 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
458___
459
460$code =~ s/\`([^\`]*)\`/eval $1/gem;
461print $code;
462close STDOUT;
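The Lunaligned path above hashes as many whole blocks as fit before the
next page boundary, then bounces the boundary-crossing block through an
aligned spot on the stack. A rough standalone Perl model of the split
decision (split_at_page is a hypothetical name; $num is already in bytes,
as after the prologue's shift; returns bytes hashed in place and bytes
copied to the aligned buffer):

	my ($PAGE, $BLK) = (4096, 16*8);	# page size and 16*$SZ for SHA-512
	sub split_at_page {
		my ($inp, $num) = @_;
		# the subfic/andi. pair: distance to the page boundary,
		# rounded down to a whole number of blocks
		my $dist = (-$inp % $PAGE) & ~($BLK - 1);
		return ($num, 0) if $dist && $num <= $dist;	# "ble- Laligned"
		return ($dist, $BLK);	# hash $dist bytes in place, then memcpy
	}				# the crossing block and hash it aligned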
diff --git a/src/lib/libcrypto/sha/asm/sha512-s390x.pl b/src/lib/libcrypto/sha/asm/sha512-s390x.pl
deleted file mode 100644
index e7ef2d5a9f..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-s390x.pl
+++ /dev/null
@@ -1,301 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256/512 block procedures for s390x.
11
12# April 2007.
13#
14# sha256_block_data_order is reportedly >3 times faster than gcc 3.3
15# generated code (must be a bug in the compiler, as the improvement is
16# "pathologically" high, in particular in comparison to other SHA
17# modules). But the real twist is that it detects whether hardware support
18# for SHA256 is available and in that case utilizes it. Then the
19# performance can reach >6.5x that of the assembler version for larger chunks.
20#
21# sha512_block_data_order is ~70% faster than gcc 3.3 generated code.
22
23# January 2009.
24#
25# Add support for hardware SHA512 and reschedule instructions to
26# favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
27# than software.
28
29$t0="%r0";
30$t1="%r1";
31$ctx="%r2"; $t2="%r2";
32$inp="%r3";
33$len="%r4"; # used as index in inner loop
34
35$A="%r5";
36$B="%r6";
37$C="%r7";
38$D="%r8";
39$E="%r9";
40$F="%r10";
41$G="%r11";
42$H="%r12"; @V=($A,$B,$C,$D,$E,$F,$G,$H);
43$tbl="%r13";
44$T1="%r14";
45$sp="%r15";
46
47$output=shift;
48open STDOUT,">$output";
49
50if ($output =~ /512/) {
51 $label="512";
52 $SZ=8;
53 $LD="lg"; # load from memory
54 $ST="stg"; # store to memory
55 $ADD="alg"; # add with memory operand
56 $ROT="rllg"; # rotate left
57	$SHR="srlg";	# logical right shift [fixed up to 3-operand form at the end]
58 @Sigma0=(25,30,36);
59 @Sigma1=(23,46,50);
60 @sigma0=(56,63, 7);
61 @sigma1=( 3,45, 6);
62 $rounds=80;
63 $kimdfunc=3; # 0 means unknown/unsupported/unimplemented/disabled
64} else {
65 $label="256";
66 $SZ=4;
67 $LD="llgf"; # load from memory
68 $ST="st"; # store to memory
69 $ADD="al"; # add with memory operand
70 $ROT="rll"; # rotate left
71 $SHR="srl"; # logical right shift
72 @Sigma0=(10,19,30);
73 @Sigma1=( 7,21,26);
74 @sigma0=(14,25, 3);
75 @sigma1=(13,15,10);
76 $rounds=64;
77 $kimdfunc=2; # magic function code for kimd instruction
78}
79$Func="sha${label}_block_data_order";
80$Table="K${label}";
81$frame=160+16*$SZ;
82
83sub BODY_00_15 {
84my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
85
86$code.=<<___ if ($i<16);
87 $LD $T1,`$i*$SZ`($inp) ### $i
88___
89$code.=<<___;
90 $ROT $t0,$e,$Sigma1[0]
91 $ROT $t1,$e,$Sigma1[1]
92 lgr $t2,$f
93 xgr $t0,$t1
94 $ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]`
95 xgr $t2,$g
96 $ST $T1,`160+$SZ*($i%16)`($sp)
97 xgr $t0,$t1 # Sigma1(e)
98 la $T1,0($T1,$h) # T1+=h
99 ngr $t2,$e
100 lgr $t1,$a
101 algr $T1,$t0 # T1+=Sigma1(e)
102 $ROT $h,$a,$Sigma0[0]
103 xgr $t2,$g # Ch(e,f,g)
104 $ADD $T1,`$i*$SZ`($len,$tbl) # T1+=K[i]
105 $ROT $t0,$a,$Sigma0[1]
106 algr $T1,$t2 # T1+=Ch(e,f,g)
107 ogr $t1,$b
108 xgr $h,$t0
109 lgr $t2,$a
110 ngr $t1,$c
111 $ROT $t0,$t0,`$Sigma0[2]-$Sigma0[1]`
112 xgr $h,$t0 # h=Sigma0(a)
113 ngr $t2,$b
114 algr $h,$T1 # h+=T1
115 ogr $t2,$t1 # Maj(a,b,c)
116 la $d,0($d,$T1) # d+=T1
117 algr $h,$t2 # h+=Maj(a,b,c)
118___
119}
120
121sub BODY_16_XX {
122my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
123
124$code.=<<___;
125 $LD $T1,`160+$SZ*(($i+1)%16)`($sp) ### $i
126 $LD $t1,`160+$SZ*(($i+14)%16)`($sp)
127 $ROT $t0,$T1,$sigma0[0]
128 $SHR $T1,$sigma0[2]
129 $ROT $t2,$t0,`$sigma0[1]-$sigma0[0]`
130 xgr $T1,$t0
131 $ROT $t0,$t1,$sigma1[0]
132 xgr $T1,$t2 # sigma0(X[i+1])
133 $SHR $t1,$sigma1[2]
134 $ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i]
135 xgr $t1,$t0
136 $ROT $t0,$t0,`$sigma1[1]-$sigma1[0]`
137 $ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
138 xgr $t1,$t0 # sigma1(X[i+14])
139 algr $T1,$t1 # +=sigma1(X[i+14])
140___
141 &BODY_00_15(@_);
142}
143
144$code.=<<___;
145.text
146.align 64
147.type $Table,\@object
148$Table:
149___
150$code.=<<___ if ($SZ==4);
151 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
152 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
153 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
154 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
155 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
156 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
157 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
158 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
159 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
160 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
161 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
162 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
163 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
164 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
165 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
166 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
167___
168$code.=<<___ if ($SZ==8);
169 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
170 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
171 .quad 0x3956c25bf348b538,0x59f111f1b605d019
172 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
173 .quad 0xd807aa98a3030242,0x12835b0145706fbe
174 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
175 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
176 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
177 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
178 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
179 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
180 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
181 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
182 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
183 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
184 .quad 0x06ca6351e003826f,0x142929670a0e6e70
185 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
186 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
187 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
188 .quad 0x81c2c92e47edaee6,0x92722c851482353b
189 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
190 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
191 .quad 0xd192e819d6ef5218,0xd69906245565a910
192 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
193 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
194 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
195 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
196 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
197 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
198 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
199 .quad 0x90befffa23631e28,0xa4506cebde82bde9
200 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
201 .quad 0xca273eceea26619c,0xd186b8c721c0c207
202 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
203 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
204 .quad 0x113f9804bef90dae,0x1b710b35131c471b
205 .quad 0x28db77f523047d84,0x32caab7b40c72493
206 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
207 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
208 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
209___
210$code.=<<___;
211.size $Table,.-$Table
212.globl $Func
213.type $Func,\@function
214$Func:
215___
216$code.=<<___ if ($kimdfunc);
217 larl %r1,OPENSSL_s390xcap_P
218 lg %r0,0(%r1)
219 tmhl %r0,0x4000 # check for message-security assist
220 jz .Lsoftware
221 lghi %r0,0
222 la %r1,16($sp)
223 .long 0xb93e0002 # kimd %r0,%r2
224 lg %r0,16($sp)
225 tmhh %r0,`0x8000>>$kimdfunc`
226 jz .Lsoftware
227 lghi %r0,$kimdfunc
228 lgr %r1,$ctx
229 lgr %r2,$inp
230 sllg %r3,$len,`log(16*$SZ)/log(2)`
231 .long 0xb93e0002 # kimd %r0,%r2
232 brc 1,.-4 # pay attention to "partial completion"
233 br %r14
234.align 16
235.Lsoftware:
236___
237$code.=<<___;
238 sllg $len,$len,`log(16*$SZ)/log(2)`
239 lghi %r1,-$frame
240 agr $len,$inp
241 stmg $ctx,%r15,16($sp)
242 lgr %r0,$sp
243 la $sp,0(%r1,$sp)
244 stg %r0,0($sp)
245
246 larl $tbl,$Table
247 $LD $A,`0*$SZ`($ctx)
248 $LD $B,`1*$SZ`($ctx)
249 $LD $C,`2*$SZ`($ctx)
250 $LD $D,`3*$SZ`($ctx)
251 $LD $E,`4*$SZ`($ctx)
252 $LD $F,`5*$SZ`($ctx)
253 $LD $G,`6*$SZ`($ctx)
254 $LD $H,`7*$SZ`($ctx)
255
256.Lloop:
257 lghi $len,0
258___
259for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
260$code.=".Lrounds_16_xx:\n";
261for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
262$code.=<<___;
263 aghi $len,`16*$SZ`
264 lghi $t0,`($rounds-16)*$SZ`
265 clgr $len,$t0
266 jne .Lrounds_16_xx
267
268 lg $ctx,`$frame+16`($sp)
269 la $inp,`16*$SZ`($inp)
270 $ADD $A,`0*$SZ`($ctx)
271 $ADD $B,`1*$SZ`($ctx)
272 $ADD $C,`2*$SZ`($ctx)
273 $ADD $D,`3*$SZ`($ctx)
274 $ADD $E,`4*$SZ`($ctx)
275 $ADD $F,`5*$SZ`($ctx)
276 $ADD $G,`6*$SZ`($ctx)
277 $ADD $H,`7*$SZ`($ctx)
278 $ST $A,`0*$SZ`($ctx)
279 $ST $B,`1*$SZ`($ctx)
280 $ST $C,`2*$SZ`($ctx)
281 $ST $D,`3*$SZ`($ctx)
282 $ST $E,`4*$SZ`($ctx)
283 $ST $F,`5*$SZ`($ctx)
284 $ST $G,`6*$SZ`($ctx)
285 $ST $H,`7*$SZ`($ctx)
286 clg $inp,`$frame+32`($sp)
287 jne .Lloop
288
289 lmg %r6,%r15,`$frame+48`($sp)
290 br %r14
291.size $Func,.-$Func
292.string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
293.comm OPENSSL_s390xcap_P,8,8
294___
295
296$code =~ s/\`([^\`]*)\`/eval $1/gem;
297# unlike the 32-bit shift, the 64-bit one takes three operands
298$code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;
299
300print $code;
301close STDOUT;
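The final substitution above widens two-operand 'srlg' into the
three-operand form, since unlike the 32-bit 'srl' the 64-bit shift takes
a separate destination operand. A one-line demonstration on illustrative
input:

	my $line = "	srlg	%r1,7";		# as emitted via $SHR
	$line =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/;
	print "$line\n";			# prints "	srlg	%r1,%r1,7"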
diff --git a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
deleted file mode 100644
index ec5d78135e..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
+++ /dev/null
@@ -1,594 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256 performance improvement over compiler generated code varies
11# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
12# build]. Just like in the SHA1 module, I aim to ensure scalability on
13# UltraSPARC T1 by packing X[16] into 8 64-bit registers.
14
15# SHA512 on pre-T1 UltraSPARC.
16#
17# Performance is >75% better than 64-bit code generated by Sun C and
18# over 2x better than 32-bit code. X[16] resides on the stack, but access
19# to it is scheduled for L2 latency and staged through the 32 least
20# significant bits of %l0-%l7. The latter is done to achieve 32-/64-bit
21# ABI duality. Nevertheless it's ~40% faster than SHA256, which is pretty
22# good [the optimal coefficient is 50%].
23#
24# SHA512 on UltraSPARC T1.
25#
26# It's not any faster than 64-bit code generated by Sun C 5.8. This is
27# because the 64-bit code generator has the advantage of using 64-bit
28# loads(*) to access X[16], which I consciously traded for 32-/64-bit
29# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
30# code by 60%, not to mention that it doesn't suffer from severe decay
31# when running four times as many threads as physical cores, and that it
32# leaves gcc [3.4] behind by over a 4x factor! Compared to SHA256, single-
33# thread performance is only 10% better, but overall throughput at the
34# maximum thread count for a given CPU exceeds that of SHA256
35# by 30% [again, the optimal coefficient is 50%].
36#
37# (*)	Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
38#	in-order, i.e. a load instruction has to complete before the next
39#	instruction in the given thread is executed, even if the latter
40#	does not depend on the load result! This means that on T1 two 32-bit
41# loads are always slower than one 64-bit load. Once again this
42# is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
43# 2x32-bit loads can be as fast as 1x64-bit ones.
44
45$bits=32;
46for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
47if ($bits==64) { $bias=2047; $frame=192; }
48else { $bias=0; $frame=112; }
49
50$output=shift;
51open STDOUT,">$output";
52
53if ($output =~ /512/) {
54 $label="512";
55 $SZ=8;
56 $LD="ldx"; # load from memory
57 $ST="stx"; # store to memory
58 $SLL="sllx"; # shift left logical
59 $SRL="srlx"; # shift right logical
60 @Sigma0=(28,34,39);
61 @Sigma1=(14,18,41);
62 @sigma0=( 7, 1, 8); # right shift first
63 @sigma1=( 6,19,61); # right shift first
64 $lastK=0x817;
65 $rounds=80;
66 $align=4;
67
68 $locals=16*$SZ; # X[16]
69
70 $A="%o0";
71 $B="%o1";
72 $C="%o2";
73 $D="%o3";
74 $E="%o4";
75 $F="%o5";
76 $G="%g1";
77 $H="%o7";
78 @V=($A,$B,$C,$D,$E,$F,$G,$H);
79} else {
80 $label="256";
81 $SZ=4;
82 $LD="ld"; # load from memory
83 $ST="st"; # store to memory
84 $SLL="sll"; # shift left logical
85 $SRL="srl"; # shift right logical
86 @Sigma0=( 2,13,22);
87 @Sigma1=( 6,11,25);
88 @sigma0=( 3, 7,18); # right shift first
89 @sigma1=(10,17,19); # right shift first
90 $lastK=0x8f2;
91 $rounds=64;
92 $align=8;
93
94 $locals=0; # X[16] is register resident
95 @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
96
97 $A="%l0";
98 $B="%l1";
99 $C="%l2";
100 $D="%l3";
101 $E="%l4";
102 $F="%l5";
103 $G="%l6";
104 $H="%l7";
105 @V=($A,$B,$C,$D,$E,$F,$G,$H);
106}
107$T1="%g2";
108$tmp0="%g3";
109$tmp1="%g4";
110$tmp2="%g5";
111
112$ctx="%i0";
113$inp="%i1";
114$len="%i2";
115$Ktbl="%i3";
116$tmp31="%i4";
117$tmp32="%i5";
118
119########### SHA256
120$Xload = sub {
121my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
122
123 if ($i==0) {
124$code.=<<___;
125 ldx [$inp+0],@X[0]
126 ldx [$inp+16],@X[2]
127 ldx [$inp+32],@X[4]
128 ldx [$inp+48],@X[6]
129 ldx [$inp+8],@X[1]
130 ldx [$inp+24],@X[3]
131 subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
132 ldx [$inp+40],@X[5]
133 bz,pt %icc,.Laligned
134 ldx [$inp+56],@X[7]
135
136 sllx @X[0],$tmp31,@X[0]
137 ldx [$inp+64],$T1
138___
139for($j=0;$j<7;$j++)
140{ $code.=<<___;
141 srlx @X[$j+1],$tmp32,$tmp1
142 sllx @X[$j+1],$tmp31,@X[$j+1]
143 or $tmp1,@X[$j],@X[$j]
144___
145}
146$code.=<<___;
147 srlx $T1,$tmp32,$T1
148 or $T1,@X[7],@X[7]
149.Laligned:
150___
151 }
152
153 if ($i&1) {
154 $code.="\tadd @X[$i/2],$h,$T1\n";
155 } else {
156 $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
157 }
158} if ($SZ==4);
159
160########### SHA512
161$Xload = sub {
162my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
163my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
164
165$code.=<<___ if ($i==0);
166 ld [$inp+0],%l0
167 ld [$inp+4],%l1
168 ld [$inp+8],%l2
169 ld [$inp+12],%l3
170 ld [$inp+16],%l4
171 ld [$inp+20],%l5
172 ld [$inp+24],%l6
173 ld [$inp+28],%l7
174___
175$code.=<<___ if ($i<15);
176 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
177 add $tmp31,32,$tmp0
178 sllx @pair[0],$tmp0,$tmp1
179 `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
180 srlx @pair[2],$tmp32,@pair[1]
181 or $tmp1,$tmp2,$tmp2
182 or @pair[1],$tmp2,$tmp2
183 `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
184 add $h,$tmp2,$T1
185 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
186___
187$code.=<<___ if ($i==12);
188 brnz,a $tmp31,.+8
189 ld [$inp+128],%l0
190___
191$code.=<<___ if ($i==15);
192 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
193 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
194 add $tmp31,32,$tmp0
195 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
196 sllx @pair[0],$tmp0,$tmp1
197 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
198 srlx @pair[2],$tmp32,@pair[1]
199 or $tmp1,$tmp2,$tmp2
200 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
201 or @pair[1],$tmp2,$tmp2
202 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
203 add $h,$tmp2,$T1
204 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
205 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
206 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
207 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
208___
209} if ($SZ==8);
210
211########### common
212sub BODY_00_15 {
213my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
214
215 if ($i<16) {
216 &$Xload(@_);
217 } else {
218 $code.="\tadd $h,$T1,$T1\n";
219 }
220
221$code.=<<___;
222 $SRL $e,@Sigma1[0],$h !! $i
223 xor $f,$g,$tmp2
224 $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
225 and $e,$tmp2,$tmp2
226 $SRL $e,@Sigma1[1],$tmp0
227 xor $tmp1,$h,$h
228 $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
229 xor $tmp0,$h,$h
230 $SRL $e,@Sigma1[2],$tmp0
231 xor $tmp1,$h,$h
232 $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
233 xor $tmp0,$h,$h
234 xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
235 xor $tmp1,$h,$tmp0 ! Sigma1(e)
236
237 $SRL $a,@Sigma0[0],$h
238 add $tmp2,$T1,$T1
239 $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
240 $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
241 add $tmp0,$T1,$T1
242 $SRL $a,@Sigma0[1],$tmp0
243 xor $tmp1,$h,$h
244 $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
245 xor $tmp0,$h,$h
246 $SRL $a,@Sigma0[2],$tmp0
247 xor $tmp1,$h,$h
248 $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
249 xor $tmp0,$h,$h
250 xor $tmp1,$h,$h ! Sigma0(a)
251
252 or $a,$b,$tmp0
253 and $a,$b,$tmp1
254 and $c,$tmp0,$tmp0
255 or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
256 add $tmp2,$T1,$T1 ! +=K[$i]
257 add $tmp1,$h,$h
258
259 add $T1,$d,$d
260 add $T1,$h,$h
261___
262}
263
264########### SHA256
265$BODY_16_XX = sub {
266my $i=@_[0];
267my $xi;
268
269 if ($i&1) {
270 $xi=$tmp32;
271 $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
272 } else {
273 $xi=@X[(($i+1)/2)%8];
274 }
275$code.=<<___;
276 srl $xi,@sigma0[0],$T1 !! Xupdate($i)
277 sll $xi,`32-@sigma0[2]`,$tmp1
278 srl $xi,@sigma0[1],$tmp0
279 xor $tmp1,$T1,$T1
280 sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
281 xor $tmp0,$T1,$T1
282 srl $xi,@sigma0[2],$tmp0
283 xor $tmp1,$T1,$T1
284___
285 if ($i&1) {
286 $xi=@X[(($i+14)/2)%8];
287 } else {
288 $xi=$tmp32;
289 $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
290 }
291$code.=<<___;
292 srl $xi,@sigma1[0],$tmp2
293 xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
294 sll $xi,`32-@sigma1[2]`,$tmp1
295 srl $xi,@sigma1[1],$tmp0
296 xor $tmp1,$tmp2,$tmp2
297 sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
298 xor $tmp0,$tmp2,$tmp2
299 srl $xi,@sigma1[2],$tmp0
300 xor $tmp1,$tmp2,$tmp2
301___
302 if ($i&1) {
303 $xi=@X[($i/2)%8];
304$code.=<<___;
305 srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
306 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
307 srl @X[($i/2)%8],0,$tmp0
308 add $xi,$T1,$T1 ! +=X[i]
309 xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
310 add $tmp2,$T1,$T1
311 add $tmp1,$T1,$T1
312
313 srl $T1,0,$T1
314 or $T1,@X[($i/2)%8],@X[($i/2)%8]
315___
316 } else {
317 $xi=@X[(($i+9)/2)%8];
318$code.=<<___;
319 srlx @X[($i/2)%8],32,$tmp1 ! X[i]
320 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
321 srl @X[($i/2)%8],0,@X[($i/2)%8]
322 add $xi,$T1,$T1 ! +=X[i+9]
323 add $tmp2,$T1,$T1
324 add $tmp1,$T1,$T1
325
326 sllx $T1,32,$tmp0
327 or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
328___
329 }
330 &BODY_00_15(@_);
331} if ($SZ==4);
332
333########### SHA512
334$BODY_16_XX = sub {
335my $i=@_[0];
336my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
337
338$code.=<<___;
339 sllx %l2,32,$tmp0 !! Xupdate($i)
340 or %l3,$tmp0,$tmp0
341
342 srlx $tmp0,@sigma0[0],$T1
343 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
344 sllx $tmp0,`64-@sigma0[2]`,$tmp1
345 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
346 srlx $tmp0,@sigma0[1],$tmp0
347 xor $tmp1,$T1,$T1
348 sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
349 xor $tmp0,$T1,$T1
350 srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
351 xor $tmp1,$T1,$T1
352 sllx %l6,32,$tmp2
353 xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
354 or %l7,$tmp2,$tmp2
355
356 srlx $tmp2,@sigma1[0],$tmp1
357 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
358 sllx $tmp2,`64-@sigma1[2]`,$tmp0
359 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
360 srlx $tmp2,@sigma1[1],$tmp2
361 xor $tmp0,$tmp1,$tmp1
362 sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
363 xor $tmp2,$tmp1,$tmp1
364 srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
365 xor $tmp0,$tmp1,$tmp1
366 sllx %l4,32,$tmp0
367 xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
368 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
369 or %l5,$tmp0,$tmp0
370 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
371
372 sllx %l0,32,$tmp2
373 add $tmp1,$T1,$T1
374 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
375 or %l1,$tmp2,$tmp2
376 add $tmp0,$T1,$T1 ! +=X[$i+9]
377 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
378 add $tmp2,$T1,$T1 ! +=X[$i]
379 $ST $T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
380___
381 &BODY_00_15(@_);
382} if ($SZ==8);
383
384$code.=<<___ if ($bits==64);
385.register %g2,#scratch
386.register %g3,#scratch
387___
388$code.=<<___;
389.section ".text",#alloc,#execinstr
390
391.align 64
392K${label}:
393.type K${label},#object
394___
395if ($SZ==4) {
396$code.=<<___;
397 .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
398 .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
399 .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
400 .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
401 .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
402 .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
403 .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
404 .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
405 .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
406 .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
407 .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
408 .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
409 .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
410 .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
411 .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
412 .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
413___
414} else {
415$code.=<<___;
416 .long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
417 .long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
418 .long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
419 .long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
420 .long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
421 .long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
422 .long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
423 .long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
424 .long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
425 .long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
426 .long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
427 .long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
428 .long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
429 .long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
430 .long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
431 .long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
432 .long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
433 .long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
434 .long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
435 .long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
436 .long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
437 .long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
438 .long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
439 .long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
440 .long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
441 .long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
442 .long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
443 .long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
444 .long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
445 .long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
446 .long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
447 .long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
448 .long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
449 .long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
450 .long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
451 .long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
452 .long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
453 .long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
454 .long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
455 .long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
456___
457}
458$code.=<<___;
459.size K${label},.-K${label}
460.globl sha${label}_block_data_order
461sha${label}_block_data_order:
462 save %sp,`-$frame-$locals`,%sp
463 and $inp,`$align-1`,$tmp31
464 sllx $len,`log(16*$SZ)/log(2)`,$len
465 andn $inp,`$align-1`,$inp
466 sll $tmp31,3,$tmp31
467 add $inp,$len,$len
468___
469$code.=<<___ if ($SZ==8); # SHA512
470 mov 32,$tmp32
471 sub $tmp32,$tmp31,$tmp32
472___
473$code.=<<___;
474.Lpic: call .+8
475 add %o7,K${label}-.Lpic,$Ktbl
476
477 $LD [$ctx+`0*$SZ`],$A
478 $LD [$ctx+`1*$SZ`],$B
479 $LD [$ctx+`2*$SZ`],$C
480 $LD [$ctx+`3*$SZ`],$D
481 $LD [$ctx+`4*$SZ`],$E
482 $LD [$ctx+`5*$SZ`],$F
483 $LD [$ctx+`6*$SZ`],$G
484 $LD [$ctx+`7*$SZ`],$H
485
486.Lloop:
487___
488for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
489$code.=".L16_xx:\n";
490for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
491$code.=<<___;
492 and $tmp2,0xfff,$tmp2
493 cmp $tmp2,$lastK
494 bne .L16_xx
495 add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
496
497___
498$code.=<<___ if ($SZ==4); # SHA256
499 $LD [$ctx+`0*$SZ`],@X[0]
500 $LD [$ctx+`1*$SZ`],@X[1]
501 $LD [$ctx+`2*$SZ`],@X[2]
502 $LD [$ctx+`3*$SZ`],@X[3]
503 $LD [$ctx+`4*$SZ`],@X[4]
504 $LD [$ctx+`5*$SZ`],@X[5]
505 $LD [$ctx+`6*$SZ`],@X[6]
506 $LD [$ctx+`7*$SZ`],@X[7]
507
508 add $A,@X[0],$A
509 $ST $A,[$ctx+`0*$SZ`]
510 add $B,@X[1],$B
511 $ST $B,[$ctx+`1*$SZ`]
512 add $C,@X[2],$C
513 $ST $C,[$ctx+`2*$SZ`]
514 add $D,@X[3],$D
515 $ST $D,[$ctx+`3*$SZ`]
516 add $E,@X[4],$E
517 $ST $E,[$ctx+`4*$SZ`]
518 add $F,@X[5],$F
519 $ST $F,[$ctx+`5*$SZ`]
520 add $G,@X[6],$G
521 $ST $G,[$ctx+`6*$SZ`]
522 add $H,@X[7],$H
523 $ST $H,[$ctx+`7*$SZ`]
524___
525$code.=<<___ if ($SZ==8); # SHA512
526 ld [$ctx+`0*$SZ+0`],%l0
527 ld [$ctx+`0*$SZ+4`],%l1
528 ld [$ctx+`1*$SZ+0`],%l2
529 ld [$ctx+`1*$SZ+4`],%l3
530 ld [$ctx+`2*$SZ+0`],%l4
531 ld [$ctx+`2*$SZ+4`],%l5
532 ld [$ctx+`3*$SZ+0`],%l6
533
534 sllx %l0,32,$tmp0
535 ld [$ctx+`3*$SZ+4`],%l7
536 sllx %l2,32,$tmp1
537 or %l1,$tmp0,$tmp0
538 or %l3,$tmp1,$tmp1
539 add $tmp0,$A,$A
540 add $tmp1,$B,$B
541 $ST $A,[$ctx+`0*$SZ`]
542 sllx %l4,32,$tmp2
543 $ST $B,[$ctx+`1*$SZ`]
544 sllx %l6,32,$T1
545 or %l5,$tmp2,$tmp2
546 or %l7,$T1,$T1
547 add $tmp2,$C,$C
548 $ST $C,[$ctx+`2*$SZ`]
549 add $T1,$D,$D
550 $ST $D,[$ctx+`3*$SZ`]
551
552 ld [$ctx+`4*$SZ+0`],%l0
553 ld [$ctx+`4*$SZ+4`],%l1
554 ld [$ctx+`5*$SZ+0`],%l2
555 ld [$ctx+`5*$SZ+4`],%l3
556 ld [$ctx+`6*$SZ+0`],%l4
557 ld [$ctx+`6*$SZ+4`],%l5
558 ld [$ctx+`7*$SZ+0`],%l6
559
560 sllx %l0,32,$tmp0
561 ld [$ctx+`7*$SZ+4`],%l7
562 sllx %l2,32,$tmp1
563 or %l1,$tmp0,$tmp0
564 or %l3,$tmp1,$tmp1
565 add $tmp0,$E,$E
566 add $tmp1,$F,$F
567 $ST $E,[$ctx+`4*$SZ`]
568 sllx %l4,32,$tmp2
569 $ST $F,[$ctx+`5*$SZ`]
570 sllx %l6,32,$T1
571 or %l5,$tmp2,$tmp2
572 or %l7,$T1,$T1
573 add $tmp2,$G,$G
574 $ST $G,[$ctx+`6*$SZ`]
575 add $T1,$H,$H
576 $ST $H,[$ctx+`7*$SZ`]
577___
578$code.=<<___;
579 add $inp,`16*$SZ`,$inp ! advance inp
580 cmp $inp,$len
581 bne `$bits==64?"%xcc":"%icc"`,.Lloop
582 sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
583
584 ret
585 restore
586.type sha${label}_block_data_order,#function
587.size sha${label}_block_data_order,(.-sha${label}_block_data_order)
588.asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
589.align 4
590___
591
592$code =~ s/\`([^\`]*)\`/eval $1/gem;
593print $code;
594close STDOUT;
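The recurring sllx/or pairs in this module are the price of the 32-/64-bit ABI duality discussed at the top: each 64-bit X[] entry lives as two 32-bit halves in %l0-%l7 and is glued back together on demand. The same fuse step in portable C, as a sketch (fuse64 is an illustrative name, not from the module):

#include <stdint.h>

/* What "sllx hi,32,t; or lo,t,x" computes: rejoin the high and low
 * 32-bit halves of an X[] entry into one 64-bit word. */
static uint64_t fuse64(uint32_t hi, uint32_t lo)
{
	return ((uint64_t)hi << 32) | (uint64_t)lo;
}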
diff --git a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl b/src/lib/libcrypto/sha/asm/sha512-x86_64.pl
deleted file mode 100755
index e6643f8cf6..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl
+++ /dev/null
@@ -1,456 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary
6# forms are granted according to the OpenSSL license.
7# ====================================================================
8#
9# sha256/512_block procedure for x86_64.
10#
11# 40% improvement over compiler-generated code on Opteron. On EM64T
12# sha256 was observed to run >80% faster and sha512 - >40%. No magical
13# tricks, just straight implementation... I really wonder why gcc
14# [being armed with inline assembler] fails to generate as fast code.
15# The only thing which is cool about this module is that it's very
16# same instruction sequence used for both SHA-256 and SHA-512. In
17# former case the instructions operate on 32-bit operands, while in
18# latter - on 64-bit ones. All I had to do is to get one flavor right,
19# the other one passed the test right away:-)
20#
21# sha256_block runs in ~1005 cycles on Opteron, which gives you
22# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
23# frequency in GHz. sha512_block runs in ~1275 cycles, which results
24# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
25# Well, if you compare it to the IA-64 implementation, which maintains
26# X[16] in the register bank[!], sustains close to 4 instructions per
27# CPU clock cycle and runs in 1003 cycles, 1275 is a very good result
28# for the 3-way issue Opteron pipeline with X[16] maintained in memory.
29# So *if* there is a way to improve it, *then* the only way would be to
30# try to offload X[16] updates to the SSE unit, but that would require a
31# "deeper" loop unroll, which in turn would naturally cause size blow-up,
32# not to mention increased complexity! And once again, this holds only
33# *if* it's actually possible to noticeably improve overall ILP
34# [instruction-level parallelism] on the given CPU implementation.
35#
36# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
37# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
38# [currently available] EM64T CPUs apparently are far from it. On the
39# contrary, the 64-bit version, sha512_block, is ~30% *slower* than the
40# 32-bit sha256_block:-( This is presumably because 64-bit shifts/rotates
41# are not atomic instructions, but are implemented in microcode.
42
43$flavour = shift;
44$output = shift;
45if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
46
47$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
48
49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
51( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
52die "can't locate x86_64-xlate.pl";
53
54open STDOUT,"| $^X $xlate $flavour $output";
55
56if ($output =~ /512/) {
57 $func="sha512_block_data_order";
58 $TABLE="K512";
59 $SZ=8;
60 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
61 "%r8", "%r9", "%r10","%r11");
62 ($T1,$a0,$a1,$a2)=("%r12","%r13","%r14","%r15");
63 @Sigma0=(28,34,39);
64 @Sigma1=(14,18,41);
65 @sigma0=(1, 8, 7);
66 @sigma1=(19,61, 6);
67 $rounds=80;
68} else {
69 $func="sha256_block_data_order";
70 $TABLE="K256";
71 $SZ=4;
72 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
73 "%r8d","%r9d","%r10d","%r11d");
74 ($T1,$a0,$a1,$a2)=("%r12d","%r13d","%r14d","%r15d");
75 @Sigma0=( 2,13,22);
76 @Sigma1=( 6,11,25);
77 @sigma0=( 7,18, 3);
78 @sigma1=(17,19,10);
79 $rounds=64;
80}
81
82$ctx="%rdi"; # 1st arg
83$round="%rdi"; # zaps $ctx
84$inp="%rsi"; # 2nd arg
85$Tbl="%rbp";
86
87$_ctx="16*$SZ+0*8(%rsp)";
88$_inp="16*$SZ+1*8(%rsp)";
89$_end="16*$SZ+2*8(%rsp)";
90$_rsp="16*$SZ+3*8(%rsp)";
91$framesz="16*$SZ+4*8";
92
93
94sub ROUND_00_15()
95{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
96
97$code.=<<___;
98 mov $e,$a0
99 mov $e,$a1
100 mov $f,$a2
101
102 ror \$$Sigma1[0],$a0
103 ror \$$Sigma1[1],$a1
104 xor $g,$a2 # f^g
105
106 xor $a1,$a0
107 ror \$`$Sigma1[2]-$Sigma1[1]`,$a1
108 and $e,$a2 # (f^g)&e
109 mov $T1,`$SZ*($i&0xf)`(%rsp)
110
111 xor $a1,$a0 # Sigma1(e)
112 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
113 add $h,$T1 # T1+=h
114
115 mov $a,$h
116 add $a0,$T1 # T1+=Sigma1(e)
117
118 add $a2,$T1 # T1+=Ch(e,f,g)
119 mov $a,$a0
120 mov $a,$a1
121
122 ror \$$Sigma0[0],$h
123 ror \$$Sigma0[1],$a0
124 mov $a,$a2
125 add ($Tbl,$round,$SZ),$T1 # T1+=K[round]
126
127 xor $a0,$h
128 ror \$`$Sigma0[2]-$Sigma0[1]`,$a0
129 or $c,$a1 # a|c
130
131 xor $a0,$h # h=Sigma0(a)
132 and $c,$a2 # a&c
133 add $T1,$d # d+=T1
134
135 and $b,$a1 # (a|c)&b
136 add $T1,$h # h+=T1
137
138 or $a2,$a1 # Maj(a,b,c)=((a|c)&b)|(a&c)
139 lea 1($round),$round # round++
140
141 add $a1,$h # h+=Maj(a,b,c)
142___
143}
144
145sub ROUND_16_XX()
146{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
147
148$code.=<<___;
149 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
150 mov `$SZ*(($i+14)&0xf)`(%rsp),$T1
151
152 mov $a0,$a2
153
154 shr \$$sigma0[2],$a0
155 ror \$$sigma0[0],$a2
156
157 xor $a2,$a0
158 ror \$`$sigma0[1]-$sigma0[0]`,$a2
159
160 xor $a2,$a0 # sigma0(X[(i+1)&0xf])
161 mov $T1,$a1
162
163 shr \$$sigma1[2],$T1
164 ror \$$sigma1[0],$a1
165
166 xor $a1,$T1
167 ror \$`$sigma1[1]-$sigma1[0]`,$a1
168
169 xor $a1,$T1 # sigma1(X[(i+14)&0xf])
170
171 add $a0,$T1
172
173 add `$SZ*(($i+9)&0xf)`(%rsp),$T1
174
175 add `$SZ*($i&0xf)`(%rsp),$T1
176___
177 &ROUND_00_15(@_);
178}
179
180$code=<<___;
181.text
182
183.globl $func
184.type $func,\@function,4
185.align 16
186$func:
187 push %rbx
188 push %rbp
189 push %r12
190 push %r13
191 push %r14
192 push %r15
193 mov %rsp,%r11 # copy %rsp
194 shl \$4,%rdx # num*16
195 sub \$$framesz,%rsp
196 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
197 and \$-64,%rsp # align stack frame
198 mov $ctx,$_ctx # save ctx, 1st arg
199	mov	$inp,$_inp	# save inp, 2nd arg
200 mov %rdx,$_end # save end pointer, "3rd" arg
201 mov %r11,$_rsp # save copy of %rsp
202.Lprologue:
203
204 lea $TABLE(%rip),$Tbl
205
206 mov $SZ*0($ctx),$A
207 mov $SZ*1($ctx),$B
208 mov $SZ*2($ctx),$C
209 mov $SZ*3($ctx),$D
210 mov $SZ*4($ctx),$E
211 mov $SZ*5($ctx),$F
212 mov $SZ*6($ctx),$G
213 mov $SZ*7($ctx),$H
214 jmp .Lloop
215
216.align 16
217.Lloop:
218 xor $round,$round
219___
220 for($i=0;$i<16;$i++) {
221 $code.=" mov $SZ*$i($inp),$T1\n";
222 $code.=" bswap $T1\n";
223 &ROUND_00_15($i,@ROT);
224 unshift(@ROT,pop(@ROT));
225 }
226$code.=<<___;
227 jmp .Lrounds_16_xx
228.align 16
229.Lrounds_16_xx:
230___
231 for(;$i<32;$i++) {
232 &ROUND_16_XX($i,@ROT);
233 unshift(@ROT,pop(@ROT));
234 }
235
236$code.=<<___;
237 cmp \$$rounds,$round
238 jb .Lrounds_16_xx
239
240 mov $_ctx,$ctx
241 lea 16*$SZ($inp),$inp
242
243 add $SZ*0($ctx),$A
244 add $SZ*1($ctx),$B
245 add $SZ*2($ctx),$C
246 add $SZ*3($ctx),$D
247 add $SZ*4($ctx),$E
248 add $SZ*5($ctx),$F
249 add $SZ*6($ctx),$G
250 add $SZ*7($ctx),$H
251
252 cmp $_end,$inp
253
254 mov $A,$SZ*0($ctx)
255 mov $B,$SZ*1($ctx)
256 mov $C,$SZ*2($ctx)
257 mov $D,$SZ*3($ctx)
258 mov $E,$SZ*4($ctx)
259 mov $F,$SZ*5($ctx)
260 mov $G,$SZ*6($ctx)
261 mov $H,$SZ*7($ctx)
262 jb .Lloop
263
264 mov $_rsp,%rsi
265 mov (%rsi),%r15
266 mov 8(%rsi),%r14
267 mov 16(%rsi),%r13
268 mov 24(%rsi),%r12
269 mov 32(%rsi),%rbp
270 mov 40(%rsi),%rbx
271 lea 48(%rsi),%rsp
272.Lepilogue:
273 ret
274.size $func,.-$func
275___
276
277if ($SZ==4) {
278$code.=<<___;
279.align 64
280.type $TABLE,\@object
281$TABLE:
282 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
283 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
284 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
285 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
286 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
287 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
288 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
289 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
290 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
291 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
292 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
293 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
294 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
295 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
296 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
297 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
298___
299} else {
300$code.=<<___;
301.align 64
302.type $TABLE,\@object
303$TABLE:
304 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
305 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
306 .quad 0x3956c25bf348b538,0x59f111f1b605d019
307 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
308 .quad 0xd807aa98a3030242,0x12835b0145706fbe
309 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
310 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
311 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
312 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
313 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
314 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
315 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
316 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
317 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
318 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
319 .quad 0x06ca6351e003826f,0x142929670a0e6e70
320 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
321 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
322 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
323 .quad 0x81c2c92e47edaee6,0x92722c851482353b
324 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
325 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
326 .quad 0xd192e819d6ef5218,0xd69906245565a910
327 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
328 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
329 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
330 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
331 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
332 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
333 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
334 .quad 0x90befffa23631e28,0xa4506cebde82bde9
335 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
336 .quad 0xca273eceea26619c,0xd186b8c721c0c207
337 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
338 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
339 .quad 0x113f9804bef90dae,0x1b710b35131c471b
340 .quad 0x28db77f523047d84,0x32caab7b40c72493
341 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
342 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
343 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
344___
345}
346
347# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
348# CONTEXT *context,DISPATCHER_CONTEXT *disp)
349if ($win64) {
350$rec="%rcx";
351$frame="%rdx";
352$context="%r8";
353$disp="%r9";
354
355$code.=<<___;
356.extern __imp_RtlVirtualUnwind
357.type se_handler,\@abi-omnipotent
358.align 16
359se_handler:
360 push %rsi
361 push %rdi
362 push %rbx
363 push %rbp
364 push %r12
365 push %r13
366 push %r14
367 push %r15
368 pushfq
369 sub \$64,%rsp
370
371 mov 120($context),%rax # pull context->Rax
372 mov 248($context),%rbx # pull context->Rip
373
374 lea .Lprologue(%rip),%r10
375 cmp %r10,%rbx # context->Rip<.Lprologue
376 jb .Lin_prologue
377
378 mov 152($context),%rax # pull context->Rsp
379
380 lea .Lepilogue(%rip),%r10
381 cmp %r10,%rbx # context->Rip>=.Lepilogue
382 jae .Lin_prologue
383
384 mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
385 lea 48(%rax),%rax
386
387 mov -8(%rax),%rbx
388 mov -16(%rax),%rbp
389 mov -24(%rax),%r12
390 mov -32(%rax),%r13
391 mov -40(%rax),%r14
392 mov -48(%rax),%r15
393 mov %rbx,144($context) # restore context->Rbx
394 mov %rbp,160($context) # restore context->Rbp
395 mov %r12,216($context) # restore context->R12
396 mov %r13,224($context) # restore context->R13
397 mov %r14,232($context) # restore context->R14
398 mov %r15,240($context) # restore context->R15
399
400.Lin_prologue:
401 mov 8(%rax),%rdi
402 mov 16(%rax),%rsi
403 mov %rax,152($context) # restore context->Rsp
404 mov %rsi,168($context) # restore context->Rsi
405 mov %rdi,176($context) # restore context->Rdi
406
407 mov 40($disp),%rdi # disp->ContextRecord
408 mov $context,%rsi # context
409	mov	\$154,%ecx		# sizeof(CONTEXT)/8, count in quadwords
410 .long 0xa548f3fc # cld; rep movsq
411
412 mov $disp,%rsi
413 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
414 mov 8(%rsi),%rdx # arg2, disp->ImageBase
415 mov 0(%rsi),%r8 # arg3, disp->ControlPc
416 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
417 mov 40(%rsi),%r10 # disp->ContextRecord
418 lea 56(%rsi),%r11 # &disp->HandlerData
419 lea 24(%rsi),%r12 # &disp->EstablisherFrame
420 mov %r10,32(%rsp) # arg5
421 mov %r11,40(%rsp) # arg6
422 mov %r12,48(%rsp) # arg7
423 mov %rcx,56(%rsp) # arg8, (NULL)
424 call *__imp_RtlVirtualUnwind(%rip)
425
426 mov \$1,%eax # ExceptionContinueSearch
427 add \$64,%rsp
428 popfq
429 pop %r15
430 pop %r14
431 pop %r13
432 pop %r12
433 pop %rbp
434 pop %rbx
435 pop %rdi
436 pop %rsi
437 ret
438.size se_handler,.-se_handler
439
440.section .pdata
441.align 4
442 .rva .LSEH_begin_$func
443 .rva .LSEH_end_$func
444 .rva .LSEH_info_$func
445
446.section .xdata
447.align 8
448.LSEH_info_$func:
449 .byte 9,0,0,0
450 .rva se_handler
451___
452}
453
454$code =~ s/\`([^\`]*)\`/eval $1/gem;
455print $code;
456close STDOUT;
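The rounds above never evaluate Ch and Maj from their textbook definitions; they use the cheaper forms annotated in the comments, Ch(e,f,g) = ((f^g)&e)^g and Maj(a,b,c) = ((a|c)&b)|(a&c), each needing a single temporary. A standalone C sketch checking both against the FIPS-180-2 definitions (verification only, not part of the module):

#include <assert.h>
#include <stdint.h>

/* Textbook FIPS-180-2 definitions. */
static uint32_t ch(uint32_t x, uint32_t y, uint32_t z)  { return (x & y) ^ (~x & z); }
static uint32_t maj(uint32_t x, uint32_t y, uint32_t z) { return (x & y) ^ (x & z) ^ (y & z); }

int main(void)
{
	uint32_t x = 0xdeadbeef, y = 0x0f0f0f0f, z = 0x12345678;

	assert((((y ^ z) & x) ^ z) == ch(x, y, z));		/* one-temporary Ch */
	assert((((x | z) & y) | (x & z)) == maj(x, y, z));	/* or/and Maj */
	return 0;
}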
diff --git a/src/lib/libcrypto/sha/sha.h b/src/lib/libcrypto/sha/sha.h
deleted file mode 100644
index 16cacf9fc0..0000000000
--- a/src/lib/libcrypto/sha/sha.h
+++ /dev/null
@@ -1,200 +0,0 @@
1/* crypto/sha/sha.h */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#ifndef HEADER_SHA_H
60#define HEADER_SHA_H
61
62#include <openssl/e_os2.h>
63#include <stddef.h>
64
65#ifdef __cplusplus
66extern "C" {
67#endif
68
69#if defined(OPENSSL_NO_SHA) || (defined(OPENSSL_NO_SHA0) && defined(OPENSSL_NO_SHA1))
70#error SHA is disabled.
71#endif
72
73#if defined(OPENSSL_FIPS)
74#define FIPS_SHA_SIZE_T size_t
75#endif
76
77/*
78 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
79 * ! SHA_LONG has to be at least 32 bits wide. If it's wider, then !
80 * ! SHA_LONG_LOG2 has to be defined along. !
81 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
82 */
83
84#if defined(__LP32__)
85#define SHA_LONG unsigned long
86#elif defined(OPENSSL_SYS_CRAY) || defined(__ILP64__)
87#define SHA_LONG unsigned long
88#define SHA_LONG_LOG2 3
89#else
90#define SHA_LONG unsigned int
91#endif
92
93#define SHA_LBLOCK 16
94#define SHA_CBLOCK (SHA_LBLOCK*4) /* SHA treats input data as a
95 * contiguous array of 32 bit
96 * wide big-endian values. */
97#define SHA_LAST_BLOCK (SHA_CBLOCK-8)
98#define SHA_DIGEST_LENGTH 20
99
100typedef struct SHAstate_st
101 {
102 SHA_LONG h0,h1,h2,h3,h4;
103 SHA_LONG Nl,Nh;
104 SHA_LONG data[SHA_LBLOCK];
105 unsigned int num;
106 } SHA_CTX;
107
108#ifndef OPENSSL_NO_SHA0
109int SHA_Init(SHA_CTX *c);
110int SHA_Update(SHA_CTX *c, const void *data, size_t len);
111int SHA_Final(unsigned char *md, SHA_CTX *c);
112unsigned char *SHA(const unsigned char *d, size_t n, unsigned char *md);
113void SHA_Transform(SHA_CTX *c, const unsigned char *data);
114#endif
115#ifndef OPENSSL_NO_SHA1
116int SHA1_Init(SHA_CTX *c);
117int SHA1_Update(SHA_CTX *c, const void *data, size_t len);
118int SHA1_Final(unsigned char *md, SHA_CTX *c);
119unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md);
120void SHA1_Transform(SHA_CTX *c, const unsigned char *data);
121#endif
122
123#define SHA256_CBLOCK (SHA_LBLOCK*4) /* SHA-256 treats input data as a
124 * contiguous array of 32 bit
125 * wide big-endian values. */
126#define SHA224_DIGEST_LENGTH 28
127#define SHA256_DIGEST_LENGTH 32
128
129typedef struct SHA256state_st
130 {
131 SHA_LONG h[8];
132 SHA_LONG Nl,Nh;
133 SHA_LONG data[SHA_LBLOCK];
134 unsigned int num,md_len;
135 } SHA256_CTX;
136
137#ifndef OPENSSL_NO_SHA256
138int SHA224_Init(SHA256_CTX *c);
139int SHA224_Update(SHA256_CTX *c, const void *data, size_t len);
140int SHA224_Final(unsigned char *md, SHA256_CTX *c);
141unsigned char *SHA224(const unsigned char *d, size_t n,unsigned char *md);
142int SHA256_Init(SHA256_CTX *c);
143int SHA256_Update(SHA256_CTX *c, const void *data, size_t len);
144int SHA256_Final(unsigned char *md, SHA256_CTX *c);
145unsigned char *SHA256(const unsigned char *d, size_t n,unsigned char *md);
146void SHA256_Transform(SHA256_CTX *c, const unsigned char *data);
147#endif
148
149#define SHA384_DIGEST_LENGTH 48
150#define SHA512_DIGEST_LENGTH 64
151
152#ifndef OPENSSL_NO_SHA512
153/*
154 * Unlike 32-bit digest algorithms, SHA-512 *relies* on SHA_LONG64
155 * being exactly 64-bit wide. See Implementation Notes in sha512.c
156 * for further details.
157 */
158#define SHA512_CBLOCK (SHA_LBLOCK*8) /* SHA-512 treats input data as a
159 * contiguous array of 64 bit
160 * wide big-endian values. */
161#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
162#define SHA_LONG64 unsigned __int64
163#define U64(C) C##UI64
164#elif defined(__arch64__)
165#define SHA_LONG64 unsigned long
166#define U64(C) C##UL
167#else
168#define SHA_LONG64 unsigned long long
169#define U64(C) C##ULL
170#endif
171
172typedef struct SHA512state_st
173 {
174 SHA_LONG64 h[8];
175 SHA_LONG64 Nl,Nh;
176 union {
177 SHA_LONG64 d[SHA_LBLOCK];
178 unsigned char p[SHA512_CBLOCK];
179 } u;
180 unsigned int num,md_len;
181 } SHA512_CTX;
182#endif
183
184#ifndef OPENSSL_NO_SHA512
185int SHA384_Init(SHA512_CTX *c);
186int SHA384_Update(SHA512_CTX *c, const void *data, size_t len);
187int SHA384_Final(unsigned char *md, SHA512_CTX *c);
188unsigned char *SHA384(const unsigned char *d, size_t n,unsigned char *md);
189int SHA512_Init(SHA512_CTX *c);
190int SHA512_Update(SHA512_CTX *c, const void *data, size_t len);
191int SHA512_Final(unsigned char *md, SHA512_CTX *c);
192unsigned char *SHA512(const unsigned char *d, size_t n,unsigned char *md);
193void SHA512_Transform(SHA512_CTX *c, const unsigned char *data);
194#endif
195
196#ifdef __cplusplus
197}
198#endif
199
200#endif
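sha.h above declares both one-shot entry points (SHA256() and friends) and streaming ones that accept data in pieces. A minimal usage sketch of the streaming interface declared above, printing the digest as hex; illustrative only, link against libcrypto:

#include <stdio.h>
#include <openssl/sha.h>

int main(void)
{
	SHA256_CTX c;
	unsigned char md[SHA256_DIGEST_LENGTH];
	unsigned int i;

	if (!SHA256_Init(&c))
		return 1;
	SHA256_Update(&c, "hello, ", 7);	/* data may arrive in pieces */
	SHA256_Update(&c, "world", 5);
	SHA256_Final(md, &c);

	for (i = 0; i < SHA256_DIGEST_LENGTH; i++)
		printf("%02x", md[i]);
	putchar('\n');
	return 0;
}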
diff --git a/src/lib/libcrypto/sha/sha1_one.c b/src/lib/libcrypto/sha/sha1_one.c
deleted file mode 100644
index 7c65b60276..0000000000
--- a/src/lib/libcrypto/sha/sha1_one.c
+++ /dev/null
@@ -1,78 +0,0 @@
1/* crypto/sha/sha1_one.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60#include <string.h>
61#include <openssl/sha.h>
62#include <openssl/crypto.h>
63
64#ifndef OPENSSL_NO_SHA1
65unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md)
66 {
67 SHA_CTX c;
68 static unsigned char m[SHA_DIGEST_LENGTH];
69
70 if (md == NULL) md=m;
71 if (!SHA1_Init(&c))
72 return NULL;
73 SHA1_Update(&c,d,n);
74 SHA1_Final(md,&c);
75 OPENSSL_cleanse(&c,sizeof(c));
76 return(md);
77 }
78#endif
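Note the static fallback buffer m above: calling SHA1() with md == NULL returns a pointer to static storage, so such calls are not reentrant. Passing a caller-owned buffer sidesteps that, as in this sketch (digest_message is an illustrative wrapper, not a library function):

#include <stddef.h>
#include <openssl/sha.h>

/* Caller-owned output buffer: safe to use concurrently, unlike md == NULL. */
void digest_message(const unsigned char *buf, size_t len,
    unsigned char out[SHA_DIGEST_LENGTH])
{
	SHA1(buf, len, out);
}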
diff --git a/src/lib/libcrypto/sha/sha1dgst.c b/src/lib/libcrypto/sha/sha1dgst.c
deleted file mode 100644
index 50d1925cde..0000000000
--- a/src/lib/libcrypto/sha/sha1dgst.c
+++ /dev/null
@@ -1,74 +0,0 @@
1/* crypto/sha/sha1dgst.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <openssl/opensslconf.h>
60#if !defined(OPENSSL_NO_SHA1) && !defined(OPENSSL_NO_SHA)
61
62#undef SHA_0
63#define SHA_1
64
65#include <openssl/opensslv.h>
66
67const char SHA1_version[]="SHA1" OPENSSL_VERSION_PTEXT;
68
69/* The implementation is in ../md32_common.h */
70
71#include "sha_locl.h"
72
73#endif
74
diff --git a/src/lib/libcrypto/sha/sha256.c b/src/lib/libcrypto/sha/sha256.c
deleted file mode 100644
index 8952d87673..0000000000
--- a/src/lib/libcrypto/sha/sha256.c
+++ /dev/null
@@ -1,282 +0,0 @@
1/* crypto/sha/sha256.c */
2/* ====================================================================
3 * Copyright (c) 2004 The OpenSSL Project. All rights reserved
4 * according to the OpenSSL license [found in ../../LICENSE].
5 * ====================================================================
6 */
7#include <openssl/opensslconf.h>
8#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA256)
9
10#include <stdlib.h>
11#include <string.h>
12
13#include <openssl/crypto.h>
14#include <openssl/sha.h>
15#include <openssl/opensslv.h>
16
17const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT;
18
19int SHA224_Init (SHA256_CTX *c)
20 {
21 memset (c,0,sizeof(*c));
22 c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL;
23 c->h[2]=0x3070dd17UL; c->h[3]=0xf70e5939UL;
24 c->h[4]=0xffc00b31UL; c->h[5]=0x68581511UL;
25 c->h[6]=0x64f98fa7UL; c->h[7]=0xbefa4fa4UL;
26 c->md_len=SHA224_DIGEST_LENGTH;
27 return 1;
28 }
29
30int SHA256_Init (SHA256_CTX *c)
31 {
32 memset (c,0,sizeof(*c));
33 c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL;
34 c->h[2]=0x3c6ef372UL; c->h[3]=0xa54ff53aUL;
35 c->h[4]=0x510e527fUL; c->h[5]=0x9b05688cUL;
36 c->h[6]=0x1f83d9abUL; c->h[7]=0x5be0cd19UL;
37 c->md_len=SHA256_DIGEST_LENGTH;
38 return 1;
39 }
40
41unsigned char *SHA224(const unsigned char *d, size_t n, unsigned char *md)
42 {
43 SHA256_CTX c;
44 static unsigned char m[SHA224_DIGEST_LENGTH];
45
46 if (md == NULL) md=m;
47 SHA224_Init(&c);
48 SHA256_Update(&c,d,n);
49 SHA256_Final(md,&c);
50 OPENSSL_cleanse(&c,sizeof(c));
51 return(md);
52 }
53
54unsigned char *SHA256(const unsigned char *d, size_t n, unsigned char *md)
55 {
56 SHA256_CTX c;
57 static unsigned char m[SHA256_DIGEST_LENGTH];
58
59 if (md == NULL) md=m;
60 SHA256_Init(&c);
61 SHA256_Update(&c,d,n);
62 SHA256_Final(md,&c);
63 OPENSSL_cleanse(&c,sizeof(c));
64 return(md);
65 }
66
67int SHA224_Update(SHA256_CTX *c, const void *data, size_t len)
68{ return SHA256_Update (c,data,len); }
69int SHA224_Final (unsigned char *md, SHA256_CTX *c)
70{ return SHA256_Final (md,c); }
71
72#define DATA_ORDER_IS_BIG_ENDIAN
73
74#define HASH_LONG SHA_LONG
75#define HASH_CTX SHA256_CTX
76#define HASH_CBLOCK SHA_CBLOCK
77/*
78 * Note that FIPS180-2 discusses "Truncation of the Hash Function Output."
79 * The default: case below covers for it. It's not clear, however, whether
80 * it's permitted to truncate to a number of bytes not divisible by 4. I bet
81 * not, but if it is, then the default: case shall be extended. For reference,
82 * the idea behind separate cases for pre-defined lengths is to let the
83 * compiler decide whether it's appropriate to unroll small loops.
84 */
85#define HASH_MAKE_STRING(c,s) do { \
86 unsigned long ll; \
87 unsigned int nn; \
88 switch ((c)->md_len) \
89 { case SHA224_DIGEST_LENGTH: \
90 for (nn=0;nn<SHA224_DIGEST_LENGTH/4;nn++) \
91 { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
92 break; \
93 case SHA256_DIGEST_LENGTH: \
94 for (nn=0;nn<SHA256_DIGEST_LENGTH/4;nn++) \
95 { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
96 break; \
97 default: \
98 if ((c)->md_len > SHA256_DIGEST_LENGTH) \
99 return 0; \
100 for (nn=0;nn<(c)->md_len/4;nn++) \
101 { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
102 break; \
103 } \
104 } while (0)
105
106#define HASH_UPDATE SHA256_Update
107#define HASH_TRANSFORM SHA256_Transform
108#define HASH_FINAL SHA256_Final
109#define HASH_BLOCK_DATA_ORDER sha256_block_data_order
110#ifndef SHA256_ASM
111static
112#endif
113void sha256_block_data_order (SHA256_CTX *ctx, const void *in, size_t num);
114
115#include "md32_common.h"
116
117#ifndef SHA256_ASM
118static const SHA_LONG K256[64] = {
119 0x428a2f98UL,0x71374491UL,0xb5c0fbcfUL,0xe9b5dba5UL,
120 0x3956c25bUL,0x59f111f1UL,0x923f82a4UL,0xab1c5ed5UL,
121 0xd807aa98UL,0x12835b01UL,0x243185beUL,0x550c7dc3UL,
122 0x72be5d74UL,0x80deb1feUL,0x9bdc06a7UL,0xc19bf174UL,
123 0xe49b69c1UL,0xefbe4786UL,0x0fc19dc6UL,0x240ca1ccUL,
124 0x2de92c6fUL,0x4a7484aaUL,0x5cb0a9dcUL,0x76f988daUL,
125 0x983e5152UL,0xa831c66dUL,0xb00327c8UL,0xbf597fc7UL,
126 0xc6e00bf3UL,0xd5a79147UL,0x06ca6351UL,0x14292967UL,
127 0x27b70a85UL,0x2e1b2138UL,0x4d2c6dfcUL,0x53380d13UL,
128 0x650a7354UL,0x766a0abbUL,0x81c2c92eUL,0x92722c85UL,
129 0xa2bfe8a1UL,0xa81a664bUL,0xc24b8b70UL,0xc76c51a3UL,
130 0xd192e819UL,0xd6990624UL,0xf40e3585UL,0x106aa070UL,
131 0x19a4c116UL,0x1e376c08UL,0x2748774cUL,0x34b0bcb5UL,
132 0x391c0cb3UL,0x4ed8aa4aUL,0x5b9cca4fUL,0x682e6ff3UL,
133 0x748f82eeUL,0x78a5636fUL,0x84c87814UL,0x8cc70208UL,
134 0x90befffaUL,0xa4506cebUL,0xbef9a3f7UL,0xc67178f2UL };
135
136/*
137 * The FIPS specification refers to right rotations, while our ROTATE
138 * macro is a left one. This is why you might notice that the rotation
139 * coefficients differ from those in the FIPS document by 32-N...
140 */
141#define Sigma0(x) (ROTATE((x),30) ^ ROTATE((x),19) ^ ROTATE((x),10))
142#define Sigma1(x) (ROTATE((x),26) ^ ROTATE((x),21) ^ ROTATE((x),7))
143#define sigma0(x) (ROTATE((x),25) ^ ROTATE((x),14) ^ ((x)>>3))
144#define sigma1(x) (ROTATE((x),15) ^ ROTATE((x),13) ^ ((x)>>10))
145
146#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
147#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
148
149#ifdef OPENSSL_SMALL_FOOTPRINT
150
151static void sha256_block_data_order (SHA256_CTX *ctx, const void *in, size_t num)
152 {
153 unsigned MD32_REG_T a,b,c,d,e,f,g,h,s0,s1,T1,T2;
154 SHA_LONG X[16],l;
155 int i;
156 const unsigned char *data=in;
157
158 while (num--) {
159
160 a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
161 e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7];
162
163 for (i=0;i<16;i++)
164 {
165 HOST_c2l(data,l); T1 = X[i] = l;
166 T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i];
167 T2 = Sigma0(a) + Maj(a,b,c);
168 h = g; g = f; f = e; e = d + T1;
169 d = c; c = b; b = a; a = T1 + T2;
170 }
171
172 for (;i<64;i++)
173 {
174 s0 = X[(i+1)&0x0f]; s0 = sigma0(s0);
175 s1 = X[(i+14)&0x0f]; s1 = sigma1(s1);
176
177 T1 = X[i&0xf] += s0 + s1 + X[(i+9)&0xf];
178 T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i];
179 T2 = Sigma0(a) + Maj(a,b,c);
180 h = g; g = f; f = e; e = d + T1;
181 d = c; c = b; b = a; a = T1 + T2;
182 }
183
184 ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
185 ctx->h[4] += e; ctx->h[5] += f; ctx->h[6] += g; ctx->h[7] += h;
186
187 }
188}
189
190#else
191
192#define ROUND_00_15(i,a,b,c,d,e,f,g,h) do { \
193 T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i]; \
194 h = Sigma0(a) + Maj(a,b,c); \
195 d += T1; h += T1; } while (0)
196
197#define ROUND_16_63(i,a,b,c,d,e,f,g,h,X) do { \
198 s0 = X[(i+1)&0x0f]; s0 = sigma0(s0); \
199 s1 = X[(i+14)&0x0f]; s1 = sigma1(s1); \
200 T1 = X[(i)&0x0f] += s0 + s1 + X[(i+9)&0x0f]; \
201 ROUND_00_15(i,a,b,c,d,e,f,g,h); } while (0)
202
203static void sha256_block_data_order (SHA256_CTX *ctx, const void *in, size_t num)
204 {
205 unsigned MD32_REG_T a,b,c,d,e,f,g,h,s0,s1,T1;
206 SHA_LONG X[16];
207 int i;
208 const unsigned char *data=in;
209 const union { long one; char little; } is_endian = {1};
210
211 while (num--) {
212
213 a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
214 e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7];
215
216 if (!is_endian.little && sizeof(SHA_LONG)==4 && ((size_t)in%4)==0)
217 {
218 const SHA_LONG *W=(const SHA_LONG *)data;
219
220 T1 = X[0] = W[0]; ROUND_00_15(0,a,b,c,d,e,f,g,h);
221 T1 = X[1] = W[1]; ROUND_00_15(1,h,a,b,c,d,e,f,g);
222 T1 = X[2] = W[2]; ROUND_00_15(2,g,h,a,b,c,d,e,f);
223 T1 = X[3] = W[3]; ROUND_00_15(3,f,g,h,a,b,c,d,e);
224 T1 = X[4] = W[4]; ROUND_00_15(4,e,f,g,h,a,b,c,d);
225 T1 = X[5] = W[5]; ROUND_00_15(5,d,e,f,g,h,a,b,c);
226 T1 = X[6] = W[6]; ROUND_00_15(6,c,d,e,f,g,h,a,b);
227 T1 = X[7] = W[7]; ROUND_00_15(7,b,c,d,e,f,g,h,a);
228 T1 = X[8] = W[8]; ROUND_00_15(8,a,b,c,d,e,f,g,h);
229 T1 = X[9] = W[9]; ROUND_00_15(9,h,a,b,c,d,e,f,g);
230 T1 = X[10] = W[10]; ROUND_00_15(10,g,h,a,b,c,d,e,f);
231 T1 = X[11] = W[11]; ROUND_00_15(11,f,g,h,a,b,c,d,e);
232 T1 = X[12] = W[12]; ROUND_00_15(12,e,f,g,h,a,b,c,d);
233 T1 = X[13] = W[13]; ROUND_00_15(13,d,e,f,g,h,a,b,c);
234 T1 = X[14] = W[14]; ROUND_00_15(14,c,d,e,f,g,h,a,b);
235 T1 = X[15] = W[15]; ROUND_00_15(15,b,c,d,e,f,g,h,a);
236
237 data += SHA256_CBLOCK;
238 }
239 else
240 {
241 SHA_LONG l;
242
243 HOST_c2l(data,l); T1 = X[0] = l; ROUND_00_15(0,a,b,c,d,e,f,g,h);
244 HOST_c2l(data,l); T1 = X[1] = l; ROUND_00_15(1,h,a,b,c,d,e,f,g);
245 HOST_c2l(data,l); T1 = X[2] = l; ROUND_00_15(2,g,h,a,b,c,d,e,f);
246 HOST_c2l(data,l); T1 = X[3] = l; ROUND_00_15(3,f,g,h,a,b,c,d,e);
247 HOST_c2l(data,l); T1 = X[4] = l; ROUND_00_15(4,e,f,g,h,a,b,c,d);
248 HOST_c2l(data,l); T1 = X[5] = l; ROUND_00_15(5,d,e,f,g,h,a,b,c);
249 HOST_c2l(data,l); T1 = X[6] = l; ROUND_00_15(6,c,d,e,f,g,h,a,b);
250 HOST_c2l(data,l); T1 = X[7] = l; ROUND_00_15(7,b,c,d,e,f,g,h,a);
251 HOST_c2l(data,l); T1 = X[8] = l; ROUND_00_15(8,a,b,c,d,e,f,g,h);
252 HOST_c2l(data,l); T1 = X[9] = l; ROUND_00_15(9,h,a,b,c,d,e,f,g);
253 HOST_c2l(data,l); T1 = X[10] = l; ROUND_00_15(10,g,h,a,b,c,d,e,f);
254 HOST_c2l(data,l); T1 = X[11] = l; ROUND_00_15(11,f,g,h,a,b,c,d,e);
255 HOST_c2l(data,l); T1 = X[12] = l; ROUND_00_15(12,e,f,g,h,a,b,c,d);
256 HOST_c2l(data,l); T1 = X[13] = l; ROUND_00_15(13,d,e,f,g,h,a,b,c);
257 HOST_c2l(data,l); T1 = X[14] = l; ROUND_00_15(14,c,d,e,f,g,h,a,b);
258 HOST_c2l(data,l); T1 = X[15] = l; ROUND_00_15(15,b,c,d,e,f,g,h,a);
259 }
260
261 for (i=16;i<64;i+=8)
262 {
263 ROUND_16_63(i+0,a,b,c,d,e,f,g,h,X);
264 ROUND_16_63(i+1,h,a,b,c,d,e,f,g,X);
265 ROUND_16_63(i+2,g,h,a,b,c,d,e,f,X);
266 ROUND_16_63(i+3,f,g,h,a,b,c,d,e,X);
267 ROUND_16_63(i+4,e,f,g,h,a,b,c,d,X);
268 ROUND_16_63(i+5,d,e,f,g,h,a,b,c,X);
269 ROUND_16_63(i+6,c,d,e,f,g,h,a,b,X);
270 ROUND_16_63(i+7,b,c,d,e,f,g,h,a,X);
271 }
272
273 ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
274 ctx->h[4] += e; ctx->h[5] += f; ctx->h[6] += g; ctx->h[7] += h;
275
276 }
277 }
278
279#endif
280#endif /* SHA256_ASM */
281
282#endif /* OPENSSL_NO_SHA256 */
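The rotation-direction note above (ROTATE is a left rotation, so the coefficients differ from the FIPS document by 32-N) is easy to check in standalone C; ROTL and ROTR below are local helpers, not macros from the module:

#include <assert.h>
#include <stdint.h>

#define ROTL(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
#define ROTR(x,n) (((x) >> (n)) | ((x) << (32 - (n))))

int main(void)
{
	uint32_t x = 0x428a2f98UL;

	/* Sigma0: FIPS-180-2 rotates right by 2,13,22; the module rotates
	 * left by 30,19,10 - the same thing. */
	assert((ROTL(x,30) ^ ROTL(x,19) ^ ROTL(x,10)) ==
	       (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22)));
	return 0;
}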
diff --git a/src/lib/libcrypto/sha/sha512.c b/src/lib/libcrypto/sha/sha512.c
deleted file mode 100644
index cbc0e58c48..0000000000
--- a/src/lib/libcrypto/sha/sha512.c
+++ /dev/null
@@ -1,641 +0,0 @@
1/* crypto/sha/sha512.c */
2/* ====================================================================
3 * Copyright (c) 2004 The OpenSSL Project. All rights reserved
4 * according to the OpenSSL license [found in ../../LICENSE].
5 * ====================================================================
6 */
7#include <openssl/opensslconf.h>
8#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA512)
9/*
10 * IMPLEMENTATION NOTES.
11 *
12 * As you might have noticed, 32-bit hash algorithms:
13 *
14 * - permit SHA_LONG to be wider than 32 bits (the case on CRAY);
15 * - optimized versions implement two transform functions: one operating
16 * on [aligned] data in host byte order and one - on data in input
17 * stream byte order;
18 * - share common byte-order neutral collector and padding function
19 * implementations, ../md32_common.h;
20 *
21 * None of the above applies to this SHA-512 implementation. Reasons
22 * [in reverse order] are:
23 *
24 * - it's the only 64-bit hash algorithm for the moment of this writing,
25 * there is no need for common collector/padding implementation [yet];
26 * - by supporting only one transform function [which operates on
27 * *aligned* data in input stream byte order, big-endian in this case]
28 * we minimize burden of maintenance in two ways: a) collector/padding
29 * function is simpler; b) only one transform function to stare at;
30 * - SHA_LONG64 is required to be exactly 64-bit in order to be able to
31 * apply a number of optimizations to mitigate potential performance
32 * penalties caused by previous design decision;
33 *
34 * Caveat lector.
35 *
36 * Implementation relies on the fact that "long long" is 64-bit on
37 * both 32- and 64-bit platforms. If some compiler vendor comes up
38 * with 128-bit long long, adjustment to sha.h would be required.
39 * As this implementation relies on 64-bit integer type, it's totally
40 * inappropriate for platforms which don't support it, most notably
41 * 16-bit platforms.
42 * <appro@fy.chalmers.se>
43 */
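
The note above rests on one hard assumption: the build environment provides a genuine 64-bit integer type and SHA_LONG64 maps onto it. A minimal compile-time guard could make that assumption explicit; this is a hypothetical sketch, not part of the original file, and assumes only SHA_LONG64 from <openssl/sha.h> plus 8-bit chars:

/* Hypothetical compile-time guard: the array size turns negative,
 * and the build fails, wherever SHA_LONG64 is not exactly 8 bytes. */
typedef char sha_long64_must_be_8_bytes[sizeof(SHA_LONG64) == 8 ? 1 : -1];
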
44#include <stdlib.h>
45#include <string.h>
46
47#include <openssl/crypto.h>
48#include <openssl/sha.h>
49#include <openssl/opensslv.h>
50
51#include "cryptlib.h"
52
53const char SHA512_version[]="SHA-512" OPENSSL_VERSION_PTEXT;
54
55#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
56 defined(__x86_64) || defined(_M_AMD64) || defined(_M_X64) || \
57 defined(__s390__) || defined(__s390x__) || \
58 defined(SHA512_ASM)
59#define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
60#endif
61
62int SHA384_Init (SHA512_CTX *c)
63 {
64#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
65 /* maintain dword order required by assembler module */
66 unsigned int *h = (unsigned int *)c->h;
67
68 h[0] = 0xcbbb9d5d; h[1] = 0xc1059ed8;
69 h[2] = 0x629a292a; h[3] = 0x367cd507;
70 h[4] = 0x9159015a; h[5] = 0x3070dd17;
71 h[6] = 0x152fecd8; h[7] = 0xf70e5939;
72 h[8] = 0x67332667; h[9] = 0xffc00b31;
73 h[10] = 0x8eb44a87; h[11] = 0x68581511;
74 h[12] = 0xdb0c2e0d; h[13] = 0x64f98fa7;
75 h[14] = 0x47b5481d; h[15] = 0xbefa4fa4;
76#else
77 c->h[0]=U64(0xcbbb9d5dc1059ed8);
78 c->h[1]=U64(0x629a292a367cd507);
79 c->h[2]=U64(0x9159015a3070dd17);
80 c->h[3]=U64(0x152fecd8f70e5939);
81 c->h[4]=U64(0x67332667ffc00b31);
82 c->h[5]=U64(0x8eb44a8768581511);
83 c->h[6]=U64(0xdb0c2e0d64f98fa7);
84 c->h[7]=U64(0x47b5481dbefa4fa4);
85#endif
86 c->Nl=0; c->Nh=0;
87 c->num=0; c->md_len=SHA384_DIGEST_LENGTH;
88 return 1;
89 }
90
91int SHA512_Init (SHA512_CTX *c)
92 {
93#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
94 /* maintain dword order required by assembler module */
95 unsigned int *h = (unsigned int *)c->h;
96
97 h[0] = 0x6a09e667; h[1] = 0xf3bcc908;
98 h[2] = 0xbb67ae85; h[3] = 0x84caa73b;
99 h[4] = 0x3c6ef372; h[5] = 0xfe94f82b;
100 h[6] = 0xa54ff53a; h[7] = 0x5f1d36f1;
101 h[8] = 0x510e527f; h[9] = 0xade682d1;
102 h[10] = 0x9b05688c; h[11] = 0x2b3e6c1f;
103 h[12] = 0x1f83d9ab; h[13] = 0xfb41bd6b;
104 h[14] = 0x5be0cd19; h[15] = 0x137e2179;
105#else
106 c->h[0]=U64(0x6a09e667f3bcc908);
107 c->h[1]=U64(0xbb67ae8584caa73b);
108 c->h[2]=U64(0x3c6ef372fe94f82b);
109 c->h[3]=U64(0xa54ff53a5f1d36f1);
110 c->h[4]=U64(0x510e527fade682d1);
111 c->h[5]=U64(0x9b05688c2b3e6c1f);
112 c->h[6]=U64(0x1f83d9abfb41bd6b);
113 c->h[7]=U64(0x5be0cd19137e2179);
114#endif
115 c->Nl=0; c->Nh=0;
116 c->num=0; c->md_len=SHA512_DIGEST_LENGTH;
117 return 1;
118 }
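
On the ARM assembler path above, each 64-bit chaining value is seeded as two 32-bit halves, most-significant word first. A hedged standalone check (hypothetical, using <stdint.h> rather than the library's own types) confirms the first word pair recombines into the constant used by the generic branch:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* first word pair written by the ARM branch of SHA512_Init */
	uint32_t hi = 0x6a09e667, lo = 0xf3bcc908;

	/* most-significant word first == the generic branch's U64 constant */
	assert((((uint64_t)hi << 32) | lo) == 0x6a09e667f3bcc908ULL);
	return 0;
}
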
119
120#ifndef SHA512_ASM
121static
122#endif
123void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num);
124
125int SHA512_Final (unsigned char *md, SHA512_CTX *c)
126 {
127 unsigned char *p=(unsigned char *)c->u.p;
128 size_t n=c->num;
129
130 p[n]=0x80; /* There is always room for one */
131 n++;
132 if (n > (sizeof(c->u)-16))
133 memset (p+n,0,sizeof(c->u)-n), n=0,
134 sha512_block_data_order (c,p,1);
135
136 memset (p+n,0,sizeof(c->u)-16-n);
137#ifdef B_ENDIAN
138 c->u.d[SHA_LBLOCK-2] = c->Nh;
139 c->u.d[SHA_LBLOCK-1] = c->Nl;
140#else
141 p[sizeof(c->u)-1] = (unsigned char)(c->Nl);
142 p[sizeof(c->u)-2] = (unsigned char)(c->Nl>>8);
143 p[sizeof(c->u)-3] = (unsigned char)(c->Nl>>16);
144 p[sizeof(c->u)-4] = (unsigned char)(c->Nl>>24);
145 p[sizeof(c->u)-5] = (unsigned char)(c->Nl>>32);
146 p[sizeof(c->u)-6] = (unsigned char)(c->Nl>>40);
147 p[sizeof(c->u)-7] = (unsigned char)(c->Nl>>48);
148 p[sizeof(c->u)-8] = (unsigned char)(c->Nl>>56);
149 p[sizeof(c->u)-9] = (unsigned char)(c->Nh);
150 p[sizeof(c->u)-10] = (unsigned char)(c->Nh>>8);
151 p[sizeof(c->u)-11] = (unsigned char)(c->Nh>>16);
152 p[sizeof(c->u)-12] = (unsigned char)(c->Nh>>24);
153 p[sizeof(c->u)-13] = (unsigned char)(c->Nh>>32);
154 p[sizeof(c->u)-14] = (unsigned char)(c->Nh>>40);
155 p[sizeof(c->u)-15] = (unsigned char)(c->Nh>>48);
156 p[sizeof(c->u)-16] = (unsigned char)(c->Nh>>56);
157#endif
158
159 sha512_block_data_order (c,p,1);
160
161 if (md==0) return 0;
162
163#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
164 /* recall assembler dword order... */
165 n = c->md_len;
166 if (n == SHA384_DIGEST_LENGTH || n == SHA512_DIGEST_LENGTH)
167 {
168 unsigned int *h = (unsigned int *)c->h, t;
169
170 for (n/=4;n;n--)
171 {
172 t = *(h++);
173 *(md++) = (unsigned char)(t>>24);
174 *(md++) = (unsigned char)(t>>16);
175 *(md++) = (unsigned char)(t>>8);
176 *(md++) = (unsigned char)(t);
177 }
178 }
179 else return 0;
180#else
181 switch (c->md_len)
182 {
183 /* Let the compiler decide whether it's appropriate to unroll... */
184 case SHA384_DIGEST_LENGTH:
185 for (n=0;n<SHA384_DIGEST_LENGTH/8;n++)
186 {
187 SHA_LONG64 t = c->h[n];
188
189 *(md++) = (unsigned char)(t>>56);
190 *(md++) = (unsigned char)(t>>48);
191 *(md++) = (unsigned char)(t>>40);
192 *(md++) = (unsigned char)(t>>32);
193 *(md++) = (unsigned char)(t>>24);
194 *(md++) = (unsigned char)(t>>16);
195 *(md++) = (unsigned char)(t>>8);
196 *(md++) = (unsigned char)(t);
197 }
198 break;
199 case SHA512_DIGEST_LENGTH:
200 for (n=0;n<SHA512_DIGEST_LENGTH/8;n++)
201 {
202 SHA_LONG64 t = c->h[n];
203
204 *(md++) = (unsigned char)(t>>56);
205 *(md++) = (unsigned char)(t>>48);
206 *(md++) = (unsigned char)(t>>40);
207 *(md++) = (unsigned char)(t>>32);
208 *(md++) = (unsigned char)(t>>24);
209 *(md++) = (unsigned char)(t>>16);
210 *(md++) = (unsigned char)(t>>8);
211 *(md++) = (unsigned char)(t);
212 }
213 break;
214 /* ... as well as make sure md_len is not abused. */
215 default: return 0;
216 }
217#endif
218 return 1;
219 }
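
The padding logic in SHA512_Final appends the mandatory 0x80 byte, zero-fills, and stores the 128-bit message bit count big-endian in the last 16 bytes of the 128-byte block, spilling into a second block when fewer than 17 bytes remain free. A hedged restatement of that arithmetic as a hypothetical helper (sha512_final_blocks is not part of the API):

#include <stddef.h>

/* Hypothetical helper: given how many message bytes sit in the 128-byte
 * collector, return how many blocks SHA512_Final will still process:
 * one if the 0x80 byte plus the 16-byte length field fit, otherwise two. */
static size_t sha512_final_blocks(size_t buffered)
{
	return (buffered + 1 <= 128 - 16) ? 1 : 2;
}
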
220
221int SHA384_Final (unsigned char *md,SHA512_CTX *c)
222{ return SHA512_Final (md,c); }
223
224int SHA512_Update (SHA512_CTX *c, const void *_data, size_t len)
225 {
226 SHA_LONG64 l;
227 unsigned char *p=c->u.p;
228 const unsigned char *data=(const unsigned char *)_data;
229
230 if (len==0) return 1;
231
232 l = (c->Nl+(((SHA_LONG64)len)<<3))&U64(0xffffffffffffffff);
233 if (l < c->Nl) c->Nh++;
234 if (sizeof(len)>=8) c->Nh+=(((SHA_LONG64)len)>>61);
235 c->Nl=l;
236
237 if (c->num != 0)
238 {
239 size_t n = sizeof(c->u) - c->num;
240
241 if (len < n)
242 {
243 memcpy (p+c->num,data,len), c->num += (unsigned int)len;
244 return 1;
245 }
246 else {
247 memcpy (p+c->num,data,n), c->num = 0;
248 len-=n, data+=n;
249 sha512_block_data_order (c,p,1);
250 }
251 }
252
253 if (len >= sizeof(c->u))
254 {
255#ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
256 if ((size_t)data%sizeof(c->u.d[0]) != 0)
257 while (len >= sizeof(c->u))
258 memcpy (p,data,sizeof(c->u)),
259 sha512_block_data_order (c,p,1),
260 len -= sizeof(c->u),
261 data += sizeof(c->u);
262 else
263#endif
264 sha512_block_data_order (c,data,len/sizeof(c->u)),
265 data += len,
266 len %= sizeof(c->u),
267 data -= len;
268 }
269
270 if (len != 0) memcpy (p,data,len), c->num = (int)len;
271
272 return 1;
273 }
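
In SHA512_Update, Nl and Nh together form a 128-bit count of message bits: len<<3 is added to Nl, a wraparound carries into Nh, and the three bits shifted out of a 64-bit len are folded in via len>>61. A hedged restatement with hypothetical names, assuming a 64-bit len so the sizeof guard from the original can be dropped:

#include <stdint.h>

/* Hypothetical mirror of the length accounting in SHA512_Update;
 * (nh:nl) is the 128-bit bit counter, len the byte count added. */
static void sha512_add_bits(uint64_t *nh, uint64_t *nl, uint64_t len)
{
	uint64_t l = *nl + (len << 3);	/* low 64 bits of the new count */

	if (l < *nl)			/* wrapped => carry into the high word */
		(*nh)++;
	*nh += len >> 61;		/* bits of len<<3 lost to the shift */
	*nl = l;
}
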
274
275int SHA384_Update (SHA512_CTX *c, const void *data, size_t len)
276{ return SHA512_Update (c,data,len); }
277
278void SHA512_Transform (SHA512_CTX *c, const unsigned char *data)
279{ sha512_block_data_order (c,data,1); }
280
281unsigned char *SHA384(const unsigned char *d, size_t n, unsigned char *md)
282 {
283 SHA512_CTX c;
284 static unsigned char m[SHA384_DIGEST_LENGTH];
285
286 if (md == NULL) md=m;
287 SHA384_Init(&c);
288 SHA512_Update(&c,d,n);
289 SHA512_Final(md,&c);
290 OPENSSL_cleanse(&c,sizeof(c));
291 return(md);
292 }
293
294unsigned char *SHA512(const unsigned char *d, size_t n, unsigned char *md)
295 {
296 SHA512_CTX c;
297 static unsigned char m[SHA512_DIGEST_LENGTH];
298
299 if (md == NULL) md=m;
300 SHA512_Init(&c);
301 SHA512_Update(&c,d,n);
302 SHA512_Final(md,&c);
303 OPENSSL_cleanse(&c,sizeof(c));
304 return(md);
305 }
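
Both one-shot helpers fall back to a function-local static buffer when md is NULL, so that mode is not reentrant. A hedged usage sketch with a caller-supplied buffer (hypothetical example program, assuming <openssl/sha.h> is available):

#include <stdio.h>
#include <string.h>
#include <openssl/sha.h>

int main(void)
{
	unsigned char md[SHA512_DIGEST_LENGTH];
	const char *msg = "abc";
	size_t i;

	/* a caller-supplied buffer sidesteps the shared static fallback */
	SHA512((const unsigned char *)msg, strlen(msg), md);
	for (i = 0; i < sizeof(md); i++)
		printf("%02x", md[i]);
	printf("\n");
	return 0;
}
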
306
307#ifndef SHA512_ASM
308static const SHA_LONG64 K512[80] = {
309 U64(0x428a2f98d728ae22),U64(0x7137449123ef65cd),
310 U64(0xb5c0fbcfec4d3b2f),U64(0xe9b5dba58189dbbc),
311 U64(0x3956c25bf348b538),U64(0x59f111f1b605d019),
312 U64(0x923f82a4af194f9b),U64(0xab1c5ed5da6d8118),
313 U64(0xd807aa98a3030242),U64(0x12835b0145706fbe),
314 U64(0x243185be4ee4b28c),U64(0x550c7dc3d5ffb4e2),
315 U64(0x72be5d74f27b896f),U64(0x80deb1fe3b1696b1),
316 U64(0x9bdc06a725c71235),U64(0xc19bf174cf692694),
317 U64(0xe49b69c19ef14ad2),U64(0xefbe4786384f25e3),
318 U64(0x0fc19dc68b8cd5b5),U64(0x240ca1cc77ac9c65),
319 U64(0x2de92c6f592b0275),U64(0x4a7484aa6ea6e483),
320 U64(0x5cb0a9dcbd41fbd4),U64(0x76f988da831153b5),
321 U64(0x983e5152ee66dfab),U64(0xa831c66d2db43210),
322 U64(0xb00327c898fb213f),U64(0xbf597fc7beef0ee4),
323 U64(0xc6e00bf33da88fc2),U64(0xd5a79147930aa725),
324 U64(0x06ca6351e003826f),U64(0x142929670a0e6e70),
325 U64(0x27b70a8546d22ffc),U64(0x2e1b21385c26c926),
326 U64(0x4d2c6dfc5ac42aed),U64(0x53380d139d95b3df),
327 U64(0x650a73548baf63de),U64(0x766a0abb3c77b2a8),
328 U64(0x81c2c92e47edaee6),U64(0x92722c851482353b),
329 U64(0xa2bfe8a14cf10364),U64(0xa81a664bbc423001),
330 U64(0xc24b8b70d0f89791),U64(0xc76c51a30654be30),
331 U64(0xd192e819d6ef5218),U64(0xd69906245565a910),
332 U64(0xf40e35855771202a),U64(0x106aa07032bbd1b8),
333 U64(0x19a4c116b8d2d0c8),U64(0x1e376c085141ab53),
334 U64(0x2748774cdf8eeb99),U64(0x34b0bcb5e19b48a8),
335 U64(0x391c0cb3c5c95a63),U64(0x4ed8aa4ae3418acb),
336 U64(0x5b9cca4f7763e373),U64(0x682e6ff3d6b2b8a3),
337 U64(0x748f82ee5defb2fc),U64(0x78a5636f43172f60),
338 U64(0x84c87814a1f0ab72),U64(0x8cc702081a6439ec),
339 U64(0x90befffa23631e28),U64(0xa4506cebde82bde9),
340 U64(0xbef9a3f7b2c67915),U64(0xc67178f2e372532b),
341 U64(0xca273eceea26619c),U64(0xd186b8c721c0c207),
342 U64(0xeada7dd6cde0eb1e),U64(0xf57d4f7fee6ed178),
343 U64(0x06f067aa72176fba),U64(0x0a637dc5a2c898a6),
344 U64(0x113f9804bef90dae),U64(0x1b710b35131c471b),
345 U64(0x28db77f523047d84),U64(0x32caab7b40c72493),
346 U64(0x3c9ebe0a15c9bebc),U64(0x431d67c49c100d4c),
347 U64(0x4cc5d4becb3e42b6),U64(0x597f299cfc657e2a),
348 U64(0x5fcb6fab3ad6faec),U64(0x6c44198c4a475817) };
349
350#ifndef PEDANTIC
351# if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
352# if defined(__x86_64) || defined(__x86_64__)
353# define ROTR(a,n) ({ SHA_LONG64 ret; \
354 asm ("rorq %1,%0" \
355 : "=r"(ret) \
356 : "J"(n),"0"(a) \
357 : "cc"); ret; })
358# if !defined(B_ENDIAN)
359# define PULL64(x) ({ SHA_LONG64 ret=*((const SHA_LONG64 *)(&(x))); \
360 asm ("bswapq %0" \
361 : "=r"(ret) \
362 : "0"(ret)); ret; })
363# endif
364# elif (defined(__i386) || defined(__i386__)) && !defined(B_ENDIAN)
365# if defined(I386_ONLY)
366# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\
367 unsigned int hi=p[0],lo=p[1]; \
368 asm("xchgb %%ah,%%al;xchgb %%dh,%%dl;"\
369 "roll $16,%%eax; roll $16,%%edx; "\
370 "xchgb %%ah,%%al;xchgb %%dh,%%dl;" \
371 : "=a"(lo),"=d"(hi) \
372 : "0"(lo),"1"(hi) : "cc"); \
373 ((SHA_LONG64)hi)<<32|lo; })
374# else
375# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\
376 unsigned int hi=p[0],lo=p[1]; \
377 asm ("bswapl %0; bswapl %1;" \
378 : "=r"(lo),"=r"(hi) \
379 : "0"(lo),"1"(hi)); \
380 ((SHA_LONG64)hi)<<32|lo; })
381# endif
382# elif (defined(_ARCH_PPC) && defined(__64BIT__)) || defined(_ARCH_PPC64)
383# define ROTR(a,n) ({ SHA_LONG64 ret; \
384 asm ("rotrdi %0,%1,%2" \
385 : "=r"(ret) \
386 : "r"(a),"K"(n)); ret; })
387# endif
388# elif defined(_MSC_VER)
389# if defined(_WIN64) /* applies to both IA-64 and AMD64 */
390# pragma intrinsic(_rotr64)
391# define ROTR(a,n) _rotr64((a),n)
392# endif
393# if defined(_M_IX86) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
394# if defined(I386_ONLY)
395 static SHA_LONG64 __fastcall __pull64be(const void *x)
396 { _asm mov edx, [ecx + 0]
397 _asm mov eax, [ecx + 4]
398 _asm xchg dh,dl
399 _asm xchg ah,al
400 _asm rol edx,16
401 _asm rol eax,16
402 _asm xchg dh,dl
403 _asm xchg ah,al
404 }
405# else
406 static SHA_LONG64 __fastcall __pull64be(const void *x)
407 { _asm mov edx, [ecx + 0]
408 _asm mov eax, [ecx + 4]
409 _asm bswap edx
410 _asm bswap eax
411 }
412# endif
413# define PULL64(x) __pull64be(&(x))
414# if _MSC_VER<=1200
415# pragma inline_depth(0)
416# endif
417# endif
418# endif
419#endif
420
421#ifndef PULL64
422#define B(x,j) (((SHA_LONG64)(*(((const unsigned char *)(&x))+j)))<<((7-j)*8))
423#define PULL64(x) (B(x,0)|B(x,1)|B(x,2)|B(x,3)|B(x,4)|B(x,5)|B(x,6)|B(x,7))
424#endif
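
The fallback PULL64 above gathers the eight bytes most-significant first, i.e. it performs a big-endian load regardless of host byte order. A hedged standalone check (hypothetical pull64 helper, using <stdint.h>):

#include <assert.h>
#include <stdint.h>

/* portable big-endian load, mirroring the B()/PULL64 fallback */
static uint64_t pull64(const unsigned char *p)
{
	uint64_t v = 0;
	int j;

	for (j = 0; j < 8; j++)
		v |= (uint64_t)p[j] << ((7 - j) * 8);
	return v;
}

int main(void)
{
	const unsigned char buf[8] =
	    { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef };

	assert(pull64(buf) == 0x0123456789abcdefULL);
	return 0;
}
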
425
426#ifndef ROTR
427#define ROTR(x,s) (((x)>>(s)) | ((x)<<(64-(s))))
428#endif
429
430#define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
431#define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
432#define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
433#define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
434
435#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
436#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
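
Ch(x,y,z) selects y wherever x has a 1 bit and z elsewhere; Maj(x,y,z) takes the bitwise majority of its three inputs. A hedged exhaustive check over single-bit inputs (hypothetical standalone test, duplicating the two macros):

#include <assert.h>

#define Ch(x,y,z)	(((x) & (y)) ^ ((~(x)) & (z)))
#define Maj(x,y,z)	(((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

int main(void)
{
	unsigned x, y, z;

	for (x = 0; x < 2; x++)
		for (y = 0; y < 2; y++)
			for (z = 0; z < 2; z++) {
				assert((Ch(x, y, z) & 1) == (x ? y : z));
				assert((Maj(x, y, z) & 1) == (x + y + z >= 2));
			}
	return 0;
}
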
437
438
439#if defined(__i386) || defined(__i386__) || defined(_M_IX86)
440/*
441 * This code should give better results on a 32-bit CPU with fewer than
442 * ~24 registers, both size- and performance-wise...
443 */
444static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num)
445 {
446 const SHA_LONG64 *W=in;
447 SHA_LONG64 A,E,T;
448 SHA_LONG64 X[9+80],*F;
449 int i;
450
451 while (num--) {
452
453 F = X+80;
454 A = ctx->h[0]; F[1] = ctx->h[1];
455 F[2] = ctx->h[2]; F[3] = ctx->h[3];
456 E = ctx->h[4]; F[5] = ctx->h[5];
457 F[6] = ctx->h[6]; F[7] = ctx->h[7];
458
459 for (i=0;i<16;i++,F--)
460 {
461#ifdef B_ENDIAN
462 T = W[i];
463#else
464 T = PULL64(W[i]);
465#endif
466 F[0] = A;
467 F[4] = E;
468 F[8] = T;
469 T += F[7] + Sigma1(E) + Ch(E,F[5],F[6]) + K512[i];
470 E = F[3] + T;
471 A = T + Sigma0(A) + Maj(A,F[1],F[2]);
472 }
473
474 for (;i<80;i++,F--)
475 {
476 T = sigma0(F[8+16-1]);
477 T += sigma1(F[8+16-14]);
478 T += F[8+16] + F[8+16-9];
479
480 F[0] = A;
481 F[4] = E;
482 F[8] = T;
483 T += F[7] + Sigma1(E) + Ch(E,F[5],F[6]) + K512[i];
484 E = F[3] + T;
485 A = T + Sigma0(A) + Maj(A,F[1],F[2]);
486 }
487
488 ctx->h[0] += A; ctx->h[1] += F[1];
489 ctx->h[2] += F[2]; ctx->h[3] += F[3];
490 ctx->h[4] += E; ctx->h[5] += F[5];
491 ctx->h[6] += F[6]; ctx->h[7] += F[7];
492
493 W+=SHA_LBLOCK;
494 }
495 }
496
497#elif defined(OPENSSL_SMALL_FOOTPRINT)
498
499static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num)
500 {
501 const SHA_LONG64 *W=in;
502 SHA_LONG64 a,b,c,d,e,f,g,h,s0,s1,T1,T2;
503 SHA_LONG64 X[16];
504 int i;
505
506 while (num--) {
507
508 a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
509 e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7];
510
511 for (i=0;i<16;i++)
512 {
513#ifdef B_ENDIAN
514 T1 = X[i] = W[i];
515#else
516 T1 = X[i] = PULL64(W[i]);
517#endif
518 T1 += h + Sigma1(e) + Ch(e,f,g) + K512[i];
519 T2 = Sigma0(a) + Maj(a,b,c);
520 h = g; g = f; f = e; e = d + T1;
521 d = c; c = b; b = a; a = T1 + T2;
522 }
523
524 for (;i<80;i++)
525 {
526 s0 = X[(i+1)&0x0f]; s0 = sigma0(s0);
527 s1 = X[(i+14)&0x0f]; s1 = sigma1(s1);
528
529 T1 = X[i&0xf] += s0 + s1 + X[(i+9)&0xf];
530 T1 += h + Sigma1(e) + Ch(e,f,g) + K512[i];
531 T2 = Sigma0(a) + Maj(a,b,c);
532 h = g; g = f; f = e; e = d + T1;
533 d = c; c = b; b = a; a = T1 + T2;
534 }
535
536 ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
537 ctx->h[4] += e; ctx->h[5] += f; ctx->h[6] += g; ctx->h[7] += h;
538
539 W+=SHA_LBLOCK;
540 }
541 }
542
543#else
544
545#define ROUND_00_15(i,a,b,c,d,e,f,g,h) do { \
546 T1 += h + Sigma1(e) + Ch(e,f,g) + K512[i]; \
547 h = Sigma0(a) + Maj(a,b,c); \
548 d += T1; h += T1; } while (0)
549
550#define ROUND_16_80(i,j,a,b,c,d,e,f,g,h,X) do { \
551 s0 = X[(j+1)&0x0f]; s0 = sigma0(s0); \
552 s1 = X[(j+14)&0x0f]; s1 = sigma1(s1); \
553 T1 = X[(j)&0x0f] += s0 + s1 + X[(j+9)&0x0f]; \
554 ROUND_00_15(i+j,a,b,c,d,e,f,g,h); } while (0)
555
556static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num)
557 {
558 const SHA_LONG64 *W=in;
559 SHA_LONG64 a,b,c,d,e,f,g,h,s0,s1,T1;
560 SHA_LONG64 X[16];
561 int i;
562
563 while (num--) {
564
565 a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
566 e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7];
567
568#ifdef B_ENDIAN
569 T1 = X[0] = W[0]; ROUND_00_15(0,a,b,c,d,e,f,g,h);
570 T1 = X[1] = W[1]; ROUND_00_15(1,h,a,b,c,d,e,f,g);
571 T1 = X[2] = W[2]; ROUND_00_15(2,g,h,a,b,c,d,e,f);
572 T1 = X[3] = W[3]; ROUND_00_15(3,f,g,h,a,b,c,d,e);
573 T1 = X[4] = W[4]; ROUND_00_15(4,e,f,g,h,a,b,c,d);
574 T1 = X[5] = W[5]; ROUND_00_15(5,d,e,f,g,h,a,b,c);
575 T1 = X[6] = W[6]; ROUND_00_15(6,c,d,e,f,g,h,a,b);
576 T1 = X[7] = W[7]; ROUND_00_15(7,b,c,d,e,f,g,h,a);
577 T1 = X[8] = W[8]; ROUND_00_15(8,a,b,c,d,e,f,g,h);
578 T1 = X[9] = W[9]; ROUND_00_15(9,h,a,b,c,d,e,f,g);
579 T1 = X[10] = W[10]; ROUND_00_15(10,g,h,a,b,c,d,e,f);
580 T1 = X[11] = W[11]; ROUND_00_15(11,f,g,h,a,b,c,d,e);
581 T1 = X[12] = W[12]; ROUND_00_15(12,e,f,g,h,a,b,c,d);
582 T1 = X[13] = W[13]; ROUND_00_15(13,d,e,f,g,h,a,b,c);
583 T1 = X[14] = W[14]; ROUND_00_15(14,c,d,e,f,g,h,a,b);
584 T1 = X[15] = W[15]; ROUND_00_15(15,b,c,d,e,f,g,h,a);
585#else
586 T1 = X[0] = PULL64(W[0]); ROUND_00_15(0,a,b,c,d,e,f,g,h);
587 T1 = X[1] = PULL64(W[1]); ROUND_00_15(1,h,a,b,c,d,e,f,g);
588 T1 = X[2] = PULL64(W[2]); ROUND_00_15(2,g,h,a,b,c,d,e,f);
589 T1 = X[3] = PULL64(W[3]); ROUND_00_15(3,f,g,h,a,b,c,d,e);
590 T1 = X[4] = PULL64(W[4]); ROUND_00_15(4,e,f,g,h,a,b,c,d);
591 T1 = X[5] = PULL64(W[5]); ROUND_00_15(5,d,e,f,g,h,a,b,c);
592 T1 = X[6] = PULL64(W[6]); ROUND_00_15(6,c,d,e,f,g,h,a,b);
593 T1 = X[7] = PULL64(W[7]); ROUND_00_15(7,b,c,d,e,f,g,h,a);
594 T1 = X[8] = PULL64(W[8]); ROUND_00_15(8,a,b,c,d,e,f,g,h);
595 T1 = X[9] = PULL64(W[9]); ROUND_00_15(9,h,a,b,c,d,e,f,g);
596 T1 = X[10] = PULL64(W[10]); ROUND_00_15(10,g,h,a,b,c,d,e,f);
597 T1 = X[11] = PULL64(W[11]); ROUND_00_15(11,f,g,h,a,b,c,d,e);
598 T1 = X[12] = PULL64(W[12]); ROUND_00_15(12,e,f,g,h,a,b,c,d);
599 T1 = X[13] = PULL64(W[13]); ROUND_00_15(13,d,e,f,g,h,a,b,c);
600 T1 = X[14] = PULL64(W[14]); ROUND_00_15(14,c,d,e,f,g,h,a,b);
601 T1 = X[15] = PULL64(W[15]); ROUND_00_15(15,b,c,d,e,f,g,h,a);
602#endif
603
604 for (i=16;i<80;i+=16)
605 {
606 ROUND_16_80(i, 0,a,b,c,d,e,f,g,h,X);
607 ROUND_16_80(i, 1,h,a,b,c,d,e,f,g,X);
608 ROUND_16_80(i, 2,g,h,a,b,c,d,e,f,X);
609 ROUND_16_80(i, 3,f,g,h,a,b,c,d,e,X);
610 ROUND_16_80(i, 4,e,f,g,h,a,b,c,d,X);
611 ROUND_16_80(i, 5,d,e,f,g,h,a,b,c,X);
612 ROUND_16_80(i, 6,c,d,e,f,g,h,a,b,X);
613 ROUND_16_80(i, 7,b,c,d,e,f,g,h,a,X);
614 ROUND_16_80(i, 8,a,b,c,d,e,f,g,h,X);
615 ROUND_16_80(i, 9,h,a,b,c,d,e,f,g,X);
616 ROUND_16_80(i,10,g,h,a,b,c,d,e,f,X);
617 ROUND_16_80(i,11,f,g,h,a,b,c,d,e,X);
618 ROUND_16_80(i,12,e,f,g,h,a,b,c,d,X);
619 ROUND_16_80(i,13,d,e,f,g,h,a,b,c,X);
620 ROUND_16_80(i,14,c,d,e,f,g,h,a,b,X);
621 ROUND_16_80(i,15,b,c,d,e,f,g,h,a,X);
622 }
623
624 ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
625 ctx->h[4] += e; ctx->h[5] += f; ctx->h[6] += g; ctx->h[7] += h;
626
627 W+=SHA_LBLOCK;
628 }
629 }
630
631#endif
632
633#endif /* SHA512_ASM */
634
635#else /* !OPENSSL_NO_SHA512 */
636
637#if defined(PEDANTIC) || defined(__DECC) || defined(OPENSSL_SYS_MACOSX)
638static void *dummy=&dummy;
639#endif
640
641#endif /* !OPENSSL_NO_SHA512 */
diff --git a/src/lib/libcrypto/sha/sha_locl.h b/src/lib/libcrypto/sha/sha_locl.h
deleted file mode 100644
index 672c26eee1..0000000000
--- a/src/lib/libcrypto/sha/sha_locl.h
+++ /dev/null
@@ -1,437 +0,0 @@
1/* crypto/sha/sha_locl.h */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscape's SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are adhered to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the routines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publicly available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdlib.h>
60#include <string.h>
61
62#include <openssl/opensslconf.h>
63#include <openssl/sha.h>
64
65#define DATA_ORDER_IS_BIG_ENDIAN
66
67#define HASH_LONG SHA_LONG
68#define HASH_CTX SHA_CTX
69#define HASH_CBLOCK SHA_CBLOCK
70#define HASH_MAKE_STRING(c,s) do { \
71 unsigned long ll; \
72 ll=(c)->h0; HOST_l2c(ll,(s)); \
73 ll=(c)->h1; HOST_l2c(ll,(s)); \
74 ll=(c)->h2; HOST_l2c(ll,(s)); \
75 ll=(c)->h3; HOST_l2c(ll,(s)); \
76 ll=(c)->h4; HOST_l2c(ll,(s)); \
77 } while (0)
78
79#if defined(SHA_0)
80
81# define HASH_UPDATE SHA_Update
82# define HASH_TRANSFORM SHA_Transform
83# define HASH_FINAL SHA_Final
84# define HASH_INIT SHA_Init
85# define HASH_BLOCK_DATA_ORDER sha_block_data_order
86# define Xupdate(a,ix,ia,ib,ic,id) (ix=(a)=(ia^ib^ic^id))
87
88static void sha_block_data_order (SHA_CTX *c, const void *p,size_t num);
89
90#elif defined(SHA_1)
91
92# define HASH_UPDATE SHA1_Update
93# define HASH_TRANSFORM SHA1_Transform
94# define HASH_FINAL SHA1_Final
95# define HASH_INIT SHA1_Init
96# define HASH_BLOCK_DATA_ORDER sha1_block_data_order
97# if defined(__MWERKS__) && defined(__MC68K__)
98 /* Metrowerks for Motorola fails otherwise:-( <appro@fy.chalmers.se> */
99# define Xupdate(a,ix,ia,ib,ic,id) do { (a)=(ia^ib^ic^id); \
100 ix=(a)=ROTATE((a),1); \
101 } while (0)
102# else
103# define Xupdate(a,ix,ia,ib,ic,id) ( (a)=(ia^ib^ic^id), \
104 ix=(a)=ROTATE((a),1) \
105 )
106# endif
107
108#ifndef SHA1_ASM
109static
110#endif
111void sha1_block_data_order (SHA_CTX *c, const void *p,size_t num);
112
113#else
114# error "Either SHA_0 or SHA_1 must be defined."
115#endif
116
117#include "md32_common.h"
118
119#define INIT_DATA_h0 0x67452301UL
120#define INIT_DATA_h1 0xefcdab89UL
121#define INIT_DATA_h2 0x98badcfeUL
122#define INIT_DATA_h3 0x10325476UL
123#define INIT_DATA_h4 0xc3d2e1f0UL
124
125int HASH_INIT (SHA_CTX *c)
126 {
127 memset (c,0,sizeof(*c));
128 c->h0=INIT_DATA_h0;
129 c->h1=INIT_DATA_h1;
130 c->h2=INIT_DATA_h2;
131 c->h3=INIT_DATA_h3;
132 c->h4=INIT_DATA_h4;
133 return 1;
134 }
135
136#define K_00_19 0x5a827999UL
137#define K_20_39 0x6ed9eba1UL
138#define K_40_59 0x8f1bbcdcUL
139#define K_60_79 0xca62c1d6UL
140
141/* As pointed out by Wei Dai <weidai@eskimo.com>, F() below can be
142 * simplified to the code in F_00_19. Wei attributes these optimisations
143 * to Peter Gutmann's SHS code, which in turn credits Rich Schroeppel.
144 * #define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
145 * I've just become aware of another tweak to be made, again from Wei Dai,
146 * in F_40_59, (x&a)|(y&a) -> (x|y)&a
147 */
148#define F_00_19(b,c,d) ((((c) ^ (d)) & (b)) ^ (d))
149#define F_20_39(b,c,d) ((b) ^ (c) ^ (d))
150#define F_40_59(b,c,d) (((b) & (c)) | (((b)|(c)) & (d)))
151#define F_60_79(b,c,d) F_20_39(b,c,d)
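
The comment above claims two algebraic identities; a hedged exhaustive check over single-bit inputs (hypothetical standalone test) verifies both the choose-function simplification used in F_00_19 and the majority rewrite used in F_40_59:

#include <assert.h>

int main(void)
{
	unsigned b, c, d;

	for (b = 0; b < 2; b++)
		for (c = 0; c < 2; c++)
			for (d = 0; d < 2; d++) {
				/* F_00_19: ((c^d)&b)^d == (b&c)|(~b&d) */
				assert(((((c ^ d) & b) ^ d) & 1) ==
				    (((b & c) | (~b & d)) & 1));
				/* F_40_59: (b&c)|((b|c)&d) == (b&c)|(b&d)|(c&d) */
				assert(((b & c) | ((b | c) & d)) ==
				    ((b & c) | (b & d) | (c & d)));
			}
	return 0;
}
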
152
153#ifndef OPENSSL_SMALL_FOOTPRINT
154
155#define BODY_00_15(i,a,b,c,d,e,f,xi) \
156 (f)=xi+(e)+K_00_19+ROTATE((a),5)+F_00_19((b),(c),(d)); \
157 (b)=ROTATE((b),30);
158
159#define BODY_16_19(i,a,b,c,d,e,f,xi,xa,xb,xc,xd) \
160 Xupdate(f,xi,xa,xb,xc,xd); \
161 (f)+=(e)+K_00_19+ROTATE((a),5)+F_00_19((b),(c),(d)); \
162 (b)=ROTATE((b),30);
163
164#define BODY_20_31(i,a,b,c,d,e,f,xi,xa,xb,xc,xd) \
165 Xupdate(f,xi,xa,xb,xc,xd); \
166 (f)+=(e)+K_20_39+ROTATE((a),5)+F_20_39((b),(c),(d)); \
167 (b)=ROTATE((b),30);
168
169#define BODY_32_39(i,a,b,c,d,e,f,xa,xb,xc,xd) \
170 Xupdate(f,xa,xa,xb,xc,xd); \
171 (f)+=(e)+K_20_39+ROTATE((a),5)+F_20_39((b),(c),(d)); \
172 (b)=ROTATE((b),30);
173
174#define BODY_40_59(i,a,b,c,d,e,f,xa,xb,xc,xd) \
175 Xupdate(f,xa,xa,xb,xc,xd); \
176 (f)+=(e)+K_40_59+ROTATE((a),5)+F_40_59((b),(c),(d)); \
177 (b)=ROTATE((b),30);
178
179#define BODY_60_79(i,a,b,c,d,e,f,xa,xb,xc,xd) \
180 Xupdate(f,xa,xa,xb,xc,xd); \
181 (f)=xa+(e)+K_60_79+ROTATE((a),5)+F_60_79((b),(c),(d)); \
182 (b)=ROTATE((b),30);
183
184#ifdef X
185#undef X
186#endif
187#ifndef MD32_XARRAY
188 /*
189 * Originally X was an array. As it's automatic, it's natural
190 * to expect a RISC compiler to accommodate at least part of it in
191 * the register bank, isn't it? Unfortunately not all compilers
192 * "find" this expectation reasonable:-( In order to make such
193 * compilers generate better code I replace X[] with a bunch of
194 * X0, X1, etc. See the function body below...
195 * <appro@fy.chalmers.se>
196 */
197# define X(i) XX##i
198#else
199 /*
200 * However! Some compilers (most notably HP C) get overwhelmed by
201 * that many local variables, so we have to provide a way to
202 * fall back to the original behavior.
203 */
204# define X(i) XX[i]
205#endif
206
207#if !defined(SHA_1) || !defined(SHA1_ASM)
208static void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num)
209 {
210 const unsigned char *data=p;
211 register unsigned MD32_REG_T A,B,C,D,E,T,l;
212#ifndef MD32_XARRAY
213 unsigned MD32_REG_T XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7,
214 XX8, XX9,XX10,XX11,XX12,XX13,XX14,XX15;
215#else
216 SHA_LONG XX[16];
217#endif
218
219 A=c->h0;
220 B=c->h1;
221 C=c->h2;
222 D=c->h3;
223 E=c->h4;
224
225 for (;;)
226 {
227 const union { long one; char little; } is_endian = {1};
228
229 if (!is_endian.little && sizeof(SHA_LONG)==4 && ((size_t)p%4)==0)
230 {
231 const SHA_LONG *W=(const SHA_LONG *)data;
232
233 X( 0) = W[0]; X( 1) = W[ 1];
234 BODY_00_15( 0,A,B,C,D,E,T,X( 0)); X( 2) = W[ 2];
235 BODY_00_15( 1,T,A,B,C,D,E,X( 1)); X( 3) = W[ 3];
236 BODY_00_15( 2,E,T,A,B,C,D,X( 2)); X( 4) = W[ 4];
237 BODY_00_15( 3,D,E,T,A,B,C,X( 3)); X( 5) = W[ 5];
238 BODY_00_15( 4,C,D,E,T,A,B,X( 4)); X( 6) = W[ 6];
239 BODY_00_15( 5,B,C,D,E,T,A,X( 5)); X( 7) = W[ 7];
240 BODY_00_15( 6,A,B,C,D,E,T,X( 6)); X( 8) = W[ 8];
241 BODY_00_15( 7,T,A,B,C,D,E,X( 7)); X( 9) = W[ 9];
242 BODY_00_15( 8,E,T,A,B,C,D,X( 8)); X(10) = W[10];
243 BODY_00_15( 9,D,E,T,A,B,C,X( 9)); X(11) = W[11];
244 BODY_00_15(10,C,D,E,T,A,B,X(10)); X(12) = W[12];
245 BODY_00_15(11,B,C,D,E,T,A,X(11)); X(13) = W[13];
246 BODY_00_15(12,A,B,C,D,E,T,X(12)); X(14) = W[14];
247 BODY_00_15(13,T,A,B,C,D,E,X(13)); X(15) = W[15];
248 BODY_00_15(14,E,T,A,B,C,D,X(14));
249 BODY_00_15(15,D,E,T,A,B,C,X(15));
250
251 data += SHA_CBLOCK;
252 }
253 else
254 {
255 HOST_c2l(data,l); X( 0)=l; HOST_c2l(data,l); X( 1)=l;
256 BODY_00_15( 0,A,B,C,D,E,T,X( 0)); HOST_c2l(data,l); X( 2)=l;
257 BODY_00_15( 1,T,A,B,C,D,E,X( 1)); HOST_c2l(data,l); X( 3)=l;
258 BODY_00_15( 2,E,T,A,B,C,D,X( 2)); HOST_c2l(data,l); X( 4)=l;
259 BODY_00_15( 3,D,E,T,A,B,C,X( 3)); HOST_c2l(data,l); X( 5)=l;
260 BODY_00_15( 4,C,D,E,T,A,B,X( 4)); HOST_c2l(data,l); X( 6)=l;
261 BODY_00_15( 5,B,C,D,E,T,A,X( 5)); HOST_c2l(data,l); X( 7)=l;
262 BODY_00_15( 6,A,B,C,D,E,T,X( 6)); HOST_c2l(data,l); X( 8)=l;
263 BODY_00_15( 7,T,A,B,C,D,E,X( 7)); HOST_c2l(data,l); X( 9)=l;
264 BODY_00_15( 8,E,T,A,B,C,D,X( 8)); HOST_c2l(data,l); X(10)=l;
265 BODY_00_15( 9,D,E,T,A,B,C,X( 9)); HOST_c2l(data,l); X(11)=l;
266 BODY_00_15(10,C,D,E,T,A,B,X(10)); HOST_c2l(data,l); X(12)=l;
267 BODY_00_15(11,B,C,D,E,T,A,X(11)); HOST_c2l(data,l); X(13)=l;
268 BODY_00_15(12,A,B,C,D,E,T,X(12)); HOST_c2l(data,l); X(14)=l;
269 BODY_00_15(13,T,A,B,C,D,E,X(13)); HOST_c2l(data,l); X(15)=l;
270 BODY_00_15(14,E,T,A,B,C,D,X(14));
271 BODY_00_15(15,D,E,T,A,B,C,X(15));
272 }
273
274 BODY_16_19(16,C,D,E,T,A,B,X( 0),X( 0),X( 2),X( 8),X(13));
275 BODY_16_19(17,B,C,D,E,T,A,X( 1),X( 1),X( 3),X( 9),X(14));
276 BODY_16_19(18,A,B,C,D,E,T,X( 2),X( 2),X( 4),X(10),X(15));
277 BODY_16_19(19,T,A,B,C,D,E,X( 3),X( 3),X( 5),X(11),X( 0));
278
279 BODY_20_31(20,E,T,A,B,C,D,X( 4),X( 4),X( 6),X(12),X( 1));
280 BODY_20_31(21,D,E,T,A,B,C,X( 5),X( 5),X( 7),X(13),X( 2));
281 BODY_20_31(22,C,D,E,T,A,B,X( 6),X( 6),X( 8),X(14),X( 3));
282 BODY_20_31(23,B,C,D,E,T,A,X( 7),X( 7),X( 9),X(15),X( 4));
283 BODY_20_31(24,A,B,C,D,E,T,X( 8),X( 8),X(10),X( 0),X( 5));
284 BODY_20_31(25,T,A,B,C,D,E,X( 9),X( 9),X(11),X( 1),X( 6));
285 BODY_20_31(26,E,T,A,B,C,D,X(10),X(10),X(12),X( 2),X( 7));
286 BODY_20_31(27,D,E,T,A,B,C,X(11),X(11),X(13),X( 3),X( 8));
287 BODY_20_31(28,C,D,E,T,A,B,X(12),X(12),X(14),X( 4),X( 9));
288 BODY_20_31(29,B,C,D,E,T,A,X(13),X(13),X(15),X( 5),X(10));
289 BODY_20_31(30,A,B,C,D,E,T,X(14),X(14),X( 0),X( 6),X(11));
290 BODY_20_31(31,T,A,B,C,D,E,X(15),X(15),X( 1),X( 7),X(12));
291
292 BODY_32_39(32,E,T,A,B,C,D,X( 0),X( 2),X( 8),X(13));
293 BODY_32_39(33,D,E,T,A,B,C,X( 1),X( 3),X( 9),X(14));
294 BODY_32_39(34,C,D,E,T,A,B,X( 2),X( 4),X(10),X(15));
295 BODY_32_39(35,B,C,D,E,T,A,X( 3),X( 5),X(11),X( 0));
296 BODY_32_39(36,A,B,C,D,E,T,X( 4),X( 6),X(12),X( 1));
297 BODY_32_39(37,T,A,B,C,D,E,X( 5),X( 7),X(13),X( 2));
298 BODY_32_39(38,E,T,A,B,C,D,X( 6),X( 8),X(14),X( 3));
299 BODY_32_39(39,D,E,T,A,B,C,X( 7),X( 9),X(15),X( 4));
300
301 BODY_40_59(40,C,D,E,T,A,B,X( 8),X(10),X( 0),X( 5));
302 BODY_40_59(41,B,C,D,E,T,A,X( 9),X(11),X( 1),X( 6));
303 BODY_40_59(42,A,B,C,D,E,T,X(10),X(12),X( 2),X( 7));
304 BODY_40_59(43,T,A,B,C,D,E,X(11),X(13),X( 3),X( 8));
305 BODY_40_59(44,E,T,A,B,C,D,X(12),X(14),X( 4),X( 9));
306 BODY_40_59(45,D,E,T,A,B,C,X(13),X(15),X( 5),X(10));
307 BODY_40_59(46,C,D,E,T,A,B,X(14),X( 0),X( 6),X(11));
308 BODY_40_59(47,B,C,D,E,T,A,X(15),X( 1),X( 7),X(12));
309 BODY_40_59(48,A,B,C,D,E,T,X( 0),X( 2),X( 8),X(13));
310 BODY_40_59(49,T,A,B,C,D,E,X( 1),X( 3),X( 9),X(14));
311 BODY_40_59(50,E,T,A,B,C,D,X( 2),X( 4),X(10),X(15));
312 BODY_40_59(51,D,E,T,A,B,C,X( 3),X( 5),X(11),X( 0));
313 BODY_40_59(52,C,D,E,T,A,B,X( 4),X( 6),X(12),X( 1));
314 BODY_40_59(53,B,C,D,E,T,A,X( 5),X( 7),X(13),X( 2));
315 BODY_40_59(54,A,B,C,D,E,T,X( 6),X( 8),X(14),X( 3));
316 BODY_40_59(55,T,A,B,C,D,E,X( 7),X( 9),X(15),X( 4));
317 BODY_40_59(56,E,T,A,B,C,D,X( 8),X(10),X( 0),X( 5));
318 BODY_40_59(57,D,E,T,A,B,C,X( 9),X(11),X( 1),X( 6));
319 BODY_40_59(58,C,D,E,T,A,B,X(10),X(12),X( 2),X( 7));
320 BODY_40_59(59,B,C,D,E,T,A,X(11),X(13),X( 3),X( 8));
321
322 BODY_60_79(60,A,B,C,D,E,T,X(12),X(14),X( 4),X( 9));
323 BODY_60_79(61,T,A,B,C,D,E,X(13),X(15),X( 5),X(10));
324 BODY_60_79(62,E,T,A,B,C,D,X(14),X( 0),X( 6),X(11));
325 BODY_60_79(63,D,E,T,A,B,C,X(15),X( 1),X( 7),X(12));
326 BODY_60_79(64,C,D,E,T,A,B,X( 0),X( 2),X( 8),X(13));
327 BODY_60_79(65,B,C,D,E,T,A,X( 1),X( 3),X( 9),X(14));
328 BODY_60_79(66,A,B,C,D,E,T,X( 2),X( 4),X(10),X(15));
329 BODY_60_79(67,T,A,B,C,D,E,X( 3),X( 5),X(11),X( 0));
330 BODY_60_79(68,E,T,A,B,C,D,X( 4),X( 6),X(12),X( 1));
331 BODY_60_79(69,D,E,T,A,B,C,X( 5),X( 7),X(13),X( 2));
332 BODY_60_79(70,C,D,E,T,A,B,X( 6),X( 8),X(14),X( 3));
333 BODY_60_79(71,B,C,D,E,T,A,X( 7),X( 9),X(15),X( 4));
334 BODY_60_79(72,A,B,C,D,E,T,X( 8),X(10),X( 0),X( 5));
335 BODY_60_79(73,T,A,B,C,D,E,X( 9),X(11),X( 1),X( 6));
336 BODY_60_79(74,E,T,A,B,C,D,X(10),X(12),X( 2),X( 7));
337 BODY_60_79(75,D,E,T,A,B,C,X(11),X(13),X( 3),X( 8));
338 BODY_60_79(76,C,D,E,T,A,B,X(12),X(14),X( 4),X( 9));
339 BODY_60_79(77,B,C,D,E,T,A,X(13),X(15),X( 5),X(10));
340 BODY_60_79(78,A,B,C,D,E,T,X(14),X( 0),X( 6),X(11));
341 BODY_60_79(79,T,A,B,C,D,E,X(15),X( 1),X( 7),X(12));
342
343 c->h0=(c->h0+E)&0xffffffffL;
344 c->h1=(c->h1+T)&0xffffffffL;
345 c->h2=(c->h2+A)&0xffffffffL;
346 c->h3=(c->h3+B)&0xffffffffL;
347 c->h4=(c->h4+C)&0xffffffffL;
348
349 if (--num == 0) break;
350
351 A=c->h0;
352 B=c->h1;
353 C=c->h2;
354 D=c->h3;
355 E=c->h4;
356
357 }
358 }
359#endif
360
361#else /* OPENSSL_SMALL_FOOTPRINT */
362
363#define BODY_00_15(xi) do { \
364 T=E+K_00_19+F_00_19(B,C,D); \
365 E=D, D=C, C=ROTATE(B,30), B=A; \
366 A=ROTATE(A,5)+T+xi; } while(0)
367
368#define BODY_16_19(xa,xb,xc,xd) do { \
369 Xupdate(T,xa,xa,xb,xc,xd); \
370 T+=E+K_00_19+F_00_19(B,C,D); \
371 E=D, D=C, C=ROTATE(B,30), B=A; \
372 A=ROTATE(A,5)+T; } while(0)
373
374#define BODY_20_39(xa,xb,xc,xd) do { \
375 Xupdate(T,xa,xa,xb,xc,xd); \
376 T+=E+K_20_39+F_20_39(B,C,D); \
377 E=D, D=C, C=ROTATE(B,30), B=A; \
378 A=ROTATE(A,5)+T; } while(0)
379
380#define BODY_40_59(xa,xb,xc,xd) do { \
381 Xupdate(T,xa,xa,xb,xc,xd); \
382 T+=E+K_40_59+F_40_59(B,C,D); \
383 E=D, D=C, C=ROTATE(B,30), B=A; \
384 A=ROTATE(A,5)+T; } while(0)
385
386#define BODY_60_79(xa,xb,xc,xd) do { \
387 Xupdate(T,xa,xa,xb,xc,xd); \
388 T=E+K_60_79+F_60_79(B,C,D); \
389 E=D, D=C, C=ROTATE(B,30), B=A; \
390 A=ROTATE(A,5)+T+xa; } while(0)
391
392#if !defined(SHA_1) || !defined(SHA1_ASM)
393static void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num)
394 {
395 const unsigned char *data=p;
396 register unsigned MD32_REG_T A,B,C,D,E,T,l;
397 int i;
398 SHA_LONG X[16];
399
400 A=c->h0;
401 B=c->h1;
402 C=c->h2;
403 D=c->h3;
404 E=c->h4;
405
406 for (;;)
407 {
408 for (i=0;i<16;i++)
409 { HOST_c2l(data,l); X[i]=l; BODY_00_15(X[i]); }
410 for (i=0;i<4;i++)
411 { BODY_16_19(X[i], X[i+2], X[i+8], X[(i+13)&15]); }
412 for (;i<24;i++)
413 { BODY_20_39(X[i&15], X[(i+2)&15], X[(i+8)&15],X[(i+13)&15]); }
414 for (i=0;i<20;i++)
415 { BODY_40_59(X[(i+8)&15],X[(i+10)&15],X[i&15], X[(i+5)&15]); }
416 for (i=4;i<24;i++)
417 { BODY_60_79(X[(i+8)&15],X[(i+10)&15],X[i&15], X[(i+5)&15]); }
418
419 c->h0=(c->h0+A)&0xffffffffL;
420 c->h1=(c->h1+B)&0xffffffffL;
421 c->h2=(c->h2+C)&0xffffffffL;
422 c->h3=(c->h3+D)&0xffffffffL;
423 c->h4=(c->h4+E)&0xffffffffL;
424
425 if (--num == 0) break;
426
427 A=c->h0;
428 B=c->h1;
429 C=c->h2;
430 D=c->h3;
431 E=c->h4;
432
433 }
434 }
435#endif
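
The rotating indices in the small-footprint loops above encode the absolute round number t: for rounds 40..59, i runs 0..19 so t = i + 40, and the schedule's X[(t-16)&15], X[(t-14)&15], X[(t-8)&15] and X[(t-3)&15] reduce to offsets (i+8), (i+10), i and (i+5) mod 16, exactly as passed to BODY_40_59. A hedged spot check (hypothetical standalone test):

#include <assert.h>

int main(void)
{
	int i, t;

	for (i = 0; i < 20; i++) {
		t = i + 40;	/* absolute SHA-1 round number */
		assert(((t - 16) & 15) == ((i + 8) & 15));
		assert(((t - 14) & 15) == ((i + 10) & 15));
		assert(((t - 8) & 15) == (i & 15));
		assert(((t - 3) & 15) == ((i + 5) & 15));
	}
	return 0;
}
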
436
437#endif