path: root/src/lib/libcrypto/sha
Diffstat (limited to 'src/lib/libcrypto/sha')
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-armv4-large.pl   234
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha1-ppc.pl           319
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-s390x.pl         226
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-sparcv9.pl       283
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl      600
-rw-r--r--  src/lib/libcrypto/sha/asm/sha1-thumb.pl         259
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha1-x86_64.pl        125
-rw-r--r--  src/lib/libcrypto/sha/asm/sha256-586.pl         251
-rw-r--r--  src/lib/libcrypto/sha/asm/sha256-armv4.pl       181
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-586.pl         644
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-armv4.pl       399
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-ppc.pl         462
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-s390x.pl       301
-rw-r--r--  src/lib/libcrypto/sha/asm/sha512-sparcv9.pl     593
-rwxr-xr-x  src/lib/libcrypto/sha/asm/sha512-x86_64.pl      140
-rw-r--r--  src/lib/libcrypto/sha/sha256.c                   32
-rw-r--r--  src/lib/libcrypto/sha/sha512.c                  184
17 files changed, 5145 insertions, 88 deletions
diff --git a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
new file mode 100644
index 0000000000..88861af641
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
@@ -0,0 +1,234 @@
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# sha1_block procedure for ARMv4.
#
# January 2007.

# Size/performance trade-off
# ====================================================================
# impl		size in bytes	comp cycles[*]	measured performance
# ====================================================================
# thumb		304		3212		4420
# armv4-small	392/+29%	1958/+64%	2250/+96%
# armv4-compact	740/+89%	1552/+26%	1840/+22%
# armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
# full unroll	~5100/+260%	~1260/+4%	~1300/+5%
# ====================================================================
# thumb		= same as 'small', but in Thumb instructions[**] and
#		  with recurring code in two private functions;
# small		= detached Xload/update, loops are folded;
# compact	= detached Xload/update, 5x unroll;
# large		= interleaved Xload/update, 5x unroll;
# full unroll	= interleaved Xload/update, full unroll, estimated[!];
#
# [*]	Manually counted instructions in "grand" loop body. Measured
#	performance is affected by prologue and epilogue overhead,
#	i-cache availability, branch penalties, etc.
# [**]	While each Thumb instruction is half the size, the Thumb ISA
#	is not as diverse as the ARM one: e.g., there are only two
#	arithmetic instructions with 3 arguments, no [fixed] rotate,
#	and addressing modes are limited. As a result it takes more
#	instructions to do the same job in Thumb, so the code is never
#	half as large and is always slower.
# [***]	which is also ~35% better than compiler-generated code.

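# For reference, each SHA-1 round computes (standard FIPS 180-1
# notation, not specific to this file):
#
#	E += ROL(A,5) + F(B,C,D) + K + X[i];	B = ROL(B,30)
#
# Instead of rotating B explicitly, the code below keeps the affected
# working variables pre-rotated left by 2 bits (see the "mov ...,ror#30"
# trio at .Lloop and the ",ror#2" operands throughout), so ROL(B,30)
# is folded into the barrel-shifter operand of other instructions; the
# state is un-rotated again when it is added back to the context at
# .L_done.
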
$output=shift;
open STDOUT,">$output";

$ctx="r0";
$inp="r1";
$len="r2";
$a="r3";
$b="r4";
$c="r5";
$d="r6";
$e="r7";
$K="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
$Xi="r14";
@V=($a,$b,$c,$d,$e);

# One can optimize this for aligned access on big-endian architecture,
# but code's endian neutrality makes it too pretty:-)
sub Xload {
my ($a,$b,$c,$d,$e)=@_;
$code.=<<___;
	ldrb	$t0,[$inp],#4
	ldrb	$t1,[$inp,#-3]
	ldrb	$t2,[$inp,#-2]
	ldrb	$t3,[$inp,#-1]
	add	$e,$K,$e,ror#2		@ E+=K_00_19
	orr	$t0,$t1,$t0,lsl#8
	add	$e,$e,$a,ror#27		@ E+=ROR(A,27)
	orr	$t0,$t2,$t0,lsl#8
	eor	$t1,$c,$d		@ F_xx_xx
	orr	$t0,$t3,$t0,lsl#8
	add	$e,$e,$t0		@ E+=X[i]
	str	$t0,[$Xi,#-4]!
___
}
sub Xupdate {
my ($a,$b,$c,$d,$e,$flag)=@_;
$code.=<<___;
	ldr	$t0,[$Xi,#15*4]
	ldr	$t1,[$Xi,#13*4]
	ldr	$t2,[$Xi,#7*4]
	ldr	$t3,[$Xi,#2*4]
	add	$e,$K,$e,ror#2		@ E+=K_xx_xx
	eor	$t0,$t0,$t1
	eor	$t0,$t0,$t2
	eor	$t0,$t0,$t3
	add	$e,$e,$a,ror#27		@ E+=ROR(A,27)
___
$code.=<<___ if (!defined($flag));
	eor	$t1,$c,$d		@ F_xx_xx, but not in 40_59
___
$code.=<<___;
	mov	$t0,$t0,ror#31
	add	$e,$e,$t0		@ E+=X[i]
	str	$t0,[$Xi,#-4]!
___
}

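# Xupdate above implements the standard SHA-1 message schedule,
#
#	X[i] = ROL(X[i-3] ^ X[i-8] ^ X[i-14] ^ X[i-16], 1)
#
# with ROL-by-1 written as "ror#31". X[] is kept on the stack, one
# word pushed per round via the pre-decremented pointer $Xi (80 words
# per block, released again at .L_done), so the four ldr offsets above
# pick up X[i-16], X[i-14], X[i-8] and X[i-3] respectively.
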
sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_;
	&Xload(@_);
$code.=<<___;
	and	$t1,$b,$t1,ror#2
	eor	$t1,$t1,$d,ror#2	@ F_00_19(B,C,D)
	add	$e,$e,$t1		@ E+=F_00_19(B,C,D)
___
}

sub BODY_16_19 {
my ($a,$b,$c,$d,$e)=@_;
	&Xupdate(@_);
$code.=<<___;
	and	$t1,$b,$t1,ror#2
	eor	$t1,$t1,$d,ror#2	@ F_00_19(B,C,D)
	add	$e,$e,$t1		@ E+=F_00_19(B,C,D)
___
}

sub BODY_20_39 {
my ($a,$b,$c,$d,$e)=@_;
	&Xupdate(@_);
$code.=<<___;
	eor	$t1,$b,$t1,ror#2	@ F_20_39(B,C,D)
	add	$e,$e,$t1		@ E+=F_20_39(B,C,D)
___
}

sub BODY_40_59 {
my ($a,$b,$c,$d,$e)=@_;
	&Xupdate(@_,1);
$code.=<<___;
	and	$t1,$b,$c,ror#2
	orr	$t2,$b,$c,ror#2
	and	$t2,$t2,$d,ror#2
	orr	$t1,$t1,$t2		@ F_40_59(B,C,D)
	add	$e,$e,$t1		@ E+=F_40_59(B,C,D)
___
}

$code=<<___;
.text

.global	sha1_block_data_order
.type	sha1_block_data_order,%function

.align	2
sha1_block_data_order:
	stmdb	sp!,{r4-r12,lr}
	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
	ldmia	$ctx,{$a,$b,$c,$d,$e}
.Lloop:
	ldr	$K,.LK_00_19
	mov	$Xi,sp
	sub	sp,sp,#15*4
	mov	$c,$c,ror#30
	mov	$d,$d,ror#30
	mov	$e,$e,ror#30		@ [6]
.L_00_15:
___
for($i=0;$i<5;$i++) {
	&BODY_00_15(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp
	bne	.L_00_15		@ [((11+4)*5+2)*3]
___
	&BODY_00_15(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
$code.=<<___;

	ldr	$K,.LK_20_39		@ [+15+16*4]
	sub	sp,sp,#25*4
	cmn	sp,#0			@ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
___
for($i=0;$i<5;$i++) {
	&BODY_20_39(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp			@ preserve carry
	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes

	ldr	$K,.LK_40_59
	sub	sp,sp,#20*4		@ [+2]
.L_40_59:
___
for($i=0;$i<5;$i++) {
	&BODY_40_59(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp
	bne	.L_40_59		@ [+((12+5)*5+2)*4]

	ldr	$K,.LK_60_79
	sub	sp,sp,#20*4
	cmp	sp,#0			@ set carry to denote 60_79
	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
.L_done:
	add	sp,sp,#80*4		@ "deallocate" stack frame
	ldmia	$ctx,{$K,$t0,$t1,$t2,$t3}
	add	$a,$K,$a
	add	$b,$t0,$b
	add	$c,$t1,$c,ror#2
	add	$d,$t2,$d,ror#2
	add	$e,$t3,$e,ror#2
	stmia	$ctx,{$a,$b,$c,$d,$e}
	teq	$inp,$len
	bne	.Lloop			@ [+18], total 1307

	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.align	2
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
.size	sha1_block_data_order,.-sha1_block_data_order
.asciz	"SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/sha/asm/sha1-ppc.pl b/src/lib/libcrypto/sha/asm/sha1-ppc.pl
new file mode 100755
index 0000000000..dcd0fcdfcf
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha1-ppc.pl
@@ -0,0 +1,319 @@
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# I let hardware handle unaligned input(*), except on page boundaries
# (see below for details). Otherwise it is a straightforward
# implementation with the X vector in the register bank. The module is
# big-endian [which is no big deal, as there are no little-endian
# targets left around].
#
# (*) this means that this module is inappropriate for PPC403? Does
#     anybody know if pre-POWER3 can sustain unaligned load?

# 			-m64	-m32
# ----------------------------------
# PPC970,gcc-4.0.0	+76%	+59%
# Power6,xlc-7		+68%	+33%

$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$UCMP	="cmplw";
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";

$FRAME=24*$SIZE_T;

$K  ="r0";
$sp ="r1";
$toc="r2";
$ctx="r3";
$inp="r4";
$num="r5";
$t0 ="r15";
$t1 ="r6";

$A  ="r7";
$B  ="r8";
$C  ="r9";
$D  ="r10";
$E  ="r11";
$T  ="r12";

@V=($A,$B,$C,$D,$E,$T);
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
    "r24","r25","r26","r27","r28","r29","r30","r31");

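# For reference, the three body subroutines below implement the
# standard SHA-1 round functions (FIPS 180-1):
#
#	F_00_19(b,c,d) = (b & c) | (~b & d)		-> and/andc/or
#	F_20_39(b,c,d) = b ^ c ^ d			-> xor/xor
#	F_40_59(b,c,d) = (b & c) | ((b | c) & d)	-> and/or/and/or
#
# each interleaved with the xor/rotlwi chain that schedules the next
# message word X[j].
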
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	lwz	@X[$i],`$i*4`($inp)
___
$code.=<<___ if ($i<15);
	lwz	@X[$j],`$j*4`($inp)
	add	$f,$K,$e
	rotlwi	$e,$a,5
	add	$f,$f,@X[$i]
	and	$t0,$c,$b
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	add	$f,$f,$t0
___
$code.=<<___ if ($i>=15);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$c,$b
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$t0
	rotlwi	@X[$j%16],@X[$j%16],1
___
}

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	xor	$t0,$b,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	rotlwi	$b,$b,30
	xor	$t0,$t0,$d
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$t0
	rotlwi	@X[$j%16],@X[$j%16],1
___
$code.=<<___ if ($i==79);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	lwz	r16,0($ctx)
	add	$f,$f,@X[$i%16]
	xor	$t0,$b,$c
	lwz	r17,4($ctx)
	add	$f,$f,$e
	rotlwi	$b,$b,30
	lwz	r18,8($ctx)
	xor	$t0,$t0,$d
	lwz	r19,12($ctx)
	add	$f,$f,$t0
	lwz	r20,16($ctx)
___
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___;
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$b,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	or	$t1,$b,$c
	rotlwi	$b,$b,30
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	and	$t1,$t1,$d
	or	$t0,$t0,$t1
	rotlwi	@X[$j%16],@X[$j%16],1
	add	$f,$f,$t0
___
}

$code=<<___;
.machine	"any"
.text

.globl	.sha1_block_data_order
.align	4
.sha1_block_data_order:
	mflr	r0
	$STU	$sp,`-($FRAME+64)`($sp)
	$PUSH	r0,`$FRAME-$SIZE_T*18`($sp)
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	lwz	$A,0($ctx)
	lwz	$B,4($ctx)
	lwz	$C,8($ctx)
	lwz	$D,12($ctx)
	lwz	$E,16($ctx)
	andi.	r0,$inp,3
	bne	Lunaligned
Laligned:
	mtctr	$num
	bl	Lsha1_block_private
Ldone:
	$POP	r0,`$FRAME-$SIZE_T*18`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,`$FRAME+64`
	blr
___

# The PowerPC specification allows an implementation to be ill-behaved
# upon an unaligned access which crosses a page boundary. The "better
# safe than sorry" principle makes me treat such input specially. But I
# don't look for the particular offending word; rather, I look for the
# 64-byte input block which crosses the boundary. Once found, that
# block is copied to an aligned location and hashed separately...
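# In C-like pseudocode, the dispatch below amounts to (a sketch for
# orientation only; "hash" stands for the Lsha1_block_private call):
#
#	blocks = ((4096 - (inp & 4095)) & 4095) >> 6;
#	if (blocks != 0) {
#		if (num <= blocks) goto aligned;	/* no crossing */
#		hash(inp, blocks); num -= blocks;	/* up to boundary */
#	}
#	memcpy(frame, inp, 64); hash(frame, 1);		/* straddler */
#	if (--num) goto unaligned; else goto done;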
$code.=<<___;
.align	4
Lunaligned:
	subfic	$t1,$inp,4096
	andi.	$t1,$t1,4095	; distance to closest page boundary
	srwi.	$t1,$t1,6	; t1/=64
	beq	Lcross_page
	$UCMP	$num,$t1
	ble-	Laligned	; didn't cross the page boundary
	mtctr	$t1
	subfc	$num,$t1,$num
	bl	Lsha1_block_private
Lcross_page:
	li	$t1,16
	mtctr	$t1
	addi	r20,$sp,$FRAME	; spot below the frame
Lmemcpy:
	lbz	r16,0($inp)
	lbz	r17,1($inp)
	lbz	r18,2($inp)
	lbz	r19,3($inp)
	addi	$inp,$inp,4
	stb	r16,0(r20)
	stb	r17,1(r20)
	stb	r18,2(r20)
	stb	r19,3(r20)
	addi	r20,r20,4
	bdnz	Lmemcpy

	$PUSH	$inp,`$FRAME-$SIZE_T*19`($sp)
	li	$t1,1
	addi	$inp,$sp,$FRAME
	mtctr	$t1
	bl	Lsha1_block_private
	$POP	$inp,`$FRAME-$SIZE_T*19`($sp)
	addic.	$num,$num,-1
	bne-	Lunaligned
	b	Ldone
___

# This is the private block function, which uses a tailored calling
# convention: upon entry the SHA_CTX is pre-loaded into the given
# registers and the counter register contains the number of chunks to
# digest...
$code.=<<___;
.align	4
Lsha1_block_private:
___
$code.=<<___;	# load K_00_19
	lis	$K,0x5a82
	ori	$K,$K,0x7999
___
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_20_39
	lis	$K,0x6ed9
	ori	$K,$K,0xeba1
___
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_40_59
	lis	$K,0x8f1b
	ori	$K,$K,0xbcdc
___
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_60_79
	lis	$K,0xca62
	ori	$K,$K,0xc1d6
___
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	add	r16,r16,$E
	add	r17,r17,$T
	add	r18,r18,$A
	add	r19,r19,$B
	add	r20,r20,$C
	stw	r16,0($ctx)
	mr	$A,r16
	stw	r17,4($ctx)
	mr	$B,r17
	stw	r18,8($ctx)
	mr	$C,r18
	stw	r19,12($ctx)
	mr	$D,r19
	stw	r20,16($ctx)
	mr	$E,r20
	addi	$inp,$inp,`16*4`
	bdnz-	Lsha1_block_private
	blr
___
$code.=<<___;
.asciz	"SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-s390x.pl b/src/lib/libcrypto/sha/asm/sha1-s390x.pl
new file mode 100644
index 0000000000..4b17848287
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha1-s390x.pl
@@ -0,0 +1,226 @@
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA1 block procedure for s390x.

# April 2007.
#
# Performance is >30% better than gcc 3.3 generated code. But the real
# twist is that SHA1 hardware support is detected and utilized. In
# that case performance can reach a further >4.5x for larger chunks.

# January 2009.
#
# Optimize Xupdate for the number of memory references and reschedule
# instructions to favour the dual-issue z10 pipeline. On z10, hardware
# is "only" ~2.3x faster than software.

$kimdfunc=1;	# magic function code for kimd instruction

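# [kimd is the z/Architecture "compute intermediate message digest"
# instruction. Function code 0 is the query function, whose parameter
# block reports which functions are installed; function code 1 selects
# SHA-1. The generated code below queries for SHA-1 support first and
# falls back to the software path if it is absent.]
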
$output=shift;
open STDOUT,">$output";

$K_00_39="%r0"; $K=$K_00_39;
$K_40_79="%r1";
$ctx="%r2";	$prefetch="%r2";
$inp="%r3";
$len="%r4";

$A="%r5";
$B="%r6";
$C="%r7";
$D="%r8";
$E="%r9";	@V=($A,$B,$C,$D,$E);
$t0="%r10";
$t1="%r11";
@X=("%r12","%r13","%r14");
$sp="%r15";

$frame=160+16*4;

sub Xupdate {
my $i=shift;

$code.=<<___ if ($i==15);
	lg	$prefetch,160($sp)	### Xupdate(16) warm-up
	lr	$X[0],$X[2]
___
return if ($i&1);	# Xupdate is vectorized and executed every 2nd round
$code.=<<___ if ($i<16);
	lg	$X[0],`$i*4`($inp)	### Xload($i)
	rllg	$X[1],$X[0],32
___
$code.=<<___ if ($i>=16);
	xgr	$X[0],$prefetch		### Xupdate($i)
	lg	$prefetch,`160+4*(($i+2)%16)`($sp)
	xg	$X[0],`160+4*(($i+8)%16)`($sp)
	xgr	$X[0],$prefetch
	rll	$X[0],$X[0],1
	rllg	$X[1],$X[0],32
	rll	$X[1],$X[1],1
	rllg	$X[0],$X[1],32
	lr	$X[2],$X[1]		# feedback
___
$code.=<<___ if ($i<=70);
	stg	$X[0],`160+4*($i%16)`($sp)
___
unshift(@X,pop(@X));
}

sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];

	&Xupdate($i);
$code.=<<___;
	alr	$e,$K		### $i
	rll	$t1,$a,5
	lr	$t0,$d
	xr	$t0,$c
	alr	$e,$t1
	nr	$t0,$b
	alr	$e,$xi
	xr	$t0,$d
	rll	$b,$b,30
	alr	$e,$t0
___
}

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];

	&Xupdate($i);
$code.=<<___;
	alr	$e,$K		### $i
	rll	$t1,$a,5
	lr	$t0,$b
	alr	$e,$t1
	xr	$t0,$c
	alr	$e,$xi
	xr	$t0,$d
	rll	$b,$b,30
	alr	$e,$t0
___
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];

	&Xupdate($i);
$code.=<<___;
	alr	$e,$K		### $i
	rll	$t1,$a,5
	lr	$t0,$b
	alr	$e,$t1
	or	$t0,$c
	lr	$t1,$b
	nr	$t0,$d
	nr	$t1,$c
	alr	$e,$xi
	or	$t0,$t1
	rll	$b,$b,30
	alr	$e,$t0
___
}

$code.=<<___;
.text
.align	64
.type	Ktable,\@object
Ktable: .long	0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
	.skip	48	#.long	0,0,0,0,0,0,0,0,0,0,0,0
.size	Ktable,.-Ktable
.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function
sha1_block_data_order:
___
$code.=<<___ if ($kimdfunc);
	larl	%r1,OPENSSL_s390xcap_P
	lg	%r0,0(%r1)
	tmhl	%r0,0x4000	# check for message-security assist
	jz	.Lsoftware
	lghi	%r0,0
	la	%r1,16($sp)
	.long	0xb93e0002	# kimd %r0,%r2
	lg	%r0,16($sp)
	tmhh	%r0,`0x8000>>$kimdfunc`
	jz	.Lsoftware
	lghi	%r0,$kimdfunc
	lgr	%r1,$ctx
	lgr	%r2,$inp
	sllg	%r3,$len,6
	.long	0xb93e0002	# kimd %r0,%r2
	brc	1,.-4		# pay attention to "partial completion"
	br	%r14
.align	16
.Lsoftware:
___
$code.=<<___;
	lghi	%r1,-$frame
	stg	$ctx,16($sp)
	stmg	%r6,%r15,48($sp)
	lgr	%r0,$sp
	la	$sp,0(%r1,$sp)
	stg	%r0,0($sp)

	larl	$t0,Ktable
	llgf	$A,0($ctx)
	llgf	$B,4($ctx)
	llgf	$C,8($ctx)
	llgf	$D,12($ctx)
	llgf	$E,16($ctx)

	lg	$K_00_39,0($t0)
	lg	$K_40_79,8($t0)

.Lloop:
	rllg	$K_00_39,$K_00_39,32
___
for ($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	rllg	$K_00_39,$K_00_39,32
___
for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___; $K=$K_40_79;
	rllg	$K_40_79,$K_40_79,32
___
for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	rllg	$K_40_79,$K_40_79,32
___
for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;

	lg	$ctx,`$frame+16`($sp)
	la	$inp,64($inp)
	al	$A,0($ctx)
	al	$B,4($ctx)
	al	$C,8($ctx)
	al	$D,12($ctx)
	al	$E,16($ctx)
	st	$A,0($ctx)
	st	$B,4($ctx)
	st	$C,8($ctx)
	st	$D,12($ctx)
	st	$E,16($ctx)
	brct	$len,.Lloop

	lmg	%r6,%r15,`$frame+48`($sp)
	br	%r14
.size	sha1_block_data_order,.-sha1_block_data_order
.string	"SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
.comm	OPENSSL_s390xcap_P,8,8
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl
new file mode 100644
index 0000000000..8306fc88cc
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha1-sparcv9.pl
@@ -0,0 +1,283 @@
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Performance improvement is not really impressive on pre-T1 CPUs: +8%
# over Sun C and +25% over gcc [3.3]. On T1, a.k.a. Niagara, however,
# it turned out to be 40% faster than the 64-bit code generated by Sun
# C 5.8 and >2x faster than the 64-bit code generated by gcc 3.4. And
# there is a gimmick: the X[16] vector is packed into 8 64-bit
# registers, so as a result nothing is spilled on the stack. In
# addition, input data is loaded in a compact instruction sequence,
# thus minimizing the window in which the code is subject to an
# [inter-thread] cache-thrashing hazard. The goal is to ensure
# scalability on UltraSPARC T1, or rather to avoid decay when the
# number of active threads exceeds the number of physical cores.

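# [The packing works because SHA-1 operates on 32-bit words: each
# 64-bit register of @X holds a pair of adjacent X[] words. The $rot1m
# mask set up below (0x0000000100000001) lets both halves be rotated
# left by 1 in parallel: "add X,X,X" shifts both lanes (leaking one
# bit across the lane boundary), "srlx X,31" recovers the two bits
# that must wrap, and the and/andn pair confines each wrapped bit to
# its own 32-bit lane before the final or.]
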
$bits=32;
for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=112; }

$output=shift;
open STDOUT,">$output";

@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
$rot1m="%g2";
$tmp64="%g3";
$Xi="%g4";
$A="%l0";
$B="%l1";
$C="%l2";
$D="%l3";
$E="%l4";
@V=($A,$B,$C,$D,$E);
$K_00_19="%l5";
$K_20_39="%l6";
$K_40_59="%l7";
$K_60_79="%g5";
@K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);

$ctx="%i0";
$inp="%i1";
$len="%i2";
$tmp0="%i3";
$tmp1="%i4";
$tmp2="%i5";

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=($i&1)?@X[($i/2)%8]:$Xi;

$code.=<<___;
	sll	$a,5,$tmp0	!! $i
	add	@K[$i/20],$e,$e
	srl	$a,27,$tmp1
	add	$tmp0,$e,$e
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	andn	$d,$b,$tmp1
	srl	$b,2,$b
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$xi,$e,$e
___
if ($i&1 && $i<15) {
	$code.=
	"	srlx	@X[(($i+1)/2)%8],32,$Xi\n";
}
$code.=<<___;
	add	$tmp1,$e,$e
___
}

sub Xupdate {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i/2;

if ($i&1) {
$code.=<<___;
	sll	$a,5,$tmp0	!! $i
	add	@K[$i/20],$e,$e
	srl	$a,27,$tmp1
___
} else {
$code.=<<___;
	sllx	@X[($j+6)%8],32,$Xi	! Xupdate($i)
	xor	@X[($j+1)%8],@X[$j%8],@X[$j%8]
	srlx	@X[($j+7)%8],32,$tmp1
	xor	@X[($j+4)%8],@X[$j%8],@X[$j%8]
	sll	$a,5,$tmp0	!! $i
	or	$tmp1,$Xi,$Xi
	add	@K[$i/20],$e,$e	!!
	xor	$Xi,@X[$j%8],@X[$j%8]
	srlx	@X[$j%8],31,$Xi
	add	@X[$j%8],@X[$j%8],@X[$j%8]
	and	$Xi,$rot1m,$Xi
	andn	@X[$j%8],$rot1m,@X[$j%8]
	srl	$a,27,$tmp1	!!
	or	$Xi,@X[$j%8],@X[$j%8]
___
}
}

sub BODY_16_19 {
my ($i,$a,$b,$c,$d,$e)=@_;

	&Xupdate(@_);
	if ($i&1) {
		$xi=@X[($i/2)%8];
	} else {
		$xi=$Xi;
		$code.="\tsrlx	@X[($i/2)%8],32,$xi\n";
	}
$code.=<<___;
	add	$tmp0,$e,$e	!!
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	add	$xi,$e,$e
	andn	$d,$b,$tmp1
	srl	$b,2,$b
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$tmp1,$e,$e
___
}

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;
	&Xupdate(@_);
	if ($i&1) {
		$xi=@X[($i/2)%8];
	} else {
		$xi=$Xi;
		$code.="\tsrlx	@X[($i/2)%8],32,$xi\n";
	}
$code.=<<___;
	add	$tmp0,$e,$e	!!
	xor	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	xor	$d,$tmp0,$tmp1
	srl	$b,2,$b
	add	$tmp1,$e,$e
	or	$tmp2,$b,$b
	add	$xi,$e,$e
___
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;
	&Xupdate(@_);
	if ($i&1) {
		$xi=@X[($i/2)%8];
	} else {
		$xi=$Xi;
		$code.="\tsrlx	@X[($i/2)%8],32,$xi\n";
	}
$code.=<<___;
	add	$tmp0,$e,$e	!!
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	or	$c,$b,$tmp1
	srl	$b,2,$b
	and	$d,$tmp1,$tmp1
	add	$xi,$e,$e
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$tmp1,$e,$e
___
}

$code.=<<___ if ($bits==64);
.register	%g2,#scratch
.register	%g3,#scratch
___
$code.=<<___;
.section	".text",#alloc,#execinstr

.align	32
.globl	sha1_block_data_order
sha1_block_data_order:
	save	%sp,-$frame,%sp
	sllx	$len,6,$len
	add	$inp,$len,$len

	or	%g0,1,$rot1m
	sllx	$rot1m,32,$rot1m
	or	$rot1m,1,$rot1m

	ld	[$ctx+0],$A
	ld	[$ctx+4],$B
	ld	[$ctx+8],$C
	ld	[$ctx+12],$D
	ld	[$ctx+16],$E
	andn	$inp,7,$tmp0

	sethi	%hi(0x5a827999),$K_00_19
	or	$K_00_19,%lo(0x5a827999),$K_00_19
	sethi	%hi(0x6ed9eba1),$K_20_39
	or	$K_20_39,%lo(0x6ed9eba1),$K_20_39
	sethi	%hi(0x8f1bbcdc),$K_40_59
	or	$K_40_59,%lo(0x8f1bbcdc),$K_40_59
	sethi	%hi(0xca62c1d6),$K_60_79
	or	$K_60_79,%lo(0xca62c1d6),$K_60_79

.Lloop:
	ldx	[$tmp0+0],@X[0]
	ldx	[$tmp0+16],@X[2]
	ldx	[$tmp0+32],@X[4]
	ldx	[$tmp0+48],@X[6]
	and	$inp,7,$tmp1
	ldx	[$tmp0+8],@X[1]
	sll	$tmp1,3,$tmp1
	ldx	[$tmp0+24],@X[3]
	subcc	%g0,$tmp1,$tmp2	! should be 64-$tmp1, but -$tmp1 works too
	ldx	[$tmp0+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$tmp0+56],@X[7]

	sllx	@X[0],$tmp1,@X[0]
	ldx	[$tmp0+64],$tmp64
___
for($i=0;$i<7;$i++)
{   $code.=<<___;
	srlx	@X[$i+1],$tmp2,$Xi
	sllx	@X[$i+1],$tmp1,@X[$i+1]
	or	$Xi,@X[$i],@X[$i]
___
}
$code.=<<___;
	srlx	$tmp64,$tmp2,$tmp64
	or	$tmp64,@X[7],@X[7]
.Laligned:
	srlx	@X[0],32,$Xi
___
for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
for (;$i<20;$i++)	{ &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;

	ld	[$ctx+0],@X[0]
	ld	[$ctx+4],@X[1]
	ld	[$ctx+8],@X[2]
	ld	[$ctx+12],@X[3]
	add	$inp,64,$inp
	ld	[$ctx+16],@X[4]
	cmp	$inp,$len

	add	$A,@X[0],$A
	st	$A,[$ctx+0]
	add	$B,@X[1],$B
	st	$B,[$ctx+4]
	add	$C,@X[2],$C
	st	$C,[$ctx+8]
	add	$D,@X[3],$D
	st	$D,[$ctx+12]
	add	$E,@X[4],$E
	st	$E,[$ctx+16]

	bne	`$bits==64?"%xcc":"%icc"`,.Lloop
	andn	$inp,7,$tmp0

	ret
	restore
.type	sha1_block_data_order,#function
.size	sha1_block_data_order,(.-sha1_block_data_order)
.asciz	"SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl b/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl
new file mode 100644
index 0000000000..15eb854bad
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha1-sparcv9a.pl
@@ -0,0 +1,600 @@
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2009
#
# Provided that UltraSPARC VIS instructions are pipe-lined(*) and
# pairable(*) with IALU ones, offloading Xupdate to the UltraSPARC
# Graphics Unit makes it possible to achieve higher instruction-level
# parallelism, ILP, and thus higher performance. It should be
# explicitly noted that ILP is the key word here, which means this
# code is unsuitable for cores like UltraSPARC-Tx. The idea is not
# really novel; Sun has had a VIS-powered implementation for a while.
# Unlike Sun's implementation, this one can process multiple unaligned
# input blocks, and as such works as a drop-in replacement for OpenSSL
# sha1_block_data_order. The performance improvement was measured to
# be 40% over the pure IALU sha1-sparcv9.pl on UltraSPARC-IIi, but 12%
# on UltraSPARC-III. See below for discussion...
#
# The module is of no direct interest to OpenSSL, because it doesn't
# provide better performance on contemporary SPARCv9 CPUs,
# UltraSPARC-Tx and SPARC64-V[II] to be specific. Those who feel they
# absolutely must score on UltraSPARC-I-IV can simply replace
# crypto/sha/asm/sha1-sparcv9.pl with this module.
#
# (*)	"Pipe-lined" means that even if it takes several cycles to
#	complete, the next instruction using the same functional unit
#	[but not depending on the result of the current instruction]
#	can start execution without having to wait for the unit.
#	"Pairable" means that two [or more] independent instructions
#	can be issued at the very same time.

$bits=32;
for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=112; }

$output=shift;
open STDOUT,">$output";

$ctx="%i0";
$inp="%i1";
$len="%i2";
$tmp0="%i3";
$tmp1="%i4";
$tmp2="%i5";
$tmp3="%g5";

$base="%g1";
$align="%g4";
$Xfer="%o5";
$nXfer=$tmp3;
$Xi="%o7";

$A="%l0";
$B="%l1";
$C="%l2";
$D="%l3";
$E="%l4";
@V=($A,$B,$C,$D,$E);

$Actx="%o0";
$Bctx="%o1";
$Cctx="%o2";
$Dctx="%o3";
$Ectx="%o4";

$fmul="%f32";
$VK_00_19="%f34";
$VK_20_39="%f36";
$VK_40_59="%f38";
$VK_60_79="%f40";
@VK=($VK_00_19,$VK_20_39,$VK_40_59,$VK_60_79);
@X=("%f0", "%f1", "%f2", "%f3", "%f4", "%f5", "%f6", "%f7",
    "%f8", "%f9","%f10","%f11","%f12","%f13","%f14","%f15","%f16");

# This is the reference 2x-parallelized VIS-powered Xupdate procedure.
# It covers even the K_NN_MM addition...
sub Xupdate {
my ($i)=@_;
my $K=@VK[($i+16)/20];
my $j=($i+16)%16;

# [ provided that GSR.alignaddr_offset is 5, $fmul contains the
# 0x100ULL<<32|0x100 value and K_NN_MM are pre-loaded to the
# chosen registers... ]
$code.=<<___;
	fxors		@X[($j+13)%16],@X[$j],@X[$j]	!-1/-1/-1:X[0]^=X[13]
	fxors		@X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
	fxor		@X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
	fxor		%f18,@X[$j],@X[$j]		! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
	faligndata	@X[$j],@X[$j],%f18		! 3/ 7/ 5:Tmp=X[0,1]>>>24
	fpadd32		@X[$j],@X[$j],@X[$j]		! 4/ 8/ 6:X[0,1]<<=1
	fmul8ulx16	%f18,$fmul,%f18			! 5/10/ 7:Tmp>>=7, Tmp&=1
	![fxors		%f15,%f2,%f2]
	for		%f18,@X[$j],@X[$j]		! 8/14/10:X[0,1]|=Tmp
	![fxors		%f0,%f3,%f3]			!10/17/12:X[0] dependency
	fpadd32		$K,@X[$j],%f20
	std		%f20,[$Xfer+`4*$j`]
___
# The numbers delimited with slashes are the earliest possible dispatch
# cycles for a given instruction, assuming 1-cycle latency for simple
# VIS instructions (such as on UltraSPARC-I&II), 3-cycle latency (such
# as on UltraSPARC-III&IV) and 2-cycle latency(*), respectively. Being
# 2x-parallelized, the procedure is "worth" 5, 8.5 or 6 ticks per SHA1
# round. As [long as] FPU/VIS instructions are perfectly pairable with
# IALU ones, the round timing is defined by the maximum of the VIS and
# IALU timings. The latter varies from round to round and averages out
# at 6.25 ticks. This means that USI&II should operate at the IALU
# rate, while USIII&IV at the VIS rate - which explains why the
# performance improvement varies among processors, given that the pure
# IALU sha1-sparcv9.pl module exhibits virtually uniform performance
# of ~9.3 cycles per SHA1 round. The timings mentioned above are
# theoretical lower limits. Real-life performance was measured to be
# 6.6 cycles per SHA1 round on USIIi and 8.3 on USIII. The latter is
# lower than the half-round VIS timing, because there are 16
# Xupdate-free rounds, which "push down" the average theoretical
# timing to 8 cycles...

# (*)	SPARC64-V[II] was originally believed to have 2-cycle VIS
#	latency. Well, it might have, but it doesn't have a dedicated
#	VIS unit. Instead, VIS instructions are executed by other
#	functional units - the ones used here, by the IALU. This
#	doesn't improve effective ILP...
}
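# [Net effect of the VIS sequence above, per 32-bit lane: Tmp =
# X >>> 24 (faligndata with alignaddr_offset 5), X <<= 1 (fpadd32),
# Tmp = (Tmp >> 7) & 1 (fmul8ulx16 with the 0x100 multiplier), X |=
# Tmp - i.e. X = (X << 1) | (X >>> 31), the SHA-1 rotate-by-one,
# computed on two message words at once; a second fpadd32 then adds
# the round constant before the std to the Xfer area.]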

# The reference Xupdate procedure is then "strained" over *pairs* of
# BODY_NN_MM and kind of modulo-scheduled with respect to X[n]^=X[n+13]
# and the K_NN_MM addition. It "runs" 15 rounds ahead, which leaves
# plenty of room to amortize the read-after-write hazard, as well as
# to fetch and align input for the next spin. The VIS instructions are
# scheduled for a latency of 2 cycles, because there are not enough
# IALU instructions to schedule for a latency of 3, while scheduling
# for 1 would give no gain on USI&II anyway.

sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;
my $k=($j+16+2)%16;	# ahead reference
my $l=($j+16-2)%16;	# behind reference
my $K=@VK[($j+16-2)/20];

$j=($j+16)%16;

$code.=<<___ if (!($i&1));
	sll		$a,5,$tmp0			!! $i
	and		$c,$b,$tmp3
	ld		[$Xfer+`4*($i%16)`],$Xi
	fxors		@X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	fxor		@X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
	sll		$b,30,$tmp2
	add		$tmp1,$e,$e
	andn		$d,$b,$tmp1
	add		$Xi,$e,$e
	fxor		%f18,@X[$j],@X[$j]		! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
	srl		$b,2,$b
	or		$tmp1,$tmp3,$tmp1
	or		$tmp2,$b,$b
	add		$tmp1,$e,$e
	faligndata	@X[$j],@X[$j],%f18		! 3/ 7/ 5:Tmp=X[0,1]>>>24
___
$code.=<<___ if ($i&1);
	sll		$a,5,$tmp0			!! $i
	and		$c,$b,$tmp3
	ld		[$Xfer+`4*($i%16)`],$Xi
	fpadd32		@X[$j],@X[$j],@X[$j]		! 4/ 8/ 6:X[0,1]<<=1
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	fmul8ulx16	%f18,$fmul,%f18			! 5/10/ 7:Tmp>>=7, Tmp&=1
	sll		$b,30,$tmp2
	add		$tmp1,$e,$e
	fpadd32		$K,@X[$l],%f20			!
	andn		$d,$b,$tmp1
	add		$Xi,$e,$e
	fxors		@X[($k+13)%16],@X[$k],@X[$k]	!-1/-1/-1:X[0]^=X[13]
	srl		$b,2,$b
	or		$tmp1,$tmp3,$tmp1
	fxor		%f18,@X[$j],@X[$j]		! 8/14/10:X[0,1]|=Tmp
	or		$tmp2,$b,$b
	add		$tmp1,$e,$e
___
$code.=<<___ if ($i&1 && $i>=2);
	std		%f20,[$Xfer+`4*$l`]		!
___
}

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;
my $k=($j+16+2)%16;	# ahead reference
my $l=($j+16-2)%16;	# behind reference
my $K=@VK[($j+16-2)/20];

$j=($j+16)%16;

$code.=<<___ if (!($i&1) && $i<64);
	sll		$a,5,$tmp0			!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	fxors		@X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	fxor		@X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	fxor		%f18,@X[$j],@X[$j]		! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
	srl		$b,2,$b
	add		$tmp1,$e,$e
	or		$tmp2,$b,$b
	add		$Xi,$e,$e
	faligndata	@X[$j],@X[$j],%f18		! 3/ 7/ 5:Tmp=X[0,1]>>>24
___
$code.=<<___ if ($i&1 && $i<64);
	sll		$a,5,$tmp0			!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	fpadd32		@X[$j],@X[$j],@X[$j]		! 4/ 8/ 6:X[0,1]<<=1
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	fmul8ulx16	%f18,$fmul,%f18			! 5/10/ 7:Tmp>>=7, Tmp&=1
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	fpadd32		$K,@X[$l],%f20			!
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	fxors		@X[($k+13)%16],@X[$k],@X[$k]	!-1/-1/-1:X[0]^=X[13]
	srl		$b,2,$b
	add		$tmp1,$e,$e
	fxor		%f18,@X[$j],@X[$j]		! 8/14/10:X[0,1]|=Tmp
	or		$tmp2,$b,$b
	add		$Xi,$e,$e
	std		%f20,[$Xfer+`4*$l`]		!
___
$code.=<<___ if ($i==64);
	sll		$a,5,$tmp0			!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	fpadd32		$K,@X[$l],%f20
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	std		%f20,[$Xfer+`4*$l`]
	srl		$b,2,$b
	add		$tmp1,$e,$e
	or		$tmp2,$b,$b
	add		$Xi,$e,$e
___
$code.=<<___ if ($i>64);
	sll		$a,5,$tmp0			!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	srl		$b,2,$b
	add		$tmp1,$e,$e
	or		$tmp2,$b,$b
	add		$Xi,$e,$e
___
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;
my $k=($j+16+2)%16;	# ahead reference
my $l=($j+16-2)%16;	# behind reference
my $K=@VK[($j+16-2)/20];

$j=($j+16)%16;

$code.=<<___ if (!($i&1));
	sll		$a,5,$tmp0			!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	fxors		@X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	fxor		@X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
	and		$c,$b,$tmp0
	add		$tmp1,$e,$e
	sll		$b,30,$tmp2
	or		$c,$b,$tmp1
	fxor		%f18,@X[$j],@X[$j]		! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
	srl		$b,2,$b
	and		$d,$tmp1,$tmp1
	add		$Xi,$e,$e
	or		$tmp1,$tmp0,$tmp1
	faligndata	@X[$j],@X[$j],%f18		! 3/ 7/ 5:Tmp=X[0,1]>>>24
	or		$tmp2,$b,$b
	add		$tmp1,$e,$e
	fpadd32		@X[$j],@X[$j],@X[$j]		! 4/ 8/ 6:X[0,1]<<=1
___
$code.=<<___ if ($i&1);
	sll		$a,5,$tmp0			!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	fmul8ulx16	%f18,$fmul,%f18			! 5/10/ 7:Tmp>>=7, Tmp&=1
	and		$c,$b,$tmp0
	add		$tmp1,$e,$e
	fpadd32		$K,@X[$l],%f20			!
	sll		$b,30,$tmp2
	or		$c,$b,$tmp1
	fxors		@X[($k+13)%16],@X[$k],@X[$k]	!-1/-1/-1:X[0]^=X[13]
	srl		$b,2,$b
	and		$d,$tmp1,$tmp1
	fxor		%f18,@X[$j],@X[$j]		! 8/14/10:X[0,1]|=Tmp
	add		$Xi,$e,$e
	or		$tmp1,$tmp0,$tmp1
	or		$tmp2,$b,$b
	add		$tmp1,$e,$e
	std		%f20,[$Xfer+`4*$l`]		!
___
}

# If there is more data to process, the data for the next iteration
# is pre-fetched in the last ten rounds...
sub BODY_70_79 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;
my $m=($i%8)*2;

$j=($j+16)%16;

$code.=<<___ if ($i==70);
	sll		$a,5,$tmp0			!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	ldd		[$inp+64],@X[0]
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	srl		$b,2,$b
	add		$tmp1,$e,$e
	or		$tmp2,$b,$b
	add		$Xi,$e,$e

	and		$inp,-64,$nXfer
	inc		64,$inp
	and		$nXfer,255,$nXfer
	alignaddr	%g0,$align,%g0
	add		$base,$nXfer,$nXfer
___
$code.=<<___ if ($i==71);
	sll		$a,5,$tmp0			!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	srl		$b,2,$b
	add		$tmp1,$e,$e
	or		$tmp2,$b,$b
	add		$Xi,$e,$e
___
$code.=<<___ if ($i>=72);
	faligndata	@X[$m],@X[$m+2],@X[$m]
	sll		$a,5,$tmp0			!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	fpadd32		$VK_00_19,@X[$m],%f20
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	srl		$b,2,$b
	add		$tmp1,$e,$e
	or		$tmp2,$b,$b
	add		$Xi,$e,$e
___
$code.=<<___ if ($i<77);
	ldd		[$inp+`8*($i+1-70)`],@X[2*($i+1-70)]
___
$code.=<<___ if ($i==77);	# redundant if $inp was aligned
	add		$align,63,$tmp0
	and		$tmp0,-8,$tmp0
	ldd		[$inp+$tmp0],@X[16]
___
$code.=<<___ if ($i>=72);
	std		%f20,[$nXfer+`4*$m`]
___
}

$code.=<<___;
.section	".text",#alloc,#execinstr

.align	64
vis_const:
.long	0x5a827999,0x5a827999	! K_00_19
.long	0x6ed9eba1,0x6ed9eba1	! K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc	! K_40_59
.long	0xca62c1d6,0xca62c1d6	! K_60_79
.long	0x00000100,0x00000100
.align	64
.type	vis_const,#object
.size	vis_const,(.-vis_const)

.globl	sha1_block_data_order
sha1_block_data_order:
	save	%sp,-$frame,%sp
	add	%fp,$bias-256,$base

1:	call	.+8
	add	%o7,vis_const-1b,$tmp0

	ldd	[$tmp0+0],$VK_00_19
	ldd	[$tmp0+8],$VK_20_39
	ldd	[$tmp0+16],$VK_40_59
	ldd	[$tmp0+24],$VK_60_79
	ldd	[$tmp0+32],$fmul

	ld	[$ctx+0],$Actx
	and	$base,-256,$base
	ld	[$ctx+4],$Bctx
	sub	$base,$bias+$frame,%sp
	ld	[$ctx+8],$Cctx
	and	$inp,7,$align
	ld	[$ctx+12],$Dctx
	and	$inp,-8,$inp
	ld	[$ctx+16],$Ectx

	! X[16] is maintained in FP register bank
	alignaddr	%g0,$align,%g0
	ldd		[$inp+0],@X[0]
	sub		$inp,-64,$Xfer
	ldd		[$inp+8],@X[2]
	and		$Xfer,-64,$Xfer
	ldd		[$inp+16],@X[4]
	and		$Xfer,255,$Xfer
	ldd		[$inp+24],@X[6]
	add		$base,$Xfer,$Xfer
	ldd		[$inp+32],@X[8]
	ldd		[$inp+40],@X[10]
	ldd		[$inp+48],@X[12]
	brz,pt		$align,.Laligned
	ldd		[$inp+56],@X[14]

	ldd		[$inp+64],@X[16]
	faligndata	@X[0],@X[2],@X[0]
	faligndata	@X[2],@X[4],@X[2]
	faligndata	@X[4],@X[6],@X[4]
	faligndata	@X[6],@X[8],@X[6]
	faligndata	@X[8],@X[10],@X[8]
	faligndata	@X[10],@X[12],@X[10]
	faligndata	@X[12],@X[14],@X[12]
	faligndata	@X[14],@X[16],@X[14]

.Laligned:
	mov		5,$tmp0
	dec		1,$len
	alignaddr	%g0,$tmp0,%g0
	fpadd32		$VK_00_19,@X[0],%f16
	fpadd32		$VK_00_19,@X[2],%f18
	fpadd32		$VK_00_19,@X[4],%f20
	fpadd32		$VK_00_19,@X[6],%f22
	fpadd32		$VK_00_19,@X[8],%f24
	fpadd32		$VK_00_19,@X[10],%f26
	fpadd32		$VK_00_19,@X[12],%f28
	fpadd32		$VK_00_19,@X[14],%f30
	std		%f16,[$Xfer+0]
	mov		$Actx,$A
	std		%f18,[$Xfer+8]
	mov		$Bctx,$B
	std		%f20,[$Xfer+16]
	mov		$Cctx,$C
	std		%f22,[$Xfer+24]
	mov		$Dctx,$D
	std		%f24,[$Xfer+32]
	mov		$Ectx,$E
	std		%f26,[$Xfer+40]
	fxors		@X[13],@X[0],@X[0]
	std		%f28,[$Xfer+48]
	ba		.Loop
	std		%f30,[$Xfer+56]
.align	32
.Loop:
___
for ($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for (;$i<70;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	tst		$len
	bz,pn		`$bits==32?"%icc":"%xcc"`,.Ltail
	nop
___
for (;$i<80;$i++)	{ &BODY_70_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	add		$A,$Actx,$Actx
	add		$B,$Bctx,$Bctx
	add		$C,$Cctx,$Cctx
	add		$D,$Dctx,$Dctx
	add		$E,$Ectx,$Ectx
	mov		5,$tmp0
	fxors		@X[13],@X[0],@X[0]
	mov		$Actx,$A
	mov		$Bctx,$B
	mov		$Cctx,$C
	mov		$Dctx,$D
	mov		$Ectx,$E
	alignaddr	%g0,$tmp0,%g0
	dec		1,$len
	ba		.Loop
	mov		$nXfer,$Xfer

.align	32
.Ltail:
___
for($i=70;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	add	$A,$Actx,$Actx
	add	$B,$Bctx,$Bctx
	add	$C,$Cctx,$Cctx
	add	$D,$Dctx,$Dctx
	add	$E,$Ectx,$Ectx

	st	$Actx,[$ctx+0]
	st	$Bctx,[$ctx+4]
	st	$Cctx,[$ctx+8]
	st	$Dctx,[$ctx+12]
	st	$Ectx,[$ctx+16]

	ret
	restore
.type	sha1_block_data_order,#function
.size	sha1_block_data_order,(.-sha1_block_data_order)
.asciz	"SHA1 block transform for SPARCv9a, CRYPTOGAMS by <appro\@openssl.org>"
___

# The purpose of these subroutines is to encode VIS instructions
# explicitly, so that one can compile the module without having to
# specify VIS extensions on the compiler command line, e.g.
# -xarch=v9 vs. -xarch=v9a. The idea is to preserve the option of
# producing a "universal" binary and let the programmer detect at
# run time whether the current CPU is VIS-capable.
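# [Encoding note: the .word values produced below follow the SPARC
# IMPDEP1 format used by VIS - op=2, rd in bits 29..25, op3=0x36,
# rs1 in bits 18..14, the opf code in bits 13..5 and rs2 in bits 4..0,
# hence 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2; unalignaddr uses the
# same format with alignaddr's opf (0x018) pre-folded into 0x81b00300.]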
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf =	(	"fmul8ulx16"	=> 0x037,
			"faligndata"	=> 0x048,
			"fpadd32"	=> 0x052,
			"fxor"		=> 0x06c,
			"fxors"		=> 0x06d	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";

    foreach ($rs1,$rs2,$rd) {
	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
	else			{ return $ref; }
    }
    return  sprintf ".word\t0x%08x !%s",
		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
		    $ref;
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),(%f[0-9]{1,2}),(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	   /gem;
$code =~ s/\b(alignaddr)\s+(%[goli][0-7]),(%[goli][0-7]),(%[goli][0-7])/
		&unalignaddr($1,$2,$3,$4)
	   /gem;
print $code;
close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-thumb.pl b/src/lib/libcrypto/sha/asm/sha1-thumb.pl
new file mode 100644
index 0000000000..7c9ea9b029
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha1-thumb.pl
@@ -0,0 +1,259 @@
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# sha1_block for Thumb.
#
# January 2007.
#
# The code is of no direct interest to OpenSSL because of its low
# performance. Its purpose is to establish a _size_ benchmark - a
# pretty useless one, I must say, because ARMv4 code that is 30% or
# 88 bytes larger [available on demand] is almost _twice_ as fast. It
# should also be noted that in-lining .Lcommon and .Lrotate improves
# performance by over 40%, while the code grows by only 10% or 32
# bytes. But once again, the goal was to establish a _size_ benchmark,
# not performance.

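# [Thumb-1 background for the layout below: most ALU operations are
# 2-address, there is no rotate-by-immediate (only by register), and
# the "high" registers r8-r12 are reachable pretty much only via
# mov/add/cmp. That is why the recurring "E += K + ROR(A,27) + X[i]"
# step and the B/C/D/E shuffle are factored out into the .Lcommon and
# .Lrotate subroutines (inlined when $inline is set), and why ctx, inp
# and len are parked in high registers for the duration.]
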
$output=shift;
open STDOUT,">$output";

$inline=0;
#$cheat_on_binutils=1;

$t0="r0";
$t1="r1";
$t2="r2";
$a="r3";
$b="r4";
$c="r5";
$d="r6";
$e="r7";
$K="r8";	# "upper" registers can be used in add/sub and mov insns
$ctx="r9";
$inp="r10";
$len="r11";
$Xi="r12";

sub common {
<<___;
	sub	$t0,#4
	ldr	$t1,[$t0]
	add	$e,$K			@ E+=K_xx_xx
	lsl	$t2,$a,#5
	add	$t2,$e
	lsr	$e,$a,#27
	add	$t2,$e			@ E+=ROR(A,27)
	add	$t2,$t1			@ E+=X[i]
___
}
sub rotate {
<<___;
	mov	$e,$d			@ E=D
	mov	$d,$c			@ D=C
	lsl	$c,$b,#30
	lsr	$b,$b,#2
	orr	$c,$b			@ C=ROR(B,2)
	mov	$b,$a			@ B=A
	add	$a,$t2,$t1		@ A=E+F_xx_xx(B,C,D)
___
}

sub BODY_00_19 {
$code.=$inline?&common():"\tbl .Lcommon\n";
$code.=<<___;
	mov	$t1,$c
	eor	$t1,$d
	and	$t1,$b
	eor	$t1,$d			@ F_00_19(B,C,D)
___
$code.=$inline?&rotate():"\tbl .Lrotate\n";
}

sub BODY_20_39 {
$code.=$inline?&common():"\tbl .Lcommon\n";
$code.=<<___;
	mov	$t1,$b
	eor	$t1,$c
	eor	$t1,$d			@ F_20_39(B,C,D)
___
$code.=$inline?&rotate():"\tbl .Lrotate\n";
}

sub BODY_40_59 {
$code.=$inline?&common():"\tbl .Lcommon\n";
$code.=<<___;
	mov	$t1,$b
	and	$t1,$c
	mov	$e,$b
	orr	$e,$c
	and	$e,$d
	orr	$t1,$e			@ F_40_59(B,C,D)
___
$code.=$inline?&rotate():"\tbl .Lrotate\n";
}

$code=<<___;
.text
.code	16

.global	sha1_block_data_order
.type	sha1_block_data_order,%function

.align	2
sha1_block_data_order:
___
if ($cheat_on_binutils) {
$code.=<<___;
.code	32
	add	r3,pc,#1
	bx	r3			@ switch to Thumb ISA
.code	16
___
}
$code.=<<___;
	push	{r4-r7}
	mov	r3,r8
	mov	r4,r9
	mov	r5,r10
	mov	r6,r11
	mov	r7,r12
	push	{r3-r7,lr}
	lsl	r2,#6
	mov	$ctx,r0			@ save context
	mov	$inp,r1			@ save inp
	mov	$len,r2			@ save len
	add	$len,$inp		@ $len to point at inp end

.Lloop:
	mov	$Xi,sp
	mov	$t2,sp
	sub	$t2,#16*4		@ [3]
.LXload:
	ldrb	$a,[$t1,#0]		@ $t1 is r1 and holds inp
	ldrb	$b,[$t1,#1]
	ldrb	$c,[$t1,#2]
	ldrb	$d,[$t1,#3]
	lsl	$a,#24
	lsl	$b,#16
	lsl	$c,#8
	orr	$a,$b
	orr	$a,$c
	orr	$a,$d
	add	$t1,#4
	push	{$a}
	cmp	sp,$t2
	bne	.LXload			@ [+14*16]

	mov	$inp,$t1		@ update $inp
	sub	$t2,#32*4
	sub	$t2,#32*4
	mov	$e,#31			@ [+4]
.LXupdate:
	ldr	$a,[sp,#15*4]
	ldr	$b,[sp,#13*4]
	ldr	$c,[sp,#7*4]
	ldr	$d,[sp,#2*4]
	eor	$a,$b
	eor	$a,$c
	eor	$a,$d
	ror	$a,$e
	push	{$a}
	cmp	sp,$t2
	bne	.LXupdate		@ [+(11+1)*64]

	ldmia	$t0!,{$a,$b,$c,$d,$e}	@ $t0 is r0 and holds ctx
	mov	$t0,$Xi

	ldr	$t2,.LK_00_19
	mov	$t1,$t0
	sub	$t1,#20*4
	mov	$Xi,$t1
	mov	$K,$t2			@ [+7+4]
.L_00_19:
___
	&BODY_00_19();
$code.=<<___;
	cmp	$Xi,$t0
	bne	.L_00_19		@ [+(2+9+4+2+8+2)*20]

	ldr	$t2,.LK_20_39
	mov	$t1,$t0
	sub	$t1,#20*4
	mov	$Xi,$t1
	mov	$K,$t2			@ [+5]
.L_20_39_or_60_79:
___
	&BODY_20_39();
$code.=<<___;
	cmp	$Xi,$t0
	bne	.L_20_39_or_60_79	@ [+(2+9+3+2+8+2)*20*2]
	cmp	sp,$t0
	beq	.Ldone			@ [+2]

	ldr	$t2,.LK_40_59
	mov	$t1,$t0
	sub	$t1,#20*4
	mov	$Xi,$t1
	mov	$K,$t2			@ [+5]
.L_40_59:
___
	&BODY_40_59();
$code.=<<___;
	cmp	$Xi,$t0
	bne	.L_40_59		@ [+(2+9+6+2+8+2)*20]

	ldr	$t2,.LK_60_79
	mov	$Xi,sp
	mov	$K,$t2
	b	.L_20_39_or_60_79	@ [+4]
.Ldone:
	mov	$t0,$ctx
	ldr	$t1,[$t0,#0]
	ldr	$t2,[$t0,#4]
	add	$a,$t1
	ldr	$t1,[$t0,#8]
	add	$b,$t2
	ldr	$t2,[$t0,#12]
	add	$c,$t1
	ldr	$t1,[$t0,#16]
	add	$d,$t2
	add	$e,$t1
	stmia	$t0!,{$a,$b,$c,$d,$e}	@ [+20]

	add	sp,#80*4		@ deallocate stack frame
	mov	$t0,$ctx		@ restore ctx
	mov	$t1,$inp		@ restore inp
	cmp	$t1,$len
	beq	.Lexit
	b	.Lloop			@ [+6] total 3212 cycles
.Lexit:
	pop	{r2-r7}
	mov	r8,r2
	mov	r9,r3
	mov	r10,r4
	mov	r11,r5
	mov	r12,r6
	mov	lr,r7
	pop	{r4-r7}
	bx	lr
.align	2
___
$code.=".Lcommon:\n".&common()."\tmov pc,lr\n" if (!$inline);
$code.=".Lrotate:\n".&rotate()."\tmov pc,lr\n" if (!$inline);
$code.=<<___;
.align	2
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
.size	sha1_block_data_order,.-sha1_block_data_order
.asciz	"SHA1 block transform for Thumb, CRYPTOGAMS by <appro\@openssl.org>"
___

print $code;
close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
index f7ed67a726..4edc5ea9ad 100755
--- a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
@@ -29,14 +29,18 @@
 # Xeon P4	+65%		+0%		9.9
 # Core2	+60%		+10%		7.0
 
-$output=shift;
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
 
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
-open STDOUT,"| $^X $xlate $output";
+open STDOUT,"| $^X $xlate $flavour $output";
 
 $ctx="%rdi";	# 1st arg
 $inp="%rsi";	# 2nd arg
@@ -69,13 +73,14 @@ $func:
 	push	%rbx
 	push	%rbp
 	push	%r12
-	mov	%rsp,%rax
+	mov	%rsp,%r11
 	mov	%rdi,$ctx	# reassigned argument
 	sub	\$`8+16*4`,%rsp
 	mov	%rsi,$inp	# reassigned argument
 	and	\$-64,%rsp
 	mov	%rdx,$num	# reassigned argument
-	mov	%rax,`16*4`(%rsp)
+	mov	%r11,`16*4`(%rsp)
+.Lprologue:
 
 	mov	0($ctx),$A
 	mov	4($ctx),$B
@@ -88,10 +93,12 @@ ___
 sub EPILOGUE {
 my $func=shift;
 $code.=<<___;
-	mov	`16*4`(%rsp),%rsp
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
+	mov	`16*4`(%rsp),%rsi
+	mov	(%rsi),%r12
+	mov	8(%rsi),%rbp
+	mov	16(%rsi),%rbx
+	lea	24(%rsi),%rsp
+.Lepilogue:
 	ret
 .size	$func,.-$func
 ___
@@ -233,7 +240,109 @@ ___
 &EPILOGUE("sha1_block_data_order");
 $code.=<<___;
 .asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	16
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lprologue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lprologue
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	lea	.Lepilogue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
+	jae	.Lin_prologue
+
+	mov	`16*4`(%rax),%rax	# pull saved stack pointer
+	lea	24(%rax),%rax
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+
+.Lin_prologue:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	se_handler,.-se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_sha1_block_data_order
+	.rva	.LSEH_end_sha1_block_data_order
+	.rva	.LSEH_info_sha1_block_data_order
+
+.section	.xdata
+.align	8
+.LSEH_info_sha1_block_data_order:
+	.byte	9,0,0,0
+	.rva	se_handler
 ___
+}
 
 ####################################################################
 
diff --git a/src/lib/libcrypto/sha/asm/sha256-586.pl b/src/lib/libcrypto/sha/asm/sha256-586.pl
new file mode 100644
index 0000000000..ecc8b69c75
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha256-586.pl
@@ -0,0 +1,251 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# SHA256 block transform for x86. September 2007.
11#
12# Performance in clock cycles per processed byte (less is better):
13#
14# Pentium PIII P4 AMD K8 Core2
15# gcc 46 36 41 27 26
16# icc 57 33 38 25 23
17# x86 asm 40 30 35 20 20
18# x86_64 asm(*) - - 21 15.8 16.5
19#
20# (*) x86_64 assembler performance is presented for reference
21# purposes.
22#
23# Performance improvement over compiler generated code varies from
24# 10% to 40% [see above]. Not very impressive on some µ-archs, but
25# it's 5 times smaller and optimizes the amount of writes.
26
27$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
28push(@INC,"${dir}","${dir}../../perlasm");
29require "x86asm.pl";
30
31&asm_init($ARGV[0],"sha256-586.pl",$ARGV[$#ARGV] eq "386");
32
33$A="eax";
34$E="edx";
35$T="ebx";
36$Aoff=&DWP(0,"esp");
37$Boff=&DWP(4,"esp");
38$Coff=&DWP(8,"esp");
39$Doff=&DWP(12,"esp");
40$Eoff=&DWP(16,"esp");
41$Foff=&DWP(20,"esp");
42$Goff=&DWP(24,"esp");
43$Hoff=&DWP(28,"esp");
44$Xoff=&DWP(32,"esp");
45$K256="ebp";
46
47sub BODY_00_15() {
48 my $in_16_63=shift;
49
50 &mov ("ecx",$E);
51 &add ($T,&DWP(4*(8+15+16-9),"esp")) if ($in_16_63); # T += X[-7]
52 &ror ("ecx",6);
53 &mov ("edi",$E);
54 &ror ("edi",11);
55 &mov ("esi",$Foff);
56 &xor ("ecx","edi");
57 &ror ("edi",25-11);
58 &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0]
59 &xor ("ecx","edi"); # Sigma1(e)
60 &mov ("edi",$Goff);
61 &add ($T,"ecx"); # T += Sigma1(e)
62 &mov ($Eoff,$E); # modulo-scheduled
63
64 &xor ("esi","edi");
65 &mov ("ecx",$A);
66 &and ("esi",$E);
67 &mov ($E,$Doff); # e becomes d, which is e in next iteration
68 &xor ("esi","edi"); # Ch(e,f,g)
69 &mov ("edi",$A);
70 &add ($T,"esi"); # T += Ch(e,f,g)
71
72 &ror ("ecx",2);
73 &add ($T,$Hoff); # T += h
74 &ror ("edi",13);
75 &mov ("esi",$Boff);
76 &xor ("ecx","edi");
77 &ror ("edi",22-13);
78 &add ($E,$T); # d += T
79 &xor ("ecx","edi"); # Sigma0(a)
80 &mov ("edi",$Coff);
81
82 &add ($T,"ecx"); # T += Sigma0(a)
83 &mov ($Aoff,$A); # modulo-scheduled
84
85 &mov ("ecx",$A);
86 &sub ("esp",4);
87 &or ($A,"esi"); # a becomes h, which is a in next iteration
88 &and ("ecx","esi");
89 &and ($A,"edi");
90 &mov ("esi",&DWP(0,$K256));
91 &or ($A,"ecx"); # h=Maj(a,b,c)
92
93 &add ($K256,4);
94 &add ($A,$T); # h += T
95 &mov ($T,&DWP(4*(8+15+16-1),"esp")) if ($in_16_63); # preload T
96 &add ($E,"esi"); # d += K256[i]
97 &add ($A,"esi"); # h += K256[i]
98}
99
100&function_begin("sha256_block_data_order");
101 &mov ("esi",wparam(0)); # ctx
102 &mov ("edi",wparam(1)); # inp
103 &mov ("eax",wparam(2)); # num
104 &mov ("ebx","esp"); # saved sp
105
106 &call (&label("pic_point")); # make it PIC!
107&set_label("pic_point");
108 &blindpop($K256);
109 &lea ($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));
110
111 &sub ("esp",16);
112 &and ("esp",-64);
113
114 &shl ("eax",6);
115 &add ("eax","edi");
116 &mov (&DWP(0,"esp"),"esi"); # ctx
117 &mov (&DWP(4,"esp"),"edi"); # inp
118 &mov (&DWP(8,"esp"),"eax"); # inp+num*128
119 &mov (&DWP(12,"esp"),"ebx"); # saved sp
120
121&set_label("loop",16);
122 # copy input block to stack reversing byte and dword order
123 for($i=0;$i<4;$i++) {
124 &mov ("eax",&DWP($i*16+0,"edi"));
125 &mov ("ebx",&DWP($i*16+4,"edi"));
126 &mov ("ecx",&DWP($i*16+8,"edi"));
127 &mov ("edx",&DWP($i*16+12,"edi"));
128 &bswap ("eax");
129 &bswap ("ebx");
130 &bswap ("ecx");
131 &bswap ("edx");
132 &push ("eax");
133 &push ("ebx");
134 &push ("ecx");
135 &push ("edx");
136 }
137 &add ("edi",64);
138 &sub ("esp",4*8); # place for A,B,C,D,E,F,G,H
139 &mov (&DWP(4*(8+16)+4,"esp"),"edi");
140
141 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
142 &mov ($A,&DWP(0,"esi"));
143 &mov ("ebx",&DWP(4,"esi"));
144 &mov ("ecx",&DWP(8,"esi"));
145 &mov ("edi",&DWP(12,"esi"));
146 # &mov ($Aoff,$A);
147 &mov ($Boff,"ebx");
148 &mov ($Coff,"ecx");
149 &mov ($Doff,"edi");
150 &mov ($E,&DWP(16,"esi"));
151 &mov ("ebx",&DWP(20,"esi"));
152 &mov ("ecx",&DWP(24,"esi"));
153 &mov ("edi",&DWP(28,"esi"));
154 # &mov ($Eoff,$E);
155 &mov ($Foff,"ebx");
156 &mov ($Goff,"ecx");
157 &mov ($Hoff,"edi");
158
159&set_label("00_15",16);
160 &mov ($T,&DWP(4*(8+15),"esp"));
161
162 &BODY_00_15();
163
164 &cmp ("esi",0xc19bf174);
165 &jne (&label("00_15"));
166
167 &mov ($T,&DWP(4*(8+15+16-1),"esp")); # preloaded in BODY_00_15(1)
168&set_label("16_63",16);
169 &mov ("esi",$T);
170 &mov ("ecx",&DWP(4*(8+15+16-14),"esp"));
171 &shr ($T,3);
172 &ror ("esi",7);
173 &xor ($T,"esi");
174 &ror ("esi",18-7);
175 &mov ("edi","ecx");
176 &xor ($T,"esi"); # T = sigma0(X[-15])
177
178 &shr ("ecx",10);
179 &mov ("esi",&DWP(4*(8+15+16),"esp"));
180 &ror ("edi",17);
181 &xor ("ecx","edi");
182 &ror ("edi",19-17);
183 &add ($T,"esi"); # T += X[-16]
184 &xor ("edi","ecx"); # sigma1(X[-2])
185
186 &add ($T,"edi"); # T += sigma1(X[-2])
187 # &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7], moved to BODY_00_15(1)
188 # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0]
189
190 &BODY_00_15(1);
191
192 &cmp ("esi",0xc67178f2);
193 &jne (&label("16_63"));
194
195 &mov ("esi",&DWP(4*(8+16+64)+0,"esp"));#ctx
196 # &mov ($A,$Aoff);
197 &mov ("ebx",$Boff);
198 &mov ("ecx",$Coff);
199 &mov ("edi",$Doff);
200 &add ($A,&DWP(0,"esi"));
201 &add ("ebx",&DWP(4,"esi"));
202 &add ("ecx",&DWP(8,"esi"));
203 &add ("edi",&DWP(12,"esi"));
204 &mov (&DWP(0,"esi"),$A);
205 &mov (&DWP(4,"esi"),"ebx");
206 &mov (&DWP(8,"esi"),"ecx");
207 &mov (&DWP(12,"esi"),"edi");
208 # &mov ($E,$Eoff);
209 &mov ("eax",$Foff);
210 &mov ("ebx",$Goff);
211 &mov ("ecx",$Hoff);
212 &mov ("edi",&DWP(4*(8+16+64)+4,"esp"));#inp
213 &add ($E,&DWP(16,"esi"));
214 &add ("eax",&DWP(20,"esi"));
215 &add ("ebx",&DWP(24,"esi"));
216 &add ("ecx",&DWP(28,"esi"));
217 &mov (&DWP(16,"esi"),$E);
218 &mov (&DWP(20,"esi"),"eax");
219 &mov (&DWP(24,"esi"),"ebx");
220 &mov (&DWP(28,"esi"),"ecx");
221
222 &add ("esp",4*(8+16+64)); # destroy frame
223 &sub ($K256,4*64); # rewind K
224
225 &cmp ("edi",&DWP(8,"esp")); # are we done yet?
226 &jb (&label("loop"));
227
228 &mov ("esp",&DWP(12,"esp")); # restore sp
229&function_end_A();
230
231&set_label("K256",64); # Yes! I keep it in the code segment!
232 &data_word(0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5);
233 &data_word(0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5);
234 &data_word(0xd807aa98,0x12835b01,0x243185be,0x550c7dc3);
235 &data_word(0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174);
236 &data_word(0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc);
237 &data_word(0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da);
238 &data_word(0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7);
239 &data_word(0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967);
240 &data_word(0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13);
241 &data_word(0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85);
242 &data_word(0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3);
243 &data_word(0xd192e819,0xd6990624,0xf40e3585,0x106aa070);
244 &data_word(0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5);
245 &data_word(0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3);
246 &data_word(0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208);
247 &data_word(0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2);
248&function_end_B("sha256_block_data_order");
249&asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
250
251&asm_finish();
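For reference, the register-and-stack choreography in BODY_00_15 above is the standard SHA-256 round. The plain-Perl sketch below performs the same update on ordinary variables; it folds K256[i] into T up front, whereas the assembly adds K256[i] to d and h separately at the end, with the same net result. The loop exits also work off the constants: esi holds the K256 word just consumed and is compared against 0xc19bf174 and 0xc67178f2, the 16th and 64th entries of the table.

# Plain-Perl reference for one SHA-256 round (a sketch, not part of the
# generator).  @S holds the working variables a..h as plain integers.
sub rotr32 { my ($x,$n) = @_; (($x >> $n) | ($x << (32-$n))) & 0xffffffff }

sub sha256_round {
	my ($W, $K, @S) = @_;			# $W = X[i], $K = K256[i]
	my ($a,$b,$c,$d,$e,$f,$g,$h) = @S;
	my $Sigma1 = rotr32($e,6) ^ rotr32($e,11) ^ rotr32($e,25);
	my $Ch     = (($f ^ $g) & $e) ^ $g;	# same (f^g)&e^g form as above
	my $Sigma0 = rotr32($a,2) ^ rotr32($a,13) ^ rotr32($a,22);
	my $Maj    = (($a | $b) & $c) | ($a & $b);
	my $T = ($h + $Sigma1 + $Ch + $K + $W) & 0xffffffff;
	return (($T + $Sigma0 + $Maj) & 0xffffffff,	# new a
		$a, $b, $c,
		($d + $T) & 0xffffffff,			# new e
		$e, $f, $g);
}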
diff --git a/src/lib/libcrypto/sha/asm/sha256-armv4.pl b/src/lib/libcrypto/sha/asm/sha256-armv4.pl
new file mode 100644
index 0000000000..48d846deec
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha256-armv4.pl
@@ -0,0 +1,181 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256 block procedure for ARMv4. May 2007.
11
12# Performance is ~2x better than gcc 3.4 generated code and in
13# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles
14# per byte.
15
16$output=shift;
17open STDOUT,">$output";
18
19$ctx="r0"; $t0="r0";
20$inp="r1";
21$len="r2"; $t1="r2";
22$T1="r3";
23$A="r4";
24$B="r5";
25$C="r6";
26$D="r7";
27$E="r8";
28$F="r9";
29$G="r10";
30$H="r11";
31@V=($A,$B,$C,$D,$E,$F,$G,$H);
32$t2="r12";
33$Ktbl="r14";
34
35@Sigma0=( 2,13,22);
36@Sigma1=( 6,11,25);
37@sigma0=( 7,18, 3);
38@sigma1=(17,19,10);
39
40sub BODY_00_15 {
41my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
42
43$code.=<<___ if ($i<16);
44 ldrb $T1,[$inp,#3] @ $i
45 ldrb $t2,[$inp,#2]
46 ldrb $t1,[$inp,#1]
47 ldrb $t0,[$inp],#4
48 orr $T1,$T1,$t2,lsl#8
49 orr $T1,$T1,$t1,lsl#16
50 orr $T1,$T1,$t0,lsl#24
51 `"str $inp,[sp,#17*4]" if ($i==15)`
52___
53$code.=<<___;
54 ldr $t2,[$Ktbl],#4 @ *K256++
55 str $T1,[sp,#`$i%16`*4]
56 mov $t0,$e,ror#$Sigma1[0]
57 eor $t0,$t0,$e,ror#$Sigma1[1]
58 eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
59 add $T1,$T1,$t0
60 eor $t1,$f,$g
61 and $t1,$t1,$e
62 eor $t1,$t1,$g @ Ch(e,f,g)
63 add $T1,$T1,$t1
64 add $T1,$T1,$h
65 add $T1,$T1,$t2
66 mov $h,$a,ror#$Sigma0[0]
67 eor $h,$h,$a,ror#$Sigma0[1]
68 eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
69 orr $t0,$a,$b
70 and $t0,$t0,$c
71 and $t1,$a,$b
72 orr $t0,$t0,$t1 @ Maj(a,b,c)
73 add $h,$h,$t0
74 add $d,$d,$T1
75 add $h,$h,$T1
76___
77}
78
79sub BODY_16_XX {
80my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
81
82$code.=<<___;
83 ldr $t1,[sp,#`($i+1)%16`*4] @ $i
84 ldr $t2,[sp,#`($i+14)%16`*4]
85 ldr $T1,[sp,#`($i+0)%16`*4]
86 ldr $inp,[sp,#`($i+9)%16`*4]
87 mov $t0,$t1,ror#$sigma0[0]
88 eor $t0,$t0,$t1,ror#$sigma0[1]
89 eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
90 mov $t1,$t2,ror#$sigma1[0]
91 eor $t1,$t1,$t2,ror#$sigma1[1]
92 eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
93 add $T1,$T1,$t0
94 add $T1,$T1,$t1
95 add $T1,$T1,$inp
96___
97 &BODY_00_15(@_);
98}
99
100$code=<<___;
101.text
102.code 32
103
104.type K256,%object
105.align 5
106K256:
107.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
108.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
109.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
110.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
111.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
112.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
113.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
114.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
115.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
116.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
117.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
118.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
119.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
120.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
121.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
122.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
123.size K256,.-K256
124
125.global sha256_block_data_order
126.type sha256_block_data_order,%function
127sha256_block_data_order:
128 sub r3,pc,#8 @ sha256_block_data_order
129 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
130 stmdb sp!,{$ctx,$inp,$len,r4-r12,lr}
131 ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
132 sub $Ktbl,r3,#256 @ K256
133 sub sp,sp,#16*4 @ alloca(X[16])
134.Loop:
135___
136for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
137$code.=".Lrounds_16_xx:\n";
138for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
139$code.=<<___;
140 and $t2,$t2,#0xff
141 cmp $t2,#0xf2
142 bne .Lrounds_16_xx
143
144 ldr $T1,[sp,#16*4] @ pull ctx
145 ldr $t0,[$T1,#0]
146 ldr $t1,[$T1,#4]
147 ldr $t2,[$T1,#8]
148 add $A,$A,$t0
149 ldr $t0,[$T1,#12]
150 add $B,$B,$t1
151 ldr $t1,[$T1,#16]
152 add $C,$C,$t2
153 ldr $t2,[$T1,#20]
154 add $D,$D,$t0
155 ldr $t0,[$T1,#24]
156 add $E,$E,$t1
157 ldr $t1,[$T1,#28]
158 add $F,$F,$t2
159 ldr $inp,[sp,#17*4] @ pull inp
160 ldr $t2,[sp,#18*4] @ pull inp+len
161 add $G,$G,$t0
162 add $H,$H,$t1
163 stmia $T1,{$A,$B,$C,$D,$E,$F,$G,$H}
164 cmp $inp,$t2
165 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
166 bne .Loop
167
168 add sp,sp,#`16+3`*4 @ destroy frame
169 ldmia sp!,{r4-r12,lr}
170 tst lr,#1
171 moveq pc,lr @ be binary compatible with V4, yet
172 bx lr @ interoperable with Thumb ISA:-)
173.size sha256_block_data_order,.-sha256_block_data_order
174.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
175.align 2
176___
177
178$code =~ s/\`([^\`]*)\`/eval $1/gem;
179$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
180print $code;
181close STDOUT; # enforce flush
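BODY_16_XX above extends the message schedule in place and then falls through to BODY_00_15. A plain-Perl model of the update it performs on the 16-word circular window kept on the stack (sigma0 = ROTR 7 ^ ROTR 18 ^ SHR 3 and sigma1 = ROTR 17 ^ ROTR 19 ^ SHR 10, matching the @sigma0/@sigma1 tables):

# Sketch of the schedule update, not part of the generator:
#   X[i%16] += sigma0(X[(i+1)%16]) + sigma1(X[(i+14)%16]) + X[(i+9)%16]
sub rotr32 { my ($x,$n) = @_; (($x >> $n) | ($x << (32-$n))) & 0xffffffff }

sub schedule_update {
	my ($X, $i) = @_;			# $X: ref to the 16-word window
	my $x1  = $X->[($i+1)  % 16];
	my $x14 = $X->[($i+14) % 16];
	my $s0 = rotr32($x1,7)   ^ rotr32($x1,18)  ^ ($x1  >> 3);
	my $s1 = rotr32($x14,17) ^ rotr32($x14,19) ^ ($x14 >> 10);
	$X->[$i % 16] = ($X->[$i % 16] + $s0 + $s1 + $X->[($i+9) % 16])
			& 0xffffffff;
}

Note the exit test after the unrolled rounds: instead of keeping a counter, the code compares the low byte of the K256 word just consumed against 0xf2, the low byte of the final constant 0xc67178f2.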
diff --git a/src/lib/libcrypto/sha/asm/sha512-586.pl b/src/lib/libcrypto/sha/asm/sha512-586.pl
new file mode 100644
index 0000000000..5b9f3337ad
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha512-586.pl
@@ -0,0 +1,644 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# SHA512 block transform for x86. September 2007.
11#
12# Performance in clock cycles per processed byte (less is better):
13#
14# Pentium PIII P4 AMD K8 Core2
15# gcc 100 75 116 54 66
16# icc 97 77 95 55 57
17# x86 asm 61 56 82 36 40
18# SSE2 asm - - 38 24 20
19# x86_64 asm(*) - - 30 10.0 10.5
20#
21# (*) x86_64 assembler performance is presented for reference
22# purposes.
23#
24# IALU code-path is optimized for elder Pentiums. On vanilla Pentium
25# performance improvement over compiler generated code reaches ~60%,
26# while on PIII - ~35%. On newer µ-archs improvement varies from 15%
27# to 50%, but it's less important as they are expected to execute SSE2
28# code-path, which is commonly ~2-3x faster [than compiler generated
29# code]. SSE2 code-path is as fast as original sha512-sse2.pl, even
30# though it does not use 128-bit operations. The latter means that
31# SSE2-aware kernel is no longer required to execute the code. Another
32# difference is that new code optimizes amount of writes, but at the
33# cost of increased data cache "footprint" by 1/2KB.
34
35$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
36push(@INC,"${dir}","${dir}../../perlasm");
37require "x86asm.pl";
38
39&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
40
41$sse2=0;
42for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
43
44&external_label("OPENSSL_ia32cap_P") if ($sse2);
45
46$Tlo=&DWP(0,"esp"); $Thi=&DWP(4,"esp");
47$Alo=&DWP(8,"esp"); $Ahi=&DWP(8+4,"esp");
48$Blo=&DWP(16,"esp"); $Bhi=&DWP(16+4,"esp");
49$Clo=&DWP(24,"esp"); $Chi=&DWP(24+4,"esp");
50$Dlo=&DWP(32,"esp"); $Dhi=&DWP(32+4,"esp");
51$Elo=&DWP(40,"esp"); $Ehi=&DWP(40+4,"esp");
52$Flo=&DWP(48,"esp"); $Fhi=&DWP(48+4,"esp");
53$Glo=&DWP(56,"esp"); $Ghi=&DWP(56+4,"esp");
54$Hlo=&DWP(64,"esp"); $Hhi=&DWP(64+4,"esp");
55$K512="ebp";
56
57$Asse2=&QWP(0,"esp");
58$Bsse2=&QWP(8,"esp");
59$Csse2=&QWP(16,"esp");
60$Dsse2=&QWP(24,"esp");
61$Esse2=&QWP(32,"esp");
62$Fsse2=&QWP(40,"esp");
63$Gsse2=&QWP(48,"esp");
64$Hsse2=&QWP(56,"esp");
65
66$A="mm0"; # B-D and
67$E="mm4"; # F-H are commonly loaded to respectively mm1-mm3 and
68 # mm5-mm7, but it's done on an on-demand basis...
69
70sub BODY_00_15_sse2 {
71 my $prefetch=shift;
72
73 &movq ("mm5",$Fsse2); # load f
74 &movq ("mm6",$Gsse2); # load g
75 &movq ("mm7",$Hsse2); # load h
76
77 &movq ("mm1",$E); # %mm1 is sliding right
78 &movq ("mm2",$E); # %mm2 is sliding left
79 &psrlq ("mm1",14);
80 &movq ($Esse2,$E); # modulo-scheduled save e
81 &psllq ("mm2",23);
82 &movq ("mm3","mm1"); # %mm3 is T1
83 &psrlq ("mm1",4);
84 &pxor ("mm3","mm2");
85 &psllq ("mm2",23);
86 &pxor ("mm3","mm1");
87 &psrlq ("mm1",23);
88 &pxor ("mm3","mm2");
89 &psllq ("mm2",4);
90 &pxor ("mm3","mm1");
91 &paddq ("mm7",QWP(0,$K512)); # h+=K512[i]
92 &pxor ("mm3","mm2"); # T1=Sigma1_512(e)
93
94 &pxor ("mm5","mm6"); # f^=g
95 &movq ("mm1",$Bsse2); # load b
96 &pand ("mm5",$E); # f&=e
97 &movq ("mm2",$Csse2); # load c
98 &pxor ("mm5","mm6"); # f^=g
99 &movq ($E,$Dsse2); # e = load d
100 &paddq ("mm3","mm5"); # T1+=Ch(e,f,g)
101 &movq (&QWP(0,"esp"),$A); # modulo-scheduled save a
102 &paddq ("mm3","mm7"); # T1+=h
103
104 &movq ("mm5",$A); # %mm5 is sliding right
105 &movq ("mm6",$A); # %mm6 is sliding left
106 &paddq ("mm3",&QWP(8*9,"esp")); # T1+=X[0]
107 &psrlq ("mm5",28);
108 &paddq ($E,"mm3"); # e += T1
109 &psllq ("mm6",25);
110 &movq ("mm7","mm5"); # %mm7 is T2
111 &psrlq ("mm5",6);
112 &pxor ("mm7","mm6");
113 &psllq ("mm6",5);
114 &pxor ("mm7","mm5");
115 &psrlq ("mm5",5);
116 &pxor ("mm7","mm6");
117 &psllq ("mm6",6);
118 &pxor ("mm7","mm5");
119 &sub ("esp",8);
120 &pxor ("mm7","mm6"); # T2=Sigma0_512(a)
121
122 &movq ("mm5",$A); # %mm5=a
123 &por ($A,"mm2"); # a=a|c
124 &movq ("mm6",&QWP(8*(9+16-14),"esp")) if ($prefetch);
125 &pand ("mm5","mm2"); # %mm5=a&c
126 &pand ($A,"mm1"); # a=(a|c)&b
127 &movq ("mm2",&QWP(8*(9+16-1),"esp")) if ($prefetch);
128 &por ("mm5",$A); # %mm5=(a&c)|((a|c)&b)
129 &paddq ("mm7","mm5"); # T2+=Maj(a,b,c)
130 &movq ($A,"mm3"); # a=T1
131
132 &mov (&LB("edx"),&BP(0,$K512));
133 &paddq ($A,"mm7"); # a+=T2
134 &add ($K512,8);
135}
136
137sub BODY_00_15_x86 {
138 #define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
139 # LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
140 # HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
141 &mov ("ecx",$Elo);
142 &mov ("edx",$Ehi);
143 &mov ("esi","ecx");
144
145 &shr ("ecx",9); # lo>>9
146 &mov ("edi","edx");
147 &shr ("edx",9); # hi>>9
148 &mov ("ebx","ecx");
149 &shl ("esi",14); # lo<<14
150 &mov ("eax","edx");
151 &shl ("edi",14); # hi<<14
152 &xor ("ebx","esi");
153
154 &shr ("ecx",14-9); # lo>>14
155 &xor ("eax","edi");
156 &shr ("edx",14-9); # hi>>14
157 &xor ("eax","ecx");
158 &shl ("esi",18-14); # lo<<18
159 &xor ("ebx","edx");
160 &shl ("edi",18-14); # hi<<18
161 &xor ("ebx","esi");
162
163 &shr ("ecx",18-14); # lo>>18
164 &xor ("eax","edi");
165 &shr ("edx",18-14); # hi>>18
166 &xor ("eax","ecx");
167 &shl ("esi",23-18); # lo<<23
168 &xor ("ebx","edx");
169 &shl ("edi",23-18); # hi<<23
170 &xor ("eax","esi");
171 &xor ("ebx","edi"); # T1 = Sigma1(e)
172
173 &mov ("ecx",$Flo);
174 &mov ("edx",$Fhi);
175 &mov ("esi",$Glo);
176 &mov ("edi",$Ghi);
177 &add ("eax",$Hlo);
178 &adc ("ebx",$Hhi); # T1 += h
179 &xor ("ecx","esi");
180 &xor ("edx","edi");
181 &and ("ecx",$Elo);
182 &and ("edx",$Ehi);
183 &add ("eax",&DWP(8*(9+15)+0,"esp"));
184 &adc ("ebx",&DWP(8*(9+15)+4,"esp")); # T1 += X[0]
185 &xor ("ecx","esi");
186 &xor ("edx","edi"); # Ch(e,f,g) = (f^g)&e)^g
187
188 &mov ("esi",&DWP(0,$K512));
189 &mov ("edi",&DWP(4,$K512)); # K[i]
190 &add ("eax","ecx");
191 &adc ("ebx","edx"); # T1 += Ch(e,f,g)
192 &mov ("ecx",$Dlo);
193 &mov ("edx",$Dhi);
194 &add ("eax","esi");
195 &adc ("ebx","edi"); # T1 += K[i]
196 &mov ($Tlo,"eax");
197 &mov ($Thi,"ebx"); # put T1 away
198 &add ("eax","ecx");
199 &adc ("ebx","edx"); # d += T1
200
201 #define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
202 # LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
203 # HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
204 &mov ("ecx",$Alo);
205 &mov ("edx",$Ahi);
206 &mov ($Dlo,"eax");
207 &mov ($Dhi,"ebx");
208 &mov ("esi","ecx");
209
210 &shr ("ecx",2); # lo>>2
211 &mov ("edi","edx");
212 &shr ("edx",2); # hi>>2
213 &mov ("ebx","ecx");
214 &shl ("esi",4); # lo<<4
215 &mov ("eax","edx");
216 &shl ("edi",4); # hi<<4
217 &xor ("ebx","esi");
218
219 &shr ("ecx",7-2); # lo>>7
220 &xor ("eax","edi");
221 &shr ("edx",7-2); # hi>>7
222 &xor ("ebx","ecx");
223 &shl ("esi",25-4); # lo<<25
224 &xor ("eax","edx");
225 &shl ("edi",25-4); # hi<<25
226 &xor ("eax","esi");
227
228 &shr ("ecx",28-7); # lo>>28
229 &xor ("ebx","edi");
230 &shr ("edx",28-7); # hi>>28
231 &xor ("eax","ecx");
232 &shl ("esi",30-25); # lo<<30
233 &xor ("ebx","edx");
234 &shl ("edi",30-25); # hi<<30
235 &xor ("eax","esi");
236 &xor ("ebx","edi"); # Sigma0(a)
237
238 &mov ("ecx",$Alo);
239 &mov ("edx",$Ahi);
240 &mov ("esi",$Blo);
241 &mov ("edi",$Bhi);
242 &add ("eax",$Tlo);
243 &adc ("ebx",$Thi); # T1 = Sigma0(a)+T1
244 &or ("ecx","esi");
245 &or ("edx","edi");
246 &and ("ecx",$Clo);
247 &and ("edx",$Chi);
248 &and ("esi",$Alo);
249 &and ("edi",$Ahi);
250 &or ("ecx","esi");
251 &or ("edx","edi"); # Maj(a,b,c) = ((a|b)&c)|(a&b)
252
253 &add ("eax","ecx");
254 &adc ("ebx","edx"); # T1 += Maj(a,b,c)
255 &mov ($Tlo,"eax");
256 &mov ($Thi,"ebx");
257
258 &mov (&LB("edx"),&BP(0,$K512)); # pre-fetch LSB of *K
259 &sub ("esp",8);
260 &lea ($K512,&DWP(8,$K512)); # K++
261}
262
263
264&function_begin("sha512_block_data_order");
265 &mov ("esi",wparam(0)); # ctx
266 &mov ("edi",wparam(1)); # inp
267 &mov ("eax",wparam(2)); # num
268 &mov ("ebx","esp"); # saved sp
269
270 &call (&label("pic_point")); # make it PIC!
271&set_label("pic_point");
272 &blindpop($K512);
273 &lea ($K512,&DWP(&label("K512")."-".&label("pic_point"),$K512));
274
275 &sub ("esp",16);
276 &and ("esp",-64);
277
278 &shl ("eax",7);
279 &add ("eax","edi");
280 &mov (&DWP(0,"esp"),"esi"); # ctx
281 &mov (&DWP(4,"esp"),"edi"); # inp
282 &mov (&DWP(8,"esp"),"eax"); # inp+num*128
283 &mov (&DWP(12,"esp"),"ebx"); # saved sp
284
285if ($sse2) {
286 &picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("K512"));
287 &bt (&DWP(0,"edx"),26);
288 &jnc (&label("loop_x86"));
289
290 # load ctx->h[0-7]
291 &movq ($A,&QWP(0,"esi"));
292 &movq ("mm1",&QWP(8,"esi"));
293 &movq ("mm2",&QWP(16,"esi"));
294 &movq ("mm3",&QWP(24,"esi"));
295 &movq ($E,&QWP(32,"esi"));
296 &movq ("mm5",&QWP(40,"esi"));
297 &movq ("mm6",&QWP(48,"esi"));
298 &movq ("mm7",&QWP(56,"esi"));
299 &sub ("esp",8*10);
300
301&set_label("loop_sse2",16);
302 # &movq ($Asse2,$A);
303 &movq ($Bsse2,"mm1");
304 &movq ($Csse2,"mm2");
305 &movq ($Dsse2,"mm3");
306 # &movq ($Esse2,$E);
307 &movq ($Fsse2,"mm5");
308 &movq ($Gsse2,"mm6");
309 &movq ($Hsse2,"mm7");
310
311 &mov ("ecx",&DWP(0,"edi"));
312 &mov ("edx",&DWP(4,"edi"));
313 &add ("edi",8);
314 &bswap ("ecx");
315 &bswap ("edx");
316 &mov (&DWP(8*9+4,"esp"),"ecx");
317 &mov (&DWP(8*9+0,"esp"),"edx");
318
319&set_label("00_14_sse2",16);
320 &mov ("eax",&DWP(0,"edi"));
321 &mov ("ebx",&DWP(4,"edi"));
322 &add ("edi",8);
323 &bswap ("eax");
324 &bswap ("ebx");
325 &mov (&DWP(8*8+4,"esp"),"eax");
326 &mov (&DWP(8*8+0,"esp"),"ebx");
327
328 &BODY_00_15_sse2();
329
330 &cmp (&LB("edx"),0x35);
331 &jne (&label("00_14_sse2"));
332
333 &BODY_00_15_sse2(1);
334
335&set_label("16_79_sse2",16);
336 #&movq ("mm2",&QWP(8*(9+16-1),"esp")); #prefetched in BODY_00_15
337 #&movq ("mm6",&QWP(8*(9+16-14),"esp"));
338 &movq ("mm1","mm2");
339
340 &psrlq ("mm2",1);
341 &movq ("mm7","mm6");
342 &psrlq ("mm6",6);
343 &movq ("mm3","mm2");
344
345 &psrlq ("mm2",7-1);
346 &movq ("mm5","mm6");
347 &psrlq ("mm6",19-6);
348 &pxor ("mm3","mm2");
349
350 &psrlq ("mm2",8-7);
351 &pxor ("mm5","mm6");
352 &psrlq ("mm6",61-19);
353 &pxor ("mm3","mm2");
354
355 &movq ("mm2",&QWP(8*(9+16),"esp"));
356
357 &psllq ("mm1",56);
358 &pxor ("mm5","mm6");
359 &psllq ("mm7",3);
360 &pxor ("mm3","mm1");
361
362 &paddq ("mm2",&QWP(8*(9+16-9),"esp"));
363
364 &psllq ("mm1",63-56);
365 &pxor ("mm5","mm7");
366 &psllq ("mm7",45-3);
367 &pxor ("mm3","mm1");
368 &pxor ("mm5","mm7");
369
370 &paddq ("mm3","mm5");
371 &paddq ("mm3","mm2");
372 &movq (&QWP(8*9,"esp"),"mm3");
373
374 &BODY_00_15_sse2(1);
375
376 &cmp (&LB("edx"),0x17);
377 &jne (&label("16_79_sse2"));
378
379 # &movq ($A,$Asse2);
380 &movq ("mm1",$Bsse2);
381 &movq ("mm2",$Csse2);
382 &movq ("mm3",$Dsse2);
383 # &movq ($E,$Esse2);
384 &movq ("mm5",$Fsse2);
385 &movq ("mm6",$Gsse2);
386 &movq ("mm7",$Hsse2);
387
388 &paddq ($A,&QWP(0,"esi"));
389 &paddq ("mm1",&QWP(8,"esi"));
390 &paddq ("mm2",&QWP(16,"esi"));
391 &paddq ("mm3",&QWP(24,"esi"));
392 &paddq ($E,&QWP(32,"esi"));
393 &paddq ("mm5",&QWP(40,"esi"));
394 &paddq ("mm6",&QWP(48,"esi"));
395 &paddq ("mm7",&QWP(56,"esi"));
396
397 &movq (&QWP(0,"esi"),$A);
398 &movq (&QWP(8,"esi"),"mm1");
399 &movq (&QWP(16,"esi"),"mm2");
400 &movq (&QWP(24,"esi"),"mm3");
401 &movq (&QWP(32,"esi"),$E);
402 &movq (&QWP(40,"esi"),"mm5");
403 &movq (&QWP(48,"esi"),"mm6");
404 &movq (&QWP(56,"esi"),"mm7");
405
406 &add ("esp",8*80); # destroy frame
407 &sub ($K512,8*80); # rewind K
408
409 &cmp ("edi",&DWP(8*10+8,"esp")); # are we done yet?
410 &jb (&label("loop_sse2"));
411
412 &emms ();
413 &mov ("esp",&DWP(8*10+12,"esp")); # restore sp
414&function_end_A();
415}
416&set_label("loop_x86",16);
417 # copy input block to stack reversing byte and qword order
418 for ($i=0;$i<8;$i++) {
419 &mov ("eax",&DWP($i*16+0,"edi"));
420 &mov ("ebx",&DWP($i*16+4,"edi"));
421 &mov ("ecx",&DWP($i*16+8,"edi"));
422 &mov ("edx",&DWP($i*16+12,"edi"));
423 &bswap ("eax");
424 &bswap ("ebx");
425 &bswap ("ecx");
426 &bswap ("edx");
427 &push ("eax");
428 &push ("ebx");
429 &push ("ecx");
430 &push ("edx");
431 }
432 &add ("edi",128);
433 &sub ("esp",9*8); # place for T,A,B,C,D,E,F,G,H
434 &mov (&DWP(8*(9+16)+4,"esp"),"edi");
435
436 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
437 &lea ("edi",&DWP(8,"esp"));
438 &mov ("ecx",16);
439 &data_word(0xA5F3F689); # rep movsd
440
441&set_label("00_15_x86",16);
442 &BODY_00_15_x86();
443
444 &cmp (&LB("edx"),0x94);
445 &jne (&label("00_15_x86"));
446
447&set_label("16_79_x86",16);
448 #define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
449 # LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
450 # HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
451 &mov ("ecx",&DWP(8*(9+15+16-1)+0,"esp"));
452 &mov ("edx",&DWP(8*(9+15+16-1)+4,"esp"));
453 &mov ("esi","ecx");
454
455 &shr ("ecx",1); # lo>>1
456 &mov ("edi","edx");
457 &shr ("edx",1); # hi>>1
458 &mov ("eax","ecx");
459 &shl ("esi",24); # lo<<24
460 &mov ("ebx","edx");
461 &shl ("edi",24); # hi<<24
462 &xor ("ebx","esi");
463
464 &shr ("ecx",7-1); # lo>>7
465 &xor ("eax","edi");
466 &shr ("edx",7-1); # hi>>7
467 &xor ("eax","ecx");
468 &shl ("esi",31-24); # lo<<31
469 &xor ("ebx","edx");
470 &shl ("edi",25-24); # hi<<25
471 &xor ("ebx","esi");
472
473 &shr ("ecx",8-7); # lo>>8
474 &xor ("eax","edi");
475 &shr ("edx",8-7); # hi>>8
476 &xor ("eax","ecx");
477 &shl ("edi",31-25); # hi<<31
478 &xor ("ebx","edx");
479 &xor ("eax","edi"); # T1 = sigma0(X[-15])
480
481 &mov (&DWP(0,"esp"),"eax");
482 &mov (&DWP(4,"esp"),"ebx"); # put T1 away
483
484 #define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
485 # LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
486 # HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
487 &mov ("ecx",&DWP(8*(9+15+16-14)+0,"esp"));
488 &mov ("edx",&DWP(8*(9+15+16-14)+4,"esp"));
489 &mov ("esi","ecx");
490
491 &shr ("ecx",6); # lo>>6
492 &mov ("edi","edx");
493 &shr ("edx",6); # hi>>6
494 &mov ("eax","ecx");
495 &shl ("esi",3); # lo<<3
496 &mov ("ebx","edx");
497 &shl ("edi",3); # hi<<3
498 &xor ("eax","esi");
499
500 &shr ("ecx",19-6); # lo>>19
501 &xor ("ebx","edi");
502 &shr ("edx",19-6); # hi>>19
503 &xor ("eax","ecx");
504 &shl ("esi",13-3); # lo<<13
505 &xor ("ebx","edx");
506 &shl ("edi",13-3); # hi<<13
507 &xor ("ebx","esi");
508
509 &shr ("ecx",29-19); # lo>>29
510 &xor ("eax","edi");
511 &shr ("edx",29-19); # hi>>29
512 &xor ("ebx","ecx");
513 &shl ("edi",26-13); # hi<<26
514 &xor ("eax","edx");
515 &xor ("eax","edi"); # sigma1(X[-2])
516
517 &mov ("ecx",&DWP(8*(9+15+16)+0,"esp"));
518 &mov ("edx",&DWP(8*(9+15+16)+4,"esp"));
519 &add ("eax",&DWP(0,"esp"));
520 &adc ("ebx",&DWP(4,"esp")); # T1 = sigma1(X[-2])+T1
521 &mov ("esi",&DWP(8*(9+15+16-9)+0,"esp"));
522 &mov ("edi",&DWP(8*(9+15+16-9)+4,"esp"));
523 &add ("eax","ecx");
524 &adc ("ebx","edx"); # T1 += X[-16]
525 &add ("eax","esi");
526 &adc ("ebx","edi"); # T1 += X[-7]
527 &mov (&DWP(8*(9+15)+0,"esp"),"eax");
528 &mov (&DWP(8*(9+15)+4,"esp"),"ebx"); # save X[0]
529
530 &BODY_00_15_x86();
531
532 &cmp (&LB("edx"),0x17);
533 &jne (&label("16_79_x86"));
534
535 &mov ("esi",&DWP(8*(9+16+80)+0,"esp"));# ctx
536 &mov ("edi",&DWP(8*(9+16+80)+4,"esp"));# inp
537 for($i=0;$i<4;$i++) {
538 &mov ("eax",&DWP($i*16+0,"esi"));
539 &mov ("ebx",&DWP($i*16+4,"esi"));
540 &mov ("ecx",&DWP($i*16+8,"esi"));
541 &mov ("edx",&DWP($i*16+12,"esi"));
542 &add ("eax",&DWP(8+($i*16)+0,"esp"));
543 &adc ("ebx",&DWP(8+($i*16)+4,"esp"));
544 &mov (&DWP($i*16+0,"esi"),"eax");
545 &mov (&DWP($i*16+4,"esi"),"ebx");
546 &add ("ecx",&DWP(8+($i*16)+8,"esp"));
547 &adc ("edx",&DWP(8+($i*16)+12,"esp"));
548 &mov (&DWP($i*16+8,"esi"),"ecx");
549 &mov (&DWP($i*16+12,"esi"),"edx");
550 }
551 &add ("esp",8*(9+16+80)); # destroy frame
552 &sub ($K512,8*80); # rewind K
553
554 &cmp ("edi",&DWP(8,"esp")); # are we done yet?
555 &jb (&label("loop_x86"));
556
557 &mov ("esp",&DWP(12,"esp")); # restore sp
558&function_end_A();
559
560&set_label("K512",64); # Yes! I keep it in the code segment!
561 &data_word(0xd728ae22,0x428a2f98); # u64
562 &data_word(0x23ef65cd,0x71374491); # u64
563 &data_word(0xec4d3b2f,0xb5c0fbcf); # u64
564 &data_word(0x8189dbbc,0xe9b5dba5); # u64
565 &data_word(0xf348b538,0x3956c25b); # u64
566 &data_word(0xb605d019,0x59f111f1); # u64
567 &data_word(0xaf194f9b,0x923f82a4); # u64
568 &data_word(0xda6d8118,0xab1c5ed5); # u64
569 &data_word(0xa3030242,0xd807aa98); # u64
570 &data_word(0x45706fbe,0x12835b01); # u64
571 &data_word(0x4ee4b28c,0x243185be); # u64
572 &data_word(0xd5ffb4e2,0x550c7dc3); # u64
573 &data_word(0xf27b896f,0x72be5d74); # u64
574 &data_word(0x3b1696b1,0x80deb1fe); # u64
575 &data_word(0x25c71235,0x9bdc06a7); # u64
576 &data_word(0xcf692694,0xc19bf174); # u64
577 &data_word(0x9ef14ad2,0xe49b69c1); # u64
578 &data_word(0x384f25e3,0xefbe4786); # u64
579 &data_word(0x8b8cd5b5,0x0fc19dc6); # u64
580 &data_word(0x77ac9c65,0x240ca1cc); # u64
581 &data_word(0x592b0275,0x2de92c6f); # u64
582 &data_word(0x6ea6e483,0x4a7484aa); # u64
583 &data_word(0xbd41fbd4,0x5cb0a9dc); # u64
584 &data_word(0x831153b5,0x76f988da); # u64
585 &data_word(0xee66dfab,0x983e5152); # u64
586 &data_word(0x2db43210,0xa831c66d); # u64
587 &data_word(0x98fb213f,0xb00327c8); # u64
588 &data_word(0xbeef0ee4,0xbf597fc7); # u64
589 &data_word(0x3da88fc2,0xc6e00bf3); # u64
590 &data_word(0x930aa725,0xd5a79147); # u64
591 &data_word(0xe003826f,0x06ca6351); # u64
592 &data_word(0x0a0e6e70,0x14292967); # u64
593 &data_word(0x46d22ffc,0x27b70a85); # u64
594 &data_word(0x5c26c926,0x2e1b2138); # u64
595 &data_word(0x5ac42aed,0x4d2c6dfc); # u64
596 &data_word(0x9d95b3df,0x53380d13); # u64
597 &data_word(0x8baf63de,0x650a7354); # u64
598 &data_word(0x3c77b2a8,0x766a0abb); # u64
599 &data_word(0x47edaee6,0x81c2c92e); # u64
600 &data_word(0x1482353b,0x92722c85); # u64
601 &data_word(0x4cf10364,0xa2bfe8a1); # u64
602 &data_word(0xbc423001,0xa81a664b); # u64
603 &data_word(0xd0f89791,0xc24b8b70); # u64
604 &data_word(0x0654be30,0xc76c51a3); # u64
605 &data_word(0xd6ef5218,0xd192e819); # u64
606 &data_word(0x5565a910,0xd6990624); # u64
607 &data_word(0x5771202a,0xf40e3585); # u64
608 &data_word(0x32bbd1b8,0x106aa070); # u64
609 &data_word(0xb8d2d0c8,0x19a4c116); # u64
610 &data_word(0x5141ab53,0x1e376c08); # u64
611 &data_word(0xdf8eeb99,0x2748774c); # u64
612 &data_word(0xe19b48a8,0x34b0bcb5); # u64
613 &data_word(0xc5c95a63,0x391c0cb3); # u64
614 &data_word(0xe3418acb,0x4ed8aa4a); # u64
615 &data_word(0x7763e373,0x5b9cca4f); # u64
616 &data_word(0xd6b2b8a3,0x682e6ff3); # u64
617 &data_word(0x5defb2fc,0x748f82ee); # u64
618 &data_word(0x43172f60,0x78a5636f); # u64
619 &data_word(0xa1f0ab72,0x84c87814); # u64
620 &data_word(0x1a6439ec,0x8cc70208); # u64
621 &data_word(0x23631e28,0x90befffa); # u64
622 &data_word(0xde82bde9,0xa4506ceb); # u64
623 &data_word(0xb2c67915,0xbef9a3f7); # u64
624 &data_word(0xe372532b,0xc67178f2); # u64
625 &data_word(0xea26619c,0xca273ece); # u64
626 &data_word(0x21c0c207,0xd186b8c7); # u64
627 &data_word(0xcde0eb1e,0xeada7dd6); # u64
628 &data_word(0xee6ed178,0xf57d4f7f); # u64
629 &data_word(0x72176fba,0x06f067aa); # u64
630 &data_word(0xa2c898a6,0x0a637dc5); # u64
631 &data_word(0xbef90dae,0x113f9804); # u64
632 &data_word(0x131c471b,0x1b710b35); # u64
633 &data_word(0x23047d84,0x28db77f5); # u64
634 &data_word(0x40c72493,0x32caab7b); # u64
635 &data_word(0x15c9bebc,0x3c9ebe0a); # u64
636 &data_word(0x9c100d4c,0x431d67c4); # u64
637 &data_word(0xcb3e42b6,0x4cc5d4be); # u64
638 &data_word(0xfc657e2a,0x597f299c); # u64
639 &data_word(0x3ad6faec,0x5fcb6fab); # u64
640 &data_word(0x4a475817,0x6c44198c); # u64
641&function_end_B("sha512_block_data_order");
642&asciz("SHA512 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
643
644&asm_finish();
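Every Sigma and sigma in the code paths above synthesizes a 64-bit rotate from 32-bit shifts of the two halves, exactly as the LO/HI comment pairs spell out. A minimal plain-Perl sketch of the decomposition, assuming the 64-bit value is kept as separate 32-bit lo/hi words as in the code (and a 64-bit perl, so the intermediate shifts are well-defined):

# Sketch only: ROTR64 by n (0 < n < 64) on a value split into halves.
sub rotr64_split {
	my ($lo, $hi, $n) = @_;
	if ($n < 32) {
		return ((($lo >> $n) | ($hi << (32-$n))) & 0xffffffff,
			(($hi >> $n) | ($lo << (32-$n))) & 0xffffffff);
	}
	# n >= 32: rotate by n-32 with the halves swapped
	return rotr64_split($hi, $lo, $n-32);
}

Rotations by 32 or more reduce to a smaller rotation with the halves swapped, which is why ROTR 41 appears in the Sigma1 comments as hi>>9^lo<<23 on the LO line and lo>>9^hi<<23 on the HI line.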
diff --git a/src/lib/libcrypto/sha/asm/sha512-armv4.pl b/src/lib/libcrypto/sha/asm/sha512-armv4.pl
new file mode 100644
index 0000000000..4fbb94a914
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha512-armv4.pl
@@ -0,0 +1,399 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA512 block procedure for ARMv4. September 2007.
11
12# This code is ~4.5 (four and a half) times faster than code generated
13# by gcc 3.4 and it spends ~72 clock cycles per byte.
14
15# Byte order [in]dependence. =========================================
16#
17# Caller is expected to maintain specific *dword* order in h[0-7],
18# namely with most significant dword at *lower* address, which is
19# reflected in the two parameters below. *Byte* order within these dwords
20# in turn is whatever the *native* byte order is on the current platform.
21$hi=0;
22$lo=4;
23# ====================================================================
24
25$output=shift;
26open STDOUT,">$output";
27
28$ctx="r0";
29$inp="r1";
30$len="r2";
31$Tlo="r3";
32$Thi="r4";
33$Alo="r5";
34$Ahi="r6";
35$Elo="r7";
36$Ehi="r8";
37$t0="r9";
38$t1="r10";
39$t2="r11";
40$t3="r12";
41############ r13 is stack pointer
42$Ktbl="r14";
43############ r15 is program counter
44
45$Aoff=8*0;
46$Boff=8*1;
47$Coff=8*2;
48$Doff=8*3;
49$Eoff=8*4;
50$Foff=8*5;
51$Goff=8*6;
52$Hoff=8*7;
53$Xoff=8*8;
54
55sub BODY_00_15() {
56my $magic = shift;
57$code.=<<___;
58 ldr $t2,[sp,#$Hoff+0] @ h.lo
59 ldr $t3,[sp,#$Hoff+4] @ h.hi
60 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
61 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
62 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
63 mov $t0,$Elo,lsr#14
64 mov $t1,$Ehi,lsr#14
65 eor $t0,$t0,$Ehi,lsl#18
66 eor $t1,$t1,$Elo,lsl#18
67 eor $t0,$t0,$Elo,lsr#18
68 eor $t1,$t1,$Ehi,lsr#18
69 eor $t0,$t0,$Ehi,lsl#14
70 eor $t1,$t1,$Elo,lsl#14
71 eor $t0,$t0,$Ehi,lsr#9
72 eor $t1,$t1,$Elo,lsr#9
73 eor $t0,$t0,$Elo,lsl#23
74 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
75 adds $Tlo,$Tlo,$t0
76 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
77 adds $Tlo,$Tlo,$t2
78 adc $Thi,$Thi,$t3 @ T += h
79
80 ldr $t0,[sp,#$Foff+0] @ f.lo
81 ldr $t1,[sp,#$Foff+4] @ f.hi
82 ldr $t2,[sp,#$Goff+0] @ g.lo
83 ldr $t3,[sp,#$Goff+4] @ g.hi
84 str $Elo,[sp,#$Eoff+0]
85 str $Ehi,[sp,#$Eoff+4]
86 str $Alo,[sp,#$Aoff+0]
87 str $Ahi,[sp,#$Aoff+4]
88
89 eor $t0,$t0,$t2
90 eor $t1,$t1,$t3
91 and $t0,$t0,$Elo
92 and $t1,$t1,$Ehi
93 eor $t0,$t0,$t2
94 eor $t1,$t1,$t3 @ Ch(e,f,g)
95
96 ldr $t2,[$Ktbl,#4] @ K[i].lo
97 ldr $t3,[$Ktbl,#0] @ K[i].hi
98 ldr $Elo,[sp,#$Doff+0] @ d.lo
99 ldr $Ehi,[sp,#$Doff+4] @ d.hi
100
101 adds $Tlo,$Tlo,$t0
102 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
103 adds $Tlo,$Tlo,$t2
104 adc $Thi,$Thi,$t3 @ T += K[i]
105 adds $Elo,$Elo,$Tlo
106 adc $Ehi,$Ehi,$Thi @ d += T
107
108 and $t0,$t2,#0xff
109 teq $t0,#$magic
110 orreq $Ktbl,$Ktbl,#1
111
112 ldr $t2,[sp,#$Boff+0] @ b.lo
113 ldr $t3,[sp,#$Coff+0] @ c.lo
114 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
115 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
116 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
117 mov $t0,$Alo,lsr#28
118 mov $t1,$Ahi,lsr#28
119 eor $t0,$t0,$Ahi,lsl#4
120 eor $t1,$t1,$Alo,lsl#4
121 eor $t0,$t0,$Ahi,lsr#2
122 eor $t1,$t1,$Alo,lsr#2
123 eor $t0,$t0,$Alo,lsl#30
124 eor $t1,$t1,$Ahi,lsl#30
125 eor $t0,$t0,$Ahi,lsr#7
126 eor $t1,$t1,$Alo,lsr#7
127 eor $t0,$t0,$Alo,lsl#25
128 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
129 adds $Tlo,$Tlo,$t0
130 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
131
132 and $t0,$Alo,$t2
133 orr $Alo,$Alo,$t2
134 ldr $t1,[sp,#$Boff+4] @ b.hi
135 ldr $t2,[sp,#$Coff+4] @ c.hi
136 and $Alo,$Alo,$t3
137 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
138 and $t3,$Ahi,$t1
139 orr $Ahi,$Ahi,$t1
140 and $Ahi,$Ahi,$t2
141 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
142 adds $Alo,$Alo,$Tlo
143 adc $Ahi,$Ahi,$Thi @ h += T
144
145 sub sp,sp,#8
146 add $Ktbl,$Ktbl,#8
147___
148}
149$code=<<___;
150.text
151.code 32
152.type K512,%object
153.align 5
154K512:
155.word 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
156.word 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
157.word 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
158.word 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
159.word 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
160.word 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
161.word 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
162.word 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
163.word 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
164.word 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
165.word 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
166.word 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
167.word 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
168.word 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
169.word 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
170.word 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
171.word 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
172.word 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
173.word 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
174.word 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
175.word 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
176.word 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
177.word 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
178.word 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
179.word 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
180.word 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
181.word 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
182.word 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
183.word 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
184.word 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
185.word 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
186.word 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
187.word 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
188.word 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
189.word 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
190.word 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
191.word 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
192.word 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
193.word 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
194.word 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
195.size K512,.-K512
196
197.global sha512_block_data_order
198.type sha512_block_data_order,%function
199sha512_block_data_order:
200 sub r3,pc,#8 @ sha512_block_data_order
201 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
202 stmdb sp!,{r4-r12,lr}
203 sub $Ktbl,r3,#640 @ K512
204 sub sp,sp,#9*8
205
206 ldr $Elo,[$ctx,#$Eoff+$lo]
207 ldr $Ehi,[$ctx,#$Eoff+$hi]
208 ldr $t0, [$ctx,#$Goff+$lo]
209 ldr $t1, [$ctx,#$Goff+$hi]
210 ldr $t2, [$ctx,#$Hoff+$lo]
211 ldr $t3, [$ctx,#$Hoff+$hi]
212.Loop:
213 str $t0, [sp,#$Goff+0]
214 str $t1, [sp,#$Goff+4]
215 str $t2, [sp,#$Hoff+0]
216 str $t3, [sp,#$Hoff+4]
217 ldr $Alo,[$ctx,#$Aoff+$lo]
218 ldr $Ahi,[$ctx,#$Aoff+$hi]
219 ldr $Tlo,[$ctx,#$Boff+$lo]
220 ldr $Thi,[$ctx,#$Boff+$hi]
221 ldr $t0, [$ctx,#$Coff+$lo]
222 ldr $t1, [$ctx,#$Coff+$hi]
223 ldr $t2, [$ctx,#$Doff+$lo]
224 ldr $t3, [$ctx,#$Doff+$hi]
225 str $Tlo,[sp,#$Boff+0]
226 str $Thi,[sp,#$Boff+4]
227 str $t0, [sp,#$Coff+0]
228 str $t1, [sp,#$Coff+4]
229 str $t2, [sp,#$Doff+0]
230 str $t3, [sp,#$Doff+4]
231 ldr $Tlo,[$ctx,#$Foff+$lo]
232 ldr $Thi,[$ctx,#$Foff+$hi]
233 str $Tlo,[sp,#$Foff+0]
234 str $Thi,[sp,#$Foff+4]
235
236.L00_15:
237 ldrb $Tlo,[$inp,#7]
238 ldrb $t0, [$inp,#6]
239 ldrb $t1, [$inp,#5]
240 ldrb $t2, [$inp,#4]
241 ldrb $Thi,[$inp,#3]
242 ldrb $t3, [$inp,#2]
243 orr $Tlo,$Tlo,$t0,lsl#8
244 ldrb $t0, [$inp,#1]
245 orr $Tlo,$Tlo,$t1,lsl#16
246 ldrb $t1, [$inp],#8
247 orr $Tlo,$Tlo,$t2,lsl#24
248 orr $Thi,$Thi,$t3,lsl#8
249 orr $Thi,$Thi,$t0,lsl#16
250 orr $Thi,$Thi,$t1,lsl#24
251 str $Tlo,[sp,#$Xoff+0]
252 str $Thi,[sp,#$Xoff+4]
253___
254 &BODY_00_15(0x94);
255$code.=<<___;
256 tst $Ktbl,#1
257 beq .L00_15
258 bic $Ktbl,$Ktbl,#1
259
260.L16_79:
261 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
262 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
263 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
264 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
265
266 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
267 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
268 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
269 mov $Tlo,$t0,lsr#1
270 mov $Thi,$t1,lsr#1
271 eor $Tlo,$Tlo,$t1,lsl#31
272 eor $Thi,$Thi,$t0,lsl#31
273 eor $Tlo,$Tlo,$t0,lsr#8
274 eor $Thi,$Thi,$t1,lsr#8
275 eor $Tlo,$Tlo,$t1,lsl#24
276 eor $Thi,$Thi,$t0,lsl#24
277 eor $Tlo,$Tlo,$t0,lsr#7
278 eor $Thi,$Thi,$t1,lsr#7
279 eor $Tlo,$Tlo,$t1,lsl#25
280
281 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
282 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
283 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
284 mov $t0,$t2,lsr#19
285 mov $t1,$t3,lsr#19
286 eor $t0,$t0,$t3,lsl#13
287 eor $t1,$t1,$t2,lsl#13
288 eor $t0,$t0,$t3,lsr#29
289 eor $t1,$t1,$t2,lsr#29
290 eor $t0,$t0,$t2,lsl#3
291 eor $t1,$t1,$t3,lsl#3
292 eor $t0,$t0,$t2,lsr#6
293 eor $t1,$t1,$t3,lsr#6
294 eor $t0,$t0,$t3,lsl#26
295
296 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
297 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
298 adds $Tlo,$Tlo,$t0
299 adc $Thi,$Thi,$t1
300
301 ldr $t0,[sp,#`$Xoff+8*16`+0]
302 ldr $t1,[sp,#`$Xoff+8*16`+4]
303 adds $Tlo,$Tlo,$t2
304 adc $Thi,$Thi,$t3
305 adds $Tlo,$Tlo,$t0
306 adc $Thi,$Thi,$t1
307 str $Tlo,[sp,#$Xoff+0]
308 str $Thi,[sp,#$Xoff+4]
309___
310 &BODY_00_15(0x17);
311$code.=<<___;
312 tst $Ktbl,#1
313 beq .L16_79
314 bic $Ktbl,$Ktbl,#1
315
316 ldr $Tlo,[sp,#$Boff+0]
317 ldr $Thi,[sp,#$Boff+4]
318 ldr $t0, [$ctx,#$Aoff+$lo]
319 ldr $t1, [$ctx,#$Aoff+$hi]
320 ldr $t2, [$ctx,#$Boff+$lo]
321 ldr $t3, [$ctx,#$Boff+$hi]
322 adds $t0,$Alo,$t0
323 adc $t1,$Ahi,$t1
324 adds $t2,$Tlo,$t2
325 adc $t3,$Thi,$t3
326 str $t0, [$ctx,#$Aoff+$lo]
327 str $t1, [$ctx,#$Aoff+$hi]
328 str $t2, [$ctx,#$Boff+$lo]
329 str $t3, [$ctx,#$Boff+$hi]
330
331 ldr $Alo,[sp,#$Coff+0]
332 ldr $Ahi,[sp,#$Coff+4]
333 ldr $Tlo,[sp,#$Doff+0]
334 ldr $Thi,[sp,#$Doff+4]
335 ldr $t0, [$ctx,#$Coff+$lo]
336 ldr $t1, [$ctx,#$Coff+$hi]
337 ldr $t2, [$ctx,#$Doff+$lo]
338 ldr $t3, [$ctx,#$Doff+$hi]
339 adds $t0,$Alo,$t0
340 adc $t1,$Ahi,$t1
341 adds $t2,$Tlo,$t2
342 adc $t3,$Thi,$t3
343 str $t0, [$ctx,#$Coff+$lo]
344 str $t1, [$ctx,#$Coff+$hi]
345 str $t2, [$ctx,#$Doff+$lo]
346 str $t3, [$ctx,#$Doff+$hi]
347
348 ldr $Tlo,[sp,#$Foff+0]
349 ldr $Thi,[sp,#$Foff+4]
350 ldr $t0, [$ctx,#$Eoff+$lo]
351 ldr $t1, [$ctx,#$Eoff+$hi]
352 ldr $t2, [$ctx,#$Foff+$lo]
353 ldr $t3, [$ctx,#$Foff+$hi]
354 adds $Elo,$Elo,$t0
355 adc $Ehi,$Ehi,$t1
356 adds $t2,$Tlo,$t2
357 adc $t3,$Thi,$t3
358 str $Elo,[$ctx,#$Eoff+$lo]
359 str $Ehi,[$ctx,#$Eoff+$hi]
360 str $t2, [$ctx,#$Foff+$lo]
361 str $t3, [$ctx,#$Foff+$hi]
362
363 ldr $Alo,[sp,#$Goff+0]
364 ldr $Ahi,[sp,#$Goff+4]
365 ldr $Tlo,[sp,#$Hoff+0]
366 ldr $Thi,[sp,#$Hoff+4]
367 ldr $t0, [$ctx,#$Goff+$lo]
368 ldr $t1, [$ctx,#$Goff+$hi]
369 ldr $t2, [$ctx,#$Hoff+$lo]
370 ldr $t3, [$ctx,#$Hoff+$hi]
371 adds $t0,$Alo,$t0
372 adc $t1,$Ahi,$t1
373 adds $t2,$Tlo,$t2
374 adc $t3,$Thi,$t3
375 str $t0, [$ctx,#$Goff+$lo]
376 str $t1, [$ctx,#$Goff+$hi]
377 str $t2, [$ctx,#$Hoff+$lo]
378 str $t3, [$ctx,#$Hoff+$hi]
379
380 add sp,sp,#640
381 sub $Ktbl,$Ktbl,#640
382
383 teq $inp,$len
384 bne .Loop
385
386 add sp,sp,#8*9 @ destroy frame
387 ldmia sp!,{r4-r12,lr}
388 tst lr,#1
389 moveq pc,lr @ be binary compatible with V4, yet
390 bx lr @ interoperable with Thumb ISA:-)
391.size sha512_block_data_order,.-sha512_block_data_order
392.asciz "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
393.align 2
394___
395
396$code =~ s/\`([^\`]*)\`/eval $1/gem;
397$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
398print $code;
399close STDOUT; # enforce flush
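Since ARMv4 has no 64-bit registers, every 64-bit addition above is an adds/adc pair: adds produces the low word and sets the carry flag, and adc folds that carry into the high word. A plain-Perl model of one such paired addition:

# Sketch only: 64-bit addition on 32-bit halves, as done by adds/adc.
sub add64_split {
	my ($alo, $ahi, $blo, $bhi) = @_;
	my $sum   = $alo + $blo;		# adds: low words
	my $carry = $sum > 0xffffffff ? 1 : 0;	# the carry flag
	return ($sum & 0xffffffff,
		($ahi + $bhi + $carry) & 0xffffffff);	# adc: high words
}

Loop control is just as register-starved: BODY_00_15 compares the low byte of K[i] against a magic byte (0x94 for the first 16 rounds, 0x17 for the rest; the low bytes of 0xcf692694 and 0x4a475817) and, on match, sets bit 0 of $Ktbl as an exit flag for the tst/beq/bic sequence after each loop.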
diff --git a/src/lib/libcrypto/sha/asm/sha512-ppc.pl b/src/lib/libcrypto/sha/asm/sha512-ppc.pl
new file mode 100755
index 0000000000..768a6a6fad
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha512-ppc.pl
@@ -0,0 +1,462 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# I let hardware handle unaligned input, except on page boundaries
11# (see below for details). Otherwise it is a straightforward implementation
12# with the X vector in the register bank. The module is big-endian [which is
13# not a big deal, as there are no little-endian targets left around].
14
15# sha256 | sha512
16# -m64 -m32 | -m64 -m32
17# --------------------------------------+-----------------------
18# PPC970,gcc-4.0.0 +50% +38% | +40% +410%(*)
19# Power6,xlc-7 +150% +90% | +100% +430%(*)
20#
21# (*) 64-bit code in 32-bit application context, which actually is
22# on the TODO list. It should be noted that for safe deployment in
23# a 32-bit *multi-threaded* context asynchronous signals should be
24# blocked upon entry to the SHA512 block routine. This is because the
25# 32-bit signaling procedure invalidates the upper halves of GPRs.
26# The context switch procedure preserves them, but signaling does not:-(
27
28# The second version is truly multi-thread safe. The trouble with the
29# original version was that it used the thread-local-storage pointer
30# register. It scrupulously preserved it, but the problem would arise the
31# moment an asynchronous signal was delivered and the signal handler
32# dereferenced the TLS pointer. While this never happens in the openssl
33# application or test suite, we have to respect the scenario and not
34# use the TLS pointer register. The alternative would be to require the
35# caller to block signals prior to calling this routine. For the record,
36# in 32-bit context R2 serves as the TLS pointer; in 64-bit context, R13.
37
38$flavour=shift;
39$output =shift;
40
41if ($flavour =~ /64/) {
42 $SIZE_T=8;
43 $STU="stdu";
44 $UCMP="cmpld";
45 $SHL="sldi";
46 $POP="ld";
47 $PUSH="std";
48} elsif ($flavour =~ /32/) {
49 $SIZE_T=4;
50 $STU="stwu";
51 $UCMP="cmplw";
52 $SHL="slwi";
53 $POP="lwz";
54 $PUSH="stw";
55} else { die "nonsense $flavour"; }
56
57$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
58( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
59( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
60die "can't locate ppc-xlate.pl";
61
62open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
63
64if ($output =~ /512/) {
65 $func="sha512_block_data_order";
66 $SZ=8;
67 @Sigma0=(28,34,39);
68 @Sigma1=(14,18,41);
69 @sigma0=(1, 8, 7);
70 @sigma1=(19,61, 6);
71 $rounds=80;
72 $LD="ld";
73 $ST="std";
74 $ROR="rotrdi";
75 $SHR="srdi";
76} else {
77 $func="sha256_block_data_order";
78 $SZ=4;
79 @Sigma0=( 2,13,22);
80 @Sigma1=( 6,11,25);
81 @sigma0=( 7,18, 3);
82 @sigma1=(17,19,10);
83 $rounds=64;
84 $LD="lwz";
85 $ST="stw";
86 $ROR="rotrwi";
87 $SHR="srwi";
88}
89
90$FRAME=32*$SIZE_T;
91
92$sp ="r1";
93$toc="r2";
94$ctx="r3"; # zapped by $a0
95$inp="r4"; # zapped by $a1
96$num="r5"; # zapped by $t0
97
98$T ="r0";
99$a0 ="r3";
100$a1 ="r4";
101$t0 ="r5";
102$t1 ="r6";
103$Tbl="r7";
104
105$A ="r8";
106$B ="r9";
107$C ="r10";
108$D ="r11";
109$E ="r12";
110$F ="r13"; $F="r2" if ($SIZE_T==8);# reassigned to exempt TLS pointer
111$G ="r14";
112$H ="r15";
113
114@V=($A,$B,$C,$D,$E,$F,$G,$H);
115@X=("r16","r17","r18","r19","r20","r21","r22","r23",
116 "r24","r25","r26","r27","r28","r29","r30","r31");
117
118$inp="r31"; # reassigned $inp! aliases with @X[15]
119
120sub ROUND_00_15 {
121my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
122$code.=<<___;
123 $LD $T,`$i*$SZ`($Tbl)
124 $ROR $a0,$e,$Sigma1[0]
125 $ROR $a1,$e,$Sigma1[1]
126 and $t0,$f,$e
127 andc $t1,$g,$e
128 add $T,$T,$h
129 xor $a0,$a0,$a1
130 $ROR $a1,$a1,`$Sigma1[2]-$Sigma1[1]`
131 or $t0,$t0,$t1 ; Ch(e,f,g)
132 add $T,$T,@X[$i]
133 xor $a0,$a0,$a1 ; Sigma1(e)
134 add $T,$T,$t0
135 add $T,$T,$a0
136
137 $ROR $a0,$a,$Sigma0[0]
138 $ROR $a1,$a,$Sigma0[1]
139 and $t0,$a,$b
140 and $t1,$a,$c
141 xor $a0,$a0,$a1
142 $ROR $a1,$a1,`$Sigma0[2]-$Sigma0[1]`
143 xor $t0,$t0,$t1
144 and $t1,$b,$c
145 xor $a0,$a0,$a1 ; Sigma0(a)
146 add $d,$d,$T
147 xor $t0,$t0,$t1 ; Maj(a,b,c)
148 add $h,$T,$a0
149 add $h,$h,$t0
150
151___
152}
153
154sub ROUND_16_xx {
155my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
156$i-=16;
157$code.=<<___;
158 $ROR $a0,@X[($i+1)%16],$sigma0[0]
159 $ROR $a1,@X[($i+1)%16],$sigma0[1]
160 $ROR $t0,@X[($i+14)%16],$sigma1[0]
161 $ROR $t1,@X[($i+14)%16],$sigma1[1]
162 xor $a0,$a0,$a1
163 $SHR $a1,@X[($i+1)%16],$sigma0[2]
164 xor $t0,$t0,$t1
165 $SHR $t1,@X[($i+14)%16],$sigma1[2]
166 add @X[$i],@X[$i],@X[($i+9)%16]
167 xor $a0,$a0,$a1 ; sigma0(X[(i+1)&0x0f])
168 xor $t0,$t0,$t1 ; sigma1(X[(i+14)&0x0f])
169 add @X[$i],@X[$i],$a0
170 add @X[$i],@X[$i],$t0
171___
172&ROUND_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h);
173}
174
175$code=<<___;
176.machine "any"
177.text
178
179.globl $func
180.align 6
181$func:
182 mflr r0
183 $STU $sp,`-($FRAME+16*$SZ)`($sp)
184 $SHL $num,$num,`log(16*$SZ)/log(2)`
185
186 $PUSH $ctx,`$FRAME-$SIZE_T*22`($sp)
187
188 $PUSH r0,`$FRAME-$SIZE_T*21`($sp)
189 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
190 $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
191 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
192 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
193 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
194 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
195 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
196 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
197 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
198 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
199 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
200 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
201 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
202 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
203 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
204 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
205 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
206 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
207 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
208 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
209
210 $LD $A,`0*$SZ`($ctx)
211 mr $inp,r4 ; incarnate $inp
212 $LD $B,`1*$SZ`($ctx)
213 $LD $C,`2*$SZ`($ctx)
214 $LD $D,`3*$SZ`($ctx)
215 $LD $E,`4*$SZ`($ctx)
216 $LD $F,`5*$SZ`($ctx)
217 $LD $G,`6*$SZ`($ctx)
218 $LD $H,`7*$SZ`($ctx)
219
220 b LPICmeup
221LPICedup:
222 andi. r0,$inp,3
223 bne Lunaligned
224Laligned:
225 add $num,$inp,$num
226 $PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
227 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
228 bl Lsha2_block_private
229Ldone:
230 $POP r0,`$FRAME-$SIZE_T*21`($sp)
231 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
232 $POP r13,`$FRAME-$SIZE_T*19`($sp)
233 $POP r14,`$FRAME-$SIZE_T*18`($sp)
234 $POP r15,`$FRAME-$SIZE_T*17`($sp)
235 $POP r16,`$FRAME-$SIZE_T*16`($sp)
236 $POP r17,`$FRAME-$SIZE_T*15`($sp)
237 $POP r18,`$FRAME-$SIZE_T*14`($sp)
238 $POP r19,`$FRAME-$SIZE_T*13`($sp)
239 $POP r20,`$FRAME-$SIZE_T*12`($sp)
240 $POP r21,`$FRAME-$SIZE_T*11`($sp)
241 $POP r22,`$FRAME-$SIZE_T*10`($sp)
242 $POP r23,`$FRAME-$SIZE_T*9`($sp)
243 $POP r24,`$FRAME-$SIZE_T*8`($sp)
244 $POP r25,`$FRAME-$SIZE_T*7`($sp)
245 $POP r26,`$FRAME-$SIZE_T*6`($sp)
246 $POP r27,`$FRAME-$SIZE_T*5`($sp)
247 $POP r28,`$FRAME-$SIZE_T*4`($sp)
248 $POP r29,`$FRAME-$SIZE_T*3`($sp)
249 $POP r30,`$FRAME-$SIZE_T*2`($sp)
250 $POP r31,`$FRAME-$SIZE_T*1`($sp)
251 mtlr r0
252 addi $sp,$sp,`$FRAME+16*$SZ`
253 blr
254___
255
256# The PowerPC specification allows an implementation to be ill-behaved
257# upon an unaligned access which crosses a page boundary. The "better
258# safe than sorry" principle makes me treat it specially. But I don't
259# look for the particular offending word; rather, I look for the input
260# block which crosses the boundary. Once found, that block is copied to
261# an aligned spot and hashed separately...
262$code.=<<___;
263.align 4
264Lunaligned:
265 subfic $t1,$inp,4096
266 andi. $t1,$t1,`4096-16*$SZ` ; distance to closest page boundary
267 beq Lcross_page
268 $UCMP $num,$t1
269 ble- Laligned ; didn't cross the page boundary
270 subfc $num,$t1,$num
271 add $t1,$inp,$t1
272 $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real remaining num
273 $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; intermediate end pointer
274 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
275 bl Lsha2_block_private
276 ; $inp equals to the intermediate end pointer here
277 $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real remaining num
278Lcross_page:
279 li $t1,`16*$SZ/4`
280 mtctr $t1
281 addi r20,$sp,$FRAME ; aligned spot below the frame
282Lmemcpy:
283 lbz r16,0($inp)
284 lbz r17,1($inp)
285 lbz r18,2($inp)
286 lbz r19,3($inp)
287 addi $inp,$inp,4
288 stb r16,0(r20)
289 stb r17,1(r20)
290 stb r18,2(r20)
291 stb r19,3(r20)
292 addi r20,r20,4
293 bdnz Lmemcpy
294
295 $PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp
296 addi $t1,$sp,`$FRAME+16*$SZ` ; fictitious end pointer
297 addi $inp,$sp,$FRAME ; fictitious inp pointer
298 $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num
299 $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer
300 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
301 bl Lsha2_block_private
302 $POP $inp,`$FRAME-$SIZE_T*26`($sp) ; restore real inp
303 $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num
304 addic. $num,$num,`-16*$SZ` ; num--
305 bne- Lunaligned
306 b Ldone
307___
308
309$code.=<<___;
310.align 4
311Lsha2_block_private:
312___
313for($i=0;$i<16;$i++) {
314$code.=<<___ if ($SZ==4);
315 lwz @X[$i],`$i*$SZ`($inp)
316___
317# 64-bit loads are split to 2x32-bit ones, as CPU can't handle
318# unaligned 64-bit loads, only 32-bit ones...
319$code.=<<___ if ($SZ==8);
320 lwz $t0,`$i*$SZ`($inp)
321 lwz @X[$i],`$i*$SZ+4`($inp)
322 insrdi @X[$i],$t0,32,0
323___
324 &ROUND_00_15($i,@V);
325 unshift(@V,pop(@V));
326}
327$code.=<<___;
328 li $T,`$rounds/16-1`
329 mtctr $T
330.align 4
331Lrounds:
332 addi $Tbl,$Tbl,`16*$SZ`
333___
334for(;$i<32;$i++) {
335 &ROUND_16_xx($i,@V);
336 unshift(@V,pop(@V));
337}
338$code.=<<___;
339 bdnz- Lrounds
340
341 $POP $ctx,`$FRAME-$SIZE_T*22`($sp)
342 $POP $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
343 $POP $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
344 subi $Tbl,$Tbl,`($rounds-16)*$SZ` ; rewind Tbl
345
346 $LD r16,`0*$SZ`($ctx)
347 $LD r17,`1*$SZ`($ctx)
348 $LD r18,`2*$SZ`($ctx)
349 $LD r19,`3*$SZ`($ctx)
350 $LD r20,`4*$SZ`($ctx)
351 $LD r21,`5*$SZ`($ctx)
352 $LD r22,`6*$SZ`($ctx)
353 addi $inp,$inp,`16*$SZ` ; advance inp
354 $LD r23,`7*$SZ`($ctx)
355 add $A,$A,r16
356 add $B,$B,r17
357 $PUSH $inp,`$FRAME-$SIZE_T*23`($sp)
358 add $C,$C,r18
359 $ST $A,`0*$SZ`($ctx)
360 add $D,$D,r19
361 $ST $B,`1*$SZ`($ctx)
362 add $E,$E,r20
363 $ST $C,`2*$SZ`($ctx)
364 add $F,$F,r21
365 $ST $D,`3*$SZ`($ctx)
366 add $G,$G,r22
367 $ST $E,`4*$SZ`($ctx)
368 add $H,$H,r23
369 $ST $F,`5*$SZ`($ctx)
370 $ST $G,`6*$SZ`($ctx)
371 $UCMP $inp,$num
372 $ST $H,`7*$SZ`($ctx)
373 bne Lsha2_block_private
374 blr
375___
376
377# Ugly hack here, because PPC assembler syntax seems to vary too
378# much from platform to platform...
379$code.=<<___;
380.align 6
381LPICmeup:
382 bl LPIC
383 addi $Tbl,$Tbl,`64-4` ; "distance" between . and last nop
384 b LPICedup
385 nop
386 nop
387 nop
388 nop
389 nop
390LPIC: mflr $Tbl
391 blr
392 nop
393 nop
394 nop
395 nop
396 nop
397 nop
398___
399$code.=<<___ if ($SZ==8);
400 .long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
401 .long 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
402 .long 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
403 .long 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
404 .long 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
405 .long 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
406 .long 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
407 .long 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
408 .long 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
409 .long 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
410 .long 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
411 .long 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
412 .long 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
413 .long 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
414 .long 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
415 .long 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
416 .long 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
417 .long 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
418 .long 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
419 .long 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
420 .long 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
421 .long 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
422 .long 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
423 .long 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
424 .long 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
425 .long 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
426 .long 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
427 .long 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
428 .long 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
429 .long 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
430 .long 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
431 .long 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
432 .long 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
433 .long 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
434 .long 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
435 .long 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
436 .long 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
437 .long 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
438 .long 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
439 .long 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
440___
441$code.=<<___ if ($SZ==4);
442 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
443 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
444 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
445 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
446 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
447 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
448 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
449 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
450 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
451 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
452 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
453 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
454 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
455 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
456 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
457 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
458___
459
460$code =~ s/\`([^\`]*)\`/eval $1/gem;
461print $code;
462close STDOUT;
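
Like the other CRYPTOGAMS modules in this batch, the PPC module keeps every constant expression in backticks while $code is assembled, then folds them all in the final substitution just above. A minimal standalone sketch of that idiom (the instruction and registers below are illustrative only, not taken from the module):

    #!/usr/bin/env perl
    # Sketch of the backtick-eval idiom: offsets stay symbolic inside `...`
    # while the assembler text is built, then one pass folds them to numbers.
    my ($i,$SZ) = (3,8);                 # e.g. word 3 of a SHA512 block
    my $code = <<___;
    	lwz r0,`$i*$SZ`(r4)
    ___
    $code =~ s/\`([^\`]*)\`/eval $1/gem; # same pass as the modules use
    print $code;                         # emits "	lwz r0,24(r4)"
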
diff --git a/src/lib/libcrypto/sha/asm/sha512-s390x.pl b/src/lib/libcrypto/sha/asm/sha512-s390x.pl
new file mode 100644
index 0000000000..e7ef2d5a9f
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha512-s390x.pl
@@ -0,0 +1,301 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256/512 block procedures for s390x.
11
12# April 2007.
13#
14# sha256_block_data_order is reportedly >3 times faster than gcc 3.3
15# generated code (must be a bug in the compiler, as the improvement is
16# "pathologically" high, in particular in comparison to other SHA
17# modules). But the real twist is that it detects whether hardware support
18# for SHA256 is available and, if so, utilizes it. Performance can then
19# reach >6.5x that of the assembler version for larger chunks.
20#
21# sha512_block_data_order is ~70% faster than gcc 3.3 generated code.
22
23# January 2009.
24#
25# Add support for hardware SHA512 and reschedule instructions to
26# favour the dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
27# than software.
28
29$t0="%r0";
30$t1="%r1";
31$ctx="%r2"; $t2="%r2";
32$inp="%r3";
33$len="%r4"; # used as index in inner loop
34
35$A="%r5";
36$B="%r6";
37$C="%r7";
38$D="%r8";
39$E="%r9";
40$F="%r10";
41$G="%r11";
42$H="%r12"; @V=($A,$B,$C,$D,$E,$F,$G,$H);
43$tbl="%r13";
44$T1="%r14";
45$sp="%r15";
46
47$output=shift;
48open STDOUT,">$output";
49
50if ($output =~ /512/) {
51 $label="512";
52 $SZ=8;
53 $LD="lg"; # load from memory
54 $ST="stg"; # store to memory
55 $ADD="alg"; # add with memory operand
56 $ROT="rllg"; # rotate left
57	$SHR="srlg";	# logical right shift [see the fix-up at the end]
58 @Sigma0=(25,30,36);
59 @Sigma1=(23,46,50);
60 @sigma0=(56,63, 7);
61 @sigma1=( 3,45, 6);
62 $rounds=80;
63 $kimdfunc=3; # 0 means unknown/unsupported/unimplemented/disabled
64} else {
65 $label="256";
66 $SZ=4;
67 $LD="llgf"; # load from memory
68 $ST="st"; # store to memory
69 $ADD="al"; # add with memory operand
70 $ROT="rll"; # rotate left
71 $SHR="srl"; # logical right shift
72 @Sigma0=(10,19,30);
73 @Sigma1=( 7,21,26);
74 @sigma0=(14,25, 3);
75 @sigma1=(13,15,10);
76 $rounds=64;
77 $kimdfunc=2; # magic function code for kimd instruction
78}
79$Func="sha${label}_block_data_order";
80$Table="K${label}";
81$frame=160+16*$SZ;
82
83sub BODY_00_15 {
84my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
85
86$code.=<<___ if ($i<16);
87 $LD $T1,`$i*$SZ`($inp) ### $i
88___
89$code.=<<___;
90 $ROT $t0,$e,$Sigma1[0]
91 $ROT $t1,$e,$Sigma1[1]
92 lgr $t2,$f
93 xgr $t0,$t1
94 $ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]`
95 xgr $t2,$g
96 $ST $T1,`160+$SZ*($i%16)`($sp)
97 xgr $t0,$t1 # Sigma1(e)
98 la $T1,0($T1,$h) # T1+=h
99 ngr $t2,$e
100 lgr $t1,$a
101 algr $T1,$t0 # T1+=Sigma1(e)
102 $ROT $h,$a,$Sigma0[0]
103 xgr $t2,$g # Ch(e,f,g)
104 $ADD $T1,`$i*$SZ`($len,$tbl) # T1+=K[i]
105 $ROT $t0,$a,$Sigma0[1]
106 algr $T1,$t2 # T1+=Ch(e,f,g)
107 ogr $t1,$b
108 xgr $h,$t0
109 lgr $t2,$a
110 ngr $t1,$c
111 $ROT $t0,$t0,`$Sigma0[2]-$Sigma0[1]`
112 xgr $h,$t0 # h=Sigma0(a)
113 ngr $t2,$b
114 algr $h,$T1 # h+=T1
115 ogr $t2,$t1 # Maj(a,b,c)
116 la $d,0($d,$T1) # d+=T1
117 algr $h,$t2 # h+=Maj(a,b,c)
118___
119}
120
121sub BODY_16_XX {
122my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
123
124$code.=<<___;
125 $LD $T1,`160+$SZ*(($i+1)%16)`($sp) ### $i
126 $LD $t1,`160+$SZ*(($i+14)%16)`($sp)
127 $ROT $t0,$T1,$sigma0[0]
128 $SHR $T1,$sigma0[2]
129 $ROT $t2,$t0,`$sigma0[1]-$sigma0[0]`
130 xgr $T1,$t0
131 $ROT $t0,$t1,$sigma1[0]
132 xgr $T1,$t2 # sigma0(X[i+1])
133 $SHR $t1,$sigma1[2]
134 $ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i]
135 xgr $t1,$t0
136 $ROT $t0,$t0,`$sigma1[1]-$sigma1[0]`
137 $ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
138 xgr $t1,$t0 # sigma1(X[i+14])
139 algr $T1,$t1 # +=sigma1(X[i+14])
140___
141 &BODY_00_15(@_);
142}
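
BODY_16_XX above is the standard FIPS 180-2 message-schedule recurrence, W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16], kept in a 16-entry circular buffer on the stack; the (i+1), (i+9) and (i+14) offsets are the mod-16 images of i-15, i-7 and i-2. A plain Perl rendering of the 32-bit case follows (the schedule words are dummy samples; note the module's @sigma arrays hold left-rotate amounts, i.e. word size minus the usual right rotations, since rll/rllg rotate left):

    #!/usr/bin/env perl
    # The recurrence BODY_16_XX computes in registers, written out plainly
    # for the 32-bit case; rotations are the FIPS 180-2 sigma values.
    sub ror32  { my ($x,$n) = @_; (($x >> $n) | ($x << (32-$n))) & 0xffffffff }
    sub sigma0 { my $x = shift; ror32($x, 7) ^ ror32($x,18) ^ ($x >>  3) }
    sub sigma1 { my $x = shift; ror32($x,17) ^ ror32($x,19) ^ ($x >> 10) }
    my @W = map { $_ * 0x01010101 } (1..16);   # dummy first 16 words
    for my $i (16..19) {
        $W[$i%16] = ($W[$i%16] + sigma0($W[($i+1)%16])
                     + $W[($i+9)%16] + sigma1($W[($i+14)%16])) & 0xffffffff;
        printf "W[%2d] = %08x\n", $i, $W[$i%16];
    }
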
143
144$code.=<<___;
145.text
146.align 64
147.type $Table,\@object
148$Table:
149___
150$code.=<<___ if ($SZ==4);
151 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
152 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
153 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
154 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
155 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
156 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
157 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
158 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
159 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
160 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
161 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
162 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
163 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
164 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
165 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
166 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
167___
168$code.=<<___ if ($SZ==8);
169 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
170 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
171 .quad 0x3956c25bf348b538,0x59f111f1b605d019
172 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
173 .quad 0xd807aa98a3030242,0x12835b0145706fbe
174 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
175 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
176 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
177 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
178 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
179 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
180 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
181 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
182 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
183 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
184 .quad 0x06ca6351e003826f,0x142929670a0e6e70
185 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
186 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
187 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
188 .quad 0x81c2c92e47edaee6,0x92722c851482353b
189 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
190 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
191 .quad 0xd192e819d6ef5218,0xd69906245565a910
192 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
193 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
194 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
195 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
196 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
197 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
198 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
199 .quad 0x90befffa23631e28,0xa4506cebde82bde9
200 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
201 .quad 0xca273eceea26619c,0xd186b8c721c0c207
202 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
203 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
204 .quad 0x113f9804bef90dae,0x1b710b35131c471b
205 .quad 0x28db77f523047d84,0x32caab7b40c72493
206 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
207 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
208 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
209___
210$code.=<<___;
211.size $Table,.-$Table
212.globl $Func
213.type $Func,\@function
214$Func:
215___
216$code.=<<___ if ($kimdfunc);
217 larl %r1,OPENSSL_s390xcap_P
218 lg %r0,0(%r1)
219 tmhl %r0,0x4000 # check for message-security assist
220 jz .Lsoftware
221 lghi %r0,0
222 la %r1,16($sp)
223 .long 0xb93e0002 # kimd %r0,%r2
224 lg %r0,16($sp)
225 tmhh %r0,`0x8000>>$kimdfunc`
226 jz .Lsoftware
227 lghi %r0,$kimdfunc
228 lgr %r1,$ctx
229 lgr %r2,$inp
230 sllg %r3,$len,`log(16*$SZ)/log(2)`
231 .long 0xb93e0002 # kimd %r0,%r2
232 brc 1,.-4 # pay attention to "partial completion"
233 br %r14
234.align 16
235.Lsoftware:
236___
237$code.=<<___;
238 sllg $len,$len,`log(16*$SZ)/log(2)`
239 lghi %r1,-$frame
240 agr $len,$inp
241 stmg $ctx,%r15,16($sp)
242 lgr %r0,$sp
243 la $sp,0(%r1,$sp)
244 stg %r0,0($sp)
245
246 larl $tbl,$Table
247 $LD $A,`0*$SZ`($ctx)
248 $LD $B,`1*$SZ`($ctx)
249 $LD $C,`2*$SZ`($ctx)
250 $LD $D,`3*$SZ`($ctx)
251 $LD $E,`4*$SZ`($ctx)
252 $LD $F,`5*$SZ`($ctx)
253 $LD $G,`6*$SZ`($ctx)
254 $LD $H,`7*$SZ`($ctx)
255
256.Lloop:
257 lghi $len,0
258___
259for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
260$code.=".Lrounds_16_xx:\n";
261for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
262$code.=<<___;
263 aghi $len,`16*$SZ`
264 lghi $t0,`($rounds-16)*$SZ`
265 clgr $len,$t0
266 jne .Lrounds_16_xx
267
268 lg $ctx,`$frame+16`($sp)
269 la $inp,`16*$SZ`($inp)
270 $ADD $A,`0*$SZ`($ctx)
271 $ADD $B,`1*$SZ`($ctx)
272 $ADD $C,`2*$SZ`($ctx)
273 $ADD $D,`3*$SZ`($ctx)
274 $ADD $E,`4*$SZ`($ctx)
275 $ADD $F,`5*$SZ`($ctx)
276 $ADD $G,`6*$SZ`($ctx)
277 $ADD $H,`7*$SZ`($ctx)
278 $ST $A,`0*$SZ`($ctx)
279 $ST $B,`1*$SZ`($ctx)
280 $ST $C,`2*$SZ`($ctx)
281 $ST $D,`3*$SZ`($ctx)
282 $ST $E,`4*$SZ`($ctx)
283 $ST $F,`5*$SZ`($ctx)
284 $ST $G,`6*$SZ`($ctx)
285 $ST $H,`7*$SZ`($ctx)
286 clg $inp,`$frame+32`($sp)
287 jne .Lloop
288
289 lmg %r6,%r15,`$frame+48`($sp)
290 br %r14
291.size $Func,.-$Func
292.string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
293.comm OPENSSL_s390xcap_P,8,8
294___
295
296$code =~ s/\`([^\`]*)\`/eval $1/gem;
297# unlike the 32-bit shift, the 64-bit one takes three arguments
298$code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;
299
300print $code;
301close STDOUT;
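
The substitution just before the print rewrites every two-operand srlg into the three-operand form the assembler expects, duplicating the register as both destination and source. Standalone, the rewrite looks like this (the sample instruction is illustrative):

    #!/usr/bin/env perl
    # The srlg fix-up from sha512-s390x.pl: the 64-bit shift takes three
    # operands, so the register is doubled up as destination and source.
    my $code = "	srlg	%r1,7\n";            # two-operand form, as generated
    $code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;
    print $code;                             # emits "	srlg	%r1,%r1,7"
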
diff --git a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
new file mode 100644
index 0000000000..54241aab50
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
@@ -0,0 +1,593 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256 performance improvement over compiler generated code varies
11# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
12# build]. Just like in the SHA1 module, I aim to ensure scalability on
13# UltraSPARC T1 by packing X[16] into 8 64-bit registers.
14
15# SHA512 on pre-T1 UltraSPARC.
16#
17# Performance is >75% better than 64-bit code generated by Sun C and
18# over 2x better than 32-bit code. X[16] resides on the stack, but
19# access to it is scheduled for L2 latency and staged through the 32
20# least significant bits of %l0-%l7. The latter is done to achieve 32-/64-bit
21# ABI duality. Nevertheless it's ~40% faster than SHA256, which is pretty
22# good [optimal coefficient is 50%].
23#
24# SHA512 on UltraSPARC T1.
25#
26# It's not any faster than 64-bit code generated by Sun C 5.8. This is
27# because the 64-bit code generator has the advantage of using 64-bit
28# loads(*) to access X[16], which I consciously traded for 32-/64-bit
29# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
30# code by 60%, not to mention that it doesn't suffer from severe decay
31# when running 4x as many threads as physical cores, and that it leaves
32# gcc [3.4] behind by over a 4x factor! Compared to SHA256, single-thread
33# performance is only 10% better, but overall throughput at the maximum
34# number of threads for a given CPU exceeds that of SHA256
35# by 30% [again, optimal coefficient is 50%].
36#
37# (*)	Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
38#	in-order, i.e. a load instruction has to complete before the next
39#	instruction in the given thread is executed, even if the latter is
40#	not dependent on the load result! This means that on T1 two 32-bit
41# loads are always slower than one 64-bit load. Once again this
42# is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
43# 2x32-bit loads can be as fast as 1x64-bit ones.
44
45$bits=32;
46for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
47if ($bits==64) { $bias=2047; $frame=192; }
48else { $bias=0; $frame=112; }
49
50$output=shift;
51open STDOUT,">$output";
52
53if ($output =~ /512/) {
54 $label="512";
55 $SZ=8;
56 $LD="ldx"; # load from memory
57 $ST="stx"; # store to memory
58 $SLL="sllx"; # shift left logical
59 $SRL="srlx"; # shift right logical
60 @Sigma0=(28,34,39);
61 @Sigma1=(14,18,41);
62 @sigma0=( 7, 1, 8); # right shift first
63 @sigma1=( 6,19,61); # right shift first
64 $lastK=0x817;
65 $rounds=80;
66 $align=4;
67
68 $locals=16*$SZ; # X[16]
69
70 $A="%o0";
71 $B="%o1";
72 $C="%o2";
73 $D="%o3";
74 $E="%o4";
75 $F="%o5";
76 $G="%g1";
77 $H="%o7";
78 @V=($A,$B,$C,$D,$E,$F,$G,$H);
79} else {
80 $label="256";
81 $SZ=4;
82 $LD="ld"; # load from memory
83 $ST="st"; # store to memory
84 $SLL="sll"; # shift left logical
85 $SRL="srl"; # shift right logical
86 @Sigma0=( 2,13,22);
87 @Sigma1=( 6,11,25);
88 @sigma0=( 3, 7,18); # right shift first
89 @sigma1=(10,17,19); # right shift first
90 $lastK=0x8f2;
91 $rounds=64;
92 $align=8;
93
94 $locals=0; # X[16] is register resident
95 @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
96
97 $A="%l0";
98 $B="%l1";
99 $C="%l2";
100 $D="%l3";
101 $E="%l4";
102 $F="%l5";
103 $G="%l6";
104 $H="%l7";
105 @V=($A,$B,$C,$D,$E,$F,$G,$H);
106}
107$T1="%g2";
108$tmp0="%g3";
109$tmp1="%g4";
110$tmp2="%g5";
111
112$ctx="%i0";
113$inp="%i1";
114$len="%i2";
115$Ktbl="%i3";
116$tmp31="%i4";
117$tmp32="%i5";
118
119########### SHA256
120$Xload = sub {
121my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
122
123 if ($i==0) {
124$code.=<<___;
125 ldx [$inp+0],@X[0]
126 ldx [$inp+16],@X[2]
127 ldx [$inp+32],@X[4]
128 ldx [$inp+48],@X[6]
129 ldx [$inp+8],@X[1]
130 ldx [$inp+24],@X[3]
131 subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
132 ldx [$inp+40],@X[5]
133 bz,pt %icc,.Laligned
134 ldx [$inp+56],@X[7]
135
136 sllx @X[0],$tmp31,@X[0]
137 ldx [$inp+64],$T1
138___
139for($j=0;$j<7;$j++)
140{ $code.=<<___;
141 srlx @X[$j+1],$tmp32,$tmp1
142 sllx @X[$j+1],$tmp31,@X[$j+1]
143 or $tmp1,@X[$j],@X[$j]
144___
145}
146$code.=<<___;
147 srlx $T1,$tmp32,$T1
148 or $T1,@X[7],@X[7]
149.Laligned:
150___
151 }
152
153 if ($i&1) {
154 $code.="\tadd @X[$i/2],$h,$T1\n";
155 } else {
156 $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
157 }
158} if ($SZ==4);
159
160########### SHA512
161$Xload = sub {
162my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
163my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
164
165$code.=<<___ if ($i==0);
166 ld [$inp+0],%l0
167 ld [$inp+4],%l1
168 ld [$inp+8],%l2
169 ld [$inp+12],%l3
170 ld [$inp+16],%l4
171 ld [$inp+20],%l5
172 ld [$inp+24],%l6
173 ld [$inp+28],%l7
174___
175$code.=<<___ if ($i<15);
176 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
177 add $tmp31,32,$tmp0
178 sllx @pair[0],$tmp0,$tmp1
179 `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
180 srlx @pair[2],$tmp32,@pair[1]
181 or $tmp1,$tmp2,$tmp2
182 or @pair[1],$tmp2,$tmp2
183 `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
184 add $h,$tmp2,$T1
185 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
186___
187$code.=<<___ if ($i==12);
188 brnz,a $tmp31,.+8
189 ld [$inp+128],%l0
190___
191$code.=<<___ if ($i==15);
192 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
193 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
194 add $tmp31,32,$tmp0
195 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
196 sllx @pair[0],$tmp0,$tmp1
197 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
198 srlx @pair[2],$tmp32,@pair[1]
199 or $tmp1,$tmp2,$tmp2
200 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
201 or @pair[1],$tmp2,$tmp2
202 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
203 add $h,$tmp2,$T1
204 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
205 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
206 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
207 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
208___
209} if ($SZ==8);
210
211########### common
212sub BODY_00_15 {
213my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
214
215 if ($i<16) {
216 &$Xload(@_);
217 } else {
218 $code.="\tadd $h,$T1,$T1\n";
219 }
220
221$code.=<<___;
222 $SRL $e,@Sigma1[0],$h !! $i
223 xor $f,$g,$tmp2
224 $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
225 and $e,$tmp2,$tmp2
226 $SRL $e,@Sigma1[1],$tmp0
227 xor $tmp1,$h,$h
228 $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
229 xor $tmp0,$h,$h
230 $SRL $e,@Sigma1[2],$tmp0
231 xor $tmp1,$h,$h
232 $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
233 xor $tmp0,$h,$h
234 xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
235 xor $tmp1,$h,$tmp0 ! Sigma1(e)
236
237 $SRL $a,@Sigma0[0],$h
238 add $tmp2,$T1,$T1
239 $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
240 $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
241 add $tmp0,$T1,$T1
242 $SRL $a,@Sigma0[1],$tmp0
243 xor $tmp1,$h,$h
244 $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
245 xor $tmp0,$h,$h
246 $SRL $a,@Sigma0[2],$tmp0
247 xor $tmp1,$h,$h
248 $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
249 xor $tmp0,$h,$h
250 xor $tmp1,$h,$h ! Sigma0(a)
251
252 or $a,$b,$tmp0
253 and $a,$b,$tmp1
254 and $c,$tmp0,$tmp0
255 or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
256 add $tmp2,$T1,$T1 ! +=K[$i]
257 add $tmp1,$h,$h
258
259 add $T1,$d,$d
260 add $T1,$h,$h
261___
262}
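
The round body above avoids a third temporary by using two standard identities: Maj(a,b,c) = (a & b) | (c & (a | b)) — the or/and/and/or sequence — and Ch(e,f,g) = ((f ^ g) & e) ^ g, which costs two XORs and one AND. A quick Perl check that both agree with the textbook definitions:

    #!/usr/bin/env perl
    # Verifies the Maj and Ch identities the round body relies on, over all
    # single-bit inputs (bitwise ops make this carry over to full words).
    for my $x (0,1) { for my $y (0,1) { for my $z (0,1) {
        my $maj = ($x & $y) ^ ($x & $z) ^ ($y & $z);         # textbook Maj
        die "Maj mismatch" if $maj != (($x & $y) | ($z & ($x | $y)));
        my $ch  = (($x & $y) ^ (~$x & $z)) & 1;              # textbook Ch
        die "Ch mismatch"  if $ch  != ((($y ^ $z) & $x) ^ $z);
    }}}
    print "Maj and Ch identities hold\n";
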
263
264########### SHA256
265$BODY_16_XX = sub {
266my $i=@_[0];
267my $xi;
268
269 if ($i&1) {
270 $xi=$tmp32;
271 $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
272 } else {
273 $xi=@X[(($i+1)/2)%8];
274 }
275$code.=<<___;
276 srl $xi,@sigma0[0],$T1 !! Xupdate($i)
277 sll $xi,`32-@sigma0[2]`,$tmp1
278 srl $xi,@sigma0[1],$tmp0
279 xor $tmp1,$T1,$T1
280 sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
281 xor $tmp0,$T1,$T1
282 srl $xi,@sigma0[2],$tmp0
283 xor $tmp1,$T1,$T1
284___
285 if ($i&1) {
286 $xi=@X[(($i+14)/2)%8];
287 } else {
288 $xi=$tmp32;
289 $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
290 }
291$code.=<<___;
292 srl $xi,@sigma1[0],$tmp2
293 xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
294 sll $xi,`32-@sigma1[2]`,$tmp1
295 srl $xi,@sigma1[1],$tmp0
296 xor $tmp1,$tmp2,$tmp2
297 sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
298 xor $tmp0,$tmp2,$tmp2
299 srl $xi,@sigma1[2],$tmp0
300 xor $tmp1,$tmp2,$tmp2
301___
302 if ($i&1) {
303 $xi=@X[($i/2)%8];
304$code.=<<___;
305 srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
306 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
307 srl @X[($i/2)%8],0,$tmp0
308 add $xi,$T1,$T1 ! +=X[i]
309 xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
310 add $tmp2,$T1,$T1
311 add $tmp1,$T1,$T1
312
313 srl $T1,0,$T1
314 or $T1,@X[($i/2)%8],@X[($i/2)%8]
315___
316 } else {
317 $xi=@X[(($i+9)/2)%8];
318$code.=<<___;
319 srlx @X[($i/2)%8],32,$tmp1 ! X[i]
320 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
321 srl @X[($i/2)%8],0,@X[($i/2)%8]
322 add $xi,$T1,$T1 ! +=X[i+9]
323 add $tmp2,$T1,$T1
324 add $tmp1,$T1,$T1
325
326 sllx $T1,32,$tmp0
327 or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
328___
329 }
330 &BODY_00_15(@_);
331} if ($SZ==4);
332
333########### SHA512
334$BODY_16_XX = sub {
335my $i=@_[0];
336my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
337
338$code.=<<___;
339 sllx %l2,32,$tmp0 !! Xupdate($i)
340 or %l3,$tmp0,$tmp0
341
342 srlx $tmp0,@sigma0[0],$T1
343 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
344 sllx $tmp0,`64-@sigma0[2]`,$tmp1
345 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
346 srlx $tmp0,@sigma0[1],$tmp0
347 xor $tmp1,$T1,$T1
348 sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
349 xor $tmp0,$T1,$T1
350 srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
351 xor $tmp1,$T1,$T1
352 sllx %l6,32,$tmp2
353 xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
354 or %l7,$tmp2,$tmp2
355
356 srlx $tmp2,@sigma1[0],$tmp1
357 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
358 sllx $tmp2,`64-@sigma1[2]`,$tmp0
359 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
360 srlx $tmp2,@sigma1[1],$tmp2
361 xor $tmp0,$tmp1,$tmp1
362 sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
363 xor $tmp2,$tmp1,$tmp1
364 srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
365 xor $tmp0,$tmp1,$tmp1
366 sllx %l4,32,$tmp0
367 xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
368 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
369 or %l5,$tmp0,$tmp0
370 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
371
372 sllx %l0,32,$tmp2
373 add $tmp1,$T1,$T1
374 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
375 or %l1,$tmp2,$tmp2
376 add $tmp0,$T1,$T1 ! +=X[$i+9]
377 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
378 add $tmp2,$T1,$T1 ! +=X[$i]
379 $ST $T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
380___
381 &BODY_00_15(@_);
382} if ($SZ==8);
383
384$code.=<<___ if ($bits==64);
385.register %g2,#scratch
386.register %g3,#scratch
387___
388$code.=<<___;
389.section ".text",#alloc,#execinstr
390
391.align 64
392K${label}:
393.type K${label},#object
394___
395if ($SZ==4) {
396$code.=<<___;
397 .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
398 .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
399 .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
400 .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
401 .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
402 .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
403 .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
404 .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
405 .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
406 .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
407 .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
408 .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
409 .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
410 .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
411 .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
412 .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
413___
414} else {
415$code.=<<___;
416 .long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
417 .long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
418 .long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
419 .long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
420 .long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
421 .long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
422 .long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
423 .long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
424 .long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
425 .long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
426 .long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
427 .long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
428 .long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
429 .long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
430 .long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
431 .long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
432 .long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
433 .long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
434 .long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
435 .long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
436 .long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
437 .long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
438 .long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
439 .long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
440 .long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
441 .long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
442 .long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
443 .long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
444 .long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
445 .long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
446 .long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
447 .long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
448 .long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
449 .long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
450 .long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
451 .long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
452 .long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
453 .long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
454 .long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
455 .long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
456___
457}
458$code.=<<___;
459.size K${label},.-K${label}
460.globl sha${label}_block_data_order
461sha${label}_block_data_order:
462 save %sp,`-$frame-$locals`,%sp
463 and $inp,`$align-1`,$tmp31
464 sllx $len,`log(16*$SZ)/log(2)`,$len
465 andn $inp,`$align-1`,$inp
466 sll $tmp31,3,$tmp31
467 add $inp,$len,$len
468___
469$code.=<<___ if ($SZ==8); # SHA512
470 mov 32,$tmp32
471 sub $tmp32,$tmp31,$tmp32
472___
473$code.=<<___;
474.Lpic: call .+8
475 add %o7,K${label}-.Lpic,$Ktbl
476
477 $LD [$ctx+`0*$SZ`],$A
478 $LD [$ctx+`1*$SZ`],$B
479 $LD [$ctx+`2*$SZ`],$C
480 $LD [$ctx+`3*$SZ`],$D
481 $LD [$ctx+`4*$SZ`],$E
482 $LD [$ctx+`5*$SZ`],$F
483 $LD [$ctx+`6*$SZ`],$G
484 $LD [$ctx+`7*$SZ`],$H
485
486.Lloop:
487___
488for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
489$code.=".L16_xx:\n";
490for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
491$code.=<<___;
492 and $tmp2,0xfff,$tmp2
493 cmp $tmp2,$lastK
494 bne .L16_xx
495 add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
496
497___
498$code.=<<___ if ($SZ==4); # SHA256
499 $LD [$ctx+`0*$SZ`],@X[0]
500 $LD [$ctx+`1*$SZ`],@X[1]
501 $LD [$ctx+`2*$SZ`],@X[2]
502 $LD [$ctx+`3*$SZ`],@X[3]
503 $LD [$ctx+`4*$SZ`],@X[4]
504 $LD [$ctx+`5*$SZ`],@X[5]
505 $LD [$ctx+`6*$SZ`],@X[6]
506 $LD [$ctx+`7*$SZ`],@X[7]
507
508 add $A,@X[0],$A
509 $ST $A,[$ctx+`0*$SZ`]
510 add $B,@X[1],$B
511 $ST $B,[$ctx+`1*$SZ`]
512 add $C,@X[2],$C
513 $ST $C,[$ctx+`2*$SZ`]
514 add $D,@X[3],$D
515 $ST $D,[$ctx+`3*$SZ`]
516 add $E,@X[4],$E
517 $ST $E,[$ctx+`4*$SZ`]
518 add $F,@X[5],$F
519 $ST $F,[$ctx+`5*$SZ`]
520 add $G,@X[6],$G
521 $ST $G,[$ctx+`6*$SZ`]
522 add $H,@X[7],$H
523 $ST $H,[$ctx+`7*$SZ`]
524___
525$code.=<<___ if ($SZ==8); # SHA512
526 ld [$ctx+`0*$SZ+0`],%l0
527 ld [$ctx+`0*$SZ+4`],%l1
528 ld [$ctx+`1*$SZ+0`],%l2
529 ld [$ctx+`1*$SZ+4`],%l3
530 ld [$ctx+`2*$SZ+0`],%l4
531 ld [$ctx+`2*$SZ+4`],%l5
532 ld [$ctx+`3*$SZ+0`],%l6
533
534 sllx %l0,32,$tmp0
535 ld [$ctx+`3*$SZ+4`],%l7
536 sllx %l2,32,$tmp1
537 or %l1,$tmp0,$tmp0
538 or %l3,$tmp1,$tmp1
539 add $tmp0,$A,$A
540 add $tmp1,$B,$B
541 $ST $A,[$ctx+`0*$SZ`]
542 sllx %l4,32,$tmp2
543 $ST $B,[$ctx+`1*$SZ`]
544 sllx %l6,32,$T1
545 or %l5,$tmp2,$tmp2
546 or %l7,$T1,$T1
547 add $tmp2,$C,$C
548 $ST $C,[$ctx+`2*$SZ`]
549 add $T1,$D,$D
550 $ST $D,[$ctx+`3*$SZ`]
551
552 ld [$ctx+`4*$SZ+0`],%l0
553 ld [$ctx+`4*$SZ+4`],%l1
554 ld [$ctx+`5*$SZ+0`],%l2
555 ld [$ctx+`5*$SZ+4`],%l3
556 ld [$ctx+`6*$SZ+0`],%l4
557 ld [$ctx+`6*$SZ+4`],%l5
558 ld [$ctx+`7*$SZ+0`],%l6
559
560 sllx %l0,32,$tmp0
561 ld [$ctx+`7*$SZ+4`],%l7
562 sllx %l2,32,$tmp1
563 or %l1,$tmp0,$tmp0
564 or %l3,$tmp1,$tmp1
565 add $tmp0,$E,$E
566 add $tmp1,$F,$F
567 $ST $E,[$ctx+`4*$SZ`]
568 sllx %l4,32,$tmp2
569 $ST $F,[$ctx+`5*$SZ`]
570 sllx %l6,32,$T1
571 or %l5,$tmp2,$tmp2
572 or %l7,$T1,$T1
573 add $tmp2,$G,$G
574 $ST $G,[$ctx+`6*$SZ`]
575 add $T1,$H,$H
576 $ST $H,[$ctx+`7*$SZ`]
577___
578$code.=<<___;
579 add $inp,`16*$SZ`,$inp ! advance inp
580 cmp $inp,$len
581 bne `$bits==64?"%xcc":"%icc"`,.Lloop
582 sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
583
584 ret
585 restore
586.type sha${label}_block_data_order,#function
587.size sha${label}_block_data_order,(.-sha${label}_block_data_order)
588.asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
589___
590
591$code =~ s/\`([^\`]*)\`/eval $1/gem;
592print $code;
593close STDOUT;
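
As in the s390x module, the prologue converts the block count in $len into a byte length by shifting left by log2 of the block size; the shift amount is computed once at generation time via log(16*$SZ)/log(2) and folded by the backtick pass. The arithmetic, spelled out:

    #!/usr/bin/env perl
    # The generation-time arithmetic behind `log(16*$SZ)/log(2)`: a block
    # is 16 words of $SZ bytes, so the shift is 6 for SHA256, 7 for SHA512.
    for my $SZ (4, 8) {
        my $shift = log(16*$SZ)/log(2);
        printf "SZ=%d: block=%3d bytes, shift=%g\n", $SZ, 16*$SZ, $shift;
    }
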
diff --git a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl b/src/lib/libcrypto/sha/asm/sha512-x86_64.pl
index b6252d31ec..e6643f8cf6 100755
--- a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl
+++ b/src/lib/libcrypto/sha/asm/sha512-x86_64.pl
@@ -40,14 +40,18 @@
40# sha256_block:-( This is presumably because 64-bit shifts/rotates 40# sha256_block:-( This is presumably because 64-bit shifts/rotates
41# apparently are not atomic instructions, but implemented in microcode. 41# apparently are not atomic instructions, but implemented in microcode.
42 42
43$output=shift; 43$flavour = shift;
44$output = shift;
45if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
46
47$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
44 48
45$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
46( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 50( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
47( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 51( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
48die "can't locate x86_64-xlate.pl"; 52die "can't locate x86_64-xlate.pl";
49 53
50open STDOUT,"| $^X $xlate $output"; 54open STDOUT,"| $^X $xlate $flavour $output";
51 55
52if ($output =~ /512/) { 56if ($output =~ /512/) {
53 $func="sha512_block_data_order"; 57 $func="sha512_block_data_order";
@@ -186,7 +190,7 @@ $func:
186 push %r13 190 push %r13
187 push %r14 191 push %r14
188 push %r15 192 push %r15
189 mov %rsp,%rbp # copy %rsp 193 mov %rsp,%r11 # copy %rsp
190 shl \$4,%rdx # num*16 194 shl \$4,%rdx # num*16
191 sub \$$framesz,%rsp 195 sub \$$framesz,%rsp
192 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 196 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
@@ -194,10 +198,10 @@ $func:
194 mov $ctx,$_ctx # save ctx, 1st arg 198 mov $ctx,$_ctx # save ctx, 1st arg
195 	mov	$inp,$_inp	# save inp, 2nd arg	199 	mov	$inp,$_inp	# save inp, 2nd arg
196 mov %rdx,$_end # save end pointer, "3rd" arg 200 mov %rdx,$_end # save end pointer, "3rd" arg
197 mov %rbp,$_rsp # save copy of %rsp 201 mov %r11,$_rsp # save copy of %rsp
202.Lprologue:
198 203
199 .picmeup $Tbl 204 lea $TABLE(%rip),$Tbl
200 lea $TABLE-.($Tbl),$Tbl
201 205
202 mov $SZ*0($ctx),$A 206 mov $SZ*0($ctx),$A
203 mov $SZ*1($ctx),$B 207 mov $SZ*1($ctx),$B
@@ -257,14 +261,15 @@ $code.=<<___;
257 mov $H,$SZ*7($ctx) 261 mov $H,$SZ*7($ctx)
258 jb .Lloop 262 jb .Lloop
259 263
260 mov $_rsp,%rsp 264 mov $_rsp,%rsi
261 pop %r15 265 mov (%rsi),%r15
262 pop %r14 266 mov 8(%rsi),%r14
263 pop %r13 267 mov 16(%rsi),%r13
264 pop %r12 268 mov 24(%rsi),%r12
265 pop %rbp 269 mov 32(%rsi),%rbp
266 pop %rbx 270 mov 40(%rsi),%rbx
267 271 lea 48(%rsi),%rsp
272.Lepilogue:
268 ret 273 ret
269.size $func,.-$func 274.size $func,.-$func
270___ 275___
@@ -339,6 +344,113 @@ $TABLE:
339___ 344___
340} 345}
341 346
347# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
348# CONTEXT *context,DISPATCHER_CONTEXT *disp)
349if ($win64) {
350$rec="%rcx";
351$frame="%rdx";
352$context="%r8";
353$disp="%r9";
354
355$code.=<<___;
356.extern __imp_RtlVirtualUnwind
357.type se_handler,\@abi-omnipotent
358.align 16
359se_handler:
360 push %rsi
361 push %rdi
362 push %rbx
363 push %rbp
364 push %r12
365 push %r13
366 push %r14
367 push %r15
368 pushfq
369 sub \$64,%rsp
370
371 mov 120($context),%rax # pull context->Rax
372 mov 248($context),%rbx # pull context->Rip
373
374 lea .Lprologue(%rip),%r10
375 cmp %r10,%rbx # context->Rip<.Lprologue
376 jb .Lin_prologue
377
378 mov 152($context),%rax # pull context->Rsp
379
380 lea .Lepilogue(%rip),%r10
381 cmp %r10,%rbx # context->Rip>=.Lepilogue
382 jae .Lin_prologue
383
384 mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
385 lea 48(%rax),%rax
386
387 mov -8(%rax),%rbx
388 mov -16(%rax),%rbp
389 mov -24(%rax),%r12
390 mov -32(%rax),%r13
391 mov -40(%rax),%r14
392 mov -48(%rax),%r15
393 mov %rbx,144($context) # restore context->Rbx
394 mov %rbp,160($context) # restore context->Rbp
395 mov %r12,216($context) # restore context->R12
396 mov %r13,224($context) # restore context->R13
397 mov %r14,232($context) # restore context->R14
398 mov %r15,240($context) # restore context->R15
399
400.Lin_prologue:
401 mov 8(%rax),%rdi
402 mov 16(%rax),%rsi
403 mov %rax,152($context) # restore context->Rsp
404 mov %rsi,168($context) # restore context->Rsi
405 mov %rdi,176($context) # restore context->Rdi
406
407 mov 40($disp),%rdi # disp->ContextRecord
408 mov $context,%rsi # context
409 mov \$154,%ecx # sizeof(CONTEXT)
410 .long 0xa548f3fc # cld; rep movsq
411
412 mov $disp,%rsi
413 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
414 mov 8(%rsi),%rdx # arg2, disp->ImageBase
415 mov 0(%rsi),%r8 # arg3, disp->ControlPc
416 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
417 mov 40(%rsi),%r10 # disp->ContextRecord
418 lea 56(%rsi),%r11 # &disp->HandlerData
419 lea 24(%rsi),%r12 # &disp->EstablisherFrame
420 mov %r10,32(%rsp) # arg5
421 mov %r11,40(%rsp) # arg6
422 mov %r12,48(%rsp) # arg7
423 mov %rcx,56(%rsp) # arg8, (NULL)
424 call *__imp_RtlVirtualUnwind(%rip)
425
426 mov \$1,%eax # ExceptionContinueSearch
427 add \$64,%rsp
428 popfq
429 pop %r15
430 pop %r14
431 pop %r13
432 pop %r12
433 pop %rbp
434 pop %rbx
435 pop %rdi
436 pop %rsi
437 ret
438.size se_handler,.-se_handler
439
440.section .pdata
441.align 4
442 .rva .LSEH_begin_$func
443 .rva .LSEH_end_$func
444 .rva .LSEH_info_$func
445
446.section .xdata
447.align 8
448.LSEH_info_$func:
449 .byte 9,0,0,0
450 .rva se_handler
451___
452}
453
342$code =~ s/\`([^\`]*)\`/eval $1/gem; 454$code =~ s/\`([^\`]*)\`/eval $1/gem;
343print $code; 455print $code;
344close STDOUT; 456close STDOUT;
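
The new calling convention lets build scripts pass an assembler flavour (e.g. "elf", "macosx", "mingw64", "nasm" — example strings from the perlasm family) ahead of the output file, while a lone dotted argument is still treated as the output for compatibility. The parsing, as a standalone sketch:

    #!/usr/bin/env perl
    # Sketch of the flavour/output handling introduced above: one argument
    # containing a dot is the output file; otherwise arg 1 is the flavour.
    my $flavour = shift;
    my $output  = shift;
    if (defined $flavour && $flavour =~ /\./) { $output = $flavour; undef $flavour; }
    my $win64 = 0;
    $win64 = 1 if ((defined $flavour && $flavour =~ /[nm]asm|mingw64/)
                   || (defined $output && $output =~ /\.asm$/));
    printf "flavour=%s output=%s win64=%d\n",
           (defined $flavour ? $flavour : "(none)"),
           (defined $output  ? $output  : "(none)"), $win64;
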
diff --git a/src/lib/libcrypto/sha/sha256.c b/src/lib/libcrypto/sha/sha256.c
index 3256a83e98..8952d87673 100644
--- a/src/lib/libcrypto/sha/sha256.c
+++ b/src/lib/libcrypto/sha/sha256.c
@@ -12,39 +12,29 @@
12 12
13#include <openssl/crypto.h> 13#include <openssl/crypto.h>
14#include <openssl/sha.h> 14#include <openssl/sha.h>
15#ifdef OPENSSL_FIPS
16#include <openssl/fips.h>
17#endif
18
19#include <openssl/opensslv.h> 15#include <openssl/opensslv.h>
20 16
21const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT; 17const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT;
22 18
23int SHA224_Init (SHA256_CTX *c) 19int SHA224_Init (SHA256_CTX *c)
24 { 20 {
25#ifdef OPENSSL_FIPS 21 memset (c,0,sizeof(*c));
26 FIPS_selftest_check();
27#endif
28 c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL; 22 c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL;
29 c->h[2]=0x3070dd17UL; c->h[3]=0xf70e5939UL; 23 c->h[2]=0x3070dd17UL; c->h[3]=0xf70e5939UL;
30 c->h[4]=0xffc00b31UL; c->h[5]=0x68581511UL; 24 c->h[4]=0xffc00b31UL; c->h[5]=0x68581511UL;
31 c->h[6]=0x64f98fa7UL; c->h[7]=0xbefa4fa4UL; 25 c->h[6]=0x64f98fa7UL; c->h[7]=0xbefa4fa4UL;
32 c->Nl=0; c->Nh=0; 26 c->md_len=SHA224_DIGEST_LENGTH;
33 c->num=0; c->md_len=SHA224_DIGEST_LENGTH;
34 return 1; 27 return 1;
35 } 28 }
36 29
37int SHA256_Init (SHA256_CTX *c) 30int SHA256_Init (SHA256_CTX *c)
38 { 31 {
39#ifdef OPENSSL_FIPS 32 memset (c,0,sizeof(*c));
40 FIPS_selftest_check();
41#endif
42 c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL; 33 c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL;
43 c->h[2]=0x3c6ef372UL; c->h[3]=0xa54ff53aUL; 34 c->h[2]=0x3c6ef372UL; c->h[3]=0xa54ff53aUL;
44 c->h[4]=0x510e527fUL; c->h[5]=0x9b05688cUL; 35 c->h[4]=0x510e527fUL; c->h[5]=0x9b05688cUL;
45 c->h[6]=0x1f83d9abUL; c->h[7]=0x5be0cd19UL; 36 c->h[6]=0x1f83d9abUL; c->h[7]=0x5be0cd19UL;
46 c->Nl=0; c->Nh=0; 37 c->md_len=SHA256_DIGEST_LENGTH;
47 c->num=0; c->md_len=SHA256_DIGEST_LENGTH;
48 return 1; 38 return 1;
49 } 39 }
50 40
@@ -94,21 +84,21 @@ int SHA224_Final (unsigned char *md, SHA256_CTX *c)
94 */ 84 */
95#define HASH_MAKE_STRING(c,s) do { \ 85#define HASH_MAKE_STRING(c,s) do { \
96 unsigned long ll; \ 86 unsigned long ll; \
97 unsigned int xn; \ 87 unsigned int nn; \
98 switch ((c)->md_len) \ 88 switch ((c)->md_len) \
99 { case SHA224_DIGEST_LENGTH: \ 89 { case SHA224_DIGEST_LENGTH: \
100 for (xn=0;xn<SHA224_DIGEST_LENGTH/4;xn++) \ 90 for (nn=0;nn<SHA224_DIGEST_LENGTH/4;nn++) \
101 { ll=(c)->h[xn]; HOST_l2c(ll,(s)); } \ 91 { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
102 break; \ 92 break; \
103 case SHA256_DIGEST_LENGTH: \ 93 case SHA256_DIGEST_LENGTH: \
104 for (xn=0;xn<SHA256_DIGEST_LENGTH/4;xn++) \ 94 for (nn=0;nn<SHA256_DIGEST_LENGTH/4;nn++) \
105 { ll=(c)->h[xn]; HOST_l2c(ll,(s)); } \ 95 { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
106 break; \ 96 break; \
107 default: \ 97 default: \
108 if ((c)->md_len > SHA256_DIGEST_LENGTH) \ 98 if ((c)->md_len > SHA256_DIGEST_LENGTH) \
109 return 0; \ 99 return 0; \
110 for (xn=0;xn<(c)->md_len/4;xn++) \ 100 for (nn=0;nn<(c)->md_len/4;nn++) \
111 { ll=(c)->h[xn]; HOST_l2c(ll,(s)); } \ 101 { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
112 break; \ 102 break; \
113 } \ 103 } \
114 } while (0) 104 } while (0)
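
HASH_MAKE_STRING emits each 32-bit state word most-significant byte first via HOST_l2c, which is what gives the digest its FIPS-mandated big-endian byte order regardless of host endianness. The same serialization in a few lines of Perl (the first two SHA256 IV words stand in for live state):

    #!/usr/bin/env perl
    # What the HOST_l2c loop does: each 32-bit word goes out big-endian.
    my @h = (0x6a09e667, 0xbb67ae85);          # first two SHA256 IV words
    my $md = pack("N*", @h);                    # "N" = 32-bit big-endian
    printf "%02x", $_ for unpack("C*", $md);    # prints 6a09e667bb67ae85
    print "\n";
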
diff --git a/src/lib/libcrypto/sha/sha512.c b/src/lib/libcrypto/sha/sha512.c
index f5ed468b85..cbc0e58c48 100644
--- a/src/lib/libcrypto/sha/sha512.c
+++ b/src/lib/libcrypto/sha/sha512.c
@@ -5,10 +5,6 @@
5 * ==================================================================== 5 * ====================================================================
6 */ 6 */
7#include <openssl/opensslconf.h> 7#include <openssl/opensslconf.h>
8#ifdef OPENSSL_FIPS
9#include <openssl/fips.h>
10#endif
11
12#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA512) 8#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA512)
13/* 9/*
14 * IMPLEMENTATION NOTES. 10 * IMPLEMENTATION NOTES.
@@ -65,9 +61,19 @@ const char SHA512_version[]="SHA-512" OPENSSL_VERSION_PTEXT;
65 61
66int SHA384_Init (SHA512_CTX *c) 62int SHA384_Init (SHA512_CTX *c)
67 { 63 {
68#ifdef OPENSSL_FIPS 64#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
69 FIPS_selftest_check(); 65 /* maintain dword order required by assembler module */
70#endif 66 unsigned int *h = (unsigned int *)c->h;
67
68 h[0] = 0xcbbb9d5d; h[1] = 0xc1059ed8;
69 h[2] = 0x629a292a; h[3] = 0x367cd507;
70 h[4] = 0x9159015a; h[5] = 0x3070dd17;
71 h[6] = 0x152fecd8; h[7] = 0xf70e5939;
72 h[8] = 0x67332667; h[9] = 0xffc00b31;
73 h[10] = 0x8eb44a87; h[11] = 0x68581511;
74 h[12] = 0xdb0c2e0d; h[13] = 0x64f98fa7;
75 h[14] = 0x47b5481d; h[15] = 0xbefa4fa4;
76#else
71 c->h[0]=U64(0xcbbb9d5dc1059ed8); 77 c->h[0]=U64(0xcbbb9d5dc1059ed8);
72 c->h[1]=U64(0x629a292a367cd507); 78 c->h[1]=U64(0x629a292a367cd507);
73 c->h[2]=U64(0x9159015a3070dd17); 79 c->h[2]=U64(0x9159015a3070dd17);
@@ -76,6 +82,7 @@ int SHA384_Init (SHA512_CTX *c)
76 c->h[5]=U64(0x8eb44a8768581511); 82 c->h[5]=U64(0x8eb44a8768581511);
77 c->h[6]=U64(0xdb0c2e0d64f98fa7); 83 c->h[6]=U64(0xdb0c2e0d64f98fa7);
78 c->h[7]=U64(0x47b5481dbefa4fa4); 84 c->h[7]=U64(0x47b5481dbefa4fa4);
85#endif
79 c->Nl=0; c->Nh=0; 86 c->Nl=0; c->Nh=0;
80 c->num=0; c->md_len=SHA384_DIGEST_LENGTH; 87 c->num=0; c->md_len=SHA384_DIGEST_LENGTH;
81 return 1; 88 return 1;
@@ -83,9 +90,19 @@ int SHA384_Init (SHA512_CTX *c)
83 90
84int SHA512_Init (SHA512_CTX *c) 91int SHA512_Init (SHA512_CTX *c)
85 { 92 {
86#ifdef OPENSSL_FIPS 93#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
87 FIPS_selftest_check(); 94 /* maintain dword order required by assembler module */
88#endif 95 unsigned int *h = (unsigned int *)c->h;
96
97 h[0] = 0x6a09e667; h[1] = 0xf3bcc908;
98 h[2] = 0xbb67ae85; h[3] = 0x84caa73b;
99 h[4] = 0x3c6ef372; h[5] = 0xfe94f82b;
100 h[6] = 0xa54ff53a; h[7] = 0x5f1d36f1;
101 h[8] = 0x510e527f; h[9] = 0xade682d1;
102 h[10] = 0x9b05688c; h[11] = 0x2b3e6c1f;
103 h[12] = 0x1f83d9ab; h[13] = 0xfb41bd6b;
104 h[14] = 0x5be0cd19; h[15] = 0x137e2179;
105#else
89 c->h[0]=U64(0x6a09e667f3bcc908); 106 c->h[0]=U64(0x6a09e667f3bcc908);
90 c->h[1]=U64(0xbb67ae8584caa73b); 107 c->h[1]=U64(0xbb67ae8584caa73b);
91 c->h[2]=U64(0x3c6ef372fe94f82b); 108 c->h[2]=U64(0x3c6ef372fe94f82b);
@@ -94,6 +111,7 @@ int SHA512_Init (SHA512_CTX *c)
94 c->h[5]=U64(0x9b05688c2b3e6c1f); 111 c->h[5]=U64(0x9b05688c2b3e6c1f);
95 c->h[6]=U64(0x1f83d9abfb41bd6b); 112 c->h[6]=U64(0x1f83d9abfb41bd6b);
96 c->h[7]=U64(0x5be0cd19137e2179); 113 c->h[7]=U64(0x5be0cd19137e2179);
114#endif
97 c->Nl=0; c->Nh=0; 115 c->Nl=0; c->Nh=0;
98 c->num=0; c->md_len=SHA512_DIGEST_LENGTH; 116 c->num=0; c->md_len=SHA512_DIGEST_LENGTH;
99 return 1; 117 return 1;
@@ -142,6 +160,24 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
142 160
143 if (md==0) return 0; 161 if (md==0) return 0;
144 162
163#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
164 /* recall assembler dword order... */
165 n = c->md_len;
166 if (n == SHA384_DIGEST_LENGTH || n == SHA512_DIGEST_LENGTH)
167 {
168 unsigned int *h = (unsigned int *)c->h, t;
169
170 for (n/=4;n;n--)
171 {
172 t = *(h++);
173 *(md++) = (unsigned char)(t>>24);
174 *(md++) = (unsigned char)(t>>16);
175 *(md++) = (unsigned char)(t>>8);
176 *(md++) = (unsigned char)(t);
177 }
178 }
179 else return 0;
180#else
145 switch (c->md_len) 181 switch (c->md_len)
146 { 182 {
147 /* Let compiler decide if it's appropriate to unroll... */ 183 /* Let compiler decide if it's appropriate to unroll... */
@@ -178,7 +214,7 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
178 /* ... as well as make sure md_len is not abused. */ 214 /* ... as well as make sure md_len is not abused. */
179 default: return 0; 215 default: return 0;
180 } 216 }
181 217#endif
182 return 1; 218 return 1;
183 } 219 }
184 220
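
The ARM path above works because the assembler module keeps each 64-bit state word as two 32-bit halves with the most significant half first; emitting the halves big-endian in sequence is then byte-for-byte identical to emitting the whole 64-bit word big-endian. Illustrated in Perl with the first SHA384 IV word:

    #!/usr/bin/env perl
    # Two 32-bit halves, most significant first, written big-endian each,
    # reproduce the big-endian 64-bit word exactly.
    my @h = (0xcbbb9d5d, 0xc1059ed8);   # halves of U64(0xcbbb9d5dc1059ed8)
    printf "%02x", $_ for unpack("C*", pack("N2", @h));
    print "\n";                         # prints cbbb9d5dc1059ed8
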
@@ -204,7 +240,7 @@ int SHA512_Update (SHA512_CTX *c, const void *_data, size_t len)
204 240
205 if (len < n) 241 if (len < n)
206 { 242 {
207 memcpy (p+c->num,data,len), c->num += len; 243 memcpy (p+c->num,data,len), c->num += (unsigned int)len;
208 return 1; 244 return 1;
209 } 245 }
210 else { 246 else {
@@ -314,7 +350,7 @@ static const SHA_LONG64 K512[80] = {
314#ifndef PEDANTIC 350#ifndef PEDANTIC
315# if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) 351# if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
316# if defined(__x86_64) || defined(__x86_64__) 352# if defined(__x86_64) || defined(__x86_64__)
317# define ROTR(a,n) ({ unsigned long ret; \ 353# define ROTR(a,n) ({ SHA_LONG64 ret; \
318 asm ("rorq %1,%0" \ 354 asm ("rorq %1,%0" \
319 : "=r"(ret) \ 355 : "=r"(ret) \
320 : "J"(n),"0"(a) \ 356 : "J"(n),"0"(a) \
@@ -337,20 +373,21 @@ static const SHA_LONG64 K512[80] = {
337 ((SHA_LONG64)hi)<<32|lo; }) 373 ((SHA_LONG64)hi)<<32|lo; })
338# else 374# else
339# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\ 375# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\
340 unsigned int hi=p[0],lo=p[1]; \ 376 unsigned int hi=p[0],lo=p[1]; \
341 asm ("bswapl %0; bswapl %1;" \ 377 asm ("bswapl %0; bswapl %1;" \
342 : "=r"(lo),"=r"(hi) \ 378 : "=r"(lo),"=r"(hi) \
343 : "0"(lo),"1"(hi)); \ 379 : "0"(lo),"1"(hi)); \
344 ((SHA_LONG64)hi)<<32|lo; }) 380 ((SHA_LONG64)hi)<<32|lo; })
345# endif 381# endif
346# elif (defined(_ARCH_PPC) && defined(__64BIT__)) || defined(_ARCH_PPC64) 382# elif (defined(_ARCH_PPC) && defined(__64BIT__)) || defined(_ARCH_PPC64)
347# define ROTR(a,n) ({ unsigned long ret; \ 383# define ROTR(a,n) ({ SHA_LONG64 ret; \
348 asm ("rotrdi %0,%1,%2" \ 384 asm ("rotrdi %0,%1,%2" \
349 : "=r"(ret) \ 385 : "=r"(ret) \
350 : "r"(a),"K"(n)); ret; }) 386 : "r"(a),"K"(n)); ret; })
351# endif 387# endif
352# elif defined(_MSC_VER) 388# elif defined(_MSC_VER)
353# if defined(_WIN64) /* applies to both IA-64 and AMD64 */ 389# if defined(_WIN64) /* applies to both IA-64 and AMD64 */
390# pragma intrinsic(_rotr64)
354# define ROTR(a,n) _rotr64((a),n) 391# define ROTR(a,n) _rotr64((a),n)
355# endif 392# endif
356# if defined(_M_IX86) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) 393# if defined(_M_IX86) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
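
The two ROTR return-type fixes above matter on LLP64 targets (e.g. Win64, now reachable via the mingw64 flavour), where `unsigned long` is only 32 bits and would truncate the rotated value. The operation itself, for reference (needs a 64-bit perl):

    #!/usr/bin/env perl
    # 64-bit rotate right, the operation the ROTR macro wraps.
    sub ROTR {
        my ($a,$n) = @_;
        (($a >> $n) | ($a << (64-$n))) & 0xffffffffffffffff;
    }
    printf "%016x\n", ROTR(0x0123456789abcdef, 8);   # ef0123456789abcd
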
@@ -398,15 +435,66 @@ static const SHA_LONG64 K512[80] = {
398#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) 435#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
399#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) 436#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
400 437
401#if defined(OPENSSL_IA32_SSE2) && !defined(OPENSSL_NO_ASM) && !defined(I386_ONLY) 438
402#define GO_FOR_SSE2(ctx,in,num) do { \ 439#if defined(__i386) || defined(__i386__) || defined(_M_IX86)
403#define GO_FOR_SSE2(ctx,in,num) do { \ 441 * This code should give better results on a 32-bit CPU with less than
404 void sha512_block_sse2(void *,const void *,size_t); \ 442 * ~24 registers, both size- and performance-wise...
405 sha512_block_sse2(ctx->h,in,num); return; \ 442 * ~24 registers, both size and performance wise...
406 } while (0) 443 */
444static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num)
445 {
446 const SHA_LONG64 *W=in;
447 SHA_LONG64 A,E,T;
448 SHA_LONG64 X[9+80],*F;
449 int i;
450
451 while (num--) {
452
453 F = X+80;
454 A = ctx->h[0]; F[1] = ctx->h[1];
455 F[2] = ctx->h[2]; F[3] = ctx->h[3];
456 E = ctx->h[4]; F[5] = ctx->h[5];
457 F[6] = ctx->h[6]; F[7] = ctx->h[7];
458
459 for (i=0;i<16;i++,F--)
460 {
461#ifdef B_ENDIAN
462 T = W[i];
463#else
464 T = PULL64(W[i]);
407#endif 465#endif
466 F[0] = A;
467 F[4] = E;
468 F[8] = T;
469 T += F[7] + Sigma1(E) + Ch(E,F[5],F[6]) + K512[i];
470 E = F[3] + T;
471 A = T + Sigma0(A) + Maj(A,F[1],F[2]);
472 }
473
474 for (;i<80;i++,F--)
475 {
476 T = sigma0(F[8+16-1]);
477 T += sigma1(F[8+16-14]);
478 T += F[8+16] + F[8+16-9];
479
480 F[0] = A;
481 F[4] = E;
482 F[8] = T;
483 T += F[7] + Sigma1(E) + Ch(E,F[5],F[6]) + K512[i];
484 E = F[3] + T;
485 A = T + Sigma0(A) + Maj(A,F[1],F[2]);
486 }
408 487
409#ifdef OPENSSL_SMALL_FOOTPRINT 488 ctx->h[0] += A; ctx->h[1] += F[1];
489 ctx->h[2] += F[2]; ctx->h[3] += F[3];
490 ctx->h[4] += E; ctx->h[5] += F[5];
491 ctx->h[6] += F[6]; ctx->h[7] += F[7];
492
493 W+=SHA_LBLOCK;
494 }
495 }
496
497#elif defined(OPENSSL_SMALL_FOOTPRINT)
410 498
411static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num) 499static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num)
412 { 500 {
@@ -415,10 +503,6 @@ static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num
415 SHA_LONG64 X[16]; 503 SHA_LONG64 X[16];
416 int i; 504 int i;
417 505
418#ifdef GO_FOR_SSE2
419 GO_FOR_SSE2(ctx,in,num);
420#endif
421
422 while (num--) { 506 while (num--) {
423 507
424 a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3]; 508 a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
@@ -463,11 +547,11 @@ static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num
463 h = Sigma0(a) + Maj(a,b,c); \ 547 h = Sigma0(a) + Maj(a,b,c); \
464 d += T1; h += T1; } while (0) 548 d += T1; h += T1; } while (0)
465 549
466#define ROUND_16_80(i,a,b,c,d,e,f,g,h,X) do { \ 550#define ROUND_16_80(i,j,a,b,c,d,e,f,g,h,X) do { \
467 s0 = X[(i+1)&0x0f]; s0 = sigma0(s0); \ 551 s0 = X[(j+1)&0x0f]; s0 = sigma0(s0); \
468 s1 = X[(i+14)&0x0f]; s1 = sigma1(s1); \ 552 s1 = X[(j+14)&0x0f]; s1 = sigma1(s1); \
469 T1 = X[(i)&0x0f] += s0 + s1 + X[(i+9)&0x0f]; \ 553 T1 = X[(j)&0x0f] += s0 + s1 + X[(j+9)&0x0f]; \
470 ROUND_00_15(i,a,b,c,d,e,f,g,h); } while (0) 554 ROUND_00_15(i+j,a,b,c,d,e,f,g,h); } while (0)
471 555
472static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num) 556static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num)
473 { 557 {
@@ -476,10 +560,6 @@ static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num
476 SHA_LONG64 X[16]; 560 SHA_LONG64 X[16];
477 int i; 561 int i;
478 562
479#ifdef GO_FOR_SSE2
480 GO_FOR_SSE2(ctx,in,num);
481#endif
482
483 while (num--) { 563 while (num--) {
484 564
485 a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3]; 565 a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
@@ -521,16 +601,24 @@ static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num
521 T1 = X[15] = PULL64(W[15]); ROUND_00_15(15,b,c,d,e,f,g,h,a); 601 T1 = X[15] = PULL64(W[15]); ROUND_00_15(15,b,c,d,e,f,g,h,a);
522#endif 602#endif
523 603
524 for (i=16;i<80;i+=8) 604 for (i=16;i<80;i+=16)
525 { 605 {
526 ROUND_16_80(i+0,a,b,c,d,e,f,g,h,X); 606 ROUND_16_80(i, 0,a,b,c,d,e,f,g,h,X);
527 ROUND_16_80(i+1,h,a,b,c,d,e,f,g,X); 607 ROUND_16_80(i, 1,h,a,b,c,d,e,f,g,X);
528 ROUND_16_80(i+2,g,h,a,b,c,d,e,f,X); 608 ROUND_16_80(i, 2,g,h,a,b,c,d,e,f,X);
529 ROUND_16_80(i+3,f,g,h,a,b,c,d,e,X); 609 ROUND_16_80(i, 3,f,g,h,a,b,c,d,e,X);
530 ROUND_16_80(i+4,e,f,g,h,a,b,c,d,X); 610 ROUND_16_80(i, 4,e,f,g,h,a,b,c,d,X);
531 ROUND_16_80(i+5,d,e,f,g,h,a,b,c,X); 611 ROUND_16_80(i, 5,d,e,f,g,h,a,b,c,X);
532 ROUND_16_80(i+6,c,d,e,f,g,h,a,b,X); 612 ROUND_16_80(i, 6,c,d,e,f,g,h,a,b,X);
533 ROUND_16_80(i+7,b,c,d,e,f,g,h,a,X); 613 ROUND_16_80(i, 7,b,c,d,e,f,g,h,a,X);
614 ROUND_16_80(i, 8,a,b,c,d,e,f,g,h,X);
615 ROUND_16_80(i, 9,h,a,b,c,d,e,f,g,X);
616 ROUND_16_80(i,10,g,h,a,b,c,d,e,f,X);
617 ROUND_16_80(i,11,f,g,h,a,b,c,d,e,X);
618 ROUND_16_80(i,12,e,f,g,h,a,b,c,d,X);
619 ROUND_16_80(i,13,d,e,f,g,h,a,b,c,X);
620 ROUND_16_80(i,14,c,d,e,f,g,h,a,b,X);
621 ROUND_16_80(i,15,b,c,d,e,f,g,h,a,X);
534 } 622 }
535 623
536 ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d; 624 ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
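
Splitting the round index into a runtime i and a literal j is what makes the 16-way unroll pay off: every X[(j+c)&0x0f] subscript becomes a compile-time constant, so the circular X[] buffer is addressed at fixed offsets and i survives only as the K512 index. A quick Perl print-out of the folded index pattern:

    #!/usr/bin/env perl
    # With literal j in 0..15, the X[] subscripts in ROUND_16_80 fold to
    # constants; only the K512 index i+j remains a runtime value.
    for my $j (0..15) {
        printf "j=%2d: X[%2d] += sigma0(X[%2d]) + sigma1(X[%2d]) + X[%2d]\n",
               $j, $j&0x0f, ($j+1)&0x0f, ($j+14)&0x0f, ($j+9)&0x0f;
    }
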
@@ -544,4 +632,10 @@ static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num
544 632
545#endif /* SHA512_ASM */ 633#endif /* SHA512_ASM */
546 634
547#endif /* OPENSSL_NO_SHA512 */ 635#else /* !OPENSSL_NO_SHA512 */
636
637#if defined(PEDANTIC) || defined(__DECC) || defined(OPENSSL_SYS_MACOSX)
638static void *dummy=&dummy;
639#endif
640
641#endif /* !OPENSSL_NO_SHA512 */