summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/libcrypto/sha/asm/sha1-x86_64.pl')
-rwxr-xr-xsrc/lib/libcrypto/sha/asm/sha1-x86_64.pl242
1 files changed, 242 insertions, 0 deletions
diff --git a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
new file mode 100755
index 0000000000..f7ed67a726
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
@@ -0,0 +1,242 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# sha1_block procedure for x86_64.
11#
12# It was brought to my attention that on EM64T compiler-generated code
13# was far behind 32-bit assembler implementation. This is unlike on
14# Opteron where compiler-generated code was only 15% behind 32-bit
15# assembler, which originally made it hard to motivate the effort.
16# There was suggestion to mechanically translate 32-bit code, but I
17# dismissed it, reasoning that x86_64 offers enough register bank
18# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
19# implementation:-) However! While 64-bit code does perform better
20# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
21# x86_64 does offer larger *addressable* bank, but out-of-order core
22# reaches for even more registers through dynamic aliasing, and EM64T
23# core must have managed to run-time optimize even 32-bit code just as
24# well as 64-bit one. Performance improvement is summarized in the
25# following table:
26#
27# gcc 3.4 32-bit asm cycles/byte
28# Opteron +45% +20% 6.8
29# Xeon P4 +65% +0% 9.9
30# Core2 +60% +10% 7.0
31
# The first command-line argument is handed on, unchanged, to the
# x86_64-xlate.pl translator as its own argument.
32$output=shift;
33
# Find x86_64-xlate.pl: first next to this script, then in
# ../../perlasm relative to this script's directory.
34$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
35( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
36( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
37die "can't locate x86_64-xlate.pl";
38
# From here on everything printed to STDOUT is piped through the
# translator.  Abort if the pipe cannot be set up; otherwise the
# untranslated text would silently go to the original STDOUT.
39open STDOUT,"| $^X $xlate $output" or die "can't call $xlate: $!";
40
# Registers in which the three arguments arrive.  These assignments are
# overwritten just below and serve as documentation only; PROLOGUE
# names %rdi/%rsi/%rdx literally when it copies them.
41$ctx="%rdi"; # 1st arg
42$inp="%rsi"; # 2nd arg
43$num="%rdx"; # 3rd arg
44
45# reassign arguments in order to produce more compact code
# (the 32-bit halves of %rdx, %rsi and %rdi are used as $A, $B and $C
# further down, so the arguments themselves live in %r8..%r10)
46$ctx="%r8";
47$inp="%r9";
48$num="%r10";
49
# $xi holds the current message-schedule word; $t0/$t1 are scratch
# registers for the per-round boolean function.
50$xi="%eax";
51$t0="%ebx";
52$t1="%ecx";
# Five working variables plus one spare register ($T).
53$A="%edx";
54$B="%esi";
55$C="%edi";
56$D="%ebp";
57$E="%r11d";
58$T="%r12d";
59
# Register list passed to the BODY_* generators as ($a,$b,$c,$d,$e,$f);
# the driver loops rotate it by one position after every round.
60@V=($A,$B,$C,$D,$E,$T);
61
# PROLOGUE($func): append the entry sequence of assembler function $func
# to $code.
#
# The emitted text declares $func as a global function symbol, saves
# %rbx, %rbp and %r12 on the stack, copies the incoming argument
# registers %rdi, %rsi and %rdx into $ctx, $inp and $num, and reserves an
# 8+16*4-byte frame whose base is rounded down to a 64-byte boundary.
# The %rsp value taken right after the pushes is stored in the frame at
# offset 16*4, which is where EPILOGUE reads it back.  Finally the five
# 32-bit words at offsets 0, 4, 8, 12 and 16 of $ctx are loaded into
# $A..$E.
#
# Backtick-quoted expressions inside the template are not evaluated
# here; the s///e pass at the end of the script replaces them.
62sub PROLOGUE {
63my $func=shift;
# "\@" and "\$" below stop Perl from interpolating inside the template.
64$code.=<<___;
65.globl $func
66.type $func,\@function,3
67.align 16
68$func:
69 push %rbx
70 push %rbp
71 push %r12
72 mov %rsp,%rax
73 mov %rdi,$ctx # reassigned argument
74 sub \$`8+16*4`,%rsp
75 mov %rsi,$inp # reassigned argument
76 and \$-64,%rsp
77 mov %rdx,$num # reassigned argument
78 mov %rax,`16*4`(%rsp)
79
80 mov 0($ctx),$A
81 mov 4($ctx),$B
82 mov 8($ctx),$C
83 mov 12($ctx),$D
84 mov 16($ctx),$E
85___
86}
87
# EPILOGUE($func): append the exit sequence of assembler function $func
# to $code.
#
# The emitted text reloads %rsp from frame offset 16*4 (the slot written
# by PROLOGUE), pops %r12, %rbp and %rbx in the reverse of the order in
# which PROLOGUE pushed them, returns, and closes the symbol with a
# .size directive.
88sub EPILOGUE {
89my $func=shift;
90$code.=<<___;
91 mov `16*4`(%rsp),%rsp
92 pop %r12
93 pop %rbp
94 pop %rbx
95 ret
96.size $func,.-$func
97___
98}
99
# BODY_00_19($i,$a,$b,$c,$d,$e,$f,$host): append the code for round $i
# (0..19) to $code.
#
#   $a..$e  registers holding the five working variables for this round
#   $f      register that receives the new value; the caller rotates its
#           register list afterwards so that $f becomes the next $a
#   $host   optional; when defined, the bswap of each loaded input word
#           is omitted.  The driver loop in this file never passes it.
#
# Every round emits
#   $f = 0x5a827999 + $xi + $e + rol($a,5) + ((($c ^ $d) & $b) ^ $d)
#   $b = rol($b,30)
# using $e as scratch for rol($a,5) once its old value has been consumed
# by the lea.  Interleaved with that, the schedule word for round
# $j = $i+1 is prepared in $xi.
101my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
102my $j=$i+1;
# Round 0 only: fetch the first input word; later rounds find their word
# already in $xi, loaded by the preceding round.
103$code.=<<___ if ($i==0);
104 mov `4*$i`($inp),$xi
105 `"bswap $xi" if(!defined($host))`
106 mov $xi,`4*$i`(%rsp)
107___
# Rounds 0..14: the next word comes straight from the input block and is
# saved in the 16-word buffer at (%rsp).
108$code.=<<___ if ($i<15);
109 lea 0x5a827999($xi,$e),$f
110 mov $c,$t0
111 mov `4*$j`($inp),$xi
112 mov $a,$e
113 xor $d,$t0
114 `"bswap $xi" if(!defined($host))`
115 rol \$5,$e
116 and $b,$t0
117 mov $xi,`4*$j`(%rsp)
118 add $e,$f
119 xor $d,$t0
120 rol \$30,$b
121 add $t0,$f
122___
# Rounds 15..19: the next word is the XOR of buffer slots $j, $j+2, $j+8
# and $j+13 (all mod 16), rotated left by one, and it overwrites slot
# $j%16.
123$code.=<<___ if ($i>=15);
124 lea 0x5a827999($xi,$e),$f
125 mov `4*($j%16)`(%rsp),$xi
126 mov $c,$t0
127 mov $a,$e
128 xor `4*(($j+2)%16)`(%rsp),$xi
129 xor $d,$t0
130 rol \$5,$e
131 xor `4*(($j+8)%16)`(%rsp),$xi
132 and $b,$t0
133 add $e,$f
134 xor `4*(($j+13)%16)`(%rsp),$xi
135 xor $d,$t0
136 rol \$30,$b
137 add $t0,$f
138 rol \$1,$xi
139 mov $xi,`4*($j%16)`(%rsp)
140___
141}
142
# BODY_20_39($i,$a,$b,$c,$d,$e,$f): append the code for round $i to
# $code.  The driver calls this for rounds 20..39 and again for rounds
# 60..79; the two ranges differ only in the additive constant $K.
#
# Every round emits
#   $f = $K + $xi + $e + rol($a,5) + ($c ^ $b ^ $d)
#   $b = rol($b,30)
# with $e reused as scratch for rol($a,5).
143sub BODY_20_39 {
144my ($i,$a,$b,$c,$d,$e,$f)=@_;
145my $j=$i+1;
# $K is interpolated into the template as a plain number.
146my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
# All rounds but the last also compute the schedule word for round $j
# from buffer slots $j, $j+2, $j+8 and $j+13 (mod 16), rotated left by 1.
147$code.=<<___ if ($i<79);
148 lea $K($xi,$e),$f
149 mov `4*($j%16)`(%rsp),$xi
150 mov $c,$t0
151 mov $a,$e
152 xor `4*(($j+2)%16)`(%rsp),$xi
153 xor $b,$t0
154 rol \$5,$e
155 xor `4*(($j+8)%16)`(%rsp),$xi
156 xor $d,$t0
157 add $e,$f
158 xor `4*(($j+13)%16)`(%rsp),$xi
159 rol \$30,$b
160 add $t0,$f
161 rol \$1,$xi
162___
# The word is written back to the buffer only while $i < 76.  Words for
# $j >= 77 are used solely through $xi: the newest slot any later round
# reads is the one for $j-3, and $j never exceeds 79.
163$code.=<<___ if ($i<76);
164 mov $xi,`4*($j%16)`(%rsp)
165___
# Round 79: same round arithmetic, but no further schedule word is
# prepared.
166$code.=<<___ if ($i==79);
167 lea $K($xi,$e),$f
168 mov $c,$t0
169 mov $a,$e
170 xor $b,$t0
171 rol \$5,$e
172 xor $d,$t0
173 add $e,$f
174 rol \$30,$b
175 add $t0,$f
176___
177}
178
# BODY_40_59($i,$a,$b,$c,$d,$e,$f): append the code for round $i
# (40..59) to $code.
#
# The round emits
#   $f = 0x8f1bbcdc + $xi + $e + rol($a,5)
#        + (($b & $c) | (($b | $c) & $d))
#   $b = rol($b,30)
# with $e reused as scratch for rol($a,5) and $t0/$t1 holding the two
# halves of the boolean term.  In parallel the schedule word for round
# $j = $i+1 is built from buffer slots $j, $j+2, $j+8 and $j+13 (mod 16),
# rotated left by one, and stored back into slot $j%16.
179sub BODY_40_59 {
180my ($i,$a,$b,$c,$d,$e,$f)=@_;
181my $j=$i+1;
182$code.=<<___;
183 lea 0x8f1bbcdc($xi,$e),$f
184 mov `4*($j%16)`(%rsp),$xi
185 mov $b,$t0
186 mov $b,$t1
187 xor `4*(($j+2)%16)`(%rsp),$xi
188 mov $a,$e
189 and $c,$t0
190 xor `4*(($j+8)%16)`(%rsp),$xi
191 or $c,$t1
192 rol \$5,$e
193 xor `4*(($j+13)%16)`(%rsp),$xi
194 and $d,$t1
195 add $e,$f
196 rol \$1,$xi
197 or $t1,$t0
198 rol \$30,$b
199 mov $xi,`4*($j%16)`(%rsp)
200 add $t0,$f
201___
202}
203
# Assemble the complete text of sha1_block_data_order in $code.
204$code=".text\n";
205
206&PROLOGUE("sha1_block_data_order");
# .Lloop is the per-block loop head; one pass emits all 80 rounds unrolled.
207$code.=".align 4\n.Lloop:\n";
# After every round the register list is rotated right by one (the last
# element moves to the front), so the register that just received the
# round result is passed as $a to the next round.
208for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
209for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
210for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
211for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# 80 rotations of a 6-element list leave it shifted by two positions, so
# the five working variables now sit in $E,$T,$A,$B,$C.  The template
# below adds the five words at $ctx to them and stores the sums back,
# uses four xchg instructions to move the values into $A..$E again,
# advances $inp by 16*4 bytes, decrements $num, and branches back to
# .Lloop while $num is non-zero.
212$code.=<<___;
213 add 0($ctx),$E
214 add 4($ctx),$T
215 add 8($ctx),$A
216 add 12($ctx),$B
217 add 16($ctx),$C
218 mov $E,0($ctx)
219 mov $T,4($ctx)
220 mov $A,8($ctx)
221 mov $B,12($ctx)
222 mov $C,16($ctx)
223
224 xchg $E,$A # mov $E,$A
225 xchg $T,$B # mov $T,$B
226 xchg $E,$C # mov $A,$C
227 xchg $T,$D # mov $B,$D
228 # mov $C,$E
229 lea `16*4`($inp),$inp
230 sub \$1,$num
231 jnz .Lloop
232___
233&EPILOGUE("sha1_block_data_order");
# Identification string placed after the function.
234$code.=<<___;
235.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
236___
237
238####################################################################
239
# Replace every `...` span in the generated text with the result of
# evaluating its contents as Perl (offsets such as `16*4`, and the
# conditional bswap lines), then send the text through the translator
# pipe opened on STDOUT.
240$code =~ s/\`([^\`]*)\`/eval $1/gem;
241print $code;
# Closing a pipe waits for the child and fails when it exits non-zero or
# when buffered output could not be written; treat either as fatal so a
# truncated or untranslated output file is not mistaken for success.
242close STDOUT or die "error closing STDOUT: $! (child exit status $?)";