summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/sha
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/libcrypto/sha')
-rw-r--r--src/lib/libcrypto/sha/asm/sha1-586.pl291
1 files changed, 88 insertions, 203 deletions
diff --git a/src/lib/libcrypto/sha/asm/sha1-586.pl b/src/lib/libcrypto/sha/asm/sha1-586.pl
index fe51fd0794..e00f709553 100644
--- a/src/lib/libcrypto/sha/asm/sha1-586.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-586.pl
@@ -1,5 +1,30 @@
1#!/usr/local/bin/perl 1#!/usr/local/bin/perl
2 2
3# It was noted that Intel IA-32 C compiler generates code which
4# performs ~30% *faster* on P4 CPU than original *hand-coded*
5# SHA1 assembler implementation. To address this problem (and
6# prove that humans are still better than machines:-), the
7# original code was overhauled, which resulted in the following
8# performance changes:
9#
10# compared with original compared with Intel cc
11# assembler impl. generated code
12# Pentium -25% +37%
13# PIII/AMD +8% +16%
14# P4 +85%(!) +45%
15#
16# As you can see, Pentium came out as the loser:-( Yet I reckoned that
17# the improvement on P4 outweighs the loss, and incorporated this
18# re-tuned code into 0.9.7 and later.
19# ----------------------------------------------------------------
20# Those who for any particular reason absolutely must score on
21# Pentium can replace this module with one from 0.9.6 distribution.
22# This "offer" shall be revoked the moment programming interface to
23# this module is changed, in which case this paragraph should be
24# removed.
25# ----------------------------------------------------------------
26# <appro@fy.chalmers.se>
27
3$normal=0; 28$normal=0;
4 29
5push(@INC,"perlasm","../../perlasm"); 30push(@INC,"perlasm","../../perlasm");
@@ -77,54 +102,21 @@ sub BODY_00_15
77 { 102 {
78 local($pos,$K,$X,$n,$a,$b,$c,$d,$e,$f)=@_; 103 local($pos,$K,$X,$n,$a,$b,$c,$d,$e,$f)=@_;
79 104
80return if $n & 1;
81 &comment("00_15 $n"); 105 &comment("00_15 $n");
82 106
83 &mov($f,$c);
84
85 &mov($tmp1,$a);
86 &xor($f,$d); # F2
87
88 &rotl($tmp1,5); # A2
89
90 &and($f,$b); # F3
91 &add($tmp1,$e);
92
93 &rotr($b,1); # B1 <- F
94 &mov($e,&swtmp($n)); # G1
95
96 &rotr($b,1); # B1 <- F
97 &xor($f,$d); # F4
98
99 &lea($tmp1,&DWP($K,$tmp1,$e,1));
100
101############################
102# &BODY_40_59( 0,$K[2],$X,42,$A,$B,$C,$D,$E,$T);
103# &BODY_40_59( 0,$K[2],$X,43,$T,$A,$B,$C,$D,$E);
104$n++;
105 local($n0,$n1,$n2,$n3,$np)=&Na($n);
106 ($b,$c,$d,$e,$f,$a)=($a,$b,$c,$d,$e,$f);
107
108 &mov($f,$c);
109
110 &add($a,$tmp1); # MOVED DOWN
111 &xor($f,$d); # F2
112
113 &mov($tmp1,$a); 107 &mov($tmp1,$a);
114 &and($f,$b); # F3 108 &mov($f,$c); # f to hold F_00_19(b,c,d)
115 109 &rotl($tmp1,5); # tmp1=ROTATE(a,5)
116 &rotl($tmp1,5); # A2 110 &xor($f,$d);
117 111 &and($f,$b);
118 &add($tmp1,$e); 112 &rotr($b,2); # b=ROTATE(b,30)
119 &mov($e,&swtmp($n)); # G1 113 &add($tmp1,$e); # tmp1+=e;
120 114 &mov($e,&swtmp($n)); # e becomes volatile and
121 &rotr($b,1); # B1 <- F 115 # is loaded with xi
122 &xor($f,$d); # F4 116 &xor($f,$d); # f holds F_00_19(b,c,d)
123 117 &lea($tmp1,&DWP($K,$tmp1,$e,1));# tmp1+=K_00_19+xi
124 &rotr($b,1); # B1 <- F 118
125 &lea($tmp1,&DWP($K,$tmp1,$e,1)); 119 &add($f,$tmp1); # f+=tmp1
126
127 &add($f,$tmp1);
128 } 120 }
129 121
130sub BODY_16_19 122sub BODY_16_19
@@ -132,66 +124,24 @@ sub BODY_16_19
132 local($pos,$K,$X,$n,$a,$b,$c,$d,$e,$f)=@_; 124 local($pos,$K,$X,$n,$a,$b,$c,$d,$e,$f)=@_;
133 local($n0,$n1,$n2,$n3,$np)=&Na($n); 125 local($n0,$n1,$n2,$n3,$np)=&Na($n);
134 126
135return if $n & 1;
136 &comment("16_19 $n"); 127 &comment("16_19 $n");
137 128
138 &nop() if ($pos < 0); 129 &mov($f,&swtmp($n1)); # f to hold Xupdate(xi,xa,xb,xc,xd)
139&mov($tmp1,&swtmp($n0)); # X1 130 &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d)
140 &mov($f,&swtmp($n1)); # X2 131 &xor($f,&swtmp($n0));
141&xor($f,$tmp1); # X3 132 &xor($tmp1,$d);
142 &mov($tmp1,&swtmp($n2)); # X4 133 &xor($f,&swtmp($n2));
143&xor($f,$tmp1); # X5 134 &and($tmp1,$b); # tmp1 holds F_00_19(b,c,d)
144 &mov($tmp1,&swtmp($n3)); # X6 135 &xor($f,&swtmp($n3)); # f holds xa^xb^xc^xd
145&xor($f,$tmp1); # X7 - slot 136 &rotr($b,2); # b=ROTATE(b,30)
146 &mov($tmp1,$c); # F1 137 &xor($tmp1,$d); # tmp1=F_00_19(b,c,d)
147&rotl($f,1); # X8 - slot 138 &rotl($f,1); # f=ROTATE(f,1)
148 &xor($tmp1,$d); # F2 139 &mov(&swtmp($n0),$f); # xi=f
149&mov(&swtmp($n0),$f); # X9 - anytime 140 &lea($f,&DWP($K,$f,$e,1)); # f+=K_00_19+e
150 &and($tmp1,$b); # F3 141 &mov($e,$a); # e becomes volatile
151&lea($f,&DWP($K,$f,$e,1)); # tot=X+K+e 142 &add($f,$tmp1); # f+=F_00_19(b,c,d)
152 &xor($tmp1,$d); # F4 143 &rotl($e,5); # e=ROTATE(a,5)
153&mov($e,$a); # A1 144 &add($f,$e); # f+=ROTATE(a,5)
154 &add($f,$tmp1); # tot+=F();
155
156&rotl($e,5); # A2
157
158&rotr($b,1); # B1 <- F
159 &add($f,$e); # tot+=a
160
161############################
162# &BODY_40_59( 0,$K[2],$X,42,$A,$B,$C,$D,$E,$T);
163# &BODY_40_59( 0,$K[2],$X,43,$T,$A,$B,$C,$D,$E);
164$n++;
165 local($n0,$n1,$n2,$n3,$np)=&Na($n);
166 ($b,$c,$d,$e,$f,$a)=($a,$b,$c,$d,$e,$f);
167
168
169&mov($f,&swtmp($n0)); # X1
170 &mov($tmp1,&swtmp($n1)); # X2
171&xor($f,$tmp1); # X3
172 &mov($tmp1,&swtmp($n2)); # X4
173&xor($f,$tmp1); # X5
174 &mov($tmp1,&swtmp($n3)); # X6
175&rotr($c,1); #&rotr($b,1); # B1 <- F # MOVED DOWN
176 &xor($f,$tmp1); # X7 - slot
177&rotl($f,1); # X8 - slot
178 &mov($tmp1,$c); # F1
179&xor($tmp1,$d); # F2
180 &mov(&swtmp($n0),$f); # X9 - anytime
181&and($tmp1,$b); # F3
182 &lea($f,&DWP($K,$f,$e,1)); # tot=X+K+e
183
184&xor($tmp1,$d); # F4
185 &mov($e,$a); # A1
186
187&rotl($e,5); # A2
188
189&rotr($b,1); # B1 <- F
190 &add($f,$e); # tot+=a
191
192&rotr($b,1); # B1 <- F
193 &add($f,$tmp1); # tot+=F();
194
195 } 145 }
196 146
197sub BODY_20_39 147sub BODY_20_39
@@ -201,42 +151,21 @@ sub BODY_20_39
201 &comment("20_39 $n"); 151 &comment("20_39 $n");
202 local($n0,$n1,$n2,$n3,$np)=&Na($n); 152 local($n0,$n1,$n2,$n3,$np)=&Na($n);
203 153
204&mov($f,&swtmp($n0)); # X1 154 &mov($f,&swtmp($n0)); # f to hold Xupdate(xi,xa,xb,xc,xd)
205 &mov($tmp1,&swtmp($n1)); # X2 155 &mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d)
206&xor($f,$tmp1); # X3 156 &xor($f,&swtmp($n1));
207 &mov($tmp1,&swtmp($n2)); # X4 157 &rotr($b,2); # b=ROTATE(b,30)
208&xor($f,$tmp1); # X5 158 &xor($f,&swtmp($n2));
209 &mov($tmp1,&swtmp($n3)); # X6 159 &xor($tmp1,$c);
210&xor($f,$tmp1); # X7 - slot 160 &xor($f,&swtmp($n3)); # f holds xa^xb^xc^xd
211 &mov($tmp1,$b); # F1 161 &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d)
212&rotl($f,1); # X8 - slot 162 &rotl($f,1); # f=ROTATE(f,1)
213 &xor($tmp1,$c); # F2 163 &mov(&swtmp($n0),$f); # xi=f
214&mov(&swtmp($n0),$f); # X9 - anytime 164 &lea($f,&DWP($K,$f,$e,1)); # f+=K_20_39+e
215 &xor($tmp1,$d); # F3 165 &mov($e,$a); # e becomes volatile
216 166 &rotl($e,5); # e=ROTATE(a,5)
217&lea($f,&DWP($K,$f,$e,1)); # tot=X+K+e 167 &add($f,$tmp1); # f+=F_20_39(b,c,d)
218 &mov($e,$a); # A1 168 &add($f,$e); # f+=ROTATE(a,5)
219
220&rotl($e,5); # A2
221
222if ($n != 79) # last loop
223 {
224 &rotr($b,1); # B1 <- F
225 &add($e,$tmp1); # tmp1=F()+a
226
227 &rotr($b,1); # B2 <- F
228 &add($f,$e); # tot+=tmp1;
229 }
230else
231 {
232 &add($e,$tmp1); # tmp1=F()+a
233 &mov($tmp1,&wparam(0));
234
235 &rotr($b,1); # B1 <- F
236 &add($f,$e); # tot+=tmp1;
237
238 &rotr($b,1); # B2 <- F
239 }
240 } 169 }
241 170
242sub BODY_40_59 171sub BODY_40_59
@@ -244,70 +173,27 @@ sub BODY_40_59
244 local($pos,$K,$X,$n,$a,$b,$c,$d,$e,$f)=@_; 173 local($pos,$K,$X,$n,$a,$b,$c,$d,$e,$f)=@_;
245 174
246 &comment("40_59 $n"); 175 &comment("40_59 $n");
247 return if $n & 1;
248 local($n0,$n1,$n2,$n3,$np)=&Na($n); 176 local($n0,$n1,$n2,$n3,$np)=&Na($n);
249 177
250&mov($f,&swtmp($n0)); # X1 178 &mov($f,&swtmp($n0)); # f to hold Xupdate(xi,xa,xb,xc,xd)
251 &mov($tmp1,&swtmp($n1)); # X2 179 &mov($tmp1,$b); # tmp1 to hold F_40_59(b,c,d)
252&xor($f,$tmp1); # X3 180 &xor($f,&swtmp($n1));
253 &mov($tmp1,&swtmp($n2)); # X4 181 &or($tmp1,$c);
254&xor($f,$tmp1); # X5 182 &xor($f,&swtmp($n2));
255 &mov($tmp1,&swtmp($n3)); # X6 183 &and($tmp1,$d);
256&xor($f,$tmp1); # X7 - slot 184 &xor($f,&swtmp($n3)); # f holds xa^xb^xc^xd
257 &mov($tmp1,$b); # F1 185 &rotl($f,1); # f=ROTATE(f,1)
258&rotl($f,1); # X8 - slot 186 &mov(&swtmp($n0),$f); # xi=f
259 &or($tmp1,$c); # F2 187 &lea($f,&DWP($K,$f,$e,1)); # f+=K_40_59+e
260&mov(&swtmp($n0),$f); # X9 - anytime 188 &mov($e,$b); # e becomes volatile and is used
261 &and($tmp1,$d); # F3 189 # to calculate F_40_59(b,c,d)
262 190 &rotr($b,2); # b=ROTATE(b,30)
263&lea($f,&DWP($K,$f,$e,1)); # tot=X+K+e 191 &and($e,$c);
264 &mov($e,$b); # F4 192 &or($tmp1,$e); # tmp1 holds F_40_59(b,c,d)
265 193 &mov($e,$a);
266&rotr($b,1); # B1 <- F 194 &rotl($e,5); # e=ROTATE(a,5)
267 &and($e,$c); # F5 195 &add($tmp1,$e); # tmp1+=ROTATE(a,5)
268 196 &add($f,$tmp1); # f+=tmp1;
269&or($tmp1,$e); # F6
270 &mov($e,$a); # A1
271
272&rotl($e,5); # A2
273
274&add($tmp1,$e); # tmp1=F()+a
275
276############################
277# &BODY_40_59( 0,$K[2],$X,42,$A,$B,$C,$D,$E,$T);
278# &BODY_40_59( 0,$K[2],$X,43,$T,$A,$B,$C,$D,$E);
279$n++;
280 local($n0,$n1,$n2,$n3,$np)=&Na($n);
281 ($b,$c,$d,$e,$f,$a)=($a,$b,$c,$d,$e,$f);
282
283 &mov($f,&swtmp($n0)); # X1
284&add($a,$tmp1); # tot+=tmp1; # moved was add f,tmp1
285 &mov($tmp1,&swtmp($n1)); # X2
286&xor($f,$tmp1); # X3
287 &mov($tmp1,&swtmp($n2)); # X4
288&xor($f,$tmp1); # X5
289 &mov($tmp1,&swtmp($n3)); # X6
290&rotr($c,1); # B2 <- F # moved was rotr b,1
291 &xor($f,$tmp1); # X7 - slot
292&rotl($f,1); # X8 - slot
293 &mov($tmp1,$b); # F1
294&mov(&swtmp($n0),$f); # X9 - anytime
295 &or($tmp1,$c); # F2
296&lea($f,&DWP($K,$f,$e,1)); # tot=X+K+e
297 &mov($e,$b); # F4
298&and($tmp1,$d); # F3
299 &and($e,$c); # F5
300
301&or($tmp1,$e); # F6
302 &mov($e,$a); # A1
303
304&rotl($e,5); # A2
305
306&rotr($b,1); # B1 <- F
307 &add($tmp1,$e); # tmp1=F()+a
308
309&rotr($b,1); # B2 <- F
310 &add($f,$tmp1); # tot+=tmp1;
311 } 197 }
312 198
313sub BODY_60_79 199sub BODY_60_79
@@ -495,8 +381,7 @@ sub sha1_block_data
495 # C -> E 381 # C -> E
496 # D -> T 382 # D -> T
497 383
498 # The last 2 have been moved into the last loop 384 &mov($tmp1,&wparam(0));
499 # &mov($tmp1,&wparam(0));
500 385
501 &mov($D, &DWP(12,$tmp1,"",0)); 386 &mov($D, &DWP(12,$tmp1,"",0));
502 &add($D,$B); 387 &add($D,$B);