Diffstat (limited to 'src/lib/libcrypto/md5/asm')
-rw-r--r--   src/lib/libcrypto/md5/asm/md5-586.pl      306
-rw-r--r--   src/lib/libcrypto/md5/asm/md5-sparcv9.S  1031
-rwxr-xr-x   src/lib/libcrypto/md5/asm/md5-x86_64.pl   245
3 files changed, 1582 insertions, 0 deletions
diff --git a/src/lib/libcrypto/md5/asm/md5-586.pl b/src/lib/libcrypto/md5/asm/md5-586.pl
new file mode 100644
index 0000000000..76ac235f7d
--- /dev/null
+++ b/src/lib/libcrypto/md5/asm/md5-586.pl
@@ -0,0 +1,306 @@
1#!/usr/local/bin/perl
2
3# When $normal is set, the generated function is
4# md5_block_x86(MD5_CTX *c, ULONG *X);
5# otherwise it is the multi-block version
6# md5_block_x86(MD5_CTX *c, ULONG *X, int blocks);
7
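# A quick illustration (a sketch only, not generated code) of how the
# two entry points would be called from C; 'nblocks' is a hypothetical
# caller-side name:
#
#   md5_block_x86(&ctx, X);            /* $normal: one 64-byte block */
#   md5_block_x86(&ctx, X, nblocks);   /* !$normal: nblocks blocks   */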
8$normal=0;
9
10push(@INC,"perlasm","../../perlasm");
11require "x86asm.pl";
12
13&asm_init($ARGV[0],$0);
14
15$A="eax";
16$B="ebx";
17$C="ecx";
18$D="edx";
19$tmp1="edi";
20$tmp2="ebp";
21$X="esi";
22
23# What we need to load into $tmp for the next round
24%Ltmp1=("R0",&Np($C), "R1",&Np($C), "R2",&Np($C), "R3",&Np($D));
25@xo=(
26 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, # R0
27 1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, # R1
28 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, # R2
29 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9, # R3
30 );
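# The @xo table is the standard MD5 message-word schedule. Assuming the
# usual per-round index formulas, each row can be regenerated as below
# (a cross-check sketch only, never executed by the generator):
#
#   @r0 = map {          $_ % 16 } 0..15;   # k
#   @r1 = map { (1 + 5*$_) % 16 } 0..15;   # (1 + 5k) mod 16
#   @r2 = map { (5 + 3*$_) % 16 } 0..15;   # (5 + 3k) mod 16
#   @r3 = map {      (7*$_) % 16 } 0..15;   # (7k) mod 16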
31
32&md5_block("md5_block_asm_data_order");
33&asm_finish();
34
35sub Np
36 {
37 local($p)=@_;
38 local(%n)=($A,$D,$B,$A,$C,$B,$D,$C);
39 return($n{$p});
40 }
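# (Np maps each working register to the one that takes its place in the
# next step -- A->D, B->A, C->B, D->C -- matching MD5's rotation of the
# (a,b,c,d) arguments.)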
41
42sub R0
43 {
44 local($pos,$a,$b,$c,$d,$K,$ki,$s,$t)=@_;
45
46 &mov($tmp1,$C) if $pos < 0;
47 &mov($tmp2,&DWP($xo[$ki]*4,$K,"",0)) if $pos < 0; # very first one
48
49 # body proper
50
51 &comment("R0 $ki");
52 &xor($tmp1,$d); # F function - part 2
53
54 &and($tmp1,$b); # F function - part 3
55 &lea($a,&DWP($t,$a,$tmp2,1));
56
57 &xor($tmp1,$d); # F function - part 4
58
59 &add($a,$tmp1);
60 &mov($tmp1,&Np($c)) if $pos < 1; # next tmp1 for R0
61 &mov($tmp1,&Np($c)) if $pos == 1; # next tmp1 for R1
62
63 &rotl($a,$s);
64
65 &mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0)) if ($pos != 2);
66
67 &add($a,$b);
68 }
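# R0 evaluates F(b,c,d) = (b & c) | (~b & d) in the equivalent form
# d ^ (b & (c ^ d)), which needs only one temporary and no 'not'.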
69
70sub R1
71 {
72 local($pos,$a,$b,$c,$d,$K,$ki,$s,$t)=@_;
73
74 &comment("R1 $ki");
75
76 &lea($a,&DWP($t,$a,$tmp2,1));
77
78 &xor($tmp1,$b); # G function - part 2
79 &and($tmp1,$d); # G function - part 3
80
81 &mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0)) if ($pos != 2);
82 &xor($tmp1,$c); # G function - part 4
83
84 &add($a,$tmp1);
85 &mov($tmp1,&Np($c)) if $pos < 1; # G function - part 1
86 &mov($tmp1,&Np($c)) if $pos == 1; # G function - part 1
87
88 &rotl($a,$s);
89
90 &add($a,$b);
91 }
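# R1 evaluates G(b,c,d) = (b & d) | (c & ~d) as c ^ (d & (b ^ c)),
# again with one temporary and no 'not'.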
92
93sub R2
94 {
95 local($n,$pos,$a,$b,$c,$d,$K,$ki,$s,$t)=@_;
96 # This one is different, only 3 logical operations
97
98if (($n & 1) == 0)
99 {
100 &comment("R2 $ki");
101 # make sure to do 'D' first, not 'B', else we clash with
102 # the last add from the previous round.
103
104 &xor($tmp1,$d); # H function - part 2
105
106 &xor($tmp1,$b); # H function - part 3
107 &lea($a,&DWP($t,$a,$tmp2,1));
108
109 &add($a,$tmp1);
110
111 &rotl($a,$s);
112
113 &mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0));
114 &mov($tmp1,&Np($c));
115 }
116else
117 {
118 &comment("R2 $ki");
119 # make sure to do 'D' first, not 'B', else we clash with
120 # the last add from the previous round.
121
122 &lea($a,&DWP($t,$a,$tmp2,1));
123
124 &add($b,$c); # MOVED FORWARD
125 &xor($tmp1,$d); # H function - part 2
126
127 &xor($tmp1,$b); # H function - part 3
128 &mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0)) if ($pos != 2);
129
130 &add($a,$tmp1);
131 &mov($tmp1,&Np($c)) if $pos < 1; # H function - part 1
132 &mov($tmp1,-1) if $pos == 1; # I function - part 1
133
134 &rotl($a,$s);
135
136 &add($a,$b);
137 }
138 }
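# Even-numbered R2 steps leave their final 'a += b' to the following
# odd step (the 'MOVED FORWARD' add above), which avoids the register
# clash noted in the comments.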
139
140sub R3
141 {
142 local($pos,$a,$b,$c,$d,$K,$ki,$s,$t)=@_;
143
144 &comment("R3 $ki");
145
146 # &not($tmp1)
147 &xor($tmp1,$d) if $pos < 0; # I function - part 2
148
149 &or($tmp1,$b); # I function - part 3
150 &lea($a,&DWP($t,$a,$tmp2,1));
151
152 &xor($tmp1,$c); # I function - part 4
153 &mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0)) if $pos != 2; # load X/k value
154 &mov($tmp2,&wparam(0)) if $pos == 2;
155
156 &add($a,$tmp1);
157 &mov($tmp1,-1) if $pos < 1; # I function - part 1
158 &add($K,64) if $pos >=1 && !$normal;
159
160 &rotl($a,$s);
161
162 &xor($tmp1,&Np($d)) if $pos <= 0; # I function - part 2 (next step)
163 &mov($tmp1,&DWP( 0,$tmp2,"",0)) if $pos > 0;
164 &add($a,$b);
165 }
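# R3 evaluates I(b,c,d) = c ^ (b | ~d); ~d is built by xor-ing d into a
# register preloaded with -1, so 'not' is avoided here as well.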
166
167
168sub md5_block
169 {
170 local($name)=@_;
171
172 &function_begin_B($name,"",3);
173
174 # parameter 1 is the MD5_CTX structure.
175 # A 0
176 # B 4
177 # C 8
178 # D 12
179
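	# (i.e. assuming the usual layout, the four state words occupy the
	# first 16 bytes of MD5_CTX: struct { MD5_LONG A, B, C, D; ... };)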
180 &push("esi");
181 &push("edi");
182 &mov($tmp1, &wparam(0)); # edi
183 &mov($X, &wparam(1)); # esi
184 &mov($C, &wparam(2));
185 &push("ebp");
186 &shl($C, 6);
187 &push("ebx");
188 &add($C, $X); # offset we end at
189 &sub($C, 64);
190 &mov($A, &DWP( 0,$tmp1,"",0));
191 &push($C); # Put on the TOS
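	# (the value saved here is X + blocks*64 - 64, i.e. the address of
	# the last block; the loop below repeats while X <= this bound)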
192 &mov($B, &DWP( 4,$tmp1,"",0));
193 &mov($C, &DWP( 8,$tmp1,"",0));
194 &mov($D, &DWP(12,$tmp1,"",0));
195
196 &set_label("start") unless $normal;
197 &comment("");
198 &comment("R0 section");
199
200 &R0(-2,$A,$B,$C,$D,$X, 0, 7,0xd76aa478);
201 &R0( 0,$D,$A,$B,$C,$X, 1,12,0xe8c7b756);
202 &R0( 0,$C,$D,$A,$B,$X, 2,17,0x242070db);
203 &R0( 0,$B,$C,$D,$A,$X, 3,22,0xc1bdceee);
204 &R0( 0,$A,$B,$C,$D,$X, 4, 7,0xf57c0faf);
205 &R0( 0,$D,$A,$B,$C,$X, 5,12,0x4787c62a);
206 &R0( 0,$C,$D,$A,$B,$X, 6,17,0xa8304613);
207 &R0( 0,$B,$C,$D,$A,$X, 7,22,0xfd469501);
208 &R0( 0,$A,$B,$C,$D,$X, 8, 7,0x698098d8);
209 &R0( 0,$D,$A,$B,$C,$X, 9,12,0x8b44f7af);
210 &R0( 0,$C,$D,$A,$B,$X,10,17,0xffff5bb1);
211 &R0( 0,$B,$C,$D,$A,$X,11,22,0x895cd7be);
212 &R0( 0,$A,$B,$C,$D,$X,12, 7,0x6b901122);
213 &R0( 0,$D,$A,$B,$C,$X,13,12,0xfd987193);
214 &R0( 0,$C,$D,$A,$B,$X,14,17,0xa679438e);
215 &R0( 1,$B,$C,$D,$A,$X,15,22,0x49b40821);
216
217 &comment("");
218 &comment("R1 section");
219 &R1(-1,$A,$B,$C,$D,$X,16, 5,0xf61e2562);
220 &R1( 0,$D,$A,$B,$C,$X,17, 9,0xc040b340);
221 &R1( 0,$C,$D,$A,$B,$X,18,14,0x265e5a51);
222 &R1( 0,$B,$C,$D,$A,$X,19,20,0xe9b6c7aa);
223 &R1( 0,$A,$B,$C,$D,$X,20, 5,0xd62f105d);
224 &R1( 0,$D,$A,$B,$C,$X,21, 9,0x02441453);
225 &R1( 0,$C,$D,$A,$B,$X,22,14,0xd8a1e681);
226 &R1( 0,$B,$C,$D,$A,$X,23,20,0xe7d3fbc8);
227 &R1( 0,$A,$B,$C,$D,$X,24, 5,0x21e1cde6);
228 &R1( 0,$D,$A,$B,$C,$X,25, 9,0xc33707d6);
229 &R1( 0,$C,$D,$A,$B,$X,26,14,0xf4d50d87);
230 &R1( 0,$B,$C,$D,$A,$X,27,20,0x455a14ed);
231 &R1( 0,$A,$B,$C,$D,$X,28, 5,0xa9e3e905);
232 &R1( 0,$D,$A,$B,$C,$X,29, 9,0xfcefa3f8);
233 &R1( 0,$C,$D,$A,$B,$X,30,14,0x676f02d9);
234 &R1( 1,$B,$C,$D,$A,$X,31,20,0x8d2a4c8a);
235
236 &comment("");
237 &comment("R2 section");
238 &R2( 0,-1,$A,$B,$C,$D,$X,32, 4,0xfffa3942);
239 &R2( 1, 0,$D,$A,$B,$C,$X,33,11,0x8771f681);
240 &R2( 2, 0,$C,$D,$A,$B,$X,34,16,0x6d9d6122);
241 &R2( 3, 0,$B,$C,$D,$A,$X,35,23,0xfde5380c);
242 &R2( 4, 0,$A,$B,$C,$D,$X,36, 4,0xa4beea44);
243 &R2( 5, 0,$D,$A,$B,$C,$X,37,11,0x4bdecfa9);
244 &R2( 6, 0,$C,$D,$A,$B,$X,38,16,0xf6bb4b60);
245 &R2( 7, 0,$B,$C,$D,$A,$X,39,23,0xbebfbc70);
246 &R2( 8, 0,$A,$B,$C,$D,$X,40, 4,0x289b7ec6);
247 &R2( 9, 0,$D,$A,$B,$C,$X,41,11,0xeaa127fa);
248 &R2(10, 0,$C,$D,$A,$B,$X,42,16,0xd4ef3085);
249 &R2(11, 0,$B,$C,$D,$A,$X,43,23,0x04881d05);
250 &R2(12, 0,$A,$B,$C,$D,$X,44, 4,0xd9d4d039);
251 &R2(13, 0,$D,$A,$B,$C,$X,45,11,0xe6db99e5);
252 &R2(14, 0,$C,$D,$A,$B,$X,46,16,0x1fa27cf8);
253 &R2(15, 1,$B,$C,$D,$A,$X,47,23,0xc4ac5665);
254
255 &comment("");
256 &comment("R3 section");
257 &R3(-1,$A,$B,$C,$D,$X,48, 6,0xf4292244);
258 &R3( 0,$D,$A,$B,$C,$X,49,10,0x432aff97);
259 &R3( 0,$C,$D,$A,$B,$X,50,15,0xab9423a7);
260 &R3( 0,$B,$C,$D,$A,$X,51,21,0xfc93a039);
261 &R3( 0,$A,$B,$C,$D,$X,52, 6,0x655b59c3);
262 &R3( 0,$D,$A,$B,$C,$X,53,10,0x8f0ccc92);
263 &R3( 0,$C,$D,$A,$B,$X,54,15,0xffeff47d);
264 &R3( 0,$B,$C,$D,$A,$X,55,21,0x85845dd1);
265 &R3( 0,$A,$B,$C,$D,$X,56, 6,0x6fa87e4f);
266 &R3( 0,$D,$A,$B,$C,$X,57,10,0xfe2ce6e0);
267 &R3( 0,$C,$D,$A,$B,$X,58,15,0xa3014314);
268 &R3( 0,$B,$C,$D,$A,$X,59,21,0x4e0811a1);
269 &R3( 0,$A,$B,$C,$D,$X,60, 6,0xf7537e82);
270 &R3( 0,$D,$A,$B,$C,$X,61,10,0xbd3af235);
271 &R3( 0,$C,$D,$A,$B,$X,62,15,0x2ad7d2bb);
272 &R3( 2,$B,$C,$D,$A,$X,63,21,0xeb86d391);
273
274 # &mov($tmp2,&wparam(0)); # done in the last R3
275# &mov($tmp1, &DWP( 0,$tmp2,"",0)); # done in the last R3
276
277 &add($A,$tmp1);
278 &mov($tmp1, &DWP( 4,$tmp2,"",0));
279
280 &add($B,$tmp1);
281 &mov($tmp1, &DWP( 8,$tmp2,"",0));
282
283 &add($C,$tmp1);
284 &mov($tmp1, &DWP(12,$tmp2,"",0));
285
286 &add($D,$tmp1);
287 &mov(&DWP( 0,$tmp2,"",0),$A);
288
289 &mov(&DWP( 4,$tmp2,"",0),$B);
290 &mov($tmp1,&swtmp(0)) unless $normal;
291
292 &mov(&DWP( 8,$tmp2,"",0),$C);
293 &mov(&DWP(12,$tmp2,"",0),$D);
294
295 &cmp($tmp1,$X) unless $normal; # check count
296 &jae(&label("start")) unless $normal;
297
298 &pop("eax"); # pop the temp variable off the stack
299 &pop("ebx");
300 &pop("ebp");
301 &pop("edi");
302 &pop("esi");
303 &ret();
304 &function_end_B($name);
305 }
306
diff --git a/src/lib/libcrypto/md5/asm/md5-sparcv9.S b/src/lib/libcrypto/md5/asm/md5-sparcv9.S
new file mode 100644
index 0000000000..db45aa4c97
--- /dev/null
+++ b/src/lib/libcrypto/md5/asm/md5-sparcv9.S
@@ -0,0 +1,1031 @@
1.ident "md5-sparcv9.S, Version 1.0"
2.ident "SPARC V9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
3.file "md5-sparcv9.S"
4
5/*
6 * ====================================================================
7 * Copyright (c) 1999 Andy Polyakov <appro@fy.chalmers.se>.
8 *
9 * Rights for redistribution and usage in source and binary forms are
10 * granted as long as above copyright notices are retained. Warranty
11 * of any kind is (of course:-) disclaimed.
12 * ====================================================================
13 */
14
15/*
16 * This is my modest contribution to the OpenSSL project (see
17 * http://www.openssl.org/ for more information about it) and is an
18 * assembler implementation of the MD5 block hash function. I've
19 * hand-coded this for the sole reason of reaching the UltraSPARC-
20 * specific "load in little-endian byte order" instruction. This gives
21 * up to a 15% performance improvement when the input message is
22 * aligned on a 32-bit boundary. The module was tested under both
23 * 32 *and* 64 bit kernels. For updates see http://fy.chalmers.se/~appro/hpe/.
24 *
25 * To compile with SC4.x/SC5.x:
26 *
27 * cc -xarch=v[9|8plus] -DOPENSSL_SYSNAME_ULTRASPARC -DMD5_BLOCK_DATA_ORDER \
28 * -c md5-sparcv9.S
29 *
30 * and with gcc:
31 *
32 * gcc -mcpu=ultrasparc -DOPENSSL_SYSNAME_ULTRASPARC -DMD5_BLOCK_DATA_ORDER \
33 * -c md5-sparcv9.S
34 *
35 * or if the above fails (it does if you have gas):
36 *
37 * gcc -E -DOPENSSL_SYSNAME_ULTRASPARC -DMD5_BLOCK_DATA_ORDER md5-sparcv9.S | \
38 * as -xarch=v8plus /dev/fd/0 -o md5-sparcv9.o
39 */
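/*
 * For reference (an illustrative sketch, not part of the build): with
 * %asi switched to ASI_PRIMARY_LITTLE (0x88), a load such as
 *
 *	lda	[%i1+0]%asi,%l0
 *
 * fetches the 32-bit word byte-swapped, i.e. in the little-endian
 * order MD5 consumes its message words in, so no explicit byte
 * shuffling is needed on this big-endian CPU.
 */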
40
41#include <openssl/e_os2.h>
42
43#define A %o0
44#define B %o1
45#define C %o2
46#define D %o3
47#define T1 %o4
48#define T2 %o5
49
50#define R0 %l0
51#define R1 %l1
52#define R2 %l2
53#define R3 %l3
54#define R4 %l4
55#define R5 %l5
56#define R6 %l6
57#define R7 %l7
58#define R8 %i3
59#define R9 %i4
60#define R10 %i5
61#define R11 %g1
62#define R12 %g2
63#define R13 %g3
64#define RX %g4
65
66#define Aptr %i0+0
67#define Bptr %i0+4
68#define Cptr %i0+8
69#define Dptr %i0+12
70
71#define Aval R5 /* those not used at the end of the last round */
72#define Bval R6
73#define Cval R7
74#define Dval R8
75
76#if defined(MD5_BLOCK_DATA_ORDER)
77# if defined(OPENSSL_SYSNAME_ULTRASPARC)
78# define LOAD lda
79# define X(i) [%i1+i*4]%asi
80# define md5_block md5_block_asm_data_order_aligned
81# define ASI_PRIMARY_LITTLE 0x88
82# else
83# error "MD5_BLOCK_DATA_ORDER is supported only on UltraSPARC!"
84# endif
85#else
86# define LOAD ld
87# define X(i) [%i1+i*4]
88# define md5_block md5_block_asm_host_order
89#endif
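/*
 * Thus X(i) expands either to "[%i1+i*4]%asi" used with "lda" (the
 * byte-swapping path) or to plain "[%i1+i*4]" used with "ld" (the
 * host-order path), and md5_block resolves to the matching entry-point
 * name.
 */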
90
91.section ".text",#alloc,#execinstr
92
93#if defined(__SUNPRO_C) && defined(__sparcv9)
94 /* They've said -xarch=v9 at command line */
95 .register %g2,#scratch
96 .register %g3,#scratch
97# define FRAME -192
98#elif defined(__GNUC__) && defined(__arch64__)
99 /* They've said -m64 at command line */
100 .register %g2,#scratch
101 .register %g3,#scratch
102# define FRAME -192
103#else
104# define FRAME -96
105#endif
106
107.align 32
108
109.global md5_block
110md5_block:
111 save %sp,FRAME,%sp
112
113 ld [Dptr],D
114 ld [Cptr],C
115 ld [Bptr],B
116 ld [Aptr],A
117#ifdef ASI_PRIMARY_LITTLE
118 rd %asi,%o7 ! How dare I? Well, I just do:-)
119 wr %g0,ASI_PRIMARY_LITTLE,%asi
120#endif
121 LOAD X(0),R0
122
123.Lmd5_block_loop:
124
125!!!!!!!!Round 0
126
127 xor C,D,T1
128 sethi %hi(0xd76aa478),T2
129 and T1,B,T1
130 or T2,%lo(0xd76aa478),T2 !=
131 xor T1,D,T1
132 add T1,R0,T1
133 LOAD X(1),R1
134 add T1,T2,T1 !=
135 add A,T1,A
136 sll A,7,T2
137 srl A,32-7,A
138 or A,T2,A !=
139 xor B,C,T1
140 add A,B,A
141
142 sethi %hi(0xe8c7b756),T2
143 and T1,A,T1 !=
144 or T2,%lo(0xe8c7b756),T2
145 xor T1,C,T1
146 LOAD X(2),R2
147 add T1,R1,T1 !=
148 add T1,T2,T1
149 add D,T1,D
150 sll D,12,T2
151 srl D,32-12,D !=
152 or D,T2,D
153 xor A,B,T1
154 add D,A,D
155
156 sethi %hi(0x242070db),T2 !=
157 and T1,D,T1
158 or T2,%lo(0x242070db),T2
159 xor T1,B,T1
160 add T1,R2,T1 !=
161 LOAD X(3),R3
162 add T1,T2,T1
163 add C,T1,C
164 sll C,17,T2 !=
165 srl C,32-17,C
166 or C,T2,C
167 xor D,A,T1
168 add C,D,C !=
169
170 sethi %hi(0xc1bdceee),T2
171 and T1,C,T1
172 or T2,%lo(0xc1bdceee),T2
173 xor T1,A,T1 !=
174 add T1,R3,T1
175 LOAD X(4),R4
176 add T1,T2,T1
177 add B,T1,B !=
178 sll B,22,T2
179 srl B,32-22,B
180 or B,T2,B
181 xor C,D,T1 !=
182 add B,C,B
183
184 sethi %hi(0xf57c0faf),T2
185 and T1,B,T1
186 or T2,%lo(0xf57c0faf),T2 !=
187 xor T1,D,T1
188 add T1,R4,T1
189 LOAD X(5),R5
190 add T1,T2,T1 !=
191 add A,T1,A
192 sll A,7,T2
193 srl A,32-7,A
194 or A,T2,A !=
195 xor B,C,T1
196 add A,B,A
197
198 sethi %hi(0x4787c62a),T2
199 and T1,A,T1 !=
200 or T2,%lo(0x4787c62a),T2
201 xor T1,C,T1
202 LOAD X(6),R6
203 add T1,R5,T1 !=
204 add T1,T2,T1
205 add D,T1,D
206 sll D,12,T2
207 srl D,32-12,D !=
208 or D,T2,D
209 xor A,B,T1
210 add D,A,D
211
212 sethi %hi(0xa8304613),T2 !=
213 and T1,D,T1
214 or T2,%lo(0xa8304613),T2
215 xor T1,B,T1
216 add T1,R6,T1 !=
217 LOAD X(7),R7
218 add T1,T2,T1
219 add C,T1,C
220 sll C,17,T2 !=
221 srl C,32-17,C
222 or C,T2,C
223 xor D,A,T1
224 add C,D,C !=
225
226 sethi %hi(0xfd469501),T2
227 and T1,C,T1
228 or T2,%lo(0xfd469501),T2
229 xor T1,A,T1 !=
230 add T1,R7,T1
231 LOAD X(8),R8
232 add T1,T2,T1
233 add B,T1,B !=
234 sll B,22,T2
235 srl B,32-22,B
236 or B,T2,B
237 xor C,D,T1 !=
238 add B,C,B
239
240 sethi %hi(0x698098d8),T2
241 and T1,B,T1
242 or T2,%lo(0x698098d8),T2 !=
243 xor T1,D,T1
244 add T1,R8,T1
245 LOAD X(9),R9
246 add T1,T2,T1 !=
247 add A,T1,A
248 sll A,7,T2
249 srl A,32-7,A
250 or A,T2,A !=
251 xor B,C,T1
252 add A,B,A
253
254 sethi %hi(0x8b44f7af),T2
255 and T1,A,T1 !=
256 or T2,%lo(0x8b44f7af),T2
257 xor T1,C,T1
258 LOAD X(10),R10
259 add T1,R9,T1 !=
260 add T1,T2,T1
261 add D,T1,D
262 sll D,12,T2
263 srl D,32-12,D !=
264 or D,T2,D
265 xor A,B,T1
266 add D,A,D
267
268 sethi %hi(0xffff5bb1),T2 !=
269 and T1,D,T1
270 or T2,%lo(0xffff5bb1),T2
271 xor T1,B,T1
272 add T1,R10,T1 !=
273 LOAD X(11),R11
274 add T1,T2,T1
275 add C,T1,C
276 sll C,17,T2 !=
277 srl C,32-17,C
278 or C,T2,C
279 xor D,A,T1
280 add C,D,C !=
281
282 sethi %hi(0x895cd7be),T2
283 and T1,C,T1
284 or T2,%lo(0x895cd7be),T2
285 xor T1,A,T1 !=
286 add T1,R11,T1
287 LOAD X(12),R12
288 add T1,T2,T1
289 add B,T1,B !=
290 sll B,22,T2
291 srl B,32-22,B
292 or B,T2,B
293 xor C,D,T1 !=
294 add B,C,B
295
296 sethi %hi(0x6b901122),T2
297 and T1,B,T1
298 or T2,%lo(0x6b901122),T2 !=
299 xor T1,D,T1
300 add T1,R12,T1
301 LOAD X(13),R13
302 add T1,T2,T1 !=
303 add A,T1,A
304 sll A,7,T2
305 srl A,32-7,A
306 or A,T2,A !=
307 xor B,C,T1
308 add A,B,A
309
310 sethi %hi(0xfd987193),T2
311 and T1,A,T1 !=
312 or T2,%lo(0xfd987193),T2
313 xor T1,C,T1
314 LOAD X(14),RX
315 add T1,R13,T1 !=
316 add T1,T2,T1
317 add D,T1,D
318 sll D,12,T2
319 srl D,32-12,D !=
320 or D,T2,D
321 xor A,B,T1
322 add D,A,D
323
324 sethi %hi(0xa679438e),T2 !=
325 and T1,D,T1
326 or T2,%lo(0xa679438e),T2
327 xor T1,B,T1
328 add T1,RX,T1 !=
329 LOAD X(15),RX
330 add T1,T2,T1
331 add C,T1,C
332 sll C,17,T2 !=
333 srl C,32-17,C
334 or C,T2,C
335 xor D,A,T1
336 add C,D,C !=
337
338 sethi %hi(0x49b40821),T2
339 and T1,C,T1
340 or T2,%lo(0x49b40821),T2
341 xor T1,A,T1 !=
342 add T1,RX,T1
343 !pre-LOADed X(1),R1
344 add T1,T2,T1
345 add B,T1,B
346 sll B,22,T2 !=
347 srl B,32-22,B
348 or B,T2,B
349 add B,C,B
350
351!!!!!!!!Round 1
352
353 xor B,C,T1 !=
354 sethi %hi(0xf61e2562),T2
355 and T1,D,T1
356 or T2,%lo(0xf61e2562),T2
357 xor T1,C,T1 !=
358 add T1,R1,T1
359 !pre-LOADed X(6),R6
360 add T1,T2,T1
361 add A,T1,A
362 sll A,5,T2 !=
363 srl A,32-5,A
364 or A,T2,A
365 add A,B,A
366
367 xor A,B,T1 !=
368 sethi %hi(0xc040b340),T2
369 and T1,C,T1
370 or T2,%lo(0xc040b340),T2
371 xor T1,B,T1 !=
372 add T1,R6,T1
373 !pre-LOADed X(11),R11
374 add T1,T2,T1
375 add D,T1,D
376 sll D,9,T2 !=
377 srl D,32-9,D
378 or D,T2,D
379 add D,A,D
380
381 xor D,A,T1 !=
382 sethi %hi(0x265e5a51),T2
383 and T1,B,T1
384 or T2,%lo(0x265e5a51),T2
385 xor T1,A,T1 !=
386 add T1,R11,T1
387 !pre-LOADed X(0),R0
388 add T1,T2,T1
389 add C,T1,C
390 sll C,14,T2 !=
391 srl C,32-14,C
392 or C,T2,C
393 add C,D,C
394
395 xor C,D,T1 !=
396 sethi %hi(0xe9b6c7aa),T2
397 and T1,A,T1
398 or T2,%lo(0xe9b6c7aa),T2
399 xor T1,D,T1 !=
400 add T1,R0,T1
401 !pre-LOADed X(5),R5
402 add T1,T2,T1
403 add B,T1,B
404 sll B,20,T2 !=
405 srl B,32-20,B
406 or B,T2,B
407 add B,C,B
408
409 xor B,C,T1 !=
410 sethi %hi(0xd62f105d),T2
411 and T1,D,T1
412 or T2,%lo(0xd62f105d),T2
413 xor T1,C,T1 !=
414 add T1,R5,T1
415 !pre-LOADed X(10),R10
416 add T1,T2,T1
417 add A,T1,A
418 sll A,5,T2 !=
419 srl A,32-5,A
420 or A,T2,A
421 add A,B,A
422
423 xor A,B,T1 !=
424 sethi %hi(0x02441453),T2
425 and T1,C,T1
426 or T2,%lo(0x02441453),T2
427 xor T1,B,T1 !=
428 add T1,R10,T1
429 LOAD X(15),RX
430 add T1,T2,T1
431 add D,T1,D !=
432 sll D,9,T2
433 srl D,32-9,D
434 or D,T2,D
435 add D,A,D !=
436
437 xor D,A,T1
438 sethi %hi(0xd8a1e681),T2
439 and T1,B,T1
440 or T2,%lo(0xd8a1e681),T2 !=
441 xor T1,A,T1
442 add T1,RX,T1
443 !pre-LOADed X(4),R4
444 add T1,T2,T1
445 add C,T1,C !=
446 sll C,14,T2
447 srl C,32-14,C
448 or C,T2,C
449 add C,D,C !=
450
451 xor C,D,T1
452 sethi %hi(0xe7d3fbc8),T2
453 and T1,A,T1
454 or T2,%lo(0xe7d3fbc8),T2 !=
455 xor T1,D,T1
456 add T1,R4,T1
457 !pre-LOADed X(9),R9
458 add T1,T2,T1
459 add B,T1,B !=
460 sll B,20,T2
461 srl B,32-20,B
462 or B,T2,B
463 add B,C,B !=
464
465 xor B,C,T1
466 sethi %hi(0x21e1cde6),T2
467 and T1,D,T1
468 or T2,%lo(0x21e1cde6),T2 !=
469 xor T1,C,T1
470 add T1,R9,T1
471 LOAD X(14),RX
472 add T1,T2,T1 !=
473 add A,T1,A
474 sll A,5,T2
475 srl A,32-5,A
476 or A,T2,A !=
477 add A,B,A
478
479 xor A,B,T1
480 sethi %hi(0xc33707d6),T2
481 and T1,C,T1 !=
482 or T2,%lo(0xc33707d6),T2
483 xor T1,B,T1
484 add T1,RX,T1
485 !pre-LOADed X(3),R3
486 add T1,T2,T1 !=
487 add D,T1,D
488 sll D,9,T2
489 srl D,32-9,D
490 or D,T2,D !=
491 add D,A,D
492
493 xor D,A,T1
494 sethi %hi(0xf4d50d87),T2
495 and T1,B,T1 !=
496 or T2,%lo(0xf4d50d87),T2
497 xor T1,A,T1
498 add T1,R3,T1
499 !pre-LOADed X(8),R8
500 add T1,T2,T1 !=
501 add C,T1,C
502 sll C,14,T2
503 srl C,32-14,C
504 or C,T2,C !=
505 add C,D,C
506
507 xor C,D,T1
508 sethi %hi(0x455a14ed),T2
509 and T1,A,T1 !=
510 or T2,%lo(0x455a14ed),T2
511 xor T1,D,T1
512 add T1,R8,T1
513 !pre-LOADed X(13),R13
514 add T1,T2,T1 !=
515 add B,T1,B
516 sll B,20,T2
517 srl B,32-20,B
518 or B,T2,B !=
519 add B,C,B
520
521 xor B,C,T1
522 sethi %hi(0xa9e3e905),T2
523 and T1,D,T1 !=
524 or T2,%lo(0xa9e3e905),T2
525 xor T1,C,T1
526 add T1,R13,T1
527 !pre-LOADed X(2),R2
528 add T1,T2,T1 !=
529 add A,T1,A
530 sll A,5,T2
531 srl A,32-5,A
532 or A,T2,A !=
533 add A,B,A
534
535 xor A,B,T1
536 sethi %hi(0xfcefa3f8),T2
537 and T1,C,T1 !=
538 or T2,%lo(0xfcefa3f8),T2
539 xor T1,B,T1
540 add T1,R2,T1
541 !pre-LOADed X(7),R7
542 add T1,T2,T1 !=
543 add D,T1,D
544 sll D,9,T2
545 srl D,32-9,D
546 or D,T2,D !=
547 add D,A,D
548
549 xor D,A,T1
550 sethi %hi(0x676f02d9),T2
551 and T1,B,T1 !=
552 or T2,%lo(0x676f02d9),T2
553 xor T1,A,T1
554 add T1,R7,T1
555 !pre-LOADed X(12),R12
556 add T1,T2,T1 !=
557 add C,T1,C
558 sll C,14,T2
559 srl C,32-14,C
560 or C,T2,C !=
561 add C,D,C
562
563 xor C,D,T1
564 sethi %hi(0x8d2a4c8a),T2
565 and T1,A,T1 !=
566 or T2,%lo(0x8d2a4c8a),T2
567 xor T1,D,T1
568 add T1,R12,T1
569 !pre-LOADed X(5),R5
570 add T1,T2,T1 !=
571 add B,T1,B
572 sll B,20,T2
573 srl B,32-20,B
574 or B,T2,B !=
575 add B,C,B
576
577!!!!!!!!Round 2
578
579 xor B,C,T1
580 sethi %hi(0xfffa3942),T2
581 xor T1,D,T1 !=
582 or T2,%lo(0xfffa3942),T2
583 add T1,R5,T1
584 !pre-LOADed X(8),R8
585 add T1,T2,T1
586 add A,T1,A !=
587 sll A,4,T2
588 srl A,32-4,A
589 or A,T2,A
590 add A,B,A !=
591
592 xor A,B,T1
593 sethi %hi(0x8771f681),T2
594 xor T1,C,T1
595 or T2,%lo(0x8771f681),T2 !=
596 add T1,R8,T1
597 !pre-LOADed X(11),R11
598 add T1,T2,T1
599 add D,T1,D
600 sll D,11,T2 !=
601 srl D,32-11,D
602 or D,T2,D
603 add D,A,D
604
605 xor D,A,T1 !=
606 sethi %hi(0x6d9d6122),T2
607 xor T1,B,T1
608 or T2,%lo(0x6d9d6122),T2
609 add T1,R11,T1 !=
610 LOAD X(14),RX
611 add T1,T2,T1
612 add C,T1,C
613 sll C,16,T2 !=
614 srl C,32-16,C
615 or C,T2,C
616 add C,D,C
617
618 xor C,D,T1 !=
619 sethi %hi(0xfde5380c),T2
620 xor T1,A,T1
621 or T2,%lo(0xfde5380c),T2
622 add T1,RX,T1 !=
623 !pre-LOADed X(1),R1
624 add T1,T2,T1
625 add B,T1,B
626 sll B,23,T2
627 srl B,32-23,B !=
628 or B,T2,B
629 add B,C,B
630
631 xor B,C,T1
632 sethi %hi(0xa4beea44),T2 !=
633 xor T1,D,T1
634 or T2,%lo(0xa4beea44),T2
635 add T1,R1,T1
636 !pre-LOADed X(4),R4
637 add T1,T2,T1 !=
638 add A,T1,A
639 sll A,4,T2
640 srl A,32-4,A
641 or A,T2,A !=
642 add A,B,A
643
644 xor A,B,T1
645 sethi %hi(0x4bdecfa9),T2
646 xor T1,C,T1 !=
647 or T2,%lo(0x4bdecfa9),T2
648 add T1,R4,T1
649 !pre-LOADed X(7),R7
650 add T1,T2,T1
651 add D,T1,D !=
652 sll D,11,T2
653 srl D,32-11,D
654 or D,T2,D
655 add D,A,D !=
656
657 xor D,A,T1
658 sethi %hi(0xf6bb4b60),T2
659 xor T1,B,T1
660 or T2,%lo(0xf6bb4b60),T2 !=
661 add T1,R7,T1
662 !pre-LOADed X(10),R10
663 add T1,T2,T1
664 add C,T1,C
665 sll C,16,T2 !=
666 srl C,32-16,C
667 or C,T2,C
668 add C,D,C
669
670 xor C,D,T1 !=
671 sethi %hi(0xbebfbc70),T2
672 xor T1,A,T1
673 or T2,%lo(0xbebfbc70),T2
674 add T1,R10,T1 !=
675 !pre-LOADed X(13),R13
676 add T1,T2,T1
677 add B,T1,B
678 sll B,23,T2
679 srl B,32-23,B !=
680 or B,T2,B
681 add B,C,B
682
683 xor B,C,T1
684 sethi %hi(0x289b7ec6),T2 !=
685 xor T1,D,T1
686 or T2,%lo(0x289b7ec6),T2
687 add T1,R13,T1
688 !pre-LOADed X(0),R0
689 add T1,T2,T1 !=
690 add A,T1,A
691 sll A,4,T2
692 srl A,32-4,A
693 or A,T2,A !=
694 add A,B,A
695
696 xor A,B,T1
697 sethi %hi(0xeaa127fa),T2
698 xor T1,C,T1 !=
699 or T2,%lo(0xeaa127fa),T2
700 add T1,R0,T1
701 !pre-LOADed X(3),R3
702 add T1,T2,T1
703 add D,T1,D !=
704 sll D,11,T2
705 srl D,32-11,D
706 or D,T2,D
707 add D,A,D !=
708
709 xor D,A,T1
710 sethi %hi(0xd4ef3085),T2
711 xor T1,B,T1
712 or T2,%lo(0xd4ef3085),T2 !=
713 add T1,R3,T1
714 !pre-LOADed X(6),R6
715 add T1,T2,T1
716 add C,T1,C
717 sll C,16,T2 !=
718 srl C,32-16,C
719 or C,T2,C
720 add C,D,C
721
722 xor C,D,T1 !=
723 sethi %hi(0x04881d05),T2
724 xor T1,A,T1
725 or T2,%lo(0x04881d05),T2
726 add T1,R6,T1 !=
727 !pre-LOADed X(9),R9
728 add T1,T2,T1
729 add B,T1,B
730 sll B,23,T2
731 srl B,32-23,B !=
732 or B,T2,B
733 add B,C,B
734
735 xor B,C,T1
736 sethi %hi(0xd9d4d039),T2 !=
737 xor T1,D,T1
738 or T2,%lo(0xd9d4d039),T2
739 add T1,R9,T1
740 !pre-LOADed X(12),R12
741 add T1,T2,T1 !=
742 add A,T1,A
743 sll A,4,T2
744 srl A,32-4,A
745 or A,T2,A !=
746 add A,B,A
747
748 xor A,B,T1
749 sethi %hi(0xe6db99e5),T2
750 xor T1,C,T1 !=
751 or T2,%lo(0xe6db99e5),T2
752 add T1,R12,T1
753 LOAD X(15),RX
754 add T1,T2,T1 !=
755 add D,T1,D
756 sll D,11,T2
757 srl D,32-11,D
758 or D,T2,D !=
759 add D,A,D
760
761 xor D,A,T1
762 sethi %hi(0x1fa27cf8),T2
763 xor T1,B,T1 !=
764 or T2,%lo(0x1fa27cf8),T2
765 add T1,RX,T1
766 !pre-LOADed X(2),R2
767 add T1,T2,T1
768 add C,T1,C !=
769 sll C,16,T2
770 srl C,32-16,C
771 or C,T2,C
772 add C,D,C !=
773
774 xor C,D,T1
775 sethi %hi(0xc4ac5665),T2
776 xor T1,A,T1
777 or T2,%lo(0xc4ac5665),T2 !=
778 add T1,R2,T1
779 !pre-LOADed X(0),R0
780 add T1,T2,T1
781 add B,T1,B
782 sll B,23,T2 !=
783 srl B,32-23,B
784 or B,T2,B
785 add B,C,B
786
787!!!!!!!!Round 3
788
789 orn B,D,T1 !=
790 sethi %hi(0xf4292244),T2
791 xor T1,C,T1
792 or T2,%lo(0xf4292244),T2
793 add T1,R0,T1 !=
794 !pre-LOADed X(7),R7
795 add T1,T2,T1
796 add A,T1,A
797 sll A,6,T2
798 srl A,32-6,A !=
799 or A,T2,A
800 add A,B,A
801
802 orn A,C,T1
803 sethi %hi(0x432aff97),T2 !=
804 xor T1,B,T1
805 or T2,%lo(0x432aff97),T2
806 LOAD X(14),RX
807 add T1,R7,T1 !=
808 add T1,T2,T1
809 add D,T1,D
810 sll D,10,T2
811 srl D,32-10,D !=
812 or D,T2,D
813 add D,A,D
814
815 orn D,B,T1
816 sethi %hi(0xab9423a7),T2 !=
817 xor T1,A,T1
818 or T2,%lo(0xab9423a7),T2
819 add T1,RX,T1
820 !pre-LOADed X(5),R5
821 add T1,T2,T1 !=
822 add C,T1,C
823 sll C,15,T2
824 srl C,32-15,C
825 or C,T2,C !=
826 add C,D,C
827
828 orn C,A,T1
829 sethi %hi(0xfc93a039),T2
830 xor T1,D,T1 !=
831 or T2,%lo(0xfc93a039),T2
832 add T1,R5,T1
833 !pre-LOADed X(12),R12
834 add T1,T2,T1
835 add B,T1,B !=
836 sll B,21,T2
837 srl B,32-21,B
838 or B,T2,B
839 add B,C,B !=
840
841 orn B,D,T1
842 sethi %hi(0x655b59c3),T2
843 xor T1,C,T1
844 or T2,%lo(0x655b59c3),T2 !=
845 add T1,R12,T1
846 !pre-LOADed X(3),R3
847 add T1,T2,T1
848 add A,T1,A
849 sll A,6,T2 !=
850 srl A,32-6,A
851 or A,T2,A
852 add A,B,A
853
854 orn A,C,T1 !=
855 sethi %hi(0x8f0ccc92),T2
856 xor T1,B,T1
857 or T2,%lo(0x8f0ccc92),T2
858 add T1,R3,T1 !=
859 !pre-LOADed X(10),R10
860 add T1,T2,T1
861 add D,T1,D
862 sll D,10,T2
863 srl D,32-10,D !=
864 or D,T2,D
865 add D,A,D
866
867 orn D,B,T1
868 sethi %hi(0xffeff47d),T2 !=
869 xor T1,A,T1
870 or T2,%lo(0xffeff47d),T2
871 add T1,R10,T1
872 !pre-LOADed X(1),R1
873 add T1,T2,T1 !=
874 add C,T1,C
875 sll C,15,T2
876 srl C,32-15,C
877 or C,T2,C !=
878 add C,D,C
879
880 orn C,A,T1
881 sethi %hi(0x85845dd1),T2
882 xor T1,D,T1 !=
883 or T2,%lo(0x85845dd1),T2
884 add T1,R1,T1
885 !pre-LOADed X(8),R8
886 add T1,T2,T1
887 add B,T1,B !=
888 sll B,21,T2
889 srl B,32-21,B
890 or B,T2,B
891 add B,C,B !=
892
893 orn B,D,T1
894 sethi %hi(0x6fa87e4f),T2
895 xor T1,C,T1
896 or T2,%lo(0x6fa87e4f),T2 !=
897 add T1,R8,T1
898 LOAD X(15),RX
899 add T1,T2,T1
900 add A,T1,A !=
901 sll A,6,T2
902 srl A,32-6,A
903 or A,T2,A
904 add A,B,A !=
905
906 orn A,C,T1
907 sethi %hi(0xfe2ce6e0),T2
908 xor T1,B,T1
909 or T2,%lo(0xfe2ce6e0),T2 !=
910 add T1,RX,T1
911 !pre-LOADed X(6),R6
912 add T1,T2,T1
913 add D,T1,D
914 sll D,10,T2 !=
915 srl D,32-10,D
916 or D,T2,D
917 add D,A,D
918
919 orn D,B,T1 !=
920 sethi %hi(0xa3014314),T2
921 xor T1,A,T1
922 or T2,%lo(0xa3014314),T2
923 add T1,R6,T1 !=
924 !pre-LOADed X(13),R13
925 add T1,T2,T1
926 add C,T1,C
927 sll C,15,T2
928 srl C,32-15,C !=
929 or C,T2,C
930 add C,D,C
931
932 orn C,A,T1
933 sethi %hi(0x4e0811a1),T2 !=
934 xor T1,D,T1
935 or T2,%lo(0x4e0811a1),T2
936 !pre-LOADed X(4),R4
937 ld [Aptr],Aval
938 add T1,R13,T1 !=
939 add T1,T2,T1
940 add B,T1,B
941 sll B,21,T2
942 srl B,32-21,B !=
943 or B,T2,B
944 add B,C,B
945
946 orn B,D,T1
947 sethi %hi(0xf7537e82),T2 !=
948 xor T1,C,T1
949 or T2,%lo(0xf7537e82),T2
950 !pre-LOADed X(11),R11
951 ld [Dptr],Dval
952 add T1,R4,T1 !=
953 add T1,T2,T1
954 add A,T1,A
955 sll A,6,T2
956 srl A,32-6,A !=
957 or A,T2,A
958 add A,B,A
959
960 orn A,C,T1
961 sethi %hi(0xbd3af235),T2 !=
962 xor T1,B,T1
963 or T2,%lo(0xbd3af235),T2
964 !pre-LOADed X(2),R2
965 ld [Cptr],Cval
966 add T1,R11,T1 !=
967 add T1,T2,T1
968 add D,T1,D
969 sll D,10,T2
970 srl D,32-10,D !=
971 or D,T2,D
972 add D,A,D
973
974 orn D,B,T1
975 sethi %hi(0x2ad7d2bb),T2 !=
976 xor T1,A,T1
977 or T2,%lo(0x2ad7d2bb),T2
978 !pre-LOADed X(9),R9
979 ld [Bptr],Bval
980 add T1,R2,T1 !=
981 add Aval,A,Aval
982 add T1,T2,T1
983 st Aval,[Aptr]
984 add C,T1,C !=
985 sll C,15,T2
986 add Dval,D,Dval
987 srl C,32-15,C
988 or C,T2,C !=
989 st Dval,[Dptr]
990 add C,D,C
991
992 orn C,A,T1
993 sethi %hi(0xeb86d391),T2 !=
994 xor T1,D,T1
995 or T2,%lo(0xeb86d391),T2
996 add T1,R9,T1
997 !pre-LOADed X(0),R0
998 mov Aval,A !=
999 add T1,T2,T1
1000 mov Dval,D
1001 add B,T1,B
1002 sll B,21,T2 !=
1003 add Cval,C,Cval
1004 srl B,32-21,B
1005 st Cval,[Cptr]
1006 or B,T2,B !=
1007 add B,C,B
1008
1009 deccc %i2
1010 mov Cval,C
1011 add B,Bval,B !=
1012 inc 64,%i1
1013 nop
1014 st B,[Bptr]
1015 nop !=
1016
1017#ifdef OPENSSL_SYSNAME_ULTRASPARC
1018 bg,a,pt %icc,.Lmd5_block_loop
1019#else
1020 bg,a .Lmd5_block_loop
1021#endif
1022 LOAD X(0),R0
1023
1024#ifdef ASI_PRIMARY_LITTLE
1025 wr %g0,%o7,%asi
1026#endif
1027 ret
1028 restore %g0,0,%o0
1029
1030.type md5_block,#function
1031.size md5_block,(.-md5_block)
diff --git a/src/lib/libcrypto/md5/asm/md5-x86_64.pl b/src/lib/libcrypto/md5/asm/md5-x86_64.pl
new file mode 100755
index 0000000000..9a6fa67224
--- /dev/null
+++ b/src/lib/libcrypto/md5/asm/md5-x86_64.pl
@@ -0,0 +1,245 @@
1#!/usr/bin/perl -w
2#
3# MD5 optimized for AMD64.
4#
5# Author: Marc Bevand <bevand_m (at) epita.fr>
6# Licence: I hereby disclaim the copyright on this code and place it
7# in the public domain.
8#
9
10use strict;
11
12my $code;
13
14# round1_step() does:
15# dst = x + ((dst + F(x,y,z) + X[k] + T_i) <<< s)
16# %r10d = X[k_next]
17# %r11d = z' (copy of z for the next step)
18# Each round1_step() takes about 5.71 clocks (9 instructions, 1.58 IPC)
19sub round1_step
20{
21 my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
22 $code .= " mov 0*4(%rsi), %r10d /* (NEXT STEP) X[0] */\n" if ($pos == -1);
23 $code .= " mov %edx, %r11d /* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
24 $code .= <<EOF;
25 xor $y, %r11d /* y ^ ... */
26 lea $T_i($dst,%r10d),$dst /* Const + dst + ... */
27 and $x, %r11d /* x & ... */
28 xor $z, %r11d /* z ^ ... */
29 mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */
30 add %r11d, $dst /* dst += ... */
31 rol \$$s, $dst /* dst <<< s */
32 mov $y, %r11d /* (NEXT STEP) z' = $y */
33 add $x, $dst /* dst += x */
34EOF
35}
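# round1_step() computes F(x,y,z) = (x & y) | (~x & z) in the
# branch-free form z ^ (x & (y ^ z)); %r11d enters each step already
# holding z, so F costs only three ALU instructions.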
36
37# round2_step() does:
38# dst = x + ((dst + G(x,y,z) + X[k] + T_i) <<< s)
39# %r10d = X[k_next]
40# %r11d = y' (copy of y for the next step)
41# Each round2_step() takes about 6.22 clocks (9 instructions, 1.45 IPC)
42sub round2_step
43{
44 my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
45 $code .= " mov 1*4(%rsi), %r10d /* (NEXT STEP) X[1] */\n" if ($pos == -1);
46 $code .= " mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */\n" if ($pos == -1);
47 $code .= <<EOF;
48 xor $x, %r11d /* x ^ ... */
49 lea $T_i($dst,%r10d),$dst /* Const + dst + ... */
50 and $z, %r11d /* z & ... */
51 xor $y, %r11d /* y ^ ... */
52 mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */
53 add %r11d, $dst /* dst += ... */
54 rol \$$s, $dst /* dst <<< s */
55 mov $x, %r11d /* (NEXT STEP) y' = $x */
56 add $x, $dst /* dst += x */
57EOF
58}
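# round2_step() uses the analogous identity G(x,y,z) = (x & z) |
# (y & ~z) = y ^ (z & (x ^ y)), with %r11d carrying y between steps.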
59
60# round3_step() does:
61# dst = x + ((dst + H(x,y,z) + X[k] + T_i) <<< s)
62# %r10d = X[k_next]
63# %r11d = y' (copy of y for the next step)
64# Each round3_step() takes about 4.26 clocks (8 instructions, 1.88 IPC)
65sub round3_step
66{
67 my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
68 $code .= " mov 5*4(%rsi), %r10d /* (NEXT STEP) X[5] */\n" if ($pos == -1);
69 $code .= " mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */\n" if ($pos == -1);
70 $code .= <<EOF;
71 lea $T_i($dst,%r10d),$dst /* Const + dst + ... */
72 mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */
73 xor $z, %r11d /* z ^ ... */
74 xor $x, %r11d /* x ^ ... */
75 add %r11d, $dst /* dst += ... */
76 rol \$$s, $dst /* dst <<< s */
77 mov $x, %r11d /* (NEXT STEP) y' = $x */
78 add $x, $dst /* dst += x */
79EOF
80}
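# round3_step() computes H(x,y,z) = x ^ y ^ z directly -- two xors on
# the carried %r11d -- which is why it is the shortest step at 8
# instructions.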
81
82# round4_step() does:
83# dst = x + ((dst + I(x,y,z) + X[k] + T_i) <<< s)
84# %r10d = X[k_next]
85# %r11d = not z' (copy of not z for the next step)
86# Each round4_step() takes about 5.27 clocks (9 instructions, 1.71 IPC)
87sub round4_step
88{
89 my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
90 $code .= " mov 0*4(%rsi), %r10d /* (NEXT STEP) X[0] */\n" if ($pos == -1);
91 $code .= " mov \$0xffffffff, %r11d\n" if ($pos == -1);
92 $code .= " xor %edx, %r11d /* (NEXT STEP) not z' = not %edx */\n"
93 if ($pos == -1);
94 $code .= <<EOF;
95 lea $T_i($dst,%r10d),$dst /* Const + dst + ... */
96 or $x, %r11d /* x | ... */
97 xor $y, %r11d /* y ^ ... */
98 add %r11d, $dst /* dst += ... */
99 mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */
100 mov \$0xffffffff, %r11d
101 rol \$$s, $dst /* dst <<< s */
102 xor $y, %r11d /* (NEXT STEP) not z' = not $y */
103 add $x, $dst /* dst += x */
104EOF
105}
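# round4_step() computes I(x,y,z) = y ^ (x | ~z); ~z is kept in %r11d
# by xor-ing the next z into a freshly loaded 0xffffffff, so the
# register is rebuilt for every step rather than flipped with 'not'.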
106
107my $output = shift;
108open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
109
110$code .= <<EOF;
111.text
112.align 16
113
114.globl md5_block_asm_data_order
115.type md5_block_asm_data_order,\@function,3
116md5_block_asm_data_order:
117 push %rbp
118 push %rbx
119 push %r14
120 push %r15
121
122 # rdi = arg #1 (ctx, MD5_CTX pointer)
123 # rsi = arg #2 (ptr, data pointer)
124 # rdx = arg #3 (nbr, number of 16-word blocks to process)
125 mov %rdi, %rbp # rbp = ctx
126 shl \$6, %rdx # rdx = nbr in bytes
127 lea (%rsi,%rdx), %rdi # rdi = end
128 mov 0*4(%rbp), %eax # eax = ctx->A
129 mov 1*4(%rbp), %ebx # ebx = ctx->B
130 mov 2*4(%rbp), %ecx # ecx = ctx->C
131 mov 3*4(%rbp), %edx # edx = ctx->D
132 # end is 'rdi'
133 # ptr is 'rsi'
134 # A is 'eax'
135 # B is 'ebx'
136 # C is 'ecx'
137 # D is 'edx'
138
139 cmp %rdi, %rsi # cmp end with ptr
140 je .Lend # jmp if ptr == end
141
142 # BEGIN of loop over 16-word blocks
143.Lloop: # save old values of A, B, C, D
144 mov %eax, %r8d
145 mov %ebx, %r9d
146 mov %ecx, %r14d
147 mov %edx, %r15d
148EOF
149round1_step(-1,'%eax','%ebx','%ecx','%edx', '1','0xd76aa478', '7');
150round1_step( 0,'%edx','%eax','%ebx','%ecx', '2','0xe8c7b756','12');
151round1_step( 0,'%ecx','%edx','%eax','%ebx', '3','0x242070db','17');
152round1_step( 0,'%ebx','%ecx','%edx','%eax', '4','0xc1bdceee','22');
153round1_step( 0,'%eax','%ebx','%ecx','%edx', '5','0xf57c0faf', '7');
154round1_step( 0,'%edx','%eax','%ebx','%ecx', '6','0x4787c62a','12');
155round1_step( 0,'%ecx','%edx','%eax','%ebx', '7','0xa8304613','17');
156round1_step( 0,'%ebx','%ecx','%edx','%eax', '8','0xfd469501','22');
157round1_step( 0,'%eax','%ebx','%ecx','%edx', '9','0x698098d8', '7');
158round1_step( 0,'%edx','%eax','%ebx','%ecx','10','0x8b44f7af','12');
159round1_step( 0,'%ecx','%edx','%eax','%ebx','11','0xffff5bb1','17');
160round1_step( 0,'%ebx','%ecx','%edx','%eax','12','0x895cd7be','22');
161round1_step( 0,'%eax','%ebx','%ecx','%edx','13','0x6b901122', '7');
162round1_step( 0,'%edx','%eax','%ebx','%ecx','14','0xfd987193','12');
163round1_step( 0,'%ecx','%edx','%eax','%ebx','15','0xa679438e','17');
164round1_step( 1,'%ebx','%ecx','%edx','%eax', '0','0x49b40821','22');
165
166round2_step(-1,'%eax','%ebx','%ecx','%edx', '6','0xf61e2562', '5');
167round2_step( 0,'%edx','%eax','%ebx','%ecx','11','0xc040b340', '9');
168round2_step( 0,'%ecx','%edx','%eax','%ebx', '0','0x265e5a51','14');
169round2_step( 0,'%ebx','%ecx','%edx','%eax', '5','0xe9b6c7aa','20');
170round2_step( 0,'%eax','%ebx','%ecx','%edx','10','0xd62f105d', '5');
171round2_step( 0,'%edx','%eax','%ebx','%ecx','15', '0x2441453', '9');
172round2_step( 0,'%ecx','%edx','%eax','%ebx', '4','0xd8a1e681','14');
173round2_step( 0,'%ebx','%ecx','%edx','%eax', '9','0xe7d3fbc8','20');
174round2_step( 0,'%eax','%ebx','%ecx','%edx','14','0x21e1cde6', '5');
175round2_step( 0,'%edx','%eax','%ebx','%ecx', '3','0xc33707d6', '9');
176round2_step( 0,'%ecx','%edx','%eax','%ebx', '8','0xf4d50d87','14');
177round2_step( 0,'%ebx','%ecx','%edx','%eax','13','0x455a14ed','20');
178round2_step( 0,'%eax','%ebx','%ecx','%edx', '2','0xa9e3e905', '5');
179round2_step( 0,'%edx','%eax','%ebx','%ecx', '7','0xfcefa3f8', '9');
180round2_step( 0,'%ecx','%edx','%eax','%ebx','12','0x676f02d9','14');
181round2_step( 1,'%ebx','%ecx','%edx','%eax', '0','0x8d2a4c8a','20');
182
183round3_step(-1,'%eax','%ebx','%ecx','%edx', '8','0xfffa3942', '4');
184round3_step( 0,'%edx','%eax','%ebx','%ecx','11','0x8771f681','11');
185round3_step( 0,'%ecx','%edx','%eax','%ebx','14','0x6d9d6122','16');
186round3_step( 0,'%ebx','%ecx','%edx','%eax', '1','0xfde5380c','23');
187round3_step( 0,'%eax','%ebx','%ecx','%edx', '4','0xa4beea44', '4');
188round3_step( 0,'%edx','%eax','%ebx','%ecx', '7','0x4bdecfa9','11');
189round3_step( 0,'%ecx','%edx','%eax','%ebx','10','0xf6bb4b60','16');
190round3_step( 0,'%ebx','%ecx','%edx','%eax','13','0xbebfbc70','23');
191round3_step( 0,'%eax','%ebx','%ecx','%edx', '0','0x289b7ec6', '4');
192round3_step( 0,'%edx','%eax','%ebx','%ecx', '3','0xeaa127fa','11');
193round3_step( 0,'%ecx','%edx','%eax','%ebx', '6','0xd4ef3085','16');
194round3_step( 0,'%ebx','%ecx','%edx','%eax', '9', '0x4881d05','23');
195round3_step( 0,'%eax','%ebx','%ecx','%edx','12','0xd9d4d039', '4');
196round3_step( 0,'%edx','%eax','%ebx','%ecx','15','0xe6db99e5','11');
197round3_step( 0,'%ecx','%edx','%eax','%ebx', '2','0x1fa27cf8','16');
198round3_step( 1,'%ebx','%ecx','%edx','%eax', '0','0xc4ac5665','23');
199
200round4_step(-1,'%eax','%ebx','%ecx','%edx', '7','0xf4292244', '6');
201round4_step( 0,'%edx','%eax','%ebx','%ecx','14','0x432aff97','10');
202round4_step( 0,'%ecx','%edx','%eax','%ebx', '5','0xab9423a7','15');
203round4_step( 0,'%ebx','%ecx','%edx','%eax','12','0xfc93a039','21');
204round4_step( 0,'%eax','%ebx','%ecx','%edx', '3','0x655b59c3', '6');
205round4_step( 0,'%edx','%eax','%ebx','%ecx','10','0x8f0ccc92','10');
206round4_step( 0,'%ecx','%edx','%eax','%ebx', '1','0xffeff47d','15');
207round4_step( 0,'%ebx','%ecx','%edx','%eax', '8','0x85845dd1','21');
208round4_step( 0,'%eax','%ebx','%ecx','%edx','15','0x6fa87e4f', '6');
209round4_step( 0,'%edx','%eax','%ebx','%ecx', '6','0xfe2ce6e0','10');
210round4_step( 0,'%ecx','%edx','%eax','%ebx','13','0xa3014314','15');
211round4_step( 0,'%ebx','%ecx','%edx','%eax', '4','0x4e0811a1','21');
212round4_step( 0,'%eax','%ebx','%ecx','%edx','11','0xf7537e82', '6');
213round4_step( 0,'%edx','%eax','%ebx','%ecx', '2','0xbd3af235','10');
214round4_step( 0,'%ecx','%edx','%eax','%ebx', '9','0x2ad7d2bb','15');
215round4_step( 1,'%ebx','%ecx','%edx','%eax', '0','0xeb86d391','21');
216$code .= <<EOF;
217 # add old values of A, B, C, D
218 add %r8d, %eax
219 add %r9d, %ebx
220 add %r14d, %ecx
221 add %r15d, %edx
222
223 # loop control
224 add \$64, %rsi # ptr += 64
225 cmp %rdi, %rsi # cmp end with ptr
226 jb .Lloop # jmp if ptr < end
227 # END of loop over 16-word blocks
228
229.Lend:
230 mov %eax, 0*4(%rbp) # ctx->A = A
231 mov %ebx, 1*4(%rbp) # ctx->B = B
232 mov %ecx, 2*4(%rbp) # ctx->C = C
233 mov %edx, 3*4(%rbp) # ctx->D = D
234
235 pop %r15
236 pop %r14
237 pop %rbx
238 pop %rbp
239 ret
240.size md5_block_asm_data_order,.-md5_block_asm_data_order
241EOF
242
243print $code;
244
245close STDOUT;