diff options
Diffstat (limited to 'src/lib/libcrypto/md5/asm/md5-x86_64.pl')
-rwxr-xr-x | src/lib/libcrypto/md5/asm/md5-x86_64.pl | 156 |
1 files changed, 140 insertions, 16 deletions
diff --git a/src/lib/libcrypto/md5/asm/md5-x86_64.pl b/src/lib/libcrypto/md5/asm/md5-x86_64.pl index 9a6fa67224..867885435e 100755 --- a/src/lib/libcrypto/md5/asm/md5-x86_64.pl +++ b/src/lib/libcrypto/md5/asm/md5-x86_64.pl | |||
@@ -15,7 +15,7 @@ my $code; | |||
15 | # dst = x + ((dst + F(x,y,z) + X[k] + T_i) <<< s) | 15 | # dst = x + ((dst + F(x,y,z) + X[k] + T_i) <<< s) |
16 | # %r10d = X[k_next] | 16 | # %r10d = X[k_next] |
17 | # %r11d = z' (copy of z for the next step) | 17 | # %r11d = z' (copy of z for the next step) |
18 | # Each round1_step() takes about 5.71 clocks (9 instructions, 1.58 IPC) | 18 | # Each round1_step() takes about 5.3 clocks (9 instructions, 1.7 IPC) |
19 | sub round1_step | 19 | sub round1_step |
20 | { | 20 | { |
21 | my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; | 21 | my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; |
@@ -37,22 +37,26 @@ EOF | |||
37 | # round2_step() does: | 37 | # round2_step() does: |
38 | # dst = x + ((dst + G(x,y,z) + X[k] + T_i) <<< s) | 38 | # dst = x + ((dst + G(x,y,z) + X[k] + T_i) <<< s) |
39 | # %r10d = X[k_next] | 39 | # %r10d = X[k_next] |
40 | # %r11d = y' (copy of y for the next step) | 40 | # %r11d = z' (copy of z for the next step; inverted there to form "not z") |
41 | # Each round2_step() takes about 6.22 clocks (9 instructions, 1.45 IPC) | 41 | # %r12d = z' (second copy of z for the next step; and-ed with x there) |
42 | # Each round2_step() takes about 5.4 clocks (11 instructions, 2.0 IPC) | ||
42 | sub round2_step | 43 | sub round2_step |
43 | { | 44 | { |
44 | my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; | 45 | my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; |
45 | $code .= " mov 1*4(%rsi), %r10d /* (NEXT STEP) X[1] */\n" if ($pos == -1); | 46 | $code .= " mov 1*4(%rsi), %r10d /* (NEXT STEP) X[1] */\n" if ($pos == -1); |
46 | $code .= " mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */\n" if ($pos == -1); | 47 | $code .= " mov %edx, %r11d /* (NEXT STEP) z' = %edx */\n" if ($pos == -1); |
48 | $code .= " mov %edx, %r12d /* (NEXT STEP) z' = %edx */\n" if ($pos == -1); | ||
47 | $code .= <<EOF; | 49 | $code .= <<EOF; |
48 | xor $x, %r11d /* x ^ ... */ | 50 | not %r11d /* not z */ |
49 | lea $T_i($dst,%r10d),$dst /* Const + dst + ... */ | 51 | lea $T_i($dst,%r10d),$dst /* Const + dst + ... */ |
50 | and $z, %r11d /* z & ... */ | 52 | and $x, %r12d /* x & z */ |
51 | xor $y, %r11d /* y ^ ... */ | 53 | and $y, %r11d /* y & (not z) */ |
52 | mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */ | 54 | mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */ |
53 | add %r11d, $dst /* dst += ... */ | 55 | or %r11d, %r12d /* (y & (not z)) | (x & z) */ |
56 | mov $y, %r11d /* (NEXT STEP) z' = $y */ | ||
57 | add %r12d, $dst /* dst += ... */ | ||
58 | mov $y, %r12d /* (NEXT STEP) z' = $y */ | ||
54 | rol \$$s, $dst /* dst <<< s */ | 59 | rol \$$s, $dst /* dst <<< s */ |
55 | mov $x, %r11d /* (NEXT STEP) y' = $x */ | ||
56 | add $x, $dst /* dst += x */ | 60 | add $x, $dst /* dst += x */ |
57 | EOF | 61 | EOF |
58 | } | 62 | } |
@@ -61,7 +65,7 @@ EOF | |||
61 | # dst = x + ((dst + H(x,y,z) + X[k] + T_i) <<< s) | 65 | # dst = x + ((dst + H(x,y,z) + X[k] + T_i) <<< s) |
62 | # %r10d = X[k_next] | 66 | # %r10d = X[k_next] |
63 | # %r11d = y' (copy of y for the next step) | 67 | # %r11d = y' (copy of y for the next step) |
64 | # Each round3_step() takes about 4.26 clocks (8 instructions, 1.88 IPC) | 68 | # Each round3_step() takes about 4.2 clocks (8 instructions, 1.9 IPC) |
65 | sub round3_step | 69 | sub round3_step |
66 | { | 70 | { |
67 | my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; | 71 | my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; |
@@ -83,7 +87,7 @@ EOF | |||
83 | # dst = x + ((dst + I(x,y,z) + X[k] + T_i) <<< s) | 87 | # dst = x + ((dst + I(x,y,z) + X[k] + T_i) <<< s) |
84 | # %r10d = X[k_next] | 88 | # %r10d = X[k_next] |
85 | # %r11d = not z' (copy of not z for the next step) | 89 | # %r11d = not z' (copy of not z for the next step) |
86 | # Each round4_step() takes about 5.27 clocks (9 instructions, 1.71 IPC) | 90 | # Each round4_step() takes about 5.2 clocks (9 instructions, 1.7 IPC) |
87 | sub round4_step | 91 | sub round4_step |
88 | { | 92 | { |
89 | my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; | 93 | my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; |
@@ -104,8 +108,19 @@ sub round4_step | |||
104 | EOF | 108 | EOF |
105 | } | 109 | } |
106 | 110 | ||
107 | my $output = shift; | 111 | my $flavour = shift; |
108 | open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output"; | 112 | my $output = shift; |
113 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | ||
114 | |||
115 | my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | ||
116 | |||
117 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate; | ||
118 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | ||
119 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | ||
120 | die "can't locate x86_64-xlate.pl"; | ||
121 | |||
122 | no warnings qw(uninitialized); | ||
123 | open STDOUT,"| $^X $xlate $flavour $output"; | ||
109 | 124 | ||
110 | $code .= <<EOF; | 125 | $code .= <<EOF; |
111 | .text | 126 | .text |
@@ -116,8 +131,10 @@ $code .= <<EOF; | |||
116 | md5_block_asm_data_order: | 131 | md5_block_asm_data_order: |
117 | push %rbp | 132 | push %rbp |
118 | push %rbx | 133 | push %rbx |
134 | push %r12 | ||
119 | push %r14 | 135 | push %r14 |
120 | push %r15 | 136 | push %r15 |
137 | .Lprologue: | ||
121 | 138 | ||
122 | # rdi = arg #1 (ctx, MD5_CTX pointer) | 139 | # rdi = arg #1 (ctx, MD5_CTX pointer) |
123 | # rsi = arg #2 (ptr, data pointer) | 140 | # rsi = arg #2 (ptr, data pointer) |
@@ -232,13 +249,120 @@ $code .= <<EOF; | |||
232 | mov %ecx, 2*4(%rbp) # ctx->C = C | 249 | mov %ecx, 2*4(%rbp) # ctx->C = C |
233 | mov %edx, 3*4(%rbp) # ctx->D = D | 250 | mov %edx, 3*4(%rbp) # ctx->D = D |
234 | 251 | ||
252 | mov (%rsp),%r15 | ||
253 | mov 8(%rsp),%r14 | ||
254 | mov 16(%rsp),%r12 | ||
255 | mov 24(%rsp),%rbx | ||
256 | mov 32(%rsp),%rbp | ||
257 | add \$40,%rsp | ||
258 | .Lepilogue: | ||
259 | ret | ||
260 | .size md5_block_asm_data_order,.-md5_block_asm_data_order | ||
261 | EOF | ||
262 | |||
263 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
264 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
265 | if ($win64) { | ||
266 | my $rec="%rcx"; | ||
267 | my $frame="%rdx"; | ||
268 | my $context="%r8"; | ||
269 | my $disp="%r9"; | ||
270 | |||
271 | $code.=<<___; | ||
272 | .extern __imp_RtlVirtualUnwind | ||
273 | .type se_handler,\@abi-omnipotent | ||
274 | .align 16 | ||
275 | se_handler: | ||
276 | push %rsi | ||
277 | push %rdi | ||
278 | push %rbx | ||
279 | push %rbp | ||
280 | push %r12 | ||
281 | push %r13 | ||
282 | push %r14 | ||
283 | push %r15 | ||
284 | pushfq | ||
285 | sub \$64,%rsp | ||
286 | |||
287 | mov 120($context),%rax # pull context->Rax | ||
288 | mov 248($context),%rbx # pull context->Rip | ||
289 | |||
290 | lea .Lprologue(%rip),%r10 | ||
291 | cmp %r10,%rbx # context->Rip<.Lprologue | ||
292 | jb .Lin_prologue | ||
293 | |||
294 | mov 152($context),%rax # pull context->Rsp | ||
295 | |||
296 | lea .Lepilogue(%rip),%r10 | ||
297 | cmp %r10,%rbx # context->Rip>=.Lepilogue | ||
298 | jae .Lin_prologue | ||
299 | |||
300 | lea 40(%rax),%rax | ||
301 | |||
302 | mov -8(%rax),%rbp | ||
303 | mov -16(%rax),%rbx | ||
304 | mov -24(%rax),%r12 | ||
305 | mov -32(%rax),%r14 | ||
306 | mov -40(%rax),%r15 | ||
307 | mov %rbx,144($context) # restore context->Rbx | ||
308 | mov %rbp,160($context) # restore context->Rbp | ||
309 | mov %r12,216($context) # restore context->R12 | ||
310 | mov %r14,232($context) # restore context->R14 | ||
311 | mov %r15,240($context) # restore context->R15 | ||
312 | |||
313 | .Lin_prologue: | ||
314 | mov 8(%rax),%rdi | ||
315 | mov 16(%rax),%rsi | ||
316 | mov %rax,152($context) # restore context->Rsp | ||
317 | mov %rsi,168($context) # restore context->Rsi | ||
318 | mov %rdi,176($context) # restore context->Rdi | ||
319 | |||
320 | mov 40($disp),%rdi # disp->ContextRecord | ||
321 | mov $context,%rsi # context | ||
322 | mov \$154,%ecx # sizeof(CONTEXT) in 8-byte quadwords (repeat count for rep movsq) | ||
323 | .long 0xa548f3fc # cld; rep movsq | ||
324 | |||
325 | mov $disp,%rsi | ||
326 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
327 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
328 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
329 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
330 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
331 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
332 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
333 | mov %r10,32(%rsp) # arg5 | ||
334 | mov %r11,40(%rsp) # arg6 | ||
335 | mov %r12,48(%rsp) # arg7 | ||
336 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
337 | call *__imp_RtlVirtualUnwind(%rip) | ||
338 | |||
339 | mov \$1,%eax # ExceptionContinueSearch | ||
340 | add \$64,%rsp | ||
341 | popfq | ||
235 | pop %r15 | 342 | pop %r15 |
236 | pop %r14 | 343 | pop %r14 |
237 | pop %rbx | 344 | pop %r13 |
345 | pop %r12 | ||
238 | pop %rbp | 346 | pop %rbp |
347 | pop %rbx | ||
348 | pop %rdi | ||
349 | pop %rsi | ||
239 | ret | 350 | ret |
240 | .size md5_block_asm_data_order,.-md5_block_asm_data_order | 351 | .size se_handler,.-se_handler |
241 | EOF | 352 | |
353 | .section .pdata | ||
354 | .align 4 | ||
355 | .rva .LSEH_begin_md5_block_asm_data_order | ||
356 | .rva .LSEH_end_md5_block_asm_data_order | ||
357 | .rva .LSEH_info_md5_block_asm_data_order | ||
358 | |||
359 | .section .xdata | ||
360 | .align 8 | ||
361 | .LSEH_info_md5_block_asm_data_order: | ||
362 | .byte 9,0,0,0 | ||
363 | .rva se_handler | ||
364 | ___ | ||
365 | } | ||
242 | 366 | ||
243 | print $code; | 367 | print $code; |
244 | 368 | ||