diff options
Diffstat (limited to 'libbb/hash_md5_sha_x86-64.S.sh')
-rwxr-xr-x | libbb/hash_md5_sha_x86-64.S.sh | 36 |
1 files changed, 29 insertions, 7 deletions
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index 7e50b64fb..901896e6e 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh | |||
@@ -6,13 +6,35 @@ | |||
6 | # also contains the diff of the generated file. | 6 | # also contains the diff of the generated file. |
7 | exec >hash_md5_sha_x86-64.S | 7 | exec >hash_md5_sha_x86-64.S |
8 | 8 | ||
9 | # There is a way to use XMM registers (which always exist for x86-64!) for W[] | ||
10 | # For example, if we load W as follows: | ||
11 | # %xmm0: w[0x0] w[0x1] w[0x2] w[0x3] | ||
12 | # %xmm4: w[0x4] w[0x5] w[0x6] w[0x7] | ||
13 | # %xmm8: w[0x8] w[0x9] w[0xa] w[0xb] | ||
14 | # %xmm12: w[0xc] w[0xd] w[0xe] w[0xf] | ||
15 | # then the xor'ing operation to generate next W[0..3] is: | ||
16 | # movaps %xmm4, %xmmT2 | |
17 | # palignr $0x8, %xmm0, %xmmT2 # form (w[0x2],w[0x3],w[0x4],w[0x5]) | |
18 | # # Right-shifts xmmT2:xmm0 (xmmT2 is the high half) by 8 bytes. Writes shifted result to xmmT2. SSSE3 insn. | |
19 | # movaps %xmm0, %xmmT13 | ||
20 | # palignr $0x4,%xmm12,%xmmT13 # form (w[0xd],w[0xe],w[0xf],w[0x0]) | |
21 | # xmm0 = xmm0 ^ t2 ^ xmm8 ^ t13 | ||
22 | # xmm0 = rol32(xmm0,1) # no such insn, have to use pslld+psrld+or | ||
23 | # and then results can be extracted for use: | ||
24 | # movd %xmm0, %esi # new W[0] | ||
25 | # pextrd $1, %xmm0, %esi # new W[1] | ||
26 | # # SSE4.1 insn. Can use EXTRACTPS (also SSE4.1) | ||
27 | # pextrd $2, %xmm0, %esi # new W[2] | ||
28 | # pextrd $3, %xmm0, %esi # new W[3] | ||
29 | # ... but this requires SSE4.1 and SSSE3, which are not universally available on x86-64. | ||
30 | |||
9 | echo \ | 31 | echo \ |
10 | '### Generated by hash_md5_sha_x86-64.S.sh ### | 32 | '### Generated by hash_md5_sha_x86-64.S.sh ### |
11 | 33 | ||
12 | #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) | 34 | #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) |
13 | .section .text.sha1_process_block64,"ax",@progbits | 35 | .section .text.sha1_process_block64,"ax",@progbits |
14 | .globl sha1_process_block64 | 36 | .globl sha1_process_block64 |
15 | .hidden sha1_process_block64 | 37 | .hidden sha1_process_block64 |
16 | .type sha1_process_block64, @function | 38 | .type sha1_process_block64, @function |
17 | 39 | ||
18 | .balign 8 # allow decoders to fetch at least 5 first insns | 40 | .balign 8 # allow decoders to fetch at least 5 first insns |
@@ -265,15 +287,15 @@ RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx b | |||
265 | echo " | 287 | echo " |
266 | popq %rdi # | 288 | popq %rdi # |
267 | popq %r12 # | 289 | popq %r12 # |
268 | addl %eax, 80(%rdi) # ctx->hash[0] += a | 290 | addl %eax, 80(%rdi) # ctx->hash[0] += a |
269 | popq %r13 # | 291 | popq %r13 # |
270 | addl %ebx, 84(%rdi) # ctx->hash[1] += b | 292 | addl %ebx, 84(%rdi) # ctx->hash[1] += b |
271 | popq %r14 # | 293 | popq %r14 # |
272 | addl %ecx, 88(%rdi) # ctx->hash[2] += c | 294 | addl %ecx, 88(%rdi) # ctx->hash[2] += c |
273 | popq %r15 # | 295 | popq %r15 # |
274 | addl %edx, 92(%rdi) # ctx->hash[3] += d | 296 | addl %edx, 92(%rdi) # ctx->hash[3] += d |
275 | popq %rbx # | 297 | popq %rbx # |
276 | addl %ebp, 96(%rdi) # ctx->hash[4] += e | 298 | addl %ebp, 96(%rdi) # ctx->hash[4] += e |
277 | popq %rbp # | 299 | popq %rbp # |
278 | 300 | ||
279 | ret | 301 | ret |