aboutsummaryrefslogtreecommitdiff
path: root/libbb/hash_md5_sha_x86-64.S.sh
diff options
context:
space:
mode:
Diffstat (limited to 'libbb/hash_md5_sha_x86-64.S.sh')
-rwxr-xr-xlibbb/hash_md5_sha_x86-64.S.sh36
1 files changed, 29 insertions, 7 deletions
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index 7e50b64fb..901896e6e 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -6,13 +6,35 @@
6# also contains the diff of the generated file. 6# also contains the diff of the generated file.
7exec >hash_md5_sha_x86-64.S 7exec >hash_md5_sha_x86-64.S
8 8
9# There is a way to use XMM registers (which always exist for x86-64!) for W[]
10# For example, if we load W as follows:
11# %xmm0: w[0x0] w[0x1] w[0x2] w[0x3]
12# %xmm4: w[0x4] w[0x5] w[0x6] w[0x7]
13# %xmm8: w[0x8] w[0x9] w[0xa] w[0xb]
14# %xmm12: w[0xc] w[0xd] w[0xe] w[0xf]
15# then the xor'ing operation to generate next W[0..3] is:
16# movaps %xmm4, %xmmT2
17# palignr $0x8, %xmm0, %xmmT2 # form (w[0x2],w[0x3],w[0x4],w[0x5])
18# # Right-shifts xmmT2:xmm0 by 8 bytes. Writes the low 128 bits of the shifted result to xmmT2. SSSE3 insn.
19# movaps %xmm0, %xmmT13
20# palignr $0x4,%xmm12,%xmmT13 # form (w[0xd],w[0xe],w[0xf],w[0x0])
21# xmm0 = xmm0 ^ t2 ^ xmm8 ^ t13
22# xmm0 = rol32(xmm0,1) # no such insn, have to use pslld+psrld+por
23# and then results can be extracted for use:
24# movd %xmm0, %esi # new W[0]
25# pextrd $1, %xmm0, %esi # new W[1]
26# # SSE4.1 insn. Can use EXTRACTPS (also SSE4.1)
27# pextrd $2, %xmm0, %esi # new W[2]
28# pextrd $3, %xmm0, %esi # new W[3]
29# ... but this requires SSE4.1 and SSSE3, which are not universally available on x86-64.
30
9echo \ 31echo \
10'### Generated by hash_md5_sha_x86-64.S.sh ### 32'### Generated by hash_md5_sha_x86-64.S.sh ###
11 33
12#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) 34#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
13 .section .text.sha1_process_block64,"ax",@progbits 35 .section .text.sha1_process_block64,"ax",@progbits
14 .globl sha1_process_block64 36 .globl sha1_process_block64
15 .hidden sha1_process_block64 37 .hidden sha1_process_block64
16 .type sha1_process_block64, @function 38 .type sha1_process_block64, @function
17 39
18 .balign 8 # allow decoders to fetch at least 5 first insns 40 .balign 8 # allow decoders to fetch at least 5 first insns
@@ -265,15 +287,15 @@ RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx b
265echo " 287echo "
266 popq %rdi # 288 popq %rdi #
267 popq %r12 # 289 popq %r12 #
268 addl %eax, 80(%rdi) # ctx->hash[0] += a 290 addl %eax, 80(%rdi) # ctx->hash[0] += a
269 popq %r13 # 291 popq %r13 #
270 addl %ebx, 84(%rdi) # ctx->hash[1] += b 292 addl %ebx, 84(%rdi) # ctx->hash[1] += b
271 popq %r14 # 293 popq %r14 #
272 addl %ecx, 88(%rdi) # ctx->hash[2] += c 294 addl %ecx, 88(%rdi) # ctx->hash[2] += c
273 popq %r15 # 295 popq %r15 #
274 addl %edx, 92(%rdi) # ctx->hash[3] += d 296 addl %edx, 92(%rdi) # ctx->hash[3] += d
275 popq %rbx # 297 popq %rbx #
276 addl %ebp, 96(%rdi) # ctx->hash[4] += e 298 addl %ebp, 96(%rdi) # ctx->hash[4] += e
277 popq %rbp # 299 popq %rbp #
278 300
279 ret 301 ret