From 67cc20d0041a32bee12bd9eb20ae218f91b73f77 Mon Sep 17 00:00:00 2001
From: Mark Adler
Date: Fri, 9 Sep 2011 23:32:36 -0700
Subject: zlib 1.2.4-pre1

---
 contrib/amd64/amd64-match.S | 101 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 98 insertions(+), 3 deletions(-)

(limited to 'contrib/amd64/amd64-match.S')

diff --git a/contrib/amd64/amd64-match.S b/contrib/amd64/amd64-match.S
index b3bf1ac..81d4a1c 100644
--- a/contrib/amd64/amd64-match.S
+++ b/contrib/amd64/amd64-match.S
@@ -52,14 +52,73 @@
 #define save_r13       (64-LocalVarsSize)(%rsp)
 #define save_r15       (80-LocalVarsSize)(%rsp)
 
+
+.globl match_init, longest_match
+
 /*
  * On AMD64 the first argument of a function (in our case -- the pointer to
  * deflate_state structure) is passed in %rdi, hence our offsets below are
  * all off of that.
  */
+
+/* you can check the structure offsets by running
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "deflate.h"
+
+void print_depl()
+{
+deflate_state ds;
+deflate_state *s=&ds;
+printf("size pointer=%u\n",(int)sizeof(void*));
+
+printf("#define dsWSize        (%3u)(%%rdi)\n",(int)(((char*)&(s->w_size))-((char*)s)));
+printf("#define dsWMask        (%3u)(%%rdi)\n",(int)(((char*)&(s->w_mask))-((char*)s)));
+printf("#define dsWindow       (%3u)(%%rdi)\n",(int)(((char*)&(s->window))-((char*)s)));
+printf("#define dsPrev         (%3u)(%%rdi)\n",(int)(((char*)&(s->prev))-((char*)s)));
+printf("#define dsMatchLen     (%3u)(%%rdi)\n",(int)(((char*)&(s->match_length))-((char*)s)));
+printf("#define dsPrevMatch    (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_match))-((char*)s)));
+printf("#define dsStrStart     (%3u)(%%rdi)\n",(int)(((char*)&(s->strstart))-((char*)s)));
+printf("#define dsMatchStart   (%3u)(%%rdi)\n",(int)(((char*)&(s->match_start))-((char*)s)));
+printf("#define dsLookahead    (%3u)(%%rdi)\n",(int)(((char*)&(s->lookahead))-((char*)s)));
+printf("#define dsPrevLen      (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_length))-((char*)s)));
+printf("#define dsMaxChainLen  (%3u)(%%rdi)\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
+printf("#define dsGoodMatch    (%3u)(%%rdi)\n",(int)(((char*)&(s->good_match))-((char*)s)));
+printf("#define dsNiceMatch    (%3u)(%%rdi)\n",(int)(((char*)&(s->nice_match))-((char*)s)));
+}
+
+*/
+
+
+/*
+  to compile for Xcode 3.2 on Mac OS X x86_64
+  - run "gcc -g -c -DXCODE_MAC_X64_STRUCTURE amd64-match.S"
+ */
+
+
+#ifndef CURRENT_LINX_XCODE_MAC_X64_STRUCTURE
+#define dsWSize        ( 68)(%rdi)
+#define dsWMask        ( 76)(%rdi)
+#define dsWindow       ( 80)(%rdi)
+#define dsPrev         ( 96)(%rdi)
+#define dsMatchLen     (144)(%rdi)
+#define dsPrevMatch    (148)(%rdi)
+#define dsStrStart     (156)(%rdi)
+#define dsMatchStart   (160)(%rdi)
+#define dsLookahead    (164)(%rdi)
+#define dsPrevLen      (168)(%rdi)
+#define dsMaxChainLen  (172)(%rdi)
+#define dsGoodMatch    (188)(%rdi)
+#define dsNiceMatch    (192)(%rdi)
+
+#else
+
 #ifndef STRUCT_OFFSET
 # define STRUCT_OFFSET (0)
 #endif
+
+
 #define dsWSize        ( 56 + STRUCT_OFFSET)(%rdi)
 #define dsWMask        ( 64 + STRUCT_OFFSET)(%rdi)
 #define dsWindow       ( 72 + STRUCT_OFFSET)(%rdi)
@@ -74,7 +133,10 @@
 #define dsGoodMatch    (180 + STRUCT_OFFSET)(%rdi)
 #define dsNiceMatch    (184 + STRUCT_OFFSET)(%rdi)
 
-.globl match_init, longest_match
+#endif
+
+
+
 
 .text
 
@@ -222,7 +284,9 @@ LoopEntry:      cmpw    -1(%windowbestlen, %curmatch), %scanendw
  * straightforward "rep cmpsb" would not drastically degrade
  * performance -- unrolling it, for example, makes no difference.
  */
 
+#undef USE_SSE /* works, but is 6-7% slower than non-SSE... */
+
 LoopCmps:
 #ifdef USE_SSE
         /* Preload the SSE registers */
@@ -244,29 +308,55 @@ LoopCmps:
         notw    %ax
         bsfw    %ax, %ax
         jnz     LeaveLoopCmps
-        add     $16, %rdx
+
+        /* this is the only iteration of the loop with a possibility of
+           having incremented rdx by 0x108 (each loop iteration adds
+           16*4 = 0x40, and (0x40*4)+8 = 0x108) */
+        add     $8, %rdx
+        jz      LenMaximum
+        add     $8, %rdx
+
+
         pmovmskb %xmm3, %rax
         notw    %ax
         bsfw    %ax, %ax
         jnz     LeaveLoopCmps
+
+        add     $16, %rdx
+
+
         pmovmskb %xmm5, %rax
         notw    %ax
         bsfw    %ax, %ax
         jnz     LeaveLoopCmps
+        add     $16, %rdx
+
+
         pmovmskb %xmm7, %rax
         notw    %ax
         bsfw    %ax, %ax
        jnz     LeaveLoopCmps
+        add     $16, %rdx
+
         jmp     LoopCmps
 LeaveLoopCmps:  add     %rax, %rdx
 #else
         mov     (%windowbestlen, %rdx), %rax
         xor     (%prev, %rdx), %rax
         jnz     LeaveLoopCmps
-        add     $8, %rdx
+
+        mov     8(%windowbestlen, %rdx), %rax
+        xor     8(%prev, %rdx), %rax
+        jnz     LeaveLoopCmps8
+
+        mov     16(%windowbestlen, %rdx), %rax
+        xor     16(%prev, %rdx), %rax
+        jnz     LeaveLoopCmps16
+
+        add     $24, %rdx
         jnz     LoopCmps
         jmp     LenMaximum
 # if 0
@@ -274,10 +364,15 @@ LeaveLoopCmps:  add     %rax, %rdx
  * This three-liner is tantalizingly simple, but bsf is a slow instruction,
  * and the complicated alternative down below is quite a bit faster. Sad...
  */
+
 LeaveLoopCmps:  bsf     %rax, %rax /* find the first non-zero bit */
         shrl    $3, %eax /* divide by 8 to get the byte */
         add     %rax, %rdx
 # else
+LeaveLoopCmps16:
+        add     $8, %rdx
+LeaveLoopCmps8:
+        add     $8, %rdx
 LeaveLoopCmps:
         testl   $0xFFFFFFFF, %eax /* Check the first 4 bytes */
         jnz     Check16
         add     $4, %rdx
-- 
cgit v1.2.3-55-g6feb
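
The hard-coded dsXXX offsets in the first hunk come from the print_depl()
helper quoted in the patch comment. A minimal sketch of a standalone driver
for it, assuming it is compiled inside the zlib source tree so that
"deflate.h" resolves; the file name offsets.c and the shortened field list
are illustrative, not part of the patch:

/* offsets.c -- print deflate_state field offsets for amd64-match.S.
   Build and run from the zlib source directory:
       gcc -I. -o offsets offsets.c && ./offsets
   Add one printf per dsXXX macro; only two fields are shown here. */
#include <stdio.h>
#include "deflate.h"

int main(void)
{
    deflate_state ds;
    deflate_state *s = &ds;

    printf("size pointer=%u\n", (unsigned)sizeof(void *));
    printf("#define dsWSize      (%3u)(%%rdi)\n",
           (unsigned)((char *)&s->w_size - (char *)s));
    printf("#define dsNiceMatch  (%3u)(%%rdi)\n",
           (unsigned)((char *)&s->nice_match - (char *)s));
    return 0;
}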
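
In the USE_SSE path, each pcmpeqb/pmovmskb pair produces a 16-bit mask with
a 1 bit for every byte that matches; notw inverts the mask and bsfw locates
the first mismatching byte. A sketch of one 16-byte step of that comparison
using SSE2 intrinsics -- first_diff16 is an illustrative name, and
__builtin_ctz assumes GCC or Clang:

#include <emmintrin.h>

/* Returns 16 if the two 16-byte blocks are equal, otherwise the index
   of the first byte that differs. */
static int first_diff16(const unsigned char *a, const unsigned char *b)
{
    __m128i va = _mm_loadu_si128((const __m128i *)a);
    __m128i vb = _mm_loadu_si128((const __m128i *)b);
    /* pcmpeqb + pmovmskb: bit i is set iff a[i] == b[i] */
    int eq = _mm_movemask_epi8(_mm_cmpeq_epi8(va, vb));
    int ne = ~eq & 0xFFFF;        /* notw: set bits now mark mismatches */
    if (ne == 0)
        return 16;                /* all 16 bytes equal, keep scanning */
    return __builtin_ctz(ne);     /* bsfw: position of first mismatch */
}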
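
The non-SSE change unrolls the comparison loop to three 8-byte words per
iteration; the LeaveLoopCmps8/LeaveLoopCmps16 labels add the skipped 8 or
16 bytes back to %rdx before the differing byte is located. A C sketch of
the underlying technique, assuming a little-endian target and the GCC/Clang
__builtin_ctzll; match_length is an illustrative name:

#include <stdint.h>
#include <string.h>
#include <stddef.h>

/* Length of the common prefix of a and b, at most max bytes. */
static size_t match_length(const unsigned char *a, const unsigned char *b,
                           size_t max)
{
    size_t i = 0;
    while (i + 8 <= max) {            /* the patch unrolls this loop 3x */
        uint64_t wa, wb, x;
        memcpy(&wa, a + i, 8);        /* unaligned-safe 64-bit loads */
        memcpy(&wb, b + i, 8);
        x = wa ^ wb;                  /* non-zero iff the words differ */
        if (x != 0)                   /* lowest set bit -> first bad byte */
            return i + (size_t)(__builtin_ctzll(x) >> 3);
        i += 8;
    }
    while (i < max && a[i] == b[i])   /* byte-at-a-time tail */
        i++;
    return i;
}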