Diffstat (limited to 'contrib/amd64')
-rw-r--r--  contrib/amd64/amd64-match.S  |  101
1 file changed, 98 insertions(+), 3 deletions(-)
diff --git a/contrib/amd64/amd64-match.S b/contrib/amd64/amd64-match.S
index b3bf1ac..81d4a1c 100644
--- a/contrib/amd64/amd64-match.S
+++ b/contrib/amd64/amd64-match.S
@@ -52,14 +52,73 @@
 #define save_r13 (64-LocalVarsSize)(%rsp)
 #define save_r15 (80-LocalVarsSize)(%rsp)
 
+
+.globl match_init, longest_match
+
 /*
  * On AMD64 the first argument of a function (in our case -- the pointer to
  * deflate_state structure) is passed in %rdi, hence our offsets below are
  * all off of that.
  */
+
+/* you can check the structure offset by running
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "deflate.h"
+
+void print_depl()
+{
+deflate_state ds;
+deflate_state *s=&ds;
+printf("size pointer=%u\n",(int)sizeof(void*));
+
+printf("#define dsWSize (%3u)(%%rdi)\n",(int)(((char*)&(s->w_size))-((char*)s)));
+printf("#define dsWMask (%3u)(%%rdi)\n",(int)(((char*)&(s->w_mask))-((char*)s)));
+printf("#define dsWindow (%3u)(%%rdi)\n",(int)(((char*)&(s->window))-((char*)s)));
+printf("#define dsPrev (%3u)(%%rdi)\n",(int)(((char*)&(s->prev))-((char*)s)));
+printf("#define dsMatchLen (%3u)(%%rdi)\n",(int)(((char*)&(s->match_length))-((char*)s)));
+printf("#define dsPrevMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_match))-((char*)s)));
+printf("#define dsStrStart (%3u)(%%rdi)\n",(int)(((char*)&(s->strstart))-((char*)s)));
+printf("#define dsMatchStart (%3u)(%%rdi)\n",(int)(((char*)&(s->match_start))-((char*)s)));
+printf("#define dsLookahead (%3u)(%%rdi)\n",(int)(((char*)&(s->lookahead))-((char*)s)));
+printf("#define dsPrevLen (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_length))-((char*)s)));
+printf("#define dsMaxChainLen (%3u)(%%rdi)\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
+printf("#define dsGoodMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->good_match))-((char*)s)));
+printf("#define dsNiceMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->nice_match))-((char*)s)));
+}
+
+*/
+
+
+/*
+  to compile for XCode 3.2 on MacOSX x86_64
+  - run "gcc -g -c -DXCODE_MAC_X64_STRUCTURE amd64-match.S"
+ */
+
+
+#ifndef CURRENT_LINX_XCODE_MAC_X64_STRUCTURE
+#define dsWSize ( 68)(%rdi)
+#define dsWMask ( 76)(%rdi)
+#define dsWindow ( 80)(%rdi)
+#define dsPrev ( 96)(%rdi)
+#define dsMatchLen (144)(%rdi)
+#define dsPrevMatch (148)(%rdi)
+#define dsStrStart (156)(%rdi)
+#define dsMatchStart (160)(%rdi)
+#define dsLookahead (164)(%rdi)
+#define dsPrevLen (168)(%rdi)
+#define dsMaxChainLen (172)(%rdi)
+#define dsGoodMatch (188)(%rdi)
+#define dsNiceMatch (192)(%rdi)
+
+#else
+
 #ifndef STRUCT_OFFSET
 # define STRUCT_OFFSET (0)
 #endif
+
+
 #define dsWSize ( 56 + STRUCT_OFFSET)(%rdi)
 #define dsWMask ( 64 + STRUCT_OFFSET)(%rdi)
 #define dsWindow ( 72 + STRUCT_OFFSET)(%rdi)
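The print_depl() routine quoted in the comment above derives each field offset by pointer arithmetic against a local deflate_state. A minimal sketch of the same check using the standard offsetof macro is shown below; it is illustrative only, not part of the patch, and it assumes zlib's deflate.h (with the field names used above) is on the include path:

    /* offsets.c -- print a few of the dsXxx offsets used by amd64-match.S.
       Sketch only: deflate_state and its field names come from zlib's deflate.h. */
    #include <stddef.h>
    #include <stdio.h>
    #include "deflate.h"

    int main(void)
    {
        printf("#define dsWSize      (%3zu)(%%rdi)\n", offsetof(deflate_state, w_size));
        printf("#define dsWMask      (%3zu)(%%rdi)\n", offsetof(deflate_state, w_mask));
        printf("#define dsWindow     (%3zu)(%%rdi)\n", offsetof(deflate_state, window));
        printf("#define dsPrev       (%3zu)(%%rdi)\n", offsetof(deflate_state, prev));
        printf("#define dsMatchLen   (%3zu)(%%rdi)\n", offsetof(deflate_state, match_length));
        printf("#define dsStrStart   (%3zu)(%%rdi)\n", offsetof(deflate_state, strstart));
        printf("#define dsNiceMatch  (%3zu)(%%rdi)\n", offsetof(deflate_state, nice_match));
        return 0;
    }

Compiled from the zlib source directory (e.g. cc -I. offsets.c), the output should match whichever of the two blocks of defines above applies to the local struct layout.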
@@ -74,7 +133,10 @@
 #define dsGoodMatch (180 + STRUCT_OFFSET)(%rdi)
 #define dsNiceMatch (184 + STRUCT_OFFSET)(%rdi)
 
-.globl match_init, longest_match
+#endif
+
+
+
 
 .text
 
@@ -222,7 +284,9 @@ LoopEntry: cmpw -1(%windowbestlen, %curmatch), %scanendw
  * straightforward "rep cmpsb" would not drastically degrade
  * performance -- unrolling it, for example, makes no difference.
  */
+
 #undef USE_SSE /* works, but is 6-7% slower, than non-SSE... */
+
 LoopCmps:
 #ifdef USE_SSE
         /* Preload the SSE registers */
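For readers following the (disabled) USE_SSE path, the pcmpeqb/pmovmskb/notw/bsfw sequence it uses amounts to a 16-byte compare that reports the first mismatching byte. A rough C sketch of that idea follows; the helper name first_mismatch16 is hypothetical, the intrinsics are the standard SSE2 ones from <emmintrin.h>, and __builtin_ctz assumes a gcc/clang toolchain as suggested by the build note above:

    #include <emmintrin.h>   /* SSE2 intrinsics */

    /* Compare 16 bytes of a[] and b[]: return 16 if all bytes match,
       otherwise the index of the first byte that differs. */
    static int first_mismatch16(const unsigned char *a, const unsigned char *b)
    {
        __m128i va = _mm_loadu_si128((const __m128i *)a);
        __m128i vb = _mm_loadu_si128((const __m128i *)b);
        int eq = _mm_movemask_epi8(_mm_cmpeq_epi8(va, vb)); /* bit i set if byte i is equal */
        int ne = ~eq & 0xFFFF;                              /* bit i set if byte i differs */
        return ne ? __builtin_ctz(ne) : 16;                 /* the bsfw step */
    }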
@@ -244,29 +308,55 @@ LoopCmps:
         notw    %ax
         bsfw    %ax, %ax
         jnz     LeaveLoopCmps
-        add     $16, %rdx
+
+        /* this is the only iteration of the loop with a possibility of having
+           incremented rdx by 0x108 (each loop iteration add 16*4 = 0x40
+           and (0x40*4)+8=0x108 */
+        add     $8, %rdx
+        jz      LenMaximum
+        add     $8, %rdx
+
+
         pmovmskb %xmm3, %rax
         notw    %ax
         bsfw    %ax, %ax
         jnz     LeaveLoopCmps
+
+
         add     $16, %rdx
+
+
         pmovmskb %xmm5, %rax
         notw    %ax
         bsfw    %ax, %ax
         jnz     LeaveLoopCmps
+
         add     $16, %rdx
+
+
         pmovmskb %xmm7, %rax
         notw    %ax
         bsfw    %ax, %ax
         jnz     LeaveLoopCmps
+
         add     $16, %rdx
+
         jmp     LoopCmps
 LeaveLoopCmps: add     %rax, %rdx
 #else
         mov     (%windowbestlen, %rdx), %rax
         xor     (%prev, %rdx), %rax
         jnz     LeaveLoopCmps
-        add     $8, %rdx
+
+        mov     8(%windowbestlen, %rdx), %rax
+        xor     8(%prev, %rdx), %rax
+        jnz     LeaveLoopCmps8
+
+        mov     16(%windowbestlen, %rdx), %rax
+        xor     16(%prev, %rdx), %rax
+        jnz     LeaveLoopCmps16
+
+        add     $24, %rdx
+        jnz     LoopCmps
 jmp     LenMaximum
# if 0
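The non-SSE change above unrolls the compare loop to three 8-byte xor tests per iteration, branching to the new LeaveLoopCmps8/LeaveLoopCmps16 labels so the common exit code still knows how many bytes were already confirmed equal. A hedged C sketch of the same word-at-a-time idea is shown below; the helper name match_bytes and its memcpy-based loads are illustrative assumptions, not code taken from the patch:

    #include <stdint.h>
    #include <string.h>

    /* Count how many leading bytes of a[] and b[] agree, scanning 8 bytes
       per step -- the same idea as the mov/xor/jnz sequence above. */
    static size_t match_bytes(const unsigned char *a, const unsigned char *b,
                              size_t max)
    {
        size_t n = 0;
        while (n + 8 <= max) {
            uint64_t wa, wb;
            memcpy(&wa, a + n, 8);           /* unaligned-safe 8-byte loads */
            memcpy(&wb, b + n, 8);
            if (wa != wb) {
                uint64_t diff = wa ^ wb;     /* non-zero byte marks the mismatch */
                while ((diff & 0xff) == 0) { /* x86-64 is little-endian */
                    diff >>= 8;
                    n++;
                }
                return n;
            }
            n += 8;
        }
        while (n < max && a[n] == b[n])
            n++;
        return n;
    }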
@@ -274,10 +364,15 @@ LeaveLoopCmps: add     %rax, %rdx
  * This three-liner is tantalizingly simple, but bsf is a slow instruction,
  * and the complicated alternative down below is quite a bit faster. Sad...
  */
+
 LeaveLoopCmps: bsf     %rax, %rax /* find the first non-zero bit */
         shrl    $3, %eax /* divide by 8 to get the byte */
         add     %rax, %rdx
 # else
+LeaveLoopCmps16:
+        add     $8, %rdx
+LeaveLoopCmps8:
+        add     $8, %rdx
 LeaveLoopCmps: testl   $0xFFFFFFFF, %eax /* Check the first 4 bytes */
         jnz     Check16
         add     $4, %rdx
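The new LeaveLoopCmps16/LeaveLoopCmps8 entry points simply credit the 16 or 8 bytes already known to match before falling into the shared code that locates the first differing byte within the 64-bit xor result. What the bsf/shrl variant above computes can be sketched in C as follows; this is an illustrative equivalent only, and __builtin_ctzll assumes a gcc/clang toolchain:

    #include <stdint.h>

    /* Given diff = wa ^ wb with diff != 0, return the index of the first
       differing byte.  On little-endian x86-64 the lowest set bit belongs
       to the first differing byte, so bit index / 8 gives the byte index. */
    static int first_diff_byte(uint64_t diff)
    {
        return __builtin_ctzll(diff) >> 3;
    }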