Diffstat (limited to 'contrib/amd64')
-rw-r--r--  contrib/amd64/amd64-match.S  |  101
1 file changed, 98 insertions(+), 3 deletions(-)
diff --git a/contrib/amd64/amd64-match.S b/contrib/amd64/amd64-match.S
index b3bf1ac..81d4a1c 100644
--- a/contrib/amd64/amd64-match.S
+++ b/contrib/amd64/amd64-match.S
@@ -52,14 +52,73 @@
 #define save_r13 (64-LocalVarsSize)(%rsp)
 #define save_r15 (80-LocalVarsSize)(%rsp)
 
+
+.globl match_init, longest_match
+
 /*
  * On AMD64 the first argument of a function (in our case -- the pointer to
  * deflate_state structure) is passed in %rdi, hence our offsets below are
  * all off of that.
  */
+
+/* you can check the structure offset by running
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "deflate.h"
+
+void print_depl()
+{
+deflate_state ds;
+deflate_state *s=&ds;
+printf("size pointer=%u\n",(int)sizeof(void*));
+
+printf("#define dsWSize (%3u)(%%rdi)\n",(int)(((char*)&(s->w_size))-((char*)s)));
+printf("#define dsWMask (%3u)(%%rdi)\n",(int)(((char*)&(s->w_mask))-((char*)s)));
+printf("#define dsWindow (%3u)(%%rdi)\n",(int)(((char*)&(s->window))-((char*)s)));
+printf("#define dsPrev (%3u)(%%rdi)\n",(int)(((char*)&(s->prev))-((char*)s)));
+printf("#define dsMatchLen (%3u)(%%rdi)\n",(int)(((char*)&(s->match_length))-((char*)s)));
+printf("#define dsPrevMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_match))-((char*)s)));
+printf("#define dsStrStart (%3u)(%%rdi)\n",(int)(((char*)&(s->strstart))-((char*)s)));
+printf("#define dsMatchStart (%3u)(%%rdi)\n",(int)(((char*)&(s->match_start))-((char*)s)));
+printf("#define dsLookahead (%3u)(%%rdi)\n",(int)(((char*)&(s->lookahead))-((char*)s)));
+printf("#define dsPrevLen (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_length))-((char*)s)));
+printf("#define dsMaxChainLen (%3u)(%%rdi)\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
+printf("#define dsGoodMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->good_match))-((char*)s)));
+printf("#define dsNiceMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->nice_match))-((char*)s)));
+}
+
+*/
+
+
+/*
+  to compile for XCode 3.2 on MacOSX x86_64
+  - run "gcc -g -c -DXCODE_MAC_X64_STRUCTURE amd64-match.S"
+ */
+
+
+#ifndef CURRENT_LINX_XCODE_MAC_X64_STRUCTURE
+#define dsWSize ( 68)(%rdi)
+#define dsWMask ( 76)(%rdi)
+#define dsWindow ( 80)(%rdi)
+#define dsPrev ( 96)(%rdi)
+#define dsMatchLen (144)(%rdi)
+#define dsPrevMatch (148)(%rdi)
+#define dsStrStart (156)(%rdi)
+#define dsMatchStart (160)(%rdi)
+#define dsLookahead (164)(%rdi)
+#define dsPrevLen (168)(%rdi)
+#define dsMaxChainLen (172)(%rdi)
+#define dsGoodMatch (188)(%rdi)
+#define dsNiceMatch (192)(%rdi)
+
+#else
+
 #ifndef STRUCT_OFFSET
 # define STRUCT_OFFSET (0)
 #endif
+
+
 #define dsWSize ( 56 + STRUCT_OFFSET)(%rdi)
 #define dsWMask ( 64 + STRUCT_OFFSET)(%rdi)
 #define dsWindow ( 72 + STRUCT_OFFSET)(%rdi)
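The print_depl() routine quoted in the comment above derives each field offset by pointer arithmetic against a local deflate_state. A minimal sketch of the same check using the standard offsetof macro is shown below; it is illustrative only, not part of the patch, and it assumes zlib's deflate.h (with the field names used above) is on the include path:

    /* offsets.c -- print a few of the dsXxx offsets used by amd64-match.S.
       Sketch only: deflate_state and its field names come from zlib's deflate.h. */
    #include <stddef.h>
    #include <stdio.h>
    #include "deflate.h"

    int main(void)
    {
        printf("#define dsWSize      (%3zu)(%%rdi)\n", offsetof(deflate_state, w_size));
        printf("#define dsWMask      (%3zu)(%%rdi)\n", offsetof(deflate_state, w_mask));
        printf("#define dsWindow     (%3zu)(%%rdi)\n", offsetof(deflate_state, window));
        printf("#define dsPrev       (%3zu)(%%rdi)\n", offsetof(deflate_state, prev));
        printf("#define dsMatchLen   (%3zu)(%%rdi)\n", offsetof(deflate_state, match_length));
        printf("#define dsStrStart   (%3zu)(%%rdi)\n", offsetof(deflate_state, strstart));
        printf("#define dsNiceMatch  (%3zu)(%%rdi)\n", offsetof(deflate_state, nice_match));
        return 0;
    }

Compiled from the zlib source directory (e.g. cc -I. offsets.c), the output should match whichever of the two blocks of defines above applies to the local struct layout.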
@@ -74,7 +133,10 @@
 #define dsGoodMatch (180 + STRUCT_OFFSET)(%rdi)
 #define dsNiceMatch (184 + STRUCT_OFFSET)(%rdi)
 
-.globl match_init, longest_match
+#endif
+
+
+
 
 .text
 
@@ -222,7 +284,9 @@ LoopEntry: cmpw -1(%windowbestlen, %curmatch), %scanendw
  * straightforward "rep cmpsb" would not drastically degrade
  * performance -- unrolling it, for example, makes no difference.
  */
+
 #undef USE_SSE /* works, but is 6-7% slower, than non-SSE... */
+
 LoopCmps:
 #ifdef USE_SSE
         /* Preload the SSE registers */
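For readers following the (disabled) USE_SSE path, the pcmpeqb/pmovmskb/notw/bsfw sequence it uses amounts to a 16-byte compare that reports the first mismatching byte. A rough C sketch of that idea follows; the helper name first_mismatch16 is hypothetical, the intrinsics are the standard SSE2 ones from <emmintrin.h>, and __builtin_ctz assumes a gcc/clang toolchain as suggested by the build note above:

    #include <emmintrin.h>   /* SSE2 intrinsics */

    /* Compare 16 bytes of a[] and b[]: return 16 if all bytes match,
       otherwise the index of the first byte that differs. */
    static int first_mismatch16(const unsigned char *a, const unsigned char *b)
    {
        __m128i va = _mm_loadu_si128((const __m128i *)a);
        __m128i vb = _mm_loadu_si128((const __m128i *)b);
        int eq = _mm_movemask_epi8(_mm_cmpeq_epi8(va, vb)); /* bit i set if byte i is equal */
        int ne = ~eq & 0xFFFF;                              /* bit i set if byte i differs */
        return ne ? __builtin_ctz(ne) : 16;                 /* the bsfw step */
    }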
@@ -244,29 +308,55 @@ LoopCmps:
         notw    %ax
         bsfw    %ax, %ax
         jnz     LeaveLoopCmps
-        add     $16, %rdx
+
+        /* this is the only iteration of the loop with a possibility of having
+           incremented rdx by 0x108 (each loop iteration add 16*4 = 0x40
+           and (0x40*4)+8=0x108 */
+        add     $8, %rdx
+        jz      LenMaximum
+        add     $8, %rdx
+
+
         pmovmskb %xmm3, %rax
         notw    %ax
         bsfw    %ax, %ax
         jnz     LeaveLoopCmps
+
+
         add     $16, %rdx
+
+
         pmovmskb %xmm5, %rax
         notw    %ax
         bsfw    %ax, %ax
         jnz     LeaveLoopCmps
+
         add     $16, %rdx
+
+
         pmovmskb %xmm7, %rax
         notw    %ax
         bsfw    %ax, %ax
         jnz     LeaveLoopCmps
+
         add     $16, %rdx
+
         jmp     LoopCmps
 LeaveLoopCmps: add     %rax, %rdx
 #else
         mov     (%windowbestlen, %rdx), %rax
         xor     (%prev, %rdx), %rax
         jnz     LeaveLoopCmps
-        add     $8, %rdx
+
+        mov     8(%windowbestlen, %rdx), %rax
+        xor     8(%prev, %rdx), %rax
+        jnz     LeaveLoopCmps8
+
+        mov     16(%windowbestlen, %rdx), %rax
+        xor     16(%prev, %rdx), %rax
+        jnz     LeaveLoopCmps16
+
+        add     $24, %rdx
+        jnz     LoopCmps
 jmp     LenMaximum
# if 0
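The non-SSE change above unrolls the compare loop to three 8-byte xor tests per iteration, branching to the new LeaveLoopCmps8/LeaveLoopCmps16 labels so the common exit code still knows how many bytes were already confirmed equal. A hedged C sketch of the same word-at-a-time idea is shown below; the helper name match_bytes and its memcpy-based loads are illustrative assumptions, not code taken from the patch:

    #include <stdint.h>
    #include <string.h>

    /* Count how many leading bytes of a[] and b[] agree, scanning 8 bytes
       per step -- the same idea as the mov/xor/jnz sequence above. */
    static size_t match_bytes(const unsigned char *a, const unsigned char *b,
                              size_t max)
    {
        size_t n = 0;
        while (n + 8 <= max) {
            uint64_t wa, wb;
            memcpy(&wa, a + n, 8);           /* unaligned-safe 8-byte loads */
            memcpy(&wb, b + n, 8);
            if (wa != wb) {
                uint64_t diff = wa ^ wb;     /* non-zero byte marks the mismatch */
                while ((diff & 0xff) == 0) { /* x86-64 is little-endian */
                    diff >>= 8;
                    n++;
                }
                return n;
            }
            n += 8;
        }
        while (n < max && a[n] == b[n])
            n++;
        return n;
    }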
@@ -274,10 +364,15 @@ LeaveLoopCmps: add     %rax, %rdx
  * This three-liner is tantalizingly simple, but bsf is a slow instruction,
  * and the complicated alternative down below is quite a bit faster. Sad...
  */
+
 LeaveLoopCmps: bsf     %rax, %rax /* find the first non-zero bit */
         shrl    $3, %eax /* divide by 8 to get the byte */
         add     %rax, %rdx
 # else
+LeaveLoopCmps16:
+        add     $8, %rdx
+LeaveLoopCmps8:
+        add     $8, %rdx
 LeaveLoopCmps: testl   $0xFFFFFFFF, %eax /* Check the first 4 bytes */
         jnz     Check16
         add     $4, %rdx
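The new LeaveLoopCmps16/LeaveLoopCmps8 entry points simply credit the 16 or 8 bytes already known to match before falling into the shared code that locates the first differing byte within the 64-bit xor result. What the bsf/shrl variant above computes can be sketched in C as follows; this is an illustrative equivalent only, and __builtin_ctzll assumes a gcc/clang toolchain:

    #include <stdint.h>

    /* Given diff = wa ^ wb with diff != 0, return the index of the first
       differing byte.  On little-endian x86-64 the lowest set bit belongs
       to the first differing byte, so bit index / 8 gives the byte index. */
    static int first_diff_byte(uint64_t diff)
    {
        return __builtin_ctzll(diff) >> 3;
    }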