author | Mark Adler <madler@alumni.caltech.edu> | 2011-09-09 23:32:36 -0700
committer | Mark Adler <madler@alumni.caltech.edu> | 2011-09-09 23:32:36 -0700
commit | 67cc20d0041a32bee12bd9eb20ae218f91b73f77 (patch)
tree | d7e1b94bd15c30efd57cf9036f5fe89306b6bba0 /contrib/amd64/amd64-match.S
parent | 7751bd4c715ea8478113e34b49b5a794a4642e8e (diff)
download | zlib-1.2.4-pre1.tar.gz zlib-1.2.4-pre1.tar.bz2 zlib-1.2.4-pre1.zip

zlib 1.2.4-pre1 (tag: v1.2.4-pre1)
Diffstat (limited to 'contrib/amd64/amd64-match.S')
-rw-r--r-- | contrib/amd64/amd64-match.S | 101 |
1 file changed, 98 insertions, 3 deletions
diff --git a/contrib/amd64/amd64-match.S b/contrib/amd64/amd64-match.S
index b3bf1ac..81d4a1c 100644
--- a/contrib/amd64/amd64-match.S
+++ b/contrib/amd64/amd64-match.S
@@ -52,14 +52,73 @@ | |||
52 | #define save_r13 (64-LocalVarsSize)(%rsp) | 52 | #define save_r13 (64-LocalVarsSize)(%rsp) |
53 | #define save_r15 (80-LocalVarsSize)(%rsp) | 53 | #define save_r15 (80-LocalVarsSize)(%rsp) |
54 | 54 | ||
55 | |||
56 | .globl match_init, longest_match | ||
57 | |||
55 | /* | 58 | /* |
56 | * On AMD64 the first argument of a function (in our case -- the pointer to | 59 | * On AMD64 the first argument of a function (in our case -- the pointer to |
57 | * deflate_state structure) is passed in %rdi, hence our offsets below are | 60 | * deflate_state structure) is passed in %rdi, hence our offsets below are |
58 | * all off of that. | 61 | * all off of that. |
59 | */ | 62 | */ |
63 | |||
64 | /* you can check the structure offset by running | ||
65 | |||
66 | #include <stdlib.h> | ||
67 | #include <stdio.h> | ||
68 | #include "deflate.h" | ||
69 | |||
70 | void print_depl() | ||
71 | { | ||
72 | deflate_state ds; | ||
73 | deflate_state *s=&ds; | ||
74 | printf("size pointer=%u\n",(int)sizeof(void*)); | ||
75 | |||
76 | printf("#define dsWSize (%3u)(%%rdi)\n",(int)(((char*)&(s->w_size))-((char*)s))); | ||
77 | printf("#define dsWMask (%3u)(%%rdi)\n",(int)(((char*)&(s->w_mask))-((char*)s))); | ||
78 | printf("#define dsWindow (%3u)(%%rdi)\n",(int)(((char*)&(s->window))-((char*)s))); | ||
79 | printf("#define dsPrev (%3u)(%%rdi)\n",(int)(((char*)&(s->prev))-((char*)s))); | ||
80 | printf("#define dsMatchLen (%3u)(%%rdi)\n",(int)(((char*)&(s->match_length))-((char*)s))); | ||
81 | printf("#define dsPrevMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_match))-((char*)s))); | ||
82 | printf("#define dsStrStart (%3u)(%%rdi)\n",(int)(((char*)&(s->strstart))-((char*)s))); | ||
83 | printf("#define dsMatchStart (%3u)(%%rdi)\n",(int)(((char*)&(s->match_start))-((char*)s))); | ||
84 | printf("#define dsLookahead (%3u)(%%rdi)\n",(int)(((char*)&(s->lookahead))-((char*)s))); | ||
85 | printf("#define dsPrevLen (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_length))-((char*)s))); | ||
86 | printf("#define dsMaxChainLen (%3u)(%%rdi)\n",(int)(((char*)&(s->max_chain_length))-((char*)s))); | ||
87 | printf("#define dsGoodMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->good_match))-((char*)s))); | ||
88 | printf("#define dsNiceMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->nice_match))-((char*)s))); | ||
89 | } | ||
90 | |||
91 | */ | ||
92 | |||
93 | |||
94 | /* | ||
95 | to compile for Xcode 3.2 on Mac OS X x86_64, | ||
96 | run "gcc -g -c -DXCODE_MAC_X64_STRUCTURE amd64-match.S" | ||
97 | */ | ||
98 | |||
99 | |||
100 | #ifndef CURRENT_LINX_XCODE_MAC_X64_STRUCTURE | ||
101 | #define dsWSize ( 68)(%rdi) | ||
102 | #define dsWMask ( 76)(%rdi) | ||
103 | #define dsWindow ( 80)(%rdi) | ||
104 | #define dsPrev ( 96)(%rdi) | ||
105 | #define dsMatchLen (144)(%rdi) | ||
106 | #define dsPrevMatch (148)(%rdi) | ||
107 | #define dsStrStart (156)(%rdi) | ||
108 | #define dsMatchStart (160)(%rdi) | ||
109 | #define dsLookahead (164)(%rdi) | ||
110 | #define dsPrevLen (168)(%rdi) | ||
111 | #define dsMaxChainLen (172)(%rdi) | ||
112 | #define dsGoodMatch (188)(%rdi) | ||
113 | #define dsNiceMatch (192)(%rdi) | ||
114 | |||
115 | #else | ||
116 | |||
60 | #ifndef STRUCT_OFFSET | 117 | #ifndef STRUCT_OFFSET |
61 | # define STRUCT_OFFSET (0) | 118 | # define STRUCT_OFFSET (0) |
62 | #endif | 119 | #endif |
120 | |||
121 | |||
63 | #define dsWSize ( 56 + STRUCT_OFFSET)(%rdi) | 122 | #define dsWSize ( 56 + STRUCT_OFFSET)(%rdi) |
64 | #define dsWMask ( 64 + STRUCT_OFFSET)(%rdi) | 123 | #define dsWMask ( 64 + STRUCT_OFFSET)(%rdi) |
65 | #define dsWindow ( 72 + STRUCT_OFFSET)(%rdi) | 124 | #define dsWindow ( 72 + STRUCT_OFFSET)(%rdi) |
@@ -74,7 +133,10 @@ | |||
74 | #define dsGoodMatch (180 + STRUCT_OFFSET)(%rdi) | 133 | #define dsGoodMatch (180 + STRUCT_OFFSET)(%rdi) |
75 | #define dsNiceMatch (184 + STRUCT_OFFSET)(%rdi) | 134 | #define dsNiceMatch (184 + STRUCT_OFFSET)(%rdi) |
76 | 135 | ||
77 | .globl match_init, longest_match | 136 | #endif |
137 | |||
138 | |||
139 | |||
78 | 140 | ||
79 | .text | 141 | .text |
80 | 142 | ||
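The commented-out print_depl() helper in the hunk above derives each dsXxx offset by hand with pointer arithmetic. A shorter way to regenerate the same #define lines is offsetof() from <stddef.h>; the sketch below is illustrative only (it assumes zlib's deflate.h and the deflate_state fields named in the helper are available on the include path) and is not part of the patch.

    #include <stddef.h>
    #include <stdio.h>
    #include "deflate.h"

    /* Print one "#define dsXxx (nnn)(%rdi)" line per deflate_state field
       used by longest_match, so the offsets can be pasted into this file. */
    #define PRINT_OFFSET(name, field) \
        printf("#define %-14s (%3u)(%%rdi)\n", name, \
               (unsigned)offsetof(deflate_state, field))

    int main(void)
    {
        PRINT_OFFSET("dsWSize",       w_size);
        PRINT_OFFSET("dsWMask",       w_mask);
        PRINT_OFFSET("dsWindow",      window);
        PRINT_OFFSET("dsPrev",        prev);
        PRINT_OFFSET("dsMatchLen",    match_length);
        PRINT_OFFSET("dsPrevMatch",   prev_match);
        PRINT_OFFSET("dsStrStart",    strstart);
        PRINT_OFFSET("dsMatchStart",  match_start);
        PRINT_OFFSET("dsLookahead",   lookahead);
        PRINT_OFFSET("dsPrevLen",     prev_length);
        PRINT_OFFSET("dsMaxChainLen", max_chain_length);
        PRINT_OFFSET("dsGoodMatch",   good_match);
        PRINT_OFFSET("dsNiceMatch",   nice_match);
        return 0;
    }

Whether the hard-coded Mac offsets or the STRUCT_OFFSET-based ones apply depends on how deflate_state was compiled, so regenerating the values against the exact build is the safer route.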
@@ -222,7 +284,9 @@ LoopEntry: cmpw -1(%windowbestlen, %curmatch), %scanendw | |||
222 | * straightforward "rep cmpsb" would not drastically degrade | 284 | * straightforward "rep cmpsb" would not drastically degrade |
223 | * performance -- unrolling it, for example, makes no difference. | 285 | * performance -- unrolling it, for example, makes no difference. |
224 | */ | 286 | */ |
287 | |||
225 | #undef USE_SSE /* works, but is 6-7% slower than non-SSE... */ | 288 | #undef USE_SSE /* works, but is 6-7% slower than non-SSE... */ |
289 | |||
226 | LoopCmps: | 290 | LoopCmps: |
227 | #ifdef USE_SSE | 291 | #ifdef USE_SSE |
228 | /* Preload the SSE registers */ | 292 | /* Preload the SSE registers */ |
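For reference, the USE_SSE path that the comment above measures as 6-7% slower compares 16 bytes per step with pcmpeqb/pmovmskb. A rough SSE2-intrinsics equivalent of one such step is sketched below; the function name and interface are made up for illustration and are not part of zlib.

    #include <emmintrin.h>   /* SSE2 intrinsics */

    /* Compare 16 bytes of the window against 16 bytes of the candidate match;
       return 16 if they all agree, otherwise the index of the first mismatch. */
    static int compare16(const unsigned char *a, const unsigned char *b)
    {
        __m128i va = _mm_loadu_si128((const __m128i *)a);
        __m128i vb = _mm_loadu_si128((const __m128i *)b);
        __m128i eq = _mm_cmpeq_epi8(va, vb);              /* 0xFF where bytes are equal */
        unsigned mask = (unsigned)_mm_movemask_epi8(eq);  /* one bit per byte */
        unsigned diff = ~mask & 0xFFFFu;                  /* set bits mark mismatches */
        return diff ? __builtin_ctz(diff) : 16;           /* index of first mismatching byte */
    }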
@@ -244,29 +308,55 @@ LoopCmps: | |||
244 | notw %ax | 308 | notw %ax |
245 | bsfw %ax, %ax | 309 | bsfw %ax, %ax |
246 | jnz LeaveLoopCmps | 310 | jnz LeaveLoopCmps |
247 | add $16, %rdx | 311 | |
312 | /* this is the only iteration of the loop with a possibility of having | ||
313 | incremented rdx by 0x108 (each loop iteration adds 16*4 = 0x40, | ||
314 | and (0x40*4)+8 = 0x108) */ | ||
315 | add $8, %rdx | ||
316 | jz LenMaximum | ||
317 | add $8, %rdx | ||
318 | |||
319 | |||
248 | pmovmskb %xmm3, %rax | 320 | pmovmskb %xmm3, %rax |
249 | notw %ax | 321 | notw %ax |
250 | bsfw %ax, %ax | 322 | bsfw %ax, %ax |
251 | jnz LeaveLoopCmps | 323 | jnz LeaveLoopCmps |
324 | |||
325 | |||
252 | add $16, %rdx | 326 | add $16, %rdx |
327 | |||
328 | |||
253 | pmovmskb %xmm5, %rax | 329 | pmovmskb %xmm5, %rax |
254 | notw %ax | 330 | notw %ax |
255 | bsfw %ax, %ax | 331 | bsfw %ax, %ax |
256 | jnz LeaveLoopCmps | 332 | jnz LeaveLoopCmps |
333 | |||
257 | add $16, %rdx | 334 | add $16, %rdx |
335 | |||
336 | |||
258 | pmovmskb %xmm7, %rax | 337 | pmovmskb %xmm7, %rax |
259 | notw %ax | 338 | notw %ax |
260 | bsfw %ax, %ax | 339 | bsfw %ax, %ax |
261 | jnz LeaveLoopCmps | 340 | jnz LeaveLoopCmps |
341 | |||
262 | add $16, %rdx | 342 | add $16, %rdx |
343 | |||
263 | jmp LoopCmps | 344 | jmp LoopCmps |
264 | LeaveLoopCmps: add %rax, %rdx | 345 | LeaveLoopCmps: add %rax, %rdx |
265 | #else | 346 | #else |
266 | mov (%windowbestlen, %rdx), %rax | 347 | mov (%windowbestlen, %rdx), %rax |
267 | xor (%prev, %rdx), %rax | 348 | xor (%prev, %rdx), %rax |
268 | jnz LeaveLoopCmps | 349 | jnz LeaveLoopCmps |
269 | add $8, %rdx | 350 | |
351 | mov 8(%windowbestlen, %rdx), %rax | ||
352 | xor 8(%prev, %rdx), %rax | ||
353 | jnz LeaveLoopCmps8 | ||
354 | |||
355 | mov 16(%windowbestlen, %rdx), %rax | ||
356 | xor 16(%prev, %rdx), %rax | ||
357 | jnz LeaveLoopCmps16 | ||
358 | |||
359 | add $24, %rdx | ||
270 | jnz LoopCmps | 360 | jnz LoopCmps |
271 | jmp LenMaximum | 361 | jmp LenMaximum |
272 | # if 0 | 362 | # if 0 |
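The non-SSE path, which this hunk unrolls from one to three 8-byte steps per iteration, boils down to: load 8 bytes from each string, XOR them, and if the result is non-zero, the lowest set bit (or the branchy cascade shown in the next hunk) identifies the first differing byte. A hedged C sketch of that idea follows; the names are illustrative, little-endian x86-64 is assumed, and this is not zlib's actual code.

    #include <stdint.h>
    #include <string.h>

    /* Count how many leading bytes of a and b agree, scanning 8 bytes at a
       time the way the xor loop above does, then finishing byte-wise. */
    static size_t common_prefix(const unsigned char *a, const unsigned char *b,
                                size_t n)
    {
        size_t i = 0;
        while (i + 8 <= n) {
            uint64_t wa, wb;
            memcpy(&wa, a + i, 8);        /* unaligned-safe 64-bit loads */
            memcpy(&wb, b + i, 8);
            uint64_t diff = wa ^ wb;      /* non-zero where the words differ */
            if (diff != 0)
                /* bit index of the first difference / 8 = byte index
                   (valid because x86-64 is little-endian) */
                return i + ((size_t)__builtin_ctzll(diff) >> 3);
            i += 8;
        }
        while (i < n && a[i] == b[i])     /* handle any remaining tail bytes */
            i++;
        return i;
    }

Unrolling three such steps per iteration, as the patch does with the 8- and 16-byte displacements and the final add $24, %rdx, saves two loop branches for every 24 bytes compared.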
@@ -274,10 +364,15 @@ LeaveLoopCmps: add %rax, %rdx | |||
274 | * This three-liner is tantalizingly simple, but bsf is a slow instruction, | 364 | * This three-liner is tantalizingly simple, but bsf is a slow instruction, |
275 | * and the complicated alternative down below is quite a bit faster. Sad... | 365 | * and the complicated alternative down below is quite a bit faster. Sad... |
276 | */ | 366 | */ |
367 | |||
277 | LeaveLoopCmps: bsf %rax, %rax /* find the first non-zero bit */ | 368 | LeaveLoopCmps: bsf %rax, %rax /* find the first non-zero bit */ |
278 | shrl $3, %eax /* divide by 8 to get the byte */ | 369 | shrl $3, %eax /* divide by 8 to get the byte */ |
279 | add %rax, %rdx | 370 | add %rax, %rdx |
280 | # else | 371 | # else |
372 | LeaveLoopCmps16: | ||
373 | add $8, %rdx | ||
374 | LeaveLoopCmps8: | ||
375 | add $8, %rdx | ||
281 | LeaveLoopCmps: testl $0xFFFFFFFF, %eax /* Check the first 4 bytes */ | 376 | LeaveLoopCmps: testl $0xFFFFFFFF, %eax /* Check the first 4 bytes */ |
282 | jnz Check16 | 377 | jnz Check16 |
283 | add $4, %rdx | 378 | add $4, %rdx |
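The # else branch kept by the patch avoids the slow bsf by narrowing down the mismatching byte with sub-word tests instead, and the new LeaveLoopCmps8/LeaveLoopCmps16 labels simply pre-add the 8 or 16 bytes that the unrolled iterations had already matched before falling into that common tail. In C terms the narrowing looks roughly like the following sketch (illustrative only, little-endian assumed):

    #include <stdint.h>

    /* Given a non-zero 64-bit xor of two equal-length chunks, return the index
       of the first differing byte without using a bit-scan instruction,
       mirroring the test-and-branch cascade after LeaveLoopCmps. */
    static int first_diff_byte(uint64_t diff)
    {
        int off = 0;
        if ((diff & 0xFFFFFFFFu) == 0) { off += 4; diff >>= 32; } /* low 4 bytes equal */
        if ((diff & 0xFFFFu) == 0)     { off += 2; diff >>= 16; } /* next 2 bytes equal */
        if ((diff & 0xFFu) == 0)       { off += 1; }              /* one more byte equal */
        return off;
    }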