| author | Mark Adler <madler@alumni.caltech.edu> | 2011-09-09 23:32:36 -0700 |
|---|---|---|
| committer | Mark Adler <madler@alumni.caltech.edu> | 2011-09-09 23:32:36 -0700 |
| commit | 67cc20d0041a32bee12bd9eb20ae218f91b73f77 | |
| tree | d7e1b94bd15c30efd57cf9036f5fe89306b6bba0 | /contrib/amd64 |
| parent | 7751bd4c715ea8478113e34b49b5a794a4642e8e | |
zlib 1.2.4-pre1 (tag: v1.2.4-pre1)
Diffstat (limited to 'contrib/amd64')

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | contrib/amd64/amd64-match.S | 101 |

1 file changed, 98 insertions(+), 3 deletions(-)
```diff
diff --git a/contrib/amd64/amd64-match.S b/contrib/amd64/amd64-match.S
index b3bf1ac..81d4a1c 100644
--- a/contrib/amd64/amd64-match.S
+++ b/contrib/amd64/amd64-match.S
@@ -52,14 +52,73 @@
 #define save_r13 (64-LocalVarsSize)(%rsp)
 #define save_r15 (80-LocalVarsSize)(%rsp)
 
+
+.globl match_init, longest_match
+
 /*
  * On AMD64 the first argument of a function (in our case -- the pointer to
  * deflate_state structure) is passed in %rdi, hence our offsets below are
  * all off of that.
  */
+
+/* you can check the structure offset by running
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "deflate.h"
+
+void print_depl()
+{
+deflate_state ds;
+deflate_state *s=&ds;
+printf("size pointer=%u\n",(int)sizeof(void*));
+
+printf("#define dsWSize (%3u)(%%rdi)\n",(int)(((char*)&(s->w_size))-((char*)s)));
+printf("#define dsWMask (%3u)(%%rdi)\n",(int)(((char*)&(s->w_mask))-((char*)s)));
+printf("#define dsWindow (%3u)(%%rdi)\n",(int)(((char*)&(s->window))-((char*)s)));
+printf("#define dsPrev (%3u)(%%rdi)\n",(int)(((char*)&(s->prev))-((char*)s)));
+printf("#define dsMatchLen (%3u)(%%rdi)\n",(int)(((char*)&(s->match_length))-((char*)s)));
+printf("#define dsPrevMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_match))-((char*)s)));
+printf("#define dsStrStart (%3u)(%%rdi)\n",(int)(((char*)&(s->strstart))-((char*)s)));
+printf("#define dsMatchStart (%3u)(%%rdi)\n",(int)(((char*)&(s->match_start))-((char*)s)));
+printf("#define dsLookahead (%3u)(%%rdi)\n",(int)(((char*)&(s->lookahead))-((char*)s)));
+printf("#define dsPrevLen (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_length))-((char*)s)));
+printf("#define dsMaxChainLen (%3u)(%%rdi)\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
+printf("#define dsGoodMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->good_match))-((char*)s)));
+printf("#define dsNiceMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->nice_match))-((char*)s)));
+}
+
+*/
+
+
+/*
+to compile for XCode 3.2 on MacOSX x86_64
+- run "gcc -g -c -DXCODE_MAC_X64_STRUCTURE amd64-match.S"
+ */
+
+
+#ifndef CURRENT_LINX_XCODE_MAC_X64_STRUCTURE
+#define dsWSize ( 68)(%rdi)
+#define dsWMask ( 76)(%rdi)
+#define dsWindow ( 80)(%rdi)
+#define dsPrev ( 96)(%rdi)
+#define dsMatchLen (144)(%rdi)
+#define dsPrevMatch (148)(%rdi)
+#define dsStrStart (156)(%rdi)
+#define dsMatchStart (160)(%rdi)
+#define dsLookahead (164)(%rdi)
+#define dsPrevLen (168)(%rdi)
+#define dsMaxChainLen (172)(%rdi)
+#define dsGoodMatch (188)(%rdi)
+#define dsNiceMatch (192)(%rdi)
+
+#else
+
 #ifndef STRUCT_OFFSET
 # define STRUCT_OFFSET (0)
 #endif
+
+
 #define dsWSize ( 56 + STRUCT_OFFSET)(%rdi)
 #define dsWMask ( 64 + STRUCT_OFFSET)(%rdi)
 #define dsWindow ( 72 + STRUCT_OFFSET)(%rdi)
@@ -74,7 +133,10 @@
 #define dsGoodMatch (180 + STRUCT_OFFSET)(%rdi)
 #define dsNiceMatch (184 + STRUCT_OFFSET)(%rdi)
 
-.globl match_init, longest_match
+#endif
+
+
+
 
 .text
 
```
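The first two hunks move the `.globl match_init, longest_match` declaration ahead of the structure-offset definitions and add a second, hard-coded offset table for the `deflate_state` layout produced by XCode on Mac OS X x86_64, keeping the original `STRUCT_OFFSET`-based table in the `#else` branch. The new comment shows how to regenerate those offsets by printing member addresses. Below is a minimal sketch of the same check written with `offsetof`; the file name, output formatting, and build command are illustrative assumptions, and it presumes zlib's private header `deflate.h` is available from a source tree (it is not installed by `make install`), e.g. `cc -I. print_offsets.c -o print_offsets` run from the tree root.

```c
/* Sketch: print the deflate_state member offsets used by amd64-match.S,
 * using offsetof instead of the raw pointer arithmetic in the comment above.
 * Output formatting is illustrative, not the commit's exact format. */
#include <stddef.h>
#include <stdio.h>
#include "deflate.h"   /* zlib's private header; build from a zlib source tree */

#define SHOW(name, member) \
    printf("#define %-14s (%3lu)(%%rdi)\n", name, \
           (unsigned long)offsetof(deflate_state, member))

int main(void)
{
    printf("size pointer=%lu\n", (unsigned long)sizeof(void *));
    SHOW("dsWSize",       w_size);
    SHOW("dsWMask",       w_mask);
    SHOW("dsWindow",      window);
    SHOW("dsPrev",        prev);
    SHOW("dsMatchLen",    match_length);
    SHOW("dsPrevMatch",   prev_match);
    SHOW("dsStrStart",    strstart);
    SHOW("dsMatchStart",  match_start);
    SHOW("dsLookahead",   lookahead);
    SHOW("dsPrevLen",     prev_length);
    SHOW("dsMaxChainLen", max_chain_length);
    SHOW("dsGoodMatch",   good_match);
    SHOW("dsNiceMatch",   nice_match);
    return 0;
}
```

Comparing the printed values against the `ds*` defines for a given toolchain is a quick way to catch layout drift before the assembly silently reads the wrong fields. The remaining hunks modify the comparison loop in `longest_match` itself: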
```diff
@@ -222,7 +284,9 @@ LoopEntry: cmpw -1(%windowbestlen, %curmatch), %scanendw
  * straightforward "rep cmpsb" would not drastically degrade
  * performance -- unrolling it, for example, makes no difference.
  */
+
 #undef USE_SSE /* works, but is 6-7% slower, than non-SSE... */
+
 LoopCmps:
 #ifdef USE_SSE
 /* Preload the SSE registers */
@@ -244,29 +308,55 @@ LoopCmps:
 notw %ax
 bsfw %ax, %ax
 jnz LeaveLoopCmps
-add $16, %rdx
+
+/* this is the only iteration of the loop with a possibility of having
+   incremented rdx by 0x108 (each loop iteration add 16*4 = 0x40
+   and (0x40*4)+8=0x108 */
+add $8, %rdx
+jz LenMaximum
+add $8, %rdx
+
+
 pmovmskb %xmm3, %rax
 notw %ax
 bsfw %ax, %ax
 jnz LeaveLoopCmps
+
+
 add $16, %rdx
+
+
 pmovmskb %xmm5, %rax
 notw %ax
 bsfw %ax, %ax
 jnz LeaveLoopCmps
+
 add $16, %rdx
+
+
 pmovmskb %xmm7, %rax
 notw %ax
 bsfw %ax, %ax
 jnz LeaveLoopCmps
+
 add $16, %rdx
+
 jmp LoopCmps
 LeaveLoopCmps: add %rax, %rdx
 #else
 mov (%windowbestlen, %rdx), %rax
 xor (%prev, %rdx), %rax
 jnz LeaveLoopCmps
-add $8, %rdx
+
+mov 8(%windowbestlen, %rdx), %rax
+xor 8(%prev, %rdx), %rax
+jnz LeaveLoopCmps8
+
+mov 16(%windowbestlen, %rdx), %rax
+xor 16(%prev, %rdx), %rax
+jnz LeaveLoopCmps16
+
+add $24, %rdx
 jnz LoopCmps
 jmp LenMaximum
 # if 0
@@ -274,10 +364,15 @@ LeaveLoopCmps: add %rax, %rdx
  * This three-liner is tantalizingly simple, but bsf is a slow instruction,
  * and the complicated alternative down below is quite a bit faster. Sad...
  */
+
 LeaveLoopCmps: bsf %rax, %rax /* find the first non-zero bit */
 shrl $3, %eax /* divide by 8 to get the byte */
 add %rax, %rdx
 # else
+LeaveLoopCmps16:
+add $8, %rdx
+LeaveLoopCmps8:
+add $8, %rdx
 LeaveLoopCmps: testl $0xFFFFFFFF, %eax /* Check the first 4 bytes */
 jnz Check16
 add $4, %rdx
```
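In the SSE path, the first `add $16, %rdx` is split into two `add $8` steps with a `jz LenMaximum` check in between: per the new comment, each pass over the four XMM blocks advances %rdx by 0x40, and this appears to be the only point where the running counter can land exactly on zero (maximum match length) mid-loop. In the non-SSE path the loop now compares three 8-byte words per iteration (24 bytes) instead of one, exiting through the new `LeaveLoopCmps8`/`LeaveLoopCmps16` labels, which add back the 8 or 16 bytes already consumed before falling into the shared mismatch-locating code. The underlying technique is unchanged: XOR two 8-byte loads, and if the result is non-zero, the position of its lowest set bit identifies the first differing byte. A rough C model of that idea follows; the names and the unaligned-load helper are illustrative, not zlib's, and it assumes a little-endian target and a GCC/Clang `__builtin_ctzll` standing in for the assembly's `bsf`-or-`testl` sequences.

```c
/* Sketch: count matching leading bytes of a and b, 8 bytes at a time,
 * locating the first mismatch from the lowest set bit of the XOR. */
#include <stdint.h>
#include <string.h>

static uint64_t load64(const unsigned char *p)
{
    uint64_t v;
    memcpy(&v, p, sizeof v);   /* unaligned-safe 64-bit load */
    return v;
}

static size_t match_len(const unsigned char *a, const unsigned char *b,
                        size_t len)
{
    size_t i = 0;
    while (i + 8 <= len) {
        uint64_t x = load64(a + i) ^ load64(b + i);
        if (x != 0)            /* first mismatch is inside this word */
            return i + (size_t)(__builtin_ctzll(x) >> 3);
        i += 8;                /* the assembly unrolls this by three */
    }
    while (i < len && a[i] == b[i])   /* tail: fewer than 8 bytes left */
        i++;
    return i;
}
```

For example, `match_len((const unsigned char *)"abcdefXY", (const unsigned char *)"abcdefZZ", 8)` returns 6. The real routine keeps the running offset in %rdx and jumps to the `Check16`/`bsf` logic once a mismatching word is found; the unrolling added here just amortizes the loop overhead over 24 bytes per iteration, much as the SSE branch already does with its 64-byte stride.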
