diff options
| author | Mark Adler <madler@alumni.caltech.edu> | 2011-09-09 23:25:17 -0700 |
|---|---|---|
| committer | Mark Adler <madler@alumni.caltech.edu> | 2011-09-09 23:25:17 -0700 |
| commit | abf180a067223611620dd97dd5681df7c7fa7c9b (patch) | |
| tree | 48ce6022aa1670380c098bd0abed2ac4aa1d9ca0 /contrib/asm586 | |
| parent | 9c3a5830218c4e7fff23b8fc4386269db77a03a9 (diff) | |
| download | zlib-1.2.3.tar.gz zlib-1.2.3.tar.bz2 zlib-1.2.3.zip | |
zlib 1.2.3 (tag: v1.2.3)
Diffstat (limited to 'contrib/asm586')
| -rw-r--r-- | contrib/asm586/match.S | 728 |
1 file changed, 364 insertions, 364 deletions
diff --git a/contrib/asm586/match.S b/contrib/asm586/match.S index 371c9a0..0368b35 100644 --- a/contrib/asm586/match.S +++ b/contrib/asm586/match.S | |||
| @@ -1,364 +1,364 @@ | |||
| 1 | /* match.s -- Pentium-optimized version of longest_match() | 1 | /* match.s -- Pentium-optimized version of longest_match() |
| 2 | * Written for zlib 1.1.2 | 2 | * Written for zlib 1.1.2 |
| 3 | * Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com> | 3 | * Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com> |
| 4 | * | 4 | * |
| 5 | * This is free software; you can redistribute it and/or modify it | 5 | * This is free software; you can redistribute it and/or modify it |
| 6 | * under the terms of the GNU General Public License. | 6 | * under the terms of the GNU General Public License. |
| 7 | */ | 7 | */ |
| 8 | 8 | ||
| 9 | #ifndef NO_UNDERLINE | 9 | #ifndef NO_UNDERLINE |
| 10 | #define match_init _match_init | 10 | #define match_init _match_init |
| 11 | #define longest_match _longest_match | 11 | #define longest_match _longest_match |
| 12 | #endif | 12 | #endif |
| 13 | 13 | ||
| 14 | #define MAX_MATCH (258) | 14 | #define MAX_MATCH (258) |
| 15 | #define MIN_MATCH (3) | 15 | #define MIN_MATCH (3) |
| 16 | #define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1) | 16 | #define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1) |
| 17 | #define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7) | 17 | #define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7) |
| 18 | 18 | ||
| 19 | /* stack frame offsets */ | 19 | /* stack frame offsets */ |
| 20 | 20 | ||
| 21 | #define wmask 0 /* local copy of s->wmask */ | 21 | #define wmask 0 /* local copy of s->wmask */ |
| 22 | #define window 4 /* local copy of s->window */ | 22 | #define window 4 /* local copy of s->window */ |
| 23 | #define windowbestlen 8 /* s->window + bestlen */ | 23 | #define windowbestlen 8 /* s->window + bestlen */ |
| 24 | #define chainlenscanend 12 /* high word: current chain len */ | 24 | #define chainlenscanend 12 /* high word: current chain len */ |
| 25 | /* low word: last bytes sought */ | 25 | /* low word: last bytes sought */ |
| 26 | #define scanstart 16 /* first two bytes of string */ | 26 | #define scanstart 16 /* first two bytes of string */ |
| 27 | #define scanalign 20 /* dword-misalignment of string */ | 27 | #define scanalign 20 /* dword-misalignment of string */ |
| 28 | #define nicematch 24 /* a good enough match size */ | 28 | #define nicematch 24 /* a good enough match size */ |
| 29 | #define bestlen 28 /* size of best match so far */ | 29 | #define bestlen 28 /* size of best match so far */ |
| 30 | #define scan 32 /* ptr to string wanting match */ | 30 | #define scan 32 /* ptr to string wanting match */ |
| 31 | 31 | ||
| 32 | #define LocalVarsSize (36) | 32 | #define LocalVarsSize (36) |
| 33 | /* saved ebx 36 */ | 33 | /* saved ebx 36 */ |
| 34 | /* saved edi 40 */ | 34 | /* saved edi 40 */ |
| 35 | /* saved esi 44 */ | 35 | /* saved esi 44 */ |
| 36 | /* saved ebp 48 */ | 36 | /* saved ebp 48 */ |
| 37 | /* return address 52 */ | 37 | /* return address 52 */ |
| 38 | #define deflatestate 56 /* the function arguments */ | 38 | #define deflatestate 56 /* the function arguments */ |
| 39 | #define curmatch 60 | 39 | #define curmatch 60 |
| 40 | 40 | ||
| 41 | /* Offsets for fields in the deflate_state structure. These numbers | 41 | /* Offsets for fields in the deflate_state structure. These numbers |
| 42 | * are calculated from the definition of deflate_state, with the | 42 | * are calculated from the definition of deflate_state, with the |
| 43 | * assumption that the compiler will dword-align the fields. (Thus, | 43 | * assumption that the compiler will dword-align the fields. (Thus, |
| 44 | * changing the definition of deflate_state could easily cause this | 44 | * changing the definition of deflate_state could easily cause this |
| 45 | * program to crash horribly, without so much as a warning at | 45 | * program to crash horribly, without so much as a warning at |
| 46 | * compile time. Sigh.) | 46 | * compile time. Sigh.) |
| 47 | */ | 47 | */ |
| 48 | 48 | ||
| 49 | /* All the +zlib1222add offsets are due to the addition of fields | 49 | /* All the +zlib1222add offsets are due to the addition of fields |
| 50 | * in zlib in the deflate_state structure since the asm code was first written | 50 | * in zlib in the deflate_state structure since the asm code was first written |
| 51 | * (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). | 51 | * (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). |
| 52 | * (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). | 52 | * (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). |
| 53 | * if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). | 53 | * if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). |
| 54 | */ | 54 | */ |
| 55 | 55 | ||
| 56 | #define zlib1222add (8) | 56 | #define zlib1222add (8) |
| 57 | 57 | ||
| 58 | #define dsWSize (36+zlib1222add) | 58 | #define dsWSize (36+zlib1222add) |
| 59 | #define dsWMask (44+zlib1222add) | 59 | #define dsWMask (44+zlib1222add) |
| 60 | #define dsWindow (48+zlib1222add) | 60 | #define dsWindow (48+zlib1222add) |
| 61 | #define dsPrev (56+zlib1222add) | 61 | #define dsPrev (56+zlib1222add) |
| 62 | #define dsMatchLen (88+zlib1222add) | 62 | #define dsMatchLen (88+zlib1222add) |
| 63 | #define dsPrevMatch (92+zlib1222add) | 63 | #define dsPrevMatch (92+zlib1222add) |
| 64 | #define dsStrStart (100+zlib1222add) | 64 | #define dsStrStart (100+zlib1222add) |
| 65 | #define dsMatchStart (104+zlib1222add) | 65 | #define dsMatchStart (104+zlib1222add) |
| 66 | #define dsLookahead (108+zlib1222add) | 66 | #define dsLookahead (108+zlib1222add) |
| 67 | #define dsPrevLen (112+zlib1222add) | 67 | #define dsPrevLen (112+zlib1222add) |
| 68 | #define dsMaxChainLen (116+zlib1222add) | 68 | #define dsMaxChainLen (116+zlib1222add) |
| 69 | #define dsGoodMatch (132+zlib1222add) | 69 | #define dsGoodMatch (132+zlib1222add) |
| 70 | #define dsNiceMatch (136+zlib1222add) | 70 | #define dsNiceMatch (136+zlib1222add) |
| 71 | 71 | ||
| 72 | 72 | ||
| 73 | .file "match.S" | 73 | .file "match.S" |
| 74 | 74 | ||
| 75 | .globl match_init, longest_match | 75 | .globl match_init, longest_match |
| 76 | 76 | ||
| 77 | .text | 77 | .text |
| 78 | 78 | ||
| 79 | /* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */ | 79 | /* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */ |
| 80 | 80 | ||
| 81 | longest_match: | 81 | longest_match: |
| 82 | 82 | ||
| 83 | /* Save registers that the compiler may be using, and adjust %esp to */ | 83 | /* Save registers that the compiler may be using, and adjust %esp to */ |
| 84 | /* make room for our stack frame. */ | 84 | /* make room for our stack frame. */ |
| 85 | 85 | ||
| 86 | pushl %ebp | 86 | pushl %ebp |
| 87 | pushl %edi | 87 | pushl %edi |
| 88 | pushl %esi | 88 | pushl %esi |
| 89 | pushl %ebx | 89 | pushl %ebx |
| 90 | subl $LocalVarsSize, %esp | 90 | subl $LocalVarsSize, %esp |
| 91 | 91 | ||
| 92 | /* Retrieve the function arguments. %ecx will hold cur_match */ | 92 | /* Retrieve the function arguments. %ecx will hold cur_match */ |
| 93 | /* throughout the entire function. %edx will hold the pointer to the */ | 93 | /* throughout the entire function. %edx will hold the pointer to the */ |
| 94 | /* deflate_state structure during the function's setup (before */ | 94 | /* deflate_state structure during the function's setup (before */ |
| 95 | /* entering the main loop). */ | 95 | /* entering the main loop). */ |
| 96 | 96 | ||
| 97 | movl deflatestate(%esp), %edx | 97 | movl deflatestate(%esp), %edx |
| 98 | movl curmatch(%esp), %ecx | 98 | movl curmatch(%esp), %ecx |
| 99 | 99 | ||
| 100 | /* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */ | 100 | /* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */ |
| 101 | 101 | ||
| 102 | movl dsNiceMatch(%edx), %eax | 102 | movl dsNiceMatch(%edx), %eax |
| 103 | movl dsLookahead(%edx), %ebx | 103 | movl dsLookahead(%edx), %ebx |
| 104 | cmpl %eax, %ebx | 104 | cmpl %eax, %ebx |
| 105 | jl LookaheadLess | 105 | jl LookaheadLess |
| 106 | movl %eax, %ebx | 106 | movl %eax, %ebx |
| 107 | LookaheadLess: movl %ebx, nicematch(%esp) | 107 | LookaheadLess: movl %ebx, nicematch(%esp) |
| 108 | 108 | ||
| 109 | /* register Bytef *scan = s->window + s->strstart; */ | 109 | /* register Bytef *scan = s->window + s->strstart; */ |
| 110 | 110 | ||
| 111 | movl dsWindow(%edx), %esi | 111 | movl dsWindow(%edx), %esi |
| 112 | movl %esi, window(%esp) | 112 | movl %esi, window(%esp) |
| 113 | movl dsStrStart(%edx), %ebp | 113 | movl dsStrStart(%edx), %ebp |
| 114 | lea (%esi,%ebp), %edi | 114 | lea (%esi,%ebp), %edi |
| 115 | movl %edi, scan(%esp) | 115 | movl %edi, scan(%esp) |
| 116 | 116 | ||
| 117 | /* Determine how many bytes the scan ptr is off from being */ | 117 | /* Determine how many bytes the scan ptr is off from being */ |
| 118 | /* dword-aligned. */ | 118 | /* dword-aligned. */ |
| 119 | 119 | ||
| 120 | movl %edi, %eax | 120 | movl %edi, %eax |
| 121 | negl %eax | 121 | negl %eax |
| 122 | andl $3, %eax | 122 | andl $3, %eax |
| 123 | movl %eax, scanalign(%esp) | 123 | movl %eax, scanalign(%esp) |
| 124 | 124 | ||
| 125 | /* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ | 125 | /* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ |
| 126 | /* s->strstart - (IPos)MAX_DIST(s) : NIL; */ | 126 | /* s->strstart - (IPos)MAX_DIST(s) : NIL; */ |
| 127 | 127 | ||
| 128 | movl dsWSize(%edx), %eax | 128 | movl dsWSize(%edx), %eax |
| 129 | subl $MIN_LOOKAHEAD, %eax | 129 | subl $MIN_LOOKAHEAD, %eax |
| 130 | subl %eax, %ebp | 130 | subl %eax, %ebp |
| 131 | jg LimitPositive | 131 | jg LimitPositive |
| 132 | xorl %ebp, %ebp | 132 | xorl %ebp, %ebp |
| 133 | LimitPositive: | 133 | LimitPositive: |
| 134 | 134 | ||
| 135 | /* unsigned chain_length = s->max_chain_length; */ | 135 | /* unsigned chain_length = s->max_chain_length; */ |
| 136 | /* if (s->prev_length >= s->good_match) { */ | 136 | /* if (s->prev_length >= s->good_match) { */ |
| 137 | /* chain_length >>= 2; */ | 137 | /* chain_length >>= 2; */ |
| 138 | /* } */ | 138 | /* } */ |
| 139 | 139 | ||
| 140 | movl dsPrevLen(%edx), %eax | 140 | movl dsPrevLen(%edx), %eax |
| 141 | movl dsGoodMatch(%edx), %ebx | 141 | movl dsGoodMatch(%edx), %ebx |
| 142 | cmpl %ebx, %eax | 142 | cmpl %ebx, %eax |
| 143 | movl dsMaxChainLen(%edx), %ebx | 143 | movl dsMaxChainLen(%edx), %ebx |
| 144 | jl LastMatchGood | 144 | jl LastMatchGood |
| 145 | shrl $2, %ebx | 145 | shrl $2, %ebx |
| 146 | LastMatchGood: | 146 | LastMatchGood: |
| 147 | 147 | ||
| 148 | /* chainlen is decremented once beforehand so that the function can */ | 148 | /* chainlen is decremented once beforehand so that the function can */ |
| 149 | /* use the sign flag instead of the zero flag for the exit test. */ | 149 | /* use the sign flag instead of the zero flag for the exit test. */ |
| 150 | /* It is then shifted into the high word, to make room for the scanend */ | 150 | /* It is then shifted into the high word, to make room for the scanend */ |
| 151 | /* scanend value, which it will always accompany. */ | 151 | /* scanend value, which it will always accompany. */ |
| 152 | 152 | ||
| 153 | decl %ebx | 153 | decl %ebx |
| 154 | shll $16, %ebx | 154 | shll $16, %ebx |
| 155 | 155 | ||
| 156 | /* int best_len = s->prev_length; */ | 156 | /* int best_len = s->prev_length; */ |
| 157 | 157 | ||
| 158 | movl dsPrevLen(%edx), %eax | 158 | movl dsPrevLen(%edx), %eax |
| 159 | movl %eax, bestlen(%esp) | 159 | movl %eax, bestlen(%esp) |
| 160 | 160 | ||
| 161 | /* Store the sum of s->window + best_len in %esi locally, and in %esi. */ | 161 | /* Store the sum of s->window + best_len in %esi locally, and in %esi. */ |
| 162 | 162 | ||
| 163 | addl %eax, %esi | 163 | addl %eax, %esi |
| 164 | movl %esi, windowbestlen(%esp) | 164 | movl %esi, windowbestlen(%esp) |
| 165 | 165 | ||
| 166 | /* register ush scan_start = *(ushf*)scan; */ | 166 | /* register ush scan_start = *(ushf*)scan; */ |
| 167 | /* register ush scan_end = *(ushf*)(scan+best_len-1); */ | 167 | /* register ush scan_end = *(ushf*)(scan+best_len-1); */ |
| 168 | 168 | ||
| 169 | movw (%edi), %bx | 169 | movw (%edi), %bx |
| 170 | movw %bx, scanstart(%esp) | 170 | movw %bx, scanstart(%esp) |
| 171 | movw -1(%edi,%eax), %bx | 171 | movw -1(%edi,%eax), %bx |
| 172 | movl %ebx, chainlenscanend(%esp) | 172 | movl %ebx, chainlenscanend(%esp) |
| 173 | 173 | ||
| 174 | /* Posf *prev = s->prev; */ | 174 | /* Posf *prev = s->prev; */ |
| 175 | /* uInt wmask = s->w_mask; */ | 175 | /* uInt wmask = s->w_mask; */ |
| 176 | 176 | ||
| 177 | movl dsPrev(%edx), %edi | 177 | movl dsPrev(%edx), %edi |
| 178 | movl dsWMask(%edx), %edx | 178 | movl dsWMask(%edx), %edx |
| 179 | mov %edx, wmask(%esp) | 179 | mov %edx, wmask(%esp) |
| 180 | 180 | ||
| 181 | /* Jump into the main loop. */ | 181 | /* Jump into the main loop. */ |
| 182 | 182 | ||
| 183 | jmp LoopEntry | 183 | jmp LoopEntry |
| 184 | 184 | ||
| 185 | .balign 16 | 185 | .balign 16 |
| 186 | 186 | ||
| 187 | /* do { | 187 | /* do { |
| 188 | * match = s->window + cur_match; | 188 | * match = s->window + cur_match; |
| 189 | * if (*(ushf*)(match+best_len-1) != scan_end || | 189 | * if (*(ushf*)(match+best_len-1) != scan_end || |
| 190 | * *(ushf*)match != scan_start) continue; | 190 | * *(ushf*)match != scan_start) continue; |
| 191 | * [...] | 191 | * [...] |
| 192 | * } while ((cur_match = prev[cur_match & wmask]) > limit | 192 | * } while ((cur_match = prev[cur_match & wmask]) > limit |
| 193 | * && --chain_length != 0); | 193 | * && --chain_length != 0); |
| 194 | * | 194 | * |
| 195 | * Here is the inner loop of the function. The function will spend the | 195 | * Here is the inner loop of the function. The function will spend the |
| 196 | * majority of its time in this loop, and majority of that time will | 196 | * majority of its time in this loop, and majority of that time will |
| 197 | * be spent in the first ten instructions. | 197 | * be spent in the first ten instructions. |
| 198 | * | 198 | * |
| 199 | * Within this loop: | 199 | * Within this loop: |
| 200 | * %ebx = chainlenscanend - i.e., ((chainlen << 16) | scanend) | 200 | * %ebx = chainlenscanend - i.e., ((chainlen << 16) | scanend) |
| 201 | * %ecx = curmatch | 201 | * %ecx = curmatch |
| 202 | * %edx = curmatch & wmask | 202 | * %edx = curmatch & wmask |
| 203 | * %esi = windowbestlen - i.e., (window + bestlen) | 203 | * %esi = windowbestlen - i.e., (window + bestlen) |
| 204 | * %edi = prev | 204 | * %edi = prev |
| 205 | * %ebp = limit | 205 | * %ebp = limit |
| 206 | * | 206 | * |
| 207 | * Two optimization notes on the choice of instructions: | 207 | * Two optimization notes on the choice of instructions: |
| 208 | * | 208 | * |
| 209 | * The first instruction uses a 16-bit address, which costs an extra, | 209 | * The first instruction uses a 16-bit address, which costs an extra, |
| 210 | * unpairable cycle. This is cheaper than doing a 32-bit access and | 210 | * unpairable cycle. This is cheaper than doing a 32-bit access and |
| 211 | * zeroing the high word, due to the 3-cycle misalignment penalty which | 211 | * zeroing the high word, due to the 3-cycle misalignment penalty which |
| 212 | * would occur half the time. This also turns out to be cheaper than | 212 | * would occur half the time. This also turns out to be cheaper than |
| 213 | * doing two separate 8-bit accesses, as the memory is so rarely in the | 213 | * doing two separate 8-bit accesses, as the memory is so rarely in the |
| 214 | * L1 cache. | 214 | * L1 cache. |
| 215 | * | 215 | * |
| 216 | * The window buffer, however, apparently spends a lot of time in the | 216 | * The window buffer, however, apparently spends a lot of time in the |
| 217 | * cache, and so it is faster to retrieve the word at the end of the | 217 | * cache, and so it is faster to retrieve the word at the end of the |
| 218 | * match string with two 8-bit loads. The instructions that test the | 218 | * match string with two 8-bit loads. The instructions that test the |
| 219 | * word at the beginning of the match string, however, are executed | 219 | * word at the beginning of the match string, however, are executed |
| 220 | * much less frequently, and there it was cheaper to use 16-bit | 220 | * much less frequently, and there it was cheaper to use 16-bit |
| 221 | * instructions, which avoided the necessity of saving off and | 221 | * instructions, which avoided the necessity of saving off and |
| 222 | * subsequently reloading one of the other registers. | 222 | * subsequently reloading one of the other registers. |
| 223 | */ | 223 | */ |
| 224 | LookupLoop: | 224 | LookupLoop: |
| 225 | /* 1 U & V */ | 225 | /* 1 U & V */ |
| 226 | movw (%edi,%edx,2), %cx /* 2 U pipe */ | 226 | movw (%edi,%edx,2), %cx /* 2 U pipe */ |
| 227 | movl wmask(%esp), %edx /* 2 V pipe */ | 227 | movl wmask(%esp), %edx /* 2 V pipe */ |
| 228 | cmpl %ebp, %ecx /* 3 U pipe */ | 228 | cmpl %ebp, %ecx /* 3 U pipe */ |
| 229 | jbe LeaveNow /* 3 V pipe */ | 229 | jbe LeaveNow /* 3 V pipe */ |
| 230 | subl $0x00010000, %ebx /* 4 U pipe */ | 230 | subl $0x00010000, %ebx /* 4 U pipe */ |
| 231 | js LeaveNow /* 4 V pipe */ | 231 | js LeaveNow /* 4 V pipe */ |
| 232 | LoopEntry: movb -1(%esi,%ecx), %al /* 5 U pipe */ | 232 | LoopEntry: movb -1(%esi,%ecx), %al /* 5 U pipe */ |
| 233 | andl %ecx, %edx /* 5 V pipe */ | 233 | andl %ecx, %edx /* 5 V pipe */ |
| 234 | cmpb %bl, %al /* 6 U pipe */ | 234 | cmpb %bl, %al /* 6 U pipe */ |
| 235 | jnz LookupLoop /* 6 V pipe */ | 235 | jnz LookupLoop /* 6 V pipe */ |
| 236 | movb (%esi,%ecx), %ah | 236 | movb (%esi,%ecx), %ah |
| 237 | cmpb %bh, %ah | 237 | cmpb %bh, %ah |
| 238 | jnz LookupLoop | 238 | jnz LookupLoop |
| 239 | movl window(%esp), %eax | 239 | movl window(%esp), %eax |
| 240 | movw (%eax,%ecx), %ax | 240 | movw (%eax,%ecx), %ax |
| 241 | cmpw scanstart(%esp), %ax | 241 | cmpw scanstart(%esp), %ax |
| 242 | jnz LookupLoop | 242 | jnz LookupLoop |
| 243 | 243 | ||
| 244 | /* Store the current value of chainlen. */ | 244 | /* Store the current value of chainlen. */ |
| 245 | 245 | ||
| 246 | movl %ebx, chainlenscanend(%esp) | 246 | movl %ebx, chainlenscanend(%esp) |
| 247 | 247 | ||
| 248 | /* Point %edi to the string under scrutiny, and %esi to the string we */ | 248 | /* Point %edi to the string under scrutiny, and %esi to the string we */ |
| 249 | /* are hoping to match it up with. In actuality, %esi and %edi are */ | 249 | /* are hoping to match it up with. In actuality, %esi and %edi are */ |
| 250 | /* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ | 250 | /* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ |
| 251 | /* initialized to -(MAX_MATCH_8 - scanalign). */ | 251 | /* initialized to -(MAX_MATCH_8 - scanalign). */ |
| 252 | 252 | ||
| 253 | movl window(%esp), %esi | 253 | movl window(%esp), %esi |
| 254 | movl scan(%esp), %edi | 254 | movl scan(%esp), %edi |
| 255 | addl %ecx, %esi | 255 | addl %ecx, %esi |
| 256 | movl scanalign(%esp), %eax | 256 | movl scanalign(%esp), %eax |
| 257 | movl $(-MAX_MATCH_8), %edx | 257 | movl $(-MAX_MATCH_8), %edx |
| 258 | lea MAX_MATCH_8(%edi,%eax), %edi | 258 | lea MAX_MATCH_8(%edi,%eax), %edi |
| 259 | lea MAX_MATCH_8(%esi,%eax), %esi | 259 | lea MAX_MATCH_8(%esi,%eax), %esi |
| 260 | 260 | ||
| 261 | /* Test the strings for equality, 8 bytes at a time. At the end, | 261 | /* Test the strings for equality, 8 bytes at a time. At the end, |
| 262 | * adjust %edx so that it is offset to the exact byte that mismatched. | 262 | * adjust %edx so that it is offset to the exact byte that mismatched. |
| 263 | * | 263 | * |
| 264 | * We already know at this point that the first three bytes of the | 264 | * We already know at this point that the first three bytes of the |
| 265 | * strings match each other, and they can be safely passed over before | 265 | * strings match each other, and they can be safely passed over before |
| 266 | * starting the compare loop. So what this code does is skip over 0-3 | 266 | * starting the compare loop. So what this code does is skip over 0-3 |
| 267 | * bytes, as much as necessary in order to dword-align the %edi | 267 | * bytes, as much as necessary in order to dword-align the %edi |
| 268 | * pointer. (%esi will still be misaligned three times out of four.) | 268 | * pointer. (%esi will still be misaligned three times out of four.) |
| 269 | * | 269 | * |
| 270 | * It should be confessed that this loop usually does not represent | 270 | * It should be confessed that this loop usually does not represent |
| 271 | * much of the total running time. Replacing it with a more | 271 | * much of the total running time. Replacing it with a more |
| 272 | * straightforward "rep cmpsb" would not drastically degrade | 272 | * straightforward "rep cmpsb" would not drastically degrade |
| 273 | * performance. | 273 | * performance. |
| 274 | */ | 274 | */ |
| 275 | LoopCmps: | 275 | LoopCmps: |
| 276 | movl (%esi,%edx), %eax | 276 | movl (%esi,%edx), %eax |
| 277 | movl (%edi,%edx), %ebx | 277 | movl (%edi,%edx), %ebx |
| 278 | xorl %ebx, %eax | 278 | xorl %ebx, %eax |
| 279 | jnz LeaveLoopCmps | 279 | jnz LeaveLoopCmps |
| 280 | movl 4(%esi,%edx), %eax | 280 | movl 4(%esi,%edx), %eax |
| 281 | movl 4(%edi,%edx), %ebx | 281 | movl 4(%edi,%edx), %ebx |
| 282 | xorl %ebx, %eax | 282 | xorl %ebx, %eax |
| 283 | jnz LeaveLoopCmps4 | 283 | jnz LeaveLoopCmps4 |
| 284 | addl $8, %edx | 284 | addl $8, %edx |
| 285 | jnz LoopCmps | 285 | jnz LoopCmps |
| 286 | jmp LenMaximum | 286 | jmp LenMaximum |
| 287 | LeaveLoopCmps4: addl $4, %edx | 287 | LeaveLoopCmps4: addl $4, %edx |
| 288 | LeaveLoopCmps: testl $0x0000FFFF, %eax | 288 | LeaveLoopCmps: testl $0x0000FFFF, %eax |
| 289 | jnz LenLower | 289 | jnz LenLower |
| 290 | addl $2, %edx | 290 | addl $2, %edx |
| 291 | shrl $16, %eax | 291 | shrl $16, %eax |
| 292 | LenLower: subb $1, %al | 292 | LenLower: subb $1, %al |
| 293 | adcl $0, %edx | 293 | adcl $0, %edx |
| 294 | 294 | ||
| 295 | /* Calculate the length of the match. If it is longer than MAX_MATCH, */ | 295 | /* Calculate the length of the match. If it is longer than MAX_MATCH, */ |
| 296 | /* then automatically accept it as the best possible match and leave. */ | 296 | /* then automatically accept it as the best possible match and leave. */ |
| 297 | 297 | ||
| 298 | lea (%edi,%edx), %eax | 298 | lea (%edi,%edx), %eax |
| 299 | movl scan(%esp), %edi | 299 | movl scan(%esp), %edi |
| 300 | subl %edi, %eax | 300 | subl %edi, %eax |
| 301 | cmpl $MAX_MATCH, %eax | 301 | cmpl $MAX_MATCH, %eax |
| 302 | jge LenMaximum | 302 | jge LenMaximum |
| 303 | 303 | ||
| 304 | /* If the length of the match is not longer than the best match we */ | 304 | /* If the length of the match is not longer than the best match we */ |
| 305 | /* have so far, then forget it and return to the lookup loop. */ | 305 | /* have so far, then forget it and return to the lookup loop. */ |
| 306 | 306 | ||
| 307 | movl deflatestate(%esp), %edx | 307 | movl deflatestate(%esp), %edx |
| 308 | movl bestlen(%esp), %ebx | 308 | movl bestlen(%esp), %ebx |
| 309 | cmpl %ebx, %eax | 309 | cmpl %ebx, %eax |
| 310 | jg LongerMatch | 310 | jg LongerMatch |
| 311 | movl chainlenscanend(%esp), %ebx | 311 | movl chainlenscanend(%esp), %ebx |
| 312 | movl windowbestlen(%esp), %esi | 312 | movl windowbestlen(%esp), %esi |
| 313 | movl dsPrev(%edx), %edi | 313 | movl dsPrev(%edx), %edi |
| 314 | movl wmask(%esp), %edx | 314 | movl wmask(%esp), %edx |
| 315 | andl %ecx, %edx | 315 | andl %ecx, %edx |
| 316 | jmp LookupLoop | 316 | jmp LookupLoop |
| 317 | 317 | ||
| 318 | /* s->match_start = cur_match; */ | 318 | /* s->match_start = cur_match; */ |
| 319 | /* best_len = len; */ | 319 | /* best_len = len; */ |
| 320 | /* if (len >= nice_match) break; */ | 320 | /* if (len >= nice_match) break; */ |
| 321 | /* scan_end = *(ushf*)(scan+best_len-1); */ | 321 | /* scan_end = *(ushf*)(scan+best_len-1); */ |
| 322 | 322 | ||
| 323 | LongerMatch: movl nicematch(%esp), %ebx | 323 | LongerMatch: movl nicematch(%esp), %ebx |
| 324 | movl %eax, bestlen(%esp) | 324 | movl %eax, bestlen(%esp) |
| 325 | movl %ecx, dsMatchStart(%edx) | 325 | movl %ecx, dsMatchStart(%edx) |
| 326 | cmpl %ebx, %eax | 326 | cmpl %ebx, %eax |
| 327 | jge LeaveNow | 327 | jge LeaveNow |
| 328 | movl window(%esp), %esi | 328 | movl window(%esp), %esi |
| 329 | addl %eax, %esi | 329 | addl %eax, %esi |
| 330 | movl %esi, windowbestlen(%esp) | 330 | movl %esi, windowbestlen(%esp) |
| 331 | movl chainlenscanend(%esp), %ebx | 331 | movl chainlenscanend(%esp), %ebx |
| 332 | movw -1(%edi,%eax), %bx | 332 | movw -1(%edi,%eax), %bx |
| 333 | movl dsPrev(%edx), %edi | 333 | movl dsPrev(%edx), %edi |
| 334 | movl %ebx, chainlenscanend(%esp) | 334 | movl %ebx, chainlenscanend(%esp) |
| 335 | movl wmask(%esp), %edx | 335 | movl wmask(%esp), %edx |
| 336 | andl %ecx, %edx | 336 | andl %ecx, %edx |
| 337 | jmp LookupLoop | 337 | jmp LookupLoop |
| 338 | 338 | ||
| 339 | /* Accept the current string, with the maximum possible length. */ | 339 | /* Accept the current string, with the maximum possible length. */ |
| 340 | 340 | ||
| 341 | LenMaximum: movl deflatestate(%esp), %edx | 341 | LenMaximum: movl deflatestate(%esp), %edx |
| 342 | movl $MAX_MATCH, bestlen(%esp) | 342 | movl $MAX_MATCH, bestlen(%esp) |
| 343 | movl %ecx, dsMatchStart(%edx) | 343 | movl %ecx, dsMatchStart(%edx) |
| 344 | 344 | ||
| 345 | /* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ | 345 | /* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ |
| 346 | /* return s->lookahead; */ | 346 | /* return s->lookahead; */ |
| 347 | 347 | ||
| 348 | LeaveNow: | 348 | LeaveNow: |
| 349 | movl deflatestate(%esp), %edx | 349 | movl deflatestate(%esp), %edx |
| 350 | movl bestlen(%esp), %ebx | 350 | movl bestlen(%esp), %ebx |
| 351 | movl dsLookahead(%edx), %eax | 351 | movl dsLookahead(%edx), %eax |
| 352 | cmpl %eax, %ebx | 352 | cmpl %eax, %ebx |
| 353 | jg LookaheadRet | 353 | jg LookaheadRet |
| 354 | movl %ebx, %eax | 354 | movl %ebx, %eax |
| 355 | LookaheadRet: | 355 | LookaheadRet: |
| 356 | 356 | ||
| 357 | /* Restore the stack and return from whence we came. */ | 357 | /* Restore the stack and return from whence we came. */ |
| 358 | 358 | ||
| 359 | addl $LocalVarsSize, %esp | 359 | addl $LocalVarsSize, %esp |
| 360 | popl %ebx | 360 | popl %ebx |
| 361 | popl %esi | 361 | popl %esi |
| 362 | popl %edi | 362 | popl %edi |
| 363 | popl %ebp | 363 | popl %ebp |
| 364 | match_init: ret | 364 | match_init: ret |
