diff options
author | Mark Adler <madler@alumni.caltech.edu> | 2011-09-09 23:24:43 -0700 |
---|---|---|
committer | Mark Adler <madler@alumni.caltech.edu> | 2011-09-09 23:24:43 -0700 |
commit | 6b8233bfe00e79134cb1b84fc49d4f750a797f79 (patch) | |
tree | ca2b03b0169568681dc3d9c823e9f0bc4417d6b5 /contrib/asm586 | |
parent | 0484693e1723bbab791c56f95597bd7dbe867d03 (diff) | |
download | zlib-1.2.2.3.tar.gz zlib-1.2.2.3.tar.bz2 zlib-1.2.2.3.zip |
zlib 1.2.2.3v1.2.2.3
Diffstat (limited to 'contrib/asm586')
-rw-r--r-- | contrib/asm586/match.S | 718 |
1 files changed, 364 insertions, 354 deletions
diff --git a/contrib/asm586/match.S b/contrib/asm586/match.S index 8f16140..371c9a0 100644 --- a/contrib/asm586/match.S +++ b/contrib/asm586/match.S | |||
@@ -1,354 +1,364 @@ | |||
1 | /* match.s -- Pentium-optimized version of longest_match() | 1 | /* match.s -- Pentium-optimized version of longest_match() |
2 | * Written for zlib 1.1.2 | 2 | * Written for zlib 1.1.2 |
3 | * Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com> | 3 | * Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com> |
4 | * | 4 | * |
5 | * This is free software; you can redistribute it and/or modify it | 5 | * This is free software; you can redistribute it and/or modify it |
6 | * under the terms of the GNU General Public License. | 6 | * under the terms of the GNU General Public License. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #ifndef NO_UNDERLINE | 9 | #ifndef NO_UNDERLINE |
10 | #define match_init _match_init | 10 | #define match_init _match_init |
11 | #define longest_match _longest_match | 11 | #define longest_match _longest_match |
12 | #endif | 12 | #endif |
13 | 13 | ||
14 | #define MAX_MATCH (258) | 14 | #define MAX_MATCH (258) |
15 | #define MIN_MATCH (3) | 15 | #define MIN_MATCH (3) |
16 | #define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1) | 16 | #define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1) |
17 | #define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7) | 17 | #define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7) |
18 | 18 | ||
19 | /* stack frame offsets */ | 19 | /* stack frame offsets */ |
20 | 20 | ||
21 | #define wmask 0 /* local copy of s->wmask */ | 21 | #define wmask 0 /* local copy of s->wmask */ |
22 | #define window 4 /* local copy of s->window */ | 22 | #define window 4 /* local copy of s->window */ |
23 | #define windowbestlen 8 /* s->window + bestlen */ | 23 | #define windowbestlen 8 /* s->window + bestlen */ |
24 | #define chainlenscanend 12 /* high word: current chain len */ | 24 | #define chainlenscanend 12 /* high word: current chain len */ |
25 | /* low word: last bytes sought */ | 25 | /* low word: last bytes sought */ |
26 | #define scanstart 16 /* first two bytes of string */ | 26 | #define scanstart 16 /* first two bytes of string */ |
27 | #define scanalign 20 /* dword-misalignment of string */ | 27 | #define scanalign 20 /* dword-misalignment of string */ |
28 | #define nicematch 24 /* a good enough match size */ | 28 | #define nicematch 24 /* a good enough match size */ |
29 | #define bestlen 28 /* size of best match so far */ | 29 | #define bestlen 28 /* size of best match so far */ |
30 | #define scan 32 /* ptr to string wanting match */ | 30 | #define scan 32 /* ptr to string wanting match */ |
31 | 31 | ||
32 | #define LocalVarsSize (36) | 32 | #define LocalVarsSize (36) |
33 | /* saved ebx 36 */ | 33 | /* saved ebx 36 */ |
34 | /* saved edi 40 */ | 34 | /* saved edi 40 */ |
35 | /* saved esi 44 */ | 35 | /* saved esi 44 */ |
36 | /* saved ebp 48 */ | 36 | /* saved ebp 48 */ |
37 | /* return address 52 */ | 37 | /* return address 52 */ |
38 | #define deflatestate 56 /* the function arguments */ | 38 | #define deflatestate 56 /* the function arguments */ |
39 | #define curmatch 60 | 39 | #define curmatch 60 |
40 | 40 | ||
41 | /* Offsets for fields in the deflate_state structure. These numbers | 41 | /* Offsets for fields in the deflate_state structure. These numbers |
42 | * are calculated from the definition of deflate_state, with the | 42 | * are calculated from the definition of deflate_state, with the |
43 | * assumption that the compiler will dword-align the fields. (Thus, | 43 | * assumption that the compiler will dword-align the fields. (Thus, |
44 | * changing the definition of deflate_state could easily cause this | 44 | * changing the definition of deflate_state could easily cause this |
45 | * program to crash horribly, without so much as a warning at | 45 | * program to crash horribly, without so much as a warning at |
46 | * compile time. Sigh.) | 46 | * compile time. Sigh.) |
47 | */ | 47 | */ |
48 | #define dsWSize 36 | 48 | |
49 | #define dsWMask 44 | 49 | /* All the +zlib1222add offsets are due to the addition of fields |
50 | #define dsWindow 48 | 50 | * in zlib in the deflate_state structure since the asm code was first written |
51 | #define dsPrev 56 | 51 | * (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). |
52 | #define dsMatchLen 88 | 52 | * (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). |
53 | #define dsPrevMatch 92 | 53 | * if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). |
54 | #define dsStrStart 100 | 54 | */ |
55 | #define dsMatchStart 104 | 55 | |
56 | #define dsLookahead 108 | 56 | #define zlib1222add (8) |
57 | #define dsPrevLen 112 | 57 | |
58 | #define dsMaxChainLen 116 | 58 | #define dsWSize (36+zlib1222add) |
59 | #define dsGoodMatch 132 | 59 | #define dsWMask (44+zlib1222add) |
60 | #define dsNiceMatch 136 | 60 | #define dsWindow (48+zlib1222add) |
61 | 61 | #define dsPrev (56+zlib1222add) | |
62 | 62 | #define dsMatchLen (88+zlib1222add) | |
63 | .file "match.S" | 63 | #define dsPrevMatch (92+zlib1222add) |
64 | 64 | #define dsStrStart (100+zlib1222add) | |
65 | .globl match_init, longest_match | 65 | #define dsMatchStart (104+zlib1222add) |
66 | 66 | #define dsLookahead (108+zlib1222add) | |
67 | .text | 67 | #define dsPrevLen (112+zlib1222add) |
68 | 68 | #define dsMaxChainLen (116+zlib1222add) | |
69 | /* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */ | 69 | #define dsGoodMatch (132+zlib1222add) |
70 | 70 | #define dsNiceMatch (136+zlib1222add) | |
71 | longest_match: | 71 | |
72 | 72 | ||
73 | /* Save registers that the compiler may be using, and adjust %esp to */ | 73 | .file "match.S" |
74 | /* make room for our stack frame. */ | 74 | |
75 | 75 | .globl match_init, longest_match | |
76 | pushl %ebp | 76 | |
77 | pushl %edi | 77 | .text |
78 | pushl %esi | 78 | |
79 | pushl %ebx | 79 | /* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */ |
80 | subl $LocalVarsSize, %esp | 80 | |
81 | 81 | longest_match: | |
82 | /* Retrieve the function arguments. %ecx will hold cur_match */ | 82 | |
83 | /* throughout the entire function. %edx will hold the pointer to the */ | 83 | /* Save registers that the compiler may be using, and adjust %esp to */ |
84 | /* deflate_state structure during the function's setup (before */ | 84 | /* make room for our stack frame. */ |
85 | /* entering the main loop). */ | 85 | |
86 | 86 | pushl %ebp | |
87 | movl deflatestate(%esp), %edx | 87 | pushl %edi |
88 | movl curmatch(%esp), %ecx | 88 | pushl %esi |
89 | 89 | pushl %ebx | |
90 | /* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */ | 90 | subl $LocalVarsSize, %esp |
91 | 91 | ||
92 | movl dsNiceMatch(%edx), %eax | 92 | /* Retrieve the function arguments. %ecx will hold cur_match */ |
93 | movl dsLookahead(%edx), %ebx | 93 | /* throughout the entire function. %edx will hold the pointer to the */ |
94 | cmpl %eax, %ebx | 94 | /* deflate_state structure during the function's setup (before */ |
95 | jl LookaheadLess | 95 | /* entering the main loop). */ |
96 | movl %eax, %ebx | 96 | |
97 | LookaheadLess: movl %ebx, nicematch(%esp) | 97 | movl deflatestate(%esp), %edx |
98 | 98 | movl curmatch(%esp), %ecx | |
99 | /* register Bytef *scan = s->window + s->strstart; */ | 99 | |
100 | 100 | /* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */ | |
101 | movl dsWindow(%edx), %esi | 101 | |
102 | movl %esi, window(%esp) | 102 | movl dsNiceMatch(%edx), %eax |
103 | movl dsStrStart(%edx), %ebp | 103 | movl dsLookahead(%edx), %ebx |
104 | lea (%esi,%ebp), %edi | 104 | cmpl %eax, %ebx |
105 | movl %edi, scan(%esp) | 105 | jl LookaheadLess |
106 | 106 | movl %eax, %ebx | |
107 | /* Determine how many bytes the scan ptr is off from being */ | 107 | LookaheadLess: movl %ebx, nicematch(%esp) |
108 | /* dword-aligned. */ | 108 | |
109 | 109 | /* register Bytef *scan = s->window + s->strstart; */ | |
110 | movl %edi, %eax | 110 | |
111 | negl %eax | 111 | movl dsWindow(%edx), %esi |
112 | andl $3, %eax | 112 | movl %esi, window(%esp) |
113 | movl %eax, scanalign(%esp) | 113 | movl dsStrStart(%edx), %ebp |
114 | 114 | lea (%esi,%ebp), %edi | |
115 | /* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ | 115 | movl %edi, scan(%esp) |
116 | /* s->strstart - (IPos)MAX_DIST(s) : NIL; */ | 116 | |
117 | 117 | /* Determine how many bytes the scan ptr is off from being */ | |
118 | movl dsWSize(%edx), %eax | 118 | /* dword-aligned. */ |
119 | subl $MIN_LOOKAHEAD, %eax | 119 | |
120 | subl %eax, %ebp | 120 | movl %edi, %eax |
121 | jg LimitPositive | 121 | negl %eax |
122 | xorl %ebp, %ebp | 122 | andl $3, %eax |
123 | LimitPositive: | 123 | movl %eax, scanalign(%esp) |
124 | 124 | ||
125 | /* unsigned chain_length = s->max_chain_length; */ | 125 | /* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ |
126 | /* if (s->prev_length >= s->good_match) { */ | 126 | /* s->strstart - (IPos)MAX_DIST(s) : NIL; */ |
127 | /* chain_length >>= 2; */ | 127 | |
128 | /* } */ | 128 | movl dsWSize(%edx), %eax |
129 | 129 | subl $MIN_LOOKAHEAD, %eax | |
130 | movl dsPrevLen(%edx), %eax | 130 | subl %eax, %ebp |
131 | movl dsGoodMatch(%edx), %ebx | 131 | jg LimitPositive |
132 | cmpl %ebx, %eax | 132 | xorl %ebp, %ebp |
133 | movl dsMaxChainLen(%edx), %ebx | 133 | LimitPositive: |
134 | jl LastMatchGood | 134 | |
135 | shrl $2, %ebx | 135 | /* unsigned chain_length = s->max_chain_length; */ |
136 | LastMatchGood: | 136 | /* if (s->prev_length >= s->good_match) { */ |
137 | 137 | /* chain_length >>= 2; */ | |
138 | /* chainlen is decremented once beforehand so that the function can */ | 138 | /* } */ |
139 | /* use the sign flag instead of the zero flag for the exit test. */ | 139 | |
140 | /* It is then shifted into the high word, to make room for the scanend */ | 140 | movl dsPrevLen(%edx), %eax |
141 | /* scanend value, which it will always accompany. */ | 141 | movl dsGoodMatch(%edx), %ebx |
142 | 142 | cmpl %ebx, %eax | |
143 | decl %ebx | 143 | movl dsMaxChainLen(%edx), %ebx |
144 | shll $16, %ebx | 144 | jl LastMatchGood |
145 | 145 | shrl $2, %ebx | |
146 | /* int best_len = s->prev_length; */ | 146 | LastMatchGood: |
147 | 147 | ||
148 | movl dsPrevLen(%edx), %eax | 148 | /* chainlen is decremented once beforehand so that the function can */ |
149 | movl %eax, bestlen(%esp) | 149 | /* use the sign flag instead of the zero flag for the exit test. */ |
150 | 150 | /* It is then shifted into the high word, to make room for the scanend */ | |
151 | /* Store the sum of s->window + best_len in %esi locally, and in %esi. */ | 151 | /* scanend value, which it will always accompany. */ |
152 | 152 | ||
153 | addl %eax, %esi | 153 | decl %ebx |
154 | movl %esi, windowbestlen(%esp) | 154 | shll $16, %ebx |
155 | 155 | ||
156 | /* register ush scan_start = *(ushf*)scan; */ | 156 | /* int best_len = s->prev_length; */ |
157 | /* register ush scan_end = *(ushf*)(scan+best_len-1); */ | 157 | |
158 | 158 | movl dsPrevLen(%edx), %eax | |
159 | movw (%edi), %bx | 159 | movl %eax, bestlen(%esp) |
160 | movw %bx, scanstart(%esp) | 160 | |
161 | movw -1(%edi,%eax), %bx | 161 | /* Store the sum of s->window + best_len in %esi locally, and in %esi. */ |
162 | movl %ebx, chainlenscanend(%esp) | 162 | |
163 | 163 | addl %eax, %esi | |
164 | /* Posf *prev = s->prev; */ | 164 | movl %esi, windowbestlen(%esp) |
165 | /* uInt wmask = s->w_mask; */ | 165 | |
166 | 166 | /* register ush scan_start = *(ushf*)scan; */ | |
167 | movl dsPrev(%edx), %edi | 167 | /* register ush scan_end = *(ushf*)(scan+best_len-1); */ |
168 | movl dsWMask(%edx), %edx | 168 | |
169 | mov %edx, wmask(%esp) | 169 | movw (%edi), %bx |
170 | 170 | movw %bx, scanstart(%esp) | |
171 | /* Jump into the main loop. */ | 171 | movw -1(%edi,%eax), %bx |
172 | 172 | movl %ebx, chainlenscanend(%esp) | |
173 | jmp LoopEntry | 173 | |
174 | 174 | /* Posf *prev = s->prev; */ | |
175 | .balign 16 | 175 | /* uInt wmask = s->w_mask; */ |
176 | 176 | ||
177 | /* do { | 177 | movl dsPrev(%edx), %edi |
178 | * match = s->window + cur_match; | 178 | movl dsWMask(%edx), %edx |
179 | * if (*(ushf*)(match+best_len-1) != scan_end || | 179 | mov %edx, wmask(%esp) |
180 | * *(ushf*)match != scan_start) continue; | 180 | |
181 | * [...] | 181 | /* Jump into the main loop. */ |
182 | * } while ((cur_match = prev[cur_match & wmask]) > limit | 182 | |
183 | * && --chain_length != 0); | 183 | jmp LoopEntry |
184 | * | 184 | |
185 | * Here is the inner loop of the function. The function will spend the | 185 | .balign 16 |
186 | * majority of its time in this loop, and majority of that time will | 186 | |
187 | * be spent in the first ten instructions. | 187 | /* do { |
188 | * | 188 | * match = s->window + cur_match; |
189 | * Within this loop: | 189 | * if (*(ushf*)(match+best_len-1) != scan_end || |
190 | * %ebx = chainlenscanend - i.e., ((chainlen << 16) | scanend) | 190 | * *(ushf*)match != scan_start) continue; |
191 | * %ecx = curmatch | 191 | * [...] |
192 | * %edx = curmatch & wmask | 192 | * } while ((cur_match = prev[cur_match & wmask]) > limit |
193 | * %esi = windowbestlen - i.e., (window + bestlen) | 193 | * && --chain_length != 0); |
194 | * %edi = prev | 194 | * |
195 | * %ebp = limit | 195 | * Here is the inner loop of the function. The function will spend the |
196 | * | 196 | * majority of its time in this loop, and majority of that time will |
197 | * Two optimization notes on the choice of instructions: | 197 | * be spent in the first ten instructions. |
198 | * | 198 | * |
199 | * The first instruction uses a 16-bit address, which costs an extra, | 199 | * Within this loop: |
200 | * unpairable cycle. This is cheaper than doing a 32-bit access and | 200 | * %ebx = chainlenscanend - i.e., ((chainlen << 16) | scanend) |
201 | * zeroing the high word, due to the 3-cycle misalignment penalty which | 201 | * %ecx = curmatch |
202 | * would occur half the time. This also turns out to be cheaper than | 202 | * %edx = curmatch & wmask |
203 | * doing two separate 8-bit accesses, as the memory is so rarely in the | 203 | * %esi = windowbestlen - i.e., (window + bestlen) |
204 | * L1 cache. | 204 | * %edi = prev |
205 | * | 205 | * %ebp = limit |
206 | * The window buffer, however, apparently spends a lot of time in the | 206 | * |
207 | * cache, and so it is faster to retrieve the word at the end of the | 207 | * Two optimization notes on the choice of instructions: |
208 | * match string with two 8-bit loads. The instructions that test the | 208 | * |
209 | * word at the beginning of the match string, however, are executed | 209 | * The first instruction uses a 16-bit address, which costs an extra, |
210 | * much less frequently, and there it was cheaper to use 16-bit | 210 | * unpairable cycle. This is cheaper than doing a 32-bit access and |
211 | * instructions, which avoided the necessity of saving off and | 211 | * zeroing the high word, due to the 3-cycle misalignment penalty which |
212 | * subsequently reloading one of the other registers. | 212 | * would occur half the time. This also turns out to be cheaper than |
213 | */ | 213 | * doing two separate 8-bit accesses, as the memory is so rarely in the |
214 | LookupLoop: | 214 | * L1 cache. |
215 | /* 1 U & V */ | 215 | * |
216 | movw (%edi,%edx,2), %cx /* 2 U pipe */ | 216 | * The window buffer, however, apparently spends a lot of time in the |
217 | movl wmask(%esp), %edx /* 2 V pipe */ | 217 | * cache, and so it is faster to retrieve the word at the end of the |
218 | cmpl %ebp, %ecx /* 3 U pipe */ | 218 | * match string with two 8-bit loads. The instructions that test the |
219 | jbe LeaveNow /* 3 V pipe */ | 219 | * word at the beginning of the match string, however, are executed |
220 | subl $0x00010000, %ebx /* 4 U pipe */ | 220 | * much less frequently, and there it was cheaper to use 16-bit |
221 | js LeaveNow /* 4 V pipe */ | 221 | * instructions, which avoided the necessity of saving off and |
222 | LoopEntry: movb -1(%esi,%ecx), %al /* 5 U pipe */ | 222 | * subsequently reloading one of the other registers. |
223 | andl %ecx, %edx /* 5 V pipe */ | 223 | */ |
224 | cmpb %bl, %al /* 6 U pipe */ | 224 | LookupLoop: |
225 | jnz LookupLoop /* 6 V pipe */ | 225 | /* 1 U & V */ |
226 | movb (%esi,%ecx), %ah | 226 | movw (%edi,%edx,2), %cx /* 2 U pipe */ |
227 | cmpb %bh, %ah | 227 | movl wmask(%esp), %edx /* 2 V pipe */ |
228 | jnz LookupLoop | 228 | cmpl %ebp, %ecx /* 3 U pipe */ |
229 | movl window(%esp), %eax | 229 | jbe LeaveNow /* 3 V pipe */ |
230 | movw (%eax,%ecx), %ax | 230 | subl $0x00010000, %ebx /* 4 U pipe */ |
231 | cmpw scanstart(%esp), %ax | 231 | js LeaveNow /* 4 V pipe */ |
232 | jnz LookupLoop | 232 | LoopEntry: movb -1(%esi,%ecx), %al /* 5 U pipe */ |
233 | 233 | andl %ecx, %edx /* 5 V pipe */ | |
234 | /* Store the current value of chainlen. */ | 234 | cmpb %bl, %al /* 6 U pipe */ |
235 | 235 | jnz LookupLoop /* 6 V pipe */ | |
236 | movl %ebx, chainlenscanend(%esp) | 236 | movb (%esi,%ecx), %ah |
237 | 237 | cmpb %bh, %ah | |
238 | /* Point %edi to the string under scrutiny, and %esi to the string we */ | 238 | jnz LookupLoop |
239 | /* are hoping to match it up with. In actuality, %esi and %edi are */ | 239 | movl window(%esp), %eax |
240 | /* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ | 240 | movw (%eax,%ecx), %ax |
241 | /* initialized to -(MAX_MATCH_8 - scanalign). */ | 241 | cmpw scanstart(%esp), %ax |
242 | 242 | jnz LookupLoop | |
243 | movl window(%esp), %esi | 243 | |
244 | movl scan(%esp), %edi | 244 | /* Store the current value of chainlen. */ |
245 | addl %ecx, %esi | 245 | |
246 | movl scanalign(%esp), %eax | 246 | movl %ebx, chainlenscanend(%esp) |
247 | movl $(-MAX_MATCH_8), %edx | 247 | |
248 | lea MAX_MATCH_8(%edi,%eax), %edi | 248 | /* Point %edi to the string under scrutiny, and %esi to the string we */ |
249 | lea MAX_MATCH_8(%esi,%eax), %esi | 249 | /* are hoping to match it up with. In actuality, %esi and %edi are */ |
250 | 250 | /* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ | |
251 | /* Test the strings for equality, 8 bytes at a time. At the end, | 251 | /* initialized to -(MAX_MATCH_8 - scanalign). */ |
252 | * adjust %edx so that it is offset to the exact byte that mismatched. | 252 | |
253 | * | 253 | movl window(%esp), %esi |
254 | * We already know at this point that the first three bytes of the | 254 | movl scan(%esp), %edi |
255 | * strings match each other, and they can be safely passed over before | 255 | addl %ecx, %esi |
256 | * starting the compare loop. So what this code does is skip over 0-3 | 256 | movl scanalign(%esp), %eax |
257 | * bytes, as much as necessary in order to dword-align the %edi | 257 | movl $(-MAX_MATCH_8), %edx |
258 | * pointer. (%esi will still be misaligned three times out of four.) | 258 | lea MAX_MATCH_8(%edi,%eax), %edi |
259 | * | 259 | lea MAX_MATCH_8(%esi,%eax), %esi |
260 | * It should be confessed that this loop usually does not represent | 260 | |
261 | * much of the total running time. Replacing it with a more | 261 | /* Test the strings for equality, 8 bytes at a time. At the end, |
262 | * straightforward "rep cmpsb" would not drastically degrade | 262 | * adjust %edx so that it is offset to the exact byte that mismatched. |
263 | * performance. | 263 | * |
264 | */ | 264 | * We already know at this point that the first three bytes of the |
265 | LoopCmps: | 265 | * strings match each other, and they can be safely passed over before |
266 | movl (%esi,%edx), %eax | 266 | * starting the compare loop. So what this code does is skip over 0-3 |
267 | movl (%edi,%edx), %ebx | 267 | * bytes, as much as necessary in order to dword-align the %edi |
268 | xorl %ebx, %eax | 268 | * pointer. (%esi will still be misaligned three times out of four.) |
269 | jnz LeaveLoopCmps | 269 | * |
270 | movl 4(%esi,%edx), %eax | 270 | * It should be confessed that this loop usually does not represent |
271 | movl 4(%edi,%edx), %ebx | 271 | * much of the total running time. Replacing it with a more |
272 | xorl %ebx, %eax | 272 | * straightforward "rep cmpsb" would not drastically degrade |
273 | jnz LeaveLoopCmps4 | 273 | * performance. |
274 | addl $8, %edx | 274 | */ |
275 | jnz LoopCmps | 275 | LoopCmps: |
276 | jmp LenMaximum | 276 | movl (%esi,%edx), %eax |
277 | LeaveLoopCmps4: addl $4, %edx | 277 | movl (%edi,%edx), %ebx |
278 | LeaveLoopCmps: testl $0x0000FFFF, %eax | 278 | xorl %ebx, %eax |
279 | jnz LenLower | 279 | jnz LeaveLoopCmps |
280 | addl $2, %edx | 280 | movl 4(%esi,%edx), %eax |
281 | shrl $16, %eax | 281 | movl 4(%edi,%edx), %ebx |
282 | LenLower: subb $1, %al | 282 | xorl %ebx, %eax |
283 | adcl $0, %edx | 283 | jnz LeaveLoopCmps4 |
284 | 284 | addl $8, %edx | |
285 | /* Calculate the length of the match. If it is longer than MAX_MATCH, */ | 285 | jnz LoopCmps |
286 | /* then automatically accept it as the best possible match and leave. */ | 286 | jmp LenMaximum |
287 | 287 | LeaveLoopCmps4: addl $4, %edx | |
288 | lea (%edi,%edx), %eax | 288 | LeaveLoopCmps: testl $0x0000FFFF, %eax |
289 | movl scan(%esp), %edi | 289 | jnz LenLower |
290 | subl %edi, %eax | 290 | addl $2, %edx |
291 | cmpl $MAX_MATCH, %eax | 291 | shrl $16, %eax |
292 | jge LenMaximum | 292 | LenLower: subb $1, %al |
293 | 293 | adcl $0, %edx | |
294 | /* If the length of the match is not longer than the best match we */ | 294 | |
295 | /* have so far, then forget it and return to the lookup loop. */ | 295 | /* Calculate the length of the match. If it is longer than MAX_MATCH, */ |
296 | 296 | /* then automatically accept it as the best possible match and leave. */ | |
297 | movl deflatestate(%esp), %edx | 297 | |
298 | movl bestlen(%esp), %ebx | 298 | lea (%edi,%edx), %eax |
299 | cmpl %ebx, %eax | 299 | movl scan(%esp), %edi |
300 | jg LongerMatch | 300 | subl %edi, %eax |
301 | movl chainlenscanend(%esp), %ebx | 301 | cmpl $MAX_MATCH, %eax |
302 | movl windowbestlen(%esp), %esi | 302 | jge LenMaximum |
303 | movl dsPrev(%edx), %edi | 303 | |
304 | movl wmask(%esp), %edx | 304 | /* If the length of the match is not longer than the best match we */ |
305 | andl %ecx, %edx | 305 | /* have so far, then forget it and return to the lookup loop. */ |
306 | jmp LookupLoop | 306 | |
307 | 307 | movl deflatestate(%esp), %edx | |
308 | /* s->match_start = cur_match; */ | 308 | movl bestlen(%esp), %ebx |
309 | /* best_len = len; */ | 309 | cmpl %ebx, %eax |
310 | /* if (len >= nice_match) break; */ | 310 | jg LongerMatch |
311 | /* scan_end = *(ushf*)(scan+best_len-1); */ | 311 | movl chainlenscanend(%esp), %ebx |
312 | 312 | movl windowbestlen(%esp), %esi | |
313 | LongerMatch: movl nicematch(%esp), %ebx | 313 | movl dsPrev(%edx), %edi |
314 | movl %eax, bestlen(%esp) | 314 | movl wmask(%esp), %edx |
315 | movl %ecx, dsMatchStart(%edx) | 315 | andl %ecx, %edx |
316 | cmpl %ebx, %eax | 316 | jmp LookupLoop |
317 | jge LeaveNow | 317 | |
318 | movl window(%esp), %esi | 318 | /* s->match_start = cur_match; */ |
319 | addl %eax, %esi | 319 | /* best_len = len; */ |
320 | movl %esi, windowbestlen(%esp) | 320 | /* if (len >= nice_match) break; */ |
321 | movl chainlenscanend(%esp), %ebx | 321 | /* scan_end = *(ushf*)(scan+best_len-1); */ |
322 | movw -1(%edi,%eax), %bx | 322 | |
323 | movl dsPrev(%edx), %edi | 323 | LongerMatch: movl nicematch(%esp), %ebx |
324 | movl %ebx, chainlenscanend(%esp) | 324 | movl %eax, bestlen(%esp) |
325 | movl wmask(%esp), %edx | 325 | movl %ecx, dsMatchStart(%edx) |
326 | andl %ecx, %edx | 326 | cmpl %ebx, %eax |
327 | jmp LookupLoop | 327 | jge LeaveNow |
328 | 328 | movl window(%esp), %esi | |
329 | /* Accept the current string, with the maximum possible length. */ | 329 | addl %eax, %esi |
330 | 330 | movl %esi, windowbestlen(%esp) | |
331 | LenMaximum: movl deflatestate(%esp), %edx | 331 | movl chainlenscanend(%esp), %ebx |
332 | movl $MAX_MATCH, bestlen(%esp) | 332 | movw -1(%edi,%eax), %bx |
333 | movl %ecx, dsMatchStart(%edx) | 333 | movl dsPrev(%edx), %edi |
334 | 334 | movl %ebx, chainlenscanend(%esp) | |
335 | /* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ | 335 | movl wmask(%esp), %edx |
336 | /* return s->lookahead; */ | 336 | andl %ecx, %edx |
337 | 337 | jmp LookupLoop | |
338 | LeaveNow: | 338 | |
339 | movl deflatestate(%esp), %edx | 339 | /* Accept the current string, with the maximum possible length. */ |
340 | movl bestlen(%esp), %ebx | 340 | |
341 | movl dsLookahead(%edx), %eax | 341 | LenMaximum: movl deflatestate(%esp), %edx |
342 | cmpl %eax, %ebx | 342 | movl $MAX_MATCH, bestlen(%esp) |
343 | jg LookaheadRet | 343 | movl %ecx, dsMatchStart(%edx) |
344 | movl %ebx, %eax | 344 | |
345 | LookaheadRet: | 345 | /* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ |
346 | 346 | /* return s->lookahead; */ | |
347 | /* Restore the stack and return from whence we came. */ | 347 | |
348 | 348 | LeaveNow: | |
349 | addl $LocalVarsSize, %esp | 349 | movl deflatestate(%esp), %edx |
350 | popl %ebx | 350 | movl bestlen(%esp), %ebx |
351 | popl %esi | 351 | movl dsLookahead(%edx), %eax |
352 | popl %edi | 352 | cmpl %eax, %ebx |
353 | popl %ebp | 353 | jg LookaheadRet |
354 | match_init: ret | 354 | movl %ebx, %eax |
355 | LookaheadRet: | ||
356 | |||
357 | /* Restore the stack and return from whence we came. */ | ||
358 | |||
359 | addl $LocalVarsSize, %esp | ||
360 | popl %ebx | ||
361 | popl %esi | ||
362 | popl %edi | ||
363 | popl %ebp | ||
364 | match_init: ret | ||