Diffstat (limited to 'contrib/asm586/match.S')
-rw-r--r-- | contrib/asm586/match.S | 728
1 file changed, 364 insertions, 364 deletions
diff --git a/contrib/asm586/match.S b/contrib/asm586/match.S
index 371c9a0..0368b35 100644
--- a/contrib/asm586/match.S
+++ b/contrib/asm586/match.S
@@ -1,364 +1,364 @@
/* match.s -- Pentium-optimized version of longest_match()
 * Written for zlib 1.1.2
 * Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License.
 */

#ifndef NO_UNDERLINE
#define match_init      _match_init
#define longest_match   _longest_match
#endif

#define MAX_MATCH       (258)
#define MIN_MATCH       (3)
#define MIN_LOOKAHEAD   (MAX_MATCH + MIN_MATCH + 1)
#define MAX_MATCH_8     ((MAX_MATCH + 7) & ~7)

/* stack frame offsets */

#define wmask            0      /* local copy of s->wmask */
#define window           4      /* local copy of s->window */
#define windowbestlen    8      /* s->window + bestlen */
#define chainlenscanend 12      /* high word: current chain len */
                                /* low word: last bytes sought */
#define scanstart       16      /* first two bytes of string */
#define scanalign       20      /* dword-misalignment of string */
#define nicematch       24      /* a good enough match size */
#define bestlen         28      /* size of best match so far */
#define scan            32      /* ptr to string wanting match */

#define LocalVarsSize   (36)
/*      saved ebx       36 */
/*      saved edi       40 */
/*      saved esi       44 */
/*      saved ebp       48 */
/*      return address  52 */
#define deflatestate    56      /* the function arguments */
#define curmatch        60

/* Offsets for fields in the deflate_state structure. These numbers
 * are calculated from the definition of deflate_state, with the
 * assumption that the compiler will dword-align the fields. (Thus,
 * changing the definition of deflate_state could easily cause this
 * program to crash horribly, without so much as a warning at
 * compile time. Sigh.)
 */

/* All the +zlib1222add offsets are due to the addition of fields
 * to zlib's deflate_state structure since the asm code was first written
 * (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
 * (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
 * (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").
 */

#define zlib1222add     (8)

#define dsWSize         (36+zlib1222add)
#define dsWMask         (44+zlib1222add)
#define dsWindow        (48+zlib1222add)
#define dsPrev          (56+zlib1222add)
#define dsMatchLen      (88+zlib1222add)
#define dsPrevMatch     (92+zlib1222add)
#define dsStrStart      (100+zlib1222add)
#define dsMatchStart    (104+zlib1222add)
#define dsLookahead     (108+zlib1222add)
#define dsPrevLen       (112+zlib1222add)
#define dsMaxChainLen   (116+zlib1222add)
#define dsGoodMatch     (132+zlib1222add)
#define dsNiceMatch     (136+zlib1222add)

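/* (Reference note, not part of the original match.S: the ds* offsets above
 * are assumed to correspond to the deflate_state members w_size, w_mask,
 * window, prev, match_length, prev_match, strstart, match_start, lookahead,
 * prev_length, max_chain_length, good_match and nice_match in zlib's
 * deflate.h, in that order. Check the struct layout in your zlib release
 * before trusting these hard-coded numbers.)
 */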

        .file "match.S"

        .globl  match_init, longest_match

        .text

/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */

longest_match:

/* Save registers that the compiler may be using, and adjust %esp to */
/* make room for our stack frame. */

        pushl   %ebp
        pushl   %edi
        pushl   %esi
        pushl   %ebx
        subl    $LocalVarsSize, %esp

/* Retrieve the function arguments. %ecx will hold cur_match */
/* throughout the entire function. %edx will hold the pointer to the */
/* deflate_state structure during the function's setup (before */
/* entering the main loop). */

        movl    deflatestate(%esp), %edx
        movl    curmatch(%esp), %ecx

/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */

        movl    dsNiceMatch(%edx), %eax
        movl    dsLookahead(%edx), %ebx
        cmpl    %eax, %ebx
        jl      LookaheadLess
        movl    %eax, %ebx
LookaheadLess:  movl    %ebx, nicematch(%esp)

/* register Bytef *scan = s->window + s->strstart; */

        movl    dsWindow(%edx), %esi
        movl    %esi, window(%esp)
        movl    dsStrStart(%edx), %ebp
        lea     (%esi,%ebp), %edi
        movl    %edi, scan(%esp)

/* Determine how many bytes the scan ptr is off from being */
/* dword-aligned. */

        movl    %edi, %eax
        negl    %eax
        andl    $3, %eax
        movl    %eax, scanalign(%esp)

/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
/*     s->strstart - (IPos)MAX_DIST(s) : NIL; */

        movl    dsWSize(%edx), %eax
        subl    $MIN_LOOKAHEAD, %eax
        subl    %eax, %ebp
        jg      LimitPositive
        xorl    %ebp, %ebp
LimitPositive:

/* unsigned chain_length = s->max_chain_length; */
/* if (s->prev_length >= s->good_match) { */
/*     chain_length >>= 2; */
/* } */

        movl    dsPrevLen(%edx), %eax
        movl    dsGoodMatch(%edx), %ebx
        cmpl    %ebx, %eax
        movl    dsMaxChainLen(%edx), %ebx
        jl      LastMatchGood
        shrl    $2, %ebx
LastMatchGood:

/* chainlen is decremented once beforehand so that the function can */
/* use the sign flag instead of the zero flag for the exit test. */
/* It is then shifted into the high word, to make room for the */
/* scanend value, which it will always accompany. */

        decl    %ebx
        shll    $16, %ebx

/* int best_len = s->prev_length; */

        movl    dsPrevLen(%edx), %eax
        movl    %eax, bestlen(%esp)

/* Store the sum of s->window + best_len in windowbestlen locally, and in %esi. */

        addl    %eax, %esi
        movl    %esi, windowbestlen(%esp)

/* register ush scan_start = *(ushf*)scan; */
/* register ush scan_end   = *(ushf*)(scan+best_len-1); */

        movw    (%edi), %bx
        movw    %bx, scanstart(%esp)
        movw    -1(%edi,%eax), %bx
        movl    %ebx, chainlenscanend(%esp)

/* Posf *prev = s->prev; */
/* uInt wmask = s->w_mask; */

        movl    dsPrev(%edx), %edi
        movl    dsWMask(%edx), %edx
        mov     %edx, wmask(%esp)

/* Jump into the main loop. */

        jmp     LoopEntry

        .balign 16

/* do {
 *     match = s->window + cur_match;
 *     if (*(ushf*)(match+best_len-1) != scan_end ||
 *         *(ushf*)match != scan_start) continue;
 *     [...]
 * } while ((cur_match = prev[cur_match & wmask]) > limit
 *          && --chain_length != 0);
 *
 * Here is the inner loop of the function. The function will spend the
 * majority of its time in this loop, and the majority of that time will
 * be spent in the first ten instructions.
 *
 * Within this loop:
 * %ebx = chainlenscanend - i.e., ((chainlen << 16) | scanend)
 * %ecx = curmatch
 * %edx = curmatch & wmask
 * %esi = windowbestlen - i.e., (window + bestlen)
 * %edi = prev
 * %ebp = limit
 *
 * Two optimization notes on the choice of instructions:
 *
 * The first instruction uses a 16-bit address, which costs an extra,
 * unpairable cycle. This is cheaper than doing a 32-bit access and
 * zeroing the high word, due to the 3-cycle misalignment penalty which
 * would occur half the time. This also turns out to be cheaper than
 * doing two separate 8-bit accesses, as the memory is so rarely in the
 * L1 cache.
 *
 * The window buffer, however, apparently spends a lot of time in the
 * cache, and so it is faster to retrieve the word at the end of the
 * match string with two 8-bit loads. The instructions that test the
 * word at the beginning of the match string, however, are executed
 * much less frequently, and there it was cheaper to use 16-bit
 * instructions, which avoided the necessity of saving off and
 * subsequently reloading one of the other registers.
 */
LookupLoop:
                                        /* 1 U & V  */
        movw    (%edi,%edx,2), %cx      /* 2 U pipe */
        movl    wmask(%esp), %edx       /* 2 V pipe */
        cmpl    %ebp, %ecx              /* 3 U pipe */
        jbe     LeaveNow                /* 3 V pipe */
        subl    $0x00010000, %ebx       /* 4 U pipe */
        js      LeaveNow                /* 4 V pipe */
LoopEntry:      movb    -1(%esi,%ecx), %al      /* 5 U pipe */
        andl    %ecx, %edx              /* 5 V pipe */
        cmpb    %bl, %al                /* 6 U pipe */
        jnz     LookupLoop              /* 6 V pipe */
        movb    (%esi,%ecx), %ah
        cmpb    %bh, %ah
        jnz     LookupLoop
        movl    window(%esp), %eax
        movw    (%eax,%ecx), %ax
        cmpw    scanstart(%esp), %ax
        jnz     LookupLoop

/* Store the current value of chainlen. */

        movl    %ebx, chainlenscanend(%esp)

/* Point %edi to the string under scrutiny, and %esi to the string we */
/* are hoping to match it up with. In actuality, %esi and %edi are */
/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */
/* initialized to -(MAX_MATCH_8 - scanalign). */

        movl    window(%esp), %esi
        movl    scan(%esp), %edi
        addl    %ecx, %esi
        movl    scanalign(%esp), %eax
        movl    $(-MAX_MATCH_8), %edx
        lea     MAX_MATCH_8(%edi,%eax), %edi
        lea     MAX_MATCH_8(%esi,%eax), %esi

/* Test the strings for equality, 8 bytes at a time. At the end,
 * adjust %edx so that it is offset to the exact byte that mismatched.
 *
 * We already know at this point that the first three bytes of the
 * strings match each other, and they can be safely passed over before
 * starting the compare loop. So what this code does is skip over 0-3
 * bytes, as much as necessary in order to dword-align the %edi
 * pointer. (%esi will still be misaligned three times out of four.)
 *
 * It should be confessed that this loop usually does not represent
 * much of the total running time. Replacing it with a more
 * straightforward "rep cmpsb" would not drastically degrade
 * performance.
 */
LoopCmps:
        movl    (%esi,%edx), %eax
        movl    (%edi,%edx), %ebx
        xorl    %ebx, %eax
        jnz     LeaveLoopCmps
        movl    4(%esi,%edx), %eax
        movl    4(%edi,%edx), %ebx
        xorl    %ebx, %eax
        jnz     LeaveLoopCmps4
        addl    $8, %edx
        jnz     LoopCmps
        jmp     LenMaximum
LeaveLoopCmps4: addl    $4, %edx
LeaveLoopCmps:  testl   $0x0000FFFF, %eax
        jnz     LenLower
        addl    $2, %edx
        shrl    $16, %eax
LenLower:       subb    $1, %al
        adcl    $0, %edx

/* Calculate the length of the match. If it is longer than MAX_MATCH, */
/* then automatically accept it as the best possible match and leave. */

        lea     (%edi,%edx), %eax
        movl    scan(%esp), %edi
        subl    %edi, %eax
        cmpl    $MAX_MATCH, %eax
        jge     LenMaximum

/* If the length of the match is not longer than the best match we */
/* have so far, then forget it and return to the lookup loop. */

        movl    deflatestate(%esp), %edx
        movl    bestlen(%esp), %ebx
        cmpl    %ebx, %eax
        jg      LongerMatch
        movl    chainlenscanend(%esp), %ebx
        movl    windowbestlen(%esp), %esi
        movl    dsPrev(%edx), %edi
        movl    wmask(%esp), %edx
        andl    %ecx, %edx
        jmp     LookupLoop

/* s->match_start = cur_match; */
/* best_len = len; */
/* if (len >= nice_match) break; */
/* scan_end = *(ushf*)(scan+best_len-1); */

LongerMatch:    movl    nicematch(%esp), %ebx
        movl    %eax, bestlen(%esp)
        movl    %ecx, dsMatchStart(%edx)
        cmpl    %ebx, %eax
        jge     LeaveNow
        movl    window(%esp), %esi
        addl    %eax, %esi
        movl    %esi, windowbestlen(%esp)
        movl    chainlenscanend(%esp), %ebx
        movw    -1(%edi,%eax), %bx
        movl    dsPrev(%edx), %edi
        movl    %ebx, chainlenscanend(%esp)
        movl    wmask(%esp), %edx
        andl    %ecx, %edx
        jmp     LookupLoop

/* Accept the current string, with the maximum possible length. */

LenMaximum:     movl    deflatestate(%esp), %edx
        movl    $MAX_MATCH, bestlen(%esp)
        movl    %ecx, dsMatchStart(%edx)

/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
/* return s->lookahead; */

LeaveNow:
        movl    deflatestate(%esp), %edx
        movl    bestlen(%esp), %ebx
        movl    dsLookahead(%edx), %eax
        cmpl    %eax, %ebx
        jg      LookaheadRet
        movl    %ebx, %eax
LookaheadRet:

/* Restore the stack and return from whence we came. */

        addl    $LocalVarsSize, %esp
        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
match_init:     ret
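
For readers who want to set the assembly against the C it replaces: the fragments quoted in the comments above assemble into roughly the routine below. This is a sketch reconstructed from those comments, not the verbatim longest_match() from zlib's deflate.c; it assumes it is built inside the zlib source tree (deflate.h supplies deflate_state, IPos, Bytef, Posf, ush, ushf, MAX_MATCH and MAX_DIST), the helper name longest_match_sketch is made up for illustration, and a plain byte-by-byte compare stands in for the dword-at-a-time LoopCmps trick.

#include "deflate.h"   /* zlib internal header: types and MAX_MATCH/MAX_DIST */

/* Sketch of the reference search loop, pieced together from the comments
 * in match.S above -- not copied verbatim from deflate.c. */
static uInt longest_match_sketch(deflate_state *s, IPos cur_match)
{
    unsigned chain_length = s->max_chain_length;
    Bytef *scan = s->window + s->strstart;   /* string to find a match for */
    int best_len = s->prev_length;           /* best match length so far */
    IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
                 s->strstart - (IPos)MAX_DIST(s) : 0;
    Posf *prev = s->prev;
    uInt wmask = s->w_mask;
    ush scan_start = *(ushf*)scan;
    ush scan_end   = *(ushf*)(scan + best_len - 1);
    uInt nice_match = s->nice_match;

    if (s->prev_length >= s->good_match) chain_length >>= 2;
    if (nice_match > s->lookahead) nice_match = s->lookahead;

    do {
        Bytef *match = s->window + cur_match;
        int len;

        /* Cheap rejection test: the byte pair at best_len-1 and the first
         * byte pair must match before a full compare is worth doing.
         * This is what the first instructions of LookupLoop implement. */
        if (*(ushf*)(match + best_len - 1) != scan_end ||
            *(ushf*)match != scan_start) continue;

        /* Plain byte-by-byte compare; the assembly's LoopCmps does this
         * a dword at a time instead. */
        len = 0;
        while (len < MAX_MATCH && scan[len] == match[len]) len++;

        if (len > best_len) {
            s->match_start = cur_match;
            best_len = len;
            if ((uInt)len >= nice_match) break;
            scan_end = *(ushf*)(scan + best_len - 1);
        }
    } while ((cur_match = prev[cur_match & wmask]) > limit
             && --chain_length != 0);

    return ((uInt)best_len <= s->lookahead) ? (uInt)best_len : s->lookahead;
}

Note how chain_length and scan_end, which are tested back-to-back on every iteration of this loop, are exactly the two values the assembly packs into the single chainlenscanend dword so that both checks stay in one register.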