summaryrefslogtreecommitdiff
path: root/contrib/asm686
diff options
context:
space:
mode:
authorMark Adler <madler@alumni.caltech.edu>2011-09-09 23:24:43 -0700
committerMark Adler <madler@alumni.caltech.edu>2011-09-09 23:24:43 -0700
commit6b8233bfe00e79134cb1b84fc49d4f750a797f79 (patch)
treeca2b03b0169568681dc3d9c823e9f0bc4417d6b5 /contrib/asm686
parent0484693e1723bbab791c56f95597bd7dbe867d03 (diff)
downloadzlib-1.2.2.3.tar.gz
zlib-1.2.2.3.tar.bz2
zlib-1.2.2.3.zip
zlib 1.2.2.3v1.2.2.3
Diffstat (limited to 'contrib/asm686')
-rw-r--r--contrib/asm686/match.S656
1 files changed, 329 insertions, 327 deletions
diff --git a/contrib/asm686/match.S b/contrib/asm686/match.S
index 8e86c33..c24be4d 100644
--- a/contrib/asm686/match.S
+++ b/contrib/asm686/match.S
@@ -1,327 +1,329 @@
1/* match.s -- Pentium-Pro-optimized version of longest_match() 1/* match.s -- Pentium-Pro-optimized version of longest_match()
2 * Written for zlib 1.1.2 2 * Written for zlib 1.1.2
3 * Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com> 3 * Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>
4 * 4 *
5 * This is free software; you can redistribute it and/or modify it 5 * This is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License. 6 * under the terms of the GNU General Public License.
7 */ 7 */
8 8
9#ifndef NO_UNDERLINE 9#ifndef NO_UNDERLINE
10#define match_init _match_init 10#define match_init _match_init
11#define longest_match _longest_match 11#define longest_match _longest_match
12#endif 12#endif
13 13
14#define MAX_MATCH (258) 14#define MAX_MATCH (258)
15#define MIN_MATCH (3) 15#define MIN_MATCH (3)
16#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1) 16#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1)
17#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7) 17#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7)
18 18
19/* stack frame offsets */ 19/* stack frame offsets */
20 20
21#define chainlenwmask 0 /* high word: current chain len */ 21#define chainlenwmask 0 /* high word: current chain len */
22 /* low word: s->wmask */ 22 /* low word: s->wmask */
23#define window 4 /* local copy of s->window */ 23#define window 4 /* local copy of s->window */
24#define windowbestlen 8 /* s->window + bestlen */ 24#define windowbestlen 8 /* s->window + bestlen */
25#define scanstart 16 /* first two bytes of string */ 25#define scanstart 16 /* first two bytes of string */
26#define scanend 12 /* last two bytes of string */ 26#define scanend 12 /* last two bytes of string */
27#define scanalign 20 /* dword-misalignment of string */ 27#define scanalign 20 /* dword-misalignment of string */
28#define nicematch 24 /* a good enough match size */ 28#define nicematch 24 /* a good enough match size */
29#define bestlen 28 /* size of best match so far */ 29#define bestlen 28 /* size of best match so far */
30#define scan 32 /* ptr to string wanting match */ 30#define scan 32 /* ptr to string wanting match */
31 31
32#define LocalVarsSize (36) 32#define LocalVarsSize (36)
33/* saved ebx 36 */ 33/* saved ebx 36 */
34/* saved edi 40 */ 34/* saved edi 40 */
35/* saved esi 44 */ 35/* saved esi 44 */
36/* saved ebp 48 */ 36/* saved ebp 48 */
37/* return address 52 */ 37/* return address 52 */
38#define deflatestate 56 /* the function arguments */ 38#define deflatestate 56 /* the function arguments */
39#define curmatch 60 39#define curmatch 60
40 40
41/* Offsets for fields in the deflate_state structure. These numbers 41/* All the +zlib1222add offsets are due to the addition of fields
42 * are calculated from the definition of deflate_state, with the 42 * in zlib in the deflate_state structure since the asm code was first written
43 * assumption that the compiler will dword-align the fields. (Thus, 43 * (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
44 * changing the definition of deflate_state could easily cause this 44 * (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
45 * program to crash horribly, without so much as a warning at 45 * if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
46 * compile time. Sigh.) 46 */
47 */ 47
48#define dsWSize 36 48#define zlib1222add (8)
49#define dsWMask 44 49
50#define dsWindow 48 50#define dsWSize (36+zlib1222add)
51#define dsPrev 56 51#define dsWMask (44+zlib1222add)
52#define dsMatchLen 88 52#define dsWindow (48+zlib1222add)
53#define dsPrevMatch 92 53#define dsPrev (56+zlib1222add)
54#define dsStrStart 100 54#define dsMatchLen (88+zlib1222add)
55#define dsMatchStart 104 55#define dsPrevMatch (92+zlib1222add)
56#define dsLookahead 108 56#define dsStrStart (100+zlib1222add)
57#define dsPrevLen 112 57#define dsMatchStart (104+zlib1222add)
58#define dsMaxChainLen 116 58#define dsLookahead (108+zlib1222add)
59#define dsGoodMatch 132 59#define dsPrevLen (112+zlib1222add)
60#define dsNiceMatch 136 60#define dsMaxChainLen (116+zlib1222add)
61 61#define dsGoodMatch (132+zlib1222add)
62 62#define dsNiceMatch (136+zlib1222add)
63.file "match.S" 63
64 64
65.globl match_init, longest_match 65.file "match.S"
66 66
67.text 67.globl match_init, longest_match
68 68
69/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */ 69.text
70 70
71longest_match: 71/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
72 72
73/* Save registers that the compiler may be using, and adjust %esp to */ 73longest_match:
74/* make room for our stack frame. */ 74
75 75/* Save registers that the compiler may be using, and adjust %esp to */
76 pushl %ebp 76/* make room for our stack frame. */
77 pushl %edi 77
78 pushl %esi 78 pushl %ebp
79 pushl %ebx 79 pushl %edi
80 subl $LocalVarsSize, %esp 80 pushl %esi
81 81 pushl %ebx
82/* Retrieve the function arguments. %ecx will hold cur_match */ 82 subl $LocalVarsSize, %esp
83/* throughout the entire function. %edx will hold the pointer to the */ 83
84/* deflate_state structure during the function's setup (before */ 84/* Retrieve the function arguments. %ecx will hold cur_match */
85/* entering the main loop). */ 85/* throughout the entire function. %edx will hold the pointer to the */
86 86/* deflate_state structure during the function's setup (before */
87 movl deflatestate(%esp), %edx 87/* entering the main loop). */
88 movl curmatch(%esp), %ecx 88
89 89 movl deflatestate(%esp), %edx
90/* uInt wmask = s->w_mask; */ 90 movl curmatch(%esp), %ecx
91/* unsigned chain_length = s->max_chain_length; */ 91
92/* if (s->prev_length >= s->good_match) { */ 92/* uInt wmask = s->w_mask; */
93/* chain_length >>= 2; */ 93/* unsigned chain_length = s->max_chain_length; */
94/* } */ 94/* if (s->prev_length >= s->good_match) { */
95 95/* chain_length >>= 2; */
96 movl dsPrevLen(%edx), %eax 96/* } */
97 movl dsGoodMatch(%edx), %ebx 97
98 cmpl %ebx, %eax 98 movl dsPrevLen(%edx), %eax
99 movl dsWMask(%edx), %eax 99 movl dsGoodMatch(%edx), %ebx
100 movl dsMaxChainLen(%edx), %ebx 100 cmpl %ebx, %eax
101 jl LastMatchGood 101 movl dsWMask(%edx), %eax
102 shrl $2, %ebx 102 movl dsMaxChainLen(%edx), %ebx
103LastMatchGood: 103 jl LastMatchGood
104 104 shrl $2, %ebx
105/* chainlen is decremented once beforehand so that the function can */ 105LastMatchGood:
106/* use the sign flag instead of the zero flag for the exit test. */ 106
107/* It is then shifted into the high word, to make room for the wmask */ 107/* chainlen is decremented once beforehand so that the function can */
108/* value, which it will always accompany. */ 108/* use the sign flag instead of the zero flag for the exit test. */
109 109/* It is then shifted into the high word, to make room for the wmask */
110 decl %ebx 110/* value, which it will always accompany. */
111 shll $16, %ebx 111
112 orl %eax, %ebx 112 decl %ebx
113 movl %ebx, chainlenwmask(%esp) 113 shll $16, %ebx
114 114 orl %eax, %ebx
115/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */ 115 movl %ebx, chainlenwmask(%esp)
116 116
117 movl dsNiceMatch(%edx), %eax 117/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */
118 movl dsLookahead(%edx), %ebx 118
119 cmpl %eax, %ebx 119 movl dsNiceMatch(%edx), %eax
120 jl LookaheadLess 120 movl dsLookahead(%edx), %ebx
121 movl %eax, %ebx 121 cmpl %eax, %ebx
122LookaheadLess: movl %ebx, nicematch(%esp) 122 jl LookaheadLess
123 123 movl %eax, %ebx
124/* register Bytef *scan = s->window + s->strstart; */ 124LookaheadLess: movl %ebx, nicematch(%esp)
125 125
126 movl dsWindow(%edx), %esi 126/* register Bytef *scan = s->window + s->strstart; */
127 movl %esi, window(%esp) 127
128 movl dsStrStart(%edx), %ebp 128 movl dsWindow(%edx), %esi
129 lea (%esi,%ebp), %edi 129 movl %esi, window(%esp)
130 movl %edi, scan(%esp) 130 movl dsStrStart(%edx), %ebp
131 131 lea (%esi,%ebp), %edi
132/* Determine how many bytes the scan ptr is off from being */ 132 movl %edi, scan(%esp)
133/* dword-aligned. */ 133
134 134/* Determine how many bytes the scan ptr is off from being */
135 movl %edi, %eax 135/* dword-aligned. */
136 negl %eax 136
137 andl $3, %eax 137 movl %edi, %eax
138 movl %eax, scanalign(%esp) 138 negl %eax
139 139 andl $3, %eax
140/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ 140 movl %eax, scanalign(%esp)
141/* s->strstart - (IPos)MAX_DIST(s) : NIL; */ 141
142 142/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
143 movl dsWSize(%edx), %eax 143/* s->strstart - (IPos)MAX_DIST(s) : NIL; */
144 subl $MIN_LOOKAHEAD, %eax 144
145 subl %eax, %ebp 145 movl dsWSize(%edx), %eax
146 jg LimitPositive 146 subl $MIN_LOOKAHEAD, %eax
147 xorl %ebp, %ebp 147 subl %eax, %ebp
148LimitPositive: 148 jg LimitPositive
149 149 xorl %ebp, %ebp
150/* int best_len = s->prev_length; */ 150LimitPositive:
151 151
152 movl dsPrevLen(%edx), %eax 152/* int best_len = s->prev_length; */
153 movl %eax, bestlen(%esp) 153
154 154 movl dsPrevLen(%edx), %eax
155/* Store the sum of s->window + best_len in %esi locally, and in %esi. */ 155 movl %eax, bestlen(%esp)
156 156
157 addl %eax, %esi 157/* Store the sum of s->window + best_len in %esi locally, and in %esi. */
158 movl %esi, windowbestlen(%esp) 158
159 159 addl %eax, %esi
160/* register ush scan_start = *(ushf*)scan; */ 160 movl %esi, windowbestlen(%esp)
161/* register ush scan_end = *(ushf*)(scan+best_len-1); */ 161
162/* Posf *prev = s->prev; */ 162/* register ush scan_start = *(ushf*)scan; */
163 163/* register ush scan_end = *(ushf*)(scan+best_len-1); */
164 movzwl (%edi), %ebx 164/* Posf *prev = s->prev; */
165 movl %ebx, scanstart(%esp) 165
166 movzwl -1(%edi,%eax), %ebx 166 movzwl (%edi), %ebx
167 movl %ebx, scanend(%esp) 167 movl %ebx, scanstart(%esp)
168 movl dsPrev(%edx), %edi 168 movzwl -1(%edi,%eax), %ebx
169 169 movl %ebx, scanend(%esp)
170/* Jump into the main loop. */ 170 movl dsPrev(%edx), %edi
171 171
172 movl chainlenwmask(%esp), %edx 172/* Jump into the main loop. */
173 jmp LoopEntry 173
174 174 movl chainlenwmask(%esp), %edx
175.balign 16 175 jmp LoopEntry
176 176
177/* do { 177.balign 16
178 * match = s->window + cur_match; 178
179 * if (*(ushf*)(match+best_len-1) != scan_end || 179/* do {
180 * *(ushf*)match != scan_start) continue; 180 * match = s->window + cur_match;
181 * [...] 181 * if (*(ushf*)(match+best_len-1) != scan_end ||
182 * } while ((cur_match = prev[cur_match & wmask]) > limit 182 * *(ushf*)match != scan_start) continue;
183 * && --chain_length != 0); 183 * [...]
184 * 184 * } while ((cur_match = prev[cur_match & wmask]) > limit
185 * Here is the inner loop of the function. The function will spend the 185 * && --chain_length != 0);
186 * majority of its time in this loop, and majority of that time will 186 *
187 * be spent in the first ten instructions. 187 * Here is the inner loop of the function. The function will spend the
188 * 188 * majority of its time in this loop, and majority of that time will
189 * Within this loop: 189 * be spent in the first ten instructions.
190 * %ebx = scanend 190 *
191 * %ecx = curmatch 191 * Within this loop:
192 * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) 192 * %ebx = scanend
193 * %esi = windowbestlen - i.e., (window + bestlen) 193 * %ecx = curmatch
194 * %edi = prev 194 * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
195 * %ebp = limit 195 * %esi = windowbestlen - i.e., (window + bestlen)
196 */ 196 * %edi = prev
197LookupLoop: 197 * %ebp = limit
198 andl %edx, %ecx 198 */
199 movzwl (%edi,%ecx,2), %ecx 199LookupLoop:
200 cmpl %ebp, %ecx 200 andl %edx, %ecx
201 jbe LeaveNow 201 movzwl (%edi,%ecx,2), %ecx
202 subl $0x00010000, %edx 202 cmpl %ebp, %ecx
203 js LeaveNow 203 jbe LeaveNow
204LoopEntry: movzwl -1(%esi,%ecx), %eax 204 subl $0x00010000, %edx
205 cmpl %ebx, %eax 205 js LeaveNow
206 jnz LookupLoop 206LoopEntry: movzwl -1(%esi,%ecx), %eax
207 movl window(%esp), %eax 207 cmpl %ebx, %eax
208 movzwl (%eax,%ecx), %eax 208 jnz LookupLoop
209 cmpl scanstart(%esp), %eax 209 movl window(%esp), %eax
210 jnz LookupLoop 210 movzwl (%eax,%ecx), %eax
211 211 cmpl scanstart(%esp), %eax
212/* Store the current value of chainlen. */ 212 jnz LookupLoop
213 213
214 movl %edx, chainlenwmask(%esp) 214/* Store the current value of chainlen. */
215 215
216/* Point %edi to the string under scrutiny, and %esi to the string we */ 216 movl %edx, chainlenwmask(%esp)
217/* are hoping to match it up with. In actuality, %esi and %edi are */ 217
218/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ 218/* Point %edi to the string under scrutiny, and %esi to the string we */
219/* initialized to -(MAX_MATCH_8 - scanalign). */ 219/* are hoping to match it up with. In actuality, %esi and %edi are */
220 220/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */
221 movl window(%esp), %esi 221/* initialized to -(MAX_MATCH_8 - scanalign). */
222 movl scan(%esp), %edi 222
223 addl %ecx, %esi 223 movl window(%esp), %esi
224 movl scanalign(%esp), %eax 224 movl scan(%esp), %edi
225 movl $(-MAX_MATCH_8), %edx 225 addl %ecx, %esi
226 lea MAX_MATCH_8(%edi,%eax), %edi 226 movl scanalign(%esp), %eax
227 lea MAX_MATCH_8(%esi,%eax), %esi 227 movl $(-MAX_MATCH_8), %edx
228 228 lea MAX_MATCH_8(%edi,%eax), %edi
229/* Test the strings for equality, 8 bytes at a time. At the end, 229 lea MAX_MATCH_8(%esi,%eax), %esi
230 * adjust %edx so that it is offset to the exact byte that mismatched. 230
231 * 231/* Test the strings for equality, 8 bytes at a time. At the end,
232 * We already know at this point that the first three bytes of the 232 * adjust %edx so that it is offset to the exact byte that mismatched.
233 * strings match each other, and they can be safely passed over before 233 *
234 * starting the compare loop. So what this code does is skip over 0-3 234 * We already know at this point that the first three bytes of the
235 * bytes, as much as necessary in order to dword-align the %edi 235 * strings match each other, and they can be safely passed over before
236 * pointer. (%esi will still be misaligned three times out of four.) 236 * starting the compare loop. So what this code does is skip over 0-3
237 * 237 * bytes, as much as necessary in order to dword-align the %edi
238 * It should be confessed that this loop usually does not represent 238 * pointer. (%esi will still be misaligned three times out of four.)
239 * much of the total running time. Replacing it with a more 239 *
240 * straightforward "rep cmpsb" would not drastically degrade 240 * It should be confessed that this loop usually does not represent
241 * performance. 241 * much of the total running time. Replacing it with a more
242 */ 242 * straightforward "rep cmpsb" would not drastically degrade
243LoopCmps: 243 * performance.
244 movl (%esi,%edx), %eax 244 */
245 xorl (%edi,%edx), %eax 245LoopCmps:
246 jnz LeaveLoopCmps 246 movl (%esi,%edx), %eax
247 movl 4(%esi,%edx), %eax 247 xorl (%edi,%edx), %eax
248 xorl 4(%edi,%edx), %eax 248 jnz LeaveLoopCmps
249 jnz LeaveLoopCmps4 249 movl 4(%esi,%edx), %eax
250 addl $8, %edx 250 xorl 4(%edi,%edx), %eax
251 jnz LoopCmps 251 jnz LeaveLoopCmps4
252 jmp LenMaximum 252 addl $8, %edx
253LeaveLoopCmps4: addl $4, %edx 253 jnz LoopCmps
254LeaveLoopCmps: testl $0x0000FFFF, %eax 254 jmp LenMaximum
255 jnz LenLower 255LeaveLoopCmps4: addl $4, %edx
256 addl $2, %edx 256LeaveLoopCmps: testl $0x0000FFFF, %eax
257 shrl $16, %eax 257 jnz LenLower
258LenLower: subb $1, %al 258 addl $2, %edx
259 adcl $0, %edx 259 shrl $16, %eax
260 260LenLower: subb $1, %al
261/* Calculate the length of the match. If it is longer than MAX_MATCH, */ 261 adcl $0, %edx
262/* then automatically accept it as the best possible match and leave. */ 262
263 263/* Calculate the length of the match. If it is longer than MAX_MATCH, */
264 lea (%edi,%edx), %eax 264/* then automatically accept it as the best possible match and leave. */
265 movl scan(%esp), %edi 265
266 subl %edi, %eax 266 lea (%edi,%edx), %eax
267 cmpl $MAX_MATCH, %eax 267 movl scan(%esp), %edi
268 jge LenMaximum 268 subl %edi, %eax
269 269 cmpl $MAX_MATCH, %eax
270/* If the length of the match is not longer than the best match we */ 270 jge LenMaximum
271/* have so far, then forget it and return to the lookup loop. */ 271
272 272/* If the length of the match is not longer than the best match we */
273 movl deflatestate(%esp), %edx 273/* have so far, then forget it and return to the lookup loop. */
274 movl bestlen(%esp), %ebx 274
275 cmpl %ebx, %eax 275 movl deflatestate(%esp), %edx
276 jg LongerMatch 276 movl bestlen(%esp), %ebx
277 movl windowbestlen(%esp), %esi 277 cmpl %ebx, %eax
278 movl dsPrev(%edx), %edi 278 jg LongerMatch
279 movl scanend(%esp), %ebx 279 movl windowbestlen(%esp), %esi
280 movl chainlenwmask(%esp), %edx 280 movl dsPrev(%edx), %edi
281 jmp LookupLoop 281 movl scanend(%esp), %ebx
282 282 movl chainlenwmask(%esp), %edx
283/* s->match_start = cur_match; */ 283 jmp LookupLoop
284/* best_len = len; */ 284
285/* if (len >= nice_match) break; */ 285/* s->match_start = cur_match; */
286/* scan_end = *(ushf*)(scan+best_len-1); */ 286/* best_len = len; */
287 287/* if (len >= nice_match) break; */
288LongerMatch: movl nicematch(%esp), %ebx 288/* scan_end = *(ushf*)(scan+best_len-1); */
289 movl %eax, bestlen(%esp) 289
290 movl %ecx, dsMatchStart(%edx) 290LongerMatch: movl nicematch(%esp), %ebx
291 cmpl %ebx, %eax 291 movl %eax, bestlen(%esp)
292 jge LeaveNow 292 movl %ecx, dsMatchStart(%edx)
293 movl window(%esp), %esi 293 cmpl %ebx, %eax
294 addl %eax, %esi 294 jge LeaveNow
295 movl %esi, windowbestlen(%esp) 295 movl window(%esp), %esi
296 movzwl -1(%edi,%eax), %ebx 296 addl %eax, %esi
297 movl dsPrev(%edx), %edi 297 movl %esi, windowbestlen(%esp)
298 movl %ebx, scanend(%esp) 298 movzwl -1(%edi,%eax), %ebx
299 movl chainlenwmask(%esp), %edx 299 movl dsPrev(%edx), %edi
300 jmp LookupLoop 300 movl %ebx, scanend(%esp)
301 301 movl chainlenwmask(%esp), %edx
302/* Accept the current string, with the maximum possible length. */ 302 jmp LookupLoop
303 303
304LenMaximum: movl deflatestate(%esp), %edx 304/* Accept the current string, with the maximum possible length. */
305 movl $MAX_MATCH, bestlen(%esp) 305
306 movl %ecx, dsMatchStart(%edx) 306LenMaximum: movl deflatestate(%esp), %edx
307 307 movl $MAX_MATCH, bestlen(%esp)
308/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ 308 movl %ecx, dsMatchStart(%edx)
309/* return s->lookahead; */ 309
310 310/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
311LeaveNow: 311/* return s->lookahead; */
312 movl deflatestate(%esp), %edx 312
313 movl bestlen(%esp), %ebx 313LeaveNow:
314 movl dsLookahead(%edx), %eax 314 movl deflatestate(%esp), %edx
315 cmpl %eax, %ebx 315 movl bestlen(%esp), %ebx
316 jg LookaheadRet 316 movl dsLookahead(%edx), %eax
317 movl %ebx, %eax 317 cmpl %eax, %ebx
318LookaheadRet: 318 jg LookaheadRet
319 319 movl %ebx, %eax
320/* Restore the stack and return from whence we came. */ 320LookaheadRet:
321 321
322 addl $LocalVarsSize, %esp 322/* Restore the stack and return from whence we came. */
323 popl %ebx 323
324 popl %esi 324 addl $LocalVarsSize, %esp
325 popl %edi 325 popl %ebx
326 popl %ebp 326 popl %esi
327match_init: ret 327 popl %edi
328 popl %ebp
329match_init: ret