summaryrefslogtreecommitdiff
path: root/contrib/asm686
diff options
context:
space:
mode:
authorMark Adler <madler@alumni.caltech.edu>2011-09-09 23:20:29 -0700
committerMark Adler <madler@alumni.caltech.edu>2011-09-09 23:20:29 -0700
commit14763ac7c6c03bca62c39e35c03cf5bfc7728802 (patch)
treef1055d11ef7b282b698ce7c40e1a9c061413cbdf /contrib/asm686
parentc34c1fcbb19852ca35216ad66276f4f86af3fc22 (diff)
downloadzlib-1.1.3.tar.gz
zlib-1.1.3.tar.bz2
zlib-1.1.3.zip
zlib 1.1.3v1.1.3
Diffstat (limited to 'contrib/asm686')
-rw-r--r--contrib/asm686/README.68634
-rw-r--r--contrib/asm686/match.S327
2 files changed, 361 insertions, 0 deletions
diff --git a/contrib/asm686/README.686 b/contrib/asm686/README.686
new file mode 100644
index 0000000..a593f23
--- /dev/null
+++ b/contrib/asm686/README.686
@@ -0,0 +1,34 @@
1This is a patched version of zlib, modified to use
2Pentium-Pro-optimized assembly code in the deflation algorithm. The
3files changed/added by this patch are:
4
5README.686
6match.S
7
8The speedup that this patch provides varies, depending on whether the
9compiler used to build the original version of zlib falls afoul of the
10PPro's speed traps. My own tests show a speedup of around 10-20% at
11the default compression level, and 20-30% using -9, against a version
12compiled using gcc 2.7.2.3. Your mileage may vary.
13
14Note that this code has been tailored for the PPro/PII in particular,
15and will not perform particuarly well on a Pentium.
16
17If you are using an assembler other than GNU as, you will have to
18translate match.S to use your assembler's syntax. (Have fun.)
19
20Brian Raiter
21breadbox@muppetlabs.com
22April, 1998
23
24
25Added for zlib 1.1.3:
26
27The patches come from
28http://www.muppetlabs.com/~breadbox/software/assembly.html
29
30To compile zlib with this asm file, copy match.S to the zlib directory
31then do:
32
33CFLAGS="-O3 -DASMV" ./configure
34make OBJA=match.o
diff --git a/contrib/asm686/match.S b/contrib/asm686/match.S
new file mode 100644
index 0000000..8e86c33
--- /dev/null
+++ b/contrib/asm686/match.S
@@ -0,0 +1,327 @@
1/* match.s -- Pentium-Pro-optimized version of longest_match()
2 * Written for zlib 1.1.2
3 * Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>
4 *
5 * This is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License.
7 */
8
9#ifndef NO_UNDERLINE
10#define match_init _match_init
11#define longest_match _longest_match
12#endif
13
14#define MAX_MATCH (258)
15#define MIN_MATCH (3)
16#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1)
17#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7)
18
19/* stack frame offsets */
20
21#define chainlenwmask 0 /* high word: current chain len */
22 /* low word: s->wmask */
23#define window 4 /* local copy of s->window */
24#define windowbestlen 8 /* s->window + bestlen */
25#define scanstart 16 /* first two bytes of string */
26#define scanend 12 /* last two bytes of string */
27#define scanalign 20 /* dword-misalignment of string */
28#define nicematch 24 /* a good enough match size */
29#define bestlen 28 /* size of best match so far */
30#define scan 32 /* ptr to string wanting match */
31
32#define LocalVarsSize (36)
33/* saved ebx 36 */
34/* saved edi 40 */
35/* saved esi 44 */
36/* saved ebp 48 */
37/* return address 52 */
38#define deflatestate 56 /* the function arguments */
39#define curmatch 60
40
41/* Offsets for fields in the deflate_state structure. These numbers
42 * are calculated from the definition of deflate_state, with the
43 * assumption that the compiler will dword-align the fields. (Thus,
44 * changing the definition of deflate_state could easily cause this
45 * program to crash horribly, without so much as a warning at
46 * compile time. Sigh.)
47 */
48#define dsWSize 36
49#define dsWMask 44
50#define dsWindow 48
51#define dsPrev 56
52#define dsMatchLen 88
53#define dsPrevMatch 92
54#define dsStrStart 100
55#define dsMatchStart 104
56#define dsLookahead 108
57#define dsPrevLen 112
58#define dsMaxChainLen 116
59#define dsGoodMatch 132
60#define dsNiceMatch 136
61
62
63.file "match.S"
64
65.globl match_init, longest_match
66
67.text
68
69/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
70
71longest_match:
72
73/* Save registers that the compiler may be using, and adjust %esp to */
74/* make room for our stack frame. */
75
76 pushl %ebp
77 pushl %edi
78 pushl %esi
79 pushl %ebx
80 subl $LocalVarsSize, %esp
81
82/* Retrieve the function arguments. %ecx will hold cur_match */
83/* throughout the entire function. %edx will hold the pointer to the */
84/* deflate_state structure during the function's setup (before */
85/* entering the main loop). */
86
87 movl deflatestate(%esp), %edx
88 movl curmatch(%esp), %ecx
89
90/* uInt wmask = s->w_mask; */
91/* unsigned chain_length = s->max_chain_length; */
92/* if (s->prev_length >= s->good_match) { */
93/* chain_length >>= 2; */
94/* } */
95
96 movl dsPrevLen(%edx), %eax
97 movl dsGoodMatch(%edx), %ebx
98 cmpl %ebx, %eax
99 movl dsWMask(%edx), %eax
100 movl dsMaxChainLen(%edx), %ebx
101 jl LastMatchGood
102 shrl $2, %ebx
103LastMatchGood:
104
105/* chainlen is decremented once beforehand so that the function can */
106/* use the sign flag instead of the zero flag for the exit test. */
107/* It is then shifted into the high word, to make room for the wmask */
108/* value, which it will always accompany. */
109
110 decl %ebx
111 shll $16, %ebx
112 orl %eax, %ebx
113 movl %ebx, chainlenwmask(%esp)
114
115/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */
116
117 movl dsNiceMatch(%edx), %eax
118 movl dsLookahead(%edx), %ebx
119 cmpl %eax, %ebx
120 jl LookaheadLess
121 movl %eax, %ebx
122LookaheadLess: movl %ebx, nicematch(%esp)
123
124/* register Bytef *scan = s->window + s->strstart; */
125
126 movl dsWindow(%edx), %esi
127 movl %esi, window(%esp)
128 movl dsStrStart(%edx), %ebp
129 lea (%esi,%ebp), %edi
130 movl %edi, scan(%esp)
131
132/* Determine how many bytes the scan ptr is off from being */
133/* dword-aligned. */
134
135 movl %edi, %eax
136 negl %eax
137 andl $3, %eax
138 movl %eax, scanalign(%esp)
139
140/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
141/* s->strstart - (IPos)MAX_DIST(s) : NIL; */
142
143 movl dsWSize(%edx), %eax
144 subl $MIN_LOOKAHEAD, %eax
145 subl %eax, %ebp
146 jg LimitPositive
147 xorl %ebp, %ebp
148LimitPositive:
149
150/* int best_len = s->prev_length; */
151
152 movl dsPrevLen(%edx), %eax
153 movl %eax, bestlen(%esp)
154
155/* Store the sum of s->window + best_len in %esi locally, and in %esi. */
156
157 addl %eax, %esi
158 movl %esi, windowbestlen(%esp)
159
160/* register ush scan_start = *(ushf*)scan; */
161/* register ush scan_end = *(ushf*)(scan+best_len-1); */
162/* Posf *prev = s->prev; */
163
164 movzwl (%edi), %ebx
165 movl %ebx, scanstart(%esp)
166 movzwl -1(%edi,%eax), %ebx
167 movl %ebx, scanend(%esp)
168 movl dsPrev(%edx), %edi
169
170/* Jump into the main loop. */
171
172 movl chainlenwmask(%esp), %edx
173 jmp LoopEntry
174
175.balign 16
176
177/* do {
178 * match = s->window + cur_match;
179 * if (*(ushf*)(match+best_len-1) != scan_end ||
180 * *(ushf*)match != scan_start) continue;
181 * [...]
182 * } while ((cur_match = prev[cur_match & wmask]) > limit
183 * && --chain_length != 0);
184 *
185 * Here is the inner loop of the function. The function will spend the
186 * majority of its time in this loop, and majority of that time will
187 * be spent in the first ten instructions.
188 *
189 * Within this loop:
190 * %ebx = scanend
191 * %ecx = curmatch
192 * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
193 * %esi = windowbestlen - i.e., (window + bestlen)
194 * %edi = prev
195 * %ebp = limit
196 */
197LookupLoop:
198 andl %edx, %ecx
199 movzwl (%edi,%ecx,2), %ecx
200 cmpl %ebp, %ecx
201 jbe LeaveNow
202 subl $0x00010000, %edx
203 js LeaveNow
204LoopEntry: movzwl -1(%esi,%ecx), %eax
205 cmpl %ebx, %eax
206 jnz LookupLoop
207 movl window(%esp), %eax
208 movzwl (%eax,%ecx), %eax
209 cmpl scanstart(%esp), %eax
210 jnz LookupLoop
211
212/* Store the current value of chainlen. */
213
214 movl %edx, chainlenwmask(%esp)
215
216/* Point %edi to the string under scrutiny, and %esi to the string we */
217/* are hoping to match it up with. In actuality, %esi and %edi are */
218/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */
219/* initialized to -(MAX_MATCH_8 - scanalign). */
220
221 movl window(%esp), %esi
222 movl scan(%esp), %edi
223 addl %ecx, %esi
224 movl scanalign(%esp), %eax
225 movl $(-MAX_MATCH_8), %edx
226 lea MAX_MATCH_8(%edi,%eax), %edi
227 lea MAX_MATCH_8(%esi,%eax), %esi
228
229/* Test the strings for equality, 8 bytes at a time. At the end,
230 * adjust %edx so that it is offset to the exact byte that mismatched.
231 *
232 * We already know at this point that the first three bytes of the
233 * strings match each other, and they can be safely passed over before
234 * starting the compare loop. So what this code does is skip over 0-3
235 * bytes, as much as necessary in order to dword-align the %edi
236 * pointer. (%esi will still be misaligned three times out of four.)
237 *
238 * It should be confessed that this loop usually does not represent
239 * much of the total running time. Replacing it with a more
240 * straightforward "rep cmpsb" would not drastically degrade
241 * performance.
242 */
243LoopCmps:
244 movl (%esi,%edx), %eax
245 xorl (%edi,%edx), %eax
246 jnz LeaveLoopCmps
247 movl 4(%esi,%edx), %eax
248 xorl 4(%edi,%edx), %eax
249 jnz LeaveLoopCmps4
250 addl $8, %edx
251 jnz LoopCmps
252 jmp LenMaximum
253LeaveLoopCmps4: addl $4, %edx
254LeaveLoopCmps: testl $0x0000FFFF, %eax
255 jnz LenLower
256 addl $2, %edx
257 shrl $16, %eax
258LenLower: subb $1, %al
259 adcl $0, %edx
260
261/* Calculate the length of the match. If it is longer than MAX_MATCH, */
262/* then automatically accept it as the best possible match and leave. */
263
264 lea (%edi,%edx), %eax
265 movl scan(%esp), %edi
266 subl %edi, %eax
267 cmpl $MAX_MATCH, %eax
268 jge LenMaximum
269
270/* If the length of the match is not longer than the best match we */
271/* have so far, then forget it and return to the lookup loop. */
272
273 movl deflatestate(%esp), %edx
274 movl bestlen(%esp), %ebx
275 cmpl %ebx, %eax
276 jg LongerMatch
277 movl windowbestlen(%esp), %esi
278 movl dsPrev(%edx), %edi
279 movl scanend(%esp), %ebx
280 movl chainlenwmask(%esp), %edx
281 jmp LookupLoop
282
283/* s->match_start = cur_match; */
284/* best_len = len; */
285/* if (len >= nice_match) break; */
286/* scan_end = *(ushf*)(scan+best_len-1); */
287
288LongerMatch: movl nicematch(%esp), %ebx
289 movl %eax, bestlen(%esp)
290 movl %ecx, dsMatchStart(%edx)
291 cmpl %ebx, %eax
292 jge LeaveNow
293 movl window(%esp), %esi
294 addl %eax, %esi
295 movl %esi, windowbestlen(%esp)
296 movzwl -1(%edi,%eax), %ebx
297 movl dsPrev(%edx), %edi
298 movl %ebx, scanend(%esp)
299 movl chainlenwmask(%esp), %edx
300 jmp LookupLoop
301
302/* Accept the current string, with the maximum possible length. */
303
304LenMaximum: movl deflatestate(%esp), %edx
305 movl $MAX_MATCH, bestlen(%esp)
306 movl %ecx, dsMatchStart(%edx)
307
308/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
309/* return s->lookahead; */
310
311LeaveNow:
312 movl deflatestate(%esp), %edx
313 movl bestlen(%esp), %ebx
314 movl dsLookahead(%edx), %eax
315 cmpl %eax, %ebx
316 jg LookaheadRet
317 movl %ebx, %eax
318LookaheadRet:
319
320/* Restore the stack and return from whence we came. */
321
322 addl $LocalVarsSize, %esp
323 popl %ebx
324 popl %esi
325 popl %edi
326 popl %ebp
327match_init: ret