summaryrefslogtreecommitdiff
path: root/contrib/asm586
diff options
context:
space:
mode:
authorMark Adler <madler@alumni.caltech.edu>2011-09-09 23:32:36 -0700
committerMark Adler <madler@alumni.caltech.edu>2011-09-09 23:32:36 -0700
commit67cc20d0041a32bee12bd9eb20ae218f91b73f77 (patch)
treed7e1b94bd15c30efd57cf9036f5fe89306b6bba0 /contrib/asm586
parent7751bd4c715ea8478113e34b49b5a794a4642e8e (diff)
downloadzlib-1.2.4-pre1.tar.gz
zlib-1.2.4-pre1.tar.bz2
zlib-1.2.4-pre1.zip
zlib 1.2.4-pre1v1.2.4-pre1
Diffstat (limited to 'contrib/asm586')
-rw-r--r--contrib/asm586/README.58643
-rw-r--r--contrib/asm586/match.S364
2 files changed, 0 insertions, 407 deletions
diff --git a/contrib/asm586/README.586 b/contrib/asm586/README.586
deleted file mode 100644
index 6bb78f3..0000000
--- a/contrib/asm586/README.586
+++ /dev/null
@@ -1,43 +0,0 @@
1This is a patched version of zlib modified to use
2Pentium-optimized assembly code in the deflation algorithm. The files
3changed/added by this patch are:
4
5README.586
6match.S
7
8The effectiveness of these modifications is a bit marginal, as the the
9program's bottleneck seems to be mostly L1-cache contention, for which
10there is no real way to work around without rewriting the basic
11algorithm. The speedup on average is around 5-10% (which is generally
12less than the amount of variance between subsequent executions).
13However, when used at level 9 compression, the cache contention can
14drop enough for the assembly version to achieve 10-20% speedup (and
15sometimes more, depending on the amount of overall redundancy in the
16files). Even here, though, cache contention can still be the limiting
17factor, depending on the nature of the program using the zlib library.
18This may also mean that better improvements will be seen on a Pentium
19with MMX, which suffers much less from L1-cache contention, but I have
20not yet verified this.
21
22Note that this code has been tailored for the Pentium in particular,
23and will not perform well on the Pentium Pro (due to the use of a
24partial register in the inner loop).
25
26If you are using an assembler other than GNU as, you will have to
27translate match.S to use your assembler's syntax. (Have fun.)
28
29Brian Raiter
30breadbox@muppetlabs.com
31April, 1998
32
33
34Added for zlib 1.1.3:
35
36The patches come from
37http://www.muppetlabs.com/~breadbox/software/assembly.html
38
39To compile zlib with this asm file, copy match.S to the zlib directory
40then do:
41
42CFLAGS="-O3 -DASMV" ./configure
43make OBJA=match.o
diff --git a/contrib/asm586/match.S b/contrib/asm586/match.S
deleted file mode 100644
index 0368b35..0000000
--- a/contrib/asm586/match.S
+++ /dev/null
@@ -1,364 +0,0 @@
1/* match.s -- Pentium-optimized version of longest_match()
2 * Written for zlib 1.1.2
3 * Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>
4 *
5 * This is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License.
7 */
8
9#ifndef NO_UNDERLINE
10#define match_init _match_init
11#define longest_match _longest_match
12#endif
13
14#define MAX_MATCH (258)
15#define MIN_MATCH (3)
16#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1)
17#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7)
18
19/* stack frame offsets */
20
21#define wmask 0 /* local copy of s->wmask */
22#define window 4 /* local copy of s->window */
23#define windowbestlen 8 /* s->window + bestlen */
24#define chainlenscanend 12 /* high word: current chain len */
25 /* low word: last bytes sought */
26#define scanstart 16 /* first two bytes of string */
27#define scanalign 20 /* dword-misalignment of string */
28#define nicematch 24 /* a good enough match size */
29#define bestlen 28 /* size of best match so far */
30#define scan 32 /* ptr to string wanting match */
31
32#define LocalVarsSize (36)
33/* saved ebx 36 */
34/* saved edi 40 */
35/* saved esi 44 */
36/* saved ebp 48 */
37/* return address 52 */
38#define deflatestate 56 /* the function arguments */
39#define curmatch 60
40
41/* Offsets for fields in the deflate_state structure. These numbers
42 * are calculated from the definition of deflate_state, with the
43 * assumption that the compiler will dword-align the fields. (Thus,
44 * changing the definition of deflate_state could easily cause this
45 * program to crash horribly, without so much as a warning at
46 * compile time. Sigh.)
47 */
48
49/* All the +zlib1222add offsets are due to the addition of fields
50 * in zlib in the deflate_state structure since the asm code was first written
51 * (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
52 * (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
53 * if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
54 */
55
56#define zlib1222add (8)
57
58#define dsWSize (36+zlib1222add)
59#define dsWMask (44+zlib1222add)
60#define dsWindow (48+zlib1222add)
61#define dsPrev (56+zlib1222add)
62#define dsMatchLen (88+zlib1222add)
63#define dsPrevMatch (92+zlib1222add)
64#define dsStrStart (100+zlib1222add)
65#define dsMatchStart (104+zlib1222add)
66#define dsLookahead (108+zlib1222add)
67#define dsPrevLen (112+zlib1222add)
68#define dsMaxChainLen (116+zlib1222add)
69#define dsGoodMatch (132+zlib1222add)
70#define dsNiceMatch (136+zlib1222add)
71
72
73.file "match.S"
74
75.globl match_init, longest_match
76
77.text
78
79/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
80
81longest_match:
82
83/* Save registers that the compiler may be using, and adjust %esp to */
84/* make room for our stack frame. */
85
86 pushl %ebp
87 pushl %edi
88 pushl %esi
89 pushl %ebx
90 subl $LocalVarsSize, %esp
91
92/* Retrieve the function arguments. %ecx will hold cur_match */
93/* throughout the entire function. %edx will hold the pointer to the */
94/* deflate_state structure during the function's setup (before */
95/* entering the main loop). */
96
97 movl deflatestate(%esp), %edx
98 movl curmatch(%esp), %ecx
99
100/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */
101
102 movl dsNiceMatch(%edx), %eax
103 movl dsLookahead(%edx), %ebx
104 cmpl %eax, %ebx
105 jl LookaheadLess
106 movl %eax, %ebx
107LookaheadLess: movl %ebx, nicematch(%esp)
108
109/* register Bytef *scan = s->window + s->strstart; */
110
111 movl dsWindow(%edx), %esi
112 movl %esi, window(%esp)
113 movl dsStrStart(%edx), %ebp
114 lea (%esi,%ebp), %edi
115 movl %edi, scan(%esp)
116
117/* Determine how many bytes the scan ptr is off from being */
118/* dword-aligned. */
119
120 movl %edi, %eax
121 negl %eax
122 andl $3, %eax
123 movl %eax, scanalign(%esp)
124
125/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
126/* s->strstart - (IPos)MAX_DIST(s) : NIL; */
127
128 movl dsWSize(%edx), %eax
129 subl $MIN_LOOKAHEAD, %eax
130 subl %eax, %ebp
131 jg LimitPositive
132 xorl %ebp, %ebp
133LimitPositive:
134
135/* unsigned chain_length = s->max_chain_length; */
136/* if (s->prev_length >= s->good_match) { */
137/* chain_length >>= 2; */
138/* } */
139
140 movl dsPrevLen(%edx), %eax
141 movl dsGoodMatch(%edx), %ebx
142 cmpl %ebx, %eax
143 movl dsMaxChainLen(%edx), %ebx
144 jl LastMatchGood
145 shrl $2, %ebx
146LastMatchGood:
147
148/* chainlen is decremented once beforehand so that the function can */
149/* use the sign flag instead of the zero flag for the exit test. */
150/* It is then shifted into the high word, to make room for the scanend */
151/* scanend value, which it will always accompany. */
152
153 decl %ebx
154 shll $16, %ebx
155
156/* int best_len = s->prev_length; */
157
158 movl dsPrevLen(%edx), %eax
159 movl %eax, bestlen(%esp)
160
161/* Store the sum of s->window + best_len in %esi locally, and in %esi. */
162
163 addl %eax, %esi
164 movl %esi, windowbestlen(%esp)
165
166/* register ush scan_start = *(ushf*)scan; */
167/* register ush scan_end = *(ushf*)(scan+best_len-1); */
168
169 movw (%edi), %bx
170 movw %bx, scanstart(%esp)
171 movw -1(%edi,%eax), %bx
172 movl %ebx, chainlenscanend(%esp)
173
174/* Posf *prev = s->prev; */
175/* uInt wmask = s->w_mask; */
176
177 movl dsPrev(%edx), %edi
178 movl dsWMask(%edx), %edx
179 mov %edx, wmask(%esp)
180
181/* Jump into the main loop. */
182
183 jmp LoopEntry
184
185.balign 16
186
187/* do {
188 * match = s->window + cur_match;
189 * if (*(ushf*)(match+best_len-1) != scan_end ||
190 * *(ushf*)match != scan_start) continue;
191 * [...]
192 * } while ((cur_match = prev[cur_match & wmask]) > limit
193 * && --chain_length != 0);
194 *
195 * Here is the inner loop of the function. The function will spend the
196 * majority of its time in this loop, and majority of that time will
197 * be spent in the first ten instructions.
198 *
199 * Within this loop:
200 * %ebx = chainlenscanend - i.e., ((chainlen << 16) | scanend)
201 * %ecx = curmatch
202 * %edx = curmatch & wmask
203 * %esi = windowbestlen - i.e., (window + bestlen)
204 * %edi = prev
205 * %ebp = limit
206 *
207 * Two optimization notes on the choice of instructions:
208 *
209 * The first instruction uses a 16-bit address, which costs an extra,
210 * unpairable cycle. This is cheaper than doing a 32-bit access and
211 * zeroing the high word, due to the 3-cycle misalignment penalty which
212 * would occur half the time. This also turns out to be cheaper than
213 * doing two separate 8-bit accesses, as the memory is so rarely in the
214 * L1 cache.
215 *
216 * The window buffer, however, apparently spends a lot of time in the
217 * cache, and so it is faster to retrieve the word at the end of the
218 * match string with two 8-bit loads. The instructions that test the
219 * word at the beginning of the match string, however, are executed
220 * much less frequently, and there it was cheaper to use 16-bit
221 * instructions, which avoided the necessity of saving off and
222 * subsequently reloading one of the other registers.
223 */
224LookupLoop:
225 /* 1 U & V */
226 movw (%edi,%edx,2), %cx /* 2 U pipe */
227 movl wmask(%esp), %edx /* 2 V pipe */
228 cmpl %ebp, %ecx /* 3 U pipe */
229 jbe LeaveNow /* 3 V pipe */
230 subl $0x00010000, %ebx /* 4 U pipe */
231 js LeaveNow /* 4 V pipe */
232LoopEntry: movb -1(%esi,%ecx), %al /* 5 U pipe */
233 andl %ecx, %edx /* 5 V pipe */
234 cmpb %bl, %al /* 6 U pipe */
235 jnz LookupLoop /* 6 V pipe */
236 movb (%esi,%ecx), %ah
237 cmpb %bh, %ah
238 jnz LookupLoop
239 movl window(%esp), %eax
240 movw (%eax,%ecx), %ax
241 cmpw scanstart(%esp), %ax
242 jnz LookupLoop
243
244/* Store the current value of chainlen. */
245
246 movl %ebx, chainlenscanend(%esp)
247
248/* Point %edi to the string under scrutiny, and %esi to the string we */
249/* are hoping to match it up with. In actuality, %esi and %edi are */
250/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */
251/* initialized to -(MAX_MATCH_8 - scanalign). */
252
253 movl window(%esp), %esi
254 movl scan(%esp), %edi
255 addl %ecx, %esi
256 movl scanalign(%esp), %eax
257 movl $(-MAX_MATCH_8), %edx
258 lea MAX_MATCH_8(%edi,%eax), %edi
259 lea MAX_MATCH_8(%esi,%eax), %esi
260
261/* Test the strings for equality, 8 bytes at a time. At the end,
262 * adjust %edx so that it is offset to the exact byte that mismatched.
263 *
264 * We already know at this point that the first three bytes of the
265 * strings match each other, and they can be safely passed over before
266 * starting the compare loop. So what this code does is skip over 0-3
267 * bytes, as much as necessary in order to dword-align the %edi
268 * pointer. (%esi will still be misaligned three times out of four.)
269 *
270 * It should be confessed that this loop usually does not represent
271 * much of the total running time. Replacing it with a more
272 * straightforward "rep cmpsb" would not drastically degrade
273 * performance.
274 */
275LoopCmps:
276 movl (%esi,%edx), %eax
277 movl (%edi,%edx), %ebx
278 xorl %ebx, %eax
279 jnz LeaveLoopCmps
280 movl 4(%esi,%edx), %eax
281 movl 4(%edi,%edx), %ebx
282 xorl %ebx, %eax
283 jnz LeaveLoopCmps4
284 addl $8, %edx
285 jnz LoopCmps
286 jmp LenMaximum
287LeaveLoopCmps4: addl $4, %edx
288LeaveLoopCmps: testl $0x0000FFFF, %eax
289 jnz LenLower
290 addl $2, %edx
291 shrl $16, %eax
292LenLower: subb $1, %al
293 adcl $0, %edx
294
295/* Calculate the length of the match. If it is longer than MAX_MATCH, */
296/* then automatically accept it as the best possible match and leave. */
297
298 lea (%edi,%edx), %eax
299 movl scan(%esp), %edi
300 subl %edi, %eax
301 cmpl $MAX_MATCH, %eax
302 jge LenMaximum
303
304/* If the length of the match is not longer than the best match we */
305/* have so far, then forget it and return to the lookup loop. */
306
307 movl deflatestate(%esp), %edx
308 movl bestlen(%esp), %ebx
309 cmpl %ebx, %eax
310 jg LongerMatch
311 movl chainlenscanend(%esp), %ebx
312 movl windowbestlen(%esp), %esi
313 movl dsPrev(%edx), %edi
314 movl wmask(%esp), %edx
315 andl %ecx, %edx
316 jmp LookupLoop
317
318/* s->match_start = cur_match; */
319/* best_len = len; */
320/* if (len >= nice_match) break; */
321/* scan_end = *(ushf*)(scan+best_len-1); */
322
323LongerMatch: movl nicematch(%esp), %ebx
324 movl %eax, bestlen(%esp)
325 movl %ecx, dsMatchStart(%edx)
326 cmpl %ebx, %eax
327 jge LeaveNow
328 movl window(%esp), %esi
329 addl %eax, %esi
330 movl %esi, windowbestlen(%esp)
331 movl chainlenscanend(%esp), %ebx
332 movw -1(%edi,%eax), %bx
333 movl dsPrev(%edx), %edi
334 movl %ebx, chainlenscanend(%esp)
335 movl wmask(%esp), %edx
336 andl %ecx, %edx
337 jmp LookupLoop
338
339/* Accept the current string, with the maximum possible length. */
340
341LenMaximum: movl deflatestate(%esp), %edx
342 movl $MAX_MATCH, bestlen(%esp)
343 movl %ecx, dsMatchStart(%edx)
344
345/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
346/* return s->lookahead; */
347
348LeaveNow:
349 movl deflatestate(%esp), %edx
350 movl bestlen(%esp), %ebx
351 movl dsLookahead(%edx), %eax
352 cmpl %eax, %ebx
353 jg LookaheadRet
354 movl %ebx, %eax
355LookaheadRet:
356
357/* Restore the stack and return from whence we came. */
358
359 addl $LocalVarsSize, %esp
360 popl %ebx
361 popl %esi
362 popl %edi
363 popl %ebp
364match_init: ret