summaryrefslogtreecommitdiff
path: root/contrib/asm586
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--contrib/asm586/README.58643
-rw-r--r--contrib/asm586/match.S354
2 files changed, 397 insertions, 0 deletions
diff --git a/contrib/asm586/README.586 b/contrib/asm586/README.586
new file mode 100644
index 0000000..6bb78f3
--- /dev/null
+++ b/contrib/asm586/README.586
@@ -0,0 +1,43 @@
1This is a patched version of zlib modified to use
2Pentium-optimized assembly code in the deflation algorithm. The files
3changed/added by this patch are:
4
5README.586
6match.S
7
8The effectiveness of these modifications is a bit marginal, as the the
9program's bottleneck seems to be mostly L1-cache contention, for which
10there is no real way to work around without rewriting the basic
11algorithm. The speedup on average is around 5-10% (which is generally
12less than the amount of variance between subsequent executions).
13However, when used at level 9 compression, the cache contention can
14drop enough for the assembly version to achieve 10-20% speedup (and
15sometimes more, depending on the amount of overall redundancy in the
16files). Even here, though, cache contention can still be the limiting
17factor, depending on the nature of the program using the zlib library.
18This may also mean that better improvements will be seen on a Pentium
19with MMX, which suffers much less from L1-cache contention, but I have
20not yet verified this.
21
22Note that this code has been tailored for the Pentium in particular,
23and will not perform well on the Pentium Pro (due to the use of a
24partial register in the inner loop).
25
26If you are using an assembler other than GNU as, you will have to
27translate match.S to use your assembler's syntax. (Have fun.)
28
29Brian Raiter
30breadbox@muppetlabs.com
31April, 1998
32
33
34Added for zlib 1.1.3:
35
36The patches come from
37http://www.muppetlabs.com/~breadbox/software/assembly.html
38
39To compile zlib with this asm file, copy match.S to the zlib directory
40then do:
41
42CFLAGS="-O3 -DASMV" ./configure
43make OBJA=match.o
diff --git a/contrib/asm586/match.S b/contrib/asm586/match.S
new file mode 100644
index 0000000..8f16140
--- /dev/null
+++ b/contrib/asm586/match.S
@@ -0,0 +1,354 @@
1/* match.s -- Pentium-optimized version of longest_match()
2 * Written for zlib 1.1.2
3 * Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>
4 *
5 * This is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License.
7 */
8
9#ifndef NO_UNDERLINE
10#define match_init _match_init
11#define longest_match _longest_match
12#endif
13
14#define MAX_MATCH (258)
15#define MIN_MATCH (3)
16#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1)
17#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7)
18
19/* stack frame offsets */
20
21#define wmask 0 /* local copy of s->wmask */
22#define window 4 /* local copy of s->window */
23#define windowbestlen 8 /* s->window + bestlen */
24#define chainlenscanend 12 /* high word: current chain len */
25 /* low word: last bytes sought */
26#define scanstart 16 /* first two bytes of string */
27#define scanalign 20 /* dword-misalignment of string */
28#define nicematch 24 /* a good enough match size */
29#define bestlen 28 /* size of best match so far */
30#define scan 32 /* ptr to string wanting match */
31
32#define LocalVarsSize (36)
33/* saved ebx 36 */
34/* saved edi 40 */
35/* saved esi 44 */
36/* saved ebp 48 */
37/* return address 52 */
38#define deflatestate 56 /* the function arguments */
39#define curmatch 60
40
41/* Offsets for fields in the deflate_state structure. These numbers
42 * are calculated from the definition of deflate_state, with the
43 * assumption that the compiler will dword-align the fields. (Thus,
44 * changing the definition of deflate_state could easily cause this
45 * program to crash horribly, without so much as a warning at
46 * compile time. Sigh.)
47 */
48#define dsWSize 36
49#define dsWMask 44
50#define dsWindow 48
51#define dsPrev 56
52#define dsMatchLen 88
53#define dsPrevMatch 92
54#define dsStrStart 100
55#define dsMatchStart 104
56#define dsLookahead 108
57#define dsPrevLen 112
58#define dsMaxChainLen 116
59#define dsGoodMatch 132
60#define dsNiceMatch 136
61
62
63.file "match.S"
64
65.globl match_init, longest_match
66
67.text
68
69/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
70
71longest_match:
72
73/* Save registers that the compiler may be using, and adjust %esp to */
74/* make room for our stack frame. */
75
76 pushl %ebp
77 pushl %edi
78 pushl %esi
79 pushl %ebx
80 subl $LocalVarsSize, %esp
81
82/* Retrieve the function arguments. %ecx will hold cur_match */
83/* throughout the entire function. %edx will hold the pointer to the */
84/* deflate_state structure during the function's setup (before */
85/* entering the main loop). */
86
87 movl deflatestate(%esp), %edx
88 movl curmatch(%esp), %ecx
89
90/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */
91
92 movl dsNiceMatch(%edx), %eax
93 movl dsLookahead(%edx), %ebx
94 cmpl %eax, %ebx
95 jl LookaheadLess
96 movl %eax, %ebx
97LookaheadLess: movl %ebx, nicematch(%esp)
98
99/* register Bytef *scan = s->window + s->strstart; */
100
101 movl dsWindow(%edx), %esi
102 movl %esi, window(%esp)
103 movl dsStrStart(%edx), %ebp
104 lea (%esi,%ebp), %edi
105 movl %edi, scan(%esp)
106
107/* Determine how many bytes the scan ptr is off from being */
108/* dword-aligned. */
109
110 movl %edi, %eax
111 negl %eax
112 andl $3, %eax
113 movl %eax, scanalign(%esp)
114
115/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
116/* s->strstart - (IPos)MAX_DIST(s) : NIL; */
117
118 movl dsWSize(%edx), %eax
119 subl $MIN_LOOKAHEAD, %eax
120 subl %eax, %ebp
121 jg LimitPositive
122 xorl %ebp, %ebp
123LimitPositive:
124
125/* unsigned chain_length = s->max_chain_length; */
126/* if (s->prev_length >= s->good_match) { */
127/* chain_length >>= 2; */
128/* } */
129
130 movl dsPrevLen(%edx), %eax
131 movl dsGoodMatch(%edx), %ebx
132 cmpl %ebx, %eax
133 movl dsMaxChainLen(%edx), %ebx
134 jl LastMatchGood
135 shrl $2, %ebx
136LastMatchGood:
137
138/* chainlen is decremented once beforehand so that the function can */
139/* use the sign flag instead of the zero flag for the exit test. */
140/* It is then shifted into the high word, to make room for the scanend */
141/* scanend value, which it will always accompany. */
142
143 decl %ebx
144 shll $16, %ebx
145
146/* int best_len = s->prev_length; */
147
148 movl dsPrevLen(%edx), %eax
149 movl %eax, bestlen(%esp)
150
151/* Store the sum of s->window + best_len in %esi locally, and in %esi. */
152
153 addl %eax, %esi
154 movl %esi, windowbestlen(%esp)
155
156/* register ush scan_start = *(ushf*)scan; */
157/* register ush scan_end = *(ushf*)(scan+best_len-1); */
158
159 movw (%edi), %bx
160 movw %bx, scanstart(%esp)
161 movw -1(%edi,%eax), %bx
162 movl %ebx, chainlenscanend(%esp)
163
164/* Posf *prev = s->prev; */
165/* uInt wmask = s->w_mask; */
166
167 movl dsPrev(%edx), %edi
168 movl dsWMask(%edx), %edx
169 mov %edx, wmask(%esp)
170
171/* Jump into the main loop. */
172
173 jmp LoopEntry
174
175.balign 16
176
177/* do {
178 * match = s->window + cur_match;
179 * if (*(ushf*)(match+best_len-1) != scan_end ||
180 * *(ushf*)match != scan_start) continue;
181 * [...]
182 * } while ((cur_match = prev[cur_match & wmask]) > limit
183 * && --chain_length != 0);
184 *
185 * Here is the inner loop of the function. The function will spend the
186 * majority of its time in this loop, and majority of that time will
187 * be spent in the first ten instructions.
188 *
189 * Within this loop:
190 * %ebx = chainlenscanend - i.e., ((chainlen << 16) | scanend)
191 * %ecx = curmatch
192 * %edx = curmatch & wmask
193 * %esi = windowbestlen - i.e., (window + bestlen)
194 * %edi = prev
195 * %ebp = limit
196 *
197 * Two optimization notes on the choice of instructions:
198 *
199 * The first instruction uses a 16-bit address, which costs an extra,
200 * unpairable cycle. This is cheaper than doing a 32-bit access and
201 * zeroing the high word, due to the 3-cycle misalignment penalty which
202 * would occur half the time. This also turns out to be cheaper than
203 * doing two separate 8-bit accesses, as the memory is so rarely in the
204 * L1 cache.
205 *
206 * The window buffer, however, apparently spends a lot of time in the
207 * cache, and so it is faster to retrieve the word at the end of the
208 * match string with two 8-bit loads. The instructions that test the
209 * word at the beginning of the match string, however, are executed
210 * much less frequently, and there it was cheaper to use 16-bit
211 * instructions, which avoided the necessity of saving off and
212 * subsequently reloading one of the other registers.
213 */
214LookupLoop:
215 /* 1 U & V */
216 movw (%edi,%edx,2), %cx /* 2 U pipe */
217 movl wmask(%esp), %edx /* 2 V pipe */
218 cmpl %ebp, %ecx /* 3 U pipe */
219 jbe LeaveNow /* 3 V pipe */
220 subl $0x00010000, %ebx /* 4 U pipe */
221 js LeaveNow /* 4 V pipe */
222LoopEntry: movb -1(%esi,%ecx), %al /* 5 U pipe */
223 andl %ecx, %edx /* 5 V pipe */
224 cmpb %bl, %al /* 6 U pipe */
225 jnz LookupLoop /* 6 V pipe */
226 movb (%esi,%ecx), %ah
227 cmpb %bh, %ah
228 jnz LookupLoop
229 movl window(%esp), %eax
230 movw (%eax,%ecx), %ax
231 cmpw scanstart(%esp), %ax
232 jnz LookupLoop
233
234/* Store the current value of chainlen. */
235
236 movl %ebx, chainlenscanend(%esp)
237
238/* Point %edi to the string under scrutiny, and %esi to the string we */
239/* are hoping to match it up with. In actuality, %esi and %edi are */
240/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */
241/* initialized to -(MAX_MATCH_8 - scanalign). */
242
243 movl window(%esp), %esi
244 movl scan(%esp), %edi
245 addl %ecx, %esi
246 movl scanalign(%esp), %eax
247 movl $(-MAX_MATCH_8), %edx
248 lea MAX_MATCH_8(%edi,%eax), %edi
249 lea MAX_MATCH_8(%esi,%eax), %esi
250
251/* Test the strings for equality, 8 bytes at a time. At the end,
252 * adjust %edx so that it is offset to the exact byte that mismatched.
253 *
254 * We already know at this point that the first three bytes of the
255 * strings match each other, and they can be safely passed over before
256 * starting the compare loop. So what this code does is skip over 0-3
257 * bytes, as much as necessary in order to dword-align the %edi
258 * pointer. (%esi will still be misaligned three times out of four.)
259 *
260 * It should be confessed that this loop usually does not represent
261 * much of the total running time. Replacing it with a more
262 * straightforward "rep cmpsb" would not drastically degrade
263 * performance.
264 */
265LoopCmps:
266 movl (%esi,%edx), %eax
267 movl (%edi,%edx), %ebx
268 xorl %ebx, %eax
269 jnz LeaveLoopCmps
270 movl 4(%esi,%edx), %eax
271 movl 4(%edi,%edx), %ebx
272 xorl %ebx, %eax
273 jnz LeaveLoopCmps4
274 addl $8, %edx
275 jnz LoopCmps
276 jmp LenMaximum
277LeaveLoopCmps4: addl $4, %edx
278LeaveLoopCmps: testl $0x0000FFFF, %eax
279 jnz LenLower
280 addl $2, %edx
281 shrl $16, %eax
282LenLower: subb $1, %al
283 adcl $0, %edx
284
285/* Calculate the length of the match. If it is longer than MAX_MATCH, */
286/* then automatically accept it as the best possible match and leave. */
287
288 lea (%edi,%edx), %eax
289 movl scan(%esp), %edi
290 subl %edi, %eax
291 cmpl $MAX_MATCH, %eax
292 jge LenMaximum
293
294/* If the length of the match is not longer than the best match we */
295/* have so far, then forget it and return to the lookup loop. */
296
297 movl deflatestate(%esp), %edx
298 movl bestlen(%esp), %ebx
299 cmpl %ebx, %eax
300 jg LongerMatch
301 movl chainlenscanend(%esp), %ebx
302 movl windowbestlen(%esp), %esi
303 movl dsPrev(%edx), %edi
304 movl wmask(%esp), %edx
305 andl %ecx, %edx
306 jmp LookupLoop
307
308/* s->match_start = cur_match; */
309/* best_len = len; */
310/* if (len >= nice_match) break; */
311/* scan_end = *(ushf*)(scan+best_len-1); */
312
313LongerMatch: movl nicematch(%esp), %ebx
314 movl %eax, bestlen(%esp)
315 movl %ecx, dsMatchStart(%edx)
316 cmpl %ebx, %eax
317 jge LeaveNow
318 movl window(%esp), %esi
319 addl %eax, %esi
320 movl %esi, windowbestlen(%esp)
321 movl chainlenscanend(%esp), %ebx
322 movw -1(%edi,%eax), %bx
323 movl dsPrev(%edx), %edi
324 movl %ebx, chainlenscanend(%esp)
325 movl wmask(%esp), %edx
326 andl %ecx, %edx
327 jmp LookupLoop
328
329/* Accept the current string, with the maximum possible length. */
330
331LenMaximum: movl deflatestate(%esp), %edx
332 movl $MAX_MATCH, bestlen(%esp)
333 movl %ecx, dsMatchStart(%edx)
334
335/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
336/* return s->lookahead; */
337
338LeaveNow:
339 movl deflatestate(%esp), %edx
340 movl bestlen(%esp), %ebx
341 movl dsLookahead(%edx), %eax
342 cmpl %eax, %ebx
343 jg LookaheadRet
344 movl %ebx, %eax
345LookaheadRet:
346
347/* Restore the stack and return from whence we came. */
348
349 addl $LocalVarsSize, %esp
350 popl %ebx
351 popl %esi
352 popl %edi
353 popl %ebp
354match_init: ret