author     Mark Adler <zlib@madler.net>   2017-10-12 20:08:53 -0700
committer  Mark Adler <zlib@madler.net>   2017-10-12 20:27:14 -0700
commit     288f1080317b954b6bdca33708631c011549c008 (patch)
tree       9629f01104722ba8e490f04a0790c56513ba989a /contrib/asm686
parent     a5773513942b1c57d0eff51fcb2ebac72796ed95 (diff)
Remove old assembler code in which bugs have manifested.
In addition, there is not sufficient gain from the inflate assembler code to warrant its inclusion.
Diffstat (limited to 'contrib/asm686')
-rw-r--r--  contrib/asm686/README.686    51
-rw-r--r--  contrib/asm686/match.S      357
2 files changed, 0 insertions, 408 deletions
diff --git a/contrib/asm686/README.686 b/contrib/asm686/README.686
deleted file mode 100644
index a0bf3be..0000000
--- a/contrib/asm686/README.686
+++ /dev/null
@@ -1,51 +0,0 @@
This is a patched version of zlib, modified to use
Pentium-Pro-optimized assembly code in the deflation algorithm. The
files changed/added by this patch are:

README.686
match.S

The speedup that this patch provides varies, depending on whether the
compiler used to build the original version of zlib falls afoul of the
PPro's speed traps. My own tests show a speedup of around 10-20% at
the default compression level, and 20-30% using -9, against a version
compiled using gcc 2.7.2.3. Your mileage may vary.

Note that this code has been tailored for the PPro/PII in particular,
and will not perform particularly well on a Pentium.

If you are using an assembler other than GNU as, you will have to
translate match.S to use your assembler's syntax. (Have fun.)

Brian Raiter
breadbox@muppetlabs.com
April, 1998


Added for zlib 1.1.3:

The patches come from
http://www.muppetlabs.com/~breadbox/software/assembly.html

To compile zlib with this asm file, copy match.S to the zlib directory
then do:

CFLAGS="-O3 -DASMV" ./configure
make OBJA=match.o


Update:

I've been ignoring these assembly routines for years, believing that
gcc's generated code had caught up with it sometime around gcc 2.95
and the major rearchitecting of the Pentium 4. However, I recently
learned that, despite what I believed, this code still has some life
in it. On the Pentium 4 and AMD64 chips, it continues to run about 8%
faster than the code produced by gcc 4.1.

In acknowledgement of its continuing usefulness, I've altered the
license to match that of the rest of zlib. Share and Enjoy!

Brian Raiter
breadbox@muppetlabs.com
April, 2007
diff --git a/contrib/asm686/match.S b/contrib/asm686/match.S
deleted file mode 100644
index fa42109..0000000
--- a/contrib/asm686/match.S
+++ /dev/null
@@ -1,357 +0,0 @@
/* match.S -- x86 assembly version of the zlib longest_match() function.
 * Optimized for the Intel 686 chips (PPro and later).
 *
 * Copyright (C) 1998, 2007 Brian Raiter <breadbox@muppetlabs.com>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the author be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#ifndef NO_UNDERLINE
#define match_init      _match_init
#define longest_match   _longest_match
#endif

#define MAX_MATCH       (258)
#define MIN_MATCH       (3)
#define MIN_LOOKAHEAD   (MAX_MATCH + MIN_MATCH + 1)
#define MAX_MATCH_8     ((MAX_MATCH + 7) & ~7)

/* stack frame offsets */

#define chainlenwmask    0      /* high word: current chain len */
                                /* low word: s->wmask           */
#define window           4      /* local copy of s->window      */
#define windowbestlen    8      /* s->window + bestlen          */
#define scanstart       16      /* first two bytes of string    */
#define scanend         12      /* last two bytes of string     */
#define scanalign       20      /* dword-misalignment of string */
#define nicematch       24      /* a good enough match size     */
#define bestlen         28      /* size of best match so far    */
#define scan            32      /* ptr to string wanting match  */

#define LocalVarsSize   (36)
/* saved ebx            36 */
/* saved edi            40 */
/* saved esi            44 */
/* saved ebp            48 */
/* return address       52 */
#define deflatestate    56      /* the function arguments */
#define curmatch        60

/* All the +zlib1222add offsets are due to the addition of fields
 * to the deflate_state structure in zlib since this asm code was first written
 * (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
 * (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
 * (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").
 */

#define zlib1222add     (8)

#define dsWSize         (36+zlib1222add)
#define dsWMask         (44+zlib1222add)
#define dsWindow        (48+zlib1222add)
#define dsPrev          (56+zlib1222add)
#define dsMatchLen      (88+zlib1222add)
#define dsPrevMatch     (92+zlib1222add)
#define dsStrStart      (100+zlib1222add)
#define dsMatchStart    (104+zlib1222add)
#define dsLookahead     (108+zlib1222add)
#define dsPrevLen       (112+zlib1222add)
#define dsMaxChainLen   (116+zlib1222add)
#define dsGoodMatch     (132+zlib1222add)
#define dsNiceMatch     (136+zlib1222add)


.file "match.S"

.globl  match_init, longest_match

.text

/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
.cfi_sections .debug_frame

longest_match:

.cfi_startproc
/* Save registers that the compiler may be using, and adjust %esp to */
/* make room for our stack frame. */

        pushl   %ebp
        .cfi_def_cfa_offset 8
        .cfi_offset ebp, -8
        pushl   %edi
        .cfi_def_cfa_offset 12
        pushl   %esi
        .cfi_def_cfa_offset 16
        pushl   %ebx
        .cfi_def_cfa_offset 20
        subl    $LocalVarsSize, %esp
        .cfi_def_cfa_offset LocalVarsSize+20

/* Retrieve the function arguments. %ecx will hold cur_match */
/* throughout the entire function. %edx will hold the pointer to the */
/* deflate_state structure during the function's setup (before */
/* entering the main loop). */

        movl    deflatestate(%esp), %edx
        movl    curmatch(%esp), %ecx

/* uInt wmask = s->w_mask; */
/* unsigned chain_length = s->max_chain_length; */
/* if (s->prev_length >= s->good_match) { */
/*     chain_length >>= 2; */
/* } */

        movl    dsPrevLen(%edx), %eax
        movl    dsGoodMatch(%edx), %ebx
        cmpl    %ebx, %eax
        movl    dsWMask(%edx), %eax
        movl    dsMaxChainLen(%edx), %ebx
        jl      LastMatchGood
        shrl    $2, %ebx
LastMatchGood:

/* chainlen is decremented once beforehand so that the function can */
/* use the sign flag instead of the zero flag for the exit test. */
/* It is then shifted into the high word, to make room for the wmask */
/* value, which it will always accompany. */

        decl    %ebx
        shll    $16, %ebx
        orl     %eax, %ebx
        movl    %ebx, chainlenwmask(%esp)

/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */

        movl    dsNiceMatch(%edx), %eax
        movl    dsLookahead(%edx), %ebx
        cmpl    %eax, %ebx
        jl      LookaheadLess
        movl    %eax, %ebx
LookaheadLess:  movl    %ebx, nicematch(%esp)

/* register Bytef *scan = s->window + s->strstart; */

        movl    dsWindow(%edx), %esi
        movl    %esi, window(%esp)
        movl    dsStrStart(%edx), %ebp
        lea     (%esi,%ebp), %edi
        movl    %edi, scan(%esp)

/* Determine how many bytes the scan ptr is off from being */
/* dword-aligned. */

        movl    %edi, %eax
        negl    %eax
        andl    $3, %eax
        movl    %eax, scanalign(%esp)

/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
/*     s->strstart - (IPos)MAX_DIST(s) : NIL; */

        movl    dsWSize(%edx), %eax
        subl    $MIN_LOOKAHEAD, %eax
        subl    %eax, %ebp
        jg      LimitPositive
        xorl    %ebp, %ebp
LimitPositive:

/* int best_len = s->prev_length; */

        movl    dsPrevLen(%edx), %eax
        movl    %eax, bestlen(%esp)

/* Store the sum of s->window + best_len in windowbestlen locally, and in %esi. */

        addl    %eax, %esi
        movl    %esi, windowbestlen(%esp)

/* register ush scan_start = *(ushf*)scan; */
/* register ush scan_end = *(ushf*)(scan+best_len-1); */
/* Posf *prev = s->prev; */

        movzwl  (%edi), %ebx
        movl    %ebx, scanstart(%esp)
        movzwl  -1(%edi,%eax), %ebx
        movl    %ebx, scanend(%esp)
        movl    dsPrev(%edx), %edi

/* Jump into the main loop. */

        movl    chainlenwmask(%esp), %edx
        jmp     LoopEntry

.balign 16

/* do {
 *     match = s->window + cur_match;
 *     if (*(ushf*)(match+best_len-1) != scan_end ||
 *         *(ushf*)match != scan_start) continue;
 *     [...]
 * } while ((cur_match = prev[cur_match & wmask]) > limit
 *          && --chain_length != 0);
 *
 * Here is the inner loop of the function. The function will spend the
 * majority of its time in this loop, and the majority of that time will
 * be spent in the first ten instructions.
 *
 * Within this loop:
 * %ebx = scanend
 * %ecx = curmatch
 * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
 * %esi = windowbestlen - i.e., (window + bestlen)
 * %edi = prev
 * %ebp = limit
 */
LookupLoop:
        andl    %edx, %ecx
        movzwl  (%edi,%ecx,2), %ecx
        cmpl    %ebp, %ecx
        jbe     LeaveNow
        subl    $0x00010000, %edx
        js      LeaveNow
LoopEntry:      movzwl  -1(%esi,%ecx), %eax
        cmpl    %ebx, %eax
        jnz     LookupLoop
        movl    window(%esp), %eax
        movzwl  (%eax,%ecx), %eax
        cmpl    scanstart(%esp), %eax
        jnz     LookupLoop

/* Store the current value of chainlen. */

        movl    %edx, chainlenwmask(%esp)

/* Point %edi to the string under scrutiny, and %esi to the string we */
/* are hoping to match it up with. In actuality, %esi and %edi are */
/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */
/* initialized to -(MAX_MATCH_8 - scanalign). */

        movl    window(%esp), %esi
        movl    scan(%esp), %edi
        addl    %ecx, %esi
        movl    scanalign(%esp), %eax
        movl    $(-MAX_MATCH_8), %edx
        lea     MAX_MATCH_8(%edi,%eax), %edi
        lea     MAX_MATCH_8(%esi,%eax), %esi

/* Test the strings for equality, 8 bytes at a time. At the end,
 * adjust %edx so that it is offset to the exact byte that mismatched.
 *
 * We already know at this point that the first three bytes of the
 * strings match each other, and they can be safely passed over before
 * starting the compare loop. So what this code does is skip over 0-3
 * bytes, as much as necessary in order to dword-align the %edi
 * pointer. (%esi will still be misaligned three times out of four.)
 *
 * It should be confessed that this loop usually does not represent
 * much of the total running time. Replacing it with a more
 * straightforward "rep cmpsb" would not drastically degrade
 * performance.
 */
LoopCmps:
        movl    (%esi,%edx), %eax
        xorl    (%edi,%edx), %eax
        jnz     LeaveLoopCmps
        movl    4(%esi,%edx), %eax
        xorl    4(%edi,%edx), %eax
        jnz     LeaveLoopCmps4
        addl    $8, %edx
        jnz     LoopCmps
        jmp     LenMaximum
LeaveLoopCmps4: addl    $4, %edx
LeaveLoopCmps:  testl   $0x0000FFFF, %eax
        jnz     LenLower
        addl    $2, %edx
        shrl    $16, %eax
LenLower:       subb    $1, %al
        adcl    $0, %edx

/* Calculate the length of the match. If it is longer than MAX_MATCH, */
/* then automatically accept it as the best possible match and leave. */

        lea     (%edi,%edx), %eax
        movl    scan(%esp), %edi
        subl    %edi, %eax
        cmpl    $MAX_MATCH, %eax
        jge     LenMaximum

/* If the length of the match is not longer than the best match we */
/* have so far, then forget it and return to the lookup loop. */

        movl    deflatestate(%esp), %edx
        movl    bestlen(%esp), %ebx
        cmpl    %ebx, %eax
        jg      LongerMatch
        movl    windowbestlen(%esp), %esi
        movl    dsPrev(%edx), %edi
        movl    scanend(%esp), %ebx
        movl    chainlenwmask(%esp), %edx
        jmp     LookupLoop

/* s->match_start = cur_match; */
/* best_len = len; */
/* if (len >= nice_match) break; */
/* scan_end = *(ushf*)(scan+best_len-1); */

LongerMatch:    movl    nicematch(%esp), %ebx
        movl    %eax, bestlen(%esp)
        movl    %ecx, dsMatchStart(%edx)
        cmpl    %ebx, %eax
        jge     LeaveNow
        movl    window(%esp), %esi
        addl    %eax, %esi
        movl    %esi, windowbestlen(%esp)
        movzwl  -1(%edi,%eax), %ebx
        movl    dsPrev(%edx), %edi
        movl    %ebx, scanend(%esp)
        movl    chainlenwmask(%esp), %edx
        jmp     LookupLoop

/* Accept the current string, with the maximum possible length. */

LenMaximum:     movl    deflatestate(%esp), %edx
        movl    $MAX_MATCH, bestlen(%esp)
        movl    %ecx, dsMatchStart(%edx)

/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
/* return s->lookahead; */

LeaveNow:
        movl    deflatestate(%esp), %edx
        movl    bestlen(%esp), %ebx
        movl    dsLookahead(%edx), %eax
        cmpl    %eax, %ebx
        jg      LookaheadRet
        movl    %ebx, %eax
LookaheadRet:

/* Restore the stack and return from whence we came. */

        addl    $LocalVarsSize, %esp
        .cfi_def_cfa_offset 20
        popl    %ebx
        .cfi_def_cfa_offset 16
        popl    %esi
        .cfi_def_cfa_offset 12
        popl    %edi
        .cfi_def_cfa_offset 8
        popl    %ebp
        .cfi_def_cfa_offset 4
.cfi_endproc
match_init:     ret
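
The comment above LoopCmps describes the removed routine's core trick: compare the scan string and the candidate eight bytes per iteration, XORing 32-bit words so that the first nonzero result pinpoints the mismatching byte. The C fragment below is an illustrative sketch of that technique only, not zlib source: the helper name common_prefix_len is hypothetical, it steps four bytes at a time rather than eight, and it assumes a little-endian target, just as the x86 code does.

    #include <stdint.h>
    #include <string.h>

    /* Length of the common prefix of a and b, at most maxlen bytes.
     * A nonzero XOR of two words identifies the first mismatching byte. */
    static size_t common_prefix_len(const unsigned char *a,
                                    const unsigned char *b, size_t maxlen)
    {
        size_t i = 0;
        while (i + 4 <= maxlen) {
            uint32_t wa, wb;
            memcpy(&wa, a + i, 4);      /* unaligned-safe word loads */
            memcpy(&wb, b + i, 4);
            uint32_t x = wa ^ wb;
            if (x != 0) {
                if ((x & 0xFFFFu) == 0) { i += 2; x >>= 16; }  /* low two bytes equal */
                if ((x & 0xFFu) == 0) i += 1;                  /* next byte equal too */
                return i;               /* index of the first mismatch */
            }
            i += 4;
        }
        while (i < maxlen && a[i] == b[i])
            i++;
        return i;
    }

The assembly additionally biases both pointers and the loop index by MAX_MATCH_8, so the index addition itself doubles as the loop-termination test (the fall-through to LenMaximum when it reaches zero); that refinement is omitted from the sketch.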