author     Mark Adler <zlib@madler.net>    2017-10-12 20:08:53 -0700
committer  Mark Adler <zlib@madler.net>    2017-10-12 20:27:14 -0700
commit     288f1080317b954b6bdca33708631c011549c008
tree       9629f01104722ba8e490f04a0790c56513ba989a
parent     a5773513942b1c57d0eff51fcb2ebac72796ed95
Remove old assembler code in which bugs have manifested.
In addition, there is not sufficient gain from the inflate assembler code to warrant its inclusion.
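For context, the removed files accelerated deflate's hash-chain match search and inflate's fast decode loop; after this commit the portable C implementations in deflate.c and inffast.c are the only code paths. Below is a minimal C sketch of the hash-chain loop that the removed longest_match() assembler implemented, mirroring the C pseudocode quoted in the assembler's own comments. The types and the elided full comparison are assumptions for illustration; this is not zlib's actual deflate.c.

/* Illustrative sketch only -- simplified types, full byte comparison elided. */
typedef unsigned char  Bytef;
typedef unsigned short ush;
typedef unsigned short Posf;
typedef unsigned int   IPos;

static unsigned longest_match_sketch(const Bytef *window, const Posf *prev,
                                     unsigned wmask, IPos cur_match, IPos limit,
                                     unsigned strstart, unsigned chain_length,
                                     unsigned best_len)
{
    const Bytef *scan = window + strstart;                 /* string under scrutiny  */
    ush scan_start = *(const ush *)scan;                   /* its first two bytes    */
    ush scan_end   = *(const ush *)(scan + best_len - 1);  /* last two bytes to beat */

    do {
        const Bytef *match = window + cur_match;
        /* Cheap filters first: a candidate must match at both ends before the
         * (elided) byte-by-byte comparison and best_len/match_start update. */
        if (*(const ush *)(match + best_len - 1) != scan_end ||
            *(const ush *)match != scan_start)
            continue;
        /* ... compare up to MAX_MATCH bytes, update best_len and match_start ... */
    } while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length != 0);

    return best_len;
}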
-rw-r--r--  contrib/README.contrib           21
-rw-r--r--  contrib/amd64/amd64-match.S     452
-rw-r--r--  contrib/asm686/README.686        51
-rw-r--r--  contrib/asm686/match.S          357
-rw-r--r--  contrib/inflate86/inffas86.c   1157
-rw-r--r--  contrib/inflate86/inffast.S    1368
-rw-r--r--  contrib/masmx64/bld_ml64.bat      2
-rw-r--r--  contrib/masmx64/gvmat64.asm     553
-rw-r--r--  contrib/masmx64/inffas8664.c    186
-rw-r--r--  contrib/masmx64/inffasx64.asm   396
-rw-r--r--  contrib/masmx64/readme.txt       31
-rw-r--r--  contrib/masmx86/bld_ml32.bat      2
-rw-r--r--  contrib/masmx86/inffas32.asm   1080
-rw-r--r--  contrib/masmx86/match686.asm    479
-rw-r--r--  contrib/masmx86/readme.txt       27
-rw-r--r--  win32/Makefile.bor                1
-rw-r--r--  win32/Makefile.gcc                5
-rw-r--r--  win32/Makefile.msc                4
18 files changed, 0 insertions, 6172 deletions
diff --git a/contrib/README.contrib b/contrib/README.contrib
index a411d5c..335e435 100644
--- a/contrib/README.contrib
+++ b/contrib/README.contrib
@@ -8,14 +8,6 @@ ada/ by Dmitriy Anisimkov <anisimkov@yahoo.com>
             Support for Ada
             See http://zlib-ada.sourceforge.net/
 
-amd64/      by Mikhail Teterin <mi@ALDAN.algebra.com>
-            asm code for AMD64
-            See patch at http://www.freebsd.org/cgi/query-pr.cgi?pr=bin/96393
-
-asm686/     by Brian Raiter <breadbox@muppetlabs.com>
-            asm code for Pentium and PPro/PII, using the AT&T (GNU as) syntax
-            See http://www.muppetlabs.com/~breadbox/software/assembly.html
-
 blast/      by Mark Adler <madler@alumni.caltech.edu>
             Decompressor for output of PKWare Data Compression Library (DCL)
 
@@ -32,9 +24,6 @@ gcc_gvmat64/by Gilles Vollant <info@winimage.com>
 infback9/   by Mark Adler <madler@alumni.caltech.edu>
             Unsupported diffs to infback to decode the deflate64 format
 
-inflate86/  by Chris Anderson <christop@charm.net>
-            Tuned x86 gcc asm code to replace inflate_fast()
-
 iostream/   by Kevin Ruland <kevin@rodin.wustl.edu>
             A C++ I/O streams interface to the zlib gz* functions
 
@@ -45,16 +34,6 @@ iostream3/ by Ludwig Schwardt <schwardt@sun.ac.za>
             and Kevin Ruland <kevin@rodin.wustl.edu>
             Yet another C++ I/O streams interface
 
-masmx64/    by Gilles Vollant <info@winimage.com>
-            x86 64-bit (AMD64 and Intel EM64t) code for x64 assembler to
-            replace longest_match() and inflate_fast(), also masm x86
-            64-bits translation of Chris Anderson inflate_fast()
-
-masmx86/    by Gilles Vollant <info@winimage.com>
-            x86 asm code to replace longest_match() and inflate_fast(),
-            for Visual C++ and MASM (32 bits).
-            Based on Brian Raiter (asm686) and Chris Anderson (inflate86)
-
 minizip/    by Gilles Vollant <info@winimage.com>
             Mini zip and unzip based on zlib
             Includes Zip64 support by Mathias Svensson <mathias@result42.com>
diff --git a/contrib/amd64/amd64-match.S b/contrib/amd64/amd64-match.S
deleted file mode 100644
index 81d4a1c..0000000
--- a/contrib/amd64/amd64-match.S
+++ /dev/null
@@ -1,452 +0,0 @@
1/*
2 * match.S -- optimized version of longest_match()
3 * based on the similar work by Gilles Vollant, and Brian Raiter, written 1998
4 *
5 * This is free software; you can redistribute it and/or modify it
6 * under the terms of the BSD License. Use by owners of Che Guevarra
7 * parafernalia is prohibited, where possible, and highly discouraged
8 * elsewhere.
9 */
10
11#ifndef NO_UNDERLINE
12# define match_init _match_init
13# define longest_match _longest_match
14#endif
15
16#define scanend ebx
17#define scanendw bx
18#define chainlenwmask edx /* high word: current chain len low word: s->wmask */
19#define curmatch rsi
20#define curmatchd esi
21#define windowbestlen r8
22#define scanalign r9
23#define scanalignd r9d
24#define window r10
25#define bestlen r11
26#define bestlend r11d
27#define scanstart r12d
28#define scanstartw r12w
29#define scan r13
30#define nicematch r14d
31#define limit r15
32#define limitd r15d
33#define prev rcx
34
35/*
36 * The 258 is a "magic number, not a parameter -- changing it
37 * breaks the hell loose
38 */
39#define MAX_MATCH (258)
40#define MIN_MATCH (3)
41#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1)
42#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7)
43
44/* stack frame offsets */
45#define LocalVarsSize (112)
46#define _chainlenwmask ( 8-LocalVarsSize)(%rsp)
47#define _windowbestlen (16-LocalVarsSize)(%rsp)
48#define save_r14 (24-LocalVarsSize)(%rsp)
49#define save_rsi (32-LocalVarsSize)(%rsp)
50#define save_rbx (40-LocalVarsSize)(%rsp)
51#define save_r12 (56-LocalVarsSize)(%rsp)
52#define save_r13 (64-LocalVarsSize)(%rsp)
53#define save_r15 (80-LocalVarsSize)(%rsp)
54
55
56.globl match_init, longest_match
57
58/*
59 * On AMD64 the first argument of a function (in our case -- the pointer to
60 * deflate_state structure) is passed in %rdi, hence our offsets below are
61 * all off of that.
62 */
63
64/* you can check the structure offset by running
65
66#include <stdlib.h>
67#include <stdio.h>
68#include "deflate.h"
69
70void print_depl()
71{
72deflate_state ds;
73deflate_state *s=&ds;
74printf("size pointer=%u\n",(int)sizeof(void*));
75
76printf("#define dsWSize (%3u)(%%rdi)\n",(int)(((char*)&(s->w_size))-((char*)s)));
77printf("#define dsWMask (%3u)(%%rdi)\n",(int)(((char*)&(s->w_mask))-((char*)s)));
78printf("#define dsWindow (%3u)(%%rdi)\n",(int)(((char*)&(s->window))-((char*)s)));
79printf("#define dsPrev (%3u)(%%rdi)\n",(int)(((char*)&(s->prev))-((char*)s)));
80printf("#define dsMatchLen (%3u)(%%rdi)\n",(int)(((char*)&(s->match_length))-((char*)s)));
81printf("#define dsPrevMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_match))-((char*)s)));
82printf("#define dsStrStart (%3u)(%%rdi)\n",(int)(((char*)&(s->strstart))-((char*)s)));
83printf("#define dsMatchStart (%3u)(%%rdi)\n",(int)(((char*)&(s->match_start))-((char*)s)));
84printf("#define dsLookahead (%3u)(%%rdi)\n",(int)(((char*)&(s->lookahead))-((char*)s)));
85printf("#define dsPrevLen (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_length))-((char*)s)));
86printf("#define dsMaxChainLen (%3u)(%%rdi)\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
87printf("#define dsGoodMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->good_match))-((char*)s)));
88printf("#define dsNiceMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->nice_match))-((char*)s)));
89}
90
91*/
92
93
94/*
95 to compile for XCode 3.2 on MacOSX x86_64
96 - run "gcc -g -c -DXCODE_MAC_X64_STRUCTURE amd64-match.S"
97 */
98
99
100#ifndef CURRENT_LINX_XCODE_MAC_X64_STRUCTURE
101#define dsWSize ( 68)(%rdi)
102#define dsWMask ( 76)(%rdi)
103#define dsWindow ( 80)(%rdi)
104#define dsPrev ( 96)(%rdi)
105#define dsMatchLen (144)(%rdi)
106#define dsPrevMatch (148)(%rdi)
107#define dsStrStart (156)(%rdi)
108#define dsMatchStart (160)(%rdi)
109#define dsLookahead (164)(%rdi)
110#define dsPrevLen (168)(%rdi)
111#define dsMaxChainLen (172)(%rdi)
112#define dsGoodMatch (188)(%rdi)
113#define dsNiceMatch (192)(%rdi)
114
115#else
116
117#ifndef STRUCT_OFFSET
118# define STRUCT_OFFSET (0)
119#endif
120
121
122#define dsWSize ( 56 + STRUCT_OFFSET)(%rdi)
123#define dsWMask ( 64 + STRUCT_OFFSET)(%rdi)
124#define dsWindow ( 72 + STRUCT_OFFSET)(%rdi)
125#define dsPrev ( 88 + STRUCT_OFFSET)(%rdi)
126#define dsMatchLen (136 + STRUCT_OFFSET)(%rdi)
127#define dsPrevMatch (140 + STRUCT_OFFSET)(%rdi)
128#define dsStrStart (148 + STRUCT_OFFSET)(%rdi)
129#define dsMatchStart (152 + STRUCT_OFFSET)(%rdi)
130#define dsLookahead (156 + STRUCT_OFFSET)(%rdi)
131#define dsPrevLen (160 + STRUCT_OFFSET)(%rdi)
132#define dsMaxChainLen (164 + STRUCT_OFFSET)(%rdi)
133#define dsGoodMatch (180 + STRUCT_OFFSET)(%rdi)
134#define dsNiceMatch (184 + STRUCT_OFFSET)(%rdi)
135
136#endif
137
138
139
140
141.text
142
143/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
144
145longest_match:
146/*
147 * Retrieve the function arguments. %curmatch will hold cur_match
148 * throughout the entire function (passed via rsi on amd64).
149 * rdi will hold the pointer to the deflate_state (first arg on amd64)
150 */
151 mov %rsi, save_rsi
152 mov %rbx, save_rbx
153 mov %r12, save_r12
154 mov %r13, save_r13
155 mov %r14, save_r14
156 mov %r15, save_r15
157
158/* uInt wmask = s->w_mask; */
159/* unsigned chain_length = s->max_chain_length; */
160/* if (s->prev_length >= s->good_match) { */
161/* chain_length >>= 2; */
162/* } */
163
164 movl dsPrevLen, %eax
165 movl dsGoodMatch, %ebx
166 cmpl %ebx, %eax
167 movl dsWMask, %eax
168 movl dsMaxChainLen, %chainlenwmask
169 jl LastMatchGood
170 shrl $2, %chainlenwmask
171LastMatchGood:
172
173/* chainlen is decremented once beforehand so that the function can */
174/* use the sign flag instead of the zero flag for the exit test. */
175/* It is then shifted into the high word, to make room for the wmask */
176/* value, which it will always accompany. */
177
178 decl %chainlenwmask
179 shll $16, %chainlenwmask
180 orl %eax, %chainlenwmask
181
182/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */
183
184 movl dsNiceMatch, %eax
185 movl dsLookahead, %ebx
186 cmpl %eax, %ebx
187 jl LookaheadLess
188 movl %eax, %ebx
189LookaheadLess: movl %ebx, %nicematch
190
191/* register Bytef *scan = s->window + s->strstart; */
192
193 mov dsWindow, %window
194 movl dsStrStart, %limitd
195 lea (%limit, %window), %scan
196
197/* Determine how many bytes the scan ptr is off from being */
198/* dword-aligned. */
199
200 mov %scan, %scanalign
201 negl %scanalignd
202 andl $3, %scanalignd
203
204/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
205/* s->strstart - (IPos)MAX_DIST(s) : NIL; */
206
207 movl dsWSize, %eax
208 subl $MIN_LOOKAHEAD, %eax
209 xorl %ecx, %ecx
210 subl %eax, %limitd
211 cmovng %ecx, %limitd
212
213/* int best_len = s->prev_length; */
214
215 movl dsPrevLen, %bestlend
216
217/* Store the sum of s->window + best_len in %windowbestlen locally, and in memory. */
218
219 lea (%window, %bestlen), %windowbestlen
220 mov %windowbestlen, _windowbestlen
221
222/* register ush scan_start = *(ushf*)scan; */
223/* register ush scan_end = *(ushf*)(scan+best_len-1); */
224/* Posf *prev = s->prev; */
225
226 movzwl (%scan), %scanstart
227 movzwl -1(%scan, %bestlen), %scanend
228 mov dsPrev, %prev
229
230/* Jump into the main loop. */
231
232 movl %chainlenwmask, _chainlenwmask
233 jmp LoopEntry
234
235.balign 16
236
237/* do {
238 * match = s->window + cur_match;
239 * if (*(ushf*)(match+best_len-1) != scan_end ||
240 * *(ushf*)match != scan_start) continue;
241 * [...]
242 * } while ((cur_match = prev[cur_match & wmask]) > limit
243 * && --chain_length != 0);
244 *
245 * Here is the inner loop of the function. The function will spend the
246 * majority of its time in this loop, and majority of that time will
247 * be spent in the first ten instructions.
248 */
249LookupLoop:
250 andl %chainlenwmask, %curmatchd
251 movzwl (%prev, %curmatch, 2), %curmatchd
252 cmpl %limitd, %curmatchd
253 jbe LeaveNow
254 subl $0x00010000, %chainlenwmask
255 js LeaveNow
256LoopEntry: cmpw -1(%windowbestlen, %curmatch), %scanendw
257 jne LookupLoop
258 cmpw %scanstartw, (%window, %curmatch)
259 jne LookupLoop
260
261/* Store the current value of chainlen. */
262 movl %chainlenwmask, _chainlenwmask
263
264/* %scan is the string under scrutiny, and %prev to the string we */
265/* are hoping to match it up with. In actuality, %esi and %edi are */
266/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */
267/* initialized to -(MAX_MATCH_8 - scanalign). */
268
269 mov $(-MAX_MATCH_8), %rdx
270 lea (%curmatch, %window), %windowbestlen
271 lea MAX_MATCH_8(%windowbestlen, %scanalign), %windowbestlen
272 lea MAX_MATCH_8(%scan, %scanalign), %prev
273
274/* the prefetching below makes very little difference... */
275 prefetcht1 (%windowbestlen, %rdx)
276 prefetcht1 (%prev, %rdx)
277
278/*
279 * Test the strings for equality, 8 bytes at a time. At the end,
280 * adjust %rdx so that it is offset to the exact byte that mismatched.
281 *
282 * It should be confessed that this loop usually does not represent
283 * much of the total running time. Replacing it with a more
284 * straightforward "rep cmpsb" would not drastically degrade
285 * performance -- unrolling it, for example, makes no difference.
286 */
287
288#undef USE_SSE /* works, but is 6-7% slower, than non-SSE... */
289
290LoopCmps:
291#ifdef USE_SSE
292 /* Preload the SSE registers */
293 movdqu (%windowbestlen, %rdx), %xmm1
294 movdqu (%prev, %rdx), %xmm2
295 pcmpeqb %xmm2, %xmm1
296 movdqu 16(%windowbestlen, %rdx), %xmm3
297 movdqu 16(%prev, %rdx), %xmm4
298 pcmpeqb %xmm4, %xmm3
299 movdqu 32(%windowbestlen, %rdx), %xmm5
300 movdqu 32(%prev, %rdx), %xmm6
301 pcmpeqb %xmm6, %xmm5
302 movdqu 48(%windowbestlen, %rdx), %xmm7
303 movdqu 48(%prev, %rdx), %xmm8
304 pcmpeqb %xmm8, %xmm7
305
306 /* Check the comparisions' results */
307 pmovmskb %xmm1, %rax
308 notw %ax
309 bsfw %ax, %ax
310 jnz LeaveLoopCmps
311
312 /* this is the only iteration of the loop with a possibility of having
313 incremented rdx by 0x108 (each loop iteration add 16*4 = 0x40
314 and (0x40*4)+8=0x108 */
315 add $8, %rdx
316 jz LenMaximum
317 add $8, %rdx
318
319
320 pmovmskb %xmm3, %rax
321 notw %ax
322 bsfw %ax, %ax
323 jnz LeaveLoopCmps
324
325
326 add $16, %rdx
327
328
329 pmovmskb %xmm5, %rax
330 notw %ax
331 bsfw %ax, %ax
332 jnz LeaveLoopCmps
333
334 add $16, %rdx
335
336
337 pmovmskb %xmm7, %rax
338 notw %ax
339 bsfw %ax, %ax
340 jnz LeaveLoopCmps
341
342 add $16, %rdx
343
344 jmp LoopCmps
345LeaveLoopCmps: add %rax, %rdx
346#else
347 mov (%windowbestlen, %rdx), %rax
348 xor (%prev, %rdx), %rax
349 jnz LeaveLoopCmps
350
351 mov 8(%windowbestlen, %rdx), %rax
352 xor 8(%prev, %rdx), %rax
353 jnz LeaveLoopCmps8
354
355 mov 16(%windowbestlen, %rdx), %rax
356 xor 16(%prev, %rdx), %rax
357 jnz LeaveLoopCmps16
358
359 add $24, %rdx
360 jnz LoopCmps
361 jmp LenMaximum
362# if 0
363/*
364 * This three-liner is tantalizingly simple, but bsf is a slow instruction,
365 * and the complicated alternative down below is quite a bit faster. Sad...
366 */
367
368LeaveLoopCmps: bsf %rax, %rax /* find the first non-zero bit */
369 shrl $3, %eax /* divide by 8 to get the byte */
370 add %rax, %rdx
371# else
372LeaveLoopCmps16:
373 add $8, %rdx
374LeaveLoopCmps8:
375 add $8, %rdx
376LeaveLoopCmps: testl $0xFFFFFFFF, %eax /* Check the first 4 bytes */
377 jnz Check16
378 add $4, %rdx
379 shr $32, %rax
380Check16: testw $0xFFFF, %ax
381 jnz LenLower
382 add $2, %rdx
383 shrl $16, %eax
384LenLower: subb $1, %al
385 adc $0, %rdx
386# endif
387#endif
388
389/* Calculate the length of the match. If it is longer than MAX_MATCH, */
390/* then automatically accept it as the best possible match and leave. */
391
392 lea (%prev, %rdx), %rax
393 sub %scan, %rax
394 cmpl $MAX_MATCH, %eax
395 jge LenMaximum
396
397/* If the length of the match is not longer than the best match we */
398/* have so far, then forget it and return to the lookup loop. */
399
400 cmpl %bestlend, %eax
401 jg LongerMatch
402 mov _windowbestlen, %windowbestlen
403 mov dsPrev, %prev
404 movl _chainlenwmask, %edx
405 jmp LookupLoop
406
407/* s->match_start = cur_match; */
408/* best_len = len; */
409/* if (len >= nice_match) break; */
410/* scan_end = *(ushf*)(scan+best_len-1); */
411
412LongerMatch:
413 movl %eax, %bestlend
414 movl %curmatchd, dsMatchStart
415 cmpl %nicematch, %eax
416 jge LeaveNow
417
418 lea (%window, %bestlen), %windowbestlen
419 mov %windowbestlen, _windowbestlen
420
421 movzwl -1(%scan, %rax), %scanend
422 mov dsPrev, %prev
423 movl _chainlenwmask, %chainlenwmask
424 jmp LookupLoop
425
426/* Accept the current string, with the maximum possible length. */
427
428LenMaximum:
429 movl $MAX_MATCH, %bestlend
430 movl %curmatchd, dsMatchStart
431
432/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
433/* return s->lookahead; */
434
435LeaveNow:
436 movl dsLookahead, %eax
437 cmpl %eax, %bestlend
438 cmovngl %bestlend, %eax
439LookaheadRet:
440
441/* Restore the registers and return from whence we came. */
442
443 mov save_rsi, %rsi
444 mov save_rbx, %rbx
445 mov save_r12, %r12
446 mov save_r13, %r13
447 mov save_r14, %r14
448 mov save_r15, %r15
449
450 ret
451
452match_init: ret
diff --git a/contrib/asm686/README.686 b/contrib/asm686/README.686
deleted file mode 100644
index a0bf3be..0000000
--- a/contrib/asm686/README.686
+++ /dev/null
@@ -1,51 +0,0 @@
1This is a patched version of zlib, modified to use
2Pentium-Pro-optimized assembly code in the deflation algorithm. The
3files changed/added by this patch are:
4
5README.686
6match.S
7
8The speedup that this patch provides varies, depending on whether the
9compiler used to build the original version of zlib falls afoul of the
10PPro's speed traps. My own tests show a speedup of around 10-20% at
11the default compression level, and 20-30% using -9, against a version
12compiled using gcc 2.7.2.3. Your mileage may vary.
13
14Note that this code has been tailored for the PPro/PII in particular,
15and will not perform particuarly well on a Pentium.
16
17If you are using an assembler other than GNU as, you will have to
18translate match.S to use your assembler's syntax. (Have fun.)
19
20Brian Raiter
21breadbox@muppetlabs.com
22April, 1998
23
24
25Added for zlib 1.1.3:
26
27The patches come from
28http://www.muppetlabs.com/~breadbox/software/assembly.html
29
30To compile zlib with this asm file, copy match.S to the zlib directory
31then do:
32
33CFLAGS="-O3 -DASMV" ./configure
34make OBJA=match.o
35
36
37Update:
38
39I've been ignoring these assembly routines for years, believing that
40gcc's generated code had caught up with it sometime around gcc 2.95
41and the major rearchitecting of the Pentium 4. However, I recently
42learned that, despite what I believed, this code still has some life
43in it. On the Pentium 4 and AMD64 chips, it continues to run about 8%
44faster than the code produced by gcc 4.1.
45
46In acknowledgement of its continuing usefulness, I've altered the
47license to match that of the rest of zlib. Share and Enjoy!
48
49Brian Raiter
50breadbox@muppetlabs.com
51April, 2007
diff --git a/contrib/asm686/match.S b/contrib/asm686/match.S
deleted file mode 100644
index fa42109..0000000
--- a/contrib/asm686/match.S
+++ /dev/null
@@ -1,357 +0,0 @@
1/* match.S -- x86 assembly version of the zlib longest_match() function.
2 * Optimized for the Intel 686 chips (PPro and later).
3 *
4 * Copyright (C) 1998, 2007 Brian Raiter <breadbox@muppetlabs.com>
5 *
6 * This software is provided 'as-is', without any express or implied
7 * warranty. In no event will the author be held liable for any damages
8 * arising from the use of this software.
9 *
10 * Permission is granted to anyone to use this software for any purpose,
11 * including commercial applications, and to alter it and redistribute it
12 * freely, subject to the following restrictions:
13 *
14 * 1. The origin of this software must not be misrepresented; you must not
15 * claim that you wrote the original software. If you use this software
16 * in a product, an acknowledgment in the product documentation would be
17 * appreciated but is not required.
18 * 2. Altered source versions must be plainly marked as such, and must not be
19 * misrepresented as being the original software.
20 * 3. This notice may not be removed or altered from any source distribution.
21 */
22
23#ifndef NO_UNDERLINE
24#define match_init _match_init
25#define longest_match _longest_match
26#endif
27
28#define MAX_MATCH (258)
29#define MIN_MATCH (3)
30#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1)
31#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7)
32
33/* stack frame offsets */
34
35#define chainlenwmask 0 /* high word: current chain len */
36 /* low word: s->wmask */
37#define window 4 /* local copy of s->window */
38#define windowbestlen 8 /* s->window + bestlen */
39#define scanstart 16 /* first two bytes of string */
40#define scanend 12 /* last two bytes of string */
41#define scanalign 20 /* dword-misalignment of string */
42#define nicematch 24 /* a good enough match size */
43#define bestlen 28 /* size of best match so far */
44#define scan 32 /* ptr to string wanting match */
45
46#define LocalVarsSize (36)
47/* saved ebx 36 */
48/* saved edi 40 */
49/* saved esi 44 */
50/* saved ebp 48 */
51/* return address 52 */
52#define deflatestate 56 /* the function arguments */
53#define curmatch 60
54
55/* All the +zlib1222add offsets are due to the addition of fields
56 * in zlib in the deflate_state structure since the asm code was first written
57 * (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
58 * (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
59 * if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
60 */
61
62#define zlib1222add (8)
63
64#define dsWSize (36+zlib1222add)
65#define dsWMask (44+zlib1222add)
66#define dsWindow (48+zlib1222add)
67#define dsPrev (56+zlib1222add)
68#define dsMatchLen (88+zlib1222add)
69#define dsPrevMatch (92+zlib1222add)
70#define dsStrStart (100+zlib1222add)
71#define dsMatchStart (104+zlib1222add)
72#define dsLookahead (108+zlib1222add)
73#define dsPrevLen (112+zlib1222add)
74#define dsMaxChainLen (116+zlib1222add)
75#define dsGoodMatch (132+zlib1222add)
76#define dsNiceMatch (136+zlib1222add)
77
78
79.file "match.S"
80
81.globl match_init, longest_match
82
83.text
84
85/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
86.cfi_sections .debug_frame
87
88longest_match:
89
90.cfi_startproc
91/* Save registers that the compiler may be using, and adjust %esp to */
92/* make room for our stack frame. */
93
94 pushl %ebp
95 .cfi_def_cfa_offset 8
96 .cfi_offset ebp, -8
97 pushl %edi
98 .cfi_def_cfa_offset 12
99 pushl %esi
100 .cfi_def_cfa_offset 16
101 pushl %ebx
102 .cfi_def_cfa_offset 20
103 subl $LocalVarsSize, %esp
104 .cfi_def_cfa_offset LocalVarsSize+20
105
106/* Retrieve the function arguments. %ecx will hold cur_match */
107/* throughout the entire function. %edx will hold the pointer to the */
108/* deflate_state structure during the function's setup (before */
109/* entering the main loop). */
110
111 movl deflatestate(%esp), %edx
112 movl curmatch(%esp), %ecx
113
114/* uInt wmask = s->w_mask; */
115/* unsigned chain_length = s->max_chain_length; */
116/* if (s->prev_length >= s->good_match) { */
117/* chain_length >>= 2; */
118/* } */
119
120 movl dsPrevLen(%edx), %eax
121 movl dsGoodMatch(%edx), %ebx
122 cmpl %ebx, %eax
123 movl dsWMask(%edx), %eax
124 movl dsMaxChainLen(%edx), %ebx
125 jl LastMatchGood
126 shrl $2, %ebx
127LastMatchGood:
128
129/* chainlen is decremented once beforehand so that the function can */
130/* use the sign flag instead of the zero flag for the exit test. */
131/* It is then shifted into the high word, to make room for the wmask */
132/* value, which it will always accompany. */
133
134 decl %ebx
135 shll $16, %ebx
136 orl %eax, %ebx
137 movl %ebx, chainlenwmask(%esp)
138
139/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */
140
141 movl dsNiceMatch(%edx), %eax
142 movl dsLookahead(%edx), %ebx
143 cmpl %eax, %ebx
144 jl LookaheadLess
145 movl %eax, %ebx
146LookaheadLess: movl %ebx, nicematch(%esp)
147
148/* register Bytef *scan = s->window + s->strstart; */
149
150 movl dsWindow(%edx), %esi
151 movl %esi, window(%esp)
152 movl dsStrStart(%edx), %ebp
153 lea (%esi,%ebp), %edi
154 movl %edi, scan(%esp)
155
156/* Determine how many bytes the scan ptr is off from being */
157/* dword-aligned. */
158
159 movl %edi, %eax
160 negl %eax
161 andl $3, %eax
162 movl %eax, scanalign(%esp)
163
164/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
165/* s->strstart - (IPos)MAX_DIST(s) : NIL; */
166
167 movl dsWSize(%edx), %eax
168 subl $MIN_LOOKAHEAD, %eax
169 subl %eax, %ebp
170 jg LimitPositive
171 xorl %ebp, %ebp
172LimitPositive:
173
174/* int best_len = s->prev_length; */
175
176 movl dsPrevLen(%edx), %eax
177 movl %eax, bestlen(%esp)
178
179/* Store the sum of s->window + best_len in %esi locally, and in %esi. */
180
181 addl %eax, %esi
182 movl %esi, windowbestlen(%esp)
183
184/* register ush scan_start = *(ushf*)scan; */
185/* register ush scan_end = *(ushf*)(scan+best_len-1); */
186/* Posf *prev = s->prev; */
187
188 movzwl (%edi), %ebx
189 movl %ebx, scanstart(%esp)
190 movzwl -1(%edi,%eax), %ebx
191 movl %ebx, scanend(%esp)
192 movl dsPrev(%edx), %edi
193
194/* Jump into the main loop. */
195
196 movl chainlenwmask(%esp), %edx
197 jmp LoopEntry
198
199.balign 16
200
201/* do {
202 * match = s->window + cur_match;
203 * if (*(ushf*)(match+best_len-1) != scan_end ||
204 * *(ushf*)match != scan_start) continue;
205 * [...]
206 * } while ((cur_match = prev[cur_match & wmask]) > limit
207 * && --chain_length != 0);
208 *
209 * Here is the inner loop of the function. The function will spend the
210 * majority of its time in this loop, and majority of that time will
211 * be spent in the first ten instructions.
212 *
213 * Within this loop:
214 * %ebx = scanend
215 * %ecx = curmatch
216 * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
217 * %esi = windowbestlen - i.e., (window + bestlen)
218 * %edi = prev
219 * %ebp = limit
220 */
221LookupLoop:
222 andl %edx, %ecx
223 movzwl (%edi,%ecx,2), %ecx
224 cmpl %ebp, %ecx
225 jbe LeaveNow
226 subl $0x00010000, %edx
227 js LeaveNow
228LoopEntry: movzwl -1(%esi,%ecx), %eax
229 cmpl %ebx, %eax
230 jnz LookupLoop
231 movl window(%esp), %eax
232 movzwl (%eax,%ecx), %eax
233 cmpl scanstart(%esp), %eax
234 jnz LookupLoop
235
236/* Store the current value of chainlen. */
237
238 movl %edx, chainlenwmask(%esp)
239
240/* Point %edi to the string under scrutiny, and %esi to the string we */
241/* are hoping to match it up with. In actuality, %esi and %edi are */
242/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */
243/* initialized to -(MAX_MATCH_8 - scanalign). */
244
245 movl window(%esp), %esi
246 movl scan(%esp), %edi
247 addl %ecx, %esi
248 movl scanalign(%esp), %eax
249 movl $(-MAX_MATCH_8), %edx
250 lea MAX_MATCH_8(%edi,%eax), %edi
251 lea MAX_MATCH_8(%esi,%eax), %esi
252
253/* Test the strings for equality, 8 bytes at a time. At the end,
254 * adjust %edx so that it is offset to the exact byte that mismatched.
255 *
256 * We already know at this point that the first three bytes of the
257 * strings match each other, and they can be safely passed over before
258 * starting the compare loop. So what this code does is skip over 0-3
259 * bytes, as much as necessary in order to dword-align the %edi
260 * pointer. (%esi will still be misaligned three times out of four.)
261 *
262 * It should be confessed that this loop usually does not represent
263 * much of the total running time. Replacing it with a more
264 * straightforward "rep cmpsb" would not drastically degrade
265 * performance.
266 */
267LoopCmps:
268 movl (%esi,%edx), %eax
269 xorl (%edi,%edx), %eax
270 jnz LeaveLoopCmps
271 movl 4(%esi,%edx), %eax
272 xorl 4(%edi,%edx), %eax
273 jnz LeaveLoopCmps4
274 addl $8, %edx
275 jnz LoopCmps
276 jmp LenMaximum
277LeaveLoopCmps4: addl $4, %edx
278LeaveLoopCmps: testl $0x0000FFFF, %eax
279 jnz LenLower
280 addl $2, %edx
281 shrl $16, %eax
282LenLower: subb $1, %al
283 adcl $0, %edx
284
285/* Calculate the length of the match. If it is longer than MAX_MATCH, */
286/* then automatically accept it as the best possible match and leave. */
287
288 lea (%edi,%edx), %eax
289 movl scan(%esp), %edi
290 subl %edi, %eax
291 cmpl $MAX_MATCH, %eax
292 jge LenMaximum
293
294/* If the length of the match is not longer than the best match we */
295/* have so far, then forget it and return to the lookup loop. */
296
297 movl deflatestate(%esp), %edx
298 movl bestlen(%esp), %ebx
299 cmpl %ebx, %eax
300 jg LongerMatch
301 movl windowbestlen(%esp), %esi
302 movl dsPrev(%edx), %edi
303 movl scanend(%esp), %ebx
304 movl chainlenwmask(%esp), %edx
305 jmp LookupLoop
306
307/* s->match_start = cur_match; */
308/* best_len = len; */
309/* if (len >= nice_match) break; */
310/* scan_end = *(ushf*)(scan+best_len-1); */
311
312LongerMatch: movl nicematch(%esp), %ebx
313 movl %eax, bestlen(%esp)
314 movl %ecx, dsMatchStart(%edx)
315 cmpl %ebx, %eax
316 jge LeaveNow
317 movl window(%esp), %esi
318 addl %eax, %esi
319 movl %esi, windowbestlen(%esp)
320 movzwl -1(%edi,%eax), %ebx
321 movl dsPrev(%edx), %edi
322 movl %ebx, scanend(%esp)
323 movl chainlenwmask(%esp), %edx
324 jmp LookupLoop
325
326/* Accept the current string, with the maximum possible length. */
327
328LenMaximum: movl deflatestate(%esp), %edx
329 movl $MAX_MATCH, bestlen(%esp)
330 movl %ecx, dsMatchStart(%edx)
331
332/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
333/* return s->lookahead; */
334
335LeaveNow:
336 movl deflatestate(%esp), %edx
337 movl bestlen(%esp), %ebx
338 movl dsLookahead(%edx), %eax
339 cmpl %eax, %ebx
340 jg LookaheadRet
341 movl %ebx, %eax
342LookaheadRet:
343
344/* Restore the stack and return from whence we came. */
345
346 addl $LocalVarsSize, %esp
347 .cfi_def_cfa_offset 20
348 popl %ebx
349 .cfi_def_cfa_offset 16
350 popl %esi
351 .cfi_def_cfa_offset 12
352 popl %edi
353 .cfi_def_cfa_offset 8
354 popl %ebp
355 .cfi_def_cfa_offset 4
356.cfi_endproc
357match_init: ret
diff --git a/contrib/inflate86/inffas86.c b/contrib/inflate86/inffas86.c
deleted file mode 100644
index 7292f67..0000000
--- a/contrib/inflate86/inffas86.c
+++ /dev/null
@@ -1,1157 +0,0 @@
1/* inffas86.c is a hand tuned assembler version of
2 *
3 * inffast.c -- fast decoding
4 * Copyright (C) 1995-2003 Mark Adler
5 * For conditions of distribution and use, see copyright notice in zlib.h
6 *
7 * Copyright (C) 2003 Chris Anderson <christop@charm.net>
8 * Please use the copyright conditions above.
9 *
10 * Dec-29-2003 -- I added AMD64 inflate asm support. This version is also
11 * slightly quicker on x86 systems because, instead of using rep movsb to copy
12 * data, it uses rep movsw, which moves data in 2-byte chunks instead of single
13 * bytes. I've tested the AMD64 code on a Fedora Core 1 + the x86_64 updates
14 * from http://fedora.linux.duke.edu/fc1_x86_64
15 * which is running on an Athlon 64 3000+ / Gigabyte GA-K8VT800M system with
16 * 1GB ram. The 64-bit version is about 4% faster than the 32-bit version,
17 * when decompressing mozilla-source-1.3.tar.gz.
18 *
19 * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from
20 * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at
21 * the moment. I have successfully compiled and tested this code with gcc2.96,
22 * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S
23 * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX
24 * enabled. I will attempt to merge the MMX code into this version. Newer
25 * versions of this and inffast.S can be found at
26 * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/
27 */
28
29#include "zutil.h"
30#include "inftrees.h"
31#include "inflate.h"
32#include "inffast.h"
33
34/* Mark Adler's comments from inffast.c: */
35
36/*
37 Decode literal, length, and distance codes and write out the resulting
38 literal and match bytes until either not enough input or output is
39 available, an end-of-block is encountered, or a data error is encountered.
40 When large enough input and output buffers are supplied to inflate(), for
41 example, a 16K input buffer and a 64K output buffer, more than 95% of the
42 inflate execution time is spent in this routine.
43
44 Entry assumptions:
45
46 state->mode == LEN
47 strm->avail_in >= 6
48 strm->avail_out >= 258
49 start >= strm->avail_out
50 state->bits < 8
51
52 On return, state->mode is one of:
53
54 LEN -- ran out of enough output space or enough available input
55 TYPE -- reached end of block code, inflate() to interpret next block
56 BAD -- error in block data
57
58 Notes:
59
60 - The maximum input bits used by a length/distance pair is 15 bits for the
61 length code, 5 bits for the length extra, 15 bits for the distance code,
62 and 13 bits for the distance extra. This totals 48 bits, or six bytes.
63 Therefore if strm->avail_in >= 6, then there is enough input to avoid
64 checking for available input while decoding.
65
66 - The maximum bytes that a single length/distance pair can output is 258
67 bytes, which is the maximum length that can be coded. inflate_fast()
68 requires strm->avail_out >= 258 for each loop to avoid checking for
69 output space.
70 */
71void inflate_fast(strm, start)
72z_streamp strm;
73unsigned start; /* inflate()'s starting value for strm->avail_out */
74{
75 struct inflate_state FAR *state;
76 struct inffast_ar {
77/* 64 32 x86 x86_64 */
78/* ar offset register */
79/* 0 0 */ void *esp; /* esp save */
80/* 8 4 */ void *ebp; /* ebp save */
81/* 16 8 */ unsigned char FAR *in; /* esi rsi local strm->next_in */
82/* 24 12 */ unsigned char FAR *last; /* r9 while in < last */
83/* 32 16 */ unsigned char FAR *out; /* edi rdi local strm->next_out */
84/* 40 20 */ unsigned char FAR *beg; /* inflate()'s init next_out */
85/* 48 24 */ unsigned char FAR *end; /* r10 while out < end */
86/* 56 28 */ unsigned char FAR *window;/* size of window, wsize!=0 */
87/* 64 32 */ code const FAR *lcode; /* ebp rbp local strm->lencode */
88/* 72 36 */ code const FAR *dcode; /* r11 local strm->distcode */
89/* 80 40 */ unsigned long hold; /* edx rdx local strm->hold */
90/* 88 44 */ unsigned bits; /* ebx rbx local strm->bits */
91/* 92 48 */ unsigned wsize; /* window size */
92/* 96 52 */ unsigned write; /* window write index */
93/*100 56 */ unsigned lmask; /* r12 mask for lcode */
94/*104 60 */ unsigned dmask; /* r13 mask for dcode */
95/*108 64 */ unsigned len; /* r14 match length */
96/*112 68 */ unsigned dist; /* r15 match distance */
97/*116 72 */ unsigned status; /* set when state chng*/
98 } ar;
99
100#if defined( __GNUC__ ) && defined( __amd64__ ) && ! defined( __i386 )
101#define PAD_AVAIL_IN 6
102#define PAD_AVAIL_OUT 258
103#else
104#define PAD_AVAIL_IN 5
105#define PAD_AVAIL_OUT 257
106#endif
107
108 /* copy state to local variables */
109 state = (struct inflate_state FAR *)strm->state;
110 ar.in = strm->next_in;
111 ar.last = ar.in + (strm->avail_in - PAD_AVAIL_IN);
112 ar.out = strm->next_out;
113 ar.beg = ar.out - (start - strm->avail_out);
114 ar.end = ar.out + (strm->avail_out - PAD_AVAIL_OUT);
115 ar.wsize = state->wsize;
116 ar.write = state->wnext;
117 ar.window = state->window;
118 ar.hold = state->hold;
119 ar.bits = state->bits;
120 ar.lcode = state->lencode;
121 ar.dcode = state->distcode;
122 ar.lmask = (1U << state->lenbits) - 1;
123 ar.dmask = (1U << state->distbits) - 1;
124
125 /* decode literals and length/distances until end-of-block or not enough
126 input data or output space */
127
128 /* align in on 1/2 hold size boundary */
129 while (((unsigned long)(void *)ar.in & (sizeof(ar.hold) / 2 - 1)) != 0) {
130 ar.hold += (unsigned long)*ar.in++ << ar.bits;
131 ar.bits += 8;
132 }
133
134#if defined( __GNUC__ ) && defined( __amd64__ ) && ! defined( __i386 )
135 __asm__ __volatile__ (
136" leaq %0, %%rax\n"
137" movq %%rbp, 8(%%rax)\n" /* save regs rbp and rsp */
138" movq %%rsp, (%%rax)\n"
139" movq %%rax, %%rsp\n" /* make rsp point to &ar */
140" movq 16(%%rsp), %%rsi\n" /* rsi = in */
141" movq 32(%%rsp), %%rdi\n" /* rdi = out */
142" movq 24(%%rsp), %%r9\n" /* r9 = last */
143" movq 48(%%rsp), %%r10\n" /* r10 = end */
144" movq 64(%%rsp), %%rbp\n" /* rbp = lcode */
145" movq 72(%%rsp), %%r11\n" /* r11 = dcode */
146" movq 80(%%rsp), %%rdx\n" /* rdx = hold */
147" movl 88(%%rsp), %%ebx\n" /* ebx = bits */
148" movl 100(%%rsp), %%r12d\n" /* r12d = lmask */
149" movl 104(%%rsp), %%r13d\n" /* r13d = dmask */
150 /* r14d = len */
151 /* r15d = dist */
152" cld\n"
153" cmpq %%rdi, %%r10\n"
154" je .L_one_time\n" /* if only one decode left */
155" cmpq %%rsi, %%r9\n"
156" je .L_one_time\n"
157" jmp .L_do_loop\n"
158
159".L_one_time:\n"
160" movq %%r12, %%r8\n" /* r8 = lmask */
161" cmpb $32, %%bl\n"
162" ja .L_get_length_code_one_time\n"
163
164" lodsl\n" /* eax = *(uint *)in++ */
165" movb %%bl, %%cl\n" /* cl = bits, needs it for shifting */
166" addb $32, %%bl\n" /* bits += 32 */
167" shlq %%cl, %%rax\n"
168" orq %%rax, %%rdx\n" /* hold |= *((uint *)in)++ << bits */
169" jmp .L_get_length_code_one_time\n"
170
171".align 32,0x90\n"
172".L_while_test:\n"
173" cmpq %%rdi, %%r10\n"
174" jbe .L_break_loop\n"
175" cmpq %%rsi, %%r9\n"
176" jbe .L_break_loop\n"
177
178".L_do_loop:\n"
179" movq %%r12, %%r8\n" /* r8 = lmask */
180" cmpb $32, %%bl\n"
181" ja .L_get_length_code\n" /* if (32 < bits) */
182
183" lodsl\n" /* eax = *(uint *)in++ */
184" movb %%bl, %%cl\n" /* cl = bits, needs it for shifting */
185" addb $32, %%bl\n" /* bits += 32 */
186" shlq %%cl, %%rax\n"
187" orq %%rax, %%rdx\n" /* hold |= *((uint *)in)++ << bits */
188
189".L_get_length_code:\n"
190" andq %%rdx, %%r8\n" /* r8 &= hold */
191" movl (%%rbp,%%r8,4), %%eax\n" /* eax = lcode[hold & lmask] */
192
193" movb %%ah, %%cl\n" /* cl = this.bits */
194" subb %%ah, %%bl\n" /* bits -= this.bits */
195" shrq %%cl, %%rdx\n" /* hold >>= this.bits */
196
197" testb %%al, %%al\n"
198" jnz .L_test_for_length_base\n" /* if (op != 0) 45.7% */
199
200" movq %%r12, %%r8\n" /* r8 = lmask */
201" shrl $16, %%eax\n" /* output this.val char */
202" stosb\n"
203
204".L_get_length_code_one_time:\n"
205" andq %%rdx, %%r8\n" /* r8 &= hold */
206" movl (%%rbp,%%r8,4), %%eax\n" /* eax = lcode[hold & lmask] */
207
208".L_dolen:\n"
209" movb %%ah, %%cl\n" /* cl = this.bits */
210" subb %%ah, %%bl\n" /* bits -= this.bits */
211" shrq %%cl, %%rdx\n" /* hold >>= this.bits */
212
213" testb %%al, %%al\n"
214" jnz .L_test_for_length_base\n" /* if (op != 0) 45.7% */
215
216" shrl $16, %%eax\n" /* output this.val char */
217" stosb\n"
218" jmp .L_while_test\n"
219
220".align 32,0x90\n"
221".L_test_for_length_base:\n"
222" movl %%eax, %%r14d\n" /* len = this */
223" shrl $16, %%r14d\n" /* len = this.val */
224" movb %%al, %%cl\n"
225
226" testb $16, %%al\n"
227" jz .L_test_for_second_level_length\n" /* if ((op & 16) == 0) 8% */
228" andb $15, %%cl\n" /* op &= 15 */
229" jz .L_decode_distance\n" /* if (!op) */
230
231".L_add_bits_to_len:\n"
232" subb %%cl, %%bl\n"
233" xorl %%eax, %%eax\n"
234" incl %%eax\n"
235" shll %%cl, %%eax\n"
236" decl %%eax\n"
237" andl %%edx, %%eax\n" /* eax &= hold */
238" shrq %%cl, %%rdx\n"
239" addl %%eax, %%r14d\n" /* len += hold & mask[op] */
240
241".L_decode_distance:\n"
242" movq %%r13, %%r8\n" /* r8 = dmask */
243" cmpb $32, %%bl\n"
244" ja .L_get_distance_code\n" /* if (32 < bits) */
245
246" lodsl\n" /* eax = *(uint *)in++ */
247" movb %%bl, %%cl\n" /* cl = bits, needs it for shifting */
248" addb $32, %%bl\n" /* bits += 32 */
249" shlq %%cl, %%rax\n"
250" orq %%rax, %%rdx\n" /* hold |= *((uint *)in)++ << bits */
251
252".L_get_distance_code:\n"
253" andq %%rdx, %%r8\n" /* r8 &= hold */
254" movl (%%r11,%%r8,4), %%eax\n" /* eax = dcode[hold & dmask] */
255
256".L_dodist:\n"
257" movl %%eax, %%r15d\n" /* dist = this */
258" shrl $16, %%r15d\n" /* dist = this.val */
259" movb %%ah, %%cl\n"
260" subb %%ah, %%bl\n" /* bits -= this.bits */
261" shrq %%cl, %%rdx\n" /* hold >>= this.bits */
262" movb %%al, %%cl\n" /* cl = this.op */
263
264" testb $16, %%al\n" /* if ((op & 16) == 0) */
265" jz .L_test_for_second_level_dist\n"
266" andb $15, %%cl\n" /* op &= 15 */
267" jz .L_check_dist_one\n"
268
269".L_add_bits_to_dist:\n"
270" subb %%cl, %%bl\n"
271" xorl %%eax, %%eax\n"
272" incl %%eax\n"
273" shll %%cl, %%eax\n"
274" decl %%eax\n" /* (1 << op) - 1 */
275" andl %%edx, %%eax\n" /* eax &= hold */
276" shrq %%cl, %%rdx\n"
277" addl %%eax, %%r15d\n" /* dist += hold & ((1 << op) - 1) */
278
279".L_check_window:\n"
280" movq %%rsi, %%r8\n" /* save in so from can use it's reg */
281" movq %%rdi, %%rax\n"
282" subq 40(%%rsp), %%rax\n" /* nbytes = out - beg */
283
284" cmpl %%r15d, %%eax\n"
285" jb .L_clip_window\n" /* if (dist > nbytes) 4.2% */
286
287" movl %%r14d, %%ecx\n" /* ecx = len */
288" movq %%rdi, %%rsi\n"
289" subq %%r15, %%rsi\n" /* from = out - dist */
290
291" sarl %%ecx\n"
292" jnc .L_copy_two\n" /* if len % 2 == 0 */
293
294" rep movsw\n"
295" movb (%%rsi), %%al\n"
296" movb %%al, (%%rdi)\n"
297" incq %%rdi\n"
298
299" movq %%r8, %%rsi\n" /* move in back to %rsi, toss from */
300" jmp .L_while_test\n"
301
302".L_copy_two:\n"
303" rep movsw\n"
304" movq %%r8, %%rsi\n" /* move in back to %rsi, toss from */
305" jmp .L_while_test\n"
306
307".align 32,0x90\n"
308".L_check_dist_one:\n"
309" cmpl $1, %%r15d\n" /* if dist 1, is a memset */
310" jne .L_check_window\n"
311" cmpq %%rdi, 40(%%rsp)\n" /* if out == beg, outside window */
312" je .L_check_window\n"
313
314" movl %%r14d, %%ecx\n" /* ecx = len */
315" movb -1(%%rdi), %%al\n"
316" movb %%al, %%ah\n"
317
318" sarl %%ecx\n"
319" jnc .L_set_two\n"
320" movb %%al, (%%rdi)\n"
321" incq %%rdi\n"
322
323".L_set_two:\n"
324" rep stosw\n"
325" jmp .L_while_test\n"
326
327".align 32,0x90\n"
328".L_test_for_second_level_length:\n"
329" testb $64, %%al\n"
330" jnz .L_test_for_end_of_block\n" /* if ((op & 64) != 0) */
331
332" xorl %%eax, %%eax\n"
333" incl %%eax\n"
334" shll %%cl, %%eax\n"
335" decl %%eax\n"
336" andl %%edx, %%eax\n" /* eax &= hold */
337" addl %%r14d, %%eax\n" /* eax += len */
338" movl (%%rbp,%%rax,4), %%eax\n" /* eax = lcode[val+(hold&mask[op])]*/
339" jmp .L_dolen\n"
340
341".align 32,0x90\n"
342".L_test_for_second_level_dist:\n"
343" testb $64, %%al\n"
344" jnz .L_invalid_distance_code\n" /* if ((op & 64) != 0) */
345
346" xorl %%eax, %%eax\n"
347" incl %%eax\n"
348" shll %%cl, %%eax\n"
349" decl %%eax\n"
350" andl %%edx, %%eax\n" /* eax &= hold */
351" addl %%r15d, %%eax\n" /* eax += dist */
352" movl (%%r11,%%rax,4), %%eax\n" /* eax = dcode[val+(hold&mask[op])]*/
353" jmp .L_dodist\n"
354
355".align 32,0x90\n"
356".L_clip_window:\n"
357" movl %%eax, %%ecx\n" /* ecx = nbytes */
358" movl 92(%%rsp), %%eax\n" /* eax = wsize, prepare for dist cmp */
359" negl %%ecx\n" /* nbytes = -nbytes */
360
361" cmpl %%r15d, %%eax\n"
362" jb .L_invalid_distance_too_far\n" /* if (dist > wsize) */
363
364" addl %%r15d, %%ecx\n" /* nbytes = dist - nbytes */
365" cmpl $0, 96(%%rsp)\n"
366" jne .L_wrap_around_window\n" /* if (write != 0) */
367
368" movq 56(%%rsp), %%rsi\n" /* from = window */
369" subl %%ecx, %%eax\n" /* eax -= nbytes */
370" addq %%rax, %%rsi\n" /* from += wsize - nbytes */
371
372" movl %%r14d, %%eax\n" /* eax = len */
373" cmpl %%ecx, %%r14d\n"
374" jbe .L_do_copy\n" /* if (nbytes >= len) */
375
376" subl %%ecx, %%eax\n" /* eax -= nbytes */
377" rep movsb\n"
378" movq %%rdi, %%rsi\n"
379" subq %%r15, %%rsi\n" /* from = &out[ -dist ] */
380" jmp .L_do_copy\n"
381
382".align 32,0x90\n"
383".L_wrap_around_window:\n"
384" movl 96(%%rsp), %%eax\n" /* eax = write */
385" cmpl %%eax, %%ecx\n"
386" jbe .L_contiguous_in_window\n" /* if (write >= nbytes) */
387
388" movl 92(%%rsp), %%esi\n" /* from = wsize */
389" addq 56(%%rsp), %%rsi\n" /* from += window */
390" addq %%rax, %%rsi\n" /* from += write */
391" subq %%rcx, %%rsi\n" /* from -= nbytes */
392" subl %%eax, %%ecx\n" /* nbytes -= write */
393
394" movl %%r14d, %%eax\n" /* eax = len */
395" cmpl %%ecx, %%eax\n"
396" jbe .L_do_copy\n" /* if (nbytes >= len) */
397
398" subl %%ecx, %%eax\n" /* len -= nbytes */
399" rep movsb\n"
400" movq 56(%%rsp), %%rsi\n" /* from = window */
401" movl 96(%%rsp), %%ecx\n" /* nbytes = write */
402" cmpl %%ecx, %%eax\n"
403" jbe .L_do_copy\n" /* if (nbytes >= len) */
404
405" subl %%ecx, %%eax\n" /* len -= nbytes */
406" rep movsb\n"
407" movq %%rdi, %%rsi\n"
408" subq %%r15, %%rsi\n" /* from = out - dist */
409" jmp .L_do_copy\n"
410
411".align 32,0x90\n"
412".L_contiguous_in_window:\n"
413" movq 56(%%rsp), %%rsi\n" /* rsi = window */
414" addq %%rax, %%rsi\n"
415" subq %%rcx, %%rsi\n" /* from += write - nbytes */
416
417" movl %%r14d, %%eax\n" /* eax = len */
418" cmpl %%ecx, %%eax\n"
419" jbe .L_do_copy\n" /* if (nbytes >= len) */
420
421" subl %%ecx, %%eax\n" /* len -= nbytes */
422" rep movsb\n"
423" movq %%rdi, %%rsi\n"
424" subq %%r15, %%rsi\n" /* from = out - dist */
425" jmp .L_do_copy\n" /* if (nbytes >= len) */
426
427".align 32,0x90\n"
428".L_do_copy:\n"
429" movl %%eax, %%ecx\n" /* ecx = len */
430" rep movsb\n"
431
432" movq %%r8, %%rsi\n" /* move in back to %esi, toss from */
433" jmp .L_while_test\n"
434
435".L_test_for_end_of_block:\n"
436" testb $32, %%al\n"
437" jz .L_invalid_literal_length_code\n"
438" movl $1, 116(%%rsp)\n"
439" jmp .L_break_loop_with_status\n"
440
441".L_invalid_literal_length_code:\n"
442" movl $2, 116(%%rsp)\n"
443" jmp .L_break_loop_with_status\n"
444
445".L_invalid_distance_code:\n"
446" movl $3, 116(%%rsp)\n"
447" jmp .L_break_loop_with_status\n"
448
449".L_invalid_distance_too_far:\n"
450" movl $4, 116(%%rsp)\n"
451" jmp .L_break_loop_with_status\n"
452
453".L_break_loop:\n"
454" movl $0, 116(%%rsp)\n"
455
456".L_break_loop_with_status:\n"
457/* put in, out, bits, and hold back into ar and pop esp */
458" movq %%rsi, 16(%%rsp)\n" /* in */
459" movq %%rdi, 32(%%rsp)\n" /* out */
460" movl %%ebx, 88(%%rsp)\n" /* bits */
461" movq %%rdx, 80(%%rsp)\n" /* hold */
462" movq (%%rsp), %%rax\n" /* restore rbp and rsp */
463" movq 8(%%rsp), %%rbp\n"
464" movq %%rax, %%rsp\n"
465 :
466 : "m" (ar)
467 : "memory", "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi",
468 "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
469 );
470#elif ( defined( __GNUC__ ) || defined( __ICC ) ) && defined( __i386 )
471 __asm__ __volatile__ (
472" leal %0, %%eax\n"
473" movl %%esp, (%%eax)\n" /* save esp, ebp */
474" movl %%ebp, 4(%%eax)\n"
475" movl %%eax, %%esp\n"
476" movl 8(%%esp), %%esi\n" /* esi = in */
477" movl 16(%%esp), %%edi\n" /* edi = out */
478" movl 40(%%esp), %%edx\n" /* edx = hold */
479" movl 44(%%esp), %%ebx\n" /* ebx = bits */
480" movl 32(%%esp), %%ebp\n" /* ebp = lcode */
481
482" cld\n"
483" jmp .L_do_loop\n"
484
485".align 32,0x90\n"
486".L_while_test:\n"
487" cmpl %%edi, 24(%%esp)\n" /* out < end */
488" jbe .L_break_loop\n"
489" cmpl %%esi, 12(%%esp)\n" /* in < last */
490" jbe .L_break_loop\n"
491
492".L_do_loop:\n"
493" cmpb $15, %%bl\n"
494" ja .L_get_length_code\n" /* if (15 < bits) */
495
496" xorl %%eax, %%eax\n"
497" lodsw\n" /* al = *(ushort *)in++ */
498" movb %%bl, %%cl\n" /* cl = bits, needs it for shifting */
499" addb $16, %%bl\n" /* bits += 16 */
500" shll %%cl, %%eax\n"
501" orl %%eax, %%edx\n" /* hold |= *((ushort *)in)++ << bits */
502
503".L_get_length_code:\n"
504" movl 56(%%esp), %%eax\n" /* eax = lmask */
505" andl %%edx, %%eax\n" /* eax &= hold */
506" movl (%%ebp,%%eax,4), %%eax\n" /* eax = lcode[hold & lmask] */
507
508".L_dolen:\n"
509" movb %%ah, %%cl\n" /* cl = this.bits */
510" subb %%ah, %%bl\n" /* bits -= this.bits */
511" shrl %%cl, %%edx\n" /* hold >>= this.bits */
512
513" testb %%al, %%al\n"
514" jnz .L_test_for_length_base\n" /* if (op != 0) 45.7% */
515
516" shrl $16, %%eax\n" /* output this.val char */
517" stosb\n"
518" jmp .L_while_test\n"
519
520".align 32,0x90\n"
521".L_test_for_length_base:\n"
522" movl %%eax, %%ecx\n" /* len = this */
523" shrl $16, %%ecx\n" /* len = this.val */
524" movl %%ecx, 64(%%esp)\n" /* save len */
525" movb %%al, %%cl\n"
526
527" testb $16, %%al\n"
528" jz .L_test_for_second_level_length\n" /* if ((op & 16) == 0) 8% */
529" andb $15, %%cl\n" /* op &= 15 */
530" jz .L_decode_distance\n" /* if (!op) */
531" cmpb %%cl, %%bl\n"
532" jae .L_add_bits_to_len\n" /* if (op <= bits) */
533
534" movb %%cl, %%ch\n" /* stash op in ch, freeing cl */
535" xorl %%eax, %%eax\n"
536" lodsw\n" /* al = *(ushort *)in++ */
537" movb %%bl, %%cl\n" /* cl = bits, needs it for shifting */
538" addb $16, %%bl\n" /* bits += 16 */
539" shll %%cl, %%eax\n"
540" orl %%eax, %%edx\n" /* hold |= *((ushort *)in)++ << bits */
541" movb %%ch, %%cl\n" /* move op back to ecx */
542
543".L_add_bits_to_len:\n"
544" subb %%cl, %%bl\n"
545" xorl %%eax, %%eax\n"
546" incl %%eax\n"
547" shll %%cl, %%eax\n"
548" decl %%eax\n"
549" andl %%edx, %%eax\n" /* eax &= hold */
550" shrl %%cl, %%edx\n"
551" addl %%eax, 64(%%esp)\n" /* len += hold & mask[op] */
552
553".L_decode_distance:\n"
554" cmpb $15, %%bl\n"
555" ja .L_get_distance_code\n" /* if (15 < bits) */
556
557" xorl %%eax, %%eax\n"
558" lodsw\n" /* al = *(ushort *)in++ */
559" movb %%bl, %%cl\n" /* cl = bits, needs it for shifting */
560" addb $16, %%bl\n" /* bits += 16 */
561" shll %%cl, %%eax\n"
562" orl %%eax, %%edx\n" /* hold |= *((ushort *)in)++ << bits */
563
564".L_get_distance_code:\n"
565" movl 60(%%esp), %%eax\n" /* eax = dmask */
566" movl 36(%%esp), %%ecx\n" /* ecx = dcode */
567" andl %%edx, %%eax\n" /* eax &= hold */
568" movl (%%ecx,%%eax,4), %%eax\n"/* eax = dcode[hold & dmask] */
569
570".L_dodist:\n"
571" movl %%eax, %%ebp\n" /* dist = this */
572" shrl $16, %%ebp\n" /* dist = this.val */
573" movb %%ah, %%cl\n"
574" subb %%ah, %%bl\n" /* bits -= this.bits */
575" shrl %%cl, %%edx\n" /* hold >>= this.bits */
576" movb %%al, %%cl\n" /* cl = this.op */
577
578" testb $16, %%al\n" /* if ((op & 16) == 0) */
579" jz .L_test_for_second_level_dist\n"
580" andb $15, %%cl\n" /* op &= 15 */
581" jz .L_check_dist_one\n"
582" cmpb %%cl, %%bl\n"
583" jae .L_add_bits_to_dist\n" /* if (op <= bits) 97.6% */
584
585" movb %%cl, %%ch\n" /* stash op in ch, freeing cl */
586" xorl %%eax, %%eax\n"
587" lodsw\n" /* al = *(ushort *)in++ */
588" movb %%bl, %%cl\n" /* cl = bits, needs it for shifting */
589" addb $16, %%bl\n" /* bits += 16 */
590" shll %%cl, %%eax\n"
591" orl %%eax, %%edx\n" /* hold |= *((ushort *)in)++ << bits */
592" movb %%ch, %%cl\n" /* move op back to ecx */
593
594".L_add_bits_to_dist:\n"
595" subb %%cl, %%bl\n"
596" xorl %%eax, %%eax\n"
597" incl %%eax\n"
598" shll %%cl, %%eax\n"
599" decl %%eax\n" /* (1 << op) - 1 */
600" andl %%edx, %%eax\n" /* eax &= hold */
601" shrl %%cl, %%edx\n"
602" addl %%eax, %%ebp\n" /* dist += hold & ((1 << op) - 1) */
603
604".L_check_window:\n"
605" movl %%esi, 8(%%esp)\n" /* save in so from can use it's reg */
606" movl %%edi, %%eax\n"
607" subl 20(%%esp), %%eax\n" /* nbytes = out - beg */
608
609" cmpl %%ebp, %%eax\n"
610" jb .L_clip_window\n" /* if (dist > nbytes) 4.2% */
611
612" movl 64(%%esp), %%ecx\n" /* ecx = len */
613" movl %%edi, %%esi\n"
614" subl %%ebp, %%esi\n" /* from = out - dist */
615
616" sarl %%ecx\n"
617" jnc .L_copy_two\n" /* if len % 2 == 0 */
618
619" rep movsw\n"
620" movb (%%esi), %%al\n"
621" movb %%al, (%%edi)\n"
622" incl %%edi\n"
623
624" movl 8(%%esp), %%esi\n" /* move in back to %esi, toss from */
625" movl 32(%%esp), %%ebp\n" /* ebp = lcode */
626" jmp .L_while_test\n"
627
628".L_copy_two:\n"
629" rep movsw\n"
630" movl 8(%%esp), %%esi\n" /* move in back to %esi, toss from */
631" movl 32(%%esp), %%ebp\n" /* ebp = lcode */
632" jmp .L_while_test\n"
633
634".align 32,0x90\n"
635".L_check_dist_one:\n"
636" cmpl $1, %%ebp\n" /* if dist 1, is a memset */
637" jne .L_check_window\n"
638" cmpl %%edi, 20(%%esp)\n"
639" je .L_check_window\n" /* out == beg, if outside window */
640
641" movl 64(%%esp), %%ecx\n" /* ecx = len */
642" movb -1(%%edi), %%al\n"
643" movb %%al, %%ah\n"
644
645" sarl %%ecx\n"
646" jnc .L_set_two\n"
647" movb %%al, (%%edi)\n"
648" incl %%edi\n"
649
650".L_set_two:\n"
651" rep stosw\n"
652" movl 32(%%esp), %%ebp\n" /* ebp = lcode */
653" jmp .L_while_test\n"
654
655".align 32,0x90\n"
656".L_test_for_second_level_length:\n"
657" testb $64, %%al\n"
658" jnz .L_test_for_end_of_block\n" /* if ((op & 64) != 0) */
659
660" xorl %%eax, %%eax\n"
661" incl %%eax\n"
662" shll %%cl, %%eax\n"
663" decl %%eax\n"
664" andl %%edx, %%eax\n" /* eax &= hold */
665" addl 64(%%esp), %%eax\n" /* eax += len */
666" movl (%%ebp,%%eax,4), %%eax\n" /* eax = lcode[val+(hold&mask[op])]*/
667" jmp .L_dolen\n"
668
669".align 32,0x90\n"
670".L_test_for_second_level_dist:\n"
671" testb $64, %%al\n"
672" jnz .L_invalid_distance_code\n" /* if ((op & 64) != 0) */
673
674" xorl %%eax, %%eax\n"
675" incl %%eax\n"
676" shll %%cl, %%eax\n"
677" decl %%eax\n"
678" andl %%edx, %%eax\n" /* eax &= hold */
679" addl %%ebp, %%eax\n" /* eax += dist */
680" movl 36(%%esp), %%ecx\n" /* ecx = dcode */
681" movl (%%ecx,%%eax,4), %%eax\n" /* eax = dcode[val+(hold&mask[op])]*/
682" jmp .L_dodist\n"
683
684".align 32,0x90\n"
685".L_clip_window:\n"
686" movl %%eax, %%ecx\n"
687" movl 48(%%esp), %%eax\n" /* eax = wsize */
688" negl %%ecx\n" /* nbytes = -nbytes */
689" movl 28(%%esp), %%esi\n" /* from = window */
690
691" cmpl %%ebp, %%eax\n"
692" jb .L_invalid_distance_too_far\n" /* if (dist > wsize) */
693
694" addl %%ebp, %%ecx\n" /* nbytes = dist - nbytes */
695" cmpl $0, 52(%%esp)\n"
696" jne .L_wrap_around_window\n" /* if (write != 0) */
697
698" subl %%ecx, %%eax\n"
699" addl %%eax, %%esi\n" /* from += wsize - nbytes */
700
701" movl 64(%%esp), %%eax\n" /* eax = len */
702" cmpl %%ecx, %%eax\n"
703" jbe .L_do_copy\n" /* if (nbytes >= len) */
704
705" subl %%ecx, %%eax\n" /* len -= nbytes */
706" rep movsb\n"
707" movl %%edi, %%esi\n"
708" subl %%ebp, %%esi\n" /* from = out - dist */
709" jmp .L_do_copy\n"
710
711".align 32,0x90\n"
712".L_wrap_around_window:\n"
713" movl 52(%%esp), %%eax\n" /* eax = write */
714" cmpl %%eax, %%ecx\n"
715" jbe .L_contiguous_in_window\n" /* if (write >= nbytes) */
716
717" addl 48(%%esp), %%esi\n" /* from += wsize */
718" addl %%eax, %%esi\n" /* from += write */
719" subl %%ecx, %%esi\n" /* from -= nbytes */
720" subl %%eax, %%ecx\n" /* nbytes -= write */
721
722" movl 64(%%esp), %%eax\n" /* eax = len */
723" cmpl %%ecx, %%eax\n"
724" jbe .L_do_copy\n" /* if (nbytes >= len) */
725
726" subl %%ecx, %%eax\n" /* len -= nbytes */
727" rep movsb\n"
728" movl 28(%%esp), %%esi\n" /* from = window */
729" movl 52(%%esp), %%ecx\n" /* nbytes = write */
730" cmpl %%ecx, %%eax\n"
731" jbe .L_do_copy\n" /* if (nbytes >= len) */
732
733" subl %%ecx, %%eax\n" /* len -= nbytes */
734" rep movsb\n"
735" movl %%edi, %%esi\n"
736" subl %%ebp, %%esi\n" /* from = out - dist */
737" jmp .L_do_copy\n"
738
739".align 32,0x90\n"
740".L_contiguous_in_window:\n"
741" addl %%eax, %%esi\n"
742" subl %%ecx, %%esi\n" /* from += write - nbytes */
743
744" movl 64(%%esp), %%eax\n" /* eax = len */
745" cmpl %%ecx, %%eax\n"
746" jbe .L_do_copy\n" /* if (nbytes >= len) */
747
748" subl %%ecx, %%eax\n" /* len -= nbytes */
749" rep movsb\n"
750" movl %%edi, %%esi\n"
751" subl %%ebp, %%esi\n" /* from = out - dist */
752" jmp .L_do_copy\n" /* if (nbytes >= len) */
753
754".align 32,0x90\n"
755".L_do_copy:\n"
756" movl %%eax, %%ecx\n"
757" rep movsb\n"
758
759" movl 8(%%esp), %%esi\n" /* move in back to %esi, toss from */
760" movl 32(%%esp), %%ebp\n" /* ebp = lcode */
761" jmp .L_while_test\n"
762
763".L_test_for_end_of_block:\n"
764" testb $32, %%al\n"
765" jz .L_invalid_literal_length_code\n"
766" movl $1, 72(%%esp)\n"
767" jmp .L_break_loop_with_status\n"
768
769".L_invalid_literal_length_code:\n"
770" movl $2, 72(%%esp)\n"
771" jmp .L_break_loop_with_status\n"
772
773".L_invalid_distance_code:\n"
774" movl $3, 72(%%esp)\n"
775" jmp .L_break_loop_with_status\n"
776
777".L_invalid_distance_too_far:\n"
778" movl 8(%%esp), %%esi\n"
779" movl $4, 72(%%esp)\n"
780" jmp .L_break_loop_with_status\n"
781
782".L_break_loop:\n"
783" movl $0, 72(%%esp)\n"
784
785".L_break_loop_with_status:\n"
786/* put in, out, bits, and hold back into ar and pop esp */
787" movl %%esi, 8(%%esp)\n" /* save in */
788" movl %%edi, 16(%%esp)\n" /* save out */
789" movl %%ebx, 44(%%esp)\n" /* save bits */
790" movl %%edx, 40(%%esp)\n" /* save hold */
791" movl 4(%%esp), %%ebp\n" /* restore esp, ebp */
792" movl (%%esp), %%esp\n"
793 :
794 : "m" (ar)
795 : "memory", "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
796 );
797#elif defined( _MSC_VER ) && ! defined( _M_AMD64 )
798 __asm {
799 lea eax, ar
800 mov [eax], esp /* save esp, ebp */
801 mov [eax+4], ebp
802 mov esp, eax
803 mov esi, [esp+8] /* esi = in */
804 mov edi, [esp+16] /* edi = out */
805 mov edx, [esp+40] /* edx = hold */
806 mov ebx, [esp+44] /* ebx = bits */
807 mov ebp, [esp+32] /* ebp = lcode */
808
809 cld
810 jmp L_do_loop
811
812ALIGN 4
813L_while_test:
814 cmp [esp+24], edi
815 jbe L_break_loop
816 cmp [esp+12], esi
817 jbe L_break_loop
818
819L_do_loop:
820 cmp bl, 15
821 ja L_get_length_code /* if (15 < bits) */
822
823 xor eax, eax
824 lodsw /* al = *(ushort *)in++ */
825 mov cl, bl /* cl = bits, needs it for shifting */
826 add bl, 16 /* bits += 16 */
827 shl eax, cl
828 or edx, eax /* hold |= *((ushort *)in)++ << bits */
829
830L_get_length_code:
831 mov eax, [esp+56] /* eax = lmask */
832 and eax, edx /* eax &= hold */
833 mov eax, [ebp+eax*4] /* eax = lcode[hold & lmask] */
834
835L_dolen:
836 mov cl, ah /* cl = this.bits */
837 sub bl, ah /* bits -= this.bits */
838 shr edx, cl /* hold >>= this.bits */
839
840 test al, al
841 jnz L_test_for_length_base /* if (op != 0) 45.7% */
842
843 shr eax, 16 /* output this.val char */
844 stosb
845 jmp L_while_test
846
847ALIGN 4
848L_test_for_length_base:
849 mov ecx, eax /* len = this */
850 shr ecx, 16 /* len = this.val */
851 mov [esp+64], ecx /* save len */
852 mov cl, al
853
854 test al, 16
855 jz L_test_for_second_level_length /* if ((op & 16) == 0) 8% */
856 and cl, 15 /* op &= 15 */
857 jz L_decode_distance /* if (!op) */
858 cmp bl, cl
859 jae L_add_bits_to_len /* if (op <= bits) */
860
861 mov ch, cl /* stash op in ch, freeing cl */
862 xor eax, eax
863 lodsw /* al = *(ushort *)in++ */
864 mov cl, bl /* cl = bits, needs it for shifting */
865 add bl, 16 /* bits += 16 */
866 shl eax, cl
867 or edx, eax /* hold |= *((ushort *)in)++ << bits */
868 mov cl, ch /* move op back to ecx */
869
870L_add_bits_to_len:
871 sub bl, cl
872 xor eax, eax
873 inc eax
874 shl eax, cl
875 dec eax
876 and eax, edx /* eax &= hold */
877 shr edx, cl
878 add [esp+64], eax /* len += hold & mask[op] */
879
880L_decode_distance:
881 cmp bl, 15
882 ja L_get_distance_code /* if (15 < bits) */
883
884 xor eax, eax
885 lodsw /* al = *(ushort *)in++ */
886 mov cl, bl /* cl = bits, needs it for shifting */
887 add bl, 16 /* bits += 16 */
888 shl eax, cl
889 or edx, eax /* hold |= *((ushort *)in)++ << bits */
890
891L_get_distance_code:
892 mov eax, [esp+60] /* eax = dmask */
893 mov ecx, [esp+36] /* ecx = dcode */
894 and eax, edx /* eax &= hold */
895 mov eax, [ecx+eax*4]/* eax = dcode[hold & dmask] */
896
897L_dodist:
898 mov ebp, eax /* dist = this */
899 shr ebp, 16 /* dist = this.val */
900 mov cl, ah
901 sub bl, ah /* bits -= this.bits */
902 shr edx, cl /* hold >>= this.bits */
903 mov cl, al /* cl = this.op */
904
905 test al, 16 /* if ((op & 16) == 0) */
906 jz L_test_for_second_level_dist
907 and cl, 15 /* op &= 15 */
908 jz L_check_dist_one
909 cmp bl, cl
910 jae L_add_bits_to_dist /* if (op <= bits) 97.6% */
911
912 mov ch, cl /* stash op in ch, freeing cl */
913 xor eax, eax
914 lodsw /* al = *(ushort *)in++ */
915 mov cl, bl /* cl = bits, needs it for shifting */
916 add bl, 16 /* bits += 16 */
917 shl eax, cl
918 or edx, eax /* hold |= *((ushort *)in)++ << bits */
919 mov cl, ch /* move op back to ecx */
920
921L_add_bits_to_dist:
922 sub bl, cl
923 xor eax, eax
924 inc eax
925 shl eax, cl
926 dec eax /* (1 << op) - 1 */
927 and eax, edx /* eax &= hold */
928 shr edx, cl
929 add ebp, eax /* dist += hold & ((1 << op) - 1) */
930
931L_check_window:
932 mov [esp+8], esi /* save in so from can use its reg */
933 mov eax, edi
934 sub eax, [esp+20] /* nbytes = out - beg */
935
936 cmp eax, ebp
937 jb L_clip_window /* if (dist > nbytes) 4.2% */
938
939 mov ecx, [esp+64] /* ecx = len */
940 mov esi, edi
941 sub esi, ebp /* from = out - dist */
942
943 sar ecx, 1
944 jnc L_copy_two
945
946 rep movsw
947 mov al, [esi]
948 mov [edi], al
949 inc edi
950
951 mov esi, [esp+8] /* move in back to %esi, toss from */
952 mov ebp, [esp+32] /* ebp = lcode */
953 jmp L_while_test
954
955L_copy_two:
956 rep movsw
957 mov esi, [esp+8] /* move in back to %esi, toss from */
958 mov ebp, [esp+32] /* ebp = lcode */
959 jmp L_while_test
960
961ALIGN 4
962L_check_dist_one:
963 cmp ebp, 1 /* if dist 1, is a memset */
964 jne L_check_window
965 cmp [esp+20], edi
966 je L_check_window /* out == beg, if outside window */
967
968 mov ecx, [esp+64] /* ecx = len */
969 mov al, [edi-1]
970 mov ah, al
971
972 sar ecx, 1
973 jnc L_set_two
974 mov [edi], al /* memset out with from[-1] */
975 inc edi
976
977L_set_two:
978 rep stosw
979 mov ebp, [esp+32] /* ebp = lcode */
980 jmp L_while_test
981
982ALIGN 4
983L_test_for_second_level_length:
984 test al, 64
985 jnz L_test_for_end_of_block /* if ((op & 64) != 0) */
986
987 xor eax, eax
988 inc eax
989 shl eax, cl
990 dec eax
991 and eax, edx /* eax &= hold */
992 add eax, [esp+64] /* eax += len */
993 mov eax, [ebp+eax*4] /* eax = lcode[val+(hold&mask[op])]*/
994 jmp L_dolen
995
996ALIGN 4
997L_test_for_second_level_dist:
998 test al, 64
999 jnz L_invalid_distance_code /* if ((op & 64) != 0) */
1000
1001 xor eax, eax
1002 inc eax
1003 shl eax, cl
1004 dec eax
1005 and eax, edx /* eax &= hold */
1006 add eax, ebp /* eax += dist */
1007 mov ecx, [esp+36] /* ecx = dcode */
1008 mov eax, [ecx+eax*4] /* eax = dcode[val+(hold&mask[op])]*/
1009 jmp L_dodist
1010
1011ALIGN 4
1012L_clip_window:
1013 mov ecx, eax
1014 mov eax, [esp+48] /* eax = wsize */
1015 neg ecx /* nbytes = -nbytes */
1016 mov esi, [esp+28] /* from = window */
1017
1018 cmp eax, ebp
1019 jb L_invalid_distance_too_far /* if (dist > wsize) */
1020
1021 add ecx, ebp /* nbytes = dist - nbytes */
1022 cmp dword ptr [esp+52], 0
1023 jne L_wrap_around_window /* if (write != 0) */
1024
1025 sub eax, ecx
1026 add esi, eax /* from += wsize - nbytes */
1027
1028 mov eax, [esp+64] /* eax = len */
1029 cmp eax, ecx
1030 jbe L_do_copy /* if (nbytes >= len) */
1031
1032 sub eax, ecx /* len -= nbytes */
1033 rep movsb
1034 mov esi, edi
1035 sub esi, ebp /* from = out - dist */
1036 jmp L_do_copy
1037
1038ALIGN 4
1039L_wrap_around_window:
1040 mov eax, [esp+52] /* eax = write */
1041 cmp ecx, eax
1042 jbe L_contiguous_in_window /* if (write >= nbytes) */
1043
1044 add esi, [esp+48] /* from += wsize */
1045 add esi, eax /* from += write */
1046 sub esi, ecx /* from -= nbytes */
1047 sub ecx, eax /* nbytes -= write */
1048
1049 mov eax, [esp+64] /* eax = len */
1050 cmp eax, ecx
1051 jbe L_do_copy /* if (nbytes >= len) */
1052
1053 sub eax, ecx /* len -= nbytes */
1054 rep movsb
1055 mov esi, [esp+28] /* from = window */
1056 mov ecx, [esp+52] /* nbytes = write */
1057 cmp eax, ecx
1058 jbe L_do_copy /* if (nbytes >= len) */
1059
1060 sub eax, ecx /* len -= nbytes */
1061 rep movsb
1062 mov esi, edi
1063 sub esi, ebp /* from = out - dist */
1064 jmp L_do_copy
1065
1066ALIGN 4
1067L_contiguous_in_window:
1068 add esi, eax
1069 sub esi, ecx /* from += write - nbytes */
1070
1071 mov eax, [esp+64] /* eax = len */
1072 cmp eax, ecx
1073 jbe L_do_copy /* if (nbytes >= len) */
1074
1075 sub eax, ecx /* len -= nbytes */
1076 rep movsb
1077 mov esi, edi
1078 sub esi, ebp /* from = out - dist */
1079 jmp L_do_copy
1080
1081ALIGN 4
1082L_do_copy:
1083 mov ecx, eax
1084 rep movsb
1085
1086 mov esi, [esp+8] /* move in back to %esi, toss from */
1087 mov ebp, [esp+32] /* ebp = lcode */
1088 jmp L_while_test
1089
1090L_test_for_end_of_block:
1091 test al, 32
1092 jz L_invalid_literal_length_code
1093 mov dword ptr [esp+72], 1
1094 jmp L_break_loop_with_status
1095
1096L_invalid_literal_length_code:
1097 mov dword ptr [esp+72], 2
1098 jmp L_break_loop_with_status
1099
1100L_invalid_distance_code:
1101 mov dword ptr [esp+72], 3
1102 jmp L_break_loop_with_status
1103
1104L_invalid_distance_too_far:
1105 mov esi, [esp+4]
1106 mov dword ptr [esp+72], 4
1107 jmp L_break_loop_with_status
1108
1109L_break_loop:
1110 mov dword ptr [esp+72], 0
1111
1112L_break_loop_with_status:
1113/* put in, out, bits, and hold back into ar and pop esp */
1114 mov [esp+8], esi /* save in */
1115 mov [esp+16], edi /* save out */
1116 mov [esp+44], ebx /* save bits */
1117 mov [esp+40], edx /* save hold */
1118 mov ebp, [esp+4] /* restore esp, ebp */
1119 mov esp, [esp]
1120 }
1121#else
1122#error "x86 architecture not defined"
1123#endif
1124
1125 if (ar.status > 1) {
1126 if (ar.status == 2)
1127 strm->msg = "invalid literal/length code";
1128 else if (ar.status == 3)
1129 strm->msg = "invalid distance code";
1130 else
1131 strm->msg = "invalid distance too far back";
1132 state->mode = BAD;
1133 }
1134 else if ( ar.status == 1 ) {
1135 state->mode = TYPE;
1136 }
1137
1138 /* return unused bytes (on entry, bits < 8, so in won't go too far back) */
1139 ar.len = ar.bits >> 3;
1140 ar.in -= ar.len;
1141 ar.bits -= ar.len << 3;
1142 ar.hold &= (1U << ar.bits) - 1;
1143
1144 /* update state and return */
1145 strm->next_in = ar.in;
1146 strm->next_out = ar.out;
1147 strm->avail_in = (unsigned)(ar.in < ar.last ?
1148 PAD_AVAIL_IN + (ar.last - ar.in) :
1149 PAD_AVAIL_IN - (ar.in - ar.last));
1150 strm->avail_out = (unsigned)(ar.out < ar.end ?
1151 PAD_AVAIL_OUT + (ar.end - ar.out) :
1152 PAD_AVAIL_OUT - (ar.out - ar.end));
1153 state->hold = ar.hold;
1154 state->bits = ar.bits;
1155 return;
1156}
1157
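(Editor's note: for readers following the labels above, here is a rough C sketch of the literal/length decode step that the L_get_length_code / L_dolen / L_test_for_length_base paths specialize. The `code` layout matches zlib's inftrees.h; the second refill when op > bits and the second-level/end-of-block branches are only noted in comments, so treat this as an illustration of the control flow, not the shipped inffast.c.)

    /* zlib's table entry layout, as in inftrees.h */
    typedef struct {
        unsigned char op;     /* operation, extra bits, table bits */
        unsigned char bits;   /* bits consumed by this code */
        unsigned short val;   /* literal, length base, or sub-table offset */
    } code;

    /* Decode one literal/length code; returns the match length, 0 for a
     * literal (written to *lit), or -1 for codes that need the second-level
     * table, end-of-block, or error paths the assembler handles separately.
     * Assumes enough extra bits are already buffered.  Sketch only. */
    static int decode_len(const code *lcode, unsigned lmask,
                          const unsigned char **in, unsigned long *hold,
                          unsigned *bits, unsigned char *lit)
    {
        if (*bits < 15) {                          /* refill 16 bits */
            *hold |= (unsigned long)(*in)[0] << *bits;
            *hold |= (unsigned long)(*in)[1] << (*bits + 8);
            *in += 2;
            *bits += 16;
        }
        code here = lcode[*hold & lmask];          /* first-level lookup */
        *hold >>= here.bits;
        *bits -= here.bits;
        if (here.op == 0) {                        /* literal byte */
            *lit = (unsigned char)here.val;
            return 0;
        }
        if (here.op & 16) {                        /* length base + extras */
            unsigned op = here.op & 15;
            unsigned len = here.val;
            if (op) {
                len += (unsigned)*hold & ((1U << op) - 1);
                *hold >>= op;
                *bits -= op;
            }
            return (int)len;
        }
        return -1;    /* second-level, end-of-block, or invalid code */
    }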
diff --git a/contrib/inflate86/inffast.S b/contrib/inflate86/inffast.S
deleted file mode 100644
index 2245a29..0000000
--- a/contrib/inflate86/inffast.S
+++ /dev/null
@@ -1,1368 +0,0 @@
1/*
2 * inffast.S is a hand tuned assembler version of:
3 *
4 * inffast.c -- fast decoding
5 * Copyright (C) 1995-2003 Mark Adler
6 * For conditions of distribution and use, see copyright notice in zlib.h
7 *
8 * Copyright (C) 2003 Chris Anderson <christop@charm.net>
9 * Please use the copyright conditions above.
10 *
11 * This version (Jan-23-2003) of inflate_fast was coded and tested under
12 * GNU/Linux on a pentium 3, using the gcc-3.2 compiler distribution. On that
13 * machine, I found that gzip style archives decompressed about 20% faster than
14 * the gcc-3.2 -O3 -fomit-frame-pointer compiled version. Your results will
15 * depend on how large of a buffer is used for z_stream.next_in & next_out
16 * (8K-32K worked best for my 256K cpu cache) and how much overhead there is in
17 * stream processing I/O and crc32/adler32. In my case, this routine used
18 * 70% of the cpu time and crc32 used 20%.
19 *
20 * I am confident that this version will work in the general case, but I have
21 * not tested a wide variety of datasets or a wide variety of platforms.
22 *
23 * Jan-24-2003 -- Added -DUSE_MMX define for slightly faster inflating.
24 * It should be a runtime flag instead of compile time flag...
25 *
26 * Jan-26-2003 -- Added runtime check for MMX support with cpuid instruction.
27 * With -DUSE_MMX, only MMX code is compiled. With -DNO_MMX, only non-MMX code
28 * is compiled. Without either option, runtime detection is enabled. Runtime
29 * detection should work on all modern cpus and the recommended algorithm (flip
30 * ID bit on eflags and then use the cpuid instruction) is used in many
31 * multimedia applications. Tested under win2k with gcc-2.95 and gas-2.12
32 * distributed with cygwin3. Compiling with gcc-2.95 -c inffast.S -o
33 * inffast.obj generates a COFF object which can then be linked with MSVC++
34 * compiled code. Tested under FreeBSD 4.7 with gcc-2.95.
35 *
36 * Jan-28-2003 -- Tested Athlon XP... MMX mode is slower than no MMX (and
37 * slower than compiler generated code). Adjusted cpuid check to use the MMX
38 * code only for Pentiums < P4 until I have more data on the P4. Speed
39 * improvement is only about 15% on the Athlon when compared with code generated
40 * with MSVC++. Not sure yet, but I think the P4 will also be slower using the
41 * MMX mode because many of its x86 ALU instructions execute in .5 cycles and
42 * have less latency than MMX ops. Added code to buffer the last 11 bytes of
43 * the input stream since the MMX code grabs bits in chunks of 32, which
44 * differs from the inffast.c algorithm. I don't think there would have been
45 * read overruns where a page boundary was crossed (a segfault), but there
46 * could have been overruns when next_in ends on unaligned memory (uninitialized
47 * memory read).
48 *
49 * Mar-13-2003 -- P4 MMX is slightly slower than P4 NO_MMX. I created a C
50 * version of the non-MMX code so that it doesn't depend on zstrm and zstate
51 * structure offsets which are hard coded in this file. This was last tested
52 * with zlib-1.2.0 which is currently in beta testing, newer versions of this
53 * and inffas86.c can be found at http://www.eetbeetee.com/zlib/ and
54 * http://www.charm.net/~christop/zlib/
55 */
56
57
58/*
59 * if you have underscore linking problems (_inflate_fast undefined), try
60 * using -DGAS_COFF
61 */
62#if ! defined( GAS_COFF ) && ! defined( GAS_ELF )
63
64#if defined( WIN32 ) || defined( __CYGWIN__ )
65#define GAS_COFF /* windows object format */
66#else
67#define GAS_ELF
68#endif
69
70#endif /* ! GAS_COFF && ! GAS_ELF */
71
72
73#if defined( GAS_COFF )
74
75/* coff externals have underscores */
76#define inflate_fast _inflate_fast
77#define inflate_fast_use_mmx _inflate_fast_use_mmx
78
79#endif /* GAS_COFF */
80
81
82.file "inffast.S"
83
84.globl inflate_fast
85
86.text
87.align 4,0
88.L_invalid_literal_length_code_msg:
89.string "invalid literal/length code"
90
91.align 4,0
92.L_invalid_distance_code_msg:
93.string "invalid distance code"
94
95.align 4,0
96.L_invalid_distance_too_far_msg:
97.string "invalid distance too far back"
98
99#if ! defined( NO_MMX )
100.align 4,0
101.L_mask: /* mask[N] = ( 1 << N ) - 1 */
102.long 0
103.long 1
104.long 3
105.long 7
106.long 15
107.long 31
108.long 63
109.long 127
110.long 255
111.long 511
112.long 1023
113.long 2047
114.long 4095
115.long 8191
116.long 16383
117.long 32767
118.long 65535
119.long 131071
120.long 262143
121.long 524287
122.long 1048575
123.long 2097151
124.long 4194303
125.long 8388607
126.long 16777215
127.long 33554431
128.long 67108863
129.long 134217727
130.long 268435455
131.long 536870911
132.long 1073741823
133.long 2147483647
134.long 4294967295
135#endif /* NO_MMX */
136
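(Editor's note: the table above is consumed by the MMX paths below via `.L_mask(,%eax,4)`; the non-MMX paths rebuild the same mask on the fly with a shll/decl pair. As an illustrative C sketch, the operation is simply:)

    /* Peel `op` extra bits off the bit buffer; mask[op] == (1U << op) - 1,
     * which is exactly what the .L_mask table (and the shll/decl sequence
     * in the non-MMX code) provides. */
    static unsigned take_extra_bits(unsigned long *hold, unsigned *bits,
                                    unsigned op)
    {
        unsigned extra = (unsigned)*hold & ((1U << op) - 1);
        *hold >>= op;
        *bits -= op;
        return extra;
    }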
137.text
138
139/*
140 * struct z_stream offsets, in zlib.h
141 */
142#define next_in_strm 0 /* strm->next_in */
143#define avail_in_strm 4 /* strm->avail_in */
144#define next_out_strm 12 /* strm->next_out */
145#define avail_out_strm 16 /* strm->avail_out */
146#define msg_strm 24 /* strm->msg */
147#define state_strm 28 /* strm->state */
148
149/*
150 * struct inflate_state offsets, in inflate.h
151 */
152#define mode_state 0 /* state->mode */
153#define wsize_state 32 /* state->wsize */
154#define write_state 40 /* state->write */
155#define window_state 44 /* state->window */
156#define hold_state 48 /* state->hold */
157#define bits_state 52 /* state->bits */
158#define lencode_state 68 /* state->lencode */
159#define distcode_state 72 /* state->distcode */
160#define lenbits_state 76 /* state->lenbits */
161#define distbits_state 80 /* state->distbits */
162
163/*
164 * inflate_fast's activation record
165 */
166#define local_var_size 64 /* how much local space for vars */
167#define strm_sp 88 /* first arg: z_stream * (local_var_size + 24) */
168#define start_sp 92 /* second arg: unsigned int (local_var_size + 28) */
169
170/*
171 * offsets for local vars on stack
172 */
173#define out 60 /* unsigned char* */
174#define window 56 /* unsigned char* */
175#define wsize 52 /* unsigned int */
176#define write 48 /* unsigned int */
177#define in 44 /* unsigned char* */
178#define beg 40 /* unsigned char* */
179#define buf 28 /* char[ 12 ] */
180#define len 24 /* unsigned int */
181#define last 20 /* unsigned char* */
182#define end 16 /* unsigned char* */
183#define dcode 12 /* code* */
184#define lcode 8 /* code* */
185#define dmask 4 /* unsigned int */
186#define lmask 0 /* unsigned int */
187
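(Editor's note: the z_stream offsets above are hard-wired against a 32-bit build with default field alignment, which is exactly the coupling the Mar-13-2003 note calls out and inffas86.c avoids. A compile-time cross-check along the following lines would catch drift; it is an editor's sketch that assumes zlib.h is on the include path and a C11 compiler for _Static_assert, and is not part of the original file.)

    #include <stddef.h>
    #include "zlib.h"

    /* Fail the build if the hand-coded z_stream offsets above no longer match
     * the header.  Values assume 32-bit pointers and uLong, as this file does. */
    _Static_assert(offsetof(z_stream, next_in)   == 0,  "next_in_strm");
    _Static_assert(offsetof(z_stream, avail_in)  == 4,  "avail_in_strm");
    _Static_assert(offsetof(z_stream, next_out)  == 12, "next_out_strm");
    _Static_assert(offsetof(z_stream, avail_out) == 16, "avail_out_strm");
    _Static_assert(offsetof(z_stream, msg)       == 24, "msg_strm");
    _Static_assert(offsetof(z_stream, state)     == 28, "state_strm");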
188/*
189 * typedef enum inflate_mode consts, in inflate.h
190 */
191#define INFLATE_MODE_TYPE 11 /* state->mode flags enum-ed in inflate.h */
192#define INFLATE_MODE_BAD 26
193
194
195#if ! defined( USE_MMX ) && ! defined( NO_MMX )
196
197#define RUN_TIME_MMX
198
199#define CHECK_MMX 1
200#define DO_USE_MMX 2
201#define DONT_USE_MMX 3
202
203.globl inflate_fast_use_mmx
204
205.data
206
207.align 4,0
208inflate_fast_use_mmx: /* integer flag for run time control 1=check,2=mmx,3=no */
209.long CHECK_MMX
210
211#if defined( GAS_ELF )
212/* elf info */
213.type inflate_fast_use_mmx,@object
214.size inflate_fast_use_mmx,4
215#endif
216
217#endif /* RUN_TIME_MMX */
218
219#if defined( GAS_COFF )
220/* coff info: scl 2 = extern, type 32 = function */
221.def inflate_fast; .scl 2; .type 32; .endef
222#endif
223
224.text
225
226.align 32,0x90
227inflate_fast:
228 pushl %edi
229 pushl %esi
230 pushl %ebp
231 pushl %ebx
232 pushf /* save eflags (strm_sp, state_sp assumes this is 32 bits) */
233 subl $local_var_size, %esp
234 cld
235
236#define strm_r %esi
237#define state_r %edi
238
239 movl strm_sp(%esp), strm_r
240 movl state_strm(strm_r), state_r
241
242 /* in = strm->next_in;
243 * out = strm->next_out;
244 * last = in + strm->avail_in - 11;
245 * beg = out - (start - strm->avail_out);
246 * end = out + (strm->avail_out - 257);
247 */
248 movl avail_in_strm(strm_r), %edx
249 movl next_in_strm(strm_r), %eax
250
251 addl %eax, %edx /* avail_in += next_in */
252 subl $11, %edx /* avail_in -= 11 */
253
254 movl %eax, in(%esp)
255 movl %edx, last(%esp)
256
257 movl start_sp(%esp), %ebp
258 movl avail_out_strm(strm_r), %ecx
259 movl next_out_strm(strm_r), %ebx
260
261 subl %ecx, %ebp /* start -= avail_out */
262 negl %ebp /* start = -start */
263 addl %ebx, %ebp /* start += next_out */
264
265 subl $257, %ecx /* avail_out -= 257 */
266 addl %ebx, %ecx /* avail_out += out */
267
268 movl %ebx, out(%esp)
269 movl %ebp, beg(%esp)
270 movl %ecx, end(%esp)
271
272 /* wsize = state->wsize;
273 * write = state->write;
274 * window = state->window;
275 * hold = state->hold;
276 * bits = state->bits;
277 * lcode = state->lencode;
278 * dcode = state->distcode;
279 * lmask = ( 1 << state->lenbits ) - 1;
280 * dmask = ( 1 << state->distbits ) - 1;
281 */
282
283 movl lencode_state(state_r), %eax
284 movl distcode_state(state_r), %ecx
285
286 movl %eax, lcode(%esp)
287 movl %ecx, dcode(%esp)
288
289 movl $1, %eax
290 movl lenbits_state(state_r), %ecx
291 shll %cl, %eax
292 decl %eax
293 movl %eax, lmask(%esp)
294
295 movl $1, %eax
296 movl distbits_state(state_r), %ecx
297 shll %cl, %eax
298 decl %eax
299 movl %eax, dmask(%esp)
300
301 movl wsize_state(state_r), %eax
302 movl write_state(state_r), %ecx
303 movl window_state(state_r), %edx
304
305 movl %eax, wsize(%esp)
306 movl %ecx, write(%esp)
307 movl %edx, window(%esp)
308
309 movl hold_state(state_r), %ebp
310 movl bits_state(state_r), %ebx
311
312#undef strm_r
313#undef state_r
314
315#define in_r %esi
316#define from_r %esi
317#define out_r %edi
318
319 movl in(%esp), in_r
320 movl last(%esp), %ecx
321 cmpl in_r, %ecx
322 ja .L_align_long /* if in < last */
323
324 addl $11, %ecx /* ecx = &in[ avail_in ] */
325 subl in_r, %ecx /* ecx = avail_in */
326 movl $12, %eax
327 subl %ecx, %eax /* eax = 12 - avail_in */
328 leal buf(%esp), %edi
329 rep movsb /* memcpy( buf, in, avail_in ) */
330 movl %eax, %ecx
331 xorl %eax, %eax
332 rep stosb /* memset( &buf[ avail_in ], 0, 12 - avail_in ) */
333 leal buf(%esp), in_r /* in = buf */
334 movl in_r, last(%esp) /* last = in, do just one iteration */
335 jmp .L_is_aligned
336
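(Editor's note: the branch above implements the 11-byte tail buffering mentioned in the Jan-28-2003 comment. A C sketch of the same idea, assuming fewer than 12 input bytes remain; not part of the original file.)

    #include <string.h>

    /* Copy the last few input bytes into a zero-padded 12-byte buffer so the
     * 16/32-bit loads in the decode loop can never read past next_in.  The
     * caller also points `last` at the buffer to force a single final pass. */
    static const unsigned char *buffer_tail(const unsigned char *in,
                                            unsigned avail,
                                            unsigned char buf[12])
    {
        memcpy(buf, in, avail);              /* avail < 12 by construction */
        memset(buf + avail, 0, 12 - avail);
        return buf;
    }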
337 /* align in_r on long boundary */
338.L_align_long:
339 testl $3, in_r
340 jz .L_is_aligned
341 xorl %eax, %eax
342 movb (in_r), %al
343 incl in_r
344 movl %ebx, %ecx
345 addl $8, %ebx
346 shll %cl, %eax
347 orl %eax, %ebp
348 jmp .L_align_long
349
350.L_is_aligned:
351 movl out(%esp), out_r
352
353#if defined( NO_MMX )
354 jmp .L_do_loop
355#endif
356
357#if defined( USE_MMX )
358 jmp .L_init_mmx
359#endif
360
361/*** Runtime MMX check ***/
362
363#if defined( RUN_TIME_MMX )
364.L_check_mmx:
365 cmpl $DO_USE_MMX, inflate_fast_use_mmx
366 je .L_init_mmx
367 ja .L_do_loop /* > 2 */
368
369 pushl %eax
370 pushl %ebx
371 pushl %ecx
372 pushl %edx
373 pushf
374 movl (%esp), %eax /* copy eflags to eax */
375 xorl $0x200000, (%esp) /* try toggling ID bit of eflags (bit 21)
376 * to see if cpu supports cpuid...
377 * ID bit method not supported by NexGen but
378 * bios may load a cpuid instruction and
379 * cpuid may be disabled on Cyrix 5-6x86 */
380 popf
381 pushf
382 popl %edx /* copy new eflags to edx */
383 xorl %eax, %edx /* test if ID bit is flipped */
384 jz .L_dont_use_mmx /* not flipped if zero */
385 xorl %eax, %eax
386 cpuid
387 cmpl $0x756e6547, %ebx /* check for GenuineIntel in ebx,ecx,edx */
388 jne .L_dont_use_mmx
389 cmpl $0x6c65746e, %ecx
390 jne .L_dont_use_mmx
391 cmpl $0x49656e69, %edx
392 jne .L_dont_use_mmx
393 movl $1, %eax
394 cpuid /* get cpu features */
395 shrl $8, %eax
396 andl $15, %eax
397 cmpl $6, %eax /* check for Pentium family, is 0xf for P4 */
398 jne .L_dont_use_mmx
399 testl $0x800000, %edx /* test if MMX feature is set (bit 23) */
400 jnz .L_use_mmx
401 jmp .L_dont_use_mmx
402.L_use_mmx:
403 movl $DO_USE_MMX, inflate_fast_use_mmx
404 jmp .L_check_mmx_pop
405.L_dont_use_mmx:
406 movl $DONT_USE_MMX, inflate_fast_use_mmx
407.L_check_mmx_pop:
408 popl %edx
409 popl %ecx
410 popl %ebx
411 popl %eax
412 jmp .L_check_mmx
413#endif
414
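(Editor's note: the eflags-ID-bit probe and raw cpuid above predate compiler support. On GCC- or Clang-family compilers the same MMX feature test can be written in C with the <cpuid.h> helper; this is an editor's sketch, not part of the original code, and it omits the GenuineIntel/family-6 restriction the assembler adds on top of the feature bit.)

    #include <cpuid.h>    /* GCC/Clang helper header */

    /* Nonzero if CPUID leaf 1 reports MMX (EDX bit 23), the same bit the
     * `testl $0x800000, %edx` above checks. */
    static int cpu_has_mmx(void)
    {
        unsigned eax, ebx, ecx, edx;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return 0;                      /* cpuid leaf 1 unavailable */
        return (edx >> 23) & 1;
    }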
415
416/*** Non-MMX code ***/
417
418#if defined ( NO_MMX ) || defined( RUN_TIME_MMX )
419
420#define hold_r %ebp
421#define bits_r %bl
422#define bitslong_r %ebx
423
424.align 32,0x90
425.L_while_test:
426 /* while (in < last && out < end)
427 */
428 cmpl out_r, end(%esp)
429 jbe .L_break_loop /* if (out >= end) */
430
431 cmpl in_r, last(%esp)
432 jbe .L_break_loop
433
434.L_do_loop:
435 /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out
436 *
437 * do {
438 * if (bits < 15) {
439 * hold |= *((unsigned short *)in)++ << bits;
440 * bits += 16
441 * }
442 * this = lcode[hold & lmask]
443 */
444 cmpb $15, bits_r
445 ja .L_get_length_code /* if (15 < bits) */
446
447 xorl %eax, %eax
448 lodsw /* al = *(ushort *)in++ */
449 movb bits_r, %cl /* cl = bits, needs it for shifting */
450 addb $16, bits_r /* bits += 16 */
451 shll %cl, %eax
452 orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */
453
454.L_get_length_code:
455 movl lmask(%esp), %edx /* edx = lmask */
456 movl lcode(%esp), %ecx /* ecx = lcode */
457 andl hold_r, %edx /* edx &= hold */
458 movl (%ecx,%edx,4), %eax /* eax = lcode[hold & lmask] */
459
460.L_dolen:
461 /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out
462 *
463 * dolen:
464 * bits -= this.bits;
465 * hold >>= this.bits
466 */
467 movb %ah, %cl /* cl = this.bits */
468 subb %ah, bits_r /* bits -= this.bits */
469 shrl %cl, hold_r /* hold >>= this.bits */
470
471 /* check if op is a literal
472 * if (op == 0) {
473 * PUP(out) = this.val;
474 * }
475 */
476 testb %al, %al
477 jnz .L_test_for_length_base /* if (op != 0) 45.7% */
478
479 shrl $16, %eax /* output this.val char */
480 stosb
481 jmp .L_while_test
482
483.L_test_for_length_base:
484 /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out, %edx = len
485 *
486 * else if (op & 16) {
487 * len = this.val
488 * op &= 15
489 * if (op) {
490 * if (op > bits) {
491 * hold |= *((unsigned short *)in)++ << bits;
492 * bits += 16
493 * }
494 * len += hold & mask[op];
495 * bits -= op;
496 * hold >>= op;
497 * }
498 */
499#define len_r %edx
500 movl %eax, len_r /* len = this */
501 shrl $16, len_r /* len = this.val */
502 movb %al, %cl
503
504 testb $16, %al
505 jz .L_test_for_second_level_length /* if ((op & 16) == 0) 8% */
506 andb $15, %cl /* op &= 15 */
507 jz .L_save_len /* if (!op) */
508 cmpb %cl, bits_r
509 jae .L_add_bits_to_len /* if (op <= bits) */
510
511 movb %cl, %ch /* stash op in ch, freeing cl */
512 xorl %eax, %eax
513 lodsw /* al = *(ushort *)in++ */
514 movb bits_r, %cl /* cl = bits, needs it for shifting */
515 addb $16, bits_r /* bits += 16 */
516 shll %cl, %eax
517 orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */
518 movb %ch, %cl /* move op back to ecx */
519
520.L_add_bits_to_len:
521 movl $1, %eax
522 shll %cl, %eax
523 decl %eax
524 subb %cl, bits_r
525 andl hold_r, %eax /* eax &= hold */
526 shrl %cl, hold_r
527 addl %eax, len_r /* len += hold & mask[op] */
528
529.L_save_len:
530 movl len_r, len(%esp) /* save len */
531#undef len_r
532
533.L_decode_distance:
534 /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out, %edx = dist
535 *
536 * if (bits < 15) {
537 * hold |= *((unsigned short *)in)++ << bits;
538 * bits += 16
539 * }
540 * this = dcode[hold & dmask];
541 * dodist:
542 * bits -= this.bits;
543 * hold >>= this.bits;
544 * op = this.op;
545 */
546
547 cmpb $15, bits_r
548 ja .L_get_distance_code /* if (15 < bits) */
549
550 xorl %eax, %eax
551 lodsw /* al = *(ushort *)in++ */
552 movb bits_r, %cl /* cl = bits, needs it for shifting */
553 addb $16, bits_r /* bits += 16 */
554 shll %cl, %eax
555 orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */
556
557.L_get_distance_code:
558 movl dmask(%esp), %edx /* edx = dmask */
559 movl dcode(%esp), %ecx /* ecx = dcode */
560 andl hold_r, %edx /* edx &= hold */
561 movl (%ecx,%edx,4), %eax /* eax = dcode[hold & dmask] */
562
563#define dist_r %edx
564.L_dodist:
565 movl %eax, dist_r /* dist = this */
566 shrl $16, dist_r /* dist = this.val */
567 movb %ah, %cl
568 subb %ah, bits_r /* bits -= this.bits */
569 shrl %cl, hold_r /* hold >>= this.bits */
570
571 /* if (op & 16) {
572 * dist = this.val
573 * op &= 15
574 * if (op > bits) {
575 * hold |= *((unsigned short *)in)++ << bits;
576 * bits += 16
577 * }
578 * dist += hold & mask[op];
579 * bits -= op;
580 * hold >>= op;
581 */
582 movb %al, %cl /* cl = this.op */
583
584 testb $16, %al /* if ((op & 16) == 0) */
585 jz .L_test_for_second_level_dist
586 andb $15, %cl /* op &= 15 */
587 jz .L_check_dist_one
588 cmpb %cl, bits_r
589 jae .L_add_bits_to_dist /* if (op <= bits) 97.6% */
590
591 movb %cl, %ch /* stash op in ch, freeing cl */
592 xorl %eax, %eax
593 lodsw /* al = *(ushort *)in++ */
594 movb bits_r, %cl /* cl = bits, needs it for shifting */
595 addb $16, bits_r /* bits += 16 */
596 shll %cl, %eax
597 orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */
598 movb %ch, %cl /* move op back to ecx */
599
600.L_add_bits_to_dist:
601 movl $1, %eax
602 shll %cl, %eax
603 decl %eax /* (1 << op) - 1 */
604 subb %cl, bits_r
605 andl hold_r, %eax /* eax &= hold */
606 shrl %cl, hold_r
607 addl %eax, dist_r /* dist += hold & ((1 << op) - 1) */
608 jmp .L_check_window
609
610.L_check_window:
611 /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist
612 * %ecx = nbytes
613 *
614 * nbytes = out - beg;
615 * if (dist <= nbytes) {
616 * from = out - dist;
617 * do {
618 * PUP(out) = PUP(from);
619 * } while (--len > 0);
620 * }
621 */
622
623 movl in_r, in(%esp) /* save in so from can use its reg */
624 movl out_r, %eax
625 subl beg(%esp), %eax /* nbytes = out - beg */
626
627 cmpl dist_r, %eax
628 jb .L_clip_window /* if (dist > nbytes) 4.2% */
629
630 movl len(%esp), %ecx
631 movl out_r, from_r
632 subl dist_r, from_r /* from = out - dist */
633
634 subl $3, %ecx
635 movb (from_r), %al
636 movb %al, (out_r)
637 movb 1(from_r), %al
638 movb 2(from_r), %dl
639 addl $3, from_r
640 movb %al, 1(out_r)
641 movb %dl, 2(out_r)
642 addl $3, out_r
643 rep movsb
644
645 movl in(%esp), in_r /* move in back to %esi, toss from */
646 jmp .L_while_test
647
648.align 16,0x90
649.L_check_dist_one:
650 cmpl $1, dist_r
651 jne .L_check_window
652 cmpl out_r, beg(%esp)
653 je .L_check_window
654
655 decl out_r
656 movl len(%esp), %ecx
657 movb (out_r), %al
658 subl $3, %ecx
659
660 movb %al, 1(out_r)
661 movb %al, 2(out_r)
662 movb %al, 3(out_r)
663 addl $4, out_r
664 rep stosb
665
666 jmp .L_while_test
667
668.align 16,0x90
669.L_test_for_second_level_length:
670 /* else if ((op & 64) == 0) {
671 * this = lcode[this.val + (hold & mask[op])];
672 * }
673 */
674 testb $64, %al
675 jnz .L_test_for_end_of_block /* if ((op & 64) != 0) */
676
677 movl $1, %eax
678 shll %cl, %eax
679 decl %eax
680 andl hold_r, %eax /* eax &= hold */
681 addl %edx, %eax /* eax += this.val */
682 movl lcode(%esp), %edx /* edx = lcode */
683 movl (%edx,%eax,4), %eax /* eax = lcode[val + (hold&mask[op])] */
684 jmp .L_dolen
685
686.align 16,0x90
687.L_test_for_second_level_dist:
688 /* else if ((op & 64) == 0) {
689 * this = dcode[this.val + (hold & mask[op])];
690 * }
691 */
692 testb $64, %al
693 jnz .L_invalid_distance_code /* if ((op & 64) != 0) */
694
695 movl $1, %eax
696 shll %cl, %eax
697 decl %eax
698 andl hold_r, %eax /* eax &= hold */
699 addl %edx, %eax /* eax += this.val */
700 movl dcode(%esp), %edx /* edx = dcode */
701 movl (%edx,%eax,4), %eax /* eax = dcode[val + (hold&mask[op])] */
702 jmp .L_dodist
703
704.align 16,0x90
705.L_clip_window:
706 /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist
707 * %ecx = nbytes
708 *
709 * else {
710 * if (dist > wsize) {
711 * invalid distance
712 * }
713 * from = window;
714 * nbytes = dist - nbytes;
715 * if (write == 0) {
716 * from += wsize - nbytes;
717 */
718#define nbytes_r %ecx
719 movl %eax, nbytes_r
720 movl wsize(%esp), %eax /* prepare for dist compare */
721 negl nbytes_r /* nbytes = -nbytes */
722 movl window(%esp), from_r /* from = window */
723
724 cmpl dist_r, %eax
725 jb .L_invalid_distance_too_far /* if (dist > wsize) */
726
727 addl dist_r, nbytes_r /* nbytes = dist - nbytes */
728 cmpl $0, write(%esp)
729 jne .L_wrap_around_window /* if (write != 0) */
730
731 subl nbytes_r, %eax
732 addl %eax, from_r /* from += wsize - nbytes */
733
734 /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist
735 * %ecx = nbytes, %eax = len
736 *
737 * if (nbytes < len) {
738 * len -= nbytes;
739 * do {
740 * PUP(out) = PUP(from);
741 * } while (--nbytes);
742 * from = out - dist;
743 * }
744 * }
745 */
746#define len_r %eax
747 movl len(%esp), len_r
748 cmpl nbytes_r, len_r
749 jbe .L_do_copy1 /* if (nbytes >= len) */
750
751 subl nbytes_r, len_r /* len -= nbytes */
752 rep movsb
753 movl out_r, from_r
754 subl dist_r, from_r /* from = out - dist */
755 jmp .L_do_copy1
756
757 cmpl nbytes_r, len_r
758 jbe .L_do_copy1 /* if (nbytes >= len) */
759
760 subl nbytes_r, len_r /* len -= nbytes */
761 rep movsb
762 movl out_r, from_r
763 subl dist_r, from_r /* from = out - dist */
764 jmp .L_do_copy1
765
766.L_wrap_around_window:
767 /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist
768 * %ecx = nbytes, %eax = write, %eax = len
769 *
770 * else if (write < nbytes) {
771 * from += wsize + write - nbytes;
772 * nbytes -= write;
773 * if (nbytes < len) {
774 * len -= nbytes;
775 * do {
776 * PUP(out) = PUP(from);
777 * } while (--nbytes);
778 * from = window;
779 * nbytes = write;
780 * if (nbytes < len) {
781 * len -= nbytes;
782 * do {
783 * PUP(out) = PUP(from);
784 * } while(--nbytes);
785 * from = out - dist;
786 * }
787 * }
788 * }
789 */
790#define write_r %eax
791 movl write(%esp), write_r
792 cmpl write_r, nbytes_r
793 jbe .L_contiguous_in_window /* if (write >= nbytes) */
794
795 addl wsize(%esp), from_r
796 addl write_r, from_r
797 subl nbytes_r, from_r /* from += wsize + write - nbytes */
798 subl write_r, nbytes_r /* nbytes -= write */
799#undef write_r
800
801 movl len(%esp), len_r
802 cmpl nbytes_r, len_r
803 jbe .L_do_copy1 /* if (nbytes >= len) */
804
805 subl nbytes_r, len_r /* len -= nbytes */
806 rep movsb
807 movl window(%esp), from_r /* from = window */
808 movl write(%esp), nbytes_r /* nbytes = write */
809 cmpl nbytes_r, len_r
810 jbe .L_do_copy1 /* if (nbytes >= len) */
811
812 subl nbytes_r, len_r /* len -= nbytes */
813 rep movsb
814 movl out_r, from_r
815 subl dist_r, from_r /* from = out - dist */
816 jmp .L_do_copy1
817
818.L_contiguous_in_window:
819 /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist
820 * %ecx = nbytes, %eax = write, %eax = len
821 *
822 * else {
823 * from += write - nbytes;
824 * if (nbytes < len) {
825 * len -= nbytes;
826 * do {
827 * PUP(out) = PUP(from);
828 * } while (--nbytes);
829 * from = out - dist;
830 * }
831 * }
832 */
833#define write_r %eax
834 addl write_r, from_r
835 subl nbytes_r, from_r /* from += write - nbytes */
836#undef write_r
837
838 movl len(%esp), len_r
839 cmpl nbytes_r, len_r
840 jbe .L_do_copy1 /* if (nbytes >= len) */
841
842 subl nbytes_r, len_r /* len -= nbytes */
843 rep movsb
844 movl out_r, from_r
845 subl dist_r, from_r /* from = out - dist */
846
847.L_do_copy1:
848 /* regs: %esi = from, %esi = in, %ebp = hold, %bl = bits, %edi = out
849 * %eax = len
850 *
851 * while (len > 0) {
852 * PUP(out) = PUP(from);
853 * len--;
854 * }
855 * }
856 * } while (in < last && out < end);
857 */
858#undef nbytes_r
859#define in_r %esi
860 movl len_r, %ecx
861 rep movsb
862
863 movl in(%esp), in_r /* move in back to %esi, toss from */
864 jmp .L_while_test
865
866#undef len_r
867#undef dist_r
868
869#endif /* NO_MMX || RUN_TIME_MMX */
870
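(Editor's note: the clip_window / wrap_around_window / contiguous_in_window paths above follow the pseudocode comments embedded in this section. Restated as plain C for readability; "write" is the window wrap index (later renamed wnext in zlib), and the distance-too-far check and other error handling are elided. Sketch only, not the shipped inffast.c.)

    /* Copy a `len`-byte match ending `dist` bytes back, possibly reaching
     * into the sliding window. */
    static void copy_match(unsigned char **out_p, unsigned char *beg,
                           unsigned char *window, unsigned wsize,
                           unsigned write, unsigned dist, unsigned len)
    {
        unsigned char *out = *out_p;
        unsigned char *from;
        unsigned nbytes = (unsigned)(out - beg);   /* output produced so far */

        if (dist <= nbytes) {
            from = out - dist;                     /* match lies in the output */
        } else {
            nbytes = dist - nbytes;                /* bytes still in the window */
            if (write == 0) {                      /* window has not wrapped */
                from = window + (wsize - nbytes);
            } else if (write >= nbytes) {          /* contiguous in window */
                from = window + (write - nbytes);
            } else {                               /* wraps around the window */
                from = window + (wsize + write - nbytes);
                nbytes -= write;
                if (nbytes < len) {                /* copy up to window end */
                    len -= nbytes;
                    while (nbytes--) *out++ = *from++;
                    from = window;                 /* restart at window base */
                    nbytes = write;
                }
            }
            if (nbytes < len) {                    /* window part, then output */
                len -= nbytes;
                while (nbytes--) *out++ = *from++;
                from = out - dist;
            }
        }
        while (len--) *out++ = *from++;            /* remaining bytes */
        *out_p = out;
    }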
871
872/*** MMX code ***/
873
874#if defined( USE_MMX ) || defined( RUN_TIME_MMX )
875
876.align 32,0x90
877.L_init_mmx:
878 emms
879
880#undef bits_r
881#undef bitslong_r
882#define bitslong_r %ebp
883#define hold_mm %mm0
884 movd %ebp, hold_mm
885 movl %ebx, bitslong_r
886
887#define used_mm %mm1
888#define dmask2_mm %mm2
889#define lmask2_mm %mm3
890#define lmask_mm %mm4
891#define dmask_mm %mm5
892#define tmp_mm %mm6
893
894 movd lmask(%esp), lmask_mm
895 movq lmask_mm, lmask2_mm
896 movd dmask(%esp), dmask_mm
897 movq dmask_mm, dmask2_mm
898 pxor used_mm, used_mm
899 movl lcode(%esp), %ebx /* ebx = lcode */
900 jmp .L_do_loop_mmx
901
902.align 32,0x90
903.L_while_test_mmx:
904 /* while (in < last && out < end)
905 */
906 cmpl out_r, end(%esp)
907 jbe .L_break_loop /* if (out >= end) */
908
909 cmpl in_r, last(%esp)
910 jbe .L_break_loop
911
912.L_do_loop_mmx:
913 psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
914
915 cmpl $32, bitslong_r
916 ja .L_get_length_code_mmx /* if (32 < bits) */
917
918 movd bitslong_r, tmp_mm
919 movd (in_r), %mm7
920 addl $4, in_r
921 psllq tmp_mm, %mm7
922 addl $32, bitslong_r
923 por %mm7, hold_mm /* hold_mm |= *((uint *)in)++ << bits */
924
925.L_get_length_code_mmx:
926 pand hold_mm, lmask_mm
927 movd lmask_mm, %eax
928 movq lmask2_mm, lmask_mm
929 movl (%ebx,%eax,4), %eax /* eax = lcode[hold & lmask] */
930
931.L_dolen_mmx:
932 movzbl %ah, %ecx /* ecx = this.bits */
933 movd %ecx, used_mm
934 subl %ecx, bitslong_r /* bits -= this.bits */
935
936 testb %al, %al
937 jnz .L_test_for_length_base_mmx /* if (op != 0) 45.7% */
938
939 shrl $16, %eax /* output this.val char */
940 stosb
941 jmp .L_while_test_mmx
942
943.L_test_for_length_base_mmx:
944#define len_r %edx
945 movl %eax, len_r /* len = this */
946 shrl $16, len_r /* len = this.val */
947
948 testb $16, %al
949 jz .L_test_for_second_level_length_mmx /* if ((op & 16) == 0) 8% */
950 andl $15, %eax /* op &= 15 */
951 jz .L_decode_distance_mmx /* if (!op) */
952
953 psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
954 movd %eax, used_mm
955 movd hold_mm, %ecx
956 subl %eax, bitslong_r
957 andl .L_mask(,%eax,4), %ecx
958 addl %ecx, len_r /* len += hold & mask[op] */
959
960.L_decode_distance_mmx:
961 psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
962
963 cmpl $32, bitslong_r
964 ja .L_get_dist_code_mmx /* if (32 < bits) */
965
966 movd bitslong_r, tmp_mm
967 movd (in_r), %mm7
968 addl $4, in_r
969 psllq tmp_mm, %mm7
970 addl $32, bitslong_r
971 por %mm7, hold_mm /* hold_mm |= *((uint *)in)++ << bits */
972
973.L_get_dist_code_mmx:
974 movl dcode(%esp), %ebx /* ebx = dcode */
975 pand hold_mm, dmask_mm
976 movd dmask_mm, %eax
977 movq dmask2_mm, dmask_mm
978 movl (%ebx,%eax,4), %eax /* eax = dcode[hold & dmask] */
979
980.L_dodist_mmx:
981#define dist_r %ebx
982 movzbl %ah, %ecx /* ecx = this.bits */
983 movl %eax, dist_r
984 shrl $16, dist_r /* dist = this.val */
985 subl %ecx, bitslong_r /* bits -= this.bits */
986 movd %ecx, used_mm
987
988 testb $16, %al /* if ((op & 16) == 0) */
989 jz .L_test_for_second_level_dist_mmx
990 andl $15, %eax /* op &= 15 */
991 jz .L_check_dist_one_mmx
992
993.L_add_bits_to_dist_mmx:
994 psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
995 movd %eax, used_mm /* save bit length of current op */
996 movd hold_mm, %ecx /* get the next bits on input stream */
997 subl %eax, bitslong_r /* bits -= op bits */
998 andl .L_mask(,%eax,4), %ecx /* ecx = hold & mask[op] */
999 addl %ecx, dist_r /* dist += hold & mask[op] */
1000
1001.L_check_window_mmx:
1002 movl in_r, in(%esp) /* save in so from can use its reg */
1003 movl out_r, %eax
1004 subl beg(%esp), %eax /* nbytes = out - beg */
1005
1006 cmpl dist_r, %eax
1007 jb .L_clip_window_mmx /* if (dist > nbytes) 4.2% */
1008
1009 movl len_r, %ecx
1010 movl out_r, from_r
1011 subl dist_r, from_r /* from = out - dist */
1012
1013 subl $3, %ecx
1014 movb (from_r), %al
1015 movb %al, (out_r)
1016 movb 1(from_r), %al
1017 movb 2(from_r), %dl
1018 addl $3, from_r
1019 movb %al, 1(out_r)
1020 movb %dl, 2(out_r)
1021 addl $3, out_r
1022 rep movsb
1023
1024 movl in(%esp), in_r /* move in back to %esi, toss from */
1025 movl lcode(%esp), %ebx /* move lcode back to %ebx, toss dist */
1026 jmp .L_while_test_mmx
1027
1028.align 16,0x90
1029.L_check_dist_one_mmx:
1030 cmpl $1, dist_r
1031 jne .L_check_window_mmx
1032 cmpl out_r, beg(%esp)
1033 je .L_check_window_mmx
1034
1035 decl out_r
1036 movl len_r, %ecx
1037 movb (out_r), %al
1038 subl $3, %ecx
1039
1040 movb %al, 1(out_r)
1041 movb %al, 2(out_r)
1042 movb %al, 3(out_r)
1043 addl $4, out_r
1044 rep stosb
1045
1046 movl lcode(%esp), %ebx /* move lcode back to %ebx, toss dist */
1047 jmp .L_while_test_mmx
1048
1049.align 16,0x90
1050.L_test_for_second_level_length_mmx:
1051 testb $64, %al
1052 jnz .L_test_for_end_of_block /* if ((op & 64) != 0) */
1053
1054 andl $15, %eax
1055 psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
1056 movd hold_mm, %ecx
1057 andl .L_mask(,%eax,4), %ecx
1058 addl len_r, %ecx
1059 movl (%ebx,%ecx,4), %eax /* eax = lcode[len + (hold & mask[op])] */
1060 jmp .L_dolen_mmx
1061
1062.align 16,0x90
1063.L_test_for_second_level_dist_mmx:
1064 testb $64, %al
1065 jnz .L_invalid_distance_code /* if ((op & 64) != 0) */
1066
1067 andl $15, %eax
1068 psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
1069 movd hold_mm, %ecx
1070 andl .L_mask(,%eax,4), %ecx
1071 movl dcode(%esp), %eax /* eax = dcode */
1072 addl dist_r, %ecx
1073 movl (%eax,%ecx,4), %eax /* eax = dcode[dist + (hold & mask[op])] */
1074 jmp .L_dodist_mmx
1075
1076.align 16,0x90
1077.L_clip_window_mmx:
1078#define nbytes_r %ecx
1079 movl %eax, nbytes_r
1080 movl wsize(%esp), %eax /* prepare for dist compare */
1081 negl nbytes_r /* nbytes = -nbytes */
1082 movl window(%esp), from_r /* from = window */
1083
1084 cmpl dist_r, %eax
1085 jb .L_invalid_distance_too_far /* if (dist > wsize) */
1086
1087 addl dist_r, nbytes_r /* nbytes = dist - nbytes */
1088 cmpl $0, write(%esp)
1089 jne .L_wrap_around_window_mmx /* if (write != 0) */
1090
1091 subl nbytes_r, %eax
1092 addl %eax, from_r /* from += wsize - nbytes */
1093
1094 cmpl nbytes_r, len_r
1095 jbe .L_do_copy1_mmx /* if (nbytes >= len) */
1096
1097 subl nbytes_r, len_r /* len -= nbytes */
1098 rep movsb
1099 movl out_r, from_r
1100 subl dist_r, from_r /* from = out - dist */
1101 jmp .L_do_copy1_mmx
1102
1103 cmpl nbytes_r, len_r
1104 jbe .L_do_copy1_mmx /* if (nbytes >= len) */
1105
1106 subl nbytes_r, len_r /* len -= nbytes */
1107 rep movsb
1108 movl out_r, from_r
1109 subl dist_r, from_r /* from = out - dist */
1110 jmp .L_do_copy1_mmx
1111
1112.L_wrap_around_window_mmx:
1113#define write_r %eax
1114 movl write(%esp), write_r
1115 cmpl write_r, nbytes_r
1116 jbe .L_contiguous_in_window_mmx /* if (write >= nbytes) */
1117
1118 addl wsize(%esp), from_r
1119 addl write_r, from_r
1120 subl nbytes_r, from_r /* from += wsize + write - nbytes */
1121 subl write_r, nbytes_r /* nbytes -= write */
1122#undef write_r
1123
1124 cmpl nbytes_r, len_r
1125 jbe .L_do_copy1_mmx /* if (nbytes >= len) */
1126
1127 subl nbytes_r, len_r /* len -= nbytes */
1128 rep movsb
1129 movl window(%esp), from_r /* from = window */
1130 movl write(%esp), nbytes_r /* nbytes = write */
1131 cmpl nbytes_r, len_r
1132 jbe .L_do_copy1_mmx /* if (nbytes >= len) */
1133
1134 subl nbytes_r, len_r /* len -= nbytes */
1135 rep movsb
1136 movl out_r, from_r
1137 subl dist_r, from_r /* from = out - dist */
1138 jmp .L_do_copy1_mmx
1139
1140.L_contiguous_in_window_mmx:
1141#define write_r %eax
1142 addl write_r, from_r
1143 subl nbytes_r, from_r /* from += write - nbytes */
1144#undef write_r
1145
1146 cmpl nbytes_r, len_r
1147 jbe .L_do_copy1_mmx /* if (nbytes >= len) */
1148
1149 subl nbytes_r, len_r /* len -= nbytes */
1150 rep movsb
1151 movl out_r, from_r
1152 subl dist_r, from_r /* from = out - dist */
1153
1154.L_do_copy1_mmx:
1155#undef nbytes_r
1156#define in_r %esi
1157 movl len_r, %ecx
1158 rep movsb
1159
1160 movl in(%esp), in_r /* move in back to %esi, toss from */
1161 movl lcode(%esp), %ebx /* move lcode back to %ebx, toss dist */
1162 jmp .L_while_test_mmx
1163
1164#undef hold_r
1165#undef bitslong_r
1166
1167#endif /* USE_MMX || RUN_TIME_MMX */
1168
1169
1170/*** USE_MMX, NO_MMX, and RUN_TIME_MMX from here on ***/
1171
1172.L_invalid_distance_code:
1173 /* else {
1174 * strm->msg = "invalid distance code";
1175 * state->mode = BAD;
1176 * }
1177 */
1178 movl $.L_invalid_distance_code_msg, %ecx
1179 movl $INFLATE_MODE_BAD, %edx
1180 jmp .L_update_stream_state
1181
1182.L_test_for_end_of_block:
1183 /* else if (op & 32) {
1184 * state->mode = TYPE;
1185 * break;
1186 * }
1187 */
1188 testb $32, %al
1189 jz .L_invalid_literal_length_code /* if ((op & 32) == 0) */
1190
1191 movl $0, %ecx
1192 movl $INFLATE_MODE_TYPE, %edx
1193 jmp .L_update_stream_state
1194
1195.L_invalid_literal_length_code:
1196 /* else {
1197 * strm->msg = "invalid literal/length code";
1198 * state->mode = BAD;
1199 * }
1200 */
1201 movl $.L_invalid_literal_length_code_msg, %ecx
1202 movl $INFLATE_MODE_BAD, %edx
1203 jmp .L_update_stream_state
1204
1205.L_invalid_distance_too_far:
1206 /* strm->msg = "invalid distance too far back";
1207 * state->mode = BAD;
1208 */
1209 movl in(%esp), in_r /* from_r has in's reg, put in back */
1210 movl $.L_invalid_distance_too_far_msg, %ecx
1211 movl $INFLATE_MODE_BAD, %edx
1212 jmp .L_update_stream_state
1213
1214.L_update_stream_state:
1215 /* set strm->msg = %ecx, strm->state->mode = %edx */
1216 movl strm_sp(%esp), %eax
1217 testl %ecx, %ecx /* if (msg != NULL) */
1218 jz .L_skip_msg
1219 movl %ecx, msg_strm(%eax) /* strm->msg = msg */
1220.L_skip_msg:
1221 movl state_strm(%eax), %eax /* state = strm->state */
1222 movl %edx, mode_state(%eax) /* state->mode = edx (BAD | TYPE) */
1223 jmp .L_break_loop
1224
1225.align 32,0x90
1226.L_break_loop:
1227
1228/*
1229 * Regs:
1230 *
1231 * bits = %ebp when mmx, and in %ebx when non-mmx
1232 * hold = %hold_mm when mmx, and in %ebp when non-mmx
1233 * in = %esi
1234 * out = %edi
1235 */
1236
1237#if defined( USE_MMX ) || defined( RUN_TIME_MMX )
1238
1239#if defined( RUN_TIME_MMX )
1240
1241 cmpl $DO_USE_MMX, inflate_fast_use_mmx
1242 jne .L_update_next_in
1243
1244#endif /* RUN_TIME_MMX */
1245
1246 movl %ebp, %ebx
1247
1248.L_update_next_in:
1249
1250#endif
1251
1252#define strm_r %eax
1253#define state_r %edx
1254
1255 /* len = bits >> 3;
1256 * in -= len;
1257 * bits -= len << 3;
1258 * hold &= (1U << bits) - 1;
1259 * state->hold = hold;
1260 * state->bits = bits;
1261 * strm->next_in = in;
1262 * strm->next_out = out;
1263 */
1264 movl strm_sp(%esp), strm_r
1265 movl %ebx, %ecx
1266 movl state_strm(strm_r), state_r
1267 shrl $3, %ecx
1268 subl %ecx, in_r
1269 shll $3, %ecx
1270 subl %ecx, %ebx
1271 movl out_r, next_out_strm(strm_r)
1272 movl %ebx, bits_state(state_r)
1273 movl %ebx, %ecx
1274
1275 leal buf(%esp), %ebx
1276 cmpl %ebx, last(%esp)
1277 jne .L_buf_not_used /* if buf != last */
1278
1279 subl %ebx, in_r /* in -= buf */
1280 movl next_in_strm(strm_r), %ebx
1281 movl %ebx, last(%esp) /* last = strm->next_in */
1282 addl %ebx, in_r /* in += strm->next_in */
1283 movl avail_in_strm(strm_r), %ebx
1284 subl $11, %ebx
1285 addl %ebx, last(%esp) /* last = &strm->next_in[ avail_in - 11 ] */
1286
1287.L_buf_not_used:
1288 movl in_r, next_in_strm(strm_r)
1289
1290 movl $1, %ebx
1291 shll %cl, %ebx
1292 decl %ebx
1293
1294#if defined( USE_MMX ) || defined( RUN_TIME_MMX )
1295
1296#if defined( RUN_TIME_MMX )
1297
1298 cmpl $DO_USE_MMX, inflate_fast_use_mmx
1299 jne .L_update_hold
1300
1301#endif /* RUN_TIME_MMX */
1302
1303 psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
1304 movd hold_mm, %ebp
1305
1306 emms
1307
1308.L_update_hold:
1309
1310#endif /* USE_MMX || RUN_TIME_MMX */
1311
1312 andl %ebx, %ebp
1313 movl %ebp, hold_state(state_r)
1314
1315#define last_r %ebx
1316
1317 /* strm->avail_in = in < last ? 11 + (last - in) : 11 - (in - last) */
1318 movl last(%esp), last_r
1319 cmpl in_r, last_r
1320 jbe .L_last_is_smaller /* if (in >= last) */
1321
1322 subl in_r, last_r /* last -= in */
1323 addl $11, last_r /* last += 11 */
1324 movl last_r, avail_in_strm(strm_r)
1325 jmp .L_fixup_out
1326.L_last_is_smaller:
1327 subl last_r, in_r /* in -= last */
1328 negl in_r /* in = -in */
1329 addl $11, in_r /* in += 11 */
1330 movl in_r, avail_in_strm(strm_r)
1331
1332#undef last_r
1333#define end_r %ebx
1334
1335.L_fixup_out:
1336 /* strm->avail_out = out < end ? 257 + (end - out) : 257 - (out - end)*/
1337 movl end(%esp), end_r
1338 cmpl out_r, end_r
1339 jbe .L_end_is_smaller /* if (out >= end) */
1340
1341 subl out_r, end_r /* end -= out */
1342 addl $257, end_r /* end += 257 */
1343 movl end_r, avail_out_strm(strm_r)
1344 jmp .L_done
1345.L_end_is_smaller:
1346 subl end_r, out_r /* out -= end */
1347 negl out_r /* out = -out */
1348 addl $257, out_r /* out += 257 */
1349 movl out_r, avail_out_strm(strm_r)
1350
1351#undef end_r
1352#undef strm_r
1353#undef state_r
1354
1355.L_done:
1356 addl $local_var_size, %esp
1357 popf
1358 popl %ebx
1359 popl %ebp
1360 popl %esi
1361 popl %edi
1362 ret
1363
1364#if defined( GAS_ELF )
1365/* elf info */
1366.type inflate_fast,@function
1367.size inflate_fast,.-inflate_fast
1368#endif
diff --git a/contrib/masmx64/bld_ml64.bat b/contrib/masmx64/bld_ml64.bat
deleted file mode 100644
index 8f9343d..0000000
--- a/contrib/masmx64/bld_ml64.bat
+++ /dev/null
@@ -1,2 +0,0 @@
1ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
2ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
diff --git a/contrib/masmx64/gvmat64.asm b/contrib/masmx64/gvmat64.asm
deleted file mode 100644
index 9879c28..0000000
--- a/contrib/masmx64/gvmat64.asm
+++ /dev/null
@@ -1,553 +0,0 @@
1;uInt longest_match_x64(
2; deflate_state *s,
3; IPos cur_match); /* current match */
4
5; gvmat64.asm -- Asm portion of the optimized longest_match for 64-bit x86_64
6; (AMD64 on Athlon 64, Opteron, Phenom
7; and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
8; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
9;
10; File written by Gilles Vollant, by converting to assembly the longest_match
11; from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
12;
13; and by taking inspiration from asm686 with masm, optimised assembly code
14; from Brian Raiter, written 1998
15;
16; This software is provided 'as-is', without any express or implied
17; warranty. In no event will the authors be held liable for any damages
18; arising from the use of this software.
19;
20; Permission is granted to anyone to use this software for any purpose,
21; including commercial applications, and to alter it and redistribute it
22; freely, subject to the following restrictions:
23;
24; 1. The origin of this software must not be misrepresented; you must not
25; claim that you wrote the original software. If you use this software
26; in a product, an acknowledgment in the product documentation would be
27; appreciated but is not required.
28; 2. Altered source versions must be plainly marked as such, and must not be
29; misrepresented as being the original software
30; 3. This notice may not be removed or altered from any source distribution.
31;
32;
33;
34; http://www.zlib.net
35; http://www.winimage.com/zLibDll
36; http://www.muppetlabs.com/~breadbox/software/assembly.html
37;
38; to compile this file for infozip Zip, I use option:
39; ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm
40;
41; to compile this file for zLib, I use option:
42; ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
43; Be careful to adapt zlib1222add below to your version of zLib
44; (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change
45; value of zlib1222add later)
46;
47; This file compiles with Microsoft Macro Assembler (x64) for AMD64
48;
49; ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK
50;
51; (you can get Windows WDK with ml64 for AMD64 from
52; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
53;
54
55
56;uInt longest_match(s, cur_match)
57; deflate_state *s;
58; IPos cur_match; /* current match */
59.code
60longest_match PROC
61
62
63;LocalVarsSize equ 88
64 LocalVarsSize equ 72
65
66; register used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12
67; free register : r14,r15
68; register can be saved : rsp
69
70 chainlenwmask equ rsp + 8 - LocalVarsSize ; high word: current chain len
71 ; low word: s->wmask
72;window equ rsp + xx - LocalVarsSize ; local copy of s->window ; stored in r10
73;windowbestlen equ rsp + xx - LocalVarsSize ; s->window + bestlen , use r10+r11
74;scanstart equ rsp + xx - LocalVarsSize ; first two bytes of string ; stored in r12w
75;scanend equ rsp + xx - LocalVarsSize ; last two bytes of string use ebx
76;scanalign equ rsp + xx - LocalVarsSize ; dword-misalignment of string r13
77;bestlen equ rsp + xx - LocalVarsSize ; size of best match so far -> r11d
78;scan equ rsp + xx - LocalVarsSize ; ptr to string wanting match -> r9
79IFDEF INFOZIP
80ELSE
81 nicematch equ (rsp + 16 - LocalVarsSize) ; a good enough match size
82ENDIF
83
84save_rdi equ rsp + 24 - LocalVarsSize
85save_rsi equ rsp + 32 - LocalVarsSize
86save_rbx equ rsp + 40 - LocalVarsSize
87save_rbp equ rsp + 48 - LocalVarsSize
88save_r12 equ rsp + 56 - LocalVarsSize
89save_r13 equ rsp + 64 - LocalVarsSize
90;save_r14 equ rsp + 72 - LocalVarsSize
91;save_r15 equ rsp + 80 - LocalVarsSize
92
93
94; summary of register usage
95; scanend ebx
96; scanendw bx
97; chainlenwmask edx
98; curmatch rsi
99; curmatchd esi
100; windowbestlen r8
101; scanalign r9
102; scanalignd r9d
103; window r10
104; bestlen r11
105; bestlend r11d
106; scanstart r12d
107; scanstartw r12w
108; scan r13
109; nicematch r14d
110; limit r15
111; limitd r15d
112; prev rcx
113
114; all the +4 offsets are due to the addition of pending_buf_size (in zlib's
115; deflate_state structure) since the asm code was first written
116; (if you compile with zlib 1.0.4 or older, remove the +4).
117; Note: these values are good with an 8-byte structure packing boundary
118
119
120 MAX_MATCH equ 258
121 MIN_MATCH equ 3
122 MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1)
123
124
125;;; Offsets for fields in the deflate_state structure. These numbers
126;;; are calculated from the definition of deflate_state, with the
127;;; assumption that the compiler will dword-align the fields. (Thus,
128;;; changing the definition of deflate_state could easily cause this
129;;; program to crash horribly, without so much as a warning at
130;;; compile time. Sigh.)
131
132; all the +zlib1222add offsets are due to the addition of fields
133; in zlib in the deflate_state structure since the asm code was first written
134; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
135; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
136; (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").
137
138
139IFDEF INFOZIP
140
141_DATA SEGMENT
142COMM window_size:DWORD
143; WMask ; 7fff
144COMM window:BYTE:010040H
145COMM prev:WORD:08000H
146; MatchLen : unused
147; PrevMatch : unused
148COMM strstart:DWORD
149COMM match_start:DWORD
150; Lookahead : ignore
151COMM prev_length:DWORD ; PrevLen
152COMM max_chain_length:DWORD
153COMM good_match:DWORD
154COMM nice_match:DWORD
155prev_ad equ OFFSET prev
156window_ad equ OFFSET window
157nicematch equ nice_match
158_DATA ENDS
159WMask equ 07fffh
160
161ELSE
162
163 IFNDEF zlib1222add
164 zlib1222add equ 8
165 ENDIF
166dsWSize equ 56+zlib1222add+(zlib1222add/2)
167dsWMask equ 64+zlib1222add+(zlib1222add/2)
168dsWindow equ 72+zlib1222add
169dsPrev equ 88+zlib1222add
170dsMatchLen equ 128+zlib1222add
171dsPrevMatch equ 132+zlib1222add
172dsStrStart equ 140+zlib1222add
173dsMatchStart equ 144+zlib1222add
174dsLookahead equ 148+zlib1222add
175dsPrevLen equ 152+zlib1222add
176dsMaxChainLen equ 156+zlib1222add
177dsGoodMatch equ 172+zlib1222add
178dsNiceMatch equ 176+zlib1222add
179
180window_size equ [ rcx + dsWSize]
181WMask equ [ rcx + dsWMask]
182window_ad equ [ rcx + dsWindow]
183prev_ad equ [ rcx + dsPrev]
184strstart equ [ rcx + dsStrStart]
185match_start equ [ rcx + dsMatchStart]
186Lookahead equ [ rcx + dsLookahead] ; 0ffffffffh on infozip
187prev_length equ [ rcx + dsPrevLen]
188max_chain_length equ [ rcx + dsMaxChainLen]
189good_match equ [ rcx + dsGoodMatch]
190nice_match equ [ rcx + dsNiceMatch]
191ENDIF
192
193; parameter 1 in r8(deflate state s), param 2 in rdx (cur match)
194
195; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
196; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
197;
198; All registers must be preserved across the call, except for
199; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
200
201
202
203;;; Save registers that the compiler may be using, and adjust esp to
204;;; make room for our stack frame.
205
206
207;;; Retrieve the function arguments. r8d will hold cur_match
208;;; throughout the entire function. edx will hold the pointer to the
209;;; deflate_state structure during the function's setup (before
210;;; entering the main loop).
211
212; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
213
214; this clears the high 32 bits of r8, which can be garbage in both r8 and rdx
215
216 mov [save_rdi],rdi
217 mov [save_rsi],rsi
218 mov [save_rbx],rbx
219 mov [save_rbp],rbp
220IFDEF INFOZIP
221 mov r8d,ecx
222ELSE
223 mov r8d,edx
224ENDIF
225 mov [save_r12],r12
226 mov [save_r13],r13
227; mov [save_r14],r14
228; mov [save_r15],r15
229
230
231;;; uInt wmask = s->w_mask;
232;;; unsigned chain_length = s->max_chain_length;
233;;; if (s->prev_length >= s->good_match) {
234;;; chain_length >>= 2;
235;;; }
236
237 mov edi, prev_length
238 mov esi, good_match
239 mov eax, WMask
240 mov ebx, max_chain_length
241 cmp edi, esi
242 jl LastMatchGood
243 shr ebx, 2
244LastMatchGood:
245
246;;; chainlen is decremented once beforehand so that the function can
247;;; use the sign flag instead of the zero flag for the exit test.
248;;; It is then shifted into the high word, to make room for the wmask
249;;; value, which it will always accompany.
250
251 dec ebx
252 shl ebx, 16
253 or ebx, eax
254
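(Editor's note: in C terms the packing just described, together with the bookkeeping repeated in each LookupLoop body below, amounts to the following; `prev`, `cur_match` and `limit` are as in deflate.c. Editor's sketch, not part of the original file.)

    /* chain_length - 1 lives in the high 16 bits, wmask in the low 16, so a
     * single 0x10000 subtraction counts the chain down while leaving the mask
     * intact, and the sign flag doubles as the exit test (js LeaveNow). */
    static unsigned walk_chain(const unsigned short *prev, unsigned cur_match,
                               unsigned limit, unsigned chain_length,
                               unsigned wmask)
    {
        unsigned chainlenwmask = ((chain_length - 1) << 16) | wmask;

        for (;;) {
            /* the real loop compares scan_start/scan_end at window+cur_match
             * here and records a new best match when they agree */
            cur_match = prev[cur_match & (chainlenwmask & 0xffffu)];
            if (cur_match <= limit)
                break;                     /* fell below the valid window */
            chainlenwmask -= 0x10000;      /* --chain_length, wmask untouched */
            if ((int)chainlenwmask < 0)    /* sign test, as in the asm */
                break;
        }
        return cur_match;
    }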
255;;; on zlib only
256;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
257
258IFDEF INFOZIP
259 mov [chainlenwmask], ebx
260; on infozip nice_match = [nice_match]
261ELSE
262 mov eax, nice_match
263 mov [chainlenwmask], ebx
264 mov r10d, Lookahead
265 cmp r10d, eax
266 cmovnl r10d, eax
267 mov [nicematch],r10d
268ENDIF
269
270;;; register Bytef *scan = s->window + s->strstart;
271 mov r10, window_ad
272 mov ebp, strstart
273 lea r13, [r10 + rbp]
274
275;;; Determine how many bytes the scan ptr is off from being
276;;; dword-aligned.
277
278 mov r9,r13
279 neg r13
280 and r13,3
281
282;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
283;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
284IFDEF INFOZIP
285 mov eax,07efah ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) (0x8000-(3+8+1))
286ELSE
287 mov eax, window_size
288 sub eax, MIN_LOOKAHEAD
289ENDIF
290 xor edi,edi
291 sub ebp, eax
292
293 mov r11d, prev_length
294
295 cmovng ebp,edi
296
297;;; int best_len = s->prev_length;
298
299
300;;; Store the sum of s->window + best_len in rsi locally.
301
302 lea rsi,[r10+r11]
303
304;;; register ush scan_start = *(ushf*)scan;
305;;; register ush scan_end = *(ushf*)(scan+best_len-1);
306;;; Posf *prev = s->prev;
307
308 movzx r12d,word ptr [r9]
309 movzx ebx, word ptr [r9 + r11 - 1]
310
311 mov rdi, prev_ad
312
313;;; Jump into the main loop.
314
315 mov edx, [chainlenwmask]
316
317 cmp bx,word ptr [rsi + r8 - 1]
318 jz LookupLoopIsZero
319
320LookupLoop1:
321 and r8d, edx
322
323 movzx r8d, word ptr [rdi + r8*2]
324 cmp r8d, ebp
325 jbe LeaveNow
326 sub edx, 00010000h
327 js LeaveNow
328
329LoopEntry1:
330 cmp bx,word ptr [rsi + r8 - 1]
331 jz LookupLoopIsZero
332
333LookupLoop2:
334 and r8d, edx
335
336 movzx r8d, word ptr [rdi + r8*2]
337 cmp r8d, ebp
338 jbe LeaveNow
339 sub edx, 00010000h
340 js LeaveNow
341
342LoopEntry2:
343 cmp bx,word ptr [rsi + r8 - 1]
344 jz LookupLoopIsZero
345
346LookupLoop4:
347 and r8d, edx
348
349 movzx r8d, word ptr [rdi + r8*2]
350 cmp r8d, ebp
351 jbe LeaveNow
352 sub edx, 00010000h
353 js LeaveNow
354
355LoopEntry4:
356
357 cmp bx,word ptr [rsi + r8 - 1]
358 jnz LookupLoop1
359 jmp LookupLoopIsZero
360
361
362;;; do {
363;;; match = s->window + cur_match;
364;;; if (*(ushf*)(match+best_len-1) != scan_end ||
365;;; *(ushf*)match != scan_start) continue;
366;;; [...]
367;;; } while ((cur_match = prev[cur_match & wmask]) > limit
368;;; && --chain_length != 0);
369;;;
370;;; Here is the inner loop of the function. The function will spend the
371;;; majority of its time in this loop, and majority of that time will
372;;; be spent in the first ten instructions.
373;;;
374;;; Within this loop:
375;;; ebx = scanend
376;;; r8d = curmatch
377;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
378;;; esi = windowbestlen - i.e., (window + bestlen)
379;;; edi = prev
380;;; ebp = limit
381
382LookupLoop:
383 and r8d, edx
384
385 movzx r8d, word ptr [rdi + r8*2]
386 cmp r8d, ebp
387 jbe LeaveNow
388 sub edx, 00010000h
389 js LeaveNow
390
391LoopEntry:
392
393 cmp bx,word ptr [rsi + r8 - 1]
394 jnz LookupLoop1
395LookupLoopIsZero:
396 cmp r12w, word ptr [r10 + r8]
397 jnz LookupLoop1
398
399
400;;; Store the current value of chainlen.
401 mov [chainlenwmask], edx
402
403;;; Point rdi to the string under scrutiny, and rsi to the string we
404;;; are hoping to match it up with. In actuality, rsi and rdi are
405;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and rdx is
406;;; initialized to -(MAX_MATCH_8 - scanalign).
407
408 lea rsi,[r8+r10]
409 mov rdx, 0fffffffffffffef8h; -(MAX_MATCH_8)
410 lea rsi, [rsi + r13 + 0108h] ;MAX_MATCH_8]
411 lea rdi, [r9 + r13 + 0108h] ;MAX_MATCH_8]
412
413 prefetcht1 [rsi+rdx]
414 prefetcht1 [rdi+rdx]
415
416
417;;; Test the strings for equality, 8 bytes at a time. At the end,
418;;; adjust rdx so that it is offset to the exact byte that mismatched.
419;;;
420;;; We already know at this point that the first three bytes of the
421;;; strings match each other, and they can be safely passed over before
422;;; starting the compare loop. So what this code does is skip over 0-3
423;;; bytes, as much as necessary in order to dword-align the rdi
424;;; pointer. (rsi will still be misaligned three times out of four.)
425;;;
426;;; It should be confessed that this loop usually does not represent
427;;; much of the total running time. Replacing it with a more
428;;; straightforward "rep cmpsb" would not drastically degrade
429;;; performance.
430
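
As a rough C rendering of the comparison strategy described above (illustrative only, not zlib source; little-endian byte order assumed, as on x86; the function name is hypothetical):

    #include <string.h>

    /* Compare two byte strings 8 bytes at a time and return the number of
       matching leading bytes, up to max.  The XOR of two equal qwords is
       zero; the first nonzero XOR locates the mismatching byte, which on a
       little-endian machine is its lowest-order nonzero byte. */
    static int match_length(const unsigned char *scan, const unsigned char *match, int max)
    {
        int i = 0;
        while (i + 8 <= max) {
            unsigned long long a, b;
            memcpy(&a, scan + i, 8);
            memcpy(&b, match + i, 8);
            if (a != b) {
                unsigned long long d = a ^ b;
                while ((d & 0xFF) == 0) { d >>= 8; i++; }
                return i;
            }
            i += 8;
        }
        return i;
    }
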
431
432LoopCmps:
433 mov rax, [rsi + rdx]
434 xor rax, [rdi + rdx]
435 jnz LeaveLoopCmps
436
437 mov rax, [rsi + rdx + 8]
438 xor rax, [rdi + rdx + 8]
439 jnz LeaveLoopCmps8
440
441
442 mov rax, [rsi + rdx + 8+8]
443 xor rax, [rdi + rdx + 8+8]
444 jnz LeaveLoopCmps16
445
446 add rdx,8+8+8
447
448 jnz short LoopCmps
449 jmp short LenMaximum
450LeaveLoopCmps16: add rdx,8
451LeaveLoopCmps8: add rdx,8
452LeaveLoopCmps:
453
454 test eax, 0000FFFFh
455 jnz LenLower
456
457 test eax,0ffffffffh
458
459 jnz LenLower32
460
461 add rdx,4
462 shr rax,32
463 or ax,ax
464 jnz LenLower
465
466LenLower32:
467 shr eax,16
468 add rdx,2
469LenLower: sub al, 1
470 adc rdx, 0
471;;; Calculate the length of the match. If it is longer than MAX_MATCH,
472;;; then automatically accept it as the best possible match and leave.
473
474 lea rax, [rdi + rdx]
475 sub rax, r9
476 cmp eax, MAX_MATCH
477 jge LenMaximum
478
479;;; If the length of the match is not longer than the best match we
480;;; have so far, then forget it and return to the lookup loop.
481;///////////////////////////////////
482
483 cmp eax, r11d
484 jg LongerMatch
485
486 lea rsi,[r10+r11]
487
488 mov rdi, prev_ad
489 mov edx, [chainlenwmask]
490 jmp LookupLoop
491
492;;; s->match_start = cur_match;
493;;; best_len = len;
494;;; if (len >= nice_match) break;
495;;; scan_end = *(ushf*)(scan+best_len-1);
496
497LongerMatch:
498 mov r11d, eax
499 mov match_start, r8d
500 cmp eax, [nicematch]
501 jge LeaveNow
502
503 lea rsi,[r10+rax]
504
505 movzx ebx, word ptr [r9 + rax - 1]
506 mov rdi, prev_ad
507 mov edx, [chainlenwmask]
508 jmp LookupLoop
509
510;;; Accept the current string, with the maximum possible length.
511
512LenMaximum:
513 mov r11d,MAX_MATCH
514 mov match_start, r8d
515
516;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
517;;; return s->lookahead;
518
519LeaveNow:
520IFDEF INFOZIP
521 mov eax,r11d
522ELSE
523 mov eax, Lookahead
524 cmp r11d, eax
525 cmovng eax, r11d
526ENDIF
527
528;;; Restore the stack and return from whence we came.
529
530
531 mov rsi,[save_rsi]
532 mov rdi,[save_rdi]
533 mov rbx,[save_rbx]
534 mov rbp,[save_rbp]
535 mov r12,[save_r12]
536 mov r13,[save_r13]
537; mov r14,[save_r14]
538; mov r15,[save_r15]
539
540
541 ret 0
542; please don't remove this string !
543; You can freely use gvmat64 in any free or commercial app,
544; but it is far better not to remove the string in the binary!
545 db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
546longest_match ENDP
547
548match_init PROC
549 ret 0
550match_init ENDP
551
552
553END
diff --git a/contrib/masmx64/inffas8664.c b/contrib/masmx64/inffas8664.c
deleted file mode 100644
index e8af06f..0000000
--- a/contrib/masmx64/inffas8664.c
+++ /dev/null
@@ -1,186 +0,0 @@
1/* inffas8664.c is a hand tuned assembler version of inffast.c - fast decoding
2 * version for AMD64 on Windows using Microsoft C compiler
3 *
4 * Copyright (C) 1995-2003 Mark Adler
5 * For conditions of distribution and use, see copyright notice in zlib.h
6 *
7 * Copyright (C) 2003 Chris Anderson <christop@charm.net>
8 * Please use the copyright conditions above.
9 *
10 * 2005 - Adaptation to Microsoft C Compiler for AMD64 by Gilles Vollant
11 *
12 * inffas8664.c calls the function inffas8664fnc in inffasx64.asm
13 * inffasx64.asm is automatically converted from the AMD64 portion of inffas86.c
14 *
15 * Dec-29-2003 -- I added AMD64 inflate asm support. This version is also
16 * slightly quicker on x86 systems because, instead of using rep movsb to copy
17 * data, it uses rep movsw, which moves data in 2-byte chunks instead of single
18 * bytes. I've tested the AMD64 code on a Fedora Core 1 + the x86_64 updates
19 * from http://fedora.linux.duke.edu/fc1_x86_64
20 * which is running on an Athlon 64 3000+ / Gigabyte GA-K8VT800M system with
21 * 1GB ram. The 64-bit version is about 4% faster than the 32-bit version,
22 * when decompressing mozilla-source-1.3.tar.gz.
23 *
24 * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from
25 * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at
26 * the moment. I have successfully compiled and tested this code with gcc2.96,
27 * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S
28 * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX
29 * enabled. I will attempt to merge the MMX code into this version. Newer
30 * versions of this and inffast.S can be found at
31 * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/
32 *
33 */
34
35#include <stdio.h>
36#include "zutil.h"
37#include "inftrees.h"
38#include "inflate.h"
39#include "inffast.h"
40
41/* Mark Adler's comments from inffast.c: */
42
43/*
44 Decode literal, length, and distance codes and write out the resulting
45 literal and match bytes until either not enough input or output is
46 available, an end-of-block is encountered, or a data error is encountered.
47 When large enough input and output buffers are supplied to inflate(), for
48 example, a 16K input buffer and a 64K output buffer, more than 95% of the
49 inflate execution time is spent in this routine.
50
51 Entry assumptions:
52
53 state->mode == LEN
54 strm->avail_in >= 6
55 strm->avail_out >= 258
56 start >= strm->avail_out
57 state->bits < 8
58
59 On return, state->mode is one of:
60
61 LEN -- ran out of enough output space or enough available input
62 TYPE -- reached end of block code, inflate() to interpret next block
63 BAD -- error in block data
64
65 Notes:
66
67 - The maximum input bits used by a length/distance pair is 15 bits for the
68 length code, 5 bits for the length extra, 15 bits for the distance code,
69 and 13 bits for the distance extra. This totals 48 bits, or six bytes.
70 Therefore if strm->avail_in >= 6, then there is enough input to avoid
71 checking for available input while decoding.
72
73 - The maximum bytes that a single length/distance pair can output is 258
74 bytes, which is the maximum length that can be coded. inflate_fast()
75 requires strm->avail_out >= 258 for each loop to avoid checking for
76 output space.
77 */
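
The arithmetic behind the avail_in >= 6 entry assumption quoted above can be spelled out as compile-time constants; a minimal sketch, not part of zlib, with hypothetical names:

    /* 15 (length code) + 5 (length extra) + 15 (distance code)
       + 13 (distance extra) = 48 bits, i.e. 6 bytes per length/distance
       pair, and a single pair can emit at most 258 bytes of output. */
    enum { MAX_PAIR_BITS  = 15 + 5 + 15 + 13 };          /* 48 */
    enum { MAX_PAIR_BYTES = (MAX_PAIR_BITS + 7) / 8 };   /* 6  */
    enum { MAX_PAIR_OUT   = 258 };
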
78
79
80
81 typedef struct inffast_ar {
82/* 64 32 x86 x86_64 */
83/* ar offset register */
84/* 0 0 */ void *esp; /* esp save */
85/* 8 4 */ void *ebp; /* ebp save */
86/* 16 8 */ unsigned char FAR *in; /* esi rsi local strm->next_in */
87/* 24 12 */ unsigned char FAR *last; /* r9 while in < last */
88/* 32 16 */ unsigned char FAR *out; /* edi rdi local strm->next_out */
89/* 40 20 */ unsigned char FAR *beg; /* inflate()'s init next_out */
90/* 48 24 */ unsigned char FAR *end; /* r10 while out < end */
91/* 56 28 */ unsigned char FAR *window;/* allocated sliding window, if wsize != 0 */
92/* 64 32 */ code const FAR *lcode; /* ebp rbp local strm->lencode */
93/* 72 36 */ code const FAR *dcode; /* r11 local strm->distcode */
94/* 80 40 */ size_t /*unsigned long */hold; /* edx rdx local strm->hold */
95/* 88 44 */ unsigned bits; /* ebx rbx local strm->bits */
96/* 92 48 */ unsigned wsize; /* window size */
97/* 96 52 */ unsigned write; /* window write index */
98/*100 56 */ unsigned lmask; /* r12 mask for lcode */
99/*104 60 */ unsigned dmask; /* r13 mask for dcode */
100/*108 64 */ unsigned len; /* r14 match length */
101/*112 68 */ unsigned dist; /* r15 match distance */
102/*116 72 */ unsigned status; /* set when state chng*/
103 } type_ar;
104#ifdef ASMINF
105
106void inflate_fast(strm, start)
107z_streamp strm;
108unsigned start; /* inflate()'s starting value for strm->avail_out */
109{
110 struct inflate_state FAR *state;
111 type_ar ar;
112 void inffas8664fnc(struct inffast_ar * par);
113
114
115
116#if (defined( __GNUC__ ) && defined( __amd64__ ) && ! defined( __i386 )) || (defined(_MSC_VER) && defined(_M_AMD64))
117#define PAD_AVAIL_IN 6
118#define PAD_AVAIL_OUT 258
119#else
120#define PAD_AVAIL_IN 5
121#define PAD_AVAIL_OUT 257
122#endif
123
124 /* copy state to local variables */
125 state = (struct inflate_state FAR *)strm->state;
126
127 ar.in = strm->next_in;
128 ar.last = ar.in + (strm->avail_in - PAD_AVAIL_IN);
129 ar.out = strm->next_out;
130 ar.beg = ar.out - (start - strm->avail_out);
131 ar.end = ar.out + (strm->avail_out - PAD_AVAIL_OUT);
132 ar.wsize = state->wsize;
133 ar.write = state->wnext;
134 ar.window = state->window;
135 ar.hold = state->hold;
136 ar.bits = state->bits;
137 ar.lcode = state->lencode;
138 ar.dcode = state->distcode;
139 ar.lmask = (1U << state->lenbits) - 1;
140 ar.dmask = (1U << state->distbits) - 1;
141
142 /* decode literals and length/distances until end-of-block or not enough
143 input data or output space */
144
145 /* align in on 1/2 hold size boundary */
146 while (((size_t)(void *)ar.in & (sizeof(ar.hold) / 2 - 1)) != 0) {
147 ar.hold += (unsigned long)*ar.in++ << ar.bits;
148 ar.bits += 8;
149 }
150
151 inffas8664fnc(&ar);
152
153 if (ar.status > 1) {
154 if (ar.status == 2)
155 strm->msg = "invalid literal/length code";
156 else if (ar.status == 3)
157 strm->msg = "invalid distance code";
158 else
159 strm->msg = "invalid distance too far back";
160 state->mode = BAD;
161 }
162 else if ( ar.status == 1 ) {
163 state->mode = TYPE;
164 }
165
166 /* return unused bytes (on entry, bits < 8, so in won't go too far back) */
167 ar.len = ar.bits >> 3;
168 ar.in -= ar.len;
169 ar.bits -= ar.len << 3;
170 ar.hold &= (1U << ar.bits) - 1;
171
172 /* update state and return */
173 strm->next_in = ar.in;
174 strm->next_out = ar.out;
175 strm->avail_in = (unsigned)(ar.in < ar.last ?
176 PAD_AVAIL_IN + (ar.last - ar.in) :
177 PAD_AVAIL_IN - (ar.in - ar.last));
178 strm->avail_out = (unsigned)(ar.out < ar.end ?
179 PAD_AVAIL_OUT + (ar.end - ar.out) :
180 PAD_AVAIL_OUT - (ar.out - ar.end));
181 state->hold = (unsigned long)ar.hold;
182 state->bits = ar.bits;
183 return;
184}
185
186#endif
diff --git a/contrib/masmx64/inffasx64.asm b/contrib/masmx64/inffasx64.asm
deleted file mode 100644
index 60a8d89..0000000
--- a/contrib/masmx64/inffasx64.asm
+++ /dev/null
@@ -1,396 +0,0 @@
1; inffasx64.asm is a hand tuned assembler version of inffast.c - fast decoding
2; version for AMD64 on Windows using Microsoft C compiler
3;
4; inffasx64.asm is automatically converted from the AMD64 portion of inffas86.c
5; inffasx64.asm is called by inffas8664.c, which contains more info.
6
7
8; to compile this file, I use option
9; ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
10; with Microsoft Macro Assembler (x64) for AMD64
11;
12
13; This file compiles with Microsoft Macro Assembler (x64) for AMD64
14;
15; ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK
16;
17; (you can get Windows WDK with ml64 for AMD64 from
18; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for a low price)
19;
20
21
22.code
23inffas8664fnc PROC
24
25; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
26; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
27;
28; All registers must be preserved across the call, except for
29; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
30
31
32 mov [rsp-8],rsi
33 mov [rsp-16],rdi
34 mov [rsp-24],r12
35 mov [rsp-32],r13
36 mov [rsp-40],r14
37 mov [rsp-48],r15
38 mov [rsp-56],rbx
39
40 mov rax,rcx
41
42 mov [rax+8], rbp ; /* save regs rbp and rsp */
43 mov [rax], rsp
44
45 mov rsp, rax ; /* make rsp point to &ar */
46
47 mov rsi, [rsp+16] ; /* rsi = in */
48 mov rdi, [rsp+32] ; /* rdi = out */
49 mov r9, [rsp+24] ; /* r9 = last */
50 mov r10, [rsp+48] ; /* r10 = end */
51 mov rbp, [rsp+64] ; /* rbp = lcode */
52 mov r11, [rsp+72] ; /* r11 = dcode */
53 mov rdx, [rsp+80] ; /* rdx = hold */
54 mov ebx, [rsp+88] ; /* ebx = bits */
55 mov r12d, [rsp+100] ; /* r12d = lmask */
56 mov r13d, [rsp+104] ; /* r13d = dmask */
57 ; /* r14d = len */
58 ; /* r15d = dist */
59
60
61 cld
62 cmp r10, rdi
63 je L_one_time ; /* if only one decode left */
64 cmp r9, rsi
65
66 jne L_do_loop
67
68
69L_one_time:
70 mov r8, r12 ; /* r8 = lmask */
71 cmp bl, 32
72 ja L_get_length_code_one_time
73
74 lodsd ; /* eax = *(uint *)in++ */
75 mov cl, bl ; /* cl = bits, needs it for shifting */
76 add bl, 32 ; /* bits += 32 */
77 shl rax, cl
78 or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
79 jmp L_get_length_code_one_time
80
81ALIGN 4
82L_while_test:
83 cmp r10, rdi
84 jbe L_break_loop
85 cmp r9, rsi
86 jbe L_break_loop
87
88L_do_loop:
89 mov r8, r12 ; /* r8 = lmask */
90 cmp bl, 32
91 ja L_get_length_code ; /* if (32 < bits) */
92
93 lodsd ; /* eax = *(uint *)in++ */
94 mov cl, bl ; /* cl = bits, needs it for shifting */
95 add bl, 32 ; /* bits += 32 */
96 shl rax, cl
97 or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
98
99L_get_length_code:
100 and r8, rdx ; /* r8 &= hold */
101 mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
102
103 mov cl, ah ; /* cl = this.bits */
104 sub bl, ah ; /* bits -= this.bits */
105 shr rdx, cl ; /* hold >>= this.bits */
106
107 test al, al
108 jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
109
110 mov r8, r12 ; /* r8 = lmask */
111 shr eax, 16 ; /* output this.val char */
112 stosb
113
114L_get_length_code_one_time:
115 and r8, rdx ; /* r8 &= hold */
116 mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
117
118L_dolen:
119 mov cl, ah ; /* cl = this.bits */
120 sub bl, ah ; /* bits -= this.bits */
121 shr rdx, cl ; /* hold >>= this.bits */
122
123 test al, al
124 jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
125
126 shr eax, 16 ; /* output this.val char */
127 stosb
128 jmp L_while_test
129
130ALIGN 4
131L_test_for_length_base:
132 mov r14d, eax ; /* len = this */
133 shr r14d, 16 ; /* len = this.val */
134 mov cl, al
135
136 test al, 16
137 jz L_test_for_second_level_length ; /* if ((op & 16) == 0) 8% */
138 and cl, 15 ; /* op &= 15 */
139 jz L_decode_distance ; /* if (!op) */
140
141L_add_bits_to_len:
142 sub bl, cl
143 xor eax, eax
144 inc eax
145 shl eax, cl
146 dec eax
147 and eax, edx ; /* eax &= hold */
148 shr rdx, cl
149 add r14d, eax ; /* len += hold & mask[op] */
150
151L_decode_distance:
152 mov r8, r13 ; /* r8 = dmask */
153 cmp bl, 32
154 ja L_get_distance_code ; /* if (32 < bits) */
155
156 lodsd ; /* eax = *(uint *)in++ */
157 mov cl, bl ; /* cl = bits, needs it for shifting */
158 add bl, 32 ; /* bits += 32 */
159 shl rax, cl
160 or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
161
162L_get_distance_code:
163 and r8, rdx ; /* r8 &= hold */
164 mov eax, [r11+r8*4] ; /* eax = dcode[hold & dmask] */
165
166L_dodist:
167 mov r15d, eax ; /* dist = this */
168 shr r15d, 16 ; /* dist = this.val */
169 mov cl, ah
170 sub bl, ah ; /* bits -= this.bits */
171 shr rdx, cl ; /* hold >>= this.bits */
172 mov cl, al ; /* cl = this.op */
173
174 test al, 16 ; /* if ((op & 16) == 0) */
175 jz L_test_for_second_level_dist
176 and cl, 15 ; /* op &= 15 */
177 jz L_check_dist_one
178
179L_add_bits_to_dist:
180 sub bl, cl
181 xor eax, eax
182 inc eax
183 shl eax, cl
184 dec eax ; /* (1 << op) - 1 */
185 and eax, edx ; /* eax &= hold */
186 shr rdx, cl
187 add r15d, eax ; /* dist += hold & ((1 << op) - 1) */
188
189L_check_window:
190	mov	r8, rsi		; /* save in so from can use its reg */
191 mov rax, rdi
192 sub rax, [rsp+40] ; /* nbytes = out - beg */
193
194 cmp eax, r15d
195 jb L_clip_window ; /* if (dist > nbytes) 4.2% */
196
197 mov ecx, r14d ; /* ecx = len */
198 mov rsi, rdi
199 sub rsi, r15 ; /* from = out - dist */
200
201 sar ecx, 1
202 jnc L_copy_two ; /* if len % 2 == 0 */
203
204 rep movsw
205 mov al, [rsi]
206 mov [rdi], al
207 inc rdi
208
209 mov rsi, r8 ; /* move in back to %rsi, toss from */
210 jmp L_while_test
211
212L_copy_two:
213 rep movsw
214 mov rsi, r8 ; /* move in back to %rsi, toss from */
215 jmp L_while_test
216
217ALIGN 4
218L_check_dist_one:
219 cmp r15d, 1 ; /* if dist 1, is a memset */
220 jne L_check_window
221 cmp [rsp+40], rdi ; /* if out == beg, outside window */
222 je L_check_window
223
224 mov ecx, r14d ; /* ecx = len */
225 mov al, [rdi-1]
226 mov ah, al
227
228 sar ecx, 1
229 jnc L_set_two
230 mov [rdi], al
231 inc rdi
232
233L_set_two:
234 rep stosw
235 jmp L_while_test
236
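
The L_check_dist_one path above handles the distance-1 special case; in C it amounts to a byte fill. An illustrative sketch only (variable names hypothetical, not zlib source):

    /* A match at distance 1 repeats the byte just written, so the copy
       degenerates into a memset of the previous output byte. */
    if (dist == 1 && out != beg) {
        unsigned char c = out[-1];
        do {
            *out++ = c;
        } while (--len);
    }
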
237ALIGN 4
238L_test_for_second_level_length:
239 test al, 64
240 jnz L_test_for_end_of_block ; /* if ((op & 64) != 0) */
241
242 xor eax, eax
243 inc eax
244 shl eax, cl
245 dec eax
246 and eax, edx ; /* eax &= hold */
247 add eax, r14d ; /* eax += len */
248 mov eax, [rbp+rax*4] ; /* eax = lcode[val+(hold&mask[op])]*/
249 jmp L_dolen
250
251ALIGN 4
252L_test_for_second_level_dist:
253 test al, 64
254 jnz L_invalid_distance_code ; /* if ((op & 64) != 0) */
255
256 xor eax, eax
257 inc eax
258 shl eax, cl
259 dec eax
260 and eax, edx ; /* eax &= hold */
261 add eax, r15d ; /* eax += dist */
262 mov eax, [r11+rax*4] ; /* eax = dcode[val+(hold&mask[op])]*/
263 jmp L_dodist
264
265ALIGN 4
266L_clip_window:
267 mov ecx, eax ; /* ecx = nbytes */
268 mov eax, [rsp+92] ; /* eax = wsize, prepare for dist cmp */
269 neg ecx ; /* nbytes = -nbytes */
270
271 cmp eax, r15d
272 jb L_invalid_distance_too_far ; /* if (dist > wsize) */
273
274 add ecx, r15d ; /* nbytes = dist - nbytes */
275 cmp dword ptr [rsp+96], 0
276 jne L_wrap_around_window ; /* if (write != 0) */
277
278 mov rsi, [rsp+56] ; /* from = window */
279 sub eax, ecx ; /* eax -= nbytes */
280 add rsi, rax ; /* from += wsize - nbytes */
281
282 mov eax, r14d ; /* eax = len */
283 cmp r14d, ecx
284 jbe L_do_copy ; /* if (nbytes >= len) */
285
286 sub eax, ecx ; /* eax -= nbytes */
287 rep movsb
288 mov rsi, rdi
289 sub rsi, r15 ; /* from = &out[ -dist ] */
290 jmp L_do_copy
291
292ALIGN 4
293L_wrap_around_window:
294 mov eax, [rsp+96] ; /* eax = write */
295 cmp ecx, eax
296 jbe L_contiguous_in_window ; /* if (write >= nbytes) */
297
298 mov esi, [rsp+92] ; /* from = wsize */
299 add rsi, [rsp+56] ; /* from += window */
300 add rsi, rax ; /* from += write */
301 sub rsi, rcx ; /* from -= nbytes */
302 sub ecx, eax ; /* nbytes -= write */
303
304 mov eax, r14d ; /* eax = len */
305 cmp eax, ecx
306 jbe L_do_copy ; /* if (nbytes >= len) */
307
308 sub eax, ecx ; /* len -= nbytes */
309 rep movsb
310 mov rsi, [rsp+56] ; /* from = window */
311 mov ecx, [rsp+96] ; /* nbytes = write */
312 cmp eax, ecx
313 jbe L_do_copy ; /* if (nbytes >= len) */
314
315 sub eax, ecx ; /* len -= nbytes */
316 rep movsb
317 mov rsi, rdi
318 sub rsi, r15 ; /* from = out - dist */
319 jmp L_do_copy
320
321ALIGN 4
322L_contiguous_in_window:
323 mov rsi, [rsp+56] ; /* rsi = window */
324 add rsi, rax
325 sub rsi, rcx ; /* from += write - nbytes */
326
327 mov eax, r14d ; /* eax = len */
328 cmp eax, ecx
329 jbe L_do_copy ; /* if (nbytes >= len) */
330
331 sub eax, ecx ; /* len -= nbytes */
332 rep movsb
333 mov rsi, rdi
334 sub rsi, r15 ; /* from = out - dist */
335 jmp L_do_copy ; /* if (nbytes >= len) */
336
337ALIGN 4
338L_do_copy:
339 mov ecx, eax ; /* ecx = len */
340 rep movsb
341
342 mov rsi, r8 ; /* move in back to %esi, toss from */
343 jmp L_while_test
344
345L_test_for_end_of_block:
346 test al, 32
347 jz L_invalid_literal_length_code
348 mov dword ptr [rsp+116], 1
349 jmp L_break_loop_with_status
350
351L_invalid_literal_length_code:
352 mov dword ptr [rsp+116], 2
353 jmp L_break_loop_with_status
354
355L_invalid_distance_code:
356 mov dword ptr [rsp+116], 3
357 jmp L_break_loop_with_status
358
359L_invalid_distance_too_far:
360 mov dword ptr [rsp+116], 4
361 jmp L_break_loop_with_status
362
363L_break_loop:
364 mov dword ptr [rsp+116], 0
365
366L_break_loop_with_status:
367; /* put in, out, bits, and hold back into ar and pop esp */
368 mov [rsp+16], rsi ; /* in */
369 mov [rsp+32], rdi ; /* out */
370 mov [rsp+88], ebx ; /* bits */
371 mov [rsp+80], rdx ; /* hold */
372
373 mov rax, [rsp] ; /* restore rbp and rsp */
374 mov rbp, [rsp+8]
375 mov rsp, rax
376
377
378
379 mov rsi,[rsp-8]
380 mov rdi,[rsp-16]
381 mov r12,[rsp-24]
382 mov r13,[rsp-32]
383 mov r14,[rsp-40]
384 mov r15,[rsp-48]
385 mov rbx,[rsp-56]
386
387 ret 0
388; :
389; : "m" (ar)
390; : "memory", "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi",
391; "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
392; );
393
394inffas8664fnc ENDP
395;_TEXT ENDS
396END
diff --git a/contrib/masmx64/readme.txt b/contrib/masmx64/readme.txt
deleted file mode 100644
index 2da6733..0000000
--- a/contrib/masmx64/readme.txt
+++ /dev/null
@@ -1,31 +0,0 @@
1Summary
2-------
3This directory contains ASM implementations of the functions
4longest_match() and inflate_fast() for 64-bit x86 (both AMD64 and Intel EM64T),
5for use with Microsoft Macro Assembler (x64) for AMD64 and 64-bit Microsoft C++.
6
7gvmat64.asm was written by Gilles Vollant (2005), using Brian Raiter's 686/32-bit
8 assembly-optimized version of Jean-loup Gailly's original longest_match function
9
10inffasx64.asm and inffas8664.c were written by Chris Anderson, by optimizing the
11 original function from Mark Adler
12
13Use instructions
14----------------
15Assemble the .asm files using MASM and put the object files into the zlib source
16directory. You can also get object files here:
17
18 http://www.winimage.com/zLibDll/zlib124_masm_obj.zip
19
20Define ASMV and ASMINF in your project. Include inffas8664.c in your source tree,
21and link inffasx64.obj and gvmat64.obj as objects.
22
23
24Build instructions
25------------------
26run bld_ml64.bat with Microsoft Macro Assembler (x64) for AMD64 (ml64.exe)
27
28ml64.exe is provided with Visual Studio 2005 and the Windows 2003 Server DDK
29
30You can get the Windows 2003 Server DDK, with ml64 and cl for AMD64, from
31 http://www.microsoft.com/whdc/devtools/ddk/default.mspx for a low price
diff --git a/contrib/masmx86/bld_ml32.bat b/contrib/masmx86/bld_ml32.bat
deleted file mode 100644
index e1b86bf..0000000
--- a/contrib/masmx86/bld_ml32.bat
+++ /dev/null
@@ -1,2 +0,0 @@
1ml /coff /Zi /c /Flmatch686.lst match686.asm
2ml /coff /Zi /c /Flinffas32.lst inffas32.asm
diff --git a/contrib/masmx86/inffas32.asm b/contrib/masmx86/inffas32.asm
deleted file mode 100644
index 03d20f8..0000000
--- a/contrib/masmx86/inffas32.asm
+++ /dev/null
@@ -1,1080 +0,0 @@
1;/* inffas32.asm is a hand tuned assembler version of inffast.c -- fast decoding
2; *
3; * inffas32.asm is derived from inffas86.c, with translation of the assembly code
4; *
5; * Copyright (C) 1995-2003 Mark Adler
6; * For conditions of distribution and use, see copyright notice in zlib.h
7; *
8; * Copyright (C) 2003 Chris Anderson <christop@charm.net>
9; * Please use the copyright conditions above.
10; *
11; * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from
12; * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at
13; * the moment. I have successfully compiled and tested this code with gcc2.96,
14; * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S
15; * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX
16; * enabled. I will attempt to merge the MMX code into this version. Newer
17; * versions of this and inffast.S can be found at
18; * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/
19; *
20; * 2005 : modification by Gilles Vollant
21; */
22; For Visual C++ 4.x and higher and ML 6.x and higher
23; ml.exe is in directory \MASM611C of Win95 DDK
24; ml.exe is also distributed in http://www.masm32.com/masmdl.htm
25; and in VC++2003 toolkit at http://msdn.microsoft.com/visualc/vctoolkit2003/
26;
27;
28; compile with command line option
29; ml /coff /Zi /c /Flinffas32.lst inffas32.asm
30
31; if you define NO_GZIP (see inflate.h), compile with
32; ml /coff /Zi /c /Flinffas32.lst /DNO_GUNZIP inffas32.asm
33
34
35; zlib1222sup is 0 for zlib 1.2.2.1 and lower
36; zlib1222sup is 8 for zlib 1.2.2.2 and later (with the addition of dmax and head
37; in inflate_state in inflate.h)
38zlib1222sup equ 8
39
40
41IFDEF GUNZIP
42 INFLATE_MODE_TYPE equ 11
43 INFLATE_MODE_BAD equ 26
44ELSE
45 IFNDEF NO_GUNZIP
46 INFLATE_MODE_TYPE equ 11
47 INFLATE_MODE_BAD equ 26
48 ELSE
49 INFLATE_MODE_TYPE equ 3
50 INFLATE_MODE_BAD equ 17
51 ENDIF
52ENDIF
53
54
55; 75 "inffast.S"
56;FILE "inffast.S"
57
58;;;GLOBAL _inflate_fast
59
60;;;SECTION .text
61
62
63
64 .586p
65 .mmx
66
67 name inflate_fast_x86
68 .MODEL FLAT
69
70_DATA segment
71inflate_fast_use_mmx:
72 dd 1
73
74
75_TEXT segment
76
77
78
79ALIGN 4
80 db 'Fast decoding Code from Chris Anderson'
81 db 0
82
83ALIGN 4
84invalid_literal_length_code_msg:
85 db 'invalid literal/length code'
86 db 0
87
88ALIGN 4
89invalid_distance_code_msg:
90 db 'invalid distance code'
91 db 0
92
93ALIGN 4
94invalid_distance_too_far_msg:
95 db 'invalid distance too far back'
96 db 0
97
98
99ALIGN 4
100inflate_fast_mask:
101dd 0
102dd 1
103dd 3
104dd 7
105dd 15
106dd 31
107dd 63
108dd 127
109dd 255
110dd 511
111dd 1023
112dd 2047
113dd 4095
114dd 8191
115dd 16383
116dd 32767
117dd 65535
118dd 131071
119dd 262143
120dd 524287
121dd 1048575
122dd 2097151
123dd 4194303
124dd 8388607
125dd 16777215
126dd 33554431
127dd 67108863
128dd 134217727
129dd 268435455
130dd 536870911
131dd 1073741823
132dd 2147483647
133dd 4294967295
134
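
The inflate_fast_mask table above is simply (1 << n) - 1 for n = 0..32. A small, hypothetical generator (not part of the zlib build) makes the pattern explicit:

    #include <stdio.h>

    int main(void)
    {
        unsigned long n;
        for (n = 0; n <= 32; n++)   /* n = 32 is special-cased to avoid an over-wide shift */
            printf("dd %lu\n", n == 32 ? 0xFFFFFFFFul : (1ul << n) - 1);
        return 0;
    }
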
135
136mode_state equ 0 ;/* state->mode */
137wsize_state equ (32+zlib1222sup) ;/* state->wsize */
138write_state equ (36+4+zlib1222sup) ;/* state->write */
139window_state equ (40+4+zlib1222sup) ;/* state->window */
140hold_state equ (44+4+zlib1222sup) ;/* state->hold */
141bits_state equ (48+4+zlib1222sup) ;/* state->bits */
142lencode_state equ (64+4+zlib1222sup) ;/* state->lencode */
143distcode_state equ (68+4+zlib1222sup) ;/* state->distcode */
144lenbits_state equ (72+4+zlib1222sup) ;/* state->lenbits */
145distbits_state equ (76+4+zlib1222sup) ;/* state->distbits */
146
147
148;;SECTION .text
149; 205 "inffast.S"
150;GLOBAL inflate_fast_use_mmx
151
152;SECTION .data
153
154
155; GLOBAL inflate_fast_use_mmx:object
156;.size inflate_fast_use_mmx, 4
157; 226 "inffast.S"
158;SECTION .text
159
160ALIGN 4
161_inflate_fast proc near
162.FPO (16, 4, 0, 0, 1, 0)
163 push edi
164 push esi
165 push ebp
166 push ebx
167 pushfd
168 sub esp,64
169 cld
170
171
172
173
174 mov esi, [esp+88]
175 mov edi, [esi+28]
176
177
178
179
180
181
182
183 mov edx, [esi+4]
184 mov eax, [esi+0]
185
186 add edx,eax
187 sub edx,11
188
189 mov [esp+44],eax
190 mov [esp+20],edx
191
192 mov ebp, [esp+92]
193 mov ecx, [esi+16]
194 mov ebx, [esi+12]
195
196 sub ebp,ecx
197 neg ebp
198 add ebp,ebx
199
200 sub ecx,257
201 add ecx,ebx
202
203 mov [esp+60],ebx
204 mov [esp+40],ebp
205 mov [esp+16],ecx
206; 285 "inffast.S"
207 mov eax, [edi+lencode_state]
208 mov ecx, [edi+distcode_state]
209
210 mov [esp+8],eax
211 mov [esp+12],ecx
212
213 mov eax,1
214 mov ecx, [edi+lenbits_state]
215 shl eax,cl
216 dec eax
217 mov [esp+0],eax
218
219 mov eax,1
220 mov ecx, [edi+distbits_state]
221 shl eax,cl
222 dec eax
223 mov [esp+4],eax
224
225 mov eax, [edi+wsize_state]
226 mov ecx, [edi+write_state]
227 mov edx, [edi+window_state]
228
229 mov [esp+52],eax
230 mov [esp+48],ecx
231 mov [esp+56],edx
232
233 mov ebp, [edi+hold_state]
234 mov ebx, [edi+bits_state]
235; 321 "inffast.S"
236 mov esi, [esp+44]
237 mov ecx, [esp+20]
238 cmp ecx,esi
239 ja L_align_long
240
241 add ecx,11
242 sub ecx,esi
243 mov eax,12
244 sub eax,ecx
245 lea edi, [esp+28]
246 rep movsb
247 mov ecx,eax
248 xor eax,eax
249 rep stosb
250 lea esi, [esp+28]
251 mov [esp+20],esi
252 jmp L_is_aligned
253
254
255L_align_long:
256 test esi,3
257 jz L_is_aligned
258 xor eax,eax
259 mov al, [esi]
260 inc esi
261 mov ecx,ebx
262 add ebx,8
263 shl eax,cl
264 or ebp,eax
265 jmp L_align_long
266
267L_is_aligned:
268 mov edi, [esp+60]
269; 366 "inffast.S"
270L_check_mmx:
271 cmp dword ptr [inflate_fast_use_mmx],2
272 je L_init_mmx
273 ja L_do_loop
274
275 push eax
276 push ebx
277 push ecx
278 push edx
279 pushfd
280 mov eax, [esp]
281 xor dword ptr [esp],0200000h
282
283
284
285
286 popfd
287 pushfd
288 pop edx
289 xor edx,eax
290 jz L_dont_use_mmx
291 xor eax,eax
292 cpuid
293 cmp ebx,0756e6547h
294 jne L_dont_use_mmx
295 cmp ecx,06c65746eh
296 jne L_dont_use_mmx
297 cmp edx,049656e69h
298 jne L_dont_use_mmx
299 mov eax,1
300 cpuid
301 shr eax,8
302 and eax,15
303 cmp eax,6
304 jne L_dont_use_mmx
305 test edx,0800000h
306 jnz L_use_mmx
307 jmp L_dont_use_mmx
308L_use_mmx:
309 mov dword ptr [inflate_fast_use_mmx],2
310 jmp L_check_mmx_pop
311L_dont_use_mmx:
312 mov dword ptr [inflate_fast_use_mmx],3
313L_check_mmx_pop:
314 pop edx
315 pop ecx
316 pop ebx
317 pop eax
318 jmp L_check_mmx
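
The CPU check above decodes as follows; here is a hedged C sketch of the same decision (not zlib source; the function name is hypothetical and the CPUID results are assumed to have been captured already):

    #include <string.h>

    /* ebx/edx/ecx from CPUID leaf 0 spell "GenuineIntel" as three
       little-endian dwords (the 0756e6547h / 049656e69h / 06c65746eh
       constants above); the family comes from bits 8..11 of CPUID leaf 1
       eax, and bit 23 of CPUID leaf 1 edx is the MMX feature flag. */
    static int should_use_mmx(unsigned ebx0, unsigned edx0, unsigned ecx0,
                              unsigned eax1, unsigned edx1)
    {
        char vendor[13];
        memcpy(vendor + 0, &ebx0, 4);
        memcpy(vendor + 4, &edx0, 4);
        memcpy(vendor + 8, &ecx0, 4);
        vendor[12] = '\0';
        return strcmp(vendor, "GenuineIntel") == 0
            && ((eax1 >> 8) & 15) == 6        /* family 6 (P6 core) */
            && (edx1 & (1u << 23)) != 0;      /* MMX available */
    }
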
319; 426 "inffast.S"
320ALIGN 4
321L_do_loop:
322; 437 "inffast.S"
323 cmp bl,15
324 ja L_get_length_code
325
326 xor eax,eax
327 lodsw
328 mov cl,bl
329 add bl,16
330 shl eax,cl
331 or ebp,eax
332
333L_get_length_code:
334 mov edx, [esp+0]
335 mov ecx, [esp+8]
336 and edx,ebp
337 mov eax, [ecx+edx*4]
338
339L_dolen:
340
341
342
343
344
345
346 mov cl,ah
347 sub bl,ah
348 shr ebp,cl
349
350
351
352
353
354
355 test al,al
356 jnz L_test_for_length_base
357
358 shr eax,16
359 stosb
360
361L_while_test:
362
363
364 cmp [esp+16],edi
365 jbe L_break_loop
366
367 cmp [esp+20],esi
368 ja L_do_loop
369 jmp L_break_loop
370
371L_test_for_length_base:
372; 502 "inffast.S"
373 mov edx,eax
374 shr edx,16
375 mov cl,al
376
377 test al,16
378 jz L_test_for_second_level_length
379 and cl,15
380 jz L_save_len
381 cmp bl,cl
382 jae L_add_bits_to_len
383
384 mov ch,cl
385 xor eax,eax
386 lodsw
387 mov cl,bl
388 add bl,16
389 shl eax,cl
390 or ebp,eax
391 mov cl,ch
392
393L_add_bits_to_len:
394 mov eax,1
395 shl eax,cl
396 dec eax
397 sub bl,cl
398 and eax,ebp
399 shr ebp,cl
400 add edx,eax
401
402L_save_len:
403 mov [esp+24],edx
404
405
406L_decode_distance:
407; 549 "inffast.S"
408 cmp bl,15
409 ja L_get_distance_code
410
411 xor eax,eax
412 lodsw
413 mov cl,bl
414 add bl,16
415 shl eax,cl
416 or ebp,eax
417
418L_get_distance_code:
419 mov edx, [esp+4]
420 mov ecx, [esp+12]
421 and edx,ebp
422 mov eax, [ecx+edx*4]
423
424
425L_dodist:
426 mov edx,eax
427 shr edx,16
428 mov cl,ah
429 sub bl,ah
430 shr ebp,cl
431; 584 "inffast.S"
432 mov cl,al
433
434 test al,16
435 jz L_test_for_second_level_dist
436 and cl,15
437 jz L_check_dist_one
438 cmp bl,cl
439 jae L_add_bits_to_dist
440
441 mov ch,cl
442 xor eax,eax
443 lodsw
444 mov cl,bl
445 add bl,16
446 shl eax,cl
447 or ebp,eax
448 mov cl,ch
449
450L_add_bits_to_dist:
451 mov eax,1
452 shl eax,cl
453 dec eax
454 sub bl,cl
455 and eax,ebp
456 shr ebp,cl
457 add edx,eax
458 jmp L_check_window
459
460L_check_window:
461; 625 "inffast.S"
462 mov [esp+44],esi
463 mov eax,edi
464 sub eax, [esp+40]
465
466 cmp eax,edx
467 jb L_clip_window
468
469 mov ecx, [esp+24]
470 mov esi,edi
471 sub esi,edx
472
473 sub ecx,3
474 mov al, [esi]
475 mov [edi],al
476 mov al, [esi+1]
477 mov dl, [esi+2]
478 add esi,3
479 mov [edi+1],al
480 mov [edi+2],dl
481 add edi,3
482 rep movsb
483
484 mov esi, [esp+44]
485 jmp L_while_test
486
487ALIGN 4
488L_check_dist_one:
489 cmp edx,1
490 jne L_check_window
491 cmp [esp+40],edi
492 je L_check_window
493
494 dec edi
495 mov ecx, [esp+24]
496 mov al, [edi]
497 sub ecx,3
498
499 mov [edi+1],al
500 mov [edi+2],al
501 mov [edi+3],al
502 add edi,4
503 rep stosb
504
505 jmp L_while_test
506
507ALIGN 4
508L_test_for_second_level_length:
509
510
511
512
513 test al,64
514 jnz L_test_for_end_of_block
515
516 mov eax,1
517 shl eax,cl
518 dec eax
519 and eax,ebp
520 add eax,edx
521 mov edx, [esp+8]
522 mov eax, [edx+eax*4]
523 jmp L_dolen
524
525ALIGN 4
526L_test_for_second_level_dist:
527
528
529
530
531 test al,64
532 jnz L_invalid_distance_code
533
534 mov eax,1
535 shl eax,cl
536 dec eax
537 and eax,ebp
538 add eax,edx
539 mov edx, [esp+12]
540 mov eax, [edx+eax*4]
541 jmp L_dodist
542
543ALIGN 4
544L_clip_window:
545; 721 "inffast.S"
546 mov ecx,eax
547 mov eax, [esp+52]
548 neg ecx
549 mov esi, [esp+56]
550
551 cmp eax,edx
552 jb L_invalid_distance_too_far
553
554 add ecx,edx
555 cmp dword ptr [esp+48],0
556 jne L_wrap_around_window
557
558 sub eax,ecx
559 add esi,eax
560; 749 "inffast.S"
561 mov eax, [esp+24]
562 cmp eax,ecx
563 jbe L_do_copy1
564
565 sub eax,ecx
566 rep movsb
567 mov esi,edi
568 sub esi,edx
569 jmp L_do_copy1
570
571 cmp eax,ecx
572 jbe L_do_copy1
573
574 sub eax,ecx
575 rep movsb
576 mov esi,edi
577 sub esi,edx
578 jmp L_do_copy1
579
580L_wrap_around_window:
581; 793 "inffast.S"
582 mov eax, [esp+48]
583 cmp ecx,eax
584 jbe L_contiguous_in_window
585
586 add esi, [esp+52]
587 add esi,eax
588 sub esi,ecx
589 sub ecx,eax
590
591
592 mov eax, [esp+24]
593 cmp eax,ecx
594 jbe L_do_copy1
595
596 sub eax,ecx
597 rep movsb
598 mov esi, [esp+56]
599 mov ecx, [esp+48]
600 cmp eax,ecx
601 jbe L_do_copy1
602
603 sub eax,ecx
604 rep movsb
605 mov esi,edi
606 sub esi,edx
607 jmp L_do_copy1
608
609L_contiguous_in_window:
610; 836 "inffast.S"
611 add esi,eax
612 sub esi,ecx
613
614
615 mov eax, [esp+24]
616 cmp eax,ecx
617 jbe L_do_copy1
618
619 sub eax,ecx
620 rep movsb
621 mov esi,edi
622 sub esi,edx
623
624L_do_copy1:
625; 862 "inffast.S"
626 mov ecx,eax
627 rep movsb
628
629 mov esi, [esp+44]
630 jmp L_while_test
631; 878 "inffast.S"
632ALIGN 4
633L_init_mmx:
634 emms
635
636
637
638
639
640 movd mm0,ebp
641 mov ebp,ebx
642; 896 "inffast.S"
643 movd mm4,dword ptr [esp+0]
644 movq mm3,mm4
645 movd mm5,dword ptr [esp+4]
646 movq mm2,mm5
647 pxor mm1,mm1
648 mov ebx, [esp+8]
649 jmp L_do_loop_mmx
650
651ALIGN 4
652L_do_loop_mmx:
653 psrlq mm0,mm1
654
655 cmp ebp,32
656 ja L_get_length_code_mmx
657
658 movd mm6,ebp
659 movd mm7,dword ptr [esi]
660 add esi,4
661 psllq mm7,mm6
662 add ebp,32
663 por mm0,mm7
664
665L_get_length_code_mmx:
666 pand mm4,mm0
667 movd eax,mm4
668 movq mm4,mm3
669 mov eax, [ebx+eax*4]
670
671L_dolen_mmx:
672 movzx ecx,ah
673 movd mm1,ecx
674 sub ebp,ecx
675
676 test al,al
677 jnz L_test_for_length_base_mmx
678
679 shr eax,16
680 stosb
681
682L_while_test_mmx:
683
684
685 cmp [esp+16],edi
686 jbe L_break_loop
687
688 cmp [esp+20],esi
689 ja L_do_loop_mmx
690 jmp L_break_loop
691
692L_test_for_length_base_mmx:
693
694 mov edx,eax
695 shr edx,16
696
697 test al,16
698 jz L_test_for_second_level_length_mmx
699 and eax,15
700 jz L_decode_distance_mmx
701
702 psrlq mm0,mm1
703 movd mm1,eax
704 movd ecx,mm0
705 sub ebp,eax
706 and ecx, [inflate_fast_mask+eax*4]
707 add edx,ecx
708
709L_decode_distance_mmx:
710 psrlq mm0,mm1
711
712 cmp ebp,32
713 ja L_get_dist_code_mmx
714
715 movd mm6,ebp
716 movd mm7,dword ptr [esi]
717 add esi,4
718 psllq mm7,mm6
719 add ebp,32
720 por mm0,mm7
721
722L_get_dist_code_mmx:
723 mov ebx, [esp+12]
724 pand mm5,mm0
725 movd eax,mm5
726 movq mm5,mm2
727 mov eax, [ebx+eax*4]
728
729L_dodist_mmx:
730
731 movzx ecx,ah
732 mov ebx,eax
733 shr ebx,16
734 sub ebp,ecx
735 movd mm1,ecx
736
737 test al,16
738 jz L_test_for_second_level_dist_mmx
739 and eax,15
740 jz L_check_dist_one_mmx
741
742L_add_bits_to_dist_mmx:
743 psrlq mm0,mm1
744 movd mm1,eax
745 movd ecx,mm0
746 sub ebp,eax
747 and ecx, [inflate_fast_mask+eax*4]
748 add ebx,ecx
749
750L_check_window_mmx:
751 mov [esp+44],esi
752 mov eax,edi
753 sub eax, [esp+40]
754
755 cmp eax,ebx
756 jb L_clip_window_mmx
757
758 mov ecx,edx
759 mov esi,edi
760 sub esi,ebx
761
762 sub ecx,3
763 mov al, [esi]
764 mov [edi],al
765 mov al, [esi+1]
766 mov dl, [esi+2]
767 add esi,3
768 mov [edi+1],al
769 mov [edi+2],dl
770 add edi,3
771 rep movsb
772
773 mov esi, [esp+44]
774 mov ebx, [esp+8]
775 jmp L_while_test_mmx
776
777ALIGN 4
778L_check_dist_one_mmx:
779 cmp ebx,1
780 jne L_check_window_mmx
781 cmp [esp+40],edi
782 je L_check_window_mmx
783
784 dec edi
785 mov ecx,edx
786 mov al, [edi]
787 sub ecx,3
788
789 mov [edi+1],al
790 mov [edi+2],al
791 mov [edi+3],al
792 add edi,4
793 rep stosb
794
795 mov ebx, [esp+8]
796 jmp L_while_test_mmx
797
798ALIGN 4
799L_test_for_second_level_length_mmx:
800 test al,64
801 jnz L_test_for_end_of_block
802
803 and eax,15
804 psrlq mm0,mm1
805 movd ecx,mm0
806 and ecx, [inflate_fast_mask+eax*4]
807 add ecx,edx
808 mov eax, [ebx+ecx*4]
809 jmp L_dolen_mmx
810
811ALIGN 4
812L_test_for_second_level_dist_mmx:
813 test al,64
814 jnz L_invalid_distance_code
815
816 and eax,15
817 psrlq mm0,mm1
818 movd ecx,mm0
819 and ecx, [inflate_fast_mask+eax*4]
820 mov eax, [esp+12]
821 add ecx,ebx
822 mov eax, [eax+ecx*4]
823 jmp L_dodist_mmx
824
825ALIGN 4
826L_clip_window_mmx:
827
828 mov ecx,eax
829 mov eax, [esp+52]
830 neg ecx
831 mov esi, [esp+56]
832
833 cmp eax,ebx
834 jb L_invalid_distance_too_far
835
836 add ecx,ebx
837 cmp dword ptr [esp+48],0
838 jne L_wrap_around_window_mmx
839
840 sub eax,ecx
841 add esi,eax
842
843 cmp edx,ecx
844 jbe L_do_copy1_mmx
845
846 sub edx,ecx
847 rep movsb
848 mov esi,edi
849 sub esi,ebx
850 jmp L_do_copy1_mmx
851
852 cmp edx,ecx
853 jbe L_do_copy1_mmx
854
855 sub edx,ecx
856 rep movsb
857 mov esi,edi
858 sub esi,ebx
859 jmp L_do_copy1_mmx
860
861L_wrap_around_window_mmx:
862
863 mov eax, [esp+48]
864 cmp ecx,eax
865 jbe L_contiguous_in_window_mmx
866
867 add esi, [esp+52]
868 add esi,eax
869 sub esi,ecx
870 sub ecx,eax
871
872
873 cmp edx,ecx
874 jbe L_do_copy1_mmx
875
876 sub edx,ecx
877 rep movsb
878 mov esi, [esp+56]
879 mov ecx, [esp+48]
880 cmp edx,ecx
881 jbe L_do_copy1_mmx
882
883 sub edx,ecx
884 rep movsb
885 mov esi,edi
886 sub esi,ebx
887 jmp L_do_copy1_mmx
888
889L_contiguous_in_window_mmx:
890
891 add esi,eax
892 sub esi,ecx
893
894
895 cmp edx,ecx
896 jbe L_do_copy1_mmx
897
898 sub edx,ecx
899 rep movsb
900 mov esi,edi
901 sub esi,ebx
902
903L_do_copy1_mmx:
904
905
906 mov ecx,edx
907 rep movsb
908
909 mov esi, [esp+44]
910 mov ebx, [esp+8]
911 jmp L_while_test_mmx
912; 1174 "inffast.S"
913L_invalid_distance_code:
914
915
916
917
918
919 mov ecx, invalid_distance_code_msg
920 mov edx,INFLATE_MODE_BAD
921 jmp L_update_stream_state
922
923L_test_for_end_of_block:
924
925
926
927
928
929 test al,32
930 jz L_invalid_literal_length_code
931
932 mov ecx,0
933 mov edx,INFLATE_MODE_TYPE
934 jmp L_update_stream_state
935
936L_invalid_literal_length_code:
937
938
939
940
941
942 mov ecx, invalid_literal_length_code_msg
943 mov edx,INFLATE_MODE_BAD
944 jmp L_update_stream_state
945
946L_invalid_distance_too_far:
947
948
949
950 mov esi, [esp+44]
951 mov ecx, invalid_distance_too_far_msg
952 mov edx,INFLATE_MODE_BAD
953 jmp L_update_stream_state
954
955L_update_stream_state:
956
957 mov eax, [esp+88]
958 test ecx,ecx
959 jz L_skip_msg
960 mov [eax+24],ecx
961L_skip_msg:
962 mov eax, [eax+28]
963 mov [eax+mode_state],edx
964 jmp L_break_loop
965
966ALIGN 4
967L_break_loop:
968; 1243 "inffast.S"
969 cmp dword ptr [inflate_fast_use_mmx],2
970 jne L_update_next_in
971
972
973
974 mov ebx,ebp
975
976L_update_next_in:
977; 1266 "inffast.S"
978 mov eax, [esp+88]
979 mov ecx,ebx
980 mov edx, [eax+28]
981 shr ecx,3
982 sub esi,ecx
983 shl ecx,3
984 sub ebx,ecx
985 mov [eax+12],edi
986 mov [edx+bits_state],ebx
987 mov ecx,ebx
988
989 lea ebx, [esp+28]
990 cmp [esp+20],ebx
991 jne L_buf_not_used
992
993 sub esi,ebx
994 mov ebx, [eax+0]
995 mov [esp+20],ebx
996 add esi,ebx
997 mov ebx, [eax+4]
998 sub ebx,11
999 add [esp+20],ebx
1000
1001L_buf_not_used:
1002 mov [eax+0],esi
1003
1004 mov ebx,1
1005 shl ebx,cl
1006 dec ebx
1007
1008
1009
1010
1011
1012 cmp dword ptr [inflate_fast_use_mmx],2
1013 jne L_update_hold
1014
1015
1016
1017 psrlq mm0,mm1
1018 movd ebp,mm0
1019
1020 emms
1021
1022L_update_hold:
1023
1024
1025
1026 and ebp,ebx
1027 mov [edx+hold_state],ebp
1028
1029
1030
1031
1032 mov ebx, [esp+20]
1033 cmp ebx,esi
1034 jbe L_last_is_smaller
1035
1036 sub ebx,esi
1037 add ebx,11
1038 mov [eax+4],ebx
1039 jmp L_fixup_out
1040L_last_is_smaller:
1041 sub esi,ebx
1042 neg esi
1043 add esi,11
1044 mov [eax+4],esi
1045
1046
1047
1048
1049L_fixup_out:
1050
1051 mov ebx, [esp+16]
1052 cmp ebx,edi
1053 jbe L_end_is_smaller
1054
1055 sub ebx,edi
1056 add ebx,257
1057 mov [eax+16],ebx
1058 jmp L_done
1059L_end_is_smaller:
1060 sub edi,ebx
1061 neg edi
1062 add edi,257
1063 mov [eax+16],edi
1064
1065
1066
1067
1068
1069L_done:
1070 add esp,64
1071 popfd
1072 pop ebx
1073 pop ebp
1074 pop esi
1075 pop edi
1076 ret
1077_inflate_fast endp
1078
1079_TEXT ends
1080end
diff --git a/contrib/masmx86/match686.asm b/contrib/masmx86/match686.asm
deleted file mode 100644
index 3b09212..0000000
--- a/contrib/masmx86/match686.asm
+++ /dev/null
@@ -1,479 +0,0 @@
1; match686.asm -- Asm portion of the optimized longest_match for 32 bits x86
2; Copyright (C) 1995-1996 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
3; File written by Gilles Vollant, by converting match686.S from Brian Raiter
4; for MASM. This is an assembly version of longest_match
5; from Jean-loup Gailly in deflate.c
6;
7; http://www.zlib.net
8; http://www.winimage.com/zLibDll
9; http://www.muppetlabs.com/~breadbox/software/assembly.html
10;
11; For Visual C++ 4.x and higher and ML 6.x and higher
12; ml.exe is distributed in
13; http://www.microsoft.com/downloads/details.aspx?FamilyID=7a1c9da0-0510-44a2-b042-7ef370530c64
14;
15; this file contain two implementation of longest_match
16;
17; this longest_match was written by Brian Raiter (1998), optimized for Pentium Pro
18; (and the fastest known version of longest_match on modern Core 2 Duo and AMD Phenom)
19;
20; to use an assembly version of longest_match, you need to define ASMV in your project
21;
22; compile the asm file running
23; ml /coff /Zi /c /Flmatch686.lst match686.asm
24; and include match686.obj in your project as an object to link
25;
26; note: the contrib of zlib 1.2.3 and earlier contained both a deprecated version for
27; the Pentium (prior to the Pentium Pro) and this version for the Pentium Pro and modern
28; processors, with autoselection (cpu detection code)
29; if you want to support the old Pentium optimization, you can still use those versions
30;
31; this file is not optimized for the old Pentium, but it is compatible with all 32-bit
32; x86 processors (starting with the 80386)
33;
34;
35; see below: zlib1222add must be adjusted if you use a zlib version < 1.2.2.2
36
37;uInt longest_match(s, cur_match)
38; deflate_state *s;
39; IPos cur_match; /* current match */
40
41 NbStack equ 76
42 cur_match equ dword ptr[esp+NbStack-0]
43 str_s equ dword ptr[esp+NbStack-4]
44; 5 dword on top (ret,ebp,esi,edi,ebx)
45 adrret equ dword ptr[esp+NbStack-8]
46 pushebp equ dword ptr[esp+NbStack-12]
47 pushedi equ dword ptr[esp+NbStack-16]
48 pushesi equ dword ptr[esp+NbStack-20]
49 pushebx equ dword ptr[esp+NbStack-24]
50
51 chain_length equ dword ptr [esp+NbStack-28]
52 limit equ dword ptr [esp+NbStack-32]
53 best_len equ dword ptr [esp+NbStack-36]
54 window equ dword ptr [esp+NbStack-40]
55 prev equ dword ptr [esp+NbStack-44]
56 scan_start equ word ptr [esp+NbStack-48]
57 wmask equ dword ptr [esp+NbStack-52]
58 match_start_ptr equ dword ptr [esp+NbStack-56]
59 nice_match equ dword ptr [esp+NbStack-60]
60 scan equ dword ptr [esp+NbStack-64]
61
62 windowlen equ dword ptr [esp+NbStack-68]
63 match_start equ dword ptr [esp+NbStack-72]
64 strend equ dword ptr [esp+NbStack-76]
65 NbStackAdd equ (NbStack-24)
66
67 .386p
68
69 name gvmatch
70 .MODEL FLAT
71
72
73
74; all the +zlib1222add offsets are due to the addition of fields
75; in zlib in the deflate_state structure since the asm code was first written
76; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
77; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
78; (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").
79
80 zlib1222add equ 8
81
82; Note: these values are correct for a structure packed on an 8-byte boundary
83 dep_chain_length equ 74h+zlib1222add
84 dep_window equ 30h+zlib1222add
85 dep_strstart equ 64h+zlib1222add
86 dep_prev_length equ 70h+zlib1222add
87 dep_nice_match equ 88h+zlib1222add
88 dep_w_size equ 24h+zlib1222add
89 dep_prev equ 38h+zlib1222add
90 dep_w_mask equ 2ch+zlib1222add
91 dep_good_match equ 84h+zlib1222add
92 dep_match_start equ 68h+zlib1222add
93 dep_lookahead equ 6ch+zlib1222add
94
95
96_TEXT segment
97
98IFDEF NOUNDERLINE
99 public longest_match
100 public match_init
101ELSE
102 public _longest_match
103 public _match_init
104ENDIF
105
106 MAX_MATCH equ 258
107 MIN_MATCH equ 3
108 MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1)
109
110
111
112MAX_MATCH equ 258
113MIN_MATCH equ 3
114MIN_LOOKAHEAD equ (MAX_MATCH + MIN_MATCH + 1)
115MAX_MATCH_8_ equ ((MAX_MATCH + 7) AND 0FFF8h)
116
117
118;;; stack frame offsets
119
120chainlenwmask equ esp + 0 ; high word: current chain len
121 ; low word: s->wmask
122window equ esp + 4 ; local copy of s->window
123windowbestlen equ esp + 8 ; s->window + bestlen
124scanstart equ esp + 16 ; first two bytes of string
125scanend equ esp + 12 ; last two bytes of string
126scanalign equ esp + 20 ; dword-misalignment of string
127nicematch equ esp + 24 ; a good enough match size
128bestlen equ esp + 28 ; size of best match so far
129scan equ esp + 32 ; ptr to string wanting match
130
131LocalVarsSize equ 36
132; saved ebx byte esp + 36
133; saved edi byte esp + 40
134; saved esi byte esp + 44
135; saved ebp byte esp + 48
136; return address byte esp + 52
137deflatestate equ esp + 56 ; the function arguments
138curmatch equ esp + 60
139
140;;; Offsets for fields in the deflate_state structure. These numbers
141;;; are calculated from the definition of deflate_state, with the
142;;; assumption that the compiler will dword-align the fields. (Thus,
143;;; changing the definition of deflate_state could easily cause this
144;;; program to crash horribly, without so much as a warning at
145;;; compile time. Sigh.)
146
147dsWSize equ 36+zlib1222add
148dsWMask equ 44+zlib1222add
149dsWindow equ 48+zlib1222add
150dsPrev equ 56+zlib1222add
151dsMatchLen equ 88+zlib1222add
152dsPrevMatch equ 92+zlib1222add
153dsStrStart equ 100+zlib1222add
154dsMatchStart equ 104+zlib1222add
155dsLookahead equ 108+zlib1222add
156dsPrevLen equ 112+zlib1222add
157dsMaxChainLen equ 116+zlib1222add
158dsGoodMatch equ 132+zlib1222add
159dsNiceMatch equ 136+zlib1222add
160
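
The ds* equates above hard-code byte offsets into deflate_state for a 32-bit build; in C terms they are meant to track offsetof() into the structure declared in deflate.h. A hypothetical correspondence (not part of zlib; the DS_* macro names are made up, the field names are those of deflate.h):

    #include <stddef.h>
    #include "deflate.h"

    /* Each equate is assumed to equal the matching offsetof() value; the
       zlib1222add term absorbs the fields added in zlib 1.2.2.2. */
    #define DS_W_SIZE     offsetof(deflate_state, w_size)      /* dsWSize      */
    #define DS_W_MASK     offsetof(deflate_state, w_mask)      /* dsWMask      */
    #define DS_WINDOW     offsetof(deflate_state, window)      /* dsWindow     */
    #define DS_PREV       offsetof(deflate_state, prev)        /* dsPrev       */
    #define DS_STRSTART   offsetof(deflate_state, strstart)    /* dsStrStart   */
    #define DS_MATCHSTART offsetof(deflate_state, match_start) /* dsMatchStart */
    #define DS_LOOKAHEAD  offsetof(deflate_state, lookahead)   /* dsLookahead  */
    #define DS_PREVLEN    offsetof(deflate_state, prev_length) /* dsPrevLen    */
    #define DS_GOODMATCH  offsetof(deflate_state, good_match)  /* dsGoodMatch  */
    #define DS_NICEMATCH  offsetof(deflate_state, nice_match)  /* dsNiceMatch  */
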
161
162;;; match686.asm -- Pentium-Pro-optimized version of longest_match()
163;;; Written for zlib 1.1.2
164;;; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>
165;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html
166;;;
167;;
168;; This software is provided 'as-is', without any express or implied
169;; warranty. In no event will the authors be held liable for any damages
170;; arising from the use of this software.
171;;
172;; Permission is granted to anyone to use this software for any purpose,
173;; including commercial applications, and to alter it and redistribute it
174;; freely, subject to the following restrictions:
175;;
176;; 1. The origin of this software must not be misrepresented; you must not
177;; claim that you wrote the original software. If you use this software
178;; in a product, an acknowledgment in the product documentation would be
179;; appreciated but is not required.
180;; 2. Altered source versions must be plainly marked as such, and must not be
181;; misrepresented as being the original software
182;; 3. This notice may not be removed or altered from any source distribution.
183;;
184
185;GLOBAL _longest_match, _match_init
186
187
188;SECTION .text
189
190;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch)
191
192;_longest_match:
193 IFDEF NOUNDERLINE
194 longest_match proc near
195 ELSE
196 _longest_match proc near
197 ENDIF
198.FPO (9, 4, 0, 0, 1, 0)
199
200;;; Save registers that the compiler may be using, and adjust esp to
201;;; make room for our stack frame.
202
203 push ebp
204 push edi
205 push esi
206 push ebx
207 sub esp, LocalVarsSize
208
209;;; Retrieve the function arguments. ecx will hold cur_match
210;;; throughout the entire function. edx will hold the pointer to the
211;;; deflate_state structure during the function's setup (before
212;;; entering the main loop.
213
214 mov edx, [deflatestate]
215 mov ecx, [curmatch]
216
217;;; uInt wmask = s->w_mask;
218;;; unsigned chain_length = s->max_chain_length;
219;;; if (s->prev_length >= s->good_match) {
220;;; chain_length >>= 2;
221;;; }
222
223 mov eax, [edx + dsPrevLen]
224 mov ebx, [edx + dsGoodMatch]
225 cmp eax, ebx
226 mov eax, [edx + dsWMask]
227 mov ebx, [edx + dsMaxChainLen]
228 jl LastMatchGood
229 shr ebx, 2
230LastMatchGood:
231
232;;; chainlen is decremented once beforehand so that the function can
233;;; use the sign flag instead of the zero flag for the exit test.
234;;; It is then shifted into the high word, to make room for the wmask
235;;; value, which it will always accompany.
236
237 dec ebx
238 shl ebx, 16
239 or ebx, eax
240 mov [chainlenwmask], ebx
241
242;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
243
244 mov eax, [edx + dsNiceMatch]
245 mov ebx, [edx + dsLookahead]
246 cmp ebx, eax
247 jl LookaheadLess
248 mov ebx, eax
249LookaheadLess: mov [nicematch], ebx
250
251;;; register Bytef *scan = s->window + s->strstart;
252
253 mov esi, [edx + dsWindow]
254 mov [window], esi
255 mov ebp, [edx + dsStrStart]
256 lea edi, [esi + ebp]
257 mov [scan], edi
258
259;;; Determine how many bytes the scan ptr is off from being
260;;; dword-aligned.
261
262 mov eax, edi
263 neg eax
264 and eax, 3
265 mov [scanalign], eax
266
267;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
268;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
269
270 mov eax, [edx + dsWSize]
271 sub eax, MIN_LOOKAHEAD
272 sub ebp, eax
273 jg LimitPositive
274 xor ebp, ebp
275LimitPositive:
276
277;;; int best_len = s->prev_length;
278
279 mov eax, [edx + dsPrevLen]
280 mov [bestlen], eax
281
282;;; Store the sum of s->window + best_len in esi locally, and in esi.
283
284 add esi, eax
285 mov [windowbestlen], esi
286
287;;; register ush scan_start = *(ushf*)scan;
288;;; register ush scan_end = *(ushf*)(scan+best_len-1);
289;;; Posf *prev = s->prev;
290
291 movzx ebx, word ptr [edi]
292 mov [scanstart], ebx
293 movzx ebx, word ptr [edi + eax - 1]
294 mov [scanend], ebx
295 mov edi, [edx + dsPrev]
296
297;;; Jump into the main loop.
298
299 mov edx, [chainlenwmask]
300 jmp short LoopEntry
301
302align 4
303
304;;; do {
305;;; match = s->window + cur_match;
306;;; if (*(ushf*)(match+best_len-1) != scan_end ||
307;;; *(ushf*)match != scan_start) continue;
308;;; [...]
309;;; } while ((cur_match = prev[cur_match & wmask]) > limit
310;;; && --chain_length != 0);
311;;;
312;;; Here is the inner loop of the function. The function will spend the
313;;; majority of its time in this loop, and majority of that time will
314;;; be spent in the first ten instructions.
315;;;
316;;; Within this loop:
317;;; ebx = scanend
318;;; ecx = curmatch
319;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
320;;; esi = windowbestlen - i.e., (window + bestlen)
321;;; edi = prev
322;;; ebp = limit
323
324LookupLoop:
325 and ecx, edx
326 movzx ecx, word ptr [edi + ecx*2]
327 cmp ecx, ebp
328 jbe LeaveNow
329 sub edx, 00010000h
330 js LeaveNow
331LoopEntry: movzx eax, word ptr [esi + ecx - 1]
332 cmp eax, ebx
333 jnz LookupLoop
334 mov eax, [window]
335 movzx eax, word ptr [eax + ecx]
336 cmp eax, [scanstart]
337 jnz LookupLoop
338
339;;; Store the current value of chainlen.
340
341 mov [chainlenwmask], edx
342
343;;; Point edi to the string under scrutiny, and esi to the string we
344;;; are hoping to match it up with. In actuality, esi and edi are
345;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
346;;; initialized to -(MAX_MATCH_8 - scanalign).
347
348 mov esi, [window]
349 mov edi, [scan]
350 add esi, ecx
351 mov eax, [scanalign]
352 mov edx, 0fffffef8h; -(MAX_MATCH_8)
353 lea edi, [edi + eax + 0108h] ;MAX_MATCH_8]
354 lea esi, [esi + eax + 0108h] ;MAX_MATCH_8]
355
356;;; Test the strings for equality, 8 bytes at a time. At the end,
357;;; adjust edx so that it is offset to the exact byte that mismatched.
358;;;
359;;; We already know at this point that the first three bytes of the
360;;; strings match each other, and they can be safely passed over before
361;;; starting the compare loop. So what this code does is skip over 0-3
362;;; bytes, as much as necessary in order to dword-align the edi
363;;; pointer. (esi will still be misaligned three times out of four.)
364;;;
365;;; It should be confessed that this loop usually does not represent
366;;; much of the total running time. Replacing it with a more
367;;; straightforward "rep cmpsb" would not drastically degrade
368;;; performance.
369
370LoopCmps:
371 mov eax, [esi + edx]
372 xor eax, [edi + edx]
373 jnz LeaveLoopCmps
374 mov eax, [esi + edx + 4]
375 xor eax, [edi + edx + 4]
376 jnz LeaveLoopCmps4
377 add edx, 8
378 jnz LoopCmps
379 jmp short LenMaximum
380LeaveLoopCmps4: add edx, 4
381LeaveLoopCmps: test eax, 0000FFFFh
382 jnz LenLower
383 add edx, 2
384 shr eax, 16
385LenLower: sub al, 1
386 adc edx, 0
387
388;;; Calculate the length of the match. If it is longer than MAX_MATCH,
389;;; then automatically accept it as the best possible match and leave.
390
391 lea eax, [edi + edx]
392 mov edi, [scan]
393 sub eax, edi
394 cmp eax, MAX_MATCH
395 jge LenMaximum
396
397;;; If the length of the match is not longer than the best match we
398;;; have so far, then forget it and return to the lookup loop.
399
400 mov edx, [deflatestate]
401 mov ebx, [bestlen]
402 cmp eax, ebx
403 jg LongerMatch
404 mov esi, [windowbestlen]
405 mov edi, [edx + dsPrev]
406 mov ebx, [scanend]
407 mov edx, [chainlenwmask]
408 jmp LookupLoop
409
410;;; s->match_start = cur_match;
411;;; best_len = len;
412;;; if (len >= nice_match) break;
413;;; scan_end = *(ushf*)(scan+best_len-1);
414
415LongerMatch: mov ebx, [nicematch]
416 mov [bestlen], eax
417 mov [edx + dsMatchStart], ecx
418 cmp eax, ebx
419 jge LeaveNow
420 mov esi, [window]
421 add esi, eax
422 mov [windowbestlen], esi
423 movzx ebx, word ptr [edi + eax - 1]
424 mov edi, [edx + dsPrev]
425 mov [scanend], ebx
426 mov edx, [chainlenwmask]
427 jmp LookupLoop
428
429;;; Accept the current string, with the maximum possible length.
430
431LenMaximum: mov edx, [deflatestate]
432 mov dword ptr [bestlen], MAX_MATCH
433 mov [edx + dsMatchStart], ecx
434
435;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
436;;; return s->lookahead;
437
438LeaveNow:
439 mov edx, [deflatestate]
440 mov ebx, [bestlen]
441 mov eax, [edx + dsLookahead]
442 cmp ebx, eax
443 jg LookaheadRet
444 mov eax, ebx
445LookaheadRet:
446
447;;; Restore the stack and return from whence we came.
448
449 add esp, LocalVarsSize
450 pop ebx
451 pop esi
452 pop edi
453 pop ebp
454
455 ret
456; please don't remove this string!
457; You can freely use match686 in any free or commercial app if you don't remove the string in the binary!
458 db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah
459
460
461 IFDEF NOUNDERLINE
462 longest_match endp
463 ELSE
464 _longest_match endp
465 ENDIF
466
467 IFDEF NOUNDERLINE
468 match_init proc near
469 ret
470 match_init endp
471 ELSE
472 _match_init proc near
473 ret
474 _match_init endp
475 ENDIF
476
477
478_TEXT ends
479end
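
For readers of the removed match686.asm above: the compare loop walks the two strings a word at a time, and the LeaveLoopCmps/LenLower sequence then uses the XOR of the mismatching words to pin down the exact byte where they differ, while the chain counter and window mask travel together in one register as ((chainlen << 16) | wmask) so a single sub of 00010000h counts down the chain without disturbing the mask. Below is a minimal C sketch of the byte-pinning idea only, assuming little-endian loads as the assembly does; match_len and its local names are illustrative, not zlib identifiers.

/* Illustrative only: the word-at-a-time compare and mismatch-byte
 * adjustment from the removed match686.asm, redone in portable C.
 * Assumes little-endian byte order, as the x86 assembly did. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Return how many leading bytes of a and b are equal, up to maxlen.
   Both pointers are assumed readable for maxlen bytes. */
static size_t match_len(const unsigned char *a, const unsigned char *b,
                        size_t maxlen)
{
    size_t n = 0;
    while (n + 4 <= maxlen) {
        uint32_t wa, wb;
        memcpy(&wa, a + n, 4);            /* unaligned-safe 32-bit loads */
        memcpy(&wb, b + n, 4);
        uint32_t x = wa ^ wb;             /* zero iff all 4 bytes match  */
        if (x != 0) {
            /* Same idea as LeaveLoopCmps/LenLower: if the low 16 bits of
               the XOR are clear, the mismatch is in the upper half-word;
               the low byte of what remains then pins the exact byte. */
            if ((x & 0x0000FFFFu) == 0) { n += 2; x >>= 16; }
            if ((x & 0x000000FFu) == 0) n += 1;
            return n;
        }
        n += 4;
    }
    while (n < maxlen && a[n] == b[n])    /* tail bytes, one at a time */
        n++;
    return n;
}

The sketch mirrors the technique only; the portable longest_match() in deflate.c is the implementation that remains once the contrib assembler is gone.
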
diff --git a/contrib/masmx86/readme.txt b/contrib/masmx86/readme.txt
deleted file mode 100644
index 3271f72..0000000
--- a/contrib/masmx86/readme.txt
+++ /dev/null
@@ -1,27 +0,0 @@
1
2Summary
3-------
4This directory contains ASM implementations of the functions
5longest_match() and inflate_fast().
6
7
8Use instructions
9----------------
10Assemble using MASM, and copy the object files into the zlib source
11directory, then run the appropriate makefile, as suggested below. You can
12download MASM from here:
13
14 http://www.microsoft.com/downloads/details.aspx?displaylang=en&FamilyID=7a1c9da0-0510-44a2-b042-7ef370530c64
15
16You can also get the object files here:
17
18 http://www.winimage.com/zLibDll/zlib124_masm_obj.zip
19
20Build instructions
21------------------
22* With Microsoft C and MASM:
23nmake -f win32/Makefile.msc LOC="-DASMV -DASMINF" OBJA="match686.obj inffas32.obj"
24
25* With Borland C and TASM:
26make -f win32/Makefile.bor LOCAL_ZLIB="-DASMV -DASMINF" OBJA="match686.obj inffas32.obj" OBJPA="+match686c.obj+match686.obj+inffas32.obj"
27
diff --git a/win32/Makefile.bor b/win32/Makefile.bor
index d152bbb..4495353 100644
--- a/win32/Makefile.bor
+++ b/win32/Makefile.bor
@@ -3,7 +3,6 @@
3# 3#
4# Usage: 4# Usage:
5# make -f win32/Makefile.bor 5# make -f win32/Makefile.bor
6# make -f win32/Makefile.bor LOCAL_ZLIB=-DASMV OBJA=match.obj OBJPA=+match.obj
7 6
8# ------------ Borland C++ ------------ 7# ------------ Borland C++ ------------
9 8
diff --git a/win32/Makefile.gcc b/win32/Makefile.gcc
index 305be50..081e391 100644
--- a/win32/Makefile.gcc
+++ b/win32/Makefile.gcc
@@ -11,10 +11,6 @@
11# 11#
12# make -fwin32/Makefile.gcc; make test testdll -fwin32/Makefile.gcc 12# make -fwin32/Makefile.gcc; make test testdll -fwin32/Makefile.gcc
13# 13#
14# To use the asm code, type:
15# cp contrib/asm?86/match.S ./match.S
16# make LOC=-DASMV OBJA=match.o -fwin32/Makefile.gcc
17#
18# To install libz.a, zconf.h and zlib.h in the system directories, type: 14# To install libz.a, zconf.h and zlib.h in the system directories, type:
19# 15#
20# make install -fwin32/Makefile.gcc 16# make install -fwin32/Makefile.gcc
@@ -38,7 +34,6 @@ IMPLIB = libz.dll.a
38# 34#
39SHARED_MODE=0 35SHARED_MODE=0
40 36
41#LOC = -DASMV
42#LOC = -DZLIB_DEBUG -g 37#LOC = -DZLIB_DEBUG -g
43 38
44PREFIX = 39PREFIX =
diff --git a/win32/Makefile.msc b/win32/Makefile.msc
index 6831882..9c65153 100644
--- a/win32/Makefile.msc
+++ b/win32/Makefile.msc
@@ -4,10 +4,6 @@
4# Usage: 4# Usage:
5# nmake -f win32/Makefile.msc (standard build) 5# nmake -f win32/Makefile.msc (standard build)
6# nmake -f win32/Makefile.msc LOC=-DFOO (nonstandard build) 6# nmake -f win32/Makefile.msc LOC=-DFOO (nonstandard build)
7# nmake -f win32/Makefile.msc LOC="-DASMV -DASMINF" \
8# OBJA="inffas32.obj match686.obj" (use ASM code, x86)
9# nmake -f win32/Makefile.msc AS=ml64 LOC="-DASMV -DASMINF -I." \
10# OBJA="inffasx64.obj gvmat64.obj inffas8664.obj" (use ASM code, x64)
11 7
12# The toplevel directory of the source tree. 8# The toplevel directory of the source tree.
13# 9#
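
With the ASM-specific invocations removed, the Win32 builds reduce to the standard command lines each makefile already documents, for example:

  nmake -f win32/Makefile.msc
  make -fwin32/Makefile.gcc
  make -f win32/Makefile.bor
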