summaryrefslogtreecommitdiff
path: root/contrib/masmx64
diff options
context:
space:
mode:
authorMark Adler <madler@alumni.caltech.edu>2011-09-09 23:24:43 -0700
committerMark Adler <madler@alumni.caltech.edu>2011-09-09 23:24:43 -0700
commit6b8233bfe00e79134cb1b84fc49d4f750a797f79 (patch)
treeca2b03b0169568681dc3d9c823e9f0bc4417d6b5 /contrib/masmx64
parent0484693e1723bbab791c56f95597bd7dbe867d03 (diff)
downloadzlib-1.2.2.3.tar.gz
zlib-1.2.2.3.tar.bz2
zlib-1.2.2.3.zip
zlib 1.2.2.3v1.2.2.3
Diffstat (limited to 'contrib/masmx64')
-rw-r--r--contrib/masmx64/bld_ml64.bat2
-rw-r--r--contrib/masmx64/gvmat64.asm464
-rw-r--r--contrib/masmx64/gvmat64.objbin0 -> 4155 bytes
-rw-r--r--contrib/masmx64/inffas8664.c186
-rw-r--r--contrib/masmx64/inffasx64.asm392
-rw-r--r--contrib/masmx64/inffasx64.objbin0 -> 5913 bytes
-rw-r--r--contrib/masmx64/readme.txt28
7 files changed, 1072 insertions, 0 deletions
diff --git a/contrib/masmx64/bld_ml64.bat b/contrib/masmx64/bld_ml64.bat
new file mode 100644
index 0000000..8f9343d
--- /dev/null
+++ b/contrib/masmx64/bld_ml64.bat
@@ -0,0 +1,2 @@
1ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
2ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
diff --git a/contrib/masmx64/gvmat64.asm b/contrib/masmx64/gvmat64.asm
new file mode 100644
index 0000000..cee2145
--- /dev/null
+++ b/contrib/masmx64/gvmat64.asm
@@ -0,0 +1,464 @@
1;uInt longest_match_x64(
2; deflate_state *s,
3; IPos cur_match); /* current match */
4
5; gvmat64.asm -- Asm portion of the optimized longest_match for AMD64 (x86-64)
6; Copyright (C) 1995-2005 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
7; File written by Gilles Vollant, by modifying the longest_match
8; from Jean-loup Gailly in deflate.c
9; and modifying asm686 with masm, optimised assembly code from Brian Raiter,
10; written 1998
11; http://www.zlib.net
12; http://www.winimage.com/zLibDll
13; http://www.muppetlabs.com/~breadbox/software/assembly.html
14;
15; to compile this file, I use option
16; ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
17; with Microsoft Macro Assembler (x64) for AMD64
18;
19; ml64.exe is given with Visual Studio 2005, Windows 2003 server DDK
20;
21; (you can get Windows 2003 server DDK with ml64 and cl for AMD64 from
22; http://www.microsoft.com/whdc/devtools/ddk/default.mspx for low price)
23;
24; Be careful to adapt zlib1222add below to your version of zlib
25
26
27;uInt longest_match(s, cur_match)
28; deflate_state *s;
29; IPos cur_match; /* current match */
30.code
31longest_match PROC
32
33
34;LocalVarsSize equ 88
35 LocalVarsSize equ 72
36
37; register used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12
38; free register : r14,r15
39; register can be saved : rsp
40
41 chainlenwmask equ rsp + 8 - LocalVarsSize ; high word: current chain len
42 ; low word: s->wmask
43;window equ rsp + xx - LocalVarsSize ; local copy of s->window ; stored in r10
44;windowbestlen equ rsp + xx - LocalVarsSize ; s->window + bestlen , use r10+r11
45;scanstart equ rsp + xx - LocalVarsSize ; first two bytes of string ; stored in r12w
46;scanend equ rsp + xx - LocalVarsSize ; last two bytes of string use ebx
47;scanalign equ rsp + xx - LocalVarsSize ; dword-misalignment of string r13
48;bestlen equ rsp + xx - LocalVarsSize ; size of best match so far -> r11d
49;scan equ rsp + xx - LocalVarsSize ; ptr to string wanting match -> r9
50 nicematch equ rsp + 16 - LocalVarsSize ; a good enough match size -> r14
51
52save_rdi equ rsp + 24 - LocalVarsSize
53save_rsi equ rsp + 32 - LocalVarsSize
54save_rbx equ rsp + 40 - LocalVarsSize
55save_rbp equ rsp + 48 - LocalVarsSize
56save_r12 equ rsp + 56 - LocalVarsSize
57save_r13 equ rsp + 64 - LocalVarsSize
58;save_r14 equ rsp + 72 - LocalVarsSize
59;save_r15 equ rsp + 80 - LocalVarsSize
60
61
62
63; all the +4 offsets are due to the addition of pending_buf_size (in zlib
64; in the deflate_state structure since the asm code was first written
65; (if you compile with zlib 1.0.4 or older, remove the +4).
66; Note: these values are good with an 8-byte boundary packed structure
67
68
69 MAX_MATCH equ 258
70 MIN_MATCH equ 3
71 MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1)
72
73
74;;; Offsets for fields in the deflate_state structure. These numbers
75;;; are calculated from the definition of deflate_state, with the
76;;; assumption that the compiler will dword-align the fields. (Thus,
77;;; changing the definition of deflate_state could easily cause this
78;;; program to crash horribly, without so much as a warning at
79;;; compile time. Sigh.)
80
81; all the +zlib1222add offsets are due to the addition of fields
82; in zlib in the deflate_state structure since the asm code was first written
83; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
84; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
85; (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").
86
87zlib1222add equ 8
88
89dsWSize equ 56+zlib1222add+(zlib1222add/2)
90dsWMask equ 64+zlib1222add+(zlib1222add/2)
91dsWindow equ 72+zlib1222add
92dsPrev equ 88+zlib1222add
93dsMatchLen equ 128+zlib1222add
94dsPrevMatch equ 132+zlib1222add
95dsStrStart equ 140+zlib1222add
96dsMatchStart equ 144+zlib1222add
97dsLookahead equ 148+zlib1222add
98dsPrevLen equ 152+zlib1222add
99dsMaxChainLen equ 156+zlib1222add
100dsGoodMatch equ 172+zlib1222add
101dsNiceMatch equ 176+zlib1222add
102
103
104; parameter 1 in rcx (deflate_state *s), param 2 in rdx (cur_match)
105
106; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
107; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
108;
109; All registers must be preserved across the call, except for
110; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
111
112
113
114;;; Save registers that the compiler may be using, and adjust esp to
115;;; make room for our stack frame.
116
117
118;;; Retrieve the function arguments. r8d will hold cur_match
119;;; throughout the entire function. rcx will hold the pointer to the
120;;; deflate_state structure during the function's setup (before
121;;; entering the main loop).
122
123; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
124
125; this clear high 32 bits of r8, which can be garbage in both r8 and rdx
126
127
128 mov [save_rdi],rdi
129 mov [save_rsi],rsi
130 mov [save_rbx],rbx
131 mov [save_rbp],rbp
132 mov r8d,edx
133 mov [save_r12],r12
134 mov [save_r13],r13
135; mov [save_r14],r14
136; mov [save_r15],r15
137
138
139;;; uInt wmask = s->w_mask;
140;;; unsigned chain_length = s->max_chain_length;
141;;; if (s->prev_length >= s->good_match) {
142;;; chain_length >>= 2;
143;;; }
144
145 mov edi, [rcx + dsPrevLen]
146 mov esi, [rcx + dsGoodMatch]
147 mov eax, [rcx + dsWMask]
148 mov ebx, [rcx + dsMaxChainLen]
149 cmp edi, esi
150 jl LastMatchGood
151 shr ebx, 2
152LastMatchGood:
153
154;;; chainlen is decremented once beforehand so that the function can
155;;; use the sign flag instead of the zero flag for the exit test.
156;;; It is then shifted into the high word, to make room for the wmask
157;;; value, which it will always accompany.
158
159 dec ebx
160 shl ebx, 16
161 or ebx, eax
162 mov [chainlenwmask], ebx
163
164;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
165
166 mov eax, [rcx + dsNiceMatch]
167 mov r10d, [rcx + dsLookahead]
168 cmp r10d, eax
169 cmovnl r10d, eax
170 mov [nicematch],r10d
171LookaheadLess:
172
173;;; register Bytef *scan = s->window + s->strstart;
174
175 mov r10, [rcx + dsWindow]
176 mov ebp, [rcx + dsStrStart]
177 lea r13, [r10 + rbp]
178
179;;; Determine how many bytes the scan ptr is off from being
180;;; dword-aligned.
181
182 mov r9,r13
183 neg r13
184 and r13,3
185
186;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
187;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
188
189 mov eax, [rcx + dsWSize]
190 sub eax, MIN_LOOKAHEAD
191 xor edi,edi
192 sub ebp, eax
193
194 mov r11d, [rcx + dsPrevLen]
195
196 cmovng ebp,edi
197
198;;; int best_len = s->prev_length;
199
200
201;;; Store the sum of s->window + best_len in rsi.
202
203 lea rsi,[r10+r11]
204
205;;; register ush scan_start = *(ushf*)scan;
206;;; register ush scan_end = *(ushf*)(scan+best_len-1);
207;;; Posf *prev = s->prev;
208
209 movzx r12d,word ptr [r9]
210 movzx ebx, word ptr [r9 + r11 - 1]
211 mov rdi, [rcx + dsPrev]
212
213;;; Jump into the main loop.
214
215 mov edx, [chainlenwmask]
216
217 cmp bx,word ptr [rsi + r8 - 1]
218 jz LookupLoopIsZero
219
220LookupLoop1:
221 and r8d, edx
222
223 movzx r8d, word ptr [rdi + r8*2]
224 cmp r8d, ebp
225 jbe LeaveNow
226 sub edx, 00010000h
227 js LeaveNow
228
229LoopEntry1:
230 cmp bx,word ptr [rsi + r8 - 1]
231 jz LookupLoopIsZero
232
233LookupLoop2:
234 and r8d, edx
235
236 movzx r8d, word ptr [rdi + r8*2]
237 cmp r8d, ebp
238 jbe LeaveNow
239 sub edx, 00010000h
240 js LeaveNow
241
242LoopEntry2:
243 cmp bx,word ptr [rsi + r8 - 1]
244 jz LookupLoopIsZero
245
246LookupLoop4:
247 and r8d, edx
248
249 movzx r8d, word ptr [rdi + r8*2]
250 cmp r8d, ebp
251 jbe LeaveNow
252 sub edx, 00010000h
253 js LeaveNow
254
255LoopEntry4:
256
257 cmp bx,word ptr [rsi + r8 - 1]
258 jnz LookupLoop1
259 jmp LookupLoopIsZero
260
261
262;;; do {
263;;; match = s->window + cur_match;
264;;; if (*(ushf*)(match+best_len-1) != scan_end ||
265;;; *(ushf*)match != scan_start) continue;
266;;; [...]
267;;; } while ((cur_match = prev[cur_match & wmask]) > limit
268;;; && --chain_length != 0);
269;;;
270;;; Here is the inner loop of the function. The function will spend the
271;;; majority of its time in this loop, and majority of that time will
272;;; be spent in the first ten instructions.
273;;;
274;;; Within this loop:
275;;; ebx = scanend
276;;; r8d = curmatch
277;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
278;;; esi = windowbestlen - i.e., (window + bestlen)
279;;; edi = prev
280;;; ebp = limit
281
282LookupLoop:
283 and r8d, edx
284
285 movzx r8d, word ptr [rdi + r8*2]
286 cmp r8d, ebp
287 jbe LeaveNow
288 sub edx, 00010000h
289 js LeaveNow
290
291LoopEntry:
292
293 cmp bx,word ptr [rsi + r8 - 1]
294 jnz LookupLoop1
295LookupLoopIsZero:
296 cmp r12w, word ptr [r10 + r8]
297 jnz LookupLoop1
298
299
300;;; Store the current value of chainlen.
301 mov [chainlenwmask], edx
302
303;;; Point edi to the string under scrutiny, and esi to the string we
304;;; are hoping to match it up with. In actuality, esi and edi are
305;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
306;;; initialized to -(MAX_MATCH_8 - scanalign).
307
308 lea rsi,[r8+r10]
309 mov rdx, 0fffffffffffffef8h; -(MAX_MATCH_8)
310 lea rsi, [rsi + r13 + 0108h] ;MAX_MATCH_8]
311 lea rdi, [r9 + r13 + 0108h] ;MAX_MATCH_8]
312
313 prefetcht1 [rsi+rdx]
314 prefetcht1 [rdi+rdx]
315;;; Test the strings for equality, 8 bytes at a time. At the end,
316;;; adjust edx so that it is offset to the exact byte that mismatched.
317;;;
318;;; We already know at this point that the first three bytes of the
319;;; strings match each other, and they can be safely passed over before
320;;; starting the compare loop. So what this code does is skip over 0-3
321;;; bytes, as much as necessary in order to dword-align the edi
322;;; pointer. (esi will still be misaligned three times out of four.)
323;;;
324;;; It should be confessed that this loop usually does not represent
325;;; much of the total running time. Replacing it with a more
326;;; straightforward "rep cmpsb" would not drastically degrade
327;;; performance.
328
329;LoopCmps:
330; mov eax, [rsi + rdx]
331; xor eax, [rdi + rdx]
332; jnz LeaveLoopCmps
333; mov eax, [rsi + rdx + 4]
334; xor eax, [rdi + rdx + 4]
335; jnz LeaveLoopCmps4
336; add rdx, 8
337; jnz LoopCmps
338; jmp LenMaximum
339;LeaveLoopCmps4: add rdx, 4
340;LeaveLoopCmps: test eax, 0000FFFFh
341; jnz LenLower
342; add rdx, 2
343; shr eax, 16
344;LenLower: sub al, 1
345; adc rdx, 0
346
347
348LoopCmps:
349 mov rax, [rsi + rdx]
350 xor rax, [rdi + rdx]
351 jnz LeaveLoopCmps
352
353 mov rax, [rsi + rdx + 8]
354 xor rax, [rdi + rdx + 8]
355 jnz LeaveLoopCmps8
356
357
358 mov rax, [rsi + rdx + 8+8]
359 xor rax, [rdi + rdx + 8+8]
360 jnz LeaveLoopCmps16
361
362 add rdx,8+8+8
363
364 jmp short LoopCmps
365LeaveLoopCmps16: add rdx,8
366LeaveLoopCmps8: add rdx,8
367LeaveLoopCmps:
368
369 test eax, 0000FFFFh
370 jnz LenLower
371
372 test eax,0ffffffffh
373
374 jnz LenLower32
375
376 add rdx,4
377 shr rax,32
378 or ax,ax
379 jnz LenLower
380
381LenLower32:
382 shr eax,16
383 add rdx,2
384LenLower: sub al, 1
385 adc rdx, 0
386;;; Calculate the length of the match. If it is longer than MAX_MATCH,
387;;; then automatically accept it as the best possible match and leave.
388
389 lea rax, [rdi + rdx]
390 sub rax, r9
391 cmp eax, MAX_MATCH
392 jge LenMaximum
393
394;;; If the length of the match is not longer than the best match we
395;;; have so far, then forget it and return to the lookup loop.
396;///////////////////////////////////
397
398 cmp eax, r11d
399 jg LongerMatch
400
401 lea rsi,[r10+r11]
402
403 mov rdi, [rcx + dsPrev]
404 mov edx, [chainlenwmask]
405 jmp LookupLoop
406
407;;; s->match_start = cur_match;
408;;; best_len = len;
409;;; if (len >= nice_match) break;
410;;; scan_end = *(ushf*)(scan+best_len-1);
411
412LongerMatch:
413 mov r11d, eax
414 mov [rcx + dsMatchStart], r8d
415 cmp eax, [nicematch]
416 jge LeaveNow
417
418 lea rsi,[r10+rax]
419
420 movzx ebx, word ptr [r9 + rax - 1]
421 mov rdi, [rcx + dsPrev]
422 mov edx, [chainlenwmask]
423 jmp LookupLoop
424
425;;; Accept the current string, with the maximum possible length.
426
427LenMaximum:
428 mov r11d,MAX_MATCH
429 mov [rcx + dsMatchStart], r8d
430
431;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
432;;; return s->lookahead;
433
434LeaveNow:
435 mov eax, [rcx + dsLookahead]
436 cmp r11d, eax
437 cmovng eax, r11d
438
439
440;;; Restore the stack and return from whence we came.
441
442
443 mov rsi,[save_rsi]
444 mov rdi,[save_rdi]
445 mov rbx,[save_rbx]
446 mov rbp,[save_rbp]
447 mov r12,[save_r12]
448 mov r13,[save_r13]
449; mov r14,[save_r14]
450; mov r15,[save_r15]
451
452
453 ret 0
454; please don't remove this string !
456; You can freely use gvmat32 in any free or commercial app if you don't remove the string in the binary!
456 db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
457longest_match ENDP
458
459match_init PROC
460 ret 0
461match_init ENDP
462
463
464END
diff --git a/contrib/masmx64/gvmat64.obj b/contrib/masmx64/gvmat64.obj
new file mode 100644
index 0000000..10a73fe
--- /dev/null
+++ b/contrib/masmx64/gvmat64.obj
Binary files differ
diff --git a/contrib/masmx64/inffas8664.c b/contrib/masmx64/inffas8664.c
new file mode 100644
index 0000000..3af764d
--- /dev/null
+++ b/contrib/masmx64/inffas8664.c
@@ -0,0 +1,186 @@
1/* inffas8664.c is a hand tuned assembler version of inffast.c - fast decoding
2 * version for AMD64 on Windows using Microsoft C compiler
3 *
4 * Copyright (C) 1995-2003 Mark Adler
5 * For conditions of distribution and use, see copyright notice in zlib.h
6 *
7 * Copyright (C) 2003 Chris Anderson <christop@charm.net>
8 * Please use the copyright conditions above.
9 *
10 * 2005 - Adaptation to Microsoft C Compiler for AMD64 by Gilles Vollant
11 *
12 * inffas8664.c call function inffas8664fnc in inffasx64.asm
13 * inffasx64.asm is automatically converted from the AMD64 portion of inffas86.c
14 *
15 * Dec-29-2003 -- I added AMD64 inflate asm support. This version is also
16 * slightly quicker on x86 systems because, instead of using rep movsb to copy
17 * data, it uses rep movsw, which moves data in 2-byte chunks instead of single
18 * bytes. I've tested the AMD64 code on a Fedora Core 1 + the x86_64 updates
19 * from http://fedora.linux.duke.edu/fc1_x86_64
20 * which is running on an Athlon 64 3000+ / Gigabyte GA-K8VT800M system with
21 * 1GB ram. The 64-bit version is about 4% faster than the 32-bit version,
22 * when decompressing mozilla-source-1.3.tar.gz.
23 *
24 * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from
25 * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at
26 * the moment. I have successfully compiled and tested this code with gcc2.96,
27 * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S
28 * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX
29 * enabled. I will attempt to merge the MMX code into this version. Newer
30 * versions of this and inffast.S can be found at
31 * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/
32 *
33 */
34
35#include <stdio.h>
36#include "zutil.h"
37#include "inftrees.h"
38#include "inflate.h"
39#include "inffast.h"
40
41/* Mark Adler's comments from inffast.c: */
42
43/*
44 Decode literal, length, and distance codes and write out the resulting
45 literal and match bytes until either not enough input or output is
46 available, an end-of-block is encountered, or a data error is encountered.
47 When large enough input and output buffers are supplied to inflate(), for
48 example, a 16K input buffer and a 64K output buffer, more than 95% of the
49 inflate execution time is spent in this routine.
50
51 Entry assumptions:
52
53 state->mode == LEN
54 strm->avail_in >= 6
55 strm->avail_out >= 258
56 start >= strm->avail_out
57 state->bits < 8
58
59 On return, state->mode is one of:
60
61 LEN -- ran out of enough output space or enough available input
62 TYPE -- reached end of block code, inflate() to interpret next block
63 BAD -- error in block data
64
65 Notes:
66
67 - The maximum input bits used by a length/distance pair is 15 bits for the
68 length code, 5 bits for the length extra, 15 bits for the distance code,
69 and 13 bits for the distance extra. This totals 48 bits, or six bytes.
70 Therefore if strm->avail_in >= 6, then there is enough input to avoid
71 checking for available input while decoding.
72
73 - The maximum bytes that a single length/distance pair can output is 258
74 bytes, which is the maximum length that can be coded. inflate_fast()
75 requires strm->avail_out >= 258 for each loop to avoid checking for
76 output space.
77 */
78
79
80
81 typedef struct inffast_ar {
82/* 64 32 x86 x86_64 */
83/* ar offset register */
84/* 0 0 */ void *esp; /* esp save */
85/* 8 4 */ void *ebp; /* ebp save */
86/* 16 8 */ unsigned char FAR *in; /* esi rsi local strm->next_in */
87/* 24 12 */ unsigned char FAR *last; /* r9 while in < last */
88/* 32 16 */ unsigned char FAR *out; /* edi rdi local strm->next_out */
89/* 40 20 */ unsigned char FAR *beg; /* inflate()'s init next_out */
90/* 48 24 */ unsigned char FAR *end; /* r10 while out < end */
91/* 56 28 */ unsigned char FAR *window;/* size of window, wsize!=0 */
92/* 64 32 */ code const FAR *lcode; /* ebp rbp local strm->lencode */
93/* 72 36 */ code const FAR *dcode; /* r11 local strm->distcode */
94/* 80 40 */ size_t /*unsigned long */hold; /* edx rdx local strm->hold */
95/* 88 44 */ unsigned bits; /* ebx rbx local strm->bits */
96/* 92 48 */ unsigned wsize; /* window size */
97/* 96 52 */ unsigned write; /* window write index */
98/*100 56 */ unsigned lmask; /* r12 mask for lcode */
99/*104 60 */ unsigned dmask; /* r13 mask for dcode */
100/*108 64 */ unsigned len; /* r14 match length */
101/*112 68 */ unsigned dist; /* r15 match distance */
102/*116 72 */ unsigned status; /* set when state chng*/
103 } type_ar;
104#ifdef ASMINF
105
106void inflate_fast(strm, start)
107z_streamp strm;
108unsigned start; /* inflate()'s starting value for strm->avail_out */
109{
110 struct inflate_state FAR *state;
111 type_ar ar;
112 void inffas8664fnc(struct inffast_ar * par);
113
114
115
116#if (defined( __GNUC__ ) && defined( __amd64__ ) && ! defined( __i386 )) || (defined(_MSC_VER) && defined(_M_AMD64))
117#define PAD_AVAIL_IN 6
118#define PAD_AVAIL_OUT 258
119#else
120#define PAD_AVAIL_IN 5
121#define PAD_AVAIL_OUT 257
122#endif
123
124 /* copy state to local variables */
125 state = (struct inflate_state FAR *)strm->state;
126
127 ar.in = strm->next_in;
128 ar.last = ar.in + (strm->avail_in - PAD_AVAIL_IN);
129 ar.out = strm->next_out;
130 ar.beg = ar.out - (start - strm->avail_out);
131 ar.end = ar.out + (strm->avail_out - PAD_AVAIL_OUT);
132 ar.wsize = state->wsize;
133 ar.write = state->write;
134 ar.window = state->window;
135 ar.hold = state->hold;
136 ar.bits = state->bits;
137 ar.lcode = state->lencode;
138 ar.dcode = state->distcode;
139 ar.lmask = (1U << state->lenbits) - 1;
140 ar.dmask = (1U << state->distbits) - 1;
141
142 /* decode literals and length/distances until end-of-block or not enough
143 input data or output space */
144
145 /* align in on 1/2 hold size boundary */
146 while (((size_t)(void *)ar.in & (sizeof(ar.hold) / 2 - 1)) != 0) {
147 ar.hold += (unsigned long)*ar.in++ << ar.bits;
148 ar.bits += 8;
149 }
150
151 inffas8664fnc(&ar);
152
153 if (ar.status > 1) {
154 if (ar.status == 2)
155 strm->msg = "invalid literal/length code";
156 else if (ar.status == 3)
157 strm->msg = "invalid distance code";
158 else
159 strm->msg = "invalid distance too far back";
160 state->mode = BAD;
161 }
162 else if ( ar.status == 1 ) {
163 state->mode = TYPE;
164 }
165
166 /* return unused bytes (on entry, bits < 8, so in won't go too far back) */
167 ar.len = ar.bits >> 3;
168 ar.in -= ar.len;
169 ar.bits -= ar.len << 3;
170 ar.hold &= (1U << ar.bits) - 1;
171
172 /* update state and return */
173 strm->next_in = ar.in;
174 strm->next_out = ar.out;
175 strm->avail_in = (unsigned)(ar.in < ar.last ?
176 PAD_AVAIL_IN + (ar.last - ar.in) :
177 PAD_AVAIL_IN - (ar.in - ar.last));
178 strm->avail_out = (unsigned)(ar.out < ar.end ?
179 PAD_AVAIL_OUT + (ar.end - ar.out) :
180 PAD_AVAIL_OUT - (ar.out - ar.end));
181 state->hold = (unsigned long)ar.hold;
182 state->bits = ar.bits;
183 return;
184}
185
186#endif
diff --git a/contrib/masmx64/inffasx64.asm b/contrib/masmx64/inffasx64.asm
new file mode 100644
index 0000000..b5d93a2
--- /dev/null
+++ b/contrib/masmx64/inffasx64.asm
@@ -0,0 +1,392 @@
1; inffasx64.asm is a hand tuned assembler version of inffast.c - fast decoding
2; version for AMD64 on Windows using Microsoft C compiler
3;
4; inffasx64.asm is automatically converted from the AMD64 portion of inffas86.c
5; inffasx64.asm is called by inffas8664.c, which contains more info.
6
7
8; to compile this file, I use option
9; ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
10; with Microsoft Macro Assembler (x64) for AMD64
11;
12; ml64.exe is given with Visual Studio 2005, Windows 2003 server DDK
13;
14; (you can get Windows 2003 server DDK with ml64 and cl.exe for AMD64 from
15; http://www.microsoft.com/whdc/devtools/ddk/default.mspx for low price)
16;
17
18.code
19inffas8664fnc PROC
20
21; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
22; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
23;
24; All registers must be preserved across the call, except for
25; rax, rcx, rdx, r8, r-9, r10, and r11, which are scratch.
26
27
28 mov [rsp-8],rsi
29 mov [rsp-16],rdi
30 mov [rsp-24],r12
31 mov [rsp-32],r13
32 mov [rsp-40],r14
33 mov [rsp-48],r15
34 mov [rsp-56],rbx
35
36 mov rax,rcx
37
38 mov [rax+8], rbp ; /* save regs rbp and rsp */
39 mov [rax], rsp
40
41 mov rsp, rax ; /* make rsp point to &ar */
42
43 mov rsi, [rsp+16] ; /* rsi = in */
44 mov rdi, [rsp+32] ; /* rdi = out */
45 mov r9, [rsp+24] ; /* r9 = last */
46 mov r10, [rsp+48] ; /* r10 = end */
47 mov rbp, [rsp+64] ; /* rbp = lcode */
48 mov r11, [rsp+72] ; /* r11 = dcode */
49 mov rdx, [rsp+80] ; /* rdx = hold */
50 mov ebx, [rsp+88] ; /* ebx = bits */
51 mov r12d, [rsp+100] ; /* r12d = lmask */
52 mov r13d, [rsp+104] ; /* r13d = dmask */
53 ; /* r14d = len */
54 ; /* r15d = dist */
55
56
57 cld
58 cmp r10, rdi
59 je L_one_time ; /* if only one decode left */
60 cmp r9, rsi
61
62 jne L_do_loop
63
64
65L_one_time:
66 mov r8, r12 ; /* r8 = lmask */
67 cmp bl, 32
68 ja L_get_length_code_one_time
69
70 lodsd ; /* eax = *(uint *)in++ */
71 mov cl, bl ; /* cl = bits, needs it for shifting */
72 add bl, 32 ; /* bits += 32 */
73 shl rax, cl
74 or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
75 jmp L_get_length_code_one_time
76
77ALIGN 4
78L_while_test:
79 cmp r10, rdi
80 jbe L_break_loop
81 cmp r9, rsi
82 jbe L_break_loop
83
84L_do_loop:
85 mov r8, r12 ; /* r8 = lmask */
86 cmp bl, 32
87 ja L_get_length_code ; /* if (32 < bits) */
88
89 lodsd ; /* eax = *(uint *)in++ */
90 mov cl, bl ; /* cl = bits, needs it for shifting */
91 add bl, 32 ; /* bits += 32 */
92 shl rax, cl
93 or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
94
95L_get_length_code:
96 and r8, rdx ; /* r8 &= hold */
97 mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
98
99 mov cl, ah ; /* cl = this.bits */
100 sub bl, ah ; /* bits -= this.bits */
101 shr rdx, cl ; /* hold >>= this.bits */
102
103 test al, al
104 jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
105
106 mov r8, r12 ; /* r8 = lmask */
107 shr eax, 16 ; /* output this.val char */
108 stosb
109
110L_get_length_code_one_time:
111 and r8, rdx ; /* r8 &= hold */
112 mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
113
114L_dolen:
115 mov cl, ah ; /* cl = this.bits */
116 sub bl, ah ; /* bits -= this.bits */
117 shr rdx, cl ; /* hold >>= this.bits */
118
119 test al, al
120 jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
121
122 shr eax, 16 ; /* output this.val char */
123 stosb
124 jmp L_while_test
125
126ALIGN 4
127L_test_for_length_base:
128 mov r14d, eax ; /* len = this */
129 shr r14d, 16 ; /* len = this.val */
130 mov cl, al
131
132 test al, 16
133 jz L_test_for_second_level_length ; /* if ((op & 16) == 0) 8% */
134 and cl, 15 ; /* op &= 15 */
135 jz L_decode_distance ; /* if (!op) */
136
137L_add_bits_to_len:
138 sub bl, cl
139 xor eax, eax
140 inc eax
141 shl eax, cl
142 dec eax
143 and eax, edx ; /* eax &= hold */
144 shr rdx, cl
145 add r14d, eax ; /* len += hold & mask[op] */
146
147L_decode_distance:
148 mov r8, r13 ; /* r8 = dmask */
149 cmp bl, 32
150 ja L_get_distance_code ; /* if (32 < bits) */
151
152 lodsd ; /* eax = *(uint *)in++ */
153 mov cl, bl ; /* cl = bits, needs it for shifting */
154 add bl, 32 ; /* bits += 32 */
155 shl rax, cl
156 or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
157
158L_get_distance_code:
159 and r8, rdx ; /* r8 &= hold */
160 mov eax, [r11+r8*4] ; /* eax = dcode[hold & dmask] */
161
162L_dodist:
163 mov r15d, eax ; /* dist = this */
164 shr r15d, 16 ; /* dist = this.val */
165 mov cl, ah
166 sub bl, ah ; /* bits -= this.bits */
167 shr rdx, cl ; /* hold >>= this.bits */
168 mov cl, al ; /* cl = this.op */
169
170 test al, 16 ; /* if ((op & 16) == 0) */
171 jz L_test_for_second_level_dist
172 and cl, 15 ; /* op &= 15 */
173 jz L_check_dist_one
174
175L_add_bits_to_dist:
176 sub bl, cl
177 xor eax, eax
178 inc eax
179 shl eax, cl
180 dec eax ; /* (1 << op) - 1 */
181 and eax, edx ; /* eax &= hold */
182 shr rdx, cl
183 add r15d, eax ; /* dist += hold & ((1 << op) - 1) */
184
185L_check_window:
186 mov r8, rsi ; /* save in so from can use it's reg */
187 mov rax, rdi
188 sub rax, [rsp+40] ; /* nbytes = out - beg */
189
190 cmp eax, r15d
191 jb L_clip_window ; /* if (dist > nbytes) 4.2% */
192
193 mov ecx, r14d ; /* ecx = len */
194 mov rsi, rdi
195 sub rsi, r15 ; /* from = out - dist */
196
197 sar ecx, 1
198 jnc L_copy_two ; /* if len % 2 == 0 */
199
200 rep movsw
201 mov al, [rsi]
202 mov [rdi], al
203 inc rdi
204
205 mov rsi, r8 ; /* move in back to %rsi, toss from */
206 jmp L_while_test
207
208L_copy_two:
209 rep movsw
210 mov rsi, r8 ; /* move in back to %rsi, toss from */
211 jmp L_while_test
212
213ALIGN 4
214L_check_dist_one:
215 cmp r15d, 1 ; /* if dist 1, is a memset */
216 jne L_check_window
217 cmp [rsp+40], rdi ; /* if out == beg, outside window */
218 je L_check_window
219
220 mov ecx, r14d ; /* ecx = len */
221 mov al, [rdi-1]
222 mov ah, al
223
224 sar ecx, 1
225 jnc L_set_two
226 mov [rdi], al
227 inc rdi
228
229L_set_two:
230 rep stosw
231 jmp L_while_test
232
233ALIGN 4
234L_test_for_second_level_length:
235 test al, 64
236 jnz L_test_for_end_of_block ; /* if ((op & 64) != 0) */
237
238 xor eax, eax
239 inc eax
240 shl eax, cl
241 dec eax
242 and eax, edx ; /* eax &= hold */
243 add eax, r14d ; /* eax += len */
244 mov eax, [rbp+rax*4] ; /* eax = lcode[val+(hold&mask[op])]*/
245 jmp L_dolen
246
247ALIGN 4
248L_test_for_second_level_dist:
249 test al, 64
250 jnz L_invalid_distance_code ; /* if ((op & 64) != 0) */
251
252 xor eax, eax
253 inc eax
254 shl eax, cl
255 dec eax
256 and eax, edx ; /* eax &= hold */
257 add eax, r15d ; /* eax += dist */
258 mov eax, [r11+rax*4] ; /* eax = dcode[val+(hold&mask[op])]*/
259 jmp L_dodist
260
261ALIGN 4
262L_clip_window:
263 mov ecx, eax ; /* ecx = nbytes */
264 mov eax, [rsp+92] ; /* eax = wsize, prepare for dist cmp */
265 neg ecx ; /* nbytes = -nbytes */
266
267 cmp eax, r15d
268 jb L_invalid_distance_too_far ; /* if (dist > wsize) */
269
270 add ecx, r15d ; /* nbytes = dist - nbytes */
271 cmp dword ptr [rsp+96], 0
272 jne L_wrap_around_window ; /* if (write != 0) */
273
274 mov rsi, [rsp+56] ; /* from = window */
275 sub eax, ecx ; /* eax -= nbytes */
276 add rsi, rax ; /* from += wsize - nbytes */
277
278 mov eax, r14d ; /* eax = len */
279 cmp r14d, ecx
280 jbe L_do_copy ; /* if (nbytes >= len) */
281
282 sub eax, ecx ; /* eax -= nbytes */
283 rep movsb
284 mov rsi, rdi
285 sub rsi, r15 ; /* from = &out[ -dist ] */
286 jmp L_do_copy
287
288ALIGN 4
289L_wrap_around_window:
290 mov eax, [rsp+96] ; /* eax = write */
291 cmp ecx, eax
292 jbe L_contiguous_in_window ; /* if (write >= nbytes) */
293
294 mov esi, [rsp+92] ; /* from = wsize */
295 add rsi, [rsp+56] ; /* from += window */
296 add rsi, rax ; /* from += write */
297 sub rsi, rcx ; /* from -= nbytes */
298 sub ecx, eax ; /* nbytes -= write */
299
300 mov eax, r14d ; /* eax = len */
301 cmp eax, ecx
302 jbe L_do_copy ; /* if (nbytes >= len) */
303
304 sub eax, ecx ; /* len -= nbytes */
305 rep movsb
306 mov rsi, [rsp+56] ; /* from = window */
307 mov ecx, [rsp+96] ; /* nbytes = write */
308 cmp eax, ecx
309 jbe L_do_copy ; /* if (nbytes >= len) */
310
311 sub eax, ecx ; /* len -= nbytes */
312 rep movsb
313 mov rsi, rdi
314 sub rsi, r15 ; /* from = out - dist */
315 jmp L_do_copy
316
317ALIGN 4
318L_contiguous_in_window:
319 mov rsi, [rsp+56] ; /* rsi = window */
320 add rsi, rax
321 sub rsi, rcx ; /* from += write - nbytes */
322
323 mov eax, r14d ; /* eax = len */
324 cmp eax, ecx
325 jbe L_do_copy ; /* if (nbytes >= len) */
326
327 sub eax, ecx ; /* len -= nbytes */
328 rep movsb
329 mov rsi, rdi
330 sub rsi, r15 ; /* from = out - dist */
331 jmp L_do_copy ; /* if (nbytes >= len) */
332
333ALIGN 4
334L_do_copy:
335 mov ecx, eax ; /* ecx = len */
336 rep movsb
337
338 mov rsi, r8 ; /* move in back to %esi, toss from */
339 jmp L_while_test
340
341L_test_for_end_of_block:
342 test al, 32
343 jz L_invalid_literal_length_code
344 mov dword ptr [rsp+116], 1
345 jmp L_break_loop_with_status
346
347L_invalid_literal_length_code:
348 mov dword ptr [rsp+116], 2
349 jmp L_break_loop_with_status
350
351L_invalid_distance_code:
352 mov dword ptr [rsp+116], 3
353 jmp L_break_loop_with_status
354
355L_invalid_distance_too_far:
356 mov dword ptr [rsp+116], 4
357 jmp L_break_loop_with_status
358
359L_break_loop:
360 mov dword ptr [rsp+116], 0
361
362L_break_loop_with_status:
363; /* put in, out, bits, and hold back into ar and pop esp */
364 mov [rsp+16], rsi ; /* in */
365 mov [rsp+32], rdi ; /* out */
366 mov [rsp+88], ebx ; /* bits */
367 mov [rsp+80], rdx ; /* hold */
368
369 mov rax, [rsp] ; /* restore rbp and rsp */
370 mov rbp, [rsp+8]
371 mov rsp, rax
372
373
374
375 mov rsi,[rsp-8]
376 mov rdi,[rsp-16]
377 mov r12,[rsp-24]
378 mov r13,[rsp-32]
379 mov r14,[rsp-40]
380 mov r15,[rsp-48]
381 mov rbx,[rsp-56]
382
383 ret 0
384; :
385; : "m" (ar)
386; : "memory", "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi",
387; "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
388; );
389
390inffas8664fnc ENDP
391;_TEXT ENDS
392END
diff --git a/contrib/masmx64/inffasx64.obj b/contrib/masmx64/inffasx64.obj
new file mode 100644
index 0000000..8df5d82
--- /dev/null
+++ b/contrib/masmx64/inffasx64.obj
Binary files differ
diff --git a/contrib/masmx64/readme.txt b/contrib/masmx64/readme.txt
new file mode 100644
index 0000000..ee03115
--- /dev/null
+++ b/contrib/masmx64/readme.txt
@@ -0,0 +1,28 @@
1Summary
2-------
3This directory contains ASM implementations of the functions
4longest_match() and inflate_fast(), for 64 bits x86 (both AMD64 and Intel EM64t),
5for use with Microsoft Macro Assembler (x64) for AMD64 and Microsoft C++ 64 bits.
6
7gvmat64.asm is written by Gilles Vollant (2005), by using Brian Raiter 686/32 bits
8 assembly optimized version from Jean-loup Gailly original longest_match function
9
10inffasx64.asm and inffas8664.c were written by Chris Anderson, by optimizing
11 original function from Mark Adler
12
13Use instructions
14----------------
15Copy these files into the zlib source directory.
16
17define ASMV and ASMINF in your project. Include inffas8664.c in your source tree,
18and inffasx64.obj and gvmat64.obj as object to link.
19
20
21Build instructions
22------------------
23run bld_ml64.bat with Microsoft Macro Assembler (x64) for AMD64 (ml64.exe)
24
25ml64.exe is given with Visual Studio 2005, Windows 2003 server DDK
26
27You can get Windows 2003 server DDK with ml64 and cl for AMD64 from
28 http://www.microsoft.com/whdc/devtools/ddk/default.mspx for low price